diff --git "a/checkpoint-35800/trainer_state.json" "b/checkpoint-35800/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-35800/trainer_state.json" @@ -0,0 +1,251343 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 500, + "global_step": 35800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005586592178770949, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 13.4193, + "step": 1 + }, + { + "epoch": 0.0011173184357541898, + "grad_norm": 33.38533401489258, + "learning_rate": 1e-05, + "loss": 12.9216, + "step": 2 + }, + { + "epoch": 0.0016759776536312849, + "grad_norm": Infinity, + "learning_rate": 1e-05, + "loss": 13.042, + "step": 3 + }, + { + "epoch": 0.0022346368715083797, + "grad_norm": 34.66608810424805, + "learning_rate": 2e-05, + "loss": 13.1416, + "step": 4 + }, + { + "epoch": 0.002793296089385475, + "grad_norm": 37.251033782958984, + "learning_rate": 3e-05, + "loss": 13.529, + "step": 5 + }, + { + "epoch": 0.0033519553072625698, + "grad_norm": 32.82163619995117, + "learning_rate": 4e-05, + "loss": 12.9028, + "step": 6 + }, + { + "epoch": 0.003910614525139665, + "grad_norm": 34.39775085449219, + "learning_rate": 5e-05, + "loss": 12.6227, + "step": 7 + }, + { + "epoch": 0.004469273743016759, + "grad_norm": 35.74417495727539, + "learning_rate": 6e-05, + "loss": 12.4218, + "step": 8 + }, + { + "epoch": 0.005027932960893855, + "grad_norm": 37.67799377441406, + "learning_rate": 7.000000000000001e-05, + "loss": 12.5441, + "step": 9 + }, + { + "epoch": 0.00558659217877095, + "grad_norm": 34.68803405761719, + "learning_rate": 8e-05, + "loss": 11.8226, + "step": 10 + }, + { + "epoch": 0.006145251396648044, + "grad_norm": 40.62199020385742, + "learning_rate": 8.999999999999999e-05, + "loss": 13.4227, + "step": 11 + }, + { + "epoch": 0.0067039106145251395, + "grad_norm": 41.13390350341797, + "learning_rate": 0.0001, + "loss": 12.0762, + "step": 12 + }, + { + "epoch": 0.007262569832402235, + "grad_norm": 32.25524139404297, + "learning_rate": 0.00011, + "loss": 11.2081, + "step": 13 + }, + { + "epoch": 0.00782122905027933, + "grad_norm": Infinity, + "learning_rate": 0.00011, + "loss": 11.9672, + "step": 14 + }, + { + "epoch": 0.008379888268156424, + "grad_norm": 37.992488861083984, + "learning_rate": 0.00012, + "loss": 11.5283, + "step": 15 + }, + { + "epoch": 0.008938547486033519, + "grad_norm": 37.751983642578125, + "learning_rate": 0.00013000000000000002, + "loss": 10.5765, + "step": 16 + }, + { + "epoch": 0.009497206703910615, + "grad_norm": 37.5428352355957, + "learning_rate": 0.00014000000000000001, + "loss": 10.9605, + "step": 17 + }, + { + "epoch": 0.01005586592178771, + "grad_norm": 39.13656234741211, + "learning_rate": 0.00015, + "loss": 10.0404, + "step": 18 + }, + { + "epoch": 0.010614525139664804, + "grad_norm": 39.929405212402344, + "learning_rate": 0.00016, + "loss": 9.638, + "step": 19 + }, + { + "epoch": 0.0111731843575419, + "grad_norm": 35.697303771972656, + "learning_rate": 0.00017, + "loss": 9.4459, + "step": 20 + }, + { + "epoch": 0.011731843575418994, + "grad_norm": 43.64272689819336, + "learning_rate": 0.00017999999999999998, + "loss": 9.1672, + "step": 21 + }, + { + "epoch": 0.012290502793296089, + "grad_norm": 36.118736267089844, + "learning_rate": 0.00019, + "loss": 8.3166, + "step": 22 + }, + { + "epoch": 0.012849162011173185, + "grad_norm": 37.35279083251953, + "learning_rate": 0.0002, + "loss": 8.3808, + "step": 23 + }, + { + "epoch": 0.013407821229050279, + "grad_norm": 38.960723876953125, + "learning_rate": 0.00021, + "loss": 7.4123, + "step": 24 + }, + { + "epoch": 0.013966480446927373, + "grad_norm": 34.48334884643555, + "learning_rate": 0.00022, + "loss": 7.0422, + "step": 25 + }, + { + "epoch": 0.01452513966480447, + "grad_norm": 37.087867736816406, + "learning_rate": 0.00023, + "loss": 6.8311, + "step": 26 + }, + { + "epoch": 0.015083798882681564, + "grad_norm": 37.18873596191406, + "learning_rate": 0.00024, + "loss": 5.968, + "step": 27 + }, + { + "epoch": 0.01564245810055866, + "grad_norm": 33.669677734375, + "learning_rate": 0.00025, + "loss": 5.592, + "step": 28 + }, + { + "epoch": 0.016201117318435754, + "grad_norm": 33.15557098388672, + "learning_rate": 0.00026000000000000003, + "loss": 5.8566, + "step": 29 + }, + { + "epoch": 0.01675977653631285, + "grad_norm": 25.873477935791016, + "learning_rate": 0.00027, + "loss": 4.7083, + "step": 30 + }, + { + "epoch": 0.017318435754189943, + "grad_norm": 20.893062591552734, + "learning_rate": 0.00028000000000000003, + "loss": 4.2827, + "step": 31 + }, + { + "epoch": 0.017877094972067038, + "grad_norm": 19.956817626953125, + "learning_rate": 0.00029, + "loss": 4.0665, + "step": 32 + }, + { + "epoch": 0.018435754189944135, + "grad_norm": 11.324917793273926, + "learning_rate": 0.0003, + "loss": 3.5017, + "step": 33 + }, + { + "epoch": 0.01899441340782123, + "grad_norm": 9.019549369812012, + "learning_rate": 0.00031, + "loss": 3.4026, + "step": 34 + }, + { + "epoch": 0.019553072625698324, + "grad_norm": 4.461162090301514, + "learning_rate": 0.00032, + "loss": 3.1863, + "step": 35 + }, + { + "epoch": 0.02011173184357542, + "grad_norm": 2.5391528606414795, + "learning_rate": 0.00033, + "loss": 3.0947, + "step": 36 + }, + { + "epoch": 0.020670391061452513, + "grad_norm": 2.935084581375122, + "learning_rate": 0.00034, + "loss": 3.0354, + "step": 37 + }, + { + "epoch": 0.021229050279329607, + "grad_norm": 4.849456787109375, + "learning_rate": 0.00035, + "loss": 2.9455, + "step": 38 + }, + { + "epoch": 0.021787709497206705, + "grad_norm": 5.720569610595703, + "learning_rate": 0.00035999999999999997, + "loss": 2.9848, + "step": 39 + }, + { + "epoch": 0.0223463687150838, + "grad_norm": 6.49008321762085, + "learning_rate": 0.00037, + "loss": 2.9493, + "step": 40 + }, + { + "epoch": 0.022905027932960894, + "grad_norm": 5.751615047454834, + "learning_rate": 0.00038, + "loss": 2.9418, + "step": 41 + }, + { + "epoch": 0.02346368715083799, + "grad_norm": 4.788735866546631, + "learning_rate": 0.00039000000000000005, + "loss": 2.8486, + "step": 42 + }, + { + "epoch": 0.024022346368715083, + "grad_norm": 3.7940988540649414, + "learning_rate": 0.0004, + "loss": 2.7338, + "step": 43 + }, + { + "epoch": 0.024581005586592177, + "grad_norm": 2.9017174243927, + "learning_rate": 0.00041, + "loss": 2.6205, + "step": 44 + }, + { + "epoch": 0.025139664804469275, + "grad_norm": 3.0014331340789795, + "learning_rate": 0.00042, + "loss": 2.5226, + "step": 45 + }, + { + "epoch": 0.02569832402234637, + "grad_norm": 3.001723527908325, + "learning_rate": 0.00043, + "loss": 2.5499, + "step": 46 + }, + { + "epoch": 0.026256983240223464, + "grad_norm": 3.0659825801849365, + "learning_rate": 0.00044, + "loss": 2.4628, + "step": 47 + }, + { + "epoch": 0.026815642458100558, + "grad_norm": 3.408596992492676, + "learning_rate": 0.00045000000000000004, + "loss": 2.31, + "step": 48 + }, + { + "epoch": 0.027374301675977653, + "grad_norm": 2.9686427116394043, + "learning_rate": 0.00046, + "loss": 2.2109, + "step": 49 + }, + { + "epoch": 0.027932960893854747, + "grad_norm": 2.8361899852752686, + "learning_rate": 0.00047, + "loss": 2.0227, + "step": 50 + }, + { + "epoch": 0.028491620111731845, + "grad_norm": 2.9370713233947754, + "learning_rate": 0.00048, + "loss": 1.877, + "step": 51 + }, + { + "epoch": 0.02905027932960894, + "grad_norm": 2.5855000019073486, + "learning_rate": 0.00049, + "loss": 1.7842, + "step": 52 + }, + { + "epoch": 0.029608938547486034, + "grad_norm": 2.46258544921875, + "learning_rate": 0.0005, + "loss": 1.5363, + "step": 53 + }, + { + "epoch": 0.030167597765363128, + "grad_norm": 2.1196603775024414, + "learning_rate": 0.00051, + "loss": 1.5057, + "step": 54 + }, + { + "epoch": 0.030726256983240222, + "grad_norm": 1.6629084348678589, + "learning_rate": 0.0005200000000000001, + "loss": 1.5028, + "step": 55 + }, + { + "epoch": 0.03128491620111732, + "grad_norm": 1.582436203956604, + "learning_rate": 0.0005300000000000001, + "loss": 1.3508, + "step": 56 + }, + { + "epoch": 0.031843575418994415, + "grad_norm": 1.4905234575271606, + "learning_rate": 0.00054, + "loss": 1.2586, + "step": 57 + }, + { + "epoch": 0.03240223463687151, + "grad_norm": 1.264930248260498, + "learning_rate": 0.00055, + "loss": 0.9889, + "step": 58 + }, + { + "epoch": 0.0329608938547486, + "grad_norm": 1.204182744026184, + "learning_rate": 0.0005600000000000001, + "loss": 0.9618, + "step": 59 + }, + { + "epoch": 0.0335195530726257, + "grad_norm": 1.1022368669509888, + "learning_rate": 0.00057, + "loss": 0.9186, + "step": 60 + }, + { + "epoch": 0.03407821229050279, + "grad_norm": 1.2111046314239502, + "learning_rate": 0.00058, + "loss": 0.9412, + "step": 61 + }, + { + "epoch": 0.034636871508379886, + "grad_norm": 1.2781904935836792, + "learning_rate": 0.00059, + "loss": 0.7073, + "step": 62 + }, + { + "epoch": 0.03519553072625698, + "grad_norm": 1.3620400428771973, + "learning_rate": 0.0006, + "loss": 0.7923, + "step": 63 + }, + { + "epoch": 0.035754189944134075, + "grad_norm": 0.6299511790275574, + "learning_rate": 0.00061, + "loss": 0.8297, + "step": 64 + }, + { + "epoch": 0.036312849162011177, + "grad_norm": 2.654103994369507, + "learning_rate": 0.00062, + "loss": 0.6226, + "step": 65 + }, + { + "epoch": 0.03687150837988827, + "grad_norm": 1.0151530504226685, + "learning_rate": 0.00063, + "loss": 0.8323, + "step": 66 + }, + { + "epoch": 0.037430167597765365, + "grad_norm": 0.9279337525367737, + "learning_rate": 0.00064, + "loss": 0.6149, + "step": 67 + }, + { + "epoch": 0.03798882681564246, + "grad_norm": 2.061647415161133, + "learning_rate": 0.0006500000000000001, + "loss": 0.8854, + "step": 68 + }, + { + "epoch": 0.038547486033519554, + "grad_norm": 1.1083693504333496, + "learning_rate": 0.00066, + "loss": 0.7137, + "step": 69 + }, + { + "epoch": 0.03910614525139665, + "grad_norm": 1.25771963596344, + "learning_rate": 0.00067, + "loss": 0.7517, + "step": 70 + }, + { + "epoch": 0.03966480446927374, + "grad_norm": 0.7894580960273743, + "learning_rate": 0.00068, + "loss": 0.7649, + "step": 71 + }, + { + "epoch": 0.04022346368715084, + "grad_norm": 1.381516456604004, + "learning_rate": 0.00069, + "loss": 0.8449, + "step": 72 + }, + { + "epoch": 0.04078212290502793, + "grad_norm": 1.1479374170303345, + "learning_rate": 0.0007, + "loss": 0.7623, + "step": 73 + }, + { + "epoch": 0.041340782122905026, + "grad_norm": 1.061487078666687, + "learning_rate": 0.00071, + "loss": 0.7552, + "step": 74 + }, + { + "epoch": 0.04189944134078212, + "grad_norm": 0.7779232263565063, + "learning_rate": 0.0007199999999999999, + "loss": 0.7695, + "step": 75 + }, + { + "epoch": 0.042458100558659215, + "grad_norm": 1.5185297727584839, + "learning_rate": 0.00073, + "loss": 0.6829, + "step": 76 + }, + { + "epoch": 0.043016759776536316, + "grad_norm": 1.2652549743652344, + "learning_rate": 0.00074, + "loss": 0.7714, + "step": 77 + }, + { + "epoch": 0.04357541899441341, + "grad_norm": 0.9421565532684326, + "learning_rate": 0.00075, + "loss": 0.725, + "step": 78 + }, + { + "epoch": 0.044134078212290505, + "grad_norm": 1.9182367324829102, + "learning_rate": 0.00076, + "loss": 0.6889, + "step": 79 + }, + { + "epoch": 0.0446927374301676, + "grad_norm": 1.2144354581832886, + "learning_rate": 0.0007700000000000001, + "loss": 0.5684, + "step": 80 + }, + { + "epoch": 0.045251396648044694, + "grad_norm": 1.2565786838531494, + "learning_rate": 0.0007800000000000001, + "loss": 0.7785, + "step": 81 + }, + { + "epoch": 0.04581005586592179, + "grad_norm": 0.9230783581733704, + "learning_rate": 0.00079, + "loss": 0.69, + "step": 82 + }, + { + "epoch": 0.04636871508379888, + "grad_norm": 1.5504589080810547, + "learning_rate": 0.0008, + "loss": 0.5854, + "step": 83 + }, + { + "epoch": 0.04692737430167598, + "grad_norm": 0.9252780079841614, + "learning_rate": 0.0008100000000000001, + "loss": 0.6917, + "step": 84 + }, + { + "epoch": 0.04748603351955307, + "grad_norm": 1.1322064399719238, + "learning_rate": 0.00082, + "loss": 0.6407, + "step": 85 + }, + { + "epoch": 0.048044692737430165, + "grad_norm": 2.5337114334106445, + "learning_rate": 0.00083, + "loss": 0.6398, + "step": 86 + }, + { + "epoch": 0.04860335195530726, + "grad_norm": 1.1239203214645386, + "learning_rate": 0.00084, + "loss": 0.7269, + "step": 87 + }, + { + "epoch": 0.049162011173184354, + "grad_norm": 1.083847999572754, + "learning_rate": 0.00085, + "loss": 0.6315, + "step": 88 + }, + { + "epoch": 0.049720670391061456, + "grad_norm": 0.7342848777770996, + "learning_rate": 0.00086, + "loss": 0.652, + "step": 89 + }, + { + "epoch": 0.05027932960893855, + "grad_norm": 4.147243976593018, + "learning_rate": 0.00087, + "loss": 0.7615, + "step": 90 + }, + { + "epoch": 0.050837988826815644, + "grad_norm": 2.0662992000579834, + "learning_rate": 0.00088, + "loss": 0.5951, + "step": 91 + }, + { + "epoch": 0.05139664804469274, + "grad_norm": 1.634759545326233, + "learning_rate": 0.0008900000000000001, + "loss": 0.6484, + "step": 92 + }, + { + "epoch": 0.05195530726256983, + "grad_norm": 0.8145692944526672, + "learning_rate": 0.0009000000000000001, + "loss": 0.6243, + "step": 93 + }, + { + "epoch": 0.05251396648044693, + "grad_norm": 2.5843019485473633, + "learning_rate": 0.00091, + "loss": 0.653, + "step": 94 + }, + { + "epoch": 0.05307262569832402, + "grad_norm": 0.823938250541687, + "learning_rate": 0.00092, + "loss": 0.6247, + "step": 95 + }, + { + "epoch": 0.053631284916201116, + "grad_norm": 0.8840071558952332, + "learning_rate": 0.00093, + "loss": 0.4915, + "step": 96 + }, + { + "epoch": 0.05418994413407821, + "grad_norm": 2.526801109313965, + "learning_rate": 0.00094, + "loss": 0.66, + "step": 97 + }, + { + "epoch": 0.054748603351955305, + "grad_norm": 1.1919090747833252, + "learning_rate": 0.00095, + "loss": 0.5974, + "step": 98 + }, + { + "epoch": 0.0553072625698324, + "grad_norm": 0.6100426912307739, + "learning_rate": 0.00096, + "loss": 0.5521, + "step": 99 + }, + { + "epoch": 0.055865921787709494, + "grad_norm": 1.6367216110229492, + "learning_rate": 0.0009699999999999999, + "loss": 0.6039, + "step": 100 + }, + { + "epoch": 0.056424581005586595, + "grad_norm": 0.8854618072509766, + "learning_rate": 0.00098, + "loss": 0.7298, + "step": 101 + }, + { + "epoch": 0.05698324022346369, + "grad_norm": 0.5425339341163635, + "learning_rate": 0.00099, + "loss": 0.6097, + "step": 102 + }, + { + "epoch": 0.057541899441340784, + "grad_norm": 0.9364147186279297, + "learning_rate": 0.001, + "loss": 0.5727, + "step": 103 + }, + { + "epoch": 0.05810055865921788, + "grad_norm": 1.6017168760299683, + "learning_rate": 0.0009999719887955182, + "loss": 0.7056, + "step": 104 + }, + { + "epoch": 0.05865921787709497, + "grad_norm": 1.070496678352356, + "learning_rate": 0.0009999439775910364, + "loss": 0.616, + "step": 105 + }, + { + "epoch": 0.05921787709497207, + "grad_norm": 2.1718647480010986, + "learning_rate": 0.0009999159663865546, + "loss": 0.6594, + "step": 106 + }, + { + "epoch": 0.05977653631284916, + "grad_norm": 10.674858093261719, + "learning_rate": 0.0009998879551820728, + "loss": 0.5402, + "step": 107 + }, + { + "epoch": 0.060335195530726256, + "grad_norm": 1.2746437788009644, + "learning_rate": 0.000999859943977591, + "loss": 0.5654, + "step": 108 + }, + { + "epoch": 0.06089385474860335, + "grad_norm": 2.5544512271881104, + "learning_rate": 0.0009998319327731093, + "loss": 0.6085, + "step": 109 + }, + { + "epoch": 0.061452513966480445, + "grad_norm": 0.6390484571456909, + "learning_rate": 0.0009998039215686275, + "loss": 0.5188, + "step": 110 + }, + { + "epoch": 0.06201117318435754, + "grad_norm": 1.117769479751587, + "learning_rate": 0.0009997759103641457, + "loss": 0.6537, + "step": 111 + }, + { + "epoch": 0.06256983240223464, + "grad_norm": 1.5526758432388306, + "learning_rate": 0.0009997478991596639, + "loss": 0.5633, + "step": 112 + }, + { + "epoch": 0.06312849162011173, + "grad_norm": 0.8329930305480957, + "learning_rate": 0.000999719887955182, + "loss": 0.734, + "step": 113 + }, + { + "epoch": 0.06368715083798883, + "grad_norm": 1.0422401428222656, + "learning_rate": 0.0009996918767507003, + "loss": 0.5845, + "step": 114 + }, + { + "epoch": 0.06424581005586592, + "grad_norm": 0.5255727767944336, + "learning_rate": 0.0009996638655462185, + "loss": 0.6155, + "step": 115 + }, + { + "epoch": 0.06480446927374302, + "grad_norm": 0.7957318425178528, + "learning_rate": 0.0009996358543417367, + "loss": 0.5424, + "step": 116 + }, + { + "epoch": 0.06536312849162011, + "grad_norm": 0.8654010891914368, + "learning_rate": 0.000999607843137255, + "loss": 0.5597, + "step": 117 + }, + { + "epoch": 0.0659217877094972, + "grad_norm": 0.7695738077163696, + "learning_rate": 0.0009995798319327731, + "loss": 0.649, + "step": 118 + }, + { + "epoch": 0.0664804469273743, + "grad_norm": 0.6359785795211792, + "learning_rate": 0.0009995518207282913, + "loss": 0.5015, + "step": 119 + }, + { + "epoch": 0.0670391061452514, + "grad_norm": 1.1970692873001099, + "learning_rate": 0.0009995238095238095, + "loss": 0.5476, + "step": 120 + }, + { + "epoch": 0.06759776536312849, + "grad_norm": 1.4306594133377075, + "learning_rate": 0.0009994957983193277, + "loss": 0.8179, + "step": 121 + }, + { + "epoch": 0.06815642458100558, + "grad_norm": 0.8186668157577515, + "learning_rate": 0.000999467787114846, + "loss": 0.5035, + "step": 122 + }, + { + "epoch": 0.06871508379888268, + "grad_norm": 0.9915730357170105, + "learning_rate": 0.0009994397759103641, + "loss": 0.6088, + "step": 123 + }, + { + "epoch": 0.06927374301675977, + "grad_norm": 1.1441738605499268, + "learning_rate": 0.0009994117647058823, + "loss": 0.5978, + "step": 124 + }, + { + "epoch": 0.06983240223463687, + "grad_norm": 3.2184536457061768, + "learning_rate": 0.0009993837535014006, + "loss": 0.6593, + "step": 125 + }, + { + "epoch": 0.07039106145251396, + "grad_norm": 6.630455017089844, + "learning_rate": 0.0009993557422969188, + "loss": 0.7603, + "step": 126 + }, + { + "epoch": 0.07094972067039106, + "grad_norm": 1.8797285556793213, + "learning_rate": 0.000999327731092437, + "loss": 0.6696, + "step": 127 + }, + { + "epoch": 0.07150837988826815, + "grad_norm": 1.5480901002883911, + "learning_rate": 0.0009992997198879552, + "loss": 0.5827, + "step": 128 + }, + { + "epoch": 0.07206703910614524, + "grad_norm": 0.9013562798500061, + "learning_rate": 0.0009992717086834734, + "loss": 0.5032, + "step": 129 + }, + { + "epoch": 0.07262569832402235, + "grad_norm": 1.7039713859558105, + "learning_rate": 0.0009992436974789916, + "loss": 0.6589, + "step": 130 + }, + { + "epoch": 0.07318435754189945, + "grad_norm": 1.4759498834609985, + "learning_rate": 0.0009992156862745098, + "loss": 0.5978, + "step": 131 + }, + { + "epoch": 0.07374301675977654, + "grad_norm": 1.7050954103469849, + "learning_rate": 0.000999187675070028, + "loss": 0.7511, + "step": 132 + }, + { + "epoch": 0.07430167597765364, + "grad_norm": 0.7413017749786377, + "learning_rate": 0.0009991596638655462, + "loss": 0.6098, + "step": 133 + }, + { + "epoch": 0.07486033519553073, + "grad_norm": 0.9138259291648865, + "learning_rate": 0.0009991316526610644, + "loss": 0.6288, + "step": 134 + }, + { + "epoch": 0.07541899441340782, + "grad_norm": 1.1756887435913086, + "learning_rate": 0.0009991036414565826, + "loss": 0.5499, + "step": 135 + }, + { + "epoch": 0.07597765363128492, + "grad_norm": 3.5213701725006104, + "learning_rate": 0.0009990756302521008, + "loss": 0.5325, + "step": 136 + }, + { + "epoch": 0.07653631284916201, + "grad_norm": 0.9328041672706604, + "learning_rate": 0.000999047619047619, + "loss": 0.607, + "step": 137 + }, + { + "epoch": 0.07709497206703911, + "grad_norm": 0.8740829825401306, + "learning_rate": 0.0009990196078431372, + "loss": 0.4362, + "step": 138 + }, + { + "epoch": 0.0776536312849162, + "grad_norm": 0.8908244371414185, + "learning_rate": 0.0009989915966386554, + "loss": 0.6038, + "step": 139 + }, + { + "epoch": 0.0782122905027933, + "grad_norm": 3.7454893589019775, + "learning_rate": 0.0009989635854341736, + "loss": 0.5341, + "step": 140 + }, + { + "epoch": 0.07877094972067039, + "grad_norm": 9.606756210327148, + "learning_rate": 0.0009989355742296918, + "loss": 0.6827, + "step": 141 + }, + { + "epoch": 0.07932960893854749, + "grad_norm": 0.8355755805969238, + "learning_rate": 0.00099890756302521, + "loss": 0.518, + "step": 142 + }, + { + "epoch": 0.07988826815642458, + "grad_norm": 1.1310237646102905, + "learning_rate": 0.0009988795518207283, + "loss": 0.8823, + "step": 143 + }, + { + "epoch": 0.08044692737430167, + "grad_norm": 0.8808151483535767, + "learning_rate": 0.0009988515406162465, + "loss": 0.6951, + "step": 144 + }, + { + "epoch": 0.08100558659217877, + "grad_norm": 9.77657413482666, + "learning_rate": 0.0009988235294117647, + "loss": 0.6022, + "step": 145 + }, + { + "epoch": 0.08156424581005586, + "grad_norm": 1.1427901983261108, + "learning_rate": 0.0009987955182072829, + "loss": 0.7025, + "step": 146 + }, + { + "epoch": 0.08212290502793296, + "grad_norm": 0.9580820798873901, + "learning_rate": 0.000998767507002801, + "loss": 0.6505, + "step": 147 + }, + { + "epoch": 0.08268156424581005, + "grad_norm": 0.7566211819648743, + "learning_rate": 0.0009987394957983193, + "loss": 0.6104, + "step": 148 + }, + { + "epoch": 0.08324022346368715, + "grad_norm": 0.7948142886161804, + "learning_rate": 0.0009987114845938375, + "loss": 0.4299, + "step": 149 + }, + { + "epoch": 0.08379888268156424, + "grad_norm": 1.3001176118850708, + "learning_rate": 0.0009986834733893557, + "loss": 0.7201, + "step": 150 + }, + { + "epoch": 0.08435754189944134, + "grad_norm": 1.0026098489761353, + "learning_rate": 0.0009986554621848741, + "loss": 0.5754, + "step": 151 + }, + { + "epoch": 0.08491620111731843, + "grad_norm": 0.7993031144142151, + "learning_rate": 0.0009986274509803921, + "loss": 0.5115, + "step": 152 + }, + { + "epoch": 0.08547486033519552, + "grad_norm": 2.829930305480957, + "learning_rate": 0.0009985994397759103, + "loss": 0.5267, + "step": 153 + }, + { + "epoch": 0.08603351955307263, + "grad_norm": 0.6446975469589233, + "learning_rate": 0.0009985714285714285, + "loss": 0.4949, + "step": 154 + }, + { + "epoch": 0.08659217877094973, + "grad_norm": 1.545230507850647, + "learning_rate": 0.0009985434173669467, + "loss": 0.6897, + "step": 155 + }, + { + "epoch": 0.08715083798882682, + "grad_norm": 0.6464013457298279, + "learning_rate": 0.0009985154061624652, + "loss": 0.4356, + "step": 156 + }, + { + "epoch": 0.08770949720670392, + "grad_norm": 0.5887356400489807, + "learning_rate": 0.0009984873949579831, + "loss": 0.5884, + "step": 157 + }, + { + "epoch": 0.08826815642458101, + "grad_norm": 0.7637951374053955, + "learning_rate": 0.0009984593837535014, + "loss": 0.5663, + "step": 158 + }, + { + "epoch": 0.0888268156424581, + "grad_norm": 0.8062839508056641, + "learning_rate": 0.0009984313725490196, + "loss": 0.5993, + "step": 159 + }, + { + "epoch": 0.0893854748603352, + "grad_norm": 0.8858649134635925, + "learning_rate": 0.0009984033613445378, + "loss": 0.462, + "step": 160 + }, + { + "epoch": 0.08994413407821229, + "grad_norm": 0.7305874824523926, + "learning_rate": 0.0009983753501400562, + "loss": 0.6053, + "step": 161 + }, + { + "epoch": 0.09050279329608939, + "grad_norm": 4.35394811630249, + "learning_rate": 0.0009983473389355742, + "loss": 0.5371, + "step": 162 + }, + { + "epoch": 0.09106145251396648, + "grad_norm": 0.7559210062026978, + "learning_rate": 0.0009983193277310924, + "loss": 0.5161, + "step": 163 + }, + { + "epoch": 0.09162011173184358, + "grad_norm": 1.1714906692504883, + "learning_rate": 0.0009982913165266106, + "loss": 0.6804, + "step": 164 + }, + { + "epoch": 0.09217877094972067, + "grad_norm": 0.6833853125572205, + "learning_rate": 0.0009982633053221288, + "loss": 0.544, + "step": 165 + }, + { + "epoch": 0.09273743016759776, + "grad_norm": 1.8271632194519043, + "learning_rate": 0.0009982352941176472, + "loss": 0.7229, + "step": 166 + }, + { + "epoch": 0.09329608938547486, + "grad_norm": 0.6042512059211731, + "learning_rate": 0.0009982072829131654, + "loss": 0.5214, + "step": 167 + }, + { + "epoch": 0.09385474860335195, + "grad_norm": 0.7902947068214417, + "learning_rate": 0.0009981792717086834, + "loss": 0.5494, + "step": 168 + }, + { + "epoch": 0.09441340782122905, + "grad_norm": 0.9695896506309509, + "learning_rate": 0.0009981512605042016, + "loss": 0.5285, + "step": 169 + }, + { + "epoch": 0.09497206703910614, + "grad_norm": 0.8638774752616882, + "learning_rate": 0.0009981232492997198, + "loss": 0.5545, + "step": 170 + }, + { + "epoch": 0.09553072625698324, + "grad_norm": 0.7909321188926697, + "learning_rate": 0.0009980952380952382, + "loss": 0.5636, + "step": 171 + }, + { + "epoch": 0.09608938547486033, + "grad_norm": 1.0302354097366333, + "learning_rate": 0.0009980672268907565, + "loss": 0.7091, + "step": 172 + }, + { + "epoch": 0.09664804469273743, + "grad_norm": 0.9037856459617615, + "learning_rate": 0.0009980392156862744, + "loss": 0.729, + "step": 173 + }, + { + "epoch": 0.09720670391061452, + "grad_norm": 0.796578586101532, + "learning_rate": 0.0009980112044817926, + "loss": 0.4461, + "step": 174 + }, + { + "epoch": 0.09776536312849161, + "grad_norm": 1.3324625492095947, + "learning_rate": 0.0009979831932773109, + "loss": 0.504, + "step": 175 + }, + { + "epoch": 0.09832402234636871, + "grad_norm": 1.0910922288894653, + "learning_rate": 0.0009979551820728293, + "loss": 0.5651, + "step": 176 + }, + { + "epoch": 0.09888268156424582, + "grad_norm": 0.774204671382904, + "learning_rate": 0.0009979271708683475, + "loss": 0.6074, + "step": 177 + }, + { + "epoch": 0.09944134078212291, + "grad_norm": 0.5686858296394348, + "learning_rate": 0.0009978991596638655, + "loss": 0.4672, + "step": 178 + }, + { + "epoch": 0.1, + "grad_norm": 0.905280351638794, + "learning_rate": 0.0009978711484593837, + "loss": 0.5032, + "step": 179 + }, + { + "epoch": 0.1005586592178771, + "grad_norm": 1.158460259437561, + "learning_rate": 0.0009978431372549019, + "loss": 0.5575, + "step": 180 + }, + { + "epoch": 0.1011173184357542, + "grad_norm": 1.3891628980636597, + "learning_rate": 0.0009978151260504203, + "loss": 0.6522, + "step": 181 + }, + { + "epoch": 0.10167597765363129, + "grad_norm": 0.6089535355567932, + "learning_rate": 0.0009977871148459385, + "loss": 0.5908, + "step": 182 + }, + { + "epoch": 0.10223463687150838, + "grad_norm": 1.1667119264602661, + "learning_rate": 0.0009977591036414567, + "loss": 0.6482, + "step": 183 + }, + { + "epoch": 0.10279329608938548, + "grad_norm": 4.3863139152526855, + "learning_rate": 0.0009977310924369747, + "loss": 0.5249, + "step": 184 + }, + { + "epoch": 0.10335195530726257, + "grad_norm": 0.6774610877037048, + "learning_rate": 0.000997703081232493, + "loss": 0.5093, + "step": 185 + }, + { + "epoch": 0.10391061452513967, + "grad_norm": 1.5877678394317627, + "learning_rate": 0.0009976750700280113, + "loss": 0.549, + "step": 186 + }, + { + "epoch": 0.10446927374301676, + "grad_norm": 0.8745883107185364, + "learning_rate": 0.0009976470588235295, + "loss": 0.4241, + "step": 187 + }, + { + "epoch": 0.10502793296089385, + "grad_norm": 0.8454338908195496, + "learning_rate": 0.0009976190476190477, + "loss": 0.6387, + "step": 188 + }, + { + "epoch": 0.10558659217877095, + "grad_norm": 0.5730166435241699, + "learning_rate": 0.0009975910364145657, + "loss": 0.5914, + "step": 189 + }, + { + "epoch": 0.10614525139664804, + "grad_norm": 0.7003739476203918, + "learning_rate": 0.000997563025210084, + "loss": 0.7188, + "step": 190 + }, + { + "epoch": 0.10670391061452514, + "grad_norm": 1.1836274862289429, + "learning_rate": 0.0009975350140056024, + "loss": 0.6011, + "step": 191 + }, + { + "epoch": 0.10726256983240223, + "grad_norm": 1.303983211517334, + "learning_rate": 0.0009975070028011206, + "loss": 0.5994, + "step": 192 + }, + { + "epoch": 0.10782122905027933, + "grad_norm": 3.37532901763916, + "learning_rate": 0.0009974789915966388, + "loss": 0.6153, + "step": 193 + }, + { + "epoch": 0.10837988826815642, + "grad_norm": 1.1878474950790405, + "learning_rate": 0.0009974509803921568, + "loss": 0.5201, + "step": 194 + }, + { + "epoch": 0.10893854748603352, + "grad_norm": 1.4199057817459106, + "learning_rate": 0.000997422969187675, + "loss": 0.5518, + "step": 195 + }, + { + "epoch": 0.10949720670391061, + "grad_norm": 0.9133730530738831, + "learning_rate": 0.0009973949579831934, + "loss": 0.5121, + "step": 196 + }, + { + "epoch": 0.1100558659217877, + "grad_norm": 0.983331561088562, + "learning_rate": 0.0009973669467787116, + "loss": 0.5642, + "step": 197 + }, + { + "epoch": 0.1106145251396648, + "grad_norm": 1.2697263956069946, + "learning_rate": 0.0009973389355742298, + "loss": 0.545, + "step": 198 + }, + { + "epoch": 0.1111731843575419, + "grad_norm": 0.9948388338088989, + "learning_rate": 0.000997310924369748, + "loss": 0.6011, + "step": 199 + }, + { + "epoch": 0.11173184357541899, + "grad_norm": 1.0217320919036865, + "learning_rate": 0.000997282913165266, + "loss": 0.6751, + "step": 200 + }, + { + "epoch": 0.1122905027932961, + "grad_norm": 2.5174546241760254, + "learning_rate": 0.0009972549019607844, + "loss": 0.4967, + "step": 201 + }, + { + "epoch": 0.11284916201117319, + "grad_norm": 0.7323604822158813, + "learning_rate": 0.0009972268907563026, + "loss": 0.6117, + "step": 202 + }, + { + "epoch": 0.11340782122905028, + "grad_norm": 0.8109521269798279, + "learning_rate": 0.0009971988795518208, + "loss": 0.5708, + "step": 203 + }, + { + "epoch": 0.11396648044692738, + "grad_norm": 0.7176600098609924, + "learning_rate": 0.000997170868347339, + "loss": 0.6001, + "step": 204 + }, + { + "epoch": 0.11452513966480447, + "grad_norm": 0.6863576173782349, + "learning_rate": 0.000997142857142857, + "loss": 0.5, + "step": 205 + }, + { + "epoch": 0.11508379888268157, + "grad_norm": 1.0173395872116089, + "learning_rate": 0.0009971148459383755, + "loss": 0.5232, + "step": 206 + }, + { + "epoch": 0.11564245810055866, + "grad_norm": 1.3011823892593384, + "learning_rate": 0.0009970868347338937, + "loss": 0.5667, + "step": 207 + }, + { + "epoch": 0.11620111731843576, + "grad_norm": 1.8995484113693237, + "learning_rate": 0.0009970588235294119, + "loss": 0.61, + "step": 208 + }, + { + "epoch": 0.11675977653631285, + "grad_norm": 3.420168876647949, + "learning_rate": 0.00099703081232493, + "loss": 0.6301, + "step": 209 + }, + { + "epoch": 0.11731843575418995, + "grad_norm": 6.120429515838623, + "learning_rate": 0.000997002801120448, + "loss": 0.5196, + "step": 210 + }, + { + "epoch": 0.11787709497206704, + "grad_norm": 0.8453794717788696, + "learning_rate": 0.0009969747899159665, + "loss": 0.64, + "step": 211 + }, + { + "epoch": 0.11843575418994413, + "grad_norm": 0.569521427154541, + "learning_rate": 0.0009969467787114847, + "loss": 0.501, + "step": 212 + }, + { + "epoch": 0.11899441340782123, + "grad_norm": 1.01449453830719, + "learning_rate": 0.000996918767507003, + "loss": 0.6819, + "step": 213 + }, + { + "epoch": 0.11955307262569832, + "grad_norm": 1.6356980800628662, + "learning_rate": 0.000996890756302521, + "loss": 0.6916, + "step": 214 + }, + { + "epoch": 0.12011173184357542, + "grad_norm": 2.6632416248321533, + "learning_rate": 0.0009968627450980393, + "loss": 0.5216, + "step": 215 + }, + { + "epoch": 0.12067039106145251, + "grad_norm": 0.8983120918273926, + "learning_rate": 0.0009968347338935573, + "loss": 0.6797, + "step": 216 + }, + { + "epoch": 0.1212290502793296, + "grad_norm": 0.6871258020401001, + "learning_rate": 0.0009968067226890757, + "loss": 0.6117, + "step": 217 + }, + { + "epoch": 0.1217877094972067, + "grad_norm": 0.7885923981666565, + "learning_rate": 0.000996778711484594, + "loss": 0.4373, + "step": 218 + }, + { + "epoch": 0.1223463687150838, + "grad_norm": 0.7660871148109436, + "learning_rate": 0.0009967507002801121, + "loss": 0.4945, + "step": 219 + }, + { + "epoch": 0.12290502793296089, + "grad_norm": 0.5301454067230225, + "learning_rate": 0.0009967226890756303, + "loss": 0.527, + "step": 220 + }, + { + "epoch": 0.12346368715083798, + "grad_norm": 0.6207997798919678, + "learning_rate": 0.0009966946778711483, + "loss": 0.4888, + "step": 221 + }, + { + "epoch": 0.12402234636871508, + "grad_norm": 1.0855971574783325, + "learning_rate": 0.0009966666666666668, + "loss": 0.6201, + "step": 222 + }, + { + "epoch": 0.12458100558659217, + "grad_norm": 4.373345851898193, + "learning_rate": 0.000996638655462185, + "loss": 0.6542, + "step": 223 + }, + { + "epoch": 0.12513966480446928, + "grad_norm": 4.327059268951416, + "learning_rate": 0.0009966106442577032, + "loss": 0.4733, + "step": 224 + }, + { + "epoch": 0.12569832402234637, + "grad_norm": 4.315557479858398, + "learning_rate": 0.0009965826330532214, + "loss": 0.6439, + "step": 225 + }, + { + "epoch": 0.12625698324022347, + "grad_norm": 2.901041269302368, + "learning_rate": 0.0009965546218487394, + "loss": 0.6209, + "step": 226 + }, + { + "epoch": 0.12681564245810056, + "grad_norm": 0.8233610391616821, + "learning_rate": 0.0009965266106442578, + "loss": 0.5543, + "step": 227 + }, + { + "epoch": 0.12737430167597766, + "grad_norm": 0.731225848197937, + "learning_rate": 0.000996498599439776, + "loss": 0.501, + "step": 228 + }, + { + "epoch": 0.12793296089385475, + "grad_norm": 2.698946714401245, + "learning_rate": 0.0009964705882352942, + "loss": 0.5389, + "step": 229 + }, + { + "epoch": 0.12849162011173185, + "grad_norm": 0.7199321389198303, + "learning_rate": 0.0009964425770308124, + "loss": 0.475, + "step": 230 + }, + { + "epoch": 0.12905027932960894, + "grad_norm": 1.217470645904541, + "learning_rate": 0.0009964145658263306, + "loss": 0.6172, + "step": 231 + }, + { + "epoch": 0.12960893854748604, + "grad_norm": 0.7861664891242981, + "learning_rate": 0.0009963865546218488, + "loss": 0.5498, + "step": 232 + }, + { + "epoch": 0.13016759776536313, + "grad_norm": 1.6193184852600098, + "learning_rate": 0.000996358543417367, + "loss": 0.6655, + "step": 233 + }, + { + "epoch": 0.13072625698324022, + "grad_norm": 0.7959342002868652, + "learning_rate": 0.0009963305322128852, + "loss": 0.4295, + "step": 234 + }, + { + "epoch": 0.13128491620111732, + "grad_norm": 0.7992600202560425, + "learning_rate": 0.0009963025210084034, + "loss": 0.5969, + "step": 235 + }, + { + "epoch": 0.1318435754189944, + "grad_norm": 0.7413977980613708, + "learning_rate": 0.0009962745098039216, + "loss": 0.6023, + "step": 236 + }, + { + "epoch": 0.1324022346368715, + "grad_norm": 3.2688894271850586, + "learning_rate": 0.0009962464985994398, + "loss": 0.5235, + "step": 237 + }, + { + "epoch": 0.1329608938547486, + "grad_norm": 1.4802277088165283, + "learning_rate": 0.000996218487394958, + "loss": 0.5943, + "step": 238 + }, + { + "epoch": 0.1335195530726257, + "grad_norm": 0.9552064538002014, + "learning_rate": 0.0009961904761904763, + "loss": 0.5415, + "step": 239 + }, + { + "epoch": 0.1340782122905028, + "grad_norm": 1.092341661453247, + "learning_rate": 0.0009961624649859945, + "loss": 0.6357, + "step": 240 + }, + { + "epoch": 0.13463687150837989, + "grad_norm": 1.7820661067962646, + "learning_rate": 0.0009961344537815127, + "loss": 0.5674, + "step": 241 + }, + { + "epoch": 0.13519553072625698, + "grad_norm": 0.7442818880081177, + "learning_rate": 0.0009961064425770309, + "loss": 0.5915, + "step": 242 + }, + { + "epoch": 0.13575418994413407, + "grad_norm": 4.437726020812988, + "learning_rate": 0.000996078431372549, + "loss": 0.536, + "step": 243 + }, + { + "epoch": 0.13631284916201117, + "grad_norm": 0.7915422916412354, + "learning_rate": 0.0009960504201680673, + "loss": 0.5699, + "step": 244 + }, + { + "epoch": 0.13687150837988826, + "grad_norm": 1.7991547584533691, + "learning_rate": 0.0009960224089635855, + "loss": 0.547, + "step": 245 + }, + { + "epoch": 0.13743016759776536, + "grad_norm": 1.4279496669769287, + "learning_rate": 0.0009959943977591037, + "loss": 0.455, + "step": 246 + }, + { + "epoch": 0.13798882681564245, + "grad_norm": 2.0297365188598633, + "learning_rate": 0.000995966386554622, + "loss": 0.5672, + "step": 247 + }, + { + "epoch": 0.13854748603351955, + "grad_norm": 0.9002755880355835, + "learning_rate": 0.00099593837535014, + "loss": 0.6349, + "step": 248 + }, + { + "epoch": 0.13910614525139664, + "grad_norm": 0.652621865272522, + "learning_rate": 0.0009959103641456583, + "loss": 0.5551, + "step": 249 + }, + { + "epoch": 0.13966480446927373, + "grad_norm": 0.738399088382721, + "learning_rate": 0.0009958823529411765, + "loss": 0.5952, + "step": 250 + }, + { + "epoch": 0.14022346368715083, + "grad_norm": 1.0210678577423096, + "learning_rate": 0.0009958543417366947, + "loss": 0.607, + "step": 251 + }, + { + "epoch": 0.14078212290502792, + "grad_norm": 1.1312799453735352, + "learning_rate": 0.000995826330532213, + "loss": 0.8005, + "step": 252 + }, + { + "epoch": 0.14134078212290502, + "grad_norm": 4.63861083984375, + "learning_rate": 0.0009957983193277311, + "loss": 0.5475, + "step": 253 + }, + { + "epoch": 0.1418994413407821, + "grad_norm": 2.7473626136779785, + "learning_rate": 0.0009957703081232493, + "loss": 0.5327, + "step": 254 + }, + { + "epoch": 0.1424581005586592, + "grad_norm": 1.1836369037628174, + "learning_rate": 0.0009957422969187675, + "loss": 0.5657, + "step": 255 + }, + { + "epoch": 0.1430167597765363, + "grad_norm": 1.3779765367507935, + "learning_rate": 0.0009957142857142858, + "loss": 0.4871, + "step": 256 + }, + { + "epoch": 0.1435754189944134, + "grad_norm": 0.649736762046814, + "learning_rate": 0.000995686274509804, + "loss": 0.4794, + "step": 257 + }, + { + "epoch": 0.1441340782122905, + "grad_norm": 1.8200886249542236, + "learning_rate": 0.0009956582633053222, + "loss": 0.6292, + "step": 258 + }, + { + "epoch": 0.14469273743016758, + "grad_norm": 1.9034805297851562, + "learning_rate": 0.0009956302521008404, + "loss": 0.682, + "step": 259 + }, + { + "epoch": 0.1452513966480447, + "grad_norm": 1.3746448755264282, + "learning_rate": 0.0009956022408963586, + "loss": 0.503, + "step": 260 + }, + { + "epoch": 0.1458100558659218, + "grad_norm": 1.5244262218475342, + "learning_rate": 0.0009955742296918768, + "loss": 0.6125, + "step": 261 + }, + { + "epoch": 0.1463687150837989, + "grad_norm": 0.6705476641654968, + "learning_rate": 0.000995546218487395, + "loss": 0.5163, + "step": 262 + }, + { + "epoch": 0.146927374301676, + "grad_norm": 1.0974162817001343, + "learning_rate": 0.0009955182072829132, + "loss": 0.6924, + "step": 263 + }, + { + "epoch": 0.14748603351955308, + "grad_norm": 1.7833330631256104, + "learning_rate": 0.0009954901960784314, + "loss": 0.6602, + "step": 264 + }, + { + "epoch": 0.14804469273743018, + "grad_norm": 0.9520974159240723, + "learning_rate": 0.0009954621848739496, + "loss": 0.5909, + "step": 265 + }, + { + "epoch": 0.14860335195530727, + "grad_norm": 0.8226149678230286, + "learning_rate": 0.0009954341736694678, + "loss": 0.6374, + "step": 266 + }, + { + "epoch": 0.14916201117318437, + "grad_norm": 0.6469694972038269, + "learning_rate": 0.000995406162464986, + "loss": 0.5713, + "step": 267 + }, + { + "epoch": 0.14972067039106146, + "grad_norm": 0.9335330128669739, + "learning_rate": 0.0009953781512605042, + "loss": 0.6174, + "step": 268 + }, + { + "epoch": 0.15027932960893856, + "grad_norm": 0.7867704033851624, + "learning_rate": 0.0009953501400560224, + "loss": 0.5148, + "step": 269 + }, + { + "epoch": 0.15083798882681565, + "grad_norm": 1.929457187652588, + "learning_rate": 0.0009953221288515406, + "loss": 0.5345, + "step": 270 + }, + { + "epoch": 0.15139664804469274, + "grad_norm": 0.9096598029136658, + "learning_rate": 0.0009952941176470588, + "loss": 0.4868, + "step": 271 + }, + { + "epoch": 0.15195530726256984, + "grad_norm": 0.7338758111000061, + "learning_rate": 0.000995266106442577, + "loss": 0.5996, + "step": 272 + }, + { + "epoch": 0.15251396648044693, + "grad_norm": 1.0851118564605713, + "learning_rate": 0.0009952380952380953, + "loss": 0.5014, + "step": 273 + }, + { + "epoch": 0.15307262569832403, + "grad_norm": 1.3886101245880127, + "learning_rate": 0.0009952100840336135, + "loss": 0.6848, + "step": 274 + }, + { + "epoch": 0.15363128491620112, + "grad_norm": 1.0894609689712524, + "learning_rate": 0.0009951820728291317, + "loss": 0.7159, + "step": 275 + }, + { + "epoch": 0.15418994413407822, + "grad_norm": 0.5999849438667297, + "learning_rate": 0.0009951540616246499, + "loss": 0.4273, + "step": 276 + }, + { + "epoch": 0.1547486033519553, + "grad_norm": 1.2654602527618408, + "learning_rate": 0.000995126050420168, + "loss": 0.6949, + "step": 277 + }, + { + "epoch": 0.1553072625698324, + "grad_norm": 1.3784074783325195, + "learning_rate": 0.0009950980392156863, + "loss": 0.5785, + "step": 278 + }, + { + "epoch": 0.1558659217877095, + "grad_norm": 3.8308889865875244, + "learning_rate": 0.0009950700280112045, + "loss": 0.5655, + "step": 279 + }, + { + "epoch": 0.1564245810055866, + "grad_norm": 1.7307665348052979, + "learning_rate": 0.0009950420168067227, + "loss": 0.5642, + "step": 280 + }, + { + "epoch": 0.1569832402234637, + "grad_norm": 3.464566230773926, + "learning_rate": 0.000995014005602241, + "loss": 0.5521, + "step": 281 + }, + { + "epoch": 0.15754189944134078, + "grad_norm": 1.9289082288742065, + "learning_rate": 0.0009949859943977591, + "loss": 0.5533, + "step": 282 + }, + { + "epoch": 0.15810055865921788, + "grad_norm": 1.3773527145385742, + "learning_rate": 0.0009949579831932773, + "loss": 0.6159, + "step": 283 + }, + { + "epoch": 0.15865921787709497, + "grad_norm": 0.7298031449317932, + "learning_rate": 0.0009949299719887955, + "loss": 0.4764, + "step": 284 + }, + { + "epoch": 0.15921787709497207, + "grad_norm": 4.222969055175781, + "learning_rate": 0.0009949019607843137, + "loss": 0.4393, + "step": 285 + }, + { + "epoch": 0.15977653631284916, + "grad_norm": 0.9089906215667725, + "learning_rate": 0.000994873949579832, + "loss": 0.5886, + "step": 286 + }, + { + "epoch": 0.16033519553072625, + "grad_norm": 0.988497257232666, + "learning_rate": 0.0009948459383753501, + "loss": 0.4182, + "step": 287 + }, + { + "epoch": 0.16089385474860335, + "grad_norm": 2.0216290950775146, + "learning_rate": 0.0009948179271708683, + "loss": 0.5996, + "step": 288 + }, + { + "epoch": 0.16145251396648044, + "grad_norm": 0.7447571158409119, + "learning_rate": 0.0009947899159663866, + "loss": 0.635, + "step": 289 + }, + { + "epoch": 0.16201117318435754, + "grad_norm": 0.9543245434761047, + "learning_rate": 0.0009947619047619048, + "loss": 0.5056, + "step": 290 + }, + { + "epoch": 0.16256983240223463, + "grad_norm": 1.108781099319458, + "learning_rate": 0.000994733893557423, + "loss": 0.5206, + "step": 291 + }, + { + "epoch": 0.16312849162011173, + "grad_norm": 2.061345338821411, + "learning_rate": 0.0009947058823529412, + "loss": 0.5572, + "step": 292 + }, + { + "epoch": 0.16368715083798882, + "grad_norm": 0.9002458453178406, + "learning_rate": 0.0009946778711484594, + "loss": 0.4982, + "step": 293 + }, + { + "epoch": 0.16424581005586592, + "grad_norm": 0.6461503505706787, + "learning_rate": 0.0009946498599439776, + "loss": 0.4248, + "step": 294 + }, + { + "epoch": 0.164804469273743, + "grad_norm": 0.9050043225288391, + "learning_rate": 0.0009946218487394958, + "loss": 0.6251, + "step": 295 + }, + { + "epoch": 0.1653631284916201, + "grad_norm": 1.5620747804641724, + "learning_rate": 0.000994593837535014, + "loss": 0.3986, + "step": 296 + }, + { + "epoch": 0.1659217877094972, + "grad_norm": 0.5747299194335938, + "learning_rate": 0.0009945658263305322, + "loss": 0.5393, + "step": 297 + }, + { + "epoch": 0.1664804469273743, + "grad_norm": 5.657763481140137, + "learning_rate": 0.0009945378151260504, + "loss": 0.6081, + "step": 298 + }, + { + "epoch": 0.1670391061452514, + "grad_norm": 1.2506048679351807, + "learning_rate": 0.0009945098039215686, + "loss": 0.5267, + "step": 299 + }, + { + "epoch": 0.16759776536312848, + "grad_norm": 1.3759843111038208, + "learning_rate": 0.0009944817927170868, + "loss": 0.4981, + "step": 300 + }, + { + "epoch": 0.16815642458100558, + "grad_norm": 0.6979724168777466, + "learning_rate": 0.000994453781512605, + "loss": 0.5283, + "step": 301 + }, + { + "epoch": 0.16871508379888267, + "grad_norm": 0.6828892230987549, + "learning_rate": 0.0009944257703081232, + "loss": 0.523, + "step": 302 + }, + { + "epoch": 0.16927374301675976, + "grad_norm": 2.234743356704712, + "learning_rate": 0.0009943977591036414, + "loss": 0.6468, + "step": 303 + }, + { + "epoch": 0.16983240223463686, + "grad_norm": 4.027324676513672, + "learning_rate": 0.0009943697478991596, + "loss": 0.473, + "step": 304 + }, + { + "epoch": 0.17039106145251395, + "grad_norm": 1.2224324941635132, + "learning_rate": 0.0009943417366946778, + "loss": 0.5357, + "step": 305 + }, + { + "epoch": 0.17094972067039105, + "grad_norm": 0.8478534817695618, + "learning_rate": 0.000994313725490196, + "loss": 0.5372, + "step": 306 + }, + { + "epoch": 0.17150837988826817, + "grad_norm": 0.9016631841659546, + "learning_rate": 0.0009942857142857143, + "loss": 0.5891, + "step": 307 + }, + { + "epoch": 0.17206703910614526, + "grad_norm": 1.09122896194458, + "learning_rate": 0.0009942577030812325, + "loss": 0.5555, + "step": 308 + }, + { + "epoch": 0.17262569832402236, + "grad_norm": 1.4219303131103516, + "learning_rate": 0.0009942296918767507, + "loss": 0.5375, + "step": 309 + }, + { + "epoch": 0.17318435754189945, + "grad_norm": 1.436359167098999, + "learning_rate": 0.0009942016806722689, + "loss": 0.7384, + "step": 310 + }, + { + "epoch": 0.17374301675977655, + "grad_norm": 0.825744092464447, + "learning_rate": 0.000994173669467787, + "loss": 0.6145, + "step": 311 + }, + { + "epoch": 0.17430167597765364, + "grad_norm": Infinity, + "learning_rate": 0.000994173669467787, + "loss": 0.6434, + "step": 312 + }, + { + "epoch": 0.17486033519553074, + "grad_norm": 1.1593785285949707, + "learning_rate": 0.0009941456582633053, + "loss": 0.4869, + "step": 313 + }, + { + "epoch": 0.17541899441340783, + "grad_norm": 1.1484551429748535, + "learning_rate": 0.0009941176470588235, + "loss": 0.6688, + "step": 314 + }, + { + "epoch": 0.17597765363128492, + "grad_norm": 1.0478200912475586, + "learning_rate": 0.0009940896358543417, + "loss": 0.573, + "step": 315 + }, + { + "epoch": 0.17653631284916202, + "grad_norm": 3.2461318969726562, + "learning_rate": 0.00099406162464986, + "loss": 0.6484, + "step": 316 + }, + { + "epoch": 0.1770949720670391, + "grad_norm": 2.6686880588531494, + "learning_rate": 0.0009940336134453781, + "loss": 0.6038, + "step": 317 + }, + { + "epoch": 0.1776536312849162, + "grad_norm": 1.0154472589492798, + "learning_rate": 0.0009940056022408963, + "loss": 0.7957, + "step": 318 + }, + { + "epoch": 0.1782122905027933, + "grad_norm": 0.7798212170600891, + "learning_rate": 0.0009939775910364145, + "loss": 0.5341, + "step": 319 + }, + { + "epoch": 0.1787709497206704, + "grad_norm": 1.3281465768814087, + "learning_rate": 0.0009939495798319327, + "loss": 0.6075, + "step": 320 + }, + { + "epoch": 0.1793296089385475, + "grad_norm": 0.6935046911239624, + "learning_rate": 0.000993921568627451, + "loss": 0.5669, + "step": 321 + }, + { + "epoch": 0.17988826815642459, + "grad_norm": 1.006704568862915, + "learning_rate": 0.0009938935574229691, + "loss": 0.531, + "step": 322 + }, + { + "epoch": 0.18044692737430168, + "grad_norm": 1.3580342531204224, + "learning_rate": 0.0009938655462184876, + "loss": 0.5573, + "step": 323 + }, + { + "epoch": 0.18100558659217877, + "grad_norm": 3.498706340789795, + "learning_rate": 0.0009938375350140056, + "loss": 0.6409, + "step": 324 + }, + { + "epoch": 0.18156424581005587, + "grad_norm": 1.06218683719635, + "learning_rate": 0.0009938095238095238, + "loss": 0.599, + "step": 325 + }, + { + "epoch": 0.18212290502793296, + "grad_norm": 1.1846939325332642, + "learning_rate": 0.000993781512605042, + "loss": 0.5959, + "step": 326 + }, + { + "epoch": 0.18268156424581006, + "grad_norm": 1.4181071519851685, + "learning_rate": 0.0009937535014005602, + "loss": 0.4734, + "step": 327 + }, + { + "epoch": 0.18324022346368715, + "grad_norm": 0.738722562789917, + "learning_rate": 0.0009937254901960786, + "loss": 0.5656, + "step": 328 + }, + { + "epoch": 0.18379888268156425, + "grad_norm": 1.360398292541504, + "learning_rate": 0.0009936974789915966, + "loss": 0.4343, + "step": 329 + }, + { + "epoch": 0.18435754189944134, + "grad_norm": 1.1944875717163086, + "learning_rate": 0.0009936694677871148, + "loss": 0.482, + "step": 330 + }, + { + "epoch": 0.18491620111731844, + "grad_norm": 0.5688614249229431, + "learning_rate": 0.000993641456582633, + "loss": 0.5339, + "step": 331 + }, + { + "epoch": 0.18547486033519553, + "grad_norm": 0.9470820426940918, + "learning_rate": 0.0009936134453781512, + "loss": 0.6469, + "step": 332 + }, + { + "epoch": 0.18603351955307262, + "grad_norm": 1.0332776308059692, + "learning_rate": 0.0009935854341736696, + "loss": 0.4985, + "step": 333 + }, + { + "epoch": 0.18659217877094972, + "grad_norm": 1.2427397966384888, + "learning_rate": 0.0009935574229691876, + "loss": 0.5668, + "step": 334 + }, + { + "epoch": 0.1871508379888268, + "grad_norm": 5.453192234039307, + "learning_rate": 0.0009935294117647058, + "loss": 0.4979, + "step": 335 + }, + { + "epoch": 0.1877094972067039, + "grad_norm": 0.9218781590461731, + "learning_rate": 0.000993501400560224, + "loss": 0.7005, + "step": 336 + }, + { + "epoch": 0.188268156424581, + "grad_norm": 0.8410573601722717, + "learning_rate": 0.0009934733893557422, + "loss": 0.4558, + "step": 337 + }, + { + "epoch": 0.1888268156424581, + "grad_norm": 0.7678621411323547, + "learning_rate": 0.0009934453781512607, + "loss": 0.6676, + "step": 338 + }, + { + "epoch": 0.1893854748603352, + "grad_norm": 0.8786569833755493, + "learning_rate": 0.0009934173669467789, + "loss": 0.5568, + "step": 339 + }, + { + "epoch": 0.18994413407821228, + "grad_norm": 4.146731376647949, + "learning_rate": 0.0009933893557422969, + "loss": 0.5414, + "step": 340 + }, + { + "epoch": 0.19050279329608938, + "grad_norm": 1.320022463798523, + "learning_rate": 0.000993361344537815, + "loss": 0.5405, + "step": 341 + }, + { + "epoch": 0.19106145251396647, + "grad_norm": 0.923991322517395, + "learning_rate": 0.0009933333333333333, + "loss": 0.5447, + "step": 342 + }, + { + "epoch": 0.19162011173184357, + "grad_norm": 1.1749308109283447, + "learning_rate": 0.0009933053221288517, + "loss": 0.5945, + "step": 343 + }, + { + "epoch": 0.19217877094972066, + "grad_norm": 0.7339878082275391, + "learning_rate": 0.00099327731092437, + "loss": 0.6541, + "step": 344 + }, + { + "epoch": 0.19273743016759776, + "grad_norm": 1.042183756828308, + "learning_rate": 0.0009932492997198879, + "loss": 0.5684, + "step": 345 + }, + { + "epoch": 0.19329608938547485, + "grad_norm": 0.8728654980659485, + "learning_rate": 0.000993221288515406, + "loss": 0.5173, + "step": 346 + }, + { + "epoch": 0.19385474860335195, + "grad_norm": 1.5398110151290894, + "learning_rate": 0.0009931932773109243, + "loss": 0.5283, + "step": 347 + }, + { + "epoch": 0.19441340782122904, + "grad_norm": 1.3644812107086182, + "learning_rate": 0.0009931652661064427, + "loss": 0.5595, + "step": 348 + }, + { + "epoch": 0.19497206703910613, + "grad_norm": 1.0397660732269287, + "learning_rate": 0.000993137254901961, + "loss": 0.592, + "step": 349 + }, + { + "epoch": 0.19553072625698323, + "grad_norm": 0.7873978614807129, + "learning_rate": 0.000993109243697479, + "loss": 0.5031, + "step": 350 + }, + { + "epoch": 0.19608938547486032, + "grad_norm": 0.8450810313224792, + "learning_rate": 0.0009930812324929971, + "loss": 0.5131, + "step": 351 + }, + { + "epoch": 0.19664804469273742, + "grad_norm": 0.907400906085968, + "learning_rate": 0.0009930532212885153, + "loss": 0.7516, + "step": 352 + }, + { + "epoch": 0.1972067039106145, + "grad_norm": 0.9214017987251282, + "learning_rate": 0.0009930252100840337, + "loss": 0.5093, + "step": 353 + }, + { + "epoch": 0.19776536312849163, + "grad_norm": 0.7404553890228271, + "learning_rate": 0.000992997198879552, + "loss": 0.4838, + "step": 354 + }, + { + "epoch": 0.19832402234636873, + "grad_norm": 1.0390986204147339, + "learning_rate": 0.0009929691876750702, + "loss": 0.5519, + "step": 355 + }, + { + "epoch": 0.19888268156424582, + "grad_norm": 0.8315941095352173, + "learning_rate": 0.0009929411764705881, + "loss": 0.5808, + "step": 356 + }, + { + "epoch": 0.19944134078212292, + "grad_norm": 2.7812280654907227, + "learning_rate": 0.0009929131652661064, + "loss": 0.4906, + "step": 357 + }, + { + "epoch": 0.2, + "grad_norm": 0.6816196441650391, + "learning_rate": 0.0009928851540616248, + "loss": 0.6241, + "step": 358 + }, + { + "epoch": 0.2005586592178771, + "grad_norm": 0.8561577796936035, + "learning_rate": 0.000992857142857143, + "loss": 0.7879, + "step": 359 + }, + { + "epoch": 0.2011173184357542, + "grad_norm": 13.957138061523438, + "learning_rate": 0.0009928291316526612, + "loss": 0.4757, + "step": 360 + }, + { + "epoch": 0.2016759776536313, + "grad_norm": 0.6377781629562378, + "learning_rate": 0.0009928011204481792, + "loss": 0.5755, + "step": 361 + }, + { + "epoch": 0.2022346368715084, + "grad_norm": 0.7347707748413086, + "learning_rate": 0.0009927731092436974, + "loss": 0.5293, + "step": 362 + }, + { + "epoch": 0.20279329608938548, + "grad_norm": 2.759532928466797, + "learning_rate": 0.0009927450980392158, + "loss": 0.4397, + "step": 363 + }, + { + "epoch": 0.20335195530726258, + "grad_norm": 1.5283832550048828, + "learning_rate": 0.000992717086834734, + "loss": 0.581, + "step": 364 + }, + { + "epoch": 0.20391061452513967, + "grad_norm": 2.0260815620422363, + "learning_rate": 0.0009926890756302522, + "loss": 0.6305, + "step": 365 + }, + { + "epoch": 0.20446927374301677, + "grad_norm": 0.6075266003608704, + "learning_rate": 0.0009926610644257702, + "loss": 0.4664, + "step": 366 + }, + { + "epoch": 0.20502793296089386, + "grad_norm": 0.9785339832305908, + "learning_rate": 0.0009926330532212884, + "loss": 0.6795, + "step": 367 + }, + { + "epoch": 0.20558659217877095, + "grad_norm": 0.975587010383606, + "learning_rate": 0.0009926050420168068, + "loss": 0.5089, + "step": 368 + }, + { + "epoch": 0.20614525139664805, + "grad_norm": 0.9132829308509827, + "learning_rate": 0.000992577030812325, + "loss": 0.5454, + "step": 369 + }, + { + "epoch": 0.20670391061452514, + "grad_norm": 1.2932801246643066, + "learning_rate": 0.0009925490196078432, + "loss": 0.5674, + "step": 370 + }, + { + "epoch": 0.20726256983240224, + "grad_norm": 1.3855035305023193, + "learning_rate": 0.0009925210084033615, + "loss": 0.4964, + "step": 371 + }, + { + "epoch": 0.20782122905027933, + "grad_norm": 1.4948569536209106, + "learning_rate": 0.0009924929971988794, + "loss": 0.6149, + "step": 372 + }, + { + "epoch": 0.20837988826815643, + "grad_norm": 0.5700994729995728, + "learning_rate": 0.0009924649859943979, + "loss": 0.4061, + "step": 373 + }, + { + "epoch": 0.20893854748603352, + "grad_norm": 0.5803407430648804, + "learning_rate": 0.000992436974789916, + "loss": 0.5132, + "step": 374 + }, + { + "epoch": 0.20949720670391062, + "grad_norm": 0.9584372639656067, + "learning_rate": 0.0009924089635854343, + "loss": 0.5948, + "step": 375 + }, + { + "epoch": 0.2100558659217877, + "grad_norm": 3.9942266941070557, + "learning_rate": 0.0009923809523809525, + "loss": 0.4893, + "step": 376 + }, + { + "epoch": 0.2106145251396648, + "grad_norm": 1.9359759092330933, + "learning_rate": 0.0009923529411764705, + "loss": 0.6682, + "step": 377 + }, + { + "epoch": 0.2111731843575419, + "grad_norm": 0.8714433908462524, + "learning_rate": 0.000992324929971989, + "loss": 0.4921, + "step": 378 + }, + { + "epoch": 0.211731843575419, + "grad_norm": 2.355104684829712, + "learning_rate": 0.000992296918767507, + "loss": 0.4428, + "step": 379 + }, + { + "epoch": 0.2122905027932961, + "grad_norm": 0.7010190486907959, + "learning_rate": 0.0009922689075630253, + "loss": 0.5505, + "step": 380 + }, + { + "epoch": 0.21284916201117318, + "grad_norm": 0.7834289073944092, + "learning_rate": 0.0009922408963585435, + "loss": 0.5067, + "step": 381 + }, + { + "epoch": 0.21340782122905028, + "grad_norm": 1.174731731414795, + "learning_rate": 0.0009922128851540615, + "loss": 0.4715, + "step": 382 + }, + { + "epoch": 0.21396648044692737, + "grad_norm": 0.8410313129425049, + "learning_rate": 0.00099218487394958, + "loss": 0.4943, + "step": 383 + }, + { + "epoch": 0.21452513966480447, + "grad_norm": 2.046583652496338, + "learning_rate": 0.0009921568627450981, + "loss": 0.5629, + "step": 384 + }, + { + "epoch": 0.21508379888268156, + "grad_norm": 1.2268325090408325, + "learning_rate": 0.0009921288515406163, + "loss": 0.4832, + "step": 385 + }, + { + "epoch": 0.21564245810055865, + "grad_norm": 6.274129390716553, + "learning_rate": 0.0009921008403361345, + "loss": 0.6357, + "step": 386 + }, + { + "epoch": 0.21620111731843575, + "grad_norm": 1.5891910791397095, + "learning_rate": 0.0009920728291316528, + "loss": 0.6177, + "step": 387 + }, + { + "epoch": 0.21675977653631284, + "grad_norm": 0.9734386801719666, + "learning_rate": 0.000992044817927171, + "loss": 0.5366, + "step": 388 + }, + { + "epoch": 0.21731843575418994, + "grad_norm": 0.5598587989807129, + "learning_rate": 0.0009920168067226892, + "loss": 0.405, + "step": 389 + }, + { + "epoch": 0.21787709497206703, + "grad_norm": 5.168168067932129, + "learning_rate": 0.0009919887955182074, + "loss": 0.5266, + "step": 390 + }, + { + "epoch": 0.21843575418994413, + "grad_norm": 0.7288747429847717, + "learning_rate": 0.0009919607843137256, + "loss": 0.4978, + "step": 391 + }, + { + "epoch": 0.21899441340782122, + "grad_norm": 0.8025857210159302, + "learning_rate": 0.0009919327731092438, + "loss": 0.54, + "step": 392 + }, + { + "epoch": 0.21955307262569831, + "grad_norm": 0.9832891225814819, + "learning_rate": 0.000991904761904762, + "loss": 0.5582, + "step": 393 + }, + { + "epoch": 0.2201117318435754, + "grad_norm": 0.5751308798789978, + "learning_rate": 0.0009918767507002802, + "loss": 0.4927, + "step": 394 + }, + { + "epoch": 0.2206703910614525, + "grad_norm": 0.8643447160720825, + "learning_rate": 0.0009918487394957984, + "loss": 0.6098, + "step": 395 + }, + { + "epoch": 0.2212290502793296, + "grad_norm": 0.8747020363807678, + "learning_rate": 0.0009918207282913166, + "loss": 0.5769, + "step": 396 + }, + { + "epoch": 0.2217877094972067, + "grad_norm": 1.0640156269073486, + "learning_rate": 0.0009917927170868348, + "loss": 0.5298, + "step": 397 + }, + { + "epoch": 0.2223463687150838, + "grad_norm": 1.4771331548690796, + "learning_rate": 0.000991764705882353, + "loss": 0.5066, + "step": 398 + }, + { + "epoch": 0.22290502793296088, + "grad_norm": 0.8105157017707825, + "learning_rate": 0.0009917366946778712, + "loss": 0.566, + "step": 399 + }, + { + "epoch": 0.22346368715083798, + "grad_norm": 1.858401894569397, + "learning_rate": 0.0009917086834733894, + "loss": 0.4828, + "step": 400 + }, + { + "epoch": 0.2240223463687151, + "grad_norm": 3.5270822048187256, + "learning_rate": 0.0009916806722689076, + "loss": 0.4755, + "step": 401 + }, + { + "epoch": 0.2245810055865922, + "grad_norm": 0.7943991422653198, + "learning_rate": 0.0009916526610644258, + "loss": 0.6251, + "step": 402 + }, + { + "epoch": 0.22513966480446929, + "grad_norm": 1.6298469305038452, + "learning_rate": 0.000991624649859944, + "loss": 0.5079, + "step": 403 + }, + { + "epoch": 0.22569832402234638, + "grad_norm": 1.080518364906311, + "learning_rate": 0.0009915966386554623, + "loss": 0.5656, + "step": 404 + }, + { + "epoch": 0.22625698324022347, + "grad_norm": 0.7638111710548401, + "learning_rate": 0.0009915686274509805, + "loss": 0.5934, + "step": 405 + }, + { + "epoch": 0.22681564245810057, + "grad_norm": 0.8922688961029053, + "learning_rate": 0.0009915406162464987, + "loss": 0.5072, + "step": 406 + }, + { + "epoch": 0.22737430167597766, + "grad_norm": 0.8489904403686523, + "learning_rate": 0.0009915126050420169, + "loss": 0.6803, + "step": 407 + }, + { + "epoch": 0.22793296089385476, + "grad_norm": 0.4775846302509308, + "learning_rate": 0.000991484593837535, + "loss": 0.4048, + "step": 408 + }, + { + "epoch": 0.22849162011173185, + "grad_norm": 0.7432544827461243, + "learning_rate": 0.0009914565826330533, + "loss": 0.6623, + "step": 409 + }, + { + "epoch": 0.22905027932960895, + "grad_norm": 1.2556594610214233, + "learning_rate": 0.0009914285714285715, + "loss": 0.5097, + "step": 410 + }, + { + "epoch": 0.22960893854748604, + "grad_norm": 1.8600044250488281, + "learning_rate": 0.0009914005602240897, + "loss": 0.7254, + "step": 411 + }, + { + "epoch": 0.23016759776536314, + "grad_norm": 3.979811906814575, + "learning_rate": 0.000991372549019608, + "loss": 0.7805, + "step": 412 + }, + { + "epoch": 0.23072625698324023, + "grad_norm": 0.9164400100708008, + "learning_rate": 0.000991344537815126, + "loss": 0.5806, + "step": 413 + }, + { + "epoch": 0.23128491620111732, + "grad_norm": 1.161778211593628, + "learning_rate": 0.0009913165266106443, + "loss": 0.5192, + "step": 414 + }, + { + "epoch": 0.23184357541899442, + "grad_norm": 0.6569408178329468, + "learning_rate": 0.0009912885154061625, + "loss": 0.5496, + "step": 415 + }, + { + "epoch": 0.2324022346368715, + "grad_norm": 0.6834328770637512, + "learning_rate": 0.0009912605042016807, + "loss": 0.6046, + "step": 416 + }, + { + "epoch": 0.2329608938547486, + "grad_norm": 1.3349989652633667, + "learning_rate": 0.000991232492997199, + "loss": 0.5765, + "step": 417 + }, + { + "epoch": 0.2335195530726257, + "grad_norm": 0.8419320583343506, + "learning_rate": 0.0009912044817927171, + "loss": 0.7966, + "step": 418 + }, + { + "epoch": 0.2340782122905028, + "grad_norm": 1.4713504314422607, + "learning_rate": 0.0009911764705882353, + "loss": 0.6522, + "step": 419 + }, + { + "epoch": 0.2346368715083799, + "grad_norm": 1.129390835762024, + "learning_rate": 0.0009911484593837535, + "loss": 0.4777, + "step": 420 + }, + { + "epoch": 0.23519553072625698, + "grad_norm": 0.8232699632644653, + "learning_rate": 0.0009911204481792718, + "loss": 0.4943, + "step": 421 + }, + { + "epoch": 0.23575418994413408, + "grad_norm": 0.7964234948158264, + "learning_rate": 0.00099109243697479, + "loss": 0.4717, + "step": 422 + }, + { + "epoch": 0.23631284916201117, + "grad_norm": 1.2052483558654785, + "learning_rate": 0.0009910644257703082, + "loss": 0.5989, + "step": 423 + }, + { + "epoch": 0.23687150837988827, + "grad_norm": 1.7664382457733154, + "learning_rate": 0.0009910364145658264, + "loss": 0.6418, + "step": 424 + }, + { + "epoch": 0.23743016759776536, + "grad_norm": 0.9030166864395142, + "learning_rate": 0.0009910084033613446, + "loss": 0.5307, + "step": 425 + }, + { + "epoch": 0.23798882681564246, + "grad_norm": 0.9568511247634888, + "learning_rate": 0.0009909803921568628, + "loss": 0.6072, + "step": 426 + }, + { + "epoch": 0.23854748603351955, + "grad_norm": 1.076154112815857, + "learning_rate": 0.000990952380952381, + "loss": 0.6198, + "step": 427 + }, + { + "epoch": 0.23910614525139665, + "grad_norm": 0.6711006164550781, + "learning_rate": 0.0009909243697478992, + "loss": 0.531, + "step": 428 + }, + { + "epoch": 0.23966480446927374, + "grad_norm": 0.7736319303512573, + "learning_rate": 0.0009908963585434174, + "loss": 0.6457, + "step": 429 + }, + { + "epoch": 0.24022346368715083, + "grad_norm": 1.0549379587173462, + "learning_rate": 0.0009908683473389356, + "loss": 0.6572, + "step": 430 + }, + { + "epoch": 0.24078212290502793, + "grad_norm": 1.4842422008514404, + "learning_rate": 0.0009908403361344538, + "loss": 0.5173, + "step": 431 + }, + { + "epoch": 0.24134078212290502, + "grad_norm": 0.9686875343322754, + "learning_rate": 0.000990812324929972, + "loss": 0.4831, + "step": 432 + }, + { + "epoch": 0.24189944134078212, + "grad_norm": 1.2189487218856812, + "learning_rate": 0.0009907843137254902, + "loss": 0.4976, + "step": 433 + }, + { + "epoch": 0.2424581005586592, + "grad_norm": 2.1376233100891113, + "learning_rate": 0.0009907563025210084, + "loss": 0.6494, + "step": 434 + }, + { + "epoch": 0.2430167597765363, + "grad_norm": 0.8061239719390869, + "learning_rate": 0.0009907282913165266, + "loss": 0.4959, + "step": 435 + }, + { + "epoch": 0.2435754189944134, + "grad_norm": 0.7881906032562256, + "learning_rate": 0.0009907002801120448, + "loss": 0.5501, + "step": 436 + }, + { + "epoch": 0.2441340782122905, + "grad_norm": 0.8568841218948364, + "learning_rate": 0.000990672268907563, + "loss": 0.6192, + "step": 437 + }, + { + "epoch": 0.2446927374301676, + "grad_norm": 2.3859543800354004, + "learning_rate": 0.0009906442577030813, + "loss": 0.4891, + "step": 438 + }, + { + "epoch": 0.24525139664804468, + "grad_norm": 0.701188325881958, + "learning_rate": 0.0009906162464985995, + "loss": 0.6303, + "step": 439 + }, + { + "epoch": 0.24581005586592178, + "grad_norm": 1.6035054922103882, + "learning_rate": 0.0009905882352941177, + "loss": 0.6473, + "step": 440 + }, + { + "epoch": 0.24636871508379887, + "grad_norm": 0.8858741521835327, + "learning_rate": 0.0009905602240896359, + "loss": 0.5283, + "step": 441 + }, + { + "epoch": 0.24692737430167597, + "grad_norm": 0.979948103427887, + "learning_rate": 0.000990532212885154, + "loss": 0.5517, + "step": 442 + }, + { + "epoch": 0.24748603351955306, + "grad_norm": 0.8531337380409241, + "learning_rate": 0.0009905042016806723, + "loss": 0.4889, + "step": 443 + }, + { + "epoch": 0.24804469273743016, + "grad_norm": 1.120388388633728, + "learning_rate": 0.0009904761904761905, + "loss": 0.5923, + "step": 444 + }, + { + "epoch": 0.24860335195530725, + "grad_norm": 1.450002670288086, + "learning_rate": 0.0009904481792717087, + "loss": 0.5827, + "step": 445 + }, + { + "epoch": 0.24916201117318434, + "grad_norm": 0.8190898299217224, + "learning_rate": 0.000990420168067227, + "loss": 0.4909, + "step": 446 + }, + { + "epoch": 0.24972067039106144, + "grad_norm": 1.3921219110488892, + "learning_rate": 0.0009903921568627451, + "loss": 0.6231, + "step": 447 + }, + { + "epoch": 0.25027932960893856, + "grad_norm": 4.642048358917236, + "learning_rate": 0.0009903641456582633, + "loss": 0.6153, + "step": 448 + }, + { + "epoch": 0.2508379888268156, + "grad_norm": 1.0766607522964478, + "learning_rate": 0.0009903361344537815, + "loss": 0.5779, + "step": 449 + }, + { + "epoch": 0.25139664804469275, + "grad_norm": 1.5149521827697754, + "learning_rate": 0.0009903081232492997, + "loss": 0.5621, + "step": 450 + }, + { + "epoch": 0.2519553072625698, + "grad_norm": 1.6316112279891968, + "learning_rate": 0.000990280112044818, + "loss": 0.5128, + "step": 451 + }, + { + "epoch": 0.25251396648044694, + "grad_norm": 0.7833787798881531, + "learning_rate": 0.0009902521008403361, + "loss": 0.4507, + "step": 452 + }, + { + "epoch": 0.253072625698324, + "grad_norm": 0.6542819738388062, + "learning_rate": 0.0009902240896358543, + "loss": 0.4431, + "step": 453 + }, + { + "epoch": 0.2536312849162011, + "grad_norm": 1.1062020063400269, + "learning_rate": 0.0009901960784313726, + "loss": 0.4658, + "step": 454 + }, + { + "epoch": 0.2541899441340782, + "grad_norm": 1.5975990295410156, + "learning_rate": 0.0009901680672268908, + "loss": 0.683, + "step": 455 + }, + { + "epoch": 0.2547486033519553, + "grad_norm": 0.9453422427177429, + "learning_rate": 0.000990140056022409, + "loss": 0.5247, + "step": 456 + }, + { + "epoch": 0.2553072625698324, + "grad_norm": 0.9046151638031006, + "learning_rate": 0.0009901120448179272, + "loss": 0.567, + "step": 457 + }, + { + "epoch": 0.2558659217877095, + "grad_norm": 4.5664167404174805, + "learning_rate": 0.0009900840336134454, + "loss": 0.5604, + "step": 458 + }, + { + "epoch": 0.25642458100558657, + "grad_norm": 1.0993216037750244, + "learning_rate": 0.0009900560224089636, + "loss": 0.5606, + "step": 459 + }, + { + "epoch": 0.2569832402234637, + "grad_norm": 1.1192004680633545, + "learning_rate": 0.0009900280112044818, + "loss": 0.449, + "step": 460 + }, + { + "epoch": 0.25754189944134076, + "grad_norm": 1.8525092601776123, + "learning_rate": 0.00099, + "loss": 0.5958, + "step": 461 + }, + { + "epoch": 0.2581005586592179, + "grad_norm": 0.8744431734085083, + "learning_rate": 0.0009899719887955182, + "loss": 0.6259, + "step": 462 + }, + { + "epoch": 0.25865921787709495, + "grad_norm": 0.5100352168083191, + "learning_rate": 0.0009899439775910364, + "loss": 0.5203, + "step": 463 + }, + { + "epoch": 0.25921787709497207, + "grad_norm": 2.1647567749023438, + "learning_rate": 0.0009899159663865546, + "loss": 0.4758, + "step": 464 + }, + { + "epoch": 0.25977653631284914, + "grad_norm": 1.028052806854248, + "learning_rate": 0.0009898879551820728, + "loss": 0.5388, + "step": 465 + }, + { + "epoch": 0.26033519553072626, + "grad_norm": 1.115816354751587, + "learning_rate": 0.000989859943977591, + "loss": 0.6204, + "step": 466 + }, + { + "epoch": 0.2608938547486033, + "grad_norm": 0.7185821533203125, + "learning_rate": 0.0009898319327731092, + "loss": 0.5762, + "step": 467 + }, + { + "epoch": 0.26145251396648045, + "grad_norm": 0.9607667922973633, + "learning_rate": 0.0009898039215686274, + "loss": 0.6681, + "step": 468 + }, + { + "epoch": 0.2620111731843575, + "grad_norm": 2.2605960369110107, + "learning_rate": 0.0009897759103641456, + "loss": 0.484, + "step": 469 + }, + { + "epoch": 0.26256983240223464, + "grad_norm": 1.1028248071670532, + "learning_rate": 0.0009897478991596638, + "loss": 0.511, + "step": 470 + }, + { + "epoch": 0.2631284916201117, + "grad_norm": 0.9246019124984741, + "learning_rate": 0.000989719887955182, + "loss": 0.5988, + "step": 471 + }, + { + "epoch": 0.2636871508379888, + "grad_norm": 1.1163049936294556, + "learning_rate": 0.0009896918767507003, + "loss": 0.5703, + "step": 472 + }, + { + "epoch": 0.26424581005586595, + "grad_norm": 0.9648666977882385, + "learning_rate": 0.0009896638655462185, + "loss": 0.7031, + "step": 473 + }, + { + "epoch": 0.264804469273743, + "grad_norm": 0.8844133019447327, + "learning_rate": 0.0009896358543417367, + "loss": 0.5242, + "step": 474 + }, + { + "epoch": 0.26536312849162014, + "grad_norm": 1.0383061170578003, + "learning_rate": 0.0009896078431372549, + "loss": 0.5462, + "step": 475 + }, + { + "epoch": 0.2659217877094972, + "grad_norm": 1.277012586593628, + "learning_rate": 0.000989579831932773, + "loss": 0.5399, + "step": 476 + }, + { + "epoch": 0.2664804469273743, + "grad_norm": 0.8860754370689392, + "learning_rate": 0.0009895518207282913, + "loss": 0.4776, + "step": 477 + }, + { + "epoch": 0.2670391061452514, + "grad_norm": 0.8229119181632996, + "learning_rate": 0.0009895238095238095, + "loss": 0.5386, + "step": 478 + }, + { + "epoch": 0.2675977653631285, + "grad_norm": 0.8690646290779114, + "learning_rate": 0.0009894957983193277, + "loss": 0.5621, + "step": 479 + }, + { + "epoch": 0.2681564245810056, + "grad_norm": 2.185850143432617, + "learning_rate": 0.000989467787114846, + "loss": 0.5787, + "step": 480 + }, + { + "epoch": 0.2687150837988827, + "grad_norm": 0.6414355635643005, + "learning_rate": 0.0009894397759103641, + "loss": 0.4936, + "step": 481 + }, + { + "epoch": 0.26927374301675977, + "grad_norm": 0.8222938179969788, + "learning_rate": 0.0009894117647058823, + "loss": 0.4996, + "step": 482 + }, + { + "epoch": 0.2698324022346369, + "grad_norm": 0.9511945247650146, + "learning_rate": 0.0009893837535014005, + "loss": 0.5639, + "step": 483 + }, + { + "epoch": 0.27039106145251396, + "grad_norm": 1.151688575744629, + "learning_rate": 0.0009893557422969187, + "loss": 0.5654, + "step": 484 + }, + { + "epoch": 0.2709497206703911, + "grad_norm": 0.7357352375984192, + "learning_rate": 0.000989327731092437, + "loss": 0.5087, + "step": 485 + }, + { + "epoch": 0.27150837988826815, + "grad_norm": 0.8794959187507629, + "learning_rate": 0.0009892997198879551, + "loss": 0.5731, + "step": 486 + }, + { + "epoch": 0.27206703910614527, + "grad_norm": 0.8126729726791382, + "learning_rate": 0.0009892717086834734, + "loss": 0.7194, + "step": 487 + }, + { + "epoch": 0.27262569832402234, + "grad_norm": 0.5910065770149231, + "learning_rate": 0.0009892436974789916, + "loss": 0.5187, + "step": 488 + }, + { + "epoch": 0.27318435754189946, + "grad_norm": 0.7675679922103882, + "learning_rate": 0.0009892156862745098, + "loss": 0.7064, + "step": 489 + }, + { + "epoch": 0.2737430167597765, + "grad_norm": 1.0286262035369873, + "learning_rate": 0.000989187675070028, + "loss": 0.547, + "step": 490 + }, + { + "epoch": 0.27430167597765365, + "grad_norm": 0.561172366142273, + "learning_rate": 0.0009891596638655462, + "loss": 0.5427, + "step": 491 + }, + { + "epoch": 0.2748603351955307, + "grad_norm": 0.6805617809295654, + "learning_rate": 0.0009891316526610644, + "loss": 0.4352, + "step": 492 + }, + { + "epoch": 0.27541899441340784, + "grad_norm": 1.0946743488311768, + "learning_rate": 0.0009891036414565826, + "loss": 0.5098, + "step": 493 + }, + { + "epoch": 0.2759776536312849, + "grad_norm": 0.8197351694107056, + "learning_rate": 0.0009890756302521008, + "loss": 0.5598, + "step": 494 + }, + { + "epoch": 0.276536312849162, + "grad_norm": 0.8153690695762634, + "learning_rate": 0.000989047619047619, + "loss": 0.5379, + "step": 495 + }, + { + "epoch": 0.2770949720670391, + "grad_norm": 0.7094146609306335, + "learning_rate": 0.0009890196078431372, + "loss": 0.6306, + "step": 496 + }, + { + "epoch": 0.2776536312849162, + "grad_norm": 0.5500633120536804, + "learning_rate": 0.0009889915966386554, + "loss": 0.5782, + "step": 497 + }, + { + "epoch": 0.2782122905027933, + "grad_norm": 0.6152987480163574, + "learning_rate": 0.0009889635854341736, + "loss": 0.4949, + "step": 498 + }, + { + "epoch": 0.2787709497206704, + "grad_norm": 0.7058537006378174, + "learning_rate": 0.000988935574229692, + "loss": 0.4576, + "step": 499 + }, + { + "epoch": 0.27932960893854747, + "grad_norm": 0.9381784200668335, + "learning_rate": 0.00098890756302521, + "loss": 0.4365, + "step": 500 + }, + { + "epoch": 0.27932960893854747, + "eval_cer": 0.10674500561901956, + "eval_loss": 0.39540618658065796, + "eval_runtime": 55.7698, + "eval_samples_per_second": 81.37, + "eval_steps_per_second": 5.092, + "eval_wer": 0.4124745118851429, + "step": 500 + }, + { + "epoch": 0.2798882681564246, + "grad_norm": 0.640254020690918, + "learning_rate": 0.0009888795518207282, + "loss": 0.7642, + "step": 501 + }, + { + "epoch": 0.28044692737430166, + "grad_norm": 0.6222963333129883, + "learning_rate": 0.0009888515406162464, + "loss": 0.4841, + "step": 502 + }, + { + "epoch": 0.2810055865921788, + "grad_norm": 0.6733713150024414, + "learning_rate": 0.0009888235294117646, + "loss": 0.5577, + "step": 503 + }, + { + "epoch": 0.28156424581005585, + "grad_norm": 0.5247134566307068, + "learning_rate": 0.000988795518207283, + "loss": 0.5758, + "step": 504 + }, + { + "epoch": 0.28212290502793297, + "grad_norm": 1.093283772468567, + "learning_rate": 0.000988767507002801, + "loss": 0.6831, + "step": 505 + }, + { + "epoch": 0.28268156424581004, + "grad_norm": 0.7806113362312317, + "learning_rate": 0.0009887394957983193, + "loss": 0.455, + "step": 506 + }, + { + "epoch": 0.28324022346368716, + "grad_norm": 0.6761318445205688, + "learning_rate": 0.0009887114845938375, + "loss": 0.4346, + "step": 507 + }, + { + "epoch": 0.2837988826815642, + "grad_norm": 0.8825002908706665, + "learning_rate": 0.0009886834733893557, + "loss": 0.6033, + "step": 508 + }, + { + "epoch": 0.28435754189944135, + "grad_norm": 1.0674254894256592, + "learning_rate": 0.000988655462184874, + "loss": 0.6355, + "step": 509 + }, + { + "epoch": 0.2849162011173184, + "grad_norm": 0.5881056785583496, + "learning_rate": 0.000988627450980392, + "loss": 0.6378, + "step": 510 + }, + { + "epoch": 0.28547486033519553, + "grad_norm": 1.8274636268615723, + "learning_rate": 0.0009885994397759103, + "loss": 0.6773, + "step": 511 + }, + { + "epoch": 0.2860335195530726, + "grad_norm": 0.7210019826889038, + "learning_rate": 0.0009885714285714285, + "loss": 0.4746, + "step": 512 + }, + { + "epoch": 0.2865921787709497, + "grad_norm": 0.5159186124801636, + "learning_rate": 0.0009885434173669467, + "loss": 0.4151, + "step": 513 + }, + { + "epoch": 0.2871508379888268, + "grad_norm": 0.47784191370010376, + "learning_rate": 0.0009885154061624651, + "loss": 0.4714, + "step": 514 + }, + { + "epoch": 0.2877094972067039, + "grad_norm": 2.3483059406280518, + "learning_rate": 0.0009884873949579833, + "loss": 0.5185, + "step": 515 + }, + { + "epoch": 0.288268156424581, + "grad_norm": 0.7674930691719055, + "learning_rate": 0.0009884593837535013, + "loss": 0.6903, + "step": 516 + }, + { + "epoch": 0.2888268156424581, + "grad_norm": 0.9830483794212341, + "learning_rate": 0.0009884313725490195, + "loss": 0.4401, + "step": 517 + }, + { + "epoch": 0.28938547486033517, + "grad_norm": 0.5300085544586182, + "learning_rate": 0.0009884033613445377, + "loss": 0.5136, + "step": 518 + }, + { + "epoch": 0.2899441340782123, + "grad_norm": 0.5695764422416687, + "learning_rate": 0.0009883753501400562, + "loss": 0.4509, + "step": 519 + }, + { + "epoch": 0.2905027932960894, + "grad_norm": 2.513749837875366, + "learning_rate": 0.0009883473389355744, + "loss": 0.5806, + "step": 520 + }, + { + "epoch": 0.2910614525139665, + "grad_norm": 0.5982545614242554, + "learning_rate": 0.0009883193277310924, + "loss": 0.5088, + "step": 521 + }, + { + "epoch": 0.2916201117318436, + "grad_norm": 0.5623366236686707, + "learning_rate": 0.0009882913165266106, + "loss": 0.548, + "step": 522 + }, + { + "epoch": 0.29217877094972067, + "grad_norm": 1.4201511144638062, + "learning_rate": 0.0009882633053221288, + "loss": 0.5219, + "step": 523 + }, + { + "epoch": 0.2927374301675978, + "grad_norm": 4.216770648956299, + "learning_rate": 0.0009882352941176472, + "loss": 0.4378, + "step": 524 + }, + { + "epoch": 0.29329608938547486, + "grad_norm": 1.6362286806106567, + "learning_rate": 0.0009882072829131654, + "loss": 0.4981, + "step": 525 + }, + { + "epoch": 0.293854748603352, + "grad_norm": 0.6319324374198914, + "learning_rate": 0.0009881792717086834, + "loss": 0.5748, + "step": 526 + }, + { + "epoch": 0.29441340782122905, + "grad_norm": 0.6910527348518372, + "learning_rate": 0.0009881512605042016, + "loss": 0.6463, + "step": 527 + }, + { + "epoch": 0.29497206703910617, + "grad_norm": 1.051849126815796, + "learning_rate": 0.0009881232492997198, + "loss": 0.4279, + "step": 528 + }, + { + "epoch": 0.29553072625698323, + "grad_norm": 1.8838139772415161, + "learning_rate": 0.0009880952380952382, + "loss": 0.5647, + "step": 529 + }, + { + "epoch": 0.29608938547486036, + "grad_norm": 1.694541335105896, + "learning_rate": 0.0009880672268907564, + "loss": 0.5946, + "step": 530 + }, + { + "epoch": 0.2966480446927374, + "grad_norm": 0.5781271457672119, + "learning_rate": 0.0009880392156862746, + "loss": 0.4582, + "step": 531 + }, + { + "epoch": 0.29720670391061454, + "grad_norm": 0.7101206183433533, + "learning_rate": 0.0009880112044817926, + "loss": 0.5532, + "step": 532 + }, + { + "epoch": 0.2977653631284916, + "grad_norm": 0.5283511877059937, + "learning_rate": 0.0009879831932773108, + "loss": 0.5119, + "step": 533 + }, + { + "epoch": 0.29832402234636873, + "grad_norm": 1.3297278881072998, + "learning_rate": 0.0009879551820728292, + "loss": 0.6357, + "step": 534 + }, + { + "epoch": 0.2988826815642458, + "grad_norm": 0.6172975897789001, + "learning_rate": 0.0009879271708683475, + "loss": 0.5827, + "step": 535 + }, + { + "epoch": 0.2994413407821229, + "grad_norm": 0.5623006224632263, + "learning_rate": 0.0009878991596638657, + "loss": 0.4352, + "step": 536 + }, + { + "epoch": 0.3, + "grad_norm": 0.9791301488876343, + "learning_rate": 0.0009878711484593837, + "loss": 0.4996, + "step": 537 + }, + { + "epoch": 0.3005586592178771, + "grad_norm": 0.9790188074111938, + "learning_rate": 0.0009878431372549019, + "loss": 0.5203, + "step": 538 + }, + { + "epoch": 0.3011173184357542, + "grad_norm": 0.7493875026702881, + "learning_rate": 0.0009878151260504203, + "loss": 0.5667, + "step": 539 + }, + { + "epoch": 0.3016759776536313, + "grad_norm": 1.7138361930847168, + "learning_rate": 0.0009877871148459385, + "loss": 0.5694, + "step": 540 + }, + { + "epoch": 0.30223463687150837, + "grad_norm": 1.1247279644012451, + "learning_rate": 0.0009877591036414567, + "loss": 0.5885, + "step": 541 + }, + { + "epoch": 0.3027932960893855, + "grad_norm": 0.6994509696960449, + "learning_rate": 0.0009877310924369747, + "loss": 0.5168, + "step": 542 + }, + { + "epoch": 0.30335195530726256, + "grad_norm": 1.5583826303482056, + "learning_rate": 0.0009877030812324929, + "loss": 0.5291, + "step": 543 + }, + { + "epoch": 0.3039106145251397, + "grad_norm": 0.9216806292533875, + "learning_rate": 0.0009876750700280113, + "loss": 0.4847, + "step": 544 + }, + { + "epoch": 0.30446927374301674, + "grad_norm": 1.0649412870407104, + "learning_rate": 0.0009876470588235295, + "loss": 0.6681, + "step": 545 + }, + { + "epoch": 0.30502793296089387, + "grad_norm": 0.8307594656944275, + "learning_rate": 0.0009876190476190477, + "loss": 0.539, + "step": 546 + }, + { + "epoch": 0.30558659217877093, + "grad_norm": 4.53532075881958, + "learning_rate": 0.000987591036414566, + "loss": 0.542, + "step": 547 + }, + { + "epoch": 0.30614525139664805, + "grad_norm": 1.1315875053405762, + "learning_rate": 0.000987563025210084, + "loss": 0.554, + "step": 548 + }, + { + "epoch": 0.3067039106145251, + "grad_norm": 1.2333356142044067, + "learning_rate": 0.0009875350140056023, + "loss": 0.4888, + "step": 549 + }, + { + "epoch": 0.30726256983240224, + "grad_norm": 0.7535827159881592, + "learning_rate": 0.0009875070028011205, + "loss": 0.4799, + "step": 550 + }, + { + "epoch": 0.3078212290502793, + "grad_norm": 0.681426465511322, + "learning_rate": 0.0009874789915966388, + "loss": 0.3963, + "step": 551 + }, + { + "epoch": 0.30837988826815643, + "grad_norm": 1.0495132207870483, + "learning_rate": 0.000987450980392157, + "loss": 0.4841, + "step": 552 + }, + { + "epoch": 0.3089385474860335, + "grad_norm": 0.8529074192047119, + "learning_rate": 0.000987422969187675, + "loss": 0.6456, + "step": 553 + }, + { + "epoch": 0.3094972067039106, + "grad_norm": 7.31911563873291, + "learning_rate": 0.0009873949579831934, + "loss": 0.564, + "step": 554 + }, + { + "epoch": 0.3100558659217877, + "grad_norm": 0.7274956107139587, + "learning_rate": 0.0009873669467787116, + "loss": 0.4856, + "step": 555 + }, + { + "epoch": 0.3106145251396648, + "grad_norm": 0.9564986824989319, + "learning_rate": 0.0009873389355742298, + "loss": 0.7139, + "step": 556 + }, + { + "epoch": 0.3111731843575419, + "grad_norm": 1.6834135055541992, + "learning_rate": 0.000987310924369748, + "loss": 0.4995, + "step": 557 + }, + { + "epoch": 0.311731843575419, + "grad_norm": 3.4599685668945312, + "learning_rate": 0.000987282913165266, + "loss": 0.5741, + "step": 558 + }, + { + "epoch": 0.31229050279329607, + "grad_norm": 0.7904874682426453, + "learning_rate": 0.0009872549019607844, + "loss": 0.464, + "step": 559 + }, + { + "epoch": 0.3128491620111732, + "grad_norm": 1.5327842235565186, + "learning_rate": 0.0009872268907563026, + "loss": 0.5896, + "step": 560 + }, + { + "epoch": 0.31340782122905025, + "grad_norm": 0.861058235168457, + "learning_rate": 0.0009871988795518208, + "loss": 0.4847, + "step": 561 + }, + { + "epoch": 0.3139664804469274, + "grad_norm": 0.7388689517974854, + "learning_rate": 0.000987170868347339, + "loss": 0.4693, + "step": 562 + }, + { + "epoch": 0.31452513966480444, + "grad_norm": 0.5862420201301575, + "learning_rate": 0.0009871428571428572, + "loss": 0.6276, + "step": 563 + }, + { + "epoch": 0.31508379888268156, + "grad_norm": 0.8733795285224915, + "learning_rate": 0.0009871148459383754, + "loss": 0.5302, + "step": 564 + }, + { + "epoch": 0.31564245810055863, + "grad_norm": 1.1133688688278198, + "learning_rate": 0.0009870868347338936, + "loss": 0.4618, + "step": 565 + }, + { + "epoch": 0.31620111731843575, + "grad_norm": 0.9358701705932617, + "learning_rate": 0.0009870588235294118, + "loss": 0.4899, + "step": 566 + }, + { + "epoch": 0.3167597765363129, + "grad_norm": 0.857302188873291, + "learning_rate": 0.00098703081232493, + "loss": 0.6417, + "step": 567 + }, + { + "epoch": 0.31731843575418994, + "grad_norm": 0.8492801189422607, + "learning_rate": 0.0009870028011204483, + "loss": 0.4556, + "step": 568 + }, + { + "epoch": 0.31787709497206706, + "grad_norm": 0.9835643172264099, + "learning_rate": 0.0009869747899159665, + "loss": 0.5778, + "step": 569 + }, + { + "epoch": 0.31843575418994413, + "grad_norm": 1.7731804847717285, + "learning_rate": 0.0009869467787114847, + "loss": 0.5371, + "step": 570 + }, + { + "epoch": 0.31899441340782125, + "grad_norm": 0.8389686942100525, + "learning_rate": 0.0009869187675070029, + "loss": 0.4409, + "step": 571 + }, + { + "epoch": 0.3195530726256983, + "grad_norm": 1.047930121421814, + "learning_rate": 0.000986890756302521, + "loss": 0.5879, + "step": 572 + }, + { + "epoch": 0.32011173184357544, + "grad_norm": 0.7291346192359924, + "learning_rate": 0.0009868627450980393, + "loss": 0.555, + "step": 573 + }, + { + "epoch": 0.3206703910614525, + "grad_norm": 0.9129700064659119, + "learning_rate": 0.0009868347338935575, + "loss": 0.5871, + "step": 574 + }, + { + "epoch": 0.32122905027932963, + "grad_norm": 0.8879346251487732, + "learning_rate": 0.0009868067226890757, + "loss": 0.4973, + "step": 575 + }, + { + "epoch": 0.3217877094972067, + "grad_norm": 1.487825870513916, + "learning_rate": 0.000986778711484594, + "loss": 0.5086, + "step": 576 + }, + { + "epoch": 0.3223463687150838, + "grad_norm": 0.638690710067749, + "learning_rate": 0.000986750700280112, + "loss": 0.486, + "step": 577 + }, + { + "epoch": 0.3229050279329609, + "grad_norm": 0.4099404215812683, + "learning_rate": 0.0009867226890756303, + "loss": 0.4206, + "step": 578 + }, + { + "epoch": 0.323463687150838, + "grad_norm": 1.078679084777832, + "learning_rate": 0.0009866946778711485, + "loss": 0.6084, + "step": 579 + }, + { + "epoch": 0.3240223463687151, + "grad_norm": 0.818130373954773, + "learning_rate": 0.0009866666666666667, + "loss": 0.5727, + "step": 580 + }, + { + "epoch": 0.3245810055865922, + "grad_norm": 6.597809314727783, + "learning_rate": 0.000986638655462185, + "loss": 0.4981, + "step": 581 + }, + { + "epoch": 0.32513966480446926, + "grad_norm": 0.6093296408653259, + "learning_rate": 0.0009866106442577031, + "loss": 0.543, + "step": 582 + }, + { + "epoch": 0.3256983240223464, + "grad_norm": 0.7406026721000671, + "learning_rate": 0.0009865826330532213, + "loss": 0.4562, + "step": 583 + }, + { + "epoch": 0.32625698324022345, + "grad_norm": 0.7541854977607727, + "learning_rate": 0.0009865546218487395, + "loss": 0.5853, + "step": 584 + }, + { + "epoch": 0.3268156424581006, + "grad_norm": 0.8364110589027405, + "learning_rate": 0.0009865266106442578, + "loss": 0.6134, + "step": 585 + }, + { + "epoch": 0.32737430167597764, + "grad_norm": 0.9898073673248291, + "learning_rate": 0.000986498599439776, + "loss": 0.5891, + "step": 586 + }, + { + "epoch": 0.32793296089385476, + "grad_norm": 0.7348021864891052, + "learning_rate": 0.0009864705882352942, + "loss": 0.5638, + "step": 587 + }, + { + "epoch": 0.32849162011173183, + "grad_norm": 0.566052258014679, + "learning_rate": 0.0009864425770308124, + "loss": 0.6911, + "step": 588 + }, + { + "epoch": 0.32905027932960895, + "grad_norm": 5.006649017333984, + "learning_rate": 0.0009864145658263306, + "loss": 0.4984, + "step": 589 + }, + { + "epoch": 0.329608938547486, + "grad_norm": 1.3679720163345337, + "learning_rate": 0.0009863865546218488, + "loss": 0.451, + "step": 590 + }, + { + "epoch": 0.33016759776536314, + "grad_norm": 0.7462795972824097, + "learning_rate": 0.000986358543417367, + "loss": 0.4371, + "step": 591 + }, + { + "epoch": 0.3307262569832402, + "grad_norm": 0.9732274413108826, + "learning_rate": 0.0009863305322128852, + "loss": 0.6407, + "step": 592 + }, + { + "epoch": 0.33128491620111733, + "grad_norm": 0.7388654351234436, + "learning_rate": 0.0009863025210084034, + "loss": 0.4886, + "step": 593 + }, + { + "epoch": 0.3318435754189944, + "grad_norm": 0.8793904185295105, + "learning_rate": 0.0009862745098039216, + "loss": 0.4647, + "step": 594 + }, + { + "epoch": 0.3324022346368715, + "grad_norm": 0.9062551856040955, + "learning_rate": 0.0009862464985994398, + "loss": 0.5031, + "step": 595 + }, + { + "epoch": 0.3329608938547486, + "grad_norm": 3.0678863525390625, + "learning_rate": 0.000986218487394958, + "loss": 0.6406, + "step": 596 + }, + { + "epoch": 0.3335195530726257, + "grad_norm": 0.9255324006080627, + "learning_rate": 0.0009861904761904762, + "loss": 0.56, + "step": 597 + }, + { + "epoch": 0.3340782122905028, + "grad_norm": 0.6190474033355713, + "learning_rate": 0.0009861624649859944, + "loss": 0.5146, + "step": 598 + }, + { + "epoch": 0.3346368715083799, + "grad_norm": 0.7674522399902344, + "learning_rate": 0.0009861344537815126, + "loss": 0.5423, + "step": 599 + }, + { + "epoch": 0.33519553072625696, + "grad_norm": 1.7365634441375732, + "learning_rate": 0.0009861064425770308, + "loss": 0.5815, + "step": 600 + }, + { + "epoch": 0.3357541899441341, + "grad_norm": 0.9577219486236572, + "learning_rate": 0.000986078431372549, + "loss": 0.5612, + "step": 601 + }, + { + "epoch": 0.33631284916201115, + "grad_norm": 0.9474151730537415, + "learning_rate": 0.0009860504201680673, + "loss": 0.512, + "step": 602 + }, + { + "epoch": 0.3368715083798883, + "grad_norm": 0.5228251814842224, + "learning_rate": 0.0009860224089635855, + "loss": 0.594, + "step": 603 + }, + { + "epoch": 0.33743016759776534, + "grad_norm": 0.7312942743301392, + "learning_rate": 0.0009859943977591037, + "loss": 0.5573, + "step": 604 + }, + { + "epoch": 0.33798882681564246, + "grad_norm": 0.7236953377723694, + "learning_rate": 0.0009859663865546219, + "loss": 0.6495, + "step": 605 + }, + { + "epoch": 0.33854748603351953, + "grad_norm": 0.749528169631958, + "learning_rate": 0.00098593837535014, + "loss": 0.498, + "step": 606 + }, + { + "epoch": 0.33910614525139665, + "grad_norm": 1.3720815181732178, + "learning_rate": 0.0009859103641456583, + "loss": 0.5544, + "step": 607 + }, + { + "epoch": 0.3396648044692737, + "grad_norm": 1.715617060661316, + "learning_rate": 0.0009858823529411765, + "loss": 0.4906, + "step": 608 + }, + { + "epoch": 0.34022346368715084, + "grad_norm": 0.5368160009384155, + "learning_rate": 0.0009858543417366947, + "loss": 0.4501, + "step": 609 + }, + { + "epoch": 0.3407821229050279, + "grad_norm": 0.6735332012176514, + "learning_rate": 0.000985826330532213, + "loss": 0.5236, + "step": 610 + }, + { + "epoch": 0.34134078212290503, + "grad_norm": 0.6093345880508423, + "learning_rate": 0.0009857983193277311, + "loss": 0.4786, + "step": 611 + }, + { + "epoch": 0.3418994413407821, + "grad_norm": 1.024030327796936, + "learning_rate": 0.0009857703081232493, + "loss": 0.5203, + "step": 612 + }, + { + "epoch": 0.3424581005586592, + "grad_norm": 0.609090268611908, + "learning_rate": 0.0009857422969187675, + "loss": 0.4912, + "step": 613 + }, + { + "epoch": 0.34301675977653634, + "grad_norm": 0.5807856917381287, + "learning_rate": 0.0009857142857142857, + "loss": 0.4129, + "step": 614 + }, + { + "epoch": 0.3435754189944134, + "grad_norm": 0.8883734345436096, + "learning_rate": 0.000985686274509804, + "loss": 0.5617, + "step": 615 + }, + { + "epoch": 0.34413407821229053, + "grad_norm": 8.983072280883789, + "learning_rate": 0.0009856582633053221, + "loss": 0.5198, + "step": 616 + }, + { + "epoch": 0.3446927374301676, + "grad_norm": 1.6517291069030762, + "learning_rate": 0.0009856302521008403, + "loss": 0.5649, + "step": 617 + }, + { + "epoch": 0.3452513966480447, + "grad_norm": 0.8718855977058411, + "learning_rate": 0.0009856022408963586, + "loss": 0.621, + "step": 618 + }, + { + "epoch": 0.3458100558659218, + "grad_norm": 0.5957295298576355, + "learning_rate": 0.0009855742296918768, + "loss": 0.4323, + "step": 619 + }, + { + "epoch": 0.3463687150837989, + "grad_norm": 4.571535110473633, + "learning_rate": 0.000985546218487395, + "loss": 0.6198, + "step": 620 + }, + { + "epoch": 0.346927374301676, + "grad_norm": 0.9922893643379211, + "learning_rate": 0.0009855182072829132, + "loss": 0.5402, + "step": 621 + }, + { + "epoch": 0.3474860335195531, + "grad_norm": 0.8614273071289062, + "learning_rate": 0.0009854901960784314, + "loss": 0.4172, + "step": 622 + }, + { + "epoch": 0.34804469273743016, + "grad_norm": 1.1592367887496948, + "learning_rate": 0.0009854621848739496, + "loss": 0.6982, + "step": 623 + }, + { + "epoch": 0.3486033519553073, + "grad_norm": 0.8100976347923279, + "learning_rate": 0.0009854341736694678, + "loss": 0.5332, + "step": 624 + }, + { + "epoch": 0.34916201117318435, + "grad_norm": 1.0766860246658325, + "learning_rate": 0.000985406162464986, + "loss": 0.6281, + "step": 625 + }, + { + "epoch": 0.34972067039106147, + "grad_norm": 0.5531345009803772, + "learning_rate": 0.0009853781512605042, + "loss": 0.4697, + "step": 626 + }, + { + "epoch": 0.35027932960893854, + "grad_norm": 0.6653377413749695, + "learning_rate": 0.0009853501400560224, + "loss": 0.476, + "step": 627 + }, + { + "epoch": 0.35083798882681566, + "grad_norm": 0.7839998006820679, + "learning_rate": 0.0009853221288515406, + "loss": 0.4844, + "step": 628 + }, + { + "epoch": 0.3513966480446927, + "grad_norm": 0.9125815629959106, + "learning_rate": 0.0009852941176470588, + "loss": 0.55, + "step": 629 + }, + { + "epoch": 0.35195530726256985, + "grad_norm": 0.7828765511512756, + "learning_rate": 0.000985266106442577, + "loss": 0.464, + "step": 630 + }, + { + "epoch": 0.3525139664804469, + "grad_norm": 0.9206963181495667, + "learning_rate": 0.0009852380952380952, + "loss": 0.6883, + "step": 631 + }, + { + "epoch": 0.35307262569832404, + "grad_norm": 1.4578670263290405, + "learning_rate": 0.0009852100840336134, + "loss": 0.5362, + "step": 632 + }, + { + "epoch": 0.3536312849162011, + "grad_norm": 0.7980212569236755, + "learning_rate": 0.0009851820728291316, + "loss": 0.5009, + "step": 633 + }, + { + "epoch": 0.3541899441340782, + "grad_norm": 0.9872021675109863, + "learning_rate": 0.0009851540616246498, + "loss": 0.3887, + "step": 634 + }, + { + "epoch": 0.3547486033519553, + "grad_norm": 1.398852825164795, + "learning_rate": 0.000985126050420168, + "loss": 0.6557, + "step": 635 + }, + { + "epoch": 0.3553072625698324, + "grad_norm": 0.9444912672042847, + "learning_rate": 0.0009850980392156863, + "loss": 0.5873, + "step": 636 + }, + { + "epoch": 0.3558659217877095, + "grad_norm": 1.0968694686889648, + "learning_rate": 0.0009850700280112045, + "loss": 0.5259, + "step": 637 + }, + { + "epoch": 0.3564245810055866, + "grad_norm": 0.5139156579971313, + "learning_rate": 0.0009850420168067227, + "loss": 0.5205, + "step": 638 + }, + { + "epoch": 0.35698324022346367, + "grad_norm": 1.0068995952606201, + "learning_rate": 0.0009850140056022409, + "loss": 0.5287, + "step": 639 + }, + { + "epoch": 0.3575418994413408, + "grad_norm": 2.2291014194488525, + "learning_rate": 0.000984985994397759, + "loss": 0.5895, + "step": 640 + }, + { + "epoch": 0.35810055865921786, + "grad_norm": 1.0994702577590942, + "learning_rate": 0.0009849579831932773, + "loss": 0.6271, + "step": 641 + }, + { + "epoch": 0.358659217877095, + "grad_norm": 0.701944887638092, + "learning_rate": 0.0009849299719887955, + "loss": 0.5157, + "step": 642 + }, + { + "epoch": 0.35921787709497205, + "grad_norm": 1.1416760683059692, + "learning_rate": 0.0009849019607843137, + "loss": 0.7159, + "step": 643 + }, + { + "epoch": 0.35977653631284917, + "grad_norm": 0.7814168930053711, + "learning_rate": 0.000984873949579832, + "loss": 0.397, + "step": 644 + }, + { + "epoch": 0.36033519553072624, + "grad_norm": 0.5681543946266174, + "learning_rate": 0.0009848459383753501, + "loss": 0.5497, + "step": 645 + }, + { + "epoch": 0.36089385474860336, + "grad_norm": 0.5196084380149841, + "learning_rate": 0.0009848179271708683, + "loss": 0.4015, + "step": 646 + }, + { + "epoch": 0.3614525139664804, + "grad_norm": 0.8165639042854309, + "learning_rate": 0.0009847899159663865, + "loss": 0.5294, + "step": 647 + }, + { + "epoch": 0.36201117318435755, + "grad_norm": 0.7947068810462952, + "learning_rate": 0.0009847619047619047, + "loss": 0.6244, + "step": 648 + }, + { + "epoch": 0.3625698324022346, + "grad_norm": 0.5430769920349121, + "learning_rate": 0.000984733893557423, + "loss": 0.514, + "step": 649 + }, + { + "epoch": 0.36312849162011174, + "grad_norm": 5.341940402984619, + "learning_rate": 0.0009847058823529411, + "loss": 0.4679, + "step": 650 + }, + { + "epoch": 0.3636871508379888, + "grad_norm": 0.7629307508468628, + "learning_rate": 0.0009846778711484594, + "loss": 0.7608, + "step": 651 + }, + { + "epoch": 0.3642458100558659, + "grad_norm": 0.6666110754013062, + "learning_rate": 0.0009846498599439776, + "loss": 0.5415, + "step": 652 + }, + { + "epoch": 0.364804469273743, + "grad_norm": 0.8909905552864075, + "learning_rate": 0.0009846218487394958, + "loss": 0.453, + "step": 653 + }, + { + "epoch": 0.3653631284916201, + "grad_norm": 0.8186153769493103, + "learning_rate": 0.0009845938375350142, + "loss": 0.4321, + "step": 654 + }, + { + "epoch": 0.3659217877094972, + "grad_norm": 0.7347453236579895, + "learning_rate": 0.0009845658263305322, + "loss": 0.3973, + "step": 655 + }, + { + "epoch": 0.3664804469273743, + "grad_norm": 0.8336395621299744, + "learning_rate": 0.0009845378151260504, + "loss": 0.5548, + "step": 656 + }, + { + "epoch": 0.36703910614525137, + "grad_norm": 0.8416475057601929, + "learning_rate": 0.0009845098039215686, + "loss": 0.4945, + "step": 657 + }, + { + "epoch": 0.3675977653631285, + "grad_norm": 0.8832162022590637, + "learning_rate": 0.0009844817927170868, + "loss": 0.5359, + "step": 658 + }, + { + "epoch": 0.36815642458100556, + "grad_norm": 0.6441562175750732, + "learning_rate": 0.0009844537815126052, + "loss": 0.4669, + "step": 659 + }, + { + "epoch": 0.3687150837988827, + "grad_norm": 1.0926862955093384, + "learning_rate": 0.0009844257703081232, + "loss": 0.5395, + "step": 660 + }, + { + "epoch": 0.3692737430167598, + "grad_norm": 0.9024230241775513, + "learning_rate": 0.0009843977591036414, + "loss": 0.543, + "step": 661 + }, + { + "epoch": 0.36983240223463687, + "grad_norm": 0.9043408036231995, + "learning_rate": 0.0009843697478991596, + "loss": 0.486, + "step": 662 + }, + { + "epoch": 0.370391061452514, + "grad_norm": 0.5910623073577881, + "learning_rate": 0.0009843417366946778, + "loss": 0.4853, + "step": 663 + }, + { + "epoch": 0.37094972067039106, + "grad_norm": 0.5296788811683655, + "learning_rate": 0.000984313725490196, + "loss": 0.438, + "step": 664 + }, + { + "epoch": 0.3715083798882682, + "grad_norm": 0.763145923614502, + "learning_rate": 0.0009842857142857142, + "loss": 0.5428, + "step": 665 + }, + { + "epoch": 0.37206703910614525, + "grad_norm": 1.2509864568710327, + "learning_rate": 0.0009842577030812324, + "loss": 0.6036, + "step": 666 + }, + { + "epoch": 0.37262569832402237, + "grad_norm": 0.636996328830719, + "learning_rate": 0.0009842296918767506, + "loss": 0.477, + "step": 667 + }, + { + "epoch": 0.37318435754189944, + "grad_norm": 0.8803697228431702, + "learning_rate": 0.0009842016806722689, + "loss": 0.5022, + "step": 668 + }, + { + "epoch": 0.37374301675977656, + "grad_norm": 0.5829938054084778, + "learning_rate": 0.000984173669467787, + "loss": 0.4451, + "step": 669 + }, + { + "epoch": 0.3743016759776536, + "grad_norm": 0.48007455468177795, + "learning_rate": 0.0009841456582633055, + "loss": 0.42, + "step": 670 + }, + { + "epoch": 0.37486033519553075, + "grad_norm": 1.1333321332931519, + "learning_rate": 0.0009841176470588235, + "loss": 0.4739, + "step": 671 + }, + { + "epoch": 0.3754189944134078, + "grad_norm": 0.5421041250228882, + "learning_rate": 0.0009840896358543417, + "loss": 0.5519, + "step": 672 + }, + { + "epoch": 0.37597765363128494, + "grad_norm": 0.621100902557373, + "learning_rate": 0.0009840616246498599, + "loss": 0.4365, + "step": 673 + }, + { + "epoch": 0.376536312849162, + "grad_norm": 1.0602147579193115, + "learning_rate": 0.000984033613445378, + "loss": 0.5093, + "step": 674 + }, + { + "epoch": 0.3770949720670391, + "grad_norm": 0.9669912457466125, + "learning_rate": 0.0009840056022408965, + "loss": 0.6225, + "step": 675 + }, + { + "epoch": 0.3776536312849162, + "grad_norm": 0.6287827491760254, + "learning_rate": 0.0009839775910364145, + "loss": 0.3752, + "step": 676 + }, + { + "epoch": 0.3782122905027933, + "grad_norm": 0.7915810346603394, + "learning_rate": 0.0009839495798319327, + "loss": 0.5571, + "step": 677 + }, + { + "epoch": 0.3787709497206704, + "grad_norm": 0.8399115204811096, + "learning_rate": 0.000983921568627451, + "loss": 0.6311, + "step": 678 + }, + { + "epoch": 0.3793296089385475, + "grad_norm": 1.0199395418167114, + "learning_rate": 0.0009838935574229691, + "loss": 0.6158, + "step": 679 + }, + { + "epoch": 0.37988826815642457, + "grad_norm": 0.7124590277671814, + "learning_rate": 0.0009838655462184875, + "loss": 0.4925, + "step": 680 + }, + { + "epoch": 0.3804469273743017, + "grad_norm": 0.5956300497055054, + "learning_rate": 0.0009838375350140055, + "loss": 0.5394, + "step": 681 + }, + { + "epoch": 0.38100558659217876, + "grad_norm": 0.5080326199531555, + "learning_rate": 0.0009838095238095237, + "loss": 0.5528, + "step": 682 + }, + { + "epoch": 0.3815642458100559, + "grad_norm": 0.6605573892593384, + "learning_rate": 0.000983781512605042, + "loss": 0.4708, + "step": 683 + }, + { + "epoch": 0.38212290502793295, + "grad_norm": 0.8146275281906128, + "learning_rate": 0.0009837535014005601, + "loss": 0.578, + "step": 684 + }, + { + "epoch": 0.38268156424581007, + "grad_norm": 1.4359737634658813, + "learning_rate": 0.0009837254901960786, + "loss": 0.442, + "step": 685 + }, + { + "epoch": 0.38324022346368714, + "grad_norm": 0.7695728540420532, + "learning_rate": 0.0009836974789915968, + "loss": 0.5112, + "step": 686 + }, + { + "epoch": 0.38379888268156426, + "grad_norm": 0.7323272228240967, + "learning_rate": 0.0009836694677871148, + "loss": 0.5192, + "step": 687 + }, + { + "epoch": 0.3843575418994413, + "grad_norm": 0.9150975942611694, + "learning_rate": 0.000983641456582633, + "loss": 0.4296, + "step": 688 + }, + { + "epoch": 0.38491620111731845, + "grad_norm": 1.4393677711486816, + "learning_rate": 0.0009836134453781512, + "loss": 0.4992, + "step": 689 + }, + { + "epoch": 0.3854748603351955, + "grad_norm": 1.056930661201477, + "learning_rate": 0.0009835854341736696, + "loss": 0.4393, + "step": 690 + }, + { + "epoch": 0.38603351955307263, + "grad_norm": 0.8621572256088257, + "learning_rate": 0.0009835574229691878, + "loss": 0.5995, + "step": 691 + }, + { + "epoch": 0.3865921787709497, + "grad_norm": 0.9120810031890869, + "learning_rate": 0.0009835294117647058, + "loss": 0.5515, + "step": 692 + }, + { + "epoch": 0.3871508379888268, + "grad_norm": 0.6020013093948364, + "learning_rate": 0.000983501400560224, + "loss": 0.4929, + "step": 693 + }, + { + "epoch": 0.3877094972067039, + "grad_norm": 0.8536820411682129, + "learning_rate": 0.0009834733893557422, + "loss": 0.4239, + "step": 694 + }, + { + "epoch": 0.388268156424581, + "grad_norm": 1.2177194356918335, + "learning_rate": 0.0009834453781512606, + "loss": 0.548, + "step": 695 + }, + { + "epoch": 0.3888268156424581, + "grad_norm": 0.8397508859634399, + "learning_rate": 0.0009834173669467788, + "loss": 0.5709, + "step": 696 + }, + { + "epoch": 0.3893854748603352, + "grad_norm": 0.75736403465271, + "learning_rate": 0.0009833893557422968, + "loss": 0.5774, + "step": 697 + }, + { + "epoch": 0.38994413407821227, + "grad_norm": 0.4960309863090515, + "learning_rate": 0.000983361344537815, + "loss": 0.485, + "step": 698 + }, + { + "epoch": 0.3905027932960894, + "grad_norm": 0.6970162391662598, + "learning_rate": 0.0009833333333333332, + "loss": 0.3803, + "step": 699 + }, + { + "epoch": 0.39106145251396646, + "grad_norm": 0.5962986946105957, + "learning_rate": 0.0009833053221288517, + "loss": 0.4879, + "step": 700 + }, + { + "epoch": 0.3916201117318436, + "grad_norm": 0.5089249610900879, + "learning_rate": 0.0009832773109243699, + "loss": 0.3975, + "step": 701 + }, + { + "epoch": 0.39217877094972065, + "grad_norm": 0.9010866284370422, + "learning_rate": 0.000983249299719888, + "loss": 0.3997, + "step": 702 + }, + { + "epoch": 0.39273743016759777, + "grad_norm": 0.587181568145752, + "learning_rate": 0.000983221288515406, + "loss": 0.4275, + "step": 703 + }, + { + "epoch": 0.39329608938547483, + "grad_norm": 0.6018347144126892, + "learning_rate": 0.0009831932773109243, + "loss": 0.5464, + "step": 704 + }, + { + "epoch": 0.39385474860335196, + "grad_norm": 0.748507559299469, + "learning_rate": 0.0009831652661064427, + "loss": 0.46, + "step": 705 + }, + { + "epoch": 0.394413407821229, + "grad_norm": 0.5247789025306702, + "learning_rate": 0.000983137254901961, + "loss": 0.4726, + "step": 706 + }, + { + "epoch": 0.39497206703910615, + "grad_norm": 0.43686094880104065, + "learning_rate": 0.000983109243697479, + "loss": 0.4056, + "step": 707 + }, + { + "epoch": 0.39553072625698327, + "grad_norm": 1.0581320524215698, + "learning_rate": 0.000983081232492997, + "loss": 0.6545, + "step": 708 + }, + { + "epoch": 0.39608938547486033, + "grad_norm": 0.581413984298706, + "learning_rate": 0.0009830532212885153, + "loss": 0.3951, + "step": 709 + }, + { + "epoch": 0.39664804469273746, + "grad_norm": 0.5816717147827148, + "learning_rate": 0.0009830252100840337, + "loss": 0.4465, + "step": 710 + }, + { + "epoch": 0.3972067039106145, + "grad_norm": 0.6564436554908752, + "learning_rate": 0.000982997198879552, + "loss": 0.4934, + "step": 711 + }, + { + "epoch": 0.39776536312849164, + "grad_norm": 0.5008072853088379, + "learning_rate": 0.0009829691876750701, + "loss": 0.4224, + "step": 712 + }, + { + "epoch": 0.3983240223463687, + "grad_norm": 0.6331523656845093, + "learning_rate": 0.0009829411764705881, + "loss": 0.5258, + "step": 713 + }, + { + "epoch": 0.39888268156424583, + "grad_norm": 0.5446946620941162, + "learning_rate": 0.0009829131652661063, + "loss": 0.5251, + "step": 714 + }, + { + "epoch": 0.3994413407821229, + "grad_norm": 1.016221284866333, + "learning_rate": 0.0009828851540616248, + "loss": 0.6111, + "step": 715 + }, + { + "epoch": 0.4, + "grad_norm": 0.5130570530891418, + "learning_rate": 0.000982857142857143, + "loss": 0.465, + "step": 716 + }, + { + "epoch": 0.4005586592178771, + "grad_norm": 0.7790815234184265, + "learning_rate": 0.0009828291316526612, + "loss": 0.5953, + "step": 717 + }, + { + "epoch": 0.4011173184357542, + "grad_norm": 0.6934060454368591, + "learning_rate": 0.0009828011204481794, + "loss": 0.5043, + "step": 718 + }, + { + "epoch": 0.4016759776536313, + "grad_norm": 0.6301409602165222, + "learning_rate": 0.0009827731092436974, + "loss": 0.5164, + "step": 719 + }, + { + "epoch": 0.4022346368715084, + "grad_norm": 0.7534012198448181, + "learning_rate": 0.0009827450980392158, + "loss": 0.5121, + "step": 720 + }, + { + "epoch": 0.40279329608938547, + "grad_norm": 0.6056898236274719, + "learning_rate": 0.000982717086834734, + "loss": 0.5822, + "step": 721 + }, + { + "epoch": 0.4033519553072626, + "grad_norm": 1.156616449356079, + "learning_rate": 0.0009826890756302522, + "loss": 0.5301, + "step": 722 + }, + { + "epoch": 0.40391061452513966, + "grad_norm": 1.0898436307907104, + "learning_rate": 0.0009826610644257704, + "loss": 0.3878, + "step": 723 + }, + { + "epoch": 0.4044692737430168, + "grad_norm": 0.7641041874885559, + "learning_rate": 0.0009826330532212884, + "loss": 0.6096, + "step": 724 + }, + { + "epoch": 0.40502793296089384, + "grad_norm": 0.5866867899894714, + "learning_rate": 0.0009826050420168068, + "loss": 0.5536, + "step": 725 + }, + { + "epoch": 0.40558659217877097, + "grad_norm": 5.730029582977295, + "learning_rate": 0.000982577030812325, + "loss": 0.4713, + "step": 726 + }, + { + "epoch": 0.40614525139664803, + "grad_norm": 1.1350740194320679, + "learning_rate": 0.0009825490196078432, + "loss": 0.5403, + "step": 727 + }, + { + "epoch": 0.40670391061452515, + "grad_norm": 1.1513550281524658, + "learning_rate": 0.0009825210084033614, + "loss": 0.4084, + "step": 728 + }, + { + "epoch": 0.4072625698324022, + "grad_norm": 0.6230626702308655, + "learning_rate": 0.0009824929971988794, + "loss": 0.5078, + "step": 729 + }, + { + "epoch": 0.40782122905027934, + "grad_norm": 0.6454103589057922, + "learning_rate": 0.0009824649859943978, + "loss": 0.5555, + "step": 730 + }, + { + "epoch": 0.4083798882681564, + "grad_norm": 0.5713451504707336, + "learning_rate": 0.000982436974789916, + "loss": 0.4608, + "step": 731 + }, + { + "epoch": 0.40893854748603353, + "grad_norm": 0.7829291820526123, + "learning_rate": 0.0009824089635854343, + "loss": 0.447, + "step": 732 + }, + { + "epoch": 0.4094972067039106, + "grad_norm": 1.2563079595565796, + "learning_rate": 0.0009823809523809525, + "loss": 0.6568, + "step": 733 + }, + { + "epoch": 0.4100558659217877, + "grad_norm": 9.718208312988281, + "learning_rate": 0.0009823529411764707, + "loss": 0.682, + "step": 734 + }, + { + "epoch": 0.4106145251396648, + "grad_norm": 0.726872444152832, + "learning_rate": 0.0009823249299719889, + "loss": 0.5385, + "step": 735 + }, + { + "epoch": 0.4111731843575419, + "grad_norm": 0.5961375832557678, + "learning_rate": 0.000982296918767507, + "loss": 0.5955, + "step": 736 + }, + { + "epoch": 0.411731843575419, + "grad_norm": 6.342020034790039, + "learning_rate": 0.0009822689075630253, + "loss": 0.4763, + "step": 737 + }, + { + "epoch": 0.4122905027932961, + "grad_norm": 0.8904435038566589, + "learning_rate": 0.0009822408963585435, + "loss": 0.5814, + "step": 738 + }, + { + "epoch": 0.41284916201117317, + "grad_norm": 0.5107899308204651, + "learning_rate": 0.0009822128851540617, + "loss": 0.4904, + "step": 739 + }, + { + "epoch": 0.4134078212290503, + "grad_norm": 0.8592984676361084, + "learning_rate": 0.00098218487394958, + "loss": 0.5191, + "step": 740 + }, + { + "epoch": 0.41396648044692735, + "grad_norm": 0.6866621971130371, + "learning_rate": 0.000982156862745098, + "loss": 0.6135, + "step": 741 + }, + { + "epoch": 0.4145251396648045, + "grad_norm": 0.9929611086845398, + "learning_rate": 0.0009821288515406163, + "loss": 0.5175, + "step": 742 + }, + { + "epoch": 0.41508379888268154, + "grad_norm": 4.138786315917969, + "learning_rate": 0.0009821008403361345, + "loss": 0.5835, + "step": 743 + }, + { + "epoch": 0.41564245810055866, + "grad_norm": 0.5353333950042725, + "learning_rate": 0.0009820728291316527, + "loss": 0.4488, + "step": 744 + }, + { + "epoch": 0.41620111731843573, + "grad_norm": 0.7421092987060547, + "learning_rate": 0.000982044817927171, + "loss": 0.4024, + "step": 745 + }, + { + "epoch": 0.41675977653631285, + "grad_norm": 0.5338488817214966, + "learning_rate": 0.0009820168067226891, + "loss": 0.51, + "step": 746 + }, + { + "epoch": 0.4173184357541899, + "grad_norm": 1.0539344549179077, + "learning_rate": 0.0009819887955182073, + "loss": 0.4863, + "step": 747 + }, + { + "epoch": 0.41787709497206704, + "grad_norm": 1.0292673110961914, + "learning_rate": 0.0009819607843137255, + "loss": 0.6767, + "step": 748 + }, + { + "epoch": 0.4184357541899441, + "grad_norm": 0.8841199278831482, + "learning_rate": 0.0009819327731092438, + "loss": 0.5283, + "step": 749 + }, + { + "epoch": 0.41899441340782123, + "grad_norm": 0.4805144965648651, + "learning_rate": 0.000981904761904762, + "loss": 0.5039, + "step": 750 + }, + { + "epoch": 0.4195530726256983, + "grad_norm": 0.49522754549980164, + "learning_rate": 0.0009818767507002802, + "loss": 0.4332, + "step": 751 + }, + { + "epoch": 0.4201117318435754, + "grad_norm": 1.9347999095916748, + "learning_rate": 0.0009818487394957984, + "loss": 0.6183, + "step": 752 + }, + { + "epoch": 0.4206703910614525, + "grad_norm": 0.7999465465545654, + "learning_rate": 0.0009818207282913166, + "loss": 0.552, + "step": 753 + }, + { + "epoch": 0.4212290502793296, + "grad_norm": 0.5772879123687744, + "learning_rate": 0.0009817927170868348, + "loss": 0.4982, + "step": 754 + }, + { + "epoch": 0.42178770949720673, + "grad_norm": 0.4552770256996155, + "learning_rate": 0.000981764705882353, + "loss": 0.4604, + "step": 755 + }, + { + "epoch": 0.4223463687150838, + "grad_norm": 0.6333035230636597, + "learning_rate": 0.0009817366946778712, + "loss": 0.5294, + "step": 756 + }, + { + "epoch": 0.4229050279329609, + "grad_norm": 0.5289617776870728, + "learning_rate": 0.0009817086834733894, + "loss": 0.5121, + "step": 757 + }, + { + "epoch": 0.423463687150838, + "grad_norm": 0.9566251039505005, + "learning_rate": 0.0009816806722689076, + "loss": 0.5332, + "step": 758 + }, + { + "epoch": 0.4240223463687151, + "grad_norm": 0.9983794093132019, + "learning_rate": 0.0009816526610644258, + "loss": 0.4375, + "step": 759 + }, + { + "epoch": 0.4245810055865922, + "grad_norm": 0.8549471497535706, + "learning_rate": 0.000981624649859944, + "loss": 0.4418, + "step": 760 + }, + { + "epoch": 0.4251396648044693, + "grad_norm": 2.0585970878601074, + "learning_rate": 0.0009815966386554622, + "loss": 0.6453, + "step": 761 + }, + { + "epoch": 0.42569832402234636, + "grad_norm": 0.5526566505432129, + "learning_rate": 0.0009815686274509804, + "loss": 0.4516, + "step": 762 + }, + { + "epoch": 0.4262569832402235, + "grad_norm": 0.6749558448791504, + "learning_rate": 0.0009815406162464986, + "loss": 0.4908, + "step": 763 + }, + { + "epoch": 0.42681564245810055, + "grad_norm": 1.370712399482727, + "learning_rate": 0.0009815126050420168, + "loss": 0.5092, + "step": 764 + }, + { + "epoch": 0.4273743016759777, + "grad_norm": 0.7793422341346741, + "learning_rate": 0.000981484593837535, + "loss": 0.5939, + "step": 765 + }, + { + "epoch": 0.42793296089385474, + "grad_norm": 0.5176940560340881, + "learning_rate": 0.0009814565826330533, + "loss": 0.4488, + "step": 766 + }, + { + "epoch": 0.42849162011173186, + "grad_norm": 1.0649203062057495, + "learning_rate": 0.0009814285714285715, + "loss": 0.619, + "step": 767 + }, + { + "epoch": 0.42905027932960893, + "grad_norm": 0.6778590083122253, + "learning_rate": 0.0009814005602240897, + "loss": 0.4603, + "step": 768 + }, + { + "epoch": 0.42960893854748605, + "grad_norm": 0.9636827707290649, + "learning_rate": 0.0009813725490196079, + "loss": 0.5105, + "step": 769 + }, + { + "epoch": 0.4301675977653631, + "grad_norm": 0.5597183108329773, + "learning_rate": 0.000981344537815126, + "loss": 0.4559, + "step": 770 + }, + { + "epoch": 0.43072625698324024, + "grad_norm": 0.5207726955413818, + "learning_rate": 0.0009813165266106443, + "loss": 0.4505, + "step": 771 + }, + { + "epoch": 0.4312849162011173, + "grad_norm": 0.6174609065055847, + "learning_rate": 0.0009812885154061625, + "loss": 0.4889, + "step": 772 + }, + { + "epoch": 0.43184357541899443, + "grad_norm": 0.5063725113868713, + "learning_rate": 0.0009812605042016807, + "loss": 0.4667, + "step": 773 + }, + { + "epoch": 0.4324022346368715, + "grad_norm": 1.2749954462051392, + "learning_rate": 0.000981232492997199, + "loss": 0.4954, + "step": 774 + }, + { + "epoch": 0.4329608938547486, + "grad_norm": 0.6341150403022766, + "learning_rate": 0.0009812044817927171, + "loss": 0.5442, + "step": 775 + }, + { + "epoch": 0.4335195530726257, + "grad_norm": 0.7182409167289734, + "learning_rate": 0.0009811764705882353, + "loss": 0.5457, + "step": 776 + }, + { + "epoch": 0.4340782122905028, + "grad_norm": 0.7968026399612427, + "learning_rate": 0.0009811484593837535, + "loss": 0.5734, + "step": 777 + }, + { + "epoch": 0.4346368715083799, + "grad_norm": 0.4441359341144562, + "learning_rate": 0.0009811204481792717, + "loss": 0.5043, + "step": 778 + }, + { + "epoch": 0.435195530726257, + "grad_norm": 0.48550713062286377, + "learning_rate": 0.00098109243697479, + "loss": 0.4485, + "step": 779 + }, + { + "epoch": 0.43575418994413406, + "grad_norm": 0.7723861336708069, + "learning_rate": 0.0009810644257703081, + "loss": 0.4585, + "step": 780 + }, + { + "epoch": 0.4363128491620112, + "grad_norm": 1.5276738405227661, + "learning_rate": 0.0009810364145658263, + "loss": 0.453, + "step": 781 + }, + { + "epoch": 0.43687150837988825, + "grad_norm": 0.5024844408035278, + "learning_rate": 0.0009810084033613446, + "loss": 0.4935, + "step": 782 + }, + { + "epoch": 0.4374301675977654, + "grad_norm": 0.5297918915748596, + "learning_rate": 0.0009809803921568628, + "loss": 0.4632, + "step": 783 + }, + { + "epoch": 0.43798882681564244, + "grad_norm": 0.6294731497764587, + "learning_rate": 0.000980952380952381, + "loss": 0.5002, + "step": 784 + }, + { + "epoch": 0.43854748603351956, + "grad_norm": 3.596766710281372, + "learning_rate": 0.0009809243697478992, + "loss": 0.6663, + "step": 785 + }, + { + "epoch": 0.43910614525139663, + "grad_norm": 0.6239421367645264, + "learning_rate": 0.0009808963585434174, + "loss": 0.5175, + "step": 786 + }, + { + "epoch": 0.43966480446927375, + "grad_norm": 0.9190245866775513, + "learning_rate": 0.0009808683473389356, + "loss": 0.4805, + "step": 787 + }, + { + "epoch": 0.4402234636871508, + "grad_norm": 1.2294410467147827, + "learning_rate": 0.0009808403361344538, + "loss": 0.5014, + "step": 788 + }, + { + "epoch": 0.44078212290502794, + "grad_norm": 1.0701860189437866, + "learning_rate": 0.000980812324929972, + "loss": 0.4388, + "step": 789 + }, + { + "epoch": 0.441340782122905, + "grad_norm": 0.6473264694213867, + "learning_rate": 0.0009807843137254902, + "loss": 0.5691, + "step": 790 + }, + { + "epoch": 0.44189944134078213, + "grad_norm": 0.5692540407180786, + "learning_rate": 0.0009807563025210084, + "loss": 0.5383, + "step": 791 + }, + { + "epoch": 0.4424581005586592, + "grad_norm": 0.49326780438423157, + "learning_rate": 0.0009807282913165266, + "loss": 0.5117, + "step": 792 + }, + { + "epoch": 0.4430167597765363, + "grad_norm": 0.5171276330947876, + "learning_rate": 0.0009807002801120448, + "loss": 0.4677, + "step": 793 + }, + { + "epoch": 0.4435754189944134, + "grad_norm": 1.14442777633667, + "learning_rate": 0.000980672268907563, + "loss": 0.5585, + "step": 794 + }, + { + "epoch": 0.4441340782122905, + "grad_norm": 0.7754183411598206, + "learning_rate": 0.0009806442577030812, + "loss": 0.6227, + "step": 795 + }, + { + "epoch": 0.4446927374301676, + "grad_norm": 0.7203547358512878, + "learning_rate": 0.0009806162464985994, + "loss": 0.5006, + "step": 796 + }, + { + "epoch": 0.4452513966480447, + "grad_norm": 1.2330557107925415, + "learning_rate": 0.0009805882352941176, + "loss": 0.5251, + "step": 797 + }, + { + "epoch": 0.44581005586592176, + "grad_norm": 0.7639957666397095, + "learning_rate": 0.0009805602240896358, + "loss": 0.4819, + "step": 798 + }, + { + "epoch": 0.4463687150837989, + "grad_norm": 0.7057264447212219, + "learning_rate": 0.000980532212885154, + "loss": 0.3599, + "step": 799 + }, + { + "epoch": 0.44692737430167595, + "grad_norm": 2.596640110015869, + "learning_rate": 0.0009805042016806723, + "loss": 0.5319, + "step": 800 + }, + { + "epoch": 0.4474860335195531, + "grad_norm": 0.5031240582466125, + "learning_rate": 0.0009804761904761905, + "loss": 0.4199, + "step": 801 + }, + { + "epoch": 0.4480446927374302, + "grad_norm": 0.6316196322441101, + "learning_rate": 0.0009804481792717087, + "loss": 0.5212, + "step": 802 + }, + { + "epoch": 0.44860335195530726, + "grad_norm": 5.0885090827941895, + "learning_rate": 0.0009804201680672269, + "loss": 0.4256, + "step": 803 + }, + { + "epoch": 0.4491620111731844, + "grad_norm": 2.0129776000976562, + "learning_rate": 0.000980392156862745, + "loss": 0.4249, + "step": 804 + }, + { + "epoch": 0.44972067039106145, + "grad_norm": 6.43217134475708, + "learning_rate": 0.0009803641456582633, + "loss": 0.5374, + "step": 805 + }, + { + "epoch": 0.45027932960893857, + "grad_norm": 0.7426183819770813, + "learning_rate": 0.0009803361344537815, + "loss": 0.5464, + "step": 806 + }, + { + "epoch": 0.45083798882681564, + "grad_norm": 1.1538949012756348, + "learning_rate": 0.0009803081232492997, + "loss": 0.5922, + "step": 807 + }, + { + "epoch": 0.45139664804469276, + "grad_norm": 0.7717742919921875, + "learning_rate": 0.000980280112044818, + "loss": 0.4813, + "step": 808 + }, + { + "epoch": 0.4519553072625698, + "grad_norm": 1.0451397895812988, + "learning_rate": 0.0009802521008403361, + "loss": 0.5399, + "step": 809 + }, + { + "epoch": 0.45251396648044695, + "grad_norm": 2.155149221420288, + "learning_rate": 0.0009802240896358543, + "loss": 0.603, + "step": 810 + }, + { + "epoch": 0.453072625698324, + "grad_norm": 0.6225429177284241, + "learning_rate": 0.0009801960784313725, + "loss": 0.4949, + "step": 811 + }, + { + "epoch": 0.45363128491620114, + "grad_norm": 0.8359684944152832, + "learning_rate": 0.0009801680672268907, + "loss": 0.4687, + "step": 812 + }, + { + "epoch": 0.4541899441340782, + "grad_norm": 2.504591703414917, + "learning_rate": 0.000980140056022409, + "loss": 0.4975, + "step": 813 + }, + { + "epoch": 0.4547486033519553, + "grad_norm": 1.0969488620758057, + "learning_rate": 0.0009801120448179271, + "loss": 0.4135, + "step": 814 + }, + { + "epoch": 0.4553072625698324, + "grad_norm": 0.8878272175788879, + "learning_rate": 0.0009800840336134454, + "loss": 0.5093, + "step": 815 + }, + { + "epoch": 0.4558659217877095, + "grad_norm": 1.4094451665878296, + "learning_rate": 0.0009800560224089636, + "loss": 0.536, + "step": 816 + }, + { + "epoch": 0.4564245810055866, + "grad_norm": 0.8703171014785767, + "learning_rate": 0.0009800280112044818, + "loss": 0.4872, + "step": 817 + }, + { + "epoch": 0.4569832402234637, + "grad_norm": 0.5140367150306702, + "learning_rate": 0.00098, + "loss": 0.5125, + "step": 818 + }, + { + "epoch": 0.45754189944134077, + "grad_norm": 0.6497352123260498, + "learning_rate": 0.0009799719887955182, + "loss": 0.5546, + "step": 819 + }, + { + "epoch": 0.4581005586592179, + "grad_norm": 0.5851194262504578, + "learning_rate": 0.0009799439775910364, + "loss": 0.4927, + "step": 820 + }, + { + "epoch": 0.45865921787709496, + "grad_norm": 4.143260955810547, + "learning_rate": 0.0009799159663865546, + "loss": 0.5803, + "step": 821 + }, + { + "epoch": 0.4592178770949721, + "grad_norm": 0.9179551005363464, + "learning_rate": 0.0009798879551820728, + "loss": 0.4332, + "step": 822 + }, + { + "epoch": 0.45977653631284915, + "grad_norm": 0.5912773013114929, + "learning_rate": 0.000979859943977591, + "loss": 0.5745, + "step": 823 + }, + { + "epoch": 0.46033519553072627, + "grad_norm": 1.2477174997329712, + "learning_rate": 0.0009798319327731092, + "loss": 0.4981, + "step": 824 + }, + { + "epoch": 0.46089385474860334, + "grad_norm": 0.6297051906585693, + "learning_rate": 0.0009798039215686276, + "loss": 0.452, + "step": 825 + }, + { + "epoch": 0.46145251396648046, + "grad_norm": 0.6839253306388855, + "learning_rate": 0.0009797759103641456, + "loss": 0.7986, + "step": 826 + }, + { + "epoch": 0.4620111731843575, + "grad_norm": 0.6715407371520996, + "learning_rate": 0.0009797478991596638, + "loss": 0.4612, + "step": 827 + }, + { + "epoch": 0.46256983240223465, + "grad_norm": 0.9140963554382324, + "learning_rate": 0.000979719887955182, + "loss": 0.5857, + "step": 828 + }, + { + "epoch": 0.4631284916201117, + "grad_norm": 0.7106597423553467, + "learning_rate": 0.0009796918767507002, + "loss": 0.5063, + "step": 829 + }, + { + "epoch": 0.46368715083798884, + "grad_norm": 0.7576602697372437, + "learning_rate": 0.0009796638655462187, + "loss": 0.5293, + "step": 830 + }, + { + "epoch": 0.4642458100558659, + "grad_norm": 1.3902246952056885, + "learning_rate": 0.0009796358543417366, + "loss": 0.5623, + "step": 831 + }, + { + "epoch": 0.464804469273743, + "grad_norm": 0.728994607925415, + "learning_rate": 0.0009796078431372549, + "loss": 0.4056, + "step": 832 + }, + { + "epoch": 0.4653631284916201, + "grad_norm": 1.275497555732727, + "learning_rate": 0.000979579831932773, + "loss": 0.4133, + "step": 833 + }, + { + "epoch": 0.4659217877094972, + "grad_norm": 0.6350085735321045, + "learning_rate": 0.0009795518207282913, + "loss": 0.6348, + "step": 834 + }, + { + "epoch": 0.4664804469273743, + "grad_norm": 0.937637209892273, + "learning_rate": 0.0009795238095238097, + "loss": 0.6311, + "step": 835 + }, + { + "epoch": 0.4670391061452514, + "grad_norm": 0.4883667826652527, + "learning_rate": 0.0009794957983193277, + "loss": 0.5412, + "step": 836 + }, + { + "epoch": 0.46759776536312847, + "grad_norm": 3.5121424198150635, + "learning_rate": 0.0009794677871148459, + "loss": 0.4174, + "step": 837 + }, + { + "epoch": 0.4681564245810056, + "grad_norm": 0.7006067037582397, + "learning_rate": 0.000979439775910364, + "loss": 0.5278, + "step": 838 + }, + { + "epoch": 0.46871508379888266, + "grad_norm": 0.529995322227478, + "learning_rate": 0.0009794117647058823, + "loss": 0.4906, + "step": 839 + }, + { + "epoch": 0.4692737430167598, + "grad_norm": 0.5436596274375916, + "learning_rate": 0.0009793837535014007, + "loss": 0.4689, + "step": 840 + }, + { + "epoch": 0.46983240223463685, + "grad_norm": 0.8091188073158264, + "learning_rate": 0.000979355742296919, + "loss": 0.5458, + "step": 841 + }, + { + "epoch": 0.47039106145251397, + "grad_norm": 1.0712378025054932, + "learning_rate": 0.000979327731092437, + "loss": 0.4206, + "step": 842 + }, + { + "epoch": 0.47094972067039104, + "grad_norm": 1.2762094736099243, + "learning_rate": 0.0009792997198879551, + "loss": 0.4527, + "step": 843 + }, + { + "epoch": 0.47150837988826816, + "grad_norm": 1.0467904806137085, + "learning_rate": 0.0009792717086834733, + "loss": 0.8351, + "step": 844 + }, + { + "epoch": 0.4720670391061452, + "grad_norm": 0.48221683502197266, + "learning_rate": 0.0009792436974789917, + "loss": 0.4944, + "step": 845 + }, + { + "epoch": 0.47262569832402235, + "grad_norm": 0.6988289952278137, + "learning_rate": 0.00097921568627451, + "loss": 0.4712, + "step": 846 + }, + { + "epoch": 0.4731843575418994, + "grad_norm": 0.9647338390350342, + "learning_rate": 0.000979187675070028, + "loss": 0.5967, + "step": 847 + }, + { + "epoch": 0.47374301675977654, + "grad_norm": 0.9094915390014648, + "learning_rate": 0.0009791596638655461, + "loss": 0.5468, + "step": 848 + }, + { + "epoch": 0.47430167597765366, + "grad_norm": 0.6334877610206604, + "learning_rate": 0.0009791316526610644, + "loss": 0.4925, + "step": 849 + }, + { + "epoch": 0.4748603351955307, + "grad_norm": 0.8837565183639526, + "learning_rate": 0.0009791036414565828, + "loss": 0.5654, + "step": 850 + }, + { + "epoch": 0.47541899441340785, + "grad_norm": 1.1544402837753296, + "learning_rate": 0.000979075630252101, + "loss": 0.6857, + "step": 851 + }, + { + "epoch": 0.4759776536312849, + "grad_norm": 0.6996060609817505, + "learning_rate": 0.000979047619047619, + "loss": 0.4821, + "step": 852 + }, + { + "epoch": 0.47653631284916204, + "grad_norm": 1.8354023694992065, + "learning_rate": 0.0009790196078431372, + "loss": 0.5043, + "step": 853 + }, + { + "epoch": 0.4770949720670391, + "grad_norm": 0.5960931777954102, + "learning_rate": 0.0009789915966386554, + "loss": 0.6045, + "step": 854 + }, + { + "epoch": 0.4776536312849162, + "grad_norm": 1.124712347984314, + "learning_rate": 0.0009789635854341738, + "loss": 0.5422, + "step": 855 + }, + { + "epoch": 0.4782122905027933, + "grad_norm": 0.5247573852539062, + "learning_rate": 0.000978935574229692, + "loss": 0.4228, + "step": 856 + }, + { + "epoch": 0.4787709497206704, + "grad_norm": 0.5940335988998413, + "learning_rate": 0.0009789075630252102, + "loss": 0.4598, + "step": 857 + }, + { + "epoch": 0.4793296089385475, + "grad_norm": 1.1262948513031006, + "learning_rate": 0.0009788795518207282, + "loss": 0.4873, + "step": 858 + }, + { + "epoch": 0.4798882681564246, + "grad_norm": 0.8122714757919312, + "learning_rate": 0.0009788515406162464, + "loss": 0.5261, + "step": 859 + }, + { + "epoch": 0.48044692737430167, + "grad_norm": 0.8120923638343811, + "learning_rate": 0.0009788235294117648, + "loss": 0.4576, + "step": 860 + }, + { + "epoch": 0.4810055865921788, + "grad_norm": 0.6665226817131042, + "learning_rate": 0.000978795518207283, + "loss": 0.5516, + "step": 861 + }, + { + "epoch": 0.48156424581005586, + "grad_norm": 1.4202057123184204, + "learning_rate": 0.0009787675070028012, + "loss": 0.6323, + "step": 862 + }, + { + "epoch": 0.482122905027933, + "grad_norm": 1.0429213047027588, + "learning_rate": 0.0009787394957983192, + "loss": 0.4993, + "step": 863 + }, + { + "epoch": 0.48268156424581005, + "grad_norm": 0.563971757888794, + "learning_rate": 0.0009787114845938374, + "loss": 0.4815, + "step": 864 + }, + { + "epoch": 0.48324022346368717, + "grad_norm": 1.0299530029296875, + "learning_rate": 0.0009786834733893559, + "loss": 0.5597, + "step": 865 + }, + { + "epoch": 0.48379888268156424, + "grad_norm": 1.0133742094039917, + "learning_rate": 0.000978655462184874, + "loss": 0.4798, + "step": 866 + }, + { + "epoch": 0.48435754189944136, + "grad_norm": 0.6536348462104797, + "learning_rate": 0.0009786274509803923, + "loss": 0.5242, + "step": 867 + }, + { + "epoch": 0.4849162011173184, + "grad_norm": 0.44831448793411255, + "learning_rate": 0.0009785994397759103, + "loss": 0.4123, + "step": 868 + }, + { + "epoch": 0.48547486033519555, + "grad_norm": 0.6059315800666809, + "learning_rate": 0.0009785714285714285, + "loss": 0.5477, + "step": 869 + }, + { + "epoch": 0.4860335195530726, + "grad_norm": 0.8129459619522095, + "learning_rate": 0.000978543417366947, + "loss": 0.425, + "step": 870 + }, + { + "epoch": 0.48659217877094973, + "grad_norm": 0.7330930829048157, + "learning_rate": 0.000978515406162465, + "loss": 0.5031, + "step": 871 + }, + { + "epoch": 0.4871508379888268, + "grad_norm": 1.3822016716003418, + "learning_rate": 0.0009784873949579833, + "loss": 0.6426, + "step": 872 + }, + { + "epoch": 0.4877094972067039, + "grad_norm": 0.629173755645752, + "learning_rate": 0.0009784593837535015, + "loss": 0.5425, + "step": 873 + }, + { + "epoch": 0.488268156424581, + "grad_norm": 0.7746703028678894, + "learning_rate": 0.0009784313725490195, + "loss": 0.6409, + "step": 874 + }, + { + "epoch": 0.4888268156424581, + "grad_norm": 0.7017098069190979, + "learning_rate": 0.000978403361344538, + "loss": 0.5994, + "step": 875 + }, + { + "epoch": 0.4893854748603352, + "grad_norm": 1.6050899028778076, + "learning_rate": 0.0009783753501400561, + "loss": 0.5533, + "step": 876 + }, + { + "epoch": 0.4899441340782123, + "grad_norm": 0.821544349193573, + "learning_rate": 0.0009783473389355743, + "loss": 0.5541, + "step": 877 + }, + { + "epoch": 0.49050279329608937, + "grad_norm": 1.6707091331481934, + "learning_rate": 0.0009783193277310925, + "loss": 0.5065, + "step": 878 + }, + { + "epoch": 0.4910614525139665, + "grad_norm": 0.6373299956321716, + "learning_rate": 0.0009782913165266105, + "loss": 0.464, + "step": 879 + }, + { + "epoch": 0.49162011173184356, + "grad_norm": 0.7731879353523254, + "learning_rate": 0.000978263305322129, + "loss": 0.4619, + "step": 880 + }, + { + "epoch": 0.4921787709497207, + "grad_norm": Infinity, + "learning_rate": 0.000978263305322129, + "loss": 0.521, + "step": 881 + }, + { + "epoch": 0.49273743016759775, + "grad_norm": 1.804014801979065, + "learning_rate": 0.0009782352941176472, + "loss": 0.4391, + "step": 882 + }, + { + "epoch": 0.49329608938547487, + "grad_norm": 0.5658578872680664, + "learning_rate": 0.0009782072829131654, + "loss": 0.4609, + "step": 883 + }, + { + "epoch": 0.49385474860335193, + "grad_norm": 1.8259243965148926, + "learning_rate": 0.0009781792717086836, + "loss": 0.4189, + "step": 884 + }, + { + "epoch": 0.49441340782122906, + "grad_norm": 1.3172889947891235, + "learning_rate": 0.0009781512605042016, + "loss": 0.4807, + "step": 885 + }, + { + "epoch": 0.4949720670391061, + "grad_norm": 1.0069423913955688, + "learning_rate": 0.0009781232492997198, + "loss": 0.4293, + "step": 886 + }, + { + "epoch": 0.49553072625698324, + "grad_norm": 0.9529631733894348, + "learning_rate": 0.0009780952380952382, + "loss": 0.523, + "step": 887 + }, + { + "epoch": 0.4960893854748603, + "grad_norm": 21.41440200805664, + "learning_rate": 0.0009780672268907564, + "loss": 0.5485, + "step": 888 + }, + { + "epoch": 0.49664804469273743, + "grad_norm": 0.8933480381965637, + "learning_rate": 0.0009780392156862746, + "loss": 0.5737, + "step": 889 + }, + { + "epoch": 0.4972067039106145, + "grad_norm": 0.5926549434661865, + "learning_rate": 0.0009780112044817928, + "loss": 0.4538, + "step": 890 + }, + { + "epoch": 0.4977653631284916, + "grad_norm": 0.7941659092903137, + "learning_rate": 0.0009779831932773108, + "loss": 0.5334, + "step": 891 + }, + { + "epoch": 0.4983240223463687, + "grad_norm": 0.679101824760437, + "learning_rate": 0.0009779551820728292, + "loss": 0.4683, + "step": 892 + }, + { + "epoch": 0.4988826815642458, + "grad_norm": 0.5870710015296936, + "learning_rate": 0.0009779271708683474, + "loss": 0.4707, + "step": 893 + }, + { + "epoch": 0.4994413407821229, + "grad_norm": 0.4131081998348236, + "learning_rate": 0.0009778991596638656, + "loss": 0.4766, + "step": 894 + }, + { + "epoch": 0.5, + "grad_norm": 0.7532033324241638, + "learning_rate": 0.0009778711484593838, + "loss": 0.5604, + "step": 895 + }, + { + "epoch": 0.5005586592178771, + "grad_norm": 0.5894113779067993, + "learning_rate": 0.0009778431372549018, + "loss": 0.556, + "step": 896 + }, + { + "epoch": 0.5011173184357542, + "grad_norm": 0.6159910559654236, + "learning_rate": 0.0009778151260504203, + "loss": 0.7493, + "step": 897 + }, + { + "epoch": 0.5016759776536313, + "grad_norm": 0.645681619644165, + "learning_rate": 0.0009777871148459385, + "loss": 0.4461, + "step": 898 + }, + { + "epoch": 0.5022346368715084, + "grad_norm": 0.7601158618927002, + "learning_rate": 0.0009777591036414567, + "loss": 0.5558, + "step": 899 + }, + { + "epoch": 0.5027932960893855, + "grad_norm": 1.1194024085998535, + "learning_rate": 0.0009777310924369749, + "loss": 0.5576, + "step": 900 + }, + { + "epoch": 0.5033519553072626, + "grad_norm": 2.758066177368164, + "learning_rate": 0.0009777030812324929, + "loss": 0.5647, + "step": 901 + }, + { + "epoch": 0.5039106145251396, + "grad_norm": 0.9305416345596313, + "learning_rate": 0.0009776750700280113, + "loss": 0.6004, + "step": 902 + }, + { + "epoch": 0.5044692737430168, + "grad_norm": 0.782402753829956, + "learning_rate": 0.0009776470588235295, + "loss": 0.4926, + "step": 903 + }, + { + "epoch": 0.5050279329608939, + "grad_norm": 3.9130053520202637, + "learning_rate": 0.0009776190476190477, + "loss": 0.4887, + "step": 904 + }, + { + "epoch": 0.505586592178771, + "grad_norm": 0.9924647808074951, + "learning_rate": 0.000977591036414566, + "loss": 0.5809, + "step": 905 + }, + { + "epoch": 0.506145251396648, + "grad_norm": 0.9683765172958374, + "learning_rate": 0.000977563025210084, + "loss": 0.6797, + "step": 906 + }, + { + "epoch": 0.5067039106145251, + "grad_norm": 0.656795859336853, + "learning_rate": 0.0009775350140056023, + "loss": 0.5957, + "step": 907 + }, + { + "epoch": 0.5072625698324023, + "grad_norm": 0.836251974105835, + "learning_rate": 0.0009775070028011205, + "loss": 0.5127, + "step": 908 + }, + { + "epoch": 0.5078212290502794, + "grad_norm": 0.6780369877815247, + "learning_rate": 0.0009774789915966387, + "loss": 0.5403, + "step": 909 + }, + { + "epoch": 0.5083798882681564, + "grad_norm": 0.9531988501548767, + "learning_rate": 0.000977450980392157, + "loss": 0.4681, + "step": 910 + }, + { + "epoch": 0.5089385474860335, + "grad_norm": 0.8000809550285339, + "learning_rate": 0.0009774229691876751, + "loss": 0.5315, + "step": 911 + }, + { + "epoch": 0.5094972067039106, + "grad_norm": 0.7712262272834778, + "learning_rate": 0.0009773949579831933, + "loss": 0.5366, + "step": 912 + }, + { + "epoch": 0.5100558659217878, + "grad_norm": 1.2008029222488403, + "learning_rate": 0.0009773669467787115, + "loss": 0.6025, + "step": 913 + }, + { + "epoch": 0.5106145251396648, + "grad_norm": 0.5094325542449951, + "learning_rate": 0.0009773389355742298, + "loss": 0.484, + "step": 914 + }, + { + "epoch": 0.5111731843575419, + "grad_norm": 0.6349051594734192, + "learning_rate": 0.000977310924369748, + "loss": 0.5264, + "step": 915 + }, + { + "epoch": 0.511731843575419, + "grad_norm": 1.3723305463790894, + "learning_rate": 0.0009772829131652662, + "loss": 0.3958, + "step": 916 + }, + { + "epoch": 0.5122905027932961, + "grad_norm": 0.5602142214775085, + "learning_rate": 0.0009772549019607844, + "loss": 0.6107, + "step": 917 + }, + { + "epoch": 0.5128491620111731, + "grad_norm": 0.6605849862098694, + "learning_rate": 0.0009772268907563026, + "loss": 0.4906, + "step": 918 + }, + { + "epoch": 0.5134078212290503, + "grad_norm": 0.6099862456321716, + "learning_rate": 0.0009771988795518208, + "loss": 0.4238, + "step": 919 + }, + { + "epoch": 0.5139664804469274, + "grad_norm": 0.7553727030754089, + "learning_rate": 0.000977170868347339, + "loss": 0.4375, + "step": 920 + }, + { + "epoch": 0.5145251396648045, + "grad_norm": 1.2150839567184448, + "learning_rate": 0.0009771428571428572, + "loss": 0.5048, + "step": 921 + }, + { + "epoch": 0.5150837988826815, + "grad_norm": 0.5379416346549988, + "learning_rate": 0.0009771148459383754, + "loss": 0.5527, + "step": 922 + }, + { + "epoch": 0.5156424581005586, + "grad_norm": 0.6172040104866028, + "learning_rate": 0.0009770868347338936, + "loss": 0.5659, + "step": 923 + }, + { + "epoch": 0.5162011173184358, + "grad_norm": 1.0052822828292847, + "learning_rate": 0.0009770588235294118, + "loss": 0.5649, + "step": 924 + }, + { + "epoch": 0.5167597765363129, + "grad_norm": 0.7726624011993408, + "learning_rate": 0.00097703081232493, + "loss": 0.4821, + "step": 925 + }, + { + "epoch": 0.5173184357541899, + "grad_norm": 0.6800291538238525, + "learning_rate": 0.0009770028011204482, + "loss": 0.6569, + "step": 926 + }, + { + "epoch": 0.517877094972067, + "grad_norm": 1.327609896659851, + "learning_rate": 0.0009769747899159664, + "loss": 0.5363, + "step": 927 + }, + { + "epoch": 0.5184357541899441, + "grad_norm": 1.0891976356506348, + "learning_rate": 0.0009769467787114846, + "loss": 0.495, + "step": 928 + }, + { + "epoch": 0.5189944134078213, + "grad_norm": 0.6245486736297607, + "learning_rate": 0.0009769187675070028, + "loss": 0.5165, + "step": 929 + }, + { + "epoch": 0.5195530726256983, + "grad_norm": 0.49972960352897644, + "learning_rate": 0.000976890756302521, + "loss": 0.4633, + "step": 930 + }, + { + "epoch": 0.5201117318435754, + "grad_norm": 2.4194657802581787, + "learning_rate": 0.0009768627450980393, + "loss": 0.4955, + "step": 931 + }, + { + "epoch": 0.5206703910614525, + "grad_norm": 0.6558769941329956, + "learning_rate": 0.0009768347338935575, + "loss": 0.509, + "step": 932 + }, + { + "epoch": 0.5212290502793296, + "grad_norm": 0.9265739917755127, + "learning_rate": 0.0009768067226890757, + "loss": 0.5005, + "step": 933 + }, + { + "epoch": 0.5217877094972067, + "grad_norm": 0.5933675169944763, + "learning_rate": 0.0009767787114845939, + "loss": 0.5826, + "step": 934 + }, + { + "epoch": 0.5223463687150838, + "grad_norm": 0.7285407781600952, + "learning_rate": 0.000976750700280112, + "loss": 0.4265, + "step": 935 + }, + { + "epoch": 0.5229050279329609, + "grad_norm": 0.7411402463912964, + "learning_rate": 0.0009767226890756303, + "loss": 0.6028, + "step": 936 + }, + { + "epoch": 0.523463687150838, + "grad_norm": 0.5636767148971558, + "learning_rate": 0.0009766946778711485, + "loss": 0.4115, + "step": 937 + }, + { + "epoch": 0.524022346368715, + "grad_norm": 0.7952788472175598, + "learning_rate": 0.0009766666666666667, + "loss": 0.8305, + "step": 938 + }, + { + "epoch": 0.5245810055865922, + "grad_norm": 0.5462428331375122, + "learning_rate": 0.000976638655462185, + "loss": 0.3768, + "step": 939 + }, + { + "epoch": 0.5251396648044693, + "grad_norm": 0.684895396232605, + "learning_rate": 0.0009766106442577031, + "loss": 0.607, + "step": 940 + }, + { + "epoch": 0.5256983240223464, + "grad_norm": 0.6716300845146179, + "learning_rate": 0.0009765826330532213, + "loss": 0.4181, + "step": 941 + }, + { + "epoch": 0.5262569832402234, + "grad_norm": 0.7308188080787659, + "learning_rate": 0.0009765546218487395, + "loss": 0.5381, + "step": 942 + }, + { + "epoch": 0.5268156424581005, + "grad_norm": 1.2455106973648071, + "learning_rate": 0.0009765266106442577, + "loss": 0.6004, + "step": 943 + }, + { + "epoch": 0.5273743016759777, + "grad_norm": 0.7769612669944763, + "learning_rate": 0.0009764985994397759, + "loss": 0.41, + "step": 944 + }, + { + "epoch": 0.5279329608938548, + "grad_norm": 1.355358362197876, + "learning_rate": 0.0009764705882352941, + "loss": 0.5135, + "step": 945 + }, + { + "epoch": 0.5284916201117319, + "grad_norm": 0.7444446682929993, + "learning_rate": 0.0009764425770308123, + "loss": 0.5822, + "step": 946 + }, + { + "epoch": 0.5290502793296089, + "grad_norm": 0.6030605435371399, + "learning_rate": 0.0009764145658263306, + "loss": 0.4447, + "step": 947 + }, + { + "epoch": 0.529608938547486, + "grad_norm": 0.6857961416244507, + "learning_rate": 0.0009763865546218488, + "loss": 0.5858, + "step": 948 + }, + { + "epoch": 0.5301675977653632, + "grad_norm": 1.5943262577056885, + "learning_rate": 0.000976358543417367, + "loss": 0.4036, + "step": 949 + }, + { + "epoch": 0.5307262569832403, + "grad_norm": 0.6407567858695984, + "learning_rate": 0.0009763305322128852, + "loss": 0.4712, + "step": 950 + }, + { + "epoch": 0.5312849162011173, + "grad_norm": 0.4918668270111084, + "learning_rate": 0.0009763025210084034, + "loss": 0.4965, + "step": 951 + }, + { + "epoch": 0.5318435754189944, + "grad_norm": 2.872591018676758, + "learning_rate": 0.0009762745098039216, + "loss": 0.7254, + "step": 952 + }, + { + "epoch": 0.5324022346368715, + "grad_norm": 0.8626722097396851, + "learning_rate": 0.0009762464985994398, + "loss": 0.5013, + "step": 953 + }, + { + "epoch": 0.5329608938547487, + "grad_norm": 0.8280640244483948, + "learning_rate": 0.000976218487394958, + "loss": 0.4996, + "step": 954 + }, + { + "epoch": 0.5335195530726257, + "grad_norm": 0.5562185049057007, + "learning_rate": 0.0009761904761904762, + "loss": 0.4468, + "step": 955 + }, + { + "epoch": 0.5340782122905028, + "grad_norm": 0.8575215935707092, + "learning_rate": 0.0009761624649859944, + "loss": 0.5449, + "step": 956 + }, + { + "epoch": 0.5346368715083799, + "grad_norm": 0.6176499128341675, + "learning_rate": 0.0009761344537815126, + "loss": 0.5362, + "step": 957 + }, + { + "epoch": 0.535195530726257, + "grad_norm": 1.6306699514389038, + "learning_rate": 0.0009761064425770308, + "loss": 0.6004, + "step": 958 + }, + { + "epoch": 0.535754189944134, + "grad_norm": 0.8023949861526489, + "learning_rate": 0.000976078431372549, + "loss": 0.5008, + "step": 959 + }, + { + "epoch": 0.5363128491620112, + "grad_norm": 0.5992342233657837, + "learning_rate": 0.0009760504201680672, + "loss": 0.4655, + "step": 960 + }, + { + "epoch": 0.5368715083798883, + "grad_norm": 0.8887758255004883, + "learning_rate": 0.0009760224089635854, + "loss": 0.5942, + "step": 961 + }, + { + "epoch": 0.5374301675977654, + "grad_norm": 0.6822399497032166, + "learning_rate": 0.0009759943977591038, + "loss": 0.7113, + "step": 962 + }, + { + "epoch": 0.5379888268156424, + "grad_norm": 0.43152928352355957, + "learning_rate": 0.0009759663865546218, + "loss": 0.5528, + "step": 963 + }, + { + "epoch": 0.5385474860335195, + "grad_norm": 0.5266768336296082, + "learning_rate": 0.0009759383753501401, + "loss": 0.5415, + "step": 964 + }, + { + "epoch": 0.5391061452513967, + "grad_norm": 0.5582186579704285, + "learning_rate": 0.0009759103641456583, + "loss": 0.4896, + "step": 965 + }, + { + "epoch": 0.5396648044692738, + "grad_norm": 0.5387858748435974, + "learning_rate": 0.0009758823529411765, + "loss": 0.4501, + "step": 966 + }, + { + "epoch": 0.5402234636871508, + "grad_norm": 0.564531683921814, + "learning_rate": 0.0009758543417366948, + "loss": 0.453, + "step": 967 + }, + { + "epoch": 0.5407821229050279, + "grad_norm": 0.6433477401733398, + "learning_rate": 0.0009758263305322129, + "loss": 0.432, + "step": 968 + }, + { + "epoch": 0.541340782122905, + "grad_norm": 0.7274858355522156, + "learning_rate": 0.0009757983193277311, + "loss": 0.7383, + "step": 969 + }, + { + "epoch": 0.5418994413407822, + "grad_norm": 1.164543628692627, + "learning_rate": 0.0009757703081232493, + "loss": 0.5345, + "step": 970 + }, + { + "epoch": 0.5424581005586592, + "grad_norm": 0.8189229369163513, + "learning_rate": 0.0009757422969187675, + "loss": 0.6865, + "step": 971 + }, + { + "epoch": 0.5430167597765363, + "grad_norm": 0.8637515306472778, + "learning_rate": 0.0009757142857142858, + "loss": 0.442, + "step": 972 + }, + { + "epoch": 0.5435754189944134, + "grad_norm": 1.665154218673706, + "learning_rate": 0.0009756862745098039, + "loss": 0.5287, + "step": 973 + }, + { + "epoch": 0.5441340782122905, + "grad_norm": 2.0094218254089355, + "learning_rate": 0.0009756582633053221, + "loss": 0.5974, + "step": 974 + }, + { + "epoch": 0.5446927374301676, + "grad_norm": 0.6737040281295776, + "learning_rate": 0.0009756302521008403, + "loss": 0.4898, + "step": 975 + }, + { + "epoch": 0.5452513966480447, + "grad_norm": 0.8267460465431213, + "learning_rate": 0.0009756022408963585, + "loss": 0.6025, + "step": 976 + }, + { + "epoch": 0.5458100558659218, + "grad_norm": 1.0673226118087769, + "learning_rate": 0.0009755742296918768, + "loss": 0.4838, + "step": 977 + }, + { + "epoch": 0.5463687150837989, + "grad_norm": 0.9449355006217957, + "learning_rate": 0.000975546218487395, + "loss": 0.5025, + "step": 978 + }, + { + "epoch": 0.5469273743016759, + "grad_norm": 0.8984672427177429, + "learning_rate": 0.0009755182072829131, + "loss": 0.5473, + "step": 979 + }, + { + "epoch": 0.547486033519553, + "grad_norm": 1.1542268991470337, + "learning_rate": 0.0009754901960784314, + "loss": 0.5518, + "step": 980 + }, + { + "epoch": 0.5480446927374302, + "grad_norm": 0.7877004742622375, + "learning_rate": 0.0009754621848739496, + "loss": 0.574, + "step": 981 + }, + { + "epoch": 0.5486033519553073, + "grad_norm": 1.4613877534866333, + "learning_rate": 0.0009754341736694679, + "loss": 0.6451, + "step": 982 + }, + { + "epoch": 0.5491620111731843, + "grad_norm": 0.9373902082443237, + "learning_rate": 0.0009754061624649861, + "loss": 0.5687, + "step": 983 + }, + { + "epoch": 0.5497206703910614, + "grad_norm": 0.8266605734825134, + "learning_rate": 0.0009753781512605042, + "loss": 0.5074, + "step": 984 + }, + { + "epoch": 0.5502793296089385, + "grad_norm": 0.8974220752716064, + "learning_rate": 0.0009753501400560224, + "loss": 0.5967, + "step": 985 + }, + { + "epoch": 0.5508379888268157, + "grad_norm": 4.773686408996582, + "learning_rate": 0.0009753221288515406, + "loss": 0.4411, + "step": 986 + }, + { + "epoch": 0.5513966480446927, + "grad_norm": 0.6652644872665405, + "learning_rate": 0.0009752941176470589, + "loss": 0.5025, + "step": 987 + }, + { + "epoch": 0.5519553072625698, + "grad_norm": 1.3818681240081787, + "learning_rate": 0.0009752661064425771, + "loss": 0.5055, + "step": 988 + }, + { + "epoch": 0.5525139664804469, + "grad_norm": 0.9579857587814331, + "learning_rate": 0.0009752380952380952, + "loss": 0.4816, + "step": 989 + }, + { + "epoch": 0.553072625698324, + "grad_norm": 0.6119722127914429, + "learning_rate": 0.0009752100840336134, + "loss": 0.5603, + "step": 990 + }, + { + "epoch": 0.5536312849162012, + "grad_norm": 0.7044486403465271, + "learning_rate": 0.0009751820728291316, + "loss": 0.4872, + "step": 991 + }, + { + "epoch": 0.5541899441340782, + "grad_norm": 1.1273928880691528, + "learning_rate": 0.0009751540616246499, + "loss": 0.5578, + "step": 992 + }, + { + "epoch": 0.5547486033519553, + "grad_norm": 0.8864875435829163, + "learning_rate": 0.0009751260504201681, + "loss": 0.4983, + "step": 993 + }, + { + "epoch": 0.5553072625698324, + "grad_norm": 0.8385987877845764, + "learning_rate": 0.0009750980392156863, + "loss": 0.6813, + "step": 994 + }, + { + "epoch": 0.5558659217877095, + "grad_norm": 3.663464307785034, + "learning_rate": 0.0009750700280112044, + "loss": 0.4018, + "step": 995 + }, + { + "epoch": 0.5564245810055866, + "grad_norm": 0.8634306788444519, + "learning_rate": 0.0009750420168067226, + "loss": 0.4952, + "step": 996 + }, + { + "epoch": 0.5569832402234637, + "grad_norm": 2.8100054264068604, + "learning_rate": 0.000975014005602241, + "loss": 0.3986, + "step": 997 + }, + { + "epoch": 0.5575418994413408, + "grad_norm": 1.1263238191604614, + "learning_rate": 0.0009749859943977592, + "loss": 0.4929, + "step": 998 + }, + { + "epoch": 0.5581005586592179, + "grad_norm": 0.7554044127464294, + "learning_rate": 0.0009749579831932774, + "loss": 0.7032, + "step": 999 + }, + { + "epoch": 0.5586592178770949, + "grad_norm": 0.6761658787727356, + "learning_rate": 0.0009749299719887955, + "loss": 0.5478, + "step": 1000 + }, + { + "epoch": 0.5586592178770949, + "eval_cer": 0.10311173665892005, + "eval_loss": 0.38809409737586975, + "eval_runtime": 55.6357, + "eval_samples_per_second": 81.566, + "eval_steps_per_second": 5.105, + "eval_wer": 0.4025027233876149, + "step": 1000 + }, + { + "epoch": 0.5592178770949721, + "grad_norm": 0.8933016657829285, + "learning_rate": 0.0009749019607843137, + "loss": 0.5917, + "step": 1001 + }, + { + "epoch": 0.5597765363128492, + "grad_norm": 0.6337428092956543, + "learning_rate": 0.000974873949579832, + "loss": 0.4803, + "step": 1002 + }, + { + "epoch": 0.5603351955307263, + "grad_norm": 0.7888473868370056, + "learning_rate": 0.0009748459383753502, + "loss": 0.548, + "step": 1003 + }, + { + "epoch": 0.5608938547486033, + "grad_norm": 0.9701579809188843, + "learning_rate": 0.0009748179271708684, + "loss": 0.4508, + "step": 1004 + }, + { + "epoch": 0.5614525139664804, + "grad_norm": 0.886437177658081, + "learning_rate": 0.0009747899159663865, + "loss": 0.5356, + "step": 1005 + }, + { + "epoch": 0.5620111731843576, + "grad_norm": 0.6148294806480408, + "learning_rate": 0.0009747619047619047, + "loss": 0.5619, + "step": 1006 + }, + { + "epoch": 0.5625698324022347, + "grad_norm": 0.5675657987594604, + "learning_rate": 0.000974733893557423, + "loss": 0.4057, + "step": 1007 + }, + { + "epoch": 0.5631284916201117, + "grad_norm": 0.6281993389129639, + "learning_rate": 0.0009747058823529412, + "loss": 0.6571, + "step": 1008 + }, + { + "epoch": 0.5636871508379888, + "grad_norm": 0.7165383696556091, + "learning_rate": 0.0009746778711484594, + "loss": 0.5533, + "step": 1009 + }, + { + "epoch": 0.5642458100558659, + "grad_norm": 0.6096805334091187, + "learning_rate": 0.0009746498599439776, + "loss": 0.4701, + "step": 1010 + }, + { + "epoch": 0.5648044692737431, + "grad_norm": 0.7585029602050781, + "learning_rate": 0.0009746218487394957, + "loss": 0.5767, + "step": 1011 + }, + { + "epoch": 0.5653631284916201, + "grad_norm": 0.7557053565979004, + "learning_rate": 0.000974593837535014, + "loss": 0.5908, + "step": 1012 + }, + { + "epoch": 0.5659217877094972, + "grad_norm": 0.555161714553833, + "learning_rate": 0.0009745658263305323, + "loss": 0.5017, + "step": 1013 + }, + { + "epoch": 0.5664804469273743, + "grad_norm": 1.2929558753967285, + "learning_rate": 0.0009745378151260505, + "loss": 0.5627, + "step": 1014 + }, + { + "epoch": 0.5670391061452514, + "grad_norm": 0.5683190822601318, + "learning_rate": 0.0009745098039215687, + "loss": 0.571, + "step": 1015 + }, + { + "epoch": 0.5675977653631284, + "grad_norm": 0.6255306601524353, + "learning_rate": 0.0009744817927170868, + "loss": 0.5816, + "step": 1016 + }, + { + "epoch": 0.5681564245810056, + "grad_norm": 0.9703459143638611, + "learning_rate": 0.0009744537815126051, + "loss": 0.5338, + "step": 1017 + }, + { + "epoch": 0.5687150837988827, + "grad_norm": 0.8128557205200195, + "learning_rate": 0.0009744257703081233, + "loss": 0.4785, + "step": 1018 + }, + { + "epoch": 0.5692737430167598, + "grad_norm": 0.5669181942939758, + "learning_rate": 0.0009743977591036415, + "loss": 0.4606, + "step": 1019 + }, + { + "epoch": 0.5698324022346368, + "grad_norm": 0.5641766786575317, + "learning_rate": 0.0009743697478991597, + "loss": 0.4018, + "step": 1020 + }, + { + "epoch": 0.570391061452514, + "grad_norm": 0.7533718347549438, + "learning_rate": 0.0009743417366946778, + "loss": 0.6241, + "step": 1021 + }, + { + "epoch": 0.5709497206703911, + "grad_norm": 1.3725582361221313, + "learning_rate": 0.0009743137254901961, + "loss": 0.4687, + "step": 1022 + }, + { + "epoch": 0.5715083798882682, + "grad_norm": 0.7779861688613892, + "learning_rate": 0.0009742857142857143, + "loss": 0.4283, + "step": 1023 + }, + { + "epoch": 0.5720670391061452, + "grad_norm": 1.0402158498764038, + "learning_rate": 0.0009742577030812325, + "loss": 0.6428, + "step": 1024 + }, + { + "epoch": 0.5726256983240223, + "grad_norm": 1.1067811250686646, + "learning_rate": 0.0009742296918767507, + "loss": 0.5387, + "step": 1025 + }, + { + "epoch": 0.5731843575418994, + "grad_norm": 0.5275371074676514, + "learning_rate": 0.0009742016806722689, + "loss": 0.5494, + "step": 1026 + }, + { + "epoch": 0.5737430167597766, + "grad_norm": 0.7539685368537903, + "learning_rate": 0.0009741736694677871, + "loss": 0.4766, + "step": 1027 + }, + { + "epoch": 0.5743016759776536, + "grad_norm": 0.6288818120956421, + "learning_rate": 0.0009741456582633053, + "loss": 0.4094, + "step": 1028 + }, + { + "epoch": 0.5748603351955307, + "grad_norm": 1.1248012781143188, + "learning_rate": 0.0009741176470588236, + "loss": 0.5811, + "step": 1029 + }, + { + "epoch": 0.5754189944134078, + "grad_norm": 0.6503866910934448, + "learning_rate": 0.0009740896358543418, + "loss": 0.4154, + "step": 1030 + }, + { + "epoch": 0.575977653631285, + "grad_norm": 0.7325912117958069, + "learning_rate": 0.00097406162464986, + "loss": 0.5496, + "step": 1031 + }, + { + "epoch": 0.576536312849162, + "grad_norm": 1.188781976699829, + "learning_rate": 0.0009740336134453782, + "loss": 0.4322, + "step": 1032 + }, + { + "epoch": 0.5770949720670391, + "grad_norm": 0.8566517233848572, + "learning_rate": 0.0009740056022408964, + "loss": 0.4415, + "step": 1033 + }, + { + "epoch": 0.5776536312849162, + "grad_norm": 1.23500394821167, + "learning_rate": 0.0009739775910364146, + "loss": 0.533, + "step": 1034 + }, + { + "epoch": 0.5782122905027933, + "grad_norm": 4.765673637390137, + "learning_rate": 0.0009739495798319328, + "loss": 0.6176, + "step": 1035 + }, + { + "epoch": 0.5787709497206703, + "grad_norm": 0.9743689298629761, + "learning_rate": 0.000973921568627451, + "loss": 0.4773, + "step": 1036 + }, + { + "epoch": 0.5793296089385475, + "grad_norm": 1.006348967552185, + "learning_rate": 0.0009738935574229693, + "loss": 0.5239, + "step": 1037 + }, + { + "epoch": 0.5798882681564246, + "grad_norm": 1.0814579725265503, + "learning_rate": 0.0009738655462184874, + "loss": 0.516, + "step": 1038 + }, + { + "epoch": 0.5804469273743017, + "grad_norm": 0.7172871232032776, + "learning_rate": 0.0009738375350140056, + "loss": 0.6491, + "step": 1039 + }, + { + "epoch": 0.5810055865921788, + "grad_norm": 0.6884520649909973, + "learning_rate": 0.0009738095238095238, + "loss": 0.5154, + "step": 1040 + }, + { + "epoch": 0.5815642458100558, + "grad_norm": 0.5274704098701477, + "learning_rate": 0.000973781512605042, + "loss": 0.5848, + "step": 1041 + }, + { + "epoch": 0.582122905027933, + "grad_norm": 3.0449068546295166, + "learning_rate": 0.0009737535014005603, + "loss": 0.5009, + "step": 1042 + }, + { + "epoch": 0.5826815642458101, + "grad_norm": 0.6482254266738892, + "learning_rate": 0.0009737254901960784, + "loss": 0.5343, + "step": 1043 + }, + { + "epoch": 0.5832402234636872, + "grad_norm": 0.6769239902496338, + "learning_rate": 0.0009736974789915966, + "loss": 0.573, + "step": 1044 + }, + { + "epoch": 0.5837988826815642, + "grad_norm": 0.5965105891227722, + "learning_rate": 0.0009736694677871148, + "loss": 0.5129, + "step": 1045 + }, + { + "epoch": 0.5843575418994413, + "grad_norm": 0.43163272738456726, + "learning_rate": 0.0009736414565826331, + "loss": 0.513, + "step": 1046 + }, + { + "epoch": 0.5849162011173185, + "grad_norm": 0.4251498579978943, + "learning_rate": 0.0009736134453781514, + "loss": 0.447, + "step": 1047 + }, + { + "epoch": 0.5854748603351956, + "grad_norm": 0.7185901999473572, + "learning_rate": 0.0009735854341736695, + "loss": 0.4406, + "step": 1048 + }, + { + "epoch": 0.5860335195530726, + "grad_norm": 0.6413022875785828, + "learning_rate": 0.0009735574229691877, + "loss": 0.4161, + "step": 1049 + }, + { + "epoch": 0.5865921787709497, + "grad_norm": 2.7174506187438965, + "learning_rate": 0.0009735294117647059, + "loss": 0.5461, + "step": 1050 + }, + { + "epoch": 0.5871508379888268, + "grad_norm": 0.715381383895874, + "learning_rate": 0.0009735014005602241, + "loss": 0.5319, + "step": 1051 + }, + { + "epoch": 0.587709497206704, + "grad_norm": 0.8026681542396545, + "learning_rate": 0.0009734733893557424, + "loss": 0.613, + "step": 1052 + }, + { + "epoch": 0.588268156424581, + "grad_norm": 0.765281081199646, + "learning_rate": 0.0009734453781512606, + "loss": 0.5044, + "step": 1053 + }, + { + "epoch": 0.5888268156424581, + "grad_norm": 0.5909101963043213, + "learning_rate": 0.0009734173669467787, + "loss": 0.4075, + "step": 1054 + }, + { + "epoch": 0.5893854748603352, + "grad_norm": 0.5381833910942078, + "learning_rate": 0.0009733893557422969, + "loss": 0.5116, + "step": 1055 + }, + { + "epoch": 0.5899441340782123, + "grad_norm": 0.6411459445953369, + "learning_rate": 0.0009733613445378151, + "loss": 0.5999, + "step": 1056 + }, + { + "epoch": 0.5905027932960893, + "grad_norm": 2.460374116897583, + "learning_rate": 0.0009733333333333334, + "loss": 0.5764, + "step": 1057 + }, + { + "epoch": 0.5910614525139665, + "grad_norm": 0.45168182253837585, + "learning_rate": 0.0009733053221288516, + "loss": 0.4098, + "step": 1058 + }, + { + "epoch": 0.5916201117318436, + "grad_norm": 0.52949458360672, + "learning_rate": 0.0009732773109243697, + "loss": 0.4437, + "step": 1059 + }, + { + "epoch": 0.5921787709497207, + "grad_norm": 0.49910610914230347, + "learning_rate": 0.0009732492997198879, + "loss": 0.475, + "step": 1060 + }, + { + "epoch": 0.5927374301675977, + "grad_norm": 0.5932276844978333, + "learning_rate": 0.0009732212885154061, + "loss": 0.4501, + "step": 1061 + }, + { + "epoch": 0.5932960893854748, + "grad_norm": 1.1807256937026978, + "learning_rate": 0.0009731932773109245, + "loss": 0.5197, + "step": 1062 + }, + { + "epoch": 0.593854748603352, + "grad_norm": 0.5904451608657837, + "learning_rate": 0.0009731652661064427, + "loss": 0.5183, + "step": 1063 + }, + { + "epoch": 0.5944134078212291, + "grad_norm": 1.4970983266830444, + "learning_rate": 0.0009731372549019608, + "loss": 0.4292, + "step": 1064 + }, + { + "epoch": 0.5949720670391061, + "grad_norm": 1.2578061819076538, + "learning_rate": 0.000973109243697479, + "loss": 0.4124, + "step": 1065 + }, + { + "epoch": 0.5955307262569832, + "grad_norm": 0.551940381526947, + "learning_rate": 0.0009730812324929972, + "loss": 0.4868, + "step": 1066 + }, + { + "epoch": 0.5960893854748603, + "grad_norm": 0.6726689338684082, + "learning_rate": 0.0009730532212885155, + "loss": 0.5478, + "step": 1067 + }, + { + "epoch": 0.5966480446927375, + "grad_norm": 0.7053314447402954, + "learning_rate": 0.0009730252100840337, + "loss": 0.5736, + "step": 1068 + }, + { + "epoch": 0.5972067039106145, + "grad_norm": 1.3705649375915527, + "learning_rate": 0.0009729971988795519, + "loss": 0.5976, + "step": 1069 + }, + { + "epoch": 0.5977653631284916, + "grad_norm": 0.7059028148651123, + "learning_rate": 0.00097296918767507, + "loss": 0.5561, + "step": 1070 + }, + { + "epoch": 0.5983240223463687, + "grad_norm": 0.7204971313476562, + "learning_rate": 0.0009729411764705882, + "loss": 0.4561, + "step": 1071 + }, + { + "epoch": 0.5988826815642458, + "grad_norm": 0.6502424478530884, + "learning_rate": 0.0009729131652661065, + "loss": 0.4368, + "step": 1072 + }, + { + "epoch": 0.5994413407821229, + "grad_norm": 0.7644829750061035, + "learning_rate": 0.0009728851540616247, + "loss": 0.6742, + "step": 1073 + }, + { + "epoch": 0.6, + "grad_norm": 0.8019526600837708, + "learning_rate": 0.0009728571428571429, + "loss": 0.3617, + "step": 1074 + }, + { + "epoch": 0.6005586592178771, + "grad_norm": 1.1168971061706543, + "learning_rate": 0.000972829131652661, + "loss": 0.5234, + "step": 1075 + }, + { + "epoch": 0.6011173184357542, + "grad_norm": 0.6615554690361023, + "learning_rate": 0.0009728011204481792, + "loss": 0.668, + "step": 1076 + }, + { + "epoch": 0.6016759776536312, + "grad_norm": 0.544056236743927, + "learning_rate": 0.0009727731092436975, + "loss": 0.5545, + "step": 1077 + }, + { + "epoch": 0.6022346368715084, + "grad_norm": 0.6310135722160339, + "learning_rate": 0.0009727450980392158, + "loss": 0.4709, + "step": 1078 + }, + { + "epoch": 0.6027932960893855, + "grad_norm": 0.5278035402297974, + "learning_rate": 0.000972717086834734, + "loss": 0.5032, + "step": 1079 + }, + { + "epoch": 0.6033519553072626, + "grad_norm": 0.8330789804458618, + "learning_rate": 0.0009726890756302521, + "loss": 0.5006, + "step": 1080 + }, + { + "epoch": 0.6039106145251396, + "grad_norm": 0.6948153376579285, + "learning_rate": 0.0009726610644257703, + "loss": 0.5183, + "step": 1081 + }, + { + "epoch": 0.6044692737430167, + "grad_norm": 0.6444036364555359, + "learning_rate": 0.0009726330532212886, + "loss": 0.4745, + "step": 1082 + }, + { + "epoch": 0.6050279329608939, + "grad_norm": 0.5123707056045532, + "learning_rate": 0.0009726050420168068, + "loss": 0.4727, + "step": 1083 + }, + { + "epoch": 0.605586592178771, + "grad_norm": 0.5131059885025024, + "learning_rate": 0.000972577030812325, + "loss": 0.4359, + "step": 1084 + }, + { + "epoch": 0.6061452513966481, + "grad_norm": 0.5386841297149658, + "learning_rate": 0.0009725490196078432, + "loss": 0.5728, + "step": 1085 + }, + { + "epoch": 0.6067039106145251, + "grad_norm": 1.0279717445373535, + "learning_rate": 0.0009725210084033613, + "loss": 0.4456, + "step": 1086 + }, + { + "epoch": 0.6072625698324022, + "grad_norm": 1.6170985698699951, + "learning_rate": 0.0009724929971988796, + "loss": 0.4718, + "step": 1087 + }, + { + "epoch": 0.6078212290502794, + "grad_norm": 0.5936034917831421, + "learning_rate": 0.0009724649859943978, + "loss": 0.442, + "step": 1088 + }, + { + "epoch": 0.6083798882681565, + "grad_norm": 0.8925131559371948, + "learning_rate": 0.000972436974789916, + "loss": 0.4927, + "step": 1089 + }, + { + "epoch": 0.6089385474860335, + "grad_norm": 1.0748463869094849, + "learning_rate": 0.0009724089635854342, + "loss": 0.5497, + "step": 1090 + }, + { + "epoch": 0.6094972067039106, + "grad_norm": 0.8236844539642334, + "learning_rate": 0.0009723809523809523, + "loss": 0.5295, + "step": 1091 + }, + { + "epoch": 0.6100558659217877, + "grad_norm": 11.054025650024414, + "learning_rate": 0.0009723529411764706, + "loss": 0.6345, + "step": 1092 + }, + { + "epoch": 0.6106145251396649, + "grad_norm": 0.896457850933075, + "learning_rate": 0.0009723249299719888, + "loss": 0.7274, + "step": 1093 + }, + { + "epoch": 0.6111731843575419, + "grad_norm": 0.8940154910087585, + "learning_rate": 0.000972296918767507, + "loss": 0.56, + "step": 1094 + }, + { + "epoch": 0.611731843575419, + "grad_norm": 0.5155821442604065, + "learning_rate": 0.0009722689075630253, + "loss": 0.415, + "step": 1095 + }, + { + "epoch": 0.6122905027932961, + "grad_norm": 0.7292872667312622, + "learning_rate": 0.0009722408963585434, + "loss": 0.4018, + "step": 1096 + }, + { + "epoch": 0.6128491620111732, + "grad_norm": 0.5831227898597717, + "learning_rate": 0.0009722128851540617, + "loss": 0.5924, + "step": 1097 + }, + { + "epoch": 0.6134078212290502, + "grad_norm": 0.5251787304878235, + "learning_rate": 0.0009721848739495799, + "loss": 0.5893, + "step": 1098 + }, + { + "epoch": 0.6139664804469274, + "grad_norm": 0.9705613851547241, + "learning_rate": 0.0009721568627450981, + "loss": 0.4265, + "step": 1099 + }, + { + "epoch": 0.6145251396648045, + "grad_norm": 7.225012302398682, + "learning_rate": 0.0009721288515406163, + "loss": 0.5099, + "step": 1100 + }, + { + "epoch": 0.6150837988826816, + "grad_norm": 0.9091275930404663, + "learning_rate": 0.0009721008403361345, + "loss": 0.435, + "step": 1101 + }, + { + "epoch": 0.6156424581005586, + "grad_norm": 1.5239793062210083, + "learning_rate": 0.0009720728291316527, + "loss": 0.6069, + "step": 1102 + }, + { + "epoch": 0.6162011173184357, + "grad_norm": 0.4193691313266754, + "learning_rate": 0.0009720448179271709, + "loss": 0.5286, + "step": 1103 + }, + { + "epoch": 0.6167597765363129, + "grad_norm": 0.6078191995620728, + "learning_rate": 0.0009720168067226891, + "loss": 0.4901, + "step": 1104 + }, + { + "epoch": 0.61731843575419, + "grad_norm": 4.932099342346191, + "learning_rate": 0.0009719887955182073, + "loss": 0.4849, + "step": 1105 + }, + { + "epoch": 0.617877094972067, + "grad_norm": 0.7159140110015869, + "learning_rate": 0.0009719607843137255, + "loss": 0.5934, + "step": 1106 + }, + { + "epoch": 0.6184357541899441, + "grad_norm": 0.5018102526664734, + "learning_rate": 0.0009719327731092437, + "loss": 0.4016, + "step": 1107 + }, + { + "epoch": 0.6189944134078212, + "grad_norm": 0.8189084529876709, + "learning_rate": 0.0009719047619047619, + "loss": 0.4621, + "step": 1108 + }, + { + "epoch": 0.6195530726256984, + "grad_norm": 0.8758090138435364, + "learning_rate": 0.0009718767507002801, + "loss": 0.6693, + "step": 1109 + }, + { + "epoch": 0.6201117318435754, + "grad_norm": 0.9009056091308594, + "learning_rate": 0.0009718487394957983, + "loss": 0.5139, + "step": 1110 + }, + { + "epoch": 0.6206703910614525, + "grad_norm": 0.44998493790626526, + "learning_rate": 0.0009718207282913166, + "loss": 0.4325, + "step": 1111 + }, + { + "epoch": 0.6212290502793296, + "grad_norm": 0.5723503828048706, + "learning_rate": 0.0009717927170868347, + "loss": 0.6231, + "step": 1112 + }, + { + "epoch": 0.6217877094972067, + "grad_norm": 0.5779495239257812, + "learning_rate": 0.000971764705882353, + "loss": 0.5373, + "step": 1113 + }, + { + "epoch": 0.6223463687150838, + "grad_norm": 0.6900322437286377, + "learning_rate": 0.0009717366946778712, + "loss": 0.5136, + "step": 1114 + }, + { + "epoch": 0.6229050279329609, + "grad_norm": 0.8103601932525635, + "learning_rate": 0.0009717086834733894, + "loss": 0.5172, + "step": 1115 + }, + { + "epoch": 0.623463687150838, + "grad_norm": 0.6067003011703491, + "learning_rate": 0.0009716806722689076, + "loss": 0.4749, + "step": 1116 + }, + { + "epoch": 0.6240223463687151, + "grad_norm": 0.9293099641799927, + "learning_rate": 0.0009716526610644258, + "loss": 0.6614, + "step": 1117 + }, + { + "epoch": 0.6245810055865921, + "grad_norm": 0.7477920651435852, + "learning_rate": 0.000971624649859944, + "loss": 0.628, + "step": 1118 + }, + { + "epoch": 0.6251396648044693, + "grad_norm": 0.705543577671051, + "learning_rate": 0.0009715966386554622, + "loss": 0.5577, + "step": 1119 + }, + { + "epoch": 0.6256983240223464, + "grad_norm": 0.9889763593673706, + "learning_rate": 0.0009715686274509804, + "loss": 0.5096, + "step": 1120 + }, + { + "epoch": 0.6262569832402235, + "grad_norm": 3.406019926071167, + "learning_rate": 0.0009715406162464986, + "loss": 0.5001, + "step": 1121 + }, + { + "epoch": 0.6268156424581005, + "grad_norm": 6.968403339385986, + "learning_rate": 0.0009715126050420168, + "loss": 0.4409, + "step": 1122 + }, + { + "epoch": 0.6273743016759776, + "grad_norm": 0.4938574731349945, + "learning_rate": 0.000971484593837535, + "loss": 0.4828, + "step": 1123 + }, + { + "epoch": 0.6279329608938548, + "grad_norm": 0.5067856907844543, + "learning_rate": 0.0009714565826330532, + "loss": 0.4099, + "step": 1124 + }, + { + "epoch": 0.6284916201117319, + "grad_norm": 0.687279999256134, + "learning_rate": 0.0009714285714285714, + "loss": 0.431, + "step": 1125 + }, + { + "epoch": 0.6290502793296089, + "grad_norm": 0.7488174438476562, + "learning_rate": 0.0009714005602240896, + "loss": 0.5415, + "step": 1126 + }, + { + "epoch": 0.629608938547486, + "grad_norm": 0.7109690308570862, + "learning_rate": 0.0009713725490196078, + "loss": 0.6235, + "step": 1127 + }, + { + "epoch": 0.6301675977653631, + "grad_norm": 1.0108140707015991, + "learning_rate": 0.0009713445378151261, + "loss": 0.8259, + "step": 1128 + }, + { + "epoch": 0.6307262569832403, + "grad_norm": 0.7121959328651428, + "learning_rate": 0.0009713165266106443, + "loss": 0.654, + "step": 1129 + }, + { + "epoch": 0.6312849162011173, + "grad_norm": 1.587821364402771, + "learning_rate": 0.0009712885154061625, + "loss": 0.5472, + "step": 1130 + }, + { + "epoch": 0.6318435754189944, + "grad_norm": 0.734961986541748, + "learning_rate": 0.0009712605042016807, + "loss": 0.4481, + "step": 1131 + }, + { + "epoch": 0.6324022346368715, + "grad_norm": 0.8328256011009216, + "learning_rate": 0.0009712324929971989, + "loss": 0.6009, + "step": 1132 + }, + { + "epoch": 0.6329608938547486, + "grad_norm": 0.5134819746017456, + "learning_rate": 0.0009712044817927172, + "loss": 0.4236, + "step": 1133 + }, + { + "epoch": 0.6335195530726258, + "grad_norm": 0.6215536594390869, + "learning_rate": 0.0009711764705882353, + "loss": 0.4078, + "step": 1134 + }, + { + "epoch": 0.6340782122905028, + "grad_norm": 0.7158108353614807, + "learning_rate": 0.0009711484593837535, + "loss": 0.4183, + "step": 1135 + }, + { + "epoch": 0.6346368715083799, + "grad_norm": 0.6285063624382019, + "learning_rate": 0.0009711204481792717, + "loss": 0.4381, + "step": 1136 + }, + { + "epoch": 0.635195530726257, + "grad_norm": 0.7607704997062683, + "learning_rate": 0.0009710924369747899, + "loss": 0.6068, + "step": 1137 + }, + { + "epoch": 0.6357541899441341, + "grad_norm": 0.7894600629806519, + "learning_rate": 0.0009710644257703082, + "loss": 0.5156, + "step": 1138 + }, + { + "epoch": 0.6363128491620111, + "grad_norm": 0.7930091023445129, + "learning_rate": 0.0009710364145658263, + "loss": 0.4807, + "step": 1139 + }, + { + "epoch": 0.6368715083798883, + "grad_norm": 1.073494553565979, + "learning_rate": 0.0009710084033613445, + "loss": 0.4868, + "step": 1140 + }, + { + "epoch": 0.6374301675977654, + "grad_norm": 0.9982699155807495, + "learning_rate": 0.0009709803921568627, + "loss": 0.4942, + "step": 1141 + }, + { + "epoch": 0.6379888268156425, + "grad_norm": 0.43620118498802185, + "learning_rate": 0.0009709523809523809, + "loss": 0.3666, + "step": 1142 + }, + { + "epoch": 0.6385474860335195, + "grad_norm": 0.987544596195221, + "learning_rate": 0.0009709243697478993, + "loss": 0.4831, + "step": 1143 + }, + { + "epoch": 0.6391061452513966, + "grad_norm": 3.113736391067505, + "learning_rate": 0.0009708963585434174, + "loss": 0.5172, + "step": 1144 + }, + { + "epoch": 0.6396648044692738, + "grad_norm": 0.6777386665344238, + "learning_rate": 0.0009708683473389356, + "loss": 0.4525, + "step": 1145 + }, + { + "epoch": 0.6402234636871509, + "grad_norm": 0.7435891628265381, + "learning_rate": 0.0009708403361344538, + "loss": 0.5556, + "step": 1146 + }, + { + "epoch": 0.6407821229050279, + "grad_norm": 0.8971303105354309, + "learning_rate": 0.000970812324929972, + "loss": 0.5126, + "step": 1147 + }, + { + "epoch": 0.641340782122905, + "grad_norm": 0.665515661239624, + "learning_rate": 0.0009707843137254903, + "loss": 0.6845, + "step": 1148 + }, + { + "epoch": 0.6418994413407821, + "grad_norm": 0.6780884265899658, + "learning_rate": 0.0009707563025210085, + "loss": 0.6369, + "step": 1149 + }, + { + "epoch": 0.6424581005586593, + "grad_norm": 0.8216647505760193, + "learning_rate": 0.0009707282913165266, + "loss": 0.4555, + "step": 1150 + }, + { + "epoch": 0.6430167597765363, + "grad_norm": 0.5883092284202576, + "learning_rate": 0.0009707002801120448, + "loss": 0.4918, + "step": 1151 + }, + { + "epoch": 0.6435754189944134, + "grad_norm": 0.8862646818161011, + "learning_rate": 0.000970672268907563, + "loss": 0.4957, + "step": 1152 + }, + { + "epoch": 0.6441340782122905, + "grad_norm": 0.6011155247688293, + "learning_rate": 0.0009706442577030813, + "loss": 0.47, + "step": 1153 + }, + { + "epoch": 0.6446927374301676, + "grad_norm": 0.6456702947616577, + "learning_rate": 0.0009706162464985995, + "loss": 0.5884, + "step": 1154 + }, + { + "epoch": 0.6452513966480447, + "grad_norm": 0.5160109996795654, + "learning_rate": 0.0009705882352941176, + "loss": 0.5162, + "step": 1155 + }, + { + "epoch": 0.6458100558659218, + "grad_norm": 0.5241988301277161, + "learning_rate": 0.0009705602240896358, + "loss": 0.5013, + "step": 1156 + }, + { + "epoch": 0.6463687150837989, + "grad_norm": 1.7749062776565552, + "learning_rate": 0.000970532212885154, + "loss": 0.6822, + "step": 1157 + }, + { + "epoch": 0.646927374301676, + "grad_norm": 0.6804704666137695, + "learning_rate": 0.0009705042016806723, + "loss": 0.5191, + "step": 1158 + }, + { + "epoch": 0.647486033519553, + "grad_norm": 1.4008277654647827, + "learning_rate": 0.0009704761904761905, + "loss": 0.5502, + "step": 1159 + }, + { + "epoch": 0.6480446927374302, + "grad_norm": 0.533053994178772, + "learning_rate": 0.0009704481792717086, + "loss": 0.4722, + "step": 1160 + }, + { + "epoch": 0.6486033519553073, + "grad_norm": 0.624987006187439, + "learning_rate": 0.0009704201680672269, + "loss": 0.5322, + "step": 1161 + }, + { + "epoch": 0.6491620111731844, + "grad_norm": 1.0220478773117065, + "learning_rate": 0.0009703921568627451, + "loss": 0.5202, + "step": 1162 + }, + { + "epoch": 0.6497206703910614, + "grad_norm": 7.964524269104004, + "learning_rate": 0.0009703641456582634, + "loss": 0.4592, + "step": 1163 + }, + { + "epoch": 0.6502793296089385, + "grad_norm": 0.7197114825248718, + "learning_rate": 0.0009703361344537816, + "loss": 0.6007, + "step": 1164 + }, + { + "epoch": 0.6508379888268156, + "grad_norm": 2.3243930339813232, + "learning_rate": 0.0009703081232492998, + "loss": 0.5335, + "step": 1165 + }, + { + "epoch": 0.6513966480446928, + "grad_norm": 0.48415863513946533, + "learning_rate": 0.0009702801120448179, + "loss": 0.5499, + "step": 1166 + }, + { + "epoch": 0.6519553072625698, + "grad_norm": 1.8503010272979736, + "learning_rate": 0.0009702521008403361, + "loss": 0.5757, + "step": 1167 + }, + { + "epoch": 0.6525139664804469, + "grad_norm": 0.6757665872573853, + "learning_rate": 0.0009702240896358544, + "loss": 0.684, + "step": 1168 + }, + { + "epoch": 0.653072625698324, + "grad_norm": 1.3364590406417847, + "learning_rate": 0.0009701960784313726, + "loss": 0.4595, + "step": 1169 + }, + { + "epoch": 0.6536312849162011, + "grad_norm": 0.6175302267074585, + "learning_rate": 0.0009701680672268908, + "loss": 0.4642, + "step": 1170 + }, + { + "epoch": 0.6541899441340782, + "grad_norm": 0.4852699637413025, + "learning_rate": 0.0009701400560224089, + "loss": 0.4996, + "step": 1171 + }, + { + "epoch": 0.6547486033519553, + "grad_norm": 0.9816637635231018, + "learning_rate": 0.0009701120448179271, + "loss": 0.4906, + "step": 1172 + }, + { + "epoch": 0.6553072625698324, + "grad_norm": 0.6328439712524414, + "learning_rate": 0.0009700840336134454, + "loss": 0.5194, + "step": 1173 + }, + { + "epoch": 0.6558659217877095, + "grad_norm": 1.1651662588119507, + "learning_rate": 0.0009700560224089636, + "loss": 0.4595, + "step": 1174 + }, + { + "epoch": 0.6564245810055865, + "grad_norm": 0.6353319883346558, + "learning_rate": 0.0009700280112044818, + "loss": 0.4036, + "step": 1175 + }, + { + "epoch": 0.6569832402234637, + "grad_norm": 0.6404257416725159, + "learning_rate": 0.0009699999999999999, + "loss": 0.4365, + "step": 1176 + }, + { + "epoch": 0.6575418994413408, + "grad_norm": 1.0196770429611206, + "learning_rate": 0.0009699719887955181, + "loss": 0.4619, + "step": 1177 + }, + { + "epoch": 0.6581005586592179, + "grad_norm": 1.550592303276062, + "learning_rate": 0.0009699439775910365, + "loss": 0.4848, + "step": 1178 + }, + { + "epoch": 0.658659217877095, + "grad_norm": 0.5176575779914856, + "learning_rate": 0.0009699159663865547, + "loss": 0.4501, + "step": 1179 + }, + { + "epoch": 0.659217877094972, + "grad_norm": 0.5924175381660461, + "learning_rate": 0.0009698879551820729, + "loss": 0.5381, + "step": 1180 + }, + { + "epoch": 0.6597765363128492, + "grad_norm": 0.6795361638069153, + "learning_rate": 0.0009698599439775911, + "loss": 0.419, + "step": 1181 + }, + { + "epoch": 0.6603351955307263, + "grad_norm": 1.0456833839416504, + "learning_rate": 0.0009698319327731092, + "loss": 0.4914, + "step": 1182 + }, + { + "epoch": 0.6608938547486034, + "grad_norm": 0.6252954602241516, + "learning_rate": 0.0009698039215686275, + "loss": 0.5764, + "step": 1183 + }, + { + "epoch": 0.6614525139664804, + "grad_norm": 11.64720344543457, + "learning_rate": 0.0009697759103641457, + "loss": 0.4989, + "step": 1184 + }, + { + "epoch": 0.6620111731843575, + "grad_norm": 1.2602818012237549, + "learning_rate": 0.0009697478991596639, + "loss": 0.432, + "step": 1185 + }, + { + "epoch": 0.6625698324022347, + "grad_norm": 1.4845386743545532, + "learning_rate": 0.0009697198879551821, + "loss": 0.5126, + "step": 1186 + }, + { + "epoch": 0.6631284916201118, + "grad_norm": 0.8853285908699036, + "learning_rate": 0.0009696918767507002, + "loss": 0.5589, + "step": 1187 + }, + { + "epoch": 0.6636871508379888, + "grad_norm": 0.7342753410339355, + "learning_rate": 0.0009696638655462185, + "loss": 0.5588, + "step": 1188 + }, + { + "epoch": 0.6642458100558659, + "grad_norm": 3.369445323944092, + "learning_rate": 0.0009696358543417367, + "loss": 0.4997, + "step": 1189 + }, + { + "epoch": 0.664804469273743, + "grad_norm": 0.5214580297470093, + "learning_rate": 0.0009696078431372549, + "loss": 0.476, + "step": 1190 + }, + { + "epoch": 0.6653631284916202, + "grad_norm": 1.2988337278366089, + "learning_rate": 0.0009695798319327731, + "loss": 0.4847, + "step": 1191 + }, + { + "epoch": 0.6659217877094972, + "grad_norm": 0.7430171370506287, + "learning_rate": 0.0009695518207282912, + "loss": 0.4597, + "step": 1192 + }, + { + "epoch": 0.6664804469273743, + "grad_norm": 1.2891651391983032, + "learning_rate": 0.0009695238095238096, + "loss": 0.4538, + "step": 1193 + }, + { + "epoch": 0.6670391061452514, + "grad_norm": 2.5343263149261475, + "learning_rate": 0.0009694957983193278, + "loss": 0.464, + "step": 1194 + }, + { + "epoch": 0.6675977653631285, + "grad_norm": 0.5320484638214111, + "learning_rate": 0.000969467787114846, + "loss": 0.5846, + "step": 1195 + }, + { + "epoch": 0.6681564245810055, + "grad_norm": 0.6462830305099487, + "learning_rate": 0.0009694397759103642, + "loss": 0.5244, + "step": 1196 + }, + { + "epoch": 0.6687150837988827, + "grad_norm": 0.5505291223526001, + "learning_rate": 0.0009694117647058824, + "loss": 0.5034, + "step": 1197 + }, + { + "epoch": 0.6692737430167598, + "grad_norm": 0.5639625191688538, + "learning_rate": 0.0009693837535014006, + "loss": 0.632, + "step": 1198 + }, + { + "epoch": 0.6698324022346369, + "grad_norm": 0.7597293853759766, + "learning_rate": 0.0009693557422969188, + "loss": 0.6121, + "step": 1199 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 0.560198962688446, + "learning_rate": 0.000969327731092437, + "loss": 0.4575, + "step": 1200 + }, + { + "epoch": 0.670949720670391, + "grad_norm": 0.5961484909057617, + "learning_rate": 0.0009692997198879552, + "loss": 0.5024, + "step": 1201 + }, + { + "epoch": 0.6715083798882682, + "grad_norm": 0.6472962498664856, + "learning_rate": 0.0009692717086834734, + "loss": 0.4449, + "step": 1202 + }, + { + "epoch": 0.6720670391061453, + "grad_norm": 0.8945441246032715, + "learning_rate": 0.0009692436974789916, + "loss": 0.4517, + "step": 1203 + }, + { + "epoch": 0.6726256983240223, + "grad_norm": 0.6092879772186279, + "learning_rate": 0.0009692156862745098, + "loss": 0.534, + "step": 1204 + }, + { + "epoch": 0.6731843575418994, + "grad_norm": 0.8901610374450684, + "learning_rate": 0.000969187675070028, + "loss": 0.542, + "step": 1205 + }, + { + "epoch": 0.6737430167597765, + "grad_norm": 0.6504933834075928, + "learning_rate": 0.0009691596638655462, + "loss": 0.4646, + "step": 1206 + }, + { + "epoch": 0.6743016759776537, + "grad_norm": 0.6498280763626099, + "learning_rate": 0.0009691316526610644, + "loss": 0.5283, + "step": 1207 + }, + { + "epoch": 0.6748603351955307, + "grad_norm": 1.006180763244629, + "learning_rate": 0.0009691036414565826, + "loss": 0.4876, + "step": 1208 + }, + { + "epoch": 0.6754189944134078, + "grad_norm": 0.6444622278213501, + "learning_rate": 0.0009690756302521008, + "loss": 0.3993, + "step": 1209 + }, + { + "epoch": 0.6759776536312849, + "grad_norm": 8.225159645080566, + "learning_rate": 0.0009690476190476191, + "loss": 0.5376, + "step": 1210 + }, + { + "epoch": 0.676536312849162, + "grad_norm": 0.7043599486351013, + "learning_rate": 0.0009690196078431373, + "loss": 0.6021, + "step": 1211 + }, + { + "epoch": 0.6770949720670391, + "grad_norm": 0.8514872789382935, + "learning_rate": 0.0009689915966386555, + "loss": 0.4462, + "step": 1212 + }, + { + "epoch": 0.6776536312849162, + "grad_norm": 1.7722604274749756, + "learning_rate": 0.0009689635854341738, + "loss": 0.5955, + "step": 1213 + }, + { + "epoch": 0.6782122905027933, + "grad_norm": 0.4466038644313812, + "learning_rate": 0.0009689355742296919, + "loss": 0.4371, + "step": 1214 + }, + { + "epoch": 0.6787709497206704, + "grad_norm": 0.4447239339351654, + "learning_rate": 0.0009689075630252101, + "loss": 0.3808, + "step": 1215 + }, + { + "epoch": 0.6793296089385474, + "grad_norm": 0.5980348587036133, + "learning_rate": 0.0009688795518207283, + "loss": 0.5303, + "step": 1216 + }, + { + "epoch": 0.6798882681564246, + "grad_norm": 1.0799697637557983, + "learning_rate": 0.0009688515406162465, + "loss": 0.5761, + "step": 1217 + }, + { + "epoch": 0.6804469273743017, + "grad_norm": 0.9614025354385376, + "learning_rate": 0.0009688235294117648, + "loss": 0.5431, + "step": 1218 + }, + { + "epoch": 0.6810055865921788, + "grad_norm": 0.9209677577018738, + "learning_rate": 0.0009687955182072829, + "loss": 0.4556, + "step": 1219 + }, + { + "epoch": 0.6815642458100558, + "grad_norm": 0.6238844990730286, + "learning_rate": 0.0009687675070028011, + "loss": 0.4895, + "step": 1220 + }, + { + "epoch": 0.6821229050279329, + "grad_norm": 0.855700671672821, + "learning_rate": 0.0009687394957983193, + "loss": 0.5299, + "step": 1221 + }, + { + "epoch": 0.6826815642458101, + "grad_norm": 1.0048364400863647, + "learning_rate": 0.0009687114845938375, + "loss": 0.4399, + "step": 1222 + }, + { + "epoch": 0.6832402234636872, + "grad_norm": 0.5466775298118591, + "learning_rate": 0.0009686834733893558, + "loss": 0.4425, + "step": 1223 + }, + { + "epoch": 0.6837988826815642, + "grad_norm": 0.9739848375320435, + "learning_rate": 0.0009686554621848739, + "loss": 0.4623, + "step": 1224 + }, + { + "epoch": 0.6843575418994413, + "grad_norm": 11.794245719909668, + "learning_rate": 0.0009686274509803921, + "loss": 0.531, + "step": 1225 + }, + { + "epoch": 0.6849162011173184, + "grad_norm": 0.8747836351394653, + "learning_rate": 0.0009685994397759104, + "loss": 0.5467, + "step": 1226 + }, + { + "epoch": 0.6854748603351956, + "grad_norm": 0.7160051465034485, + "learning_rate": 0.0009685714285714286, + "loss": 0.4519, + "step": 1227 + }, + { + "epoch": 0.6860335195530727, + "grad_norm": 0.5023255944252014, + "learning_rate": 0.0009685434173669469, + "loss": 0.419, + "step": 1228 + }, + { + "epoch": 0.6865921787709497, + "grad_norm": 0.48213818669319153, + "learning_rate": 0.0009685154061624651, + "loss": 0.4597, + "step": 1229 + }, + { + "epoch": 0.6871508379888268, + "grad_norm": 0.6828752160072327, + "learning_rate": 0.0009684873949579832, + "loss": 0.4993, + "step": 1230 + }, + { + "epoch": 0.6877094972067039, + "grad_norm": 0.667091965675354, + "learning_rate": 0.0009684593837535014, + "loss": 0.4369, + "step": 1231 + }, + { + "epoch": 0.6882681564245811, + "grad_norm": 0.5374354720115662, + "learning_rate": 0.0009684313725490196, + "loss": 0.5498, + "step": 1232 + }, + { + "epoch": 0.6888268156424581, + "grad_norm": 0.595704197883606, + "learning_rate": 0.0009684033613445379, + "loss": 0.4821, + "step": 1233 + }, + { + "epoch": 0.6893854748603352, + "grad_norm": 0.6341480016708374, + "learning_rate": 0.0009683753501400561, + "loss": 0.4737, + "step": 1234 + }, + { + "epoch": 0.6899441340782123, + "grad_norm": 0.8466470241546631, + "learning_rate": 0.0009683473389355742, + "loss": 0.5707, + "step": 1235 + }, + { + "epoch": 0.6905027932960894, + "grad_norm": 0.5244937539100647, + "learning_rate": 0.0009683193277310924, + "loss": 0.4776, + "step": 1236 + }, + { + "epoch": 0.6910614525139664, + "grad_norm": 0.6936596632003784, + "learning_rate": 0.0009682913165266106, + "loss": 0.5769, + "step": 1237 + }, + { + "epoch": 0.6916201117318436, + "grad_norm": 0.6972819566726685, + "learning_rate": 0.0009682633053221289, + "loss": 0.5927, + "step": 1238 + }, + { + "epoch": 0.6921787709497207, + "grad_norm": 0.4999435544013977, + "learning_rate": 0.0009682352941176471, + "loss": 0.4442, + "step": 1239 + }, + { + "epoch": 0.6927374301675978, + "grad_norm": 0.6698967218399048, + "learning_rate": 0.0009682072829131652, + "loss": 0.565, + "step": 1240 + }, + { + "epoch": 0.6932960893854748, + "grad_norm": 0.6050639152526855, + "learning_rate": 0.0009681792717086834, + "loss": 0.4654, + "step": 1241 + }, + { + "epoch": 0.693854748603352, + "grad_norm": 2.451584815979004, + "learning_rate": 0.0009681512605042016, + "loss": 0.5334, + "step": 1242 + }, + { + "epoch": 0.6944134078212291, + "grad_norm": 0.6050121188163757, + "learning_rate": 0.00096812324929972, + "loss": 0.5986, + "step": 1243 + }, + { + "epoch": 0.6949720670391062, + "grad_norm": 1.0941985845565796, + "learning_rate": 0.0009680952380952382, + "loss": 0.457, + "step": 1244 + }, + { + "epoch": 0.6955307262569832, + "grad_norm": 0.7693732380867004, + "learning_rate": 0.0009680672268907564, + "loss": 0.5854, + "step": 1245 + }, + { + "epoch": 0.6960893854748603, + "grad_norm": 0.6690261960029602, + "learning_rate": 0.0009680392156862745, + "loss": 0.4599, + "step": 1246 + }, + { + "epoch": 0.6966480446927374, + "grad_norm": 0.7835928797721863, + "learning_rate": 0.0009680112044817927, + "loss": 0.5687, + "step": 1247 + }, + { + "epoch": 0.6972067039106146, + "grad_norm": 0.5262266397476196, + "learning_rate": 0.000967983193277311, + "loss": 0.4271, + "step": 1248 + }, + { + "epoch": 0.6977653631284916, + "grad_norm": 0.9227473735809326, + "learning_rate": 0.0009679551820728292, + "loss": 0.4949, + "step": 1249 + }, + { + "epoch": 0.6983240223463687, + "grad_norm": 2.512376546859741, + "learning_rate": 0.0009679271708683474, + "loss": 0.4703, + "step": 1250 + }, + { + "epoch": 0.6988826815642458, + "grad_norm": 0.6254360675811768, + "learning_rate": 0.0009678991596638655, + "loss": 0.5113, + "step": 1251 + }, + { + "epoch": 0.6994413407821229, + "grad_norm": 0.517559289932251, + "learning_rate": 0.0009678711484593837, + "loss": 0.4107, + "step": 1252 + }, + { + "epoch": 0.7, + "grad_norm": 0.49326375126838684, + "learning_rate": 0.000967843137254902, + "loss": 0.4401, + "step": 1253 + }, + { + "epoch": 0.7005586592178771, + "grad_norm": 0.6251634955406189, + "learning_rate": 0.0009678151260504202, + "loss": 0.4965, + "step": 1254 + }, + { + "epoch": 0.7011173184357542, + "grad_norm": 0.515642523765564, + "learning_rate": 0.0009677871148459384, + "loss": 0.4646, + "step": 1255 + }, + { + "epoch": 0.7016759776536313, + "grad_norm": 0.934600293636322, + "learning_rate": 0.0009677591036414565, + "loss": 0.5258, + "step": 1256 + }, + { + "epoch": 0.7022346368715083, + "grad_norm": 4.897491455078125, + "learning_rate": 0.0009677310924369747, + "loss": 0.518, + "step": 1257 + }, + { + "epoch": 0.7027932960893855, + "grad_norm": 0.7678471207618713, + "learning_rate": 0.000967703081232493, + "loss": 0.4552, + "step": 1258 + }, + { + "epoch": 0.7033519553072626, + "grad_norm": 0.8946427702903748, + "learning_rate": 0.0009676750700280113, + "loss": 0.5568, + "step": 1259 + }, + { + "epoch": 0.7039106145251397, + "grad_norm": 0.5655092000961304, + "learning_rate": 0.0009676470588235295, + "loss": 0.501, + "step": 1260 + }, + { + "epoch": 0.7044692737430167, + "grad_norm": 0.6731577515602112, + "learning_rate": 0.0009676190476190477, + "loss": 0.5235, + "step": 1261 + }, + { + "epoch": 0.7050279329608938, + "grad_norm": 0.6445982456207275, + "learning_rate": 0.0009675910364145658, + "loss": 0.4754, + "step": 1262 + }, + { + "epoch": 0.705586592178771, + "grad_norm": 0.528878927230835, + "learning_rate": 0.0009675630252100841, + "loss": 0.5521, + "step": 1263 + }, + { + "epoch": 0.7061452513966481, + "grad_norm": 1.5173887014389038, + "learning_rate": 0.0009675350140056023, + "loss": 0.771, + "step": 1264 + }, + { + "epoch": 0.7067039106145251, + "grad_norm": 0.615871787071228, + "learning_rate": 0.0009675070028011205, + "loss": 0.4714, + "step": 1265 + }, + { + "epoch": 0.7072625698324022, + "grad_norm": 0.6621685028076172, + "learning_rate": 0.0009674789915966387, + "loss": 0.5297, + "step": 1266 + }, + { + "epoch": 0.7078212290502793, + "grad_norm": 0.9935075044631958, + "learning_rate": 0.0009674509803921568, + "loss": 0.4746, + "step": 1267 + }, + { + "epoch": 0.7083798882681565, + "grad_norm": 0.6970776319503784, + "learning_rate": 0.0009674229691876751, + "loss": 0.5013, + "step": 1268 + }, + { + "epoch": 0.7089385474860335, + "grad_norm": 0.5533170104026794, + "learning_rate": 0.0009673949579831933, + "loss": 0.5044, + "step": 1269 + }, + { + "epoch": 0.7094972067039106, + "grad_norm": 0.9132866859436035, + "learning_rate": 0.0009673669467787115, + "loss": 0.625, + "step": 1270 + }, + { + "epoch": 0.7100558659217877, + "grad_norm": 0.604953944683075, + "learning_rate": 0.0009673389355742297, + "loss": 0.6432, + "step": 1271 + }, + { + "epoch": 0.7106145251396648, + "grad_norm": 0.5731745362281799, + "learning_rate": 0.0009673109243697478, + "loss": 0.4679, + "step": 1272 + }, + { + "epoch": 0.711173184357542, + "grad_norm": 0.8611568808555603, + "learning_rate": 0.0009672829131652661, + "loss": 0.4981, + "step": 1273 + }, + { + "epoch": 0.711731843575419, + "grad_norm": 0.823574423789978, + "learning_rate": 0.0009672549019607843, + "loss": 0.4382, + "step": 1274 + }, + { + "epoch": 0.7122905027932961, + "grad_norm": 1.7036203145980835, + "learning_rate": 0.0009672268907563026, + "loss": 0.5302, + "step": 1275 + }, + { + "epoch": 0.7128491620111732, + "grad_norm": 0.779194712638855, + "learning_rate": 0.0009671988795518208, + "loss": 0.483, + "step": 1276 + }, + { + "epoch": 0.7134078212290503, + "grad_norm": 0.7826222777366638, + "learning_rate": 0.000967170868347339, + "loss": 0.6213, + "step": 1277 + }, + { + "epoch": 0.7139664804469273, + "grad_norm": 0.8241113424301147, + "learning_rate": 0.0009671428571428572, + "loss": 0.5697, + "step": 1278 + }, + { + "epoch": 0.7145251396648045, + "grad_norm": 0.6261003613471985, + "learning_rate": 0.0009671148459383754, + "loss": 0.5607, + "step": 1279 + }, + { + "epoch": 0.7150837988826816, + "grad_norm": 1.5766152143478394, + "learning_rate": 0.0009670868347338936, + "loss": 0.5122, + "step": 1280 + }, + { + "epoch": 0.7156424581005587, + "grad_norm": 3.283601999282837, + "learning_rate": 0.0009670588235294118, + "loss": 0.5834, + "step": 1281 + }, + { + "epoch": 0.7162011173184357, + "grad_norm": 1.238656997680664, + "learning_rate": 0.00096703081232493, + "loss": 0.4688, + "step": 1282 + }, + { + "epoch": 0.7167597765363128, + "grad_norm": 0.6876110434532166, + "learning_rate": 0.0009670028011204482, + "loss": 0.4625, + "step": 1283 + }, + { + "epoch": 0.71731843575419, + "grad_norm": 1.1952418088912964, + "learning_rate": 0.0009669747899159664, + "loss": 0.4241, + "step": 1284 + }, + { + "epoch": 0.7178770949720671, + "grad_norm": 0.725604236125946, + "learning_rate": 0.0009669467787114846, + "loss": 0.4743, + "step": 1285 + }, + { + "epoch": 0.7184357541899441, + "grad_norm": 0.6231208443641663, + "learning_rate": 0.0009669187675070028, + "loss": 0.5137, + "step": 1286 + }, + { + "epoch": 0.7189944134078212, + "grad_norm": 2.544731378555298, + "learning_rate": 0.000966890756302521, + "loss": 0.4742, + "step": 1287 + }, + { + "epoch": 0.7195530726256983, + "grad_norm": 0.4675057530403137, + "learning_rate": 0.0009668627450980393, + "loss": 0.4382, + "step": 1288 + }, + { + "epoch": 0.7201117318435755, + "grad_norm": 0.8564208745956421, + "learning_rate": 0.0009668347338935574, + "loss": 0.48, + "step": 1289 + }, + { + "epoch": 0.7206703910614525, + "grad_norm": 0.5290160775184631, + "learning_rate": 0.0009668067226890756, + "loss": 0.4523, + "step": 1290 + }, + { + "epoch": 0.7212290502793296, + "grad_norm": 3.9657459259033203, + "learning_rate": 0.0009667787114845938, + "loss": 0.5707, + "step": 1291 + }, + { + "epoch": 0.7217877094972067, + "grad_norm": 0.5429648160934448, + "learning_rate": 0.0009667507002801121, + "loss": 0.531, + "step": 1292 + }, + { + "epoch": 0.7223463687150838, + "grad_norm": 0.4244798421859741, + "learning_rate": 0.0009667226890756304, + "loss": 0.5218, + "step": 1293 + }, + { + "epoch": 0.7229050279329609, + "grad_norm": 0.5533431172370911, + "learning_rate": 0.0009666946778711485, + "loss": 0.4547, + "step": 1294 + }, + { + "epoch": 0.723463687150838, + "grad_norm": 0.5960068702697754, + "learning_rate": 0.0009666666666666667, + "loss": 0.5347, + "step": 1295 + }, + { + "epoch": 0.7240223463687151, + "grad_norm": 0.4864499866962433, + "learning_rate": 0.0009666386554621849, + "loss": 0.4628, + "step": 1296 + }, + { + "epoch": 0.7245810055865922, + "grad_norm": 0.509096622467041, + "learning_rate": 0.0009666106442577031, + "loss": 0.5803, + "step": 1297 + }, + { + "epoch": 0.7251396648044692, + "grad_norm": 0.47281110286712646, + "learning_rate": 0.0009665826330532214, + "loss": 0.4286, + "step": 1298 + }, + { + "epoch": 0.7256983240223464, + "grad_norm": 0.655084490776062, + "learning_rate": 0.0009665546218487395, + "loss": 0.5528, + "step": 1299 + }, + { + "epoch": 0.7262569832402235, + "grad_norm": 0.5568073391914368, + "learning_rate": 0.0009665266106442577, + "loss": 0.4891, + "step": 1300 + }, + { + "epoch": 0.7268156424581006, + "grad_norm": 0.5502691268920898, + "learning_rate": 0.0009664985994397759, + "loss": 0.5156, + "step": 1301 + }, + { + "epoch": 0.7273743016759776, + "grad_norm": 0.7386418581008911, + "learning_rate": 0.0009664705882352941, + "loss": 0.5032, + "step": 1302 + }, + { + "epoch": 0.7279329608938547, + "grad_norm": 0.5926641821861267, + "learning_rate": 0.0009664425770308124, + "loss": 0.571, + "step": 1303 + }, + { + "epoch": 0.7284916201117319, + "grad_norm": 1.3213979005813599, + "learning_rate": 0.0009664145658263306, + "loss": 0.4977, + "step": 1304 + }, + { + "epoch": 0.729050279329609, + "grad_norm": 1.4560959339141846, + "learning_rate": 0.0009663865546218487, + "loss": 0.6771, + "step": 1305 + }, + { + "epoch": 0.729608938547486, + "grad_norm": 1.2416977882385254, + "learning_rate": 0.0009663585434173669, + "loss": 0.4763, + "step": 1306 + }, + { + "epoch": 0.7301675977653631, + "grad_norm": 0.6432930827140808, + "learning_rate": 0.0009663305322128851, + "loss": 0.4798, + "step": 1307 + }, + { + "epoch": 0.7307262569832402, + "grad_norm": 2.4803335666656494, + "learning_rate": 0.0009663025210084035, + "loss": 0.6173, + "step": 1308 + }, + { + "epoch": 0.7312849162011174, + "grad_norm": 0.4887591302394867, + "learning_rate": 0.0009662745098039217, + "loss": 0.4394, + "step": 1309 + }, + { + "epoch": 0.7318435754189944, + "grad_norm": 0.5033316016197205, + "learning_rate": 0.0009662464985994398, + "loss": 0.507, + "step": 1310 + }, + { + "epoch": 0.7324022346368715, + "grad_norm": 1.2471177577972412, + "learning_rate": 0.000966218487394958, + "loss": 0.4824, + "step": 1311 + }, + { + "epoch": 0.7329608938547486, + "grad_norm": 0.6987435221672058, + "learning_rate": 0.0009661904761904762, + "loss": 0.4418, + "step": 1312 + }, + { + "epoch": 0.7335195530726257, + "grad_norm": 0.6770135760307312, + "learning_rate": 0.0009661624649859945, + "loss": 0.6289, + "step": 1313 + }, + { + "epoch": 0.7340782122905027, + "grad_norm": 0.784820020198822, + "learning_rate": 0.0009661344537815127, + "loss": 0.6944, + "step": 1314 + }, + { + "epoch": 0.7346368715083799, + "grad_norm": 2.167708158493042, + "learning_rate": 0.0009661064425770308, + "loss": 0.5111, + "step": 1315 + }, + { + "epoch": 0.735195530726257, + "grad_norm": 0.5602216124534607, + "learning_rate": 0.000966078431372549, + "loss": 0.4602, + "step": 1316 + }, + { + "epoch": 0.7357541899441341, + "grad_norm": 0.9406470656394958, + "learning_rate": 0.0009660504201680672, + "loss": 0.4209, + "step": 1317 + }, + { + "epoch": 0.7363128491620111, + "grad_norm": 2.0616462230682373, + "learning_rate": 0.0009660224089635855, + "loss": 0.6191, + "step": 1318 + }, + { + "epoch": 0.7368715083798882, + "grad_norm": 0.5870423316955566, + "learning_rate": 0.0009659943977591037, + "loss": 0.5135, + "step": 1319 + }, + { + "epoch": 0.7374301675977654, + "grad_norm": 0.5533879995346069, + "learning_rate": 0.0009659663865546219, + "loss": 0.5163, + "step": 1320 + }, + { + "epoch": 0.7379888268156425, + "grad_norm": 10.194575309753418, + "learning_rate": 0.00096593837535014, + "loss": 0.6292, + "step": 1321 + }, + { + "epoch": 0.7385474860335196, + "grad_norm": 1.2345503568649292, + "learning_rate": 0.0009659103641456582, + "loss": 0.5805, + "step": 1322 + }, + { + "epoch": 0.7391061452513966, + "grad_norm": 0.7731404900550842, + "learning_rate": 0.0009658823529411765, + "loss": 0.4614, + "step": 1323 + }, + { + "epoch": 0.7396648044692737, + "grad_norm": 0.6412000060081482, + "learning_rate": 0.0009658543417366948, + "loss": 0.4785, + "step": 1324 + }, + { + "epoch": 0.7402234636871509, + "grad_norm": 2.088176965713501, + "learning_rate": 0.000965826330532213, + "loss": 0.4535, + "step": 1325 + }, + { + "epoch": 0.740782122905028, + "grad_norm": 0.6518339514732361, + "learning_rate": 0.0009657983193277311, + "loss": 0.4381, + "step": 1326 + }, + { + "epoch": 0.741340782122905, + "grad_norm": 0.793572723865509, + "learning_rate": 0.0009657703081232493, + "loss": 0.3524, + "step": 1327 + }, + { + "epoch": 0.7418994413407821, + "grad_norm": 0.8990557193756104, + "learning_rate": 0.0009657422969187676, + "loss": 0.3739, + "step": 1328 + }, + { + "epoch": 0.7424581005586592, + "grad_norm": 1.6538406610488892, + "learning_rate": 0.0009657142857142858, + "loss": 0.6079, + "step": 1329 + }, + { + "epoch": 0.7430167597765364, + "grad_norm": 0.6119401454925537, + "learning_rate": 0.000965686274509804, + "loss": 0.6339, + "step": 1330 + }, + { + "epoch": 0.7435754189944134, + "grad_norm": 0.6402374505996704, + "learning_rate": 0.0009656582633053221, + "loss": 0.5539, + "step": 1331 + }, + { + "epoch": 0.7441340782122905, + "grad_norm": 0.6331562399864197, + "learning_rate": 0.0009656302521008403, + "loss": 0.5754, + "step": 1332 + }, + { + "epoch": 0.7446927374301676, + "grad_norm": 2.119746446609497, + "learning_rate": 0.0009656022408963585, + "loss": 0.4693, + "step": 1333 + }, + { + "epoch": 0.7452513966480447, + "grad_norm": 0.9882779121398926, + "learning_rate": 0.0009655742296918768, + "loss": 0.5527, + "step": 1334 + }, + { + "epoch": 0.7458100558659218, + "grad_norm": 0.5713686347007751, + "learning_rate": 0.000965546218487395, + "loss": 0.5648, + "step": 1335 + }, + { + "epoch": 0.7463687150837989, + "grad_norm": 0.5363240242004395, + "learning_rate": 0.0009655182072829132, + "loss": 0.4486, + "step": 1336 + }, + { + "epoch": 0.746927374301676, + "grad_norm": 0.482914537191391, + "learning_rate": 0.0009654901960784313, + "loss": 0.4789, + "step": 1337 + }, + { + "epoch": 0.7474860335195531, + "grad_norm": 0.8363133668899536, + "learning_rate": 0.0009654621848739495, + "loss": 0.5066, + "step": 1338 + }, + { + "epoch": 0.7480446927374301, + "grad_norm": 0.7073422074317932, + "learning_rate": 0.0009654341736694678, + "loss": 0.5763, + "step": 1339 + }, + { + "epoch": 0.7486033519553073, + "grad_norm": 1.0467145442962646, + "learning_rate": 0.000965406162464986, + "loss": 0.5788, + "step": 1340 + }, + { + "epoch": 0.7491620111731844, + "grad_norm": 3.773933172225952, + "learning_rate": 0.0009653781512605043, + "loss": 0.5293, + "step": 1341 + }, + { + "epoch": 0.7497206703910615, + "grad_norm": 0.9196529388427734, + "learning_rate": 0.0009653501400560224, + "loss": 0.4952, + "step": 1342 + }, + { + "epoch": 0.7502793296089385, + "grad_norm": 12.447409629821777, + "learning_rate": 0.0009653221288515406, + "loss": 0.4218, + "step": 1343 + }, + { + "epoch": 0.7508379888268156, + "grad_norm": 1.0569573640823364, + "learning_rate": 0.0009652941176470589, + "loss": 0.5137, + "step": 1344 + }, + { + "epoch": 0.7513966480446927, + "grad_norm": 0.6175000071525574, + "learning_rate": 0.0009652661064425771, + "loss": 0.5575, + "step": 1345 + }, + { + "epoch": 0.7519553072625699, + "grad_norm": 2.4520671367645264, + "learning_rate": 0.0009652380952380953, + "loss": 0.4509, + "step": 1346 + }, + { + "epoch": 0.7525139664804469, + "grad_norm": 0.5845544934272766, + "learning_rate": 0.0009652100840336134, + "loss": 0.5234, + "step": 1347 + }, + { + "epoch": 0.753072625698324, + "grad_norm": 0.6283985376358032, + "learning_rate": 0.0009651820728291316, + "loss": 0.4814, + "step": 1348 + }, + { + "epoch": 0.7536312849162011, + "grad_norm": 0.5294947624206543, + "learning_rate": 0.0009651540616246499, + "loss": 0.4875, + "step": 1349 + }, + { + "epoch": 0.7541899441340782, + "grad_norm": 0.8415542840957642, + "learning_rate": 0.0009651260504201681, + "loss": 0.4588, + "step": 1350 + }, + { + "epoch": 0.7547486033519553, + "grad_norm": 1.381120204925537, + "learning_rate": 0.0009650980392156863, + "loss": 0.5656, + "step": 1351 + }, + { + "epoch": 0.7553072625698324, + "grad_norm": 1.033808708190918, + "learning_rate": 0.0009650700280112045, + "loss": 0.4942, + "step": 1352 + }, + { + "epoch": 0.7558659217877095, + "grad_norm": 0.4895726144313812, + "learning_rate": 0.0009650420168067226, + "loss": 0.5065, + "step": 1353 + }, + { + "epoch": 0.7564245810055866, + "grad_norm": 0.4510646462440491, + "learning_rate": 0.0009650140056022409, + "loss": 0.4549, + "step": 1354 + }, + { + "epoch": 0.7569832402234636, + "grad_norm": 1.4920895099639893, + "learning_rate": 0.0009649859943977591, + "loss": 0.5378, + "step": 1355 + }, + { + "epoch": 0.7575418994413408, + "grad_norm": 0.6821523904800415, + "learning_rate": 0.0009649579831932773, + "loss": 0.4238, + "step": 1356 + }, + { + "epoch": 0.7581005586592179, + "grad_norm": 0.5303128361701965, + "learning_rate": 0.0009649299719887956, + "loss": 0.5003, + "step": 1357 + }, + { + "epoch": 0.758659217877095, + "grad_norm": 0.560666024684906, + "learning_rate": 0.0009649019607843136, + "loss": 0.4822, + "step": 1358 + }, + { + "epoch": 0.759217877094972, + "grad_norm": 0.6806285381317139, + "learning_rate": 0.000964873949579832, + "loss": 0.5673, + "step": 1359 + }, + { + "epoch": 0.7597765363128491, + "grad_norm": 0.49648961424827576, + "learning_rate": 0.0009648459383753502, + "loss": 0.4474, + "step": 1360 + }, + { + "epoch": 0.7603351955307263, + "grad_norm": 0.7954347133636475, + "learning_rate": 0.0009648179271708684, + "loss": 0.4837, + "step": 1361 + }, + { + "epoch": 0.7608938547486034, + "grad_norm": 0.6850284934043884, + "learning_rate": 0.0009647899159663866, + "loss": 0.5586, + "step": 1362 + }, + { + "epoch": 0.7614525139664804, + "grad_norm": 1.3656668663024902, + "learning_rate": 0.0009647619047619047, + "loss": 0.3773, + "step": 1363 + }, + { + "epoch": 0.7620111731843575, + "grad_norm": 1.313139796257019, + "learning_rate": 0.000964733893557423, + "loss": 0.4479, + "step": 1364 + }, + { + "epoch": 0.7625698324022346, + "grad_norm": 0.47272732853889465, + "learning_rate": 0.0009647058823529412, + "loss": 0.4587, + "step": 1365 + }, + { + "epoch": 0.7631284916201118, + "grad_norm": 0.7108208537101746, + "learning_rate": 0.0009646778711484594, + "loss": 0.5143, + "step": 1366 + }, + { + "epoch": 0.7636871508379889, + "grad_norm": 0.8293966054916382, + "learning_rate": 0.0009646498599439776, + "loss": 0.5347, + "step": 1367 + }, + { + "epoch": 0.7642458100558659, + "grad_norm": 18.415096282958984, + "learning_rate": 0.0009646218487394958, + "loss": 0.5508, + "step": 1368 + }, + { + "epoch": 0.764804469273743, + "grad_norm": 4.5069122314453125, + "learning_rate": 0.000964593837535014, + "loss": 0.4962, + "step": 1369 + }, + { + "epoch": 0.7653631284916201, + "grad_norm": 4.609315872192383, + "learning_rate": 0.0009645658263305322, + "loss": 0.4428, + "step": 1370 + }, + { + "epoch": 0.7659217877094973, + "grad_norm": 0.7481788396835327, + "learning_rate": 0.0009645378151260504, + "loss": 0.5505, + "step": 1371 + }, + { + "epoch": 0.7664804469273743, + "grad_norm": 0.6795213222503662, + "learning_rate": 0.0009645098039215686, + "loss": 0.6345, + "step": 1372 + }, + { + "epoch": 0.7670391061452514, + "grad_norm": 2.641211986541748, + "learning_rate": 0.0009644817927170868, + "loss": 0.6124, + "step": 1373 + }, + { + "epoch": 0.7675977653631285, + "grad_norm": 0.9110129475593567, + "learning_rate": 0.0009644537815126051, + "loss": 0.5255, + "step": 1374 + }, + { + "epoch": 0.7681564245810056, + "grad_norm": 0.8977395296096802, + "learning_rate": 0.0009644257703081233, + "loss": 0.5757, + "step": 1375 + }, + { + "epoch": 0.7687150837988826, + "grad_norm": 1.586690068244934, + "learning_rate": 0.0009643977591036415, + "loss": 0.56, + "step": 1376 + }, + { + "epoch": 0.7692737430167598, + "grad_norm": 0.4725937843322754, + "learning_rate": 0.0009643697478991597, + "loss": 0.461, + "step": 1377 + }, + { + "epoch": 0.7698324022346369, + "grad_norm": 0.7157468795776367, + "learning_rate": 0.0009643417366946779, + "loss": 0.627, + "step": 1378 + }, + { + "epoch": 0.770391061452514, + "grad_norm": 1.6860612630844116, + "learning_rate": 0.0009643137254901961, + "loss": 0.4543, + "step": 1379 + }, + { + "epoch": 0.770949720670391, + "grad_norm": 0.9065467119216919, + "learning_rate": 0.0009642857142857143, + "loss": 0.7157, + "step": 1380 + }, + { + "epoch": 0.7715083798882681, + "grad_norm": 1.117568850517273, + "learning_rate": 0.0009642577030812325, + "loss": 0.5059, + "step": 1381 + }, + { + "epoch": 0.7720670391061453, + "grad_norm": 2.117650270462036, + "learning_rate": 0.0009642296918767507, + "loss": 0.6479, + "step": 1382 + }, + { + "epoch": 0.7726256983240224, + "grad_norm": 0.47136661410331726, + "learning_rate": 0.0009642016806722689, + "loss": 0.4798, + "step": 1383 + }, + { + "epoch": 0.7731843575418994, + "grad_norm": 0.6892040371894836, + "learning_rate": 0.0009641736694677872, + "loss": 0.4799, + "step": 1384 + }, + { + "epoch": 0.7737430167597765, + "grad_norm": 1.518065094947815, + "learning_rate": 0.0009641456582633053, + "loss": 0.5524, + "step": 1385 + }, + { + "epoch": 0.7743016759776536, + "grad_norm": 0.9349548816680908, + "learning_rate": 0.0009641176470588235, + "loss": 0.4479, + "step": 1386 + }, + { + "epoch": 0.7748603351955308, + "grad_norm": 0.7361342906951904, + "learning_rate": 0.0009640896358543417, + "loss": 0.6852, + "step": 1387 + }, + { + "epoch": 0.7754189944134078, + "grad_norm": 0.6144516468048096, + "learning_rate": 0.0009640616246498599, + "loss": 0.4628, + "step": 1388 + }, + { + "epoch": 0.7759776536312849, + "grad_norm": 1.3548355102539062, + "learning_rate": 0.0009640336134453783, + "loss": 0.4708, + "step": 1389 + }, + { + "epoch": 0.776536312849162, + "grad_norm": 0.6006221175193787, + "learning_rate": 0.0009640056022408964, + "loss": 0.5141, + "step": 1390 + }, + { + "epoch": 0.7770949720670391, + "grad_norm": 1.1799458265304565, + "learning_rate": 0.0009639775910364146, + "loss": 0.5467, + "step": 1391 + }, + { + "epoch": 0.7776536312849162, + "grad_norm": 1.314858078956604, + "learning_rate": 0.0009639495798319328, + "loss": 0.4201, + "step": 1392 + }, + { + "epoch": 0.7782122905027933, + "grad_norm": 0.7050217390060425, + "learning_rate": 0.000963921568627451, + "loss": 0.5912, + "step": 1393 + }, + { + "epoch": 0.7787709497206704, + "grad_norm": 0.8072057962417603, + "learning_rate": 0.0009638935574229693, + "loss": 0.7952, + "step": 1394 + }, + { + "epoch": 0.7793296089385475, + "grad_norm": 0.7330551743507385, + "learning_rate": 0.0009638655462184874, + "loss": 0.6818, + "step": 1395 + }, + { + "epoch": 0.7798882681564245, + "grad_norm": 4.893136024475098, + "learning_rate": 0.0009638375350140056, + "loss": 0.4728, + "step": 1396 + }, + { + "epoch": 0.7804469273743017, + "grad_norm": 0.7894774079322815, + "learning_rate": 0.0009638095238095238, + "loss": 0.711, + "step": 1397 + }, + { + "epoch": 0.7810055865921788, + "grad_norm": 0.9732863903045654, + "learning_rate": 0.000963781512605042, + "loss": 0.469, + "step": 1398 + }, + { + "epoch": 0.7815642458100559, + "grad_norm": 0.8147786259651184, + "learning_rate": 0.0009637535014005603, + "loss": 0.5458, + "step": 1399 + }, + { + "epoch": 0.7821229050279329, + "grad_norm": 0.816346287727356, + "learning_rate": 0.0009637254901960785, + "loss": 0.5431, + "step": 1400 + }, + { + "epoch": 0.78268156424581, + "grad_norm": 0.7073534727096558, + "learning_rate": 0.0009636974789915966, + "loss": 0.5808, + "step": 1401 + }, + { + "epoch": 0.7832402234636872, + "grad_norm": 0.7590193152427673, + "learning_rate": 0.0009636694677871148, + "loss": 0.4888, + "step": 1402 + }, + { + "epoch": 0.7837988826815643, + "grad_norm": 8.355823516845703, + "learning_rate": 0.000963641456582633, + "loss": 0.3829, + "step": 1403 + }, + { + "epoch": 0.7843575418994413, + "grad_norm": 0.7909501194953918, + "learning_rate": 0.0009636134453781513, + "loss": 0.4857, + "step": 1404 + }, + { + "epoch": 0.7849162011173184, + "grad_norm": 0.8943551182746887, + "learning_rate": 0.0009635854341736695, + "loss": 0.6265, + "step": 1405 + }, + { + "epoch": 0.7854748603351955, + "grad_norm": 0.6333885788917542, + "learning_rate": 0.0009635574229691876, + "loss": 0.4893, + "step": 1406 + }, + { + "epoch": 0.7860335195530727, + "grad_norm": 0.7539492845535278, + "learning_rate": 0.0009635294117647059, + "loss": 0.5991, + "step": 1407 + }, + { + "epoch": 0.7865921787709497, + "grad_norm": 0.5527846217155457, + "learning_rate": 0.0009635014005602241, + "loss": 0.5105, + "step": 1408 + }, + { + "epoch": 0.7871508379888268, + "grad_norm": 0.827819287776947, + "learning_rate": 0.0009634733893557424, + "loss": 0.5239, + "step": 1409 + }, + { + "epoch": 0.7877094972067039, + "grad_norm": 0.6314797401428223, + "learning_rate": 0.0009634453781512606, + "loss": 0.3903, + "step": 1410 + }, + { + "epoch": 0.788268156424581, + "grad_norm": 4.768505096435547, + "learning_rate": 0.0009634173669467787, + "loss": 0.5414, + "step": 1411 + }, + { + "epoch": 0.788826815642458, + "grad_norm": 1.0183470249176025, + "learning_rate": 0.0009633893557422969, + "loss": 0.4332, + "step": 1412 + }, + { + "epoch": 0.7893854748603352, + "grad_norm": 0.769864559173584, + "learning_rate": 0.0009633613445378151, + "loss": 0.4566, + "step": 1413 + }, + { + "epoch": 0.7899441340782123, + "grad_norm": Infinity, + "learning_rate": 0.0009633613445378151, + "loss": 0.5697, + "step": 1414 + }, + { + "epoch": 0.7905027932960894, + "grad_norm": 0.5988388657569885, + "learning_rate": 0.0009633333333333334, + "loss": 0.3823, + "step": 1415 + }, + { + "epoch": 0.7910614525139665, + "grad_norm": 0.9508142471313477, + "learning_rate": 0.0009633053221288516, + "loss": 0.5901, + "step": 1416 + }, + { + "epoch": 0.7916201117318435, + "grad_norm": 0.6289923787117004, + "learning_rate": 0.0009632773109243698, + "loss": 0.4294, + "step": 1417 + }, + { + "epoch": 0.7921787709497207, + "grad_norm": 1.3091530799865723, + "learning_rate": 0.0009632492997198879, + "loss": 0.4808, + "step": 1418 + }, + { + "epoch": 0.7927374301675978, + "grad_norm": 1.4104604721069336, + "learning_rate": 0.0009632212885154061, + "loss": 0.5711, + "step": 1419 + }, + { + "epoch": 0.7932960893854749, + "grad_norm": 0.5860559940338135, + "learning_rate": 0.0009631932773109244, + "loss": 0.5183, + "step": 1420 + }, + { + "epoch": 0.7938547486033519, + "grad_norm": 0.6045873761177063, + "learning_rate": 0.0009631652661064426, + "loss": 0.4496, + "step": 1421 + }, + { + "epoch": 0.794413407821229, + "grad_norm": 0.5462859869003296, + "learning_rate": 0.0009631372549019608, + "loss": 0.4219, + "step": 1422 + }, + { + "epoch": 0.7949720670391062, + "grad_norm": 0.7901493906974792, + "learning_rate": 0.0009631092436974789, + "loss": 0.6415, + "step": 1423 + }, + { + "epoch": 0.7955307262569833, + "grad_norm": 0.5696995854377747, + "learning_rate": 0.0009630812324929971, + "loss": 0.4359, + "step": 1424 + }, + { + "epoch": 0.7960893854748603, + "grad_norm": 0.8355426788330078, + "learning_rate": 0.0009630532212885155, + "loss": 0.5583, + "step": 1425 + }, + { + "epoch": 0.7966480446927374, + "grad_norm": 0.7962026000022888, + "learning_rate": 0.0009630252100840337, + "loss": 0.4758, + "step": 1426 + }, + { + "epoch": 0.7972067039106145, + "grad_norm": 9.704211235046387, + "learning_rate": 0.0009629971988795519, + "loss": 0.579, + "step": 1427 + }, + { + "epoch": 0.7977653631284917, + "grad_norm": 1.9955533742904663, + "learning_rate": 0.00096296918767507, + "loss": 0.5201, + "step": 1428 + }, + { + "epoch": 0.7983240223463687, + "grad_norm": 0.8000580072402954, + "learning_rate": 0.0009629411764705882, + "loss": 0.7567, + "step": 1429 + }, + { + "epoch": 0.7988826815642458, + "grad_norm": 0.9265885949134827, + "learning_rate": 0.0009629131652661065, + "loss": 0.5148, + "step": 1430 + }, + { + "epoch": 0.7994413407821229, + "grad_norm": 0.6476082801818848, + "learning_rate": 0.0009628851540616247, + "loss": 0.5419, + "step": 1431 + }, + { + "epoch": 0.8, + "grad_norm": 0.5342894792556763, + "learning_rate": 0.0009628571428571429, + "loss": 0.4456, + "step": 1432 + }, + { + "epoch": 0.8005586592178771, + "grad_norm": 0.6798768043518066, + "learning_rate": 0.0009628291316526611, + "loss": 0.569, + "step": 1433 + }, + { + "epoch": 0.8011173184357542, + "grad_norm": 0.45200982689857483, + "learning_rate": 0.0009628011204481792, + "loss": 0.4152, + "step": 1434 + }, + { + "epoch": 0.8016759776536313, + "grad_norm": 0.7450093030929565, + "learning_rate": 0.0009627731092436975, + "loss": 0.4831, + "step": 1435 + }, + { + "epoch": 0.8022346368715084, + "grad_norm": 0.3933943212032318, + "learning_rate": 0.0009627450980392157, + "loss": 0.3844, + "step": 1436 + }, + { + "epoch": 0.8027932960893854, + "grad_norm": 0.599287211894989, + "learning_rate": 0.0009627170868347339, + "loss": 0.5188, + "step": 1437 + }, + { + "epoch": 0.8033519553072626, + "grad_norm": 0.7025023698806763, + "learning_rate": 0.0009626890756302521, + "loss": 0.5235, + "step": 1438 + }, + { + "epoch": 0.8039106145251397, + "grad_norm": 0.6847041249275208, + "learning_rate": 0.0009626610644257702, + "loss": 0.601, + "step": 1439 + }, + { + "epoch": 0.8044692737430168, + "grad_norm": 0.6263582110404968, + "learning_rate": 0.0009626330532212886, + "loss": 0.4547, + "step": 1440 + }, + { + "epoch": 0.8050279329608938, + "grad_norm": 0.5411430597305298, + "learning_rate": 0.0009626050420168068, + "loss": 0.4323, + "step": 1441 + }, + { + "epoch": 0.8055865921787709, + "grad_norm": 0.7263736128807068, + "learning_rate": 0.000962577030812325, + "loss": 0.6322, + "step": 1442 + }, + { + "epoch": 0.806145251396648, + "grad_norm": 0.6641649007797241, + "learning_rate": 0.0009625490196078432, + "loss": 0.4404, + "step": 1443 + }, + { + "epoch": 0.8067039106145252, + "grad_norm": 0.5103501677513123, + "learning_rate": 0.0009625210084033613, + "loss": 0.4959, + "step": 1444 + }, + { + "epoch": 0.8072625698324022, + "grad_norm": 0.6319855451583862, + "learning_rate": 0.0009624929971988796, + "loss": 0.6303, + "step": 1445 + }, + { + "epoch": 0.8078212290502793, + "grad_norm": 0.37431323528289795, + "learning_rate": 0.0009624649859943978, + "loss": 0.3968, + "step": 1446 + }, + { + "epoch": 0.8083798882681564, + "grad_norm": 1.0800323486328125, + "learning_rate": 0.000962436974789916, + "loss": 0.6015, + "step": 1447 + }, + { + "epoch": 0.8089385474860336, + "grad_norm": 0.5986151099205017, + "learning_rate": 0.0009624089635854342, + "loss": 0.5188, + "step": 1448 + }, + { + "epoch": 0.8094972067039106, + "grad_norm": 0.6418584585189819, + "learning_rate": 0.0009623809523809524, + "loss": 0.5892, + "step": 1449 + }, + { + "epoch": 0.8100558659217877, + "grad_norm": 0.598464846611023, + "learning_rate": 0.0009623529411764706, + "loss": 0.5116, + "step": 1450 + }, + { + "epoch": 0.8106145251396648, + "grad_norm": 0.6212530136108398, + "learning_rate": 0.0009623249299719888, + "loss": 0.4941, + "step": 1451 + }, + { + "epoch": 0.8111731843575419, + "grad_norm": 1.3260297775268555, + "learning_rate": 0.000962296918767507, + "loss": 0.4667, + "step": 1452 + }, + { + "epoch": 0.8117318435754189, + "grad_norm": 0.5975127816200256, + "learning_rate": 0.0009622689075630252, + "loss": 0.5124, + "step": 1453 + }, + { + "epoch": 0.8122905027932961, + "grad_norm": 0.5148320198059082, + "learning_rate": 0.0009622408963585434, + "loss": 0.4867, + "step": 1454 + }, + { + "epoch": 0.8128491620111732, + "grad_norm": 0.6139815449714661, + "learning_rate": 0.0009622128851540616, + "loss": 0.4463, + "step": 1455 + }, + { + "epoch": 0.8134078212290503, + "grad_norm": 0.8188477158546448, + "learning_rate": 0.0009621848739495798, + "loss": 0.5896, + "step": 1456 + }, + { + "epoch": 0.8139664804469273, + "grad_norm": 0.7311028838157654, + "learning_rate": 0.0009621568627450981, + "loss": 0.5491, + "step": 1457 + }, + { + "epoch": 0.8145251396648044, + "grad_norm": 0.4400715231895447, + "learning_rate": 0.0009621288515406163, + "loss": 0.41, + "step": 1458 + }, + { + "epoch": 0.8150837988826816, + "grad_norm": 1.0677579641342163, + "learning_rate": 0.0009621008403361345, + "loss": 0.4221, + "step": 1459 + }, + { + "epoch": 0.8156424581005587, + "grad_norm": 0.7484898567199707, + "learning_rate": 0.0009620728291316528, + "loss": 0.6855, + "step": 1460 + }, + { + "epoch": 0.8162011173184358, + "grad_norm": 0.4938328266143799, + "learning_rate": 0.0009620448179271709, + "loss": 0.4868, + "step": 1461 + }, + { + "epoch": 0.8167597765363128, + "grad_norm": 0.7591249346733093, + "learning_rate": 0.0009620168067226891, + "loss": 0.4954, + "step": 1462 + }, + { + "epoch": 0.8173184357541899, + "grad_norm": 0.6450791358947754, + "learning_rate": 0.0009619887955182073, + "loss": 0.5415, + "step": 1463 + }, + { + "epoch": 0.8178770949720671, + "grad_norm": 0.5208788514137268, + "learning_rate": 0.0009619607843137255, + "loss": 0.4769, + "step": 1464 + }, + { + "epoch": 0.8184357541899442, + "grad_norm": 1.159798264503479, + "learning_rate": 0.0009619327731092438, + "loss": 0.7681, + "step": 1465 + }, + { + "epoch": 0.8189944134078212, + "grad_norm": 0.8069193363189697, + "learning_rate": 0.0009619047619047619, + "loss": 0.5153, + "step": 1466 + }, + { + "epoch": 0.8195530726256983, + "grad_norm": 0.6228049397468567, + "learning_rate": 0.0009618767507002801, + "loss": 0.6156, + "step": 1467 + }, + { + "epoch": 0.8201117318435754, + "grad_norm": 0.8810644745826721, + "learning_rate": 0.0009618487394957983, + "loss": 0.648, + "step": 1468 + }, + { + "epoch": 0.8206703910614526, + "grad_norm": 1.2396124601364136, + "learning_rate": 0.0009618207282913165, + "loss": 0.4615, + "step": 1469 + }, + { + "epoch": 0.8212290502793296, + "grad_norm": 0.7651035189628601, + "learning_rate": 0.0009617927170868348, + "loss": 0.5078, + "step": 1470 + }, + { + "epoch": 0.8217877094972067, + "grad_norm": 0.5683399438858032, + "learning_rate": 0.0009617647058823529, + "loss": 0.5043, + "step": 1471 + }, + { + "epoch": 0.8223463687150838, + "grad_norm": 2.3346378803253174, + "learning_rate": 0.0009617366946778711, + "loss": 0.5614, + "step": 1472 + }, + { + "epoch": 0.8229050279329609, + "grad_norm": 0.4906153976917267, + "learning_rate": 0.0009617086834733894, + "loss": 0.4891, + "step": 1473 + }, + { + "epoch": 0.823463687150838, + "grad_norm": 1.4745047092437744, + "learning_rate": 0.0009616806722689076, + "loss": 0.3869, + "step": 1474 + }, + { + "epoch": 0.8240223463687151, + "grad_norm": 0.8142638802528381, + "learning_rate": 0.0009616526610644259, + "loss": 0.6038, + "step": 1475 + }, + { + "epoch": 0.8245810055865922, + "grad_norm": 0.8037557005882263, + "learning_rate": 0.0009616246498599441, + "loss": 0.4948, + "step": 1476 + }, + { + "epoch": 0.8251396648044693, + "grad_norm": 3.388263463973999, + "learning_rate": 0.0009615966386554622, + "loss": 0.4969, + "step": 1477 + }, + { + "epoch": 0.8256983240223463, + "grad_norm": 4.625975131988525, + "learning_rate": 0.0009615686274509804, + "loss": 0.5537, + "step": 1478 + }, + { + "epoch": 0.8262569832402235, + "grad_norm": 1.5438491106033325, + "learning_rate": 0.0009615406162464986, + "loss": 0.499, + "step": 1479 + }, + { + "epoch": 0.8268156424581006, + "grad_norm": 0.7401479482650757, + "learning_rate": 0.0009615126050420169, + "loss": 0.5007, + "step": 1480 + }, + { + "epoch": 0.8273743016759777, + "grad_norm": 0.6178761124610901, + "learning_rate": 0.0009614845938375351, + "loss": 0.5297, + "step": 1481 + }, + { + "epoch": 0.8279329608938547, + "grad_norm": 2.41516375541687, + "learning_rate": 0.0009614565826330532, + "loss": 0.5436, + "step": 1482 + }, + { + "epoch": 0.8284916201117318, + "grad_norm": 0.5793017745018005, + "learning_rate": 0.0009614285714285714, + "loss": 0.5233, + "step": 1483 + }, + { + "epoch": 0.829050279329609, + "grad_norm": 1.0162672996520996, + "learning_rate": 0.0009614005602240896, + "loss": 0.5452, + "step": 1484 + }, + { + "epoch": 0.8296089385474861, + "grad_norm": 0.6787902116775513, + "learning_rate": 0.0009613725490196079, + "loss": 0.6462, + "step": 1485 + }, + { + "epoch": 0.8301675977653631, + "grad_norm": 0.5842154622077942, + "learning_rate": 0.0009613445378151261, + "loss": 0.6168, + "step": 1486 + }, + { + "epoch": 0.8307262569832402, + "grad_norm": 0.534328818321228, + "learning_rate": 0.0009613165266106442, + "loss": 0.5113, + "step": 1487 + }, + { + "epoch": 0.8312849162011173, + "grad_norm": 1.2192753553390503, + "learning_rate": 0.0009612885154061624, + "loss": 0.4032, + "step": 1488 + }, + { + "epoch": 0.8318435754189945, + "grad_norm": 0.875845193862915, + "learning_rate": 0.0009612605042016806, + "loss": 0.5181, + "step": 1489 + }, + { + "epoch": 0.8324022346368715, + "grad_norm": 0.49047836661338806, + "learning_rate": 0.000961232492997199, + "loss": 0.4514, + "step": 1490 + }, + { + "epoch": 0.8329608938547486, + "grad_norm": 0.4537179470062256, + "learning_rate": 0.0009612044817927172, + "loss": 0.4166, + "step": 1491 + }, + { + "epoch": 0.8335195530726257, + "grad_norm": 0.5758245587348938, + "learning_rate": 0.0009611764705882354, + "loss": 0.4441, + "step": 1492 + }, + { + "epoch": 0.8340782122905028, + "grad_norm": 1.1364498138427734, + "learning_rate": 0.0009611484593837535, + "loss": 0.5217, + "step": 1493 + }, + { + "epoch": 0.8346368715083798, + "grad_norm": 0.6305497884750366, + "learning_rate": 0.0009611204481792717, + "loss": 0.5642, + "step": 1494 + }, + { + "epoch": 0.835195530726257, + "grad_norm": 0.4967428147792816, + "learning_rate": 0.00096109243697479, + "loss": 0.4663, + "step": 1495 + }, + { + "epoch": 0.8357541899441341, + "grad_norm": 5.082947731018066, + "learning_rate": 0.0009610644257703082, + "loss": 0.6391, + "step": 1496 + }, + { + "epoch": 0.8363128491620112, + "grad_norm": 1.2604416608810425, + "learning_rate": 0.0009610364145658264, + "loss": 0.4479, + "step": 1497 + }, + { + "epoch": 0.8368715083798882, + "grad_norm": 20.05978775024414, + "learning_rate": 0.0009610084033613445, + "loss": 0.6632, + "step": 1498 + }, + { + "epoch": 0.8374301675977653, + "grad_norm": 4.604385852813721, + "learning_rate": 0.0009609803921568627, + "loss": 0.4911, + "step": 1499 + }, + { + "epoch": 0.8379888268156425, + "grad_norm": 1.2543137073516846, + "learning_rate": 0.000960952380952381, + "loss": 0.5289, + "step": 1500 + }, + { + "epoch": 0.8379888268156425, + "eval_cer": 0.09844740488581934, + "eval_loss": 0.37865063548088074, + "eval_runtime": 55.249, + "eval_samples_per_second": 82.137, + "eval_steps_per_second": 5.14, + "eval_wer": 0.3857154828077428, + "step": 1500 + }, + { + "epoch": 0.8385474860335196, + "grad_norm": 0.6591171622276306, + "learning_rate": 0.0009609243697478992, + "loss": 0.5223, + "step": 1501 + }, + { + "epoch": 0.8391061452513966, + "grad_norm": 0.8398429155349731, + "learning_rate": 0.0009608963585434174, + "loss": 0.7085, + "step": 1502 + }, + { + "epoch": 0.8396648044692737, + "grad_norm": 0.5711323618888855, + "learning_rate": 0.0009608683473389355, + "loss": 0.4737, + "step": 1503 + }, + { + "epoch": 0.8402234636871508, + "grad_norm": 0.5332566499710083, + "learning_rate": 0.0009608403361344537, + "loss": 0.5858, + "step": 1504 + }, + { + "epoch": 0.840782122905028, + "grad_norm": 0.7061896920204163, + "learning_rate": 0.000960812324929972, + "loss": 0.4925, + "step": 1505 + }, + { + "epoch": 0.841340782122905, + "grad_norm": 0.5304086804389954, + "learning_rate": 0.0009607843137254903, + "loss": 0.4686, + "step": 1506 + }, + { + "epoch": 0.8418994413407821, + "grad_norm": 2.148744821548462, + "learning_rate": 0.0009607563025210085, + "loss": 0.4337, + "step": 1507 + }, + { + "epoch": 0.8424581005586592, + "grad_norm": 0.8156803846359253, + "learning_rate": 0.0009607282913165267, + "loss": 0.438, + "step": 1508 + }, + { + "epoch": 0.8430167597765363, + "grad_norm": 1.305841088294983, + "learning_rate": 0.0009607002801120448, + "loss": 0.5863, + "step": 1509 + }, + { + "epoch": 0.8435754189944135, + "grad_norm": 0.47048819065093994, + "learning_rate": 0.0009606722689075631, + "loss": 0.5107, + "step": 1510 + }, + { + "epoch": 0.8441340782122905, + "grad_norm": 0.4968303143978119, + "learning_rate": 0.0009606442577030813, + "loss": 0.4913, + "step": 1511 + }, + { + "epoch": 0.8446927374301676, + "grad_norm": 0.7322350144386292, + "learning_rate": 0.0009606162464985995, + "loss": 0.567, + "step": 1512 + }, + { + "epoch": 0.8452513966480447, + "grad_norm": 0.6126371026039124, + "learning_rate": 0.0009605882352941177, + "loss": 0.4864, + "step": 1513 + }, + { + "epoch": 0.8458100558659218, + "grad_norm": 0.732340395450592, + "learning_rate": 0.0009605602240896358, + "loss": 0.5562, + "step": 1514 + }, + { + "epoch": 0.8463687150837989, + "grad_norm": 0.4569789171218872, + "learning_rate": 0.0009605322128851541, + "loss": 0.4622, + "step": 1515 + }, + { + "epoch": 0.846927374301676, + "grad_norm": 0.9417468905448914, + "learning_rate": 0.0009605042016806723, + "loss": 0.6526, + "step": 1516 + }, + { + "epoch": 0.8474860335195531, + "grad_norm": 0.9558619260787964, + "learning_rate": 0.0009604761904761905, + "loss": 0.551, + "step": 1517 + }, + { + "epoch": 0.8480446927374302, + "grad_norm": 0.900699257850647, + "learning_rate": 0.0009604481792717087, + "loss": 0.5343, + "step": 1518 + }, + { + "epoch": 0.8486033519553072, + "grad_norm": 1.0645067691802979, + "learning_rate": 0.0009604201680672268, + "loss": 0.4961, + "step": 1519 + }, + { + "epoch": 0.8491620111731844, + "grad_norm": 0.43453070521354675, + "learning_rate": 0.0009603921568627451, + "loss": 0.4925, + "step": 1520 + }, + { + "epoch": 0.8497206703910615, + "grad_norm": 0.6701645851135254, + "learning_rate": 0.0009603641456582633, + "loss": 0.5334, + "step": 1521 + }, + { + "epoch": 0.8502793296089386, + "grad_norm": 0.47887933254241943, + "learning_rate": 0.0009603361344537816, + "loss": 0.4878, + "step": 1522 + }, + { + "epoch": 0.8508379888268156, + "grad_norm": 0.48265787959098816, + "learning_rate": 0.0009603081232492998, + "loss": 0.4657, + "step": 1523 + }, + { + "epoch": 0.8513966480446927, + "grad_norm": 0.6489775776863098, + "learning_rate": 0.000960280112044818, + "loss": 0.4002, + "step": 1524 + }, + { + "epoch": 0.8519553072625698, + "grad_norm": 0.7259930372238159, + "learning_rate": 0.0009602521008403362, + "loss": 0.4652, + "step": 1525 + }, + { + "epoch": 0.852513966480447, + "grad_norm": 1.5105582475662231, + "learning_rate": 0.0009602240896358544, + "loss": 0.7282, + "step": 1526 + }, + { + "epoch": 0.853072625698324, + "grad_norm": 0.6706119775772095, + "learning_rate": 0.0009601960784313726, + "loss": 0.6042, + "step": 1527 + }, + { + "epoch": 0.8536312849162011, + "grad_norm": 0.4488590359687805, + "learning_rate": 0.0009601680672268908, + "loss": 0.497, + "step": 1528 + }, + { + "epoch": 0.8541899441340782, + "grad_norm": 7.585607528686523, + "learning_rate": 0.000960140056022409, + "loss": 0.4696, + "step": 1529 + }, + { + "epoch": 0.8547486033519553, + "grad_norm": 0.49235406517982483, + "learning_rate": 0.0009601120448179272, + "loss": 0.4489, + "step": 1530 + }, + { + "epoch": 0.8553072625698324, + "grad_norm": 1.6137057542800903, + "learning_rate": 0.0009600840336134454, + "loss": 0.4593, + "step": 1531 + }, + { + "epoch": 0.8558659217877095, + "grad_norm": 0.6306363344192505, + "learning_rate": 0.0009600560224089636, + "loss": 0.4695, + "step": 1532 + }, + { + "epoch": 0.8564245810055866, + "grad_norm": 1.7941296100616455, + "learning_rate": 0.0009600280112044818, + "loss": 0.4729, + "step": 1533 + }, + { + "epoch": 0.8569832402234637, + "grad_norm": 0.74614417552948, + "learning_rate": 0.00096, + "loss": 0.4275, + "step": 1534 + }, + { + "epoch": 0.8575418994413407, + "grad_norm": 0.6001713871955872, + "learning_rate": 0.0009599719887955182, + "loss": 0.469, + "step": 1535 + }, + { + "epoch": 0.8581005586592179, + "grad_norm": 0.963833212852478, + "learning_rate": 0.0009599439775910364, + "loss": 0.5154, + "step": 1536 + }, + { + "epoch": 0.858659217877095, + "grad_norm": 1.13657546043396, + "learning_rate": 0.0009599159663865546, + "loss": 0.5546, + "step": 1537 + }, + { + "epoch": 0.8592178770949721, + "grad_norm": 0.5945402383804321, + "learning_rate": 0.0009598879551820728, + "loss": 0.4922, + "step": 1538 + }, + { + "epoch": 0.8597765363128491, + "grad_norm": 3.436249017715454, + "learning_rate": 0.0009598599439775911, + "loss": 0.567, + "step": 1539 + }, + { + "epoch": 0.8603351955307262, + "grad_norm": 0.5806109309196472, + "learning_rate": 0.0009598319327731094, + "loss": 0.4277, + "step": 1540 + }, + { + "epoch": 0.8608938547486034, + "grad_norm": 5.576815605163574, + "learning_rate": 0.0009598039215686275, + "loss": 0.4417, + "step": 1541 + }, + { + "epoch": 0.8614525139664805, + "grad_norm": 1.2157827615737915, + "learning_rate": 0.0009597759103641457, + "loss": 0.4929, + "step": 1542 + }, + { + "epoch": 0.8620111731843575, + "grad_norm": 0.614841103553772, + "learning_rate": 0.0009597478991596639, + "loss": 0.5091, + "step": 1543 + }, + { + "epoch": 0.8625698324022346, + "grad_norm": 0.5197077989578247, + "learning_rate": 0.0009597198879551821, + "loss": 0.3851, + "step": 1544 + }, + { + "epoch": 0.8631284916201117, + "grad_norm": 0.4492049813270569, + "learning_rate": 0.0009596918767507004, + "loss": 0.4737, + "step": 1545 + }, + { + "epoch": 0.8636871508379889, + "grad_norm": 2.4102890491485596, + "learning_rate": 0.0009596638655462185, + "loss": 0.439, + "step": 1546 + }, + { + "epoch": 0.8642458100558659, + "grad_norm": 0.7981939315795898, + "learning_rate": 0.0009596358543417367, + "loss": 0.6839, + "step": 1547 + }, + { + "epoch": 0.864804469273743, + "grad_norm": 0.5832147598266602, + "learning_rate": 0.0009596078431372549, + "loss": 0.5052, + "step": 1548 + }, + { + "epoch": 0.8653631284916201, + "grad_norm": 0.976466178894043, + "learning_rate": 0.0009595798319327731, + "loss": 0.3913, + "step": 1549 + }, + { + "epoch": 0.8659217877094972, + "grad_norm": 1.053292155265808, + "learning_rate": 0.0009595518207282914, + "loss": 0.554, + "step": 1550 + }, + { + "epoch": 0.8664804469273742, + "grad_norm": 1.20173978805542, + "learning_rate": 0.0009595238095238095, + "loss": 0.5341, + "step": 1551 + }, + { + "epoch": 0.8670391061452514, + "grad_norm": 0.8238740563392639, + "learning_rate": 0.0009594957983193277, + "loss": 0.5558, + "step": 1552 + }, + { + "epoch": 0.8675977653631285, + "grad_norm": 0.5024362802505493, + "learning_rate": 0.0009594677871148459, + "loss": 0.4121, + "step": 1553 + }, + { + "epoch": 0.8681564245810056, + "grad_norm": 0.7707139253616333, + "learning_rate": 0.0009594397759103641, + "loss": 0.4614, + "step": 1554 + }, + { + "epoch": 0.8687150837988827, + "grad_norm": 0.5292174220085144, + "learning_rate": 0.0009594117647058825, + "loss": 0.4525, + "step": 1555 + }, + { + "epoch": 0.8692737430167597, + "grad_norm": 0.7752367854118347, + "learning_rate": 0.0009593837535014007, + "loss": 0.4536, + "step": 1556 + }, + { + "epoch": 0.8698324022346369, + "grad_norm": 0.5903460383415222, + "learning_rate": 0.0009593557422969188, + "loss": 0.4733, + "step": 1557 + }, + { + "epoch": 0.870391061452514, + "grad_norm": 1.443812608718872, + "learning_rate": 0.000959327731092437, + "loss": 0.7236, + "step": 1558 + }, + { + "epoch": 0.8709497206703911, + "grad_norm": 0.6632566452026367, + "learning_rate": 0.0009592997198879552, + "loss": 0.5525, + "step": 1559 + }, + { + "epoch": 0.8715083798882681, + "grad_norm": 0.5941494107246399, + "learning_rate": 0.0009592717086834734, + "loss": 0.4692, + "step": 1560 + }, + { + "epoch": 0.8720670391061452, + "grad_norm": 0.42596128582954407, + "learning_rate": 0.0009592436974789917, + "loss": 0.3983, + "step": 1561 + }, + { + "epoch": 0.8726256983240224, + "grad_norm": 0.7327626347541809, + "learning_rate": 0.0009592156862745098, + "loss": 0.5121, + "step": 1562 + }, + { + "epoch": 0.8731843575418995, + "grad_norm": 2.932777166366577, + "learning_rate": 0.000959187675070028, + "loss": 0.4547, + "step": 1563 + }, + { + "epoch": 0.8737430167597765, + "grad_norm": 0.5944148898124695, + "learning_rate": 0.0009591596638655462, + "loss": 0.6036, + "step": 1564 + }, + { + "epoch": 0.8743016759776536, + "grad_norm": 0.6992055177688599, + "learning_rate": 0.0009591316526610644, + "loss": 0.5199, + "step": 1565 + }, + { + "epoch": 0.8748603351955307, + "grad_norm": 0.7510950565338135, + "learning_rate": 0.0009591036414565827, + "loss": 0.6397, + "step": 1566 + }, + { + "epoch": 0.8754189944134079, + "grad_norm": 0.6152811646461487, + "learning_rate": 0.0009590756302521008, + "loss": 0.511, + "step": 1567 + }, + { + "epoch": 0.8759776536312849, + "grad_norm": 0.5206465721130371, + "learning_rate": 0.000959047619047619, + "loss": 0.4913, + "step": 1568 + }, + { + "epoch": 0.876536312849162, + "grad_norm": 1.9374691247940063, + "learning_rate": 0.0009590196078431372, + "loss": 0.9544, + "step": 1569 + }, + { + "epoch": 0.8770949720670391, + "grad_norm": 0.9915938377380371, + "learning_rate": 0.0009589915966386554, + "loss": 0.5381, + "step": 1570 + }, + { + "epoch": 0.8776536312849162, + "grad_norm": 1.0157009363174438, + "learning_rate": 0.0009589635854341738, + "loss": 0.4593, + "step": 1571 + }, + { + "epoch": 0.8782122905027933, + "grad_norm": 0.7500250339508057, + "learning_rate": 0.000958935574229692, + "loss": 0.5349, + "step": 1572 + }, + { + "epoch": 0.8787709497206704, + "grad_norm": 0.6948181986808777, + "learning_rate": 0.0009589075630252101, + "loss": 0.5535, + "step": 1573 + }, + { + "epoch": 0.8793296089385475, + "grad_norm": 0.7886123657226562, + "learning_rate": 0.0009588795518207283, + "loss": 0.5553, + "step": 1574 + }, + { + "epoch": 0.8798882681564246, + "grad_norm": 0.46402987837791443, + "learning_rate": 0.0009588515406162465, + "loss": 0.4221, + "step": 1575 + }, + { + "epoch": 0.8804469273743016, + "grad_norm": 0.9824563264846802, + "learning_rate": 0.0009588235294117648, + "loss": 0.5114, + "step": 1576 + }, + { + "epoch": 0.8810055865921788, + "grad_norm": 2.5311214923858643, + "learning_rate": 0.000958795518207283, + "loss": 0.4866, + "step": 1577 + }, + { + "epoch": 0.8815642458100559, + "grad_norm": 4.942215442657471, + "learning_rate": 0.0009587675070028011, + "loss": 0.6619, + "step": 1578 + }, + { + "epoch": 0.882122905027933, + "grad_norm": 1.346177577972412, + "learning_rate": 0.0009587394957983193, + "loss": 0.5644, + "step": 1579 + }, + { + "epoch": 0.88268156424581, + "grad_norm": 0.5623987317085266, + "learning_rate": 0.0009587114845938375, + "loss": 0.5696, + "step": 1580 + }, + { + "epoch": 0.8832402234636871, + "grad_norm": 0.5278266668319702, + "learning_rate": 0.0009586834733893558, + "loss": 0.5945, + "step": 1581 + }, + { + "epoch": 0.8837988826815643, + "grad_norm": 0.5834859013557434, + "learning_rate": 0.000958655462184874, + "loss": 0.4662, + "step": 1582 + }, + { + "epoch": 0.8843575418994414, + "grad_norm": 1.6163126230239868, + "learning_rate": 0.0009586274509803921, + "loss": 0.4954, + "step": 1583 + }, + { + "epoch": 0.8849162011173184, + "grad_norm": 0.6733283996582031, + "learning_rate": 0.0009585994397759103, + "loss": 0.4711, + "step": 1584 + }, + { + "epoch": 0.8854748603351955, + "grad_norm": 0.5481908917427063, + "learning_rate": 0.0009585714285714285, + "loss": 0.5297, + "step": 1585 + }, + { + "epoch": 0.8860335195530726, + "grad_norm": 1.2589175701141357, + "learning_rate": 0.0009585434173669468, + "loss": 0.5592, + "step": 1586 + }, + { + "epoch": 0.8865921787709498, + "grad_norm": 0.6359618306159973, + "learning_rate": 0.000958515406162465, + "loss": 0.5307, + "step": 1587 + }, + { + "epoch": 0.8871508379888268, + "grad_norm": 3.2101893424987793, + "learning_rate": 0.0009584873949579833, + "loss": 0.4774, + "step": 1588 + }, + { + "epoch": 0.8877094972067039, + "grad_norm": 0.7876462936401367, + "learning_rate": 0.0009584593837535014, + "loss": 0.5563, + "step": 1589 + }, + { + "epoch": 0.888268156424581, + "grad_norm": 0.5258017778396606, + "learning_rate": 0.0009584313725490196, + "loss": 0.4406, + "step": 1590 + }, + { + "epoch": 0.8888268156424581, + "grad_norm": 0.7677879929542542, + "learning_rate": 0.0009584033613445379, + "loss": 0.5258, + "step": 1591 + }, + { + "epoch": 0.8893854748603351, + "grad_norm": 3.9905264377593994, + "learning_rate": 0.0009583753501400561, + "loss": 0.465, + "step": 1592 + }, + { + "epoch": 0.8899441340782123, + "grad_norm": 1.254267692565918, + "learning_rate": 0.0009583473389355743, + "loss": 0.4444, + "step": 1593 + }, + { + "epoch": 0.8905027932960894, + "grad_norm": 0.5092368125915527, + "learning_rate": 0.0009583193277310924, + "loss": 0.3964, + "step": 1594 + }, + { + "epoch": 0.8910614525139665, + "grad_norm": 0.6793869137763977, + "learning_rate": 0.0009582913165266106, + "loss": 0.5527, + "step": 1595 + }, + { + "epoch": 0.8916201117318435, + "grad_norm": 0.542073130607605, + "learning_rate": 0.0009582633053221289, + "loss": 0.5264, + "step": 1596 + }, + { + "epoch": 0.8921787709497206, + "grad_norm": 0.5154288411140442, + "learning_rate": 0.0009582352941176471, + "loss": 0.4487, + "step": 1597 + }, + { + "epoch": 0.8927374301675978, + "grad_norm": 0.7843811511993408, + "learning_rate": 0.0009582072829131653, + "loss": 0.5066, + "step": 1598 + }, + { + "epoch": 0.8932960893854749, + "grad_norm": 0.7047399282455444, + "learning_rate": 0.0009581792717086834, + "loss": 0.5834, + "step": 1599 + }, + { + "epoch": 0.8938547486033519, + "grad_norm": 0.6373627781867981, + "learning_rate": 0.0009581512605042016, + "loss": 0.4788, + "step": 1600 + }, + { + "epoch": 0.894413407821229, + "grad_norm": 0.9333066344261169, + "learning_rate": 0.0009581232492997199, + "loss": 0.4882, + "step": 1601 + }, + { + "epoch": 0.8949720670391061, + "grad_norm": 0.8540565967559814, + "learning_rate": 0.0009580952380952381, + "loss": 0.5884, + "step": 1602 + }, + { + "epoch": 0.8955307262569833, + "grad_norm": 0.5950315594673157, + "learning_rate": 0.0009580672268907563, + "loss": 0.6124, + "step": 1603 + }, + { + "epoch": 0.8960893854748604, + "grad_norm": 1.0109556913375854, + "learning_rate": 0.0009580392156862746, + "loss": 0.5261, + "step": 1604 + }, + { + "epoch": 0.8966480446927374, + "grad_norm": 1.743308663368225, + "learning_rate": 0.0009580112044817926, + "loss": 0.442, + "step": 1605 + }, + { + "epoch": 0.8972067039106145, + "grad_norm": 0.5196111798286438, + "learning_rate": 0.000957983193277311, + "loss": 0.4999, + "step": 1606 + }, + { + "epoch": 0.8977653631284916, + "grad_norm": 1.3952412605285645, + "learning_rate": 0.0009579551820728292, + "loss": 0.5323, + "step": 1607 + }, + { + "epoch": 0.8983240223463688, + "grad_norm": 0.7273664474487305, + "learning_rate": 0.0009579271708683474, + "loss": 0.5089, + "step": 1608 + }, + { + "epoch": 0.8988826815642458, + "grad_norm": 0.6543103456497192, + "learning_rate": 0.0009578991596638656, + "loss": 0.4647, + "step": 1609 + }, + { + "epoch": 0.8994413407821229, + "grad_norm": 1.605989694595337, + "learning_rate": 0.0009578711484593837, + "loss": 0.5853, + "step": 1610 + }, + { + "epoch": 0.9, + "grad_norm": 0.6041949987411499, + "learning_rate": 0.000957843137254902, + "loss": 0.4679, + "step": 1611 + }, + { + "epoch": 0.9005586592178771, + "grad_norm": 0.6679384112358093, + "learning_rate": 0.0009578151260504202, + "loss": 0.5184, + "step": 1612 + }, + { + "epoch": 0.9011173184357542, + "grad_norm": 1.1944327354431152, + "learning_rate": 0.0009577871148459384, + "loss": 0.4659, + "step": 1613 + }, + { + "epoch": 0.9016759776536313, + "grad_norm": 4.223563194274902, + "learning_rate": 0.0009577591036414566, + "loss": 0.7264, + "step": 1614 + }, + { + "epoch": 0.9022346368715084, + "grad_norm": 1.6066884994506836, + "learning_rate": 0.0009577310924369747, + "loss": 0.4625, + "step": 1615 + }, + { + "epoch": 0.9027932960893855, + "grad_norm": 0.6855906248092651, + "learning_rate": 0.000957703081232493, + "loss": 0.4257, + "step": 1616 + }, + { + "epoch": 0.9033519553072625, + "grad_norm": 0.42809733748435974, + "learning_rate": 0.0009576750700280112, + "loss": 0.4092, + "step": 1617 + }, + { + "epoch": 0.9039106145251397, + "grad_norm": 0.7035035490989685, + "learning_rate": 0.0009576470588235294, + "loss": 0.4174, + "step": 1618 + }, + { + "epoch": 0.9044692737430168, + "grad_norm": 0.6464139223098755, + "learning_rate": 0.0009576190476190476, + "loss": 0.512, + "step": 1619 + }, + { + "epoch": 0.9050279329608939, + "grad_norm": 0.48409298062324524, + "learning_rate": 0.0009575910364145658, + "loss": 0.456, + "step": 1620 + }, + { + "epoch": 0.9055865921787709, + "grad_norm": 0.4653385281562805, + "learning_rate": 0.0009575630252100841, + "loss": 0.4392, + "step": 1621 + }, + { + "epoch": 0.906145251396648, + "grad_norm": 0.5446961522102356, + "learning_rate": 0.0009575350140056023, + "loss": 0.548, + "step": 1622 + }, + { + "epoch": 0.9067039106145252, + "grad_norm": 0.5252688527107239, + "learning_rate": 0.0009575070028011205, + "loss": 0.442, + "step": 1623 + }, + { + "epoch": 0.9072625698324023, + "grad_norm": 0.6384592652320862, + "learning_rate": 0.0009574789915966387, + "loss": 0.4584, + "step": 1624 + }, + { + "epoch": 0.9078212290502793, + "grad_norm": 0.6891541481018066, + "learning_rate": 0.0009574509803921569, + "loss": 0.5276, + "step": 1625 + }, + { + "epoch": 0.9083798882681564, + "grad_norm": 0.5562767386436462, + "learning_rate": 0.0009574229691876751, + "loss": 0.4626, + "step": 1626 + }, + { + "epoch": 0.9089385474860335, + "grad_norm": 0.829547107219696, + "learning_rate": 0.0009573949579831933, + "loss": 0.6602, + "step": 1627 + }, + { + "epoch": 0.9094972067039107, + "grad_norm": 0.7216253280639648, + "learning_rate": 0.0009573669467787115, + "loss": 0.4428, + "step": 1628 + }, + { + "epoch": 0.9100558659217877, + "grad_norm": 0.6616451740264893, + "learning_rate": 0.0009573389355742297, + "loss": 0.6104, + "step": 1629 + }, + { + "epoch": 0.9106145251396648, + "grad_norm": 0.8982250690460205, + "learning_rate": 0.0009573109243697479, + "loss": 0.6243, + "step": 1630 + }, + { + "epoch": 0.9111731843575419, + "grad_norm": 0.7317488789558411, + "learning_rate": 0.0009572829131652661, + "loss": 0.5087, + "step": 1631 + }, + { + "epoch": 0.911731843575419, + "grad_norm": 0.5401996970176697, + "learning_rate": 0.0009572549019607843, + "loss": 0.5399, + "step": 1632 + }, + { + "epoch": 0.912290502793296, + "grad_norm": 0.48129868507385254, + "learning_rate": 0.0009572268907563025, + "loss": 0.45, + "step": 1633 + }, + { + "epoch": 0.9128491620111732, + "grad_norm": 0.8447461724281311, + "learning_rate": 0.0009571988795518207, + "loss": 0.5154, + "step": 1634 + }, + { + "epoch": 0.9134078212290503, + "grad_norm": 0.7238785624504089, + "learning_rate": 0.0009571708683473389, + "loss": 0.5845, + "step": 1635 + }, + { + "epoch": 0.9139664804469274, + "grad_norm": 4.996368408203125, + "learning_rate": 0.0009571428571428573, + "loss": 0.4767, + "step": 1636 + }, + { + "epoch": 0.9145251396648044, + "grad_norm": 0.6995401382446289, + "learning_rate": 0.0009571148459383754, + "loss": 0.4001, + "step": 1637 + }, + { + "epoch": 0.9150837988826815, + "grad_norm": 0.6319295763969421, + "learning_rate": 0.0009570868347338936, + "loss": 0.5082, + "step": 1638 + }, + { + "epoch": 0.9156424581005587, + "grad_norm": 6.914177417755127, + "learning_rate": 0.0009570588235294118, + "loss": 0.4413, + "step": 1639 + }, + { + "epoch": 0.9162011173184358, + "grad_norm": 5.069177150726318, + "learning_rate": 0.00095703081232493, + "loss": 0.4618, + "step": 1640 + }, + { + "epoch": 0.9167597765363128, + "grad_norm": 0.8674466609954834, + "learning_rate": 0.0009570028011204483, + "loss": 0.5621, + "step": 1641 + }, + { + "epoch": 0.9173184357541899, + "grad_norm": 0.7931159734725952, + "learning_rate": 0.0009569747899159664, + "loss": 0.4304, + "step": 1642 + }, + { + "epoch": 0.917877094972067, + "grad_norm": 0.6191529035568237, + "learning_rate": 0.0009569467787114846, + "loss": 0.5212, + "step": 1643 + }, + { + "epoch": 0.9184357541899442, + "grad_norm": 0.4811217784881592, + "learning_rate": 0.0009569187675070028, + "loss": 0.5537, + "step": 1644 + }, + { + "epoch": 0.9189944134078212, + "grad_norm": 0.754035234451294, + "learning_rate": 0.000956890756302521, + "loss": 0.5198, + "step": 1645 + }, + { + "epoch": 0.9195530726256983, + "grad_norm": 0.6202226877212524, + "learning_rate": 0.0009568627450980393, + "loss": 0.5066, + "step": 1646 + }, + { + "epoch": 0.9201117318435754, + "grad_norm": 0.7443351745605469, + "learning_rate": 0.0009568347338935574, + "loss": 0.408, + "step": 1647 + }, + { + "epoch": 0.9206703910614525, + "grad_norm": 0.9001688361167908, + "learning_rate": 0.0009568067226890756, + "loss": 0.5128, + "step": 1648 + }, + { + "epoch": 0.9212290502793297, + "grad_norm": 1.2606867551803589, + "learning_rate": 0.0009567787114845938, + "loss": 0.5465, + "step": 1649 + }, + { + "epoch": 0.9217877094972067, + "grad_norm": 0.5945485830307007, + "learning_rate": 0.000956750700280112, + "loss": 0.5672, + "step": 1650 + }, + { + "epoch": 0.9223463687150838, + "grad_norm": 2.541149616241455, + "learning_rate": 0.0009567226890756303, + "loss": 0.5646, + "step": 1651 + }, + { + "epoch": 0.9229050279329609, + "grad_norm": 0.5519164800643921, + "learning_rate": 0.0009566946778711485, + "loss": 0.5333, + "step": 1652 + }, + { + "epoch": 0.923463687150838, + "grad_norm": 1.086621642112732, + "learning_rate": 0.0009566666666666666, + "loss": 0.4063, + "step": 1653 + }, + { + "epoch": 0.924022346368715, + "grad_norm": 0.8017902374267578, + "learning_rate": 0.0009566386554621849, + "loss": 0.5293, + "step": 1654 + }, + { + "epoch": 0.9245810055865922, + "grad_norm": 0.48129141330718994, + "learning_rate": 0.0009566106442577031, + "loss": 0.4635, + "step": 1655 + }, + { + "epoch": 0.9251396648044693, + "grad_norm": 0.7882272601127625, + "learning_rate": 0.0009565826330532214, + "loss": 0.539, + "step": 1656 + }, + { + "epoch": 0.9256983240223464, + "grad_norm": 0.5756723284721375, + "learning_rate": 0.0009565546218487396, + "loss": 0.5251, + "step": 1657 + }, + { + "epoch": 0.9262569832402234, + "grad_norm": 0.5985224843025208, + "learning_rate": 0.0009565266106442577, + "loss": 0.5465, + "step": 1658 + }, + { + "epoch": 0.9268156424581006, + "grad_norm": 0.8080509901046753, + "learning_rate": 0.0009564985994397759, + "loss": 0.5516, + "step": 1659 + }, + { + "epoch": 0.9273743016759777, + "grad_norm": 2.229174852371216, + "learning_rate": 0.0009564705882352941, + "loss": 0.5039, + "step": 1660 + }, + { + "epoch": 0.9279329608938548, + "grad_norm": 1.6585955619812012, + "learning_rate": 0.0009564425770308124, + "loss": 0.5118, + "step": 1661 + }, + { + "epoch": 0.9284916201117318, + "grad_norm": 0.7574945092201233, + "learning_rate": 0.0009564145658263306, + "loss": 0.5281, + "step": 1662 + }, + { + "epoch": 0.9290502793296089, + "grad_norm": 0.6235692501068115, + "learning_rate": 0.0009563865546218487, + "loss": 0.4852, + "step": 1663 + }, + { + "epoch": 0.929608938547486, + "grad_norm": 0.4636309742927551, + "learning_rate": 0.0009563585434173669, + "loss": 0.4089, + "step": 1664 + }, + { + "epoch": 0.9301675977653632, + "grad_norm": 0.5483853220939636, + "learning_rate": 0.0009563305322128851, + "loss": 0.374, + "step": 1665 + }, + { + "epoch": 0.9307262569832402, + "grad_norm": 1.0265727043151855, + "learning_rate": 0.0009563025210084034, + "loss": 0.5324, + "step": 1666 + }, + { + "epoch": 0.9312849162011173, + "grad_norm": 0.7922943234443665, + "learning_rate": 0.0009562745098039216, + "loss": 0.4633, + "step": 1667 + }, + { + "epoch": 0.9318435754189944, + "grad_norm": 0.6494239568710327, + "learning_rate": 0.0009562464985994398, + "loss": 0.5183, + "step": 1668 + }, + { + "epoch": 0.9324022346368716, + "grad_norm": 0.6400390267372131, + "learning_rate": 0.0009562184873949579, + "loss": 0.4662, + "step": 1669 + }, + { + "epoch": 0.9329608938547486, + "grad_norm": 0.6184152364730835, + "learning_rate": 0.0009561904761904761, + "loss": 0.497, + "step": 1670 + }, + { + "epoch": 0.9335195530726257, + "grad_norm": 2.4548275470733643, + "learning_rate": 0.0009561624649859945, + "loss": 0.4708, + "step": 1671 + }, + { + "epoch": 0.9340782122905028, + "grad_norm": 0.7691540718078613, + "learning_rate": 0.0009561344537815127, + "loss": 0.6856, + "step": 1672 + }, + { + "epoch": 0.9346368715083799, + "grad_norm": 0.8598730564117432, + "learning_rate": 0.0009561064425770309, + "loss": 0.5985, + "step": 1673 + }, + { + "epoch": 0.9351955307262569, + "grad_norm": 12.871362686157227, + "learning_rate": 0.000956078431372549, + "loss": 0.4781, + "step": 1674 + }, + { + "epoch": 0.9357541899441341, + "grad_norm": 0.4955989122390747, + "learning_rate": 0.0009560504201680672, + "loss": 0.3648, + "step": 1675 + }, + { + "epoch": 0.9363128491620112, + "grad_norm": 0.7067756652832031, + "learning_rate": 0.0009560224089635855, + "loss": 0.4822, + "step": 1676 + }, + { + "epoch": 0.9368715083798883, + "grad_norm": 0.8544894456863403, + "learning_rate": 0.0009559943977591037, + "loss": 0.4771, + "step": 1677 + }, + { + "epoch": 0.9374301675977653, + "grad_norm": 1.1533461809158325, + "learning_rate": 0.0009559663865546219, + "loss": 0.5395, + "step": 1678 + }, + { + "epoch": 0.9379888268156424, + "grad_norm": 0.9036091566085815, + "learning_rate": 0.00095593837535014, + "loss": 0.4618, + "step": 1679 + }, + { + "epoch": 0.9385474860335196, + "grad_norm": 1.4535365104675293, + "learning_rate": 0.0009559103641456582, + "loss": 0.4719, + "step": 1680 + }, + { + "epoch": 0.9391061452513967, + "grad_norm": 1.0793883800506592, + "learning_rate": 0.0009558823529411765, + "loss": 0.5846, + "step": 1681 + }, + { + "epoch": 0.9396648044692737, + "grad_norm": 0.9983944892883301, + "learning_rate": 0.0009558543417366947, + "loss": 0.5385, + "step": 1682 + }, + { + "epoch": 0.9402234636871508, + "grad_norm": 0.5338730812072754, + "learning_rate": 0.0009558263305322129, + "loss": 0.4756, + "step": 1683 + }, + { + "epoch": 0.9407821229050279, + "grad_norm": 0.8283085823059082, + "learning_rate": 0.0009557983193277311, + "loss": 0.494, + "step": 1684 + }, + { + "epoch": 0.9413407821229051, + "grad_norm": 1.6396960020065308, + "learning_rate": 0.0009557703081232492, + "loss": 0.4261, + "step": 1685 + }, + { + "epoch": 0.9418994413407821, + "grad_norm": 0.6962085962295532, + "learning_rate": 0.0009557422969187676, + "loss": 0.5559, + "step": 1686 + }, + { + "epoch": 0.9424581005586592, + "grad_norm": 0.8268210887908936, + "learning_rate": 0.0009557142857142858, + "loss": 0.4957, + "step": 1687 + }, + { + "epoch": 0.9430167597765363, + "grad_norm": 0.529246985912323, + "learning_rate": 0.000955686274509804, + "loss": 0.4346, + "step": 1688 + }, + { + "epoch": 0.9435754189944134, + "grad_norm": 1.1125683784484863, + "learning_rate": 0.0009556582633053222, + "loss": 0.4382, + "step": 1689 + }, + { + "epoch": 0.9441340782122905, + "grad_norm": 0.6227396130561829, + "learning_rate": 0.0009556302521008403, + "loss": 0.4709, + "step": 1690 + }, + { + "epoch": 0.9446927374301676, + "grad_norm": 0.5941323041915894, + "learning_rate": 0.0009556022408963586, + "loss": 0.3957, + "step": 1691 + }, + { + "epoch": 0.9452513966480447, + "grad_norm": 1.0418164730072021, + "learning_rate": 0.0009555742296918768, + "loss": 0.4414, + "step": 1692 + }, + { + "epoch": 0.9458100558659218, + "grad_norm": 0.851777195930481, + "learning_rate": 0.000955546218487395, + "loss": 0.5859, + "step": 1693 + }, + { + "epoch": 0.9463687150837988, + "grad_norm": 0.585878849029541, + "learning_rate": 0.0009555182072829132, + "loss": 0.4641, + "step": 1694 + }, + { + "epoch": 0.946927374301676, + "grad_norm": 0.8957218527793884, + "learning_rate": 0.0009554901960784313, + "loss": 0.5636, + "step": 1695 + }, + { + "epoch": 0.9474860335195531, + "grad_norm": 0.8570482134819031, + "learning_rate": 0.0009554621848739496, + "loss": 0.5148, + "step": 1696 + }, + { + "epoch": 0.9480446927374302, + "grad_norm": 0.5498955845832825, + "learning_rate": 0.0009554341736694678, + "loss": 0.512, + "step": 1697 + }, + { + "epoch": 0.9486033519553073, + "grad_norm": 0.6170614361763, + "learning_rate": 0.000955406162464986, + "loss": 0.5516, + "step": 1698 + }, + { + "epoch": 0.9491620111731843, + "grad_norm": 0.8566698431968689, + "learning_rate": 0.0009553781512605042, + "loss": 0.4912, + "step": 1699 + }, + { + "epoch": 0.9497206703910615, + "grad_norm": 0.5369873642921448, + "learning_rate": 0.0009553501400560224, + "loss": 0.4751, + "step": 1700 + }, + { + "epoch": 0.9502793296089386, + "grad_norm": 0.694010317325592, + "learning_rate": 0.0009553221288515406, + "loss": 0.4858, + "step": 1701 + }, + { + "epoch": 0.9508379888268157, + "grad_norm": 5.918373107910156, + "learning_rate": 0.0009552941176470588, + "loss": 0.5437, + "step": 1702 + }, + { + "epoch": 0.9513966480446927, + "grad_norm": 0.7442026734352112, + "learning_rate": 0.0009552661064425771, + "loss": 0.4207, + "step": 1703 + }, + { + "epoch": 0.9519553072625698, + "grad_norm": 0.6700260043144226, + "learning_rate": 0.0009552380952380953, + "loss": 0.4938, + "step": 1704 + }, + { + "epoch": 0.952513966480447, + "grad_norm": 0.7718746066093445, + "learning_rate": 0.0009552100840336135, + "loss": 0.3786, + "step": 1705 + }, + { + "epoch": 0.9530726256983241, + "grad_norm": 1.0982671976089478, + "learning_rate": 0.0009551820728291317, + "loss": 0.4839, + "step": 1706 + }, + { + "epoch": 0.9536312849162011, + "grad_norm": 0.5608755946159363, + "learning_rate": 0.0009551540616246499, + "loss": 0.5376, + "step": 1707 + }, + { + "epoch": 0.9541899441340782, + "grad_norm": 2.001518726348877, + "learning_rate": 0.0009551260504201681, + "loss": 0.6045, + "step": 1708 + }, + { + "epoch": 0.9547486033519553, + "grad_norm": 1.1883184909820557, + "learning_rate": 0.0009550980392156863, + "loss": 0.4384, + "step": 1709 + }, + { + "epoch": 0.9553072625698324, + "grad_norm": 0.7585318088531494, + "learning_rate": 0.0009550700280112045, + "loss": 0.5194, + "step": 1710 + }, + { + "epoch": 0.9558659217877095, + "grad_norm": 1.5778087377548218, + "learning_rate": 0.0009550420168067228, + "loss": 0.4411, + "step": 1711 + }, + { + "epoch": 0.9564245810055866, + "grad_norm": 0.7450963258743286, + "learning_rate": 0.0009550140056022409, + "loss": 0.4841, + "step": 1712 + }, + { + "epoch": 0.9569832402234637, + "grad_norm": 0.5968680381774902, + "learning_rate": 0.0009549859943977591, + "loss": 0.6217, + "step": 1713 + }, + { + "epoch": 0.9575418994413408, + "grad_norm": 1.0186375379562378, + "learning_rate": 0.0009549579831932773, + "loss": 0.5569, + "step": 1714 + }, + { + "epoch": 0.9581005586592178, + "grad_norm": 0.5329403281211853, + "learning_rate": 0.0009549299719887955, + "loss": 0.4947, + "step": 1715 + }, + { + "epoch": 0.958659217877095, + "grad_norm": 0.572655200958252, + "learning_rate": 0.0009549019607843138, + "loss": 0.5223, + "step": 1716 + }, + { + "epoch": 0.9592178770949721, + "grad_norm": 0.5006478428840637, + "learning_rate": 0.0009548739495798319, + "loss": 0.4951, + "step": 1717 + }, + { + "epoch": 0.9597765363128492, + "grad_norm": 0.6001412868499756, + "learning_rate": 0.0009548459383753501, + "loss": 0.4952, + "step": 1718 + }, + { + "epoch": 0.9603351955307262, + "grad_norm": 2.2571427822113037, + "learning_rate": 0.0009548179271708684, + "loss": 0.4362, + "step": 1719 + }, + { + "epoch": 0.9608938547486033, + "grad_norm": 0.5362712740898132, + "learning_rate": 0.0009547899159663866, + "loss": 0.4559, + "step": 1720 + }, + { + "epoch": 0.9614525139664805, + "grad_norm": 0.6957347393035889, + "learning_rate": 0.0009547619047619049, + "loss": 0.4905, + "step": 1721 + }, + { + "epoch": 0.9620111731843576, + "grad_norm": 0.6331759095191956, + "learning_rate": 0.000954733893557423, + "loss": 0.4175, + "step": 1722 + }, + { + "epoch": 0.9625698324022346, + "grad_norm": 0.5453664064407349, + "learning_rate": 0.0009547058823529412, + "loss": 0.5286, + "step": 1723 + }, + { + "epoch": 0.9631284916201117, + "grad_norm": 0.594581127166748, + "learning_rate": 0.0009546778711484594, + "loss": 0.4282, + "step": 1724 + }, + { + "epoch": 0.9636871508379888, + "grad_norm": 0.9801068305969238, + "learning_rate": 0.0009546498599439776, + "loss": 0.5001, + "step": 1725 + }, + { + "epoch": 0.964245810055866, + "grad_norm": 1.5954480171203613, + "learning_rate": 0.0009546218487394959, + "loss": 0.4733, + "step": 1726 + }, + { + "epoch": 0.964804469273743, + "grad_norm": 0.4829848110675812, + "learning_rate": 0.0009545938375350141, + "loss": 0.3831, + "step": 1727 + }, + { + "epoch": 0.9653631284916201, + "grad_norm": 0.6249749064445496, + "learning_rate": 0.0009545658263305322, + "loss": 0.4997, + "step": 1728 + }, + { + "epoch": 0.9659217877094972, + "grad_norm": 1.116471529006958, + "learning_rate": 0.0009545378151260504, + "loss": 0.7677, + "step": 1729 + }, + { + "epoch": 0.9664804469273743, + "grad_norm": 2.0907201766967773, + "learning_rate": 0.0009545098039215686, + "loss": 0.3794, + "step": 1730 + }, + { + "epoch": 0.9670391061452513, + "grad_norm": 0.9863475561141968, + "learning_rate": 0.0009544817927170869, + "loss": 0.5375, + "step": 1731 + }, + { + "epoch": 0.9675977653631285, + "grad_norm": 3.3474059104919434, + "learning_rate": 0.0009544537815126051, + "loss": 0.4954, + "step": 1732 + }, + { + "epoch": 0.9681564245810056, + "grad_norm": 1.8166532516479492, + "learning_rate": 0.0009544257703081232, + "loss": 0.418, + "step": 1733 + }, + { + "epoch": 0.9687150837988827, + "grad_norm": 1.42117440700531, + "learning_rate": 0.0009543977591036414, + "loss": 0.5219, + "step": 1734 + }, + { + "epoch": 0.9692737430167597, + "grad_norm": 0.9605877995491028, + "learning_rate": 0.0009543697478991596, + "loss": 0.4613, + "step": 1735 + }, + { + "epoch": 0.9698324022346368, + "grad_norm": 4.026614665985107, + "learning_rate": 0.000954341736694678, + "loss": 0.7381, + "step": 1736 + }, + { + "epoch": 0.970391061452514, + "grad_norm": 0.5737302303314209, + "learning_rate": 0.0009543137254901962, + "loss": 0.6071, + "step": 1737 + }, + { + "epoch": 0.9709497206703911, + "grad_norm": 0.6030662655830383, + "learning_rate": 0.0009542857142857143, + "loss": 0.5175, + "step": 1738 + }, + { + "epoch": 0.9715083798882681, + "grad_norm": 0.5691815614700317, + "learning_rate": 0.0009542577030812325, + "loss": 0.4475, + "step": 1739 + }, + { + "epoch": 0.9720670391061452, + "grad_norm": 0.4971826374530792, + "learning_rate": 0.0009542296918767507, + "loss": 0.4734, + "step": 1740 + }, + { + "epoch": 0.9726256983240223, + "grad_norm": 0.7168786525726318, + "learning_rate": 0.000954201680672269, + "loss": 0.5472, + "step": 1741 + }, + { + "epoch": 0.9731843575418995, + "grad_norm": 0.49384424090385437, + "learning_rate": 0.0009541736694677872, + "loss": 0.5945, + "step": 1742 + }, + { + "epoch": 0.9737430167597766, + "grad_norm": 1.3744165897369385, + "learning_rate": 0.0009541456582633054, + "loss": 0.4235, + "step": 1743 + }, + { + "epoch": 0.9743016759776536, + "grad_norm": 1.0167593955993652, + "learning_rate": 0.0009541176470588235, + "loss": 0.5223, + "step": 1744 + }, + { + "epoch": 0.9748603351955307, + "grad_norm": 0.8459349274635315, + "learning_rate": 0.0009540896358543417, + "loss": 0.3932, + "step": 1745 + }, + { + "epoch": 0.9754189944134078, + "grad_norm": 0.5819987654685974, + "learning_rate": 0.00095406162464986, + "loss": 0.4391, + "step": 1746 + }, + { + "epoch": 0.975977653631285, + "grad_norm": 0.5702646970748901, + "learning_rate": 0.0009540336134453782, + "loss": 0.5308, + "step": 1747 + }, + { + "epoch": 0.976536312849162, + "grad_norm": 0.6302896738052368, + "learning_rate": 0.0009540056022408964, + "loss": 0.4809, + "step": 1748 + }, + { + "epoch": 0.9770949720670391, + "grad_norm": 0.7520524859428406, + "learning_rate": 0.0009539775910364145, + "loss": 0.461, + "step": 1749 + }, + { + "epoch": 0.9776536312849162, + "grad_norm": 1.06060791015625, + "learning_rate": 0.0009539495798319327, + "loss": 0.6726, + "step": 1750 + }, + { + "epoch": 0.9782122905027933, + "grad_norm": 0.8254221677780151, + "learning_rate": 0.000953921568627451, + "loss": 0.5178, + "step": 1751 + }, + { + "epoch": 0.9787709497206704, + "grad_norm": 0.5896238684654236, + "learning_rate": 0.0009538935574229693, + "loss": 0.6828, + "step": 1752 + }, + { + "epoch": 0.9793296089385475, + "grad_norm": 0.4934066832065582, + "learning_rate": 0.0009538655462184875, + "loss": 0.4935, + "step": 1753 + }, + { + "epoch": 0.9798882681564246, + "grad_norm": 0.6261158585548401, + "learning_rate": 0.0009538375350140056, + "loss": 0.4269, + "step": 1754 + }, + { + "epoch": 0.9804469273743017, + "grad_norm": 1.0440266132354736, + "learning_rate": 0.0009538095238095238, + "loss": 0.5511, + "step": 1755 + }, + { + "epoch": 0.9810055865921787, + "grad_norm": 0.5914115309715271, + "learning_rate": 0.0009537815126050421, + "loss": 0.5074, + "step": 1756 + }, + { + "epoch": 0.9815642458100559, + "grad_norm": 0.6855108737945557, + "learning_rate": 0.0009537535014005603, + "loss": 0.5304, + "step": 1757 + }, + { + "epoch": 0.982122905027933, + "grad_norm": 0.9795001149177551, + "learning_rate": 0.0009537254901960785, + "loss": 0.4309, + "step": 1758 + }, + { + "epoch": 0.9826815642458101, + "grad_norm": 0.8306459784507751, + "learning_rate": 0.0009536974789915967, + "loss": 0.516, + "step": 1759 + }, + { + "epoch": 0.9832402234636871, + "grad_norm": 2.5804319381713867, + "learning_rate": 0.0009536694677871148, + "loss": 0.5219, + "step": 1760 + }, + { + "epoch": 0.9837988826815642, + "grad_norm": 0.8659358620643616, + "learning_rate": 0.0009536414565826331, + "loss": 0.4489, + "step": 1761 + }, + { + "epoch": 0.9843575418994414, + "grad_norm": 17.09529685974121, + "learning_rate": 0.0009536134453781513, + "loss": 0.5589, + "step": 1762 + }, + { + "epoch": 0.9849162011173185, + "grad_norm": 0.4888550937175751, + "learning_rate": 0.0009535854341736695, + "loss": 0.5087, + "step": 1763 + }, + { + "epoch": 0.9854748603351955, + "grad_norm": 0.8920523524284363, + "learning_rate": 0.0009535574229691877, + "loss": 0.389, + "step": 1764 + }, + { + "epoch": 0.9860335195530726, + "grad_norm": 0.6390886306762695, + "learning_rate": 0.0009535294117647058, + "loss": 0.4592, + "step": 1765 + }, + { + "epoch": 0.9865921787709497, + "grad_norm": 0.8547239899635315, + "learning_rate": 0.0009535014005602241, + "loss": 0.505, + "step": 1766 + }, + { + "epoch": 0.9871508379888269, + "grad_norm": 1.1240209341049194, + "learning_rate": 0.0009534733893557423, + "loss": 0.5856, + "step": 1767 + }, + { + "epoch": 0.9877094972067039, + "grad_norm": 2.0392749309539795, + "learning_rate": 0.0009534453781512606, + "loss": 0.5459, + "step": 1768 + }, + { + "epoch": 0.988268156424581, + "grad_norm": 1.228261947631836, + "learning_rate": 0.0009534173669467788, + "loss": 0.5185, + "step": 1769 + }, + { + "epoch": 0.9888268156424581, + "grad_norm": 1.0804953575134277, + "learning_rate": 0.0009533893557422969, + "loss": 0.6126, + "step": 1770 + }, + { + "epoch": 0.9893854748603352, + "grad_norm": 1.4725781679153442, + "learning_rate": 0.0009533613445378152, + "loss": 0.5833, + "step": 1771 + }, + { + "epoch": 0.9899441340782122, + "grad_norm": 0.6424615979194641, + "learning_rate": 0.0009533333333333334, + "loss": 0.51, + "step": 1772 + }, + { + "epoch": 0.9905027932960894, + "grad_norm": 0.6401039361953735, + "learning_rate": 0.0009533053221288516, + "loss": 0.4098, + "step": 1773 + }, + { + "epoch": 0.9910614525139665, + "grad_norm": 0.7132853269577026, + "learning_rate": 0.0009532773109243698, + "loss": 0.5708, + "step": 1774 + }, + { + "epoch": 0.9916201117318436, + "grad_norm": 0.5756059885025024, + "learning_rate": 0.000953249299719888, + "loss": 0.5526, + "step": 1775 + }, + { + "epoch": 0.9921787709497206, + "grad_norm": 0.6990247368812561, + "learning_rate": 0.0009532212885154062, + "loss": 0.5504, + "step": 1776 + }, + { + "epoch": 0.9927374301675977, + "grad_norm": 0.7180444598197937, + "learning_rate": 0.0009531932773109244, + "loss": 0.4779, + "step": 1777 + }, + { + "epoch": 0.9932960893854749, + "grad_norm": 0.5672318339347839, + "learning_rate": 0.0009531652661064426, + "loss": 0.486, + "step": 1778 + }, + { + "epoch": 0.993854748603352, + "grad_norm": 0.7721855640411377, + "learning_rate": 0.0009531372549019608, + "loss": 0.5043, + "step": 1779 + }, + { + "epoch": 0.994413407821229, + "grad_norm": 0.7544890642166138, + "learning_rate": 0.000953109243697479, + "loss": 0.5308, + "step": 1780 + }, + { + "epoch": 0.9949720670391061, + "grad_norm": 1.666582703590393, + "learning_rate": 0.0009530812324929971, + "loss": 0.4838, + "step": 1781 + }, + { + "epoch": 0.9955307262569832, + "grad_norm": 0.5985055565834045, + "learning_rate": 0.0009530532212885154, + "loss": 0.4106, + "step": 1782 + }, + { + "epoch": 0.9960893854748604, + "grad_norm": 3.151643991470337, + "learning_rate": 0.0009530252100840336, + "loss": 0.4749, + "step": 1783 + }, + { + "epoch": 0.9966480446927374, + "grad_norm": 2.540830135345459, + "learning_rate": 0.0009529971988795518, + "loss": 0.5104, + "step": 1784 + }, + { + "epoch": 0.9972067039106145, + "grad_norm": 0.5820245742797852, + "learning_rate": 0.0009529691876750701, + "loss": 0.4112, + "step": 1785 + }, + { + "epoch": 0.9977653631284916, + "grad_norm": 0.8016902208328247, + "learning_rate": 0.0009529411764705882, + "loss": 0.4685, + "step": 1786 + }, + { + "epoch": 0.9983240223463687, + "grad_norm": 0.6371076107025146, + "learning_rate": 0.0009529131652661065, + "loss": 0.5928, + "step": 1787 + }, + { + "epoch": 0.9988826815642458, + "grad_norm": 0.6752023100852966, + "learning_rate": 0.0009528851540616247, + "loss": 0.5414, + "step": 1788 + }, + { + "epoch": 0.9994413407821229, + "grad_norm": 0.6677910089492798, + "learning_rate": 0.0009528571428571429, + "loss": 0.4194, + "step": 1789 + }, + { + "epoch": 1.0, + "grad_norm": 0.8213701844215393, + "learning_rate": 0.0009528291316526611, + "loss": 0.5417, + "step": 1790 + }, + { + "epoch": 1.000558659217877, + "grad_norm": 0.8009673357009888, + "learning_rate": 0.0009528011204481793, + "loss": 0.583, + "step": 1791 + }, + { + "epoch": 1.0011173184357542, + "grad_norm": 0.5417811274528503, + "learning_rate": 0.0009527731092436975, + "loss": 0.532, + "step": 1792 + }, + { + "epoch": 1.0016759776536313, + "grad_norm": 1.0531492233276367, + "learning_rate": 0.0009527450980392157, + "loss": 0.5112, + "step": 1793 + }, + { + "epoch": 1.0022346368715085, + "grad_norm": 2.9258675575256348, + "learning_rate": 0.0009527170868347339, + "loss": 0.4589, + "step": 1794 + }, + { + "epoch": 1.0027932960893855, + "grad_norm": 0.7311922907829285, + "learning_rate": 0.0009526890756302521, + "loss": 0.4816, + "step": 1795 + }, + { + "epoch": 1.0033519553072625, + "grad_norm": 0.5544597506523132, + "learning_rate": 0.0009526610644257703, + "loss": 0.4589, + "step": 1796 + }, + { + "epoch": 1.0039106145251397, + "grad_norm": 0.6038646697998047, + "learning_rate": 0.0009526330532212885, + "loss": 0.4494, + "step": 1797 + }, + { + "epoch": 1.0044692737430168, + "grad_norm": 1.2839605808258057, + "learning_rate": 0.0009526050420168067, + "loss": 0.5132, + "step": 1798 + }, + { + "epoch": 1.0050279329608938, + "grad_norm": 0.5195278525352478, + "learning_rate": 0.0009525770308123249, + "loss": 0.4801, + "step": 1799 + }, + { + "epoch": 1.005586592178771, + "grad_norm": 0.5706222653388977, + "learning_rate": 0.0009525490196078431, + "loss": 0.3827, + "step": 1800 + }, + { + "epoch": 1.006145251396648, + "grad_norm": 2.2582526206970215, + "learning_rate": 0.0009525210084033614, + "loss": 0.4738, + "step": 1801 + }, + { + "epoch": 1.0067039106145252, + "grad_norm": 5.209932327270508, + "learning_rate": 0.0009524929971988796, + "loss": 0.3825, + "step": 1802 + }, + { + "epoch": 1.0072625698324023, + "grad_norm": 1.2498819828033447, + "learning_rate": 0.0009524649859943978, + "loss": 0.5034, + "step": 1803 + }, + { + "epoch": 1.0078212290502793, + "grad_norm": 0.6335483193397522, + "learning_rate": 0.000952436974789916, + "loss": 0.4548, + "step": 1804 + }, + { + "epoch": 1.0083798882681565, + "grad_norm": 1.3591705560684204, + "learning_rate": 0.0009524089635854342, + "loss": 0.4128, + "step": 1805 + }, + { + "epoch": 1.0089385474860335, + "grad_norm": 0.4688738286495209, + "learning_rate": 0.0009523809523809524, + "loss": 0.3854, + "step": 1806 + }, + { + "epoch": 1.0094972067039105, + "grad_norm": 0.5695164799690247, + "learning_rate": 0.0009523529411764707, + "loss": 0.4268, + "step": 1807 + }, + { + "epoch": 1.0100558659217878, + "grad_norm": 0.5155953168869019, + "learning_rate": 0.0009523249299719888, + "loss": 0.4241, + "step": 1808 + }, + { + "epoch": 1.0106145251396648, + "grad_norm": 0.6092314720153809, + "learning_rate": 0.000952296918767507, + "loss": 0.4624, + "step": 1809 + }, + { + "epoch": 1.011173184357542, + "grad_norm": 0.7365458607673645, + "learning_rate": 0.0009522689075630252, + "loss": 0.5287, + "step": 1810 + }, + { + "epoch": 1.011731843575419, + "grad_norm": 0.522511899471283, + "learning_rate": 0.0009522408963585434, + "loss": 0.4051, + "step": 1811 + }, + { + "epoch": 1.012290502793296, + "grad_norm": 1.2410948276519775, + "learning_rate": 0.0009522128851540617, + "loss": 0.6009, + "step": 1812 + }, + { + "epoch": 1.0128491620111733, + "grad_norm": 1.6744202375411987, + "learning_rate": 0.0009521848739495798, + "loss": 0.5283, + "step": 1813 + }, + { + "epoch": 1.0134078212290503, + "grad_norm": 0.6357499957084656, + "learning_rate": 0.000952156862745098, + "loss": 0.46, + "step": 1814 + }, + { + "epoch": 1.0139664804469273, + "grad_norm": 0.5928827524185181, + "learning_rate": 0.0009521288515406162, + "loss": 0.5303, + "step": 1815 + }, + { + "epoch": 1.0145251396648045, + "grad_norm": 0.43160295486450195, + "learning_rate": 0.0009521008403361344, + "loss": 0.464, + "step": 1816 + }, + { + "epoch": 1.0150837988826815, + "grad_norm": 0.7292526960372925, + "learning_rate": 0.0009520728291316528, + "loss": 0.6032, + "step": 1817 + }, + { + "epoch": 1.0156424581005588, + "grad_norm": 1.6284388303756714, + "learning_rate": 0.0009520448179271709, + "loss": 0.6681, + "step": 1818 + }, + { + "epoch": 1.0162011173184358, + "grad_norm": 1.0405173301696777, + "learning_rate": 0.0009520168067226891, + "loss": 0.536, + "step": 1819 + }, + { + "epoch": 1.0167597765363128, + "grad_norm": 0.5934048295021057, + "learning_rate": 0.0009519887955182073, + "loss": 0.4234, + "step": 1820 + }, + { + "epoch": 1.01731843575419, + "grad_norm": 0.8506879210472107, + "learning_rate": 0.0009519607843137255, + "loss": 0.5327, + "step": 1821 + }, + { + "epoch": 1.017877094972067, + "grad_norm": 1.1442513465881348, + "learning_rate": 0.0009519327731092438, + "loss": 0.6624, + "step": 1822 + }, + { + "epoch": 1.018435754189944, + "grad_norm": 0.9382103681564331, + "learning_rate": 0.000951904761904762, + "loss": 0.4661, + "step": 1823 + }, + { + "epoch": 1.0189944134078213, + "grad_norm": 1.3499640226364136, + "learning_rate": 0.0009518767507002801, + "loss": 0.7089, + "step": 1824 + }, + { + "epoch": 1.0195530726256983, + "grad_norm": 1.1573398113250732, + "learning_rate": 0.0009518487394957983, + "loss": 0.4748, + "step": 1825 + }, + { + "epoch": 1.0201117318435755, + "grad_norm": 0.8220486640930176, + "learning_rate": 0.0009518207282913165, + "loss": 0.5487, + "step": 1826 + }, + { + "epoch": 1.0206703910614525, + "grad_norm": 0.7242711186408997, + "learning_rate": 0.0009517927170868348, + "loss": 0.4882, + "step": 1827 + }, + { + "epoch": 1.0212290502793295, + "grad_norm": 4.589696407318115, + "learning_rate": 0.000951764705882353, + "loss": 0.6962, + "step": 1828 + }, + { + "epoch": 1.0217877094972068, + "grad_norm": 0.7695217132568359, + "learning_rate": 0.0009517366946778711, + "loss": 0.5039, + "step": 1829 + }, + { + "epoch": 1.0223463687150838, + "grad_norm": 1.1315627098083496, + "learning_rate": 0.0009517086834733893, + "loss": 0.5383, + "step": 1830 + }, + { + "epoch": 1.0229050279329608, + "grad_norm": 0.9076684713363647, + "learning_rate": 0.0009516806722689075, + "loss": 0.4484, + "step": 1831 + }, + { + "epoch": 1.023463687150838, + "grad_norm": 7.834429740905762, + "learning_rate": 0.0009516526610644258, + "loss": 0.6796, + "step": 1832 + }, + { + "epoch": 1.024022346368715, + "grad_norm": 0.5402182340621948, + "learning_rate": 0.000951624649859944, + "loss": 0.4995, + "step": 1833 + }, + { + "epoch": 1.0245810055865923, + "grad_norm": 0.6438533067703247, + "learning_rate": 0.0009515966386554621, + "loss": 0.6354, + "step": 1834 + }, + { + "epoch": 1.0251396648044693, + "grad_norm": 0.8940802812576294, + "learning_rate": 0.0009515686274509804, + "loss": 0.5287, + "step": 1835 + }, + { + "epoch": 1.0256983240223463, + "grad_norm": 0.9579599499702454, + "learning_rate": 0.0009515406162464986, + "loss": 0.4779, + "step": 1836 + }, + { + "epoch": 1.0262569832402235, + "grad_norm": 0.7511836886405945, + "learning_rate": 0.0009515126050420169, + "loss": 0.4558, + "step": 1837 + }, + { + "epoch": 1.0268156424581005, + "grad_norm": 0.46512100100517273, + "learning_rate": 0.0009514845938375351, + "loss": 0.472, + "step": 1838 + }, + { + "epoch": 1.0273743016759775, + "grad_norm": 0.5905054807662964, + "learning_rate": 0.0009514565826330533, + "loss": 0.4722, + "step": 1839 + }, + { + "epoch": 1.0279329608938548, + "grad_norm": 0.5499210953712463, + "learning_rate": 0.0009514285714285714, + "loss": 0.4799, + "step": 1840 + }, + { + "epoch": 1.0284916201117318, + "grad_norm": 0.8468977212905884, + "learning_rate": 0.0009514005602240896, + "loss": 0.444, + "step": 1841 + }, + { + "epoch": 1.029050279329609, + "grad_norm": 0.7623924016952515, + "learning_rate": 0.0009513725490196079, + "loss": 0.4681, + "step": 1842 + }, + { + "epoch": 1.029608938547486, + "grad_norm": 0.5783141851425171, + "learning_rate": 0.0009513445378151261, + "loss": 0.4932, + "step": 1843 + }, + { + "epoch": 1.030167597765363, + "grad_norm": 0.6379013061523438, + "learning_rate": 0.0009513165266106443, + "loss": 0.5716, + "step": 1844 + }, + { + "epoch": 1.0307262569832403, + "grad_norm": 0.7331202030181885, + "learning_rate": 0.0009512885154061624, + "loss": 0.4656, + "step": 1845 + }, + { + "epoch": 1.0312849162011173, + "grad_norm": 0.8809279799461365, + "learning_rate": 0.0009512605042016806, + "loss": 0.6752, + "step": 1846 + }, + { + "epoch": 1.0318435754189945, + "grad_norm": 0.4546876847743988, + "learning_rate": 0.0009512324929971989, + "loss": 0.4629, + "step": 1847 + }, + { + "epoch": 1.0324022346368715, + "grad_norm": 12.710982322692871, + "learning_rate": 0.0009512044817927171, + "loss": 0.4853, + "step": 1848 + }, + { + "epoch": 1.0329608938547485, + "grad_norm": 0.8372478485107422, + "learning_rate": 0.0009511764705882353, + "loss": 0.4681, + "step": 1849 + }, + { + "epoch": 1.0335195530726258, + "grad_norm": 0.9921114444732666, + "learning_rate": 0.0009511484593837534, + "loss": 0.5301, + "step": 1850 + }, + { + "epoch": 1.0340782122905028, + "grad_norm": 1.7067806720733643, + "learning_rate": 0.0009511204481792716, + "loss": 0.5073, + "step": 1851 + }, + { + "epoch": 1.0346368715083798, + "grad_norm": 0.6098561882972717, + "learning_rate": 0.00095109243697479, + "loss": 0.4594, + "step": 1852 + }, + { + "epoch": 1.035195530726257, + "grad_norm": 0.6715817451477051, + "learning_rate": 0.0009510644257703082, + "loss": 0.5017, + "step": 1853 + }, + { + "epoch": 1.035754189944134, + "grad_norm": 0.6328045725822449, + "learning_rate": 0.0009510364145658264, + "loss": 0.58, + "step": 1854 + }, + { + "epoch": 1.0363128491620113, + "grad_norm": 0.6053429841995239, + "learning_rate": 0.0009510084033613446, + "loss": 0.5138, + "step": 1855 + }, + { + "epoch": 1.0368715083798883, + "grad_norm": 2.4972400665283203, + "learning_rate": 0.0009509803921568627, + "loss": 0.5526, + "step": 1856 + }, + { + "epoch": 1.0374301675977653, + "grad_norm": 1.4493807554244995, + "learning_rate": 0.000950952380952381, + "loss": 0.5394, + "step": 1857 + }, + { + "epoch": 1.0379888268156425, + "grad_norm": 0.7984095215797424, + "learning_rate": 0.0009509243697478992, + "loss": 0.3829, + "step": 1858 + }, + { + "epoch": 1.0385474860335195, + "grad_norm": 0.600412130355835, + "learning_rate": 0.0009508963585434174, + "loss": 0.517, + "step": 1859 + }, + { + "epoch": 1.0391061452513966, + "grad_norm": 0.9996770620346069, + "learning_rate": 0.0009508683473389356, + "loss": 0.4835, + "step": 1860 + }, + { + "epoch": 1.0396648044692738, + "grad_norm": 0.6391316056251526, + "learning_rate": 0.0009508403361344537, + "loss": 0.493, + "step": 1861 + }, + { + "epoch": 1.0402234636871508, + "grad_norm": 0.7396573424339294, + "learning_rate": 0.000950812324929972, + "loss": 0.4412, + "step": 1862 + }, + { + "epoch": 1.040782122905028, + "grad_norm": 0.7529088854789734, + "learning_rate": 0.0009507843137254902, + "loss": 0.5204, + "step": 1863 + }, + { + "epoch": 1.041340782122905, + "grad_norm": 1.4906047582626343, + "learning_rate": 0.0009507563025210084, + "loss": 0.3793, + "step": 1864 + }, + { + "epoch": 1.041899441340782, + "grad_norm": 0.8225281238555908, + "learning_rate": 0.0009507282913165266, + "loss": 0.684, + "step": 1865 + }, + { + "epoch": 1.0424581005586593, + "grad_norm": 1.064224123954773, + "learning_rate": 0.0009507002801120447, + "loss": 0.4323, + "step": 1866 + }, + { + "epoch": 1.0430167597765363, + "grad_norm": 0.8468764424324036, + "learning_rate": 0.0009506722689075631, + "loss": 0.5064, + "step": 1867 + }, + { + "epoch": 1.0435754189944133, + "grad_norm": 1.3305532932281494, + "learning_rate": 0.0009506442577030813, + "loss": 0.6069, + "step": 1868 + }, + { + "epoch": 1.0441340782122905, + "grad_norm": 0.5554779767990112, + "learning_rate": 0.0009506162464985995, + "loss": 0.5527, + "step": 1869 + }, + { + "epoch": 1.0446927374301676, + "grad_norm": 0.6082237362861633, + "learning_rate": 0.0009505882352941177, + "loss": 0.4117, + "step": 1870 + }, + { + "epoch": 1.0452513966480448, + "grad_norm": 0.5815912485122681, + "learning_rate": 0.0009505602240896359, + "loss": 0.4673, + "step": 1871 + }, + { + "epoch": 1.0458100558659218, + "grad_norm": 3.348129987716675, + "learning_rate": 0.0009505322128851541, + "loss": 0.4915, + "step": 1872 + }, + { + "epoch": 1.0463687150837988, + "grad_norm": 0.7753360867500305, + "learning_rate": 0.0009505042016806723, + "loss": 0.4562, + "step": 1873 + }, + { + "epoch": 1.046927374301676, + "grad_norm": 0.8797411918640137, + "learning_rate": 0.0009504761904761905, + "loss": 0.5448, + "step": 1874 + }, + { + "epoch": 1.047486033519553, + "grad_norm": 0.4864180386066437, + "learning_rate": 0.0009504481792717087, + "loss": 0.4564, + "step": 1875 + }, + { + "epoch": 1.04804469273743, + "grad_norm": 0.4797047972679138, + "learning_rate": 0.0009504201680672269, + "loss": 0.4936, + "step": 1876 + }, + { + "epoch": 1.0486033519553073, + "grad_norm": 0.6791684031486511, + "learning_rate": 0.0009503921568627451, + "loss": 0.5268, + "step": 1877 + }, + { + "epoch": 1.0491620111731843, + "grad_norm": 0.6789284348487854, + "learning_rate": 0.0009503641456582633, + "loss": 0.5303, + "step": 1878 + }, + { + "epoch": 1.0497206703910615, + "grad_norm": 2.4155874252319336, + "learning_rate": 0.0009503361344537815, + "loss": 0.5527, + "step": 1879 + }, + { + "epoch": 1.0502793296089385, + "grad_norm": 0.5727857351303101, + "learning_rate": 0.0009503081232492997, + "loss": 0.4154, + "step": 1880 + }, + { + "epoch": 1.0508379888268156, + "grad_norm": 0.8701090216636658, + "learning_rate": 0.0009502801120448179, + "loss": 0.5501, + "step": 1881 + }, + { + "epoch": 1.0513966480446928, + "grad_norm": 0.5002401471138, + "learning_rate": 0.0009502521008403361, + "loss": 0.4729, + "step": 1882 + }, + { + "epoch": 1.0519553072625698, + "grad_norm": 0.6317011117935181, + "learning_rate": 0.0009502240896358544, + "loss": 0.5321, + "step": 1883 + }, + { + "epoch": 1.052513966480447, + "grad_norm": 0.671137809753418, + "learning_rate": 0.0009501960784313726, + "loss": 0.5161, + "step": 1884 + }, + { + "epoch": 1.053072625698324, + "grad_norm": 0.9408466219902039, + "learning_rate": 0.0009501680672268908, + "loss": 0.4937, + "step": 1885 + }, + { + "epoch": 1.053631284916201, + "grad_norm": 0.5962716937065125, + "learning_rate": 0.000950140056022409, + "loss": 0.4943, + "step": 1886 + }, + { + "epoch": 1.0541899441340783, + "grad_norm": 1.048824667930603, + "learning_rate": 0.0009501120448179273, + "loss": 0.4504, + "step": 1887 + }, + { + "epoch": 1.0547486033519553, + "grad_norm": 0.581483006477356, + "learning_rate": 0.0009500840336134454, + "loss": 0.494, + "step": 1888 + }, + { + "epoch": 1.0553072625698323, + "grad_norm": 0.7170062065124512, + "learning_rate": 0.0009500560224089636, + "loss": 0.4955, + "step": 1889 + }, + { + "epoch": 1.0558659217877095, + "grad_norm": 0.5089759230613708, + "learning_rate": 0.0009500280112044818, + "loss": 0.4707, + "step": 1890 + }, + { + "epoch": 1.0564245810055866, + "grad_norm": 2.935894727706909, + "learning_rate": 0.00095, + "loss": 0.5117, + "step": 1891 + }, + { + "epoch": 1.0569832402234638, + "grad_norm": 0.6432769298553467, + "learning_rate": 0.0009499719887955183, + "loss": 0.5352, + "step": 1892 + }, + { + "epoch": 1.0575418994413408, + "grad_norm": 0.6752887964248657, + "learning_rate": 0.0009499439775910364, + "loss": 0.4226, + "step": 1893 + }, + { + "epoch": 1.0581005586592178, + "grad_norm": 1.1219720840454102, + "learning_rate": 0.0009499159663865546, + "loss": 0.6474, + "step": 1894 + }, + { + "epoch": 1.058659217877095, + "grad_norm": 0.7912245988845825, + "learning_rate": 0.0009498879551820728, + "loss": 0.608, + "step": 1895 + }, + { + "epoch": 1.059217877094972, + "grad_norm": 1.3063058853149414, + "learning_rate": 0.000949859943977591, + "loss": 0.4818, + "step": 1896 + }, + { + "epoch": 1.059776536312849, + "grad_norm": 0.8381514549255371, + "learning_rate": 0.0009498319327731093, + "loss": 0.4716, + "step": 1897 + }, + { + "epoch": 1.0603351955307263, + "grad_norm": 1.1324119567871094, + "learning_rate": 0.0009498039215686274, + "loss": 0.5246, + "step": 1898 + }, + { + "epoch": 1.0608938547486033, + "grad_norm": 0.8672372698783875, + "learning_rate": 0.0009497759103641456, + "loss": 0.4882, + "step": 1899 + }, + { + "epoch": 1.0614525139664805, + "grad_norm": 1.3416739702224731, + "learning_rate": 0.0009497478991596639, + "loss": 0.5836, + "step": 1900 + }, + { + "epoch": 1.0620111731843576, + "grad_norm": 0.539715588092804, + "learning_rate": 0.0009497198879551821, + "loss": 0.5111, + "step": 1901 + }, + { + "epoch": 1.0625698324022346, + "grad_norm": 0.629913866519928, + "learning_rate": 0.0009496918767507004, + "loss": 0.468, + "step": 1902 + }, + { + "epoch": 1.0631284916201118, + "grad_norm": 0.60403972864151, + "learning_rate": 0.0009496638655462186, + "loss": 0.5335, + "step": 1903 + }, + { + "epoch": 1.0636871508379888, + "grad_norm": 0.5676341652870178, + "learning_rate": 0.0009496358543417367, + "loss": 0.6233, + "step": 1904 + }, + { + "epoch": 1.0642458100558658, + "grad_norm": 0.46623653173446655, + "learning_rate": 0.0009496078431372549, + "loss": 0.3396, + "step": 1905 + }, + { + "epoch": 1.064804469273743, + "grad_norm": 2.7224881649017334, + "learning_rate": 0.0009495798319327731, + "loss": 0.5402, + "step": 1906 + }, + { + "epoch": 1.06536312849162, + "grad_norm": 0.7040708065032959, + "learning_rate": 0.0009495518207282914, + "loss": 0.4954, + "step": 1907 + }, + { + "epoch": 1.0659217877094973, + "grad_norm": 4.1388044357299805, + "learning_rate": 0.0009495238095238096, + "loss": 0.4982, + "step": 1908 + }, + { + "epoch": 1.0664804469273743, + "grad_norm": 0.6924494504928589, + "learning_rate": 0.0009494957983193277, + "loss": 0.4696, + "step": 1909 + }, + { + "epoch": 1.0670391061452513, + "grad_norm": 0.7390439510345459, + "learning_rate": 0.0009494677871148459, + "loss": 0.5629, + "step": 1910 + }, + { + "epoch": 1.0675977653631286, + "grad_norm": 0.7155113220214844, + "learning_rate": 0.0009494397759103641, + "loss": 0.4215, + "step": 1911 + }, + { + "epoch": 1.0681564245810056, + "grad_norm": 1.014770746231079, + "learning_rate": 0.0009494117647058824, + "loss": 0.5626, + "step": 1912 + }, + { + "epoch": 1.0687150837988826, + "grad_norm": 0.5904166102409363, + "learning_rate": 0.0009493837535014006, + "loss": 0.5667, + "step": 1913 + }, + { + "epoch": 1.0692737430167598, + "grad_norm": 0.9801692366600037, + "learning_rate": 0.0009493557422969187, + "loss": 0.5677, + "step": 1914 + }, + { + "epoch": 1.0698324022346368, + "grad_norm": 0.8028150200843811, + "learning_rate": 0.0009493277310924369, + "loss": 0.427, + "step": 1915 + }, + { + "epoch": 1.070391061452514, + "grad_norm": 1.2819465398788452, + "learning_rate": 0.0009492997198879551, + "loss": 0.5448, + "step": 1916 + }, + { + "epoch": 1.070949720670391, + "grad_norm": 0.5701756477355957, + "learning_rate": 0.0009492717086834735, + "loss": 0.5001, + "step": 1917 + }, + { + "epoch": 1.071508379888268, + "grad_norm": 0.5249055624008179, + "learning_rate": 0.0009492436974789917, + "loss": 0.4983, + "step": 1918 + }, + { + "epoch": 1.0720670391061453, + "grad_norm": 4.500863552093506, + "learning_rate": 0.0009492156862745099, + "loss": 0.5184, + "step": 1919 + }, + { + "epoch": 1.0726256983240223, + "grad_norm": 0.6784878373146057, + "learning_rate": 0.000949187675070028, + "loss": 0.5462, + "step": 1920 + }, + { + "epoch": 1.0731843575418996, + "grad_norm": 0.9951833486557007, + "learning_rate": 0.0009491596638655462, + "loss": 0.5515, + "step": 1921 + }, + { + "epoch": 1.0737430167597766, + "grad_norm": 0.5707071423530579, + "learning_rate": 0.0009491316526610645, + "loss": 0.5275, + "step": 1922 + }, + { + "epoch": 1.0743016759776536, + "grad_norm": 0.6549288034439087, + "learning_rate": 0.0009491036414565827, + "loss": 0.5335, + "step": 1923 + }, + { + "epoch": 1.0748603351955308, + "grad_norm": 0.5716150403022766, + "learning_rate": 0.0009490756302521009, + "loss": 0.4694, + "step": 1924 + }, + { + "epoch": 1.0754189944134078, + "grad_norm": 0.6139609813690186, + "learning_rate": 0.000949047619047619, + "loss": 0.5359, + "step": 1925 + }, + { + "epoch": 1.0759776536312848, + "grad_norm": 1.3087342977523804, + "learning_rate": 0.0009490196078431372, + "loss": 0.5169, + "step": 1926 + }, + { + "epoch": 1.076536312849162, + "grad_norm": 0.472334086894989, + "learning_rate": 0.0009489915966386555, + "loss": 0.479, + "step": 1927 + }, + { + "epoch": 1.077094972067039, + "grad_norm": 0.48861464858055115, + "learning_rate": 0.0009489635854341737, + "loss": 0.5137, + "step": 1928 + }, + { + "epoch": 1.077653631284916, + "grad_norm": 0.8765335083007812, + "learning_rate": 0.0009489355742296919, + "loss": 0.5395, + "step": 1929 + }, + { + "epoch": 1.0782122905027933, + "grad_norm": 0.8806900978088379, + "learning_rate": 0.00094890756302521, + "loss": 0.4267, + "step": 1930 + }, + { + "epoch": 1.0787709497206703, + "grad_norm": 0.614098072052002, + "learning_rate": 0.0009488795518207282, + "loss": 0.6302, + "step": 1931 + }, + { + "epoch": 1.0793296089385476, + "grad_norm": 0.6210005283355713, + "learning_rate": 0.0009488515406162466, + "loss": 0.4408, + "step": 1932 + }, + { + "epoch": 1.0798882681564246, + "grad_norm": 0.3822392523288727, + "learning_rate": 0.0009488235294117648, + "loss": 0.3646, + "step": 1933 + }, + { + "epoch": 1.0804469273743016, + "grad_norm": 0.5221074223518372, + "learning_rate": 0.000948795518207283, + "loss": 0.4972, + "step": 1934 + }, + { + "epoch": 1.0810055865921788, + "grad_norm": 0.7435020208358765, + "learning_rate": 0.0009487675070028012, + "loss": 0.4364, + "step": 1935 + }, + { + "epoch": 1.0815642458100558, + "grad_norm": 1.6197137832641602, + "learning_rate": 0.0009487394957983193, + "loss": 0.517, + "step": 1936 + }, + { + "epoch": 1.082122905027933, + "grad_norm": 0.582164466381073, + "learning_rate": 0.0009487114845938376, + "loss": 0.4018, + "step": 1937 + }, + { + "epoch": 1.08268156424581, + "grad_norm": 0.6637523174285889, + "learning_rate": 0.0009486834733893558, + "loss": 0.539, + "step": 1938 + }, + { + "epoch": 1.083240223463687, + "grad_norm": 0.5906127095222473, + "learning_rate": 0.000948655462184874, + "loss": 0.4602, + "step": 1939 + }, + { + "epoch": 1.0837988826815643, + "grad_norm": 0.7345956563949585, + "learning_rate": 0.0009486274509803922, + "loss": 0.4681, + "step": 1940 + }, + { + "epoch": 1.0843575418994413, + "grad_norm": 1.3723267316818237, + "learning_rate": 0.0009485994397759103, + "loss": 0.698, + "step": 1941 + }, + { + "epoch": 1.0849162011173183, + "grad_norm": 0.6047074794769287, + "learning_rate": 0.0009485714285714286, + "loss": 0.4643, + "step": 1942 + }, + { + "epoch": 1.0854748603351956, + "grad_norm": 3.5328667163848877, + "learning_rate": 0.0009485434173669468, + "loss": 0.4468, + "step": 1943 + }, + { + "epoch": 1.0860335195530726, + "grad_norm": 0.9016256928443909, + "learning_rate": 0.000948515406162465, + "loss": 0.4569, + "step": 1944 + }, + { + "epoch": 1.0865921787709498, + "grad_norm": 0.4552290141582489, + "learning_rate": 0.0009484873949579832, + "loss": 0.5389, + "step": 1945 + }, + { + "epoch": 1.0871508379888268, + "grad_norm": 0.6696059107780457, + "learning_rate": 0.0009484593837535013, + "loss": 0.4362, + "step": 1946 + }, + { + "epoch": 1.0877094972067038, + "grad_norm": 0.6108041405677795, + "learning_rate": 0.0009484313725490196, + "loss": 0.4632, + "step": 1947 + }, + { + "epoch": 1.088268156424581, + "grad_norm": 0.562382161617279, + "learning_rate": 0.0009484033613445378, + "loss": 0.5649, + "step": 1948 + }, + { + "epoch": 1.088826815642458, + "grad_norm": 0.5756879448890686, + "learning_rate": 0.000948375350140056, + "loss": 0.3809, + "step": 1949 + }, + { + "epoch": 1.089385474860335, + "grad_norm": 1.2760363817214966, + "learning_rate": 0.0009483473389355743, + "loss": 0.5882, + "step": 1950 + }, + { + "epoch": 1.0899441340782123, + "grad_norm": 0.6584094762802124, + "learning_rate": 0.0009483193277310925, + "loss": 0.525, + "step": 1951 + }, + { + "epoch": 1.0905027932960893, + "grad_norm": 0.5820613503456116, + "learning_rate": 0.0009482913165266107, + "loss": 0.3538, + "step": 1952 + }, + { + "epoch": 1.0910614525139666, + "grad_norm": 0.7930244207382202, + "learning_rate": 0.0009482633053221289, + "loss": 0.5407, + "step": 1953 + }, + { + "epoch": 1.0916201117318436, + "grad_norm": 0.4802961051464081, + "learning_rate": 0.0009482352941176471, + "loss": 0.4492, + "step": 1954 + }, + { + "epoch": 1.0921787709497206, + "grad_norm": 0.7456192374229431, + "learning_rate": 0.0009482072829131653, + "loss": 0.5938, + "step": 1955 + }, + { + "epoch": 1.0927374301675978, + "grad_norm": 0.6738383173942566, + "learning_rate": 0.0009481792717086835, + "loss": 0.5535, + "step": 1956 + }, + { + "epoch": 1.0932960893854748, + "grad_norm": 1.3703727722167969, + "learning_rate": 0.0009481512605042017, + "loss": 0.5562, + "step": 1957 + }, + { + "epoch": 1.0938547486033519, + "grad_norm": 1.293674349784851, + "learning_rate": 0.0009481232492997199, + "loss": 0.5104, + "step": 1958 + }, + { + "epoch": 1.094413407821229, + "grad_norm": 1.3170990943908691, + "learning_rate": 0.0009480952380952381, + "loss": 0.5793, + "step": 1959 + }, + { + "epoch": 1.094972067039106, + "grad_norm": 0.9616127014160156, + "learning_rate": 0.0009480672268907563, + "loss": 0.6517, + "step": 1960 + }, + { + "epoch": 1.0955307262569833, + "grad_norm": 1.4081939458847046, + "learning_rate": 0.0009480392156862745, + "loss": 0.5712, + "step": 1961 + }, + { + "epoch": 1.0960893854748603, + "grad_norm": 0.7713123559951782, + "learning_rate": 0.0009480112044817928, + "loss": 0.546, + "step": 1962 + }, + { + "epoch": 1.0966480446927374, + "grad_norm": 0.6402348279953003, + "learning_rate": 0.0009479831932773109, + "loss": 0.6007, + "step": 1963 + }, + { + "epoch": 1.0972067039106146, + "grad_norm": 0.5454915165901184, + "learning_rate": 0.0009479551820728291, + "loss": 0.4806, + "step": 1964 + }, + { + "epoch": 1.0977653631284916, + "grad_norm": 0.6856116056442261, + "learning_rate": 0.0009479271708683474, + "loss": 0.4945, + "step": 1965 + }, + { + "epoch": 1.0983240223463686, + "grad_norm": 0.6550681591033936, + "learning_rate": 0.0009478991596638656, + "loss": 0.3967, + "step": 1966 + }, + { + "epoch": 1.0988826815642458, + "grad_norm": 0.7778172492980957, + "learning_rate": 0.0009478711484593839, + "loss": 0.534, + "step": 1967 + }, + { + "epoch": 1.0994413407821229, + "grad_norm": 0.8509172201156616, + "learning_rate": 0.000947843137254902, + "loss": 0.4184, + "step": 1968 + }, + { + "epoch": 1.1, + "grad_norm": 0.7637555599212646, + "learning_rate": 0.0009478151260504202, + "loss": 0.5898, + "step": 1969 + }, + { + "epoch": 1.100558659217877, + "grad_norm": 0.6652496457099915, + "learning_rate": 0.0009477871148459384, + "loss": 0.5575, + "step": 1970 + }, + { + "epoch": 1.1011173184357541, + "grad_norm": 1.1267739534378052, + "learning_rate": 0.0009477591036414566, + "loss": 0.4535, + "step": 1971 + }, + { + "epoch": 1.1016759776536313, + "grad_norm": 0.7473882436752319, + "learning_rate": 0.0009477310924369749, + "loss": 0.4894, + "step": 1972 + }, + { + "epoch": 1.1022346368715084, + "grad_norm": 0.4576612710952759, + "learning_rate": 0.000947703081232493, + "loss": 0.405, + "step": 1973 + }, + { + "epoch": 1.1027932960893856, + "grad_norm": 0.5016672611236572, + "learning_rate": 0.0009476750700280112, + "loss": 0.4387, + "step": 1974 + }, + { + "epoch": 1.1033519553072626, + "grad_norm": 1.9634641408920288, + "learning_rate": 0.0009476470588235294, + "loss": 0.4615, + "step": 1975 + }, + { + "epoch": 1.1039106145251396, + "grad_norm": 0.561021625995636, + "learning_rate": 0.0009476190476190476, + "loss": 0.4402, + "step": 1976 + }, + { + "epoch": 1.1044692737430168, + "grad_norm": 0.4194818437099457, + "learning_rate": 0.0009475910364145659, + "loss": 0.4603, + "step": 1977 + }, + { + "epoch": 1.1050279329608939, + "grad_norm": 6.717833042144775, + "learning_rate": 0.0009475630252100841, + "loss": 0.5868, + "step": 1978 + }, + { + "epoch": 1.1055865921787709, + "grad_norm": 1.257798671722412, + "learning_rate": 0.0009475350140056022, + "loss": 0.5072, + "step": 1979 + }, + { + "epoch": 1.106145251396648, + "grad_norm": 0.505747377872467, + "learning_rate": 0.0009475070028011204, + "loss": 0.3985, + "step": 1980 + }, + { + "epoch": 1.106703910614525, + "grad_norm": 0.8068348169326782, + "learning_rate": 0.0009474789915966386, + "loss": 0.5688, + "step": 1981 + }, + { + "epoch": 1.1072625698324021, + "grad_norm": 0.7358660101890564, + "learning_rate": 0.000947450980392157, + "loss": 0.5501, + "step": 1982 + }, + { + "epoch": 1.1078212290502794, + "grad_norm": 0.4912152886390686, + "learning_rate": 0.0009474229691876752, + "loss": 0.4926, + "step": 1983 + }, + { + "epoch": 1.1083798882681564, + "grad_norm": 0.9780363440513611, + "learning_rate": 0.0009473949579831933, + "loss": 0.4942, + "step": 1984 + }, + { + "epoch": 1.1089385474860336, + "grad_norm": 9.432075500488281, + "learning_rate": 0.0009473669467787115, + "loss": 0.4413, + "step": 1985 + }, + { + "epoch": 1.1094972067039106, + "grad_norm": 1.2021106481552124, + "learning_rate": 0.0009473389355742297, + "loss": 0.4839, + "step": 1986 + }, + { + "epoch": 1.1100558659217876, + "grad_norm": 2.9500606060028076, + "learning_rate": 0.000947310924369748, + "loss": 0.5105, + "step": 1987 + }, + { + "epoch": 1.1106145251396649, + "grad_norm": 0.6623552441596985, + "learning_rate": 0.0009472829131652662, + "loss": 0.5214, + "step": 1988 + }, + { + "epoch": 1.1111731843575419, + "grad_norm": 0.6067897081375122, + "learning_rate": 0.0009472549019607843, + "loss": 0.4155, + "step": 1989 + }, + { + "epoch": 1.111731843575419, + "grad_norm": 0.576686441898346, + "learning_rate": 0.0009472268907563025, + "loss": 0.474, + "step": 1990 + }, + { + "epoch": 1.112290502793296, + "grad_norm": 0.4937525987625122, + "learning_rate": 0.0009471988795518207, + "loss": 0.5354, + "step": 1991 + }, + { + "epoch": 1.1128491620111731, + "grad_norm": 0.7148085832595825, + "learning_rate": 0.000947170868347339, + "loss": 0.4498, + "step": 1992 + }, + { + "epoch": 1.1134078212290504, + "grad_norm": 0.6365212202072144, + "learning_rate": 0.0009471428571428572, + "loss": 0.4725, + "step": 1993 + }, + { + "epoch": 1.1139664804469274, + "grad_norm": 12.856399536132812, + "learning_rate": 0.0009471148459383754, + "loss": 0.4554, + "step": 1994 + }, + { + "epoch": 1.1145251396648044, + "grad_norm": 1.4765926599502563, + "learning_rate": 0.0009470868347338935, + "loss": 0.4718, + "step": 1995 + }, + { + "epoch": 1.1150837988826816, + "grad_norm": 0.6106082797050476, + "learning_rate": 0.0009470588235294117, + "loss": 0.4687, + "step": 1996 + }, + { + "epoch": 1.1156424581005586, + "grad_norm": 0.7510437965393066, + "learning_rate": 0.00094703081232493, + "loss": 0.5655, + "step": 1997 + }, + { + "epoch": 1.1162011173184359, + "grad_norm": 2.784778594970703, + "learning_rate": 0.0009470028011204483, + "loss": 0.5206, + "step": 1998 + }, + { + "epoch": 1.1167597765363129, + "grad_norm": 0.5566121339797974, + "learning_rate": 0.0009469747899159665, + "loss": 0.4103, + "step": 1999 + }, + { + "epoch": 1.1173184357541899, + "grad_norm": 0.49366363883018494, + "learning_rate": 0.0009469467787114846, + "loss": 0.5134, + "step": 2000 + }, + { + "epoch": 1.1173184357541899, + "eval_cer": 0.10084230739855761, + "eval_loss": 0.376040518283844, + "eval_runtime": 55.697, + "eval_samples_per_second": 81.477, + "eval_steps_per_second": 5.099, + "eval_wer": 0.4046535012988464, + "step": 2000 + }, + { + "epoch": 1.117877094972067, + "grad_norm": 0.6351279020309448, + "learning_rate": 0.0009469187675070028, + "loss": 0.6502, + "step": 2001 + }, + { + "epoch": 1.1184357541899441, + "grad_norm": 0.8336634635925293, + "learning_rate": 0.0009468907563025211, + "loss": 0.5242, + "step": 2002 + }, + { + "epoch": 1.1189944134078211, + "grad_norm": 0.7816217541694641, + "learning_rate": 0.0009468627450980393, + "loss": 0.5954, + "step": 2003 + }, + { + "epoch": 1.1195530726256984, + "grad_norm": 0.7004947066307068, + "learning_rate": 0.0009468347338935575, + "loss": 0.474, + "step": 2004 + }, + { + "epoch": 1.1201117318435754, + "grad_norm": 0.611807644367218, + "learning_rate": 0.0009468067226890756, + "loss": 0.4017, + "step": 2005 + }, + { + "epoch": 1.1206703910614526, + "grad_norm": 2.6075925827026367, + "learning_rate": 0.0009467787114845938, + "loss": 0.5336, + "step": 2006 + }, + { + "epoch": 1.1212290502793296, + "grad_norm": 0.9471938610076904, + "learning_rate": 0.000946750700280112, + "loss": 0.422, + "step": 2007 + }, + { + "epoch": 1.1217877094972066, + "grad_norm": 0.5446215271949768, + "learning_rate": 0.0009467226890756303, + "loss": 0.4789, + "step": 2008 + }, + { + "epoch": 1.1223463687150839, + "grad_norm": 4.740174293518066, + "learning_rate": 0.0009466946778711485, + "loss": 0.5513, + "step": 2009 + }, + { + "epoch": 1.1229050279329609, + "grad_norm": 0.3919450342655182, + "learning_rate": 0.0009466666666666667, + "loss": 0.4792, + "step": 2010 + }, + { + "epoch": 1.1234636871508379, + "grad_norm": 0.6619439125061035, + "learning_rate": 0.0009466386554621848, + "loss": 0.4726, + "step": 2011 + }, + { + "epoch": 1.1240223463687151, + "grad_norm": 1.0391995906829834, + "learning_rate": 0.000946610644257703, + "loss": 0.6268, + "step": 2012 + }, + { + "epoch": 1.1245810055865921, + "grad_norm": 0.7475591897964478, + "learning_rate": 0.0009465826330532213, + "loss": 0.5402, + "step": 2013 + }, + { + "epoch": 1.1251396648044694, + "grad_norm": 0.45967820286750793, + "learning_rate": 0.0009465546218487396, + "loss": 0.548, + "step": 2014 + }, + { + "epoch": 1.1256983240223464, + "grad_norm": 0.503616452217102, + "learning_rate": 0.0009465266106442578, + "loss": 0.5305, + "step": 2015 + }, + { + "epoch": 1.1262569832402234, + "grad_norm": 0.5683256387710571, + "learning_rate": 0.0009464985994397759, + "loss": 0.3593, + "step": 2016 + }, + { + "epoch": 1.1268156424581006, + "grad_norm": 1.0615944862365723, + "learning_rate": 0.0009464705882352941, + "loss": 0.5239, + "step": 2017 + }, + { + "epoch": 1.1273743016759776, + "grad_norm": 2.092423439025879, + "learning_rate": 0.0009464425770308124, + "loss": 0.4618, + "step": 2018 + }, + { + "epoch": 1.1279329608938546, + "grad_norm": 0.960770845413208, + "learning_rate": 0.0009464145658263306, + "loss": 0.4726, + "step": 2019 + }, + { + "epoch": 1.1284916201117319, + "grad_norm": 0.6446082592010498, + "learning_rate": 0.0009463865546218488, + "loss": 0.488, + "step": 2020 + }, + { + "epoch": 1.1290502793296089, + "grad_norm": 1.553737998008728, + "learning_rate": 0.0009463585434173669, + "loss": 0.4262, + "step": 2021 + }, + { + "epoch": 1.1296089385474861, + "grad_norm": 1.2342222929000854, + "learning_rate": 0.0009463305322128851, + "loss": 0.4967, + "step": 2022 + }, + { + "epoch": 1.1301675977653631, + "grad_norm": 0.9561513066291809, + "learning_rate": 0.0009463025210084034, + "loss": 0.5597, + "step": 2023 + }, + { + "epoch": 1.1307262569832401, + "grad_norm": 1.0787378549575806, + "learning_rate": 0.0009462745098039216, + "loss": 0.5564, + "step": 2024 + }, + { + "epoch": 1.1312849162011174, + "grad_norm": 0.6298704743385315, + "learning_rate": 0.0009462464985994398, + "loss": 0.5185, + "step": 2025 + }, + { + "epoch": 1.1318435754189944, + "grad_norm": 1.0412228107452393, + "learning_rate": 0.000946218487394958, + "loss": 0.4284, + "step": 2026 + }, + { + "epoch": 1.1324022346368716, + "grad_norm": 0.9268519878387451, + "learning_rate": 0.0009461904761904761, + "loss": 0.435, + "step": 2027 + }, + { + "epoch": 1.1329608938547486, + "grad_norm": 1.1674823760986328, + "learning_rate": 0.0009461624649859944, + "loss": 0.521, + "step": 2028 + }, + { + "epoch": 1.1335195530726256, + "grad_norm": 0.8815200328826904, + "learning_rate": 0.0009461344537815126, + "loss": 0.5258, + "step": 2029 + }, + { + "epoch": 1.1340782122905029, + "grad_norm": 0.7177609205245972, + "learning_rate": 0.0009461064425770308, + "loss": 0.5613, + "step": 2030 + }, + { + "epoch": 1.1346368715083799, + "grad_norm": 0.7643547654151917, + "learning_rate": 0.000946078431372549, + "loss": 0.5462, + "step": 2031 + }, + { + "epoch": 1.135195530726257, + "grad_norm": 1.8996915817260742, + "learning_rate": 0.0009460504201680672, + "loss": 0.5297, + "step": 2032 + }, + { + "epoch": 1.1357541899441341, + "grad_norm": 0.5828958749771118, + "learning_rate": 0.0009460224089635855, + "loss": 0.3906, + "step": 2033 + }, + { + "epoch": 1.1363128491620111, + "grad_norm": 0.6114552617073059, + "learning_rate": 0.0009459943977591037, + "loss": 0.4089, + "step": 2034 + }, + { + "epoch": 1.1368715083798882, + "grad_norm": 0.4777858555316925, + "learning_rate": 0.0009459663865546219, + "loss": 0.48, + "step": 2035 + }, + { + "epoch": 1.1374301675977654, + "grad_norm": 0.37437257170677185, + "learning_rate": 0.0009459383753501401, + "loss": 0.3888, + "step": 2036 + }, + { + "epoch": 1.1379888268156424, + "grad_norm": 0.672275722026825, + "learning_rate": 0.0009459103641456582, + "loss": 0.4543, + "step": 2037 + }, + { + "epoch": 1.1385474860335196, + "grad_norm": 0.8527001142501831, + "learning_rate": 0.0009458823529411765, + "loss": 0.4963, + "step": 2038 + }, + { + "epoch": 1.1391061452513966, + "grad_norm": 0.7076776027679443, + "learning_rate": 0.0009458543417366947, + "loss": 0.4566, + "step": 2039 + }, + { + "epoch": 1.1396648044692737, + "grad_norm": 0.9013495445251465, + "learning_rate": 0.0009458263305322129, + "loss": 0.446, + "step": 2040 + }, + { + "epoch": 1.1402234636871509, + "grad_norm": 2.5649375915527344, + "learning_rate": 0.0009457983193277311, + "loss": 0.5934, + "step": 2041 + }, + { + "epoch": 1.140782122905028, + "grad_norm": 0.8204906582832336, + "learning_rate": 0.0009457703081232493, + "loss": 0.5531, + "step": 2042 + }, + { + "epoch": 1.1413407821229051, + "grad_norm": 0.7193981409072876, + "learning_rate": 0.0009457422969187675, + "loss": 0.5328, + "step": 2043 + }, + { + "epoch": 1.1418994413407821, + "grad_norm": 0.6250834465026855, + "learning_rate": 0.0009457142857142857, + "loss": 0.4741, + "step": 2044 + }, + { + "epoch": 1.1424581005586592, + "grad_norm": 0.7503669261932373, + "learning_rate": 0.0009456862745098039, + "loss": 0.7432, + "step": 2045 + }, + { + "epoch": 1.1430167597765364, + "grad_norm": 0.5963375568389893, + "learning_rate": 0.0009456582633053221, + "loss": 0.4875, + "step": 2046 + }, + { + "epoch": 1.1435754189944134, + "grad_norm": 0.685600996017456, + "learning_rate": 0.0009456302521008404, + "loss": 0.3985, + "step": 2047 + }, + { + "epoch": 1.1441340782122904, + "grad_norm": 0.5940116047859192, + "learning_rate": 0.0009456022408963586, + "loss": 0.5145, + "step": 2048 + }, + { + "epoch": 1.1446927374301676, + "grad_norm": 0.593612015247345, + "learning_rate": 0.0009455742296918768, + "loss": 0.502, + "step": 2049 + }, + { + "epoch": 1.1452513966480447, + "grad_norm": 0.7128913998603821, + "learning_rate": 0.000945546218487395, + "loss": 0.4228, + "step": 2050 + }, + { + "epoch": 1.1458100558659219, + "grad_norm": 0.7061253786087036, + "learning_rate": 0.0009455182072829132, + "loss": 0.5887, + "step": 2051 + }, + { + "epoch": 1.146368715083799, + "grad_norm": 0.5116814374923706, + "learning_rate": 0.0009454901960784314, + "loss": 0.4821, + "step": 2052 + }, + { + "epoch": 1.146927374301676, + "grad_norm": 1.508675456047058, + "learning_rate": 0.0009454621848739496, + "loss": 0.6087, + "step": 2053 + }, + { + "epoch": 1.1474860335195531, + "grad_norm": 1.2514795064926147, + "learning_rate": 0.0009454341736694678, + "loss": 0.4883, + "step": 2054 + }, + { + "epoch": 1.1480446927374302, + "grad_norm": 0.6793577075004578, + "learning_rate": 0.000945406162464986, + "loss": 0.6769, + "step": 2055 + }, + { + "epoch": 1.1486033519553072, + "grad_norm": 0.5529429912567139, + "learning_rate": 0.0009453781512605042, + "loss": 0.491, + "step": 2056 + }, + { + "epoch": 1.1491620111731844, + "grad_norm": 0.8128876090049744, + "learning_rate": 0.0009453501400560224, + "loss": 0.4867, + "step": 2057 + }, + { + "epoch": 1.1497206703910614, + "grad_norm": 0.504065752029419, + "learning_rate": 0.0009453221288515407, + "loss": 0.4177, + "step": 2058 + }, + { + "epoch": 1.1502793296089386, + "grad_norm": 0.9824762940406799, + "learning_rate": 0.0009452941176470588, + "loss": 0.6173, + "step": 2059 + }, + { + "epoch": 1.1508379888268156, + "grad_norm": 0.5630059242248535, + "learning_rate": 0.000945266106442577, + "loss": 0.4012, + "step": 2060 + }, + { + "epoch": 1.1513966480446927, + "grad_norm": 0.6323129534721375, + "learning_rate": 0.0009452380952380952, + "loss": 0.5885, + "step": 2061 + }, + { + "epoch": 1.15195530726257, + "grad_norm": 1.6381089687347412, + "learning_rate": 0.0009452100840336134, + "loss": 0.5873, + "step": 2062 + }, + { + "epoch": 1.152513966480447, + "grad_norm": 0.6622616648674011, + "learning_rate": 0.0009451820728291318, + "loss": 0.5932, + "step": 2063 + }, + { + "epoch": 1.1530726256983241, + "grad_norm": 0.763674795627594, + "learning_rate": 0.0009451540616246499, + "loss": 0.6191, + "step": 2064 + }, + { + "epoch": 1.1536312849162011, + "grad_norm": 0.5640531182289124, + "learning_rate": 0.0009451260504201681, + "loss": 0.5414, + "step": 2065 + }, + { + "epoch": 1.1541899441340782, + "grad_norm": 1.0875272750854492, + "learning_rate": 0.0009450980392156863, + "loss": 0.4564, + "step": 2066 + }, + { + "epoch": 1.1547486033519554, + "grad_norm": 0.4861888289451599, + "learning_rate": 0.0009450700280112045, + "loss": 0.4297, + "step": 2067 + }, + { + "epoch": 1.1553072625698324, + "grad_norm": 1.0888410806655884, + "learning_rate": 0.0009450420168067228, + "loss": 0.4436, + "step": 2068 + }, + { + "epoch": 1.1558659217877094, + "grad_norm": 1.016395926475525, + "learning_rate": 0.0009450140056022409, + "loss": 0.4985, + "step": 2069 + }, + { + "epoch": 1.1564245810055866, + "grad_norm": 0.5680158138275146, + "learning_rate": 0.0009449859943977591, + "loss": 0.3889, + "step": 2070 + }, + { + "epoch": 1.1569832402234637, + "grad_norm": 0.6512601971626282, + "learning_rate": 0.0009449579831932773, + "loss": 0.497, + "step": 2071 + }, + { + "epoch": 1.1575418994413407, + "grad_norm": 0.7783377170562744, + "learning_rate": 0.0009449299719887955, + "loss": 0.6509, + "step": 2072 + }, + { + "epoch": 1.158100558659218, + "grad_norm": 0.7246847152709961, + "learning_rate": 0.0009449019607843138, + "loss": 0.6507, + "step": 2073 + }, + { + "epoch": 1.158659217877095, + "grad_norm": 1.2934362888336182, + "learning_rate": 0.000944873949579832, + "loss": 0.4753, + "step": 2074 + }, + { + "epoch": 1.1592178770949721, + "grad_norm": 0.5288491249084473, + "learning_rate": 0.0009448459383753501, + "loss": 0.4657, + "step": 2075 + }, + { + "epoch": 1.1597765363128492, + "grad_norm": 0.5828776955604553, + "learning_rate": 0.0009448179271708683, + "loss": 0.4424, + "step": 2076 + }, + { + "epoch": 1.1603351955307262, + "grad_norm": 0.7760799527168274, + "learning_rate": 0.0009447899159663865, + "loss": 0.5057, + "step": 2077 + }, + { + "epoch": 1.1608938547486034, + "grad_norm": 1.175264835357666, + "learning_rate": 0.0009447619047619048, + "loss": 0.5168, + "step": 2078 + }, + { + "epoch": 1.1614525139664804, + "grad_norm": 0.5211125016212463, + "learning_rate": 0.000944733893557423, + "loss": 0.4559, + "step": 2079 + }, + { + "epoch": 1.1620111731843576, + "grad_norm": 0.4531496465206146, + "learning_rate": 0.0009447058823529411, + "loss": 0.4065, + "step": 2080 + }, + { + "epoch": 1.1625698324022347, + "grad_norm": 0.5101728439331055, + "learning_rate": 0.0009446778711484594, + "loss": 0.527, + "step": 2081 + }, + { + "epoch": 1.1631284916201117, + "grad_norm": 0.6482619047164917, + "learning_rate": 0.0009446498599439776, + "loss": 0.5388, + "step": 2082 + }, + { + "epoch": 1.163687150837989, + "grad_norm": 0.490764319896698, + "learning_rate": 0.0009446218487394959, + "loss": 0.4549, + "step": 2083 + }, + { + "epoch": 1.164245810055866, + "grad_norm": 0.5618298649787903, + "learning_rate": 0.0009445938375350141, + "loss": 0.5465, + "step": 2084 + }, + { + "epoch": 1.164804469273743, + "grad_norm": 0.8647662401199341, + "learning_rate": 0.0009445658263305322, + "loss": 0.4772, + "step": 2085 + }, + { + "epoch": 1.1653631284916202, + "grad_norm": 0.6574456095695496, + "learning_rate": 0.0009445378151260504, + "loss": 0.5492, + "step": 2086 + }, + { + "epoch": 1.1659217877094972, + "grad_norm": 1.570063591003418, + "learning_rate": 0.0009445098039215686, + "loss": 0.4776, + "step": 2087 + }, + { + "epoch": 1.1664804469273742, + "grad_norm": 0.6808940172195435, + "learning_rate": 0.0009444817927170869, + "loss": 0.5196, + "step": 2088 + }, + { + "epoch": 1.1670391061452514, + "grad_norm": 0.4464326500892639, + "learning_rate": 0.0009444537815126051, + "loss": 0.5087, + "step": 2089 + }, + { + "epoch": 1.1675977653631284, + "grad_norm": 0.4341641068458557, + "learning_rate": 0.0009444257703081233, + "loss": 0.4685, + "step": 2090 + }, + { + "epoch": 1.1681564245810057, + "grad_norm": 1.411848783493042, + "learning_rate": 0.0009443977591036414, + "loss": 0.5911, + "step": 2091 + }, + { + "epoch": 1.1687150837988827, + "grad_norm": 0.5872650146484375, + "learning_rate": 0.0009443697478991596, + "loss": 0.511, + "step": 2092 + }, + { + "epoch": 1.1692737430167597, + "grad_norm": 1.6268452405929565, + "learning_rate": 0.0009443417366946779, + "loss": 0.3877, + "step": 2093 + }, + { + "epoch": 1.169832402234637, + "grad_norm": 0.6332060694694519, + "learning_rate": 0.0009443137254901961, + "loss": 0.5258, + "step": 2094 + }, + { + "epoch": 1.170391061452514, + "grad_norm": 0.45429572463035583, + "learning_rate": 0.0009442857142857143, + "loss": 0.3962, + "step": 2095 + }, + { + "epoch": 1.1709497206703912, + "grad_norm": 2.5373170375823975, + "learning_rate": 0.0009442577030812324, + "loss": 0.4432, + "step": 2096 + }, + { + "epoch": 1.1715083798882682, + "grad_norm": 0.6933470368385315, + "learning_rate": 0.0009442296918767506, + "loss": 0.4427, + "step": 2097 + }, + { + "epoch": 1.1720670391061452, + "grad_norm": 0.6767030358314514, + "learning_rate": 0.000944201680672269, + "loss": 0.4712, + "step": 2098 + }, + { + "epoch": 1.1726256983240224, + "grad_norm": 4.434169769287109, + "learning_rate": 0.0009441736694677872, + "loss": 0.5754, + "step": 2099 + }, + { + "epoch": 1.1731843575418994, + "grad_norm": 34.56585693359375, + "learning_rate": 0.0009441456582633054, + "loss": 0.5498, + "step": 2100 + }, + { + "epoch": 1.1737430167597767, + "grad_norm": 0.7130588889122009, + "learning_rate": 0.0009441176470588235, + "loss": 0.5418, + "step": 2101 + }, + { + "epoch": 1.1743016759776537, + "grad_norm": 1.0188194513320923, + "learning_rate": 0.0009440896358543417, + "loss": 0.4108, + "step": 2102 + }, + { + "epoch": 1.1748603351955307, + "grad_norm": 1.7920080423355103, + "learning_rate": 0.00094406162464986, + "loss": 0.5079, + "step": 2103 + }, + { + "epoch": 1.175418994413408, + "grad_norm": 0.7652671933174133, + "learning_rate": 0.0009440336134453782, + "loss": 0.4698, + "step": 2104 + }, + { + "epoch": 1.175977653631285, + "grad_norm": 2.849742889404297, + "learning_rate": 0.0009440056022408964, + "loss": 0.444, + "step": 2105 + }, + { + "epoch": 1.176536312849162, + "grad_norm": 0.685455322265625, + "learning_rate": 0.0009439775910364146, + "loss": 0.468, + "step": 2106 + }, + { + "epoch": 1.1770949720670392, + "grad_norm": 0.7482166290283203, + "learning_rate": 0.0009439495798319327, + "loss": 0.4793, + "step": 2107 + }, + { + "epoch": 1.1776536312849162, + "grad_norm": 0.8171047568321228, + "learning_rate": 0.000943921568627451, + "loss": 0.5212, + "step": 2108 + }, + { + "epoch": 1.1782122905027932, + "grad_norm": 0.5344380140304565, + "learning_rate": 0.0009438935574229692, + "loss": 0.4642, + "step": 2109 + }, + { + "epoch": 1.1787709497206704, + "grad_norm": 0.45532694458961487, + "learning_rate": 0.0009438655462184874, + "loss": 0.4947, + "step": 2110 + }, + { + "epoch": 1.1793296089385474, + "grad_norm": 0.890842616558075, + "learning_rate": 0.0009438375350140056, + "loss": 0.5628, + "step": 2111 + }, + { + "epoch": 1.1798882681564247, + "grad_norm": 0.8063690066337585, + "learning_rate": 0.0009438095238095237, + "loss": 0.495, + "step": 2112 + }, + { + "epoch": 1.1804469273743017, + "grad_norm": 0.8815364837646484, + "learning_rate": 0.000943781512605042, + "loss": 0.5065, + "step": 2113 + }, + { + "epoch": 1.1810055865921787, + "grad_norm": 0.71140456199646, + "learning_rate": 0.0009437535014005603, + "loss": 0.4184, + "step": 2114 + }, + { + "epoch": 1.181564245810056, + "grad_norm": 0.833466112613678, + "learning_rate": 0.0009437254901960785, + "loss": 0.439, + "step": 2115 + }, + { + "epoch": 1.182122905027933, + "grad_norm": 4.519916534423828, + "learning_rate": 0.0009436974789915967, + "loss": 0.4693, + "step": 2116 + }, + { + "epoch": 1.1826815642458102, + "grad_norm": 1.029601812362671, + "learning_rate": 0.0009436694677871148, + "loss": 0.5891, + "step": 2117 + }, + { + "epoch": 1.1832402234636872, + "grad_norm": 0.4797174632549286, + "learning_rate": 0.0009436414565826331, + "loss": 0.4465, + "step": 2118 + }, + { + "epoch": 1.1837988826815642, + "grad_norm": 1.2280062437057495, + "learning_rate": 0.0009436134453781513, + "loss": 0.5429, + "step": 2119 + }, + { + "epoch": 1.1843575418994414, + "grad_norm": 1.3682737350463867, + "learning_rate": 0.0009435854341736695, + "loss": 0.4934, + "step": 2120 + }, + { + "epoch": 1.1849162011173184, + "grad_norm": 4.337080955505371, + "learning_rate": 0.0009435574229691877, + "loss": 0.5855, + "step": 2121 + }, + { + "epoch": 1.1854748603351954, + "grad_norm": 1.0282882452011108, + "learning_rate": 0.0009435294117647059, + "loss": 0.4405, + "step": 2122 + }, + { + "epoch": 1.1860335195530727, + "grad_norm": 1.3295059204101562, + "learning_rate": 0.0009435014005602241, + "loss": 0.4725, + "step": 2123 + }, + { + "epoch": 1.1865921787709497, + "grad_norm": 0.720296323299408, + "learning_rate": 0.0009434733893557423, + "loss": 0.515, + "step": 2124 + }, + { + "epoch": 1.1871508379888267, + "grad_norm": 0.8459692597389221, + "learning_rate": 0.0009434453781512605, + "loss": 0.7379, + "step": 2125 + }, + { + "epoch": 1.187709497206704, + "grad_norm": 0.5645942091941833, + "learning_rate": 0.0009434173669467787, + "loss": 0.4765, + "step": 2126 + }, + { + "epoch": 1.188268156424581, + "grad_norm": 0.9478999376296997, + "learning_rate": 0.0009433893557422969, + "loss": 0.4183, + "step": 2127 + }, + { + "epoch": 1.1888268156424582, + "grad_norm": 0.7287110686302185, + "learning_rate": 0.0009433613445378151, + "loss": 0.3978, + "step": 2128 + }, + { + "epoch": 1.1893854748603352, + "grad_norm": 1.2676537036895752, + "learning_rate": 0.0009433333333333334, + "loss": 0.4012, + "step": 2129 + }, + { + "epoch": 1.1899441340782122, + "grad_norm": 0.8754597306251526, + "learning_rate": 0.0009433053221288516, + "loss": 0.5435, + "step": 2130 + }, + { + "epoch": 1.1905027932960894, + "grad_norm": 0.6942610740661621, + "learning_rate": 0.0009432773109243698, + "loss": 0.5449, + "step": 2131 + }, + { + "epoch": 1.1910614525139664, + "grad_norm": 1.1593784093856812, + "learning_rate": 0.000943249299719888, + "loss": 0.4472, + "step": 2132 + }, + { + "epoch": 1.1916201117318437, + "grad_norm": 0.8377334475517273, + "learning_rate": 0.0009432212885154063, + "loss": 0.5899, + "step": 2133 + }, + { + "epoch": 1.1921787709497207, + "grad_norm": 0.6997050046920776, + "learning_rate": 0.0009431932773109244, + "loss": 0.5047, + "step": 2134 + }, + { + "epoch": 1.1927374301675977, + "grad_norm": 1.3585000038146973, + "learning_rate": 0.0009431652661064426, + "loss": 0.4843, + "step": 2135 + }, + { + "epoch": 1.193296089385475, + "grad_norm": 0.7649428844451904, + "learning_rate": 0.0009431372549019608, + "loss": 0.5575, + "step": 2136 + }, + { + "epoch": 1.193854748603352, + "grad_norm": 10.159912109375, + "learning_rate": 0.000943109243697479, + "loss": 0.6898, + "step": 2137 + }, + { + "epoch": 1.194413407821229, + "grad_norm": 1.1560299396514893, + "learning_rate": 0.0009430812324929973, + "loss": 0.4705, + "step": 2138 + }, + { + "epoch": 1.1949720670391062, + "grad_norm": 1.281246304512024, + "learning_rate": 0.0009430532212885154, + "loss": 0.4527, + "step": 2139 + }, + { + "epoch": 1.1955307262569832, + "grad_norm": 0.7689476609230042, + "learning_rate": 0.0009430252100840336, + "loss": 0.3603, + "step": 2140 + }, + { + "epoch": 1.1960893854748602, + "grad_norm": 0.6380707621574402, + "learning_rate": 0.0009429971988795518, + "loss": 0.6703, + "step": 2141 + }, + { + "epoch": 1.1966480446927374, + "grad_norm": 1.4263190031051636, + "learning_rate": 0.00094296918767507, + "loss": 0.4352, + "step": 2142 + }, + { + "epoch": 1.1972067039106145, + "grad_norm": 1.352187156677246, + "learning_rate": 0.0009429411764705883, + "loss": 0.4378, + "step": 2143 + }, + { + "epoch": 1.1977653631284917, + "grad_norm": 0.5018616318702698, + "learning_rate": 0.0009429131652661064, + "loss": 0.5471, + "step": 2144 + }, + { + "epoch": 1.1983240223463687, + "grad_norm": 0.6830251812934875, + "learning_rate": 0.0009428851540616246, + "loss": 0.4867, + "step": 2145 + }, + { + "epoch": 1.1988826815642457, + "grad_norm": 0.826819896697998, + "learning_rate": 0.0009428571428571429, + "loss": 0.4842, + "step": 2146 + }, + { + "epoch": 1.199441340782123, + "grad_norm": 0.6691630482673645, + "learning_rate": 0.0009428291316526611, + "loss": 0.4429, + "step": 2147 + }, + { + "epoch": 1.2, + "grad_norm": 0.5742301940917969, + "learning_rate": 0.0009428011204481794, + "loss": 0.5749, + "step": 2148 + }, + { + "epoch": 1.2005586592178772, + "grad_norm": 0.5265778303146362, + "learning_rate": 0.0009427731092436976, + "loss": 0.4716, + "step": 2149 + }, + { + "epoch": 1.2011173184357542, + "grad_norm": 1.4679458141326904, + "learning_rate": 0.0009427450980392157, + "loss": 0.4507, + "step": 2150 + }, + { + "epoch": 1.2016759776536312, + "grad_norm": 0.9520155191421509, + "learning_rate": 0.0009427170868347339, + "loss": 0.5619, + "step": 2151 + }, + { + "epoch": 1.2022346368715084, + "grad_norm": 2.3973822593688965, + "learning_rate": 0.0009426890756302521, + "loss": 0.6133, + "step": 2152 + }, + { + "epoch": 1.2027932960893855, + "grad_norm": 0.5013342499732971, + "learning_rate": 0.0009426610644257704, + "loss": 0.4829, + "step": 2153 + }, + { + "epoch": 1.2033519553072627, + "grad_norm": 0.9384273886680603, + "learning_rate": 0.0009426330532212886, + "loss": 0.4524, + "step": 2154 + }, + { + "epoch": 1.2039106145251397, + "grad_norm": 0.46103498339653015, + "learning_rate": 0.0009426050420168067, + "loss": 0.4417, + "step": 2155 + }, + { + "epoch": 1.2044692737430167, + "grad_norm": 2.0672597885131836, + "learning_rate": 0.0009425770308123249, + "loss": 0.5018, + "step": 2156 + }, + { + "epoch": 1.205027932960894, + "grad_norm": 0.7411819696426392, + "learning_rate": 0.0009425490196078431, + "loss": 0.4332, + "step": 2157 + }, + { + "epoch": 1.205586592178771, + "grad_norm": 0.46277377009391785, + "learning_rate": 0.0009425210084033614, + "loss": 0.5167, + "step": 2158 + }, + { + "epoch": 1.206145251396648, + "grad_norm": 0.554575502872467, + "learning_rate": 0.0009424929971988796, + "loss": 0.5481, + "step": 2159 + }, + { + "epoch": 1.2067039106145252, + "grad_norm": 0.7418820858001709, + "learning_rate": 0.0009424649859943977, + "loss": 0.6344, + "step": 2160 + }, + { + "epoch": 1.2072625698324022, + "grad_norm": 0.8901815414428711, + "learning_rate": 0.0009424369747899159, + "loss": 0.6099, + "step": 2161 + }, + { + "epoch": 1.2078212290502792, + "grad_norm": 0.4801618456840515, + "learning_rate": 0.0009424089635854341, + "loss": 0.5136, + "step": 2162 + }, + { + "epoch": 1.2083798882681565, + "grad_norm": 28.97088050842285, + "learning_rate": 0.0009423809523809525, + "loss": 0.3598, + "step": 2163 + }, + { + "epoch": 1.2089385474860335, + "grad_norm": 0.5268792510032654, + "learning_rate": 0.0009423529411764707, + "loss": 0.5499, + "step": 2164 + }, + { + "epoch": 1.2094972067039107, + "grad_norm": 0.5892419815063477, + "learning_rate": 0.0009423249299719889, + "loss": 0.563, + "step": 2165 + }, + { + "epoch": 1.2100558659217877, + "grad_norm": 1.8505092859268188, + "learning_rate": 0.000942296918767507, + "loss": 0.692, + "step": 2166 + }, + { + "epoch": 1.2106145251396647, + "grad_norm": 0.7788013219833374, + "learning_rate": 0.0009422689075630252, + "loss": 0.511, + "step": 2167 + }, + { + "epoch": 1.211173184357542, + "grad_norm": 0.7060709595680237, + "learning_rate": 0.0009422408963585435, + "loss": 0.6062, + "step": 2168 + }, + { + "epoch": 1.211731843575419, + "grad_norm": 0.5576009750366211, + "learning_rate": 0.0009422128851540617, + "loss": 0.5004, + "step": 2169 + }, + { + "epoch": 1.2122905027932962, + "grad_norm": 0.41443267464637756, + "learning_rate": 0.0009421848739495799, + "loss": 0.4762, + "step": 2170 + }, + { + "epoch": 1.2128491620111732, + "grad_norm": 0.662115752696991, + "learning_rate": 0.000942156862745098, + "loss": 0.4955, + "step": 2171 + }, + { + "epoch": 1.2134078212290502, + "grad_norm": 0.6038459539413452, + "learning_rate": 0.0009421288515406162, + "loss": 0.4901, + "step": 2172 + }, + { + "epoch": 1.2139664804469275, + "grad_norm": 0.7786489129066467, + "learning_rate": 0.0009421008403361345, + "loss": 0.4629, + "step": 2173 + }, + { + "epoch": 1.2145251396648045, + "grad_norm": 0.5956007838249207, + "learning_rate": 0.0009420728291316527, + "loss": 0.5035, + "step": 2174 + }, + { + "epoch": 1.2150837988826815, + "grad_norm": 3.5090713500976562, + "learning_rate": 0.0009420448179271709, + "loss": 0.5553, + "step": 2175 + }, + { + "epoch": 1.2156424581005587, + "grad_norm": 0.5134099125862122, + "learning_rate": 0.000942016806722689, + "loss": 0.5324, + "step": 2176 + }, + { + "epoch": 1.2162011173184357, + "grad_norm": 0.6300668716430664, + "learning_rate": 0.0009419887955182072, + "loss": 0.7685, + "step": 2177 + }, + { + "epoch": 1.2167597765363127, + "grad_norm": 1.9936174154281616, + "learning_rate": 0.0009419607843137256, + "loss": 0.4019, + "step": 2178 + }, + { + "epoch": 1.21731843575419, + "grad_norm": 0.48586663603782654, + "learning_rate": 0.0009419327731092438, + "loss": 0.6216, + "step": 2179 + }, + { + "epoch": 1.217877094972067, + "grad_norm": 0.5281707048416138, + "learning_rate": 0.000941904761904762, + "loss": 0.5676, + "step": 2180 + }, + { + "epoch": 1.2184357541899442, + "grad_norm": 1.4248437881469727, + "learning_rate": 0.0009418767507002802, + "loss": 0.4664, + "step": 2181 + }, + { + "epoch": 1.2189944134078212, + "grad_norm": 1.0297634601593018, + "learning_rate": 0.0009418487394957983, + "loss": 0.4404, + "step": 2182 + }, + { + "epoch": 1.2195530726256982, + "grad_norm": 1.143286108970642, + "learning_rate": 0.0009418207282913166, + "loss": 0.5365, + "step": 2183 + }, + { + "epoch": 1.2201117318435755, + "grad_norm": 0.5228847861289978, + "learning_rate": 0.0009417927170868348, + "loss": 0.5826, + "step": 2184 + }, + { + "epoch": 1.2206703910614525, + "grad_norm": 0.5148554444313049, + "learning_rate": 0.000941764705882353, + "loss": 0.546, + "step": 2185 + }, + { + "epoch": 1.2212290502793297, + "grad_norm": 0.4707455039024353, + "learning_rate": 0.0009417366946778712, + "loss": 0.5057, + "step": 2186 + }, + { + "epoch": 1.2217877094972067, + "grad_norm": 0.8517330288887024, + "learning_rate": 0.0009417086834733893, + "loss": 0.4911, + "step": 2187 + }, + { + "epoch": 1.2223463687150837, + "grad_norm": 0.7839780449867249, + "learning_rate": 0.0009416806722689076, + "loss": 0.4379, + "step": 2188 + }, + { + "epoch": 1.222905027932961, + "grad_norm": 0.3970915973186493, + "learning_rate": 0.0009416526610644258, + "loss": 0.3645, + "step": 2189 + }, + { + "epoch": 1.223463687150838, + "grad_norm": 1.4941742420196533, + "learning_rate": 0.000941624649859944, + "loss": 0.5191, + "step": 2190 + }, + { + "epoch": 1.2240223463687152, + "grad_norm": 0.6183914542198181, + "learning_rate": 0.0009415966386554622, + "loss": 0.4913, + "step": 2191 + }, + { + "epoch": 1.2245810055865922, + "grad_norm": 0.5494821667671204, + "learning_rate": 0.0009415686274509803, + "loss": 0.5324, + "step": 2192 + }, + { + "epoch": 1.2251396648044692, + "grad_norm": 0.8816516995429993, + "learning_rate": 0.0009415406162464986, + "loss": 0.5819, + "step": 2193 + }, + { + "epoch": 1.2256983240223465, + "grad_norm": 1.034061312675476, + "learning_rate": 0.0009415126050420168, + "loss": 0.5174, + "step": 2194 + }, + { + "epoch": 1.2262569832402235, + "grad_norm": 0.5098188519477844, + "learning_rate": 0.000941484593837535, + "loss": 0.596, + "step": 2195 + }, + { + "epoch": 1.2268156424581005, + "grad_norm": 1.2608144283294678, + "learning_rate": 0.0009414565826330533, + "loss": 0.4305, + "step": 2196 + }, + { + "epoch": 1.2273743016759777, + "grad_norm": 0.5888888239860535, + "learning_rate": 0.0009414285714285715, + "loss": 0.3995, + "step": 2197 + }, + { + "epoch": 1.2279329608938547, + "grad_norm": 0.8148500323295593, + "learning_rate": 0.0009414005602240897, + "loss": 0.5187, + "step": 2198 + }, + { + "epoch": 1.2284916201117317, + "grad_norm": 0.5967636704444885, + "learning_rate": 0.0009413725490196079, + "loss": 0.5153, + "step": 2199 + }, + { + "epoch": 1.229050279329609, + "grad_norm": 0.9023078083992004, + "learning_rate": 0.0009413445378151261, + "loss": 0.443, + "step": 2200 + }, + { + "epoch": 1.229608938547486, + "grad_norm": 0.6595953702926636, + "learning_rate": 0.0009413165266106443, + "loss": 0.6693, + "step": 2201 + }, + { + "epoch": 1.2301675977653632, + "grad_norm": 0.4447948634624481, + "learning_rate": 0.0009412885154061625, + "loss": 0.3766, + "step": 2202 + }, + { + "epoch": 1.2307262569832402, + "grad_norm": 2.511414051055908, + "learning_rate": 0.0009412605042016807, + "loss": 0.4313, + "step": 2203 + }, + { + "epoch": 1.2312849162011172, + "grad_norm": 1.5143481492996216, + "learning_rate": 0.0009412324929971989, + "loss": 0.5101, + "step": 2204 + }, + { + "epoch": 1.2318435754189945, + "grad_norm": 0.659372091293335, + "learning_rate": 0.0009412044817927171, + "loss": 0.4999, + "step": 2205 + }, + { + "epoch": 1.2324022346368715, + "grad_norm": 0.5657572746276855, + "learning_rate": 0.0009411764705882353, + "loss": 0.4752, + "step": 2206 + }, + { + "epoch": 1.2329608938547487, + "grad_norm": 0.5374714732170105, + "learning_rate": 0.0009411484593837535, + "loss": 0.5059, + "step": 2207 + }, + { + "epoch": 1.2335195530726257, + "grad_norm": 1.0359647274017334, + "learning_rate": 0.0009411204481792717, + "loss": 0.5077, + "step": 2208 + }, + { + "epoch": 1.2340782122905027, + "grad_norm": 0.7191410064697266, + "learning_rate": 0.0009410924369747899, + "loss": 0.4023, + "step": 2209 + }, + { + "epoch": 1.23463687150838, + "grad_norm": 0.8112833499908447, + "learning_rate": 0.0009410644257703081, + "loss": 0.5111, + "step": 2210 + }, + { + "epoch": 1.235195530726257, + "grad_norm": 0.760631799697876, + "learning_rate": 0.0009410364145658264, + "loss": 0.637, + "step": 2211 + }, + { + "epoch": 1.235754189944134, + "grad_norm": 14.107086181640625, + "learning_rate": 0.0009410084033613446, + "loss": 0.5136, + "step": 2212 + }, + { + "epoch": 1.2363128491620112, + "grad_norm": 0.7132804989814758, + "learning_rate": 0.0009409803921568629, + "loss": 0.4148, + "step": 2213 + }, + { + "epoch": 1.2368715083798882, + "grad_norm": 0.5110153555870056, + "learning_rate": 0.000940952380952381, + "loss": 0.4572, + "step": 2214 + }, + { + "epoch": 1.2374301675977653, + "grad_norm": 0.7452877759933472, + "learning_rate": 0.0009409243697478992, + "loss": 0.4563, + "step": 2215 + }, + { + "epoch": 1.2379888268156425, + "grad_norm": 0.6648342609405518, + "learning_rate": 0.0009408963585434174, + "loss": 0.4606, + "step": 2216 + }, + { + "epoch": 1.2385474860335195, + "grad_norm": 0.5784304738044739, + "learning_rate": 0.0009408683473389356, + "loss": 0.5491, + "step": 2217 + }, + { + "epoch": 1.2391061452513967, + "grad_norm": 0.6082378029823303, + "learning_rate": 0.0009408403361344539, + "loss": 0.5645, + "step": 2218 + }, + { + "epoch": 1.2396648044692737, + "grad_norm": 0.8048925995826721, + "learning_rate": 0.000940812324929972, + "loss": 0.5302, + "step": 2219 + }, + { + "epoch": 1.2402234636871508, + "grad_norm": 2.321373224258423, + "learning_rate": 0.0009407843137254902, + "loss": 0.4928, + "step": 2220 + }, + { + "epoch": 1.240782122905028, + "grad_norm": 0.5194941759109497, + "learning_rate": 0.0009407563025210084, + "loss": 0.4633, + "step": 2221 + }, + { + "epoch": 1.241340782122905, + "grad_norm": 8.402451515197754, + "learning_rate": 0.0009407282913165266, + "loss": 0.4617, + "step": 2222 + }, + { + "epoch": 1.2418994413407822, + "grad_norm": 0.4527837336063385, + "learning_rate": 0.0009407002801120449, + "loss": 0.5353, + "step": 2223 + }, + { + "epoch": 1.2424581005586592, + "grad_norm": 0.7383092045783997, + "learning_rate": 0.000940672268907563, + "loss": 0.5599, + "step": 2224 + }, + { + "epoch": 1.2430167597765363, + "grad_norm": 2.717256546020508, + "learning_rate": 0.0009406442577030812, + "loss": 0.5823, + "step": 2225 + }, + { + "epoch": 1.2435754189944135, + "grad_norm": 0.6024227142333984, + "learning_rate": 0.0009406162464985994, + "loss": 0.5049, + "step": 2226 + }, + { + "epoch": 1.2441340782122905, + "grad_norm": 0.48112189769744873, + "learning_rate": 0.0009405882352941176, + "loss": 0.4484, + "step": 2227 + }, + { + "epoch": 1.2446927374301675, + "grad_norm": 1.0186944007873535, + "learning_rate": 0.0009405602240896359, + "loss": 0.5297, + "step": 2228 + }, + { + "epoch": 1.2452513966480447, + "grad_norm": 0.48272573947906494, + "learning_rate": 0.0009405322128851542, + "loss": 0.4606, + "step": 2229 + }, + { + "epoch": 1.2458100558659218, + "grad_norm": 0.6424063444137573, + "learning_rate": 0.0009405042016806723, + "loss": 0.5134, + "step": 2230 + }, + { + "epoch": 1.2463687150837988, + "grad_norm": 0.8302348852157593, + "learning_rate": 0.0009404761904761905, + "loss": 0.4915, + "step": 2231 + }, + { + "epoch": 1.246927374301676, + "grad_norm": 0.5846266150474548, + "learning_rate": 0.0009404481792717087, + "loss": 0.444, + "step": 2232 + }, + { + "epoch": 1.247486033519553, + "grad_norm": 0.5861256718635559, + "learning_rate": 0.0009404201680672269, + "loss": 0.4968, + "step": 2233 + }, + { + "epoch": 1.2480446927374302, + "grad_norm": 0.5197890400886536, + "learning_rate": 0.0009403921568627452, + "loss": 0.4419, + "step": 2234 + }, + { + "epoch": 1.2486033519553073, + "grad_norm": 3.3656036853790283, + "learning_rate": 0.0009403641456582633, + "loss": 0.4971, + "step": 2235 + }, + { + "epoch": 1.2491620111731843, + "grad_norm": 2.8907134532928467, + "learning_rate": 0.0009403361344537815, + "loss": 0.4644, + "step": 2236 + }, + { + "epoch": 1.2497206703910615, + "grad_norm": 0.8501009345054626, + "learning_rate": 0.0009403081232492997, + "loss": 0.4333, + "step": 2237 + }, + { + "epoch": 1.2502793296089385, + "grad_norm": 0.7901805639266968, + "learning_rate": 0.0009402801120448179, + "loss": 0.5054, + "step": 2238 + }, + { + "epoch": 1.2508379888268157, + "grad_norm": 9.953897476196289, + "learning_rate": 0.0009402521008403362, + "loss": 0.5503, + "step": 2239 + }, + { + "epoch": 1.2513966480446927, + "grad_norm": 0.4574260413646698, + "learning_rate": 0.0009402240896358543, + "loss": 0.413, + "step": 2240 + }, + { + "epoch": 1.2519553072625698, + "grad_norm": 0.880647599697113, + "learning_rate": 0.0009401960784313725, + "loss": 0.488, + "step": 2241 + }, + { + "epoch": 1.252513966480447, + "grad_norm": 0.6018409729003906, + "learning_rate": 0.0009401680672268907, + "loss": 0.4849, + "step": 2242 + }, + { + "epoch": 1.253072625698324, + "grad_norm": 0.7612437009811401, + "learning_rate": 0.0009401400560224089, + "loss": 0.5401, + "step": 2243 + }, + { + "epoch": 1.2536312849162012, + "grad_norm": 0.6563434600830078, + "learning_rate": 0.0009401120448179273, + "loss": 0.512, + "step": 2244 + }, + { + "epoch": 1.2541899441340782, + "grad_norm": 0.7319206595420837, + "learning_rate": 0.0009400840336134455, + "loss": 0.6396, + "step": 2245 + }, + { + "epoch": 1.2547486033519553, + "grad_norm": 0.7326250076293945, + "learning_rate": 0.0009400560224089636, + "loss": 0.4986, + "step": 2246 + }, + { + "epoch": 1.2553072625698323, + "grad_norm": 0.666903555393219, + "learning_rate": 0.0009400280112044818, + "loss": 0.362, + "step": 2247 + }, + { + "epoch": 1.2558659217877095, + "grad_norm": 0.6648302674293518, + "learning_rate": 0.00094, + "loss": 0.4379, + "step": 2248 + }, + { + "epoch": 1.2564245810055865, + "grad_norm": 0.6782199144363403, + "learning_rate": 0.0009399719887955183, + "loss": 0.3984, + "step": 2249 + }, + { + "epoch": 1.2569832402234637, + "grad_norm": 0.8376119136810303, + "learning_rate": 0.0009399439775910365, + "loss": 0.5668, + "step": 2250 + }, + { + "epoch": 1.2575418994413408, + "grad_norm": 0.7745038270950317, + "learning_rate": 0.0009399159663865546, + "loss": 0.5542, + "step": 2251 + }, + { + "epoch": 1.2581005586592178, + "grad_norm": 0.49192437529563904, + "learning_rate": 0.0009398879551820728, + "loss": 0.4786, + "step": 2252 + }, + { + "epoch": 1.258659217877095, + "grad_norm": 0.7043928503990173, + "learning_rate": 0.000939859943977591, + "loss": 0.4761, + "step": 2253 + }, + { + "epoch": 1.259217877094972, + "grad_norm": 0.47638171911239624, + "learning_rate": 0.0009398319327731093, + "loss": 0.464, + "step": 2254 + }, + { + "epoch": 1.2597765363128492, + "grad_norm": 0.6261572241783142, + "learning_rate": 0.0009398039215686275, + "loss": 0.3789, + "step": 2255 + }, + { + "epoch": 1.2603351955307263, + "grad_norm": 0.9678022265434265, + "learning_rate": 0.0009397759103641456, + "loss": 0.4295, + "step": 2256 + }, + { + "epoch": 1.2608938547486033, + "grad_norm": 0.8020207285881042, + "learning_rate": 0.0009397478991596638, + "loss": 0.6257, + "step": 2257 + }, + { + "epoch": 1.2614525139664805, + "grad_norm": 0.5573917031288147, + "learning_rate": 0.000939719887955182, + "loss": 0.4932, + "step": 2258 + }, + { + "epoch": 1.2620111731843575, + "grad_norm": 1.220342755317688, + "learning_rate": 0.0009396918767507003, + "loss": 0.5833, + "step": 2259 + }, + { + "epoch": 1.2625698324022347, + "grad_norm": 0.5947352647781372, + "learning_rate": 0.0009396638655462186, + "loss": 0.5532, + "step": 2260 + }, + { + "epoch": 1.2631284916201118, + "grad_norm": 0.6306976675987244, + "learning_rate": 0.0009396358543417368, + "loss": 0.4108, + "step": 2261 + }, + { + "epoch": 1.2636871508379888, + "grad_norm": 1.0991276502609253, + "learning_rate": 0.0009396078431372549, + "loss": 0.5008, + "step": 2262 + }, + { + "epoch": 1.264245810055866, + "grad_norm": 0.4786874055862427, + "learning_rate": 0.0009395798319327731, + "loss": 0.4771, + "step": 2263 + }, + { + "epoch": 1.264804469273743, + "grad_norm": 1.4687696695327759, + "learning_rate": 0.0009395518207282914, + "loss": 0.6492, + "step": 2264 + }, + { + "epoch": 1.2653631284916202, + "grad_norm": 0.4954474866390228, + "learning_rate": 0.0009395238095238096, + "loss": 0.4555, + "step": 2265 + }, + { + "epoch": 1.2659217877094973, + "grad_norm": 0.5406965613365173, + "learning_rate": 0.0009394957983193278, + "loss": 0.5395, + "step": 2266 + }, + { + "epoch": 1.2664804469273743, + "grad_norm": 1.00322687625885, + "learning_rate": 0.0009394677871148459, + "loss": 0.4583, + "step": 2267 + }, + { + "epoch": 1.2670391061452513, + "grad_norm": 0.7092496752738953, + "learning_rate": 0.0009394397759103641, + "loss": 0.5007, + "step": 2268 + }, + { + "epoch": 1.2675977653631285, + "grad_norm": 0.8788418173789978, + "learning_rate": 0.0009394117647058824, + "loss": 0.489, + "step": 2269 + }, + { + "epoch": 1.2681564245810055, + "grad_norm": 0.5346425771713257, + "learning_rate": 0.0009393837535014006, + "loss": 0.4748, + "step": 2270 + }, + { + "epoch": 1.2687150837988828, + "grad_norm": 0.531053900718689, + "learning_rate": 0.0009393557422969188, + "loss": 0.3946, + "step": 2271 + }, + { + "epoch": 1.2692737430167598, + "grad_norm": 0.5598241686820984, + "learning_rate": 0.0009393277310924369, + "loss": 0.5454, + "step": 2272 + }, + { + "epoch": 1.2698324022346368, + "grad_norm": 0.6497790813446045, + "learning_rate": 0.0009392997198879551, + "loss": 0.4848, + "step": 2273 + }, + { + "epoch": 1.270391061452514, + "grad_norm": 0.47104766964912415, + "learning_rate": 0.0009392717086834734, + "loss": 0.4843, + "step": 2274 + }, + { + "epoch": 1.270949720670391, + "grad_norm": 0.6373875737190247, + "learning_rate": 0.0009392436974789916, + "loss": 0.4422, + "step": 2275 + }, + { + "epoch": 1.2715083798882683, + "grad_norm": 0.6158223152160645, + "learning_rate": 0.0009392156862745098, + "loss": 0.5435, + "step": 2276 + }, + { + "epoch": 1.2720670391061453, + "grad_norm": 0.8325719833374023, + "learning_rate": 0.000939187675070028, + "loss": 0.4364, + "step": 2277 + }, + { + "epoch": 1.2726256983240223, + "grad_norm": 0.5909124612808228, + "learning_rate": 0.0009391596638655462, + "loss": 0.5291, + "step": 2278 + }, + { + "epoch": 1.2731843575418995, + "grad_norm": 0.6929603219032288, + "learning_rate": 0.0009391316526610645, + "loss": 0.5202, + "step": 2279 + }, + { + "epoch": 1.2737430167597765, + "grad_norm": 0.5221192240715027, + "learning_rate": 0.0009391036414565827, + "loss": 0.4662, + "step": 2280 + }, + { + "epoch": 1.2743016759776538, + "grad_norm": 0.4766484797000885, + "learning_rate": 0.0009390756302521009, + "loss": 0.3955, + "step": 2281 + }, + { + "epoch": 1.2748603351955308, + "grad_norm": 0.4216242730617523, + "learning_rate": 0.0009390476190476191, + "loss": 0.3814, + "step": 2282 + }, + { + "epoch": 1.2754189944134078, + "grad_norm": 0.45738425850868225, + "learning_rate": 0.0009390196078431372, + "loss": 0.5629, + "step": 2283 + }, + { + "epoch": 1.2759776536312848, + "grad_norm": 1.963683009147644, + "learning_rate": 0.0009389915966386555, + "loss": 0.5602, + "step": 2284 + }, + { + "epoch": 1.276536312849162, + "grad_norm": 0.45442578196525574, + "learning_rate": 0.0009389635854341737, + "loss": 0.5522, + "step": 2285 + }, + { + "epoch": 1.277094972067039, + "grad_norm": 0.5828547477722168, + "learning_rate": 0.0009389355742296919, + "loss": 0.3992, + "step": 2286 + }, + { + "epoch": 1.2776536312849163, + "grad_norm": 0.6319419145584106, + "learning_rate": 0.0009389075630252101, + "loss": 0.4846, + "step": 2287 + }, + { + "epoch": 1.2782122905027933, + "grad_norm": 0.5193390846252441, + "learning_rate": 0.0009388795518207282, + "loss": 0.4614, + "step": 2288 + }, + { + "epoch": 1.2787709497206703, + "grad_norm": 0.7177364230155945, + "learning_rate": 0.0009388515406162465, + "loss": 0.4653, + "step": 2289 + }, + { + "epoch": 1.2793296089385475, + "grad_norm": 0.3906303346157074, + "learning_rate": 0.0009388235294117647, + "loss": 0.4146, + "step": 2290 + }, + { + "epoch": 1.2798882681564245, + "grad_norm": 0.47126659750938416, + "learning_rate": 0.0009387955182072829, + "loss": 0.4305, + "step": 2291 + }, + { + "epoch": 1.2804469273743018, + "grad_norm": 0.41768181324005127, + "learning_rate": 0.0009387675070028011, + "loss": 0.4524, + "step": 2292 + }, + { + "epoch": 1.2810055865921788, + "grad_norm": 0.5221830010414124, + "learning_rate": 0.0009387394957983194, + "loss": 0.5144, + "step": 2293 + }, + { + "epoch": 1.2815642458100558, + "grad_norm": 0.8605964779853821, + "learning_rate": 0.0009387114845938376, + "loss": 0.4509, + "step": 2294 + }, + { + "epoch": 1.282122905027933, + "grad_norm": 1.1724451780319214, + "learning_rate": 0.0009386834733893558, + "loss": 0.6453, + "step": 2295 + }, + { + "epoch": 1.28268156424581, + "grad_norm": 0.6247095465660095, + "learning_rate": 0.000938655462184874, + "loss": 0.5012, + "step": 2296 + }, + { + "epoch": 1.2832402234636873, + "grad_norm": 0.44797074794769287, + "learning_rate": 0.0009386274509803922, + "loss": 0.4417, + "step": 2297 + }, + { + "epoch": 1.2837988826815643, + "grad_norm": 0.48256805539131165, + "learning_rate": 0.0009385994397759104, + "loss": 0.4259, + "step": 2298 + }, + { + "epoch": 1.2843575418994413, + "grad_norm": 0.5220387578010559, + "learning_rate": 0.0009385714285714286, + "loss": 0.419, + "step": 2299 + }, + { + "epoch": 1.2849162011173183, + "grad_norm": 1.777891993522644, + "learning_rate": 0.0009385434173669468, + "loss": 0.3602, + "step": 2300 + }, + { + "epoch": 1.2854748603351955, + "grad_norm": 0.6454483270645142, + "learning_rate": 0.000938515406162465, + "loss": 0.5735, + "step": 2301 + }, + { + "epoch": 1.2860335195530725, + "grad_norm": 0.6514875888824463, + "learning_rate": 0.0009384873949579832, + "loss": 0.4505, + "step": 2302 + }, + { + "epoch": 1.2865921787709498, + "grad_norm": 1.135413408279419, + "learning_rate": 0.0009384593837535014, + "loss": 0.4506, + "step": 2303 + }, + { + "epoch": 1.2871508379888268, + "grad_norm": 1.0130354166030884, + "learning_rate": 0.0009384313725490196, + "loss": 0.3847, + "step": 2304 + }, + { + "epoch": 1.2877094972067038, + "grad_norm": 0.6721040606498718, + "learning_rate": 0.0009384033613445378, + "loss": 0.4477, + "step": 2305 + }, + { + "epoch": 1.288268156424581, + "grad_norm": 0.7059589624404907, + "learning_rate": 0.000938375350140056, + "loss": 0.7653, + "step": 2306 + }, + { + "epoch": 1.288826815642458, + "grad_norm": 0.4800010621547699, + "learning_rate": 0.0009383473389355742, + "loss": 0.4682, + "step": 2307 + }, + { + "epoch": 1.2893854748603353, + "grad_norm": 0.5354390740394592, + "learning_rate": 0.0009383193277310924, + "loss": 0.4465, + "step": 2308 + }, + { + "epoch": 1.2899441340782123, + "grad_norm": 0.5681769847869873, + "learning_rate": 0.0009382913165266108, + "loss": 0.5501, + "step": 2309 + }, + { + "epoch": 1.2905027932960893, + "grad_norm": 0.5556226372718811, + "learning_rate": 0.0009382633053221289, + "loss": 0.6201, + "step": 2310 + }, + { + "epoch": 1.2910614525139665, + "grad_norm": 0.5704392194747925, + "learning_rate": 0.0009382352941176471, + "loss": 0.5139, + "step": 2311 + }, + { + "epoch": 1.2916201117318435, + "grad_norm": 0.5250219106674194, + "learning_rate": 0.0009382072829131653, + "loss": 0.5067, + "step": 2312 + }, + { + "epoch": 1.2921787709497208, + "grad_norm": 0.48458990454673767, + "learning_rate": 0.0009381792717086835, + "loss": 0.3434, + "step": 2313 + }, + { + "epoch": 1.2927374301675978, + "grad_norm": 0.4792317748069763, + "learning_rate": 0.0009381512605042018, + "loss": 0.4662, + "step": 2314 + }, + { + "epoch": 1.2932960893854748, + "grad_norm": 0.44975733757019043, + "learning_rate": 0.0009381232492997199, + "loss": 0.4624, + "step": 2315 + }, + { + "epoch": 1.293854748603352, + "grad_norm": 0.8396225571632385, + "learning_rate": 0.0009380952380952381, + "loss": 0.5841, + "step": 2316 + }, + { + "epoch": 1.294413407821229, + "grad_norm": 2.294858932495117, + "learning_rate": 0.0009380672268907563, + "loss": 0.5021, + "step": 2317 + }, + { + "epoch": 1.2949720670391063, + "grad_norm": 1.1718876361846924, + "learning_rate": 0.0009380392156862745, + "loss": 0.5981, + "step": 2318 + }, + { + "epoch": 1.2955307262569833, + "grad_norm": 1.5335103273391724, + "learning_rate": 0.0009380112044817928, + "loss": 0.635, + "step": 2319 + }, + { + "epoch": 1.2960893854748603, + "grad_norm": 0.9901288151741028, + "learning_rate": 0.0009379831932773109, + "loss": 0.5752, + "step": 2320 + }, + { + "epoch": 1.2966480446927373, + "grad_norm": 1.2723642587661743, + "learning_rate": 0.0009379551820728291, + "loss": 0.584, + "step": 2321 + }, + { + "epoch": 1.2972067039106145, + "grad_norm": 0.5291574001312256, + "learning_rate": 0.0009379271708683473, + "loss": 0.354, + "step": 2322 + }, + { + "epoch": 1.2977653631284916, + "grad_norm": 0.8696710467338562, + "learning_rate": 0.0009378991596638655, + "loss": 0.4486, + "step": 2323 + }, + { + "epoch": 1.2983240223463688, + "grad_norm": 0.6118862628936768, + "learning_rate": 0.0009378711484593838, + "loss": 0.5562, + "step": 2324 + }, + { + "epoch": 1.2988826815642458, + "grad_norm": 0.6964899301528931, + "learning_rate": 0.000937843137254902, + "loss": 0.4508, + "step": 2325 + }, + { + "epoch": 1.2994413407821228, + "grad_norm": 0.6855241656303406, + "learning_rate": 0.0009378151260504201, + "loss": 0.5312, + "step": 2326 + }, + { + "epoch": 1.3, + "grad_norm": 1.0400898456573486, + "learning_rate": 0.0009377871148459384, + "loss": 0.5504, + "step": 2327 + }, + { + "epoch": 1.300558659217877, + "grad_norm": 0.5161594152450562, + "learning_rate": 0.0009377591036414566, + "loss": 0.5327, + "step": 2328 + }, + { + "epoch": 1.3011173184357543, + "grad_norm": 0.5096378326416016, + "learning_rate": 0.0009377310924369749, + "loss": 0.4295, + "step": 2329 + }, + { + "epoch": 1.3016759776536313, + "grad_norm": 0.4510524272918701, + "learning_rate": 0.0009377030812324931, + "loss": 0.4408, + "step": 2330 + }, + { + "epoch": 1.3022346368715083, + "grad_norm": 0.43716779351234436, + "learning_rate": 0.0009376750700280112, + "loss": 0.4222, + "step": 2331 + }, + { + "epoch": 1.3027932960893855, + "grad_norm": 0.6374119520187378, + "learning_rate": 0.0009376470588235294, + "loss": 0.3717, + "step": 2332 + }, + { + "epoch": 1.3033519553072626, + "grad_norm": 0.5533728003501892, + "learning_rate": 0.0009376190476190476, + "loss": 0.5172, + "step": 2333 + }, + { + "epoch": 1.3039106145251398, + "grad_norm": 0.7198346257209778, + "learning_rate": 0.0009375910364145659, + "loss": 0.5654, + "step": 2334 + }, + { + "epoch": 1.3044692737430168, + "grad_norm": 0.4423556327819824, + "learning_rate": 0.0009375630252100841, + "loss": 0.4037, + "step": 2335 + }, + { + "epoch": 1.3050279329608938, + "grad_norm": 0.5753763914108276, + "learning_rate": 0.0009375350140056022, + "loss": 0.503, + "step": 2336 + }, + { + "epoch": 1.3055865921787708, + "grad_norm": 1.4048054218292236, + "learning_rate": 0.0009375070028011204, + "loss": 0.4906, + "step": 2337 + }, + { + "epoch": 1.306145251396648, + "grad_norm": 0.6295005679130554, + "learning_rate": 0.0009374789915966386, + "loss": 0.5017, + "step": 2338 + }, + { + "epoch": 1.306703910614525, + "grad_norm": 0.4695497155189514, + "learning_rate": 0.0009374509803921569, + "loss": 0.507, + "step": 2339 + }, + { + "epoch": 1.3072625698324023, + "grad_norm": 0.4623326361179352, + "learning_rate": 0.0009374229691876751, + "loss": 0.4622, + "step": 2340 + }, + { + "epoch": 1.3078212290502793, + "grad_norm": 0.498626172542572, + "learning_rate": 0.0009373949579831933, + "loss": 0.4211, + "step": 2341 + }, + { + "epoch": 1.3083798882681563, + "grad_norm": 1.420953631401062, + "learning_rate": 0.0009373669467787114, + "loss": 0.4418, + "step": 2342 + }, + { + "epoch": 1.3089385474860336, + "grad_norm": 0.6144613027572632, + "learning_rate": 0.0009373389355742296, + "loss": 0.5697, + "step": 2343 + }, + { + "epoch": 1.3094972067039106, + "grad_norm": 0.5299192070960999, + "learning_rate": 0.000937310924369748, + "loss": 0.428, + "step": 2344 + }, + { + "epoch": 1.3100558659217878, + "grad_norm": 0.5369188189506531, + "learning_rate": 0.0009372829131652662, + "loss": 0.3956, + "step": 2345 + }, + { + "epoch": 1.3106145251396648, + "grad_norm": 0.853063702583313, + "learning_rate": 0.0009372549019607844, + "loss": 0.5381, + "step": 2346 + }, + { + "epoch": 1.3111731843575418, + "grad_norm": 6.990716934204102, + "learning_rate": 0.0009372268907563025, + "loss": 0.5895, + "step": 2347 + }, + { + "epoch": 1.311731843575419, + "grad_norm": 0.8140488266944885, + "learning_rate": 0.0009371988795518207, + "loss": 0.5286, + "step": 2348 + }, + { + "epoch": 1.312290502793296, + "grad_norm": 1.4516748189926147, + "learning_rate": 0.000937170868347339, + "loss": 0.4647, + "step": 2349 + }, + { + "epoch": 1.3128491620111733, + "grad_norm": 0.6451416015625, + "learning_rate": 0.0009371428571428572, + "loss": 0.522, + "step": 2350 + }, + { + "epoch": 1.3134078212290503, + "grad_norm": 0.7677500247955322, + "learning_rate": 0.0009371148459383754, + "loss": 0.5252, + "step": 2351 + }, + { + "epoch": 1.3139664804469273, + "grad_norm": 1.2045583724975586, + "learning_rate": 0.0009370868347338935, + "loss": 0.5289, + "step": 2352 + }, + { + "epoch": 1.3145251396648043, + "grad_norm": 0.47654032707214355, + "learning_rate": 0.0009370588235294117, + "loss": 0.4123, + "step": 2353 + }, + { + "epoch": 1.3150837988826816, + "grad_norm": 0.4769761860370636, + "learning_rate": 0.00093703081232493, + "loss": 0.4522, + "step": 2354 + }, + { + "epoch": 1.3156424581005586, + "grad_norm": 0.8925811648368835, + "learning_rate": 0.0009370028011204482, + "loss": 0.4944, + "step": 2355 + }, + { + "epoch": 1.3162011173184358, + "grad_norm": 0.5771417617797852, + "learning_rate": 0.0009369747899159664, + "loss": 0.4573, + "step": 2356 + }, + { + "epoch": 1.3167597765363128, + "grad_norm": 0.9821098446846008, + "learning_rate": 0.0009369467787114846, + "loss": 0.6322, + "step": 2357 + }, + { + "epoch": 1.3173184357541898, + "grad_norm": 0.5378075838088989, + "learning_rate": 0.0009369187675070027, + "loss": 0.546, + "step": 2358 + }, + { + "epoch": 1.317877094972067, + "grad_norm": 7.0606560707092285, + "learning_rate": 0.000936890756302521, + "loss": 0.6345, + "step": 2359 + }, + { + "epoch": 1.318435754189944, + "grad_norm": 0.7089935541152954, + "learning_rate": 0.0009368627450980393, + "loss": 0.5802, + "step": 2360 + }, + { + "epoch": 1.3189944134078213, + "grad_norm": 1.0092034339904785, + "learning_rate": 0.0009368347338935575, + "loss": 0.5194, + "step": 2361 + }, + { + "epoch": 1.3195530726256983, + "grad_norm": 1.243813395500183, + "learning_rate": 0.0009368067226890757, + "loss": 0.5829, + "step": 2362 + }, + { + "epoch": 1.3201117318435753, + "grad_norm": 0.7206133604049683, + "learning_rate": 0.0009367787114845938, + "loss": 0.4782, + "step": 2363 + }, + { + "epoch": 1.3206703910614526, + "grad_norm": 0.5967955589294434, + "learning_rate": 0.0009367507002801121, + "loss": 0.4524, + "step": 2364 + }, + { + "epoch": 1.3212290502793296, + "grad_norm": 0.6767614483833313, + "learning_rate": 0.0009367226890756303, + "loss": 0.4303, + "step": 2365 + }, + { + "epoch": 1.3217877094972068, + "grad_norm": 0.6356673240661621, + "learning_rate": 0.0009366946778711485, + "loss": 0.4548, + "step": 2366 + }, + { + "epoch": 1.3223463687150838, + "grad_norm": 0.6191254258155823, + "learning_rate": 0.0009366666666666667, + "loss": 0.4844, + "step": 2367 + }, + { + "epoch": 1.3229050279329608, + "grad_norm": 0.7626490592956543, + "learning_rate": 0.0009366386554621848, + "loss": 0.6647, + "step": 2368 + }, + { + "epoch": 1.323463687150838, + "grad_norm": 0.7978389263153076, + "learning_rate": 0.0009366106442577031, + "loss": 0.4788, + "step": 2369 + }, + { + "epoch": 1.324022346368715, + "grad_norm": 0.5534348487854004, + "learning_rate": 0.0009365826330532213, + "loss": 0.466, + "step": 2370 + }, + { + "epoch": 1.3245810055865923, + "grad_norm": 0.6681626439094543, + "learning_rate": 0.0009365546218487395, + "loss": 0.4191, + "step": 2371 + }, + { + "epoch": 1.3251396648044693, + "grad_norm": 0.6625450253486633, + "learning_rate": 0.0009365266106442577, + "loss": 0.4959, + "step": 2372 + }, + { + "epoch": 1.3256983240223463, + "grad_norm": 1.1665563583374023, + "learning_rate": 0.0009364985994397759, + "loss": 0.4161, + "step": 2373 + }, + { + "epoch": 1.3262569832402233, + "grad_norm": 0.9296486377716064, + "learning_rate": 0.0009364705882352941, + "loss": 0.4142, + "step": 2374 + }, + { + "epoch": 1.3268156424581006, + "grad_norm": 0.6072700619697571, + "learning_rate": 0.0009364425770308124, + "loss": 0.4306, + "step": 2375 + }, + { + "epoch": 1.3273743016759776, + "grad_norm": 0.5891187787055969, + "learning_rate": 0.0009364145658263306, + "loss": 0.5263, + "step": 2376 + }, + { + "epoch": 1.3279329608938548, + "grad_norm": 0.5013577938079834, + "learning_rate": 0.0009363865546218488, + "loss": 0.428, + "step": 2377 + }, + { + "epoch": 1.3284916201117318, + "grad_norm": 0.48129329085350037, + "learning_rate": 0.000936358543417367, + "loss": 0.4253, + "step": 2378 + }, + { + "epoch": 1.3290502793296088, + "grad_norm": 3.904083728790283, + "learning_rate": 0.0009363305322128852, + "loss": 0.4978, + "step": 2379 + }, + { + "epoch": 1.329608938547486, + "grad_norm": 0.6414837837219238, + "learning_rate": 0.0009363025210084034, + "loss": 0.468, + "step": 2380 + }, + { + "epoch": 1.330167597765363, + "grad_norm": 1.2827571630477905, + "learning_rate": 0.0009362745098039216, + "loss": 0.5169, + "step": 2381 + }, + { + "epoch": 1.3307262569832403, + "grad_norm": 0.7587721943855286, + "learning_rate": 0.0009362464985994398, + "loss": 0.385, + "step": 2382 + }, + { + "epoch": 1.3312849162011173, + "grad_norm": 0.4111006557941437, + "learning_rate": 0.000936218487394958, + "loss": 0.4295, + "step": 2383 + }, + { + "epoch": 1.3318435754189943, + "grad_norm": 0.951179563999176, + "learning_rate": 0.0009361904761904763, + "loss": 0.5763, + "step": 2384 + }, + { + "epoch": 1.3324022346368716, + "grad_norm": 0.6663637161254883, + "learning_rate": 0.0009361624649859944, + "loss": 0.4535, + "step": 2385 + }, + { + "epoch": 1.3329608938547486, + "grad_norm": 0.4615425765514374, + "learning_rate": 0.0009361344537815126, + "loss": 0.3139, + "step": 2386 + }, + { + "epoch": 1.3335195530726258, + "grad_norm": 0.9143214821815491, + "learning_rate": 0.0009361064425770308, + "loss": 0.4999, + "step": 2387 + }, + { + "epoch": 1.3340782122905028, + "grad_norm": 0.3537091016769409, + "learning_rate": 0.000936078431372549, + "loss": 0.3886, + "step": 2388 + }, + { + "epoch": 1.3346368715083798, + "grad_norm": 2.323657512664795, + "learning_rate": 0.0009360504201680673, + "loss": 0.4529, + "step": 2389 + }, + { + "epoch": 1.3351955307262569, + "grad_norm": 0.6156639456748962, + "learning_rate": 0.0009360224089635854, + "loss": 0.417, + "step": 2390 + }, + { + "epoch": 1.335754189944134, + "grad_norm": 1.5982677936553955, + "learning_rate": 0.0009359943977591036, + "loss": 0.3718, + "step": 2391 + }, + { + "epoch": 1.336312849162011, + "grad_norm": 0.5884528160095215, + "learning_rate": 0.0009359663865546219, + "loss": 0.5476, + "step": 2392 + }, + { + "epoch": 1.3368715083798883, + "grad_norm": 0.4246930181980133, + "learning_rate": 0.0009359383753501401, + "loss": 0.4264, + "step": 2393 + }, + { + "epoch": 1.3374301675977653, + "grad_norm": 0.5364925861358643, + "learning_rate": 0.0009359103641456584, + "loss": 0.4244, + "step": 2394 + }, + { + "epoch": 1.3379888268156424, + "grad_norm": 0.5065272450447083, + "learning_rate": 0.0009358823529411765, + "loss": 0.4616, + "step": 2395 + }, + { + "epoch": 1.3385474860335196, + "grad_norm": 0.6549285650253296, + "learning_rate": 0.0009358543417366947, + "loss": 0.5211, + "step": 2396 + }, + { + "epoch": 1.3391061452513966, + "grad_norm": 0.5903199911117554, + "learning_rate": 0.0009358263305322129, + "loss": 0.5934, + "step": 2397 + }, + { + "epoch": 1.3396648044692738, + "grad_norm": 2.045487880706787, + "learning_rate": 0.0009357983193277311, + "loss": 0.5649, + "step": 2398 + }, + { + "epoch": 1.3402234636871508, + "grad_norm": 0.5242357850074768, + "learning_rate": 0.0009357703081232494, + "loss": 0.3817, + "step": 2399 + }, + { + "epoch": 1.3407821229050279, + "grad_norm": 0.6028469800949097, + "learning_rate": 0.0009357422969187676, + "loss": 0.5627, + "step": 2400 + }, + { + "epoch": 1.341340782122905, + "grad_norm": 0.5879208445549011, + "learning_rate": 0.0009357142857142857, + "loss": 0.4442, + "step": 2401 + }, + { + "epoch": 1.341899441340782, + "grad_norm": 1.3531235456466675, + "learning_rate": 0.0009356862745098039, + "loss": 0.5208, + "step": 2402 + }, + { + "epoch": 1.3424581005586593, + "grad_norm": 0.8171552419662476, + "learning_rate": 0.0009356582633053221, + "loss": 0.4274, + "step": 2403 + }, + { + "epoch": 1.3430167597765363, + "grad_norm": 0.6056199669837952, + "learning_rate": 0.0009356302521008404, + "loss": 0.5542, + "step": 2404 + }, + { + "epoch": 1.3435754189944134, + "grad_norm": 0.8101986646652222, + "learning_rate": 0.0009356022408963586, + "loss": 0.6412, + "step": 2405 + }, + { + "epoch": 1.3441340782122906, + "grad_norm": 0.5022657513618469, + "learning_rate": 0.0009355742296918767, + "loss": 0.4424, + "step": 2406 + }, + { + "epoch": 1.3446927374301676, + "grad_norm": 1.1738780736923218, + "learning_rate": 0.0009355462184873949, + "loss": 0.642, + "step": 2407 + }, + { + "epoch": 1.3452513966480448, + "grad_norm": 0.43925851583480835, + "learning_rate": 0.0009355182072829131, + "loss": 0.5324, + "step": 2408 + }, + { + "epoch": 1.3458100558659218, + "grad_norm": 0.9970035552978516, + "learning_rate": 0.0009354901960784315, + "loss": 0.584, + "step": 2409 + }, + { + "epoch": 1.3463687150837989, + "grad_norm": 1.432970643043518, + "learning_rate": 0.0009354621848739497, + "loss": 0.4647, + "step": 2410 + }, + { + "epoch": 1.3469273743016759, + "grad_norm": 0.7642507553100586, + "learning_rate": 0.0009354341736694678, + "loss": 0.4856, + "step": 2411 + }, + { + "epoch": 1.347486033519553, + "grad_norm": 0.5948406457901001, + "learning_rate": 0.000935406162464986, + "loss": 0.4705, + "step": 2412 + }, + { + "epoch": 1.34804469273743, + "grad_norm": 0.5157648324966431, + "learning_rate": 0.0009353781512605042, + "loss": 0.4436, + "step": 2413 + }, + { + "epoch": 1.3486033519553073, + "grad_norm": 0.664602518081665, + "learning_rate": 0.0009353501400560225, + "loss": 0.4444, + "step": 2414 + }, + { + "epoch": 1.3491620111731844, + "grad_norm": 0.5324051380157471, + "learning_rate": 0.0009353221288515407, + "loss": 0.4082, + "step": 2415 + }, + { + "epoch": 1.3497206703910614, + "grad_norm": 0.7975985407829285, + "learning_rate": 0.0009352941176470589, + "loss": 0.4899, + "step": 2416 + }, + { + "epoch": 1.3502793296089386, + "grad_norm": 0.5563843846321106, + "learning_rate": 0.000935266106442577, + "loss": 0.3955, + "step": 2417 + }, + { + "epoch": 1.3508379888268156, + "grad_norm": 2.0421030521392822, + "learning_rate": 0.0009352380952380952, + "loss": 0.4895, + "step": 2418 + }, + { + "epoch": 1.3513966480446928, + "grad_norm": 0.6686625480651855, + "learning_rate": 0.0009352100840336135, + "loss": 0.4811, + "step": 2419 + }, + { + "epoch": 1.3519553072625698, + "grad_norm": 0.4972870349884033, + "learning_rate": 0.0009351820728291317, + "loss": 0.4943, + "step": 2420 + }, + { + "epoch": 1.3525139664804469, + "grad_norm": 0.680006206035614, + "learning_rate": 0.0009351540616246499, + "loss": 0.4926, + "step": 2421 + }, + { + "epoch": 1.353072625698324, + "grad_norm": 0.5970544219017029, + "learning_rate": 0.000935126050420168, + "loss": 0.437, + "step": 2422 + }, + { + "epoch": 1.353631284916201, + "grad_norm": 0.48182040452957153, + "learning_rate": 0.0009350980392156862, + "loss": 0.4512, + "step": 2423 + }, + { + "epoch": 1.3541899441340783, + "grad_norm": 0.6070398688316345, + "learning_rate": 0.0009350700280112046, + "loss": 0.6856, + "step": 2424 + }, + { + "epoch": 1.3547486033519553, + "grad_norm": 0.5730800628662109, + "learning_rate": 0.0009350420168067228, + "loss": 0.558, + "step": 2425 + }, + { + "epoch": 1.3553072625698324, + "grad_norm": 2.4738833904266357, + "learning_rate": 0.000935014005602241, + "loss": 0.4682, + "step": 2426 + }, + { + "epoch": 1.3558659217877094, + "grad_norm": 0.8916347026824951, + "learning_rate": 0.0009349859943977591, + "loss": 0.611, + "step": 2427 + }, + { + "epoch": 1.3564245810055866, + "grad_norm": 0.8283774852752686, + "learning_rate": 0.0009349579831932773, + "loss": 0.4222, + "step": 2428 + }, + { + "epoch": 1.3569832402234636, + "grad_norm": 0.5654096007347107, + "learning_rate": 0.0009349299719887956, + "loss": 0.5877, + "step": 2429 + }, + { + "epoch": 1.3575418994413408, + "grad_norm": 0.4563155770301819, + "learning_rate": 0.0009349019607843138, + "loss": 0.4535, + "step": 2430 + }, + { + "epoch": 1.3581005586592179, + "grad_norm": 0.7458542585372925, + "learning_rate": 0.000934873949579832, + "loss": 0.39, + "step": 2431 + }, + { + "epoch": 1.3586592178770949, + "grad_norm": 0.9126211404800415, + "learning_rate": 0.0009348459383753502, + "loss": 0.4917, + "step": 2432 + }, + { + "epoch": 1.359217877094972, + "grad_norm": 0.8338176012039185, + "learning_rate": 0.0009348179271708683, + "loss": 0.476, + "step": 2433 + }, + { + "epoch": 1.3597765363128491, + "grad_norm": 0.6514798402786255, + "learning_rate": 0.0009347899159663866, + "loss": 0.6019, + "step": 2434 + }, + { + "epoch": 1.3603351955307263, + "grad_norm": 0.657715380191803, + "learning_rate": 0.0009347619047619048, + "loss": 0.4768, + "step": 2435 + }, + { + "epoch": 1.3608938547486034, + "grad_norm": 0.6107572913169861, + "learning_rate": 0.000934733893557423, + "loss": 0.5799, + "step": 2436 + }, + { + "epoch": 1.3614525139664804, + "grad_norm": 1.2885218858718872, + "learning_rate": 0.0009347058823529412, + "loss": 0.5081, + "step": 2437 + }, + { + "epoch": 1.3620111731843576, + "grad_norm": 1.0166044235229492, + "learning_rate": 0.0009346778711484593, + "loss": 0.5626, + "step": 2438 + }, + { + "epoch": 1.3625698324022346, + "grad_norm": 0.8314241766929626, + "learning_rate": 0.0009346498599439776, + "loss": 0.5662, + "step": 2439 + }, + { + "epoch": 1.3631284916201118, + "grad_norm": 0.4049365818500519, + "learning_rate": 0.0009346218487394958, + "loss": 0.4666, + "step": 2440 + }, + { + "epoch": 1.3636871508379889, + "grad_norm": 1.9691574573516846, + "learning_rate": 0.000934593837535014, + "loss": 0.3739, + "step": 2441 + }, + { + "epoch": 1.3642458100558659, + "grad_norm": 1.2149285078048706, + "learning_rate": 0.0009345658263305323, + "loss": 0.6774, + "step": 2442 + }, + { + "epoch": 1.3648044692737429, + "grad_norm": 9.740959167480469, + "learning_rate": 0.0009345378151260504, + "loss": 0.4811, + "step": 2443 + }, + { + "epoch": 1.3653631284916201, + "grad_norm": 0.7420990467071533, + "learning_rate": 0.0009345098039215687, + "loss": 0.5085, + "step": 2444 + }, + { + "epoch": 1.3659217877094971, + "grad_norm": 0.8175650835037231, + "learning_rate": 0.0009344817927170869, + "loss": 0.4328, + "step": 2445 + }, + { + "epoch": 1.3664804469273744, + "grad_norm": 0.8845696449279785, + "learning_rate": 0.0009344537815126051, + "loss": 0.4291, + "step": 2446 + }, + { + "epoch": 1.3670391061452514, + "grad_norm": 0.6789989471435547, + "learning_rate": 0.0009344257703081233, + "loss": 0.5748, + "step": 2447 + }, + { + "epoch": 1.3675977653631284, + "grad_norm": 3.8430233001708984, + "learning_rate": 0.0009343977591036415, + "loss": 0.3884, + "step": 2448 + }, + { + "epoch": 1.3681564245810056, + "grad_norm": 12.143961906433105, + "learning_rate": 0.0009343697478991596, + "loss": 0.4391, + "step": 2449 + }, + { + "epoch": 1.3687150837988826, + "grad_norm": 1.053959846496582, + "learning_rate": 0.0009343417366946779, + "loss": 0.4391, + "step": 2450 + }, + { + "epoch": 1.3692737430167599, + "grad_norm": 0.7687263488769531, + "learning_rate": 0.0009343137254901961, + "loss": 0.5036, + "step": 2451 + }, + { + "epoch": 1.3698324022346369, + "grad_norm": 0.618775486946106, + "learning_rate": 0.0009342857142857143, + "loss": 0.6145, + "step": 2452 + }, + { + "epoch": 1.3703910614525139, + "grad_norm": 3.653212785720825, + "learning_rate": 0.0009342577030812325, + "loss": 0.5035, + "step": 2453 + }, + { + "epoch": 1.3709497206703911, + "grad_norm": 0.7300713062286377, + "learning_rate": 0.0009342296918767506, + "loss": 0.4668, + "step": 2454 + }, + { + "epoch": 1.3715083798882681, + "grad_norm": 0.7744580507278442, + "learning_rate": 0.0009342016806722689, + "loss": 0.4067, + "step": 2455 + }, + { + "epoch": 1.3720670391061454, + "grad_norm": 1.296471357345581, + "learning_rate": 0.0009341736694677871, + "loss": 0.5711, + "step": 2456 + }, + { + "epoch": 1.3726256983240224, + "grad_norm": 0.9308703541755676, + "learning_rate": 0.0009341456582633054, + "loss": 0.5285, + "step": 2457 + }, + { + "epoch": 1.3731843575418994, + "grad_norm": 0.6054986119270325, + "learning_rate": 0.0009341176470588236, + "loss": 0.5099, + "step": 2458 + }, + { + "epoch": 1.3737430167597766, + "grad_norm": 16.23585319519043, + "learning_rate": 0.0009340896358543417, + "loss": 0.6635, + "step": 2459 + }, + { + "epoch": 1.3743016759776536, + "grad_norm": 1.516026258468628, + "learning_rate": 0.00093406162464986, + "loss": 0.5686, + "step": 2460 + }, + { + "epoch": 1.3748603351955309, + "grad_norm": 2.034881353378296, + "learning_rate": 0.0009340336134453782, + "loss": 0.4215, + "step": 2461 + }, + { + "epoch": 1.3754189944134079, + "grad_norm": 1.4947832822799683, + "learning_rate": 0.0009340056022408964, + "loss": 0.6685, + "step": 2462 + }, + { + "epoch": 1.3759776536312849, + "grad_norm": 1.3847365379333496, + "learning_rate": 0.0009339775910364146, + "loss": 0.6743, + "step": 2463 + }, + { + "epoch": 1.376536312849162, + "grad_norm": 2.259474515914917, + "learning_rate": 0.0009339495798319328, + "loss": 0.4427, + "step": 2464 + }, + { + "epoch": 1.3770949720670391, + "grad_norm": 0.9777543544769287, + "learning_rate": 0.000933921568627451, + "loss": 0.5175, + "step": 2465 + }, + { + "epoch": 1.3776536312849161, + "grad_norm": 0.8059589862823486, + "learning_rate": 0.0009338935574229692, + "loss": 0.4721, + "step": 2466 + }, + { + "epoch": 1.3782122905027934, + "grad_norm": 0.6438359022140503, + "learning_rate": 0.0009338655462184874, + "loss": 0.4236, + "step": 2467 + }, + { + "epoch": 1.3787709497206704, + "grad_norm": 1.8244068622589111, + "learning_rate": 0.0009338375350140056, + "loss": 0.5051, + "step": 2468 + }, + { + "epoch": 1.3793296089385474, + "grad_norm": 9.361257553100586, + "learning_rate": 0.0009338095238095238, + "loss": 0.5777, + "step": 2469 + }, + { + "epoch": 1.3798882681564246, + "grad_norm": 0.803830087184906, + "learning_rate": 0.000933781512605042, + "loss": 0.5565, + "step": 2470 + }, + { + "epoch": 1.3804469273743016, + "grad_norm": 0.6548046469688416, + "learning_rate": 0.0009337535014005602, + "loss": 0.4599, + "step": 2471 + }, + { + "epoch": 1.3810055865921789, + "grad_norm": 0.7830075621604919, + "learning_rate": 0.0009337254901960784, + "loss": 0.4607, + "step": 2472 + }, + { + "epoch": 1.3815642458100559, + "grad_norm": 3.2481491565704346, + "learning_rate": 0.0009336974789915966, + "loss": 0.6175, + "step": 2473 + }, + { + "epoch": 1.382122905027933, + "grad_norm": 0.8518757224082947, + "learning_rate": 0.0009336694677871149, + "loss": 0.5009, + "step": 2474 + }, + { + "epoch": 1.3826815642458101, + "grad_norm": 0.5780080556869507, + "learning_rate": 0.0009336414565826331, + "loss": 0.4583, + "step": 2475 + }, + { + "epoch": 1.3832402234636871, + "grad_norm": 1.9408583641052246, + "learning_rate": 0.0009336134453781513, + "loss": 0.486, + "step": 2476 + }, + { + "epoch": 1.3837988826815644, + "grad_norm": 0.7586171627044678, + "learning_rate": 0.0009335854341736695, + "loss": 0.5087, + "step": 2477 + }, + { + "epoch": 1.3843575418994414, + "grad_norm": 1.2310386896133423, + "learning_rate": 0.0009335574229691877, + "loss": 0.5671, + "step": 2478 + }, + { + "epoch": 1.3849162011173184, + "grad_norm": 0.8117244839668274, + "learning_rate": 0.0009335294117647059, + "loss": 0.5861, + "step": 2479 + }, + { + "epoch": 1.3854748603351954, + "grad_norm": 0.6689615249633789, + "learning_rate": 0.0009335014005602242, + "loss": 0.5152, + "step": 2480 + }, + { + "epoch": 1.3860335195530726, + "grad_norm": 0.7848302721977234, + "learning_rate": 0.0009334733893557423, + "loss": 0.5771, + "step": 2481 + }, + { + "epoch": 1.3865921787709496, + "grad_norm": 0.8203095197677612, + "learning_rate": 0.0009334453781512605, + "loss": 0.5806, + "step": 2482 + }, + { + "epoch": 1.3871508379888269, + "grad_norm": 1.5317904949188232, + "learning_rate": 0.0009334173669467787, + "loss": 0.5107, + "step": 2483 + }, + { + "epoch": 1.387709497206704, + "grad_norm": 1.7203454971313477, + "learning_rate": 0.0009333893557422969, + "loss": 0.4391, + "step": 2484 + }, + { + "epoch": 1.388268156424581, + "grad_norm": 3.301591634750366, + "learning_rate": 0.0009333613445378152, + "loss": 0.516, + "step": 2485 + }, + { + "epoch": 1.3888268156424581, + "grad_norm": 0.7346853613853455, + "learning_rate": 0.0009333333333333333, + "loss": 0.5327, + "step": 2486 + }, + { + "epoch": 1.3893854748603351, + "grad_norm": 0.6157276630401611, + "learning_rate": 0.0009333053221288515, + "loss": 0.5551, + "step": 2487 + }, + { + "epoch": 1.3899441340782124, + "grad_norm": 0.6655274629592896, + "learning_rate": 0.0009332773109243697, + "loss": 0.4512, + "step": 2488 + }, + { + "epoch": 1.3905027932960894, + "grad_norm": 0.6196253895759583, + "learning_rate": 0.0009332492997198879, + "loss": 0.4572, + "step": 2489 + }, + { + "epoch": 1.3910614525139664, + "grad_norm": 0.5501835942268372, + "learning_rate": 0.0009332212885154063, + "loss": 0.5067, + "step": 2490 + }, + { + "epoch": 1.3916201117318436, + "grad_norm": 1.2724072933197021, + "learning_rate": 0.0009331932773109244, + "loss": 0.5171, + "step": 2491 + }, + { + "epoch": 1.3921787709497206, + "grad_norm": 0.9336773157119751, + "learning_rate": 0.0009331652661064426, + "loss": 0.4704, + "step": 2492 + }, + { + "epoch": 1.3927374301675979, + "grad_norm": 0.9166853427886963, + "learning_rate": 0.0009331372549019608, + "loss": 0.5213, + "step": 2493 + }, + { + "epoch": 1.393296089385475, + "grad_norm": 0.6173405051231384, + "learning_rate": 0.000933109243697479, + "loss": 0.5639, + "step": 2494 + }, + { + "epoch": 1.393854748603352, + "grad_norm": 0.5503080487251282, + "learning_rate": 0.0009330812324929973, + "loss": 0.4921, + "step": 2495 + }, + { + "epoch": 1.394413407821229, + "grad_norm": 19.085041046142578, + "learning_rate": 0.0009330532212885155, + "loss": 0.4346, + "step": 2496 + }, + { + "epoch": 1.3949720670391061, + "grad_norm": 0.6812968254089355, + "learning_rate": 0.0009330252100840336, + "loss": 0.4317, + "step": 2497 + }, + { + "epoch": 1.3955307262569834, + "grad_norm": 0.8005194067955017, + "learning_rate": 0.0009329971988795518, + "loss": 0.4454, + "step": 2498 + }, + { + "epoch": 1.3960893854748604, + "grad_norm": 0.7951617240905762, + "learning_rate": 0.00093296918767507, + "loss": 0.5179, + "step": 2499 + }, + { + "epoch": 1.3966480446927374, + "grad_norm": 0.5328164100646973, + "learning_rate": 0.0009329411764705883, + "loss": 0.429, + "step": 2500 + }, + { + "epoch": 1.3966480446927374, + "eval_cer": 0.0976836546539666, + "eval_loss": 0.36993345618247986, + "eval_runtime": 55.65, + "eval_samples_per_second": 81.545, + "eval_steps_per_second": 5.103, + "eval_wer": 0.38507304265243986, + "step": 2500 + }, + { + "epoch": 1.3972067039106144, + "grad_norm": 1.18572998046875, + "learning_rate": 0.0009329131652661065, + "loss": 0.5513, + "step": 2501 + }, + { + "epoch": 1.3977653631284916, + "grad_norm": 1.1080896854400635, + "learning_rate": 0.0009328851540616246, + "loss": 0.4954, + "step": 2502 + }, + { + "epoch": 1.3983240223463687, + "grad_norm": 0.789019763469696, + "learning_rate": 0.0009328571428571428, + "loss": 0.4919, + "step": 2503 + }, + { + "epoch": 1.3988826815642459, + "grad_norm": 1.1073821783065796, + "learning_rate": 0.000932829131652661, + "loss": 0.4855, + "step": 2504 + }, + { + "epoch": 1.399441340782123, + "grad_norm": 0.848599374294281, + "learning_rate": 0.0009328011204481793, + "loss": 0.3677, + "step": 2505 + }, + { + "epoch": 1.4, + "grad_norm": 0.8059666752815247, + "learning_rate": 0.0009327731092436976, + "loss": 0.6229, + "step": 2506 + }, + { + "epoch": 1.4005586592178771, + "grad_norm": 1.0171482563018799, + "learning_rate": 0.0009327450980392156, + "loss": 0.4088, + "step": 2507 + }, + { + "epoch": 1.4011173184357542, + "grad_norm": 0.8622143864631653, + "learning_rate": 0.0009327170868347339, + "loss": 0.5827, + "step": 2508 + }, + { + "epoch": 1.4016759776536314, + "grad_norm": 0.6227065324783325, + "learning_rate": 0.0009326890756302521, + "loss": 0.4108, + "step": 2509 + }, + { + "epoch": 1.4022346368715084, + "grad_norm": 0.5788086652755737, + "learning_rate": 0.0009326610644257704, + "loss": 0.5117, + "step": 2510 + }, + { + "epoch": 1.4027932960893854, + "grad_norm": 0.46648746728897095, + "learning_rate": 0.0009326330532212886, + "loss": 0.5672, + "step": 2511 + }, + { + "epoch": 1.4033519553072626, + "grad_norm": 0.6061733365058899, + "learning_rate": 0.0009326050420168068, + "loss": 0.5534, + "step": 2512 + }, + { + "epoch": 1.4039106145251397, + "grad_norm": 0.6212568283081055, + "learning_rate": 0.0009325770308123249, + "loss": 0.4059, + "step": 2513 + }, + { + "epoch": 1.4044692737430169, + "grad_norm": 0.6439541578292847, + "learning_rate": 0.0009325490196078431, + "loss": 0.4233, + "step": 2514 + }, + { + "epoch": 1.405027932960894, + "grad_norm": 1.7958157062530518, + "learning_rate": 0.0009325210084033614, + "loss": 0.4694, + "step": 2515 + }, + { + "epoch": 1.405586592178771, + "grad_norm": 0.43417346477508545, + "learning_rate": 0.0009324929971988796, + "loss": 0.4863, + "step": 2516 + }, + { + "epoch": 1.406145251396648, + "grad_norm": 0.9820460677146912, + "learning_rate": 0.0009324649859943978, + "loss": 0.4064, + "step": 2517 + }, + { + "epoch": 1.4067039106145252, + "grad_norm": 0.4516315460205078, + "learning_rate": 0.0009324369747899159, + "loss": 0.5079, + "step": 2518 + }, + { + "epoch": 1.4072625698324022, + "grad_norm": 0.9147122502326965, + "learning_rate": 0.0009324089635854341, + "loss": 0.5505, + "step": 2519 + }, + { + "epoch": 1.4078212290502794, + "grad_norm": 0.4546954035758972, + "learning_rate": 0.0009323809523809524, + "loss": 0.3978, + "step": 2520 + }, + { + "epoch": 1.4083798882681564, + "grad_norm": 0.43666183948516846, + "learning_rate": 0.0009323529411764706, + "loss": 0.4012, + "step": 2521 + }, + { + "epoch": 1.4089385474860334, + "grad_norm": 3.7608439922332764, + "learning_rate": 0.0009323249299719888, + "loss": 0.4185, + "step": 2522 + }, + { + "epoch": 1.4094972067039107, + "grad_norm": 0.40791329741477966, + "learning_rate": 0.0009322969187675069, + "loss": 0.4971, + "step": 2523 + }, + { + "epoch": 1.4100558659217877, + "grad_norm": 1.2691696882247925, + "learning_rate": 0.0009322689075630252, + "loss": 0.4989, + "step": 2524 + }, + { + "epoch": 1.410614525139665, + "grad_norm": 0.5668780207633972, + "learning_rate": 0.0009322408963585435, + "loss": 0.4954, + "step": 2525 + }, + { + "epoch": 1.411173184357542, + "grad_norm": 1.1780273914337158, + "learning_rate": 0.0009322128851540617, + "loss": 0.4284, + "step": 2526 + }, + { + "epoch": 1.411731843575419, + "grad_norm": 0.568503737449646, + "learning_rate": 0.0009321848739495799, + "loss": 0.4869, + "step": 2527 + }, + { + "epoch": 1.4122905027932962, + "grad_norm": 0.8034923076629639, + "learning_rate": 0.0009321568627450981, + "loss": 0.5239, + "step": 2528 + }, + { + "epoch": 1.4128491620111732, + "grad_norm": 1.2309287786483765, + "learning_rate": 0.0009321288515406162, + "loss": 0.4618, + "step": 2529 + }, + { + "epoch": 1.4134078212290504, + "grad_norm": 0.44164738059043884, + "learning_rate": 0.0009321008403361345, + "loss": 0.5138, + "step": 2530 + }, + { + "epoch": 1.4139664804469274, + "grad_norm": 1.4449609518051147, + "learning_rate": 0.0009320728291316527, + "loss": 0.5402, + "step": 2531 + }, + { + "epoch": 1.4145251396648044, + "grad_norm": 0.8091464638710022, + "learning_rate": 0.0009320448179271709, + "loss": 0.4908, + "step": 2532 + }, + { + "epoch": 1.4150837988826814, + "grad_norm": 0.6629249453544617, + "learning_rate": 0.0009320168067226891, + "loss": 0.5608, + "step": 2533 + }, + { + "epoch": 1.4156424581005587, + "grad_norm": 0.6336396932601929, + "learning_rate": 0.0009319887955182072, + "loss": 0.4957, + "step": 2534 + }, + { + "epoch": 1.4162011173184357, + "grad_norm": 0.8397176861763, + "learning_rate": 0.0009319607843137255, + "loss": 0.5222, + "step": 2535 + }, + { + "epoch": 1.416759776536313, + "grad_norm": 2.699629783630371, + "learning_rate": 0.0009319327731092437, + "loss": 0.489, + "step": 2536 + }, + { + "epoch": 1.41731843575419, + "grad_norm": 9.770133018493652, + "learning_rate": 0.0009319047619047619, + "loss": 0.5091, + "step": 2537 + }, + { + "epoch": 1.417877094972067, + "grad_norm": 1.0547733306884766, + "learning_rate": 0.0009318767507002801, + "loss": 0.4625, + "step": 2538 + }, + { + "epoch": 1.4184357541899442, + "grad_norm": 0.6585387587547302, + "learning_rate": 0.0009318487394957982, + "loss": 0.4785, + "step": 2539 + }, + { + "epoch": 1.4189944134078212, + "grad_norm": 0.8304965496063232, + "learning_rate": 0.0009318207282913166, + "loss": 0.4586, + "step": 2540 + }, + { + "epoch": 1.4195530726256984, + "grad_norm": 0.5305891633033752, + "learning_rate": 0.0009317927170868348, + "loss": 0.4909, + "step": 2541 + }, + { + "epoch": 1.4201117318435754, + "grad_norm": 0.9010879993438721, + "learning_rate": 0.000931764705882353, + "loss": 0.8379, + "step": 2542 + }, + { + "epoch": 1.4206703910614524, + "grad_norm": 0.5750892162322998, + "learning_rate": 0.0009317366946778712, + "loss": 0.4265, + "step": 2543 + }, + { + "epoch": 1.4212290502793297, + "grad_norm": 0.6748759150505066, + "learning_rate": 0.0009317086834733894, + "loss": 0.4119, + "step": 2544 + }, + { + "epoch": 1.4217877094972067, + "grad_norm": 1.5361045598983765, + "learning_rate": 0.0009316806722689076, + "loss": 0.4722, + "step": 2545 + }, + { + "epoch": 1.422346368715084, + "grad_norm": 0.6403470039367676, + "learning_rate": 0.0009316526610644258, + "loss": 0.4722, + "step": 2546 + }, + { + "epoch": 1.422905027932961, + "grad_norm": 1.2295637130737305, + "learning_rate": 0.000931624649859944, + "loss": 0.5622, + "step": 2547 + }, + { + "epoch": 1.423463687150838, + "grad_norm": 0.766720712184906, + "learning_rate": 0.0009315966386554622, + "loss": 0.4731, + "step": 2548 + }, + { + "epoch": 1.4240223463687152, + "grad_norm": 0.6608802676200867, + "learning_rate": 0.0009315686274509804, + "loss": 0.4391, + "step": 2549 + }, + { + "epoch": 1.4245810055865922, + "grad_norm": 0.4678759276866913, + "learning_rate": 0.0009315406162464986, + "loss": 0.5577, + "step": 2550 + }, + { + "epoch": 1.4251396648044694, + "grad_norm": 0.5720287561416626, + "learning_rate": 0.0009315126050420168, + "loss": 0.5004, + "step": 2551 + }, + { + "epoch": 1.4256983240223464, + "grad_norm": 0.7422264218330383, + "learning_rate": 0.000931484593837535, + "loss": 0.5391, + "step": 2552 + }, + { + "epoch": 1.4262569832402234, + "grad_norm": 0.8045322299003601, + "learning_rate": 0.0009314565826330532, + "loss": 0.5479, + "step": 2553 + }, + { + "epoch": 1.4268156424581004, + "grad_norm": 0.7911209464073181, + "learning_rate": 0.0009314285714285714, + "loss": 0.4286, + "step": 2554 + }, + { + "epoch": 1.4273743016759777, + "grad_norm": 0.8115670084953308, + "learning_rate": 0.0009314005602240896, + "loss": 0.6007, + "step": 2555 + }, + { + "epoch": 1.4279329608938547, + "grad_norm": 0.7535646557807922, + "learning_rate": 0.0009313725490196079, + "loss": 0.4321, + "step": 2556 + }, + { + "epoch": 1.428491620111732, + "grad_norm": 0.6910133957862854, + "learning_rate": 0.0009313445378151261, + "loss": 0.5538, + "step": 2557 + }, + { + "epoch": 1.429050279329609, + "grad_norm": 0.6578505635261536, + "learning_rate": 0.0009313165266106443, + "loss": 0.5565, + "step": 2558 + }, + { + "epoch": 1.429608938547486, + "grad_norm": 2.156248092651367, + "learning_rate": 0.0009312885154061625, + "loss": 0.5567, + "step": 2559 + }, + { + "epoch": 1.4301675977653632, + "grad_norm": 0.5340940356254578, + "learning_rate": 0.0009312605042016808, + "loss": 0.6375, + "step": 2560 + }, + { + "epoch": 1.4307262569832402, + "grad_norm": 0.6691848039627075, + "learning_rate": 0.0009312324929971989, + "loss": 0.6056, + "step": 2561 + }, + { + "epoch": 1.4312849162011174, + "grad_norm": 0.8040280938148499, + "learning_rate": 0.0009312044817927171, + "loss": 0.6013, + "step": 2562 + }, + { + "epoch": 1.4318435754189944, + "grad_norm": 0.5586495995521545, + "learning_rate": 0.0009311764705882353, + "loss": 0.5543, + "step": 2563 + }, + { + "epoch": 1.4324022346368714, + "grad_norm": 1.5113362073898315, + "learning_rate": 0.0009311484593837535, + "loss": 0.3772, + "step": 2564 + }, + { + "epoch": 1.4329608938547487, + "grad_norm": 0.8635311722755432, + "learning_rate": 0.0009311204481792718, + "loss": 0.5313, + "step": 2565 + }, + { + "epoch": 1.4335195530726257, + "grad_norm": 0.886853039264679, + "learning_rate": 0.0009310924369747899, + "loss": 0.4537, + "step": 2566 + }, + { + "epoch": 1.434078212290503, + "grad_norm": 2.031019449234009, + "learning_rate": 0.0009310644257703081, + "loss": 0.5883, + "step": 2567 + }, + { + "epoch": 1.43463687150838, + "grad_norm": 1.0636546611785889, + "learning_rate": 0.0009310364145658263, + "loss": 0.4193, + "step": 2568 + }, + { + "epoch": 1.435195530726257, + "grad_norm": 0.42769888043403625, + "learning_rate": 0.0009310084033613445, + "loss": 0.4232, + "step": 2569 + }, + { + "epoch": 1.435754189944134, + "grad_norm": 0.8384038209915161, + "learning_rate": 0.0009309803921568628, + "loss": 0.503, + "step": 2570 + }, + { + "epoch": 1.4363128491620112, + "grad_norm": 0.7572436928749084, + "learning_rate": 0.0009309523809523809, + "loss": 0.5198, + "step": 2571 + }, + { + "epoch": 1.4368715083798882, + "grad_norm": 1.6784976720809937, + "learning_rate": 0.0009309243697478991, + "loss": 0.6243, + "step": 2572 + }, + { + "epoch": 1.4374301675977654, + "grad_norm": 0.7462752461433411, + "learning_rate": 0.0009308963585434174, + "loss": 0.6218, + "step": 2573 + }, + { + "epoch": 1.4379888268156424, + "grad_norm": 1.355023741722107, + "learning_rate": 0.0009308683473389356, + "loss": 0.4417, + "step": 2574 + }, + { + "epoch": 1.4385474860335195, + "grad_norm": 0.46579670906066895, + "learning_rate": 0.0009308403361344539, + "loss": 0.4145, + "step": 2575 + }, + { + "epoch": 1.4391061452513967, + "grad_norm": 6.230794429779053, + "learning_rate": 0.0009308123249299721, + "loss": 0.6353, + "step": 2576 + }, + { + "epoch": 1.4396648044692737, + "grad_norm": 0.6536762714385986, + "learning_rate": 0.0009307843137254902, + "loss": 0.466, + "step": 2577 + }, + { + "epoch": 1.440223463687151, + "grad_norm": 0.46875396370887756, + "learning_rate": 0.0009307563025210084, + "loss": 0.5062, + "step": 2578 + }, + { + "epoch": 1.440782122905028, + "grad_norm": 0.4661177694797516, + "learning_rate": 0.0009307282913165266, + "loss": 0.4033, + "step": 2579 + }, + { + "epoch": 1.441340782122905, + "grad_norm": 0.633985161781311, + "learning_rate": 0.0009307002801120449, + "loss": 0.5414, + "step": 2580 + }, + { + "epoch": 1.4418994413407822, + "grad_norm": 0.8244467973709106, + "learning_rate": 0.0009306722689075631, + "loss": 0.5299, + "step": 2581 + }, + { + "epoch": 1.4424581005586592, + "grad_norm": 0.6113318204879761, + "learning_rate": 0.0009306442577030812, + "loss": 0.4394, + "step": 2582 + }, + { + "epoch": 1.4430167597765364, + "grad_norm": 1.072367787361145, + "learning_rate": 0.0009306162464985994, + "loss": 0.4307, + "step": 2583 + }, + { + "epoch": 1.4435754189944134, + "grad_norm": 0.42234715819358826, + "learning_rate": 0.0009305882352941176, + "loss": 0.4104, + "step": 2584 + }, + { + "epoch": 1.4441340782122905, + "grad_norm": 0.931414783000946, + "learning_rate": 0.0009305602240896359, + "loss": 0.5007, + "step": 2585 + }, + { + "epoch": 1.4446927374301675, + "grad_norm": 0.9446127414703369, + "learning_rate": 0.0009305322128851541, + "loss": 0.4738, + "step": 2586 + }, + { + "epoch": 1.4452513966480447, + "grad_norm": 0.6625229120254517, + "learning_rate": 0.0009305042016806722, + "loss": 0.5122, + "step": 2587 + }, + { + "epoch": 1.4458100558659217, + "grad_norm": 0.5373765826225281, + "learning_rate": 0.0009304761904761904, + "loss": 0.3953, + "step": 2588 + }, + { + "epoch": 1.446368715083799, + "grad_norm": 0.7654617428779602, + "learning_rate": 0.0009304481792717086, + "loss": 0.5313, + "step": 2589 + }, + { + "epoch": 1.446927374301676, + "grad_norm": 0.6200145483016968, + "learning_rate": 0.000930420168067227, + "loss": 0.5059, + "step": 2590 + }, + { + "epoch": 1.447486033519553, + "grad_norm": 0.530850887298584, + "learning_rate": 0.0009303921568627452, + "loss": 0.4399, + "step": 2591 + }, + { + "epoch": 1.4480446927374302, + "grad_norm": 0.5828093886375427, + "learning_rate": 0.0009303641456582634, + "loss": 0.5025, + "step": 2592 + }, + { + "epoch": 1.4486033519553072, + "grad_norm": 0.5664350986480713, + "learning_rate": 0.0009303361344537815, + "loss": 0.5232, + "step": 2593 + }, + { + "epoch": 1.4491620111731844, + "grad_norm": 0.5003907084465027, + "learning_rate": 0.0009303081232492997, + "loss": 0.5158, + "step": 2594 + }, + { + "epoch": 1.4497206703910615, + "grad_norm": 0.4369567334651947, + "learning_rate": 0.000930280112044818, + "loss": 0.3871, + "step": 2595 + }, + { + "epoch": 1.4502793296089385, + "grad_norm": 0.4398859143257141, + "learning_rate": 0.0009302521008403362, + "loss": 0.3881, + "step": 2596 + }, + { + "epoch": 1.4508379888268157, + "grad_norm": 0.46730324625968933, + "learning_rate": 0.0009302240896358544, + "loss": 0.3999, + "step": 2597 + }, + { + "epoch": 1.4513966480446927, + "grad_norm": 0.6658786535263062, + "learning_rate": 0.0009301960784313725, + "loss": 0.4737, + "step": 2598 + }, + { + "epoch": 1.45195530726257, + "grad_norm": 0.7979653477668762, + "learning_rate": 0.0009301680672268907, + "loss": 0.6727, + "step": 2599 + }, + { + "epoch": 1.452513966480447, + "grad_norm": 0.7182505130767822, + "learning_rate": 0.000930140056022409, + "loss": 0.4866, + "step": 2600 + }, + { + "epoch": 1.453072625698324, + "grad_norm": 2.0803158283233643, + "learning_rate": 0.0009301120448179272, + "loss": 0.3919, + "step": 2601 + }, + { + "epoch": 1.4536312849162012, + "grad_norm": 0.6667923927307129, + "learning_rate": 0.0009300840336134454, + "loss": 0.5225, + "step": 2602 + }, + { + "epoch": 1.4541899441340782, + "grad_norm": 0.7217258810997009, + "learning_rate": 0.0009300560224089635, + "loss": 0.4409, + "step": 2603 + }, + { + "epoch": 1.4547486033519554, + "grad_norm": 0.5033441781997681, + "learning_rate": 0.0009300280112044817, + "loss": 0.4594, + "step": 2604 + }, + { + "epoch": 1.4553072625698324, + "grad_norm": 0.5012925267219543, + "learning_rate": 0.00093, + "loss": 0.5098, + "step": 2605 + }, + { + "epoch": 1.4558659217877095, + "grad_norm": 0.81548011302948, + "learning_rate": 0.0009299719887955183, + "loss": 0.5645, + "step": 2606 + }, + { + "epoch": 1.4564245810055865, + "grad_norm": 0.6461879014968872, + "learning_rate": 0.0009299439775910365, + "loss": 0.4361, + "step": 2607 + }, + { + "epoch": 1.4569832402234637, + "grad_norm": 0.8119282126426697, + "learning_rate": 0.0009299159663865547, + "loss": 0.5991, + "step": 2608 + }, + { + "epoch": 1.4575418994413407, + "grad_norm": 1.2009344100952148, + "learning_rate": 0.0009298879551820728, + "loss": 0.4882, + "step": 2609 + }, + { + "epoch": 1.458100558659218, + "grad_norm": 0.7457526922225952, + "learning_rate": 0.0009298599439775911, + "loss": 0.5036, + "step": 2610 + }, + { + "epoch": 1.458659217877095, + "grad_norm": 0.6496821045875549, + "learning_rate": 0.0009298319327731093, + "loss": 0.5617, + "step": 2611 + }, + { + "epoch": 1.459217877094972, + "grad_norm": 0.4850807189941406, + "learning_rate": 0.0009298039215686275, + "loss": 0.5347, + "step": 2612 + }, + { + "epoch": 1.4597765363128492, + "grad_norm": 0.5157176852226257, + "learning_rate": 0.0009297759103641457, + "loss": 0.3972, + "step": 2613 + }, + { + "epoch": 1.4603351955307262, + "grad_norm": 0.6404157876968384, + "learning_rate": 0.0009297478991596638, + "loss": 0.3821, + "step": 2614 + }, + { + "epoch": 1.4608938547486034, + "grad_norm": 0.6212974786758423, + "learning_rate": 0.0009297198879551821, + "loss": 0.5472, + "step": 2615 + }, + { + "epoch": 1.4614525139664805, + "grad_norm": 0.8928610682487488, + "learning_rate": 0.0009296918767507003, + "loss": 0.3099, + "step": 2616 + }, + { + "epoch": 1.4620111731843575, + "grad_norm": 4.528392314910889, + "learning_rate": 0.0009296638655462185, + "loss": 0.4775, + "step": 2617 + }, + { + "epoch": 1.4625698324022347, + "grad_norm": 0.6402145624160767, + "learning_rate": 0.0009296358543417367, + "loss": 0.4919, + "step": 2618 + }, + { + "epoch": 1.4631284916201117, + "grad_norm": 0.5726767778396606, + "learning_rate": 0.0009296078431372548, + "loss": 0.5082, + "step": 2619 + }, + { + "epoch": 1.463687150837989, + "grad_norm": 0.46533146500587463, + "learning_rate": 0.0009295798319327731, + "loss": 0.3863, + "step": 2620 + }, + { + "epoch": 1.464245810055866, + "grad_norm": 0.49588528275489807, + "learning_rate": 0.0009295518207282913, + "loss": 0.3835, + "step": 2621 + }, + { + "epoch": 1.464804469273743, + "grad_norm": 0.45549502968788147, + "learning_rate": 0.0009295238095238096, + "loss": 0.3506, + "step": 2622 + }, + { + "epoch": 1.46536312849162, + "grad_norm": 0.5790160894393921, + "learning_rate": 0.0009294957983193278, + "loss": 0.6728, + "step": 2623 + }, + { + "epoch": 1.4659217877094972, + "grad_norm": 0.6579418778419495, + "learning_rate": 0.000929467787114846, + "loss": 0.5358, + "step": 2624 + }, + { + "epoch": 1.4664804469273742, + "grad_norm": 0.45469197630882263, + "learning_rate": 0.0009294397759103642, + "loss": 0.4329, + "step": 2625 + }, + { + "epoch": 1.4670391061452515, + "grad_norm": 0.5428677797317505, + "learning_rate": 0.0009294117647058824, + "loss": 0.4543, + "step": 2626 + }, + { + "epoch": 1.4675977653631285, + "grad_norm": 0.5525240898132324, + "learning_rate": 0.0009293837535014006, + "loss": 0.5091, + "step": 2627 + }, + { + "epoch": 1.4681564245810055, + "grad_norm": 0.7379921078681946, + "learning_rate": 0.0009293557422969188, + "loss": 0.4923, + "step": 2628 + }, + { + "epoch": 1.4687150837988827, + "grad_norm": 1.9589022397994995, + "learning_rate": 0.000929327731092437, + "loss": 0.5534, + "step": 2629 + }, + { + "epoch": 1.4692737430167597, + "grad_norm": 0.4258909821510315, + "learning_rate": 0.0009292997198879552, + "loss": 0.4178, + "step": 2630 + }, + { + "epoch": 1.469832402234637, + "grad_norm": 0.4239647388458252, + "learning_rate": 0.0009292717086834734, + "loss": 0.4064, + "step": 2631 + }, + { + "epoch": 1.470391061452514, + "grad_norm": 0.5227537155151367, + "learning_rate": 0.0009292436974789916, + "loss": 0.4668, + "step": 2632 + }, + { + "epoch": 1.470949720670391, + "grad_norm": 0.6114113926887512, + "learning_rate": 0.0009292156862745098, + "loss": 0.5525, + "step": 2633 + }, + { + "epoch": 1.4715083798882682, + "grad_norm": 0.6233765482902527, + "learning_rate": 0.000929187675070028, + "loss": 0.5509, + "step": 2634 + }, + { + "epoch": 1.4720670391061452, + "grad_norm": 2.157526731491089, + "learning_rate": 0.0009291596638655463, + "loss": 0.4408, + "step": 2635 + }, + { + "epoch": 1.4726256983240225, + "grad_norm": 1.38654625415802, + "learning_rate": 0.0009291316526610644, + "loss": 0.5012, + "step": 2636 + }, + { + "epoch": 1.4731843575418995, + "grad_norm": 0.6054531335830688, + "learning_rate": 0.0009291036414565826, + "loss": 0.4161, + "step": 2637 + }, + { + "epoch": 1.4737430167597765, + "grad_norm": 0.721886396408081, + "learning_rate": 0.0009290756302521009, + "loss": 0.5071, + "step": 2638 + }, + { + "epoch": 1.4743016759776537, + "grad_norm": 1.330005168914795, + "learning_rate": 0.0009290476190476191, + "loss": 0.5051, + "step": 2639 + }, + { + "epoch": 1.4748603351955307, + "grad_norm": 0.630526602268219, + "learning_rate": 0.0009290196078431374, + "loss": 0.5096, + "step": 2640 + }, + { + "epoch": 1.475418994413408, + "grad_norm": 1.3126240968704224, + "learning_rate": 0.0009289915966386555, + "loss": 0.3866, + "step": 2641 + }, + { + "epoch": 1.475977653631285, + "grad_norm": 0.9824892282485962, + "learning_rate": 0.0009289635854341737, + "loss": 0.5208, + "step": 2642 + }, + { + "epoch": 1.476536312849162, + "grad_norm": 0.6409022808074951, + "learning_rate": 0.0009289355742296919, + "loss": 0.4574, + "step": 2643 + }, + { + "epoch": 1.477094972067039, + "grad_norm": 0.5286266803741455, + "learning_rate": 0.0009289075630252101, + "loss": 0.441, + "step": 2644 + }, + { + "epoch": 1.4776536312849162, + "grad_norm": 0.5803026556968689, + "learning_rate": 0.0009288795518207284, + "loss": 0.534, + "step": 2645 + }, + { + "epoch": 1.4782122905027932, + "grad_norm": 0.670640766620636, + "learning_rate": 0.0009288515406162465, + "loss": 0.4967, + "step": 2646 + }, + { + "epoch": 1.4787709497206705, + "grad_norm": 0.7008808851242065, + "learning_rate": 0.0009288235294117647, + "loss": 0.5124, + "step": 2647 + }, + { + "epoch": 1.4793296089385475, + "grad_norm": 1.3227956295013428, + "learning_rate": 0.0009287955182072829, + "loss": 0.4875, + "step": 2648 + }, + { + "epoch": 1.4798882681564245, + "grad_norm": 0.7477467656135559, + "learning_rate": 0.0009287675070028011, + "loss": 0.7499, + "step": 2649 + }, + { + "epoch": 1.4804469273743017, + "grad_norm": 0.555888295173645, + "learning_rate": 0.0009287394957983194, + "loss": 0.6365, + "step": 2650 + }, + { + "epoch": 1.4810055865921787, + "grad_norm": 0.739152729511261, + "learning_rate": 0.0009287114845938376, + "loss": 0.6218, + "step": 2651 + }, + { + "epoch": 1.481564245810056, + "grad_norm": 2.709662675857544, + "learning_rate": 0.0009286834733893557, + "loss": 0.5598, + "step": 2652 + }, + { + "epoch": 1.482122905027933, + "grad_norm": 0.6691284775733948, + "learning_rate": 0.0009286554621848739, + "loss": 0.4887, + "step": 2653 + }, + { + "epoch": 1.48268156424581, + "grad_norm": 0.5898959040641785, + "learning_rate": 0.0009286274509803921, + "loss": 0.5642, + "step": 2654 + }, + { + "epoch": 1.4832402234636872, + "grad_norm": 0.515849232673645, + "learning_rate": 0.0009285994397759105, + "loss": 0.4541, + "step": 2655 + }, + { + "epoch": 1.4837988826815642, + "grad_norm": 0.6814642548561096, + "learning_rate": 0.0009285714285714287, + "loss": 0.502, + "step": 2656 + }, + { + "epoch": 1.4843575418994415, + "grad_norm": 0.47332486510276794, + "learning_rate": 0.0009285434173669468, + "loss": 0.4856, + "step": 2657 + }, + { + "epoch": 1.4849162011173185, + "grad_norm": 0.522954523563385, + "learning_rate": 0.000928515406162465, + "loss": 0.5723, + "step": 2658 + }, + { + "epoch": 1.4854748603351955, + "grad_norm": 0.7725816965103149, + "learning_rate": 0.0009284873949579832, + "loss": 0.49, + "step": 2659 + }, + { + "epoch": 1.4860335195530725, + "grad_norm": 1.1196281909942627, + "learning_rate": 0.0009284593837535015, + "loss": 0.6984, + "step": 2660 + }, + { + "epoch": 1.4865921787709497, + "grad_norm": 6.089732646942139, + "learning_rate": 0.0009284313725490197, + "loss": 0.4783, + "step": 2661 + }, + { + "epoch": 1.4871508379888267, + "grad_norm": 1.3049112558364868, + "learning_rate": 0.0009284033613445378, + "loss": 0.5496, + "step": 2662 + }, + { + "epoch": 1.487709497206704, + "grad_norm": 0.7450880408287048, + "learning_rate": 0.000928375350140056, + "loss": 0.4972, + "step": 2663 + }, + { + "epoch": 1.488268156424581, + "grad_norm": 0.7437418103218079, + "learning_rate": 0.0009283473389355742, + "loss": 0.6042, + "step": 2664 + }, + { + "epoch": 1.488826815642458, + "grad_norm": 0.4941578805446625, + "learning_rate": 0.0009283193277310925, + "loss": 0.5089, + "step": 2665 + }, + { + "epoch": 1.4893854748603352, + "grad_norm": 0.5777472853660583, + "learning_rate": 0.0009282913165266107, + "loss": 0.358, + "step": 2666 + }, + { + "epoch": 1.4899441340782122, + "grad_norm": 1.475481629371643, + "learning_rate": 0.0009282633053221289, + "loss": 0.5799, + "step": 2667 + }, + { + "epoch": 1.4905027932960895, + "grad_norm": 0.7806932330131531, + "learning_rate": 0.000928235294117647, + "loss": 0.3583, + "step": 2668 + }, + { + "epoch": 1.4910614525139665, + "grad_norm": 4.788658142089844, + "learning_rate": 0.0009282072829131652, + "loss": 0.5905, + "step": 2669 + }, + { + "epoch": 1.4916201117318435, + "grad_norm": 0.5692121386528015, + "learning_rate": 0.0009281792717086836, + "loss": 0.5891, + "step": 2670 + }, + { + "epoch": 1.4921787709497207, + "grad_norm": 1.0268464088439941, + "learning_rate": 0.0009281512605042018, + "loss": 0.4142, + "step": 2671 + }, + { + "epoch": 1.4927374301675977, + "grad_norm": 0.8897309303283691, + "learning_rate": 0.00092812324929972, + "loss": 0.3936, + "step": 2672 + }, + { + "epoch": 1.493296089385475, + "grad_norm": 0.9935110807418823, + "learning_rate": 0.0009280952380952381, + "loss": 0.4195, + "step": 2673 + }, + { + "epoch": 1.493854748603352, + "grad_norm": 0.5382152199745178, + "learning_rate": 0.0009280672268907563, + "loss": 0.4481, + "step": 2674 + }, + { + "epoch": 1.494413407821229, + "grad_norm": 1.0676406621932983, + "learning_rate": 0.0009280392156862745, + "loss": 0.3805, + "step": 2675 + }, + { + "epoch": 1.494972067039106, + "grad_norm": 0.5046552419662476, + "learning_rate": 0.0009280112044817928, + "loss": 0.5647, + "step": 2676 + }, + { + "epoch": 1.4955307262569832, + "grad_norm": 0.48943591117858887, + "learning_rate": 0.000927983193277311, + "loss": 0.4288, + "step": 2677 + }, + { + "epoch": 1.4960893854748603, + "grad_norm": 3.701481342315674, + "learning_rate": 0.0009279551820728291, + "loss": 0.437, + "step": 2678 + }, + { + "epoch": 1.4966480446927375, + "grad_norm": 1.0245435237884521, + "learning_rate": 0.0009279271708683473, + "loss": 0.4463, + "step": 2679 + }, + { + "epoch": 1.4972067039106145, + "grad_norm": 0.6588135957717896, + "learning_rate": 0.0009278991596638655, + "loss": 0.5567, + "step": 2680 + }, + { + "epoch": 1.4977653631284915, + "grad_norm": 0.5455290079116821, + "learning_rate": 0.0009278711484593838, + "loss": 0.4625, + "step": 2681 + }, + { + "epoch": 1.4983240223463687, + "grad_norm": 0.5909234881401062, + "learning_rate": 0.000927843137254902, + "loss": 0.4309, + "step": 2682 + }, + { + "epoch": 1.4988826815642458, + "grad_norm": 0.6696202754974365, + "learning_rate": 0.0009278151260504202, + "loss": 0.4379, + "step": 2683 + }, + { + "epoch": 1.499441340782123, + "grad_norm": 0.8101652264595032, + "learning_rate": 0.0009277871148459383, + "loss": 0.4769, + "step": 2684 + }, + { + "epoch": 1.5, + "grad_norm": 0.5355522632598877, + "learning_rate": 0.0009277591036414565, + "loss": 0.5438, + "step": 2685 + }, + { + "epoch": 1.500558659217877, + "grad_norm": 0.49236586689949036, + "learning_rate": 0.0009277310924369748, + "loss": 0.4865, + "step": 2686 + }, + { + "epoch": 1.5011173184357542, + "grad_norm": 0.7549045085906982, + "learning_rate": 0.000927703081232493, + "loss": 0.531, + "step": 2687 + }, + { + "epoch": 1.5016759776536313, + "grad_norm": 0.6821882128715515, + "learning_rate": 0.0009276750700280113, + "loss": 0.5781, + "step": 2688 + }, + { + "epoch": 1.5022346368715085, + "grad_norm": 0.6423541307449341, + "learning_rate": 0.0009276470588235294, + "loss": 0.4837, + "step": 2689 + }, + { + "epoch": 1.5027932960893855, + "grad_norm": 1.8678420782089233, + "learning_rate": 0.0009276190476190476, + "loss": 0.4704, + "step": 2690 + }, + { + "epoch": 1.5033519553072625, + "grad_norm": 0.5778162479400635, + "learning_rate": 0.0009275910364145659, + "loss": 0.4484, + "step": 2691 + }, + { + "epoch": 1.5039106145251395, + "grad_norm": 0.4615047574043274, + "learning_rate": 0.0009275630252100841, + "loss": 0.4259, + "step": 2692 + }, + { + "epoch": 1.5044692737430168, + "grad_norm": 0.6708247661590576, + "learning_rate": 0.0009275350140056023, + "loss": 0.6335, + "step": 2693 + }, + { + "epoch": 1.505027932960894, + "grad_norm": 0.6936284303665161, + "learning_rate": 0.0009275070028011204, + "loss": 0.4856, + "step": 2694 + }, + { + "epoch": 1.505586592178771, + "grad_norm": 0.707811176776886, + "learning_rate": 0.0009274789915966386, + "loss": 0.5278, + "step": 2695 + }, + { + "epoch": 1.506145251396648, + "grad_norm": 2.1636545658111572, + "learning_rate": 0.0009274509803921569, + "loss": 0.4717, + "step": 2696 + }, + { + "epoch": 1.506703910614525, + "grad_norm": 0.48805782198905945, + "learning_rate": 0.0009274229691876751, + "loss": 0.3648, + "step": 2697 + }, + { + "epoch": 1.5072625698324023, + "grad_norm": 0.5463467836380005, + "learning_rate": 0.0009273949579831933, + "loss": 0.4566, + "step": 2698 + }, + { + "epoch": 1.5078212290502795, + "grad_norm": 1.0485665798187256, + "learning_rate": 0.0009273669467787115, + "loss": 0.5163, + "step": 2699 + }, + { + "epoch": 1.5083798882681565, + "grad_norm": 0.8843816518783569, + "learning_rate": 0.0009273389355742296, + "loss": 0.6677, + "step": 2700 + }, + { + "epoch": 1.5089385474860335, + "grad_norm": 0.5736004710197449, + "learning_rate": 0.0009273109243697479, + "loss": 0.4338, + "step": 2701 + }, + { + "epoch": 1.5094972067039105, + "grad_norm": 4.1943678855896, + "learning_rate": 0.0009272829131652661, + "loss": 0.4167, + "step": 2702 + }, + { + "epoch": 1.5100558659217878, + "grad_norm": 0.8349507451057434, + "learning_rate": 0.0009272549019607843, + "loss": 0.4526, + "step": 2703 + }, + { + "epoch": 1.5106145251396648, + "grad_norm": 0.6282299757003784, + "learning_rate": 0.0009272268907563026, + "loss": 0.5266, + "step": 2704 + }, + { + "epoch": 1.511173184357542, + "grad_norm": 0.5767343640327454, + "learning_rate": 0.0009271988795518207, + "loss": 0.4249, + "step": 2705 + }, + { + "epoch": 1.511731843575419, + "grad_norm": 0.5181618332862854, + "learning_rate": 0.000927170868347339, + "loss": 0.4287, + "step": 2706 + }, + { + "epoch": 1.512290502793296, + "grad_norm": 0.5861397981643677, + "learning_rate": 0.0009271428571428572, + "loss": 0.4934, + "step": 2707 + }, + { + "epoch": 1.512849162011173, + "grad_norm": 0.7683354616165161, + "learning_rate": 0.0009271148459383754, + "loss": 0.4592, + "step": 2708 + }, + { + "epoch": 1.5134078212290503, + "grad_norm": 0.43219852447509766, + "learning_rate": 0.0009270868347338936, + "loss": 0.3708, + "step": 2709 + }, + { + "epoch": 1.5139664804469275, + "grad_norm": 1.0593012571334839, + "learning_rate": 0.0009270588235294117, + "loss": 0.563, + "step": 2710 + }, + { + "epoch": 1.5145251396648045, + "grad_norm": 0.5885233879089355, + "learning_rate": 0.00092703081232493, + "loss": 0.41, + "step": 2711 + }, + { + "epoch": 1.5150837988826815, + "grad_norm": 0.5605961084365845, + "learning_rate": 0.0009270028011204482, + "loss": 0.5442, + "step": 2712 + }, + { + "epoch": 1.5156424581005585, + "grad_norm": 1.160618543624878, + "learning_rate": 0.0009269747899159664, + "loss": 0.5942, + "step": 2713 + }, + { + "epoch": 1.5162011173184358, + "grad_norm": 0.9720863103866577, + "learning_rate": 0.0009269467787114846, + "loss": 0.458, + "step": 2714 + }, + { + "epoch": 1.516759776536313, + "grad_norm": 1.003719449043274, + "learning_rate": 0.0009269187675070028, + "loss": 0.5449, + "step": 2715 + }, + { + "epoch": 1.51731843575419, + "grad_norm": 0.5648466944694519, + "learning_rate": 0.000926890756302521, + "loss": 0.4219, + "step": 2716 + }, + { + "epoch": 1.517877094972067, + "grad_norm": 0.5501232743263245, + "learning_rate": 0.0009268627450980392, + "loss": 0.5251, + "step": 2717 + }, + { + "epoch": 1.518435754189944, + "grad_norm": 0.5324401259422302, + "learning_rate": 0.0009268347338935574, + "loss": 0.4915, + "step": 2718 + }, + { + "epoch": 1.5189944134078213, + "grad_norm": 0.5625090003013611, + "learning_rate": 0.0009268067226890756, + "loss": 0.5022, + "step": 2719 + }, + { + "epoch": 1.5195530726256983, + "grad_norm": 0.6464283466339111, + "learning_rate": 0.0009267787114845939, + "loss": 0.5249, + "step": 2720 + }, + { + "epoch": 1.5201117318435755, + "grad_norm": 0.7982908487319946, + "learning_rate": 0.0009267507002801121, + "loss": 0.5135, + "step": 2721 + }, + { + "epoch": 1.5206703910614525, + "grad_norm": 0.6112183928489685, + "learning_rate": 0.0009267226890756303, + "loss": 0.3826, + "step": 2722 + }, + { + "epoch": 1.5212290502793295, + "grad_norm": 0.6859140992164612, + "learning_rate": 0.0009266946778711485, + "loss": 0.4637, + "step": 2723 + }, + { + "epoch": 1.5217877094972065, + "grad_norm": 1.0874067544937134, + "learning_rate": 0.0009266666666666667, + "loss": 0.503, + "step": 2724 + }, + { + "epoch": 1.5223463687150838, + "grad_norm": 0.5052918195724487, + "learning_rate": 0.0009266386554621849, + "loss": 0.4556, + "step": 2725 + }, + { + "epoch": 1.522905027932961, + "grad_norm": 0.8066805005073547, + "learning_rate": 0.0009266106442577031, + "loss": 0.5166, + "step": 2726 + }, + { + "epoch": 1.523463687150838, + "grad_norm": 0.6549170613288879, + "learning_rate": 0.0009265826330532213, + "loss": 0.4763, + "step": 2727 + }, + { + "epoch": 1.524022346368715, + "grad_norm": 1.8753679990768433, + "learning_rate": 0.0009265546218487395, + "loss": 0.4528, + "step": 2728 + }, + { + "epoch": 1.524581005586592, + "grad_norm": 1.3903990983963013, + "learning_rate": 0.0009265266106442577, + "loss": 0.5485, + "step": 2729 + }, + { + "epoch": 1.5251396648044693, + "grad_norm": 1.8706400394439697, + "learning_rate": 0.0009264985994397759, + "loss": 0.5387, + "step": 2730 + }, + { + "epoch": 1.5256983240223465, + "grad_norm": 0.7359647154808044, + "learning_rate": 0.0009264705882352942, + "loss": 0.4494, + "step": 2731 + }, + { + "epoch": 1.5262569832402235, + "grad_norm": 0.452828049659729, + "learning_rate": 0.0009264425770308123, + "loss": 0.4034, + "step": 2732 + }, + { + "epoch": 1.5268156424581005, + "grad_norm": 0.49070021510124207, + "learning_rate": 0.0009264145658263305, + "loss": 0.4653, + "step": 2733 + }, + { + "epoch": 1.5273743016759775, + "grad_norm": 1.677897572517395, + "learning_rate": 0.0009263865546218487, + "loss": 0.3873, + "step": 2734 + }, + { + "epoch": 1.5279329608938548, + "grad_norm": 0.7672814130783081, + "learning_rate": 0.0009263585434173669, + "loss": 0.5337, + "step": 2735 + }, + { + "epoch": 1.528491620111732, + "grad_norm": 0.6242414116859436, + "learning_rate": 0.0009263305322128853, + "loss": 0.432, + "step": 2736 + }, + { + "epoch": 1.529050279329609, + "grad_norm": 0.5464147925376892, + "learning_rate": 0.0009263025210084034, + "loss": 0.598, + "step": 2737 + }, + { + "epoch": 1.529608938547486, + "grad_norm": 0.6822505593299866, + "learning_rate": 0.0009262745098039216, + "loss": 0.4842, + "step": 2738 + }, + { + "epoch": 1.530167597765363, + "grad_norm": 1.3498238325119019, + "learning_rate": 0.0009262464985994398, + "loss": 0.4758, + "step": 2739 + }, + { + "epoch": 1.5307262569832403, + "grad_norm": 0.6377862095832825, + "learning_rate": 0.000926218487394958, + "loss": 0.4904, + "step": 2740 + }, + { + "epoch": 1.5312849162011173, + "grad_norm": 1.275589942932129, + "learning_rate": 0.0009261904761904763, + "loss": 0.4656, + "step": 2741 + }, + { + "epoch": 1.5318435754189945, + "grad_norm": 0.5122226476669312, + "learning_rate": 0.0009261624649859944, + "loss": 0.5018, + "step": 2742 + }, + { + "epoch": 1.5324022346368715, + "grad_norm": 1.044105052947998, + "learning_rate": 0.0009261344537815126, + "loss": 0.6277, + "step": 2743 + }, + { + "epoch": 1.5329608938547485, + "grad_norm": 0.4903325140476227, + "learning_rate": 0.0009261064425770308, + "loss": 0.4832, + "step": 2744 + }, + { + "epoch": 1.5335195530726256, + "grad_norm": 1.3142452239990234, + "learning_rate": 0.000926078431372549, + "loss": 0.4616, + "step": 2745 + }, + { + "epoch": 1.5340782122905028, + "grad_norm": 0.7310460805892944, + "learning_rate": 0.0009260504201680673, + "loss": 0.5511, + "step": 2746 + }, + { + "epoch": 1.53463687150838, + "grad_norm": 1.4101277589797974, + "learning_rate": 0.0009260224089635855, + "loss": 0.5203, + "step": 2747 + }, + { + "epoch": 1.535195530726257, + "grad_norm": 1.755393147468567, + "learning_rate": 0.0009259943977591036, + "loss": 0.5651, + "step": 2748 + }, + { + "epoch": 1.535754189944134, + "grad_norm": 0.6259930729866028, + "learning_rate": 0.0009259663865546218, + "loss": 0.4588, + "step": 2749 + }, + { + "epoch": 1.536312849162011, + "grad_norm": 2.5083696842193604, + "learning_rate": 0.00092593837535014, + "loss": 0.5273, + "step": 2750 + }, + { + "epoch": 1.5368715083798883, + "grad_norm": 0.600470781326294, + "learning_rate": 0.0009259103641456583, + "loss": 0.5336, + "step": 2751 + }, + { + "epoch": 1.5374301675977655, + "grad_norm": 1.3746676445007324, + "learning_rate": 0.0009258823529411766, + "loss": 0.4209, + "step": 2752 + }, + { + "epoch": 1.5379888268156425, + "grad_norm": 0.4862399995326996, + "learning_rate": 0.0009258543417366946, + "loss": 0.4566, + "step": 2753 + }, + { + "epoch": 1.5385474860335195, + "grad_norm": 0.7803627252578735, + "learning_rate": 0.0009258263305322129, + "loss": 0.5474, + "step": 2754 + }, + { + "epoch": 1.5391061452513966, + "grad_norm": 1.5817126035690308, + "learning_rate": 0.0009257983193277311, + "loss": 0.4603, + "step": 2755 + }, + { + "epoch": 1.5396648044692738, + "grad_norm": 0.5320682525634766, + "learning_rate": 0.0009257703081232494, + "loss": 0.4772, + "step": 2756 + }, + { + "epoch": 1.5402234636871508, + "grad_norm": 1.7863763570785522, + "learning_rate": 0.0009257422969187676, + "loss": 0.5236, + "step": 2757 + }, + { + "epoch": 1.540782122905028, + "grad_norm": 0.7422907948493958, + "learning_rate": 0.0009257142857142857, + "loss": 0.6242, + "step": 2758 + }, + { + "epoch": 1.541340782122905, + "grad_norm": 0.6582601070404053, + "learning_rate": 0.0009256862745098039, + "loss": 0.5305, + "step": 2759 + }, + { + "epoch": 1.541899441340782, + "grad_norm": 0.5598275065422058, + "learning_rate": 0.0009256582633053221, + "loss": 0.5369, + "step": 2760 + }, + { + "epoch": 1.542458100558659, + "grad_norm": 0.5950160026550293, + "learning_rate": 0.0009256302521008404, + "loss": 0.5478, + "step": 2761 + }, + { + "epoch": 1.5430167597765363, + "grad_norm": 0.6061046123504639, + "learning_rate": 0.0009256022408963586, + "loss": 0.4774, + "step": 2762 + }, + { + "epoch": 1.5435754189944135, + "grad_norm": 4.923741340637207, + "learning_rate": 0.0009255742296918768, + "loss": 0.4244, + "step": 2763 + }, + { + "epoch": 1.5441340782122905, + "grad_norm": 0.6583361029624939, + "learning_rate": 0.0009255462184873949, + "loss": 0.4424, + "step": 2764 + }, + { + "epoch": 1.5446927374301676, + "grad_norm": 0.5482707023620605, + "learning_rate": 0.0009255182072829131, + "loss": 0.4597, + "step": 2765 + }, + { + "epoch": 1.5452513966480446, + "grad_norm": 0.5797296762466431, + "learning_rate": 0.0009254901960784314, + "loss": 0.5291, + "step": 2766 + }, + { + "epoch": 1.5458100558659218, + "grad_norm": 0.6409083604812622, + "learning_rate": 0.0009254621848739496, + "loss": 0.5089, + "step": 2767 + }, + { + "epoch": 1.546368715083799, + "grad_norm": 0.571533739566803, + "learning_rate": 0.0009254341736694678, + "loss": 0.4228, + "step": 2768 + }, + { + "epoch": 1.546927374301676, + "grad_norm": 1.3261182308197021, + "learning_rate": 0.0009254061624649859, + "loss": 0.6556, + "step": 2769 + }, + { + "epoch": 1.547486033519553, + "grad_norm": 1.483222246170044, + "learning_rate": 0.0009253781512605042, + "loss": 0.4688, + "step": 2770 + }, + { + "epoch": 1.54804469273743, + "grad_norm": 1.9051486253738403, + "learning_rate": 0.0009253501400560225, + "loss": 0.4937, + "step": 2771 + }, + { + "epoch": 1.5486033519553073, + "grad_norm": 0.6750649213790894, + "learning_rate": 0.0009253221288515407, + "loss": 0.4989, + "step": 2772 + }, + { + "epoch": 1.5491620111731843, + "grad_norm": 1.6300128698349, + "learning_rate": 0.0009252941176470589, + "loss": 0.4737, + "step": 2773 + }, + { + "epoch": 1.5497206703910615, + "grad_norm": 1.0556766986846924, + "learning_rate": 0.000925266106442577, + "loss": 0.6323, + "step": 2774 + }, + { + "epoch": 1.5502793296089385, + "grad_norm": 0.4854927957057953, + "learning_rate": 0.0009252380952380952, + "loss": 0.4267, + "step": 2775 + }, + { + "epoch": 1.5508379888268156, + "grad_norm": 0.9065334796905518, + "learning_rate": 0.0009252100840336135, + "loss": 0.5673, + "step": 2776 + }, + { + "epoch": 1.5513966480446926, + "grad_norm": 0.6424968838691711, + "learning_rate": 0.0009251820728291317, + "loss": 0.5238, + "step": 2777 + }, + { + "epoch": 1.5519553072625698, + "grad_norm": 0.8335216045379639, + "learning_rate": 0.0009251540616246499, + "loss": 0.5646, + "step": 2778 + }, + { + "epoch": 1.552513966480447, + "grad_norm": 0.49993088841438293, + "learning_rate": 0.0009251260504201681, + "loss": 0.4298, + "step": 2779 + }, + { + "epoch": 1.553072625698324, + "grad_norm": 0.9979957938194275, + "learning_rate": 0.0009250980392156862, + "loss": 0.4548, + "step": 2780 + }, + { + "epoch": 1.553631284916201, + "grad_norm": 0.6388674974441528, + "learning_rate": 0.0009250700280112045, + "loss": 0.4553, + "step": 2781 + }, + { + "epoch": 1.554189944134078, + "grad_norm": 1.2615723609924316, + "learning_rate": 0.0009250420168067227, + "loss": 0.4694, + "step": 2782 + }, + { + "epoch": 1.5547486033519553, + "grad_norm": 0.5050574541091919, + "learning_rate": 0.0009250140056022409, + "loss": 0.4345, + "step": 2783 + }, + { + "epoch": 1.5553072625698325, + "grad_norm": 1.7929205894470215, + "learning_rate": 0.0009249859943977591, + "loss": 0.4348, + "step": 2784 + }, + { + "epoch": 1.5558659217877095, + "grad_norm": 0.5956754088401794, + "learning_rate": 0.0009249579831932772, + "loss": 0.471, + "step": 2785 + }, + { + "epoch": 1.5564245810055866, + "grad_norm": 0.7108403444290161, + "learning_rate": 0.0009249299719887956, + "loss": 0.4583, + "step": 2786 + }, + { + "epoch": 1.5569832402234636, + "grad_norm": 0.5493324398994446, + "learning_rate": 0.0009249019607843138, + "loss": 0.5494, + "step": 2787 + }, + { + "epoch": 1.5575418994413408, + "grad_norm": 0.6735252141952515, + "learning_rate": 0.000924873949579832, + "loss": 0.5228, + "step": 2788 + }, + { + "epoch": 1.558100558659218, + "grad_norm": 0.6224062442779541, + "learning_rate": 0.0009248459383753502, + "loss": 0.5515, + "step": 2789 + }, + { + "epoch": 1.558659217877095, + "grad_norm": 0.7120926380157471, + "learning_rate": 0.0009248179271708683, + "loss": 0.5912, + "step": 2790 + }, + { + "epoch": 1.559217877094972, + "grad_norm": 0.7107549905776978, + "learning_rate": 0.0009247899159663866, + "loss": 0.447, + "step": 2791 + }, + { + "epoch": 1.559776536312849, + "grad_norm": 0.7261028289794922, + "learning_rate": 0.0009247619047619048, + "loss": 0.7375, + "step": 2792 + }, + { + "epoch": 1.5603351955307263, + "grad_norm": 0.6139434576034546, + "learning_rate": 0.000924733893557423, + "loss": 0.4755, + "step": 2793 + }, + { + "epoch": 1.5608938547486033, + "grad_norm": 18.866910934448242, + "learning_rate": 0.0009247058823529412, + "loss": 0.6126, + "step": 2794 + }, + { + "epoch": 1.5614525139664805, + "grad_norm": 0.5344040989875793, + "learning_rate": 0.0009246778711484594, + "loss": 0.5019, + "step": 2795 + }, + { + "epoch": 1.5620111731843576, + "grad_norm": 0.5899941325187683, + "learning_rate": 0.0009246498599439776, + "loss": 0.5483, + "step": 2796 + }, + { + "epoch": 1.5625698324022346, + "grad_norm": 0.5017511248588562, + "learning_rate": 0.0009246218487394958, + "loss": 0.5214, + "step": 2797 + }, + { + "epoch": 1.5631284916201116, + "grad_norm": 0.8440216183662415, + "learning_rate": 0.000924593837535014, + "loss": 0.4348, + "step": 2798 + }, + { + "epoch": 1.5636871508379888, + "grad_norm": 0.5344979763031006, + "learning_rate": 0.0009245658263305322, + "loss": 0.5582, + "step": 2799 + }, + { + "epoch": 1.564245810055866, + "grad_norm": 0.6181933283805847, + "learning_rate": 0.0009245378151260504, + "loss": 0.446, + "step": 2800 + }, + { + "epoch": 1.564804469273743, + "grad_norm": 0.5276536345481873, + "learning_rate": 0.0009245098039215686, + "loss": 0.3863, + "step": 2801 + }, + { + "epoch": 1.56536312849162, + "grad_norm": 0.4638065993785858, + "learning_rate": 0.0009244817927170869, + "loss": 0.4631, + "step": 2802 + }, + { + "epoch": 1.565921787709497, + "grad_norm": 0.4662304222583771, + "learning_rate": 0.0009244537815126051, + "loss": 0.4591, + "step": 2803 + }, + { + "epoch": 1.5664804469273743, + "grad_norm": 0.7063485980033875, + "learning_rate": 0.0009244257703081233, + "loss": 0.4275, + "step": 2804 + }, + { + "epoch": 1.5670391061452515, + "grad_norm": 0.8666127324104309, + "learning_rate": 0.0009243977591036415, + "loss": 0.6861, + "step": 2805 + }, + { + "epoch": 1.5675977653631286, + "grad_norm": 4.635715961456299, + "learning_rate": 0.0009243697478991597, + "loss": 0.5489, + "step": 2806 + }, + { + "epoch": 1.5681564245810056, + "grad_norm": 0.9933450222015381, + "learning_rate": 0.0009243417366946779, + "loss": 0.5432, + "step": 2807 + }, + { + "epoch": 1.5687150837988826, + "grad_norm": 0.6908605694770813, + "learning_rate": 0.0009243137254901961, + "loss": 0.4025, + "step": 2808 + }, + { + "epoch": 1.5692737430167598, + "grad_norm": 0.7941877245903015, + "learning_rate": 0.0009242857142857143, + "loss": 0.5637, + "step": 2809 + }, + { + "epoch": 1.5698324022346368, + "grad_norm": 0.4864341914653778, + "learning_rate": 0.0009242577030812325, + "loss": 0.4766, + "step": 2810 + }, + { + "epoch": 1.570391061452514, + "grad_norm": 0.5854458212852478, + "learning_rate": 0.0009242296918767508, + "loss": 0.4063, + "step": 2811 + }, + { + "epoch": 1.570949720670391, + "grad_norm": 0.5189414620399475, + "learning_rate": 0.0009242016806722689, + "loss": 0.4168, + "step": 2812 + }, + { + "epoch": 1.571508379888268, + "grad_norm": 0.6213449239730835, + "learning_rate": 0.0009241736694677871, + "loss": 0.4899, + "step": 2813 + }, + { + "epoch": 1.572067039106145, + "grad_norm": 2.9355480670928955, + "learning_rate": 0.0009241456582633053, + "loss": 0.5294, + "step": 2814 + }, + { + "epoch": 1.5726256983240223, + "grad_norm": 0.7673971056938171, + "learning_rate": 0.0009241176470588235, + "loss": 0.5934, + "step": 2815 + }, + { + "epoch": 1.5731843575418996, + "grad_norm": 0.639930248260498, + "learning_rate": 0.0009240896358543418, + "loss": 0.6002, + "step": 2816 + }, + { + "epoch": 1.5737430167597766, + "grad_norm": 0.5959346294403076, + "learning_rate": 0.0009240616246498599, + "loss": 0.4396, + "step": 2817 + }, + { + "epoch": 1.5743016759776536, + "grad_norm": 3.7702465057373047, + "learning_rate": 0.0009240336134453781, + "loss": 0.4159, + "step": 2818 + }, + { + "epoch": 1.5748603351955306, + "grad_norm": 2.404097318649292, + "learning_rate": 0.0009240056022408964, + "loss": 0.4869, + "step": 2819 + }, + { + "epoch": 1.5754189944134078, + "grad_norm": 0.9831011891365051, + "learning_rate": 0.0009239775910364146, + "loss": 0.6152, + "step": 2820 + }, + { + "epoch": 1.575977653631285, + "grad_norm": 0.5874549746513367, + "learning_rate": 0.0009239495798319329, + "loss": 0.5179, + "step": 2821 + }, + { + "epoch": 1.576536312849162, + "grad_norm": 0.811124324798584, + "learning_rate": 0.000923921568627451, + "loss": 0.6292, + "step": 2822 + }, + { + "epoch": 1.577094972067039, + "grad_norm": 36.496070861816406, + "learning_rate": 0.0009238935574229692, + "loss": 0.4898, + "step": 2823 + }, + { + "epoch": 1.577653631284916, + "grad_norm": 0.915553867816925, + "learning_rate": 0.0009238655462184874, + "loss": 0.5476, + "step": 2824 + }, + { + "epoch": 1.5782122905027933, + "grad_norm": 1.1290582418441772, + "learning_rate": 0.0009238375350140056, + "loss": 0.5451, + "step": 2825 + }, + { + "epoch": 1.5787709497206703, + "grad_norm": 2.8792660236358643, + "learning_rate": 0.0009238095238095239, + "loss": 0.4526, + "step": 2826 + }, + { + "epoch": 1.5793296089385476, + "grad_norm": 0.5938659906387329, + "learning_rate": 0.0009237815126050421, + "loss": 0.4981, + "step": 2827 + }, + { + "epoch": 1.5798882681564246, + "grad_norm": 0.6090506911277771, + "learning_rate": 0.0009237535014005602, + "loss": 0.4586, + "step": 2828 + }, + { + "epoch": 1.5804469273743016, + "grad_norm": 2.3405656814575195, + "learning_rate": 0.0009237254901960784, + "loss": 0.566, + "step": 2829 + }, + { + "epoch": 1.5810055865921788, + "grad_norm": 0.6403980255126953, + "learning_rate": 0.0009236974789915966, + "loss": 0.425, + "step": 2830 + }, + { + "epoch": 1.5815642458100558, + "grad_norm": 0.9365788698196411, + "learning_rate": 0.0009236694677871149, + "loss": 0.4745, + "step": 2831 + }, + { + "epoch": 1.582122905027933, + "grad_norm": 0.6935022473335266, + "learning_rate": 0.0009236414565826331, + "loss": 0.5897, + "step": 2832 + }, + { + "epoch": 1.58268156424581, + "grad_norm": 0.483063668012619, + "learning_rate": 0.0009236134453781512, + "loss": 0.4582, + "step": 2833 + }, + { + "epoch": 1.583240223463687, + "grad_norm": 0.5326879620552063, + "learning_rate": 0.0009235854341736694, + "loss": 0.4834, + "step": 2834 + }, + { + "epoch": 1.583798882681564, + "grad_norm": 1.6950187683105469, + "learning_rate": 0.0009235574229691876, + "loss": 0.5464, + "step": 2835 + }, + { + "epoch": 1.5843575418994413, + "grad_norm": 4.1505231857299805, + "learning_rate": 0.000923529411764706, + "loss": 0.4663, + "step": 2836 + }, + { + "epoch": 1.5849162011173186, + "grad_norm": 0.7821114659309387, + "learning_rate": 0.0009235014005602242, + "loss": 0.4632, + "step": 2837 + }, + { + "epoch": 1.5854748603351956, + "grad_norm": 0.6650540232658386, + "learning_rate": 0.0009234733893557423, + "loss": 0.558, + "step": 2838 + }, + { + "epoch": 1.5860335195530726, + "grad_norm": 0.8296647667884827, + "learning_rate": 0.0009234453781512605, + "loss": 0.5619, + "step": 2839 + }, + { + "epoch": 1.5865921787709496, + "grad_norm": 1.3736047744750977, + "learning_rate": 0.0009234173669467787, + "loss": 0.6286, + "step": 2840 + }, + { + "epoch": 1.5871508379888268, + "grad_norm": 0.9090030193328857, + "learning_rate": 0.000923389355742297, + "loss": 0.4552, + "step": 2841 + }, + { + "epoch": 1.587709497206704, + "grad_norm": 2.0754737854003906, + "learning_rate": 0.0009233613445378152, + "loss": 0.5756, + "step": 2842 + }, + { + "epoch": 1.588268156424581, + "grad_norm": 2.355257272720337, + "learning_rate": 0.0009233333333333334, + "loss": 0.5973, + "step": 2843 + }, + { + "epoch": 1.588826815642458, + "grad_norm": 0.5823377370834351, + "learning_rate": 0.0009233053221288515, + "loss": 0.4513, + "step": 2844 + }, + { + "epoch": 1.589385474860335, + "grad_norm": 0.4742606580257416, + "learning_rate": 0.0009232773109243697, + "loss": 0.5143, + "step": 2845 + }, + { + "epoch": 1.5899441340782123, + "grad_norm": 0.868121325969696, + "learning_rate": 0.000923249299719888, + "loss": 0.4596, + "step": 2846 + }, + { + "epoch": 1.5905027932960893, + "grad_norm": 0.8557822108268738, + "learning_rate": 0.0009232212885154062, + "loss": 0.5136, + "step": 2847 + }, + { + "epoch": 1.5910614525139666, + "grad_norm": 0.8515350818634033, + "learning_rate": 0.0009231932773109244, + "loss": 0.5733, + "step": 2848 + }, + { + "epoch": 1.5916201117318436, + "grad_norm": 0.8426374793052673, + "learning_rate": 0.0009231652661064425, + "loss": 0.6004, + "step": 2849 + }, + { + "epoch": 1.5921787709497206, + "grad_norm": 0.5116023421287537, + "learning_rate": 0.0009231372549019607, + "loss": 0.4745, + "step": 2850 + }, + { + "epoch": 1.5927374301675976, + "grad_norm": 0.7062660455703735, + "learning_rate": 0.000923109243697479, + "loss": 0.5869, + "step": 2851 + }, + { + "epoch": 1.5932960893854748, + "grad_norm": 0.5434456467628479, + "learning_rate": 0.0009230812324929973, + "loss": 0.4573, + "step": 2852 + }, + { + "epoch": 1.593854748603352, + "grad_norm": 0.7755725383758545, + "learning_rate": 0.0009230532212885155, + "loss": 0.4905, + "step": 2853 + }, + { + "epoch": 1.594413407821229, + "grad_norm": 0.7681111693382263, + "learning_rate": 0.0009230252100840336, + "loss": 0.534, + "step": 2854 + }, + { + "epoch": 1.594972067039106, + "grad_norm": 0.5175493955612183, + "learning_rate": 0.0009229971988795518, + "loss": 0.4965, + "step": 2855 + }, + { + "epoch": 1.5955307262569831, + "grad_norm": 0.5323821902275085, + "learning_rate": 0.0009229691876750701, + "loss": 0.4737, + "step": 2856 + }, + { + "epoch": 1.5960893854748603, + "grad_norm": 4.029867172241211, + "learning_rate": 0.0009229411764705883, + "loss": 0.4348, + "step": 2857 + }, + { + "epoch": 1.5966480446927376, + "grad_norm": 4.8672590255737305, + "learning_rate": 0.0009229131652661065, + "loss": 0.6158, + "step": 2858 + }, + { + "epoch": 1.5972067039106146, + "grad_norm": 1.7575292587280273, + "learning_rate": 0.0009228851540616247, + "loss": 0.3117, + "step": 2859 + }, + { + "epoch": 1.5977653631284916, + "grad_norm": 0.5673686265945435, + "learning_rate": 0.0009228571428571428, + "loss": 0.5704, + "step": 2860 + }, + { + "epoch": 1.5983240223463686, + "grad_norm": 0.6373631954193115, + "learning_rate": 0.0009228291316526611, + "loss": 0.4548, + "step": 2861 + }, + { + "epoch": 1.5988826815642458, + "grad_norm": 0.8258795142173767, + "learning_rate": 0.0009228011204481793, + "loss": 0.6333, + "step": 2862 + }, + { + "epoch": 1.5994413407821229, + "grad_norm": 0.6410709619522095, + "learning_rate": 0.0009227731092436975, + "loss": 0.4179, + "step": 2863 + }, + { + "epoch": 1.6, + "grad_norm": 0.688508152961731, + "learning_rate": 0.0009227450980392157, + "loss": 0.3971, + "step": 2864 + }, + { + "epoch": 1.600558659217877, + "grad_norm": 4.202364444732666, + "learning_rate": 0.0009227170868347338, + "loss": 0.4572, + "step": 2865 + }, + { + "epoch": 1.6011173184357541, + "grad_norm": 0.7221179008483887, + "learning_rate": 0.0009226890756302521, + "loss": 0.4466, + "step": 2866 + }, + { + "epoch": 1.6016759776536311, + "grad_norm": 2.5795836448669434, + "learning_rate": 0.0009226610644257703, + "loss": 0.5522, + "step": 2867 + }, + { + "epoch": 1.6022346368715084, + "grad_norm": 0.6176924705505371, + "learning_rate": 0.0009226330532212886, + "loss": 0.526, + "step": 2868 + }, + { + "epoch": 1.6027932960893856, + "grad_norm": 0.4730075001716614, + "learning_rate": 0.0009226050420168068, + "loss": 0.4054, + "step": 2869 + }, + { + "epoch": 1.6033519553072626, + "grad_norm": 0.5490831732749939, + "learning_rate": 0.0009225770308123249, + "loss": 0.5697, + "step": 2870 + }, + { + "epoch": 1.6039106145251396, + "grad_norm": 0.5738762021064758, + "learning_rate": 0.0009225490196078432, + "loss": 0.4648, + "step": 2871 + }, + { + "epoch": 1.6044692737430166, + "grad_norm": 0.6908128261566162, + "learning_rate": 0.0009225210084033614, + "loss": 0.3872, + "step": 2872 + }, + { + "epoch": 1.6050279329608939, + "grad_norm": 0.6184394955635071, + "learning_rate": 0.0009224929971988796, + "loss": 0.3635, + "step": 2873 + }, + { + "epoch": 1.605586592178771, + "grad_norm": 0.6684905290603638, + "learning_rate": 0.0009224649859943978, + "loss": 0.5747, + "step": 2874 + }, + { + "epoch": 1.606145251396648, + "grad_norm": 0.6855420470237732, + "learning_rate": 0.000922436974789916, + "loss": 0.3914, + "step": 2875 + }, + { + "epoch": 1.606703910614525, + "grad_norm": 0.7845286130905151, + "learning_rate": 0.0009224089635854342, + "loss": 0.4643, + "step": 2876 + }, + { + "epoch": 1.6072625698324021, + "grad_norm": 0.7778080701828003, + "learning_rate": 0.0009223809523809524, + "loss": 0.508, + "step": 2877 + }, + { + "epoch": 1.6078212290502794, + "grad_norm": 0.5270350575447083, + "learning_rate": 0.0009223529411764706, + "loss": 0.5029, + "step": 2878 + }, + { + "epoch": 1.6083798882681566, + "grad_norm": 0.5413338541984558, + "learning_rate": 0.0009223249299719888, + "loss": 0.5288, + "step": 2879 + }, + { + "epoch": 1.6089385474860336, + "grad_norm": 0.4262201488018036, + "learning_rate": 0.000922296918767507, + "loss": 0.3522, + "step": 2880 + }, + { + "epoch": 1.6094972067039106, + "grad_norm": 1.0702208280563354, + "learning_rate": 0.0009222689075630252, + "loss": 0.463, + "step": 2881 + }, + { + "epoch": 1.6100558659217876, + "grad_norm": 0.7859681248664856, + "learning_rate": 0.0009222408963585434, + "loss": 0.523, + "step": 2882 + }, + { + "epoch": 1.6106145251396649, + "grad_norm": 0.5103206038475037, + "learning_rate": 0.0009222128851540616, + "loss": 0.3756, + "step": 2883 + }, + { + "epoch": 1.6111731843575419, + "grad_norm": 0.5550638437271118, + "learning_rate": 0.0009221848739495799, + "loss": 0.4632, + "step": 2884 + }, + { + "epoch": 1.611731843575419, + "grad_norm": 0.6195359826087952, + "learning_rate": 0.0009221568627450981, + "loss": 0.508, + "step": 2885 + }, + { + "epoch": 1.612290502793296, + "grad_norm": 0.6476532220840454, + "learning_rate": 0.0009221288515406164, + "loss": 0.5528, + "step": 2886 + }, + { + "epoch": 1.6128491620111731, + "grad_norm": 0.8326266407966614, + "learning_rate": 0.0009221008403361345, + "loss": 0.5712, + "step": 2887 + }, + { + "epoch": 1.6134078212290501, + "grad_norm": 0.4361787736415863, + "learning_rate": 0.0009220728291316527, + "loss": 0.4299, + "step": 2888 + }, + { + "epoch": 1.6139664804469274, + "grad_norm": 0.841769278049469, + "learning_rate": 0.0009220448179271709, + "loss": 0.8441, + "step": 2889 + }, + { + "epoch": 1.6145251396648046, + "grad_norm": 0.6139340996742249, + "learning_rate": 0.0009220168067226891, + "loss": 0.4644, + "step": 2890 + }, + { + "epoch": 1.6150837988826816, + "grad_norm": 0.8081483244895935, + "learning_rate": 0.0009219887955182074, + "loss": 0.6304, + "step": 2891 + }, + { + "epoch": 1.6156424581005586, + "grad_norm": 1.714690923690796, + "learning_rate": 0.0009219607843137255, + "loss": 0.557, + "step": 2892 + }, + { + "epoch": 1.6162011173184356, + "grad_norm": 1.9895784854888916, + "learning_rate": 0.0009219327731092437, + "loss": 0.4478, + "step": 2893 + }, + { + "epoch": 1.6167597765363129, + "grad_norm": 0.5913851261138916, + "learning_rate": 0.0009219047619047619, + "loss": 0.4804, + "step": 2894 + }, + { + "epoch": 1.61731843575419, + "grad_norm": 0.5215538740158081, + "learning_rate": 0.0009218767507002801, + "loss": 0.4256, + "step": 2895 + }, + { + "epoch": 1.617877094972067, + "grad_norm": 0.5980809330940247, + "learning_rate": 0.0009218487394957983, + "loss": 0.5376, + "step": 2896 + }, + { + "epoch": 1.6184357541899441, + "grad_norm": 0.5189204216003418, + "learning_rate": 0.0009218207282913165, + "loss": 0.4814, + "step": 2897 + }, + { + "epoch": 1.6189944134078211, + "grad_norm": 0.6995025277137756, + "learning_rate": 0.0009217927170868347, + "loss": 0.469, + "step": 2898 + }, + { + "epoch": 1.6195530726256984, + "grad_norm": 0.6404252052307129, + "learning_rate": 0.0009217647058823529, + "loss": 0.5438, + "step": 2899 + }, + { + "epoch": 1.6201117318435754, + "grad_norm": 0.5329297184944153, + "learning_rate": 0.0009217366946778711, + "loss": 0.4656, + "step": 2900 + }, + { + "epoch": 1.6206703910614526, + "grad_norm": 0.752088725566864, + "learning_rate": 0.0009217086834733894, + "loss": 0.4993, + "step": 2901 + }, + { + "epoch": 1.6212290502793296, + "grad_norm": 1.3972595930099487, + "learning_rate": 0.0009216806722689077, + "loss": 0.4056, + "step": 2902 + }, + { + "epoch": 1.6217877094972066, + "grad_norm": 1.312340497970581, + "learning_rate": 0.0009216526610644258, + "loss": 0.6291, + "step": 2903 + }, + { + "epoch": 1.6223463687150836, + "grad_norm": 0.6492317318916321, + "learning_rate": 0.000921624649859944, + "loss": 0.5123, + "step": 2904 + }, + { + "epoch": 1.6229050279329609, + "grad_norm": 0.7971457839012146, + "learning_rate": 0.0009215966386554622, + "loss": 0.5297, + "step": 2905 + }, + { + "epoch": 1.623463687150838, + "grad_norm": 1.1285161972045898, + "learning_rate": 0.0009215686274509804, + "loss": 0.5025, + "step": 2906 + }, + { + "epoch": 1.6240223463687151, + "grad_norm": 0.759039580821991, + "learning_rate": 0.0009215406162464987, + "loss": 0.4019, + "step": 2907 + }, + { + "epoch": 1.6245810055865921, + "grad_norm": 0.7061336636543274, + "learning_rate": 0.0009215126050420168, + "loss": 0.4109, + "step": 2908 + }, + { + "epoch": 1.6251396648044691, + "grad_norm": 0.6587031483650208, + "learning_rate": 0.000921484593837535, + "loss": 0.4681, + "step": 2909 + }, + { + "epoch": 1.6256983240223464, + "grad_norm": 0.7848811149597168, + "learning_rate": 0.0009214565826330532, + "loss": 0.5185, + "step": 2910 + }, + { + "epoch": 1.6262569832402236, + "grad_norm": 2.2897210121154785, + "learning_rate": 0.0009214285714285714, + "loss": 0.5306, + "step": 2911 + }, + { + "epoch": 1.6268156424581006, + "grad_norm": 0.8609905242919922, + "learning_rate": 0.0009214005602240897, + "loss": 0.3421, + "step": 2912 + }, + { + "epoch": 1.6273743016759776, + "grad_norm": 0.7585361003875732, + "learning_rate": 0.0009213725490196078, + "loss": 0.3992, + "step": 2913 + }, + { + "epoch": 1.6279329608938546, + "grad_norm": 0.6667531132698059, + "learning_rate": 0.000921344537815126, + "loss": 0.4215, + "step": 2914 + }, + { + "epoch": 1.6284916201117319, + "grad_norm": 0.5932943224906921, + "learning_rate": 0.0009213165266106442, + "loss": 0.375, + "step": 2915 + }, + { + "epoch": 1.6290502793296089, + "grad_norm": 0.651016116142273, + "learning_rate": 0.0009212885154061624, + "loss": 0.5173, + "step": 2916 + }, + { + "epoch": 1.6296089385474861, + "grad_norm": 0.7457938194274902, + "learning_rate": 0.0009212605042016808, + "loss": 0.7692, + "step": 2917 + }, + { + "epoch": 1.6301675977653631, + "grad_norm": 0.6583865284919739, + "learning_rate": 0.000921232492997199, + "loss": 0.7213, + "step": 2918 + }, + { + "epoch": 1.6307262569832401, + "grad_norm": 0.902870774269104, + "learning_rate": 0.0009212044817927171, + "loss": 0.4226, + "step": 2919 + }, + { + "epoch": 1.6312849162011172, + "grad_norm": 0.6329991817474365, + "learning_rate": 0.0009211764705882353, + "loss": 0.379, + "step": 2920 + }, + { + "epoch": 1.6318435754189944, + "grad_norm": 2.070338487625122, + "learning_rate": 0.0009211484593837535, + "loss": 0.4733, + "step": 2921 + }, + { + "epoch": 1.6324022346368716, + "grad_norm": 0.6288262009620667, + "learning_rate": 0.0009211204481792718, + "loss": 0.4602, + "step": 2922 + }, + { + "epoch": 1.6329608938547486, + "grad_norm": 0.7328474521636963, + "learning_rate": 0.00092109243697479, + "loss": 0.492, + "step": 2923 + }, + { + "epoch": 1.6335195530726256, + "grad_norm": 1.1070021390914917, + "learning_rate": 0.0009210644257703081, + "loss": 0.4978, + "step": 2924 + }, + { + "epoch": 1.6340782122905027, + "grad_norm": 0.955201268196106, + "learning_rate": 0.0009210364145658263, + "loss": 0.4917, + "step": 2925 + }, + { + "epoch": 1.6346368715083799, + "grad_norm": 0.4593413174152374, + "learning_rate": 0.0009210084033613445, + "loss": 0.3933, + "step": 2926 + }, + { + "epoch": 1.6351955307262571, + "grad_norm": 0.5655298829078674, + "learning_rate": 0.0009209803921568628, + "loss": 0.4398, + "step": 2927 + }, + { + "epoch": 1.6357541899441341, + "grad_norm": 2.355804920196533, + "learning_rate": 0.000920952380952381, + "loss": 0.4983, + "step": 2928 + }, + { + "epoch": 1.6363128491620111, + "grad_norm": 0.8567566275596619, + "learning_rate": 0.0009209243697478991, + "loss": 0.4985, + "step": 2929 + }, + { + "epoch": 1.6368715083798882, + "grad_norm": 0.6869891881942749, + "learning_rate": 0.0009208963585434173, + "loss": 0.4451, + "step": 2930 + }, + { + "epoch": 1.6374301675977654, + "grad_norm": 0.6745887398719788, + "learning_rate": 0.0009208683473389355, + "loss": 0.4912, + "step": 2931 + }, + { + "epoch": 1.6379888268156426, + "grad_norm": 0.982644259929657, + "learning_rate": 0.0009208403361344538, + "loss": 0.456, + "step": 2932 + }, + { + "epoch": 1.6385474860335196, + "grad_norm": 0.4889505207538605, + "learning_rate": 0.000920812324929972, + "loss": 0.4856, + "step": 2933 + }, + { + "epoch": 1.6391061452513966, + "grad_norm": 0.6137354969978333, + "learning_rate": 0.0009207843137254903, + "loss": 0.5827, + "step": 2934 + }, + { + "epoch": 1.6396648044692737, + "grad_norm": 0.7956241369247437, + "learning_rate": 0.0009207563025210084, + "loss": 0.4139, + "step": 2935 + }, + { + "epoch": 1.6402234636871509, + "grad_norm": 0.8453567624092102, + "learning_rate": 0.0009207282913165266, + "loss": 0.5177, + "step": 2936 + }, + { + "epoch": 1.640782122905028, + "grad_norm": 0.6705008745193481, + "learning_rate": 0.0009207002801120449, + "loss": 0.6625, + "step": 2937 + }, + { + "epoch": 1.6413407821229051, + "grad_norm": 0.6575820446014404, + "learning_rate": 0.0009206722689075631, + "loss": 0.5216, + "step": 2938 + }, + { + "epoch": 1.6418994413407821, + "grad_norm": 0.5475789308547974, + "learning_rate": 0.0009206442577030813, + "loss": 0.431, + "step": 2939 + }, + { + "epoch": 1.6424581005586592, + "grad_norm": 0.4481825530529022, + "learning_rate": 0.0009206162464985994, + "loss": 0.3985, + "step": 2940 + }, + { + "epoch": 1.6430167597765362, + "grad_norm": 0.4677400588989258, + "learning_rate": 0.0009205882352941176, + "loss": 0.4608, + "step": 2941 + }, + { + "epoch": 1.6435754189944134, + "grad_norm": 0.5366738438606262, + "learning_rate": 0.0009205602240896359, + "loss": 0.5212, + "step": 2942 + }, + { + "epoch": 1.6441340782122906, + "grad_norm": 1.7023563385009766, + "learning_rate": 0.0009205322128851541, + "loss": 0.4121, + "step": 2943 + }, + { + "epoch": 1.6446927374301676, + "grad_norm": 0.5010234713554382, + "learning_rate": 0.0009205042016806723, + "loss": 0.4885, + "step": 2944 + }, + { + "epoch": 1.6452513966480447, + "grad_norm": 0.538973867893219, + "learning_rate": 0.0009204761904761904, + "loss": 0.5003, + "step": 2945 + }, + { + "epoch": 1.6458100558659217, + "grad_norm": 0.5855522751808167, + "learning_rate": 0.0009204481792717086, + "loss": 0.5985, + "step": 2946 + }, + { + "epoch": 1.646368715083799, + "grad_norm": 0.7652376890182495, + "learning_rate": 0.0009204201680672269, + "loss": 0.4511, + "step": 2947 + }, + { + "epoch": 1.6469273743016761, + "grad_norm": 1.4848134517669678, + "learning_rate": 0.0009203921568627451, + "loss": 0.4837, + "step": 2948 + }, + { + "epoch": 1.6474860335195531, + "grad_norm": 0.8052696585655212, + "learning_rate": 0.0009203641456582633, + "loss": 0.4961, + "step": 2949 + }, + { + "epoch": 1.6480446927374302, + "grad_norm": 0.6934226751327515, + "learning_rate": 0.0009203361344537816, + "loss": 0.4663, + "step": 2950 + }, + { + "epoch": 1.6486033519553072, + "grad_norm": 1.0082859992980957, + "learning_rate": 0.0009203081232492997, + "loss": 0.6173, + "step": 2951 + }, + { + "epoch": 1.6491620111731844, + "grad_norm": 0.584140419960022, + "learning_rate": 0.000920280112044818, + "loss": 0.5377, + "step": 2952 + }, + { + "epoch": 1.6497206703910614, + "grad_norm": 3.865908622741699, + "learning_rate": 0.0009202521008403362, + "loss": 0.4542, + "step": 2953 + }, + { + "epoch": 1.6502793296089386, + "grad_norm": 0.9439364075660706, + "learning_rate": 0.0009202240896358544, + "loss": 0.5125, + "step": 2954 + }, + { + "epoch": 1.6508379888268156, + "grad_norm": 0.5345238447189331, + "learning_rate": 0.0009201960784313726, + "loss": 0.5079, + "step": 2955 + }, + { + "epoch": 1.6513966480446927, + "grad_norm": 1.526811957359314, + "learning_rate": 0.0009201680672268907, + "loss": 0.4813, + "step": 2956 + }, + { + "epoch": 1.6519553072625697, + "grad_norm": 0.563715398311615, + "learning_rate": 0.000920140056022409, + "loss": 0.5363, + "step": 2957 + }, + { + "epoch": 1.652513966480447, + "grad_norm": 0.7194618582725525, + "learning_rate": 0.0009201120448179272, + "loss": 0.5975, + "step": 2958 + }, + { + "epoch": 1.6530726256983241, + "grad_norm": 0.6104727983474731, + "learning_rate": 0.0009200840336134454, + "loss": 0.4158, + "step": 2959 + }, + { + "epoch": 1.6536312849162011, + "grad_norm": 0.5742731094360352, + "learning_rate": 0.0009200560224089636, + "loss": 0.5462, + "step": 2960 + }, + { + "epoch": 1.6541899441340782, + "grad_norm": 0.6003073453903198, + "learning_rate": 0.0009200280112044817, + "loss": 0.4897, + "step": 2961 + }, + { + "epoch": 1.6547486033519552, + "grad_norm": 0.7050849795341492, + "learning_rate": 0.00092, + "loss": 0.48, + "step": 2962 + }, + { + "epoch": 1.6553072625698324, + "grad_norm": 0.6076145172119141, + "learning_rate": 0.0009199719887955182, + "loss": 0.3207, + "step": 2963 + }, + { + "epoch": 1.6558659217877096, + "grad_norm": 0.9574232697486877, + "learning_rate": 0.0009199439775910364, + "loss": 0.6019, + "step": 2964 + }, + { + "epoch": 1.6564245810055866, + "grad_norm": 0.661709189414978, + "learning_rate": 0.0009199159663865546, + "loss": 0.4523, + "step": 2965 + }, + { + "epoch": 1.6569832402234637, + "grad_norm": 0.5521332621574402, + "learning_rate": 0.0009198879551820729, + "loss": 0.5107, + "step": 2966 + }, + { + "epoch": 1.6575418994413407, + "grad_norm": 2.8413784503936768, + "learning_rate": 0.0009198599439775911, + "loss": 0.615, + "step": 2967 + }, + { + "epoch": 1.658100558659218, + "grad_norm": 0.6800103187561035, + "learning_rate": 0.0009198319327731093, + "loss": 0.5227, + "step": 2968 + }, + { + "epoch": 1.6586592178770951, + "grad_norm": 0.5890018939971924, + "learning_rate": 0.0009198039215686275, + "loss": 0.4577, + "step": 2969 + }, + { + "epoch": 1.6592178770949721, + "grad_norm": 0.6618414521217346, + "learning_rate": 0.0009197759103641457, + "loss": 0.4643, + "step": 2970 + }, + { + "epoch": 1.6597765363128492, + "grad_norm": 0.8312742114067078, + "learning_rate": 0.0009197478991596639, + "loss": 0.3716, + "step": 2971 + }, + { + "epoch": 1.6603351955307262, + "grad_norm": 0.8219579458236694, + "learning_rate": 0.0009197198879551821, + "loss": 0.5036, + "step": 2972 + }, + { + "epoch": 1.6608938547486034, + "grad_norm": 0.5063742399215698, + "learning_rate": 0.0009196918767507003, + "loss": 0.5249, + "step": 2973 + }, + { + "epoch": 1.6614525139664804, + "grad_norm": 0.767436146736145, + "learning_rate": 0.0009196638655462185, + "loss": 0.4814, + "step": 2974 + }, + { + "epoch": 1.6620111731843576, + "grad_norm": 0.6326186656951904, + "learning_rate": 0.0009196358543417367, + "loss": 0.5397, + "step": 2975 + }, + { + "epoch": 1.6625698324022347, + "grad_norm": 0.5269055962562561, + "learning_rate": 0.0009196078431372549, + "loss": 0.462, + "step": 2976 + }, + { + "epoch": 1.6631284916201117, + "grad_norm": 1.0455354452133179, + "learning_rate": 0.0009195798319327731, + "loss": 0.4637, + "step": 2977 + }, + { + "epoch": 1.6636871508379887, + "grad_norm": 0.49353930354118347, + "learning_rate": 0.0009195518207282913, + "loss": 0.4772, + "step": 2978 + }, + { + "epoch": 1.664245810055866, + "grad_norm": 3.803060531616211, + "learning_rate": 0.0009195238095238095, + "loss": 0.4495, + "step": 2979 + }, + { + "epoch": 1.6648044692737431, + "grad_norm": 0.6462464928627014, + "learning_rate": 0.0009194957983193277, + "loss": 0.4297, + "step": 2980 + }, + { + "epoch": 1.6653631284916202, + "grad_norm": 0.6319571137428284, + "learning_rate": 0.0009194677871148459, + "loss": 0.4821, + "step": 2981 + }, + { + "epoch": 1.6659217877094972, + "grad_norm": 0.488567978143692, + "learning_rate": 0.0009194397759103643, + "loss": 0.4366, + "step": 2982 + }, + { + "epoch": 1.6664804469273742, + "grad_norm": 0.5684471726417542, + "learning_rate": 0.0009194117647058824, + "loss": 0.4164, + "step": 2983 + }, + { + "epoch": 1.6670391061452514, + "grad_norm": 0.7592557668685913, + "learning_rate": 0.0009193837535014006, + "loss": 0.4645, + "step": 2984 + }, + { + "epoch": 1.6675977653631286, + "grad_norm": 0.6510218381881714, + "learning_rate": 0.0009193557422969188, + "loss": 0.7281, + "step": 2985 + }, + { + "epoch": 1.6681564245810057, + "grad_norm": 0.6027083396911621, + "learning_rate": 0.000919327731092437, + "loss": 0.4817, + "step": 2986 + }, + { + "epoch": 1.6687150837988827, + "grad_norm": 2.120337963104248, + "learning_rate": 0.0009192997198879553, + "loss": 0.5205, + "step": 2987 + }, + { + "epoch": 1.6692737430167597, + "grad_norm": 0.4455220103263855, + "learning_rate": 0.0009192717086834734, + "loss": 0.4645, + "step": 2988 + }, + { + "epoch": 1.669832402234637, + "grad_norm": 0.5661830306053162, + "learning_rate": 0.0009192436974789916, + "loss": 0.468, + "step": 2989 + }, + { + "epoch": 1.670391061452514, + "grad_norm": 0.59552001953125, + "learning_rate": 0.0009192156862745098, + "loss": 0.5502, + "step": 2990 + }, + { + "epoch": 1.6709497206703912, + "grad_norm": 1.0381678342819214, + "learning_rate": 0.000919187675070028, + "loss": 0.5192, + "step": 2991 + }, + { + "epoch": 1.6715083798882682, + "grad_norm": 0.7575311660766602, + "learning_rate": 0.0009191596638655463, + "loss": 0.4759, + "step": 2992 + }, + { + "epoch": 1.6720670391061452, + "grad_norm": 0.8445690274238586, + "learning_rate": 0.0009191316526610644, + "loss": 0.6034, + "step": 2993 + }, + { + "epoch": 1.6726256983240222, + "grad_norm": 0.5338285565376282, + "learning_rate": 0.0009191036414565826, + "loss": 0.4491, + "step": 2994 + }, + { + "epoch": 1.6731843575418994, + "grad_norm": 0.4428769052028656, + "learning_rate": 0.0009190756302521008, + "loss": 0.4677, + "step": 2995 + }, + { + "epoch": 1.6737430167597767, + "grad_norm": 0.5209064483642578, + "learning_rate": 0.000919047619047619, + "loss": 0.4895, + "step": 2996 + }, + { + "epoch": 1.6743016759776537, + "grad_norm": 0.43626096844673157, + "learning_rate": 0.0009190196078431373, + "loss": 0.4419, + "step": 2997 + }, + { + "epoch": 1.6748603351955307, + "grad_norm": 1.7064591646194458, + "learning_rate": 0.0009189915966386556, + "loss": 0.7224, + "step": 2998 + }, + { + "epoch": 1.6754189944134077, + "grad_norm": 1.0908819437026978, + "learning_rate": 0.0009189635854341736, + "loss": 0.5763, + "step": 2999 + }, + { + "epoch": 1.675977653631285, + "grad_norm": 0.6949249505996704, + "learning_rate": 0.0009189355742296919, + "loss": 0.6624, + "step": 3000 + }, + { + "epoch": 1.675977653631285, + "eval_cer": 0.09953302128681003, + "eval_loss": 0.36958059668540955, + "eval_runtime": 55.7861, + "eval_samples_per_second": 81.346, + "eval_steps_per_second": 5.091, + "eval_wer": 0.39183263037345323, + "step": 3000 + }, + { + "epoch": 1.6765363128491622, + "grad_norm": 1.1103029251098633, + "learning_rate": 0.0009189075630252101, + "loss": 0.4821, + "step": 3001 + }, + { + "epoch": 1.6770949720670392, + "grad_norm": 1.1582223176956177, + "learning_rate": 0.0009188795518207284, + "loss": 0.5081, + "step": 3002 + }, + { + "epoch": 1.6776536312849162, + "grad_norm": 1.3446568250656128, + "learning_rate": 0.0009188515406162466, + "loss": 0.4201, + "step": 3003 + }, + { + "epoch": 1.6782122905027932, + "grad_norm": 1.1600929498672485, + "learning_rate": 0.0009188235294117647, + "loss": 0.4844, + "step": 3004 + }, + { + "epoch": 1.6787709497206704, + "grad_norm": 0.6011174917221069, + "learning_rate": 0.0009187955182072829, + "loss": 0.4485, + "step": 3005 + }, + { + "epoch": 1.6793296089385474, + "grad_norm": 0.8195842504501343, + "learning_rate": 0.0009187675070028011, + "loss": 0.6858, + "step": 3006 + }, + { + "epoch": 1.6798882681564247, + "grad_norm": 0.6972982287406921, + "learning_rate": 0.0009187394957983194, + "loss": 0.3695, + "step": 3007 + }, + { + "epoch": 1.6804469273743017, + "grad_norm": 0.7719422578811646, + "learning_rate": 0.0009187114845938376, + "loss": 0.4631, + "step": 3008 + }, + { + "epoch": 1.6810055865921787, + "grad_norm": 0.5306065678596497, + "learning_rate": 0.0009186834733893557, + "loss": 0.3978, + "step": 3009 + }, + { + "epoch": 1.6815642458100557, + "grad_norm": 1.917561650276184, + "learning_rate": 0.0009186554621848739, + "loss": 0.6242, + "step": 3010 + }, + { + "epoch": 1.682122905027933, + "grad_norm": 1.2080435752868652, + "learning_rate": 0.0009186274509803921, + "loss": 0.5714, + "step": 3011 + }, + { + "epoch": 1.6826815642458102, + "grad_norm": 1.7968887090682983, + "learning_rate": 0.0009185994397759104, + "loss": 0.4285, + "step": 3012 + }, + { + "epoch": 1.6832402234636872, + "grad_norm": 0.4624374806880951, + "learning_rate": 0.0009185714285714286, + "loss": 0.5075, + "step": 3013 + }, + { + "epoch": 1.6837988826815642, + "grad_norm": 0.5181758999824524, + "learning_rate": 0.0009185434173669468, + "loss": 0.3951, + "step": 3014 + }, + { + "epoch": 1.6843575418994412, + "grad_norm": 0.5095945596694946, + "learning_rate": 0.0009185154061624649, + "loss": 0.4684, + "step": 3015 + }, + { + "epoch": 1.6849162011173184, + "grad_norm": 0.5722306966781616, + "learning_rate": 0.0009184873949579832, + "loss": 0.4164, + "step": 3016 + }, + { + "epoch": 1.6854748603351957, + "grad_norm": 0.9484612345695496, + "learning_rate": 0.0009184593837535015, + "loss": 0.5633, + "step": 3017 + }, + { + "epoch": 1.6860335195530727, + "grad_norm": 0.46368923783302307, + "learning_rate": 0.0009184313725490197, + "loss": 0.4649, + "step": 3018 + }, + { + "epoch": 1.6865921787709497, + "grad_norm": 0.7870525121688843, + "learning_rate": 0.0009184033613445379, + "loss": 0.4788, + "step": 3019 + }, + { + "epoch": 1.6871508379888267, + "grad_norm": 1.2743964195251465, + "learning_rate": 0.000918375350140056, + "loss": 0.3967, + "step": 3020 + }, + { + "epoch": 1.687709497206704, + "grad_norm": 1.8453387022018433, + "learning_rate": 0.0009183473389355742, + "loss": 0.4935, + "step": 3021 + }, + { + "epoch": 1.6882681564245812, + "grad_norm": 4.191108226776123, + "learning_rate": 0.0009183193277310925, + "loss": 0.6462, + "step": 3022 + }, + { + "epoch": 1.6888268156424582, + "grad_norm": 0.9374301433563232, + "learning_rate": 0.0009182913165266107, + "loss": 0.7021, + "step": 3023 + }, + { + "epoch": 1.6893854748603352, + "grad_norm": 0.5467361211776733, + "learning_rate": 0.0009182633053221289, + "loss": 0.5279, + "step": 3024 + }, + { + "epoch": 1.6899441340782122, + "grad_norm": 0.6957757472991943, + "learning_rate": 0.000918235294117647, + "loss": 0.6554, + "step": 3025 + }, + { + "epoch": 1.6905027932960894, + "grad_norm": 0.9166900515556335, + "learning_rate": 0.0009182072829131652, + "loss": 0.5043, + "step": 3026 + }, + { + "epoch": 1.6910614525139664, + "grad_norm": 0.8530071377754211, + "learning_rate": 0.0009181792717086835, + "loss": 0.4453, + "step": 3027 + }, + { + "epoch": 1.6916201117318437, + "grad_norm": 0.5116307139396667, + "learning_rate": 0.0009181512605042017, + "loss": 0.58, + "step": 3028 + }, + { + "epoch": 1.6921787709497207, + "grad_norm": 0.6031655073165894, + "learning_rate": 0.0009181232492997199, + "loss": 0.4278, + "step": 3029 + }, + { + "epoch": 1.6927374301675977, + "grad_norm": 3.58233642578125, + "learning_rate": 0.0009180952380952381, + "loss": 0.4304, + "step": 3030 + }, + { + "epoch": 1.6932960893854747, + "grad_norm": 0.4411269724369049, + "learning_rate": 0.0009180672268907562, + "loss": 0.4872, + "step": 3031 + }, + { + "epoch": 1.693854748603352, + "grad_norm": 0.7215275168418884, + "learning_rate": 0.0009180392156862746, + "loss": 0.5153, + "step": 3032 + }, + { + "epoch": 1.6944134078212292, + "grad_norm": 0.5682253837585449, + "learning_rate": 0.0009180112044817928, + "loss": 0.4766, + "step": 3033 + }, + { + "epoch": 1.6949720670391062, + "grad_norm": 0.5677854418754578, + "learning_rate": 0.000917983193277311, + "loss": 0.5427, + "step": 3034 + }, + { + "epoch": 1.6955307262569832, + "grad_norm": 0.6147822141647339, + "learning_rate": 0.0009179551820728292, + "loss": 0.5426, + "step": 3035 + }, + { + "epoch": 1.6960893854748602, + "grad_norm": 0.4158145487308502, + "learning_rate": 0.0009179271708683473, + "loss": 0.3587, + "step": 3036 + }, + { + "epoch": 1.6966480446927374, + "grad_norm": 0.7393187880516052, + "learning_rate": 0.0009178991596638656, + "loss": 0.5026, + "step": 3037 + }, + { + "epoch": 1.6972067039106147, + "grad_norm": 0.8531793355941772, + "learning_rate": 0.0009178711484593838, + "loss": 0.5205, + "step": 3038 + }, + { + "epoch": 1.6977653631284917, + "grad_norm": 0.47491392493247986, + "learning_rate": 0.000917843137254902, + "loss": 0.5226, + "step": 3039 + }, + { + "epoch": 1.6983240223463687, + "grad_norm": 0.9626147150993347, + "learning_rate": 0.0009178151260504202, + "loss": 0.7377, + "step": 3040 + }, + { + "epoch": 1.6988826815642457, + "grad_norm": 0.6460261940956116, + "learning_rate": 0.0009177871148459383, + "loss": 0.4547, + "step": 3041 + }, + { + "epoch": 1.699441340782123, + "grad_norm": 1.0530688762664795, + "learning_rate": 0.0009177591036414566, + "loss": 0.6088, + "step": 3042 + }, + { + "epoch": 1.7, + "grad_norm": 0.496269166469574, + "learning_rate": 0.0009177310924369748, + "loss": 0.4702, + "step": 3043 + }, + { + "epoch": 1.7005586592178772, + "grad_norm": 0.6647741198539734, + "learning_rate": 0.000917703081232493, + "loss": 0.4477, + "step": 3044 + }, + { + "epoch": 1.7011173184357542, + "grad_norm": 0.5344812273979187, + "learning_rate": 0.0009176750700280112, + "loss": 0.4469, + "step": 3045 + }, + { + "epoch": 1.7016759776536312, + "grad_norm": 0.5009032487869263, + "learning_rate": 0.0009176470588235294, + "loss": 0.6159, + "step": 3046 + }, + { + "epoch": 1.7022346368715082, + "grad_norm": 0.6459294557571411, + "learning_rate": 0.0009176190476190476, + "loss": 0.5017, + "step": 3047 + }, + { + "epoch": 1.7027932960893855, + "grad_norm": 0.8312904834747314, + "learning_rate": 0.0009175910364145659, + "loss": 0.7266, + "step": 3048 + }, + { + "epoch": 1.7033519553072627, + "grad_norm": 1.9589706659317017, + "learning_rate": 0.0009175630252100841, + "loss": 0.5248, + "step": 3049 + }, + { + "epoch": 1.7039106145251397, + "grad_norm": 1.463280200958252, + "learning_rate": 0.0009175350140056023, + "loss": 0.501, + "step": 3050 + }, + { + "epoch": 1.7044692737430167, + "grad_norm": 1.3849666118621826, + "learning_rate": 0.0009175070028011205, + "loss": 0.4167, + "step": 3051 + }, + { + "epoch": 1.7050279329608937, + "grad_norm": 1.0459672212600708, + "learning_rate": 0.0009174789915966387, + "loss": 0.4104, + "step": 3052 + }, + { + "epoch": 1.705586592178771, + "grad_norm": 2.0638487339019775, + "learning_rate": 0.0009174509803921569, + "loss": 0.4553, + "step": 3053 + }, + { + "epoch": 1.7061452513966482, + "grad_norm": 0.4697238504886627, + "learning_rate": 0.0009174229691876751, + "loss": 0.473, + "step": 3054 + }, + { + "epoch": 1.7067039106145252, + "grad_norm": 0.3627098500728607, + "learning_rate": 0.0009173949579831933, + "loss": 0.4106, + "step": 3055 + }, + { + "epoch": 1.7072625698324022, + "grad_norm": 0.8281555771827698, + "learning_rate": 0.0009173669467787115, + "loss": 0.4971, + "step": 3056 + }, + { + "epoch": 1.7078212290502792, + "grad_norm": 0.600597620010376, + "learning_rate": 0.0009173389355742298, + "loss": 0.5494, + "step": 3057 + }, + { + "epoch": 1.7083798882681565, + "grad_norm": 0.8619397878646851, + "learning_rate": 0.0009173109243697479, + "loss": 0.4277, + "step": 3058 + }, + { + "epoch": 1.7089385474860335, + "grad_norm": 0.4999079704284668, + "learning_rate": 0.0009172829131652661, + "loss": 0.4587, + "step": 3059 + }, + { + "epoch": 1.7094972067039107, + "grad_norm": 0.6281794309616089, + "learning_rate": 0.0009172549019607843, + "loss": 0.6208, + "step": 3060 + }, + { + "epoch": 1.7100558659217877, + "grad_norm": 0.695807158946991, + "learning_rate": 0.0009172268907563025, + "loss": 0.4673, + "step": 3061 + }, + { + "epoch": 1.7106145251396647, + "grad_norm": 3.4741170406341553, + "learning_rate": 0.0009171988795518208, + "loss": 0.4197, + "step": 3062 + }, + { + "epoch": 1.711173184357542, + "grad_norm": 0.6798413991928101, + "learning_rate": 0.0009171708683473389, + "loss": 0.4741, + "step": 3063 + }, + { + "epoch": 1.711731843575419, + "grad_norm": 0.5536423325538635, + "learning_rate": 0.0009171428571428571, + "loss": 0.4942, + "step": 3064 + }, + { + "epoch": 1.7122905027932962, + "grad_norm": 0.8299341201782227, + "learning_rate": 0.0009171148459383754, + "loss": 0.5756, + "step": 3065 + }, + { + "epoch": 1.7128491620111732, + "grad_norm": 0.6157573461532593, + "learning_rate": 0.0009170868347338936, + "loss": 0.4403, + "step": 3066 + }, + { + "epoch": 1.7134078212290502, + "grad_norm": 0.5600780248641968, + "learning_rate": 0.0009170588235294119, + "loss": 0.4841, + "step": 3067 + }, + { + "epoch": 1.7139664804469272, + "grad_norm": 0.5978518128395081, + "learning_rate": 0.00091703081232493, + "loss": 0.6111, + "step": 3068 + }, + { + "epoch": 1.7145251396648045, + "grad_norm": 0.7723780274391174, + "learning_rate": 0.0009170028011204482, + "loss": 0.61, + "step": 3069 + }, + { + "epoch": 1.7150837988826817, + "grad_norm": 0.49161583185195923, + "learning_rate": 0.0009169747899159664, + "loss": 0.5156, + "step": 3070 + }, + { + "epoch": 1.7156424581005587, + "grad_norm": 0.5228260159492493, + "learning_rate": 0.0009169467787114846, + "loss": 0.4738, + "step": 3071 + }, + { + "epoch": 1.7162011173184357, + "grad_norm": 0.4669853448867798, + "learning_rate": 0.0009169187675070029, + "loss": 0.3075, + "step": 3072 + }, + { + "epoch": 1.7167597765363127, + "grad_norm": 0.6266192197799683, + "learning_rate": 0.0009168907563025211, + "loss": 0.4909, + "step": 3073 + }, + { + "epoch": 1.71731843575419, + "grad_norm": 0.7092539668083191, + "learning_rate": 0.0009168627450980392, + "loss": 0.5391, + "step": 3074 + }, + { + "epoch": 1.7178770949720672, + "grad_norm": 0.5469465255737305, + "learning_rate": 0.0009168347338935574, + "loss": 0.5518, + "step": 3075 + }, + { + "epoch": 1.7184357541899442, + "grad_norm": 0.4958898723125458, + "learning_rate": 0.0009168067226890756, + "loss": 0.5336, + "step": 3076 + }, + { + "epoch": 1.7189944134078212, + "grad_norm": 1.5030094385147095, + "learning_rate": 0.0009167787114845939, + "loss": 0.4746, + "step": 3077 + }, + { + "epoch": 1.7195530726256982, + "grad_norm": 0.9934349656105042, + "learning_rate": 0.0009167507002801121, + "loss": 0.4497, + "step": 3078 + }, + { + "epoch": 1.7201117318435755, + "grad_norm": 1.688143253326416, + "learning_rate": 0.0009167226890756302, + "loss": 0.5267, + "step": 3079 + }, + { + "epoch": 1.7206703910614525, + "grad_norm": 1.1756359338760376, + "learning_rate": 0.0009166946778711484, + "loss": 0.3613, + "step": 3080 + }, + { + "epoch": 1.7212290502793297, + "grad_norm": 0.740430474281311, + "learning_rate": 0.0009166666666666666, + "loss": 0.5577, + "step": 3081 + }, + { + "epoch": 1.7217877094972067, + "grad_norm": 0.6306280493736267, + "learning_rate": 0.000916638655462185, + "loss": 0.4266, + "step": 3082 + }, + { + "epoch": 1.7223463687150837, + "grad_norm": 0.4091308116912842, + "learning_rate": 0.0009166106442577032, + "loss": 0.4428, + "step": 3083 + }, + { + "epoch": 1.7229050279329607, + "grad_norm": 1.7345776557922363, + "learning_rate": 0.0009165826330532213, + "loss": 0.5564, + "step": 3084 + }, + { + "epoch": 1.723463687150838, + "grad_norm": 0.6802529692649841, + "learning_rate": 0.0009165546218487395, + "loss": 0.3493, + "step": 3085 + }, + { + "epoch": 1.7240223463687152, + "grad_norm": 0.972693681716919, + "learning_rate": 0.0009165266106442577, + "loss": 0.5288, + "step": 3086 + }, + { + "epoch": 1.7245810055865922, + "grad_norm": 0.9185463786125183, + "learning_rate": 0.000916498599439776, + "loss": 0.4222, + "step": 3087 + }, + { + "epoch": 1.7251396648044692, + "grad_norm": 0.4764520525932312, + "learning_rate": 0.0009164705882352942, + "loss": 0.4483, + "step": 3088 + }, + { + "epoch": 1.7256983240223462, + "grad_norm": 1.3819745779037476, + "learning_rate": 0.0009164425770308124, + "loss": 0.5322, + "step": 3089 + }, + { + "epoch": 1.7262569832402235, + "grad_norm": 0.6650181412696838, + "learning_rate": 0.0009164145658263305, + "loss": 0.5584, + "step": 3090 + }, + { + "epoch": 1.7268156424581007, + "grad_norm": 0.44552111625671387, + "learning_rate": 0.0009163865546218487, + "loss": 0.413, + "step": 3091 + }, + { + "epoch": 1.7273743016759777, + "grad_norm": 0.6607682108879089, + "learning_rate": 0.000916358543417367, + "loss": 0.4661, + "step": 3092 + }, + { + "epoch": 1.7279329608938547, + "grad_norm": 0.6770963668823242, + "learning_rate": 0.0009163305322128852, + "loss": 0.6304, + "step": 3093 + }, + { + "epoch": 1.7284916201117317, + "grad_norm": 0.601143479347229, + "learning_rate": 0.0009163025210084034, + "loss": 0.4934, + "step": 3094 + }, + { + "epoch": 1.729050279329609, + "grad_norm": 0.5488032698631287, + "learning_rate": 0.0009162745098039215, + "loss": 0.4922, + "step": 3095 + }, + { + "epoch": 1.729608938547486, + "grad_norm": 0.8162834644317627, + "learning_rate": 0.0009162464985994397, + "loss": 0.5126, + "step": 3096 + }, + { + "epoch": 1.7301675977653632, + "grad_norm": 1.1971279382705688, + "learning_rate": 0.000916218487394958, + "loss": 0.511, + "step": 3097 + }, + { + "epoch": 1.7307262569832402, + "grad_norm": 1.797239065170288, + "learning_rate": 0.0009161904761904763, + "loss": 0.5462, + "step": 3098 + }, + { + "epoch": 1.7312849162011172, + "grad_norm": 0.5142049193382263, + "learning_rate": 0.0009161624649859945, + "loss": 0.4754, + "step": 3099 + }, + { + "epoch": 1.7318435754189943, + "grad_norm": 0.5536891222000122, + "learning_rate": 0.0009161344537815126, + "loss": 0.4698, + "step": 3100 + }, + { + "epoch": 1.7324022346368715, + "grad_norm": 1.0244752168655396, + "learning_rate": 0.0009161064425770308, + "loss": 0.3972, + "step": 3101 + }, + { + "epoch": 1.7329608938547487, + "grad_norm": 0.6083419919013977, + "learning_rate": 0.0009160784313725491, + "loss": 0.4494, + "step": 3102 + }, + { + "epoch": 1.7335195530726257, + "grad_norm": 1.548628807067871, + "learning_rate": 0.0009160504201680673, + "loss": 0.5115, + "step": 3103 + }, + { + "epoch": 1.7340782122905027, + "grad_norm": 1.2171717882156372, + "learning_rate": 0.0009160224089635855, + "loss": 0.5723, + "step": 3104 + }, + { + "epoch": 1.7346368715083798, + "grad_norm": 0.6739428043365479, + "learning_rate": 0.0009159943977591037, + "loss": 0.4302, + "step": 3105 + }, + { + "epoch": 1.735195530726257, + "grad_norm": 0.5084200501441956, + "learning_rate": 0.0009159663865546218, + "loss": 0.4626, + "step": 3106 + }, + { + "epoch": 1.7357541899441342, + "grad_norm": 0.6711824536323547, + "learning_rate": 0.0009159383753501401, + "loss": 0.4492, + "step": 3107 + }, + { + "epoch": 1.7363128491620112, + "grad_norm": 7.0675458908081055, + "learning_rate": 0.0009159103641456583, + "loss": 0.4145, + "step": 3108 + }, + { + "epoch": 1.7368715083798882, + "grad_norm": 0.5331434011459351, + "learning_rate": 0.0009158823529411765, + "loss": 0.5088, + "step": 3109 + }, + { + "epoch": 1.7374301675977653, + "grad_norm": 0.5827344059944153, + "learning_rate": 0.0009158543417366947, + "loss": 0.3838, + "step": 3110 + }, + { + "epoch": 1.7379888268156425, + "grad_norm": 0.9303395748138428, + "learning_rate": 0.0009158263305322128, + "loss": 0.6908, + "step": 3111 + }, + { + "epoch": 1.7385474860335197, + "grad_norm": 0.6547474265098572, + "learning_rate": 0.0009157983193277311, + "loss": 0.562, + "step": 3112 + }, + { + "epoch": 1.7391061452513967, + "grad_norm": 0.5259217619895935, + "learning_rate": 0.0009157703081232493, + "loss": 0.4964, + "step": 3113 + }, + { + "epoch": 1.7396648044692737, + "grad_norm": 0.547927737236023, + "learning_rate": 0.0009157422969187676, + "loss": 0.529, + "step": 3114 + }, + { + "epoch": 1.7402234636871508, + "grad_norm": 0.5204383134841919, + "learning_rate": 0.0009157142857142858, + "loss": 0.4458, + "step": 3115 + }, + { + "epoch": 1.740782122905028, + "grad_norm": 0.6744920611381531, + "learning_rate": 0.0009156862745098039, + "loss": 0.4247, + "step": 3116 + }, + { + "epoch": 1.741340782122905, + "grad_norm": 2.0540030002593994, + "learning_rate": 0.0009156582633053222, + "loss": 0.5017, + "step": 3117 + }, + { + "epoch": 1.7418994413407822, + "grad_norm": 1.4944185018539429, + "learning_rate": 0.0009156302521008404, + "loss": 0.7158, + "step": 3118 + }, + { + "epoch": 1.7424581005586592, + "grad_norm": 0.5214855074882507, + "learning_rate": 0.0009156022408963586, + "loss": 0.5148, + "step": 3119 + }, + { + "epoch": 1.7430167597765363, + "grad_norm": 3.0218491554260254, + "learning_rate": 0.0009155742296918768, + "loss": 0.5922, + "step": 3120 + }, + { + "epoch": 1.7435754189944133, + "grad_norm": 1.070124864578247, + "learning_rate": 0.000915546218487395, + "loss": 0.5451, + "step": 3121 + }, + { + "epoch": 1.7441340782122905, + "grad_norm": 1.2633320093154907, + "learning_rate": 0.0009155182072829131, + "loss": 0.5398, + "step": 3122 + }, + { + "epoch": 1.7446927374301677, + "grad_norm": 0.739230215549469, + "learning_rate": 0.0009154901960784314, + "loss": 0.4518, + "step": 3123 + }, + { + "epoch": 1.7452513966480447, + "grad_norm": 0.8199949264526367, + "learning_rate": 0.0009154621848739496, + "loss": 0.5391, + "step": 3124 + }, + { + "epoch": 1.7458100558659218, + "grad_norm": 1.691592812538147, + "learning_rate": 0.0009154341736694678, + "loss": 0.4119, + "step": 3125 + }, + { + "epoch": 1.7463687150837988, + "grad_norm": 0.8518623113632202, + "learning_rate": 0.000915406162464986, + "loss": 0.4361, + "step": 3126 + }, + { + "epoch": 1.746927374301676, + "grad_norm": 0.9469772577285767, + "learning_rate": 0.0009153781512605041, + "loss": 0.469, + "step": 3127 + }, + { + "epoch": 1.7474860335195532, + "grad_norm": 0.5455501675605774, + "learning_rate": 0.0009153501400560224, + "loss": 0.4906, + "step": 3128 + }, + { + "epoch": 1.7480446927374302, + "grad_norm": 0.39891985058784485, + "learning_rate": 0.0009153221288515406, + "loss": 0.4374, + "step": 3129 + }, + { + "epoch": 1.7486033519553073, + "grad_norm": 1.063988447189331, + "learning_rate": 0.0009152941176470589, + "loss": 0.5026, + "step": 3130 + }, + { + "epoch": 1.7491620111731843, + "grad_norm": 0.5377022624015808, + "learning_rate": 0.0009152661064425771, + "loss": 0.4527, + "step": 3131 + }, + { + "epoch": 1.7497206703910615, + "grad_norm": 0.9185256958007812, + "learning_rate": 0.0009152380952380952, + "loss": 0.6537, + "step": 3132 + }, + { + "epoch": 1.7502793296089385, + "grad_norm": 1.6892807483673096, + "learning_rate": 0.0009152100840336135, + "loss": 0.5244, + "step": 3133 + }, + { + "epoch": 1.7508379888268157, + "grad_norm": 0.6731375455856323, + "learning_rate": 0.0009151820728291317, + "loss": 0.5268, + "step": 3134 + }, + { + "epoch": 1.7513966480446927, + "grad_norm": 0.7444874048233032, + "learning_rate": 0.0009151540616246499, + "loss": 0.5966, + "step": 3135 + }, + { + "epoch": 1.7519553072625698, + "grad_norm": 0.6658584475517273, + "learning_rate": 0.0009151260504201681, + "loss": 0.4287, + "step": 3136 + }, + { + "epoch": 1.7525139664804468, + "grad_norm": 0.5693717002868652, + "learning_rate": 0.0009150980392156863, + "loss": 0.4879, + "step": 3137 + }, + { + "epoch": 1.753072625698324, + "grad_norm": 0.857803225517273, + "learning_rate": 0.0009150700280112045, + "loss": 0.446, + "step": 3138 + }, + { + "epoch": 1.7536312849162012, + "grad_norm": 0.8032236695289612, + "learning_rate": 0.0009150420168067227, + "loss": 0.3945, + "step": 3139 + }, + { + "epoch": 1.7541899441340782, + "grad_norm": 0.6887109875679016, + "learning_rate": 0.0009150140056022409, + "loss": 0.6698, + "step": 3140 + }, + { + "epoch": 1.7547486033519553, + "grad_norm": 0.9623390436172485, + "learning_rate": 0.0009149859943977591, + "loss": 0.4731, + "step": 3141 + }, + { + "epoch": 1.7553072625698323, + "grad_norm": 0.572283148765564, + "learning_rate": 0.0009149579831932773, + "loss": 0.457, + "step": 3142 + }, + { + "epoch": 1.7558659217877095, + "grad_norm": 0.5509768128395081, + "learning_rate": 0.0009149299719887955, + "loss": 0.4345, + "step": 3143 + }, + { + "epoch": 1.7564245810055867, + "grad_norm": 2.6843929290771484, + "learning_rate": 0.0009149019607843137, + "loss": 0.5544, + "step": 3144 + }, + { + "epoch": 1.7569832402234637, + "grad_norm": 1.8658711910247803, + "learning_rate": 0.0009148739495798319, + "loss": 0.5912, + "step": 3145 + }, + { + "epoch": 1.7575418994413408, + "grad_norm": 1.2871838808059692, + "learning_rate": 0.0009148459383753501, + "loss": 0.5554, + "step": 3146 + }, + { + "epoch": 1.7581005586592178, + "grad_norm": 0.5986303091049194, + "learning_rate": 0.0009148179271708684, + "loss": 0.4511, + "step": 3147 + }, + { + "epoch": 1.758659217877095, + "grad_norm": 0.5408793687820435, + "learning_rate": 0.0009147899159663866, + "loss": 0.4302, + "step": 3148 + }, + { + "epoch": 1.759217877094972, + "grad_norm": 1.0741839408874512, + "learning_rate": 0.0009147619047619048, + "loss": 0.6019, + "step": 3149 + }, + { + "epoch": 1.7597765363128492, + "grad_norm": 0.5812860727310181, + "learning_rate": 0.000914733893557423, + "loss": 0.5739, + "step": 3150 + }, + { + "epoch": 1.7603351955307263, + "grad_norm": 0.5961041450500488, + "learning_rate": 0.0009147058823529412, + "loss": 0.4712, + "step": 3151 + }, + { + "epoch": 1.7608938547486033, + "grad_norm": 0.8099181652069092, + "learning_rate": 0.0009146778711484594, + "loss": 0.5246, + "step": 3152 + }, + { + "epoch": 1.7614525139664803, + "grad_norm": 0.5553327202796936, + "learning_rate": 0.0009146498599439777, + "loss": 0.5419, + "step": 3153 + }, + { + "epoch": 1.7620111731843575, + "grad_norm": 0.5164934992790222, + "learning_rate": 0.0009146218487394958, + "loss": 0.4026, + "step": 3154 + }, + { + "epoch": 1.7625698324022347, + "grad_norm": 0.5284751653671265, + "learning_rate": 0.000914593837535014, + "loss": 0.621, + "step": 3155 + }, + { + "epoch": 1.7631284916201118, + "grad_norm": 0.5386056900024414, + "learning_rate": 0.0009145658263305322, + "loss": 0.5432, + "step": 3156 + }, + { + "epoch": 1.7636871508379888, + "grad_norm": 4.473935604095459, + "learning_rate": 0.0009145378151260504, + "loss": 0.479, + "step": 3157 + }, + { + "epoch": 1.7642458100558658, + "grad_norm": 0.5091344118118286, + "learning_rate": 0.0009145098039215687, + "loss": 0.3973, + "step": 3158 + }, + { + "epoch": 1.764804469273743, + "grad_norm": 0.7269484400749207, + "learning_rate": 0.0009144817927170868, + "loss": 0.4625, + "step": 3159 + }, + { + "epoch": 1.7653631284916202, + "grad_norm": 0.6569042205810547, + "learning_rate": 0.000914453781512605, + "loss": 0.6453, + "step": 3160 + }, + { + "epoch": 1.7659217877094973, + "grad_norm": 0.5146526098251343, + "learning_rate": 0.0009144257703081232, + "loss": 0.4935, + "step": 3161 + }, + { + "epoch": 1.7664804469273743, + "grad_norm": 0.5423449277877808, + "learning_rate": 0.0009143977591036414, + "loss": 0.3714, + "step": 3162 + }, + { + "epoch": 1.7670391061452513, + "grad_norm": 0.7551223635673523, + "learning_rate": 0.0009143697478991598, + "loss": 0.5137, + "step": 3163 + }, + { + "epoch": 1.7675977653631285, + "grad_norm": 0.49909594655036926, + "learning_rate": 0.0009143417366946779, + "loss": 0.4782, + "step": 3164 + }, + { + "epoch": 1.7681564245810057, + "grad_norm": 0.42767465114593506, + "learning_rate": 0.0009143137254901961, + "loss": 0.391, + "step": 3165 + }, + { + "epoch": 1.7687150837988828, + "grad_norm": 0.5423476099967957, + "learning_rate": 0.0009142857142857143, + "loss": 0.4518, + "step": 3166 + }, + { + "epoch": 1.7692737430167598, + "grad_norm": 0.7874027490615845, + "learning_rate": 0.0009142577030812325, + "loss": 0.4433, + "step": 3167 + }, + { + "epoch": 1.7698324022346368, + "grad_norm": 0.7019028663635254, + "learning_rate": 0.0009142296918767508, + "loss": 0.5464, + "step": 3168 + }, + { + "epoch": 1.770391061452514, + "grad_norm": 15.238758087158203, + "learning_rate": 0.000914201680672269, + "loss": 0.4594, + "step": 3169 + }, + { + "epoch": 1.770949720670391, + "grad_norm": 0.6703578233718872, + "learning_rate": 0.0009141736694677871, + "loss": 0.5333, + "step": 3170 + }, + { + "epoch": 1.7715083798882683, + "grad_norm": 1.0852935314178467, + "learning_rate": 0.0009141456582633053, + "loss": 0.5146, + "step": 3171 + }, + { + "epoch": 1.7720670391061453, + "grad_norm": 0.6139355897903442, + "learning_rate": 0.0009141176470588235, + "loss": 0.5606, + "step": 3172 + }, + { + "epoch": 1.7726256983240223, + "grad_norm": 0.4671594202518463, + "learning_rate": 0.0009140896358543418, + "loss": 0.4041, + "step": 3173 + }, + { + "epoch": 1.7731843575418993, + "grad_norm": 0.8935621380805969, + "learning_rate": 0.00091406162464986, + "loss": 0.5659, + "step": 3174 + }, + { + "epoch": 1.7737430167597765, + "grad_norm": 0.633843719959259, + "learning_rate": 0.0009140336134453781, + "loss": 0.4534, + "step": 3175 + }, + { + "epoch": 1.7743016759776538, + "grad_norm": 0.6199772953987122, + "learning_rate": 0.0009140056022408963, + "loss": 0.3949, + "step": 3176 + }, + { + "epoch": 1.7748603351955308, + "grad_norm": 0.6112352609634399, + "learning_rate": 0.0009139775910364145, + "loss": 0.3991, + "step": 3177 + }, + { + "epoch": 1.7754189944134078, + "grad_norm": 14.571758270263672, + "learning_rate": 0.0009139495798319328, + "loss": 0.525, + "step": 3178 + }, + { + "epoch": 1.7759776536312848, + "grad_norm": 0.7849811911582947, + "learning_rate": 0.000913921568627451, + "loss": 0.4102, + "step": 3179 + }, + { + "epoch": 1.776536312849162, + "grad_norm": 6.994330406188965, + "learning_rate": 0.0009138935574229692, + "loss": 0.5312, + "step": 3180 + }, + { + "epoch": 1.7770949720670393, + "grad_norm": 0.6431810259819031, + "learning_rate": 0.0009138655462184874, + "loss": 0.4155, + "step": 3181 + }, + { + "epoch": 1.7776536312849163, + "grad_norm": 4.65981912612915, + "learning_rate": 0.0009138375350140056, + "loss": 0.5503, + "step": 3182 + }, + { + "epoch": 1.7782122905027933, + "grad_norm": 0.9029192328453064, + "learning_rate": 0.0009138095238095239, + "loss": 0.4662, + "step": 3183 + }, + { + "epoch": 1.7787709497206703, + "grad_norm": 0.5947895050048828, + "learning_rate": 0.0009137815126050421, + "loss": 0.5221, + "step": 3184 + }, + { + "epoch": 1.7793296089385475, + "grad_norm": 0.6901898980140686, + "learning_rate": 0.0009137535014005603, + "loss": 0.5007, + "step": 3185 + }, + { + "epoch": 1.7798882681564245, + "grad_norm": 0.5442327260971069, + "learning_rate": 0.0009137254901960784, + "loss": 0.445, + "step": 3186 + }, + { + "epoch": 1.7804469273743018, + "grad_norm": 0.570530891418457, + "learning_rate": 0.0009136974789915966, + "loss": 0.485, + "step": 3187 + }, + { + "epoch": 1.7810055865921788, + "grad_norm": 2.407280445098877, + "learning_rate": 0.0009136694677871149, + "loss": 0.4441, + "step": 3188 + }, + { + "epoch": 1.7815642458100558, + "grad_norm": 0.8397179245948792, + "learning_rate": 0.0009136414565826331, + "loss": 0.4369, + "step": 3189 + }, + { + "epoch": 1.7821229050279328, + "grad_norm": 0.986168622970581, + "learning_rate": 0.0009136134453781513, + "loss": 0.4832, + "step": 3190 + }, + { + "epoch": 1.78268156424581, + "grad_norm": 0.5946182608604431, + "learning_rate": 0.0009135854341736694, + "loss": 0.5193, + "step": 3191 + }, + { + "epoch": 1.7832402234636873, + "grad_norm": 1.54610013961792, + "learning_rate": 0.0009135574229691876, + "loss": 0.4778, + "step": 3192 + }, + { + "epoch": 1.7837988826815643, + "grad_norm": 0.6242474317550659, + "learning_rate": 0.0009135294117647059, + "loss": 0.5603, + "step": 3193 + }, + { + "epoch": 1.7843575418994413, + "grad_norm": 0.6124485731124878, + "learning_rate": 0.0009135014005602241, + "loss": 0.5536, + "step": 3194 + }, + { + "epoch": 1.7849162011173183, + "grad_norm": 0.7565421462059021, + "learning_rate": 0.0009134733893557423, + "loss": 0.4548, + "step": 3195 + }, + { + "epoch": 1.7854748603351955, + "grad_norm": 0.6444962024688721, + "learning_rate": 0.0009134453781512604, + "loss": 0.4885, + "step": 3196 + }, + { + "epoch": 1.7860335195530728, + "grad_norm": 0.5428703427314758, + "learning_rate": 0.0009134173669467787, + "loss": 0.5451, + "step": 3197 + }, + { + "epoch": 1.7865921787709498, + "grad_norm": 1.2530630826950073, + "learning_rate": 0.000913389355742297, + "loss": 0.8271, + "step": 3198 + }, + { + "epoch": 1.7871508379888268, + "grad_norm": 0.5181470513343811, + "learning_rate": 0.0009133613445378152, + "loss": 0.5571, + "step": 3199 + }, + { + "epoch": 1.7877094972067038, + "grad_norm": 1.4144065380096436, + "learning_rate": 0.0009133333333333334, + "loss": 0.4939, + "step": 3200 + }, + { + "epoch": 1.788268156424581, + "grad_norm": 0.9006614089012146, + "learning_rate": 0.0009133053221288516, + "loss": 0.4826, + "step": 3201 + }, + { + "epoch": 1.788826815642458, + "grad_norm": 0.7793147563934326, + "learning_rate": 0.0009132773109243697, + "loss": 0.4808, + "step": 3202 + }, + { + "epoch": 1.7893854748603353, + "grad_norm": 0.7988924384117126, + "learning_rate": 0.000913249299719888, + "loss": 0.5851, + "step": 3203 + }, + { + "epoch": 1.7899441340782123, + "grad_norm": 0.6632587909698486, + "learning_rate": 0.0009132212885154062, + "loss": 0.3586, + "step": 3204 + }, + { + "epoch": 1.7905027932960893, + "grad_norm": 0.6377608776092529, + "learning_rate": 0.0009131932773109244, + "loss": 0.503, + "step": 3205 + }, + { + "epoch": 1.7910614525139665, + "grad_norm": 0.5571548938751221, + "learning_rate": 0.0009131652661064426, + "loss": 0.4389, + "step": 3206 + }, + { + "epoch": 1.7916201117318435, + "grad_norm": 0.8831945061683655, + "learning_rate": 0.0009131372549019607, + "loss": 0.4241, + "step": 3207 + }, + { + "epoch": 1.7921787709497208, + "grad_norm": 0.6242866516113281, + "learning_rate": 0.000913109243697479, + "loss": 0.5814, + "step": 3208 + }, + { + "epoch": 1.7927374301675978, + "grad_norm": 0.5590229630470276, + "learning_rate": 0.0009130812324929972, + "loss": 0.5109, + "step": 3209 + }, + { + "epoch": 1.7932960893854748, + "grad_norm": 0.554449737071991, + "learning_rate": 0.0009130532212885154, + "loss": 0.5484, + "step": 3210 + }, + { + "epoch": 1.7938547486033518, + "grad_norm": 0.45062708854675293, + "learning_rate": 0.0009130252100840336, + "loss": 0.4523, + "step": 3211 + }, + { + "epoch": 1.794413407821229, + "grad_norm": 0.4103892147541046, + "learning_rate": 0.0009129971988795517, + "loss": 0.4018, + "step": 3212 + }, + { + "epoch": 1.7949720670391063, + "grad_norm": 0.712181031703949, + "learning_rate": 0.0009129691876750701, + "loss": 0.5398, + "step": 3213 + }, + { + "epoch": 1.7955307262569833, + "grad_norm": 0.5673133134841919, + "learning_rate": 0.0009129411764705883, + "loss": 0.4612, + "step": 3214 + }, + { + "epoch": 1.7960893854748603, + "grad_norm": 0.8361480832099915, + "learning_rate": 0.0009129131652661065, + "loss": 0.4171, + "step": 3215 + }, + { + "epoch": 1.7966480446927373, + "grad_norm": 1.4768726825714111, + "learning_rate": 0.0009128851540616247, + "loss": 0.4183, + "step": 3216 + }, + { + "epoch": 1.7972067039106145, + "grad_norm": 2.5639262199401855, + "learning_rate": 0.0009128571428571429, + "loss": 0.4561, + "step": 3217 + }, + { + "epoch": 1.7977653631284918, + "grad_norm": 0.7279541492462158, + "learning_rate": 0.0009128291316526611, + "loss": 0.4435, + "step": 3218 + }, + { + "epoch": 1.7983240223463688, + "grad_norm": 1.6590209007263184, + "learning_rate": 0.0009128011204481793, + "loss": 0.5238, + "step": 3219 + }, + { + "epoch": 1.7988826815642458, + "grad_norm": 0.44741788506507874, + "learning_rate": 0.0009127731092436975, + "loss": 0.5591, + "step": 3220 + }, + { + "epoch": 1.7994413407821228, + "grad_norm": 0.5549837946891785, + "learning_rate": 0.0009127450980392157, + "loss": 0.5138, + "step": 3221 + }, + { + "epoch": 1.8, + "grad_norm": 0.6384388208389282, + "learning_rate": 0.0009127170868347339, + "loss": 0.4149, + "step": 3222 + }, + { + "epoch": 1.800558659217877, + "grad_norm": 0.4796817898750305, + "learning_rate": 0.0009126890756302521, + "loss": 0.4302, + "step": 3223 + }, + { + "epoch": 1.8011173184357543, + "grad_norm": 0.6586772203445435, + "learning_rate": 0.0009126610644257703, + "loss": 0.5628, + "step": 3224 + }, + { + "epoch": 1.8016759776536313, + "grad_norm": 0.7913119792938232, + "learning_rate": 0.0009126330532212885, + "loss": 0.5819, + "step": 3225 + }, + { + "epoch": 1.8022346368715083, + "grad_norm": 0.523750364780426, + "learning_rate": 0.0009126050420168067, + "loss": 0.502, + "step": 3226 + }, + { + "epoch": 1.8027932960893853, + "grad_norm": 1.5710994005203247, + "learning_rate": 0.0009125770308123249, + "loss": 0.7304, + "step": 3227 + }, + { + "epoch": 1.8033519553072626, + "grad_norm": 0.5283070802688599, + "learning_rate": 0.0009125490196078431, + "loss": 0.4541, + "step": 3228 + }, + { + "epoch": 1.8039106145251398, + "grad_norm": 0.6923981308937073, + "learning_rate": 0.0009125210084033614, + "loss": 0.5953, + "step": 3229 + }, + { + "epoch": 1.8044692737430168, + "grad_norm": 0.3811795711517334, + "learning_rate": 0.0009124929971988796, + "loss": 0.3949, + "step": 3230 + }, + { + "epoch": 1.8050279329608938, + "grad_norm": 0.8328307271003723, + "learning_rate": 0.0009124649859943978, + "loss": 0.5691, + "step": 3231 + }, + { + "epoch": 1.8055865921787708, + "grad_norm": 1.4995604753494263, + "learning_rate": 0.000912436974789916, + "loss": 0.5034, + "step": 3232 + }, + { + "epoch": 1.806145251396648, + "grad_norm": 0.5103375911712646, + "learning_rate": 0.0009124089635854343, + "loss": 0.4367, + "step": 3233 + }, + { + "epoch": 1.8067039106145253, + "grad_norm": 0.5794250965118408, + "learning_rate": 0.0009123809523809524, + "loss": 0.5514, + "step": 3234 + }, + { + "epoch": 1.8072625698324023, + "grad_norm": 1.3814373016357422, + "learning_rate": 0.0009123529411764706, + "loss": 0.4518, + "step": 3235 + }, + { + "epoch": 1.8078212290502793, + "grad_norm": 1.232062816619873, + "learning_rate": 0.0009123249299719888, + "loss": 0.598, + "step": 3236 + }, + { + "epoch": 1.8083798882681563, + "grad_norm": 0.550687849521637, + "learning_rate": 0.000912296918767507, + "loss": 0.5147, + "step": 3237 + }, + { + "epoch": 1.8089385474860336, + "grad_norm": 0.7990720272064209, + "learning_rate": 0.0009122689075630253, + "loss": 0.5182, + "step": 3238 + }, + { + "epoch": 1.8094972067039106, + "grad_norm": 0.6354506611824036, + "learning_rate": 0.0009122408963585434, + "loss": 0.5284, + "step": 3239 + }, + { + "epoch": 1.8100558659217878, + "grad_norm": 0.6284214854240417, + "learning_rate": 0.0009122128851540616, + "loss": 0.4155, + "step": 3240 + }, + { + "epoch": 1.8106145251396648, + "grad_norm": 7.2653069496154785, + "learning_rate": 0.0009121848739495798, + "loss": 0.5143, + "step": 3241 + }, + { + "epoch": 1.8111731843575418, + "grad_norm": 0.8755322694778442, + "learning_rate": 0.000912156862745098, + "loss": 0.6028, + "step": 3242 + }, + { + "epoch": 1.8117318435754188, + "grad_norm": 0.8377264738082886, + "learning_rate": 0.0009121288515406163, + "loss": 0.5066, + "step": 3243 + }, + { + "epoch": 1.812290502793296, + "grad_norm": 0.6050659418106079, + "learning_rate": 0.0009121008403361344, + "loss": 0.6722, + "step": 3244 + }, + { + "epoch": 1.8128491620111733, + "grad_norm": 0.8741009831428528, + "learning_rate": 0.0009120728291316526, + "loss": 0.4339, + "step": 3245 + }, + { + "epoch": 1.8134078212290503, + "grad_norm": 0.5095301270484924, + "learning_rate": 0.0009120448179271709, + "loss": 0.5186, + "step": 3246 + }, + { + "epoch": 1.8139664804469273, + "grad_norm": 0.7434853911399841, + "learning_rate": 0.0009120168067226891, + "loss": 0.4591, + "step": 3247 + }, + { + "epoch": 1.8145251396648043, + "grad_norm": 0.5455620884895325, + "learning_rate": 0.0009119887955182074, + "loss": 0.5082, + "step": 3248 + }, + { + "epoch": 1.8150837988826816, + "grad_norm": 0.6582542657852173, + "learning_rate": 0.0009119607843137256, + "loss": 0.5654, + "step": 3249 + }, + { + "epoch": 1.8156424581005588, + "grad_norm": 0.6216452717781067, + "learning_rate": 0.0009119327731092437, + "loss": 0.4429, + "step": 3250 + }, + { + "epoch": 1.8162011173184358, + "grad_norm": 0.5055205821990967, + "learning_rate": 0.0009119047619047619, + "loss": 0.5245, + "step": 3251 + }, + { + "epoch": 1.8167597765363128, + "grad_norm": 0.690655529499054, + "learning_rate": 0.0009118767507002801, + "loss": 0.4268, + "step": 3252 + }, + { + "epoch": 1.8173184357541898, + "grad_norm": 0.9002646803855896, + "learning_rate": 0.0009118487394957984, + "loss": 0.4873, + "step": 3253 + }, + { + "epoch": 1.817877094972067, + "grad_norm": 0.3966941237449646, + "learning_rate": 0.0009118207282913166, + "loss": 0.5805, + "step": 3254 + }, + { + "epoch": 1.8184357541899443, + "grad_norm": 0.5643709897994995, + "learning_rate": 0.0009117927170868347, + "loss": 0.4476, + "step": 3255 + }, + { + "epoch": 1.8189944134078213, + "grad_norm": 0.5275784730911255, + "learning_rate": 0.0009117647058823529, + "loss": 0.5981, + "step": 3256 + }, + { + "epoch": 1.8195530726256983, + "grad_norm": 0.7534781098365784, + "learning_rate": 0.0009117366946778711, + "loss": 0.4925, + "step": 3257 + }, + { + "epoch": 1.8201117318435753, + "grad_norm": 0.5781060457229614, + "learning_rate": 0.0009117086834733894, + "loss": 0.4801, + "step": 3258 + }, + { + "epoch": 1.8206703910614526, + "grad_norm": 0.4545038044452667, + "learning_rate": 0.0009116806722689076, + "loss": 0.5501, + "step": 3259 + }, + { + "epoch": 1.8212290502793296, + "grad_norm": 0.4350161850452423, + "learning_rate": 0.0009116526610644257, + "loss": 0.4169, + "step": 3260 + }, + { + "epoch": 1.8217877094972068, + "grad_norm": 0.964316725730896, + "learning_rate": 0.0009116246498599439, + "loss": 0.5944, + "step": 3261 + }, + { + "epoch": 1.8223463687150838, + "grad_norm": 0.5807787179946899, + "learning_rate": 0.0009115966386554622, + "loss": 0.3721, + "step": 3262 + }, + { + "epoch": 1.8229050279329608, + "grad_norm": 0.5068808197975159, + "learning_rate": 0.0009115686274509805, + "loss": 0.5274, + "step": 3263 + }, + { + "epoch": 1.8234636871508378, + "grad_norm": 0.5355903506278992, + "learning_rate": 0.0009115406162464987, + "loss": 0.49, + "step": 3264 + }, + { + "epoch": 1.824022346368715, + "grad_norm": 0.5871938467025757, + "learning_rate": 0.0009115126050420169, + "loss": 0.5894, + "step": 3265 + }, + { + "epoch": 1.8245810055865923, + "grad_norm": 1.748473882675171, + "learning_rate": 0.000911484593837535, + "loss": 0.5611, + "step": 3266 + }, + { + "epoch": 1.8251396648044693, + "grad_norm": 1.1771451234817505, + "learning_rate": 0.0009114565826330532, + "loss": 0.5533, + "step": 3267 + }, + { + "epoch": 1.8256983240223463, + "grad_norm": 0.746534526348114, + "learning_rate": 0.0009114285714285715, + "loss": 0.5372, + "step": 3268 + }, + { + "epoch": 1.8262569832402233, + "grad_norm": 1.1431599855422974, + "learning_rate": 0.0009114005602240897, + "loss": 0.4518, + "step": 3269 + }, + { + "epoch": 1.8268156424581006, + "grad_norm": 0.645499050617218, + "learning_rate": 0.0009113725490196079, + "loss": 0.4456, + "step": 3270 + }, + { + "epoch": 1.8273743016759778, + "grad_norm": 0.4873228371143341, + "learning_rate": 0.000911344537815126, + "loss": 0.5993, + "step": 3271 + }, + { + "epoch": 1.8279329608938548, + "grad_norm": 0.6083270907402039, + "learning_rate": 0.0009113165266106442, + "loss": 0.4335, + "step": 3272 + }, + { + "epoch": 1.8284916201117318, + "grad_norm": 0.5168682336807251, + "learning_rate": 0.0009112885154061625, + "loss": 0.5144, + "step": 3273 + }, + { + "epoch": 1.8290502793296088, + "grad_norm": 0.43728867173194885, + "learning_rate": 0.0009112605042016807, + "loss": 0.4251, + "step": 3274 + }, + { + "epoch": 1.829608938547486, + "grad_norm": 0.7454100847244263, + "learning_rate": 0.0009112324929971989, + "loss": 0.5582, + "step": 3275 + }, + { + "epoch": 1.830167597765363, + "grad_norm": 0.5789195895195007, + "learning_rate": 0.000911204481792717, + "loss": 0.4246, + "step": 3276 + }, + { + "epoch": 1.8307262569832403, + "grad_norm": 0.6177477240562439, + "learning_rate": 0.0009111764705882352, + "loss": 0.5446, + "step": 3277 + }, + { + "epoch": 1.8312849162011173, + "grad_norm": 0.5867534279823303, + "learning_rate": 0.0009111484593837536, + "loss": 0.4863, + "step": 3278 + }, + { + "epoch": 1.8318435754189943, + "grad_norm": 0.5285478234291077, + "learning_rate": 0.0009111204481792718, + "loss": 0.3922, + "step": 3279 + }, + { + "epoch": 1.8324022346368714, + "grad_norm": 0.6765609383583069, + "learning_rate": 0.00091109243697479, + "loss": 0.6008, + "step": 3280 + }, + { + "epoch": 1.8329608938547486, + "grad_norm": 0.39646437764167786, + "learning_rate": 0.0009110644257703082, + "loss": 0.4554, + "step": 3281 + }, + { + "epoch": 1.8335195530726258, + "grad_norm": 3.053067445755005, + "learning_rate": 0.0009110364145658263, + "loss": 0.4177, + "step": 3282 + }, + { + "epoch": 1.8340782122905028, + "grad_norm": 1.0648759603500366, + "learning_rate": 0.0009110084033613446, + "loss": 0.4587, + "step": 3283 + }, + { + "epoch": 1.8346368715083798, + "grad_norm": 0.6261885166168213, + "learning_rate": 0.0009109803921568628, + "loss": 0.5741, + "step": 3284 + }, + { + "epoch": 1.8351955307262569, + "grad_norm": 0.7103239297866821, + "learning_rate": 0.000910952380952381, + "loss": 0.4021, + "step": 3285 + }, + { + "epoch": 1.835754189944134, + "grad_norm": 3.89915132522583, + "learning_rate": 0.0009109243697478992, + "loss": 0.4802, + "step": 3286 + }, + { + "epoch": 1.8363128491620113, + "grad_norm": 1.1798536777496338, + "learning_rate": 0.0009108963585434173, + "loss": 0.6326, + "step": 3287 + }, + { + "epoch": 1.8368715083798883, + "grad_norm": 0.6962305903434753, + "learning_rate": 0.0009108683473389356, + "loss": 0.4404, + "step": 3288 + }, + { + "epoch": 1.8374301675977653, + "grad_norm": 0.4706648588180542, + "learning_rate": 0.0009108403361344538, + "loss": 0.5461, + "step": 3289 + }, + { + "epoch": 1.8379888268156424, + "grad_norm": 5.161343574523926, + "learning_rate": 0.000910812324929972, + "loss": 0.4162, + "step": 3290 + }, + { + "epoch": 1.8385474860335196, + "grad_norm": 1.0783441066741943, + "learning_rate": 0.0009107843137254902, + "loss": 0.5888, + "step": 3291 + }, + { + "epoch": 1.8391061452513966, + "grad_norm": 2.5199368000030518, + "learning_rate": 0.0009107563025210083, + "loss": 0.4219, + "step": 3292 + }, + { + "epoch": 1.8396648044692738, + "grad_norm": 0.6675357818603516, + "learning_rate": 0.0009107282913165266, + "loss": 0.4912, + "step": 3293 + }, + { + "epoch": 1.8402234636871508, + "grad_norm": 0.842688262462616, + "learning_rate": 0.0009107002801120449, + "loss": 0.5007, + "step": 3294 + }, + { + "epoch": 1.8407821229050279, + "grad_norm": 0.9746347069740295, + "learning_rate": 0.0009106722689075631, + "loss": 0.5165, + "step": 3295 + }, + { + "epoch": 1.8413407821229049, + "grad_norm": 0.7330273389816284, + "learning_rate": 0.0009106442577030813, + "loss": 0.5351, + "step": 3296 + }, + { + "epoch": 1.841899441340782, + "grad_norm": 0.4980505406856537, + "learning_rate": 0.0009106162464985995, + "loss": 0.4395, + "step": 3297 + }, + { + "epoch": 1.8424581005586593, + "grad_norm": 0.6395367383956909, + "learning_rate": 0.0009105882352941177, + "loss": 0.4772, + "step": 3298 + }, + { + "epoch": 1.8430167597765363, + "grad_norm": 0.4734107553958893, + "learning_rate": 0.0009105602240896359, + "loss": 0.446, + "step": 3299 + }, + { + "epoch": 1.8435754189944134, + "grad_norm": 0.6829903721809387, + "learning_rate": 0.0009105322128851541, + "loss": 0.4506, + "step": 3300 + }, + { + "epoch": 1.8441340782122904, + "grad_norm": 4.157322406768799, + "learning_rate": 0.0009105042016806723, + "loss": 0.5003, + "step": 3301 + }, + { + "epoch": 1.8446927374301676, + "grad_norm": 1.6932682991027832, + "learning_rate": 0.0009104761904761905, + "loss": 0.4878, + "step": 3302 + }, + { + "epoch": 1.8452513966480448, + "grad_norm": 0.6642430424690247, + "learning_rate": 0.0009104481792717087, + "loss": 0.3785, + "step": 3303 + }, + { + "epoch": 1.8458100558659218, + "grad_norm": 0.5883099436759949, + "learning_rate": 0.0009104201680672269, + "loss": 0.4769, + "step": 3304 + }, + { + "epoch": 1.8463687150837989, + "grad_norm": 0.6421769261360168, + "learning_rate": 0.0009103921568627451, + "loss": 0.5527, + "step": 3305 + }, + { + "epoch": 1.8469273743016759, + "grad_norm": 0.6536919474601746, + "learning_rate": 0.0009103641456582633, + "loss": 0.465, + "step": 3306 + }, + { + "epoch": 1.847486033519553, + "grad_norm": 1.170295238494873, + "learning_rate": 0.0009103361344537815, + "loss": 0.5485, + "step": 3307 + }, + { + "epoch": 1.8480446927374303, + "grad_norm": 0.5307703018188477, + "learning_rate": 0.0009103081232492998, + "loss": 0.365, + "step": 3308 + }, + { + "epoch": 1.8486033519553073, + "grad_norm": 0.5870599746704102, + "learning_rate": 0.0009102801120448179, + "loss": 0.3947, + "step": 3309 + }, + { + "epoch": 1.8491620111731844, + "grad_norm": 1.0016573667526245, + "learning_rate": 0.0009102521008403361, + "loss": 0.5521, + "step": 3310 + }, + { + "epoch": 1.8497206703910614, + "grad_norm": 2.484774112701416, + "learning_rate": 0.0009102240896358544, + "loss": 0.5415, + "step": 3311 + }, + { + "epoch": 1.8502793296089386, + "grad_norm": 0.8974372744560242, + "learning_rate": 0.0009101960784313726, + "loss": 0.4474, + "step": 3312 + }, + { + "epoch": 1.8508379888268156, + "grad_norm": 24.924850463867188, + "learning_rate": 0.0009101680672268909, + "loss": 0.5219, + "step": 3313 + }, + { + "epoch": 1.8513966480446928, + "grad_norm": 0.6826589107513428, + "learning_rate": 0.000910140056022409, + "loss": 0.4647, + "step": 3314 + }, + { + "epoch": 1.8519553072625698, + "grad_norm": 0.7049552798271179, + "learning_rate": 0.0009101120448179272, + "loss": 0.6995, + "step": 3315 + }, + { + "epoch": 1.8525139664804469, + "grad_norm": 0.6224097013473511, + "learning_rate": 0.0009100840336134454, + "loss": 0.5642, + "step": 3316 + }, + { + "epoch": 1.8530726256983239, + "grad_norm": 0.4712682068347931, + "learning_rate": 0.0009100560224089636, + "loss": 0.4338, + "step": 3317 + }, + { + "epoch": 1.853631284916201, + "grad_norm": 0.5854752063751221, + "learning_rate": 0.0009100280112044819, + "loss": 0.3791, + "step": 3318 + }, + { + "epoch": 1.8541899441340783, + "grad_norm": 1.2928085327148438, + "learning_rate": 0.00091, + "loss": 0.4136, + "step": 3319 + }, + { + "epoch": 1.8547486033519553, + "grad_norm": 3.0672287940979004, + "learning_rate": 0.0009099719887955182, + "loss": 0.4559, + "step": 3320 + }, + { + "epoch": 1.8553072625698324, + "grad_norm": 0.8925442695617676, + "learning_rate": 0.0009099439775910364, + "loss": 0.5204, + "step": 3321 + }, + { + "epoch": 1.8558659217877094, + "grad_norm": 0.5389899611473083, + "learning_rate": 0.0009099159663865546, + "loss": 0.4773, + "step": 3322 + }, + { + "epoch": 1.8564245810055866, + "grad_norm": 0.4795750379562378, + "learning_rate": 0.0009098879551820729, + "loss": 0.457, + "step": 3323 + }, + { + "epoch": 1.8569832402234638, + "grad_norm": 2.4752981662750244, + "learning_rate": 0.0009098599439775911, + "loss": 0.6095, + "step": 3324 + }, + { + "epoch": 1.8575418994413408, + "grad_norm": 0.7278405427932739, + "learning_rate": 0.0009098319327731092, + "loss": 0.4303, + "step": 3325 + }, + { + "epoch": 1.8581005586592179, + "grad_norm": 2.557514190673828, + "learning_rate": 0.0009098039215686274, + "loss": 0.4835, + "step": 3326 + }, + { + "epoch": 1.8586592178770949, + "grad_norm": 0.7001586556434631, + "learning_rate": 0.0009097759103641456, + "loss": 0.5267, + "step": 3327 + }, + { + "epoch": 1.859217877094972, + "grad_norm": 0.5900753736495972, + "learning_rate": 0.000909747899159664, + "loss": 0.5004, + "step": 3328 + }, + { + "epoch": 1.8597765363128491, + "grad_norm": 10.75402545928955, + "learning_rate": 0.0009097198879551822, + "loss": 0.4819, + "step": 3329 + }, + { + "epoch": 1.8603351955307263, + "grad_norm": 1.0458437204360962, + "learning_rate": 0.0009096918767507003, + "loss": 0.5626, + "step": 3330 + }, + { + "epoch": 1.8608938547486034, + "grad_norm": 0.5565247535705566, + "learning_rate": 0.0009096638655462185, + "loss": 0.4633, + "step": 3331 + }, + { + "epoch": 1.8614525139664804, + "grad_norm": 0.6101935505867004, + "learning_rate": 0.0009096358543417367, + "loss": 0.3707, + "step": 3332 + }, + { + "epoch": 1.8620111731843574, + "grad_norm": 1.3918886184692383, + "learning_rate": 0.000909607843137255, + "loss": 0.5561, + "step": 3333 + }, + { + "epoch": 1.8625698324022346, + "grad_norm": 1.9468659162521362, + "learning_rate": 0.0009095798319327732, + "loss": 0.4529, + "step": 3334 + }, + { + "epoch": 1.8631284916201118, + "grad_norm": 0.4596400260925293, + "learning_rate": 0.0009095518207282913, + "loss": 0.4397, + "step": 3335 + }, + { + "epoch": 1.8636871508379889, + "grad_norm": 0.6582339406013489, + "learning_rate": 0.0009095238095238095, + "loss": 0.4956, + "step": 3336 + }, + { + "epoch": 1.8642458100558659, + "grad_norm": 0.6420575380325317, + "learning_rate": 0.0009094957983193277, + "loss": 0.5357, + "step": 3337 + }, + { + "epoch": 1.8648044692737429, + "grad_norm": 0.6272958517074585, + "learning_rate": 0.000909467787114846, + "loss": 0.386, + "step": 3338 + }, + { + "epoch": 1.8653631284916201, + "grad_norm": 0.606256902217865, + "learning_rate": 0.0009094397759103642, + "loss": 0.4967, + "step": 3339 + }, + { + "epoch": 1.8659217877094973, + "grad_norm": 0.652633786201477, + "learning_rate": 0.0009094117647058824, + "loss": 0.4033, + "step": 3340 + }, + { + "epoch": 1.8664804469273744, + "grad_norm": 0.33578285574913025, + "learning_rate": 0.0009093837535014005, + "loss": 0.3376, + "step": 3341 + }, + { + "epoch": 1.8670391061452514, + "grad_norm": 0.5056957006454468, + "learning_rate": 0.0009093557422969187, + "loss": 0.5379, + "step": 3342 + }, + { + "epoch": 1.8675977653631284, + "grad_norm": 0.40216267108917236, + "learning_rate": 0.0009093277310924369, + "loss": 0.4526, + "step": 3343 + }, + { + "epoch": 1.8681564245810056, + "grad_norm": 3.4325509071350098, + "learning_rate": 0.0009092997198879553, + "loss": 0.5134, + "step": 3344 + }, + { + "epoch": 1.8687150837988828, + "grad_norm": 0.6396369338035583, + "learning_rate": 0.0009092717086834735, + "loss": 0.6644, + "step": 3345 + }, + { + "epoch": 1.8692737430167599, + "grad_norm": 0.8216027617454529, + "learning_rate": 0.0009092436974789916, + "loss": 0.4511, + "step": 3346 + }, + { + "epoch": 1.8698324022346369, + "grad_norm": 0.6777360439300537, + "learning_rate": 0.0009092156862745098, + "loss": 0.5113, + "step": 3347 + }, + { + "epoch": 1.8703910614525139, + "grad_norm": 0.6325851678848267, + "learning_rate": 0.000909187675070028, + "loss": 0.5297, + "step": 3348 + }, + { + "epoch": 1.8709497206703911, + "grad_norm": 1.1637097597122192, + "learning_rate": 0.0009091596638655463, + "loss": 0.421, + "step": 3349 + }, + { + "epoch": 1.8715083798882681, + "grad_norm": 0.5719764232635498, + "learning_rate": 0.0009091316526610645, + "loss": 0.4587, + "step": 3350 + }, + { + "epoch": 1.8720670391061454, + "grad_norm": 0.568630576133728, + "learning_rate": 0.0009091036414565826, + "loss": 0.4217, + "step": 3351 + }, + { + "epoch": 1.8726256983240224, + "grad_norm": 0.706224799156189, + "learning_rate": 0.0009090756302521008, + "loss": 0.4824, + "step": 3352 + }, + { + "epoch": 1.8731843575418994, + "grad_norm": 1.231709361076355, + "learning_rate": 0.000909047619047619, + "loss": 0.4946, + "step": 3353 + }, + { + "epoch": 1.8737430167597764, + "grad_norm": 0.5383389592170715, + "learning_rate": 0.0009090196078431373, + "loss": 0.5246, + "step": 3354 + }, + { + "epoch": 1.8743016759776536, + "grad_norm": 0.45403286814689636, + "learning_rate": 0.0009089915966386555, + "loss": 0.4279, + "step": 3355 + }, + { + "epoch": 1.8748603351955309, + "grad_norm": 0.4688047766685486, + "learning_rate": 0.0009089635854341737, + "loss": 0.3409, + "step": 3356 + }, + { + "epoch": 1.8754189944134079, + "grad_norm": 0.7380261421203613, + "learning_rate": 0.0009089355742296918, + "loss": 0.4302, + "step": 3357 + }, + { + "epoch": 1.8759776536312849, + "grad_norm": 1.5678224563598633, + "learning_rate": 0.00090890756302521, + "loss": 0.445, + "step": 3358 + }, + { + "epoch": 1.876536312849162, + "grad_norm": 0.8303191661834717, + "learning_rate": 0.0009088795518207283, + "loss": 0.4072, + "step": 3359 + }, + { + "epoch": 1.8770949720670391, + "grad_norm": 8.348840713500977, + "learning_rate": 0.0009088515406162466, + "loss": 0.5845, + "step": 3360 + }, + { + "epoch": 1.8776536312849164, + "grad_norm": 1.5485373735427856, + "learning_rate": 0.0009088235294117648, + "loss": 0.4745, + "step": 3361 + }, + { + "epoch": 1.8782122905027934, + "grad_norm": 1.3296009302139282, + "learning_rate": 0.0009087955182072829, + "loss": 0.5202, + "step": 3362 + }, + { + "epoch": 1.8787709497206704, + "grad_norm": 0.43270382285118103, + "learning_rate": 0.0009087675070028011, + "loss": 0.4692, + "step": 3363 + }, + { + "epoch": 1.8793296089385474, + "grad_norm": 0.46897760033607483, + "learning_rate": 0.0009087394957983194, + "loss": 0.5073, + "step": 3364 + }, + { + "epoch": 1.8798882681564246, + "grad_norm": 0.38963064551353455, + "learning_rate": 0.0009087114845938376, + "loss": 0.4243, + "step": 3365 + }, + { + "epoch": 1.8804469273743016, + "grad_norm": 0.6272234320640564, + "learning_rate": 0.0009086834733893558, + "loss": 0.4974, + "step": 3366 + }, + { + "epoch": 1.8810055865921789, + "grad_norm": 0.8900682330131531, + "learning_rate": 0.0009086554621848739, + "loss": 0.42, + "step": 3367 + }, + { + "epoch": 1.8815642458100559, + "grad_norm": 0.6141631603240967, + "learning_rate": 0.0009086274509803921, + "loss": 0.5032, + "step": 3368 + }, + { + "epoch": 1.882122905027933, + "grad_norm": 0.5407202839851379, + "learning_rate": 0.0009085994397759104, + "loss": 0.4056, + "step": 3369 + }, + { + "epoch": 1.88268156424581, + "grad_norm": 0.6230933666229248, + "learning_rate": 0.0009085714285714286, + "loss": 0.6713, + "step": 3370 + }, + { + "epoch": 1.8832402234636871, + "grad_norm": 0.6125653386116028, + "learning_rate": 0.0009085434173669468, + "loss": 0.4475, + "step": 3371 + }, + { + "epoch": 1.8837988826815644, + "grad_norm": 1.4241896867752075, + "learning_rate": 0.000908515406162465, + "loss": 0.4833, + "step": 3372 + }, + { + "epoch": 1.8843575418994414, + "grad_norm": 0.5077396631240845, + "learning_rate": 0.0009084873949579831, + "loss": 0.462, + "step": 3373 + }, + { + "epoch": 1.8849162011173184, + "grad_norm": 0.4441854953765869, + "learning_rate": 0.0009084593837535014, + "loss": 0.4636, + "step": 3374 + }, + { + "epoch": 1.8854748603351954, + "grad_norm": 1.3085445165634155, + "learning_rate": 0.0009084313725490196, + "loss": 0.4555, + "step": 3375 + }, + { + "epoch": 1.8860335195530726, + "grad_norm": 0.6074004769325256, + "learning_rate": 0.0009084033613445379, + "loss": 0.4144, + "step": 3376 + }, + { + "epoch": 1.8865921787709499, + "grad_norm": 0.5048871636390686, + "learning_rate": 0.0009083753501400561, + "loss": 0.4632, + "step": 3377 + }, + { + "epoch": 1.8871508379888269, + "grad_norm": 0.586675763130188, + "learning_rate": 0.0009083473389355742, + "loss": 0.5268, + "step": 3378 + }, + { + "epoch": 1.887709497206704, + "grad_norm": 0.48384320735931396, + "learning_rate": 0.0009083193277310925, + "loss": 0.4257, + "step": 3379 + }, + { + "epoch": 1.888268156424581, + "grad_norm": 0.5696973204612732, + "learning_rate": 0.0009082913165266107, + "loss": 0.3715, + "step": 3380 + }, + { + "epoch": 1.8888268156424581, + "grad_norm": 0.47410765290260315, + "learning_rate": 0.0009082633053221289, + "loss": 0.392, + "step": 3381 + }, + { + "epoch": 1.8893854748603351, + "grad_norm": 1.4106309413909912, + "learning_rate": 0.0009082352941176471, + "loss": 0.5432, + "step": 3382 + }, + { + "epoch": 1.8899441340782124, + "grad_norm": 3.170473575592041, + "learning_rate": 0.0009082072829131652, + "loss": 0.3659, + "step": 3383 + }, + { + "epoch": 1.8905027932960894, + "grad_norm": 0.5562348365783691, + "learning_rate": 0.0009081792717086835, + "loss": 0.5037, + "step": 3384 + }, + { + "epoch": 1.8910614525139664, + "grad_norm": 0.6246789693832397, + "learning_rate": 0.0009081512605042017, + "loss": 0.4931, + "step": 3385 + }, + { + "epoch": 1.8916201117318434, + "grad_norm": 0.6318274140357971, + "learning_rate": 0.0009081232492997199, + "loss": 0.4445, + "step": 3386 + }, + { + "epoch": 1.8921787709497206, + "grad_norm": 0.9648261070251465, + "learning_rate": 0.0009080952380952381, + "loss": 0.4032, + "step": 3387 + }, + { + "epoch": 1.8927374301675979, + "grad_norm": 0.51225745677948, + "learning_rate": 0.0009080672268907563, + "loss": 0.4821, + "step": 3388 + }, + { + "epoch": 1.893296089385475, + "grad_norm": 0.4842418134212494, + "learning_rate": 0.0009080392156862745, + "loss": 0.4827, + "step": 3389 + }, + { + "epoch": 1.893854748603352, + "grad_norm": 0.5580561757087708, + "learning_rate": 0.0009080112044817927, + "loss": 0.5639, + "step": 3390 + }, + { + "epoch": 1.894413407821229, + "grad_norm": 0.42107322812080383, + "learning_rate": 0.0009079831932773109, + "loss": 0.5225, + "step": 3391 + }, + { + "epoch": 1.8949720670391061, + "grad_norm": 0.8365760445594788, + "learning_rate": 0.0009079551820728291, + "loss": 0.386, + "step": 3392 + }, + { + "epoch": 1.8955307262569834, + "grad_norm": 0.5805233120918274, + "learning_rate": 0.0009079271708683474, + "loss": 0.4543, + "step": 3393 + }, + { + "epoch": 1.8960893854748604, + "grad_norm": 0.4055412709712982, + "learning_rate": 0.0009078991596638656, + "loss": 0.4576, + "step": 3394 + }, + { + "epoch": 1.8966480446927374, + "grad_norm": 0.714622437953949, + "learning_rate": 0.0009078711484593838, + "loss": 0.5736, + "step": 3395 + }, + { + "epoch": 1.8972067039106144, + "grad_norm": 0.9913580417633057, + "learning_rate": 0.000907843137254902, + "loss": 0.4861, + "step": 3396 + }, + { + "epoch": 1.8977653631284916, + "grad_norm": 0.5982085466384888, + "learning_rate": 0.0009078151260504202, + "loss": 0.4136, + "step": 3397 + }, + { + "epoch": 1.8983240223463689, + "grad_norm": 0.389060914516449, + "learning_rate": 0.0009077871148459384, + "loss": 0.3704, + "step": 3398 + }, + { + "epoch": 1.8988826815642459, + "grad_norm": 0.47859811782836914, + "learning_rate": 0.0009077591036414566, + "loss": 0.4942, + "step": 3399 + }, + { + "epoch": 1.899441340782123, + "grad_norm": 0.519008994102478, + "learning_rate": 0.0009077310924369748, + "loss": 0.4471, + "step": 3400 + }, + { + "epoch": 1.9, + "grad_norm": 0.4630208909511566, + "learning_rate": 0.000907703081232493, + "loss": 0.4148, + "step": 3401 + }, + { + "epoch": 1.9005586592178771, + "grad_norm": 1.0838171243667603, + "learning_rate": 0.0009076750700280112, + "loss": 0.589, + "step": 3402 + }, + { + "epoch": 1.9011173184357542, + "grad_norm": 0.6444001793861389, + "learning_rate": 0.0009076470588235294, + "loss": 0.4501, + "step": 3403 + }, + { + "epoch": 1.9016759776536314, + "grad_norm": 0.4720785319805145, + "learning_rate": 0.0009076190476190477, + "loss": 0.515, + "step": 3404 + }, + { + "epoch": 1.9022346368715084, + "grad_norm": 1.9518955945968628, + "learning_rate": 0.0009075910364145658, + "loss": 0.4045, + "step": 3405 + }, + { + "epoch": 1.9027932960893854, + "grad_norm": 0.7964692115783691, + "learning_rate": 0.000907563025210084, + "loss": 0.4541, + "step": 3406 + }, + { + "epoch": 1.9033519553072624, + "grad_norm": 0.9589069485664368, + "learning_rate": 0.0009075350140056022, + "loss": 0.6191, + "step": 3407 + }, + { + "epoch": 1.9039106145251397, + "grad_norm": 0.45958298444747925, + "learning_rate": 0.0009075070028011204, + "loss": 0.4787, + "step": 3408 + }, + { + "epoch": 1.9044692737430169, + "grad_norm": 0.4079684317111969, + "learning_rate": 0.0009074789915966388, + "loss": 0.4345, + "step": 3409 + }, + { + "epoch": 1.905027932960894, + "grad_norm": 0.5356015563011169, + "learning_rate": 0.0009074509803921569, + "loss": 0.3998, + "step": 3410 + }, + { + "epoch": 1.905586592178771, + "grad_norm": 0.6153750419616699, + "learning_rate": 0.0009074229691876751, + "loss": 0.4284, + "step": 3411 + }, + { + "epoch": 1.906145251396648, + "grad_norm": 0.7890805006027222, + "learning_rate": 0.0009073949579831933, + "loss": 0.402, + "step": 3412 + }, + { + "epoch": 1.9067039106145252, + "grad_norm": 0.4278022348880768, + "learning_rate": 0.0009073669467787115, + "loss": 0.4263, + "step": 3413 + }, + { + "epoch": 1.9072625698324024, + "grad_norm": 0.6270452737808228, + "learning_rate": 0.0009073389355742298, + "loss": 0.4042, + "step": 3414 + }, + { + "epoch": 1.9078212290502794, + "grad_norm": 1.2224963903427124, + "learning_rate": 0.0009073109243697479, + "loss": 0.4413, + "step": 3415 + }, + { + "epoch": 1.9083798882681564, + "grad_norm": 0.6034187078475952, + "learning_rate": 0.0009072829131652661, + "loss": 0.4356, + "step": 3416 + }, + { + "epoch": 1.9089385474860334, + "grad_norm": 1.150604248046875, + "learning_rate": 0.0009072549019607843, + "loss": 0.6877, + "step": 3417 + }, + { + "epoch": 1.9094972067039107, + "grad_norm": 0.5894142985343933, + "learning_rate": 0.0009072268907563025, + "loss": 0.5169, + "step": 3418 + }, + { + "epoch": 1.9100558659217877, + "grad_norm": 0.7291961312294006, + "learning_rate": 0.0009071988795518208, + "loss": 0.6585, + "step": 3419 + }, + { + "epoch": 1.910614525139665, + "grad_norm": 5.241934776306152, + "learning_rate": 0.000907170868347339, + "loss": 0.4698, + "step": 3420 + }, + { + "epoch": 1.911173184357542, + "grad_norm": 0.6813094019889832, + "learning_rate": 0.0009071428571428571, + "loss": 0.4977, + "step": 3421 + }, + { + "epoch": 1.911731843575419, + "grad_norm": 0.7271748781204224, + "learning_rate": 0.0009071148459383753, + "loss": 0.377, + "step": 3422 + }, + { + "epoch": 1.912290502793296, + "grad_norm": 0.7742030024528503, + "learning_rate": 0.0009070868347338935, + "loss": 0.48, + "step": 3423 + }, + { + "epoch": 1.9128491620111732, + "grad_norm": 0.5869528651237488, + "learning_rate": 0.0009070588235294118, + "loss": 0.4957, + "step": 3424 + }, + { + "epoch": 1.9134078212290504, + "grad_norm": 0.7639419436454773, + "learning_rate": 0.00090703081232493, + "loss": 0.4396, + "step": 3425 + }, + { + "epoch": 1.9139664804469274, + "grad_norm": 0.6555896997451782, + "learning_rate": 0.0009070028011204482, + "loss": 0.4396, + "step": 3426 + }, + { + "epoch": 1.9145251396648044, + "grad_norm": 0.691922664642334, + "learning_rate": 0.0009069747899159664, + "loss": 0.4835, + "step": 3427 + }, + { + "epoch": 1.9150837988826814, + "grad_norm": 0.4975016415119171, + "learning_rate": 0.0009069467787114846, + "loss": 0.4799, + "step": 3428 + }, + { + "epoch": 1.9156424581005587, + "grad_norm": 2.0976202487945557, + "learning_rate": 0.0009069187675070029, + "loss": 0.4976, + "step": 3429 + }, + { + "epoch": 1.916201117318436, + "grad_norm": 0.48069363832473755, + "learning_rate": 0.0009068907563025211, + "loss": 0.5978, + "step": 3430 + }, + { + "epoch": 1.916759776536313, + "grad_norm": 0.6229406595230103, + "learning_rate": 0.0009068627450980392, + "loss": 0.5495, + "step": 3431 + }, + { + "epoch": 1.91731843575419, + "grad_norm": 0.7944614291191101, + "learning_rate": 0.0009068347338935574, + "loss": 0.471, + "step": 3432 + }, + { + "epoch": 1.917877094972067, + "grad_norm": 0.5190448760986328, + "learning_rate": 0.0009068067226890756, + "loss": 0.5354, + "step": 3433 + }, + { + "epoch": 1.9184357541899442, + "grad_norm": 1.036592721939087, + "learning_rate": 0.0009067787114845939, + "loss": 0.4768, + "step": 3434 + }, + { + "epoch": 1.9189944134078212, + "grad_norm": 0.7237743735313416, + "learning_rate": 0.0009067507002801121, + "loss": 0.6533, + "step": 3435 + }, + { + "epoch": 1.9195530726256984, + "grad_norm": 0.7009897828102112, + "learning_rate": 0.0009067226890756303, + "loss": 0.5783, + "step": 3436 + }, + { + "epoch": 1.9201117318435754, + "grad_norm": 0.5043143630027771, + "learning_rate": 0.0009066946778711484, + "loss": 0.4794, + "step": 3437 + }, + { + "epoch": 1.9206703910614524, + "grad_norm": 0.4626138508319855, + "learning_rate": 0.0009066666666666666, + "loss": 0.5044, + "step": 3438 + }, + { + "epoch": 1.9212290502793297, + "grad_norm": 0.7077670693397522, + "learning_rate": 0.0009066386554621849, + "loss": 0.4474, + "step": 3439 + }, + { + "epoch": 1.9217877094972067, + "grad_norm": 0.7444259524345398, + "learning_rate": 0.0009066106442577031, + "loss": 0.519, + "step": 3440 + }, + { + "epoch": 1.922346368715084, + "grad_norm": 0.3730005919933319, + "learning_rate": 0.0009065826330532213, + "loss": 0.4742, + "step": 3441 + }, + { + "epoch": 1.922905027932961, + "grad_norm": 0.39288267493247986, + "learning_rate": 0.0009065546218487394, + "loss": 0.5099, + "step": 3442 + }, + { + "epoch": 1.923463687150838, + "grad_norm": 0.37690800428390503, + "learning_rate": 0.0009065266106442577, + "loss": 0.3998, + "step": 3443 + }, + { + "epoch": 1.924022346368715, + "grad_norm": 0.6762821674346924, + "learning_rate": 0.000906498599439776, + "loss": 0.4353, + "step": 3444 + }, + { + "epoch": 1.9245810055865922, + "grad_norm": 0.6250193119049072, + "learning_rate": 0.0009064705882352942, + "loss": 0.4794, + "step": 3445 + }, + { + "epoch": 1.9251396648044694, + "grad_norm": 0.6170416474342346, + "learning_rate": 0.0009064425770308124, + "loss": 0.5364, + "step": 3446 + }, + { + "epoch": 1.9256983240223464, + "grad_norm": 2.247410297393799, + "learning_rate": 0.0009064145658263305, + "loss": 0.5995, + "step": 3447 + }, + { + "epoch": 1.9262569832402234, + "grad_norm": 1.9219515323638916, + "learning_rate": 0.0009063865546218487, + "loss": 0.5276, + "step": 3448 + }, + { + "epoch": 1.9268156424581004, + "grad_norm": 1.4085333347320557, + "learning_rate": 0.000906358543417367, + "loss": 0.4371, + "step": 3449 + }, + { + "epoch": 1.9273743016759777, + "grad_norm": 0.7772863507270813, + "learning_rate": 0.0009063305322128852, + "loss": 0.5818, + "step": 3450 + }, + { + "epoch": 1.927932960893855, + "grad_norm": 0.627034604549408, + "learning_rate": 0.0009063025210084034, + "loss": 0.442, + "step": 3451 + }, + { + "epoch": 1.928491620111732, + "grad_norm": 0.6423159241676331, + "learning_rate": 0.0009062745098039216, + "loss": 0.5428, + "step": 3452 + }, + { + "epoch": 1.929050279329609, + "grad_norm": 0.6216955780982971, + "learning_rate": 0.0009062464985994397, + "loss": 0.477, + "step": 3453 + }, + { + "epoch": 1.929608938547486, + "grad_norm": 0.46773308515548706, + "learning_rate": 0.000906218487394958, + "loss": 0.4536, + "step": 3454 + }, + { + "epoch": 1.9301675977653632, + "grad_norm": 0.6190614700317383, + "learning_rate": 0.0009061904761904762, + "loss": 0.52, + "step": 3455 + }, + { + "epoch": 1.9307262569832402, + "grad_norm": 0.5048285722732544, + "learning_rate": 0.0009061624649859944, + "loss": 0.4504, + "step": 3456 + }, + { + "epoch": 1.9312849162011174, + "grad_norm": 0.639789342880249, + "learning_rate": 0.0009061344537815126, + "loss": 0.5773, + "step": 3457 + }, + { + "epoch": 1.9318435754189944, + "grad_norm": 0.7518055438995361, + "learning_rate": 0.0009061064425770307, + "loss": 0.6785, + "step": 3458 + }, + { + "epoch": 1.9324022346368714, + "grad_norm": 0.5717806220054626, + "learning_rate": 0.0009060784313725491, + "loss": 0.5382, + "step": 3459 + }, + { + "epoch": 1.9329608938547485, + "grad_norm": 0.8811807632446289, + "learning_rate": 0.0009060504201680673, + "loss": 0.4675, + "step": 3460 + }, + { + "epoch": 1.9335195530726257, + "grad_norm": 0.39842790365219116, + "learning_rate": 0.0009060224089635855, + "loss": 0.3666, + "step": 3461 + }, + { + "epoch": 1.934078212290503, + "grad_norm": 0.8868865966796875, + "learning_rate": 0.0009059943977591037, + "loss": 0.4776, + "step": 3462 + }, + { + "epoch": 1.93463687150838, + "grad_norm": 0.43413469195365906, + "learning_rate": 0.0009059663865546218, + "loss": 0.4532, + "step": 3463 + }, + { + "epoch": 1.935195530726257, + "grad_norm": 1.447930932044983, + "learning_rate": 0.0009059383753501401, + "loss": 0.391, + "step": 3464 + }, + { + "epoch": 1.935754189944134, + "grad_norm": 0.8444411754608154, + "learning_rate": 0.0009059103641456583, + "loss": 0.4676, + "step": 3465 + }, + { + "epoch": 1.9363128491620112, + "grad_norm": 1.0747498273849487, + "learning_rate": 0.0009058823529411765, + "loss": 0.457, + "step": 3466 + }, + { + "epoch": 1.9368715083798884, + "grad_norm": 0.6661196351051331, + "learning_rate": 0.0009058543417366947, + "loss": 0.5078, + "step": 3467 + }, + { + "epoch": 1.9374301675977654, + "grad_norm": 0.6356555819511414, + "learning_rate": 0.0009058263305322129, + "loss": 0.49, + "step": 3468 + }, + { + "epoch": 1.9379888268156424, + "grad_norm": 0.7038290500640869, + "learning_rate": 0.0009057983193277311, + "loss": 0.5258, + "step": 3469 + }, + { + "epoch": 1.9385474860335195, + "grad_norm": 0.5217005014419556, + "learning_rate": 0.0009057703081232493, + "loss": 0.5581, + "step": 3470 + }, + { + "epoch": 1.9391061452513967, + "grad_norm": 1.912278652191162, + "learning_rate": 0.0009057422969187675, + "loss": 0.5453, + "step": 3471 + }, + { + "epoch": 1.9396648044692737, + "grad_norm": 0.6004762649536133, + "learning_rate": 0.0009057142857142857, + "loss": 0.4522, + "step": 3472 + }, + { + "epoch": 1.940223463687151, + "grad_norm": 0.5349053740501404, + "learning_rate": 0.0009056862745098039, + "loss": 0.5741, + "step": 3473 + }, + { + "epoch": 1.940782122905028, + "grad_norm": 0.5003183484077454, + "learning_rate": 0.0009056582633053221, + "loss": 0.4716, + "step": 3474 + }, + { + "epoch": 1.941340782122905, + "grad_norm": 1.856022596359253, + "learning_rate": 0.0009056302521008404, + "loss": 0.533, + "step": 3475 + }, + { + "epoch": 1.941899441340782, + "grad_norm": 0.43124836683273315, + "learning_rate": 0.0009056022408963586, + "loss": 0.4012, + "step": 3476 + }, + { + "epoch": 1.9424581005586592, + "grad_norm": 0.8166503310203552, + "learning_rate": 0.0009055742296918768, + "loss": 0.5546, + "step": 3477 + }, + { + "epoch": 1.9430167597765364, + "grad_norm": 0.5390708446502686, + "learning_rate": 0.000905546218487395, + "loss": 0.3833, + "step": 3478 + }, + { + "epoch": 1.9435754189944134, + "grad_norm": 0.5894172787666321, + "learning_rate": 0.0009055182072829132, + "loss": 0.5986, + "step": 3479 + }, + { + "epoch": 1.9441340782122905, + "grad_norm": 0.7875383496284485, + "learning_rate": 0.0009054901960784314, + "loss": 0.4902, + "step": 3480 + }, + { + "epoch": 1.9446927374301675, + "grad_norm": 0.431085467338562, + "learning_rate": 0.0009054621848739496, + "loss": 0.3963, + "step": 3481 + }, + { + "epoch": 1.9452513966480447, + "grad_norm": 0.5462560653686523, + "learning_rate": 0.0009054341736694678, + "loss": 0.4574, + "step": 3482 + }, + { + "epoch": 1.945810055865922, + "grad_norm": 0.4532710015773773, + "learning_rate": 0.000905406162464986, + "loss": 0.4163, + "step": 3483 + }, + { + "epoch": 1.946368715083799, + "grad_norm": 0.3987860679626465, + "learning_rate": 0.0009053781512605043, + "loss": 0.327, + "step": 3484 + }, + { + "epoch": 1.946927374301676, + "grad_norm": 5.585010528564453, + "learning_rate": 0.0009053501400560224, + "loss": 0.4719, + "step": 3485 + }, + { + "epoch": 1.947486033519553, + "grad_norm": 2.0112860202789307, + "learning_rate": 0.0009053221288515406, + "loss": 0.4288, + "step": 3486 + }, + { + "epoch": 1.9480446927374302, + "grad_norm": 0.5010934472084045, + "learning_rate": 0.0009052941176470588, + "loss": 0.4974, + "step": 3487 + }, + { + "epoch": 1.9486033519553074, + "grad_norm": 0.6223816275596619, + "learning_rate": 0.000905266106442577, + "loss": 0.4894, + "step": 3488 + }, + { + "epoch": 1.9491620111731844, + "grad_norm": 0.5376791954040527, + "learning_rate": 0.0009052380952380953, + "loss": 0.4926, + "step": 3489 + }, + { + "epoch": 1.9497206703910615, + "grad_norm": 0.563200056552887, + "learning_rate": 0.0009052100840336134, + "loss": 0.3986, + "step": 3490 + }, + { + "epoch": 1.9502793296089385, + "grad_norm": 0.4292796850204468, + "learning_rate": 0.0009051820728291316, + "loss": 0.4133, + "step": 3491 + }, + { + "epoch": 1.9508379888268157, + "grad_norm": 0.5368371605873108, + "learning_rate": 0.0009051540616246499, + "loss": 0.4386, + "step": 3492 + }, + { + "epoch": 1.9513966480446927, + "grad_norm": 0.5037499666213989, + "learning_rate": 0.0009051260504201681, + "loss": 0.4515, + "step": 3493 + }, + { + "epoch": 1.95195530726257, + "grad_norm": 0.7200754284858704, + "learning_rate": 0.0009050980392156864, + "loss": 0.5068, + "step": 3494 + }, + { + "epoch": 1.952513966480447, + "grad_norm": 0.6245931386947632, + "learning_rate": 0.0009050700280112045, + "loss": 0.4626, + "step": 3495 + }, + { + "epoch": 1.953072625698324, + "grad_norm": 0.4852912425994873, + "learning_rate": 0.0009050420168067227, + "loss": 0.5257, + "step": 3496 + }, + { + "epoch": 1.953631284916201, + "grad_norm": 0.4774476885795593, + "learning_rate": 0.0009050140056022409, + "loss": 0.4487, + "step": 3497 + }, + { + "epoch": 1.9541899441340782, + "grad_norm": 1.0392900705337524, + "learning_rate": 0.0009049859943977591, + "loss": 0.4675, + "step": 3498 + }, + { + "epoch": 1.9547486033519554, + "grad_norm": 0.8098852634429932, + "learning_rate": 0.0009049579831932774, + "loss": 0.6607, + "step": 3499 + }, + { + "epoch": 1.9553072625698324, + "grad_norm": 0.5951655507087708, + "learning_rate": 0.0009049299719887956, + "loss": 0.4997, + "step": 3500 + }, + { + "epoch": 1.9553072625698324, + "eval_cer": 0.09693081513971175, + "eval_loss": 0.3673718273639679, + "eval_runtime": 55.528, + "eval_samples_per_second": 81.725, + "eval_steps_per_second": 5.115, + "eval_wer": 0.3797100639646937, + "step": 3500 + }, + { + "epoch": 1.9558659217877095, + "grad_norm": 0.4045039713382721, + "learning_rate": 0.0009049019607843137, + "loss": 0.3696, + "step": 3501 + }, + { + "epoch": 1.9564245810055865, + "grad_norm": 0.48960962891578674, + "learning_rate": 0.0009048739495798319, + "loss": 0.5343, + "step": 3502 + }, + { + "epoch": 1.9569832402234637, + "grad_norm": 0.3607482314109802, + "learning_rate": 0.0009048459383753501, + "loss": 0.4177, + "step": 3503 + }, + { + "epoch": 1.957541899441341, + "grad_norm": 0.5582266449928284, + "learning_rate": 0.0009048179271708684, + "loss": 0.5064, + "step": 3504 + }, + { + "epoch": 1.958100558659218, + "grad_norm": 0.4474537670612335, + "learning_rate": 0.0009047899159663866, + "loss": 0.4405, + "step": 3505 + }, + { + "epoch": 1.958659217877095, + "grad_norm": 0.8795913457870483, + "learning_rate": 0.0009047619047619047, + "loss": 0.4723, + "step": 3506 + }, + { + "epoch": 1.959217877094972, + "grad_norm": 0.42152541875839233, + "learning_rate": 0.0009047338935574229, + "loss": 0.4382, + "step": 3507 + }, + { + "epoch": 1.9597765363128492, + "grad_norm": 0.630465567111969, + "learning_rate": 0.0009047058823529412, + "loss": 0.3956, + "step": 3508 + }, + { + "epoch": 1.9603351955307262, + "grad_norm": 0.38271746039390564, + "learning_rate": 0.0009046778711484595, + "loss": 0.4385, + "step": 3509 + }, + { + "epoch": 1.9608938547486034, + "grad_norm": 0.7205129861831665, + "learning_rate": 0.0009046498599439777, + "loss": 0.5121, + "step": 3510 + }, + { + "epoch": 1.9614525139664805, + "grad_norm": 0.6931906342506409, + "learning_rate": 0.0009046218487394958, + "loss": 0.4694, + "step": 3511 + }, + { + "epoch": 1.9620111731843575, + "grad_norm": 0.5405535101890564, + "learning_rate": 0.000904593837535014, + "loss": 0.5335, + "step": 3512 + }, + { + "epoch": 1.9625698324022345, + "grad_norm": 0.4896290600299835, + "learning_rate": 0.0009045658263305322, + "loss": 0.5676, + "step": 3513 + }, + { + "epoch": 1.9631284916201117, + "grad_norm": 0.6114362478256226, + "learning_rate": 0.0009045378151260505, + "loss": 0.4836, + "step": 3514 + }, + { + "epoch": 1.963687150837989, + "grad_norm": 0.5427936911582947, + "learning_rate": 0.0009045098039215687, + "loss": 0.4397, + "step": 3515 + }, + { + "epoch": 1.964245810055866, + "grad_norm": 0.7454958558082581, + "learning_rate": 0.0009044817927170869, + "loss": 0.4664, + "step": 3516 + }, + { + "epoch": 1.964804469273743, + "grad_norm": 0.5674418807029724, + "learning_rate": 0.000904453781512605, + "loss": 0.417, + "step": 3517 + }, + { + "epoch": 1.96536312849162, + "grad_norm": 0.6599341034889221, + "learning_rate": 0.0009044257703081232, + "loss": 0.6134, + "step": 3518 + }, + { + "epoch": 1.9659217877094972, + "grad_norm": 0.6833198070526123, + "learning_rate": 0.0009043977591036415, + "loss": 0.539, + "step": 3519 + }, + { + "epoch": 1.9664804469273744, + "grad_norm": 1.0619444847106934, + "learning_rate": 0.0009043697478991597, + "loss": 0.596, + "step": 3520 + }, + { + "epoch": 1.9670391061452515, + "grad_norm": 0.6789833903312683, + "learning_rate": 0.0009043417366946779, + "loss": 0.497, + "step": 3521 + }, + { + "epoch": 1.9675977653631285, + "grad_norm": 0.4983786940574646, + "learning_rate": 0.000904313725490196, + "loss": 0.3507, + "step": 3522 + }, + { + "epoch": 1.9681564245810055, + "grad_norm": 0.7860569357872009, + "learning_rate": 0.0009042857142857142, + "loss": 0.4384, + "step": 3523 + }, + { + "epoch": 1.9687150837988827, + "grad_norm": 0.530733585357666, + "learning_rate": 0.0009042577030812326, + "loss": 0.4214, + "step": 3524 + }, + { + "epoch": 1.9692737430167597, + "grad_norm": 0.8622692823410034, + "learning_rate": 0.0009042296918767508, + "loss": 0.5221, + "step": 3525 + }, + { + "epoch": 1.969832402234637, + "grad_norm": 0.9509183168411255, + "learning_rate": 0.000904201680672269, + "loss": 0.5467, + "step": 3526 + }, + { + "epoch": 1.970391061452514, + "grad_norm": 0.5086579918861389, + "learning_rate": 0.0009041736694677871, + "loss": 0.5532, + "step": 3527 + }, + { + "epoch": 1.970949720670391, + "grad_norm": 0.6074703335762024, + "learning_rate": 0.0009041456582633053, + "loss": 0.5062, + "step": 3528 + }, + { + "epoch": 1.971508379888268, + "grad_norm": 1.8037029504776, + "learning_rate": 0.0009041176470588236, + "loss": 0.4707, + "step": 3529 + }, + { + "epoch": 1.9720670391061452, + "grad_norm": 5.210567474365234, + "learning_rate": 0.0009040896358543418, + "loss": 0.6929, + "step": 3530 + }, + { + "epoch": 1.9726256983240225, + "grad_norm": 0.5642329454421997, + "learning_rate": 0.00090406162464986, + "loss": 0.4816, + "step": 3531 + }, + { + "epoch": 1.9731843575418995, + "grad_norm": 0.7043510675430298, + "learning_rate": 0.0009040336134453782, + "loss": 0.458, + "step": 3532 + }, + { + "epoch": 1.9737430167597765, + "grad_norm": 0.5983909964561462, + "learning_rate": 0.0009040056022408963, + "loss": 0.3557, + "step": 3533 + }, + { + "epoch": 1.9743016759776535, + "grad_norm": 0.5334946513175964, + "learning_rate": 0.0009039775910364146, + "loss": 0.5308, + "step": 3534 + }, + { + "epoch": 1.9748603351955307, + "grad_norm": 0.4784345328807831, + "learning_rate": 0.0009039495798319328, + "loss": 0.4847, + "step": 3535 + }, + { + "epoch": 1.975418994413408, + "grad_norm": 0.5475999116897583, + "learning_rate": 0.000903921568627451, + "loss": 0.3819, + "step": 3536 + }, + { + "epoch": 1.975977653631285, + "grad_norm": 0.5399876236915588, + "learning_rate": 0.0009038935574229692, + "loss": 0.3594, + "step": 3537 + }, + { + "epoch": 1.976536312849162, + "grad_norm": 0.6344031691551208, + "learning_rate": 0.0009038655462184873, + "loss": 0.5733, + "step": 3538 + }, + { + "epoch": 1.977094972067039, + "grad_norm": 0.7832716703414917, + "learning_rate": 0.0009038375350140056, + "loss": 0.5387, + "step": 3539 + }, + { + "epoch": 1.9776536312849162, + "grad_norm": 0.48385801911354065, + "learning_rate": 0.0009038095238095239, + "loss": 0.4875, + "step": 3540 + }, + { + "epoch": 1.9782122905027935, + "grad_norm": 7.341414928436279, + "learning_rate": 0.0009037815126050421, + "loss": 0.403, + "step": 3541 + }, + { + "epoch": 1.9787709497206705, + "grad_norm": 0.5225608944892883, + "learning_rate": 0.0009037535014005603, + "loss": 0.4762, + "step": 3542 + }, + { + "epoch": 1.9793296089385475, + "grad_norm": 0.5041249394416809, + "learning_rate": 0.0009037254901960784, + "loss": 0.3955, + "step": 3543 + }, + { + "epoch": 1.9798882681564245, + "grad_norm": 10.300610542297363, + "learning_rate": 0.0009036974789915967, + "loss": 0.5804, + "step": 3544 + }, + { + "epoch": 1.9804469273743017, + "grad_norm": 0.6910036206245422, + "learning_rate": 0.0009036694677871149, + "loss": 0.4921, + "step": 3545 + }, + { + "epoch": 1.9810055865921787, + "grad_norm": 0.7483671307563782, + "learning_rate": 0.0009036414565826331, + "loss": 0.5679, + "step": 3546 + }, + { + "epoch": 1.981564245810056, + "grad_norm": 1.0613023042678833, + "learning_rate": 0.0009036134453781513, + "loss": 0.5126, + "step": 3547 + }, + { + "epoch": 1.982122905027933, + "grad_norm": 0.8317072987556458, + "learning_rate": 0.0009035854341736695, + "loss": 0.533, + "step": 3548 + }, + { + "epoch": 1.98268156424581, + "grad_norm": 1.4939213991165161, + "learning_rate": 0.0009035574229691877, + "loss": 0.4655, + "step": 3549 + }, + { + "epoch": 1.983240223463687, + "grad_norm": 0.5927630066871643, + "learning_rate": 0.0009035294117647059, + "loss": 0.5273, + "step": 3550 + }, + { + "epoch": 1.9837988826815642, + "grad_norm": 0.5648838877677917, + "learning_rate": 0.0009035014005602241, + "loss": 0.5222, + "step": 3551 + }, + { + "epoch": 1.9843575418994415, + "grad_norm": 0.6251527667045593, + "learning_rate": 0.0009034733893557423, + "loss": 0.4953, + "step": 3552 + }, + { + "epoch": 1.9849162011173185, + "grad_norm": 0.7572916150093079, + "learning_rate": 0.0009034453781512605, + "loss": 0.4855, + "step": 3553 + }, + { + "epoch": 1.9854748603351955, + "grad_norm": 0.5038579106330872, + "learning_rate": 0.0009034173669467787, + "loss": 0.5029, + "step": 3554 + }, + { + "epoch": 1.9860335195530725, + "grad_norm": 0.5522314310073853, + "learning_rate": 0.0009033893557422969, + "loss": 0.4267, + "step": 3555 + }, + { + "epoch": 1.9865921787709497, + "grad_norm": 0.4981827735900879, + "learning_rate": 0.0009033613445378151, + "loss": 0.4412, + "step": 3556 + }, + { + "epoch": 1.987150837988827, + "grad_norm": 0.7789214849472046, + "learning_rate": 0.0009033333333333334, + "loss": 0.4464, + "step": 3557 + }, + { + "epoch": 1.987709497206704, + "grad_norm": 0.7598844766616821, + "learning_rate": 0.0009033053221288516, + "loss": 0.3918, + "step": 3558 + }, + { + "epoch": 1.988268156424581, + "grad_norm": 7.820371627807617, + "learning_rate": 0.0009032773109243699, + "loss": 0.3769, + "step": 3559 + }, + { + "epoch": 1.988826815642458, + "grad_norm": 2.5441315174102783, + "learning_rate": 0.000903249299719888, + "loss": 0.4993, + "step": 3560 + }, + { + "epoch": 1.9893854748603352, + "grad_norm": 0.6510112881660461, + "learning_rate": 0.0009032212885154062, + "loss": 0.4605, + "step": 3561 + }, + { + "epoch": 1.9899441340782122, + "grad_norm": 4.732877731323242, + "learning_rate": 0.0009031932773109244, + "loss": 0.532, + "step": 3562 + }, + { + "epoch": 1.9905027932960895, + "grad_norm": 0.5059810876846313, + "learning_rate": 0.0009031652661064426, + "loss": 0.5014, + "step": 3563 + }, + { + "epoch": 1.9910614525139665, + "grad_norm": 0.6189684867858887, + "learning_rate": 0.0009031372549019609, + "loss": 0.4179, + "step": 3564 + }, + { + "epoch": 1.9916201117318435, + "grad_norm": 0.5997883081436157, + "learning_rate": 0.000903109243697479, + "loss": 0.4231, + "step": 3565 + }, + { + "epoch": 1.9921787709497205, + "grad_norm": 0.5159011483192444, + "learning_rate": 0.0009030812324929972, + "loss": 0.3627, + "step": 3566 + }, + { + "epoch": 1.9927374301675977, + "grad_norm": 0.5479264259338379, + "learning_rate": 0.0009030532212885154, + "loss": 0.4364, + "step": 3567 + }, + { + "epoch": 1.993296089385475, + "grad_norm": 0.5636224150657654, + "learning_rate": 0.0009030252100840336, + "loss": 0.5088, + "step": 3568 + }, + { + "epoch": 1.993854748603352, + "grad_norm": 0.48500046133995056, + "learning_rate": 0.0009029971988795518, + "loss": 0.4343, + "step": 3569 + }, + { + "epoch": 1.994413407821229, + "grad_norm": 0.7204303741455078, + "learning_rate": 0.00090296918767507, + "loss": 0.5234, + "step": 3570 + }, + { + "epoch": 1.994972067039106, + "grad_norm": 0.6034387350082397, + "learning_rate": 0.0009029411764705882, + "loss": 0.5912, + "step": 3571 + }, + { + "epoch": 1.9955307262569832, + "grad_norm": 0.7989311218261719, + "learning_rate": 0.0009029131652661064, + "loss": 0.5422, + "step": 3572 + }, + { + "epoch": 1.9960893854748605, + "grad_norm": 0.4728044867515564, + "learning_rate": 0.0009028851540616246, + "loss": 0.4165, + "step": 3573 + }, + { + "epoch": 1.9966480446927375, + "grad_norm": 1.8345197439193726, + "learning_rate": 0.0009028571428571429, + "loss": 0.5437, + "step": 3574 + }, + { + "epoch": 1.9972067039106145, + "grad_norm": 0.589110255241394, + "learning_rate": 0.0009028291316526612, + "loss": 0.544, + "step": 3575 + }, + { + "epoch": 1.9977653631284915, + "grad_norm": 0.617041826248169, + "learning_rate": 0.0009028011204481793, + "loss": 0.5897, + "step": 3576 + }, + { + "epoch": 1.9983240223463687, + "grad_norm": 0.594106912612915, + "learning_rate": 0.0009027731092436975, + "loss": 0.4297, + "step": 3577 + }, + { + "epoch": 1.9988826815642458, + "grad_norm": 0.9016503691673279, + "learning_rate": 0.0009027450980392157, + "loss": 0.5086, + "step": 3578 + }, + { + "epoch": 1.999441340782123, + "grad_norm": 0.5713033080101013, + "learning_rate": 0.0009027170868347339, + "loss": 0.5764, + "step": 3579 + }, + { + "epoch": 2.0, + "grad_norm": 0.7048798203468323, + "learning_rate": 0.0009026890756302522, + "loss": 0.4876, + "step": 3580 + }, + { + "epoch": 2.000558659217877, + "grad_norm": 0.5450454354286194, + "learning_rate": 0.0009026610644257703, + "loss": 0.503, + "step": 3581 + }, + { + "epoch": 2.001117318435754, + "grad_norm": 0.6440742015838623, + "learning_rate": 0.0009026330532212885, + "loss": 0.438, + "step": 3582 + }, + { + "epoch": 2.0016759776536315, + "grad_norm": 0.5473048686981201, + "learning_rate": 0.0009026050420168067, + "loss": 0.4054, + "step": 3583 + }, + { + "epoch": 2.0022346368715085, + "grad_norm": 0.3822300136089325, + "learning_rate": 0.0009025770308123249, + "loss": 0.4562, + "step": 3584 + }, + { + "epoch": 2.0027932960893855, + "grad_norm": 0.6752724647521973, + "learning_rate": 0.0009025490196078432, + "loss": 0.4064, + "step": 3585 + }, + { + "epoch": 2.0033519553072625, + "grad_norm": 0.4130238890647888, + "learning_rate": 0.0009025210084033613, + "loss": 0.4614, + "step": 3586 + }, + { + "epoch": 2.0039106145251395, + "grad_norm": 0.5895639061927795, + "learning_rate": 0.0009024929971988795, + "loss": 0.4011, + "step": 3587 + }, + { + "epoch": 2.004469273743017, + "grad_norm": 0.5819107294082642, + "learning_rate": 0.0009024649859943977, + "loss": 0.4264, + "step": 3588 + }, + { + "epoch": 2.005027932960894, + "grad_norm": 0.6172674298286438, + "learning_rate": 0.0009024369747899159, + "loss": 0.401, + "step": 3589 + }, + { + "epoch": 2.005586592178771, + "grad_norm": 1.1746076345443726, + "learning_rate": 0.0009024089635854343, + "loss": 0.4585, + "step": 3590 + }, + { + "epoch": 2.006145251396648, + "grad_norm": 0.6145292520523071, + "learning_rate": 0.0009023809523809525, + "loss": 0.5267, + "step": 3591 + }, + { + "epoch": 2.006703910614525, + "grad_norm": 0.5318580269813538, + "learning_rate": 0.0009023529411764706, + "loss": 0.4666, + "step": 3592 + }, + { + "epoch": 2.007262569832402, + "grad_norm": 0.5863227844238281, + "learning_rate": 0.0009023249299719888, + "loss": 0.4878, + "step": 3593 + }, + { + "epoch": 2.0078212290502795, + "grad_norm": 0.8668081760406494, + "learning_rate": 0.000902296918767507, + "loss": 0.4759, + "step": 3594 + }, + { + "epoch": 2.0083798882681565, + "grad_norm": 0.8008526563644409, + "learning_rate": 0.0009022689075630253, + "loss": 0.507, + "step": 3595 + }, + { + "epoch": 2.0089385474860335, + "grad_norm": 0.5928978323936462, + "learning_rate": 0.0009022408963585435, + "loss": 0.4554, + "step": 3596 + }, + { + "epoch": 2.0094972067039105, + "grad_norm": 0.652567446231842, + "learning_rate": 0.0009022128851540616, + "loss": 0.4781, + "step": 3597 + }, + { + "epoch": 2.0100558659217875, + "grad_norm": 1.2569924592971802, + "learning_rate": 0.0009021848739495798, + "loss": 0.4537, + "step": 3598 + }, + { + "epoch": 2.010614525139665, + "grad_norm": 0.7195703387260437, + "learning_rate": 0.000902156862745098, + "loss": 0.4226, + "step": 3599 + }, + { + "epoch": 2.011173184357542, + "grad_norm": 0.6938717365264893, + "learning_rate": 0.0009021288515406163, + "loss": 0.5468, + "step": 3600 + }, + { + "epoch": 2.011731843575419, + "grad_norm": 0.6042268872261047, + "learning_rate": 0.0009021008403361345, + "loss": 0.5378, + "step": 3601 + }, + { + "epoch": 2.012290502793296, + "grad_norm": 0.748271644115448, + "learning_rate": 0.0009020728291316526, + "loss": 0.4694, + "step": 3602 + }, + { + "epoch": 2.012849162011173, + "grad_norm": 0.509853184223175, + "learning_rate": 0.0009020448179271708, + "loss": 0.4945, + "step": 3603 + }, + { + "epoch": 2.0134078212290505, + "grad_norm": 0.5544410943984985, + "learning_rate": 0.000902016806722689, + "loss": 0.5588, + "step": 3604 + }, + { + "epoch": 2.0139664804469275, + "grad_norm": 0.7388685941696167, + "learning_rate": 0.0009019887955182073, + "loss": 0.5227, + "step": 3605 + }, + { + "epoch": 2.0145251396648045, + "grad_norm": 0.7522904276847839, + "learning_rate": 0.0009019607843137256, + "loss": 0.5748, + "step": 3606 + }, + { + "epoch": 2.0150837988826815, + "grad_norm": 0.8101471662521362, + "learning_rate": 0.0009019327731092438, + "loss": 0.4948, + "step": 3607 + }, + { + "epoch": 2.0156424581005585, + "grad_norm": 0.4869842231273651, + "learning_rate": 0.0009019047619047619, + "loss": 0.3863, + "step": 3608 + }, + { + "epoch": 2.0162011173184355, + "grad_norm": 0.7836116552352905, + "learning_rate": 0.0009018767507002801, + "loss": 0.4181, + "step": 3609 + }, + { + "epoch": 2.016759776536313, + "grad_norm": 0.5335279703140259, + "learning_rate": 0.0009018487394957984, + "loss": 0.4375, + "step": 3610 + }, + { + "epoch": 2.01731843575419, + "grad_norm": 1.068753957748413, + "learning_rate": 0.0009018207282913166, + "loss": 0.4274, + "step": 3611 + }, + { + "epoch": 2.017877094972067, + "grad_norm": 0.5417978763580322, + "learning_rate": 0.0009017927170868348, + "loss": 0.4648, + "step": 3612 + }, + { + "epoch": 2.018435754189944, + "grad_norm": 0.5654529929161072, + "learning_rate": 0.0009017647058823529, + "loss": 0.5148, + "step": 3613 + }, + { + "epoch": 2.018994413407821, + "grad_norm": 0.5350977778434753, + "learning_rate": 0.0009017366946778711, + "loss": 0.4992, + "step": 3614 + }, + { + "epoch": 2.0195530726256985, + "grad_norm": 0.7936908602714539, + "learning_rate": 0.0009017086834733894, + "loss": 0.3505, + "step": 3615 + }, + { + "epoch": 2.0201117318435755, + "grad_norm": 0.7255499958992004, + "learning_rate": 0.0009016806722689076, + "loss": 0.6074, + "step": 3616 + }, + { + "epoch": 2.0206703910614525, + "grad_norm": 0.5595179796218872, + "learning_rate": 0.0009016526610644258, + "loss": 0.4777, + "step": 3617 + }, + { + "epoch": 2.0212290502793295, + "grad_norm": 0.6538064479827881, + "learning_rate": 0.0009016246498599439, + "loss": 0.4465, + "step": 3618 + }, + { + "epoch": 2.0217877094972065, + "grad_norm": 1.006617546081543, + "learning_rate": 0.0009015966386554621, + "loss": 0.4811, + "step": 3619 + }, + { + "epoch": 2.022346368715084, + "grad_norm": 0.5685110688209534, + "learning_rate": 0.0009015686274509804, + "loss": 0.4459, + "step": 3620 + }, + { + "epoch": 2.022905027932961, + "grad_norm": 0.5071479082107544, + "learning_rate": 0.0009015406162464986, + "loss": 0.5977, + "step": 3621 + }, + { + "epoch": 2.023463687150838, + "grad_norm": 0.4989989995956421, + "learning_rate": 0.0009015126050420169, + "loss": 0.5899, + "step": 3622 + }, + { + "epoch": 2.024022346368715, + "grad_norm": 1.2915453910827637, + "learning_rate": 0.0009014845938375351, + "loss": 0.6405, + "step": 3623 + }, + { + "epoch": 2.024581005586592, + "grad_norm": 0.49206650257110596, + "learning_rate": 0.0009014565826330532, + "loss": 0.5031, + "step": 3624 + }, + { + "epoch": 2.0251396648044695, + "grad_norm": 0.8080052733421326, + "learning_rate": 0.0009014285714285715, + "loss": 0.4978, + "step": 3625 + }, + { + "epoch": 2.0256983240223465, + "grad_norm": 0.6381545662879944, + "learning_rate": 0.0009014005602240897, + "loss": 0.4272, + "step": 3626 + }, + { + "epoch": 2.0262569832402235, + "grad_norm": 1.6535542011260986, + "learning_rate": 0.0009013725490196079, + "loss": 0.4504, + "step": 3627 + }, + { + "epoch": 2.0268156424581005, + "grad_norm": 0.4633352756500244, + "learning_rate": 0.0009013445378151261, + "loss": 0.5212, + "step": 3628 + }, + { + "epoch": 2.0273743016759775, + "grad_norm": 0.45375335216522217, + "learning_rate": 0.0009013165266106442, + "loss": 0.4174, + "step": 3629 + }, + { + "epoch": 2.0279329608938546, + "grad_norm": 0.4371800422668457, + "learning_rate": 0.0009012885154061625, + "loss": 0.4732, + "step": 3630 + }, + { + "epoch": 2.028491620111732, + "grad_norm": 0.7216630578041077, + "learning_rate": 0.0009012605042016807, + "loss": 0.5108, + "step": 3631 + }, + { + "epoch": 2.029050279329609, + "grad_norm": 0.5368731617927551, + "learning_rate": 0.0009012324929971989, + "loss": 0.4784, + "step": 3632 + }, + { + "epoch": 2.029608938547486, + "grad_norm": 0.42912301421165466, + "learning_rate": 0.0009012044817927171, + "loss": 0.3978, + "step": 3633 + }, + { + "epoch": 2.030167597765363, + "grad_norm": 0.6719613075256348, + "learning_rate": 0.0009011764705882352, + "loss": 0.5235, + "step": 3634 + }, + { + "epoch": 2.03072625698324, + "grad_norm": 0.5906072854995728, + "learning_rate": 0.0009011484593837535, + "loss": 0.4929, + "step": 3635 + }, + { + "epoch": 2.0312849162011175, + "grad_norm": 2.1807973384857178, + "learning_rate": 0.0009011204481792717, + "loss": 0.419, + "step": 3636 + }, + { + "epoch": 2.0318435754189945, + "grad_norm": 13.073674201965332, + "learning_rate": 0.0009010924369747899, + "loss": 0.7023, + "step": 3637 + }, + { + "epoch": 2.0324022346368715, + "grad_norm": 0.5532638430595398, + "learning_rate": 0.0009010644257703081, + "loss": 0.4215, + "step": 3638 + }, + { + "epoch": 2.0329608938547485, + "grad_norm": 0.5153961777687073, + "learning_rate": 0.0009010364145658264, + "loss": 0.4712, + "step": 3639 + }, + { + "epoch": 2.0335195530726256, + "grad_norm": 1.0909032821655273, + "learning_rate": 0.0009010084033613446, + "loss": 0.4436, + "step": 3640 + }, + { + "epoch": 2.034078212290503, + "grad_norm": 0.5917366147041321, + "learning_rate": 0.0009009803921568628, + "loss": 0.4256, + "step": 3641 + }, + { + "epoch": 2.03463687150838, + "grad_norm": 1.351475477218628, + "learning_rate": 0.000900952380952381, + "loss": 0.4113, + "step": 3642 + }, + { + "epoch": 2.035195530726257, + "grad_norm": 0.4937914311885834, + "learning_rate": 0.0009009243697478992, + "loss": 0.5164, + "step": 3643 + }, + { + "epoch": 2.035754189944134, + "grad_norm": 0.5243120789527893, + "learning_rate": 0.0009008963585434174, + "loss": 0.451, + "step": 3644 + }, + { + "epoch": 2.036312849162011, + "grad_norm": 0.6904476284980774, + "learning_rate": 0.0009008683473389356, + "loss": 0.4354, + "step": 3645 + }, + { + "epoch": 2.036871508379888, + "grad_norm": 0.6102162599563599, + "learning_rate": 0.0009008403361344538, + "loss": 0.4485, + "step": 3646 + }, + { + "epoch": 2.0374301675977655, + "grad_norm": 1.0556654930114746, + "learning_rate": 0.000900812324929972, + "loss": 0.4758, + "step": 3647 + }, + { + "epoch": 2.0379888268156425, + "grad_norm": 0.48797985911369324, + "learning_rate": 0.0009007843137254902, + "loss": 0.4622, + "step": 3648 + }, + { + "epoch": 2.0385474860335195, + "grad_norm": 0.5164204239845276, + "learning_rate": 0.0009007563025210084, + "loss": 0.459, + "step": 3649 + }, + { + "epoch": 2.0391061452513966, + "grad_norm": 2.2609622478485107, + "learning_rate": 0.0009007282913165266, + "loss": 0.4979, + "step": 3650 + }, + { + "epoch": 2.0396648044692736, + "grad_norm": 0.8742501139640808, + "learning_rate": 0.0009007002801120448, + "loss": 0.4722, + "step": 3651 + }, + { + "epoch": 2.040223463687151, + "grad_norm": 2.1491634845733643, + "learning_rate": 0.000900672268907563, + "loss": 0.5696, + "step": 3652 + }, + { + "epoch": 2.040782122905028, + "grad_norm": 0.6949541568756104, + "learning_rate": 0.0009006442577030812, + "loss": 0.4896, + "step": 3653 + }, + { + "epoch": 2.041340782122905, + "grad_norm": 1.1042120456695557, + "learning_rate": 0.0009006162464985994, + "loss": 0.4089, + "step": 3654 + }, + { + "epoch": 2.041899441340782, + "grad_norm": 0.5481662750244141, + "learning_rate": 0.0009005882352941178, + "loss": 0.479, + "step": 3655 + }, + { + "epoch": 2.042458100558659, + "grad_norm": 0.4432147741317749, + "learning_rate": 0.0009005602240896359, + "loss": 0.4474, + "step": 3656 + }, + { + "epoch": 2.0430167597765365, + "grad_norm": 0.711911141872406, + "learning_rate": 0.0009005322128851541, + "loss": 0.443, + "step": 3657 + }, + { + "epoch": 2.0435754189944135, + "grad_norm": 0.6026064157485962, + "learning_rate": 0.0009005042016806723, + "loss": 0.4773, + "step": 3658 + }, + { + "epoch": 2.0441340782122905, + "grad_norm": 0.966338038444519, + "learning_rate": 0.0009004761904761905, + "loss": 0.7091, + "step": 3659 + }, + { + "epoch": 2.0446927374301676, + "grad_norm": 0.6239568591117859, + "learning_rate": 0.0009004481792717088, + "loss": 0.5048, + "step": 3660 + }, + { + "epoch": 2.0452513966480446, + "grad_norm": 0.6432452201843262, + "learning_rate": 0.0009004201680672269, + "loss": 0.4432, + "step": 3661 + }, + { + "epoch": 2.0458100558659216, + "grad_norm": 0.912550151348114, + "learning_rate": 0.0009003921568627451, + "loss": 0.526, + "step": 3662 + }, + { + "epoch": 2.046368715083799, + "grad_norm": 0.8593716621398926, + "learning_rate": 0.0009003641456582633, + "loss": 0.4943, + "step": 3663 + }, + { + "epoch": 2.046927374301676, + "grad_norm": 2.3145017623901367, + "learning_rate": 0.0009003361344537815, + "loss": 0.5077, + "step": 3664 + }, + { + "epoch": 2.047486033519553, + "grad_norm": 2.300337553024292, + "learning_rate": 0.0009003081232492998, + "loss": 0.4057, + "step": 3665 + }, + { + "epoch": 2.04804469273743, + "grad_norm": 0.6351850032806396, + "learning_rate": 0.0009002801120448179, + "loss": 0.4694, + "step": 3666 + }, + { + "epoch": 2.048603351955307, + "grad_norm": 0.565089762210846, + "learning_rate": 0.0009002521008403361, + "loss": 0.4212, + "step": 3667 + }, + { + "epoch": 2.0491620111731845, + "grad_norm": 2.1437771320343018, + "learning_rate": 0.0009002240896358543, + "loss": 0.4626, + "step": 3668 + }, + { + "epoch": 2.0497206703910615, + "grad_norm": 6.8366804122924805, + "learning_rate": 0.0009001960784313725, + "loss": 0.5141, + "step": 3669 + }, + { + "epoch": 2.0502793296089385, + "grad_norm": 0.561722457408905, + "learning_rate": 0.0009001680672268908, + "loss": 0.4529, + "step": 3670 + }, + { + "epoch": 2.0508379888268156, + "grad_norm": 0.43533599376678467, + "learning_rate": 0.000900140056022409, + "loss": 0.4096, + "step": 3671 + }, + { + "epoch": 2.0513966480446926, + "grad_norm": 1.6014506816864014, + "learning_rate": 0.0009001120448179272, + "loss": 0.5019, + "step": 3672 + }, + { + "epoch": 2.05195530726257, + "grad_norm": 0.5009967088699341, + "learning_rate": 0.0009000840336134454, + "loss": 0.4794, + "step": 3673 + }, + { + "epoch": 2.052513966480447, + "grad_norm": 2.5643579959869385, + "learning_rate": 0.0009000560224089636, + "loss": 0.5311, + "step": 3674 + }, + { + "epoch": 2.053072625698324, + "grad_norm": 0.6453258991241455, + "learning_rate": 0.0009000280112044819, + "loss": 0.4993, + "step": 3675 + }, + { + "epoch": 2.053631284916201, + "grad_norm": 0.3981837332248688, + "learning_rate": 0.0009000000000000001, + "loss": 0.5037, + "step": 3676 + }, + { + "epoch": 2.054189944134078, + "grad_norm": 0.6156757473945618, + "learning_rate": 0.0008999719887955182, + "loss": 0.5961, + "step": 3677 + }, + { + "epoch": 2.054748603351955, + "grad_norm": 0.44877707958221436, + "learning_rate": 0.0008999439775910364, + "loss": 0.3651, + "step": 3678 + }, + { + "epoch": 2.0553072625698325, + "grad_norm": 0.5611476898193359, + "learning_rate": 0.0008999159663865546, + "loss": 0.448, + "step": 3679 + }, + { + "epoch": 2.0558659217877095, + "grad_norm": 3.6825473308563232, + "learning_rate": 0.0008998879551820729, + "loss": 0.5328, + "step": 3680 + }, + { + "epoch": 2.0564245810055866, + "grad_norm": 0.6583296656608582, + "learning_rate": 0.0008998599439775911, + "loss": 0.4755, + "step": 3681 + }, + { + "epoch": 2.0569832402234636, + "grad_norm": 0.5341112017631531, + "learning_rate": 0.0008998319327731092, + "loss": 0.4938, + "step": 3682 + }, + { + "epoch": 2.0575418994413406, + "grad_norm": 0.5836253762245178, + "learning_rate": 0.0008998039215686274, + "loss": 0.4253, + "step": 3683 + }, + { + "epoch": 2.058100558659218, + "grad_norm": 0.7295217514038086, + "learning_rate": 0.0008997759103641456, + "loss": 0.6345, + "step": 3684 + }, + { + "epoch": 2.058659217877095, + "grad_norm": 0.4508606493473053, + "learning_rate": 0.0008997478991596639, + "loss": 0.4422, + "step": 3685 + }, + { + "epoch": 2.059217877094972, + "grad_norm": 0.6020694375038147, + "learning_rate": 0.0008997198879551821, + "loss": 0.5392, + "step": 3686 + }, + { + "epoch": 2.059776536312849, + "grad_norm": 0.6957813501358032, + "learning_rate": 0.0008996918767507003, + "loss": 0.548, + "step": 3687 + }, + { + "epoch": 2.060335195530726, + "grad_norm": 1.2248607873916626, + "learning_rate": 0.0008996638655462184, + "loss": 0.4601, + "step": 3688 + }, + { + "epoch": 2.0608938547486035, + "grad_norm": 0.5103458166122437, + "learning_rate": 0.0008996358543417367, + "loss": 0.513, + "step": 3689 + }, + { + "epoch": 2.0614525139664805, + "grad_norm": 0.4685027301311493, + "learning_rate": 0.000899607843137255, + "loss": 0.4959, + "step": 3690 + }, + { + "epoch": 2.0620111731843576, + "grad_norm": 0.8616878390312195, + "learning_rate": 0.0008995798319327732, + "loss": 0.6087, + "step": 3691 + }, + { + "epoch": 2.0625698324022346, + "grad_norm": 0.749208390712738, + "learning_rate": 0.0008995518207282914, + "loss": 0.4034, + "step": 3692 + }, + { + "epoch": 2.0631284916201116, + "grad_norm": 0.4404391050338745, + "learning_rate": 0.0008995238095238095, + "loss": 0.4779, + "step": 3693 + }, + { + "epoch": 2.063687150837989, + "grad_norm": 0.515535831451416, + "learning_rate": 0.0008994957983193277, + "loss": 0.4963, + "step": 3694 + }, + { + "epoch": 2.064245810055866, + "grad_norm": 0.4829117953777313, + "learning_rate": 0.000899467787114846, + "loss": 0.3912, + "step": 3695 + }, + { + "epoch": 2.064804469273743, + "grad_norm": 0.8293478488922119, + "learning_rate": 0.0008994397759103642, + "loss": 0.3592, + "step": 3696 + }, + { + "epoch": 2.06536312849162, + "grad_norm": 3.508714437484741, + "learning_rate": 0.0008994117647058824, + "loss": 0.5117, + "step": 3697 + }, + { + "epoch": 2.065921787709497, + "grad_norm": 0.937251627445221, + "learning_rate": 0.0008993837535014005, + "loss": 0.581, + "step": 3698 + }, + { + "epoch": 2.066480446927374, + "grad_norm": 0.4724261462688446, + "learning_rate": 0.0008993557422969187, + "loss": 0.4653, + "step": 3699 + }, + { + "epoch": 2.0670391061452515, + "grad_norm": 0.8045380115509033, + "learning_rate": 0.000899327731092437, + "loss": 0.4063, + "step": 3700 + }, + { + "epoch": 2.0675977653631286, + "grad_norm": 0.44875216484069824, + "learning_rate": 0.0008992997198879552, + "loss": 0.4156, + "step": 3701 + }, + { + "epoch": 2.0681564245810056, + "grad_norm": 6.698723793029785, + "learning_rate": 0.0008992717086834734, + "loss": 0.4371, + "step": 3702 + }, + { + "epoch": 2.0687150837988826, + "grad_norm": 0.6368177533149719, + "learning_rate": 0.0008992436974789916, + "loss": 0.3801, + "step": 3703 + }, + { + "epoch": 2.0692737430167596, + "grad_norm": 0.8219538927078247, + "learning_rate": 0.0008992156862745097, + "loss": 0.4772, + "step": 3704 + }, + { + "epoch": 2.069832402234637, + "grad_norm": 0.9575294256210327, + "learning_rate": 0.0008991876750700281, + "loss": 0.4072, + "step": 3705 + }, + { + "epoch": 2.070391061452514, + "grad_norm": 1.4106591939926147, + "learning_rate": 0.0008991596638655463, + "loss": 0.4239, + "step": 3706 + }, + { + "epoch": 2.070949720670391, + "grad_norm": 0.7687220573425293, + "learning_rate": 0.0008991316526610645, + "loss": 0.5372, + "step": 3707 + }, + { + "epoch": 2.071508379888268, + "grad_norm": 0.7370675206184387, + "learning_rate": 0.0008991036414565827, + "loss": 0.5408, + "step": 3708 + }, + { + "epoch": 2.072067039106145, + "grad_norm": 0.470316618680954, + "learning_rate": 0.0008990756302521008, + "loss": 0.4622, + "step": 3709 + }, + { + "epoch": 2.0726256983240225, + "grad_norm": 0.5752003192901611, + "learning_rate": 0.0008990476190476191, + "loss": 0.5363, + "step": 3710 + }, + { + "epoch": 2.0731843575418996, + "grad_norm": 0.5439411401748657, + "learning_rate": 0.0008990196078431373, + "loss": 0.4332, + "step": 3711 + }, + { + "epoch": 2.0737430167597766, + "grad_norm": 0.5157110095024109, + "learning_rate": 0.0008989915966386555, + "loss": 0.4944, + "step": 3712 + }, + { + "epoch": 2.0743016759776536, + "grad_norm": 0.6001066565513611, + "learning_rate": 0.0008989635854341737, + "loss": 0.5176, + "step": 3713 + }, + { + "epoch": 2.0748603351955306, + "grad_norm": 0.7983638644218445, + "learning_rate": 0.0008989355742296918, + "loss": 0.4992, + "step": 3714 + }, + { + "epoch": 2.0754189944134076, + "grad_norm": 0.94972163438797, + "learning_rate": 0.0008989075630252101, + "loss": 0.4639, + "step": 3715 + }, + { + "epoch": 2.075977653631285, + "grad_norm": 0.5934249758720398, + "learning_rate": 0.0008988795518207283, + "loss": 0.4983, + "step": 3716 + }, + { + "epoch": 2.076536312849162, + "grad_norm": 0.6522203087806702, + "learning_rate": 0.0008988515406162465, + "loss": 0.4679, + "step": 3717 + }, + { + "epoch": 2.077094972067039, + "grad_norm": 0.5073977112770081, + "learning_rate": 0.0008988235294117647, + "loss": 0.4703, + "step": 3718 + }, + { + "epoch": 2.077653631284916, + "grad_norm": 0.6643916964530945, + "learning_rate": 0.0008987955182072829, + "loss": 0.4247, + "step": 3719 + }, + { + "epoch": 2.078212290502793, + "grad_norm": 0.5237669944763184, + "learning_rate": 0.0008987675070028011, + "loss": 0.4487, + "step": 3720 + }, + { + "epoch": 2.0787709497206706, + "grad_norm": 9.248260498046875, + "learning_rate": 0.0008987394957983194, + "loss": 0.4931, + "step": 3721 + }, + { + "epoch": 2.0793296089385476, + "grad_norm": 0.7812570333480835, + "learning_rate": 0.0008987114845938376, + "loss": 0.4135, + "step": 3722 + }, + { + "epoch": 2.0798882681564246, + "grad_norm": 0.7737398147583008, + "learning_rate": 0.0008986834733893558, + "loss": 0.3451, + "step": 3723 + }, + { + "epoch": 2.0804469273743016, + "grad_norm": 0.5317307710647583, + "learning_rate": 0.000898655462184874, + "loss": 0.6042, + "step": 3724 + }, + { + "epoch": 2.0810055865921786, + "grad_norm": 0.6720497012138367, + "learning_rate": 0.0008986274509803922, + "loss": 0.5637, + "step": 3725 + }, + { + "epoch": 2.081564245810056, + "grad_norm": 0.5801661014556885, + "learning_rate": 0.0008985994397759104, + "loss": 0.4899, + "step": 3726 + }, + { + "epoch": 2.082122905027933, + "grad_norm": 0.5745871067047119, + "learning_rate": 0.0008985714285714286, + "loss": 0.5712, + "step": 3727 + }, + { + "epoch": 2.08268156424581, + "grad_norm": 0.5367339253425598, + "learning_rate": 0.0008985434173669468, + "loss": 0.4759, + "step": 3728 + }, + { + "epoch": 2.083240223463687, + "grad_norm": 0.4944542646408081, + "learning_rate": 0.000898515406162465, + "loss": 0.542, + "step": 3729 + }, + { + "epoch": 2.083798882681564, + "grad_norm": 0.5606075525283813, + "learning_rate": 0.0008984873949579833, + "loss": 0.4008, + "step": 3730 + }, + { + "epoch": 2.0843575418994416, + "grad_norm": 5.330539703369141, + "learning_rate": 0.0008984593837535014, + "loss": 0.5064, + "step": 3731 + }, + { + "epoch": 2.0849162011173186, + "grad_norm": 0.6617708802223206, + "learning_rate": 0.0008984313725490196, + "loss": 0.4042, + "step": 3732 + }, + { + "epoch": 2.0854748603351956, + "grad_norm": 0.4983225166797638, + "learning_rate": 0.0008984033613445378, + "loss": 0.4525, + "step": 3733 + }, + { + "epoch": 2.0860335195530726, + "grad_norm": 0.6001350283622742, + "learning_rate": 0.000898375350140056, + "loss": 0.4917, + "step": 3734 + }, + { + "epoch": 2.0865921787709496, + "grad_norm": 2.2164862155914307, + "learning_rate": 0.0008983473389355743, + "loss": 0.5909, + "step": 3735 + }, + { + "epoch": 2.0871508379888266, + "grad_norm": 0.5075696110725403, + "learning_rate": 0.0008983193277310924, + "loss": 0.3998, + "step": 3736 + }, + { + "epoch": 2.087709497206704, + "grad_norm": 0.5509397387504578, + "learning_rate": 0.0008982913165266106, + "loss": 0.5552, + "step": 3737 + }, + { + "epoch": 2.088268156424581, + "grad_norm": 0.48098692297935486, + "learning_rate": 0.0008982633053221289, + "loss": 0.4863, + "step": 3738 + }, + { + "epoch": 2.088826815642458, + "grad_norm": 2.272890090942383, + "learning_rate": 0.0008982352941176471, + "loss": 0.5314, + "step": 3739 + }, + { + "epoch": 2.089385474860335, + "grad_norm": 1.3531925678253174, + "learning_rate": 0.0008982072829131654, + "loss": 0.5998, + "step": 3740 + }, + { + "epoch": 2.089944134078212, + "grad_norm": 0.8127819895744324, + "learning_rate": 0.0008981792717086835, + "loss": 0.4904, + "step": 3741 + }, + { + "epoch": 2.0905027932960896, + "grad_norm": 0.9548804759979248, + "learning_rate": 0.0008981512605042017, + "loss": 0.4814, + "step": 3742 + }, + { + "epoch": 2.0910614525139666, + "grad_norm": 0.5896094441413879, + "learning_rate": 0.0008981232492997199, + "loss": 0.4818, + "step": 3743 + }, + { + "epoch": 2.0916201117318436, + "grad_norm": 0.4566092789173126, + "learning_rate": 0.0008980952380952381, + "loss": 0.3928, + "step": 3744 + }, + { + "epoch": 2.0921787709497206, + "grad_norm": 0.528643786907196, + "learning_rate": 0.0008980672268907564, + "loss": 0.5536, + "step": 3745 + }, + { + "epoch": 2.0927374301675976, + "grad_norm": 0.7363616228103638, + "learning_rate": 0.0008980392156862746, + "loss": 0.5224, + "step": 3746 + }, + { + "epoch": 2.093296089385475, + "grad_norm": 0.6808377504348755, + "learning_rate": 0.0008980112044817927, + "loss": 0.4684, + "step": 3747 + }, + { + "epoch": 2.093854748603352, + "grad_norm": 0.5341591238975525, + "learning_rate": 0.0008979831932773109, + "loss": 0.3539, + "step": 3748 + }, + { + "epoch": 2.094413407821229, + "grad_norm": 0.40797972679138184, + "learning_rate": 0.0008979551820728291, + "loss": 0.4366, + "step": 3749 + }, + { + "epoch": 2.094972067039106, + "grad_norm": 0.4666319489479065, + "learning_rate": 0.0008979271708683474, + "loss": 0.5171, + "step": 3750 + }, + { + "epoch": 2.095530726256983, + "grad_norm": 1.6182719469070435, + "learning_rate": 0.0008978991596638656, + "loss": 0.4744, + "step": 3751 + }, + { + "epoch": 2.09608938547486, + "grad_norm": 0.5935097932815552, + "learning_rate": 0.0008978711484593837, + "loss": 0.5552, + "step": 3752 + }, + { + "epoch": 2.0966480446927376, + "grad_norm": 1.1712733507156372, + "learning_rate": 0.0008978431372549019, + "loss": 0.4601, + "step": 3753 + }, + { + "epoch": 2.0972067039106146, + "grad_norm": 0.9645431041717529, + "learning_rate": 0.0008978151260504202, + "loss": 0.5203, + "step": 3754 + }, + { + "epoch": 2.0977653631284916, + "grad_norm": 0.7179100513458252, + "learning_rate": 0.0008977871148459385, + "loss": 0.5173, + "step": 3755 + }, + { + "epoch": 2.0983240223463686, + "grad_norm": 0.7007253170013428, + "learning_rate": 0.0008977591036414567, + "loss": 0.4079, + "step": 3756 + }, + { + "epoch": 2.0988826815642456, + "grad_norm": 0.6294612288475037, + "learning_rate": 0.0008977310924369748, + "loss": 0.4309, + "step": 3757 + }, + { + "epoch": 2.099441340782123, + "grad_norm": 0.4474851191043854, + "learning_rate": 0.000897703081232493, + "loss": 0.4335, + "step": 3758 + }, + { + "epoch": 2.1, + "grad_norm": 1.6781375408172607, + "learning_rate": 0.0008976750700280112, + "loss": 0.59, + "step": 3759 + }, + { + "epoch": 2.100558659217877, + "grad_norm": 0.5361772775650024, + "learning_rate": 0.0008976470588235295, + "loss": 0.5142, + "step": 3760 + }, + { + "epoch": 2.101117318435754, + "grad_norm": 0.60182785987854, + "learning_rate": 0.0008976190476190477, + "loss": 0.4508, + "step": 3761 + }, + { + "epoch": 2.101675977653631, + "grad_norm": 0.5234772562980652, + "learning_rate": 0.0008975910364145659, + "loss": 0.3858, + "step": 3762 + }, + { + "epoch": 2.1022346368715086, + "grad_norm": 0.5795048475265503, + "learning_rate": 0.000897563025210084, + "loss": 0.4334, + "step": 3763 + }, + { + "epoch": 2.1027932960893856, + "grad_norm": 0.5706015229225159, + "learning_rate": 0.0008975350140056022, + "loss": 0.5285, + "step": 3764 + }, + { + "epoch": 2.1033519553072626, + "grad_norm": 0.3967292606830597, + "learning_rate": 0.0008975070028011205, + "loss": 0.4253, + "step": 3765 + }, + { + "epoch": 2.1039106145251396, + "grad_norm": 0.5121884346008301, + "learning_rate": 0.0008974789915966387, + "loss": 0.4451, + "step": 3766 + }, + { + "epoch": 2.1044692737430166, + "grad_norm": 0.6755759716033936, + "learning_rate": 0.0008974509803921569, + "loss": 0.5626, + "step": 3767 + }, + { + "epoch": 2.105027932960894, + "grad_norm": 0.5974379181861877, + "learning_rate": 0.000897422969187675, + "loss": 0.4261, + "step": 3768 + }, + { + "epoch": 2.105586592178771, + "grad_norm": 0.506280779838562, + "learning_rate": 0.0008973949579831932, + "loss": 0.664, + "step": 3769 + }, + { + "epoch": 2.106145251396648, + "grad_norm": 2.147064208984375, + "learning_rate": 0.0008973669467787116, + "loss": 0.4534, + "step": 3770 + }, + { + "epoch": 2.106703910614525, + "grad_norm": 0.4539654552936554, + "learning_rate": 0.0008973389355742298, + "loss": 0.4992, + "step": 3771 + }, + { + "epoch": 2.107262569832402, + "grad_norm": 0.4924386143684387, + "learning_rate": 0.000897310924369748, + "loss": 0.4914, + "step": 3772 + }, + { + "epoch": 2.107821229050279, + "grad_norm": 1.0393178462982178, + "learning_rate": 0.0008972829131652661, + "loss": 0.4184, + "step": 3773 + }, + { + "epoch": 2.1083798882681566, + "grad_norm": 0.6019902229309082, + "learning_rate": 0.0008972549019607843, + "loss": 0.4533, + "step": 3774 + }, + { + "epoch": 2.1089385474860336, + "grad_norm": 0.6919354796409607, + "learning_rate": 0.0008972268907563026, + "loss": 0.5453, + "step": 3775 + }, + { + "epoch": 2.1094972067039106, + "grad_norm": 0.4117518365383148, + "learning_rate": 0.0008971988795518208, + "loss": 0.4395, + "step": 3776 + }, + { + "epoch": 2.1100558659217876, + "grad_norm": 0.47695526480674744, + "learning_rate": 0.000897170868347339, + "loss": 0.4572, + "step": 3777 + }, + { + "epoch": 2.1106145251396646, + "grad_norm": 0.5436705350875854, + "learning_rate": 0.0008971428571428572, + "loss": 0.4931, + "step": 3778 + }, + { + "epoch": 2.111173184357542, + "grad_norm": 0.5723715424537659, + "learning_rate": 0.0008971148459383753, + "loss": 0.5657, + "step": 3779 + }, + { + "epoch": 2.111731843575419, + "grad_norm": 0.5120642185211182, + "learning_rate": 0.0008970868347338936, + "loss": 0.3819, + "step": 3780 + }, + { + "epoch": 2.112290502793296, + "grad_norm": 0.683992862701416, + "learning_rate": 0.0008970588235294118, + "loss": 0.4697, + "step": 3781 + }, + { + "epoch": 2.112849162011173, + "grad_norm": 0.5189375281333923, + "learning_rate": 0.00089703081232493, + "loss": 0.4441, + "step": 3782 + }, + { + "epoch": 2.11340782122905, + "grad_norm": 0.634335994720459, + "learning_rate": 0.0008970028011204482, + "loss": 0.522, + "step": 3783 + }, + { + "epoch": 2.1139664804469276, + "grad_norm": 0.5401212573051453, + "learning_rate": 0.0008969747899159663, + "loss": 0.4753, + "step": 3784 + }, + { + "epoch": 2.1145251396648046, + "grad_norm": 0.6072887182235718, + "learning_rate": 0.0008969467787114846, + "loss": 0.4758, + "step": 3785 + }, + { + "epoch": 2.1150837988826816, + "grad_norm": 5.551515579223633, + "learning_rate": 0.0008969187675070029, + "loss": 0.4116, + "step": 3786 + }, + { + "epoch": 2.1156424581005586, + "grad_norm": 0.4808572232723236, + "learning_rate": 0.0008968907563025211, + "loss": 0.4224, + "step": 3787 + }, + { + "epoch": 2.1162011173184356, + "grad_norm": 0.45117831230163574, + "learning_rate": 0.0008968627450980393, + "loss": 0.507, + "step": 3788 + }, + { + "epoch": 2.1167597765363126, + "grad_norm": 0.4961342215538025, + "learning_rate": 0.0008968347338935574, + "loss": 0.4282, + "step": 3789 + }, + { + "epoch": 2.11731843575419, + "grad_norm": 1.128710150718689, + "learning_rate": 0.0008968067226890756, + "loss": 0.4983, + "step": 3790 + }, + { + "epoch": 2.117877094972067, + "grad_norm": 0.8296599388122559, + "learning_rate": 0.0008967787114845939, + "loss": 0.5818, + "step": 3791 + }, + { + "epoch": 2.118435754189944, + "grad_norm": 0.6532965302467346, + "learning_rate": 0.0008967507002801121, + "loss": 0.4443, + "step": 3792 + }, + { + "epoch": 2.118994413407821, + "grad_norm": 2.0979795455932617, + "learning_rate": 0.0008967226890756303, + "loss": 0.4998, + "step": 3793 + }, + { + "epoch": 2.119553072625698, + "grad_norm": 3.302300214767456, + "learning_rate": 0.0008966946778711485, + "loss": 0.447, + "step": 3794 + }, + { + "epoch": 2.1201117318435756, + "grad_norm": 0.5424798727035522, + "learning_rate": 0.0008966666666666666, + "loss": 0.543, + "step": 3795 + }, + { + "epoch": 2.1206703910614526, + "grad_norm": 0.6697589755058289, + "learning_rate": 0.0008966386554621849, + "loss": 0.5218, + "step": 3796 + }, + { + "epoch": 2.1212290502793296, + "grad_norm": 0.6120874881744385, + "learning_rate": 0.0008966106442577031, + "loss": 0.4499, + "step": 3797 + }, + { + "epoch": 2.1217877094972066, + "grad_norm": 0.5173333287239075, + "learning_rate": 0.0008965826330532213, + "loss": 0.3973, + "step": 3798 + }, + { + "epoch": 2.1223463687150836, + "grad_norm": 0.4179285764694214, + "learning_rate": 0.0008965546218487395, + "loss": 0.5076, + "step": 3799 + }, + { + "epoch": 2.122905027932961, + "grad_norm": 0.5040994882583618, + "learning_rate": 0.0008965266106442576, + "loss": 0.527, + "step": 3800 + }, + { + "epoch": 2.123463687150838, + "grad_norm": 0.38026806712150574, + "learning_rate": 0.0008964985994397759, + "loss": 0.47, + "step": 3801 + }, + { + "epoch": 2.124022346368715, + "grad_norm": 0.7907775640487671, + "learning_rate": 0.0008964705882352941, + "loss": 0.4527, + "step": 3802 + }, + { + "epoch": 2.124581005586592, + "grad_norm": 0.6088317632675171, + "learning_rate": 0.0008964425770308124, + "loss": 0.4411, + "step": 3803 + }, + { + "epoch": 2.125139664804469, + "grad_norm": 0.45487162470817566, + "learning_rate": 0.0008964145658263306, + "loss": 0.4223, + "step": 3804 + }, + { + "epoch": 2.1256983240223466, + "grad_norm": 0.5422521829605103, + "learning_rate": 0.0008963865546218487, + "loss": 0.5266, + "step": 3805 + }, + { + "epoch": 2.1262569832402236, + "grad_norm": 0.5822058320045471, + "learning_rate": 0.000896358543417367, + "loss": 0.441, + "step": 3806 + }, + { + "epoch": 2.1268156424581006, + "grad_norm": 0.47583141922950745, + "learning_rate": 0.0008963305322128852, + "loss": 0.4818, + "step": 3807 + }, + { + "epoch": 2.1273743016759776, + "grad_norm": 0.6223709583282471, + "learning_rate": 0.0008963025210084034, + "loss": 0.4252, + "step": 3808 + }, + { + "epoch": 2.1279329608938546, + "grad_norm": 0.8416194319725037, + "learning_rate": 0.0008962745098039216, + "loss": 0.6117, + "step": 3809 + }, + { + "epoch": 2.1284916201117317, + "grad_norm": 1.0066306591033936, + "learning_rate": 0.0008962464985994398, + "loss": 0.5445, + "step": 3810 + }, + { + "epoch": 2.129050279329609, + "grad_norm": 0.6163341999053955, + "learning_rate": 0.000896218487394958, + "loss": 0.5861, + "step": 3811 + }, + { + "epoch": 2.129608938547486, + "grad_norm": 0.5084480047225952, + "learning_rate": 0.0008961904761904762, + "loss": 0.4558, + "step": 3812 + }, + { + "epoch": 2.130167597765363, + "grad_norm": 0.447477787733078, + "learning_rate": 0.0008961624649859944, + "loss": 0.4645, + "step": 3813 + }, + { + "epoch": 2.13072625698324, + "grad_norm": 0.5391470789909363, + "learning_rate": 0.0008961344537815126, + "loss": 0.4588, + "step": 3814 + }, + { + "epoch": 2.131284916201117, + "grad_norm": 0.5313900709152222, + "learning_rate": 0.0008961064425770308, + "loss": 0.5352, + "step": 3815 + }, + { + "epoch": 2.1318435754189946, + "grad_norm": 0.5331398844718933, + "learning_rate": 0.000896078431372549, + "loss": 0.4543, + "step": 3816 + }, + { + "epoch": 2.1324022346368716, + "grad_norm": 0.5167649984359741, + "learning_rate": 0.0008960504201680672, + "loss": 0.5248, + "step": 3817 + }, + { + "epoch": 2.1329608938547486, + "grad_norm": 2.74029541015625, + "learning_rate": 0.0008960224089635854, + "loss": 0.6056, + "step": 3818 + }, + { + "epoch": 2.1335195530726256, + "grad_norm": 2.7519335746765137, + "learning_rate": 0.0008959943977591036, + "loss": 0.6211, + "step": 3819 + }, + { + "epoch": 2.1340782122905027, + "grad_norm": 0.5218809247016907, + "learning_rate": 0.0008959663865546219, + "loss": 0.5768, + "step": 3820 + }, + { + "epoch": 2.1346368715083797, + "grad_norm": 0.5395709872245789, + "learning_rate": 0.0008959383753501401, + "loss": 0.5176, + "step": 3821 + }, + { + "epoch": 2.135195530726257, + "grad_norm": 0.49308541417121887, + "learning_rate": 0.0008959103641456583, + "loss": 0.4127, + "step": 3822 + }, + { + "epoch": 2.135754189944134, + "grad_norm": 0.6129255890846252, + "learning_rate": 0.0008958823529411765, + "loss": 0.6359, + "step": 3823 + }, + { + "epoch": 2.136312849162011, + "grad_norm": 0.5810296535491943, + "learning_rate": 0.0008958543417366947, + "loss": 0.3593, + "step": 3824 + }, + { + "epoch": 2.136871508379888, + "grad_norm": 1.3670779466629028, + "learning_rate": 0.0008958263305322129, + "loss": 0.4384, + "step": 3825 + }, + { + "epoch": 2.137430167597765, + "grad_norm": 0.4468787908554077, + "learning_rate": 0.0008957983193277312, + "loss": 0.3445, + "step": 3826 + }, + { + "epoch": 2.1379888268156426, + "grad_norm": 0.47791972756385803, + "learning_rate": 0.0008957703081232493, + "loss": 0.4176, + "step": 3827 + }, + { + "epoch": 2.1385474860335196, + "grad_norm": 0.8680862784385681, + "learning_rate": 0.0008957422969187675, + "loss": 0.4593, + "step": 3828 + }, + { + "epoch": 2.1391061452513966, + "grad_norm": 0.47572362422943115, + "learning_rate": 0.0008957142857142857, + "loss": 0.5265, + "step": 3829 + }, + { + "epoch": 2.1396648044692737, + "grad_norm": 0.6244614124298096, + "learning_rate": 0.0008956862745098039, + "loss": 0.4254, + "step": 3830 + }, + { + "epoch": 2.1402234636871507, + "grad_norm": 0.7154577970504761, + "learning_rate": 0.0008956582633053222, + "loss": 0.4698, + "step": 3831 + }, + { + "epoch": 2.140782122905028, + "grad_norm": 0.7574757933616638, + "learning_rate": 0.0008956302521008403, + "loss": 0.4249, + "step": 3832 + }, + { + "epoch": 2.141340782122905, + "grad_norm": 0.5189238786697388, + "learning_rate": 0.0008956022408963585, + "loss": 0.4772, + "step": 3833 + }, + { + "epoch": 2.141899441340782, + "grad_norm": 3.430995225906372, + "learning_rate": 0.0008955742296918767, + "loss": 0.4976, + "step": 3834 + }, + { + "epoch": 2.142458100558659, + "grad_norm": 0.6838756799697876, + "learning_rate": 0.0008955462184873949, + "loss": 0.5701, + "step": 3835 + }, + { + "epoch": 2.143016759776536, + "grad_norm": 0.5704289078712463, + "learning_rate": 0.0008955182072829133, + "loss": 0.4228, + "step": 3836 + }, + { + "epoch": 2.1435754189944136, + "grad_norm": 0.4766763746738434, + "learning_rate": 0.0008954901960784314, + "loss": 0.5659, + "step": 3837 + }, + { + "epoch": 2.1441340782122906, + "grad_norm": 0.8810789585113525, + "learning_rate": 0.0008954621848739496, + "loss": 0.5164, + "step": 3838 + }, + { + "epoch": 2.1446927374301676, + "grad_norm": 0.4730139970779419, + "learning_rate": 0.0008954341736694678, + "loss": 0.4843, + "step": 3839 + }, + { + "epoch": 2.1452513966480447, + "grad_norm": 0.7862569093704224, + "learning_rate": 0.000895406162464986, + "loss": 0.4956, + "step": 3840 + }, + { + "epoch": 2.1458100558659217, + "grad_norm": 5.963804244995117, + "learning_rate": 0.0008953781512605043, + "loss": 0.4541, + "step": 3841 + }, + { + "epoch": 2.146368715083799, + "grad_norm": 0.562130331993103, + "learning_rate": 0.0008953501400560225, + "loss": 0.4078, + "step": 3842 + }, + { + "epoch": 2.146927374301676, + "grad_norm": 0.5721172094345093, + "learning_rate": 0.0008953221288515406, + "loss": 0.5345, + "step": 3843 + }, + { + "epoch": 2.147486033519553, + "grad_norm": 0.67780601978302, + "learning_rate": 0.0008952941176470588, + "loss": 0.4785, + "step": 3844 + }, + { + "epoch": 2.14804469273743, + "grad_norm": 0.45617246627807617, + "learning_rate": 0.000895266106442577, + "loss": 0.4376, + "step": 3845 + }, + { + "epoch": 2.148603351955307, + "grad_norm": 0.43296343088150024, + "learning_rate": 0.0008952380952380953, + "loss": 0.4024, + "step": 3846 + }, + { + "epoch": 2.149162011173184, + "grad_norm": 0.5535196661949158, + "learning_rate": 0.0008952100840336135, + "loss": 0.43, + "step": 3847 + }, + { + "epoch": 2.1497206703910616, + "grad_norm": 0.6729576587677002, + "learning_rate": 0.0008951820728291316, + "loss": 0.475, + "step": 3848 + }, + { + "epoch": 2.1502793296089386, + "grad_norm": 0.5174770355224609, + "learning_rate": 0.0008951540616246498, + "loss": 0.459, + "step": 3849 + }, + { + "epoch": 2.1508379888268156, + "grad_norm": 1.0330302715301514, + "learning_rate": 0.000895126050420168, + "loss": 0.5393, + "step": 3850 + }, + { + "epoch": 2.1513966480446927, + "grad_norm": 0.4281381666660309, + "learning_rate": 0.0008950980392156863, + "loss": 0.3785, + "step": 3851 + }, + { + "epoch": 2.1519553072625697, + "grad_norm": 0.8313674926757812, + "learning_rate": 0.0008950700280112046, + "loss": 0.4172, + "step": 3852 + }, + { + "epoch": 2.152513966480447, + "grad_norm": 1.3019518852233887, + "learning_rate": 0.0008950420168067227, + "loss": 0.4977, + "step": 3853 + }, + { + "epoch": 2.153072625698324, + "grad_norm": 2.2441890239715576, + "learning_rate": 0.0008950140056022409, + "loss": 0.5788, + "step": 3854 + }, + { + "epoch": 2.153631284916201, + "grad_norm": 0.8006486892700195, + "learning_rate": 0.0008949859943977591, + "loss": 0.496, + "step": 3855 + }, + { + "epoch": 2.154189944134078, + "grad_norm": 0.5741393566131592, + "learning_rate": 0.0008949579831932774, + "loss": 0.5004, + "step": 3856 + }, + { + "epoch": 2.154748603351955, + "grad_norm": 0.7082931399345398, + "learning_rate": 0.0008949299719887956, + "loss": 0.4286, + "step": 3857 + }, + { + "epoch": 2.155307262569832, + "grad_norm": 17.997554779052734, + "learning_rate": 0.0008949019607843138, + "loss": 0.366, + "step": 3858 + }, + { + "epoch": 2.1558659217877096, + "grad_norm": 3.6697189807891846, + "learning_rate": 0.0008948739495798319, + "loss": 0.5159, + "step": 3859 + }, + { + "epoch": 2.1564245810055866, + "grad_norm": 1.1577590703964233, + "learning_rate": 0.0008948459383753501, + "loss": 0.4831, + "step": 3860 + }, + { + "epoch": 2.1569832402234637, + "grad_norm": 0.8880429267883301, + "learning_rate": 0.0008948179271708684, + "loss": 0.59, + "step": 3861 + }, + { + "epoch": 2.1575418994413407, + "grad_norm": 3.190842628479004, + "learning_rate": 0.0008947899159663866, + "loss": 0.6169, + "step": 3862 + }, + { + "epoch": 2.1581005586592177, + "grad_norm": 1.0041362047195435, + "learning_rate": 0.0008947619047619048, + "loss": 0.5001, + "step": 3863 + }, + { + "epoch": 2.158659217877095, + "grad_norm": 0.629850447177887, + "learning_rate": 0.0008947338935574229, + "loss": 0.5899, + "step": 3864 + }, + { + "epoch": 2.159217877094972, + "grad_norm": 1.105079174041748, + "learning_rate": 0.0008947058823529411, + "loss": 0.4194, + "step": 3865 + }, + { + "epoch": 2.159776536312849, + "grad_norm": 1.2017194032669067, + "learning_rate": 0.0008946778711484594, + "loss": 0.5322, + "step": 3866 + }, + { + "epoch": 2.160335195530726, + "grad_norm": 0.8280863165855408, + "learning_rate": 0.0008946498599439776, + "loss": 0.5519, + "step": 3867 + }, + { + "epoch": 2.160893854748603, + "grad_norm": 2.054886817932129, + "learning_rate": 0.0008946218487394959, + "loss": 0.5436, + "step": 3868 + }, + { + "epoch": 2.1614525139664806, + "grad_norm": 0.6127734184265137, + "learning_rate": 0.000894593837535014, + "loss": 0.5537, + "step": 3869 + }, + { + "epoch": 2.1620111731843576, + "grad_norm": 0.6851430535316467, + "learning_rate": 0.0008945658263305322, + "loss": 0.4636, + "step": 3870 + }, + { + "epoch": 2.1625698324022347, + "grad_norm": 0.6148260831832886, + "learning_rate": 0.0008945378151260505, + "loss": 0.373, + "step": 3871 + }, + { + "epoch": 2.1631284916201117, + "grad_norm": 0.5657532215118408, + "learning_rate": 0.0008945098039215687, + "loss": 0.603, + "step": 3872 + }, + { + "epoch": 2.1636871508379887, + "grad_norm": 0.6626763939857483, + "learning_rate": 0.0008944817927170869, + "loss": 0.5025, + "step": 3873 + }, + { + "epoch": 2.164245810055866, + "grad_norm": 0.41087618470191956, + "learning_rate": 0.0008944537815126051, + "loss": 0.3988, + "step": 3874 + }, + { + "epoch": 2.164804469273743, + "grad_norm": 0.6639741659164429, + "learning_rate": 0.0008944257703081232, + "loss": 0.4227, + "step": 3875 + }, + { + "epoch": 2.16536312849162, + "grad_norm": 0.4683172106742859, + "learning_rate": 0.0008943977591036415, + "loss": 0.4732, + "step": 3876 + }, + { + "epoch": 2.165921787709497, + "grad_norm": 0.47243303060531616, + "learning_rate": 0.0008943697478991597, + "loss": 0.3917, + "step": 3877 + }, + { + "epoch": 2.166480446927374, + "grad_norm": 0.7888584136962891, + "learning_rate": 0.0008943417366946779, + "loss": 0.6877, + "step": 3878 + }, + { + "epoch": 2.167039106145251, + "grad_norm": 0.5306342244148254, + "learning_rate": 0.0008943137254901961, + "loss": 0.4708, + "step": 3879 + }, + { + "epoch": 2.1675977653631286, + "grad_norm": 0.5245224833488464, + "learning_rate": 0.0008942857142857142, + "loss": 0.4673, + "step": 3880 + }, + { + "epoch": 2.1681564245810057, + "grad_norm": 0.6621931791305542, + "learning_rate": 0.0008942577030812325, + "loss": 0.5821, + "step": 3881 + }, + { + "epoch": 2.1687150837988827, + "grad_norm": 0.5790925025939941, + "learning_rate": 0.0008942296918767507, + "loss": 0.4347, + "step": 3882 + }, + { + "epoch": 2.1692737430167597, + "grad_norm": 0.5362594127655029, + "learning_rate": 0.0008942016806722689, + "loss": 0.531, + "step": 3883 + }, + { + "epoch": 2.1698324022346367, + "grad_norm": 0.761273980140686, + "learning_rate": 0.0008941736694677871, + "loss": 0.4826, + "step": 3884 + }, + { + "epoch": 2.170391061452514, + "grad_norm": 0.6512978672981262, + "learning_rate": 0.0008941456582633052, + "loss": 0.3859, + "step": 3885 + }, + { + "epoch": 2.170949720670391, + "grad_norm": 0.546739399433136, + "learning_rate": 0.0008941176470588236, + "loss": 0.5693, + "step": 3886 + }, + { + "epoch": 2.171508379888268, + "grad_norm": 0.5866378545761108, + "learning_rate": 0.0008940896358543418, + "loss": 0.5104, + "step": 3887 + }, + { + "epoch": 2.172067039106145, + "grad_norm": 0.4839584231376648, + "learning_rate": 0.00089406162464986, + "loss": 0.4134, + "step": 3888 + }, + { + "epoch": 2.172625698324022, + "grad_norm": 1.3358453512191772, + "learning_rate": 0.0008940336134453782, + "loss": 0.4414, + "step": 3889 + }, + { + "epoch": 2.1731843575418996, + "grad_norm": 0.529212474822998, + "learning_rate": 0.0008940056022408964, + "loss": 0.4399, + "step": 3890 + }, + { + "epoch": 2.1737430167597767, + "grad_norm": 0.6501895189285278, + "learning_rate": 0.0008939775910364146, + "loss": 0.4201, + "step": 3891 + }, + { + "epoch": 2.1743016759776537, + "grad_norm": 0.6130054593086243, + "learning_rate": 0.0008939495798319328, + "loss": 0.4556, + "step": 3892 + }, + { + "epoch": 2.1748603351955307, + "grad_norm": 0.510717511177063, + "learning_rate": 0.000893921568627451, + "loss": 0.4676, + "step": 3893 + }, + { + "epoch": 2.1754189944134077, + "grad_norm": 0.7903940677642822, + "learning_rate": 0.0008938935574229692, + "loss": 0.459, + "step": 3894 + }, + { + "epoch": 2.1759776536312847, + "grad_norm": 0.46124550700187683, + "learning_rate": 0.0008938655462184874, + "loss": 0.4928, + "step": 3895 + }, + { + "epoch": 2.176536312849162, + "grad_norm": 2.1835978031158447, + "learning_rate": 0.0008938375350140056, + "loss": 0.669, + "step": 3896 + }, + { + "epoch": 2.177094972067039, + "grad_norm": 1.0175588130950928, + "learning_rate": 0.0008938095238095238, + "loss": 0.4085, + "step": 3897 + }, + { + "epoch": 2.177653631284916, + "grad_norm": 0.813089907169342, + "learning_rate": 0.000893781512605042, + "loss": 0.6075, + "step": 3898 + }, + { + "epoch": 2.178212290502793, + "grad_norm": 0.4249681234359741, + "learning_rate": 0.0008937535014005602, + "loss": 0.4756, + "step": 3899 + }, + { + "epoch": 2.17877094972067, + "grad_norm": 2.335358142852783, + "learning_rate": 0.0008937254901960784, + "loss": 0.3813, + "step": 3900 + }, + { + "epoch": 2.1793296089385477, + "grad_norm": 0.6547650694847107, + "learning_rate": 0.0008936974789915966, + "loss": 0.5097, + "step": 3901 + }, + { + "epoch": 2.1798882681564247, + "grad_norm": 0.45946362614631653, + "learning_rate": 0.0008936694677871149, + "loss": 0.4532, + "step": 3902 + }, + { + "epoch": 2.1804469273743017, + "grad_norm": 0.6938641667366028, + "learning_rate": 0.0008936414565826331, + "loss": 0.4465, + "step": 3903 + }, + { + "epoch": 2.1810055865921787, + "grad_norm": 0.4258013665676117, + "learning_rate": 0.0008936134453781513, + "loss": 0.4379, + "step": 3904 + }, + { + "epoch": 2.1815642458100557, + "grad_norm": 0.7088890075683594, + "learning_rate": 0.0008935854341736695, + "loss": 0.4482, + "step": 3905 + }, + { + "epoch": 2.182122905027933, + "grad_norm": 0.6079127192497253, + "learning_rate": 0.0008935574229691878, + "loss": 0.45, + "step": 3906 + }, + { + "epoch": 2.18268156424581, + "grad_norm": 0.8777434825897217, + "learning_rate": 0.0008935294117647059, + "loss": 0.495, + "step": 3907 + }, + { + "epoch": 2.183240223463687, + "grad_norm": 3.983078956604004, + "learning_rate": 0.0008935014005602241, + "loss": 0.6685, + "step": 3908 + }, + { + "epoch": 2.183798882681564, + "grad_norm": 0.49473270773887634, + "learning_rate": 0.0008934733893557423, + "loss": 0.4859, + "step": 3909 + }, + { + "epoch": 2.184357541899441, + "grad_norm": 0.42636168003082275, + "learning_rate": 0.0008934453781512605, + "loss": 0.3903, + "step": 3910 + }, + { + "epoch": 2.1849162011173187, + "grad_norm": 0.5656536221504211, + "learning_rate": 0.0008934173669467788, + "loss": 0.4414, + "step": 3911 + }, + { + "epoch": 2.1854748603351957, + "grad_norm": 0.6124873757362366, + "learning_rate": 0.0008933893557422969, + "loss": 0.5598, + "step": 3912 + }, + { + "epoch": 2.1860335195530727, + "grad_norm": 0.5090885162353516, + "learning_rate": 0.0008933613445378151, + "loss": 0.4544, + "step": 3913 + }, + { + "epoch": 2.1865921787709497, + "grad_norm": 0.6983973979949951, + "learning_rate": 0.0008933333333333333, + "loss": 0.5386, + "step": 3914 + }, + { + "epoch": 2.1871508379888267, + "grad_norm": 1.056702733039856, + "learning_rate": 0.0008933053221288515, + "loss": 0.5133, + "step": 3915 + }, + { + "epoch": 2.1877094972067037, + "grad_norm": 0.4468769133090973, + "learning_rate": 0.0008932773109243698, + "loss": 0.4828, + "step": 3916 + }, + { + "epoch": 2.188268156424581, + "grad_norm": 0.665208637714386, + "learning_rate": 0.0008932492997198879, + "loss": 0.5185, + "step": 3917 + }, + { + "epoch": 2.188826815642458, + "grad_norm": 1.4855750799179077, + "learning_rate": 0.0008932212885154062, + "loss": 0.5367, + "step": 3918 + }, + { + "epoch": 2.189385474860335, + "grad_norm": 0.5110841393470764, + "learning_rate": 0.0008931932773109244, + "loss": 0.5942, + "step": 3919 + }, + { + "epoch": 2.189944134078212, + "grad_norm": 0.48792195320129395, + "learning_rate": 0.0008931652661064426, + "loss": 0.6568, + "step": 3920 + }, + { + "epoch": 2.190502793296089, + "grad_norm": 0.9303848743438721, + "learning_rate": 0.0008931372549019609, + "loss": 0.4585, + "step": 3921 + }, + { + "epoch": 2.1910614525139667, + "grad_norm": 0.5885030031204224, + "learning_rate": 0.0008931092436974791, + "loss": 0.3972, + "step": 3922 + }, + { + "epoch": 2.1916201117318437, + "grad_norm": 0.4627387821674347, + "learning_rate": 0.0008930812324929972, + "loss": 0.3675, + "step": 3923 + }, + { + "epoch": 2.1921787709497207, + "grad_norm": 0.6177243590354919, + "learning_rate": 0.0008930532212885154, + "loss": 0.3952, + "step": 3924 + }, + { + "epoch": 2.1927374301675977, + "grad_norm": 7.05940580368042, + "learning_rate": 0.0008930252100840336, + "loss": 0.4125, + "step": 3925 + }, + { + "epoch": 2.1932960893854747, + "grad_norm": 0.6348626613616943, + "learning_rate": 0.0008929971988795519, + "loss": 0.4596, + "step": 3926 + }, + { + "epoch": 2.1938547486033517, + "grad_norm": 0.8002322912216187, + "learning_rate": 0.0008929691876750701, + "loss": 0.4804, + "step": 3927 + }, + { + "epoch": 2.194413407821229, + "grad_norm": 1.0392259359359741, + "learning_rate": 0.0008929411764705882, + "loss": 0.4416, + "step": 3928 + }, + { + "epoch": 2.194972067039106, + "grad_norm": 0.43021318316459656, + "learning_rate": 0.0008929131652661064, + "loss": 0.3832, + "step": 3929 + }, + { + "epoch": 2.195530726256983, + "grad_norm": 0.8332733511924744, + "learning_rate": 0.0008928851540616246, + "loss": 0.4439, + "step": 3930 + }, + { + "epoch": 2.19608938547486, + "grad_norm": 0.4537784159183502, + "learning_rate": 0.0008928571428571429, + "loss": 0.4118, + "step": 3931 + }, + { + "epoch": 2.1966480446927372, + "grad_norm": 0.6397614479064941, + "learning_rate": 0.0008928291316526611, + "loss": 0.6567, + "step": 3932 + }, + { + "epoch": 2.1972067039106147, + "grad_norm": 0.613127589225769, + "learning_rate": 0.0008928011204481792, + "loss": 0.4477, + "step": 3933 + }, + { + "epoch": 2.1977653631284917, + "grad_norm": 0.4283999502658844, + "learning_rate": 0.0008927731092436974, + "loss": 0.4176, + "step": 3934 + }, + { + "epoch": 2.1983240223463687, + "grad_norm": 0.9790128469467163, + "learning_rate": 0.0008927450980392157, + "loss": 0.397, + "step": 3935 + }, + { + "epoch": 2.1988826815642457, + "grad_norm": 0.5619804263114929, + "learning_rate": 0.000892717086834734, + "loss": 0.5685, + "step": 3936 + }, + { + "epoch": 2.1994413407821227, + "grad_norm": 0.6304671764373779, + "learning_rate": 0.0008926890756302522, + "loss": 0.5531, + "step": 3937 + }, + { + "epoch": 2.2, + "grad_norm": 0.5057224631309509, + "learning_rate": 0.0008926610644257704, + "loss": 0.5876, + "step": 3938 + }, + { + "epoch": 2.200558659217877, + "grad_norm": 0.5985106825828552, + "learning_rate": 0.0008926330532212885, + "loss": 0.7314, + "step": 3939 + }, + { + "epoch": 2.201117318435754, + "grad_norm": 0.8729943633079529, + "learning_rate": 0.0008926050420168067, + "loss": 0.551, + "step": 3940 + }, + { + "epoch": 2.201675977653631, + "grad_norm": 0.47301408648490906, + "learning_rate": 0.000892577030812325, + "loss": 0.433, + "step": 3941 + }, + { + "epoch": 2.2022346368715082, + "grad_norm": 0.5290800333023071, + "learning_rate": 0.0008925490196078432, + "loss": 0.7571, + "step": 3942 + }, + { + "epoch": 2.2027932960893857, + "grad_norm": 0.9559956192970276, + "learning_rate": 0.0008925210084033614, + "loss": 0.4057, + "step": 3943 + }, + { + "epoch": 2.2033519553072627, + "grad_norm": 0.6105080842971802, + "learning_rate": 0.0008924929971988795, + "loss": 0.4293, + "step": 3944 + }, + { + "epoch": 2.2039106145251397, + "grad_norm": 1.232312560081482, + "learning_rate": 0.0008924649859943977, + "loss": 0.4993, + "step": 3945 + }, + { + "epoch": 2.2044692737430167, + "grad_norm": 0.8616457581520081, + "learning_rate": 0.000892436974789916, + "loss": 0.5239, + "step": 3946 + }, + { + "epoch": 2.2050279329608937, + "grad_norm": 0.7790232300758362, + "learning_rate": 0.0008924089635854342, + "loss": 0.5504, + "step": 3947 + }, + { + "epoch": 2.205586592178771, + "grad_norm": 1.1775554418563843, + "learning_rate": 0.0008923809523809524, + "loss": 0.4108, + "step": 3948 + }, + { + "epoch": 2.206145251396648, + "grad_norm": 0.7443337440490723, + "learning_rate": 0.0008923529411764705, + "loss": 0.6079, + "step": 3949 + }, + { + "epoch": 2.206703910614525, + "grad_norm": 1.3190522193908691, + "learning_rate": 0.0008923249299719887, + "loss": 0.527, + "step": 3950 + }, + { + "epoch": 2.207262569832402, + "grad_norm": 0.5428374409675598, + "learning_rate": 0.0008922969187675071, + "loss": 0.5548, + "step": 3951 + }, + { + "epoch": 2.207821229050279, + "grad_norm": 0.6532554030418396, + "learning_rate": 0.0008922689075630253, + "loss": 0.6963, + "step": 3952 + }, + { + "epoch": 2.2083798882681562, + "grad_norm": 0.888231635093689, + "learning_rate": 0.0008922408963585435, + "loss": 0.5792, + "step": 3953 + }, + { + "epoch": 2.2089385474860337, + "grad_norm": 0.4321534037590027, + "learning_rate": 0.0008922128851540617, + "loss": 0.4117, + "step": 3954 + }, + { + "epoch": 2.2094972067039107, + "grad_norm": 0.5281447768211365, + "learning_rate": 0.0008921848739495798, + "loss": 0.5289, + "step": 3955 + }, + { + "epoch": 2.2100558659217877, + "grad_norm": 1.008543848991394, + "learning_rate": 0.0008921568627450981, + "loss": 0.4983, + "step": 3956 + }, + { + "epoch": 2.2106145251396647, + "grad_norm": 2.2757229804992676, + "learning_rate": 0.0008921288515406163, + "loss": 0.4311, + "step": 3957 + }, + { + "epoch": 2.2111731843575417, + "grad_norm": 1.3314635753631592, + "learning_rate": 0.0008921008403361345, + "loss": 0.5646, + "step": 3958 + }, + { + "epoch": 2.211731843575419, + "grad_norm": 0.6620938181877136, + "learning_rate": 0.0008920728291316527, + "loss": 0.4986, + "step": 3959 + }, + { + "epoch": 2.212290502793296, + "grad_norm": 1.910640835762024, + "learning_rate": 0.0008920448179271708, + "loss": 0.6316, + "step": 3960 + }, + { + "epoch": 2.212849162011173, + "grad_norm": 0.7703306078910828, + "learning_rate": 0.0008920168067226891, + "loss": 0.424, + "step": 3961 + }, + { + "epoch": 2.21340782122905, + "grad_norm": 0.5946208238601685, + "learning_rate": 0.0008919887955182073, + "loss": 0.5351, + "step": 3962 + }, + { + "epoch": 2.2139664804469272, + "grad_norm": 0.9763235449790955, + "learning_rate": 0.0008919607843137255, + "loss": 0.4968, + "step": 3963 + }, + { + "epoch": 2.2145251396648042, + "grad_norm": 7.759881496429443, + "learning_rate": 0.0008919327731092437, + "loss": 0.4845, + "step": 3964 + }, + { + "epoch": 2.2150837988826817, + "grad_norm": 0.6663423776626587, + "learning_rate": 0.0008919047619047618, + "loss": 0.4836, + "step": 3965 + }, + { + "epoch": 2.2156424581005587, + "grad_norm": 0.7887842655181885, + "learning_rate": 0.0008918767507002801, + "loss": 0.4292, + "step": 3966 + }, + { + "epoch": 2.2162011173184357, + "grad_norm": 0.570859432220459, + "learning_rate": 0.0008918487394957984, + "loss": 0.539, + "step": 3967 + }, + { + "epoch": 2.2167597765363127, + "grad_norm": 0.7126721739768982, + "learning_rate": 0.0008918207282913166, + "loss": 0.4837, + "step": 3968 + }, + { + "epoch": 2.2173184357541897, + "grad_norm": 0.5668030977249146, + "learning_rate": 0.0008917927170868348, + "loss": 0.5257, + "step": 3969 + }, + { + "epoch": 2.217877094972067, + "grad_norm": 0.41417011618614197, + "learning_rate": 0.000891764705882353, + "loss": 0.3347, + "step": 3970 + }, + { + "epoch": 2.218435754189944, + "grad_norm": 0.4726235270500183, + "learning_rate": 0.0008917366946778712, + "loss": 0.5055, + "step": 3971 + }, + { + "epoch": 2.218994413407821, + "grad_norm": 0.44764265418052673, + "learning_rate": 0.0008917086834733894, + "loss": 0.4741, + "step": 3972 + }, + { + "epoch": 2.2195530726256982, + "grad_norm": 0.4934897720813751, + "learning_rate": 0.0008916806722689076, + "loss": 0.3788, + "step": 3973 + }, + { + "epoch": 2.2201117318435752, + "grad_norm": 0.6134538054466248, + "learning_rate": 0.0008916526610644258, + "loss": 0.6054, + "step": 3974 + }, + { + "epoch": 2.2206703910614527, + "grad_norm": 8.542625427246094, + "learning_rate": 0.000891624649859944, + "loss": 0.5029, + "step": 3975 + }, + { + "epoch": 2.2212290502793297, + "grad_norm": 0.4735102653503418, + "learning_rate": 0.0008915966386554622, + "loss": 0.5312, + "step": 3976 + }, + { + "epoch": 2.2217877094972067, + "grad_norm": 0.8623790740966797, + "learning_rate": 0.0008915686274509804, + "loss": 0.4552, + "step": 3977 + }, + { + "epoch": 2.2223463687150837, + "grad_norm": 0.8594672679901123, + "learning_rate": 0.0008915406162464986, + "loss": 0.7722, + "step": 3978 + }, + { + "epoch": 2.2229050279329607, + "grad_norm": 1.4279512166976929, + "learning_rate": 0.0008915126050420168, + "loss": 0.5103, + "step": 3979 + }, + { + "epoch": 2.223463687150838, + "grad_norm": 0.8403601050376892, + "learning_rate": 0.000891484593837535, + "loss": 0.4125, + "step": 3980 + }, + { + "epoch": 2.224022346368715, + "grad_norm": 0.5547012090682983, + "learning_rate": 0.0008914565826330533, + "loss": 0.3444, + "step": 3981 + }, + { + "epoch": 2.224581005586592, + "grad_norm": 1.2253466844558716, + "learning_rate": 0.0008914285714285714, + "loss": 0.4463, + "step": 3982 + }, + { + "epoch": 2.2251396648044692, + "grad_norm": 0.5143917798995972, + "learning_rate": 0.0008914005602240896, + "loss": 0.5134, + "step": 3983 + }, + { + "epoch": 2.2256983240223462, + "grad_norm": 1.185542106628418, + "learning_rate": 0.0008913725490196079, + "loss": 0.5527, + "step": 3984 + }, + { + "epoch": 2.2262569832402237, + "grad_norm": 0.5383661985397339, + "learning_rate": 0.0008913445378151261, + "loss": 0.5151, + "step": 3985 + }, + { + "epoch": 2.2268156424581007, + "grad_norm": 0.44264593720436096, + "learning_rate": 0.0008913165266106444, + "loss": 0.4296, + "step": 3986 + }, + { + "epoch": 2.2273743016759777, + "grad_norm": 0.521536648273468, + "learning_rate": 0.0008912885154061625, + "loss": 0.4164, + "step": 3987 + }, + { + "epoch": 2.2279329608938547, + "grad_norm": 1.3124431371688843, + "learning_rate": 0.0008912605042016807, + "loss": 0.4444, + "step": 3988 + }, + { + "epoch": 2.2284916201117317, + "grad_norm": 0.6384272575378418, + "learning_rate": 0.0008912324929971989, + "loss": 0.6504, + "step": 3989 + }, + { + "epoch": 2.2290502793296088, + "grad_norm": 0.8629913330078125, + "learning_rate": 0.0008912044817927171, + "loss": 0.4836, + "step": 3990 + }, + { + "epoch": 2.229608938547486, + "grad_norm": 0.5492082834243774, + "learning_rate": 0.0008911764705882354, + "loss": 0.4701, + "step": 3991 + }, + { + "epoch": 2.230167597765363, + "grad_norm": 0.4890977144241333, + "learning_rate": 0.0008911484593837535, + "loss": 0.4474, + "step": 3992 + }, + { + "epoch": 2.2307262569832402, + "grad_norm": 0.5650962591171265, + "learning_rate": 0.0008911204481792717, + "loss": 0.4234, + "step": 3993 + }, + { + "epoch": 2.2312849162011172, + "grad_norm": 1.309747338294983, + "learning_rate": 0.0008910924369747899, + "loss": 0.3745, + "step": 3994 + }, + { + "epoch": 2.2318435754189943, + "grad_norm": 0.6015917062759399, + "learning_rate": 0.0008910644257703081, + "loss": 0.5381, + "step": 3995 + }, + { + "epoch": 2.2324022346368717, + "grad_norm": 1.0430498123168945, + "learning_rate": 0.0008910364145658264, + "loss": 0.5379, + "step": 3996 + }, + { + "epoch": 2.2329608938547487, + "grad_norm": 0.5327572822570801, + "learning_rate": 0.0008910084033613446, + "loss": 0.4768, + "step": 3997 + }, + { + "epoch": 2.2335195530726257, + "grad_norm": 3.224012613296509, + "learning_rate": 0.0008909803921568627, + "loss": 0.519, + "step": 3998 + }, + { + "epoch": 2.2340782122905027, + "grad_norm": 1.0534822940826416, + "learning_rate": 0.0008909523809523809, + "loss": 0.459, + "step": 3999 + }, + { + "epoch": 2.2346368715083798, + "grad_norm": 0.5711976289749146, + "learning_rate": 0.0008909243697478992, + "loss": 0.3572, + "step": 4000 + }, + { + "epoch": 2.2346368715083798, + "eval_cer": 0.09715448485046861, + "eval_loss": 0.36586418747901917, + "eval_runtime": 55.7043, + "eval_samples_per_second": 81.466, + "eval_steps_per_second": 5.098, + "eval_wer": 0.3814697913466104, + "step": 4000 + }, + { + "epoch": 2.2351955307262568, + "grad_norm": 0.5830895304679871, + "learning_rate": 0.0008908963585434175, + "loss": 0.4802, + "step": 4001 + }, + { + "epoch": 2.235754189944134, + "grad_norm": 0.8545703887939453, + "learning_rate": 0.0008908683473389357, + "loss": 0.5038, + "step": 4002 + }, + { + "epoch": 2.2363128491620112, + "grad_norm": 0.6758176684379578, + "learning_rate": 0.0008908403361344538, + "loss": 0.4568, + "step": 4003 + }, + { + "epoch": 2.2368715083798882, + "grad_norm": 0.6274861693382263, + "learning_rate": 0.000890812324929972, + "loss": 0.4008, + "step": 4004 + }, + { + "epoch": 2.2374301675977653, + "grad_norm": 0.9927757382392883, + "learning_rate": 0.0008907843137254902, + "loss": 0.5065, + "step": 4005 + }, + { + "epoch": 2.2379888268156423, + "grad_norm": 0.6257267594337463, + "learning_rate": 0.0008907563025210085, + "loss": 0.5046, + "step": 4006 + }, + { + "epoch": 2.2385474860335197, + "grad_norm": 0.5576756596565247, + "learning_rate": 0.0008907282913165267, + "loss": 0.4643, + "step": 4007 + }, + { + "epoch": 2.2391061452513967, + "grad_norm": 0.5263304710388184, + "learning_rate": 0.0008907002801120448, + "loss": 0.5706, + "step": 4008 + }, + { + "epoch": 2.2396648044692737, + "grad_norm": 0.4375152885913849, + "learning_rate": 0.000890672268907563, + "loss": 0.4671, + "step": 4009 + }, + { + "epoch": 2.2402234636871508, + "grad_norm": 0.6941837668418884, + "learning_rate": 0.0008906442577030812, + "loss": 0.4392, + "step": 4010 + }, + { + "epoch": 2.2407821229050278, + "grad_norm": 0.44970181584358215, + "learning_rate": 0.0008906162464985994, + "loss": 0.4216, + "step": 4011 + }, + { + "epoch": 2.241340782122905, + "grad_norm": 0.6526822447776794, + "learning_rate": 0.0008905882352941177, + "loss": 0.4845, + "step": 4012 + }, + { + "epoch": 2.2418994413407822, + "grad_norm": 0.4581945240497589, + "learning_rate": 0.0008905602240896359, + "loss": 0.4041, + "step": 4013 + }, + { + "epoch": 2.2424581005586592, + "grad_norm": 0.7085303068161011, + "learning_rate": 0.000890532212885154, + "loss": 0.3743, + "step": 4014 + }, + { + "epoch": 2.2430167597765363, + "grad_norm": 3.2189433574676514, + "learning_rate": 0.0008905042016806722, + "loss": 0.7906, + "step": 4015 + }, + { + "epoch": 2.2435754189944133, + "grad_norm": 1.275059461593628, + "learning_rate": 0.0008904761904761904, + "loss": 0.4727, + "step": 4016 + }, + { + "epoch": 2.2441340782122907, + "grad_norm": 0.7866321802139282, + "learning_rate": 0.0008904481792717088, + "loss": 0.431, + "step": 4017 + }, + { + "epoch": 2.2446927374301677, + "grad_norm": 0.6882569789886475, + "learning_rate": 0.000890420168067227, + "loss": 0.517, + "step": 4018 + }, + { + "epoch": 2.2452513966480447, + "grad_norm": 1.0975369215011597, + "learning_rate": 0.0008903921568627451, + "loss": 0.455, + "step": 4019 + }, + { + "epoch": 2.2458100558659218, + "grad_norm": 0.4408380091190338, + "learning_rate": 0.0008903641456582633, + "loss": 0.4277, + "step": 4020 + }, + { + "epoch": 2.2463687150837988, + "grad_norm": 0.4277670979499817, + "learning_rate": 0.0008903361344537815, + "loss": 0.4198, + "step": 4021 + }, + { + "epoch": 2.2469273743016758, + "grad_norm": 0.6119745969772339, + "learning_rate": 0.0008903081232492998, + "loss": 0.4625, + "step": 4022 + }, + { + "epoch": 2.2474860335195532, + "grad_norm": 0.6843535900115967, + "learning_rate": 0.000890280112044818, + "loss": 0.4642, + "step": 4023 + }, + { + "epoch": 2.2480446927374302, + "grad_norm": 1.420467734336853, + "learning_rate": 0.0008902521008403361, + "loss": 0.3801, + "step": 4024 + }, + { + "epoch": 2.2486033519553073, + "grad_norm": 0.6617404818534851, + "learning_rate": 0.0008902240896358543, + "loss": 0.4799, + "step": 4025 + }, + { + "epoch": 2.2491620111731843, + "grad_norm": 0.6966404318809509, + "learning_rate": 0.0008901960784313725, + "loss": 0.5039, + "step": 4026 + }, + { + "epoch": 2.2497206703910613, + "grad_norm": 0.6200904846191406, + "learning_rate": 0.0008901680672268908, + "loss": 0.373, + "step": 4027 + }, + { + "epoch": 2.2502793296089387, + "grad_norm": 1.8639546632766724, + "learning_rate": 0.000890140056022409, + "loss": 0.5356, + "step": 4028 + }, + { + "epoch": 2.2508379888268157, + "grad_norm": 0.7014147639274597, + "learning_rate": 0.0008901120448179272, + "loss": 0.362, + "step": 4029 + }, + { + "epoch": 2.2513966480446927, + "grad_norm": 0.5041413307189941, + "learning_rate": 0.0008900840336134453, + "loss": 0.444, + "step": 4030 + }, + { + "epoch": 2.2519553072625698, + "grad_norm": 0.5858452320098877, + "learning_rate": 0.0008900560224089635, + "loss": 0.6114, + "step": 4031 + }, + { + "epoch": 2.2525139664804468, + "grad_norm": 0.7455693483352661, + "learning_rate": 0.0008900280112044819, + "loss": 0.4134, + "step": 4032 + }, + { + "epoch": 2.253072625698324, + "grad_norm": 1.9302022457122803, + "learning_rate": 0.0008900000000000001, + "loss": 0.6002, + "step": 4033 + }, + { + "epoch": 2.2536312849162012, + "grad_norm": 0.5668044686317444, + "learning_rate": 0.0008899719887955183, + "loss": 0.4839, + "step": 4034 + }, + { + "epoch": 2.2541899441340782, + "grad_norm": 3.9623420238494873, + "learning_rate": 0.0008899439775910364, + "loss": 0.4099, + "step": 4035 + }, + { + "epoch": 2.2547486033519553, + "grad_norm": 0.4752205014228821, + "learning_rate": 0.0008899159663865546, + "loss": 0.4718, + "step": 4036 + }, + { + "epoch": 2.2553072625698323, + "grad_norm": 0.9224374294281006, + "learning_rate": 0.0008898879551820729, + "loss": 0.5898, + "step": 4037 + }, + { + "epoch": 2.2558659217877093, + "grad_norm": 0.5311445593833923, + "learning_rate": 0.0008898599439775911, + "loss": 0.4319, + "step": 4038 + }, + { + "epoch": 2.2564245810055867, + "grad_norm": 0.527908205986023, + "learning_rate": 0.0008898319327731093, + "loss": 0.4151, + "step": 4039 + }, + { + "epoch": 2.2569832402234637, + "grad_norm": 0.5853949785232544, + "learning_rate": 0.0008898039215686274, + "loss": 0.5628, + "step": 4040 + }, + { + "epoch": 2.2575418994413408, + "grad_norm": 1.008521318435669, + "learning_rate": 0.0008897759103641456, + "loss": 0.5459, + "step": 4041 + }, + { + "epoch": 2.2581005586592178, + "grad_norm": 1.5214451551437378, + "learning_rate": 0.0008897478991596639, + "loss": 0.3984, + "step": 4042 + }, + { + "epoch": 2.258659217877095, + "grad_norm": 0.6478580236434937, + "learning_rate": 0.0008897198879551821, + "loss": 0.4768, + "step": 4043 + }, + { + "epoch": 2.2592178770949722, + "grad_norm": 0.7572171688079834, + "learning_rate": 0.0008896918767507003, + "loss": 0.5513, + "step": 4044 + }, + { + "epoch": 2.2597765363128492, + "grad_norm": 0.4587641656398773, + "learning_rate": 0.0008896638655462185, + "loss": 0.4634, + "step": 4045 + }, + { + "epoch": 2.2603351955307263, + "grad_norm": 0.5504059195518494, + "learning_rate": 0.0008896358543417366, + "loss": 0.4738, + "step": 4046 + }, + { + "epoch": 2.2608938547486033, + "grad_norm": 0.674247682094574, + "learning_rate": 0.0008896078431372549, + "loss": 0.4244, + "step": 4047 + }, + { + "epoch": 2.2614525139664803, + "grad_norm": 0.9136465787887573, + "learning_rate": 0.0008895798319327731, + "loss": 0.607, + "step": 4048 + }, + { + "epoch": 2.2620111731843577, + "grad_norm": 0.725601851940155, + "learning_rate": 0.0008895518207282914, + "loss": 0.461, + "step": 4049 + }, + { + "epoch": 2.2625698324022347, + "grad_norm": 0.6201837658882141, + "learning_rate": 0.0008895238095238096, + "loss": 0.5212, + "step": 4050 + }, + { + "epoch": 2.2631284916201118, + "grad_norm": 0.5169089436531067, + "learning_rate": 0.0008894957983193277, + "loss": 0.5246, + "step": 4051 + }, + { + "epoch": 2.2636871508379888, + "grad_norm": 0.5812177658081055, + "learning_rate": 0.000889467787114846, + "loss": 0.399, + "step": 4052 + }, + { + "epoch": 2.264245810055866, + "grad_norm": 0.5390665531158447, + "learning_rate": 0.0008894397759103642, + "loss": 0.5036, + "step": 4053 + }, + { + "epoch": 2.2648044692737432, + "grad_norm": 0.6369917988777161, + "learning_rate": 0.0008894117647058824, + "loss": 0.4432, + "step": 4054 + }, + { + "epoch": 2.2653631284916202, + "grad_norm": 0.6308425664901733, + "learning_rate": 0.0008893837535014006, + "loss": 0.5208, + "step": 4055 + }, + { + "epoch": 2.2659217877094973, + "grad_norm": 0.4943470060825348, + "learning_rate": 0.0008893557422969187, + "loss": 0.4957, + "step": 4056 + }, + { + "epoch": 2.2664804469273743, + "grad_norm": 0.6871525645256042, + "learning_rate": 0.000889327731092437, + "loss": 0.5049, + "step": 4057 + }, + { + "epoch": 2.2670391061452513, + "grad_norm": 1.1248698234558105, + "learning_rate": 0.0008892997198879552, + "loss": 0.521, + "step": 4058 + }, + { + "epoch": 2.2675977653631287, + "grad_norm": 0.6286033391952515, + "learning_rate": 0.0008892717086834734, + "loss": 0.4176, + "step": 4059 + }, + { + "epoch": 2.2681564245810057, + "grad_norm": 1.6168240308761597, + "learning_rate": 0.0008892436974789916, + "loss": 0.564, + "step": 4060 + }, + { + "epoch": 2.2687150837988828, + "grad_norm": 0.4227534234523773, + "learning_rate": 0.0008892156862745098, + "loss": 0.3782, + "step": 4061 + }, + { + "epoch": 2.2692737430167598, + "grad_norm": 0.6461851596832275, + "learning_rate": 0.000889187675070028, + "loss": 0.4866, + "step": 4062 + }, + { + "epoch": 2.269832402234637, + "grad_norm": 0.7072750329971313, + "learning_rate": 0.0008891596638655462, + "loss": 0.8246, + "step": 4063 + }, + { + "epoch": 2.270391061452514, + "grad_norm": 0.545943558216095, + "learning_rate": 0.0008891316526610644, + "loss": 0.5434, + "step": 4064 + }, + { + "epoch": 2.2709497206703912, + "grad_norm": 0.5444265007972717, + "learning_rate": 0.0008891036414565826, + "loss": 0.493, + "step": 4065 + }, + { + "epoch": 2.2715083798882683, + "grad_norm": 0.6745163798332214, + "learning_rate": 0.0008890756302521009, + "loss": 0.4715, + "step": 4066 + }, + { + "epoch": 2.2720670391061453, + "grad_norm": 2.209089756011963, + "learning_rate": 0.0008890476190476191, + "loss": 0.4144, + "step": 4067 + }, + { + "epoch": 2.2726256983240223, + "grad_norm": 0.6629486083984375, + "learning_rate": 0.0008890196078431373, + "loss": 0.4839, + "step": 4068 + }, + { + "epoch": 2.2731843575418993, + "grad_norm": 0.6990852355957031, + "learning_rate": 0.0008889915966386555, + "loss": 0.4421, + "step": 4069 + }, + { + "epoch": 2.2737430167597763, + "grad_norm": 0.46221858263015747, + "learning_rate": 0.0008889635854341737, + "loss": 0.3601, + "step": 4070 + }, + { + "epoch": 2.2743016759776538, + "grad_norm": 1.5171685218811035, + "learning_rate": 0.0008889355742296919, + "loss": 0.6345, + "step": 4071 + }, + { + "epoch": 2.2748603351955308, + "grad_norm": 0.8347725868225098, + "learning_rate": 0.0008889075630252101, + "loss": 0.4724, + "step": 4072 + }, + { + "epoch": 2.275418994413408, + "grad_norm": 0.8772679567337036, + "learning_rate": 0.0008888795518207283, + "loss": 0.6179, + "step": 4073 + }, + { + "epoch": 2.275977653631285, + "grad_norm": 0.5657960772514343, + "learning_rate": 0.0008888515406162465, + "loss": 0.4992, + "step": 4074 + }, + { + "epoch": 2.276536312849162, + "grad_norm": 0.8635707497596741, + "learning_rate": 0.0008888235294117647, + "loss": 0.4538, + "step": 4075 + }, + { + "epoch": 2.2770949720670393, + "grad_norm": 1.282912254333496, + "learning_rate": 0.0008887955182072829, + "loss": 0.7064, + "step": 4076 + }, + { + "epoch": 2.2776536312849163, + "grad_norm": 0.5405939817428589, + "learning_rate": 0.0008887675070028012, + "loss": 0.3681, + "step": 4077 + }, + { + "epoch": 2.2782122905027933, + "grad_norm": 0.7191981077194214, + "learning_rate": 0.0008887394957983193, + "loss": 0.4156, + "step": 4078 + }, + { + "epoch": 2.2787709497206703, + "grad_norm": 0.4254477620124817, + "learning_rate": 0.0008887114845938375, + "loss": 0.4195, + "step": 4079 + }, + { + "epoch": 2.2793296089385473, + "grad_norm": 0.8394465446472168, + "learning_rate": 0.0008886834733893557, + "loss": 0.5782, + "step": 4080 + }, + { + "epoch": 2.2798882681564248, + "grad_norm": 0.48060542345046997, + "learning_rate": 0.0008886554621848739, + "loss": 0.3882, + "step": 4081 + }, + { + "epoch": 2.2804469273743018, + "grad_norm": 0.714497983455658, + "learning_rate": 0.0008886274509803923, + "loss": 0.5803, + "step": 4082 + }, + { + "epoch": 2.281005586592179, + "grad_norm": 0.6810247898101807, + "learning_rate": 0.0008885994397759104, + "loss": 0.4857, + "step": 4083 + }, + { + "epoch": 2.281564245810056, + "grad_norm": 0.5977585911750793, + "learning_rate": 0.0008885714285714286, + "loss": 0.5131, + "step": 4084 + }, + { + "epoch": 2.282122905027933, + "grad_norm": 0.5889948606491089, + "learning_rate": 0.0008885434173669468, + "loss": 0.5389, + "step": 4085 + }, + { + "epoch": 2.2826815642458103, + "grad_norm": 0.8167197108268738, + "learning_rate": 0.000888515406162465, + "loss": 0.4406, + "step": 4086 + }, + { + "epoch": 2.2832402234636873, + "grad_norm": 0.4687158465385437, + "learning_rate": 0.0008884873949579833, + "loss": 0.4221, + "step": 4087 + }, + { + "epoch": 2.2837988826815643, + "grad_norm": 0.5708418488502502, + "learning_rate": 0.0008884593837535014, + "loss": 0.3652, + "step": 4088 + }, + { + "epoch": 2.2843575418994413, + "grad_norm": 0.5562434792518616, + "learning_rate": 0.0008884313725490196, + "loss": 0.4668, + "step": 4089 + }, + { + "epoch": 2.2849162011173183, + "grad_norm": 0.6504343152046204, + "learning_rate": 0.0008884033613445378, + "loss": 0.3874, + "step": 4090 + }, + { + "epoch": 2.2854748603351958, + "grad_norm": 0.6166595220565796, + "learning_rate": 0.000888375350140056, + "loss": 0.4679, + "step": 4091 + }, + { + "epoch": 2.2860335195530728, + "grad_norm": 0.7786232233047485, + "learning_rate": 0.0008883473389355743, + "loss": 0.4963, + "step": 4092 + }, + { + "epoch": 2.28659217877095, + "grad_norm": 0.4540756344795227, + "learning_rate": 0.0008883193277310925, + "loss": 0.3871, + "step": 4093 + }, + { + "epoch": 2.287150837988827, + "grad_norm": 0.827295184135437, + "learning_rate": 0.0008882913165266106, + "loss": 0.7639, + "step": 4094 + }, + { + "epoch": 2.287709497206704, + "grad_norm": 1.3360422849655151, + "learning_rate": 0.0008882633053221288, + "loss": 0.5245, + "step": 4095 + }, + { + "epoch": 2.288268156424581, + "grad_norm": 0.6237481832504272, + "learning_rate": 0.000888235294117647, + "loss": 0.5449, + "step": 4096 + }, + { + "epoch": 2.2888268156424583, + "grad_norm": 1.0694364309310913, + "learning_rate": 0.0008882072829131653, + "loss": 0.4615, + "step": 4097 + }, + { + "epoch": 2.2893854748603353, + "grad_norm": 0.47446396946907043, + "learning_rate": 0.0008881792717086836, + "loss": 0.4717, + "step": 4098 + }, + { + "epoch": 2.2899441340782123, + "grad_norm": 0.642112672328949, + "learning_rate": 0.0008881512605042017, + "loss": 0.5206, + "step": 4099 + }, + { + "epoch": 2.2905027932960893, + "grad_norm": 0.4892105162143707, + "learning_rate": 0.0008881232492997199, + "loss": 0.5134, + "step": 4100 + }, + { + "epoch": 2.2910614525139663, + "grad_norm": 2.5501768589019775, + "learning_rate": 0.0008880952380952381, + "loss": 0.4788, + "step": 4101 + }, + { + "epoch": 2.2916201117318438, + "grad_norm": 1.609928846359253, + "learning_rate": 0.0008880672268907564, + "loss": 0.3942, + "step": 4102 + }, + { + "epoch": 2.292178770949721, + "grad_norm": 0.5709579586982727, + "learning_rate": 0.0008880392156862746, + "loss": 0.4299, + "step": 4103 + }, + { + "epoch": 2.292737430167598, + "grad_norm": 0.5986993908882141, + "learning_rate": 0.0008880112044817927, + "loss": 0.4683, + "step": 4104 + }, + { + "epoch": 2.293296089385475, + "grad_norm": 0.8785414695739746, + "learning_rate": 0.0008879831932773109, + "loss": 0.4779, + "step": 4105 + }, + { + "epoch": 2.293854748603352, + "grad_norm": 0.4095707833766937, + "learning_rate": 0.0008879551820728291, + "loss": 0.4205, + "step": 4106 + }, + { + "epoch": 2.294413407821229, + "grad_norm": 0.5068609714508057, + "learning_rate": 0.0008879271708683474, + "loss": 0.5193, + "step": 4107 + }, + { + "epoch": 2.2949720670391063, + "grad_norm": 0.5796323418617249, + "learning_rate": 0.0008878991596638656, + "loss": 0.5022, + "step": 4108 + }, + { + "epoch": 2.2955307262569833, + "grad_norm": 0.6752324104309082, + "learning_rate": 0.0008878711484593838, + "loss": 0.4512, + "step": 4109 + }, + { + "epoch": 2.2960893854748603, + "grad_norm": 0.7865152359008789, + "learning_rate": 0.0008878431372549019, + "loss": 0.4861, + "step": 4110 + }, + { + "epoch": 2.2966480446927373, + "grad_norm": 0.5990186929702759, + "learning_rate": 0.0008878151260504201, + "loss": 0.5114, + "step": 4111 + }, + { + "epoch": 2.2972067039106143, + "grad_norm": 0.5680040121078491, + "learning_rate": 0.0008877871148459384, + "loss": 0.5215, + "step": 4112 + }, + { + "epoch": 2.2977653631284918, + "grad_norm": 0.5190915465354919, + "learning_rate": 0.0008877591036414566, + "loss": 0.5886, + "step": 4113 + }, + { + "epoch": 2.298324022346369, + "grad_norm": 0.8567480444908142, + "learning_rate": 0.0008877310924369749, + "loss": 0.3847, + "step": 4114 + }, + { + "epoch": 2.298882681564246, + "grad_norm": 0.7038180232048035, + "learning_rate": 0.000887703081232493, + "loss": 0.5703, + "step": 4115 + }, + { + "epoch": 2.299441340782123, + "grad_norm": 0.5825362801551819, + "learning_rate": 0.0008876750700280112, + "loss": 0.5479, + "step": 4116 + }, + { + "epoch": 2.3, + "grad_norm": 2.6037185192108154, + "learning_rate": 0.0008876470588235295, + "loss": 0.5473, + "step": 4117 + }, + { + "epoch": 2.3005586592178773, + "grad_norm": 0.8928390145301819, + "learning_rate": 0.0008876190476190477, + "loss": 0.5253, + "step": 4118 + }, + { + "epoch": 2.3011173184357543, + "grad_norm": 0.685375988483429, + "learning_rate": 0.0008875910364145659, + "loss": 0.5245, + "step": 4119 + }, + { + "epoch": 2.3016759776536313, + "grad_norm": 0.5561337471008301, + "learning_rate": 0.000887563025210084, + "loss": 0.4835, + "step": 4120 + }, + { + "epoch": 2.3022346368715083, + "grad_norm": 1.3214690685272217, + "learning_rate": 0.0008875350140056022, + "loss": 0.4703, + "step": 4121 + }, + { + "epoch": 2.3027932960893853, + "grad_norm": 1.1387779712677002, + "learning_rate": 0.0008875070028011205, + "loss": 0.4082, + "step": 4122 + }, + { + "epoch": 2.3033519553072628, + "grad_norm": 7.468307971954346, + "learning_rate": 0.0008874789915966387, + "loss": 0.398, + "step": 4123 + }, + { + "epoch": 2.30391061452514, + "grad_norm": 0.8910265564918518, + "learning_rate": 0.0008874509803921569, + "loss": 0.438, + "step": 4124 + }, + { + "epoch": 2.304469273743017, + "grad_norm": 0.8785117268562317, + "learning_rate": 0.0008874229691876751, + "loss": 0.5598, + "step": 4125 + }, + { + "epoch": 2.305027932960894, + "grad_norm": 0.6925243735313416, + "learning_rate": 0.0008873949579831932, + "loss": 0.4195, + "step": 4126 + }, + { + "epoch": 2.305586592178771, + "grad_norm": 1.0697098970413208, + "learning_rate": 0.0008873669467787115, + "loss": 0.6214, + "step": 4127 + }, + { + "epoch": 2.3061452513966483, + "grad_norm": 0.6937605738639832, + "learning_rate": 0.0008873389355742297, + "loss": 0.502, + "step": 4128 + }, + { + "epoch": 2.3067039106145253, + "grad_norm": 0.6571819186210632, + "learning_rate": 0.0008873109243697479, + "loss": 0.4924, + "step": 4129 + }, + { + "epoch": 2.3072625698324023, + "grad_norm": 0.5332193970680237, + "learning_rate": 0.0008872829131652661, + "loss": 0.4646, + "step": 4130 + }, + { + "epoch": 2.3078212290502793, + "grad_norm": 0.5302227139472961, + "learning_rate": 0.0008872549019607842, + "loss": 0.4596, + "step": 4131 + }, + { + "epoch": 2.3083798882681563, + "grad_norm": 2.9041993618011475, + "learning_rate": 0.0008872268907563026, + "loss": 0.352, + "step": 4132 + }, + { + "epoch": 2.3089385474860333, + "grad_norm": 0.6829684376716614, + "learning_rate": 0.0008871988795518208, + "loss": 0.6677, + "step": 4133 + }, + { + "epoch": 2.309497206703911, + "grad_norm": 0.7759153246879578, + "learning_rate": 0.000887170868347339, + "loss": 0.5889, + "step": 4134 + }, + { + "epoch": 2.310055865921788, + "grad_norm": 0.7212245464324951, + "learning_rate": 0.0008871428571428572, + "loss": 0.482, + "step": 4135 + }, + { + "epoch": 2.310614525139665, + "grad_norm": 0.9701433181762695, + "learning_rate": 0.0008871148459383753, + "loss": 0.3879, + "step": 4136 + }, + { + "epoch": 2.311173184357542, + "grad_norm": 0.6263357996940613, + "learning_rate": 0.0008870868347338936, + "loss": 0.4101, + "step": 4137 + }, + { + "epoch": 2.311731843575419, + "grad_norm": 0.5934965014457703, + "learning_rate": 0.0008870588235294118, + "loss": 0.5446, + "step": 4138 + }, + { + "epoch": 2.312290502793296, + "grad_norm": 0.5642522573471069, + "learning_rate": 0.00088703081232493, + "loss": 0.463, + "step": 4139 + }, + { + "epoch": 2.3128491620111733, + "grad_norm": 0.565039336681366, + "learning_rate": 0.0008870028011204482, + "loss": 0.5694, + "step": 4140 + }, + { + "epoch": 2.3134078212290503, + "grad_norm": 0.5565946102142334, + "learning_rate": 0.0008869747899159664, + "loss": 0.7868, + "step": 4141 + }, + { + "epoch": 2.3139664804469273, + "grad_norm": 1.1255923509597778, + "learning_rate": 0.0008869467787114846, + "loss": 0.5169, + "step": 4142 + }, + { + "epoch": 2.3145251396648043, + "grad_norm": 0.8964384198188782, + "learning_rate": 0.0008869187675070028, + "loss": 0.7309, + "step": 4143 + }, + { + "epoch": 2.3150837988826813, + "grad_norm": 0.6354342699050903, + "learning_rate": 0.000886890756302521, + "loss": 0.5366, + "step": 4144 + }, + { + "epoch": 2.315642458100559, + "grad_norm": 0.6353447437286377, + "learning_rate": 0.0008868627450980392, + "loss": 0.5252, + "step": 4145 + }, + { + "epoch": 2.316201117318436, + "grad_norm": 0.4865851402282715, + "learning_rate": 0.0008868347338935574, + "loss": 0.3709, + "step": 4146 + }, + { + "epoch": 2.316759776536313, + "grad_norm": 0.5829808712005615, + "learning_rate": 0.0008868067226890756, + "loss": 0.4164, + "step": 4147 + }, + { + "epoch": 2.31731843575419, + "grad_norm": 0.5432511568069458, + "learning_rate": 0.0008867787114845939, + "loss": 0.4828, + "step": 4148 + }, + { + "epoch": 2.317877094972067, + "grad_norm": 0.8548393249511719, + "learning_rate": 0.0008867507002801121, + "loss": 0.5463, + "step": 4149 + }, + { + "epoch": 2.3184357541899443, + "grad_norm": 0.4730907380580902, + "learning_rate": 0.0008867226890756303, + "loss": 0.4975, + "step": 4150 + }, + { + "epoch": 2.3189944134078213, + "grad_norm": 0.49459514021873474, + "learning_rate": 0.0008866946778711485, + "loss": 0.4229, + "step": 4151 + }, + { + "epoch": 2.3195530726256983, + "grad_norm": 0.4433779716491699, + "learning_rate": 0.0008866666666666667, + "loss": 0.4425, + "step": 4152 + }, + { + "epoch": 2.3201117318435753, + "grad_norm": 0.5054029822349548, + "learning_rate": 0.0008866386554621849, + "loss": 0.5313, + "step": 4153 + }, + { + "epoch": 2.3206703910614523, + "grad_norm": 0.5901767015457153, + "learning_rate": 0.0008866106442577031, + "loss": 0.5021, + "step": 4154 + }, + { + "epoch": 2.32122905027933, + "grad_norm": 0.4914945662021637, + "learning_rate": 0.0008865826330532213, + "loss": 0.5471, + "step": 4155 + }, + { + "epoch": 2.321787709497207, + "grad_norm": 1.8711297512054443, + "learning_rate": 0.0008865546218487395, + "loss": 0.4219, + "step": 4156 + }, + { + "epoch": 2.322346368715084, + "grad_norm": 1.6710267066955566, + "learning_rate": 0.0008865266106442578, + "loss": 0.4895, + "step": 4157 + }, + { + "epoch": 2.322905027932961, + "grad_norm": 0.5949874520301819, + "learning_rate": 0.0008864985994397759, + "loss": 0.4086, + "step": 4158 + }, + { + "epoch": 2.323463687150838, + "grad_norm": 2.5391275882720947, + "learning_rate": 0.0008864705882352941, + "loss": 0.5711, + "step": 4159 + }, + { + "epoch": 2.3240223463687153, + "grad_norm": 0.7851516604423523, + "learning_rate": 0.0008864425770308123, + "loss": 0.5719, + "step": 4160 + }, + { + "epoch": 2.3245810055865923, + "grad_norm": 1.2702527046203613, + "learning_rate": 0.0008864145658263305, + "loss": 0.4646, + "step": 4161 + }, + { + "epoch": 2.3251396648044693, + "grad_norm": 0.7813109159469604, + "learning_rate": 0.0008863865546218488, + "loss": 0.5545, + "step": 4162 + }, + { + "epoch": 2.3256983240223463, + "grad_norm": 0.49233749508857727, + "learning_rate": 0.0008863585434173669, + "loss": 0.4892, + "step": 4163 + }, + { + "epoch": 2.3262569832402233, + "grad_norm": 6.627612113952637, + "learning_rate": 0.0008863305322128852, + "loss": 0.5691, + "step": 4164 + }, + { + "epoch": 2.326815642458101, + "grad_norm": 0.8298326730728149, + "learning_rate": 0.0008863025210084034, + "loss": 0.4895, + "step": 4165 + }, + { + "epoch": 2.327374301675978, + "grad_norm": 0.5120519399642944, + "learning_rate": 0.0008862745098039216, + "loss": 0.4642, + "step": 4166 + }, + { + "epoch": 2.327932960893855, + "grad_norm": 0.48763808608055115, + "learning_rate": 0.0008862464985994399, + "loss": 0.386, + "step": 4167 + }, + { + "epoch": 2.328491620111732, + "grad_norm": 0.706680953502655, + "learning_rate": 0.000886218487394958, + "loss": 0.3697, + "step": 4168 + }, + { + "epoch": 2.329050279329609, + "grad_norm": 0.6681955456733704, + "learning_rate": 0.0008861904761904762, + "loss": 0.5167, + "step": 4169 + }, + { + "epoch": 2.329608938547486, + "grad_norm": 0.5295430421829224, + "learning_rate": 0.0008861624649859944, + "loss": 0.5733, + "step": 4170 + }, + { + "epoch": 2.3301675977653633, + "grad_norm": 0.5398170351982117, + "learning_rate": 0.0008861344537815126, + "loss": 0.5139, + "step": 4171 + }, + { + "epoch": 2.3307262569832403, + "grad_norm": 0.5111408233642578, + "learning_rate": 0.0008861064425770309, + "loss": 0.4717, + "step": 4172 + }, + { + "epoch": 2.3312849162011173, + "grad_norm": 0.5647570490837097, + "learning_rate": 0.0008860784313725491, + "loss": 0.4832, + "step": 4173 + }, + { + "epoch": 2.3318435754189943, + "grad_norm": 0.4562356472015381, + "learning_rate": 0.0008860504201680672, + "loss": 0.4556, + "step": 4174 + }, + { + "epoch": 2.3324022346368714, + "grad_norm": 0.5109476447105408, + "learning_rate": 0.0008860224089635854, + "loss": 0.4676, + "step": 4175 + }, + { + "epoch": 2.3329608938547484, + "grad_norm": 0.45182135701179504, + "learning_rate": 0.0008859943977591036, + "loss": 0.4014, + "step": 4176 + }, + { + "epoch": 2.333519553072626, + "grad_norm": 0.6884251832962036, + "learning_rate": 0.0008859663865546219, + "loss": 0.4297, + "step": 4177 + }, + { + "epoch": 2.334078212290503, + "grad_norm": 0.4124736189842224, + "learning_rate": 0.0008859383753501401, + "loss": 0.3958, + "step": 4178 + }, + { + "epoch": 2.33463687150838, + "grad_norm": 0.45792844891548157, + "learning_rate": 0.0008859103641456582, + "loss": 0.4592, + "step": 4179 + }, + { + "epoch": 2.335195530726257, + "grad_norm": 0.4927826523780823, + "learning_rate": 0.0008858823529411764, + "loss": 0.3621, + "step": 4180 + }, + { + "epoch": 2.335754189944134, + "grad_norm": 0.4476109445095062, + "learning_rate": 0.0008858543417366947, + "loss": 0.4068, + "step": 4181 + }, + { + "epoch": 2.3363128491620113, + "grad_norm": 0.8651317358016968, + "learning_rate": 0.000885826330532213, + "loss": 0.5339, + "step": 4182 + }, + { + "epoch": 2.3368715083798883, + "grad_norm": 0.5524964332580566, + "learning_rate": 0.0008857983193277312, + "loss": 0.5092, + "step": 4183 + }, + { + "epoch": 2.3374301675977653, + "grad_norm": 0.4942110478878021, + "learning_rate": 0.0008857703081232493, + "loss": 0.3719, + "step": 4184 + }, + { + "epoch": 2.3379888268156424, + "grad_norm": 1.1364790201187134, + "learning_rate": 0.0008857422969187675, + "loss": 0.4955, + "step": 4185 + }, + { + "epoch": 2.3385474860335194, + "grad_norm": 0.45928874611854553, + "learning_rate": 0.0008857142857142857, + "loss": 0.512, + "step": 4186 + }, + { + "epoch": 2.339106145251397, + "grad_norm": 0.49775776267051697, + "learning_rate": 0.000885686274509804, + "loss": 0.5159, + "step": 4187 + }, + { + "epoch": 2.339664804469274, + "grad_norm": 0.598150908946991, + "learning_rate": 0.0008856582633053222, + "loss": 0.3664, + "step": 4188 + }, + { + "epoch": 2.340223463687151, + "grad_norm": 1.0361453294754028, + "learning_rate": 0.0008856302521008404, + "loss": 0.4969, + "step": 4189 + }, + { + "epoch": 2.340782122905028, + "grad_norm": 0.5566352009773254, + "learning_rate": 0.0008856022408963585, + "loss": 0.5234, + "step": 4190 + }, + { + "epoch": 2.341340782122905, + "grad_norm": 0.5165764689445496, + "learning_rate": 0.0008855742296918767, + "loss": 0.4647, + "step": 4191 + }, + { + "epoch": 2.3418994413407823, + "grad_norm": 2.389528751373291, + "learning_rate": 0.000885546218487395, + "loss": 0.4856, + "step": 4192 + }, + { + "epoch": 2.3424581005586593, + "grad_norm": 0.8305392265319824, + "learning_rate": 0.0008855182072829132, + "loss": 0.4363, + "step": 4193 + }, + { + "epoch": 2.3430167597765363, + "grad_norm": 0.5118175745010376, + "learning_rate": 0.0008854901960784314, + "loss": 0.5227, + "step": 4194 + }, + { + "epoch": 2.3435754189944134, + "grad_norm": 0.5471662282943726, + "learning_rate": 0.0008854621848739495, + "loss": 0.4773, + "step": 4195 + }, + { + "epoch": 2.3441340782122904, + "grad_norm": 0.8572961091995239, + "learning_rate": 0.0008854341736694677, + "loss": 0.5489, + "step": 4196 + }, + { + "epoch": 2.344692737430168, + "grad_norm": 0.6659944653511047, + "learning_rate": 0.0008854061624649861, + "loss": 0.5261, + "step": 4197 + }, + { + "epoch": 2.345251396648045, + "grad_norm": 0.5566151738166809, + "learning_rate": 0.0008853781512605043, + "loss": 0.516, + "step": 4198 + }, + { + "epoch": 2.345810055865922, + "grad_norm": 0.531288206577301, + "learning_rate": 0.0008853501400560225, + "loss": 0.4466, + "step": 4199 + }, + { + "epoch": 2.346368715083799, + "grad_norm": 0.6818004250526428, + "learning_rate": 0.0008853221288515406, + "loss": 0.4464, + "step": 4200 + }, + { + "epoch": 2.346927374301676, + "grad_norm": 0.7107765674591064, + "learning_rate": 0.0008852941176470588, + "loss": 0.5624, + "step": 4201 + }, + { + "epoch": 2.3474860335195533, + "grad_norm": 0.5839899778366089, + "learning_rate": 0.0008852661064425771, + "loss": 0.4085, + "step": 4202 + }, + { + "epoch": 2.3480446927374303, + "grad_norm": 0.6199565529823303, + "learning_rate": 0.0008852380952380953, + "loss": 0.6021, + "step": 4203 + }, + { + "epoch": 2.3486033519553073, + "grad_norm": 0.5469422936439514, + "learning_rate": 0.0008852100840336135, + "loss": 0.4661, + "step": 4204 + }, + { + "epoch": 2.3491620111731844, + "grad_norm": 0.9261488318443298, + "learning_rate": 0.0008851820728291317, + "loss": 0.464, + "step": 4205 + }, + { + "epoch": 2.3497206703910614, + "grad_norm": 0.5453421473503113, + "learning_rate": 0.0008851540616246498, + "loss": 0.3775, + "step": 4206 + }, + { + "epoch": 2.3502793296089384, + "grad_norm": 0.6501973271369934, + "learning_rate": 0.0008851260504201681, + "loss": 0.4996, + "step": 4207 + }, + { + "epoch": 2.350837988826816, + "grad_norm": 0.6161912679672241, + "learning_rate": 0.0008850980392156863, + "loss": 0.4738, + "step": 4208 + }, + { + "epoch": 2.351396648044693, + "grad_norm": 0.5534847974777222, + "learning_rate": 0.0008850700280112045, + "loss": 0.4977, + "step": 4209 + }, + { + "epoch": 2.35195530726257, + "grad_norm": 0.9877995252609253, + "learning_rate": 0.0008850420168067227, + "loss": 0.5588, + "step": 4210 + }, + { + "epoch": 2.352513966480447, + "grad_norm": 0.6785343289375305, + "learning_rate": 0.0008850140056022408, + "loss": 0.5344, + "step": 4211 + }, + { + "epoch": 2.353072625698324, + "grad_norm": 0.5983065962791443, + "learning_rate": 0.0008849859943977591, + "loss": 0.5827, + "step": 4212 + }, + { + "epoch": 2.353631284916201, + "grad_norm": 0.4906616508960724, + "learning_rate": 0.0008849579831932774, + "loss": 0.5322, + "step": 4213 + }, + { + "epoch": 2.3541899441340783, + "grad_norm": 0.5360569953918457, + "learning_rate": 0.0008849299719887956, + "loss": 0.4714, + "step": 4214 + }, + { + "epoch": 2.3547486033519553, + "grad_norm": 3.384766101837158, + "learning_rate": 0.0008849019607843138, + "loss": 0.522, + "step": 4215 + }, + { + "epoch": 2.3553072625698324, + "grad_norm": 0.7306132316589355, + "learning_rate": 0.0008848739495798319, + "loss": 0.4352, + "step": 4216 + }, + { + "epoch": 2.3558659217877094, + "grad_norm": 0.46426141262054443, + "learning_rate": 0.0008848459383753502, + "loss": 0.4176, + "step": 4217 + }, + { + "epoch": 2.3564245810055864, + "grad_norm": 0.4560985565185547, + "learning_rate": 0.0008848179271708684, + "loss": 0.4916, + "step": 4218 + }, + { + "epoch": 2.356983240223464, + "grad_norm": 0.6996239423751831, + "learning_rate": 0.0008847899159663866, + "loss": 0.5451, + "step": 4219 + }, + { + "epoch": 2.357541899441341, + "grad_norm": 0.3714365065097809, + "learning_rate": 0.0008847619047619048, + "loss": 0.4146, + "step": 4220 + }, + { + "epoch": 2.358100558659218, + "grad_norm": 0.7201758027076721, + "learning_rate": 0.000884733893557423, + "loss": 0.5827, + "step": 4221 + }, + { + "epoch": 2.358659217877095, + "grad_norm": 0.7988872528076172, + "learning_rate": 0.0008847058823529412, + "loss": 0.6139, + "step": 4222 + }, + { + "epoch": 2.359217877094972, + "grad_norm": 0.433135062456131, + "learning_rate": 0.0008846778711484594, + "loss": 0.4678, + "step": 4223 + }, + { + "epoch": 2.3597765363128493, + "grad_norm": 0.5376521348953247, + "learning_rate": 0.0008846498599439776, + "loss": 0.5302, + "step": 4224 + }, + { + "epoch": 2.3603351955307263, + "grad_norm": 0.6785982251167297, + "learning_rate": 0.0008846218487394958, + "loss": 0.5146, + "step": 4225 + }, + { + "epoch": 2.3608938547486034, + "grad_norm": 0.6107456088066101, + "learning_rate": 0.000884593837535014, + "loss": 0.383, + "step": 4226 + }, + { + "epoch": 2.3614525139664804, + "grad_norm": 0.7669987082481384, + "learning_rate": 0.0008845658263305322, + "loss": 0.5966, + "step": 4227 + }, + { + "epoch": 2.3620111731843574, + "grad_norm": 0.5143811702728271, + "learning_rate": 0.0008845378151260504, + "loss": 0.4405, + "step": 4228 + }, + { + "epoch": 2.362569832402235, + "grad_norm": 0.6507100462913513, + "learning_rate": 0.0008845098039215686, + "loss": 0.5834, + "step": 4229 + }, + { + "epoch": 2.363128491620112, + "grad_norm": 0.7228155136108398, + "learning_rate": 0.0008844817927170869, + "loss": 0.4828, + "step": 4230 + }, + { + "epoch": 2.363687150837989, + "grad_norm": 0.7891553640365601, + "learning_rate": 0.0008844537815126051, + "loss": 0.4341, + "step": 4231 + }, + { + "epoch": 2.364245810055866, + "grad_norm": 0.8317373991012573, + "learning_rate": 0.0008844257703081234, + "loss": 0.3927, + "step": 4232 + }, + { + "epoch": 2.364804469273743, + "grad_norm": 0.5360849499702454, + "learning_rate": 0.0008843977591036415, + "loss": 0.3607, + "step": 4233 + }, + { + "epoch": 2.3653631284916203, + "grad_norm": 0.4699600338935852, + "learning_rate": 0.0008843697478991597, + "loss": 0.4963, + "step": 4234 + }, + { + "epoch": 2.3659217877094973, + "grad_norm": 0.48334765434265137, + "learning_rate": 0.0008843417366946779, + "loss": 0.3855, + "step": 4235 + }, + { + "epoch": 2.3664804469273744, + "grad_norm": 0.40742337703704834, + "learning_rate": 0.0008843137254901961, + "loss": 0.4161, + "step": 4236 + }, + { + "epoch": 2.3670391061452514, + "grad_norm": 0.47621408104896545, + "learning_rate": 0.0008842857142857143, + "loss": 0.4672, + "step": 4237 + }, + { + "epoch": 2.3675977653631284, + "grad_norm": 0.7323170900344849, + "learning_rate": 0.0008842577030812325, + "loss": 0.5033, + "step": 4238 + }, + { + "epoch": 2.3681564245810054, + "grad_norm": 0.4447155296802521, + "learning_rate": 0.0008842296918767507, + "loss": 0.339, + "step": 4239 + }, + { + "epoch": 2.368715083798883, + "grad_norm": 0.6605436205863953, + "learning_rate": 0.0008842016806722689, + "loss": 0.3911, + "step": 4240 + }, + { + "epoch": 2.36927374301676, + "grad_norm": 1.070167064666748, + "learning_rate": 0.0008841736694677871, + "loss": 0.6765, + "step": 4241 + }, + { + "epoch": 2.369832402234637, + "grad_norm": 0.5497492551803589, + "learning_rate": 0.0008841456582633053, + "loss": 0.4693, + "step": 4242 + }, + { + "epoch": 2.370391061452514, + "grad_norm": 0.47830671072006226, + "learning_rate": 0.0008841176470588235, + "loss": 0.459, + "step": 4243 + }, + { + "epoch": 2.370949720670391, + "grad_norm": 0.4435995817184448, + "learning_rate": 0.0008840896358543417, + "loss": 0.436, + "step": 4244 + }, + { + "epoch": 2.3715083798882683, + "grad_norm": 0.9786117076873779, + "learning_rate": 0.0008840616246498599, + "loss": 0.5611, + "step": 4245 + }, + { + "epoch": 2.3720670391061454, + "grad_norm": 0.6978351473808289, + "learning_rate": 0.0008840336134453782, + "loss": 0.4574, + "step": 4246 + }, + { + "epoch": 2.3726256983240224, + "grad_norm": 0.8869118690490723, + "learning_rate": 0.0008840056022408964, + "loss": 0.4586, + "step": 4247 + }, + { + "epoch": 2.3731843575418994, + "grad_norm": 1.659661054611206, + "learning_rate": 0.0008839775910364147, + "loss": 0.4293, + "step": 4248 + }, + { + "epoch": 2.3737430167597764, + "grad_norm": 1.223410725593567, + "learning_rate": 0.0008839495798319328, + "loss": 0.5203, + "step": 4249 + }, + { + "epoch": 2.3743016759776534, + "grad_norm": 0.7573944330215454, + "learning_rate": 0.000883921568627451, + "loss": 0.4359, + "step": 4250 + }, + { + "epoch": 2.374860335195531, + "grad_norm": 0.5053035020828247, + "learning_rate": 0.0008838935574229692, + "loss": 0.4868, + "step": 4251 + }, + { + "epoch": 2.375418994413408, + "grad_norm": 0.4673672616481781, + "learning_rate": 0.0008838655462184874, + "loss": 0.4322, + "step": 4252 + }, + { + "epoch": 2.375977653631285, + "grad_norm": 0.44579166173934937, + "learning_rate": 0.0008838375350140057, + "loss": 0.3557, + "step": 4253 + }, + { + "epoch": 2.376536312849162, + "grad_norm": 0.46582189202308655, + "learning_rate": 0.0008838095238095238, + "loss": 0.4105, + "step": 4254 + }, + { + "epoch": 2.377094972067039, + "grad_norm": 0.44945183396339417, + "learning_rate": 0.000883781512605042, + "loss": 0.43, + "step": 4255 + }, + { + "epoch": 2.3776536312849164, + "grad_norm": 10.617576599121094, + "learning_rate": 0.0008837535014005602, + "loss": 0.4773, + "step": 4256 + }, + { + "epoch": 2.3782122905027934, + "grad_norm": 0.6170254349708557, + "learning_rate": 0.0008837254901960784, + "loss": 0.5113, + "step": 4257 + }, + { + "epoch": 2.3787709497206704, + "grad_norm": 0.5523990392684937, + "learning_rate": 0.0008836974789915967, + "loss": 0.543, + "step": 4258 + }, + { + "epoch": 2.3793296089385474, + "grad_norm": 1.5440400838851929, + "learning_rate": 0.0008836694677871148, + "loss": 0.4009, + "step": 4259 + }, + { + "epoch": 2.3798882681564244, + "grad_norm": 0.7903233170509338, + "learning_rate": 0.000883641456582633, + "loss": 0.5596, + "step": 4260 + }, + { + "epoch": 2.380446927374302, + "grad_norm": 0.9833642840385437, + "learning_rate": 0.0008836134453781512, + "loss": 0.4112, + "step": 4261 + }, + { + "epoch": 2.381005586592179, + "grad_norm": 0.7449160814285278, + "learning_rate": 0.0008835854341736694, + "loss": 0.5871, + "step": 4262 + }, + { + "epoch": 2.381564245810056, + "grad_norm": 0.7182409167289734, + "learning_rate": 0.0008835574229691878, + "loss": 0.5365, + "step": 4263 + }, + { + "epoch": 2.382122905027933, + "grad_norm": 1.0733376741409302, + "learning_rate": 0.000883529411764706, + "loss": 0.3719, + "step": 4264 + }, + { + "epoch": 2.38268156424581, + "grad_norm": 0.48038485646247864, + "learning_rate": 0.0008835014005602241, + "loss": 0.4889, + "step": 4265 + }, + { + "epoch": 2.3832402234636874, + "grad_norm": 0.5214555263519287, + "learning_rate": 0.0008834733893557423, + "loss": 0.5317, + "step": 4266 + }, + { + "epoch": 2.3837988826815644, + "grad_norm": 0.6974785327911377, + "learning_rate": 0.0008834453781512605, + "loss": 0.5347, + "step": 4267 + }, + { + "epoch": 2.3843575418994414, + "grad_norm": 0.46163174510002136, + "learning_rate": 0.0008834173669467788, + "loss": 0.5655, + "step": 4268 + }, + { + "epoch": 2.3849162011173184, + "grad_norm": 0.5612812638282776, + "learning_rate": 0.000883389355742297, + "loss": 0.5412, + "step": 4269 + }, + { + "epoch": 2.3854748603351954, + "grad_norm": 0.5364530682563782, + "learning_rate": 0.0008833613445378151, + "loss": 0.3914, + "step": 4270 + }, + { + "epoch": 2.386033519553073, + "grad_norm": 0.5268359184265137, + "learning_rate": 0.0008833333333333333, + "loss": 0.4727, + "step": 4271 + }, + { + "epoch": 2.38659217877095, + "grad_norm": 0.5146215558052063, + "learning_rate": 0.0008833053221288515, + "loss": 0.5487, + "step": 4272 + }, + { + "epoch": 2.387150837988827, + "grad_norm": 0.4384045898914337, + "learning_rate": 0.0008832773109243698, + "loss": 0.3752, + "step": 4273 + }, + { + "epoch": 2.387709497206704, + "grad_norm": 0.6137128472328186, + "learning_rate": 0.000883249299719888, + "loss": 0.4412, + "step": 4274 + }, + { + "epoch": 2.388268156424581, + "grad_norm": 0.5418224334716797, + "learning_rate": 0.0008832212885154061, + "loss": 0.4486, + "step": 4275 + }, + { + "epoch": 2.388826815642458, + "grad_norm": 0.544743001461029, + "learning_rate": 0.0008831932773109243, + "loss": 0.4664, + "step": 4276 + }, + { + "epoch": 2.3893854748603354, + "grad_norm": 0.39321839809417725, + "learning_rate": 0.0008831652661064425, + "loss": 0.3911, + "step": 4277 + }, + { + "epoch": 2.3899441340782124, + "grad_norm": 0.8188342452049255, + "learning_rate": 0.0008831372549019609, + "loss": 0.4849, + "step": 4278 + }, + { + "epoch": 2.3905027932960894, + "grad_norm": 0.7178171277046204, + "learning_rate": 0.0008831092436974791, + "loss": 0.5732, + "step": 4279 + }, + { + "epoch": 2.3910614525139664, + "grad_norm": 0.5431494116783142, + "learning_rate": 0.0008830812324929973, + "loss": 0.5017, + "step": 4280 + }, + { + "epoch": 2.3916201117318434, + "grad_norm": 0.6249786019325256, + "learning_rate": 0.0008830532212885154, + "loss": 0.4637, + "step": 4281 + }, + { + "epoch": 2.3921787709497204, + "grad_norm": 0.5498879551887512, + "learning_rate": 0.0008830252100840336, + "loss": 0.4105, + "step": 4282 + }, + { + "epoch": 2.392737430167598, + "grad_norm": 0.4963003695011139, + "learning_rate": 0.0008829971988795519, + "loss": 0.399, + "step": 4283 + }, + { + "epoch": 2.393296089385475, + "grad_norm": 1.0586880445480347, + "learning_rate": 0.0008829691876750701, + "loss": 0.3949, + "step": 4284 + }, + { + "epoch": 2.393854748603352, + "grad_norm": 0.433883398771286, + "learning_rate": 0.0008829411764705883, + "loss": 0.47, + "step": 4285 + }, + { + "epoch": 2.394413407821229, + "grad_norm": 0.5674890875816345, + "learning_rate": 0.0008829131652661064, + "loss": 0.4918, + "step": 4286 + }, + { + "epoch": 2.394972067039106, + "grad_norm": 0.48947373032569885, + "learning_rate": 0.0008828851540616246, + "loss": 0.5143, + "step": 4287 + }, + { + "epoch": 2.3955307262569834, + "grad_norm": 0.513671338558197, + "learning_rate": 0.0008828571428571429, + "loss": 0.4568, + "step": 4288 + }, + { + "epoch": 2.3960893854748604, + "grad_norm": 0.7631064057350159, + "learning_rate": 0.0008828291316526611, + "loss": 0.7101, + "step": 4289 + }, + { + "epoch": 2.3966480446927374, + "grad_norm": 0.8110824227333069, + "learning_rate": 0.0008828011204481793, + "loss": 0.6534, + "step": 4290 + }, + { + "epoch": 2.3972067039106144, + "grad_norm": 1.466528058052063, + "learning_rate": 0.0008827731092436974, + "loss": 0.4679, + "step": 4291 + }, + { + "epoch": 2.3977653631284914, + "grad_norm": 2.621582269668579, + "learning_rate": 0.0008827450980392156, + "loss": 0.4822, + "step": 4292 + }, + { + "epoch": 2.398324022346369, + "grad_norm": 0.5260323882102966, + "learning_rate": 0.0008827170868347339, + "loss": 0.6171, + "step": 4293 + }, + { + "epoch": 2.398882681564246, + "grad_norm": 0.4889635741710663, + "learning_rate": 0.0008826890756302521, + "loss": 0.621, + "step": 4294 + }, + { + "epoch": 2.399441340782123, + "grad_norm": 0.8604654669761658, + "learning_rate": 0.0008826610644257704, + "loss": 0.4936, + "step": 4295 + }, + { + "epoch": 2.4, + "grad_norm": 25.43321990966797, + "learning_rate": 0.0008826330532212886, + "loss": 0.4807, + "step": 4296 + }, + { + "epoch": 2.400558659217877, + "grad_norm": 0.6410494446754456, + "learning_rate": 0.0008826050420168067, + "loss": 0.5789, + "step": 4297 + }, + { + "epoch": 2.4011173184357544, + "grad_norm": 1.1895076036453247, + "learning_rate": 0.000882577030812325, + "loss": 0.4906, + "step": 4298 + }, + { + "epoch": 2.4016759776536314, + "grad_norm": 0.6382520198822021, + "learning_rate": 0.0008825490196078432, + "loss": 0.5272, + "step": 4299 + }, + { + "epoch": 2.4022346368715084, + "grad_norm": 1.1108702421188354, + "learning_rate": 0.0008825210084033614, + "loss": 0.3921, + "step": 4300 + }, + { + "epoch": 2.4027932960893854, + "grad_norm": 0.46631813049316406, + "learning_rate": 0.0008824929971988796, + "loss": 0.499, + "step": 4301 + }, + { + "epoch": 2.4033519553072624, + "grad_norm": 0.5861110091209412, + "learning_rate": 0.0008824649859943977, + "loss": 0.4722, + "step": 4302 + }, + { + "epoch": 2.40391061452514, + "grad_norm": 0.4181196987628937, + "learning_rate": 0.000882436974789916, + "loss": 0.4135, + "step": 4303 + }, + { + "epoch": 2.404469273743017, + "grad_norm": 0.49521398544311523, + "learning_rate": 0.0008824089635854342, + "loss": 0.4804, + "step": 4304 + }, + { + "epoch": 2.405027932960894, + "grad_norm": 0.7357967495918274, + "learning_rate": 0.0008823809523809524, + "loss": 0.5264, + "step": 4305 + }, + { + "epoch": 2.405586592178771, + "grad_norm": 0.44721147418022156, + "learning_rate": 0.0008823529411764706, + "loss": 0.481, + "step": 4306 + }, + { + "epoch": 2.406145251396648, + "grad_norm": 0.49752721190452576, + "learning_rate": 0.0008823249299719887, + "loss": 0.464, + "step": 4307 + }, + { + "epoch": 2.4067039106145254, + "grad_norm": 0.4879881739616394, + "learning_rate": 0.000882296918767507, + "loss": 0.4486, + "step": 4308 + }, + { + "epoch": 2.4072625698324024, + "grad_norm": 0.4629369378089905, + "learning_rate": 0.0008822689075630252, + "loss": 0.4524, + "step": 4309 + }, + { + "epoch": 2.4078212290502794, + "grad_norm": 1.4756484031677246, + "learning_rate": 0.0008822408963585434, + "loss": 0.5583, + "step": 4310 + }, + { + "epoch": 2.4083798882681564, + "grad_norm": 0.5566049814224243, + "learning_rate": 0.0008822128851540616, + "loss": 0.4512, + "step": 4311 + }, + { + "epoch": 2.4089385474860334, + "grad_norm": 0.6636320352554321, + "learning_rate": 0.0008821848739495799, + "loss": 0.5222, + "step": 4312 + }, + { + "epoch": 2.4094972067039104, + "grad_norm": 0.7487527132034302, + "learning_rate": 0.0008821568627450981, + "loss": 0.521, + "step": 4313 + }, + { + "epoch": 2.410055865921788, + "grad_norm": 0.7668407559394836, + "learning_rate": 0.0008821288515406163, + "loss": 0.5283, + "step": 4314 + }, + { + "epoch": 2.410614525139665, + "grad_norm": 0.3995644152164459, + "learning_rate": 0.0008821008403361345, + "loss": 0.4354, + "step": 4315 + }, + { + "epoch": 2.411173184357542, + "grad_norm": 0.5607393980026245, + "learning_rate": 0.0008820728291316527, + "loss": 0.5221, + "step": 4316 + }, + { + "epoch": 2.411731843575419, + "grad_norm": 0.49787914752960205, + "learning_rate": 0.0008820448179271709, + "loss": 0.5637, + "step": 4317 + }, + { + "epoch": 2.412290502793296, + "grad_norm": 0.5838570594787598, + "learning_rate": 0.0008820168067226891, + "loss": 0.4391, + "step": 4318 + }, + { + "epoch": 2.412849162011173, + "grad_norm": 0.5224358439445496, + "learning_rate": 0.0008819887955182073, + "loss": 0.4389, + "step": 4319 + }, + { + "epoch": 2.4134078212290504, + "grad_norm": 0.5662122964859009, + "learning_rate": 0.0008819607843137255, + "loss": 0.4244, + "step": 4320 + }, + { + "epoch": 2.4139664804469274, + "grad_norm": 0.6424004435539246, + "learning_rate": 0.0008819327731092437, + "loss": 0.4881, + "step": 4321 + }, + { + "epoch": 2.4145251396648044, + "grad_norm": 0.7683541178703308, + "learning_rate": 0.0008819047619047619, + "loss": 0.3957, + "step": 4322 + }, + { + "epoch": 2.4150837988826814, + "grad_norm": 0.4299624562263489, + "learning_rate": 0.0008818767507002801, + "loss": 0.3726, + "step": 4323 + }, + { + "epoch": 2.4156424581005584, + "grad_norm": 0.91096031665802, + "learning_rate": 0.0008818487394957983, + "loss": 0.5559, + "step": 4324 + }, + { + "epoch": 2.416201117318436, + "grad_norm": 0.7092557549476624, + "learning_rate": 0.0008818207282913165, + "loss": 0.4296, + "step": 4325 + }, + { + "epoch": 2.416759776536313, + "grad_norm": 4.360379219055176, + "learning_rate": 0.0008817927170868347, + "loss": 0.4725, + "step": 4326 + }, + { + "epoch": 2.41731843575419, + "grad_norm": 0.9486434459686279, + "learning_rate": 0.0008817647058823529, + "loss": 0.5032, + "step": 4327 + }, + { + "epoch": 2.417877094972067, + "grad_norm": 0.48562589287757874, + "learning_rate": 0.0008817366946778713, + "loss": 0.4271, + "step": 4328 + }, + { + "epoch": 2.418435754189944, + "grad_norm": 0.5344672799110413, + "learning_rate": 0.0008817086834733894, + "loss": 0.4067, + "step": 4329 + }, + { + "epoch": 2.4189944134078214, + "grad_norm": 2.026885509490967, + "learning_rate": 0.0008816806722689076, + "loss": 0.5047, + "step": 4330 + }, + { + "epoch": 2.4195530726256984, + "grad_norm": 0.49559345841407776, + "learning_rate": 0.0008816526610644258, + "loss": 0.4767, + "step": 4331 + }, + { + "epoch": 2.4201117318435754, + "grad_norm": 1.3192914724349976, + "learning_rate": 0.000881624649859944, + "loss": 0.483, + "step": 4332 + }, + { + "epoch": 2.4206703910614524, + "grad_norm": 0.5153898596763611, + "learning_rate": 0.0008815966386554623, + "loss": 0.4174, + "step": 4333 + }, + { + "epoch": 2.4212290502793294, + "grad_norm": 0.4890030324459076, + "learning_rate": 0.0008815686274509804, + "loss": 0.4532, + "step": 4334 + }, + { + "epoch": 2.421787709497207, + "grad_norm": 0.4998883306980133, + "learning_rate": 0.0008815406162464986, + "loss": 0.5379, + "step": 4335 + }, + { + "epoch": 2.422346368715084, + "grad_norm": 6.456830024719238, + "learning_rate": 0.0008815126050420168, + "loss": 0.4178, + "step": 4336 + }, + { + "epoch": 2.422905027932961, + "grad_norm": 0.7396804094314575, + "learning_rate": 0.000881484593837535, + "loss": 0.5235, + "step": 4337 + }, + { + "epoch": 2.423463687150838, + "grad_norm": 0.5532344579696655, + "learning_rate": 0.0008814565826330533, + "loss": 0.5341, + "step": 4338 + }, + { + "epoch": 2.424022346368715, + "grad_norm": 0.5099005103111267, + "learning_rate": 0.0008814285714285714, + "loss": 0.4806, + "step": 4339 + }, + { + "epoch": 2.4245810055865924, + "grad_norm": 0.6905778646469116, + "learning_rate": 0.0008814005602240896, + "loss": 0.5037, + "step": 4340 + }, + { + "epoch": 2.4251396648044694, + "grad_norm": 0.5489441752433777, + "learning_rate": 0.0008813725490196078, + "loss": 0.4945, + "step": 4341 + }, + { + "epoch": 2.4256983240223464, + "grad_norm": 0.5421143174171448, + "learning_rate": 0.000881344537815126, + "loss": 0.6252, + "step": 4342 + }, + { + "epoch": 2.4262569832402234, + "grad_norm": 0.41108086705207825, + "learning_rate": 0.0008813165266106443, + "loss": 0.4741, + "step": 4343 + }, + { + "epoch": 2.4268156424581004, + "grad_norm": 0.5088945031166077, + "learning_rate": 0.0008812885154061626, + "loss": 0.4808, + "step": 4344 + }, + { + "epoch": 2.427374301675978, + "grad_norm": 0.5964069962501526, + "learning_rate": 0.0008812605042016807, + "loss": 0.4156, + "step": 4345 + }, + { + "epoch": 2.427932960893855, + "grad_norm": 0.5038523077964783, + "learning_rate": 0.0008812324929971989, + "loss": 0.4349, + "step": 4346 + }, + { + "epoch": 2.428491620111732, + "grad_norm": 3.671360969543457, + "learning_rate": 0.0008812044817927171, + "loss": 0.4267, + "step": 4347 + }, + { + "epoch": 2.429050279329609, + "grad_norm": 0.5666340589523315, + "learning_rate": 0.0008811764705882354, + "loss": 0.4614, + "step": 4348 + }, + { + "epoch": 2.429608938547486, + "grad_norm": 1.0279804468154907, + "learning_rate": 0.0008811484593837536, + "loss": 0.6051, + "step": 4349 + }, + { + "epoch": 2.430167597765363, + "grad_norm": 0.6092056632041931, + "learning_rate": 0.0008811204481792717, + "loss": 0.5395, + "step": 4350 + }, + { + "epoch": 2.4307262569832404, + "grad_norm": 1.9469459056854248, + "learning_rate": 0.0008810924369747899, + "loss": 0.5241, + "step": 4351 + }, + { + "epoch": 2.4312849162011174, + "grad_norm": 0.5555824637413025, + "learning_rate": 0.0008810644257703081, + "loss": 0.4475, + "step": 4352 + }, + { + "epoch": 2.4318435754189944, + "grad_norm": 0.6190809607505798, + "learning_rate": 0.0008810364145658264, + "loss": 0.6339, + "step": 4353 + }, + { + "epoch": 2.4324022346368714, + "grad_norm": 0.5151634216308594, + "learning_rate": 0.0008810084033613446, + "loss": 0.5336, + "step": 4354 + }, + { + "epoch": 2.4329608938547485, + "grad_norm": 0.5393658876419067, + "learning_rate": 0.0008809803921568627, + "loss": 0.4911, + "step": 4355 + }, + { + "epoch": 2.4335195530726255, + "grad_norm": 0.41556069254875183, + "learning_rate": 0.0008809523809523809, + "loss": 0.3815, + "step": 4356 + }, + { + "epoch": 2.434078212290503, + "grad_norm": 0.4446249008178711, + "learning_rate": 0.0008809243697478991, + "loss": 0.4646, + "step": 4357 + }, + { + "epoch": 2.43463687150838, + "grad_norm": 0.4504421055316925, + "learning_rate": 0.0008808963585434174, + "loss": 0.4485, + "step": 4358 + }, + { + "epoch": 2.435195530726257, + "grad_norm": 0.5532277226448059, + "learning_rate": 0.0008808683473389356, + "loss": 0.4241, + "step": 4359 + }, + { + "epoch": 2.435754189944134, + "grad_norm": 0.5600268840789795, + "learning_rate": 0.0008808403361344539, + "loss": 0.4679, + "step": 4360 + }, + { + "epoch": 2.436312849162011, + "grad_norm": 0.47818443179130554, + "learning_rate": 0.000880812324929972, + "loss": 0.5175, + "step": 4361 + }, + { + "epoch": 2.4368715083798884, + "grad_norm": 0.4332275092601776, + "learning_rate": 0.0008807843137254902, + "loss": 0.3947, + "step": 4362 + }, + { + "epoch": 2.4374301675977654, + "grad_norm": 1.0805692672729492, + "learning_rate": 0.0008807563025210085, + "loss": 0.391, + "step": 4363 + }, + { + "epoch": 2.4379888268156424, + "grad_norm": 0.4999154210090637, + "learning_rate": 0.0008807282913165267, + "loss": 0.4842, + "step": 4364 + }, + { + "epoch": 2.4385474860335195, + "grad_norm": 0.391719251871109, + "learning_rate": 0.0008807002801120449, + "loss": 0.4172, + "step": 4365 + }, + { + "epoch": 2.4391061452513965, + "grad_norm": 0.9226119518280029, + "learning_rate": 0.000880672268907563, + "loss": 0.4765, + "step": 4366 + }, + { + "epoch": 2.439664804469274, + "grad_norm": 0.6326907277107239, + "learning_rate": 0.0008806442577030812, + "loss": 0.4686, + "step": 4367 + }, + { + "epoch": 2.440223463687151, + "grad_norm": 0.5321052074432373, + "learning_rate": 0.0008806162464985995, + "loss": 0.5778, + "step": 4368 + }, + { + "epoch": 2.440782122905028, + "grad_norm": 6.32752799987793, + "learning_rate": 0.0008805882352941177, + "loss": 0.3842, + "step": 4369 + }, + { + "epoch": 2.441340782122905, + "grad_norm": 2.7710537910461426, + "learning_rate": 0.0008805602240896359, + "loss": 0.6063, + "step": 4370 + }, + { + "epoch": 2.441899441340782, + "grad_norm": 0.5310767889022827, + "learning_rate": 0.000880532212885154, + "loss": 0.495, + "step": 4371 + }, + { + "epoch": 2.4424581005586594, + "grad_norm": 0.8203009963035583, + "learning_rate": 0.0008805042016806722, + "loss": 0.4765, + "step": 4372 + }, + { + "epoch": 2.4430167597765364, + "grad_norm": 0.4067409634590149, + "learning_rate": 0.0008804761904761905, + "loss": 0.4315, + "step": 4373 + }, + { + "epoch": 2.4435754189944134, + "grad_norm": 0.6586512327194214, + "learning_rate": 0.0008804481792717087, + "loss": 0.4524, + "step": 4374 + }, + { + "epoch": 2.4441340782122905, + "grad_norm": 1.092674970626831, + "learning_rate": 0.0008804201680672269, + "loss": 0.4151, + "step": 4375 + }, + { + "epoch": 2.4446927374301675, + "grad_norm": 0.864611029624939, + "learning_rate": 0.0008803921568627451, + "loss": 0.4224, + "step": 4376 + }, + { + "epoch": 2.445251396648045, + "grad_norm": 1.6714537143707275, + "learning_rate": 0.0008803641456582632, + "loss": 0.4327, + "step": 4377 + }, + { + "epoch": 2.445810055865922, + "grad_norm": 2.8098738193511963, + "learning_rate": 0.0008803361344537816, + "loss": 0.4702, + "step": 4378 + }, + { + "epoch": 2.446368715083799, + "grad_norm": 0.5370066165924072, + "learning_rate": 0.0008803081232492998, + "loss": 0.462, + "step": 4379 + }, + { + "epoch": 2.446927374301676, + "grad_norm": 0.7801875472068787, + "learning_rate": 0.000880280112044818, + "loss": 0.5963, + "step": 4380 + }, + { + "epoch": 2.447486033519553, + "grad_norm": 0.5990810394287109, + "learning_rate": 0.0008802521008403362, + "loss": 0.4869, + "step": 4381 + }, + { + "epoch": 2.4480446927374304, + "grad_norm": 0.6246578693389893, + "learning_rate": 0.0008802240896358543, + "loss": 0.5052, + "step": 4382 + }, + { + "epoch": 2.4486033519553074, + "grad_norm": 0.5706420540809631, + "learning_rate": 0.0008801960784313726, + "loss": 0.4506, + "step": 4383 + }, + { + "epoch": 2.4491620111731844, + "grad_norm": 0.761964738368988, + "learning_rate": 0.0008801680672268908, + "loss": 0.4354, + "step": 4384 + }, + { + "epoch": 2.4497206703910615, + "grad_norm": 0.4557899534702301, + "learning_rate": 0.000880140056022409, + "loss": 0.4536, + "step": 4385 + }, + { + "epoch": 2.4502793296089385, + "grad_norm": 0.8080939054489136, + "learning_rate": 0.0008801120448179272, + "loss": 0.4344, + "step": 4386 + }, + { + "epoch": 2.4508379888268155, + "grad_norm": 5.231118202209473, + "learning_rate": 0.0008800840336134453, + "loss": 0.4743, + "step": 4387 + }, + { + "epoch": 2.451396648044693, + "grad_norm": 0.5391502380371094, + "learning_rate": 0.0008800560224089636, + "loss": 0.4194, + "step": 4388 + }, + { + "epoch": 2.45195530726257, + "grad_norm": 1.0946112871170044, + "learning_rate": 0.0008800280112044818, + "loss": 0.5505, + "step": 4389 + }, + { + "epoch": 2.452513966480447, + "grad_norm": 0.7133363485336304, + "learning_rate": 0.00088, + "loss": 0.5146, + "step": 4390 + }, + { + "epoch": 2.453072625698324, + "grad_norm": 0.5240291953086853, + "learning_rate": 0.0008799719887955182, + "loss": 0.454, + "step": 4391 + }, + { + "epoch": 2.453631284916201, + "grad_norm": 1.0536044836044312, + "learning_rate": 0.0008799439775910364, + "loss": 0.495, + "step": 4392 + }, + { + "epoch": 2.454189944134078, + "grad_norm": 0.5113751292228699, + "learning_rate": 0.0008799159663865546, + "loss": 0.4708, + "step": 4393 + }, + { + "epoch": 2.4547486033519554, + "grad_norm": 1.1066749095916748, + "learning_rate": 0.0008798879551820729, + "loss": 0.5135, + "step": 4394 + }, + { + "epoch": 2.4553072625698324, + "grad_norm": 0.6315325498580933, + "learning_rate": 0.0008798599439775911, + "loss": 0.5431, + "step": 4395 + }, + { + "epoch": 2.4558659217877095, + "grad_norm": 2.807061195373535, + "learning_rate": 0.0008798319327731093, + "loss": 0.4797, + "step": 4396 + }, + { + "epoch": 2.4564245810055865, + "grad_norm": 3.1679513454437256, + "learning_rate": 0.0008798039215686275, + "loss": 0.5308, + "step": 4397 + }, + { + "epoch": 2.4569832402234635, + "grad_norm": 0.4514045715332031, + "learning_rate": 0.0008797759103641457, + "loss": 0.4231, + "step": 4398 + }, + { + "epoch": 2.457541899441341, + "grad_norm": 0.49052560329437256, + "learning_rate": 0.0008797478991596639, + "loss": 0.4859, + "step": 4399 + }, + { + "epoch": 2.458100558659218, + "grad_norm": 0.45187923312187195, + "learning_rate": 0.0008797198879551821, + "loss": 0.4388, + "step": 4400 + }, + { + "epoch": 2.458659217877095, + "grad_norm": 8.988836288452148, + "learning_rate": 0.0008796918767507003, + "loss": 0.4998, + "step": 4401 + }, + { + "epoch": 2.459217877094972, + "grad_norm": 0.49433860182762146, + "learning_rate": 0.0008796638655462185, + "loss": 0.4329, + "step": 4402 + }, + { + "epoch": 2.459776536312849, + "grad_norm": 0.4444673955440521, + "learning_rate": 0.0008796358543417367, + "loss": 0.3616, + "step": 4403 + }, + { + "epoch": 2.4603351955307264, + "grad_norm": 1.8196717500686646, + "learning_rate": 0.0008796078431372549, + "loss": 0.5, + "step": 4404 + }, + { + "epoch": 2.4608938547486034, + "grad_norm": 0.5254615545272827, + "learning_rate": 0.0008795798319327731, + "loss": 0.4922, + "step": 4405 + }, + { + "epoch": 2.4614525139664805, + "grad_norm": 0.5483995079994202, + "learning_rate": 0.0008795518207282913, + "loss": 0.5123, + "step": 4406 + }, + { + "epoch": 2.4620111731843575, + "grad_norm": 2.8272669315338135, + "learning_rate": 0.0008795238095238095, + "loss": 0.4422, + "step": 4407 + }, + { + "epoch": 2.4625698324022345, + "grad_norm": 0.7430616617202759, + "learning_rate": 0.0008794957983193278, + "loss": 0.5514, + "step": 4408 + }, + { + "epoch": 2.463128491620112, + "grad_norm": 3.747798442840576, + "learning_rate": 0.0008794677871148459, + "loss": 0.3925, + "step": 4409 + }, + { + "epoch": 2.463687150837989, + "grad_norm": 0.590506911277771, + "learning_rate": 0.0008794397759103642, + "loss": 0.4449, + "step": 4410 + }, + { + "epoch": 2.464245810055866, + "grad_norm": 0.42584675550460815, + "learning_rate": 0.0008794117647058824, + "loss": 0.4518, + "step": 4411 + }, + { + "epoch": 2.464804469273743, + "grad_norm": 1.0190391540527344, + "learning_rate": 0.0008793837535014006, + "loss": 0.6653, + "step": 4412 + }, + { + "epoch": 2.46536312849162, + "grad_norm": 0.45869314670562744, + "learning_rate": 0.0008793557422969189, + "loss": 0.4285, + "step": 4413 + }, + { + "epoch": 2.4659217877094974, + "grad_norm": 0.5037466883659363, + "learning_rate": 0.000879327731092437, + "loss": 0.4296, + "step": 4414 + }, + { + "epoch": 2.4664804469273744, + "grad_norm": 1.2750922441482544, + "learning_rate": 0.0008792997198879552, + "loss": 0.4937, + "step": 4415 + }, + { + "epoch": 2.4670391061452515, + "grad_norm": 1.1161415576934814, + "learning_rate": 0.0008792717086834734, + "loss": 0.5768, + "step": 4416 + }, + { + "epoch": 2.4675977653631285, + "grad_norm": 0.8670387864112854, + "learning_rate": 0.0008792436974789916, + "loss": 0.4481, + "step": 4417 + }, + { + "epoch": 2.4681564245810055, + "grad_norm": 1.4730231761932373, + "learning_rate": 0.0008792156862745099, + "loss": 0.5884, + "step": 4418 + }, + { + "epoch": 2.4687150837988825, + "grad_norm": 0.7452554702758789, + "learning_rate": 0.000879187675070028, + "loss": 0.5316, + "step": 4419 + }, + { + "epoch": 2.46927374301676, + "grad_norm": 0.4893207550048828, + "learning_rate": 0.0008791596638655462, + "loss": 0.3357, + "step": 4420 + }, + { + "epoch": 2.469832402234637, + "grad_norm": 2.1996099948883057, + "learning_rate": 0.0008791316526610644, + "loss": 0.3918, + "step": 4421 + }, + { + "epoch": 2.470391061452514, + "grad_norm": 0.6242367029190063, + "learning_rate": 0.0008791036414565826, + "loss": 0.4399, + "step": 4422 + }, + { + "epoch": 2.470949720670391, + "grad_norm": 0.5205169916152954, + "learning_rate": 0.0008790756302521009, + "loss": 0.4872, + "step": 4423 + }, + { + "epoch": 2.471508379888268, + "grad_norm": 0.5776616334915161, + "learning_rate": 0.0008790476190476191, + "loss": 0.5271, + "step": 4424 + }, + { + "epoch": 2.472067039106145, + "grad_norm": 0.6396153569221497, + "learning_rate": 0.0008790196078431372, + "loss": 0.4457, + "step": 4425 + }, + { + "epoch": 2.4726256983240225, + "grad_norm": 1.701348066329956, + "learning_rate": 0.0008789915966386554, + "loss": 0.5394, + "step": 4426 + }, + { + "epoch": 2.4731843575418995, + "grad_norm": 0.649161696434021, + "learning_rate": 0.0008789635854341737, + "loss": 0.5024, + "step": 4427 + }, + { + "epoch": 2.4737430167597765, + "grad_norm": 0.8495590090751648, + "learning_rate": 0.000878935574229692, + "loss": 0.4276, + "step": 4428 + }, + { + "epoch": 2.4743016759776535, + "grad_norm": 1.5100113153457642, + "learning_rate": 0.0008789075630252102, + "loss": 0.5683, + "step": 4429 + }, + { + "epoch": 2.4748603351955305, + "grad_norm": 0.5396854281425476, + "learning_rate": 0.0008788795518207283, + "loss": 0.485, + "step": 4430 + }, + { + "epoch": 2.475418994413408, + "grad_norm": 0.9645025134086609, + "learning_rate": 0.0008788515406162465, + "loss": 0.5211, + "step": 4431 + }, + { + "epoch": 2.475977653631285, + "grad_norm": 0.522449254989624, + "learning_rate": 0.0008788235294117647, + "loss": 0.5439, + "step": 4432 + }, + { + "epoch": 2.476536312849162, + "grad_norm": 0.6460433602333069, + "learning_rate": 0.000878795518207283, + "loss": 0.4083, + "step": 4433 + }, + { + "epoch": 2.477094972067039, + "grad_norm": 0.8278968930244446, + "learning_rate": 0.0008787675070028012, + "loss": 0.3941, + "step": 4434 + }, + { + "epoch": 2.477653631284916, + "grad_norm": 0.5852335691452026, + "learning_rate": 0.0008787394957983193, + "loss": 0.6851, + "step": 4435 + }, + { + "epoch": 2.4782122905027935, + "grad_norm": 0.4538413882255554, + "learning_rate": 0.0008787114845938375, + "loss": 0.4634, + "step": 4436 + }, + { + "epoch": 2.4787709497206705, + "grad_norm": 0.5930296182632446, + "learning_rate": 0.0008786834733893557, + "loss": 0.4946, + "step": 4437 + }, + { + "epoch": 2.4793296089385475, + "grad_norm": 0.569406270980835, + "learning_rate": 0.000878655462184874, + "loss": 0.6094, + "step": 4438 + }, + { + "epoch": 2.4798882681564245, + "grad_norm": 0.5196834206581116, + "learning_rate": 0.0008786274509803922, + "loss": 0.5329, + "step": 4439 + }, + { + "epoch": 2.4804469273743015, + "grad_norm": 1.3550713062286377, + "learning_rate": 0.0008785994397759104, + "loss": 0.4816, + "step": 4440 + }, + { + "epoch": 2.481005586592179, + "grad_norm": 0.6692785620689392, + "learning_rate": 0.0008785714285714285, + "loss": 0.3862, + "step": 4441 + }, + { + "epoch": 2.481564245810056, + "grad_norm": 0.3891158998012543, + "learning_rate": 0.0008785434173669467, + "loss": 0.4182, + "step": 4442 + }, + { + "epoch": 2.482122905027933, + "grad_norm": 0.8678598403930664, + "learning_rate": 0.0008785154061624651, + "loss": 0.5296, + "step": 4443 + }, + { + "epoch": 2.48268156424581, + "grad_norm": 0.6176441311836243, + "learning_rate": 0.0008784873949579833, + "loss": 0.4719, + "step": 4444 + }, + { + "epoch": 2.483240223463687, + "grad_norm": 2.4637835025787354, + "learning_rate": 0.0008784593837535015, + "loss": 0.4647, + "step": 4445 + }, + { + "epoch": 2.4837988826815645, + "grad_norm": 0.5736443996429443, + "learning_rate": 0.0008784313725490196, + "loss": 0.5703, + "step": 4446 + }, + { + "epoch": 2.4843575418994415, + "grad_norm": 0.6970122456550598, + "learning_rate": 0.0008784033613445378, + "loss": 0.4167, + "step": 4447 + }, + { + "epoch": 2.4849162011173185, + "grad_norm": 0.4753473997116089, + "learning_rate": 0.0008783753501400561, + "loss": 0.5326, + "step": 4448 + }, + { + "epoch": 2.4854748603351955, + "grad_norm": 0.6131035685539246, + "learning_rate": 0.0008783473389355743, + "loss": 0.4966, + "step": 4449 + }, + { + "epoch": 2.4860335195530725, + "grad_norm": 0.8042401075363159, + "learning_rate": 0.0008783193277310925, + "loss": 0.5292, + "step": 4450 + }, + { + "epoch": 2.48659217877095, + "grad_norm": 0.5250211358070374, + "learning_rate": 0.0008782913165266106, + "loss": 0.4898, + "step": 4451 + }, + { + "epoch": 2.487150837988827, + "grad_norm": 0.6477358937263489, + "learning_rate": 0.0008782633053221288, + "loss": 0.3879, + "step": 4452 + }, + { + "epoch": 2.487709497206704, + "grad_norm": 0.5329732298851013, + "learning_rate": 0.0008782352941176471, + "loss": 0.468, + "step": 4453 + }, + { + "epoch": 2.488268156424581, + "grad_norm": 0.6341084241867065, + "learning_rate": 0.0008782072829131653, + "loss": 0.4227, + "step": 4454 + }, + { + "epoch": 2.488826815642458, + "grad_norm": 0.4659540355205536, + "learning_rate": 0.0008781792717086835, + "loss": 0.5092, + "step": 4455 + }, + { + "epoch": 2.489385474860335, + "grad_norm": 0.7676764130592346, + "learning_rate": 0.0008781512605042017, + "loss": 0.4128, + "step": 4456 + }, + { + "epoch": 2.4899441340782125, + "grad_norm": 0.7669159770011902, + "learning_rate": 0.0008781232492997198, + "loss": 0.3884, + "step": 4457 + }, + { + "epoch": 2.4905027932960895, + "grad_norm": 0.5939233899116516, + "learning_rate": 0.000878095238095238, + "loss": 0.3679, + "step": 4458 + }, + { + "epoch": 2.4910614525139665, + "grad_norm": 0.640285074710846, + "learning_rate": 0.0008780672268907564, + "loss": 0.4908, + "step": 4459 + }, + { + "epoch": 2.4916201117318435, + "grad_norm": 0.40500783920288086, + "learning_rate": 0.0008780392156862746, + "loss": 0.3761, + "step": 4460 + }, + { + "epoch": 2.4921787709497205, + "grad_norm": 0.5249844193458557, + "learning_rate": 0.0008780112044817928, + "loss": 0.4169, + "step": 4461 + }, + { + "epoch": 2.4927374301675975, + "grad_norm": 0.611035943031311, + "learning_rate": 0.0008779831932773109, + "loss": 0.4091, + "step": 4462 + }, + { + "epoch": 2.493296089385475, + "grad_norm": 1.7709965705871582, + "learning_rate": 0.0008779551820728291, + "loss": 0.4715, + "step": 4463 + }, + { + "epoch": 2.493854748603352, + "grad_norm": 0.667874276638031, + "learning_rate": 0.0008779271708683474, + "loss": 0.5608, + "step": 4464 + }, + { + "epoch": 2.494413407821229, + "grad_norm": 0.5665532946586609, + "learning_rate": 0.0008778991596638656, + "loss": 0.4516, + "step": 4465 + }, + { + "epoch": 2.494972067039106, + "grad_norm": 0.8860997557640076, + "learning_rate": 0.0008778711484593838, + "loss": 0.3293, + "step": 4466 + }, + { + "epoch": 2.495530726256983, + "grad_norm": 0.5210645198822021, + "learning_rate": 0.0008778431372549019, + "loss": 0.4554, + "step": 4467 + }, + { + "epoch": 2.4960893854748605, + "grad_norm": 0.7173487544059753, + "learning_rate": 0.0008778151260504201, + "loss": 0.4601, + "step": 4468 + }, + { + "epoch": 2.4966480446927375, + "grad_norm": 1.1310702562332153, + "learning_rate": 0.0008777871148459384, + "loss": 0.5989, + "step": 4469 + }, + { + "epoch": 2.4972067039106145, + "grad_norm": 1.9089235067367554, + "learning_rate": 0.0008777591036414566, + "loss": 0.3195, + "step": 4470 + }, + { + "epoch": 2.4977653631284915, + "grad_norm": 0.8973751068115234, + "learning_rate": 0.0008777310924369748, + "loss": 0.5801, + "step": 4471 + }, + { + "epoch": 2.4983240223463685, + "grad_norm": 0.9188690781593323, + "learning_rate": 0.000877703081232493, + "loss": 0.5607, + "step": 4472 + }, + { + "epoch": 2.498882681564246, + "grad_norm": 0.5488870143890381, + "learning_rate": 0.0008776750700280111, + "loss": 0.4537, + "step": 4473 + }, + { + "epoch": 2.499441340782123, + "grad_norm": 0.5225598216056824, + "learning_rate": 0.0008776470588235294, + "loss": 0.4182, + "step": 4474 + }, + { + "epoch": 2.5, + "grad_norm": 1.4376113414764404, + "learning_rate": 0.0008776190476190476, + "loss": 0.5006, + "step": 4475 + }, + { + "epoch": 2.500558659217877, + "grad_norm": 0.47329193353652954, + "learning_rate": 0.0008775910364145659, + "loss": 0.499, + "step": 4476 + }, + { + "epoch": 2.501117318435754, + "grad_norm": 0.8424351811408997, + "learning_rate": 0.0008775630252100841, + "loss": 0.5124, + "step": 4477 + }, + { + "epoch": 2.5016759776536315, + "grad_norm": 0.9163236618041992, + "learning_rate": 0.0008775350140056022, + "loss": 0.5137, + "step": 4478 + }, + { + "epoch": 2.5022346368715085, + "grad_norm": 0.7280294895172119, + "learning_rate": 0.0008775070028011205, + "loss": 0.4547, + "step": 4479 + }, + { + "epoch": 2.5027932960893855, + "grad_norm": 0.5652651786804199, + "learning_rate": 0.0008774789915966387, + "loss": 0.3931, + "step": 4480 + }, + { + "epoch": 2.5033519553072625, + "grad_norm": 0.874708354473114, + "learning_rate": 0.0008774509803921569, + "loss": 0.4496, + "step": 4481 + }, + { + "epoch": 2.5039106145251395, + "grad_norm": 0.5706674456596375, + "learning_rate": 0.0008774229691876751, + "loss": 0.4237, + "step": 4482 + }, + { + "epoch": 2.504469273743017, + "grad_norm": 0.4492708146572113, + "learning_rate": 0.0008773949579831932, + "loss": 0.4023, + "step": 4483 + }, + { + "epoch": 2.505027932960894, + "grad_norm": 0.8117376565933228, + "learning_rate": 0.0008773669467787115, + "loss": 0.5415, + "step": 4484 + }, + { + "epoch": 2.505586592178771, + "grad_norm": 0.5417127013206482, + "learning_rate": 0.0008773389355742297, + "loss": 0.4804, + "step": 4485 + }, + { + "epoch": 2.506145251396648, + "grad_norm": 0.5969968438148499, + "learning_rate": 0.0008773109243697479, + "loss": 0.5356, + "step": 4486 + }, + { + "epoch": 2.506703910614525, + "grad_norm": 1.2206852436065674, + "learning_rate": 0.0008772829131652661, + "loss": 0.6047, + "step": 4487 + }, + { + "epoch": 2.5072625698324025, + "grad_norm": 0.9537792801856995, + "learning_rate": 0.0008772549019607843, + "loss": 0.475, + "step": 4488 + }, + { + "epoch": 2.5078212290502795, + "grad_norm": 4.100423336029053, + "learning_rate": 0.0008772268907563025, + "loss": 0.4823, + "step": 4489 + }, + { + "epoch": 2.5083798882681565, + "grad_norm": 0.5287656784057617, + "learning_rate": 0.0008771988795518207, + "loss": 0.5067, + "step": 4490 + }, + { + "epoch": 2.5089385474860335, + "grad_norm": 0.8828968405723572, + "learning_rate": 0.0008771708683473389, + "loss": 0.4223, + "step": 4491 + }, + { + "epoch": 2.5094972067039105, + "grad_norm": 0.6327812075614929, + "learning_rate": 0.0008771428571428572, + "loss": 0.4399, + "step": 4492 + }, + { + "epoch": 2.510055865921788, + "grad_norm": 0.451775461435318, + "learning_rate": 0.0008771148459383754, + "loss": 0.3741, + "step": 4493 + }, + { + "epoch": 2.5106145251396645, + "grad_norm": 0.7219546437263489, + "learning_rate": 0.0008770868347338936, + "loss": 0.5053, + "step": 4494 + }, + { + "epoch": 2.511173184357542, + "grad_norm": 0.5533103346824646, + "learning_rate": 0.0008770588235294118, + "loss": 0.4911, + "step": 4495 + }, + { + "epoch": 2.511731843575419, + "grad_norm": 0.7420123219490051, + "learning_rate": 0.00087703081232493, + "loss": 0.4492, + "step": 4496 + }, + { + "epoch": 2.512290502793296, + "grad_norm": 0.43191009759902954, + "learning_rate": 0.0008770028011204482, + "loss": 0.4421, + "step": 4497 + }, + { + "epoch": 2.512849162011173, + "grad_norm": 0.49949371814727783, + "learning_rate": 0.0008769747899159664, + "loss": 0.5096, + "step": 4498 + }, + { + "epoch": 2.51340782122905, + "grad_norm": 0.6802654266357422, + "learning_rate": 0.0008769467787114847, + "loss": 0.4505, + "step": 4499 + }, + { + "epoch": 2.5139664804469275, + "grad_norm": 0.4956294596195221, + "learning_rate": 0.0008769187675070028, + "loss": 0.4281, + "step": 4500 + }, + { + "epoch": 2.5139664804469275, + "eval_cer": 0.09615615419026109, + "eval_loss": 0.36138299107551575, + "eval_runtime": 55.6478, + "eval_samples_per_second": 81.549, + "eval_steps_per_second": 5.104, + "eval_wer": 0.3760509482975336, + "step": 4500 + }, + { + "epoch": 2.5145251396648045, + "grad_norm": 0.5228537321090698, + "learning_rate": 0.000876890756302521, + "loss": 0.4907, + "step": 4501 + }, + { + "epoch": 2.5150837988826815, + "grad_norm": 0.6658404469490051, + "learning_rate": 0.0008768627450980392, + "loss": 0.39, + "step": 4502 + }, + { + "epoch": 2.5156424581005585, + "grad_norm": 0.5798348188400269, + "learning_rate": 0.0008768347338935574, + "loss": 0.4364, + "step": 4503 + }, + { + "epoch": 2.5162011173184355, + "grad_norm": 1.7013322114944458, + "learning_rate": 0.0008768067226890757, + "loss": 0.4757, + "step": 4504 + }, + { + "epoch": 2.516759776536313, + "grad_norm": 0.6113928556442261, + "learning_rate": 0.0008767787114845938, + "loss": 0.5182, + "step": 4505 + }, + { + "epoch": 2.51731843575419, + "grad_norm": 0.8509871959686279, + "learning_rate": 0.000876750700280112, + "loss": 0.6024, + "step": 4506 + }, + { + "epoch": 2.517877094972067, + "grad_norm": 0.48457011580467224, + "learning_rate": 0.0008767226890756302, + "loss": 0.3722, + "step": 4507 + }, + { + "epoch": 2.518435754189944, + "grad_norm": 0.5147897005081177, + "learning_rate": 0.0008766946778711484, + "loss": 0.4042, + "step": 4508 + }, + { + "epoch": 2.518994413407821, + "grad_norm": 0.652128279209137, + "learning_rate": 0.0008766666666666668, + "loss": 0.5105, + "step": 4509 + }, + { + "epoch": 2.5195530726256985, + "grad_norm": 0.5479727983474731, + "learning_rate": 0.0008766386554621849, + "loss": 0.4718, + "step": 4510 + }, + { + "epoch": 2.5201117318435755, + "grad_norm": 0.6929388642311096, + "learning_rate": 0.0008766106442577031, + "loss": 0.4866, + "step": 4511 + }, + { + "epoch": 2.5206703910614525, + "grad_norm": 0.577779233455658, + "learning_rate": 0.0008765826330532213, + "loss": 0.5949, + "step": 4512 + }, + { + "epoch": 2.5212290502793295, + "grad_norm": 0.5748147368431091, + "learning_rate": 0.0008765546218487395, + "loss": 0.5287, + "step": 4513 + }, + { + "epoch": 2.5217877094972065, + "grad_norm": 0.9498600959777832, + "learning_rate": 0.0008765266106442578, + "loss": 0.4796, + "step": 4514 + }, + { + "epoch": 2.522346368715084, + "grad_norm": 0.8140057325363159, + "learning_rate": 0.000876498599439776, + "loss": 0.4023, + "step": 4515 + }, + { + "epoch": 2.522905027932961, + "grad_norm": 0.45185020565986633, + "learning_rate": 0.0008764705882352941, + "loss": 0.4363, + "step": 4516 + }, + { + "epoch": 2.523463687150838, + "grad_norm": 0.5347515344619751, + "learning_rate": 0.0008764425770308123, + "loss": 0.4074, + "step": 4517 + }, + { + "epoch": 2.524022346368715, + "grad_norm": 0.4780946969985962, + "learning_rate": 0.0008764145658263305, + "loss": 0.467, + "step": 4518 + }, + { + "epoch": 2.524581005586592, + "grad_norm": 0.7791872024536133, + "learning_rate": 0.0008763865546218488, + "loss": 0.4619, + "step": 4519 + }, + { + "epoch": 2.5251396648044695, + "grad_norm": 0.6607198119163513, + "learning_rate": 0.000876358543417367, + "loss": 0.4721, + "step": 4520 + }, + { + "epoch": 2.5256983240223465, + "grad_norm": 0.6076475977897644, + "learning_rate": 0.0008763305322128851, + "loss": 0.4908, + "step": 4521 + }, + { + "epoch": 2.5262569832402235, + "grad_norm": 1.4148967266082764, + "learning_rate": 0.0008763025210084033, + "loss": 0.4464, + "step": 4522 + }, + { + "epoch": 2.5268156424581005, + "grad_norm": 0.6420329809188843, + "learning_rate": 0.0008762745098039215, + "loss": 0.4436, + "step": 4523 + }, + { + "epoch": 2.5273743016759775, + "grad_norm": 2.79652738571167, + "learning_rate": 0.0008762464985994399, + "loss": 0.5347, + "step": 4524 + }, + { + "epoch": 2.527932960893855, + "grad_norm": 0.6680772304534912, + "learning_rate": 0.0008762184873949581, + "loss": 0.5336, + "step": 4525 + }, + { + "epoch": 2.528491620111732, + "grad_norm": 1.2352335453033447, + "learning_rate": 0.0008761904761904762, + "loss": 0.5413, + "step": 4526 + }, + { + "epoch": 2.529050279329609, + "grad_norm": 0.7459228038787842, + "learning_rate": 0.0008761624649859944, + "loss": 0.4987, + "step": 4527 + }, + { + "epoch": 2.529608938547486, + "grad_norm": 0.6470720767974854, + "learning_rate": 0.0008761344537815126, + "loss": 0.6132, + "step": 4528 + }, + { + "epoch": 2.530167597765363, + "grad_norm": 0.8248283267021179, + "learning_rate": 0.0008761064425770309, + "loss": 0.5492, + "step": 4529 + }, + { + "epoch": 2.5307262569832405, + "grad_norm": 0.6090158820152283, + "learning_rate": 0.0008760784313725491, + "loss": 0.4907, + "step": 4530 + }, + { + "epoch": 2.531284916201117, + "grad_norm": 1.093633770942688, + "learning_rate": 0.0008760504201680673, + "loss": 0.4484, + "step": 4531 + }, + { + "epoch": 2.5318435754189945, + "grad_norm": 2.0259437561035156, + "learning_rate": 0.0008760224089635854, + "loss": 0.496, + "step": 4532 + }, + { + "epoch": 2.5324022346368715, + "grad_norm": 0.5741031169891357, + "learning_rate": 0.0008759943977591036, + "loss": 0.5206, + "step": 4533 + }, + { + "epoch": 2.5329608938547485, + "grad_norm": 0.9330551624298096, + "learning_rate": 0.0008759663865546219, + "loss": 0.4618, + "step": 4534 + }, + { + "epoch": 2.5335195530726256, + "grad_norm": 4.283111572265625, + "learning_rate": 0.0008759383753501401, + "loss": 0.4011, + "step": 4535 + }, + { + "epoch": 2.5340782122905026, + "grad_norm": 0.6364736557006836, + "learning_rate": 0.0008759103641456583, + "loss": 0.4744, + "step": 4536 + }, + { + "epoch": 2.53463687150838, + "grad_norm": 0.7679871916770935, + "learning_rate": 0.0008758823529411764, + "loss": 0.4555, + "step": 4537 + }, + { + "epoch": 2.535195530726257, + "grad_norm": 0.7553772926330566, + "learning_rate": 0.0008758543417366946, + "loss": 0.382, + "step": 4538 + }, + { + "epoch": 2.535754189944134, + "grad_norm": 0.6703191995620728, + "learning_rate": 0.0008758263305322129, + "loss": 0.3969, + "step": 4539 + }, + { + "epoch": 2.536312849162011, + "grad_norm": 0.5475282669067383, + "learning_rate": 0.0008757983193277311, + "loss": 0.4321, + "step": 4540 + }, + { + "epoch": 2.536871508379888, + "grad_norm": 0.9024701714515686, + "learning_rate": 0.0008757703081232494, + "loss": 0.4568, + "step": 4541 + }, + { + "epoch": 2.5374301675977655, + "grad_norm": 0.8123811483383179, + "learning_rate": 0.0008757422969187675, + "loss": 0.4794, + "step": 4542 + }, + { + "epoch": 2.5379888268156425, + "grad_norm": 0.5144625306129456, + "learning_rate": 0.0008757142857142857, + "loss": 0.5732, + "step": 4543 + }, + { + "epoch": 2.5385474860335195, + "grad_norm": 0.8084815144538879, + "learning_rate": 0.000875686274509804, + "loss": 0.4647, + "step": 4544 + }, + { + "epoch": 2.5391061452513966, + "grad_norm": 1.0256503820419312, + "learning_rate": 0.0008756582633053222, + "loss": 0.4485, + "step": 4545 + }, + { + "epoch": 2.5396648044692736, + "grad_norm": 0.6930704116821289, + "learning_rate": 0.0008756302521008404, + "loss": 0.495, + "step": 4546 + }, + { + "epoch": 2.540223463687151, + "grad_norm": 0.8160821795463562, + "learning_rate": 0.0008756022408963586, + "loss": 0.562, + "step": 4547 + }, + { + "epoch": 2.540782122905028, + "grad_norm": 0.5813196897506714, + "learning_rate": 0.0008755742296918767, + "loss": 0.4651, + "step": 4548 + }, + { + "epoch": 2.541340782122905, + "grad_norm": 0.617099404335022, + "learning_rate": 0.000875546218487395, + "loss": 0.4141, + "step": 4549 + }, + { + "epoch": 2.541899441340782, + "grad_norm": 0.4484594166278839, + "learning_rate": 0.0008755182072829132, + "loss": 0.4557, + "step": 4550 + }, + { + "epoch": 2.542458100558659, + "grad_norm": 6.695379734039307, + "learning_rate": 0.0008754901960784314, + "loss": 0.4751, + "step": 4551 + }, + { + "epoch": 2.5430167597765365, + "grad_norm": 1.1015985012054443, + "learning_rate": 0.0008754621848739496, + "loss": 0.4442, + "step": 4552 + }, + { + "epoch": 2.5435754189944135, + "grad_norm": 1.083116888999939, + "learning_rate": 0.0008754341736694677, + "loss": 0.5141, + "step": 4553 + }, + { + "epoch": 2.5441340782122905, + "grad_norm": 0.5880929231643677, + "learning_rate": 0.000875406162464986, + "loss": 0.3553, + "step": 4554 + }, + { + "epoch": 2.5446927374301676, + "grad_norm": 1.604151964187622, + "learning_rate": 0.0008753781512605042, + "loss": 0.5281, + "step": 4555 + }, + { + "epoch": 2.5452513966480446, + "grad_norm": 0.6877735257148743, + "learning_rate": 0.0008753501400560224, + "loss": 0.4869, + "step": 4556 + }, + { + "epoch": 2.545810055865922, + "grad_norm": 0.6027437448501587, + "learning_rate": 0.0008753221288515406, + "loss": 0.5367, + "step": 4557 + }, + { + "epoch": 2.546368715083799, + "grad_norm": 0.7366870641708374, + "learning_rate": 0.0008752941176470587, + "loss": 0.548, + "step": 4558 + }, + { + "epoch": 2.546927374301676, + "grad_norm": 1.889333724975586, + "learning_rate": 0.0008752661064425771, + "loss": 0.4813, + "step": 4559 + }, + { + "epoch": 2.547486033519553, + "grad_norm": 0.6310818791389465, + "learning_rate": 0.0008752380952380953, + "loss": 0.4387, + "step": 4560 + }, + { + "epoch": 2.54804469273743, + "grad_norm": 1.465470790863037, + "learning_rate": 0.0008752100840336135, + "loss": 0.5065, + "step": 4561 + }, + { + "epoch": 2.5486033519553075, + "grad_norm": 0.4020266830921173, + "learning_rate": 0.0008751820728291317, + "loss": 0.4131, + "step": 4562 + }, + { + "epoch": 2.549162011173184, + "grad_norm": 0.6013814210891724, + "learning_rate": 0.0008751540616246499, + "loss": 0.5172, + "step": 4563 + }, + { + "epoch": 2.5497206703910615, + "grad_norm": 1.005944013595581, + "learning_rate": 0.0008751260504201681, + "loss": 0.6483, + "step": 4564 + }, + { + "epoch": 2.5502793296089385, + "grad_norm": 0.617769718170166, + "learning_rate": 0.0008750980392156863, + "loss": 0.4005, + "step": 4565 + }, + { + "epoch": 2.5508379888268156, + "grad_norm": 0.7957714796066284, + "learning_rate": 0.0008750700280112045, + "loss": 0.5973, + "step": 4566 + }, + { + "epoch": 2.5513966480446926, + "grad_norm": 0.5730568766593933, + "learning_rate": 0.0008750420168067227, + "loss": 0.4122, + "step": 4567 + }, + { + "epoch": 2.5519553072625696, + "grad_norm": 0.410114586353302, + "learning_rate": 0.0008750140056022409, + "loss": 0.3875, + "step": 4568 + }, + { + "epoch": 2.552513966480447, + "grad_norm": 0.5299544930458069, + "learning_rate": 0.0008749859943977591, + "loss": 0.5132, + "step": 4569 + }, + { + "epoch": 2.553072625698324, + "grad_norm": 0.42914700508117676, + "learning_rate": 0.0008749579831932773, + "loss": 0.4213, + "step": 4570 + }, + { + "epoch": 2.553631284916201, + "grad_norm": 0.7517532110214233, + "learning_rate": 0.0008749299719887955, + "loss": 0.5517, + "step": 4571 + }, + { + "epoch": 2.554189944134078, + "grad_norm": 0.4640340805053711, + "learning_rate": 0.0008749019607843137, + "loss": 0.5067, + "step": 4572 + }, + { + "epoch": 2.554748603351955, + "grad_norm": 3.790728807449341, + "learning_rate": 0.0008748739495798319, + "loss": 0.5391, + "step": 4573 + }, + { + "epoch": 2.5553072625698325, + "grad_norm": 0.4719926118850708, + "learning_rate": 0.0008748459383753502, + "loss": 0.3969, + "step": 4574 + }, + { + "epoch": 2.5558659217877095, + "grad_norm": 0.5402041077613831, + "learning_rate": 0.0008748179271708684, + "loss": 0.4546, + "step": 4575 + }, + { + "epoch": 2.5564245810055866, + "grad_norm": 0.8960727453231812, + "learning_rate": 0.0008747899159663866, + "loss": 0.6047, + "step": 4576 + }, + { + "epoch": 2.5569832402234636, + "grad_norm": 0.6102758049964905, + "learning_rate": 0.0008747619047619048, + "loss": 0.6259, + "step": 4577 + }, + { + "epoch": 2.5575418994413406, + "grad_norm": 0.5716445446014404, + "learning_rate": 0.000874733893557423, + "loss": 0.5124, + "step": 4578 + }, + { + "epoch": 2.558100558659218, + "grad_norm": 0.5660879015922546, + "learning_rate": 0.0008747058823529413, + "loss": 0.4046, + "step": 4579 + }, + { + "epoch": 2.558659217877095, + "grad_norm": 1.2680186033248901, + "learning_rate": 0.0008746778711484594, + "loss": 0.4978, + "step": 4580 + }, + { + "epoch": 2.559217877094972, + "grad_norm": 0.884227454662323, + "learning_rate": 0.0008746498599439776, + "loss": 0.3966, + "step": 4581 + }, + { + "epoch": 2.559776536312849, + "grad_norm": 0.6963787078857422, + "learning_rate": 0.0008746218487394958, + "loss": 0.6064, + "step": 4582 + }, + { + "epoch": 2.560335195530726, + "grad_norm": 0.42113932967185974, + "learning_rate": 0.000874593837535014, + "loss": 0.4798, + "step": 4583 + }, + { + "epoch": 2.5608938547486035, + "grad_norm": 4.617695331573486, + "learning_rate": 0.0008745658263305323, + "loss": 0.4827, + "step": 4584 + }, + { + "epoch": 2.5614525139664805, + "grad_norm": 0.632781982421875, + "learning_rate": 0.0008745378151260504, + "loss": 0.452, + "step": 4585 + }, + { + "epoch": 2.5620111731843576, + "grad_norm": 0.9550939798355103, + "learning_rate": 0.0008745098039215686, + "loss": 0.6119, + "step": 4586 + }, + { + "epoch": 2.5625698324022346, + "grad_norm": 0.5491369366645813, + "learning_rate": 0.0008744817927170868, + "loss": 0.495, + "step": 4587 + }, + { + "epoch": 2.5631284916201116, + "grad_norm": 0.6712592244148254, + "learning_rate": 0.000874453781512605, + "loss": 0.4184, + "step": 4588 + }, + { + "epoch": 2.563687150837989, + "grad_norm": 0.7854418158531189, + "learning_rate": 0.0008744257703081233, + "loss": 0.47, + "step": 4589 + }, + { + "epoch": 2.564245810055866, + "grad_norm": 0.9034537076950073, + "learning_rate": 0.0008743977591036414, + "loss": 0.6141, + "step": 4590 + }, + { + "epoch": 2.564804469273743, + "grad_norm": 0.8327544927597046, + "learning_rate": 0.0008743697478991597, + "loss": 0.343, + "step": 4591 + }, + { + "epoch": 2.56536312849162, + "grad_norm": 0.4472448229789734, + "learning_rate": 0.0008743417366946779, + "loss": 0.404, + "step": 4592 + }, + { + "epoch": 2.565921787709497, + "grad_norm": 3.9351420402526855, + "learning_rate": 0.0008743137254901961, + "loss": 0.5229, + "step": 4593 + }, + { + "epoch": 2.5664804469273745, + "grad_norm": 0.6701863408088684, + "learning_rate": 0.0008742857142857144, + "loss": 0.3762, + "step": 4594 + }, + { + "epoch": 2.5670391061452515, + "grad_norm": 0.4767731726169586, + "learning_rate": 0.0008742577030812326, + "loss": 0.4533, + "step": 4595 + }, + { + "epoch": 2.5675977653631286, + "grad_norm": 0.5488147139549255, + "learning_rate": 0.0008742296918767507, + "loss": 0.3994, + "step": 4596 + }, + { + "epoch": 2.5681564245810056, + "grad_norm": 0.4771422743797302, + "learning_rate": 0.0008742016806722689, + "loss": 0.4987, + "step": 4597 + }, + { + "epoch": 2.5687150837988826, + "grad_norm": 0.544771134853363, + "learning_rate": 0.0008741736694677871, + "loss": 0.4167, + "step": 4598 + }, + { + "epoch": 2.56927374301676, + "grad_norm": 1.0047450065612793, + "learning_rate": 0.0008741456582633054, + "loss": 0.5022, + "step": 4599 + }, + { + "epoch": 2.5698324022346366, + "grad_norm": 0.8253680467605591, + "learning_rate": 0.0008741176470588236, + "loss": 0.5205, + "step": 4600 + }, + { + "epoch": 2.570391061452514, + "grad_norm": 0.9420467615127563, + "learning_rate": 0.0008740896358543417, + "loss": 0.524, + "step": 4601 + }, + { + "epoch": 2.570949720670391, + "grad_norm": 1.6282472610473633, + "learning_rate": 0.0008740616246498599, + "loss": 0.4728, + "step": 4602 + }, + { + "epoch": 2.571508379888268, + "grad_norm": 0.7011590003967285, + "learning_rate": 0.0008740336134453781, + "loss": 0.6335, + "step": 4603 + }, + { + "epoch": 2.572067039106145, + "grad_norm": 0.6995001435279846, + "learning_rate": 0.0008740056022408964, + "loss": 0.5563, + "step": 4604 + }, + { + "epoch": 2.572625698324022, + "grad_norm": 0.6163257360458374, + "learning_rate": 0.0008739775910364146, + "loss": 0.4066, + "step": 4605 + }, + { + "epoch": 2.5731843575418996, + "grad_norm": 0.3490305542945862, + "learning_rate": 0.0008739495798319327, + "loss": 0.3272, + "step": 4606 + }, + { + "epoch": 2.5737430167597766, + "grad_norm": 0.5445644855499268, + "learning_rate": 0.000873921568627451, + "loss": 0.383, + "step": 4607 + }, + { + "epoch": 2.5743016759776536, + "grad_norm": 0.47447124123573303, + "learning_rate": 0.0008738935574229692, + "loss": 0.4254, + "step": 4608 + }, + { + "epoch": 2.5748603351955306, + "grad_norm": 1.102980375289917, + "learning_rate": 0.0008738655462184875, + "loss": 0.5693, + "step": 4609 + }, + { + "epoch": 2.5754189944134076, + "grad_norm": 0.5574500560760498, + "learning_rate": 0.0008738375350140057, + "loss": 0.4323, + "step": 4610 + }, + { + "epoch": 2.575977653631285, + "grad_norm": 0.7321161031723022, + "learning_rate": 0.0008738095238095239, + "loss": 0.5591, + "step": 4611 + }, + { + "epoch": 2.576536312849162, + "grad_norm": 0.6664870381355286, + "learning_rate": 0.000873781512605042, + "loss": 0.5209, + "step": 4612 + }, + { + "epoch": 2.577094972067039, + "grad_norm": 1.344436764717102, + "learning_rate": 0.0008737535014005602, + "loss": 0.4379, + "step": 4613 + }, + { + "epoch": 2.577653631284916, + "grad_norm": 0.8389381170272827, + "learning_rate": 0.0008737254901960785, + "loss": 0.4701, + "step": 4614 + }, + { + "epoch": 2.578212290502793, + "grad_norm": 0.7012710571289062, + "learning_rate": 0.0008736974789915967, + "loss": 0.4322, + "step": 4615 + }, + { + "epoch": 2.5787709497206706, + "grad_norm": 0.5328612327575684, + "learning_rate": 0.0008736694677871149, + "loss": 0.4715, + "step": 4616 + }, + { + "epoch": 2.5793296089385476, + "grad_norm": 0.5420647859573364, + "learning_rate": 0.000873641456582633, + "loss": 0.4558, + "step": 4617 + }, + { + "epoch": 2.5798882681564246, + "grad_norm": 0.5130098462104797, + "learning_rate": 0.0008736134453781512, + "loss": 0.5482, + "step": 4618 + }, + { + "epoch": 2.5804469273743016, + "grad_norm": 0.5072817206382751, + "learning_rate": 0.0008735854341736695, + "loss": 0.3707, + "step": 4619 + }, + { + "epoch": 2.5810055865921786, + "grad_norm": 1.2881916761398315, + "learning_rate": 0.0008735574229691877, + "loss": 0.4751, + "step": 4620 + }, + { + "epoch": 2.581564245810056, + "grad_norm": 0.6830811500549316, + "learning_rate": 0.0008735294117647059, + "loss": 0.5533, + "step": 4621 + }, + { + "epoch": 2.582122905027933, + "grad_norm": 0.6909314393997192, + "learning_rate": 0.000873501400560224, + "loss": 0.4638, + "step": 4622 + }, + { + "epoch": 2.58268156424581, + "grad_norm": 0.6132743954658508, + "learning_rate": 0.0008734733893557422, + "loss": 0.5715, + "step": 4623 + }, + { + "epoch": 2.583240223463687, + "grad_norm": 0.9117586612701416, + "learning_rate": 0.0008734453781512606, + "loss": 0.535, + "step": 4624 + }, + { + "epoch": 2.583798882681564, + "grad_norm": 0.6704279184341431, + "learning_rate": 0.0008734173669467788, + "loss": 0.4819, + "step": 4625 + }, + { + "epoch": 2.5843575418994416, + "grad_norm": 0.7600635886192322, + "learning_rate": 0.000873389355742297, + "loss": 0.5432, + "step": 4626 + }, + { + "epoch": 2.5849162011173186, + "grad_norm": 0.6423128843307495, + "learning_rate": 0.0008733613445378152, + "loss": 0.5347, + "step": 4627 + }, + { + "epoch": 2.5854748603351956, + "grad_norm": 0.9727770686149597, + "learning_rate": 0.0008733333333333333, + "loss": 0.5764, + "step": 4628 + }, + { + "epoch": 2.5860335195530726, + "grad_norm": 0.5933406352996826, + "learning_rate": 0.0008733053221288516, + "loss": 0.4209, + "step": 4629 + }, + { + "epoch": 2.5865921787709496, + "grad_norm": 0.510667085647583, + "learning_rate": 0.0008732773109243698, + "loss": 0.5379, + "step": 4630 + }, + { + "epoch": 2.587150837988827, + "grad_norm": 0.6274928450584412, + "learning_rate": 0.000873249299719888, + "loss": 0.5807, + "step": 4631 + }, + { + "epoch": 2.587709497206704, + "grad_norm": 0.5118690133094788, + "learning_rate": 0.0008732212885154062, + "loss": 0.5204, + "step": 4632 + }, + { + "epoch": 2.588268156424581, + "grad_norm": 0.5862618088722229, + "learning_rate": 0.0008731932773109243, + "loss": 0.4826, + "step": 4633 + }, + { + "epoch": 2.588826815642458, + "grad_norm": 0.6152790188789368, + "learning_rate": 0.0008731652661064426, + "loss": 0.4975, + "step": 4634 + }, + { + "epoch": 2.589385474860335, + "grad_norm": 0.5919183492660522, + "learning_rate": 0.0008731372549019608, + "loss": 0.5434, + "step": 4635 + }, + { + "epoch": 2.5899441340782126, + "grad_norm": 0.5699243545532227, + "learning_rate": 0.000873109243697479, + "loss": 0.4746, + "step": 4636 + }, + { + "epoch": 2.590502793296089, + "grad_norm": 0.6677420139312744, + "learning_rate": 0.0008730812324929972, + "loss": 0.5273, + "step": 4637 + }, + { + "epoch": 2.5910614525139666, + "grad_norm": 0.6572413444519043, + "learning_rate": 0.0008730532212885153, + "loss": 0.5044, + "step": 4638 + }, + { + "epoch": 2.5916201117318436, + "grad_norm": 0.6275217533111572, + "learning_rate": 0.0008730252100840336, + "loss": 0.4657, + "step": 4639 + }, + { + "epoch": 2.5921787709497206, + "grad_norm": 0.9364564418792725, + "learning_rate": 0.0008729971988795519, + "loss": 0.4398, + "step": 4640 + }, + { + "epoch": 2.5927374301675976, + "grad_norm": 0.7408837080001831, + "learning_rate": 0.0008729691876750701, + "loss": 0.418, + "step": 4641 + }, + { + "epoch": 2.5932960893854746, + "grad_norm": 0.6557133793830872, + "learning_rate": 0.0008729411764705883, + "loss": 0.4434, + "step": 4642 + }, + { + "epoch": 2.593854748603352, + "grad_norm": 0.5375706553459167, + "learning_rate": 0.0008729131652661065, + "loss": 0.5618, + "step": 4643 + }, + { + "epoch": 2.594413407821229, + "grad_norm": 0.4827219247817993, + "learning_rate": 0.0008728851540616247, + "loss": 0.5042, + "step": 4644 + }, + { + "epoch": 2.594972067039106, + "grad_norm": 0.5630202293395996, + "learning_rate": 0.0008728571428571429, + "loss": 0.4861, + "step": 4645 + }, + { + "epoch": 2.595530726256983, + "grad_norm": 0.8943331837654114, + "learning_rate": 0.0008728291316526611, + "loss": 0.514, + "step": 4646 + }, + { + "epoch": 2.59608938547486, + "grad_norm": 1.135999083518982, + "learning_rate": 0.0008728011204481793, + "loss": 0.5324, + "step": 4647 + }, + { + "epoch": 2.5966480446927376, + "grad_norm": 0.597041666507721, + "learning_rate": 0.0008727731092436975, + "loss": 0.4565, + "step": 4648 + }, + { + "epoch": 2.5972067039106146, + "grad_norm": 0.3813895881175995, + "learning_rate": 0.0008727450980392157, + "loss": 0.3213, + "step": 4649 + }, + { + "epoch": 2.5977653631284916, + "grad_norm": 1.0273348093032837, + "learning_rate": 0.0008727170868347339, + "loss": 0.5125, + "step": 4650 + }, + { + "epoch": 2.5983240223463686, + "grad_norm": 0.42914119362831116, + "learning_rate": 0.0008726890756302521, + "loss": 0.4083, + "step": 4651 + }, + { + "epoch": 2.5988826815642456, + "grad_norm": 1.0949652194976807, + "learning_rate": 0.0008726610644257703, + "loss": 0.5836, + "step": 4652 + }, + { + "epoch": 2.599441340782123, + "grad_norm": 2.259638547897339, + "learning_rate": 0.0008726330532212885, + "loss": 0.5527, + "step": 4653 + }, + { + "epoch": 2.6, + "grad_norm": 2.4475438594818115, + "learning_rate": 0.0008726050420168068, + "loss": 0.5693, + "step": 4654 + }, + { + "epoch": 2.600558659217877, + "grad_norm": 0.6083667874336243, + "learning_rate": 0.0008725770308123249, + "loss": 0.4522, + "step": 4655 + }, + { + "epoch": 2.601117318435754, + "grad_norm": 6.451206684112549, + "learning_rate": 0.0008725490196078432, + "loss": 0.5011, + "step": 4656 + }, + { + "epoch": 2.601675977653631, + "grad_norm": 4.511854648590088, + "learning_rate": 0.0008725210084033614, + "loss": 0.4763, + "step": 4657 + }, + { + "epoch": 2.6022346368715086, + "grad_norm": 3.012691020965576, + "learning_rate": 0.0008724929971988796, + "loss": 0.61, + "step": 4658 + }, + { + "epoch": 2.6027932960893856, + "grad_norm": 0.4272519648075104, + "learning_rate": 0.0008724649859943979, + "loss": 0.4468, + "step": 4659 + }, + { + "epoch": 2.6033519553072626, + "grad_norm": 0.5548369884490967, + "learning_rate": 0.000872436974789916, + "loss": 0.4977, + "step": 4660 + }, + { + "epoch": 2.6039106145251396, + "grad_norm": 0.4272269606590271, + "learning_rate": 0.0008724089635854342, + "loss": 0.3786, + "step": 4661 + }, + { + "epoch": 2.6044692737430166, + "grad_norm": 0.6776177287101746, + "learning_rate": 0.0008723809523809524, + "loss": 0.6815, + "step": 4662 + }, + { + "epoch": 2.605027932960894, + "grad_norm": 1.3994733095169067, + "learning_rate": 0.0008723529411764706, + "loss": 0.4271, + "step": 4663 + }, + { + "epoch": 2.605586592178771, + "grad_norm": 0.7243050932884216, + "learning_rate": 0.0008723249299719889, + "loss": 0.4679, + "step": 4664 + }, + { + "epoch": 2.606145251396648, + "grad_norm": 0.6003062129020691, + "learning_rate": 0.000872296918767507, + "loss": 0.4951, + "step": 4665 + }, + { + "epoch": 2.606703910614525, + "grad_norm": 1.933485746383667, + "learning_rate": 0.0008722689075630252, + "loss": 0.4334, + "step": 4666 + }, + { + "epoch": 2.607262569832402, + "grad_norm": 0.694419801235199, + "learning_rate": 0.0008722408963585434, + "loss": 0.6094, + "step": 4667 + }, + { + "epoch": 2.6078212290502796, + "grad_norm": 1.0899416208267212, + "learning_rate": 0.0008722128851540616, + "loss": 0.6327, + "step": 4668 + }, + { + "epoch": 2.6083798882681566, + "grad_norm": 0.6413179039955139, + "learning_rate": 0.0008721848739495799, + "loss": 0.4992, + "step": 4669 + }, + { + "epoch": 2.6089385474860336, + "grad_norm": 0.5404636263847351, + "learning_rate": 0.0008721568627450981, + "loss": 0.5117, + "step": 4670 + }, + { + "epoch": 2.6094972067039106, + "grad_norm": 0.6101294159889221, + "learning_rate": 0.0008721288515406162, + "loss": 0.4767, + "step": 4671 + }, + { + "epoch": 2.6100558659217876, + "grad_norm": 1.3087968826293945, + "learning_rate": 0.0008721008403361344, + "loss": 0.5354, + "step": 4672 + }, + { + "epoch": 2.610614525139665, + "grad_norm": 0.7199153304100037, + "learning_rate": 0.0008720728291316527, + "loss": 0.4506, + "step": 4673 + }, + { + "epoch": 2.6111731843575416, + "grad_norm": 0.6021122336387634, + "learning_rate": 0.000872044817927171, + "loss": 0.4336, + "step": 4674 + }, + { + "epoch": 2.611731843575419, + "grad_norm": 0.6390370726585388, + "learning_rate": 0.0008720168067226892, + "loss": 0.4852, + "step": 4675 + }, + { + "epoch": 2.612290502793296, + "grad_norm": 0.8552182912826538, + "learning_rate": 0.0008719887955182073, + "loss": 0.5133, + "step": 4676 + }, + { + "epoch": 2.612849162011173, + "grad_norm": 0.5973447561264038, + "learning_rate": 0.0008719607843137255, + "loss": 0.5347, + "step": 4677 + }, + { + "epoch": 2.61340782122905, + "grad_norm": 0.6187921166419983, + "learning_rate": 0.0008719327731092437, + "loss": 0.4583, + "step": 4678 + }, + { + "epoch": 2.613966480446927, + "grad_norm": 0.8108627796173096, + "learning_rate": 0.000871904761904762, + "loss": 0.5222, + "step": 4679 + }, + { + "epoch": 2.6145251396648046, + "grad_norm": 0.6222319602966309, + "learning_rate": 0.0008718767507002802, + "loss": 0.5005, + "step": 4680 + }, + { + "epoch": 2.6150837988826816, + "grad_norm": 2.097660779953003, + "learning_rate": 0.0008718487394957983, + "loss": 0.4469, + "step": 4681 + }, + { + "epoch": 2.6156424581005586, + "grad_norm": 0.6373857259750366, + "learning_rate": 0.0008718207282913165, + "loss": 0.4338, + "step": 4682 + }, + { + "epoch": 2.6162011173184356, + "grad_norm": 0.5153201818466187, + "learning_rate": 0.0008717927170868347, + "loss": 0.484, + "step": 4683 + }, + { + "epoch": 2.6167597765363126, + "grad_norm": 7.995108127593994, + "learning_rate": 0.0008717647058823529, + "loss": 0.4267, + "step": 4684 + }, + { + "epoch": 2.61731843575419, + "grad_norm": 0.46427232027053833, + "learning_rate": 0.0008717366946778712, + "loss": 0.4209, + "step": 4685 + }, + { + "epoch": 2.617877094972067, + "grad_norm": 0.644861102104187, + "learning_rate": 0.0008717086834733894, + "loss": 0.5519, + "step": 4686 + }, + { + "epoch": 2.618435754189944, + "grad_norm": 0.46076497435569763, + "learning_rate": 0.0008716806722689075, + "loss": 0.4472, + "step": 4687 + }, + { + "epoch": 2.618994413407821, + "grad_norm": 0.5737087726593018, + "learning_rate": 0.0008716526610644257, + "loss": 0.5042, + "step": 4688 + }, + { + "epoch": 2.619553072625698, + "grad_norm": 1.2339030504226685, + "learning_rate": 0.000871624649859944, + "loss": 0.6456, + "step": 4689 + }, + { + "epoch": 2.6201117318435756, + "grad_norm": 0.5793154239654541, + "learning_rate": 0.0008715966386554623, + "loss": 0.5078, + "step": 4690 + }, + { + "epoch": 2.6206703910614526, + "grad_norm": 0.4105638265609741, + "learning_rate": 0.0008715686274509805, + "loss": 0.4385, + "step": 4691 + }, + { + "epoch": 2.6212290502793296, + "grad_norm": 0.5621259808540344, + "learning_rate": 0.0008715406162464986, + "loss": 0.5668, + "step": 4692 + }, + { + "epoch": 2.6217877094972066, + "grad_norm": 0.5614182353019714, + "learning_rate": 0.0008715126050420168, + "loss": 0.4502, + "step": 4693 + }, + { + "epoch": 2.6223463687150836, + "grad_norm": 0.38099655508995056, + "learning_rate": 0.000871484593837535, + "loss": 0.4513, + "step": 4694 + }, + { + "epoch": 2.622905027932961, + "grad_norm": 1.6032222509384155, + "learning_rate": 0.0008714565826330533, + "loss": 0.4963, + "step": 4695 + }, + { + "epoch": 2.623463687150838, + "grad_norm": 0.42527201771736145, + "learning_rate": 0.0008714285714285715, + "loss": 0.4399, + "step": 4696 + }, + { + "epoch": 2.624022346368715, + "grad_norm": 0.7446414232254028, + "learning_rate": 0.0008714005602240896, + "loss": 0.8152, + "step": 4697 + }, + { + "epoch": 2.624581005586592, + "grad_norm": 0.764163613319397, + "learning_rate": 0.0008713725490196078, + "loss": 0.4505, + "step": 4698 + }, + { + "epoch": 2.625139664804469, + "grad_norm": 0.4340057075023651, + "learning_rate": 0.000871344537815126, + "loss": 0.488, + "step": 4699 + }, + { + "epoch": 2.6256983240223466, + "grad_norm": 1.5946266651153564, + "learning_rate": 0.0008713165266106443, + "loss": 0.5975, + "step": 4700 + }, + { + "epoch": 2.6262569832402236, + "grad_norm": 0.6357771754264832, + "learning_rate": 0.0008712885154061625, + "loss": 0.5358, + "step": 4701 + }, + { + "epoch": 2.6268156424581006, + "grad_norm": 0.6142751574516296, + "learning_rate": 0.0008712605042016807, + "loss": 0.5015, + "step": 4702 + }, + { + "epoch": 2.6273743016759776, + "grad_norm": 0.6175709366798401, + "learning_rate": 0.0008712324929971988, + "loss": 0.5146, + "step": 4703 + }, + { + "epoch": 2.6279329608938546, + "grad_norm": 0.6648790836334229, + "learning_rate": 0.000871204481792717, + "loss": 0.5756, + "step": 4704 + }, + { + "epoch": 2.628491620111732, + "grad_norm": 0.46663302183151245, + "learning_rate": 0.0008711764705882354, + "loss": 0.5286, + "step": 4705 + }, + { + "epoch": 2.6290502793296087, + "grad_norm": 0.7095016837120056, + "learning_rate": 0.0008711484593837536, + "loss": 0.4608, + "step": 4706 + }, + { + "epoch": 2.629608938547486, + "grad_norm": 0.7443677186965942, + "learning_rate": 0.0008711204481792718, + "loss": 0.4947, + "step": 4707 + }, + { + "epoch": 2.630167597765363, + "grad_norm": 0.8868491053581238, + "learning_rate": 0.0008710924369747899, + "loss": 0.3831, + "step": 4708 + }, + { + "epoch": 2.63072625698324, + "grad_norm": 0.4886857271194458, + "learning_rate": 0.0008710644257703081, + "loss": 0.6954, + "step": 4709 + }, + { + "epoch": 2.631284916201117, + "grad_norm": 0.5084818601608276, + "learning_rate": 0.0008710364145658264, + "loss": 0.583, + "step": 4710 + }, + { + "epoch": 2.631843575418994, + "grad_norm": 0.499833881855011, + "learning_rate": 0.0008710084033613446, + "loss": 0.4395, + "step": 4711 + }, + { + "epoch": 2.6324022346368716, + "grad_norm": 0.697119414806366, + "learning_rate": 0.0008709803921568628, + "loss": 0.4771, + "step": 4712 + }, + { + "epoch": 2.6329608938547486, + "grad_norm": 0.8509107232093811, + "learning_rate": 0.0008709523809523809, + "loss": 0.546, + "step": 4713 + }, + { + "epoch": 2.6335195530726256, + "grad_norm": 0.7323441505432129, + "learning_rate": 0.0008709243697478991, + "loss": 0.4639, + "step": 4714 + }, + { + "epoch": 2.6340782122905027, + "grad_norm": 0.41734227538108826, + "learning_rate": 0.0008708963585434174, + "loss": 0.4869, + "step": 4715 + }, + { + "epoch": 2.6346368715083797, + "grad_norm": 0.5006591081619263, + "learning_rate": 0.0008708683473389356, + "loss": 0.461, + "step": 4716 + }, + { + "epoch": 2.635195530726257, + "grad_norm": 1.191251277923584, + "learning_rate": 0.0008708403361344538, + "loss": 0.5559, + "step": 4717 + }, + { + "epoch": 2.635754189944134, + "grad_norm": 0.7904143333435059, + "learning_rate": 0.000870812324929972, + "loss": 0.5127, + "step": 4718 + }, + { + "epoch": 2.636312849162011, + "grad_norm": 0.9613448977470398, + "learning_rate": 0.0008707843137254901, + "loss": 0.4754, + "step": 4719 + }, + { + "epoch": 2.636871508379888, + "grad_norm": 0.43707001209259033, + "learning_rate": 0.0008707563025210084, + "loss": 0.4735, + "step": 4720 + }, + { + "epoch": 2.637430167597765, + "grad_norm": 0.7447942495346069, + "learning_rate": 0.0008707282913165266, + "loss": 0.4612, + "step": 4721 + }, + { + "epoch": 2.6379888268156426, + "grad_norm": 0.7653697729110718, + "learning_rate": 0.0008707002801120449, + "loss": 0.4265, + "step": 4722 + }, + { + "epoch": 2.6385474860335196, + "grad_norm": 0.9355491995811462, + "learning_rate": 0.0008706722689075631, + "loss": 0.6343, + "step": 4723 + }, + { + "epoch": 2.6391061452513966, + "grad_norm": 0.6037322878837585, + "learning_rate": 0.0008706442577030812, + "loss": 0.3857, + "step": 4724 + }, + { + "epoch": 2.6396648044692737, + "grad_norm": 0.5230147242546082, + "learning_rate": 0.0008706162464985995, + "loss": 0.526, + "step": 4725 + }, + { + "epoch": 2.6402234636871507, + "grad_norm": 0.9687732458114624, + "learning_rate": 0.0008705882352941177, + "loss": 0.5247, + "step": 4726 + }, + { + "epoch": 2.640782122905028, + "grad_norm": 0.6201425194740295, + "learning_rate": 0.0008705602240896359, + "loss": 0.513, + "step": 4727 + }, + { + "epoch": 2.641340782122905, + "grad_norm": 0.6437943577766418, + "learning_rate": 0.0008705322128851541, + "loss": 0.4176, + "step": 4728 + }, + { + "epoch": 2.641899441340782, + "grad_norm": 0.44939082860946655, + "learning_rate": 0.0008705042016806722, + "loss": 0.4928, + "step": 4729 + }, + { + "epoch": 2.642458100558659, + "grad_norm": 0.38824886083602905, + "learning_rate": 0.0008704761904761905, + "loss": 0.402, + "step": 4730 + }, + { + "epoch": 2.643016759776536, + "grad_norm": 0.5163977742195129, + "learning_rate": 0.0008704481792717087, + "loss": 0.3362, + "step": 4731 + }, + { + "epoch": 2.6435754189944136, + "grad_norm": 0.7596457600593567, + "learning_rate": 0.0008704201680672269, + "loss": 0.4694, + "step": 4732 + }, + { + "epoch": 2.6441340782122906, + "grad_norm": 0.39082902669906616, + "learning_rate": 0.0008703921568627451, + "loss": 0.428, + "step": 4733 + }, + { + "epoch": 2.6446927374301676, + "grad_norm": 0.7579891681671143, + "learning_rate": 0.0008703641456582633, + "loss": 0.4739, + "step": 4734 + }, + { + "epoch": 2.6452513966480447, + "grad_norm": 0.5568997859954834, + "learning_rate": 0.0008703361344537815, + "loss": 0.496, + "step": 4735 + }, + { + "epoch": 2.6458100558659217, + "grad_norm": 0.5152088403701782, + "learning_rate": 0.0008703081232492997, + "loss": 0.4694, + "step": 4736 + }, + { + "epoch": 2.646368715083799, + "grad_norm": 0.4541816711425781, + "learning_rate": 0.0008702801120448179, + "loss": 0.5664, + "step": 4737 + }, + { + "epoch": 2.646927374301676, + "grad_norm": 0.7481280565261841, + "learning_rate": 0.0008702521008403362, + "loss": 0.4173, + "step": 4738 + }, + { + "epoch": 2.647486033519553, + "grad_norm": 0.681196391582489, + "learning_rate": 0.0008702240896358544, + "loss": 0.441, + "step": 4739 + }, + { + "epoch": 2.64804469273743, + "grad_norm": 0.5350286960601807, + "learning_rate": 0.0008701960784313726, + "loss": 0.5377, + "step": 4740 + }, + { + "epoch": 2.648603351955307, + "grad_norm": 0.5217966437339783, + "learning_rate": 0.0008701680672268908, + "loss": 0.4624, + "step": 4741 + }, + { + "epoch": 2.6491620111731846, + "grad_norm": 1.993146300315857, + "learning_rate": 0.000870140056022409, + "loss": 0.4675, + "step": 4742 + }, + { + "epoch": 2.649720670391061, + "grad_norm": 0.554341197013855, + "learning_rate": 0.0008701120448179272, + "loss": 0.393, + "step": 4743 + }, + { + "epoch": 2.6502793296089386, + "grad_norm": 0.6307961344718933, + "learning_rate": 0.0008700840336134454, + "loss": 0.5158, + "step": 4744 + }, + { + "epoch": 2.6508379888268156, + "grad_norm": 0.4391495883464813, + "learning_rate": 0.0008700560224089636, + "loss": 0.3746, + "step": 4745 + }, + { + "epoch": 2.6513966480446927, + "grad_norm": 3.2256343364715576, + "learning_rate": 0.0008700280112044818, + "loss": 0.4607, + "step": 4746 + }, + { + "epoch": 2.6519553072625697, + "grad_norm": 0.5182428956031799, + "learning_rate": 0.00087, + "loss": 0.3851, + "step": 4747 + }, + { + "epoch": 2.6525139664804467, + "grad_norm": 4.928438186645508, + "learning_rate": 0.0008699719887955182, + "loss": 0.6138, + "step": 4748 + }, + { + "epoch": 2.653072625698324, + "grad_norm": 0.9736620187759399, + "learning_rate": 0.0008699439775910364, + "loss": 0.4929, + "step": 4749 + }, + { + "epoch": 2.653631284916201, + "grad_norm": 0.4683849811553955, + "learning_rate": 0.0008699159663865547, + "loss": 0.432, + "step": 4750 + }, + { + "epoch": 2.654189944134078, + "grad_norm": 0.45677450299263, + "learning_rate": 0.0008698879551820728, + "loss": 0.398, + "step": 4751 + }, + { + "epoch": 2.654748603351955, + "grad_norm": 0.5331783294677734, + "learning_rate": 0.000869859943977591, + "loss": 0.4654, + "step": 4752 + }, + { + "epoch": 2.655307262569832, + "grad_norm": 5.260037899017334, + "learning_rate": 0.0008698319327731092, + "loss": 0.5979, + "step": 4753 + }, + { + "epoch": 2.6558659217877096, + "grad_norm": 0.48784613609313965, + "learning_rate": 0.0008698039215686274, + "loss": 0.4364, + "step": 4754 + }, + { + "epoch": 2.6564245810055866, + "grad_norm": 0.47899293899536133, + "learning_rate": 0.0008697759103641458, + "loss": 0.3693, + "step": 4755 + }, + { + "epoch": 2.6569832402234637, + "grad_norm": 1.0056439638137817, + "learning_rate": 0.0008697478991596639, + "loss": 0.4495, + "step": 4756 + }, + { + "epoch": 2.6575418994413407, + "grad_norm": 0.5287572145462036, + "learning_rate": 0.0008697198879551821, + "loss": 0.4169, + "step": 4757 + }, + { + "epoch": 2.6581005586592177, + "grad_norm": 0.7584150433540344, + "learning_rate": 0.0008696918767507003, + "loss": 0.4563, + "step": 4758 + }, + { + "epoch": 2.658659217877095, + "grad_norm": 0.4861416816711426, + "learning_rate": 0.0008696638655462185, + "loss": 0.3965, + "step": 4759 + }, + { + "epoch": 2.659217877094972, + "grad_norm": 0.5171219706535339, + "learning_rate": 0.0008696358543417368, + "loss": 0.3695, + "step": 4760 + }, + { + "epoch": 2.659776536312849, + "grad_norm": 0.4791187047958374, + "learning_rate": 0.0008696078431372549, + "loss": 0.401, + "step": 4761 + }, + { + "epoch": 2.660335195530726, + "grad_norm": 0.41869813203811646, + "learning_rate": 0.0008695798319327731, + "loss": 0.3108, + "step": 4762 + }, + { + "epoch": 2.660893854748603, + "grad_norm": 2.8987014293670654, + "learning_rate": 0.0008695518207282913, + "loss": 0.5095, + "step": 4763 + }, + { + "epoch": 2.6614525139664806, + "grad_norm": 0.5957987308502197, + "learning_rate": 0.0008695238095238095, + "loss": 0.501, + "step": 4764 + }, + { + "epoch": 2.6620111731843576, + "grad_norm": 4.71506929397583, + "learning_rate": 0.0008694957983193278, + "loss": 0.4998, + "step": 4765 + }, + { + "epoch": 2.6625698324022347, + "grad_norm": 0.5180720090866089, + "learning_rate": 0.000869467787114846, + "loss": 0.3085, + "step": 4766 + }, + { + "epoch": 2.6631284916201117, + "grad_norm": 0.47399231791496277, + "learning_rate": 0.0008694397759103641, + "loss": 0.4256, + "step": 4767 + }, + { + "epoch": 2.6636871508379887, + "grad_norm": 0.5193911194801331, + "learning_rate": 0.0008694117647058823, + "loss": 0.4369, + "step": 4768 + }, + { + "epoch": 2.664245810055866, + "grad_norm": 0.43193545937538147, + "learning_rate": 0.0008693837535014005, + "loss": 0.4277, + "step": 4769 + }, + { + "epoch": 2.664804469273743, + "grad_norm": 0.6616137623786926, + "learning_rate": 0.0008693557422969189, + "loss": 0.3911, + "step": 4770 + }, + { + "epoch": 2.66536312849162, + "grad_norm": 0.5150429010391235, + "learning_rate": 0.0008693277310924371, + "loss": 0.5552, + "step": 4771 + }, + { + "epoch": 2.665921787709497, + "grad_norm": 0.4323931932449341, + "learning_rate": 0.0008692997198879552, + "loss": 0.3487, + "step": 4772 + }, + { + "epoch": 2.666480446927374, + "grad_norm": 0.6784036159515381, + "learning_rate": 0.0008692717086834734, + "loss": 0.5034, + "step": 4773 + }, + { + "epoch": 2.6670391061452516, + "grad_norm": 0.5154772996902466, + "learning_rate": 0.0008692436974789916, + "loss": 0.4521, + "step": 4774 + }, + { + "epoch": 2.6675977653631286, + "grad_norm": 0.5224826335906982, + "learning_rate": 0.0008692156862745099, + "loss": 0.4242, + "step": 4775 + }, + { + "epoch": 2.6681564245810057, + "grad_norm": 0.631027102470398, + "learning_rate": 0.0008691876750700281, + "loss": 0.4868, + "step": 4776 + }, + { + "epoch": 2.6687150837988827, + "grad_norm": 0.5402419567108154, + "learning_rate": 0.0008691596638655462, + "loss": 0.4344, + "step": 4777 + }, + { + "epoch": 2.6692737430167597, + "grad_norm": 0.5768053531646729, + "learning_rate": 0.0008691316526610644, + "loss": 0.401, + "step": 4778 + }, + { + "epoch": 2.669832402234637, + "grad_norm": 0.7169563174247742, + "learning_rate": 0.0008691036414565826, + "loss": 0.4646, + "step": 4779 + }, + { + "epoch": 2.6703910614525137, + "grad_norm": 2.339073657989502, + "learning_rate": 0.0008690756302521009, + "loss": 0.4197, + "step": 4780 + }, + { + "epoch": 2.670949720670391, + "grad_norm": 0.488383948802948, + "learning_rate": 0.0008690476190476191, + "loss": 0.5487, + "step": 4781 + }, + { + "epoch": 2.671508379888268, + "grad_norm": 0.7402324676513672, + "learning_rate": 0.0008690196078431373, + "loss": 0.5294, + "step": 4782 + }, + { + "epoch": 2.672067039106145, + "grad_norm": 0.5183477401733398, + "learning_rate": 0.0008689915966386554, + "loss": 0.4699, + "step": 4783 + }, + { + "epoch": 2.672625698324022, + "grad_norm": 0.5226608514785767, + "learning_rate": 0.0008689635854341736, + "loss": 0.4893, + "step": 4784 + }, + { + "epoch": 2.673184357541899, + "grad_norm": 0.5028848052024841, + "learning_rate": 0.0008689355742296919, + "loss": 0.4526, + "step": 4785 + }, + { + "epoch": 2.6737430167597767, + "grad_norm": 0.46172961592674255, + "learning_rate": 0.0008689075630252101, + "loss": 0.503, + "step": 4786 + }, + { + "epoch": 2.6743016759776537, + "grad_norm": 0.4455204904079437, + "learning_rate": 0.0008688795518207284, + "loss": 0.4155, + "step": 4787 + }, + { + "epoch": 2.6748603351955307, + "grad_norm": 1.6291117668151855, + "learning_rate": 0.0008688515406162465, + "loss": 0.4509, + "step": 4788 + }, + { + "epoch": 2.6754189944134077, + "grad_norm": 1.36480712890625, + "learning_rate": 0.0008688235294117647, + "loss": 0.4896, + "step": 4789 + }, + { + "epoch": 2.6759776536312847, + "grad_norm": 1.3482952117919922, + "learning_rate": 0.000868795518207283, + "loss": 0.4216, + "step": 4790 + }, + { + "epoch": 2.676536312849162, + "grad_norm": 0.5353065729141235, + "learning_rate": 0.0008687675070028012, + "loss": 0.5104, + "step": 4791 + }, + { + "epoch": 2.677094972067039, + "grad_norm": 0.4740195572376251, + "learning_rate": 0.0008687394957983194, + "loss": 0.5258, + "step": 4792 + }, + { + "epoch": 2.677653631284916, + "grad_norm": 0.4119703471660614, + "learning_rate": 0.0008687114845938375, + "loss": 0.4302, + "step": 4793 + }, + { + "epoch": 2.678212290502793, + "grad_norm": 0.7389618754386902, + "learning_rate": 0.0008686834733893557, + "loss": 0.5686, + "step": 4794 + }, + { + "epoch": 2.67877094972067, + "grad_norm": 0.49716615676879883, + "learning_rate": 0.000868655462184874, + "loss": 0.5795, + "step": 4795 + }, + { + "epoch": 2.6793296089385477, + "grad_norm": 0.6634438037872314, + "learning_rate": 0.0008686274509803922, + "loss": 0.4977, + "step": 4796 + }, + { + "epoch": 2.6798882681564247, + "grad_norm": 0.6217042803764343, + "learning_rate": 0.0008685994397759104, + "loss": 0.4127, + "step": 4797 + }, + { + "epoch": 2.6804469273743017, + "grad_norm": 0.45596328377723694, + "learning_rate": 0.0008685714285714286, + "loss": 0.3753, + "step": 4798 + }, + { + "epoch": 2.6810055865921787, + "grad_norm": 0.5892964601516724, + "learning_rate": 0.0008685434173669467, + "loss": 0.5285, + "step": 4799 + }, + { + "epoch": 2.6815642458100557, + "grad_norm": 0.5044070482254028, + "learning_rate": 0.000868515406162465, + "loss": 0.5043, + "step": 4800 + }, + { + "epoch": 2.682122905027933, + "grad_norm": 1.1690303087234497, + "learning_rate": 0.0008684873949579832, + "loss": 0.5025, + "step": 4801 + }, + { + "epoch": 2.68268156424581, + "grad_norm": 0.5131223201751709, + "learning_rate": 0.0008684593837535014, + "loss": 0.4858, + "step": 4802 + }, + { + "epoch": 2.683240223463687, + "grad_norm": 0.4487779140472412, + "learning_rate": 0.0008684313725490196, + "loss": 0.4132, + "step": 4803 + }, + { + "epoch": 2.683798882681564, + "grad_norm": 0.5181785821914673, + "learning_rate": 0.0008684033613445377, + "loss": 0.5784, + "step": 4804 + }, + { + "epoch": 2.684357541899441, + "grad_norm": 0.6847459077835083, + "learning_rate": 0.0008683753501400561, + "loss": 0.5493, + "step": 4805 + }, + { + "epoch": 2.6849162011173187, + "grad_norm": 0.7921627163887024, + "learning_rate": 0.0008683473389355743, + "loss": 0.4167, + "step": 4806 + }, + { + "epoch": 2.6854748603351957, + "grad_norm": 0.8112843632698059, + "learning_rate": 0.0008683193277310925, + "loss": 0.5319, + "step": 4807 + }, + { + "epoch": 2.6860335195530727, + "grad_norm": 0.7470324635505676, + "learning_rate": 0.0008682913165266107, + "loss": 0.4181, + "step": 4808 + }, + { + "epoch": 2.6865921787709497, + "grad_norm": 0.6501536965370178, + "learning_rate": 0.0008682633053221288, + "loss": 0.5372, + "step": 4809 + }, + { + "epoch": 2.6871508379888267, + "grad_norm": 0.46358639001846313, + "learning_rate": 0.0008682352941176471, + "loss": 0.527, + "step": 4810 + }, + { + "epoch": 2.687709497206704, + "grad_norm": 0.37311455607414246, + "learning_rate": 0.0008682072829131653, + "loss": 0.4203, + "step": 4811 + }, + { + "epoch": 2.688268156424581, + "grad_norm": 0.6030970811843872, + "learning_rate": 0.0008681792717086835, + "loss": 0.4161, + "step": 4812 + }, + { + "epoch": 2.688826815642458, + "grad_norm": 0.6002731323242188, + "learning_rate": 0.0008681512605042017, + "loss": 0.5618, + "step": 4813 + }, + { + "epoch": 2.689385474860335, + "grad_norm": 0.8159639835357666, + "learning_rate": 0.0008681232492997199, + "loss": 0.5235, + "step": 4814 + }, + { + "epoch": 2.689944134078212, + "grad_norm": 2.1633708477020264, + "learning_rate": 0.0008680952380952381, + "loss": 0.5416, + "step": 4815 + }, + { + "epoch": 2.6905027932960897, + "grad_norm": 0.7509324550628662, + "learning_rate": 0.0008680672268907563, + "loss": 0.5318, + "step": 4816 + }, + { + "epoch": 2.6910614525139662, + "grad_norm": 0.553111732006073, + "learning_rate": 0.0008680392156862745, + "loss": 0.4199, + "step": 4817 + }, + { + "epoch": 2.6916201117318437, + "grad_norm": 0.85640949010849, + "learning_rate": 0.0008680112044817927, + "loss": 0.5516, + "step": 4818 + }, + { + "epoch": 2.6921787709497207, + "grad_norm": 3.5856409072875977, + "learning_rate": 0.0008679831932773109, + "loss": 0.4912, + "step": 4819 + }, + { + "epoch": 2.6927374301675977, + "grad_norm": 0.6759235262870789, + "learning_rate": 0.0008679551820728292, + "loss": 0.4594, + "step": 4820 + }, + { + "epoch": 2.6932960893854747, + "grad_norm": 1.2774004936218262, + "learning_rate": 0.0008679271708683474, + "loss": 0.4193, + "step": 4821 + }, + { + "epoch": 2.6938547486033517, + "grad_norm": 0.9100181460380554, + "learning_rate": 0.0008678991596638656, + "loss": 0.5827, + "step": 4822 + }, + { + "epoch": 2.694413407821229, + "grad_norm": 1.0939244031906128, + "learning_rate": 0.0008678711484593838, + "loss": 0.5164, + "step": 4823 + }, + { + "epoch": 2.694972067039106, + "grad_norm": 1.1153905391693115, + "learning_rate": 0.000867843137254902, + "loss": 0.5033, + "step": 4824 + }, + { + "epoch": 2.695530726256983, + "grad_norm": 0.7720142602920532, + "learning_rate": 0.0008678151260504202, + "loss": 0.5977, + "step": 4825 + }, + { + "epoch": 2.69608938547486, + "grad_norm": 0.4999604821205139, + "learning_rate": 0.0008677871148459384, + "loss": 0.4439, + "step": 4826 + }, + { + "epoch": 2.6966480446927372, + "grad_norm": 0.6050472259521484, + "learning_rate": 0.0008677591036414566, + "loss": 0.523, + "step": 4827 + }, + { + "epoch": 2.6972067039106147, + "grad_norm": 0.9201844334602356, + "learning_rate": 0.0008677310924369748, + "loss": 0.424, + "step": 4828 + }, + { + "epoch": 2.6977653631284917, + "grad_norm": 0.39806562662124634, + "learning_rate": 0.000867703081232493, + "loss": 0.3526, + "step": 4829 + }, + { + "epoch": 2.6983240223463687, + "grad_norm": 0.6608548164367676, + "learning_rate": 0.0008676750700280113, + "loss": 0.4449, + "step": 4830 + }, + { + "epoch": 2.6988826815642457, + "grad_norm": 0.6510975956916809, + "learning_rate": 0.0008676470588235294, + "loss": 0.5388, + "step": 4831 + }, + { + "epoch": 2.6994413407821227, + "grad_norm": 5.686730861663818, + "learning_rate": 0.0008676190476190476, + "loss": 0.4578, + "step": 4832 + }, + { + "epoch": 2.7, + "grad_norm": 0.9750047326087952, + "learning_rate": 0.0008675910364145658, + "loss": 0.5236, + "step": 4833 + }, + { + "epoch": 2.700558659217877, + "grad_norm": 0.8430211544036865, + "learning_rate": 0.000867563025210084, + "loss": 0.6356, + "step": 4834 + }, + { + "epoch": 2.701117318435754, + "grad_norm": 0.6292615532875061, + "learning_rate": 0.0008675350140056023, + "loss": 0.5324, + "step": 4835 + }, + { + "epoch": 2.701675977653631, + "grad_norm": 0.6719739437103271, + "learning_rate": 0.0008675070028011204, + "loss": 0.4924, + "step": 4836 + }, + { + "epoch": 2.7022346368715082, + "grad_norm": 0.5120917558670044, + "learning_rate": 0.0008674789915966387, + "loss": 0.3827, + "step": 4837 + }, + { + "epoch": 2.7027932960893857, + "grad_norm": 1.0064018964767456, + "learning_rate": 0.0008674509803921569, + "loss": 0.5709, + "step": 4838 + }, + { + "epoch": 2.7033519553072627, + "grad_norm": 0.8618167042732239, + "learning_rate": 0.0008674229691876751, + "loss": 0.4471, + "step": 4839 + }, + { + "epoch": 2.7039106145251397, + "grad_norm": 1.8580939769744873, + "learning_rate": 0.0008673949579831934, + "loss": 0.4313, + "step": 4840 + }, + { + "epoch": 2.7044692737430167, + "grad_norm": 0.8321442008018494, + "learning_rate": 0.0008673669467787115, + "loss": 0.6412, + "step": 4841 + }, + { + "epoch": 2.7050279329608937, + "grad_norm": 0.6822600364685059, + "learning_rate": 0.0008673389355742297, + "loss": 0.5894, + "step": 4842 + }, + { + "epoch": 2.705586592178771, + "grad_norm": 0.6240845322608948, + "learning_rate": 0.0008673109243697479, + "loss": 0.5778, + "step": 4843 + }, + { + "epoch": 2.706145251396648, + "grad_norm": 0.9722593426704407, + "learning_rate": 0.0008672829131652661, + "loss": 0.3861, + "step": 4844 + }, + { + "epoch": 2.706703910614525, + "grad_norm": 0.8506166934967041, + "learning_rate": 0.0008672549019607844, + "loss": 0.515, + "step": 4845 + }, + { + "epoch": 2.707262569832402, + "grad_norm": 0.9309301376342773, + "learning_rate": 0.0008672268907563026, + "loss": 0.4268, + "step": 4846 + }, + { + "epoch": 2.707821229050279, + "grad_norm": 0.4745652973651886, + "learning_rate": 0.0008671988795518207, + "loss": 0.5309, + "step": 4847 + }, + { + "epoch": 2.7083798882681567, + "grad_norm": 0.9603132605552673, + "learning_rate": 0.0008671708683473389, + "loss": 0.5937, + "step": 4848 + }, + { + "epoch": 2.7089385474860332, + "grad_norm": 0.5685709714889526, + "learning_rate": 0.0008671428571428571, + "loss": 0.4115, + "step": 4849 + }, + { + "epoch": 2.7094972067039107, + "grad_norm": 0.6907859444618225, + "learning_rate": 0.0008671148459383754, + "loss": 0.3736, + "step": 4850 + }, + { + "epoch": 2.7100558659217877, + "grad_norm": 0.66511470079422, + "learning_rate": 0.0008670868347338936, + "loss": 0.4716, + "step": 4851 + }, + { + "epoch": 2.7106145251396647, + "grad_norm": 7.405402660369873, + "learning_rate": 0.0008670588235294117, + "loss": 0.4322, + "step": 4852 + }, + { + "epoch": 2.711173184357542, + "grad_norm": 0.5701492428779602, + "learning_rate": 0.00086703081232493, + "loss": 0.4989, + "step": 4853 + }, + { + "epoch": 2.7117318435754187, + "grad_norm": 0.4032461941242218, + "learning_rate": 0.0008670028011204482, + "loss": 0.4713, + "step": 4854 + }, + { + "epoch": 2.712290502793296, + "grad_norm": 0.590755820274353, + "learning_rate": 0.0008669747899159665, + "loss": 0.4635, + "step": 4855 + }, + { + "epoch": 2.712849162011173, + "grad_norm": 0.7063531279563904, + "learning_rate": 0.0008669467787114847, + "loss": 0.4278, + "step": 4856 + }, + { + "epoch": 2.71340782122905, + "grad_norm": 2.3604865074157715, + "learning_rate": 0.0008669187675070028, + "loss": 0.6278, + "step": 4857 + }, + { + "epoch": 2.7139664804469272, + "grad_norm": 0.5989410281181335, + "learning_rate": 0.000866890756302521, + "loss": 0.4891, + "step": 4858 + }, + { + "epoch": 2.7145251396648042, + "grad_norm": 1.384153127670288, + "learning_rate": 0.0008668627450980392, + "loss": 0.4805, + "step": 4859 + }, + { + "epoch": 2.7150837988826817, + "grad_norm": 9.467945098876953, + "learning_rate": 0.0008668347338935575, + "loss": 0.4178, + "step": 4860 + }, + { + "epoch": 2.7156424581005587, + "grad_norm": 0.7025426626205444, + "learning_rate": 0.0008668067226890757, + "loss": 0.4701, + "step": 4861 + }, + { + "epoch": 2.7162011173184357, + "grad_norm": 0.6365002989768982, + "learning_rate": 0.0008667787114845939, + "loss": 0.5429, + "step": 4862 + }, + { + "epoch": 2.7167597765363127, + "grad_norm": 3.0573928356170654, + "learning_rate": 0.000866750700280112, + "loss": 0.4711, + "step": 4863 + }, + { + "epoch": 2.7173184357541897, + "grad_norm": 1.6173280477523804, + "learning_rate": 0.0008667226890756302, + "loss": 0.5303, + "step": 4864 + }, + { + "epoch": 2.717877094972067, + "grad_norm": 16.50718879699707, + "learning_rate": 0.0008666946778711485, + "loss": 0.5031, + "step": 4865 + }, + { + "epoch": 2.718435754189944, + "grad_norm": 0.6878076195716858, + "learning_rate": 0.0008666666666666667, + "loss": 0.4979, + "step": 4866 + }, + { + "epoch": 2.718994413407821, + "grad_norm": 0.42856016755104065, + "learning_rate": 0.0008666386554621849, + "loss": 0.4511, + "step": 4867 + }, + { + "epoch": 2.7195530726256982, + "grad_norm": 0.4885893762111664, + "learning_rate": 0.000866610644257703, + "loss": 0.4235, + "step": 4868 + }, + { + "epoch": 2.7201117318435752, + "grad_norm": 0.9483295679092407, + "learning_rate": 0.0008665826330532212, + "loss": 0.4626, + "step": 4869 + }, + { + "epoch": 2.7206703910614527, + "grad_norm": 0.48496589064598083, + "learning_rate": 0.0008665546218487396, + "loss": 0.4604, + "step": 4870 + }, + { + "epoch": 2.7212290502793297, + "grad_norm": 0.4748501181602478, + "learning_rate": 0.0008665266106442578, + "loss": 0.4825, + "step": 4871 + }, + { + "epoch": 2.7217877094972067, + "grad_norm": 0.441242516040802, + "learning_rate": 0.000866498599439776, + "loss": 0.3897, + "step": 4872 + }, + { + "epoch": 2.7223463687150837, + "grad_norm": 0.6068809628486633, + "learning_rate": 0.0008664705882352941, + "loss": 0.4513, + "step": 4873 + }, + { + "epoch": 2.7229050279329607, + "grad_norm": 0.5913217067718506, + "learning_rate": 0.0008664425770308123, + "loss": 0.4436, + "step": 4874 + }, + { + "epoch": 2.723463687150838, + "grad_norm": 0.45163917541503906, + "learning_rate": 0.0008664145658263306, + "loss": 0.4221, + "step": 4875 + }, + { + "epoch": 2.724022346368715, + "grad_norm": 2.0871331691741943, + "learning_rate": 0.0008663865546218488, + "loss": 0.5328, + "step": 4876 + }, + { + "epoch": 2.724581005586592, + "grad_norm": 0.6465966701507568, + "learning_rate": 0.000866358543417367, + "loss": 0.5189, + "step": 4877 + }, + { + "epoch": 2.7251396648044692, + "grad_norm": 0.4940882623195648, + "learning_rate": 0.0008663305322128852, + "loss": 0.4098, + "step": 4878 + }, + { + "epoch": 2.7256983240223462, + "grad_norm": 0.6184472441673279, + "learning_rate": 0.0008663025210084033, + "loss": 0.3774, + "step": 4879 + }, + { + "epoch": 2.7262569832402237, + "grad_norm": 0.7680293917655945, + "learning_rate": 0.0008662745098039216, + "loss": 0.5625, + "step": 4880 + }, + { + "epoch": 2.7268156424581007, + "grad_norm": 2.2876410484313965, + "learning_rate": 0.0008662464985994398, + "loss": 0.3986, + "step": 4881 + }, + { + "epoch": 2.7273743016759777, + "grad_norm": 0.501945436000824, + "learning_rate": 0.000866218487394958, + "loss": 0.3579, + "step": 4882 + }, + { + "epoch": 2.7279329608938547, + "grad_norm": 0.4877082407474518, + "learning_rate": 0.0008661904761904762, + "loss": 0.4911, + "step": 4883 + }, + { + "epoch": 2.7284916201117317, + "grad_norm": 0.6780194640159607, + "learning_rate": 0.0008661624649859943, + "loss": 0.5406, + "step": 4884 + }, + { + "epoch": 2.729050279329609, + "grad_norm": 1.2068274021148682, + "learning_rate": 0.0008661344537815126, + "loss": 0.4233, + "step": 4885 + }, + { + "epoch": 2.7296089385474858, + "grad_norm": 0.6840962767601013, + "learning_rate": 0.0008661064425770309, + "loss": 0.586, + "step": 4886 + }, + { + "epoch": 2.730167597765363, + "grad_norm": 0.8631529211997986, + "learning_rate": 0.0008660784313725491, + "loss": 0.4502, + "step": 4887 + }, + { + "epoch": 2.7307262569832402, + "grad_norm": 0.4405871331691742, + "learning_rate": 0.0008660504201680673, + "loss": 0.4645, + "step": 4888 + }, + { + "epoch": 2.7312849162011172, + "grad_norm": 0.45140179991722107, + "learning_rate": 0.0008660224089635854, + "loss": 0.4729, + "step": 4889 + }, + { + "epoch": 2.7318435754189943, + "grad_norm": 0.6622568368911743, + "learning_rate": 0.0008659943977591037, + "loss": 0.4638, + "step": 4890 + }, + { + "epoch": 2.7324022346368713, + "grad_norm": 0.5176422595977783, + "learning_rate": 0.0008659663865546219, + "loss": 0.4697, + "step": 4891 + }, + { + "epoch": 2.7329608938547487, + "grad_norm": 0.45798808336257935, + "learning_rate": 0.0008659383753501401, + "loss": 0.4678, + "step": 4892 + }, + { + "epoch": 2.7335195530726257, + "grad_norm": 2.073618173599243, + "learning_rate": 0.0008659103641456583, + "loss": 0.505, + "step": 4893 + }, + { + "epoch": 2.7340782122905027, + "grad_norm": 0.5658608675003052, + "learning_rate": 0.0008658823529411765, + "loss": 0.4332, + "step": 4894 + }, + { + "epoch": 2.7346368715083798, + "grad_norm": 1.4124257564544678, + "learning_rate": 0.0008658543417366947, + "loss": 0.4317, + "step": 4895 + }, + { + "epoch": 2.7351955307262568, + "grad_norm": 0.5639500617980957, + "learning_rate": 0.0008658263305322129, + "loss": 0.419, + "step": 4896 + }, + { + "epoch": 2.735754189944134, + "grad_norm": 0.4894201457500458, + "learning_rate": 0.0008657983193277311, + "loss": 0.4356, + "step": 4897 + }, + { + "epoch": 2.7363128491620112, + "grad_norm": 2.7163777351379395, + "learning_rate": 0.0008657703081232493, + "loss": 0.5886, + "step": 4898 + }, + { + "epoch": 2.7368715083798882, + "grad_norm": 0.5326542854309082, + "learning_rate": 0.0008657422969187675, + "loss": 0.5377, + "step": 4899 + }, + { + "epoch": 2.7374301675977653, + "grad_norm": 0.6481336951255798, + "learning_rate": 0.0008657142857142857, + "loss": 0.4702, + "step": 4900 + }, + { + "epoch": 2.7379888268156423, + "grad_norm": 0.4442083239555359, + "learning_rate": 0.0008656862745098039, + "loss": 0.3616, + "step": 4901 + }, + { + "epoch": 2.7385474860335197, + "grad_norm": 1.6935418844223022, + "learning_rate": 0.0008656582633053222, + "loss": 0.4699, + "step": 4902 + }, + { + "epoch": 2.7391061452513967, + "grad_norm": 0.5431769490242004, + "learning_rate": 0.0008656302521008404, + "loss": 0.5993, + "step": 4903 + }, + { + "epoch": 2.7396648044692737, + "grad_norm": 4.07416296005249, + "learning_rate": 0.0008656022408963586, + "loss": 0.3618, + "step": 4904 + }, + { + "epoch": 2.7402234636871508, + "grad_norm": 0.7660875916481018, + "learning_rate": 0.0008655742296918767, + "loss": 0.4801, + "step": 4905 + }, + { + "epoch": 2.7407821229050278, + "grad_norm": 0.9906030893325806, + "learning_rate": 0.000865546218487395, + "loss": 0.4216, + "step": 4906 + }, + { + "epoch": 2.741340782122905, + "grad_norm": 0.586628794670105, + "learning_rate": 0.0008655182072829132, + "loss": 0.588, + "step": 4907 + }, + { + "epoch": 2.7418994413407822, + "grad_norm": 0.4912775456905365, + "learning_rate": 0.0008654901960784314, + "loss": 0.4886, + "step": 4908 + }, + { + "epoch": 2.7424581005586592, + "grad_norm": 0.7644976377487183, + "learning_rate": 0.0008654621848739496, + "loss": 0.5569, + "step": 4909 + }, + { + "epoch": 2.7430167597765363, + "grad_norm": 0.5250295996665955, + "learning_rate": 0.0008654341736694678, + "loss": 0.5051, + "step": 4910 + }, + { + "epoch": 2.7435754189944133, + "grad_norm": 0.49718669056892395, + "learning_rate": 0.000865406162464986, + "loss": 0.5335, + "step": 4911 + }, + { + "epoch": 2.7441340782122907, + "grad_norm": 0.6537137031555176, + "learning_rate": 0.0008653781512605042, + "loss": 0.4848, + "step": 4912 + }, + { + "epoch": 2.7446927374301677, + "grad_norm": 0.636180579662323, + "learning_rate": 0.0008653501400560224, + "loss": 0.4697, + "step": 4913 + }, + { + "epoch": 2.7452513966480447, + "grad_norm": 1.114094853401184, + "learning_rate": 0.0008653221288515406, + "loss": 0.5721, + "step": 4914 + }, + { + "epoch": 2.7458100558659218, + "grad_norm": 0.44556185603141785, + "learning_rate": 0.0008652941176470588, + "loss": 0.3715, + "step": 4915 + }, + { + "epoch": 2.7463687150837988, + "grad_norm": 0.7542825937271118, + "learning_rate": 0.000865266106442577, + "loss": 0.5613, + "step": 4916 + }, + { + "epoch": 2.746927374301676, + "grad_norm": 0.6731939315795898, + "learning_rate": 0.0008652380952380952, + "loss": 0.4524, + "step": 4917 + }, + { + "epoch": 2.7474860335195532, + "grad_norm": 0.5472222566604614, + "learning_rate": 0.0008652100840336134, + "loss": 0.3921, + "step": 4918 + }, + { + "epoch": 2.7480446927374302, + "grad_norm": 0.5595595240592957, + "learning_rate": 0.0008651820728291317, + "loss": 0.5001, + "step": 4919 + }, + { + "epoch": 2.7486033519553073, + "grad_norm": 0.5129935145378113, + "learning_rate": 0.0008651540616246499, + "loss": 0.4708, + "step": 4920 + }, + { + "epoch": 2.7491620111731843, + "grad_norm": 0.676902174949646, + "learning_rate": 0.0008651260504201682, + "loss": 0.4269, + "step": 4921 + }, + { + "epoch": 2.7497206703910617, + "grad_norm": 0.551462709903717, + "learning_rate": 0.0008650980392156863, + "loss": 0.4705, + "step": 4922 + }, + { + "epoch": 2.7502793296089383, + "grad_norm": 0.5070472955703735, + "learning_rate": 0.0008650700280112045, + "loss": 0.5748, + "step": 4923 + }, + { + "epoch": 2.7508379888268157, + "grad_norm": 1.1104278564453125, + "learning_rate": 0.0008650420168067227, + "loss": 0.5349, + "step": 4924 + }, + { + "epoch": 2.7513966480446927, + "grad_norm": 0.5214980840682983, + "learning_rate": 0.0008650140056022409, + "loss": 0.455, + "step": 4925 + }, + { + "epoch": 2.7519553072625698, + "grad_norm": 0.6127670407295227, + "learning_rate": 0.0008649859943977592, + "loss": 0.3987, + "step": 4926 + }, + { + "epoch": 2.7525139664804468, + "grad_norm": 0.6497572660446167, + "learning_rate": 0.0008649579831932773, + "loss": 0.5737, + "step": 4927 + }, + { + "epoch": 2.753072625698324, + "grad_norm": 0.6777095198631287, + "learning_rate": 0.0008649299719887955, + "loss": 0.4693, + "step": 4928 + }, + { + "epoch": 2.7536312849162012, + "grad_norm": 0.700614869594574, + "learning_rate": 0.0008649019607843137, + "loss": 0.904, + "step": 4929 + }, + { + "epoch": 2.7541899441340782, + "grad_norm": 0.5954992175102234, + "learning_rate": 0.0008648739495798319, + "loss": 0.5217, + "step": 4930 + }, + { + "epoch": 2.7547486033519553, + "grad_norm": 0.6739497780799866, + "learning_rate": 0.0008648459383753502, + "loss": 0.4592, + "step": 4931 + }, + { + "epoch": 2.7553072625698323, + "grad_norm": 0.5994608402252197, + "learning_rate": 0.0008648179271708683, + "loss": 0.5233, + "step": 4932 + }, + { + "epoch": 2.7558659217877093, + "grad_norm": 4.368466854095459, + "learning_rate": 0.0008647899159663865, + "loss": 0.4687, + "step": 4933 + }, + { + "epoch": 2.7564245810055867, + "grad_norm": 1.4562932252883911, + "learning_rate": 0.0008647619047619047, + "loss": 0.4545, + "step": 4934 + }, + { + "epoch": 2.7569832402234637, + "grad_norm": 0.650810718536377, + "learning_rate": 0.000864733893557423, + "loss": 0.4358, + "step": 4935 + }, + { + "epoch": 2.7575418994413408, + "grad_norm": 0.8807802796363831, + "learning_rate": 0.0008647058823529413, + "loss": 0.6186, + "step": 4936 + }, + { + "epoch": 2.7581005586592178, + "grad_norm": 1.7569150924682617, + "learning_rate": 0.0008646778711484595, + "loss": 0.4967, + "step": 4937 + }, + { + "epoch": 2.758659217877095, + "grad_norm": 0.48197001218795776, + "learning_rate": 0.0008646498599439776, + "loss": 0.4785, + "step": 4938 + }, + { + "epoch": 2.7592178770949722, + "grad_norm": 1.0546276569366455, + "learning_rate": 0.0008646218487394958, + "loss": 0.4092, + "step": 4939 + }, + { + "epoch": 2.7597765363128492, + "grad_norm": 0.7439518570899963, + "learning_rate": 0.000864593837535014, + "loss": 0.5937, + "step": 4940 + }, + { + "epoch": 2.7603351955307263, + "grad_norm": 0.5570810437202454, + "learning_rate": 0.0008645658263305323, + "loss": 0.432, + "step": 4941 + }, + { + "epoch": 2.7608938547486033, + "grad_norm": 0.6318339705467224, + "learning_rate": 0.0008645378151260505, + "loss": 0.5582, + "step": 4942 + }, + { + "epoch": 2.7614525139664803, + "grad_norm": 2.140132427215576, + "learning_rate": 0.0008645098039215686, + "loss": 0.5217, + "step": 4943 + }, + { + "epoch": 2.7620111731843577, + "grad_norm": 0.5386053323745728, + "learning_rate": 0.0008644817927170868, + "loss": 0.5861, + "step": 4944 + }, + { + "epoch": 2.7625698324022347, + "grad_norm": 1.3792846202850342, + "learning_rate": 0.000864453781512605, + "loss": 0.5431, + "step": 4945 + }, + { + "epoch": 2.7631284916201118, + "grad_norm": 0.8350787162780762, + "learning_rate": 0.0008644257703081233, + "loss": 0.4192, + "step": 4946 + }, + { + "epoch": 2.7636871508379888, + "grad_norm": 0.6028815507888794, + "learning_rate": 0.0008643977591036415, + "loss": 0.3765, + "step": 4947 + }, + { + "epoch": 2.764245810055866, + "grad_norm": 0.7146716713905334, + "learning_rate": 0.0008643697478991596, + "loss": 0.4428, + "step": 4948 + }, + { + "epoch": 2.7648044692737432, + "grad_norm": 0.491089791059494, + "learning_rate": 0.0008643417366946778, + "loss": 0.4343, + "step": 4949 + }, + { + "epoch": 2.7653631284916202, + "grad_norm": 0.615129828453064, + "learning_rate": 0.000864313725490196, + "loss": 0.398, + "step": 4950 + }, + { + "epoch": 2.7659217877094973, + "grad_norm": 0.448180615901947, + "learning_rate": 0.0008642857142857144, + "loss": 0.3798, + "step": 4951 + }, + { + "epoch": 2.7664804469273743, + "grad_norm": 0.8530778884887695, + "learning_rate": 0.0008642577030812326, + "loss": 0.5042, + "step": 4952 + }, + { + "epoch": 2.7670391061452513, + "grad_norm": 0.7157233953475952, + "learning_rate": 0.0008642296918767508, + "loss": 0.5035, + "step": 4953 + }, + { + "epoch": 2.7675977653631287, + "grad_norm": 8.344500541687012, + "learning_rate": 0.0008642016806722689, + "loss": 0.543, + "step": 4954 + }, + { + "epoch": 2.7681564245810057, + "grad_norm": 0.6947936415672302, + "learning_rate": 0.0008641736694677871, + "loss": 0.4683, + "step": 4955 + }, + { + "epoch": 2.7687150837988828, + "grad_norm": 0.5956802368164062, + "learning_rate": 0.0008641456582633054, + "loss": 0.5348, + "step": 4956 + }, + { + "epoch": 2.7692737430167598, + "grad_norm": 0.8770701885223389, + "learning_rate": 0.0008641176470588236, + "loss": 0.3736, + "step": 4957 + }, + { + "epoch": 2.769832402234637, + "grad_norm": 0.5714986324310303, + "learning_rate": 0.0008640896358543418, + "loss": 0.448, + "step": 4958 + }, + { + "epoch": 2.7703910614525142, + "grad_norm": 0.5729160904884338, + "learning_rate": 0.0008640616246498599, + "loss": 0.4438, + "step": 4959 + }, + { + "epoch": 2.770949720670391, + "grad_norm": 0.3909223675727844, + "learning_rate": 0.0008640336134453781, + "loss": 0.3621, + "step": 4960 + }, + { + "epoch": 2.7715083798882683, + "grad_norm": 0.4876652657985687, + "learning_rate": 0.0008640056022408964, + "loss": 0.5069, + "step": 4961 + }, + { + "epoch": 2.7720670391061453, + "grad_norm": 0.6069240570068359, + "learning_rate": 0.0008639775910364146, + "loss": 0.4026, + "step": 4962 + }, + { + "epoch": 2.7726256983240223, + "grad_norm": 0.710731029510498, + "learning_rate": 0.0008639495798319328, + "loss": 0.5177, + "step": 4963 + }, + { + "epoch": 2.7731843575418993, + "grad_norm": 0.5441970229148865, + "learning_rate": 0.0008639215686274509, + "loss": 0.4089, + "step": 4964 + }, + { + "epoch": 2.7737430167597763, + "grad_norm": 0.5625534057617188, + "learning_rate": 0.0008638935574229691, + "loss": 0.3967, + "step": 4965 + }, + { + "epoch": 2.7743016759776538, + "grad_norm": 3.055351734161377, + "learning_rate": 0.0008638655462184874, + "loss": 0.4838, + "step": 4966 + }, + { + "epoch": 2.7748603351955308, + "grad_norm": 0.5649999976158142, + "learning_rate": 0.0008638375350140056, + "loss": 0.5054, + "step": 4967 + }, + { + "epoch": 2.775418994413408, + "grad_norm": 1.1882247924804688, + "learning_rate": 0.0008638095238095239, + "loss": 0.4907, + "step": 4968 + }, + { + "epoch": 2.775977653631285, + "grad_norm": 0.5228495001792908, + "learning_rate": 0.0008637815126050421, + "loss": 0.5175, + "step": 4969 + }, + { + "epoch": 2.776536312849162, + "grad_norm": 0.7560868263244629, + "learning_rate": 0.0008637535014005602, + "loss": 0.4681, + "step": 4970 + }, + { + "epoch": 2.7770949720670393, + "grad_norm": 0.7712761759757996, + "learning_rate": 0.0008637254901960785, + "loss": 0.429, + "step": 4971 + }, + { + "epoch": 2.7776536312849163, + "grad_norm": 0.5001828670501709, + "learning_rate": 0.0008636974789915967, + "loss": 0.3925, + "step": 4972 + }, + { + "epoch": 2.7782122905027933, + "grad_norm": 2.4527015686035156, + "learning_rate": 0.0008636694677871149, + "loss": 0.3796, + "step": 4973 + }, + { + "epoch": 2.7787709497206703, + "grad_norm": 2.300713062286377, + "learning_rate": 0.0008636414565826331, + "loss": 0.3545, + "step": 4974 + }, + { + "epoch": 2.7793296089385473, + "grad_norm": 0.5880308747291565, + "learning_rate": 0.0008636134453781512, + "loss": 0.4987, + "step": 4975 + }, + { + "epoch": 2.7798882681564248, + "grad_norm": 0.4641307294368744, + "learning_rate": 0.0008635854341736695, + "loss": 0.3935, + "step": 4976 + }, + { + "epoch": 2.7804469273743018, + "grad_norm": 0.6852543950080872, + "learning_rate": 0.0008635574229691877, + "loss": 0.4961, + "step": 4977 + }, + { + "epoch": 2.781005586592179, + "grad_norm": 0.6853237748146057, + "learning_rate": 0.0008635294117647059, + "loss": 0.4439, + "step": 4978 + }, + { + "epoch": 2.781564245810056, + "grad_norm": 0.6090775728225708, + "learning_rate": 0.0008635014005602241, + "loss": 0.4761, + "step": 4979 + }, + { + "epoch": 2.782122905027933, + "grad_norm": 0.5112475156784058, + "learning_rate": 0.0008634733893557422, + "loss": 0.3353, + "step": 4980 + }, + { + "epoch": 2.7826815642458103, + "grad_norm": 0.49931249022483826, + "learning_rate": 0.0008634453781512605, + "loss": 0.5073, + "step": 4981 + }, + { + "epoch": 2.7832402234636873, + "grad_norm": 0.4504586160182953, + "learning_rate": 0.0008634173669467787, + "loss": 0.6052, + "step": 4982 + }, + { + "epoch": 2.7837988826815643, + "grad_norm": 0.733995795249939, + "learning_rate": 0.0008633893557422969, + "loss": 0.5849, + "step": 4983 + }, + { + "epoch": 2.7843575418994413, + "grad_norm": 0.5288225412368774, + "learning_rate": 0.0008633613445378152, + "loss": 0.5129, + "step": 4984 + }, + { + "epoch": 2.7849162011173183, + "grad_norm": 0.3947197198867798, + "learning_rate": 0.0008633333333333334, + "loss": 0.4521, + "step": 4985 + }, + { + "epoch": 2.7854748603351958, + "grad_norm": 0.4959245026111603, + "learning_rate": 0.0008633053221288516, + "loss": 0.4244, + "step": 4986 + }, + { + "epoch": 2.7860335195530728, + "grad_norm": 0.49115273356437683, + "learning_rate": 0.0008632773109243698, + "loss": 0.4335, + "step": 4987 + }, + { + "epoch": 2.78659217877095, + "grad_norm": 0.5411583781242371, + "learning_rate": 0.000863249299719888, + "loss": 0.4724, + "step": 4988 + }, + { + "epoch": 2.787150837988827, + "grad_norm": 0.47826892137527466, + "learning_rate": 0.0008632212885154062, + "loss": 0.4941, + "step": 4989 + }, + { + "epoch": 2.787709497206704, + "grad_norm": 1.393666386604309, + "learning_rate": 0.0008631932773109244, + "loss": 0.4298, + "step": 4990 + }, + { + "epoch": 2.7882681564245813, + "grad_norm": 0.7982737421989441, + "learning_rate": 0.0008631652661064426, + "loss": 0.4934, + "step": 4991 + }, + { + "epoch": 2.788826815642458, + "grad_norm": 0.5343127250671387, + "learning_rate": 0.0008631372549019608, + "loss": 0.5208, + "step": 4992 + }, + { + "epoch": 2.7893854748603353, + "grad_norm": 0.9246816039085388, + "learning_rate": 0.000863109243697479, + "loss": 0.6124, + "step": 4993 + }, + { + "epoch": 2.7899441340782123, + "grad_norm": 0.6478784084320068, + "learning_rate": 0.0008630812324929972, + "loss": 0.5112, + "step": 4994 + }, + { + "epoch": 2.7905027932960893, + "grad_norm": 0.4586789011955261, + "learning_rate": 0.0008630532212885154, + "loss": 0.397, + "step": 4995 + }, + { + "epoch": 2.7910614525139668, + "grad_norm": 0.7183420658111572, + "learning_rate": 0.0008630252100840336, + "loss": 0.4592, + "step": 4996 + }, + { + "epoch": 2.7916201117318433, + "grad_norm": 0.6217768788337708, + "learning_rate": 0.0008629971988795518, + "loss": 0.5333, + "step": 4997 + }, + { + "epoch": 2.792178770949721, + "grad_norm": 0.6634668707847595, + "learning_rate": 0.00086296918767507, + "loss": 0.5981, + "step": 4998 + }, + { + "epoch": 2.792737430167598, + "grad_norm": 0.5104075074195862, + "learning_rate": 0.0008629411764705882, + "loss": 0.5016, + "step": 4999 + }, + { + "epoch": 2.793296089385475, + "grad_norm": 0.8200002312660217, + "learning_rate": 0.0008629131652661064, + "loss": 0.4535, + "step": 5000 + }, + { + "epoch": 2.793296089385475, + "eval_cer": 0.09497779668968828, + "eval_loss": 0.359159916639328, + "eval_runtime": 55.6906, + "eval_samples_per_second": 81.486, + "eval_steps_per_second": 5.1, + "eval_wer": 0.3739001703863021, + "step": 5000 + }, + { + "epoch": 2.793854748603352, + "grad_norm": 0.54245924949646, + "learning_rate": 0.0008628851540616248, + "loss": 0.4048, + "step": 5001 + }, + { + "epoch": 2.794413407821229, + "grad_norm": 0.6140114068984985, + "learning_rate": 0.0008628571428571429, + "loss": 0.4892, + "step": 5002 + }, + { + "epoch": 2.7949720670391063, + "grad_norm": 0.8202154636383057, + "learning_rate": 0.0008628291316526611, + "loss": 0.4681, + "step": 5003 + }, + { + "epoch": 2.7955307262569833, + "grad_norm": 0.5861547589302063, + "learning_rate": 0.0008628011204481793, + "loss": 0.4486, + "step": 5004 + }, + { + "epoch": 2.7960893854748603, + "grad_norm": 1.1219669580459595, + "learning_rate": 0.0008627731092436975, + "loss": 0.48, + "step": 5005 + }, + { + "epoch": 2.7966480446927373, + "grad_norm": 0.603406548500061, + "learning_rate": 0.0008627450980392158, + "loss": 0.4598, + "step": 5006 + }, + { + "epoch": 2.7972067039106143, + "grad_norm": 1.0853874683380127, + "learning_rate": 0.0008627170868347339, + "loss": 0.7008, + "step": 5007 + }, + { + "epoch": 2.7977653631284918, + "grad_norm": 0.48637187480926514, + "learning_rate": 0.0008626890756302521, + "loss": 0.3929, + "step": 5008 + }, + { + "epoch": 2.798324022346369, + "grad_norm": 0.5474040508270264, + "learning_rate": 0.0008626610644257703, + "loss": 0.4781, + "step": 5009 + }, + { + "epoch": 2.798882681564246, + "grad_norm": 0.6707407832145691, + "learning_rate": 0.0008626330532212885, + "loss": 0.5671, + "step": 5010 + }, + { + "epoch": 2.799441340782123, + "grad_norm": 0.5939853191375732, + "learning_rate": 0.0008626050420168068, + "loss": 0.4217, + "step": 5011 + }, + { + "epoch": 2.8, + "grad_norm": 0.561815083026886, + "learning_rate": 0.0008625770308123249, + "loss": 0.4973, + "step": 5012 + }, + { + "epoch": 2.8005586592178773, + "grad_norm": 0.3941192328929901, + "learning_rate": 0.0008625490196078431, + "loss": 0.3137, + "step": 5013 + }, + { + "epoch": 2.8011173184357543, + "grad_norm": 0.8116780519485474, + "learning_rate": 0.0008625210084033613, + "loss": 0.5507, + "step": 5014 + }, + { + "epoch": 2.8016759776536313, + "grad_norm": 0.8271045088768005, + "learning_rate": 0.0008624929971988795, + "loss": 0.4198, + "step": 5015 + }, + { + "epoch": 2.8022346368715083, + "grad_norm": 0.45983797311782837, + "learning_rate": 0.0008624649859943979, + "loss": 0.4398, + "step": 5016 + }, + { + "epoch": 2.8027932960893853, + "grad_norm": 0.7537004947662354, + "learning_rate": 0.0008624369747899161, + "loss": 0.4461, + "step": 5017 + }, + { + "epoch": 2.8033519553072628, + "grad_norm": 0.6530876755714417, + "learning_rate": 0.0008624089635854342, + "loss": 0.4467, + "step": 5018 + }, + { + "epoch": 2.80391061452514, + "grad_norm": 0.7464740872383118, + "learning_rate": 0.0008623809523809524, + "loss": 0.4614, + "step": 5019 + }, + { + "epoch": 2.804469273743017, + "grad_norm": 0.8537314534187317, + "learning_rate": 0.0008623529411764706, + "loss": 0.5005, + "step": 5020 + }, + { + "epoch": 2.805027932960894, + "grad_norm": 0.5359246134757996, + "learning_rate": 0.0008623249299719889, + "loss": 0.6119, + "step": 5021 + }, + { + "epoch": 2.805586592178771, + "grad_norm": 0.5170433521270752, + "learning_rate": 0.0008622969187675071, + "loss": 0.4415, + "step": 5022 + }, + { + "epoch": 2.8061452513966483, + "grad_norm": 0.5501569509506226, + "learning_rate": 0.0008622689075630252, + "loss": 0.5106, + "step": 5023 + }, + { + "epoch": 2.8067039106145253, + "grad_norm": 0.4375327229499817, + "learning_rate": 0.0008622408963585434, + "loss": 0.4642, + "step": 5024 + }, + { + "epoch": 2.8072625698324023, + "grad_norm": 0.4825558364391327, + "learning_rate": 0.0008622128851540616, + "loss": 0.4577, + "step": 5025 + }, + { + "epoch": 2.8078212290502793, + "grad_norm": 0.4116192162036896, + "learning_rate": 0.0008621848739495799, + "loss": 0.2807, + "step": 5026 + }, + { + "epoch": 2.8083798882681563, + "grad_norm": 0.5142484903335571, + "learning_rate": 0.0008621568627450981, + "loss": 0.4113, + "step": 5027 + }, + { + "epoch": 2.8089385474860338, + "grad_norm": 0.5388715863227844, + "learning_rate": 0.0008621288515406162, + "loss": 0.4539, + "step": 5028 + }, + { + "epoch": 2.8094972067039103, + "grad_norm": 0.649519145488739, + "learning_rate": 0.0008621008403361344, + "loss": 0.4808, + "step": 5029 + }, + { + "epoch": 2.810055865921788, + "grad_norm": 0.437104195356369, + "learning_rate": 0.0008620728291316526, + "loss": 0.4824, + "step": 5030 + }, + { + "epoch": 2.810614525139665, + "grad_norm": 0.5369560718536377, + "learning_rate": 0.0008620448179271709, + "loss": 0.5076, + "step": 5031 + }, + { + "epoch": 2.811173184357542, + "grad_norm": 0.5922414064407349, + "learning_rate": 0.0008620168067226891, + "loss": 0.4777, + "step": 5032 + }, + { + "epoch": 2.811731843575419, + "grad_norm": 1.1569852828979492, + "learning_rate": 0.0008619887955182074, + "loss": 0.4211, + "step": 5033 + }, + { + "epoch": 2.812290502793296, + "grad_norm": 0.5336988568305969, + "learning_rate": 0.0008619607843137255, + "loss": 0.4623, + "step": 5034 + }, + { + "epoch": 2.8128491620111733, + "grad_norm": 0.47945383191108704, + "learning_rate": 0.0008619327731092437, + "loss": 0.3797, + "step": 5035 + }, + { + "epoch": 2.8134078212290503, + "grad_norm": 1.145865797996521, + "learning_rate": 0.000861904761904762, + "loss": 0.4892, + "step": 5036 + }, + { + "epoch": 2.8139664804469273, + "grad_norm": 0.5120701789855957, + "learning_rate": 0.0008618767507002802, + "loss": 0.4875, + "step": 5037 + }, + { + "epoch": 2.8145251396648043, + "grad_norm": 2.864386796951294, + "learning_rate": 0.0008618487394957984, + "loss": 0.5305, + "step": 5038 + }, + { + "epoch": 2.8150837988826813, + "grad_norm": 1.0483943223953247, + "learning_rate": 0.0008618207282913165, + "loss": 0.3532, + "step": 5039 + }, + { + "epoch": 2.815642458100559, + "grad_norm": 0.45070531964302063, + "learning_rate": 0.0008617927170868347, + "loss": 0.366, + "step": 5040 + }, + { + "epoch": 2.816201117318436, + "grad_norm": 0.395125150680542, + "learning_rate": 0.000861764705882353, + "loss": 0.451, + "step": 5041 + }, + { + "epoch": 2.816759776536313, + "grad_norm": 0.435016930103302, + "learning_rate": 0.0008617366946778712, + "loss": 0.4727, + "step": 5042 + }, + { + "epoch": 2.81731843575419, + "grad_norm": 0.5069654583930969, + "learning_rate": 0.0008617086834733894, + "loss": 0.3629, + "step": 5043 + }, + { + "epoch": 2.817877094972067, + "grad_norm": 0.5725529193878174, + "learning_rate": 0.0008616806722689075, + "loss": 0.4249, + "step": 5044 + }, + { + "epoch": 2.8184357541899443, + "grad_norm": 0.5819624066352844, + "learning_rate": 0.0008616526610644257, + "loss": 0.502, + "step": 5045 + }, + { + "epoch": 2.8189944134078213, + "grad_norm": 1.1757373809814453, + "learning_rate": 0.000861624649859944, + "loss": 0.5448, + "step": 5046 + }, + { + "epoch": 2.8195530726256983, + "grad_norm": 0.3719157576560974, + "learning_rate": 0.0008615966386554622, + "loss": 0.4092, + "step": 5047 + }, + { + "epoch": 2.8201117318435753, + "grad_norm": 0.49994513392448425, + "learning_rate": 0.0008615686274509804, + "loss": 0.4789, + "step": 5048 + }, + { + "epoch": 2.8206703910614523, + "grad_norm": 0.8068432807922363, + "learning_rate": 0.0008615406162464986, + "loss": 0.3884, + "step": 5049 + }, + { + "epoch": 2.82122905027933, + "grad_norm": 0.5980684161186218, + "learning_rate": 0.0008615126050420167, + "loss": 0.4437, + "step": 5050 + }, + { + "epoch": 2.821787709497207, + "grad_norm": 0.6048242449760437, + "learning_rate": 0.0008614845938375351, + "loss": 0.4221, + "step": 5051 + }, + { + "epoch": 2.822346368715084, + "grad_norm": 0.9423447847366333, + "learning_rate": 0.0008614565826330533, + "loss": 0.4867, + "step": 5052 + }, + { + "epoch": 2.822905027932961, + "grad_norm": 0.8989359736442566, + "learning_rate": 0.0008614285714285715, + "loss": 0.6963, + "step": 5053 + }, + { + "epoch": 2.823463687150838, + "grad_norm": 0.6020029187202454, + "learning_rate": 0.0008614005602240897, + "loss": 0.5347, + "step": 5054 + }, + { + "epoch": 2.8240223463687153, + "grad_norm": 0.6163732409477234, + "learning_rate": 0.0008613725490196078, + "loss": 0.5634, + "step": 5055 + }, + { + "epoch": 2.8245810055865923, + "grad_norm": 0.7855117917060852, + "learning_rate": 0.0008613445378151261, + "loss": 0.5319, + "step": 5056 + }, + { + "epoch": 2.8251396648044693, + "grad_norm": 0.4465838670730591, + "learning_rate": 0.0008613165266106443, + "loss": 0.4587, + "step": 5057 + }, + { + "epoch": 2.8256983240223463, + "grad_norm": 3.268709897994995, + "learning_rate": 0.0008612885154061625, + "loss": 0.444, + "step": 5058 + }, + { + "epoch": 2.8262569832402233, + "grad_norm": 0.5292797088623047, + "learning_rate": 0.0008612605042016807, + "loss": 0.3942, + "step": 5059 + }, + { + "epoch": 2.826815642458101, + "grad_norm": 0.4475780725479126, + "learning_rate": 0.0008612324929971988, + "loss": 0.4248, + "step": 5060 + }, + { + "epoch": 2.827374301675978, + "grad_norm": 0.4191311299800873, + "learning_rate": 0.0008612044817927171, + "loss": 0.4121, + "step": 5061 + }, + { + "epoch": 2.827932960893855, + "grad_norm": 1.642168641090393, + "learning_rate": 0.0008611764705882353, + "loss": 0.416, + "step": 5062 + }, + { + "epoch": 2.828491620111732, + "grad_norm": 1.054935097694397, + "learning_rate": 0.0008611484593837535, + "loss": 0.4342, + "step": 5063 + }, + { + "epoch": 2.829050279329609, + "grad_norm": 0.5002058744430542, + "learning_rate": 0.0008611204481792717, + "loss": 0.411, + "step": 5064 + }, + { + "epoch": 2.8296089385474863, + "grad_norm": 0.5005667805671692, + "learning_rate": 0.0008610924369747899, + "loss": 0.3589, + "step": 5065 + }, + { + "epoch": 2.830167597765363, + "grad_norm": 1.0537171363830566, + "learning_rate": 0.0008610644257703082, + "loss": 0.479, + "step": 5066 + }, + { + "epoch": 2.8307262569832403, + "grad_norm": 0.433057963848114, + "learning_rate": 0.0008610364145658264, + "loss": 0.4614, + "step": 5067 + }, + { + "epoch": 2.8312849162011173, + "grad_norm": 0.4622815251350403, + "learning_rate": 0.0008610084033613446, + "loss": 0.4238, + "step": 5068 + }, + { + "epoch": 2.8318435754189943, + "grad_norm": 0.6391461491584778, + "learning_rate": 0.0008609803921568628, + "loss": 0.4696, + "step": 5069 + }, + { + "epoch": 2.8324022346368714, + "grad_norm": 1.1945433616638184, + "learning_rate": 0.000860952380952381, + "loss": 0.6053, + "step": 5070 + }, + { + "epoch": 2.8329608938547484, + "grad_norm": 0.6835968494415283, + "learning_rate": 0.0008609243697478992, + "loss": 0.4141, + "step": 5071 + }, + { + "epoch": 2.833519553072626, + "grad_norm": 0.7184941172599792, + "learning_rate": 0.0008608963585434174, + "loss": 0.6268, + "step": 5072 + }, + { + "epoch": 2.834078212290503, + "grad_norm": 0.9129653573036194, + "learning_rate": 0.0008608683473389356, + "loss": 0.5922, + "step": 5073 + }, + { + "epoch": 2.83463687150838, + "grad_norm": 1.3539282083511353, + "learning_rate": 0.0008608403361344538, + "loss": 0.4466, + "step": 5074 + }, + { + "epoch": 2.835195530726257, + "grad_norm": 0.6483432054519653, + "learning_rate": 0.000860812324929972, + "loss": 0.5087, + "step": 5075 + }, + { + "epoch": 2.835754189944134, + "grad_norm": 0.6693288683891296, + "learning_rate": 0.0008607843137254902, + "loss": 0.5042, + "step": 5076 + }, + { + "epoch": 2.8363128491620113, + "grad_norm": 0.6429671049118042, + "learning_rate": 0.0008607563025210084, + "loss": 0.3691, + "step": 5077 + }, + { + "epoch": 2.8368715083798883, + "grad_norm": 0.4690956771373749, + "learning_rate": 0.0008607282913165266, + "loss": 0.3897, + "step": 5078 + }, + { + "epoch": 2.8374301675977653, + "grad_norm": 1.886242151260376, + "learning_rate": 0.0008607002801120448, + "loss": 0.4195, + "step": 5079 + }, + { + "epoch": 2.8379888268156424, + "grad_norm": 0.5595972537994385, + "learning_rate": 0.000860672268907563, + "loss": 0.5348, + "step": 5080 + }, + { + "epoch": 2.8385474860335194, + "grad_norm": 1.1238492727279663, + "learning_rate": 0.0008606442577030813, + "loss": 0.4244, + "step": 5081 + }, + { + "epoch": 2.839106145251397, + "grad_norm": 0.822028398513794, + "learning_rate": 0.0008606162464985994, + "loss": 0.4276, + "step": 5082 + }, + { + "epoch": 2.839664804469274, + "grad_norm": 0.5736905932426453, + "learning_rate": 0.0008605882352941177, + "loss": 0.4062, + "step": 5083 + }, + { + "epoch": 2.840223463687151, + "grad_norm": 0.9170734286308289, + "learning_rate": 0.0008605602240896359, + "loss": 0.538, + "step": 5084 + }, + { + "epoch": 2.840782122905028, + "grad_norm": 0.4064418077468872, + "learning_rate": 0.0008605322128851541, + "loss": 0.4815, + "step": 5085 + }, + { + "epoch": 2.841340782122905, + "grad_norm": 0.7241451144218445, + "learning_rate": 0.0008605042016806724, + "loss": 0.455, + "step": 5086 + }, + { + "epoch": 2.8418994413407823, + "grad_norm": 0.7200766205787659, + "learning_rate": 0.0008604761904761905, + "loss": 0.7181, + "step": 5087 + }, + { + "epoch": 2.8424581005586593, + "grad_norm": 0.5524330735206604, + "learning_rate": 0.0008604481792717087, + "loss": 0.4517, + "step": 5088 + }, + { + "epoch": 2.8430167597765363, + "grad_norm": 0.5243330001831055, + "learning_rate": 0.0008604201680672269, + "loss": 0.5113, + "step": 5089 + }, + { + "epoch": 2.8435754189944134, + "grad_norm": 0.42244550585746765, + "learning_rate": 0.0008603921568627451, + "loss": 0.5435, + "step": 5090 + }, + { + "epoch": 2.8441340782122904, + "grad_norm": 1.268298625946045, + "learning_rate": 0.0008603641456582634, + "loss": 0.5057, + "step": 5091 + }, + { + "epoch": 2.844692737430168, + "grad_norm": 0.8056744337081909, + "learning_rate": 0.0008603361344537815, + "loss": 0.4779, + "step": 5092 + }, + { + "epoch": 2.845251396648045, + "grad_norm": 0.5748698115348816, + "learning_rate": 0.0008603081232492997, + "loss": 0.4207, + "step": 5093 + }, + { + "epoch": 2.845810055865922, + "grad_norm": 1.1256290674209595, + "learning_rate": 0.0008602801120448179, + "loss": 0.3795, + "step": 5094 + }, + { + "epoch": 2.846368715083799, + "grad_norm": 0.8879082798957825, + "learning_rate": 0.0008602521008403361, + "loss": 0.6347, + "step": 5095 + }, + { + "epoch": 2.846927374301676, + "grad_norm": 0.7756565809249878, + "learning_rate": 0.0008602240896358544, + "loss": 0.4921, + "step": 5096 + }, + { + "epoch": 2.8474860335195533, + "grad_norm": 0.4193631410598755, + "learning_rate": 0.0008601960784313726, + "loss": 0.4344, + "step": 5097 + }, + { + "epoch": 2.8480446927374303, + "grad_norm": 0.44814175367355347, + "learning_rate": 0.0008601680672268907, + "loss": 0.4646, + "step": 5098 + }, + { + "epoch": 2.8486033519553073, + "grad_norm": 2.894409656524658, + "learning_rate": 0.000860140056022409, + "loss": 0.3383, + "step": 5099 + }, + { + "epoch": 2.8491620111731844, + "grad_norm": 1.2867300510406494, + "learning_rate": 0.0008601120448179272, + "loss": 0.5005, + "step": 5100 + }, + { + "epoch": 2.8497206703910614, + "grad_norm": 0.4173518121242523, + "learning_rate": 0.0008600840336134455, + "loss": 0.3868, + "step": 5101 + }, + { + "epoch": 2.850279329608939, + "grad_norm": 0.4465699791908264, + "learning_rate": 0.0008600560224089637, + "loss": 0.3371, + "step": 5102 + }, + { + "epoch": 2.8508379888268154, + "grad_norm": 0.6916064620018005, + "learning_rate": 0.0008600280112044818, + "loss": 0.4853, + "step": 5103 + }, + { + "epoch": 2.851396648044693, + "grad_norm": 0.5090463757514954, + "learning_rate": 0.00086, + "loss": 0.4856, + "step": 5104 + }, + { + "epoch": 2.85195530726257, + "grad_norm": 0.6066995859146118, + "learning_rate": 0.0008599719887955182, + "loss": 0.4082, + "step": 5105 + }, + { + "epoch": 2.852513966480447, + "grad_norm": 0.6781220436096191, + "learning_rate": 0.0008599439775910365, + "loss": 0.549, + "step": 5106 + }, + { + "epoch": 2.853072625698324, + "grad_norm": 2.6489884853363037, + "learning_rate": 0.0008599159663865547, + "loss": 0.378, + "step": 5107 + }, + { + "epoch": 2.853631284916201, + "grad_norm": 0.6756976842880249, + "learning_rate": 0.0008598879551820728, + "loss": 0.6914, + "step": 5108 + }, + { + "epoch": 2.8541899441340783, + "grad_norm": 0.5543894171714783, + "learning_rate": 0.000859859943977591, + "loss": 0.4202, + "step": 5109 + }, + { + "epoch": 2.8547486033519553, + "grad_norm": 0.5163624882698059, + "learning_rate": 0.0008598319327731092, + "loss": 0.3955, + "step": 5110 + }, + { + "epoch": 2.8553072625698324, + "grad_norm": 0.6780015826225281, + "learning_rate": 0.0008598039215686275, + "loss": 0.5653, + "step": 5111 + }, + { + "epoch": 2.8558659217877094, + "grad_norm": 0.5120530128479004, + "learning_rate": 0.0008597759103641457, + "loss": 0.4575, + "step": 5112 + }, + { + "epoch": 2.8564245810055864, + "grad_norm": 0.6539584994316101, + "learning_rate": 0.0008597478991596639, + "loss": 0.6027, + "step": 5113 + }, + { + "epoch": 2.856983240223464, + "grad_norm": 0.587616503238678, + "learning_rate": 0.000859719887955182, + "loss": 0.4516, + "step": 5114 + }, + { + "epoch": 2.857541899441341, + "grad_norm": 1.3105885982513428, + "learning_rate": 0.0008596918767507002, + "loss": 0.4366, + "step": 5115 + }, + { + "epoch": 2.858100558659218, + "grad_norm": 0.6124499440193176, + "learning_rate": 0.0008596638655462186, + "loss": 0.4136, + "step": 5116 + }, + { + "epoch": 2.858659217877095, + "grad_norm": 1.268271565437317, + "learning_rate": 0.0008596358543417368, + "loss": 0.4719, + "step": 5117 + }, + { + "epoch": 2.859217877094972, + "grad_norm": 3.596426486968994, + "learning_rate": 0.000859607843137255, + "loss": 0.629, + "step": 5118 + }, + { + "epoch": 2.8597765363128493, + "grad_norm": 0.5195632576942444, + "learning_rate": 0.0008595798319327731, + "loss": 0.6498, + "step": 5119 + }, + { + "epoch": 2.8603351955307263, + "grad_norm": 0.6706558465957642, + "learning_rate": 0.0008595518207282913, + "loss": 0.6026, + "step": 5120 + }, + { + "epoch": 2.8608938547486034, + "grad_norm": 0.6112656593322754, + "learning_rate": 0.0008595238095238096, + "loss": 0.4883, + "step": 5121 + }, + { + "epoch": 2.8614525139664804, + "grad_norm": 0.5213063955307007, + "learning_rate": 0.0008594957983193278, + "loss": 0.4466, + "step": 5122 + }, + { + "epoch": 2.8620111731843574, + "grad_norm": 0.6336454749107361, + "learning_rate": 0.000859467787114846, + "loss": 0.6278, + "step": 5123 + }, + { + "epoch": 2.862569832402235, + "grad_norm": 0.7374244332313538, + "learning_rate": 0.0008594397759103641, + "loss": 0.6401, + "step": 5124 + }, + { + "epoch": 2.863128491620112, + "grad_norm": 0.6692697405815125, + "learning_rate": 0.0008594117647058823, + "loss": 0.4764, + "step": 5125 + }, + { + "epoch": 2.863687150837989, + "grad_norm": 0.7981042265892029, + "learning_rate": 0.0008593837535014006, + "loss": 0.603, + "step": 5126 + }, + { + "epoch": 2.864245810055866, + "grad_norm": 0.9843001365661621, + "learning_rate": 0.0008593557422969188, + "loss": 0.6013, + "step": 5127 + }, + { + "epoch": 2.864804469273743, + "grad_norm": 0.6881023049354553, + "learning_rate": 0.000859327731092437, + "loss": 0.4202, + "step": 5128 + }, + { + "epoch": 2.8653631284916203, + "grad_norm": 0.805983304977417, + "learning_rate": 0.0008592997198879552, + "loss": 0.4521, + "step": 5129 + }, + { + "epoch": 2.8659217877094973, + "grad_norm": 0.7637940645217896, + "learning_rate": 0.0008592717086834733, + "loss": 0.471, + "step": 5130 + }, + { + "epoch": 2.8664804469273744, + "grad_norm": 0.523261547088623, + "learning_rate": 0.0008592436974789915, + "loss": 0.4238, + "step": 5131 + }, + { + "epoch": 2.8670391061452514, + "grad_norm": 3.7323405742645264, + "learning_rate": 0.0008592156862745099, + "loss": 0.5386, + "step": 5132 + }, + { + "epoch": 2.8675977653631284, + "grad_norm": 0.5389082431793213, + "learning_rate": 0.0008591876750700281, + "loss": 0.504, + "step": 5133 + }, + { + "epoch": 2.868156424581006, + "grad_norm": 0.43062400817871094, + "learning_rate": 0.0008591596638655463, + "loss": 0.3815, + "step": 5134 + }, + { + "epoch": 2.868715083798883, + "grad_norm": 0.508239209651947, + "learning_rate": 0.0008591316526610644, + "loss": 0.5233, + "step": 5135 + }, + { + "epoch": 2.86927374301676, + "grad_norm": 1.392482042312622, + "learning_rate": 0.0008591036414565826, + "loss": 0.4106, + "step": 5136 + }, + { + "epoch": 2.869832402234637, + "grad_norm": 0.6519489884376526, + "learning_rate": 0.0008590756302521009, + "loss": 0.4012, + "step": 5137 + }, + { + "epoch": 2.870391061452514, + "grad_norm": 1.091198205947876, + "learning_rate": 0.0008590476190476191, + "loss": 0.6055, + "step": 5138 + }, + { + "epoch": 2.8709497206703913, + "grad_norm": 0.5800414085388184, + "learning_rate": 0.0008590196078431373, + "loss": 0.4781, + "step": 5139 + }, + { + "epoch": 2.871508379888268, + "grad_norm": 0.49878600239753723, + "learning_rate": 0.0008589915966386554, + "loss": 0.4931, + "step": 5140 + }, + { + "epoch": 2.8720670391061454, + "grad_norm": 0.5097025036811829, + "learning_rate": 0.0008589635854341736, + "loss": 0.4896, + "step": 5141 + }, + { + "epoch": 2.8726256983240224, + "grad_norm": 3.178318738937378, + "learning_rate": 0.0008589355742296919, + "loss": 0.4201, + "step": 5142 + }, + { + "epoch": 2.8731843575418994, + "grad_norm": 0.6847972273826599, + "learning_rate": 0.0008589075630252101, + "loss": 0.5028, + "step": 5143 + }, + { + "epoch": 2.8737430167597764, + "grad_norm": 1.0314918756484985, + "learning_rate": 0.0008588795518207283, + "loss": 0.4317, + "step": 5144 + }, + { + "epoch": 2.8743016759776534, + "grad_norm": 0.4729918837547302, + "learning_rate": 0.0008588515406162465, + "loss": 0.4613, + "step": 5145 + }, + { + "epoch": 2.874860335195531, + "grad_norm": 0.4969775676727295, + "learning_rate": 0.0008588235294117646, + "loss": 0.447, + "step": 5146 + }, + { + "epoch": 2.875418994413408, + "grad_norm": 0.619356632232666, + "learning_rate": 0.0008587955182072829, + "loss": 0.502, + "step": 5147 + }, + { + "epoch": 2.875977653631285, + "grad_norm": 0.4823377728462219, + "learning_rate": 0.0008587675070028012, + "loss": 0.4737, + "step": 5148 + }, + { + "epoch": 2.876536312849162, + "grad_norm": 0.48929011821746826, + "learning_rate": 0.0008587394957983194, + "loss": 0.479, + "step": 5149 + }, + { + "epoch": 2.877094972067039, + "grad_norm": 0.42125391960144043, + "learning_rate": 0.0008587114845938376, + "loss": 0.4672, + "step": 5150 + }, + { + "epoch": 2.8776536312849164, + "grad_norm": 0.889860212802887, + "learning_rate": 0.0008586834733893557, + "loss": 0.4701, + "step": 5151 + }, + { + "epoch": 2.8782122905027934, + "grad_norm": 6.053067684173584, + "learning_rate": 0.000858655462184874, + "loss": 0.4678, + "step": 5152 + }, + { + "epoch": 2.8787709497206704, + "grad_norm": 3.453068256378174, + "learning_rate": 0.0008586274509803922, + "loss": 0.6017, + "step": 5153 + }, + { + "epoch": 2.8793296089385474, + "grad_norm": 0.39680376648902893, + "learning_rate": 0.0008585994397759104, + "loss": 0.3969, + "step": 5154 + }, + { + "epoch": 2.8798882681564244, + "grad_norm": 0.6880179047584534, + "learning_rate": 0.0008585714285714286, + "loss": 0.421, + "step": 5155 + }, + { + "epoch": 2.880446927374302, + "grad_norm": 0.7897211313247681, + "learning_rate": 0.0008585434173669467, + "loss": 0.4925, + "step": 5156 + }, + { + "epoch": 2.881005586592179, + "grad_norm": 0.6023778915405273, + "learning_rate": 0.000858515406162465, + "loss": 0.518, + "step": 5157 + }, + { + "epoch": 2.881564245810056, + "grad_norm": 0.4900436997413635, + "learning_rate": 0.0008584873949579832, + "loss": 0.4901, + "step": 5158 + }, + { + "epoch": 2.882122905027933, + "grad_norm": 1.5217920541763306, + "learning_rate": 0.0008584593837535014, + "loss": 0.4754, + "step": 5159 + }, + { + "epoch": 2.88268156424581, + "grad_norm": 0.4940977990627289, + "learning_rate": 0.0008584313725490196, + "loss": 0.4261, + "step": 5160 + }, + { + "epoch": 2.8832402234636874, + "grad_norm": 2.331089735031128, + "learning_rate": 0.0008584033613445378, + "loss": 0.4189, + "step": 5161 + }, + { + "epoch": 2.8837988826815644, + "grad_norm": 0.4676640033721924, + "learning_rate": 0.000858375350140056, + "loss": 0.3533, + "step": 5162 + }, + { + "epoch": 2.8843575418994414, + "grad_norm": 1.4016258716583252, + "learning_rate": 0.0008583473389355742, + "loss": 0.4997, + "step": 5163 + }, + { + "epoch": 2.8849162011173184, + "grad_norm": 0.6847200989723206, + "learning_rate": 0.0008583193277310924, + "loss": 0.3922, + "step": 5164 + }, + { + "epoch": 2.8854748603351954, + "grad_norm": 0.6961644291877747, + "learning_rate": 0.0008582913165266107, + "loss": 0.6224, + "step": 5165 + }, + { + "epoch": 2.886033519553073, + "grad_norm": 0.6959324479103088, + "learning_rate": 0.0008582633053221289, + "loss": 0.5146, + "step": 5166 + }, + { + "epoch": 2.88659217877095, + "grad_norm": 0.689371645450592, + "learning_rate": 0.0008582352941176471, + "loss": 0.4311, + "step": 5167 + }, + { + "epoch": 2.887150837988827, + "grad_norm": 0.8468054533004761, + "learning_rate": 0.0008582072829131653, + "loss": 0.4509, + "step": 5168 + }, + { + "epoch": 2.887709497206704, + "grad_norm": 0.4900321364402771, + "learning_rate": 0.0008581792717086835, + "loss": 0.5418, + "step": 5169 + }, + { + "epoch": 2.888268156424581, + "grad_norm": 0.6423742771148682, + "learning_rate": 0.0008581512605042017, + "loss": 0.6463, + "step": 5170 + }, + { + "epoch": 2.8888268156424584, + "grad_norm": 0.8085363507270813, + "learning_rate": 0.0008581232492997199, + "loss": 0.4523, + "step": 5171 + }, + { + "epoch": 2.889385474860335, + "grad_norm": 0.6199180483818054, + "learning_rate": 0.0008580952380952382, + "loss": 0.4751, + "step": 5172 + }, + { + "epoch": 2.8899441340782124, + "grad_norm": 0.43933606147766113, + "learning_rate": 0.0008580672268907563, + "loss": 0.4845, + "step": 5173 + }, + { + "epoch": 2.8905027932960894, + "grad_norm": 1.1304324865341187, + "learning_rate": 0.0008580392156862745, + "loss": 0.6311, + "step": 5174 + }, + { + "epoch": 2.8910614525139664, + "grad_norm": 0.573530912399292, + "learning_rate": 0.0008580112044817927, + "loss": 0.571, + "step": 5175 + }, + { + "epoch": 2.8916201117318434, + "grad_norm": 0.5387104749679565, + "learning_rate": 0.0008579831932773109, + "loss": 0.57, + "step": 5176 + }, + { + "epoch": 2.8921787709497204, + "grad_norm": 0.5040245652198792, + "learning_rate": 0.0008579551820728292, + "loss": 0.4616, + "step": 5177 + }, + { + "epoch": 2.892737430167598, + "grad_norm": 0.506863534450531, + "learning_rate": 0.0008579271708683473, + "loss": 0.4317, + "step": 5178 + }, + { + "epoch": 2.893296089385475, + "grad_norm": 0.5977319478988647, + "learning_rate": 0.0008578991596638655, + "loss": 0.4988, + "step": 5179 + }, + { + "epoch": 2.893854748603352, + "grad_norm": 0.6969428658485413, + "learning_rate": 0.0008578711484593837, + "loss": 0.5135, + "step": 5180 + }, + { + "epoch": 2.894413407821229, + "grad_norm": 0.6479884386062622, + "learning_rate": 0.000857843137254902, + "loss": 0.498, + "step": 5181 + }, + { + "epoch": 2.894972067039106, + "grad_norm": 0.5598331093788147, + "learning_rate": 0.0008578151260504203, + "loss": 0.438, + "step": 5182 + }, + { + "epoch": 2.8955307262569834, + "grad_norm": 0.4293232858181, + "learning_rate": 0.0008577871148459384, + "loss": 0.4494, + "step": 5183 + }, + { + "epoch": 2.8960893854748604, + "grad_norm": 0.4953780770301819, + "learning_rate": 0.0008577591036414566, + "loss": 0.3984, + "step": 5184 + }, + { + "epoch": 2.8966480446927374, + "grad_norm": 0.5538314580917358, + "learning_rate": 0.0008577310924369748, + "loss": 0.4605, + "step": 5185 + }, + { + "epoch": 2.8972067039106144, + "grad_norm": 0.610962450504303, + "learning_rate": 0.000857703081232493, + "loss": 0.5514, + "step": 5186 + }, + { + "epoch": 2.8977653631284914, + "grad_norm": 0.4831545948982239, + "learning_rate": 0.0008576750700280113, + "loss": 0.418, + "step": 5187 + }, + { + "epoch": 2.898324022346369, + "grad_norm": 0.6809319853782654, + "learning_rate": 0.0008576470588235295, + "loss": 0.5996, + "step": 5188 + }, + { + "epoch": 2.898882681564246, + "grad_norm": 0.5042480826377869, + "learning_rate": 0.0008576190476190476, + "loss": 0.5174, + "step": 5189 + }, + { + "epoch": 2.899441340782123, + "grad_norm": 0.7760380506515503, + "learning_rate": 0.0008575910364145658, + "loss": 0.4816, + "step": 5190 + }, + { + "epoch": 2.9, + "grad_norm": 0.8557183742523193, + "learning_rate": 0.000857563025210084, + "loss": 0.3988, + "step": 5191 + }, + { + "epoch": 2.900558659217877, + "grad_norm": 0.5912318825721741, + "learning_rate": 0.0008575350140056023, + "loss": 0.5366, + "step": 5192 + }, + { + "epoch": 2.9011173184357544, + "grad_norm": 0.5452551245689392, + "learning_rate": 0.0008575070028011205, + "loss": 0.4688, + "step": 5193 + }, + { + "epoch": 2.9016759776536314, + "grad_norm": 0.5104190707206726, + "learning_rate": 0.0008574789915966386, + "loss": 0.51, + "step": 5194 + }, + { + "epoch": 2.9022346368715084, + "grad_norm": 0.6214990019798279, + "learning_rate": 0.0008574509803921568, + "loss": 0.5122, + "step": 5195 + }, + { + "epoch": 2.9027932960893854, + "grad_norm": 0.5402427911758423, + "learning_rate": 0.000857422969187675, + "loss": 0.489, + "step": 5196 + }, + { + "epoch": 2.9033519553072624, + "grad_norm": 2.7312228679656982, + "learning_rate": 0.0008573949579831934, + "loss": 0.4598, + "step": 5197 + }, + { + "epoch": 2.90391061452514, + "grad_norm": 1.3614888191223145, + "learning_rate": 0.0008573669467787116, + "loss": 0.412, + "step": 5198 + }, + { + "epoch": 2.904469273743017, + "grad_norm": 0.8111847639083862, + "learning_rate": 0.0008573389355742297, + "loss": 0.4629, + "step": 5199 + }, + { + "epoch": 2.905027932960894, + "grad_norm": 0.5172328948974609, + "learning_rate": 0.0008573109243697479, + "loss": 0.5623, + "step": 5200 + }, + { + "epoch": 2.905586592178771, + "grad_norm": 0.6620856523513794, + "learning_rate": 0.0008572829131652661, + "loss": 0.476, + "step": 5201 + }, + { + "epoch": 2.906145251396648, + "grad_norm": 0.5316200852394104, + "learning_rate": 0.0008572549019607844, + "loss": 0.5059, + "step": 5202 + }, + { + "epoch": 2.9067039106145254, + "grad_norm": 0.6617650389671326, + "learning_rate": 0.0008572268907563026, + "loss": 0.4201, + "step": 5203 + }, + { + "epoch": 2.9072625698324024, + "grad_norm": 0.5596368908882141, + "learning_rate": 0.0008571988795518208, + "loss": 0.3933, + "step": 5204 + }, + { + "epoch": 2.9078212290502794, + "grad_norm": 0.8836463093757629, + "learning_rate": 0.0008571708683473389, + "loss": 0.524, + "step": 5205 + }, + { + "epoch": 2.9083798882681564, + "grad_norm": 0.8760644197463989, + "learning_rate": 0.0008571428571428571, + "loss": 0.5705, + "step": 5206 + }, + { + "epoch": 2.9089385474860334, + "grad_norm": 0.5425623655319214, + "learning_rate": 0.0008571148459383754, + "loss": 0.5588, + "step": 5207 + }, + { + "epoch": 2.909497206703911, + "grad_norm": 0.6250994205474854, + "learning_rate": 0.0008570868347338936, + "loss": 0.6644, + "step": 5208 + }, + { + "epoch": 2.9100558659217874, + "grad_norm": 1.6671067476272583, + "learning_rate": 0.0008570588235294118, + "loss": 0.6824, + "step": 5209 + }, + { + "epoch": 2.910614525139665, + "grad_norm": 0.7472813725471497, + "learning_rate": 0.0008570308123249299, + "loss": 0.6222, + "step": 5210 + }, + { + "epoch": 2.911173184357542, + "grad_norm": 0.9850180149078369, + "learning_rate": 0.0008570028011204481, + "loss": 0.497, + "step": 5211 + }, + { + "epoch": 2.911731843575419, + "grad_norm": 2.756469964981079, + "learning_rate": 0.0008569747899159664, + "loss": 0.4527, + "step": 5212 + }, + { + "epoch": 2.912290502793296, + "grad_norm": 0.7195557951927185, + "learning_rate": 0.0008569467787114846, + "loss": 0.5287, + "step": 5213 + }, + { + "epoch": 2.912849162011173, + "grad_norm": 0.47098296880722046, + "learning_rate": 0.0008569187675070029, + "loss": 0.4728, + "step": 5214 + }, + { + "epoch": 2.9134078212290504, + "grad_norm": 0.7351492047309875, + "learning_rate": 0.000856890756302521, + "loss": 0.474, + "step": 5215 + }, + { + "epoch": 2.9139664804469274, + "grad_norm": 7.006438255310059, + "learning_rate": 0.0008568627450980392, + "loss": 0.5086, + "step": 5216 + }, + { + "epoch": 2.9145251396648044, + "grad_norm": 0.4575154483318329, + "learning_rate": 0.0008568347338935575, + "loss": 0.4303, + "step": 5217 + }, + { + "epoch": 2.9150837988826814, + "grad_norm": 0.5102495551109314, + "learning_rate": 0.0008568067226890757, + "loss": 0.504, + "step": 5218 + }, + { + "epoch": 2.9156424581005584, + "grad_norm": 0.5614028573036194, + "learning_rate": 0.0008567787114845939, + "loss": 0.4173, + "step": 5219 + }, + { + "epoch": 2.916201117318436, + "grad_norm": 0.4462522268295288, + "learning_rate": 0.0008567507002801121, + "loss": 0.4039, + "step": 5220 + }, + { + "epoch": 2.916759776536313, + "grad_norm": 2.253756046295166, + "learning_rate": 0.0008567226890756302, + "loss": 0.4502, + "step": 5221 + }, + { + "epoch": 2.91731843575419, + "grad_norm": 0.6509634256362915, + "learning_rate": 0.0008566946778711485, + "loss": 0.4553, + "step": 5222 + }, + { + "epoch": 2.917877094972067, + "grad_norm": 0.5005168318748474, + "learning_rate": 0.0008566666666666667, + "loss": 0.4467, + "step": 5223 + }, + { + "epoch": 2.918435754189944, + "grad_norm": 0.9374316334724426, + "learning_rate": 0.0008566386554621849, + "loss": 0.5944, + "step": 5224 + }, + { + "epoch": 2.9189944134078214, + "grad_norm": 1.4228215217590332, + "learning_rate": 0.0008566106442577031, + "loss": 0.5214, + "step": 5225 + }, + { + "epoch": 2.9195530726256984, + "grad_norm": 0.44831714034080505, + "learning_rate": 0.0008565826330532212, + "loss": 0.3466, + "step": 5226 + }, + { + "epoch": 2.9201117318435754, + "grad_norm": 3.154331684112549, + "learning_rate": 0.0008565546218487395, + "loss": 0.6113, + "step": 5227 + }, + { + "epoch": 2.9206703910614524, + "grad_norm": 1.5173183679580688, + "learning_rate": 0.0008565266106442577, + "loss": 0.3932, + "step": 5228 + }, + { + "epoch": 2.9212290502793294, + "grad_norm": 0.4372890591621399, + "learning_rate": 0.0008564985994397759, + "loss": 0.3439, + "step": 5229 + }, + { + "epoch": 2.921787709497207, + "grad_norm": 0.44898954033851624, + "learning_rate": 0.0008564705882352942, + "loss": 0.4945, + "step": 5230 + }, + { + "epoch": 2.922346368715084, + "grad_norm": 0.4339487552642822, + "learning_rate": 0.0008564425770308122, + "loss": 0.4724, + "step": 5231 + }, + { + "epoch": 2.922905027932961, + "grad_norm": 3.6003055572509766, + "learning_rate": 0.0008564145658263306, + "loss": 0.5035, + "step": 5232 + }, + { + "epoch": 2.923463687150838, + "grad_norm": 11.896994590759277, + "learning_rate": 0.0008563865546218488, + "loss": 0.7009, + "step": 5233 + }, + { + "epoch": 2.924022346368715, + "grad_norm": 0.3272635042667389, + "learning_rate": 0.000856358543417367, + "loss": 0.3322, + "step": 5234 + }, + { + "epoch": 2.9245810055865924, + "grad_norm": 0.4550877809524536, + "learning_rate": 0.0008563305322128852, + "loss": 0.5193, + "step": 5235 + }, + { + "epoch": 2.9251396648044694, + "grad_norm": 0.4681559205055237, + "learning_rate": 0.0008563025210084034, + "loss": 0.4387, + "step": 5236 + }, + { + "epoch": 2.9256983240223464, + "grad_norm": 0.7069253325462341, + "learning_rate": 0.0008562745098039216, + "loss": 0.4566, + "step": 5237 + }, + { + "epoch": 2.9262569832402234, + "grad_norm": 0.3951936662197113, + "learning_rate": 0.0008562464985994398, + "loss": 0.4267, + "step": 5238 + }, + { + "epoch": 2.9268156424581004, + "grad_norm": 2.9395580291748047, + "learning_rate": 0.000856218487394958, + "loss": 0.6986, + "step": 5239 + }, + { + "epoch": 2.927374301675978, + "grad_norm": 0.4268450140953064, + "learning_rate": 0.0008561904761904762, + "loss": 0.3528, + "step": 5240 + }, + { + "epoch": 2.927932960893855, + "grad_norm": 0.5216578245162964, + "learning_rate": 0.0008561624649859944, + "loss": 0.4481, + "step": 5241 + }, + { + "epoch": 2.928491620111732, + "grad_norm": 0.8292457461357117, + "learning_rate": 0.0008561344537815126, + "loss": 0.6018, + "step": 5242 + }, + { + "epoch": 2.929050279329609, + "grad_norm": 0.4213391840457916, + "learning_rate": 0.0008561064425770308, + "loss": 0.3861, + "step": 5243 + }, + { + "epoch": 2.929608938547486, + "grad_norm": 0.887485921382904, + "learning_rate": 0.000856078431372549, + "loss": 0.4673, + "step": 5244 + }, + { + "epoch": 2.9301675977653634, + "grad_norm": 0.4789484739303589, + "learning_rate": 0.0008560504201680672, + "loss": 0.3961, + "step": 5245 + }, + { + "epoch": 2.93072625698324, + "grad_norm": 0.6058626770973206, + "learning_rate": 0.0008560224089635854, + "loss": 0.502, + "step": 5246 + }, + { + "epoch": 2.9312849162011174, + "grad_norm": 0.5269392132759094, + "learning_rate": 0.0008559943977591037, + "loss": 0.4484, + "step": 5247 + }, + { + "epoch": 2.9318435754189944, + "grad_norm": 0.6030774712562561, + "learning_rate": 0.0008559663865546219, + "loss": 0.3759, + "step": 5248 + }, + { + "epoch": 2.9324022346368714, + "grad_norm": 0.6322498321533203, + "learning_rate": 0.0008559383753501401, + "loss": 0.4797, + "step": 5249 + }, + { + "epoch": 2.9329608938547485, + "grad_norm": 0.7021874189376831, + "learning_rate": 0.0008559103641456583, + "loss": 0.6254, + "step": 5250 + }, + { + "epoch": 2.9335195530726255, + "grad_norm": 1.421185851097107, + "learning_rate": 0.0008558823529411765, + "loss": 0.5824, + "step": 5251 + }, + { + "epoch": 2.934078212290503, + "grad_norm": 0.560073971748352, + "learning_rate": 0.0008558543417366948, + "loss": 0.629, + "step": 5252 + }, + { + "epoch": 2.93463687150838, + "grad_norm": 0.7694107890129089, + "learning_rate": 0.0008558263305322129, + "loss": 0.4859, + "step": 5253 + }, + { + "epoch": 2.935195530726257, + "grad_norm": 0.6547850966453552, + "learning_rate": 0.0008557983193277311, + "loss": 0.4928, + "step": 5254 + }, + { + "epoch": 2.935754189944134, + "grad_norm": 0.7182886600494385, + "learning_rate": 0.0008557703081232493, + "loss": 0.6571, + "step": 5255 + }, + { + "epoch": 2.936312849162011, + "grad_norm": 0.5350194573402405, + "learning_rate": 0.0008557422969187675, + "loss": 0.477, + "step": 5256 + }, + { + "epoch": 2.9368715083798884, + "grad_norm": 0.6255475282669067, + "learning_rate": 0.0008557142857142858, + "loss": 0.4823, + "step": 5257 + }, + { + "epoch": 2.9374301675977654, + "grad_norm": 3.8309788703918457, + "learning_rate": 0.0008556862745098039, + "loss": 0.467, + "step": 5258 + }, + { + "epoch": 2.9379888268156424, + "grad_norm": 0.5530065894126892, + "learning_rate": 0.0008556582633053221, + "loss": 0.414, + "step": 5259 + }, + { + "epoch": 2.9385474860335195, + "grad_norm": 0.4978122115135193, + "learning_rate": 0.0008556302521008403, + "loss": 0.4778, + "step": 5260 + }, + { + "epoch": 2.9391061452513965, + "grad_norm": 1.2582539319992065, + "learning_rate": 0.0008556022408963585, + "loss": 0.5299, + "step": 5261 + }, + { + "epoch": 2.939664804469274, + "grad_norm": 0.7021863460540771, + "learning_rate": 0.0008555742296918769, + "loss": 0.4803, + "step": 5262 + }, + { + "epoch": 2.940223463687151, + "grad_norm": 0.601661205291748, + "learning_rate": 0.000855546218487395, + "loss": 0.4241, + "step": 5263 + }, + { + "epoch": 2.940782122905028, + "grad_norm": 0.8026847243309021, + "learning_rate": 0.0008555182072829132, + "loss": 0.4989, + "step": 5264 + }, + { + "epoch": 2.941340782122905, + "grad_norm": 1.6611796617507935, + "learning_rate": 0.0008554901960784314, + "loss": 0.4394, + "step": 5265 + }, + { + "epoch": 2.941899441340782, + "grad_norm": 1.6839842796325684, + "learning_rate": 0.0008554621848739496, + "loss": 0.437, + "step": 5266 + }, + { + "epoch": 2.9424581005586594, + "grad_norm": 0.40616342425346375, + "learning_rate": 0.0008554341736694679, + "loss": 0.4184, + "step": 5267 + }, + { + "epoch": 2.9430167597765364, + "grad_norm": 0.5075446367263794, + "learning_rate": 0.0008554061624649861, + "loss": 0.4567, + "step": 5268 + }, + { + "epoch": 2.9435754189944134, + "grad_norm": 0.4084770679473877, + "learning_rate": 0.0008553781512605042, + "loss": 0.4064, + "step": 5269 + }, + { + "epoch": 2.9441340782122905, + "grad_norm": 0.5572300553321838, + "learning_rate": 0.0008553501400560224, + "loss": 0.367, + "step": 5270 + }, + { + "epoch": 2.9446927374301675, + "grad_norm": 0.44329598546028137, + "learning_rate": 0.0008553221288515406, + "loss": 0.4105, + "step": 5271 + }, + { + "epoch": 2.945251396648045, + "grad_norm": 1.13699209690094, + "learning_rate": 0.0008552941176470589, + "loss": 0.4623, + "step": 5272 + }, + { + "epoch": 2.945810055865922, + "grad_norm": 0.5589644908905029, + "learning_rate": 0.0008552661064425771, + "loss": 0.4697, + "step": 5273 + }, + { + "epoch": 2.946368715083799, + "grad_norm": 0.5557832717895508, + "learning_rate": 0.0008552380952380952, + "loss": 0.5364, + "step": 5274 + }, + { + "epoch": 2.946927374301676, + "grad_norm": 0.9397745728492737, + "learning_rate": 0.0008552100840336134, + "loss": 0.4608, + "step": 5275 + }, + { + "epoch": 2.947486033519553, + "grad_norm": 1.5104057788848877, + "learning_rate": 0.0008551820728291316, + "loss": 0.5783, + "step": 5276 + }, + { + "epoch": 2.9480446927374304, + "grad_norm": 0.6010105013847351, + "learning_rate": 0.0008551540616246499, + "loss": 0.5162, + "step": 5277 + }, + { + "epoch": 2.9486033519553074, + "grad_norm": 0.43357473611831665, + "learning_rate": 0.0008551260504201681, + "loss": 0.5003, + "step": 5278 + }, + { + "epoch": 2.9491620111731844, + "grad_norm": 6.203266620635986, + "learning_rate": 0.0008550980392156862, + "loss": 0.4421, + "step": 5279 + }, + { + "epoch": 2.9497206703910615, + "grad_norm": 1.873213291168213, + "learning_rate": 0.0008550700280112045, + "loss": 0.5304, + "step": 5280 + }, + { + "epoch": 2.9502793296089385, + "grad_norm": 0.6766135692596436, + "learning_rate": 0.0008550420168067227, + "loss": 0.409, + "step": 5281 + }, + { + "epoch": 2.950837988826816, + "grad_norm": 7.697422981262207, + "learning_rate": 0.000855014005602241, + "loss": 0.4847, + "step": 5282 + }, + { + "epoch": 2.9513966480446925, + "grad_norm": 0.6321967840194702, + "learning_rate": 0.0008549859943977592, + "loss": 0.525, + "step": 5283 + }, + { + "epoch": 2.95195530726257, + "grad_norm": 0.7350785732269287, + "learning_rate": 0.0008549579831932774, + "loss": 0.6233, + "step": 5284 + }, + { + "epoch": 2.952513966480447, + "grad_norm": 0.5767834782600403, + "learning_rate": 0.0008549299719887955, + "loss": 0.4759, + "step": 5285 + }, + { + "epoch": 2.953072625698324, + "grad_norm": 0.5822747349739075, + "learning_rate": 0.0008549019607843137, + "loss": 0.471, + "step": 5286 + }, + { + "epoch": 2.953631284916201, + "grad_norm": 0.6781153678894043, + "learning_rate": 0.000854873949579832, + "loss": 0.4501, + "step": 5287 + }, + { + "epoch": 2.954189944134078, + "grad_norm": 1.9439740180969238, + "learning_rate": 0.0008548459383753502, + "loss": 0.4536, + "step": 5288 + }, + { + "epoch": 2.9547486033519554, + "grad_norm": 0.7673845887184143, + "learning_rate": 0.0008548179271708684, + "loss": 0.5123, + "step": 5289 + }, + { + "epoch": 2.9553072625698324, + "grad_norm": 1.6544477939605713, + "learning_rate": 0.0008547899159663865, + "loss": 0.6024, + "step": 5290 + }, + { + "epoch": 2.9558659217877095, + "grad_norm": 0.5968123078346252, + "learning_rate": 0.0008547619047619047, + "loss": 0.4345, + "step": 5291 + }, + { + "epoch": 2.9564245810055865, + "grad_norm": 0.6262439489364624, + "learning_rate": 0.000854733893557423, + "loss": 0.5571, + "step": 5292 + }, + { + "epoch": 2.9569832402234635, + "grad_norm": 0.7656193375587463, + "learning_rate": 0.0008547058823529412, + "loss": 0.6512, + "step": 5293 + }, + { + "epoch": 2.957541899441341, + "grad_norm": 0.5560053586959839, + "learning_rate": 0.0008546778711484594, + "loss": 0.7462, + "step": 5294 + }, + { + "epoch": 2.958100558659218, + "grad_norm": 0.5246129035949707, + "learning_rate": 0.0008546498599439775, + "loss": 0.4772, + "step": 5295 + }, + { + "epoch": 2.958659217877095, + "grad_norm": 0.7463229298591614, + "learning_rate": 0.0008546218487394957, + "loss": 0.4147, + "step": 5296 + }, + { + "epoch": 2.959217877094972, + "grad_norm": 0.5940794944763184, + "learning_rate": 0.0008545938375350141, + "loss": 0.3936, + "step": 5297 + }, + { + "epoch": 2.959776536312849, + "grad_norm": 0.8366613984107971, + "learning_rate": 0.0008545658263305323, + "loss": 0.4459, + "step": 5298 + }, + { + "epoch": 2.9603351955307264, + "grad_norm": 0.6187811493873596, + "learning_rate": 0.0008545378151260505, + "loss": 0.4231, + "step": 5299 + }, + { + "epoch": 2.9608938547486034, + "grad_norm": 0.7047716379165649, + "learning_rate": 0.0008545098039215687, + "loss": 0.4942, + "step": 5300 + }, + { + "epoch": 2.9614525139664805, + "grad_norm": 0.8810130953788757, + "learning_rate": 0.0008544817927170868, + "loss": 0.487, + "step": 5301 + }, + { + "epoch": 2.9620111731843575, + "grad_norm": 0.864976704120636, + "learning_rate": 0.0008544537815126051, + "loss": 0.5124, + "step": 5302 + }, + { + "epoch": 2.9625698324022345, + "grad_norm": 0.6067531704902649, + "learning_rate": 0.0008544257703081233, + "loss": 0.5235, + "step": 5303 + }, + { + "epoch": 2.963128491620112, + "grad_norm": 0.6633782982826233, + "learning_rate": 0.0008543977591036415, + "loss": 0.5902, + "step": 5304 + }, + { + "epoch": 2.963687150837989, + "grad_norm": 0.5539004802703857, + "learning_rate": 0.0008543697478991597, + "loss": 0.4846, + "step": 5305 + }, + { + "epoch": 2.964245810055866, + "grad_norm": 0.8036938309669495, + "learning_rate": 0.0008543417366946778, + "loss": 0.5442, + "step": 5306 + }, + { + "epoch": 2.964804469273743, + "grad_norm": 0.47018709778785706, + "learning_rate": 0.0008543137254901961, + "loss": 0.541, + "step": 5307 + }, + { + "epoch": 2.96536312849162, + "grad_norm": 0.6300920844078064, + "learning_rate": 0.0008542857142857143, + "loss": 0.501, + "step": 5308 + }, + { + "epoch": 2.9659217877094974, + "grad_norm": 1.424096703529358, + "learning_rate": 0.0008542577030812325, + "loss": 0.4753, + "step": 5309 + }, + { + "epoch": 2.9664804469273744, + "grad_norm": 0.5059372782707214, + "learning_rate": 0.0008542296918767507, + "loss": 0.4065, + "step": 5310 + }, + { + "epoch": 2.9670391061452515, + "grad_norm": 0.9171116948127747, + "learning_rate": 0.0008542016806722688, + "loss": 0.4752, + "step": 5311 + }, + { + "epoch": 2.9675977653631285, + "grad_norm": 1.5030795335769653, + "learning_rate": 0.0008541736694677872, + "loss": 0.5334, + "step": 5312 + }, + { + "epoch": 2.9681564245810055, + "grad_norm": 0.91139817237854, + "learning_rate": 0.0008541456582633054, + "loss": 0.464, + "step": 5313 + }, + { + "epoch": 2.968715083798883, + "grad_norm": 0.5637885928153992, + "learning_rate": 0.0008541176470588236, + "loss": 0.4687, + "step": 5314 + }, + { + "epoch": 2.9692737430167595, + "grad_norm": 0.44936779141426086, + "learning_rate": 0.0008540896358543418, + "loss": 0.5043, + "step": 5315 + }, + { + "epoch": 2.969832402234637, + "grad_norm": 1.3325906991958618, + "learning_rate": 0.00085406162464986, + "loss": 0.4937, + "step": 5316 + }, + { + "epoch": 2.970391061452514, + "grad_norm": 3.632030487060547, + "learning_rate": 0.0008540336134453782, + "loss": 0.3722, + "step": 5317 + }, + { + "epoch": 2.970949720670391, + "grad_norm": 0.7179813981056213, + "learning_rate": 0.0008540056022408964, + "loss": 0.4831, + "step": 5318 + }, + { + "epoch": 2.971508379888268, + "grad_norm": 0.8740227818489075, + "learning_rate": 0.0008539775910364146, + "loss": 0.5092, + "step": 5319 + }, + { + "epoch": 2.972067039106145, + "grad_norm": 1.2245211601257324, + "learning_rate": 0.0008539495798319328, + "loss": 0.5828, + "step": 5320 + }, + { + "epoch": 2.9726256983240225, + "grad_norm": 0.5264342427253723, + "learning_rate": 0.000853921568627451, + "loss": 0.4307, + "step": 5321 + }, + { + "epoch": 2.9731843575418995, + "grad_norm": 1.093178629875183, + "learning_rate": 0.0008538935574229692, + "loss": 0.5418, + "step": 5322 + }, + { + "epoch": 2.9737430167597765, + "grad_norm": 1.6047625541687012, + "learning_rate": 0.0008538655462184874, + "loss": 0.5896, + "step": 5323 + }, + { + "epoch": 2.9743016759776535, + "grad_norm": 0.6026519536972046, + "learning_rate": 0.0008538375350140056, + "loss": 0.4694, + "step": 5324 + }, + { + "epoch": 2.9748603351955305, + "grad_norm": 0.44366228580474854, + "learning_rate": 0.0008538095238095238, + "loss": 0.475, + "step": 5325 + }, + { + "epoch": 2.975418994413408, + "grad_norm": 0.9205455780029297, + "learning_rate": 0.000853781512605042, + "loss": 0.5779, + "step": 5326 + }, + { + "epoch": 2.975977653631285, + "grad_norm": 0.8671701550483704, + "learning_rate": 0.0008537535014005602, + "loss": 0.4666, + "step": 5327 + }, + { + "epoch": 2.976536312849162, + "grad_norm": 0.6516280174255371, + "learning_rate": 0.0008537254901960784, + "loss": 0.3994, + "step": 5328 + }, + { + "epoch": 2.977094972067039, + "grad_norm": 0.47651827335357666, + "learning_rate": 0.0008536974789915967, + "loss": 0.4897, + "step": 5329 + }, + { + "epoch": 2.977653631284916, + "grad_norm": 1.0731016397476196, + "learning_rate": 0.0008536694677871149, + "loss": 0.5126, + "step": 5330 + }, + { + "epoch": 2.9782122905027935, + "grad_norm": 0.5467098951339722, + "learning_rate": 0.0008536414565826331, + "loss": 0.5058, + "step": 5331 + }, + { + "epoch": 2.9787709497206705, + "grad_norm": 0.6360552310943604, + "learning_rate": 0.0008536134453781514, + "loss": 0.6286, + "step": 5332 + }, + { + "epoch": 2.9793296089385475, + "grad_norm": 0.561384916305542, + "learning_rate": 0.0008535854341736695, + "loss": 0.4836, + "step": 5333 + }, + { + "epoch": 2.9798882681564245, + "grad_norm": 0.4336279034614563, + "learning_rate": 0.0008535574229691877, + "loss": 0.4819, + "step": 5334 + }, + { + "epoch": 2.9804469273743015, + "grad_norm": 0.4420956075191498, + "learning_rate": 0.0008535294117647059, + "loss": 0.4715, + "step": 5335 + }, + { + "epoch": 2.981005586592179, + "grad_norm": 6.295414924621582, + "learning_rate": 0.0008535014005602241, + "loss": 0.3881, + "step": 5336 + }, + { + "epoch": 2.981564245810056, + "grad_norm": 0.6499784588813782, + "learning_rate": 0.0008534733893557424, + "loss": 0.4992, + "step": 5337 + }, + { + "epoch": 2.982122905027933, + "grad_norm": 0.8571310639381409, + "learning_rate": 0.0008534453781512605, + "loss": 0.4905, + "step": 5338 + }, + { + "epoch": 2.98268156424581, + "grad_norm": 0.514336884021759, + "learning_rate": 0.0008534173669467787, + "loss": 0.4949, + "step": 5339 + }, + { + "epoch": 2.983240223463687, + "grad_norm": 5.9147443771362305, + "learning_rate": 0.0008533893557422969, + "loss": 0.4681, + "step": 5340 + }, + { + "epoch": 2.9837988826815645, + "grad_norm": 1.3518317937850952, + "learning_rate": 0.0008533613445378151, + "loss": 0.4052, + "step": 5341 + }, + { + "epoch": 2.9843575418994415, + "grad_norm": 0.9077181220054626, + "learning_rate": 0.0008533333333333334, + "loss": 0.482, + "step": 5342 + }, + { + "epoch": 2.9849162011173185, + "grad_norm": 0.7650498747825623, + "learning_rate": 0.0008533053221288515, + "loss": 0.4926, + "step": 5343 + }, + { + "epoch": 2.9854748603351955, + "grad_norm": 0.5614609718322754, + "learning_rate": 0.0008532773109243697, + "loss": 0.5085, + "step": 5344 + }, + { + "epoch": 2.9860335195530725, + "grad_norm": 4.789839267730713, + "learning_rate": 0.000853249299719888, + "loss": 0.4963, + "step": 5345 + }, + { + "epoch": 2.98659217877095, + "grad_norm": 0.6768234372138977, + "learning_rate": 0.0008532212885154062, + "loss": 0.3343, + "step": 5346 + }, + { + "epoch": 2.987150837988827, + "grad_norm": 0.8178666830062866, + "learning_rate": 0.0008531932773109245, + "loss": 0.5221, + "step": 5347 + }, + { + "epoch": 2.987709497206704, + "grad_norm": 0.5692423582077026, + "learning_rate": 0.0008531652661064427, + "loss": 0.4396, + "step": 5348 + }, + { + "epoch": 2.988268156424581, + "grad_norm": 0.4675418436527252, + "learning_rate": 0.0008531372549019608, + "loss": 0.3618, + "step": 5349 + }, + { + "epoch": 2.988826815642458, + "grad_norm": 0.6296323537826538, + "learning_rate": 0.000853109243697479, + "loss": 0.4867, + "step": 5350 + }, + { + "epoch": 2.9893854748603355, + "grad_norm": 0.8342413902282715, + "learning_rate": 0.0008530812324929972, + "loss": 0.3928, + "step": 5351 + }, + { + "epoch": 2.989944134078212, + "grad_norm": 0.6714411377906799, + "learning_rate": 0.0008530532212885154, + "loss": 0.4725, + "step": 5352 + }, + { + "epoch": 2.9905027932960895, + "grad_norm": 0.4294596016407013, + "learning_rate": 0.0008530252100840337, + "loss": 0.4436, + "step": 5353 + }, + { + "epoch": 2.9910614525139665, + "grad_norm": 0.42483019828796387, + "learning_rate": 0.0008529971988795518, + "loss": 0.3971, + "step": 5354 + }, + { + "epoch": 2.9916201117318435, + "grad_norm": 0.6476815342903137, + "learning_rate": 0.00085296918767507, + "loss": 0.6176, + "step": 5355 + }, + { + "epoch": 2.9921787709497205, + "grad_norm": 0.6838845014572144, + "learning_rate": 0.0008529411764705882, + "loss": 0.5515, + "step": 5356 + }, + { + "epoch": 2.9927374301675975, + "grad_norm": 2.527945041656494, + "learning_rate": 0.0008529131652661064, + "loss": 0.4539, + "step": 5357 + }, + { + "epoch": 2.993296089385475, + "grad_norm": 0.5473175048828125, + "learning_rate": 0.0008528851540616247, + "loss": 0.4663, + "step": 5358 + }, + { + "epoch": 2.993854748603352, + "grad_norm": 0.5364380478858948, + "learning_rate": 0.0008528571428571428, + "loss": 0.5466, + "step": 5359 + }, + { + "epoch": 2.994413407821229, + "grad_norm": 0.7082886099815369, + "learning_rate": 0.000852829131652661, + "loss": 0.3553, + "step": 5360 + }, + { + "epoch": 2.994972067039106, + "grad_norm": 19.685489654541016, + "learning_rate": 0.0008528011204481792, + "loss": 0.4668, + "step": 5361 + }, + { + "epoch": 2.995530726256983, + "grad_norm": 0.7132713794708252, + "learning_rate": 0.0008527731092436975, + "loss": 0.4904, + "step": 5362 + }, + { + "epoch": 2.9960893854748605, + "grad_norm": 0.6538237929344177, + "learning_rate": 0.0008527450980392158, + "loss": 0.5379, + "step": 5363 + }, + { + "epoch": 2.9966480446927375, + "grad_norm": 2.562812089920044, + "learning_rate": 0.000852717086834734, + "loss": 0.5682, + "step": 5364 + }, + { + "epoch": 2.9972067039106145, + "grad_norm": 0.47311344742774963, + "learning_rate": 0.0008526890756302521, + "loss": 0.4406, + "step": 5365 + }, + { + "epoch": 2.9977653631284915, + "grad_norm": 0.38964036107063293, + "learning_rate": 0.0008526610644257703, + "loss": 0.3839, + "step": 5366 + }, + { + "epoch": 2.9983240223463685, + "grad_norm": 0.43479278683662415, + "learning_rate": 0.0008526330532212885, + "loss": 0.4989, + "step": 5367 + }, + { + "epoch": 2.998882681564246, + "grad_norm": 0.530277669429779, + "learning_rate": 0.0008526050420168068, + "loss": 0.5573, + "step": 5368 + }, + { + "epoch": 2.999441340782123, + "grad_norm": 2.3859963417053223, + "learning_rate": 0.000852577030812325, + "loss": 0.6129, + "step": 5369 + }, + { + "epoch": 3.0, + "grad_norm": 0.8000469207763672, + "learning_rate": 0.0008525490196078431, + "loss": 0.6314, + "step": 5370 + }, + { + "epoch": 3.000558659217877, + "grad_norm": 0.3947930634021759, + "learning_rate": 0.0008525210084033613, + "loss": 0.4098, + "step": 5371 + }, + { + "epoch": 3.001117318435754, + "grad_norm": 0.6759277582168579, + "learning_rate": 0.0008524929971988795, + "loss": 0.5769, + "step": 5372 + }, + { + "epoch": 3.0016759776536315, + "grad_norm": 0.8616467118263245, + "learning_rate": 0.0008524649859943978, + "loss": 0.5528, + "step": 5373 + }, + { + "epoch": 3.0022346368715085, + "grad_norm": 0.4359557628631592, + "learning_rate": 0.000852436974789916, + "loss": 0.5051, + "step": 5374 + }, + { + "epoch": 3.0027932960893855, + "grad_norm": 0.6852437257766724, + "learning_rate": 0.0008524089635854341, + "loss": 0.4135, + "step": 5375 + }, + { + "epoch": 3.0033519553072625, + "grad_norm": 0.6340899467468262, + "learning_rate": 0.0008523809523809523, + "loss": 0.4783, + "step": 5376 + }, + { + "epoch": 3.0039106145251395, + "grad_norm": 0.42108026146888733, + "learning_rate": 0.0008523529411764705, + "loss": 0.4335, + "step": 5377 + }, + { + "epoch": 3.004469273743017, + "grad_norm": 0.4909720718860626, + "learning_rate": 0.0008523249299719889, + "loss": 0.4557, + "step": 5378 + }, + { + "epoch": 3.005027932960894, + "grad_norm": 0.5477637052536011, + "learning_rate": 0.0008522969187675071, + "loss": 0.4057, + "step": 5379 + }, + { + "epoch": 3.005586592178771, + "grad_norm": 0.8505667448043823, + "learning_rate": 0.0008522689075630253, + "loss": 0.5986, + "step": 5380 + }, + { + "epoch": 3.006145251396648, + "grad_norm": 0.5427995324134827, + "learning_rate": 0.0008522408963585434, + "loss": 0.5046, + "step": 5381 + }, + { + "epoch": 3.006703910614525, + "grad_norm": 0.6454780697822571, + "learning_rate": 0.0008522128851540616, + "loss": 0.4516, + "step": 5382 + }, + { + "epoch": 3.007262569832402, + "grad_norm": 0.5145334601402283, + "learning_rate": 0.0008521848739495799, + "loss": 0.4073, + "step": 5383 + }, + { + "epoch": 3.0078212290502795, + "grad_norm": 0.40927255153656006, + "learning_rate": 0.0008521568627450981, + "loss": 0.4487, + "step": 5384 + }, + { + "epoch": 3.0083798882681565, + "grad_norm": 1.5024330615997314, + "learning_rate": 0.0008521288515406163, + "loss": 0.4863, + "step": 5385 + }, + { + "epoch": 3.0089385474860335, + "grad_norm": 0.4533179998397827, + "learning_rate": 0.0008521008403361344, + "loss": 0.3654, + "step": 5386 + }, + { + "epoch": 3.0094972067039105, + "grad_norm": 0.8234923481941223, + "learning_rate": 0.0008520728291316526, + "loss": 0.5443, + "step": 5387 + }, + { + "epoch": 3.0100558659217875, + "grad_norm": 1.0196948051452637, + "learning_rate": 0.0008520448179271709, + "loss": 0.3855, + "step": 5388 + }, + { + "epoch": 3.010614525139665, + "grad_norm": 0.38857924938201904, + "learning_rate": 0.0008520168067226891, + "loss": 0.3629, + "step": 5389 + }, + { + "epoch": 3.011173184357542, + "grad_norm": 0.7608642578125, + "learning_rate": 0.0008519887955182073, + "loss": 0.4688, + "step": 5390 + }, + { + "epoch": 3.011731843575419, + "grad_norm": 0.7476602792739868, + "learning_rate": 0.0008519607843137254, + "loss": 0.5045, + "step": 5391 + }, + { + "epoch": 3.012290502793296, + "grad_norm": 0.4268593192100525, + "learning_rate": 0.0008519327731092436, + "loss": 0.4032, + "step": 5392 + }, + { + "epoch": 3.012849162011173, + "grad_norm": 0.847745954990387, + "learning_rate": 0.0008519047619047619, + "loss": 0.4937, + "step": 5393 + }, + { + "epoch": 3.0134078212290505, + "grad_norm": 0.5917583703994751, + "learning_rate": 0.0008518767507002802, + "loss": 0.4772, + "step": 5394 + }, + { + "epoch": 3.0139664804469275, + "grad_norm": 2.905977487564087, + "learning_rate": 0.0008518487394957984, + "loss": 0.4522, + "step": 5395 + }, + { + "epoch": 3.0145251396648045, + "grad_norm": 0.7927609086036682, + "learning_rate": 0.0008518207282913166, + "loss": 0.5007, + "step": 5396 + }, + { + "epoch": 3.0150837988826815, + "grad_norm": 0.4961709976196289, + "learning_rate": 0.0008517927170868347, + "loss": 0.4821, + "step": 5397 + }, + { + "epoch": 3.0156424581005585, + "grad_norm": 0.4693697988986969, + "learning_rate": 0.000851764705882353, + "loss": 0.431, + "step": 5398 + }, + { + "epoch": 3.0162011173184355, + "grad_norm": 0.5396574139595032, + "learning_rate": 0.0008517366946778712, + "loss": 0.4525, + "step": 5399 + }, + { + "epoch": 3.016759776536313, + "grad_norm": 0.7243140935897827, + "learning_rate": 0.0008517086834733894, + "loss": 0.5513, + "step": 5400 + }, + { + "epoch": 3.01731843575419, + "grad_norm": 0.7669305205345154, + "learning_rate": 0.0008516806722689076, + "loss": 0.4388, + "step": 5401 + }, + { + "epoch": 3.017877094972067, + "grad_norm": 2.39949893951416, + "learning_rate": 0.0008516526610644257, + "loss": 0.4158, + "step": 5402 + }, + { + "epoch": 3.018435754189944, + "grad_norm": 0.6900848150253296, + "learning_rate": 0.000851624649859944, + "loss": 0.4889, + "step": 5403 + }, + { + "epoch": 3.018994413407821, + "grad_norm": 0.5717877745628357, + "learning_rate": 0.0008515966386554622, + "loss": 0.4642, + "step": 5404 + }, + { + "epoch": 3.0195530726256985, + "grad_norm": 1.1549183130264282, + "learning_rate": 0.0008515686274509804, + "loss": 0.544, + "step": 5405 + }, + { + "epoch": 3.0201117318435755, + "grad_norm": 0.7069724798202515, + "learning_rate": 0.0008515406162464986, + "loss": 0.6365, + "step": 5406 + }, + { + "epoch": 3.0206703910614525, + "grad_norm": 0.7818683385848999, + "learning_rate": 0.0008515126050420167, + "loss": 0.6091, + "step": 5407 + }, + { + "epoch": 3.0212290502793295, + "grad_norm": 0.6126403212547302, + "learning_rate": 0.000851484593837535, + "loss": 0.4354, + "step": 5408 + }, + { + "epoch": 3.0217877094972065, + "grad_norm": 1.1813786029815674, + "learning_rate": 0.0008514565826330532, + "loss": 0.4934, + "step": 5409 + }, + { + "epoch": 3.022346368715084, + "grad_norm": 0.4681362509727478, + "learning_rate": 0.0008514285714285714, + "loss": 0.5089, + "step": 5410 + }, + { + "epoch": 3.022905027932961, + "grad_norm": 0.609001636505127, + "learning_rate": 0.0008514005602240897, + "loss": 0.4533, + "step": 5411 + }, + { + "epoch": 3.023463687150838, + "grad_norm": 0.6924839019775391, + "learning_rate": 0.0008513725490196079, + "loss": 0.4698, + "step": 5412 + }, + { + "epoch": 3.024022346368715, + "grad_norm": 0.5728451013565063, + "learning_rate": 0.0008513445378151261, + "loss": 0.4306, + "step": 5413 + }, + { + "epoch": 3.024581005586592, + "grad_norm": 1.0006122589111328, + "learning_rate": 0.0008513165266106443, + "loss": 0.5292, + "step": 5414 + }, + { + "epoch": 3.0251396648044695, + "grad_norm": 0.5900217890739441, + "learning_rate": 0.0008512885154061625, + "loss": 0.417, + "step": 5415 + }, + { + "epoch": 3.0256983240223465, + "grad_norm": 0.849025309085846, + "learning_rate": 0.0008512605042016807, + "loss": 0.3924, + "step": 5416 + }, + { + "epoch": 3.0262569832402235, + "grad_norm": 1.138856053352356, + "learning_rate": 0.0008512324929971989, + "loss": 0.5554, + "step": 5417 + }, + { + "epoch": 3.0268156424581005, + "grad_norm": 1.0687626600265503, + "learning_rate": 0.0008512044817927171, + "loss": 0.4906, + "step": 5418 + }, + { + "epoch": 3.0273743016759775, + "grad_norm": 0.3479156494140625, + "learning_rate": 0.0008511764705882353, + "loss": 0.4567, + "step": 5419 + }, + { + "epoch": 3.0279329608938546, + "grad_norm": 0.45297959446907043, + "learning_rate": 0.0008511484593837535, + "loss": 0.3967, + "step": 5420 + }, + { + "epoch": 3.028491620111732, + "grad_norm": 1.5802494287490845, + "learning_rate": 0.0008511204481792717, + "loss": 0.4969, + "step": 5421 + }, + { + "epoch": 3.029050279329609, + "grad_norm": 3.114891290664673, + "learning_rate": 0.0008510924369747899, + "loss": 0.4723, + "step": 5422 + }, + { + "epoch": 3.029608938547486, + "grad_norm": 0.4734114110469818, + "learning_rate": 0.0008510644257703082, + "loss": 0.4032, + "step": 5423 + }, + { + "epoch": 3.030167597765363, + "grad_norm": 0.5840662121772766, + "learning_rate": 0.0008510364145658263, + "loss": 0.4633, + "step": 5424 + }, + { + "epoch": 3.03072625698324, + "grad_norm": 0.6865155100822449, + "learning_rate": 0.0008510084033613445, + "loss": 0.4043, + "step": 5425 + }, + { + "epoch": 3.0312849162011175, + "grad_norm": 0.5602957010269165, + "learning_rate": 0.0008509803921568627, + "loss": 0.4446, + "step": 5426 + }, + { + "epoch": 3.0318435754189945, + "grad_norm": 0.45751628279685974, + "learning_rate": 0.000850952380952381, + "loss": 0.431, + "step": 5427 + }, + { + "epoch": 3.0324022346368715, + "grad_norm": 0.979215681552887, + "learning_rate": 0.0008509243697478993, + "loss": 0.5194, + "step": 5428 + }, + { + "epoch": 3.0329608938547485, + "grad_norm": 0.4992451071739197, + "learning_rate": 0.0008508963585434174, + "loss": 0.4721, + "step": 5429 + }, + { + "epoch": 3.0335195530726256, + "grad_norm": 0.4723842442035675, + "learning_rate": 0.0008508683473389356, + "loss": 0.3636, + "step": 5430 + }, + { + "epoch": 3.034078212290503, + "grad_norm": 1.1194945573806763, + "learning_rate": 0.0008508403361344538, + "loss": 0.4854, + "step": 5431 + }, + { + "epoch": 3.03463687150838, + "grad_norm": 0.6002005934715271, + "learning_rate": 0.000850812324929972, + "loss": 0.5133, + "step": 5432 + }, + { + "epoch": 3.035195530726257, + "grad_norm": 0.609556257724762, + "learning_rate": 0.0008507843137254903, + "loss": 0.3968, + "step": 5433 + }, + { + "epoch": 3.035754189944134, + "grad_norm": 0.5571757555007935, + "learning_rate": 0.0008507563025210084, + "loss": 0.6153, + "step": 5434 + }, + { + "epoch": 3.036312849162011, + "grad_norm": 0.46065160632133484, + "learning_rate": 0.0008507282913165266, + "loss": 0.545, + "step": 5435 + }, + { + "epoch": 3.036871508379888, + "grad_norm": 0.4290101230144501, + "learning_rate": 0.0008507002801120448, + "loss": 0.4854, + "step": 5436 + }, + { + "epoch": 3.0374301675977655, + "grad_norm": 0.8071284890174866, + "learning_rate": 0.000850672268907563, + "loss": 0.5238, + "step": 5437 + }, + { + "epoch": 3.0379888268156425, + "grad_norm": 0.6952858567237854, + "learning_rate": 0.0008506442577030813, + "loss": 0.489, + "step": 5438 + }, + { + "epoch": 3.0385474860335195, + "grad_norm": 0.6648319959640503, + "learning_rate": 0.0008506162464985995, + "loss": 0.597, + "step": 5439 + }, + { + "epoch": 3.0391061452513966, + "grad_norm": 2.418651580810547, + "learning_rate": 0.0008505882352941176, + "loss": 0.3934, + "step": 5440 + }, + { + "epoch": 3.0396648044692736, + "grad_norm": 0.6917094588279724, + "learning_rate": 0.0008505602240896358, + "loss": 0.4138, + "step": 5441 + }, + { + "epoch": 3.040223463687151, + "grad_norm": 0.5461602807044983, + "learning_rate": 0.000850532212885154, + "loss": 0.4808, + "step": 5442 + }, + { + "epoch": 3.040782122905028, + "grad_norm": 0.45151203870773315, + "learning_rate": 0.0008505042016806724, + "loss": 0.422, + "step": 5443 + }, + { + "epoch": 3.041340782122905, + "grad_norm": 0.5675491094589233, + "learning_rate": 0.0008504761904761906, + "loss": 0.4592, + "step": 5444 + }, + { + "epoch": 3.041899441340782, + "grad_norm": 0.7291675806045532, + "learning_rate": 0.0008504481792717087, + "loss": 0.5236, + "step": 5445 + }, + { + "epoch": 3.042458100558659, + "grad_norm": 0.40713581442832947, + "learning_rate": 0.0008504201680672269, + "loss": 0.4599, + "step": 5446 + }, + { + "epoch": 3.0430167597765365, + "grad_norm": 0.5759377479553223, + "learning_rate": 0.0008503921568627451, + "loss": 0.4811, + "step": 5447 + }, + { + "epoch": 3.0435754189944135, + "grad_norm": 0.870193362236023, + "learning_rate": 0.0008503641456582634, + "loss": 0.3958, + "step": 5448 + }, + { + "epoch": 3.0441340782122905, + "grad_norm": 0.4510118067264557, + "learning_rate": 0.0008503361344537816, + "loss": 0.4206, + "step": 5449 + }, + { + "epoch": 3.0446927374301676, + "grad_norm": 0.4428526759147644, + "learning_rate": 0.0008503081232492997, + "loss": 0.4322, + "step": 5450 + }, + { + "epoch": 3.0452513966480446, + "grad_norm": 0.5920659899711609, + "learning_rate": 0.0008502801120448179, + "loss": 0.5337, + "step": 5451 + }, + { + "epoch": 3.0458100558659216, + "grad_norm": 0.765612006187439, + "learning_rate": 0.0008502521008403361, + "loss": 0.4487, + "step": 5452 + }, + { + "epoch": 3.046368715083799, + "grad_norm": 0.5659612417221069, + "learning_rate": 0.0008502240896358544, + "loss": 0.5048, + "step": 5453 + }, + { + "epoch": 3.046927374301676, + "grad_norm": 0.47614389657974243, + "learning_rate": 0.0008501960784313726, + "loss": 0.4963, + "step": 5454 + }, + { + "epoch": 3.047486033519553, + "grad_norm": 2.2290472984313965, + "learning_rate": 0.0008501680672268908, + "loss": 0.4536, + "step": 5455 + }, + { + "epoch": 3.04804469273743, + "grad_norm": 0.5330693125724792, + "learning_rate": 0.0008501400560224089, + "loss": 0.4104, + "step": 5456 + }, + { + "epoch": 3.048603351955307, + "grad_norm": 0.8237184286117554, + "learning_rate": 0.0008501120448179271, + "loss": 0.431, + "step": 5457 + }, + { + "epoch": 3.0491620111731845, + "grad_norm": 0.504925012588501, + "learning_rate": 0.0008500840336134454, + "loss": 0.515, + "step": 5458 + }, + { + "epoch": 3.0497206703910615, + "grad_norm": 0.5609250068664551, + "learning_rate": 0.0008500560224089636, + "loss": 0.4171, + "step": 5459 + }, + { + "epoch": 3.0502793296089385, + "grad_norm": 0.6541677713394165, + "learning_rate": 0.0008500280112044819, + "loss": 0.4549, + "step": 5460 + }, + { + "epoch": 3.0508379888268156, + "grad_norm": 0.9161620736122131, + "learning_rate": 0.00085, + "loss": 0.5277, + "step": 5461 + }, + { + "epoch": 3.0513966480446926, + "grad_norm": 1.3137433528900146, + "learning_rate": 0.0008499719887955182, + "loss": 0.4882, + "step": 5462 + }, + { + "epoch": 3.05195530726257, + "grad_norm": 0.6924602389335632, + "learning_rate": 0.0008499439775910365, + "loss": 0.4953, + "step": 5463 + }, + { + "epoch": 3.052513966480447, + "grad_norm": 0.5731591582298279, + "learning_rate": 0.0008499159663865547, + "loss": 0.4847, + "step": 5464 + }, + { + "epoch": 3.053072625698324, + "grad_norm": 2.5538318157196045, + "learning_rate": 0.0008498879551820729, + "loss": 0.5312, + "step": 5465 + }, + { + "epoch": 3.053631284916201, + "grad_norm": 0.4962441027164459, + "learning_rate": 0.000849859943977591, + "loss": 0.49, + "step": 5466 + }, + { + "epoch": 3.054189944134078, + "grad_norm": 0.9920082092285156, + "learning_rate": 0.0008498319327731092, + "loss": 0.463, + "step": 5467 + }, + { + "epoch": 3.054748603351955, + "grad_norm": 0.42664119601249695, + "learning_rate": 0.0008498039215686275, + "loss": 0.45, + "step": 5468 + }, + { + "epoch": 3.0553072625698325, + "grad_norm": 0.6212796568870544, + "learning_rate": 0.0008497759103641457, + "loss": 0.4025, + "step": 5469 + }, + { + "epoch": 3.0558659217877095, + "grad_norm": 0.54094398021698, + "learning_rate": 0.0008497478991596639, + "loss": 0.5867, + "step": 5470 + }, + { + "epoch": 3.0564245810055866, + "grad_norm": 0.5930367112159729, + "learning_rate": 0.0008497198879551821, + "loss": 0.4479, + "step": 5471 + }, + { + "epoch": 3.0569832402234636, + "grad_norm": 0.38967838883399963, + "learning_rate": 0.0008496918767507002, + "loss": 0.3649, + "step": 5472 + }, + { + "epoch": 3.0575418994413406, + "grad_norm": 0.38972780108451843, + "learning_rate": 0.0008496638655462185, + "loss": 0.4095, + "step": 5473 + }, + { + "epoch": 3.058100558659218, + "grad_norm": 0.7354070544242859, + "learning_rate": 0.0008496358543417367, + "loss": 0.5083, + "step": 5474 + }, + { + "epoch": 3.058659217877095, + "grad_norm": 0.5130355358123779, + "learning_rate": 0.0008496078431372549, + "loss": 0.4713, + "step": 5475 + }, + { + "epoch": 3.059217877094972, + "grad_norm": 0.6274758577346802, + "learning_rate": 0.0008495798319327732, + "loss": 0.4377, + "step": 5476 + }, + { + "epoch": 3.059776536312849, + "grad_norm": 0.6343010067939758, + "learning_rate": 0.0008495518207282912, + "loss": 0.5356, + "step": 5477 + }, + { + "epoch": 3.060335195530726, + "grad_norm": 0.7146463394165039, + "learning_rate": 0.0008495238095238096, + "loss": 0.3958, + "step": 5478 + }, + { + "epoch": 3.0608938547486035, + "grad_norm": 0.5553720593452454, + "learning_rate": 0.0008494957983193278, + "loss": 0.4362, + "step": 5479 + }, + { + "epoch": 3.0614525139664805, + "grad_norm": 0.46876946091651917, + "learning_rate": 0.000849467787114846, + "loss": 0.3627, + "step": 5480 + }, + { + "epoch": 3.0620111731843576, + "grad_norm": 0.438692569732666, + "learning_rate": 0.0008494397759103642, + "loss": 0.4711, + "step": 5481 + }, + { + "epoch": 3.0625698324022346, + "grad_norm": 0.5679853558540344, + "learning_rate": 0.0008494117647058823, + "loss": 0.434, + "step": 5482 + }, + { + "epoch": 3.0631284916201116, + "grad_norm": 0.47009748220443726, + "learning_rate": 0.0008493837535014006, + "loss": 0.395, + "step": 5483 + }, + { + "epoch": 3.063687150837989, + "grad_norm": 0.5046628713607788, + "learning_rate": 0.0008493557422969188, + "loss": 0.3764, + "step": 5484 + }, + { + "epoch": 3.064245810055866, + "grad_norm": 0.9186261296272278, + "learning_rate": 0.000849327731092437, + "loss": 0.4576, + "step": 5485 + }, + { + "epoch": 3.064804469273743, + "grad_norm": 0.5351700186729431, + "learning_rate": 0.0008492997198879552, + "loss": 0.4692, + "step": 5486 + }, + { + "epoch": 3.06536312849162, + "grad_norm": 0.5569940805435181, + "learning_rate": 0.0008492717086834734, + "loss": 0.3793, + "step": 5487 + }, + { + "epoch": 3.065921787709497, + "grad_norm": 2.4412596225738525, + "learning_rate": 0.0008492436974789916, + "loss": 0.5074, + "step": 5488 + }, + { + "epoch": 3.066480446927374, + "grad_norm": 0.4022413194179535, + "learning_rate": 0.0008492156862745098, + "loss": 0.4553, + "step": 5489 + }, + { + "epoch": 3.0670391061452515, + "grad_norm": 1.9563251733779907, + "learning_rate": 0.000849187675070028, + "loss": 0.5153, + "step": 5490 + }, + { + "epoch": 3.0675977653631286, + "grad_norm": 0.6106370687484741, + "learning_rate": 0.0008491596638655462, + "loss": 0.5353, + "step": 5491 + }, + { + "epoch": 3.0681564245810056, + "grad_norm": 0.9997402429580688, + "learning_rate": 0.0008491316526610644, + "loss": 0.4995, + "step": 5492 + }, + { + "epoch": 3.0687150837988826, + "grad_norm": 0.7541323304176331, + "learning_rate": 0.0008491036414565827, + "loss": 0.5709, + "step": 5493 + }, + { + "epoch": 3.0692737430167596, + "grad_norm": 0.38327455520629883, + "learning_rate": 0.0008490756302521009, + "loss": 0.3823, + "step": 5494 + }, + { + "epoch": 3.069832402234637, + "grad_norm": 0.410897821187973, + "learning_rate": 0.0008490476190476191, + "loss": 0.4184, + "step": 5495 + }, + { + "epoch": 3.070391061452514, + "grad_norm": 0.47653019428253174, + "learning_rate": 0.0008490196078431373, + "loss": 0.3793, + "step": 5496 + }, + { + "epoch": 3.070949720670391, + "grad_norm": 1.0713906288146973, + "learning_rate": 0.0008489915966386555, + "loss": 0.4462, + "step": 5497 + }, + { + "epoch": 3.071508379888268, + "grad_norm": 0.5321863293647766, + "learning_rate": 0.0008489635854341737, + "loss": 0.5408, + "step": 5498 + }, + { + "epoch": 3.072067039106145, + "grad_norm": 0.4431574046611786, + "learning_rate": 0.0008489355742296919, + "loss": 0.4765, + "step": 5499 + }, + { + "epoch": 3.0726256983240225, + "grad_norm": 0.8000943660736084, + "learning_rate": 0.0008489075630252101, + "loss": 0.5143, + "step": 5500 + }, + { + "epoch": 3.0726256983240225, + "eval_cer": 0.09375579631872388, + "eval_loss": 0.35735177993774414, + "eval_runtime": 55.6504, + "eval_samples_per_second": 81.545, + "eval_steps_per_second": 5.103, + "eval_wer": 0.3709114270551102, + "step": 5500 + }, + { + "epoch": 3.0731843575418996, + "grad_norm": 0.5274427533149719, + "learning_rate": 0.0008488795518207283, + "loss": 0.3762, + "step": 5501 + }, + { + "epoch": 3.0737430167597766, + "grad_norm": 0.4781281650066376, + "learning_rate": 0.0008488515406162465, + "loss": 0.5234, + "step": 5502 + }, + { + "epoch": 3.0743016759776536, + "grad_norm": 0.5493807196617126, + "learning_rate": 0.0008488235294117648, + "loss": 0.5391, + "step": 5503 + }, + { + "epoch": 3.0748603351955306, + "grad_norm": 4.780723571777344, + "learning_rate": 0.0008487955182072829, + "loss": 0.5195, + "step": 5504 + }, + { + "epoch": 3.0754189944134076, + "grad_norm": 0.4696957468986511, + "learning_rate": 0.0008487675070028011, + "loss": 0.4055, + "step": 5505 + }, + { + "epoch": 3.075977653631285, + "grad_norm": 0.6096591949462891, + "learning_rate": 0.0008487394957983193, + "loss": 0.5075, + "step": 5506 + }, + { + "epoch": 3.076536312849162, + "grad_norm": 0.4517671763896942, + "learning_rate": 0.0008487114845938375, + "loss": 0.4828, + "step": 5507 + }, + { + "epoch": 3.077094972067039, + "grad_norm": 0.7566508650779724, + "learning_rate": 0.0008486834733893559, + "loss": 0.5829, + "step": 5508 + }, + { + "epoch": 3.077653631284916, + "grad_norm": 0.5540488958358765, + "learning_rate": 0.000848655462184874, + "loss": 0.5444, + "step": 5509 + }, + { + "epoch": 3.078212290502793, + "grad_norm": 0.5105178952217102, + "learning_rate": 0.0008486274509803922, + "loss": 0.3697, + "step": 5510 + }, + { + "epoch": 3.0787709497206706, + "grad_norm": 0.7785966992378235, + "learning_rate": 0.0008485994397759104, + "loss": 0.498, + "step": 5511 + }, + { + "epoch": 3.0793296089385476, + "grad_norm": 1.881880521774292, + "learning_rate": 0.0008485714285714286, + "loss": 0.6548, + "step": 5512 + }, + { + "epoch": 3.0798882681564246, + "grad_norm": 0.6333538889884949, + "learning_rate": 0.0008485434173669469, + "loss": 0.4901, + "step": 5513 + }, + { + "epoch": 3.0804469273743016, + "grad_norm": 0.412016898393631, + "learning_rate": 0.000848515406162465, + "loss": 0.4515, + "step": 5514 + }, + { + "epoch": 3.0810055865921786, + "grad_norm": 0.6195076704025269, + "learning_rate": 0.0008484873949579832, + "loss": 0.608, + "step": 5515 + }, + { + "epoch": 3.081564245810056, + "grad_norm": 0.45217135548591614, + "learning_rate": 0.0008484593837535014, + "loss": 0.4707, + "step": 5516 + }, + { + "epoch": 3.082122905027933, + "grad_norm": 0.7113887071609497, + "learning_rate": 0.0008484313725490196, + "loss": 0.4882, + "step": 5517 + }, + { + "epoch": 3.08268156424581, + "grad_norm": 0.8828628063201904, + "learning_rate": 0.0008484033613445379, + "loss": 0.4991, + "step": 5518 + }, + { + "epoch": 3.083240223463687, + "grad_norm": 0.5489522814750671, + "learning_rate": 0.0008483753501400561, + "loss": 0.4257, + "step": 5519 + }, + { + "epoch": 3.083798882681564, + "grad_norm": 0.49655401706695557, + "learning_rate": 0.0008483473389355742, + "loss": 0.4676, + "step": 5520 + }, + { + "epoch": 3.0843575418994416, + "grad_norm": 0.6676682233810425, + "learning_rate": 0.0008483193277310924, + "loss": 0.598, + "step": 5521 + }, + { + "epoch": 3.0849162011173186, + "grad_norm": 0.536139190196991, + "learning_rate": 0.0008482913165266106, + "loss": 0.4722, + "step": 5522 + }, + { + "epoch": 3.0854748603351956, + "grad_norm": 0.5223150253295898, + "learning_rate": 0.0008482633053221289, + "loss": 0.3845, + "step": 5523 + }, + { + "epoch": 3.0860335195530726, + "grad_norm": 1.3387963771820068, + "learning_rate": 0.0008482352941176471, + "loss": 0.5167, + "step": 5524 + }, + { + "epoch": 3.0865921787709496, + "grad_norm": 1.4403148889541626, + "learning_rate": 0.0008482072829131652, + "loss": 0.4271, + "step": 5525 + }, + { + "epoch": 3.0871508379888266, + "grad_norm": 1.0278398990631104, + "learning_rate": 0.0008481792717086835, + "loss": 0.4866, + "step": 5526 + }, + { + "epoch": 3.087709497206704, + "grad_norm": 0.41480597853660583, + "learning_rate": 0.0008481512605042017, + "loss": 0.3641, + "step": 5527 + }, + { + "epoch": 3.088268156424581, + "grad_norm": 0.8197434544563293, + "learning_rate": 0.00084812324929972, + "loss": 0.4056, + "step": 5528 + }, + { + "epoch": 3.088826815642458, + "grad_norm": 0.364523708820343, + "learning_rate": 0.0008480952380952382, + "loss": 0.3921, + "step": 5529 + }, + { + "epoch": 3.089385474860335, + "grad_norm": 0.6194524168968201, + "learning_rate": 0.0008480672268907563, + "loss": 0.5028, + "step": 5530 + }, + { + "epoch": 3.089944134078212, + "grad_norm": 0.503078818321228, + "learning_rate": 0.0008480392156862745, + "loss": 0.474, + "step": 5531 + }, + { + "epoch": 3.0905027932960896, + "grad_norm": 1.7513192892074585, + "learning_rate": 0.0008480112044817927, + "loss": 0.381, + "step": 5532 + }, + { + "epoch": 3.0910614525139666, + "grad_norm": 0.5741810202598572, + "learning_rate": 0.000847983193277311, + "loss": 0.4214, + "step": 5533 + }, + { + "epoch": 3.0916201117318436, + "grad_norm": 0.6218336820602417, + "learning_rate": 0.0008479551820728292, + "loss": 0.4276, + "step": 5534 + }, + { + "epoch": 3.0921787709497206, + "grad_norm": 0.5685567855834961, + "learning_rate": 0.0008479271708683474, + "loss": 0.4764, + "step": 5535 + }, + { + "epoch": 3.0927374301675976, + "grad_norm": 0.6985644698143005, + "learning_rate": 0.0008478991596638655, + "loss": 0.4882, + "step": 5536 + }, + { + "epoch": 3.093296089385475, + "grad_norm": 0.5651882886886597, + "learning_rate": 0.0008478711484593837, + "loss": 0.6333, + "step": 5537 + }, + { + "epoch": 3.093854748603352, + "grad_norm": 0.3793085515499115, + "learning_rate": 0.000847843137254902, + "loss": 0.4573, + "step": 5538 + }, + { + "epoch": 3.094413407821229, + "grad_norm": 0.5325741767883301, + "learning_rate": 0.0008478151260504202, + "loss": 0.4729, + "step": 5539 + }, + { + "epoch": 3.094972067039106, + "grad_norm": 0.8570863008499146, + "learning_rate": 0.0008477871148459384, + "loss": 0.7019, + "step": 5540 + }, + { + "epoch": 3.095530726256983, + "grad_norm": 0.6007282733917236, + "learning_rate": 0.0008477591036414565, + "loss": 0.5069, + "step": 5541 + }, + { + "epoch": 3.09608938547486, + "grad_norm": 0.5458068251609802, + "learning_rate": 0.0008477310924369747, + "loss": 0.5598, + "step": 5542 + }, + { + "epoch": 3.0966480446927376, + "grad_norm": 0.47937455773353577, + "learning_rate": 0.0008477030812324931, + "loss": 0.4483, + "step": 5543 + }, + { + "epoch": 3.0972067039106146, + "grad_norm": 0.4870924949645996, + "learning_rate": 0.0008476750700280113, + "loss": 0.4401, + "step": 5544 + }, + { + "epoch": 3.0977653631284916, + "grad_norm": 1.1074011325836182, + "learning_rate": 0.0008476470588235295, + "loss": 0.4424, + "step": 5545 + }, + { + "epoch": 3.0983240223463686, + "grad_norm": 0.5029626488685608, + "learning_rate": 0.0008476190476190476, + "loss": 0.4748, + "step": 5546 + }, + { + "epoch": 3.0988826815642456, + "grad_norm": 3.9221127033233643, + "learning_rate": 0.0008475910364145658, + "loss": 0.542, + "step": 5547 + }, + { + "epoch": 3.099441340782123, + "grad_norm": 0.6362393498420715, + "learning_rate": 0.0008475630252100841, + "loss": 0.4191, + "step": 5548 + }, + { + "epoch": 3.1, + "grad_norm": 0.4929668605327606, + "learning_rate": 0.0008475350140056023, + "loss": 0.3726, + "step": 5549 + }, + { + "epoch": 3.100558659217877, + "grad_norm": 0.5730764269828796, + "learning_rate": 0.0008475070028011205, + "loss": 0.4915, + "step": 5550 + }, + { + "epoch": 3.101117318435754, + "grad_norm": 0.5927708745002747, + "learning_rate": 0.0008474789915966387, + "loss": 0.4225, + "step": 5551 + }, + { + "epoch": 3.101675977653631, + "grad_norm": 0.4804950952529907, + "learning_rate": 0.0008474509803921568, + "loss": 0.3416, + "step": 5552 + }, + { + "epoch": 3.1022346368715086, + "grad_norm": 0.6153953671455383, + "learning_rate": 0.0008474229691876751, + "loss": 0.49, + "step": 5553 + }, + { + "epoch": 3.1027932960893856, + "grad_norm": 0.7899450659751892, + "learning_rate": 0.0008473949579831933, + "loss": 0.4699, + "step": 5554 + }, + { + "epoch": 3.1033519553072626, + "grad_norm": 0.5297183990478516, + "learning_rate": 0.0008473669467787115, + "loss": 0.4664, + "step": 5555 + }, + { + "epoch": 3.1039106145251396, + "grad_norm": 1.1394513845443726, + "learning_rate": 0.0008473389355742297, + "loss": 0.403, + "step": 5556 + }, + { + "epoch": 3.1044692737430166, + "grad_norm": 1.3090755939483643, + "learning_rate": 0.0008473109243697478, + "loss": 0.4964, + "step": 5557 + }, + { + "epoch": 3.105027932960894, + "grad_norm": 0.5511622428894043, + "learning_rate": 0.0008472829131652662, + "loss": 0.4778, + "step": 5558 + }, + { + "epoch": 3.105586592178771, + "grad_norm": 0.7322571277618408, + "learning_rate": 0.0008472549019607844, + "loss": 0.4158, + "step": 5559 + }, + { + "epoch": 3.106145251396648, + "grad_norm": 0.6841732263565063, + "learning_rate": 0.0008472268907563026, + "loss": 0.4086, + "step": 5560 + }, + { + "epoch": 3.106703910614525, + "grad_norm": 0.550640881061554, + "learning_rate": 0.0008471988795518208, + "loss": 0.4952, + "step": 5561 + }, + { + "epoch": 3.107262569832402, + "grad_norm": 1.0008827447891235, + "learning_rate": 0.0008471708683473389, + "loss": 0.4601, + "step": 5562 + }, + { + "epoch": 3.107821229050279, + "grad_norm": 0.6077209711074829, + "learning_rate": 0.0008471428571428572, + "loss": 0.582, + "step": 5563 + }, + { + "epoch": 3.1083798882681566, + "grad_norm": 0.5013401508331299, + "learning_rate": 0.0008471148459383754, + "loss": 0.4023, + "step": 5564 + }, + { + "epoch": 3.1089385474860336, + "grad_norm": 0.44117945432662964, + "learning_rate": 0.0008470868347338936, + "loss": 0.5147, + "step": 5565 + }, + { + "epoch": 3.1094972067039106, + "grad_norm": 0.7289434671401978, + "learning_rate": 0.0008470588235294118, + "loss": 0.4158, + "step": 5566 + }, + { + "epoch": 3.1100558659217876, + "grad_norm": 1.3812426328659058, + "learning_rate": 0.00084703081232493, + "loss": 0.542, + "step": 5567 + }, + { + "epoch": 3.1106145251396646, + "grad_norm": 0.683921754360199, + "learning_rate": 0.0008470028011204482, + "loss": 0.4842, + "step": 5568 + }, + { + "epoch": 3.111173184357542, + "grad_norm": 0.49743011593818665, + "learning_rate": 0.0008469747899159664, + "loss": 0.4649, + "step": 5569 + }, + { + "epoch": 3.111731843575419, + "grad_norm": 1.115496277809143, + "learning_rate": 0.0008469467787114846, + "loss": 0.2978, + "step": 5570 + }, + { + "epoch": 3.112290502793296, + "grad_norm": 0.47693005204200745, + "learning_rate": 0.0008469187675070028, + "loss": 0.4808, + "step": 5571 + }, + { + "epoch": 3.112849162011173, + "grad_norm": 0.7259455919265747, + "learning_rate": 0.000846890756302521, + "loss": 0.4257, + "step": 5572 + }, + { + "epoch": 3.11340782122905, + "grad_norm": 0.503884494304657, + "learning_rate": 0.0008468627450980391, + "loss": 0.403, + "step": 5573 + }, + { + "epoch": 3.1139664804469276, + "grad_norm": 0.4987591505050659, + "learning_rate": 0.0008468347338935574, + "loss": 0.4607, + "step": 5574 + }, + { + "epoch": 3.1145251396648046, + "grad_norm": 0.46255776286125183, + "learning_rate": 0.0008468067226890757, + "loss": 0.4712, + "step": 5575 + }, + { + "epoch": 3.1150837988826816, + "grad_norm": 0.520065188407898, + "learning_rate": 0.0008467787114845939, + "loss": 0.4239, + "step": 5576 + }, + { + "epoch": 3.1156424581005586, + "grad_norm": 0.7515888810157776, + "learning_rate": 0.0008467507002801121, + "loss": 0.4432, + "step": 5577 + }, + { + "epoch": 3.1162011173184356, + "grad_norm": 0.8942346572875977, + "learning_rate": 0.0008467226890756302, + "loss": 0.5083, + "step": 5578 + }, + { + "epoch": 3.1167597765363126, + "grad_norm": 0.38200417160987854, + "learning_rate": 0.0008466946778711485, + "loss": 0.4241, + "step": 5579 + }, + { + "epoch": 3.11731843575419, + "grad_norm": 0.4806053340435028, + "learning_rate": 0.0008466666666666667, + "loss": 0.4384, + "step": 5580 + }, + { + "epoch": 3.117877094972067, + "grad_norm": 0.6235408186912537, + "learning_rate": 0.0008466386554621849, + "loss": 0.4569, + "step": 5581 + }, + { + "epoch": 3.118435754189944, + "grad_norm": 0.9441065788269043, + "learning_rate": 0.0008466106442577031, + "loss": 0.4306, + "step": 5582 + }, + { + "epoch": 3.118994413407821, + "grad_norm": 0.46072858572006226, + "learning_rate": 0.0008465826330532213, + "loss": 0.5688, + "step": 5583 + }, + { + "epoch": 3.119553072625698, + "grad_norm": 0.6990225315093994, + "learning_rate": 0.0008465546218487395, + "loss": 0.51, + "step": 5584 + }, + { + "epoch": 3.1201117318435756, + "grad_norm": 0.6464018821716309, + "learning_rate": 0.0008465266106442577, + "loss": 0.5027, + "step": 5585 + }, + { + "epoch": 3.1206703910614526, + "grad_norm": 0.4172152578830719, + "learning_rate": 0.0008464985994397759, + "loss": 0.3654, + "step": 5586 + }, + { + "epoch": 3.1212290502793296, + "grad_norm": 1.3186568021774292, + "learning_rate": 0.0008464705882352941, + "loss": 0.4486, + "step": 5587 + }, + { + "epoch": 3.1217877094972066, + "grad_norm": 1.02861487865448, + "learning_rate": 0.0008464425770308123, + "loss": 0.4515, + "step": 5588 + }, + { + "epoch": 3.1223463687150836, + "grad_norm": 0.585443913936615, + "learning_rate": 0.0008464145658263305, + "loss": 0.5493, + "step": 5589 + }, + { + "epoch": 3.122905027932961, + "grad_norm": 0.5775396823883057, + "learning_rate": 0.0008463865546218487, + "loss": 0.36, + "step": 5590 + }, + { + "epoch": 3.123463687150838, + "grad_norm": 0.6701005101203918, + "learning_rate": 0.000846358543417367, + "loss": 0.6122, + "step": 5591 + }, + { + "epoch": 3.124022346368715, + "grad_norm": 0.7165709137916565, + "learning_rate": 0.0008463305322128852, + "loss": 0.6009, + "step": 5592 + }, + { + "epoch": 3.124581005586592, + "grad_norm": 0.5199567675590515, + "learning_rate": 0.0008463025210084034, + "loss": 0.4555, + "step": 5593 + }, + { + "epoch": 3.125139664804469, + "grad_norm": 0.4732874631881714, + "learning_rate": 0.0008462745098039217, + "loss": 0.5021, + "step": 5594 + }, + { + "epoch": 3.1256983240223466, + "grad_norm": 0.5118914842605591, + "learning_rate": 0.0008462464985994398, + "loss": 0.5008, + "step": 5595 + }, + { + "epoch": 3.1262569832402236, + "grad_norm": 0.6696200966835022, + "learning_rate": 0.000846218487394958, + "loss": 0.4497, + "step": 5596 + }, + { + "epoch": 3.1268156424581006, + "grad_norm": 0.6452716588973999, + "learning_rate": 0.0008461904761904762, + "loss": 0.3698, + "step": 5597 + }, + { + "epoch": 3.1273743016759776, + "grad_norm": 0.5884919762611389, + "learning_rate": 0.0008461624649859944, + "loss": 0.4545, + "step": 5598 + }, + { + "epoch": 3.1279329608938546, + "grad_norm": 1.2152279615402222, + "learning_rate": 0.0008461344537815127, + "loss": 0.5066, + "step": 5599 + }, + { + "epoch": 3.1284916201117317, + "grad_norm": 2.404604434967041, + "learning_rate": 0.0008461064425770308, + "loss": 0.4193, + "step": 5600 + }, + { + "epoch": 3.129050279329609, + "grad_norm": 0.5319065451622009, + "learning_rate": 0.000846078431372549, + "loss": 0.5891, + "step": 5601 + }, + { + "epoch": 3.129608938547486, + "grad_norm": 0.48796480894088745, + "learning_rate": 0.0008460504201680672, + "loss": 0.4536, + "step": 5602 + }, + { + "epoch": 3.130167597765363, + "grad_norm": 0.5275574922561646, + "learning_rate": 0.0008460224089635854, + "loss": 0.4787, + "step": 5603 + }, + { + "epoch": 3.13072625698324, + "grad_norm": 0.5120730400085449, + "learning_rate": 0.0008459943977591037, + "loss": 0.498, + "step": 5604 + }, + { + "epoch": 3.131284916201117, + "grad_norm": 0.593523383140564, + "learning_rate": 0.0008459663865546218, + "loss": 0.4781, + "step": 5605 + }, + { + "epoch": 3.1318435754189946, + "grad_norm": 0.7718291282653809, + "learning_rate": 0.00084593837535014, + "loss": 0.555, + "step": 5606 + }, + { + "epoch": 3.1324022346368716, + "grad_norm": 0.5640621185302734, + "learning_rate": 0.0008459103641456582, + "loss": 0.454, + "step": 5607 + }, + { + "epoch": 3.1329608938547486, + "grad_norm": 0.7334309220314026, + "learning_rate": 0.0008458823529411765, + "loss": 0.5365, + "step": 5608 + }, + { + "epoch": 3.1335195530726256, + "grad_norm": 0.8507549166679382, + "learning_rate": 0.0008458543417366948, + "loss": 0.3926, + "step": 5609 + }, + { + "epoch": 3.1340782122905027, + "grad_norm": 0.5288267135620117, + "learning_rate": 0.000845826330532213, + "loss": 0.4904, + "step": 5610 + }, + { + "epoch": 3.1346368715083797, + "grad_norm": 0.5941880941390991, + "learning_rate": 0.0008457983193277311, + "loss": 0.4464, + "step": 5611 + }, + { + "epoch": 3.135195530726257, + "grad_norm": 0.6736599802970886, + "learning_rate": 0.0008457703081232493, + "loss": 0.5198, + "step": 5612 + }, + { + "epoch": 3.135754189944134, + "grad_norm": 0.5984303951263428, + "learning_rate": 0.0008457422969187675, + "loss": 0.4512, + "step": 5613 + }, + { + "epoch": 3.136312849162011, + "grad_norm": 0.43068036437034607, + "learning_rate": 0.0008457142857142858, + "loss": 0.3395, + "step": 5614 + }, + { + "epoch": 3.136871508379888, + "grad_norm": 1.7956981658935547, + "learning_rate": 0.000845686274509804, + "loss": 0.4573, + "step": 5615 + }, + { + "epoch": 3.137430167597765, + "grad_norm": 1.5689762830734253, + "learning_rate": 0.0008456582633053221, + "loss": 0.463, + "step": 5616 + }, + { + "epoch": 3.1379888268156426, + "grad_norm": 0.7011322379112244, + "learning_rate": 0.0008456302521008403, + "loss": 0.3743, + "step": 5617 + }, + { + "epoch": 3.1385474860335196, + "grad_norm": 0.4777504801750183, + "learning_rate": 0.0008456022408963585, + "loss": 0.458, + "step": 5618 + }, + { + "epoch": 3.1391061452513966, + "grad_norm": 0.9461365938186646, + "learning_rate": 0.0008455742296918768, + "loss": 0.4364, + "step": 5619 + }, + { + "epoch": 3.1396648044692737, + "grad_norm": 0.49534231424331665, + "learning_rate": 0.000845546218487395, + "loss": 0.4795, + "step": 5620 + }, + { + "epoch": 3.1402234636871507, + "grad_norm": 0.61227947473526, + "learning_rate": 0.0008455182072829131, + "loss": 0.5414, + "step": 5621 + }, + { + "epoch": 3.140782122905028, + "grad_norm": 0.7459039688110352, + "learning_rate": 0.0008454901960784313, + "loss": 0.4904, + "step": 5622 + }, + { + "epoch": 3.141340782122905, + "grad_norm": 1.202772855758667, + "learning_rate": 0.0008454621848739495, + "loss": 0.5283, + "step": 5623 + }, + { + "epoch": 3.141899441340782, + "grad_norm": 0.4477715492248535, + "learning_rate": 0.0008454341736694679, + "loss": 0.3747, + "step": 5624 + }, + { + "epoch": 3.142458100558659, + "grad_norm": 0.5851945877075195, + "learning_rate": 0.0008454061624649861, + "loss": 0.5378, + "step": 5625 + }, + { + "epoch": 3.143016759776536, + "grad_norm": 0.4029100835323334, + "learning_rate": 0.0008453781512605043, + "loss": 0.4517, + "step": 5626 + }, + { + "epoch": 3.1435754189944136, + "grad_norm": 0.42703381180763245, + "learning_rate": 0.0008453501400560224, + "loss": 0.4815, + "step": 5627 + }, + { + "epoch": 3.1441340782122906, + "grad_norm": 12.102084159851074, + "learning_rate": 0.0008453221288515406, + "loss": 0.569, + "step": 5628 + }, + { + "epoch": 3.1446927374301676, + "grad_norm": 0.6901870369911194, + "learning_rate": 0.0008452941176470589, + "loss": 0.4042, + "step": 5629 + }, + { + "epoch": 3.1452513966480447, + "grad_norm": 1.1148213148117065, + "learning_rate": 0.0008452661064425771, + "loss": 0.3946, + "step": 5630 + }, + { + "epoch": 3.1458100558659217, + "grad_norm": 0.4705391526222229, + "learning_rate": 0.0008452380952380953, + "loss": 0.5268, + "step": 5631 + }, + { + "epoch": 3.146368715083799, + "grad_norm": 0.6951575875282288, + "learning_rate": 0.0008452100840336134, + "loss": 0.4138, + "step": 5632 + }, + { + "epoch": 3.146927374301676, + "grad_norm": 0.6584864854812622, + "learning_rate": 0.0008451820728291316, + "loss": 0.4196, + "step": 5633 + }, + { + "epoch": 3.147486033519553, + "grad_norm": 0.5888062715530396, + "learning_rate": 0.0008451540616246499, + "loss": 0.5384, + "step": 5634 + }, + { + "epoch": 3.14804469273743, + "grad_norm": 0.8504028916358948, + "learning_rate": 0.0008451260504201681, + "loss": 0.4661, + "step": 5635 + }, + { + "epoch": 3.148603351955307, + "grad_norm": 0.45534247159957886, + "learning_rate": 0.0008450980392156863, + "loss": 0.4792, + "step": 5636 + }, + { + "epoch": 3.149162011173184, + "grad_norm": 0.6078174710273743, + "learning_rate": 0.0008450700280112044, + "loss": 0.5263, + "step": 5637 + }, + { + "epoch": 3.1497206703910616, + "grad_norm": 0.48322969675064087, + "learning_rate": 0.0008450420168067226, + "loss": 0.4213, + "step": 5638 + }, + { + "epoch": 3.1502793296089386, + "grad_norm": 0.5949899554252625, + "learning_rate": 0.0008450140056022409, + "loss": 0.4008, + "step": 5639 + }, + { + "epoch": 3.1508379888268156, + "grad_norm": 0.4921998679637909, + "learning_rate": 0.0008449859943977592, + "loss": 0.4125, + "step": 5640 + }, + { + "epoch": 3.1513966480446927, + "grad_norm": 1.483995795249939, + "learning_rate": 0.0008449579831932774, + "loss": 0.5603, + "step": 5641 + }, + { + "epoch": 3.1519553072625697, + "grad_norm": 0.4538545608520508, + "learning_rate": 0.0008449299719887956, + "loss": 0.4636, + "step": 5642 + }, + { + "epoch": 3.152513966480447, + "grad_norm": 0.49479126930236816, + "learning_rate": 0.0008449019607843137, + "loss": 0.4627, + "step": 5643 + }, + { + "epoch": 3.153072625698324, + "grad_norm": 6.404916763305664, + "learning_rate": 0.000844873949579832, + "loss": 0.3233, + "step": 5644 + }, + { + "epoch": 3.153631284916201, + "grad_norm": 0.6194260120391846, + "learning_rate": 0.0008448459383753502, + "loss": 0.4499, + "step": 5645 + }, + { + "epoch": 3.154189944134078, + "grad_norm": 0.479398250579834, + "learning_rate": 0.0008448179271708684, + "loss": 0.5014, + "step": 5646 + }, + { + "epoch": 3.154748603351955, + "grad_norm": 0.6063601970672607, + "learning_rate": 0.0008447899159663866, + "loss": 0.533, + "step": 5647 + }, + { + "epoch": 3.155307262569832, + "grad_norm": 0.5322023630142212, + "learning_rate": 0.0008447619047619047, + "loss": 0.436, + "step": 5648 + }, + { + "epoch": 3.1558659217877096, + "grad_norm": 0.424039363861084, + "learning_rate": 0.000844733893557423, + "loss": 0.4873, + "step": 5649 + }, + { + "epoch": 3.1564245810055866, + "grad_norm": 0.47259289026260376, + "learning_rate": 0.0008447058823529412, + "loss": 0.4794, + "step": 5650 + }, + { + "epoch": 3.1569832402234637, + "grad_norm": 0.456737756729126, + "learning_rate": 0.0008446778711484594, + "loss": 0.3803, + "step": 5651 + }, + { + "epoch": 3.1575418994413407, + "grad_norm": 0.48854315280914307, + "learning_rate": 0.0008446498599439776, + "loss": 0.4304, + "step": 5652 + }, + { + "epoch": 3.1581005586592177, + "grad_norm": 0.39837995171546936, + "learning_rate": 0.0008446218487394957, + "loss": 0.4636, + "step": 5653 + }, + { + "epoch": 3.158659217877095, + "grad_norm": 0.4552333950996399, + "learning_rate": 0.000844593837535014, + "loss": 0.4663, + "step": 5654 + }, + { + "epoch": 3.159217877094972, + "grad_norm": 0.422201007604599, + "learning_rate": 0.0008445658263305322, + "loss": 0.5065, + "step": 5655 + }, + { + "epoch": 3.159776536312849, + "grad_norm": 0.422823965549469, + "learning_rate": 0.0008445378151260504, + "loss": 0.566, + "step": 5656 + }, + { + "epoch": 3.160335195530726, + "grad_norm": 0.49043142795562744, + "learning_rate": 0.0008445098039215687, + "loss": 0.4752, + "step": 5657 + }, + { + "epoch": 3.160893854748603, + "grad_norm": 0.5856580138206482, + "learning_rate": 0.0008444817927170869, + "loss": 0.4897, + "step": 5658 + }, + { + "epoch": 3.1614525139664806, + "grad_norm": 1.0482743978500366, + "learning_rate": 0.0008444537815126051, + "loss": 0.5223, + "step": 5659 + }, + { + "epoch": 3.1620111731843576, + "grad_norm": 0.6414709091186523, + "learning_rate": 0.0008444257703081233, + "loss": 0.381, + "step": 5660 + }, + { + "epoch": 3.1625698324022347, + "grad_norm": 0.49149081110954285, + "learning_rate": 0.0008443977591036415, + "loss": 0.4525, + "step": 5661 + }, + { + "epoch": 3.1631284916201117, + "grad_norm": 0.5326992869377136, + "learning_rate": 0.0008443697478991597, + "loss": 0.4691, + "step": 5662 + }, + { + "epoch": 3.1636871508379887, + "grad_norm": 0.6219455003738403, + "learning_rate": 0.0008443417366946779, + "loss": 0.4394, + "step": 5663 + }, + { + "epoch": 3.164245810055866, + "grad_norm": 0.7326778173446655, + "learning_rate": 0.0008443137254901961, + "loss": 0.6552, + "step": 5664 + }, + { + "epoch": 3.164804469273743, + "grad_norm": 0.3856636881828308, + "learning_rate": 0.0008442857142857143, + "loss": 0.3941, + "step": 5665 + }, + { + "epoch": 3.16536312849162, + "grad_norm": 0.4134659469127655, + "learning_rate": 0.0008442577030812325, + "loss": 0.4796, + "step": 5666 + }, + { + "epoch": 3.165921787709497, + "grad_norm": 0.46562039852142334, + "learning_rate": 0.0008442296918767507, + "loss": 0.3128, + "step": 5667 + }, + { + "epoch": 3.166480446927374, + "grad_norm": 0.572398841381073, + "learning_rate": 0.0008442016806722689, + "loss": 0.5344, + "step": 5668 + }, + { + "epoch": 3.167039106145251, + "grad_norm": 0.8000866174697876, + "learning_rate": 0.0008441736694677871, + "loss": 0.6003, + "step": 5669 + }, + { + "epoch": 3.1675977653631286, + "grad_norm": 1.1431472301483154, + "learning_rate": 0.0008441456582633053, + "loss": 0.4302, + "step": 5670 + }, + { + "epoch": 3.1681564245810057, + "grad_norm": 1.4717786312103271, + "learning_rate": 0.0008441176470588235, + "loss": 0.3784, + "step": 5671 + }, + { + "epoch": 3.1687150837988827, + "grad_norm": 0.513985276222229, + "learning_rate": 0.0008440896358543417, + "loss": 0.4501, + "step": 5672 + }, + { + "epoch": 3.1692737430167597, + "grad_norm": 0.9906004667282104, + "learning_rate": 0.00084406162464986, + "loss": 0.5115, + "step": 5673 + }, + { + "epoch": 3.1698324022346367, + "grad_norm": 0.8090980052947998, + "learning_rate": 0.0008440336134453783, + "loss": 0.4552, + "step": 5674 + }, + { + "epoch": 3.170391061452514, + "grad_norm": 0.5078025460243225, + "learning_rate": 0.0008440056022408964, + "loss": 0.4519, + "step": 5675 + }, + { + "epoch": 3.170949720670391, + "grad_norm": 0.6318791508674622, + "learning_rate": 0.0008439775910364146, + "loss": 0.4745, + "step": 5676 + }, + { + "epoch": 3.171508379888268, + "grad_norm": 0.6941470503807068, + "learning_rate": 0.0008439495798319328, + "loss": 0.3625, + "step": 5677 + }, + { + "epoch": 3.172067039106145, + "grad_norm": 0.4782574474811554, + "learning_rate": 0.000843921568627451, + "loss": 0.4658, + "step": 5678 + }, + { + "epoch": 3.172625698324022, + "grad_norm": 0.5398354530334473, + "learning_rate": 0.0008438935574229693, + "loss": 0.4546, + "step": 5679 + }, + { + "epoch": 3.1731843575418996, + "grad_norm": 0.5194029211997986, + "learning_rate": 0.0008438655462184874, + "loss": 0.5172, + "step": 5680 + }, + { + "epoch": 3.1737430167597767, + "grad_norm": 0.5547402501106262, + "learning_rate": 0.0008438375350140056, + "loss": 0.4501, + "step": 5681 + }, + { + "epoch": 3.1743016759776537, + "grad_norm": 0.6482549905776978, + "learning_rate": 0.0008438095238095238, + "loss": 0.4743, + "step": 5682 + }, + { + "epoch": 3.1748603351955307, + "grad_norm": 0.8866221308708191, + "learning_rate": 0.000843781512605042, + "loss": 0.5239, + "step": 5683 + }, + { + "epoch": 3.1754189944134077, + "grad_norm": 0.7125205397605896, + "learning_rate": 0.0008437535014005603, + "loss": 0.4964, + "step": 5684 + }, + { + "epoch": 3.1759776536312847, + "grad_norm": 0.4755825102329254, + "learning_rate": 0.0008437254901960784, + "loss": 0.4781, + "step": 5685 + }, + { + "epoch": 3.176536312849162, + "grad_norm": 0.4849969744682312, + "learning_rate": 0.0008436974789915966, + "loss": 0.4686, + "step": 5686 + }, + { + "epoch": 3.177094972067039, + "grad_norm": 6.330876350402832, + "learning_rate": 0.0008436694677871148, + "loss": 0.3751, + "step": 5687 + }, + { + "epoch": 3.177653631284916, + "grad_norm": 0.6688026189804077, + "learning_rate": 0.000843641456582633, + "loss": 0.4104, + "step": 5688 + }, + { + "epoch": 3.178212290502793, + "grad_norm": 0.8242323994636536, + "learning_rate": 0.0008436134453781514, + "loss": 0.4669, + "step": 5689 + }, + { + "epoch": 3.17877094972067, + "grad_norm": 0.9547811150550842, + "learning_rate": 0.0008435854341736696, + "loss": 0.4651, + "step": 5690 + }, + { + "epoch": 3.1793296089385477, + "grad_norm": 0.870112419128418, + "learning_rate": 0.0008435574229691877, + "loss": 0.5659, + "step": 5691 + }, + { + "epoch": 3.1798882681564247, + "grad_norm": 1.4759526252746582, + "learning_rate": 0.0008435294117647059, + "loss": 0.8533, + "step": 5692 + }, + { + "epoch": 3.1804469273743017, + "grad_norm": 0.8604695200920105, + "learning_rate": 0.0008435014005602241, + "loss": 0.5095, + "step": 5693 + }, + { + "epoch": 3.1810055865921787, + "grad_norm": 0.615852415561676, + "learning_rate": 0.0008434733893557424, + "loss": 0.4572, + "step": 5694 + }, + { + "epoch": 3.1815642458100557, + "grad_norm": 0.4653518497943878, + "learning_rate": 0.0008434453781512606, + "loss": 0.3781, + "step": 5695 + }, + { + "epoch": 3.182122905027933, + "grad_norm": 0.4282912313938141, + "learning_rate": 0.0008434173669467787, + "loss": 0.4924, + "step": 5696 + }, + { + "epoch": 3.18268156424581, + "grad_norm": 1.489055871963501, + "learning_rate": 0.0008433893557422969, + "loss": 0.4605, + "step": 5697 + }, + { + "epoch": 3.183240223463687, + "grad_norm": 0.5174456834793091, + "learning_rate": 0.0008433613445378151, + "loss": 0.5485, + "step": 5698 + }, + { + "epoch": 3.183798882681564, + "grad_norm": 0.47346973419189453, + "learning_rate": 0.0008433333333333334, + "loss": 0.5117, + "step": 5699 + }, + { + "epoch": 3.184357541899441, + "grad_norm": 3.379406690597534, + "learning_rate": 0.0008433053221288516, + "loss": 0.5139, + "step": 5700 + }, + { + "epoch": 3.1849162011173187, + "grad_norm": 0.47423630952835083, + "learning_rate": 0.0008432773109243697, + "loss": 0.5593, + "step": 5701 + }, + { + "epoch": 3.1854748603351957, + "grad_norm": 0.5654445886611938, + "learning_rate": 0.0008432492997198879, + "loss": 0.4591, + "step": 5702 + }, + { + "epoch": 3.1860335195530727, + "grad_norm": 1.4614496231079102, + "learning_rate": 0.0008432212885154061, + "loss": 0.524, + "step": 5703 + }, + { + "epoch": 3.1865921787709497, + "grad_norm": 0.8168285489082336, + "learning_rate": 0.0008431932773109244, + "loss": 0.4515, + "step": 5704 + }, + { + "epoch": 3.1871508379888267, + "grad_norm": 1.4792838096618652, + "learning_rate": 0.0008431652661064426, + "loss": 0.4342, + "step": 5705 + }, + { + "epoch": 3.1877094972067037, + "grad_norm": 0.3846934139728546, + "learning_rate": 0.0008431372549019609, + "loss": 0.3252, + "step": 5706 + }, + { + "epoch": 3.188268156424581, + "grad_norm": 0.7086730599403381, + "learning_rate": 0.000843109243697479, + "loss": 0.5533, + "step": 5707 + }, + { + "epoch": 3.188826815642458, + "grad_norm": 0.6092351675033569, + "learning_rate": 0.0008430812324929972, + "loss": 0.5899, + "step": 5708 + }, + { + "epoch": 3.189385474860335, + "grad_norm": 0.5143446326255798, + "learning_rate": 0.0008430532212885155, + "loss": 0.4684, + "step": 5709 + }, + { + "epoch": 3.189944134078212, + "grad_norm": 0.6106857061386108, + "learning_rate": 0.0008430252100840337, + "loss": 0.4214, + "step": 5710 + }, + { + "epoch": 3.190502793296089, + "grad_norm": 0.47395002841949463, + "learning_rate": 0.0008429971988795519, + "loss": 0.4858, + "step": 5711 + }, + { + "epoch": 3.1910614525139667, + "grad_norm": 0.5685007572174072, + "learning_rate": 0.00084296918767507, + "loss": 0.4735, + "step": 5712 + }, + { + "epoch": 3.1916201117318437, + "grad_norm": 1.178513765335083, + "learning_rate": 0.0008429411764705882, + "loss": 0.4756, + "step": 5713 + }, + { + "epoch": 3.1921787709497207, + "grad_norm": 0.5206089615821838, + "learning_rate": 0.0008429131652661065, + "loss": 0.588, + "step": 5714 + }, + { + "epoch": 3.1927374301675977, + "grad_norm": 0.39936062693595886, + "learning_rate": 0.0008428851540616247, + "loss": 0.4499, + "step": 5715 + }, + { + "epoch": 3.1932960893854747, + "grad_norm": 0.555731475353241, + "learning_rate": 0.0008428571428571429, + "loss": 0.4558, + "step": 5716 + }, + { + "epoch": 3.1938547486033517, + "grad_norm": 1.530231237411499, + "learning_rate": 0.000842829131652661, + "loss": 0.4141, + "step": 5717 + }, + { + "epoch": 3.194413407821229, + "grad_norm": 0.7651337385177612, + "learning_rate": 0.0008428011204481792, + "loss": 0.481, + "step": 5718 + }, + { + "epoch": 3.194972067039106, + "grad_norm": 0.5995467305183411, + "learning_rate": 0.0008427731092436975, + "loss": 0.5484, + "step": 5719 + }, + { + "epoch": 3.195530726256983, + "grad_norm": 0.5191141963005066, + "learning_rate": 0.0008427450980392157, + "loss": 0.4855, + "step": 5720 + }, + { + "epoch": 3.19608938547486, + "grad_norm": 0.6673374176025391, + "learning_rate": 0.0008427170868347339, + "loss": 0.5144, + "step": 5721 + }, + { + "epoch": 3.1966480446927372, + "grad_norm": 0.4839000999927521, + "learning_rate": 0.0008426890756302522, + "loss": 0.4386, + "step": 5722 + }, + { + "epoch": 3.1972067039106147, + "grad_norm": 0.9123650789260864, + "learning_rate": 0.0008426610644257702, + "loss": 0.6075, + "step": 5723 + }, + { + "epoch": 3.1977653631284917, + "grad_norm": 0.48614388704299927, + "learning_rate": 0.0008426330532212886, + "loss": 0.4924, + "step": 5724 + }, + { + "epoch": 3.1983240223463687, + "grad_norm": 0.4855598509311676, + "learning_rate": 0.0008426050420168068, + "loss": 0.5797, + "step": 5725 + }, + { + "epoch": 3.1988826815642457, + "grad_norm": 0.5441485643386841, + "learning_rate": 0.000842577030812325, + "loss": 0.4406, + "step": 5726 + }, + { + "epoch": 3.1994413407821227, + "grad_norm": 0.8743434548377991, + "learning_rate": 0.0008425490196078432, + "loss": 0.4153, + "step": 5727 + }, + { + "epoch": 3.2, + "grad_norm": 0.5514986515045166, + "learning_rate": 0.0008425210084033613, + "loss": 0.3868, + "step": 5728 + }, + { + "epoch": 3.200558659217877, + "grad_norm": 1.6775099039077759, + "learning_rate": 0.0008424929971988796, + "loss": 0.5614, + "step": 5729 + }, + { + "epoch": 3.201117318435754, + "grad_norm": 0.5128558278083801, + "learning_rate": 0.0008424649859943978, + "loss": 0.5077, + "step": 5730 + }, + { + "epoch": 3.201675977653631, + "grad_norm": 0.40318864583969116, + "learning_rate": 0.000842436974789916, + "loss": 0.3648, + "step": 5731 + }, + { + "epoch": 3.2022346368715082, + "grad_norm": 0.4633824825286865, + "learning_rate": 0.0008424089635854342, + "loss": 0.4542, + "step": 5732 + }, + { + "epoch": 3.2027932960893857, + "grad_norm": 1.8085352182388306, + "learning_rate": 0.0008423809523809523, + "loss": 0.4831, + "step": 5733 + }, + { + "epoch": 3.2033519553072627, + "grad_norm": 0.5630502104759216, + "learning_rate": 0.0008423529411764706, + "loss": 0.4112, + "step": 5734 + }, + { + "epoch": 3.2039106145251397, + "grad_norm": 2.502984046936035, + "learning_rate": 0.0008423249299719888, + "loss": 0.341, + "step": 5735 + }, + { + "epoch": 3.2044692737430167, + "grad_norm": 0.5019913911819458, + "learning_rate": 0.000842296918767507, + "loss": 0.4526, + "step": 5736 + }, + { + "epoch": 3.2050279329608937, + "grad_norm": 0.9250722527503967, + "learning_rate": 0.0008422689075630252, + "loss": 0.6133, + "step": 5737 + }, + { + "epoch": 3.205586592178771, + "grad_norm": 0.7668030261993408, + "learning_rate": 0.0008422408963585434, + "loss": 0.4595, + "step": 5738 + }, + { + "epoch": 3.206145251396648, + "grad_norm": 0.8034147024154663, + "learning_rate": 0.0008422128851540617, + "loss": 0.4791, + "step": 5739 + }, + { + "epoch": 3.206703910614525, + "grad_norm": 0.5307127833366394, + "learning_rate": 0.0008421848739495799, + "loss": 0.5161, + "step": 5740 + }, + { + "epoch": 3.207262569832402, + "grad_norm": 0.5297577977180481, + "learning_rate": 0.0008421568627450981, + "loss": 0.4318, + "step": 5741 + }, + { + "epoch": 3.207821229050279, + "grad_norm": 1.2233542203903198, + "learning_rate": 0.0008421288515406163, + "loss": 0.3532, + "step": 5742 + }, + { + "epoch": 3.2083798882681562, + "grad_norm": 0.7091529369354248, + "learning_rate": 0.0008421008403361345, + "loss": 0.4733, + "step": 5743 + }, + { + "epoch": 3.2089385474860337, + "grad_norm": 0.7562621831893921, + "learning_rate": 0.0008420728291316527, + "loss": 0.4636, + "step": 5744 + }, + { + "epoch": 3.2094972067039107, + "grad_norm": 3.031297445297241, + "learning_rate": 0.0008420448179271709, + "loss": 0.5458, + "step": 5745 + }, + { + "epoch": 3.2100558659217877, + "grad_norm": 0.46593883633613586, + "learning_rate": 0.0008420168067226891, + "loss": 0.4396, + "step": 5746 + }, + { + "epoch": 3.2106145251396647, + "grad_norm": 0.5773929357528687, + "learning_rate": 0.0008419887955182073, + "loss": 0.4911, + "step": 5747 + }, + { + "epoch": 3.2111731843575417, + "grad_norm": 0.8325348496437073, + "learning_rate": 0.0008419607843137255, + "loss": 0.5775, + "step": 5748 + }, + { + "epoch": 3.211731843575419, + "grad_norm": 0.6409006714820862, + "learning_rate": 0.0008419327731092437, + "loss": 0.4466, + "step": 5749 + }, + { + "epoch": 3.212290502793296, + "grad_norm": 2.3184618949890137, + "learning_rate": 0.0008419047619047619, + "loss": 0.5867, + "step": 5750 + }, + { + "epoch": 3.212849162011173, + "grad_norm": 0.8211964964866638, + "learning_rate": 0.0008418767507002801, + "loss": 0.5038, + "step": 5751 + }, + { + "epoch": 3.21340782122905, + "grad_norm": 0.5444333553314209, + "learning_rate": 0.0008418487394957983, + "loss": 0.4761, + "step": 5752 + }, + { + "epoch": 3.2139664804469272, + "grad_norm": 0.8410589694976807, + "learning_rate": 0.0008418207282913165, + "loss": 0.4396, + "step": 5753 + }, + { + "epoch": 3.2145251396648042, + "grad_norm": 0.5327155590057373, + "learning_rate": 0.0008417927170868349, + "loss": 0.3801, + "step": 5754 + }, + { + "epoch": 3.2150837988826817, + "grad_norm": 9.373048782348633, + "learning_rate": 0.000841764705882353, + "loss": 0.5984, + "step": 5755 + }, + { + "epoch": 3.2156424581005587, + "grad_norm": 0.5760027170181274, + "learning_rate": 0.0008417366946778712, + "loss": 0.5275, + "step": 5756 + }, + { + "epoch": 3.2162011173184357, + "grad_norm": 1.1095306873321533, + "learning_rate": 0.0008417086834733894, + "loss": 0.5204, + "step": 5757 + }, + { + "epoch": 3.2167597765363127, + "grad_norm": 0.5383431911468506, + "learning_rate": 0.0008416806722689076, + "loss": 0.538, + "step": 5758 + }, + { + "epoch": 3.2173184357541897, + "grad_norm": 0.9036241769790649, + "learning_rate": 0.0008416526610644259, + "loss": 0.5184, + "step": 5759 + }, + { + "epoch": 3.217877094972067, + "grad_norm": 0.6109788417816162, + "learning_rate": 0.000841624649859944, + "loss": 0.52, + "step": 5760 + }, + { + "epoch": 3.218435754189944, + "grad_norm": 0.6571570634841919, + "learning_rate": 0.0008415966386554622, + "loss": 0.5638, + "step": 5761 + }, + { + "epoch": 3.218994413407821, + "grad_norm": 2.4715847969055176, + "learning_rate": 0.0008415686274509804, + "loss": 0.3864, + "step": 5762 + }, + { + "epoch": 3.2195530726256982, + "grad_norm": 0.7069171667098999, + "learning_rate": 0.0008415406162464986, + "loss": 0.5143, + "step": 5763 + }, + { + "epoch": 3.2201117318435752, + "grad_norm": 0.6125462055206299, + "learning_rate": 0.0008415126050420169, + "loss": 0.5237, + "step": 5764 + }, + { + "epoch": 3.2206703910614527, + "grad_norm": 0.5367489457130432, + "learning_rate": 0.000841484593837535, + "loss": 0.466, + "step": 5765 + }, + { + "epoch": 3.2212290502793297, + "grad_norm": 0.6129879951477051, + "learning_rate": 0.0008414565826330532, + "loss": 0.4102, + "step": 5766 + }, + { + "epoch": 3.2217877094972067, + "grad_norm": 0.744072437286377, + "learning_rate": 0.0008414285714285714, + "loss": 0.4615, + "step": 5767 + }, + { + "epoch": 3.2223463687150837, + "grad_norm": 1.3382662534713745, + "learning_rate": 0.0008414005602240896, + "loss": 0.581, + "step": 5768 + }, + { + "epoch": 3.2229050279329607, + "grad_norm": 0.6672928929328918, + "learning_rate": 0.0008413725490196079, + "loss": 0.393, + "step": 5769 + }, + { + "epoch": 3.223463687150838, + "grad_norm": 0.5600702166557312, + "learning_rate": 0.0008413445378151261, + "loss": 0.5429, + "step": 5770 + }, + { + "epoch": 3.224022346368715, + "grad_norm": 0.5933088660240173, + "learning_rate": 0.0008413165266106442, + "loss": 0.4516, + "step": 5771 + }, + { + "epoch": 3.224581005586592, + "grad_norm": 0.5212042927742004, + "learning_rate": 0.0008412885154061625, + "loss": 0.4062, + "step": 5772 + }, + { + "epoch": 3.2251396648044692, + "grad_norm": 0.5427224636077881, + "learning_rate": 0.0008412605042016807, + "loss": 0.4972, + "step": 5773 + }, + { + "epoch": 3.2256983240223462, + "grad_norm": 0.524664580821991, + "learning_rate": 0.000841232492997199, + "loss": 0.4396, + "step": 5774 + }, + { + "epoch": 3.2262569832402237, + "grad_norm": 0.6468037962913513, + "learning_rate": 0.0008412044817927172, + "loss": 0.4934, + "step": 5775 + }, + { + "epoch": 3.2268156424581007, + "grad_norm": 0.4983474910259247, + "learning_rate": 0.0008411764705882353, + "loss": 0.437, + "step": 5776 + }, + { + "epoch": 3.2273743016759777, + "grad_norm": 1.0326396226882935, + "learning_rate": 0.0008411484593837535, + "loss": 0.4518, + "step": 5777 + }, + { + "epoch": 3.2279329608938547, + "grad_norm": 0.8367441296577454, + "learning_rate": 0.0008411204481792717, + "loss": 0.4202, + "step": 5778 + }, + { + "epoch": 3.2284916201117317, + "grad_norm": 1.5711349248886108, + "learning_rate": 0.00084109243697479, + "loss": 0.4431, + "step": 5779 + }, + { + "epoch": 3.2290502793296088, + "grad_norm": 0.5931336879730225, + "learning_rate": 0.0008410644257703082, + "loss": 0.3531, + "step": 5780 + }, + { + "epoch": 3.229608938547486, + "grad_norm": 0.7560968995094299, + "learning_rate": 0.0008410364145658263, + "loss": 0.5194, + "step": 5781 + }, + { + "epoch": 3.230167597765363, + "grad_norm": 1.7650152444839478, + "learning_rate": 0.0008410084033613445, + "loss": 0.522, + "step": 5782 + }, + { + "epoch": 3.2307262569832402, + "grad_norm": 0.871603786945343, + "learning_rate": 0.0008409803921568627, + "loss": 0.4031, + "step": 5783 + }, + { + "epoch": 3.2312849162011172, + "grad_norm": 0.46541014313697815, + "learning_rate": 0.000840952380952381, + "loss": 0.3239, + "step": 5784 + }, + { + "epoch": 3.2318435754189943, + "grad_norm": 0.5787938237190247, + "learning_rate": 0.0008409243697478992, + "loss": 0.4855, + "step": 5785 + }, + { + "epoch": 3.2324022346368717, + "grad_norm": 1.032808780670166, + "learning_rate": 0.0008408963585434174, + "loss": 0.5314, + "step": 5786 + }, + { + "epoch": 3.2329608938547487, + "grad_norm": 1.4101992845535278, + "learning_rate": 0.0008408683473389355, + "loss": 0.5148, + "step": 5787 + }, + { + "epoch": 3.2335195530726257, + "grad_norm": 0.9198086857795715, + "learning_rate": 0.0008408403361344537, + "loss": 0.5617, + "step": 5788 + }, + { + "epoch": 3.2340782122905027, + "grad_norm": 0.5087761878967285, + "learning_rate": 0.0008408123249299721, + "loss": 0.477, + "step": 5789 + }, + { + "epoch": 3.2346368715083798, + "grad_norm": 0.8337647318840027, + "learning_rate": 0.0008407843137254903, + "loss": 0.5226, + "step": 5790 + }, + { + "epoch": 3.2351955307262568, + "grad_norm": 0.7318830490112305, + "learning_rate": 0.0008407563025210085, + "loss": 0.4866, + "step": 5791 + }, + { + "epoch": 3.235754189944134, + "grad_norm": 0.5144968628883362, + "learning_rate": 0.0008407282913165266, + "loss": 0.4411, + "step": 5792 + }, + { + "epoch": 3.2363128491620112, + "grad_norm": 0.45033547282218933, + "learning_rate": 0.0008407002801120448, + "loss": 0.4586, + "step": 5793 + }, + { + "epoch": 3.2368715083798882, + "grad_norm": 0.5863280296325684, + "learning_rate": 0.0008406722689075631, + "loss": 0.3774, + "step": 5794 + }, + { + "epoch": 3.2374301675977653, + "grad_norm": 0.564110279083252, + "learning_rate": 0.0008406442577030813, + "loss": 0.4198, + "step": 5795 + }, + { + "epoch": 3.2379888268156423, + "grad_norm": 0.548296332359314, + "learning_rate": 0.0008406162464985995, + "loss": 0.4525, + "step": 5796 + }, + { + "epoch": 3.2385474860335197, + "grad_norm": 0.7311416268348694, + "learning_rate": 0.0008405882352941176, + "loss": 0.4919, + "step": 5797 + }, + { + "epoch": 3.2391061452513967, + "grad_norm": 0.8921449184417725, + "learning_rate": 0.0008405602240896358, + "loss": 0.3881, + "step": 5798 + }, + { + "epoch": 3.2396648044692737, + "grad_norm": 0.5490403175354004, + "learning_rate": 0.000840532212885154, + "loss": 0.4504, + "step": 5799 + }, + { + "epoch": 3.2402234636871508, + "grad_norm": 0.44778934121131897, + "learning_rate": 0.0008405042016806723, + "loss": 0.4101, + "step": 5800 + }, + { + "epoch": 3.2407821229050278, + "grad_norm": 0.504734992980957, + "learning_rate": 0.0008404761904761905, + "loss": 0.4183, + "step": 5801 + }, + { + "epoch": 3.241340782122905, + "grad_norm": 0.7137935161590576, + "learning_rate": 0.0008404481792717087, + "loss": 0.5006, + "step": 5802 + }, + { + "epoch": 3.2418994413407822, + "grad_norm": 0.6243535876274109, + "learning_rate": 0.0008404201680672268, + "loss": 0.5688, + "step": 5803 + }, + { + "epoch": 3.2424581005586592, + "grad_norm": 0.5670920014381409, + "learning_rate": 0.000840392156862745, + "loss": 0.5508, + "step": 5804 + }, + { + "epoch": 3.2430167597765363, + "grad_norm": 1.1637734174728394, + "learning_rate": 0.0008403641456582634, + "loss": 0.5075, + "step": 5805 + }, + { + "epoch": 3.2435754189944133, + "grad_norm": 2.084158420562744, + "learning_rate": 0.0008403361344537816, + "loss": 0.4104, + "step": 5806 + }, + { + "epoch": 3.2441340782122907, + "grad_norm": 0.5076479911804199, + "learning_rate": 0.0008403081232492998, + "loss": 0.4143, + "step": 5807 + }, + { + "epoch": 3.2446927374301677, + "grad_norm": 0.5585202574729919, + "learning_rate": 0.0008402801120448179, + "loss": 0.408, + "step": 5808 + }, + { + "epoch": 3.2452513966480447, + "grad_norm": 0.5491577386856079, + "learning_rate": 0.0008402521008403361, + "loss": 0.487, + "step": 5809 + }, + { + "epoch": 3.2458100558659218, + "grad_norm": 0.6580778956413269, + "learning_rate": 0.0008402240896358544, + "loss": 0.5002, + "step": 5810 + }, + { + "epoch": 3.2463687150837988, + "grad_norm": 0.820746123790741, + "learning_rate": 0.0008401960784313726, + "loss": 0.4189, + "step": 5811 + }, + { + "epoch": 3.2469273743016758, + "grad_norm": 1.0066092014312744, + "learning_rate": 0.0008401680672268908, + "loss": 0.518, + "step": 5812 + }, + { + "epoch": 3.2474860335195532, + "grad_norm": 0.4979892373085022, + "learning_rate": 0.0008401400560224089, + "loss": 0.4848, + "step": 5813 + }, + { + "epoch": 3.2480446927374302, + "grad_norm": 0.4993119537830353, + "learning_rate": 0.0008401120448179271, + "loss": 0.4859, + "step": 5814 + }, + { + "epoch": 3.2486033519553073, + "grad_norm": 0.8079336881637573, + "learning_rate": 0.0008400840336134454, + "loss": 0.425, + "step": 5815 + }, + { + "epoch": 3.2491620111731843, + "grad_norm": 0.47287917137145996, + "learning_rate": 0.0008400560224089636, + "loss": 0.3969, + "step": 5816 + }, + { + "epoch": 3.2497206703910613, + "grad_norm": 0.6306312084197998, + "learning_rate": 0.0008400280112044818, + "loss": 0.5888, + "step": 5817 + }, + { + "epoch": 3.2502793296089387, + "grad_norm": 0.4639133810997009, + "learning_rate": 0.00084, + "loss": 0.3681, + "step": 5818 + }, + { + "epoch": 3.2508379888268157, + "grad_norm": 0.3771038055419922, + "learning_rate": 0.0008399719887955181, + "loss": 0.341, + "step": 5819 + }, + { + "epoch": 3.2513966480446927, + "grad_norm": 0.6737233996391296, + "learning_rate": 0.0008399439775910364, + "loss": 0.464, + "step": 5820 + }, + { + "epoch": 3.2519553072625698, + "grad_norm": 0.6057330369949341, + "learning_rate": 0.0008399159663865547, + "loss": 0.4295, + "step": 5821 + }, + { + "epoch": 3.2525139664804468, + "grad_norm": 0.5017884969711304, + "learning_rate": 0.0008398879551820729, + "loss": 0.5652, + "step": 5822 + }, + { + "epoch": 3.253072625698324, + "grad_norm": 0.5959794521331787, + "learning_rate": 0.0008398599439775911, + "loss": 0.5196, + "step": 5823 + }, + { + "epoch": 3.2536312849162012, + "grad_norm": 0.6592594385147095, + "learning_rate": 0.0008398319327731092, + "loss": 0.4486, + "step": 5824 + }, + { + "epoch": 3.2541899441340782, + "grad_norm": 0.5312095880508423, + "learning_rate": 0.0008398039215686275, + "loss": 0.3249, + "step": 5825 + }, + { + "epoch": 3.2547486033519553, + "grad_norm": 0.6078124642372131, + "learning_rate": 0.0008397759103641457, + "loss": 0.4297, + "step": 5826 + }, + { + "epoch": 3.2553072625698323, + "grad_norm": 0.7393833994865417, + "learning_rate": 0.0008397478991596639, + "loss": 0.6271, + "step": 5827 + }, + { + "epoch": 3.2558659217877093, + "grad_norm": 0.7218764424324036, + "learning_rate": 0.0008397198879551821, + "loss": 0.5153, + "step": 5828 + }, + { + "epoch": 3.2564245810055867, + "grad_norm": 0.6063776612281799, + "learning_rate": 0.0008396918767507002, + "loss": 0.5261, + "step": 5829 + }, + { + "epoch": 3.2569832402234637, + "grad_norm": 0.5681562423706055, + "learning_rate": 0.0008396638655462185, + "loss": 0.4595, + "step": 5830 + }, + { + "epoch": 3.2575418994413408, + "grad_norm": 0.9323566555976868, + "learning_rate": 0.0008396358543417367, + "loss": 0.6893, + "step": 5831 + }, + { + "epoch": 3.2581005586592178, + "grad_norm": 0.4728999733924866, + "learning_rate": 0.0008396078431372549, + "loss": 0.5198, + "step": 5832 + }, + { + "epoch": 3.258659217877095, + "grad_norm": 0.5985539555549622, + "learning_rate": 0.0008395798319327731, + "loss": 0.4188, + "step": 5833 + }, + { + "epoch": 3.2592178770949722, + "grad_norm": 0.8027599453926086, + "learning_rate": 0.0008395518207282913, + "loss": 0.4697, + "step": 5834 + }, + { + "epoch": 3.2597765363128492, + "grad_norm": 0.5126614570617676, + "learning_rate": 0.0008395238095238095, + "loss": 0.4221, + "step": 5835 + }, + { + "epoch": 3.2603351955307263, + "grad_norm": 0.6420572400093079, + "learning_rate": 0.0008394957983193277, + "loss": 0.5148, + "step": 5836 + }, + { + "epoch": 3.2608938547486033, + "grad_norm": 0.7101128101348877, + "learning_rate": 0.000839467787114846, + "loss": 0.4463, + "step": 5837 + }, + { + "epoch": 3.2614525139664803, + "grad_norm": 0.5522117018699646, + "learning_rate": 0.0008394397759103642, + "loss": 0.4005, + "step": 5838 + }, + { + "epoch": 3.2620111731843577, + "grad_norm": 0.6390681862831116, + "learning_rate": 0.0008394117647058824, + "loss": 0.4511, + "step": 5839 + }, + { + "epoch": 3.2625698324022347, + "grad_norm": 0.4235347509384155, + "learning_rate": 0.0008393837535014006, + "loss": 0.4698, + "step": 5840 + }, + { + "epoch": 3.2631284916201118, + "grad_norm": 0.4609551727771759, + "learning_rate": 0.0008393557422969188, + "loss": 0.3928, + "step": 5841 + }, + { + "epoch": 3.2636871508379888, + "grad_norm": 0.6046702265739441, + "learning_rate": 0.000839327731092437, + "loss": 0.4491, + "step": 5842 + }, + { + "epoch": 3.264245810055866, + "grad_norm": 0.8087189197540283, + "learning_rate": 0.0008392997198879552, + "loss": 0.6472, + "step": 5843 + }, + { + "epoch": 3.2648044692737432, + "grad_norm": 0.4118671119213104, + "learning_rate": 0.0008392717086834734, + "loss": 0.4232, + "step": 5844 + }, + { + "epoch": 3.2653631284916202, + "grad_norm": 0.9294708967208862, + "learning_rate": 0.0008392436974789917, + "loss": 0.4206, + "step": 5845 + }, + { + "epoch": 3.2659217877094973, + "grad_norm": 0.44083964824676514, + "learning_rate": 0.0008392156862745098, + "loss": 0.4222, + "step": 5846 + }, + { + "epoch": 3.2664804469273743, + "grad_norm": 0.39109838008880615, + "learning_rate": 0.000839187675070028, + "loss": 0.3247, + "step": 5847 + }, + { + "epoch": 3.2670391061452513, + "grad_norm": 0.7346558570861816, + "learning_rate": 0.0008391596638655462, + "loss": 0.3942, + "step": 5848 + }, + { + "epoch": 3.2675977653631287, + "grad_norm": 1.5916489362716675, + "learning_rate": 0.0008391316526610644, + "loss": 0.4863, + "step": 5849 + }, + { + "epoch": 3.2681564245810057, + "grad_norm": 0.4552851617336273, + "learning_rate": 0.0008391036414565827, + "loss": 0.4117, + "step": 5850 + }, + { + "epoch": 3.2687150837988828, + "grad_norm": 0.6976941823959351, + "learning_rate": 0.0008390756302521008, + "loss": 0.4061, + "step": 5851 + }, + { + "epoch": 3.2692737430167598, + "grad_norm": 0.9338297247886658, + "learning_rate": 0.000839047619047619, + "loss": 0.5003, + "step": 5852 + }, + { + "epoch": 3.269832402234637, + "grad_norm": 0.7453265190124512, + "learning_rate": 0.0008390196078431372, + "loss": 0.4772, + "step": 5853 + }, + { + "epoch": 3.270391061452514, + "grad_norm": 0.512266218662262, + "learning_rate": 0.0008389915966386555, + "loss": 0.4417, + "step": 5854 + }, + { + "epoch": 3.2709497206703912, + "grad_norm": 0.45327645540237427, + "learning_rate": 0.0008389635854341738, + "loss": 0.4654, + "step": 5855 + }, + { + "epoch": 3.2715083798882683, + "grad_norm": 0.3998764753341675, + "learning_rate": 0.0008389355742296919, + "loss": 0.4208, + "step": 5856 + }, + { + "epoch": 3.2720670391061453, + "grad_norm": 0.7484354376792908, + "learning_rate": 0.0008389075630252101, + "loss": 0.4572, + "step": 5857 + }, + { + "epoch": 3.2726256983240223, + "grad_norm": 0.6923760771751404, + "learning_rate": 0.0008388795518207283, + "loss": 0.4934, + "step": 5858 + }, + { + "epoch": 3.2731843575418993, + "grad_norm": 0.8958445191383362, + "learning_rate": 0.0008388515406162465, + "loss": 0.6625, + "step": 5859 + }, + { + "epoch": 3.2737430167597763, + "grad_norm": 1.4391804933547974, + "learning_rate": 0.0008388235294117648, + "loss": 0.5695, + "step": 5860 + }, + { + "epoch": 3.2743016759776538, + "grad_norm": 0.5883399844169617, + "learning_rate": 0.000838795518207283, + "loss": 0.4544, + "step": 5861 + }, + { + "epoch": 3.2748603351955308, + "grad_norm": 0.6043676137924194, + "learning_rate": 0.0008387675070028011, + "loss": 0.4684, + "step": 5862 + }, + { + "epoch": 3.275418994413408, + "grad_norm": 0.949695348739624, + "learning_rate": 0.0008387394957983193, + "loss": 0.4743, + "step": 5863 + }, + { + "epoch": 3.275977653631285, + "grad_norm": 0.4443172812461853, + "learning_rate": 0.0008387114845938375, + "loss": 0.3997, + "step": 5864 + }, + { + "epoch": 3.276536312849162, + "grad_norm": 0.8300796151161194, + "learning_rate": 0.0008386834733893558, + "loss": 0.5845, + "step": 5865 + }, + { + "epoch": 3.2770949720670393, + "grad_norm": 0.6482070684432983, + "learning_rate": 0.000838655462184874, + "loss": 0.4247, + "step": 5866 + }, + { + "epoch": 3.2776536312849163, + "grad_norm": 2.993833065032959, + "learning_rate": 0.0008386274509803921, + "loss": 0.5309, + "step": 5867 + }, + { + "epoch": 3.2782122905027933, + "grad_norm": 0.6275423765182495, + "learning_rate": 0.0008385994397759103, + "loss": 0.48, + "step": 5868 + }, + { + "epoch": 3.2787709497206703, + "grad_norm": 0.7693106532096863, + "learning_rate": 0.0008385714285714285, + "loss": 0.507, + "step": 5869 + }, + { + "epoch": 3.2793296089385473, + "grad_norm": 0.4466283321380615, + "learning_rate": 0.0008385434173669469, + "loss": 0.3821, + "step": 5870 + }, + { + "epoch": 3.2798882681564248, + "grad_norm": 3.289853572845459, + "learning_rate": 0.0008385154061624651, + "loss": 0.4186, + "step": 5871 + }, + { + "epoch": 3.2804469273743018, + "grad_norm": 0.6237671375274658, + "learning_rate": 0.0008384873949579832, + "loss": 0.487, + "step": 5872 + }, + { + "epoch": 3.281005586592179, + "grad_norm": 0.9812585711479187, + "learning_rate": 0.0008384593837535014, + "loss": 0.5444, + "step": 5873 + }, + { + "epoch": 3.281564245810056, + "grad_norm": 2.3811252117156982, + "learning_rate": 0.0008384313725490196, + "loss": 0.4848, + "step": 5874 + }, + { + "epoch": 3.282122905027933, + "grad_norm": 0.8060725331306458, + "learning_rate": 0.0008384033613445379, + "loss": 0.4601, + "step": 5875 + }, + { + "epoch": 3.2826815642458103, + "grad_norm": 0.9154607057571411, + "learning_rate": 0.0008383753501400561, + "loss": 0.4821, + "step": 5876 + }, + { + "epoch": 3.2832402234636873, + "grad_norm": 1.448519229888916, + "learning_rate": 0.0008383473389355743, + "loss": 0.3754, + "step": 5877 + }, + { + "epoch": 3.2837988826815643, + "grad_norm": 0.47846508026123047, + "learning_rate": 0.0008383193277310924, + "loss": 0.4222, + "step": 5878 + }, + { + "epoch": 3.2843575418994413, + "grad_norm": 0.7119501829147339, + "learning_rate": 0.0008382913165266106, + "loss": 0.5035, + "step": 5879 + }, + { + "epoch": 3.2849162011173183, + "grad_norm": 0.6041054725646973, + "learning_rate": 0.0008382633053221289, + "loss": 0.5721, + "step": 5880 + }, + { + "epoch": 3.2854748603351958, + "grad_norm": 0.8183274865150452, + "learning_rate": 0.0008382352941176471, + "loss": 0.5763, + "step": 5881 + }, + { + "epoch": 3.2860335195530728, + "grad_norm": 0.5485641360282898, + "learning_rate": 0.0008382072829131653, + "loss": 0.5318, + "step": 5882 + }, + { + "epoch": 3.28659217877095, + "grad_norm": 0.9233376383781433, + "learning_rate": 0.0008381792717086834, + "loss": 0.4954, + "step": 5883 + }, + { + "epoch": 3.287150837988827, + "grad_norm": 0.6849166750907898, + "learning_rate": 0.0008381512605042016, + "loss": 0.4896, + "step": 5884 + }, + { + "epoch": 3.287709497206704, + "grad_norm": 0.7204636335372925, + "learning_rate": 0.0008381232492997199, + "loss": 0.4403, + "step": 5885 + }, + { + "epoch": 3.288268156424581, + "grad_norm": 0.9672218561172485, + "learning_rate": 0.0008380952380952382, + "loss": 0.4994, + "step": 5886 + }, + { + "epoch": 3.2888268156424583, + "grad_norm": 0.5536984205245972, + "learning_rate": 0.0008380672268907564, + "loss": 0.4526, + "step": 5887 + }, + { + "epoch": 3.2893854748603353, + "grad_norm": 0.6521371006965637, + "learning_rate": 0.0008380392156862745, + "loss": 0.4508, + "step": 5888 + }, + { + "epoch": 3.2899441340782123, + "grad_norm": 3.3169729709625244, + "learning_rate": 0.0008380112044817927, + "loss": 0.4629, + "step": 5889 + }, + { + "epoch": 3.2905027932960893, + "grad_norm": 2.767110586166382, + "learning_rate": 0.000837983193277311, + "loss": 0.4126, + "step": 5890 + }, + { + "epoch": 3.2910614525139663, + "grad_norm": 0.7629807591438293, + "learning_rate": 0.0008379551820728292, + "loss": 0.5102, + "step": 5891 + }, + { + "epoch": 3.2916201117318438, + "grad_norm": 0.6200615763664246, + "learning_rate": 0.0008379271708683474, + "loss": 0.6001, + "step": 5892 + }, + { + "epoch": 3.292178770949721, + "grad_norm": 0.524581789970398, + "learning_rate": 0.0008378991596638656, + "loss": 0.3683, + "step": 5893 + }, + { + "epoch": 3.292737430167598, + "grad_norm": 1.2102714776992798, + "learning_rate": 0.0008378711484593837, + "loss": 0.6946, + "step": 5894 + }, + { + "epoch": 3.293296089385475, + "grad_norm": 0.4404248297214508, + "learning_rate": 0.000837843137254902, + "loss": 0.4063, + "step": 5895 + }, + { + "epoch": 3.293854748603352, + "grad_norm": 0.6545925140380859, + "learning_rate": 0.0008378151260504202, + "loss": 0.3816, + "step": 5896 + }, + { + "epoch": 3.294413407821229, + "grad_norm": 1.7304235696792603, + "learning_rate": 0.0008377871148459384, + "loss": 0.4554, + "step": 5897 + }, + { + "epoch": 3.2949720670391063, + "grad_norm": 0.5310423374176025, + "learning_rate": 0.0008377591036414566, + "loss": 0.577, + "step": 5898 + }, + { + "epoch": 3.2955307262569833, + "grad_norm": 0.4148930311203003, + "learning_rate": 0.0008377310924369747, + "loss": 0.4106, + "step": 5899 + }, + { + "epoch": 3.2960893854748603, + "grad_norm": 0.5346720218658447, + "learning_rate": 0.000837703081232493, + "loss": 0.4965, + "step": 5900 + }, + { + "epoch": 3.2966480446927373, + "grad_norm": 0.535318911075592, + "learning_rate": 0.0008376750700280112, + "loss": 0.5348, + "step": 5901 + }, + { + "epoch": 3.2972067039106143, + "grad_norm": 1.3257360458374023, + "learning_rate": 0.0008376470588235294, + "loss": 0.4411, + "step": 5902 + }, + { + "epoch": 3.2977653631284918, + "grad_norm": 0.7085011601448059, + "learning_rate": 0.0008376190476190477, + "loss": 0.5034, + "step": 5903 + }, + { + "epoch": 3.298324022346369, + "grad_norm": 0.6488482356071472, + "learning_rate": 0.0008375910364145658, + "loss": 0.4438, + "step": 5904 + }, + { + "epoch": 3.298882681564246, + "grad_norm": 0.8926328420639038, + "learning_rate": 0.0008375630252100841, + "loss": 0.3763, + "step": 5905 + }, + { + "epoch": 3.299441340782123, + "grad_norm": 0.6652448773384094, + "learning_rate": 0.0008375350140056023, + "loss": 0.5289, + "step": 5906 + }, + { + "epoch": 3.3, + "grad_norm": 0.5910142660140991, + "learning_rate": 0.0008375070028011205, + "loss": 0.435, + "step": 5907 + }, + { + "epoch": 3.3005586592178773, + "grad_norm": 0.621354341506958, + "learning_rate": 0.0008374789915966387, + "loss": 0.4935, + "step": 5908 + }, + { + "epoch": 3.3011173184357543, + "grad_norm": 6.71200704574585, + "learning_rate": 0.0008374509803921569, + "loss": 0.4426, + "step": 5909 + }, + { + "epoch": 3.3016759776536313, + "grad_norm": 0.4947996139526367, + "learning_rate": 0.0008374229691876751, + "loss": 0.3556, + "step": 5910 + }, + { + "epoch": 3.3022346368715083, + "grad_norm": 0.4281250834465027, + "learning_rate": 0.0008373949579831933, + "loss": 0.4049, + "step": 5911 + }, + { + "epoch": 3.3027932960893853, + "grad_norm": 0.7397978901863098, + "learning_rate": 0.0008373669467787115, + "loss": 0.5779, + "step": 5912 + }, + { + "epoch": 3.3033519553072628, + "grad_norm": 9.742897987365723, + "learning_rate": 0.0008373389355742297, + "loss": 0.5392, + "step": 5913 + }, + { + "epoch": 3.30391061452514, + "grad_norm": 0.8706209659576416, + "learning_rate": 0.0008373109243697479, + "loss": 0.451, + "step": 5914 + }, + { + "epoch": 3.304469273743017, + "grad_norm": 0.49004945158958435, + "learning_rate": 0.0008372829131652661, + "loss": 0.5001, + "step": 5915 + }, + { + "epoch": 3.305027932960894, + "grad_norm": 1.3001060485839844, + "learning_rate": 0.0008372549019607843, + "loss": 0.5163, + "step": 5916 + }, + { + "epoch": 3.305586592178771, + "grad_norm": 0.7620853185653687, + "learning_rate": 0.0008372268907563025, + "loss": 0.486, + "step": 5917 + }, + { + "epoch": 3.3061452513966483, + "grad_norm": 0.6356363892555237, + "learning_rate": 0.0008371988795518207, + "loss": 0.6636, + "step": 5918 + }, + { + "epoch": 3.3067039106145253, + "grad_norm": 0.45684704184532166, + "learning_rate": 0.000837170868347339, + "loss": 0.4321, + "step": 5919 + }, + { + "epoch": 3.3072625698324023, + "grad_norm": 0.5030642151832581, + "learning_rate": 0.0008371428571428572, + "loss": 0.5827, + "step": 5920 + }, + { + "epoch": 3.3078212290502793, + "grad_norm": 0.9501070976257324, + "learning_rate": 0.0008371148459383754, + "loss": 0.9152, + "step": 5921 + }, + { + "epoch": 3.3083798882681563, + "grad_norm": 1.2457044124603271, + "learning_rate": 0.0008370868347338936, + "loss": 0.4704, + "step": 5922 + }, + { + "epoch": 3.3089385474860333, + "grad_norm": 0.5496012568473816, + "learning_rate": 0.0008370588235294118, + "loss": 0.3784, + "step": 5923 + }, + { + "epoch": 3.309497206703911, + "grad_norm": 0.775839626789093, + "learning_rate": 0.00083703081232493, + "loss": 0.5187, + "step": 5924 + }, + { + "epoch": 3.310055865921788, + "grad_norm": 0.7287942171096802, + "learning_rate": 0.0008370028011204483, + "loss": 0.4446, + "step": 5925 + }, + { + "epoch": 3.310614525139665, + "grad_norm": 1.3466728925704956, + "learning_rate": 0.0008369747899159664, + "loss": 0.501, + "step": 5926 + }, + { + "epoch": 3.311173184357542, + "grad_norm": 0.7960771322250366, + "learning_rate": 0.0008369467787114846, + "loss": 0.624, + "step": 5927 + }, + { + "epoch": 3.311731843575419, + "grad_norm": 0.6292746663093567, + "learning_rate": 0.0008369187675070028, + "loss": 0.5935, + "step": 5928 + }, + { + "epoch": 3.312290502793296, + "grad_norm": 0.4995972812175751, + "learning_rate": 0.000836890756302521, + "loss": 0.4782, + "step": 5929 + }, + { + "epoch": 3.3128491620111733, + "grad_norm": 0.7585958242416382, + "learning_rate": 0.0008368627450980393, + "loss": 0.4901, + "step": 5930 + }, + { + "epoch": 3.3134078212290503, + "grad_norm": 0.7725650072097778, + "learning_rate": 0.0008368347338935574, + "loss": 0.4647, + "step": 5931 + }, + { + "epoch": 3.3139664804469273, + "grad_norm": 0.5303656458854675, + "learning_rate": 0.0008368067226890756, + "loss": 0.4532, + "step": 5932 + }, + { + "epoch": 3.3145251396648043, + "grad_norm": 0.487987756729126, + "learning_rate": 0.0008367787114845938, + "loss": 0.4656, + "step": 5933 + }, + { + "epoch": 3.3150837988826813, + "grad_norm": 0.6956318020820618, + "learning_rate": 0.000836750700280112, + "loss": 0.5001, + "step": 5934 + }, + { + "epoch": 3.315642458100559, + "grad_norm": 0.7788981795310974, + "learning_rate": 0.0008367226890756304, + "loss": 0.508, + "step": 5935 + }, + { + "epoch": 3.316201117318436, + "grad_norm": 0.5326288342475891, + "learning_rate": 0.0008366946778711485, + "loss": 0.4932, + "step": 5936 + }, + { + "epoch": 3.316759776536313, + "grad_norm": 0.6776241064071655, + "learning_rate": 0.0008366666666666667, + "loss": 0.632, + "step": 5937 + }, + { + "epoch": 3.31731843575419, + "grad_norm": 1.221423625946045, + "learning_rate": 0.0008366386554621849, + "loss": 0.545, + "step": 5938 + }, + { + "epoch": 3.317877094972067, + "grad_norm": 0.6130251288414001, + "learning_rate": 0.0008366106442577031, + "loss": 0.5382, + "step": 5939 + }, + { + "epoch": 3.3184357541899443, + "grad_norm": 0.5487998723983765, + "learning_rate": 0.0008365826330532214, + "loss": 0.4817, + "step": 5940 + }, + { + "epoch": 3.3189944134078213, + "grad_norm": 0.3733319640159607, + "learning_rate": 0.0008365546218487396, + "loss": 0.4541, + "step": 5941 + }, + { + "epoch": 3.3195530726256983, + "grad_norm": 0.676230251789093, + "learning_rate": 0.0008365266106442577, + "loss": 0.4223, + "step": 5942 + }, + { + "epoch": 3.3201117318435753, + "grad_norm": 1.0991166830062866, + "learning_rate": 0.0008364985994397759, + "loss": 0.7488, + "step": 5943 + }, + { + "epoch": 3.3206703910614523, + "grad_norm": 0.7113136053085327, + "learning_rate": 0.0008364705882352941, + "loss": 0.4664, + "step": 5944 + }, + { + "epoch": 3.32122905027933, + "grad_norm": 0.6897918581962585, + "learning_rate": 0.0008364425770308124, + "loss": 0.5825, + "step": 5945 + }, + { + "epoch": 3.321787709497207, + "grad_norm": 0.740591824054718, + "learning_rate": 0.0008364145658263306, + "loss": 0.5044, + "step": 5946 + }, + { + "epoch": 3.322346368715084, + "grad_norm": 0.5311262607574463, + "learning_rate": 0.0008363865546218487, + "loss": 0.4535, + "step": 5947 + }, + { + "epoch": 3.322905027932961, + "grad_norm": 0.495380699634552, + "learning_rate": 0.0008363585434173669, + "loss": 0.5506, + "step": 5948 + }, + { + "epoch": 3.323463687150838, + "grad_norm": 0.5739846229553223, + "learning_rate": 0.0008363305322128851, + "loss": 0.4776, + "step": 5949 + }, + { + "epoch": 3.3240223463687153, + "grad_norm": 0.4468778669834137, + "learning_rate": 0.0008363025210084034, + "loss": 0.4511, + "step": 5950 + }, + { + "epoch": 3.3245810055865923, + "grad_norm": 0.4775971472263336, + "learning_rate": 0.0008362745098039216, + "loss": 0.4567, + "step": 5951 + }, + { + "epoch": 3.3251396648044693, + "grad_norm": 0.6873248219490051, + "learning_rate": 0.0008362464985994397, + "loss": 0.4802, + "step": 5952 + }, + { + "epoch": 3.3256983240223463, + "grad_norm": 0.539291262626648, + "learning_rate": 0.000836218487394958, + "loss": 0.5324, + "step": 5953 + }, + { + "epoch": 3.3262569832402233, + "grad_norm": 0.6163298487663269, + "learning_rate": 0.0008361904761904762, + "loss": 0.5937, + "step": 5954 + }, + { + "epoch": 3.326815642458101, + "grad_norm": 0.664472758769989, + "learning_rate": 0.0008361624649859945, + "loss": 0.536, + "step": 5955 + }, + { + "epoch": 3.327374301675978, + "grad_norm": 0.7042726874351501, + "learning_rate": 0.0008361344537815127, + "loss": 0.4491, + "step": 5956 + }, + { + "epoch": 3.327932960893855, + "grad_norm": 0.4764694273471832, + "learning_rate": 0.0008361064425770309, + "loss": 0.4625, + "step": 5957 + }, + { + "epoch": 3.328491620111732, + "grad_norm": 0.5840118527412415, + "learning_rate": 0.000836078431372549, + "loss": 0.5109, + "step": 5958 + }, + { + "epoch": 3.329050279329609, + "grad_norm": 0.5546570420265198, + "learning_rate": 0.0008360504201680672, + "loss": 0.3827, + "step": 5959 + }, + { + "epoch": 3.329608938547486, + "grad_norm": 0.49044451117515564, + "learning_rate": 0.0008360224089635855, + "loss": 0.3845, + "step": 5960 + }, + { + "epoch": 3.3301675977653633, + "grad_norm": 0.7842701077461243, + "learning_rate": 0.0008359943977591037, + "loss": 0.4405, + "step": 5961 + }, + { + "epoch": 3.3307262569832403, + "grad_norm": 0.5469485521316528, + "learning_rate": 0.0008359663865546219, + "loss": 0.4293, + "step": 5962 + }, + { + "epoch": 3.3312849162011173, + "grad_norm": 0.5998361110687256, + "learning_rate": 0.00083593837535014, + "loss": 0.4219, + "step": 5963 + }, + { + "epoch": 3.3318435754189943, + "grad_norm": 0.42217960953712463, + "learning_rate": 0.0008359103641456582, + "loss": 0.4884, + "step": 5964 + }, + { + "epoch": 3.3324022346368714, + "grad_norm": 0.43354055285453796, + "learning_rate": 0.0008358823529411765, + "loss": 0.3874, + "step": 5965 + }, + { + "epoch": 3.3329608938547484, + "grad_norm": 0.492789089679718, + "learning_rate": 0.0008358543417366947, + "loss": 0.445, + "step": 5966 + }, + { + "epoch": 3.333519553072626, + "grad_norm": 1.0337791442871094, + "learning_rate": 0.0008358263305322129, + "loss": 0.5195, + "step": 5967 + }, + { + "epoch": 3.334078212290503, + "grad_norm": 0.5146295428276062, + "learning_rate": 0.000835798319327731, + "loss": 0.4475, + "step": 5968 + }, + { + "epoch": 3.33463687150838, + "grad_norm": 2.359459638595581, + "learning_rate": 0.0008357703081232492, + "loss": 0.4291, + "step": 5969 + }, + { + "epoch": 3.335195530726257, + "grad_norm": 1.1305863857269287, + "learning_rate": 0.0008357422969187676, + "loss": 0.478, + "step": 5970 + }, + { + "epoch": 3.335754189944134, + "grad_norm": 1.1479641199111938, + "learning_rate": 0.0008357142857142858, + "loss": 0.4636, + "step": 5971 + }, + { + "epoch": 3.3363128491620113, + "grad_norm": 0.5999601483345032, + "learning_rate": 0.000835686274509804, + "loss": 0.4708, + "step": 5972 + }, + { + "epoch": 3.3368715083798883, + "grad_norm": 0.4380541741847992, + "learning_rate": 0.0008356582633053222, + "loss": 0.4034, + "step": 5973 + }, + { + "epoch": 3.3374301675977653, + "grad_norm": 0.5763838291168213, + "learning_rate": 0.0008356302521008403, + "loss": 0.4328, + "step": 5974 + }, + { + "epoch": 3.3379888268156424, + "grad_norm": 1.615910291671753, + "learning_rate": 0.0008356022408963586, + "loss": 0.6219, + "step": 5975 + }, + { + "epoch": 3.3385474860335194, + "grad_norm": 1.3738059997558594, + "learning_rate": 0.0008355742296918768, + "loss": 0.4257, + "step": 5976 + }, + { + "epoch": 3.339106145251397, + "grad_norm": 0.6873733997344971, + "learning_rate": 0.000835546218487395, + "loss": 0.574, + "step": 5977 + }, + { + "epoch": 3.339664804469274, + "grad_norm": 0.5822592377662659, + "learning_rate": 0.0008355182072829132, + "loss": 0.4977, + "step": 5978 + }, + { + "epoch": 3.340223463687151, + "grad_norm": 0.7007447481155396, + "learning_rate": 0.0008354901960784313, + "loss": 0.4804, + "step": 5979 + }, + { + "epoch": 3.340782122905028, + "grad_norm": 0.5462225675582886, + "learning_rate": 0.0008354621848739496, + "loss": 0.4344, + "step": 5980 + }, + { + "epoch": 3.341340782122905, + "grad_norm": 0.4206903278827667, + "learning_rate": 0.0008354341736694678, + "loss": 0.369, + "step": 5981 + }, + { + "epoch": 3.3418994413407823, + "grad_norm": 0.7577139735221863, + "learning_rate": 0.000835406162464986, + "loss": 0.5953, + "step": 5982 + }, + { + "epoch": 3.3424581005586593, + "grad_norm": 0.4644266963005066, + "learning_rate": 0.0008353781512605042, + "loss": 0.4187, + "step": 5983 + }, + { + "epoch": 3.3430167597765363, + "grad_norm": 0.8171898126602173, + "learning_rate": 0.0008353501400560223, + "loss": 0.3725, + "step": 5984 + }, + { + "epoch": 3.3435754189944134, + "grad_norm": 0.5116392374038696, + "learning_rate": 0.0008353221288515407, + "loss": 0.434, + "step": 5985 + }, + { + "epoch": 3.3441340782122904, + "grad_norm": 0.5752791166305542, + "learning_rate": 0.0008352941176470589, + "loss": 0.4831, + "step": 5986 + }, + { + "epoch": 3.344692737430168, + "grad_norm": 0.5015922784805298, + "learning_rate": 0.0008352661064425771, + "loss": 0.416, + "step": 5987 + }, + { + "epoch": 3.345251396648045, + "grad_norm": 0.41650280356407166, + "learning_rate": 0.0008352380952380953, + "loss": 0.4374, + "step": 5988 + }, + { + "epoch": 3.345810055865922, + "grad_norm": 0.5924286246299744, + "learning_rate": 0.0008352100840336135, + "loss": 0.4617, + "step": 5989 + }, + { + "epoch": 3.346368715083799, + "grad_norm": 0.51353520154953, + "learning_rate": 0.0008351820728291317, + "loss": 0.4073, + "step": 5990 + }, + { + "epoch": 3.346927374301676, + "grad_norm": 0.44078338146209717, + "learning_rate": 0.0008351540616246499, + "loss": 0.4688, + "step": 5991 + }, + { + "epoch": 3.3474860335195533, + "grad_norm": 5.740250587463379, + "learning_rate": 0.0008351260504201681, + "loss": 0.4629, + "step": 5992 + }, + { + "epoch": 3.3480446927374303, + "grad_norm": 0.580997884273529, + "learning_rate": 0.0008350980392156863, + "loss": 0.5854, + "step": 5993 + }, + { + "epoch": 3.3486033519553073, + "grad_norm": 0.7381086349487305, + "learning_rate": 0.0008350700280112045, + "loss": 0.5432, + "step": 5994 + }, + { + "epoch": 3.3491620111731844, + "grad_norm": 1.129784345626831, + "learning_rate": 0.0008350420168067227, + "loss": 0.5235, + "step": 5995 + }, + { + "epoch": 3.3497206703910614, + "grad_norm": 0.6231741905212402, + "learning_rate": 0.0008350140056022409, + "loss": 0.5439, + "step": 5996 + }, + { + "epoch": 3.3502793296089384, + "grad_norm": 0.8671806454658508, + "learning_rate": 0.0008349859943977591, + "loss": 0.4991, + "step": 5997 + }, + { + "epoch": 3.350837988826816, + "grad_norm": 0.4410226345062256, + "learning_rate": 0.0008349579831932773, + "loss": 0.4581, + "step": 5998 + }, + { + "epoch": 3.351396648044693, + "grad_norm": 0.445648193359375, + "learning_rate": 0.0008349299719887955, + "loss": 0.4117, + "step": 5999 + }, + { + "epoch": 3.35195530726257, + "grad_norm": 0.4277268946170807, + "learning_rate": 0.0008349019607843137, + "loss": 0.3061, + "step": 6000 + }, + { + "epoch": 3.35195530726257, + "eval_cer": 0.09412130535825342, + "eval_loss": 0.35406970977783203, + "eval_runtime": 55.3464, + "eval_samples_per_second": 81.993, + "eval_steps_per_second": 5.131, + "eval_wer": 0.3714142063070864, + "step": 6000 + }, + { + "epoch": 3.352513966480447, + "grad_norm": 0.6357645988464355, + "learning_rate": 0.000834873949579832, + "loss": 0.3904, + "step": 6001 + }, + { + "epoch": 3.353072625698324, + "grad_norm": 0.7088366746902466, + "learning_rate": 0.0008348459383753502, + "loss": 0.4961, + "step": 6002 + }, + { + "epoch": 3.353631284916201, + "grad_norm": 0.8757784962654114, + "learning_rate": 0.0008348179271708684, + "loss": 0.3754, + "step": 6003 + }, + { + "epoch": 3.3541899441340783, + "grad_norm": 0.9190798401832581, + "learning_rate": 0.0008347899159663866, + "loss": 0.5937, + "step": 6004 + }, + { + "epoch": 3.3547486033519553, + "grad_norm": 1.9690254926681519, + "learning_rate": 0.0008347619047619049, + "loss": 0.477, + "step": 6005 + }, + { + "epoch": 3.3553072625698324, + "grad_norm": 0.48200395703315735, + "learning_rate": 0.000834733893557423, + "loss": 0.5189, + "step": 6006 + }, + { + "epoch": 3.3558659217877094, + "grad_norm": 0.3942033648490906, + "learning_rate": 0.0008347058823529412, + "loss": 0.4153, + "step": 6007 + }, + { + "epoch": 3.3564245810055864, + "grad_norm": 0.6851130127906799, + "learning_rate": 0.0008346778711484594, + "loss": 0.4621, + "step": 6008 + }, + { + "epoch": 3.356983240223464, + "grad_norm": 0.7886162996292114, + "learning_rate": 0.0008346498599439776, + "loss": 0.5397, + "step": 6009 + }, + { + "epoch": 3.357541899441341, + "grad_norm": 1.5325363874435425, + "learning_rate": 0.0008346218487394959, + "loss": 0.3924, + "step": 6010 + }, + { + "epoch": 3.358100558659218, + "grad_norm": 1.0592213869094849, + "learning_rate": 0.000834593837535014, + "loss": 0.4683, + "step": 6011 + }, + { + "epoch": 3.358659217877095, + "grad_norm": 0.8366856575012207, + "learning_rate": 0.0008345658263305322, + "loss": 0.4993, + "step": 6012 + }, + { + "epoch": 3.359217877094972, + "grad_norm": 0.7793118357658386, + "learning_rate": 0.0008345378151260504, + "loss": 0.4538, + "step": 6013 + }, + { + "epoch": 3.3597765363128493, + "grad_norm": 0.5727178454399109, + "learning_rate": 0.0008345098039215686, + "loss": 0.6036, + "step": 6014 + }, + { + "epoch": 3.3603351955307263, + "grad_norm": 1.0506523847579956, + "learning_rate": 0.0008344817927170869, + "loss": 0.4593, + "step": 6015 + }, + { + "epoch": 3.3608938547486034, + "grad_norm": 1.3926310539245605, + "learning_rate": 0.000834453781512605, + "loss": 0.5351, + "step": 6016 + }, + { + "epoch": 3.3614525139664804, + "grad_norm": 0.6975616812705994, + "learning_rate": 0.0008344257703081232, + "loss": 0.4663, + "step": 6017 + }, + { + "epoch": 3.3620111731843574, + "grad_norm": 0.5686214566230774, + "learning_rate": 0.0008343977591036415, + "loss": 0.4689, + "step": 6018 + }, + { + "epoch": 3.362569832402235, + "grad_norm": 0.4696175456047058, + "learning_rate": 0.0008343697478991597, + "loss": 0.4944, + "step": 6019 + }, + { + "epoch": 3.363128491620112, + "grad_norm": 0.5842403769493103, + "learning_rate": 0.0008343417366946779, + "loss": 0.544, + "step": 6020 + }, + { + "epoch": 3.363687150837989, + "grad_norm": 0.5163576006889343, + "learning_rate": 0.0008343137254901962, + "loss": 0.473, + "step": 6021 + }, + { + "epoch": 3.364245810055866, + "grad_norm": 0.43924614787101746, + "learning_rate": 0.0008342857142857143, + "loss": 0.4131, + "step": 6022 + }, + { + "epoch": 3.364804469273743, + "grad_norm": 0.4216912090778351, + "learning_rate": 0.0008342577030812325, + "loss": 0.4767, + "step": 6023 + }, + { + "epoch": 3.3653631284916203, + "grad_norm": 1.3931881189346313, + "learning_rate": 0.0008342296918767507, + "loss": 0.4433, + "step": 6024 + }, + { + "epoch": 3.3659217877094973, + "grad_norm": 0.5150628685951233, + "learning_rate": 0.0008342016806722689, + "loss": 0.5303, + "step": 6025 + }, + { + "epoch": 3.3664804469273744, + "grad_norm": 0.40338414907455444, + "learning_rate": 0.0008341736694677872, + "loss": 0.4618, + "step": 6026 + }, + { + "epoch": 3.3670391061452514, + "grad_norm": 0.6179518699645996, + "learning_rate": 0.0008341456582633053, + "loss": 0.4445, + "step": 6027 + }, + { + "epoch": 3.3675977653631284, + "grad_norm": 0.5551663041114807, + "learning_rate": 0.0008341176470588235, + "loss": 0.4725, + "step": 6028 + }, + { + "epoch": 3.3681564245810054, + "grad_norm": 0.8116632699966431, + "learning_rate": 0.0008340896358543417, + "loss": 0.4221, + "step": 6029 + }, + { + "epoch": 3.368715083798883, + "grad_norm": 1.5497699975967407, + "learning_rate": 0.0008340616246498599, + "loss": 0.4863, + "step": 6030 + }, + { + "epoch": 3.36927374301676, + "grad_norm": 0.5297819375991821, + "learning_rate": 0.0008340336134453782, + "loss": 0.5314, + "step": 6031 + }, + { + "epoch": 3.369832402234637, + "grad_norm": 1.4641435146331787, + "learning_rate": 0.0008340056022408963, + "loss": 0.4431, + "step": 6032 + }, + { + "epoch": 3.370391061452514, + "grad_norm": 0.5053655505180359, + "learning_rate": 0.0008339775910364145, + "loss": 0.4246, + "step": 6033 + }, + { + "epoch": 3.370949720670391, + "grad_norm": 0.5946944952011108, + "learning_rate": 0.0008339495798319327, + "loss": 0.5037, + "step": 6034 + }, + { + "epoch": 3.3715083798882683, + "grad_norm": 0.5260332226753235, + "learning_rate": 0.000833921568627451, + "loss": 0.4625, + "step": 6035 + }, + { + "epoch": 3.3720670391061454, + "grad_norm": 0.5531105399131775, + "learning_rate": 0.0008338935574229693, + "loss": 0.4822, + "step": 6036 + }, + { + "epoch": 3.3726256983240224, + "grad_norm": 0.6000711917877197, + "learning_rate": 0.0008338655462184875, + "loss": 0.571, + "step": 6037 + }, + { + "epoch": 3.3731843575418994, + "grad_norm": 0.4021860659122467, + "learning_rate": 0.0008338375350140056, + "loss": 0.5426, + "step": 6038 + }, + { + "epoch": 3.3737430167597764, + "grad_norm": 1.9584848880767822, + "learning_rate": 0.0008338095238095238, + "loss": 0.5339, + "step": 6039 + }, + { + "epoch": 3.3743016759776534, + "grad_norm": 1.7087459564208984, + "learning_rate": 0.000833781512605042, + "loss": 0.3887, + "step": 6040 + }, + { + "epoch": 3.374860335195531, + "grad_norm": 0.6639626622200012, + "learning_rate": 0.0008337535014005603, + "loss": 0.5081, + "step": 6041 + }, + { + "epoch": 3.375418994413408, + "grad_norm": 0.48255738615989685, + "learning_rate": 0.0008337254901960785, + "loss": 0.4183, + "step": 6042 + }, + { + "epoch": 3.375977653631285, + "grad_norm": 0.6156942248344421, + "learning_rate": 0.0008336974789915966, + "loss": 0.368, + "step": 6043 + }, + { + "epoch": 3.376536312849162, + "grad_norm": 0.5803930163383484, + "learning_rate": 0.0008336694677871148, + "loss": 0.4946, + "step": 6044 + }, + { + "epoch": 3.377094972067039, + "grad_norm": 0.5024144649505615, + "learning_rate": 0.000833641456582633, + "loss": 0.4009, + "step": 6045 + }, + { + "epoch": 3.3776536312849164, + "grad_norm": 1.3605116605758667, + "learning_rate": 0.0008336134453781513, + "loss": 0.4125, + "step": 6046 + }, + { + "epoch": 3.3782122905027934, + "grad_norm": 0.5708194971084595, + "learning_rate": 0.0008335854341736695, + "loss": 0.4672, + "step": 6047 + }, + { + "epoch": 3.3787709497206704, + "grad_norm": 3.3018643856048584, + "learning_rate": 0.0008335574229691876, + "loss": 0.5313, + "step": 6048 + }, + { + "epoch": 3.3793296089385474, + "grad_norm": 2.0821480751037598, + "learning_rate": 0.0008335294117647058, + "loss": 0.5199, + "step": 6049 + }, + { + "epoch": 3.3798882681564244, + "grad_norm": 0.7997511625289917, + "learning_rate": 0.000833501400560224, + "loss": 0.4057, + "step": 6050 + }, + { + "epoch": 3.380446927374302, + "grad_norm": 0.5062386989593506, + "learning_rate": 0.0008334733893557424, + "loss": 0.3939, + "step": 6051 + }, + { + "epoch": 3.381005586592179, + "grad_norm": 1.694746971130371, + "learning_rate": 0.0008334453781512606, + "loss": 0.41, + "step": 6052 + }, + { + "epoch": 3.381564245810056, + "grad_norm": 0.704200804233551, + "learning_rate": 0.0008334173669467788, + "loss": 0.4948, + "step": 6053 + }, + { + "epoch": 3.382122905027933, + "grad_norm": 0.5387531518936157, + "learning_rate": 0.0008333893557422969, + "loss": 0.5413, + "step": 6054 + }, + { + "epoch": 3.38268156424581, + "grad_norm": 0.6832190752029419, + "learning_rate": 0.0008333613445378151, + "loss": 0.5196, + "step": 6055 + }, + { + "epoch": 3.3832402234636874, + "grad_norm": 0.564065158367157, + "learning_rate": 0.0008333333333333334, + "loss": 0.383, + "step": 6056 + }, + { + "epoch": 3.3837988826815644, + "grad_norm": 1.477967619895935, + "learning_rate": 0.0008333053221288516, + "loss": 0.3982, + "step": 6057 + }, + { + "epoch": 3.3843575418994414, + "grad_norm": 1.6309324502944946, + "learning_rate": 0.0008332773109243698, + "loss": 0.4391, + "step": 6058 + }, + { + "epoch": 3.3849162011173184, + "grad_norm": 2.5116946697235107, + "learning_rate": 0.0008332492997198879, + "loss": 0.4056, + "step": 6059 + }, + { + "epoch": 3.3854748603351954, + "grad_norm": 1.6525691747665405, + "learning_rate": 0.0008332212885154061, + "loss": 0.7121, + "step": 6060 + }, + { + "epoch": 3.386033519553073, + "grad_norm": 0.5799946784973145, + "learning_rate": 0.0008331932773109244, + "loss": 0.3619, + "step": 6061 + }, + { + "epoch": 3.38659217877095, + "grad_norm": 0.44262635707855225, + "learning_rate": 0.0008331652661064426, + "loss": 0.3979, + "step": 6062 + }, + { + "epoch": 3.387150837988827, + "grad_norm": 0.7026075124740601, + "learning_rate": 0.0008331372549019608, + "loss": 0.6531, + "step": 6063 + }, + { + "epoch": 3.387709497206704, + "grad_norm": 0.5166801810264587, + "learning_rate": 0.0008331092436974789, + "loss": 0.3797, + "step": 6064 + }, + { + "epoch": 3.388268156424581, + "grad_norm": 0.37580618262290955, + "learning_rate": 0.0008330812324929971, + "loss": 0.5012, + "step": 6065 + }, + { + "epoch": 3.388826815642458, + "grad_norm": 0.5464267134666443, + "learning_rate": 0.0008330532212885154, + "loss": 0.3618, + "step": 6066 + }, + { + "epoch": 3.3893854748603354, + "grad_norm": 0.7585722208023071, + "learning_rate": 0.0008330252100840337, + "loss": 0.4528, + "step": 6067 + }, + { + "epoch": 3.3899441340782124, + "grad_norm": 1.071548342704773, + "learning_rate": 0.0008329971988795519, + "loss": 0.4616, + "step": 6068 + }, + { + "epoch": 3.3905027932960894, + "grad_norm": 0.628174901008606, + "learning_rate": 0.0008329691876750701, + "loss": 0.4371, + "step": 6069 + }, + { + "epoch": 3.3910614525139664, + "grad_norm": 0.8733397126197815, + "learning_rate": 0.0008329411764705882, + "loss": 0.511, + "step": 6070 + }, + { + "epoch": 3.3916201117318434, + "grad_norm": 0.7845448851585388, + "learning_rate": 0.0008329131652661065, + "loss": 0.4789, + "step": 6071 + }, + { + "epoch": 3.3921787709497204, + "grad_norm": 0.4847370982170105, + "learning_rate": 0.0008328851540616247, + "loss": 0.3967, + "step": 6072 + }, + { + "epoch": 3.392737430167598, + "grad_norm": 0.5363072752952576, + "learning_rate": 0.0008328571428571429, + "loss": 0.5241, + "step": 6073 + }, + { + "epoch": 3.393296089385475, + "grad_norm": 0.8072665333747864, + "learning_rate": 0.0008328291316526611, + "loss": 0.4497, + "step": 6074 + }, + { + "epoch": 3.393854748603352, + "grad_norm": 0.7780736684799194, + "learning_rate": 0.0008328011204481792, + "loss": 0.4764, + "step": 6075 + }, + { + "epoch": 3.394413407821229, + "grad_norm": 2.6497485637664795, + "learning_rate": 0.0008327731092436975, + "loss": 0.5924, + "step": 6076 + }, + { + "epoch": 3.394972067039106, + "grad_norm": 0.677177369594574, + "learning_rate": 0.0008327450980392157, + "loss": 0.3832, + "step": 6077 + }, + { + "epoch": 3.3955307262569834, + "grad_norm": 0.6119154691696167, + "learning_rate": 0.0008327170868347339, + "loss": 0.4964, + "step": 6078 + }, + { + "epoch": 3.3960893854748604, + "grad_norm": 0.48202550411224365, + "learning_rate": 0.0008326890756302521, + "loss": 0.4755, + "step": 6079 + }, + { + "epoch": 3.3966480446927374, + "grad_norm": 0.5974293947219849, + "learning_rate": 0.0008326610644257702, + "loss": 0.4616, + "step": 6080 + }, + { + "epoch": 3.3972067039106144, + "grad_norm": 0.515937089920044, + "learning_rate": 0.0008326330532212885, + "loss": 0.5021, + "step": 6081 + }, + { + "epoch": 3.3977653631284914, + "grad_norm": 0.485859751701355, + "learning_rate": 0.0008326050420168067, + "loss": 0.3718, + "step": 6082 + }, + { + "epoch": 3.398324022346369, + "grad_norm": 0.47070395946502686, + "learning_rate": 0.000832577030812325, + "loss": 0.3567, + "step": 6083 + }, + { + "epoch": 3.398882681564246, + "grad_norm": 0.5709047913551331, + "learning_rate": 0.0008325490196078432, + "loss": 0.4434, + "step": 6084 + }, + { + "epoch": 3.399441340782123, + "grad_norm": 0.8484343886375427, + "learning_rate": 0.0008325210084033614, + "loss": 0.4589, + "step": 6085 + }, + { + "epoch": 3.4, + "grad_norm": 0.9602091908454895, + "learning_rate": 0.0008324929971988796, + "loss": 0.4809, + "step": 6086 + }, + { + "epoch": 3.400558659217877, + "grad_norm": 0.4323050081729889, + "learning_rate": 0.0008324649859943978, + "loss": 0.4197, + "step": 6087 + }, + { + "epoch": 3.4011173184357544, + "grad_norm": 1.7077631950378418, + "learning_rate": 0.000832436974789916, + "loss": 0.4206, + "step": 6088 + }, + { + "epoch": 3.4016759776536314, + "grad_norm": 0.47025471925735474, + "learning_rate": 0.0008324089635854342, + "loss": 0.4042, + "step": 6089 + }, + { + "epoch": 3.4022346368715084, + "grad_norm": 0.9566075801849365, + "learning_rate": 0.0008323809523809524, + "loss": 0.5372, + "step": 6090 + }, + { + "epoch": 3.4027932960893854, + "grad_norm": 0.6932033896446228, + "learning_rate": 0.0008323529411764706, + "loss": 0.5512, + "step": 6091 + }, + { + "epoch": 3.4033519553072624, + "grad_norm": 0.4438057541847229, + "learning_rate": 0.0008323249299719888, + "loss": 0.5734, + "step": 6092 + }, + { + "epoch": 3.40391061452514, + "grad_norm": 0.6034622192382812, + "learning_rate": 0.000832296918767507, + "loss": 0.5491, + "step": 6093 + }, + { + "epoch": 3.404469273743017, + "grad_norm": 2.567652702331543, + "learning_rate": 0.0008322689075630252, + "loss": 0.4947, + "step": 6094 + }, + { + "epoch": 3.405027932960894, + "grad_norm": 0.8938478827476501, + "learning_rate": 0.0008322408963585434, + "loss": 0.4776, + "step": 6095 + }, + { + "epoch": 3.405586592178771, + "grad_norm": 1.2218962907791138, + "learning_rate": 0.0008322128851540617, + "loss": 0.485, + "step": 6096 + }, + { + "epoch": 3.406145251396648, + "grad_norm": 0.5890511274337769, + "learning_rate": 0.0008321848739495798, + "loss": 0.4577, + "step": 6097 + }, + { + "epoch": 3.4067039106145254, + "grad_norm": 0.6005613207817078, + "learning_rate": 0.000832156862745098, + "loss": 0.5547, + "step": 6098 + }, + { + "epoch": 3.4072625698324024, + "grad_norm": 0.548182487487793, + "learning_rate": 0.0008321288515406162, + "loss": 0.4736, + "step": 6099 + }, + { + "epoch": 3.4078212290502794, + "grad_norm": 0.6354692578315735, + "learning_rate": 0.0008321008403361345, + "loss": 0.5198, + "step": 6100 + }, + { + "epoch": 3.4083798882681564, + "grad_norm": 0.7965889573097229, + "learning_rate": 0.0008320728291316528, + "loss": 0.4779, + "step": 6101 + }, + { + "epoch": 3.4089385474860334, + "grad_norm": 0.573383629322052, + "learning_rate": 0.0008320448179271709, + "loss": 0.5811, + "step": 6102 + }, + { + "epoch": 3.4094972067039104, + "grad_norm": 3.57027268409729, + "learning_rate": 0.0008320168067226891, + "loss": 0.4717, + "step": 6103 + }, + { + "epoch": 3.410055865921788, + "grad_norm": 0.4645558297634125, + "learning_rate": 0.0008319887955182073, + "loss": 0.393, + "step": 6104 + }, + { + "epoch": 3.410614525139665, + "grad_norm": 0.4812800884246826, + "learning_rate": 0.0008319607843137255, + "loss": 0.495, + "step": 6105 + }, + { + "epoch": 3.411173184357542, + "grad_norm": 0.9177267551422119, + "learning_rate": 0.0008319327731092438, + "loss": 0.4404, + "step": 6106 + }, + { + "epoch": 3.411731843575419, + "grad_norm": 0.4278791546821594, + "learning_rate": 0.0008319047619047619, + "loss": 0.478, + "step": 6107 + }, + { + "epoch": 3.412290502793296, + "grad_norm": 2.0132620334625244, + "learning_rate": 0.0008318767507002801, + "loss": 0.4125, + "step": 6108 + }, + { + "epoch": 3.412849162011173, + "grad_norm": 0.5608509182929993, + "learning_rate": 0.0008318487394957983, + "loss": 0.4765, + "step": 6109 + }, + { + "epoch": 3.4134078212290504, + "grad_norm": 1.7458564043045044, + "learning_rate": 0.0008318207282913165, + "loss": 0.4494, + "step": 6110 + }, + { + "epoch": 3.4139664804469274, + "grad_norm": 0.7779131531715393, + "learning_rate": 0.0008317927170868348, + "loss": 0.4889, + "step": 6111 + }, + { + "epoch": 3.4145251396648044, + "grad_norm": 1.0489270687103271, + "learning_rate": 0.000831764705882353, + "loss": 0.4302, + "step": 6112 + }, + { + "epoch": 3.4150837988826814, + "grad_norm": 2.2097907066345215, + "learning_rate": 0.0008317366946778711, + "loss": 0.5399, + "step": 6113 + }, + { + "epoch": 3.4156424581005584, + "grad_norm": 0.5458060503005981, + "learning_rate": 0.0008317086834733893, + "loss": 0.4644, + "step": 6114 + }, + { + "epoch": 3.416201117318436, + "grad_norm": 0.5295409560203552, + "learning_rate": 0.0008316806722689075, + "loss": 0.479, + "step": 6115 + }, + { + "epoch": 3.416759776536313, + "grad_norm": 0.9819905757904053, + "learning_rate": 0.0008316526610644259, + "loss": 0.5432, + "step": 6116 + }, + { + "epoch": 3.41731843575419, + "grad_norm": 0.4620686173439026, + "learning_rate": 0.0008316246498599441, + "loss": 0.3981, + "step": 6117 + }, + { + "epoch": 3.417877094972067, + "grad_norm": 0.5014457702636719, + "learning_rate": 0.0008315966386554622, + "loss": 0.4821, + "step": 6118 + }, + { + "epoch": 3.418435754189944, + "grad_norm": 0.49127867817878723, + "learning_rate": 0.0008315686274509804, + "loss": 0.4395, + "step": 6119 + }, + { + "epoch": 3.4189944134078214, + "grad_norm": 0.783652126789093, + "learning_rate": 0.0008315406162464986, + "loss": 0.449, + "step": 6120 + }, + { + "epoch": 3.4195530726256984, + "grad_norm": 0.48742324113845825, + "learning_rate": 0.0008315126050420169, + "loss": 0.4947, + "step": 6121 + }, + { + "epoch": 3.4201117318435754, + "grad_norm": 1.2025938034057617, + "learning_rate": 0.0008314845938375351, + "loss": 0.4748, + "step": 6122 + }, + { + "epoch": 3.4206703910614524, + "grad_norm": 0.46522942185401917, + "learning_rate": 0.0008314565826330532, + "loss": 0.4796, + "step": 6123 + }, + { + "epoch": 3.4212290502793294, + "grad_norm": 0.6571593880653381, + "learning_rate": 0.0008314285714285714, + "loss": 0.5064, + "step": 6124 + }, + { + "epoch": 3.421787709497207, + "grad_norm": 0.40364181995391846, + "learning_rate": 0.0008314005602240896, + "loss": 0.2829, + "step": 6125 + }, + { + "epoch": 3.422346368715084, + "grad_norm": 0.3179486393928528, + "learning_rate": 0.0008313725490196079, + "loss": 0.3246, + "step": 6126 + }, + { + "epoch": 3.422905027932961, + "grad_norm": 0.6112605333328247, + "learning_rate": 0.0008313445378151261, + "loss": 0.5162, + "step": 6127 + }, + { + "epoch": 3.423463687150838, + "grad_norm": 0.6073411107063293, + "learning_rate": 0.0008313165266106443, + "loss": 0.4942, + "step": 6128 + }, + { + "epoch": 3.424022346368715, + "grad_norm": 0.603777289390564, + "learning_rate": 0.0008312885154061624, + "loss": 0.6173, + "step": 6129 + }, + { + "epoch": 3.4245810055865924, + "grad_norm": 3.2474570274353027, + "learning_rate": 0.0008312605042016806, + "loss": 0.3928, + "step": 6130 + }, + { + "epoch": 3.4251396648044694, + "grad_norm": 0.4615461528301239, + "learning_rate": 0.0008312324929971989, + "loss": 0.4786, + "step": 6131 + }, + { + "epoch": 3.4256983240223464, + "grad_norm": 0.5394693613052368, + "learning_rate": 0.0008312044817927172, + "loss": 0.472, + "step": 6132 + }, + { + "epoch": 3.4262569832402234, + "grad_norm": 0.6481944918632507, + "learning_rate": 0.0008311764705882354, + "loss": 0.4809, + "step": 6133 + }, + { + "epoch": 3.4268156424581004, + "grad_norm": 0.6655814051628113, + "learning_rate": 0.0008311484593837535, + "loss": 0.4104, + "step": 6134 + }, + { + "epoch": 3.427374301675978, + "grad_norm": 0.6282750964164734, + "learning_rate": 0.0008311204481792717, + "loss": 0.4829, + "step": 6135 + }, + { + "epoch": 3.427932960893855, + "grad_norm": 0.5513433814048767, + "learning_rate": 0.00083109243697479, + "loss": 0.6495, + "step": 6136 + }, + { + "epoch": 3.428491620111732, + "grad_norm": 0.5036987066268921, + "learning_rate": 0.0008310644257703082, + "loss": 0.3979, + "step": 6137 + }, + { + "epoch": 3.429050279329609, + "grad_norm": 0.549089252948761, + "learning_rate": 0.0008310364145658264, + "loss": 0.5067, + "step": 6138 + }, + { + "epoch": 3.429608938547486, + "grad_norm": 0.48470252752304077, + "learning_rate": 0.0008310084033613445, + "loss": 0.4748, + "step": 6139 + }, + { + "epoch": 3.430167597765363, + "grad_norm": 0.4290856122970581, + "learning_rate": 0.0008309803921568627, + "loss": 0.457, + "step": 6140 + }, + { + "epoch": 3.4307262569832404, + "grad_norm": 0.6760498285293579, + "learning_rate": 0.000830952380952381, + "loss": 0.4882, + "step": 6141 + }, + { + "epoch": 3.4312849162011174, + "grad_norm": 0.6759735345840454, + "learning_rate": 0.0008309243697478992, + "loss": 0.5385, + "step": 6142 + }, + { + "epoch": 3.4318435754189944, + "grad_norm": 0.5476924777030945, + "learning_rate": 0.0008308963585434174, + "loss": 0.5527, + "step": 6143 + }, + { + "epoch": 3.4324022346368714, + "grad_norm": 0.54535311460495, + "learning_rate": 0.0008308683473389356, + "loss": 0.5131, + "step": 6144 + }, + { + "epoch": 3.4329608938547485, + "grad_norm": 0.5027291178703308, + "learning_rate": 0.0008308403361344537, + "loss": 0.3548, + "step": 6145 + }, + { + "epoch": 3.4335195530726255, + "grad_norm": 0.4983408749103546, + "learning_rate": 0.000830812324929972, + "loss": 0.4955, + "step": 6146 + }, + { + "epoch": 3.434078212290503, + "grad_norm": 0.5102941393852234, + "learning_rate": 0.0008307843137254902, + "loss": 0.5684, + "step": 6147 + }, + { + "epoch": 3.43463687150838, + "grad_norm": 0.7571179270744324, + "learning_rate": 0.0008307563025210084, + "loss": 0.5, + "step": 6148 + }, + { + "epoch": 3.435195530726257, + "grad_norm": 0.42837122082710266, + "learning_rate": 0.0008307282913165267, + "loss": 0.3529, + "step": 6149 + }, + { + "epoch": 3.435754189944134, + "grad_norm": 0.6471491456031799, + "learning_rate": 0.0008307002801120448, + "loss": 0.4824, + "step": 6150 + }, + { + "epoch": 3.436312849162011, + "grad_norm": 0.5041567087173462, + "learning_rate": 0.0008306722689075631, + "loss": 0.5364, + "step": 6151 + }, + { + "epoch": 3.4368715083798884, + "grad_norm": 0.5908088088035583, + "learning_rate": 0.0008306442577030813, + "loss": 0.4052, + "step": 6152 + }, + { + "epoch": 3.4374301675977654, + "grad_norm": 0.5950685739517212, + "learning_rate": 0.0008306162464985995, + "loss": 0.3639, + "step": 6153 + }, + { + "epoch": 3.4379888268156424, + "grad_norm": 0.4470774531364441, + "learning_rate": 0.0008305882352941177, + "loss": 0.5267, + "step": 6154 + }, + { + "epoch": 3.4385474860335195, + "grad_norm": 0.529109537601471, + "learning_rate": 0.0008305602240896358, + "loss": 0.3909, + "step": 6155 + }, + { + "epoch": 3.4391061452513965, + "grad_norm": 0.5458061099052429, + "learning_rate": 0.0008305322128851541, + "loss": 0.7119, + "step": 6156 + }, + { + "epoch": 3.439664804469274, + "grad_norm": 0.5622304081916809, + "learning_rate": 0.0008305042016806723, + "loss": 0.5789, + "step": 6157 + }, + { + "epoch": 3.440223463687151, + "grad_norm": 0.3653081953525543, + "learning_rate": 0.0008304761904761905, + "loss": 0.4522, + "step": 6158 + }, + { + "epoch": 3.440782122905028, + "grad_norm": 0.4660721719264984, + "learning_rate": 0.0008304481792717087, + "loss": 0.4748, + "step": 6159 + }, + { + "epoch": 3.441340782122905, + "grad_norm": 1.2203646898269653, + "learning_rate": 0.0008304201680672269, + "loss": 0.4994, + "step": 6160 + }, + { + "epoch": 3.441899441340782, + "grad_norm": 0.7291945219039917, + "learning_rate": 0.0008303921568627451, + "loss": 0.562, + "step": 6161 + }, + { + "epoch": 3.4424581005586594, + "grad_norm": 0.9149802327156067, + "learning_rate": 0.0008303641456582633, + "loss": 0.4369, + "step": 6162 + }, + { + "epoch": 3.4430167597765364, + "grad_norm": 0.5975952744483948, + "learning_rate": 0.0008303361344537815, + "loss": 0.5538, + "step": 6163 + }, + { + "epoch": 3.4435754189944134, + "grad_norm": 3.1010115146636963, + "learning_rate": 0.0008303081232492997, + "loss": 0.5367, + "step": 6164 + }, + { + "epoch": 3.4441340782122905, + "grad_norm": 0.722662627696991, + "learning_rate": 0.000830280112044818, + "loss": 0.4752, + "step": 6165 + }, + { + "epoch": 3.4446927374301675, + "grad_norm": 2.758788824081421, + "learning_rate": 0.0008302521008403362, + "loss": 0.5845, + "step": 6166 + }, + { + "epoch": 3.445251396648045, + "grad_norm": 2.9750373363494873, + "learning_rate": 0.0008302240896358544, + "loss": 0.4125, + "step": 6167 + }, + { + "epoch": 3.445810055865922, + "grad_norm": 0.5902764797210693, + "learning_rate": 0.0008301960784313726, + "loss": 0.4676, + "step": 6168 + }, + { + "epoch": 3.446368715083799, + "grad_norm": 0.645778477191925, + "learning_rate": 0.0008301680672268908, + "loss": 0.5567, + "step": 6169 + }, + { + "epoch": 3.446927374301676, + "grad_norm": 0.690401017665863, + "learning_rate": 0.000830140056022409, + "loss": 0.4933, + "step": 6170 + }, + { + "epoch": 3.447486033519553, + "grad_norm": 0.6196557879447937, + "learning_rate": 0.0008301120448179272, + "loss": 0.3777, + "step": 6171 + }, + { + "epoch": 3.4480446927374304, + "grad_norm": 0.46737533807754517, + "learning_rate": 0.0008300840336134454, + "loss": 0.4538, + "step": 6172 + }, + { + "epoch": 3.4486033519553074, + "grad_norm": 0.6995070576667786, + "learning_rate": 0.0008300560224089636, + "loss": 0.5321, + "step": 6173 + }, + { + "epoch": 3.4491620111731844, + "grad_norm": 0.6664303541183472, + "learning_rate": 0.0008300280112044818, + "loss": 0.3817, + "step": 6174 + }, + { + "epoch": 3.4497206703910615, + "grad_norm": 0.6369305849075317, + "learning_rate": 0.00083, + "loss": 0.4651, + "step": 6175 + }, + { + "epoch": 3.4502793296089385, + "grad_norm": 0.4330568313598633, + "learning_rate": 0.0008299719887955183, + "loss": 0.4245, + "step": 6176 + }, + { + "epoch": 3.4508379888268155, + "grad_norm": 0.6581286191940308, + "learning_rate": 0.0008299439775910364, + "loss": 0.5335, + "step": 6177 + }, + { + "epoch": 3.451396648044693, + "grad_norm": 0.5372915267944336, + "learning_rate": 0.0008299159663865546, + "loss": 0.4247, + "step": 6178 + }, + { + "epoch": 3.45195530726257, + "grad_norm": 0.5128198266029358, + "learning_rate": 0.0008298879551820728, + "loss": 0.3812, + "step": 6179 + }, + { + "epoch": 3.452513966480447, + "grad_norm": 0.5149964690208435, + "learning_rate": 0.000829859943977591, + "loss": 0.4034, + "step": 6180 + }, + { + "epoch": 3.453072625698324, + "grad_norm": 0.6453413367271423, + "learning_rate": 0.0008298319327731094, + "loss": 0.4377, + "step": 6181 + }, + { + "epoch": 3.453631284916201, + "grad_norm": 0.5139704346656799, + "learning_rate": 0.0008298039215686275, + "loss": 0.3467, + "step": 6182 + }, + { + "epoch": 3.454189944134078, + "grad_norm": 0.6001042723655701, + "learning_rate": 0.0008297759103641457, + "loss": 0.4628, + "step": 6183 + }, + { + "epoch": 3.4547486033519554, + "grad_norm": 0.6400231122970581, + "learning_rate": 0.0008297478991596639, + "loss": 0.4181, + "step": 6184 + }, + { + "epoch": 3.4553072625698324, + "grad_norm": 0.4790504276752472, + "learning_rate": 0.0008297198879551821, + "loss": 0.3962, + "step": 6185 + }, + { + "epoch": 3.4558659217877095, + "grad_norm": 0.4278772175312042, + "learning_rate": 0.0008296918767507004, + "loss": 0.4199, + "step": 6186 + }, + { + "epoch": 3.4564245810055865, + "grad_norm": 0.621315598487854, + "learning_rate": 0.0008296638655462185, + "loss": 0.5873, + "step": 6187 + }, + { + "epoch": 3.4569832402234635, + "grad_norm": 2.627028226852417, + "learning_rate": 0.0008296358543417367, + "loss": 0.6, + "step": 6188 + }, + { + "epoch": 3.457541899441341, + "grad_norm": 0.6533622145652771, + "learning_rate": 0.0008296078431372549, + "loss": 0.6004, + "step": 6189 + }, + { + "epoch": 3.458100558659218, + "grad_norm": 0.4335118532180786, + "learning_rate": 0.0008295798319327731, + "loss": 0.398, + "step": 6190 + }, + { + "epoch": 3.458659217877095, + "grad_norm": 3.6930038928985596, + "learning_rate": 0.0008295518207282914, + "loss": 0.5033, + "step": 6191 + }, + { + "epoch": 3.459217877094972, + "grad_norm": 0.4581165611743927, + "learning_rate": 0.0008295238095238096, + "loss": 0.475, + "step": 6192 + }, + { + "epoch": 3.459776536312849, + "grad_norm": 0.5832881331443787, + "learning_rate": 0.0008294957983193277, + "loss": 0.4453, + "step": 6193 + }, + { + "epoch": 3.4603351955307264, + "grad_norm": 0.48735567927360535, + "learning_rate": 0.0008294677871148459, + "loss": 0.435, + "step": 6194 + }, + { + "epoch": 3.4608938547486034, + "grad_norm": 3.0008339881896973, + "learning_rate": 0.0008294397759103641, + "loss": 0.5185, + "step": 6195 + }, + { + "epoch": 3.4614525139664805, + "grad_norm": 0.6516963243484497, + "learning_rate": 0.0008294117647058824, + "loss": 0.4898, + "step": 6196 + }, + { + "epoch": 3.4620111731843575, + "grad_norm": 0.5168313384056091, + "learning_rate": 0.0008293837535014006, + "loss": 0.4192, + "step": 6197 + }, + { + "epoch": 3.4625698324022345, + "grad_norm": 0.8226152062416077, + "learning_rate": 0.0008293557422969187, + "loss": 0.5007, + "step": 6198 + }, + { + "epoch": 3.463128491620112, + "grad_norm": 0.3868049383163452, + "learning_rate": 0.000829327731092437, + "loss": 0.3972, + "step": 6199 + }, + { + "epoch": 3.463687150837989, + "grad_norm": 0.5038139224052429, + "learning_rate": 0.0008292997198879552, + "loss": 0.308, + "step": 6200 + }, + { + "epoch": 3.464245810055866, + "grad_norm": 0.4974587559700012, + "learning_rate": 0.0008292717086834735, + "loss": 0.5803, + "step": 6201 + }, + { + "epoch": 3.464804469273743, + "grad_norm": 0.461847186088562, + "learning_rate": 0.0008292436974789917, + "loss": 0.4907, + "step": 6202 + }, + { + "epoch": 3.46536312849162, + "grad_norm": 0.4884839951992035, + "learning_rate": 0.0008292156862745098, + "loss": 0.4431, + "step": 6203 + }, + { + "epoch": 3.4659217877094974, + "grad_norm": 0.40575647354125977, + "learning_rate": 0.000829187675070028, + "loss": 0.3935, + "step": 6204 + }, + { + "epoch": 3.4664804469273744, + "grad_norm": 0.370565801858902, + "learning_rate": 0.0008291596638655462, + "loss": 0.426, + "step": 6205 + }, + { + "epoch": 3.4670391061452515, + "grad_norm": 0.8607147932052612, + "learning_rate": 0.0008291316526610645, + "loss": 0.3742, + "step": 6206 + }, + { + "epoch": 3.4675977653631285, + "grad_norm": 0.48300793766975403, + "learning_rate": 0.0008291036414565827, + "loss": 0.4999, + "step": 6207 + }, + { + "epoch": 3.4681564245810055, + "grad_norm": 0.3741581439971924, + "learning_rate": 0.0008290756302521009, + "loss": 0.4202, + "step": 6208 + }, + { + "epoch": 3.4687150837988825, + "grad_norm": 0.5670000910758972, + "learning_rate": 0.000829047619047619, + "loss": 0.5501, + "step": 6209 + }, + { + "epoch": 3.46927374301676, + "grad_norm": 0.6159808039665222, + "learning_rate": 0.0008290196078431372, + "loss": 0.3801, + "step": 6210 + }, + { + "epoch": 3.469832402234637, + "grad_norm": 0.7532714605331421, + "learning_rate": 0.0008289915966386555, + "loss": 0.4826, + "step": 6211 + }, + { + "epoch": 3.470391061452514, + "grad_norm": 0.7035219669342041, + "learning_rate": 0.0008289635854341737, + "loss": 0.4745, + "step": 6212 + }, + { + "epoch": 3.470949720670391, + "grad_norm": 0.4840216338634491, + "learning_rate": 0.0008289355742296919, + "loss": 0.4788, + "step": 6213 + }, + { + "epoch": 3.471508379888268, + "grad_norm": 0.5621411204338074, + "learning_rate": 0.00082890756302521, + "loss": 0.6408, + "step": 6214 + }, + { + "epoch": 3.472067039106145, + "grad_norm": 0.7430678009986877, + "learning_rate": 0.0008288795518207282, + "loss": 0.5221, + "step": 6215 + }, + { + "epoch": 3.4726256983240225, + "grad_norm": 0.6376789808273315, + "learning_rate": 0.0008288515406162466, + "loss": 0.3903, + "step": 6216 + }, + { + "epoch": 3.4731843575418995, + "grad_norm": 0.5655553936958313, + "learning_rate": 0.0008288235294117648, + "loss": 0.4414, + "step": 6217 + }, + { + "epoch": 3.4737430167597765, + "grad_norm": 0.4801236391067505, + "learning_rate": 0.000828795518207283, + "loss": 0.4709, + "step": 6218 + }, + { + "epoch": 3.4743016759776535, + "grad_norm": 0.591867208480835, + "learning_rate": 0.0008287675070028011, + "loss": 0.5107, + "step": 6219 + }, + { + "epoch": 3.4748603351955305, + "grad_norm": 0.5417670011520386, + "learning_rate": 0.0008287394957983193, + "loss": 0.428, + "step": 6220 + }, + { + "epoch": 3.475418994413408, + "grad_norm": 0.702387809753418, + "learning_rate": 0.0008287114845938376, + "loss": 0.5977, + "step": 6221 + }, + { + "epoch": 3.475977653631285, + "grad_norm": 0.6193615794181824, + "learning_rate": 0.0008286834733893558, + "loss": 0.622, + "step": 6222 + }, + { + "epoch": 3.476536312849162, + "grad_norm": 0.46409308910369873, + "learning_rate": 0.000828655462184874, + "loss": 0.3543, + "step": 6223 + }, + { + "epoch": 3.477094972067039, + "grad_norm": 0.9161087274551392, + "learning_rate": 0.0008286274509803922, + "loss": 0.5775, + "step": 6224 + }, + { + "epoch": 3.477653631284916, + "grad_norm": 0.5752775073051453, + "learning_rate": 0.0008285994397759103, + "loss": 0.4749, + "step": 6225 + }, + { + "epoch": 3.4782122905027935, + "grad_norm": 0.6740328669548035, + "learning_rate": 0.0008285714285714286, + "loss": 0.4558, + "step": 6226 + }, + { + "epoch": 3.4787709497206705, + "grad_norm": 0.48524630069732666, + "learning_rate": 0.0008285434173669468, + "loss": 0.3237, + "step": 6227 + }, + { + "epoch": 3.4793296089385475, + "grad_norm": 1.2408488988876343, + "learning_rate": 0.000828515406162465, + "loss": 0.3959, + "step": 6228 + }, + { + "epoch": 3.4798882681564245, + "grad_norm": 0.5826830863952637, + "learning_rate": 0.0008284873949579832, + "loss": 0.4294, + "step": 6229 + }, + { + "epoch": 3.4804469273743015, + "grad_norm": 0.8397267460823059, + "learning_rate": 0.0008284593837535013, + "loss": 0.3857, + "step": 6230 + }, + { + "epoch": 3.481005586592179, + "grad_norm": 2.1733570098876953, + "learning_rate": 0.0008284313725490197, + "loss": 0.3724, + "step": 6231 + }, + { + "epoch": 3.481564245810056, + "grad_norm": 0.6842045187950134, + "learning_rate": 0.0008284033613445379, + "loss": 0.5183, + "step": 6232 + }, + { + "epoch": 3.482122905027933, + "grad_norm": 0.7522661089897156, + "learning_rate": 0.0008283753501400561, + "loss": 0.3443, + "step": 6233 + }, + { + "epoch": 3.48268156424581, + "grad_norm": 0.5143001675605774, + "learning_rate": 0.0008283473389355743, + "loss": 0.4324, + "step": 6234 + }, + { + "epoch": 3.483240223463687, + "grad_norm": 1.29478120803833, + "learning_rate": 0.0008283193277310924, + "loss": 0.456, + "step": 6235 + }, + { + "epoch": 3.4837988826815645, + "grad_norm": 0.6133304238319397, + "learning_rate": 0.0008282913165266107, + "loss": 0.42, + "step": 6236 + }, + { + "epoch": 3.4843575418994415, + "grad_norm": 0.6396066546440125, + "learning_rate": 0.0008282633053221289, + "loss": 0.6273, + "step": 6237 + }, + { + "epoch": 3.4849162011173185, + "grad_norm": 0.6781561374664307, + "learning_rate": 0.0008282352941176471, + "loss": 0.4388, + "step": 6238 + }, + { + "epoch": 3.4854748603351955, + "grad_norm": 1.2636315822601318, + "learning_rate": 0.0008282072829131653, + "loss": 0.4019, + "step": 6239 + }, + { + "epoch": 3.4860335195530725, + "grad_norm": 0.6104376912117004, + "learning_rate": 0.0008281792717086835, + "loss": 0.5062, + "step": 6240 + }, + { + "epoch": 3.48659217877095, + "grad_norm": 1.448607087135315, + "learning_rate": 0.0008281512605042017, + "loss": 0.4492, + "step": 6241 + }, + { + "epoch": 3.487150837988827, + "grad_norm": 0.6598222255706787, + "learning_rate": 0.0008281232492997199, + "loss": 0.538, + "step": 6242 + }, + { + "epoch": 3.487709497206704, + "grad_norm": 0.8888305425643921, + "learning_rate": 0.0008280952380952381, + "loss": 0.4827, + "step": 6243 + }, + { + "epoch": 3.488268156424581, + "grad_norm": 0.6721920967102051, + "learning_rate": 0.0008280672268907563, + "loss": 0.5386, + "step": 6244 + }, + { + "epoch": 3.488826815642458, + "grad_norm": 1.0853923559188843, + "learning_rate": 0.0008280392156862745, + "loss": 0.4809, + "step": 6245 + }, + { + "epoch": 3.489385474860335, + "grad_norm": 0.5683623552322388, + "learning_rate": 0.0008280112044817926, + "loss": 0.3923, + "step": 6246 + }, + { + "epoch": 3.4899441340782125, + "grad_norm": 0.5953332185745239, + "learning_rate": 0.000827983193277311, + "loss": 0.4219, + "step": 6247 + }, + { + "epoch": 3.4905027932960895, + "grad_norm": 0.6139711737632751, + "learning_rate": 0.0008279551820728292, + "loss": 0.3811, + "step": 6248 + }, + { + "epoch": 3.4910614525139665, + "grad_norm": 1.4022561311721802, + "learning_rate": 0.0008279271708683474, + "loss": 0.5551, + "step": 6249 + }, + { + "epoch": 3.4916201117318435, + "grad_norm": 0.7792200446128845, + "learning_rate": 0.0008278991596638656, + "loss": 0.3993, + "step": 6250 + }, + { + "epoch": 3.4921787709497205, + "grad_norm": 1.1188433170318604, + "learning_rate": 0.0008278711484593837, + "loss": 0.46, + "step": 6251 + }, + { + "epoch": 3.4927374301675975, + "grad_norm": 0.5536515116691589, + "learning_rate": 0.000827843137254902, + "loss": 0.569, + "step": 6252 + }, + { + "epoch": 3.493296089385475, + "grad_norm": 0.7545760273933411, + "learning_rate": 0.0008278151260504202, + "loss": 0.5369, + "step": 6253 + }, + { + "epoch": 3.493854748603352, + "grad_norm": 0.7642180919647217, + "learning_rate": 0.0008277871148459384, + "loss": 0.6411, + "step": 6254 + }, + { + "epoch": 3.494413407821229, + "grad_norm": 0.5666323304176331, + "learning_rate": 0.0008277591036414566, + "loss": 0.4678, + "step": 6255 + }, + { + "epoch": 3.494972067039106, + "grad_norm": 0.5523326992988586, + "learning_rate": 0.0008277310924369748, + "loss": 0.4569, + "step": 6256 + }, + { + "epoch": 3.495530726256983, + "grad_norm": 0.5576061010360718, + "learning_rate": 0.000827703081232493, + "loss": 0.4409, + "step": 6257 + }, + { + "epoch": 3.4960893854748605, + "grad_norm": 0.4355257749557495, + "learning_rate": 0.0008276750700280112, + "loss": 0.396, + "step": 6258 + }, + { + "epoch": 3.4966480446927375, + "grad_norm": 2.5634796619415283, + "learning_rate": 0.0008276470588235294, + "loss": 0.5531, + "step": 6259 + }, + { + "epoch": 3.4972067039106145, + "grad_norm": 0.542377233505249, + "learning_rate": 0.0008276190476190476, + "loss": 0.6028, + "step": 6260 + }, + { + "epoch": 3.4977653631284915, + "grad_norm": 0.7469171285629272, + "learning_rate": 0.0008275910364145658, + "loss": 0.7268, + "step": 6261 + }, + { + "epoch": 3.4983240223463685, + "grad_norm": 0.46832942962646484, + "learning_rate": 0.000827563025210084, + "loss": 0.4185, + "step": 6262 + }, + { + "epoch": 3.498882681564246, + "grad_norm": 0.5550451278686523, + "learning_rate": 0.0008275350140056022, + "loss": 0.4034, + "step": 6263 + }, + { + "epoch": 3.499441340782123, + "grad_norm": 0.7761154770851135, + "learning_rate": 0.0008275070028011205, + "loss": 0.7377, + "step": 6264 + }, + { + "epoch": 3.5, + "grad_norm": 0.964921236038208, + "learning_rate": 0.0008274789915966387, + "loss": 0.56, + "step": 6265 + }, + { + "epoch": 3.500558659217877, + "grad_norm": 0.7288264632225037, + "learning_rate": 0.0008274509803921569, + "loss": 0.3962, + "step": 6266 + }, + { + "epoch": 3.501117318435754, + "grad_norm": 0.5606486201286316, + "learning_rate": 0.0008274229691876752, + "loss": 0.3888, + "step": 6267 + }, + { + "epoch": 3.5016759776536315, + "grad_norm": 0.6529912352561951, + "learning_rate": 0.0008273949579831933, + "loss": 0.6772, + "step": 6268 + }, + { + "epoch": 3.5022346368715085, + "grad_norm": 0.6502243876457214, + "learning_rate": 0.0008273669467787115, + "loss": 0.5324, + "step": 6269 + }, + { + "epoch": 3.5027932960893855, + "grad_norm": 0.7022506594657898, + "learning_rate": 0.0008273389355742297, + "loss": 0.4499, + "step": 6270 + }, + { + "epoch": 3.5033519553072625, + "grad_norm": 0.9374591708183289, + "learning_rate": 0.0008273109243697479, + "loss": 0.5747, + "step": 6271 + }, + { + "epoch": 3.5039106145251395, + "grad_norm": 0.4639132618904114, + "learning_rate": 0.0008272829131652662, + "loss": 0.441, + "step": 6272 + }, + { + "epoch": 3.504469273743017, + "grad_norm": 0.46832239627838135, + "learning_rate": 0.0008272549019607843, + "loss": 0.4049, + "step": 6273 + }, + { + "epoch": 3.505027932960894, + "grad_norm": 0.6344608664512634, + "learning_rate": 0.0008272268907563025, + "loss": 0.444, + "step": 6274 + }, + { + "epoch": 3.505586592178771, + "grad_norm": 0.731968879699707, + "learning_rate": 0.0008271988795518207, + "loss": 0.6096, + "step": 6275 + }, + { + "epoch": 3.506145251396648, + "grad_norm": 0.758788526058197, + "learning_rate": 0.0008271708683473389, + "loss": 0.3645, + "step": 6276 + }, + { + "epoch": 3.506703910614525, + "grad_norm": 0.5580565929412842, + "learning_rate": 0.0008271428571428572, + "loss": 0.4949, + "step": 6277 + }, + { + "epoch": 3.5072625698324025, + "grad_norm": 0.507298469543457, + "learning_rate": 0.0008271148459383753, + "loss": 0.4238, + "step": 6278 + }, + { + "epoch": 3.5078212290502795, + "grad_norm": 0.9117985367774963, + "learning_rate": 0.0008270868347338935, + "loss": 0.4057, + "step": 6279 + }, + { + "epoch": 3.5083798882681565, + "grad_norm": 2.154933214187622, + "learning_rate": 0.0008270588235294117, + "loss": 0.5608, + "step": 6280 + }, + { + "epoch": 3.5089385474860335, + "grad_norm": 0.5375345349311829, + "learning_rate": 0.00082703081232493, + "loss": 0.4625, + "step": 6281 + }, + { + "epoch": 3.5094972067039105, + "grad_norm": 0.4402356743812561, + "learning_rate": 0.0008270028011204483, + "loss": 0.3624, + "step": 6282 + }, + { + "epoch": 3.510055865921788, + "grad_norm": 0.8601878881454468, + "learning_rate": 0.0008269747899159665, + "loss": 0.585, + "step": 6283 + }, + { + "epoch": 3.5106145251396645, + "grad_norm": 0.4689737856388092, + "learning_rate": 0.0008269467787114846, + "loss": 0.5057, + "step": 6284 + }, + { + "epoch": 3.511173184357542, + "grad_norm": 0.6700764894485474, + "learning_rate": 0.0008269187675070028, + "loss": 0.4682, + "step": 6285 + }, + { + "epoch": 3.511731843575419, + "grad_norm": 0.7798172235488892, + "learning_rate": 0.000826890756302521, + "loss": 0.4129, + "step": 6286 + }, + { + "epoch": 3.512290502793296, + "grad_norm": 0.8098841309547424, + "learning_rate": 0.0008268627450980393, + "loss": 0.4749, + "step": 6287 + }, + { + "epoch": 3.512849162011173, + "grad_norm": 0.5506035089492798, + "learning_rate": 0.0008268347338935575, + "loss": 0.4992, + "step": 6288 + }, + { + "epoch": 3.51340782122905, + "grad_norm": 0.6817156672477722, + "learning_rate": 0.0008268067226890756, + "loss": 0.4863, + "step": 6289 + }, + { + "epoch": 3.5139664804469275, + "grad_norm": 0.544497549533844, + "learning_rate": 0.0008267787114845938, + "loss": 0.5951, + "step": 6290 + }, + { + "epoch": 3.5145251396648045, + "grad_norm": 0.7743503451347351, + "learning_rate": 0.000826750700280112, + "loss": 0.4196, + "step": 6291 + }, + { + "epoch": 3.5150837988826815, + "grad_norm": 0.9870738387107849, + "learning_rate": 0.0008267226890756303, + "loss": 0.5177, + "step": 6292 + }, + { + "epoch": 3.5156424581005585, + "grad_norm": 2.3739304542541504, + "learning_rate": 0.0008266946778711485, + "loss": 0.3858, + "step": 6293 + }, + { + "epoch": 3.5162011173184355, + "grad_norm": 0.6579199433326721, + "learning_rate": 0.0008266666666666666, + "loss": 0.4135, + "step": 6294 + }, + { + "epoch": 3.516759776536313, + "grad_norm": 0.5895616412162781, + "learning_rate": 0.0008266386554621848, + "loss": 0.4884, + "step": 6295 + }, + { + "epoch": 3.51731843575419, + "grad_norm": 0.7617058157920837, + "learning_rate": 0.000826610644257703, + "loss": 0.431, + "step": 6296 + }, + { + "epoch": 3.517877094972067, + "grad_norm": 2.0498316287994385, + "learning_rate": 0.0008265826330532214, + "loss": 0.524, + "step": 6297 + }, + { + "epoch": 3.518435754189944, + "grad_norm": 0.5397300124168396, + "learning_rate": 0.0008265546218487396, + "loss": 0.424, + "step": 6298 + }, + { + "epoch": 3.518994413407821, + "grad_norm": 0.907599925994873, + "learning_rate": 0.0008265266106442578, + "loss": 0.5571, + "step": 6299 + }, + { + "epoch": 3.5195530726256985, + "grad_norm": 0.5154718160629272, + "learning_rate": 0.0008264985994397759, + "loss": 0.4773, + "step": 6300 + }, + { + "epoch": 3.5201117318435755, + "grad_norm": 0.501197099685669, + "learning_rate": 0.0008264705882352941, + "loss": 0.4867, + "step": 6301 + }, + { + "epoch": 3.5206703910614525, + "grad_norm": 0.5469103455543518, + "learning_rate": 0.0008264425770308124, + "loss": 0.4265, + "step": 6302 + }, + { + "epoch": 3.5212290502793295, + "grad_norm": 0.5078546404838562, + "learning_rate": 0.0008264145658263306, + "loss": 0.5196, + "step": 6303 + }, + { + "epoch": 3.5217877094972065, + "grad_norm": 0.6976302266120911, + "learning_rate": 0.0008263865546218488, + "loss": 0.521, + "step": 6304 + }, + { + "epoch": 3.522346368715084, + "grad_norm": 0.52731853723526, + "learning_rate": 0.0008263585434173669, + "loss": 0.576, + "step": 6305 + }, + { + "epoch": 3.522905027932961, + "grad_norm": 0.8768479228019714, + "learning_rate": 0.0008263305322128851, + "loss": 0.4087, + "step": 6306 + }, + { + "epoch": 3.523463687150838, + "grad_norm": 0.6016049981117249, + "learning_rate": 0.0008263025210084034, + "loss": 0.4309, + "step": 6307 + }, + { + "epoch": 3.524022346368715, + "grad_norm": 1.674957036972046, + "learning_rate": 0.0008262745098039216, + "loss": 0.3762, + "step": 6308 + }, + { + "epoch": 3.524581005586592, + "grad_norm": 0.43001589179039, + "learning_rate": 0.0008262464985994398, + "loss": 0.4364, + "step": 6309 + }, + { + "epoch": 3.5251396648044695, + "grad_norm": 1.1241363286972046, + "learning_rate": 0.0008262184873949579, + "loss": 0.4668, + "step": 6310 + }, + { + "epoch": 3.5256983240223465, + "grad_norm": 0.4156871438026428, + "learning_rate": 0.0008261904761904761, + "loss": 0.4207, + "step": 6311 + }, + { + "epoch": 3.5262569832402235, + "grad_norm": 0.6592857837677002, + "learning_rate": 0.0008261624649859944, + "loss": 0.4434, + "step": 6312 + }, + { + "epoch": 3.5268156424581005, + "grad_norm": 0.5553011298179626, + "learning_rate": 0.0008261344537815127, + "loss": 0.4965, + "step": 6313 + }, + { + "epoch": 3.5273743016759775, + "grad_norm": 0.9002603888511658, + "learning_rate": 0.0008261064425770309, + "loss": 0.529, + "step": 6314 + }, + { + "epoch": 3.527932960893855, + "grad_norm": 0.6343851685523987, + "learning_rate": 0.0008260784313725491, + "loss": 0.4628, + "step": 6315 + }, + { + "epoch": 3.528491620111732, + "grad_norm": 0.5288112163543701, + "learning_rate": 0.0008260504201680672, + "loss": 0.4399, + "step": 6316 + }, + { + "epoch": 3.529050279329609, + "grad_norm": 0.805587887763977, + "learning_rate": 0.0008260224089635855, + "loss": 0.5357, + "step": 6317 + }, + { + "epoch": 3.529608938547486, + "grad_norm": 1.3080799579620361, + "learning_rate": 0.0008259943977591037, + "loss": 0.4533, + "step": 6318 + }, + { + "epoch": 3.530167597765363, + "grad_norm": 0.7621505856513977, + "learning_rate": 0.0008259663865546219, + "loss": 0.4495, + "step": 6319 + }, + { + "epoch": 3.5307262569832405, + "grad_norm": 0.6917195320129395, + "learning_rate": 0.0008259383753501401, + "loss": 0.5868, + "step": 6320 + }, + { + "epoch": 3.531284916201117, + "grad_norm": 0.602952241897583, + "learning_rate": 0.0008259103641456582, + "loss": 0.3552, + "step": 6321 + }, + { + "epoch": 3.5318435754189945, + "grad_norm": 0.443672239780426, + "learning_rate": 0.0008258823529411765, + "loss": 0.4313, + "step": 6322 + }, + { + "epoch": 3.5324022346368715, + "grad_norm": 0.5087466835975647, + "learning_rate": 0.0008258543417366947, + "loss": 0.3614, + "step": 6323 + }, + { + "epoch": 3.5329608938547485, + "grad_norm": 0.5650293827056885, + "learning_rate": 0.0008258263305322129, + "loss": 0.4144, + "step": 6324 + }, + { + "epoch": 3.5335195530726256, + "grad_norm": 0.547363817691803, + "learning_rate": 0.0008257983193277311, + "loss": 0.4474, + "step": 6325 + }, + { + "epoch": 3.5340782122905026, + "grad_norm": 0.47006767988204956, + "learning_rate": 0.0008257703081232492, + "loss": 0.4135, + "step": 6326 + }, + { + "epoch": 3.53463687150838, + "grad_norm": 0.5777665972709656, + "learning_rate": 0.0008257422969187675, + "loss": 0.4646, + "step": 6327 + }, + { + "epoch": 3.535195530726257, + "grad_norm": 0.4608403444290161, + "learning_rate": 0.0008257142857142857, + "loss": 0.3563, + "step": 6328 + }, + { + "epoch": 3.535754189944134, + "grad_norm": 0.4284411370754242, + "learning_rate": 0.000825686274509804, + "loss": 0.4066, + "step": 6329 + }, + { + "epoch": 3.536312849162011, + "grad_norm": 1.2041430473327637, + "learning_rate": 0.0008256582633053222, + "loss": 0.5643, + "step": 6330 + }, + { + "epoch": 3.536871508379888, + "grad_norm": 0.7602660655975342, + "learning_rate": 0.0008256302521008404, + "loss": 0.4196, + "step": 6331 + }, + { + "epoch": 3.5374301675977655, + "grad_norm": 2.63192081451416, + "learning_rate": 0.0008256022408963586, + "loss": 0.4932, + "step": 6332 + }, + { + "epoch": 3.5379888268156425, + "grad_norm": 0.6775304675102234, + "learning_rate": 0.0008255742296918768, + "loss": 0.4489, + "step": 6333 + }, + { + "epoch": 3.5385474860335195, + "grad_norm": 0.45529839396476746, + "learning_rate": 0.000825546218487395, + "loss": 0.3781, + "step": 6334 + }, + { + "epoch": 3.5391061452513966, + "grad_norm": 0.6338316202163696, + "learning_rate": 0.0008255182072829132, + "loss": 0.5974, + "step": 6335 + }, + { + "epoch": 3.5396648044692736, + "grad_norm": 0.5107612013816833, + "learning_rate": 0.0008254901960784314, + "loss": 0.4611, + "step": 6336 + }, + { + "epoch": 3.540223463687151, + "grad_norm": 0.48357951641082764, + "learning_rate": 0.0008254621848739496, + "loss": 0.4755, + "step": 6337 + }, + { + "epoch": 3.540782122905028, + "grad_norm": 0.6447745561599731, + "learning_rate": 0.0008254341736694678, + "loss": 0.5034, + "step": 6338 + }, + { + "epoch": 3.541340782122905, + "grad_norm": 0.6290780901908875, + "learning_rate": 0.000825406162464986, + "loss": 0.6193, + "step": 6339 + }, + { + "epoch": 3.541899441340782, + "grad_norm": 0.5462241768836975, + "learning_rate": 0.0008253781512605042, + "loss": 0.4217, + "step": 6340 + }, + { + "epoch": 3.542458100558659, + "grad_norm": 0.41927841305732727, + "learning_rate": 0.0008253501400560224, + "loss": 0.4314, + "step": 6341 + }, + { + "epoch": 3.5430167597765365, + "grad_norm": 0.3848975598812103, + "learning_rate": 0.0008253221288515406, + "loss": 0.4251, + "step": 6342 + }, + { + "epoch": 3.5435754189944135, + "grad_norm": 0.614855170249939, + "learning_rate": 0.0008252941176470588, + "loss": 0.3531, + "step": 6343 + }, + { + "epoch": 3.5441340782122905, + "grad_norm": 0.3214449882507324, + "learning_rate": 0.000825266106442577, + "loss": 0.3856, + "step": 6344 + }, + { + "epoch": 3.5446927374301676, + "grad_norm": 0.5649007558822632, + "learning_rate": 0.0008252380952380952, + "loss": 0.5055, + "step": 6345 + }, + { + "epoch": 3.5452513966480446, + "grad_norm": 0.5173424482345581, + "learning_rate": 0.0008252100840336135, + "loss": 0.4799, + "step": 6346 + }, + { + "epoch": 3.545810055865922, + "grad_norm": 0.4183085560798645, + "learning_rate": 0.0008251820728291318, + "loss": 0.378, + "step": 6347 + }, + { + "epoch": 3.546368715083799, + "grad_norm": 1.2555066347122192, + "learning_rate": 0.0008251540616246499, + "loss": 0.4145, + "step": 6348 + }, + { + "epoch": 3.546927374301676, + "grad_norm": 0.8779321908950806, + "learning_rate": 0.0008251260504201681, + "loss": 0.6206, + "step": 6349 + }, + { + "epoch": 3.547486033519553, + "grad_norm": 0.5858129262924194, + "learning_rate": 0.0008250980392156863, + "loss": 0.2681, + "step": 6350 + }, + { + "epoch": 3.54804469273743, + "grad_norm": 0.41439181566238403, + "learning_rate": 0.0008250700280112045, + "loss": 0.399, + "step": 6351 + }, + { + "epoch": 3.5486033519553075, + "grad_norm": 0.5139670372009277, + "learning_rate": 0.0008250420168067228, + "loss": 0.4459, + "step": 6352 + }, + { + "epoch": 3.549162011173184, + "grad_norm": 0.5441014766693115, + "learning_rate": 0.0008250140056022409, + "loss": 0.4204, + "step": 6353 + }, + { + "epoch": 3.5497206703910615, + "grad_norm": 0.5554166436195374, + "learning_rate": 0.0008249859943977591, + "loss": 0.4715, + "step": 6354 + }, + { + "epoch": 3.5502793296089385, + "grad_norm": 6.790686130523682, + "learning_rate": 0.0008249579831932773, + "loss": 0.4577, + "step": 6355 + }, + { + "epoch": 3.5508379888268156, + "grad_norm": 0.6405669450759888, + "learning_rate": 0.0008249299719887955, + "loss": 0.4864, + "step": 6356 + }, + { + "epoch": 3.5513966480446926, + "grad_norm": 0.49734386801719666, + "learning_rate": 0.0008249019607843138, + "loss": 0.5342, + "step": 6357 + }, + { + "epoch": 3.5519553072625696, + "grad_norm": 0.8427916765213013, + "learning_rate": 0.0008248739495798319, + "loss": 0.5436, + "step": 6358 + }, + { + "epoch": 3.552513966480447, + "grad_norm": 0.5033753514289856, + "learning_rate": 0.0008248459383753501, + "loss": 0.351, + "step": 6359 + }, + { + "epoch": 3.553072625698324, + "grad_norm": 0.841964066028595, + "learning_rate": 0.0008248179271708683, + "loss": 0.5495, + "step": 6360 + }, + { + "epoch": 3.553631284916201, + "grad_norm": 0.5770707726478577, + "learning_rate": 0.0008247899159663865, + "loss": 0.6003, + "step": 6361 + }, + { + "epoch": 3.554189944134078, + "grad_norm": 0.48813343048095703, + "learning_rate": 0.0008247619047619049, + "loss": 0.4487, + "step": 6362 + }, + { + "epoch": 3.554748603351955, + "grad_norm": 0.38025999069213867, + "learning_rate": 0.0008247338935574231, + "loss": 0.5018, + "step": 6363 + }, + { + "epoch": 3.5553072625698325, + "grad_norm": 0.6288140416145325, + "learning_rate": 0.0008247058823529412, + "loss": 0.3701, + "step": 6364 + }, + { + "epoch": 3.5558659217877095, + "grad_norm": 0.5462133884429932, + "learning_rate": 0.0008246778711484594, + "loss": 0.4782, + "step": 6365 + }, + { + "epoch": 3.5564245810055866, + "grad_norm": 0.41887781023979187, + "learning_rate": 0.0008246498599439776, + "loss": 0.3922, + "step": 6366 + }, + { + "epoch": 3.5569832402234636, + "grad_norm": 0.41739621758461, + "learning_rate": 0.0008246218487394959, + "loss": 0.3886, + "step": 6367 + }, + { + "epoch": 3.5575418994413406, + "grad_norm": 0.9115992784500122, + "learning_rate": 0.0008245938375350141, + "loss": 0.4778, + "step": 6368 + }, + { + "epoch": 3.558100558659218, + "grad_norm": 0.637393057346344, + "learning_rate": 0.0008245658263305322, + "loss": 0.5296, + "step": 6369 + }, + { + "epoch": 3.558659217877095, + "grad_norm": 0.8492223024368286, + "learning_rate": 0.0008245378151260504, + "loss": 0.4015, + "step": 6370 + }, + { + "epoch": 3.559217877094972, + "grad_norm": 0.6010319590568542, + "learning_rate": 0.0008245098039215686, + "loss": 0.4767, + "step": 6371 + }, + { + "epoch": 3.559776536312849, + "grad_norm": 0.5058609843254089, + "learning_rate": 0.0008244817927170869, + "loss": 0.5415, + "step": 6372 + }, + { + "epoch": 3.560335195530726, + "grad_norm": 0.767530083656311, + "learning_rate": 0.0008244537815126051, + "loss": 0.446, + "step": 6373 + }, + { + "epoch": 3.5608938547486035, + "grad_norm": 0.9753162860870361, + "learning_rate": 0.0008244257703081232, + "loss": 0.4386, + "step": 6374 + }, + { + "epoch": 3.5614525139664805, + "grad_norm": 0.3988550901412964, + "learning_rate": 0.0008243977591036414, + "loss": 0.3952, + "step": 6375 + }, + { + "epoch": 3.5620111731843576, + "grad_norm": 0.6361387968063354, + "learning_rate": 0.0008243697478991596, + "loss": 0.478, + "step": 6376 + }, + { + "epoch": 3.5625698324022346, + "grad_norm": 0.6381384134292603, + "learning_rate": 0.0008243417366946779, + "loss": 0.522, + "step": 6377 + }, + { + "epoch": 3.5631284916201116, + "grad_norm": 1.174181342124939, + "learning_rate": 0.0008243137254901962, + "loss": 0.5317, + "step": 6378 + }, + { + "epoch": 3.563687150837989, + "grad_norm": 0.7982651591300964, + "learning_rate": 0.0008242857142857144, + "loss": 0.4036, + "step": 6379 + }, + { + "epoch": 3.564245810055866, + "grad_norm": 0.3963726758956909, + "learning_rate": 0.0008242577030812325, + "loss": 0.3298, + "step": 6380 + }, + { + "epoch": 3.564804469273743, + "grad_norm": 0.488386869430542, + "learning_rate": 0.0008242296918767507, + "loss": 0.473, + "step": 6381 + }, + { + "epoch": 3.56536312849162, + "grad_norm": 0.552078366279602, + "learning_rate": 0.000824201680672269, + "loss": 0.3987, + "step": 6382 + }, + { + "epoch": 3.565921787709497, + "grad_norm": 0.6977636218070984, + "learning_rate": 0.0008241736694677872, + "loss": 0.5865, + "step": 6383 + }, + { + "epoch": 3.5664804469273745, + "grad_norm": 0.5130059719085693, + "learning_rate": 0.0008241456582633054, + "loss": 0.4821, + "step": 6384 + }, + { + "epoch": 3.5670391061452515, + "grad_norm": 0.516468346118927, + "learning_rate": 0.0008241176470588235, + "loss": 0.5345, + "step": 6385 + }, + { + "epoch": 3.5675977653631286, + "grad_norm": 3.8563690185546875, + "learning_rate": 0.0008240896358543417, + "loss": 0.4306, + "step": 6386 + }, + { + "epoch": 3.5681564245810056, + "grad_norm": 0.6177405118942261, + "learning_rate": 0.00082406162464986, + "loss": 0.4676, + "step": 6387 + }, + { + "epoch": 3.5687150837988826, + "grad_norm": 0.9815372228622437, + "learning_rate": 0.0008240336134453782, + "loss": 0.4165, + "step": 6388 + }, + { + "epoch": 3.56927374301676, + "grad_norm": 0.487702339887619, + "learning_rate": 0.0008240056022408964, + "loss": 0.4252, + "step": 6389 + }, + { + "epoch": 3.5698324022346366, + "grad_norm": 0.8767315745353699, + "learning_rate": 0.0008239775910364145, + "loss": 0.3639, + "step": 6390 + }, + { + "epoch": 3.570391061452514, + "grad_norm": 0.40929293632507324, + "learning_rate": 0.0008239495798319327, + "loss": 0.3955, + "step": 6391 + }, + { + "epoch": 3.570949720670391, + "grad_norm": 0.5693562626838684, + "learning_rate": 0.000823921568627451, + "loss": 0.4986, + "step": 6392 + }, + { + "epoch": 3.571508379888268, + "grad_norm": 0.4609087109565735, + "learning_rate": 0.0008238935574229692, + "loss": 0.5037, + "step": 6393 + }, + { + "epoch": 3.572067039106145, + "grad_norm": 1.7756911516189575, + "learning_rate": 0.0008238655462184874, + "loss": 0.5288, + "step": 6394 + }, + { + "epoch": 3.572625698324022, + "grad_norm": 0.6726887226104736, + "learning_rate": 0.0008238375350140057, + "loss": 0.4616, + "step": 6395 + }, + { + "epoch": 3.5731843575418996, + "grad_norm": 0.40285179018974304, + "learning_rate": 0.0008238095238095238, + "loss": 0.3809, + "step": 6396 + }, + { + "epoch": 3.5737430167597766, + "grad_norm": 1.3979545831680298, + "learning_rate": 0.0008237815126050421, + "loss": 0.4524, + "step": 6397 + }, + { + "epoch": 3.5743016759776536, + "grad_norm": 0.618635892868042, + "learning_rate": 0.0008237535014005603, + "loss": 0.4365, + "step": 6398 + }, + { + "epoch": 3.5748603351955306, + "grad_norm": 0.5117287635803223, + "learning_rate": 0.0008237254901960785, + "loss": 0.4755, + "step": 6399 + }, + { + "epoch": 3.5754189944134076, + "grad_norm": 1.8905123472213745, + "learning_rate": 0.0008236974789915967, + "loss": 0.4277, + "step": 6400 + }, + { + "epoch": 3.575977653631285, + "grad_norm": 0.5427520275115967, + "learning_rate": 0.0008236694677871148, + "loss": 0.5016, + "step": 6401 + }, + { + "epoch": 3.576536312849162, + "grad_norm": 0.42610442638397217, + "learning_rate": 0.0008236414565826331, + "loss": 0.3809, + "step": 6402 + }, + { + "epoch": 3.577094972067039, + "grad_norm": 0.5109557509422302, + "learning_rate": 0.0008236134453781513, + "loss": 0.5045, + "step": 6403 + }, + { + "epoch": 3.577653631284916, + "grad_norm": 0.3809089660644531, + "learning_rate": 0.0008235854341736695, + "loss": 0.3944, + "step": 6404 + }, + { + "epoch": 3.578212290502793, + "grad_norm": 0.5951355695724487, + "learning_rate": 0.0008235574229691877, + "loss": 0.423, + "step": 6405 + }, + { + "epoch": 3.5787709497206706, + "grad_norm": 0.6134072542190552, + "learning_rate": 0.0008235294117647058, + "loss": 0.4461, + "step": 6406 + }, + { + "epoch": 3.5793296089385476, + "grad_norm": 0.47377634048461914, + "learning_rate": 0.0008235014005602241, + "loss": 0.5002, + "step": 6407 + }, + { + "epoch": 3.5798882681564246, + "grad_norm": 0.5466529130935669, + "learning_rate": 0.0008234733893557423, + "loss": 0.4641, + "step": 6408 + }, + { + "epoch": 3.5804469273743016, + "grad_norm": 0.4330349862575531, + "learning_rate": 0.0008234453781512605, + "loss": 0.4305, + "step": 6409 + }, + { + "epoch": 3.5810055865921786, + "grad_norm": 0.8357173204421997, + "learning_rate": 0.0008234173669467787, + "loss": 0.524, + "step": 6410 + }, + { + "epoch": 3.581564245810056, + "grad_norm": 0.5404726266860962, + "learning_rate": 0.000823389355742297, + "loss": 0.5421, + "step": 6411 + }, + { + "epoch": 3.582122905027933, + "grad_norm": 0.40846315026283264, + "learning_rate": 0.0008233613445378152, + "loss": 0.4173, + "step": 6412 + }, + { + "epoch": 3.58268156424581, + "grad_norm": 0.4397639334201813, + "learning_rate": 0.0008233333333333334, + "loss": 0.3876, + "step": 6413 + }, + { + "epoch": 3.583240223463687, + "grad_norm": 2.4906694889068604, + "learning_rate": 0.0008233053221288516, + "loss": 0.426, + "step": 6414 + }, + { + "epoch": 3.583798882681564, + "grad_norm": 0.9563032388687134, + "learning_rate": 0.0008232773109243698, + "loss": 0.4847, + "step": 6415 + }, + { + "epoch": 3.5843575418994416, + "grad_norm": 0.40227943658828735, + "learning_rate": 0.000823249299719888, + "loss": 0.4023, + "step": 6416 + }, + { + "epoch": 3.5849162011173186, + "grad_norm": 4.108040809631348, + "learning_rate": 0.0008232212885154062, + "loss": 0.4843, + "step": 6417 + }, + { + "epoch": 3.5854748603351956, + "grad_norm": 0.43629199266433716, + "learning_rate": 0.0008231932773109244, + "loss": 0.4363, + "step": 6418 + }, + { + "epoch": 3.5860335195530726, + "grad_norm": 0.5753178000450134, + "learning_rate": 0.0008231652661064426, + "loss": 0.3836, + "step": 6419 + }, + { + "epoch": 3.5865921787709496, + "grad_norm": 0.6297770142555237, + "learning_rate": 0.0008231372549019608, + "loss": 0.4496, + "step": 6420 + }, + { + "epoch": 3.587150837988827, + "grad_norm": 1.8221763372421265, + "learning_rate": 0.000823109243697479, + "loss": 0.4854, + "step": 6421 + }, + { + "epoch": 3.587709497206704, + "grad_norm": 0.6255560517311096, + "learning_rate": 0.0008230812324929972, + "loss": 0.4266, + "step": 6422 + }, + { + "epoch": 3.588268156424581, + "grad_norm": 0.5087469816207886, + "learning_rate": 0.0008230532212885154, + "loss": 0.4282, + "step": 6423 + }, + { + "epoch": 3.588826815642458, + "grad_norm": 0.5999743342399597, + "learning_rate": 0.0008230252100840336, + "loss": 0.4311, + "step": 6424 + }, + { + "epoch": 3.589385474860335, + "grad_norm": 0.39028701186180115, + "learning_rate": 0.0008229971988795518, + "loss": 0.4005, + "step": 6425 + }, + { + "epoch": 3.5899441340782126, + "grad_norm": 0.5782503485679626, + "learning_rate": 0.00082296918767507, + "loss": 0.491, + "step": 6426 + }, + { + "epoch": 3.590502793296089, + "grad_norm": 0.4766971170902252, + "learning_rate": 0.0008229411764705884, + "loss": 0.5077, + "step": 6427 + }, + { + "epoch": 3.5910614525139666, + "grad_norm": 0.5042833089828491, + "learning_rate": 0.0008229131652661065, + "loss": 0.3267, + "step": 6428 + }, + { + "epoch": 3.5916201117318436, + "grad_norm": 0.6977332830429077, + "learning_rate": 0.0008228851540616247, + "loss": 0.6247, + "step": 6429 + }, + { + "epoch": 3.5921787709497206, + "grad_norm": 0.555036187171936, + "learning_rate": 0.0008228571428571429, + "loss": 0.5664, + "step": 6430 + }, + { + "epoch": 3.5927374301675976, + "grad_norm": 0.8549402952194214, + "learning_rate": 0.0008228291316526611, + "loss": 0.4764, + "step": 6431 + }, + { + "epoch": 3.5932960893854746, + "grad_norm": 0.40853291749954224, + "learning_rate": 0.0008228011204481794, + "loss": 0.3969, + "step": 6432 + }, + { + "epoch": 3.593854748603352, + "grad_norm": 0.5229538679122925, + "learning_rate": 0.0008227731092436975, + "loss": 0.5448, + "step": 6433 + }, + { + "epoch": 3.594413407821229, + "grad_norm": 0.5672826766967773, + "learning_rate": 0.0008227450980392157, + "loss": 0.3375, + "step": 6434 + }, + { + "epoch": 3.594972067039106, + "grad_norm": 0.5265349745750427, + "learning_rate": 0.0008227170868347339, + "loss": 0.4105, + "step": 6435 + }, + { + "epoch": 3.595530726256983, + "grad_norm": 1.0345433950424194, + "learning_rate": 0.0008226890756302521, + "loss": 0.4641, + "step": 6436 + }, + { + "epoch": 3.59608938547486, + "grad_norm": 0.5872271656990051, + "learning_rate": 0.0008226610644257704, + "loss": 0.6174, + "step": 6437 + }, + { + "epoch": 3.5966480446927376, + "grad_norm": 0.4988188147544861, + "learning_rate": 0.0008226330532212885, + "loss": 0.497, + "step": 6438 + }, + { + "epoch": 3.5972067039106146, + "grad_norm": 1.1633720397949219, + "learning_rate": 0.0008226050420168067, + "loss": 0.3614, + "step": 6439 + }, + { + "epoch": 3.5977653631284916, + "grad_norm": 0.5424314141273499, + "learning_rate": 0.0008225770308123249, + "loss": 0.465, + "step": 6440 + }, + { + "epoch": 3.5983240223463686, + "grad_norm": 0.7888202667236328, + "learning_rate": 0.0008225490196078431, + "loss": 0.5604, + "step": 6441 + }, + { + "epoch": 3.5988826815642456, + "grad_norm": 0.6142560839653015, + "learning_rate": 0.0008225210084033614, + "loss": 0.3842, + "step": 6442 + }, + { + "epoch": 3.599441340782123, + "grad_norm": 0.4416868984699249, + "learning_rate": 0.0008224929971988796, + "loss": 0.3801, + "step": 6443 + }, + { + "epoch": 3.6, + "grad_norm": 0.38662686944007874, + "learning_rate": 0.0008224649859943977, + "loss": 0.3578, + "step": 6444 + }, + { + "epoch": 3.600558659217877, + "grad_norm": 15.736149787902832, + "learning_rate": 0.000822436974789916, + "loss": 0.3535, + "step": 6445 + }, + { + "epoch": 3.601117318435754, + "grad_norm": 0.5843623280525208, + "learning_rate": 0.0008224089635854342, + "loss": 0.4783, + "step": 6446 + }, + { + "epoch": 3.601675977653631, + "grad_norm": 0.46261361241340637, + "learning_rate": 0.0008223809523809525, + "loss": 0.4116, + "step": 6447 + }, + { + "epoch": 3.6022346368715086, + "grad_norm": 0.623014509677887, + "learning_rate": 0.0008223529411764707, + "loss": 0.4058, + "step": 6448 + }, + { + "epoch": 3.6027932960893856, + "grad_norm": 0.5723839998245239, + "learning_rate": 0.0008223249299719888, + "loss": 0.4354, + "step": 6449 + }, + { + "epoch": 3.6033519553072626, + "grad_norm": 0.8480479717254639, + "learning_rate": 0.000822296918767507, + "loss": 0.5407, + "step": 6450 + }, + { + "epoch": 3.6039106145251396, + "grad_norm": 0.4948956072330475, + "learning_rate": 0.0008222689075630252, + "loss": 0.5575, + "step": 6451 + }, + { + "epoch": 3.6044692737430166, + "grad_norm": 1.755097508430481, + "learning_rate": 0.0008222408963585435, + "loss": 0.4199, + "step": 6452 + }, + { + "epoch": 3.605027932960894, + "grad_norm": 0.634207546710968, + "learning_rate": 0.0008222128851540617, + "loss": 0.606, + "step": 6453 + }, + { + "epoch": 3.605586592178771, + "grad_norm": 0.8725922703742981, + "learning_rate": 0.0008221848739495798, + "loss": 0.5041, + "step": 6454 + }, + { + "epoch": 3.606145251396648, + "grad_norm": 0.6817995309829712, + "learning_rate": 0.000822156862745098, + "loss": 0.5345, + "step": 6455 + }, + { + "epoch": 3.606703910614525, + "grad_norm": 0.6545759439468384, + "learning_rate": 0.0008221288515406162, + "loss": 0.4908, + "step": 6456 + }, + { + "epoch": 3.607262569832402, + "grad_norm": 0.7679895162582397, + "learning_rate": 0.0008221008403361345, + "loss": 0.4979, + "step": 6457 + }, + { + "epoch": 3.6078212290502796, + "grad_norm": 0.5004749894142151, + "learning_rate": 0.0008220728291316527, + "loss": 0.4487, + "step": 6458 + }, + { + "epoch": 3.6083798882681566, + "grad_norm": 0.7766402959823608, + "learning_rate": 0.0008220448179271709, + "loss": 0.5176, + "step": 6459 + }, + { + "epoch": 3.6089385474860336, + "grad_norm": 0.8019154667854309, + "learning_rate": 0.000822016806722689, + "loss": 0.4483, + "step": 6460 + }, + { + "epoch": 3.6094972067039106, + "grad_norm": 0.5108367800712585, + "learning_rate": 0.0008219887955182072, + "loss": 0.5476, + "step": 6461 + }, + { + "epoch": 3.6100558659217876, + "grad_norm": 0.4819338619709015, + "learning_rate": 0.0008219607843137256, + "loss": 0.4242, + "step": 6462 + }, + { + "epoch": 3.610614525139665, + "grad_norm": 0.5087612867355347, + "learning_rate": 0.0008219327731092438, + "loss": 0.5091, + "step": 6463 + }, + { + "epoch": 3.6111731843575416, + "grad_norm": 0.9313182830810547, + "learning_rate": 0.000821904761904762, + "loss": 0.7428, + "step": 6464 + }, + { + "epoch": 3.611731843575419, + "grad_norm": 1.1090079545974731, + "learning_rate": 0.0008218767507002801, + "loss": 0.4983, + "step": 6465 + }, + { + "epoch": 3.612290502793296, + "grad_norm": 0.4408183991909027, + "learning_rate": 0.0008218487394957983, + "loss": 0.4667, + "step": 6466 + }, + { + "epoch": 3.612849162011173, + "grad_norm": 0.7039101123809814, + "learning_rate": 0.0008218207282913165, + "loss": 0.3746, + "step": 6467 + }, + { + "epoch": 3.61340782122905, + "grad_norm": 0.6735904812812805, + "learning_rate": 0.0008217927170868348, + "loss": 0.4699, + "step": 6468 + }, + { + "epoch": 3.613966480446927, + "grad_norm": 0.6414945721626282, + "learning_rate": 0.000821764705882353, + "loss": 0.4009, + "step": 6469 + }, + { + "epoch": 3.6145251396648046, + "grad_norm": 0.6738402247428894, + "learning_rate": 0.0008217366946778711, + "loss": 0.5266, + "step": 6470 + }, + { + "epoch": 3.6150837988826816, + "grad_norm": 0.8753194212913513, + "learning_rate": 0.0008217086834733893, + "loss": 0.4415, + "step": 6471 + }, + { + "epoch": 3.6156424581005586, + "grad_norm": 0.537079393863678, + "learning_rate": 0.0008216806722689075, + "loss": 0.5859, + "step": 6472 + }, + { + "epoch": 3.6162011173184356, + "grad_norm": 0.831777811050415, + "learning_rate": 0.0008216526610644258, + "loss": 0.4068, + "step": 6473 + }, + { + "epoch": 3.6167597765363126, + "grad_norm": 0.46099838614463806, + "learning_rate": 0.000821624649859944, + "loss": 0.499, + "step": 6474 + }, + { + "epoch": 3.61731843575419, + "grad_norm": 0.6289117932319641, + "learning_rate": 0.0008215966386554622, + "loss": 0.4864, + "step": 6475 + }, + { + "epoch": 3.617877094972067, + "grad_norm": 0.7247985601425171, + "learning_rate": 0.0008215686274509803, + "loss": 0.5355, + "step": 6476 + }, + { + "epoch": 3.618435754189944, + "grad_norm": 0.4053064286708832, + "learning_rate": 0.0008215406162464985, + "loss": 0.4343, + "step": 6477 + }, + { + "epoch": 3.618994413407821, + "grad_norm": 0.6325204372406006, + "learning_rate": 0.0008215126050420169, + "loss": 0.3954, + "step": 6478 + }, + { + "epoch": 3.619553072625698, + "grad_norm": 0.5867834091186523, + "learning_rate": 0.0008214845938375351, + "loss": 0.3693, + "step": 6479 + }, + { + "epoch": 3.6201117318435756, + "grad_norm": 0.694374680519104, + "learning_rate": 0.0008214565826330533, + "loss": 0.5966, + "step": 6480 + }, + { + "epoch": 3.6206703910614526, + "grad_norm": 0.5166329145431519, + "learning_rate": 0.0008214285714285714, + "loss": 0.3699, + "step": 6481 + }, + { + "epoch": 3.6212290502793296, + "grad_norm": 22.80353355407715, + "learning_rate": 0.0008214005602240896, + "loss": 0.529, + "step": 6482 + }, + { + "epoch": 3.6217877094972066, + "grad_norm": 0.4705682098865509, + "learning_rate": 0.0008213725490196079, + "loss": 0.3925, + "step": 6483 + }, + { + "epoch": 3.6223463687150836, + "grad_norm": 1.3270971775054932, + "learning_rate": 0.0008213445378151261, + "loss": 0.4902, + "step": 6484 + }, + { + "epoch": 3.622905027932961, + "grad_norm": 2.1248257160186768, + "learning_rate": 0.0008213165266106443, + "loss": 0.4774, + "step": 6485 + }, + { + "epoch": 3.623463687150838, + "grad_norm": 0.5341605544090271, + "learning_rate": 0.0008212885154061624, + "loss": 0.4741, + "step": 6486 + }, + { + "epoch": 3.624022346368715, + "grad_norm": 0.4652163088321686, + "learning_rate": 0.0008212605042016806, + "loss": 0.3937, + "step": 6487 + }, + { + "epoch": 3.624581005586592, + "grad_norm": 0.5006020665168762, + "learning_rate": 0.0008212324929971989, + "loss": 0.4843, + "step": 6488 + }, + { + "epoch": 3.625139664804469, + "grad_norm": 0.752861499786377, + "learning_rate": 0.0008212044817927171, + "loss": 0.6368, + "step": 6489 + }, + { + "epoch": 3.6256983240223466, + "grad_norm": 0.7518357038497925, + "learning_rate": 0.0008211764705882353, + "loss": 0.4761, + "step": 6490 + }, + { + "epoch": 3.6262569832402236, + "grad_norm": 0.7350558042526245, + "learning_rate": 0.0008211484593837535, + "loss": 0.3349, + "step": 6491 + }, + { + "epoch": 3.6268156424581006, + "grad_norm": 0.4279731214046478, + "learning_rate": 0.0008211204481792716, + "loss": 0.3346, + "step": 6492 + }, + { + "epoch": 3.6273743016759776, + "grad_norm": 0.7430214881896973, + "learning_rate": 0.00082109243697479, + "loss": 0.3664, + "step": 6493 + }, + { + "epoch": 3.6279329608938546, + "grad_norm": 0.5654424428939819, + "learning_rate": 0.0008210644257703082, + "loss": 0.3604, + "step": 6494 + }, + { + "epoch": 3.628491620111732, + "grad_norm": 1.3331470489501953, + "learning_rate": 0.0008210364145658264, + "loss": 0.4775, + "step": 6495 + }, + { + "epoch": 3.6290502793296087, + "grad_norm": 0.5819928050041199, + "learning_rate": 0.0008210084033613446, + "loss": 0.4449, + "step": 6496 + }, + { + "epoch": 3.629608938547486, + "grad_norm": 0.5940769910812378, + "learning_rate": 0.0008209803921568627, + "loss": 0.483, + "step": 6497 + }, + { + "epoch": 3.630167597765363, + "grad_norm": 0.7750584483146667, + "learning_rate": 0.000820952380952381, + "loss": 0.509, + "step": 6498 + }, + { + "epoch": 3.63072625698324, + "grad_norm": 0.5893155932426453, + "learning_rate": 0.0008209243697478992, + "loss": 0.4488, + "step": 6499 + }, + { + "epoch": 3.631284916201117, + "grad_norm": 0.5505998730659485, + "learning_rate": 0.0008208963585434174, + "loss": 0.4699, + "step": 6500 + }, + { + "epoch": 3.631284916201117, + "eval_cer": 0.09406675177026393, + "eval_loss": 0.35760098695755005, + "eval_runtime": 55.7703, + "eval_samples_per_second": 81.369, + "eval_steps_per_second": 5.092, + "eval_wer": 0.3711069523197676, + "step": 6500 + }, + { + "epoch": 3.631843575418994, + "grad_norm": 0.5845969319343567, + "learning_rate": 0.0008208683473389356, + "loss": 0.4983, + "step": 6501 + }, + { + "epoch": 3.6324022346368716, + "grad_norm": 0.46298035979270935, + "learning_rate": 0.0008208403361344537, + "loss": 0.4643, + "step": 6502 + }, + { + "epoch": 3.6329608938547486, + "grad_norm": 0.3908912241458893, + "learning_rate": 0.000820812324929972, + "loss": 0.4557, + "step": 6503 + }, + { + "epoch": 3.6335195530726256, + "grad_norm": 0.5883150100708008, + "learning_rate": 0.0008207843137254902, + "loss": 0.4242, + "step": 6504 + }, + { + "epoch": 3.6340782122905027, + "grad_norm": 0.7345861792564392, + "learning_rate": 0.0008207563025210084, + "loss": 0.6422, + "step": 6505 + }, + { + "epoch": 3.6346368715083797, + "grad_norm": 0.4540961682796478, + "learning_rate": 0.0008207282913165266, + "loss": 0.4319, + "step": 6506 + }, + { + "epoch": 3.635195530726257, + "grad_norm": 0.4743022322654724, + "learning_rate": 0.0008207002801120448, + "loss": 0.5672, + "step": 6507 + }, + { + "epoch": 3.635754189944134, + "grad_norm": 0.4285391569137573, + "learning_rate": 0.000820672268907563, + "loss": 0.4264, + "step": 6508 + }, + { + "epoch": 3.636312849162011, + "grad_norm": 0.7424596548080444, + "learning_rate": 0.0008206442577030812, + "loss": 0.406, + "step": 6509 + }, + { + "epoch": 3.636871508379888, + "grad_norm": 0.5257818102836609, + "learning_rate": 0.0008206162464985995, + "loss": 0.4395, + "step": 6510 + }, + { + "epoch": 3.637430167597765, + "grad_norm": 0.6352879405021667, + "learning_rate": 0.0008205882352941177, + "loss": 0.6214, + "step": 6511 + }, + { + "epoch": 3.6379888268156426, + "grad_norm": 1.3795509338378906, + "learning_rate": 0.0008205602240896359, + "loss": 0.5127, + "step": 6512 + }, + { + "epoch": 3.6385474860335196, + "grad_norm": 0.4487282931804657, + "learning_rate": 0.0008205322128851541, + "loss": 0.4578, + "step": 6513 + }, + { + "epoch": 3.6391061452513966, + "grad_norm": 0.48283979296684265, + "learning_rate": 0.0008205042016806723, + "loss": 0.4808, + "step": 6514 + }, + { + "epoch": 3.6396648044692737, + "grad_norm": 0.4567292332649231, + "learning_rate": 0.0008204761904761905, + "loss": 0.5098, + "step": 6515 + }, + { + "epoch": 3.6402234636871507, + "grad_norm": 0.8431621193885803, + "learning_rate": 0.0008204481792717087, + "loss": 0.524, + "step": 6516 + }, + { + "epoch": 3.640782122905028, + "grad_norm": 0.5462682247161865, + "learning_rate": 0.0008204201680672269, + "loss": 0.3413, + "step": 6517 + }, + { + "epoch": 3.641340782122905, + "grad_norm": 0.42183825373649597, + "learning_rate": 0.0008203921568627452, + "loss": 0.4816, + "step": 6518 + }, + { + "epoch": 3.641899441340782, + "grad_norm": 1.2426623106002808, + "learning_rate": 0.0008203641456582633, + "loss": 0.4582, + "step": 6519 + }, + { + "epoch": 3.642458100558659, + "grad_norm": 0.6737184524536133, + "learning_rate": 0.0008203361344537815, + "loss": 0.5678, + "step": 6520 + }, + { + "epoch": 3.643016759776536, + "grad_norm": 0.4400502145290375, + "learning_rate": 0.0008203081232492997, + "loss": 0.4366, + "step": 6521 + }, + { + "epoch": 3.6435754189944136, + "grad_norm": 1.0397083759307861, + "learning_rate": 0.0008202801120448179, + "loss": 0.5584, + "step": 6522 + }, + { + "epoch": 3.6441340782122906, + "grad_norm": 0.49252504110336304, + "learning_rate": 0.0008202521008403362, + "loss": 0.4701, + "step": 6523 + }, + { + "epoch": 3.6446927374301676, + "grad_norm": 0.6515716910362244, + "learning_rate": 0.0008202240896358543, + "loss": 0.4662, + "step": 6524 + }, + { + "epoch": 3.6452513966480447, + "grad_norm": 0.532579243183136, + "learning_rate": 0.0008201960784313725, + "loss": 0.5567, + "step": 6525 + }, + { + "epoch": 3.6458100558659217, + "grad_norm": 0.5238862633705139, + "learning_rate": 0.0008201680672268907, + "loss": 0.4799, + "step": 6526 + }, + { + "epoch": 3.646368715083799, + "grad_norm": 0.5112152695655823, + "learning_rate": 0.000820140056022409, + "loss": 0.3977, + "step": 6527 + }, + { + "epoch": 3.646927374301676, + "grad_norm": 1.3041168451309204, + "learning_rate": 0.0008201120448179273, + "loss": 0.5636, + "step": 6528 + }, + { + "epoch": 3.647486033519553, + "grad_norm": 2.3652446269989014, + "learning_rate": 0.0008200840336134454, + "loss": 0.406, + "step": 6529 + }, + { + "epoch": 3.64804469273743, + "grad_norm": 0.4410611093044281, + "learning_rate": 0.0008200560224089636, + "loss": 0.4464, + "step": 6530 + }, + { + "epoch": 3.648603351955307, + "grad_norm": 0.5673394203186035, + "learning_rate": 0.0008200280112044818, + "loss": 0.4015, + "step": 6531 + }, + { + "epoch": 3.6491620111731846, + "grad_norm": 0.45798346400260925, + "learning_rate": 0.00082, + "loss": 0.5285, + "step": 6532 + }, + { + "epoch": 3.649720670391061, + "grad_norm": 0.417288213968277, + "learning_rate": 0.0008199719887955183, + "loss": 0.4092, + "step": 6533 + }, + { + "epoch": 3.6502793296089386, + "grad_norm": 0.5197558403015137, + "learning_rate": 0.0008199439775910365, + "loss": 0.421, + "step": 6534 + }, + { + "epoch": 3.6508379888268156, + "grad_norm": 0.8692957162857056, + "learning_rate": 0.0008199159663865546, + "loss": 0.5988, + "step": 6535 + }, + { + "epoch": 3.6513966480446927, + "grad_norm": 0.6718247532844543, + "learning_rate": 0.0008198879551820728, + "loss": 0.474, + "step": 6536 + }, + { + "epoch": 3.6519553072625697, + "grad_norm": 0.5074272751808167, + "learning_rate": 0.000819859943977591, + "loss": 0.4871, + "step": 6537 + }, + { + "epoch": 3.6525139664804467, + "grad_norm": 0.5672338604927063, + "learning_rate": 0.0008198319327731093, + "loss": 0.4436, + "step": 6538 + }, + { + "epoch": 3.653072625698324, + "grad_norm": 0.8451388478279114, + "learning_rate": 0.0008198039215686275, + "loss": 0.4759, + "step": 6539 + }, + { + "epoch": 3.653631284916201, + "grad_norm": 10.254681587219238, + "learning_rate": 0.0008197759103641456, + "loss": 0.4975, + "step": 6540 + }, + { + "epoch": 3.654189944134078, + "grad_norm": 1.752718210220337, + "learning_rate": 0.0008197478991596638, + "loss": 0.3744, + "step": 6541 + }, + { + "epoch": 3.654748603351955, + "grad_norm": 1.2247676849365234, + "learning_rate": 0.000819719887955182, + "loss": 0.6475, + "step": 6542 + }, + { + "epoch": 3.655307262569832, + "grad_norm": 0.6271907091140747, + "learning_rate": 0.0008196918767507004, + "loss": 0.437, + "step": 6543 + }, + { + "epoch": 3.6558659217877096, + "grad_norm": 0.39722007513046265, + "learning_rate": 0.0008196638655462186, + "loss": 0.4291, + "step": 6544 + }, + { + "epoch": 3.6564245810055866, + "grad_norm": 0.4425193667411804, + "learning_rate": 0.0008196358543417367, + "loss": 0.427, + "step": 6545 + }, + { + "epoch": 3.6569832402234637, + "grad_norm": 1.2784485816955566, + "learning_rate": 0.0008196078431372549, + "loss": 0.4491, + "step": 6546 + }, + { + "epoch": 3.6575418994413407, + "grad_norm": 0.5835462808609009, + "learning_rate": 0.0008195798319327731, + "loss": 0.4424, + "step": 6547 + }, + { + "epoch": 3.6581005586592177, + "grad_norm": 0.5545950531959534, + "learning_rate": 0.0008195518207282914, + "loss": 0.4382, + "step": 6548 + }, + { + "epoch": 3.658659217877095, + "grad_norm": 0.4416521191596985, + "learning_rate": 0.0008195238095238096, + "loss": 0.4208, + "step": 6549 + }, + { + "epoch": 3.659217877094972, + "grad_norm": 0.5419308543205261, + "learning_rate": 0.0008194957983193278, + "loss": 0.5201, + "step": 6550 + }, + { + "epoch": 3.659776536312849, + "grad_norm": 1.5149476528167725, + "learning_rate": 0.0008194677871148459, + "loss": 0.4185, + "step": 6551 + }, + { + "epoch": 3.660335195530726, + "grad_norm": 0.6551757454872131, + "learning_rate": 0.0008194397759103641, + "loss": 0.4841, + "step": 6552 + }, + { + "epoch": 3.660893854748603, + "grad_norm": 1.3892247676849365, + "learning_rate": 0.0008194117647058824, + "loss": 0.4641, + "step": 6553 + }, + { + "epoch": 3.6614525139664806, + "grad_norm": 0.5354766249656677, + "learning_rate": 0.0008193837535014006, + "loss": 0.4782, + "step": 6554 + }, + { + "epoch": 3.6620111731843576, + "grad_norm": 0.42245107889175415, + "learning_rate": 0.0008193557422969188, + "loss": 0.5282, + "step": 6555 + }, + { + "epoch": 3.6625698324022347, + "grad_norm": 0.483610600233078, + "learning_rate": 0.0008193277310924369, + "loss": 0.4826, + "step": 6556 + }, + { + "epoch": 3.6631284916201117, + "grad_norm": 0.4468355178833008, + "learning_rate": 0.0008192997198879551, + "loss": 0.4835, + "step": 6557 + }, + { + "epoch": 3.6636871508379887, + "grad_norm": 0.372497022151947, + "learning_rate": 0.0008192717086834734, + "loss": 0.408, + "step": 6558 + }, + { + "epoch": 3.664245810055866, + "grad_norm": 0.5888392329216003, + "learning_rate": 0.0008192436974789917, + "loss": 0.3686, + "step": 6559 + }, + { + "epoch": 3.664804469273743, + "grad_norm": 1.0408971309661865, + "learning_rate": 0.0008192156862745099, + "loss": 0.462, + "step": 6560 + }, + { + "epoch": 3.66536312849162, + "grad_norm": 0.5975467562675476, + "learning_rate": 0.000819187675070028, + "loss": 0.4531, + "step": 6561 + }, + { + "epoch": 3.665921787709497, + "grad_norm": 0.4743780791759491, + "learning_rate": 0.0008191596638655462, + "loss": 0.3791, + "step": 6562 + }, + { + "epoch": 3.666480446927374, + "grad_norm": 0.4812266528606415, + "learning_rate": 0.0008191316526610645, + "loss": 0.4072, + "step": 6563 + }, + { + "epoch": 3.6670391061452516, + "grad_norm": 0.39554843306541443, + "learning_rate": 0.0008191036414565827, + "loss": 0.3693, + "step": 6564 + }, + { + "epoch": 3.6675977653631286, + "grad_norm": 0.6460503339767456, + "learning_rate": 0.0008190756302521009, + "loss": 0.466, + "step": 6565 + }, + { + "epoch": 3.6681564245810057, + "grad_norm": 0.6518165469169617, + "learning_rate": 0.0008190476190476191, + "loss": 0.4814, + "step": 6566 + }, + { + "epoch": 3.6687150837988827, + "grad_norm": 0.6282866597175598, + "learning_rate": 0.0008190196078431372, + "loss": 0.685, + "step": 6567 + }, + { + "epoch": 3.6692737430167597, + "grad_norm": 0.5056010484695435, + "learning_rate": 0.0008189915966386555, + "loss": 0.4358, + "step": 6568 + }, + { + "epoch": 3.669832402234637, + "grad_norm": 0.5974522829055786, + "learning_rate": 0.0008189635854341737, + "loss": 0.489, + "step": 6569 + }, + { + "epoch": 3.6703910614525137, + "grad_norm": 0.3944227993488312, + "learning_rate": 0.0008189355742296919, + "loss": 0.4885, + "step": 6570 + }, + { + "epoch": 3.670949720670391, + "grad_norm": 0.9439200162887573, + "learning_rate": 0.0008189075630252101, + "loss": 0.3953, + "step": 6571 + }, + { + "epoch": 3.671508379888268, + "grad_norm": 0.6894012689590454, + "learning_rate": 0.0008188795518207282, + "loss": 0.5021, + "step": 6572 + }, + { + "epoch": 3.672067039106145, + "grad_norm": 0.5507104992866516, + "learning_rate": 0.0008188515406162465, + "loss": 0.4369, + "step": 6573 + }, + { + "epoch": 3.672625698324022, + "grad_norm": 0.3866426944732666, + "learning_rate": 0.0008188235294117647, + "loss": 0.4085, + "step": 6574 + }, + { + "epoch": 3.673184357541899, + "grad_norm": 0.5168094038963318, + "learning_rate": 0.000818795518207283, + "loss": 0.3898, + "step": 6575 + }, + { + "epoch": 3.6737430167597767, + "grad_norm": 0.44074076414108276, + "learning_rate": 0.0008187675070028012, + "loss": 0.3856, + "step": 6576 + }, + { + "epoch": 3.6743016759776537, + "grad_norm": 0.7955960631370544, + "learning_rate": 0.0008187394957983193, + "loss": 0.5206, + "step": 6577 + }, + { + "epoch": 3.6748603351955307, + "grad_norm": 0.5694375038146973, + "learning_rate": 0.0008187114845938376, + "loss": 0.5083, + "step": 6578 + }, + { + "epoch": 3.6754189944134077, + "grad_norm": 0.4771963059902191, + "learning_rate": 0.0008186834733893558, + "loss": 0.4795, + "step": 6579 + }, + { + "epoch": 3.6759776536312847, + "grad_norm": 0.7393046617507935, + "learning_rate": 0.000818655462184874, + "loss": 0.4858, + "step": 6580 + }, + { + "epoch": 3.676536312849162, + "grad_norm": 0.5868818163871765, + "learning_rate": 0.0008186274509803922, + "loss": 0.346, + "step": 6581 + }, + { + "epoch": 3.677094972067039, + "grad_norm": 0.5598104000091553, + "learning_rate": 0.0008185994397759104, + "loss": 0.4849, + "step": 6582 + }, + { + "epoch": 3.677653631284916, + "grad_norm": 0.7943028211593628, + "learning_rate": 0.0008185714285714286, + "loss": 0.4603, + "step": 6583 + }, + { + "epoch": 3.678212290502793, + "grad_norm": 0.5936654210090637, + "learning_rate": 0.0008185434173669468, + "loss": 0.3908, + "step": 6584 + }, + { + "epoch": 3.67877094972067, + "grad_norm": 0.585900068283081, + "learning_rate": 0.000818515406162465, + "loss": 0.4222, + "step": 6585 + }, + { + "epoch": 3.6793296089385477, + "grad_norm": 0.5086262822151184, + "learning_rate": 0.0008184873949579832, + "loss": 0.3499, + "step": 6586 + }, + { + "epoch": 3.6798882681564247, + "grad_norm": 0.4961419403553009, + "learning_rate": 0.0008184593837535014, + "loss": 0.4381, + "step": 6587 + }, + { + "epoch": 3.6804469273743017, + "grad_norm": 0.5073038935661316, + "learning_rate": 0.0008184313725490196, + "loss": 0.4638, + "step": 6588 + }, + { + "epoch": 3.6810055865921787, + "grad_norm": 0.5773311853408813, + "learning_rate": 0.0008184033613445378, + "loss": 0.4878, + "step": 6589 + }, + { + "epoch": 3.6815642458100557, + "grad_norm": 0.5299826264381409, + "learning_rate": 0.000818375350140056, + "loss": 0.4, + "step": 6590 + }, + { + "epoch": 3.682122905027933, + "grad_norm": 0.7297815084457397, + "learning_rate": 0.0008183473389355742, + "loss": 0.5699, + "step": 6591 + }, + { + "epoch": 3.68268156424581, + "grad_norm": 0.6074373126029968, + "learning_rate": 0.0008183193277310925, + "loss": 0.5335, + "step": 6592 + }, + { + "epoch": 3.683240223463687, + "grad_norm": 1.6804271936416626, + "learning_rate": 0.0008182913165266107, + "loss": 0.7456, + "step": 6593 + }, + { + "epoch": 3.683798882681564, + "grad_norm": 0.8368260860443115, + "learning_rate": 0.0008182633053221289, + "loss": 0.5148, + "step": 6594 + }, + { + "epoch": 3.684357541899441, + "grad_norm": 0.9733879566192627, + "learning_rate": 0.0008182352941176471, + "loss": 0.4015, + "step": 6595 + }, + { + "epoch": 3.6849162011173187, + "grad_norm": 1.1178513765335083, + "learning_rate": 0.0008182072829131653, + "loss": 0.6611, + "step": 6596 + }, + { + "epoch": 3.6854748603351957, + "grad_norm": 0.4010314643383026, + "learning_rate": 0.0008181792717086835, + "loss": 0.4945, + "step": 6597 + }, + { + "epoch": 3.6860335195530727, + "grad_norm": 0.7516027092933655, + "learning_rate": 0.0008181512605042018, + "loss": 0.4372, + "step": 6598 + }, + { + "epoch": 3.6865921787709497, + "grad_norm": 0.4921731650829315, + "learning_rate": 0.0008181232492997199, + "loss": 0.3972, + "step": 6599 + }, + { + "epoch": 3.6871508379888267, + "grad_norm": 0.6820230484008789, + "learning_rate": 0.0008180952380952381, + "loss": 0.3899, + "step": 6600 + }, + { + "epoch": 3.687709497206704, + "grad_norm": 0.5081499814987183, + "learning_rate": 0.0008180672268907563, + "loss": 0.4212, + "step": 6601 + }, + { + "epoch": 3.688268156424581, + "grad_norm": 0.7553133368492126, + "learning_rate": 0.0008180392156862745, + "loss": 0.4516, + "step": 6602 + }, + { + "epoch": 3.688826815642458, + "grad_norm": 0.7080279588699341, + "learning_rate": 0.0008180112044817928, + "loss": 0.4477, + "step": 6603 + }, + { + "epoch": 3.689385474860335, + "grad_norm": 0.6791130304336548, + "learning_rate": 0.0008179831932773109, + "loss": 0.5012, + "step": 6604 + }, + { + "epoch": 3.689944134078212, + "grad_norm": 0.42615175247192383, + "learning_rate": 0.0008179551820728291, + "loss": 0.5078, + "step": 6605 + }, + { + "epoch": 3.6905027932960897, + "grad_norm": 4.26744270324707, + "learning_rate": 0.0008179271708683473, + "loss": 0.4548, + "step": 6606 + }, + { + "epoch": 3.6910614525139662, + "grad_norm": 0.5060055255889893, + "learning_rate": 0.0008178991596638655, + "loss": 0.4112, + "step": 6607 + }, + { + "epoch": 3.6916201117318437, + "grad_norm": 0.4261416792869568, + "learning_rate": 0.0008178711484593839, + "loss": 0.4393, + "step": 6608 + }, + { + "epoch": 3.6921787709497207, + "grad_norm": 1.223616600036621, + "learning_rate": 0.000817843137254902, + "loss": 0.3641, + "step": 6609 + }, + { + "epoch": 3.6927374301675977, + "grad_norm": 0.5487021207809448, + "learning_rate": 0.0008178151260504202, + "loss": 0.5481, + "step": 6610 + }, + { + "epoch": 3.6932960893854747, + "grad_norm": 2.412358045578003, + "learning_rate": 0.0008177871148459384, + "loss": 0.5495, + "step": 6611 + }, + { + "epoch": 3.6938547486033517, + "grad_norm": 0.4571426808834076, + "learning_rate": 0.0008177591036414566, + "loss": 0.4205, + "step": 6612 + }, + { + "epoch": 3.694413407821229, + "grad_norm": 0.4575337767601013, + "learning_rate": 0.0008177310924369749, + "loss": 0.4602, + "step": 6613 + }, + { + "epoch": 3.694972067039106, + "grad_norm": 0.512572169303894, + "learning_rate": 0.0008177030812324931, + "loss": 0.5942, + "step": 6614 + }, + { + "epoch": 3.695530726256983, + "grad_norm": 0.45204320549964905, + "learning_rate": 0.0008176750700280112, + "loss": 0.423, + "step": 6615 + }, + { + "epoch": 3.69608938547486, + "grad_norm": 0.7094957828521729, + "learning_rate": 0.0008176470588235294, + "loss": 0.4425, + "step": 6616 + }, + { + "epoch": 3.6966480446927372, + "grad_norm": 0.54328852891922, + "learning_rate": 0.0008176190476190476, + "loss": 0.5917, + "step": 6617 + }, + { + "epoch": 3.6972067039106147, + "grad_norm": 0.699624240398407, + "learning_rate": 0.0008175910364145659, + "loss": 0.4372, + "step": 6618 + }, + { + "epoch": 3.6977653631284917, + "grad_norm": 0.6127368807792664, + "learning_rate": 0.0008175630252100841, + "loss": 0.5105, + "step": 6619 + }, + { + "epoch": 3.6983240223463687, + "grad_norm": 0.4927031099796295, + "learning_rate": 0.0008175350140056022, + "loss": 0.5134, + "step": 6620 + }, + { + "epoch": 3.6988826815642457, + "grad_norm": 0.7929823398590088, + "learning_rate": 0.0008175070028011204, + "loss": 0.4772, + "step": 6621 + }, + { + "epoch": 3.6994413407821227, + "grad_norm": 0.5992799401283264, + "learning_rate": 0.0008174789915966386, + "loss": 0.4889, + "step": 6622 + }, + { + "epoch": 3.7, + "grad_norm": 0.6249922513961792, + "learning_rate": 0.0008174509803921569, + "loss": 0.518, + "step": 6623 + }, + { + "epoch": 3.700558659217877, + "grad_norm": 0.6661367416381836, + "learning_rate": 0.0008174229691876752, + "loss": 0.4634, + "step": 6624 + }, + { + "epoch": 3.701117318435754, + "grad_norm": 2.2336106300354004, + "learning_rate": 0.0008173949579831932, + "loss": 0.4755, + "step": 6625 + }, + { + "epoch": 3.701675977653631, + "grad_norm": 1.5266122817993164, + "learning_rate": 0.0008173669467787115, + "loss": 0.6458, + "step": 6626 + }, + { + "epoch": 3.7022346368715082, + "grad_norm": 1.2884702682495117, + "learning_rate": 0.0008173389355742297, + "loss": 0.4475, + "step": 6627 + }, + { + "epoch": 3.7027932960893857, + "grad_norm": 0.8609706163406372, + "learning_rate": 0.000817310924369748, + "loss": 0.4115, + "step": 6628 + }, + { + "epoch": 3.7033519553072627, + "grad_norm": 0.5485642552375793, + "learning_rate": 0.0008172829131652662, + "loss": 0.4176, + "step": 6629 + }, + { + "epoch": 3.7039106145251397, + "grad_norm": 0.41080906987190247, + "learning_rate": 0.0008172549019607844, + "loss": 0.4145, + "step": 6630 + }, + { + "epoch": 3.7044692737430167, + "grad_norm": 0.49313127994537354, + "learning_rate": 0.0008172268907563025, + "loss": 0.4882, + "step": 6631 + }, + { + "epoch": 3.7050279329608937, + "grad_norm": 0.4483972489833832, + "learning_rate": 0.0008171988795518207, + "loss": 0.4773, + "step": 6632 + }, + { + "epoch": 3.705586592178771, + "grad_norm": 0.8375333547592163, + "learning_rate": 0.000817170868347339, + "loss": 0.6853, + "step": 6633 + }, + { + "epoch": 3.706145251396648, + "grad_norm": 0.4427073299884796, + "learning_rate": 0.0008171428571428572, + "loss": 0.4905, + "step": 6634 + }, + { + "epoch": 3.706703910614525, + "grad_norm": 0.44916847348213196, + "learning_rate": 0.0008171148459383754, + "loss": 0.4992, + "step": 6635 + }, + { + "epoch": 3.707262569832402, + "grad_norm": 0.5653901100158691, + "learning_rate": 0.0008170868347338935, + "loss": 0.5632, + "step": 6636 + }, + { + "epoch": 3.707821229050279, + "grad_norm": 0.582859456539154, + "learning_rate": 0.0008170588235294117, + "loss": 0.5395, + "step": 6637 + }, + { + "epoch": 3.7083798882681567, + "grad_norm": 1.1364970207214355, + "learning_rate": 0.00081703081232493, + "loss": 0.4551, + "step": 6638 + }, + { + "epoch": 3.7089385474860332, + "grad_norm": 0.4864155352115631, + "learning_rate": 0.0008170028011204482, + "loss": 0.468, + "step": 6639 + }, + { + "epoch": 3.7094972067039107, + "grad_norm": 0.46229758858680725, + "learning_rate": 0.0008169747899159664, + "loss": 0.479, + "step": 6640 + }, + { + "epoch": 3.7100558659217877, + "grad_norm": 0.6852522492408752, + "learning_rate": 0.0008169467787114845, + "loss": 0.4726, + "step": 6641 + }, + { + "epoch": 3.7106145251396647, + "grad_norm": 1.1568009853363037, + "learning_rate": 0.0008169187675070028, + "loss": 0.3804, + "step": 6642 + }, + { + "epoch": 3.711173184357542, + "grad_norm": 0.3943794369697571, + "learning_rate": 0.0008168907563025211, + "loss": 0.3449, + "step": 6643 + }, + { + "epoch": 3.7117318435754187, + "grad_norm": 0.5411002039909363, + "learning_rate": 0.0008168627450980393, + "loss": 0.4335, + "step": 6644 + }, + { + "epoch": 3.712290502793296, + "grad_norm": 0.7068082690238953, + "learning_rate": 0.0008168347338935575, + "loss": 0.4428, + "step": 6645 + }, + { + "epoch": 3.712849162011173, + "grad_norm": 1.0023226737976074, + "learning_rate": 0.0008168067226890757, + "loss": 0.667, + "step": 6646 + }, + { + "epoch": 3.71340782122905, + "grad_norm": 0.4348014295101166, + "learning_rate": 0.0008167787114845938, + "loss": 0.4338, + "step": 6647 + }, + { + "epoch": 3.7139664804469272, + "grad_norm": 0.3668944537639618, + "learning_rate": 0.0008167507002801121, + "loss": 0.4125, + "step": 6648 + }, + { + "epoch": 3.7145251396648042, + "grad_norm": 0.5998632311820984, + "learning_rate": 0.0008167226890756303, + "loss": 0.5072, + "step": 6649 + }, + { + "epoch": 3.7150837988826817, + "grad_norm": 0.44316115975379944, + "learning_rate": 0.0008166946778711485, + "loss": 0.4378, + "step": 6650 + }, + { + "epoch": 3.7156424581005587, + "grad_norm": 1.0044258832931519, + "learning_rate": 0.0008166666666666667, + "loss": 0.5984, + "step": 6651 + }, + { + "epoch": 3.7162011173184357, + "grad_norm": 0.4026990532875061, + "learning_rate": 0.0008166386554621848, + "loss": 0.515, + "step": 6652 + }, + { + "epoch": 3.7167597765363127, + "grad_norm": 0.5036965012550354, + "learning_rate": 0.0008166106442577031, + "loss": 0.4244, + "step": 6653 + }, + { + "epoch": 3.7173184357541897, + "grad_norm": 1.06436288356781, + "learning_rate": 0.0008165826330532213, + "loss": 0.4537, + "step": 6654 + }, + { + "epoch": 3.717877094972067, + "grad_norm": 1.7551262378692627, + "learning_rate": 0.0008165546218487395, + "loss": 0.5695, + "step": 6655 + }, + { + "epoch": 3.718435754189944, + "grad_norm": 0.41385704278945923, + "learning_rate": 0.0008165266106442577, + "loss": 0.4038, + "step": 6656 + }, + { + "epoch": 3.718994413407821, + "grad_norm": 0.5916339159011841, + "learning_rate": 0.0008164985994397758, + "loss": 0.4134, + "step": 6657 + }, + { + "epoch": 3.7195530726256982, + "grad_norm": 0.47273582220077515, + "learning_rate": 0.0008164705882352942, + "loss": 0.4651, + "step": 6658 + }, + { + "epoch": 3.7201117318435752, + "grad_norm": 0.7123088240623474, + "learning_rate": 0.0008164425770308124, + "loss": 0.5679, + "step": 6659 + }, + { + "epoch": 3.7206703910614527, + "grad_norm": 0.5929846167564392, + "learning_rate": 0.0008164145658263306, + "loss": 0.4358, + "step": 6660 + }, + { + "epoch": 3.7212290502793297, + "grad_norm": 1.381717324256897, + "learning_rate": 0.0008163865546218488, + "loss": 0.5122, + "step": 6661 + }, + { + "epoch": 3.7217877094972067, + "grad_norm": 2.166510820388794, + "learning_rate": 0.000816358543417367, + "loss": 0.6774, + "step": 6662 + }, + { + "epoch": 3.7223463687150837, + "grad_norm": 0.5627672672271729, + "learning_rate": 0.0008163305322128852, + "loss": 0.4467, + "step": 6663 + }, + { + "epoch": 3.7229050279329607, + "grad_norm": 0.5538860559463501, + "learning_rate": 0.0008163025210084034, + "loss": 0.5572, + "step": 6664 + }, + { + "epoch": 3.723463687150838, + "grad_norm": 0.5472835302352905, + "learning_rate": 0.0008162745098039216, + "loss": 0.4724, + "step": 6665 + }, + { + "epoch": 3.724022346368715, + "grad_norm": 0.6507546305656433, + "learning_rate": 0.0008162464985994398, + "loss": 0.3905, + "step": 6666 + }, + { + "epoch": 3.724581005586592, + "grad_norm": 1.0526360273361206, + "learning_rate": 0.000816218487394958, + "loss": 0.461, + "step": 6667 + }, + { + "epoch": 3.7251396648044692, + "grad_norm": 0.4964771270751953, + "learning_rate": 0.0008161904761904762, + "loss": 0.5815, + "step": 6668 + }, + { + "epoch": 3.7256983240223462, + "grad_norm": 0.6165770888328552, + "learning_rate": 0.0008161624649859944, + "loss": 0.5121, + "step": 6669 + }, + { + "epoch": 3.7262569832402237, + "grad_norm": 0.5066892504692078, + "learning_rate": 0.0008161344537815126, + "loss": 0.5568, + "step": 6670 + }, + { + "epoch": 3.7268156424581007, + "grad_norm": 0.475043922662735, + "learning_rate": 0.0008161064425770308, + "loss": 0.4911, + "step": 6671 + }, + { + "epoch": 3.7273743016759777, + "grad_norm": 3.152216911315918, + "learning_rate": 0.000816078431372549, + "loss": 0.3813, + "step": 6672 + }, + { + "epoch": 3.7279329608938547, + "grad_norm": 0.6873748302459717, + "learning_rate": 0.0008160504201680672, + "loss": 0.4272, + "step": 6673 + }, + { + "epoch": 3.7284916201117317, + "grad_norm": 0.6170068979263306, + "learning_rate": 0.0008160224089635855, + "loss": 0.5144, + "step": 6674 + }, + { + "epoch": 3.729050279329609, + "grad_norm": 0.4780423939228058, + "learning_rate": 0.0008159943977591037, + "loss": 0.4898, + "step": 6675 + }, + { + "epoch": 3.7296089385474858, + "grad_norm": 0.4953891932964325, + "learning_rate": 0.0008159663865546219, + "loss": 0.4198, + "step": 6676 + }, + { + "epoch": 3.730167597765363, + "grad_norm": 0.7887417078018188, + "learning_rate": 0.0008159383753501401, + "loss": 0.5373, + "step": 6677 + }, + { + "epoch": 3.7307262569832402, + "grad_norm": 0.36670422554016113, + "learning_rate": 0.0008159103641456584, + "loss": 0.3325, + "step": 6678 + }, + { + "epoch": 3.7312849162011172, + "grad_norm": 0.4578920304775238, + "learning_rate": 0.0008158823529411765, + "loss": 0.4092, + "step": 6679 + }, + { + "epoch": 3.7318435754189943, + "grad_norm": 0.5179921388626099, + "learning_rate": 0.0008158543417366947, + "loss": 0.4234, + "step": 6680 + }, + { + "epoch": 3.7324022346368713, + "grad_norm": 0.7124769687652588, + "learning_rate": 0.0008158263305322129, + "loss": 0.4455, + "step": 6681 + }, + { + "epoch": 3.7329608938547487, + "grad_norm": 0.549180269241333, + "learning_rate": 0.0008157983193277311, + "loss": 0.4463, + "step": 6682 + }, + { + "epoch": 3.7335195530726257, + "grad_norm": 0.6946071982383728, + "learning_rate": 0.0008157703081232494, + "loss": 0.5588, + "step": 6683 + }, + { + "epoch": 3.7340782122905027, + "grad_norm": 0.8743019700050354, + "learning_rate": 0.0008157422969187675, + "loss": 0.3841, + "step": 6684 + }, + { + "epoch": 3.7346368715083798, + "grad_norm": 0.4073009788990021, + "learning_rate": 0.0008157142857142857, + "loss": 0.3929, + "step": 6685 + }, + { + "epoch": 3.7351955307262568, + "grad_norm": 0.9444915056228638, + "learning_rate": 0.0008156862745098039, + "loss": 0.5225, + "step": 6686 + }, + { + "epoch": 3.735754189944134, + "grad_norm": 0.5588600635528564, + "learning_rate": 0.0008156582633053221, + "loss": 0.4839, + "step": 6687 + }, + { + "epoch": 3.7363128491620112, + "grad_norm": 2.7004554271698, + "learning_rate": 0.0008156302521008404, + "loss": 0.9044, + "step": 6688 + }, + { + "epoch": 3.7368715083798882, + "grad_norm": 0.42151328921318054, + "learning_rate": 0.0008156022408963585, + "loss": 0.3964, + "step": 6689 + }, + { + "epoch": 3.7374301675977653, + "grad_norm": 0.6013053059577942, + "learning_rate": 0.0008155742296918767, + "loss": 0.5085, + "step": 6690 + }, + { + "epoch": 3.7379888268156423, + "grad_norm": 4.067948341369629, + "learning_rate": 0.000815546218487395, + "loss": 0.4912, + "step": 6691 + }, + { + "epoch": 3.7385474860335197, + "grad_norm": 1.083478569984436, + "learning_rate": 0.0008155182072829132, + "loss": 0.4544, + "step": 6692 + }, + { + "epoch": 3.7391061452513967, + "grad_norm": 1.0198547840118408, + "learning_rate": 0.0008154901960784314, + "loss": 0.488, + "step": 6693 + }, + { + "epoch": 3.7396648044692737, + "grad_norm": 0.7450626492500305, + "learning_rate": 0.0008154621848739497, + "loss": 0.4072, + "step": 6694 + }, + { + "epoch": 3.7402234636871508, + "grad_norm": 0.5811417698860168, + "learning_rate": 0.0008154341736694678, + "loss": 0.4857, + "step": 6695 + }, + { + "epoch": 3.7407821229050278, + "grad_norm": 0.842680811882019, + "learning_rate": 0.000815406162464986, + "loss": 0.4494, + "step": 6696 + }, + { + "epoch": 3.741340782122905, + "grad_norm": 0.5124238729476929, + "learning_rate": 0.0008153781512605042, + "loss": 0.4462, + "step": 6697 + }, + { + "epoch": 3.7418994413407822, + "grad_norm": 2.8346331119537354, + "learning_rate": 0.0008153501400560224, + "loss": 0.6343, + "step": 6698 + }, + { + "epoch": 3.7424581005586592, + "grad_norm": 1.0235646963119507, + "learning_rate": 0.0008153221288515407, + "loss": 0.4705, + "step": 6699 + }, + { + "epoch": 3.7430167597765363, + "grad_norm": 0.7215641736984253, + "learning_rate": 0.0008152941176470588, + "loss": 0.6051, + "step": 6700 + }, + { + "epoch": 3.7435754189944133, + "grad_norm": 1.0025595426559448, + "learning_rate": 0.000815266106442577, + "loss": 0.435, + "step": 6701 + }, + { + "epoch": 3.7441340782122907, + "grad_norm": 0.43807944655418396, + "learning_rate": 0.0008152380952380952, + "loss": 0.4462, + "step": 6702 + }, + { + "epoch": 3.7446927374301677, + "grad_norm": 0.5602341294288635, + "learning_rate": 0.0008152100840336134, + "loss": 0.5095, + "step": 6703 + }, + { + "epoch": 3.7452513966480447, + "grad_norm": 0.6232250332832336, + "learning_rate": 0.0008151820728291317, + "loss": 0.5443, + "step": 6704 + }, + { + "epoch": 3.7458100558659218, + "grad_norm": 2.684713125228882, + "learning_rate": 0.0008151540616246498, + "loss": 0.4548, + "step": 6705 + }, + { + "epoch": 3.7463687150837988, + "grad_norm": 0.6109663248062134, + "learning_rate": 0.000815126050420168, + "loss": 0.4983, + "step": 6706 + }, + { + "epoch": 3.746927374301676, + "grad_norm": 0.5532881617546082, + "learning_rate": 0.0008150980392156862, + "loss": 0.4783, + "step": 6707 + }, + { + "epoch": 3.7474860335195532, + "grad_norm": 4.465295314788818, + "learning_rate": 0.0008150700280112045, + "loss": 0.5155, + "step": 6708 + }, + { + "epoch": 3.7480446927374302, + "grad_norm": 9.735363006591797, + "learning_rate": 0.0008150420168067228, + "loss": 0.3735, + "step": 6709 + }, + { + "epoch": 3.7486033519553073, + "grad_norm": 1.4678250551223755, + "learning_rate": 0.000815014005602241, + "loss": 0.4557, + "step": 6710 + }, + { + "epoch": 3.7491620111731843, + "grad_norm": 0.838957667350769, + "learning_rate": 0.0008149859943977591, + "loss": 0.4303, + "step": 6711 + }, + { + "epoch": 3.7497206703910617, + "grad_norm": 0.7221022844314575, + "learning_rate": 0.0008149579831932773, + "loss": 0.5343, + "step": 6712 + }, + { + "epoch": 3.7502793296089383, + "grad_norm": 0.6259539723396301, + "learning_rate": 0.0008149299719887955, + "loss": 0.5059, + "step": 6713 + }, + { + "epoch": 3.7508379888268157, + "grad_norm": 0.605409562587738, + "learning_rate": 0.0008149019607843138, + "loss": 0.5434, + "step": 6714 + }, + { + "epoch": 3.7513966480446927, + "grad_norm": 0.42584770917892456, + "learning_rate": 0.000814873949579832, + "loss": 0.3993, + "step": 6715 + }, + { + "epoch": 3.7519553072625698, + "grad_norm": 1.0011265277862549, + "learning_rate": 0.0008148459383753501, + "loss": 0.4591, + "step": 6716 + }, + { + "epoch": 3.7525139664804468, + "grad_norm": 0.740470290184021, + "learning_rate": 0.0008148179271708683, + "loss": 0.4431, + "step": 6717 + }, + { + "epoch": 3.753072625698324, + "grad_norm": 11.556285858154297, + "learning_rate": 0.0008147899159663865, + "loss": 0.4651, + "step": 6718 + }, + { + "epoch": 3.7536312849162012, + "grad_norm": 0.5221262574195862, + "learning_rate": 0.0008147619047619048, + "loss": 0.4661, + "step": 6719 + }, + { + "epoch": 3.7541899441340782, + "grad_norm": 0.6103655695915222, + "learning_rate": 0.000814733893557423, + "loss": 0.5202, + "step": 6720 + }, + { + "epoch": 3.7547486033519553, + "grad_norm": 0.817658007144928, + "learning_rate": 0.0008147058823529411, + "loss": 0.5072, + "step": 6721 + }, + { + "epoch": 3.7553072625698323, + "grad_norm": 0.695067822933197, + "learning_rate": 0.0008146778711484593, + "loss": 0.521, + "step": 6722 + }, + { + "epoch": 3.7558659217877093, + "grad_norm": 0.5560063123703003, + "learning_rate": 0.0008146498599439775, + "loss": 0.5336, + "step": 6723 + }, + { + "epoch": 3.7564245810055867, + "grad_norm": 3.9974701404571533, + "learning_rate": 0.0008146218487394959, + "loss": 0.5197, + "step": 6724 + }, + { + "epoch": 3.7569832402234637, + "grad_norm": 0.41313260793685913, + "learning_rate": 0.0008145938375350141, + "loss": 0.4011, + "step": 6725 + }, + { + "epoch": 3.7575418994413408, + "grad_norm": 1.449632167816162, + "learning_rate": 0.0008145658263305323, + "loss": 0.3802, + "step": 6726 + }, + { + "epoch": 3.7581005586592178, + "grad_norm": 0.9330488443374634, + "learning_rate": 0.0008145378151260504, + "loss": 0.7069, + "step": 6727 + }, + { + "epoch": 3.758659217877095, + "grad_norm": 1.2418686151504517, + "learning_rate": 0.0008145098039215686, + "loss": 0.4258, + "step": 6728 + }, + { + "epoch": 3.7592178770949722, + "grad_norm": 0.5921820998191833, + "learning_rate": 0.0008144817927170869, + "loss": 0.5241, + "step": 6729 + }, + { + "epoch": 3.7597765363128492, + "grad_norm": 0.5059346556663513, + "learning_rate": 0.0008144537815126051, + "loss": 0.4474, + "step": 6730 + }, + { + "epoch": 3.7603351955307263, + "grad_norm": 0.4856972098350525, + "learning_rate": 0.0008144257703081233, + "loss": 0.5256, + "step": 6731 + }, + { + "epoch": 3.7608938547486033, + "grad_norm": 0.9764010906219482, + "learning_rate": 0.0008143977591036414, + "loss": 0.4904, + "step": 6732 + }, + { + "epoch": 3.7614525139664803, + "grad_norm": 0.5025168061256409, + "learning_rate": 0.0008143697478991596, + "loss": 0.3938, + "step": 6733 + }, + { + "epoch": 3.7620111731843577, + "grad_norm": 0.758261501789093, + "learning_rate": 0.0008143417366946779, + "loss": 0.5647, + "step": 6734 + }, + { + "epoch": 3.7625698324022347, + "grad_norm": 0.9293955564498901, + "learning_rate": 0.0008143137254901961, + "loss": 0.5905, + "step": 6735 + }, + { + "epoch": 3.7631284916201118, + "grad_norm": 0.5527615547180176, + "learning_rate": 0.0008142857142857143, + "loss": 0.5036, + "step": 6736 + }, + { + "epoch": 3.7636871508379888, + "grad_norm": 0.9503520131111145, + "learning_rate": 0.0008142577030812324, + "loss": 0.5558, + "step": 6737 + }, + { + "epoch": 3.764245810055866, + "grad_norm": 0.6578249335289001, + "learning_rate": 0.0008142296918767506, + "loss": 0.4463, + "step": 6738 + }, + { + "epoch": 3.7648044692737432, + "grad_norm": 0.5259749293327332, + "learning_rate": 0.000814201680672269, + "loss": 0.4637, + "step": 6739 + }, + { + "epoch": 3.7653631284916202, + "grad_norm": 0.7044996023178101, + "learning_rate": 0.0008141736694677872, + "loss": 0.4722, + "step": 6740 + }, + { + "epoch": 3.7659217877094973, + "grad_norm": 0.5011742115020752, + "learning_rate": 0.0008141456582633054, + "loss": 0.482, + "step": 6741 + }, + { + "epoch": 3.7664804469273743, + "grad_norm": 0.4759159982204437, + "learning_rate": 0.0008141176470588236, + "loss": 0.3951, + "step": 6742 + }, + { + "epoch": 3.7670391061452513, + "grad_norm": 0.5874620079994202, + "learning_rate": 0.0008140896358543417, + "loss": 0.381, + "step": 6743 + }, + { + "epoch": 3.7675977653631287, + "grad_norm": 0.43097150325775146, + "learning_rate": 0.00081406162464986, + "loss": 0.4077, + "step": 6744 + }, + { + "epoch": 3.7681564245810057, + "grad_norm": 0.5463243126869202, + "learning_rate": 0.0008140336134453782, + "loss": 0.4109, + "step": 6745 + }, + { + "epoch": 3.7687150837988828, + "grad_norm": 0.48519882559776306, + "learning_rate": 0.0008140056022408964, + "loss": 0.4407, + "step": 6746 + }, + { + "epoch": 3.7692737430167598, + "grad_norm": 0.9570866227149963, + "learning_rate": 0.0008139775910364146, + "loss": 0.4123, + "step": 6747 + }, + { + "epoch": 3.769832402234637, + "grad_norm": 0.6733881235122681, + "learning_rate": 0.0008139495798319327, + "loss": 0.4993, + "step": 6748 + }, + { + "epoch": 3.7703910614525142, + "grad_norm": 0.414154052734375, + "learning_rate": 0.000813921568627451, + "loss": 0.4893, + "step": 6749 + }, + { + "epoch": 3.770949720670391, + "grad_norm": 0.5273184776306152, + "learning_rate": 0.0008138935574229692, + "loss": 0.532, + "step": 6750 + }, + { + "epoch": 3.7715083798882683, + "grad_norm": 0.5399999022483826, + "learning_rate": 0.0008138655462184874, + "loss": 0.4066, + "step": 6751 + }, + { + "epoch": 3.7720670391061453, + "grad_norm": 0.7483530640602112, + "learning_rate": 0.0008138375350140056, + "loss": 0.4364, + "step": 6752 + }, + { + "epoch": 3.7726256983240223, + "grad_norm": 1.0032182931900024, + "learning_rate": 0.0008138095238095237, + "loss": 0.4818, + "step": 6753 + }, + { + "epoch": 3.7731843575418993, + "grad_norm": 0.6334372758865356, + "learning_rate": 0.000813781512605042, + "loss": 0.4276, + "step": 6754 + }, + { + "epoch": 3.7737430167597763, + "grad_norm": 0.5629693269729614, + "learning_rate": 0.0008137535014005602, + "loss": 0.4405, + "step": 6755 + }, + { + "epoch": 3.7743016759776538, + "grad_norm": 0.8067587018013, + "learning_rate": 0.0008137254901960785, + "loss": 0.6695, + "step": 6756 + }, + { + "epoch": 3.7748603351955308, + "grad_norm": 0.4765067398548126, + "learning_rate": 0.0008136974789915967, + "loss": 0.4581, + "step": 6757 + }, + { + "epoch": 3.775418994413408, + "grad_norm": 0.3992355763912201, + "learning_rate": 0.0008136694677871149, + "loss": 0.3805, + "step": 6758 + }, + { + "epoch": 3.775977653631285, + "grad_norm": 0.8096543550491333, + "learning_rate": 0.0008136414565826331, + "loss": 0.3946, + "step": 6759 + }, + { + "epoch": 3.776536312849162, + "grad_norm": 0.6641272306442261, + "learning_rate": 0.0008136134453781513, + "loss": 0.4327, + "step": 6760 + }, + { + "epoch": 3.7770949720670393, + "grad_norm": 0.7377946972846985, + "learning_rate": 0.0008135854341736695, + "loss": 0.3635, + "step": 6761 + }, + { + "epoch": 3.7776536312849163, + "grad_norm": 0.49328842759132385, + "learning_rate": 0.0008135574229691877, + "loss": 0.4646, + "step": 6762 + }, + { + "epoch": 3.7782122905027933, + "grad_norm": 0.46599531173706055, + "learning_rate": 0.0008135294117647059, + "loss": 0.445, + "step": 6763 + }, + { + "epoch": 3.7787709497206703, + "grad_norm": 0.7711352109909058, + "learning_rate": 0.0008135014005602241, + "loss": 0.4761, + "step": 6764 + }, + { + "epoch": 3.7793296089385473, + "grad_norm": 1.0588428974151611, + "learning_rate": 0.0008134733893557423, + "loss": 0.4925, + "step": 6765 + }, + { + "epoch": 3.7798882681564248, + "grad_norm": 0.9532281756401062, + "learning_rate": 0.0008134453781512605, + "loss": 0.5696, + "step": 6766 + }, + { + "epoch": 3.7804469273743018, + "grad_norm": 0.5671836733818054, + "learning_rate": 0.0008134173669467787, + "loss": 0.4673, + "step": 6767 + }, + { + "epoch": 3.781005586592179, + "grad_norm": 1.4399776458740234, + "learning_rate": 0.0008133893557422969, + "loss": 0.5052, + "step": 6768 + }, + { + "epoch": 3.781564245810056, + "grad_norm": 0.5638650059700012, + "learning_rate": 0.0008133613445378152, + "loss": 0.5438, + "step": 6769 + }, + { + "epoch": 3.782122905027933, + "grad_norm": 0.5349329710006714, + "learning_rate": 0.0008133333333333333, + "loss": 0.4396, + "step": 6770 + }, + { + "epoch": 3.7826815642458103, + "grad_norm": 0.5024678111076355, + "learning_rate": 0.0008133053221288515, + "loss": 0.4636, + "step": 6771 + }, + { + "epoch": 3.7832402234636873, + "grad_norm": 0.6195822358131409, + "learning_rate": 0.0008132773109243697, + "loss": 0.3973, + "step": 6772 + }, + { + "epoch": 3.7837988826815643, + "grad_norm": 0.6856474280357361, + "learning_rate": 0.000813249299719888, + "loss": 0.4497, + "step": 6773 + }, + { + "epoch": 3.7843575418994413, + "grad_norm": 1.4107484817504883, + "learning_rate": 0.0008132212885154063, + "loss": 0.4807, + "step": 6774 + }, + { + "epoch": 3.7849162011173183, + "grad_norm": 0.4474133253097534, + "learning_rate": 0.0008131932773109244, + "loss": 0.4605, + "step": 6775 + }, + { + "epoch": 3.7854748603351958, + "grad_norm": 0.6936215162277222, + "learning_rate": 0.0008131652661064426, + "loss": 0.5026, + "step": 6776 + }, + { + "epoch": 3.7860335195530728, + "grad_norm": 0.611108660697937, + "learning_rate": 0.0008131372549019608, + "loss": 0.5067, + "step": 6777 + }, + { + "epoch": 3.78659217877095, + "grad_norm": 0.5875313878059387, + "learning_rate": 0.000813109243697479, + "loss": 0.4401, + "step": 6778 + }, + { + "epoch": 3.787150837988827, + "grad_norm": 0.5357989072799683, + "learning_rate": 0.0008130812324929973, + "loss": 0.4953, + "step": 6779 + }, + { + "epoch": 3.787709497206704, + "grad_norm": 0.539035975933075, + "learning_rate": 0.0008130532212885154, + "loss": 0.4428, + "step": 6780 + }, + { + "epoch": 3.7882681564245813, + "grad_norm": 0.5645351409912109, + "learning_rate": 0.0008130252100840336, + "loss": 0.5017, + "step": 6781 + }, + { + "epoch": 3.788826815642458, + "grad_norm": 0.4674677550792694, + "learning_rate": 0.0008129971988795518, + "loss": 0.4784, + "step": 6782 + }, + { + "epoch": 3.7893854748603353, + "grad_norm": 0.5537141561508179, + "learning_rate": 0.00081296918767507, + "loss": 0.4556, + "step": 6783 + }, + { + "epoch": 3.7899441340782123, + "grad_norm": 0.6152671575546265, + "learning_rate": 0.0008129411764705883, + "loss": 0.5349, + "step": 6784 + }, + { + "epoch": 3.7905027932960893, + "grad_norm": 2.1846680641174316, + "learning_rate": 0.0008129131652661065, + "loss": 0.5873, + "step": 6785 + }, + { + "epoch": 3.7910614525139668, + "grad_norm": 1.2025355100631714, + "learning_rate": 0.0008128851540616246, + "loss": 0.4106, + "step": 6786 + }, + { + "epoch": 3.7916201117318433, + "grad_norm": 0.44122374057769775, + "learning_rate": 0.0008128571428571428, + "loss": 0.4116, + "step": 6787 + }, + { + "epoch": 3.792178770949721, + "grad_norm": 0.578125536441803, + "learning_rate": 0.000812829131652661, + "loss": 0.4374, + "step": 6788 + }, + { + "epoch": 3.792737430167598, + "grad_norm": 0.5104628801345825, + "learning_rate": 0.0008128011204481794, + "loss": 0.4366, + "step": 6789 + }, + { + "epoch": 3.793296089385475, + "grad_norm": 0.566911518573761, + "learning_rate": 0.0008127731092436976, + "loss": 0.465, + "step": 6790 + }, + { + "epoch": 3.793854748603352, + "grad_norm": 0.727301836013794, + "learning_rate": 0.0008127450980392157, + "loss": 0.4883, + "step": 6791 + }, + { + "epoch": 3.794413407821229, + "grad_norm": 0.5929538607597351, + "learning_rate": 0.0008127170868347339, + "loss": 0.4769, + "step": 6792 + }, + { + "epoch": 3.7949720670391063, + "grad_norm": 0.6852974891662598, + "learning_rate": 0.0008126890756302521, + "loss": 0.3843, + "step": 6793 + }, + { + "epoch": 3.7955307262569833, + "grad_norm": 4.200253486633301, + "learning_rate": 0.0008126610644257704, + "loss": 0.4536, + "step": 6794 + }, + { + "epoch": 3.7960893854748603, + "grad_norm": 0.9143809080123901, + "learning_rate": 0.0008126330532212886, + "loss": 0.5735, + "step": 6795 + }, + { + "epoch": 3.7966480446927373, + "grad_norm": 0.7114330530166626, + "learning_rate": 0.0008126050420168067, + "loss": 0.4702, + "step": 6796 + }, + { + "epoch": 3.7972067039106143, + "grad_norm": 1.4647160768508911, + "learning_rate": 0.0008125770308123249, + "loss": 0.4906, + "step": 6797 + }, + { + "epoch": 3.7977653631284918, + "grad_norm": 0.730023205280304, + "learning_rate": 0.0008125490196078431, + "loss": 0.3852, + "step": 6798 + }, + { + "epoch": 3.798324022346369, + "grad_norm": 0.44529619812965393, + "learning_rate": 0.0008125210084033614, + "loss": 0.3604, + "step": 6799 + }, + { + "epoch": 3.798882681564246, + "grad_norm": 0.5767450332641602, + "learning_rate": 0.0008124929971988796, + "loss": 0.5622, + "step": 6800 + }, + { + "epoch": 3.799441340782123, + "grad_norm": 1.0275883674621582, + "learning_rate": 0.0008124649859943978, + "loss": 0.559, + "step": 6801 + }, + { + "epoch": 3.8, + "grad_norm": 1.4503183364868164, + "learning_rate": 0.0008124369747899159, + "loss": 0.5087, + "step": 6802 + }, + { + "epoch": 3.8005586592178773, + "grad_norm": 0.6960155963897705, + "learning_rate": 0.0008124089635854341, + "loss": 0.5502, + "step": 6803 + }, + { + "epoch": 3.8011173184357543, + "grad_norm": 0.5598259568214417, + "learning_rate": 0.0008123809523809524, + "loss": 0.4339, + "step": 6804 + }, + { + "epoch": 3.8016759776536313, + "grad_norm": 0.4804055094718933, + "learning_rate": 0.0008123529411764707, + "loss": 0.4475, + "step": 6805 + }, + { + "epoch": 3.8022346368715083, + "grad_norm": 0.4204331040382385, + "learning_rate": 0.0008123249299719889, + "loss": 0.3992, + "step": 6806 + }, + { + "epoch": 3.8027932960893853, + "grad_norm": 0.6922451853752136, + "learning_rate": 0.000812296918767507, + "loss": 0.4941, + "step": 6807 + }, + { + "epoch": 3.8033519553072628, + "grad_norm": 1.050072431564331, + "learning_rate": 0.0008122689075630252, + "loss": 0.5497, + "step": 6808 + }, + { + "epoch": 3.80391061452514, + "grad_norm": 0.7848671078681946, + "learning_rate": 0.0008122408963585435, + "loss": 0.4701, + "step": 6809 + }, + { + "epoch": 3.804469273743017, + "grad_norm": 0.5557612776756287, + "learning_rate": 0.0008122128851540617, + "loss": 0.49, + "step": 6810 + }, + { + "epoch": 3.805027932960894, + "grad_norm": 0.474393367767334, + "learning_rate": 0.0008121848739495799, + "loss": 0.4935, + "step": 6811 + }, + { + "epoch": 3.805586592178771, + "grad_norm": 0.5907094478607178, + "learning_rate": 0.000812156862745098, + "loss": 0.4478, + "step": 6812 + }, + { + "epoch": 3.8061452513966483, + "grad_norm": 0.4820985794067383, + "learning_rate": 0.0008121288515406162, + "loss": 0.4665, + "step": 6813 + }, + { + "epoch": 3.8067039106145253, + "grad_norm": 0.4366481900215149, + "learning_rate": 0.0008121008403361345, + "loss": 0.4316, + "step": 6814 + }, + { + "epoch": 3.8072625698324023, + "grad_norm": 0.5917373299598694, + "learning_rate": 0.0008120728291316527, + "loss": 0.4873, + "step": 6815 + }, + { + "epoch": 3.8078212290502793, + "grad_norm": 4.734868049621582, + "learning_rate": 0.0008120448179271709, + "loss": 0.4826, + "step": 6816 + }, + { + "epoch": 3.8083798882681563, + "grad_norm": 1.078062653541565, + "learning_rate": 0.0008120168067226891, + "loss": 0.4775, + "step": 6817 + }, + { + "epoch": 3.8089385474860338, + "grad_norm": 0.5761260986328125, + "learning_rate": 0.0008119887955182072, + "loss": 0.5683, + "step": 6818 + }, + { + "epoch": 3.8094972067039103, + "grad_norm": 6.666396617889404, + "learning_rate": 0.0008119607843137255, + "loss": 0.4769, + "step": 6819 + }, + { + "epoch": 3.810055865921788, + "grad_norm": 0.5542675852775574, + "learning_rate": 0.0008119327731092437, + "loss": 0.4529, + "step": 6820 + }, + { + "epoch": 3.810614525139665, + "grad_norm": 0.606116533279419, + "learning_rate": 0.000811904761904762, + "loss": 0.5309, + "step": 6821 + }, + { + "epoch": 3.811173184357542, + "grad_norm": 0.7645151019096375, + "learning_rate": 0.0008118767507002802, + "loss": 0.393, + "step": 6822 + }, + { + "epoch": 3.811731843575419, + "grad_norm": 0.4758078157901764, + "learning_rate": 0.0008118487394957983, + "loss": 0.4076, + "step": 6823 + }, + { + "epoch": 3.812290502793296, + "grad_norm": 0.7427760362625122, + "learning_rate": 0.0008118207282913166, + "loss": 0.517, + "step": 6824 + }, + { + "epoch": 3.8128491620111733, + "grad_norm": 0.6136566996574402, + "learning_rate": 0.0008117927170868348, + "loss": 0.419, + "step": 6825 + }, + { + "epoch": 3.8134078212290503, + "grad_norm": 1.243286371231079, + "learning_rate": 0.000811764705882353, + "loss": 0.4208, + "step": 6826 + }, + { + "epoch": 3.8139664804469273, + "grad_norm": 0.7931225895881653, + "learning_rate": 0.0008117366946778712, + "loss": 0.4574, + "step": 6827 + }, + { + "epoch": 3.8145251396648043, + "grad_norm": 0.4445907473564148, + "learning_rate": 0.0008117086834733893, + "loss": 0.3936, + "step": 6828 + }, + { + "epoch": 3.8150837988826813, + "grad_norm": 0.4751304090023041, + "learning_rate": 0.0008116806722689076, + "loss": 0.4431, + "step": 6829 + }, + { + "epoch": 3.815642458100559, + "grad_norm": 0.7380807399749756, + "learning_rate": 0.0008116526610644258, + "loss": 0.5626, + "step": 6830 + }, + { + "epoch": 3.816201117318436, + "grad_norm": 0.6071987152099609, + "learning_rate": 0.000811624649859944, + "loss": 0.5018, + "step": 6831 + }, + { + "epoch": 3.816759776536313, + "grad_norm": 0.6682330965995789, + "learning_rate": 0.0008115966386554622, + "loss": 0.559, + "step": 6832 + }, + { + "epoch": 3.81731843575419, + "grad_norm": 0.5862687230110168, + "learning_rate": 0.0008115686274509804, + "loss": 0.3786, + "step": 6833 + }, + { + "epoch": 3.817877094972067, + "grad_norm": 3.3499162197113037, + "learning_rate": 0.0008115406162464986, + "loss": 0.4302, + "step": 6834 + }, + { + "epoch": 3.8184357541899443, + "grad_norm": 0.5834749341011047, + "learning_rate": 0.0008115126050420168, + "loss": 0.4131, + "step": 6835 + }, + { + "epoch": 3.8189944134078213, + "grad_norm": 0.7662965059280396, + "learning_rate": 0.000811484593837535, + "loss": 0.4605, + "step": 6836 + }, + { + "epoch": 3.8195530726256983, + "grad_norm": 1.037217140197754, + "learning_rate": 0.0008114565826330532, + "loss": 0.4916, + "step": 6837 + }, + { + "epoch": 3.8201117318435753, + "grad_norm": 0.5889070630073547, + "learning_rate": 0.0008114285714285715, + "loss": 0.4462, + "step": 6838 + }, + { + "epoch": 3.8206703910614523, + "grad_norm": 1.1865012645721436, + "learning_rate": 0.0008114005602240897, + "loss": 0.4037, + "step": 6839 + }, + { + "epoch": 3.82122905027933, + "grad_norm": 0.7193288803100586, + "learning_rate": 0.0008113725490196079, + "loss": 0.5748, + "step": 6840 + }, + { + "epoch": 3.821787709497207, + "grad_norm": 0.8877421617507935, + "learning_rate": 0.0008113445378151261, + "loss": 0.4212, + "step": 6841 + }, + { + "epoch": 3.822346368715084, + "grad_norm": 0.7898842692375183, + "learning_rate": 0.0008113165266106443, + "loss": 0.4114, + "step": 6842 + }, + { + "epoch": 3.822905027932961, + "grad_norm": 0.6861692070960999, + "learning_rate": 0.0008112885154061625, + "loss": 0.5395, + "step": 6843 + }, + { + "epoch": 3.823463687150838, + "grad_norm": 0.44756853580474854, + "learning_rate": 0.0008112605042016807, + "loss": 0.2973, + "step": 6844 + }, + { + "epoch": 3.8240223463687153, + "grad_norm": 0.8027526140213013, + "learning_rate": 0.0008112324929971989, + "loss": 0.6062, + "step": 6845 + }, + { + "epoch": 3.8245810055865923, + "grad_norm": 0.5482564568519592, + "learning_rate": 0.0008112044817927171, + "loss": 0.5726, + "step": 6846 + }, + { + "epoch": 3.8251396648044693, + "grad_norm": 1.787686824798584, + "learning_rate": 0.0008111764705882353, + "loss": 0.4873, + "step": 6847 + }, + { + "epoch": 3.8256983240223463, + "grad_norm": 0.5472537279129028, + "learning_rate": 0.0008111484593837535, + "loss": 0.4862, + "step": 6848 + }, + { + "epoch": 3.8262569832402233, + "grad_norm": 20.877382278442383, + "learning_rate": 0.0008111204481792718, + "loss": 0.4342, + "step": 6849 + }, + { + "epoch": 3.826815642458101, + "grad_norm": 0.8070502281188965, + "learning_rate": 0.0008110924369747899, + "loss": 0.5737, + "step": 6850 + }, + { + "epoch": 3.827374301675978, + "grad_norm": 0.6082700490951538, + "learning_rate": 0.0008110644257703081, + "loss": 0.3914, + "step": 6851 + }, + { + "epoch": 3.827932960893855, + "grad_norm": 0.7312552332878113, + "learning_rate": 0.0008110364145658263, + "loss": 0.4696, + "step": 6852 + }, + { + "epoch": 3.828491620111732, + "grad_norm": 2.196409225463867, + "learning_rate": 0.0008110084033613445, + "loss": 0.4363, + "step": 6853 + }, + { + "epoch": 3.829050279329609, + "grad_norm": 0.8176388740539551, + "learning_rate": 0.0008109803921568629, + "loss": 0.4573, + "step": 6854 + }, + { + "epoch": 3.8296089385474863, + "grad_norm": 0.9460632801055908, + "learning_rate": 0.000810952380952381, + "loss": 0.4638, + "step": 6855 + }, + { + "epoch": 3.830167597765363, + "grad_norm": 2.6144094467163086, + "learning_rate": 0.0008109243697478992, + "loss": 0.5105, + "step": 6856 + }, + { + "epoch": 3.8307262569832403, + "grad_norm": 0.5067965388298035, + "learning_rate": 0.0008108963585434174, + "loss": 0.4402, + "step": 6857 + }, + { + "epoch": 3.8312849162011173, + "grad_norm": 0.5917914509773254, + "learning_rate": 0.0008108683473389356, + "loss": 0.4318, + "step": 6858 + }, + { + "epoch": 3.8318435754189943, + "grad_norm": 0.7330923080444336, + "learning_rate": 0.0008108403361344539, + "loss": 0.4405, + "step": 6859 + }, + { + "epoch": 3.8324022346368714, + "grad_norm": 0.6495806574821472, + "learning_rate": 0.000810812324929972, + "loss": 0.47, + "step": 6860 + }, + { + "epoch": 3.8329608938547484, + "grad_norm": 0.7199528217315674, + "learning_rate": 0.0008107843137254902, + "loss": 0.497, + "step": 6861 + }, + { + "epoch": 3.833519553072626, + "grad_norm": 0.4221893846988678, + "learning_rate": 0.0008107563025210084, + "loss": 0.4337, + "step": 6862 + }, + { + "epoch": 3.834078212290503, + "grad_norm": 0.5389212369918823, + "learning_rate": 0.0008107282913165266, + "loss": 0.5485, + "step": 6863 + }, + { + "epoch": 3.83463687150838, + "grad_norm": 0.9004932045936584, + "learning_rate": 0.0008107002801120449, + "loss": 0.5197, + "step": 6864 + }, + { + "epoch": 3.835195530726257, + "grad_norm": 0.41269150376319885, + "learning_rate": 0.0008106722689075631, + "loss": 0.462, + "step": 6865 + }, + { + "epoch": 3.835754189944134, + "grad_norm": 0.5771477818489075, + "learning_rate": 0.0008106442577030812, + "loss": 0.3544, + "step": 6866 + }, + { + "epoch": 3.8363128491620113, + "grad_norm": 3.602389335632324, + "learning_rate": 0.0008106162464985994, + "loss": 0.4453, + "step": 6867 + }, + { + "epoch": 3.8368715083798883, + "grad_norm": 0.49727004766464233, + "learning_rate": 0.0008105882352941176, + "loss": 0.4801, + "step": 6868 + }, + { + "epoch": 3.8374301675977653, + "grad_norm": 21.37616729736328, + "learning_rate": 0.0008105602240896359, + "loss": 0.5378, + "step": 6869 + }, + { + "epoch": 3.8379888268156424, + "grad_norm": 1.1396398544311523, + "learning_rate": 0.0008105322128851542, + "loss": 0.5094, + "step": 6870 + }, + { + "epoch": 3.8385474860335194, + "grad_norm": 1.5971423387527466, + "learning_rate": 0.0008105042016806722, + "loss": 0.4703, + "step": 6871 + }, + { + "epoch": 3.839106145251397, + "grad_norm": 0.5216647982597351, + "learning_rate": 0.0008104761904761905, + "loss": 0.4093, + "step": 6872 + }, + { + "epoch": 3.839664804469274, + "grad_norm": 0.5850594639778137, + "learning_rate": 0.0008104481792717087, + "loss": 0.5847, + "step": 6873 + }, + { + "epoch": 3.840223463687151, + "grad_norm": 0.5579874515533447, + "learning_rate": 0.000810420168067227, + "loss": 0.4894, + "step": 6874 + }, + { + "epoch": 3.840782122905028, + "grad_norm": 1.196791410446167, + "learning_rate": 0.0008103921568627452, + "loss": 0.5488, + "step": 6875 + }, + { + "epoch": 3.841340782122905, + "grad_norm": 0.92503422498703, + "learning_rate": 0.0008103641456582633, + "loss": 0.6328, + "step": 6876 + }, + { + "epoch": 3.8418994413407823, + "grad_norm": 0.5861455202102661, + "learning_rate": 0.0008103361344537815, + "loss": 0.4211, + "step": 6877 + }, + { + "epoch": 3.8424581005586593, + "grad_norm": 0.645728588104248, + "learning_rate": 0.0008103081232492997, + "loss": 0.5812, + "step": 6878 + }, + { + "epoch": 3.8430167597765363, + "grad_norm": 0.6964361667633057, + "learning_rate": 0.000810280112044818, + "loss": 0.4298, + "step": 6879 + }, + { + "epoch": 3.8435754189944134, + "grad_norm": 3.3224074840545654, + "learning_rate": 0.0008102521008403362, + "loss": 0.364, + "step": 6880 + }, + { + "epoch": 3.8441340782122904, + "grad_norm": 0.6650158762931824, + "learning_rate": 0.0008102240896358544, + "loss": 0.52, + "step": 6881 + }, + { + "epoch": 3.844692737430168, + "grad_norm": 0.6205301284790039, + "learning_rate": 0.0008101960784313725, + "loss": 0.4434, + "step": 6882 + }, + { + "epoch": 3.845251396648045, + "grad_norm": 1.7254462242126465, + "learning_rate": 0.0008101680672268907, + "loss": 0.5268, + "step": 6883 + }, + { + "epoch": 3.845810055865922, + "grad_norm": 0.6671683192253113, + "learning_rate": 0.000810140056022409, + "loss": 0.4726, + "step": 6884 + }, + { + "epoch": 3.846368715083799, + "grad_norm": 0.4430246353149414, + "learning_rate": 0.0008101120448179272, + "loss": 0.3557, + "step": 6885 + }, + { + "epoch": 3.846927374301676, + "grad_norm": 0.43659278750419617, + "learning_rate": 0.0008100840336134454, + "loss": 0.4207, + "step": 6886 + }, + { + "epoch": 3.8474860335195533, + "grad_norm": 0.9084954857826233, + "learning_rate": 0.0008100560224089635, + "loss": 0.6475, + "step": 6887 + }, + { + "epoch": 3.8480446927374303, + "grad_norm": 0.4397558569908142, + "learning_rate": 0.0008100280112044818, + "loss": 0.4208, + "step": 6888 + }, + { + "epoch": 3.8486033519553073, + "grad_norm": 0.5738127827644348, + "learning_rate": 0.0008100000000000001, + "loss": 0.5228, + "step": 6889 + }, + { + "epoch": 3.8491620111731844, + "grad_norm": 1.7955875396728516, + "learning_rate": 0.0008099719887955183, + "loss": 0.3892, + "step": 6890 + }, + { + "epoch": 3.8497206703910614, + "grad_norm": 0.47176024317741394, + "learning_rate": 0.0008099439775910365, + "loss": 0.4342, + "step": 6891 + }, + { + "epoch": 3.850279329608939, + "grad_norm": 0.6593926548957825, + "learning_rate": 0.0008099159663865546, + "loss": 0.4822, + "step": 6892 + }, + { + "epoch": 3.8508379888268154, + "grad_norm": 0.6891830563545227, + "learning_rate": 0.0008098879551820728, + "loss": 0.4241, + "step": 6893 + }, + { + "epoch": 3.851396648044693, + "grad_norm": 0.7264090180397034, + "learning_rate": 0.0008098599439775911, + "loss": 0.395, + "step": 6894 + }, + { + "epoch": 3.85195530726257, + "grad_norm": 0.6550943851470947, + "learning_rate": 0.0008098319327731093, + "loss": 0.4539, + "step": 6895 + }, + { + "epoch": 3.852513966480447, + "grad_norm": 3.825160503387451, + "learning_rate": 0.0008098039215686275, + "loss": 0.5699, + "step": 6896 + }, + { + "epoch": 3.853072625698324, + "grad_norm": 0.8306968808174133, + "learning_rate": 0.0008097759103641457, + "loss": 0.4448, + "step": 6897 + }, + { + "epoch": 3.853631284916201, + "grad_norm": 1.1376948356628418, + "learning_rate": 0.0008097478991596638, + "loss": 0.4899, + "step": 6898 + }, + { + "epoch": 3.8541899441340783, + "grad_norm": 0.5389619469642639, + "learning_rate": 0.0008097198879551821, + "loss": 0.4325, + "step": 6899 + }, + { + "epoch": 3.8547486033519553, + "grad_norm": 0.5750625133514404, + "learning_rate": 0.0008096918767507003, + "loss": 0.4977, + "step": 6900 + }, + { + "epoch": 3.8553072625698324, + "grad_norm": 0.5885065197944641, + "learning_rate": 0.0008096638655462185, + "loss": 0.4904, + "step": 6901 + }, + { + "epoch": 3.8558659217877094, + "grad_norm": 0.4275752902030945, + "learning_rate": 0.0008096358543417367, + "loss": 0.3351, + "step": 6902 + }, + { + "epoch": 3.8564245810055864, + "grad_norm": 0.6802307367324829, + "learning_rate": 0.0008096078431372548, + "loss": 0.6557, + "step": 6903 + }, + { + "epoch": 3.856983240223464, + "grad_norm": 1.1220121383666992, + "learning_rate": 0.0008095798319327732, + "loss": 0.5859, + "step": 6904 + }, + { + "epoch": 3.857541899441341, + "grad_norm": 0.5158030986785889, + "learning_rate": 0.0008095518207282914, + "loss": 0.3694, + "step": 6905 + }, + { + "epoch": 3.858100558659218, + "grad_norm": 3.132863998413086, + "learning_rate": 0.0008095238095238096, + "loss": 0.4492, + "step": 6906 + }, + { + "epoch": 3.858659217877095, + "grad_norm": 0.5213937759399414, + "learning_rate": 0.0008094957983193278, + "loss": 0.5282, + "step": 6907 + }, + { + "epoch": 3.859217877094972, + "grad_norm": 0.4709155857563019, + "learning_rate": 0.0008094677871148459, + "loss": 0.456, + "step": 6908 + }, + { + "epoch": 3.8597765363128493, + "grad_norm": 0.4908982813358307, + "learning_rate": 0.0008094397759103642, + "loss": 0.4422, + "step": 6909 + }, + { + "epoch": 3.8603351955307263, + "grad_norm": 6.982736110687256, + "learning_rate": 0.0008094117647058824, + "loss": 0.5534, + "step": 6910 + }, + { + "epoch": 3.8608938547486034, + "grad_norm": 0.5150156617164612, + "learning_rate": 0.0008093837535014006, + "loss": 0.3855, + "step": 6911 + }, + { + "epoch": 3.8614525139664804, + "grad_norm": 0.685828447341919, + "learning_rate": 0.0008093557422969188, + "loss": 0.5353, + "step": 6912 + }, + { + "epoch": 3.8620111731843574, + "grad_norm": 0.529667317867279, + "learning_rate": 0.000809327731092437, + "loss": 0.4746, + "step": 6913 + }, + { + "epoch": 3.862569832402235, + "grad_norm": 0.7907227277755737, + "learning_rate": 0.0008092997198879551, + "loss": 0.5419, + "step": 6914 + }, + { + "epoch": 3.863128491620112, + "grad_norm": 0.5212898254394531, + "learning_rate": 0.0008092717086834734, + "loss": 0.4079, + "step": 6915 + }, + { + "epoch": 3.863687150837989, + "grad_norm": 0.8711379766464233, + "learning_rate": 0.0008092436974789916, + "loss": 0.6348, + "step": 6916 + }, + { + "epoch": 3.864245810055866, + "grad_norm": 0.7282832264900208, + "learning_rate": 0.0008092156862745098, + "loss": 0.5151, + "step": 6917 + }, + { + "epoch": 3.864804469273743, + "grad_norm": 0.6962943077087402, + "learning_rate": 0.000809187675070028, + "loss": 0.5445, + "step": 6918 + }, + { + "epoch": 3.8653631284916203, + "grad_norm": 0.5087282657623291, + "learning_rate": 0.0008091596638655461, + "loss": 0.4424, + "step": 6919 + }, + { + "epoch": 3.8659217877094973, + "grad_norm": 0.6248132586479187, + "learning_rate": 0.0008091316526610645, + "loss": 0.4197, + "step": 6920 + }, + { + "epoch": 3.8664804469273744, + "grad_norm": 0.6364431977272034, + "learning_rate": 0.0008091036414565827, + "loss": 0.5092, + "step": 6921 + }, + { + "epoch": 3.8670391061452514, + "grad_norm": 0.543777346611023, + "learning_rate": 0.0008090756302521009, + "loss": 0.4106, + "step": 6922 + }, + { + "epoch": 3.8675977653631284, + "grad_norm": 0.6014625430107117, + "learning_rate": 0.0008090476190476191, + "loss": 0.4152, + "step": 6923 + }, + { + "epoch": 3.868156424581006, + "grad_norm": 0.39569205045700073, + "learning_rate": 0.0008090196078431372, + "loss": 0.3923, + "step": 6924 + }, + { + "epoch": 3.868715083798883, + "grad_norm": 0.429889053106308, + "learning_rate": 0.0008089915966386555, + "loss": 0.4411, + "step": 6925 + }, + { + "epoch": 3.86927374301676, + "grad_norm": 0.5761198401451111, + "learning_rate": 0.0008089635854341737, + "loss": 0.4711, + "step": 6926 + }, + { + "epoch": 3.869832402234637, + "grad_norm": 2.0339722633361816, + "learning_rate": 0.0008089355742296919, + "loss": 0.4117, + "step": 6927 + }, + { + "epoch": 3.870391061452514, + "grad_norm": 0.8914132714271545, + "learning_rate": 0.0008089075630252101, + "loss": 0.4967, + "step": 6928 + }, + { + "epoch": 3.8709497206703913, + "grad_norm": 0.4805266261100769, + "learning_rate": 0.0008088795518207283, + "loss": 0.4811, + "step": 6929 + }, + { + "epoch": 3.871508379888268, + "grad_norm": 0.7968722581863403, + "learning_rate": 0.0008088515406162465, + "loss": 0.575, + "step": 6930 + }, + { + "epoch": 3.8720670391061454, + "grad_norm": 0.5343600511550903, + "learning_rate": 0.0008088235294117647, + "loss": 0.4464, + "step": 6931 + }, + { + "epoch": 3.8726256983240224, + "grad_norm": 0.7171983122825623, + "learning_rate": 0.0008087955182072829, + "loss": 0.5011, + "step": 6932 + }, + { + "epoch": 3.8731843575418994, + "grad_norm": 2.3407227993011475, + "learning_rate": 0.0008087675070028011, + "loss": 0.5023, + "step": 6933 + }, + { + "epoch": 3.8737430167597764, + "grad_norm": 0.7763156294822693, + "learning_rate": 0.0008087394957983193, + "loss": 0.5205, + "step": 6934 + }, + { + "epoch": 3.8743016759776534, + "grad_norm": 0.8254567980766296, + "learning_rate": 0.0008087114845938375, + "loss": 0.4492, + "step": 6935 + }, + { + "epoch": 3.874860335195531, + "grad_norm": 0.5877164602279663, + "learning_rate": 0.0008086834733893557, + "loss": 0.4079, + "step": 6936 + }, + { + "epoch": 3.875418994413408, + "grad_norm": 0.4659554660320282, + "learning_rate": 0.000808655462184874, + "loss": 0.3999, + "step": 6937 + }, + { + "epoch": 3.875977653631285, + "grad_norm": 0.6956237554550171, + "learning_rate": 0.0008086274509803922, + "loss": 0.683, + "step": 6938 + }, + { + "epoch": 3.876536312849162, + "grad_norm": 0.721974790096283, + "learning_rate": 0.0008085994397759104, + "loss": 0.4509, + "step": 6939 + }, + { + "epoch": 3.877094972067039, + "grad_norm": 0.719066858291626, + "learning_rate": 0.0008085714285714286, + "loss": 0.5144, + "step": 6940 + }, + { + "epoch": 3.8776536312849164, + "grad_norm": 0.7596331834793091, + "learning_rate": 0.0008085434173669468, + "loss": 0.556, + "step": 6941 + }, + { + "epoch": 3.8782122905027934, + "grad_norm": 2.164646625518799, + "learning_rate": 0.000808515406162465, + "loss": 0.3667, + "step": 6942 + }, + { + "epoch": 3.8787709497206704, + "grad_norm": 0.6435976624488831, + "learning_rate": 0.0008084873949579832, + "loss": 0.3977, + "step": 6943 + }, + { + "epoch": 3.8793296089385474, + "grad_norm": 1.3901053667068481, + "learning_rate": 0.0008084593837535014, + "loss": 0.6029, + "step": 6944 + }, + { + "epoch": 3.8798882681564244, + "grad_norm": 0.5092434287071228, + "learning_rate": 0.0008084313725490197, + "loss": 0.4063, + "step": 6945 + }, + { + "epoch": 3.880446927374302, + "grad_norm": 0.49767711758613586, + "learning_rate": 0.0008084033613445378, + "loss": 0.3927, + "step": 6946 + }, + { + "epoch": 3.881005586592179, + "grad_norm": 0.5102831125259399, + "learning_rate": 0.000808375350140056, + "loss": 0.3939, + "step": 6947 + }, + { + "epoch": 3.881564245810056, + "grad_norm": 0.4946901798248291, + "learning_rate": 0.0008083473389355742, + "loss": 0.4616, + "step": 6948 + }, + { + "epoch": 3.882122905027933, + "grad_norm": 1.0997058153152466, + "learning_rate": 0.0008083193277310924, + "loss": 0.5067, + "step": 6949 + }, + { + "epoch": 3.88268156424581, + "grad_norm": 0.6095570921897888, + "learning_rate": 0.0008082913165266107, + "loss": 0.463, + "step": 6950 + }, + { + "epoch": 3.8832402234636874, + "grad_norm": 0.5534300208091736, + "learning_rate": 0.0008082633053221288, + "loss": 0.5615, + "step": 6951 + }, + { + "epoch": 3.8837988826815644, + "grad_norm": 0.48418498039245605, + "learning_rate": 0.000808235294117647, + "loss": 0.4672, + "step": 6952 + }, + { + "epoch": 3.8843575418994414, + "grad_norm": 0.4470704197883606, + "learning_rate": 0.0008082072829131652, + "loss": 0.4198, + "step": 6953 + }, + { + "epoch": 3.8849162011173184, + "grad_norm": 1.4324913024902344, + "learning_rate": 0.0008081792717086835, + "loss": 0.505, + "step": 6954 + }, + { + "epoch": 3.8854748603351954, + "grad_norm": 0.4697835445404053, + "learning_rate": 0.0008081512605042018, + "loss": 0.349, + "step": 6955 + }, + { + "epoch": 3.886033519553073, + "grad_norm": 0.625615656375885, + "learning_rate": 0.0008081232492997199, + "loss": 0.4977, + "step": 6956 + }, + { + "epoch": 3.88659217877095, + "grad_norm": 0.4555494487285614, + "learning_rate": 0.0008080952380952381, + "loss": 0.3867, + "step": 6957 + }, + { + "epoch": 3.887150837988827, + "grad_norm": 0.5971218347549438, + "learning_rate": 0.0008080672268907563, + "loss": 0.4265, + "step": 6958 + }, + { + "epoch": 3.887709497206704, + "grad_norm": 0.48569631576538086, + "learning_rate": 0.0008080392156862745, + "loss": 0.4778, + "step": 6959 + }, + { + "epoch": 3.888268156424581, + "grad_norm": 0.4957340955734253, + "learning_rate": 0.0008080112044817928, + "loss": 0.519, + "step": 6960 + }, + { + "epoch": 3.8888268156424584, + "grad_norm": 0.8952131867408752, + "learning_rate": 0.000807983193277311, + "loss": 0.4559, + "step": 6961 + }, + { + "epoch": 3.889385474860335, + "grad_norm": 0.5165050625801086, + "learning_rate": 0.0008079551820728291, + "loss": 0.5231, + "step": 6962 + }, + { + "epoch": 3.8899441340782124, + "grad_norm": 0.5500022172927856, + "learning_rate": 0.0008079271708683473, + "loss": 0.451, + "step": 6963 + }, + { + "epoch": 3.8905027932960894, + "grad_norm": 0.6725042462348938, + "learning_rate": 0.0008078991596638655, + "loss": 0.4561, + "step": 6964 + }, + { + "epoch": 3.8910614525139664, + "grad_norm": 0.9114112854003906, + "learning_rate": 0.0008078711484593838, + "loss": 0.7846, + "step": 6965 + }, + { + "epoch": 3.8916201117318434, + "grad_norm": 0.7531450390815735, + "learning_rate": 0.000807843137254902, + "loss": 0.3715, + "step": 6966 + }, + { + "epoch": 3.8921787709497204, + "grad_norm": 0.41826075315475464, + "learning_rate": 0.0008078151260504201, + "loss": 0.3883, + "step": 6967 + }, + { + "epoch": 3.892737430167598, + "grad_norm": 0.6685256958007812, + "learning_rate": 0.0008077871148459383, + "loss": 0.6353, + "step": 6968 + }, + { + "epoch": 3.893296089385475, + "grad_norm": 0.7648836374282837, + "learning_rate": 0.0008077591036414565, + "loss": 0.5177, + "step": 6969 + }, + { + "epoch": 3.893854748603352, + "grad_norm": 0.6054646372795105, + "learning_rate": 0.0008077310924369749, + "loss": 0.4793, + "step": 6970 + }, + { + "epoch": 3.894413407821229, + "grad_norm": 0.7664828300476074, + "learning_rate": 0.0008077030812324931, + "loss": 0.4502, + "step": 6971 + }, + { + "epoch": 3.894972067039106, + "grad_norm": 0.7895660996437073, + "learning_rate": 0.0008076750700280112, + "loss": 0.4989, + "step": 6972 + }, + { + "epoch": 3.8955307262569834, + "grad_norm": 0.4433261454105377, + "learning_rate": 0.0008076470588235294, + "loss": 0.4142, + "step": 6973 + }, + { + "epoch": 3.8960893854748604, + "grad_norm": 2.4702680110931396, + "learning_rate": 0.0008076190476190476, + "loss": 0.3818, + "step": 6974 + }, + { + "epoch": 3.8966480446927374, + "grad_norm": 5.3434343338012695, + "learning_rate": 0.0008075910364145659, + "loss": 0.4703, + "step": 6975 + }, + { + "epoch": 3.8972067039106144, + "grad_norm": 3.6028733253479004, + "learning_rate": 0.0008075630252100841, + "loss": 0.4731, + "step": 6976 + }, + { + "epoch": 3.8977653631284914, + "grad_norm": 0.8043778538703918, + "learning_rate": 0.0008075350140056023, + "loss": 0.3976, + "step": 6977 + }, + { + "epoch": 3.898324022346369, + "grad_norm": 0.6024851202964783, + "learning_rate": 0.0008075070028011204, + "loss": 0.4111, + "step": 6978 + }, + { + "epoch": 3.898882681564246, + "grad_norm": 3.9494760036468506, + "learning_rate": 0.0008074789915966386, + "loss": 0.3613, + "step": 6979 + }, + { + "epoch": 3.899441340782123, + "grad_norm": 0.4640497863292694, + "learning_rate": 0.0008074509803921569, + "loss": 0.5308, + "step": 6980 + }, + { + "epoch": 3.9, + "grad_norm": 0.9282743334770203, + "learning_rate": 0.0008074229691876751, + "loss": 0.4543, + "step": 6981 + }, + { + "epoch": 3.900558659217877, + "grad_norm": 0.5232148170471191, + "learning_rate": 0.0008073949579831933, + "loss": 0.3963, + "step": 6982 + }, + { + "epoch": 3.9011173184357544, + "grad_norm": 0.5028994679450989, + "learning_rate": 0.0008073669467787114, + "loss": 0.393, + "step": 6983 + }, + { + "epoch": 3.9016759776536314, + "grad_norm": 0.569754421710968, + "learning_rate": 0.0008073389355742296, + "loss": 0.4253, + "step": 6984 + }, + { + "epoch": 3.9022346368715084, + "grad_norm": 0.3445882499217987, + "learning_rate": 0.000807310924369748, + "loss": 0.3335, + "step": 6985 + }, + { + "epoch": 3.9027932960893854, + "grad_norm": 0.7102305293083191, + "learning_rate": 0.0008072829131652662, + "loss": 0.6117, + "step": 6986 + }, + { + "epoch": 3.9033519553072624, + "grad_norm": 0.4957873523235321, + "learning_rate": 0.0008072549019607844, + "loss": 0.4687, + "step": 6987 + }, + { + "epoch": 3.90391061452514, + "grad_norm": 0.8255720734596252, + "learning_rate": 0.0008072268907563025, + "loss": 0.4999, + "step": 6988 + }, + { + "epoch": 3.904469273743017, + "grad_norm": 0.517471194267273, + "learning_rate": 0.0008071988795518207, + "loss": 0.4892, + "step": 6989 + }, + { + "epoch": 3.905027932960894, + "grad_norm": 0.4388323128223419, + "learning_rate": 0.000807170868347339, + "loss": 0.3793, + "step": 6990 + }, + { + "epoch": 3.905586592178771, + "grad_norm": 0.5175237655639648, + "learning_rate": 0.0008071428571428572, + "loss": 0.4927, + "step": 6991 + }, + { + "epoch": 3.906145251396648, + "grad_norm": 0.4710371792316437, + "learning_rate": 0.0008071148459383754, + "loss": 0.45, + "step": 6992 + }, + { + "epoch": 3.9067039106145254, + "grad_norm": 1.0904172658920288, + "learning_rate": 0.0008070868347338936, + "loss": 0.4551, + "step": 6993 + }, + { + "epoch": 3.9072625698324024, + "grad_norm": 0.4209286868572235, + "learning_rate": 0.0008070588235294117, + "loss": 0.3991, + "step": 6994 + }, + { + "epoch": 3.9078212290502794, + "grad_norm": 0.5297243595123291, + "learning_rate": 0.00080703081232493, + "loss": 0.426, + "step": 6995 + }, + { + "epoch": 3.9083798882681564, + "grad_norm": 0.7202159762382507, + "learning_rate": 0.0008070028011204482, + "loss": 0.5661, + "step": 6996 + }, + { + "epoch": 3.9089385474860334, + "grad_norm": 1.644371747970581, + "learning_rate": 0.0008069747899159664, + "loss": 0.5367, + "step": 6997 + }, + { + "epoch": 3.909497206703911, + "grad_norm": 0.5092253684997559, + "learning_rate": 0.0008069467787114846, + "loss": 0.3324, + "step": 6998 + }, + { + "epoch": 3.9100558659217874, + "grad_norm": 0.6591435670852661, + "learning_rate": 0.0008069187675070027, + "loss": 0.4779, + "step": 6999 + }, + { + "epoch": 3.910614525139665, + "grad_norm": 0.8748725652694702, + "learning_rate": 0.000806890756302521, + "loss": 0.4345, + "step": 7000 + }, + { + "epoch": 3.910614525139665, + "eval_cer": 0.09470502874974088, + "eval_loss": 0.35776227712631226, + "eval_runtime": 55.4362, + "eval_samples_per_second": 81.86, + "eval_steps_per_second": 5.123, + "eval_wer": 0.3764978631881791, + "step": 7000 + }, + { + "epoch": 3.911173184357542, + "grad_norm": 0.6731039881706238, + "learning_rate": 0.0008068627450980392, + "loss": 0.4923, + "step": 7001 + }, + { + "epoch": 3.911731843575419, + "grad_norm": 0.45611515641212463, + "learning_rate": 0.0008068347338935575, + "loss": 0.442, + "step": 7002 + }, + { + "epoch": 3.912290502793296, + "grad_norm": 0.7433273196220398, + "learning_rate": 0.0008068067226890757, + "loss": 0.4177, + "step": 7003 + }, + { + "epoch": 3.912849162011173, + "grad_norm": 0.9995244741439819, + "learning_rate": 0.0008067787114845938, + "loss": 0.4453, + "step": 7004 + }, + { + "epoch": 3.9134078212290504, + "grad_norm": 0.5766714215278625, + "learning_rate": 0.0008067507002801121, + "loss": 0.3903, + "step": 7005 + }, + { + "epoch": 3.9139664804469274, + "grad_norm": 0.6587193608283997, + "learning_rate": 0.0008067226890756303, + "loss": 0.6398, + "step": 7006 + }, + { + "epoch": 3.9145251396648044, + "grad_norm": 0.5817322134971619, + "learning_rate": 0.0008066946778711485, + "loss": 0.5049, + "step": 7007 + }, + { + "epoch": 3.9150837988826814, + "grad_norm": 0.5882250070571899, + "learning_rate": 0.0008066666666666667, + "loss": 0.5014, + "step": 7008 + }, + { + "epoch": 3.9156424581005584, + "grad_norm": 0.4699248671531677, + "learning_rate": 0.0008066386554621849, + "loss": 0.4198, + "step": 7009 + }, + { + "epoch": 3.916201117318436, + "grad_norm": 0.5861327648162842, + "learning_rate": 0.0008066106442577031, + "loss": 0.5021, + "step": 7010 + }, + { + "epoch": 3.916759776536313, + "grad_norm": 0.6326070427894592, + "learning_rate": 0.0008065826330532213, + "loss": 0.3746, + "step": 7011 + }, + { + "epoch": 3.91731843575419, + "grad_norm": 0.4733527898788452, + "learning_rate": 0.0008065546218487395, + "loss": 0.3857, + "step": 7012 + }, + { + "epoch": 3.917877094972067, + "grad_norm": 0.8005883097648621, + "learning_rate": 0.0008065266106442577, + "loss": 0.4941, + "step": 7013 + }, + { + "epoch": 3.918435754189944, + "grad_norm": 0.5382456183433533, + "learning_rate": 0.0008064985994397759, + "loss": 0.4692, + "step": 7014 + }, + { + "epoch": 3.9189944134078214, + "grad_norm": 7.389460563659668, + "learning_rate": 0.0008064705882352941, + "loss": 0.4415, + "step": 7015 + }, + { + "epoch": 3.9195530726256984, + "grad_norm": 0.47508105635643005, + "learning_rate": 0.0008064425770308123, + "loss": 0.4821, + "step": 7016 + }, + { + "epoch": 3.9201117318435754, + "grad_norm": 0.557486355304718, + "learning_rate": 0.0008064145658263305, + "loss": 0.4912, + "step": 7017 + }, + { + "epoch": 3.9206703910614524, + "grad_norm": 0.5815549492835999, + "learning_rate": 0.0008063865546218487, + "loss": 0.4798, + "step": 7018 + }, + { + "epoch": 3.9212290502793294, + "grad_norm": 0.5075328946113586, + "learning_rate": 0.000806358543417367, + "loss": 0.3497, + "step": 7019 + }, + { + "epoch": 3.921787709497207, + "grad_norm": 0.5094476938247681, + "learning_rate": 0.0008063305322128853, + "loss": 0.4987, + "step": 7020 + }, + { + "epoch": 3.922346368715084, + "grad_norm": 0.5268604755401611, + "learning_rate": 0.0008063025210084034, + "loss": 0.5418, + "step": 7021 + }, + { + "epoch": 3.922905027932961, + "grad_norm": 0.46845272183418274, + "learning_rate": 0.0008062745098039216, + "loss": 0.4105, + "step": 7022 + }, + { + "epoch": 3.923463687150838, + "grad_norm": 4.790427207946777, + "learning_rate": 0.0008062464985994398, + "loss": 0.5401, + "step": 7023 + }, + { + "epoch": 3.924022346368715, + "grad_norm": 0.6535236239433289, + "learning_rate": 0.000806218487394958, + "loss": 0.565, + "step": 7024 + }, + { + "epoch": 3.9245810055865924, + "grad_norm": 0.5313844680786133, + "learning_rate": 0.0008061904761904763, + "loss": 0.4814, + "step": 7025 + }, + { + "epoch": 3.9251396648044694, + "grad_norm": 0.5265361666679382, + "learning_rate": 0.0008061624649859944, + "loss": 0.4422, + "step": 7026 + }, + { + "epoch": 3.9256983240223464, + "grad_norm": 0.46917101740837097, + "learning_rate": 0.0008061344537815126, + "loss": 0.4574, + "step": 7027 + }, + { + "epoch": 3.9262569832402234, + "grad_norm": 1.8454087972640991, + "learning_rate": 0.0008061064425770308, + "loss": 0.4999, + "step": 7028 + }, + { + "epoch": 3.9268156424581004, + "grad_norm": 0.6060447096824646, + "learning_rate": 0.000806078431372549, + "loss": 0.5635, + "step": 7029 + }, + { + "epoch": 3.927374301675978, + "grad_norm": 0.512328028678894, + "learning_rate": 0.0008060504201680673, + "loss": 0.4391, + "step": 7030 + }, + { + "epoch": 3.927932960893855, + "grad_norm": 0.5457378625869751, + "learning_rate": 0.0008060224089635854, + "loss": 0.4303, + "step": 7031 + }, + { + "epoch": 3.928491620111732, + "grad_norm": 0.4917803406715393, + "learning_rate": 0.0008059943977591036, + "loss": 0.351, + "step": 7032 + }, + { + "epoch": 3.929050279329609, + "grad_norm": 0.4893527925014496, + "learning_rate": 0.0008059663865546218, + "loss": 0.4786, + "step": 7033 + }, + { + "epoch": 3.929608938547486, + "grad_norm": 0.5211455821990967, + "learning_rate": 0.00080593837535014, + "loss": 0.5069, + "step": 7034 + }, + { + "epoch": 3.9301675977653634, + "grad_norm": 0.5341497659683228, + "learning_rate": 0.0008059103641456584, + "loss": 0.5896, + "step": 7035 + }, + { + "epoch": 3.93072625698324, + "grad_norm": 0.6408417224884033, + "learning_rate": 0.0008058823529411766, + "loss": 0.4398, + "step": 7036 + }, + { + "epoch": 3.9312849162011174, + "grad_norm": 0.7231680750846863, + "learning_rate": 0.0008058543417366947, + "loss": 0.5005, + "step": 7037 + }, + { + "epoch": 3.9318435754189944, + "grad_norm": 0.890194833278656, + "learning_rate": 0.0008058263305322129, + "loss": 0.5027, + "step": 7038 + }, + { + "epoch": 3.9324022346368714, + "grad_norm": 0.40565377473831177, + "learning_rate": 0.0008057983193277311, + "loss": 0.3459, + "step": 7039 + }, + { + "epoch": 3.9329608938547485, + "grad_norm": 0.5612426996231079, + "learning_rate": 0.0008057703081232494, + "loss": 0.4142, + "step": 7040 + }, + { + "epoch": 3.9335195530726255, + "grad_norm": 0.6588249206542969, + "learning_rate": 0.0008057422969187676, + "loss": 0.5066, + "step": 7041 + }, + { + "epoch": 3.934078212290503, + "grad_norm": 8.109230041503906, + "learning_rate": 0.0008057142857142857, + "loss": 0.3917, + "step": 7042 + }, + { + "epoch": 3.93463687150838, + "grad_norm": 6.23183536529541, + "learning_rate": 0.0008056862745098039, + "loss": 0.4999, + "step": 7043 + }, + { + "epoch": 3.935195530726257, + "grad_norm": 0.6060693264007568, + "learning_rate": 0.0008056582633053221, + "loss": 0.4874, + "step": 7044 + }, + { + "epoch": 3.935754189944134, + "grad_norm": 0.4651438891887665, + "learning_rate": 0.0008056302521008404, + "loss": 0.3899, + "step": 7045 + }, + { + "epoch": 3.936312849162011, + "grad_norm": 0.39840027689933777, + "learning_rate": 0.0008056022408963586, + "loss": 0.3739, + "step": 7046 + }, + { + "epoch": 3.9368715083798884, + "grad_norm": 0.4247986674308777, + "learning_rate": 0.0008055742296918767, + "loss": 0.4093, + "step": 7047 + }, + { + "epoch": 3.9374301675977654, + "grad_norm": 0.6381890177726746, + "learning_rate": 0.0008055462184873949, + "loss": 0.4181, + "step": 7048 + }, + { + "epoch": 3.9379888268156424, + "grad_norm": 0.6865086555480957, + "learning_rate": 0.0008055182072829131, + "loss": 0.5476, + "step": 7049 + }, + { + "epoch": 3.9385474860335195, + "grad_norm": 0.8059674501419067, + "learning_rate": 0.0008054901960784314, + "loss": 0.6844, + "step": 7050 + }, + { + "epoch": 3.9391061452513965, + "grad_norm": 0.41092970967292786, + "learning_rate": 0.0008054621848739497, + "loss": 0.3663, + "step": 7051 + }, + { + "epoch": 3.939664804469274, + "grad_norm": 0.5282922387123108, + "learning_rate": 0.0008054341736694679, + "loss": 0.5227, + "step": 7052 + }, + { + "epoch": 3.940223463687151, + "grad_norm": 0.7463955879211426, + "learning_rate": 0.000805406162464986, + "loss": 0.7564, + "step": 7053 + }, + { + "epoch": 3.940782122905028, + "grad_norm": 0.8797734975814819, + "learning_rate": 0.0008053781512605042, + "loss": 0.4228, + "step": 7054 + }, + { + "epoch": 3.941340782122905, + "grad_norm": 0.7570948004722595, + "learning_rate": 0.0008053501400560225, + "loss": 0.6247, + "step": 7055 + }, + { + "epoch": 3.941899441340782, + "grad_norm": 0.4585045576095581, + "learning_rate": 0.0008053221288515407, + "loss": 0.4591, + "step": 7056 + }, + { + "epoch": 3.9424581005586594, + "grad_norm": 0.6605092287063599, + "learning_rate": 0.0008052941176470589, + "loss": 0.443, + "step": 7057 + }, + { + "epoch": 3.9430167597765364, + "grad_norm": 0.5274098515510559, + "learning_rate": 0.000805266106442577, + "loss": 0.5547, + "step": 7058 + }, + { + "epoch": 3.9435754189944134, + "grad_norm": 0.5492943525314331, + "learning_rate": 0.0008052380952380952, + "loss": 0.4418, + "step": 7059 + }, + { + "epoch": 3.9441340782122905, + "grad_norm": 0.5419865250587463, + "learning_rate": 0.0008052100840336135, + "loss": 0.6448, + "step": 7060 + }, + { + "epoch": 3.9446927374301675, + "grad_norm": 0.7609917521476746, + "learning_rate": 0.0008051820728291317, + "loss": 0.6492, + "step": 7061 + }, + { + "epoch": 3.945251396648045, + "grad_norm": 0.5737199187278748, + "learning_rate": 0.0008051540616246499, + "loss": 0.4242, + "step": 7062 + }, + { + "epoch": 3.945810055865922, + "grad_norm": 0.4582638442516327, + "learning_rate": 0.000805126050420168, + "loss": 0.3942, + "step": 7063 + }, + { + "epoch": 3.946368715083799, + "grad_norm": 0.5929429531097412, + "learning_rate": 0.0008050980392156862, + "loss": 0.4288, + "step": 7064 + }, + { + "epoch": 3.946927374301676, + "grad_norm": 4.677650451660156, + "learning_rate": 0.0008050700280112045, + "loss": 0.4182, + "step": 7065 + }, + { + "epoch": 3.947486033519553, + "grad_norm": 0.4796430170536041, + "learning_rate": 0.0008050420168067227, + "loss": 0.3749, + "step": 7066 + }, + { + "epoch": 3.9480446927374304, + "grad_norm": 0.45303863286972046, + "learning_rate": 0.000805014005602241, + "loss": 0.4044, + "step": 7067 + }, + { + "epoch": 3.9486033519553074, + "grad_norm": 0.5329473614692688, + "learning_rate": 0.0008049859943977592, + "loss": 0.4969, + "step": 7068 + }, + { + "epoch": 3.9491620111731844, + "grad_norm": 0.7041875720024109, + "learning_rate": 0.0008049579831932773, + "loss": 0.5208, + "step": 7069 + }, + { + "epoch": 3.9497206703910615, + "grad_norm": 0.381235808134079, + "learning_rate": 0.0008049299719887956, + "loss": 0.3891, + "step": 7070 + }, + { + "epoch": 3.9502793296089385, + "grad_norm": 0.686011791229248, + "learning_rate": 0.0008049019607843138, + "loss": 0.4518, + "step": 7071 + }, + { + "epoch": 3.950837988826816, + "grad_norm": 0.5750831961631775, + "learning_rate": 0.000804873949579832, + "loss": 0.4822, + "step": 7072 + }, + { + "epoch": 3.9513966480446925, + "grad_norm": 0.4979250133037567, + "learning_rate": 0.0008048459383753502, + "loss": 0.4977, + "step": 7073 + }, + { + "epoch": 3.95195530726257, + "grad_norm": 0.4827015697956085, + "learning_rate": 0.0008048179271708683, + "loss": 0.4273, + "step": 7074 + }, + { + "epoch": 3.952513966480447, + "grad_norm": 0.4956649839878082, + "learning_rate": 0.0008047899159663866, + "loss": 0.4951, + "step": 7075 + }, + { + "epoch": 3.953072625698324, + "grad_norm": 0.5707218050956726, + "learning_rate": 0.0008047619047619048, + "loss": 0.5498, + "step": 7076 + }, + { + "epoch": 3.953631284916201, + "grad_norm": 0.6664177179336548, + "learning_rate": 0.000804733893557423, + "loss": 0.4399, + "step": 7077 + }, + { + "epoch": 3.954189944134078, + "grad_norm": 0.6316648125648499, + "learning_rate": 0.0008047058823529412, + "loss": 0.3213, + "step": 7078 + }, + { + "epoch": 3.9547486033519554, + "grad_norm": 0.8092297911643982, + "learning_rate": 0.0008046778711484593, + "loss": 0.4735, + "step": 7079 + }, + { + "epoch": 3.9553072625698324, + "grad_norm": 0.5198471546173096, + "learning_rate": 0.0008046498599439776, + "loss": 0.4956, + "step": 7080 + }, + { + "epoch": 3.9558659217877095, + "grad_norm": 0.4854726791381836, + "learning_rate": 0.0008046218487394958, + "loss": 0.3551, + "step": 7081 + }, + { + "epoch": 3.9564245810055865, + "grad_norm": 2.2213189601898193, + "learning_rate": 0.000804593837535014, + "loss": 0.4437, + "step": 7082 + }, + { + "epoch": 3.9569832402234635, + "grad_norm": 0.6036872267723083, + "learning_rate": 0.0008045658263305322, + "loss": 0.5081, + "step": 7083 + }, + { + "epoch": 3.957541899441341, + "grad_norm": 0.747298538684845, + "learning_rate": 0.0008045378151260505, + "loss": 0.7727, + "step": 7084 + }, + { + "epoch": 3.958100558659218, + "grad_norm": 0.5486094951629639, + "learning_rate": 0.0008045098039215687, + "loss": 0.6463, + "step": 7085 + }, + { + "epoch": 3.958659217877095, + "grad_norm": 1.0138646364212036, + "learning_rate": 0.0008044817927170869, + "loss": 0.758, + "step": 7086 + }, + { + "epoch": 3.959217877094972, + "grad_norm": 0.891136646270752, + "learning_rate": 0.0008044537815126051, + "loss": 0.6432, + "step": 7087 + }, + { + "epoch": 3.959776536312849, + "grad_norm": 0.6499348282814026, + "learning_rate": 0.0008044257703081233, + "loss": 0.3983, + "step": 7088 + }, + { + "epoch": 3.9603351955307264, + "grad_norm": 0.527060329914093, + "learning_rate": 0.0008043977591036415, + "loss": 0.4382, + "step": 7089 + }, + { + "epoch": 3.9608938547486034, + "grad_norm": 0.6966915130615234, + "learning_rate": 0.0008043697478991597, + "loss": 0.4623, + "step": 7090 + }, + { + "epoch": 3.9614525139664805, + "grad_norm": 0.4720066785812378, + "learning_rate": 0.0008043417366946779, + "loss": 0.47, + "step": 7091 + }, + { + "epoch": 3.9620111731843575, + "grad_norm": 0.6764178276062012, + "learning_rate": 0.0008043137254901961, + "loss": 0.4105, + "step": 7092 + }, + { + "epoch": 3.9625698324022345, + "grad_norm": 0.41640740633010864, + "learning_rate": 0.0008042857142857143, + "loss": 0.3539, + "step": 7093 + }, + { + "epoch": 3.963128491620112, + "grad_norm": 0.36168813705444336, + "learning_rate": 0.0008042577030812325, + "loss": 0.383, + "step": 7094 + }, + { + "epoch": 3.963687150837989, + "grad_norm": 0.7122403979301453, + "learning_rate": 0.0008042296918767507, + "loss": 0.7117, + "step": 7095 + }, + { + "epoch": 3.964245810055866, + "grad_norm": 0.5665606260299683, + "learning_rate": 0.0008042016806722689, + "loss": 0.5116, + "step": 7096 + }, + { + "epoch": 3.964804469273743, + "grad_norm": 0.6856590509414673, + "learning_rate": 0.0008041736694677871, + "loss": 0.4778, + "step": 7097 + }, + { + "epoch": 3.96536312849162, + "grad_norm": 0.5604067444801331, + "learning_rate": 0.0008041456582633053, + "loss": 0.4897, + "step": 7098 + }, + { + "epoch": 3.9659217877094974, + "grad_norm": 0.4804839789867401, + "learning_rate": 0.0008041176470588235, + "loss": 0.4412, + "step": 7099 + }, + { + "epoch": 3.9664804469273744, + "grad_norm": 0.4849615693092346, + "learning_rate": 0.0008040896358543419, + "loss": 0.4329, + "step": 7100 + }, + { + "epoch": 3.9670391061452515, + "grad_norm": 0.627479076385498, + "learning_rate": 0.00080406162464986, + "loss": 0.5344, + "step": 7101 + }, + { + "epoch": 3.9675977653631285, + "grad_norm": 0.8258705139160156, + "learning_rate": 0.0008040336134453782, + "loss": 0.5079, + "step": 7102 + }, + { + "epoch": 3.9681564245810055, + "grad_norm": 0.5508294701576233, + "learning_rate": 0.0008040056022408964, + "loss": 0.5612, + "step": 7103 + }, + { + "epoch": 3.968715083798883, + "grad_norm": 0.5187811851501465, + "learning_rate": 0.0008039775910364146, + "loss": 0.3862, + "step": 7104 + }, + { + "epoch": 3.9692737430167595, + "grad_norm": 0.7678642868995667, + "learning_rate": 0.0008039495798319329, + "loss": 0.4166, + "step": 7105 + }, + { + "epoch": 3.969832402234637, + "grad_norm": 0.4972739815711975, + "learning_rate": 0.000803921568627451, + "loss": 0.3716, + "step": 7106 + }, + { + "epoch": 3.970391061452514, + "grad_norm": 0.41464969515800476, + "learning_rate": 0.0008038935574229692, + "loss": 0.4396, + "step": 7107 + }, + { + "epoch": 3.970949720670391, + "grad_norm": 0.5230623483657837, + "learning_rate": 0.0008038655462184874, + "loss": 0.4529, + "step": 7108 + }, + { + "epoch": 3.971508379888268, + "grad_norm": 0.6593648195266724, + "learning_rate": 0.0008038375350140056, + "loss": 0.4248, + "step": 7109 + }, + { + "epoch": 3.972067039106145, + "grad_norm": 0.41191044449806213, + "learning_rate": 0.0008038095238095239, + "loss": 0.3882, + "step": 7110 + }, + { + "epoch": 3.9726256983240225, + "grad_norm": 0.6715481877326965, + "learning_rate": 0.000803781512605042, + "loss": 0.382, + "step": 7111 + }, + { + "epoch": 3.9731843575418995, + "grad_norm": 0.47481071949005127, + "learning_rate": 0.0008037535014005602, + "loss": 0.4108, + "step": 7112 + }, + { + "epoch": 3.9737430167597765, + "grad_norm": 1.0387659072875977, + "learning_rate": 0.0008037254901960784, + "loss": 0.5368, + "step": 7113 + }, + { + "epoch": 3.9743016759776535, + "grad_norm": 0.7313138842582703, + "learning_rate": 0.0008036974789915966, + "loss": 0.5214, + "step": 7114 + }, + { + "epoch": 3.9748603351955305, + "grad_norm": 0.6585182547569275, + "learning_rate": 0.0008036694677871149, + "loss": 0.3771, + "step": 7115 + }, + { + "epoch": 3.975418994413408, + "grad_norm": 0.5558366179466248, + "learning_rate": 0.0008036414565826332, + "loss": 0.4826, + "step": 7116 + }, + { + "epoch": 3.975977653631285, + "grad_norm": 0.7371494770050049, + "learning_rate": 0.0008036134453781512, + "loss": 0.7938, + "step": 7117 + }, + { + "epoch": 3.976536312849162, + "grad_norm": 0.4658590257167816, + "learning_rate": 0.0008035854341736695, + "loss": 0.4104, + "step": 7118 + }, + { + "epoch": 3.977094972067039, + "grad_norm": 0.695318877696991, + "learning_rate": 0.0008035574229691877, + "loss": 0.4875, + "step": 7119 + }, + { + "epoch": 3.977653631284916, + "grad_norm": 0.7919595837593079, + "learning_rate": 0.000803529411764706, + "loss": 0.5135, + "step": 7120 + }, + { + "epoch": 3.9782122905027935, + "grad_norm": 1.74186110496521, + "learning_rate": 0.0008035014005602242, + "loss": 0.545, + "step": 7121 + }, + { + "epoch": 3.9787709497206705, + "grad_norm": 0.54842209815979, + "learning_rate": 0.0008034733893557423, + "loss": 0.4151, + "step": 7122 + }, + { + "epoch": 3.9793296089385475, + "grad_norm": 0.8092365264892578, + "learning_rate": 0.0008034453781512605, + "loss": 0.4718, + "step": 7123 + }, + { + "epoch": 3.9798882681564245, + "grad_norm": 0.8299870491027832, + "learning_rate": 0.0008034173669467787, + "loss": 0.4668, + "step": 7124 + }, + { + "epoch": 3.9804469273743015, + "grad_norm": 2.1802937984466553, + "learning_rate": 0.000803389355742297, + "loss": 0.6053, + "step": 7125 + }, + { + "epoch": 3.981005586592179, + "grad_norm": 0.43331030011177063, + "learning_rate": 0.0008033613445378152, + "loss": 0.4514, + "step": 7126 + }, + { + "epoch": 3.981564245810056, + "grad_norm": 0.45537856221199036, + "learning_rate": 0.0008033333333333333, + "loss": 0.4293, + "step": 7127 + }, + { + "epoch": 3.982122905027933, + "grad_norm": 0.5720996260643005, + "learning_rate": 0.0008033053221288515, + "loss": 0.4547, + "step": 7128 + }, + { + "epoch": 3.98268156424581, + "grad_norm": 0.42259424924850464, + "learning_rate": 0.0008032773109243697, + "loss": 0.4906, + "step": 7129 + }, + { + "epoch": 3.983240223463687, + "grad_norm": 0.4236142933368683, + "learning_rate": 0.000803249299719888, + "loss": 0.4155, + "step": 7130 + }, + { + "epoch": 3.9837988826815645, + "grad_norm": 0.45177119970321655, + "learning_rate": 0.0008032212885154062, + "loss": 0.4964, + "step": 7131 + }, + { + "epoch": 3.9843575418994415, + "grad_norm": 0.6593208909034729, + "learning_rate": 0.0008031932773109244, + "loss": 0.6244, + "step": 7132 + }, + { + "epoch": 3.9849162011173185, + "grad_norm": 0.53520268201828, + "learning_rate": 0.0008031652661064425, + "loss": 0.5647, + "step": 7133 + }, + { + "epoch": 3.9854748603351955, + "grad_norm": 0.9825749397277832, + "learning_rate": 0.0008031372549019608, + "loss": 0.4624, + "step": 7134 + }, + { + "epoch": 3.9860335195530725, + "grad_norm": 0.6735973358154297, + "learning_rate": 0.000803109243697479, + "loss": 0.4715, + "step": 7135 + }, + { + "epoch": 3.98659217877095, + "grad_norm": 0.5279591083526611, + "learning_rate": 0.0008030812324929973, + "loss": 0.4966, + "step": 7136 + }, + { + "epoch": 3.987150837988827, + "grad_norm": 0.9416318535804749, + "learning_rate": 0.0008030532212885155, + "loss": 0.3927, + "step": 7137 + }, + { + "epoch": 3.987709497206704, + "grad_norm": 0.4318915605545044, + "learning_rate": 0.0008030252100840336, + "loss": 0.4582, + "step": 7138 + }, + { + "epoch": 3.988268156424581, + "grad_norm": 0.5767625570297241, + "learning_rate": 0.0008029971988795518, + "loss": 0.4547, + "step": 7139 + }, + { + "epoch": 3.988826815642458, + "grad_norm": 0.5418605208396912, + "learning_rate": 0.00080296918767507, + "loss": 0.3949, + "step": 7140 + }, + { + "epoch": 3.9893854748603355, + "grad_norm": 0.3813818097114563, + "learning_rate": 0.0008029411764705883, + "loss": 0.4219, + "step": 7141 + }, + { + "epoch": 3.989944134078212, + "grad_norm": 0.7461854219436646, + "learning_rate": 0.0008029131652661065, + "loss": 0.4156, + "step": 7142 + }, + { + "epoch": 3.9905027932960895, + "grad_norm": 0.40891969203948975, + "learning_rate": 0.0008028851540616246, + "loss": 0.4124, + "step": 7143 + }, + { + "epoch": 3.9910614525139665, + "grad_norm": 3.145090341567993, + "learning_rate": 0.0008028571428571428, + "loss": 0.4219, + "step": 7144 + }, + { + "epoch": 3.9916201117318435, + "grad_norm": 0.4977027475833893, + "learning_rate": 0.000802829131652661, + "loss": 0.43, + "step": 7145 + }, + { + "epoch": 3.9921787709497205, + "grad_norm": 0.6141577959060669, + "learning_rate": 0.0008028011204481793, + "loss": 0.4784, + "step": 7146 + }, + { + "epoch": 3.9927374301675975, + "grad_norm": 0.4877220094203949, + "learning_rate": 0.0008027731092436975, + "loss": 0.4408, + "step": 7147 + }, + { + "epoch": 3.993296089385475, + "grad_norm": 6.336646556854248, + "learning_rate": 0.0008027450980392157, + "loss": 0.6765, + "step": 7148 + }, + { + "epoch": 3.993854748603352, + "grad_norm": 0.48627859354019165, + "learning_rate": 0.0008027170868347338, + "loss": 0.411, + "step": 7149 + }, + { + "epoch": 3.994413407821229, + "grad_norm": 0.6415491700172424, + "learning_rate": 0.000802689075630252, + "loss": 0.4663, + "step": 7150 + }, + { + "epoch": 3.994972067039106, + "grad_norm": 0.628501296043396, + "learning_rate": 0.0008026610644257704, + "loss": 0.5153, + "step": 7151 + }, + { + "epoch": 3.995530726256983, + "grad_norm": 0.3527093529701233, + "learning_rate": 0.0008026330532212886, + "loss": 0.3798, + "step": 7152 + }, + { + "epoch": 3.9960893854748605, + "grad_norm": 0.388369619846344, + "learning_rate": 0.0008026050420168068, + "loss": 0.333, + "step": 7153 + }, + { + "epoch": 3.9966480446927375, + "grad_norm": 0.6742631793022156, + "learning_rate": 0.0008025770308123249, + "loss": 0.4374, + "step": 7154 + }, + { + "epoch": 3.9972067039106145, + "grad_norm": 1.166015863418579, + "learning_rate": 0.0008025490196078431, + "loss": 0.5468, + "step": 7155 + }, + { + "epoch": 3.9977653631284915, + "grad_norm": 1.0816982984542847, + "learning_rate": 0.0008025210084033614, + "loss": 0.4357, + "step": 7156 + }, + { + "epoch": 3.9983240223463685, + "grad_norm": 0.39572039246559143, + "learning_rate": 0.0008024929971988796, + "loss": 0.3401, + "step": 7157 + }, + { + "epoch": 3.998882681564246, + "grad_norm": 0.5916157364845276, + "learning_rate": 0.0008024649859943978, + "loss": 0.4312, + "step": 7158 + }, + { + "epoch": 3.999441340782123, + "grad_norm": 5.927062511444092, + "learning_rate": 0.0008024369747899159, + "loss": 0.5623, + "step": 7159 + }, + { + "epoch": 4.0, + "grad_norm": 1.4475934505462646, + "learning_rate": 0.0008024089635854341, + "loss": 0.5796, + "step": 7160 + }, + { + "epoch": 4.0005586592178775, + "grad_norm": 0.6633867025375366, + "learning_rate": 0.0008023809523809524, + "loss": 0.3719, + "step": 7161 + }, + { + "epoch": 4.001117318435754, + "grad_norm": 0.7885862588882446, + "learning_rate": 0.0008023529411764706, + "loss": 0.629, + "step": 7162 + }, + { + "epoch": 4.0016759776536315, + "grad_norm": 2.0921740531921387, + "learning_rate": 0.0008023249299719888, + "loss": 0.4163, + "step": 7163 + }, + { + "epoch": 4.002234636871508, + "grad_norm": 3.035512924194336, + "learning_rate": 0.000802296918767507, + "loss": 0.4565, + "step": 7164 + }, + { + "epoch": 4.0027932960893855, + "grad_norm": 0.6488288044929504, + "learning_rate": 0.0008022689075630251, + "loss": 0.4962, + "step": 7165 + }, + { + "epoch": 4.003351955307263, + "grad_norm": 0.852614164352417, + "learning_rate": 0.0008022408963585435, + "loss": 0.4056, + "step": 7166 + }, + { + "epoch": 4.0039106145251395, + "grad_norm": 0.5377743244171143, + "learning_rate": 0.0008022128851540617, + "loss": 0.4342, + "step": 7167 + }, + { + "epoch": 4.004469273743017, + "grad_norm": 2.5451347827911377, + "learning_rate": 0.0008021848739495799, + "loss": 0.5396, + "step": 7168 + }, + { + "epoch": 4.0050279329608935, + "grad_norm": 0.4638262987136841, + "learning_rate": 0.0008021568627450981, + "loss": 0.5815, + "step": 7169 + }, + { + "epoch": 4.005586592178771, + "grad_norm": 0.7978700995445251, + "learning_rate": 0.0008021288515406162, + "loss": 0.532, + "step": 7170 + }, + { + "epoch": 4.0061452513966485, + "grad_norm": 0.6896904706954956, + "learning_rate": 0.0008021008403361345, + "loss": 0.4431, + "step": 7171 + }, + { + "epoch": 4.006703910614525, + "grad_norm": 0.5434969067573547, + "learning_rate": 0.0008020728291316527, + "loss": 0.4866, + "step": 7172 + }, + { + "epoch": 4.0072625698324025, + "grad_norm": 1.9140547513961792, + "learning_rate": 0.0008020448179271709, + "loss": 0.4938, + "step": 7173 + }, + { + "epoch": 4.007821229050279, + "grad_norm": 0.5029609203338623, + "learning_rate": 0.0008020168067226891, + "loss": 0.4285, + "step": 7174 + }, + { + "epoch": 4.0083798882681565, + "grad_norm": 4.06582498550415, + "learning_rate": 0.0008019887955182072, + "loss": 0.5401, + "step": 7175 + }, + { + "epoch": 4.008938547486034, + "grad_norm": 0.7806519865989685, + "learning_rate": 0.0008019607843137255, + "loss": 0.5359, + "step": 7176 + }, + { + "epoch": 4.0094972067039105, + "grad_norm": 0.5114393830299377, + "learning_rate": 0.0008019327731092437, + "loss": 0.4199, + "step": 7177 + }, + { + "epoch": 4.010055865921788, + "grad_norm": 0.4742944538593292, + "learning_rate": 0.0008019047619047619, + "loss": 0.4514, + "step": 7178 + }, + { + "epoch": 4.0106145251396645, + "grad_norm": 0.9224108457565308, + "learning_rate": 0.0008018767507002801, + "loss": 0.4562, + "step": 7179 + }, + { + "epoch": 4.011173184357542, + "grad_norm": 0.3695765435695648, + "learning_rate": 0.0008018487394957983, + "loss": 0.4639, + "step": 7180 + }, + { + "epoch": 4.011731843575419, + "grad_norm": 0.5876614451408386, + "learning_rate": 0.0008018207282913165, + "loss": 0.3645, + "step": 7181 + }, + { + "epoch": 4.012290502793296, + "grad_norm": 1.222946286201477, + "learning_rate": 0.0008017927170868347, + "loss": 0.5316, + "step": 7182 + }, + { + "epoch": 4.0128491620111735, + "grad_norm": 0.7444854974746704, + "learning_rate": 0.000801764705882353, + "loss": 0.4371, + "step": 7183 + }, + { + "epoch": 4.01340782122905, + "grad_norm": 0.7403766512870789, + "learning_rate": 0.0008017366946778712, + "loss": 0.423, + "step": 7184 + }, + { + "epoch": 4.0139664804469275, + "grad_norm": 0.5357937216758728, + "learning_rate": 0.0008017086834733894, + "loss": 0.4277, + "step": 7185 + }, + { + "epoch": 4.014525139664804, + "grad_norm": 1.3541953563690186, + "learning_rate": 0.0008016806722689076, + "loss": 0.4835, + "step": 7186 + }, + { + "epoch": 4.0150837988826815, + "grad_norm": 0.7186394333839417, + "learning_rate": 0.0008016526610644258, + "loss": 0.4908, + "step": 7187 + }, + { + "epoch": 4.015642458100559, + "grad_norm": 0.5224939584732056, + "learning_rate": 0.000801624649859944, + "loss": 0.6235, + "step": 7188 + }, + { + "epoch": 4.0162011173184355, + "grad_norm": 0.5467060804367065, + "learning_rate": 0.0008015966386554622, + "loss": 0.4844, + "step": 7189 + }, + { + "epoch": 4.016759776536313, + "grad_norm": 0.8589743375778198, + "learning_rate": 0.0008015686274509804, + "loss": 0.5053, + "step": 7190 + }, + { + "epoch": 4.01731843575419, + "grad_norm": 2.804008722305298, + "learning_rate": 0.0008015406162464987, + "loss": 0.6737, + "step": 7191 + }, + { + "epoch": 4.017877094972067, + "grad_norm": 0.38436779379844666, + "learning_rate": 0.0008015126050420168, + "loss": 0.3745, + "step": 7192 + }, + { + "epoch": 4.0184357541899445, + "grad_norm": 0.5819133520126343, + "learning_rate": 0.000801484593837535, + "loss": 0.4057, + "step": 7193 + }, + { + "epoch": 4.018994413407821, + "grad_norm": 0.7214066386222839, + "learning_rate": 0.0008014565826330532, + "loss": 0.5086, + "step": 7194 + }, + { + "epoch": 4.0195530726256985, + "grad_norm": 0.4668353199958801, + "learning_rate": 0.0008014285714285714, + "loss": 0.4019, + "step": 7195 + }, + { + "epoch": 4.020111731843575, + "grad_norm": 0.6879488229751587, + "learning_rate": 0.0008014005602240897, + "loss": 0.379, + "step": 7196 + }, + { + "epoch": 4.0206703910614525, + "grad_norm": 0.7188233733177185, + "learning_rate": 0.0008013725490196078, + "loss": 0.3883, + "step": 7197 + }, + { + "epoch": 4.02122905027933, + "grad_norm": 0.4577838182449341, + "learning_rate": 0.000801344537815126, + "loss": 0.469, + "step": 7198 + }, + { + "epoch": 4.0217877094972065, + "grad_norm": 0.4861854910850525, + "learning_rate": 0.0008013165266106442, + "loss": 0.3752, + "step": 7199 + }, + { + "epoch": 4.022346368715084, + "grad_norm": 0.6420953869819641, + "learning_rate": 0.0008012885154061625, + "loss": 0.4756, + "step": 7200 + }, + { + "epoch": 4.022905027932961, + "grad_norm": 0.41528305411338806, + "learning_rate": 0.0008012605042016808, + "loss": 0.3527, + "step": 7201 + }, + { + "epoch": 4.023463687150838, + "grad_norm": 0.7263068556785583, + "learning_rate": 0.0008012324929971989, + "loss": 0.5823, + "step": 7202 + }, + { + "epoch": 4.0240223463687155, + "grad_norm": 0.5570254325866699, + "learning_rate": 0.0008012044817927171, + "loss": 0.4963, + "step": 7203 + }, + { + "epoch": 4.024581005586592, + "grad_norm": 0.5761282444000244, + "learning_rate": 0.0008011764705882353, + "loss": 0.3855, + "step": 7204 + }, + { + "epoch": 4.0251396648044695, + "grad_norm": 1.0314967632293701, + "learning_rate": 0.0008011484593837535, + "loss": 0.4605, + "step": 7205 + }, + { + "epoch": 4.025698324022346, + "grad_norm": 0.5157830119132996, + "learning_rate": 0.0008011204481792718, + "loss": 0.6801, + "step": 7206 + }, + { + "epoch": 4.0262569832402235, + "grad_norm": 0.5801483988761902, + "learning_rate": 0.00080109243697479, + "loss": 0.5254, + "step": 7207 + }, + { + "epoch": 4.026815642458101, + "grad_norm": 1.4835443496704102, + "learning_rate": 0.0008010644257703081, + "loss": 0.5329, + "step": 7208 + }, + { + "epoch": 4.0273743016759775, + "grad_norm": 0.48205339908599854, + "learning_rate": 0.0008010364145658263, + "loss": 0.4851, + "step": 7209 + }, + { + "epoch": 4.027932960893855, + "grad_norm": 0.5513060092926025, + "learning_rate": 0.0008010084033613445, + "loss": 0.4656, + "step": 7210 + }, + { + "epoch": 4.028491620111732, + "grad_norm": 0.703374981880188, + "learning_rate": 0.0008009803921568628, + "loss": 0.4033, + "step": 7211 + }, + { + "epoch": 4.029050279329609, + "grad_norm": 1.6137562990188599, + "learning_rate": 0.000800952380952381, + "loss": 0.5413, + "step": 7212 + }, + { + "epoch": 4.0296089385474865, + "grad_norm": 0.39633744955062866, + "learning_rate": 0.0008009243697478991, + "loss": 0.4198, + "step": 7213 + }, + { + "epoch": 4.030167597765363, + "grad_norm": 1.209067702293396, + "learning_rate": 0.0008008963585434173, + "loss": 0.5209, + "step": 7214 + }, + { + "epoch": 4.0307262569832405, + "grad_norm": 0.7100725769996643, + "learning_rate": 0.0008008683473389355, + "loss": 0.5627, + "step": 7215 + }, + { + "epoch": 4.031284916201117, + "grad_norm": 0.5386775732040405, + "learning_rate": 0.0008008403361344539, + "loss": 0.5655, + "step": 7216 + }, + { + "epoch": 4.0318435754189945, + "grad_norm": 0.5487700700759888, + "learning_rate": 0.0008008123249299721, + "loss": 0.4431, + "step": 7217 + }, + { + "epoch": 4.032402234636871, + "grad_norm": 0.4018005430698395, + "learning_rate": 0.0008007843137254902, + "loss": 0.3433, + "step": 7218 + }, + { + "epoch": 4.0329608938547485, + "grad_norm": 0.49806350469589233, + "learning_rate": 0.0008007563025210084, + "loss": 0.5024, + "step": 7219 + }, + { + "epoch": 4.033519553072626, + "grad_norm": 0.6888643503189087, + "learning_rate": 0.0008007282913165266, + "loss": 0.4066, + "step": 7220 + }, + { + "epoch": 4.034078212290503, + "grad_norm": 5.580629825592041, + "learning_rate": 0.0008007002801120449, + "loss": 0.5298, + "step": 7221 + }, + { + "epoch": 4.03463687150838, + "grad_norm": 0.8422892093658447, + "learning_rate": 0.0008006722689075631, + "loss": 0.4032, + "step": 7222 + }, + { + "epoch": 4.035195530726257, + "grad_norm": 0.5582205653190613, + "learning_rate": 0.0008006442577030813, + "loss": 0.3891, + "step": 7223 + }, + { + "epoch": 4.035754189944134, + "grad_norm": 2.858771562576294, + "learning_rate": 0.0008006162464985994, + "loss": 0.4365, + "step": 7224 + }, + { + "epoch": 4.0363128491620115, + "grad_norm": 0.8811796307563782, + "learning_rate": 0.0008005882352941176, + "loss": 0.5947, + "step": 7225 + }, + { + "epoch": 4.036871508379888, + "grad_norm": 0.5713050365447998, + "learning_rate": 0.0008005602240896359, + "loss": 0.4606, + "step": 7226 + }, + { + "epoch": 4.0374301675977655, + "grad_norm": 0.4544447064399719, + "learning_rate": 0.0008005322128851541, + "loss": 0.4567, + "step": 7227 + }, + { + "epoch": 4.037988826815642, + "grad_norm": 0.4898993670940399, + "learning_rate": 0.0008005042016806723, + "loss": 0.4513, + "step": 7228 + }, + { + "epoch": 4.0385474860335195, + "grad_norm": 0.4296659529209137, + "learning_rate": 0.0008004761904761904, + "loss": 0.3623, + "step": 7229 + }, + { + "epoch": 4.039106145251397, + "grad_norm": 0.5693150758743286, + "learning_rate": 0.0008004481792717086, + "loss": 0.5068, + "step": 7230 + }, + { + "epoch": 4.039664804469274, + "grad_norm": 0.5388212203979492, + "learning_rate": 0.000800420168067227, + "loss": 0.4271, + "step": 7231 + }, + { + "epoch": 4.040223463687151, + "grad_norm": 0.5692974328994751, + "learning_rate": 0.0008003921568627452, + "loss": 0.4069, + "step": 7232 + }, + { + "epoch": 4.040782122905028, + "grad_norm": 0.4252888262271881, + "learning_rate": 0.0008003641456582634, + "loss": 0.3949, + "step": 7233 + }, + { + "epoch": 4.041340782122905, + "grad_norm": 0.41795429587364197, + "learning_rate": 0.0008003361344537815, + "loss": 0.4216, + "step": 7234 + }, + { + "epoch": 4.0418994413407825, + "grad_norm": 0.5262228846549988, + "learning_rate": 0.0008003081232492997, + "loss": 0.4599, + "step": 7235 + }, + { + "epoch": 4.042458100558659, + "grad_norm": 0.7310873866081238, + "learning_rate": 0.000800280112044818, + "loss": 0.413, + "step": 7236 + }, + { + "epoch": 4.0430167597765365, + "grad_norm": 0.734113872051239, + "learning_rate": 0.0008002521008403362, + "loss": 0.4544, + "step": 7237 + }, + { + "epoch": 4.043575418994413, + "grad_norm": 0.5125024914741516, + "learning_rate": 0.0008002240896358544, + "loss": 0.3456, + "step": 7238 + }, + { + "epoch": 4.0441340782122905, + "grad_norm": 0.7371616363525391, + "learning_rate": 0.0008001960784313726, + "loss": 0.4838, + "step": 7239 + }, + { + "epoch": 4.044692737430168, + "grad_norm": 0.43709471821784973, + "learning_rate": 0.0008001680672268907, + "loss": 0.4565, + "step": 7240 + }, + { + "epoch": 4.045251396648045, + "grad_norm": 1.1052905321121216, + "learning_rate": 0.000800140056022409, + "loss": 0.3689, + "step": 7241 + }, + { + "epoch": 4.045810055865922, + "grad_norm": 0.48502761125564575, + "learning_rate": 0.0008001120448179272, + "loss": 0.4328, + "step": 7242 + }, + { + "epoch": 4.046368715083799, + "grad_norm": 0.5574578046798706, + "learning_rate": 0.0008000840336134454, + "loss": 0.4388, + "step": 7243 + }, + { + "epoch": 4.046927374301676, + "grad_norm": 2.0033042430877686, + "learning_rate": 0.0008000560224089636, + "loss": 0.3816, + "step": 7244 + }, + { + "epoch": 4.0474860335195535, + "grad_norm": 0.6378182172775269, + "learning_rate": 0.0008000280112044817, + "loss": 0.4358, + "step": 7245 + }, + { + "epoch": 4.04804469273743, + "grad_norm": 0.497710257768631, + "learning_rate": 0.0008, + "loss": 0.454, + "step": 7246 + }, + { + "epoch": 4.0486033519553075, + "grad_norm": 0.7362669110298157, + "learning_rate": 0.0007999719887955182, + "loss": 0.3975, + "step": 7247 + }, + { + "epoch": 4.049162011173184, + "grad_norm": 0.5754220485687256, + "learning_rate": 0.0007999439775910365, + "loss": 0.3739, + "step": 7248 + }, + { + "epoch": 4.0497206703910615, + "grad_norm": 0.9323798418045044, + "learning_rate": 0.0007999159663865547, + "loss": 0.4541, + "step": 7249 + }, + { + "epoch": 4.050279329608939, + "grad_norm": 0.45945262908935547, + "learning_rate": 0.0007998879551820728, + "loss": 0.3667, + "step": 7250 + }, + { + "epoch": 4.050837988826816, + "grad_norm": 0.4202263355255127, + "learning_rate": 0.0007998599439775911, + "loss": 0.4153, + "step": 7251 + }, + { + "epoch": 4.051396648044693, + "grad_norm": 0.4688238799571991, + "learning_rate": 0.0007998319327731093, + "loss": 0.3714, + "step": 7252 + }, + { + "epoch": 4.05195530726257, + "grad_norm": 0.5398940443992615, + "learning_rate": 0.0007998039215686275, + "loss": 0.2927, + "step": 7253 + }, + { + "epoch": 4.052513966480447, + "grad_norm": 0.9062156677246094, + "learning_rate": 0.0007997759103641457, + "loss": 0.4665, + "step": 7254 + }, + { + "epoch": 4.053072625698324, + "grad_norm": 0.4088192582130432, + "learning_rate": 0.0007997478991596639, + "loss": 0.4099, + "step": 7255 + }, + { + "epoch": 4.053631284916201, + "grad_norm": 0.44694456458091736, + "learning_rate": 0.0007997198879551821, + "loss": 0.4874, + "step": 7256 + }, + { + "epoch": 4.0541899441340785, + "grad_norm": 0.7342076897621155, + "learning_rate": 0.0007996918767507003, + "loss": 0.614, + "step": 7257 + }, + { + "epoch": 4.054748603351955, + "grad_norm": 0.5657563805580139, + "learning_rate": 0.0007996638655462185, + "loss": 0.4475, + "step": 7258 + }, + { + "epoch": 4.0553072625698325, + "grad_norm": 1.0953336954116821, + "learning_rate": 0.0007996358543417367, + "loss": 0.3817, + "step": 7259 + }, + { + "epoch": 4.055865921787709, + "grad_norm": 0.6628221273422241, + "learning_rate": 0.0007996078431372549, + "loss": 0.5107, + "step": 7260 + }, + { + "epoch": 4.056424581005587, + "grad_norm": 0.711455762386322, + "learning_rate": 0.0007995798319327731, + "loss": 0.4442, + "step": 7261 + }, + { + "epoch": 4.056983240223464, + "grad_norm": 0.589642345905304, + "learning_rate": 0.0007995518207282913, + "loss": 0.4535, + "step": 7262 + }, + { + "epoch": 4.057541899441341, + "grad_norm": 0.42798563838005066, + "learning_rate": 0.0007995238095238095, + "loss": 0.3706, + "step": 7263 + }, + { + "epoch": 4.058100558659218, + "grad_norm": 0.49190956354141235, + "learning_rate": 0.0007994957983193277, + "loss": 0.5598, + "step": 7264 + }, + { + "epoch": 4.058659217877095, + "grad_norm": 0.6466832756996155, + "learning_rate": 0.000799467787114846, + "loss": 0.4035, + "step": 7265 + }, + { + "epoch": 4.059217877094972, + "grad_norm": 0.7363291382789612, + "learning_rate": 0.0007994397759103642, + "loss": 0.3721, + "step": 7266 + }, + { + "epoch": 4.0597765363128495, + "grad_norm": 0.4434452950954437, + "learning_rate": 0.0007994117647058824, + "loss": 0.4365, + "step": 7267 + }, + { + "epoch": 4.060335195530726, + "grad_norm": 1.9777475595474243, + "learning_rate": 0.0007993837535014006, + "loss": 0.5794, + "step": 7268 + }, + { + "epoch": 4.0608938547486035, + "grad_norm": 0.4545868933200836, + "learning_rate": 0.0007993557422969188, + "loss": 0.373, + "step": 7269 + }, + { + "epoch": 4.06145251396648, + "grad_norm": 0.5304999351501465, + "learning_rate": 0.000799327731092437, + "loss": 0.5186, + "step": 7270 + }, + { + "epoch": 4.062011173184358, + "grad_norm": 0.4710586369037628, + "learning_rate": 0.0007992997198879553, + "loss": 0.4441, + "step": 7271 + }, + { + "epoch": 4.062569832402235, + "grad_norm": 0.5025726556777954, + "learning_rate": 0.0007992717086834734, + "loss": 0.5378, + "step": 7272 + }, + { + "epoch": 4.063128491620112, + "grad_norm": 0.7614200115203857, + "learning_rate": 0.0007992436974789916, + "loss": 0.4685, + "step": 7273 + }, + { + "epoch": 4.063687150837989, + "grad_norm": 0.6292237043380737, + "learning_rate": 0.0007992156862745098, + "loss": 0.5527, + "step": 7274 + }, + { + "epoch": 4.064245810055866, + "grad_norm": 0.5824453234672546, + "learning_rate": 0.000799187675070028, + "loss": 0.4934, + "step": 7275 + }, + { + "epoch": 4.064804469273743, + "grad_norm": 0.43286553025245667, + "learning_rate": 0.0007991596638655463, + "loss": 0.357, + "step": 7276 + }, + { + "epoch": 4.0653631284916205, + "grad_norm": 0.6792674660682678, + "learning_rate": 0.0007991316526610644, + "loss": 0.4923, + "step": 7277 + }, + { + "epoch": 4.065921787709497, + "grad_norm": 0.4438602924346924, + "learning_rate": 0.0007991036414565826, + "loss": 0.4477, + "step": 7278 + }, + { + "epoch": 4.0664804469273745, + "grad_norm": 0.47576025128364563, + "learning_rate": 0.0007990756302521008, + "loss": 0.5118, + "step": 7279 + }, + { + "epoch": 4.067039106145251, + "grad_norm": 0.6798005104064941, + "learning_rate": 0.000799047619047619, + "loss": 0.448, + "step": 7280 + }, + { + "epoch": 4.067597765363129, + "grad_norm": 0.5426568984985352, + "learning_rate": 0.0007990196078431374, + "loss": 0.3574, + "step": 7281 + }, + { + "epoch": 4.068156424581006, + "grad_norm": 0.5556155443191528, + "learning_rate": 0.0007989915966386555, + "loss": 0.4929, + "step": 7282 + }, + { + "epoch": 4.068715083798883, + "grad_norm": 0.5068587064743042, + "learning_rate": 0.0007989635854341737, + "loss": 0.4859, + "step": 7283 + }, + { + "epoch": 4.06927374301676, + "grad_norm": 0.6078105568885803, + "learning_rate": 0.0007989355742296919, + "loss": 0.4158, + "step": 7284 + }, + { + "epoch": 4.069832402234637, + "grad_norm": 2.5254971981048584, + "learning_rate": 0.0007989075630252101, + "loss": 0.3766, + "step": 7285 + }, + { + "epoch": 4.070391061452514, + "grad_norm": 0.4751959443092346, + "learning_rate": 0.0007988795518207284, + "loss": 0.4775, + "step": 7286 + }, + { + "epoch": 4.070949720670391, + "grad_norm": 0.8404338955879211, + "learning_rate": 0.0007988515406162466, + "loss": 0.4173, + "step": 7287 + }, + { + "epoch": 4.071508379888268, + "grad_norm": 0.4527067244052887, + "learning_rate": 0.0007988235294117647, + "loss": 0.3846, + "step": 7288 + }, + { + "epoch": 4.0720670391061455, + "grad_norm": 0.5585878491401672, + "learning_rate": 0.0007987955182072829, + "loss": 0.4817, + "step": 7289 + }, + { + "epoch": 4.072625698324022, + "grad_norm": 0.543225884437561, + "learning_rate": 0.0007987675070028011, + "loss": 0.4698, + "step": 7290 + }, + { + "epoch": 4.0731843575419, + "grad_norm": 0.6706470847129822, + "learning_rate": 0.0007987394957983194, + "loss": 0.5246, + "step": 7291 + }, + { + "epoch": 4.073743016759776, + "grad_norm": 0.5255739092826843, + "learning_rate": 0.0007987114845938376, + "loss": 0.5613, + "step": 7292 + }, + { + "epoch": 4.074301675977654, + "grad_norm": 0.8166330456733704, + "learning_rate": 0.0007986834733893557, + "loss": 0.5181, + "step": 7293 + }, + { + "epoch": 4.074860335195531, + "grad_norm": 0.4089374542236328, + "learning_rate": 0.0007986554621848739, + "loss": 0.4353, + "step": 7294 + }, + { + "epoch": 4.075418994413408, + "grad_norm": 0.43580949306488037, + "learning_rate": 0.0007986274509803921, + "loss": 0.4298, + "step": 7295 + }, + { + "epoch": 4.075977653631285, + "grad_norm": 0.5619518756866455, + "learning_rate": 0.0007985994397759104, + "loss": 0.4126, + "step": 7296 + }, + { + "epoch": 4.076536312849162, + "grad_norm": 0.6587222814559937, + "learning_rate": 0.0007985714285714287, + "loss": 0.5342, + "step": 7297 + }, + { + "epoch": 4.077094972067039, + "grad_norm": 0.7862265706062317, + "learning_rate": 0.0007985434173669468, + "loss": 0.4177, + "step": 7298 + }, + { + "epoch": 4.0776536312849165, + "grad_norm": 0.4547424018383026, + "learning_rate": 0.000798515406162465, + "loss": 0.4669, + "step": 7299 + }, + { + "epoch": 4.078212290502793, + "grad_norm": 0.5724118947982788, + "learning_rate": 0.0007984873949579832, + "loss": 0.5051, + "step": 7300 + }, + { + "epoch": 4.078770949720671, + "grad_norm": 0.41616714000701904, + "learning_rate": 0.0007984593837535015, + "loss": 0.4388, + "step": 7301 + }, + { + "epoch": 4.079329608938547, + "grad_norm": 0.5706039071083069, + "learning_rate": 0.0007984313725490197, + "loss": 0.4561, + "step": 7302 + }, + { + "epoch": 4.079888268156425, + "grad_norm": 0.8279286623001099, + "learning_rate": 0.0007984033613445379, + "loss": 0.4807, + "step": 7303 + }, + { + "epoch": 4.080446927374302, + "grad_norm": 0.5855101943016052, + "learning_rate": 0.000798375350140056, + "loss": 0.4687, + "step": 7304 + }, + { + "epoch": 4.081005586592179, + "grad_norm": 0.4317261576652527, + "learning_rate": 0.0007983473389355742, + "loss": 0.5011, + "step": 7305 + }, + { + "epoch": 4.081564245810056, + "grad_norm": 0.535336971282959, + "learning_rate": 0.0007983193277310925, + "loss": 0.5249, + "step": 7306 + }, + { + "epoch": 4.082122905027933, + "grad_norm": 0.6459070444107056, + "learning_rate": 0.0007982913165266107, + "loss": 0.4623, + "step": 7307 + }, + { + "epoch": 4.08268156424581, + "grad_norm": 0.37297797203063965, + "learning_rate": 0.0007982633053221289, + "loss": 0.4072, + "step": 7308 + }, + { + "epoch": 4.0832402234636875, + "grad_norm": 0.48094600439071655, + "learning_rate": 0.000798235294117647, + "loss": 0.4192, + "step": 7309 + }, + { + "epoch": 4.083798882681564, + "grad_norm": 0.5292118191719055, + "learning_rate": 0.0007982072829131652, + "loss": 0.4138, + "step": 7310 + }, + { + "epoch": 4.084357541899442, + "grad_norm": 0.4719665050506592, + "learning_rate": 0.0007981792717086835, + "loss": 0.3865, + "step": 7311 + }, + { + "epoch": 4.084916201117318, + "grad_norm": 0.4022238850593567, + "learning_rate": 0.0007981512605042017, + "loss": 0.3987, + "step": 7312 + }, + { + "epoch": 4.085474860335196, + "grad_norm": 0.38969457149505615, + "learning_rate": 0.00079812324929972, + "loss": 0.5041, + "step": 7313 + }, + { + "epoch": 4.086033519553073, + "grad_norm": 0.6393736600875854, + "learning_rate": 0.000798095238095238, + "loss": 0.4858, + "step": 7314 + }, + { + "epoch": 4.08659217877095, + "grad_norm": 0.4996229410171509, + "learning_rate": 0.0007980672268907563, + "loss": 0.4383, + "step": 7315 + }, + { + "epoch": 4.087150837988827, + "grad_norm": 0.9065155386924744, + "learning_rate": 0.0007980392156862746, + "loss": 0.6391, + "step": 7316 + }, + { + "epoch": 4.087709497206704, + "grad_norm": 0.49954017996788025, + "learning_rate": 0.0007980112044817928, + "loss": 0.5345, + "step": 7317 + }, + { + "epoch": 4.088268156424581, + "grad_norm": 0.4392645061016083, + "learning_rate": 0.000797983193277311, + "loss": 0.4522, + "step": 7318 + }, + { + "epoch": 4.0888268156424585, + "grad_norm": 0.8859760165214539, + "learning_rate": 0.0007979551820728292, + "loss": 0.3613, + "step": 7319 + }, + { + "epoch": 4.089385474860335, + "grad_norm": 0.6901087760925293, + "learning_rate": 0.0007979271708683473, + "loss": 0.428, + "step": 7320 + }, + { + "epoch": 4.089944134078213, + "grad_norm": 0.44879892468452454, + "learning_rate": 0.0007978991596638656, + "loss": 0.3359, + "step": 7321 + }, + { + "epoch": 4.090502793296089, + "grad_norm": 0.9341223239898682, + "learning_rate": 0.0007978711484593838, + "loss": 0.4614, + "step": 7322 + }, + { + "epoch": 4.091061452513967, + "grad_norm": 1.3537898063659668, + "learning_rate": 0.000797843137254902, + "loss": 0.423, + "step": 7323 + }, + { + "epoch": 4.091620111731843, + "grad_norm": 0.5254029035568237, + "learning_rate": 0.0007978151260504202, + "loss": 0.4986, + "step": 7324 + }, + { + "epoch": 4.092178770949721, + "grad_norm": 0.5618382692337036, + "learning_rate": 0.0007977871148459383, + "loss": 0.6393, + "step": 7325 + }, + { + "epoch": 4.092737430167598, + "grad_norm": 0.6128014922142029, + "learning_rate": 0.0007977591036414566, + "loss": 0.471, + "step": 7326 + }, + { + "epoch": 4.093296089385475, + "grad_norm": 0.46432483196258545, + "learning_rate": 0.0007977310924369748, + "loss": 0.3646, + "step": 7327 + }, + { + "epoch": 4.093854748603352, + "grad_norm": 0.5539751648902893, + "learning_rate": 0.000797703081232493, + "loss": 0.5108, + "step": 7328 + }, + { + "epoch": 4.094413407821229, + "grad_norm": 1.7766703367233276, + "learning_rate": 0.0007976750700280112, + "loss": 0.7217, + "step": 7329 + }, + { + "epoch": 4.094972067039106, + "grad_norm": 1.1616451740264893, + "learning_rate": 0.0007976470588235293, + "loss": 0.554, + "step": 7330 + }, + { + "epoch": 4.0955307262569836, + "grad_norm": 0.5152605772018433, + "learning_rate": 0.0007976190476190477, + "loss": 0.3991, + "step": 7331 + }, + { + "epoch": 4.09608938547486, + "grad_norm": 2.1343116760253906, + "learning_rate": 0.0007975910364145659, + "loss": 0.3697, + "step": 7332 + }, + { + "epoch": 4.096648044692738, + "grad_norm": 1.0786422491073608, + "learning_rate": 0.0007975630252100841, + "loss": 0.4623, + "step": 7333 + }, + { + "epoch": 4.097206703910614, + "grad_norm": 0.6544681787490845, + "learning_rate": 0.0007975350140056023, + "loss": 0.4705, + "step": 7334 + }, + { + "epoch": 4.097765363128492, + "grad_norm": 0.5635467767715454, + "learning_rate": 0.0007975070028011205, + "loss": 0.438, + "step": 7335 + }, + { + "epoch": 4.098324022346369, + "grad_norm": 1.2926359176635742, + "learning_rate": 0.0007974789915966387, + "loss": 0.4715, + "step": 7336 + }, + { + "epoch": 4.098882681564246, + "grad_norm": 0.9510411620140076, + "learning_rate": 0.0007974509803921569, + "loss": 0.5287, + "step": 7337 + }, + { + "epoch": 4.099441340782123, + "grad_norm": 0.5677906274795532, + "learning_rate": 0.0007974229691876751, + "loss": 0.5935, + "step": 7338 + }, + { + "epoch": 4.1, + "grad_norm": 1.177069902420044, + "learning_rate": 0.0007973949579831933, + "loss": 0.5339, + "step": 7339 + }, + { + "epoch": 4.100558659217877, + "grad_norm": 0.5134732723236084, + "learning_rate": 0.0007973669467787115, + "loss": 0.4403, + "step": 7340 + }, + { + "epoch": 4.1011173184357546, + "grad_norm": 1.2201228141784668, + "learning_rate": 0.0007973389355742297, + "loss": 0.446, + "step": 7341 + }, + { + "epoch": 4.101675977653631, + "grad_norm": 0.4552845060825348, + "learning_rate": 0.0007973109243697479, + "loss": 0.4449, + "step": 7342 + }, + { + "epoch": 4.102234636871509, + "grad_norm": 0.5437498688697815, + "learning_rate": 0.0007972829131652661, + "loss": 0.3812, + "step": 7343 + }, + { + "epoch": 4.102793296089385, + "grad_norm": 0.5181490778923035, + "learning_rate": 0.0007972549019607843, + "loss": 0.3754, + "step": 7344 + }, + { + "epoch": 4.103351955307263, + "grad_norm": 0.6079907417297363, + "learning_rate": 0.0007972268907563025, + "loss": 0.4544, + "step": 7345 + }, + { + "epoch": 4.10391061452514, + "grad_norm": 3.3960683345794678, + "learning_rate": 0.0007971988795518207, + "loss": 0.5105, + "step": 7346 + }, + { + "epoch": 4.104469273743017, + "grad_norm": 0.6494735479354858, + "learning_rate": 0.000797170868347339, + "loss": 0.4883, + "step": 7347 + }, + { + "epoch": 4.105027932960894, + "grad_norm": 0.4463353157043457, + "learning_rate": 0.0007971428571428572, + "loss": 0.4701, + "step": 7348 + }, + { + "epoch": 4.105586592178771, + "grad_norm": 0.7758894562721252, + "learning_rate": 0.0007971148459383754, + "loss": 0.4728, + "step": 7349 + }, + { + "epoch": 4.106145251396648, + "grad_norm": 0.394568532705307, + "learning_rate": 0.0007970868347338936, + "loss": 0.3183, + "step": 7350 + }, + { + "epoch": 4.1067039106145256, + "grad_norm": 0.441532701253891, + "learning_rate": 0.0007970588235294119, + "loss": 0.5152, + "step": 7351 + }, + { + "epoch": 4.107262569832402, + "grad_norm": 0.4544018507003784, + "learning_rate": 0.00079703081232493, + "loss": 0.5499, + "step": 7352 + }, + { + "epoch": 4.10782122905028, + "grad_norm": 0.5740039944648743, + "learning_rate": 0.0007970028011204482, + "loss": 0.4844, + "step": 7353 + }, + { + "epoch": 4.108379888268156, + "grad_norm": 0.7321734428405762, + "learning_rate": 0.0007969747899159664, + "loss": 0.5316, + "step": 7354 + }, + { + "epoch": 4.108938547486034, + "grad_norm": 0.5580518245697021, + "learning_rate": 0.0007969467787114846, + "loss": 0.4511, + "step": 7355 + }, + { + "epoch": 4.10949720670391, + "grad_norm": 0.5594097971916199, + "learning_rate": 0.0007969187675070029, + "loss": 0.6818, + "step": 7356 + }, + { + "epoch": 4.110055865921788, + "grad_norm": 0.6603025794029236, + "learning_rate": 0.000796890756302521, + "loss": 0.4692, + "step": 7357 + }, + { + "epoch": 4.110614525139665, + "grad_norm": 0.5391709804534912, + "learning_rate": 0.0007968627450980392, + "loss": 0.3589, + "step": 7358 + }, + { + "epoch": 4.111173184357542, + "grad_norm": 1.7947614192962646, + "learning_rate": 0.0007968347338935574, + "loss": 0.5526, + "step": 7359 + }, + { + "epoch": 4.111731843575419, + "grad_norm": 0.5025770664215088, + "learning_rate": 0.0007968067226890756, + "loss": 0.5131, + "step": 7360 + }, + { + "epoch": 4.112290502793296, + "grad_norm": 0.7856631278991699, + "learning_rate": 0.0007967787114845938, + "loss": 0.5122, + "step": 7361 + }, + { + "epoch": 4.112849162011173, + "grad_norm": 0.5644674897193909, + "learning_rate": 0.000796750700280112, + "loss": 0.4929, + "step": 7362 + }, + { + "epoch": 4.113407821229051, + "grad_norm": 0.6875373125076294, + "learning_rate": 0.0007967226890756302, + "loss": 0.3594, + "step": 7363 + }, + { + "epoch": 4.113966480446927, + "grad_norm": 0.5161254405975342, + "learning_rate": 0.0007966946778711485, + "loss": 0.5745, + "step": 7364 + }, + { + "epoch": 4.114525139664805, + "grad_norm": 0.9543882012367249, + "learning_rate": 0.0007966666666666667, + "loss": 0.4865, + "step": 7365 + }, + { + "epoch": 4.115083798882681, + "grad_norm": 0.9078003764152527, + "learning_rate": 0.0007966386554621849, + "loss": 0.4281, + "step": 7366 + }, + { + "epoch": 4.115642458100559, + "grad_norm": 0.7967604994773865, + "learning_rate": 0.0007966106442577032, + "loss": 0.5012, + "step": 7367 + }, + { + "epoch": 4.116201117318436, + "grad_norm": 0.8606827855110168, + "learning_rate": 0.0007965826330532213, + "loss": 0.4875, + "step": 7368 + }, + { + "epoch": 4.116759776536313, + "grad_norm": 0.47089481353759766, + "learning_rate": 0.0007965546218487395, + "loss": 0.4383, + "step": 7369 + }, + { + "epoch": 4.11731843575419, + "grad_norm": 0.548164427280426, + "learning_rate": 0.0007965266106442577, + "loss": 0.4271, + "step": 7370 + }, + { + "epoch": 4.117877094972067, + "grad_norm": 1.3291031122207642, + "learning_rate": 0.0007964985994397759, + "loss": 0.5259, + "step": 7371 + }, + { + "epoch": 4.118435754189944, + "grad_norm": 0.7199836373329163, + "learning_rate": 0.0007964705882352942, + "loss": 0.6184, + "step": 7372 + }, + { + "epoch": 4.118994413407822, + "grad_norm": 0.5368474125862122, + "learning_rate": 0.0007964425770308123, + "loss": 0.4822, + "step": 7373 + }, + { + "epoch": 4.119553072625698, + "grad_norm": 0.6486384868621826, + "learning_rate": 0.0007964145658263305, + "loss": 0.4145, + "step": 7374 + }, + { + "epoch": 4.120111731843576, + "grad_norm": 0.5339905619621277, + "learning_rate": 0.0007963865546218487, + "loss": 0.4962, + "step": 7375 + }, + { + "epoch": 4.120670391061452, + "grad_norm": 0.514297366142273, + "learning_rate": 0.0007963585434173669, + "loss": 0.4312, + "step": 7376 + }, + { + "epoch": 4.12122905027933, + "grad_norm": 0.96578049659729, + "learning_rate": 0.0007963305322128852, + "loss": 0.4341, + "step": 7377 + }, + { + "epoch": 4.121787709497207, + "grad_norm": 0.557553768157959, + "learning_rate": 0.0007963025210084033, + "loss": 0.4634, + "step": 7378 + }, + { + "epoch": 4.122346368715084, + "grad_norm": 0.4616701304912567, + "learning_rate": 0.0007962745098039215, + "loss": 0.4096, + "step": 7379 + }, + { + "epoch": 4.122905027932961, + "grad_norm": 0.5880672335624695, + "learning_rate": 0.0007962464985994398, + "loss": 0.4552, + "step": 7380 + }, + { + "epoch": 4.123463687150838, + "grad_norm": 0.976950466632843, + "learning_rate": 0.000796218487394958, + "loss": 0.7099, + "step": 7381 + }, + { + "epoch": 4.124022346368715, + "grad_norm": 0.8104521632194519, + "learning_rate": 0.0007961904761904763, + "loss": 0.5674, + "step": 7382 + }, + { + "epoch": 4.124581005586593, + "grad_norm": 0.49486520886421204, + "learning_rate": 0.0007961624649859945, + "loss": 0.42, + "step": 7383 + }, + { + "epoch": 4.125139664804469, + "grad_norm": 2.083012104034424, + "learning_rate": 0.0007961344537815126, + "loss": 0.4395, + "step": 7384 + }, + { + "epoch": 4.125698324022347, + "grad_norm": 0.9928786754608154, + "learning_rate": 0.0007961064425770308, + "loss": 0.3935, + "step": 7385 + }, + { + "epoch": 4.126256983240223, + "grad_norm": 0.7612307667732239, + "learning_rate": 0.000796078431372549, + "loss": 0.3452, + "step": 7386 + }, + { + "epoch": 4.126815642458101, + "grad_norm": 0.45569178462028503, + "learning_rate": 0.0007960504201680673, + "loss": 0.4506, + "step": 7387 + }, + { + "epoch": 4.127374301675978, + "grad_norm": 1.1247708797454834, + "learning_rate": 0.0007960224089635855, + "loss": 0.4767, + "step": 7388 + }, + { + "epoch": 4.127932960893855, + "grad_norm": 0.6095284819602966, + "learning_rate": 0.0007959943977591036, + "loss": 0.4221, + "step": 7389 + }, + { + "epoch": 4.128491620111732, + "grad_norm": 0.7278775572776794, + "learning_rate": 0.0007959663865546218, + "loss": 0.4723, + "step": 7390 + }, + { + "epoch": 4.129050279329609, + "grad_norm": 0.4910142123699188, + "learning_rate": 0.00079593837535014, + "loss": 0.4914, + "step": 7391 + }, + { + "epoch": 4.129608938547486, + "grad_norm": 0.538343071937561, + "learning_rate": 0.0007959103641456583, + "loss": 0.4726, + "step": 7392 + }, + { + "epoch": 4.130167597765363, + "grad_norm": 1.9881123304367065, + "learning_rate": 0.0007958823529411765, + "loss": 0.4276, + "step": 7393 + }, + { + "epoch": 4.13072625698324, + "grad_norm": 0.47892946004867554, + "learning_rate": 0.0007958543417366946, + "loss": 0.4023, + "step": 7394 + }, + { + "epoch": 4.131284916201118, + "grad_norm": 0.468246728181839, + "learning_rate": 0.0007958263305322128, + "loss": 0.4374, + "step": 7395 + }, + { + "epoch": 4.131843575418994, + "grad_norm": 0.4916151762008667, + "learning_rate": 0.000795798319327731, + "loss": 0.4503, + "step": 7396 + }, + { + "epoch": 4.132402234636872, + "grad_norm": 0.7064822316169739, + "learning_rate": 0.0007957703081232494, + "loss": 0.5386, + "step": 7397 + }, + { + "epoch": 4.132960893854748, + "grad_norm": 3.0665736198425293, + "learning_rate": 0.0007957422969187676, + "loss": 0.5213, + "step": 7398 + }, + { + "epoch": 4.133519553072626, + "grad_norm": 0.5969092845916748, + "learning_rate": 0.0007957142857142858, + "loss": 0.6016, + "step": 7399 + }, + { + "epoch": 4.134078212290503, + "grad_norm": 0.8750990629196167, + "learning_rate": 0.0007956862745098039, + "loss": 0.3964, + "step": 7400 + }, + { + "epoch": 4.13463687150838, + "grad_norm": 1.998945951461792, + "learning_rate": 0.0007956582633053221, + "loss": 0.4764, + "step": 7401 + }, + { + "epoch": 4.135195530726257, + "grad_norm": 1.0746474266052246, + "learning_rate": 0.0007956302521008404, + "loss": 0.4393, + "step": 7402 + }, + { + "epoch": 4.135754189944134, + "grad_norm": 0.5045647621154785, + "learning_rate": 0.0007956022408963586, + "loss": 0.4711, + "step": 7403 + }, + { + "epoch": 4.136312849162011, + "grad_norm": 0.4553874731063843, + "learning_rate": 0.0007955742296918768, + "loss": 0.4447, + "step": 7404 + }, + { + "epoch": 4.136871508379889, + "grad_norm": 0.4946921467781067, + "learning_rate": 0.0007955462184873949, + "loss": 0.3933, + "step": 7405 + }, + { + "epoch": 4.137430167597765, + "grad_norm": 0.6165663003921509, + "learning_rate": 0.0007955182072829131, + "loss": 0.4334, + "step": 7406 + }, + { + "epoch": 4.137988826815643, + "grad_norm": 0.46721312403678894, + "learning_rate": 0.0007954901960784314, + "loss": 0.4439, + "step": 7407 + }, + { + "epoch": 4.138547486033519, + "grad_norm": 0.686400294303894, + "learning_rate": 0.0007954621848739496, + "loss": 0.4862, + "step": 7408 + }, + { + "epoch": 4.139106145251397, + "grad_norm": 1.278274416923523, + "learning_rate": 0.0007954341736694678, + "loss": 0.5088, + "step": 7409 + }, + { + "epoch": 4.139664804469274, + "grad_norm": 0.7448779940605164, + "learning_rate": 0.0007954061624649859, + "loss": 0.5026, + "step": 7410 + }, + { + "epoch": 4.140223463687151, + "grad_norm": 2.245076894760132, + "learning_rate": 0.0007953781512605041, + "loss": 0.4031, + "step": 7411 + }, + { + "epoch": 4.140782122905028, + "grad_norm": 0.8871789574623108, + "learning_rate": 0.0007953501400560225, + "loss": 0.5067, + "step": 7412 + }, + { + "epoch": 4.141340782122905, + "grad_norm": 1.5357602834701538, + "learning_rate": 0.0007953221288515407, + "loss": 0.4572, + "step": 7413 + }, + { + "epoch": 4.141899441340782, + "grad_norm": 1.0998822450637817, + "learning_rate": 0.0007952941176470589, + "loss": 0.4849, + "step": 7414 + }, + { + "epoch": 4.14245810055866, + "grad_norm": 0.7770353555679321, + "learning_rate": 0.0007952661064425771, + "loss": 0.5473, + "step": 7415 + }, + { + "epoch": 4.143016759776536, + "grad_norm": 0.3788093030452728, + "learning_rate": 0.0007952380952380952, + "loss": 0.3726, + "step": 7416 + }, + { + "epoch": 4.143575418994414, + "grad_norm": 0.7680646777153015, + "learning_rate": 0.0007952100840336135, + "loss": 0.4861, + "step": 7417 + }, + { + "epoch": 4.14413407821229, + "grad_norm": 0.8892258405685425, + "learning_rate": 0.0007951820728291317, + "loss": 0.5427, + "step": 7418 + }, + { + "epoch": 4.144692737430168, + "grad_norm": 3.658684015274048, + "learning_rate": 0.0007951540616246499, + "loss": 0.4733, + "step": 7419 + }, + { + "epoch": 4.145251396648045, + "grad_norm": 0.9445491433143616, + "learning_rate": 0.0007951260504201681, + "loss": 0.3908, + "step": 7420 + }, + { + "epoch": 4.145810055865922, + "grad_norm": 0.44519010186195374, + "learning_rate": 0.0007950980392156862, + "loss": 0.5001, + "step": 7421 + }, + { + "epoch": 4.146368715083799, + "grad_norm": 1.0370937585830688, + "learning_rate": 0.0007950700280112045, + "loss": 0.5082, + "step": 7422 + }, + { + "epoch": 4.146927374301676, + "grad_norm": 0.6180882453918457, + "learning_rate": 0.0007950420168067227, + "loss": 0.4369, + "step": 7423 + }, + { + "epoch": 4.147486033519553, + "grad_norm": 1.2745493650436401, + "learning_rate": 0.0007950140056022409, + "loss": 0.4133, + "step": 7424 + }, + { + "epoch": 4.148044692737431, + "grad_norm": 0.6371618509292603, + "learning_rate": 0.0007949859943977591, + "loss": 0.6168, + "step": 7425 + }, + { + "epoch": 4.148603351955307, + "grad_norm": 0.5083094239234924, + "learning_rate": 0.0007949579831932772, + "loss": 0.5119, + "step": 7426 + }, + { + "epoch": 4.149162011173185, + "grad_norm": 1.522736668586731, + "learning_rate": 0.0007949299719887955, + "loss": 0.8669, + "step": 7427 + }, + { + "epoch": 4.149720670391061, + "grad_norm": 0.6432474851608276, + "learning_rate": 0.0007949019607843137, + "loss": 0.4847, + "step": 7428 + }, + { + "epoch": 4.150279329608939, + "grad_norm": 0.6489638090133667, + "learning_rate": 0.000794873949579832, + "loss": 0.4549, + "step": 7429 + }, + { + "epoch": 4.150837988826815, + "grad_norm": 0.603775143623352, + "learning_rate": 0.0007948459383753502, + "loss": 0.7055, + "step": 7430 + }, + { + "epoch": 4.151396648044693, + "grad_norm": 0.5119211077690125, + "learning_rate": 0.0007948179271708684, + "loss": 0.4463, + "step": 7431 + }, + { + "epoch": 4.15195530726257, + "grad_norm": 0.3933148980140686, + "learning_rate": 0.0007947899159663866, + "loss": 0.4412, + "step": 7432 + }, + { + "epoch": 4.152513966480447, + "grad_norm": 0.7747071385383606, + "learning_rate": 0.0007947619047619048, + "loss": 0.4774, + "step": 7433 + }, + { + "epoch": 4.153072625698324, + "grad_norm": 0.5461366772651672, + "learning_rate": 0.000794733893557423, + "loss": 0.542, + "step": 7434 + }, + { + "epoch": 4.153631284916201, + "grad_norm": 0.7179622650146484, + "learning_rate": 0.0007947058823529412, + "loss": 0.6526, + "step": 7435 + }, + { + "epoch": 4.154189944134078, + "grad_norm": 0.5717280507087708, + "learning_rate": 0.0007946778711484594, + "loss": 0.4339, + "step": 7436 + }, + { + "epoch": 4.154748603351956, + "grad_norm": 1.7125846147537231, + "learning_rate": 0.0007946498599439776, + "loss": 0.5721, + "step": 7437 + }, + { + "epoch": 4.155307262569832, + "grad_norm": 1.3465338945388794, + "learning_rate": 0.0007946218487394958, + "loss": 0.4633, + "step": 7438 + }, + { + "epoch": 4.15586592178771, + "grad_norm": 0.47714072465896606, + "learning_rate": 0.000794593837535014, + "loss": 0.4016, + "step": 7439 + }, + { + "epoch": 4.156424581005586, + "grad_norm": 0.6180019974708557, + "learning_rate": 0.0007945658263305322, + "loss": 0.4955, + "step": 7440 + }, + { + "epoch": 4.156983240223464, + "grad_norm": 1.0105268955230713, + "learning_rate": 0.0007945378151260504, + "loss": 0.4447, + "step": 7441 + }, + { + "epoch": 4.157541899441341, + "grad_norm": 1.1082491874694824, + "learning_rate": 0.0007945098039215687, + "loss": 0.5251, + "step": 7442 + }, + { + "epoch": 4.158100558659218, + "grad_norm": 1.6552913188934326, + "learning_rate": 0.0007944817927170868, + "loss": 0.4858, + "step": 7443 + }, + { + "epoch": 4.158659217877095, + "grad_norm": 2.7462158203125, + "learning_rate": 0.000794453781512605, + "loss": 0.4481, + "step": 7444 + }, + { + "epoch": 4.159217877094972, + "grad_norm": 0.5992055535316467, + "learning_rate": 0.0007944257703081232, + "loss": 0.4226, + "step": 7445 + }, + { + "epoch": 4.159776536312849, + "grad_norm": 0.5990080833435059, + "learning_rate": 0.0007943977591036415, + "loss": 0.436, + "step": 7446 + }, + { + "epoch": 4.160335195530727, + "grad_norm": 0.5530053377151489, + "learning_rate": 0.0007943697478991598, + "loss": 0.4323, + "step": 7447 + }, + { + "epoch": 4.160893854748603, + "grad_norm": 0.4704790711402893, + "learning_rate": 0.0007943417366946779, + "loss": 0.4573, + "step": 7448 + }, + { + "epoch": 4.161452513966481, + "grad_norm": 0.5873286724090576, + "learning_rate": 0.0007943137254901961, + "loss": 0.4024, + "step": 7449 + }, + { + "epoch": 4.162011173184357, + "grad_norm": 5.973817825317383, + "learning_rate": 0.0007942857142857143, + "loss": 0.4604, + "step": 7450 + }, + { + "epoch": 4.162569832402235, + "grad_norm": 0.5752110481262207, + "learning_rate": 0.0007942577030812325, + "loss": 0.3965, + "step": 7451 + }, + { + "epoch": 4.163128491620112, + "grad_norm": 0.4331759810447693, + "learning_rate": 0.0007942296918767508, + "loss": 0.4327, + "step": 7452 + }, + { + "epoch": 4.163687150837989, + "grad_norm": 0.749707043170929, + "learning_rate": 0.0007942016806722689, + "loss": 0.5496, + "step": 7453 + }, + { + "epoch": 4.164245810055866, + "grad_norm": 0.5506083965301514, + "learning_rate": 0.0007941736694677871, + "loss": 0.701, + "step": 7454 + }, + { + "epoch": 4.164804469273743, + "grad_norm": 0.7257410287857056, + "learning_rate": 0.0007941456582633053, + "loss": 0.5343, + "step": 7455 + }, + { + "epoch": 4.16536312849162, + "grad_norm": 11.58082103729248, + "learning_rate": 0.0007941176470588235, + "loss": 0.6011, + "step": 7456 + }, + { + "epoch": 4.165921787709498, + "grad_norm": 0.7507224082946777, + "learning_rate": 0.0007940896358543418, + "loss": 0.3415, + "step": 7457 + }, + { + "epoch": 4.166480446927374, + "grad_norm": 0.7059439420700073, + "learning_rate": 0.00079406162464986, + "loss": 0.4563, + "step": 7458 + }, + { + "epoch": 4.167039106145252, + "grad_norm": 0.4393032193183899, + "learning_rate": 0.0007940336134453781, + "loss": 0.5158, + "step": 7459 + }, + { + "epoch": 4.167597765363128, + "grad_norm": 0.7159056663513184, + "learning_rate": 0.0007940056022408963, + "loss": 0.4362, + "step": 7460 + }, + { + "epoch": 4.168156424581006, + "grad_norm": 0.481918603181839, + "learning_rate": 0.0007939775910364145, + "loss": 0.3555, + "step": 7461 + }, + { + "epoch": 4.168715083798883, + "grad_norm": 0.5716583728790283, + "learning_rate": 0.0007939495798319329, + "loss": 0.4048, + "step": 7462 + }, + { + "epoch": 4.16927374301676, + "grad_norm": 0.5609585046768188, + "learning_rate": 0.0007939215686274511, + "loss": 0.5613, + "step": 7463 + }, + { + "epoch": 4.169832402234637, + "grad_norm": 0.528323769569397, + "learning_rate": 0.0007938935574229692, + "loss": 0.3869, + "step": 7464 + }, + { + "epoch": 4.170391061452514, + "grad_norm": 0.45341894030570984, + "learning_rate": 0.0007938655462184874, + "loss": 0.4711, + "step": 7465 + }, + { + "epoch": 4.170949720670391, + "grad_norm": 0.5705549716949463, + "learning_rate": 0.0007938375350140056, + "loss": 0.4762, + "step": 7466 + }, + { + "epoch": 4.171508379888268, + "grad_norm": 0.5936793088912964, + "learning_rate": 0.0007938095238095239, + "loss": 0.3704, + "step": 7467 + }, + { + "epoch": 4.172067039106145, + "grad_norm": 0.4085972309112549, + "learning_rate": 0.0007937815126050421, + "loss": 0.4892, + "step": 7468 + }, + { + "epoch": 4.172625698324023, + "grad_norm": 0.5726138949394226, + "learning_rate": 0.0007937535014005602, + "loss": 0.4411, + "step": 7469 + }, + { + "epoch": 4.173184357541899, + "grad_norm": 0.38425758481025696, + "learning_rate": 0.0007937254901960784, + "loss": 0.4204, + "step": 7470 + }, + { + "epoch": 4.173743016759777, + "grad_norm": 0.4055267572402954, + "learning_rate": 0.0007936974789915966, + "loss": 0.4325, + "step": 7471 + }, + { + "epoch": 4.174301675977653, + "grad_norm": 2.616147756576538, + "learning_rate": 0.0007936694677871149, + "loss": 0.5505, + "step": 7472 + }, + { + "epoch": 4.174860335195531, + "grad_norm": 0.8516884446144104, + "learning_rate": 0.0007936414565826331, + "loss": 0.3946, + "step": 7473 + }, + { + "epoch": 4.175418994413408, + "grad_norm": 2.598735809326172, + "learning_rate": 0.0007936134453781513, + "loss": 0.347, + "step": 7474 + }, + { + "epoch": 4.175977653631285, + "grad_norm": 0.6081153154373169, + "learning_rate": 0.0007935854341736694, + "loss": 0.5275, + "step": 7475 + }, + { + "epoch": 4.176536312849162, + "grad_norm": 0.6708801984786987, + "learning_rate": 0.0007935574229691876, + "loss": 0.5353, + "step": 7476 + }, + { + "epoch": 4.177094972067039, + "grad_norm": 0.4806496202945709, + "learning_rate": 0.000793529411764706, + "loss": 0.4697, + "step": 7477 + }, + { + "epoch": 4.177653631284916, + "grad_norm": 1.1978561878204346, + "learning_rate": 0.0007935014005602242, + "loss": 0.5459, + "step": 7478 + }, + { + "epoch": 4.178212290502794, + "grad_norm": 0.4890676438808441, + "learning_rate": 0.0007934733893557424, + "loss": 0.4394, + "step": 7479 + }, + { + "epoch": 4.17877094972067, + "grad_norm": 0.6475544571876526, + "learning_rate": 0.0007934453781512605, + "loss": 0.4209, + "step": 7480 + }, + { + "epoch": 4.179329608938548, + "grad_norm": 0.4206632673740387, + "learning_rate": 0.0007934173669467787, + "loss": 0.3716, + "step": 7481 + }, + { + "epoch": 4.179888268156424, + "grad_norm": 0.4854796230792999, + "learning_rate": 0.000793389355742297, + "loss": 0.3513, + "step": 7482 + }, + { + "epoch": 4.180446927374302, + "grad_norm": 0.5116716027259827, + "learning_rate": 0.0007933613445378152, + "loss": 0.4828, + "step": 7483 + }, + { + "epoch": 4.181005586592179, + "grad_norm": 0.44999393820762634, + "learning_rate": 0.0007933333333333334, + "loss": 0.4766, + "step": 7484 + }, + { + "epoch": 4.181564245810056, + "grad_norm": 0.48781394958496094, + "learning_rate": 0.0007933053221288515, + "loss": 0.4577, + "step": 7485 + }, + { + "epoch": 4.182122905027933, + "grad_norm": 0.5214735269546509, + "learning_rate": 0.0007932773109243697, + "loss": 0.4235, + "step": 7486 + }, + { + "epoch": 4.18268156424581, + "grad_norm": 0.5006760358810425, + "learning_rate": 0.000793249299719888, + "loss": 0.5185, + "step": 7487 + }, + { + "epoch": 4.183240223463687, + "grad_norm": 0.9785987734794617, + "learning_rate": 0.0007932212885154062, + "loss": 0.4112, + "step": 7488 + }, + { + "epoch": 4.183798882681565, + "grad_norm": 0.6362124681472778, + "learning_rate": 0.0007931932773109244, + "loss": 0.5654, + "step": 7489 + }, + { + "epoch": 4.184357541899441, + "grad_norm": 0.4772784113883972, + "learning_rate": 0.0007931652661064426, + "loss": 0.4175, + "step": 7490 + }, + { + "epoch": 4.184916201117319, + "grad_norm": 0.4467301368713379, + "learning_rate": 0.0007931372549019607, + "loss": 0.4848, + "step": 7491 + }, + { + "epoch": 4.185474860335195, + "grad_norm": 0.6502392292022705, + "learning_rate": 0.000793109243697479, + "loss": 0.5171, + "step": 7492 + }, + { + "epoch": 4.186033519553073, + "grad_norm": 0.48810911178588867, + "learning_rate": 0.0007930812324929972, + "loss": 0.4794, + "step": 7493 + }, + { + "epoch": 4.18659217877095, + "grad_norm": 0.573898434638977, + "learning_rate": 0.0007930532212885155, + "loss": 0.3721, + "step": 7494 + }, + { + "epoch": 4.187150837988827, + "grad_norm": 0.47238484025001526, + "learning_rate": 0.0007930252100840337, + "loss": 0.5291, + "step": 7495 + }, + { + "epoch": 4.187709497206704, + "grad_norm": 0.5235174894332886, + "learning_rate": 0.0007929971988795518, + "loss": 0.384, + "step": 7496 + }, + { + "epoch": 4.188268156424581, + "grad_norm": 0.44877365231513977, + "learning_rate": 0.0007929691876750701, + "loss": 0.4032, + "step": 7497 + }, + { + "epoch": 4.188826815642458, + "grad_norm": 0.4492464065551758, + "learning_rate": 0.0007929411764705883, + "loss": 0.3234, + "step": 7498 + }, + { + "epoch": 4.189385474860336, + "grad_norm": 0.5851590037345886, + "learning_rate": 0.0007929131652661065, + "loss": 0.4365, + "step": 7499 + }, + { + "epoch": 4.189944134078212, + "grad_norm": 0.6477049589157104, + "learning_rate": 0.0007928851540616247, + "loss": 0.5514, + "step": 7500 + }, + { + "epoch": 4.189944134078212, + "eval_cer": 0.09442680545099451, + "eval_loss": 0.354936420917511, + "eval_runtime": 57.7153, + "eval_samples_per_second": 78.627, + "eval_steps_per_second": 4.921, + "eval_wer": 0.37376050948297534, + "step": 7500 + }, + { + "epoch": 4.19050279329609, + "grad_norm": 0.5460861921310425, + "learning_rate": 0.0007928571428571428, + "loss": 0.5029, + "step": 7501 + }, + { + "epoch": 4.191061452513966, + "grad_norm": 1.2369848489761353, + "learning_rate": 0.0007928291316526611, + "loss": 0.4525, + "step": 7502 + }, + { + "epoch": 4.191620111731844, + "grad_norm": 0.6093453764915466, + "learning_rate": 0.0007928011204481793, + "loss": 0.4699, + "step": 7503 + }, + { + "epoch": 4.19217877094972, + "grad_norm": 1.9455437660217285, + "learning_rate": 0.0007927731092436975, + "loss": 0.4312, + "step": 7504 + }, + { + "epoch": 4.192737430167598, + "grad_norm": 0.47835955023765564, + "learning_rate": 0.0007927450980392157, + "loss": 0.4457, + "step": 7505 + }, + { + "epoch": 4.193296089385475, + "grad_norm": 0.7246494293212891, + "learning_rate": 0.0007927170868347339, + "loss": 0.3625, + "step": 7506 + }, + { + "epoch": 4.193854748603352, + "grad_norm": 0.7044689655303955, + "learning_rate": 0.0007926890756302521, + "loss": 0.5968, + "step": 7507 + }, + { + "epoch": 4.194413407821229, + "grad_norm": 0.4945489764213562, + "learning_rate": 0.0007926610644257703, + "loss": 0.3848, + "step": 7508 + }, + { + "epoch": 4.194972067039106, + "grad_norm": 0.45708316564559937, + "learning_rate": 0.0007926330532212885, + "loss": 0.4053, + "step": 7509 + }, + { + "epoch": 4.195530726256983, + "grad_norm": 0.8317373991012573, + "learning_rate": 0.0007926050420168067, + "loss": 0.4273, + "step": 7510 + }, + { + "epoch": 4.196089385474861, + "grad_norm": 0.46699386835098267, + "learning_rate": 0.000792577030812325, + "loss": 0.4763, + "step": 7511 + }, + { + "epoch": 4.196648044692737, + "grad_norm": 1.889947533607483, + "learning_rate": 0.0007925490196078432, + "loss": 0.4931, + "step": 7512 + }, + { + "epoch": 4.197206703910615, + "grad_norm": 0.5027772188186646, + "learning_rate": 0.0007925210084033614, + "loss": 0.4235, + "step": 7513 + }, + { + "epoch": 4.197765363128491, + "grad_norm": 0.5661888122558594, + "learning_rate": 0.0007924929971988796, + "loss": 0.7281, + "step": 7514 + }, + { + "epoch": 4.198324022346369, + "grad_norm": 0.5596293807029724, + "learning_rate": 0.0007924649859943978, + "loss": 0.5043, + "step": 7515 + }, + { + "epoch": 4.198882681564246, + "grad_norm": 0.4898509085178375, + "learning_rate": 0.000792436974789916, + "loss": 0.3877, + "step": 7516 + }, + { + "epoch": 4.199441340782123, + "grad_norm": 1.6684132814407349, + "learning_rate": 0.0007924089635854342, + "loss": 0.4171, + "step": 7517 + }, + { + "epoch": 4.2, + "grad_norm": 0.99784255027771, + "learning_rate": 0.0007923809523809524, + "loss": 0.5618, + "step": 7518 + }, + { + "epoch": 4.200558659217877, + "grad_norm": 0.6530553102493286, + "learning_rate": 0.0007923529411764706, + "loss": 0.4725, + "step": 7519 + }, + { + "epoch": 4.201117318435754, + "grad_norm": 0.4599669575691223, + "learning_rate": 0.0007923249299719888, + "loss": 0.4406, + "step": 7520 + }, + { + "epoch": 4.201675977653632, + "grad_norm": 0.6810505986213684, + "learning_rate": 0.000792296918767507, + "loss": 0.5319, + "step": 7521 + }, + { + "epoch": 4.202234636871508, + "grad_norm": 0.7919774651527405, + "learning_rate": 0.0007922689075630253, + "loss": 0.5626, + "step": 7522 + }, + { + "epoch": 4.202793296089386, + "grad_norm": 1.0941392183303833, + "learning_rate": 0.0007922408963585434, + "loss": 0.5579, + "step": 7523 + }, + { + "epoch": 4.203351955307262, + "grad_norm": 0.6305790543556213, + "learning_rate": 0.0007922128851540616, + "loss": 0.5523, + "step": 7524 + }, + { + "epoch": 4.20391061452514, + "grad_norm": 0.6114245653152466, + "learning_rate": 0.0007921848739495798, + "loss": 0.4681, + "step": 7525 + }, + { + "epoch": 4.204469273743017, + "grad_norm": 0.8891258835792542, + "learning_rate": 0.000792156862745098, + "loss": 0.4285, + "step": 7526 + }, + { + "epoch": 4.205027932960894, + "grad_norm": 0.6705508828163147, + "learning_rate": 0.0007921288515406164, + "loss": 0.7093, + "step": 7527 + }, + { + "epoch": 4.205586592178771, + "grad_norm": 0.4631941318511963, + "learning_rate": 0.0007921008403361345, + "loss": 0.4092, + "step": 7528 + }, + { + "epoch": 4.206145251396648, + "grad_norm": 0.49045324325561523, + "learning_rate": 0.0007920728291316527, + "loss": 0.5919, + "step": 7529 + }, + { + "epoch": 4.206703910614525, + "grad_norm": 0.6780363321304321, + "learning_rate": 0.0007920448179271709, + "loss": 0.5094, + "step": 7530 + }, + { + "epoch": 4.207262569832403, + "grad_norm": 0.5109512209892273, + "learning_rate": 0.0007920168067226891, + "loss": 0.4773, + "step": 7531 + }, + { + "epoch": 4.207821229050279, + "grad_norm": 0.5376586318016052, + "learning_rate": 0.0007919887955182074, + "loss": 0.5607, + "step": 7532 + }, + { + "epoch": 4.208379888268157, + "grad_norm": 0.5449132323265076, + "learning_rate": 0.0007919607843137255, + "loss": 0.4484, + "step": 7533 + }, + { + "epoch": 4.208938547486033, + "grad_norm": 0.5536893010139465, + "learning_rate": 0.0007919327731092437, + "loss": 0.4407, + "step": 7534 + }, + { + "epoch": 4.209497206703911, + "grad_norm": 0.6017754673957825, + "learning_rate": 0.0007919047619047619, + "loss": 0.416, + "step": 7535 + }, + { + "epoch": 4.210055865921788, + "grad_norm": 0.4171697497367859, + "learning_rate": 0.0007918767507002801, + "loss": 0.461, + "step": 7536 + }, + { + "epoch": 4.210614525139665, + "grad_norm": 0.5887663960456848, + "learning_rate": 0.0007918487394957984, + "loss": 0.4654, + "step": 7537 + }, + { + "epoch": 4.211173184357542, + "grad_norm": 3.2149410247802734, + "learning_rate": 0.0007918207282913166, + "loss": 0.6762, + "step": 7538 + }, + { + "epoch": 4.211731843575419, + "grad_norm": 0.37442272901535034, + "learning_rate": 0.0007917927170868347, + "loss": 0.4011, + "step": 7539 + }, + { + "epoch": 4.212290502793296, + "grad_norm": 0.5815616846084595, + "learning_rate": 0.0007917647058823529, + "loss": 0.4366, + "step": 7540 + }, + { + "epoch": 4.212849162011173, + "grad_norm": 0.8288305997848511, + "learning_rate": 0.0007917366946778711, + "loss": 0.4455, + "step": 7541 + }, + { + "epoch": 4.21340782122905, + "grad_norm": 0.6441209316253662, + "learning_rate": 0.0007917086834733894, + "loss": 0.4496, + "step": 7542 + }, + { + "epoch": 4.213966480446928, + "grad_norm": 0.6896035671234131, + "learning_rate": 0.0007916806722689077, + "loss": 0.5166, + "step": 7543 + }, + { + "epoch": 4.214525139664804, + "grad_norm": 0.5576814413070679, + "learning_rate": 0.0007916526610644258, + "loss": 0.5518, + "step": 7544 + }, + { + "epoch": 4.215083798882682, + "grad_norm": 0.4375351667404175, + "learning_rate": 0.000791624649859944, + "loss": 0.4281, + "step": 7545 + }, + { + "epoch": 4.215642458100558, + "grad_norm": 0.8181184530258179, + "learning_rate": 0.0007915966386554622, + "loss": 0.6171, + "step": 7546 + }, + { + "epoch": 4.216201117318436, + "grad_norm": 0.5615286827087402, + "learning_rate": 0.0007915686274509805, + "loss": 0.5376, + "step": 7547 + }, + { + "epoch": 4.216759776536313, + "grad_norm": 0.5858601927757263, + "learning_rate": 0.0007915406162464987, + "loss": 0.5005, + "step": 7548 + }, + { + "epoch": 4.21731843575419, + "grad_norm": 0.8927748203277588, + "learning_rate": 0.0007915126050420168, + "loss": 0.5056, + "step": 7549 + }, + { + "epoch": 4.217877094972067, + "grad_norm": 0.4644601345062256, + "learning_rate": 0.000791484593837535, + "loss": 0.467, + "step": 7550 + }, + { + "epoch": 4.218435754189944, + "grad_norm": 0.5699589848518372, + "learning_rate": 0.0007914565826330532, + "loss": 0.4393, + "step": 7551 + }, + { + "epoch": 4.218994413407821, + "grad_norm": 0.6297087669372559, + "learning_rate": 0.0007914285714285715, + "loss": 0.3804, + "step": 7552 + }, + { + "epoch": 4.219553072625699, + "grad_norm": 0.4785193204879761, + "learning_rate": 0.0007914005602240897, + "loss": 0.4091, + "step": 7553 + }, + { + "epoch": 4.220111731843575, + "grad_norm": 0.40393710136413574, + "learning_rate": 0.0007913725490196079, + "loss": 0.4572, + "step": 7554 + }, + { + "epoch": 4.220670391061453, + "grad_norm": 0.4363790452480316, + "learning_rate": 0.000791344537815126, + "loss": 0.4269, + "step": 7555 + }, + { + "epoch": 4.221229050279329, + "grad_norm": 0.7848618030548096, + "learning_rate": 0.0007913165266106442, + "loss": 0.5952, + "step": 7556 + }, + { + "epoch": 4.221787709497207, + "grad_norm": 2.397650718688965, + "learning_rate": 0.0007912885154061625, + "loss": 0.3947, + "step": 7557 + }, + { + "epoch": 4.222346368715084, + "grad_norm": 0.982249915599823, + "learning_rate": 0.0007912605042016807, + "loss": 0.6712, + "step": 7558 + }, + { + "epoch": 4.222905027932961, + "grad_norm": 0.3479222059249878, + "learning_rate": 0.000791232492997199, + "loss": 0.36, + "step": 7559 + }, + { + "epoch": 4.223463687150838, + "grad_norm": 0.3779882788658142, + "learning_rate": 0.000791204481792717, + "loss": 0.3762, + "step": 7560 + }, + { + "epoch": 4.224022346368715, + "grad_norm": Infinity, + "learning_rate": 0.000791204481792717, + "loss": 0.5007, + "step": 7561 + }, + { + "epoch": 4.224581005586592, + "grad_norm": 0.5627475380897522, + "learning_rate": 0.0007911764705882353, + "loss": 0.451, + "step": 7562 + }, + { + "epoch": 4.22513966480447, + "grad_norm": 2.1478681564331055, + "learning_rate": 0.0007911484593837536, + "loss": 0.4829, + "step": 7563 + }, + { + "epoch": 4.225698324022346, + "grad_norm": 0.4308639168739319, + "learning_rate": 0.0007911204481792718, + "loss": 0.3444, + "step": 7564 + }, + { + "epoch": 4.226256983240224, + "grad_norm": 0.551754891872406, + "learning_rate": 0.00079109243697479, + "loss": 0.4436, + "step": 7565 + }, + { + "epoch": 4.2268156424581, + "grad_norm": 0.5118157267570496, + "learning_rate": 0.0007910644257703081, + "loss": 0.468, + "step": 7566 + }, + { + "epoch": 4.227374301675978, + "grad_norm": 0.47258880734443665, + "learning_rate": 0.0007910364145658263, + "loss": 0.419, + "step": 7567 + }, + { + "epoch": 4.227932960893855, + "grad_norm": 0.7057540416717529, + "learning_rate": 0.0007910084033613446, + "loss": 0.4491, + "step": 7568 + }, + { + "epoch": 4.228491620111732, + "grad_norm": 0.6563587188720703, + "learning_rate": 0.0007909803921568628, + "loss": 0.5068, + "step": 7569 + }, + { + "epoch": 4.229050279329609, + "grad_norm": 0.5046238899230957, + "learning_rate": 0.000790952380952381, + "loss": 0.4865, + "step": 7570 + }, + { + "epoch": 4.229608938547486, + "grad_norm": 0.5022178888320923, + "learning_rate": 0.0007909243697478992, + "loss": 0.4151, + "step": 7571 + }, + { + "epoch": 4.230167597765363, + "grad_norm": 0.5882951617240906, + "learning_rate": 0.0007908963585434173, + "loss": 0.6282, + "step": 7572 + }, + { + "epoch": 4.230726256983241, + "grad_norm": 0.4815919101238251, + "learning_rate": 0.0007908683473389356, + "loss": 0.4525, + "step": 7573 + }, + { + "epoch": 4.231284916201117, + "grad_norm": 0.5523713231086731, + "learning_rate": 0.0007908403361344538, + "loss": 0.4286, + "step": 7574 + }, + { + "epoch": 4.231843575418995, + "grad_norm": 0.6272464990615845, + "learning_rate": 0.000790812324929972, + "loss": 0.4915, + "step": 7575 + }, + { + "epoch": 4.232402234636871, + "grad_norm": 0.5121742486953735, + "learning_rate": 0.0007907843137254902, + "loss": 0.4691, + "step": 7576 + }, + { + "epoch": 4.232960893854749, + "grad_norm": 0.40071049332618713, + "learning_rate": 0.0007907563025210083, + "loss": 0.4459, + "step": 7577 + }, + { + "epoch": 4.233519553072625, + "grad_norm": 0.565533459186554, + "learning_rate": 0.0007907282913165267, + "loss": 0.6078, + "step": 7578 + }, + { + "epoch": 4.234078212290503, + "grad_norm": 0.7031722664833069, + "learning_rate": 0.0007907002801120449, + "loss": 0.3592, + "step": 7579 + }, + { + "epoch": 4.23463687150838, + "grad_norm": 0.5376628041267395, + "learning_rate": 0.0007906722689075631, + "loss": 0.4001, + "step": 7580 + }, + { + "epoch": 4.235195530726257, + "grad_norm": 1.1053074598312378, + "learning_rate": 0.0007906442577030813, + "loss": 0.354, + "step": 7581 + }, + { + "epoch": 4.235754189944134, + "grad_norm": 0.47745054960250854, + "learning_rate": 0.0007906162464985994, + "loss": 0.4687, + "step": 7582 + }, + { + "epoch": 4.236312849162011, + "grad_norm": 0.630379319190979, + "learning_rate": 0.0007905882352941176, + "loss": 0.6115, + "step": 7583 + }, + { + "epoch": 4.236871508379888, + "grad_norm": 0.5156353712081909, + "learning_rate": 0.0007905602240896359, + "loss": 0.3984, + "step": 7584 + }, + { + "epoch": 4.237430167597766, + "grad_norm": 0.7615495920181274, + "learning_rate": 0.0007905322128851541, + "loss": 0.5273, + "step": 7585 + }, + { + "epoch": 4.237988826815642, + "grad_norm": 0.5601665377616882, + "learning_rate": 0.0007905042016806723, + "loss": 0.4536, + "step": 7586 + }, + { + "epoch": 4.23854748603352, + "grad_norm": 0.7228991389274597, + "learning_rate": 0.0007904761904761905, + "loss": 0.3972, + "step": 7587 + }, + { + "epoch": 4.239106145251396, + "grad_norm": 0.9597086906433105, + "learning_rate": 0.0007904481792717086, + "loss": 0.5157, + "step": 7588 + }, + { + "epoch": 4.239664804469274, + "grad_norm": 0.46680310368537903, + "learning_rate": 0.0007904201680672269, + "loss": 0.4702, + "step": 7589 + }, + { + "epoch": 4.240223463687151, + "grad_norm": 0.5271770358085632, + "learning_rate": 0.0007903921568627451, + "loss": 0.4419, + "step": 7590 + }, + { + "epoch": 4.240782122905028, + "grad_norm": 0.611297070980072, + "learning_rate": 0.0007903641456582633, + "loss": 0.4898, + "step": 7591 + }, + { + "epoch": 4.241340782122905, + "grad_norm": 0.503616988658905, + "learning_rate": 0.0007903361344537815, + "loss": 0.455, + "step": 7592 + }, + { + "epoch": 4.241899441340782, + "grad_norm": 0.42161980271339417, + "learning_rate": 0.0007903081232492996, + "loss": 0.4363, + "step": 7593 + }, + { + "epoch": 4.242458100558659, + "grad_norm": 0.4580270051956177, + "learning_rate": 0.000790280112044818, + "loss": 0.4319, + "step": 7594 + }, + { + "epoch": 4.243016759776537, + "grad_norm": 0.6039807200431824, + "learning_rate": 0.0007902521008403362, + "loss": 0.4201, + "step": 7595 + }, + { + "epoch": 4.243575418994413, + "grad_norm": 0.7419303059577942, + "learning_rate": 0.0007902240896358544, + "loss": 0.3913, + "step": 7596 + }, + { + "epoch": 4.244134078212291, + "grad_norm": 0.4437410831451416, + "learning_rate": 0.0007901960784313726, + "loss": 0.4705, + "step": 7597 + }, + { + "epoch": 4.244692737430167, + "grad_norm": 0.5128861665725708, + "learning_rate": 0.0007901680672268907, + "loss": 0.3766, + "step": 7598 + }, + { + "epoch": 4.245251396648045, + "grad_norm": 0.5940971374511719, + "learning_rate": 0.000790140056022409, + "loss": 0.3963, + "step": 7599 + }, + { + "epoch": 4.245810055865922, + "grad_norm": 0.6905661225318909, + "learning_rate": 0.0007901120448179272, + "loss": 0.5304, + "step": 7600 + }, + { + "epoch": 4.246368715083799, + "grad_norm": 0.6886366605758667, + "learning_rate": 0.0007900840336134454, + "loss": 0.4096, + "step": 7601 + }, + { + "epoch": 4.246927374301676, + "grad_norm": 0.600100040435791, + "learning_rate": 0.0007900560224089636, + "loss": 0.4241, + "step": 7602 + }, + { + "epoch": 4.247486033519553, + "grad_norm": 0.5681229829788208, + "learning_rate": 0.0007900280112044818, + "loss": 0.4724, + "step": 7603 + }, + { + "epoch": 4.24804469273743, + "grad_norm": 0.6665762662887573, + "learning_rate": 0.00079, + "loss": 0.5504, + "step": 7604 + }, + { + "epoch": 4.248603351955307, + "grad_norm": 0.6190313100814819, + "learning_rate": 0.0007899719887955182, + "loss": 0.4141, + "step": 7605 + }, + { + "epoch": 4.249162011173184, + "grad_norm": 0.6787214279174805, + "learning_rate": 0.0007899439775910364, + "loss": 0.3803, + "step": 7606 + }, + { + "epoch": 4.249720670391062, + "grad_norm": 0.9613161087036133, + "learning_rate": 0.0007899159663865546, + "loss": 0.5026, + "step": 7607 + }, + { + "epoch": 4.250279329608938, + "grad_norm": 0.6363176703453064, + "learning_rate": 0.0007898879551820728, + "loss": 0.3536, + "step": 7608 + }, + { + "epoch": 4.250837988826816, + "grad_norm": 0.5358803868293762, + "learning_rate": 0.000789859943977591, + "loss": 0.4507, + "step": 7609 + }, + { + "epoch": 4.251396648044693, + "grad_norm": 0.6599241495132446, + "learning_rate": 0.0007898319327731092, + "loss": 0.5843, + "step": 7610 + }, + { + "epoch": 4.25195530726257, + "grad_norm": 0.9755486845970154, + "learning_rate": 0.0007898039215686275, + "loss": 0.4964, + "step": 7611 + }, + { + "epoch": 4.252513966480447, + "grad_norm": 1.0052839517593384, + "learning_rate": 0.0007897759103641457, + "loss": 0.4895, + "step": 7612 + }, + { + "epoch": 4.253072625698324, + "grad_norm": 0.41139498353004456, + "learning_rate": 0.0007897478991596639, + "loss": 0.4121, + "step": 7613 + }, + { + "epoch": 4.253631284916201, + "grad_norm": 0.6142210364341736, + "learning_rate": 0.0007897198879551821, + "loss": 0.5226, + "step": 7614 + }, + { + "epoch": 4.254189944134078, + "grad_norm": 0.5242356061935425, + "learning_rate": 0.0007896918767507003, + "loss": 0.3528, + "step": 7615 + }, + { + "epoch": 4.254748603351955, + "grad_norm": 0.6971606016159058, + "learning_rate": 0.0007896638655462185, + "loss": 0.3932, + "step": 7616 + }, + { + "epoch": 4.255307262569833, + "grad_norm": 0.998999297618866, + "learning_rate": 0.0007896358543417367, + "loss": 0.4605, + "step": 7617 + }, + { + "epoch": 4.255865921787709, + "grad_norm": 0.5562071204185486, + "learning_rate": 0.0007896078431372549, + "loss": 0.5531, + "step": 7618 + }, + { + "epoch": 4.256424581005587, + "grad_norm": 0.42885369062423706, + "learning_rate": 0.0007895798319327732, + "loss": 0.4067, + "step": 7619 + }, + { + "epoch": 4.256983240223463, + "grad_norm": 0.7504162192344666, + "learning_rate": 0.0007895518207282913, + "loss": 0.7144, + "step": 7620 + }, + { + "epoch": 4.257541899441341, + "grad_norm": 0.48764362931251526, + "learning_rate": 0.0007895238095238095, + "loss": 0.4938, + "step": 7621 + }, + { + "epoch": 4.258100558659218, + "grad_norm": 0.681430459022522, + "learning_rate": 0.0007894957983193277, + "loss": 0.4711, + "step": 7622 + }, + { + "epoch": 4.258659217877095, + "grad_norm": 0.5723384618759155, + "learning_rate": 0.0007894677871148459, + "loss": 0.449, + "step": 7623 + }, + { + "epoch": 4.259217877094972, + "grad_norm": 0.5640926957130432, + "learning_rate": 0.0007894397759103642, + "loss": 0.7216, + "step": 7624 + }, + { + "epoch": 4.259776536312849, + "grad_norm": 0.4291515350341797, + "learning_rate": 0.0007894117647058823, + "loss": 0.4026, + "step": 7625 + }, + { + "epoch": 4.260335195530726, + "grad_norm": 0.4841230809688568, + "learning_rate": 0.0007893837535014005, + "loss": 0.5593, + "step": 7626 + }, + { + "epoch": 4.260893854748604, + "grad_norm": 0.6583724617958069, + "learning_rate": 0.0007893557422969187, + "loss": 0.4614, + "step": 7627 + }, + { + "epoch": 4.26145251396648, + "grad_norm": 0.48537394404411316, + "learning_rate": 0.000789327731092437, + "loss": 0.454, + "step": 7628 + }, + { + "epoch": 4.262011173184358, + "grad_norm": 0.6702374815940857, + "learning_rate": 0.0007892997198879553, + "loss": 0.5409, + "step": 7629 + }, + { + "epoch": 4.262569832402234, + "grad_norm": 0.47325006127357483, + "learning_rate": 0.0007892717086834734, + "loss": 0.4857, + "step": 7630 + }, + { + "epoch": 4.263128491620112, + "grad_norm": 1.185408353805542, + "learning_rate": 0.0007892436974789916, + "loss": 0.4703, + "step": 7631 + }, + { + "epoch": 4.263687150837989, + "grad_norm": 0.6024292707443237, + "learning_rate": 0.0007892156862745098, + "loss": 0.4243, + "step": 7632 + }, + { + "epoch": 4.264245810055866, + "grad_norm": 0.5890686511993408, + "learning_rate": 0.000789187675070028, + "loss": 0.3634, + "step": 7633 + }, + { + "epoch": 4.264804469273743, + "grad_norm": 0.6138582825660706, + "learning_rate": 0.0007891596638655463, + "loss": 0.5212, + "step": 7634 + }, + { + "epoch": 4.26536312849162, + "grad_norm": 0.3765842318534851, + "learning_rate": 0.0007891316526610645, + "loss": 0.4334, + "step": 7635 + }, + { + "epoch": 4.265921787709497, + "grad_norm": 0.49022841453552246, + "learning_rate": 0.0007891036414565826, + "loss": 0.4759, + "step": 7636 + }, + { + "epoch": 4.266480446927375, + "grad_norm": 0.4493389427661896, + "learning_rate": 0.0007890756302521008, + "loss": 0.4684, + "step": 7637 + }, + { + "epoch": 4.267039106145251, + "grad_norm": 0.7142762541770935, + "learning_rate": 0.000789047619047619, + "loss": 0.6615, + "step": 7638 + }, + { + "epoch": 4.267597765363129, + "grad_norm": 3.7400758266448975, + "learning_rate": 0.0007890196078431373, + "loss": 0.5036, + "step": 7639 + }, + { + "epoch": 4.268156424581005, + "grad_norm": 5.28965950012207, + "learning_rate": 0.0007889915966386555, + "loss": 0.4729, + "step": 7640 + }, + { + "epoch": 4.268715083798883, + "grad_norm": 0.5655019879341125, + "learning_rate": 0.0007889635854341736, + "loss": 0.5352, + "step": 7641 + }, + { + "epoch": 4.269273743016759, + "grad_norm": 0.6827526092529297, + "learning_rate": 0.0007889355742296918, + "loss": 0.4772, + "step": 7642 + }, + { + "epoch": 4.269832402234637, + "grad_norm": 0.5001131892204285, + "learning_rate": 0.00078890756302521, + "loss": 0.4788, + "step": 7643 + }, + { + "epoch": 4.270391061452514, + "grad_norm": 1.6869261264801025, + "learning_rate": 0.0007888795518207284, + "loss": 0.5038, + "step": 7644 + }, + { + "epoch": 4.270949720670391, + "grad_norm": 0.5407483577728271, + "learning_rate": 0.0007888515406162466, + "loss": 0.4887, + "step": 7645 + }, + { + "epoch": 4.271508379888268, + "grad_norm": 0.5994249582290649, + "learning_rate": 0.0007888235294117647, + "loss": 0.4188, + "step": 7646 + }, + { + "epoch": 4.272067039106146, + "grad_norm": 0.505515456199646, + "learning_rate": 0.0007887955182072829, + "loss": 0.4225, + "step": 7647 + }, + { + "epoch": 4.272625698324022, + "grad_norm": 0.8638238906860352, + "learning_rate": 0.0007887675070028011, + "loss": 0.4624, + "step": 7648 + }, + { + "epoch": 4.2731843575419, + "grad_norm": 0.4794963598251343, + "learning_rate": 0.0007887394957983194, + "loss": 0.4043, + "step": 7649 + }, + { + "epoch": 4.273743016759776, + "grad_norm": 0.6042397022247314, + "learning_rate": 0.0007887114845938376, + "loss": 0.4738, + "step": 7650 + }, + { + "epoch": 4.274301675977654, + "grad_norm": 0.5241755247116089, + "learning_rate": 0.0007886834733893558, + "loss": 0.4644, + "step": 7651 + }, + { + "epoch": 4.27486033519553, + "grad_norm": 0.5943676829338074, + "learning_rate": 0.0007886554621848739, + "loss": 0.4022, + "step": 7652 + }, + { + "epoch": 4.275418994413408, + "grad_norm": 0.42731931805610657, + "learning_rate": 0.0007886274509803921, + "loss": 0.435, + "step": 7653 + }, + { + "epoch": 4.275977653631285, + "grad_norm": 0.4742490351200104, + "learning_rate": 0.0007885994397759104, + "loss": 0.4469, + "step": 7654 + }, + { + "epoch": 4.276536312849162, + "grad_norm": 6.935084342956543, + "learning_rate": 0.0007885714285714286, + "loss": 0.4071, + "step": 7655 + }, + { + "epoch": 4.277094972067039, + "grad_norm": 0.40504932403564453, + "learning_rate": 0.0007885434173669468, + "loss": 0.3801, + "step": 7656 + }, + { + "epoch": 4.277653631284916, + "grad_norm": 0.44817519187927246, + "learning_rate": 0.0007885154061624649, + "loss": 0.3845, + "step": 7657 + }, + { + "epoch": 4.278212290502793, + "grad_norm": 0.5161679983139038, + "learning_rate": 0.0007884873949579831, + "loss": 0.4996, + "step": 7658 + }, + { + "epoch": 4.278770949720671, + "grad_norm": 0.5023806095123291, + "learning_rate": 0.0007884593837535015, + "loss": 0.598, + "step": 7659 + }, + { + "epoch": 4.279329608938547, + "grad_norm": 0.6727524995803833, + "learning_rate": 0.0007884313725490197, + "loss": 0.4819, + "step": 7660 + }, + { + "epoch": 4.279888268156425, + "grad_norm": 0.7039174437522888, + "learning_rate": 0.0007884033613445379, + "loss": 0.4021, + "step": 7661 + }, + { + "epoch": 4.280446927374301, + "grad_norm": 0.688735842704773, + "learning_rate": 0.000788375350140056, + "loss": 0.464, + "step": 7662 + }, + { + "epoch": 4.281005586592179, + "grad_norm": 0.5400928854942322, + "learning_rate": 0.0007883473389355742, + "loss": 0.5049, + "step": 7663 + }, + { + "epoch": 4.281564245810056, + "grad_norm": 0.6422010064125061, + "learning_rate": 0.0007883193277310925, + "loss": 0.4616, + "step": 7664 + }, + { + "epoch": 4.282122905027933, + "grad_norm": 0.4987924098968506, + "learning_rate": 0.0007882913165266107, + "loss": 0.5504, + "step": 7665 + }, + { + "epoch": 4.28268156424581, + "grad_norm": 0.9322398900985718, + "learning_rate": 0.0007882633053221289, + "loss": 0.6021, + "step": 7666 + }, + { + "epoch": 4.283240223463687, + "grad_norm": 0.5510703325271606, + "learning_rate": 0.0007882352941176471, + "loss": 0.4432, + "step": 7667 + }, + { + "epoch": 4.283798882681564, + "grad_norm": 0.7457737922668457, + "learning_rate": 0.0007882072829131652, + "loss": 0.4212, + "step": 7668 + }, + { + "epoch": 4.284357541899442, + "grad_norm": 8.379426956176758, + "learning_rate": 0.0007881792717086835, + "loss": 0.4409, + "step": 7669 + }, + { + "epoch": 4.284916201117318, + "grad_norm": 0.7301014065742493, + "learning_rate": 0.0007881512605042017, + "loss": 0.4837, + "step": 7670 + }, + { + "epoch": 4.285474860335196, + "grad_norm": 0.8636860847473145, + "learning_rate": 0.0007881232492997199, + "loss": 0.4491, + "step": 7671 + }, + { + "epoch": 4.286033519553072, + "grad_norm": 0.5195205211639404, + "learning_rate": 0.0007880952380952381, + "loss": 0.5588, + "step": 7672 + }, + { + "epoch": 4.28659217877095, + "grad_norm": 0.5753498077392578, + "learning_rate": 0.0007880672268907562, + "loss": 0.5495, + "step": 7673 + }, + { + "epoch": 4.287150837988827, + "grad_norm": 0.5269135236740112, + "learning_rate": 0.0007880392156862745, + "loss": 0.3289, + "step": 7674 + }, + { + "epoch": 4.287709497206704, + "grad_norm": 0.7850725650787354, + "learning_rate": 0.0007880112044817927, + "loss": 0.3754, + "step": 7675 + }, + { + "epoch": 4.288268156424581, + "grad_norm": 0.5600786805152893, + "learning_rate": 0.000787983193277311, + "loss": 0.4287, + "step": 7676 + }, + { + "epoch": 4.288826815642458, + "grad_norm": 0.5487927198410034, + "learning_rate": 0.0007879551820728292, + "loss": 0.6042, + "step": 7677 + }, + { + "epoch": 4.289385474860335, + "grad_norm": 0.5627371668815613, + "learning_rate": 0.0007879271708683473, + "loss": 0.5317, + "step": 7678 + }, + { + "epoch": 4.289944134078212, + "grad_norm": 0.6596568822860718, + "learning_rate": 0.0007878991596638656, + "loss": 0.7527, + "step": 7679 + }, + { + "epoch": 4.290502793296089, + "grad_norm": 0.47150665521621704, + "learning_rate": 0.0007878711484593838, + "loss": 0.395, + "step": 7680 + }, + { + "epoch": 4.291061452513967, + "grad_norm": 0.6270442605018616, + "learning_rate": 0.000787843137254902, + "loss": 0.417, + "step": 7681 + }, + { + "epoch": 4.291620111731843, + "grad_norm": 0.5527673363685608, + "learning_rate": 0.0007878151260504202, + "loss": 0.4463, + "step": 7682 + }, + { + "epoch": 4.292178770949721, + "grad_norm": 0.8703657388687134, + "learning_rate": 0.0007877871148459384, + "loss": 0.4557, + "step": 7683 + }, + { + "epoch": 4.292737430167598, + "grad_norm": 0.594548761844635, + "learning_rate": 0.0007877591036414566, + "loss": 0.4068, + "step": 7684 + }, + { + "epoch": 4.293296089385475, + "grad_norm": 0.7797442078590393, + "learning_rate": 0.0007877310924369748, + "loss": 0.4055, + "step": 7685 + }, + { + "epoch": 4.293854748603352, + "grad_norm": 1.4545782804489136, + "learning_rate": 0.000787703081232493, + "loss": 0.5126, + "step": 7686 + }, + { + "epoch": 4.294413407821229, + "grad_norm": 0.6887294054031372, + "learning_rate": 0.0007876750700280112, + "loss": 0.5332, + "step": 7687 + }, + { + "epoch": 4.294972067039106, + "grad_norm": 0.6880782842636108, + "learning_rate": 0.0007876470588235294, + "loss": 0.5723, + "step": 7688 + }, + { + "epoch": 4.295530726256983, + "grad_norm": 1.3066493272781372, + "learning_rate": 0.0007876190476190476, + "loss": 0.5452, + "step": 7689 + }, + { + "epoch": 4.29608938547486, + "grad_norm": 0.5060390830039978, + "learning_rate": 0.0007875910364145658, + "loss": 0.4856, + "step": 7690 + }, + { + "epoch": 4.296648044692738, + "grad_norm": 0.7090829014778137, + "learning_rate": 0.000787563025210084, + "loss": 0.4813, + "step": 7691 + }, + { + "epoch": 4.297206703910614, + "grad_norm": 0.7248459458351135, + "learning_rate": 0.0007875350140056022, + "loss": 0.4917, + "step": 7692 + }, + { + "epoch": 4.297765363128492, + "grad_norm": 0.4943256080150604, + "learning_rate": 0.0007875070028011205, + "loss": 0.3881, + "step": 7693 + }, + { + "epoch": 4.298324022346368, + "grad_norm": 0.6828133463859558, + "learning_rate": 0.0007874789915966388, + "loss": 0.51, + "step": 7694 + }, + { + "epoch": 4.298882681564246, + "grad_norm": 0.6464107632637024, + "learning_rate": 0.0007874509803921569, + "loss": 0.4633, + "step": 7695 + }, + { + "epoch": 4.299441340782123, + "grad_norm": 0.49390271306037903, + "learning_rate": 0.0007874229691876751, + "loss": 0.3729, + "step": 7696 + }, + { + "epoch": 4.3, + "grad_norm": 0.8593959212303162, + "learning_rate": 0.0007873949579831933, + "loss": 0.4379, + "step": 7697 + }, + { + "epoch": 4.300558659217877, + "grad_norm": 0.4330633580684662, + "learning_rate": 0.0007873669467787115, + "loss": 0.3912, + "step": 7698 + }, + { + "epoch": 4.301117318435754, + "grad_norm": 0.6093931198120117, + "learning_rate": 0.0007873389355742298, + "loss": 0.3898, + "step": 7699 + }, + { + "epoch": 4.301675977653631, + "grad_norm": 0.6548773646354675, + "learning_rate": 0.0007873109243697479, + "loss": 0.3353, + "step": 7700 + }, + { + "epoch": 4.302234636871509, + "grad_norm": 0.5223304629325867, + "learning_rate": 0.0007872829131652661, + "loss": 0.4734, + "step": 7701 + }, + { + "epoch": 4.302793296089385, + "grad_norm": 1.0821515321731567, + "learning_rate": 0.0007872549019607843, + "loss": 0.4416, + "step": 7702 + }, + { + "epoch": 4.303351955307263, + "grad_norm": 2.705974817276001, + "learning_rate": 0.0007872268907563025, + "loss": 0.443, + "step": 7703 + }, + { + "epoch": 4.303910614525139, + "grad_norm": 0.5604917407035828, + "learning_rate": 0.0007871988795518208, + "loss": 0.4503, + "step": 7704 + }, + { + "epoch": 4.304469273743017, + "grad_norm": 0.6788303852081299, + "learning_rate": 0.0007871708683473389, + "loss": 0.441, + "step": 7705 + }, + { + "epoch": 4.305027932960894, + "grad_norm": 0.9525696635246277, + "learning_rate": 0.0007871428571428571, + "loss": 0.3924, + "step": 7706 + }, + { + "epoch": 4.305586592178771, + "grad_norm": 1.9131962060928345, + "learning_rate": 0.0007871148459383753, + "loss": 0.5338, + "step": 7707 + }, + { + "epoch": 4.306145251396648, + "grad_norm": 0.405402809381485, + "learning_rate": 0.0007870868347338935, + "loss": 0.428, + "step": 7708 + }, + { + "epoch": 4.306703910614525, + "grad_norm": 0.40964922308921814, + "learning_rate": 0.0007870588235294119, + "loss": 0.3859, + "step": 7709 + }, + { + "epoch": 4.307262569832402, + "grad_norm": 1.9622915983200073, + "learning_rate": 0.0007870308123249301, + "loss": 0.2718, + "step": 7710 + }, + { + "epoch": 4.30782122905028, + "grad_norm": 1.5938780307769775, + "learning_rate": 0.0007870028011204482, + "loss": 0.4962, + "step": 7711 + }, + { + "epoch": 4.308379888268156, + "grad_norm": 0.3893345892429352, + "learning_rate": 0.0007869747899159664, + "loss": 0.3454, + "step": 7712 + }, + { + "epoch": 4.308938547486034, + "grad_norm": 0.6588876247406006, + "learning_rate": 0.0007869467787114846, + "loss": 0.4171, + "step": 7713 + }, + { + "epoch": 4.30949720670391, + "grad_norm": 0.6357071399688721, + "learning_rate": 0.0007869187675070029, + "loss": 0.4736, + "step": 7714 + }, + { + "epoch": 4.310055865921788, + "grad_norm": 2.4534640312194824, + "learning_rate": 0.0007868907563025211, + "loss": 0.4182, + "step": 7715 + }, + { + "epoch": 4.310614525139664, + "grad_norm": 0.3842434883117676, + "learning_rate": 0.0007868627450980392, + "loss": 0.4628, + "step": 7716 + }, + { + "epoch": 4.311173184357542, + "grad_norm": 0.8772587776184082, + "learning_rate": 0.0007868347338935574, + "loss": 0.5188, + "step": 7717 + }, + { + "epoch": 4.311731843575419, + "grad_norm": 0.8579975366592407, + "learning_rate": 0.0007868067226890756, + "loss": 0.4531, + "step": 7718 + }, + { + "epoch": 4.312290502793296, + "grad_norm": 0.9199638962745667, + "learning_rate": 0.0007867787114845939, + "loss": 0.4908, + "step": 7719 + }, + { + "epoch": 4.312849162011173, + "grad_norm": 1.6186927556991577, + "learning_rate": 0.0007867507002801121, + "loss": 0.3756, + "step": 7720 + }, + { + "epoch": 4.31340782122905, + "grad_norm": 0.6694222688674927, + "learning_rate": 0.0007867226890756302, + "loss": 0.4425, + "step": 7721 + }, + { + "epoch": 4.313966480446927, + "grad_norm": 1.8523377180099487, + "learning_rate": 0.0007866946778711484, + "loss": 0.3456, + "step": 7722 + }, + { + "epoch": 4.314525139664805, + "grad_norm": 0.6399108171463013, + "learning_rate": 0.0007866666666666666, + "loss": 0.6505, + "step": 7723 + }, + { + "epoch": 4.315083798882681, + "grad_norm": 0.48726019263267517, + "learning_rate": 0.000786638655462185, + "loss": 0.4961, + "step": 7724 + }, + { + "epoch": 4.315642458100559, + "grad_norm": 0.5752406716346741, + "learning_rate": 0.0007866106442577032, + "loss": 0.5008, + "step": 7725 + }, + { + "epoch": 4.316201117318435, + "grad_norm": 0.5257292985916138, + "learning_rate": 0.0007865826330532214, + "loss": 0.5904, + "step": 7726 + }, + { + "epoch": 4.316759776536313, + "grad_norm": 0.8009903430938721, + "learning_rate": 0.0007865546218487395, + "loss": 0.4734, + "step": 7727 + }, + { + "epoch": 4.31731843575419, + "grad_norm": 2.6037352085113525, + "learning_rate": 0.0007865266106442577, + "loss": 0.3808, + "step": 7728 + }, + { + "epoch": 4.317877094972067, + "grad_norm": 0.42437905073165894, + "learning_rate": 0.000786498599439776, + "loss": 0.5026, + "step": 7729 + }, + { + "epoch": 4.318435754189944, + "grad_norm": 0.6118500828742981, + "learning_rate": 0.0007864705882352942, + "loss": 0.4495, + "step": 7730 + }, + { + "epoch": 4.318994413407821, + "grad_norm": 0.49491187930107117, + "learning_rate": 0.0007864425770308124, + "loss": 0.5, + "step": 7731 + }, + { + "epoch": 4.319553072625698, + "grad_norm": 0.5296121835708618, + "learning_rate": 0.0007864145658263305, + "loss": 0.4045, + "step": 7732 + }, + { + "epoch": 4.320111731843576, + "grad_norm": 0.45565447211265564, + "learning_rate": 0.0007863865546218487, + "loss": 0.4107, + "step": 7733 + }, + { + "epoch": 4.320670391061452, + "grad_norm": 0.556579053401947, + "learning_rate": 0.000786358543417367, + "loss": 0.4661, + "step": 7734 + }, + { + "epoch": 4.32122905027933, + "grad_norm": 0.434957891702652, + "learning_rate": 0.0007863305322128852, + "loss": 0.3443, + "step": 7735 + }, + { + "epoch": 4.321787709497206, + "grad_norm": 0.6243918538093567, + "learning_rate": 0.0007863025210084034, + "loss": 0.5631, + "step": 7736 + }, + { + "epoch": 4.322346368715084, + "grad_norm": 0.546448826789856, + "learning_rate": 0.0007862745098039215, + "loss": 0.4049, + "step": 7737 + }, + { + "epoch": 4.322905027932961, + "grad_norm": 0.6531659960746765, + "learning_rate": 0.0007862464985994397, + "loss": 0.4982, + "step": 7738 + }, + { + "epoch": 4.323463687150838, + "grad_norm": 1.112093448638916, + "learning_rate": 0.000786218487394958, + "loss": 0.4305, + "step": 7739 + }, + { + "epoch": 4.324022346368715, + "grad_norm": 0.5378101468086243, + "learning_rate": 0.0007861904761904762, + "loss": 0.5778, + "step": 7740 + }, + { + "epoch": 4.324581005586592, + "grad_norm": 1.5151773691177368, + "learning_rate": 0.0007861624649859945, + "loss": 0.5471, + "step": 7741 + }, + { + "epoch": 4.325139664804469, + "grad_norm": 1.3583226203918457, + "learning_rate": 0.0007861344537815127, + "loss": 0.4314, + "step": 7742 + }, + { + "epoch": 4.325698324022347, + "grad_norm": 0.5055039525032043, + "learning_rate": 0.0007861064425770308, + "loss": 0.4683, + "step": 7743 + }, + { + "epoch": 4.326256983240223, + "grad_norm": 1.527499794960022, + "learning_rate": 0.0007860784313725491, + "loss": 0.4038, + "step": 7744 + }, + { + "epoch": 4.326815642458101, + "grad_norm": 0.4101034998893738, + "learning_rate": 0.0007860504201680673, + "loss": 0.4146, + "step": 7745 + }, + { + "epoch": 4.327374301675977, + "grad_norm": 0.6584548950195312, + "learning_rate": 0.0007860224089635855, + "loss": 0.4659, + "step": 7746 + }, + { + "epoch": 4.327932960893855, + "grad_norm": 0.5194028615951538, + "learning_rate": 0.0007859943977591037, + "loss": 0.5274, + "step": 7747 + }, + { + "epoch": 4.328491620111732, + "grad_norm": 0.9011557698249817, + "learning_rate": 0.0007859663865546218, + "loss": 0.5186, + "step": 7748 + }, + { + "epoch": 4.329050279329609, + "grad_norm": 0.5350490808486938, + "learning_rate": 0.0007859383753501401, + "loss": 0.5378, + "step": 7749 + }, + { + "epoch": 4.329608938547486, + "grad_norm": 0.5108752250671387, + "learning_rate": 0.0007859103641456583, + "loss": 0.4121, + "step": 7750 + }, + { + "epoch": 4.330167597765363, + "grad_norm": 0.3764178454875946, + "learning_rate": 0.0007858823529411765, + "loss": 0.3697, + "step": 7751 + }, + { + "epoch": 4.33072625698324, + "grad_norm": 0.7153195142745972, + "learning_rate": 0.0007858543417366947, + "loss": 0.4836, + "step": 7752 + }, + { + "epoch": 4.331284916201117, + "grad_norm": 0.4839271008968353, + "learning_rate": 0.0007858263305322128, + "loss": 0.4223, + "step": 7753 + }, + { + "epoch": 4.331843575418994, + "grad_norm": 0.5079770684242249, + "learning_rate": 0.0007857983193277311, + "loss": 0.3767, + "step": 7754 + }, + { + "epoch": 4.332402234636872, + "grad_norm": 0.7013403177261353, + "learning_rate": 0.0007857703081232493, + "loss": 0.4769, + "step": 7755 + }, + { + "epoch": 4.332960893854748, + "grad_norm": 0.5539823770523071, + "learning_rate": 0.0007857422969187675, + "loss": 0.5086, + "step": 7756 + }, + { + "epoch": 4.333519553072626, + "grad_norm": 0.49007055163383484, + "learning_rate": 0.0007857142857142857, + "loss": 0.5165, + "step": 7757 + }, + { + "epoch": 4.334078212290502, + "grad_norm": 0.7004421949386597, + "learning_rate": 0.000785686274509804, + "loss": 0.4767, + "step": 7758 + }, + { + "epoch": 4.33463687150838, + "grad_norm": 2.1830976009368896, + "learning_rate": 0.0007856582633053222, + "loss": 0.4153, + "step": 7759 + }, + { + "epoch": 4.335195530726257, + "grad_norm": 0.42629343271255493, + "learning_rate": 0.0007856302521008404, + "loss": 0.4139, + "step": 7760 + }, + { + "epoch": 4.335754189944134, + "grad_norm": 0.6583541631698608, + "learning_rate": 0.0007856022408963586, + "loss": 0.536, + "step": 7761 + }, + { + "epoch": 4.336312849162011, + "grad_norm": 3.1524322032928467, + "learning_rate": 0.0007855742296918768, + "loss": 0.4799, + "step": 7762 + }, + { + "epoch": 4.336871508379888, + "grad_norm": 0.6064858436584473, + "learning_rate": 0.000785546218487395, + "loss": 0.4707, + "step": 7763 + }, + { + "epoch": 4.337430167597765, + "grad_norm": 0.4527079164981842, + "learning_rate": 0.0007855182072829132, + "loss": 0.4685, + "step": 7764 + }, + { + "epoch": 4.337988826815643, + "grad_norm": 0.4843803644180298, + "learning_rate": 0.0007854901960784314, + "loss": 0.4966, + "step": 7765 + }, + { + "epoch": 4.338547486033519, + "grad_norm": 0.5902796387672424, + "learning_rate": 0.0007854621848739496, + "loss": 0.385, + "step": 7766 + }, + { + "epoch": 4.339106145251397, + "grad_norm": 0.5157030820846558, + "learning_rate": 0.0007854341736694678, + "loss": 0.4824, + "step": 7767 + }, + { + "epoch": 4.339664804469273, + "grad_norm": 0.38306954503059387, + "learning_rate": 0.000785406162464986, + "loss": 0.4456, + "step": 7768 + }, + { + "epoch": 4.340223463687151, + "grad_norm": 0.3922554850578308, + "learning_rate": 0.0007853781512605042, + "loss": 0.3762, + "step": 7769 + }, + { + "epoch": 4.340782122905028, + "grad_norm": 0.3859819173812866, + "learning_rate": 0.0007853501400560224, + "loss": 0.4752, + "step": 7770 + }, + { + "epoch": 4.341340782122905, + "grad_norm": 0.7145147919654846, + "learning_rate": 0.0007853221288515406, + "loss": 0.4156, + "step": 7771 + }, + { + "epoch": 4.341899441340782, + "grad_norm": 0.5755166411399841, + "learning_rate": 0.0007852941176470588, + "loss": 0.5298, + "step": 7772 + }, + { + "epoch": 4.342458100558659, + "grad_norm": 0.8832987546920776, + "learning_rate": 0.000785266106442577, + "loss": 0.5056, + "step": 7773 + }, + { + "epoch": 4.343016759776536, + "grad_norm": 0.4791451096534729, + "learning_rate": 0.0007852380952380954, + "loss": 0.4247, + "step": 7774 + }, + { + "epoch": 4.343575418994414, + "grad_norm": 0.48688995838165283, + "learning_rate": 0.0007852100840336135, + "loss": 0.3427, + "step": 7775 + }, + { + "epoch": 4.34413407821229, + "grad_norm": 0.6144006252288818, + "learning_rate": 0.0007851820728291317, + "loss": 0.5207, + "step": 7776 + }, + { + "epoch": 4.344692737430168, + "grad_norm": 0.7153996229171753, + "learning_rate": 0.0007851540616246499, + "loss": 0.4333, + "step": 7777 + }, + { + "epoch": 4.345251396648044, + "grad_norm": 0.5082197189331055, + "learning_rate": 0.0007851260504201681, + "loss": 0.5625, + "step": 7778 + }, + { + "epoch": 4.345810055865922, + "grad_norm": 0.7209962606430054, + "learning_rate": 0.0007850980392156864, + "loss": 0.3082, + "step": 7779 + }, + { + "epoch": 4.346368715083799, + "grad_norm": 1.3629626035690308, + "learning_rate": 0.0007850700280112045, + "loss": 0.7621, + "step": 7780 + }, + { + "epoch": 4.346927374301676, + "grad_norm": 0.805191159248352, + "learning_rate": 0.0007850420168067227, + "loss": 0.3873, + "step": 7781 + }, + { + "epoch": 4.347486033519553, + "grad_norm": 0.5278725624084473, + "learning_rate": 0.0007850140056022409, + "loss": 0.5137, + "step": 7782 + }, + { + "epoch": 4.34804469273743, + "grad_norm": 1.0014837980270386, + "learning_rate": 0.0007849859943977591, + "loss": 0.4251, + "step": 7783 + }, + { + "epoch": 4.348603351955307, + "grad_norm": 1.8876439332962036, + "learning_rate": 0.0007849579831932774, + "loss": 0.4037, + "step": 7784 + }, + { + "epoch": 4.349162011173185, + "grad_norm": 1.614197015762329, + "learning_rate": 0.0007849299719887955, + "loss": 0.6387, + "step": 7785 + }, + { + "epoch": 4.349720670391061, + "grad_norm": 0.41699346899986267, + "learning_rate": 0.0007849019607843137, + "loss": 0.4062, + "step": 7786 + }, + { + "epoch": 4.350279329608939, + "grad_norm": 3.9328601360321045, + "learning_rate": 0.0007848739495798319, + "loss": 0.4809, + "step": 7787 + }, + { + "epoch": 4.350837988826815, + "grad_norm": 0.7399049997329712, + "learning_rate": 0.0007848459383753501, + "loss": 0.5158, + "step": 7788 + }, + { + "epoch": 4.351396648044693, + "grad_norm": 0.4915332496166229, + "learning_rate": 0.0007848179271708684, + "loss": 0.4852, + "step": 7789 + }, + { + "epoch": 4.351955307262569, + "grad_norm": 0.622516930103302, + "learning_rate": 0.0007847899159663867, + "loss": 0.5343, + "step": 7790 + }, + { + "epoch": 4.352513966480447, + "grad_norm": 0.5885820984840393, + "learning_rate": 0.0007847619047619047, + "loss": 0.3692, + "step": 7791 + }, + { + "epoch": 4.353072625698324, + "grad_norm": 0.8940126895904541, + "learning_rate": 0.000784733893557423, + "loss": 0.5232, + "step": 7792 + }, + { + "epoch": 4.353631284916201, + "grad_norm": 0.5329613089561462, + "learning_rate": 0.0007847058823529412, + "loss": 0.4829, + "step": 7793 + }, + { + "epoch": 4.354189944134078, + "grad_norm": 0.5722092986106873, + "learning_rate": 0.0007846778711484595, + "loss": 0.3944, + "step": 7794 + }, + { + "epoch": 4.354748603351955, + "grad_norm": 1.0231282711029053, + "learning_rate": 0.0007846498599439777, + "loss": 0.6275, + "step": 7795 + }, + { + "epoch": 4.355307262569832, + "grad_norm": 0.5589284896850586, + "learning_rate": 0.0007846218487394958, + "loss": 0.463, + "step": 7796 + }, + { + "epoch": 4.35586592178771, + "grad_norm": 0.7263539433479309, + "learning_rate": 0.000784593837535014, + "loss": 0.5055, + "step": 7797 + }, + { + "epoch": 4.356424581005586, + "grad_norm": 0.7210206389427185, + "learning_rate": 0.0007845658263305322, + "loss": 0.523, + "step": 7798 + }, + { + "epoch": 4.356983240223464, + "grad_norm": 0.7375660538673401, + "learning_rate": 0.0007845378151260505, + "loss": 0.5027, + "step": 7799 + }, + { + "epoch": 4.35754189944134, + "grad_norm": 0.7630391120910645, + "learning_rate": 0.0007845098039215687, + "loss": 0.5205, + "step": 7800 + }, + { + "epoch": 4.358100558659218, + "grad_norm": 1.0278470516204834, + "learning_rate": 0.0007844817927170868, + "loss": 0.4544, + "step": 7801 + }, + { + "epoch": 4.358659217877095, + "grad_norm": 0.5347258448600769, + "learning_rate": 0.000784453781512605, + "loss": 0.4345, + "step": 7802 + }, + { + "epoch": 4.359217877094972, + "grad_norm": 1.4582527875900269, + "learning_rate": 0.0007844257703081232, + "loss": 0.4329, + "step": 7803 + }, + { + "epoch": 4.359776536312849, + "grad_norm": 0.9960485100746155, + "learning_rate": 0.0007843977591036415, + "loss": 0.4255, + "step": 7804 + }, + { + "epoch": 4.360335195530726, + "grad_norm": 0.6236252784729004, + "learning_rate": 0.0007843697478991597, + "loss": 0.393, + "step": 7805 + }, + { + "epoch": 4.360893854748603, + "grad_norm": 0.4125659167766571, + "learning_rate": 0.000784341736694678, + "loss": 0.4279, + "step": 7806 + }, + { + "epoch": 4.361452513966481, + "grad_norm": 0.39700162410736084, + "learning_rate": 0.000784313725490196, + "loss": 0.3637, + "step": 7807 + }, + { + "epoch": 4.362011173184357, + "grad_norm": 0.9261670112609863, + "learning_rate": 0.0007842857142857143, + "loss": 0.3885, + "step": 7808 + }, + { + "epoch": 4.362569832402235, + "grad_norm": 0.5615954995155334, + "learning_rate": 0.0007842577030812325, + "loss": 0.4794, + "step": 7809 + }, + { + "epoch": 4.363128491620111, + "grad_norm": 0.527035653591156, + "learning_rate": 0.0007842296918767508, + "loss": 0.3945, + "step": 7810 + }, + { + "epoch": 4.363687150837989, + "grad_norm": 0.559574544429779, + "learning_rate": 0.000784201680672269, + "loss": 0.4401, + "step": 7811 + }, + { + "epoch": 4.364245810055866, + "grad_norm": 0.8743611574172974, + "learning_rate": 0.0007841736694677871, + "loss": 0.4705, + "step": 7812 + }, + { + "epoch": 4.364804469273743, + "grad_norm": 0.5005408525466919, + "learning_rate": 0.0007841456582633053, + "loss": 0.4856, + "step": 7813 + }, + { + "epoch": 4.36536312849162, + "grad_norm": 0.6589733958244324, + "learning_rate": 0.0007841176470588235, + "loss": 0.4012, + "step": 7814 + }, + { + "epoch": 4.365921787709497, + "grad_norm": 0.5443500280380249, + "learning_rate": 0.0007840896358543418, + "loss": 0.4373, + "step": 7815 + }, + { + "epoch": 4.366480446927374, + "grad_norm": 1.3163281679153442, + "learning_rate": 0.00078406162464986, + "loss": 0.4332, + "step": 7816 + }, + { + "epoch": 4.367039106145251, + "grad_norm": 0.557763934135437, + "learning_rate": 0.0007840336134453781, + "loss": 0.3935, + "step": 7817 + }, + { + "epoch": 4.367597765363128, + "grad_norm": 1.3015865087509155, + "learning_rate": 0.0007840056022408963, + "loss": 0.5916, + "step": 7818 + }, + { + "epoch": 4.368156424581006, + "grad_norm": 0.6346587538719177, + "learning_rate": 0.0007839775910364145, + "loss": 0.4878, + "step": 7819 + }, + { + "epoch": 4.368715083798882, + "grad_norm": 0.5230052471160889, + "learning_rate": 0.0007839495798319328, + "loss": 0.5098, + "step": 7820 + }, + { + "epoch": 4.36927374301676, + "grad_norm": 1.444474458694458, + "learning_rate": 0.000783921568627451, + "loss": 0.4916, + "step": 7821 + }, + { + "epoch": 4.369832402234637, + "grad_norm": 0.438775897026062, + "learning_rate": 0.0007838935574229692, + "loss": 0.4075, + "step": 7822 + }, + { + "epoch": 4.370391061452514, + "grad_norm": 0.8922704458236694, + "learning_rate": 0.0007838655462184873, + "loss": 0.4208, + "step": 7823 + }, + { + "epoch": 4.370949720670391, + "grad_norm": 0.5115568041801453, + "learning_rate": 0.0007838375350140055, + "loss": 0.4542, + "step": 7824 + }, + { + "epoch": 4.371508379888268, + "grad_norm": 0.5422394871711731, + "learning_rate": 0.0007838095238095239, + "loss": 0.6153, + "step": 7825 + }, + { + "epoch": 4.372067039106145, + "grad_norm": 0.5241320133209229, + "learning_rate": 0.0007837815126050421, + "loss": 0.5124, + "step": 7826 + }, + { + "epoch": 4.372625698324022, + "grad_norm": 1.0834288597106934, + "learning_rate": 0.0007837535014005603, + "loss": 0.4078, + "step": 7827 + }, + { + "epoch": 4.373184357541899, + "grad_norm": 0.45994749665260315, + "learning_rate": 0.0007837254901960784, + "loss": 0.4821, + "step": 7828 + }, + { + "epoch": 4.373743016759777, + "grad_norm": 0.39013850688934326, + "learning_rate": 0.0007836974789915966, + "loss": 0.4277, + "step": 7829 + }, + { + "epoch": 4.374301675977653, + "grad_norm": 0.5368283987045288, + "learning_rate": 0.0007836694677871149, + "loss": 0.4957, + "step": 7830 + }, + { + "epoch": 4.374860335195531, + "grad_norm": 0.389849454164505, + "learning_rate": 0.0007836414565826331, + "loss": 0.3773, + "step": 7831 + }, + { + "epoch": 4.375418994413407, + "grad_norm": 0.41745665669441223, + "learning_rate": 0.0007836134453781513, + "loss": 0.5205, + "step": 7832 + }, + { + "epoch": 4.375977653631285, + "grad_norm": 0.49127280712127686, + "learning_rate": 0.0007835854341736694, + "loss": 0.4195, + "step": 7833 + }, + { + "epoch": 4.376536312849162, + "grad_norm": 2.224849224090576, + "learning_rate": 0.0007835574229691876, + "loss": 0.3811, + "step": 7834 + }, + { + "epoch": 4.377094972067039, + "grad_norm": 0.5694179534912109, + "learning_rate": 0.0007835294117647059, + "loss": 0.4436, + "step": 7835 + }, + { + "epoch": 4.377653631284916, + "grad_norm": 0.39705926179885864, + "learning_rate": 0.0007835014005602241, + "loss": 0.3924, + "step": 7836 + }, + { + "epoch": 4.378212290502793, + "grad_norm": 0.5426232218742371, + "learning_rate": 0.0007834733893557423, + "loss": 0.5849, + "step": 7837 + }, + { + "epoch": 4.37877094972067, + "grad_norm": 0.4692694842815399, + "learning_rate": 0.0007834453781512605, + "loss": 0.4039, + "step": 7838 + }, + { + "epoch": 4.379329608938548, + "grad_norm": 0.9278736710548401, + "learning_rate": 0.0007834173669467786, + "loss": 0.5031, + "step": 7839 + }, + { + "epoch": 4.379888268156424, + "grad_norm": 1.2467507123947144, + "learning_rate": 0.000783389355742297, + "loss": 0.3864, + "step": 7840 + }, + { + "epoch": 4.380446927374302, + "grad_norm": 0.4491453170776367, + "learning_rate": 0.0007833613445378152, + "loss": 0.425, + "step": 7841 + }, + { + "epoch": 4.381005586592178, + "grad_norm": 0.44495099782943726, + "learning_rate": 0.0007833333333333334, + "loss": 0.5741, + "step": 7842 + }, + { + "epoch": 4.381564245810056, + "grad_norm": 0.5510337948799133, + "learning_rate": 0.0007833053221288516, + "loss": 0.4354, + "step": 7843 + }, + { + "epoch": 4.382122905027933, + "grad_norm": 0.6221334338188171, + "learning_rate": 0.0007832773109243697, + "loss": 0.472, + "step": 7844 + }, + { + "epoch": 4.38268156424581, + "grad_norm": 0.729478120803833, + "learning_rate": 0.000783249299719888, + "loss": 0.4852, + "step": 7845 + }, + { + "epoch": 4.383240223463687, + "grad_norm": 0.4749382734298706, + "learning_rate": 0.0007832212885154062, + "loss": 0.5305, + "step": 7846 + }, + { + "epoch": 4.383798882681564, + "grad_norm": 0.5327832698822021, + "learning_rate": 0.0007831932773109244, + "loss": 0.5709, + "step": 7847 + }, + { + "epoch": 4.384357541899441, + "grad_norm": 0.5454557538032532, + "learning_rate": 0.0007831652661064426, + "loss": 0.4244, + "step": 7848 + }, + { + "epoch": 4.384916201117319, + "grad_norm": 0.4900103807449341, + "learning_rate": 0.0007831372549019607, + "loss": 0.5125, + "step": 7849 + }, + { + "epoch": 4.385474860335195, + "grad_norm": 3.827024221420288, + "learning_rate": 0.000783109243697479, + "loss": 0.4195, + "step": 7850 + }, + { + "epoch": 4.386033519553073, + "grad_norm": 0.8269819021224976, + "learning_rate": 0.0007830812324929972, + "loss": 0.5196, + "step": 7851 + }, + { + "epoch": 4.386592178770949, + "grad_norm": 0.4805721640586853, + "learning_rate": 0.0007830532212885154, + "loss": 0.4915, + "step": 7852 + }, + { + "epoch": 4.387150837988827, + "grad_norm": 0.4305911362171173, + "learning_rate": 0.0007830252100840336, + "loss": 0.4626, + "step": 7853 + }, + { + "epoch": 4.3877094972067034, + "grad_norm": 0.5364338755607605, + "learning_rate": 0.0007829971988795518, + "loss": 0.4035, + "step": 7854 + }, + { + "epoch": 4.388268156424581, + "grad_norm": 0.6063306927680969, + "learning_rate": 0.00078296918767507, + "loss": 0.5625, + "step": 7855 + }, + { + "epoch": 4.388826815642458, + "grad_norm": 0.8494971990585327, + "learning_rate": 0.0007829411764705882, + "loss": 0.4519, + "step": 7856 + }, + { + "epoch": 4.389385474860335, + "grad_norm": 0.5195983052253723, + "learning_rate": 0.0007829131652661065, + "loss": 0.4807, + "step": 7857 + }, + { + "epoch": 4.389944134078212, + "grad_norm": 0.7848777174949646, + "learning_rate": 0.0007828851540616247, + "loss": 0.6383, + "step": 7858 + }, + { + "epoch": 4.39050279329609, + "grad_norm": 0.5168319344520569, + "learning_rate": 0.0007828571428571429, + "loss": 0.5109, + "step": 7859 + }, + { + "epoch": 4.391061452513966, + "grad_norm": 0.4914497435092926, + "learning_rate": 0.0007828291316526611, + "loss": 0.3733, + "step": 7860 + }, + { + "epoch": 4.391620111731844, + "grad_norm": 0.7273851633071899, + "learning_rate": 0.0007828011204481793, + "loss": 0.3579, + "step": 7861 + }, + { + "epoch": 4.39217877094972, + "grad_norm": 0.5737335681915283, + "learning_rate": 0.0007827731092436975, + "loss": 0.3731, + "step": 7862 + }, + { + "epoch": 4.392737430167598, + "grad_norm": 2.8053438663482666, + "learning_rate": 0.0007827450980392157, + "loss": 0.5674, + "step": 7863 + }, + { + "epoch": 4.3932960893854744, + "grad_norm": 0.4080544412136078, + "learning_rate": 0.0007827170868347339, + "loss": 0.3676, + "step": 7864 + }, + { + "epoch": 4.393854748603352, + "grad_norm": 0.45019564032554626, + "learning_rate": 0.0007826890756302522, + "loss": 0.3994, + "step": 7865 + }, + { + "epoch": 4.394413407821229, + "grad_norm": 0.5106619596481323, + "learning_rate": 0.0007826610644257703, + "loss": 0.4992, + "step": 7866 + }, + { + "epoch": 4.394972067039106, + "grad_norm": 0.4138012230396271, + "learning_rate": 0.0007826330532212885, + "loss": 0.4457, + "step": 7867 + }, + { + "epoch": 4.395530726256983, + "grad_norm": 0.5644697546958923, + "learning_rate": 0.0007826050420168067, + "loss": 0.6089, + "step": 7868 + }, + { + "epoch": 4.39608938547486, + "grad_norm": 0.5922969579696655, + "learning_rate": 0.0007825770308123249, + "loss": 0.5281, + "step": 7869 + }, + { + "epoch": 4.396648044692737, + "grad_norm": 0.4072439968585968, + "learning_rate": 0.0007825490196078432, + "loss": 0.4275, + "step": 7870 + }, + { + "epoch": 4.397206703910615, + "grad_norm": 0.7072669863700867, + "learning_rate": 0.0007825210084033613, + "loss": 0.4589, + "step": 7871 + }, + { + "epoch": 4.397765363128491, + "grad_norm": 0.6356338858604431, + "learning_rate": 0.0007824929971988795, + "loss": 0.4508, + "step": 7872 + }, + { + "epoch": 4.398324022346369, + "grad_norm": 0.8144755363464355, + "learning_rate": 0.0007824649859943977, + "loss": 0.5293, + "step": 7873 + }, + { + "epoch": 4.3988826815642454, + "grad_norm": 1.7882473468780518, + "learning_rate": 0.000782436974789916, + "loss": 0.4197, + "step": 7874 + }, + { + "epoch": 4.399441340782123, + "grad_norm": 0.9120373725891113, + "learning_rate": 0.0007824089635854343, + "loss": 0.5276, + "step": 7875 + }, + { + "epoch": 4.4, + "grad_norm": 0.6548590660095215, + "learning_rate": 0.0007823809523809524, + "loss": 0.5009, + "step": 7876 + }, + { + "epoch": 4.400558659217877, + "grad_norm": 0.4574669301509857, + "learning_rate": 0.0007823529411764706, + "loss": 0.4263, + "step": 7877 + }, + { + "epoch": 4.401117318435754, + "grad_norm": 1.0311496257781982, + "learning_rate": 0.0007823249299719888, + "loss": 0.4399, + "step": 7878 + }, + { + "epoch": 4.401675977653631, + "grad_norm": 0.5758455991744995, + "learning_rate": 0.000782296918767507, + "loss": 0.5233, + "step": 7879 + }, + { + "epoch": 4.402234636871508, + "grad_norm": 0.5126032829284668, + "learning_rate": 0.0007822689075630253, + "loss": 0.3857, + "step": 7880 + }, + { + "epoch": 4.402793296089386, + "grad_norm": 0.5499773621559143, + "learning_rate": 0.0007822408963585435, + "loss": 0.4389, + "step": 7881 + }, + { + "epoch": 4.403351955307262, + "grad_norm": 0.477298766374588, + "learning_rate": 0.0007822128851540616, + "loss": 0.4872, + "step": 7882 + }, + { + "epoch": 4.40391061452514, + "grad_norm": 0.58132004737854, + "learning_rate": 0.0007821848739495798, + "loss": 0.5054, + "step": 7883 + }, + { + "epoch": 4.4044692737430164, + "grad_norm": 0.6866540312767029, + "learning_rate": 0.000782156862745098, + "loss": 0.5742, + "step": 7884 + }, + { + "epoch": 4.405027932960894, + "grad_norm": 0.6847009062767029, + "learning_rate": 0.0007821288515406163, + "loss": 0.5601, + "step": 7885 + }, + { + "epoch": 4.405586592178771, + "grad_norm": 0.7857428789138794, + "learning_rate": 0.0007821008403361345, + "loss": 0.4504, + "step": 7886 + }, + { + "epoch": 4.406145251396648, + "grad_norm": 0.420052170753479, + "learning_rate": 0.0007820728291316526, + "loss": 0.4357, + "step": 7887 + }, + { + "epoch": 4.406703910614525, + "grad_norm": 0.7023537158966064, + "learning_rate": 0.0007820448179271708, + "loss": 0.5265, + "step": 7888 + }, + { + "epoch": 4.407262569832402, + "grad_norm": 0.5363319516181946, + "learning_rate": 0.000782016806722689, + "loss": 0.4931, + "step": 7889 + }, + { + "epoch": 4.407821229050279, + "grad_norm": 0.5596529245376587, + "learning_rate": 0.0007819887955182074, + "loss": 0.4552, + "step": 7890 + }, + { + "epoch": 4.408379888268156, + "grad_norm": 0.40796831250190735, + "learning_rate": 0.0007819607843137256, + "loss": 0.3805, + "step": 7891 + }, + { + "epoch": 4.408938547486033, + "grad_norm": 0.522153377532959, + "learning_rate": 0.0007819327731092437, + "loss": 0.4754, + "step": 7892 + }, + { + "epoch": 4.409497206703911, + "grad_norm": 1.0707128047943115, + "learning_rate": 0.0007819047619047619, + "loss": 0.4913, + "step": 7893 + }, + { + "epoch": 4.410055865921787, + "grad_norm": 18.974531173706055, + "learning_rate": 0.0007818767507002801, + "loss": 0.3199, + "step": 7894 + }, + { + "epoch": 4.410614525139665, + "grad_norm": 0.6861658096313477, + "learning_rate": 0.0007818487394957984, + "loss": 0.4648, + "step": 7895 + }, + { + "epoch": 4.411173184357542, + "grad_norm": 0.6258499622344971, + "learning_rate": 0.0007818207282913166, + "loss": 0.4205, + "step": 7896 + }, + { + "epoch": 4.411731843575419, + "grad_norm": 0.6685450673103333, + "learning_rate": 0.0007817927170868348, + "loss": 0.4357, + "step": 7897 + }, + { + "epoch": 4.412290502793296, + "grad_norm": 0.8548688292503357, + "learning_rate": 0.0007817647058823529, + "loss": 0.4457, + "step": 7898 + }, + { + "epoch": 4.412849162011173, + "grad_norm": 0.678438663482666, + "learning_rate": 0.0007817366946778711, + "loss": 0.7488, + "step": 7899 + }, + { + "epoch": 4.41340782122905, + "grad_norm": 0.5791630744934082, + "learning_rate": 0.0007817086834733894, + "loss": 0.533, + "step": 7900 + }, + { + "epoch": 4.413966480446927, + "grad_norm": 0.46744436025619507, + "learning_rate": 0.0007816806722689076, + "loss": 0.4145, + "step": 7901 + }, + { + "epoch": 4.414525139664804, + "grad_norm": 0.47127410769462585, + "learning_rate": 0.0007816526610644258, + "loss": 0.403, + "step": 7902 + }, + { + "epoch": 4.415083798882682, + "grad_norm": 0.5942014455795288, + "learning_rate": 0.0007816246498599439, + "loss": 0.4657, + "step": 7903 + }, + { + "epoch": 4.415642458100558, + "grad_norm": 0.5808228254318237, + "learning_rate": 0.0007815966386554621, + "loss": 0.7274, + "step": 7904 + }, + { + "epoch": 4.416201117318436, + "grad_norm": 0.634122908115387, + "learning_rate": 0.0007815686274509805, + "loss": 0.4007, + "step": 7905 + }, + { + "epoch": 4.4167597765363125, + "grad_norm": 0.6995765566825867, + "learning_rate": 0.0007815406162464987, + "loss": 0.476, + "step": 7906 + }, + { + "epoch": 4.41731843575419, + "grad_norm": 0.5117874145507812, + "learning_rate": 0.0007815126050420169, + "loss": 0.5667, + "step": 7907 + }, + { + "epoch": 4.417877094972067, + "grad_norm": 0.925533652305603, + "learning_rate": 0.000781484593837535, + "loss": 0.6343, + "step": 7908 + }, + { + "epoch": 4.418435754189944, + "grad_norm": 0.6137980818748474, + "learning_rate": 0.0007814565826330532, + "loss": 0.498, + "step": 7909 + }, + { + "epoch": 4.418994413407821, + "grad_norm": 0.6770252585411072, + "learning_rate": 0.0007814285714285715, + "loss": 0.5254, + "step": 7910 + }, + { + "epoch": 4.419553072625698, + "grad_norm": 0.6024884581565857, + "learning_rate": 0.0007814005602240897, + "loss": 0.3379, + "step": 7911 + }, + { + "epoch": 4.420111731843575, + "grad_norm": 2.3432741165161133, + "learning_rate": 0.0007813725490196079, + "loss": 0.4817, + "step": 7912 + }, + { + "epoch": 4.420670391061453, + "grad_norm": 0.4075697660446167, + "learning_rate": 0.0007813445378151261, + "loss": 0.465, + "step": 7913 + }, + { + "epoch": 4.421229050279329, + "grad_norm": 0.46285125613212585, + "learning_rate": 0.0007813165266106442, + "loss": 0.5354, + "step": 7914 + }, + { + "epoch": 4.421787709497207, + "grad_norm": 0.5492990016937256, + "learning_rate": 0.0007812885154061625, + "loss": 0.4388, + "step": 7915 + }, + { + "epoch": 4.4223463687150835, + "grad_norm": 0.481118768453598, + "learning_rate": 0.0007812605042016807, + "loss": 0.446, + "step": 7916 + }, + { + "epoch": 4.422905027932961, + "grad_norm": 0.629823625087738, + "learning_rate": 0.0007812324929971989, + "loss": 0.5816, + "step": 7917 + }, + { + "epoch": 4.423463687150838, + "grad_norm": 0.5381523370742798, + "learning_rate": 0.0007812044817927171, + "loss": 0.3537, + "step": 7918 + }, + { + "epoch": 4.424022346368715, + "grad_norm": 0.4390683174133301, + "learning_rate": 0.0007811764705882352, + "loss": 0.3276, + "step": 7919 + }, + { + "epoch": 4.424581005586592, + "grad_norm": 0.6150350570678711, + "learning_rate": 0.0007811484593837535, + "loss": 0.6111, + "step": 7920 + }, + { + "epoch": 4.425139664804469, + "grad_norm": 5.276768684387207, + "learning_rate": 0.0007811204481792717, + "loss": 0.5148, + "step": 7921 + }, + { + "epoch": 4.425698324022346, + "grad_norm": 0.46458321809768677, + "learning_rate": 0.00078109243697479, + "loss": 0.3958, + "step": 7922 + }, + { + "epoch": 4.426256983240224, + "grad_norm": 0.49893718957901, + "learning_rate": 0.0007810644257703082, + "loss": 0.3603, + "step": 7923 + }, + { + "epoch": 4.4268156424581, + "grad_norm": 0.4741962254047394, + "learning_rate": 0.0007810364145658263, + "loss": 0.5726, + "step": 7924 + }, + { + "epoch": 4.427374301675978, + "grad_norm": 0.5961857438087463, + "learning_rate": 0.0007810084033613446, + "loss": 0.3475, + "step": 7925 + }, + { + "epoch": 4.4279329608938545, + "grad_norm": 0.5745024681091309, + "learning_rate": 0.0007809803921568628, + "loss": 0.3747, + "step": 7926 + }, + { + "epoch": 4.428491620111732, + "grad_norm": 0.44040071964263916, + "learning_rate": 0.000780952380952381, + "loss": 0.528, + "step": 7927 + }, + { + "epoch": 4.4290502793296085, + "grad_norm": 0.9399561882019043, + "learning_rate": 0.0007809243697478992, + "loss": 0.3616, + "step": 7928 + }, + { + "epoch": 4.429608938547486, + "grad_norm": 0.6722110509872437, + "learning_rate": 0.0007808963585434174, + "loss": 0.5126, + "step": 7929 + }, + { + "epoch": 4.430167597765363, + "grad_norm": 0.5289178490638733, + "learning_rate": 0.0007808683473389356, + "loss": 0.3877, + "step": 7930 + }, + { + "epoch": 4.43072625698324, + "grad_norm": 0.403576135635376, + "learning_rate": 0.0007808403361344538, + "loss": 0.476, + "step": 7931 + }, + { + "epoch": 4.431284916201117, + "grad_norm": 5.940262317657471, + "learning_rate": 0.000780812324929972, + "loss": 0.3683, + "step": 7932 + }, + { + "epoch": 4.431843575418995, + "grad_norm": 0.41928836703300476, + "learning_rate": 0.0007807843137254902, + "loss": 0.4171, + "step": 7933 + }, + { + "epoch": 4.432402234636871, + "grad_norm": 0.5359971523284912, + "learning_rate": 0.0007807563025210084, + "loss": 0.4765, + "step": 7934 + }, + { + "epoch": 4.432960893854749, + "grad_norm": 0.44485288858413696, + "learning_rate": 0.0007807282913165266, + "loss": 0.4109, + "step": 7935 + }, + { + "epoch": 4.4335195530726255, + "grad_norm": 1.1097968816757202, + "learning_rate": 0.0007807002801120448, + "loss": 0.4427, + "step": 7936 + }, + { + "epoch": 4.434078212290503, + "grad_norm": 0.44607189297676086, + "learning_rate": 0.000780672268907563, + "loss": 0.4237, + "step": 7937 + }, + { + "epoch": 4.4346368715083795, + "grad_norm": 0.8777729272842407, + "learning_rate": 0.0007806442577030812, + "loss": 0.4944, + "step": 7938 + }, + { + "epoch": 4.435195530726257, + "grad_norm": 1.0470962524414062, + "learning_rate": 0.0007806162464985995, + "loss": 0.4583, + "step": 7939 + }, + { + "epoch": 4.435754189944134, + "grad_norm": 0.6383278965950012, + "learning_rate": 0.0007805882352941177, + "loss": 0.4549, + "step": 7940 + }, + { + "epoch": 4.436312849162011, + "grad_norm": 0.8835601806640625, + "learning_rate": 0.0007805602240896359, + "loss": 0.4746, + "step": 7941 + }, + { + "epoch": 4.436871508379888, + "grad_norm": 0.49641773104667664, + "learning_rate": 0.0007805322128851541, + "loss": 0.451, + "step": 7942 + }, + { + "epoch": 4.437430167597765, + "grad_norm": 0.529076337814331, + "learning_rate": 0.0007805042016806723, + "loss": 0.4666, + "step": 7943 + }, + { + "epoch": 4.437988826815642, + "grad_norm": 0.3934054672718048, + "learning_rate": 0.0007804761904761905, + "loss": 0.3835, + "step": 7944 + }, + { + "epoch": 4.43854748603352, + "grad_norm": 2.3144543170928955, + "learning_rate": 0.0007804481792717088, + "loss": 0.5169, + "step": 7945 + }, + { + "epoch": 4.4391061452513965, + "grad_norm": 0.6031854152679443, + "learning_rate": 0.0007804201680672269, + "loss": 0.4107, + "step": 7946 + }, + { + "epoch": 4.439664804469274, + "grad_norm": 0.43561676144599915, + "learning_rate": 0.0007803921568627451, + "loss": 0.4557, + "step": 7947 + }, + { + "epoch": 4.4402234636871505, + "grad_norm": 0.45999065041542053, + "learning_rate": 0.0007803641456582633, + "loss": 0.4173, + "step": 7948 + }, + { + "epoch": 4.440782122905028, + "grad_norm": 0.4490691125392914, + "learning_rate": 0.0007803361344537815, + "loss": 0.4402, + "step": 7949 + }, + { + "epoch": 4.441340782122905, + "grad_norm": 0.5090757608413696, + "learning_rate": 0.0007803081232492998, + "loss": 0.3995, + "step": 7950 + }, + { + "epoch": 4.441899441340782, + "grad_norm": 4.465834617614746, + "learning_rate": 0.0007802801120448179, + "loss": 0.3529, + "step": 7951 + }, + { + "epoch": 4.442458100558659, + "grad_norm": 0.6415351629257202, + "learning_rate": 0.0007802521008403361, + "loss": 0.4844, + "step": 7952 + }, + { + "epoch": 4.443016759776536, + "grad_norm": 0.479928582906723, + "learning_rate": 0.0007802240896358543, + "loss": 0.4223, + "step": 7953 + }, + { + "epoch": 4.443575418994413, + "grad_norm": 0.5131279230117798, + "learning_rate": 0.0007801960784313725, + "loss": 0.5309, + "step": 7954 + }, + { + "epoch": 4.444134078212291, + "grad_norm": 0.8900812864303589, + "learning_rate": 0.0007801680672268909, + "loss": 0.6127, + "step": 7955 + }, + { + "epoch": 4.4446927374301675, + "grad_norm": 1.5215139389038086, + "learning_rate": 0.000780140056022409, + "loss": 0.4101, + "step": 7956 + }, + { + "epoch": 4.445251396648045, + "grad_norm": 0.4844955503940582, + "learning_rate": 0.0007801120448179272, + "loss": 0.4045, + "step": 7957 + }, + { + "epoch": 4.4458100558659215, + "grad_norm": 0.5265480875968933, + "learning_rate": 0.0007800840336134454, + "loss": 0.4807, + "step": 7958 + }, + { + "epoch": 4.446368715083799, + "grad_norm": 0.8821099400520325, + "learning_rate": 0.0007800560224089636, + "loss": 0.4003, + "step": 7959 + }, + { + "epoch": 4.446927374301676, + "grad_norm": 0.5669199228286743, + "learning_rate": 0.0007800280112044819, + "loss": 0.4863, + "step": 7960 + }, + { + "epoch": 4.447486033519553, + "grad_norm": 0.5125001072883606, + "learning_rate": 0.0007800000000000001, + "loss": 0.3645, + "step": 7961 + }, + { + "epoch": 4.44804469273743, + "grad_norm": 0.48797422647476196, + "learning_rate": 0.0007799719887955182, + "loss": 0.462, + "step": 7962 + }, + { + "epoch": 4.448603351955307, + "grad_norm": 0.6037314534187317, + "learning_rate": 0.0007799439775910364, + "loss": 0.4518, + "step": 7963 + }, + { + "epoch": 4.449162011173184, + "grad_norm": 0.609241247177124, + "learning_rate": 0.0007799159663865546, + "loss": 0.355, + "step": 7964 + }, + { + "epoch": 4.449720670391061, + "grad_norm": 0.717847466468811, + "learning_rate": 0.0007798879551820729, + "loss": 0.5084, + "step": 7965 + }, + { + "epoch": 4.4502793296089385, + "grad_norm": 0.4940887987613678, + "learning_rate": 0.0007798599439775911, + "loss": 0.4814, + "step": 7966 + }, + { + "epoch": 4.450837988826816, + "grad_norm": 0.7615885138511658, + "learning_rate": 0.0007798319327731092, + "loss": 0.442, + "step": 7967 + }, + { + "epoch": 4.4513966480446925, + "grad_norm": 0.5444573760032654, + "learning_rate": 0.0007798039215686274, + "loss": 0.5411, + "step": 7968 + }, + { + "epoch": 4.45195530726257, + "grad_norm": 1.0647783279418945, + "learning_rate": 0.0007797759103641456, + "loss": 0.4967, + "step": 7969 + }, + { + "epoch": 4.452513966480447, + "grad_norm": 0.7675191164016724, + "learning_rate": 0.000779747899159664, + "loss": 0.3854, + "step": 7970 + }, + { + "epoch": 4.453072625698324, + "grad_norm": 0.4078288972377777, + "learning_rate": 0.0007797198879551822, + "loss": 0.4345, + "step": 7971 + }, + { + "epoch": 4.453631284916201, + "grad_norm": 0.5696030259132385, + "learning_rate": 0.0007796918767507003, + "loss": 0.41, + "step": 7972 + }, + { + "epoch": 4.454189944134078, + "grad_norm": 2.22721791267395, + "learning_rate": 0.0007796638655462185, + "loss": 0.4101, + "step": 7973 + }, + { + "epoch": 4.454748603351955, + "grad_norm": 0.5910645127296448, + "learning_rate": 0.0007796358543417367, + "loss": 0.4775, + "step": 7974 + }, + { + "epoch": 4.455307262569832, + "grad_norm": 0.8175109624862671, + "learning_rate": 0.000779607843137255, + "loss": 0.4807, + "step": 7975 + }, + { + "epoch": 4.4558659217877095, + "grad_norm": 0.5076733827590942, + "learning_rate": 0.0007795798319327732, + "loss": 0.5027, + "step": 7976 + }, + { + "epoch": 4.456424581005587, + "grad_norm": 1.0723190307617188, + "learning_rate": 0.0007795518207282914, + "loss": 0.6256, + "step": 7977 + }, + { + "epoch": 4.4569832402234635, + "grad_norm": 0.5672963857650757, + "learning_rate": 0.0007795238095238095, + "loss": 0.4953, + "step": 7978 + }, + { + "epoch": 4.457541899441341, + "grad_norm": 1.0351548194885254, + "learning_rate": 0.0007794957983193277, + "loss": 0.504, + "step": 7979 + }, + { + "epoch": 4.4581005586592175, + "grad_norm": 0.5324487090110779, + "learning_rate": 0.000779467787114846, + "loss": 0.4875, + "step": 7980 + }, + { + "epoch": 4.458659217877095, + "grad_norm": 0.5606355667114258, + "learning_rate": 0.0007794397759103642, + "loss": 0.5029, + "step": 7981 + }, + { + "epoch": 4.459217877094972, + "grad_norm": 0.5287914276123047, + "learning_rate": 0.0007794117647058824, + "loss": 0.4953, + "step": 7982 + }, + { + "epoch": 4.459776536312849, + "grad_norm": 0.7274687886238098, + "learning_rate": 0.0007793837535014005, + "loss": 0.5171, + "step": 7983 + }, + { + "epoch": 4.460335195530726, + "grad_norm": 1.0861328840255737, + "learning_rate": 0.0007793557422969187, + "loss": 0.4176, + "step": 7984 + }, + { + "epoch": 4.460893854748603, + "grad_norm": 0.4057898223400116, + "learning_rate": 0.000779327731092437, + "loss": 0.3301, + "step": 7985 + }, + { + "epoch": 4.4614525139664805, + "grad_norm": 0.43559736013412476, + "learning_rate": 0.0007792997198879552, + "loss": 0.4216, + "step": 7986 + }, + { + "epoch": 4.462011173184358, + "grad_norm": 0.7434982657432556, + "learning_rate": 0.0007792717086834735, + "loss": 0.5302, + "step": 7987 + }, + { + "epoch": 4.4625698324022345, + "grad_norm": 0.7655741572380066, + "learning_rate": 0.0007792436974789915, + "loss": 0.4237, + "step": 7988 + }, + { + "epoch": 4.463128491620112, + "grad_norm": 0.5505525469779968, + "learning_rate": 0.0007792156862745098, + "loss": 0.4644, + "step": 7989 + }, + { + "epoch": 4.4636871508379885, + "grad_norm": 0.45169299840927124, + "learning_rate": 0.0007791876750700281, + "loss": 0.3699, + "step": 7990 + }, + { + "epoch": 4.464245810055866, + "grad_norm": 2.1300570964813232, + "learning_rate": 0.0007791596638655463, + "loss": 0.431, + "step": 7991 + }, + { + "epoch": 4.464804469273743, + "grad_norm": 0.49451008439064026, + "learning_rate": 0.0007791316526610645, + "loss": 0.5059, + "step": 7992 + }, + { + "epoch": 4.46536312849162, + "grad_norm": 0.6582427620887756, + "learning_rate": 0.0007791036414565827, + "loss": 0.5685, + "step": 7993 + }, + { + "epoch": 4.465921787709497, + "grad_norm": 0.6332744359970093, + "learning_rate": 0.0007790756302521008, + "loss": 0.4735, + "step": 7994 + }, + { + "epoch": 4.466480446927374, + "grad_norm": 0.5850993394851685, + "learning_rate": 0.0007790476190476191, + "loss": 0.424, + "step": 7995 + }, + { + "epoch": 4.4670391061452515, + "grad_norm": 0.38231340050697327, + "learning_rate": 0.0007790196078431373, + "loss": 0.3643, + "step": 7996 + }, + { + "epoch": 4.467597765363129, + "grad_norm": 0.5427983403205872, + "learning_rate": 0.0007789915966386555, + "loss": 0.522, + "step": 7997 + }, + { + "epoch": 4.4681564245810055, + "grad_norm": 1.218829870223999, + "learning_rate": 0.0007789635854341737, + "loss": 0.4389, + "step": 7998 + }, + { + "epoch": 4.468715083798883, + "grad_norm": 0.4756756126880646, + "learning_rate": 0.0007789355742296918, + "loss": 0.3267, + "step": 7999 + }, + { + "epoch": 4.4692737430167595, + "grad_norm": 1.136558175086975, + "learning_rate": 0.0007789075630252101, + "loss": 0.4208, + "step": 8000 + }, + { + "epoch": 4.4692737430167595, + "eval_cer": 0.09291021570488692, + "eval_loss": 0.3547140657901764, + "eval_runtime": 55.8888, + "eval_samples_per_second": 81.197, + "eval_steps_per_second": 5.082, + "eval_wer": 0.36649814250998575, + "step": 8000 + }, + { + "epoch": 4.469832402234637, + "grad_norm": 0.3521578907966614, + "learning_rate": 0.0007788795518207283, + "loss": 0.4491, + "step": 8001 + }, + { + "epoch": 4.4703910614525135, + "grad_norm": 1.8238716125488281, + "learning_rate": 0.0007788515406162465, + "loss": 0.4901, + "step": 8002 + }, + { + "epoch": 4.470949720670391, + "grad_norm": 0.5393496751785278, + "learning_rate": 0.0007788235294117647, + "loss": 0.5206, + "step": 8003 + }, + { + "epoch": 4.471508379888268, + "grad_norm": 1.8020495176315308, + "learning_rate": 0.0007787955182072828, + "loss": 0.4162, + "step": 8004 + }, + { + "epoch": 4.472067039106145, + "grad_norm": 0.6742940545082092, + "learning_rate": 0.0007787675070028012, + "loss": 0.3896, + "step": 8005 + }, + { + "epoch": 4.4726256983240225, + "grad_norm": 0.39385151863098145, + "learning_rate": 0.0007787394957983194, + "loss": 0.3512, + "step": 8006 + }, + { + "epoch": 4.473184357541899, + "grad_norm": 0.5696566700935364, + "learning_rate": 0.0007787114845938376, + "loss": 0.4743, + "step": 8007 + }, + { + "epoch": 4.4737430167597765, + "grad_norm": 0.5739765167236328, + "learning_rate": 0.0007786834733893558, + "loss": 0.3647, + "step": 8008 + }, + { + "epoch": 4.474301675977654, + "grad_norm": 0.6203718781471252, + "learning_rate": 0.000778655462184874, + "loss": 0.4498, + "step": 8009 + }, + { + "epoch": 4.4748603351955305, + "grad_norm": 0.6317752599716187, + "learning_rate": 0.0007786274509803922, + "loss": 0.4509, + "step": 8010 + }, + { + "epoch": 4.475418994413408, + "grad_norm": 0.660818874835968, + "learning_rate": 0.0007785994397759104, + "loss": 0.4748, + "step": 8011 + }, + { + "epoch": 4.4759776536312845, + "grad_norm": 0.9094657301902771, + "learning_rate": 0.0007785714285714286, + "loss": 0.458, + "step": 8012 + }, + { + "epoch": 4.476536312849162, + "grad_norm": 0.5310734510421753, + "learning_rate": 0.0007785434173669468, + "loss": 0.4594, + "step": 8013 + }, + { + "epoch": 4.477094972067039, + "grad_norm": 0.6198703646659851, + "learning_rate": 0.000778515406162465, + "loss": 0.3527, + "step": 8014 + }, + { + "epoch": 4.477653631284916, + "grad_norm": 0.43491727113723755, + "learning_rate": 0.0007784873949579832, + "loss": 0.4105, + "step": 8015 + }, + { + "epoch": 4.4782122905027935, + "grad_norm": 0.5587567687034607, + "learning_rate": 0.0007784593837535014, + "loss": 0.451, + "step": 8016 + }, + { + "epoch": 4.47877094972067, + "grad_norm": 0.5126293301582336, + "learning_rate": 0.0007784313725490196, + "loss": 0.4141, + "step": 8017 + }, + { + "epoch": 4.4793296089385475, + "grad_norm": 0.43808814883232117, + "learning_rate": 0.0007784033613445378, + "loss": 0.4036, + "step": 8018 + }, + { + "epoch": 4.479888268156425, + "grad_norm": 0.5458375811576843, + "learning_rate": 0.000778375350140056, + "loss": 0.3898, + "step": 8019 + }, + { + "epoch": 4.4804469273743015, + "grad_norm": 0.3859800696372986, + "learning_rate": 0.0007783473389355742, + "loss": 0.3886, + "step": 8020 + }, + { + "epoch": 4.481005586592179, + "grad_norm": 0.5757061839103699, + "learning_rate": 0.0007783193277310925, + "loss": 0.4473, + "step": 8021 + }, + { + "epoch": 4.4815642458100555, + "grad_norm": 0.6346460580825806, + "learning_rate": 0.0007782913165266107, + "loss": 0.4926, + "step": 8022 + }, + { + "epoch": 4.482122905027933, + "grad_norm": 0.40187501907348633, + "learning_rate": 0.0007782633053221289, + "loss": 0.3927, + "step": 8023 + }, + { + "epoch": 4.48268156424581, + "grad_norm": 0.3430120348930359, + "learning_rate": 0.0007782352941176471, + "loss": 0.3219, + "step": 8024 + }, + { + "epoch": 4.483240223463687, + "grad_norm": 1.0303021669387817, + "learning_rate": 0.0007782072829131654, + "loss": 0.5212, + "step": 8025 + }, + { + "epoch": 4.4837988826815645, + "grad_norm": 0.706115186214447, + "learning_rate": 0.0007781792717086835, + "loss": 0.5569, + "step": 8026 + }, + { + "epoch": 4.484357541899441, + "grad_norm": 0.8488256335258484, + "learning_rate": 0.0007781512605042017, + "loss": 0.3556, + "step": 8027 + }, + { + "epoch": 4.4849162011173185, + "grad_norm": 0.3991093039512634, + "learning_rate": 0.0007781232492997199, + "loss": 0.5189, + "step": 8028 + }, + { + "epoch": 4.485474860335196, + "grad_norm": 0.838813841342926, + "learning_rate": 0.0007780952380952381, + "loss": 0.5975, + "step": 8029 + }, + { + "epoch": 4.4860335195530725, + "grad_norm": 0.697236955165863, + "learning_rate": 0.0007780672268907563, + "loss": 0.4672, + "step": 8030 + }, + { + "epoch": 4.48659217877095, + "grad_norm": 0.4864196181297302, + "learning_rate": 0.0007780392156862745, + "loss": 0.3837, + "step": 8031 + }, + { + "epoch": 4.4871508379888265, + "grad_norm": 0.3497955799102783, + "learning_rate": 0.0007780112044817927, + "loss": 0.4202, + "step": 8032 + }, + { + "epoch": 4.487709497206704, + "grad_norm": 0.516545295715332, + "learning_rate": 0.0007779831932773109, + "loss": 0.4012, + "step": 8033 + }, + { + "epoch": 4.488268156424581, + "grad_norm": 0.5043094158172607, + "learning_rate": 0.0007779551820728291, + "loss": 0.4695, + "step": 8034 + }, + { + "epoch": 4.488826815642458, + "grad_norm": 1.0219391584396362, + "learning_rate": 0.0007779271708683473, + "loss": 0.4908, + "step": 8035 + }, + { + "epoch": 4.4893854748603355, + "grad_norm": 0.6130095720291138, + "learning_rate": 0.0007778991596638655, + "loss": 0.4951, + "step": 8036 + }, + { + "epoch": 4.489944134078212, + "grad_norm": 0.5657668709754944, + "learning_rate": 0.0007778711484593837, + "loss": 0.4722, + "step": 8037 + }, + { + "epoch": 4.4905027932960895, + "grad_norm": 0.6289819478988647, + "learning_rate": 0.000777843137254902, + "loss": 0.5806, + "step": 8038 + }, + { + "epoch": 4.491061452513966, + "grad_norm": 0.5132222175598145, + "learning_rate": 0.0007778151260504202, + "loss": 0.4775, + "step": 8039 + }, + { + "epoch": 4.4916201117318435, + "grad_norm": 1.0361062288284302, + "learning_rate": 0.0007777871148459384, + "loss": 0.436, + "step": 8040 + }, + { + "epoch": 4.492178770949721, + "grad_norm": 0.6603046655654907, + "learning_rate": 0.0007777591036414567, + "loss": 0.4416, + "step": 8041 + }, + { + "epoch": 4.4927374301675975, + "grad_norm": 0.8079709410667419, + "learning_rate": 0.0007777310924369748, + "loss": 0.4466, + "step": 8042 + }, + { + "epoch": 4.493296089385475, + "grad_norm": 0.7630321979522705, + "learning_rate": 0.000777703081232493, + "loss": 0.4332, + "step": 8043 + }, + { + "epoch": 4.4938547486033515, + "grad_norm": 0.5348184704780579, + "learning_rate": 0.0007776750700280112, + "loss": 0.5214, + "step": 8044 + }, + { + "epoch": 4.494413407821229, + "grad_norm": 0.6643372774124146, + "learning_rate": 0.0007776470588235294, + "loss": 0.5079, + "step": 8045 + }, + { + "epoch": 4.4949720670391065, + "grad_norm": 0.44299614429473877, + "learning_rate": 0.0007776190476190477, + "loss": 0.5242, + "step": 8046 + }, + { + "epoch": 4.495530726256983, + "grad_norm": 1.164122462272644, + "learning_rate": 0.0007775910364145658, + "loss": 0.5211, + "step": 8047 + }, + { + "epoch": 4.4960893854748605, + "grad_norm": 0.5251310467720032, + "learning_rate": 0.000777563025210084, + "loss": 0.4982, + "step": 8048 + }, + { + "epoch": 4.496648044692737, + "grad_norm": 0.6160804629325867, + "learning_rate": 0.0007775350140056022, + "loss": 0.5599, + "step": 8049 + }, + { + "epoch": 4.4972067039106145, + "grad_norm": 0.49206408858299255, + "learning_rate": 0.0007775070028011204, + "loss": 0.5309, + "step": 8050 + }, + { + "epoch": 4.497765363128492, + "grad_norm": 0.4938124418258667, + "learning_rate": 0.0007774789915966387, + "loss": 0.4639, + "step": 8051 + }, + { + "epoch": 4.4983240223463685, + "grad_norm": 2.6455161571502686, + "learning_rate": 0.0007774509803921568, + "loss": 0.4613, + "step": 8052 + }, + { + "epoch": 4.498882681564246, + "grad_norm": 0.5216096639633179, + "learning_rate": 0.000777422969187675, + "loss": 0.4416, + "step": 8053 + }, + { + "epoch": 4.4994413407821225, + "grad_norm": 0.6239911913871765, + "learning_rate": 0.0007773949579831933, + "loss": 0.4668, + "step": 8054 + }, + { + "epoch": 4.5, + "grad_norm": 0.900478184223175, + "learning_rate": 0.0007773669467787115, + "loss": 0.4795, + "step": 8055 + }, + { + "epoch": 4.5005586592178775, + "grad_norm": 0.5670730471611023, + "learning_rate": 0.0007773389355742298, + "loss": 0.433, + "step": 8056 + }, + { + "epoch": 4.501117318435754, + "grad_norm": 0.3907197415828705, + "learning_rate": 0.000777310924369748, + "loss": 0.4655, + "step": 8057 + }, + { + "epoch": 4.5016759776536315, + "grad_norm": 0.9997380375862122, + "learning_rate": 0.0007772829131652661, + "loss": 0.447, + "step": 8058 + }, + { + "epoch": 4.502234636871508, + "grad_norm": 1.165135383605957, + "learning_rate": 0.0007772549019607843, + "loss": 0.5339, + "step": 8059 + }, + { + "epoch": 4.5027932960893855, + "grad_norm": 0.5992472767829895, + "learning_rate": 0.0007772268907563025, + "loss": 0.4652, + "step": 8060 + }, + { + "epoch": 4.503351955307263, + "grad_norm": 0.6337040662765503, + "learning_rate": 0.0007771988795518208, + "loss": 0.509, + "step": 8061 + }, + { + "epoch": 4.5039106145251395, + "grad_norm": 1.6205978393554688, + "learning_rate": 0.000777170868347339, + "loss": 0.3834, + "step": 8062 + }, + { + "epoch": 4.504469273743017, + "grad_norm": 0.5395073294639587, + "learning_rate": 0.0007771428571428571, + "loss": 0.3847, + "step": 8063 + }, + { + "epoch": 4.5050279329608935, + "grad_norm": 1.7489677667617798, + "learning_rate": 0.0007771148459383753, + "loss": 0.4477, + "step": 8064 + }, + { + "epoch": 4.505586592178771, + "grad_norm": 0.648113489151001, + "learning_rate": 0.0007770868347338935, + "loss": 0.5817, + "step": 8065 + }, + { + "epoch": 4.506145251396648, + "grad_norm": 0.6713204383850098, + "learning_rate": 0.0007770588235294118, + "loss": 0.4423, + "step": 8066 + }, + { + "epoch": 4.506703910614525, + "grad_norm": 0.5897493362426758, + "learning_rate": 0.00077703081232493, + "loss": 0.553, + "step": 8067 + }, + { + "epoch": 4.5072625698324025, + "grad_norm": 0.6195979714393616, + "learning_rate": 0.0007770028011204481, + "loss": 0.5004, + "step": 8068 + }, + { + "epoch": 4.507821229050279, + "grad_norm": 0.613839864730835, + "learning_rate": 0.0007769747899159663, + "loss": 0.4714, + "step": 8069 + }, + { + "epoch": 4.5083798882681565, + "grad_norm": 0.6737150549888611, + "learning_rate": 0.0007769467787114845, + "loss": 0.3804, + "step": 8070 + }, + { + "epoch": 4.508938547486034, + "grad_norm": 0.5012085437774658, + "learning_rate": 0.0007769187675070029, + "loss": 0.5254, + "step": 8071 + }, + { + "epoch": 4.5094972067039105, + "grad_norm": 0.6065489649772644, + "learning_rate": 0.0007768907563025211, + "loss": 0.5669, + "step": 8072 + }, + { + "epoch": 4.510055865921788, + "grad_norm": 1.0205323696136475, + "learning_rate": 0.0007768627450980393, + "loss": 0.4508, + "step": 8073 + }, + { + "epoch": 4.5106145251396645, + "grad_norm": 0.5331271886825562, + "learning_rate": 0.0007768347338935574, + "loss": 0.5316, + "step": 8074 + }, + { + "epoch": 4.511173184357542, + "grad_norm": 0.6974080801010132, + "learning_rate": 0.0007768067226890756, + "loss": 0.4413, + "step": 8075 + }, + { + "epoch": 4.511731843575419, + "grad_norm": 0.7939553260803223, + "learning_rate": 0.0007767787114845939, + "loss": 0.6073, + "step": 8076 + }, + { + "epoch": 4.512290502793296, + "grad_norm": 0.5240582227706909, + "learning_rate": 0.0007767507002801121, + "loss": 0.4328, + "step": 8077 + }, + { + "epoch": 4.5128491620111735, + "grad_norm": 0.8325620293617249, + "learning_rate": 0.0007767226890756303, + "loss": 0.5055, + "step": 8078 + }, + { + "epoch": 4.51340782122905, + "grad_norm": 0.9341689944267273, + "learning_rate": 0.0007766946778711484, + "loss": 0.3695, + "step": 8079 + }, + { + "epoch": 4.5139664804469275, + "grad_norm": 1.3748936653137207, + "learning_rate": 0.0007766666666666666, + "loss": 0.4082, + "step": 8080 + }, + { + "epoch": 4.514525139664805, + "grad_norm": 0.618179440498352, + "learning_rate": 0.0007766386554621849, + "loss": 0.424, + "step": 8081 + }, + { + "epoch": 4.5150837988826815, + "grad_norm": 0.9769489169120789, + "learning_rate": 0.0007766106442577031, + "loss": 0.554, + "step": 8082 + }, + { + "epoch": 4.515642458100559, + "grad_norm": 0.5236263275146484, + "learning_rate": 0.0007765826330532213, + "loss": 0.5879, + "step": 8083 + }, + { + "epoch": 4.5162011173184355, + "grad_norm": 0.588412344455719, + "learning_rate": 0.0007765546218487394, + "loss": 0.5008, + "step": 8084 + }, + { + "epoch": 4.516759776536313, + "grad_norm": 0.6860974431037903, + "learning_rate": 0.0007765266106442576, + "loss": 0.5397, + "step": 8085 + }, + { + "epoch": 4.51731843575419, + "grad_norm": 0.8217350244522095, + "learning_rate": 0.000776498599439776, + "loss": 0.5159, + "step": 8086 + }, + { + "epoch": 4.517877094972067, + "grad_norm": 0.7609260678291321, + "learning_rate": 0.0007764705882352942, + "loss": 0.464, + "step": 8087 + }, + { + "epoch": 4.5184357541899445, + "grad_norm": 0.8852810263633728, + "learning_rate": 0.0007764425770308124, + "loss": 0.4189, + "step": 8088 + }, + { + "epoch": 4.518994413407821, + "grad_norm": 0.6375750303268433, + "learning_rate": 0.0007764145658263306, + "loss": 0.5251, + "step": 8089 + }, + { + "epoch": 4.5195530726256985, + "grad_norm": 1.601333498954773, + "learning_rate": 0.0007763865546218487, + "loss": 0.4357, + "step": 8090 + }, + { + "epoch": 4.520111731843575, + "grad_norm": 1.655070424079895, + "learning_rate": 0.000776358543417367, + "loss": 0.3949, + "step": 8091 + }, + { + "epoch": 4.5206703910614525, + "grad_norm": 1.0199527740478516, + "learning_rate": 0.0007763305322128852, + "loss": 0.4465, + "step": 8092 + }, + { + "epoch": 4.52122905027933, + "grad_norm": 2.242251396179199, + "learning_rate": 0.0007763025210084034, + "loss": 0.4464, + "step": 8093 + }, + { + "epoch": 4.5217877094972065, + "grad_norm": 0.7333651781082153, + "learning_rate": 0.0007762745098039216, + "loss": 0.5593, + "step": 8094 + }, + { + "epoch": 4.522346368715084, + "grad_norm": 2.2402536869049072, + "learning_rate": 0.0007762464985994397, + "loss": 0.4828, + "step": 8095 + }, + { + "epoch": 4.522905027932961, + "grad_norm": 0.6529591679573059, + "learning_rate": 0.000776218487394958, + "loss": 0.4659, + "step": 8096 + }, + { + "epoch": 4.523463687150838, + "grad_norm": 1.3216164112091064, + "learning_rate": 0.0007761904761904762, + "loss": 0.3864, + "step": 8097 + }, + { + "epoch": 4.5240223463687155, + "grad_norm": 0.8586426973342896, + "learning_rate": 0.0007761624649859944, + "loss": 0.4498, + "step": 8098 + }, + { + "epoch": 4.524581005586592, + "grad_norm": 0.5126912593841553, + "learning_rate": 0.0007761344537815126, + "loss": 0.3494, + "step": 8099 + }, + { + "epoch": 4.5251396648044695, + "grad_norm": 0.6441032290458679, + "learning_rate": 0.0007761064425770307, + "loss": 0.434, + "step": 8100 + }, + { + "epoch": 4.525698324022346, + "grad_norm": 0.4300846457481384, + "learning_rate": 0.000776078431372549, + "loss": 0.414, + "step": 8101 + }, + { + "epoch": 4.5262569832402235, + "grad_norm": 2.3352677822113037, + "learning_rate": 0.0007760504201680672, + "loss": 0.4082, + "step": 8102 + }, + { + "epoch": 4.5268156424581, + "grad_norm": 0.7502132058143616, + "learning_rate": 0.0007760224089635855, + "loss": 0.557, + "step": 8103 + }, + { + "epoch": 4.5273743016759775, + "grad_norm": 0.8537489175796509, + "learning_rate": 0.0007759943977591037, + "loss": 0.4884, + "step": 8104 + }, + { + "epoch": 4.527932960893855, + "grad_norm": 0.59389328956604, + "learning_rate": 0.0007759663865546219, + "loss": 0.5073, + "step": 8105 + }, + { + "epoch": 4.528491620111732, + "grad_norm": 0.8228345513343811, + "learning_rate": 0.0007759383753501401, + "loss": 0.4008, + "step": 8106 + }, + { + "epoch": 4.529050279329609, + "grad_norm": 0.6935243606567383, + "learning_rate": 0.0007759103641456583, + "loss": 0.6024, + "step": 8107 + }, + { + "epoch": 4.5296089385474865, + "grad_norm": 0.5760891437530518, + "learning_rate": 0.0007758823529411765, + "loss": 0.5419, + "step": 8108 + }, + { + "epoch": 4.530167597765363, + "grad_norm": 1.1517388820648193, + "learning_rate": 0.0007758543417366947, + "loss": 0.4219, + "step": 8109 + }, + { + "epoch": 4.5307262569832405, + "grad_norm": 1.5431220531463623, + "learning_rate": 0.0007758263305322129, + "loss": 0.4962, + "step": 8110 + }, + { + "epoch": 4.531284916201117, + "grad_norm": 0.6908110976219177, + "learning_rate": 0.0007757983193277311, + "loss": 0.4801, + "step": 8111 + }, + { + "epoch": 4.5318435754189945, + "grad_norm": 0.872285008430481, + "learning_rate": 0.0007757703081232493, + "loss": 0.4446, + "step": 8112 + }, + { + "epoch": 4.532402234636871, + "grad_norm": 0.66185462474823, + "learning_rate": 0.0007757422969187675, + "loss": 0.4027, + "step": 8113 + }, + { + "epoch": 4.5329608938547485, + "grad_norm": 0.5269913673400879, + "learning_rate": 0.0007757142857142857, + "loss": 0.4448, + "step": 8114 + }, + { + "epoch": 4.533519553072626, + "grad_norm": 0.3996785879135132, + "learning_rate": 0.0007756862745098039, + "loss": 0.411, + "step": 8115 + }, + { + "epoch": 4.534078212290503, + "grad_norm": 0.9295637011528015, + "learning_rate": 0.0007756582633053222, + "loss": 0.5494, + "step": 8116 + }, + { + "epoch": 4.53463687150838, + "grad_norm": 0.7121503353118896, + "learning_rate": 0.0007756302521008403, + "loss": 0.6011, + "step": 8117 + }, + { + "epoch": 4.5351955307262575, + "grad_norm": 0.4291536808013916, + "learning_rate": 0.0007756022408963585, + "loss": 0.4044, + "step": 8118 + }, + { + "epoch": 4.535754189944134, + "grad_norm": 0.7450488209724426, + "learning_rate": 0.0007755742296918767, + "loss": 0.4788, + "step": 8119 + }, + { + "epoch": 4.5363128491620115, + "grad_norm": 0.8254243731498718, + "learning_rate": 0.000775546218487395, + "loss": 0.5847, + "step": 8120 + }, + { + "epoch": 4.536871508379888, + "grad_norm": 0.6319572925567627, + "learning_rate": 0.0007755182072829133, + "loss": 0.5136, + "step": 8121 + }, + { + "epoch": 4.5374301675977655, + "grad_norm": 0.5139955282211304, + "learning_rate": 0.0007754901960784314, + "loss": 0.4766, + "step": 8122 + }, + { + "epoch": 4.537988826815642, + "grad_norm": 0.66304612159729, + "learning_rate": 0.0007754621848739496, + "loss": 0.3934, + "step": 8123 + }, + { + "epoch": 4.5385474860335195, + "grad_norm": 0.9044004678726196, + "learning_rate": 0.0007754341736694678, + "loss": 0.4549, + "step": 8124 + }, + { + "epoch": 4.539106145251397, + "grad_norm": 0.5326232314109802, + "learning_rate": 0.000775406162464986, + "loss": 0.4392, + "step": 8125 + }, + { + "epoch": 4.539664804469274, + "grad_norm": 1.0321842432022095, + "learning_rate": 0.0007753781512605043, + "loss": 0.4118, + "step": 8126 + }, + { + "epoch": 4.540223463687151, + "grad_norm": 0.6323887705802917, + "learning_rate": 0.0007753501400560224, + "loss": 0.394, + "step": 8127 + }, + { + "epoch": 4.540782122905028, + "grad_norm": 0.7320694923400879, + "learning_rate": 0.0007753221288515406, + "loss": 0.4132, + "step": 8128 + }, + { + "epoch": 4.541340782122905, + "grad_norm": 0.5122719407081604, + "learning_rate": 0.0007752941176470588, + "loss": 0.4651, + "step": 8129 + }, + { + "epoch": 4.5418994413407825, + "grad_norm": 0.5361186265945435, + "learning_rate": 0.000775266106442577, + "loss": 0.3921, + "step": 8130 + }, + { + "epoch": 4.542458100558659, + "grad_norm": 0.7645115256309509, + "learning_rate": 0.0007752380952380953, + "loss": 0.6572, + "step": 8131 + }, + { + "epoch": 4.5430167597765365, + "grad_norm": 0.5335005521774292, + "learning_rate": 0.0007752100840336135, + "loss": 0.4962, + "step": 8132 + }, + { + "epoch": 4.543575418994413, + "grad_norm": 0.6003976464271545, + "learning_rate": 0.0007751820728291316, + "loss": 0.5033, + "step": 8133 + }, + { + "epoch": 4.5441340782122905, + "grad_norm": 0.5694627165794373, + "learning_rate": 0.0007751540616246498, + "loss": 0.439, + "step": 8134 + }, + { + "epoch": 4.544692737430168, + "grad_norm": 0.5356988310813904, + "learning_rate": 0.000775126050420168, + "loss": 0.3436, + "step": 8135 + }, + { + "epoch": 4.545251396648045, + "grad_norm": 0.5364171862602234, + "learning_rate": 0.0007750980392156864, + "loss": 0.5807, + "step": 8136 + }, + { + "epoch": 4.545810055865922, + "grad_norm": 0.528247594833374, + "learning_rate": 0.0007750700280112046, + "loss": 0.5342, + "step": 8137 + }, + { + "epoch": 4.546368715083799, + "grad_norm": 0.5301976799964905, + "learning_rate": 0.0007750420168067227, + "loss": 0.5402, + "step": 8138 + }, + { + "epoch": 4.546927374301676, + "grad_norm": 0.6062915921211243, + "learning_rate": 0.0007750140056022409, + "loss": 0.6306, + "step": 8139 + }, + { + "epoch": 4.547486033519553, + "grad_norm": 0.6419862508773804, + "learning_rate": 0.0007749859943977591, + "loss": 0.438, + "step": 8140 + }, + { + "epoch": 4.54804469273743, + "grad_norm": 0.4715009033679962, + "learning_rate": 0.0007749579831932774, + "loss": 0.4482, + "step": 8141 + }, + { + "epoch": 4.5486033519553075, + "grad_norm": 0.8993266224861145, + "learning_rate": 0.0007749299719887956, + "loss": 0.4836, + "step": 8142 + }, + { + "epoch": 4.549162011173184, + "grad_norm": 0.4348084330558777, + "learning_rate": 0.0007749019607843137, + "loss": 0.4708, + "step": 8143 + }, + { + "epoch": 4.5497206703910615, + "grad_norm": 0.4123710095882416, + "learning_rate": 0.0007748739495798319, + "loss": 0.3794, + "step": 8144 + }, + { + "epoch": 4.550279329608939, + "grad_norm": 0.5706660151481628, + "learning_rate": 0.0007748459383753501, + "loss": 0.5639, + "step": 8145 + }, + { + "epoch": 4.550837988826816, + "grad_norm": 0.7596791386604309, + "learning_rate": 0.0007748179271708684, + "loss": 0.471, + "step": 8146 + }, + { + "epoch": 4.551396648044693, + "grad_norm": 0.4690873324871063, + "learning_rate": 0.0007747899159663866, + "loss": 0.5288, + "step": 8147 + }, + { + "epoch": 4.55195530726257, + "grad_norm": 1.1551265716552734, + "learning_rate": 0.0007747619047619048, + "loss": 0.5019, + "step": 8148 + }, + { + "epoch": 4.552513966480447, + "grad_norm": 0.41823726892471313, + "learning_rate": 0.0007747338935574229, + "loss": 0.3941, + "step": 8149 + }, + { + "epoch": 4.553072625698324, + "grad_norm": 0.6825693249702454, + "learning_rate": 0.0007747058823529411, + "loss": 0.5645, + "step": 8150 + }, + { + "epoch": 4.553631284916201, + "grad_norm": 0.46231648325920105, + "learning_rate": 0.0007746778711484595, + "loss": 0.3865, + "step": 8151 + }, + { + "epoch": 4.5541899441340785, + "grad_norm": 0.5623779296875, + "learning_rate": 0.0007746498599439777, + "loss": 0.5154, + "step": 8152 + }, + { + "epoch": 4.554748603351955, + "grad_norm": 0.4641118347644806, + "learning_rate": 0.0007746218487394959, + "loss": 0.4932, + "step": 8153 + }, + { + "epoch": 4.5553072625698325, + "grad_norm": 0.699112057685852, + "learning_rate": 0.000774593837535014, + "loss": 0.5723, + "step": 8154 + }, + { + "epoch": 4.55586592178771, + "grad_norm": 0.7715858221054077, + "learning_rate": 0.0007745658263305322, + "loss": 0.4913, + "step": 8155 + }, + { + "epoch": 4.556424581005587, + "grad_norm": 0.8410941362380981, + "learning_rate": 0.0007745378151260505, + "loss": 0.3757, + "step": 8156 + }, + { + "epoch": 4.556983240223464, + "grad_norm": 0.4754253625869751, + "learning_rate": 0.0007745098039215687, + "loss": 0.3946, + "step": 8157 + }, + { + "epoch": 4.557541899441341, + "grad_norm": 2.006784439086914, + "learning_rate": 0.0007744817927170869, + "loss": 0.5234, + "step": 8158 + }, + { + "epoch": 4.558100558659218, + "grad_norm": 0.6154943704605103, + "learning_rate": 0.000774453781512605, + "loss": 0.4225, + "step": 8159 + }, + { + "epoch": 4.558659217877095, + "grad_norm": 0.6050153374671936, + "learning_rate": 0.0007744257703081232, + "loss": 0.5755, + "step": 8160 + }, + { + "epoch": 4.559217877094972, + "grad_norm": 8.078617095947266, + "learning_rate": 0.0007743977591036415, + "loss": 0.5154, + "step": 8161 + }, + { + "epoch": 4.5597765363128495, + "grad_norm": 1.2790827751159668, + "learning_rate": 0.0007743697478991597, + "loss": 0.4278, + "step": 8162 + }, + { + "epoch": 4.560335195530726, + "grad_norm": 0.8261299729347229, + "learning_rate": 0.0007743417366946779, + "loss": 0.3154, + "step": 8163 + }, + { + "epoch": 4.5608938547486035, + "grad_norm": 1.0802264213562012, + "learning_rate": 0.0007743137254901961, + "loss": 0.5002, + "step": 8164 + }, + { + "epoch": 4.56145251396648, + "grad_norm": 0.6667917370796204, + "learning_rate": 0.0007742857142857142, + "loss": 0.5824, + "step": 8165 + }, + { + "epoch": 4.562011173184358, + "grad_norm": 0.6015957593917847, + "learning_rate": 0.0007742577030812325, + "loss": 0.6011, + "step": 8166 + }, + { + "epoch": 4.562569832402235, + "grad_norm": 0.770133912563324, + "learning_rate": 0.0007742296918767507, + "loss": 0.5135, + "step": 8167 + }, + { + "epoch": 4.563128491620112, + "grad_norm": 0.7474347352981567, + "learning_rate": 0.000774201680672269, + "loss": 0.4336, + "step": 8168 + }, + { + "epoch": 4.563687150837989, + "grad_norm": 0.7899701595306396, + "learning_rate": 0.0007741736694677872, + "loss": 0.4852, + "step": 8169 + }, + { + "epoch": 4.564245810055866, + "grad_norm": 0.678300142288208, + "learning_rate": 0.0007741456582633053, + "loss": 0.4494, + "step": 8170 + }, + { + "epoch": 4.564804469273743, + "grad_norm": 0.6663585901260376, + "learning_rate": 0.0007741176470588236, + "loss": 0.4997, + "step": 8171 + }, + { + "epoch": 4.5653631284916205, + "grad_norm": 2.21020770072937, + "learning_rate": 0.0007740896358543418, + "loss": 0.4465, + "step": 8172 + }, + { + "epoch": 4.565921787709497, + "grad_norm": 0.6665765047073364, + "learning_rate": 0.00077406162464986, + "loss": 0.4638, + "step": 8173 + }, + { + "epoch": 4.5664804469273745, + "grad_norm": 0.5810917615890503, + "learning_rate": 0.0007740336134453782, + "loss": 0.423, + "step": 8174 + }, + { + "epoch": 4.567039106145251, + "grad_norm": 5.412701606750488, + "learning_rate": 0.0007740056022408963, + "loss": 0.4295, + "step": 8175 + }, + { + "epoch": 4.567597765363129, + "grad_norm": 0.7749119997024536, + "learning_rate": 0.0007739775910364146, + "loss": 0.4758, + "step": 8176 + }, + { + "epoch": 4.568156424581005, + "grad_norm": 0.5137836337089539, + "learning_rate": 0.0007739495798319328, + "loss": 0.3993, + "step": 8177 + }, + { + "epoch": 4.568715083798883, + "grad_norm": 0.7287721037864685, + "learning_rate": 0.000773921568627451, + "loss": 0.4921, + "step": 8178 + }, + { + "epoch": 4.56927374301676, + "grad_norm": 0.4986781179904938, + "learning_rate": 0.0007738935574229692, + "loss": 0.4413, + "step": 8179 + }, + { + "epoch": 4.569832402234637, + "grad_norm": 0.6719263195991516, + "learning_rate": 0.0007738655462184874, + "loss": 0.3405, + "step": 8180 + }, + { + "epoch": 4.570391061452514, + "grad_norm": 0.7535628080368042, + "learning_rate": 0.0007738375350140056, + "loss": 0.5292, + "step": 8181 + }, + { + "epoch": 4.5709497206703915, + "grad_norm": 0.5775241851806641, + "learning_rate": 0.0007738095238095238, + "loss": 0.5891, + "step": 8182 + }, + { + "epoch": 4.571508379888268, + "grad_norm": 0.48770424723625183, + "learning_rate": 0.000773781512605042, + "loss": 0.3605, + "step": 8183 + }, + { + "epoch": 4.5720670391061455, + "grad_norm": 0.6410732269287109, + "learning_rate": 0.0007737535014005602, + "loss": 0.5055, + "step": 8184 + }, + { + "epoch": 4.572625698324022, + "grad_norm": 0.6988441944122314, + "learning_rate": 0.0007737254901960785, + "loss": 0.5601, + "step": 8185 + }, + { + "epoch": 4.5731843575419, + "grad_norm": 13.589212417602539, + "learning_rate": 0.0007736974789915967, + "loss": 0.4929, + "step": 8186 + }, + { + "epoch": 4.573743016759776, + "grad_norm": 0.521863579750061, + "learning_rate": 0.0007736694677871149, + "loss": 0.3959, + "step": 8187 + }, + { + "epoch": 4.574301675977654, + "grad_norm": 0.5403456687927246, + "learning_rate": 0.0007736414565826331, + "loss": 0.4072, + "step": 8188 + }, + { + "epoch": 4.574860335195531, + "grad_norm": 0.4856480658054352, + "learning_rate": 0.0007736134453781513, + "loss": 0.3716, + "step": 8189 + }, + { + "epoch": 4.575418994413408, + "grad_norm": 0.6969072222709656, + "learning_rate": 0.0007735854341736695, + "loss": 0.5043, + "step": 8190 + }, + { + "epoch": 4.575977653631285, + "grad_norm": 0.40378105640411377, + "learning_rate": 0.0007735574229691877, + "loss": 0.432, + "step": 8191 + }, + { + "epoch": 4.576536312849162, + "grad_norm": 0.4986751973628998, + "learning_rate": 0.0007735294117647059, + "loss": 0.4641, + "step": 8192 + }, + { + "epoch": 4.577094972067039, + "grad_norm": 0.6272121071815491, + "learning_rate": 0.0007735014005602241, + "loss": 0.4402, + "step": 8193 + }, + { + "epoch": 4.5776536312849165, + "grad_norm": 0.34092170000076294, + "learning_rate": 0.0007734733893557423, + "loss": 0.3474, + "step": 8194 + }, + { + "epoch": 4.578212290502793, + "grad_norm": 1.0378425121307373, + "learning_rate": 0.0007734453781512605, + "loss": 0.7377, + "step": 8195 + }, + { + "epoch": 4.578770949720671, + "grad_norm": 0.49337831139564514, + "learning_rate": 0.0007734173669467788, + "loss": 0.4425, + "step": 8196 + }, + { + "epoch": 4.579329608938547, + "grad_norm": 0.47799861431121826, + "learning_rate": 0.0007733893557422969, + "loss": 0.5455, + "step": 8197 + }, + { + "epoch": 4.579888268156425, + "grad_norm": 0.6665588617324829, + "learning_rate": 0.0007733613445378151, + "loss": 0.4893, + "step": 8198 + }, + { + "epoch": 4.580446927374302, + "grad_norm": 0.4508746862411499, + "learning_rate": 0.0007733333333333333, + "loss": 0.3744, + "step": 8199 + }, + { + "epoch": 4.581005586592179, + "grad_norm": 0.5219548940658569, + "learning_rate": 0.0007733053221288515, + "loss": 0.435, + "step": 8200 + }, + { + "epoch": 4.581564245810056, + "grad_norm": 2.0675745010375977, + "learning_rate": 0.0007732773109243699, + "loss": 0.5323, + "step": 8201 + }, + { + "epoch": 4.582122905027933, + "grad_norm": 0.6970293521881104, + "learning_rate": 0.000773249299719888, + "loss": 0.4049, + "step": 8202 + }, + { + "epoch": 4.58268156424581, + "grad_norm": 0.5580114126205444, + "learning_rate": 0.0007732212885154062, + "loss": 0.4942, + "step": 8203 + }, + { + "epoch": 4.5832402234636875, + "grad_norm": 0.5304311513900757, + "learning_rate": 0.0007731932773109244, + "loss": 0.4727, + "step": 8204 + }, + { + "epoch": 4.583798882681564, + "grad_norm": 0.7618182897567749, + "learning_rate": 0.0007731652661064426, + "loss": 0.4719, + "step": 8205 + }, + { + "epoch": 4.584357541899442, + "grad_norm": 0.4298645555973053, + "learning_rate": 0.0007731372549019609, + "loss": 0.501, + "step": 8206 + }, + { + "epoch": 4.584916201117318, + "grad_norm": 0.5802479386329651, + "learning_rate": 0.000773109243697479, + "loss": 0.4233, + "step": 8207 + }, + { + "epoch": 4.585474860335196, + "grad_norm": 0.5593145489692688, + "learning_rate": 0.0007730812324929972, + "loss": 0.4219, + "step": 8208 + }, + { + "epoch": 4.586033519553073, + "grad_norm": 0.4342997968196869, + "learning_rate": 0.0007730532212885154, + "loss": 0.4518, + "step": 8209 + }, + { + "epoch": 4.58659217877095, + "grad_norm": 0.5315520763397217, + "learning_rate": 0.0007730252100840336, + "loss": 0.4455, + "step": 8210 + }, + { + "epoch": 4.587150837988827, + "grad_norm": 0.6056346297264099, + "learning_rate": 0.0007729971988795519, + "loss": 0.3567, + "step": 8211 + }, + { + "epoch": 4.587709497206704, + "grad_norm": 0.4693072438240051, + "learning_rate": 0.0007729691876750701, + "loss": 0.4232, + "step": 8212 + }, + { + "epoch": 4.588268156424581, + "grad_norm": 2.016322612762451, + "learning_rate": 0.0007729411764705882, + "loss": 0.4331, + "step": 8213 + }, + { + "epoch": 4.588826815642458, + "grad_norm": 0.9382308125495911, + "learning_rate": 0.0007729131652661064, + "loss": 0.5103, + "step": 8214 + }, + { + "epoch": 4.589385474860335, + "grad_norm": 0.7534018754959106, + "learning_rate": 0.0007728851540616246, + "loss": 0.4191, + "step": 8215 + }, + { + "epoch": 4.589944134078213, + "grad_norm": 0.6166900992393494, + "learning_rate": 0.000772857142857143, + "loss": 0.6275, + "step": 8216 + }, + { + "epoch": 4.590502793296089, + "grad_norm": 1.0928400754928589, + "learning_rate": 0.0007728291316526612, + "loss": 0.4504, + "step": 8217 + }, + { + "epoch": 4.591061452513967, + "grad_norm": 0.5267983078956604, + "learning_rate": 0.0007728011204481793, + "loss": 0.4287, + "step": 8218 + }, + { + "epoch": 4.591620111731844, + "grad_norm": 1.4918850660324097, + "learning_rate": 0.0007727731092436975, + "loss": 0.4322, + "step": 8219 + }, + { + "epoch": 4.592178770949721, + "grad_norm": 0.5473289489746094, + "learning_rate": 0.0007727450980392157, + "loss": 0.4785, + "step": 8220 + }, + { + "epoch": 4.592737430167598, + "grad_norm": 0.4721951186656952, + "learning_rate": 0.000772717086834734, + "loss": 0.4394, + "step": 8221 + }, + { + "epoch": 4.593296089385475, + "grad_norm": 0.521327793598175, + "learning_rate": 0.0007726890756302522, + "loss": 0.6071, + "step": 8222 + }, + { + "epoch": 4.593854748603352, + "grad_norm": 0.6525221467018127, + "learning_rate": 0.0007726610644257703, + "loss": 0.4779, + "step": 8223 + }, + { + "epoch": 4.594413407821229, + "grad_norm": 0.5691216588020325, + "learning_rate": 0.0007726330532212885, + "loss": 0.5468, + "step": 8224 + }, + { + "epoch": 4.594972067039106, + "grad_norm": 0.428978830575943, + "learning_rate": 0.0007726050420168067, + "loss": 0.4188, + "step": 8225 + }, + { + "epoch": 4.5955307262569836, + "grad_norm": 0.4750875234603882, + "learning_rate": 0.000772577030812325, + "loss": 0.4126, + "step": 8226 + }, + { + "epoch": 4.59608938547486, + "grad_norm": 0.47138598561286926, + "learning_rate": 0.0007725490196078432, + "loss": 0.4608, + "step": 8227 + }, + { + "epoch": 4.596648044692738, + "grad_norm": 2.1292762756347656, + "learning_rate": 0.0007725210084033614, + "loss": 0.4649, + "step": 8228 + }, + { + "epoch": 4.597206703910614, + "grad_norm": 0.5666723251342773, + "learning_rate": 0.0007724929971988795, + "loss": 0.5721, + "step": 8229 + }, + { + "epoch": 4.597765363128492, + "grad_norm": 0.6902828812599182, + "learning_rate": 0.0007724649859943977, + "loss": 0.3883, + "step": 8230 + }, + { + "epoch": 4.598324022346369, + "grad_norm": 0.5086633563041687, + "learning_rate": 0.000772436974789916, + "loss": 0.4883, + "step": 8231 + }, + { + "epoch": 4.598882681564246, + "grad_norm": 0.4486280381679535, + "learning_rate": 0.0007724089635854342, + "loss": 0.3923, + "step": 8232 + }, + { + "epoch": 4.599441340782123, + "grad_norm": 0.5192384719848633, + "learning_rate": 0.0007723809523809525, + "loss": 0.5095, + "step": 8233 + }, + { + "epoch": 4.6, + "grad_norm": 0.6304391026496887, + "learning_rate": 0.0007723529411764705, + "loss": 0.3598, + "step": 8234 + }, + { + "epoch": 4.600558659217877, + "grad_norm": 0.45914217829704285, + "learning_rate": 0.0007723249299719888, + "loss": 0.5354, + "step": 8235 + }, + { + "epoch": 4.6011173184357546, + "grad_norm": 0.5513746738433838, + "learning_rate": 0.0007722969187675071, + "loss": 0.4755, + "step": 8236 + }, + { + "epoch": 4.601675977653631, + "grad_norm": 0.4337019622325897, + "learning_rate": 0.0007722689075630253, + "loss": 0.4217, + "step": 8237 + }, + { + "epoch": 4.602234636871509, + "grad_norm": 0.4756260812282562, + "learning_rate": 0.0007722408963585435, + "loss": 0.4939, + "step": 8238 + }, + { + "epoch": 4.602793296089385, + "grad_norm": 6.45401668548584, + "learning_rate": 0.0007722128851540616, + "loss": 0.438, + "step": 8239 + }, + { + "epoch": 4.603351955307263, + "grad_norm": 0.9214911460876465, + "learning_rate": 0.0007721848739495798, + "loss": 0.5228, + "step": 8240 + }, + { + "epoch": 4.603910614525139, + "grad_norm": 0.5213215351104736, + "learning_rate": 0.0007721568627450981, + "loss": 0.4902, + "step": 8241 + }, + { + "epoch": 4.604469273743017, + "grad_norm": 1.2350841760635376, + "learning_rate": 0.0007721288515406163, + "loss": 0.4247, + "step": 8242 + }, + { + "epoch": 4.605027932960894, + "grad_norm": 0.5343257784843445, + "learning_rate": 0.0007721008403361345, + "loss": 0.5808, + "step": 8243 + }, + { + "epoch": 4.605586592178771, + "grad_norm": 1.4936286211013794, + "learning_rate": 0.0007720728291316527, + "loss": 0.6011, + "step": 8244 + }, + { + "epoch": 4.606145251396648, + "grad_norm": 0.5499721765518188, + "learning_rate": 0.0007720448179271708, + "loss": 0.4669, + "step": 8245 + }, + { + "epoch": 4.6067039106145256, + "grad_norm": 0.7632619142532349, + "learning_rate": 0.0007720168067226891, + "loss": 0.4634, + "step": 8246 + }, + { + "epoch": 4.607262569832402, + "grad_norm": 0.7666616439819336, + "learning_rate": 0.0007719887955182073, + "loss": 0.5135, + "step": 8247 + }, + { + "epoch": 4.60782122905028, + "grad_norm": 0.5069367289543152, + "learning_rate": 0.0007719607843137255, + "loss": 0.4172, + "step": 8248 + }, + { + "epoch": 4.608379888268156, + "grad_norm": 0.41482672095298767, + "learning_rate": 0.0007719327731092437, + "loss": 0.5592, + "step": 8249 + }, + { + "epoch": 4.608938547486034, + "grad_norm": 0.47095632553100586, + "learning_rate": 0.0007719047619047618, + "loss": 0.3942, + "step": 8250 + }, + { + "epoch": 4.60949720670391, + "grad_norm": 0.6012982726097107, + "learning_rate": 0.0007718767507002802, + "loss": 0.3931, + "step": 8251 + }, + { + "epoch": 4.610055865921788, + "grad_norm": 0.8348149657249451, + "learning_rate": 0.0007718487394957984, + "loss": 0.4903, + "step": 8252 + }, + { + "epoch": 4.610614525139665, + "grad_norm": 1.2411580085754395, + "learning_rate": 0.0007718207282913166, + "loss": 0.4595, + "step": 8253 + }, + { + "epoch": 4.611173184357542, + "grad_norm": 0.6678096055984497, + "learning_rate": 0.0007717927170868348, + "loss": 0.5467, + "step": 8254 + }, + { + "epoch": 4.611731843575419, + "grad_norm": 0.6354730129241943, + "learning_rate": 0.0007717647058823529, + "loss": 0.4105, + "step": 8255 + }, + { + "epoch": 4.6122905027932966, + "grad_norm": 1.0603047609329224, + "learning_rate": 0.0007717366946778711, + "loss": 0.6296, + "step": 8256 + }, + { + "epoch": 4.612849162011173, + "grad_norm": 0.6615439057350159, + "learning_rate": 0.0007717086834733894, + "loss": 0.5102, + "step": 8257 + }, + { + "epoch": 4.613407821229051, + "grad_norm": 0.5365902185440063, + "learning_rate": 0.0007716806722689076, + "loss": 0.44, + "step": 8258 + }, + { + "epoch": 4.613966480446927, + "grad_norm": 1.118316888809204, + "learning_rate": 0.0007716526610644258, + "loss": 0.5129, + "step": 8259 + }, + { + "epoch": 4.614525139664805, + "grad_norm": 0.3882056772708893, + "learning_rate": 0.000771624649859944, + "loss": 0.3273, + "step": 8260 + }, + { + "epoch": 4.615083798882681, + "grad_norm": 1.1657724380493164, + "learning_rate": 0.0007715966386554621, + "loss": 0.3995, + "step": 8261 + }, + { + "epoch": 4.615642458100559, + "grad_norm": 0.5781629681587219, + "learning_rate": 0.0007715686274509804, + "loss": 0.4789, + "step": 8262 + }, + { + "epoch": 4.616201117318436, + "grad_norm": 4.094382286071777, + "learning_rate": 0.0007715406162464986, + "loss": 0.4214, + "step": 8263 + }, + { + "epoch": 4.616759776536313, + "grad_norm": 0.7160081267356873, + "learning_rate": 0.0007715126050420168, + "loss": 0.428, + "step": 8264 + }, + { + "epoch": 4.61731843575419, + "grad_norm": 0.920839786529541, + "learning_rate": 0.000771484593837535, + "loss": 0.4489, + "step": 8265 + }, + { + "epoch": 4.617877094972067, + "grad_norm": 0.5811208486557007, + "learning_rate": 0.0007714565826330531, + "loss": 0.5054, + "step": 8266 + }, + { + "epoch": 4.618435754189944, + "grad_norm": 0.4951714277267456, + "learning_rate": 0.0007714285714285715, + "loss": 0.556, + "step": 8267 + }, + { + "epoch": 4.618994413407822, + "grad_norm": 0.7615565061569214, + "learning_rate": 0.0007714005602240897, + "loss": 0.5598, + "step": 8268 + }, + { + "epoch": 4.619553072625698, + "grad_norm": 0.5579128265380859, + "learning_rate": 0.0007713725490196079, + "loss": 0.3974, + "step": 8269 + }, + { + "epoch": 4.620111731843576, + "grad_norm": 0.607566237449646, + "learning_rate": 0.0007713445378151261, + "loss": 0.47, + "step": 8270 + }, + { + "epoch": 4.620670391061452, + "grad_norm": 0.4827634394168854, + "learning_rate": 0.0007713165266106442, + "loss": 0.4752, + "step": 8271 + }, + { + "epoch": 4.62122905027933, + "grad_norm": 0.5341469645500183, + "learning_rate": 0.0007712885154061625, + "loss": 0.3862, + "step": 8272 + }, + { + "epoch": 4.621787709497207, + "grad_norm": 0.463590532541275, + "learning_rate": 0.0007712605042016807, + "loss": 0.4355, + "step": 8273 + }, + { + "epoch": 4.622346368715084, + "grad_norm": 1.0764509439468384, + "learning_rate": 0.0007712324929971989, + "loss": 0.4899, + "step": 8274 + }, + { + "epoch": 4.622905027932961, + "grad_norm": 0.790708601474762, + "learning_rate": 0.0007712044817927171, + "loss": 0.4543, + "step": 8275 + }, + { + "epoch": 4.623463687150838, + "grad_norm": 0.6141035556793213, + "learning_rate": 0.0007711764705882353, + "loss": 0.3959, + "step": 8276 + }, + { + "epoch": 4.624022346368715, + "grad_norm": 0.8165810108184814, + "learning_rate": 0.0007711484593837535, + "loss": 0.5135, + "step": 8277 + }, + { + "epoch": 4.624581005586592, + "grad_norm": 0.947012186050415, + "learning_rate": 0.0007711204481792717, + "loss": 0.5232, + "step": 8278 + }, + { + "epoch": 4.625139664804469, + "grad_norm": 2.2460289001464844, + "learning_rate": 0.0007710924369747899, + "loss": 0.4597, + "step": 8279 + }, + { + "epoch": 4.625698324022347, + "grad_norm": 0.7627413868904114, + "learning_rate": 0.0007710644257703081, + "loss": 0.424, + "step": 8280 + }, + { + "epoch": 4.626256983240223, + "grad_norm": 0.7325783371925354, + "learning_rate": 0.0007710364145658263, + "loss": 0.3734, + "step": 8281 + }, + { + "epoch": 4.626815642458101, + "grad_norm": 0.8129794597625732, + "learning_rate": 0.0007710084033613445, + "loss": 0.5286, + "step": 8282 + }, + { + "epoch": 4.627374301675978, + "grad_norm": 2.142270088195801, + "learning_rate": 0.0007709803921568627, + "loss": 0.535, + "step": 8283 + }, + { + "epoch": 4.627932960893855, + "grad_norm": 0.9217079877853394, + "learning_rate": 0.000770952380952381, + "loss": 0.51, + "step": 8284 + }, + { + "epoch": 4.628491620111732, + "grad_norm": 0.6075193881988525, + "learning_rate": 0.0007709243697478992, + "loss": 0.4099, + "step": 8285 + }, + { + "epoch": 4.629050279329609, + "grad_norm": 0.5230545401573181, + "learning_rate": 0.0007708963585434174, + "loss": 0.3557, + "step": 8286 + }, + { + "epoch": 4.629608938547486, + "grad_norm": 0.9704474806785583, + "learning_rate": 0.0007708683473389356, + "loss": 0.3946, + "step": 8287 + }, + { + "epoch": 4.630167597765363, + "grad_norm": 0.46194612979888916, + "learning_rate": 0.0007708403361344538, + "loss": 0.4338, + "step": 8288 + }, + { + "epoch": 4.63072625698324, + "grad_norm": 0.7506082057952881, + "learning_rate": 0.000770812324929972, + "loss": 0.4781, + "step": 8289 + }, + { + "epoch": 4.631284916201118, + "grad_norm": 0.7269895672798157, + "learning_rate": 0.0007707843137254902, + "loss": 0.3245, + "step": 8290 + }, + { + "epoch": 4.631843575418994, + "grad_norm": 0.6341338753700256, + "learning_rate": 0.0007707563025210084, + "loss": 0.4871, + "step": 8291 + }, + { + "epoch": 4.632402234636872, + "grad_norm": 0.7796180248260498, + "learning_rate": 0.0007707282913165267, + "loss": 0.5501, + "step": 8292 + }, + { + "epoch": 4.632960893854749, + "grad_norm": 1.3124040365219116, + "learning_rate": 0.0007707002801120448, + "loss": 0.6444, + "step": 8293 + }, + { + "epoch": 4.633519553072626, + "grad_norm": 0.5190688371658325, + "learning_rate": 0.000770672268907563, + "loss": 0.4619, + "step": 8294 + }, + { + "epoch": 4.634078212290503, + "grad_norm": 0.45809775590896606, + "learning_rate": 0.0007706442577030812, + "loss": 0.3827, + "step": 8295 + }, + { + "epoch": 4.63463687150838, + "grad_norm": 1.3893835544586182, + "learning_rate": 0.0007706162464985994, + "loss": 0.4837, + "step": 8296 + }, + { + "epoch": 4.635195530726257, + "grad_norm": 0.5533860325813293, + "learning_rate": 0.0007705882352941177, + "loss": 0.3982, + "step": 8297 + }, + { + "epoch": 4.635754189944134, + "grad_norm": 0.49091672897338867, + "learning_rate": 0.0007705602240896358, + "loss": 0.52, + "step": 8298 + }, + { + "epoch": 4.636312849162011, + "grad_norm": 0.5421136617660522, + "learning_rate": 0.000770532212885154, + "loss": 0.5373, + "step": 8299 + }, + { + "epoch": 4.636871508379889, + "grad_norm": 0.5311143398284912, + "learning_rate": 0.0007705042016806723, + "loss": 0.3757, + "step": 8300 + }, + { + "epoch": 4.637430167597765, + "grad_norm": 0.544424295425415, + "learning_rate": 0.0007704761904761905, + "loss": 0.4635, + "step": 8301 + }, + { + "epoch": 4.637988826815643, + "grad_norm": 1.6883760690689087, + "learning_rate": 0.0007704481792717088, + "loss": 0.4245, + "step": 8302 + }, + { + "epoch": 4.638547486033519, + "grad_norm": 0.8369178175926208, + "learning_rate": 0.0007704201680672269, + "loss": 0.7341, + "step": 8303 + }, + { + "epoch": 4.639106145251397, + "grad_norm": 0.9898208975791931, + "learning_rate": 0.0007703921568627451, + "loss": 0.5294, + "step": 8304 + }, + { + "epoch": 4.639664804469274, + "grad_norm": 0.45998162031173706, + "learning_rate": 0.0007703641456582633, + "loss": 0.4047, + "step": 8305 + }, + { + "epoch": 4.640223463687151, + "grad_norm": 0.6343604326248169, + "learning_rate": 0.0007703361344537815, + "loss": 0.5054, + "step": 8306 + }, + { + "epoch": 4.640782122905028, + "grad_norm": 0.5256215929985046, + "learning_rate": 0.0007703081232492998, + "loss": 0.4557, + "step": 8307 + }, + { + "epoch": 4.641340782122905, + "grad_norm": 0.613641619682312, + "learning_rate": 0.000770280112044818, + "loss": 0.407, + "step": 8308 + }, + { + "epoch": 4.641899441340782, + "grad_norm": 0.4358263313770294, + "learning_rate": 0.0007702521008403361, + "loss": 0.4997, + "step": 8309 + }, + { + "epoch": 4.64245810055866, + "grad_norm": 0.8095864653587341, + "learning_rate": 0.0007702240896358543, + "loss": 0.4851, + "step": 8310 + }, + { + "epoch": 4.643016759776536, + "grad_norm": 1.0731861591339111, + "learning_rate": 0.0007701960784313725, + "loss": 0.5774, + "step": 8311 + }, + { + "epoch": 4.643575418994414, + "grad_norm": 0.68409264087677, + "learning_rate": 0.0007701680672268908, + "loss": 0.4754, + "step": 8312 + }, + { + "epoch": 4.64413407821229, + "grad_norm": 0.7335063815116882, + "learning_rate": 0.000770140056022409, + "loss": 0.4201, + "step": 8313 + }, + { + "epoch": 4.644692737430168, + "grad_norm": 0.5331463813781738, + "learning_rate": 0.0007701120448179271, + "loss": 0.4149, + "step": 8314 + }, + { + "epoch": 4.645251396648044, + "grad_norm": 0.8423575758934021, + "learning_rate": 0.0007700840336134453, + "loss": 0.4208, + "step": 8315 + }, + { + "epoch": 4.645810055865922, + "grad_norm": 0.9110333919525146, + "learning_rate": 0.0007700560224089635, + "loss": 0.5889, + "step": 8316 + }, + { + "epoch": 4.646368715083799, + "grad_norm": 0.7645127177238464, + "learning_rate": 0.0007700280112044819, + "loss": 0.5387, + "step": 8317 + }, + { + "epoch": 4.646927374301676, + "grad_norm": 0.527195394039154, + "learning_rate": 0.0007700000000000001, + "loss": 0.4044, + "step": 8318 + }, + { + "epoch": 4.647486033519553, + "grad_norm": 1.3709142208099365, + "learning_rate": 0.0007699719887955182, + "loss": 0.3442, + "step": 8319 + }, + { + "epoch": 4.648044692737431, + "grad_norm": 1.069693922996521, + "learning_rate": 0.0007699439775910364, + "loss": 0.6279, + "step": 8320 + }, + { + "epoch": 4.648603351955307, + "grad_norm": 1.5301125049591064, + "learning_rate": 0.0007699159663865546, + "loss": 0.4696, + "step": 8321 + }, + { + "epoch": 4.649162011173185, + "grad_norm": 0.7884263396263123, + "learning_rate": 0.0007698879551820729, + "loss": 0.4188, + "step": 8322 + }, + { + "epoch": 4.649720670391061, + "grad_norm": 0.5019213557243347, + "learning_rate": 0.0007698599439775911, + "loss": 0.5026, + "step": 8323 + }, + { + "epoch": 4.650279329608939, + "grad_norm": 0.7438209056854248, + "learning_rate": 0.0007698319327731093, + "loss": 0.5183, + "step": 8324 + }, + { + "epoch": 4.650837988826815, + "grad_norm": 0.5842709541320801, + "learning_rate": 0.0007698039215686274, + "loss": 0.5326, + "step": 8325 + }, + { + "epoch": 4.651396648044693, + "grad_norm": 5.246720314025879, + "learning_rate": 0.0007697759103641456, + "loss": 0.4647, + "step": 8326 + }, + { + "epoch": 4.65195530726257, + "grad_norm": 0.5407655835151672, + "learning_rate": 0.0007697478991596639, + "loss": 0.3925, + "step": 8327 + }, + { + "epoch": 4.652513966480447, + "grad_norm": 0.5838391780853271, + "learning_rate": 0.0007697198879551821, + "loss": 0.4706, + "step": 8328 + }, + { + "epoch": 4.653072625698324, + "grad_norm": 0.46035677194595337, + "learning_rate": 0.0007696918767507003, + "loss": 0.4331, + "step": 8329 + }, + { + "epoch": 4.653631284916202, + "grad_norm": 0.598491907119751, + "learning_rate": 0.0007696638655462184, + "loss": 0.4963, + "step": 8330 + }, + { + "epoch": 4.654189944134078, + "grad_norm": 1.1157419681549072, + "learning_rate": 0.0007696358543417366, + "loss": 0.4076, + "step": 8331 + }, + { + "epoch": 4.654748603351956, + "grad_norm": 0.46766576170921326, + "learning_rate": 0.000769607843137255, + "loss": 0.4299, + "step": 8332 + }, + { + "epoch": 4.655307262569832, + "grad_norm": 3.0822103023529053, + "learning_rate": 0.0007695798319327732, + "loss": 0.4938, + "step": 8333 + }, + { + "epoch": 4.65586592178771, + "grad_norm": 3.285327196121216, + "learning_rate": 0.0007695518207282914, + "loss": 0.4061, + "step": 8334 + }, + { + "epoch": 4.656424581005586, + "grad_norm": 3.0389466285705566, + "learning_rate": 0.0007695238095238095, + "loss": 0.5347, + "step": 8335 + }, + { + "epoch": 4.656983240223464, + "grad_norm": 0.5797761082649231, + "learning_rate": 0.0007694957983193277, + "loss": 0.5723, + "step": 8336 + }, + { + "epoch": 4.657541899441341, + "grad_norm": 0.6282501816749573, + "learning_rate": 0.000769467787114846, + "loss": 0.4822, + "step": 8337 + }, + { + "epoch": 4.658100558659218, + "grad_norm": 0.4578629434108734, + "learning_rate": 0.0007694397759103642, + "loss": 0.4148, + "step": 8338 + }, + { + "epoch": 4.658659217877095, + "grad_norm": 5.898891925811768, + "learning_rate": 0.0007694117647058824, + "loss": 0.4121, + "step": 8339 + }, + { + "epoch": 4.659217877094972, + "grad_norm": 0.5945466160774231, + "learning_rate": 0.0007693837535014006, + "loss": 0.6493, + "step": 8340 + }, + { + "epoch": 4.659776536312849, + "grad_norm": 0.6316630840301514, + "learning_rate": 0.0007693557422969187, + "loss": 0.4453, + "step": 8341 + }, + { + "epoch": 4.660335195530727, + "grad_norm": 1.0428556203842163, + "learning_rate": 0.000769327731092437, + "loss": 0.4298, + "step": 8342 + }, + { + "epoch": 4.660893854748603, + "grad_norm": 0.5396453142166138, + "learning_rate": 0.0007692997198879552, + "loss": 0.4931, + "step": 8343 + }, + { + "epoch": 4.661452513966481, + "grad_norm": 0.656947672367096, + "learning_rate": 0.0007692717086834734, + "loss": 0.4767, + "step": 8344 + }, + { + "epoch": 4.662011173184357, + "grad_norm": 0.38328754901885986, + "learning_rate": 0.0007692436974789916, + "loss": 0.4942, + "step": 8345 + }, + { + "epoch": 4.662569832402235, + "grad_norm": 0.5110989809036255, + "learning_rate": 0.0007692156862745097, + "loss": 0.5929, + "step": 8346 + }, + { + "epoch": 4.663128491620112, + "grad_norm": 0.5763763189315796, + "learning_rate": 0.000769187675070028, + "loss": 0.4603, + "step": 8347 + }, + { + "epoch": 4.663687150837989, + "grad_norm": 0.6368765830993652, + "learning_rate": 0.0007691596638655462, + "loss": 0.5005, + "step": 8348 + }, + { + "epoch": 4.664245810055866, + "grad_norm": 0.5837327241897583, + "learning_rate": 0.0007691316526610645, + "loss": 0.4204, + "step": 8349 + }, + { + "epoch": 4.664804469273743, + "grad_norm": 0.4411894679069519, + "learning_rate": 0.0007691036414565827, + "loss": 0.403, + "step": 8350 + }, + { + "epoch": 4.66536312849162, + "grad_norm": 0.5981355905532837, + "learning_rate": 0.0007690756302521008, + "loss": 0.4338, + "step": 8351 + }, + { + "epoch": 4.665921787709497, + "grad_norm": 0.6112882494926453, + "learning_rate": 0.0007690476190476191, + "loss": 0.5062, + "step": 8352 + }, + { + "epoch": 4.666480446927374, + "grad_norm": 0.694929301738739, + "learning_rate": 0.0007690196078431373, + "loss": 0.3944, + "step": 8353 + }, + { + "epoch": 4.667039106145252, + "grad_norm": 0.6692671179771423, + "learning_rate": 0.0007689915966386555, + "loss": 0.6464, + "step": 8354 + }, + { + "epoch": 4.667597765363128, + "grad_norm": 0.7128127813339233, + "learning_rate": 0.0007689635854341737, + "loss": 0.4137, + "step": 8355 + }, + { + "epoch": 4.668156424581006, + "grad_norm": 0.6459019184112549, + "learning_rate": 0.0007689355742296919, + "loss": 0.4213, + "step": 8356 + }, + { + "epoch": 4.668715083798883, + "grad_norm": 0.6452242136001587, + "learning_rate": 0.0007689075630252101, + "loss": 0.49, + "step": 8357 + }, + { + "epoch": 4.66927374301676, + "grad_norm": 0.44031664729118347, + "learning_rate": 0.0007688795518207283, + "loss": 0.5177, + "step": 8358 + }, + { + "epoch": 4.669832402234637, + "grad_norm": 0.5887936353683472, + "learning_rate": 0.0007688515406162465, + "loss": 0.4505, + "step": 8359 + }, + { + "epoch": 4.670391061452514, + "grad_norm": 2.7244529724121094, + "learning_rate": 0.0007688235294117647, + "loss": 0.4451, + "step": 8360 + }, + { + "epoch": 4.670949720670391, + "grad_norm": 7.343851566314697, + "learning_rate": 0.0007687955182072829, + "loss": 0.4978, + "step": 8361 + }, + { + "epoch": 4.671508379888268, + "grad_norm": 0.6784741878509521, + "learning_rate": 0.0007687675070028011, + "loss": 0.5199, + "step": 8362 + }, + { + "epoch": 4.672067039106145, + "grad_norm": 2.5949554443359375, + "learning_rate": 0.0007687394957983193, + "loss": 0.5394, + "step": 8363 + }, + { + "epoch": 4.672625698324023, + "grad_norm": 0.5449838042259216, + "learning_rate": 0.0007687114845938375, + "loss": 0.5266, + "step": 8364 + }, + { + "epoch": 4.673184357541899, + "grad_norm": 0.6077706217765808, + "learning_rate": 0.0007686834733893557, + "loss": 0.5288, + "step": 8365 + }, + { + "epoch": 4.673743016759777, + "grad_norm": 0.5879938006401062, + "learning_rate": 0.000768655462184874, + "loss": 0.4754, + "step": 8366 + }, + { + "epoch": 4.674301675977654, + "grad_norm": 2.168011426925659, + "learning_rate": 0.0007686274509803923, + "loss": 0.49, + "step": 8367 + }, + { + "epoch": 4.674860335195531, + "grad_norm": 0.3998042345046997, + "learning_rate": 0.0007685994397759104, + "loss": 0.4161, + "step": 8368 + }, + { + "epoch": 4.675418994413408, + "grad_norm": 0.7290630340576172, + "learning_rate": 0.0007685714285714286, + "loss": 0.5756, + "step": 8369 + }, + { + "epoch": 4.675977653631285, + "grad_norm": 0.9285721778869629, + "learning_rate": 0.0007685434173669468, + "loss": 0.4602, + "step": 8370 + }, + { + "epoch": 4.676536312849162, + "grad_norm": 0.5415141582489014, + "learning_rate": 0.000768515406162465, + "loss": 0.4735, + "step": 8371 + }, + { + "epoch": 4.677094972067039, + "grad_norm": 0.47024989128112793, + "learning_rate": 0.0007684873949579833, + "loss": 0.4542, + "step": 8372 + }, + { + "epoch": 4.677653631284916, + "grad_norm": 0.9392772316932678, + "learning_rate": 0.0007684593837535014, + "loss": 0.4814, + "step": 8373 + }, + { + "epoch": 4.678212290502794, + "grad_norm": 0.5973453521728516, + "learning_rate": 0.0007684313725490196, + "loss": 0.5103, + "step": 8374 + }, + { + "epoch": 4.67877094972067, + "grad_norm": 0.6517735123634338, + "learning_rate": 0.0007684033613445378, + "loss": 0.626, + "step": 8375 + }, + { + "epoch": 4.679329608938548, + "grad_norm": 0.5248894691467285, + "learning_rate": 0.000768375350140056, + "loss": 0.4042, + "step": 8376 + }, + { + "epoch": 4.679888268156424, + "grad_norm": 1.4121628999710083, + "learning_rate": 0.0007683473389355743, + "loss": 0.56, + "step": 8377 + }, + { + "epoch": 4.680446927374302, + "grad_norm": 0.696548342704773, + "learning_rate": 0.0007683193277310924, + "loss": 0.6, + "step": 8378 + }, + { + "epoch": 4.681005586592179, + "grad_norm": 4.406064987182617, + "learning_rate": 0.0007682913165266106, + "loss": 0.4036, + "step": 8379 + }, + { + "epoch": 4.681564245810056, + "grad_norm": 5.332188129425049, + "learning_rate": 0.0007682633053221288, + "loss": 0.477, + "step": 8380 + }, + { + "epoch": 4.682122905027933, + "grad_norm": 5.1298298835754395, + "learning_rate": 0.000768235294117647, + "loss": 0.4473, + "step": 8381 + }, + { + "epoch": 4.68268156424581, + "grad_norm": 0.9707376956939697, + "learning_rate": 0.0007682072829131654, + "loss": 0.4221, + "step": 8382 + }, + { + "epoch": 4.683240223463687, + "grad_norm": 0.49457040429115295, + "learning_rate": 0.0007681792717086836, + "loss": 0.4591, + "step": 8383 + }, + { + "epoch": 4.683798882681565, + "grad_norm": 0.5346271395683289, + "learning_rate": 0.0007681512605042017, + "loss": 0.5003, + "step": 8384 + }, + { + "epoch": 4.684357541899441, + "grad_norm": 0.5562250018119812, + "learning_rate": 0.0007681232492997199, + "loss": 0.5317, + "step": 8385 + }, + { + "epoch": 4.684916201117319, + "grad_norm": 0.5596450567245483, + "learning_rate": 0.0007680952380952381, + "loss": 0.4964, + "step": 8386 + }, + { + "epoch": 4.685474860335195, + "grad_norm": 0.578858494758606, + "learning_rate": 0.0007680672268907564, + "loss": 0.4327, + "step": 8387 + }, + { + "epoch": 4.686033519553073, + "grad_norm": 0.47599729895591736, + "learning_rate": 0.0007680392156862746, + "loss": 0.3794, + "step": 8388 + }, + { + "epoch": 4.686592178770949, + "grad_norm": 0.6692066788673401, + "learning_rate": 0.0007680112044817927, + "loss": 0.4484, + "step": 8389 + }, + { + "epoch": 4.687150837988827, + "grad_norm": 1.602552056312561, + "learning_rate": 0.0007679831932773109, + "loss": 0.4162, + "step": 8390 + }, + { + "epoch": 4.687709497206704, + "grad_norm": 0.5399988293647766, + "learning_rate": 0.0007679551820728291, + "loss": 0.4497, + "step": 8391 + }, + { + "epoch": 4.688268156424581, + "grad_norm": 0.5623546838760376, + "learning_rate": 0.0007679271708683474, + "loss": 0.3778, + "step": 8392 + }, + { + "epoch": 4.688826815642458, + "grad_norm": 0.5338775515556335, + "learning_rate": 0.0007678991596638656, + "loss": 0.4721, + "step": 8393 + }, + { + "epoch": 4.689385474860336, + "grad_norm": 0.8115939497947693, + "learning_rate": 0.0007678711484593837, + "loss": 0.5732, + "step": 8394 + }, + { + "epoch": 4.689944134078212, + "grad_norm": 0.5184304118156433, + "learning_rate": 0.0007678431372549019, + "loss": 0.4505, + "step": 8395 + }, + { + "epoch": 4.69050279329609, + "grad_norm": 0.7624722719192505, + "learning_rate": 0.0007678151260504201, + "loss": 0.5618, + "step": 8396 + }, + { + "epoch": 4.691061452513966, + "grad_norm": 0.7532416582107544, + "learning_rate": 0.0007677871148459385, + "loss": 0.4829, + "step": 8397 + }, + { + "epoch": 4.691620111731844, + "grad_norm": 0.8131194114685059, + "learning_rate": 0.0007677591036414567, + "loss": 0.4276, + "step": 8398 + }, + { + "epoch": 4.69217877094972, + "grad_norm": 1.1290749311447144, + "learning_rate": 0.0007677310924369749, + "loss": 0.4436, + "step": 8399 + }, + { + "epoch": 4.692737430167598, + "grad_norm": 0.8095458745956421, + "learning_rate": 0.000767703081232493, + "loss": 0.4921, + "step": 8400 + }, + { + "epoch": 4.693296089385475, + "grad_norm": 2.7540712356567383, + "learning_rate": 0.0007676750700280112, + "loss": 0.4685, + "step": 8401 + }, + { + "epoch": 4.693854748603352, + "grad_norm": 1.0384215116500854, + "learning_rate": 0.0007676470588235295, + "loss": 0.7322, + "step": 8402 + }, + { + "epoch": 4.694413407821229, + "grad_norm": 0.9079729318618774, + "learning_rate": 0.0007676190476190477, + "loss": 0.5864, + "step": 8403 + }, + { + "epoch": 4.694972067039107, + "grad_norm": 0.5142230987548828, + "learning_rate": 0.0007675910364145659, + "loss": 0.6077, + "step": 8404 + }, + { + "epoch": 4.695530726256983, + "grad_norm": 0.8453822135925293, + "learning_rate": 0.000767563025210084, + "loss": 0.5175, + "step": 8405 + }, + { + "epoch": 4.696089385474861, + "grad_norm": 0.6371819376945496, + "learning_rate": 0.0007675350140056022, + "loss": 0.5104, + "step": 8406 + }, + { + "epoch": 4.696648044692737, + "grad_norm": 1.5283763408660889, + "learning_rate": 0.0007675070028011205, + "loss": 0.4872, + "step": 8407 + }, + { + "epoch": 4.697206703910615, + "grad_norm": 0.5396294593811035, + "learning_rate": 0.0007674789915966387, + "loss": 0.4043, + "step": 8408 + }, + { + "epoch": 4.697765363128491, + "grad_norm": 0.7387148141860962, + "learning_rate": 0.0007674509803921569, + "loss": 0.433, + "step": 8409 + }, + { + "epoch": 4.698324022346369, + "grad_norm": 0.5455625057220459, + "learning_rate": 0.000767422969187675, + "loss": 0.3997, + "step": 8410 + }, + { + "epoch": 4.698882681564246, + "grad_norm": 0.43977129459381104, + "learning_rate": 0.0007673949579831932, + "loss": 0.4116, + "step": 8411 + }, + { + "epoch": 4.699441340782123, + "grad_norm": 0.5600402355194092, + "learning_rate": 0.0007673669467787115, + "loss": 0.5067, + "step": 8412 + }, + { + "epoch": 4.7, + "grad_norm": 0.5582857131958008, + "learning_rate": 0.0007673389355742297, + "loss": 0.4591, + "step": 8413 + }, + { + "epoch": 4.700558659217877, + "grad_norm": 3.107438087463379, + "learning_rate": 0.000767310924369748, + "loss": 0.4407, + "step": 8414 + }, + { + "epoch": 4.701117318435754, + "grad_norm": 0.6607743501663208, + "learning_rate": 0.0007672829131652662, + "loss": 0.5214, + "step": 8415 + }, + { + "epoch": 4.701675977653632, + "grad_norm": 9.292428970336914, + "learning_rate": 0.0007672549019607843, + "loss": 0.5008, + "step": 8416 + }, + { + "epoch": 4.702234636871508, + "grad_norm": 0.5419268608093262, + "learning_rate": 0.0007672268907563026, + "loss": 0.5598, + "step": 8417 + }, + { + "epoch": 4.702793296089386, + "grad_norm": 0.6430529356002808, + "learning_rate": 0.0007671988795518208, + "loss": 0.443, + "step": 8418 + }, + { + "epoch": 4.703351955307262, + "grad_norm": 0.8362265229225159, + "learning_rate": 0.000767170868347339, + "loss": 0.5445, + "step": 8419 + }, + { + "epoch": 4.70391061452514, + "grad_norm": 0.8648584485054016, + "learning_rate": 0.0007671428571428572, + "loss": 0.4465, + "step": 8420 + }, + { + "epoch": 4.704469273743017, + "grad_norm": 0.7796539664268494, + "learning_rate": 0.0007671148459383753, + "loss": 0.6103, + "step": 8421 + }, + { + "epoch": 4.705027932960894, + "grad_norm": 4.9278154373168945, + "learning_rate": 0.0007670868347338936, + "loss": 0.4605, + "step": 8422 + }, + { + "epoch": 4.705586592178771, + "grad_norm": 0.9810303449630737, + "learning_rate": 0.0007670588235294118, + "loss": 0.3918, + "step": 8423 + }, + { + "epoch": 4.706145251396648, + "grad_norm": 0.5862292051315308, + "learning_rate": 0.00076703081232493, + "loss": 0.5266, + "step": 8424 + }, + { + "epoch": 4.706703910614525, + "grad_norm": 0.8005500435829163, + "learning_rate": 0.0007670028011204482, + "loss": 0.4616, + "step": 8425 + }, + { + "epoch": 4.707262569832402, + "grad_norm": 0.5437454581260681, + "learning_rate": 0.0007669747899159663, + "loss": 0.5492, + "step": 8426 + }, + { + "epoch": 4.707821229050279, + "grad_norm": 0.5736657381057739, + "learning_rate": 0.0007669467787114846, + "loss": 0.5325, + "step": 8427 + }, + { + "epoch": 4.708379888268157, + "grad_norm": 0.6227053999900818, + "learning_rate": 0.0007669187675070028, + "loss": 0.4251, + "step": 8428 + }, + { + "epoch": 4.708938547486033, + "grad_norm": 0.8950062394142151, + "learning_rate": 0.000766890756302521, + "loss": 0.4034, + "step": 8429 + }, + { + "epoch": 4.709497206703911, + "grad_norm": 0.7234181761741638, + "learning_rate": 0.0007668627450980392, + "loss": 0.3824, + "step": 8430 + }, + { + "epoch": 4.710055865921788, + "grad_norm": 2.1865220069885254, + "learning_rate": 0.0007668347338935575, + "loss": 0.5149, + "step": 8431 + }, + { + "epoch": 4.710614525139665, + "grad_norm": 0.5704142451286316, + "learning_rate": 0.0007668067226890757, + "loss": 0.5251, + "step": 8432 + }, + { + "epoch": 4.711173184357542, + "grad_norm": 0.5133355855941772, + "learning_rate": 0.0007667787114845939, + "loss": 0.4133, + "step": 8433 + }, + { + "epoch": 4.711731843575419, + "grad_norm": 0.5629573464393616, + "learning_rate": 0.0007667507002801121, + "loss": 0.3161, + "step": 8434 + }, + { + "epoch": 4.712290502793296, + "grad_norm": 6.425321578979492, + "learning_rate": 0.0007667226890756303, + "loss": 0.5432, + "step": 8435 + }, + { + "epoch": 4.712849162011173, + "grad_norm": 0.940984308719635, + "learning_rate": 0.0007666946778711485, + "loss": 0.3184, + "step": 8436 + }, + { + "epoch": 4.71340782122905, + "grad_norm": 0.9444315433502197, + "learning_rate": 0.0007666666666666667, + "loss": 0.4171, + "step": 8437 + }, + { + "epoch": 4.713966480446928, + "grad_norm": 0.5563790202140808, + "learning_rate": 0.0007666386554621849, + "loss": 0.4324, + "step": 8438 + }, + { + "epoch": 4.714525139664804, + "grad_norm": 0.7993582487106323, + "learning_rate": 0.0007666106442577031, + "loss": 0.3976, + "step": 8439 + }, + { + "epoch": 4.715083798882682, + "grad_norm": 0.6162702441215515, + "learning_rate": 0.0007665826330532213, + "loss": 0.6114, + "step": 8440 + }, + { + "epoch": 4.715642458100559, + "grad_norm": 0.9097176194190979, + "learning_rate": 0.0007665546218487395, + "loss": 0.4878, + "step": 8441 + }, + { + "epoch": 4.716201117318436, + "grad_norm": 0.6418911218643188, + "learning_rate": 0.0007665266106442577, + "loss": 0.555, + "step": 8442 + }, + { + "epoch": 4.716759776536313, + "grad_norm": 0.8568861484527588, + "learning_rate": 0.0007664985994397759, + "loss": 0.4174, + "step": 8443 + }, + { + "epoch": 4.71731843575419, + "grad_norm": 0.6429045796394348, + "learning_rate": 0.0007664705882352941, + "loss": 0.4053, + "step": 8444 + }, + { + "epoch": 4.717877094972067, + "grad_norm": 0.9385924339294434, + "learning_rate": 0.0007664425770308123, + "loss": 0.5763, + "step": 8445 + }, + { + "epoch": 4.718435754189944, + "grad_norm": 1.9731087684631348, + "learning_rate": 0.0007664145658263305, + "loss": 0.4474, + "step": 8446 + }, + { + "epoch": 4.718994413407821, + "grad_norm": 0.5894946455955505, + "learning_rate": 0.0007663865546218489, + "loss": 0.3376, + "step": 8447 + }, + { + "epoch": 4.719553072625699, + "grad_norm": 0.6896722912788391, + "learning_rate": 0.000766358543417367, + "loss": 0.4381, + "step": 8448 + }, + { + "epoch": 4.720111731843575, + "grad_norm": 0.8945481181144714, + "learning_rate": 0.0007663305322128852, + "loss": 0.4788, + "step": 8449 + }, + { + "epoch": 4.720670391061453, + "grad_norm": 0.5758554339408875, + "learning_rate": 0.0007663025210084034, + "loss": 0.4761, + "step": 8450 + }, + { + "epoch": 4.721229050279329, + "grad_norm": 0.6602922081947327, + "learning_rate": 0.0007662745098039216, + "loss": 0.4452, + "step": 8451 + }, + { + "epoch": 4.721787709497207, + "grad_norm": 0.5906196236610413, + "learning_rate": 0.0007662464985994399, + "loss": 0.4372, + "step": 8452 + }, + { + "epoch": 4.722346368715084, + "grad_norm": 0.5834033489227295, + "learning_rate": 0.000766218487394958, + "loss": 0.4272, + "step": 8453 + }, + { + "epoch": 4.722905027932961, + "grad_norm": 1.266449213027954, + "learning_rate": 0.0007661904761904762, + "loss": 0.4435, + "step": 8454 + }, + { + "epoch": 4.723463687150838, + "grad_norm": 0.4318232834339142, + "learning_rate": 0.0007661624649859944, + "loss": 0.5326, + "step": 8455 + }, + { + "epoch": 4.724022346368715, + "grad_norm": 1.2043719291687012, + "learning_rate": 0.0007661344537815126, + "loss": 0.4817, + "step": 8456 + }, + { + "epoch": 4.724581005586592, + "grad_norm": 0.5707458257675171, + "learning_rate": 0.0007661064425770309, + "loss": 0.4559, + "step": 8457 + }, + { + "epoch": 4.72513966480447, + "grad_norm": 0.46222472190856934, + "learning_rate": 0.000766078431372549, + "loss": 0.5457, + "step": 8458 + }, + { + "epoch": 4.725698324022346, + "grad_norm": 1.29996657371521, + "learning_rate": 0.0007660504201680672, + "loss": 0.3586, + "step": 8459 + }, + { + "epoch": 4.726256983240224, + "grad_norm": 0.8138699531555176, + "learning_rate": 0.0007660224089635854, + "loss": 0.4255, + "step": 8460 + }, + { + "epoch": 4.7268156424581, + "grad_norm": 0.6481475830078125, + "learning_rate": 0.0007659943977591036, + "loss": 0.4189, + "step": 8461 + }, + { + "epoch": 4.727374301675978, + "grad_norm": 0.4448300302028656, + "learning_rate": 0.000765966386554622, + "loss": 0.4507, + "step": 8462 + }, + { + "epoch": 4.727932960893854, + "grad_norm": 1.2780014276504517, + "learning_rate": 0.0007659383753501402, + "loss": 0.5453, + "step": 8463 + }, + { + "epoch": 4.728491620111732, + "grad_norm": 0.7414190769195557, + "learning_rate": 0.0007659103641456583, + "loss": 0.5589, + "step": 8464 + }, + { + "epoch": 4.729050279329609, + "grad_norm": 0.458852082490921, + "learning_rate": 0.0007658823529411765, + "loss": 0.4221, + "step": 8465 + }, + { + "epoch": 4.729608938547486, + "grad_norm": 0.6836581826210022, + "learning_rate": 0.0007658543417366947, + "loss": 0.4914, + "step": 8466 + }, + { + "epoch": 4.730167597765363, + "grad_norm": 0.4650897681713104, + "learning_rate": 0.000765826330532213, + "loss": 0.5294, + "step": 8467 + }, + { + "epoch": 4.730726256983241, + "grad_norm": 0.3533768057823181, + "learning_rate": 0.0007657983193277312, + "loss": 0.3611, + "step": 8468 + }, + { + "epoch": 4.731284916201117, + "grad_norm": 0.7186769247055054, + "learning_rate": 0.0007657703081232493, + "loss": 0.4371, + "step": 8469 + }, + { + "epoch": 4.731843575418995, + "grad_norm": 0.4629736542701721, + "learning_rate": 0.0007657422969187675, + "loss": 0.3897, + "step": 8470 + }, + { + "epoch": 4.732402234636871, + "grad_norm": 0.6437535881996155, + "learning_rate": 0.0007657142857142857, + "loss": 0.5201, + "step": 8471 + }, + { + "epoch": 4.732960893854749, + "grad_norm": 1.0654048919677734, + "learning_rate": 0.000765686274509804, + "loss": 0.4863, + "step": 8472 + }, + { + "epoch": 4.733519553072625, + "grad_norm": 0.7273711562156677, + "learning_rate": 0.0007656582633053222, + "loss": 0.5456, + "step": 8473 + }, + { + "epoch": 4.734078212290503, + "grad_norm": 0.4468400776386261, + "learning_rate": 0.0007656302521008403, + "loss": 0.4874, + "step": 8474 + }, + { + "epoch": 4.73463687150838, + "grad_norm": 0.5781263709068298, + "learning_rate": 0.0007656022408963585, + "loss": 0.5598, + "step": 8475 + }, + { + "epoch": 4.735195530726257, + "grad_norm": 0.7847022414207458, + "learning_rate": 0.0007655742296918767, + "loss": 0.3644, + "step": 8476 + }, + { + "epoch": 4.735754189944134, + "grad_norm": 0.6021060347557068, + "learning_rate": 0.0007655462184873949, + "loss": 0.4145, + "step": 8477 + }, + { + "epoch": 4.736312849162011, + "grad_norm": 0.5077820420265198, + "learning_rate": 0.0007655182072829132, + "loss": 0.4714, + "step": 8478 + }, + { + "epoch": 4.736871508379888, + "grad_norm": 0.7651529908180237, + "learning_rate": 0.0007654901960784315, + "loss": 0.5498, + "step": 8479 + }, + { + "epoch": 4.737430167597766, + "grad_norm": 5.596915245056152, + "learning_rate": 0.0007654621848739495, + "loss": 0.4456, + "step": 8480 + }, + { + "epoch": 4.737988826815642, + "grad_norm": 0.688568115234375, + "learning_rate": 0.0007654341736694678, + "loss": 0.5245, + "step": 8481 + }, + { + "epoch": 4.73854748603352, + "grad_norm": 0.5024352073669434, + "learning_rate": 0.000765406162464986, + "loss": 0.4767, + "step": 8482 + }, + { + "epoch": 4.739106145251396, + "grad_norm": 1.536091923713684, + "learning_rate": 0.0007653781512605043, + "loss": 0.4464, + "step": 8483 + }, + { + "epoch": 4.739664804469274, + "grad_norm": 1.293010950088501, + "learning_rate": 0.0007653501400560225, + "loss": 0.4876, + "step": 8484 + }, + { + "epoch": 4.740223463687151, + "grad_norm": 0.4086540639400482, + "learning_rate": 0.0007653221288515406, + "loss": 0.4024, + "step": 8485 + }, + { + "epoch": 4.740782122905028, + "grad_norm": 0.5928152799606323, + "learning_rate": 0.0007652941176470588, + "loss": 0.4414, + "step": 8486 + }, + { + "epoch": 4.741340782122905, + "grad_norm": 0.431692510843277, + "learning_rate": 0.000765266106442577, + "loss": 0.4093, + "step": 8487 + }, + { + "epoch": 4.741899441340782, + "grad_norm": 0.42686524987220764, + "learning_rate": 0.0007652380952380953, + "loss": 0.4037, + "step": 8488 + }, + { + "epoch": 4.742458100558659, + "grad_norm": 0.47489142417907715, + "learning_rate": 0.0007652100840336135, + "loss": 0.3869, + "step": 8489 + }, + { + "epoch": 4.743016759776537, + "grad_norm": 0.7389423847198486, + "learning_rate": 0.0007651820728291316, + "loss": 0.5071, + "step": 8490 + }, + { + "epoch": 4.743575418994413, + "grad_norm": 2.945859432220459, + "learning_rate": 0.0007651540616246498, + "loss": 0.448, + "step": 8491 + }, + { + "epoch": 4.744134078212291, + "grad_norm": 0.7166175842285156, + "learning_rate": 0.000765126050420168, + "loss": 0.4907, + "step": 8492 + }, + { + "epoch": 4.744692737430167, + "grad_norm": 0.544670045375824, + "learning_rate": 0.0007650980392156863, + "loss": 0.4525, + "step": 8493 + }, + { + "epoch": 4.745251396648045, + "grad_norm": 0.6563423871994019, + "learning_rate": 0.0007650700280112045, + "loss": 0.4313, + "step": 8494 + }, + { + "epoch": 4.745810055865922, + "grad_norm": 0.40189117193222046, + "learning_rate": 0.0007650420168067227, + "loss": 0.3294, + "step": 8495 + }, + { + "epoch": 4.746368715083799, + "grad_norm": 0.48015493154525757, + "learning_rate": 0.0007650140056022408, + "loss": 0.3317, + "step": 8496 + }, + { + "epoch": 4.746927374301676, + "grad_norm": 0.5164525508880615, + "learning_rate": 0.000764985994397759, + "loss": 0.4001, + "step": 8497 + }, + { + "epoch": 4.747486033519553, + "grad_norm": 0.46735015511512756, + "learning_rate": 0.0007649579831932774, + "loss": 0.4144, + "step": 8498 + }, + { + "epoch": 4.74804469273743, + "grad_norm": 0.4527231752872467, + "learning_rate": 0.0007649299719887956, + "loss": 0.4198, + "step": 8499 + }, + { + "epoch": 4.748603351955307, + "grad_norm": 1.2354981899261475, + "learning_rate": 0.0007649019607843138, + "loss": 0.4893, + "step": 8500 + }, + { + "epoch": 4.748603351955307, + "eval_cer": 0.09194461719747307, + "eval_loss": 0.35319676995277405, + "eval_runtime": 57.1481, + "eval_samples_per_second": 79.408, + "eval_steps_per_second": 4.97, + "eval_wer": 0.3625597050361722, + "step": 8500 + }, + { + "epoch": 4.749162011173184, + "grad_norm": 0.7207176089286804, + "learning_rate": 0.0007648739495798319, + "loss": 0.5563, + "step": 8501 + }, + { + "epoch": 4.749720670391062, + "grad_norm": 0.38375696539878845, + "learning_rate": 0.0007648459383753501, + "loss": 0.3644, + "step": 8502 + }, + { + "epoch": 4.750279329608938, + "grad_norm": 1.0161912441253662, + "learning_rate": 0.0007648179271708684, + "loss": 0.3834, + "step": 8503 + }, + { + "epoch": 4.750837988826816, + "grad_norm": 0.72743821144104, + "learning_rate": 0.0007647899159663866, + "loss": 0.5341, + "step": 8504 + }, + { + "epoch": 4.751396648044693, + "grad_norm": 0.514685869216919, + "learning_rate": 0.0007647619047619048, + "loss": 0.5476, + "step": 8505 + }, + { + "epoch": 4.75195530726257, + "grad_norm": 0.5789101123809814, + "learning_rate": 0.0007647338935574229, + "loss": 0.5369, + "step": 8506 + }, + { + "epoch": 4.752513966480447, + "grad_norm": 0.5056542158126831, + "learning_rate": 0.0007647058823529411, + "loss": 0.4842, + "step": 8507 + }, + { + "epoch": 4.753072625698324, + "grad_norm": 0.39382168650627136, + "learning_rate": 0.0007646778711484594, + "loss": 0.4077, + "step": 8508 + }, + { + "epoch": 4.753631284916201, + "grad_norm": 0.5213464498519897, + "learning_rate": 0.0007646498599439776, + "loss": 0.4121, + "step": 8509 + }, + { + "epoch": 4.754189944134078, + "grad_norm": 0.4672740399837494, + "learning_rate": 0.0007646218487394958, + "loss": 0.3647, + "step": 8510 + }, + { + "epoch": 4.754748603351955, + "grad_norm": 0.48578375577926636, + "learning_rate": 0.000764593837535014, + "loss": 0.4473, + "step": 8511 + }, + { + "epoch": 4.755307262569833, + "grad_norm": 1.0094746351242065, + "learning_rate": 0.0007645658263305321, + "loss": 0.5339, + "step": 8512 + }, + { + "epoch": 4.755865921787709, + "grad_norm": 0.6768170595169067, + "learning_rate": 0.0007645378151260505, + "loss": 0.4931, + "step": 8513 + }, + { + "epoch": 4.756424581005587, + "grad_norm": 0.4883013963699341, + "learning_rate": 0.0007645098039215687, + "loss": 0.3961, + "step": 8514 + }, + { + "epoch": 4.756983240223463, + "grad_norm": 0.6311526894569397, + "learning_rate": 0.0007644817927170869, + "loss": 0.5226, + "step": 8515 + }, + { + "epoch": 4.757541899441341, + "grad_norm": 0.5176435708999634, + "learning_rate": 0.0007644537815126051, + "loss": 0.4457, + "step": 8516 + }, + { + "epoch": 4.758100558659218, + "grad_norm": 1.0449047088623047, + "learning_rate": 0.0007644257703081232, + "loss": 0.5141, + "step": 8517 + }, + { + "epoch": 4.758659217877095, + "grad_norm": 0.3304331600666046, + "learning_rate": 0.0007643977591036415, + "loss": 0.3415, + "step": 8518 + }, + { + "epoch": 4.759217877094972, + "grad_norm": 0.5925968885421753, + "learning_rate": 0.0007643697478991597, + "loss": 0.3381, + "step": 8519 + }, + { + "epoch": 4.759776536312849, + "grad_norm": 0.9773862957954407, + "learning_rate": 0.0007643417366946779, + "loss": 0.5377, + "step": 8520 + }, + { + "epoch": 4.760335195530726, + "grad_norm": 0.7833096385002136, + "learning_rate": 0.0007643137254901961, + "loss": 0.4993, + "step": 8521 + }, + { + "epoch": 4.760893854748604, + "grad_norm": 0.6476019620895386, + "learning_rate": 0.0007642857142857142, + "loss": 0.4434, + "step": 8522 + }, + { + "epoch": 4.76145251396648, + "grad_norm": 0.4440038800239563, + "learning_rate": 0.0007642577030812325, + "loss": 0.4266, + "step": 8523 + }, + { + "epoch": 4.762011173184358, + "grad_norm": 0.8631799221038818, + "learning_rate": 0.0007642296918767507, + "loss": 0.4521, + "step": 8524 + }, + { + "epoch": 4.762569832402234, + "grad_norm": 0.5670862793922424, + "learning_rate": 0.0007642016806722689, + "loss": 0.4143, + "step": 8525 + }, + { + "epoch": 4.763128491620112, + "grad_norm": 0.6003089547157288, + "learning_rate": 0.0007641736694677871, + "loss": 0.5468, + "step": 8526 + }, + { + "epoch": 4.763687150837989, + "grad_norm": 0.5930803418159485, + "learning_rate": 0.0007641456582633053, + "loss": 0.4601, + "step": 8527 + }, + { + "epoch": 4.764245810055866, + "grad_norm": 0.3905738294124603, + "learning_rate": 0.0007641176470588235, + "loss": 0.4098, + "step": 8528 + }, + { + "epoch": 4.764804469273743, + "grad_norm": 0.562506914138794, + "learning_rate": 0.0007640896358543417, + "loss": 0.391, + "step": 8529 + }, + { + "epoch": 4.76536312849162, + "grad_norm": 0.6021487712860107, + "learning_rate": 0.00076406162464986, + "loss": 0.5974, + "step": 8530 + }, + { + "epoch": 4.765921787709497, + "grad_norm": 0.7099018096923828, + "learning_rate": 0.0007640336134453782, + "loss": 0.5282, + "step": 8531 + }, + { + "epoch": 4.766480446927375, + "grad_norm": 0.7900936603546143, + "learning_rate": 0.0007640056022408964, + "loss": 0.4747, + "step": 8532 + }, + { + "epoch": 4.767039106145251, + "grad_norm": 0.523512065410614, + "learning_rate": 0.0007639775910364146, + "loss": 0.4116, + "step": 8533 + }, + { + "epoch": 4.767597765363129, + "grad_norm": 0.6078117489814758, + "learning_rate": 0.0007639495798319328, + "loss": 0.5375, + "step": 8534 + }, + { + "epoch": 4.768156424581005, + "grad_norm": 0.6556194424629211, + "learning_rate": 0.000763921568627451, + "loss": 0.435, + "step": 8535 + }, + { + "epoch": 4.768715083798883, + "grad_norm": 2.1966516971588135, + "learning_rate": 0.0007638935574229692, + "loss": 0.4182, + "step": 8536 + }, + { + "epoch": 4.769273743016759, + "grad_norm": 0.8509520888328552, + "learning_rate": 0.0007638655462184874, + "loss": 0.527, + "step": 8537 + }, + { + "epoch": 4.769832402234637, + "grad_norm": 1.2802207469940186, + "learning_rate": 0.0007638375350140056, + "loss": 0.4945, + "step": 8538 + }, + { + "epoch": 4.770391061452514, + "grad_norm": 0.7501946687698364, + "learning_rate": 0.0007638095238095238, + "loss": 0.3921, + "step": 8539 + }, + { + "epoch": 4.770949720670391, + "grad_norm": 0.6180909276008606, + "learning_rate": 0.000763781512605042, + "loss": 0.4274, + "step": 8540 + }, + { + "epoch": 4.771508379888268, + "grad_norm": 0.5417438745498657, + "learning_rate": 0.0007637535014005602, + "loss": 0.5829, + "step": 8541 + }, + { + "epoch": 4.772067039106146, + "grad_norm": 0.44538432359695435, + "learning_rate": 0.0007637254901960784, + "loss": 0.4351, + "step": 8542 + }, + { + "epoch": 4.772625698324022, + "grad_norm": 0.8329975008964539, + "learning_rate": 0.0007636974789915967, + "loss": 0.4046, + "step": 8543 + }, + { + "epoch": 4.7731843575419, + "grad_norm": 0.814070463180542, + "learning_rate": 0.0007636694677871148, + "loss": 0.4987, + "step": 8544 + }, + { + "epoch": 4.773743016759776, + "grad_norm": 0.5354944467544556, + "learning_rate": 0.000763641456582633, + "loss": 0.4365, + "step": 8545 + }, + { + "epoch": 4.774301675977654, + "grad_norm": 0.44521036744117737, + "learning_rate": 0.0007636134453781513, + "loss": 0.4372, + "step": 8546 + }, + { + "epoch": 4.77486033519553, + "grad_norm": 1.0154215097427368, + "learning_rate": 0.0007635854341736695, + "loss": 0.3724, + "step": 8547 + }, + { + "epoch": 4.775418994413408, + "grad_norm": 0.6184551119804382, + "learning_rate": 0.0007635574229691878, + "loss": 0.4612, + "step": 8548 + }, + { + "epoch": 4.775977653631285, + "grad_norm": 0.48441067337989807, + "learning_rate": 0.0007635294117647059, + "loss": 0.445, + "step": 8549 + }, + { + "epoch": 4.776536312849162, + "grad_norm": 0.4702484905719757, + "learning_rate": 0.0007635014005602241, + "loss": 0.5007, + "step": 8550 + }, + { + "epoch": 4.777094972067039, + "grad_norm": 0.31311506032943726, + "learning_rate": 0.0007634733893557423, + "loss": 0.2831, + "step": 8551 + }, + { + "epoch": 4.777653631284916, + "grad_norm": 0.5636079907417297, + "learning_rate": 0.0007634453781512605, + "loss": 0.5599, + "step": 8552 + }, + { + "epoch": 4.778212290502793, + "grad_norm": 0.6074462532997131, + "learning_rate": 0.0007634173669467788, + "loss": 0.4183, + "step": 8553 + }, + { + "epoch": 4.778770949720671, + "grad_norm": 0.946617066860199, + "learning_rate": 0.0007633893557422969, + "loss": 0.3671, + "step": 8554 + }, + { + "epoch": 4.779329608938547, + "grad_norm": 0.46405544877052307, + "learning_rate": 0.0007633613445378151, + "loss": 0.4828, + "step": 8555 + }, + { + "epoch": 4.779888268156425, + "grad_norm": 0.49630939960479736, + "learning_rate": 0.0007633333333333333, + "loss": 0.4057, + "step": 8556 + }, + { + "epoch": 4.780446927374301, + "grad_norm": 1.0906662940979004, + "learning_rate": 0.0007633053221288515, + "loss": 0.494, + "step": 8557 + }, + { + "epoch": 4.781005586592179, + "grad_norm": 0.6826028227806091, + "learning_rate": 0.0007632773109243698, + "loss": 0.5314, + "step": 8558 + }, + { + "epoch": 4.781564245810056, + "grad_norm": 0.55367511510849, + "learning_rate": 0.000763249299719888, + "loss": 0.4674, + "step": 8559 + }, + { + "epoch": 4.782122905027933, + "grad_norm": 0.44338259100914, + "learning_rate": 0.0007632212885154061, + "loss": 0.4788, + "step": 8560 + }, + { + "epoch": 4.78268156424581, + "grad_norm": 0.3534598648548126, + "learning_rate": 0.0007631932773109243, + "loss": 0.3624, + "step": 8561 + }, + { + "epoch": 4.783240223463687, + "grad_norm": 0.4212881624698639, + "learning_rate": 0.0007631652661064425, + "loss": 0.4386, + "step": 8562 + }, + { + "epoch": 4.783798882681564, + "grad_norm": 0.4065394401550293, + "learning_rate": 0.0007631372549019609, + "loss": 0.4035, + "step": 8563 + }, + { + "epoch": 4.784357541899441, + "grad_norm": 0.4878144860267639, + "learning_rate": 0.0007631092436974791, + "loss": 0.4384, + "step": 8564 + }, + { + "epoch": 4.784916201117318, + "grad_norm": 0.5604859590530396, + "learning_rate": 0.0007630812324929972, + "loss": 0.4836, + "step": 8565 + }, + { + "epoch": 4.785474860335196, + "grad_norm": 0.5255528092384338, + "learning_rate": 0.0007630532212885154, + "loss": 0.4055, + "step": 8566 + }, + { + "epoch": 4.786033519553072, + "grad_norm": 0.5735591053962708, + "learning_rate": 0.0007630252100840336, + "loss": 0.4753, + "step": 8567 + }, + { + "epoch": 4.78659217877095, + "grad_norm": 1.0948740243911743, + "learning_rate": 0.0007629971988795519, + "loss": 0.518, + "step": 8568 + }, + { + "epoch": 4.787150837988827, + "grad_norm": 0.6727017164230347, + "learning_rate": 0.0007629691876750701, + "loss": 0.428, + "step": 8569 + }, + { + "epoch": 4.787709497206704, + "grad_norm": 0.577598512172699, + "learning_rate": 0.0007629411764705882, + "loss": 0.4873, + "step": 8570 + }, + { + "epoch": 4.788268156424581, + "grad_norm": 0.7192434668540955, + "learning_rate": 0.0007629131652661064, + "loss": 0.5153, + "step": 8571 + }, + { + "epoch": 4.788826815642458, + "grad_norm": 0.3713063597679138, + "learning_rate": 0.0007628851540616246, + "loss": 0.3928, + "step": 8572 + }, + { + "epoch": 4.789385474860335, + "grad_norm": 0.5450425744056702, + "learning_rate": 0.0007628571428571429, + "loss": 0.6114, + "step": 8573 + }, + { + "epoch": 4.789944134078212, + "grad_norm": 0.8751230835914612, + "learning_rate": 0.0007628291316526611, + "loss": 0.4851, + "step": 8574 + }, + { + "epoch": 4.790502793296089, + "grad_norm": 0.4314839243888855, + "learning_rate": 0.0007628011204481793, + "loss": 0.4735, + "step": 8575 + }, + { + "epoch": 4.791061452513967, + "grad_norm": 0.9281938076019287, + "learning_rate": 0.0007627731092436974, + "loss": 0.4769, + "step": 8576 + }, + { + "epoch": 4.791620111731843, + "grad_norm": 0.4785159230232239, + "learning_rate": 0.0007627450980392156, + "loss": 0.495, + "step": 8577 + }, + { + "epoch": 4.792178770949721, + "grad_norm": 0.46127575635910034, + "learning_rate": 0.000762717086834734, + "loss": 0.4093, + "step": 8578 + }, + { + "epoch": 4.792737430167598, + "grad_norm": 2.6541554927825928, + "learning_rate": 0.0007626890756302522, + "loss": 0.4028, + "step": 8579 + }, + { + "epoch": 4.793296089385475, + "grad_norm": 0.6880247592926025, + "learning_rate": 0.0007626610644257704, + "loss": 0.4358, + "step": 8580 + }, + { + "epoch": 4.793854748603352, + "grad_norm": 0.70887690782547, + "learning_rate": 0.0007626330532212885, + "loss": 0.4377, + "step": 8581 + }, + { + "epoch": 4.794413407821229, + "grad_norm": 5.6827921867370605, + "learning_rate": 0.0007626050420168067, + "loss": 0.4145, + "step": 8582 + }, + { + "epoch": 4.794972067039106, + "grad_norm": 0.9522745609283447, + "learning_rate": 0.000762577030812325, + "loss": 0.6287, + "step": 8583 + }, + { + "epoch": 4.795530726256983, + "grad_norm": 0.36743617057800293, + "learning_rate": 0.0007625490196078432, + "loss": 0.4324, + "step": 8584 + }, + { + "epoch": 4.79608938547486, + "grad_norm": 4.876690864562988, + "learning_rate": 0.0007625210084033614, + "loss": 0.4611, + "step": 8585 + }, + { + "epoch": 4.796648044692738, + "grad_norm": 0.720872163772583, + "learning_rate": 0.0007624929971988795, + "loss": 0.5852, + "step": 8586 + }, + { + "epoch": 4.797206703910614, + "grad_norm": 0.9262465238571167, + "learning_rate": 0.0007624649859943977, + "loss": 0.6957, + "step": 8587 + }, + { + "epoch": 4.797765363128492, + "grad_norm": 2.984988212585449, + "learning_rate": 0.000762436974789916, + "loss": 0.5382, + "step": 8588 + }, + { + "epoch": 4.798324022346368, + "grad_norm": 0.7714965343475342, + "learning_rate": 0.0007624089635854342, + "loss": 0.4909, + "step": 8589 + }, + { + "epoch": 4.798882681564246, + "grad_norm": 0.5527476668357849, + "learning_rate": 0.0007623809523809524, + "loss": 0.4358, + "step": 8590 + }, + { + "epoch": 4.799441340782123, + "grad_norm": 0.4277464747428894, + "learning_rate": 0.0007623529411764706, + "loss": 0.4174, + "step": 8591 + }, + { + "epoch": 4.8, + "grad_norm": 0.5458946824073792, + "learning_rate": 0.0007623249299719887, + "loss": 0.5026, + "step": 8592 + }, + { + "epoch": 4.800558659217877, + "grad_norm": 0.45590174198150635, + "learning_rate": 0.000762296918767507, + "loss": 0.3421, + "step": 8593 + }, + { + "epoch": 4.801117318435754, + "grad_norm": 0.8198550343513489, + "learning_rate": 0.0007622689075630252, + "loss": 0.3975, + "step": 8594 + }, + { + "epoch": 4.801675977653631, + "grad_norm": 0.7512623071670532, + "learning_rate": 0.0007622408963585435, + "loss": 0.4302, + "step": 8595 + }, + { + "epoch": 4.802234636871509, + "grad_norm": 1.2393786907196045, + "learning_rate": 0.0007622128851540617, + "loss": 0.4833, + "step": 8596 + }, + { + "epoch": 4.802793296089385, + "grad_norm": 0.7234466075897217, + "learning_rate": 0.0007621848739495798, + "loss": 0.4188, + "step": 8597 + }, + { + "epoch": 4.803351955307263, + "grad_norm": 0.42658236622810364, + "learning_rate": 0.0007621568627450981, + "loss": 0.3814, + "step": 8598 + }, + { + "epoch": 4.803910614525139, + "grad_norm": 0.6034008860588074, + "learning_rate": 0.0007621288515406163, + "loss": 0.5023, + "step": 8599 + }, + { + "epoch": 4.804469273743017, + "grad_norm": 0.7559418678283691, + "learning_rate": 0.0007621008403361345, + "loss": 0.4843, + "step": 8600 + }, + { + "epoch": 4.805027932960893, + "grad_norm": 6.497520446777344, + "learning_rate": 0.0007620728291316527, + "loss": 0.5978, + "step": 8601 + }, + { + "epoch": 4.805586592178771, + "grad_norm": 0.5292633771896362, + "learning_rate": 0.0007620448179271708, + "loss": 0.5179, + "step": 8602 + }, + { + "epoch": 4.806145251396648, + "grad_norm": 10.337291717529297, + "learning_rate": 0.0007620168067226891, + "loss": 0.4854, + "step": 8603 + }, + { + "epoch": 4.806703910614525, + "grad_norm": 0.5464903116226196, + "learning_rate": 0.0007619887955182073, + "loss": 0.4532, + "step": 8604 + }, + { + "epoch": 4.807262569832402, + "grad_norm": 0.4960624575614929, + "learning_rate": 0.0007619607843137255, + "loss": 0.4477, + "step": 8605 + }, + { + "epoch": 4.80782122905028, + "grad_norm": 0.9065825343132019, + "learning_rate": 0.0007619327731092437, + "loss": 0.5187, + "step": 8606 + }, + { + "epoch": 4.808379888268156, + "grad_norm": 0.8069644570350647, + "learning_rate": 0.0007619047619047619, + "loss": 0.4554, + "step": 8607 + }, + { + "epoch": 4.808938547486034, + "grad_norm": 0.5447313785552979, + "learning_rate": 0.0007618767507002801, + "loss": 0.4121, + "step": 8608 + }, + { + "epoch": 4.80949720670391, + "grad_norm": 0.4326414465904236, + "learning_rate": 0.0007618487394957983, + "loss": 0.4484, + "step": 8609 + }, + { + "epoch": 4.810055865921788, + "grad_norm": 0.8181438446044922, + "learning_rate": 0.0007618207282913165, + "loss": 0.4593, + "step": 8610 + }, + { + "epoch": 4.810614525139664, + "grad_norm": 0.5639909505844116, + "learning_rate": 0.0007617927170868347, + "loss": 0.5026, + "step": 8611 + }, + { + "epoch": 4.811173184357542, + "grad_norm": 0.6129013895988464, + "learning_rate": 0.000761764705882353, + "loss": 0.3885, + "step": 8612 + }, + { + "epoch": 4.811731843575419, + "grad_norm": 0.5478720664978027, + "learning_rate": 0.0007617366946778712, + "loss": 0.4847, + "step": 8613 + }, + { + "epoch": 4.812290502793296, + "grad_norm": 0.537415087223053, + "learning_rate": 0.0007617086834733894, + "loss": 0.5038, + "step": 8614 + }, + { + "epoch": 4.812849162011173, + "grad_norm": 0.5390510559082031, + "learning_rate": 0.0007616806722689076, + "loss": 0.4432, + "step": 8615 + }, + { + "epoch": 4.813407821229051, + "grad_norm": 0.5609541535377502, + "learning_rate": 0.0007616526610644258, + "loss": 0.5991, + "step": 8616 + }, + { + "epoch": 4.813966480446927, + "grad_norm": 0.5197296738624573, + "learning_rate": 0.000761624649859944, + "loss": 0.5161, + "step": 8617 + }, + { + "epoch": 4.814525139664805, + "grad_norm": 0.5966982841491699, + "learning_rate": 0.0007615966386554623, + "loss": 0.4598, + "step": 8618 + }, + { + "epoch": 4.815083798882681, + "grad_norm": 0.42973336577415466, + "learning_rate": 0.0007615686274509804, + "loss": 0.4254, + "step": 8619 + }, + { + "epoch": 4.815642458100559, + "grad_norm": 0.4096120595932007, + "learning_rate": 0.0007615406162464986, + "loss": 0.4645, + "step": 8620 + }, + { + "epoch": 4.816201117318435, + "grad_norm": 0.8303319811820984, + "learning_rate": 0.0007615126050420168, + "loss": 0.4628, + "step": 8621 + }, + { + "epoch": 4.816759776536313, + "grad_norm": 0.762954831123352, + "learning_rate": 0.000761484593837535, + "loss": 0.4831, + "step": 8622 + }, + { + "epoch": 4.81731843575419, + "grad_norm": 2.0583696365356445, + "learning_rate": 0.0007614565826330533, + "loss": 0.5275, + "step": 8623 + }, + { + "epoch": 4.817877094972067, + "grad_norm": 0.7234183549880981, + "learning_rate": 0.0007614285714285714, + "loss": 0.4301, + "step": 8624 + }, + { + "epoch": 4.818435754189944, + "grad_norm": 0.5252977609634399, + "learning_rate": 0.0007614005602240896, + "loss": 0.4538, + "step": 8625 + }, + { + "epoch": 4.818994413407821, + "grad_norm": 0.39629223942756653, + "learning_rate": 0.0007613725490196078, + "loss": 0.3955, + "step": 8626 + }, + { + "epoch": 4.819553072625698, + "grad_norm": 0.42328080534935, + "learning_rate": 0.000761344537815126, + "loss": 0.4527, + "step": 8627 + }, + { + "epoch": 4.820111731843576, + "grad_norm": 0.9976792335510254, + "learning_rate": 0.0007613165266106444, + "loss": 0.397, + "step": 8628 + }, + { + "epoch": 4.820670391061452, + "grad_norm": 0.8522345423698425, + "learning_rate": 0.0007612885154061625, + "loss": 0.6468, + "step": 8629 + }, + { + "epoch": 4.82122905027933, + "grad_norm": 0.5112760066986084, + "learning_rate": 0.0007612605042016807, + "loss": 0.4858, + "step": 8630 + }, + { + "epoch": 4.821787709497206, + "grad_norm": 0.5359641313552856, + "learning_rate": 0.0007612324929971989, + "loss": 0.4196, + "step": 8631 + }, + { + "epoch": 4.822346368715084, + "grad_norm": 0.4807271361351013, + "learning_rate": 0.0007612044817927171, + "loss": 0.4695, + "step": 8632 + }, + { + "epoch": 4.822905027932961, + "grad_norm": 0.4003235101699829, + "learning_rate": 0.0007611764705882354, + "loss": 0.41, + "step": 8633 + }, + { + "epoch": 4.823463687150838, + "grad_norm": 0.7319088578224182, + "learning_rate": 0.0007611484593837536, + "loss": 0.4372, + "step": 8634 + }, + { + "epoch": 4.824022346368715, + "grad_norm": 0.49317845702171326, + "learning_rate": 0.0007611204481792717, + "loss": 0.4685, + "step": 8635 + }, + { + "epoch": 4.824581005586592, + "grad_norm": 0.5065277814865112, + "learning_rate": 0.0007610924369747899, + "loss": 0.3763, + "step": 8636 + }, + { + "epoch": 4.825139664804469, + "grad_norm": 0.4806182384490967, + "learning_rate": 0.0007610644257703081, + "loss": 0.373, + "step": 8637 + }, + { + "epoch": 4.825698324022346, + "grad_norm": 0.5946223139762878, + "learning_rate": 0.0007610364145658264, + "loss": 0.5096, + "step": 8638 + }, + { + "epoch": 4.826256983240223, + "grad_norm": 0.47863414883613586, + "learning_rate": 0.0007610084033613446, + "loss": 0.5492, + "step": 8639 + }, + { + "epoch": 4.826815642458101, + "grad_norm": 1.5016567707061768, + "learning_rate": 0.0007609803921568627, + "loss": 0.4383, + "step": 8640 + }, + { + "epoch": 4.827374301675977, + "grad_norm": 0.4068509340286255, + "learning_rate": 0.0007609523809523809, + "loss": 0.4084, + "step": 8641 + }, + { + "epoch": 4.827932960893855, + "grad_norm": 0.796147346496582, + "learning_rate": 0.0007609243697478991, + "loss": 0.4047, + "step": 8642 + }, + { + "epoch": 4.828491620111732, + "grad_norm": 0.5410636067390442, + "learning_rate": 0.0007608963585434175, + "loss": 0.5005, + "step": 8643 + }, + { + "epoch": 4.829050279329609, + "grad_norm": 0.5957869291305542, + "learning_rate": 0.0007608683473389357, + "loss": 0.7698, + "step": 8644 + }, + { + "epoch": 4.829608938547486, + "grad_norm": 0.461546391248703, + "learning_rate": 0.0007608403361344538, + "loss": 0.4889, + "step": 8645 + }, + { + "epoch": 4.830167597765363, + "grad_norm": 0.7469927072525024, + "learning_rate": 0.000760812324929972, + "loss": 0.4555, + "step": 8646 + }, + { + "epoch": 4.83072625698324, + "grad_norm": 0.478488951921463, + "learning_rate": 0.0007607843137254902, + "loss": 0.4145, + "step": 8647 + }, + { + "epoch": 4.831284916201117, + "grad_norm": 1.2435779571533203, + "learning_rate": 0.0007607563025210085, + "loss": 0.5261, + "step": 8648 + }, + { + "epoch": 4.831843575418994, + "grad_norm": 0.6280058026313782, + "learning_rate": 0.0007607282913165267, + "loss": 0.6567, + "step": 8649 + }, + { + "epoch": 4.832402234636872, + "grad_norm": 0.7095413208007812, + "learning_rate": 0.0007607002801120449, + "loss": 0.5206, + "step": 8650 + }, + { + "epoch": 4.832960893854748, + "grad_norm": 0.560554027557373, + "learning_rate": 0.000760672268907563, + "loss": 0.6407, + "step": 8651 + }, + { + "epoch": 4.833519553072626, + "grad_norm": 0.5252218842506409, + "learning_rate": 0.0007606442577030812, + "loss": 0.4313, + "step": 8652 + }, + { + "epoch": 4.834078212290503, + "grad_norm": 0.45497995615005493, + "learning_rate": 0.0007606162464985995, + "loss": 0.3813, + "step": 8653 + }, + { + "epoch": 4.83463687150838, + "grad_norm": 0.5191269516944885, + "learning_rate": 0.0007605882352941177, + "loss": 0.4141, + "step": 8654 + }, + { + "epoch": 4.835195530726257, + "grad_norm": 0.5692581534385681, + "learning_rate": 0.0007605602240896359, + "loss": 0.4955, + "step": 8655 + }, + { + "epoch": 4.835754189944134, + "grad_norm": 5.363700866699219, + "learning_rate": 0.000760532212885154, + "loss": 0.414, + "step": 8656 + }, + { + "epoch": 4.836312849162011, + "grad_norm": 0.5020806789398193, + "learning_rate": 0.0007605042016806722, + "loss": 0.451, + "step": 8657 + }, + { + "epoch": 4.836871508379888, + "grad_norm": 0.49971261620521545, + "learning_rate": 0.0007604761904761905, + "loss": 0.5365, + "step": 8658 + }, + { + "epoch": 4.837430167597765, + "grad_norm": 0.5629549622535706, + "learning_rate": 0.0007604481792717087, + "loss": 0.4659, + "step": 8659 + }, + { + "epoch": 4.837988826815643, + "grad_norm": 0.4420061409473419, + "learning_rate": 0.000760420168067227, + "loss": 0.4987, + "step": 8660 + }, + { + "epoch": 4.838547486033519, + "grad_norm": 1.127530574798584, + "learning_rate": 0.000760392156862745, + "loss": 0.3674, + "step": 8661 + }, + { + "epoch": 4.839106145251397, + "grad_norm": 0.49039822816848755, + "learning_rate": 0.0007603641456582633, + "loss": 0.3769, + "step": 8662 + }, + { + "epoch": 4.839664804469273, + "grad_norm": 0.6439700722694397, + "learning_rate": 0.0007603361344537816, + "loss": 0.4574, + "step": 8663 + }, + { + "epoch": 4.840223463687151, + "grad_norm": 0.5725346803665161, + "learning_rate": 0.0007603081232492998, + "loss": 0.4913, + "step": 8664 + }, + { + "epoch": 4.840782122905028, + "grad_norm": 0.7629607319831848, + "learning_rate": 0.000760280112044818, + "loss": 0.6072, + "step": 8665 + }, + { + "epoch": 4.841340782122905, + "grad_norm": 0.4991709589958191, + "learning_rate": 0.0007602521008403362, + "loss": 0.5249, + "step": 8666 + }, + { + "epoch": 4.841899441340782, + "grad_norm": 0.5673263669013977, + "learning_rate": 0.0007602240896358543, + "loss": 0.4764, + "step": 8667 + }, + { + "epoch": 4.842458100558659, + "grad_norm": 0.4850786328315735, + "learning_rate": 0.0007601960784313726, + "loss": 0.398, + "step": 8668 + }, + { + "epoch": 4.843016759776536, + "grad_norm": 0.5174915194511414, + "learning_rate": 0.0007601680672268908, + "loss": 0.529, + "step": 8669 + }, + { + "epoch": 4.843575418994414, + "grad_norm": 0.6681555509567261, + "learning_rate": 0.000760140056022409, + "loss": 0.5516, + "step": 8670 + }, + { + "epoch": 4.84413407821229, + "grad_norm": 1.2518675327301025, + "learning_rate": 0.0007601120448179272, + "loss": 0.5558, + "step": 8671 + }, + { + "epoch": 4.844692737430168, + "grad_norm": 0.43766605854034424, + "learning_rate": 0.0007600840336134453, + "loss": 0.3608, + "step": 8672 + }, + { + "epoch": 4.845251396648044, + "grad_norm": 0.4313262701034546, + "learning_rate": 0.0007600560224089636, + "loss": 0.3828, + "step": 8673 + }, + { + "epoch": 4.845810055865922, + "grad_norm": 5.528144359588623, + "learning_rate": 0.0007600280112044818, + "loss": 0.498, + "step": 8674 + }, + { + "epoch": 4.846368715083798, + "grad_norm": 0.6264899969100952, + "learning_rate": 0.00076, + "loss": 0.6633, + "step": 8675 + }, + { + "epoch": 4.846927374301676, + "grad_norm": 0.5527233481407166, + "learning_rate": 0.0007599719887955182, + "loss": 0.446, + "step": 8676 + }, + { + "epoch": 4.847486033519553, + "grad_norm": 0.8556757569313049, + "learning_rate": 0.0007599439775910363, + "loss": 0.3835, + "step": 8677 + }, + { + "epoch": 4.84804469273743, + "grad_norm": 0.5169434547424316, + "learning_rate": 0.0007599159663865547, + "loss": 0.4091, + "step": 8678 + }, + { + "epoch": 4.848603351955307, + "grad_norm": 0.7467514872550964, + "learning_rate": 0.0007598879551820729, + "loss": 0.427, + "step": 8679 + }, + { + "epoch": 4.849162011173185, + "grad_norm": 0.7596680521965027, + "learning_rate": 0.0007598599439775911, + "loss": 0.405, + "step": 8680 + }, + { + "epoch": 4.849720670391061, + "grad_norm": 0.6410501003265381, + "learning_rate": 0.0007598319327731093, + "loss": 0.3795, + "step": 8681 + }, + { + "epoch": 4.850279329608939, + "grad_norm": 0.3620051145553589, + "learning_rate": 0.0007598039215686275, + "loss": 0.4281, + "step": 8682 + }, + { + "epoch": 4.850837988826815, + "grad_norm": 0.4518038034439087, + "learning_rate": 0.0007597759103641457, + "loss": 0.4675, + "step": 8683 + }, + { + "epoch": 4.851396648044693, + "grad_norm": 0.47344234585762024, + "learning_rate": 0.0007597478991596639, + "loss": 0.4614, + "step": 8684 + }, + { + "epoch": 4.851955307262569, + "grad_norm": 1.901712417602539, + "learning_rate": 0.0007597198879551821, + "loss": 0.4486, + "step": 8685 + }, + { + "epoch": 4.852513966480447, + "grad_norm": 0.6681280136108398, + "learning_rate": 0.0007596918767507003, + "loss": 0.5473, + "step": 8686 + }, + { + "epoch": 4.853072625698324, + "grad_norm": 0.8004860877990723, + "learning_rate": 0.0007596638655462185, + "loss": 0.6469, + "step": 8687 + }, + { + "epoch": 4.853631284916201, + "grad_norm": 0.694496214389801, + "learning_rate": 0.0007596358543417367, + "loss": 0.4262, + "step": 8688 + }, + { + "epoch": 4.854189944134078, + "grad_norm": 1.9153003692626953, + "learning_rate": 0.0007596078431372549, + "loss": 0.3836, + "step": 8689 + }, + { + "epoch": 4.854748603351956, + "grad_norm": 2.520158052444458, + "learning_rate": 0.0007595798319327731, + "loss": 0.4639, + "step": 8690 + }, + { + "epoch": 4.855307262569832, + "grad_norm": 1.5247082710266113, + "learning_rate": 0.0007595518207282913, + "loss": 0.5336, + "step": 8691 + }, + { + "epoch": 4.85586592178771, + "grad_norm": 0.7307385802268982, + "learning_rate": 0.0007595238095238095, + "loss": 0.4568, + "step": 8692 + }, + { + "epoch": 4.856424581005586, + "grad_norm": 0.60298091173172, + "learning_rate": 0.0007594957983193277, + "loss": 0.53, + "step": 8693 + }, + { + "epoch": 4.856983240223464, + "grad_norm": 1.5359807014465332, + "learning_rate": 0.000759467787114846, + "loss": 0.4008, + "step": 8694 + }, + { + "epoch": 4.85754189944134, + "grad_norm": 0.45238712430000305, + "learning_rate": 0.0007594397759103642, + "loss": 0.5278, + "step": 8695 + }, + { + "epoch": 4.858100558659218, + "grad_norm": 0.5202834010124207, + "learning_rate": 0.0007594117647058824, + "loss": 0.4521, + "step": 8696 + }, + { + "epoch": 4.858659217877095, + "grad_norm": 0.49711424112319946, + "learning_rate": 0.0007593837535014006, + "loss": 0.4195, + "step": 8697 + }, + { + "epoch": 4.859217877094972, + "grad_norm": 0.554820716381073, + "learning_rate": 0.0007593557422969188, + "loss": 0.4874, + "step": 8698 + }, + { + "epoch": 4.859776536312849, + "grad_norm": 0.48045921325683594, + "learning_rate": 0.000759327731092437, + "loss": 0.4587, + "step": 8699 + }, + { + "epoch": 4.860335195530726, + "grad_norm": 0.4410632252693176, + "learning_rate": 0.0007592997198879552, + "loss": 0.4341, + "step": 8700 + }, + { + "epoch": 4.860893854748603, + "grad_norm": 0.47672775387763977, + "learning_rate": 0.0007592717086834734, + "loss": 0.4631, + "step": 8701 + }, + { + "epoch": 4.861452513966481, + "grad_norm": 1.4025802612304688, + "learning_rate": 0.0007592436974789916, + "loss": 0.4088, + "step": 8702 + }, + { + "epoch": 4.862011173184357, + "grad_norm": 1.1027954816818237, + "learning_rate": 0.0007592156862745098, + "loss": 0.4637, + "step": 8703 + }, + { + "epoch": 4.862569832402235, + "grad_norm": 0.83026123046875, + "learning_rate": 0.000759187675070028, + "loss": 0.4811, + "step": 8704 + }, + { + "epoch": 4.863128491620111, + "grad_norm": 0.5367934703826904, + "learning_rate": 0.0007591596638655462, + "loss": 0.4482, + "step": 8705 + }, + { + "epoch": 4.863687150837989, + "grad_norm": 1.2069411277770996, + "learning_rate": 0.0007591316526610644, + "loss": 0.458, + "step": 8706 + }, + { + "epoch": 4.864245810055866, + "grad_norm": 0.5686973929405212, + "learning_rate": 0.0007591036414565826, + "loss": 0.4673, + "step": 8707 + }, + { + "epoch": 4.864804469273743, + "grad_norm": 0.7726436257362366, + "learning_rate": 0.0007590756302521008, + "loss": 0.558, + "step": 8708 + }, + { + "epoch": 4.86536312849162, + "grad_norm": 0.676937997341156, + "learning_rate": 0.000759047619047619, + "loss": 0.6981, + "step": 8709 + }, + { + "epoch": 4.865921787709497, + "grad_norm": 0.6577152013778687, + "learning_rate": 0.0007590196078431373, + "loss": 0.4472, + "step": 8710 + }, + { + "epoch": 4.866480446927374, + "grad_norm": 0.47892898321151733, + "learning_rate": 0.0007589915966386555, + "loss": 0.4808, + "step": 8711 + }, + { + "epoch": 4.867039106145251, + "grad_norm": 0.5467607378959656, + "learning_rate": 0.0007589635854341737, + "loss": 0.4267, + "step": 8712 + }, + { + "epoch": 4.867597765363128, + "grad_norm": 0.4089581072330475, + "learning_rate": 0.0007589355742296919, + "loss": 0.4266, + "step": 8713 + }, + { + "epoch": 4.868156424581006, + "grad_norm": 0.5872803926467896, + "learning_rate": 0.0007589075630252102, + "loss": 0.5169, + "step": 8714 + }, + { + "epoch": 4.868715083798882, + "grad_norm": 0.422395795583725, + "learning_rate": 0.0007588795518207283, + "loss": 0.4513, + "step": 8715 + }, + { + "epoch": 4.86927374301676, + "grad_norm": 0.5978456735610962, + "learning_rate": 0.0007588515406162465, + "loss": 0.535, + "step": 8716 + }, + { + "epoch": 4.869832402234637, + "grad_norm": 0.42006736993789673, + "learning_rate": 0.0007588235294117647, + "loss": 0.4155, + "step": 8717 + }, + { + "epoch": 4.870391061452514, + "grad_norm": 0.5250824093818665, + "learning_rate": 0.0007587955182072829, + "loss": 0.4221, + "step": 8718 + }, + { + "epoch": 4.870949720670391, + "grad_norm": 0.5473908185958862, + "learning_rate": 0.0007587675070028012, + "loss": 0.4665, + "step": 8719 + }, + { + "epoch": 4.871508379888268, + "grad_norm": 1.4276212453842163, + "learning_rate": 0.0007587394957983193, + "loss": 0.451, + "step": 8720 + }, + { + "epoch": 4.872067039106145, + "grad_norm": 0.8969179391860962, + "learning_rate": 0.0007587114845938375, + "loss": 0.4372, + "step": 8721 + }, + { + "epoch": 4.872625698324022, + "grad_norm": 0.49443671107292175, + "learning_rate": 0.0007586834733893557, + "loss": 0.6035, + "step": 8722 + }, + { + "epoch": 4.873184357541899, + "grad_norm": 0.49140894412994385, + "learning_rate": 0.0007586554621848739, + "loss": 0.5041, + "step": 8723 + }, + { + "epoch": 4.873743016759777, + "grad_norm": 0.6224446892738342, + "learning_rate": 0.0007586274509803922, + "loss": 0.4787, + "step": 8724 + }, + { + "epoch": 4.874301675977653, + "grad_norm": 4.660036563873291, + "learning_rate": 0.0007585994397759103, + "loss": 0.4448, + "step": 8725 + }, + { + "epoch": 4.874860335195531, + "grad_norm": 0.6111028790473938, + "learning_rate": 0.0007585714285714285, + "loss": 0.4609, + "step": 8726 + }, + { + "epoch": 4.875418994413408, + "grad_norm": 0.5526478886604309, + "learning_rate": 0.0007585434173669468, + "loss": 0.4369, + "step": 8727 + }, + { + "epoch": 4.875977653631285, + "grad_norm": 0.49239787459373474, + "learning_rate": 0.000758515406162465, + "loss": 0.4605, + "step": 8728 + }, + { + "epoch": 4.876536312849162, + "grad_norm": 1.1657596826553345, + "learning_rate": 0.0007584873949579833, + "loss": 0.4463, + "step": 8729 + }, + { + "epoch": 4.877094972067039, + "grad_norm": 1.0578683614730835, + "learning_rate": 0.0007584593837535015, + "loss": 0.4995, + "step": 8730 + }, + { + "epoch": 4.877653631284916, + "grad_norm": 5.073737621307373, + "learning_rate": 0.0007584313725490196, + "loss": 0.4353, + "step": 8731 + }, + { + "epoch": 4.878212290502793, + "grad_norm": 0.5168876647949219, + "learning_rate": 0.0007584033613445378, + "loss": 0.3778, + "step": 8732 + }, + { + "epoch": 4.87877094972067, + "grad_norm": 0.6424051523208618, + "learning_rate": 0.000758375350140056, + "loss": 0.4002, + "step": 8733 + }, + { + "epoch": 4.879329608938548, + "grad_norm": 0.5979295372962952, + "learning_rate": 0.0007583473389355743, + "loss": 0.4553, + "step": 8734 + }, + { + "epoch": 4.879888268156424, + "grad_norm": 0.4738885462284088, + "learning_rate": 0.0007583193277310925, + "loss": 0.4754, + "step": 8735 + }, + { + "epoch": 4.880446927374302, + "grad_norm": 0.7919620275497437, + "learning_rate": 0.0007582913165266106, + "loss": 0.5094, + "step": 8736 + }, + { + "epoch": 4.881005586592178, + "grad_norm": 0.4680382013320923, + "learning_rate": 0.0007582633053221288, + "loss": 0.4582, + "step": 8737 + }, + { + "epoch": 4.881564245810056, + "grad_norm": 0.47519004344940186, + "learning_rate": 0.000758235294117647, + "loss": 0.4853, + "step": 8738 + }, + { + "epoch": 4.882122905027933, + "grad_norm": 0.4977691173553467, + "learning_rate": 0.0007582072829131653, + "loss": 0.5458, + "step": 8739 + }, + { + "epoch": 4.88268156424581, + "grad_norm": 0.5639273524284363, + "learning_rate": 0.0007581792717086835, + "loss": 0.4325, + "step": 8740 + }, + { + "epoch": 4.883240223463687, + "grad_norm": 1.9027615785598755, + "learning_rate": 0.0007581512605042016, + "loss": 0.4996, + "step": 8741 + }, + { + "epoch": 4.883798882681564, + "grad_norm": 1.072900652885437, + "learning_rate": 0.0007581232492997198, + "loss": 0.3651, + "step": 8742 + }, + { + "epoch": 4.884357541899441, + "grad_norm": 0.9303123354911804, + "learning_rate": 0.000758095238095238, + "loss": 0.4311, + "step": 8743 + }, + { + "epoch": 4.884916201117319, + "grad_norm": 0.4010227620601654, + "learning_rate": 0.0007580672268907564, + "loss": 0.3677, + "step": 8744 + }, + { + "epoch": 4.885474860335195, + "grad_norm": 0.7847115397453308, + "learning_rate": 0.0007580392156862746, + "loss": 0.4087, + "step": 8745 + }, + { + "epoch": 4.886033519553073, + "grad_norm": Infinity, + "learning_rate": 0.0007580392156862746, + "loss": 0.549, + "step": 8746 + }, + { + "epoch": 4.886592178770949, + "grad_norm": 0.39211156964302063, + "learning_rate": 0.0007580112044817928, + "loss": 0.4083, + "step": 8747 + }, + { + "epoch": 4.887150837988827, + "grad_norm": 0.5703455209732056, + "learning_rate": 0.0007579831932773109, + "loss": 0.5028, + "step": 8748 + }, + { + "epoch": 4.8877094972067034, + "grad_norm": 0.478864848613739, + "learning_rate": 0.0007579551820728291, + "loss": 0.4676, + "step": 8749 + }, + { + "epoch": 4.888268156424581, + "grad_norm": 0.48383629322052, + "learning_rate": 0.0007579271708683474, + "loss": 0.4342, + "step": 8750 + }, + { + "epoch": 4.888826815642458, + "grad_norm": 0.7051289677619934, + "learning_rate": 0.0007578991596638656, + "loss": 0.3469, + "step": 8751 + }, + { + "epoch": 4.889385474860335, + "grad_norm": 0.9486963748931885, + "learning_rate": 0.0007578711484593838, + "loss": 0.4363, + "step": 8752 + }, + { + "epoch": 4.889944134078212, + "grad_norm": 0.5282118916511536, + "learning_rate": 0.0007578431372549019, + "loss": 0.4348, + "step": 8753 + }, + { + "epoch": 4.89050279329609, + "grad_norm": 1.3129225969314575, + "learning_rate": 0.0007578151260504201, + "loss": 0.4452, + "step": 8754 + }, + { + "epoch": 4.891061452513966, + "grad_norm": 0.48157554864883423, + "learning_rate": 0.0007577871148459384, + "loss": 0.4788, + "step": 8755 + }, + { + "epoch": 4.891620111731844, + "grad_norm": 0.5641001462936401, + "learning_rate": 0.0007577591036414566, + "loss": 0.3712, + "step": 8756 + }, + { + "epoch": 4.89217877094972, + "grad_norm": 0.407925009727478, + "learning_rate": 0.0007577310924369748, + "loss": 0.4158, + "step": 8757 + }, + { + "epoch": 4.892737430167598, + "grad_norm": 0.474031537771225, + "learning_rate": 0.0007577030812324929, + "loss": 0.5754, + "step": 8758 + }, + { + "epoch": 4.8932960893854744, + "grad_norm": 0.5503466725349426, + "learning_rate": 0.0007576750700280111, + "loss": 0.4634, + "step": 8759 + }, + { + "epoch": 4.893854748603352, + "grad_norm": 0.471722275018692, + "learning_rate": 0.0007576470588235295, + "loss": 0.3822, + "step": 8760 + }, + { + "epoch": 4.894413407821229, + "grad_norm": 0.7298048734664917, + "learning_rate": 0.0007576190476190477, + "loss": 0.4524, + "step": 8761 + }, + { + "epoch": 4.894972067039106, + "grad_norm": 0.6199996471405029, + "learning_rate": 0.0007575910364145659, + "loss": 0.4481, + "step": 8762 + }, + { + "epoch": 4.895530726256983, + "grad_norm": 0.6268185377120972, + "learning_rate": 0.0007575630252100841, + "loss": 0.5344, + "step": 8763 + }, + { + "epoch": 4.896089385474861, + "grad_norm": 11.937359809875488, + "learning_rate": 0.0007575350140056022, + "loss": 0.3685, + "step": 8764 + }, + { + "epoch": 4.896648044692737, + "grad_norm": 0.7418108582496643, + "learning_rate": 0.0007575070028011205, + "loss": 0.4452, + "step": 8765 + }, + { + "epoch": 4.897206703910615, + "grad_norm": 0.41409215331077576, + "learning_rate": 0.0007574789915966387, + "loss": 0.4274, + "step": 8766 + }, + { + "epoch": 4.897765363128491, + "grad_norm": 2.3767995834350586, + "learning_rate": 0.0007574509803921569, + "loss": 0.4353, + "step": 8767 + }, + { + "epoch": 4.898324022346369, + "grad_norm": 1.2191287279129028, + "learning_rate": 0.0007574229691876751, + "loss": 0.5076, + "step": 8768 + }, + { + "epoch": 4.8988826815642454, + "grad_norm": 0.5433291792869568, + "learning_rate": 0.0007573949579831932, + "loss": 0.4504, + "step": 8769 + }, + { + "epoch": 4.899441340782123, + "grad_norm": 0.4664328098297119, + "learning_rate": 0.0007573669467787115, + "loss": 0.4013, + "step": 8770 + }, + { + "epoch": 4.9, + "grad_norm": 0.4200805425643921, + "learning_rate": 0.0007573389355742297, + "loss": 0.4857, + "step": 8771 + }, + { + "epoch": 4.900558659217877, + "grad_norm": 0.5408905744552612, + "learning_rate": 0.0007573109243697479, + "loss": 0.5885, + "step": 8772 + }, + { + "epoch": 4.901117318435754, + "grad_norm": 0.6539720892906189, + "learning_rate": 0.0007572829131652661, + "loss": 0.5112, + "step": 8773 + }, + { + "epoch": 4.901675977653631, + "grad_norm": 0.6490855813026428, + "learning_rate": 0.0007572549019607842, + "loss": 0.4567, + "step": 8774 + }, + { + "epoch": 4.902234636871508, + "grad_norm": 0.3859747350215912, + "learning_rate": 0.0007572268907563025, + "loss": 0.3942, + "step": 8775 + }, + { + "epoch": 4.902793296089386, + "grad_norm": 0.544288158416748, + "learning_rate": 0.0007571988795518207, + "loss": 0.475, + "step": 8776 + }, + { + "epoch": 4.903351955307262, + "grad_norm": 0.4559132754802704, + "learning_rate": 0.000757170868347339, + "loss": 0.4431, + "step": 8777 + }, + { + "epoch": 4.90391061452514, + "grad_norm": 0.5335123538970947, + "learning_rate": 0.0007571428571428572, + "loss": 0.528, + "step": 8778 + }, + { + "epoch": 4.9044692737430164, + "grad_norm": 4.882332801818848, + "learning_rate": 0.0007571148459383754, + "loss": 0.4795, + "step": 8779 + }, + { + "epoch": 4.905027932960894, + "grad_norm": 0.4744202494621277, + "learning_rate": 0.0007570868347338936, + "loss": 0.3669, + "step": 8780 + }, + { + "epoch": 4.905586592178771, + "grad_norm": 0.39103662967681885, + "learning_rate": 0.0007570588235294118, + "loss": 0.4065, + "step": 8781 + }, + { + "epoch": 4.906145251396648, + "grad_norm": 0.8635833859443665, + "learning_rate": 0.00075703081232493, + "loss": 0.4335, + "step": 8782 + }, + { + "epoch": 4.906703910614525, + "grad_norm": 0.5420430898666382, + "learning_rate": 0.0007570028011204482, + "loss": 0.4903, + "step": 8783 + }, + { + "epoch": 4.907262569832402, + "grad_norm": 0.4244260787963867, + "learning_rate": 0.0007569747899159664, + "loss": 0.4615, + "step": 8784 + }, + { + "epoch": 4.907821229050279, + "grad_norm": 1.7777653932571411, + "learning_rate": 0.0007569467787114846, + "loss": 0.4363, + "step": 8785 + }, + { + "epoch": 4.908379888268156, + "grad_norm": 1.8669118881225586, + "learning_rate": 0.0007569187675070028, + "loss": 0.4838, + "step": 8786 + }, + { + "epoch": 4.908938547486033, + "grad_norm": 0.5046985745429993, + "learning_rate": 0.000756890756302521, + "loss": 0.6408, + "step": 8787 + }, + { + "epoch": 4.909497206703911, + "grad_norm": 0.7819457054138184, + "learning_rate": 0.0007568627450980392, + "loss": 0.4383, + "step": 8788 + }, + { + "epoch": 4.910055865921787, + "grad_norm": 0.4063820242881775, + "learning_rate": 0.0007568347338935574, + "loss": 0.4339, + "step": 8789 + }, + { + "epoch": 4.910614525139665, + "grad_norm": 0.4208689033985138, + "learning_rate": 0.0007568067226890757, + "loss": 0.4471, + "step": 8790 + }, + { + "epoch": 4.911173184357542, + "grad_norm": 0.3869853615760803, + "learning_rate": 0.0007567787114845938, + "loss": 0.4663, + "step": 8791 + }, + { + "epoch": 4.911731843575419, + "grad_norm": 0.4823906123638153, + "learning_rate": 0.000756750700280112, + "loss": 0.4342, + "step": 8792 + }, + { + "epoch": 4.912290502793296, + "grad_norm": 0.5998381972312927, + "learning_rate": 0.0007567226890756303, + "loss": 0.5075, + "step": 8793 + }, + { + "epoch": 4.912849162011173, + "grad_norm": 0.6274259686470032, + "learning_rate": 0.0007566946778711485, + "loss": 0.4661, + "step": 8794 + }, + { + "epoch": 4.91340782122905, + "grad_norm": 0.5794113874435425, + "learning_rate": 0.0007566666666666668, + "loss": 0.6888, + "step": 8795 + }, + { + "epoch": 4.913966480446927, + "grad_norm": 0.4683435559272766, + "learning_rate": 0.0007566386554621849, + "loss": 0.4427, + "step": 8796 + }, + { + "epoch": 4.914525139664804, + "grad_norm": 0.4963485896587372, + "learning_rate": 0.0007566106442577031, + "loss": 0.3689, + "step": 8797 + }, + { + "epoch": 4.915083798882682, + "grad_norm": 0.7152925133705139, + "learning_rate": 0.0007565826330532213, + "loss": 0.4986, + "step": 8798 + }, + { + "epoch": 4.915642458100558, + "grad_norm": 0.45349279046058655, + "learning_rate": 0.0007565546218487395, + "loss": 0.457, + "step": 8799 + }, + { + "epoch": 4.916201117318436, + "grad_norm": 1.5383104085922241, + "learning_rate": 0.0007565266106442578, + "loss": 0.5562, + "step": 8800 + }, + { + "epoch": 4.9167597765363125, + "grad_norm": 0.6608338952064514, + "learning_rate": 0.0007564985994397759, + "loss": 0.5309, + "step": 8801 + }, + { + "epoch": 4.91731843575419, + "grad_norm": 0.6817737221717834, + "learning_rate": 0.0007564705882352941, + "loss": 0.5386, + "step": 8802 + }, + { + "epoch": 4.917877094972067, + "grad_norm": 0.5446931719779968, + "learning_rate": 0.0007564425770308123, + "loss": 0.473, + "step": 8803 + }, + { + "epoch": 4.918435754189944, + "grad_norm": 0.392940878868103, + "learning_rate": 0.0007564145658263305, + "loss": 0.4018, + "step": 8804 + }, + { + "epoch": 4.918994413407821, + "grad_norm": 0.8185021281242371, + "learning_rate": 0.0007563865546218488, + "loss": 0.499, + "step": 8805 + }, + { + "epoch": 4.919553072625698, + "grad_norm": 0.5131251215934753, + "learning_rate": 0.000756358543417367, + "loss": 0.4505, + "step": 8806 + }, + { + "epoch": 4.920111731843575, + "grad_norm": 1.0140050649642944, + "learning_rate": 0.0007563305322128851, + "loss": 0.4862, + "step": 8807 + }, + { + "epoch": 4.920670391061453, + "grad_norm": 4.409199237823486, + "learning_rate": 0.0007563025210084033, + "loss": 0.4453, + "step": 8808 + }, + { + "epoch": 4.921229050279329, + "grad_norm": 0.4739084541797638, + "learning_rate": 0.0007562745098039215, + "loss": 0.47, + "step": 8809 + }, + { + "epoch": 4.921787709497207, + "grad_norm": 0.7353098392486572, + "learning_rate": 0.0007562464985994399, + "loss": 0.391, + "step": 8810 + }, + { + "epoch": 4.9223463687150835, + "grad_norm": 0.42085981369018555, + "learning_rate": 0.0007562184873949581, + "loss": 0.4492, + "step": 8811 + }, + { + "epoch": 4.922905027932961, + "grad_norm": 0.5738317370414734, + "learning_rate": 0.0007561904761904762, + "loss": 0.444, + "step": 8812 + }, + { + "epoch": 4.923463687150838, + "grad_norm": 0.4134341776371002, + "learning_rate": 0.0007561624649859944, + "loss": 0.4263, + "step": 8813 + }, + { + "epoch": 4.924022346368715, + "grad_norm": 0.7517583966255188, + "learning_rate": 0.0007561344537815126, + "loss": 0.6281, + "step": 8814 + }, + { + "epoch": 4.924581005586592, + "grad_norm": 0.43969208002090454, + "learning_rate": 0.0007561064425770309, + "loss": 0.3849, + "step": 8815 + }, + { + "epoch": 4.925139664804469, + "grad_norm": 0.568121075630188, + "learning_rate": 0.0007560784313725491, + "loss": 0.3956, + "step": 8816 + }, + { + "epoch": 4.925698324022346, + "grad_norm": 0.5273006558418274, + "learning_rate": 0.0007560504201680672, + "loss": 0.3976, + "step": 8817 + }, + { + "epoch": 4.926256983240224, + "grad_norm": 0.7773993611335754, + "learning_rate": 0.0007560224089635854, + "loss": 0.4725, + "step": 8818 + }, + { + "epoch": 4.9268156424581, + "grad_norm": 0.6869404911994934, + "learning_rate": 0.0007559943977591036, + "loss": 0.5455, + "step": 8819 + }, + { + "epoch": 4.927374301675978, + "grad_norm": 0.5815285444259644, + "learning_rate": 0.0007559663865546219, + "loss": 0.4704, + "step": 8820 + }, + { + "epoch": 4.9279329608938545, + "grad_norm": 0.8208044171333313, + "learning_rate": 0.0007559383753501401, + "loss": 0.425, + "step": 8821 + }, + { + "epoch": 4.928491620111732, + "grad_norm": 0.5147770643234253, + "learning_rate": 0.0007559103641456583, + "loss": 0.5813, + "step": 8822 + }, + { + "epoch": 4.9290502793296085, + "grad_norm": 0.4230298697948456, + "learning_rate": 0.0007558823529411764, + "loss": 0.4552, + "step": 8823 + }, + { + "epoch": 4.929608938547486, + "grad_norm": 0.5045498013496399, + "learning_rate": 0.0007558543417366946, + "loss": 0.4139, + "step": 8824 + }, + { + "epoch": 4.930167597765363, + "grad_norm": 0.5355737209320068, + "learning_rate": 0.000755826330532213, + "loss": 0.4922, + "step": 8825 + }, + { + "epoch": 4.93072625698324, + "grad_norm": 0.5541037321090698, + "learning_rate": 0.0007557983193277312, + "loss": 0.4409, + "step": 8826 + }, + { + "epoch": 4.931284916201117, + "grad_norm": 0.4639340341091156, + "learning_rate": 0.0007557703081232494, + "loss": 0.4725, + "step": 8827 + }, + { + "epoch": 4.931843575418995, + "grad_norm": 0.42134127020835876, + "learning_rate": 0.0007557422969187675, + "loss": 0.4122, + "step": 8828 + }, + { + "epoch": 4.932402234636871, + "grad_norm": 0.6108148694038391, + "learning_rate": 0.0007557142857142857, + "loss": 0.3604, + "step": 8829 + }, + { + "epoch": 4.932960893854749, + "grad_norm": 0.6303445100784302, + "learning_rate": 0.000755686274509804, + "loss": 0.5771, + "step": 8830 + }, + { + "epoch": 4.9335195530726255, + "grad_norm": 0.6305598616600037, + "learning_rate": 0.0007556582633053222, + "loss": 0.3262, + "step": 8831 + }, + { + "epoch": 4.934078212290503, + "grad_norm": 0.492229700088501, + "learning_rate": 0.0007556302521008404, + "loss": 0.4008, + "step": 8832 + }, + { + "epoch": 4.9346368715083795, + "grad_norm": 0.5563464164733887, + "learning_rate": 0.0007556022408963585, + "loss": 0.4474, + "step": 8833 + }, + { + "epoch": 4.935195530726257, + "grad_norm": 1.8338207006454468, + "learning_rate": 0.0007555742296918767, + "loss": 0.4931, + "step": 8834 + }, + { + "epoch": 4.935754189944134, + "grad_norm": 0.5386955142021179, + "learning_rate": 0.000755546218487395, + "loss": 0.4754, + "step": 8835 + }, + { + "epoch": 4.936312849162011, + "grad_norm": 1.487391710281372, + "learning_rate": 0.0007555182072829132, + "loss": 0.4392, + "step": 8836 + }, + { + "epoch": 4.936871508379888, + "grad_norm": 1.9271928071975708, + "learning_rate": 0.0007554901960784314, + "loss": 0.3616, + "step": 8837 + }, + { + "epoch": 4.937430167597765, + "grad_norm": 0.5119422674179077, + "learning_rate": 0.0007554621848739496, + "loss": 0.3675, + "step": 8838 + }, + { + "epoch": 4.937988826815642, + "grad_norm": 2.6700937747955322, + "learning_rate": 0.0007554341736694677, + "loss": 0.4194, + "step": 8839 + }, + { + "epoch": 4.93854748603352, + "grad_norm": 2.884505033493042, + "learning_rate": 0.000755406162464986, + "loss": 0.4576, + "step": 8840 + }, + { + "epoch": 4.9391061452513965, + "grad_norm": 0.4905424416065216, + "learning_rate": 0.0007553781512605042, + "loss": 0.4137, + "step": 8841 + }, + { + "epoch": 4.939664804469274, + "grad_norm": 0.5485576391220093, + "learning_rate": 0.0007553501400560225, + "loss": 0.4069, + "step": 8842 + }, + { + "epoch": 4.9402234636871505, + "grad_norm": 0.5925578474998474, + "learning_rate": 0.0007553221288515407, + "loss": 0.4242, + "step": 8843 + }, + { + "epoch": 4.940782122905028, + "grad_norm": 0.6026496887207031, + "learning_rate": 0.0007552941176470588, + "loss": 0.4257, + "step": 8844 + }, + { + "epoch": 4.941340782122905, + "grad_norm": 0.6376490592956543, + "learning_rate": 0.0007552661064425771, + "loss": 0.4238, + "step": 8845 + }, + { + "epoch": 4.941899441340782, + "grad_norm": 0.5547580718994141, + "learning_rate": 0.0007552380952380953, + "loss": 0.5217, + "step": 8846 + }, + { + "epoch": 4.942458100558659, + "grad_norm": 2.5092527866363525, + "learning_rate": 0.0007552100840336135, + "loss": 0.5043, + "step": 8847 + }, + { + "epoch": 4.943016759776536, + "grad_norm": 0.5121408104896545, + "learning_rate": 0.0007551820728291317, + "loss": 0.5123, + "step": 8848 + }, + { + "epoch": 4.943575418994413, + "grad_norm": 0.46961572766304016, + "learning_rate": 0.0007551540616246498, + "loss": 0.4247, + "step": 8849 + }, + { + "epoch": 4.94413407821229, + "grad_norm": 3.7292540073394775, + "learning_rate": 0.0007551260504201681, + "loss": 0.4625, + "step": 8850 + }, + { + "epoch": 4.9446927374301675, + "grad_norm": 0.5970547199249268, + "learning_rate": 0.0007550980392156863, + "loss": 0.4108, + "step": 8851 + }, + { + "epoch": 4.945251396648045, + "grad_norm": 0.5754162073135376, + "learning_rate": 0.0007550700280112045, + "loss": 0.3074, + "step": 8852 + }, + { + "epoch": 4.9458100558659215, + "grad_norm": 1.8235074281692505, + "learning_rate": 0.0007550420168067227, + "loss": 0.4488, + "step": 8853 + }, + { + "epoch": 4.946368715083799, + "grad_norm": 0.402045875787735, + "learning_rate": 0.0007550140056022409, + "loss": 0.3172, + "step": 8854 + }, + { + "epoch": 4.946927374301676, + "grad_norm": 0.3946745693683624, + "learning_rate": 0.0007549859943977591, + "loss": 0.4759, + "step": 8855 + }, + { + "epoch": 4.947486033519553, + "grad_norm": 0.5963053703308105, + "learning_rate": 0.0007549579831932773, + "loss": 0.3645, + "step": 8856 + }, + { + "epoch": 4.94804469273743, + "grad_norm": 0.7214495539665222, + "learning_rate": 0.0007549299719887955, + "loss": 0.4495, + "step": 8857 + }, + { + "epoch": 4.948603351955307, + "grad_norm": 0.7899619340896606, + "learning_rate": 0.0007549019607843137, + "loss": 0.4516, + "step": 8858 + }, + { + "epoch": 4.949162011173184, + "grad_norm": 0.3767538368701935, + "learning_rate": 0.000754873949579832, + "loss": 0.3875, + "step": 8859 + }, + { + "epoch": 4.949720670391061, + "grad_norm": 0.7419856786727905, + "learning_rate": 0.0007548459383753502, + "loss": 0.3864, + "step": 8860 + }, + { + "epoch": 4.9502793296089385, + "grad_norm": 0.39955633878707886, + "learning_rate": 0.0007548179271708684, + "loss": 0.4206, + "step": 8861 + }, + { + "epoch": 4.950837988826816, + "grad_norm": 0.46973446011543274, + "learning_rate": 0.0007547899159663866, + "loss": 0.4204, + "step": 8862 + }, + { + "epoch": 4.9513966480446925, + "grad_norm": 0.5184533596038818, + "learning_rate": 0.0007547619047619048, + "loss": 0.4635, + "step": 8863 + }, + { + "epoch": 4.95195530726257, + "grad_norm": 0.7716824412345886, + "learning_rate": 0.000754733893557423, + "loss": 0.5901, + "step": 8864 + }, + { + "epoch": 4.952513966480447, + "grad_norm": 0.49903708696365356, + "learning_rate": 0.0007547058823529412, + "loss": 0.4156, + "step": 8865 + }, + { + "epoch": 4.953072625698324, + "grad_norm": 0.4803718328475952, + "learning_rate": 0.0007546778711484594, + "loss": 0.4387, + "step": 8866 + }, + { + "epoch": 4.953631284916201, + "grad_norm": 0.6003878712654114, + "learning_rate": 0.0007546498599439776, + "loss": 0.4588, + "step": 8867 + }, + { + "epoch": 4.954189944134078, + "grad_norm": 0.6134268045425415, + "learning_rate": 0.0007546218487394958, + "loss": 0.4461, + "step": 8868 + }, + { + "epoch": 4.954748603351955, + "grad_norm": 0.657756507396698, + "learning_rate": 0.000754593837535014, + "loss": 0.4863, + "step": 8869 + }, + { + "epoch": 4.955307262569832, + "grad_norm": 0.6040956974029541, + "learning_rate": 0.0007545658263305323, + "loss": 0.3518, + "step": 8870 + }, + { + "epoch": 4.9558659217877095, + "grad_norm": 0.7183671593666077, + "learning_rate": 0.0007545378151260504, + "loss": 0.5694, + "step": 8871 + }, + { + "epoch": 4.956424581005587, + "grad_norm": 0.46575382351875305, + "learning_rate": 0.0007545098039215686, + "loss": 0.3515, + "step": 8872 + }, + { + "epoch": 4.9569832402234635, + "grad_norm": 0.4562683701515198, + "learning_rate": 0.0007544817927170868, + "loss": 0.4903, + "step": 8873 + }, + { + "epoch": 4.957541899441341, + "grad_norm": 0.5428005456924438, + "learning_rate": 0.000754453781512605, + "loss": 0.3775, + "step": 8874 + }, + { + "epoch": 4.9581005586592175, + "grad_norm": 1.8966871500015259, + "learning_rate": 0.0007544257703081234, + "loss": 0.3946, + "step": 8875 + }, + { + "epoch": 4.958659217877095, + "grad_norm": 0.48232144117355347, + "learning_rate": 0.0007543977591036415, + "loss": 0.4313, + "step": 8876 + }, + { + "epoch": 4.959217877094972, + "grad_norm": 0.44306883215904236, + "learning_rate": 0.0007543697478991597, + "loss": 0.438, + "step": 8877 + }, + { + "epoch": 4.959776536312849, + "grad_norm": 0.4111923277378082, + "learning_rate": 0.0007543417366946779, + "loss": 0.4041, + "step": 8878 + }, + { + "epoch": 4.960335195530726, + "grad_norm": 0.7629689574241638, + "learning_rate": 0.0007543137254901961, + "loss": 0.4726, + "step": 8879 + }, + { + "epoch": 4.960893854748603, + "grad_norm": 0.4854073226451874, + "learning_rate": 0.0007542857142857144, + "loss": 0.4355, + "step": 8880 + }, + { + "epoch": 4.9614525139664805, + "grad_norm": 0.5218124389648438, + "learning_rate": 0.0007542577030812325, + "loss": 0.4827, + "step": 8881 + }, + { + "epoch": 4.962011173184358, + "grad_norm": 1.7910889387130737, + "learning_rate": 0.0007542296918767507, + "loss": 0.3861, + "step": 8882 + }, + { + "epoch": 4.9625698324022345, + "grad_norm": 1.2245216369628906, + "learning_rate": 0.0007542016806722689, + "loss": 0.5188, + "step": 8883 + }, + { + "epoch": 4.963128491620112, + "grad_norm": 0.6214673519134521, + "learning_rate": 0.0007541736694677871, + "loss": 0.3982, + "step": 8884 + }, + { + "epoch": 4.9636871508379885, + "grad_norm": 0.5548007488250732, + "learning_rate": 0.0007541456582633054, + "loss": 0.3222, + "step": 8885 + }, + { + "epoch": 4.964245810055866, + "grad_norm": 0.5983314514160156, + "learning_rate": 0.0007541176470588236, + "loss": 0.5297, + "step": 8886 + }, + { + "epoch": 4.9648044692737425, + "grad_norm": 0.4969734251499176, + "learning_rate": 0.0007540896358543417, + "loss": 0.3995, + "step": 8887 + }, + { + "epoch": 4.96536312849162, + "grad_norm": 0.3779778778553009, + "learning_rate": 0.0007540616246498599, + "loss": 0.4144, + "step": 8888 + }, + { + "epoch": 4.965921787709497, + "grad_norm": 0.6147105097770691, + "learning_rate": 0.0007540336134453781, + "loss": 0.5256, + "step": 8889 + }, + { + "epoch": 4.966480446927374, + "grad_norm": 0.4686896800994873, + "learning_rate": 0.0007540056022408964, + "loss": 0.3076, + "step": 8890 + }, + { + "epoch": 4.9670391061452515, + "grad_norm": 0.4621661603450775, + "learning_rate": 0.0007539775910364147, + "loss": 0.374, + "step": 8891 + }, + { + "epoch": 4.967597765363129, + "grad_norm": 0.7105911374092102, + "learning_rate": 0.0007539495798319328, + "loss": 0.3769, + "step": 8892 + }, + { + "epoch": 4.9681564245810055, + "grad_norm": 0.4076268970966339, + "learning_rate": 0.000753921568627451, + "loss": 0.4254, + "step": 8893 + }, + { + "epoch": 4.968715083798883, + "grad_norm": 0.7149912714958191, + "learning_rate": 0.0007538935574229692, + "loss": 0.459, + "step": 8894 + }, + { + "epoch": 4.9692737430167595, + "grad_norm": 0.7878284454345703, + "learning_rate": 0.0007538655462184875, + "loss": 0.4826, + "step": 8895 + }, + { + "epoch": 4.969832402234637, + "grad_norm": 0.5910987854003906, + "learning_rate": 0.0007538375350140057, + "loss": 0.4134, + "step": 8896 + }, + { + "epoch": 4.9703910614525135, + "grad_norm": 0.6179719567298889, + "learning_rate": 0.0007538095238095238, + "loss": 0.4654, + "step": 8897 + }, + { + "epoch": 4.970949720670391, + "grad_norm": 0.7106649875640869, + "learning_rate": 0.000753781512605042, + "loss": 0.6835, + "step": 8898 + }, + { + "epoch": 4.971508379888268, + "grad_norm": 0.43262267112731934, + "learning_rate": 0.0007537535014005602, + "loss": 0.456, + "step": 8899 + }, + { + "epoch": 4.972067039106145, + "grad_norm": 1.1712673902511597, + "learning_rate": 0.0007537254901960785, + "loss": 0.5101, + "step": 8900 + }, + { + "epoch": 4.9726256983240225, + "grad_norm": 0.6719967722892761, + "learning_rate": 0.0007536974789915967, + "loss": 0.4613, + "step": 8901 + }, + { + "epoch": 4.9731843575419, + "grad_norm": 0.5877483487129211, + "learning_rate": 0.0007536694677871149, + "loss": 0.3872, + "step": 8902 + }, + { + "epoch": 4.9737430167597765, + "grad_norm": 0.6149214506149292, + "learning_rate": 0.000753641456582633, + "loss": 0.4977, + "step": 8903 + }, + { + "epoch": 4.974301675977654, + "grad_norm": 0.5694226026535034, + "learning_rate": 0.0007536134453781512, + "loss": 0.5598, + "step": 8904 + }, + { + "epoch": 4.9748603351955305, + "grad_norm": 1.9931093454360962, + "learning_rate": 0.0007535854341736695, + "loss": 0.3765, + "step": 8905 + }, + { + "epoch": 4.975418994413408, + "grad_norm": 0.7991714477539062, + "learning_rate": 0.0007535574229691877, + "loss": 0.4882, + "step": 8906 + }, + { + "epoch": 4.9759776536312845, + "grad_norm": 0.5801398754119873, + "learning_rate": 0.000753529411764706, + "loss": 0.413, + "step": 8907 + }, + { + "epoch": 4.976536312849162, + "grad_norm": 1.0248736143112183, + "learning_rate": 0.000753501400560224, + "loss": 0.5372, + "step": 8908 + }, + { + "epoch": 4.977094972067039, + "grad_norm": 0.6892338395118713, + "learning_rate": 0.0007534733893557423, + "loss": 0.4978, + "step": 8909 + }, + { + "epoch": 4.977653631284916, + "grad_norm": 0.420864075422287, + "learning_rate": 0.0007534453781512606, + "loss": 0.4207, + "step": 8910 + }, + { + "epoch": 4.9782122905027935, + "grad_norm": 0.8040164113044739, + "learning_rate": 0.0007534173669467788, + "loss": 0.6046, + "step": 8911 + }, + { + "epoch": 4.97877094972067, + "grad_norm": 0.7165022492408752, + "learning_rate": 0.000753389355742297, + "loss": 0.5481, + "step": 8912 + }, + { + "epoch": 4.9793296089385475, + "grad_norm": 0.5217366218566895, + "learning_rate": 0.0007533613445378151, + "loss": 0.5051, + "step": 8913 + }, + { + "epoch": 4.979888268156425, + "grad_norm": 0.47434869408607483, + "learning_rate": 0.0007533333333333333, + "loss": 0.4835, + "step": 8914 + }, + { + "epoch": 4.9804469273743015, + "grad_norm": 0.7647278904914856, + "learning_rate": 0.0007533053221288516, + "loss": 0.3348, + "step": 8915 + }, + { + "epoch": 4.981005586592179, + "grad_norm": 0.8804173469543457, + "learning_rate": 0.0007532773109243698, + "loss": 0.499, + "step": 8916 + }, + { + "epoch": 4.9815642458100555, + "grad_norm": 0.590628981590271, + "learning_rate": 0.000753249299719888, + "loss": 0.5345, + "step": 8917 + }, + { + "epoch": 4.982122905027933, + "grad_norm": 0.5201048254966736, + "learning_rate": 0.0007532212885154062, + "loss": 0.4925, + "step": 8918 + }, + { + "epoch": 4.98268156424581, + "grad_norm": 0.5053492188453674, + "learning_rate": 0.0007531932773109243, + "loss": 0.4573, + "step": 8919 + }, + { + "epoch": 4.983240223463687, + "grad_norm": 0.7040517330169678, + "learning_rate": 0.0007531652661064426, + "loss": 0.4012, + "step": 8920 + }, + { + "epoch": 4.9837988826815645, + "grad_norm": 1.1395535469055176, + "learning_rate": 0.0007531372549019608, + "loss": 0.5303, + "step": 8921 + }, + { + "epoch": 4.984357541899441, + "grad_norm": 0.5153475403785706, + "learning_rate": 0.000753109243697479, + "loss": 0.4744, + "step": 8922 + }, + { + "epoch": 4.9849162011173185, + "grad_norm": 0.4273338317871094, + "learning_rate": 0.0007530812324929972, + "loss": 0.4674, + "step": 8923 + }, + { + "epoch": 4.985474860335195, + "grad_norm": 1.0757174491882324, + "learning_rate": 0.0007530532212885153, + "loss": 0.463, + "step": 8924 + }, + { + "epoch": 4.9860335195530725, + "grad_norm": 0.6814197897911072, + "learning_rate": 0.0007530252100840336, + "loss": 0.4083, + "step": 8925 + }, + { + "epoch": 4.98659217877095, + "grad_norm": 0.5240745544433594, + "learning_rate": 0.0007529971988795519, + "loss": 0.4688, + "step": 8926 + }, + { + "epoch": 4.9871508379888265, + "grad_norm": 0.6920828223228455, + "learning_rate": 0.0007529691876750701, + "loss": 0.6228, + "step": 8927 + }, + { + "epoch": 4.987709497206704, + "grad_norm": 4.5850982666015625, + "learning_rate": 0.0007529411764705883, + "loss": 0.5397, + "step": 8928 + }, + { + "epoch": 4.988268156424581, + "grad_norm": 2.1096279621124268, + "learning_rate": 0.0007529131652661064, + "loss": 0.4948, + "step": 8929 + }, + { + "epoch": 4.988826815642458, + "grad_norm": 0.4749988317489624, + "learning_rate": 0.0007528851540616246, + "loss": 0.4483, + "step": 8930 + }, + { + "epoch": 4.9893854748603355, + "grad_norm": 10.724010467529297, + "learning_rate": 0.0007528571428571429, + "loss": 0.3862, + "step": 8931 + }, + { + "epoch": 4.989944134078212, + "grad_norm": 0.8744003176689148, + "learning_rate": 0.0007528291316526611, + "loss": 0.5138, + "step": 8932 + }, + { + "epoch": 4.9905027932960895, + "grad_norm": 0.5407172441482544, + "learning_rate": 0.0007528011204481793, + "loss": 0.7005, + "step": 8933 + }, + { + "epoch": 4.991061452513966, + "grad_norm": 0.4657617509365082, + "learning_rate": 0.0007527731092436975, + "loss": 0.4848, + "step": 8934 + }, + { + "epoch": 4.9916201117318435, + "grad_norm": 0.771497905254364, + "learning_rate": 0.0007527450980392156, + "loss": 0.6096, + "step": 8935 + }, + { + "epoch": 4.992178770949721, + "grad_norm": 0.44296932220458984, + "learning_rate": 0.0007527170868347339, + "loss": 0.4915, + "step": 8936 + }, + { + "epoch": 4.9927374301675975, + "grad_norm": 0.3926461935043335, + "learning_rate": 0.0007526890756302521, + "loss": 0.4053, + "step": 8937 + }, + { + "epoch": 4.993296089385475, + "grad_norm": 0.562260091304779, + "learning_rate": 0.0007526610644257703, + "loss": 0.5297, + "step": 8938 + }, + { + "epoch": 4.993854748603352, + "grad_norm": 0.41102197766304016, + "learning_rate": 0.0007526330532212885, + "loss": 0.3997, + "step": 8939 + }, + { + "epoch": 4.994413407821229, + "grad_norm": 0.4598945379257202, + "learning_rate": 0.0007526050420168066, + "loss": 0.4386, + "step": 8940 + }, + { + "epoch": 4.9949720670391065, + "grad_norm": 0.4748890697956085, + "learning_rate": 0.000752577030812325, + "loss": 0.4497, + "step": 8941 + }, + { + "epoch": 4.995530726256983, + "grad_norm": 0.9812225699424744, + "learning_rate": 0.0007525490196078432, + "loss": 0.4071, + "step": 8942 + }, + { + "epoch": 4.9960893854748605, + "grad_norm": 0.6202051639556885, + "learning_rate": 0.0007525210084033614, + "loss": 0.4608, + "step": 8943 + }, + { + "epoch": 4.996648044692737, + "grad_norm": 0.45992499589920044, + "learning_rate": 0.0007524929971988796, + "loss": 0.4516, + "step": 8944 + }, + { + "epoch": 4.9972067039106145, + "grad_norm": 0.9308560490608215, + "learning_rate": 0.0007524649859943977, + "loss": 0.4501, + "step": 8945 + }, + { + "epoch": 4.997765363128492, + "grad_norm": 0.736585259437561, + "learning_rate": 0.000752436974789916, + "loss": 0.6691, + "step": 8946 + }, + { + "epoch": 4.9983240223463685, + "grad_norm": 0.4983275830745697, + "learning_rate": 0.0007524089635854342, + "loss": 0.3534, + "step": 8947 + }, + { + "epoch": 4.998882681564246, + "grad_norm": 0.8690245151519775, + "learning_rate": 0.0007523809523809524, + "loss": 0.482, + "step": 8948 + }, + { + "epoch": 4.9994413407821225, + "grad_norm": 0.7713837623596191, + "learning_rate": 0.0007523529411764706, + "loss": 0.4586, + "step": 8949 + }, + { + "epoch": 5.0, + "grad_norm": 0.4516015946865082, + "learning_rate": 0.0007523249299719888, + "loss": 0.3913, + "step": 8950 + }, + { + "epoch": 5.0005586592178775, + "grad_norm": 0.4232989251613617, + "learning_rate": 0.000752296918767507, + "loss": 0.5067, + "step": 8951 + }, + { + "epoch": 5.001117318435754, + "grad_norm": 0.5575375556945801, + "learning_rate": 0.0007522689075630252, + "loss": 0.392, + "step": 8952 + }, + { + "epoch": 5.0016759776536315, + "grad_norm": 0.49774113297462463, + "learning_rate": 0.0007522408963585434, + "loss": 0.4076, + "step": 8953 + }, + { + "epoch": 5.002234636871508, + "grad_norm": 0.5956636071205139, + "learning_rate": 0.0007522128851540616, + "loss": 0.4615, + "step": 8954 + }, + { + "epoch": 5.0027932960893855, + "grad_norm": 0.6072801947593689, + "learning_rate": 0.0007521848739495798, + "loss": 0.4391, + "step": 8955 + }, + { + "epoch": 5.003351955307263, + "grad_norm": 0.4491267204284668, + "learning_rate": 0.000752156862745098, + "loss": 0.4243, + "step": 8956 + }, + { + "epoch": 5.0039106145251395, + "grad_norm": 0.4794199764728546, + "learning_rate": 0.0007521288515406163, + "loss": 0.4824, + "step": 8957 + }, + { + "epoch": 5.004469273743017, + "grad_norm": 0.5326511263847351, + "learning_rate": 0.0007521008403361345, + "loss": 0.4098, + "step": 8958 + }, + { + "epoch": 5.0050279329608935, + "grad_norm": 0.5080529451370239, + "learning_rate": 0.0007520728291316527, + "loss": 0.4363, + "step": 8959 + }, + { + "epoch": 5.005586592178771, + "grad_norm": 0.6544544100761414, + "learning_rate": 0.0007520448179271709, + "loss": 0.4628, + "step": 8960 + }, + { + "epoch": 5.0061452513966485, + "grad_norm": 3.7938168048858643, + "learning_rate": 0.0007520168067226891, + "loss": 0.4157, + "step": 8961 + }, + { + "epoch": 5.006703910614525, + "grad_norm": 0.696471631526947, + "learning_rate": 0.0007519887955182073, + "loss": 0.6147, + "step": 8962 + }, + { + "epoch": 5.0072625698324025, + "grad_norm": 0.5954288244247437, + "learning_rate": 0.0007519607843137255, + "loss": 0.5235, + "step": 8963 + }, + { + "epoch": 5.007821229050279, + "grad_norm": 0.7126972675323486, + "learning_rate": 0.0007519327731092437, + "loss": 0.3917, + "step": 8964 + }, + { + "epoch": 5.0083798882681565, + "grad_norm": 0.5534854531288147, + "learning_rate": 0.0007519047619047619, + "loss": 0.4274, + "step": 8965 + }, + { + "epoch": 5.008938547486034, + "grad_norm": 0.5404927134513855, + "learning_rate": 0.0007518767507002802, + "loss": 0.4745, + "step": 8966 + }, + { + "epoch": 5.0094972067039105, + "grad_norm": 0.5117726922035217, + "learning_rate": 0.0007518487394957983, + "loss": 0.4486, + "step": 8967 + }, + { + "epoch": 5.010055865921788, + "grad_norm": 0.8328001499176025, + "learning_rate": 0.0007518207282913165, + "loss": 0.4853, + "step": 8968 + }, + { + "epoch": 5.0106145251396645, + "grad_norm": 0.5918221473693848, + "learning_rate": 0.0007517927170868347, + "loss": 0.3931, + "step": 8969 + }, + { + "epoch": 5.011173184357542, + "grad_norm": 1.0455037355422974, + "learning_rate": 0.0007517647058823529, + "loss": 0.3951, + "step": 8970 + }, + { + "epoch": 5.011731843575419, + "grad_norm": 0.40491145849227905, + "learning_rate": 0.0007517366946778712, + "loss": 0.3488, + "step": 8971 + }, + { + "epoch": 5.012290502793296, + "grad_norm": 0.4737810492515564, + "learning_rate": 0.0007517086834733893, + "loss": 0.3953, + "step": 8972 + }, + { + "epoch": 5.0128491620111735, + "grad_norm": 0.5814889669418335, + "learning_rate": 0.0007516806722689075, + "loss": 0.5089, + "step": 8973 + }, + { + "epoch": 5.01340782122905, + "grad_norm": 0.445928692817688, + "learning_rate": 0.0007516526610644258, + "loss": 0.4894, + "step": 8974 + }, + { + "epoch": 5.0139664804469275, + "grad_norm": 0.5846951007843018, + "learning_rate": 0.000751624649859944, + "loss": 0.4451, + "step": 8975 + }, + { + "epoch": 5.014525139664804, + "grad_norm": 0.5077082514762878, + "learning_rate": 0.0007515966386554623, + "loss": 0.4082, + "step": 8976 + }, + { + "epoch": 5.0150837988826815, + "grad_norm": 0.5170125365257263, + "learning_rate": 0.0007515686274509804, + "loss": 0.5307, + "step": 8977 + }, + { + "epoch": 5.015642458100559, + "grad_norm": 0.5874345302581787, + "learning_rate": 0.0007515406162464986, + "loss": 0.3947, + "step": 8978 + }, + { + "epoch": 5.0162011173184355, + "grad_norm": 0.4098903238773346, + "learning_rate": 0.0007515126050420168, + "loss": 0.4447, + "step": 8979 + }, + { + "epoch": 5.016759776536313, + "grad_norm": 0.680138349533081, + "learning_rate": 0.000751484593837535, + "loss": 0.4923, + "step": 8980 + }, + { + "epoch": 5.01731843575419, + "grad_norm": 0.5135407447814941, + "learning_rate": 0.0007514565826330533, + "loss": 0.5303, + "step": 8981 + }, + { + "epoch": 5.017877094972067, + "grad_norm": 0.43305161595344543, + "learning_rate": 0.0007514285714285715, + "loss": 0.407, + "step": 8982 + }, + { + "epoch": 5.0184357541899445, + "grad_norm": 0.8420162796974182, + "learning_rate": 0.0007514005602240896, + "loss": 0.4851, + "step": 8983 + }, + { + "epoch": 5.018994413407821, + "grad_norm": 0.457429438829422, + "learning_rate": 0.0007513725490196078, + "loss": 0.3937, + "step": 8984 + }, + { + "epoch": 5.0195530726256985, + "grad_norm": 0.46801939606666565, + "learning_rate": 0.000751344537815126, + "loss": 0.4118, + "step": 8985 + }, + { + "epoch": 5.020111731843575, + "grad_norm": 0.5727358460426331, + "learning_rate": 0.0007513165266106443, + "loss": 0.3909, + "step": 8986 + }, + { + "epoch": 5.0206703910614525, + "grad_norm": 9.009857177734375, + "learning_rate": 0.0007512885154061625, + "loss": 0.487, + "step": 8987 + }, + { + "epoch": 5.02122905027933, + "grad_norm": 2.25361704826355, + "learning_rate": 0.0007512605042016806, + "loss": 0.3629, + "step": 8988 + }, + { + "epoch": 5.0217877094972065, + "grad_norm": 0.4565248191356659, + "learning_rate": 0.0007512324929971988, + "loss": 0.3981, + "step": 8989 + }, + { + "epoch": 5.022346368715084, + "grad_norm": 2.0229146480560303, + "learning_rate": 0.000751204481792717, + "loss": 0.7014, + "step": 8990 + }, + { + "epoch": 5.022905027932961, + "grad_norm": 0.5379871129989624, + "learning_rate": 0.0007511764705882354, + "loss": 0.388, + "step": 8991 + }, + { + "epoch": 5.023463687150838, + "grad_norm": 1.0842766761779785, + "learning_rate": 0.0007511484593837536, + "loss": 0.352, + "step": 8992 + }, + { + "epoch": 5.0240223463687155, + "grad_norm": 0.5023524165153503, + "learning_rate": 0.0007511204481792717, + "loss": 0.5143, + "step": 8993 + }, + { + "epoch": 5.024581005586592, + "grad_norm": 0.5101609826087952, + "learning_rate": 0.0007510924369747899, + "loss": 0.4125, + "step": 8994 + }, + { + "epoch": 5.0251396648044695, + "grad_norm": 0.5665290951728821, + "learning_rate": 0.0007510644257703081, + "loss": 0.4995, + "step": 8995 + }, + { + "epoch": 5.025698324022346, + "grad_norm": 0.550108015537262, + "learning_rate": 0.0007510364145658264, + "loss": 0.5133, + "step": 8996 + }, + { + "epoch": 5.0262569832402235, + "grad_norm": 0.6484015583992004, + "learning_rate": 0.0007510084033613446, + "loss": 0.3865, + "step": 8997 + }, + { + "epoch": 5.026815642458101, + "grad_norm": 1.9246560335159302, + "learning_rate": 0.0007509803921568628, + "loss": 0.4518, + "step": 8998 + }, + { + "epoch": 5.0273743016759775, + "grad_norm": 0.6043315529823303, + "learning_rate": 0.0007509523809523809, + "loss": 0.5676, + "step": 8999 + }, + { + "epoch": 5.027932960893855, + "grad_norm": 0.4110215902328491, + "learning_rate": 0.0007509243697478991, + "loss": 0.3894, + "step": 9000 + }, + { + "epoch": 5.027932960893855, + "eval_cer": 0.09487960023130722, + "eval_loss": 0.35608309507369995, + "eval_runtime": 55.9186, + "eval_samples_per_second": 81.154, + "eval_steps_per_second": 5.079, + "eval_wer": 0.37409569565095946, + "step": 9000 + }, + { + "epoch": 5.028491620111732, + "grad_norm": 0.46151965856552124, + "learning_rate": 0.0007508963585434174, + "loss": 0.4472, + "step": 9001 + }, + { + "epoch": 5.029050279329609, + "grad_norm": 0.4746703505516052, + "learning_rate": 0.0007508683473389356, + "loss": 0.3553, + "step": 9002 + }, + { + "epoch": 5.0296089385474865, + "grad_norm": 0.7386363744735718, + "learning_rate": 0.0007508403361344538, + "loss": 0.5406, + "step": 9003 + }, + { + "epoch": 5.030167597765363, + "grad_norm": 0.4979327619075775, + "learning_rate": 0.0007508123249299719, + "loss": 0.4625, + "step": 9004 + }, + { + "epoch": 5.0307262569832405, + "grad_norm": 0.4723169505596161, + "learning_rate": 0.0007507843137254901, + "loss": 0.4235, + "step": 9005 + }, + { + "epoch": 5.031284916201117, + "grad_norm": 0.48328202962875366, + "learning_rate": 0.0007507563025210085, + "loss": 0.4307, + "step": 9006 + }, + { + "epoch": 5.0318435754189945, + "grad_norm": 0.6487880945205688, + "learning_rate": 0.0007507282913165267, + "loss": 0.4204, + "step": 9007 + }, + { + "epoch": 5.032402234636871, + "grad_norm": 5.918490409851074, + "learning_rate": 0.0007507002801120449, + "loss": 0.6761, + "step": 9008 + }, + { + "epoch": 5.0329608938547485, + "grad_norm": 0.5372149348258972, + "learning_rate": 0.000750672268907563, + "loss": 0.4374, + "step": 9009 + }, + { + "epoch": 5.033519553072626, + "grad_norm": 0.6469781994819641, + "learning_rate": 0.0007506442577030812, + "loss": 0.4518, + "step": 9010 + }, + { + "epoch": 5.034078212290503, + "grad_norm": 0.5016604661941528, + "learning_rate": 0.0007506162464985995, + "loss": 0.4616, + "step": 9011 + }, + { + "epoch": 5.03463687150838, + "grad_norm": 0.4969886243343353, + "learning_rate": 0.0007505882352941177, + "loss": 0.4594, + "step": 9012 + }, + { + "epoch": 5.035195530726257, + "grad_norm": 0.5083377361297607, + "learning_rate": 0.0007505602240896359, + "loss": 0.4521, + "step": 9013 + }, + { + "epoch": 5.035754189944134, + "grad_norm": 1.9022886753082275, + "learning_rate": 0.0007505322128851541, + "loss": 0.3606, + "step": 9014 + }, + { + "epoch": 5.0363128491620115, + "grad_norm": 0.6921699047088623, + "learning_rate": 0.0007505042016806722, + "loss": 0.6119, + "step": 9015 + }, + { + "epoch": 5.036871508379888, + "grad_norm": 1.0125654935836792, + "learning_rate": 0.0007504761904761905, + "loss": 0.4553, + "step": 9016 + }, + { + "epoch": 5.0374301675977655, + "grad_norm": 0.7234715223312378, + "learning_rate": 0.0007504481792717087, + "loss": 0.5036, + "step": 9017 + }, + { + "epoch": 5.037988826815642, + "grad_norm": 0.5878175497055054, + "learning_rate": 0.0007504201680672269, + "loss": 0.4681, + "step": 9018 + }, + { + "epoch": 5.0385474860335195, + "grad_norm": 0.5051795244216919, + "learning_rate": 0.0007503921568627451, + "loss": 0.5717, + "step": 9019 + }, + { + "epoch": 5.039106145251397, + "grad_norm": 0.4283452332019806, + "learning_rate": 0.0007503641456582632, + "loss": 0.4911, + "step": 9020 + }, + { + "epoch": 5.039664804469274, + "grad_norm": 0.5273513793945312, + "learning_rate": 0.0007503361344537815, + "loss": 0.3506, + "step": 9021 + }, + { + "epoch": 5.040223463687151, + "grad_norm": 0.8313674330711365, + "learning_rate": 0.0007503081232492997, + "loss": 0.372, + "step": 9022 + }, + { + "epoch": 5.040782122905028, + "grad_norm": 0.8957833647727966, + "learning_rate": 0.000750280112044818, + "loss": 0.5206, + "step": 9023 + }, + { + "epoch": 5.041340782122905, + "grad_norm": 0.87749844789505, + "learning_rate": 0.0007502521008403362, + "loss": 0.4415, + "step": 9024 + }, + { + "epoch": 5.0418994413407825, + "grad_norm": 0.3926472067832947, + "learning_rate": 0.0007502240896358543, + "loss": 0.3942, + "step": 9025 + }, + { + "epoch": 5.042458100558659, + "grad_norm": 0.49140673875808716, + "learning_rate": 0.0007501960784313726, + "loss": 0.4766, + "step": 9026 + }, + { + "epoch": 5.0430167597765365, + "grad_norm": 0.48330798745155334, + "learning_rate": 0.0007501680672268908, + "loss": 0.4223, + "step": 9027 + }, + { + "epoch": 5.043575418994413, + "grad_norm": 0.5474338531494141, + "learning_rate": 0.000750140056022409, + "loss": 0.4604, + "step": 9028 + }, + { + "epoch": 5.0441340782122905, + "grad_norm": 0.6906026005744934, + "learning_rate": 0.0007501120448179272, + "loss": 0.4861, + "step": 9029 + }, + { + "epoch": 5.044692737430168, + "grad_norm": 1.0677733421325684, + "learning_rate": 0.0007500840336134454, + "loss": 0.4486, + "step": 9030 + }, + { + "epoch": 5.045251396648045, + "grad_norm": 0.7197783589363098, + "learning_rate": 0.0007500560224089636, + "loss": 0.4067, + "step": 9031 + }, + { + "epoch": 5.045810055865922, + "grad_norm": 0.416558176279068, + "learning_rate": 0.0007500280112044818, + "loss": 0.4526, + "step": 9032 + }, + { + "epoch": 5.046368715083799, + "grad_norm": 0.37266403436660767, + "learning_rate": 0.00075, + "loss": 0.3532, + "step": 9033 + }, + { + "epoch": 5.046927374301676, + "grad_norm": 0.5296469926834106, + "learning_rate": 0.0007499719887955182, + "loss": 0.3976, + "step": 9034 + }, + { + "epoch": 5.0474860335195535, + "grad_norm": 0.5366743803024292, + "learning_rate": 0.0007499439775910364, + "loss": 0.544, + "step": 9035 + }, + { + "epoch": 5.04804469273743, + "grad_norm": 0.709815263748169, + "learning_rate": 0.0007499159663865546, + "loss": 0.366, + "step": 9036 + }, + { + "epoch": 5.0486033519553075, + "grad_norm": 0.6635186076164246, + "learning_rate": 0.0007498879551820728, + "loss": 0.4646, + "step": 9037 + }, + { + "epoch": 5.049162011173184, + "grad_norm": 0.5140885710716248, + "learning_rate": 0.000749859943977591, + "loss": 0.4292, + "step": 9038 + }, + { + "epoch": 5.0497206703910615, + "grad_norm": 0.8237327337265015, + "learning_rate": 0.0007498319327731093, + "loss": 0.5936, + "step": 9039 + }, + { + "epoch": 5.050279329608939, + "grad_norm": 0.7342623472213745, + "learning_rate": 0.0007498039215686275, + "loss": 0.6089, + "step": 9040 + }, + { + "epoch": 5.050837988826816, + "grad_norm": 0.6316502690315247, + "learning_rate": 0.0007497759103641458, + "loss": 0.3639, + "step": 9041 + }, + { + "epoch": 5.051396648044693, + "grad_norm": 0.5191710591316223, + "learning_rate": 0.0007497478991596639, + "loss": 0.4286, + "step": 9042 + }, + { + "epoch": 5.05195530726257, + "grad_norm": 0.6342172622680664, + "learning_rate": 0.0007497198879551821, + "loss": 0.5062, + "step": 9043 + }, + { + "epoch": 5.052513966480447, + "grad_norm": 0.6966168880462646, + "learning_rate": 0.0007496918767507003, + "loss": 0.4927, + "step": 9044 + }, + { + "epoch": 5.053072625698324, + "grad_norm": 0.7143132090568542, + "learning_rate": 0.0007496638655462185, + "loss": 0.6288, + "step": 9045 + }, + { + "epoch": 5.053631284916201, + "grad_norm": 0.8026169538497925, + "learning_rate": 0.0007496358543417368, + "loss": 0.3856, + "step": 9046 + }, + { + "epoch": 5.0541899441340785, + "grad_norm": 1.144308090209961, + "learning_rate": 0.0007496078431372549, + "loss": 0.4914, + "step": 9047 + }, + { + "epoch": 5.054748603351955, + "grad_norm": 0.410597562789917, + "learning_rate": 0.0007495798319327731, + "loss": 0.4735, + "step": 9048 + }, + { + "epoch": 5.0553072625698325, + "grad_norm": 0.5788492560386658, + "learning_rate": 0.0007495518207282913, + "loss": 0.4474, + "step": 9049 + }, + { + "epoch": 5.055865921787709, + "grad_norm": 0.937309741973877, + "learning_rate": 0.0007495238095238095, + "loss": 0.5295, + "step": 9050 + }, + { + "epoch": 5.056424581005587, + "grad_norm": 0.7546955943107605, + "learning_rate": 0.0007494957983193278, + "loss": 0.3908, + "step": 9051 + }, + { + "epoch": 5.056983240223464, + "grad_norm": 0.4760865569114685, + "learning_rate": 0.0007494677871148459, + "loss": 0.472, + "step": 9052 + }, + { + "epoch": 5.057541899441341, + "grad_norm": 0.48237109184265137, + "learning_rate": 0.0007494397759103641, + "loss": 0.4583, + "step": 9053 + }, + { + "epoch": 5.058100558659218, + "grad_norm": 0.5296656489372253, + "learning_rate": 0.0007494117647058823, + "loss": 0.4464, + "step": 9054 + }, + { + "epoch": 5.058659217877095, + "grad_norm": 0.5461738109588623, + "learning_rate": 0.0007493837535014005, + "loss": 0.4793, + "step": 9055 + }, + { + "epoch": 5.059217877094972, + "grad_norm": 0.7374666929244995, + "learning_rate": 0.0007493557422969189, + "loss": 0.4267, + "step": 9056 + }, + { + "epoch": 5.0597765363128495, + "grad_norm": 0.46226826310157776, + "learning_rate": 0.0007493277310924371, + "loss": 0.4209, + "step": 9057 + }, + { + "epoch": 5.060335195530726, + "grad_norm": 0.49997398257255554, + "learning_rate": 0.0007492997198879552, + "loss": 0.504, + "step": 9058 + }, + { + "epoch": 5.0608938547486035, + "grad_norm": 0.45420151948928833, + "learning_rate": 0.0007492717086834734, + "loss": 0.4307, + "step": 9059 + }, + { + "epoch": 5.06145251396648, + "grad_norm": 5.43703031539917, + "learning_rate": 0.0007492436974789916, + "loss": 0.4714, + "step": 9060 + }, + { + "epoch": 5.062011173184358, + "grad_norm": 0.5206681489944458, + "learning_rate": 0.0007492156862745099, + "loss": 0.4188, + "step": 9061 + }, + { + "epoch": 5.062569832402235, + "grad_norm": 0.9799054265022278, + "learning_rate": 0.0007491876750700281, + "loss": 0.5357, + "step": 9062 + }, + { + "epoch": 5.063128491620112, + "grad_norm": 0.42604079842567444, + "learning_rate": 0.0007491596638655462, + "loss": 0.3856, + "step": 9063 + }, + { + "epoch": 5.063687150837989, + "grad_norm": 0.4713807702064514, + "learning_rate": 0.0007491316526610644, + "loss": 0.5249, + "step": 9064 + }, + { + "epoch": 5.064245810055866, + "grad_norm": 0.686998188495636, + "learning_rate": 0.0007491036414565826, + "loss": 0.4541, + "step": 9065 + }, + { + "epoch": 5.064804469273743, + "grad_norm": 0.4297754466533661, + "learning_rate": 0.0007490756302521009, + "loss": 0.3749, + "step": 9066 + }, + { + "epoch": 5.0653631284916205, + "grad_norm": 0.6463320255279541, + "learning_rate": 0.0007490476190476191, + "loss": 0.464, + "step": 9067 + }, + { + "epoch": 5.065921787709497, + "grad_norm": 0.525516152381897, + "learning_rate": 0.0007490196078431372, + "loss": 0.4105, + "step": 9068 + }, + { + "epoch": 5.0664804469273745, + "grad_norm": 0.8416241407394409, + "learning_rate": 0.0007489915966386554, + "loss": 0.5063, + "step": 9069 + }, + { + "epoch": 5.067039106145251, + "grad_norm": 0.42118191719055176, + "learning_rate": 0.0007489635854341736, + "loss": 0.4002, + "step": 9070 + }, + { + "epoch": 5.067597765363129, + "grad_norm": 0.8004279732704163, + "learning_rate": 0.000748935574229692, + "loss": 0.4241, + "step": 9071 + }, + { + "epoch": 5.068156424581006, + "grad_norm": 0.7429008483886719, + "learning_rate": 0.0007489075630252102, + "loss": 0.5162, + "step": 9072 + }, + { + "epoch": 5.068715083798883, + "grad_norm": 0.7366108298301697, + "learning_rate": 0.0007488795518207284, + "loss": 0.4642, + "step": 9073 + }, + { + "epoch": 5.06927374301676, + "grad_norm": 0.8561540842056274, + "learning_rate": 0.0007488515406162465, + "loss": 0.664, + "step": 9074 + }, + { + "epoch": 5.069832402234637, + "grad_norm": 0.5409652590751648, + "learning_rate": 0.0007488235294117647, + "loss": 0.5168, + "step": 9075 + }, + { + "epoch": 5.070391061452514, + "grad_norm": 0.5133274793624878, + "learning_rate": 0.000748795518207283, + "loss": 0.4855, + "step": 9076 + }, + { + "epoch": 5.070949720670391, + "grad_norm": 0.8196202516555786, + "learning_rate": 0.0007487675070028012, + "loss": 0.4366, + "step": 9077 + }, + { + "epoch": 5.071508379888268, + "grad_norm": 0.5594240427017212, + "learning_rate": 0.0007487394957983194, + "loss": 0.4384, + "step": 9078 + }, + { + "epoch": 5.0720670391061455, + "grad_norm": 0.6306514143943787, + "learning_rate": 0.0007487114845938375, + "loss": 0.4009, + "step": 9079 + }, + { + "epoch": 5.072625698324022, + "grad_norm": 0.6437781453132629, + "learning_rate": 0.0007486834733893557, + "loss": 0.3712, + "step": 9080 + }, + { + "epoch": 5.0731843575419, + "grad_norm": 0.819551944732666, + "learning_rate": 0.000748655462184874, + "loss": 0.3228, + "step": 9081 + }, + { + "epoch": 5.073743016759776, + "grad_norm": 0.5984007120132446, + "learning_rate": 0.0007486274509803922, + "loss": 0.4926, + "step": 9082 + }, + { + "epoch": 5.074301675977654, + "grad_norm": 0.7875058650970459, + "learning_rate": 0.0007485994397759104, + "loss": 0.4012, + "step": 9083 + }, + { + "epoch": 5.074860335195531, + "grad_norm": 0.641601026058197, + "learning_rate": 0.0007485714285714285, + "loss": 0.4481, + "step": 9084 + }, + { + "epoch": 5.075418994413408, + "grad_norm": 0.5293326377868652, + "learning_rate": 0.0007485434173669467, + "loss": 0.4653, + "step": 9085 + }, + { + "epoch": 5.075977653631285, + "grad_norm": 0.6156439185142517, + "learning_rate": 0.000748515406162465, + "loss": 0.4449, + "step": 9086 + }, + { + "epoch": 5.076536312849162, + "grad_norm": 0.41242364048957825, + "learning_rate": 0.0007484873949579832, + "loss": 0.4783, + "step": 9087 + }, + { + "epoch": 5.077094972067039, + "grad_norm": 0.6608604788780212, + "learning_rate": 0.0007484593837535015, + "loss": 0.4942, + "step": 9088 + }, + { + "epoch": 5.0776536312849165, + "grad_norm": 1.0168522596359253, + "learning_rate": 0.0007484313725490197, + "loss": 0.3869, + "step": 9089 + }, + { + "epoch": 5.078212290502793, + "grad_norm": 0.8041546940803528, + "learning_rate": 0.0007484033613445378, + "loss": 0.5234, + "step": 9090 + }, + { + "epoch": 5.078770949720671, + "grad_norm": 0.6877681612968445, + "learning_rate": 0.0007483753501400561, + "loss": 0.4753, + "step": 9091 + }, + { + "epoch": 5.079329608938547, + "grad_norm": 0.6746410131454468, + "learning_rate": 0.0007483473389355743, + "loss": 0.5417, + "step": 9092 + }, + { + "epoch": 5.079888268156425, + "grad_norm": 1.3586100339889526, + "learning_rate": 0.0007483193277310925, + "loss": 0.537, + "step": 9093 + }, + { + "epoch": 5.080446927374302, + "grad_norm": 0.81231689453125, + "learning_rate": 0.0007482913165266107, + "loss": 0.4411, + "step": 9094 + }, + { + "epoch": 5.081005586592179, + "grad_norm": 1.9372198581695557, + "learning_rate": 0.0007482633053221288, + "loss": 0.551, + "step": 9095 + }, + { + "epoch": 5.081564245810056, + "grad_norm": 0.6222761273384094, + "learning_rate": 0.0007482352941176471, + "loss": 0.4393, + "step": 9096 + }, + { + "epoch": 5.082122905027933, + "grad_norm": 1.0141596794128418, + "learning_rate": 0.0007482072829131653, + "loss": 0.521, + "step": 9097 + }, + { + "epoch": 5.08268156424581, + "grad_norm": 0.7729804515838623, + "learning_rate": 0.0007481792717086835, + "loss": 0.5078, + "step": 9098 + }, + { + "epoch": 5.0832402234636875, + "grad_norm": 0.6606077551841736, + "learning_rate": 0.0007481512605042017, + "loss": 0.5209, + "step": 9099 + }, + { + "epoch": 5.083798882681564, + "grad_norm": 0.5069326758384705, + "learning_rate": 0.0007481232492997198, + "loss": 0.4772, + "step": 9100 + }, + { + "epoch": 5.084357541899442, + "grad_norm": 0.47148627042770386, + "learning_rate": 0.0007480952380952381, + "loss": 0.4758, + "step": 9101 + }, + { + "epoch": 5.084916201117318, + "grad_norm": 0.9009197950363159, + "learning_rate": 0.0007480672268907563, + "loss": 0.5149, + "step": 9102 + }, + { + "epoch": 5.085474860335196, + "grad_norm": 0.5535609722137451, + "learning_rate": 0.0007480392156862745, + "loss": 0.3996, + "step": 9103 + }, + { + "epoch": 5.086033519553073, + "grad_norm": 0.6379386782646179, + "learning_rate": 0.0007480112044817927, + "loss": 0.3911, + "step": 9104 + }, + { + "epoch": 5.08659217877095, + "grad_norm": 0.7699502110481262, + "learning_rate": 0.000747983193277311, + "loss": 0.517, + "step": 9105 + }, + { + "epoch": 5.087150837988827, + "grad_norm": 0.5233026742935181, + "learning_rate": 0.0007479551820728292, + "loss": 0.4707, + "step": 9106 + }, + { + "epoch": 5.087709497206704, + "grad_norm": 0.5945190191268921, + "learning_rate": 0.0007479271708683474, + "loss": 0.4372, + "step": 9107 + }, + { + "epoch": 5.088268156424581, + "grad_norm": 0.5146439075469971, + "learning_rate": 0.0007478991596638656, + "loss": 0.4505, + "step": 9108 + }, + { + "epoch": 5.0888268156424585, + "grad_norm": 0.4786728322505951, + "learning_rate": 0.0007478711484593838, + "loss": 0.4103, + "step": 9109 + }, + { + "epoch": 5.089385474860335, + "grad_norm": 0.4101194739341736, + "learning_rate": 0.000747843137254902, + "loss": 0.3519, + "step": 9110 + }, + { + "epoch": 5.089944134078213, + "grad_norm": 0.7371591925621033, + "learning_rate": 0.0007478151260504202, + "loss": 0.3725, + "step": 9111 + }, + { + "epoch": 5.090502793296089, + "grad_norm": 0.5282604694366455, + "learning_rate": 0.0007477871148459384, + "loss": 0.4296, + "step": 9112 + }, + { + "epoch": 5.091061452513967, + "grad_norm": 0.5157302618026733, + "learning_rate": 0.0007477591036414566, + "loss": 0.4006, + "step": 9113 + }, + { + "epoch": 5.091620111731843, + "grad_norm": 0.7631750106811523, + "learning_rate": 0.0007477310924369748, + "loss": 0.4982, + "step": 9114 + }, + { + "epoch": 5.092178770949721, + "grad_norm": 5.686546325683594, + "learning_rate": 0.000747703081232493, + "loss": 0.6268, + "step": 9115 + }, + { + "epoch": 5.092737430167598, + "grad_norm": 0.48100242018699646, + "learning_rate": 0.0007476750700280112, + "loss": 0.5106, + "step": 9116 + }, + { + "epoch": 5.093296089385475, + "grad_norm": 0.6259045600891113, + "learning_rate": 0.0007476470588235294, + "loss": 0.4049, + "step": 9117 + }, + { + "epoch": 5.093854748603352, + "grad_norm": 0.5095353722572327, + "learning_rate": 0.0007476190476190476, + "loss": 0.4413, + "step": 9118 + }, + { + "epoch": 5.094413407821229, + "grad_norm": 0.6002395749092102, + "learning_rate": 0.0007475910364145658, + "loss": 0.4494, + "step": 9119 + }, + { + "epoch": 5.094972067039106, + "grad_norm": 0.5260825157165527, + "learning_rate": 0.000747563025210084, + "loss": 0.4345, + "step": 9120 + }, + { + "epoch": 5.0955307262569836, + "grad_norm": 0.46637779474258423, + "learning_rate": 0.0007475350140056024, + "loss": 0.472, + "step": 9121 + }, + { + "epoch": 5.09608938547486, + "grad_norm": 0.5473801493644714, + "learning_rate": 0.0007475070028011205, + "loss": 0.5044, + "step": 9122 + }, + { + "epoch": 5.096648044692738, + "grad_norm": 0.9179593920707703, + "learning_rate": 0.0007474789915966387, + "loss": 0.3786, + "step": 9123 + }, + { + "epoch": 5.097206703910614, + "grad_norm": 1.1870479583740234, + "learning_rate": 0.0007474509803921569, + "loss": 0.4681, + "step": 9124 + }, + { + "epoch": 5.097765363128492, + "grad_norm": 0.8743658065795898, + "learning_rate": 0.0007474229691876751, + "loss": 0.4436, + "step": 9125 + }, + { + "epoch": 5.098324022346369, + "grad_norm": 0.6783514618873596, + "learning_rate": 0.0007473949579831934, + "loss": 0.4095, + "step": 9126 + }, + { + "epoch": 5.098882681564246, + "grad_norm": 0.38759341835975647, + "learning_rate": 0.0007473669467787115, + "loss": 0.3211, + "step": 9127 + }, + { + "epoch": 5.099441340782123, + "grad_norm": 0.5709186792373657, + "learning_rate": 0.0007473389355742297, + "loss": 0.5117, + "step": 9128 + }, + { + "epoch": 5.1, + "grad_norm": 2.3181209564208984, + "learning_rate": 0.0007473109243697479, + "loss": 0.4957, + "step": 9129 + }, + { + "epoch": 5.100558659217877, + "grad_norm": 0.5004379749298096, + "learning_rate": 0.0007472829131652661, + "loss": 0.4353, + "step": 9130 + }, + { + "epoch": 5.1011173184357546, + "grad_norm": 0.4079465866088867, + "learning_rate": 0.0007472549019607844, + "loss": 0.3752, + "step": 9131 + }, + { + "epoch": 5.101675977653631, + "grad_norm": 0.6083446145057678, + "learning_rate": 0.0007472268907563025, + "loss": 0.5024, + "step": 9132 + }, + { + "epoch": 5.102234636871509, + "grad_norm": 0.4655718207359314, + "learning_rate": 0.0007471988795518207, + "loss": 0.4567, + "step": 9133 + }, + { + "epoch": 5.102793296089385, + "grad_norm": 0.45232367515563965, + "learning_rate": 0.0007471708683473389, + "loss": 0.345, + "step": 9134 + }, + { + "epoch": 5.103351955307263, + "grad_norm": 0.9004889130592346, + "learning_rate": 0.0007471428571428571, + "loss": 0.4456, + "step": 9135 + }, + { + "epoch": 5.10391061452514, + "grad_norm": 0.7123329639434814, + "learning_rate": 0.0007471148459383754, + "loss": 0.4709, + "step": 9136 + }, + { + "epoch": 5.104469273743017, + "grad_norm": 0.6250120997428894, + "learning_rate": 0.0007470868347338937, + "loss": 0.4538, + "step": 9137 + }, + { + "epoch": 5.105027932960894, + "grad_norm": 0.38795205950737, + "learning_rate": 0.0007470588235294118, + "loss": 0.3503, + "step": 9138 + }, + { + "epoch": 5.105586592178771, + "grad_norm": 0.7008611559867859, + "learning_rate": 0.00074703081232493, + "loss": 0.3955, + "step": 9139 + }, + { + "epoch": 5.106145251396648, + "grad_norm": 0.5417254567146301, + "learning_rate": 0.0007470028011204482, + "loss": 0.4044, + "step": 9140 + }, + { + "epoch": 5.1067039106145256, + "grad_norm": 0.506192147731781, + "learning_rate": 0.0007469747899159665, + "loss": 0.484, + "step": 9141 + }, + { + "epoch": 5.107262569832402, + "grad_norm": 0.5177536010742188, + "learning_rate": 0.0007469467787114847, + "loss": 0.3632, + "step": 9142 + }, + { + "epoch": 5.10782122905028, + "grad_norm": 1.4445257186889648, + "learning_rate": 0.0007469187675070028, + "loss": 0.4051, + "step": 9143 + }, + { + "epoch": 5.108379888268156, + "grad_norm": 0.35983484983444214, + "learning_rate": 0.000746890756302521, + "loss": 0.3717, + "step": 9144 + }, + { + "epoch": 5.108938547486034, + "grad_norm": 0.5104691982269287, + "learning_rate": 0.0007468627450980392, + "loss": 0.54, + "step": 9145 + }, + { + "epoch": 5.10949720670391, + "grad_norm": 0.6074373126029968, + "learning_rate": 0.0007468347338935574, + "loss": 0.423, + "step": 9146 + }, + { + "epoch": 5.110055865921788, + "grad_norm": 0.5419580936431885, + "learning_rate": 0.0007468067226890757, + "loss": 0.6334, + "step": 9147 + }, + { + "epoch": 5.110614525139665, + "grad_norm": 0.6220030188560486, + "learning_rate": 0.0007467787114845938, + "loss": 0.4293, + "step": 9148 + }, + { + "epoch": 5.111173184357542, + "grad_norm": 0.8056634664535522, + "learning_rate": 0.000746750700280112, + "loss": 0.5729, + "step": 9149 + }, + { + "epoch": 5.111731843575419, + "grad_norm": 0.42025524377822876, + "learning_rate": 0.0007467226890756302, + "loss": 0.3801, + "step": 9150 + }, + { + "epoch": 5.112290502793296, + "grad_norm": 0.593021810054779, + "learning_rate": 0.0007466946778711484, + "loss": 0.4028, + "step": 9151 + }, + { + "epoch": 5.112849162011173, + "grad_norm": 0.48286619782447815, + "learning_rate": 0.0007466666666666667, + "loss": 0.4314, + "step": 9152 + }, + { + "epoch": 5.113407821229051, + "grad_norm": 0.47215911746025085, + "learning_rate": 0.000746638655462185, + "loss": 0.4138, + "step": 9153 + }, + { + "epoch": 5.113966480446927, + "grad_norm": 1.1260055303573608, + "learning_rate": 0.000746610644257703, + "loss": 0.4264, + "step": 9154 + }, + { + "epoch": 5.114525139664805, + "grad_norm": 0.5313096046447754, + "learning_rate": 0.0007465826330532213, + "loss": 0.5381, + "step": 9155 + }, + { + "epoch": 5.115083798882681, + "grad_norm": 0.4856265187263489, + "learning_rate": 0.0007465546218487395, + "loss": 0.5178, + "step": 9156 + }, + { + "epoch": 5.115642458100559, + "grad_norm": 0.39768341183662415, + "learning_rate": 0.0007465266106442578, + "loss": 0.4264, + "step": 9157 + }, + { + "epoch": 5.116201117318436, + "grad_norm": 0.3987128734588623, + "learning_rate": 0.000746498599439776, + "loss": 0.4354, + "step": 9158 + }, + { + "epoch": 5.116759776536313, + "grad_norm": 6.93054723739624, + "learning_rate": 0.0007464705882352941, + "loss": 0.5876, + "step": 9159 + }, + { + "epoch": 5.11731843575419, + "grad_norm": 0.6016881465911865, + "learning_rate": 0.0007464425770308123, + "loss": 0.517, + "step": 9160 + }, + { + "epoch": 5.117877094972067, + "grad_norm": 1.1399112939834595, + "learning_rate": 0.0007464145658263305, + "loss": 0.5397, + "step": 9161 + }, + { + "epoch": 5.118435754189944, + "grad_norm": 0.717146098613739, + "learning_rate": 0.0007463865546218488, + "loss": 0.497, + "step": 9162 + }, + { + "epoch": 5.118994413407822, + "grad_norm": 0.46363526582717896, + "learning_rate": 0.000746358543417367, + "loss": 0.5519, + "step": 9163 + }, + { + "epoch": 5.119553072625698, + "grad_norm": 0.5593603253364563, + "learning_rate": 0.0007463305322128851, + "loss": 0.3576, + "step": 9164 + }, + { + "epoch": 5.120111731843576, + "grad_norm": 0.4428239166736603, + "learning_rate": 0.0007463025210084033, + "loss": 0.5007, + "step": 9165 + }, + { + "epoch": 5.120670391061452, + "grad_norm": 0.5615933537483215, + "learning_rate": 0.0007462745098039215, + "loss": 0.4523, + "step": 9166 + }, + { + "epoch": 5.12122905027933, + "grad_norm": 0.5217528939247131, + "learning_rate": 0.0007462464985994398, + "loss": 0.4572, + "step": 9167 + }, + { + "epoch": 5.121787709497207, + "grad_norm": 0.4537884294986725, + "learning_rate": 0.000746218487394958, + "loss": 0.499, + "step": 9168 + }, + { + "epoch": 5.122346368715084, + "grad_norm": 0.5292918086051941, + "learning_rate": 0.0007461904761904762, + "loss": 0.5083, + "step": 9169 + }, + { + "epoch": 5.122905027932961, + "grad_norm": 1.3333250284194946, + "learning_rate": 0.0007461624649859943, + "loss": 0.5122, + "step": 9170 + }, + { + "epoch": 5.123463687150838, + "grad_norm": 0.532231867313385, + "learning_rate": 0.0007461344537815126, + "loss": 0.4038, + "step": 9171 + }, + { + "epoch": 5.124022346368715, + "grad_norm": 0.5123986005783081, + "learning_rate": 0.0007461064425770309, + "loss": 0.4967, + "step": 9172 + }, + { + "epoch": 5.124581005586593, + "grad_norm": 0.740041196346283, + "learning_rate": 0.0007460784313725491, + "loss": 0.5546, + "step": 9173 + }, + { + "epoch": 5.125139664804469, + "grad_norm": 0.5402716994285583, + "learning_rate": 0.0007460504201680673, + "loss": 0.4548, + "step": 9174 + }, + { + "epoch": 5.125698324022347, + "grad_norm": 0.5447051525115967, + "learning_rate": 0.0007460224089635854, + "loss": 0.5365, + "step": 9175 + }, + { + "epoch": 5.126256983240223, + "grad_norm": 0.4971652626991272, + "learning_rate": 0.0007459943977591036, + "loss": 0.3755, + "step": 9176 + }, + { + "epoch": 5.126815642458101, + "grad_norm": 0.5869438052177429, + "learning_rate": 0.0007459663865546219, + "loss": 0.4202, + "step": 9177 + }, + { + "epoch": 5.127374301675978, + "grad_norm": 12.306452751159668, + "learning_rate": 0.0007459383753501401, + "loss": 0.5314, + "step": 9178 + }, + { + "epoch": 5.127932960893855, + "grad_norm": 0.6971128582954407, + "learning_rate": 0.0007459103641456583, + "loss": 0.7208, + "step": 9179 + }, + { + "epoch": 5.128491620111732, + "grad_norm": 1.1146947145462036, + "learning_rate": 0.0007458823529411764, + "loss": 0.4351, + "step": 9180 + }, + { + "epoch": 5.129050279329609, + "grad_norm": 0.7231548428535461, + "learning_rate": 0.0007458543417366946, + "loss": 0.5181, + "step": 9181 + }, + { + "epoch": 5.129608938547486, + "grad_norm": 0.46546879410743713, + "learning_rate": 0.0007458263305322129, + "loss": 0.4337, + "step": 9182 + }, + { + "epoch": 5.130167597765363, + "grad_norm": 0.529313325881958, + "learning_rate": 0.0007457983193277311, + "loss": 0.3499, + "step": 9183 + }, + { + "epoch": 5.13072625698324, + "grad_norm": 1.1186801195144653, + "learning_rate": 0.0007457703081232493, + "loss": 0.4444, + "step": 9184 + }, + { + "epoch": 5.131284916201118, + "grad_norm": 0.5682976841926575, + "learning_rate": 0.0007457422969187675, + "loss": 0.3459, + "step": 9185 + }, + { + "epoch": 5.131843575418994, + "grad_norm": 0.5462518930435181, + "learning_rate": 0.0007457142857142856, + "loss": 0.4414, + "step": 9186 + }, + { + "epoch": 5.132402234636872, + "grad_norm": 1.2040019035339355, + "learning_rate": 0.000745686274509804, + "loss": 0.5522, + "step": 9187 + }, + { + "epoch": 5.132960893854748, + "grad_norm": 0.5593796372413635, + "learning_rate": 0.0007456582633053222, + "loss": 0.3806, + "step": 9188 + }, + { + "epoch": 5.133519553072626, + "grad_norm": 0.44086018204689026, + "learning_rate": 0.0007456302521008404, + "loss": 0.3796, + "step": 9189 + }, + { + "epoch": 5.134078212290503, + "grad_norm": 0.6522893905639648, + "learning_rate": 0.0007456022408963586, + "loss": 0.4245, + "step": 9190 + }, + { + "epoch": 5.13463687150838, + "grad_norm": 0.6190874576568604, + "learning_rate": 0.0007455742296918767, + "loss": 0.4861, + "step": 9191 + }, + { + "epoch": 5.135195530726257, + "grad_norm": 0.7108556628227234, + "learning_rate": 0.000745546218487395, + "loss": 0.3839, + "step": 9192 + }, + { + "epoch": 5.135754189944134, + "grad_norm": 0.5619034767150879, + "learning_rate": 0.0007455182072829132, + "loss": 0.5357, + "step": 9193 + }, + { + "epoch": 5.136312849162011, + "grad_norm": 1.3583414554595947, + "learning_rate": 0.0007454901960784314, + "loss": 0.4291, + "step": 9194 + }, + { + "epoch": 5.136871508379889, + "grad_norm": 0.5836233496665955, + "learning_rate": 0.0007454621848739496, + "loss": 0.5288, + "step": 9195 + }, + { + "epoch": 5.137430167597765, + "grad_norm": 0.390178382396698, + "learning_rate": 0.0007454341736694677, + "loss": 0.3994, + "step": 9196 + }, + { + "epoch": 5.137988826815643, + "grad_norm": 0.5170450806617737, + "learning_rate": 0.000745406162464986, + "loss": 0.4195, + "step": 9197 + }, + { + "epoch": 5.138547486033519, + "grad_norm": 0.8974315524101257, + "learning_rate": 0.0007453781512605042, + "loss": 0.5007, + "step": 9198 + }, + { + "epoch": 5.139106145251397, + "grad_norm": 0.5151286125183105, + "learning_rate": 0.0007453501400560224, + "loss": 0.5086, + "step": 9199 + }, + { + "epoch": 5.139664804469274, + "grad_norm": 0.393502801656723, + "learning_rate": 0.0007453221288515406, + "loss": 0.3409, + "step": 9200 + }, + { + "epoch": 5.140223463687151, + "grad_norm": 0.45447811484336853, + "learning_rate": 0.0007452941176470588, + "loss": 0.3762, + "step": 9201 + }, + { + "epoch": 5.140782122905028, + "grad_norm": 0.40816089510917664, + "learning_rate": 0.000745266106442577, + "loss": 0.5043, + "step": 9202 + }, + { + "epoch": 5.141340782122905, + "grad_norm": 0.5953418016433716, + "learning_rate": 0.0007452380952380953, + "loss": 0.5736, + "step": 9203 + }, + { + "epoch": 5.141899441340782, + "grad_norm": 4.842398643493652, + "learning_rate": 0.0007452100840336135, + "loss": 0.521, + "step": 9204 + }, + { + "epoch": 5.14245810055866, + "grad_norm": 0.7641180753707886, + "learning_rate": 0.0007451820728291317, + "loss": 0.5186, + "step": 9205 + }, + { + "epoch": 5.143016759776536, + "grad_norm": 0.6450425982475281, + "learning_rate": 0.0007451540616246499, + "loss": 0.4421, + "step": 9206 + }, + { + "epoch": 5.143575418994414, + "grad_norm": 0.4553753733634949, + "learning_rate": 0.0007451260504201681, + "loss": 0.4423, + "step": 9207 + }, + { + "epoch": 5.14413407821229, + "grad_norm": 1.2479188442230225, + "learning_rate": 0.0007450980392156863, + "loss": 0.4086, + "step": 9208 + }, + { + "epoch": 5.144692737430168, + "grad_norm": 0.5659211277961731, + "learning_rate": 0.0007450700280112045, + "loss": 0.5707, + "step": 9209 + }, + { + "epoch": 5.145251396648045, + "grad_norm": 0.5723447799682617, + "learning_rate": 0.0007450420168067227, + "loss": 0.3486, + "step": 9210 + }, + { + "epoch": 5.145810055865922, + "grad_norm": 0.875298023223877, + "learning_rate": 0.0007450140056022409, + "loss": 0.5293, + "step": 9211 + }, + { + "epoch": 5.146368715083799, + "grad_norm": 1.6071817874908447, + "learning_rate": 0.0007449859943977591, + "loss": 0.5443, + "step": 9212 + }, + { + "epoch": 5.146927374301676, + "grad_norm": 1.6044070720672607, + "learning_rate": 0.0007449579831932773, + "loss": 0.4786, + "step": 9213 + }, + { + "epoch": 5.147486033519553, + "grad_norm": 0.5789911150932312, + "learning_rate": 0.0007449299719887955, + "loss": 0.5145, + "step": 9214 + }, + { + "epoch": 5.148044692737431, + "grad_norm": 0.7926276922225952, + "learning_rate": 0.0007449019607843137, + "loss": 0.4674, + "step": 9215 + }, + { + "epoch": 5.148603351955307, + "grad_norm": 1.0170307159423828, + "learning_rate": 0.0007448739495798319, + "loss": 0.7411, + "step": 9216 + }, + { + "epoch": 5.149162011173185, + "grad_norm": 0.9485819935798645, + "learning_rate": 0.0007448459383753502, + "loss": 0.5215, + "step": 9217 + }, + { + "epoch": 5.149720670391061, + "grad_norm": 0.47148680686950684, + "learning_rate": 0.0007448179271708683, + "loss": 0.4163, + "step": 9218 + }, + { + "epoch": 5.150279329608939, + "grad_norm": 0.6326032280921936, + "learning_rate": 0.0007447899159663865, + "loss": 0.4939, + "step": 9219 + }, + { + "epoch": 5.150837988826815, + "grad_norm": 0.5185278654098511, + "learning_rate": 0.0007447619047619048, + "loss": 0.3868, + "step": 9220 + }, + { + "epoch": 5.151396648044693, + "grad_norm": 0.6758152842521667, + "learning_rate": 0.000744733893557423, + "loss": 0.4409, + "step": 9221 + }, + { + "epoch": 5.15195530726257, + "grad_norm": 0.9032146334648132, + "learning_rate": 0.0007447058823529413, + "loss": 0.5934, + "step": 9222 + }, + { + "epoch": 5.152513966480447, + "grad_norm": 0.615212082862854, + "learning_rate": 0.0007446778711484594, + "loss": 0.389, + "step": 9223 + }, + { + "epoch": 5.153072625698324, + "grad_norm": 0.47535285353660583, + "learning_rate": 0.0007446498599439776, + "loss": 0.4048, + "step": 9224 + }, + { + "epoch": 5.153631284916201, + "grad_norm": 0.5904978513717651, + "learning_rate": 0.0007446218487394958, + "loss": 0.5017, + "step": 9225 + }, + { + "epoch": 5.154189944134078, + "grad_norm": 0.5763953328132629, + "learning_rate": 0.000744593837535014, + "loss": 0.4708, + "step": 9226 + }, + { + "epoch": 5.154748603351956, + "grad_norm": 0.8433234691619873, + "learning_rate": 0.0007445658263305323, + "loss": 0.453, + "step": 9227 + }, + { + "epoch": 5.155307262569832, + "grad_norm": 2.287301778793335, + "learning_rate": 0.0007445378151260504, + "loss": 0.4227, + "step": 9228 + }, + { + "epoch": 5.15586592178771, + "grad_norm": 0.948485791683197, + "learning_rate": 0.0007445098039215686, + "loss": 0.4887, + "step": 9229 + }, + { + "epoch": 5.156424581005586, + "grad_norm": 0.544860303401947, + "learning_rate": 0.0007444817927170868, + "loss": 0.5745, + "step": 9230 + }, + { + "epoch": 5.156983240223464, + "grad_norm": 0.6630926728248596, + "learning_rate": 0.000744453781512605, + "loss": 0.6264, + "step": 9231 + }, + { + "epoch": 5.157541899441341, + "grad_norm": 0.6315627694129944, + "learning_rate": 0.0007444257703081233, + "loss": 0.3774, + "step": 9232 + }, + { + "epoch": 5.158100558659218, + "grad_norm": 0.703891932964325, + "learning_rate": 0.0007443977591036415, + "loss": 0.4564, + "step": 9233 + }, + { + "epoch": 5.158659217877095, + "grad_norm": 1.1930180788040161, + "learning_rate": 0.0007443697478991596, + "loss": 0.4812, + "step": 9234 + }, + { + "epoch": 5.159217877094972, + "grad_norm": 0.5696471929550171, + "learning_rate": 0.0007443417366946778, + "loss": 0.482, + "step": 9235 + }, + { + "epoch": 5.159776536312849, + "grad_norm": 1.9199336767196655, + "learning_rate": 0.000744313725490196, + "loss": 0.4004, + "step": 9236 + }, + { + "epoch": 5.160335195530727, + "grad_norm": 1.5826082229614258, + "learning_rate": 0.0007442857142857144, + "loss": 0.4288, + "step": 9237 + }, + { + "epoch": 5.160893854748603, + "grad_norm": 0.9928046464920044, + "learning_rate": 0.0007442577030812326, + "loss": 0.7165, + "step": 9238 + }, + { + "epoch": 5.161452513966481, + "grad_norm": 0.5731468200683594, + "learning_rate": 0.0007442296918767507, + "loss": 0.4374, + "step": 9239 + }, + { + "epoch": 5.162011173184357, + "grad_norm": 0.7084184288978577, + "learning_rate": 0.0007442016806722689, + "loss": 0.4329, + "step": 9240 + }, + { + "epoch": 5.162569832402235, + "grad_norm": 0.5002947449684143, + "learning_rate": 0.0007441736694677871, + "loss": 0.3726, + "step": 9241 + }, + { + "epoch": 5.163128491620112, + "grad_norm": 0.4340097904205322, + "learning_rate": 0.0007441456582633054, + "loss": 0.485, + "step": 9242 + }, + { + "epoch": 5.163687150837989, + "grad_norm": 0.8195523023605347, + "learning_rate": 0.0007441176470588236, + "loss": 0.501, + "step": 9243 + }, + { + "epoch": 5.164245810055866, + "grad_norm": 0.5582486987113953, + "learning_rate": 0.0007440896358543417, + "loss": 0.5458, + "step": 9244 + }, + { + "epoch": 5.164804469273743, + "grad_norm": 0.8311848640441895, + "learning_rate": 0.0007440616246498599, + "loss": 0.4329, + "step": 9245 + }, + { + "epoch": 5.16536312849162, + "grad_norm": 0.5118499398231506, + "learning_rate": 0.0007440336134453781, + "loss": 0.4444, + "step": 9246 + }, + { + "epoch": 5.165921787709498, + "grad_norm": 0.39821428060531616, + "learning_rate": 0.0007440056022408964, + "loss": 0.4042, + "step": 9247 + }, + { + "epoch": 5.166480446927374, + "grad_norm": 0.590999960899353, + "learning_rate": 0.0007439775910364146, + "loss": 0.3764, + "step": 9248 + }, + { + "epoch": 5.167039106145252, + "grad_norm": 0.6888015866279602, + "learning_rate": 0.0007439495798319328, + "loss": 0.4367, + "step": 9249 + }, + { + "epoch": 5.167597765363128, + "grad_norm": 0.43305695056915283, + "learning_rate": 0.0007439215686274509, + "loss": 0.4577, + "step": 9250 + }, + { + "epoch": 5.168156424581006, + "grad_norm": 0.47872641682624817, + "learning_rate": 0.0007438935574229691, + "loss": 0.4112, + "step": 9251 + }, + { + "epoch": 5.168715083798883, + "grad_norm": 0.5339536666870117, + "learning_rate": 0.0007438655462184875, + "loss": 0.4605, + "step": 9252 + }, + { + "epoch": 5.16927374301676, + "grad_norm": 0.8600516319274902, + "learning_rate": 0.0007438375350140057, + "loss": 0.4863, + "step": 9253 + }, + { + "epoch": 5.169832402234637, + "grad_norm": 1.5130808353424072, + "learning_rate": 0.0007438095238095239, + "loss": 0.4264, + "step": 9254 + }, + { + "epoch": 5.170391061452514, + "grad_norm": 1.0064526796340942, + "learning_rate": 0.000743781512605042, + "loss": 0.4165, + "step": 9255 + }, + { + "epoch": 5.170949720670391, + "grad_norm": 0.9091895818710327, + "learning_rate": 0.0007437535014005602, + "loss": 0.6808, + "step": 9256 + }, + { + "epoch": 5.171508379888268, + "grad_norm": 0.7874834537506104, + "learning_rate": 0.0007437254901960785, + "loss": 0.4314, + "step": 9257 + }, + { + "epoch": 5.172067039106145, + "grad_norm": 0.49575164914131165, + "learning_rate": 0.0007436974789915967, + "loss": 0.4311, + "step": 9258 + }, + { + "epoch": 5.172625698324023, + "grad_norm": 0.41619476675987244, + "learning_rate": 0.0007436694677871149, + "loss": 0.3942, + "step": 9259 + }, + { + "epoch": 5.173184357541899, + "grad_norm": 0.3923790752887726, + "learning_rate": 0.000743641456582633, + "loss": 0.3437, + "step": 9260 + }, + { + "epoch": 5.173743016759777, + "grad_norm": 0.4505344331264496, + "learning_rate": 0.0007436134453781512, + "loss": 0.3975, + "step": 9261 + }, + { + "epoch": 5.174301675977653, + "grad_norm": 0.41459575295448303, + "learning_rate": 0.0007435854341736695, + "loss": 0.4426, + "step": 9262 + }, + { + "epoch": 5.174860335195531, + "grad_norm": 0.41789692640304565, + "learning_rate": 0.0007435574229691877, + "loss": 0.4668, + "step": 9263 + }, + { + "epoch": 5.175418994413408, + "grad_norm": 0.6984832286834717, + "learning_rate": 0.0007435294117647059, + "loss": 0.3303, + "step": 9264 + }, + { + "epoch": 5.175977653631285, + "grad_norm": 0.5196592211723328, + "learning_rate": 0.0007435014005602241, + "loss": 0.4269, + "step": 9265 + }, + { + "epoch": 5.176536312849162, + "grad_norm": 0.5434472560882568, + "learning_rate": 0.0007434733893557422, + "loss": 0.3435, + "step": 9266 + }, + { + "epoch": 5.177094972067039, + "grad_norm": 0.41812241077423096, + "learning_rate": 0.0007434453781512605, + "loss": 0.3234, + "step": 9267 + }, + { + "epoch": 5.177653631284916, + "grad_norm": 0.40646231174468994, + "learning_rate": 0.0007434173669467787, + "loss": 0.4378, + "step": 9268 + }, + { + "epoch": 5.178212290502794, + "grad_norm": 0.4325316250324249, + "learning_rate": 0.000743389355742297, + "loss": 0.4799, + "step": 9269 + }, + { + "epoch": 5.17877094972067, + "grad_norm": 0.36815252900123596, + "learning_rate": 0.0007433613445378152, + "loss": 0.4526, + "step": 9270 + }, + { + "epoch": 5.179329608938548, + "grad_norm": 0.41733407974243164, + "learning_rate": 0.0007433333333333333, + "loss": 0.4701, + "step": 9271 + }, + { + "epoch": 5.179888268156424, + "grad_norm": 0.5900431871414185, + "learning_rate": 0.0007433053221288516, + "loss": 0.4061, + "step": 9272 + }, + { + "epoch": 5.180446927374302, + "grad_norm": 1.3708845376968384, + "learning_rate": 0.0007432773109243698, + "loss": 0.4489, + "step": 9273 + }, + { + "epoch": 5.181005586592179, + "grad_norm": 0.4360641837120056, + "learning_rate": 0.000743249299719888, + "loss": 0.3557, + "step": 9274 + }, + { + "epoch": 5.181564245810056, + "grad_norm": 0.6780818104743958, + "learning_rate": 0.0007432212885154062, + "loss": 0.4028, + "step": 9275 + }, + { + "epoch": 5.182122905027933, + "grad_norm": 0.9363614320755005, + "learning_rate": 0.0007431932773109243, + "loss": 0.5742, + "step": 9276 + }, + { + "epoch": 5.18268156424581, + "grad_norm": 3.5284266471862793, + "learning_rate": 0.0007431652661064426, + "loss": 0.3808, + "step": 9277 + }, + { + "epoch": 5.183240223463687, + "grad_norm": 0.574120283126831, + "learning_rate": 0.0007431372549019608, + "loss": 0.4559, + "step": 9278 + }, + { + "epoch": 5.183798882681565, + "grad_norm": 0.7071759104728699, + "learning_rate": 0.000743109243697479, + "loss": 0.5033, + "step": 9279 + }, + { + "epoch": 5.184357541899441, + "grad_norm": 0.49550479650497437, + "learning_rate": 0.0007430812324929972, + "loss": 0.5458, + "step": 9280 + }, + { + "epoch": 5.184916201117319, + "grad_norm": 0.5007463693618774, + "learning_rate": 0.0007430532212885154, + "loss": 0.5074, + "step": 9281 + }, + { + "epoch": 5.185474860335195, + "grad_norm": 1.6928448677062988, + "learning_rate": 0.0007430252100840336, + "loss": 0.3536, + "step": 9282 + }, + { + "epoch": 5.186033519553073, + "grad_norm": 1.4974571466445923, + "learning_rate": 0.0007429971988795518, + "loss": 0.4392, + "step": 9283 + }, + { + "epoch": 5.18659217877095, + "grad_norm": 8.268061637878418, + "learning_rate": 0.00074296918767507, + "loss": 0.388, + "step": 9284 + }, + { + "epoch": 5.187150837988827, + "grad_norm": 2.442584991455078, + "learning_rate": 0.0007429411764705883, + "loss": 0.4953, + "step": 9285 + }, + { + "epoch": 5.187709497206704, + "grad_norm": 2.018022298812866, + "learning_rate": 0.0007429131652661065, + "loss": 0.3898, + "step": 9286 + }, + { + "epoch": 5.188268156424581, + "grad_norm": 0.5392963290214539, + "learning_rate": 0.0007428851540616247, + "loss": 0.5185, + "step": 9287 + }, + { + "epoch": 5.188826815642458, + "grad_norm": 0.5354064702987671, + "learning_rate": 0.0007428571428571429, + "loss": 0.447, + "step": 9288 + }, + { + "epoch": 5.189385474860336, + "grad_norm": 0.6769338250160217, + "learning_rate": 0.0007428291316526611, + "loss": 0.5134, + "step": 9289 + }, + { + "epoch": 5.189944134078212, + "grad_norm": 0.5994898676872253, + "learning_rate": 0.0007428011204481793, + "loss": 0.5327, + "step": 9290 + }, + { + "epoch": 5.19050279329609, + "grad_norm": 1.0860724449157715, + "learning_rate": 0.0007427731092436975, + "loss": 0.5241, + "step": 9291 + }, + { + "epoch": 5.191061452513966, + "grad_norm": 0.4479406177997589, + "learning_rate": 0.0007427450980392158, + "loss": 0.4274, + "step": 9292 + }, + { + "epoch": 5.191620111731844, + "grad_norm": 0.6470661759376526, + "learning_rate": 0.0007427170868347339, + "loss": 0.3776, + "step": 9293 + }, + { + "epoch": 5.19217877094972, + "grad_norm": 0.5875535011291504, + "learning_rate": 0.0007426890756302521, + "loss": 0.5605, + "step": 9294 + }, + { + "epoch": 5.192737430167598, + "grad_norm": 0.6037549376487732, + "learning_rate": 0.0007426610644257703, + "loss": 0.3953, + "step": 9295 + }, + { + "epoch": 5.193296089385475, + "grad_norm": 0.5385110378265381, + "learning_rate": 0.0007426330532212885, + "loss": 0.3951, + "step": 9296 + }, + { + "epoch": 5.193854748603352, + "grad_norm": 0.518102765083313, + "learning_rate": 0.0007426050420168068, + "loss": 0.4269, + "step": 9297 + }, + { + "epoch": 5.194413407821229, + "grad_norm": 0.7049671411514282, + "learning_rate": 0.0007425770308123249, + "loss": 0.553, + "step": 9298 + }, + { + "epoch": 5.194972067039106, + "grad_norm": 0.3856646418571472, + "learning_rate": 0.0007425490196078431, + "loss": 0.4273, + "step": 9299 + }, + { + "epoch": 5.195530726256983, + "grad_norm": 0.8590890169143677, + "learning_rate": 0.0007425210084033613, + "loss": 0.6444, + "step": 9300 + }, + { + "epoch": 5.196089385474861, + "grad_norm": 0.5261777639389038, + "learning_rate": 0.0007424929971988795, + "loss": 0.3433, + "step": 9301 + }, + { + "epoch": 5.196648044692737, + "grad_norm": 0.4589046239852905, + "learning_rate": 0.0007424649859943979, + "loss": 0.4006, + "step": 9302 + }, + { + "epoch": 5.197206703910615, + "grad_norm": 0.5377180576324463, + "learning_rate": 0.000742436974789916, + "loss": 0.4386, + "step": 9303 + }, + { + "epoch": 5.197765363128491, + "grad_norm": 0.5259334444999695, + "learning_rate": 0.0007424089635854342, + "loss": 0.544, + "step": 9304 + }, + { + "epoch": 5.198324022346369, + "grad_norm": 0.46482986211776733, + "learning_rate": 0.0007423809523809524, + "loss": 0.5363, + "step": 9305 + }, + { + "epoch": 5.198882681564246, + "grad_norm": 0.8501517176628113, + "learning_rate": 0.0007423529411764706, + "loss": 0.4826, + "step": 9306 + }, + { + "epoch": 5.199441340782123, + "grad_norm": 0.6368624567985535, + "learning_rate": 0.0007423249299719889, + "loss": 0.46, + "step": 9307 + }, + { + "epoch": 5.2, + "grad_norm": 0.4156450033187866, + "learning_rate": 0.0007422969187675071, + "loss": 0.3231, + "step": 9308 + }, + { + "epoch": 5.200558659217877, + "grad_norm": 0.5898316502571106, + "learning_rate": 0.0007422689075630252, + "loss": 0.4461, + "step": 9309 + }, + { + "epoch": 5.201117318435754, + "grad_norm": 1.5258007049560547, + "learning_rate": 0.0007422408963585434, + "loss": 0.5537, + "step": 9310 + }, + { + "epoch": 5.201675977653632, + "grad_norm": 0.4372830092906952, + "learning_rate": 0.0007422128851540616, + "loss": 0.3891, + "step": 9311 + }, + { + "epoch": 5.202234636871508, + "grad_norm": 0.5006040930747986, + "learning_rate": 0.0007421848739495799, + "loss": 0.4505, + "step": 9312 + }, + { + "epoch": 5.202793296089386, + "grad_norm": 0.43406760692596436, + "learning_rate": 0.0007421568627450981, + "loss": 0.4142, + "step": 9313 + }, + { + "epoch": 5.203351955307262, + "grad_norm": 0.4532420337200165, + "learning_rate": 0.0007421288515406162, + "loss": 0.367, + "step": 9314 + }, + { + "epoch": 5.20391061452514, + "grad_norm": 0.7268055081367493, + "learning_rate": 0.0007421008403361344, + "loss": 0.485, + "step": 9315 + }, + { + "epoch": 5.204469273743017, + "grad_norm": 0.6865625977516174, + "learning_rate": 0.0007420728291316526, + "loss": 0.4381, + "step": 9316 + }, + { + "epoch": 5.205027932960894, + "grad_norm": 0.3839685320854187, + "learning_rate": 0.000742044817927171, + "loss": 0.4183, + "step": 9317 + }, + { + "epoch": 5.205586592178771, + "grad_norm": 0.6105719208717346, + "learning_rate": 0.0007420168067226892, + "loss": 0.5089, + "step": 9318 + }, + { + "epoch": 5.206145251396648, + "grad_norm": 0.4529527723789215, + "learning_rate": 0.0007419887955182073, + "loss": 0.4367, + "step": 9319 + }, + { + "epoch": 5.206703910614525, + "grad_norm": 0.4352875351905823, + "learning_rate": 0.0007419607843137255, + "loss": 0.4493, + "step": 9320 + }, + { + "epoch": 5.207262569832403, + "grad_norm": 0.5288578867912292, + "learning_rate": 0.0007419327731092437, + "loss": 0.5192, + "step": 9321 + }, + { + "epoch": 5.207821229050279, + "grad_norm": 0.6562674045562744, + "learning_rate": 0.000741904761904762, + "loss": 0.4628, + "step": 9322 + }, + { + "epoch": 5.208379888268157, + "grad_norm": 0.7749854922294617, + "learning_rate": 0.0007418767507002802, + "loss": 0.4132, + "step": 9323 + }, + { + "epoch": 5.208938547486033, + "grad_norm": 0.665708065032959, + "learning_rate": 0.0007418487394957984, + "loss": 0.4753, + "step": 9324 + }, + { + "epoch": 5.209497206703911, + "grad_norm": 0.717275083065033, + "learning_rate": 0.0007418207282913165, + "loss": 0.5044, + "step": 9325 + }, + { + "epoch": 5.210055865921788, + "grad_norm": 0.9967623353004456, + "learning_rate": 0.0007417927170868347, + "loss": 0.3689, + "step": 9326 + }, + { + "epoch": 5.210614525139665, + "grad_norm": 0.544273853302002, + "learning_rate": 0.000741764705882353, + "loss": 0.4247, + "step": 9327 + }, + { + "epoch": 5.211173184357542, + "grad_norm": 0.4892696440219879, + "learning_rate": 0.0007417366946778712, + "loss": 0.5132, + "step": 9328 + }, + { + "epoch": 5.211731843575419, + "grad_norm": 0.6195788979530334, + "learning_rate": 0.0007417086834733894, + "loss": 0.492, + "step": 9329 + }, + { + "epoch": 5.212290502793296, + "grad_norm": 0.5596709251403809, + "learning_rate": 0.0007416806722689075, + "loss": 0.4446, + "step": 9330 + }, + { + "epoch": 5.212849162011173, + "grad_norm": 1.2067221403121948, + "learning_rate": 0.0007416526610644257, + "loss": 0.5659, + "step": 9331 + }, + { + "epoch": 5.21340782122905, + "grad_norm": 0.5361933708190918, + "learning_rate": 0.000741624649859944, + "loss": 0.4487, + "step": 9332 + }, + { + "epoch": 5.213966480446928, + "grad_norm": 0.46188193559646606, + "learning_rate": 0.0007415966386554622, + "loss": 0.4042, + "step": 9333 + }, + { + "epoch": 5.214525139664804, + "grad_norm": 0.9497390985488892, + "learning_rate": 0.0007415686274509805, + "loss": 0.4782, + "step": 9334 + }, + { + "epoch": 5.215083798882682, + "grad_norm": 0.5016930103302002, + "learning_rate": 0.0007415406162464986, + "loss": 0.4773, + "step": 9335 + }, + { + "epoch": 5.215642458100558, + "grad_norm": 1.062241792678833, + "learning_rate": 0.0007415126050420168, + "loss": 0.3903, + "step": 9336 + }, + { + "epoch": 5.216201117318436, + "grad_norm": 0.5524981617927551, + "learning_rate": 0.0007414845938375351, + "loss": 0.4973, + "step": 9337 + }, + { + "epoch": 5.216759776536313, + "grad_norm": 0.6969456672668457, + "learning_rate": 0.0007414565826330533, + "loss": 0.4872, + "step": 9338 + }, + { + "epoch": 5.21731843575419, + "grad_norm": 1.7280601263046265, + "learning_rate": 0.0007414285714285715, + "loss": 0.416, + "step": 9339 + }, + { + "epoch": 5.217877094972067, + "grad_norm": 0.5588036775588989, + "learning_rate": 0.0007414005602240897, + "loss": 0.4818, + "step": 9340 + }, + { + "epoch": 5.218435754189944, + "grad_norm": 1.1933562755584717, + "learning_rate": 0.0007413725490196078, + "loss": 0.3757, + "step": 9341 + }, + { + "epoch": 5.218994413407821, + "grad_norm": 0.6037487983703613, + "learning_rate": 0.0007413445378151261, + "loss": 0.6115, + "step": 9342 + }, + { + "epoch": 5.219553072625699, + "grad_norm": 0.4822850525379181, + "learning_rate": 0.0007413165266106443, + "loss": 0.3554, + "step": 9343 + }, + { + "epoch": 5.220111731843575, + "grad_norm": 0.5273537635803223, + "learning_rate": 0.0007412885154061625, + "loss": 0.5025, + "step": 9344 + }, + { + "epoch": 5.220670391061453, + "grad_norm": 0.5153448581695557, + "learning_rate": 0.0007412605042016807, + "loss": 0.4439, + "step": 9345 + }, + { + "epoch": 5.221229050279329, + "grad_norm": 0.3843933939933777, + "learning_rate": 0.0007412324929971988, + "loss": 0.3665, + "step": 9346 + }, + { + "epoch": 5.221787709497207, + "grad_norm": 0.525409460067749, + "learning_rate": 0.0007412044817927171, + "loss": 0.4527, + "step": 9347 + }, + { + "epoch": 5.222346368715084, + "grad_norm": 0.34481489658355713, + "learning_rate": 0.0007411764705882353, + "loss": 0.521, + "step": 9348 + }, + { + "epoch": 5.222905027932961, + "grad_norm": 0.37644752860069275, + "learning_rate": 0.0007411484593837535, + "loss": 0.3723, + "step": 9349 + }, + { + "epoch": 5.223463687150838, + "grad_norm": 0.4853006601333618, + "learning_rate": 0.0007411204481792717, + "loss": 0.5253, + "step": 9350 + }, + { + "epoch": 5.224022346368715, + "grad_norm": 0.6095213294029236, + "learning_rate": 0.0007410924369747898, + "loss": 0.4466, + "step": 9351 + }, + { + "epoch": 5.224581005586592, + "grad_norm": 0.41500964760780334, + "learning_rate": 0.0007410644257703082, + "loss": 0.3549, + "step": 9352 + }, + { + "epoch": 5.22513966480447, + "grad_norm": 1.1184957027435303, + "learning_rate": 0.0007410364145658264, + "loss": 0.3494, + "step": 9353 + }, + { + "epoch": 5.225698324022346, + "grad_norm": 0.7289383411407471, + "learning_rate": 0.0007410084033613446, + "loss": 0.5212, + "step": 9354 + }, + { + "epoch": 5.226256983240224, + "grad_norm": 0.44477397203445435, + "learning_rate": 0.0007409803921568628, + "loss": 0.448, + "step": 9355 + }, + { + "epoch": 5.2268156424581, + "grad_norm": 0.7348518967628479, + "learning_rate": 0.000740952380952381, + "loss": 0.4261, + "step": 9356 + }, + { + "epoch": 5.227374301675978, + "grad_norm": 0.8562750220298767, + "learning_rate": 0.0007409243697478992, + "loss": 0.6878, + "step": 9357 + }, + { + "epoch": 5.227932960893855, + "grad_norm": 0.46679219603538513, + "learning_rate": 0.0007408963585434174, + "loss": 0.3871, + "step": 9358 + }, + { + "epoch": 5.228491620111732, + "grad_norm": 0.5426879525184631, + "learning_rate": 0.0007408683473389356, + "loss": 0.4177, + "step": 9359 + }, + { + "epoch": 5.229050279329609, + "grad_norm": 1.2118076086044312, + "learning_rate": 0.0007408403361344538, + "loss": 0.5355, + "step": 9360 + }, + { + "epoch": 5.229608938547486, + "grad_norm": 0.8733117580413818, + "learning_rate": 0.000740812324929972, + "loss": 0.3816, + "step": 9361 + }, + { + "epoch": 5.230167597765363, + "grad_norm": 0.596260130405426, + "learning_rate": 0.0007407843137254902, + "loss": 0.4672, + "step": 9362 + }, + { + "epoch": 5.230726256983241, + "grad_norm": 0.5930527448654175, + "learning_rate": 0.0007407563025210084, + "loss": 0.4285, + "step": 9363 + }, + { + "epoch": 5.231284916201117, + "grad_norm": 0.6616517305374146, + "learning_rate": 0.0007407282913165266, + "loss": 0.6199, + "step": 9364 + }, + { + "epoch": 5.231843575418995, + "grad_norm": 0.6076006889343262, + "learning_rate": 0.0007407002801120448, + "loss": 0.4827, + "step": 9365 + }, + { + "epoch": 5.232402234636871, + "grad_norm": 0.6483851671218872, + "learning_rate": 0.000740672268907563, + "loss": 0.4048, + "step": 9366 + }, + { + "epoch": 5.232960893854749, + "grad_norm": 0.4512960612773895, + "learning_rate": 0.0007406442577030813, + "loss": 0.5279, + "step": 9367 + }, + { + "epoch": 5.233519553072625, + "grad_norm": 0.5346662402153015, + "learning_rate": 0.0007406162464985995, + "loss": 0.3669, + "step": 9368 + }, + { + "epoch": 5.234078212290503, + "grad_norm": 0.47256213426589966, + "learning_rate": 0.0007405882352941177, + "loss": 0.3534, + "step": 9369 + }, + { + "epoch": 5.23463687150838, + "grad_norm": 2.0141563415527344, + "learning_rate": 0.0007405602240896359, + "loss": 0.5079, + "step": 9370 + }, + { + "epoch": 5.235195530726257, + "grad_norm": 0.5680559873580933, + "learning_rate": 0.0007405322128851541, + "loss": 0.4236, + "step": 9371 + }, + { + "epoch": 5.235754189944134, + "grad_norm": 0.4786226451396942, + "learning_rate": 0.0007405042016806723, + "loss": 0.4883, + "step": 9372 + }, + { + "epoch": 5.236312849162011, + "grad_norm": 0.6113656759262085, + "learning_rate": 0.0007404761904761905, + "loss": 0.5046, + "step": 9373 + }, + { + "epoch": 5.236871508379888, + "grad_norm": 0.6588099002838135, + "learning_rate": 0.0007404481792717087, + "loss": 0.5455, + "step": 9374 + }, + { + "epoch": 5.237430167597766, + "grad_norm": 0.5834494233131409, + "learning_rate": 0.0007404201680672269, + "loss": 0.3592, + "step": 9375 + }, + { + "epoch": 5.237988826815642, + "grad_norm": 0.6198643445968628, + "learning_rate": 0.0007403921568627451, + "loss": 0.4408, + "step": 9376 + }, + { + "epoch": 5.23854748603352, + "grad_norm": 1.7232438325881958, + "learning_rate": 0.0007403641456582633, + "loss": 0.4576, + "step": 9377 + }, + { + "epoch": 5.239106145251396, + "grad_norm": 2.003746271133423, + "learning_rate": 0.0007403361344537815, + "loss": 0.4295, + "step": 9378 + }, + { + "epoch": 5.239664804469274, + "grad_norm": 0.5383461713790894, + "learning_rate": 0.0007403081232492997, + "loss": 0.4801, + "step": 9379 + }, + { + "epoch": 5.240223463687151, + "grad_norm": 0.6325685381889343, + "learning_rate": 0.0007402801120448179, + "loss": 0.4634, + "step": 9380 + }, + { + "epoch": 5.240782122905028, + "grad_norm": 1.5545400381088257, + "learning_rate": 0.0007402521008403361, + "loss": 0.4192, + "step": 9381 + }, + { + "epoch": 5.241340782122905, + "grad_norm": 0.48496976494789124, + "learning_rate": 0.0007402240896358543, + "loss": 0.5299, + "step": 9382 + }, + { + "epoch": 5.241899441340782, + "grad_norm": 0.6085032224655151, + "learning_rate": 0.0007401960784313725, + "loss": 0.3802, + "step": 9383 + }, + { + "epoch": 5.242458100558659, + "grad_norm": 0.4096871316432953, + "learning_rate": 0.0007401680672268908, + "loss": 0.3941, + "step": 9384 + }, + { + "epoch": 5.243016759776537, + "grad_norm": 0.8473930954933167, + "learning_rate": 0.000740140056022409, + "loss": 0.4374, + "step": 9385 + }, + { + "epoch": 5.243575418994413, + "grad_norm": 2.512249231338501, + "learning_rate": 0.0007401120448179272, + "loss": 0.5393, + "step": 9386 + }, + { + "epoch": 5.244134078212291, + "grad_norm": 0.4696202576160431, + "learning_rate": 0.0007400840336134454, + "loss": 0.4097, + "step": 9387 + }, + { + "epoch": 5.244692737430167, + "grad_norm": 0.890044629573822, + "learning_rate": 0.0007400560224089637, + "loss": 0.5202, + "step": 9388 + }, + { + "epoch": 5.245251396648045, + "grad_norm": 0.968052864074707, + "learning_rate": 0.0007400280112044818, + "loss": 0.4386, + "step": 9389 + }, + { + "epoch": 5.245810055865922, + "grad_norm": 0.6479350924491882, + "learning_rate": 0.00074, + "loss": 0.4354, + "step": 9390 + }, + { + "epoch": 5.246368715083799, + "grad_norm": 0.5397375226020813, + "learning_rate": 0.0007399719887955182, + "loss": 0.4201, + "step": 9391 + }, + { + "epoch": 5.246927374301676, + "grad_norm": 0.93658846616745, + "learning_rate": 0.0007399439775910364, + "loss": 0.644, + "step": 9392 + }, + { + "epoch": 5.247486033519553, + "grad_norm": 0.757621169090271, + "learning_rate": 0.0007399159663865547, + "loss": 0.508, + "step": 9393 + }, + { + "epoch": 5.24804469273743, + "grad_norm": 1.8535314798355103, + "learning_rate": 0.0007398879551820728, + "loss": 0.5549, + "step": 9394 + }, + { + "epoch": 5.248603351955307, + "grad_norm": 0.749240517616272, + "learning_rate": 0.000739859943977591, + "loss": 0.4933, + "step": 9395 + }, + { + "epoch": 5.249162011173184, + "grad_norm": 0.5966920256614685, + "learning_rate": 0.0007398319327731092, + "loss": 0.6242, + "step": 9396 + }, + { + "epoch": 5.249720670391062, + "grad_norm": 0.42783230543136597, + "learning_rate": 0.0007398039215686274, + "loss": 0.435, + "step": 9397 + }, + { + "epoch": 5.250279329608938, + "grad_norm": 0.62357097864151, + "learning_rate": 0.0007397759103641457, + "loss": 0.4688, + "step": 9398 + }, + { + "epoch": 5.250837988826816, + "grad_norm": 0.5383201837539673, + "learning_rate": 0.0007397478991596638, + "loss": 0.4474, + "step": 9399 + }, + { + "epoch": 5.251396648044693, + "grad_norm": 0.44935116171836853, + "learning_rate": 0.000739719887955182, + "loss": 0.406, + "step": 9400 + }, + { + "epoch": 5.25195530726257, + "grad_norm": 0.5153525471687317, + "learning_rate": 0.0007396918767507003, + "loss": 0.4749, + "step": 9401 + }, + { + "epoch": 5.252513966480447, + "grad_norm": 0.51866614818573, + "learning_rate": 0.0007396638655462185, + "loss": 0.4952, + "step": 9402 + }, + { + "epoch": 5.253072625698324, + "grad_norm": 0.4570440351963043, + "learning_rate": 0.0007396358543417368, + "loss": 0.4558, + "step": 9403 + }, + { + "epoch": 5.253631284916201, + "grad_norm": 0.5158559679985046, + "learning_rate": 0.000739607843137255, + "loss": 0.5161, + "step": 9404 + }, + { + "epoch": 5.254189944134078, + "grad_norm": 10.15989875793457, + "learning_rate": 0.0007395798319327731, + "loss": 0.4096, + "step": 9405 + }, + { + "epoch": 5.254748603351955, + "grad_norm": 0.4160911440849304, + "learning_rate": 0.0007395518207282913, + "loss": 0.4035, + "step": 9406 + }, + { + "epoch": 5.255307262569833, + "grad_norm": 0.6594462990760803, + "learning_rate": 0.0007395238095238095, + "loss": 0.4395, + "step": 9407 + }, + { + "epoch": 5.255865921787709, + "grad_norm": 0.45077621936798096, + "learning_rate": 0.0007394957983193278, + "loss": 0.4227, + "step": 9408 + }, + { + "epoch": 5.256424581005587, + "grad_norm": 0.8619872331619263, + "learning_rate": 0.000739467787114846, + "loss": 0.6381, + "step": 9409 + }, + { + "epoch": 5.256983240223463, + "grad_norm": 0.5996870398521423, + "learning_rate": 0.0007394397759103641, + "loss": 0.393, + "step": 9410 + }, + { + "epoch": 5.257541899441341, + "grad_norm": 0.5486067533493042, + "learning_rate": 0.0007394117647058823, + "loss": 0.4793, + "step": 9411 + }, + { + "epoch": 5.258100558659218, + "grad_norm": 0.5369665026664734, + "learning_rate": 0.0007393837535014005, + "loss": 0.6459, + "step": 9412 + }, + { + "epoch": 5.258659217877095, + "grad_norm": 0.41904544830322266, + "learning_rate": 0.0007393557422969188, + "loss": 0.4429, + "step": 9413 + }, + { + "epoch": 5.259217877094972, + "grad_norm": 0.79512619972229, + "learning_rate": 0.000739327731092437, + "loss": 0.4083, + "step": 9414 + }, + { + "epoch": 5.259776536312849, + "grad_norm": 0.8285619616508484, + "learning_rate": 0.0007392997198879551, + "loss": 0.5299, + "step": 9415 + }, + { + "epoch": 5.260335195530726, + "grad_norm": 0.5362997651100159, + "learning_rate": 0.0007392717086834733, + "loss": 0.3856, + "step": 9416 + }, + { + "epoch": 5.260893854748604, + "grad_norm": 0.772506833076477, + "learning_rate": 0.0007392436974789916, + "loss": 0.6419, + "step": 9417 + }, + { + "epoch": 5.26145251396648, + "grad_norm": 0.8862659335136414, + "learning_rate": 0.0007392156862745099, + "loss": 0.4034, + "step": 9418 + }, + { + "epoch": 5.262011173184358, + "grad_norm": 0.44163113832473755, + "learning_rate": 0.0007391876750700281, + "loss": 0.479, + "step": 9419 + }, + { + "epoch": 5.262569832402234, + "grad_norm": 0.7946412563323975, + "learning_rate": 0.0007391596638655463, + "loss": 0.547, + "step": 9420 + }, + { + "epoch": 5.263128491620112, + "grad_norm": 0.8583797812461853, + "learning_rate": 0.0007391316526610644, + "loss": 0.4984, + "step": 9421 + }, + { + "epoch": 5.263687150837989, + "grad_norm": 0.8412925601005554, + "learning_rate": 0.0007391036414565826, + "loss": 0.4153, + "step": 9422 + }, + { + "epoch": 5.264245810055866, + "grad_norm": 1.3151321411132812, + "learning_rate": 0.0007390756302521009, + "loss": 0.5074, + "step": 9423 + }, + { + "epoch": 5.264804469273743, + "grad_norm": 0.48241835832595825, + "learning_rate": 0.0007390476190476191, + "loss": 0.4557, + "step": 9424 + }, + { + "epoch": 5.26536312849162, + "grad_norm": 0.4340708255767822, + "learning_rate": 0.0007390196078431373, + "loss": 0.457, + "step": 9425 + }, + { + "epoch": 5.265921787709497, + "grad_norm": 0.7778294682502747, + "learning_rate": 0.0007389915966386554, + "loss": 0.5335, + "step": 9426 + }, + { + "epoch": 5.266480446927375, + "grad_norm": 0.7259948253631592, + "learning_rate": 0.0007389635854341736, + "loss": 0.4542, + "step": 9427 + }, + { + "epoch": 5.267039106145251, + "grad_norm": 0.9651069641113281, + "learning_rate": 0.0007389355742296919, + "loss": 0.5057, + "step": 9428 + }, + { + "epoch": 5.267597765363129, + "grad_norm": 0.7418129444122314, + "learning_rate": 0.0007389075630252101, + "loss": 0.4645, + "step": 9429 + }, + { + "epoch": 5.268156424581005, + "grad_norm": 3.919517993927002, + "learning_rate": 0.0007388795518207283, + "loss": 0.411, + "step": 9430 + }, + { + "epoch": 5.268715083798883, + "grad_norm": 0.611871063709259, + "learning_rate": 0.0007388515406162464, + "loss": 0.4784, + "step": 9431 + }, + { + "epoch": 5.269273743016759, + "grad_norm": 1.5884042978286743, + "learning_rate": 0.0007388235294117646, + "loss": 0.4451, + "step": 9432 + }, + { + "epoch": 5.269832402234637, + "grad_norm": 0.8458524346351624, + "learning_rate": 0.000738795518207283, + "loss": 0.4626, + "step": 9433 + }, + { + "epoch": 5.270391061452514, + "grad_norm": 1.2369534969329834, + "learning_rate": 0.0007387675070028012, + "loss": 0.3762, + "step": 9434 + }, + { + "epoch": 5.270949720670391, + "grad_norm": 0.9712255597114563, + "learning_rate": 0.0007387394957983194, + "loss": 0.4892, + "step": 9435 + }, + { + "epoch": 5.271508379888268, + "grad_norm": 0.4695819318294525, + "learning_rate": 0.0007387114845938376, + "loss": 0.4326, + "step": 9436 + }, + { + "epoch": 5.272067039106146, + "grad_norm": 0.5627531409263611, + "learning_rate": 0.0007386834733893557, + "loss": 0.4219, + "step": 9437 + }, + { + "epoch": 5.272625698324022, + "grad_norm": 0.6104740500450134, + "learning_rate": 0.000738655462184874, + "loss": 0.4783, + "step": 9438 + }, + { + "epoch": 5.2731843575419, + "grad_norm": 16.905256271362305, + "learning_rate": 0.0007386274509803922, + "loss": 0.4663, + "step": 9439 + }, + { + "epoch": 5.273743016759776, + "grad_norm": 0.901716411113739, + "learning_rate": 0.0007385994397759104, + "loss": 0.4681, + "step": 9440 + }, + { + "epoch": 5.274301675977654, + "grad_norm": 1.5548311471939087, + "learning_rate": 0.0007385714285714286, + "loss": 0.4684, + "step": 9441 + }, + { + "epoch": 5.27486033519553, + "grad_norm": 2.0308778285980225, + "learning_rate": 0.0007385434173669467, + "loss": 0.4874, + "step": 9442 + }, + { + "epoch": 5.275418994413408, + "grad_norm": 0.8268308639526367, + "learning_rate": 0.000738515406162465, + "loss": 0.386, + "step": 9443 + }, + { + "epoch": 5.275977653631285, + "grad_norm": 0.7548191547393799, + "learning_rate": 0.0007384873949579832, + "loss": 0.5778, + "step": 9444 + }, + { + "epoch": 5.276536312849162, + "grad_norm": 0.7626224160194397, + "learning_rate": 0.0007384593837535014, + "loss": 0.4707, + "step": 9445 + }, + { + "epoch": 5.277094972067039, + "grad_norm": 0.45574378967285156, + "learning_rate": 0.0007384313725490196, + "loss": 0.5238, + "step": 9446 + }, + { + "epoch": 5.277653631284916, + "grad_norm": 0.8960602879524231, + "learning_rate": 0.0007384033613445377, + "loss": 0.3711, + "step": 9447 + }, + { + "epoch": 5.278212290502793, + "grad_norm": 0.5864726305007935, + "learning_rate": 0.000738375350140056, + "loss": 0.5101, + "step": 9448 + }, + { + "epoch": 5.278770949720671, + "grad_norm": 0.9299264550209045, + "learning_rate": 0.0007383473389355743, + "loss": 0.447, + "step": 9449 + }, + { + "epoch": 5.279329608938547, + "grad_norm": 0.46527713537216187, + "learning_rate": 0.0007383193277310925, + "loss": 0.4552, + "step": 9450 + }, + { + "epoch": 5.279888268156425, + "grad_norm": 0.4526190757751465, + "learning_rate": 0.0007382913165266107, + "loss": 0.4653, + "step": 9451 + }, + { + "epoch": 5.280446927374301, + "grad_norm": 0.515207052230835, + "learning_rate": 0.0007382633053221289, + "loss": 0.4275, + "step": 9452 + }, + { + "epoch": 5.281005586592179, + "grad_norm": 0.6904286742210388, + "learning_rate": 0.0007382352941176471, + "loss": 0.4785, + "step": 9453 + }, + { + "epoch": 5.281564245810056, + "grad_norm": 2.0360584259033203, + "learning_rate": 0.0007382072829131653, + "loss": 0.4571, + "step": 9454 + }, + { + "epoch": 5.282122905027933, + "grad_norm": 0.5173314809799194, + "learning_rate": 0.0007381792717086835, + "loss": 0.419, + "step": 9455 + }, + { + "epoch": 5.28268156424581, + "grad_norm": 1.329249620437622, + "learning_rate": 0.0007381512605042017, + "loss": 0.5878, + "step": 9456 + }, + { + "epoch": 5.283240223463687, + "grad_norm": 1.6766265630722046, + "learning_rate": 0.0007381232492997199, + "loss": 0.4501, + "step": 9457 + }, + { + "epoch": 5.283798882681564, + "grad_norm": 0.47712254524230957, + "learning_rate": 0.0007380952380952381, + "loss": 0.4657, + "step": 9458 + }, + { + "epoch": 5.284357541899442, + "grad_norm": 2.790231466293335, + "learning_rate": 0.0007380672268907563, + "loss": 0.4721, + "step": 9459 + }, + { + "epoch": 5.284916201117318, + "grad_norm": 0.7931320071220398, + "learning_rate": 0.0007380392156862745, + "loss": 0.4282, + "step": 9460 + }, + { + "epoch": 5.285474860335196, + "grad_norm": 0.6824520230293274, + "learning_rate": 0.0007380112044817927, + "loss": 0.4684, + "step": 9461 + }, + { + "epoch": 5.286033519553072, + "grad_norm": 0.6180979013442993, + "learning_rate": 0.0007379831932773109, + "loss": 0.4594, + "step": 9462 + }, + { + "epoch": 5.28659217877095, + "grad_norm": 0.7881502509117126, + "learning_rate": 0.0007379551820728292, + "loss": 0.5103, + "step": 9463 + }, + { + "epoch": 5.287150837988827, + "grad_norm": 0.4643149971961975, + "learning_rate": 0.0007379271708683473, + "loss": 0.5473, + "step": 9464 + }, + { + "epoch": 5.287709497206704, + "grad_norm": 0.5156552195549011, + "learning_rate": 0.0007378991596638655, + "loss": 0.4879, + "step": 9465 + }, + { + "epoch": 5.288268156424581, + "grad_norm": 0.865510106086731, + "learning_rate": 0.0007378711484593838, + "loss": 0.5488, + "step": 9466 + }, + { + "epoch": 5.288826815642458, + "grad_norm": 0.6039503812789917, + "learning_rate": 0.000737843137254902, + "loss": 0.5231, + "step": 9467 + }, + { + "epoch": 5.289385474860335, + "grad_norm": 0.5857338309288025, + "learning_rate": 0.0007378151260504203, + "loss": 0.5269, + "step": 9468 + }, + { + "epoch": 5.289944134078212, + "grad_norm": 2.5231263637542725, + "learning_rate": 0.0007377871148459384, + "loss": 0.4478, + "step": 9469 + }, + { + "epoch": 5.290502793296089, + "grad_norm": 0.5198138952255249, + "learning_rate": 0.0007377591036414566, + "loss": 0.4359, + "step": 9470 + }, + { + "epoch": 5.291061452513967, + "grad_norm": 0.543129563331604, + "learning_rate": 0.0007377310924369748, + "loss": 0.4212, + "step": 9471 + }, + { + "epoch": 5.291620111731843, + "grad_norm": 0.8600278496742249, + "learning_rate": 0.000737703081232493, + "loss": 0.4645, + "step": 9472 + }, + { + "epoch": 5.292178770949721, + "grad_norm": 0.6033267378807068, + "learning_rate": 0.0007376750700280113, + "loss": 0.4627, + "step": 9473 + }, + { + "epoch": 5.292737430167598, + "grad_norm": 0.6613966226577759, + "learning_rate": 0.0007376470588235294, + "loss": 0.4022, + "step": 9474 + }, + { + "epoch": 5.293296089385475, + "grad_norm": 0.5299205183982849, + "learning_rate": 0.0007376190476190476, + "loss": 0.4745, + "step": 9475 + }, + { + "epoch": 5.293854748603352, + "grad_norm": 0.5181791186332703, + "learning_rate": 0.0007375910364145658, + "loss": 0.4996, + "step": 9476 + }, + { + "epoch": 5.294413407821229, + "grad_norm": 0.46199846267700195, + "learning_rate": 0.000737563025210084, + "loss": 0.4442, + "step": 9477 + }, + { + "epoch": 5.294972067039106, + "grad_norm": 0.37143176794052124, + "learning_rate": 0.0007375350140056023, + "loss": 0.4191, + "step": 9478 + }, + { + "epoch": 5.295530726256983, + "grad_norm": 0.4858918786048889, + "learning_rate": 0.0007375070028011205, + "loss": 0.5081, + "step": 9479 + }, + { + "epoch": 5.29608938547486, + "grad_norm": 0.5810004472732544, + "learning_rate": 0.0007374789915966386, + "loss": 0.463, + "step": 9480 + }, + { + "epoch": 5.296648044692738, + "grad_norm": 0.5940622091293335, + "learning_rate": 0.0007374509803921568, + "loss": 0.5178, + "step": 9481 + }, + { + "epoch": 5.297206703910614, + "grad_norm": 0.3989871144294739, + "learning_rate": 0.000737422969187675, + "loss": 0.3872, + "step": 9482 + }, + { + "epoch": 5.297765363128492, + "grad_norm": 1.742755651473999, + "learning_rate": 0.0007373949579831934, + "loss": 0.5275, + "step": 9483 + }, + { + "epoch": 5.298324022346368, + "grad_norm": 0.5456664562225342, + "learning_rate": 0.0007373669467787116, + "loss": 0.3836, + "step": 9484 + }, + { + "epoch": 5.298882681564246, + "grad_norm": 0.5809146165847778, + "learning_rate": 0.0007373389355742297, + "loss": 0.4341, + "step": 9485 + }, + { + "epoch": 5.299441340782123, + "grad_norm": 1.146213412284851, + "learning_rate": 0.0007373109243697479, + "loss": 0.4086, + "step": 9486 + }, + { + "epoch": 5.3, + "grad_norm": 0.6028604507446289, + "learning_rate": 0.0007372829131652661, + "loss": 0.581, + "step": 9487 + }, + { + "epoch": 5.300558659217877, + "grad_norm": 1.5353025197982788, + "learning_rate": 0.0007372549019607844, + "loss": 0.6055, + "step": 9488 + }, + { + "epoch": 5.301117318435754, + "grad_norm": 1.2211227416992188, + "learning_rate": 0.0007372268907563026, + "loss": 0.4879, + "step": 9489 + }, + { + "epoch": 5.301675977653631, + "grad_norm": 0.5455441474914551, + "learning_rate": 0.0007371988795518207, + "loss": 0.3609, + "step": 9490 + }, + { + "epoch": 5.302234636871509, + "grad_norm": 1.0635169744491577, + "learning_rate": 0.0007371708683473389, + "loss": 0.5163, + "step": 9491 + }, + { + "epoch": 5.302793296089385, + "grad_norm": 0.5655789971351624, + "learning_rate": 0.0007371428571428571, + "loss": 0.4182, + "step": 9492 + }, + { + "epoch": 5.303351955307263, + "grad_norm": 0.9413765668869019, + "learning_rate": 0.0007371148459383754, + "loss": 0.508, + "step": 9493 + }, + { + "epoch": 5.303910614525139, + "grad_norm": 0.4511139690876007, + "learning_rate": 0.0007370868347338936, + "loss": 0.3578, + "step": 9494 + }, + { + "epoch": 5.304469273743017, + "grad_norm": 0.5222950577735901, + "learning_rate": 0.0007370588235294118, + "loss": 0.4763, + "step": 9495 + }, + { + "epoch": 5.305027932960894, + "grad_norm": 1.1022090911865234, + "learning_rate": 0.0007370308123249299, + "loss": 0.5076, + "step": 9496 + }, + { + "epoch": 5.305586592178771, + "grad_norm": 0.492253839969635, + "learning_rate": 0.0007370028011204481, + "loss": 0.4645, + "step": 9497 + }, + { + "epoch": 5.306145251396648, + "grad_norm": 3.3863306045532227, + "learning_rate": 0.0007369747899159665, + "loss": 0.4724, + "step": 9498 + }, + { + "epoch": 5.306703910614525, + "grad_norm": 0.8537158966064453, + "learning_rate": 0.0007369467787114847, + "loss": 0.5155, + "step": 9499 + }, + { + "epoch": 5.307262569832402, + "grad_norm": 1.990062952041626, + "learning_rate": 0.0007369187675070029, + "loss": 0.4344, + "step": 9500 + }, + { + "epoch": 5.307262569832402, + "eval_cer": 0.09389763564749654, + "eval_loss": 0.35308894515037537, + "eval_runtime": 55.6031, + "eval_samples_per_second": 81.614, + "eval_steps_per_second": 5.108, + "eval_wer": 0.3660512276193402, + "step": 9500 + }, + { + "epoch": 5.30782122905028, + "grad_norm": 0.5177279114723206, + "learning_rate": 0.000736890756302521, + "loss": 0.4007, + "step": 9501 + }, + { + "epoch": 5.308379888268156, + "grad_norm": 0.5062219500541687, + "learning_rate": 0.0007368627450980392, + "loss": 0.4037, + "step": 9502 + }, + { + "epoch": 5.308938547486034, + "grad_norm": 0.43697401881217957, + "learning_rate": 0.0007368347338935575, + "loss": 0.4633, + "step": 9503 + }, + { + "epoch": 5.30949720670391, + "grad_norm": 0.6515137553215027, + "learning_rate": 0.0007368067226890757, + "loss": 0.4096, + "step": 9504 + }, + { + "epoch": 5.310055865921788, + "grad_norm": 0.5856940150260925, + "learning_rate": 0.0007367787114845939, + "loss": 0.7466, + "step": 9505 + }, + { + "epoch": 5.310614525139664, + "grad_norm": 0.5021346807479858, + "learning_rate": 0.000736750700280112, + "loss": 0.3782, + "step": 9506 + }, + { + "epoch": 5.311173184357542, + "grad_norm": 0.5606846213340759, + "learning_rate": 0.0007367226890756302, + "loss": 0.6457, + "step": 9507 + }, + { + "epoch": 5.311731843575419, + "grad_norm": 0.6789116263389587, + "learning_rate": 0.0007366946778711485, + "loss": 0.5971, + "step": 9508 + }, + { + "epoch": 5.312290502793296, + "grad_norm": 0.5939801931381226, + "learning_rate": 0.0007366666666666667, + "loss": 0.5072, + "step": 9509 + }, + { + "epoch": 5.312849162011173, + "grad_norm": 1.4139007329940796, + "learning_rate": 0.0007366386554621849, + "loss": 0.5588, + "step": 9510 + }, + { + "epoch": 5.31340782122905, + "grad_norm": 0.5975939631462097, + "learning_rate": 0.0007366106442577031, + "loss": 0.4135, + "step": 9511 + }, + { + "epoch": 5.313966480446927, + "grad_norm": 0.7397623062133789, + "learning_rate": 0.0007365826330532212, + "loss": 0.4587, + "step": 9512 + }, + { + "epoch": 5.314525139664805, + "grad_norm": 0.6376792788505554, + "learning_rate": 0.0007365546218487395, + "loss": 0.4865, + "step": 9513 + }, + { + "epoch": 5.315083798882681, + "grad_norm": 1.171474814414978, + "learning_rate": 0.0007365266106442577, + "loss": 0.4868, + "step": 9514 + }, + { + "epoch": 5.315642458100559, + "grad_norm": 1.2171443700790405, + "learning_rate": 0.000736498599439776, + "loss": 0.4821, + "step": 9515 + }, + { + "epoch": 5.316201117318435, + "grad_norm": 0.5943564176559448, + "learning_rate": 0.0007364705882352942, + "loss": 0.5972, + "step": 9516 + }, + { + "epoch": 5.316759776536313, + "grad_norm": 0.5895178318023682, + "learning_rate": 0.0007364425770308123, + "loss": 0.4112, + "step": 9517 + }, + { + "epoch": 5.31731843575419, + "grad_norm": 0.9535491466522217, + "learning_rate": 0.0007364145658263306, + "loss": 0.5116, + "step": 9518 + }, + { + "epoch": 5.317877094972067, + "grad_norm": 0.47943904995918274, + "learning_rate": 0.0007363865546218488, + "loss": 0.5044, + "step": 9519 + }, + { + "epoch": 5.318435754189944, + "grad_norm": 0.9426802396774292, + "learning_rate": 0.000736358543417367, + "loss": 0.5899, + "step": 9520 + }, + { + "epoch": 5.318994413407821, + "grad_norm": 0.6176594495773315, + "learning_rate": 0.0007363305322128852, + "loss": 0.4837, + "step": 9521 + }, + { + "epoch": 5.319553072625698, + "grad_norm": 0.4418538212776184, + "learning_rate": 0.0007363025210084033, + "loss": 0.4643, + "step": 9522 + }, + { + "epoch": 5.320111731843576, + "grad_norm": 0.5352808237075806, + "learning_rate": 0.0007362745098039216, + "loss": 0.4562, + "step": 9523 + }, + { + "epoch": 5.320670391061452, + "grad_norm": 0.6674498319625854, + "learning_rate": 0.0007362464985994398, + "loss": 0.4663, + "step": 9524 + }, + { + "epoch": 5.32122905027933, + "grad_norm": 0.9467893838882446, + "learning_rate": 0.000736218487394958, + "loss": 0.5103, + "step": 9525 + }, + { + "epoch": 5.321787709497206, + "grad_norm": 0.9457280039787292, + "learning_rate": 0.0007361904761904762, + "loss": 0.5996, + "step": 9526 + }, + { + "epoch": 5.322346368715084, + "grad_norm": 0.7973175048828125, + "learning_rate": 0.0007361624649859944, + "loss": 0.4759, + "step": 9527 + }, + { + "epoch": 5.322905027932961, + "grad_norm": 0.8320454955101013, + "learning_rate": 0.0007361344537815126, + "loss": 0.5847, + "step": 9528 + }, + { + "epoch": 5.323463687150838, + "grad_norm": 0.8061323165893555, + "learning_rate": 0.0007361064425770308, + "loss": 0.3615, + "step": 9529 + }, + { + "epoch": 5.324022346368715, + "grad_norm": 0.5948437452316284, + "learning_rate": 0.000736078431372549, + "loss": 0.4482, + "step": 9530 + }, + { + "epoch": 5.324581005586592, + "grad_norm": 0.741431713104248, + "learning_rate": 0.0007360504201680673, + "loss": 0.458, + "step": 9531 + }, + { + "epoch": 5.325139664804469, + "grad_norm": 0.6204893589019775, + "learning_rate": 0.0007360224089635855, + "loss": 0.4559, + "step": 9532 + }, + { + "epoch": 5.325698324022347, + "grad_norm": 0.5414924621582031, + "learning_rate": 0.0007359943977591037, + "loss": 0.4561, + "step": 9533 + }, + { + "epoch": 5.326256983240223, + "grad_norm": 0.5504729747772217, + "learning_rate": 0.0007359663865546219, + "loss": 0.4376, + "step": 9534 + }, + { + "epoch": 5.326815642458101, + "grad_norm": 0.9887414574623108, + "learning_rate": 0.0007359383753501401, + "loss": 0.4397, + "step": 9535 + }, + { + "epoch": 5.327374301675977, + "grad_norm": 0.5801450610160828, + "learning_rate": 0.0007359103641456583, + "loss": 0.5307, + "step": 9536 + }, + { + "epoch": 5.327932960893855, + "grad_norm": 2.0622060298919678, + "learning_rate": 0.0007358823529411765, + "loss": 0.4666, + "step": 9537 + }, + { + "epoch": 5.328491620111732, + "grad_norm": 0.42980602383613586, + "learning_rate": 0.0007358543417366947, + "loss": 0.5286, + "step": 9538 + }, + { + "epoch": 5.329050279329609, + "grad_norm": 0.6252503395080566, + "learning_rate": 0.0007358263305322129, + "loss": 0.5033, + "step": 9539 + }, + { + "epoch": 5.329608938547486, + "grad_norm": 7.814577102661133, + "learning_rate": 0.0007357983193277311, + "loss": 0.4454, + "step": 9540 + }, + { + "epoch": 5.330167597765363, + "grad_norm": 0.5140522122383118, + "learning_rate": 0.0007357703081232493, + "loss": 0.4402, + "step": 9541 + }, + { + "epoch": 5.33072625698324, + "grad_norm": 0.5562108755111694, + "learning_rate": 0.0007357422969187675, + "loss": 0.4818, + "step": 9542 + }, + { + "epoch": 5.331284916201117, + "grad_norm": 0.7808133363723755, + "learning_rate": 0.0007357142857142858, + "loss": 0.3713, + "step": 9543 + }, + { + "epoch": 5.331843575418994, + "grad_norm": 0.5227891802787781, + "learning_rate": 0.0007356862745098039, + "loss": 0.5561, + "step": 9544 + }, + { + "epoch": 5.332402234636872, + "grad_norm": 0.4147838354110718, + "learning_rate": 0.0007356582633053221, + "loss": 0.3992, + "step": 9545 + }, + { + "epoch": 5.332960893854748, + "grad_norm": 0.8092637062072754, + "learning_rate": 0.0007356302521008403, + "loss": 0.4472, + "step": 9546 + }, + { + "epoch": 5.333519553072626, + "grad_norm": 0.617157518863678, + "learning_rate": 0.0007356022408963585, + "loss": 0.3695, + "step": 9547 + }, + { + "epoch": 5.334078212290502, + "grad_norm": 0.7355561256408691, + "learning_rate": 0.0007355742296918769, + "loss": 0.5167, + "step": 9548 + }, + { + "epoch": 5.33463687150838, + "grad_norm": 0.613189697265625, + "learning_rate": 0.000735546218487395, + "loss": 0.4588, + "step": 9549 + }, + { + "epoch": 5.335195530726257, + "grad_norm": 0.5412580370903015, + "learning_rate": 0.0007355182072829132, + "loss": 0.4713, + "step": 9550 + }, + { + "epoch": 5.335754189944134, + "grad_norm": 0.5945103168487549, + "learning_rate": 0.0007354901960784314, + "loss": 0.68, + "step": 9551 + }, + { + "epoch": 5.336312849162011, + "grad_norm": 0.934156596660614, + "learning_rate": 0.0007354621848739496, + "loss": 0.5731, + "step": 9552 + }, + { + "epoch": 5.336871508379888, + "grad_norm": 6.088711261749268, + "learning_rate": 0.0007354341736694679, + "loss": 0.3591, + "step": 9553 + }, + { + "epoch": 5.337430167597765, + "grad_norm": 0.594549298286438, + "learning_rate": 0.000735406162464986, + "loss": 0.4115, + "step": 9554 + }, + { + "epoch": 5.337988826815643, + "grad_norm": 0.5507545471191406, + "learning_rate": 0.0007353781512605042, + "loss": 0.5486, + "step": 9555 + }, + { + "epoch": 5.338547486033519, + "grad_norm": 0.568227231502533, + "learning_rate": 0.0007353501400560224, + "loss": 0.5236, + "step": 9556 + }, + { + "epoch": 5.339106145251397, + "grad_norm": 0.5101413726806641, + "learning_rate": 0.0007353221288515406, + "loss": 0.3114, + "step": 9557 + }, + { + "epoch": 5.339664804469273, + "grad_norm": 1.19283127784729, + "learning_rate": 0.0007352941176470589, + "loss": 0.3956, + "step": 9558 + }, + { + "epoch": 5.340223463687151, + "grad_norm": 1.1779811382293701, + "learning_rate": 0.0007352661064425771, + "loss": 0.4663, + "step": 9559 + }, + { + "epoch": 5.340782122905028, + "grad_norm": 0.46355003118515015, + "learning_rate": 0.0007352380952380952, + "loss": 0.4456, + "step": 9560 + }, + { + "epoch": 5.341340782122905, + "grad_norm": 0.5419101119041443, + "learning_rate": 0.0007352100840336134, + "loss": 0.4775, + "step": 9561 + }, + { + "epoch": 5.341899441340782, + "grad_norm": 0.4649917781352997, + "learning_rate": 0.0007351820728291316, + "loss": 0.2917, + "step": 9562 + }, + { + "epoch": 5.342458100558659, + "grad_norm": 0.8416962027549744, + "learning_rate": 0.00073515406162465, + "loss": 0.5765, + "step": 9563 + }, + { + "epoch": 5.343016759776536, + "grad_norm": 2.6413912773132324, + "learning_rate": 0.0007351260504201682, + "loss": 0.4417, + "step": 9564 + }, + { + "epoch": 5.343575418994414, + "grad_norm": 0.5505444407463074, + "learning_rate": 0.0007350980392156863, + "loss": 0.3918, + "step": 9565 + }, + { + "epoch": 5.34413407821229, + "grad_norm": 0.4794973134994507, + "learning_rate": 0.0007350700280112045, + "loss": 0.4912, + "step": 9566 + }, + { + "epoch": 5.344692737430168, + "grad_norm": 0.5175204277038574, + "learning_rate": 0.0007350420168067227, + "loss": 0.5054, + "step": 9567 + }, + { + "epoch": 5.345251396648044, + "grad_norm": 0.4456702172756195, + "learning_rate": 0.000735014005602241, + "loss": 0.4262, + "step": 9568 + }, + { + "epoch": 5.345810055865922, + "grad_norm": 0.6180078387260437, + "learning_rate": 0.0007349859943977592, + "loss": 0.3775, + "step": 9569 + }, + { + "epoch": 5.346368715083799, + "grad_norm": 0.972252607345581, + "learning_rate": 0.0007349579831932773, + "loss": 0.4475, + "step": 9570 + }, + { + "epoch": 5.346927374301676, + "grad_norm": 0.5381904244422913, + "learning_rate": 0.0007349299719887955, + "loss": 0.4487, + "step": 9571 + }, + { + "epoch": 5.347486033519553, + "grad_norm": 0.7485121488571167, + "learning_rate": 0.0007349019607843137, + "loss": 0.6094, + "step": 9572 + }, + { + "epoch": 5.34804469273743, + "grad_norm": 0.4415353536605835, + "learning_rate": 0.000734873949579832, + "loss": 0.5194, + "step": 9573 + }, + { + "epoch": 5.348603351955307, + "grad_norm": 0.40677085518836975, + "learning_rate": 0.0007348459383753502, + "loss": 0.3824, + "step": 9574 + }, + { + "epoch": 5.349162011173185, + "grad_norm": 0.48893308639526367, + "learning_rate": 0.0007348179271708684, + "loss": 0.4422, + "step": 9575 + }, + { + "epoch": 5.349720670391061, + "grad_norm": 0.9173416495323181, + "learning_rate": 0.0007347899159663865, + "loss": 0.4142, + "step": 9576 + }, + { + "epoch": 5.350279329608939, + "grad_norm": 0.5607465505599976, + "learning_rate": 0.0007347619047619047, + "loss": 0.5624, + "step": 9577 + }, + { + "epoch": 5.350837988826815, + "grad_norm": 3.155935764312744, + "learning_rate": 0.000734733893557423, + "loss": 0.48, + "step": 9578 + }, + { + "epoch": 5.351396648044693, + "grad_norm": 0.4716293513774872, + "learning_rate": 0.0007347058823529412, + "loss": 0.4586, + "step": 9579 + }, + { + "epoch": 5.351955307262569, + "grad_norm": 0.49036017060279846, + "learning_rate": 0.0007346778711484595, + "loss": 0.514, + "step": 9580 + }, + { + "epoch": 5.352513966480447, + "grad_norm": 0.5479791760444641, + "learning_rate": 0.0007346498599439776, + "loss": 0.3693, + "step": 9581 + }, + { + "epoch": 5.353072625698324, + "grad_norm": 1.1325300931930542, + "learning_rate": 0.0007346218487394958, + "loss": 0.4994, + "step": 9582 + }, + { + "epoch": 5.353631284916201, + "grad_norm": 0.40058624744415283, + "learning_rate": 0.0007345938375350141, + "loss": 0.3965, + "step": 9583 + }, + { + "epoch": 5.354189944134078, + "grad_norm": 0.5006220936775208, + "learning_rate": 0.0007345658263305323, + "loss": 0.3884, + "step": 9584 + }, + { + "epoch": 5.354748603351955, + "grad_norm": 0.4322543144226074, + "learning_rate": 0.0007345378151260505, + "loss": 0.436, + "step": 9585 + }, + { + "epoch": 5.355307262569832, + "grad_norm": 0.6249381899833679, + "learning_rate": 0.0007345098039215686, + "loss": 0.4477, + "step": 9586 + }, + { + "epoch": 5.35586592178771, + "grad_norm": 2.471202850341797, + "learning_rate": 0.0007344817927170868, + "loss": 0.5496, + "step": 9587 + }, + { + "epoch": 5.356424581005586, + "grad_norm": 0.5098752975463867, + "learning_rate": 0.0007344537815126051, + "loss": 0.4727, + "step": 9588 + }, + { + "epoch": 5.356983240223464, + "grad_norm": 0.43282684683799744, + "learning_rate": 0.0007344257703081233, + "loss": 0.4393, + "step": 9589 + }, + { + "epoch": 5.35754189944134, + "grad_norm": 0.699896514415741, + "learning_rate": 0.0007343977591036415, + "loss": 0.4678, + "step": 9590 + }, + { + "epoch": 5.358100558659218, + "grad_norm": 3.43294620513916, + "learning_rate": 0.0007343697478991597, + "loss": 0.4484, + "step": 9591 + }, + { + "epoch": 5.358659217877095, + "grad_norm": 0.6883873343467712, + "learning_rate": 0.0007343417366946778, + "loss": 0.5667, + "step": 9592 + }, + { + "epoch": 5.359217877094972, + "grad_norm": 0.6168370246887207, + "learning_rate": 0.000734313725490196, + "loss": 0.5427, + "step": 9593 + }, + { + "epoch": 5.359776536312849, + "grad_norm": 0.5809407830238342, + "learning_rate": 0.0007342857142857143, + "loss": 0.4899, + "step": 9594 + }, + { + "epoch": 5.360335195530726, + "grad_norm": 4.26202917098999, + "learning_rate": 0.0007342577030812325, + "loss": 0.5517, + "step": 9595 + }, + { + "epoch": 5.360893854748603, + "grad_norm": 0.45270437002182007, + "learning_rate": 0.0007342296918767507, + "loss": 0.4688, + "step": 9596 + }, + { + "epoch": 5.361452513966481, + "grad_norm": 0.46052780747413635, + "learning_rate": 0.0007342016806722688, + "loss": 0.3735, + "step": 9597 + }, + { + "epoch": 5.362011173184357, + "grad_norm": 0.8623282313346863, + "learning_rate": 0.000734173669467787, + "loss": 0.5642, + "step": 9598 + }, + { + "epoch": 5.362569832402235, + "grad_norm": 0.6653597354888916, + "learning_rate": 0.0007341456582633054, + "loss": 0.5399, + "step": 9599 + }, + { + "epoch": 5.363128491620111, + "grad_norm": 0.7668310403823853, + "learning_rate": 0.0007341176470588236, + "loss": 0.401, + "step": 9600 + }, + { + "epoch": 5.363687150837989, + "grad_norm": 0.5306023955345154, + "learning_rate": 0.0007340896358543418, + "loss": 0.3953, + "step": 9601 + }, + { + "epoch": 5.364245810055866, + "grad_norm": 0.5377447605133057, + "learning_rate": 0.0007340616246498599, + "loss": 0.5033, + "step": 9602 + }, + { + "epoch": 5.364804469273743, + "grad_norm": 0.5565674901008606, + "learning_rate": 0.0007340336134453781, + "loss": 0.4676, + "step": 9603 + }, + { + "epoch": 5.36536312849162, + "grad_norm": 0.6691960096359253, + "learning_rate": 0.0007340056022408964, + "loss": 0.3914, + "step": 9604 + }, + { + "epoch": 5.365921787709497, + "grad_norm": 0.660874605178833, + "learning_rate": 0.0007339775910364146, + "loss": 0.5166, + "step": 9605 + }, + { + "epoch": 5.366480446927374, + "grad_norm": 0.6722997426986694, + "learning_rate": 0.0007339495798319328, + "loss": 0.5726, + "step": 9606 + }, + { + "epoch": 5.367039106145251, + "grad_norm": 0.5346750617027283, + "learning_rate": 0.000733921568627451, + "loss": 0.3429, + "step": 9607 + }, + { + "epoch": 5.367597765363128, + "grad_norm": 0.808154821395874, + "learning_rate": 0.0007338935574229691, + "loss": 0.4468, + "step": 9608 + }, + { + "epoch": 5.368156424581006, + "grad_norm": 0.7308852076530457, + "learning_rate": 0.0007338655462184874, + "loss": 0.4426, + "step": 9609 + }, + { + "epoch": 5.368715083798882, + "grad_norm": 0.6538249254226685, + "learning_rate": 0.0007338375350140056, + "loss": 0.4008, + "step": 9610 + }, + { + "epoch": 5.36927374301676, + "grad_norm": 0.5729008913040161, + "learning_rate": 0.0007338095238095238, + "loss": 0.4915, + "step": 9611 + }, + { + "epoch": 5.369832402234637, + "grad_norm": 0.6548587083816528, + "learning_rate": 0.000733781512605042, + "loss": 0.469, + "step": 9612 + }, + { + "epoch": 5.370391061452514, + "grad_norm": 0.9366130232810974, + "learning_rate": 0.0007337535014005601, + "loss": 0.5295, + "step": 9613 + }, + { + "epoch": 5.370949720670391, + "grad_norm": 0.602325975894928, + "learning_rate": 0.0007337254901960785, + "loss": 0.3406, + "step": 9614 + }, + { + "epoch": 5.371508379888268, + "grad_norm": 0.652629017829895, + "learning_rate": 0.0007336974789915967, + "loss": 0.469, + "step": 9615 + }, + { + "epoch": 5.372067039106145, + "grad_norm": 0.442274272441864, + "learning_rate": 0.0007336694677871149, + "loss": 0.4669, + "step": 9616 + }, + { + "epoch": 5.372625698324022, + "grad_norm": 0.5751197934150696, + "learning_rate": 0.0007336414565826331, + "loss": 0.4388, + "step": 9617 + }, + { + "epoch": 5.373184357541899, + "grad_norm": 0.4405896067619324, + "learning_rate": 0.0007336134453781512, + "loss": 0.3996, + "step": 9618 + }, + { + "epoch": 5.373743016759777, + "grad_norm": 0.457009494304657, + "learning_rate": 0.0007335854341736695, + "loss": 0.5106, + "step": 9619 + }, + { + "epoch": 5.374301675977653, + "grad_norm": 0.4784098267555237, + "learning_rate": 0.0007335574229691877, + "loss": 0.3808, + "step": 9620 + }, + { + "epoch": 5.374860335195531, + "grad_norm": 0.6183488368988037, + "learning_rate": 0.0007335294117647059, + "loss": 0.4381, + "step": 9621 + }, + { + "epoch": 5.375418994413407, + "grad_norm": 0.5040037035942078, + "learning_rate": 0.0007335014005602241, + "loss": 0.4397, + "step": 9622 + }, + { + "epoch": 5.375977653631285, + "grad_norm": 0.6120518445968628, + "learning_rate": 0.0007334733893557423, + "loss": 0.514, + "step": 9623 + }, + { + "epoch": 5.376536312849162, + "grad_norm": 1.4700332880020142, + "learning_rate": 0.0007334453781512605, + "loss": 0.463, + "step": 9624 + }, + { + "epoch": 5.377094972067039, + "grad_norm": 3.4195826053619385, + "learning_rate": 0.0007334173669467787, + "loss": 0.461, + "step": 9625 + }, + { + "epoch": 5.377653631284916, + "grad_norm": 0.5422326326370239, + "learning_rate": 0.0007333893557422969, + "loss": 0.4584, + "step": 9626 + }, + { + "epoch": 5.378212290502793, + "grad_norm": 0.48698753118515015, + "learning_rate": 0.0007333613445378151, + "loss": 0.3529, + "step": 9627 + }, + { + "epoch": 5.37877094972067, + "grad_norm": 0.7807590365409851, + "learning_rate": 0.0007333333333333333, + "loss": 0.4786, + "step": 9628 + }, + { + "epoch": 5.379329608938548, + "grad_norm": 0.4735409915447235, + "learning_rate": 0.0007333053221288515, + "loss": 0.4722, + "step": 9629 + }, + { + "epoch": 5.379888268156424, + "grad_norm": 0.5114326477050781, + "learning_rate": 0.0007332773109243698, + "loss": 0.582, + "step": 9630 + }, + { + "epoch": 5.380446927374302, + "grad_norm": 1.298922061920166, + "learning_rate": 0.000733249299719888, + "loss": 0.5337, + "step": 9631 + }, + { + "epoch": 5.381005586592178, + "grad_norm": 0.39356788992881775, + "learning_rate": 0.0007332212885154062, + "loss": 0.3435, + "step": 9632 + }, + { + "epoch": 5.381564245810056, + "grad_norm": 0.60985267162323, + "learning_rate": 0.0007331932773109244, + "loss": 0.3877, + "step": 9633 + }, + { + "epoch": 5.382122905027933, + "grad_norm": 0.4025561809539795, + "learning_rate": 0.0007331652661064426, + "loss": 0.4089, + "step": 9634 + }, + { + "epoch": 5.38268156424581, + "grad_norm": 0.8815410733222961, + "learning_rate": 0.0007331372549019608, + "loss": 0.4336, + "step": 9635 + }, + { + "epoch": 5.383240223463687, + "grad_norm": 0.8768227100372314, + "learning_rate": 0.000733109243697479, + "loss": 0.4922, + "step": 9636 + }, + { + "epoch": 5.383798882681564, + "grad_norm": 0.4536188244819641, + "learning_rate": 0.0007330812324929972, + "loss": 0.4011, + "step": 9637 + }, + { + "epoch": 5.384357541899441, + "grad_norm": 0.4479326903820038, + "learning_rate": 0.0007330532212885154, + "loss": 0.5036, + "step": 9638 + }, + { + "epoch": 5.384916201117319, + "grad_norm": 0.5947161912918091, + "learning_rate": 0.0007330252100840337, + "loss": 0.4754, + "step": 9639 + }, + { + "epoch": 5.385474860335195, + "grad_norm": 0.5461640954017639, + "learning_rate": 0.0007329971988795518, + "loss": 0.5277, + "step": 9640 + }, + { + "epoch": 5.386033519553073, + "grad_norm": 0.5795295238494873, + "learning_rate": 0.00073296918767507, + "loss": 0.5474, + "step": 9641 + }, + { + "epoch": 5.386592178770949, + "grad_norm": 0.6543920636177063, + "learning_rate": 0.0007329411764705882, + "loss": 0.5214, + "step": 9642 + }, + { + "epoch": 5.387150837988827, + "grad_norm": 0.4117977023124695, + "learning_rate": 0.0007329131652661064, + "loss": 0.4334, + "step": 9643 + }, + { + "epoch": 5.3877094972067034, + "grad_norm": 0.8344374299049377, + "learning_rate": 0.0007328851540616247, + "loss": 0.5254, + "step": 9644 + }, + { + "epoch": 5.388268156424581, + "grad_norm": 0.8225786089897156, + "learning_rate": 0.0007328571428571428, + "loss": 0.3996, + "step": 9645 + }, + { + "epoch": 5.388826815642458, + "grad_norm": 0.5805999636650085, + "learning_rate": 0.000732829131652661, + "loss": 0.4765, + "step": 9646 + }, + { + "epoch": 5.389385474860335, + "grad_norm": 0.9628885984420776, + "learning_rate": 0.0007328011204481793, + "loss": 0.4203, + "step": 9647 + }, + { + "epoch": 5.389944134078212, + "grad_norm": 0.3919629752635956, + "learning_rate": 0.0007327731092436975, + "loss": 0.4132, + "step": 9648 + }, + { + "epoch": 5.39050279329609, + "grad_norm": 0.4957062005996704, + "learning_rate": 0.0007327450980392158, + "loss": 0.5428, + "step": 9649 + }, + { + "epoch": 5.391061452513966, + "grad_norm": 0.5382682681083679, + "learning_rate": 0.0007327170868347339, + "loss": 0.4927, + "step": 9650 + }, + { + "epoch": 5.391620111731844, + "grad_norm": 1.3976613283157349, + "learning_rate": 0.0007326890756302521, + "loss": 0.4873, + "step": 9651 + }, + { + "epoch": 5.39217877094972, + "grad_norm": 0.39772841334342957, + "learning_rate": 0.0007326610644257703, + "loss": 0.4222, + "step": 9652 + }, + { + "epoch": 5.392737430167598, + "grad_norm": 0.4938870370388031, + "learning_rate": 0.0007326330532212885, + "loss": 0.4397, + "step": 9653 + }, + { + "epoch": 5.3932960893854744, + "grad_norm": 1.7677993774414062, + "learning_rate": 0.0007326050420168068, + "loss": 0.4443, + "step": 9654 + }, + { + "epoch": 5.393854748603352, + "grad_norm": 0.8407166004180908, + "learning_rate": 0.000732577030812325, + "loss": 0.4433, + "step": 9655 + }, + { + "epoch": 5.394413407821229, + "grad_norm": 0.4558146595954895, + "learning_rate": 0.0007325490196078431, + "loss": 0.434, + "step": 9656 + }, + { + "epoch": 5.394972067039106, + "grad_norm": 0.46504995226860046, + "learning_rate": 0.0007325210084033613, + "loss": 0.5059, + "step": 9657 + }, + { + "epoch": 5.395530726256983, + "grad_norm": 0.6100008487701416, + "learning_rate": 0.0007324929971988795, + "loss": 0.4628, + "step": 9658 + }, + { + "epoch": 5.39608938547486, + "grad_norm": 6.689543724060059, + "learning_rate": 0.0007324649859943978, + "loss": 0.5696, + "step": 9659 + }, + { + "epoch": 5.396648044692737, + "grad_norm": 0.4569566547870636, + "learning_rate": 0.000732436974789916, + "loss": 0.4606, + "step": 9660 + }, + { + "epoch": 5.397206703910615, + "grad_norm": 0.46072351932525635, + "learning_rate": 0.0007324089635854341, + "loss": 0.516, + "step": 9661 + }, + { + "epoch": 5.397765363128491, + "grad_norm": 1.1802020072937012, + "learning_rate": 0.0007323809523809523, + "loss": 0.8239, + "step": 9662 + }, + { + "epoch": 5.398324022346369, + "grad_norm": 1.0785757303237915, + "learning_rate": 0.0007323529411764706, + "loss": 0.4371, + "step": 9663 + }, + { + "epoch": 5.3988826815642454, + "grad_norm": 0.5387957692146301, + "learning_rate": 0.0007323249299719889, + "loss": 0.4216, + "step": 9664 + }, + { + "epoch": 5.399441340782123, + "grad_norm": 0.9764959216117859, + "learning_rate": 0.0007322969187675071, + "loss": 0.5584, + "step": 9665 + }, + { + "epoch": 5.4, + "grad_norm": 0.49934449791908264, + "learning_rate": 0.0007322689075630252, + "loss": 0.3339, + "step": 9666 + }, + { + "epoch": 5.400558659217877, + "grad_norm": 0.46985989809036255, + "learning_rate": 0.0007322408963585434, + "loss": 0.4857, + "step": 9667 + }, + { + "epoch": 5.401117318435754, + "grad_norm": 0.4268762469291687, + "learning_rate": 0.0007322128851540616, + "loss": 0.4363, + "step": 9668 + }, + { + "epoch": 5.401675977653631, + "grad_norm": 0.6550259590148926, + "learning_rate": 0.0007321848739495799, + "loss": 0.4064, + "step": 9669 + }, + { + "epoch": 5.402234636871508, + "grad_norm": 0.5340927243232727, + "learning_rate": 0.0007321568627450981, + "loss": 0.4919, + "step": 9670 + }, + { + "epoch": 5.402793296089386, + "grad_norm": 0.48531490564346313, + "learning_rate": 0.0007321288515406163, + "loss": 0.4698, + "step": 9671 + }, + { + "epoch": 5.403351955307262, + "grad_norm": 0.6543476581573486, + "learning_rate": 0.0007321008403361344, + "loss": 0.6542, + "step": 9672 + }, + { + "epoch": 5.40391061452514, + "grad_norm": 0.7351266741752625, + "learning_rate": 0.0007320728291316526, + "loss": 0.4447, + "step": 9673 + }, + { + "epoch": 5.4044692737430164, + "grad_norm": 0.3648409843444824, + "learning_rate": 0.0007320448179271709, + "loss": 0.3731, + "step": 9674 + }, + { + "epoch": 5.405027932960894, + "grad_norm": 0.6719562411308289, + "learning_rate": 0.0007320168067226891, + "loss": 0.7884, + "step": 9675 + }, + { + "epoch": 5.405586592178771, + "grad_norm": 1.019452452659607, + "learning_rate": 0.0007319887955182073, + "loss": 0.53, + "step": 9676 + }, + { + "epoch": 5.406145251396648, + "grad_norm": 0.6058816313743591, + "learning_rate": 0.0007319607843137254, + "loss": 0.6994, + "step": 9677 + }, + { + "epoch": 5.406703910614525, + "grad_norm": 0.6727103590965271, + "learning_rate": 0.0007319327731092436, + "loss": 0.5242, + "step": 9678 + }, + { + "epoch": 5.407262569832402, + "grad_norm": 0.5533923506736755, + "learning_rate": 0.000731904761904762, + "loss": 0.4036, + "step": 9679 + }, + { + "epoch": 5.407821229050279, + "grad_norm": 1.0296107530593872, + "learning_rate": 0.0007318767507002802, + "loss": 0.4419, + "step": 9680 + }, + { + "epoch": 5.408379888268156, + "grad_norm": 0.42621302604675293, + "learning_rate": 0.0007318487394957984, + "loss": 0.3606, + "step": 9681 + }, + { + "epoch": 5.408938547486033, + "grad_norm": 0.4223034381866455, + "learning_rate": 0.0007318207282913165, + "loss": 0.449, + "step": 9682 + }, + { + "epoch": 5.409497206703911, + "grad_norm": 0.7462381720542908, + "learning_rate": 0.0007317927170868347, + "loss": 0.5363, + "step": 9683 + }, + { + "epoch": 5.410055865921787, + "grad_norm": 0.58668452501297, + "learning_rate": 0.000731764705882353, + "loss": 0.4428, + "step": 9684 + }, + { + "epoch": 5.410614525139665, + "grad_norm": 0.5859023332595825, + "learning_rate": 0.0007317366946778712, + "loss": 0.4257, + "step": 9685 + }, + { + "epoch": 5.411173184357542, + "grad_norm": 0.5037703514099121, + "learning_rate": 0.0007317086834733894, + "loss": 0.4004, + "step": 9686 + }, + { + "epoch": 5.411731843575419, + "grad_norm": 0.3915973901748657, + "learning_rate": 0.0007316806722689076, + "loss": 0.4615, + "step": 9687 + }, + { + "epoch": 5.412290502793296, + "grad_norm": 1.3105286359786987, + "learning_rate": 0.0007316526610644257, + "loss": 0.5061, + "step": 9688 + }, + { + "epoch": 5.412849162011173, + "grad_norm": 0.759484589099884, + "learning_rate": 0.000731624649859944, + "loss": 0.4783, + "step": 9689 + }, + { + "epoch": 5.41340782122905, + "grad_norm": 0.4818784296512604, + "learning_rate": 0.0007315966386554622, + "loss": 0.4281, + "step": 9690 + }, + { + "epoch": 5.413966480446927, + "grad_norm": 0.5857297778129578, + "learning_rate": 0.0007315686274509804, + "loss": 0.3879, + "step": 9691 + }, + { + "epoch": 5.414525139664804, + "grad_norm": 0.4561799168586731, + "learning_rate": 0.0007315406162464986, + "loss": 0.4391, + "step": 9692 + }, + { + "epoch": 5.415083798882682, + "grad_norm": 0.5454745888710022, + "learning_rate": 0.0007315126050420167, + "loss": 0.369, + "step": 9693 + }, + { + "epoch": 5.415642458100558, + "grad_norm": 0.4950173497200012, + "learning_rate": 0.000731484593837535, + "loss": 0.4246, + "step": 9694 + }, + { + "epoch": 5.416201117318436, + "grad_norm": 0.6390392184257507, + "learning_rate": 0.0007314565826330533, + "loss": 0.5974, + "step": 9695 + }, + { + "epoch": 5.4167597765363125, + "grad_norm": 3.024045944213867, + "learning_rate": 0.0007314285714285715, + "loss": 0.4589, + "step": 9696 + }, + { + "epoch": 5.41731843575419, + "grad_norm": 0.5367445945739746, + "learning_rate": 0.0007314005602240897, + "loss": 0.3821, + "step": 9697 + }, + { + "epoch": 5.417877094972067, + "grad_norm": 0.599183976650238, + "learning_rate": 0.0007313725490196078, + "loss": 0.3664, + "step": 9698 + }, + { + "epoch": 5.418435754189944, + "grad_norm": 0.6512631773948669, + "learning_rate": 0.0007313445378151261, + "loss": 0.4725, + "step": 9699 + }, + { + "epoch": 5.418994413407821, + "grad_norm": 3.2039411067962646, + "learning_rate": 0.0007313165266106443, + "loss": 0.4358, + "step": 9700 + }, + { + "epoch": 5.419553072625698, + "grad_norm": 0.4627593755722046, + "learning_rate": 0.0007312885154061625, + "loss": 0.4605, + "step": 9701 + }, + { + "epoch": 5.420111731843575, + "grad_norm": 0.5062171220779419, + "learning_rate": 0.0007312605042016807, + "loss": 0.5212, + "step": 9702 + }, + { + "epoch": 5.420670391061453, + "grad_norm": 0.43370017409324646, + "learning_rate": 0.0007312324929971989, + "loss": 0.3959, + "step": 9703 + }, + { + "epoch": 5.421229050279329, + "grad_norm": 2.050886392593384, + "learning_rate": 0.0007312044817927171, + "loss": 0.476, + "step": 9704 + }, + { + "epoch": 5.421787709497207, + "grad_norm": 0.8578056693077087, + "learning_rate": 0.0007311764705882353, + "loss": 0.4784, + "step": 9705 + }, + { + "epoch": 5.4223463687150835, + "grad_norm": 29.487979888916016, + "learning_rate": 0.0007311484593837535, + "loss": 0.4053, + "step": 9706 + }, + { + "epoch": 5.422905027932961, + "grad_norm": 0.6425113677978516, + "learning_rate": 0.0007311204481792717, + "loss": 0.5146, + "step": 9707 + }, + { + "epoch": 5.423463687150838, + "grad_norm": 0.5732616782188416, + "learning_rate": 0.0007310924369747899, + "loss": 0.5239, + "step": 9708 + }, + { + "epoch": 5.424022346368715, + "grad_norm": 0.46527257561683655, + "learning_rate": 0.0007310644257703081, + "loss": 0.4067, + "step": 9709 + }, + { + "epoch": 5.424581005586592, + "grad_norm": 0.8030074238777161, + "learning_rate": 0.0007310364145658263, + "loss": 0.4974, + "step": 9710 + }, + { + "epoch": 5.425139664804469, + "grad_norm": 0.48365288972854614, + "learning_rate": 0.0007310084033613445, + "loss": 0.4305, + "step": 9711 + }, + { + "epoch": 5.425698324022346, + "grad_norm": 0.468726247549057, + "learning_rate": 0.0007309803921568628, + "loss": 0.4297, + "step": 9712 + }, + { + "epoch": 5.426256983240224, + "grad_norm": 0.6161282062530518, + "learning_rate": 0.000730952380952381, + "loss": 0.4664, + "step": 9713 + }, + { + "epoch": 5.4268156424581, + "grad_norm": 0.6594555377960205, + "learning_rate": 0.0007309243697478993, + "loss": 0.5754, + "step": 9714 + }, + { + "epoch": 5.427374301675978, + "grad_norm": 0.4305954575538635, + "learning_rate": 0.0007308963585434174, + "loss": 0.3506, + "step": 9715 + }, + { + "epoch": 5.4279329608938545, + "grad_norm": 0.4502977728843689, + "learning_rate": 0.0007308683473389356, + "loss": 0.4304, + "step": 9716 + }, + { + "epoch": 5.428491620111732, + "grad_norm": 0.502530038356781, + "learning_rate": 0.0007308403361344538, + "loss": 0.48, + "step": 9717 + }, + { + "epoch": 5.4290502793296085, + "grad_norm": 0.5808756947517395, + "learning_rate": 0.000730812324929972, + "loss": 0.5729, + "step": 9718 + }, + { + "epoch": 5.429608938547486, + "grad_norm": 3.9587249755859375, + "learning_rate": 0.0007307843137254903, + "loss": 0.508, + "step": 9719 + }, + { + "epoch": 5.430167597765363, + "grad_norm": 0.4714196026325226, + "learning_rate": 0.0007307563025210084, + "loss": 0.434, + "step": 9720 + }, + { + "epoch": 5.43072625698324, + "grad_norm": 0.6579060554504395, + "learning_rate": 0.0007307282913165266, + "loss": 0.3937, + "step": 9721 + }, + { + "epoch": 5.431284916201117, + "grad_norm": 0.5075948238372803, + "learning_rate": 0.0007307002801120448, + "loss": 0.4437, + "step": 9722 + }, + { + "epoch": 5.431843575418995, + "grad_norm": 0.5919167995452881, + "learning_rate": 0.000730672268907563, + "loss": 0.4974, + "step": 9723 + }, + { + "epoch": 5.432402234636871, + "grad_norm": 0.6856719851493835, + "learning_rate": 0.0007306442577030813, + "loss": 0.4381, + "step": 9724 + }, + { + "epoch": 5.432960893854749, + "grad_norm": 0.4521169066429138, + "learning_rate": 0.0007306162464985994, + "loss": 0.3568, + "step": 9725 + }, + { + "epoch": 5.4335195530726255, + "grad_norm": 0.6040863394737244, + "learning_rate": 0.0007305882352941176, + "loss": 0.6364, + "step": 9726 + }, + { + "epoch": 5.434078212290503, + "grad_norm": 0.47824618220329285, + "learning_rate": 0.0007305602240896358, + "loss": 0.3731, + "step": 9727 + }, + { + "epoch": 5.4346368715083795, + "grad_norm": 0.6427138447761536, + "learning_rate": 0.000730532212885154, + "loss": 0.4161, + "step": 9728 + }, + { + "epoch": 5.435195530726257, + "grad_norm": 0.42427805066108704, + "learning_rate": 0.0007305042016806724, + "loss": 0.3788, + "step": 9729 + }, + { + "epoch": 5.435754189944134, + "grad_norm": 0.890143871307373, + "learning_rate": 0.0007304761904761906, + "loss": 0.3498, + "step": 9730 + }, + { + "epoch": 5.436312849162011, + "grad_norm": 0.7862064838409424, + "learning_rate": 0.0007304481792717087, + "loss": 0.6898, + "step": 9731 + }, + { + "epoch": 5.436871508379888, + "grad_norm": 0.5681707262992859, + "learning_rate": 0.0007304201680672269, + "loss": 0.4026, + "step": 9732 + }, + { + "epoch": 5.437430167597765, + "grad_norm": 0.7881913781166077, + "learning_rate": 0.0007303921568627451, + "loss": 0.5982, + "step": 9733 + }, + { + "epoch": 5.437988826815642, + "grad_norm": 0.4226723611354828, + "learning_rate": 0.0007303641456582634, + "loss": 0.5315, + "step": 9734 + }, + { + "epoch": 5.43854748603352, + "grad_norm": 0.5856690406799316, + "learning_rate": 0.0007303361344537816, + "loss": 0.5702, + "step": 9735 + }, + { + "epoch": 5.4391061452513965, + "grad_norm": 0.962823212146759, + "learning_rate": 0.0007303081232492997, + "loss": 0.4158, + "step": 9736 + }, + { + "epoch": 5.439664804469274, + "grad_norm": 0.4471222162246704, + "learning_rate": 0.0007302801120448179, + "loss": 0.437, + "step": 9737 + }, + { + "epoch": 5.4402234636871505, + "grad_norm": 0.592616856098175, + "learning_rate": 0.0007302521008403361, + "loss": 0.3787, + "step": 9738 + }, + { + "epoch": 5.440782122905028, + "grad_norm": 0.9380432963371277, + "learning_rate": 0.0007302240896358544, + "loss": 0.4581, + "step": 9739 + }, + { + "epoch": 5.441340782122905, + "grad_norm": 0.5861191153526306, + "learning_rate": 0.0007301960784313726, + "loss": 0.5485, + "step": 9740 + }, + { + "epoch": 5.441899441340782, + "grad_norm": 0.46296703815460205, + "learning_rate": 0.0007301680672268907, + "loss": 0.3949, + "step": 9741 + }, + { + "epoch": 5.442458100558659, + "grad_norm": 0.5828236937522888, + "learning_rate": 0.0007301400560224089, + "loss": 0.5779, + "step": 9742 + }, + { + "epoch": 5.443016759776536, + "grad_norm": 0.43828508257865906, + "learning_rate": 0.0007301120448179271, + "loss": 0.4102, + "step": 9743 + }, + { + "epoch": 5.443575418994413, + "grad_norm": 0.7192258238792419, + "learning_rate": 0.0007300840336134455, + "loss": 0.4313, + "step": 9744 + }, + { + "epoch": 5.444134078212291, + "grad_norm": 0.39279791712760925, + "learning_rate": 0.0007300560224089637, + "loss": 0.3728, + "step": 9745 + }, + { + "epoch": 5.4446927374301675, + "grad_norm": 0.5375004410743713, + "learning_rate": 0.0007300280112044819, + "loss": 0.4513, + "step": 9746 + }, + { + "epoch": 5.445251396648045, + "grad_norm": 0.5508938431739807, + "learning_rate": 0.00073, + "loss": 0.4021, + "step": 9747 + }, + { + "epoch": 5.4458100558659215, + "grad_norm": 0.9927380084991455, + "learning_rate": 0.0007299719887955182, + "loss": 0.4315, + "step": 9748 + }, + { + "epoch": 5.446368715083799, + "grad_norm": 0.5101918578147888, + "learning_rate": 0.0007299439775910365, + "loss": 0.3883, + "step": 9749 + }, + { + "epoch": 5.446927374301676, + "grad_norm": 0.5668933987617493, + "learning_rate": 0.0007299159663865547, + "loss": 0.5092, + "step": 9750 + }, + { + "epoch": 5.447486033519553, + "grad_norm": 1.411636471748352, + "learning_rate": 0.0007298879551820729, + "loss": 0.4665, + "step": 9751 + }, + { + "epoch": 5.44804469273743, + "grad_norm": 0.6135072708129883, + "learning_rate": 0.000729859943977591, + "loss": 0.4248, + "step": 9752 + }, + { + "epoch": 5.448603351955307, + "grad_norm": 0.5498002767562866, + "learning_rate": 0.0007298319327731092, + "loss": 0.4838, + "step": 9753 + }, + { + "epoch": 5.449162011173184, + "grad_norm": 1.2738165855407715, + "learning_rate": 0.0007298039215686275, + "loss": 0.516, + "step": 9754 + }, + { + "epoch": 5.449720670391061, + "grad_norm": 0.48241597414016724, + "learning_rate": 0.0007297759103641457, + "loss": 0.3739, + "step": 9755 + }, + { + "epoch": 5.4502793296089385, + "grad_norm": 1.0763256549835205, + "learning_rate": 0.0007297478991596639, + "loss": 0.5358, + "step": 9756 + }, + { + "epoch": 5.450837988826816, + "grad_norm": 0.4620930552482605, + "learning_rate": 0.000729719887955182, + "loss": 0.4688, + "step": 9757 + }, + { + "epoch": 5.4513966480446925, + "grad_norm": 0.7245018482208252, + "learning_rate": 0.0007296918767507002, + "loss": 0.5372, + "step": 9758 + }, + { + "epoch": 5.45195530726257, + "grad_norm": 0.7951422333717346, + "learning_rate": 0.0007296638655462185, + "loss": 0.4482, + "step": 9759 + }, + { + "epoch": 5.452513966480447, + "grad_norm": 0.67264324426651, + "learning_rate": 0.0007296358543417367, + "loss": 0.5326, + "step": 9760 + }, + { + "epoch": 5.453072625698324, + "grad_norm": 0.4882766604423523, + "learning_rate": 0.000729607843137255, + "loss": 0.37, + "step": 9761 + }, + { + "epoch": 5.453631284916201, + "grad_norm": 0.5028199553489685, + "learning_rate": 0.0007295798319327732, + "loss": 0.4846, + "step": 9762 + }, + { + "epoch": 5.454189944134078, + "grad_norm": 0.5273077487945557, + "learning_rate": 0.0007295518207282913, + "loss": 0.4144, + "step": 9763 + }, + { + "epoch": 5.454748603351955, + "grad_norm": 0.5949251651763916, + "learning_rate": 0.0007295238095238096, + "loss": 0.5072, + "step": 9764 + }, + { + "epoch": 5.455307262569832, + "grad_norm": 0.9257952570915222, + "learning_rate": 0.0007294957983193278, + "loss": 0.4774, + "step": 9765 + }, + { + "epoch": 5.4558659217877095, + "grad_norm": 0.876560628414154, + "learning_rate": 0.000729467787114846, + "loss": 0.5045, + "step": 9766 + }, + { + "epoch": 5.456424581005587, + "grad_norm": 0.47527071833610535, + "learning_rate": 0.0007294397759103642, + "loss": 0.5013, + "step": 9767 + }, + { + "epoch": 5.4569832402234635, + "grad_norm": 0.5068494081497192, + "learning_rate": 0.0007294117647058823, + "loss": 0.4553, + "step": 9768 + }, + { + "epoch": 5.457541899441341, + "grad_norm": 1.2847996950149536, + "learning_rate": 0.0007293837535014006, + "loss": 0.4806, + "step": 9769 + }, + { + "epoch": 5.4581005586592175, + "grad_norm": 0.5115010142326355, + "learning_rate": 0.0007293557422969188, + "loss": 0.4574, + "step": 9770 + }, + { + "epoch": 5.458659217877095, + "grad_norm": 0.7836250066757202, + "learning_rate": 0.000729327731092437, + "loss": 0.4988, + "step": 9771 + }, + { + "epoch": 5.459217877094972, + "grad_norm": 0.4542462229728699, + "learning_rate": 0.0007292997198879552, + "loss": 0.481, + "step": 9772 + }, + { + "epoch": 5.459776536312849, + "grad_norm": 0.5937549471855164, + "learning_rate": 0.0007292717086834733, + "loss": 0.4334, + "step": 9773 + }, + { + "epoch": 5.460335195530726, + "grad_norm": 1.0559362173080444, + "learning_rate": 0.0007292436974789916, + "loss": 0.5298, + "step": 9774 + }, + { + "epoch": 5.460893854748603, + "grad_norm": 0.48945626616477966, + "learning_rate": 0.0007292156862745098, + "loss": 0.4505, + "step": 9775 + }, + { + "epoch": 5.4614525139664805, + "grad_norm": 0.6664998531341553, + "learning_rate": 0.000729187675070028, + "loss": 0.4166, + "step": 9776 + }, + { + "epoch": 5.462011173184358, + "grad_norm": 0.7264735698699951, + "learning_rate": 0.0007291596638655463, + "loss": 0.4789, + "step": 9777 + }, + { + "epoch": 5.4625698324022345, + "grad_norm": 0.596433162689209, + "learning_rate": 0.0007291316526610645, + "loss": 0.4553, + "step": 9778 + }, + { + "epoch": 5.463128491620112, + "grad_norm": 0.4596468210220337, + "learning_rate": 0.0007291036414565827, + "loss": 0.4302, + "step": 9779 + }, + { + "epoch": 5.4636871508379885, + "grad_norm": 0.5624420642852783, + "learning_rate": 0.0007290756302521009, + "loss": 0.5445, + "step": 9780 + }, + { + "epoch": 5.464245810055866, + "grad_norm": 1.8031001091003418, + "learning_rate": 0.0007290476190476191, + "loss": 0.4658, + "step": 9781 + }, + { + "epoch": 5.464804469273743, + "grad_norm": 0.5349305272102356, + "learning_rate": 0.0007290196078431373, + "loss": 0.5365, + "step": 9782 + }, + { + "epoch": 5.46536312849162, + "grad_norm": 0.6741350293159485, + "learning_rate": 0.0007289915966386555, + "loss": 0.5111, + "step": 9783 + }, + { + "epoch": 5.465921787709497, + "grad_norm": 0.5121055841445923, + "learning_rate": 0.0007289635854341737, + "loss": 0.4619, + "step": 9784 + }, + { + "epoch": 5.466480446927374, + "grad_norm": 0.4975737929344177, + "learning_rate": 0.0007289355742296919, + "loss": 0.4467, + "step": 9785 + }, + { + "epoch": 5.4670391061452515, + "grad_norm": 0.5289967656135559, + "learning_rate": 0.0007289075630252101, + "loss": 0.5125, + "step": 9786 + }, + { + "epoch": 5.467597765363129, + "grad_norm": 0.4235764145851135, + "learning_rate": 0.0007288795518207283, + "loss": 0.4657, + "step": 9787 + }, + { + "epoch": 5.4681564245810055, + "grad_norm": 0.697348415851593, + "learning_rate": 0.0007288515406162465, + "loss": 0.4799, + "step": 9788 + }, + { + "epoch": 5.468715083798883, + "grad_norm": 0.5696758031845093, + "learning_rate": 0.0007288235294117647, + "loss": 0.3823, + "step": 9789 + }, + { + "epoch": 5.4692737430167595, + "grad_norm": 0.5714661478996277, + "learning_rate": 0.0007287955182072829, + "loss": 0.3269, + "step": 9790 + }, + { + "epoch": 5.469832402234637, + "grad_norm": 0.8170825242996216, + "learning_rate": 0.0007287675070028011, + "loss": 0.4135, + "step": 9791 + }, + { + "epoch": 5.4703910614525135, + "grad_norm": 0.49487197399139404, + "learning_rate": 0.0007287394957983193, + "loss": 0.53, + "step": 9792 + }, + { + "epoch": 5.470949720670391, + "grad_norm": 0.7495507001876831, + "learning_rate": 0.0007287114845938375, + "loss": 0.4666, + "step": 9793 + }, + { + "epoch": 5.471508379888268, + "grad_norm": 0.5392534136772156, + "learning_rate": 0.0007286834733893559, + "loss": 0.3898, + "step": 9794 + }, + { + "epoch": 5.472067039106145, + "grad_norm": 1.5419838428497314, + "learning_rate": 0.000728655462184874, + "loss": 0.4037, + "step": 9795 + }, + { + "epoch": 5.4726256983240225, + "grad_norm": 0.36275818943977356, + "learning_rate": 0.0007286274509803922, + "loss": 0.3731, + "step": 9796 + }, + { + "epoch": 5.473184357541899, + "grad_norm": 0.7083741426467896, + "learning_rate": 0.0007285994397759104, + "loss": 0.6887, + "step": 9797 + }, + { + "epoch": 5.4737430167597765, + "grad_norm": 1.5125281810760498, + "learning_rate": 0.0007285714285714286, + "loss": 0.6228, + "step": 9798 + }, + { + "epoch": 5.474301675977654, + "grad_norm": 2.1880712509155273, + "learning_rate": 0.0007285434173669469, + "loss": 0.4433, + "step": 9799 + }, + { + "epoch": 5.4748603351955305, + "grad_norm": 0.7548524141311646, + "learning_rate": 0.000728515406162465, + "loss": 0.5692, + "step": 9800 + }, + { + "epoch": 5.475418994413408, + "grad_norm": 0.6337263584136963, + "learning_rate": 0.0007284873949579832, + "loss": 0.4067, + "step": 9801 + }, + { + "epoch": 5.4759776536312845, + "grad_norm": 0.7854358553886414, + "learning_rate": 0.0007284593837535014, + "loss": 0.5491, + "step": 9802 + }, + { + "epoch": 5.476536312849162, + "grad_norm": 0.5236221551895142, + "learning_rate": 0.0007284313725490196, + "loss": 0.4444, + "step": 9803 + }, + { + "epoch": 5.477094972067039, + "grad_norm": 3.851414680480957, + "learning_rate": 0.0007284033613445379, + "loss": 0.705, + "step": 9804 + }, + { + "epoch": 5.477653631284916, + "grad_norm": 0.47031188011169434, + "learning_rate": 0.000728375350140056, + "loss": 0.5239, + "step": 9805 + }, + { + "epoch": 5.4782122905027935, + "grad_norm": 0.4021458923816681, + "learning_rate": 0.0007283473389355742, + "loss": 0.4387, + "step": 9806 + }, + { + "epoch": 5.47877094972067, + "grad_norm": 0.5655771493911743, + "learning_rate": 0.0007283193277310924, + "loss": 0.3806, + "step": 9807 + }, + { + "epoch": 5.4793296089385475, + "grad_norm": 0.5272669196128845, + "learning_rate": 0.0007282913165266106, + "loss": 0.5229, + "step": 9808 + }, + { + "epoch": 5.479888268156425, + "grad_norm": 0.5111368298530579, + "learning_rate": 0.000728263305322129, + "loss": 0.367, + "step": 9809 + }, + { + "epoch": 5.4804469273743015, + "grad_norm": 0.5307624936103821, + "learning_rate": 0.0007282352941176472, + "loss": 0.425, + "step": 9810 + }, + { + "epoch": 5.481005586592179, + "grad_norm": 0.48857417702674866, + "learning_rate": 0.0007282072829131653, + "loss": 0.3748, + "step": 9811 + }, + { + "epoch": 5.4815642458100555, + "grad_norm": 0.4147144854068756, + "learning_rate": 0.0007281792717086835, + "loss": 0.4621, + "step": 9812 + }, + { + "epoch": 5.482122905027933, + "grad_norm": 0.5259796380996704, + "learning_rate": 0.0007281512605042017, + "loss": 0.3953, + "step": 9813 + }, + { + "epoch": 5.48268156424581, + "grad_norm": 0.829331636428833, + "learning_rate": 0.0007281232492997199, + "loss": 0.3615, + "step": 9814 + }, + { + "epoch": 5.483240223463687, + "grad_norm": 0.6177496910095215, + "learning_rate": 0.0007280952380952382, + "loss": 0.4471, + "step": 9815 + }, + { + "epoch": 5.4837988826815645, + "grad_norm": 0.5386406183242798, + "learning_rate": 0.0007280672268907563, + "loss": 0.3801, + "step": 9816 + }, + { + "epoch": 5.484357541899441, + "grad_norm": 0.48177188634872437, + "learning_rate": 0.0007280392156862745, + "loss": 0.4954, + "step": 9817 + }, + { + "epoch": 5.4849162011173185, + "grad_norm": 0.6160786747932434, + "learning_rate": 0.0007280112044817927, + "loss": 0.4273, + "step": 9818 + }, + { + "epoch": 5.485474860335196, + "grad_norm": 0.42306891083717346, + "learning_rate": 0.0007279831932773109, + "loss": 0.3996, + "step": 9819 + }, + { + "epoch": 5.4860335195530725, + "grad_norm": 0.6724539399147034, + "learning_rate": 0.0007279551820728292, + "loss": 0.4493, + "step": 9820 + }, + { + "epoch": 5.48659217877095, + "grad_norm": 2.4731643199920654, + "learning_rate": 0.0007279271708683473, + "loss": 0.431, + "step": 9821 + }, + { + "epoch": 5.4871508379888265, + "grad_norm": 0.6919944286346436, + "learning_rate": 0.0007278991596638655, + "loss": 0.5869, + "step": 9822 + }, + { + "epoch": 5.487709497206704, + "grad_norm": 0.5119506120681763, + "learning_rate": 0.0007278711484593837, + "loss": 0.5099, + "step": 9823 + }, + { + "epoch": 5.488268156424581, + "grad_norm": 0.5532276034355164, + "learning_rate": 0.0007278431372549019, + "loss": 0.4434, + "step": 9824 + }, + { + "epoch": 5.488826815642458, + "grad_norm": 1.0022008419036865, + "learning_rate": 0.0007278151260504202, + "loss": 0.3452, + "step": 9825 + }, + { + "epoch": 5.4893854748603355, + "grad_norm": 0.8269253373146057, + "learning_rate": 0.0007277871148459385, + "loss": 0.4684, + "step": 9826 + }, + { + "epoch": 5.489944134078212, + "grad_norm": 0.4292662441730499, + "learning_rate": 0.0007277591036414566, + "loss": 0.4355, + "step": 9827 + }, + { + "epoch": 5.4905027932960895, + "grad_norm": 1.08241605758667, + "learning_rate": 0.0007277310924369748, + "loss": 0.3902, + "step": 9828 + }, + { + "epoch": 5.491061452513966, + "grad_norm": 1.653778076171875, + "learning_rate": 0.000727703081232493, + "loss": 0.4446, + "step": 9829 + }, + { + "epoch": 5.4916201117318435, + "grad_norm": 2.0861117839813232, + "learning_rate": 0.0007276750700280113, + "loss": 0.4145, + "step": 9830 + }, + { + "epoch": 5.492178770949721, + "grad_norm": 0.6282828450202942, + "learning_rate": 0.0007276470588235295, + "loss": 0.4928, + "step": 9831 + }, + { + "epoch": 5.4927374301675975, + "grad_norm": 0.4635153114795685, + "learning_rate": 0.0007276190476190476, + "loss": 0.4133, + "step": 9832 + }, + { + "epoch": 5.493296089385475, + "grad_norm": 0.698635995388031, + "learning_rate": 0.0007275910364145658, + "loss": 0.461, + "step": 9833 + }, + { + "epoch": 5.4938547486033515, + "grad_norm": 0.3883945047855377, + "learning_rate": 0.000727563025210084, + "loss": 0.3465, + "step": 9834 + }, + { + "epoch": 5.494413407821229, + "grad_norm": 0.5274906158447266, + "learning_rate": 0.0007275350140056023, + "loss": 0.5029, + "step": 9835 + }, + { + "epoch": 5.4949720670391065, + "grad_norm": 0.5786053538322449, + "learning_rate": 0.0007275070028011205, + "loss": 0.5087, + "step": 9836 + }, + { + "epoch": 5.495530726256983, + "grad_norm": 0.49566227197647095, + "learning_rate": 0.0007274789915966386, + "loss": 0.3724, + "step": 9837 + }, + { + "epoch": 5.4960893854748605, + "grad_norm": 4.963715076446533, + "learning_rate": 0.0007274509803921568, + "loss": 0.481, + "step": 9838 + }, + { + "epoch": 5.496648044692737, + "grad_norm": 0.6923301815986633, + "learning_rate": 0.000727422969187675, + "loss": 0.4789, + "step": 9839 + }, + { + "epoch": 5.4972067039106145, + "grad_norm": 0.5754944086074829, + "learning_rate": 0.0007273949579831933, + "loss": 0.4784, + "step": 9840 + }, + { + "epoch": 5.497765363128492, + "grad_norm": 0.6470469832420349, + "learning_rate": 0.0007273669467787115, + "loss": 0.3967, + "step": 9841 + }, + { + "epoch": 5.4983240223463685, + "grad_norm": 0.5435767769813538, + "learning_rate": 0.0007273389355742297, + "loss": 0.4058, + "step": 9842 + }, + { + "epoch": 5.498882681564246, + "grad_norm": 0.646068274974823, + "learning_rate": 0.0007273109243697478, + "loss": 0.4498, + "step": 9843 + }, + { + "epoch": 5.4994413407821225, + "grad_norm": 0.5059846639633179, + "learning_rate": 0.000727282913165266, + "loss": 0.3569, + "step": 9844 + }, + { + "epoch": 5.5, + "grad_norm": 0.5060825943946838, + "learning_rate": 0.0007272549019607844, + "loss": 0.3963, + "step": 9845 + }, + { + "epoch": 5.5005586592178775, + "grad_norm": 0.5156931281089783, + "learning_rate": 0.0007272268907563026, + "loss": 0.4709, + "step": 9846 + }, + { + "epoch": 5.501117318435754, + "grad_norm": 0.4591083228588104, + "learning_rate": 0.0007271988795518208, + "loss": 0.3489, + "step": 9847 + }, + { + "epoch": 5.5016759776536315, + "grad_norm": 0.6754613518714905, + "learning_rate": 0.0007271708683473389, + "loss": 0.5642, + "step": 9848 + }, + { + "epoch": 5.502234636871508, + "grad_norm": 0.6588128209114075, + "learning_rate": 0.0007271428571428571, + "loss": 0.4926, + "step": 9849 + }, + { + "epoch": 5.5027932960893855, + "grad_norm": 0.5223484039306641, + "learning_rate": 0.0007271148459383754, + "loss": 0.4975, + "step": 9850 + }, + { + "epoch": 5.503351955307263, + "grad_norm": 0.9048058986663818, + "learning_rate": 0.0007270868347338936, + "loss": 0.4119, + "step": 9851 + }, + { + "epoch": 5.5039106145251395, + "grad_norm": 0.5210031867027283, + "learning_rate": 0.0007270588235294118, + "loss": 0.4993, + "step": 9852 + }, + { + "epoch": 5.504469273743017, + "grad_norm": 0.5703561305999756, + "learning_rate": 0.0007270308123249299, + "loss": 0.4657, + "step": 9853 + }, + { + "epoch": 5.5050279329608935, + "grad_norm": 0.3826907277107239, + "learning_rate": 0.0007270028011204481, + "loss": 0.3661, + "step": 9854 + }, + { + "epoch": 5.505586592178771, + "grad_norm": 0.5331421494483948, + "learning_rate": 0.0007269747899159664, + "loss": 0.528, + "step": 9855 + }, + { + "epoch": 5.506145251396648, + "grad_norm": 0.9331299066543579, + "learning_rate": 0.0007269467787114846, + "loss": 0.4596, + "step": 9856 + }, + { + "epoch": 5.506703910614525, + "grad_norm": 0.7615708708763123, + "learning_rate": 0.0007269187675070028, + "loss": 0.4064, + "step": 9857 + }, + { + "epoch": 5.5072625698324025, + "grad_norm": 1.6598408222198486, + "learning_rate": 0.000726890756302521, + "loss": 0.3933, + "step": 9858 + }, + { + "epoch": 5.507821229050279, + "grad_norm": 0.45569321513175964, + "learning_rate": 0.0007268627450980391, + "loss": 0.4479, + "step": 9859 + }, + { + "epoch": 5.5083798882681565, + "grad_norm": 1.6721045970916748, + "learning_rate": 0.0007268347338935575, + "loss": 0.5206, + "step": 9860 + }, + { + "epoch": 5.508938547486034, + "grad_norm": 0.4890991747379303, + "learning_rate": 0.0007268067226890757, + "loss": 0.3752, + "step": 9861 + }, + { + "epoch": 5.5094972067039105, + "grad_norm": 0.7073611617088318, + "learning_rate": 0.0007267787114845939, + "loss": 0.5763, + "step": 9862 + }, + { + "epoch": 5.510055865921788, + "grad_norm": 0.4285040497779846, + "learning_rate": 0.0007267507002801121, + "loss": 0.3729, + "step": 9863 + }, + { + "epoch": 5.5106145251396645, + "grad_norm": 0.6984803080558777, + "learning_rate": 0.0007267226890756302, + "loss": 0.4455, + "step": 9864 + }, + { + "epoch": 5.511173184357542, + "grad_norm": 0.6127260327339172, + "learning_rate": 0.0007266946778711485, + "loss": 0.3976, + "step": 9865 + }, + { + "epoch": 5.511731843575419, + "grad_norm": 0.5203281044960022, + "learning_rate": 0.0007266666666666667, + "loss": 0.4531, + "step": 9866 + }, + { + "epoch": 5.512290502793296, + "grad_norm": 0.5179316401481628, + "learning_rate": 0.0007266386554621849, + "loss": 0.4745, + "step": 9867 + }, + { + "epoch": 5.5128491620111735, + "grad_norm": 0.5677104592323303, + "learning_rate": 0.0007266106442577031, + "loss": 0.4281, + "step": 9868 + }, + { + "epoch": 5.51340782122905, + "grad_norm": 11.250227928161621, + "learning_rate": 0.0007265826330532212, + "loss": 0.4614, + "step": 9869 + }, + { + "epoch": 5.5139664804469275, + "grad_norm": 0.6089310050010681, + "learning_rate": 0.0007265546218487395, + "loss": 0.4924, + "step": 9870 + }, + { + "epoch": 5.514525139664805, + "grad_norm": 0.38104772567749023, + "learning_rate": 0.0007265266106442577, + "loss": 0.3731, + "step": 9871 + }, + { + "epoch": 5.5150837988826815, + "grad_norm": 0.4221259653568268, + "learning_rate": 0.0007264985994397759, + "loss": 0.3629, + "step": 9872 + }, + { + "epoch": 5.515642458100559, + "grad_norm": 0.6919800043106079, + "learning_rate": 0.0007264705882352941, + "loss": 0.4592, + "step": 9873 + }, + { + "epoch": 5.5162011173184355, + "grad_norm": 5.412047863006592, + "learning_rate": 0.0007264425770308123, + "loss": 0.4079, + "step": 9874 + }, + { + "epoch": 5.516759776536313, + "grad_norm": 18.767086029052734, + "learning_rate": 0.0007264145658263305, + "loss": 0.4061, + "step": 9875 + }, + { + "epoch": 5.51731843575419, + "grad_norm": 0.4553179442882538, + "learning_rate": 0.0007263865546218488, + "loss": 0.4817, + "step": 9876 + }, + { + "epoch": 5.517877094972067, + "grad_norm": 0.5219476819038391, + "learning_rate": 0.000726358543417367, + "loss": 0.5107, + "step": 9877 + }, + { + "epoch": 5.5184357541899445, + "grad_norm": 0.5047419667243958, + "learning_rate": 0.0007263305322128852, + "loss": 0.4855, + "step": 9878 + }, + { + "epoch": 5.518994413407821, + "grad_norm": 0.4721706211566925, + "learning_rate": 0.0007263025210084034, + "loss": 0.4413, + "step": 9879 + }, + { + "epoch": 5.5195530726256985, + "grad_norm": 0.6387419700622559, + "learning_rate": 0.0007262745098039216, + "loss": 0.463, + "step": 9880 + }, + { + "epoch": 5.520111731843575, + "grad_norm": 1.0378801822662354, + "learning_rate": 0.0007262464985994398, + "loss": 0.3899, + "step": 9881 + }, + { + "epoch": 5.5206703910614525, + "grad_norm": 0.587094783782959, + "learning_rate": 0.000726218487394958, + "loss": 0.5024, + "step": 9882 + }, + { + "epoch": 5.52122905027933, + "grad_norm": 0.4050324857234955, + "learning_rate": 0.0007261904761904762, + "loss": 0.408, + "step": 9883 + }, + { + "epoch": 5.5217877094972065, + "grad_norm": 0.8569478392601013, + "learning_rate": 0.0007261624649859944, + "loss": 0.4879, + "step": 9884 + }, + { + "epoch": 5.522346368715084, + "grad_norm": 0.401920884847641, + "learning_rate": 0.0007261344537815126, + "loss": 0.4856, + "step": 9885 + }, + { + "epoch": 5.522905027932961, + "grad_norm": 0.39946064352989197, + "learning_rate": 0.0007261064425770308, + "loss": 0.3503, + "step": 9886 + }, + { + "epoch": 5.523463687150838, + "grad_norm": 0.5640581846237183, + "learning_rate": 0.000726078431372549, + "loss": 0.4927, + "step": 9887 + }, + { + "epoch": 5.5240223463687155, + "grad_norm": 0.9073072075843811, + "learning_rate": 0.0007260504201680672, + "loss": 0.628, + "step": 9888 + }, + { + "epoch": 5.524581005586592, + "grad_norm": 0.4360564053058624, + "learning_rate": 0.0007260224089635854, + "loss": 0.3375, + "step": 9889 + }, + { + "epoch": 5.5251396648044695, + "grad_norm": 0.5369274616241455, + "learning_rate": 0.0007259943977591037, + "loss": 0.4587, + "step": 9890 + }, + { + "epoch": 5.525698324022346, + "grad_norm": 0.49115100502967834, + "learning_rate": 0.0007259663865546218, + "loss": 0.3569, + "step": 9891 + }, + { + "epoch": 5.5262569832402235, + "grad_norm": 0.44144749641418457, + "learning_rate": 0.00072593837535014, + "loss": 0.3566, + "step": 9892 + }, + { + "epoch": 5.5268156424581, + "grad_norm": 0.41151079535484314, + "learning_rate": 0.0007259103641456583, + "loss": 0.4256, + "step": 9893 + }, + { + "epoch": 5.5273743016759775, + "grad_norm": 0.5457106828689575, + "learning_rate": 0.0007258823529411765, + "loss": 0.4617, + "step": 9894 + }, + { + "epoch": 5.527932960893855, + "grad_norm": 0.5878649353981018, + "learning_rate": 0.0007258543417366948, + "loss": 0.5457, + "step": 9895 + }, + { + "epoch": 5.528491620111732, + "grad_norm": 0.5504418611526489, + "learning_rate": 0.0007258263305322129, + "loss": 0.4108, + "step": 9896 + }, + { + "epoch": 5.529050279329609, + "grad_norm": 2.616567373275757, + "learning_rate": 0.0007257983193277311, + "loss": 0.5422, + "step": 9897 + }, + { + "epoch": 5.5296089385474865, + "grad_norm": 0.3624570667743683, + "learning_rate": 0.0007257703081232493, + "loss": 0.3618, + "step": 9898 + }, + { + "epoch": 5.530167597765363, + "grad_norm": 0.6448395252227783, + "learning_rate": 0.0007257422969187675, + "loss": 0.4461, + "step": 9899 + }, + { + "epoch": 5.5307262569832405, + "grad_norm": 0.5649136304855347, + "learning_rate": 0.0007257142857142858, + "loss": 0.4244, + "step": 9900 + }, + { + "epoch": 5.531284916201117, + "grad_norm": 0.9016652703285217, + "learning_rate": 0.0007256862745098039, + "loss": 0.6765, + "step": 9901 + }, + { + "epoch": 5.5318435754189945, + "grad_norm": 0.8346880674362183, + "learning_rate": 0.0007256582633053221, + "loss": 0.6174, + "step": 9902 + }, + { + "epoch": 5.532402234636871, + "grad_norm": 0.6906309127807617, + "learning_rate": 0.0007256302521008403, + "loss": 0.679, + "step": 9903 + }, + { + "epoch": 5.5329608938547485, + "grad_norm": 0.8676113486289978, + "learning_rate": 0.0007256022408963585, + "loss": 0.412, + "step": 9904 + }, + { + "epoch": 5.533519553072626, + "grad_norm": 0.545275092124939, + "learning_rate": 0.0007255742296918768, + "loss": 0.5197, + "step": 9905 + }, + { + "epoch": 5.534078212290503, + "grad_norm": 1.7213584184646606, + "learning_rate": 0.000725546218487395, + "loss": 0.5242, + "step": 9906 + }, + { + "epoch": 5.53463687150838, + "grad_norm": 0.709518551826477, + "learning_rate": 0.0007255182072829131, + "loss": 0.4413, + "step": 9907 + }, + { + "epoch": 5.5351955307262575, + "grad_norm": 0.5518812537193298, + "learning_rate": 0.0007254901960784313, + "loss": 0.4004, + "step": 9908 + }, + { + "epoch": 5.535754189944134, + "grad_norm": 0.5170642137527466, + "learning_rate": 0.0007254621848739496, + "loss": 0.3437, + "step": 9909 + }, + { + "epoch": 5.5363128491620115, + "grad_norm": 0.6805608868598938, + "learning_rate": 0.0007254341736694679, + "loss": 0.5541, + "step": 9910 + }, + { + "epoch": 5.536871508379888, + "grad_norm": 0.5276179313659668, + "learning_rate": 0.0007254061624649861, + "loss": 0.435, + "step": 9911 + }, + { + "epoch": 5.5374301675977655, + "grad_norm": 0.438228964805603, + "learning_rate": 0.0007253781512605042, + "loss": 0.4204, + "step": 9912 + }, + { + "epoch": 5.537988826815642, + "grad_norm": 0.5030065178871155, + "learning_rate": 0.0007253501400560224, + "loss": 0.5013, + "step": 9913 + }, + { + "epoch": 5.5385474860335195, + "grad_norm": 0.5147072076797485, + "learning_rate": 0.0007253221288515406, + "loss": 0.4694, + "step": 9914 + }, + { + "epoch": 5.539106145251397, + "grad_norm": 0.479183554649353, + "learning_rate": 0.0007252941176470589, + "loss": 0.5558, + "step": 9915 + }, + { + "epoch": 5.539664804469274, + "grad_norm": 0.592819094657898, + "learning_rate": 0.0007252661064425771, + "loss": 0.3598, + "step": 9916 + }, + { + "epoch": 5.540223463687151, + "grad_norm": 0.41784223914146423, + "learning_rate": 0.0007252380952380952, + "loss": 0.4706, + "step": 9917 + }, + { + "epoch": 5.540782122905028, + "grad_norm": 0.4165116548538208, + "learning_rate": 0.0007252100840336134, + "loss": 0.3884, + "step": 9918 + }, + { + "epoch": 5.541340782122905, + "grad_norm": 0.6245352029800415, + "learning_rate": 0.0007251820728291316, + "loss": 0.4169, + "step": 9919 + }, + { + "epoch": 5.5418994413407825, + "grad_norm": 2.388648271560669, + "learning_rate": 0.0007251540616246499, + "loss": 0.4267, + "step": 9920 + }, + { + "epoch": 5.542458100558659, + "grad_norm": 1.1768643856048584, + "learning_rate": 0.0007251260504201681, + "loss": 0.4908, + "step": 9921 + }, + { + "epoch": 5.5430167597765365, + "grad_norm": 0.4066906273365021, + "learning_rate": 0.0007250980392156863, + "loss": 0.5364, + "step": 9922 + }, + { + "epoch": 5.543575418994413, + "grad_norm": 0.6321830153465271, + "learning_rate": 0.0007250700280112044, + "loss": 0.3889, + "step": 9923 + }, + { + "epoch": 5.5441340782122905, + "grad_norm": 0.3898478150367737, + "learning_rate": 0.0007250420168067226, + "loss": 0.3647, + "step": 9924 + }, + { + "epoch": 5.544692737430168, + "grad_norm": 0.7242388129234314, + "learning_rate": 0.000725014005602241, + "loss": 0.4181, + "step": 9925 + }, + { + "epoch": 5.545251396648045, + "grad_norm": 0.4089735150337219, + "learning_rate": 0.0007249859943977592, + "loss": 0.4361, + "step": 9926 + }, + { + "epoch": 5.545810055865922, + "grad_norm": 0.5072340965270996, + "learning_rate": 0.0007249579831932774, + "loss": 0.5734, + "step": 9927 + }, + { + "epoch": 5.546368715083799, + "grad_norm": 0.3940463066101074, + "learning_rate": 0.0007249299719887955, + "loss": 0.4593, + "step": 9928 + }, + { + "epoch": 5.546927374301676, + "grad_norm": 0.48787176609039307, + "learning_rate": 0.0007249019607843137, + "loss": 0.5869, + "step": 9929 + }, + { + "epoch": 5.547486033519553, + "grad_norm": 0.4486599564552307, + "learning_rate": 0.000724873949579832, + "loss": 0.3893, + "step": 9930 + }, + { + "epoch": 5.54804469273743, + "grad_norm": 0.5582526922225952, + "learning_rate": 0.0007248459383753502, + "loss": 0.5283, + "step": 9931 + }, + { + "epoch": 5.5486033519553075, + "grad_norm": 3.792804479598999, + "learning_rate": 0.0007248179271708684, + "loss": 0.4343, + "step": 9932 + }, + { + "epoch": 5.549162011173184, + "grad_norm": 0.7790818810462952, + "learning_rate": 0.0007247899159663865, + "loss": 0.4612, + "step": 9933 + }, + { + "epoch": 5.5497206703910615, + "grad_norm": 1.0483152866363525, + "learning_rate": 0.0007247619047619047, + "loss": 0.6877, + "step": 9934 + }, + { + "epoch": 5.550279329608939, + "grad_norm": 0.4311125576496124, + "learning_rate": 0.000724733893557423, + "loss": 0.4446, + "step": 9935 + }, + { + "epoch": 5.550837988826816, + "grad_norm": 0.8082439303398132, + "learning_rate": 0.0007247058823529412, + "loss": 0.5547, + "step": 9936 + }, + { + "epoch": 5.551396648044693, + "grad_norm": 0.807525634765625, + "learning_rate": 0.0007246778711484594, + "loss": 0.4132, + "step": 9937 + }, + { + "epoch": 5.55195530726257, + "grad_norm": 0.6115683913230896, + "learning_rate": 0.0007246498599439776, + "loss": 0.4223, + "step": 9938 + }, + { + "epoch": 5.552513966480447, + "grad_norm": 0.6792250275611877, + "learning_rate": 0.0007246218487394957, + "loss": 0.4893, + "step": 9939 + }, + { + "epoch": 5.553072625698324, + "grad_norm": 0.5155826807022095, + "learning_rate": 0.000724593837535014, + "loss": 0.4944, + "step": 9940 + }, + { + "epoch": 5.553631284916201, + "grad_norm": 0.7451338171958923, + "learning_rate": 0.0007245658263305323, + "loss": 0.4332, + "step": 9941 + }, + { + "epoch": 5.5541899441340785, + "grad_norm": 1.2530287504196167, + "learning_rate": 0.0007245378151260505, + "loss": 0.4383, + "step": 9942 + }, + { + "epoch": 5.554748603351955, + "grad_norm": 0.6665396690368652, + "learning_rate": 0.0007245098039215687, + "loss": 0.4881, + "step": 9943 + }, + { + "epoch": 5.5553072625698325, + "grad_norm": 0.6986837387084961, + "learning_rate": 0.0007244817927170868, + "loss": 0.3823, + "step": 9944 + }, + { + "epoch": 5.55586592178771, + "grad_norm": 0.6157588362693787, + "learning_rate": 0.0007244537815126051, + "loss": 0.4765, + "step": 9945 + }, + { + "epoch": 5.556424581005587, + "grad_norm": 0.5928972363471985, + "learning_rate": 0.0007244257703081233, + "loss": 0.4312, + "step": 9946 + }, + { + "epoch": 5.556983240223464, + "grad_norm": 0.6123839616775513, + "learning_rate": 0.0007243977591036415, + "loss": 0.3446, + "step": 9947 + }, + { + "epoch": 5.557541899441341, + "grad_norm": 0.7116967439651489, + "learning_rate": 0.0007243697478991597, + "loss": 0.3571, + "step": 9948 + }, + { + "epoch": 5.558100558659218, + "grad_norm": 0.4645528197288513, + "learning_rate": 0.0007243417366946778, + "loss": 0.4266, + "step": 9949 + }, + { + "epoch": 5.558659217877095, + "grad_norm": 0.44229093194007874, + "learning_rate": 0.0007243137254901961, + "loss": 0.3995, + "step": 9950 + }, + { + "epoch": 5.559217877094972, + "grad_norm": 0.5539903044700623, + "learning_rate": 0.0007242857142857143, + "loss": 0.3754, + "step": 9951 + }, + { + "epoch": 5.5597765363128495, + "grad_norm": 0.5988803505897522, + "learning_rate": 0.0007242577030812325, + "loss": 0.414, + "step": 9952 + }, + { + "epoch": 5.560335195530726, + "grad_norm": 2.0920209884643555, + "learning_rate": 0.0007242296918767507, + "loss": 0.4041, + "step": 9953 + }, + { + "epoch": 5.5608938547486035, + "grad_norm": 0.7040426731109619, + "learning_rate": 0.0007242016806722689, + "loss": 0.3424, + "step": 9954 + }, + { + "epoch": 5.56145251396648, + "grad_norm": 0.6377585530281067, + "learning_rate": 0.0007241736694677871, + "loss": 0.3787, + "step": 9955 + }, + { + "epoch": 5.562011173184358, + "grad_norm": 0.6620262861251831, + "learning_rate": 0.0007241456582633053, + "loss": 0.5126, + "step": 9956 + }, + { + "epoch": 5.562569832402235, + "grad_norm": 0.627249002456665, + "learning_rate": 0.0007241176470588235, + "loss": 0.5437, + "step": 9957 + }, + { + "epoch": 5.563128491620112, + "grad_norm": 0.6866033673286438, + "learning_rate": 0.0007240896358543418, + "loss": 0.5025, + "step": 9958 + }, + { + "epoch": 5.563687150837989, + "grad_norm": 0.5920504927635193, + "learning_rate": 0.00072406162464986, + "loss": 0.4295, + "step": 9959 + }, + { + "epoch": 5.564245810055866, + "grad_norm": 0.4929662048816681, + "learning_rate": 0.0007240336134453782, + "loss": 0.5608, + "step": 9960 + }, + { + "epoch": 5.564804469273743, + "grad_norm": 0.4381335377693176, + "learning_rate": 0.0007240056022408964, + "loss": 0.4269, + "step": 9961 + }, + { + "epoch": 5.5653631284916205, + "grad_norm": 0.6184817552566528, + "learning_rate": 0.0007239775910364146, + "loss": 0.5056, + "step": 9962 + }, + { + "epoch": 5.565921787709497, + "grad_norm": 1.3165264129638672, + "learning_rate": 0.0007239495798319328, + "loss": 0.5942, + "step": 9963 + }, + { + "epoch": 5.5664804469273745, + "grad_norm": 0.6004919409751892, + "learning_rate": 0.000723921568627451, + "loss": 0.599, + "step": 9964 + }, + { + "epoch": 5.567039106145251, + "grad_norm": 2.6897833347320557, + "learning_rate": 0.0007238935574229693, + "loss": 0.3412, + "step": 9965 + }, + { + "epoch": 5.567597765363129, + "grad_norm": 5.2298665046691895, + "learning_rate": 0.0007238655462184874, + "loss": 0.4602, + "step": 9966 + }, + { + "epoch": 5.568156424581005, + "grad_norm": 0.398495078086853, + "learning_rate": 0.0007238375350140056, + "loss": 0.3792, + "step": 9967 + }, + { + "epoch": 5.568715083798883, + "grad_norm": 0.4069007933139801, + "learning_rate": 0.0007238095238095238, + "loss": 0.358, + "step": 9968 + }, + { + "epoch": 5.56927374301676, + "grad_norm": 0.39702823758125305, + "learning_rate": 0.000723781512605042, + "loss": 0.3858, + "step": 9969 + }, + { + "epoch": 5.569832402234637, + "grad_norm": 0.6081941723823547, + "learning_rate": 0.0007237535014005603, + "loss": 0.4399, + "step": 9970 + }, + { + "epoch": 5.570391061452514, + "grad_norm": 0.9728443622589111, + "learning_rate": 0.0007237254901960784, + "loss": 0.3563, + "step": 9971 + }, + { + "epoch": 5.5709497206703915, + "grad_norm": 0.8170782923698425, + "learning_rate": 0.0007236974789915966, + "loss": 0.4035, + "step": 9972 + }, + { + "epoch": 5.571508379888268, + "grad_norm": 0.48218995332717896, + "learning_rate": 0.0007236694677871148, + "loss": 0.3905, + "step": 9973 + }, + { + "epoch": 5.5720670391061455, + "grad_norm": 0.5158736705780029, + "learning_rate": 0.000723641456582633, + "loss": 0.4134, + "step": 9974 + }, + { + "epoch": 5.572625698324022, + "grad_norm": 0.5635305643081665, + "learning_rate": 0.0007236134453781514, + "loss": 0.4232, + "step": 9975 + }, + { + "epoch": 5.5731843575419, + "grad_norm": 3.2909793853759766, + "learning_rate": 0.0007235854341736695, + "loss": 0.3994, + "step": 9976 + }, + { + "epoch": 5.573743016759776, + "grad_norm": 0.9748474955558777, + "learning_rate": 0.0007235574229691877, + "loss": 0.4116, + "step": 9977 + }, + { + "epoch": 5.574301675977654, + "grad_norm": 0.603339672088623, + "learning_rate": 0.0007235294117647059, + "loss": 0.4266, + "step": 9978 + }, + { + "epoch": 5.574860335195531, + "grad_norm": 0.5575515627861023, + "learning_rate": 0.0007235014005602241, + "loss": 0.5807, + "step": 9979 + }, + { + "epoch": 5.575418994413408, + "grad_norm": 0.5503337383270264, + "learning_rate": 0.0007234733893557424, + "loss": 0.4602, + "step": 9980 + }, + { + "epoch": 5.575977653631285, + "grad_norm": 0.6396551132202148, + "learning_rate": 0.0007234453781512606, + "loss": 0.5343, + "step": 9981 + }, + { + "epoch": 5.576536312849162, + "grad_norm": 0.5318945050239563, + "learning_rate": 0.0007234173669467787, + "loss": 0.4157, + "step": 9982 + }, + { + "epoch": 5.577094972067039, + "grad_norm": 0.4633936583995819, + "learning_rate": 0.0007233893557422969, + "loss": 0.5588, + "step": 9983 + }, + { + "epoch": 5.5776536312849165, + "grad_norm": 0.7308537364006042, + "learning_rate": 0.0007233613445378151, + "loss": 0.4908, + "step": 9984 + }, + { + "epoch": 5.578212290502793, + "grad_norm": 0.5050171613693237, + "learning_rate": 0.0007233333333333334, + "loss": 0.4761, + "step": 9985 + }, + { + "epoch": 5.578770949720671, + "grad_norm": 0.8002045154571533, + "learning_rate": 0.0007233053221288516, + "loss": 0.7016, + "step": 9986 + }, + { + "epoch": 5.579329608938547, + "grad_norm": 0.5756903886795044, + "learning_rate": 0.0007232773109243697, + "loss": 0.4641, + "step": 9987 + }, + { + "epoch": 5.579888268156425, + "grad_norm": 0.6262851357460022, + "learning_rate": 0.0007232492997198879, + "loss": 0.4732, + "step": 9988 + }, + { + "epoch": 5.580446927374302, + "grad_norm": 4.5401811599731445, + "learning_rate": 0.0007232212885154061, + "loss": 0.4239, + "step": 9989 + }, + { + "epoch": 5.581005586592179, + "grad_norm": 2.0694973468780518, + "learning_rate": 0.0007231932773109245, + "loss": 0.4129, + "step": 9990 + }, + { + "epoch": 5.581564245810056, + "grad_norm": 1.45489501953125, + "learning_rate": 0.0007231652661064427, + "loss": 0.533, + "step": 9991 + }, + { + "epoch": 5.582122905027933, + "grad_norm": 0.6292015314102173, + "learning_rate": 0.0007231372549019608, + "loss": 0.5515, + "step": 9992 + }, + { + "epoch": 5.58268156424581, + "grad_norm": 2.903557777404785, + "learning_rate": 0.000723109243697479, + "loss": 0.4424, + "step": 9993 + }, + { + "epoch": 5.5832402234636875, + "grad_norm": 0.6302845478057861, + "learning_rate": 0.0007230812324929972, + "loss": 0.4439, + "step": 9994 + }, + { + "epoch": 5.583798882681564, + "grad_norm": 0.7181702256202698, + "learning_rate": 0.0007230532212885155, + "loss": 0.5026, + "step": 9995 + }, + { + "epoch": 5.584357541899442, + "grad_norm": 0.4197083115577698, + "learning_rate": 0.0007230252100840337, + "loss": 0.4236, + "step": 9996 + }, + { + "epoch": 5.584916201117318, + "grad_norm": 0.43952012062072754, + "learning_rate": 0.0007229971988795519, + "loss": 0.422, + "step": 9997 + }, + { + "epoch": 5.585474860335196, + "grad_norm": 0.44125857949256897, + "learning_rate": 0.00072296918767507, + "loss": 0.52, + "step": 9998 + }, + { + "epoch": 5.586033519553073, + "grad_norm": 1.0360386371612549, + "learning_rate": 0.0007229411764705882, + "loss": 0.4764, + "step": 9999 + }, + { + "epoch": 5.58659217877095, + "grad_norm": 0.4642835855484009, + "learning_rate": 0.0007229131652661065, + "loss": 0.5008, + "step": 10000 + }, + { + "epoch": 5.58659217877095, + "eval_cer": 0.09317752828603537, + "eval_loss": 0.348407119512558, + "eval_runtime": 55.8205, + "eval_samples_per_second": 81.296, + "eval_steps_per_second": 5.088, + "eval_wer": 0.3695427502025083, + "step": 10000 + }, + { + "epoch": 5.587150837988827, + "grad_norm": 0.46861758828163147, + "learning_rate": 0.0007228851540616247, + "loss": 0.3906, + "step": 10001 + }, + { + "epoch": 5.587709497206704, + "grad_norm": 0.40227460861206055, + "learning_rate": 0.0007228571428571429, + "loss": 0.43, + "step": 10002 + }, + { + "epoch": 5.588268156424581, + "grad_norm": 0.6384165287017822, + "learning_rate": 0.000722829131652661, + "loss": 0.4269, + "step": 10003 + }, + { + "epoch": 5.588826815642458, + "grad_norm": 0.7281396389007568, + "learning_rate": 0.0007228011204481792, + "loss": 0.4503, + "step": 10004 + }, + { + "epoch": 5.589385474860335, + "grad_norm": 0.421785444021225, + "learning_rate": 0.0007227731092436975, + "loss": 0.3037, + "step": 10005 + }, + { + "epoch": 5.589944134078213, + "grad_norm": 1.030700445175171, + "learning_rate": 0.0007227450980392157, + "loss": 0.4839, + "step": 10006 + }, + { + "epoch": 5.590502793296089, + "grad_norm": 0.5000261664390564, + "learning_rate": 0.000722717086834734, + "loss": 0.4447, + "step": 10007 + }, + { + "epoch": 5.591061452513967, + "grad_norm": 0.8885688781738281, + "learning_rate": 0.000722689075630252, + "loss": 0.5159, + "step": 10008 + }, + { + "epoch": 5.591620111731844, + "grad_norm": 0.5367618203163147, + "learning_rate": 0.0007226610644257703, + "loss": 0.5675, + "step": 10009 + }, + { + "epoch": 5.592178770949721, + "grad_norm": 0.5591814517974854, + "learning_rate": 0.0007226330532212886, + "loss": 0.447, + "step": 10010 + }, + { + "epoch": 5.592737430167598, + "grad_norm": 0.4015854299068451, + "learning_rate": 0.0007226050420168068, + "loss": 0.4855, + "step": 10011 + }, + { + "epoch": 5.593296089385475, + "grad_norm": 0.4199011027812958, + "learning_rate": 0.000722577030812325, + "loss": 0.4855, + "step": 10012 + }, + { + "epoch": 5.593854748603352, + "grad_norm": 0.5143280625343323, + "learning_rate": 0.0007225490196078432, + "loss": 0.4491, + "step": 10013 + }, + { + "epoch": 5.594413407821229, + "grad_norm": 0.3952634334564209, + "learning_rate": 0.0007225210084033613, + "loss": 0.3515, + "step": 10014 + }, + { + "epoch": 5.594972067039106, + "grad_norm": 0.4140816330909729, + "learning_rate": 0.0007224929971988796, + "loss": 0.4413, + "step": 10015 + }, + { + "epoch": 5.5955307262569836, + "grad_norm": 0.6067275404930115, + "learning_rate": 0.0007224649859943978, + "loss": 0.703, + "step": 10016 + }, + { + "epoch": 5.59608938547486, + "grad_norm": 0.8579608201980591, + "learning_rate": 0.000722436974789916, + "loss": 0.4522, + "step": 10017 + }, + { + "epoch": 5.596648044692738, + "grad_norm": 0.46375545859336853, + "learning_rate": 0.0007224089635854342, + "loss": 0.3829, + "step": 10018 + }, + { + "epoch": 5.597206703910614, + "grad_norm": 0.43228957056999207, + "learning_rate": 0.0007223809523809523, + "loss": 0.4277, + "step": 10019 + }, + { + "epoch": 5.597765363128492, + "grad_norm": 2.5335819721221924, + "learning_rate": 0.0007223529411764706, + "loss": 0.3521, + "step": 10020 + }, + { + "epoch": 5.598324022346369, + "grad_norm": 0.6500746607780457, + "learning_rate": 0.0007223249299719888, + "loss": 0.5025, + "step": 10021 + }, + { + "epoch": 5.598882681564246, + "grad_norm": 0.6348538398742676, + "learning_rate": 0.000722296918767507, + "loss": 0.3942, + "step": 10022 + }, + { + "epoch": 5.599441340782123, + "grad_norm": 0.5580580830574036, + "learning_rate": 0.0007222689075630253, + "loss": 0.5733, + "step": 10023 + }, + { + "epoch": 5.6, + "grad_norm": 0.817731499671936, + "learning_rate": 0.0007222408963585433, + "loss": 0.5185, + "step": 10024 + }, + { + "epoch": 5.600558659217877, + "grad_norm": 0.4958416521549225, + "learning_rate": 0.0007222128851540617, + "loss": 0.4505, + "step": 10025 + }, + { + "epoch": 5.6011173184357546, + "grad_norm": 0.42211151123046875, + "learning_rate": 0.0007221848739495799, + "loss": 0.3859, + "step": 10026 + }, + { + "epoch": 5.601675977653631, + "grad_norm": 0.4639894366264343, + "learning_rate": 0.0007221568627450981, + "loss": 0.4849, + "step": 10027 + }, + { + "epoch": 5.602234636871509, + "grad_norm": 0.601848840713501, + "learning_rate": 0.0007221288515406163, + "loss": 0.4942, + "step": 10028 + }, + { + "epoch": 5.602793296089385, + "grad_norm": 1.1188138723373413, + "learning_rate": 0.0007221008403361345, + "loss": 0.4027, + "step": 10029 + }, + { + "epoch": 5.603351955307263, + "grad_norm": 0.6030825972557068, + "learning_rate": 0.0007220728291316527, + "loss": 0.4657, + "step": 10030 + }, + { + "epoch": 5.603910614525139, + "grad_norm": 0.6522092819213867, + "learning_rate": 0.0007220448179271709, + "loss": 0.4364, + "step": 10031 + }, + { + "epoch": 5.604469273743017, + "grad_norm": 0.5696516633033752, + "learning_rate": 0.0007220168067226891, + "loss": 0.4526, + "step": 10032 + }, + { + "epoch": 5.605027932960894, + "grad_norm": 0.5284076929092407, + "learning_rate": 0.0007219887955182073, + "loss": 0.5557, + "step": 10033 + }, + { + "epoch": 5.605586592178771, + "grad_norm": 0.5809754133224487, + "learning_rate": 0.0007219607843137255, + "loss": 0.4597, + "step": 10034 + }, + { + "epoch": 5.606145251396648, + "grad_norm": 0.442489355802536, + "learning_rate": 0.0007219327731092437, + "loss": 0.4967, + "step": 10035 + }, + { + "epoch": 5.6067039106145256, + "grad_norm": 0.5501169562339783, + "learning_rate": 0.0007219047619047619, + "loss": 0.3924, + "step": 10036 + }, + { + "epoch": 5.607262569832402, + "grad_norm": 0.3731783628463745, + "learning_rate": 0.0007218767507002801, + "loss": 0.3764, + "step": 10037 + }, + { + "epoch": 5.60782122905028, + "grad_norm": 0.4078274071216583, + "learning_rate": 0.0007218487394957983, + "loss": 0.5023, + "step": 10038 + }, + { + "epoch": 5.608379888268156, + "grad_norm": 0.44391918182373047, + "learning_rate": 0.0007218207282913165, + "loss": 0.435, + "step": 10039 + }, + { + "epoch": 5.608938547486034, + "grad_norm": 0.3873365819454193, + "learning_rate": 0.0007217927170868346, + "loss": 0.4155, + "step": 10040 + }, + { + "epoch": 5.60949720670391, + "grad_norm": 0.5123922228813171, + "learning_rate": 0.000721764705882353, + "loss": 0.5375, + "step": 10041 + }, + { + "epoch": 5.610055865921788, + "grad_norm": 0.4906674921512604, + "learning_rate": 0.0007217366946778712, + "loss": 0.4599, + "step": 10042 + }, + { + "epoch": 5.610614525139665, + "grad_norm": 0.48765140771865845, + "learning_rate": 0.0007217086834733894, + "loss": 0.4794, + "step": 10043 + }, + { + "epoch": 5.611173184357542, + "grad_norm": 0.6538378000259399, + "learning_rate": 0.0007216806722689076, + "loss": 0.4788, + "step": 10044 + }, + { + "epoch": 5.611731843575419, + "grad_norm": 0.5697200894355774, + "learning_rate": 0.0007216526610644258, + "loss": 0.4451, + "step": 10045 + }, + { + "epoch": 5.6122905027932966, + "grad_norm": 1.6420912742614746, + "learning_rate": 0.000721624649859944, + "loss": 0.4216, + "step": 10046 + }, + { + "epoch": 5.612849162011173, + "grad_norm": 0.502399742603302, + "learning_rate": 0.0007215966386554622, + "loss": 0.3592, + "step": 10047 + }, + { + "epoch": 5.613407821229051, + "grad_norm": 4.2799530029296875, + "learning_rate": 0.0007215686274509804, + "loss": 0.5068, + "step": 10048 + }, + { + "epoch": 5.613966480446927, + "grad_norm": 0.9166780710220337, + "learning_rate": 0.0007215406162464986, + "loss": 0.5354, + "step": 10049 + }, + { + "epoch": 5.614525139664805, + "grad_norm": 0.6505798697471619, + "learning_rate": 0.0007215126050420168, + "loss": 0.446, + "step": 10050 + }, + { + "epoch": 5.615083798882681, + "grad_norm": 0.3875667154788971, + "learning_rate": 0.000721484593837535, + "loss": 0.3177, + "step": 10051 + }, + { + "epoch": 5.615642458100559, + "grad_norm": 0.5634588599205017, + "learning_rate": 0.0007214565826330532, + "loss": 0.4833, + "step": 10052 + }, + { + "epoch": 5.616201117318436, + "grad_norm": 0.43827882409095764, + "learning_rate": 0.0007214285714285714, + "loss": 0.4703, + "step": 10053 + }, + { + "epoch": 5.616759776536313, + "grad_norm": 0.6062840223312378, + "learning_rate": 0.0007214005602240896, + "loss": 0.4048, + "step": 10054 + }, + { + "epoch": 5.61731843575419, + "grad_norm": 4.657212734222412, + "learning_rate": 0.0007213725490196078, + "loss": 0.4246, + "step": 10055 + }, + { + "epoch": 5.617877094972067, + "grad_norm": 0.48280224204063416, + "learning_rate": 0.000721344537815126, + "loss": 0.4399, + "step": 10056 + }, + { + "epoch": 5.618435754189944, + "grad_norm": 0.6430901288986206, + "learning_rate": 0.0007213165266106443, + "loss": 0.4206, + "step": 10057 + }, + { + "epoch": 5.618994413407822, + "grad_norm": 0.8701556324958801, + "learning_rate": 0.0007212885154061625, + "loss": 0.4454, + "step": 10058 + }, + { + "epoch": 5.619553072625698, + "grad_norm": 0.8723263740539551, + "learning_rate": 0.0007212605042016807, + "loss": 0.5346, + "step": 10059 + }, + { + "epoch": 5.620111731843576, + "grad_norm": 0.5074014663696289, + "learning_rate": 0.0007212324929971989, + "loss": 0.4705, + "step": 10060 + }, + { + "epoch": 5.620670391061452, + "grad_norm": 0.4879077970981598, + "learning_rate": 0.0007212044817927172, + "loss": 0.5322, + "step": 10061 + }, + { + "epoch": 5.62122905027933, + "grad_norm": 0.8208219408988953, + "learning_rate": 0.0007211764705882353, + "loss": 0.454, + "step": 10062 + }, + { + "epoch": 5.621787709497207, + "grad_norm": 0.4299928843975067, + "learning_rate": 0.0007211484593837535, + "loss": 0.4776, + "step": 10063 + }, + { + "epoch": 5.622346368715084, + "grad_norm": 0.7997061014175415, + "learning_rate": 0.0007211204481792717, + "loss": 0.5368, + "step": 10064 + }, + { + "epoch": 5.622905027932961, + "grad_norm": 0.6432591676712036, + "learning_rate": 0.0007210924369747899, + "loss": 0.4587, + "step": 10065 + }, + { + "epoch": 5.623463687150838, + "grad_norm": 0.49981507658958435, + "learning_rate": 0.0007210644257703082, + "loss": 0.3957, + "step": 10066 + }, + { + "epoch": 5.624022346368715, + "grad_norm": 0.48898595571517944, + "learning_rate": 0.0007210364145658263, + "loss": 0.5023, + "step": 10067 + }, + { + "epoch": 5.624581005586592, + "grad_norm": 0.4667215347290039, + "learning_rate": 0.0007210084033613445, + "loss": 0.6151, + "step": 10068 + }, + { + "epoch": 5.625139664804469, + "grad_norm": 0.5658891797065735, + "learning_rate": 0.0007209803921568627, + "loss": 0.5951, + "step": 10069 + }, + { + "epoch": 5.625698324022347, + "grad_norm": 1.6414737701416016, + "learning_rate": 0.0007209523809523809, + "loss": 0.4226, + "step": 10070 + }, + { + "epoch": 5.626256983240223, + "grad_norm": 1.6741398572921753, + "learning_rate": 0.0007209243697478992, + "loss": 0.7108, + "step": 10071 + }, + { + "epoch": 5.626815642458101, + "grad_norm": 0.40429213643074036, + "learning_rate": 0.0007208963585434173, + "loss": 0.4004, + "step": 10072 + }, + { + "epoch": 5.627374301675978, + "grad_norm": 0.5193113088607788, + "learning_rate": 0.0007208683473389356, + "loss": 0.4949, + "step": 10073 + }, + { + "epoch": 5.627932960893855, + "grad_norm": 0.634591281414032, + "learning_rate": 0.0007208403361344538, + "loss": 0.4283, + "step": 10074 + }, + { + "epoch": 5.628491620111732, + "grad_norm": 0.5754101872444153, + "learning_rate": 0.000720812324929972, + "loss": 0.507, + "step": 10075 + }, + { + "epoch": 5.629050279329609, + "grad_norm": 0.635532796382904, + "learning_rate": 0.0007207843137254903, + "loss": 0.5256, + "step": 10076 + }, + { + "epoch": 5.629608938547486, + "grad_norm": 0.6660797595977783, + "learning_rate": 0.0007207563025210085, + "loss": 0.6456, + "step": 10077 + }, + { + "epoch": 5.630167597765363, + "grad_norm": 0.5921980738639832, + "learning_rate": 0.0007207282913165266, + "loss": 0.4674, + "step": 10078 + }, + { + "epoch": 5.63072625698324, + "grad_norm": 0.4608224332332611, + "learning_rate": 0.0007207002801120448, + "loss": 0.5155, + "step": 10079 + }, + { + "epoch": 5.631284916201118, + "grad_norm": 0.8263514637947083, + "learning_rate": 0.000720672268907563, + "loss": 0.5754, + "step": 10080 + }, + { + "epoch": 5.631843575418994, + "grad_norm": 0.7397025227546692, + "learning_rate": 0.0007206442577030813, + "loss": 0.4833, + "step": 10081 + }, + { + "epoch": 5.632402234636872, + "grad_norm": 0.5124477744102478, + "learning_rate": 0.0007206162464985995, + "loss": 0.4982, + "step": 10082 + }, + { + "epoch": 5.632960893854749, + "grad_norm": 0.4849470853805542, + "learning_rate": 0.0007205882352941176, + "loss": 0.5175, + "step": 10083 + }, + { + "epoch": 5.633519553072626, + "grad_norm": 0.3957095146179199, + "learning_rate": 0.0007205602240896358, + "loss": 0.3494, + "step": 10084 + }, + { + "epoch": 5.634078212290503, + "grad_norm": 1.271714448928833, + "learning_rate": 0.000720532212885154, + "loss": 0.4368, + "step": 10085 + }, + { + "epoch": 5.63463687150838, + "grad_norm": 0.5071250796318054, + "learning_rate": 0.0007205042016806723, + "loss": 0.428, + "step": 10086 + }, + { + "epoch": 5.635195530726257, + "grad_norm": 0.5866625905036926, + "learning_rate": 0.0007204761904761905, + "loss": 0.4318, + "step": 10087 + }, + { + "epoch": 5.635754189944134, + "grad_norm": 0.8688901662826538, + "learning_rate": 0.0007204481792717086, + "loss": 0.5897, + "step": 10088 + }, + { + "epoch": 5.636312849162011, + "grad_norm": 0.5585296154022217, + "learning_rate": 0.0007204201680672268, + "loss": 0.4684, + "step": 10089 + }, + { + "epoch": 5.636871508379889, + "grad_norm": 0.4540899693965912, + "learning_rate": 0.000720392156862745, + "loss": 0.4765, + "step": 10090 + }, + { + "epoch": 5.637430167597765, + "grad_norm": 3.2529242038726807, + "learning_rate": 0.0007203641456582634, + "loss": 0.426, + "step": 10091 + }, + { + "epoch": 5.637988826815643, + "grad_norm": 1.139200210571289, + "learning_rate": 0.0007203361344537816, + "loss": 0.6208, + "step": 10092 + }, + { + "epoch": 5.638547486033519, + "grad_norm": 0.6055642366409302, + "learning_rate": 0.0007203081232492998, + "loss": 0.4254, + "step": 10093 + }, + { + "epoch": 5.639106145251397, + "grad_norm": 1.2333184480667114, + "learning_rate": 0.0007202801120448179, + "loss": 0.4284, + "step": 10094 + }, + { + "epoch": 5.639664804469274, + "grad_norm": 1.5095566511154175, + "learning_rate": 0.0007202521008403361, + "loss": 0.4449, + "step": 10095 + }, + { + "epoch": 5.640223463687151, + "grad_norm": 4.3493475914001465, + "learning_rate": 0.0007202240896358544, + "loss": 0.3722, + "step": 10096 + }, + { + "epoch": 5.640782122905028, + "grad_norm": 0.5825946927070618, + "learning_rate": 0.0007201960784313726, + "loss": 0.3941, + "step": 10097 + }, + { + "epoch": 5.641340782122905, + "grad_norm": 0.5449908375740051, + "learning_rate": 0.0007201680672268908, + "loss": 0.5186, + "step": 10098 + }, + { + "epoch": 5.641899441340782, + "grad_norm": 0.5583450794219971, + "learning_rate": 0.0007201400560224089, + "loss": 0.5047, + "step": 10099 + }, + { + "epoch": 5.64245810055866, + "grad_norm": 0.7477580308914185, + "learning_rate": 0.0007201120448179271, + "loss": 0.3964, + "step": 10100 + }, + { + "epoch": 5.643016759776536, + "grad_norm": 0.7733327150344849, + "learning_rate": 0.0007200840336134454, + "loss": 0.4127, + "step": 10101 + }, + { + "epoch": 5.643575418994414, + "grad_norm": 0.5495614409446716, + "learning_rate": 0.0007200560224089636, + "loss": 0.4351, + "step": 10102 + }, + { + "epoch": 5.64413407821229, + "grad_norm": 0.5987143516540527, + "learning_rate": 0.0007200280112044818, + "loss": 0.404, + "step": 10103 + }, + { + "epoch": 5.644692737430168, + "grad_norm": 0.5004493594169617, + "learning_rate": 0.0007199999999999999, + "loss": 0.407, + "step": 10104 + }, + { + "epoch": 5.645251396648044, + "grad_norm": 0.6078566908836365, + "learning_rate": 0.0007199719887955181, + "loss": 0.4627, + "step": 10105 + }, + { + "epoch": 5.645810055865922, + "grad_norm": 0.7295481562614441, + "learning_rate": 0.0007199439775910365, + "loss": 0.423, + "step": 10106 + }, + { + "epoch": 5.646368715083799, + "grad_norm": 0.5811827778816223, + "learning_rate": 0.0007199159663865547, + "loss": 0.4175, + "step": 10107 + }, + { + "epoch": 5.646927374301676, + "grad_norm": 0.7878211736679077, + "learning_rate": 0.0007198879551820729, + "loss": 0.4309, + "step": 10108 + }, + { + "epoch": 5.647486033519553, + "grad_norm": 0.7037503719329834, + "learning_rate": 0.0007198599439775911, + "loss": 0.4791, + "step": 10109 + }, + { + "epoch": 5.648044692737431, + "grad_norm": 0.2669017016887665, + "learning_rate": 0.0007198319327731092, + "loss": 0.3246, + "step": 10110 + }, + { + "epoch": 5.648603351955307, + "grad_norm": 0.5799449682235718, + "learning_rate": 0.0007198039215686275, + "loss": 0.4184, + "step": 10111 + }, + { + "epoch": 5.649162011173185, + "grad_norm": 1.132827639579773, + "learning_rate": 0.0007197759103641457, + "loss": 0.42, + "step": 10112 + }, + { + "epoch": 5.649720670391061, + "grad_norm": 0.6218265295028687, + "learning_rate": 0.0007197478991596639, + "loss": 0.4334, + "step": 10113 + }, + { + "epoch": 5.650279329608939, + "grad_norm": 0.5614800453186035, + "learning_rate": 0.0007197198879551821, + "loss": 0.3936, + "step": 10114 + }, + { + "epoch": 5.650837988826815, + "grad_norm": 2.8094356060028076, + "learning_rate": 0.0007196918767507002, + "loss": 0.5851, + "step": 10115 + }, + { + "epoch": 5.651396648044693, + "grad_norm": 1.7567263841629028, + "learning_rate": 0.0007196638655462185, + "loss": 0.4774, + "step": 10116 + }, + { + "epoch": 5.65195530726257, + "grad_norm": 0.6033588647842407, + "learning_rate": 0.0007196358543417367, + "loss": 0.5989, + "step": 10117 + }, + { + "epoch": 5.652513966480447, + "grad_norm": 1.7935905456542969, + "learning_rate": 0.0007196078431372549, + "loss": 0.4067, + "step": 10118 + }, + { + "epoch": 5.653072625698324, + "grad_norm": 0.4951060712337494, + "learning_rate": 0.0007195798319327731, + "loss": 0.4529, + "step": 10119 + }, + { + "epoch": 5.653631284916202, + "grad_norm": 0.5029152631759644, + "learning_rate": 0.0007195518207282912, + "loss": 0.5307, + "step": 10120 + }, + { + "epoch": 5.654189944134078, + "grad_norm": 0.458808034658432, + "learning_rate": 0.0007195238095238095, + "loss": 0.4112, + "step": 10121 + }, + { + "epoch": 5.654748603351956, + "grad_norm": 0.7599425315856934, + "learning_rate": 0.0007194957983193278, + "loss": 0.4658, + "step": 10122 + }, + { + "epoch": 5.655307262569832, + "grad_norm": 0.5707579851150513, + "learning_rate": 0.000719467787114846, + "loss": 0.3847, + "step": 10123 + }, + { + "epoch": 5.65586592178771, + "grad_norm": 0.4810388684272766, + "learning_rate": 0.0007194397759103642, + "loss": 0.4089, + "step": 10124 + }, + { + "epoch": 5.656424581005586, + "grad_norm": 2.0035946369171143, + "learning_rate": 0.0007194117647058824, + "loss": 0.4846, + "step": 10125 + }, + { + "epoch": 5.656983240223464, + "grad_norm": 0.46774283051490784, + "learning_rate": 0.0007193837535014006, + "loss": 0.4436, + "step": 10126 + }, + { + "epoch": 5.657541899441341, + "grad_norm": 0.7552431225776672, + "learning_rate": 0.0007193557422969188, + "loss": 0.6333, + "step": 10127 + }, + { + "epoch": 5.658100558659218, + "grad_norm": 0.6337360143661499, + "learning_rate": 0.000719327731092437, + "loss": 0.4392, + "step": 10128 + }, + { + "epoch": 5.658659217877095, + "grad_norm": 0.37299537658691406, + "learning_rate": 0.0007192997198879552, + "loss": 0.3343, + "step": 10129 + }, + { + "epoch": 5.659217877094972, + "grad_norm": 1.113067626953125, + "learning_rate": 0.0007192717086834734, + "loss": 0.4078, + "step": 10130 + }, + { + "epoch": 5.659776536312849, + "grad_norm": 0.9046213030815125, + "learning_rate": 0.0007192436974789916, + "loss": 0.4936, + "step": 10131 + }, + { + "epoch": 5.660335195530727, + "grad_norm": 0.7275684475898743, + "learning_rate": 0.0007192156862745098, + "loss": 0.4217, + "step": 10132 + }, + { + "epoch": 5.660893854748603, + "grad_norm": 0.43270766735076904, + "learning_rate": 0.000719187675070028, + "loss": 0.4088, + "step": 10133 + }, + { + "epoch": 5.661452513966481, + "grad_norm": 0.6650369763374329, + "learning_rate": 0.0007191596638655462, + "loss": 0.4309, + "step": 10134 + }, + { + "epoch": 5.662011173184357, + "grad_norm": 0.6442579627037048, + "learning_rate": 0.0007191316526610644, + "loss": 0.6323, + "step": 10135 + }, + { + "epoch": 5.662569832402235, + "grad_norm": 0.5259701609611511, + "learning_rate": 0.0007191036414565826, + "loss": 0.3865, + "step": 10136 + }, + { + "epoch": 5.663128491620112, + "grad_norm": 0.4101817309856415, + "learning_rate": 0.0007190756302521008, + "loss": 0.343, + "step": 10137 + }, + { + "epoch": 5.663687150837989, + "grad_norm": 0.7517940998077393, + "learning_rate": 0.000719047619047619, + "loss": 0.567, + "step": 10138 + }, + { + "epoch": 5.664245810055866, + "grad_norm": 0.4668457508087158, + "learning_rate": 0.0007190196078431373, + "loss": 0.442, + "step": 10139 + }, + { + "epoch": 5.664804469273743, + "grad_norm": 0.8796895146369934, + "learning_rate": 0.0007189915966386555, + "loss": 0.5088, + "step": 10140 + }, + { + "epoch": 5.66536312849162, + "grad_norm": 0.9703786969184875, + "learning_rate": 0.0007189635854341738, + "loss": 0.5957, + "step": 10141 + }, + { + "epoch": 5.665921787709497, + "grad_norm": 0.5584157109260559, + "learning_rate": 0.0007189355742296919, + "loss": 0.6407, + "step": 10142 + }, + { + "epoch": 5.666480446927374, + "grad_norm": 0.5805408358573914, + "learning_rate": 0.0007189075630252101, + "loss": 0.4389, + "step": 10143 + }, + { + "epoch": 5.667039106145252, + "grad_norm": 0.4985531270503998, + "learning_rate": 0.0007188795518207283, + "loss": 0.4821, + "step": 10144 + }, + { + "epoch": 5.667597765363128, + "grad_norm": 0.4365057349205017, + "learning_rate": 0.0007188515406162465, + "loss": 0.4635, + "step": 10145 + }, + { + "epoch": 5.668156424581006, + "grad_norm": 0.517618715763092, + "learning_rate": 0.0007188235294117648, + "loss": 0.407, + "step": 10146 + }, + { + "epoch": 5.668715083798883, + "grad_norm": 0.9290892481803894, + "learning_rate": 0.0007187955182072829, + "loss": 0.7564, + "step": 10147 + }, + { + "epoch": 5.66927374301676, + "grad_norm": 0.5010323524475098, + "learning_rate": 0.0007187675070028011, + "loss": 0.4413, + "step": 10148 + }, + { + "epoch": 5.669832402234637, + "grad_norm": 0.4111177623271942, + "learning_rate": 0.0007187394957983193, + "loss": 0.3555, + "step": 10149 + }, + { + "epoch": 5.670391061452514, + "grad_norm": 0.5751590728759766, + "learning_rate": 0.0007187114845938375, + "loss": 0.4882, + "step": 10150 + }, + { + "epoch": 5.670949720670391, + "grad_norm": 0.6412973999977112, + "learning_rate": 0.0007186834733893558, + "loss": 0.4287, + "step": 10151 + }, + { + "epoch": 5.671508379888268, + "grad_norm": 0.7515106201171875, + "learning_rate": 0.0007186554621848739, + "loss": 0.4534, + "step": 10152 + }, + { + "epoch": 5.672067039106145, + "grad_norm": 0.514158308506012, + "learning_rate": 0.0007186274509803921, + "loss": 0.4138, + "step": 10153 + }, + { + "epoch": 5.672625698324023, + "grad_norm": 0.4929622411727905, + "learning_rate": 0.0007185994397759103, + "loss": 0.406, + "step": 10154 + }, + { + "epoch": 5.673184357541899, + "grad_norm": 0.7281866669654846, + "learning_rate": 0.0007185714285714286, + "loss": 0.596, + "step": 10155 + }, + { + "epoch": 5.673743016759777, + "grad_norm": 0.4281335771083832, + "learning_rate": 0.0007185434173669469, + "loss": 0.4713, + "step": 10156 + }, + { + "epoch": 5.674301675977654, + "grad_norm": 0.6203215718269348, + "learning_rate": 0.0007185154061624651, + "loss": 0.4433, + "step": 10157 + }, + { + "epoch": 5.674860335195531, + "grad_norm": 0.5056416392326355, + "learning_rate": 0.0007184873949579832, + "loss": 0.4264, + "step": 10158 + }, + { + "epoch": 5.675418994413408, + "grad_norm": 1.670349359512329, + "learning_rate": 0.0007184593837535014, + "loss": 0.4141, + "step": 10159 + }, + { + "epoch": 5.675977653631285, + "grad_norm": 0.6079891324043274, + "learning_rate": 0.0007184313725490196, + "loss": 0.4828, + "step": 10160 + }, + { + "epoch": 5.676536312849162, + "grad_norm": 1.1369798183441162, + "learning_rate": 0.0007184033613445379, + "loss": 0.4626, + "step": 10161 + }, + { + "epoch": 5.677094972067039, + "grad_norm": 0.42787474393844604, + "learning_rate": 0.0007183753501400561, + "loss": 0.6025, + "step": 10162 + }, + { + "epoch": 5.677653631284916, + "grad_norm": 0.39584946632385254, + "learning_rate": 0.0007183473389355742, + "loss": 0.4049, + "step": 10163 + }, + { + "epoch": 5.678212290502794, + "grad_norm": 0.7364213466644287, + "learning_rate": 0.0007183193277310924, + "loss": 0.5055, + "step": 10164 + }, + { + "epoch": 5.67877094972067, + "grad_norm": 0.4109117090702057, + "learning_rate": 0.0007182913165266106, + "loss": 0.3837, + "step": 10165 + }, + { + "epoch": 5.679329608938548, + "grad_norm": 0.42509889602661133, + "learning_rate": 0.0007182633053221289, + "loss": 0.4439, + "step": 10166 + }, + { + "epoch": 5.679888268156424, + "grad_norm": 0.6144108772277832, + "learning_rate": 0.0007182352941176471, + "loss": 0.429, + "step": 10167 + }, + { + "epoch": 5.680446927374302, + "grad_norm": 0.5343363881111145, + "learning_rate": 0.0007182072829131652, + "loss": 0.3778, + "step": 10168 + }, + { + "epoch": 5.681005586592179, + "grad_norm": 0.4703051745891571, + "learning_rate": 0.0007181792717086834, + "loss": 0.4238, + "step": 10169 + }, + { + "epoch": 5.681564245810056, + "grad_norm": 0.5371783971786499, + "learning_rate": 0.0007181512605042016, + "loss": 0.4451, + "step": 10170 + }, + { + "epoch": 5.682122905027933, + "grad_norm": 0.5565941333770752, + "learning_rate": 0.00071812324929972, + "loss": 0.4208, + "step": 10171 + }, + { + "epoch": 5.68268156424581, + "grad_norm": 2.1324219703674316, + "learning_rate": 0.0007180952380952382, + "loss": 0.4747, + "step": 10172 + }, + { + "epoch": 5.683240223463687, + "grad_norm": 0.3692035675048828, + "learning_rate": 0.0007180672268907564, + "loss": 0.456, + "step": 10173 + }, + { + "epoch": 5.683798882681565, + "grad_norm": 0.5666664838790894, + "learning_rate": 0.0007180392156862745, + "loss": 0.5193, + "step": 10174 + }, + { + "epoch": 5.684357541899441, + "grad_norm": 0.47725966572761536, + "learning_rate": 0.0007180112044817927, + "loss": 0.5569, + "step": 10175 + }, + { + "epoch": 5.684916201117319, + "grad_norm": 0.9071924686431885, + "learning_rate": 0.000717983193277311, + "loss": 0.4178, + "step": 10176 + }, + { + "epoch": 5.685474860335195, + "grad_norm": 0.4128570556640625, + "learning_rate": 0.0007179551820728292, + "loss": 0.4413, + "step": 10177 + }, + { + "epoch": 5.686033519553073, + "grad_norm": 0.6640491485595703, + "learning_rate": 0.0007179271708683474, + "loss": 0.4865, + "step": 10178 + }, + { + "epoch": 5.686592178770949, + "grad_norm": 0.5974729657173157, + "learning_rate": 0.0007178991596638655, + "loss": 0.5042, + "step": 10179 + }, + { + "epoch": 5.687150837988827, + "grad_norm": 0.5193836688995361, + "learning_rate": 0.0007178711484593837, + "loss": 0.5873, + "step": 10180 + }, + { + "epoch": 5.687709497206704, + "grad_norm": 0.43944063782691956, + "learning_rate": 0.000717843137254902, + "loss": 0.4738, + "step": 10181 + }, + { + "epoch": 5.688268156424581, + "grad_norm": 0.4762427508831024, + "learning_rate": 0.0007178151260504202, + "loss": 0.3782, + "step": 10182 + }, + { + "epoch": 5.688826815642458, + "grad_norm": 0.6396538019180298, + "learning_rate": 0.0007177871148459384, + "loss": 0.5364, + "step": 10183 + }, + { + "epoch": 5.689385474860336, + "grad_norm": 1.5707911252975464, + "learning_rate": 0.0007177591036414565, + "loss": 0.466, + "step": 10184 + }, + { + "epoch": 5.689944134078212, + "grad_norm": 0.521094799041748, + "learning_rate": 0.0007177310924369747, + "loss": 0.4169, + "step": 10185 + }, + { + "epoch": 5.69050279329609, + "grad_norm": 0.7004362344741821, + "learning_rate": 0.000717703081232493, + "loss": 0.4224, + "step": 10186 + }, + { + "epoch": 5.691061452513966, + "grad_norm": 0.5133131146430969, + "learning_rate": 0.0007176750700280113, + "loss": 0.4152, + "step": 10187 + }, + { + "epoch": 5.691620111731844, + "grad_norm": 1.3676213026046753, + "learning_rate": 0.0007176470588235295, + "loss": 0.4561, + "step": 10188 + }, + { + "epoch": 5.69217877094972, + "grad_norm": 0.6307364106178284, + "learning_rate": 0.0007176190476190477, + "loss": 0.3587, + "step": 10189 + }, + { + "epoch": 5.692737430167598, + "grad_norm": 0.4155840277671814, + "learning_rate": 0.0007175910364145658, + "loss": 0.3527, + "step": 10190 + }, + { + "epoch": 5.693296089385475, + "grad_norm": 0.3923536241054535, + "learning_rate": 0.0007175630252100841, + "loss": 0.3642, + "step": 10191 + }, + { + "epoch": 5.693854748603352, + "grad_norm": 0.47840192914009094, + "learning_rate": 0.0007175350140056023, + "loss": 0.3695, + "step": 10192 + }, + { + "epoch": 5.694413407821229, + "grad_norm": 0.44021400809288025, + "learning_rate": 0.0007175070028011205, + "loss": 0.4623, + "step": 10193 + }, + { + "epoch": 5.694972067039107, + "grad_norm": 0.620509922504425, + "learning_rate": 0.0007174789915966387, + "loss": 0.4494, + "step": 10194 + }, + { + "epoch": 5.695530726256983, + "grad_norm": 0.6490848064422607, + "learning_rate": 0.0007174509803921568, + "loss": 0.4601, + "step": 10195 + }, + { + "epoch": 5.696089385474861, + "grad_norm": 0.4066997170448303, + "learning_rate": 0.0007174229691876751, + "loss": 0.4243, + "step": 10196 + }, + { + "epoch": 5.696648044692737, + "grad_norm": 0.4810340404510498, + "learning_rate": 0.0007173949579831933, + "loss": 0.5408, + "step": 10197 + }, + { + "epoch": 5.697206703910615, + "grad_norm": 2.20889949798584, + "learning_rate": 0.0007173669467787115, + "loss": 0.4733, + "step": 10198 + }, + { + "epoch": 5.697765363128491, + "grad_norm": 0.3742657005786896, + "learning_rate": 0.0007173389355742297, + "loss": 0.3776, + "step": 10199 + }, + { + "epoch": 5.698324022346369, + "grad_norm": 3.9947080612182617, + "learning_rate": 0.0007173109243697478, + "loss": 0.5621, + "step": 10200 + }, + { + "epoch": 5.698882681564246, + "grad_norm": 0.5836235880851746, + "learning_rate": 0.0007172829131652661, + "loss": 0.4484, + "step": 10201 + }, + { + "epoch": 5.699441340782123, + "grad_norm": 0.49547767639160156, + "learning_rate": 0.0007172549019607843, + "loss": 0.4442, + "step": 10202 + }, + { + "epoch": 5.7, + "grad_norm": 0.4873258173465729, + "learning_rate": 0.0007172268907563025, + "loss": 0.5619, + "step": 10203 + }, + { + "epoch": 5.700558659217877, + "grad_norm": 0.465565949678421, + "learning_rate": 0.0007171988795518208, + "loss": 0.4779, + "step": 10204 + }, + { + "epoch": 5.701117318435754, + "grad_norm": 0.5477548837661743, + "learning_rate": 0.000717170868347339, + "loss": 0.5451, + "step": 10205 + }, + { + "epoch": 5.701675977653632, + "grad_norm": 0.48681530356407166, + "learning_rate": 0.0007171428571428572, + "loss": 0.5464, + "step": 10206 + }, + { + "epoch": 5.702234636871508, + "grad_norm": 0.4436895251274109, + "learning_rate": 0.0007171148459383754, + "loss": 0.3968, + "step": 10207 + }, + { + "epoch": 5.702793296089386, + "grad_norm": 4.854309558868408, + "learning_rate": 0.0007170868347338936, + "loss": 0.4396, + "step": 10208 + }, + { + "epoch": 5.703351955307262, + "grad_norm": 1.396460771560669, + "learning_rate": 0.0007170588235294118, + "loss": 0.4414, + "step": 10209 + }, + { + "epoch": 5.70391061452514, + "grad_norm": 0.5090497136116028, + "learning_rate": 0.00071703081232493, + "loss": 0.3962, + "step": 10210 + }, + { + "epoch": 5.704469273743017, + "grad_norm": 0.4225035011768341, + "learning_rate": 0.0007170028011204482, + "loss": 0.527, + "step": 10211 + }, + { + "epoch": 5.705027932960894, + "grad_norm": 0.40933382511138916, + "learning_rate": 0.0007169747899159664, + "loss": 0.3944, + "step": 10212 + }, + { + "epoch": 5.705586592178771, + "grad_norm": 0.38999760150909424, + "learning_rate": 0.0007169467787114846, + "loss": 0.4669, + "step": 10213 + }, + { + "epoch": 5.706145251396648, + "grad_norm": 0.6663619875907898, + "learning_rate": 0.0007169187675070028, + "loss": 0.5686, + "step": 10214 + }, + { + "epoch": 5.706703910614525, + "grad_norm": 0.38841113448143005, + "learning_rate": 0.000716890756302521, + "loss": 0.4206, + "step": 10215 + }, + { + "epoch": 5.707262569832402, + "grad_norm": 0.5985347628593445, + "learning_rate": 0.0007168627450980393, + "loss": 0.4112, + "step": 10216 + }, + { + "epoch": 5.707821229050279, + "grad_norm": 0.3735370934009552, + "learning_rate": 0.0007168347338935574, + "loss": 0.452, + "step": 10217 + }, + { + "epoch": 5.708379888268157, + "grad_norm": 0.5071160197257996, + "learning_rate": 0.0007168067226890756, + "loss": 0.3923, + "step": 10218 + }, + { + "epoch": 5.708938547486033, + "grad_norm": 0.45847848057746887, + "learning_rate": 0.0007167787114845938, + "loss": 0.4339, + "step": 10219 + }, + { + "epoch": 5.709497206703911, + "grad_norm": 0.9417409896850586, + "learning_rate": 0.000716750700280112, + "loss": 0.4745, + "step": 10220 + }, + { + "epoch": 5.710055865921788, + "grad_norm": 1.1794190406799316, + "learning_rate": 0.0007167226890756304, + "loss": 0.4921, + "step": 10221 + }, + { + "epoch": 5.710614525139665, + "grad_norm": 0.4215124845504761, + "learning_rate": 0.0007166946778711485, + "loss": 0.401, + "step": 10222 + }, + { + "epoch": 5.711173184357542, + "grad_norm": 0.5434643626213074, + "learning_rate": 0.0007166666666666667, + "loss": 0.6894, + "step": 10223 + }, + { + "epoch": 5.711731843575419, + "grad_norm": 0.49288007616996765, + "learning_rate": 0.0007166386554621849, + "loss": 0.4719, + "step": 10224 + }, + { + "epoch": 5.712290502793296, + "grad_norm": 13.83658218383789, + "learning_rate": 0.0007166106442577031, + "loss": 0.5467, + "step": 10225 + }, + { + "epoch": 5.712849162011173, + "grad_norm": 0.5512245297431946, + "learning_rate": 0.0007165826330532214, + "loss": 0.3874, + "step": 10226 + }, + { + "epoch": 5.71340782122905, + "grad_norm": 0.640646755695343, + "learning_rate": 0.0007165546218487395, + "loss": 0.3851, + "step": 10227 + }, + { + "epoch": 5.713966480446928, + "grad_norm": 1.979858160018921, + "learning_rate": 0.0007165266106442577, + "loss": 0.4239, + "step": 10228 + }, + { + "epoch": 5.714525139664804, + "grad_norm": 0.49379274249076843, + "learning_rate": 0.0007164985994397759, + "loss": 0.4977, + "step": 10229 + }, + { + "epoch": 5.715083798882682, + "grad_norm": 0.5241057276725769, + "learning_rate": 0.0007164705882352941, + "loss": 0.4931, + "step": 10230 + }, + { + "epoch": 5.715642458100559, + "grad_norm": 0.41967758536338806, + "learning_rate": 0.0007164425770308124, + "loss": 0.33, + "step": 10231 + }, + { + "epoch": 5.716201117318436, + "grad_norm": 0.4155007600784302, + "learning_rate": 0.0007164145658263306, + "loss": 0.4143, + "step": 10232 + }, + { + "epoch": 5.716759776536313, + "grad_norm": 0.48228001594543457, + "learning_rate": 0.0007163865546218487, + "loss": 0.4352, + "step": 10233 + }, + { + "epoch": 5.71731843575419, + "grad_norm": 0.6980542540550232, + "learning_rate": 0.0007163585434173669, + "loss": 0.4942, + "step": 10234 + }, + { + "epoch": 5.717877094972067, + "grad_norm": 1.4606268405914307, + "learning_rate": 0.0007163305322128851, + "loss": 0.6699, + "step": 10235 + }, + { + "epoch": 5.718435754189944, + "grad_norm": 0.5530686378479004, + "learning_rate": 0.0007163025210084035, + "loss": 0.4618, + "step": 10236 + }, + { + "epoch": 5.718994413407821, + "grad_norm": 0.5177992582321167, + "learning_rate": 0.0007162745098039217, + "loss": 0.4234, + "step": 10237 + }, + { + "epoch": 5.719553072625699, + "grad_norm": 0.7660933136940002, + "learning_rate": 0.0007162464985994398, + "loss": 0.4053, + "step": 10238 + }, + { + "epoch": 5.720111731843575, + "grad_norm": 1.2872079610824585, + "learning_rate": 0.000716218487394958, + "loss": 0.5579, + "step": 10239 + }, + { + "epoch": 5.720670391061453, + "grad_norm": 0.3990005552768707, + "learning_rate": 0.0007161904761904762, + "loss": 0.4447, + "step": 10240 + }, + { + "epoch": 5.721229050279329, + "grad_norm": 0.5466039180755615, + "learning_rate": 0.0007161624649859945, + "loss": 0.4488, + "step": 10241 + }, + { + "epoch": 5.721787709497207, + "grad_norm": 0.533645749092102, + "learning_rate": 0.0007161344537815127, + "loss": 0.4425, + "step": 10242 + }, + { + "epoch": 5.722346368715084, + "grad_norm": 1.3025493621826172, + "learning_rate": 0.0007161064425770308, + "loss": 0.4362, + "step": 10243 + }, + { + "epoch": 5.722905027932961, + "grad_norm": 0.45794904232025146, + "learning_rate": 0.000716078431372549, + "loss": 0.5088, + "step": 10244 + }, + { + "epoch": 5.723463687150838, + "grad_norm": 0.4998818635940552, + "learning_rate": 0.0007160504201680672, + "loss": 0.3459, + "step": 10245 + }, + { + "epoch": 5.724022346368715, + "grad_norm": 0.6522479057312012, + "learning_rate": 0.0007160224089635855, + "loss": 0.3285, + "step": 10246 + }, + { + "epoch": 5.724581005586592, + "grad_norm": 0.45060428977012634, + "learning_rate": 0.0007159943977591037, + "loss": 0.4389, + "step": 10247 + }, + { + "epoch": 5.72513966480447, + "grad_norm": 0.5087399482727051, + "learning_rate": 0.0007159663865546219, + "loss": 0.4732, + "step": 10248 + }, + { + "epoch": 5.725698324022346, + "grad_norm": 0.617437481880188, + "learning_rate": 0.00071593837535014, + "loss": 0.5119, + "step": 10249 + }, + { + "epoch": 5.726256983240224, + "grad_norm": 0.572313666343689, + "learning_rate": 0.0007159103641456582, + "loss": 0.4244, + "step": 10250 + }, + { + "epoch": 5.7268156424581, + "grad_norm": 0.44844773411750793, + "learning_rate": 0.0007158823529411765, + "loss": 0.4789, + "step": 10251 + }, + { + "epoch": 5.727374301675978, + "grad_norm": 0.4281400442123413, + "learning_rate": 0.0007158543417366947, + "loss": 0.4069, + "step": 10252 + }, + { + "epoch": 5.727932960893854, + "grad_norm": 0.6595506072044373, + "learning_rate": 0.000715826330532213, + "loss": 0.5112, + "step": 10253 + }, + { + "epoch": 5.728491620111732, + "grad_norm": 0.48393046855926514, + "learning_rate": 0.000715798319327731, + "loss": 0.4836, + "step": 10254 + }, + { + "epoch": 5.729050279329609, + "grad_norm": 0.7495980262756348, + "learning_rate": 0.0007157703081232493, + "loss": 0.4125, + "step": 10255 + }, + { + "epoch": 5.729608938547486, + "grad_norm": 0.4771103262901306, + "learning_rate": 0.0007157422969187676, + "loss": 0.3413, + "step": 10256 + }, + { + "epoch": 5.730167597765363, + "grad_norm": 0.7307485938072205, + "learning_rate": 0.0007157142857142858, + "loss": 0.402, + "step": 10257 + }, + { + "epoch": 5.730726256983241, + "grad_norm": 0.440533846616745, + "learning_rate": 0.000715686274509804, + "loss": 0.4158, + "step": 10258 + }, + { + "epoch": 5.731284916201117, + "grad_norm": 0.633655309677124, + "learning_rate": 0.0007156582633053221, + "loss": 0.5131, + "step": 10259 + }, + { + "epoch": 5.731843575418995, + "grad_norm": 0.6138968467712402, + "learning_rate": 0.0007156302521008403, + "loss": 0.4086, + "step": 10260 + }, + { + "epoch": 5.732402234636871, + "grad_norm": 0.5836642980575562, + "learning_rate": 0.0007156022408963585, + "loss": 0.5043, + "step": 10261 + }, + { + "epoch": 5.732960893854749, + "grad_norm": 1.9546562433242798, + "learning_rate": 0.0007155742296918768, + "loss": 0.4488, + "step": 10262 + }, + { + "epoch": 5.733519553072625, + "grad_norm": 0.876761794090271, + "learning_rate": 0.000715546218487395, + "loss": 0.6389, + "step": 10263 + }, + { + "epoch": 5.734078212290503, + "grad_norm": 0.7043126821517944, + "learning_rate": 0.0007155182072829132, + "loss": 0.5307, + "step": 10264 + }, + { + "epoch": 5.73463687150838, + "grad_norm": 0.7099644541740417, + "learning_rate": 0.0007154901960784313, + "loss": 0.449, + "step": 10265 + }, + { + "epoch": 5.735195530726257, + "grad_norm": 1.3028637170791626, + "learning_rate": 0.0007154621848739495, + "loss": 0.546, + "step": 10266 + }, + { + "epoch": 5.735754189944134, + "grad_norm": 0.4699321389198303, + "learning_rate": 0.0007154341736694678, + "loss": 0.4753, + "step": 10267 + }, + { + "epoch": 5.736312849162011, + "grad_norm": 0.6209031939506531, + "learning_rate": 0.000715406162464986, + "loss": 0.6278, + "step": 10268 + }, + { + "epoch": 5.736871508379888, + "grad_norm": 0.3387393057346344, + "learning_rate": 0.0007153781512605043, + "loss": 0.3813, + "step": 10269 + }, + { + "epoch": 5.737430167597766, + "grad_norm": 1.0591232776641846, + "learning_rate": 0.0007153501400560223, + "loss": 0.406, + "step": 10270 + }, + { + "epoch": 5.737988826815642, + "grad_norm": 0.8121812343597412, + "learning_rate": 0.0007153221288515406, + "loss": 0.5272, + "step": 10271 + }, + { + "epoch": 5.73854748603352, + "grad_norm": 0.6363793015480042, + "learning_rate": 0.0007152941176470589, + "loss": 0.4774, + "step": 10272 + }, + { + "epoch": 5.739106145251396, + "grad_norm": 0.4615616500377655, + "learning_rate": 0.0007152661064425771, + "loss": 0.3486, + "step": 10273 + }, + { + "epoch": 5.739664804469274, + "grad_norm": 0.5085633397102356, + "learning_rate": 0.0007152380952380953, + "loss": 0.5245, + "step": 10274 + }, + { + "epoch": 5.740223463687151, + "grad_norm": 0.4302261769771576, + "learning_rate": 0.0007152100840336134, + "loss": 0.4086, + "step": 10275 + }, + { + "epoch": 5.740782122905028, + "grad_norm": 0.5649064779281616, + "learning_rate": 0.0007151820728291316, + "loss": 0.5173, + "step": 10276 + }, + { + "epoch": 5.741340782122905, + "grad_norm": 0.800110936164856, + "learning_rate": 0.0007151540616246499, + "loss": 0.3889, + "step": 10277 + }, + { + "epoch": 5.741899441340782, + "grad_norm": 0.5093740820884705, + "learning_rate": 0.0007151260504201681, + "loss": 0.4682, + "step": 10278 + }, + { + "epoch": 5.742458100558659, + "grad_norm": 0.66042560338974, + "learning_rate": 0.0007150980392156863, + "loss": 0.5723, + "step": 10279 + }, + { + "epoch": 5.743016759776537, + "grad_norm": 0.46901175379753113, + "learning_rate": 0.0007150700280112045, + "loss": 0.4157, + "step": 10280 + }, + { + "epoch": 5.743575418994413, + "grad_norm": 0.7348647713661194, + "learning_rate": 0.0007150420168067226, + "loss": 0.4622, + "step": 10281 + }, + { + "epoch": 5.744134078212291, + "grad_norm": 0.6721197366714478, + "learning_rate": 0.0007150140056022409, + "loss": 0.3587, + "step": 10282 + }, + { + "epoch": 5.744692737430167, + "grad_norm": 0.502974271774292, + "learning_rate": 0.0007149859943977591, + "loss": 0.3856, + "step": 10283 + }, + { + "epoch": 5.745251396648045, + "grad_norm": 0.5577751398086548, + "learning_rate": 0.0007149579831932773, + "loss": 0.5753, + "step": 10284 + }, + { + "epoch": 5.745810055865922, + "grad_norm": 0.6245629191398621, + "learning_rate": 0.0007149299719887955, + "loss": 0.4758, + "step": 10285 + }, + { + "epoch": 5.746368715083799, + "grad_norm": 10.034965515136719, + "learning_rate": 0.0007149019607843136, + "loss": 0.4458, + "step": 10286 + }, + { + "epoch": 5.746927374301676, + "grad_norm": 0.38283371925354004, + "learning_rate": 0.000714873949579832, + "loss": 0.3838, + "step": 10287 + }, + { + "epoch": 5.747486033519553, + "grad_norm": 0.8203089237213135, + "learning_rate": 0.0007148459383753502, + "loss": 0.5509, + "step": 10288 + }, + { + "epoch": 5.74804469273743, + "grad_norm": 0.5299038290977478, + "learning_rate": 0.0007148179271708684, + "loss": 0.4389, + "step": 10289 + }, + { + "epoch": 5.748603351955307, + "grad_norm": 0.5275065302848816, + "learning_rate": 0.0007147899159663866, + "loss": 0.4486, + "step": 10290 + }, + { + "epoch": 5.749162011173184, + "grad_norm": 0.5049828290939331, + "learning_rate": 0.0007147619047619047, + "loss": 0.4602, + "step": 10291 + }, + { + "epoch": 5.749720670391062, + "grad_norm": 0.5503336787223816, + "learning_rate": 0.000714733893557423, + "loss": 0.4351, + "step": 10292 + }, + { + "epoch": 5.750279329608938, + "grad_norm": 0.772299587726593, + "learning_rate": 0.0007147058823529412, + "loss": 0.6089, + "step": 10293 + }, + { + "epoch": 5.750837988826816, + "grad_norm": 0.4710327684879303, + "learning_rate": 0.0007146778711484594, + "loss": 0.4321, + "step": 10294 + }, + { + "epoch": 5.751396648044693, + "grad_norm": 0.648944616317749, + "learning_rate": 0.0007146498599439776, + "loss": 0.5418, + "step": 10295 + }, + { + "epoch": 5.75195530726257, + "grad_norm": 0.7221190333366394, + "learning_rate": 0.0007146218487394958, + "loss": 0.4424, + "step": 10296 + }, + { + "epoch": 5.752513966480447, + "grad_norm": 1.8884402513504028, + "learning_rate": 0.000714593837535014, + "loss": 0.5027, + "step": 10297 + }, + { + "epoch": 5.753072625698324, + "grad_norm": 0.525856614112854, + "learning_rate": 0.0007145658263305322, + "loss": 0.5015, + "step": 10298 + }, + { + "epoch": 5.753631284916201, + "grad_norm": 0.48546022176742554, + "learning_rate": 0.0007145378151260504, + "loss": 0.4952, + "step": 10299 + }, + { + "epoch": 5.754189944134078, + "grad_norm": 0.5645453333854675, + "learning_rate": 0.0007145098039215686, + "loss": 0.5335, + "step": 10300 + }, + { + "epoch": 5.754748603351955, + "grad_norm": 0.5032395124435425, + "learning_rate": 0.0007144817927170868, + "loss": 0.4258, + "step": 10301 + }, + { + "epoch": 5.755307262569833, + "grad_norm": 0.47148433327674866, + "learning_rate": 0.000714453781512605, + "loss": 0.3793, + "step": 10302 + }, + { + "epoch": 5.755865921787709, + "grad_norm": 0.5534130930900574, + "learning_rate": 0.0007144257703081233, + "loss": 0.3904, + "step": 10303 + }, + { + "epoch": 5.756424581005587, + "grad_norm": 0.5768478512763977, + "learning_rate": 0.0007143977591036415, + "loss": 0.3728, + "step": 10304 + }, + { + "epoch": 5.756983240223463, + "grad_norm": 0.6742956042289734, + "learning_rate": 0.0007143697478991597, + "loss": 0.4512, + "step": 10305 + }, + { + "epoch": 5.757541899441341, + "grad_norm": 0.4233131408691406, + "learning_rate": 0.0007143417366946779, + "loss": 0.4657, + "step": 10306 + }, + { + "epoch": 5.758100558659218, + "grad_norm": 0.8073203563690186, + "learning_rate": 0.0007143137254901961, + "loss": 0.5316, + "step": 10307 + }, + { + "epoch": 5.758659217877095, + "grad_norm": 0.6402434706687927, + "learning_rate": 0.0007142857142857143, + "loss": 0.4549, + "step": 10308 + }, + { + "epoch": 5.759217877094972, + "grad_norm": 1.0459699630737305, + "learning_rate": 0.0007142577030812325, + "loss": 0.4441, + "step": 10309 + }, + { + "epoch": 5.759776536312849, + "grad_norm": 0.7883827090263367, + "learning_rate": 0.0007142296918767507, + "loss": 0.4036, + "step": 10310 + }, + { + "epoch": 5.760335195530726, + "grad_norm": 0.739895761013031, + "learning_rate": 0.0007142016806722689, + "loss": 0.4846, + "step": 10311 + }, + { + "epoch": 5.760893854748604, + "grad_norm": 0.4047786295413971, + "learning_rate": 0.0007141736694677872, + "loss": 0.4197, + "step": 10312 + }, + { + "epoch": 5.76145251396648, + "grad_norm": 0.7069640755653381, + "learning_rate": 0.0007141456582633053, + "loss": 0.4007, + "step": 10313 + }, + { + "epoch": 5.762011173184358, + "grad_norm": 0.49129244685173035, + "learning_rate": 0.0007141176470588235, + "loss": 0.5691, + "step": 10314 + }, + { + "epoch": 5.762569832402234, + "grad_norm": 0.5147718191146851, + "learning_rate": 0.0007140896358543417, + "loss": 0.4255, + "step": 10315 + }, + { + "epoch": 5.763128491620112, + "grad_norm": 0.45998597145080566, + "learning_rate": 0.0007140616246498599, + "loss": 0.3919, + "step": 10316 + }, + { + "epoch": 5.763687150837989, + "grad_norm": 0.4655545651912689, + "learning_rate": 0.0007140336134453782, + "loss": 0.3918, + "step": 10317 + }, + { + "epoch": 5.764245810055866, + "grad_norm": 0.7525028586387634, + "learning_rate": 0.0007140056022408963, + "loss": 0.6541, + "step": 10318 + }, + { + "epoch": 5.764804469273743, + "grad_norm": 0.6135449409484863, + "learning_rate": 0.0007139775910364146, + "loss": 0.4143, + "step": 10319 + }, + { + "epoch": 5.76536312849162, + "grad_norm": 0.3862467110157013, + "learning_rate": 0.0007139495798319328, + "loss": 0.4043, + "step": 10320 + }, + { + "epoch": 5.765921787709497, + "grad_norm": 0.4504019021987915, + "learning_rate": 0.000713921568627451, + "loss": 0.4101, + "step": 10321 + }, + { + "epoch": 5.766480446927375, + "grad_norm": 0.5463133454322815, + "learning_rate": 0.0007138935574229693, + "loss": 0.4725, + "step": 10322 + }, + { + "epoch": 5.767039106145251, + "grad_norm": 0.8920431137084961, + "learning_rate": 0.0007138655462184874, + "loss": 0.37, + "step": 10323 + }, + { + "epoch": 5.767597765363129, + "grad_norm": 0.5116588473320007, + "learning_rate": 0.0007138375350140056, + "loss": 0.3676, + "step": 10324 + }, + { + "epoch": 5.768156424581005, + "grad_norm": 0.5672568082809448, + "learning_rate": 0.0007138095238095238, + "loss": 0.46, + "step": 10325 + }, + { + "epoch": 5.768715083798883, + "grad_norm": 0.6564370393753052, + "learning_rate": 0.000713781512605042, + "loss": 0.4016, + "step": 10326 + }, + { + "epoch": 5.769273743016759, + "grad_norm": 0.5924671292304993, + "learning_rate": 0.0007137535014005603, + "loss": 0.444, + "step": 10327 + }, + { + "epoch": 5.769832402234637, + "grad_norm": 0.46051570773124695, + "learning_rate": 0.0007137254901960785, + "loss": 0.4421, + "step": 10328 + }, + { + "epoch": 5.770391061452514, + "grad_norm": 0.4963991343975067, + "learning_rate": 0.0007136974789915966, + "loss": 0.4837, + "step": 10329 + }, + { + "epoch": 5.770949720670391, + "grad_norm": 1.0286465883255005, + "learning_rate": 0.0007136694677871148, + "loss": 0.5905, + "step": 10330 + }, + { + "epoch": 5.771508379888268, + "grad_norm": 1.0372523069381714, + "learning_rate": 0.000713641456582633, + "loss": 0.566, + "step": 10331 + }, + { + "epoch": 5.772067039106146, + "grad_norm": 0.7292980551719666, + "learning_rate": 0.0007136134453781513, + "loss": 0.5482, + "step": 10332 + }, + { + "epoch": 5.772625698324022, + "grad_norm": 2.2000749111175537, + "learning_rate": 0.0007135854341736695, + "loss": 0.4456, + "step": 10333 + }, + { + "epoch": 5.7731843575419, + "grad_norm": 0.9369186162948608, + "learning_rate": 0.0007135574229691876, + "loss": 0.6232, + "step": 10334 + }, + { + "epoch": 5.773743016759776, + "grad_norm": 0.5925668478012085, + "learning_rate": 0.0007135294117647058, + "loss": 0.541, + "step": 10335 + }, + { + "epoch": 5.774301675977654, + "grad_norm": 0.6764436960220337, + "learning_rate": 0.000713501400560224, + "loss": 0.4441, + "step": 10336 + }, + { + "epoch": 5.77486033519553, + "grad_norm": 0.516507625579834, + "learning_rate": 0.0007134733893557424, + "loss": 0.4232, + "step": 10337 + }, + { + "epoch": 5.775418994413408, + "grad_norm": 1.0051759481430054, + "learning_rate": 0.0007134453781512606, + "loss": 0.3122, + "step": 10338 + }, + { + "epoch": 5.775977653631285, + "grad_norm": 0.986807107925415, + "learning_rate": 0.0007134173669467787, + "loss": 0.5684, + "step": 10339 + }, + { + "epoch": 5.776536312849162, + "grad_norm": 0.59564208984375, + "learning_rate": 0.0007133893557422969, + "loss": 0.4224, + "step": 10340 + }, + { + "epoch": 5.777094972067039, + "grad_norm": 0.7339884638786316, + "learning_rate": 0.0007133613445378151, + "loss": 0.5999, + "step": 10341 + }, + { + "epoch": 5.777653631284916, + "grad_norm": 0.6786849498748779, + "learning_rate": 0.0007133333333333334, + "loss": 0.4191, + "step": 10342 + }, + { + "epoch": 5.778212290502793, + "grad_norm": 0.47214388847351074, + "learning_rate": 0.0007133053221288516, + "loss": 0.4719, + "step": 10343 + }, + { + "epoch": 5.778770949720671, + "grad_norm": 0.5573878288269043, + "learning_rate": 0.0007132773109243698, + "loss": 0.3942, + "step": 10344 + }, + { + "epoch": 5.779329608938547, + "grad_norm": 0.5015531778335571, + "learning_rate": 0.0007132492997198879, + "loss": 0.3887, + "step": 10345 + }, + { + "epoch": 5.779888268156425, + "grad_norm": 0.656991183757782, + "learning_rate": 0.0007132212885154061, + "loss": 0.432, + "step": 10346 + }, + { + "epoch": 5.780446927374301, + "grad_norm": 0.4074009358882904, + "learning_rate": 0.0007131932773109244, + "loss": 0.4126, + "step": 10347 + }, + { + "epoch": 5.781005586592179, + "grad_norm": 0.40070173144340515, + "learning_rate": 0.0007131652661064426, + "loss": 0.3531, + "step": 10348 + }, + { + "epoch": 5.781564245810056, + "grad_norm": 1.1118261814117432, + "learning_rate": 0.0007131372549019608, + "loss": 0.3628, + "step": 10349 + }, + { + "epoch": 5.782122905027933, + "grad_norm": 0.4356982111930847, + "learning_rate": 0.0007131092436974789, + "loss": 0.4028, + "step": 10350 + }, + { + "epoch": 5.78268156424581, + "grad_norm": 0.5115057826042175, + "learning_rate": 0.0007130812324929971, + "loss": 0.4273, + "step": 10351 + }, + { + "epoch": 5.783240223463687, + "grad_norm": 0.7075468301773071, + "learning_rate": 0.0007130532212885155, + "loss": 0.4247, + "step": 10352 + }, + { + "epoch": 5.783798882681564, + "grad_norm": 0.48956945538520813, + "learning_rate": 0.0007130252100840337, + "loss": 0.5386, + "step": 10353 + }, + { + "epoch": 5.784357541899441, + "grad_norm": 0.6290279030799866, + "learning_rate": 0.0007129971988795519, + "loss": 0.4949, + "step": 10354 + }, + { + "epoch": 5.784916201117318, + "grad_norm": 0.3998968005180359, + "learning_rate": 0.00071296918767507, + "loss": 0.4515, + "step": 10355 + }, + { + "epoch": 5.785474860335196, + "grad_norm": 1.2403266429901123, + "learning_rate": 0.0007129411764705882, + "loss": 0.4137, + "step": 10356 + }, + { + "epoch": 5.786033519553072, + "grad_norm": 0.6960744857788086, + "learning_rate": 0.0007129131652661065, + "loss": 0.4211, + "step": 10357 + }, + { + "epoch": 5.78659217877095, + "grad_norm": 1.0042225122451782, + "learning_rate": 0.0007128851540616247, + "loss": 0.4041, + "step": 10358 + }, + { + "epoch": 5.787150837988827, + "grad_norm": 0.5344101190567017, + "learning_rate": 0.0007128571428571429, + "loss": 0.3921, + "step": 10359 + }, + { + "epoch": 5.787709497206704, + "grad_norm": 0.8131825923919678, + "learning_rate": 0.0007128291316526611, + "loss": 0.3912, + "step": 10360 + }, + { + "epoch": 5.788268156424581, + "grad_norm": 0.4721587300300598, + "learning_rate": 0.0007128011204481792, + "loss": 0.5389, + "step": 10361 + }, + { + "epoch": 5.788826815642458, + "grad_norm": 0.5934752821922302, + "learning_rate": 0.0007127731092436975, + "loss": 0.4513, + "step": 10362 + }, + { + "epoch": 5.789385474860335, + "grad_norm": 1.3433643579483032, + "learning_rate": 0.0007127450980392157, + "loss": 0.4972, + "step": 10363 + }, + { + "epoch": 5.789944134078212, + "grad_norm": 3.16454815864563, + "learning_rate": 0.0007127170868347339, + "loss": 0.3907, + "step": 10364 + }, + { + "epoch": 5.790502793296089, + "grad_norm": 0.5132924318313599, + "learning_rate": 0.0007126890756302521, + "loss": 0.6453, + "step": 10365 + }, + { + "epoch": 5.791061452513967, + "grad_norm": 0.6273893117904663, + "learning_rate": 0.0007126610644257702, + "loss": 0.4369, + "step": 10366 + }, + { + "epoch": 5.791620111731843, + "grad_norm": 0.5797558426856995, + "learning_rate": 0.0007126330532212885, + "loss": 0.5125, + "step": 10367 + }, + { + "epoch": 5.792178770949721, + "grad_norm": 0.5737980604171753, + "learning_rate": 0.0007126050420168068, + "loss": 0.3799, + "step": 10368 + }, + { + "epoch": 5.792737430167598, + "grad_norm": 0.613592803478241, + "learning_rate": 0.000712577030812325, + "loss": 0.51, + "step": 10369 + }, + { + "epoch": 5.793296089385475, + "grad_norm": 1.2404719591140747, + "learning_rate": 0.0007125490196078432, + "loss": 0.3916, + "step": 10370 + }, + { + "epoch": 5.793854748603352, + "grad_norm": 0.8153291940689087, + "learning_rate": 0.0007125210084033613, + "loss": 0.5167, + "step": 10371 + }, + { + "epoch": 5.794413407821229, + "grad_norm": 0.45598992705345154, + "learning_rate": 0.0007124929971988796, + "loss": 0.5011, + "step": 10372 + }, + { + "epoch": 5.794972067039106, + "grad_norm": 0.844488799571991, + "learning_rate": 0.0007124649859943978, + "loss": 0.4639, + "step": 10373 + }, + { + "epoch": 5.795530726256983, + "grad_norm": 0.5530940294265747, + "learning_rate": 0.000712436974789916, + "loss": 0.5155, + "step": 10374 + }, + { + "epoch": 5.79608938547486, + "grad_norm": 0.5928524732589722, + "learning_rate": 0.0007124089635854342, + "loss": 0.386, + "step": 10375 + }, + { + "epoch": 5.796648044692738, + "grad_norm": 0.4422083795070648, + "learning_rate": 0.0007123809523809524, + "loss": 0.4759, + "step": 10376 + }, + { + "epoch": 5.797206703910614, + "grad_norm": 0.41824138164520264, + "learning_rate": 0.0007123529411764706, + "loss": 0.386, + "step": 10377 + }, + { + "epoch": 5.797765363128492, + "grad_norm": 1.7007778882980347, + "learning_rate": 0.0007123249299719888, + "loss": 0.4932, + "step": 10378 + }, + { + "epoch": 5.798324022346368, + "grad_norm": 0.7457218170166016, + "learning_rate": 0.000712296918767507, + "loss": 0.4775, + "step": 10379 + }, + { + "epoch": 5.798882681564246, + "grad_norm": 0.7043110728263855, + "learning_rate": 0.0007122689075630252, + "loss": 0.4898, + "step": 10380 + }, + { + "epoch": 5.799441340782123, + "grad_norm": 0.5834572315216064, + "learning_rate": 0.0007122408963585434, + "loss": 0.5153, + "step": 10381 + }, + { + "epoch": 5.8, + "grad_norm": 0.771232545375824, + "learning_rate": 0.0007122128851540616, + "loss": 0.4092, + "step": 10382 + }, + { + "epoch": 5.800558659217877, + "grad_norm": 0.5131063461303711, + "learning_rate": 0.0007121848739495798, + "loss": 0.3896, + "step": 10383 + }, + { + "epoch": 5.801117318435754, + "grad_norm": 0.4704035222530365, + "learning_rate": 0.000712156862745098, + "loss": 0.3581, + "step": 10384 + }, + { + "epoch": 5.801675977653631, + "grad_norm": 3.205909013748169, + "learning_rate": 0.0007121288515406163, + "loss": 0.5036, + "step": 10385 + }, + { + "epoch": 5.802234636871509, + "grad_norm": 3.2709145545959473, + "learning_rate": 0.0007121008403361345, + "loss": 0.4595, + "step": 10386 + }, + { + "epoch": 5.802793296089385, + "grad_norm": 0.7111645936965942, + "learning_rate": 0.0007120728291316528, + "loss": 0.4382, + "step": 10387 + }, + { + "epoch": 5.803351955307263, + "grad_norm": 0.6028965711593628, + "learning_rate": 0.0007120448179271709, + "loss": 0.45, + "step": 10388 + }, + { + "epoch": 5.803910614525139, + "grad_norm": 0.5565794706344604, + "learning_rate": 0.0007120168067226891, + "loss": 0.336, + "step": 10389 + }, + { + "epoch": 5.804469273743017, + "grad_norm": 0.539331316947937, + "learning_rate": 0.0007119887955182073, + "loss": 0.4887, + "step": 10390 + }, + { + "epoch": 5.805027932960893, + "grad_norm": 0.5169692039489746, + "learning_rate": 0.0007119607843137255, + "loss": 0.5495, + "step": 10391 + }, + { + "epoch": 5.805586592178771, + "grad_norm": 0.4366172254085541, + "learning_rate": 0.0007119327731092438, + "loss": 0.4383, + "step": 10392 + }, + { + "epoch": 5.806145251396648, + "grad_norm": 0.9151692986488342, + "learning_rate": 0.0007119047619047619, + "loss": 0.6503, + "step": 10393 + }, + { + "epoch": 5.806703910614525, + "grad_norm": 0.6308380365371704, + "learning_rate": 0.0007118767507002801, + "loss": 0.3807, + "step": 10394 + }, + { + "epoch": 5.807262569832402, + "grad_norm": 0.6575770974159241, + "learning_rate": 0.0007118487394957983, + "loss": 0.4787, + "step": 10395 + }, + { + "epoch": 5.80782122905028, + "grad_norm": 0.6000781059265137, + "learning_rate": 0.0007118207282913165, + "loss": 0.4419, + "step": 10396 + }, + { + "epoch": 5.808379888268156, + "grad_norm": 0.6404375433921814, + "learning_rate": 0.0007117927170868348, + "loss": 0.4397, + "step": 10397 + }, + { + "epoch": 5.808938547486034, + "grad_norm": 0.4470246732234955, + "learning_rate": 0.0007117647058823529, + "loss": 0.5114, + "step": 10398 + }, + { + "epoch": 5.80949720670391, + "grad_norm": 0.6480503082275391, + "learning_rate": 0.0007117366946778711, + "loss": 0.3989, + "step": 10399 + }, + { + "epoch": 5.810055865921788, + "grad_norm": 0.5222627520561218, + "learning_rate": 0.0007117086834733893, + "loss": 0.4436, + "step": 10400 + }, + { + "epoch": 5.810614525139664, + "grad_norm": 0.4827338755130768, + "learning_rate": 0.0007116806722689076, + "loss": 0.3718, + "step": 10401 + }, + { + "epoch": 5.811173184357542, + "grad_norm": 0.5411868095397949, + "learning_rate": 0.0007116526610644259, + "loss": 0.5172, + "step": 10402 + }, + { + "epoch": 5.811731843575419, + "grad_norm": 0.4699190855026245, + "learning_rate": 0.0007116246498599441, + "loss": 0.3949, + "step": 10403 + }, + { + "epoch": 5.812290502793296, + "grad_norm": 2.8595128059387207, + "learning_rate": 0.0007115966386554622, + "loss": 0.457, + "step": 10404 + }, + { + "epoch": 5.812849162011173, + "grad_norm": 0.7540532946586609, + "learning_rate": 0.0007115686274509804, + "loss": 0.5276, + "step": 10405 + }, + { + "epoch": 5.813407821229051, + "grad_norm": 0.43818703293800354, + "learning_rate": 0.0007115406162464986, + "loss": 0.3988, + "step": 10406 + }, + { + "epoch": 5.813966480446927, + "grad_norm": 0.542113721370697, + "learning_rate": 0.0007115126050420169, + "loss": 0.4029, + "step": 10407 + }, + { + "epoch": 5.814525139664805, + "grad_norm": 2.103590488433838, + "learning_rate": 0.0007114845938375351, + "loss": 0.5161, + "step": 10408 + }, + { + "epoch": 5.815083798882681, + "grad_norm": 0.6302580237388611, + "learning_rate": 0.0007114565826330532, + "loss": 0.4646, + "step": 10409 + }, + { + "epoch": 5.815642458100559, + "grad_norm": 0.5420263409614563, + "learning_rate": 0.0007114285714285714, + "loss": 0.5032, + "step": 10410 + }, + { + "epoch": 5.816201117318435, + "grad_norm": 0.42350655794143677, + "learning_rate": 0.0007114005602240896, + "loss": 0.4667, + "step": 10411 + }, + { + "epoch": 5.816759776536313, + "grad_norm": 1.8075344562530518, + "learning_rate": 0.0007113725490196079, + "loss": 0.4886, + "step": 10412 + }, + { + "epoch": 5.81731843575419, + "grad_norm": 0.5061565637588501, + "learning_rate": 0.0007113445378151261, + "loss": 0.3747, + "step": 10413 + }, + { + "epoch": 5.817877094972067, + "grad_norm": 0.7426716089248657, + "learning_rate": 0.0007113165266106442, + "loss": 0.5273, + "step": 10414 + }, + { + "epoch": 5.818435754189944, + "grad_norm": 0.5342215299606323, + "learning_rate": 0.0007112885154061624, + "loss": 0.4341, + "step": 10415 + }, + { + "epoch": 5.818994413407821, + "grad_norm": 0.5137396454811096, + "learning_rate": 0.0007112605042016806, + "loss": 0.3784, + "step": 10416 + }, + { + "epoch": 5.819553072625698, + "grad_norm": 0.5618327260017395, + "learning_rate": 0.000711232492997199, + "loss": 0.3531, + "step": 10417 + }, + { + "epoch": 5.820111731843576, + "grad_norm": 0.4717462360858917, + "learning_rate": 0.0007112044817927172, + "loss": 0.6047, + "step": 10418 + }, + { + "epoch": 5.820670391061452, + "grad_norm": 0.48858439922332764, + "learning_rate": 0.0007111764705882354, + "loss": 0.4873, + "step": 10419 + }, + { + "epoch": 5.82122905027933, + "grad_norm": 0.6286826729774475, + "learning_rate": 0.0007111484593837535, + "loss": 0.4355, + "step": 10420 + }, + { + "epoch": 5.821787709497206, + "grad_norm": 0.5934100151062012, + "learning_rate": 0.0007111204481792717, + "loss": 0.3404, + "step": 10421 + }, + { + "epoch": 5.822346368715084, + "grad_norm": 0.6859205961227417, + "learning_rate": 0.00071109243697479, + "loss": 0.5466, + "step": 10422 + }, + { + "epoch": 5.822905027932961, + "grad_norm": 0.6118091344833374, + "learning_rate": 0.0007110644257703082, + "loss": 0.4698, + "step": 10423 + }, + { + "epoch": 5.823463687150838, + "grad_norm": 0.6592184901237488, + "learning_rate": 0.0007110364145658264, + "loss": 0.4413, + "step": 10424 + }, + { + "epoch": 5.824022346368715, + "grad_norm": 0.37606433033943176, + "learning_rate": 0.0007110084033613445, + "loss": 0.4124, + "step": 10425 + }, + { + "epoch": 5.824581005586592, + "grad_norm": 0.3604859709739685, + "learning_rate": 0.0007109803921568627, + "loss": 0.4038, + "step": 10426 + }, + { + "epoch": 5.825139664804469, + "grad_norm": 0.38896092772483826, + "learning_rate": 0.000710952380952381, + "loss": 0.3804, + "step": 10427 + }, + { + "epoch": 5.825698324022346, + "grad_norm": 0.6724205017089844, + "learning_rate": 0.0007109243697478992, + "loss": 0.4783, + "step": 10428 + }, + { + "epoch": 5.826256983240223, + "grad_norm": 4.228426456451416, + "learning_rate": 0.0007108963585434174, + "loss": 0.41, + "step": 10429 + }, + { + "epoch": 5.826815642458101, + "grad_norm": 0.6450451612472534, + "learning_rate": 0.0007108683473389355, + "loss": 0.5602, + "step": 10430 + }, + { + "epoch": 5.827374301675977, + "grad_norm": 0.7637776136398315, + "learning_rate": 0.0007108403361344537, + "loss": 0.5773, + "step": 10431 + }, + { + "epoch": 5.827932960893855, + "grad_norm": 0.5785620212554932, + "learning_rate": 0.000710812324929972, + "loss": 0.394, + "step": 10432 + }, + { + "epoch": 5.828491620111732, + "grad_norm": 0.38797181844711304, + "learning_rate": 0.0007107843137254903, + "loss": 0.356, + "step": 10433 + }, + { + "epoch": 5.829050279329609, + "grad_norm": 0.5849924683570862, + "learning_rate": 0.0007107563025210085, + "loss": 0.4449, + "step": 10434 + }, + { + "epoch": 5.829608938547486, + "grad_norm": 0.49999329447746277, + "learning_rate": 0.0007107282913165267, + "loss": 0.5016, + "step": 10435 + }, + { + "epoch": 5.830167597765363, + "grad_norm": 0.4862354099750519, + "learning_rate": 0.0007107002801120448, + "loss": 0.4636, + "step": 10436 + }, + { + "epoch": 5.83072625698324, + "grad_norm": 0.6120949983596802, + "learning_rate": 0.0007106722689075631, + "loss": 0.479, + "step": 10437 + }, + { + "epoch": 5.831284916201117, + "grad_norm": 0.4680287837982178, + "learning_rate": 0.0007106442577030813, + "loss": 0.3783, + "step": 10438 + }, + { + "epoch": 5.831843575418994, + "grad_norm": 0.4549527168273926, + "learning_rate": 0.0007106162464985995, + "loss": 0.3926, + "step": 10439 + }, + { + "epoch": 5.832402234636872, + "grad_norm": 0.5019168257713318, + "learning_rate": 0.0007105882352941177, + "loss": 0.434, + "step": 10440 + }, + { + "epoch": 5.832960893854748, + "grad_norm": 0.5315141677856445, + "learning_rate": 0.0007105602240896358, + "loss": 0.4836, + "step": 10441 + }, + { + "epoch": 5.833519553072626, + "grad_norm": 0.5233185887336731, + "learning_rate": 0.0007105322128851541, + "loss": 0.477, + "step": 10442 + }, + { + "epoch": 5.834078212290503, + "grad_norm": 0.4724504053592682, + "learning_rate": 0.0007105042016806723, + "loss": 0.3778, + "step": 10443 + }, + { + "epoch": 5.83463687150838, + "grad_norm": 0.6455507278442383, + "learning_rate": 0.0007104761904761905, + "loss": 0.554, + "step": 10444 + }, + { + "epoch": 5.835195530726257, + "grad_norm": 0.41713330149650574, + "learning_rate": 0.0007104481792717087, + "loss": 0.4065, + "step": 10445 + }, + { + "epoch": 5.835754189944134, + "grad_norm": 0.6680293083190918, + "learning_rate": 0.0007104201680672268, + "loss": 0.3554, + "step": 10446 + }, + { + "epoch": 5.836312849162011, + "grad_norm": 0.4157554507255554, + "learning_rate": 0.0007103921568627451, + "loss": 0.4236, + "step": 10447 + }, + { + "epoch": 5.836871508379888, + "grad_norm": 0.7821092009544373, + "learning_rate": 0.0007103641456582633, + "loss": 0.3547, + "step": 10448 + }, + { + "epoch": 5.837430167597765, + "grad_norm": 0.4302677512168884, + "learning_rate": 0.0007103361344537815, + "loss": 0.3936, + "step": 10449 + }, + { + "epoch": 5.837988826815643, + "grad_norm": 0.6865050792694092, + "learning_rate": 0.0007103081232492998, + "loss": 0.4549, + "step": 10450 + }, + { + "epoch": 5.838547486033519, + "grad_norm": 0.5388014912605286, + "learning_rate": 0.000710280112044818, + "loss": 0.4881, + "step": 10451 + }, + { + "epoch": 5.839106145251397, + "grad_norm": 0.49156346917152405, + "learning_rate": 0.0007102521008403362, + "loss": 0.4371, + "step": 10452 + }, + { + "epoch": 5.839664804469273, + "grad_norm": 0.49882760643959045, + "learning_rate": 0.0007102240896358544, + "loss": 0.3914, + "step": 10453 + }, + { + "epoch": 5.840223463687151, + "grad_norm": 0.501832127571106, + "learning_rate": 0.0007101960784313726, + "loss": 0.5015, + "step": 10454 + }, + { + "epoch": 5.840782122905028, + "grad_norm": 0.5040813088417053, + "learning_rate": 0.0007101680672268908, + "loss": 0.4468, + "step": 10455 + }, + { + "epoch": 5.841340782122905, + "grad_norm": 0.7020991444587708, + "learning_rate": 0.000710140056022409, + "loss": 0.4642, + "step": 10456 + }, + { + "epoch": 5.841899441340782, + "grad_norm": 0.7725340127944946, + "learning_rate": 0.0007101120448179272, + "loss": 0.3988, + "step": 10457 + }, + { + "epoch": 5.842458100558659, + "grad_norm": 0.5417423844337463, + "learning_rate": 0.0007100840336134454, + "loss": 0.4571, + "step": 10458 + }, + { + "epoch": 5.843016759776536, + "grad_norm": 0.7756684422492981, + "learning_rate": 0.0007100560224089636, + "loss": 0.6887, + "step": 10459 + }, + { + "epoch": 5.843575418994414, + "grad_norm": 0.873654305934906, + "learning_rate": 0.0007100280112044818, + "loss": 0.4264, + "step": 10460 + }, + { + "epoch": 5.84413407821229, + "grad_norm": 1.2273424863815308, + "learning_rate": 0.00071, + "loss": 0.4112, + "step": 10461 + }, + { + "epoch": 5.844692737430168, + "grad_norm": 0.868361234664917, + "learning_rate": 0.0007099719887955182, + "loss": 0.379, + "step": 10462 + }, + { + "epoch": 5.845251396648044, + "grad_norm": 1.1167317628860474, + "learning_rate": 0.0007099439775910364, + "loss": 0.4187, + "step": 10463 + }, + { + "epoch": 5.845810055865922, + "grad_norm": 0.42700713872909546, + "learning_rate": 0.0007099159663865546, + "loss": 0.3559, + "step": 10464 + }, + { + "epoch": 5.846368715083798, + "grad_norm": 0.6540542840957642, + "learning_rate": 0.0007098879551820728, + "loss": 0.4894, + "step": 10465 + }, + { + "epoch": 5.846927374301676, + "grad_norm": 17.374895095825195, + "learning_rate": 0.000709859943977591, + "loss": 0.498, + "step": 10466 + }, + { + "epoch": 5.847486033519553, + "grad_norm": 0.5259714722633362, + "learning_rate": 0.0007098319327731094, + "loss": 0.5107, + "step": 10467 + }, + { + "epoch": 5.84804469273743, + "grad_norm": 0.671551525592804, + "learning_rate": 0.0007098039215686275, + "loss": 0.4188, + "step": 10468 + }, + { + "epoch": 5.848603351955307, + "grad_norm": 0.7632198929786682, + "learning_rate": 0.0007097759103641457, + "loss": 0.4265, + "step": 10469 + }, + { + "epoch": 5.849162011173185, + "grad_norm": 0.6515686511993408, + "learning_rate": 0.0007097478991596639, + "loss": 0.4556, + "step": 10470 + }, + { + "epoch": 5.849720670391061, + "grad_norm": 0.4687560498714447, + "learning_rate": 0.0007097198879551821, + "loss": 0.414, + "step": 10471 + }, + { + "epoch": 5.850279329608939, + "grad_norm": 0.4172324538230896, + "learning_rate": 0.0007096918767507004, + "loss": 0.4027, + "step": 10472 + }, + { + "epoch": 5.850837988826815, + "grad_norm": 0.5436916947364807, + "learning_rate": 0.0007096638655462185, + "loss": 0.5866, + "step": 10473 + }, + { + "epoch": 5.851396648044693, + "grad_norm": 0.5017995834350586, + "learning_rate": 0.0007096358543417367, + "loss": 0.3561, + "step": 10474 + }, + { + "epoch": 5.851955307262569, + "grad_norm": 1.3895572423934937, + "learning_rate": 0.0007096078431372549, + "loss": 0.4267, + "step": 10475 + }, + { + "epoch": 5.852513966480447, + "grad_norm": 0.5709172487258911, + "learning_rate": 0.0007095798319327731, + "loss": 0.6533, + "step": 10476 + }, + { + "epoch": 5.853072625698324, + "grad_norm": 3.526177167892456, + "learning_rate": 0.0007095518207282914, + "loss": 0.4235, + "step": 10477 + }, + { + "epoch": 5.853631284916201, + "grad_norm": 0.6979643106460571, + "learning_rate": 0.0007095238095238095, + "loss": 0.5456, + "step": 10478 + }, + { + "epoch": 5.854189944134078, + "grad_norm": 0.4838812053203583, + "learning_rate": 0.0007094957983193277, + "loss": 0.4353, + "step": 10479 + }, + { + "epoch": 5.854748603351956, + "grad_norm": 0.48904502391815186, + "learning_rate": 0.0007094677871148459, + "loss": 0.4469, + "step": 10480 + }, + { + "epoch": 5.855307262569832, + "grad_norm": 0.5256216526031494, + "learning_rate": 0.0007094397759103641, + "loss": 0.4255, + "step": 10481 + }, + { + "epoch": 5.85586592178771, + "grad_norm": 1.4294583797454834, + "learning_rate": 0.0007094117647058825, + "loss": 0.6436, + "step": 10482 + }, + { + "epoch": 5.856424581005586, + "grad_norm": 0.6884363293647766, + "learning_rate": 0.0007093837535014007, + "loss": 0.5158, + "step": 10483 + }, + { + "epoch": 5.856983240223464, + "grad_norm": 0.3635435402393341, + "learning_rate": 0.0007093557422969188, + "loss": 0.3553, + "step": 10484 + }, + { + "epoch": 5.85754189944134, + "grad_norm": 0.49657881259918213, + "learning_rate": 0.000709327731092437, + "loss": 0.4269, + "step": 10485 + }, + { + "epoch": 5.858100558659218, + "grad_norm": 0.5020352005958557, + "learning_rate": 0.0007092997198879552, + "loss": 0.4535, + "step": 10486 + }, + { + "epoch": 5.858659217877095, + "grad_norm": 0.8311812877655029, + "learning_rate": 0.0007092717086834734, + "loss": 0.6703, + "step": 10487 + }, + { + "epoch": 5.859217877094972, + "grad_norm": 1.1023448705673218, + "learning_rate": 0.0007092436974789917, + "loss": 0.4366, + "step": 10488 + }, + { + "epoch": 5.859776536312849, + "grad_norm": 0.5711884498596191, + "learning_rate": 0.0007092156862745098, + "loss": 0.6715, + "step": 10489 + }, + { + "epoch": 5.860335195530726, + "grad_norm": 0.5218866467475891, + "learning_rate": 0.000709187675070028, + "loss": 0.4071, + "step": 10490 + }, + { + "epoch": 5.860893854748603, + "grad_norm": 0.5009090304374695, + "learning_rate": 0.0007091596638655462, + "loss": 0.3999, + "step": 10491 + }, + { + "epoch": 5.861452513966481, + "grad_norm": 0.5133971571922302, + "learning_rate": 0.0007091316526610644, + "loss": 0.4329, + "step": 10492 + }, + { + "epoch": 5.862011173184357, + "grad_norm": 0.48217669129371643, + "learning_rate": 0.0007091036414565827, + "loss": 0.5211, + "step": 10493 + }, + { + "epoch": 5.862569832402235, + "grad_norm": 0.7686821222305298, + "learning_rate": 0.0007090756302521008, + "loss": 0.4309, + "step": 10494 + }, + { + "epoch": 5.863128491620111, + "grad_norm": 1.671038031578064, + "learning_rate": 0.000709047619047619, + "loss": 0.5433, + "step": 10495 + }, + { + "epoch": 5.863687150837989, + "grad_norm": 0.4058610200881958, + "learning_rate": 0.0007090196078431372, + "loss": 0.3102, + "step": 10496 + }, + { + "epoch": 5.864245810055866, + "grad_norm": 0.5447606444358826, + "learning_rate": 0.0007089915966386554, + "loss": 0.4792, + "step": 10497 + }, + { + "epoch": 5.864804469273743, + "grad_norm": 0.4325959086418152, + "learning_rate": 0.0007089635854341737, + "loss": 0.4216, + "step": 10498 + }, + { + "epoch": 5.86536312849162, + "grad_norm": 0.34113359451293945, + "learning_rate": 0.000708935574229692, + "loss": 0.404, + "step": 10499 + }, + { + "epoch": 5.865921787709497, + "grad_norm": 0.44729116559028625, + "learning_rate": 0.00070890756302521, + "loss": 0.4309, + "step": 10500 + }, + { + "epoch": 5.865921787709497, + "eval_cer": 0.09162820638713408, + "eval_loss": 0.34694910049438477, + "eval_runtime": 55.6054, + "eval_samples_per_second": 81.611, + "eval_steps_per_second": 5.107, + "eval_wer": 0.36197312924219993, + "step": 10500 + }, + { + "epoch": 5.866480446927374, + "grad_norm": 0.9637112021446228, + "learning_rate": 0.0007088795518207283, + "loss": 0.4681, + "step": 10501 + }, + { + "epoch": 5.867039106145251, + "grad_norm": 0.5004937648773193, + "learning_rate": 0.0007088515406162465, + "loss": 0.4719, + "step": 10502 + }, + { + "epoch": 5.867597765363128, + "grad_norm": 0.4892960786819458, + "learning_rate": 0.0007088235294117648, + "loss": 0.435, + "step": 10503 + }, + { + "epoch": 5.868156424581006, + "grad_norm": 0.587005078792572, + "learning_rate": 0.000708795518207283, + "loss": 0.4385, + "step": 10504 + }, + { + "epoch": 5.868715083798882, + "grad_norm": 1.0113680362701416, + "learning_rate": 0.0007087675070028011, + "loss": 0.4313, + "step": 10505 + }, + { + "epoch": 5.86927374301676, + "grad_norm": 0.43034034967422485, + "learning_rate": 0.0007087394957983193, + "loss": 0.3942, + "step": 10506 + }, + { + "epoch": 5.869832402234637, + "grad_norm": 0.45791736245155334, + "learning_rate": 0.0007087114845938375, + "loss": 0.4069, + "step": 10507 + }, + { + "epoch": 5.870391061452514, + "grad_norm": 0.363300085067749, + "learning_rate": 0.0007086834733893558, + "loss": 0.4497, + "step": 10508 + }, + { + "epoch": 5.870949720670391, + "grad_norm": 0.5509997606277466, + "learning_rate": 0.000708655462184874, + "loss": 0.5023, + "step": 10509 + }, + { + "epoch": 5.871508379888268, + "grad_norm": 0.6267831921577454, + "learning_rate": 0.0007086274509803921, + "loss": 0.4743, + "step": 10510 + }, + { + "epoch": 5.872067039106145, + "grad_norm": 1.38927161693573, + "learning_rate": 0.0007085994397759103, + "loss": 0.4181, + "step": 10511 + }, + { + "epoch": 5.872625698324022, + "grad_norm": 0.6998752951622009, + "learning_rate": 0.0007085714285714285, + "loss": 0.5421, + "step": 10512 + }, + { + "epoch": 5.873184357541899, + "grad_norm": 0.4201306700706482, + "learning_rate": 0.0007085434173669468, + "loss": 0.4872, + "step": 10513 + }, + { + "epoch": 5.873743016759777, + "grad_norm": 0.6429771184921265, + "learning_rate": 0.000708515406162465, + "loss": 0.4769, + "step": 10514 + }, + { + "epoch": 5.874301675977653, + "grad_norm": 0.5104084014892578, + "learning_rate": 0.0007084873949579833, + "loss": 0.4284, + "step": 10515 + }, + { + "epoch": 5.874860335195531, + "grad_norm": 0.5287335515022278, + "learning_rate": 0.0007084593837535013, + "loss": 0.4826, + "step": 10516 + }, + { + "epoch": 5.875418994413408, + "grad_norm": 0.4161457121372223, + "learning_rate": 0.0007084313725490196, + "loss": 0.3787, + "step": 10517 + }, + { + "epoch": 5.875977653631285, + "grad_norm": 0.4690793454647064, + "learning_rate": 0.0007084033613445379, + "loss": 0.4553, + "step": 10518 + }, + { + "epoch": 5.876536312849162, + "grad_norm": 0.6330850720405579, + "learning_rate": 0.0007083753501400561, + "loss": 0.4217, + "step": 10519 + }, + { + "epoch": 5.877094972067039, + "grad_norm": 0.72749263048172, + "learning_rate": 0.0007083473389355743, + "loss": 0.3944, + "step": 10520 + }, + { + "epoch": 5.877653631284916, + "grad_norm": 0.5926832556724548, + "learning_rate": 0.0007083193277310924, + "loss": 0.4276, + "step": 10521 + }, + { + "epoch": 5.878212290502793, + "grad_norm": 0.9759839773178101, + "learning_rate": 0.0007082913165266106, + "loss": 0.5273, + "step": 10522 + }, + { + "epoch": 5.87877094972067, + "grad_norm": 0.5710582733154297, + "learning_rate": 0.0007082633053221289, + "loss": 0.4441, + "step": 10523 + }, + { + "epoch": 5.879329608938548, + "grad_norm": 0.5046635270118713, + "learning_rate": 0.0007082352941176471, + "loss": 0.3817, + "step": 10524 + }, + { + "epoch": 5.879888268156424, + "grad_norm": 0.6082225441932678, + "learning_rate": 0.0007082072829131653, + "loss": 0.4131, + "step": 10525 + }, + { + "epoch": 5.880446927374302, + "grad_norm": 0.6650238037109375, + "learning_rate": 0.0007081792717086834, + "loss": 0.4509, + "step": 10526 + }, + { + "epoch": 5.881005586592178, + "grad_norm": 0.7564213275909424, + "learning_rate": 0.0007081512605042016, + "loss": 0.4422, + "step": 10527 + }, + { + "epoch": 5.881564245810056, + "grad_norm": 0.376693993806839, + "learning_rate": 0.0007081232492997199, + "loss": 0.4314, + "step": 10528 + }, + { + "epoch": 5.882122905027933, + "grad_norm": 0.40023693442344666, + "learning_rate": 0.0007080952380952381, + "loss": 0.4076, + "step": 10529 + }, + { + "epoch": 5.88268156424581, + "grad_norm": 0.9997773766517639, + "learning_rate": 0.0007080672268907563, + "loss": 0.4007, + "step": 10530 + }, + { + "epoch": 5.883240223463687, + "grad_norm": 0.4893626570701599, + "learning_rate": 0.0007080392156862745, + "loss": 0.3829, + "step": 10531 + }, + { + "epoch": 5.883798882681564, + "grad_norm": 8.082022666931152, + "learning_rate": 0.0007080112044817926, + "loss": 0.3434, + "step": 10532 + }, + { + "epoch": 5.884357541899441, + "grad_norm": 0.5004198551177979, + "learning_rate": 0.000707983193277311, + "loss": 0.4379, + "step": 10533 + }, + { + "epoch": 5.884916201117319, + "grad_norm": 0.3681134283542633, + "learning_rate": 0.0007079551820728292, + "loss": 0.3594, + "step": 10534 + }, + { + "epoch": 5.885474860335195, + "grad_norm": 0.45535004138946533, + "learning_rate": 0.0007079271708683474, + "loss": 0.4661, + "step": 10535 + }, + { + "epoch": 5.886033519553073, + "grad_norm": 0.6571336388587952, + "learning_rate": 0.0007078991596638656, + "loss": 0.4049, + "step": 10536 + }, + { + "epoch": 5.886592178770949, + "grad_norm": 0.638870894908905, + "learning_rate": 0.0007078711484593837, + "loss": 0.4013, + "step": 10537 + }, + { + "epoch": 5.887150837988827, + "grad_norm": 0.6915528774261475, + "learning_rate": 0.000707843137254902, + "loss": 0.3957, + "step": 10538 + }, + { + "epoch": 5.8877094972067034, + "grad_norm": 0.6207171678543091, + "learning_rate": 0.0007078151260504202, + "loss": 0.4515, + "step": 10539 + }, + { + "epoch": 5.888268156424581, + "grad_norm": 0.5034327507019043, + "learning_rate": 0.0007077871148459384, + "loss": 0.398, + "step": 10540 + }, + { + "epoch": 5.888826815642458, + "grad_norm": 0.39464282989501953, + "learning_rate": 0.0007077591036414566, + "loss": 0.3175, + "step": 10541 + }, + { + "epoch": 5.889385474860335, + "grad_norm": 0.6110559105873108, + "learning_rate": 0.0007077310924369747, + "loss": 0.487, + "step": 10542 + }, + { + "epoch": 5.889944134078212, + "grad_norm": 0.48830339312553406, + "learning_rate": 0.000707703081232493, + "loss": 0.3654, + "step": 10543 + }, + { + "epoch": 5.89050279329609, + "grad_norm": 0.444409042596817, + "learning_rate": 0.0007076750700280112, + "loss": 0.4212, + "step": 10544 + }, + { + "epoch": 5.891061452513966, + "grad_norm": 0.5622066259384155, + "learning_rate": 0.0007076470588235294, + "loss": 0.3945, + "step": 10545 + }, + { + "epoch": 5.891620111731844, + "grad_norm": 0.4948543608188629, + "learning_rate": 0.0007076190476190476, + "loss": 0.5903, + "step": 10546 + }, + { + "epoch": 5.89217877094972, + "grad_norm": 1.0931496620178223, + "learning_rate": 0.0007075910364145658, + "loss": 0.4833, + "step": 10547 + }, + { + "epoch": 5.892737430167598, + "grad_norm": 0.5254516005516052, + "learning_rate": 0.000707563025210084, + "loss": 0.4919, + "step": 10548 + }, + { + "epoch": 5.8932960893854744, + "grad_norm": 0.6026060581207275, + "learning_rate": 0.0007075350140056023, + "loss": 0.5276, + "step": 10549 + }, + { + "epoch": 5.893854748603352, + "grad_norm": 0.47187286615371704, + "learning_rate": 0.0007075070028011205, + "loss": 0.4777, + "step": 10550 + }, + { + "epoch": 5.894413407821229, + "grad_norm": 0.5213013291358948, + "learning_rate": 0.0007074789915966387, + "loss": 0.5804, + "step": 10551 + }, + { + "epoch": 5.894972067039106, + "grad_norm": 1.7274495363235474, + "learning_rate": 0.0007074509803921569, + "loss": 0.466, + "step": 10552 + }, + { + "epoch": 5.895530726256983, + "grad_norm": 0.48574063181877136, + "learning_rate": 0.0007074229691876751, + "loss": 0.4384, + "step": 10553 + }, + { + "epoch": 5.896089385474861, + "grad_norm": 0.6204369068145752, + "learning_rate": 0.0007073949579831933, + "loss": 0.5534, + "step": 10554 + }, + { + "epoch": 5.896648044692737, + "grad_norm": 0.5332340598106384, + "learning_rate": 0.0007073669467787115, + "loss": 0.4587, + "step": 10555 + }, + { + "epoch": 5.897206703910615, + "grad_norm": 1.092002272605896, + "learning_rate": 0.0007073389355742297, + "loss": 0.4286, + "step": 10556 + }, + { + "epoch": 5.897765363128491, + "grad_norm": 0.4838496744632721, + "learning_rate": 0.0007073109243697479, + "loss": 0.4651, + "step": 10557 + }, + { + "epoch": 5.898324022346369, + "grad_norm": 0.49133217334747314, + "learning_rate": 0.0007072829131652661, + "loss": 0.4712, + "step": 10558 + }, + { + "epoch": 5.8988826815642454, + "grad_norm": 0.8116902709007263, + "learning_rate": 0.0007072549019607843, + "loss": 0.5069, + "step": 10559 + }, + { + "epoch": 5.899441340782123, + "grad_norm": 0.37905728816986084, + "learning_rate": 0.0007072268907563025, + "loss": 0.359, + "step": 10560 + }, + { + "epoch": 5.9, + "grad_norm": 0.8048175573348999, + "learning_rate": 0.0007071988795518207, + "loss": 0.5839, + "step": 10561 + }, + { + "epoch": 5.900558659217877, + "grad_norm": 0.6506947875022888, + "learning_rate": 0.0007071708683473389, + "loss": 0.4196, + "step": 10562 + }, + { + "epoch": 5.901117318435754, + "grad_norm": 0.6302304267883301, + "learning_rate": 0.0007071428571428572, + "loss": 0.5325, + "step": 10563 + }, + { + "epoch": 5.901675977653631, + "grad_norm": 0.44592854380607605, + "learning_rate": 0.0007071148459383753, + "loss": 0.3849, + "step": 10564 + }, + { + "epoch": 5.902234636871508, + "grad_norm": 1.670932650566101, + "learning_rate": 0.0007070868347338936, + "loss": 0.3905, + "step": 10565 + }, + { + "epoch": 5.902793296089386, + "grad_norm": 0.6100943088531494, + "learning_rate": 0.0007070588235294118, + "loss": 0.4002, + "step": 10566 + }, + { + "epoch": 5.903351955307262, + "grad_norm": 0.5980740189552307, + "learning_rate": 0.00070703081232493, + "loss": 0.4991, + "step": 10567 + }, + { + "epoch": 5.90391061452514, + "grad_norm": 1.6025962829589844, + "learning_rate": 0.0007070028011204483, + "loss": 0.4177, + "step": 10568 + }, + { + "epoch": 5.9044692737430164, + "grad_norm": 0.9667633771896362, + "learning_rate": 0.0007069747899159664, + "loss": 0.5212, + "step": 10569 + }, + { + "epoch": 5.905027932960894, + "grad_norm": 0.6661363244056702, + "learning_rate": 0.0007069467787114846, + "loss": 0.3631, + "step": 10570 + }, + { + "epoch": 5.905586592178771, + "grad_norm": 0.4226272404193878, + "learning_rate": 0.0007069187675070028, + "loss": 0.4953, + "step": 10571 + }, + { + "epoch": 5.906145251396648, + "grad_norm": 0.6326567530632019, + "learning_rate": 0.000706890756302521, + "loss": 0.4929, + "step": 10572 + }, + { + "epoch": 5.906703910614525, + "grad_norm": 0.6285231709480286, + "learning_rate": 0.0007068627450980393, + "loss": 0.4136, + "step": 10573 + }, + { + "epoch": 5.907262569832402, + "grad_norm": 0.5302049517631531, + "learning_rate": 0.0007068347338935574, + "loss": 0.4279, + "step": 10574 + }, + { + "epoch": 5.907821229050279, + "grad_norm": 1.0716586112976074, + "learning_rate": 0.0007068067226890756, + "loss": 0.6027, + "step": 10575 + }, + { + "epoch": 5.908379888268156, + "grad_norm": 0.6912524700164795, + "learning_rate": 0.0007067787114845938, + "loss": 0.6106, + "step": 10576 + }, + { + "epoch": 5.908938547486033, + "grad_norm": 0.7536414861679077, + "learning_rate": 0.000706750700280112, + "loss": 0.361, + "step": 10577 + }, + { + "epoch": 5.909497206703911, + "grad_norm": 0.4771345853805542, + "learning_rate": 0.0007067226890756303, + "loss": 0.4497, + "step": 10578 + }, + { + "epoch": 5.910055865921787, + "grad_norm": 0.4545881450176239, + "learning_rate": 0.0007066946778711485, + "loss": 0.3132, + "step": 10579 + }, + { + "epoch": 5.910614525139665, + "grad_norm": 0.6520172953605652, + "learning_rate": 0.0007066666666666666, + "loss": 0.7032, + "step": 10580 + }, + { + "epoch": 5.911173184357542, + "grad_norm": 1.0696566104888916, + "learning_rate": 0.0007066386554621848, + "loss": 0.4989, + "step": 10581 + }, + { + "epoch": 5.911731843575419, + "grad_norm": 0.4830806851387024, + "learning_rate": 0.000706610644257703, + "loss": 0.4242, + "step": 10582 + }, + { + "epoch": 5.912290502793296, + "grad_norm": 0.6830497980117798, + "learning_rate": 0.0007065826330532214, + "loss": 0.5273, + "step": 10583 + }, + { + "epoch": 5.912849162011173, + "grad_norm": 0.517987072467804, + "learning_rate": 0.0007065546218487396, + "loss": 0.362, + "step": 10584 + }, + { + "epoch": 5.91340782122905, + "grad_norm": 0.7807379364967346, + "learning_rate": 0.0007065266106442577, + "loss": 0.4447, + "step": 10585 + }, + { + "epoch": 5.913966480446927, + "grad_norm": 0.6242043375968933, + "learning_rate": 0.0007064985994397759, + "loss": 0.3775, + "step": 10586 + }, + { + "epoch": 5.914525139664804, + "grad_norm": 2.115849256515503, + "learning_rate": 0.0007064705882352941, + "loss": 0.4227, + "step": 10587 + }, + { + "epoch": 5.915083798882682, + "grad_norm": 0.45200830698013306, + "learning_rate": 0.0007064425770308124, + "loss": 0.404, + "step": 10588 + }, + { + "epoch": 5.915642458100558, + "grad_norm": 0.6316574215888977, + "learning_rate": 0.0007064145658263306, + "loss": 0.5078, + "step": 10589 + }, + { + "epoch": 5.916201117318436, + "grad_norm": 0.7201128602027893, + "learning_rate": 0.0007063865546218487, + "loss": 0.4984, + "step": 10590 + }, + { + "epoch": 5.9167597765363125, + "grad_norm": 0.49354565143585205, + "learning_rate": 0.0007063585434173669, + "loss": 0.3863, + "step": 10591 + }, + { + "epoch": 5.91731843575419, + "grad_norm": 0.7998040318489075, + "learning_rate": 0.0007063305322128851, + "loss": 0.5084, + "step": 10592 + }, + { + "epoch": 5.917877094972067, + "grad_norm": 0.5736560225486755, + "learning_rate": 0.0007063025210084034, + "loss": 0.4712, + "step": 10593 + }, + { + "epoch": 5.918435754189944, + "grad_norm": 0.630725622177124, + "learning_rate": 0.0007062745098039216, + "loss": 0.4383, + "step": 10594 + }, + { + "epoch": 5.918994413407821, + "grad_norm": 0.42156875133514404, + "learning_rate": 0.0007062464985994398, + "loss": 0.5056, + "step": 10595 + }, + { + "epoch": 5.919553072625698, + "grad_norm": 0.5405137538909912, + "learning_rate": 0.0007062184873949579, + "loss": 0.4783, + "step": 10596 + }, + { + "epoch": 5.920111731843575, + "grad_norm": 0.6225018501281738, + "learning_rate": 0.0007061904761904761, + "loss": 0.4109, + "step": 10597 + }, + { + "epoch": 5.920670391061453, + "grad_norm": 1.165069818496704, + "learning_rate": 0.0007061624649859945, + "loss": 0.5023, + "step": 10598 + }, + { + "epoch": 5.921229050279329, + "grad_norm": 0.5271239876747131, + "learning_rate": 0.0007061344537815127, + "loss": 0.4741, + "step": 10599 + }, + { + "epoch": 5.921787709497207, + "grad_norm": 0.4791272282600403, + "learning_rate": 0.0007061064425770309, + "loss": 0.4346, + "step": 10600 + }, + { + "epoch": 5.9223463687150835, + "grad_norm": 0.6669300198554993, + "learning_rate": 0.000706078431372549, + "loss": 0.3778, + "step": 10601 + }, + { + "epoch": 5.922905027932961, + "grad_norm": 0.601402997970581, + "learning_rate": 0.0007060504201680672, + "loss": 0.4113, + "step": 10602 + }, + { + "epoch": 5.923463687150838, + "grad_norm": 0.8014582395553589, + "learning_rate": 0.0007060224089635855, + "loss": 0.4926, + "step": 10603 + }, + { + "epoch": 5.924022346368715, + "grad_norm": 0.628642737865448, + "learning_rate": 0.0007059943977591037, + "loss": 0.467, + "step": 10604 + }, + { + "epoch": 5.924581005586592, + "grad_norm": 0.7804936766624451, + "learning_rate": 0.0007059663865546219, + "loss": 0.4549, + "step": 10605 + }, + { + "epoch": 5.925139664804469, + "grad_norm": 1.0483605861663818, + "learning_rate": 0.00070593837535014, + "loss": 0.445, + "step": 10606 + }, + { + "epoch": 5.925698324022346, + "grad_norm": 0.5775977373123169, + "learning_rate": 0.0007059103641456582, + "loss": 0.6146, + "step": 10607 + }, + { + "epoch": 5.926256983240224, + "grad_norm": 0.8711464405059814, + "learning_rate": 0.0007058823529411765, + "loss": 0.5122, + "step": 10608 + }, + { + "epoch": 5.9268156424581, + "grad_norm": 0.4983471632003784, + "learning_rate": 0.0007058543417366947, + "loss": 0.4494, + "step": 10609 + }, + { + "epoch": 5.927374301675978, + "grad_norm": 0.47444915771484375, + "learning_rate": 0.0007058263305322129, + "loss": 0.4098, + "step": 10610 + }, + { + "epoch": 5.9279329608938545, + "grad_norm": 0.5697492361068726, + "learning_rate": 0.0007057983193277311, + "loss": 0.4544, + "step": 10611 + }, + { + "epoch": 5.928491620111732, + "grad_norm": 0.6584872603416443, + "learning_rate": 0.0007057703081232492, + "loss": 0.5686, + "step": 10612 + }, + { + "epoch": 5.9290502793296085, + "grad_norm": 0.7022781372070312, + "learning_rate": 0.0007057422969187675, + "loss": 0.4933, + "step": 10613 + }, + { + "epoch": 5.929608938547486, + "grad_norm": 0.5768987536430359, + "learning_rate": 0.0007057142857142858, + "loss": 0.4003, + "step": 10614 + }, + { + "epoch": 5.930167597765363, + "grad_norm": 0.5249398946762085, + "learning_rate": 0.000705686274509804, + "loss": 0.4013, + "step": 10615 + }, + { + "epoch": 5.93072625698324, + "grad_norm": 0.5577698349952698, + "learning_rate": 0.0007056582633053222, + "loss": 0.3886, + "step": 10616 + }, + { + "epoch": 5.931284916201117, + "grad_norm": 0.5374637246131897, + "learning_rate": 0.0007056302521008403, + "loss": 0.4343, + "step": 10617 + }, + { + "epoch": 5.931843575418995, + "grad_norm": 0.8629102110862732, + "learning_rate": 0.0007056022408963586, + "loss": 0.3971, + "step": 10618 + }, + { + "epoch": 5.932402234636871, + "grad_norm": 0.5537786483764648, + "learning_rate": 0.0007055742296918768, + "loss": 0.407, + "step": 10619 + }, + { + "epoch": 5.932960893854749, + "grad_norm": 0.6504737734794617, + "learning_rate": 0.000705546218487395, + "loss": 0.3978, + "step": 10620 + }, + { + "epoch": 5.9335195530726255, + "grad_norm": 0.4973798990249634, + "learning_rate": 0.0007055182072829132, + "loss": 0.3837, + "step": 10621 + }, + { + "epoch": 5.934078212290503, + "grad_norm": 0.3515792787075043, + "learning_rate": 0.0007054901960784313, + "loss": 0.4248, + "step": 10622 + }, + { + "epoch": 5.9346368715083795, + "grad_norm": 0.5916847586631775, + "learning_rate": 0.0007054621848739496, + "loss": 0.5761, + "step": 10623 + }, + { + "epoch": 5.935195530726257, + "grad_norm": 0.3694547414779663, + "learning_rate": 0.0007054341736694678, + "loss": 0.4321, + "step": 10624 + }, + { + "epoch": 5.935754189944134, + "grad_norm": 0.6989243626594543, + "learning_rate": 0.000705406162464986, + "loss": 0.5094, + "step": 10625 + }, + { + "epoch": 5.936312849162011, + "grad_norm": 1.0088986158370972, + "learning_rate": 0.0007053781512605042, + "loss": 0.508, + "step": 10626 + }, + { + "epoch": 5.936871508379888, + "grad_norm": 0.8861255645751953, + "learning_rate": 0.0007053501400560224, + "loss": 0.5528, + "step": 10627 + }, + { + "epoch": 5.937430167597765, + "grad_norm": 0.5517739057540894, + "learning_rate": 0.0007053221288515406, + "loss": 0.5115, + "step": 10628 + }, + { + "epoch": 5.937988826815642, + "grad_norm": 1.1692125797271729, + "learning_rate": 0.0007052941176470588, + "loss": 0.583, + "step": 10629 + }, + { + "epoch": 5.93854748603352, + "grad_norm": 0.6481218338012695, + "learning_rate": 0.000705266106442577, + "loss": 0.3649, + "step": 10630 + }, + { + "epoch": 5.9391061452513965, + "grad_norm": 0.5833912491798401, + "learning_rate": 0.0007052380952380953, + "loss": 0.385, + "step": 10631 + }, + { + "epoch": 5.939664804469274, + "grad_norm": 0.689988374710083, + "learning_rate": 0.0007052100840336135, + "loss": 0.4179, + "step": 10632 + }, + { + "epoch": 5.9402234636871505, + "grad_norm": 0.7670750617980957, + "learning_rate": 0.0007051820728291317, + "loss": 0.4557, + "step": 10633 + }, + { + "epoch": 5.940782122905028, + "grad_norm": 2.537328004837036, + "learning_rate": 0.0007051540616246499, + "loss": 0.4419, + "step": 10634 + }, + { + "epoch": 5.941340782122905, + "grad_norm": 0.44420579075813293, + "learning_rate": 0.0007051260504201681, + "loss": 0.4102, + "step": 10635 + }, + { + "epoch": 5.941899441340782, + "grad_norm": 0.7864313125610352, + "learning_rate": 0.0007050980392156863, + "loss": 0.404, + "step": 10636 + }, + { + "epoch": 5.942458100558659, + "grad_norm": 1.1022429466247559, + "learning_rate": 0.0007050700280112045, + "loss": 0.6072, + "step": 10637 + }, + { + "epoch": 5.943016759776536, + "grad_norm": 0.7302351593971252, + "learning_rate": 0.0007050420168067228, + "loss": 0.5018, + "step": 10638 + }, + { + "epoch": 5.943575418994413, + "grad_norm": 0.415154367685318, + "learning_rate": 0.0007050140056022409, + "loss": 0.4589, + "step": 10639 + }, + { + "epoch": 5.94413407821229, + "grad_norm": 0.8116351962089539, + "learning_rate": 0.0007049859943977591, + "loss": 0.4651, + "step": 10640 + }, + { + "epoch": 5.9446927374301675, + "grad_norm": 1.8977586030960083, + "learning_rate": 0.0007049579831932773, + "loss": 0.4032, + "step": 10641 + }, + { + "epoch": 5.945251396648045, + "grad_norm": 0.49679845571517944, + "learning_rate": 0.0007049299719887955, + "loss": 0.3642, + "step": 10642 + }, + { + "epoch": 5.9458100558659215, + "grad_norm": 0.7046087384223938, + "learning_rate": 0.0007049019607843138, + "loss": 0.4383, + "step": 10643 + }, + { + "epoch": 5.946368715083799, + "grad_norm": 0.4634682834148407, + "learning_rate": 0.0007048739495798319, + "loss": 0.4783, + "step": 10644 + }, + { + "epoch": 5.946927374301676, + "grad_norm": 0.5060982704162598, + "learning_rate": 0.0007048459383753501, + "loss": 0.4279, + "step": 10645 + }, + { + "epoch": 5.947486033519553, + "grad_norm": 1.9865999221801758, + "learning_rate": 0.0007048179271708683, + "loss": 0.4345, + "step": 10646 + }, + { + "epoch": 5.94804469273743, + "grad_norm": 0.7619455456733704, + "learning_rate": 0.0007047899159663866, + "loss": 0.4927, + "step": 10647 + }, + { + "epoch": 5.948603351955307, + "grad_norm": 0.5044295191764832, + "learning_rate": 0.0007047619047619049, + "loss": 0.4822, + "step": 10648 + }, + { + "epoch": 5.949162011173184, + "grad_norm": 1.1890429258346558, + "learning_rate": 0.000704733893557423, + "loss": 0.4739, + "step": 10649 + }, + { + "epoch": 5.949720670391061, + "grad_norm": 0.47186458110809326, + "learning_rate": 0.0007047058823529412, + "loss": 0.4479, + "step": 10650 + }, + { + "epoch": 5.9502793296089385, + "grad_norm": 0.4976537525653839, + "learning_rate": 0.0007046778711484594, + "loss": 0.3963, + "step": 10651 + }, + { + "epoch": 5.950837988826816, + "grad_norm": 0.3979592025279999, + "learning_rate": 0.0007046498599439776, + "loss": 0.4367, + "step": 10652 + }, + { + "epoch": 5.9513966480446925, + "grad_norm": 1.8060015439987183, + "learning_rate": 0.0007046218487394959, + "loss": 0.4222, + "step": 10653 + }, + { + "epoch": 5.95195530726257, + "grad_norm": 0.5210453867912292, + "learning_rate": 0.0007045938375350141, + "loss": 0.5088, + "step": 10654 + }, + { + "epoch": 5.952513966480447, + "grad_norm": 0.6186861395835876, + "learning_rate": 0.0007045658263305322, + "loss": 0.5537, + "step": 10655 + }, + { + "epoch": 5.953072625698324, + "grad_norm": 0.5123553276062012, + "learning_rate": 0.0007045378151260504, + "loss": 0.45, + "step": 10656 + }, + { + "epoch": 5.953631284916201, + "grad_norm": 0.5243229269981384, + "learning_rate": 0.0007045098039215686, + "loss": 0.4015, + "step": 10657 + }, + { + "epoch": 5.954189944134078, + "grad_norm": 0.47566747665405273, + "learning_rate": 0.0007044817927170869, + "loss": 0.4194, + "step": 10658 + }, + { + "epoch": 5.954748603351955, + "grad_norm": 1.1357111930847168, + "learning_rate": 0.0007044537815126051, + "loss": 0.4013, + "step": 10659 + }, + { + "epoch": 5.955307262569832, + "grad_norm": 0.4731360971927643, + "learning_rate": 0.0007044257703081232, + "loss": 0.492, + "step": 10660 + }, + { + "epoch": 5.9558659217877095, + "grad_norm": 0.6241510510444641, + "learning_rate": 0.0007043977591036414, + "loss": 0.3763, + "step": 10661 + }, + { + "epoch": 5.956424581005587, + "grad_norm": 0.4301688075065613, + "learning_rate": 0.0007043697478991596, + "loss": 0.2866, + "step": 10662 + }, + { + "epoch": 5.9569832402234635, + "grad_norm": 0.5356236100196838, + "learning_rate": 0.000704341736694678, + "loss": 0.4482, + "step": 10663 + }, + { + "epoch": 5.957541899441341, + "grad_norm": 0.5268016457557678, + "learning_rate": 0.0007043137254901962, + "loss": 0.4882, + "step": 10664 + }, + { + "epoch": 5.9581005586592175, + "grad_norm": 0.5201060771942139, + "learning_rate": 0.0007042857142857143, + "loss": 0.578, + "step": 10665 + }, + { + "epoch": 5.958659217877095, + "grad_norm": 0.6463531851768494, + "learning_rate": 0.0007042577030812325, + "loss": 0.4167, + "step": 10666 + }, + { + "epoch": 5.959217877094972, + "grad_norm": 0.45140498876571655, + "learning_rate": 0.0007042296918767507, + "loss": 0.3724, + "step": 10667 + }, + { + "epoch": 5.959776536312849, + "grad_norm": 0.5019129514694214, + "learning_rate": 0.000704201680672269, + "loss": 0.4632, + "step": 10668 + }, + { + "epoch": 5.960335195530726, + "grad_norm": 0.5050307512283325, + "learning_rate": 0.0007041736694677872, + "loss": 0.4926, + "step": 10669 + }, + { + "epoch": 5.960893854748603, + "grad_norm": 0.47213369607925415, + "learning_rate": 0.0007041456582633054, + "loss": 0.4789, + "step": 10670 + }, + { + "epoch": 5.9614525139664805, + "grad_norm": 0.7370476722717285, + "learning_rate": 0.0007041176470588235, + "loss": 0.679, + "step": 10671 + }, + { + "epoch": 5.962011173184358, + "grad_norm": 0.5161084532737732, + "learning_rate": 0.0007040896358543417, + "loss": 0.5845, + "step": 10672 + }, + { + "epoch": 5.9625698324022345, + "grad_norm": 0.43537983298301697, + "learning_rate": 0.00070406162464986, + "loss": 0.4057, + "step": 10673 + }, + { + "epoch": 5.963128491620112, + "grad_norm": 0.5349956750869751, + "learning_rate": 0.0007040336134453782, + "loss": 0.4299, + "step": 10674 + }, + { + "epoch": 5.9636871508379885, + "grad_norm": 0.4862646758556366, + "learning_rate": 0.0007040056022408964, + "loss": 0.4585, + "step": 10675 + }, + { + "epoch": 5.964245810055866, + "grad_norm": 0.40399888157844543, + "learning_rate": 0.0007039775910364145, + "loss": 0.4013, + "step": 10676 + }, + { + "epoch": 5.9648044692737425, + "grad_norm": 0.9501025676727295, + "learning_rate": 0.0007039495798319327, + "loss": 0.5028, + "step": 10677 + }, + { + "epoch": 5.96536312849162, + "grad_norm": 0.5965065956115723, + "learning_rate": 0.000703921568627451, + "loss": 0.6575, + "step": 10678 + }, + { + "epoch": 5.965921787709497, + "grad_norm": 0.542474627494812, + "learning_rate": 0.0007038935574229693, + "loss": 0.4879, + "step": 10679 + }, + { + "epoch": 5.966480446927374, + "grad_norm": 0.7965006828308105, + "learning_rate": 0.0007038655462184875, + "loss": 0.5095, + "step": 10680 + }, + { + "epoch": 5.9670391061452515, + "grad_norm": 0.49771595001220703, + "learning_rate": 0.0007038375350140056, + "loss": 0.4371, + "step": 10681 + }, + { + "epoch": 5.967597765363129, + "grad_norm": 0.38680100440979004, + "learning_rate": 0.0007038095238095238, + "loss": 0.3783, + "step": 10682 + }, + { + "epoch": 5.9681564245810055, + "grad_norm": 0.483853280544281, + "learning_rate": 0.0007037815126050421, + "loss": 0.4089, + "step": 10683 + }, + { + "epoch": 5.968715083798883, + "grad_norm": 0.557817816734314, + "learning_rate": 0.0007037535014005603, + "loss": 0.5088, + "step": 10684 + }, + { + "epoch": 5.9692737430167595, + "grad_norm": 0.42193880677223206, + "learning_rate": 0.0007037254901960785, + "loss": 0.3933, + "step": 10685 + }, + { + "epoch": 5.969832402234637, + "grad_norm": 0.5682546496391296, + "learning_rate": 0.0007036974789915967, + "loss": 0.4603, + "step": 10686 + }, + { + "epoch": 5.9703910614525135, + "grad_norm": 0.8125534057617188, + "learning_rate": 0.0007036694677871148, + "loss": 0.426, + "step": 10687 + }, + { + "epoch": 5.970949720670391, + "grad_norm": 0.5613397359848022, + "learning_rate": 0.0007036414565826331, + "loss": 0.4178, + "step": 10688 + }, + { + "epoch": 5.971508379888268, + "grad_norm": 0.7704717516899109, + "learning_rate": 0.0007036134453781513, + "loss": 0.3796, + "step": 10689 + }, + { + "epoch": 5.972067039106145, + "grad_norm": 0.7329124212265015, + "learning_rate": 0.0007035854341736695, + "loss": 0.5127, + "step": 10690 + }, + { + "epoch": 5.9726256983240225, + "grad_norm": 1.2618038654327393, + "learning_rate": 0.0007035574229691877, + "loss": 0.5323, + "step": 10691 + }, + { + "epoch": 5.9731843575419, + "grad_norm": 0.48770755529403687, + "learning_rate": 0.0007035294117647058, + "loss": 0.3667, + "step": 10692 + }, + { + "epoch": 5.9737430167597765, + "grad_norm": 0.8446019887924194, + "learning_rate": 0.0007035014005602241, + "loss": 0.5731, + "step": 10693 + }, + { + "epoch": 5.974301675977654, + "grad_norm": 0.46063557267189026, + "learning_rate": 0.0007034733893557423, + "loss": 0.4143, + "step": 10694 + }, + { + "epoch": 5.9748603351955305, + "grad_norm": 0.48112040758132935, + "learning_rate": 0.0007034453781512605, + "loss": 0.3198, + "step": 10695 + }, + { + "epoch": 5.975418994413408, + "grad_norm": 0.5702455639839172, + "learning_rate": 0.0007034173669467788, + "loss": 0.3204, + "step": 10696 + }, + { + "epoch": 5.9759776536312845, + "grad_norm": 0.4615860879421234, + "learning_rate": 0.0007033893557422969, + "loss": 0.433, + "step": 10697 + }, + { + "epoch": 5.976536312849162, + "grad_norm": 0.44421282410621643, + "learning_rate": 0.0007033613445378152, + "loss": 0.438, + "step": 10698 + }, + { + "epoch": 5.977094972067039, + "grad_norm": 0.6527910232543945, + "learning_rate": 0.0007033333333333334, + "loss": 0.4898, + "step": 10699 + }, + { + "epoch": 5.977653631284916, + "grad_norm": 0.4210759401321411, + "learning_rate": 0.0007033053221288516, + "loss": 0.4204, + "step": 10700 + }, + { + "epoch": 5.9782122905027935, + "grad_norm": 0.8129958510398865, + "learning_rate": 0.0007032773109243698, + "loss": 0.3917, + "step": 10701 + }, + { + "epoch": 5.97877094972067, + "grad_norm": 0.6258031129837036, + "learning_rate": 0.000703249299719888, + "loss": 0.4117, + "step": 10702 + }, + { + "epoch": 5.9793296089385475, + "grad_norm": 0.5234305262565613, + "learning_rate": 0.0007032212885154062, + "loss": 0.5158, + "step": 10703 + }, + { + "epoch": 5.979888268156425, + "grad_norm": 1.7617279291152954, + "learning_rate": 0.0007031932773109244, + "loss": 0.4604, + "step": 10704 + }, + { + "epoch": 5.9804469273743015, + "grad_norm": 0.35483261942863464, + "learning_rate": 0.0007031652661064426, + "loss": 0.3561, + "step": 10705 + }, + { + "epoch": 5.981005586592179, + "grad_norm": 0.8235047459602356, + "learning_rate": 0.0007031372549019608, + "loss": 0.3557, + "step": 10706 + }, + { + "epoch": 5.9815642458100555, + "grad_norm": 1.5603936910629272, + "learning_rate": 0.000703109243697479, + "loss": 0.3923, + "step": 10707 + }, + { + "epoch": 5.982122905027933, + "grad_norm": 0.5309181213378906, + "learning_rate": 0.0007030812324929971, + "loss": 0.3921, + "step": 10708 + }, + { + "epoch": 5.98268156424581, + "grad_norm": 0.5465649962425232, + "learning_rate": 0.0007030532212885154, + "loss": 0.4679, + "step": 10709 + }, + { + "epoch": 5.983240223463687, + "grad_norm": 0.686987042427063, + "learning_rate": 0.0007030252100840336, + "loss": 0.4157, + "step": 10710 + }, + { + "epoch": 5.9837988826815645, + "grad_norm": 0.49509668350219727, + "learning_rate": 0.0007029971988795518, + "loss": 0.4808, + "step": 10711 + }, + { + "epoch": 5.984357541899441, + "grad_norm": 0.5534369349479675, + "learning_rate": 0.00070296918767507, + "loss": 0.419, + "step": 10712 + }, + { + "epoch": 5.9849162011173185, + "grad_norm": 0.5116916298866272, + "learning_rate": 0.0007029411764705881, + "loss": 0.5181, + "step": 10713 + }, + { + "epoch": 5.985474860335195, + "grad_norm": 0.4299747347831726, + "learning_rate": 0.0007029131652661065, + "loss": 0.4109, + "step": 10714 + }, + { + "epoch": 5.9860335195530725, + "grad_norm": 0.5178081393241882, + "learning_rate": 0.0007028851540616247, + "loss": 0.5312, + "step": 10715 + }, + { + "epoch": 5.98659217877095, + "grad_norm": 0.7596054673194885, + "learning_rate": 0.0007028571428571429, + "loss": 0.3994, + "step": 10716 + }, + { + "epoch": 5.9871508379888265, + "grad_norm": 0.618139922618866, + "learning_rate": 0.0007028291316526611, + "loss": 0.553, + "step": 10717 + }, + { + "epoch": 5.987709497206704, + "grad_norm": 0.9482153654098511, + "learning_rate": 0.0007028011204481793, + "loss": 0.4184, + "step": 10718 + }, + { + "epoch": 5.988268156424581, + "grad_norm": 0.5059841871261597, + "learning_rate": 0.0007027731092436975, + "loss": 0.3966, + "step": 10719 + }, + { + "epoch": 5.988826815642458, + "grad_norm": 0.5375123023986816, + "learning_rate": 0.0007027450980392157, + "loss": 0.4351, + "step": 10720 + }, + { + "epoch": 5.9893854748603355, + "grad_norm": 1.5509437322616577, + "learning_rate": 0.0007027170868347339, + "loss": 0.577, + "step": 10721 + }, + { + "epoch": 5.989944134078212, + "grad_norm": 0.3913935422897339, + "learning_rate": 0.0007026890756302521, + "loss": 0.4034, + "step": 10722 + }, + { + "epoch": 5.9905027932960895, + "grad_norm": 0.5882735848426819, + "learning_rate": 0.0007026610644257703, + "loss": 0.553, + "step": 10723 + }, + { + "epoch": 5.991061452513966, + "grad_norm": 0.49759504199028015, + "learning_rate": 0.0007026330532212885, + "loss": 0.4367, + "step": 10724 + }, + { + "epoch": 5.9916201117318435, + "grad_norm": 0.5933146476745605, + "learning_rate": 0.0007026050420168067, + "loss": 0.501, + "step": 10725 + }, + { + "epoch": 5.992178770949721, + "grad_norm": 0.8480409979820251, + "learning_rate": 0.0007025770308123249, + "loss": 0.5142, + "step": 10726 + }, + { + "epoch": 5.9927374301675975, + "grad_norm": 0.5606933236122131, + "learning_rate": 0.0007025490196078431, + "loss": 0.4981, + "step": 10727 + }, + { + "epoch": 5.993296089385475, + "grad_norm": 0.3889835476875305, + "learning_rate": 0.0007025210084033613, + "loss": 0.5629, + "step": 10728 + }, + { + "epoch": 5.993854748603352, + "grad_norm": 0.4942566156387329, + "learning_rate": 0.0007024929971988796, + "loss": 0.3911, + "step": 10729 + }, + { + "epoch": 5.994413407821229, + "grad_norm": 1.4093338251113892, + "learning_rate": 0.0007024649859943978, + "loss": 0.4624, + "step": 10730 + }, + { + "epoch": 5.9949720670391065, + "grad_norm": 1.139153242111206, + "learning_rate": 0.000702436974789916, + "loss": 0.4055, + "step": 10731 + }, + { + "epoch": 5.995530726256983, + "grad_norm": 0.45949316024780273, + "learning_rate": 0.0007024089635854342, + "loss": 0.435, + "step": 10732 + }, + { + "epoch": 5.9960893854748605, + "grad_norm": 0.3905138373374939, + "learning_rate": 0.0007023809523809524, + "loss": 0.4219, + "step": 10733 + }, + { + "epoch": 5.996648044692737, + "grad_norm": 1.0105937719345093, + "learning_rate": 0.0007023529411764707, + "loss": 0.5904, + "step": 10734 + }, + { + "epoch": 5.9972067039106145, + "grad_norm": 0.5596426725387573, + "learning_rate": 0.0007023249299719888, + "loss": 0.3982, + "step": 10735 + }, + { + "epoch": 5.997765363128492, + "grad_norm": 0.38583436608314514, + "learning_rate": 0.000702296918767507, + "loss": 0.4682, + "step": 10736 + }, + { + "epoch": 5.9983240223463685, + "grad_norm": 0.5878229737281799, + "learning_rate": 0.0007022689075630252, + "loss": 0.5023, + "step": 10737 + }, + { + "epoch": 5.998882681564246, + "grad_norm": 0.7048560976982117, + "learning_rate": 0.0007022408963585434, + "loss": 0.3977, + "step": 10738 + }, + { + "epoch": 5.9994413407821225, + "grad_norm": 0.632676362991333, + "learning_rate": 0.0007022128851540617, + "loss": 0.38, + "step": 10739 + }, + { + "epoch": 6.0, + "grad_norm": 1.2492765188217163, + "learning_rate": 0.0007021848739495798, + "loss": 0.4723, + "step": 10740 + }, + { + "epoch": 6.0005586592178775, + "grad_norm": 0.6041464805603027, + "learning_rate": 0.000702156862745098, + "loss": 0.4666, + "step": 10741 + }, + { + "epoch": 6.001117318435754, + "grad_norm": 0.7570438981056213, + "learning_rate": 0.0007021288515406162, + "loss": 0.4116, + "step": 10742 + }, + { + "epoch": 6.0016759776536315, + "grad_norm": 0.5255514979362488, + "learning_rate": 0.0007021008403361344, + "loss": 0.3727, + "step": 10743 + }, + { + "epoch": 6.002234636871508, + "grad_norm": 0.6134583353996277, + "learning_rate": 0.0007020728291316527, + "loss": 0.4397, + "step": 10744 + }, + { + "epoch": 6.0027932960893855, + "grad_norm": 2.010389804840088, + "learning_rate": 0.0007020448179271708, + "loss": 0.4007, + "step": 10745 + }, + { + "epoch": 6.003351955307263, + "grad_norm": 0.5612832307815552, + "learning_rate": 0.000702016806722689, + "loss": 0.4107, + "step": 10746 + }, + { + "epoch": 6.0039106145251395, + "grad_norm": 0.46385201811790466, + "learning_rate": 0.0007019887955182073, + "loss": 0.4381, + "step": 10747 + }, + { + "epoch": 6.004469273743017, + "grad_norm": 0.5772256255149841, + "learning_rate": 0.0007019607843137255, + "loss": 0.4458, + "step": 10748 + }, + { + "epoch": 6.0050279329608935, + "grad_norm": 0.3665994107723236, + "learning_rate": 0.0007019327731092438, + "loss": 0.3647, + "step": 10749 + }, + { + "epoch": 6.005586592178771, + "grad_norm": 0.7996562719345093, + "learning_rate": 0.000701904761904762, + "loss": 0.4073, + "step": 10750 + }, + { + "epoch": 6.0061452513966485, + "grad_norm": 0.46867483854293823, + "learning_rate": 0.0007018767507002801, + "loss": 0.3554, + "step": 10751 + }, + { + "epoch": 6.006703910614525, + "grad_norm": 0.4355800747871399, + "learning_rate": 0.0007018487394957983, + "loss": 0.481, + "step": 10752 + }, + { + "epoch": 6.0072625698324025, + "grad_norm": 0.5530795454978943, + "learning_rate": 0.0007018207282913165, + "loss": 0.4939, + "step": 10753 + }, + { + "epoch": 6.007821229050279, + "grad_norm": 0.4148414134979248, + "learning_rate": 0.0007017927170868348, + "loss": 0.4056, + "step": 10754 + }, + { + "epoch": 6.0083798882681565, + "grad_norm": 0.5292692184448242, + "learning_rate": 0.000701764705882353, + "loss": 0.4066, + "step": 10755 + }, + { + "epoch": 6.008938547486034, + "grad_norm": 0.6804555058479309, + "learning_rate": 0.0007017366946778711, + "loss": 0.4545, + "step": 10756 + }, + { + "epoch": 6.0094972067039105, + "grad_norm": 0.6187072396278381, + "learning_rate": 0.0007017086834733893, + "loss": 0.5057, + "step": 10757 + }, + { + "epoch": 6.010055865921788, + "grad_norm": 0.45158496499061584, + "learning_rate": 0.0007016806722689075, + "loss": 0.5135, + "step": 10758 + }, + { + "epoch": 6.0106145251396645, + "grad_norm": 0.5854185223579407, + "learning_rate": 0.0007016526610644258, + "loss": 0.3839, + "step": 10759 + }, + { + "epoch": 6.011173184357542, + "grad_norm": 0.4598930776119232, + "learning_rate": 0.000701624649859944, + "loss": 0.4112, + "step": 10760 + }, + { + "epoch": 6.011731843575419, + "grad_norm": 0.3839957118034363, + "learning_rate": 0.0007015966386554621, + "loss": 0.3706, + "step": 10761 + }, + { + "epoch": 6.012290502793296, + "grad_norm": 0.4976672828197479, + "learning_rate": 0.0007015686274509803, + "loss": 0.5029, + "step": 10762 + }, + { + "epoch": 6.0128491620111735, + "grad_norm": 0.4182429015636444, + "learning_rate": 0.0007015406162464986, + "loss": 0.3773, + "step": 10763 + }, + { + "epoch": 6.01340782122905, + "grad_norm": 0.9692990183830261, + "learning_rate": 0.0007015126050420169, + "loss": 0.4158, + "step": 10764 + }, + { + "epoch": 6.0139664804469275, + "grad_norm": 0.4656253159046173, + "learning_rate": 0.0007014845938375351, + "loss": 0.4477, + "step": 10765 + }, + { + "epoch": 6.014525139664804, + "grad_norm": 0.6307067275047302, + "learning_rate": 0.0007014565826330533, + "loss": 0.4127, + "step": 10766 + }, + { + "epoch": 6.0150837988826815, + "grad_norm": 1.356160044670105, + "learning_rate": 0.0007014285714285714, + "loss": 0.5075, + "step": 10767 + }, + { + "epoch": 6.015642458100559, + "grad_norm": 0.4145680367946625, + "learning_rate": 0.0007014005602240896, + "loss": 0.362, + "step": 10768 + }, + { + "epoch": 6.0162011173184355, + "grad_norm": 0.747823178768158, + "learning_rate": 0.0007013725490196079, + "loss": 0.3907, + "step": 10769 + }, + { + "epoch": 6.016759776536313, + "grad_norm": 0.43720245361328125, + "learning_rate": 0.0007013445378151261, + "loss": 0.3736, + "step": 10770 + }, + { + "epoch": 6.01731843575419, + "grad_norm": 1.323954701423645, + "learning_rate": 0.0007013165266106443, + "loss": 0.4856, + "step": 10771 + }, + { + "epoch": 6.017877094972067, + "grad_norm": 0.528076708316803, + "learning_rate": 0.0007012885154061624, + "loss": 0.5145, + "step": 10772 + }, + { + "epoch": 6.0184357541899445, + "grad_norm": 0.5131105780601501, + "learning_rate": 0.0007012605042016806, + "loss": 0.4855, + "step": 10773 + }, + { + "epoch": 6.018994413407821, + "grad_norm": 0.4567725658416748, + "learning_rate": 0.0007012324929971989, + "loss": 0.4204, + "step": 10774 + }, + { + "epoch": 6.0195530726256985, + "grad_norm": 0.9652224183082581, + "learning_rate": 0.0007012044817927171, + "loss": 0.4426, + "step": 10775 + }, + { + "epoch": 6.020111731843575, + "grad_norm": 0.5839270949363708, + "learning_rate": 0.0007011764705882353, + "loss": 0.3688, + "step": 10776 + }, + { + "epoch": 6.0206703910614525, + "grad_norm": 0.8496503233909607, + "learning_rate": 0.0007011484593837534, + "loss": 0.6914, + "step": 10777 + }, + { + "epoch": 6.02122905027933, + "grad_norm": 0.548754096031189, + "learning_rate": 0.0007011204481792716, + "loss": 0.5181, + "step": 10778 + }, + { + "epoch": 6.0217877094972065, + "grad_norm": 0.6896282434463501, + "learning_rate": 0.00070109243697479, + "loss": 0.5319, + "step": 10779 + }, + { + "epoch": 6.022346368715084, + "grad_norm": 0.540732741355896, + "learning_rate": 0.0007010644257703082, + "loss": 0.4831, + "step": 10780 + }, + { + "epoch": 6.022905027932961, + "grad_norm": 0.5916521549224854, + "learning_rate": 0.0007010364145658264, + "loss": 0.433, + "step": 10781 + }, + { + "epoch": 6.023463687150838, + "grad_norm": 0.8262721300125122, + "learning_rate": 0.0007010084033613446, + "loss": 0.4857, + "step": 10782 + }, + { + "epoch": 6.0240223463687155, + "grad_norm": 0.6633471250534058, + "learning_rate": 0.0007009803921568627, + "loss": 0.4667, + "step": 10783 + }, + { + "epoch": 6.024581005586592, + "grad_norm": 0.9709869027137756, + "learning_rate": 0.000700952380952381, + "loss": 0.4724, + "step": 10784 + }, + { + "epoch": 6.0251396648044695, + "grad_norm": 3.0102927684783936, + "learning_rate": 0.0007009243697478992, + "loss": 0.6534, + "step": 10785 + }, + { + "epoch": 6.025698324022346, + "grad_norm": 7.023932456970215, + "learning_rate": 0.0007008963585434174, + "loss": 0.3773, + "step": 10786 + }, + { + "epoch": 6.0262569832402235, + "grad_norm": 0.6988338232040405, + "learning_rate": 0.0007008683473389356, + "loss": 0.3827, + "step": 10787 + }, + { + "epoch": 6.026815642458101, + "grad_norm": 0.8771420121192932, + "learning_rate": 0.0007008403361344537, + "loss": 0.5022, + "step": 10788 + }, + { + "epoch": 6.0273743016759775, + "grad_norm": 0.6200162768363953, + "learning_rate": 0.000700812324929972, + "loss": 0.4222, + "step": 10789 + }, + { + "epoch": 6.027932960893855, + "grad_norm": 1.523466944694519, + "learning_rate": 0.0007007843137254902, + "loss": 0.4948, + "step": 10790 + }, + { + "epoch": 6.028491620111732, + "grad_norm": 0.6968160271644592, + "learning_rate": 0.0007007563025210084, + "loss": 0.433, + "step": 10791 + }, + { + "epoch": 6.029050279329609, + "grad_norm": 2.646639585494995, + "learning_rate": 0.0007007282913165266, + "loss": 0.467, + "step": 10792 + }, + { + "epoch": 6.0296089385474865, + "grad_norm": 0.5532181262969971, + "learning_rate": 0.0007007002801120447, + "loss": 0.4283, + "step": 10793 + }, + { + "epoch": 6.030167597765363, + "grad_norm": 0.5752729177474976, + "learning_rate": 0.000700672268907563, + "loss": 0.3218, + "step": 10794 + }, + { + "epoch": 6.0307262569832405, + "grad_norm": 0.7599282264709473, + "learning_rate": 0.0007006442577030813, + "loss": 0.5184, + "step": 10795 + }, + { + "epoch": 6.031284916201117, + "grad_norm": 1.3925663232803345, + "learning_rate": 0.0007006162464985995, + "loss": 0.4733, + "step": 10796 + }, + { + "epoch": 6.0318435754189945, + "grad_norm": 0.5994170904159546, + "learning_rate": 0.0007005882352941177, + "loss": 0.4751, + "step": 10797 + }, + { + "epoch": 6.032402234636871, + "grad_norm": 0.5804548263549805, + "learning_rate": 0.0007005602240896359, + "loss": 0.4027, + "step": 10798 + }, + { + "epoch": 6.0329608938547485, + "grad_norm": 0.6561393737792969, + "learning_rate": 0.0007005322128851541, + "loss": 0.4743, + "step": 10799 + }, + { + "epoch": 6.033519553072626, + "grad_norm": 0.5753200650215149, + "learning_rate": 0.0007005042016806723, + "loss": 0.5643, + "step": 10800 + }, + { + "epoch": 6.034078212290503, + "grad_norm": 0.6040725111961365, + "learning_rate": 0.0007004761904761905, + "loss": 0.4895, + "step": 10801 + }, + { + "epoch": 6.03463687150838, + "grad_norm": 0.7829612493515015, + "learning_rate": 0.0007004481792717087, + "loss": 0.3975, + "step": 10802 + }, + { + "epoch": 6.035195530726257, + "grad_norm": 0.8545989990234375, + "learning_rate": 0.0007004201680672269, + "loss": 0.4989, + "step": 10803 + }, + { + "epoch": 6.035754189944134, + "grad_norm": 0.4869842529296875, + "learning_rate": 0.0007003921568627451, + "loss": 0.4032, + "step": 10804 + }, + { + "epoch": 6.0363128491620115, + "grad_norm": 0.8219355344772339, + "learning_rate": 0.0007003641456582633, + "loss": 0.5121, + "step": 10805 + }, + { + "epoch": 6.036871508379888, + "grad_norm": 1.1885137557983398, + "learning_rate": 0.0007003361344537815, + "loss": 0.6034, + "step": 10806 + }, + { + "epoch": 6.0374301675977655, + "grad_norm": 1.8642171621322632, + "learning_rate": 0.0007003081232492997, + "loss": 0.416, + "step": 10807 + }, + { + "epoch": 6.037988826815642, + "grad_norm": 0.5085421204566956, + "learning_rate": 0.0007002801120448179, + "loss": 0.4397, + "step": 10808 + }, + { + "epoch": 6.0385474860335195, + "grad_norm": 0.5510767102241516, + "learning_rate": 0.0007002521008403361, + "loss": 0.4573, + "step": 10809 + }, + { + "epoch": 6.039106145251397, + "grad_norm": 0.45946282148361206, + "learning_rate": 0.0007002240896358543, + "loss": 0.3685, + "step": 10810 + }, + { + "epoch": 6.039664804469274, + "grad_norm": 0.6027918457984924, + "learning_rate": 0.0007001960784313726, + "loss": 0.3502, + "step": 10811 + }, + { + "epoch": 6.040223463687151, + "grad_norm": 0.5526122450828552, + "learning_rate": 0.0007001680672268908, + "loss": 0.4661, + "step": 10812 + }, + { + "epoch": 6.040782122905028, + "grad_norm": 0.5164427757263184, + "learning_rate": 0.000700140056022409, + "loss": 0.5412, + "step": 10813 + }, + { + "epoch": 6.041340782122905, + "grad_norm": 0.46552905440330505, + "learning_rate": 0.0007001120448179273, + "loss": 0.5202, + "step": 10814 + }, + { + "epoch": 6.0418994413407825, + "grad_norm": 0.4000746011734009, + "learning_rate": 0.0007000840336134454, + "loss": 0.4629, + "step": 10815 + }, + { + "epoch": 6.042458100558659, + "grad_norm": 0.42634910345077515, + "learning_rate": 0.0007000560224089636, + "loss": 0.3893, + "step": 10816 + }, + { + "epoch": 6.0430167597765365, + "grad_norm": 0.5009442567825317, + "learning_rate": 0.0007000280112044818, + "loss": 0.4373, + "step": 10817 + }, + { + "epoch": 6.043575418994413, + "grad_norm": 0.660260021686554, + "learning_rate": 0.0007, + "loss": 0.3632, + "step": 10818 + }, + { + "epoch": 6.0441340782122905, + "grad_norm": 1.0034247636795044, + "learning_rate": 0.0006999719887955183, + "loss": 0.4488, + "step": 10819 + }, + { + "epoch": 6.044692737430168, + "grad_norm": 0.7092230916023254, + "learning_rate": 0.0006999439775910364, + "loss": 0.4103, + "step": 10820 + }, + { + "epoch": 6.045251396648045, + "grad_norm": 0.6398437023162842, + "learning_rate": 0.0006999159663865546, + "loss": 0.5053, + "step": 10821 + }, + { + "epoch": 6.045810055865922, + "grad_norm": 0.6886416077613831, + "learning_rate": 0.0006998879551820728, + "loss": 0.5198, + "step": 10822 + }, + { + "epoch": 6.046368715083799, + "grad_norm": 0.4392039179801941, + "learning_rate": 0.000699859943977591, + "loss": 0.3337, + "step": 10823 + }, + { + "epoch": 6.046927374301676, + "grad_norm": 0.6431219577789307, + "learning_rate": 0.0006998319327731093, + "loss": 0.459, + "step": 10824 + }, + { + "epoch": 6.0474860335195535, + "grad_norm": 0.8359147310256958, + "learning_rate": 0.0006998039215686274, + "loss": 0.4441, + "step": 10825 + }, + { + "epoch": 6.04804469273743, + "grad_norm": 1.4082962274551392, + "learning_rate": 0.0006997759103641456, + "loss": 0.3914, + "step": 10826 + }, + { + "epoch": 6.0486033519553075, + "grad_norm": 0.7931913137435913, + "learning_rate": 0.0006997478991596638, + "loss": 0.389, + "step": 10827 + }, + { + "epoch": 6.049162011173184, + "grad_norm": 1.2825291156768799, + "learning_rate": 0.000699719887955182, + "loss": 0.4261, + "step": 10828 + }, + { + "epoch": 6.0497206703910615, + "grad_norm": 0.7035611867904663, + "learning_rate": 0.0006996918767507004, + "loss": 0.616, + "step": 10829 + }, + { + "epoch": 6.050279329608939, + "grad_norm": 2.3387644290924072, + "learning_rate": 0.0006996638655462186, + "loss": 0.4332, + "step": 10830 + }, + { + "epoch": 6.050837988826816, + "grad_norm": 0.46920862793922424, + "learning_rate": 0.0006996358543417367, + "loss": 0.3945, + "step": 10831 + }, + { + "epoch": 6.051396648044693, + "grad_norm": 0.7833524942398071, + "learning_rate": 0.0006996078431372549, + "loss": 0.5773, + "step": 10832 + }, + { + "epoch": 6.05195530726257, + "grad_norm": 0.6396458745002747, + "learning_rate": 0.0006995798319327731, + "loss": 0.4959, + "step": 10833 + }, + { + "epoch": 6.052513966480447, + "grad_norm": 0.836787760257721, + "learning_rate": 0.0006995518207282914, + "loss": 0.3687, + "step": 10834 + }, + { + "epoch": 6.053072625698324, + "grad_norm": 0.6487652659416199, + "learning_rate": 0.0006995238095238096, + "loss": 0.6013, + "step": 10835 + }, + { + "epoch": 6.053631284916201, + "grad_norm": 0.7880006432533264, + "learning_rate": 0.0006994957983193277, + "loss": 0.455, + "step": 10836 + }, + { + "epoch": 6.0541899441340785, + "grad_norm": 0.7589018940925598, + "learning_rate": 0.0006994677871148459, + "loss": 0.5345, + "step": 10837 + }, + { + "epoch": 6.054748603351955, + "grad_norm": 0.7541858553886414, + "learning_rate": 0.0006994397759103641, + "loss": 0.3533, + "step": 10838 + }, + { + "epoch": 6.0553072625698325, + "grad_norm": 0.49204450845718384, + "learning_rate": 0.0006994117647058824, + "loss": 0.4926, + "step": 10839 + }, + { + "epoch": 6.055865921787709, + "grad_norm": 0.4427184760570526, + "learning_rate": 0.0006993837535014006, + "loss": 0.3897, + "step": 10840 + }, + { + "epoch": 6.056424581005587, + "grad_norm": 0.47213247418403625, + "learning_rate": 0.0006993557422969187, + "loss": 0.339, + "step": 10841 + }, + { + "epoch": 6.056983240223464, + "grad_norm": 0.5522042512893677, + "learning_rate": 0.0006993277310924369, + "loss": 0.4789, + "step": 10842 + }, + { + "epoch": 6.057541899441341, + "grad_norm": 0.7679473757743835, + "learning_rate": 0.0006992997198879551, + "loss": 0.5913, + "step": 10843 + }, + { + "epoch": 6.058100558659218, + "grad_norm": 1.0393184423446655, + "learning_rate": 0.0006992717086834735, + "loss": 0.5372, + "step": 10844 + }, + { + "epoch": 6.058659217877095, + "grad_norm": 0.8953571319580078, + "learning_rate": 0.0006992436974789917, + "loss": 0.5416, + "step": 10845 + }, + { + "epoch": 6.059217877094972, + "grad_norm": 1.2806627750396729, + "learning_rate": 0.0006992156862745099, + "loss": 0.4492, + "step": 10846 + }, + { + "epoch": 6.0597765363128495, + "grad_norm": 2.110543727874756, + "learning_rate": 0.000699187675070028, + "loss": 0.3661, + "step": 10847 + }, + { + "epoch": 6.060335195530726, + "grad_norm": 1.072935700416565, + "learning_rate": 0.0006991596638655462, + "loss": 0.4846, + "step": 10848 + }, + { + "epoch": 6.0608938547486035, + "grad_norm": 0.4998075067996979, + "learning_rate": 0.0006991316526610645, + "loss": 0.3933, + "step": 10849 + }, + { + "epoch": 6.06145251396648, + "grad_norm": 0.5481994152069092, + "learning_rate": 0.0006991036414565827, + "loss": 0.5352, + "step": 10850 + }, + { + "epoch": 6.062011173184358, + "grad_norm": 0.5999717712402344, + "learning_rate": 0.0006990756302521009, + "loss": 0.5079, + "step": 10851 + }, + { + "epoch": 6.062569832402235, + "grad_norm": 0.44448959827423096, + "learning_rate": 0.000699047619047619, + "loss": 0.5184, + "step": 10852 + }, + { + "epoch": 6.063128491620112, + "grad_norm": 0.7252323031425476, + "learning_rate": 0.0006990196078431372, + "loss": 0.3998, + "step": 10853 + }, + { + "epoch": 6.063687150837989, + "grad_norm": 0.6027101278305054, + "learning_rate": 0.0006989915966386555, + "loss": 0.4991, + "step": 10854 + }, + { + "epoch": 6.064245810055866, + "grad_norm": 0.9127732515335083, + "learning_rate": 0.0006989635854341737, + "loss": 0.4681, + "step": 10855 + }, + { + "epoch": 6.064804469273743, + "grad_norm": 0.40761685371398926, + "learning_rate": 0.0006989355742296919, + "loss": 0.4431, + "step": 10856 + }, + { + "epoch": 6.0653631284916205, + "grad_norm": 0.41250237822532654, + "learning_rate": 0.00069890756302521, + "loss": 0.4757, + "step": 10857 + }, + { + "epoch": 6.065921787709497, + "grad_norm": 0.573297381401062, + "learning_rate": 0.0006988795518207282, + "loss": 0.5019, + "step": 10858 + }, + { + "epoch": 6.0664804469273745, + "grad_norm": 0.6602842211723328, + "learning_rate": 0.0006988515406162465, + "loss": 0.4551, + "step": 10859 + }, + { + "epoch": 6.067039106145251, + "grad_norm": 0.6826991438865662, + "learning_rate": 0.0006988235294117648, + "loss": 0.427, + "step": 10860 + }, + { + "epoch": 6.067597765363129, + "grad_norm": 0.482316255569458, + "learning_rate": 0.000698795518207283, + "loss": 0.3526, + "step": 10861 + }, + { + "epoch": 6.068156424581006, + "grad_norm": 0.9022002220153809, + "learning_rate": 0.0006987675070028012, + "loss": 0.45, + "step": 10862 + }, + { + "epoch": 6.068715083798883, + "grad_norm": 0.5509048700332642, + "learning_rate": 0.0006987394957983193, + "loss": 0.4807, + "step": 10863 + }, + { + "epoch": 6.06927374301676, + "grad_norm": 2.927882671356201, + "learning_rate": 0.0006987114845938376, + "loss": 0.443, + "step": 10864 + }, + { + "epoch": 6.069832402234637, + "grad_norm": 0.712352991104126, + "learning_rate": 0.0006986834733893558, + "loss": 0.5154, + "step": 10865 + }, + { + "epoch": 6.070391061452514, + "grad_norm": 0.7284734845161438, + "learning_rate": 0.000698655462184874, + "loss": 0.3712, + "step": 10866 + }, + { + "epoch": 6.070949720670391, + "grad_norm": 0.4085279405117035, + "learning_rate": 0.0006986274509803922, + "loss": 0.4312, + "step": 10867 + }, + { + "epoch": 6.071508379888268, + "grad_norm": 0.6925187706947327, + "learning_rate": 0.0006985994397759103, + "loss": 0.4419, + "step": 10868 + }, + { + "epoch": 6.0720670391061455, + "grad_norm": 2.800260066986084, + "learning_rate": 0.0006985714285714286, + "loss": 0.3975, + "step": 10869 + }, + { + "epoch": 6.072625698324022, + "grad_norm": 0.6637143492698669, + "learning_rate": 0.0006985434173669468, + "loss": 0.5204, + "step": 10870 + }, + { + "epoch": 6.0731843575419, + "grad_norm": 0.5931059718132019, + "learning_rate": 0.000698515406162465, + "loss": 0.4851, + "step": 10871 + }, + { + "epoch": 6.073743016759776, + "grad_norm": 2.2102060317993164, + "learning_rate": 0.0006984873949579832, + "loss": 0.5179, + "step": 10872 + }, + { + "epoch": 6.074301675977654, + "grad_norm": 0.5598397254943848, + "learning_rate": 0.0006984593837535013, + "loss": 0.4871, + "step": 10873 + }, + { + "epoch": 6.074860335195531, + "grad_norm": 0.483901709318161, + "learning_rate": 0.0006984313725490196, + "loss": 0.3633, + "step": 10874 + }, + { + "epoch": 6.075418994413408, + "grad_norm": 1.239761233329773, + "learning_rate": 0.0006984033613445378, + "loss": 0.4288, + "step": 10875 + }, + { + "epoch": 6.075977653631285, + "grad_norm": 0.5510937571525574, + "learning_rate": 0.000698375350140056, + "loss": 0.4639, + "step": 10876 + }, + { + "epoch": 6.076536312849162, + "grad_norm": 0.5686545372009277, + "learning_rate": 0.0006983473389355743, + "loss": 0.4021, + "step": 10877 + }, + { + "epoch": 6.077094972067039, + "grad_norm": 2.322188377380371, + "learning_rate": 0.0006983193277310925, + "loss": 0.3936, + "step": 10878 + }, + { + "epoch": 6.0776536312849165, + "grad_norm": 0.7091884016990662, + "learning_rate": 0.0006982913165266107, + "loss": 0.371, + "step": 10879 + }, + { + "epoch": 6.078212290502793, + "grad_norm": 0.6452813148498535, + "learning_rate": 0.0006982633053221289, + "loss": 0.4153, + "step": 10880 + }, + { + "epoch": 6.078770949720671, + "grad_norm": 0.4575195908546448, + "learning_rate": 0.0006982352941176471, + "loss": 0.4103, + "step": 10881 + }, + { + "epoch": 6.079329608938547, + "grad_norm": 0.5927631258964539, + "learning_rate": 0.0006982072829131653, + "loss": 0.6678, + "step": 10882 + }, + { + "epoch": 6.079888268156425, + "grad_norm": 0.521142840385437, + "learning_rate": 0.0006981792717086835, + "loss": 0.454, + "step": 10883 + }, + { + "epoch": 6.080446927374302, + "grad_norm": 0.6282399892807007, + "learning_rate": 0.0006981512605042017, + "loss": 0.471, + "step": 10884 + }, + { + "epoch": 6.081005586592179, + "grad_norm": 0.6412461996078491, + "learning_rate": 0.0006981232492997199, + "loss": 0.4296, + "step": 10885 + }, + { + "epoch": 6.081564245810056, + "grad_norm": 0.41716331243515015, + "learning_rate": 0.0006980952380952381, + "loss": 0.3336, + "step": 10886 + }, + { + "epoch": 6.082122905027933, + "grad_norm": 0.43193814158439636, + "learning_rate": 0.0006980672268907563, + "loss": 0.4132, + "step": 10887 + }, + { + "epoch": 6.08268156424581, + "grad_norm": 0.36316362023353577, + "learning_rate": 0.0006980392156862745, + "loss": 0.3262, + "step": 10888 + }, + { + "epoch": 6.0832402234636875, + "grad_norm": 0.5462936162948608, + "learning_rate": 0.0006980112044817928, + "loss": 0.523, + "step": 10889 + }, + { + "epoch": 6.083798882681564, + "grad_norm": 0.5788365602493286, + "learning_rate": 0.0006979831932773109, + "loss": 0.572, + "step": 10890 + }, + { + "epoch": 6.084357541899442, + "grad_norm": 0.4404538869857788, + "learning_rate": 0.0006979551820728291, + "loss": 0.3874, + "step": 10891 + }, + { + "epoch": 6.084916201117318, + "grad_norm": 0.670192539691925, + "learning_rate": 0.0006979271708683473, + "loss": 0.4835, + "step": 10892 + }, + { + "epoch": 6.085474860335196, + "grad_norm": 0.44022247195243835, + "learning_rate": 0.0006978991596638656, + "loss": 0.4005, + "step": 10893 + }, + { + "epoch": 6.086033519553073, + "grad_norm": 0.45533043146133423, + "learning_rate": 0.0006978711484593839, + "loss": 0.3686, + "step": 10894 + }, + { + "epoch": 6.08659217877095, + "grad_norm": 1.3965787887573242, + "learning_rate": 0.000697843137254902, + "loss": 0.4165, + "step": 10895 + }, + { + "epoch": 6.087150837988827, + "grad_norm": 0.8195257782936096, + "learning_rate": 0.0006978151260504202, + "loss": 0.4008, + "step": 10896 + }, + { + "epoch": 6.087709497206704, + "grad_norm": 0.8205869197845459, + "learning_rate": 0.0006977871148459384, + "loss": 0.3417, + "step": 10897 + }, + { + "epoch": 6.088268156424581, + "grad_norm": 1.1164945363998413, + "learning_rate": 0.0006977591036414566, + "loss": 0.3749, + "step": 10898 + }, + { + "epoch": 6.0888268156424585, + "grad_norm": 0.40060552954673767, + "learning_rate": 0.0006977310924369749, + "loss": 0.4623, + "step": 10899 + }, + { + "epoch": 6.089385474860335, + "grad_norm": 0.5135126113891602, + "learning_rate": 0.000697703081232493, + "loss": 0.4877, + "step": 10900 + }, + { + "epoch": 6.089944134078213, + "grad_norm": 1.592182993888855, + "learning_rate": 0.0006976750700280112, + "loss": 0.5295, + "step": 10901 + }, + { + "epoch": 6.090502793296089, + "grad_norm": 0.84066241979599, + "learning_rate": 0.0006976470588235294, + "loss": 0.4461, + "step": 10902 + }, + { + "epoch": 6.091061452513967, + "grad_norm": 0.6580346822738647, + "learning_rate": 0.0006976190476190476, + "loss": 0.4283, + "step": 10903 + }, + { + "epoch": 6.091620111731843, + "grad_norm": 0.591770350933075, + "learning_rate": 0.0006975910364145659, + "loss": 0.4478, + "step": 10904 + }, + { + "epoch": 6.092178770949721, + "grad_norm": 2.3529205322265625, + "learning_rate": 0.0006975630252100841, + "loss": 0.4048, + "step": 10905 + }, + { + "epoch": 6.092737430167598, + "grad_norm": 0.8928508162498474, + "learning_rate": 0.0006975350140056022, + "loss": 0.4431, + "step": 10906 + }, + { + "epoch": 6.093296089385475, + "grad_norm": 0.5485340356826782, + "learning_rate": 0.0006975070028011204, + "loss": 0.3834, + "step": 10907 + }, + { + "epoch": 6.093854748603352, + "grad_norm": 0.40008461475372314, + "learning_rate": 0.0006974789915966386, + "loss": 0.3491, + "step": 10908 + }, + { + "epoch": 6.094413407821229, + "grad_norm": 0.44092485308647156, + "learning_rate": 0.000697450980392157, + "loss": 0.4522, + "step": 10909 + }, + { + "epoch": 6.094972067039106, + "grad_norm": 0.41834747791290283, + "learning_rate": 0.0006974229691876752, + "loss": 0.4308, + "step": 10910 + }, + { + "epoch": 6.0955307262569836, + "grad_norm": 1.0991649627685547, + "learning_rate": 0.0006973949579831933, + "loss": 0.5114, + "step": 10911 + }, + { + "epoch": 6.09608938547486, + "grad_norm": 0.9742670655250549, + "learning_rate": 0.0006973669467787115, + "loss": 0.5979, + "step": 10912 + }, + { + "epoch": 6.096648044692738, + "grad_norm": 2.3714523315429688, + "learning_rate": 0.0006973389355742297, + "loss": 0.4323, + "step": 10913 + }, + { + "epoch": 6.097206703910614, + "grad_norm": 0.5217916369438171, + "learning_rate": 0.000697310924369748, + "loss": 0.5236, + "step": 10914 + }, + { + "epoch": 6.097765363128492, + "grad_norm": 0.6856021285057068, + "learning_rate": 0.0006972829131652662, + "loss": 0.4675, + "step": 10915 + }, + { + "epoch": 6.098324022346369, + "grad_norm": 0.7215303182601929, + "learning_rate": 0.0006972549019607843, + "loss": 0.4032, + "step": 10916 + }, + { + "epoch": 6.098882681564246, + "grad_norm": 0.5496611595153809, + "learning_rate": 0.0006972268907563025, + "loss": 0.4939, + "step": 10917 + }, + { + "epoch": 6.099441340782123, + "grad_norm": 0.6390330195426941, + "learning_rate": 0.0006971988795518207, + "loss": 0.5086, + "step": 10918 + }, + { + "epoch": 6.1, + "grad_norm": 0.4926900565624237, + "learning_rate": 0.000697170868347339, + "loss": 0.3372, + "step": 10919 + }, + { + "epoch": 6.100558659217877, + "grad_norm": 0.547737181186676, + "learning_rate": 0.0006971428571428572, + "loss": 0.4523, + "step": 10920 + }, + { + "epoch": 6.1011173184357546, + "grad_norm": 0.5713638663291931, + "learning_rate": 0.0006971148459383754, + "loss": 0.5189, + "step": 10921 + }, + { + "epoch": 6.101675977653631, + "grad_norm": 0.6128251552581787, + "learning_rate": 0.0006970868347338935, + "loss": 0.2731, + "step": 10922 + }, + { + "epoch": 6.102234636871509, + "grad_norm": 0.7094868421554565, + "learning_rate": 0.0006970588235294117, + "loss": 0.4075, + "step": 10923 + }, + { + "epoch": 6.102793296089385, + "grad_norm": 0.482438862323761, + "learning_rate": 0.00069703081232493, + "loss": 0.4253, + "step": 10924 + }, + { + "epoch": 6.103351955307263, + "grad_norm": 0.7579505443572998, + "learning_rate": 0.0006970028011204483, + "loss": 0.5492, + "step": 10925 + }, + { + "epoch": 6.10391061452514, + "grad_norm": 1.9759689569473267, + "learning_rate": 0.0006969747899159665, + "loss": 0.5222, + "step": 10926 + }, + { + "epoch": 6.104469273743017, + "grad_norm": 0.4022391438484192, + "learning_rate": 0.0006969467787114846, + "loss": 0.3137, + "step": 10927 + }, + { + "epoch": 6.105027932960894, + "grad_norm": 0.8322412967681885, + "learning_rate": 0.0006969187675070028, + "loss": 0.53, + "step": 10928 + }, + { + "epoch": 6.105586592178771, + "grad_norm": 0.501099169254303, + "learning_rate": 0.0006968907563025211, + "loss": 0.3716, + "step": 10929 + }, + { + "epoch": 6.106145251396648, + "grad_norm": 0.5984638929367065, + "learning_rate": 0.0006968627450980393, + "loss": 0.4958, + "step": 10930 + }, + { + "epoch": 6.1067039106145256, + "grad_norm": 0.578277587890625, + "learning_rate": 0.0006968347338935575, + "loss": 0.4391, + "step": 10931 + }, + { + "epoch": 6.107262569832402, + "grad_norm": 1.397057056427002, + "learning_rate": 0.0006968067226890756, + "loss": 0.49, + "step": 10932 + }, + { + "epoch": 6.10782122905028, + "grad_norm": 0.376043438911438, + "learning_rate": 0.0006967787114845938, + "loss": 0.4112, + "step": 10933 + }, + { + "epoch": 6.108379888268156, + "grad_norm": 0.4655259847640991, + "learning_rate": 0.000696750700280112, + "loss": 0.4504, + "step": 10934 + }, + { + "epoch": 6.108938547486034, + "grad_norm": 0.7578765749931335, + "learning_rate": 0.0006967226890756303, + "loss": 0.5295, + "step": 10935 + }, + { + "epoch": 6.10949720670391, + "grad_norm": 0.43772193789482117, + "learning_rate": 0.0006966946778711485, + "loss": 0.437, + "step": 10936 + }, + { + "epoch": 6.110055865921788, + "grad_norm": 0.9742315411567688, + "learning_rate": 0.0006966666666666667, + "loss": 0.3953, + "step": 10937 + }, + { + "epoch": 6.110614525139665, + "grad_norm": 0.4262291193008423, + "learning_rate": 0.0006966386554621848, + "loss": 0.4598, + "step": 10938 + }, + { + "epoch": 6.111173184357542, + "grad_norm": 0.4859914481639862, + "learning_rate": 0.000696610644257703, + "loss": 0.3415, + "step": 10939 + }, + { + "epoch": 6.111731843575419, + "grad_norm": 0.7761494517326355, + "learning_rate": 0.0006965826330532213, + "loss": 0.4251, + "step": 10940 + }, + { + "epoch": 6.112290502793296, + "grad_norm": 0.5885061621665955, + "learning_rate": 0.0006965546218487395, + "loss": 0.6029, + "step": 10941 + }, + { + "epoch": 6.112849162011173, + "grad_norm": 0.5668952465057373, + "learning_rate": 0.0006965266106442578, + "loss": 0.5304, + "step": 10942 + }, + { + "epoch": 6.113407821229051, + "grad_norm": 0.6129389405250549, + "learning_rate": 0.0006964985994397759, + "loss": 0.4598, + "step": 10943 + }, + { + "epoch": 6.113966480446927, + "grad_norm": 0.35725122690200806, + "learning_rate": 0.0006964705882352941, + "loss": 0.3318, + "step": 10944 + }, + { + "epoch": 6.114525139664805, + "grad_norm": 1.4380866289138794, + "learning_rate": 0.0006964425770308124, + "loss": 0.4372, + "step": 10945 + }, + { + "epoch": 6.115083798882681, + "grad_norm": 0.6878176331520081, + "learning_rate": 0.0006964145658263306, + "loss": 0.5267, + "step": 10946 + }, + { + "epoch": 6.115642458100559, + "grad_norm": 0.7436250448226929, + "learning_rate": 0.0006963865546218488, + "loss": 0.4181, + "step": 10947 + }, + { + "epoch": 6.116201117318436, + "grad_norm": 0.8786506056785583, + "learning_rate": 0.0006963585434173669, + "loss": 0.5409, + "step": 10948 + }, + { + "epoch": 6.116759776536313, + "grad_norm": 0.4254874587059021, + "learning_rate": 0.0006963305322128851, + "loss": 0.4699, + "step": 10949 + }, + { + "epoch": 6.11731843575419, + "grad_norm": 0.7329756021499634, + "learning_rate": 0.0006963025210084034, + "loss": 0.5014, + "step": 10950 + }, + { + "epoch": 6.117877094972067, + "grad_norm": 0.7794732451438904, + "learning_rate": 0.0006962745098039216, + "loss": 0.3411, + "step": 10951 + }, + { + "epoch": 6.118435754189944, + "grad_norm": 0.6167165637016296, + "learning_rate": 0.0006962464985994398, + "loss": 0.5812, + "step": 10952 + }, + { + "epoch": 6.118994413407822, + "grad_norm": 0.49819305539131165, + "learning_rate": 0.000696218487394958, + "loss": 0.4392, + "step": 10953 + }, + { + "epoch": 6.119553072625698, + "grad_norm": 0.4116550385951996, + "learning_rate": 0.0006961904761904761, + "loss": 0.3648, + "step": 10954 + }, + { + "epoch": 6.120111731843576, + "grad_norm": 0.5190322995185852, + "learning_rate": 0.0006961624649859944, + "loss": 0.4, + "step": 10955 + }, + { + "epoch": 6.120670391061452, + "grad_norm": 0.4842533469200134, + "learning_rate": 0.0006961344537815126, + "loss": 0.4824, + "step": 10956 + }, + { + "epoch": 6.12122905027933, + "grad_norm": 0.8224414587020874, + "learning_rate": 0.0006961064425770308, + "loss": 0.5171, + "step": 10957 + }, + { + "epoch": 6.121787709497207, + "grad_norm": 0.5653104186058044, + "learning_rate": 0.000696078431372549, + "loss": 0.3723, + "step": 10958 + }, + { + "epoch": 6.122346368715084, + "grad_norm": 0.9112353920936584, + "learning_rate": 0.0006960504201680671, + "loss": 0.5437, + "step": 10959 + }, + { + "epoch": 6.122905027932961, + "grad_norm": 0.504174530506134, + "learning_rate": 0.0006960224089635855, + "loss": 0.4373, + "step": 10960 + }, + { + "epoch": 6.123463687150838, + "grad_norm": 0.9851964116096497, + "learning_rate": 0.0006959943977591037, + "loss": 0.4757, + "step": 10961 + }, + { + "epoch": 6.124022346368715, + "grad_norm": 2.3386800289154053, + "learning_rate": 0.0006959663865546219, + "loss": 0.3673, + "step": 10962 + }, + { + "epoch": 6.124581005586593, + "grad_norm": 0.6139798760414124, + "learning_rate": 0.0006959383753501401, + "loss": 0.4811, + "step": 10963 + }, + { + "epoch": 6.125139664804469, + "grad_norm": 0.5551549792289734, + "learning_rate": 0.0006959103641456582, + "loss": 0.4608, + "step": 10964 + }, + { + "epoch": 6.125698324022347, + "grad_norm": 0.4621923863887787, + "learning_rate": 0.0006958823529411765, + "loss": 0.5717, + "step": 10965 + }, + { + "epoch": 6.126256983240223, + "grad_norm": 3.5591931343078613, + "learning_rate": 0.0006958543417366947, + "loss": 0.4244, + "step": 10966 + }, + { + "epoch": 6.126815642458101, + "grad_norm": 0.3890925943851471, + "learning_rate": 0.0006958263305322129, + "loss": 0.3799, + "step": 10967 + }, + { + "epoch": 6.127374301675978, + "grad_norm": 0.42973747849464417, + "learning_rate": 0.0006957983193277311, + "loss": 0.3414, + "step": 10968 + }, + { + "epoch": 6.127932960893855, + "grad_norm": 0.4976039230823517, + "learning_rate": 0.0006957703081232493, + "loss": 0.4378, + "step": 10969 + }, + { + "epoch": 6.128491620111732, + "grad_norm": 0.8133490085601807, + "learning_rate": 0.0006957422969187675, + "loss": 0.609, + "step": 10970 + }, + { + "epoch": 6.129050279329609, + "grad_norm": 0.575567364692688, + "learning_rate": 0.0006957142857142857, + "loss": 0.4739, + "step": 10971 + }, + { + "epoch": 6.129608938547486, + "grad_norm": 0.4603343605995178, + "learning_rate": 0.0006956862745098039, + "loss": 0.5469, + "step": 10972 + }, + { + "epoch": 6.130167597765363, + "grad_norm": 0.5232548713684082, + "learning_rate": 0.0006956582633053221, + "loss": 0.3963, + "step": 10973 + }, + { + "epoch": 6.13072625698324, + "grad_norm": 0.9490572810173035, + "learning_rate": 0.0006956302521008403, + "loss": 0.4825, + "step": 10974 + }, + { + "epoch": 6.131284916201118, + "grad_norm": 0.495437353849411, + "learning_rate": 0.0006956022408963586, + "loss": 0.4201, + "step": 10975 + }, + { + "epoch": 6.131843575418994, + "grad_norm": 0.4871094226837158, + "learning_rate": 0.0006955742296918768, + "loss": 0.5183, + "step": 10976 + }, + { + "epoch": 6.132402234636872, + "grad_norm": 0.46342968940734863, + "learning_rate": 0.000695546218487395, + "loss": 0.4424, + "step": 10977 + }, + { + "epoch": 6.132960893854748, + "grad_norm": 0.6024301648139954, + "learning_rate": 0.0006955182072829132, + "loss": 0.5927, + "step": 10978 + }, + { + "epoch": 6.133519553072626, + "grad_norm": 0.6028921008110046, + "learning_rate": 0.0006954901960784314, + "loss": 0.5112, + "step": 10979 + }, + { + "epoch": 6.134078212290503, + "grad_norm": 0.5339969992637634, + "learning_rate": 0.0006954621848739496, + "loss": 0.4154, + "step": 10980 + }, + { + "epoch": 6.13463687150838, + "grad_norm": 0.49067366123199463, + "learning_rate": 0.0006954341736694678, + "loss": 0.3258, + "step": 10981 + }, + { + "epoch": 6.135195530726257, + "grad_norm": 0.4345940947532654, + "learning_rate": 0.000695406162464986, + "loss": 0.4244, + "step": 10982 + }, + { + "epoch": 6.135754189944134, + "grad_norm": 0.4652898907661438, + "learning_rate": 0.0006953781512605042, + "loss": 0.4991, + "step": 10983 + }, + { + "epoch": 6.136312849162011, + "grad_norm": 0.5036503076553345, + "learning_rate": 0.0006953501400560224, + "loss": 0.438, + "step": 10984 + }, + { + "epoch": 6.136871508379889, + "grad_norm": 0.519578218460083, + "learning_rate": 0.0006953221288515407, + "loss": 0.4883, + "step": 10985 + }, + { + "epoch": 6.137430167597765, + "grad_norm": 0.635047435760498, + "learning_rate": 0.0006952941176470588, + "loss": 0.4135, + "step": 10986 + }, + { + "epoch": 6.137988826815643, + "grad_norm": 0.5979741215705872, + "learning_rate": 0.000695266106442577, + "loss": 0.3571, + "step": 10987 + }, + { + "epoch": 6.138547486033519, + "grad_norm": 0.4660211503505707, + "learning_rate": 0.0006952380952380952, + "loss": 0.5042, + "step": 10988 + }, + { + "epoch": 6.139106145251397, + "grad_norm": 0.5490683317184448, + "learning_rate": 0.0006952100840336134, + "loss": 0.4652, + "step": 10989 + }, + { + "epoch": 6.139664804469274, + "grad_norm": 0.7679424285888672, + "learning_rate": 0.0006951820728291317, + "loss": 0.5416, + "step": 10990 + }, + { + "epoch": 6.140223463687151, + "grad_norm": 0.688495934009552, + "learning_rate": 0.0006951540616246498, + "loss": 0.4643, + "step": 10991 + }, + { + "epoch": 6.140782122905028, + "grad_norm": 0.335827112197876, + "learning_rate": 0.000695126050420168, + "loss": 0.3742, + "step": 10992 + }, + { + "epoch": 6.141340782122905, + "grad_norm": 0.5576304197311401, + "learning_rate": 0.0006950980392156863, + "loss": 0.4624, + "step": 10993 + }, + { + "epoch": 6.141899441340782, + "grad_norm": 1.2468128204345703, + "learning_rate": 0.0006950700280112045, + "loss": 0.5582, + "step": 10994 + }, + { + "epoch": 6.14245810055866, + "grad_norm": 0.8157767653465271, + "learning_rate": 0.0006950420168067228, + "loss": 0.4305, + "step": 10995 + }, + { + "epoch": 6.143016759776536, + "grad_norm": 0.5396414995193481, + "learning_rate": 0.0006950140056022409, + "loss": 0.4267, + "step": 10996 + }, + { + "epoch": 6.143575418994414, + "grad_norm": 0.7560551762580872, + "learning_rate": 0.0006949859943977591, + "loss": 0.4171, + "step": 10997 + }, + { + "epoch": 6.14413407821229, + "grad_norm": 0.5291085839271545, + "learning_rate": 0.0006949579831932773, + "loss": 0.5123, + "step": 10998 + }, + { + "epoch": 6.144692737430168, + "grad_norm": 0.5279495716094971, + "learning_rate": 0.0006949299719887955, + "loss": 0.5024, + "step": 10999 + }, + { + "epoch": 6.145251396648045, + "grad_norm": 0.6578406095504761, + "learning_rate": 0.0006949019607843138, + "loss": 0.5366, + "step": 11000 + }, + { + "epoch": 6.145251396648045, + "eval_cer": 0.09167730461632462, + "eval_loss": 0.34710562229156494, + "eval_runtime": 55.6759, + "eval_samples_per_second": 81.508, + "eval_steps_per_second": 5.101, + "eval_wer": 0.36133068908689703, + "step": 11000 + }, + { + "epoch": 6.145810055865922, + "grad_norm": 0.6645103096961975, + "learning_rate": 0.000694873949579832, + "loss": 0.456, + "step": 11001 + }, + { + "epoch": 6.146368715083799, + "grad_norm": 0.485580176115036, + "learning_rate": 0.0006948459383753501, + "loss": 0.4979, + "step": 11002 + }, + { + "epoch": 6.146927374301676, + "grad_norm": 0.603653073310852, + "learning_rate": 0.0006948179271708683, + "loss": 0.5114, + "step": 11003 + }, + { + "epoch": 6.147486033519553, + "grad_norm": 0.6199735403060913, + "learning_rate": 0.0006947899159663865, + "loss": 0.4725, + "step": 11004 + }, + { + "epoch": 6.148044692737431, + "grad_norm": 0.4745272994041443, + "learning_rate": 0.0006947619047619048, + "loss": 0.3131, + "step": 11005 + }, + { + "epoch": 6.148603351955307, + "grad_norm": 0.444668710231781, + "learning_rate": 0.000694733893557423, + "loss": 0.3948, + "step": 11006 + }, + { + "epoch": 6.149162011173185, + "grad_norm": 0.395626425743103, + "learning_rate": 0.0006947058823529411, + "loss": 0.4473, + "step": 11007 + }, + { + "epoch": 6.149720670391061, + "grad_norm": 0.644459068775177, + "learning_rate": 0.0006946778711484593, + "loss": 0.4119, + "step": 11008 + }, + { + "epoch": 6.150279329608939, + "grad_norm": 0.8359875679016113, + "learning_rate": 0.0006946498599439776, + "loss": 0.3634, + "step": 11009 + }, + { + "epoch": 6.150837988826815, + "grad_norm": 0.5190132856369019, + "learning_rate": 0.0006946218487394959, + "loss": 0.4067, + "step": 11010 + }, + { + "epoch": 6.151396648044693, + "grad_norm": 9.551300048828125, + "learning_rate": 0.0006945938375350141, + "loss": 0.427, + "step": 11011 + }, + { + "epoch": 6.15195530726257, + "grad_norm": 0.3548396825790405, + "learning_rate": 0.0006945658263305322, + "loss": 0.3743, + "step": 11012 + }, + { + "epoch": 6.152513966480447, + "grad_norm": 0.6031273603439331, + "learning_rate": 0.0006945378151260504, + "loss": 0.571, + "step": 11013 + }, + { + "epoch": 6.153072625698324, + "grad_norm": 0.4338412880897522, + "learning_rate": 0.0006945098039215686, + "loss": 0.3511, + "step": 11014 + }, + { + "epoch": 6.153631284916201, + "grad_norm": 0.40045738220214844, + "learning_rate": 0.0006944817927170869, + "loss": 0.3809, + "step": 11015 + }, + { + "epoch": 6.154189944134078, + "grad_norm": 0.5165607929229736, + "learning_rate": 0.0006944537815126051, + "loss": 0.4093, + "step": 11016 + }, + { + "epoch": 6.154748603351956, + "grad_norm": 0.782764196395874, + "learning_rate": 0.0006944257703081233, + "loss": 0.497, + "step": 11017 + }, + { + "epoch": 6.155307262569832, + "grad_norm": 0.5764148235321045, + "learning_rate": 0.0006943977591036414, + "loss": 0.4085, + "step": 11018 + }, + { + "epoch": 6.15586592178771, + "grad_norm": 0.5112626552581787, + "learning_rate": 0.0006943697478991596, + "loss": 0.4169, + "step": 11019 + }, + { + "epoch": 6.156424581005586, + "grad_norm": 0.5369787812232971, + "learning_rate": 0.0006943417366946779, + "loss": 0.3273, + "step": 11020 + }, + { + "epoch": 6.156983240223464, + "grad_norm": 0.44596096873283386, + "learning_rate": 0.0006943137254901961, + "loss": 0.4841, + "step": 11021 + }, + { + "epoch": 6.157541899441341, + "grad_norm": 0.46426451206207275, + "learning_rate": 0.0006942857142857143, + "loss": 0.4868, + "step": 11022 + }, + { + "epoch": 6.158100558659218, + "grad_norm": 0.5903044939041138, + "learning_rate": 0.0006942577030812324, + "loss": 0.4962, + "step": 11023 + }, + { + "epoch": 6.158659217877095, + "grad_norm": 0.4951884150505066, + "learning_rate": 0.0006942296918767506, + "loss": 0.5406, + "step": 11024 + }, + { + "epoch": 6.159217877094972, + "grad_norm": 0.5276403427124023, + "learning_rate": 0.000694201680672269, + "loss": 0.421, + "step": 11025 + }, + { + "epoch": 6.159776536312849, + "grad_norm": 7.846343040466309, + "learning_rate": 0.0006941736694677872, + "loss": 0.3978, + "step": 11026 + }, + { + "epoch": 6.160335195530727, + "grad_norm": 0.9427791833877563, + "learning_rate": 0.0006941456582633054, + "loss": 0.6597, + "step": 11027 + }, + { + "epoch": 6.160893854748603, + "grad_norm": 0.44784265756607056, + "learning_rate": 0.0006941176470588235, + "loss": 0.3436, + "step": 11028 + }, + { + "epoch": 6.161452513966481, + "grad_norm": 1.2415848970413208, + "learning_rate": 0.0006940896358543417, + "loss": 0.5221, + "step": 11029 + }, + { + "epoch": 6.162011173184357, + "grad_norm": 0.5714986324310303, + "learning_rate": 0.00069406162464986, + "loss": 0.424, + "step": 11030 + }, + { + "epoch": 6.162569832402235, + "grad_norm": 0.7360231280326843, + "learning_rate": 0.0006940336134453782, + "loss": 0.544, + "step": 11031 + }, + { + "epoch": 6.163128491620112, + "grad_norm": 0.88707035779953, + "learning_rate": 0.0006940056022408964, + "loss": 0.4633, + "step": 11032 + }, + { + "epoch": 6.163687150837989, + "grad_norm": 1.9829038381576538, + "learning_rate": 0.0006939775910364146, + "loss": 0.5355, + "step": 11033 + }, + { + "epoch": 6.164245810055866, + "grad_norm": 0.4707237184047699, + "learning_rate": 0.0006939495798319327, + "loss": 0.4324, + "step": 11034 + }, + { + "epoch": 6.164804469273743, + "grad_norm": 2.2109055519104004, + "learning_rate": 0.000693921568627451, + "loss": 0.4969, + "step": 11035 + }, + { + "epoch": 6.16536312849162, + "grad_norm": 0.6761896014213562, + "learning_rate": 0.0006938935574229692, + "loss": 0.5069, + "step": 11036 + }, + { + "epoch": 6.165921787709498, + "grad_norm": 0.6342883110046387, + "learning_rate": 0.0006938655462184874, + "loss": 0.3927, + "step": 11037 + }, + { + "epoch": 6.166480446927374, + "grad_norm": 1.644527792930603, + "learning_rate": 0.0006938375350140056, + "loss": 0.3895, + "step": 11038 + }, + { + "epoch": 6.167039106145252, + "grad_norm": 0.46073096990585327, + "learning_rate": 0.0006938095238095237, + "loss": 0.523, + "step": 11039 + }, + { + "epoch": 6.167597765363128, + "grad_norm": 2.2446236610412598, + "learning_rate": 0.000693781512605042, + "loss": 0.5024, + "step": 11040 + }, + { + "epoch": 6.168156424581006, + "grad_norm": 0.4837959110736847, + "learning_rate": 0.0006937535014005603, + "loss": 0.5021, + "step": 11041 + }, + { + "epoch": 6.168715083798883, + "grad_norm": 0.4140658974647522, + "learning_rate": 0.0006937254901960785, + "loss": 0.4148, + "step": 11042 + }, + { + "epoch": 6.16927374301676, + "grad_norm": 0.9625561833381653, + "learning_rate": 0.0006936974789915967, + "loss": 0.4664, + "step": 11043 + }, + { + "epoch": 6.169832402234637, + "grad_norm": 1.376027226448059, + "learning_rate": 0.0006936694677871148, + "loss": 0.7111, + "step": 11044 + }, + { + "epoch": 6.170391061452514, + "grad_norm": 0.5922132730484009, + "learning_rate": 0.0006936414565826331, + "loss": 0.474, + "step": 11045 + }, + { + "epoch": 6.170949720670391, + "grad_norm": 0.46543070673942566, + "learning_rate": 0.0006936134453781513, + "loss": 0.5159, + "step": 11046 + }, + { + "epoch": 6.171508379888268, + "grad_norm": 0.6008877754211426, + "learning_rate": 0.0006935854341736695, + "loss": 0.4303, + "step": 11047 + }, + { + "epoch": 6.172067039106145, + "grad_norm": 0.37619924545288086, + "learning_rate": 0.0006935574229691877, + "loss": 0.4361, + "step": 11048 + }, + { + "epoch": 6.172625698324023, + "grad_norm": 1.0105559825897217, + "learning_rate": 0.0006935294117647059, + "loss": 0.5195, + "step": 11049 + }, + { + "epoch": 6.173184357541899, + "grad_norm": 0.5267997980117798, + "learning_rate": 0.0006935014005602241, + "loss": 0.4048, + "step": 11050 + }, + { + "epoch": 6.173743016759777, + "grad_norm": 0.40528345108032227, + "learning_rate": 0.0006934733893557423, + "loss": 0.3128, + "step": 11051 + }, + { + "epoch": 6.174301675977653, + "grad_norm": 0.5212454199790955, + "learning_rate": 0.0006934453781512605, + "loss": 0.476, + "step": 11052 + }, + { + "epoch": 6.174860335195531, + "grad_norm": 2.836933135986328, + "learning_rate": 0.0006934173669467787, + "loss": 0.7061, + "step": 11053 + }, + { + "epoch": 6.175418994413408, + "grad_norm": 0.562639057636261, + "learning_rate": 0.0006933893557422969, + "loss": 0.4495, + "step": 11054 + }, + { + "epoch": 6.175977653631285, + "grad_norm": 0.7593250870704651, + "learning_rate": 0.0006933613445378151, + "loss": 0.4694, + "step": 11055 + }, + { + "epoch": 6.176536312849162, + "grad_norm": 0.6319076418876648, + "learning_rate": 0.0006933333333333333, + "loss": 0.4637, + "step": 11056 + }, + { + "epoch": 6.177094972067039, + "grad_norm": 0.4834778308868408, + "learning_rate": 0.0006933053221288516, + "loss": 0.5669, + "step": 11057 + }, + { + "epoch": 6.177653631284916, + "grad_norm": 0.8001792430877686, + "learning_rate": 0.0006932773109243698, + "loss": 0.4873, + "step": 11058 + }, + { + "epoch": 6.178212290502794, + "grad_norm": 2.105329990386963, + "learning_rate": 0.000693249299719888, + "loss": 0.3873, + "step": 11059 + }, + { + "epoch": 6.17877094972067, + "grad_norm": 0.9396324753761292, + "learning_rate": 0.0006932212885154063, + "loss": 0.476, + "step": 11060 + }, + { + "epoch": 6.179329608938548, + "grad_norm": 1.8959730863571167, + "learning_rate": 0.0006931932773109244, + "loss": 0.4138, + "step": 11061 + }, + { + "epoch": 6.179888268156424, + "grad_norm": 0.5052026510238647, + "learning_rate": 0.0006931652661064426, + "loss": 0.3591, + "step": 11062 + }, + { + "epoch": 6.180446927374302, + "grad_norm": 0.6946987509727478, + "learning_rate": 0.0006931372549019608, + "loss": 0.474, + "step": 11063 + }, + { + "epoch": 6.181005586592179, + "grad_norm": 0.42790892720222473, + "learning_rate": 0.000693109243697479, + "loss": 0.4547, + "step": 11064 + }, + { + "epoch": 6.181564245810056, + "grad_norm": 0.5898541212081909, + "learning_rate": 0.0006930812324929973, + "loss": 0.5052, + "step": 11065 + }, + { + "epoch": 6.182122905027933, + "grad_norm": 0.5448857545852661, + "learning_rate": 0.0006930532212885154, + "loss": 0.4416, + "step": 11066 + }, + { + "epoch": 6.18268156424581, + "grad_norm": 0.443786084651947, + "learning_rate": 0.0006930252100840336, + "loss": 0.3703, + "step": 11067 + }, + { + "epoch": 6.183240223463687, + "grad_norm": 0.7968162894248962, + "learning_rate": 0.0006929971988795518, + "loss": 0.466, + "step": 11068 + }, + { + "epoch": 6.183798882681565, + "grad_norm": 4.451937675476074, + "learning_rate": 0.00069296918767507, + "loss": 0.435, + "step": 11069 + }, + { + "epoch": 6.184357541899441, + "grad_norm": 0.829754114151001, + "learning_rate": 0.0006929411764705883, + "loss": 0.475, + "step": 11070 + }, + { + "epoch": 6.184916201117319, + "grad_norm": 0.7205123901367188, + "learning_rate": 0.0006929131652661064, + "loss": 0.3662, + "step": 11071 + }, + { + "epoch": 6.185474860335195, + "grad_norm": 1.06096351146698, + "learning_rate": 0.0006928851540616246, + "loss": 0.4171, + "step": 11072 + }, + { + "epoch": 6.186033519553073, + "grad_norm": 0.4879666268825531, + "learning_rate": 0.0006928571428571428, + "loss": 0.4564, + "step": 11073 + }, + { + "epoch": 6.18659217877095, + "grad_norm": 5.856074810028076, + "learning_rate": 0.000692829131652661, + "loss": 0.4013, + "step": 11074 + }, + { + "epoch": 6.187150837988827, + "grad_norm": 0.5490097999572754, + "learning_rate": 0.0006928011204481794, + "loss": 0.4586, + "step": 11075 + }, + { + "epoch": 6.187709497206704, + "grad_norm": 0.41726183891296387, + "learning_rate": 0.0006927731092436976, + "loss": 0.4309, + "step": 11076 + }, + { + "epoch": 6.188268156424581, + "grad_norm": 0.4752315580844879, + "learning_rate": 0.0006927450980392157, + "loss": 0.4691, + "step": 11077 + }, + { + "epoch": 6.188826815642458, + "grad_norm": 0.7366651892662048, + "learning_rate": 0.0006927170868347339, + "loss": 0.3906, + "step": 11078 + }, + { + "epoch": 6.189385474860336, + "grad_norm": 1.2753126621246338, + "learning_rate": 0.0006926890756302521, + "loss": 0.4413, + "step": 11079 + }, + { + "epoch": 6.189944134078212, + "grad_norm": 0.747553288936615, + "learning_rate": 0.0006926610644257704, + "loss": 0.4707, + "step": 11080 + }, + { + "epoch": 6.19050279329609, + "grad_norm": 0.6181213855743408, + "learning_rate": 0.0006926330532212886, + "loss": 0.4345, + "step": 11081 + }, + { + "epoch": 6.191061452513966, + "grad_norm": 5.963598728179932, + "learning_rate": 0.0006926050420168067, + "loss": 0.4175, + "step": 11082 + }, + { + "epoch": 6.191620111731844, + "grad_norm": 0.6342845559120178, + "learning_rate": 0.0006925770308123249, + "loss": 0.4417, + "step": 11083 + }, + { + "epoch": 6.19217877094972, + "grad_norm": 1.0767654180526733, + "learning_rate": 0.0006925490196078431, + "loss": 0.4798, + "step": 11084 + }, + { + "epoch": 6.192737430167598, + "grad_norm": 0.8455361723899841, + "learning_rate": 0.0006925210084033614, + "loss": 0.5267, + "step": 11085 + }, + { + "epoch": 6.193296089385475, + "grad_norm": 0.7232323288917542, + "learning_rate": 0.0006924929971988796, + "loss": 0.4536, + "step": 11086 + }, + { + "epoch": 6.193854748603352, + "grad_norm": 0.47066929936408997, + "learning_rate": 0.0006924649859943977, + "loss": 0.3842, + "step": 11087 + }, + { + "epoch": 6.194413407821229, + "grad_norm": 0.5924574732780457, + "learning_rate": 0.0006924369747899159, + "loss": 0.4954, + "step": 11088 + }, + { + "epoch": 6.194972067039106, + "grad_norm": 0.48131537437438965, + "learning_rate": 0.0006924089635854341, + "loss": 0.3583, + "step": 11089 + }, + { + "epoch": 6.195530726256983, + "grad_norm": 0.7969999313354492, + "learning_rate": 0.0006923809523809525, + "loss": 0.3847, + "step": 11090 + }, + { + "epoch": 6.196089385474861, + "grad_norm": 0.5471673607826233, + "learning_rate": 0.0006923529411764707, + "loss": 0.4483, + "step": 11091 + }, + { + "epoch": 6.196648044692737, + "grad_norm": 0.5499786138534546, + "learning_rate": 0.0006923249299719889, + "loss": 0.5417, + "step": 11092 + }, + { + "epoch": 6.197206703910615, + "grad_norm": 0.5248214602470398, + "learning_rate": 0.000692296918767507, + "loss": 0.4075, + "step": 11093 + }, + { + "epoch": 6.197765363128491, + "grad_norm": 0.7355324029922485, + "learning_rate": 0.0006922689075630252, + "loss": 0.4283, + "step": 11094 + }, + { + "epoch": 6.198324022346369, + "grad_norm": 0.8176501989364624, + "learning_rate": 0.0006922408963585435, + "loss": 0.4692, + "step": 11095 + }, + { + "epoch": 6.198882681564246, + "grad_norm": 0.5303985476493835, + "learning_rate": 0.0006922128851540617, + "loss": 0.4899, + "step": 11096 + }, + { + "epoch": 6.199441340782123, + "grad_norm": 0.59996497631073, + "learning_rate": 0.0006921848739495799, + "loss": 0.51, + "step": 11097 + }, + { + "epoch": 6.2, + "grad_norm": 0.6563166379928589, + "learning_rate": 0.000692156862745098, + "loss": 0.3627, + "step": 11098 + }, + { + "epoch": 6.200558659217877, + "grad_norm": 2.303741693496704, + "learning_rate": 0.0006921288515406162, + "loss": 0.5127, + "step": 11099 + }, + { + "epoch": 6.201117318435754, + "grad_norm": 0.5335306525230408, + "learning_rate": 0.0006921008403361345, + "loss": 0.4675, + "step": 11100 + }, + { + "epoch": 6.201675977653632, + "grad_norm": 0.6454495787620544, + "learning_rate": 0.0006920728291316527, + "loss": 0.481, + "step": 11101 + }, + { + "epoch": 6.202234636871508, + "grad_norm": 0.4385012090206146, + "learning_rate": 0.0006920448179271709, + "loss": 0.386, + "step": 11102 + }, + { + "epoch": 6.202793296089386, + "grad_norm": 1.1113243103027344, + "learning_rate": 0.000692016806722689, + "loss": 0.4558, + "step": 11103 + }, + { + "epoch": 6.203351955307262, + "grad_norm": 0.7041577696800232, + "learning_rate": 0.0006919887955182072, + "loss": 0.4173, + "step": 11104 + }, + { + "epoch": 6.20391061452514, + "grad_norm": 0.577457070350647, + "learning_rate": 0.0006919607843137255, + "loss": 0.4972, + "step": 11105 + }, + { + "epoch": 6.204469273743017, + "grad_norm": 0.4837631583213806, + "learning_rate": 0.0006919327731092438, + "loss": 0.4035, + "step": 11106 + }, + { + "epoch": 6.205027932960894, + "grad_norm": 0.4688953161239624, + "learning_rate": 0.000691904761904762, + "loss": 0.4554, + "step": 11107 + }, + { + "epoch": 6.205586592178771, + "grad_norm": 2.351287841796875, + "learning_rate": 0.0006918767507002802, + "loss": 0.5017, + "step": 11108 + }, + { + "epoch": 6.206145251396648, + "grad_norm": 0.6057361960411072, + "learning_rate": 0.0006918487394957983, + "loss": 0.4949, + "step": 11109 + }, + { + "epoch": 6.206703910614525, + "grad_norm": 0.7558247447013855, + "learning_rate": 0.0006918207282913166, + "loss": 0.5533, + "step": 11110 + }, + { + "epoch": 6.207262569832403, + "grad_norm": 0.9976764917373657, + "learning_rate": 0.0006917927170868348, + "loss": 0.411, + "step": 11111 + }, + { + "epoch": 6.207821229050279, + "grad_norm": 0.6596027612686157, + "learning_rate": 0.000691764705882353, + "loss": 0.5329, + "step": 11112 + }, + { + "epoch": 6.208379888268157, + "grad_norm": 0.3358546495437622, + "learning_rate": 0.0006917366946778712, + "loss": 0.2956, + "step": 11113 + }, + { + "epoch": 6.208938547486033, + "grad_norm": 0.5593414902687073, + "learning_rate": 0.0006917086834733893, + "loss": 0.4266, + "step": 11114 + }, + { + "epoch": 6.209497206703911, + "grad_norm": 0.5333533883094788, + "learning_rate": 0.0006916806722689076, + "loss": 0.5247, + "step": 11115 + }, + { + "epoch": 6.210055865921788, + "grad_norm": 0.4437052309513092, + "learning_rate": 0.0006916526610644258, + "loss": 0.4344, + "step": 11116 + }, + { + "epoch": 6.210614525139665, + "grad_norm": 2.343329429626465, + "learning_rate": 0.000691624649859944, + "loss": 0.4622, + "step": 11117 + }, + { + "epoch": 6.211173184357542, + "grad_norm": 0.5243825912475586, + "learning_rate": 0.0006915966386554622, + "loss": 0.444, + "step": 11118 + }, + { + "epoch": 6.211731843575419, + "grad_norm": 0.8121657967567444, + "learning_rate": 0.0006915686274509803, + "loss": 0.4872, + "step": 11119 + }, + { + "epoch": 6.212290502793296, + "grad_norm": 0.43471094965934753, + "learning_rate": 0.0006915406162464986, + "loss": 0.3266, + "step": 11120 + }, + { + "epoch": 6.212849162011173, + "grad_norm": 0.47909021377563477, + "learning_rate": 0.0006915126050420168, + "loss": 0.5425, + "step": 11121 + }, + { + "epoch": 6.21340782122905, + "grad_norm": 0.6361687779426575, + "learning_rate": 0.000691484593837535, + "loss": 0.6024, + "step": 11122 + }, + { + "epoch": 6.213966480446928, + "grad_norm": 0.40451547503471375, + "learning_rate": 0.0006914565826330533, + "loss": 0.4516, + "step": 11123 + }, + { + "epoch": 6.214525139664804, + "grad_norm": 0.5453373789787292, + "learning_rate": 0.0006914285714285715, + "loss": 0.4877, + "step": 11124 + }, + { + "epoch": 6.215083798882682, + "grad_norm": 0.5723085999488831, + "learning_rate": 0.0006914005602240897, + "loss": 0.466, + "step": 11125 + }, + { + "epoch": 6.215642458100558, + "grad_norm": 1.1354963779449463, + "learning_rate": 0.0006913725490196079, + "loss": 0.4337, + "step": 11126 + }, + { + "epoch": 6.216201117318436, + "grad_norm": 0.5381463170051575, + "learning_rate": 0.0006913445378151261, + "loss": 0.5454, + "step": 11127 + }, + { + "epoch": 6.216759776536313, + "grad_norm": 0.6525371074676514, + "learning_rate": 0.0006913165266106443, + "loss": 0.4847, + "step": 11128 + }, + { + "epoch": 6.21731843575419, + "grad_norm": 1.4016340970993042, + "learning_rate": 0.0006912885154061625, + "loss": 0.4733, + "step": 11129 + }, + { + "epoch": 6.217877094972067, + "grad_norm": 0.7441390752792358, + "learning_rate": 0.0006912605042016807, + "loss": 0.3809, + "step": 11130 + }, + { + "epoch": 6.218435754189944, + "grad_norm": 0.3427259922027588, + "learning_rate": 0.0006912324929971989, + "loss": 0.3909, + "step": 11131 + }, + { + "epoch": 6.218994413407821, + "grad_norm": 0.6850032806396484, + "learning_rate": 0.0006912044817927171, + "loss": 0.4014, + "step": 11132 + }, + { + "epoch": 6.219553072625699, + "grad_norm": 0.5331707000732422, + "learning_rate": 0.0006911764705882353, + "loss": 0.4339, + "step": 11133 + }, + { + "epoch": 6.220111731843575, + "grad_norm": 0.5954989194869995, + "learning_rate": 0.0006911484593837535, + "loss": 0.4132, + "step": 11134 + }, + { + "epoch": 6.220670391061453, + "grad_norm": 0.6553730964660645, + "learning_rate": 0.0006911204481792717, + "loss": 0.4344, + "step": 11135 + }, + { + "epoch": 6.221229050279329, + "grad_norm": 0.8323370218276978, + "learning_rate": 0.0006910924369747899, + "loss": 0.4824, + "step": 11136 + }, + { + "epoch": 6.221787709497207, + "grad_norm": 0.5582082271575928, + "learning_rate": 0.0006910644257703081, + "loss": 0.4433, + "step": 11137 + }, + { + "epoch": 6.222346368715084, + "grad_norm": 0.5115128755569458, + "learning_rate": 0.0006910364145658263, + "loss": 0.4503, + "step": 11138 + }, + { + "epoch": 6.222905027932961, + "grad_norm": 0.3388581871986389, + "learning_rate": 0.0006910084033613446, + "loss": 0.3357, + "step": 11139 + }, + { + "epoch": 6.223463687150838, + "grad_norm": 0.48183414340019226, + "learning_rate": 0.0006909803921568629, + "loss": 0.4473, + "step": 11140 + }, + { + "epoch": 6.224022346368715, + "grad_norm": 0.5802580714225769, + "learning_rate": 0.000690952380952381, + "loss": 0.4185, + "step": 11141 + }, + { + "epoch": 6.224581005586592, + "grad_norm": 0.5617557764053345, + "learning_rate": 0.0006909243697478992, + "loss": 0.4135, + "step": 11142 + }, + { + "epoch": 6.22513966480447, + "grad_norm": 0.5625030994415283, + "learning_rate": 0.0006908963585434174, + "loss": 0.4139, + "step": 11143 + }, + { + "epoch": 6.225698324022346, + "grad_norm": 0.37672555446624756, + "learning_rate": 0.0006908683473389356, + "loss": 0.3918, + "step": 11144 + }, + { + "epoch": 6.226256983240224, + "grad_norm": 3.426548480987549, + "learning_rate": 0.0006908403361344539, + "loss": 0.5271, + "step": 11145 + }, + { + "epoch": 6.2268156424581, + "grad_norm": 1.4194186925888062, + "learning_rate": 0.000690812324929972, + "loss": 0.5401, + "step": 11146 + }, + { + "epoch": 6.227374301675978, + "grad_norm": 0.6973713040351868, + "learning_rate": 0.0006907843137254902, + "loss": 0.4769, + "step": 11147 + }, + { + "epoch": 6.227932960893855, + "grad_norm": 0.5698657631874084, + "learning_rate": 0.0006907563025210084, + "loss": 0.4104, + "step": 11148 + }, + { + "epoch": 6.228491620111732, + "grad_norm": 0.582277238368988, + "learning_rate": 0.0006907282913165266, + "loss": 0.3403, + "step": 11149 + }, + { + "epoch": 6.229050279329609, + "grad_norm": 0.5936879515647888, + "learning_rate": 0.0006907002801120449, + "loss": 0.5796, + "step": 11150 + }, + { + "epoch": 6.229608938547486, + "grad_norm": 0.5503798127174377, + "learning_rate": 0.000690672268907563, + "loss": 0.4832, + "step": 11151 + }, + { + "epoch": 6.230167597765363, + "grad_norm": 0.40784943103790283, + "learning_rate": 0.0006906442577030812, + "loss": 0.4235, + "step": 11152 + }, + { + "epoch": 6.230726256983241, + "grad_norm": 0.6808150410652161, + "learning_rate": 0.0006906162464985994, + "loss": 0.5843, + "step": 11153 + }, + { + "epoch": 6.231284916201117, + "grad_norm": 1.1461037397384644, + "learning_rate": 0.0006905882352941176, + "loss": 0.4227, + "step": 11154 + }, + { + "epoch": 6.231843575418995, + "grad_norm": 0.5101745128631592, + "learning_rate": 0.0006905602240896358, + "loss": 0.4225, + "step": 11155 + }, + { + "epoch": 6.232402234636871, + "grad_norm": 0.6600402593612671, + "learning_rate": 0.0006905322128851542, + "loss": 0.4565, + "step": 11156 + }, + { + "epoch": 6.232960893854749, + "grad_norm": 0.6657623052597046, + "learning_rate": 0.0006905042016806723, + "loss": 0.5936, + "step": 11157 + }, + { + "epoch": 6.233519553072625, + "grad_norm": 0.9028830528259277, + "learning_rate": 0.0006904761904761905, + "loss": 0.364, + "step": 11158 + }, + { + "epoch": 6.234078212290503, + "grad_norm": 0.7732554078102112, + "learning_rate": 0.0006904481792717087, + "loss": 0.4662, + "step": 11159 + }, + { + "epoch": 6.23463687150838, + "grad_norm": 0.6529476046562195, + "learning_rate": 0.0006904201680672269, + "loss": 0.3948, + "step": 11160 + }, + { + "epoch": 6.235195530726257, + "grad_norm": 0.3281601071357727, + "learning_rate": 0.0006903921568627452, + "loss": 0.3461, + "step": 11161 + }, + { + "epoch": 6.235754189944134, + "grad_norm": 13.75284194946289, + "learning_rate": 0.0006903641456582633, + "loss": 0.6447, + "step": 11162 + }, + { + "epoch": 6.236312849162011, + "grad_norm": 0.4098324775695801, + "learning_rate": 0.0006903361344537815, + "loss": 0.4928, + "step": 11163 + }, + { + "epoch": 6.236871508379888, + "grad_norm": 0.5982289910316467, + "learning_rate": 0.0006903081232492997, + "loss": 0.4533, + "step": 11164 + }, + { + "epoch": 6.237430167597766, + "grad_norm": 0.49558225274086, + "learning_rate": 0.0006902801120448179, + "loss": 0.4527, + "step": 11165 + }, + { + "epoch": 6.237988826815642, + "grad_norm": 0.7247928977012634, + "learning_rate": 0.0006902521008403362, + "loss": 0.4862, + "step": 11166 + }, + { + "epoch": 6.23854748603352, + "grad_norm": 0.5528486371040344, + "learning_rate": 0.0006902240896358543, + "loss": 0.4562, + "step": 11167 + }, + { + "epoch": 6.239106145251396, + "grad_norm": 0.5573023557662964, + "learning_rate": 0.0006901960784313725, + "loss": 0.3641, + "step": 11168 + }, + { + "epoch": 6.239664804469274, + "grad_norm": 0.8229979872703552, + "learning_rate": 0.0006901680672268907, + "loss": 0.3698, + "step": 11169 + }, + { + "epoch": 6.240223463687151, + "grad_norm": 0.6634820103645325, + "learning_rate": 0.0006901400560224089, + "loss": 0.6129, + "step": 11170 + }, + { + "epoch": 6.240782122905028, + "grad_norm": 0.4847666025161743, + "learning_rate": 0.0006901120448179273, + "loss": 0.4213, + "step": 11171 + }, + { + "epoch": 6.241340782122905, + "grad_norm": 0.7485854029655457, + "learning_rate": 0.0006900840336134455, + "loss": 0.4874, + "step": 11172 + }, + { + "epoch": 6.241899441340782, + "grad_norm": 1.7963004112243652, + "learning_rate": 0.0006900560224089636, + "loss": 0.4252, + "step": 11173 + }, + { + "epoch": 6.242458100558659, + "grad_norm": 0.613847553730011, + "learning_rate": 0.0006900280112044818, + "loss": 0.4939, + "step": 11174 + }, + { + "epoch": 6.243016759776537, + "grad_norm": 0.5112985372543335, + "learning_rate": 0.00069, + "loss": 0.4271, + "step": 11175 + }, + { + "epoch": 6.243575418994413, + "grad_norm": 0.47414660453796387, + "learning_rate": 0.0006899719887955183, + "loss": 0.3679, + "step": 11176 + }, + { + "epoch": 6.244134078212291, + "grad_norm": 1.2529995441436768, + "learning_rate": 0.0006899439775910365, + "loss": 0.3995, + "step": 11177 + }, + { + "epoch": 6.244692737430167, + "grad_norm": 0.6523088216781616, + "learning_rate": 0.0006899159663865546, + "loss": 0.4173, + "step": 11178 + }, + { + "epoch": 6.245251396648045, + "grad_norm": 0.5217496752738953, + "learning_rate": 0.0006898879551820728, + "loss": 0.4311, + "step": 11179 + }, + { + "epoch": 6.245810055865922, + "grad_norm": 0.830846905708313, + "learning_rate": 0.000689859943977591, + "loss": 0.4375, + "step": 11180 + }, + { + "epoch": 6.246368715083799, + "grad_norm": 0.5552548170089722, + "learning_rate": 0.0006898319327731093, + "loss": 0.4065, + "step": 11181 + }, + { + "epoch": 6.246927374301676, + "grad_norm": 0.4875868856906891, + "learning_rate": 0.0006898039215686275, + "loss": 0.3814, + "step": 11182 + }, + { + "epoch": 6.247486033519553, + "grad_norm": 0.4327409863471985, + "learning_rate": 0.0006897759103641456, + "loss": 0.3711, + "step": 11183 + }, + { + "epoch": 6.24804469273743, + "grad_norm": 0.6267342567443848, + "learning_rate": 0.0006897478991596638, + "loss": 0.5649, + "step": 11184 + }, + { + "epoch": 6.248603351955307, + "grad_norm": 0.4159642159938812, + "learning_rate": 0.000689719887955182, + "loss": 0.3937, + "step": 11185 + }, + { + "epoch": 6.249162011173184, + "grad_norm": 0.4459898769855499, + "learning_rate": 0.0006896918767507003, + "loss": 0.3673, + "step": 11186 + }, + { + "epoch": 6.249720670391062, + "grad_norm": 0.6854798793792725, + "learning_rate": 0.0006896638655462185, + "loss": 0.4599, + "step": 11187 + }, + { + "epoch": 6.250279329608938, + "grad_norm": 0.5844081044197083, + "learning_rate": 0.0006896358543417368, + "loss": 0.5304, + "step": 11188 + }, + { + "epoch": 6.250837988826816, + "grad_norm": 0.5282191634178162, + "learning_rate": 0.0006896078431372549, + "loss": 0.3807, + "step": 11189 + }, + { + "epoch": 6.251396648044693, + "grad_norm": 0.7448979020118713, + "learning_rate": 0.0006895798319327731, + "loss": 0.5425, + "step": 11190 + }, + { + "epoch": 6.25195530726257, + "grad_norm": 0.5471347570419312, + "learning_rate": 0.0006895518207282914, + "loss": 0.5398, + "step": 11191 + }, + { + "epoch": 6.252513966480447, + "grad_norm": 0.461185097694397, + "learning_rate": 0.0006895238095238096, + "loss": 0.4359, + "step": 11192 + }, + { + "epoch": 6.253072625698324, + "grad_norm": 0.5877832770347595, + "learning_rate": 0.0006894957983193278, + "loss": 0.554, + "step": 11193 + }, + { + "epoch": 6.253631284916201, + "grad_norm": 1.967076301574707, + "learning_rate": 0.0006894677871148459, + "loss": 0.4656, + "step": 11194 + }, + { + "epoch": 6.254189944134078, + "grad_norm": 0.5672575235366821, + "learning_rate": 0.0006894397759103641, + "loss": 0.4791, + "step": 11195 + }, + { + "epoch": 6.254748603351955, + "grad_norm": 0.4789290726184845, + "learning_rate": 0.0006894117647058824, + "loss": 0.3457, + "step": 11196 + }, + { + "epoch": 6.255307262569833, + "grad_norm": 0.5638444423675537, + "learning_rate": 0.0006893837535014006, + "loss": 0.4037, + "step": 11197 + }, + { + "epoch": 6.255865921787709, + "grad_norm": 0.7057085037231445, + "learning_rate": 0.0006893557422969188, + "loss": 0.4674, + "step": 11198 + }, + { + "epoch": 6.256424581005587, + "grad_norm": 2.4486758708953857, + "learning_rate": 0.0006893277310924369, + "loss": 0.391, + "step": 11199 + }, + { + "epoch": 6.256983240223463, + "grad_norm": 0.5865665674209595, + "learning_rate": 0.0006892997198879551, + "loss": 0.5648, + "step": 11200 + }, + { + "epoch": 6.257541899441341, + "grad_norm": 0.6624398827552795, + "learning_rate": 0.0006892717086834734, + "loss": 0.6568, + "step": 11201 + }, + { + "epoch": 6.258100558659218, + "grad_norm": 0.581609308719635, + "learning_rate": 0.0006892436974789916, + "loss": 0.4473, + "step": 11202 + }, + { + "epoch": 6.258659217877095, + "grad_norm": 0.49125662446022034, + "learning_rate": 0.0006892156862745098, + "loss": 0.497, + "step": 11203 + }, + { + "epoch": 6.259217877094972, + "grad_norm": 0.4363628029823303, + "learning_rate": 0.000689187675070028, + "loss": 0.4023, + "step": 11204 + }, + { + "epoch": 6.259776536312849, + "grad_norm": 1.1108808517456055, + "learning_rate": 0.0006891596638655461, + "loss": 0.5842, + "step": 11205 + }, + { + "epoch": 6.260335195530726, + "grad_norm": 0.3604320287704468, + "learning_rate": 0.0006891316526610645, + "loss": 0.3915, + "step": 11206 + }, + { + "epoch": 6.260893854748604, + "grad_norm": 0.54599928855896, + "learning_rate": 0.0006891036414565827, + "loss": 0.4666, + "step": 11207 + }, + { + "epoch": 6.26145251396648, + "grad_norm": 0.43186667561531067, + "learning_rate": 0.0006890756302521009, + "loss": 0.4123, + "step": 11208 + }, + { + "epoch": 6.262011173184358, + "grad_norm": 0.4337267577648163, + "learning_rate": 0.0006890476190476191, + "loss": 0.372, + "step": 11209 + }, + { + "epoch": 6.262569832402234, + "grad_norm": 0.7012616395950317, + "learning_rate": 0.0006890196078431372, + "loss": 0.4099, + "step": 11210 + }, + { + "epoch": 6.263128491620112, + "grad_norm": 0.5912482142448425, + "learning_rate": 0.0006889915966386555, + "loss": 0.4252, + "step": 11211 + }, + { + "epoch": 6.263687150837989, + "grad_norm": 0.5213072299957275, + "learning_rate": 0.0006889635854341737, + "loss": 0.4222, + "step": 11212 + }, + { + "epoch": 6.264245810055866, + "grad_norm": 0.547344446182251, + "learning_rate": 0.0006889355742296919, + "loss": 0.4906, + "step": 11213 + }, + { + "epoch": 6.264804469273743, + "grad_norm": 0.40549513697624207, + "learning_rate": 0.0006889075630252101, + "loss": 0.4714, + "step": 11214 + }, + { + "epoch": 6.26536312849162, + "grad_norm": 0.6650232076644897, + "learning_rate": 0.0006888795518207282, + "loss": 0.479, + "step": 11215 + }, + { + "epoch": 6.265921787709497, + "grad_norm": 0.717628538608551, + "learning_rate": 0.0006888515406162465, + "loss": 0.5645, + "step": 11216 + }, + { + "epoch": 6.266480446927375, + "grad_norm": 0.825869083404541, + "learning_rate": 0.0006888235294117647, + "loss": 0.393, + "step": 11217 + }, + { + "epoch": 6.267039106145251, + "grad_norm": 0.5630092620849609, + "learning_rate": 0.0006887955182072829, + "loss": 0.3434, + "step": 11218 + }, + { + "epoch": 6.267597765363129, + "grad_norm": 0.7241304516792297, + "learning_rate": 0.0006887675070028011, + "loss": 0.4394, + "step": 11219 + }, + { + "epoch": 6.268156424581005, + "grad_norm": 0.4889439046382904, + "learning_rate": 0.0006887394957983193, + "loss": 0.4697, + "step": 11220 + }, + { + "epoch": 6.268715083798883, + "grad_norm": 0.8159801959991455, + "learning_rate": 0.0006887114845938376, + "loss": 0.5028, + "step": 11221 + }, + { + "epoch": 6.269273743016759, + "grad_norm": 0.4378489553928375, + "learning_rate": 0.0006886834733893558, + "loss": 0.4606, + "step": 11222 + }, + { + "epoch": 6.269832402234637, + "grad_norm": 0.36192065477371216, + "learning_rate": 0.000688655462184874, + "loss": 0.2936, + "step": 11223 + }, + { + "epoch": 6.270391061452514, + "grad_norm": 0.36072278022766113, + "learning_rate": 0.0006886274509803922, + "loss": 0.4113, + "step": 11224 + }, + { + "epoch": 6.270949720670391, + "grad_norm": 3.2129781246185303, + "learning_rate": 0.0006885994397759104, + "loss": 0.4601, + "step": 11225 + }, + { + "epoch": 6.271508379888268, + "grad_norm": 0.5723569393157959, + "learning_rate": 0.0006885714285714286, + "loss": 0.416, + "step": 11226 + }, + { + "epoch": 6.272067039106146, + "grad_norm": 1.205926775932312, + "learning_rate": 0.0006885434173669468, + "loss": 0.3953, + "step": 11227 + }, + { + "epoch": 6.272625698324022, + "grad_norm": 0.5477660298347473, + "learning_rate": 0.000688515406162465, + "loss": 0.4305, + "step": 11228 + }, + { + "epoch": 6.2731843575419, + "grad_norm": 0.4841614365577698, + "learning_rate": 0.0006884873949579832, + "loss": 0.4569, + "step": 11229 + }, + { + "epoch": 6.273743016759776, + "grad_norm": 1.21365487575531, + "learning_rate": 0.0006884593837535014, + "loss": 0.5721, + "step": 11230 + }, + { + "epoch": 6.274301675977654, + "grad_norm": 0.8916204571723938, + "learning_rate": 0.0006884313725490196, + "loss": 0.4278, + "step": 11231 + }, + { + "epoch": 6.27486033519553, + "grad_norm": 0.566136360168457, + "learning_rate": 0.0006884033613445378, + "loss": 0.3799, + "step": 11232 + }, + { + "epoch": 6.275418994413408, + "grad_norm": 0.7047808766365051, + "learning_rate": 0.000688375350140056, + "loss": 0.4102, + "step": 11233 + }, + { + "epoch": 6.275977653631285, + "grad_norm": 1.0304712057113647, + "learning_rate": 0.0006883473389355742, + "loss": 0.3965, + "step": 11234 + }, + { + "epoch": 6.276536312849162, + "grad_norm": 0.7288039922714233, + "learning_rate": 0.0006883193277310924, + "loss": 0.6641, + "step": 11235 + }, + { + "epoch": 6.277094972067039, + "grad_norm": 0.9225782752037048, + "learning_rate": 0.0006882913165266107, + "loss": 0.4155, + "step": 11236 + }, + { + "epoch": 6.277653631284916, + "grad_norm": 0.4543639123439789, + "learning_rate": 0.0006882633053221288, + "loss": 0.469, + "step": 11237 + }, + { + "epoch": 6.278212290502793, + "grad_norm": 0.4584183990955353, + "learning_rate": 0.000688235294117647, + "loss": 0.4513, + "step": 11238 + }, + { + "epoch": 6.278770949720671, + "grad_norm": 0.5508493185043335, + "learning_rate": 0.0006882072829131653, + "loss": 0.5104, + "step": 11239 + }, + { + "epoch": 6.279329608938547, + "grad_norm": 0.5691764950752258, + "learning_rate": 0.0006881792717086835, + "loss": 0.4397, + "step": 11240 + }, + { + "epoch": 6.279888268156425, + "grad_norm": 0.39158087968826294, + "learning_rate": 0.0006881512605042018, + "loss": 0.4525, + "step": 11241 + }, + { + "epoch": 6.280446927374301, + "grad_norm": 0.8279297351837158, + "learning_rate": 0.0006881232492997199, + "loss": 0.5181, + "step": 11242 + }, + { + "epoch": 6.281005586592179, + "grad_norm": 0.48144567012786865, + "learning_rate": 0.0006880952380952381, + "loss": 0.4334, + "step": 11243 + }, + { + "epoch": 6.281564245810056, + "grad_norm": 0.5417659282684326, + "learning_rate": 0.0006880672268907563, + "loss": 0.4659, + "step": 11244 + }, + { + "epoch": 6.282122905027933, + "grad_norm": 0.4606843888759613, + "learning_rate": 0.0006880392156862745, + "loss": 0.4423, + "step": 11245 + }, + { + "epoch": 6.28268156424581, + "grad_norm": 0.5715280175209045, + "learning_rate": 0.0006880112044817928, + "loss": 0.5241, + "step": 11246 + }, + { + "epoch": 6.283240223463687, + "grad_norm": 0.7559725046157837, + "learning_rate": 0.0006879831932773109, + "loss": 0.445, + "step": 11247 + }, + { + "epoch": 6.283798882681564, + "grad_norm": 0.45731693506240845, + "learning_rate": 0.0006879551820728291, + "loss": 0.4304, + "step": 11248 + }, + { + "epoch": 6.284357541899442, + "grad_norm": 0.42387619614601135, + "learning_rate": 0.0006879271708683473, + "loss": 0.378, + "step": 11249 + }, + { + "epoch": 6.284916201117318, + "grad_norm": 1.0018961429595947, + "learning_rate": 0.0006878991596638655, + "loss": 0.3522, + "step": 11250 + }, + { + "epoch": 6.285474860335196, + "grad_norm": 0.38723766803741455, + "learning_rate": 0.0006878711484593838, + "loss": 0.3889, + "step": 11251 + }, + { + "epoch": 6.286033519553072, + "grad_norm": 0.5693039894104004, + "learning_rate": 0.000687843137254902, + "loss": 0.4528, + "step": 11252 + }, + { + "epoch": 6.28659217877095, + "grad_norm": 0.5103390216827393, + "learning_rate": 0.0006878151260504201, + "loss": 0.5247, + "step": 11253 + }, + { + "epoch": 6.287150837988827, + "grad_norm": 1.2775022983551025, + "learning_rate": 0.0006877871148459383, + "loss": 0.5028, + "step": 11254 + }, + { + "epoch": 6.287709497206704, + "grad_norm": 1.2915436029434204, + "learning_rate": 0.0006877591036414566, + "loss": 0.4331, + "step": 11255 + }, + { + "epoch": 6.288268156424581, + "grad_norm": 0.49905383586883545, + "learning_rate": 0.0006877310924369749, + "loss": 0.4093, + "step": 11256 + }, + { + "epoch": 6.288826815642458, + "grad_norm": 0.36620283126831055, + "learning_rate": 0.0006877030812324931, + "loss": 0.3804, + "step": 11257 + }, + { + "epoch": 6.289385474860335, + "grad_norm": 0.5044558048248291, + "learning_rate": 0.0006876750700280112, + "loss": 0.4475, + "step": 11258 + }, + { + "epoch": 6.289944134078212, + "grad_norm": 1.3240736722946167, + "learning_rate": 0.0006876470588235294, + "loss": 0.3913, + "step": 11259 + }, + { + "epoch": 6.290502793296089, + "grad_norm": 0.4556600749492645, + "learning_rate": 0.0006876190476190476, + "loss": 0.3909, + "step": 11260 + }, + { + "epoch": 6.291061452513967, + "grad_norm": 0.35912489891052246, + "learning_rate": 0.0006875910364145659, + "loss": 0.374, + "step": 11261 + }, + { + "epoch": 6.291620111731843, + "grad_norm": 0.8574111461639404, + "learning_rate": 0.0006875630252100841, + "loss": 0.6357, + "step": 11262 + }, + { + "epoch": 6.292178770949721, + "grad_norm": 0.448162317276001, + "learning_rate": 0.0006875350140056022, + "loss": 0.3778, + "step": 11263 + }, + { + "epoch": 6.292737430167598, + "grad_norm": 0.5595035552978516, + "learning_rate": 0.0006875070028011204, + "loss": 0.416, + "step": 11264 + }, + { + "epoch": 6.293296089385475, + "grad_norm": 0.5972183346748352, + "learning_rate": 0.0006874789915966386, + "loss": 0.5932, + "step": 11265 + }, + { + "epoch": 6.293854748603352, + "grad_norm": 0.6435426473617554, + "learning_rate": 0.0006874509803921569, + "loss": 0.695, + "step": 11266 + }, + { + "epoch": 6.294413407821229, + "grad_norm": 0.5722102522850037, + "learning_rate": 0.0006874229691876751, + "loss": 0.483, + "step": 11267 + }, + { + "epoch": 6.294972067039106, + "grad_norm": 0.4654307961463928, + "learning_rate": 0.0006873949579831933, + "loss": 0.3998, + "step": 11268 + }, + { + "epoch": 6.295530726256983, + "grad_norm": 0.5992904901504517, + "learning_rate": 0.0006873669467787114, + "loss": 0.51, + "step": 11269 + }, + { + "epoch": 6.29608938547486, + "grad_norm": 0.458285927772522, + "learning_rate": 0.0006873389355742296, + "loss": 0.4778, + "step": 11270 + }, + { + "epoch": 6.296648044692738, + "grad_norm": 0.8597704768180847, + "learning_rate": 0.000687310924369748, + "loss": 0.4958, + "step": 11271 + }, + { + "epoch": 6.297206703910614, + "grad_norm": 0.6881229877471924, + "learning_rate": 0.0006872829131652662, + "loss": 0.5557, + "step": 11272 + }, + { + "epoch": 6.297765363128492, + "grad_norm": 0.5812937021255493, + "learning_rate": 0.0006872549019607844, + "loss": 0.4134, + "step": 11273 + }, + { + "epoch": 6.298324022346368, + "grad_norm": 0.5038365125656128, + "learning_rate": 0.0006872268907563025, + "loss": 0.5143, + "step": 11274 + }, + { + "epoch": 6.298882681564246, + "grad_norm": 0.441057026386261, + "learning_rate": 0.0006871988795518207, + "loss": 0.4384, + "step": 11275 + }, + { + "epoch": 6.299441340782123, + "grad_norm": 0.6233197450637817, + "learning_rate": 0.000687170868347339, + "loss": 0.4051, + "step": 11276 + }, + { + "epoch": 6.3, + "grad_norm": 9.194594383239746, + "learning_rate": 0.0006871428571428572, + "loss": 0.5031, + "step": 11277 + }, + { + "epoch": 6.300558659217877, + "grad_norm": 0.588850200176239, + "learning_rate": 0.0006871148459383754, + "loss": 0.4499, + "step": 11278 + }, + { + "epoch": 6.301117318435754, + "grad_norm": 1.5040974617004395, + "learning_rate": 0.0006870868347338935, + "loss": 0.5344, + "step": 11279 + }, + { + "epoch": 6.301675977653631, + "grad_norm": 0.6354177594184875, + "learning_rate": 0.0006870588235294117, + "loss": 0.3549, + "step": 11280 + }, + { + "epoch": 6.302234636871509, + "grad_norm": 2.3752048015594482, + "learning_rate": 0.00068703081232493, + "loss": 0.5296, + "step": 11281 + }, + { + "epoch": 6.302793296089385, + "grad_norm": 0.7926993370056152, + "learning_rate": 0.0006870028011204482, + "loss": 0.4856, + "step": 11282 + }, + { + "epoch": 6.303351955307263, + "grad_norm": 0.9727863669395447, + "learning_rate": 0.0006869747899159664, + "loss": 0.4839, + "step": 11283 + }, + { + "epoch": 6.303910614525139, + "grad_norm": 0.8497305512428284, + "learning_rate": 0.0006869467787114846, + "loss": 0.5174, + "step": 11284 + }, + { + "epoch": 6.304469273743017, + "grad_norm": 0.39355963468551636, + "learning_rate": 0.0006869187675070027, + "loss": 0.3971, + "step": 11285 + }, + { + "epoch": 6.305027932960894, + "grad_norm": 0.48200666904449463, + "learning_rate": 0.000686890756302521, + "loss": 0.4982, + "step": 11286 + }, + { + "epoch": 6.305586592178771, + "grad_norm": 4.587625980377197, + "learning_rate": 0.0006868627450980393, + "loss": 0.5231, + "step": 11287 + }, + { + "epoch": 6.306145251396648, + "grad_norm": 1.2941746711730957, + "learning_rate": 0.0006868347338935575, + "loss": 0.3786, + "step": 11288 + }, + { + "epoch": 6.306703910614525, + "grad_norm": 0.41783273220062256, + "learning_rate": 0.0006868067226890757, + "loss": 0.36, + "step": 11289 + }, + { + "epoch": 6.307262569832402, + "grad_norm": 0.46241331100463867, + "learning_rate": 0.0006867787114845938, + "loss": 0.4543, + "step": 11290 + }, + { + "epoch": 6.30782122905028, + "grad_norm": 0.5026640892028809, + "learning_rate": 0.0006867507002801121, + "loss": 0.4374, + "step": 11291 + }, + { + "epoch": 6.308379888268156, + "grad_norm": 0.7011479139328003, + "learning_rate": 0.0006867226890756303, + "loss": 0.4345, + "step": 11292 + }, + { + "epoch": 6.308938547486034, + "grad_norm": 0.5282343029975891, + "learning_rate": 0.0006866946778711485, + "loss": 0.4435, + "step": 11293 + }, + { + "epoch": 6.30949720670391, + "grad_norm": 0.6447503566741943, + "learning_rate": 0.0006866666666666667, + "loss": 0.4792, + "step": 11294 + }, + { + "epoch": 6.310055865921788, + "grad_norm": 0.5949207544326782, + "learning_rate": 0.0006866386554621848, + "loss": 0.4869, + "step": 11295 + }, + { + "epoch": 6.310614525139664, + "grad_norm": 0.5433316826820374, + "learning_rate": 0.0006866106442577031, + "loss": 0.4122, + "step": 11296 + }, + { + "epoch": 6.311173184357542, + "grad_norm": 0.5279645919799805, + "learning_rate": 0.0006865826330532213, + "loss": 0.4121, + "step": 11297 + }, + { + "epoch": 6.311731843575419, + "grad_norm": 0.4653691053390503, + "learning_rate": 0.0006865546218487395, + "loss": 0.3951, + "step": 11298 + }, + { + "epoch": 6.312290502793296, + "grad_norm": 0.6034505367279053, + "learning_rate": 0.0006865266106442577, + "loss": 0.6397, + "step": 11299 + }, + { + "epoch": 6.312849162011173, + "grad_norm": 0.547141969203949, + "learning_rate": 0.0006864985994397759, + "loss": 0.423, + "step": 11300 + }, + { + "epoch": 6.31340782122905, + "grad_norm": 1.405989646911621, + "learning_rate": 0.0006864705882352941, + "loss": 0.3807, + "step": 11301 + }, + { + "epoch": 6.313966480446927, + "grad_norm": 0.5511883497238159, + "learning_rate": 0.0006864425770308123, + "loss": 0.4357, + "step": 11302 + }, + { + "epoch": 6.314525139664805, + "grad_norm": 0.48211267590522766, + "learning_rate": 0.0006864145658263306, + "loss": 0.4102, + "step": 11303 + }, + { + "epoch": 6.315083798882681, + "grad_norm": 1.332704782485962, + "learning_rate": 0.0006863865546218488, + "loss": 0.3709, + "step": 11304 + }, + { + "epoch": 6.315642458100559, + "grad_norm": 0.42972102761268616, + "learning_rate": 0.000686358543417367, + "loss": 0.374, + "step": 11305 + }, + { + "epoch": 6.316201117318435, + "grad_norm": 0.5429015159606934, + "learning_rate": 0.0006863305322128852, + "loss": 0.4569, + "step": 11306 + }, + { + "epoch": 6.316759776536313, + "grad_norm": 0.40285253524780273, + "learning_rate": 0.0006863025210084034, + "loss": 0.4019, + "step": 11307 + }, + { + "epoch": 6.31731843575419, + "grad_norm": 0.6154240965843201, + "learning_rate": 0.0006862745098039216, + "loss": 0.4195, + "step": 11308 + }, + { + "epoch": 6.317877094972067, + "grad_norm": 0.36808153986930847, + "learning_rate": 0.0006862464985994398, + "loss": 0.3715, + "step": 11309 + }, + { + "epoch": 6.318435754189944, + "grad_norm": 1.278550148010254, + "learning_rate": 0.000686218487394958, + "loss": 0.4924, + "step": 11310 + }, + { + "epoch": 6.318994413407821, + "grad_norm": 0.7281808853149414, + "learning_rate": 0.0006861904761904763, + "loss": 0.652, + "step": 11311 + }, + { + "epoch": 6.319553072625698, + "grad_norm": 0.8655731678009033, + "learning_rate": 0.0006861624649859944, + "loss": 0.6149, + "step": 11312 + }, + { + "epoch": 6.320111731843576, + "grad_norm": 0.6195973753929138, + "learning_rate": 0.0006861344537815126, + "loss": 0.4744, + "step": 11313 + }, + { + "epoch": 6.320670391061452, + "grad_norm": 0.7145146727561951, + "learning_rate": 0.0006861064425770308, + "loss": 0.4884, + "step": 11314 + }, + { + "epoch": 6.32122905027933, + "grad_norm": 0.8602355718612671, + "learning_rate": 0.000686078431372549, + "loss": 0.508, + "step": 11315 + }, + { + "epoch": 6.321787709497206, + "grad_norm": 1.4732013940811157, + "learning_rate": 0.0006860504201680673, + "loss": 0.5067, + "step": 11316 + }, + { + "epoch": 6.322346368715084, + "grad_norm": 0.8393772840499878, + "learning_rate": 0.0006860224089635854, + "loss": 0.562, + "step": 11317 + }, + { + "epoch": 6.322905027932961, + "grad_norm": 0.5369130373001099, + "learning_rate": 0.0006859943977591036, + "loss": 0.491, + "step": 11318 + }, + { + "epoch": 6.323463687150838, + "grad_norm": 0.45826050639152527, + "learning_rate": 0.0006859663865546218, + "loss": 0.412, + "step": 11319 + }, + { + "epoch": 6.324022346368715, + "grad_norm": 0.4535401463508606, + "learning_rate": 0.00068593837535014, + "loss": 0.3771, + "step": 11320 + }, + { + "epoch": 6.324581005586592, + "grad_norm": 0.6311883926391602, + "learning_rate": 0.0006859103641456584, + "loss": 0.4375, + "step": 11321 + }, + { + "epoch": 6.325139664804469, + "grad_norm": 0.6865175366401672, + "learning_rate": 0.0006858823529411765, + "loss": 0.3538, + "step": 11322 + }, + { + "epoch": 6.325698324022347, + "grad_norm": 0.6206411123275757, + "learning_rate": 0.0006858543417366947, + "loss": 0.4271, + "step": 11323 + }, + { + "epoch": 6.326256983240223, + "grad_norm": 0.5162012577056885, + "learning_rate": 0.0006858263305322129, + "loss": 0.3732, + "step": 11324 + }, + { + "epoch": 6.326815642458101, + "grad_norm": 0.633078932762146, + "learning_rate": 0.0006857983193277311, + "loss": 0.4613, + "step": 11325 + }, + { + "epoch": 6.327374301675977, + "grad_norm": 0.4444442093372345, + "learning_rate": 0.0006857703081232494, + "loss": 0.3598, + "step": 11326 + }, + { + "epoch": 6.327932960893855, + "grad_norm": 0.4896429777145386, + "learning_rate": 0.0006857422969187676, + "loss": 0.414, + "step": 11327 + }, + { + "epoch": 6.328491620111732, + "grad_norm": 0.4538799524307251, + "learning_rate": 0.0006857142857142857, + "loss": 0.3602, + "step": 11328 + }, + { + "epoch": 6.329050279329609, + "grad_norm": 0.6309373378753662, + "learning_rate": 0.0006856862745098039, + "loss": 0.4364, + "step": 11329 + }, + { + "epoch": 6.329608938547486, + "grad_norm": 0.5095107555389404, + "learning_rate": 0.0006856582633053221, + "loss": 0.4064, + "step": 11330 + }, + { + "epoch": 6.330167597765363, + "grad_norm": 0.7527093887329102, + "learning_rate": 0.0006856302521008404, + "loss": 0.482, + "step": 11331 + }, + { + "epoch": 6.33072625698324, + "grad_norm": 0.5486929416656494, + "learning_rate": 0.0006856022408963586, + "loss": 0.5146, + "step": 11332 + }, + { + "epoch": 6.331284916201117, + "grad_norm": 0.8615903854370117, + "learning_rate": 0.0006855742296918767, + "loss": 0.5306, + "step": 11333 + }, + { + "epoch": 6.331843575418994, + "grad_norm": 0.5210286974906921, + "learning_rate": 0.0006855462184873949, + "loss": 0.5113, + "step": 11334 + }, + { + "epoch": 6.332402234636872, + "grad_norm": 0.542382001876831, + "learning_rate": 0.0006855182072829131, + "loss": 0.4101, + "step": 11335 + }, + { + "epoch": 6.332960893854748, + "grad_norm": 3.9956793785095215, + "learning_rate": 0.0006854901960784315, + "loss": 0.4807, + "step": 11336 + }, + { + "epoch": 6.333519553072626, + "grad_norm": 0.5636622309684753, + "learning_rate": 0.0006854621848739497, + "loss": 0.607, + "step": 11337 + }, + { + "epoch": 6.334078212290502, + "grad_norm": 0.5061502456665039, + "learning_rate": 0.0006854341736694678, + "loss": 0.4997, + "step": 11338 + }, + { + "epoch": 6.33463687150838, + "grad_norm": 0.5561506748199463, + "learning_rate": 0.000685406162464986, + "loss": 0.4133, + "step": 11339 + }, + { + "epoch": 6.335195530726257, + "grad_norm": 0.5363943576812744, + "learning_rate": 0.0006853781512605042, + "loss": 0.4951, + "step": 11340 + }, + { + "epoch": 6.335754189944134, + "grad_norm": 0.5179654359817505, + "learning_rate": 0.0006853501400560225, + "loss": 0.51, + "step": 11341 + }, + { + "epoch": 6.336312849162011, + "grad_norm": 0.5023557543754578, + "learning_rate": 0.0006853221288515407, + "loss": 0.3788, + "step": 11342 + }, + { + "epoch": 6.336871508379888, + "grad_norm": 0.8036561012268066, + "learning_rate": 0.0006852941176470589, + "loss": 0.5158, + "step": 11343 + }, + { + "epoch": 6.337430167597765, + "grad_norm": 0.4334007799625397, + "learning_rate": 0.000685266106442577, + "loss": 0.446, + "step": 11344 + }, + { + "epoch": 6.337988826815643, + "grad_norm": 0.7328843474388123, + "learning_rate": 0.0006852380952380952, + "loss": 0.4617, + "step": 11345 + }, + { + "epoch": 6.338547486033519, + "grad_norm": 0.4876031279563904, + "learning_rate": 0.0006852100840336135, + "loss": 0.3867, + "step": 11346 + }, + { + "epoch": 6.339106145251397, + "grad_norm": 0.5504794120788574, + "learning_rate": 0.0006851820728291317, + "loss": 0.5307, + "step": 11347 + }, + { + "epoch": 6.339664804469273, + "grad_norm": 0.7562088966369629, + "learning_rate": 0.0006851540616246499, + "loss": 0.455, + "step": 11348 + }, + { + "epoch": 6.340223463687151, + "grad_norm": 0.5796620845794678, + "learning_rate": 0.000685126050420168, + "loss": 0.4663, + "step": 11349 + }, + { + "epoch": 6.340782122905028, + "grad_norm": 0.7910043001174927, + "learning_rate": 0.0006850980392156862, + "loss": 0.434, + "step": 11350 + }, + { + "epoch": 6.341340782122905, + "grad_norm": 1.6399033069610596, + "learning_rate": 0.0006850700280112045, + "loss": 0.4973, + "step": 11351 + }, + { + "epoch": 6.341899441340782, + "grad_norm": 0.580203115940094, + "learning_rate": 0.0006850420168067228, + "loss": 0.4167, + "step": 11352 + }, + { + "epoch": 6.342458100558659, + "grad_norm": 0.7104974389076233, + "learning_rate": 0.000685014005602241, + "loss": 0.3859, + "step": 11353 + }, + { + "epoch": 6.343016759776536, + "grad_norm": 0.5037657618522644, + "learning_rate": 0.0006849859943977591, + "loss": 0.4201, + "step": 11354 + }, + { + "epoch": 6.343575418994414, + "grad_norm": 1.741327166557312, + "learning_rate": 0.0006849579831932773, + "loss": 0.4617, + "step": 11355 + }, + { + "epoch": 6.34413407821229, + "grad_norm": 0.6208281517028809, + "learning_rate": 0.0006849299719887956, + "loss": 0.5056, + "step": 11356 + }, + { + "epoch": 6.344692737430168, + "grad_norm": 0.8182736039161682, + "learning_rate": 0.0006849019607843138, + "loss": 0.5144, + "step": 11357 + }, + { + "epoch": 6.345251396648044, + "grad_norm": 1.3300970792770386, + "learning_rate": 0.000684873949579832, + "loss": 0.4582, + "step": 11358 + }, + { + "epoch": 6.345810055865922, + "grad_norm": 0.5753659605979919, + "learning_rate": 0.0006848459383753502, + "loss": 0.3599, + "step": 11359 + }, + { + "epoch": 6.346368715083799, + "grad_norm": 0.6026843786239624, + "learning_rate": 0.0006848179271708683, + "loss": 0.3852, + "step": 11360 + }, + { + "epoch": 6.346927374301676, + "grad_norm": 0.4887150824069977, + "learning_rate": 0.0006847899159663866, + "loss": 0.4188, + "step": 11361 + }, + { + "epoch": 6.347486033519553, + "grad_norm": 0.6755757927894592, + "learning_rate": 0.0006847619047619048, + "loss": 0.4709, + "step": 11362 + }, + { + "epoch": 6.34804469273743, + "grad_norm": 1.9353394508361816, + "learning_rate": 0.000684733893557423, + "loss": 0.5553, + "step": 11363 + }, + { + "epoch": 6.348603351955307, + "grad_norm": 0.527420699596405, + "learning_rate": 0.0006847058823529412, + "loss": 0.4796, + "step": 11364 + }, + { + "epoch": 6.349162011173185, + "grad_norm": 0.646976113319397, + "learning_rate": 0.0006846778711484593, + "loss": 0.6217, + "step": 11365 + }, + { + "epoch": 6.349720670391061, + "grad_norm": 0.5248512625694275, + "learning_rate": 0.0006846498599439776, + "loss": 0.437, + "step": 11366 + }, + { + "epoch": 6.350279329608939, + "grad_norm": 0.5128051042556763, + "learning_rate": 0.0006846218487394958, + "loss": 0.5445, + "step": 11367 + }, + { + "epoch": 6.350837988826815, + "grad_norm": 0.4348798394203186, + "learning_rate": 0.000684593837535014, + "loss": 0.4063, + "step": 11368 + }, + { + "epoch": 6.351396648044693, + "grad_norm": 0.9783489108085632, + "learning_rate": 0.0006845658263305323, + "loss": 0.5715, + "step": 11369 + }, + { + "epoch": 6.351955307262569, + "grad_norm": 0.6436389684677124, + "learning_rate": 0.0006845378151260504, + "loss": 0.4803, + "step": 11370 + }, + { + "epoch": 6.352513966480447, + "grad_norm": 0.5941984057426453, + "learning_rate": 0.0006845098039215687, + "loss": 0.485, + "step": 11371 + }, + { + "epoch": 6.353072625698324, + "grad_norm": 0.6140072345733643, + "learning_rate": 0.0006844817927170869, + "loss": 0.3766, + "step": 11372 + }, + { + "epoch": 6.353631284916201, + "grad_norm": 0.955089807510376, + "learning_rate": 0.0006844537815126051, + "loss": 0.5038, + "step": 11373 + }, + { + "epoch": 6.354189944134078, + "grad_norm": 0.5532419085502625, + "learning_rate": 0.0006844257703081233, + "loss": 0.4049, + "step": 11374 + }, + { + "epoch": 6.354748603351955, + "grad_norm": 0.5676504969596863, + "learning_rate": 0.0006843977591036415, + "loss": 0.491, + "step": 11375 + }, + { + "epoch": 6.355307262569832, + "grad_norm": 0.5290305614471436, + "learning_rate": 0.0006843697478991596, + "loss": 0.4818, + "step": 11376 + }, + { + "epoch": 6.35586592178771, + "grad_norm": 0.4742472171783447, + "learning_rate": 0.0006843417366946779, + "loss": 0.4658, + "step": 11377 + }, + { + "epoch": 6.356424581005586, + "grad_norm": 0.508601188659668, + "learning_rate": 0.0006843137254901961, + "loss": 0.4436, + "step": 11378 + }, + { + "epoch": 6.356983240223464, + "grad_norm": 0.5605050325393677, + "learning_rate": 0.0006842857142857143, + "loss": 0.4842, + "step": 11379 + }, + { + "epoch": 6.35754189944134, + "grad_norm": 1.2105941772460938, + "learning_rate": 0.0006842577030812325, + "loss": 0.5238, + "step": 11380 + }, + { + "epoch": 6.358100558659218, + "grad_norm": 1.2805817127227783, + "learning_rate": 0.0006842296918767506, + "loss": 0.5011, + "step": 11381 + }, + { + "epoch": 6.358659217877095, + "grad_norm": 0.5717642903327942, + "learning_rate": 0.0006842016806722689, + "loss": 0.4405, + "step": 11382 + }, + { + "epoch": 6.359217877094972, + "grad_norm": 0.5865155458450317, + "learning_rate": 0.0006841736694677871, + "loss": 0.4009, + "step": 11383 + }, + { + "epoch": 6.359776536312849, + "grad_norm": 0.5778446793556213, + "learning_rate": 0.0006841456582633053, + "loss": 0.4466, + "step": 11384 + }, + { + "epoch": 6.360335195530726, + "grad_norm": 0.9259564280509949, + "learning_rate": 0.0006841176470588236, + "loss": 0.4787, + "step": 11385 + }, + { + "epoch": 6.360893854748603, + "grad_norm": 3.199401378631592, + "learning_rate": 0.0006840896358543416, + "loss": 0.362, + "step": 11386 + }, + { + "epoch": 6.361452513966481, + "grad_norm": 0.5453430414199829, + "learning_rate": 0.00068406162464986, + "loss": 0.4299, + "step": 11387 + }, + { + "epoch": 6.362011173184357, + "grad_norm": 0.4666514992713928, + "learning_rate": 0.0006840336134453782, + "loss": 0.4708, + "step": 11388 + }, + { + "epoch": 6.362569832402235, + "grad_norm": 0.5683820247650146, + "learning_rate": 0.0006840056022408964, + "loss": 0.3801, + "step": 11389 + }, + { + "epoch": 6.363128491620111, + "grad_norm": 0.5500078797340393, + "learning_rate": 0.0006839775910364146, + "loss": 0.4868, + "step": 11390 + }, + { + "epoch": 6.363687150837989, + "grad_norm": 2.013174295425415, + "learning_rate": 0.0006839495798319328, + "loss": 0.5194, + "step": 11391 + }, + { + "epoch": 6.364245810055866, + "grad_norm": 0.9869382977485657, + "learning_rate": 0.000683921568627451, + "loss": 0.4943, + "step": 11392 + }, + { + "epoch": 6.364804469273743, + "grad_norm": 0.5604583024978638, + "learning_rate": 0.0006838935574229692, + "loss": 0.3823, + "step": 11393 + }, + { + "epoch": 6.36536312849162, + "grad_norm": 0.8456790447235107, + "learning_rate": 0.0006838655462184874, + "loss": 0.5184, + "step": 11394 + }, + { + "epoch": 6.365921787709497, + "grad_norm": 0.3989706337451935, + "learning_rate": 0.0006838375350140056, + "loss": 0.3675, + "step": 11395 + }, + { + "epoch": 6.366480446927374, + "grad_norm": 0.6773026585578918, + "learning_rate": 0.0006838095238095238, + "loss": 0.4875, + "step": 11396 + }, + { + "epoch": 6.367039106145251, + "grad_norm": 1.5115110874176025, + "learning_rate": 0.000683781512605042, + "loss": 0.4182, + "step": 11397 + }, + { + "epoch": 6.367597765363128, + "grad_norm": 0.6385343074798584, + "learning_rate": 0.0006837535014005602, + "loss": 0.4116, + "step": 11398 + }, + { + "epoch": 6.368156424581006, + "grad_norm": 1.1313433647155762, + "learning_rate": 0.0006837254901960784, + "loss": 0.4713, + "step": 11399 + }, + { + "epoch": 6.368715083798882, + "grad_norm": 0.9814034104347229, + "learning_rate": 0.0006836974789915966, + "loss": 0.384, + "step": 11400 + }, + { + "epoch": 6.36927374301676, + "grad_norm": 0.5504094958305359, + "learning_rate": 0.0006836694677871148, + "loss": 0.4777, + "step": 11401 + }, + { + "epoch": 6.369832402234637, + "grad_norm": 0.6145797371864319, + "learning_rate": 0.000683641456582633, + "loss": 0.4953, + "step": 11402 + }, + { + "epoch": 6.370391061452514, + "grad_norm": 1.0758200883865356, + "learning_rate": 0.0006836134453781513, + "loss": 0.6673, + "step": 11403 + }, + { + "epoch": 6.370949720670391, + "grad_norm": 0.5151605010032654, + "learning_rate": 0.0006835854341736695, + "loss": 0.5061, + "step": 11404 + }, + { + "epoch": 6.371508379888268, + "grad_norm": 0.9018517732620239, + "learning_rate": 0.0006835574229691877, + "loss": 0.4564, + "step": 11405 + }, + { + "epoch": 6.372067039106145, + "grad_norm": 0.47681277990341187, + "learning_rate": 0.0006835294117647059, + "loss": 0.4757, + "step": 11406 + }, + { + "epoch": 6.372625698324022, + "grad_norm": 0.4705057740211487, + "learning_rate": 0.0006835014005602242, + "loss": 0.4192, + "step": 11407 + }, + { + "epoch": 6.373184357541899, + "grad_norm": 0.5153605341911316, + "learning_rate": 0.0006834733893557423, + "loss": 0.4288, + "step": 11408 + }, + { + "epoch": 6.373743016759777, + "grad_norm": 0.955181896686554, + "learning_rate": 0.0006834453781512605, + "loss": 0.5228, + "step": 11409 + }, + { + "epoch": 6.374301675977653, + "grad_norm": 0.491047203540802, + "learning_rate": 0.0006834173669467787, + "loss": 0.4676, + "step": 11410 + }, + { + "epoch": 6.374860335195531, + "grad_norm": 0.5429548621177673, + "learning_rate": 0.0006833893557422969, + "loss": 0.4835, + "step": 11411 + }, + { + "epoch": 6.375418994413407, + "grad_norm": 0.6758902072906494, + "learning_rate": 0.0006833613445378152, + "loss": 0.5477, + "step": 11412 + }, + { + "epoch": 6.375977653631285, + "grad_norm": 0.5432047247886658, + "learning_rate": 0.0006833333333333333, + "loss": 0.4523, + "step": 11413 + }, + { + "epoch": 6.376536312849162, + "grad_norm": 0.5245417356491089, + "learning_rate": 0.0006833053221288515, + "loss": 0.4615, + "step": 11414 + }, + { + "epoch": 6.377094972067039, + "grad_norm": 0.5416039228439331, + "learning_rate": 0.0006832773109243697, + "loss": 0.3934, + "step": 11415 + }, + { + "epoch": 6.377653631284916, + "grad_norm": 0.7096738219261169, + "learning_rate": 0.0006832492997198879, + "loss": 0.4815, + "step": 11416 + }, + { + "epoch": 6.378212290502793, + "grad_norm": 0.481178879737854, + "learning_rate": 0.0006832212885154063, + "loss": 0.4362, + "step": 11417 + }, + { + "epoch": 6.37877094972067, + "grad_norm": 0.46963879466056824, + "learning_rate": 0.0006831932773109243, + "loss": 0.4127, + "step": 11418 + }, + { + "epoch": 6.379329608938548, + "grad_norm": 0.6784202456474304, + "learning_rate": 0.0006831652661064426, + "loss": 0.3865, + "step": 11419 + }, + { + "epoch": 6.379888268156424, + "grad_norm": 0.42951303720474243, + "learning_rate": 0.0006831372549019608, + "loss": 0.4301, + "step": 11420 + }, + { + "epoch": 6.380446927374302, + "grad_norm": 0.6414159536361694, + "learning_rate": 0.000683109243697479, + "loss": 0.5283, + "step": 11421 + }, + { + "epoch": 6.381005586592178, + "grad_norm": 0.674823522567749, + "learning_rate": 0.0006830812324929973, + "loss": 0.4984, + "step": 11422 + }, + { + "epoch": 6.381564245810056, + "grad_norm": 0.44607362151145935, + "learning_rate": 0.0006830532212885155, + "loss": 0.4302, + "step": 11423 + }, + { + "epoch": 6.382122905027933, + "grad_norm": 0.5833104848861694, + "learning_rate": 0.0006830252100840336, + "loss": 0.4465, + "step": 11424 + }, + { + "epoch": 6.38268156424581, + "grad_norm": 0.7467641830444336, + "learning_rate": 0.0006829971988795518, + "loss": 0.4321, + "step": 11425 + }, + { + "epoch": 6.383240223463687, + "grad_norm": 0.5185431838035583, + "learning_rate": 0.00068296918767507, + "loss": 0.3528, + "step": 11426 + }, + { + "epoch": 6.383798882681564, + "grad_norm": 0.8562811613082886, + "learning_rate": 0.0006829411764705883, + "loss": 0.4064, + "step": 11427 + }, + { + "epoch": 6.384357541899441, + "grad_norm": 0.44138962030410767, + "learning_rate": 0.0006829131652661065, + "loss": 0.4917, + "step": 11428 + }, + { + "epoch": 6.384916201117319, + "grad_norm": 0.4690287411212921, + "learning_rate": 0.0006828851540616246, + "loss": 0.4146, + "step": 11429 + }, + { + "epoch": 6.385474860335195, + "grad_norm": 0.5973803400993347, + "learning_rate": 0.0006828571428571428, + "loss": 0.3892, + "step": 11430 + }, + { + "epoch": 6.386033519553073, + "grad_norm": 1.1588422060012817, + "learning_rate": 0.000682829131652661, + "loss": 0.5026, + "step": 11431 + }, + { + "epoch": 6.386592178770949, + "grad_norm": 0.7707207798957825, + "learning_rate": 0.0006828011204481793, + "loss": 0.644, + "step": 11432 + }, + { + "epoch": 6.387150837988827, + "grad_norm": 0.5774877667427063, + "learning_rate": 0.0006827731092436975, + "loss": 0.4519, + "step": 11433 + }, + { + "epoch": 6.3877094972067034, + "grad_norm": 0.4946560561656952, + "learning_rate": 0.0006827450980392156, + "loss": 0.3831, + "step": 11434 + }, + { + "epoch": 6.388268156424581, + "grad_norm": 0.4862198829650879, + "learning_rate": 0.0006827170868347339, + "loss": 0.4656, + "step": 11435 + }, + { + "epoch": 6.388826815642458, + "grad_norm": 0.4830875098705292, + "learning_rate": 0.0006826890756302521, + "loss": 0.4426, + "step": 11436 + }, + { + "epoch": 6.389385474860335, + "grad_norm": 0.7705630660057068, + "learning_rate": 0.0006826610644257704, + "loss": 0.5365, + "step": 11437 + }, + { + "epoch": 6.389944134078212, + "grad_norm": 3.553464651107788, + "learning_rate": 0.0006826330532212886, + "loss": 0.4289, + "step": 11438 + }, + { + "epoch": 6.39050279329609, + "grad_norm": 5.979016304016113, + "learning_rate": 0.0006826050420168068, + "loss": 0.4035, + "step": 11439 + }, + { + "epoch": 6.391061452513966, + "grad_norm": 0.5048288106918335, + "learning_rate": 0.0006825770308123249, + "loss": 0.4505, + "step": 11440 + }, + { + "epoch": 6.391620111731844, + "grad_norm": 0.6292498707771301, + "learning_rate": 0.0006825490196078431, + "loss": 0.3552, + "step": 11441 + }, + { + "epoch": 6.39217877094972, + "grad_norm": 0.5961422920227051, + "learning_rate": 0.0006825210084033614, + "loss": 0.4166, + "step": 11442 + }, + { + "epoch": 6.392737430167598, + "grad_norm": 0.6224048137664795, + "learning_rate": 0.0006824929971988796, + "loss": 0.4495, + "step": 11443 + }, + { + "epoch": 6.3932960893854744, + "grad_norm": 0.8895552158355713, + "learning_rate": 0.0006824649859943978, + "loss": 0.5218, + "step": 11444 + }, + { + "epoch": 6.393854748603352, + "grad_norm": 0.7797205448150635, + "learning_rate": 0.0006824369747899159, + "loss": 0.5335, + "step": 11445 + }, + { + "epoch": 6.394413407821229, + "grad_norm": 1.263240098953247, + "learning_rate": 0.0006824089635854341, + "loss": 0.3963, + "step": 11446 + }, + { + "epoch": 6.394972067039106, + "grad_norm": 0.9457670450210571, + "learning_rate": 0.0006823809523809524, + "loss": 0.5925, + "step": 11447 + }, + { + "epoch": 6.395530726256983, + "grad_norm": 0.91689532995224, + "learning_rate": 0.0006823529411764706, + "loss": 0.3814, + "step": 11448 + }, + { + "epoch": 6.39608938547486, + "grad_norm": 0.5170482993125916, + "learning_rate": 0.0006823249299719888, + "loss": 0.3701, + "step": 11449 + }, + { + "epoch": 6.396648044692737, + "grad_norm": 0.8940397500991821, + "learning_rate": 0.0006822969187675069, + "loss": 0.4448, + "step": 11450 + }, + { + "epoch": 6.397206703910615, + "grad_norm": 0.6540806293487549, + "learning_rate": 0.0006822689075630251, + "loss": 0.4433, + "step": 11451 + }, + { + "epoch": 6.397765363128491, + "grad_norm": 3.5318446159362793, + "learning_rate": 0.0006822408963585435, + "loss": 0.4573, + "step": 11452 + }, + { + "epoch": 6.398324022346369, + "grad_norm": 4.933814525604248, + "learning_rate": 0.0006822128851540617, + "loss": 0.3694, + "step": 11453 + }, + { + "epoch": 6.3988826815642454, + "grad_norm": 0.7283658981323242, + "learning_rate": 0.0006821848739495799, + "loss": 0.3455, + "step": 11454 + }, + { + "epoch": 6.399441340782123, + "grad_norm": 1.1376770734786987, + "learning_rate": 0.0006821568627450981, + "loss": 0.451, + "step": 11455 + }, + { + "epoch": 6.4, + "grad_norm": 0.7912377119064331, + "learning_rate": 0.0006821288515406162, + "loss": 0.481, + "step": 11456 + }, + { + "epoch": 6.400558659217877, + "grad_norm": 1.2436774969100952, + "learning_rate": 0.0006821008403361345, + "loss": 0.4199, + "step": 11457 + }, + { + "epoch": 6.401117318435754, + "grad_norm": 1.6595700979232788, + "learning_rate": 0.0006820728291316527, + "loss": 0.3624, + "step": 11458 + }, + { + "epoch": 6.401675977653631, + "grad_norm": 0.8691661953926086, + "learning_rate": 0.0006820448179271709, + "loss": 0.5335, + "step": 11459 + }, + { + "epoch": 6.402234636871508, + "grad_norm": 1.5098490715026855, + "learning_rate": 0.0006820168067226891, + "loss": 0.3824, + "step": 11460 + }, + { + "epoch": 6.402793296089386, + "grad_norm": 0.693132221698761, + "learning_rate": 0.0006819887955182072, + "loss": 0.5411, + "step": 11461 + }, + { + "epoch": 6.403351955307262, + "grad_norm": 0.7161781191825867, + "learning_rate": 0.0006819607843137255, + "loss": 0.4855, + "step": 11462 + }, + { + "epoch": 6.40391061452514, + "grad_norm": 0.5640398263931274, + "learning_rate": 0.0006819327731092437, + "loss": 0.6136, + "step": 11463 + }, + { + "epoch": 6.4044692737430164, + "grad_norm": 0.5421932935714722, + "learning_rate": 0.0006819047619047619, + "loss": 0.5153, + "step": 11464 + }, + { + "epoch": 6.405027932960894, + "grad_norm": 0.4436020255088806, + "learning_rate": 0.0006818767507002801, + "loss": 0.3355, + "step": 11465 + }, + { + "epoch": 6.405586592178771, + "grad_norm": 0.6029152870178223, + "learning_rate": 0.0006818487394957982, + "loss": 0.4013, + "step": 11466 + }, + { + "epoch": 6.406145251396648, + "grad_norm": 0.5074669122695923, + "learning_rate": 0.0006818207282913166, + "loss": 0.4693, + "step": 11467 + }, + { + "epoch": 6.406703910614525, + "grad_norm": 0.7190271615982056, + "learning_rate": 0.0006817927170868348, + "loss": 0.5282, + "step": 11468 + }, + { + "epoch": 6.407262569832402, + "grad_norm": 0.6207207441329956, + "learning_rate": 0.000681764705882353, + "loss": 0.5311, + "step": 11469 + }, + { + "epoch": 6.407821229050279, + "grad_norm": 0.48141947388648987, + "learning_rate": 0.0006817366946778712, + "loss": 0.4243, + "step": 11470 + }, + { + "epoch": 6.408379888268156, + "grad_norm": 1.4258897304534912, + "learning_rate": 0.0006817086834733894, + "loss": 0.5017, + "step": 11471 + }, + { + "epoch": 6.408938547486033, + "grad_norm": 0.542827844619751, + "learning_rate": 0.0006816806722689076, + "loss": 0.5222, + "step": 11472 + }, + { + "epoch": 6.409497206703911, + "grad_norm": 3.085486650466919, + "learning_rate": 0.0006816526610644258, + "loss": 0.4184, + "step": 11473 + }, + { + "epoch": 6.410055865921787, + "grad_norm": 0.9297780394554138, + "learning_rate": 0.000681624649859944, + "loss": 0.4804, + "step": 11474 + }, + { + "epoch": 6.410614525139665, + "grad_norm": 0.8626901507377625, + "learning_rate": 0.0006815966386554622, + "loss": 0.5163, + "step": 11475 + }, + { + "epoch": 6.411173184357542, + "grad_norm": 0.6746317744255066, + "learning_rate": 0.0006815686274509804, + "loss": 0.5117, + "step": 11476 + }, + { + "epoch": 6.411731843575419, + "grad_norm": 0.3959582448005676, + "learning_rate": 0.0006815406162464986, + "loss": 0.4302, + "step": 11477 + }, + { + "epoch": 6.412290502793296, + "grad_norm": 0.6799911856651306, + "learning_rate": 0.0006815126050420168, + "loss": 0.4736, + "step": 11478 + }, + { + "epoch": 6.412849162011173, + "grad_norm": 0.5117669105529785, + "learning_rate": 0.000681484593837535, + "loss": 0.4298, + "step": 11479 + }, + { + "epoch": 6.41340782122905, + "grad_norm": 0.6756731271743774, + "learning_rate": 0.0006814565826330532, + "loss": 0.3862, + "step": 11480 + }, + { + "epoch": 6.413966480446927, + "grad_norm": 0.7800499796867371, + "learning_rate": 0.0006814285714285714, + "loss": 0.4372, + "step": 11481 + }, + { + "epoch": 6.414525139664804, + "grad_norm": 0.5478665232658386, + "learning_rate": 0.0006814005602240896, + "loss": 0.388, + "step": 11482 + }, + { + "epoch": 6.415083798882682, + "grad_norm": 0.7572131156921387, + "learning_rate": 0.0006813725490196078, + "loss": 0.3239, + "step": 11483 + }, + { + "epoch": 6.415642458100558, + "grad_norm": 0.4937751591205597, + "learning_rate": 0.000681344537815126, + "loss": 0.4346, + "step": 11484 + }, + { + "epoch": 6.416201117318436, + "grad_norm": 0.4888262450695038, + "learning_rate": 0.0006813165266106443, + "loss": 0.4668, + "step": 11485 + }, + { + "epoch": 6.4167597765363125, + "grad_norm": 0.6490456461906433, + "learning_rate": 0.0006812885154061625, + "loss": 0.4866, + "step": 11486 + }, + { + "epoch": 6.41731843575419, + "grad_norm": 0.9592542052268982, + "learning_rate": 0.0006812605042016808, + "loss": 0.4989, + "step": 11487 + }, + { + "epoch": 6.417877094972067, + "grad_norm": 1.562477707862854, + "learning_rate": 0.0006812324929971989, + "loss": 0.3994, + "step": 11488 + }, + { + "epoch": 6.418435754189944, + "grad_norm": 0.5914081931114197, + "learning_rate": 0.0006812044817927171, + "loss": 0.4347, + "step": 11489 + }, + { + "epoch": 6.418994413407821, + "grad_norm": 0.421370267868042, + "learning_rate": 0.0006811764705882353, + "loss": 0.4007, + "step": 11490 + }, + { + "epoch": 6.419553072625698, + "grad_norm": 0.5330716967582703, + "learning_rate": 0.0006811484593837535, + "loss": 0.4284, + "step": 11491 + }, + { + "epoch": 6.420111731843575, + "grad_norm": 0.6734991073608398, + "learning_rate": 0.0006811204481792718, + "loss": 0.469, + "step": 11492 + }, + { + "epoch": 6.420670391061453, + "grad_norm": 0.5247898697853088, + "learning_rate": 0.0006810924369747899, + "loss": 0.377, + "step": 11493 + }, + { + "epoch": 6.421229050279329, + "grad_norm": 0.4436488151550293, + "learning_rate": 0.0006810644257703081, + "loss": 0.3854, + "step": 11494 + }, + { + "epoch": 6.421787709497207, + "grad_norm": 0.4950900375843048, + "learning_rate": 0.0006810364145658263, + "loss": 0.4331, + "step": 11495 + }, + { + "epoch": 6.4223463687150835, + "grad_norm": 0.605148196220398, + "learning_rate": 0.0006810084033613445, + "loss": 0.4171, + "step": 11496 + }, + { + "epoch": 6.422905027932961, + "grad_norm": 0.6597362756729126, + "learning_rate": 0.0006809803921568628, + "loss": 0.4259, + "step": 11497 + }, + { + "epoch": 6.423463687150838, + "grad_norm": 1.375606894493103, + "learning_rate": 0.0006809523809523809, + "loss": 0.4399, + "step": 11498 + }, + { + "epoch": 6.424022346368715, + "grad_norm": 0.6441894769668579, + "learning_rate": 0.0006809243697478991, + "loss": 0.4406, + "step": 11499 + }, + { + "epoch": 6.424581005586592, + "grad_norm": 0.9051856994628906, + "learning_rate": 0.0006808963585434173, + "loss": 0.5179, + "step": 11500 + }, + { + "epoch": 6.424581005586592, + "eval_cer": 0.0949450645368946, + "eval_loss": 0.35293301939964294, + "eval_runtime": 57.2426, + "eval_samples_per_second": 79.277, + "eval_steps_per_second": 4.961, + "eval_wer": 0.380380436300662, + "step": 11500 + }, + { + "epoch": 6.425139664804469, + "grad_norm": 1.4579472541809082, + "learning_rate": 0.0006808683473389356, + "loss": 0.4749, + "step": 11501 + }, + { + "epoch": 6.425698324022346, + "grad_norm": 1.4717273712158203, + "learning_rate": 0.0006808403361344539, + "loss": 0.6759, + "step": 11502 + }, + { + "epoch": 6.426256983240224, + "grad_norm": 0.49385038018226624, + "learning_rate": 0.0006808123249299721, + "loss": 0.4624, + "step": 11503 + }, + { + "epoch": 6.4268156424581, + "grad_norm": 0.48125413060188293, + "learning_rate": 0.0006807843137254902, + "loss": 0.3982, + "step": 11504 + }, + { + "epoch": 6.427374301675978, + "grad_norm": 0.7675278186798096, + "learning_rate": 0.0006807563025210084, + "loss": 0.3999, + "step": 11505 + }, + { + "epoch": 6.4279329608938545, + "grad_norm": 0.8417807221412659, + "learning_rate": 0.0006807282913165266, + "loss": 0.4844, + "step": 11506 + }, + { + "epoch": 6.428491620111732, + "grad_norm": 0.7927025556564331, + "learning_rate": 0.0006807002801120449, + "loss": 0.5138, + "step": 11507 + }, + { + "epoch": 6.4290502793296085, + "grad_norm": 0.42114967107772827, + "learning_rate": 0.0006806722689075631, + "loss": 0.4367, + "step": 11508 + }, + { + "epoch": 6.429608938547486, + "grad_norm": 0.32600682973861694, + "learning_rate": 0.0006806442577030812, + "loss": 0.2956, + "step": 11509 + }, + { + "epoch": 6.430167597765363, + "grad_norm": 0.7442623972892761, + "learning_rate": 0.0006806162464985994, + "loss": 0.6598, + "step": 11510 + }, + { + "epoch": 6.43072625698324, + "grad_norm": 0.6744649410247803, + "learning_rate": 0.0006805882352941176, + "loss": 0.5523, + "step": 11511 + }, + { + "epoch": 6.431284916201117, + "grad_norm": 2.1014201641082764, + "learning_rate": 0.0006805602240896359, + "loss": 0.5378, + "step": 11512 + }, + { + "epoch": 6.431843575418995, + "grad_norm": 0.4607792794704437, + "learning_rate": 0.0006805322128851541, + "loss": 0.5108, + "step": 11513 + }, + { + "epoch": 6.432402234636871, + "grad_norm": 0.4744274616241455, + "learning_rate": 0.0006805042016806722, + "loss": 0.4303, + "step": 11514 + }, + { + "epoch": 6.432960893854749, + "grad_norm": 0.6840748190879822, + "learning_rate": 0.0006804761904761904, + "loss": 0.4678, + "step": 11515 + }, + { + "epoch": 6.4335195530726255, + "grad_norm": 12.445887565612793, + "learning_rate": 0.0006804481792717086, + "loss": 0.5343, + "step": 11516 + }, + { + "epoch": 6.434078212290503, + "grad_norm": 0.5468632578849792, + "learning_rate": 0.000680420168067227, + "loss": 0.4468, + "step": 11517 + }, + { + "epoch": 6.4346368715083795, + "grad_norm": 0.6844728589057922, + "learning_rate": 0.0006803921568627452, + "loss": 0.4387, + "step": 11518 + }, + { + "epoch": 6.435195530726257, + "grad_norm": 0.4937349259853363, + "learning_rate": 0.0006803641456582634, + "loss": 0.4165, + "step": 11519 + }, + { + "epoch": 6.435754189944134, + "grad_norm": 0.5430785417556763, + "learning_rate": 0.0006803361344537815, + "loss": 0.4366, + "step": 11520 + }, + { + "epoch": 6.436312849162011, + "grad_norm": 0.7449158430099487, + "learning_rate": 0.0006803081232492997, + "loss": 0.369, + "step": 11521 + }, + { + "epoch": 6.436871508379888, + "grad_norm": 0.5434979796409607, + "learning_rate": 0.000680280112044818, + "loss": 0.4346, + "step": 11522 + }, + { + "epoch": 6.437430167597765, + "grad_norm": 0.6099750995635986, + "learning_rate": 0.0006802521008403362, + "loss": 0.4457, + "step": 11523 + }, + { + "epoch": 6.437988826815642, + "grad_norm": 5.076451778411865, + "learning_rate": 0.0006802240896358544, + "loss": 0.3957, + "step": 11524 + }, + { + "epoch": 6.43854748603352, + "grad_norm": 0.7869237065315247, + "learning_rate": 0.0006801960784313725, + "loss": 0.4016, + "step": 11525 + }, + { + "epoch": 6.4391061452513965, + "grad_norm": 1.243279218673706, + "learning_rate": 0.0006801680672268907, + "loss": 0.4972, + "step": 11526 + }, + { + "epoch": 6.439664804469274, + "grad_norm": 0.7846372127532959, + "learning_rate": 0.000680140056022409, + "loss": 0.445, + "step": 11527 + }, + { + "epoch": 6.4402234636871505, + "grad_norm": 0.6177942156791687, + "learning_rate": 0.0006801120448179272, + "loss": 0.5236, + "step": 11528 + }, + { + "epoch": 6.440782122905028, + "grad_norm": 0.7969048023223877, + "learning_rate": 0.0006800840336134454, + "loss": 0.4799, + "step": 11529 + }, + { + "epoch": 6.441340782122905, + "grad_norm": 0.5414762496948242, + "learning_rate": 0.0006800560224089635, + "loss": 0.3888, + "step": 11530 + }, + { + "epoch": 6.441899441340782, + "grad_norm": 1.089887261390686, + "learning_rate": 0.0006800280112044817, + "loss": 0.3662, + "step": 11531 + }, + { + "epoch": 6.442458100558659, + "grad_norm": 0.7972704768180847, + "learning_rate": 0.00068, + "loss": 0.5145, + "step": 11532 + }, + { + "epoch": 6.443016759776536, + "grad_norm": 0.6176258325576782, + "learning_rate": 0.0006799719887955183, + "loss": 0.496, + "step": 11533 + }, + { + "epoch": 6.443575418994413, + "grad_norm": 0.6138110160827637, + "learning_rate": 0.0006799439775910365, + "loss": 0.5631, + "step": 11534 + }, + { + "epoch": 6.444134078212291, + "grad_norm": 1.5930955410003662, + "learning_rate": 0.0006799159663865547, + "loss": 0.4199, + "step": 11535 + }, + { + "epoch": 6.4446927374301675, + "grad_norm": 0.443279504776001, + "learning_rate": 0.0006798879551820728, + "loss": 0.524, + "step": 11536 + }, + { + "epoch": 6.445251396648045, + "grad_norm": 0.8224071860313416, + "learning_rate": 0.0006798599439775911, + "loss": 0.4894, + "step": 11537 + }, + { + "epoch": 6.4458100558659215, + "grad_norm": 0.5819796919822693, + "learning_rate": 0.0006798319327731093, + "loss": 0.4802, + "step": 11538 + }, + { + "epoch": 6.446368715083799, + "grad_norm": 0.5683058500289917, + "learning_rate": 0.0006798039215686275, + "loss": 0.4199, + "step": 11539 + }, + { + "epoch": 6.446927374301676, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.0006797759103641457, + "loss": 0.3875, + "step": 11540 + }, + { + "epoch": 6.447486033519553, + "grad_norm": 0.8234904408454895, + "learning_rate": 0.0006797478991596638, + "loss": 0.5202, + "step": 11541 + }, + { + "epoch": 6.44804469273743, + "grad_norm": 0.7311606407165527, + "learning_rate": 0.0006797198879551821, + "loss": 0.5252, + "step": 11542 + }, + { + "epoch": 6.448603351955307, + "grad_norm": 0.6099271178245544, + "learning_rate": 0.0006796918767507003, + "loss": 0.4048, + "step": 11543 + }, + { + "epoch": 6.449162011173184, + "grad_norm": 0.5512201189994812, + "learning_rate": 0.0006796638655462185, + "loss": 0.4183, + "step": 11544 + }, + { + "epoch": 6.449720670391061, + "grad_norm": 0.4703015387058258, + "learning_rate": 0.0006796358543417367, + "loss": 0.432, + "step": 11545 + }, + { + "epoch": 6.4502793296089385, + "grad_norm": 0.4602089524269104, + "learning_rate": 0.0006796078431372548, + "loss": 0.4712, + "step": 11546 + }, + { + "epoch": 6.450837988826816, + "grad_norm": 0.6360549330711365, + "learning_rate": 0.0006795798319327731, + "loss": 0.539, + "step": 11547 + }, + { + "epoch": 6.4513966480446925, + "grad_norm": 0.46027907729148865, + "learning_rate": 0.0006795518207282913, + "loss": 0.3673, + "step": 11548 + }, + { + "epoch": 6.45195530726257, + "grad_norm": 0.5553647875785828, + "learning_rate": 0.0006795238095238096, + "loss": 0.4222, + "step": 11549 + }, + { + "epoch": 6.452513966480447, + "grad_norm": 0.8476529717445374, + "learning_rate": 0.0006794957983193278, + "loss": 0.5464, + "step": 11550 + }, + { + "epoch": 6.453072625698324, + "grad_norm": 0.6125985980033875, + "learning_rate": 0.000679467787114846, + "loss": 0.5105, + "step": 11551 + }, + { + "epoch": 6.453631284916201, + "grad_norm": 0.45297735929489136, + "learning_rate": 0.0006794397759103642, + "loss": 0.485, + "step": 11552 + }, + { + "epoch": 6.454189944134078, + "grad_norm": 0.4742082357406616, + "learning_rate": 0.0006794117647058824, + "loss": 0.4205, + "step": 11553 + }, + { + "epoch": 6.454748603351955, + "grad_norm": 1.5596706867218018, + "learning_rate": 0.0006793837535014006, + "loss": 0.495, + "step": 11554 + }, + { + "epoch": 6.455307262569832, + "grad_norm": 0.7248988151550293, + "learning_rate": 0.0006793557422969188, + "loss": 0.4575, + "step": 11555 + }, + { + "epoch": 6.4558659217877095, + "grad_norm": 0.3988633453845978, + "learning_rate": 0.000679327731092437, + "loss": 0.4069, + "step": 11556 + }, + { + "epoch": 6.456424581005587, + "grad_norm": 0.5369634628295898, + "learning_rate": 0.0006792997198879552, + "loss": 0.6724, + "step": 11557 + }, + { + "epoch": 6.4569832402234635, + "grad_norm": 0.5384875535964966, + "learning_rate": 0.0006792717086834734, + "loss": 0.4388, + "step": 11558 + }, + { + "epoch": 6.457541899441341, + "grad_norm": 1.1868678331375122, + "learning_rate": 0.0006792436974789916, + "loss": 0.4208, + "step": 11559 + }, + { + "epoch": 6.4581005586592175, + "grad_norm": 0.4786730408668518, + "learning_rate": 0.0006792156862745098, + "loss": 0.367, + "step": 11560 + }, + { + "epoch": 6.458659217877095, + "grad_norm": 0.5368600487709045, + "learning_rate": 0.000679187675070028, + "loss": 0.5334, + "step": 11561 + }, + { + "epoch": 6.459217877094972, + "grad_norm": 0.38555872440338135, + "learning_rate": 0.0006791596638655463, + "loss": 0.5225, + "step": 11562 + }, + { + "epoch": 6.459776536312849, + "grad_norm": 0.6935184597969055, + "learning_rate": 0.0006791316526610644, + "loss": 0.6592, + "step": 11563 + }, + { + "epoch": 6.460335195530726, + "grad_norm": 0.6343544125556946, + "learning_rate": 0.0006791036414565826, + "loss": 0.5693, + "step": 11564 + }, + { + "epoch": 6.460893854748603, + "grad_norm": 0.7088921666145325, + "learning_rate": 0.0006790756302521008, + "loss": 0.6159, + "step": 11565 + }, + { + "epoch": 6.4614525139664805, + "grad_norm": 0.4299537241458893, + "learning_rate": 0.000679047619047619, + "loss": 0.4482, + "step": 11566 + }, + { + "epoch": 6.462011173184358, + "grad_norm": 0.6019023060798645, + "learning_rate": 0.0006790196078431374, + "loss": 0.5049, + "step": 11567 + }, + { + "epoch": 6.4625698324022345, + "grad_norm": 0.5985503792762756, + "learning_rate": 0.0006789915966386555, + "loss": 0.4236, + "step": 11568 + }, + { + "epoch": 6.463128491620112, + "grad_norm": 0.7729337811470032, + "learning_rate": 0.0006789635854341737, + "loss": 0.4296, + "step": 11569 + }, + { + "epoch": 6.4636871508379885, + "grad_norm": 0.4970942735671997, + "learning_rate": 0.0006789355742296919, + "loss": 0.4497, + "step": 11570 + }, + { + "epoch": 6.464245810055866, + "grad_norm": 1.7168594598770142, + "learning_rate": 0.0006789075630252101, + "loss": 0.574, + "step": 11571 + }, + { + "epoch": 6.464804469273743, + "grad_norm": 0.4154176414012909, + "learning_rate": 0.0006788795518207284, + "loss": 0.3927, + "step": 11572 + }, + { + "epoch": 6.46536312849162, + "grad_norm": 0.5008817315101624, + "learning_rate": 0.0006788515406162465, + "loss": 0.5798, + "step": 11573 + }, + { + "epoch": 6.465921787709497, + "grad_norm": 0.6479870080947876, + "learning_rate": 0.0006788235294117647, + "loss": 0.5343, + "step": 11574 + }, + { + "epoch": 6.466480446927374, + "grad_norm": 0.5261443257331848, + "learning_rate": 0.0006787955182072829, + "loss": 0.4389, + "step": 11575 + }, + { + "epoch": 6.4670391061452515, + "grad_norm": Infinity, + "learning_rate": 0.0006787955182072829, + "loss": 0.4804, + "step": 11576 + }, + { + "epoch": 6.467597765363129, + "grad_norm": 0.4131874442100525, + "learning_rate": 0.0006787675070028011, + "loss": 0.4144, + "step": 11577 + }, + { + "epoch": 6.4681564245810055, + "grad_norm": 0.5997766256332397, + "learning_rate": 0.0006787394957983194, + "loss": 0.467, + "step": 11578 + }, + { + "epoch": 6.468715083798883, + "grad_norm": 2.000835418701172, + "learning_rate": 0.0006787114845938376, + "loss": 0.3998, + "step": 11579 + }, + { + "epoch": 6.4692737430167595, + "grad_norm": 0.4934137165546417, + "learning_rate": 0.0006786834733893557, + "loss": 0.4218, + "step": 11580 + }, + { + "epoch": 6.469832402234637, + "grad_norm": 0.5290830731391907, + "learning_rate": 0.0006786554621848739, + "loss": 0.3911, + "step": 11581 + }, + { + "epoch": 6.4703910614525135, + "grad_norm": 0.6661310791969299, + "learning_rate": 0.0006786274509803921, + "loss": 0.3786, + "step": 11582 + }, + { + "epoch": 6.470949720670391, + "grad_norm": 0.5292418599128723, + "learning_rate": 0.0006785994397759105, + "loss": 0.4864, + "step": 11583 + }, + { + "epoch": 6.471508379888268, + "grad_norm": 1.2342309951782227, + "learning_rate": 0.0006785714285714287, + "loss": 0.5016, + "step": 11584 + }, + { + "epoch": 6.472067039106145, + "grad_norm": 0.3915422260761261, + "learning_rate": 0.0006785434173669468, + "loss": 0.4954, + "step": 11585 + }, + { + "epoch": 6.4726256983240225, + "grad_norm": 0.48492929339408875, + "learning_rate": 0.000678515406162465, + "loss": 0.4447, + "step": 11586 + }, + { + "epoch": 6.473184357541899, + "grad_norm": 0.33723577857017517, + "learning_rate": 0.0006784873949579832, + "loss": 0.3359, + "step": 11587 + }, + { + "epoch": 6.4737430167597765, + "grad_norm": 0.4832063615322113, + "learning_rate": 0.0006784593837535015, + "loss": 0.3196, + "step": 11588 + }, + { + "epoch": 6.474301675977654, + "grad_norm": 2.238978147506714, + "learning_rate": 0.0006784313725490197, + "loss": 0.4031, + "step": 11589 + }, + { + "epoch": 6.4748603351955305, + "grad_norm": 0.7958325147628784, + "learning_rate": 0.0006784033613445378, + "loss": 0.4567, + "step": 11590 + }, + { + "epoch": 6.475418994413408, + "grad_norm": 4.274355411529541, + "learning_rate": 0.000678375350140056, + "loss": 0.3562, + "step": 11591 + }, + { + "epoch": 6.4759776536312845, + "grad_norm": 0.40913107991218567, + "learning_rate": 0.0006783473389355742, + "loss": 0.4224, + "step": 11592 + }, + { + "epoch": 6.476536312849162, + "grad_norm": 0.905695378780365, + "learning_rate": 0.0006783193277310925, + "loss": 0.5078, + "step": 11593 + }, + { + "epoch": 6.477094972067039, + "grad_norm": 0.5614436864852905, + "learning_rate": 0.0006782913165266107, + "loss": 0.4966, + "step": 11594 + }, + { + "epoch": 6.477653631284916, + "grad_norm": 0.9083330035209656, + "learning_rate": 0.0006782633053221289, + "loss": 0.4057, + "step": 11595 + }, + { + "epoch": 6.4782122905027935, + "grad_norm": 0.9854283332824707, + "learning_rate": 0.000678235294117647, + "loss": 0.4475, + "step": 11596 + }, + { + "epoch": 6.47877094972067, + "grad_norm": 0.8095300197601318, + "learning_rate": 0.0006782072829131652, + "loss": 0.4906, + "step": 11597 + }, + { + "epoch": 6.4793296089385475, + "grad_norm": 0.4725625514984131, + "learning_rate": 0.0006781792717086835, + "loss": 0.4797, + "step": 11598 + }, + { + "epoch": 6.479888268156425, + "grad_norm": 1.2080659866333008, + "learning_rate": 0.0006781512605042018, + "loss": 0.5809, + "step": 11599 + }, + { + "epoch": 6.4804469273743015, + "grad_norm": 0.49512118101119995, + "learning_rate": 0.00067812324929972, + "loss": 0.5115, + "step": 11600 + }, + { + "epoch": 6.481005586592179, + "grad_norm": 0.9185149669647217, + "learning_rate": 0.0006780952380952381, + "loss": 0.3803, + "step": 11601 + }, + { + "epoch": 6.4815642458100555, + "grad_norm": 0.536938488483429, + "learning_rate": 0.0006780672268907563, + "loss": 0.3408, + "step": 11602 + }, + { + "epoch": 6.482122905027933, + "grad_norm": 0.6550922989845276, + "learning_rate": 0.0006780392156862745, + "loss": 0.4763, + "step": 11603 + }, + { + "epoch": 6.48268156424581, + "grad_norm": 0.451121985912323, + "learning_rate": 0.0006780112044817928, + "loss": 0.2965, + "step": 11604 + }, + { + "epoch": 6.483240223463687, + "grad_norm": 0.48272010684013367, + "learning_rate": 0.000677983193277311, + "loss": 0.4601, + "step": 11605 + }, + { + "epoch": 6.4837988826815645, + "grad_norm": 0.4383988380432129, + "learning_rate": 0.0006779551820728291, + "loss": 0.4034, + "step": 11606 + }, + { + "epoch": 6.484357541899441, + "grad_norm": 0.4807858467102051, + "learning_rate": 0.0006779271708683473, + "loss": 0.4006, + "step": 11607 + }, + { + "epoch": 6.4849162011173185, + "grad_norm": 0.8803003430366516, + "learning_rate": 0.0006778991596638655, + "loss": 0.5064, + "step": 11608 + }, + { + "epoch": 6.485474860335196, + "grad_norm": 0.45994246006011963, + "learning_rate": 0.0006778711484593838, + "loss": 0.4319, + "step": 11609 + }, + { + "epoch": 6.4860335195530725, + "grad_norm": 0.8877516388893127, + "learning_rate": 0.000677843137254902, + "loss": 0.3802, + "step": 11610 + }, + { + "epoch": 6.48659217877095, + "grad_norm": 0.8846052289009094, + "learning_rate": 0.0006778151260504202, + "loss": 0.5155, + "step": 11611 + }, + { + "epoch": 6.4871508379888265, + "grad_norm": 0.5974485874176025, + "learning_rate": 0.0006777871148459383, + "loss": 0.4132, + "step": 11612 + }, + { + "epoch": 6.487709497206704, + "grad_norm": 0.5771796703338623, + "learning_rate": 0.0006777591036414565, + "loss": 0.5279, + "step": 11613 + }, + { + "epoch": 6.488268156424581, + "grad_norm": 0.42405661940574646, + "learning_rate": 0.0006777310924369748, + "loss": 0.4475, + "step": 11614 + }, + { + "epoch": 6.488826815642458, + "grad_norm": 0.7989774942398071, + "learning_rate": 0.000677703081232493, + "loss": 0.5182, + "step": 11615 + }, + { + "epoch": 6.4893854748603355, + "grad_norm": 1.0028172731399536, + "learning_rate": 0.0006776750700280113, + "loss": 0.2954, + "step": 11616 + }, + { + "epoch": 6.489944134078212, + "grad_norm": 4.393218994140625, + "learning_rate": 0.0006776470588235294, + "loss": 0.4148, + "step": 11617 + }, + { + "epoch": 6.4905027932960895, + "grad_norm": 0.7309901118278503, + "learning_rate": 0.0006776190476190476, + "loss": 0.4974, + "step": 11618 + }, + { + "epoch": 6.491061452513966, + "grad_norm": 0.426919162273407, + "learning_rate": 0.0006775910364145659, + "loss": 0.4249, + "step": 11619 + }, + { + "epoch": 6.4916201117318435, + "grad_norm": 0.574055552482605, + "learning_rate": 0.0006775630252100841, + "loss": 0.5229, + "step": 11620 + }, + { + "epoch": 6.492178770949721, + "grad_norm": 0.5560129284858704, + "learning_rate": 0.0006775350140056023, + "loss": 0.5134, + "step": 11621 + }, + { + "epoch": 6.4927374301675975, + "grad_norm": 0.6170814037322998, + "learning_rate": 0.0006775070028011204, + "loss": 0.43, + "step": 11622 + }, + { + "epoch": 6.493296089385475, + "grad_norm": 0.5968080163002014, + "learning_rate": 0.0006774789915966386, + "loss": 0.3324, + "step": 11623 + }, + { + "epoch": 6.4938547486033515, + "grad_norm": 0.5597171783447266, + "learning_rate": 0.0006774509803921569, + "loss": 0.3745, + "step": 11624 + }, + { + "epoch": 6.494413407821229, + "grad_norm": 0.4412710666656494, + "learning_rate": 0.0006774229691876751, + "loss": 0.4077, + "step": 11625 + }, + { + "epoch": 6.4949720670391065, + "grad_norm": 0.499252587556839, + "learning_rate": 0.0006773949579831933, + "loss": 0.3957, + "step": 11626 + }, + { + "epoch": 6.495530726256983, + "grad_norm": 0.4536375403404236, + "learning_rate": 0.0006773669467787115, + "loss": 0.4084, + "step": 11627 + }, + { + "epoch": 6.4960893854748605, + "grad_norm": 0.6035411357879639, + "learning_rate": 0.0006773389355742296, + "loss": 0.3971, + "step": 11628 + }, + { + "epoch": 6.496648044692737, + "grad_norm": 0.528416097164154, + "learning_rate": 0.0006773109243697479, + "loss": 0.4148, + "step": 11629 + }, + { + "epoch": 6.4972067039106145, + "grad_norm": 4.354679107666016, + "learning_rate": 0.0006772829131652661, + "loss": 0.4047, + "step": 11630 + }, + { + "epoch": 6.497765363128492, + "grad_norm": 0.732670783996582, + "learning_rate": 0.0006772549019607843, + "loss": 0.3754, + "step": 11631 + }, + { + "epoch": 6.4983240223463685, + "grad_norm": 0.5008953809738159, + "learning_rate": 0.0006772268907563026, + "loss": 0.4057, + "step": 11632 + }, + { + "epoch": 6.498882681564246, + "grad_norm": 2.3818132877349854, + "learning_rate": 0.0006771988795518206, + "loss": 0.4109, + "step": 11633 + }, + { + "epoch": 6.4994413407821225, + "grad_norm": 0.4366646409034729, + "learning_rate": 0.000677170868347339, + "loss": 0.4189, + "step": 11634 + }, + { + "epoch": 6.5, + "grad_norm": 3.6260876655578613, + "learning_rate": 0.0006771428571428572, + "loss": 0.4343, + "step": 11635 + }, + { + "epoch": 6.5005586592178775, + "grad_norm": 0.4655911922454834, + "learning_rate": 0.0006771148459383754, + "loss": 0.3994, + "step": 11636 + }, + { + "epoch": 6.501117318435754, + "grad_norm": 0.44507983326911926, + "learning_rate": 0.0006770868347338936, + "loss": 0.271, + "step": 11637 + }, + { + "epoch": 6.5016759776536315, + "grad_norm": 0.6009631156921387, + "learning_rate": 0.0006770588235294117, + "loss": 0.4648, + "step": 11638 + }, + { + "epoch": 6.502234636871508, + "grad_norm": 0.5486988425254822, + "learning_rate": 0.00067703081232493, + "loss": 0.4593, + "step": 11639 + }, + { + "epoch": 6.5027932960893855, + "grad_norm": 0.46845799684524536, + "learning_rate": 0.0006770028011204482, + "loss": 0.4711, + "step": 11640 + }, + { + "epoch": 6.503351955307263, + "grad_norm": 0.4946540594100952, + "learning_rate": 0.0006769747899159664, + "loss": 0.4122, + "step": 11641 + }, + { + "epoch": 6.5039106145251395, + "grad_norm": 0.6086100339889526, + "learning_rate": 0.0006769467787114846, + "loss": 0.4373, + "step": 11642 + }, + { + "epoch": 6.504469273743017, + "grad_norm": 0.5401648879051208, + "learning_rate": 0.0006769187675070028, + "loss": 0.4221, + "step": 11643 + }, + { + "epoch": 6.5050279329608935, + "grad_norm": 0.44656991958618164, + "learning_rate": 0.000676890756302521, + "loss": 0.426, + "step": 11644 + }, + { + "epoch": 6.505586592178771, + "grad_norm": 1.386893391609192, + "learning_rate": 0.0006768627450980392, + "loss": 0.5002, + "step": 11645 + }, + { + "epoch": 6.506145251396648, + "grad_norm": 0.3629169464111328, + "learning_rate": 0.0006768347338935574, + "loss": 0.5152, + "step": 11646 + }, + { + "epoch": 6.506703910614525, + "grad_norm": 0.5242171287536621, + "learning_rate": 0.0006768067226890756, + "loss": 0.468, + "step": 11647 + }, + { + "epoch": 6.5072625698324025, + "grad_norm": 0.7179381847381592, + "learning_rate": 0.0006767787114845938, + "loss": 0.4532, + "step": 11648 + }, + { + "epoch": 6.507821229050279, + "grad_norm": 0.7212526202201843, + "learning_rate": 0.000676750700280112, + "loss": 0.4488, + "step": 11649 + }, + { + "epoch": 6.5083798882681565, + "grad_norm": 1.554672360420227, + "learning_rate": 0.0006767226890756303, + "loss": 0.4312, + "step": 11650 + }, + { + "epoch": 6.508938547486034, + "grad_norm": 0.45662686228752136, + "learning_rate": 0.0006766946778711485, + "loss": 0.5353, + "step": 11651 + }, + { + "epoch": 6.5094972067039105, + "grad_norm": 0.5225405693054199, + "learning_rate": 0.0006766666666666667, + "loss": 0.5689, + "step": 11652 + }, + { + "epoch": 6.510055865921788, + "grad_norm": 0.6175033450126648, + "learning_rate": 0.0006766386554621849, + "loss": 0.3839, + "step": 11653 + }, + { + "epoch": 6.5106145251396645, + "grad_norm": 2.2123074531555176, + "learning_rate": 0.0006766106442577031, + "loss": 0.4167, + "step": 11654 + }, + { + "epoch": 6.511173184357542, + "grad_norm": 0.5899970531463623, + "learning_rate": 0.0006765826330532213, + "loss": 0.3908, + "step": 11655 + }, + { + "epoch": 6.511731843575419, + "grad_norm": 0.5495463013648987, + "learning_rate": 0.0006765546218487395, + "loss": 0.491, + "step": 11656 + }, + { + "epoch": 6.512290502793296, + "grad_norm": 0.7404274940490723, + "learning_rate": 0.0006765266106442577, + "loss": 0.6168, + "step": 11657 + }, + { + "epoch": 6.5128491620111735, + "grad_norm": 0.5841642618179321, + "learning_rate": 0.0006764985994397759, + "loss": 0.4724, + "step": 11658 + }, + { + "epoch": 6.51340782122905, + "grad_norm": 0.47262054681777954, + "learning_rate": 0.0006764705882352942, + "loss": 0.4655, + "step": 11659 + }, + { + "epoch": 6.5139664804469275, + "grad_norm": 1.0039445161819458, + "learning_rate": 0.0006764425770308123, + "loss": 0.5438, + "step": 11660 + }, + { + "epoch": 6.514525139664805, + "grad_norm": 9.346894264221191, + "learning_rate": 0.0006764145658263305, + "loss": 0.4436, + "step": 11661 + }, + { + "epoch": 6.5150837988826815, + "grad_norm": 0.5571748614311218, + "learning_rate": 0.0006763865546218487, + "loss": 0.4725, + "step": 11662 + }, + { + "epoch": 6.515642458100559, + "grad_norm": 0.5540754795074463, + "learning_rate": 0.0006763585434173669, + "loss": 0.5995, + "step": 11663 + }, + { + "epoch": 6.5162011173184355, + "grad_norm": 0.6166994571685791, + "learning_rate": 0.0006763305322128853, + "loss": 0.4852, + "step": 11664 + }, + { + "epoch": 6.516759776536313, + "grad_norm": 0.4597347676753998, + "learning_rate": 0.0006763025210084033, + "loss": 0.3297, + "step": 11665 + }, + { + "epoch": 6.51731843575419, + "grad_norm": 0.7996395826339722, + "learning_rate": 0.0006762745098039216, + "loss": 0.6306, + "step": 11666 + }, + { + "epoch": 6.517877094972067, + "grad_norm": 1.1374506950378418, + "learning_rate": 0.0006762464985994398, + "loss": 0.4721, + "step": 11667 + }, + { + "epoch": 6.5184357541899445, + "grad_norm": 0.6028649210929871, + "learning_rate": 0.000676218487394958, + "loss": 0.7393, + "step": 11668 + }, + { + "epoch": 6.518994413407821, + "grad_norm": 0.5631989240646362, + "learning_rate": 0.0006761904761904763, + "loss": 0.3864, + "step": 11669 + }, + { + "epoch": 6.5195530726256985, + "grad_norm": 1.0149052143096924, + "learning_rate": 0.0006761624649859944, + "loss": 0.4789, + "step": 11670 + }, + { + "epoch": 6.520111731843575, + "grad_norm": 0.5269141793251038, + "learning_rate": 0.0006761344537815126, + "loss": 0.4225, + "step": 11671 + }, + { + "epoch": 6.5206703910614525, + "grad_norm": 0.6035739779472351, + "learning_rate": 0.0006761064425770308, + "loss": 0.4988, + "step": 11672 + }, + { + "epoch": 6.52122905027933, + "grad_norm": 0.5897813439369202, + "learning_rate": 0.000676078431372549, + "loss": 0.4789, + "step": 11673 + }, + { + "epoch": 6.5217877094972065, + "grad_norm": 1.0650050640106201, + "learning_rate": 0.0006760504201680673, + "loss": 0.4742, + "step": 11674 + }, + { + "epoch": 6.522346368715084, + "grad_norm": 1.0785117149353027, + "learning_rate": 0.0006760224089635855, + "loss": 0.4457, + "step": 11675 + }, + { + "epoch": 6.522905027932961, + "grad_norm": 0.5334888100624084, + "learning_rate": 0.0006759943977591036, + "loss": 0.3945, + "step": 11676 + }, + { + "epoch": 6.523463687150838, + "grad_norm": 0.6811595559120178, + "learning_rate": 0.0006759663865546218, + "loss": 0.4326, + "step": 11677 + }, + { + "epoch": 6.5240223463687155, + "grad_norm": 0.4486103355884552, + "learning_rate": 0.00067593837535014, + "loss": 0.4587, + "step": 11678 + }, + { + "epoch": 6.524581005586592, + "grad_norm": 0.594249963760376, + "learning_rate": 0.0006759103641456583, + "loss": 0.3933, + "step": 11679 + }, + { + "epoch": 6.5251396648044695, + "grad_norm": 0.7737732529640198, + "learning_rate": 0.0006758823529411765, + "loss": 0.3668, + "step": 11680 + }, + { + "epoch": 6.525698324022346, + "grad_norm": 0.4339161813259125, + "learning_rate": 0.0006758543417366946, + "loss": 0.3771, + "step": 11681 + }, + { + "epoch": 6.5262569832402235, + "grad_norm": 0.3916019797325134, + "learning_rate": 0.0006758263305322129, + "loss": 0.344, + "step": 11682 + }, + { + "epoch": 6.5268156424581, + "grad_norm": 0.6638060808181763, + "learning_rate": 0.0006757983193277311, + "loss": 0.6867, + "step": 11683 + }, + { + "epoch": 6.5273743016759775, + "grad_norm": 0.7278107404708862, + "learning_rate": 0.0006757703081232494, + "loss": 0.4547, + "step": 11684 + }, + { + "epoch": 6.527932960893855, + "grad_norm": 0.5387098789215088, + "learning_rate": 0.0006757422969187676, + "loss": 0.4331, + "step": 11685 + }, + { + "epoch": 6.528491620111732, + "grad_norm": 0.46186915040016174, + "learning_rate": 0.0006757142857142857, + "loss": 0.4794, + "step": 11686 + }, + { + "epoch": 6.529050279329609, + "grad_norm": 3.535621166229248, + "learning_rate": 0.0006756862745098039, + "loss": 0.4159, + "step": 11687 + }, + { + "epoch": 6.5296089385474865, + "grad_norm": 1.0551750659942627, + "learning_rate": 0.0006756582633053221, + "loss": 0.4948, + "step": 11688 + }, + { + "epoch": 6.530167597765363, + "grad_norm": 10.206016540527344, + "learning_rate": 0.0006756302521008404, + "loss": 0.3857, + "step": 11689 + }, + { + "epoch": 6.5307262569832405, + "grad_norm": 5.077488422393799, + "learning_rate": 0.0006756022408963586, + "loss": 0.455, + "step": 11690 + }, + { + "epoch": 6.531284916201117, + "grad_norm": 0.6153773665428162, + "learning_rate": 0.0006755742296918768, + "loss": 0.4023, + "step": 11691 + }, + { + "epoch": 6.5318435754189945, + "grad_norm": 0.7012595534324646, + "learning_rate": 0.0006755462184873949, + "loss": 0.6857, + "step": 11692 + }, + { + "epoch": 6.532402234636871, + "grad_norm": 0.5747553110122681, + "learning_rate": 0.0006755182072829131, + "loss": 0.4673, + "step": 11693 + }, + { + "epoch": 6.5329608938547485, + "grad_norm": 0.7796198725700378, + "learning_rate": 0.0006754901960784314, + "loss": 0.5701, + "step": 11694 + }, + { + "epoch": 6.533519553072626, + "grad_norm": 0.389577716588974, + "learning_rate": 0.0006754621848739496, + "loss": 0.3423, + "step": 11695 + }, + { + "epoch": 6.534078212290503, + "grad_norm": 0.46891096234321594, + "learning_rate": 0.0006754341736694678, + "loss": 0.3994, + "step": 11696 + }, + { + "epoch": 6.53463687150838, + "grad_norm": 0.4067317843437195, + "learning_rate": 0.0006754061624649859, + "loss": 0.4095, + "step": 11697 + }, + { + "epoch": 6.5351955307262575, + "grad_norm": 0.5995504260063171, + "learning_rate": 0.0006753781512605041, + "loss": 0.4651, + "step": 11698 + }, + { + "epoch": 6.535754189944134, + "grad_norm": 0.5230239033699036, + "learning_rate": 0.0006753501400560225, + "loss": 0.3816, + "step": 11699 + }, + { + "epoch": 6.5363128491620115, + "grad_norm": 0.8123779296875, + "learning_rate": 0.0006753221288515407, + "loss": 0.4034, + "step": 11700 + }, + { + "epoch": 6.536871508379888, + "grad_norm": 0.6425394415855408, + "learning_rate": 0.0006752941176470589, + "loss": 0.489, + "step": 11701 + }, + { + "epoch": 6.5374301675977655, + "grad_norm": 0.37751275300979614, + "learning_rate": 0.000675266106442577, + "loss": 0.4423, + "step": 11702 + }, + { + "epoch": 6.537988826815642, + "grad_norm": 0.5902776718139648, + "learning_rate": 0.0006752380952380952, + "loss": 0.4689, + "step": 11703 + }, + { + "epoch": 6.5385474860335195, + "grad_norm": 0.5996629595756531, + "learning_rate": 0.0006752100840336135, + "loss": 0.6342, + "step": 11704 + }, + { + "epoch": 6.539106145251397, + "grad_norm": 0.9844701290130615, + "learning_rate": 0.0006751820728291317, + "loss": 0.3726, + "step": 11705 + }, + { + "epoch": 6.539664804469274, + "grad_norm": 0.4572041928768158, + "learning_rate": 0.0006751540616246499, + "loss": 0.4269, + "step": 11706 + }, + { + "epoch": 6.540223463687151, + "grad_norm": 0.36848145723342896, + "learning_rate": 0.0006751260504201681, + "loss": 0.3772, + "step": 11707 + }, + { + "epoch": 6.540782122905028, + "grad_norm": 3.8569045066833496, + "learning_rate": 0.0006750980392156862, + "loss": 0.6048, + "step": 11708 + }, + { + "epoch": 6.541340782122905, + "grad_norm": 0.4871978461742401, + "learning_rate": 0.0006750700280112045, + "loss": 0.388, + "step": 11709 + }, + { + "epoch": 6.5418994413407825, + "grad_norm": 0.49558138847351074, + "learning_rate": 0.0006750420168067227, + "loss": 0.4798, + "step": 11710 + }, + { + "epoch": 6.542458100558659, + "grad_norm": 1.104504108428955, + "learning_rate": 0.0006750140056022409, + "loss": 0.3837, + "step": 11711 + }, + { + "epoch": 6.5430167597765365, + "grad_norm": 0.44812867045402527, + "learning_rate": 0.0006749859943977591, + "loss": 0.4445, + "step": 11712 + }, + { + "epoch": 6.543575418994413, + "grad_norm": 1.1642338037490845, + "learning_rate": 0.0006749579831932772, + "loss": 0.4395, + "step": 11713 + }, + { + "epoch": 6.5441340782122905, + "grad_norm": 0.5086650252342224, + "learning_rate": 0.0006749299719887956, + "loss": 0.4913, + "step": 11714 + }, + { + "epoch": 6.544692737430168, + "grad_norm": 0.5418620705604553, + "learning_rate": 0.0006749019607843138, + "loss": 0.4173, + "step": 11715 + }, + { + "epoch": 6.545251396648045, + "grad_norm": 0.4850539267063141, + "learning_rate": 0.000674873949579832, + "loss": 0.4777, + "step": 11716 + }, + { + "epoch": 6.545810055865922, + "grad_norm": 0.5534653663635254, + "learning_rate": 0.0006748459383753502, + "loss": 0.5654, + "step": 11717 + }, + { + "epoch": 6.546368715083799, + "grad_norm": 1.098381757736206, + "learning_rate": 0.0006748179271708683, + "loss": 0.3952, + "step": 11718 + }, + { + "epoch": 6.546927374301676, + "grad_norm": 0.7109861373901367, + "learning_rate": 0.0006747899159663866, + "loss": 0.467, + "step": 11719 + }, + { + "epoch": 6.547486033519553, + "grad_norm": 0.4619545042514801, + "learning_rate": 0.0006747619047619048, + "loss": 0.4914, + "step": 11720 + }, + { + "epoch": 6.54804469273743, + "grad_norm": 0.6146419644355774, + "learning_rate": 0.000674733893557423, + "loss": 0.5054, + "step": 11721 + }, + { + "epoch": 6.5486033519553075, + "grad_norm": 2.3284287452697754, + "learning_rate": 0.0006747058823529412, + "loss": 0.4157, + "step": 11722 + }, + { + "epoch": 6.549162011173184, + "grad_norm": 0.6265036463737488, + "learning_rate": 0.0006746778711484594, + "loss": 0.4746, + "step": 11723 + }, + { + "epoch": 6.5497206703910615, + "grad_norm": 1.0419626235961914, + "learning_rate": 0.0006746498599439776, + "loss": 0.3497, + "step": 11724 + }, + { + "epoch": 6.550279329608939, + "grad_norm": 0.6008273363113403, + "learning_rate": 0.0006746218487394958, + "loss": 0.4081, + "step": 11725 + }, + { + "epoch": 6.550837988826816, + "grad_norm": 0.448623925447464, + "learning_rate": 0.000674593837535014, + "loss": 0.3503, + "step": 11726 + }, + { + "epoch": 6.551396648044693, + "grad_norm": 0.4313521087169647, + "learning_rate": 0.0006745658263305322, + "loss": 0.3847, + "step": 11727 + }, + { + "epoch": 6.55195530726257, + "grad_norm": 0.44949379563331604, + "learning_rate": 0.0006745378151260504, + "loss": 0.4124, + "step": 11728 + }, + { + "epoch": 6.552513966480447, + "grad_norm": 0.5030506253242493, + "learning_rate": 0.0006745098039215686, + "loss": 0.415, + "step": 11729 + }, + { + "epoch": 6.553072625698324, + "grad_norm": 0.48275724053382874, + "learning_rate": 0.0006744817927170868, + "loss": 0.5779, + "step": 11730 + }, + { + "epoch": 6.553631284916201, + "grad_norm": 0.45968884229660034, + "learning_rate": 0.000674453781512605, + "loss": 0.4339, + "step": 11731 + }, + { + "epoch": 6.5541899441340785, + "grad_norm": 0.4956890046596527, + "learning_rate": 0.0006744257703081233, + "loss": 0.4526, + "step": 11732 + }, + { + "epoch": 6.554748603351955, + "grad_norm": 0.3742595911026001, + "learning_rate": 0.0006743977591036415, + "loss": 0.4289, + "step": 11733 + }, + { + "epoch": 6.5553072625698325, + "grad_norm": 0.39774876832962036, + "learning_rate": 0.0006743697478991597, + "loss": 0.4237, + "step": 11734 + }, + { + "epoch": 6.55586592178771, + "grad_norm": 0.3887314200401306, + "learning_rate": 0.0006743417366946779, + "loss": 0.4092, + "step": 11735 + }, + { + "epoch": 6.556424581005587, + "grad_norm": 0.5866477489471436, + "learning_rate": 0.0006743137254901961, + "loss": 0.4723, + "step": 11736 + }, + { + "epoch": 6.556983240223464, + "grad_norm": 0.38468068838119507, + "learning_rate": 0.0006742857142857143, + "loss": 0.3693, + "step": 11737 + }, + { + "epoch": 6.557541899441341, + "grad_norm": 0.37022414803504944, + "learning_rate": 0.0006742577030812325, + "loss": 0.4326, + "step": 11738 + }, + { + "epoch": 6.558100558659218, + "grad_norm": 1.8264210224151611, + "learning_rate": 0.0006742296918767508, + "loss": 0.5193, + "step": 11739 + }, + { + "epoch": 6.558659217877095, + "grad_norm": 0.5144490003585815, + "learning_rate": 0.0006742016806722689, + "loss": 0.4028, + "step": 11740 + }, + { + "epoch": 6.559217877094972, + "grad_norm": 0.42541375756263733, + "learning_rate": 0.0006741736694677871, + "loss": 0.4463, + "step": 11741 + }, + { + "epoch": 6.5597765363128495, + "grad_norm": 0.5716822147369385, + "learning_rate": 0.0006741456582633053, + "loss": 0.5107, + "step": 11742 + }, + { + "epoch": 6.560335195530726, + "grad_norm": 0.5076964497566223, + "learning_rate": 0.0006741176470588235, + "loss": 0.3544, + "step": 11743 + }, + { + "epoch": 6.5608938547486035, + "grad_norm": 1.107096791267395, + "learning_rate": 0.0006740896358543418, + "loss": 0.5096, + "step": 11744 + }, + { + "epoch": 6.56145251396648, + "grad_norm": 0.4176803231239319, + "learning_rate": 0.0006740616246498599, + "loss": 0.3936, + "step": 11745 + }, + { + "epoch": 6.562011173184358, + "grad_norm": 0.44088423252105713, + "learning_rate": 0.0006740336134453781, + "loss": 0.3798, + "step": 11746 + }, + { + "epoch": 6.562569832402235, + "grad_norm": 1.4666974544525146, + "learning_rate": 0.0006740056022408963, + "loss": 0.5046, + "step": 11747 + }, + { + "epoch": 6.563128491620112, + "grad_norm": 5.996772766113281, + "learning_rate": 0.0006739775910364146, + "loss": 0.4642, + "step": 11748 + }, + { + "epoch": 6.563687150837989, + "grad_norm": 0.3904889225959778, + "learning_rate": 0.0006739495798319329, + "loss": 0.3806, + "step": 11749 + }, + { + "epoch": 6.564245810055866, + "grad_norm": 3.2268543243408203, + "learning_rate": 0.000673921568627451, + "loss": 0.3809, + "step": 11750 + }, + { + "epoch": 6.564804469273743, + "grad_norm": 0.6965512037277222, + "learning_rate": 0.0006738935574229692, + "loss": 0.3569, + "step": 11751 + }, + { + "epoch": 6.5653631284916205, + "grad_norm": 0.8824526071548462, + "learning_rate": 0.0006738655462184874, + "loss": 0.5219, + "step": 11752 + }, + { + "epoch": 6.565921787709497, + "grad_norm": 0.6158630847930908, + "learning_rate": 0.0006738375350140056, + "loss": 0.4949, + "step": 11753 + }, + { + "epoch": 6.5664804469273745, + "grad_norm": 0.392378568649292, + "learning_rate": 0.0006738095238095239, + "loss": 0.3244, + "step": 11754 + }, + { + "epoch": 6.567039106145251, + "grad_norm": 0.6249035000801086, + "learning_rate": 0.0006737815126050421, + "loss": 0.5117, + "step": 11755 + }, + { + "epoch": 6.567597765363129, + "grad_norm": 0.47027185559272766, + "learning_rate": 0.0006737535014005602, + "loss": 0.4036, + "step": 11756 + }, + { + "epoch": 6.568156424581005, + "grad_norm": 0.5993382334709167, + "learning_rate": 0.0006737254901960784, + "loss": 0.4145, + "step": 11757 + }, + { + "epoch": 6.568715083798883, + "grad_norm": 0.46324798464775085, + "learning_rate": 0.0006736974789915966, + "loss": 0.4604, + "step": 11758 + }, + { + "epoch": 6.56927374301676, + "grad_norm": 0.6123028993606567, + "learning_rate": 0.0006736694677871149, + "loss": 0.4716, + "step": 11759 + }, + { + "epoch": 6.569832402234637, + "grad_norm": 0.5876016020774841, + "learning_rate": 0.0006736414565826331, + "loss": 0.4727, + "step": 11760 + }, + { + "epoch": 6.570391061452514, + "grad_norm": 0.590881884098053, + "learning_rate": 0.0006736134453781512, + "loss": 0.4719, + "step": 11761 + }, + { + "epoch": 6.5709497206703915, + "grad_norm": 0.6244599223136902, + "learning_rate": 0.0006735854341736694, + "loss": 0.4553, + "step": 11762 + }, + { + "epoch": 6.571508379888268, + "grad_norm": 0.7276081442832947, + "learning_rate": 0.0006735574229691876, + "loss": 0.5725, + "step": 11763 + }, + { + "epoch": 6.5720670391061455, + "grad_norm": 0.5244298577308655, + "learning_rate": 0.000673529411764706, + "loss": 0.3908, + "step": 11764 + }, + { + "epoch": 6.572625698324022, + "grad_norm": 0.40962591767311096, + "learning_rate": 0.0006735014005602242, + "loss": 0.4009, + "step": 11765 + }, + { + "epoch": 6.5731843575419, + "grad_norm": 0.47400426864624023, + "learning_rate": 0.0006734733893557423, + "loss": 0.4106, + "step": 11766 + }, + { + "epoch": 6.573743016759776, + "grad_norm": 0.6502686738967896, + "learning_rate": 0.0006734453781512605, + "loss": 0.4072, + "step": 11767 + }, + { + "epoch": 6.574301675977654, + "grad_norm": 0.6568413972854614, + "learning_rate": 0.0006734173669467787, + "loss": 0.4041, + "step": 11768 + }, + { + "epoch": 6.574860335195531, + "grad_norm": 0.38906148076057434, + "learning_rate": 0.000673389355742297, + "loss": 0.3412, + "step": 11769 + }, + { + "epoch": 6.575418994413408, + "grad_norm": 0.755928635597229, + "learning_rate": 0.0006733613445378152, + "loss": 0.4001, + "step": 11770 + }, + { + "epoch": 6.575977653631285, + "grad_norm": 0.608433723449707, + "learning_rate": 0.0006733333333333334, + "loss": 0.442, + "step": 11771 + }, + { + "epoch": 6.576536312849162, + "grad_norm": 0.5073427557945251, + "learning_rate": 0.0006733053221288515, + "loss": 0.3972, + "step": 11772 + }, + { + "epoch": 6.577094972067039, + "grad_norm": 0.8906851410865784, + "learning_rate": 0.0006732773109243697, + "loss": 0.5405, + "step": 11773 + }, + { + "epoch": 6.5776536312849165, + "grad_norm": 0.8808069229125977, + "learning_rate": 0.000673249299719888, + "loss": 0.389, + "step": 11774 + }, + { + "epoch": 6.578212290502793, + "grad_norm": 0.37472110986709595, + "learning_rate": 0.0006732212885154062, + "loss": 0.331, + "step": 11775 + }, + { + "epoch": 6.578770949720671, + "grad_norm": 0.949889600276947, + "learning_rate": 0.0006731932773109244, + "loss": 0.475, + "step": 11776 + }, + { + "epoch": 6.579329608938547, + "grad_norm": 0.5708540081977844, + "learning_rate": 0.0006731652661064425, + "loss": 0.4685, + "step": 11777 + }, + { + "epoch": 6.579888268156425, + "grad_norm": 1.2496596574783325, + "learning_rate": 0.0006731372549019607, + "loss": 0.5369, + "step": 11778 + }, + { + "epoch": 6.580446927374302, + "grad_norm": 0.41932421922683716, + "learning_rate": 0.000673109243697479, + "loss": 0.4791, + "step": 11779 + }, + { + "epoch": 6.581005586592179, + "grad_norm": 1.2046666145324707, + "learning_rate": 0.0006730812324929973, + "loss": 0.6789, + "step": 11780 + }, + { + "epoch": 6.581564245810056, + "grad_norm": 0.5528629422187805, + "learning_rate": 0.0006730532212885155, + "loss": 0.4585, + "step": 11781 + }, + { + "epoch": 6.582122905027933, + "grad_norm": 0.635690450668335, + "learning_rate": 0.0006730252100840336, + "loss": 0.4981, + "step": 11782 + }, + { + "epoch": 6.58268156424581, + "grad_norm": 0.36201101541519165, + "learning_rate": 0.0006729971988795518, + "loss": 0.3224, + "step": 11783 + }, + { + "epoch": 6.5832402234636875, + "grad_norm": 0.6385451555252075, + "learning_rate": 0.0006729691876750701, + "loss": 0.3413, + "step": 11784 + }, + { + "epoch": 6.583798882681564, + "grad_norm": 2.327892541885376, + "learning_rate": 0.0006729411764705883, + "loss": 0.4261, + "step": 11785 + }, + { + "epoch": 6.584357541899442, + "grad_norm": 0.8808413147926331, + "learning_rate": 0.0006729131652661065, + "loss": 0.4425, + "step": 11786 + }, + { + "epoch": 6.584916201117318, + "grad_norm": 0.5280790328979492, + "learning_rate": 0.0006728851540616247, + "loss": 0.4275, + "step": 11787 + }, + { + "epoch": 6.585474860335196, + "grad_norm": 0.9772935509681702, + "learning_rate": 0.0006728571428571428, + "loss": 0.475, + "step": 11788 + }, + { + "epoch": 6.586033519553073, + "grad_norm": 0.6907405853271484, + "learning_rate": 0.0006728291316526611, + "loss": 0.5107, + "step": 11789 + }, + { + "epoch": 6.58659217877095, + "grad_norm": 0.40243226289749146, + "learning_rate": 0.0006728011204481793, + "loss": 0.45, + "step": 11790 + }, + { + "epoch": 6.587150837988827, + "grad_norm": 0.40675678849220276, + "learning_rate": 0.0006727731092436975, + "loss": 0.3203, + "step": 11791 + }, + { + "epoch": 6.587709497206704, + "grad_norm": 0.7469590306282043, + "learning_rate": 0.0006727450980392157, + "loss": 0.4546, + "step": 11792 + }, + { + "epoch": 6.588268156424581, + "grad_norm": 0.48736336827278137, + "learning_rate": 0.0006727170868347338, + "loss": 0.5293, + "step": 11793 + }, + { + "epoch": 6.588826815642458, + "grad_norm": 0.5058596730232239, + "learning_rate": 0.0006726890756302521, + "loss": 0.3562, + "step": 11794 + }, + { + "epoch": 6.589385474860335, + "grad_norm": 0.8745835423469543, + "learning_rate": 0.0006726610644257703, + "loss": 0.526, + "step": 11795 + }, + { + "epoch": 6.589944134078213, + "grad_norm": 0.43587151169776917, + "learning_rate": 0.0006726330532212886, + "loss": 0.4278, + "step": 11796 + }, + { + "epoch": 6.590502793296089, + "grad_norm": 0.4505431354045868, + "learning_rate": 0.0006726050420168068, + "loss": 0.4032, + "step": 11797 + }, + { + "epoch": 6.591061452513967, + "grad_norm": 0.7565644979476929, + "learning_rate": 0.0006725770308123249, + "loss": 0.4498, + "step": 11798 + }, + { + "epoch": 6.591620111731844, + "grad_norm": 0.45958027243614197, + "learning_rate": 0.0006725490196078432, + "loss": 0.4441, + "step": 11799 + }, + { + "epoch": 6.592178770949721, + "grad_norm": 0.4430047869682312, + "learning_rate": 0.0006725210084033614, + "loss": 0.4691, + "step": 11800 + }, + { + "epoch": 6.592737430167598, + "grad_norm": 0.6521354913711548, + "learning_rate": 0.0006724929971988796, + "loss": 0.4246, + "step": 11801 + }, + { + "epoch": 6.593296089385475, + "grad_norm": 0.828670084476471, + "learning_rate": 0.0006724649859943978, + "loss": 0.4228, + "step": 11802 + }, + { + "epoch": 6.593854748603352, + "grad_norm": 0.737628698348999, + "learning_rate": 0.000672436974789916, + "loss": 0.3494, + "step": 11803 + }, + { + "epoch": 6.594413407821229, + "grad_norm": 1.446258783340454, + "learning_rate": 0.0006724089635854342, + "loss": 0.4789, + "step": 11804 + }, + { + "epoch": 6.594972067039106, + "grad_norm": 0.5946828126907349, + "learning_rate": 0.0006723809523809524, + "loss": 0.4658, + "step": 11805 + }, + { + "epoch": 6.5955307262569836, + "grad_norm": 0.5721814632415771, + "learning_rate": 0.0006723529411764706, + "loss": 0.4715, + "step": 11806 + }, + { + "epoch": 6.59608938547486, + "grad_norm": 0.5953073501586914, + "learning_rate": 0.0006723249299719888, + "loss": 0.4247, + "step": 11807 + }, + { + "epoch": 6.596648044692738, + "grad_norm": 3.5989556312561035, + "learning_rate": 0.000672296918767507, + "loss": 0.4803, + "step": 11808 + }, + { + "epoch": 6.597206703910614, + "grad_norm": 0.4964918792247772, + "learning_rate": 0.0006722689075630252, + "loss": 0.3813, + "step": 11809 + }, + { + "epoch": 6.597765363128492, + "grad_norm": 0.4691878855228424, + "learning_rate": 0.0006722408963585434, + "loss": 0.4632, + "step": 11810 + }, + { + "epoch": 6.598324022346369, + "grad_norm": 0.9500154256820679, + "learning_rate": 0.0006722128851540616, + "loss": 0.4424, + "step": 11811 + }, + { + "epoch": 6.598882681564246, + "grad_norm": 0.501692533493042, + "learning_rate": 0.0006721848739495798, + "loss": 0.3665, + "step": 11812 + }, + { + "epoch": 6.599441340782123, + "grad_norm": 1.8537033796310425, + "learning_rate": 0.000672156862745098, + "loss": 0.4284, + "step": 11813 + }, + { + "epoch": 6.6, + "grad_norm": 0.5142460465431213, + "learning_rate": 0.0006721288515406164, + "loss": 0.6066, + "step": 11814 + }, + { + "epoch": 6.600558659217877, + "grad_norm": 0.4929690957069397, + "learning_rate": 0.0006721008403361345, + "loss": 0.4803, + "step": 11815 + }, + { + "epoch": 6.6011173184357546, + "grad_norm": 1.3422304391860962, + "learning_rate": 0.0006720728291316527, + "loss": 0.4165, + "step": 11816 + }, + { + "epoch": 6.601675977653631, + "grad_norm": 0.6303967833518982, + "learning_rate": 0.0006720448179271709, + "loss": 0.4945, + "step": 11817 + }, + { + "epoch": 6.602234636871509, + "grad_norm": 0.586501955986023, + "learning_rate": 0.0006720168067226891, + "loss": 0.3817, + "step": 11818 + }, + { + "epoch": 6.602793296089385, + "grad_norm": 0.49638524651527405, + "learning_rate": 0.0006719887955182074, + "loss": 0.4508, + "step": 11819 + }, + { + "epoch": 6.603351955307263, + "grad_norm": 0.5122184753417969, + "learning_rate": 0.0006719607843137255, + "loss": 0.4056, + "step": 11820 + }, + { + "epoch": 6.603910614525139, + "grad_norm": 1.6933468580245972, + "learning_rate": 0.0006719327731092437, + "loss": 0.375, + "step": 11821 + }, + { + "epoch": 6.604469273743017, + "grad_norm": 0.598682701587677, + "learning_rate": 0.0006719047619047619, + "loss": 0.4776, + "step": 11822 + }, + { + "epoch": 6.605027932960894, + "grad_norm": 0.5985621809959412, + "learning_rate": 0.0006718767507002801, + "loss": 0.5135, + "step": 11823 + }, + { + "epoch": 6.605586592178771, + "grad_norm": 0.5397905707359314, + "learning_rate": 0.0006718487394957983, + "loss": 0.4153, + "step": 11824 + }, + { + "epoch": 6.606145251396648, + "grad_norm": 0.43009278178215027, + "learning_rate": 0.0006718207282913165, + "loss": 0.3868, + "step": 11825 + }, + { + "epoch": 6.6067039106145256, + "grad_norm": 0.6789054870605469, + "learning_rate": 0.0006717927170868347, + "loss": 0.3637, + "step": 11826 + }, + { + "epoch": 6.607262569832402, + "grad_norm": 0.7343611121177673, + "learning_rate": 0.0006717647058823529, + "loss": 0.4905, + "step": 11827 + }, + { + "epoch": 6.60782122905028, + "grad_norm": 0.9991815090179443, + "learning_rate": 0.0006717366946778711, + "loss": 0.453, + "step": 11828 + }, + { + "epoch": 6.608379888268156, + "grad_norm": 0.45997855067253113, + "learning_rate": 0.0006717086834733893, + "loss": 0.396, + "step": 11829 + }, + { + "epoch": 6.608938547486034, + "grad_norm": 0.5412790179252625, + "learning_rate": 0.0006716806722689077, + "loss": 0.3623, + "step": 11830 + }, + { + "epoch": 6.60949720670391, + "grad_norm": 0.6508433818817139, + "learning_rate": 0.0006716526610644258, + "loss": 0.4101, + "step": 11831 + }, + { + "epoch": 6.610055865921788, + "grad_norm": 0.646654486656189, + "learning_rate": 0.000671624649859944, + "loss": 0.5739, + "step": 11832 + }, + { + "epoch": 6.610614525139665, + "grad_norm": 0.47801244258880615, + "learning_rate": 0.0006715966386554622, + "loss": 0.4186, + "step": 11833 + }, + { + "epoch": 6.611173184357542, + "grad_norm": 0.4593677818775177, + "learning_rate": 0.0006715686274509804, + "loss": 0.3877, + "step": 11834 + }, + { + "epoch": 6.611731843575419, + "grad_norm": 0.7329741716384888, + "learning_rate": 0.0006715406162464987, + "loss": 0.502, + "step": 11835 + }, + { + "epoch": 6.6122905027932966, + "grad_norm": 0.46324464678764343, + "learning_rate": 0.0006715126050420168, + "loss": 0.3572, + "step": 11836 + }, + { + "epoch": 6.612849162011173, + "grad_norm": 1.8622710704803467, + "learning_rate": 0.000671484593837535, + "loss": 0.4531, + "step": 11837 + }, + { + "epoch": 6.613407821229051, + "grad_norm": 0.6932263374328613, + "learning_rate": 0.0006714565826330532, + "loss": 0.3735, + "step": 11838 + }, + { + "epoch": 6.613966480446927, + "grad_norm": 0.5971950888633728, + "learning_rate": 0.0006714285714285714, + "loss": 0.4406, + "step": 11839 + }, + { + "epoch": 6.614525139664805, + "grad_norm": 0.48642411828041077, + "learning_rate": 0.0006714005602240897, + "loss": 0.4243, + "step": 11840 + }, + { + "epoch": 6.615083798882681, + "grad_norm": 0.4740616977214813, + "learning_rate": 0.0006713725490196078, + "loss": 0.7003, + "step": 11841 + }, + { + "epoch": 6.615642458100559, + "grad_norm": 1.5656554698944092, + "learning_rate": 0.000671344537815126, + "loss": 0.523, + "step": 11842 + }, + { + "epoch": 6.616201117318436, + "grad_norm": 0.52299565076828, + "learning_rate": 0.0006713165266106442, + "loss": 0.4681, + "step": 11843 + }, + { + "epoch": 6.616759776536313, + "grad_norm": 0.4576612710952759, + "learning_rate": 0.0006712885154061624, + "loss": 0.3388, + "step": 11844 + }, + { + "epoch": 6.61731843575419, + "grad_norm": 1.0242376327514648, + "learning_rate": 0.0006712605042016808, + "loss": 0.545, + "step": 11845 + }, + { + "epoch": 6.617877094972067, + "grad_norm": 0.533794641494751, + "learning_rate": 0.000671232492997199, + "loss": 0.5442, + "step": 11846 + }, + { + "epoch": 6.618435754189944, + "grad_norm": 0.6163023114204407, + "learning_rate": 0.0006712044817927171, + "loss": 0.5745, + "step": 11847 + }, + { + "epoch": 6.618994413407822, + "grad_norm": 0.5679111480712891, + "learning_rate": 0.0006711764705882353, + "loss": 0.3956, + "step": 11848 + }, + { + "epoch": 6.619553072625698, + "grad_norm": 0.49660468101501465, + "learning_rate": 0.0006711484593837535, + "loss": 0.3644, + "step": 11849 + }, + { + "epoch": 6.620111731843576, + "grad_norm": 0.5545671582221985, + "learning_rate": 0.0006711204481792718, + "loss": 0.438, + "step": 11850 + }, + { + "epoch": 6.620670391061452, + "grad_norm": 0.4909864664077759, + "learning_rate": 0.00067109243697479, + "loss": 0.4799, + "step": 11851 + }, + { + "epoch": 6.62122905027933, + "grad_norm": 0.7022113800048828, + "learning_rate": 0.0006710644257703081, + "loss": 0.4846, + "step": 11852 + }, + { + "epoch": 6.621787709497207, + "grad_norm": 0.5411577820777893, + "learning_rate": 0.0006710364145658263, + "loss": 0.446, + "step": 11853 + }, + { + "epoch": 6.622346368715084, + "grad_norm": 0.5130810737609863, + "learning_rate": 0.0006710084033613445, + "loss": 0.4285, + "step": 11854 + }, + { + "epoch": 6.622905027932961, + "grad_norm": 0.5893478989601135, + "learning_rate": 0.0006709803921568628, + "loss": 0.4307, + "step": 11855 + }, + { + "epoch": 6.623463687150838, + "grad_norm": 4.347335338592529, + "learning_rate": 0.000670952380952381, + "loss": 0.477, + "step": 11856 + }, + { + "epoch": 6.624022346368715, + "grad_norm": 0.8039206266403198, + "learning_rate": 0.0006709243697478991, + "loss": 0.4949, + "step": 11857 + }, + { + "epoch": 6.624581005586592, + "grad_norm": 0.5761498212814331, + "learning_rate": 0.0006708963585434173, + "loss": 0.4806, + "step": 11858 + }, + { + "epoch": 6.625139664804469, + "grad_norm": 0.3392082750797272, + "learning_rate": 0.0006708683473389355, + "loss": 0.3649, + "step": 11859 + }, + { + "epoch": 6.625698324022347, + "grad_norm": 6.200045585632324, + "learning_rate": 0.0006708403361344538, + "loss": 0.4398, + "step": 11860 + }, + { + "epoch": 6.626256983240223, + "grad_norm": 0.4584237337112427, + "learning_rate": 0.000670812324929972, + "loss": 0.4103, + "step": 11861 + }, + { + "epoch": 6.626815642458101, + "grad_norm": 0.5027204751968384, + "learning_rate": 0.0006707843137254903, + "loss": 0.4293, + "step": 11862 + }, + { + "epoch": 6.627374301675978, + "grad_norm": 0.5450441241264343, + "learning_rate": 0.0006707563025210084, + "loss": 0.4954, + "step": 11863 + }, + { + "epoch": 6.627932960893855, + "grad_norm": 0.42596668004989624, + "learning_rate": 0.0006707282913165266, + "loss": 0.4258, + "step": 11864 + }, + { + "epoch": 6.628491620111732, + "grad_norm": 0.5425561666488647, + "learning_rate": 0.0006707002801120449, + "loss": 0.4617, + "step": 11865 + }, + { + "epoch": 6.629050279329609, + "grad_norm": 0.5719782114028931, + "learning_rate": 0.0006706722689075631, + "loss": 0.5718, + "step": 11866 + }, + { + "epoch": 6.629608938547486, + "grad_norm": 1.2569321393966675, + "learning_rate": 0.0006706442577030813, + "loss": 0.4671, + "step": 11867 + }, + { + "epoch": 6.630167597765363, + "grad_norm": 0.4194042384624481, + "learning_rate": 0.0006706162464985994, + "loss": 0.4061, + "step": 11868 + }, + { + "epoch": 6.63072625698324, + "grad_norm": 0.7099469900131226, + "learning_rate": 0.0006705882352941176, + "loss": 0.4435, + "step": 11869 + }, + { + "epoch": 6.631284916201118, + "grad_norm": 0.39683932065963745, + "learning_rate": 0.0006705602240896359, + "loss": 0.3773, + "step": 11870 + }, + { + "epoch": 6.631843575418994, + "grad_norm": 1.4019768238067627, + "learning_rate": 0.0006705322128851541, + "loss": 0.5027, + "step": 11871 + }, + { + "epoch": 6.632402234636872, + "grad_norm": 2.508816957473755, + "learning_rate": 0.0006705042016806723, + "loss": 0.4704, + "step": 11872 + }, + { + "epoch": 6.632960893854749, + "grad_norm": 0.7168423533439636, + "learning_rate": 0.0006704761904761904, + "loss": 0.4874, + "step": 11873 + }, + { + "epoch": 6.633519553072626, + "grad_norm": 0.44258370995521545, + "learning_rate": 0.0006704481792717086, + "loss": 0.3925, + "step": 11874 + }, + { + "epoch": 6.634078212290503, + "grad_norm": 0.4182576835155487, + "learning_rate": 0.0006704201680672269, + "loss": 0.4099, + "step": 11875 + }, + { + "epoch": 6.63463687150838, + "grad_norm": 0.41694721579551697, + "learning_rate": 0.0006703921568627451, + "loss": 0.4324, + "step": 11876 + }, + { + "epoch": 6.635195530726257, + "grad_norm": 0.41879621148109436, + "learning_rate": 0.0006703641456582633, + "loss": 0.4035, + "step": 11877 + }, + { + "epoch": 6.635754189944134, + "grad_norm": 0.6156706213951111, + "learning_rate": 0.0006703361344537816, + "loss": 0.3844, + "step": 11878 + }, + { + "epoch": 6.636312849162011, + "grad_norm": 0.6671702861785889, + "learning_rate": 0.0006703081232492996, + "loss": 0.5469, + "step": 11879 + }, + { + "epoch": 6.636871508379889, + "grad_norm": 0.4193579852581024, + "learning_rate": 0.000670280112044818, + "loss": 0.4663, + "step": 11880 + }, + { + "epoch": 6.637430167597765, + "grad_norm": 1.7180861234664917, + "learning_rate": 0.0006702521008403362, + "loss": 0.4079, + "step": 11881 + }, + { + "epoch": 6.637988826815643, + "grad_norm": 0.41882604360580444, + "learning_rate": 0.0006702240896358544, + "loss": 0.3873, + "step": 11882 + }, + { + "epoch": 6.638547486033519, + "grad_norm": 1.1675763130187988, + "learning_rate": 0.0006701960784313726, + "loss": 0.4308, + "step": 11883 + }, + { + "epoch": 6.639106145251397, + "grad_norm": 0.5950065851211548, + "learning_rate": 0.0006701680672268907, + "loss": 0.4061, + "step": 11884 + }, + { + "epoch": 6.639664804469274, + "grad_norm": 0.7166126370429993, + "learning_rate": 0.000670140056022409, + "loss": 0.4293, + "step": 11885 + }, + { + "epoch": 6.640223463687151, + "grad_norm": 0.5278535485267639, + "learning_rate": 0.0006701120448179272, + "loss": 0.479, + "step": 11886 + }, + { + "epoch": 6.640782122905028, + "grad_norm": 0.473520964384079, + "learning_rate": 0.0006700840336134454, + "loss": 0.4338, + "step": 11887 + }, + { + "epoch": 6.641340782122905, + "grad_norm": 0.43621179461479187, + "learning_rate": 0.0006700560224089636, + "loss": 0.4466, + "step": 11888 + }, + { + "epoch": 6.641899441340782, + "grad_norm": 0.5028188228607178, + "learning_rate": 0.0006700280112044817, + "loss": 0.4559, + "step": 11889 + }, + { + "epoch": 6.64245810055866, + "grad_norm": 1.0624299049377441, + "learning_rate": 0.00067, + "loss": 0.4077, + "step": 11890 + }, + { + "epoch": 6.643016759776536, + "grad_norm": 0.4720621407032013, + "learning_rate": 0.0006699719887955182, + "loss": 0.4667, + "step": 11891 + }, + { + "epoch": 6.643575418994414, + "grad_norm": 0.38738808035850525, + "learning_rate": 0.0006699439775910364, + "loss": 0.3834, + "step": 11892 + }, + { + "epoch": 6.64413407821229, + "grad_norm": 0.6606648564338684, + "learning_rate": 0.0006699159663865546, + "loss": 0.6001, + "step": 11893 + }, + { + "epoch": 6.644692737430168, + "grad_norm": 1.6325286626815796, + "learning_rate": 0.0006698879551820728, + "loss": 0.3995, + "step": 11894 + }, + { + "epoch": 6.645251396648044, + "grad_norm": 0.597726047039032, + "learning_rate": 0.000669859943977591, + "loss": 0.4367, + "step": 11895 + }, + { + "epoch": 6.645810055865922, + "grad_norm": 0.4438652992248535, + "learning_rate": 0.0006698319327731093, + "loss": 0.3943, + "step": 11896 + }, + { + "epoch": 6.646368715083799, + "grad_norm": 0.6187936067581177, + "learning_rate": 0.0006698039215686275, + "loss": 0.4991, + "step": 11897 + }, + { + "epoch": 6.646927374301676, + "grad_norm": 1.6471140384674072, + "learning_rate": 0.0006697759103641457, + "loss": 0.4304, + "step": 11898 + }, + { + "epoch": 6.647486033519553, + "grad_norm": 0.4638585150241852, + "learning_rate": 0.0006697478991596639, + "loss": 0.3907, + "step": 11899 + }, + { + "epoch": 6.648044692737431, + "grad_norm": 0.6080419421195984, + "learning_rate": 0.0006697198879551821, + "loss": 0.5716, + "step": 11900 + }, + { + "epoch": 6.648603351955307, + "grad_norm": 0.7600609660148621, + "learning_rate": 0.0006696918767507003, + "loss": 0.5238, + "step": 11901 + }, + { + "epoch": 6.649162011173185, + "grad_norm": 0.40342411398887634, + "learning_rate": 0.0006696638655462185, + "loss": 0.3784, + "step": 11902 + }, + { + "epoch": 6.649720670391061, + "grad_norm": 0.4914536774158478, + "learning_rate": 0.0006696358543417367, + "loss": 0.3133, + "step": 11903 + }, + { + "epoch": 6.650279329608939, + "grad_norm": 0.36723074316978455, + "learning_rate": 0.0006696078431372549, + "loss": 0.3745, + "step": 11904 + }, + { + "epoch": 6.650837988826815, + "grad_norm": 0.8028084635734558, + "learning_rate": 0.0006695798319327731, + "loss": 0.4228, + "step": 11905 + }, + { + "epoch": 6.651396648044693, + "grad_norm": 0.39742809534072876, + "learning_rate": 0.0006695518207282913, + "loss": 0.3796, + "step": 11906 + }, + { + "epoch": 6.65195530726257, + "grad_norm": 0.7723726034164429, + "learning_rate": 0.0006695238095238095, + "loss": 0.4192, + "step": 11907 + }, + { + "epoch": 6.652513966480447, + "grad_norm": 0.42612823843955994, + "learning_rate": 0.0006694957983193277, + "loss": 0.5041, + "step": 11908 + }, + { + "epoch": 6.653072625698324, + "grad_norm": 0.5529517531394958, + "learning_rate": 0.0006694677871148459, + "loss": 0.6456, + "step": 11909 + }, + { + "epoch": 6.653631284916202, + "grad_norm": 0.432962030172348, + "learning_rate": 0.0006694397759103643, + "loss": 0.4052, + "step": 11910 + }, + { + "epoch": 6.654189944134078, + "grad_norm": 0.7557727694511414, + "learning_rate": 0.0006694117647058823, + "loss": 0.5632, + "step": 11911 + }, + { + "epoch": 6.654748603351956, + "grad_norm": 0.44415441155433655, + "learning_rate": 0.0006693837535014006, + "loss": 0.556, + "step": 11912 + }, + { + "epoch": 6.655307262569832, + "grad_norm": 1.462938666343689, + "learning_rate": 0.0006693557422969188, + "loss": 0.5532, + "step": 11913 + }, + { + "epoch": 6.65586592178771, + "grad_norm": 0.7390327453613281, + "learning_rate": 0.000669327731092437, + "loss": 0.4529, + "step": 11914 + }, + { + "epoch": 6.656424581005586, + "grad_norm": 2.6326262950897217, + "learning_rate": 0.0006692997198879553, + "loss": 0.3835, + "step": 11915 + }, + { + "epoch": 6.656983240223464, + "grad_norm": 0.4834195673465729, + "learning_rate": 0.0006692717086834734, + "loss": 0.3513, + "step": 11916 + }, + { + "epoch": 6.657541899441341, + "grad_norm": 0.48843714594841003, + "learning_rate": 0.0006692436974789916, + "loss": 0.4866, + "step": 11917 + }, + { + "epoch": 6.658100558659218, + "grad_norm": 0.9501739144325256, + "learning_rate": 0.0006692156862745098, + "loss": 0.4514, + "step": 11918 + }, + { + "epoch": 6.658659217877095, + "grad_norm": 0.5491495728492737, + "learning_rate": 0.000669187675070028, + "loss": 0.4346, + "step": 11919 + }, + { + "epoch": 6.659217877094972, + "grad_norm": 0.5092610716819763, + "learning_rate": 0.0006691596638655463, + "loss": 0.5333, + "step": 11920 + }, + { + "epoch": 6.659776536312849, + "grad_norm": 0.5051995515823364, + "learning_rate": 0.0006691316526610644, + "loss": 0.4432, + "step": 11921 + }, + { + "epoch": 6.660335195530727, + "grad_norm": 0.6132280230522156, + "learning_rate": 0.0006691036414565826, + "loss": 0.3799, + "step": 11922 + }, + { + "epoch": 6.660893854748603, + "grad_norm": 0.7479427456855774, + "learning_rate": 0.0006690756302521008, + "loss": 0.5133, + "step": 11923 + }, + { + "epoch": 6.661452513966481, + "grad_norm": 0.6331705451011658, + "learning_rate": 0.000669047619047619, + "loss": 0.4406, + "step": 11924 + }, + { + "epoch": 6.662011173184357, + "grad_norm": 0.6844003796577454, + "learning_rate": 0.0006690196078431373, + "loss": 0.449, + "step": 11925 + }, + { + "epoch": 6.662569832402235, + "grad_norm": 0.4110753834247589, + "learning_rate": 0.0006689915966386555, + "loss": 0.4646, + "step": 11926 + }, + { + "epoch": 6.663128491620112, + "grad_norm": 0.7246172428131104, + "learning_rate": 0.0006689635854341736, + "loss": 0.4685, + "step": 11927 + }, + { + "epoch": 6.663687150837989, + "grad_norm": 0.7584655284881592, + "learning_rate": 0.0006689355742296919, + "loss": 0.7446, + "step": 11928 + }, + { + "epoch": 6.664245810055866, + "grad_norm": 0.6096429228782654, + "learning_rate": 0.0006689075630252101, + "loss": 0.5194, + "step": 11929 + }, + { + "epoch": 6.664804469273743, + "grad_norm": 0.9509395956993103, + "learning_rate": 0.0006688795518207284, + "loss": 0.5149, + "step": 11930 + }, + { + "epoch": 6.66536312849162, + "grad_norm": 0.41306325793266296, + "learning_rate": 0.0006688515406162466, + "loss": 0.4323, + "step": 11931 + }, + { + "epoch": 6.665921787709497, + "grad_norm": 0.618394136428833, + "learning_rate": 0.0006688235294117647, + "loss": 0.4677, + "step": 11932 + }, + { + "epoch": 6.666480446927374, + "grad_norm": 0.8841884732246399, + "learning_rate": 0.0006687955182072829, + "loss": 0.4686, + "step": 11933 + }, + { + "epoch": 6.667039106145252, + "grad_norm": 0.7433735132217407, + "learning_rate": 0.0006687675070028011, + "loss": 0.4896, + "step": 11934 + }, + { + "epoch": 6.667597765363128, + "grad_norm": 0.9621394276618958, + "learning_rate": 0.0006687394957983194, + "loss": 0.4834, + "step": 11935 + }, + { + "epoch": 6.668156424581006, + "grad_norm": 1.4438282251358032, + "learning_rate": 0.0006687114845938376, + "loss": 0.4916, + "step": 11936 + }, + { + "epoch": 6.668715083798883, + "grad_norm": 0.3853825032711029, + "learning_rate": 0.0006686834733893557, + "loss": 0.4277, + "step": 11937 + }, + { + "epoch": 6.66927374301676, + "grad_norm": 2.2963244915008545, + "learning_rate": 0.0006686554621848739, + "loss": 0.5279, + "step": 11938 + }, + { + "epoch": 6.669832402234637, + "grad_norm": 0.48953819274902344, + "learning_rate": 0.0006686274509803921, + "loss": 0.5855, + "step": 11939 + }, + { + "epoch": 6.670391061452514, + "grad_norm": 0.7464914917945862, + "learning_rate": 0.0006685994397759104, + "loss": 0.549, + "step": 11940 + }, + { + "epoch": 6.670949720670391, + "grad_norm": 0.5952526926994324, + "learning_rate": 0.0006685714285714286, + "loss": 0.4605, + "step": 11941 + }, + { + "epoch": 6.671508379888268, + "grad_norm": 2.3519411087036133, + "learning_rate": 0.0006685434173669468, + "loss": 0.4862, + "step": 11942 + }, + { + "epoch": 6.672067039106145, + "grad_norm": 0.5067525506019592, + "learning_rate": 0.0006685154061624649, + "loss": 0.5444, + "step": 11943 + }, + { + "epoch": 6.672625698324023, + "grad_norm": 1.0751349925994873, + "learning_rate": 0.0006684873949579831, + "loss": 0.5272, + "step": 11944 + }, + { + "epoch": 6.673184357541899, + "grad_norm": 0.5294737219810486, + "learning_rate": 0.0006684593837535015, + "loss": 0.3459, + "step": 11945 + }, + { + "epoch": 6.673743016759777, + "grad_norm": 0.4766441583633423, + "learning_rate": 0.0006684313725490197, + "loss": 0.373, + "step": 11946 + }, + { + "epoch": 6.674301675977654, + "grad_norm": 0.5001503825187683, + "learning_rate": 0.0006684033613445379, + "loss": 0.3901, + "step": 11947 + }, + { + "epoch": 6.674860335195531, + "grad_norm": 0.5113135576248169, + "learning_rate": 0.000668375350140056, + "loss": 0.4565, + "step": 11948 + }, + { + "epoch": 6.675418994413408, + "grad_norm": 0.6157906651496887, + "learning_rate": 0.0006683473389355742, + "loss": 0.398, + "step": 11949 + }, + { + "epoch": 6.675977653631285, + "grad_norm": 0.5761928558349609, + "learning_rate": 0.0006683193277310925, + "loss": 0.4914, + "step": 11950 + }, + { + "epoch": 6.676536312849162, + "grad_norm": 0.4351007640361786, + "learning_rate": 0.0006682913165266107, + "loss": 0.4139, + "step": 11951 + }, + { + "epoch": 6.677094972067039, + "grad_norm": 0.4437105655670166, + "learning_rate": 0.0006682633053221289, + "loss": 0.4052, + "step": 11952 + }, + { + "epoch": 6.677653631284916, + "grad_norm": 0.5127741694450378, + "learning_rate": 0.000668235294117647, + "loss": 0.4141, + "step": 11953 + }, + { + "epoch": 6.678212290502794, + "grad_norm": 0.7881187796592712, + "learning_rate": 0.0006682072829131652, + "loss": 0.4227, + "step": 11954 + }, + { + "epoch": 6.67877094972067, + "grad_norm": 0.5743787288665771, + "learning_rate": 0.0006681792717086835, + "loss": 0.4614, + "step": 11955 + }, + { + "epoch": 6.679329608938548, + "grad_norm": 0.5567173957824707, + "learning_rate": 0.0006681512605042017, + "loss": 0.4296, + "step": 11956 + }, + { + "epoch": 6.679888268156424, + "grad_norm": 0.6149084568023682, + "learning_rate": 0.0006681232492997199, + "loss": 0.527, + "step": 11957 + }, + { + "epoch": 6.680446927374302, + "grad_norm": 0.664783775806427, + "learning_rate": 0.0006680952380952381, + "loss": 0.4199, + "step": 11958 + }, + { + "epoch": 6.681005586592179, + "grad_norm": 0.5344581604003906, + "learning_rate": 0.0006680672268907562, + "loss": 0.5034, + "step": 11959 + }, + { + "epoch": 6.681564245810056, + "grad_norm": 0.5123084783554077, + "learning_rate": 0.0006680392156862746, + "loss": 0.4463, + "step": 11960 + }, + { + "epoch": 6.682122905027933, + "grad_norm": 0.4994768798351288, + "learning_rate": 0.0006680112044817928, + "loss": 0.4244, + "step": 11961 + }, + { + "epoch": 6.68268156424581, + "grad_norm": 0.4394984245300293, + "learning_rate": 0.000667983193277311, + "loss": 0.5317, + "step": 11962 + }, + { + "epoch": 6.683240223463687, + "grad_norm": 0.44369298219680786, + "learning_rate": 0.0006679551820728292, + "loss": 0.5142, + "step": 11963 + }, + { + "epoch": 6.683798882681565, + "grad_norm": 0.4762931168079376, + "learning_rate": 0.0006679271708683473, + "loss": 0.411, + "step": 11964 + }, + { + "epoch": 6.684357541899441, + "grad_norm": 0.9273788332939148, + "learning_rate": 0.0006678991596638656, + "loss": 0.4241, + "step": 11965 + }, + { + "epoch": 6.684916201117319, + "grad_norm": 0.42331764101982117, + "learning_rate": 0.0006678711484593838, + "loss": 0.4133, + "step": 11966 + }, + { + "epoch": 6.685474860335195, + "grad_norm": 0.6511138677597046, + "learning_rate": 0.000667843137254902, + "loss": 0.5919, + "step": 11967 + }, + { + "epoch": 6.686033519553073, + "grad_norm": 0.49632054567337036, + "learning_rate": 0.0006678151260504202, + "loss": 0.4345, + "step": 11968 + }, + { + "epoch": 6.686592178770949, + "grad_norm": 0.543677806854248, + "learning_rate": 0.0006677871148459383, + "loss": 0.4024, + "step": 11969 + }, + { + "epoch": 6.687150837988827, + "grad_norm": 0.5130450129508972, + "learning_rate": 0.0006677591036414566, + "loss": 0.4684, + "step": 11970 + }, + { + "epoch": 6.687709497206704, + "grad_norm": 0.5427372455596924, + "learning_rate": 0.0006677310924369748, + "loss": 0.4437, + "step": 11971 + }, + { + "epoch": 6.688268156424581, + "grad_norm": 0.775134265422821, + "learning_rate": 0.000667703081232493, + "loss": 0.4596, + "step": 11972 + }, + { + "epoch": 6.688826815642458, + "grad_norm": 1.0085527896881104, + "learning_rate": 0.0006676750700280112, + "loss": 0.6023, + "step": 11973 + }, + { + "epoch": 6.689385474860336, + "grad_norm": 0.3617066740989685, + "learning_rate": 0.0006676470588235294, + "loss": 0.3793, + "step": 11974 + }, + { + "epoch": 6.689944134078212, + "grad_norm": 1.0881379842758179, + "learning_rate": 0.0006676190476190476, + "loss": 0.7233, + "step": 11975 + }, + { + "epoch": 6.69050279329609, + "grad_norm": 0.3984529972076416, + "learning_rate": 0.0006675910364145658, + "loss": 0.477, + "step": 11976 + }, + { + "epoch": 6.691061452513966, + "grad_norm": 0.8676496148109436, + "learning_rate": 0.000667563025210084, + "loss": 0.4396, + "step": 11977 + }, + { + "epoch": 6.691620111731844, + "grad_norm": 0.43005532026290894, + "learning_rate": 0.0006675350140056023, + "loss": 0.4039, + "step": 11978 + }, + { + "epoch": 6.69217877094972, + "grad_norm": 0.5955531001091003, + "learning_rate": 0.0006675070028011205, + "loss": 0.3239, + "step": 11979 + }, + { + "epoch": 6.692737430167598, + "grad_norm": 0.4432985782623291, + "learning_rate": 0.0006674789915966387, + "loss": 0.4006, + "step": 11980 + }, + { + "epoch": 6.693296089385475, + "grad_norm": 0.44831740856170654, + "learning_rate": 0.0006674509803921569, + "loss": 0.3296, + "step": 11981 + }, + { + "epoch": 6.693854748603352, + "grad_norm": 0.5185133814811707, + "learning_rate": 0.0006674229691876751, + "loss": 0.4796, + "step": 11982 + }, + { + "epoch": 6.694413407821229, + "grad_norm": 1.4772552251815796, + "learning_rate": 0.0006673949579831933, + "loss": 0.4602, + "step": 11983 + }, + { + "epoch": 6.694972067039107, + "grad_norm": 0.6739949584007263, + "learning_rate": 0.0006673669467787115, + "loss": 0.4528, + "step": 11984 + }, + { + "epoch": 6.695530726256983, + "grad_norm": 1.486723780632019, + "learning_rate": 0.0006673389355742298, + "loss": 0.5464, + "step": 11985 + }, + { + "epoch": 6.696089385474861, + "grad_norm": 0.4152049720287323, + "learning_rate": 0.0006673109243697479, + "loss": 0.401, + "step": 11986 + }, + { + "epoch": 6.696648044692737, + "grad_norm": 1.7777329683303833, + "learning_rate": 0.0006672829131652661, + "loss": 0.433, + "step": 11987 + }, + { + "epoch": 6.697206703910615, + "grad_norm": 0.5812040567398071, + "learning_rate": 0.0006672549019607843, + "loss": 0.4011, + "step": 11988 + }, + { + "epoch": 6.697765363128491, + "grad_norm": 0.4774755537509918, + "learning_rate": 0.0006672268907563025, + "loss": 0.4119, + "step": 11989 + }, + { + "epoch": 6.698324022346369, + "grad_norm": 0.4158855080604553, + "learning_rate": 0.0006671988795518208, + "loss": 0.4326, + "step": 11990 + }, + { + "epoch": 6.698882681564246, + "grad_norm": 1.1127920150756836, + "learning_rate": 0.0006671708683473389, + "loss": 0.4633, + "step": 11991 + }, + { + "epoch": 6.699441340782123, + "grad_norm": 0.383543461561203, + "learning_rate": 0.0006671428571428571, + "loss": 0.3522, + "step": 11992 + }, + { + "epoch": 6.7, + "grad_norm": 0.5842460989952087, + "learning_rate": 0.0006671148459383753, + "loss": 0.4311, + "step": 11993 + }, + { + "epoch": 6.700558659217877, + "grad_norm": 0.53638756275177, + "learning_rate": 0.0006670868347338936, + "loss": 0.407, + "step": 11994 + }, + { + "epoch": 6.701117318435754, + "grad_norm": 0.8463553786277771, + "learning_rate": 0.0006670588235294119, + "loss": 0.3638, + "step": 11995 + }, + { + "epoch": 6.701675977653632, + "grad_norm": 0.4048050045967102, + "learning_rate": 0.00066703081232493, + "loss": 0.3987, + "step": 11996 + }, + { + "epoch": 6.702234636871508, + "grad_norm": 0.9497975707054138, + "learning_rate": 0.0006670028011204482, + "loss": 0.3657, + "step": 11997 + }, + { + "epoch": 6.702793296089386, + "grad_norm": 0.6640595197677612, + "learning_rate": 0.0006669747899159664, + "loss": 0.431, + "step": 11998 + }, + { + "epoch": 6.703351955307262, + "grad_norm": 0.9014831781387329, + "learning_rate": 0.0006669467787114846, + "loss": 0.5538, + "step": 11999 + }, + { + "epoch": 6.70391061452514, + "grad_norm": 3.5594048500061035, + "learning_rate": 0.0006669187675070029, + "loss": 0.7649, + "step": 12000 + }, + { + "epoch": 6.70391061452514, + "eval_cer": 0.09162820638713408, + "eval_loss": 0.34500652551651, + "eval_runtime": 60.9046, + "eval_samples_per_second": 74.51, + "eval_steps_per_second": 4.663, + "eval_wer": 0.364654618586073, + "step": 12000 + }, + { + "epoch": 6.704469273743017, + "grad_norm": 0.5483406186103821, + "learning_rate": 0.0006668907563025211, + "loss": 0.5314, + "step": 12001 + }, + { + "epoch": 6.705027932960894, + "grad_norm": 0.9583224058151245, + "learning_rate": 0.0006668627450980392, + "loss": 0.3826, + "step": 12002 + }, + { + "epoch": 6.705586592178771, + "grad_norm": 0.6977572441101074, + "learning_rate": 0.0006668347338935574, + "loss": 0.4966, + "step": 12003 + }, + { + "epoch": 6.706145251396648, + "grad_norm": 2.1843104362487793, + "learning_rate": 0.0006668067226890756, + "loss": 0.3792, + "step": 12004 + }, + { + "epoch": 6.706703910614525, + "grad_norm": 0.5551536083221436, + "learning_rate": 0.0006667787114845939, + "loss": 0.4793, + "step": 12005 + }, + { + "epoch": 6.707262569832402, + "grad_norm": 0.665658175945282, + "learning_rate": 0.0006667507002801121, + "loss": 0.5097, + "step": 12006 + }, + { + "epoch": 6.707821229050279, + "grad_norm": 0.7750752568244934, + "learning_rate": 0.0006667226890756302, + "loss": 0.4892, + "step": 12007 + }, + { + "epoch": 6.708379888268157, + "grad_norm": 0.5526975393295288, + "learning_rate": 0.0006666946778711484, + "loss": 0.6484, + "step": 12008 + }, + { + "epoch": 6.708938547486033, + "grad_norm": 0.6482200026512146, + "learning_rate": 0.0006666666666666666, + "loss": 0.4489, + "step": 12009 + }, + { + "epoch": 6.709497206703911, + "grad_norm": 0.4073962867259979, + "learning_rate": 0.000666638655462185, + "loss": 0.3483, + "step": 12010 + }, + { + "epoch": 6.710055865921788, + "grad_norm": 0.4463841915130615, + "learning_rate": 0.0006666106442577032, + "loss": 0.4073, + "step": 12011 + }, + { + "epoch": 6.710614525139665, + "grad_norm": 0.4955034852027893, + "learning_rate": 0.0006665826330532213, + "loss": 0.4875, + "step": 12012 + }, + { + "epoch": 6.711173184357542, + "grad_norm": 0.4222066104412079, + "learning_rate": 0.0006665546218487395, + "loss": 0.3956, + "step": 12013 + }, + { + "epoch": 6.711731843575419, + "grad_norm": 4.740274429321289, + "learning_rate": 0.0006665266106442577, + "loss": 0.4488, + "step": 12014 + }, + { + "epoch": 6.712290502793296, + "grad_norm": 0.8777411580085754, + "learning_rate": 0.000666498599439776, + "loss": 0.7606, + "step": 12015 + }, + { + "epoch": 6.712849162011173, + "grad_norm": 0.4971480071544647, + "learning_rate": 0.0006664705882352942, + "loss": 0.4609, + "step": 12016 + }, + { + "epoch": 6.71340782122905, + "grad_norm": 0.5550456643104553, + "learning_rate": 0.0006664425770308124, + "loss": 0.3525, + "step": 12017 + }, + { + "epoch": 6.713966480446928, + "grad_norm": 0.7842357158660889, + "learning_rate": 0.0006664145658263305, + "loss": 0.3546, + "step": 12018 + }, + { + "epoch": 6.714525139664804, + "grad_norm": 0.41769638657569885, + "learning_rate": 0.0006663865546218487, + "loss": 0.426, + "step": 12019 + }, + { + "epoch": 6.715083798882682, + "grad_norm": 0.454759418964386, + "learning_rate": 0.000666358543417367, + "loss": 0.4387, + "step": 12020 + }, + { + "epoch": 6.715642458100559, + "grad_norm": 0.6732493042945862, + "learning_rate": 0.0006663305322128852, + "loss": 0.4907, + "step": 12021 + }, + { + "epoch": 6.716201117318436, + "grad_norm": 0.4838760197162628, + "learning_rate": 0.0006663025210084034, + "loss": 0.4969, + "step": 12022 + }, + { + "epoch": 6.716759776536313, + "grad_norm": 0.7262096405029297, + "learning_rate": 0.0006662745098039215, + "loss": 0.5392, + "step": 12023 + }, + { + "epoch": 6.71731843575419, + "grad_norm": 0.6183413863182068, + "learning_rate": 0.0006662464985994397, + "loss": 0.5025, + "step": 12024 + }, + { + "epoch": 6.717877094972067, + "grad_norm": 0.506846010684967, + "learning_rate": 0.000666218487394958, + "loss": 0.4099, + "step": 12025 + }, + { + "epoch": 6.718435754189944, + "grad_norm": 0.7997880578041077, + "learning_rate": 0.0006661904761904763, + "loss": 0.4685, + "step": 12026 + }, + { + "epoch": 6.718994413407821, + "grad_norm": 0.3239750862121582, + "learning_rate": 0.0006661624649859945, + "loss": 0.3795, + "step": 12027 + }, + { + "epoch": 6.719553072625699, + "grad_norm": 1.2035984992980957, + "learning_rate": 0.0006661344537815126, + "loss": 0.6141, + "step": 12028 + }, + { + "epoch": 6.720111731843575, + "grad_norm": 0.40416109561920166, + "learning_rate": 0.0006661064425770308, + "loss": 0.3345, + "step": 12029 + }, + { + "epoch": 6.720670391061453, + "grad_norm": 0.9208695292472839, + "learning_rate": 0.0006660784313725491, + "loss": 0.5067, + "step": 12030 + }, + { + "epoch": 6.721229050279329, + "grad_norm": 0.41875791549682617, + "learning_rate": 0.0006660504201680673, + "loss": 0.4151, + "step": 12031 + }, + { + "epoch": 6.721787709497207, + "grad_norm": 0.521105945110321, + "learning_rate": 0.0006660224089635855, + "loss": 0.3748, + "step": 12032 + }, + { + "epoch": 6.722346368715084, + "grad_norm": 0.4621300995349884, + "learning_rate": 0.0006659943977591037, + "loss": 0.3065, + "step": 12033 + }, + { + "epoch": 6.722905027932961, + "grad_norm": 1.4029648303985596, + "learning_rate": 0.0006659663865546218, + "loss": 0.4561, + "step": 12034 + }, + { + "epoch": 6.723463687150838, + "grad_norm": 0.46591830253601074, + "learning_rate": 0.0006659383753501401, + "loss": 0.4577, + "step": 12035 + }, + { + "epoch": 6.724022346368715, + "grad_norm": 1.3738367557525635, + "learning_rate": 0.0006659103641456583, + "loss": 0.4709, + "step": 12036 + }, + { + "epoch": 6.724581005586592, + "grad_norm": 0.6199467182159424, + "learning_rate": 0.0006658823529411765, + "loss": 0.43, + "step": 12037 + }, + { + "epoch": 6.72513966480447, + "grad_norm": 0.47354018688201904, + "learning_rate": 0.0006658543417366947, + "loss": 0.4562, + "step": 12038 + }, + { + "epoch": 6.725698324022346, + "grad_norm": 0.46467769145965576, + "learning_rate": 0.0006658263305322128, + "loss": 0.3213, + "step": 12039 + }, + { + "epoch": 6.726256983240224, + "grad_norm": 0.4763118028640747, + "learning_rate": 0.0006657983193277311, + "loss": 0.4166, + "step": 12040 + }, + { + "epoch": 6.7268156424581, + "grad_norm": 0.4471207857131958, + "learning_rate": 0.0006657703081232493, + "loss": 0.398, + "step": 12041 + }, + { + "epoch": 6.727374301675978, + "grad_norm": 0.7418835163116455, + "learning_rate": 0.0006657422969187676, + "loss": 0.4962, + "step": 12042 + }, + { + "epoch": 6.727932960893854, + "grad_norm": 0.671700656414032, + "learning_rate": 0.0006657142857142858, + "loss": 0.412, + "step": 12043 + }, + { + "epoch": 6.728491620111732, + "grad_norm": 1.4614101648330688, + "learning_rate": 0.0006656862745098039, + "loss": 0.3884, + "step": 12044 + }, + { + "epoch": 6.729050279329609, + "grad_norm": 0.4766533374786377, + "learning_rate": 0.0006656582633053222, + "loss": 0.3857, + "step": 12045 + }, + { + "epoch": 6.729608938547486, + "grad_norm": 0.4485678970813751, + "learning_rate": 0.0006656302521008404, + "loss": 0.4191, + "step": 12046 + }, + { + "epoch": 6.730167597765363, + "grad_norm": 0.5006360411643982, + "learning_rate": 0.0006656022408963586, + "loss": 0.3175, + "step": 12047 + }, + { + "epoch": 6.730726256983241, + "grad_norm": 0.707591712474823, + "learning_rate": 0.0006655742296918768, + "loss": 0.4525, + "step": 12048 + }, + { + "epoch": 6.731284916201117, + "grad_norm": 0.6048533916473389, + "learning_rate": 0.000665546218487395, + "loss": 0.4362, + "step": 12049 + }, + { + "epoch": 6.731843575418995, + "grad_norm": 0.5978702306747437, + "learning_rate": 0.0006655182072829131, + "loss": 0.5811, + "step": 12050 + }, + { + "epoch": 6.732402234636871, + "grad_norm": 0.5664288997650146, + "learning_rate": 0.0006654901960784314, + "loss": 0.4725, + "step": 12051 + }, + { + "epoch": 6.732960893854749, + "grad_norm": 0.40197309851646423, + "learning_rate": 0.0006654621848739496, + "loss": 0.3741, + "step": 12052 + }, + { + "epoch": 6.733519553072625, + "grad_norm": 0.8479698896408081, + "learning_rate": 0.0006654341736694678, + "loss": 0.5441, + "step": 12053 + }, + { + "epoch": 6.734078212290503, + "grad_norm": 1.4909826517105103, + "learning_rate": 0.000665406162464986, + "loss": 0.6403, + "step": 12054 + }, + { + "epoch": 6.73463687150838, + "grad_norm": 0.5784264206886292, + "learning_rate": 0.0006653781512605041, + "loss": 0.4866, + "step": 12055 + }, + { + "epoch": 6.735195530726257, + "grad_norm": 1.651665210723877, + "learning_rate": 0.0006653501400560224, + "loss": 0.4543, + "step": 12056 + }, + { + "epoch": 6.735754189944134, + "grad_norm": 2.030379295349121, + "learning_rate": 0.0006653221288515406, + "loss": 0.4316, + "step": 12057 + }, + { + "epoch": 6.736312849162011, + "grad_norm": 0.4977875351905823, + "learning_rate": 0.0006652941176470588, + "loss": 0.4297, + "step": 12058 + }, + { + "epoch": 6.736871508379888, + "grad_norm": 0.7647444009780884, + "learning_rate": 0.000665266106442577, + "loss": 0.5852, + "step": 12059 + }, + { + "epoch": 6.737430167597766, + "grad_norm": 0.46734631061553955, + "learning_rate": 0.0006652380952380952, + "loss": 0.4704, + "step": 12060 + }, + { + "epoch": 6.737988826815642, + "grad_norm": 2.0098142623901367, + "learning_rate": 0.0006652100840336135, + "loss": 0.5337, + "step": 12061 + }, + { + "epoch": 6.73854748603352, + "grad_norm": 0.5219240784645081, + "learning_rate": 0.0006651820728291317, + "loss": 0.5184, + "step": 12062 + }, + { + "epoch": 6.739106145251396, + "grad_norm": 0.8293431401252747, + "learning_rate": 0.0006651540616246499, + "loss": 0.3782, + "step": 12063 + }, + { + "epoch": 6.739664804469274, + "grad_norm": 0.7018773555755615, + "learning_rate": 0.0006651260504201681, + "loss": 0.5091, + "step": 12064 + }, + { + "epoch": 6.740223463687151, + "grad_norm": 0.5579187273979187, + "learning_rate": 0.0006650980392156863, + "loss": 0.5382, + "step": 12065 + }, + { + "epoch": 6.740782122905028, + "grad_norm": 0.43822458386421204, + "learning_rate": 0.0006650700280112045, + "loss": 0.4922, + "step": 12066 + }, + { + "epoch": 6.741340782122905, + "grad_norm": 1.3476670980453491, + "learning_rate": 0.0006650420168067227, + "loss": 0.5205, + "step": 12067 + }, + { + "epoch": 6.741899441340782, + "grad_norm": 0.5670324563980103, + "learning_rate": 0.0006650140056022409, + "loss": 0.4844, + "step": 12068 + }, + { + "epoch": 6.742458100558659, + "grad_norm": 0.4712945818901062, + "learning_rate": 0.0006649859943977591, + "loss": 0.4032, + "step": 12069 + }, + { + "epoch": 6.743016759776537, + "grad_norm": 0.4902419447898865, + "learning_rate": 0.0006649579831932773, + "loss": 0.3289, + "step": 12070 + }, + { + "epoch": 6.743575418994413, + "grad_norm": 0.7132523059844971, + "learning_rate": 0.0006649299719887955, + "loss": 0.4696, + "step": 12071 + }, + { + "epoch": 6.744134078212291, + "grad_norm": 0.48290959000587463, + "learning_rate": 0.0006649019607843137, + "loss": 0.464, + "step": 12072 + }, + { + "epoch": 6.744692737430167, + "grad_norm": 0.4635353684425354, + "learning_rate": 0.0006648739495798319, + "loss": 0.4492, + "step": 12073 + }, + { + "epoch": 6.745251396648045, + "grad_norm": 0.6393581032752991, + "learning_rate": 0.0006648459383753501, + "loss": 0.4542, + "step": 12074 + }, + { + "epoch": 6.745810055865922, + "grad_norm": 0.47961464524269104, + "learning_rate": 0.0006648179271708683, + "loss": 0.5034, + "step": 12075 + }, + { + "epoch": 6.746368715083799, + "grad_norm": 1.6072580814361572, + "learning_rate": 0.0006647899159663866, + "loss": 0.3757, + "step": 12076 + }, + { + "epoch": 6.746927374301676, + "grad_norm": 0.53092360496521, + "learning_rate": 0.0006647619047619048, + "loss": 0.3748, + "step": 12077 + }, + { + "epoch": 6.747486033519553, + "grad_norm": 0.5965705513954163, + "learning_rate": 0.000664733893557423, + "loss": 0.4304, + "step": 12078 + }, + { + "epoch": 6.74804469273743, + "grad_norm": 1.8006287813186646, + "learning_rate": 0.0006647058823529412, + "loss": 0.4139, + "step": 12079 + }, + { + "epoch": 6.748603351955307, + "grad_norm": 0.6920422911643982, + "learning_rate": 0.0006646778711484594, + "loss": 0.4542, + "step": 12080 + }, + { + "epoch": 6.749162011173184, + "grad_norm": 0.4300786256790161, + "learning_rate": 0.0006646498599439777, + "loss": 0.4624, + "step": 12081 + }, + { + "epoch": 6.749720670391062, + "grad_norm": 0.39852672815322876, + "learning_rate": 0.0006646218487394958, + "loss": 0.3792, + "step": 12082 + }, + { + "epoch": 6.750279329608938, + "grad_norm": 1.3067153692245483, + "learning_rate": 0.000664593837535014, + "loss": 0.4524, + "step": 12083 + }, + { + "epoch": 6.750837988826816, + "grad_norm": 0.5748780369758606, + "learning_rate": 0.0006645658263305322, + "loss": 0.4365, + "step": 12084 + }, + { + "epoch": 6.751396648044693, + "grad_norm": 0.6959697604179382, + "learning_rate": 0.0006645378151260504, + "loss": 0.5574, + "step": 12085 + }, + { + "epoch": 6.75195530726257, + "grad_norm": 0.5242602229118347, + "learning_rate": 0.0006645098039215687, + "loss": 0.4369, + "step": 12086 + }, + { + "epoch": 6.752513966480447, + "grad_norm": 1.3811161518096924, + "learning_rate": 0.0006644817927170868, + "loss": 0.4515, + "step": 12087 + }, + { + "epoch": 6.753072625698324, + "grad_norm": 0.5280629396438599, + "learning_rate": 0.000664453781512605, + "loss": 0.4773, + "step": 12088 + }, + { + "epoch": 6.753631284916201, + "grad_norm": 0.41292819380760193, + "learning_rate": 0.0006644257703081232, + "loss": 0.4881, + "step": 12089 + }, + { + "epoch": 6.754189944134078, + "grad_norm": 0.4884382486343384, + "learning_rate": 0.0006643977591036414, + "loss": 0.3855, + "step": 12090 + }, + { + "epoch": 6.754748603351955, + "grad_norm": 0.5977643132209778, + "learning_rate": 0.0006643697478991598, + "loss": 0.4134, + "step": 12091 + }, + { + "epoch": 6.755307262569833, + "grad_norm": 0.5409367084503174, + "learning_rate": 0.0006643417366946779, + "loss": 0.4497, + "step": 12092 + }, + { + "epoch": 6.755865921787709, + "grad_norm": 0.5569071173667908, + "learning_rate": 0.0006643137254901961, + "loss": 0.4372, + "step": 12093 + }, + { + "epoch": 6.756424581005587, + "grad_norm": 1.4106059074401855, + "learning_rate": 0.0006642857142857143, + "loss": 0.3422, + "step": 12094 + }, + { + "epoch": 6.756983240223463, + "grad_norm": 0.46886613965034485, + "learning_rate": 0.0006642577030812325, + "loss": 0.4842, + "step": 12095 + }, + { + "epoch": 6.757541899441341, + "grad_norm": 0.601372480392456, + "learning_rate": 0.0006642296918767508, + "loss": 0.5204, + "step": 12096 + }, + { + "epoch": 6.758100558659218, + "grad_norm": 0.5059488415718079, + "learning_rate": 0.000664201680672269, + "loss": 0.3942, + "step": 12097 + }, + { + "epoch": 6.758659217877095, + "grad_norm": 0.7449119091033936, + "learning_rate": 0.0006641736694677871, + "loss": 0.4941, + "step": 12098 + }, + { + "epoch": 6.759217877094972, + "grad_norm": 0.46882739663124084, + "learning_rate": 0.0006641456582633053, + "loss": 0.431, + "step": 12099 + }, + { + "epoch": 6.759776536312849, + "grad_norm": 0.7931537628173828, + "learning_rate": 0.0006641176470588235, + "loss": 0.472, + "step": 12100 + }, + { + "epoch": 6.760335195530726, + "grad_norm": 0.45187756419181824, + "learning_rate": 0.0006640896358543418, + "loss": 0.575, + "step": 12101 + }, + { + "epoch": 6.760893854748604, + "grad_norm": 0.5821505188941956, + "learning_rate": 0.00066406162464986, + "loss": 0.4749, + "step": 12102 + }, + { + "epoch": 6.76145251396648, + "grad_norm": 1.4876537322998047, + "learning_rate": 0.0006640336134453781, + "loss": 0.4754, + "step": 12103 + }, + { + "epoch": 6.762011173184358, + "grad_norm": 0.866144061088562, + "learning_rate": 0.0006640056022408963, + "loss": 0.4289, + "step": 12104 + }, + { + "epoch": 6.762569832402234, + "grad_norm": 0.4579249620437622, + "learning_rate": 0.0006639775910364145, + "loss": 0.5009, + "step": 12105 + }, + { + "epoch": 6.763128491620112, + "grad_norm": 0.6374160051345825, + "learning_rate": 0.0006639495798319328, + "loss": 0.3892, + "step": 12106 + }, + { + "epoch": 6.763687150837989, + "grad_norm": 0.650264322757721, + "learning_rate": 0.000663921568627451, + "loss": 0.4387, + "step": 12107 + }, + { + "epoch": 6.764245810055866, + "grad_norm": 0.6742350459098816, + "learning_rate": 0.0006638935574229691, + "loss": 0.5609, + "step": 12108 + }, + { + "epoch": 6.764804469273743, + "grad_norm": 0.4441060423851013, + "learning_rate": 0.0006638655462184874, + "loss": 0.4389, + "step": 12109 + }, + { + "epoch": 6.76536312849162, + "grad_norm": 0.4060734510421753, + "learning_rate": 0.0006638375350140056, + "loss": 0.4307, + "step": 12110 + }, + { + "epoch": 6.765921787709497, + "grad_norm": 0.4801897406578064, + "learning_rate": 0.0006638095238095239, + "loss": 0.4521, + "step": 12111 + }, + { + "epoch": 6.766480446927375, + "grad_norm": 2.5180137157440186, + "learning_rate": 0.0006637815126050421, + "loss": 0.5442, + "step": 12112 + }, + { + "epoch": 6.767039106145251, + "grad_norm": 0.43988385796546936, + "learning_rate": 0.0006637535014005603, + "loss": 0.5378, + "step": 12113 + }, + { + "epoch": 6.767597765363129, + "grad_norm": 0.5194052457809448, + "learning_rate": 0.0006637254901960784, + "loss": 0.4361, + "step": 12114 + }, + { + "epoch": 6.768156424581005, + "grad_norm": 0.6256169080734253, + "learning_rate": 0.0006636974789915966, + "loss": 0.454, + "step": 12115 + }, + { + "epoch": 6.768715083798883, + "grad_norm": 0.4615011215209961, + "learning_rate": 0.0006636694677871149, + "loss": 0.4668, + "step": 12116 + }, + { + "epoch": 6.769273743016759, + "grad_norm": 0.4563002586364746, + "learning_rate": 0.0006636414565826331, + "loss": 0.3617, + "step": 12117 + }, + { + "epoch": 6.769832402234637, + "grad_norm": 1.2184298038482666, + "learning_rate": 0.0006636134453781513, + "loss": 0.3967, + "step": 12118 + }, + { + "epoch": 6.770391061452514, + "grad_norm": 0.7010212540626526, + "learning_rate": 0.0006635854341736694, + "loss": 0.4723, + "step": 12119 + }, + { + "epoch": 6.770949720670391, + "grad_norm": 0.4014045298099518, + "learning_rate": 0.0006635574229691876, + "loss": 0.3291, + "step": 12120 + }, + { + "epoch": 6.771508379888268, + "grad_norm": 0.43545007705688477, + "learning_rate": 0.0006635294117647059, + "loss": 0.4718, + "step": 12121 + }, + { + "epoch": 6.772067039106146, + "grad_norm": 0.48875871300697327, + "learning_rate": 0.0006635014005602241, + "loss": 0.4272, + "step": 12122 + }, + { + "epoch": 6.772625698324022, + "grad_norm": 0.6607202291488647, + "learning_rate": 0.0006634733893557423, + "loss": 0.4756, + "step": 12123 + }, + { + "epoch": 6.7731843575419, + "grad_norm": 1.0386394262313843, + "learning_rate": 0.0006634453781512604, + "loss": 0.4069, + "step": 12124 + }, + { + "epoch": 6.773743016759776, + "grad_norm": 1.3962101936340332, + "learning_rate": 0.0006634173669467786, + "loss": 0.3741, + "step": 12125 + }, + { + "epoch": 6.774301675977654, + "grad_norm": 0.5029513835906982, + "learning_rate": 0.000663389355742297, + "loss": 0.5083, + "step": 12126 + }, + { + "epoch": 6.77486033519553, + "grad_norm": 0.5117112398147583, + "learning_rate": 0.0006633613445378152, + "loss": 0.4086, + "step": 12127 + }, + { + "epoch": 6.775418994413408, + "grad_norm": 1.061491847038269, + "learning_rate": 0.0006633333333333334, + "loss": 0.3721, + "step": 12128 + }, + { + "epoch": 6.775977653631285, + "grad_norm": 1.292675256729126, + "learning_rate": 0.0006633053221288516, + "loss": 0.446, + "step": 12129 + }, + { + "epoch": 6.776536312849162, + "grad_norm": 0.5468816757202148, + "learning_rate": 0.0006632773109243697, + "loss": 0.4314, + "step": 12130 + }, + { + "epoch": 6.777094972067039, + "grad_norm": 0.594006359577179, + "learning_rate": 0.000663249299719888, + "loss": 0.4042, + "step": 12131 + }, + { + "epoch": 6.777653631284916, + "grad_norm": 0.34831440448760986, + "learning_rate": 0.0006632212885154062, + "loss": 0.3854, + "step": 12132 + }, + { + "epoch": 6.778212290502793, + "grad_norm": 0.6102625727653503, + "learning_rate": 0.0006631932773109244, + "loss": 0.5423, + "step": 12133 + }, + { + "epoch": 6.778770949720671, + "grad_norm": 0.4647621214389801, + "learning_rate": 0.0006631652661064426, + "loss": 0.4341, + "step": 12134 + }, + { + "epoch": 6.779329608938547, + "grad_norm": 0.4577479958534241, + "learning_rate": 0.0006631372549019607, + "loss": 0.358, + "step": 12135 + }, + { + "epoch": 6.779888268156425, + "grad_norm": 0.504641592502594, + "learning_rate": 0.000663109243697479, + "loss": 0.3962, + "step": 12136 + }, + { + "epoch": 6.780446927374301, + "grad_norm": 3.415985584259033, + "learning_rate": 0.0006630812324929972, + "loss": 0.4264, + "step": 12137 + }, + { + "epoch": 6.781005586592179, + "grad_norm": 6.4437713623046875, + "learning_rate": 0.0006630532212885154, + "loss": 0.434, + "step": 12138 + }, + { + "epoch": 6.781564245810056, + "grad_norm": 3.226740837097168, + "learning_rate": 0.0006630252100840336, + "loss": 0.4523, + "step": 12139 + }, + { + "epoch": 6.782122905027933, + "grad_norm": 0.8114117980003357, + "learning_rate": 0.0006629971988795517, + "loss": 0.4896, + "step": 12140 + }, + { + "epoch": 6.78268156424581, + "grad_norm": 0.7347950339317322, + "learning_rate": 0.00066296918767507, + "loss": 0.4357, + "step": 12141 + }, + { + "epoch": 6.783240223463687, + "grad_norm": 1.5617716312408447, + "learning_rate": 0.0006629411764705883, + "loss": 0.4382, + "step": 12142 + }, + { + "epoch": 6.783798882681564, + "grad_norm": 2.4106078147888184, + "learning_rate": 0.0006629131652661065, + "loss": 0.3704, + "step": 12143 + }, + { + "epoch": 6.784357541899441, + "grad_norm": 0.8199379444122314, + "learning_rate": 0.0006628851540616247, + "loss": 0.4761, + "step": 12144 + }, + { + "epoch": 6.784916201117318, + "grad_norm": 0.46194446086883545, + "learning_rate": 0.0006628571428571429, + "loss": 0.4813, + "step": 12145 + }, + { + "epoch": 6.785474860335196, + "grad_norm": 0.7091647982597351, + "learning_rate": 0.0006628291316526611, + "loss": 0.4157, + "step": 12146 + }, + { + "epoch": 6.786033519553072, + "grad_norm": 0.5812698006629944, + "learning_rate": 0.0006628011204481793, + "loss": 0.4228, + "step": 12147 + }, + { + "epoch": 6.78659217877095, + "grad_norm": 0.39596813917160034, + "learning_rate": 0.0006627731092436975, + "loss": 0.4331, + "step": 12148 + }, + { + "epoch": 6.787150837988827, + "grad_norm": 0.5265296697616577, + "learning_rate": 0.0006627450980392157, + "loss": 0.5176, + "step": 12149 + }, + { + "epoch": 6.787709497206704, + "grad_norm": 0.38072067499160767, + "learning_rate": 0.0006627170868347339, + "loss": 0.2953, + "step": 12150 + }, + { + "epoch": 6.788268156424581, + "grad_norm": 0.7729352712631226, + "learning_rate": 0.0006626890756302521, + "loss": 0.6336, + "step": 12151 + }, + { + "epoch": 6.788826815642458, + "grad_norm": 0.4469284415245056, + "learning_rate": 0.0006626610644257703, + "loss": 0.3789, + "step": 12152 + }, + { + "epoch": 6.789385474860335, + "grad_norm": 0.3830004632472992, + "learning_rate": 0.0006626330532212885, + "loss": 0.4208, + "step": 12153 + }, + { + "epoch": 6.789944134078212, + "grad_norm": 0.4384060502052307, + "learning_rate": 0.0006626050420168067, + "loss": 0.4982, + "step": 12154 + }, + { + "epoch": 6.790502793296089, + "grad_norm": 0.4818457365036011, + "learning_rate": 0.0006625770308123249, + "loss": 0.6442, + "step": 12155 + }, + { + "epoch": 6.791061452513967, + "grad_norm": 0.780181348323822, + "learning_rate": 0.0006625490196078431, + "loss": 0.3893, + "step": 12156 + }, + { + "epoch": 6.791620111731843, + "grad_norm": 0.5324886441230774, + "learning_rate": 0.0006625210084033613, + "loss": 0.5591, + "step": 12157 + }, + { + "epoch": 6.792178770949721, + "grad_norm": 0.6413914561271667, + "learning_rate": 0.0006624929971988796, + "loss": 0.4302, + "step": 12158 + }, + { + "epoch": 6.792737430167598, + "grad_norm": 0.5562315583229065, + "learning_rate": 0.0006624649859943978, + "loss": 0.3592, + "step": 12159 + }, + { + "epoch": 6.793296089385475, + "grad_norm": 0.4010394811630249, + "learning_rate": 0.000662436974789916, + "loss": 0.402, + "step": 12160 + }, + { + "epoch": 6.793854748603352, + "grad_norm": 0.6908612251281738, + "learning_rate": 0.0006624089635854343, + "loss": 0.5004, + "step": 12161 + }, + { + "epoch": 6.794413407821229, + "grad_norm": 0.48671984672546387, + "learning_rate": 0.0006623809523809524, + "loss": 0.4387, + "step": 12162 + }, + { + "epoch": 6.794972067039106, + "grad_norm": 10.109496116638184, + "learning_rate": 0.0006623529411764706, + "loss": 0.4356, + "step": 12163 + }, + { + "epoch": 6.795530726256983, + "grad_norm": 0.5088633894920349, + "learning_rate": 0.0006623249299719888, + "loss": 0.3064, + "step": 12164 + }, + { + "epoch": 6.79608938547486, + "grad_norm": 0.4546014368534088, + "learning_rate": 0.000662296918767507, + "loss": 0.5701, + "step": 12165 + }, + { + "epoch": 6.796648044692738, + "grad_norm": 1.4646694660186768, + "learning_rate": 0.0006622689075630253, + "loss": 0.5189, + "step": 12166 + }, + { + "epoch": 6.797206703910614, + "grad_norm": 0.4038674533367157, + "learning_rate": 0.0006622408963585434, + "loss": 0.3389, + "step": 12167 + }, + { + "epoch": 6.797765363128492, + "grad_norm": 1.1339504718780518, + "learning_rate": 0.0006622128851540616, + "loss": 0.6742, + "step": 12168 + }, + { + "epoch": 6.798324022346368, + "grad_norm": 0.6354513168334961, + "learning_rate": 0.0006621848739495798, + "loss": 0.4754, + "step": 12169 + }, + { + "epoch": 6.798882681564246, + "grad_norm": 0.4913661777973175, + "learning_rate": 0.000662156862745098, + "loss": 0.4298, + "step": 12170 + }, + { + "epoch": 6.799441340782123, + "grad_norm": 0.469773530960083, + "learning_rate": 0.0006621288515406163, + "loss": 0.5205, + "step": 12171 + }, + { + "epoch": 6.8, + "grad_norm": 0.5128031969070435, + "learning_rate": 0.0006621008403361344, + "loss": 0.3146, + "step": 12172 + }, + { + "epoch": 6.800558659217877, + "grad_norm": 4.143674850463867, + "learning_rate": 0.0006620728291316526, + "loss": 0.383, + "step": 12173 + }, + { + "epoch": 6.801117318435754, + "grad_norm": 0.6529032588005066, + "learning_rate": 0.0006620448179271709, + "loss": 0.5017, + "step": 12174 + }, + { + "epoch": 6.801675977653631, + "grad_norm": 0.5830149054527283, + "learning_rate": 0.0006620168067226891, + "loss": 0.474, + "step": 12175 + }, + { + "epoch": 6.802234636871509, + "grad_norm": 0.6475193500518799, + "learning_rate": 0.0006619887955182074, + "loss": 0.411, + "step": 12176 + }, + { + "epoch": 6.802793296089385, + "grad_norm": 0.5908905267715454, + "learning_rate": 0.0006619607843137256, + "loss": 0.419, + "step": 12177 + }, + { + "epoch": 6.803351955307263, + "grad_norm": 0.47705554962158203, + "learning_rate": 0.0006619327731092437, + "loss": 0.494, + "step": 12178 + }, + { + "epoch": 6.803910614525139, + "grad_norm": 0.5022670030593872, + "learning_rate": 0.0006619047619047619, + "loss": 0.5003, + "step": 12179 + }, + { + "epoch": 6.804469273743017, + "grad_norm": 0.47296231985092163, + "learning_rate": 0.0006618767507002801, + "loss": 0.5382, + "step": 12180 + }, + { + "epoch": 6.805027932960893, + "grad_norm": 0.5288333296775818, + "learning_rate": 0.0006618487394957984, + "loss": 0.4106, + "step": 12181 + }, + { + "epoch": 6.805586592178771, + "grad_norm": 0.36206546425819397, + "learning_rate": 0.0006618207282913166, + "loss": 0.4195, + "step": 12182 + }, + { + "epoch": 6.806145251396648, + "grad_norm": 1.007214069366455, + "learning_rate": 0.0006617927170868347, + "loss": 0.4235, + "step": 12183 + }, + { + "epoch": 6.806703910614525, + "grad_norm": 0.46468690037727356, + "learning_rate": 0.0006617647058823529, + "loss": 0.3398, + "step": 12184 + }, + { + "epoch": 6.807262569832402, + "grad_norm": 0.5715383887290955, + "learning_rate": 0.0006617366946778711, + "loss": 0.5331, + "step": 12185 + }, + { + "epoch": 6.80782122905028, + "grad_norm": 0.48019129037857056, + "learning_rate": 0.0006617086834733894, + "loss": 0.4826, + "step": 12186 + }, + { + "epoch": 6.808379888268156, + "grad_norm": 1.925338864326477, + "learning_rate": 0.0006616806722689076, + "loss": 0.4267, + "step": 12187 + }, + { + "epoch": 6.808938547486034, + "grad_norm": 0.42642077803611755, + "learning_rate": 0.0006616526610644257, + "loss": 0.4655, + "step": 12188 + }, + { + "epoch": 6.80949720670391, + "grad_norm": 0.5507749915122986, + "learning_rate": 0.0006616246498599439, + "loss": 0.3932, + "step": 12189 + }, + { + "epoch": 6.810055865921788, + "grad_norm": 0.7477067112922668, + "learning_rate": 0.0006615966386554621, + "loss": 0.4779, + "step": 12190 + }, + { + "epoch": 6.810614525139664, + "grad_norm": 0.4381631314754486, + "learning_rate": 0.0006615686274509805, + "loss": 0.3774, + "step": 12191 + }, + { + "epoch": 6.811173184357542, + "grad_norm": 0.561799168586731, + "learning_rate": 0.0006615406162464987, + "loss": 0.5242, + "step": 12192 + }, + { + "epoch": 6.811731843575419, + "grad_norm": 0.4435920715332031, + "learning_rate": 0.0006615126050420169, + "loss": 0.4929, + "step": 12193 + }, + { + "epoch": 6.812290502793296, + "grad_norm": 0.4417363703250885, + "learning_rate": 0.000661484593837535, + "loss": 0.4342, + "step": 12194 + }, + { + "epoch": 6.812849162011173, + "grad_norm": 0.5318664908409119, + "learning_rate": 0.0006614565826330532, + "loss": 0.4526, + "step": 12195 + }, + { + "epoch": 6.813407821229051, + "grad_norm": 0.4912884831428528, + "learning_rate": 0.0006614285714285715, + "loss": 0.6299, + "step": 12196 + }, + { + "epoch": 6.813966480446927, + "grad_norm": 0.44741302728652954, + "learning_rate": 0.0006614005602240897, + "loss": 0.4084, + "step": 12197 + }, + { + "epoch": 6.814525139664805, + "grad_norm": 0.4176372289657593, + "learning_rate": 0.0006613725490196079, + "loss": 0.4655, + "step": 12198 + }, + { + "epoch": 6.815083798882681, + "grad_norm": 0.48217812180519104, + "learning_rate": 0.000661344537815126, + "loss": 0.4032, + "step": 12199 + }, + { + "epoch": 6.815642458100559, + "grad_norm": 0.4372022747993469, + "learning_rate": 0.0006613165266106442, + "loss": 0.507, + "step": 12200 + }, + { + "epoch": 6.816201117318435, + "grad_norm": 0.7491111159324646, + "learning_rate": 0.0006612885154061625, + "loss": 0.5372, + "step": 12201 + }, + { + "epoch": 6.816759776536313, + "grad_norm": 1.053197979927063, + "learning_rate": 0.0006612605042016807, + "loss": 0.5283, + "step": 12202 + }, + { + "epoch": 6.81731843575419, + "grad_norm": 1.8735992908477783, + "learning_rate": 0.0006612324929971989, + "loss": 0.3841, + "step": 12203 + }, + { + "epoch": 6.817877094972067, + "grad_norm": 2.15081524848938, + "learning_rate": 0.000661204481792717, + "loss": 0.5999, + "step": 12204 + }, + { + "epoch": 6.818435754189944, + "grad_norm": 0.4968929886817932, + "learning_rate": 0.0006611764705882352, + "loss": 0.4864, + "step": 12205 + }, + { + "epoch": 6.818994413407821, + "grad_norm": 0.548363983631134, + "learning_rate": 0.0006611484593837536, + "loss": 0.4815, + "step": 12206 + }, + { + "epoch": 6.819553072625698, + "grad_norm": 0.5376548171043396, + "learning_rate": 0.0006611204481792718, + "loss": 0.462, + "step": 12207 + }, + { + "epoch": 6.820111731843576, + "grad_norm": 1.8746628761291504, + "learning_rate": 0.00066109243697479, + "loss": 0.3926, + "step": 12208 + }, + { + "epoch": 6.820670391061452, + "grad_norm": 0.4753383994102478, + "learning_rate": 0.0006610644257703082, + "loss": 0.4537, + "step": 12209 + }, + { + "epoch": 6.82122905027933, + "grad_norm": 0.5809012651443481, + "learning_rate": 0.0006610364145658263, + "loss": 0.4553, + "step": 12210 + }, + { + "epoch": 6.821787709497206, + "grad_norm": 0.6956520676612854, + "learning_rate": 0.0006610084033613446, + "loss": 0.4233, + "step": 12211 + }, + { + "epoch": 6.822346368715084, + "grad_norm": 0.5689812302589417, + "learning_rate": 0.0006609803921568628, + "loss": 0.5841, + "step": 12212 + }, + { + "epoch": 6.822905027932961, + "grad_norm": 0.7909679412841797, + "learning_rate": 0.000660952380952381, + "loss": 0.5484, + "step": 12213 + }, + { + "epoch": 6.823463687150838, + "grad_norm": 0.5619382858276367, + "learning_rate": 0.0006609243697478992, + "loss": 0.341, + "step": 12214 + }, + { + "epoch": 6.824022346368715, + "grad_norm": 0.41307708621025085, + "learning_rate": 0.0006608963585434173, + "loss": 0.431, + "step": 12215 + }, + { + "epoch": 6.824581005586592, + "grad_norm": 0.4418524205684662, + "learning_rate": 0.0006608683473389356, + "loss": 0.3833, + "step": 12216 + }, + { + "epoch": 6.825139664804469, + "grad_norm": 0.9368011355400085, + "learning_rate": 0.0006608403361344538, + "loss": 0.4328, + "step": 12217 + }, + { + "epoch": 6.825698324022346, + "grad_norm": 0.8204415440559387, + "learning_rate": 0.000660812324929972, + "loss": 0.5172, + "step": 12218 + }, + { + "epoch": 6.826256983240223, + "grad_norm": 0.90938800573349, + "learning_rate": 0.0006607843137254902, + "loss": 0.431, + "step": 12219 + }, + { + "epoch": 6.826815642458101, + "grad_norm": 0.48122236132621765, + "learning_rate": 0.0006607563025210083, + "loss": 0.3777, + "step": 12220 + }, + { + "epoch": 6.827374301675977, + "grad_norm": 0.4968211054801941, + "learning_rate": 0.0006607282913165266, + "loss": 0.5716, + "step": 12221 + }, + { + "epoch": 6.827932960893855, + "grad_norm": 0.4477780759334564, + "learning_rate": 0.0006607002801120448, + "loss": 0.4006, + "step": 12222 + }, + { + "epoch": 6.828491620111732, + "grad_norm": 0.5600734949111938, + "learning_rate": 0.000660672268907563, + "loss": 0.5963, + "step": 12223 + }, + { + "epoch": 6.829050279329609, + "grad_norm": 1.032914400100708, + "learning_rate": 0.0006606442577030813, + "loss": 0.4769, + "step": 12224 + }, + { + "epoch": 6.829608938547486, + "grad_norm": 0.7054173946380615, + "learning_rate": 0.0006606162464985995, + "loss": 0.3962, + "step": 12225 + }, + { + "epoch": 6.830167597765363, + "grad_norm": 2.3067338466644287, + "learning_rate": 0.0006605882352941177, + "loss": 0.589, + "step": 12226 + }, + { + "epoch": 6.83072625698324, + "grad_norm": 0.525468647480011, + "learning_rate": 0.0006605602240896359, + "loss": 0.4507, + "step": 12227 + }, + { + "epoch": 6.831284916201117, + "grad_norm": 0.4376436471939087, + "learning_rate": 0.0006605322128851541, + "loss": 0.4141, + "step": 12228 + }, + { + "epoch": 6.831843575418994, + "grad_norm": 2.277660846710205, + "learning_rate": 0.0006605042016806723, + "loss": 0.4964, + "step": 12229 + }, + { + "epoch": 6.832402234636872, + "grad_norm": 0.5358434319496155, + "learning_rate": 0.0006604761904761905, + "loss": 0.4127, + "step": 12230 + }, + { + "epoch": 6.832960893854748, + "grad_norm": 0.5851360559463501, + "learning_rate": 0.0006604481792717087, + "loss": 0.4064, + "step": 12231 + }, + { + "epoch": 6.833519553072626, + "grad_norm": 0.3869569003582001, + "learning_rate": 0.0006604201680672269, + "loss": 0.4763, + "step": 12232 + }, + { + "epoch": 6.834078212290503, + "grad_norm": 1.0456703901290894, + "learning_rate": 0.0006603921568627451, + "loss": 0.6841, + "step": 12233 + }, + { + "epoch": 6.83463687150838, + "grad_norm": 0.45951998233795166, + "learning_rate": 0.0006603641456582633, + "loss": 0.4759, + "step": 12234 + }, + { + "epoch": 6.835195530726257, + "grad_norm": 0.4701312780380249, + "learning_rate": 0.0006603361344537815, + "loss": 0.5475, + "step": 12235 + }, + { + "epoch": 6.835754189944134, + "grad_norm": 0.513654887676239, + "learning_rate": 0.0006603081232492998, + "loss": 0.4243, + "step": 12236 + }, + { + "epoch": 6.836312849162011, + "grad_norm": 0.6751559972763062, + "learning_rate": 0.0006602801120448179, + "loss": 0.4939, + "step": 12237 + }, + { + "epoch": 6.836871508379888, + "grad_norm": 0.5372594594955444, + "learning_rate": 0.0006602521008403361, + "loss": 0.486, + "step": 12238 + }, + { + "epoch": 6.837430167597765, + "grad_norm": 2.6426103115081787, + "learning_rate": 0.0006602240896358543, + "loss": 0.5077, + "step": 12239 + }, + { + "epoch": 6.837988826815643, + "grad_norm": 0.5769447684288025, + "learning_rate": 0.0006601960784313726, + "loss": 0.5089, + "step": 12240 + }, + { + "epoch": 6.838547486033519, + "grad_norm": 0.6189960241317749, + "learning_rate": 0.0006601680672268909, + "loss": 0.4863, + "step": 12241 + }, + { + "epoch": 6.839106145251397, + "grad_norm": 0.8525975346565247, + "learning_rate": 0.000660140056022409, + "loss": 0.3988, + "step": 12242 + }, + { + "epoch": 6.839664804469273, + "grad_norm": 0.4903383255004883, + "learning_rate": 0.0006601120448179272, + "loss": 0.479, + "step": 12243 + }, + { + "epoch": 6.840223463687151, + "grad_norm": 1.2140885591506958, + "learning_rate": 0.0006600840336134454, + "loss": 0.5013, + "step": 12244 + }, + { + "epoch": 6.840782122905028, + "grad_norm": 1.1946229934692383, + "learning_rate": 0.0006600560224089636, + "loss": 0.4261, + "step": 12245 + }, + { + "epoch": 6.841340782122905, + "grad_norm": 0.6053236722946167, + "learning_rate": 0.0006600280112044819, + "loss": 0.6284, + "step": 12246 + }, + { + "epoch": 6.841899441340782, + "grad_norm": 0.5141407251358032, + "learning_rate": 0.00066, + "loss": 0.3859, + "step": 12247 + }, + { + "epoch": 6.842458100558659, + "grad_norm": 0.5617372393608093, + "learning_rate": 0.0006599719887955182, + "loss": 0.4183, + "step": 12248 + }, + { + "epoch": 6.843016759776536, + "grad_norm": 0.7290052771568298, + "learning_rate": 0.0006599439775910364, + "loss": 0.4398, + "step": 12249 + }, + { + "epoch": 6.843575418994414, + "grad_norm": 0.4437260031700134, + "learning_rate": 0.0006599159663865546, + "loss": 0.3766, + "step": 12250 + }, + { + "epoch": 6.84413407821229, + "grad_norm": 0.43845051527023315, + "learning_rate": 0.0006598879551820729, + "loss": 0.3611, + "step": 12251 + }, + { + "epoch": 6.844692737430168, + "grad_norm": 0.6631003022193909, + "learning_rate": 0.0006598599439775911, + "loss": 0.5254, + "step": 12252 + }, + { + "epoch": 6.845251396648044, + "grad_norm": 0.39586126804351807, + "learning_rate": 0.0006598319327731092, + "loss": 0.4706, + "step": 12253 + }, + { + "epoch": 6.845810055865922, + "grad_norm": 0.3851293921470642, + "learning_rate": 0.0006598039215686274, + "loss": 0.3492, + "step": 12254 + }, + { + "epoch": 6.846368715083798, + "grad_norm": 0.8514540195465088, + "learning_rate": 0.0006597759103641456, + "loss": 0.4679, + "step": 12255 + }, + { + "epoch": 6.846927374301676, + "grad_norm": 0.8617717027664185, + "learning_rate": 0.000659747899159664, + "loss": 0.3896, + "step": 12256 + }, + { + "epoch": 6.847486033519553, + "grad_norm": 0.47195181250572205, + "learning_rate": 0.0006597198879551822, + "loss": 0.5756, + "step": 12257 + }, + { + "epoch": 6.84804469273743, + "grad_norm": 0.6824280023574829, + "learning_rate": 0.0006596918767507003, + "loss": 0.4798, + "step": 12258 + }, + { + "epoch": 6.848603351955307, + "grad_norm": 1.5259748697280884, + "learning_rate": 0.0006596638655462185, + "loss": 0.4248, + "step": 12259 + }, + { + "epoch": 6.849162011173185, + "grad_norm": 1.1958094835281372, + "learning_rate": 0.0006596358543417367, + "loss": 0.5645, + "step": 12260 + }, + { + "epoch": 6.849720670391061, + "grad_norm": 0.5710233449935913, + "learning_rate": 0.000659607843137255, + "loss": 0.4509, + "step": 12261 + }, + { + "epoch": 6.850279329608939, + "grad_norm": 0.6793966889381409, + "learning_rate": 0.0006595798319327732, + "loss": 0.3582, + "step": 12262 + }, + { + "epoch": 6.850837988826815, + "grad_norm": 0.4943723678588867, + "learning_rate": 0.0006595518207282913, + "loss": 0.4839, + "step": 12263 + }, + { + "epoch": 6.851396648044693, + "grad_norm": 0.6937837600708008, + "learning_rate": 0.0006595238095238095, + "loss": 0.4814, + "step": 12264 + }, + { + "epoch": 6.851955307262569, + "grad_norm": 0.5042548179626465, + "learning_rate": 0.0006594957983193277, + "loss": 0.4474, + "step": 12265 + }, + { + "epoch": 6.852513966480447, + "grad_norm": 0.6350553035736084, + "learning_rate": 0.000659467787114846, + "loss": 0.4229, + "step": 12266 + }, + { + "epoch": 6.853072625698324, + "grad_norm": 0.44231465458869934, + "learning_rate": 0.0006594397759103642, + "loss": 0.3864, + "step": 12267 + }, + { + "epoch": 6.853631284916201, + "grad_norm": 0.4294978976249695, + "learning_rate": 0.0006594117647058824, + "loss": 0.3916, + "step": 12268 + }, + { + "epoch": 6.854189944134078, + "grad_norm": 1.0935415029525757, + "learning_rate": 0.0006593837535014005, + "loss": 0.4628, + "step": 12269 + }, + { + "epoch": 6.854748603351956, + "grad_norm": 0.6725505590438843, + "learning_rate": 0.0006593557422969187, + "loss": 0.5135, + "step": 12270 + }, + { + "epoch": 6.855307262569832, + "grad_norm": 0.8837329149246216, + "learning_rate": 0.0006593277310924369, + "loss": 0.6269, + "step": 12271 + }, + { + "epoch": 6.85586592178771, + "grad_norm": 0.5046632885932922, + "learning_rate": 0.0006592997198879553, + "loss": 0.4485, + "step": 12272 + }, + { + "epoch": 6.856424581005586, + "grad_norm": 0.4662707448005676, + "learning_rate": 0.0006592717086834735, + "loss": 0.4243, + "step": 12273 + }, + { + "epoch": 6.856983240223464, + "grad_norm": 0.39036089181900024, + "learning_rate": 0.0006592436974789916, + "loss": 0.3864, + "step": 12274 + }, + { + "epoch": 6.85754189944134, + "grad_norm": 0.4357408285140991, + "learning_rate": 0.0006592156862745098, + "loss": 0.4034, + "step": 12275 + }, + { + "epoch": 6.858100558659218, + "grad_norm": 0.4454643726348877, + "learning_rate": 0.000659187675070028, + "loss": 0.3906, + "step": 12276 + }, + { + "epoch": 6.858659217877095, + "grad_norm": 1.276620864868164, + "learning_rate": 0.0006591596638655463, + "loss": 0.404, + "step": 12277 + }, + { + "epoch": 6.859217877094972, + "grad_norm": 0.4004911780357361, + "learning_rate": 0.0006591316526610645, + "loss": 0.4148, + "step": 12278 + }, + { + "epoch": 6.859776536312849, + "grad_norm": 0.5384070873260498, + "learning_rate": 0.0006591036414565826, + "loss": 0.5143, + "step": 12279 + }, + { + "epoch": 6.860335195530726, + "grad_norm": 0.8736839294433594, + "learning_rate": 0.0006590756302521008, + "loss": 0.3602, + "step": 12280 + }, + { + "epoch": 6.860893854748603, + "grad_norm": 0.8059192895889282, + "learning_rate": 0.000659047619047619, + "loss": 0.4569, + "step": 12281 + }, + { + "epoch": 6.861452513966481, + "grad_norm": 0.8635017275810242, + "learning_rate": 0.0006590196078431373, + "loss": 0.5441, + "step": 12282 + }, + { + "epoch": 6.862011173184357, + "grad_norm": 0.5241972804069519, + "learning_rate": 0.0006589915966386555, + "loss": 0.4499, + "step": 12283 + }, + { + "epoch": 6.862569832402235, + "grad_norm": 0.5192648768424988, + "learning_rate": 0.0006589635854341737, + "loss": 0.4598, + "step": 12284 + }, + { + "epoch": 6.863128491620111, + "grad_norm": 0.6250534653663635, + "learning_rate": 0.0006589355742296918, + "loss": 0.3846, + "step": 12285 + }, + { + "epoch": 6.863687150837989, + "grad_norm": 0.38513192534446716, + "learning_rate": 0.00065890756302521, + "loss": 0.3629, + "step": 12286 + }, + { + "epoch": 6.864245810055866, + "grad_norm": 0.5308732986450195, + "learning_rate": 0.0006588795518207283, + "loss": 0.5477, + "step": 12287 + }, + { + "epoch": 6.864804469273743, + "grad_norm": 0.4723777770996094, + "learning_rate": 0.0006588515406162466, + "loss": 0.372, + "step": 12288 + }, + { + "epoch": 6.86536312849162, + "grad_norm": 0.544681191444397, + "learning_rate": 0.0006588235294117648, + "loss": 0.5191, + "step": 12289 + }, + { + "epoch": 6.865921787709497, + "grad_norm": 0.4156869947910309, + "learning_rate": 0.0006587955182072829, + "loss": 0.3577, + "step": 12290 + }, + { + "epoch": 6.866480446927374, + "grad_norm": 0.9283410310745239, + "learning_rate": 0.0006587675070028011, + "loss": 0.4218, + "step": 12291 + }, + { + "epoch": 6.867039106145251, + "grad_norm": 0.45266541838645935, + "learning_rate": 0.0006587394957983194, + "loss": 0.3658, + "step": 12292 + }, + { + "epoch": 6.867597765363128, + "grad_norm": 0.4927558898925781, + "learning_rate": 0.0006587114845938376, + "loss": 0.3867, + "step": 12293 + }, + { + "epoch": 6.868156424581006, + "grad_norm": 0.6897336840629578, + "learning_rate": 0.0006586834733893558, + "loss": 0.4684, + "step": 12294 + }, + { + "epoch": 6.868715083798882, + "grad_norm": 0.9791338443756104, + "learning_rate": 0.0006586554621848739, + "loss": 0.4581, + "step": 12295 + }, + { + "epoch": 6.86927374301676, + "grad_norm": 0.5750959515571594, + "learning_rate": 0.0006586274509803921, + "loss": 0.3691, + "step": 12296 + }, + { + "epoch": 6.869832402234637, + "grad_norm": 0.4801139533519745, + "learning_rate": 0.0006585994397759104, + "loss": 0.4913, + "step": 12297 + }, + { + "epoch": 6.870391061452514, + "grad_norm": 0.4182223379611969, + "learning_rate": 0.0006585714285714286, + "loss": 0.4591, + "step": 12298 + }, + { + "epoch": 6.870949720670391, + "grad_norm": 0.5962969660758972, + "learning_rate": 0.0006585434173669468, + "loss": 0.4151, + "step": 12299 + }, + { + "epoch": 6.871508379888268, + "grad_norm": 1.4177026748657227, + "learning_rate": 0.000658515406162465, + "loss": 0.5151, + "step": 12300 + }, + { + "epoch": 6.872067039106145, + "grad_norm": 0.6685916185379028, + "learning_rate": 0.0006584873949579831, + "loss": 0.5699, + "step": 12301 + }, + { + "epoch": 6.872625698324022, + "grad_norm": 0.5574515461921692, + "learning_rate": 0.0006584593837535014, + "loss": 0.3825, + "step": 12302 + }, + { + "epoch": 6.873184357541899, + "grad_norm": 1.0649938583374023, + "learning_rate": 0.0006584313725490196, + "loss": 0.4321, + "step": 12303 + }, + { + "epoch": 6.873743016759777, + "grad_norm": 0.517022967338562, + "learning_rate": 0.0006584033613445378, + "loss": 0.5233, + "step": 12304 + }, + { + "epoch": 6.874301675977653, + "grad_norm": 0.6731800436973572, + "learning_rate": 0.000658375350140056, + "loss": 0.603, + "step": 12305 + }, + { + "epoch": 6.874860335195531, + "grad_norm": 0.4743667542934418, + "learning_rate": 0.0006583473389355742, + "loss": 0.4186, + "step": 12306 + }, + { + "epoch": 6.875418994413408, + "grad_norm": 0.5006465315818787, + "learning_rate": 0.0006583193277310925, + "loss": 0.3138, + "step": 12307 + }, + { + "epoch": 6.875977653631285, + "grad_norm": 0.5262283086776733, + "learning_rate": 0.0006582913165266107, + "loss": 0.4515, + "step": 12308 + }, + { + "epoch": 6.876536312849162, + "grad_norm": 0.6108571887016296, + "learning_rate": 0.0006582633053221289, + "loss": 0.4945, + "step": 12309 + }, + { + "epoch": 6.877094972067039, + "grad_norm": 0.795237123966217, + "learning_rate": 0.0006582352941176471, + "loss": 0.4214, + "step": 12310 + }, + { + "epoch": 6.877653631284916, + "grad_norm": 0.6529590487480164, + "learning_rate": 0.0006582072829131652, + "loss": 0.4101, + "step": 12311 + }, + { + "epoch": 6.878212290502793, + "grad_norm": 0.4235641658306122, + "learning_rate": 0.0006581792717086835, + "loss": 0.4968, + "step": 12312 + }, + { + "epoch": 6.87877094972067, + "grad_norm": 0.4552220106124878, + "learning_rate": 0.0006581512605042017, + "loss": 0.4687, + "step": 12313 + }, + { + "epoch": 6.879329608938548, + "grad_norm": 0.41594234108924866, + "learning_rate": 0.0006581232492997199, + "loss": 0.372, + "step": 12314 + }, + { + "epoch": 6.879888268156424, + "grad_norm": 0.4671579599380493, + "learning_rate": 0.0006580952380952381, + "loss": 0.4097, + "step": 12315 + }, + { + "epoch": 6.880446927374302, + "grad_norm": 0.6716263890266418, + "learning_rate": 0.0006580672268907563, + "loss": 0.6013, + "step": 12316 + }, + { + "epoch": 6.881005586592178, + "grad_norm": 0.4165213406085968, + "learning_rate": 0.0006580392156862745, + "loss": 0.4166, + "step": 12317 + }, + { + "epoch": 6.881564245810056, + "grad_norm": 0.539641261100769, + "learning_rate": 0.0006580112044817927, + "loss": 0.3606, + "step": 12318 + }, + { + "epoch": 6.882122905027933, + "grad_norm": 0.7333235740661621, + "learning_rate": 0.0006579831932773109, + "loss": 0.4156, + "step": 12319 + }, + { + "epoch": 6.88268156424581, + "grad_norm": 0.39429736137390137, + "learning_rate": 0.0006579551820728291, + "loss": 0.444, + "step": 12320 + }, + { + "epoch": 6.883240223463687, + "grad_norm": 0.6604355573654175, + "learning_rate": 0.0006579271708683473, + "loss": 0.5062, + "step": 12321 + }, + { + "epoch": 6.883798882681564, + "grad_norm": 0.43606069684028625, + "learning_rate": 0.0006578991596638656, + "loss": 0.4795, + "step": 12322 + }, + { + "epoch": 6.884357541899441, + "grad_norm": 0.7540426254272461, + "learning_rate": 0.0006578711484593838, + "loss": 0.5598, + "step": 12323 + }, + { + "epoch": 6.884916201117319, + "grad_norm": 0.4999866485595703, + "learning_rate": 0.000657843137254902, + "loss": 0.4493, + "step": 12324 + }, + { + "epoch": 6.885474860335195, + "grad_norm": 1.0222454071044922, + "learning_rate": 0.0006578151260504202, + "loss": 0.4033, + "step": 12325 + }, + { + "epoch": 6.886033519553073, + "grad_norm": 0.40791088342666626, + "learning_rate": 0.0006577871148459384, + "loss": 0.3806, + "step": 12326 + }, + { + "epoch": 6.886592178770949, + "grad_norm": 0.45409032702445984, + "learning_rate": 0.0006577591036414566, + "loss": 0.4798, + "step": 12327 + }, + { + "epoch": 6.887150837988827, + "grad_norm": 0.6834008693695068, + "learning_rate": 0.0006577310924369748, + "loss": 0.3432, + "step": 12328 + }, + { + "epoch": 6.8877094972067034, + "grad_norm": 3.514155149459839, + "learning_rate": 0.000657703081232493, + "loss": 0.5613, + "step": 12329 + }, + { + "epoch": 6.888268156424581, + "grad_norm": 0.44188401103019714, + "learning_rate": 0.0006576750700280112, + "loss": 0.4209, + "step": 12330 + }, + { + "epoch": 6.888826815642458, + "grad_norm": 1.5433259010314941, + "learning_rate": 0.0006576470588235294, + "loss": 0.4409, + "step": 12331 + }, + { + "epoch": 6.889385474860335, + "grad_norm": 0.7367574572563171, + "learning_rate": 0.0006576190476190477, + "loss": 0.4655, + "step": 12332 + }, + { + "epoch": 6.889944134078212, + "grad_norm": 2.3072218894958496, + "learning_rate": 0.0006575910364145658, + "loss": 0.4912, + "step": 12333 + }, + { + "epoch": 6.89050279329609, + "grad_norm": 0.5888306498527527, + "learning_rate": 0.000657563025210084, + "loss": 0.407, + "step": 12334 + }, + { + "epoch": 6.891061452513966, + "grad_norm": 0.5177647471427917, + "learning_rate": 0.0006575350140056022, + "loss": 0.4127, + "step": 12335 + }, + { + "epoch": 6.891620111731844, + "grad_norm": 0.5400302410125732, + "learning_rate": 0.0006575070028011204, + "loss": 0.4269, + "step": 12336 + }, + { + "epoch": 6.89217877094972, + "grad_norm": 0.6724732518196106, + "learning_rate": 0.0006574789915966388, + "loss": 0.4289, + "step": 12337 + }, + { + "epoch": 6.892737430167598, + "grad_norm": 0.9561570286750793, + "learning_rate": 0.0006574509803921569, + "loss": 0.5423, + "step": 12338 + }, + { + "epoch": 6.8932960893854744, + "grad_norm": 0.48202231526374817, + "learning_rate": 0.0006574229691876751, + "loss": 0.4312, + "step": 12339 + }, + { + "epoch": 6.893854748603352, + "grad_norm": 1.6430753469467163, + "learning_rate": 0.0006573949579831933, + "loss": 0.498, + "step": 12340 + }, + { + "epoch": 6.894413407821229, + "grad_norm": 0.5640503764152527, + "learning_rate": 0.0006573669467787115, + "loss": 0.3957, + "step": 12341 + }, + { + "epoch": 6.894972067039106, + "grad_norm": 0.3838455080986023, + "learning_rate": 0.0006573389355742298, + "loss": 0.4335, + "step": 12342 + }, + { + "epoch": 6.895530726256983, + "grad_norm": 0.5746883749961853, + "learning_rate": 0.0006573109243697479, + "loss": 0.3845, + "step": 12343 + }, + { + "epoch": 6.896089385474861, + "grad_norm": 0.4784086346626282, + "learning_rate": 0.0006572829131652661, + "loss": 0.4257, + "step": 12344 + }, + { + "epoch": 6.896648044692737, + "grad_norm": 0.752626359462738, + "learning_rate": 0.0006572549019607843, + "loss": 0.4444, + "step": 12345 + }, + { + "epoch": 6.897206703910615, + "grad_norm": 1.0025484561920166, + "learning_rate": 0.0006572268907563025, + "loss": 0.5299, + "step": 12346 + }, + { + "epoch": 6.897765363128491, + "grad_norm": 0.7981438636779785, + "learning_rate": 0.0006571988795518208, + "loss": 0.3938, + "step": 12347 + }, + { + "epoch": 6.898324022346369, + "grad_norm": 0.5628315210342407, + "learning_rate": 0.000657170868347339, + "loss": 0.4264, + "step": 12348 + }, + { + "epoch": 6.8988826815642454, + "grad_norm": 0.5373659729957581, + "learning_rate": 0.0006571428571428571, + "loss": 0.5799, + "step": 12349 + }, + { + "epoch": 6.899441340782123, + "grad_norm": 0.6609413623809814, + "learning_rate": 0.0006571148459383753, + "loss": 0.3921, + "step": 12350 + }, + { + "epoch": 6.9, + "grad_norm": 0.457053542137146, + "learning_rate": 0.0006570868347338935, + "loss": 0.3676, + "step": 12351 + }, + { + "epoch": 6.900558659217877, + "grad_norm": 0.7940835356712341, + "learning_rate": 0.0006570588235294118, + "loss": 0.4878, + "step": 12352 + }, + { + "epoch": 6.901117318435754, + "grad_norm": 0.4627826511859894, + "learning_rate": 0.00065703081232493, + "loss": 0.4674, + "step": 12353 + }, + { + "epoch": 6.901675977653631, + "grad_norm": 0.6538100242614746, + "learning_rate": 0.0006570028011204481, + "loss": 0.4804, + "step": 12354 + }, + { + "epoch": 6.902234636871508, + "grad_norm": 0.943400502204895, + "learning_rate": 0.0006569747899159664, + "loss": 0.3998, + "step": 12355 + }, + { + "epoch": 6.902793296089386, + "grad_norm": 0.6363654732704163, + "learning_rate": 0.0006569467787114846, + "loss": 0.5836, + "step": 12356 + }, + { + "epoch": 6.903351955307262, + "grad_norm": 0.6421704888343811, + "learning_rate": 0.0006569187675070029, + "loss": 0.483, + "step": 12357 + }, + { + "epoch": 6.90391061452514, + "grad_norm": 0.6273989677429199, + "learning_rate": 0.0006568907563025211, + "loss": 0.6133, + "step": 12358 + }, + { + "epoch": 6.9044692737430164, + "grad_norm": 0.4673924148082733, + "learning_rate": 0.0006568627450980392, + "loss": 0.394, + "step": 12359 + }, + { + "epoch": 6.905027932960894, + "grad_norm": 0.36412474513053894, + "learning_rate": 0.0006568347338935574, + "loss": 0.4247, + "step": 12360 + }, + { + "epoch": 6.905586592178771, + "grad_norm": 1.1147143840789795, + "learning_rate": 0.0006568067226890756, + "loss": 0.5324, + "step": 12361 + }, + { + "epoch": 6.906145251396648, + "grad_norm": 0.5823172926902771, + "learning_rate": 0.0006567787114845939, + "loss": 0.4566, + "step": 12362 + }, + { + "epoch": 6.906703910614525, + "grad_norm": 0.6668529510498047, + "learning_rate": 0.0006567507002801121, + "loss": 0.36, + "step": 12363 + }, + { + "epoch": 6.907262569832402, + "grad_norm": 0.5669211149215698, + "learning_rate": 0.0006567226890756303, + "loss": 0.3919, + "step": 12364 + }, + { + "epoch": 6.907821229050279, + "grad_norm": 0.5733174085617065, + "learning_rate": 0.0006566946778711484, + "loss": 0.5691, + "step": 12365 + }, + { + "epoch": 6.908379888268156, + "grad_norm": 0.5229942798614502, + "learning_rate": 0.0006566666666666666, + "loss": 0.416, + "step": 12366 + }, + { + "epoch": 6.908938547486033, + "grad_norm": 0.8055040240287781, + "learning_rate": 0.0006566386554621849, + "loss": 0.5443, + "step": 12367 + }, + { + "epoch": 6.909497206703911, + "grad_norm": 0.4818640649318695, + "learning_rate": 0.0006566106442577031, + "loss": 0.3809, + "step": 12368 + }, + { + "epoch": 6.910055865921787, + "grad_norm": 0.384899765253067, + "learning_rate": 0.0006565826330532213, + "loss": 0.4262, + "step": 12369 + }, + { + "epoch": 6.910614525139665, + "grad_norm": 0.4965118169784546, + "learning_rate": 0.0006565546218487394, + "loss": 0.4224, + "step": 12370 + }, + { + "epoch": 6.911173184357542, + "grad_norm": 0.4606805741786957, + "learning_rate": 0.0006565266106442576, + "loss": 0.5537, + "step": 12371 + }, + { + "epoch": 6.911731843575419, + "grad_norm": 1.11643385887146, + "learning_rate": 0.000656498599439776, + "loss": 0.4484, + "step": 12372 + }, + { + "epoch": 6.912290502793296, + "grad_norm": 0.47402969002723694, + "learning_rate": 0.0006564705882352942, + "loss": 0.4947, + "step": 12373 + }, + { + "epoch": 6.912849162011173, + "grad_norm": 0.5438334941864014, + "learning_rate": 0.0006564425770308124, + "loss": 0.5606, + "step": 12374 + }, + { + "epoch": 6.91340782122905, + "grad_norm": 0.8218211531639099, + "learning_rate": 0.0006564145658263305, + "loss": 0.4115, + "step": 12375 + }, + { + "epoch": 6.913966480446927, + "grad_norm": 0.9032309651374817, + "learning_rate": 0.0006563865546218487, + "loss": 0.5259, + "step": 12376 + }, + { + "epoch": 6.914525139664804, + "grad_norm": 0.42247480154037476, + "learning_rate": 0.000656358543417367, + "loss": 0.4629, + "step": 12377 + }, + { + "epoch": 6.915083798882682, + "grad_norm": 0.9950020909309387, + "learning_rate": 0.0006563305322128852, + "loss": 0.4232, + "step": 12378 + }, + { + "epoch": 6.915642458100558, + "grad_norm": 1.1210905313491821, + "learning_rate": 0.0006563025210084034, + "loss": 0.5409, + "step": 12379 + }, + { + "epoch": 6.916201117318436, + "grad_norm": 0.49637311697006226, + "learning_rate": 0.0006562745098039216, + "loss": 0.4644, + "step": 12380 + }, + { + "epoch": 6.9167597765363125, + "grad_norm": 0.5367304682731628, + "learning_rate": 0.0006562464985994397, + "loss": 0.4843, + "step": 12381 + }, + { + "epoch": 6.91731843575419, + "grad_norm": 0.4905094802379608, + "learning_rate": 0.000656218487394958, + "loss": 0.5144, + "step": 12382 + }, + { + "epoch": 6.917877094972067, + "grad_norm": 0.5550079941749573, + "learning_rate": 0.0006561904761904762, + "loss": 0.5285, + "step": 12383 + }, + { + "epoch": 6.918435754189944, + "grad_norm": 0.5665770173072815, + "learning_rate": 0.0006561624649859944, + "loss": 0.5761, + "step": 12384 + }, + { + "epoch": 6.918994413407821, + "grad_norm": 0.397034615278244, + "learning_rate": 0.0006561344537815126, + "loss": 0.463, + "step": 12385 + }, + { + "epoch": 6.919553072625698, + "grad_norm": 0.391889750957489, + "learning_rate": 0.0006561064425770307, + "loss": 0.3847, + "step": 12386 + }, + { + "epoch": 6.920111731843575, + "grad_norm": 0.4215538799762726, + "learning_rate": 0.000656078431372549, + "loss": 0.4007, + "step": 12387 + }, + { + "epoch": 6.920670391061453, + "grad_norm": 0.49836936593055725, + "learning_rate": 0.0006560504201680673, + "loss": 0.5631, + "step": 12388 + }, + { + "epoch": 6.921229050279329, + "grad_norm": 0.3793417811393738, + "learning_rate": 0.0006560224089635855, + "loss": 0.4234, + "step": 12389 + }, + { + "epoch": 6.921787709497207, + "grad_norm": 0.3756851851940155, + "learning_rate": 0.0006559943977591037, + "loss": 0.3831, + "step": 12390 + }, + { + "epoch": 6.9223463687150835, + "grad_norm": 0.5286829471588135, + "learning_rate": 0.0006559663865546218, + "loss": 0.4984, + "step": 12391 + }, + { + "epoch": 6.922905027932961, + "grad_norm": 0.4152444899082184, + "learning_rate": 0.0006559383753501401, + "loss": 0.4575, + "step": 12392 + }, + { + "epoch": 6.923463687150838, + "grad_norm": 2.9444100856781006, + "learning_rate": 0.0006559103641456583, + "loss": 0.5293, + "step": 12393 + }, + { + "epoch": 6.924022346368715, + "grad_norm": 0.5320962071418762, + "learning_rate": 0.0006558823529411765, + "loss": 0.4551, + "step": 12394 + }, + { + "epoch": 6.924581005586592, + "grad_norm": 3.482010841369629, + "learning_rate": 0.0006558543417366947, + "loss": 0.3821, + "step": 12395 + }, + { + "epoch": 6.925139664804469, + "grad_norm": 0.48805537819862366, + "learning_rate": 0.0006558263305322129, + "loss": 0.4393, + "step": 12396 + }, + { + "epoch": 6.925698324022346, + "grad_norm": 1.2725995779037476, + "learning_rate": 0.0006557983193277311, + "loss": 0.3222, + "step": 12397 + }, + { + "epoch": 6.926256983240224, + "grad_norm": 0.49517494440078735, + "learning_rate": 0.0006557703081232493, + "loss": 0.3314, + "step": 12398 + }, + { + "epoch": 6.9268156424581, + "grad_norm": 0.9687004685401917, + "learning_rate": 0.0006557422969187675, + "loss": 0.3499, + "step": 12399 + }, + { + "epoch": 6.927374301675978, + "grad_norm": 0.40634989738464355, + "learning_rate": 0.0006557142857142857, + "loss": 0.4285, + "step": 12400 + }, + { + "epoch": 6.9279329608938545, + "grad_norm": 0.7133250832557678, + "learning_rate": 0.0006556862745098039, + "loss": 0.5531, + "step": 12401 + }, + { + "epoch": 6.928491620111732, + "grad_norm": 0.5127828121185303, + "learning_rate": 0.0006556582633053221, + "loss": 0.3871, + "step": 12402 + }, + { + "epoch": 6.9290502793296085, + "grad_norm": 0.5962216258049011, + "learning_rate": 0.0006556302521008403, + "loss": 0.4671, + "step": 12403 + }, + { + "epoch": 6.929608938547486, + "grad_norm": 1.6099315881729126, + "learning_rate": 0.0006556022408963586, + "loss": 0.3547, + "step": 12404 + }, + { + "epoch": 6.930167597765363, + "grad_norm": 0.5895669460296631, + "learning_rate": 0.0006555742296918768, + "loss": 0.3576, + "step": 12405 + }, + { + "epoch": 6.93072625698324, + "grad_norm": 0.5484691262245178, + "learning_rate": 0.000655546218487395, + "loss": 0.488, + "step": 12406 + }, + { + "epoch": 6.931284916201117, + "grad_norm": 2.1635115146636963, + "learning_rate": 0.0006555182072829132, + "loss": 0.406, + "step": 12407 + }, + { + "epoch": 6.931843575418995, + "grad_norm": 0.40444448590278625, + "learning_rate": 0.0006554901960784314, + "loss": 0.4016, + "step": 12408 + }, + { + "epoch": 6.932402234636871, + "grad_norm": 0.5239859819412231, + "learning_rate": 0.0006554621848739496, + "loss": 0.5337, + "step": 12409 + }, + { + "epoch": 6.932960893854749, + "grad_norm": 0.9825257062911987, + "learning_rate": 0.0006554341736694678, + "loss": 0.5708, + "step": 12410 + }, + { + "epoch": 6.9335195530726255, + "grad_norm": 0.6810752153396606, + "learning_rate": 0.000655406162464986, + "loss": 0.6135, + "step": 12411 + }, + { + "epoch": 6.934078212290503, + "grad_norm": 0.42385968565940857, + "learning_rate": 0.0006553781512605043, + "loss": 0.4227, + "step": 12412 + }, + { + "epoch": 6.9346368715083795, + "grad_norm": 0.5549126267433167, + "learning_rate": 0.0006553501400560224, + "loss": 0.3593, + "step": 12413 + }, + { + "epoch": 6.935195530726257, + "grad_norm": 0.732172966003418, + "learning_rate": 0.0006553221288515406, + "loss": 0.4366, + "step": 12414 + }, + { + "epoch": 6.935754189944134, + "grad_norm": 0.5036166310310364, + "learning_rate": 0.0006552941176470588, + "loss": 0.5113, + "step": 12415 + }, + { + "epoch": 6.936312849162011, + "grad_norm": 0.47933119535446167, + "learning_rate": 0.000655266106442577, + "loss": 0.3737, + "step": 12416 + }, + { + "epoch": 6.936871508379888, + "grad_norm": 0.45102137327194214, + "learning_rate": 0.0006552380952380953, + "loss": 0.5188, + "step": 12417 + }, + { + "epoch": 6.937430167597765, + "grad_norm": 0.4979225695133209, + "learning_rate": 0.0006552100840336134, + "loss": 0.3399, + "step": 12418 + }, + { + "epoch": 6.937988826815642, + "grad_norm": 0.41004273295402527, + "learning_rate": 0.0006551820728291316, + "loss": 0.4299, + "step": 12419 + }, + { + "epoch": 6.93854748603352, + "grad_norm": 0.6230870485305786, + "learning_rate": 0.0006551540616246499, + "loss": 0.4275, + "step": 12420 + }, + { + "epoch": 6.9391061452513965, + "grad_norm": 0.5949428081512451, + "learning_rate": 0.0006551260504201681, + "loss": 0.4264, + "step": 12421 + }, + { + "epoch": 6.939664804469274, + "grad_norm": 0.500310480594635, + "learning_rate": 0.0006550980392156864, + "loss": 0.483, + "step": 12422 + }, + { + "epoch": 6.9402234636871505, + "grad_norm": 0.35211479663848877, + "learning_rate": 0.0006550700280112045, + "loss": 0.3301, + "step": 12423 + }, + { + "epoch": 6.940782122905028, + "grad_norm": 1.598307490348816, + "learning_rate": 0.0006550420168067227, + "loss": 0.4115, + "step": 12424 + }, + { + "epoch": 6.941340782122905, + "grad_norm": 0.5416072010993958, + "learning_rate": 0.0006550140056022409, + "loss": 0.445, + "step": 12425 + }, + { + "epoch": 6.941899441340782, + "grad_norm": 0.8757627606391907, + "learning_rate": 0.0006549859943977591, + "loss": 0.5127, + "step": 12426 + }, + { + "epoch": 6.942458100558659, + "grad_norm": 0.41450515389442444, + "learning_rate": 0.0006549579831932774, + "loss": 0.3773, + "step": 12427 + }, + { + "epoch": 6.943016759776536, + "grad_norm": 0.40004053711891174, + "learning_rate": 0.0006549299719887956, + "loss": 0.4692, + "step": 12428 + }, + { + "epoch": 6.943575418994413, + "grad_norm": 0.5712906122207642, + "learning_rate": 0.0006549019607843137, + "loss": 0.447, + "step": 12429 + }, + { + "epoch": 6.94413407821229, + "grad_norm": 0.6028456091880798, + "learning_rate": 0.0006548739495798319, + "loss": 0.5044, + "step": 12430 + }, + { + "epoch": 6.9446927374301675, + "grad_norm": 0.5334956049919128, + "learning_rate": 0.0006548459383753501, + "loss": 0.469, + "step": 12431 + }, + { + "epoch": 6.945251396648045, + "grad_norm": 0.49053484201431274, + "learning_rate": 0.0006548179271708684, + "loss": 0.3471, + "step": 12432 + }, + { + "epoch": 6.9458100558659215, + "grad_norm": 0.5994573831558228, + "learning_rate": 0.0006547899159663866, + "loss": 0.4562, + "step": 12433 + }, + { + "epoch": 6.946368715083799, + "grad_norm": 0.601992666721344, + "learning_rate": 0.0006547619047619047, + "loss": 0.4583, + "step": 12434 + }, + { + "epoch": 6.946927374301676, + "grad_norm": 0.48436716198921204, + "learning_rate": 0.0006547338935574229, + "loss": 0.4055, + "step": 12435 + }, + { + "epoch": 6.947486033519553, + "grad_norm": 0.40805190801620483, + "learning_rate": 0.0006547058823529411, + "loss": 0.3068, + "step": 12436 + }, + { + "epoch": 6.94804469273743, + "grad_norm": 0.4589974880218506, + "learning_rate": 0.0006546778711484595, + "loss": 0.3387, + "step": 12437 + }, + { + "epoch": 6.948603351955307, + "grad_norm": 0.5143588185310364, + "learning_rate": 0.0006546498599439777, + "loss": 0.5418, + "step": 12438 + }, + { + "epoch": 6.949162011173184, + "grad_norm": 0.4102528393268585, + "learning_rate": 0.0006546218487394958, + "loss": 0.3706, + "step": 12439 + }, + { + "epoch": 6.949720670391061, + "grad_norm": 0.6891082525253296, + "learning_rate": 0.000654593837535014, + "loss": 0.4248, + "step": 12440 + }, + { + "epoch": 6.9502793296089385, + "grad_norm": 0.4390251934528351, + "learning_rate": 0.0006545658263305322, + "loss": 0.4379, + "step": 12441 + }, + { + "epoch": 6.950837988826816, + "grad_norm": 0.844826877117157, + "learning_rate": 0.0006545378151260505, + "loss": 0.392, + "step": 12442 + }, + { + "epoch": 6.9513966480446925, + "grad_norm": 0.5709537267684937, + "learning_rate": 0.0006545098039215687, + "loss": 0.5046, + "step": 12443 + }, + { + "epoch": 6.95195530726257, + "grad_norm": 0.46316081285476685, + "learning_rate": 0.0006544817927170869, + "loss": 0.3944, + "step": 12444 + }, + { + "epoch": 6.952513966480447, + "grad_norm": 0.6412991285324097, + "learning_rate": 0.000654453781512605, + "loss": 0.3976, + "step": 12445 + }, + { + "epoch": 6.953072625698324, + "grad_norm": 1.5535904169082642, + "learning_rate": 0.0006544257703081232, + "loss": 0.4837, + "step": 12446 + }, + { + "epoch": 6.953631284916201, + "grad_norm": 0.4667535722255707, + "learning_rate": 0.0006543977591036415, + "loss": 0.3707, + "step": 12447 + }, + { + "epoch": 6.954189944134078, + "grad_norm": 0.7522304654121399, + "learning_rate": 0.0006543697478991597, + "loss": 0.4734, + "step": 12448 + }, + { + "epoch": 6.954748603351955, + "grad_norm": 0.4077203869819641, + "learning_rate": 0.0006543417366946779, + "loss": 0.4009, + "step": 12449 + }, + { + "epoch": 6.955307262569832, + "grad_norm": 0.6013436317443848, + "learning_rate": 0.000654313725490196, + "loss": 0.4216, + "step": 12450 + }, + { + "epoch": 6.9558659217877095, + "grad_norm": 0.5561041831970215, + "learning_rate": 0.0006542857142857142, + "loss": 0.5843, + "step": 12451 + }, + { + "epoch": 6.956424581005587, + "grad_norm": 1.5629853010177612, + "learning_rate": 0.0006542577030812326, + "loss": 0.5155, + "step": 12452 + }, + { + "epoch": 6.9569832402234635, + "grad_norm": 0.6675905585289001, + "learning_rate": 0.0006542296918767508, + "loss": 0.4216, + "step": 12453 + }, + { + "epoch": 6.957541899441341, + "grad_norm": 0.40994587540626526, + "learning_rate": 0.000654201680672269, + "loss": 0.4117, + "step": 12454 + }, + { + "epoch": 6.9581005586592175, + "grad_norm": 0.5865683555603027, + "learning_rate": 0.0006541736694677871, + "loss": 0.496, + "step": 12455 + }, + { + "epoch": 6.958659217877095, + "grad_norm": 0.5727628469467163, + "learning_rate": 0.0006541456582633053, + "loss": 0.4381, + "step": 12456 + }, + { + "epoch": 6.959217877094972, + "grad_norm": 0.39902451634407043, + "learning_rate": 0.0006541176470588236, + "loss": 0.4047, + "step": 12457 + }, + { + "epoch": 6.959776536312849, + "grad_norm": 7.920273303985596, + "learning_rate": 0.0006540896358543418, + "loss": 0.4464, + "step": 12458 + }, + { + "epoch": 6.960335195530726, + "grad_norm": 0.4360896348953247, + "learning_rate": 0.00065406162464986, + "loss": 0.4946, + "step": 12459 + }, + { + "epoch": 6.960893854748603, + "grad_norm": 0.4141186475753784, + "learning_rate": 0.0006540336134453782, + "loss": 0.3844, + "step": 12460 + }, + { + "epoch": 6.9614525139664805, + "grad_norm": 0.6742033362388611, + "learning_rate": 0.0006540056022408963, + "loss": 0.441, + "step": 12461 + }, + { + "epoch": 6.962011173184358, + "grad_norm": 2.065803050994873, + "learning_rate": 0.0006539775910364146, + "loss": 0.4161, + "step": 12462 + }, + { + "epoch": 6.9625698324022345, + "grad_norm": 0.5230733156204224, + "learning_rate": 0.0006539495798319328, + "loss": 0.4631, + "step": 12463 + }, + { + "epoch": 6.963128491620112, + "grad_norm": 0.4006344676017761, + "learning_rate": 0.000653921568627451, + "loss": 0.486, + "step": 12464 + }, + { + "epoch": 6.9636871508379885, + "grad_norm": 0.5748187303543091, + "learning_rate": 0.0006538935574229692, + "loss": 0.432, + "step": 12465 + }, + { + "epoch": 6.964245810055866, + "grad_norm": 0.4753177762031555, + "learning_rate": 0.0006538655462184873, + "loss": 0.4776, + "step": 12466 + }, + { + "epoch": 6.9648044692737425, + "grad_norm": 0.8131725788116455, + "learning_rate": 0.0006538375350140056, + "loss": 0.4465, + "step": 12467 + }, + { + "epoch": 6.96536312849162, + "grad_norm": 1.4685733318328857, + "learning_rate": 0.0006538095238095238, + "loss": 0.5757, + "step": 12468 + }, + { + "epoch": 6.965921787709497, + "grad_norm": 0.4905511736869812, + "learning_rate": 0.000653781512605042, + "loss": 0.5628, + "step": 12469 + }, + { + "epoch": 6.966480446927374, + "grad_norm": 0.7473582625389099, + "learning_rate": 0.0006537535014005603, + "loss": 0.4403, + "step": 12470 + }, + { + "epoch": 6.9670391061452515, + "grad_norm": 1.3870606422424316, + "learning_rate": 0.0006537254901960784, + "loss": 0.4142, + "step": 12471 + }, + { + "epoch": 6.967597765363129, + "grad_norm": 0.5380938053131104, + "learning_rate": 0.0006536974789915967, + "loss": 0.3878, + "step": 12472 + }, + { + "epoch": 6.9681564245810055, + "grad_norm": 0.34117642045021057, + "learning_rate": 0.0006536694677871149, + "loss": 0.5236, + "step": 12473 + }, + { + "epoch": 6.968715083798883, + "grad_norm": 0.42269209027290344, + "learning_rate": 0.0006536414565826331, + "loss": 0.4183, + "step": 12474 + }, + { + "epoch": 6.9692737430167595, + "grad_norm": 0.3919951319694519, + "learning_rate": 0.0006536134453781513, + "loss": 0.4123, + "step": 12475 + }, + { + "epoch": 6.969832402234637, + "grad_norm": 0.784248411655426, + "learning_rate": 0.0006535854341736695, + "loss": 0.4593, + "step": 12476 + }, + { + "epoch": 6.9703910614525135, + "grad_norm": 0.4799293577671051, + "learning_rate": 0.0006535574229691877, + "loss": 0.4164, + "step": 12477 + }, + { + "epoch": 6.970949720670391, + "grad_norm": 0.5970208644866943, + "learning_rate": 0.0006535294117647059, + "loss": 0.518, + "step": 12478 + }, + { + "epoch": 6.971508379888268, + "grad_norm": 0.3954332768917084, + "learning_rate": 0.0006535014005602241, + "loss": 0.4724, + "step": 12479 + }, + { + "epoch": 6.972067039106145, + "grad_norm": 0.4743720293045044, + "learning_rate": 0.0006534733893557423, + "loss": 0.4906, + "step": 12480 + }, + { + "epoch": 6.9726256983240225, + "grad_norm": 1.1048377752304077, + "learning_rate": 0.0006534453781512605, + "loss": 0.6001, + "step": 12481 + }, + { + "epoch": 6.9731843575419, + "grad_norm": 0.4624098837375641, + "learning_rate": 0.0006534173669467787, + "loss": 0.3546, + "step": 12482 + }, + { + "epoch": 6.9737430167597765, + "grad_norm": 0.6275320649147034, + "learning_rate": 0.0006533893557422969, + "loss": 0.5894, + "step": 12483 + }, + { + "epoch": 6.974301675977654, + "grad_norm": 0.5248343348503113, + "learning_rate": 0.0006533613445378151, + "loss": 0.3975, + "step": 12484 + }, + { + "epoch": 6.9748603351955305, + "grad_norm": 0.36769047379493713, + "learning_rate": 0.0006533333333333333, + "loss": 0.3626, + "step": 12485 + }, + { + "epoch": 6.975418994413408, + "grad_norm": 0.9091798067092896, + "learning_rate": 0.0006533053221288516, + "loss": 0.4925, + "step": 12486 + }, + { + "epoch": 6.9759776536312845, + "grad_norm": 0.5735195875167847, + "learning_rate": 0.0006532773109243699, + "loss": 0.4249, + "step": 12487 + }, + { + "epoch": 6.976536312849162, + "grad_norm": 0.7241012454032898, + "learning_rate": 0.000653249299719888, + "loss": 0.4004, + "step": 12488 + }, + { + "epoch": 6.977094972067039, + "grad_norm": 0.4537748396396637, + "learning_rate": 0.0006532212885154062, + "loss": 0.3994, + "step": 12489 + }, + { + "epoch": 6.977653631284916, + "grad_norm": 0.4960888624191284, + "learning_rate": 0.0006531932773109244, + "loss": 0.3374, + "step": 12490 + }, + { + "epoch": 6.9782122905027935, + "grad_norm": 0.4250923991203308, + "learning_rate": 0.0006531652661064426, + "loss": 0.3804, + "step": 12491 + }, + { + "epoch": 6.97877094972067, + "grad_norm": 2.0398457050323486, + "learning_rate": 0.0006531372549019609, + "loss": 0.4517, + "step": 12492 + }, + { + "epoch": 6.9793296089385475, + "grad_norm": 0.6344838738441467, + "learning_rate": 0.000653109243697479, + "loss": 0.3714, + "step": 12493 + }, + { + "epoch": 6.979888268156425, + "grad_norm": 0.44564327597618103, + "learning_rate": 0.0006530812324929972, + "loss": 0.4715, + "step": 12494 + }, + { + "epoch": 6.9804469273743015, + "grad_norm": 0.4302724599838257, + "learning_rate": 0.0006530532212885154, + "loss": 0.4688, + "step": 12495 + }, + { + "epoch": 6.981005586592179, + "grad_norm": 0.6383557319641113, + "learning_rate": 0.0006530252100840336, + "loss": 0.512, + "step": 12496 + }, + { + "epoch": 6.9815642458100555, + "grad_norm": 0.5707694888114929, + "learning_rate": 0.0006529971988795518, + "loss": 0.5227, + "step": 12497 + }, + { + "epoch": 6.982122905027933, + "grad_norm": 0.6816879510879517, + "learning_rate": 0.00065296918767507, + "loss": 0.5732, + "step": 12498 + }, + { + "epoch": 6.98268156424581, + "grad_norm": 2.52976131439209, + "learning_rate": 0.0006529411764705882, + "loss": 0.5392, + "step": 12499 + }, + { + "epoch": 6.983240223463687, + "grad_norm": 0.4378868639469147, + "learning_rate": 0.0006529131652661064, + "loss": 0.3632, + "step": 12500 + }, + { + "epoch": 6.983240223463687, + "eval_cer": 0.09071716146770974, + "eval_loss": 0.3444172739982605, + "eval_runtime": 61.6052, + "eval_samples_per_second": 73.663, + "eval_steps_per_second": 4.61, + "eval_wer": 0.3576995056004022, + "step": 12500 + }, + { + "epoch": 6.9837988826815645, + "grad_norm": 1.5691546201705933, + "learning_rate": 0.0006528851540616246, + "loss": 0.4101, + "step": 12501 + }, + { + "epoch": 6.984357541899441, + "grad_norm": 0.7826452255249023, + "learning_rate": 0.0006528571428571429, + "loss": 0.5157, + "step": 12502 + }, + { + "epoch": 6.9849162011173185, + "grad_norm": 1.035007357597351, + "learning_rate": 0.0006528291316526612, + "loss": 0.5782, + "step": 12503 + }, + { + "epoch": 6.985474860335195, + "grad_norm": 1.1592096090316772, + "learning_rate": 0.0006528011204481793, + "loss": 0.5618, + "step": 12504 + }, + { + "epoch": 6.9860335195530725, + "grad_norm": 0.7418396472930908, + "learning_rate": 0.0006527731092436975, + "loss": 0.4641, + "step": 12505 + }, + { + "epoch": 6.98659217877095, + "grad_norm": 1.3196353912353516, + "learning_rate": 0.0006527450980392157, + "loss": 0.419, + "step": 12506 + }, + { + "epoch": 6.9871508379888265, + "grad_norm": 0.6099117398262024, + "learning_rate": 0.0006527170868347339, + "loss": 0.3834, + "step": 12507 + }, + { + "epoch": 6.987709497206704, + "grad_norm": 0.7795719504356384, + "learning_rate": 0.0006526890756302522, + "loss": 0.4327, + "step": 12508 + }, + { + "epoch": 6.988268156424581, + "grad_norm": 0.45224884152412415, + "learning_rate": 0.0006526610644257703, + "loss": 0.353, + "step": 12509 + }, + { + "epoch": 6.988826815642458, + "grad_norm": 0.8954925537109375, + "learning_rate": 0.0006526330532212885, + "loss": 0.5832, + "step": 12510 + }, + { + "epoch": 6.9893854748603355, + "grad_norm": 3.428053855895996, + "learning_rate": 0.0006526050420168067, + "loss": 0.5943, + "step": 12511 + }, + { + "epoch": 6.989944134078212, + "grad_norm": 0.5529339909553528, + "learning_rate": 0.0006525770308123249, + "loss": 0.3559, + "step": 12512 + }, + { + "epoch": 6.9905027932960895, + "grad_norm": 0.8013816475868225, + "learning_rate": 0.0006525490196078432, + "loss": 0.383, + "step": 12513 + }, + { + "epoch": 6.991061452513966, + "grad_norm": 0.5604310631752014, + "learning_rate": 0.0006525210084033613, + "loss": 0.5928, + "step": 12514 + }, + { + "epoch": 6.9916201117318435, + "grad_norm": 0.779806911945343, + "learning_rate": 0.0006524929971988795, + "loss": 0.4769, + "step": 12515 + }, + { + "epoch": 6.992178770949721, + "grad_norm": 0.5772127509117126, + "learning_rate": 0.0006524649859943977, + "loss": 0.4831, + "step": 12516 + }, + { + "epoch": 6.9927374301675975, + "grad_norm": 0.47065261006355286, + "learning_rate": 0.0006524369747899159, + "loss": 0.4899, + "step": 12517 + }, + { + "epoch": 6.993296089385475, + "grad_norm": 1.526015281677246, + "learning_rate": 0.0006524089635854343, + "loss": 0.402, + "step": 12518 + }, + { + "epoch": 6.993854748603352, + "grad_norm": 0.3837558329105377, + "learning_rate": 0.0006523809523809525, + "loss": 0.4607, + "step": 12519 + }, + { + "epoch": 6.994413407821229, + "grad_norm": 0.4979607164859772, + "learning_rate": 0.0006523529411764706, + "loss": 0.4267, + "step": 12520 + }, + { + "epoch": 6.9949720670391065, + "grad_norm": 0.4629913866519928, + "learning_rate": 0.0006523249299719888, + "loss": 0.4747, + "step": 12521 + }, + { + "epoch": 6.995530726256983, + "grad_norm": 0.9315935969352722, + "learning_rate": 0.000652296918767507, + "loss": 0.4815, + "step": 12522 + }, + { + "epoch": 6.9960893854748605, + "grad_norm": 0.7310153841972351, + "learning_rate": 0.0006522689075630253, + "loss": 0.448, + "step": 12523 + }, + { + "epoch": 6.996648044692737, + "grad_norm": 0.3153943121433258, + "learning_rate": 0.0006522408963585435, + "loss": 0.3609, + "step": 12524 + }, + { + "epoch": 6.9972067039106145, + "grad_norm": 0.5415436625480652, + "learning_rate": 0.0006522128851540616, + "loss": 0.4289, + "step": 12525 + }, + { + "epoch": 6.997765363128492, + "grad_norm": 0.40753111243247986, + "learning_rate": 0.0006521848739495798, + "loss": 0.4147, + "step": 12526 + }, + { + "epoch": 6.9983240223463685, + "grad_norm": 0.5162279605865479, + "learning_rate": 0.000652156862745098, + "loss": 0.5075, + "step": 12527 + }, + { + "epoch": 6.998882681564246, + "grad_norm": 0.4145958423614502, + "learning_rate": 0.0006521288515406163, + "loss": 0.4486, + "step": 12528 + }, + { + "epoch": 6.9994413407821225, + "grad_norm": 0.6179265975952148, + "learning_rate": 0.0006521008403361345, + "loss": 0.3807, + "step": 12529 + }, + { + "epoch": 7.0, + "grad_norm": 0.4103699028491974, + "learning_rate": 0.0006520728291316526, + "loss": 0.416, + "step": 12530 + }, + { + "epoch": 7.0005586592178775, + "grad_norm": 0.659807562828064, + "learning_rate": 0.0006520448179271708, + "loss": 0.4172, + "step": 12531 + }, + { + "epoch": 7.001117318435754, + "grad_norm": 0.7853411436080933, + "learning_rate": 0.000652016806722689, + "loss": 0.5219, + "step": 12532 + }, + { + "epoch": 7.0016759776536315, + "grad_norm": 0.4956013262271881, + "learning_rate": 0.0006519887955182073, + "loss": 0.4955, + "step": 12533 + }, + { + "epoch": 7.002234636871508, + "grad_norm": 0.4066002070903778, + "learning_rate": 0.0006519607843137256, + "loss": 0.4167, + "step": 12534 + }, + { + "epoch": 7.0027932960893855, + "grad_norm": 0.5741144418716431, + "learning_rate": 0.0006519327731092438, + "loss": 0.5755, + "step": 12535 + }, + { + "epoch": 7.003351955307263, + "grad_norm": 0.6808263659477234, + "learning_rate": 0.0006519047619047619, + "loss": 0.5082, + "step": 12536 + }, + { + "epoch": 7.0039106145251395, + "grad_norm": 0.3875439465045929, + "learning_rate": 0.0006518767507002801, + "loss": 0.4286, + "step": 12537 + }, + { + "epoch": 7.004469273743017, + "grad_norm": 1.193933367729187, + "learning_rate": 0.0006518487394957984, + "loss": 0.4419, + "step": 12538 + }, + { + "epoch": 7.0050279329608935, + "grad_norm": 0.398002028465271, + "learning_rate": 0.0006518207282913166, + "loss": 0.4691, + "step": 12539 + }, + { + "epoch": 7.005586592178771, + "grad_norm": 0.839568018913269, + "learning_rate": 0.0006517927170868348, + "loss": 0.4841, + "step": 12540 + }, + { + "epoch": 7.0061452513966485, + "grad_norm": 0.5931982398033142, + "learning_rate": 0.0006517647058823529, + "loss": 0.5055, + "step": 12541 + }, + { + "epoch": 7.006703910614525, + "grad_norm": 0.4617202877998352, + "learning_rate": 0.0006517366946778711, + "loss": 0.4025, + "step": 12542 + }, + { + "epoch": 7.0072625698324025, + "grad_norm": 1.279715657234192, + "learning_rate": 0.0006517086834733894, + "loss": 0.4088, + "step": 12543 + }, + { + "epoch": 7.007821229050279, + "grad_norm": 1.403926968574524, + "learning_rate": 0.0006516806722689076, + "loss": 0.3234, + "step": 12544 + }, + { + "epoch": 7.0083798882681565, + "grad_norm": 0.48188167810440063, + "learning_rate": 0.0006516526610644258, + "loss": 0.445, + "step": 12545 + }, + { + "epoch": 7.008938547486034, + "grad_norm": 0.49491217732429504, + "learning_rate": 0.0006516246498599439, + "loss": 0.3788, + "step": 12546 + }, + { + "epoch": 7.0094972067039105, + "grad_norm": 0.6949318647384644, + "learning_rate": 0.0006515966386554621, + "loss": 0.4554, + "step": 12547 + }, + { + "epoch": 7.010055865921788, + "grad_norm": 0.4032461643218994, + "learning_rate": 0.0006515686274509804, + "loss": 0.3735, + "step": 12548 + }, + { + "epoch": 7.0106145251396645, + "grad_norm": 0.6850608587265015, + "learning_rate": 0.0006515406162464986, + "loss": 0.683, + "step": 12549 + }, + { + "epoch": 7.011173184357542, + "grad_norm": 0.4039214849472046, + "learning_rate": 0.0006515126050420168, + "loss": 0.3644, + "step": 12550 + }, + { + "epoch": 7.011731843575419, + "grad_norm": 0.4704190194606781, + "learning_rate": 0.000651484593837535, + "loss": 0.4775, + "step": 12551 + }, + { + "epoch": 7.012290502793296, + "grad_norm": 0.5351200699806213, + "learning_rate": 0.0006514565826330532, + "loss": 0.4556, + "step": 12552 + }, + { + "epoch": 7.0128491620111735, + "grad_norm": 0.7129198312759399, + "learning_rate": 0.0006514285714285715, + "loss": 0.5092, + "step": 12553 + }, + { + "epoch": 7.01340782122905, + "grad_norm": 0.8035323619842529, + "learning_rate": 0.0006514005602240897, + "loss": 0.8438, + "step": 12554 + }, + { + "epoch": 7.0139664804469275, + "grad_norm": 0.6217595338821411, + "learning_rate": 0.0006513725490196079, + "loss": 0.4633, + "step": 12555 + }, + { + "epoch": 7.014525139664804, + "grad_norm": 0.9007251262664795, + "learning_rate": 0.0006513445378151261, + "loss": 0.4926, + "step": 12556 + }, + { + "epoch": 7.0150837988826815, + "grad_norm": 0.3942495584487915, + "learning_rate": 0.0006513165266106442, + "loss": 0.3861, + "step": 12557 + }, + { + "epoch": 7.015642458100559, + "grad_norm": 0.4143909811973572, + "learning_rate": 0.0006512885154061625, + "loss": 0.342, + "step": 12558 + }, + { + "epoch": 7.0162011173184355, + "grad_norm": 0.5664569735527039, + "learning_rate": 0.0006512605042016807, + "loss": 0.5453, + "step": 12559 + }, + { + "epoch": 7.016759776536313, + "grad_norm": 0.5399006009101868, + "learning_rate": 0.0006512324929971989, + "loss": 0.4397, + "step": 12560 + }, + { + "epoch": 7.01731843575419, + "grad_norm": 0.9018974304199219, + "learning_rate": 0.0006512044817927171, + "loss": 0.4963, + "step": 12561 + }, + { + "epoch": 7.017877094972067, + "grad_norm": 10.350446701049805, + "learning_rate": 0.0006511764705882352, + "loss": 0.452, + "step": 12562 + }, + { + "epoch": 7.0184357541899445, + "grad_norm": 0.7601476907730103, + "learning_rate": 0.0006511484593837535, + "loss": 0.4971, + "step": 12563 + }, + { + "epoch": 7.018994413407821, + "grad_norm": 1.6554700136184692, + "learning_rate": 0.0006511204481792717, + "loss": 0.3939, + "step": 12564 + }, + { + "epoch": 7.0195530726256985, + "grad_norm": 0.6143400073051453, + "learning_rate": 0.0006510924369747899, + "loss": 0.6002, + "step": 12565 + }, + { + "epoch": 7.020111731843575, + "grad_norm": 0.6016078591346741, + "learning_rate": 0.0006510644257703081, + "loss": 0.4271, + "step": 12566 + }, + { + "epoch": 7.0206703910614525, + "grad_norm": 0.6250494122505188, + "learning_rate": 0.0006510364145658263, + "loss": 0.4922, + "step": 12567 + }, + { + "epoch": 7.02122905027933, + "grad_norm": 0.5893234014511108, + "learning_rate": 0.0006510084033613446, + "loss": 0.4632, + "step": 12568 + }, + { + "epoch": 7.0217877094972065, + "grad_norm": 0.4258977472782135, + "learning_rate": 0.0006509803921568628, + "loss": 0.3828, + "step": 12569 + }, + { + "epoch": 7.022346368715084, + "grad_norm": 0.43992477655410767, + "learning_rate": 0.000650952380952381, + "loss": 0.5041, + "step": 12570 + }, + { + "epoch": 7.022905027932961, + "grad_norm": 0.6892359852790833, + "learning_rate": 0.0006509243697478992, + "loss": 0.5246, + "step": 12571 + }, + { + "epoch": 7.023463687150838, + "grad_norm": 0.43133360147476196, + "learning_rate": 0.0006508963585434174, + "loss": 0.4757, + "step": 12572 + }, + { + "epoch": 7.0240223463687155, + "grad_norm": 0.7305607795715332, + "learning_rate": 0.0006508683473389356, + "loss": 0.379, + "step": 12573 + }, + { + "epoch": 7.024581005586592, + "grad_norm": 0.634161114692688, + "learning_rate": 0.0006508403361344538, + "loss": 0.4357, + "step": 12574 + }, + { + "epoch": 7.0251396648044695, + "grad_norm": 0.9562336206436157, + "learning_rate": 0.000650812324929972, + "loss": 0.45, + "step": 12575 + }, + { + "epoch": 7.025698324022346, + "grad_norm": 0.5201601386070251, + "learning_rate": 0.0006507843137254902, + "loss": 0.5613, + "step": 12576 + }, + { + "epoch": 7.0262569832402235, + "grad_norm": 1.1862400770187378, + "learning_rate": 0.0006507563025210084, + "loss": 0.426, + "step": 12577 + }, + { + "epoch": 7.026815642458101, + "grad_norm": 0.4462496340274811, + "learning_rate": 0.0006507282913165266, + "loss": 0.3742, + "step": 12578 + }, + { + "epoch": 7.0273743016759775, + "grad_norm": 0.49201565980911255, + "learning_rate": 0.0006507002801120448, + "loss": 0.5107, + "step": 12579 + }, + { + "epoch": 7.027932960893855, + "grad_norm": 0.5636613965034485, + "learning_rate": 0.000650672268907563, + "loss": 0.4795, + "step": 12580 + }, + { + "epoch": 7.028491620111732, + "grad_norm": 2.4695205688476562, + "learning_rate": 0.0006506442577030812, + "loss": 0.3979, + "step": 12581 + }, + { + "epoch": 7.029050279329609, + "grad_norm": 1.5455950498580933, + "learning_rate": 0.0006506162464985994, + "loss": 0.3458, + "step": 12582 + }, + { + "epoch": 7.0296089385474865, + "grad_norm": 0.49119916558265686, + "learning_rate": 0.0006505882352941178, + "loss": 0.4275, + "step": 12583 + }, + { + "epoch": 7.030167597765363, + "grad_norm": 0.44068941473960876, + "learning_rate": 0.0006505602240896359, + "loss": 0.3821, + "step": 12584 + }, + { + "epoch": 7.0307262569832405, + "grad_norm": 0.7384152412414551, + "learning_rate": 0.0006505322128851541, + "loss": 0.3813, + "step": 12585 + }, + { + "epoch": 7.031284916201117, + "grad_norm": 0.6572919487953186, + "learning_rate": 0.0006505042016806723, + "loss": 0.3948, + "step": 12586 + }, + { + "epoch": 7.0318435754189945, + "grad_norm": 0.6202011108398438, + "learning_rate": 0.0006504761904761905, + "loss": 0.3574, + "step": 12587 + }, + { + "epoch": 7.032402234636871, + "grad_norm": 0.9921704530715942, + "learning_rate": 0.0006504481792717088, + "loss": 0.4431, + "step": 12588 + }, + { + "epoch": 7.0329608938547485, + "grad_norm": 1.0408921241760254, + "learning_rate": 0.0006504201680672269, + "loss": 0.4842, + "step": 12589 + }, + { + "epoch": 7.033519553072626, + "grad_norm": 0.41934728622436523, + "learning_rate": 0.0006503921568627451, + "loss": 0.3641, + "step": 12590 + }, + { + "epoch": 7.034078212290503, + "grad_norm": 0.7358263731002808, + "learning_rate": 0.0006503641456582633, + "loss": 0.3463, + "step": 12591 + }, + { + "epoch": 7.03463687150838, + "grad_norm": 0.8981183767318726, + "learning_rate": 0.0006503361344537815, + "loss": 0.5734, + "step": 12592 + }, + { + "epoch": 7.035195530726257, + "grad_norm": 0.46990492939949036, + "learning_rate": 0.0006503081232492998, + "loss": 0.4154, + "step": 12593 + }, + { + "epoch": 7.035754189944134, + "grad_norm": 0.4415307939052582, + "learning_rate": 0.0006502801120448179, + "loss": 0.3851, + "step": 12594 + }, + { + "epoch": 7.0363128491620115, + "grad_norm": 3.0129590034484863, + "learning_rate": 0.0006502521008403361, + "loss": 0.3889, + "step": 12595 + }, + { + "epoch": 7.036871508379888, + "grad_norm": 0.6053531169891357, + "learning_rate": 0.0006502240896358543, + "loss": 0.777, + "step": 12596 + }, + { + "epoch": 7.0374301675977655, + "grad_norm": 2.343404769897461, + "learning_rate": 0.0006501960784313725, + "loss": 0.489, + "step": 12597 + }, + { + "epoch": 7.037988826815642, + "grad_norm": 0.7072979807853699, + "learning_rate": 0.0006501680672268908, + "loss": 0.3918, + "step": 12598 + }, + { + "epoch": 7.0385474860335195, + "grad_norm": 0.5817676186561584, + "learning_rate": 0.000650140056022409, + "loss": 0.4072, + "step": 12599 + }, + { + "epoch": 7.039106145251397, + "grad_norm": 0.546341598033905, + "learning_rate": 0.0006501120448179271, + "loss": 0.3653, + "step": 12600 + }, + { + "epoch": 7.039664804469274, + "grad_norm": 0.5390695929527283, + "learning_rate": 0.0006500840336134454, + "loss": 0.4747, + "step": 12601 + }, + { + "epoch": 7.040223463687151, + "grad_norm": 0.8624338507652283, + "learning_rate": 0.0006500560224089636, + "loss": 0.6596, + "step": 12602 + }, + { + "epoch": 7.040782122905028, + "grad_norm": 0.6916890144348145, + "learning_rate": 0.0006500280112044819, + "loss": 0.3261, + "step": 12603 + }, + { + "epoch": 7.041340782122905, + "grad_norm": 1.2541073560714722, + "learning_rate": 0.0006500000000000001, + "loss": 0.4168, + "step": 12604 + }, + { + "epoch": 7.0418994413407825, + "grad_norm": 0.47612178325653076, + "learning_rate": 0.0006499719887955182, + "loss": 0.4731, + "step": 12605 + }, + { + "epoch": 7.042458100558659, + "grad_norm": 0.5722107887268066, + "learning_rate": 0.0006499439775910364, + "loss": 0.3883, + "step": 12606 + }, + { + "epoch": 7.0430167597765365, + "grad_norm": 0.8015046119689941, + "learning_rate": 0.0006499159663865546, + "loss": 0.4488, + "step": 12607 + }, + { + "epoch": 7.043575418994413, + "grad_norm": 1.1525018215179443, + "learning_rate": 0.0006498879551820729, + "loss": 0.3645, + "step": 12608 + }, + { + "epoch": 7.0441340782122905, + "grad_norm": 1.1446833610534668, + "learning_rate": 0.0006498599439775911, + "loss": 0.4705, + "step": 12609 + }, + { + "epoch": 7.044692737430168, + "grad_norm": 0.4137093722820282, + "learning_rate": 0.0006498319327731092, + "loss": 0.3488, + "step": 12610 + }, + { + "epoch": 7.045251396648045, + "grad_norm": 0.4494577944278717, + "learning_rate": 0.0006498039215686274, + "loss": 0.4752, + "step": 12611 + }, + { + "epoch": 7.045810055865922, + "grad_norm": 0.5632845163345337, + "learning_rate": 0.0006497759103641456, + "loss": 0.469, + "step": 12612 + }, + { + "epoch": 7.046368715083799, + "grad_norm": 0.47470736503601074, + "learning_rate": 0.0006497478991596639, + "loss": 0.3659, + "step": 12613 + }, + { + "epoch": 7.046927374301676, + "grad_norm": 0.7343278527259827, + "learning_rate": 0.0006497198879551821, + "loss": 0.4551, + "step": 12614 + }, + { + "epoch": 7.0474860335195535, + "grad_norm": 0.5307226777076721, + "learning_rate": 0.0006496918767507003, + "loss": 0.3913, + "step": 12615 + }, + { + "epoch": 7.04804469273743, + "grad_norm": 0.5707471370697021, + "learning_rate": 0.0006496638655462184, + "loss": 0.489, + "step": 12616 + }, + { + "epoch": 7.0486033519553075, + "grad_norm": 0.4415913224220276, + "learning_rate": 0.0006496358543417366, + "loss": 0.4385, + "step": 12617 + }, + { + "epoch": 7.049162011173184, + "grad_norm": 0.6077116131782532, + "learning_rate": 0.000649607843137255, + "loss": 0.3921, + "step": 12618 + }, + { + "epoch": 7.0497206703910615, + "grad_norm": 1.2557839155197144, + "learning_rate": 0.0006495798319327732, + "loss": 0.6063, + "step": 12619 + }, + { + "epoch": 7.050279329608939, + "grad_norm": 0.4803507328033447, + "learning_rate": 0.0006495518207282914, + "loss": 0.368, + "step": 12620 + }, + { + "epoch": 7.050837988826816, + "grad_norm": 0.42184460163116455, + "learning_rate": 0.0006495238095238095, + "loss": 0.3273, + "step": 12621 + }, + { + "epoch": 7.051396648044693, + "grad_norm": 0.636117160320282, + "learning_rate": 0.0006494957983193277, + "loss": 0.4355, + "step": 12622 + }, + { + "epoch": 7.05195530726257, + "grad_norm": 0.5575524568557739, + "learning_rate": 0.000649467787114846, + "loss": 0.4663, + "step": 12623 + }, + { + "epoch": 7.052513966480447, + "grad_norm": 0.49263331294059753, + "learning_rate": 0.0006494397759103642, + "loss": 0.5282, + "step": 12624 + }, + { + "epoch": 7.053072625698324, + "grad_norm": 0.4541114866733551, + "learning_rate": 0.0006494117647058824, + "loss": 0.4486, + "step": 12625 + }, + { + "epoch": 7.053631284916201, + "grad_norm": 1.2242417335510254, + "learning_rate": 0.0006493837535014005, + "loss": 0.5726, + "step": 12626 + }, + { + "epoch": 7.0541899441340785, + "grad_norm": 1.0743666887283325, + "learning_rate": 0.0006493557422969187, + "loss": 0.5283, + "step": 12627 + }, + { + "epoch": 7.054748603351955, + "grad_norm": 0.979205310344696, + "learning_rate": 0.000649327731092437, + "loss": 0.3351, + "step": 12628 + }, + { + "epoch": 7.0553072625698325, + "grad_norm": 0.5161188840866089, + "learning_rate": 0.0006492997198879552, + "loss": 0.4595, + "step": 12629 + }, + { + "epoch": 7.055865921787709, + "grad_norm": 0.6742265820503235, + "learning_rate": 0.0006492717086834734, + "loss": 0.437, + "step": 12630 + }, + { + "epoch": 7.056424581005587, + "grad_norm": 0.5923387408256531, + "learning_rate": 0.0006492436974789916, + "loss": 0.3738, + "step": 12631 + }, + { + "epoch": 7.056983240223464, + "grad_norm": 0.552234411239624, + "learning_rate": 0.0006492156862745097, + "loss": 0.4338, + "step": 12632 + }, + { + "epoch": 7.057541899441341, + "grad_norm": 1.2429804801940918, + "learning_rate": 0.000649187675070028, + "loss": 0.4514, + "step": 12633 + }, + { + "epoch": 7.058100558659218, + "grad_norm": 0.45925721526145935, + "learning_rate": 0.0006491596638655463, + "loss": 0.3422, + "step": 12634 + }, + { + "epoch": 7.058659217877095, + "grad_norm": 0.5782500505447388, + "learning_rate": 0.0006491316526610645, + "loss": 0.4749, + "step": 12635 + }, + { + "epoch": 7.059217877094972, + "grad_norm": 0.42826059460639954, + "learning_rate": 0.0006491036414565827, + "loss": 0.466, + "step": 12636 + }, + { + "epoch": 7.0597765363128495, + "grad_norm": 0.5627808570861816, + "learning_rate": 0.0006490756302521008, + "loss": 0.4615, + "step": 12637 + }, + { + "epoch": 7.060335195530726, + "grad_norm": 0.45309755206108093, + "learning_rate": 0.0006490476190476191, + "loss": 0.3708, + "step": 12638 + }, + { + "epoch": 7.0608938547486035, + "grad_norm": 0.46642163395881653, + "learning_rate": 0.0006490196078431373, + "loss": 0.5278, + "step": 12639 + }, + { + "epoch": 7.06145251396648, + "grad_norm": 0.4951694905757904, + "learning_rate": 0.0006489915966386555, + "loss": 0.5147, + "step": 12640 + }, + { + "epoch": 7.062011173184358, + "grad_norm": 0.4209784269332886, + "learning_rate": 0.0006489635854341737, + "loss": 0.5168, + "step": 12641 + }, + { + "epoch": 7.062569832402235, + "grad_norm": 0.7425562143325806, + "learning_rate": 0.0006489355742296918, + "loss": 0.5149, + "step": 12642 + }, + { + "epoch": 7.063128491620112, + "grad_norm": 0.6006401777267456, + "learning_rate": 0.0006489075630252101, + "loss": 0.5103, + "step": 12643 + }, + { + "epoch": 7.063687150837989, + "grad_norm": 0.43235620856285095, + "learning_rate": 0.0006488795518207283, + "loss": 0.317, + "step": 12644 + }, + { + "epoch": 7.064245810055866, + "grad_norm": 0.597262978553772, + "learning_rate": 0.0006488515406162465, + "loss": 0.5351, + "step": 12645 + }, + { + "epoch": 7.064804469273743, + "grad_norm": 0.36786171793937683, + "learning_rate": 0.0006488235294117647, + "loss": 0.3967, + "step": 12646 + }, + { + "epoch": 7.0653631284916205, + "grad_norm": 0.47619083523750305, + "learning_rate": 0.0006487955182072829, + "loss": 0.5102, + "step": 12647 + }, + { + "epoch": 7.065921787709497, + "grad_norm": 1.6786117553710938, + "learning_rate": 0.0006487675070028011, + "loss": 0.4214, + "step": 12648 + }, + { + "epoch": 7.0664804469273745, + "grad_norm": 0.4987828731536865, + "learning_rate": 0.0006487394957983193, + "loss": 0.3541, + "step": 12649 + }, + { + "epoch": 7.067039106145251, + "grad_norm": 0.4355384409427643, + "learning_rate": 0.0006487114845938376, + "loss": 0.4752, + "step": 12650 + }, + { + "epoch": 7.067597765363129, + "grad_norm": 0.4388313889503479, + "learning_rate": 0.0006486834733893558, + "loss": 0.4073, + "step": 12651 + }, + { + "epoch": 7.068156424581006, + "grad_norm": 0.450723260641098, + "learning_rate": 0.000648655462184874, + "loss": 0.4633, + "step": 12652 + }, + { + "epoch": 7.068715083798883, + "grad_norm": 0.5171381235122681, + "learning_rate": 0.0006486274509803922, + "loss": 0.4776, + "step": 12653 + }, + { + "epoch": 7.06927374301676, + "grad_norm": 0.4311113953590393, + "learning_rate": 0.0006485994397759104, + "loss": 0.4724, + "step": 12654 + }, + { + "epoch": 7.069832402234637, + "grad_norm": 0.4372659921646118, + "learning_rate": 0.0006485714285714286, + "loss": 0.4037, + "step": 12655 + }, + { + "epoch": 7.070391061452514, + "grad_norm": 0.5358920693397522, + "learning_rate": 0.0006485434173669468, + "loss": 0.3544, + "step": 12656 + }, + { + "epoch": 7.070949720670391, + "grad_norm": 0.3797658681869507, + "learning_rate": 0.000648515406162465, + "loss": 0.3823, + "step": 12657 + }, + { + "epoch": 7.071508379888268, + "grad_norm": 0.5499960780143738, + "learning_rate": 0.0006484873949579833, + "loss": 0.4706, + "step": 12658 + }, + { + "epoch": 7.0720670391061455, + "grad_norm": 0.6061813831329346, + "learning_rate": 0.0006484593837535014, + "loss": 0.4559, + "step": 12659 + }, + { + "epoch": 7.072625698324022, + "grad_norm": 6.584712982177734, + "learning_rate": 0.0006484313725490196, + "loss": 0.5266, + "step": 12660 + }, + { + "epoch": 7.0731843575419, + "grad_norm": 0.5736716389656067, + "learning_rate": 0.0006484033613445378, + "loss": 0.5633, + "step": 12661 + }, + { + "epoch": 7.073743016759776, + "grad_norm": 0.38336530327796936, + "learning_rate": 0.000648375350140056, + "loss": 0.3396, + "step": 12662 + }, + { + "epoch": 7.074301675977654, + "grad_norm": 0.6666037440299988, + "learning_rate": 0.0006483473389355743, + "loss": 0.4675, + "step": 12663 + }, + { + "epoch": 7.074860335195531, + "grad_norm": 0.5090362429618835, + "learning_rate": 0.0006483193277310924, + "loss": 0.4982, + "step": 12664 + }, + { + "epoch": 7.075418994413408, + "grad_norm": 0.5086391568183899, + "learning_rate": 0.0006482913165266106, + "loss": 0.4555, + "step": 12665 + }, + { + "epoch": 7.075977653631285, + "grad_norm": 0.46071144938468933, + "learning_rate": 0.0006482633053221289, + "loss": 0.3868, + "step": 12666 + }, + { + "epoch": 7.076536312849162, + "grad_norm": 0.47882330417633057, + "learning_rate": 0.0006482352941176471, + "loss": 0.421, + "step": 12667 + }, + { + "epoch": 7.077094972067039, + "grad_norm": 0.6279311776161194, + "learning_rate": 0.0006482072829131654, + "loss": 0.4989, + "step": 12668 + }, + { + "epoch": 7.0776536312849165, + "grad_norm": 0.6202947497367859, + "learning_rate": 0.0006481792717086835, + "loss": 0.5398, + "step": 12669 + }, + { + "epoch": 7.078212290502793, + "grad_norm": 0.5088551044464111, + "learning_rate": 0.0006481512605042017, + "loss": 0.3737, + "step": 12670 + }, + { + "epoch": 7.078770949720671, + "grad_norm": 0.7917990684509277, + "learning_rate": 0.0006481232492997199, + "loss": 0.4523, + "step": 12671 + }, + { + "epoch": 7.079329608938547, + "grad_norm": 1.0711537599563599, + "learning_rate": 0.0006480952380952381, + "loss": 0.4611, + "step": 12672 + }, + { + "epoch": 7.079888268156425, + "grad_norm": 0.6013180017471313, + "learning_rate": 0.0006480672268907564, + "loss": 0.5376, + "step": 12673 + }, + { + "epoch": 7.080446927374302, + "grad_norm": 0.7891436219215393, + "learning_rate": 0.0006480392156862746, + "loss": 0.4557, + "step": 12674 + }, + { + "epoch": 7.081005586592179, + "grad_norm": 0.5295562148094177, + "learning_rate": 0.0006480112044817927, + "loss": 0.5685, + "step": 12675 + }, + { + "epoch": 7.081564245810056, + "grad_norm": 0.5940903425216675, + "learning_rate": 0.0006479831932773109, + "loss": 0.4283, + "step": 12676 + }, + { + "epoch": 7.082122905027933, + "grad_norm": 0.4647383689880371, + "learning_rate": 0.0006479551820728291, + "loss": 0.5954, + "step": 12677 + }, + { + "epoch": 7.08268156424581, + "grad_norm": 0.7529733777046204, + "learning_rate": 0.0006479271708683474, + "loss": 0.3366, + "step": 12678 + }, + { + "epoch": 7.0832402234636875, + "grad_norm": 0.4236205816268921, + "learning_rate": 0.0006478991596638656, + "loss": 0.4257, + "step": 12679 + }, + { + "epoch": 7.083798882681564, + "grad_norm": 0.44986915588378906, + "learning_rate": 0.0006478711484593837, + "loss": 0.4193, + "step": 12680 + }, + { + "epoch": 7.084357541899442, + "grad_norm": 0.5581731200218201, + "learning_rate": 0.0006478431372549019, + "loss": 0.4689, + "step": 12681 + }, + { + "epoch": 7.084916201117318, + "grad_norm": 0.4808078706264496, + "learning_rate": 0.0006478151260504201, + "loss": 0.4337, + "step": 12682 + }, + { + "epoch": 7.085474860335196, + "grad_norm": 0.4529827833175659, + "learning_rate": 0.0006477871148459385, + "loss": 0.3852, + "step": 12683 + }, + { + "epoch": 7.086033519553073, + "grad_norm": 0.43915534019470215, + "learning_rate": 0.0006477591036414567, + "loss": 0.4549, + "step": 12684 + }, + { + "epoch": 7.08659217877095, + "grad_norm": 0.5764197111129761, + "learning_rate": 0.0006477310924369748, + "loss": 0.4775, + "step": 12685 + }, + { + "epoch": 7.087150837988827, + "grad_norm": 0.7181698679924011, + "learning_rate": 0.000647703081232493, + "loss": 0.5242, + "step": 12686 + }, + { + "epoch": 7.087709497206704, + "grad_norm": 0.47215521335601807, + "learning_rate": 0.0006476750700280112, + "loss": 0.3651, + "step": 12687 + }, + { + "epoch": 7.088268156424581, + "grad_norm": 0.6749891638755798, + "learning_rate": 0.0006476470588235295, + "loss": 0.4695, + "step": 12688 + }, + { + "epoch": 7.0888268156424585, + "grad_norm": 0.8477067947387695, + "learning_rate": 0.0006476190476190477, + "loss": 0.3801, + "step": 12689 + }, + { + "epoch": 7.089385474860335, + "grad_norm": 0.43932828307151794, + "learning_rate": 0.0006475910364145659, + "loss": 0.4026, + "step": 12690 + }, + { + "epoch": 7.089944134078213, + "grad_norm": 0.9299423694610596, + "learning_rate": 0.000647563025210084, + "loss": 0.4854, + "step": 12691 + }, + { + "epoch": 7.090502793296089, + "grad_norm": 0.4638229012489319, + "learning_rate": 0.0006475350140056022, + "loss": 0.4092, + "step": 12692 + }, + { + "epoch": 7.091061452513967, + "grad_norm": 0.48176753520965576, + "learning_rate": 0.0006475070028011205, + "loss": 0.38, + "step": 12693 + }, + { + "epoch": 7.091620111731843, + "grad_norm": 0.5932209491729736, + "learning_rate": 0.0006474789915966387, + "loss": 0.4243, + "step": 12694 + }, + { + "epoch": 7.092178770949721, + "grad_norm": 3.0513081550598145, + "learning_rate": 0.0006474509803921569, + "loss": 0.5551, + "step": 12695 + }, + { + "epoch": 7.092737430167598, + "grad_norm": 0.3734993636608124, + "learning_rate": 0.000647422969187675, + "loss": 0.411, + "step": 12696 + }, + { + "epoch": 7.093296089385475, + "grad_norm": 0.4572320580482483, + "learning_rate": 0.0006473949579831932, + "loss": 0.612, + "step": 12697 + }, + { + "epoch": 7.093854748603352, + "grad_norm": 0.42675843834877014, + "learning_rate": 0.0006473669467787116, + "loss": 0.4259, + "step": 12698 + }, + { + "epoch": 7.094413407821229, + "grad_norm": 0.4693995714187622, + "learning_rate": 0.0006473389355742298, + "loss": 0.4438, + "step": 12699 + }, + { + "epoch": 7.094972067039106, + "grad_norm": 0.5776245594024658, + "learning_rate": 0.000647310924369748, + "loss": 0.6619, + "step": 12700 + }, + { + "epoch": 7.0955307262569836, + "grad_norm": 0.5128140449523926, + "learning_rate": 0.0006472829131652661, + "loss": 0.5135, + "step": 12701 + }, + { + "epoch": 7.09608938547486, + "grad_norm": 0.5361731052398682, + "learning_rate": 0.0006472549019607843, + "loss": 0.452, + "step": 12702 + }, + { + "epoch": 7.096648044692738, + "grad_norm": 0.5004317164421082, + "learning_rate": 0.0006472268907563026, + "loss": 0.4145, + "step": 12703 + }, + { + "epoch": 7.097206703910614, + "grad_norm": 0.4886071979999542, + "learning_rate": 0.0006471988795518208, + "loss": 0.4396, + "step": 12704 + }, + { + "epoch": 7.097765363128492, + "grad_norm": 1.0755926370620728, + "learning_rate": 0.000647170868347339, + "loss": 0.536, + "step": 12705 + }, + { + "epoch": 7.098324022346369, + "grad_norm": 0.4762767255306244, + "learning_rate": 0.0006471428571428572, + "loss": 0.4837, + "step": 12706 + }, + { + "epoch": 7.098882681564246, + "grad_norm": 0.6282446980476379, + "learning_rate": 0.0006471148459383753, + "loss": 0.5586, + "step": 12707 + }, + { + "epoch": 7.099441340782123, + "grad_norm": 0.6570466160774231, + "learning_rate": 0.0006470868347338936, + "loss": 0.3973, + "step": 12708 + }, + { + "epoch": 7.1, + "grad_norm": 0.39744311571121216, + "learning_rate": 0.0006470588235294118, + "loss": 0.4994, + "step": 12709 + }, + { + "epoch": 7.100558659217877, + "grad_norm": 0.48977991938591003, + "learning_rate": 0.00064703081232493, + "loss": 0.4068, + "step": 12710 + }, + { + "epoch": 7.1011173184357546, + "grad_norm": 0.49988874793052673, + "learning_rate": 0.0006470028011204482, + "loss": 0.3902, + "step": 12711 + }, + { + "epoch": 7.101675977653631, + "grad_norm": 0.5221667885780334, + "learning_rate": 0.0006469747899159663, + "loss": 0.5238, + "step": 12712 + }, + { + "epoch": 7.102234636871509, + "grad_norm": 0.6152661442756653, + "learning_rate": 0.0006469467787114846, + "loss": 0.4372, + "step": 12713 + }, + { + "epoch": 7.102793296089385, + "grad_norm": 0.5400908589363098, + "learning_rate": 0.0006469187675070028, + "loss": 0.4648, + "step": 12714 + }, + { + "epoch": 7.103351955307263, + "grad_norm": 0.48163044452667236, + "learning_rate": 0.000646890756302521, + "loss": 0.4323, + "step": 12715 + }, + { + "epoch": 7.10391061452514, + "grad_norm": 1.400691270828247, + "learning_rate": 0.0006468627450980393, + "loss": 0.4309, + "step": 12716 + }, + { + "epoch": 7.104469273743017, + "grad_norm": 0.7338702082633972, + "learning_rate": 0.0006468347338935574, + "loss": 0.6295, + "step": 12717 + }, + { + "epoch": 7.105027932960894, + "grad_norm": 0.6296538710594177, + "learning_rate": 0.0006468067226890756, + "loss": 0.4425, + "step": 12718 + }, + { + "epoch": 7.105586592178771, + "grad_norm": 0.5258376002311707, + "learning_rate": 0.0006467787114845939, + "loss": 0.5288, + "step": 12719 + }, + { + "epoch": 7.106145251396648, + "grad_norm": 0.6234728693962097, + "learning_rate": 0.0006467507002801121, + "loss": 0.3663, + "step": 12720 + }, + { + "epoch": 7.1067039106145256, + "grad_norm": 1.0232789516448975, + "learning_rate": 0.0006467226890756303, + "loss": 0.4714, + "step": 12721 + }, + { + "epoch": 7.107262569832402, + "grad_norm": 0.4057672917842865, + "learning_rate": 0.0006466946778711485, + "loss": 0.4586, + "step": 12722 + }, + { + "epoch": 7.10782122905028, + "grad_norm": 0.6065461039543152, + "learning_rate": 0.0006466666666666666, + "loss": 0.457, + "step": 12723 + }, + { + "epoch": 7.108379888268156, + "grad_norm": 0.6504439115524292, + "learning_rate": 0.0006466386554621849, + "loss": 0.411, + "step": 12724 + }, + { + "epoch": 7.108938547486034, + "grad_norm": 0.537788987159729, + "learning_rate": 0.0006466106442577031, + "loss": 0.3954, + "step": 12725 + }, + { + "epoch": 7.10949720670391, + "grad_norm": 1.1965739727020264, + "learning_rate": 0.0006465826330532213, + "loss": 0.4266, + "step": 12726 + }, + { + "epoch": 7.110055865921788, + "grad_norm": 0.39094939827919006, + "learning_rate": 0.0006465546218487395, + "loss": 0.3475, + "step": 12727 + }, + { + "epoch": 7.110614525139665, + "grad_norm": 0.7719897627830505, + "learning_rate": 0.0006465266106442576, + "loss": 0.4283, + "step": 12728 + }, + { + "epoch": 7.111173184357542, + "grad_norm": 5.570230484008789, + "learning_rate": 0.0006464985994397759, + "loss": 0.4914, + "step": 12729 + }, + { + "epoch": 7.111731843575419, + "grad_norm": 1.7925388813018799, + "learning_rate": 0.0006464705882352941, + "loss": 0.4616, + "step": 12730 + }, + { + "epoch": 7.112290502793296, + "grad_norm": 0.33797651529312134, + "learning_rate": 0.0006464425770308123, + "loss": 0.3696, + "step": 12731 + }, + { + "epoch": 7.112849162011173, + "grad_norm": 0.5232540369033813, + "learning_rate": 0.0006464145658263306, + "loss": 0.406, + "step": 12732 + }, + { + "epoch": 7.113407821229051, + "grad_norm": 0.4373687505722046, + "learning_rate": 0.0006463865546218487, + "loss": 0.4399, + "step": 12733 + }, + { + "epoch": 7.113966480446927, + "grad_norm": 0.37885135412216187, + "learning_rate": 0.000646358543417367, + "loss": 0.4033, + "step": 12734 + }, + { + "epoch": 7.114525139664805, + "grad_norm": 0.40603572130203247, + "learning_rate": 0.0006463305322128852, + "loss": 0.4128, + "step": 12735 + }, + { + "epoch": 7.115083798882681, + "grad_norm": 0.8822109699249268, + "learning_rate": 0.0006463025210084034, + "loss": 0.4754, + "step": 12736 + }, + { + "epoch": 7.115642458100559, + "grad_norm": 0.5899255275726318, + "learning_rate": 0.0006462745098039216, + "loss": 0.4115, + "step": 12737 + }, + { + "epoch": 7.116201117318436, + "grad_norm": 0.4771755039691925, + "learning_rate": 0.0006462464985994398, + "loss": 0.428, + "step": 12738 + }, + { + "epoch": 7.116759776536313, + "grad_norm": 0.4788636267185211, + "learning_rate": 0.000646218487394958, + "loss": 0.4184, + "step": 12739 + }, + { + "epoch": 7.11731843575419, + "grad_norm": 0.5708326101303101, + "learning_rate": 0.0006461904761904762, + "loss": 0.4483, + "step": 12740 + }, + { + "epoch": 7.117877094972067, + "grad_norm": 0.3364965319633484, + "learning_rate": 0.0006461624649859944, + "loss": 0.5065, + "step": 12741 + }, + { + "epoch": 7.118435754189944, + "grad_norm": 0.4152913987636566, + "learning_rate": 0.0006461344537815126, + "loss": 0.4076, + "step": 12742 + }, + { + "epoch": 7.118994413407822, + "grad_norm": 0.4488314986228943, + "learning_rate": 0.0006461064425770308, + "loss": 0.4691, + "step": 12743 + }, + { + "epoch": 7.119553072625698, + "grad_norm": 0.6819281578063965, + "learning_rate": 0.000646078431372549, + "loss": 0.5166, + "step": 12744 + }, + { + "epoch": 7.120111731843576, + "grad_norm": 0.5671976208686829, + "learning_rate": 0.0006460504201680672, + "loss": 0.4185, + "step": 12745 + }, + { + "epoch": 7.120670391061452, + "grad_norm": 0.44680994749069214, + "learning_rate": 0.0006460224089635854, + "loss": 0.4176, + "step": 12746 + }, + { + "epoch": 7.12122905027933, + "grad_norm": 1.061310887336731, + "learning_rate": 0.0006459943977591036, + "loss": 0.4174, + "step": 12747 + }, + { + "epoch": 7.121787709497207, + "grad_norm": 3.3708105087280273, + "learning_rate": 0.0006459663865546219, + "loss": 0.4232, + "step": 12748 + }, + { + "epoch": 7.122346368715084, + "grad_norm": 0.43583595752716064, + "learning_rate": 0.0006459383753501401, + "loss": 0.5121, + "step": 12749 + }, + { + "epoch": 7.122905027932961, + "grad_norm": 1.007943034172058, + "learning_rate": 0.0006459103641456583, + "loss": 0.5472, + "step": 12750 + }, + { + "epoch": 7.123463687150838, + "grad_norm": 0.44202613830566406, + "learning_rate": 0.0006458823529411765, + "loss": 0.5354, + "step": 12751 + }, + { + "epoch": 7.124022346368715, + "grad_norm": 0.9356262683868408, + "learning_rate": 0.0006458543417366947, + "loss": 0.5202, + "step": 12752 + }, + { + "epoch": 7.124581005586593, + "grad_norm": 0.5274031162261963, + "learning_rate": 0.0006458263305322129, + "loss": 0.4766, + "step": 12753 + }, + { + "epoch": 7.125139664804469, + "grad_norm": 0.6325783729553223, + "learning_rate": 0.0006457983193277312, + "loss": 0.4718, + "step": 12754 + }, + { + "epoch": 7.125698324022347, + "grad_norm": 0.46212828159332275, + "learning_rate": 0.0006457703081232493, + "loss": 0.4406, + "step": 12755 + }, + { + "epoch": 7.126256983240223, + "grad_norm": 0.8070359826087952, + "learning_rate": 0.0006457422969187675, + "loss": 0.3565, + "step": 12756 + }, + { + "epoch": 7.126815642458101, + "grad_norm": 0.4731125831604004, + "learning_rate": 0.0006457142857142857, + "loss": 0.4638, + "step": 12757 + }, + { + "epoch": 7.127374301675978, + "grad_norm": 0.5495564937591553, + "learning_rate": 0.0006456862745098039, + "loss": 0.4266, + "step": 12758 + }, + { + "epoch": 7.127932960893855, + "grad_norm": 1.6205756664276123, + "learning_rate": 0.0006456582633053222, + "loss": 0.5318, + "step": 12759 + }, + { + "epoch": 7.128491620111732, + "grad_norm": 0.4181329607963562, + "learning_rate": 0.0006456302521008403, + "loss": 0.3918, + "step": 12760 + }, + { + "epoch": 7.129050279329609, + "grad_norm": 0.9298834800720215, + "learning_rate": 0.0006456022408963585, + "loss": 0.7562, + "step": 12761 + }, + { + "epoch": 7.129608938547486, + "grad_norm": 0.4737931489944458, + "learning_rate": 0.0006455742296918767, + "loss": 0.3728, + "step": 12762 + }, + { + "epoch": 7.130167597765363, + "grad_norm": 0.7189401984214783, + "learning_rate": 0.0006455462184873949, + "loss": 0.4413, + "step": 12763 + }, + { + "epoch": 7.13072625698324, + "grad_norm": 0.7077210545539856, + "learning_rate": 0.0006455182072829133, + "loss": 0.3631, + "step": 12764 + }, + { + "epoch": 7.131284916201118, + "grad_norm": 0.641346275806427, + "learning_rate": 0.0006454901960784314, + "loss": 0.543, + "step": 12765 + }, + { + "epoch": 7.131843575418994, + "grad_norm": 0.4871518611907959, + "learning_rate": 0.0006454621848739496, + "loss": 0.5246, + "step": 12766 + }, + { + "epoch": 7.132402234636872, + "grad_norm": 1.3887220621109009, + "learning_rate": 0.0006454341736694678, + "loss": 0.5868, + "step": 12767 + }, + { + "epoch": 7.132960893854748, + "grad_norm": 0.6222584843635559, + "learning_rate": 0.000645406162464986, + "loss": 0.3746, + "step": 12768 + }, + { + "epoch": 7.133519553072626, + "grad_norm": 0.5651334524154663, + "learning_rate": 0.0006453781512605043, + "loss": 0.4544, + "step": 12769 + }, + { + "epoch": 7.134078212290503, + "grad_norm": 0.5305673480033875, + "learning_rate": 0.0006453501400560225, + "loss": 0.4357, + "step": 12770 + }, + { + "epoch": 7.13463687150838, + "grad_norm": 0.4170058071613312, + "learning_rate": 0.0006453221288515406, + "loss": 0.4594, + "step": 12771 + }, + { + "epoch": 7.135195530726257, + "grad_norm": 0.4718420207500458, + "learning_rate": 0.0006452941176470588, + "loss": 0.3915, + "step": 12772 + }, + { + "epoch": 7.135754189944134, + "grad_norm": 0.33161619305610657, + "learning_rate": 0.000645266106442577, + "loss": 0.3861, + "step": 12773 + }, + { + "epoch": 7.136312849162011, + "grad_norm": 0.7735040187835693, + "learning_rate": 0.0006452380952380953, + "loss": 0.5066, + "step": 12774 + }, + { + "epoch": 7.136871508379889, + "grad_norm": 0.5578786134719849, + "learning_rate": 0.0006452100840336135, + "loss": 0.5385, + "step": 12775 + }, + { + "epoch": 7.137430167597765, + "grad_norm": 0.6634144186973572, + "learning_rate": 0.0006451820728291316, + "loss": 0.6731, + "step": 12776 + }, + { + "epoch": 7.137988826815643, + "grad_norm": 1.489732265472412, + "learning_rate": 0.0006451540616246498, + "loss": 0.4059, + "step": 12777 + }, + { + "epoch": 7.138547486033519, + "grad_norm": 0.45307981967926025, + "learning_rate": 0.000645126050420168, + "loss": 0.4548, + "step": 12778 + }, + { + "epoch": 7.139106145251397, + "grad_norm": 0.5672163963317871, + "learning_rate": 0.0006450980392156863, + "loss": 0.5295, + "step": 12779 + }, + { + "epoch": 7.139664804469274, + "grad_norm": 1.4033528566360474, + "learning_rate": 0.0006450700280112046, + "loss": 0.4371, + "step": 12780 + }, + { + "epoch": 7.140223463687151, + "grad_norm": 0.6696240901947021, + "learning_rate": 0.0006450420168067226, + "loss": 0.4999, + "step": 12781 + }, + { + "epoch": 7.140782122905028, + "grad_norm": 0.46200865507125854, + "learning_rate": 0.0006450140056022409, + "loss": 0.4236, + "step": 12782 + }, + { + "epoch": 7.141340782122905, + "grad_norm": 0.4872981309890747, + "learning_rate": 0.0006449859943977591, + "loss": 0.4379, + "step": 12783 + }, + { + "epoch": 7.141899441340782, + "grad_norm": 0.39078593254089355, + "learning_rate": 0.0006449579831932774, + "loss": 0.3524, + "step": 12784 + }, + { + "epoch": 7.14245810055866, + "grad_norm": 2.3813068866729736, + "learning_rate": 0.0006449299719887956, + "loss": 0.3891, + "step": 12785 + }, + { + "epoch": 7.143016759776536, + "grad_norm": 0.5509335398674011, + "learning_rate": 0.0006449019607843138, + "loss": 0.4508, + "step": 12786 + }, + { + "epoch": 7.143575418994414, + "grad_norm": 0.764968991279602, + "learning_rate": 0.0006448739495798319, + "loss": 0.4261, + "step": 12787 + }, + { + "epoch": 7.14413407821229, + "grad_norm": 0.48758915066719055, + "learning_rate": 0.0006448459383753501, + "loss": 0.438, + "step": 12788 + }, + { + "epoch": 7.144692737430168, + "grad_norm": 0.5093138217926025, + "learning_rate": 0.0006448179271708684, + "loss": 0.3542, + "step": 12789 + }, + { + "epoch": 7.145251396648045, + "grad_norm": 6.327672004699707, + "learning_rate": 0.0006447899159663866, + "loss": 0.4273, + "step": 12790 + }, + { + "epoch": 7.145810055865922, + "grad_norm": 0.8771942853927612, + "learning_rate": 0.0006447619047619048, + "loss": 0.3891, + "step": 12791 + }, + { + "epoch": 7.146368715083799, + "grad_norm": 3.5664823055267334, + "learning_rate": 0.0006447338935574229, + "loss": 0.4918, + "step": 12792 + }, + { + "epoch": 7.146927374301676, + "grad_norm": 0.4794231355190277, + "learning_rate": 0.0006447058823529411, + "loss": 0.4275, + "step": 12793 + }, + { + "epoch": 7.147486033519553, + "grad_norm": 0.5505436062812805, + "learning_rate": 0.0006446778711484594, + "loss": 0.6091, + "step": 12794 + }, + { + "epoch": 7.148044692737431, + "grad_norm": 0.41391441226005554, + "learning_rate": 0.0006446498599439776, + "loss": 0.4286, + "step": 12795 + }, + { + "epoch": 7.148603351955307, + "grad_norm": 0.6714885234832764, + "learning_rate": 0.0006446218487394958, + "loss": 0.449, + "step": 12796 + }, + { + "epoch": 7.149162011173185, + "grad_norm": 0.37468045949935913, + "learning_rate": 0.0006445938375350139, + "loss": 0.3685, + "step": 12797 + }, + { + "epoch": 7.149720670391061, + "grad_norm": 0.5984861850738525, + "learning_rate": 0.0006445658263305321, + "loss": 0.4558, + "step": 12798 + }, + { + "epoch": 7.150279329608939, + "grad_norm": 0.5259456634521484, + "learning_rate": 0.0006445378151260505, + "loss": 0.3493, + "step": 12799 + }, + { + "epoch": 7.150837988826815, + "grad_norm": 0.5435981154441833, + "learning_rate": 0.0006445098039215687, + "loss": 0.4379, + "step": 12800 + }, + { + "epoch": 7.151396648044693, + "grad_norm": 0.43052938580513, + "learning_rate": 0.0006444817927170869, + "loss": 0.4249, + "step": 12801 + }, + { + "epoch": 7.15195530726257, + "grad_norm": 1.0683350563049316, + "learning_rate": 0.0006444537815126051, + "loss": 0.5386, + "step": 12802 + }, + { + "epoch": 7.152513966480447, + "grad_norm": 1.0287319421768188, + "learning_rate": 0.0006444257703081232, + "loss": 0.4937, + "step": 12803 + }, + { + "epoch": 7.153072625698324, + "grad_norm": 0.428891122341156, + "learning_rate": 0.0006443977591036415, + "loss": 0.4899, + "step": 12804 + }, + { + "epoch": 7.153631284916201, + "grad_norm": 0.5163244009017944, + "learning_rate": 0.0006443697478991597, + "loss": 0.445, + "step": 12805 + }, + { + "epoch": 7.154189944134078, + "grad_norm": 0.5456315875053406, + "learning_rate": 0.0006443417366946779, + "loss": 0.6807, + "step": 12806 + }, + { + "epoch": 7.154748603351956, + "grad_norm": 0.6007381677627563, + "learning_rate": 0.0006443137254901961, + "loss": 0.4452, + "step": 12807 + }, + { + "epoch": 7.155307262569832, + "grad_norm": 0.5373033881187439, + "learning_rate": 0.0006442857142857142, + "loss": 0.6039, + "step": 12808 + }, + { + "epoch": 7.15586592178771, + "grad_norm": 0.5036280155181885, + "learning_rate": 0.0006442577030812325, + "loss": 0.5336, + "step": 12809 + }, + { + "epoch": 7.156424581005586, + "grad_norm": 0.5868043303489685, + "learning_rate": 0.0006442296918767507, + "loss": 0.6274, + "step": 12810 + }, + { + "epoch": 7.156983240223464, + "grad_norm": 0.5793883204460144, + "learning_rate": 0.0006442016806722689, + "loss": 0.4187, + "step": 12811 + }, + { + "epoch": 7.157541899441341, + "grad_norm": 0.571361243724823, + "learning_rate": 0.0006441736694677871, + "loss": 0.471, + "step": 12812 + }, + { + "epoch": 7.158100558659218, + "grad_norm": 0.5304030179977417, + "learning_rate": 0.0006441456582633052, + "loss": 0.5783, + "step": 12813 + }, + { + "epoch": 7.158659217877095, + "grad_norm": 1.6864985227584839, + "learning_rate": 0.0006441176470588236, + "loss": 0.4137, + "step": 12814 + }, + { + "epoch": 7.159217877094972, + "grad_norm": 0.5903187394142151, + "learning_rate": 0.0006440896358543418, + "loss": 0.4528, + "step": 12815 + }, + { + "epoch": 7.159776536312849, + "grad_norm": 1.0610902309417725, + "learning_rate": 0.00064406162464986, + "loss": 0.4505, + "step": 12816 + }, + { + "epoch": 7.160335195530727, + "grad_norm": 0.8419244289398193, + "learning_rate": 0.0006440336134453782, + "loss": 0.4472, + "step": 12817 + }, + { + "epoch": 7.160893854748603, + "grad_norm": 0.44933366775512695, + "learning_rate": 0.0006440056022408964, + "loss": 0.4482, + "step": 12818 + }, + { + "epoch": 7.161452513966481, + "grad_norm": 0.6411414742469788, + "learning_rate": 0.0006439775910364146, + "loss": 0.3909, + "step": 12819 + }, + { + "epoch": 7.162011173184357, + "grad_norm": 0.7263995409011841, + "learning_rate": 0.0006439495798319328, + "loss": 0.5136, + "step": 12820 + }, + { + "epoch": 7.162569832402235, + "grad_norm": 0.5708878636360168, + "learning_rate": 0.000643921568627451, + "loss": 0.3808, + "step": 12821 + }, + { + "epoch": 7.163128491620112, + "grad_norm": 0.3861788213253021, + "learning_rate": 0.0006438935574229692, + "loss": 0.4345, + "step": 12822 + }, + { + "epoch": 7.163687150837989, + "grad_norm": 0.45457369089126587, + "learning_rate": 0.0006438655462184874, + "loss": 0.2889, + "step": 12823 + }, + { + "epoch": 7.164245810055866, + "grad_norm": 0.5887379050254822, + "learning_rate": 0.0006438375350140056, + "loss": 0.7333, + "step": 12824 + }, + { + "epoch": 7.164804469273743, + "grad_norm": 1.1495230197906494, + "learning_rate": 0.0006438095238095238, + "loss": 0.4128, + "step": 12825 + }, + { + "epoch": 7.16536312849162, + "grad_norm": 0.3804246187210083, + "learning_rate": 0.000643781512605042, + "loss": 0.4147, + "step": 12826 + }, + { + "epoch": 7.165921787709498, + "grad_norm": 0.4984986186027527, + "learning_rate": 0.0006437535014005602, + "loss": 0.4712, + "step": 12827 + }, + { + "epoch": 7.166480446927374, + "grad_norm": 0.44855397939682007, + "learning_rate": 0.0006437254901960784, + "loss": 0.3908, + "step": 12828 + }, + { + "epoch": 7.167039106145252, + "grad_norm": 0.6057441234588623, + "learning_rate": 0.0006436974789915966, + "loss": 0.3828, + "step": 12829 + }, + { + "epoch": 7.167597765363128, + "grad_norm": 4.7428107261657715, + "learning_rate": 0.0006436694677871149, + "loss": 0.3937, + "step": 12830 + }, + { + "epoch": 7.168156424581006, + "grad_norm": 1.20963454246521, + "learning_rate": 0.0006436414565826331, + "loss": 0.5028, + "step": 12831 + }, + { + "epoch": 7.168715083798883, + "grad_norm": 0.7283357381820679, + "learning_rate": 0.0006436134453781513, + "loss": 0.5822, + "step": 12832 + }, + { + "epoch": 7.16927374301676, + "grad_norm": 0.6972780227661133, + "learning_rate": 0.0006435854341736695, + "loss": 0.4718, + "step": 12833 + }, + { + "epoch": 7.169832402234637, + "grad_norm": 0.5511775016784668, + "learning_rate": 0.0006435574229691878, + "loss": 0.3353, + "step": 12834 + }, + { + "epoch": 7.170391061452514, + "grad_norm": 0.4870832860469818, + "learning_rate": 0.0006435294117647059, + "loss": 0.5524, + "step": 12835 + }, + { + "epoch": 7.170949720670391, + "grad_norm": 0.4437590539455414, + "learning_rate": 0.0006435014005602241, + "loss": 0.3883, + "step": 12836 + }, + { + "epoch": 7.171508379888268, + "grad_norm": 0.5412284135818481, + "learning_rate": 0.0006434733893557423, + "loss": 0.4194, + "step": 12837 + }, + { + "epoch": 7.172067039106145, + "grad_norm": 1.660475492477417, + "learning_rate": 0.0006434453781512605, + "loss": 0.4, + "step": 12838 + }, + { + "epoch": 7.172625698324023, + "grad_norm": 0.43075165152549744, + "learning_rate": 0.0006434173669467788, + "loss": 0.3833, + "step": 12839 + }, + { + "epoch": 7.173184357541899, + "grad_norm": 0.39243221282958984, + "learning_rate": 0.0006433893557422969, + "loss": 0.4517, + "step": 12840 + }, + { + "epoch": 7.173743016759777, + "grad_norm": 0.5009211897850037, + "learning_rate": 0.0006433613445378151, + "loss": 0.4029, + "step": 12841 + }, + { + "epoch": 7.174301675977653, + "grad_norm": 0.5243275761604309, + "learning_rate": 0.0006433333333333333, + "loss": 0.5026, + "step": 12842 + }, + { + "epoch": 7.174860335195531, + "grad_norm": 0.4553791880607605, + "learning_rate": 0.0006433053221288515, + "loss": 0.4587, + "step": 12843 + }, + { + "epoch": 7.175418994413408, + "grad_norm": 0.464993417263031, + "learning_rate": 0.0006432773109243698, + "loss": 0.383, + "step": 12844 + }, + { + "epoch": 7.175977653631285, + "grad_norm": 0.6442744135856628, + "learning_rate": 0.0006432492997198879, + "loss": 0.4961, + "step": 12845 + }, + { + "epoch": 7.176536312849162, + "grad_norm": 0.45668184757232666, + "learning_rate": 0.0006432212885154061, + "loss": 0.3952, + "step": 12846 + }, + { + "epoch": 7.177094972067039, + "grad_norm": 0.6154555082321167, + "learning_rate": 0.0006431932773109244, + "loss": 0.3509, + "step": 12847 + }, + { + "epoch": 7.177653631284916, + "grad_norm": 0.5299314856529236, + "learning_rate": 0.0006431652661064426, + "loss": 0.4692, + "step": 12848 + }, + { + "epoch": 7.178212290502794, + "grad_norm": 0.41880351305007935, + "learning_rate": 0.0006431372549019609, + "loss": 0.4731, + "step": 12849 + }, + { + "epoch": 7.17877094972067, + "grad_norm": 0.7551229000091553, + "learning_rate": 0.0006431092436974791, + "loss": 0.6476, + "step": 12850 + }, + { + "epoch": 7.179329608938548, + "grad_norm": 0.5173777341842651, + "learning_rate": 0.0006430812324929972, + "loss": 0.4406, + "step": 12851 + }, + { + "epoch": 7.179888268156424, + "grad_norm": 0.45377928018569946, + "learning_rate": 0.0006430532212885154, + "loss": 0.5003, + "step": 12852 + }, + { + "epoch": 7.180446927374302, + "grad_norm": 1.4140952825546265, + "learning_rate": 0.0006430252100840336, + "loss": 0.4726, + "step": 12853 + }, + { + "epoch": 7.181005586592179, + "grad_norm": 0.6744934320449829, + "learning_rate": 0.0006429971988795519, + "loss": 0.611, + "step": 12854 + }, + { + "epoch": 7.181564245810056, + "grad_norm": 0.7543616890907288, + "learning_rate": 0.0006429691876750701, + "loss": 0.4849, + "step": 12855 + }, + { + "epoch": 7.182122905027933, + "grad_norm": 0.382301390171051, + "learning_rate": 0.0006429411764705882, + "loss": 0.3688, + "step": 12856 + }, + { + "epoch": 7.18268156424581, + "grad_norm": 0.652662456035614, + "learning_rate": 0.0006429131652661064, + "loss": 0.4044, + "step": 12857 + }, + { + "epoch": 7.183240223463687, + "grad_norm": 0.4762151539325714, + "learning_rate": 0.0006428851540616246, + "loss": 0.4239, + "step": 12858 + }, + { + "epoch": 7.183798882681565, + "grad_norm": 0.6116922497749329, + "learning_rate": 0.0006428571428571429, + "loss": 0.3874, + "step": 12859 + }, + { + "epoch": 7.184357541899441, + "grad_norm": 0.5512605309486389, + "learning_rate": 0.0006428291316526611, + "loss": 0.4232, + "step": 12860 + }, + { + "epoch": 7.184916201117319, + "grad_norm": 1.3778985738754272, + "learning_rate": 0.0006428011204481792, + "loss": 0.4114, + "step": 12861 + }, + { + "epoch": 7.185474860335195, + "grad_norm": 0.4655029773712158, + "learning_rate": 0.0006427731092436974, + "loss": 0.3501, + "step": 12862 + }, + { + "epoch": 7.186033519553073, + "grad_norm": 0.43497544527053833, + "learning_rate": 0.0006427450980392156, + "loss": 0.4285, + "step": 12863 + }, + { + "epoch": 7.18659217877095, + "grad_norm": 0.6692568063735962, + "learning_rate": 0.000642717086834734, + "loss": 0.4947, + "step": 12864 + }, + { + "epoch": 7.187150837988827, + "grad_norm": 0.42030754685401917, + "learning_rate": 0.0006426890756302522, + "loss": 0.3645, + "step": 12865 + }, + { + "epoch": 7.187709497206704, + "grad_norm": 0.5105885863304138, + "learning_rate": 0.0006426610644257704, + "loss": 0.4794, + "step": 12866 + }, + { + "epoch": 7.188268156424581, + "grad_norm": 0.43217065930366516, + "learning_rate": 0.0006426330532212885, + "loss": 0.3949, + "step": 12867 + }, + { + "epoch": 7.188826815642458, + "grad_norm": 0.5292461514472961, + "learning_rate": 0.0006426050420168067, + "loss": 0.4397, + "step": 12868 + }, + { + "epoch": 7.189385474860336, + "grad_norm": 0.635705828666687, + "learning_rate": 0.000642577030812325, + "loss": 0.4063, + "step": 12869 + }, + { + "epoch": 7.189944134078212, + "grad_norm": 0.49821507930755615, + "learning_rate": 0.0006425490196078432, + "loss": 0.4543, + "step": 12870 + }, + { + "epoch": 7.19050279329609, + "grad_norm": 1.5830234289169312, + "learning_rate": 0.0006425210084033614, + "loss": 0.5662, + "step": 12871 + }, + { + "epoch": 7.191061452513966, + "grad_norm": 0.7206829190254211, + "learning_rate": 0.0006424929971988795, + "loss": 0.3783, + "step": 12872 + }, + { + "epoch": 7.191620111731844, + "grad_norm": 0.4381575584411621, + "learning_rate": 0.0006424649859943977, + "loss": 0.5115, + "step": 12873 + }, + { + "epoch": 7.19217877094972, + "grad_norm": 0.5439664125442505, + "learning_rate": 0.000642436974789916, + "loss": 0.4643, + "step": 12874 + }, + { + "epoch": 7.192737430167598, + "grad_norm": 0.6580325961112976, + "learning_rate": 0.0006424089635854342, + "loss": 0.5256, + "step": 12875 + }, + { + "epoch": 7.193296089385475, + "grad_norm": 0.7051538825035095, + "learning_rate": 0.0006423809523809524, + "loss": 0.4853, + "step": 12876 + }, + { + "epoch": 7.193854748603352, + "grad_norm": 0.4369136393070221, + "learning_rate": 0.0006423529411764705, + "loss": 0.5359, + "step": 12877 + }, + { + "epoch": 7.194413407821229, + "grad_norm": 0.7481156587600708, + "learning_rate": 0.0006423249299719887, + "loss": 0.4422, + "step": 12878 + }, + { + "epoch": 7.194972067039106, + "grad_norm": 0.41737329959869385, + "learning_rate": 0.000642296918767507, + "loss": 0.503, + "step": 12879 + }, + { + "epoch": 7.195530726256983, + "grad_norm": 0.5005262494087219, + "learning_rate": 0.0006422689075630253, + "loss": 0.4643, + "step": 12880 + }, + { + "epoch": 7.196089385474861, + "grad_norm": 0.4186493456363678, + "learning_rate": 0.0006422408963585435, + "loss": 0.3752, + "step": 12881 + }, + { + "epoch": 7.196648044692737, + "grad_norm": 0.7866208553314209, + "learning_rate": 0.0006422128851540617, + "loss": 0.3106, + "step": 12882 + }, + { + "epoch": 7.197206703910615, + "grad_norm": 4.869137287139893, + "learning_rate": 0.0006421848739495798, + "loss": 0.4292, + "step": 12883 + }, + { + "epoch": 7.197765363128491, + "grad_norm": 0.4108200967311859, + "learning_rate": 0.0006421568627450981, + "loss": 0.4045, + "step": 12884 + }, + { + "epoch": 7.198324022346369, + "grad_norm": 0.6972508430480957, + "learning_rate": 0.0006421288515406163, + "loss": 0.3792, + "step": 12885 + }, + { + "epoch": 7.198882681564246, + "grad_norm": 0.4358613193035126, + "learning_rate": 0.0006421008403361345, + "loss": 0.4711, + "step": 12886 + }, + { + "epoch": 7.199441340782123, + "grad_norm": 0.5593037009239197, + "learning_rate": 0.0006420728291316527, + "loss": 0.4848, + "step": 12887 + }, + { + "epoch": 7.2, + "grad_norm": 0.5414353013038635, + "learning_rate": 0.0006420448179271708, + "loss": 0.3897, + "step": 12888 + }, + { + "epoch": 7.200558659217877, + "grad_norm": 0.6617635488510132, + "learning_rate": 0.0006420168067226891, + "loss": 0.5381, + "step": 12889 + }, + { + "epoch": 7.201117318435754, + "grad_norm": 0.6181525588035583, + "learning_rate": 0.0006419887955182073, + "loss": 0.4574, + "step": 12890 + }, + { + "epoch": 7.201675977653632, + "grad_norm": 0.44847801327705383, + "learning_rate": 0.0006419607843137255, + "loss": 0.3639, + "step": 12891 + }, + { + "epoch": 7.202234636871508, + "grad_norm": 0.5697240829467773, + "learning_rate": 0.0006419327731092437, + "loss": 0.3302, + "step": 12892 + }, + { + "epoch": 7.202793296089386, + "grad_norm": 0.45044589042663574, + "learning_rate": 0.0006419047619047618, + "loss": 0.3428, + "step": 12893 + }, + { + "epoch": 7.203351955307262, + "grad_norm": 0.4011487662792206, + "learning_rate": 0.0006418767507002801, + "loss": 0.4987, + "step": 12894 + }, + { + "epoch": 7.20391061452514, + "grad_norm": 4.3708086013793945, + "learning_rate": 0.0006418487394957983, + "loss": 0.5639, + "step": 12895 + }, + { + "epoch": 7.204469273743017, + "grad_norm": 0.4025447964668274, + "learning_rate": 0.0006418207282913166, + "loss": 0.358, + "step": 12896 + }, + { + "epoch": 7.205027932960894, + "grad_norm": 1.2090263366699219, + "learning_rate": 0.0006417927170868348, + "loss": 0.3927, + "step": 12897 + }, + { + "epoch": 7.205586592178771, + "grad_norm": 0.47650420665740967, + "learning_rate": 0.000641764705882353, + "loss": 0.4537, + "step": 12898 + }, + { + "epoch": 7.206145251396648, + "grad_norm": 0.48592033982276917, + "learning_rate": 0.0006417366946778712, + "loss": 0.4952, + "step": 12899 + }, + { + "epoch": 7.206703910614525, + "grad_norm": 0.41837751865386963, + "learning_rate": 0.0006417086834733894, + "loss": 0.361, + "step": 12900 + }, + { + "epoch": 7.207262569832403, + "grad_norm": 0.5799628496170044, + "learning_rate": 0.0006416806722689076, + "loss": 0.4705, + "step": 12901 + }, + { + "epoch": 7.207821229050279, + "grad_norm": 0.5551765561103821, + "learning_rate": 0.0006416526610644258, + "loss": 0.3652, + "step": 12902 + }, + { + "epoch": 7.208379888268157, + "grad_norm": 0.6752025485038757, + "learning_rate": 0.000641624649859944, + "loss": 0.5995, + "step": 12903 + }, + { + "epoch": 7.208938547486033, + "grad_norm": 0.70978182554245, + "learning_rate": 0.0006415966386554622, + "loss": 0.4445, + "step": 12904 + }, + { + "epoch": 7.209497206703911, + "grad_norm": 0.6277596950531006, + "learning_rate": 0.0006415686274509804, + "loss": 0.4324, + "step": 12905 + }, + { + "epoch": 7.210055865921788, + "grad_norm": 0.3992590308189392, + "learning_rate": 0.0006415406162464986, + "loss": 0.395, + "step": 12906 + }, + { + "epoch": 7.210614525139665, + "grad_norm": 0.6173178553581238, + "learning_rate": 0.0006415126050420168, + "loss": 0.4062, + "step": 12907 + }, + { + "epoch": 7.211173184357542, + "grad_norm": 0.3988576829433441, + "learning_rate": 0.000641484593837535, + "loss": 0.4263, + "step": 12908 + }, + { + "epoch": 7.211731843575419, + "grad_norm": 0.5829341411590576, + "learning_rate": 0.0006414565826330533, + "loss": 0.4291, + "step": 12909 + }, + { + "epoch": 7.212290502793296, + "grad_norm": 0.5598450899124146, + "learning_rate": 0.0006414285714285714, + "loss": 0.394, + "step": 12910 + }, + { + "epoch": 7.212849162011173, + "grad_norm": 0.5120077133178711, + "learning_rate": 0.0006414005602240896, + "loss": 0.3612, + "step": 12911 + }, + { + "epoch": 7.21340782122905, + "grad_norm": 0.9902352690696716, + "learning_rate": 0.0006413725490196079, + "loss": 0.4317, + "step": 12912 + }, + { + "epoch": 7.213966480446928, + "grad_norm": 0.5374852418899536, + "learning_rate": 0.0006413445378151261, + "loss": 0.4455, + "step": 12913 + }, + { + "epoch": 7.214525139664804, + "grad_norm": 0.5447959899902344, + "learning_rate": 0.0006413165266106444, + "loss": 0.3945, + "step": 12914 + }, + { + "epoch": 7.215083798882682, + "grad_norm": 0.6012040376663208, + "learning_rate": 0.0006412885154061625, + "loss": 0.3984, + "step": 12915 + }, + { + "epoch": 7.215642458100558, + "grad_norm": 0.4044865369796753, + "learning_rate": 0.0006412605042016807, + "loss": 0.3964, + "step": 12916 + }, + { + "epoch": 7.216201117318436, + "grad_norm": 0.9325155019760132, + "learning_rate": 0.0006412324929971989, + "loss": 0.3636, + "step": 12917 + }, + { + "epoch": 7.216759776536313, + "grad_norm": 0.5127488970756531, + "learning_rate": 0.0006412044817927171, + "loss": 0.4919, + "step": 12918 + }, + { + "epoch": 7.21731843575419, + "grad_norm": 1.1355258226394653, + "learning_rate": 0.0006411764705882354, + "loss": 0.3927, + "step": 12919 + }, + { + "epoch": 7.217877094972067, + "grad_norm": 0.4683167636394501, + "learning_rate": 0.0006411484593837535, + "loss": 0.4414, + "step": 12920 + }, + { + "epoch": 7.218435754189944, + "grad_norm": 1.3356767892837524, + "learning_rate": 0.0006411204481792717, + "loss": 0.5558, + "step": 12921 + }, + { + "epoch": 7.218994413407821, + "grad_norm": 0.392017126083374, + "learning_rate": 0.0006410924369747899, + "loss": 0.3589, + "step": 12922 + }, + { + "epoch": 7.219553072625699, + "grad_norm": 0.49749869108200073, + "learning_rate": 0.0006410644257703081, + "loss": 0.3816, + "step": 12923 + }, + { + "epoch": 7.220111731843575, + "grad_norm": 0.7638598680496216, + "learning_rate": 0.0006410364145658264, + "loss": 0.447, + "step": 12924 + }, + { + "epoch": 7.220670391061453, + "grad_norm": 0.4961168169975281, + "learning_rate": 0.0006410084033613446, + "loss": 0.3948, + "step": 12925 + }, + { + "epoch": 7.221229050279329, + "grad_norm": 0.6985642910003662, + "learning_rate": 0.0006409803921568627, + "loss": 0.3327, + "step": 12926 + }, + { + "epoch": 7.221787709497207, + "grad_norm": 0.8201794028282166, + "learning_rate": 0.0006409523809523809, + "loss": 0.5289, + "step": 12927 + }, + { + "epoch": 7.222346368715084, + "grad_norm": 0.45156192779541016, + "learning_rate": 0.0006409243697478991, + "loss": 0.4555, + "step": 12928 + }, + { + "epoch": 7.222905027932961, + "grad_norm": 0.5104547739028931, + "learning_rate": 0.0006408963585434175, + "loss": 0.5608, + "step": 12929 + }, + { + "epoch": 7.223463687150838, + "grad_norm": 0.9331592321395874, + "learning_rate": 0.0006408683473389357, + "loss": 0.4529, + "step": 12930 + }, + { + "epoch": 7.224022346368715, + "grad_norm": 0.44113239645957947, + "learning_rate": 0.0006408403361344538, + "loss": 0.4393, + "step": 12931 + }, + { + "epoch": 7.224581005586592, + "grad_norm": 1.6009677648544312, + "learning_rate": 0.000640812324929972, + "loss": 0.3751, + "step": 12932 + }, + { + "epoch": 7.22513966480447, + "grad_norm": 0.6767282485961914, + "learning_rate": 0.0006407843137254902, + "loss": 0.4119, + "step": 12933 + }, + { + "epoch": 7.225698324022346, + "grad_norm": 0.517426073551178, + "learning_rate": 0.0006407563025210085, + "loss": 0.4043, + "step": 12934 + }, + { + "epoch": 7.226256983240224, + "grad_norm": 0.4232938587665558, + "learning_rate": 0.0006407282913165267, + "loss": 0.4271, + "step": 12935 + }, + { + "epoch": 7.2268156424581, + "grad_norm": 0.48331397771835327, + "learning_rate": 0.0006407002801120448, + "loss": 0.4649, + "step": 12936 + }, + { + "epoch": 7.227374301675978, + "grad_norm": 0.41168758273124695, + "learning_rate": 0.000640672268907563, + "loss": 0.4445, + "step": 12937 + }, + { + "epoch": 7.227932960893855, + "grad_norm": 3.627685785293579, + "learning_rate": 0.0006406442577030812, + "loss": 0.4475, + "step": 12938 + }, + { + "epoch": 7.228491620111732, + "grad_norm": 0.4270423650741577, + "learning_rate": 0.0006406162464985994, + "loss": 0.4504, + "step": 12939 + }, + { + "epoch": 7.229050279329609, + "grad_norm": 0.416037917137146, + "learning_rate": 0.0006405882352941177, + "loss": 0.5165, + "step": 12940 + }, + { + "epoch": 7.229608938547486, + "grad_norm": 0.5177565217018127, + "learning_rate": 0.0006405602240896359, + "loss": 0.4384, + "step": 12941 + }, + { + "epoch": 7.230167597765363, + "grad_norm": 0.6901390552520752, + "learning_rate": 0.000640532212885154, + "loss": 0.4848, + "step": 12942 + }, + { + "epoch": 7.230726256983241, + "grad_norm": 0.45817461609840393, + "learning_rate": 0.0006405042016806722, + "loss": 0.3719, + "step": 12943 + }, + { + "epoch": 7.231284916201117, + "grad_norm": 0.4559125602245331, + "learning_rate": 0.0006404761904761904, + "loss": 0.3874, + "step": 12944 + }, + { + "epoch": 7.231843575418995, + "grad_norm": 0.7202390432357788, + "learning_rate": 0.0006404481792717088, + "loss": 0.4825, + "step": 12945 + }, + { + "epoch": 7.232402234636871, + "grad_norm": 0.42777398228645325, + "learning_rate": 0.000640420168067227, + "loss": 0.3962, + "step": 12946 + }, + { + "epoch": 7.232960893854749, + "grad_norm": 0.43763643503189087, + "learning_rate": 0.0006403921568627451, + "loss": 0.3618, + "step": 12947 + }, + { + "epoch": 7.233519553072625, + "grad_norm": 0.5025946497917175, + "learning_rate": 0.0006403641456582633, + "loss": 0.5123, + "step": 12948 + }, + { + "epoch": 7.234078212290503, + "grad_norm": 0.6518982648849487, + "learning_rate": 0.0006403361344537815, + "loss": 0.5455, + "step": 12949 + }, + { + "epoch": 7.23463687150838, + "grad_norm": 0.6964249610900879, + "learning_rate": 0.0006403081232492998, + "loss": 0.4437, + "step": 12950 + }, + { + "epoch": 7.235195530726257, + "grad_norm": 0.5587266087532043, + "learning_rate": 0.000640280112044818, + "loss": 0.4159, + "step": 12951 + }, + { + "epoch": 7.235754189944134, + "grad_norm": 0.6424814462661743, + "learning_rate": 0.0006402521008403361, + "loss": 0.4672, + "step": 12952 + }, + { + "epoch": 7.236312849162011, + "grad_norm": 0.5570558905601501, + "learning_rate": 0.0006402240896358543, + "loss": 0.4099, + "step": 12953 + }, + { + "epoch": 7.236871508379888, + "grad_norm": 0.49858906865119934, + "learning_rate": 0.0006401960784313725, + "loss": 0.4288, + "step": 12954 + }, + { + "epoch": 7.237430167597766, + "grad_norm": 0.7407189607620239, + "learning_rate": 0.0006401680672268908, + "loss": 0.3844, + "step": 12955 + }, + { + "epoch": 7.237988826815642, + "grad_norm": 0.4942444860935211, + "learning_rate": 0.000640140056022409, + "loss": 0.414, + "step": 12956 + }, + { + "epoch": 7.23854748603352, + "grad_norm": 0.5886725187301636, + "learning_rate": 0.0006401120448179272, + "loss": 0.396, + "step": 12957 + }, + { + "epoch": 7.239106145251396, + "grad_norm": 0.6650775671005249, + "learning_rate": 0.0006400840336134453, + "loss": 0.3157, + "step": 12958 + }, + { + "epoch": 7.239664804469274, + "grad_norm": 0.7435926795005798, + "learning_rate": 0.0006400560224089635, + "loss": 0.5349, + "step": 12959 + }, + { + "epoch": 7.240223463687151, + "grad_norm": 0.4443039000034332, + "learning_rate": 0.0006400280112044818, + "loss": 0.4573, + "step": 12960 + }, + { + "epoch": 7.240782122905028, + "grad_norm": 0.5859401822090149, + "learning_rate": 0.00064, + "loss": 0.4784, + "step": 12961 + }, + { + "epoch": 7.241340782122905, + "grad_norm": 1.9100741147994995, + "learning_rate": 0.0006399719887955183, + "loss": 0.6601, + "step": 12962 + }, + { + "epoch": 7.241899441340782, + "grad_norm": 0.8647416830062866, + "learning_rate": 0.0006399439775910364, + "loss": 0.4608, + "step": 12963 + }, + { + "epoch": 7.242458100558659, + "grad_norm": 0.4086284637451172, + "learning_rate": 0.0006399159663865546, + "loss": 0.4334, + "step": 12964 + }, + { + "epoch": 7.243016759776537, + "grad_norm": 0.4809357225894928, + "learning_rate": 0.0006398879551820729, + "loss": 0.3844, + "step": 12965 + }, + { + "epoch": 7.243575418994413, + "grad_norm": 0.4752311110496521, + "learning_rate": 0.0006398599439775911, + "loss": 0.3657, + "step": 12966 + }, + { + "epoch": 7.244134078212291, + "grad_norm": 0.6342294812202454, + "learning_rate": 0.0006398319327731093, + "loss": 0.4687, + "step": 12967 + }, + { + "epoch": 7.244692737430167, + "grad_norm": 0.5234696865081787, + "learning_rate": 0.0006398039215686274, + "loss": 0.3994, + "step": 12968 + }, + { + "epoch": 7.245251396648045, + "grad_norm": 0.7517098784446716, + "learning_rate": 0.0006397759103641456, + "loss": 0.5198, + "step": 12969 + }, + { + "epoch": 7.245810055865922, + "grad_norm": 0.44918322563171387, + "learning_rate": 0.0006397478991596639, + "loss": 0.4208, + "step": 12970 + }, + { + "epoch": 7.246368715083799, + "grad_norm": 0.46741700172424316, + "learning_rate": 0.0006397198879551821, + "loss": 0.4292, + "step": 12971 + }, + { + "epoch": 7.246927374301676, + "grad_norm": 0.48479223251342773, + "learning_rate": 0.0006396918767507003, + "loss": 0.3691, + "step": 12972 + }, + { + "epoch": 7.247486033519553, + "grad_norm": 0.8295164704322815, + "learning_rate": 0.0006396638655462185, + "loss": 0.5168, + "step": 12973 + }, + { + "epoch": 7.24804469273743, + "grad_norm": 0.5968831181526184, + "learning_rate": 0.0006396358543417366, + "loss": 0.5099, + "step": 12974 + }, + { + "epoch": 7.248603351955307, + "grad_norm": 0.5067930817604065, + "learning_rate": 0.0006396078431372549, + "loss": 0.5719, + "step": 12975 + }, + { + "epoch": 7.249162011173184, + "grad_norm": 0.9654607176780701, + "learning_rate": 0.0006395798319327731, + "loss": 0.4232, + "step": 12976 + }, + { + "epoch": 7.249720670391062, + "grad_norm": 0.3738797605037689, + "learning_rate": 0.0006395518207282913, + "loss": 0.3812, + "step": 12977 + }, + { + "epoch": 7.250279329608938, + "grad_norm": 0.4697604775428772, + "learning_rate": 0.0006395238095238096, + "loss": 0.4742, + "step": 12978 + }, + { + "epoch": 7.250837988826816, + "grad_norm": 0.693396270275116, + "learning_rate": 0.0006394957983193277, + "loss": 0.5476, + "step": 12979 + }, + { + "epoch": 7.251396648044693, + "grad_norm": 0.6955320239067078, + "learning_rate": 0.000639467787114846, + "loss": 0.3677, + "step": 12980 + }, + { + "epoch": 7.25195530726257, + "grad_norm": 0.36686623096466064, + "learning_rate": 0.0006394397759103642, + "loss": 0.4218, + "step": 12981 + }, + { + "epoch": 7.252513966480447, + "grad_norm": 0.45032379031181335, + "learning_rate": 0.0006394117647058824, + "loss": 0.4203, + "step": 12982 + }, + { + "epoch": 7.253072625698324, + "grad_norm": 0.5395963788032532, + "learning_rate": 0.0006393837535014006, + "loss": 0.4838, + "step": 12983 + }, + { + "epoch": 7.253631284916201, + "grad_norm": 0.4786473214626312, + "learning_rate": 0.0006393557422969187, + "loss": 0.5148, + "step": 12984 + }, + { + "epoch": 7.254189944134078, + "grad_norm": 0.4348624050617218, + "learning_rate": 0.000639327731092437, + "loss": 0.4096, + "step": 12985 + }, + { + "epoch": 7.254748603351955, + "grad_norm": 0.562849760055542, + "learning_rate": 0.0006392997198879552, + "loss": 0.4654, + "step": 12986 + }, + { + "epoch": 7.255307262569833, + "grad_norm": 0.502211332321167, + "learning_rate": 0.0006392717086834734, + "loss": 0.477, + "step": 12987 + }, + { + "epoch": 7.255865921787709, + "grad_norm": 0.6770154237747192, + "learning_rate": 0.0006392436974789916, + "loss": 0.6185, + "step": 12988 + }, + { + "epoch": 7.256424581005587, + "grad_norm": 0.43307065963745117, + "learning_rate": 0.0006392156862745098, + "loss": 0.5102, + "step": 12989 + }, + { + "epoch": 7.256983240223463, + "grad_norm": 1.998072862625122, + "learning_rate": 0.000639187675070028, + "loss": 0.4388, + "step": 12990 + }, + { + "epoch": 7.257541899441341, + "grad_norm": 0.5020814538002014, + "learning_rate": 0.0006391596638655462, + "loss": 0.4729, + "step": 12991 + }, + { + "epoch": 7.258100558659218, + "grad_norm": 0.5628366470336914, + "learning_rate": 0.0006391316526610644, + "loss": 0.4677, + "step": 12992 + }, + { + "epoch": 7.258659217877095, + "grad_norm": 1.2835266590118408, + "learning_rate": 0.0006391036414565826, + "loss": 0.4214, + "step": 12993 + }, + { + "epoch": 7.259217877094972, + "grad_norm": 0.6612874269485474, + "learning_rate": 0.0006390756302521009, + "loss": 0.4315, + "step": 12994 + }, + { + "epoch": 7.259776536312849, + "grad_norm": 0.4183623790740967, + "learning_rate": 0.0006390476190476191, + "loss": 0.4478, + "step": 12995 + }, + { + "epoch": 7.260335195530726, + "grad_norm": 0.44214096665382385, + "learning_rate": 0.0006390196078431373, + "loss": 0.3958, + "step": 12996 + }, + { + "epoch": 7.260893854748604, + "grad_norm": 0.44483283162117004, + "learning_rate": 0.0006389915966386555, + "loss": 0.5137, + "step": 12997 + }, + { + "epoch": 7.26145251396648, + "grad_norm": 1.0209646224975586, + "learning_rate": 0.0006389635854341737, + "loss": 0.5574, + "step": 12998 + }, + { + "epoch": 7.262011173184358, + "grad_norm": 0.5998567938804626, + "learning_rate": 0.0006389355742296919, + "loss": 0.445, + "step": 12999 + }, + { + "epoch": 7.262569832402234, + "grad_norm": 0.7421895265579224, + "learning_rate": 0.0006389075630252101, + "loss": 0.4515, + "step": 13000 + }, + { + "epoch": 7.262569832402234, + "eval_cer": 0.09136089380598562, + "eval_loss": 0.34045925736427307, + "eval_runtime": 55.5748, + "eval_samples_per_second": 81.656, + "eval_steps_per_second": 5.11, + "eval_wer": 0.36032513058294463, + "step": 13000 + }, + { + "epoch": 7.263128491620112, + "grad_norm": 0.39072561264038086, + "learning_rate": 0.0006388795518207283, + "loss": 0.4199, + "step": 13001 + }, + { + "epoch": 7.263687150837989, + "grad_norm": 0.6472405791282654, + "learning_rate": 0.0006388515406162465, + "loss": 0.4129, + "step": 13002 + }, + { + "epoch": 7.264245810055866, + "grad_norm": 0.3537490963935852, + "learning_rate": 0.0006388235294117647, + "loss": 0.3476, + "step": 13003 + }, + { + "epoch": 7.264804469273743, + "grad_norm": 0.38289088010787964, + "learning_rate": 0.0006387955182072829, + "loss": 0.3555, + "step": 13004 + }, + { + "epoch": 7.26536312849162, + "grad_norm": 0.6368013620376587, + "learning_rate": 0.0006387675070028012, + "loss": 0.4559, + "step": 13005 + }, + { + "epoch": 7.265921787709497, + "grad_norm": 0.4954204857349396, + "learning_rate": 0.0006387394957983193, + "loss": 0.4473, + "step": 13006 + }, + { + "epoch": 7.266480446927375, + "grad_norm": 0.6269132494926453, + "learning_rate": 0.0006387114845938375, + "loss": 0.4599, + "step": 13007 + }, + { + "epoch": 7.267039106145251, + "grad_norm": 0.5162798762321472, + "learning_rate": 0.0006386834733893557, + "loss": 0.5593, + "step": 13008 + }, + { + "epoch": 7.267597765363129, + "grad_norm": 0.4024692177772522, + "learning_rate": 0.0006386554621848739, + "loss": 0.3817, + "step": 13009 + }, + { + "epoch": 7.268156424581005, + "grad_norm": 0.3753225803375244, + "learning_rate": 0.0006386274509803923, + "loss": 0.3267, + "step": 13010 + }, + { + "epoch": 7.268715083798883, + "grad_norm": 0.4820145070552826, + "learning_rate": 0.0006385994397759104, + "loss": 0.4598, + "step": 13011 + }, + { + "epoch": 7.269273743016759, + "grad_norm": 0.4811007082462311, + "learning_rate": 0.0006385714285714286, + "loss": 0.3824, + "step": 13012 + }, + { + "epoch": 7.269832402234637, + "grad_norm": 0.6950400471687317, + "learning_rate": 0.0006385434173669468, + "loss": 0.374, + "step": 13013 + }, + { + "epoch": 7.270391061452514, + "grad_norm": 0.5128576159477234, + "learning_rate": 0.000638515406162465, + "loss": 0.4784, + "step": 13014 + }, + { + "epoch": 7.270949720670391, + "grad_norm": 0.3894340395927429, + "learning_rate": 0.0006384873949579833, + "loss": 0.4427, + "step": 13015 + }, + { + "epoch": 7.271508379888268, + "grad_norm": 0.5755175948143005, + "learning_rate": 0.0006384593837535014, + "loss": 0.471, + "step": 13016 + }, + { + "epoch": 7.272067039106146, + "grad_norm": 0.37587255239486694, + "learning_rate": 0.0006384313725490196, + "loss": 0.4461, + "step": 13017 + }, + { + "epoch": 7.272625698324022, + "grad_norm": 0.5242875814437866, + "learning_rate": 0.0006384033613445378, + "loss": 0.3799, + "step": 13018 + }, + { + "epoch": 7.2731843575419, + "grad_norm": 0.47452613711357117, + "learning_rate": 0.000638375350140056, + "loss": 0.5535, + "step": 13019 + }, + { + "epoch": 7.273743016759776, + "grad_norm": 0.7593187689781189, + "learning_rate": 0.0006383473389355743, + "loss": 0.4142, + "step": 13020 + }, + { + "epoch": 7.274301675977654, + "grad_norm": 0.4709136486053467, + "learning_rate": 0.0006383193277310925, + "loss": 0.4364, + "step": 13021 + }, + { + "epoch": 7.27486033519553, + "grad_norm": 0.4354836940765381, + "learning_rate": 0.0006382913165266106, + "loss": 0.5133, + "step": 13022 + }, + { + "epoch": 7.275418994413408, + "grad_norm": 0.42964091897010803, + "learning_rate": 0.0006382633053221288, + "loss": 0.3765, + "step": 13023 + }, + { + "epoch": 7.275977653631285, + "grad_norm": 0.3984837830066681, + "learning_rate": 0.000638235294117647, + "loss": 0.4564, + "step": 13024 + }, + { + "epoch": 7.276536312849162, + "grad_norm": 0.8874265551567078, + "learning_rate": 0.0006382072829131653, + "loss": 0.5248, + "step": 13025 + }, + { + "epoch": 7.277094972067039, + "grad_norm": 0.7759783864021301, + "learning_rate": 0.0006381792717086836, + "loss": 0.4771, + "step": 13026 + }, + { + "epoch": 7.277653631284916, + "grad_norm": 0.47065407037734985, + "learning_rate": 0.0006381512605042016, + "loss": 0.3769, + "step": 13027 + }, + { + "epoch": 7.278212290502793, + "grad_norm": 0.35122138261795044, + "learning_rate": 0.0006381232492997199, + "loss": 0.3498, + "step": 13028 + }, + { + "epoch": 7.278770949720671, + "grad_norm": 0.43810176849365234, + "learning_rate": 0.0006380952380952381, + "loss": 0.379, + "step": 13029 + }, + { + "epoch": 7.279329608938547, + "grad_norm": 0.46714839339256287, + "learning_rate": 0.0006380672268907564, + "loss": 0.4172, + "step": 13030 + }, + { + "epoch": 7.279888268156425, + "grad_norm": 0.45940467715263367, + "learning_rate": 0.0006380392156862746, + "loss": 0.4781, + "step": 13031 + }, + { + "epoch": 7.280446927374301, + "grad_norm": 1.0352586507797241, + "learning_rate": 0.0006380112044817927, + "loss": 0.3935, + "step": 13032 + }, + { + "epoch": 7.281005586592179, + "grad_norm": 0.5442858338356018, + "learning_rate": 0.0006379831932773109, + "loss": 0.4074, + "step": 13033 + }, + { + "epoch": 7.281564245810056, + "grad_norm": 0.3928253650665283, + "learning_rate": 0.0006379551820728291, + "loss": 0.3953, + "step": 13034 + }, + { + "epoch": 7.282122905027933, + "grad_norm": 0.6782439947128296, + "learning_rate": 0.0006379271708683474, + "loss": 0.5075, + "step": 13035 + }, + { + "epoch": 7.28268156424581, + "grad_norm": 0.528557538986206, + "learning_rate": 0.0006378991596638656, + "loss": 0.4141, + "step": 13036 + }, + { + "epoch": 7.283240223463687, + "grad_norm": 1.3695623874664307, + "learning_rate": 0.0006378711484593838, + "loss": 0.5236, + "step": 13037 + }, + { + "epoch": 7.283798882681564, + "grad_norm": 0.49207592010498047, + "learning_rate": 0.0006378431372549019, + "loss": 0.5545, + "step": 13038 + }, + { + "epoch": 7.284357541899442, + "grad_norm": 0.4578695297241211, + "learning_rate": 0.0006378151260504201, + "loss": 0.4887, + "step": 13039 + }, + { + "epoch": 7.284916201117318, + "grad_norm": 0.5954352617263794, + "learning_rate": 0.0006377871148459384, + "loss": 0.3935, + "step": 13040 + }, + { + "epoch": 7.285474860335196, + "grad_norm": 0.47506049275398254, + "learning_rate": 0.0006377591036414566, + "loss": 0.4123, + "step": 13041 + }, + { + "epoch": 7.286033519553072, + "grad_norm": 0.6954988837242126, + "learning_rate": 0.0006377310924369748, + "loss": 0.4929, + "step": 13042 + }, + { + "epoch": 7.28659217877095, + "grad_norm": 0.38984450697898865, + "learning_rate": 0.0006377030812324929, + "loss": 0.3274, + "step": 13043 + }, + { + "epoch": 7.287150837988827, + "grad_norm": 0.6316025257110596, + "learning_rate": 0.0006376750700280111, + "loss": 0.4704, + "step": 13044 + }, + { + "epoch": 7.287709497206704, + "grad_norm": 0.6168654561042786, + "learning_rate": 0.0006376470588235295, + "loss": 0.4664, + "step": 13045 + }, + { + "epoch": 7.288268156424581, + "grad_norm": 0.6625655889511108, + "learning_rate": 0.0006376190476190477, + "loss": 0.4639, + "step": 13046 + }, + { + "epoch": 7.288826815642458, + "grad_norm": 0.6546689867973328, + "learning_rate": 0.0006375910364145659, + "loss": 0.7042, + "step": 13047 + }, + { + "epoch": 7.289385474860335, + "grad_norm": 0.4011949300765991, + "learning_rate": 0.000637563025210084, + "loss": 0.4428, + "step": 13048 + }, + { + "epoch": 7.289944134078212, + "grad_norm": 0.7093271613121033, + "learning_rate": 0.0006375350140056022, + "loss": 0.6982, + "step": 13049 + }, + { + "epoch": 7.290502793296089, + "grad_norm": 0.44860026240348816, + "learning_rate": 0.0006375070028011205, + "loss": 0.4716, + "step": 13050 + }, + { + "epoch": 7.291061452513967, + "grad_norm": 0.5320126414299011, + "learning_rate": 0.0006374789915966387, + "loss": 0.4712, + "step": 13051 + }, + { + "epoch": 7.291620111731843, + "grad_norm": 0.3747129738330841, + "learning_rate": 0.0006374509803921569, + "loss": 0.3464, + "step": 13052 + }, + { + "epoch": 7.292178770949721, + "grad_norm": 0.5009068250656128, + "learning_rate": 0.0006374229691876751, + "loss": 0.4855, + "step": 13053 + }, + { + "epoch": 7.292737430167598, + "grad_norm": 0.5898135900497437, + "learning_rate": 0.0006373949579831932, + "loss": 0.356, + "step": 13054 + }, + { + "epoch": 7.293296089385475, + "grad_norm": 2.223297357559204, + "learning_rate": 0.0006373669467787115, + "loss": 0.5516, + "step": 13055 + }, + { + "epoch": 7.293854748603352, + "grad_norm": 0.5072278380393982, + "learning_rate": 0.0006373389355742297, + "loss": 0.474, + "step": 13056 + }, + { + "epoch": 7.294413407821229, + "grad_norm": 0.6756377220153809, + "learning_rate": 0.0006373109243697479, + "loss": 0.4579, + "step": 13057 + }, + { + "epoch": 7.294972067039106, + "grad_norm": 1.3354785442352295, + "learning_rate": 0.0006372829131652661, + "loss": 0.4413, + "step": 13058 + }, + { + "epoch": 7.295530726256983, + "grad_norm": 0.5601457357406616, + "learning_rate": 0.0006372549019607842, + "loss": 0.5884, + "step": 13059 + }, + { + "epoch": 7.29608938547486, + "grad_norm": 1.3154665231704712, + "learning_rate": 0.0006372268907563026, + "loss": 0.4807, + "step": 13060 + }, + { + "epoch": 7.296648044692738, + "grad_norm": 0.5090473294258118, + "learning_rate": 0.0006371988795518208, + "loss": 0.4381, + "step": 13061 + }, + { + "epoch": 7.297206703910614, + "grad_norm": 0.6379081606864929, + "learning_rate": 0.000637170868347339, + "loss": 0.4265, + "step": 13062 + }, + { + "epoch": 7.297765363128492, + "grad_norm": 0.46991169452667236, + "learning_rate": 0.0006371428571428572, + "loss": 0.415, + "step": 13063 + }, + { + "epoch": 7.298324022346368, + "grad_norm": 0.3955645263195038, + "learning_rate": 0.0006371148459383753, + "loss": 0.4217, + "step": 13064 + }, + { + "epoch": 7.298882681564246, + "grad_norm": 0.5086484551429749, + "learning_rate": 0.0006370868347338936, + "loss": 0.537, + "step": 13065 + }, + { + "epoch": 7.299441340782123, + "grad_norm": 0.5588968992233276, + "learning_rate": 0.0006370588235294118, + "loss": 0.3502, + "step": 13066 + }, + { + "epoch": 7.3, + "grad_norm": 0.5842592716217041, + "learning_rate": 0.00063703081232493, + "loss": 0.4281, + "step": 13067 + }, + { + "epoch": 7.300558659217877, + "grad_norm": 0.5039697885513306, + "learning_rate": 0.0006370028011204482, + "loss": 0.4264, + "step": 13068 + }, + { + "epoch": 7.301117318435754, + "grad_norm": 0.5301884412765503, + "learning_rate": 0.0006369747899159664, + "loss": 0.5225, + "step": 13069 + }, + { + "epoch": 7.301675977653631, + "grad_norm": 0.43623143434524536, + "learning_rate": 0.0006369467787114846, + "loss": 0.4709, + "step": 13070 + }, + { + "epoch": 7.302234636871509, + "grad_norm": 0.46189144253730774, + "learning_rate": 0.0006369187675070028, + "loss": 0.4573, + "step": 13071 + }, + { + "epoch": 7.302793296089385, + "grad_norm": 0.38868826627731323, + "learning_rate": 0.000636890756302521, + "loss": 0.3842, + "step": 13072 + }, + { + "epoch": 7.303351955307263, + "grad_norm": 0.46762970089912415, + "learning_rate": 0.0006368627450980392, + "loss": 0.4529, + "step": 13073 + }, + { + "epoch": 7.303910614525139, + "grad_norm": 0.4306127429008484, + "learning_rate": 0.0006368347338935574, + "loss": 0.41, + "step": 13074 + }, + { + "epoch": 7.304469273743017, + "grad_norm": 0.44215765595436096, + "learning_rate": 0.0006368067226890756, + "loss": 0.4002, + "step": 13075 + }, + { + "epoch": 7.305027932960894, + "grad_norm": 0.5541375875473022, + "learning_rate": 0.0006367787114845939, + "loss": 0.4775, + "step": 13076 + }, + { + "epoch": 7.305586592178771, + "grad_norm": 1.2680015563964844, + "learning_rate": 0.0006367507002801121, + "loss": 0.4617, + "step": 13077 + }, + { + "epoch": 7.306145251396648, + "grad_norm": 0.707722008228302, + "learning_rate": 0.0006367226890756303, + "loss": 0.5097, + "step": 13078 + }, + { + "epoch": 7.306703910614525, + "grad_norm": 0.5952586531639099, + "learning_rate": 0.0006366946778711485, + "loss": 0.4845, + "step": 13079 + }, + { + "epoch": 7.307262569832402, + "grad_norm": 0.7303816080093384, + "learning_rate": 0.0006366666666666667, + "loss": 0.4522, + "step": 13080 + }, + { + "epoch": 7.30782122905028, + "grad_norm": 0.4877987504005432, + "learning_rate": 0.0006366386554621849, + "loss": 0.3812, + "step": 13081 + }, + { + "epoch": 7.308379888268156, + "grad_norm": 10.018289566040039, + "learning_rate": 0.0006366106442577031, + "loss": 0.4897, + "step": 13082 + }, + { + "epoch": 7.308938547486034, + "grad_norm": 2.4549496173858643, + "learning_rate": 0.0006365826330532213, + "loss": 0.3864, + "step": 13083 + }, + { + "epoch": 7.30949720670391, + "grad_norm": 0.6826034784317017, + "learning_rate": 0.0006365546218487395, + "loss": 0.5868, + "step": 13084 + }, + { + "epoch": 7.310055865921788, + "grad_norm": 0.6241563558578491, + "learning_rate": 0.0006365266106442578, + "loss": 0.4755, + "step": 13085 + }, + { + "epoch": 7.310614525139664, + "grad_norm": 1.0280530452728271, + "learning_rate": 0.0006364985994397759, + "loss": 0.501, + "step": 13086 + }, + { + "epoch": 7.311173184357542, + "grad_norm": 0.5856412053108215, + "learning_rate": 0.0006364705882352941, + "loss": 0.3779, + "step": 13087 + }, + { + "epoch": 7.311731843575419, + "grad_norm": 0.4948166012763977, + "learning_rate": 0.0006364425770308123, + "loss": 0.3793, + "step": 13088 + }, + { + "epoch": 7.312290502793296, + "grad_norm": 0.4947815537452698, + "learning_rate": 0.0006364145658263305, + "loss": 0.4661, + "step": 13089 + }, + { + "epoch": 7.312849162011173, + "grad_norm": 0.7149274349212646, + "learning_rate": 0.0006363865546218488, + "loss": 0.4161, + "step": 13090 + }, + { + "epoch": 7.31340782122905, + "grad_norm": 0.5715914964675903, + "learning_rate": 0.0006363585434173669, + "loss": 0.4912, + "step": 13091 + }, + { + "epoch": 7.313966480446927, + "grad_norm": 1.0212067365646362, + "learning_rate": 0.0006363305322128851, + "loss": 0.4814, + "step": 13092 + }, + { + "epoch": 7.314525139664805, + "grad_norm": 0.6858073472976685, + "learning_rate": 0.0006363025210084034, + "loss": 0.4723, + "step": 13093 + }, + { + "epoch": 7.315083798882681, + "grad_norm": 0.4616902470588684, + "learning_rate": 0.0006362745098039216, + "loss": 0.2839, + "step": 13094 + }, + { + "epoch": 7.315642458100559, + "grad_norm": 3.5520362854003906, + "learning_rate": 0.0006362464985994399, + "loss": 0.4204, + "step": 13095 + }, + { + "epoch": 7.316201117318435, + "grad_norm": 0.584577202796936, + "learning_rate": 0.000636218487394958, + "loss": 0.5285, + "step": 13096 + }, + { + "epoch": 7.316759776536313, + "grad_norm": 0.478901743888855, + "learning_rate": 0.0006361904761904762, + "loss": 0.4025, + "step": 13097 + }, + { + "epoch": 7.31731843575419, + "grad_norm": 0.7869493365287781, + "learning_rate": 0.0006361624649859944, + "loss": 0.4436, + "step": 13098 + }, + { + "epoch": 7.317877094972067, + "grad_norm": 0.4529644846916199, + "learning_rate": 0.0006361344537815126, + "loss": 0.4191, + "step": 13099 + }, + { + "epoch": 7.318435754189944, + "grad_norm": 0.6310835480690002, + "learning_rate": 0.0006361064425770309, + "loss": 0.5404, + "step": 13100 + }, + { + "epoch": 7.318994413407821, + "grad_norm": 0.5917539000511169, + "learning_rate": 0.0006360784313725491, + "loss": 0.4313, + "step": 13101 + }, + { + "epoch": 7.319553072625698, + "grad_norm": 0.4380839765071869, + "learning_rate": 0.0006360504201680672, + "loss": 0.4137, + "step": 13102 + }, + { + "epoch": 7.320111731843576, + "grad_norm": 0.842938244342804, + "learning_rate": 0.0006360224089635854, + "loss": 0.3857, + "step": 13103 + }, + { + "epoch": 7.320670391061452, + "grad_norm": 0.8247633576393127, + "learning_rate": 0.0006359943977591036, + "loss": 0.4361, + "step": 13104 + }, + { + "epoch": 7.32122905027933, + "grad_norm": 0.48302939534187317, + "learning_rate": 0.0006359663865546219, + "loss": 0.4689, + "step": 13105 + }, + { + "epoch": 7.321787709497206, + "grad_norm": 0.7881895303726196, + "learning_rate": 0.0006359383753501401, + "loss": 0.4015, + "step": 13106 + }, + { + "epoch": 7.322346368715084, + "grad_norm": 0.6172569394111633, + "learning_rate": 0.0006359103641456582, + "loss": 0.5689, + "step": 13107 + }, + { + "epoch": 7.322905027932961, + "grad_norm": 0.9039911031723022, + "learning_rate": 0.0006358823529411764, + "loss": 0.4664, + "step": 13108 + }, + { + "epoch": 7.323463687150838, + "grad_norm": 3.5886175632476807, + "learning_rate": 0.0006358543417366946, + "loss": 0.3362, + "step": 13109 + }, + { + "epoch": 7.324022346368715, + "grad_norm": 0.4437963664531708, + "learning_rate": 0.000635826330532213, + "loss": 0.5628, + "step": 13110 + }, + { + "epoch": 7.324581005586592, + "grad_norm": 0.6224605441093445, + "learning_rate": 0.0006357983193277312, + "loss": 0.4705, + "step": 13111 + }, + { + "epoch": 7.325139664804469, + "grad_norm": 0.5060938000679016, + "learning_rate": 0.0006357703081232493, + "loss": 0.5119, + "step": 13112 + }, + { + "epoch": 7.325698324022347, + "grad_norm": 0.42316100001335144, + "learning_rate": 0.0006357422969187675, + "loss": 0.3545, + "step": 13113 + }, + { + "epoch": 7.326256983240223, + "grad_norm": 0.48364296555519104, + "learning_rate": 0.0006357142857142857, + "loss": 0.5177, + "step": 13114 + }, + { + "epoch": 7.326815642458101, + "grad_norm": 0.5766605734825134, + "learning_rate": 0.000635686274509804, + "loss": 0.4974, + "step": 13115 + }, + { + "epoch": 7.327374301675977, + "grad_norm": 0.5737341046333313, + "learning_rate": 0.0006356582633053222, + "loss": 0.4544, + "step": 13116 + }, + { + "epoch": 7.327932960893855, + "grad_norm": 0.7070626616477966, + "learning_rate": 0.0006356302521008404, + "loss": 0.5661, + "step": 13117 + }, + { + "epoch": 7.328491620111732, + "grad_norm": 0.8192995190620422, + "learning_rate": 0.0006356022408963585, + "loss": 0.3683, + "step": 13118 + }, + { + "epoch": 7.329050279329609, + "grad_norm": 0.6192365288734436, + "learning_rate": 0.0006355742296918767, + "loss": 0.4181, + "step": 13119 + }, + { + "epoch": 7.329608938547486, + "grad_norm": 0.46250036358833313, + "learning_rate": 0.000635546218487395, + "loss": 0.524, + "step": 13120 + }, + { + "epoch": 7.330167597765363, + "grad_norm": 0.6766829490661621, + "learning_rate": 0.0006355182072829132, + "loss": 0.4381, + "step": 13121 + }, + { + "epoch": 7.33072625698324, + "grad_norm": 0.5478894710540771, + "learning_rate": 0.0006354901960784314, + "loss": 0.6328, + "step": 13122 + }, + { + "epoch": 7.331284916201117, + "grad_norm": 1.5401118993759155, + "learning_rate": 0.0006354621848739495, + "loss": 0.3689, + "step": 13123 + }, + { + "epoch": 7.331843575418994, + "grad_norm": 0.7330833077430725, + "learning_rate": 0.0006354341736694677, + "loss": 0.4797, + "step": 13124 + }, + { + "epoch": 7.332402234636872, + "grad_norm": 0.532580554485321, + "learning_rate": 0.000635406162464986, + "loss": 0.3267, + "step": 13125 + }, + { + "epoch": 7.332960893854748, + "grad_norm": 0.4393022358417511, + "learning_rate": 0.0006353781512605043, + "loss": 0.4645, + "step": 13126 + }, + { + "epoch": 7.333519553072626, + "grad_norm": 0.7605563402175903, + "learning_rate": 0.0006353501400560225, + "loss": 0.5058, + "step": 13127 + }, + { + "epoch": 7.334078212290502, + "grad_norm": 1.0321952104568481, + "learning_rate": 0.0006353221288515406, + "loss": 0.4405, + "step": 13128 + }, + { + "epoch": 7.33463687150838, + "grad_norm": 0.3676065504550934, + "learning_rate": 0.0006352941176470588, + "loss": 0.3764, + "step": 13129 + }, + { + "epoch": 7.335195530726257, + "grad_norm": 0.6089480519294739, + "learning_rate": 0.0006352661064425771, + "loss": 0.3649, + "step": 13130 + }, + { + "epoch": 7.335754189944134, + "grad_norm": 0.483052134513855, + "learning_rate": 0.0006352380952380953, + "loss": 0.4819, + "step": 13131 + }, + { + "epoch": 7.336312849162011, + "grad_norm": 0.47349387407302856, + "learning_rate": 0.0006352100840336135, + "loss": 0.4485, + "step": 13132 + }, + { + "epoch": 7.336871508379888, + "grad_norm": 0.43881145119667053, + "learning_rate": 0.0006351820728291317, + "loss": 0.4042, + "step": 13133 + }, + { + "epoch": 7.337430167597765, + "grad_norm": 0.4796612858772278, + "learning_rate": 0.0006351540616246498, + "loss": 0.4279, + "step": 13134 + }, + { + "epoch": 7.337988826815643, + "grad_norm": 0.324850857257843, + "learning_rate": 0.0006351260504201681, + "loss": 0.3721, + "step": 13135 + }, + { + "epoch": 7.338547486033519, + "grad_norm": 0.4559110105037689, + "learning_rate": 0.0006350980392156863, + "loss": 0.5117, + "step": 13136 + }, + { + "epoch": 7.339106145251397, + "grad_norm": 0.3338627517223358, + "learning_rate": 0.0006350700280112045, + "loss": 0.3957, + "step": 13137 + }, + { + "epoch": 7.339664804469273, + "grad_norm": 0.5494134426116943, + "learning_rate": 0.0006350420168067227, + "loss": 0.5581, + "step": 13138 + }, + { + "epoch": 7.340223463687151, + "grad_norm": 0.36717334389686584, + "learning_rate": 0.0006350140056022408, + "loss": 0.3289, + "step": 13139 + }, + { + "epoch": 7.340782122905028, + "grad_norm": 0.46892473101615906, + "learning_rate": 0.0006349859943977591, + "loss": 0.4301, + "step": 13140 + }, + { + "epoch": 7.341340782122905, + "grad_norm": 0.48795053362846375, + "learning_rate": 0.0006349579831932773, + "loss": 0.4147, + "step": 13141 + }, + { + "epoch": 7.341899441340782, + "grad_norm": 0.481522798538208, + "learning_rate": 0.0006349299719887956, + "loss": 0.4936, + "step": 13142 + }, + { + "epoch": 7.342458100558659, + "grad_norm": 3.654675245285034, + "learning_rate": 0.0006349019607843138, + "loss": 0.5425, + "step": 13143 + }, + { + "epoch": 7.343016759776536, + "grad_norm": 0.4006863236427307, + "learning_rate": 0.0006348739495798319, + "loss": 0.5416, + "step": 13144 + }, + { + "epoch": 7.343575418994414, + "grad_norm": 0.5376516580581665, + "learning_rate": 0.0006348459383753502, + "loss": 0.4267, + "step": 13145 + }, + { + "epoch": 7.34413407821229, + "grad_norm": 0.44530048966407776, + "learning_rate": 0.0006348179271708684, + "loss": 0.4671, + "step": 13146 + }, + { + "epoch": 7.344692737430168, + "grad_norm": 1.1315100193023682, + "learning_rate": 0.0006347899159663866, + "loss": 0.4961, + "step": 13147 + }, + { + "epoch": 7.345251396648044, + "grad_norm": 0.39795854687690735, + "learning_rate": 0.0006347619047619048, + "loss": 0.3401, + "step": 13148 + }, + { + "epoch": 7.345810055865922, + "grad_norm": 2.752030372619629, + "learning_rate": 0.000634733893557423, + "loss": 0.4108, + "step": 13149 + }, + { + "epoch": 7.346368715083799, + "grad_norm": 0.5136620998382568, + "learning_rate": 0.0006347058823529412, + "loss": 0.5094, + "step": 13150 + }, + { + "epoch": 7.346927374301676, + "grad_norm": 0.3893483579158783, + "learning_rate": 0.0006346778711484594, + "loss": 0.4612, + "step": 13151 + }, + { + "epoch": 7.347486033519553, + "grad_norm": 0.47714313864707947, + "learning_rate": 0.0006346498599439776, + "loss": 0.4142, + "step": 13152 + }, + { + "epoch": 7.34804469273743, + "grad_norm": 3.0546116828918457, + "learning_rate": 0.0006346218487394958, + "loss": 0.3998, + "step": 13153 + }, + { + "epoch": 7.348603351955307, + "grad_norm": 0.5748021602630615, + "learning_rate": 0.000634593837535014, + "loss": 0.508, + "step": 13154 + }, + { + "epoch": 7.349162011173185, + "grad_norm": 0.41996580362319946, + "learning_rate": 0.0006345658263305322, + "loss": 0.4229, + "step": 13155 + }, + { + "epoch": 7.349720670391061, + "grad_norm": 0.5384958386421204, + "learning_rate": 0.0006345378151260504, + "loss": 0.4358, + "step": 13156 + }, + { + "epoch": 7.350279329608939, + "grad_norm": 0.606629490852356, + "learning_rate": 0.0006345098039215686, + "loss": 0.3531, + "step": 13157 + }, + { + "epoch": 7.350837988826815, + "grad_norm": 0.415831059217453, + "learning_rate": 0.0006344817927170869, + "loss": 0.328, + "step": 13158 + }, + { + "epoch": 7.351396648044693, + "grad_norm": 0.4611166715621948, + "learning_rate": 0.0006344537815126051, + "loss": 0.4616, + "step": 13159 + }, + { + "epoch": 7.351955307262569, + "grad_norm": 0.6233753561973572, + "learning_rate": 0.0006344257703081234, + "loss": 0.4287, + "step": 13160 + }, + { + "epoch": 7.352513966480447, + "grad_norm": 0.6902372241020203, + "learning_rate": 0.0006343977591036415, + "loss": 0.4051, + "step": 13161 + }, + { + "epoch": 7.353072625698324, + "grad_norm": 0.38717037439346313, + "learning_rate": 0.0006343697478991597, + "loss": 0.3819, + "step": 13162 + }, + { + "epoch": 7.353631284916201, + "grad_norm": 0.6942891478538513, + "learning_rate": 0.0006343417366946779, + "loss": 0.3681, + "step": 13163 + }, + { + "epoch": 7.354189944134078, + "grad_norm": 1.0617579221725464, + "learning_rate": 0.0006343137254901961, + "loss": 0.5983, + "step": 13164 + }, + { + "epoch": 7.354748603351955, + "grad_norm": 1.2739969491958618, + "learning_rate": 0.0006342857142857143, + "loss": 0.4021, + "step": 13165 + }, + { + "epoch": 7.355307262569832, + "grad_norm": 0.6465200185775757, + "learning_rate": 0.0006342577030812325, + "loss": 0.3021, + "step": 13166 + }, + { + "epoch": 7.35586592178771, + "grad_norm": 0.6682726144790649, + "learning_rate": 0.0006342296918767507, + "loss": 0.4849, + "step": 13167 + }, + { + "epoch": 7.356424581005586, + "grad_norm": 0.4172304570674896, + "learning_rate": 0.0006342016806722689, + "loss": 0.4584, + "step": 13168 + }, + { + "epoch": 7.356983240223464, + "grad_norm": 0.6555324196815491, + "learning_rate": 0.0006341736694677871, + "loss": 0.4939, + "step": 13169 + }, + { + "epoch": 7.35754189944134, + "grad_norm": 0.5502414703369141, + "learning_rate": 0.0006341456582633053, + "loss": 0.5627, + "step": 13170 + }, + { + "epoch": 7.358100558659218, + "grad_norm": 1.2976518869400024, + "learning_rate": 0.0006341176470588235, + "loss": 0.4506, + "step": 13171 + }, + { + "epoch": 7.358659217877095, + "grad_norm": 0.42921504378318787, + "learning_rate": 0.0006340896358543417, + "loss": 0.4531, + "step": 13172 + }, + { + "epoch": 7.359217877094972, + "grad_norm": 0.4945555031299591, + "learning_rate": 0.0006340616246498599, + "loss": 0.4605, + "step": 13173 + }, + { + "epoch": 7.359776536312849, + "grad_norm": 0.5439903140068054, + "learning_rate": 0.0006340336134453781, + "loss": 0.3161, + "step": 13174 + }, + { + "epoch": 7.360335195530726, + "grad_norm": 0.6298562288284302, + "learning_rate": 0.0006340056022408964, + "loss": 0.4117, + "step": 13175 + }, + { + "epoch": 7.360893854748603, + "grad_norm": 0.8301893472671509, + "learning_rate": 0.0006339775910364147, + "loss": 0.5166, + "step": 13176 + }, + { + "epoch": 7.361452513966481, + "grad_norm": 0.46162816882133484, + "learning_rate": 0.0006339495798319328, + "loss": 0.4338, + "step": 13177 + }, + { + "epoch": 7.362011173184357, + "grad_norm": 0.6729776859283447, + "learning_rate": 0.000633921568627451, + "loss": 0.3576, + "step": 13178 + }, + { + "epoch": 7.362569832402235, + "grad_norm": 7.691051483154297, + "learning_rate": 0.0006338935574229692, + "loss": 0.4666, + "step": 13179 + }, + { + "epoch": 7.363128491620111, + "grad_norm": 0.8211254477500916, + "learning_rate": 0.0006338655462184874, + "loss": 0.4905, + "step": 13180 + }, + { + "epoch": 7.363687150837989, + "grad_norm": 0.7436676621437073, + "learning_rate": 0.0006338375350140057, + "loss": 0.5449, + "step": 13181 + }, + { + "epoch": 7.364245810055866, + "grad_norm": 0.4844485819339752, + "learning_rate": 0.0006338095238095238, + "loss": 0.4974, + "step": 13182 + }, + { + "epoch": 7.364804469273743, + "grad_norm": 0.4114803075790405, + "learning_rate": 0.000633781512605042, + "loss": 0.4529, + "step": 13183 + }, + { + "epoch": 7.36536312849162, + "grad_norm": 0.8345039486885071, + "learning_rate": 0.0006337535014005602, + "loss": 0.4747, + "step": 13184 + }, + { + "epoch": 7.365921787709497, + "grad_norm": 0.5096237659454346, + "learning_rate": 0.0006337254901960784, + "loss": 0.4078, + "step": 13185 + }, + { + "epoch": 7.366480446927374, + "grad_norm": 0.36407962441444397, + "learning_rate": 0.0006336974789915967, + "loss": 0.4519, + "step": 13186 + }, + { + "epoch": 7.367039106145251, + "grad_norm": 0.4901229739189148, + "learning_rate": 0.0006336694677871148, + "loss": 0.4024, + "step": 13187 + }, + { + "epoch": 7.367597765363128, + "grad_norm": 0.45065784454345703, + "learning_rate": 0.000633641456582633, + "loss": 0.4746, + "step": 13188 + }, + { + "epoch": 7.368156424581006, + "grad_norm": 0.38963842391967773, + "learning_rate": 0.0006336134453781512, + "loss": 0.3924, + "step": 13189 + }, + { + "epoch": 7.368715083798882, + "grad_norm": 4.01322603225708, + "learning_rate": 0.0006335854341736694, + "loss": 0.5011, + "step": 13190 + }, + { + "epoch": 7.36927374301676, + "grad_norm": 1.151841402053833, + "learning_rate": 0.0006335574229691878, + "loss": 0.4343, + "step": 13191 + }, + { + "epoch": 7.369832402234637, + "grad_norm": 0.5479128956794739, + "learning_rate": 0.000633529411764706, + "loss": 0.5524, + "step": 13192 + }, + { + "epoch": 7.370391061452514, + "grad_norm": 0.3792780041694641, + "learning_rate": 0.0006335014005602241, + "loss": 0.3224, + "step": 13193 + }, + { + "epoch": 7.370949720670391, + "grad_norm": 0.45167112350463867, + "learning_rate": 0.0006334733893557423, + "loss": 0.4711, + "step": 13194 + }, + { + "epoch": 7.371508379888268, + "grad_norm": 0.6191294193267822, + "learning_rate": 0.0006334453781512605, + "loss": 0.4157, + "step": 13195 + }, + { + "epoch": 7.372067039106145, + "grad_norm": 0.5590991377830505, + "learning_rate": 0.0006334173669467788, + "loss": 0.4232, + "step": 13196 + }, + { + "epoch": 7.372625698324022, + "grad_norm": 0.4230961203575134, + "learning_rate": 0.000633389355742297, + "loss": 0.4401, + "step": 13197 + }, + { + "epoch": 7.373184357541899, + "grad_norm": 0.7729098796844482, + "learning_rate": 0.0006333613445378151, + "loss": 0.7506, + "step": 13198 + }, + { + "epoch": 7.373743016759777, + "grad_norm": 0.4981231987476349, + "learning_rate": 0.0006333333333333333, + "loss": 0.4668, + "step": 13199 + }, + { + "epoch": 7.374301675977653, + "grad_norm": 16.94438934326172, + "learning_rate": 0.0006333053221288515, + "loss": 0.4515, + "step": 13200 + }, + { + "epoch": 7.374860335195531, + "grad_norm": 0.40868082642555237, + "learning_rate": 0.0006332773109243698, + "loss": 0.4, + "step": 13201 + }, + { + "epoch": 7.375418994413407, + "grad_norm": 0.48749709129333496, + "learning_rate": 0.000633249299719888, + "loss": 0.3826, + "step": 13202 + }, + { + "epoch": 7.375977653631285, + "grad_norm": 0.7036557793617249, + "learning_rate": 0.0006332212885154061, + "loss": 0.3823, + "step": 13203 + }, + { + "epoch": 7.376536312849162, + "grad_norm": 0.4205828905105591, + "learning_rate": 0.0006331932773109243, + "loss": 0.353, + "step": 13204 + }, + { + "epoch": 7.377094972067039, + "grad_norm": 0.3899551033973694, + "learning_rate": 0.0006331652661064425, + "loss": 0.4245, + "step": 13205 + }, + { + "epoch": 7.377653631284916, + "grad_norm": 0.4924805164337158, + "learning_rate": 0.0006331372549019608, + "loss": 0.4464, + "step": 13206 + }, + { + "epoch": 7.378212290502793, + "grad_norm": 1.1695677042007446, + "learning_rate": 0.000633109243697479, + "loss": 0.4038, + "step": 13207 + }, + { + "epoch": 7.37877094972067, + "grad_norm": 0.529436469078064, + "learning_rate": 0.0006330812324929973, + "loss": 0.5105, + "step": 13208 + }, + { + "epoch": 7.379329608938548, + "grad_norm": 0.4653843939304352, + "learning_rate": 0.0006330532212885154, + "loss": 0.4918, + "step": 13209 + }, + { + "epoch": 7.379888268156424, + "grad_norm": 0.3980516493320465, + "learning_rate": 0.0006330252100840336, + "loss": 0.408, + "step": 13210 + }, + { + "epoch": 7.380446927374302, + "grad_norm": 0.4431236684322357, + "learning_rate": 0.0006329971988795519, + "loss": 0.4575, + "step": 13211 + }, + { + "epoch": 7.381005586592178, + "grad_norm": 0.704119086265564, + "learning_rate": 0.0006329691876750701, + "loss": 0.4815, + "step": 13212 + }, + { + "epoch": 7.381564245810056, + "grad_norm": 0.6382777094841003, + "learning_rate": 0.0006329411764705883, + "loss": 0.4759, + "step": 13213 + }, + { + "epoch": 7.382122905027933, + "grad_norm": 1.5455689430236816, + "learning_rate": 0.0006329131652661064, + "loss": 0.487, + "step": 13214 + }, + { + "epoch": 7.38268156424581, + "grad_norm": 3.0327768325805664, + "learning_rate": 0.0006328851540616246, + "loss": 0.5227, + "step": 13215 + }, + { + "epoch": 7.383240223463687, + "grad_norm": 0.5812059044837952, + "learning_rate": 0.0006328571428571429, + "loss": 0.4597, + "step": 13216 + }, + { + "epoch": 7.383798882681564, + "grad_norm": 0.4017048180103302, + "learning_rate": 0.0006328291316526611, + "loss": 0.4463, + "step": 13217 + }, + { + "epoch": 7.384357541899441, + "grad_norm": 0.5154291391372681, + "learning_rate": 0.0006328011204481793, + "loss": 0.5196, + "step": 13218 + }, + { + "epoch": 7.384916201117319, + "grad_norm": 1.431203842163086, + "learning_rate": 0.0006327731092436974, + "loss": 0.4399, + "step": 13219 + }, + { + "epoch": 7.385474860335195, + "grad_norm": 0.4689692556858063, + "learning_rate": 0.0006327450980392156, + "loss": 0.4999, + "step": 13220 + }, + { + "epoch": 7.386033519553073, + "grad_norm": 0.4327826201915741, + "learning_rate": 0.0006327170868347339, + "loss": 0.4883, + "step": 13221 + }, + { + "epoch": 7.386592178770949, + "grad_norm": 0.3834301829338074, + "learning_rate": 0.0006326890756302521, + "loss": 0.3313, + "step": 13222 + }, + { + "epoch": 7.387150837988827, + "grad_norm": 0.6593403220176697, + "learning_rate": 0.0006326610644257703, + "loss": 0.6204, + "step": 13223 + }, + { + "epoch": 7.3877094972067034, + "grad_norm": 0.7226658463478088, + "learning_rate": 0.0006326330532212886, + "loss": 0.4978, + "step": 13224 + }, + { + "epoch": 7.388268156424581, + "grad_norm": 1.397457242012024, + "learning_rate": 0.0006326050420168067, + "loss": 0.5043, + "step": 13225 + }, + { + "epoch": 7.388826815642458, + "grad_norm": 0.5905756950378418, + "learning_rate": 0.000632577030812325, + "loss": 0.5169, + "step": 13226 + }, + { + "epoch": 7.389385474860335, + "grad_norm": 0.692711591720581, + "learning_rate": 0.0006325490196078432, + "loss": 0.4879, + "step": 13227 + }, + { + "epoch": 7.389944134078212, + "grad_norm": 14.020736694335938, + "learning_rate": 0.0006325210084033614, + "loss": 0.5024, + "step": 13228 + }, + { + "epoch": 7.39050279329609, + "grad_norm": 2.3759541511535645, + "learning_rate": 0.0006324929971988796, + "loss": 0.4677, + "step": 13229 + }, + { + "epoch": 7.391061452513966, + "grad_norm": 0.6416304707527161, + "learning_rate": 0.0006324649859943977, + "loss": 0.3308, + "step": 13230 + }, + { + "epoch": 7.391620111731844, + "grad_norm": 0.7347482442855835, + "learning_rate": 0.000632436974789916, + "loss": 0.6094, + "step": 13231 + }, + { + "epoch": 7.39217877094972, + "grad_norm": 1.13887357711792, + "learning_rate": 0.0006324089635854342, + "loss": 0.3676, + "step": 13232 + }, + { + "epoch": 7.392737430167598, + "grad_norm": 0.48946502804756165, + "learning_rate": 0.0006323809523809524, + "loss": 0.4465, + "step": 13233 + }, + { + "epoch": 7.3932960893854744, + "grad_norm": 0.5838249921798706, + "learning_rate": 0.0006323529411764706, + "loss": 0.4723, + "step": 13234 + }, + { + "epoch": 7.393854748603352, + "grad_norm": 0.5177309513092041, + "learning_rate": 0.0006323249299719887, + "loss": 0.5624, + "step": 13235 + }, + { + "epoch": 7.394413407821229, + "grad_norm": 1.3859809637069702, + "learning_rate": 0.000632296918767507, + "loss": 0.4088, + "step": 13236 + }, + { + "epoch": 7.394972067039106, + "grad_norm": 0.4547765552997589, + "learning_rate": 0.0006322689075630252, + "loss": 0.5584, + "step": 13237 + }, + { + "epoch": 7.395530726256983, + "grad_norm": 0.6429945230484009, + "learning_rate": 0.0006322408963585434, + "loss": 0.5376, + "step": 13238 + }, + { + "epoch": 7.39608938547486, + "grad_norm": 0.5619122385978699, + "learning_rate": 0.0006322128851540616, + "loss": 0.4954, + "step": 13239 + }, + { + "epoch": 7.396648044692737, + "grad_norm": 0.5644948482513428, + "learning_rate": 0.0006321848739495799, + "loss": 0.4621, + "step": 13240 + }, + { + "epoch": 7.397206703910615, + "grad_norm": 0.8649450540542603, + "learning_rate": 0.0006321568627450981, + "loss": 0.4631, + "step": 13241 + }, + { + "epoch": 7.397765363128491, + "grad_norm": 0.4329600930213928, + "learning_rate": 0.0006321288515406163, + "loss": 0.4635, + "step": 13242 + }, + { + "epoch": 7.398324022346369, + "grad_norm": 4.139972686767578, + "learning_rate": 0.0006321008403361345, + "loss": 0.487, + "step": 13243 + }, + { + "epoch": 7.3988826815642454, + "grad_norm": 0.4320136606693268, + "learning_rate": 0.0006320728291316527, + "loss": 0.449, + "step": 13244 + }, + { + "epoch": 7.399441340782123, + "grad_norm": 2.186617851257324, + "learning_rate": 0.0006320448179271709, + "loss": 0.4427, + "step": 13245 + }, + { + "epoch": 7.4, + "grad_norm": 0.6491425633430481, + "learning_rate": 0.0006320168067226891, + "loss": 0.5894, + "step": 13246 + }, + { + "epoch": 7.400558659217877, + "grad_norm": 0.6695373058319092, + "learning_rate": 0.0006319887955182073, + "loss": 0.4565, + "step": 13247 + }, + { + "epoch": 7.401117318435754, + "grad_norm": 0.7496480941772461, + "learning_rate": 0.0006319607843137255, + "loss": 0.4661, + "step": 13248 + }, + { + "epoch": 7.401675977653631, + "grad_norm": 0.38105449080467224, + "learning_rate": 0.0006319327731092437, + "loss": 0.3993, + "step": 13249 + }, + { + "epoch": 7.402234636871508, + "grad_norm": 0.5346215963363647, + "learning_rate": 0.0006319047619047619, + "loss": 0.3761, + "step": 13250 + }, + { + "epoch": 7.402793296089386, + "grad_norm": 0.683036208152771, + "learning_rate": 0.0006318767507002801, + "loss": 0.4968, + "step": 13251 + }, + { + "epoch": 7.403351955307262, + "grad_norm": 0.40833696722984314, + "learning_rate": 0.0006318487394957983, + "loss": 0.499, + "step": 13252 + }, + { + "epoch": 7.40391061452514, + "grad_norm": 1.670440435409546, + "learning_rate": 0.0006318207282913165, + "loss": 0.381, + "step": 13253 + }, + { + "epoch": 7.4044692737430164, + "grad_norm": 0.4978441298007965, + "learning_rate": 0.0006317927170868347, + "loss": 0.4784, + "step": 13254 + }, + { + "epoch": 7.405027932960894, + "grad_norm": 0.5723251104354858, + "learning_rate": 0.0006317647058823529, + "loss": 0.5265, + "step": 13255 + }, + { + "epoch": 7.405586592178771, + "grad_norm": 0.7274388670921326, + "learning_rate": 0.0006317366946778713, + "loss": 0.3593, + "step": 13256 + }, + { + "epoch": 7.406145251396648, + "grad_norm": 0.3940960168838501, + "learning_rate": 0.0006317086834733894, + "loss": 0.4622, + "step": 13257 + }, + { + "epoch": 7.406703910614525, + "grad_norm": 0.5888367891311646, + "learning_rate": 0.0006316806722689076, + "loss": 0.4838, + "step": 13258 + }, + { + "epoch": 7.407262569832402, + "grad_norm": 0.4865497946739197, + "learning_rate": 0.0006316526610644258, + "loss": 0.4549, + "step": 13259 + }, + { + "epoch": 7.407821229050279, + "grad_norm": 0.5355167388916016, + "learning_rate": 0.000631624649859944, + "loss": 0.4255, + "step": 13260 + }, + { + "epoch": 7.408379888268156, + "grad_norm": 0.5155198574066162, + "learning_rate": 0.0006315966386554623, + "loss": 0.4661, + "step": 13261 + }, + { + "epoch": 7.408938547486033, + "grad_norm": 0.4844139516353607, + "learning_rate": 0.0006315686274509804, + "loss": 0.4747, + "step": 13262 + }, + { + "epoch": 7.409497206703911, + "grad_norm": 0.5467159152030945, + "learning_rate": 0.0006315406162464986, + "loss": 0.6514, + "step": 13263 + }, + { + "epoch": 7.410055865921787, + "grad_norm": 0.916296124458313, + "learning_rate": 0.0006315126050420168, + "loss": 0.3834, + "step": 13264 + }, + { + "epoch": 7.410614525139665, + "grad_norm": 0.5033332109451294, + "learning_rate": 0.000631484593837535, + "loss": 0.4656, + "step": 13265 + }, + { + "epoch": 7.411173184357542, + "grad_norm": 0.6166537404060364, + "learning_rate": 0.0006314565826330533, + "loss": 0.4434, + "step": 13266 + }, + { + "epoch": 7.411731843575419, + "grad_norm": 0.5477985739707947, + "learning_rate": 0.0006314285714285714, + "loss": 0.483, + "step": 13267 + }, + { + "epoch": 7.412290502793296, + "grad_norm": 0.5080099701881409, + "learning_rate": 0.0006314005602240896, + "loss": 0.4495, + "step": 13268 + }, + { + "epoch": 7.412849162011173, + "grad_norm": 0.46978458762168884, + "learning_rate": 0.0006313725490196078, + "loss": 0.5162, + "step": 13269 + }, + { + "epoch": 7.41340782122905, + "grad_norm": 0.9543061256408691, + "learning_rate": 0.000631344537815126, + "loss": 0.5561, + "step": 13270 + }, + { + "epoch": 7.413966480446927, + "grad_norm": 0.43025583028793335, + "learning_rate": 0.0006313165266106443, + "loss": 0.3908, + "step": 13271 + }, + { + "epoch": 7.414525139664804, + "grad_norm": 2.7704813480377197, + "learning_rate": 0.0006312885154061626, + "loss": 0.4813, + "step": 13272 + }, + { + "epoch": 7.415083798882682, + "grad_norm": 0.6134544610977173, + "learning_rate": 0.0006312605042016806, + "loss": 0.4849, + "step": 13273 + }, + { + "epoch": 7.415642458100558, + "grad_norm": 0.7115921974182129, + "learning_rate": 0.0006312324929971989, + "loss": 0.3744, + "step": 13274 + }, + { + "epoch": 7.416201117318436, + "grad_norm": 0.44600415229797363, + "learning_rate": 0.0006312044817927171, + "loss": 0.58, + "step": 13275 + }, + { + "epoch": 7.4167597765363125, + "grad_norm": 3.882844924926758, + "learning_rate": 0.0006311764705882354, + "loss": 0.4281, + "step": 13276 + }, + { + "epoch": 7.41731843575419, + "grad_norm": 0.5672256350517273, + "learning_rate": 0.0006311484593837536, + "loss": 0.4533, + "step": 13277 + }, + { + "epoch": 7.417877094972067, + "grad_norm": 0.5520586967468262, + "learning_rate": 0.0006311204481792717, + "loss": 0.3926, + "step": 13278 + }, + { + "epoch": 7.418435754189944, + "grad_norm": 0.5408652424812317, + "learning_rate": 0.0006310924369747899, + "loss": 0.6045, + "step": 13279 + }, + { + "epoch": 7.418994413407821, + "grad_norm": 0.41654324531555176, + "learning_rate": 0.0006310644257703081, + "loss": 0.3347, + "step": 13280 + }, + { + "epoch": 7.419553072625698, + "grad_norm": 0.3420940041542053, + "learning_rate": 0.0006310364145658264, + "loss": 0.3489, + "step": 13281 + }, + { + "epoch": 7.420111731843575, + "grad_norm": 1.138730764389038, + "learning_rate": 0.0006310084033613446, + "loss": 0.4176, + "step": 13282 + }, + { + "epoch": 7.420670391061453, + "grad_norm": 1.5793524980545044, + "learning_rate": 0.0006309803921568627, + "loss": 0.4954, + "step": 13283 + }, + { + "epoch": 7.421229050279329, + "grad_norm": 0.41933029890060425, + "learning_rate": 0.0006309523809523809, + "loss": 0.4331, + "step": 13284 + }, + { + "epoch": 7.421787709497207, + "grad_norm": 2.55085825920105, + "learning_rate": 0.0006309243697478991, + "loss": 0.5244, + "step": 13285 + }, + { + "epoch": 7.4223463687150835, + "grad_norm": 0.6810856461524963, + "learning_rate": 0.0006308963585434174, + "loss": 0.4382, + "step": 13286 + }, + { + "epoch": 7.422905027932961, + "grad_norm": 0.81991046667099, + "learning_rate": 0.0006308683473389356, + "loss": 0.4149, + "step": 13287 + }, + { + "epoch": 7.423463687150838, + "grad_norm": 0.5027201771736145, + "learning_rate": 0.0006308403361344538, + "loss": 0.5835, + "step": 13288 + }, + { + "epoch": 7.424022346368715, + "grad_norm": 0.4079318642616272, + "learning_rate": 0.0006308123249299719, + "loss": 0.48, + "step": 13289 + }, + { + "epoch": 7.424581005586592, + "grad_norm": 0.4573747515678406, + "learning_rate": 0.0006307843137254901, + "loss": 0.4164, + "step": 13290 + }, + { + "epoch": 7.425139664804469, + "grad_norm": 0.5541324615478516, + "learning_rate": 0.0006307563025210085, + "loss": 0.3837, + "step": 13291 + }, + { + "epoch": 7.425698324022346, + "grad_norm": 0.3572680950164795, + "learning_rate": 0.0006307282913165267, + "loss": 0.4226, + "step": 13292 + }, + { + "epoch": 7.426256983240224, + "grad_norm": 0.745029091835022, + "learning_rate": 0.0006307002801120449, + "loss": 0.4143, + "step": 13293 + }, + { + "epoch": 7.4268156424581, + "grad_norm": 0.4126095473766327, + "learning_rate": 0.000630672268907563, + "loss": 0.4211, + "step": 13294 + }, + { + "epoch": 7.427374301675978, + "grad_norm": 0.3626716732978821, + "learning_rate": 0.0006306442577030812, + "loss": 0.4179, + "step": 13295 + }, + { + "epoch": 7.4279329608938545, + "grad_norm": 0.47655558586120605, + "learning_rate": 0.0006306162464985995, + "loss": 0.3942, + "step": 13296 + }, + { + "epoch": 7.428491620111732, + "grad_norm": 0.5622798204421997, + "learning_rate": 0.0006305882352941177, + "loss": 0.4579, + "step": 13297 + }, + { + "epoch": 7.4290502793296085, + "grad_norm": 0.41237446665763855, + "learning_rate": 0.0006305602240896359, + "loss": 0.3956, + "step": 13298 + }, + { + "epoch": 7.429608938547486, + "grad_norm": 0.4969257414340973, + "learning_rate": 0.000630532212885154, + "loss": 0.397, + "step": 13299 + }, + { + "epoch": 7.430167597765363, + "grad_norm": 0.36382973194122314, + "learning_rate": 0.0006305042016806722, + "loss": 0.4329, + "step": 13300 + }, + { + "epoch": 7.43072625698324, + "grad_norm": 0.3624275326728821, + "learning_rate": 0.0006304761904761905, + "loss": 0.406, + "step": 13301 + }, + { + "epoch": 7.431284916201117, + "grad_norm": 0.5830177068710327, + "learning_rate": 0.0006304481792717087, + "loss": 0.4038, + "step": 13302 + }, + { + "epoch": 7.431843575418995, + "grad_norm": 0.5396106839179993, + "learning_rate": 0.0006304201680672269, + "loss": 0.4249, + "step": 13303 + }, + { + "epoch": 7.432402234636871, + "grad_norm": 0.4341740012168884, + "learning_rate": 0.0006303921568627451, + "loss": 0.3713, + "step": 13304 + }, + { + "epoch": 7.432960893854749, + "grad_norm": 0.48321545124053955, + "learning_rate": 0.0006303641456582632, + "loss": 0.3979, + "step": 13305 + }, + { + "epoch": 7.4335195530726255, + "grad_norm": 0.6073716282844543, + "learning_rate": 0.0006303361344537816, + "loss": 0.4398, + "step": 13306 + }, + { + "epoch": 7.434078212290503, + "grad_norm": 2.9777920246124268, + "learning_rate": 0.0006303081232492998, + "loss": 0.4581, + "step": 13307 + }, + { + "epoch": 7.4346368715083795, + "grad_norm": 0.3766200840473175, + "learning_rate": 0.000630280112044818, + "loss": 0.3866, + "step": 13308 + }, + { + "epoch": 7.435195530726257, + "grad_norm": 0.4557076096534729, + "learning_rate": 0.0006302521008403362, + "loss": 0.381, + "step": 13309 + }, + { + "epoch": 7.435754189944134, + "grad_norm": 0.3866672217845917, + "learning_rate": 0.0006302240896358543, + "loss": 0.4298, + "step": 13310 + }, + { + "epoch": 7.436312849162011, + "grad_norm": 0.4263617694377899, + "learning_rate": 0.0006301960784313726, + "loss": 0.3948, + "step": 13311 + }, + { + "epoch": 7.436871508379888, + "grad_norm": 0.3464999198913574, + "learning_rate": 0.0006301680672268908, + "loss": 0.3821, + "step": 13312 + }, + { + "epoch": 7.437430167597765, + "grad_norm": 0.706479012966156, + "learning_rate": 0.000630140056022409, + "loss": 0.4068, + "step": 13313 + }, + { + "epoch": 7.437988826815642, + "grad_norm": 0.40135225653648376, + "learning_rate": 0.0006301120448179272, + "loss": 0.4466, + "step": 13314 + }, + { + "epoch": 7.43854748603352, + "grad_norm": 0.5254541635513306, + "learning_rate": 0.0006300840336134453, + "loss": 0.5856, + "step": 13315 + }, + { + "epoch": 7.4391061452513965, + "grad_norm": 0.6211190223693848, + "learning_rate": 0.0006300560224089636, + "loss": 0.3508, + "step": 13316 + }, + { + "epoch": 7.439664804469274, + "grad_norm": 0.7710056900978088, + "learning_rate": 0.0006300280112044818, + "loss": 0.3955, + "step": 13317 + }, + { + "epoch": 7.4402234636871505, + "grad_norm": 1.1574424505233765, + "learning_rate": 0.00063, + "loss": 0.3814, + "step": 13318 + }, + { + "epoch": 7.440782122905028, + "grad_norm": 0.4815942049026489, + "learning_rate": 0.0006299719887955182, + "loss": 0.3752, + "step": 13319 + }, + { + "epoch": 7.441340782122905, + "grad_norm": 0.41815364360809326, + "learning_rate": 0.0006299439775910364, + "loss": 0.5149, + "step": 13320 + }, + { + "epoch": 7.441899441340782, + "grad_norm": 0.49710094928741455, + "learning_rate": 0.0006299159663865546, + "loss": 0.4977, + "step": 13321 + }, + { + "epoch": 7.442458100558659, + "grad_norm": 0.4088105261325836, + "learning_rate": 0.0006298879551820729, + "loss": 0.3766, + "step": 13322 + }, + { + "epoch": 7.443016759776536, + "grad_norm": 0.4568706154823303, + "learning_rate": 0.0006298599439775911, + "loss": 0.4925, + "step": 13323 + }, + { + "epoch": 7.443575418994413, + "grad_norm": 0.593758761882782, + "learning_rate": 0.0006298319327731093, + "loss": 0.5544, + "step": 13324 + }, + { + "epoch": 7.444134078212291, + "grad_norm": 1.2638896703720093, + "learning_rate": 0.0006298039215686275, + "loss": 0.3589, + "step": 13325 + }, + { + "epoch": 7.4446927374301675, + "grad_norm": 0.4110514521598816, + "learning_rate": 0.0006297759103641457, + "loss": 0.382, + "step": 13326 + }, + { + "epoch": 7.445251396648045, + "grad_norm": 0.35596656799316406, + "learning_rate": 0.0006297478991596639, + "loss": 0.418, + "step": 13327 + }, + { + "epoch": 7.4458100558659215, + "grad_norm": 0.4594074785709381, + "learning_rate": 0.0006297198879551821, + "loss": 0.4052, + "step": 13328 + }, + { + "epoch": 7.446368715083799, + "grad_norm": 0.7637801766395569, + "learning_rate": 0.0006296918767507003, + "loss": 0.4079, + "step": 13329 + }, + { + "epoch": 7.446927374301676, + "grad_norm": 1.1117703914642334, + "learning_rate": 0.0006296638655462185, + "loss": 0.4423, + "step": 13330 + }, + { + "epoch": 7.447486033519553, + "grad_norm": 0.46981385350227356, + "learning_rate": 0.0006296358543417367, + "loss": 0.3929, + "step": 13331 + }, + { + "epoch": 7.44804469273743, + "grad_norm": 0.36793094873428345, + "learning_rate": 0.0006296078431372549, + "loss": 0.4437, + "step": 13332 + }, + { + "epoch": 7.448603351955307, + "grad_norm": 1.425216555595398, + "learning_rate": 0.0006295798319327731, + "loss": 0.5872, + "step": 13333 + }, + { + "epoch": 7.449162011173184, + "grad_norm": 3.6802706718444824, + "learning_rate": 0.0006295518207282913, + "loss": 0.3871, + "step": 13334 + }, + { + "epoch": 7.449720670391061, + "grad_norm": 0.42191699147224426, + "learning_rate": 0.0006295238095238095, + "loss": 0.3316, + "step": 13335 + }, + { + "epoch": 7.4502793296089385, + "grad_norm": 0.5018813610076904, + "learning_rate": 0.0006294957983193278, + "loss": 0.5356, + "step": 13336 + }, + { + "epoch": 7.450837988826816, + "grad_norm": 0.454581618309021, + "learning_rate": 0.0006294677871148459, + "loss": 0.3017, + "step": 13337 + }, + { + "epoch": 7.4513966480446925, + "grad_norm": 0.7520351409912109, + "learning_rate": 0.0006294397759103641, + "loss": 0.5615, + "step": 13338 + }, + { + "epoch": 7.45195530726257, + "grad_norm": 0.5690171122550964, + "learning_rate": 0.0006294117647058824, + "loss": 0.5761, + "step": 13339 + }, + { + "epoch": 7.452513966480447, + "grad_norm": 0.5402337908744812, + "learning_rate": 0.0006293837535014006, + "loss": 0.4329, + "step": 13340 + }, + { + "epoch": 7.453072625698324, + "grad_norm": 0.48431089520454407, + "learning_rate": 0.0006293557422969189, + "loss": 0.3782, + "step": 13341 + }, + { + "epoch": 7.453631284916201, + "grad_norm": 0.3996334671974182, + "learning_rate": 0.000629327731092437, + "loss": 0.3892, + "step": 13342 + }, + { + "epoch": 7.454189944134078, + "grad_norm": 0.6260446906089783, + "learning_rate": 0.0006292997198879552, + "loss": 0.6518, + "step": 13343 + }, + { + "epoch": 7.454748603351955, + "grad_norm": 0.4749130308628082, + "learning_rate": 0.0006292717086834734, + "loss": 0.4031, + "step": 13344 + }, + { + "epoch": 7.455307262569832, + "grad_norm": 0.36266636848449707, + "learning_rate": 0.0006292436974789916, + "loss": 0.4639, + "step": 13345 + }, + { + "epoch": 7.4558659217877095, + "grad_norm": 0.7757393717765808, + "learning_rate": 0.0006292156862745099, + "loss": 0.5068, + "step": 13346 + }, + { + "epoch": 7.456424581005587, + "grad_norm": 0.4906960427761078, + "learning_rate": 0.000629187675070028, + "loss": 0.4805, + "step": 13347 + }, + { + "epoch": 7.4569832402234635, + "grad_norm": 0.4969324767589569, + "learning_rate": 0.0006291596638655462, + "loss": 0.5474, + "step": 13348 + }, + { + "epoch": 7.457541899441341, + "grad_norm": 0.4112037420272827, + "learning_rate": 0.0006291316526610644, + "loss": 0.4957, + "step": 13349 + }, + { + "epoch": 7.4581005586592175, + "grad_norm": 0.8729506134986877, + "learning_rate": 0.0006291036414565826, + "loss": 0.4357, + "step": 13350 + }, + { + "epoch": 7.458659217877095, + "grad_norm": 0.5021253228187561, + "learning_rate": 0.0006290756302521009, + "loss": 0.4743, + "step": 13351 + }, + { + "epoch": 7.459217877094972, + "grad_norm": 0.418542742729187, + "learning_rate": 0.0006290476190476191, + "loss": 0.4678, + "step": 13352 + }, + { + "epoch": 7.459776536312849, + "grad_norm": 0.48293745517730713, + "learning_rate": 0.0006290196078431372, + "loss": 0.369, + "step": 13353 + }, + { + "epoch": 7.460335195530726, + "grad_norm": 0.578088641166687, + "learning_rate": 0.0006289915966386554, + "loss": 0.4541, + "step": 13354 + }, + { + "epoch": 7.460893854748603, + "grad_norm": 0.48990461230278015, + "learning_rate": 0.0006289635854341736, + "loss": 0.4408, + "step": 13355 + }, + { + "epoch": 7.4614525139664805, + "grad_norm": 0.5749280452728271, + "learning_rate": 0.000628935574229692, + "loss": 0.3713, + "step": 13356 + }, + { + "epoch": 7.462011173184358, + "grad_norm": 0.5096279382705688, + "learning_rate": 0.0006289075630252102, + "loss": 0.4402, + "step": 13357 + }, + { + "epoch": 7.4625698324022345, + "grad_norm": 0.40594661235809326, + "learning_rate": 0.0006288795518207283, + "loss": 0.3948, + "step": 13358 + }, + { + "epoch": 7.463128491620112, + "grad_norm": 0.44064679741859436, + "learning_rate": 0.0006288515406162465, + "loss": 0.5458, + "step": 13359 + }, + { + "epoch": 7.4636871508379885, + "grad_norm": 0.3884088695049286, + "learning_rate": 0.0006288235294117647, + "loss": 0.4223, + "step": 13360 + }, + { + "epoch": 7.464245810055866, + "grad_norm": 0.47736164927482605, + "learning_rate": 0.000628795518207283, + "loss": 0.4198, + "step": 13361 + }, + { + "epoch": 7.464804469273743, + "grad_norm": 0.5243260860443115, + "learning_rate": 0.0006287675070028012, + "loss": 0.5427, + "step": 13362 + }, + { + "epoch": 7.46536312849162, + "grad_norm": 0.6184183359146118, + "learning_rate": 0.0006287394957983193, + "loss": 0.3739, + "step": 13363 + }, + { + "epoch": 7.465921787709497, + "grad_norm": 1.2865631580352783, + "learning_rate": 0.0006287114845938375, + "loss": 0.4546, + "step": 13364 + }, + { + "epoch": 7.466480446927374, + "grad_norm": 0.8537730574607849, + "learning_rate": 0.0006286834733893557, + "loss": 0.3757, + "step": 13365 + }, + { + "epoch": 7.4670391061452515, + "grad_norm": 0.5568868517875671, + "learning_rate": 0.000628655462184874, + "loss": 0.6031, + "step": 13366 + }, + { + "epoch": 7.467597765363129, + "grad_norm": 0.991691529750824, + "learning_rate": 0.0006286274509803922, + "loss": 0.3505, + "step": 13367 + }, + { + "epoch": 7.4681564245810055, + "grad_norm": 0.7091882824897766, + "learning_rate": 0.0006285994397759104, + "loss": 0.4866, + "step": 13368 + }, + { + "epoch": 7.468715083798883, + "grad_norm": 0.5065934062004089, + "learning_rate": 0.0006285714285714285, + "loss": 0.396, + "step": 13369 + }, + { + "epoch": 7.4692737430167595, + "grad_norm": 0.5529679656028748, + "learning_rate": 0.0006285434173669467, + "loss": 0.5114, + "step": 13370 + }, + { + "epoch": 7.469832402234637, + "grad_norm": 0.7804819941520691, + "learning_rate": 0.000628515406162465, + "loss": 0.5331, + "step": 13371 + }, + { + "epoch": 7.4703910614525135, + "grad_norm": 0.5834393501281738, + "learning_rate": 0.0006284873949579833, + "loss": 0.4095, + "step": 13372 + }, + { + "epoch": 7.470949720670391, + "grad_norm": 0.48990458250045776, + "learning_rate": 0.0006284593837535015, + "loss": 0.4611, + "step": 13373 + }, + { + "epoch": 7.471508379888268, + "grad_norm": 0.623346209526062, + "learning_rate": 0.0006284313725490196, + "loss": 0.5069, + "step": 13374 + }, + { + "epoch": 7.472067039106145, + "grad_norm": 1.558680534362793, + "learning_rate": 0.0006284033613445378, + "loss": 0.3005, + "step": 13375 + }, + { + "epoch": 7.4726256983240225, + "grad_norm": 0.6571272611618042, + "learning_rate": 0.0006283753501400561, + "loss": 0.4002, + "step": 13376 + }, + { + "epoch": 7.473184357541899, + "grad_norm": 0.49199920892715454, + "learning_rate": 0.0006283473389355743, + "loss": 0.3986, + "step": 13377 + }, + { + "epoch": 7.4737430167597765, + "grad_norm": 0.44239863753318787, + "learning_rate": 0.0006283193277310925, + "loss": 0.4948, + "step": 13378 + }, + { + "epoch": 7.474301675977654, + "grad_norm": 0.5550490617752075, + "learning_rate": 0.0006282913165266106, + "loss": 0.5536, + "step": 13379 + }, + { + "epoch": 7.4748603351955305, + "grad_norm": 0.4111871123313904, + "learning_rate": 0.0006282633053221288, + "loss": 0.3545, + "step": 13380 + }, + { + "epoch": 7.475418994413408, + "grad_norm": 0.531445324420929, + "learning_rate": 0.0006282352941176471, + "loss": 0.4724, + "step": 13381 + }, + { + "epoch": 7.4759776536312845, + "grad_norm": 0.4715085029602051, + "learning_rate": 0.0006282072829131653, + "loss": 0.3724, + "step": 13382 + }, + { + "epoch": 7.476536312849162, + "grad_norm": 0.7833395004272461, + "learning_rate": 0.0006281792717086835, + "loss": 0.3996, + "step": 13383 + }, + { + "epoch": 7.477094972067039, + "grad_norm": 0.726565957069397, + "learning_rate": 0.0006281512605042017, + "loss": 0.5204, + "step": 13384 + }, + { + "epoch": 7.477653631284916, + "grad_norm": 0.4775674343109131, + "learning_rate": 0.0006281232492997198, + "loss": 0.5317, + "step": 13385 + }, + { + "epoch": 7.4782122905027935, + "grad_norm": 0.5930054783821106, + "learning_rate": 0.000628095238095238, + "loss": 0.5465, + "step": 13386 + }, + { + "epoch": 7.47877094972067, + "grad_norm": 0.8893391489982605, + "learning_rate": 0.0006280672268907563, + "loss": 0.5135, + "step": 13387 + }, + { + "epoch": 7.4793296089385475, + "grad_norm": 0.38850027322769165, + "learning_rate": 0.0006280392156862746, + "loss": 0.337, + "step": 13388 + }, + { + "epoch": 7.479888268156425, + "grad_norm": 0.39688146114349365, + "learning_rate": 0.0006280112044817928, + "loss": 0.3272, + "step": 13389 + }, + { + "epoch": 7.4804469273743015, + "grad_norm": 0.6377097964286804, + "learning_rate": 0.0006279831932773109, + "loss": 0.3738, + "step": 13390 + }, + { + "epoch": 7.481005586592179, + "grad_norm": 0.47277647256851196, + "learning_rate": 0.0006279551820728291, + "loss": 0.3544, + "step": 13391 + }, + { + "epoch": 7.4815642458100555, + "grad_norm": 0.44037455320358276, + "learning_rate": 0.0006279271708683474, + "loss": 0.3983, + "step": 13392 + }, + { + "epoch": 7.482122905027933, + "grad_norm": 0.5357094407081604, + "learning_rate": 0.0006278991596638656, + "loss": 0.3683, + "step": 13393 + }, + { + "epoch": 7.48268156424581, + "grad_norm": 0.4651302099227905, + "learning_rate": 0.0006278711484593838, + "loss": 0.4055, + "step": 13394 + }, + { + "epoch": 7.483240223463687, + "grad_norm": 0.495184063911438, + "learning_rate": 0.0006278431372549019, + "loss": 0.43, + "step": 13395 + }, + { + "epoch": 7.4837988826815645, + "grad_norm": 0.5564959049224854, + "learning_rate": 0.0006278151260504201, + "loss": 0.6228, + "step": 13396 + }, + { + "epoch": 7.484357541899441, + "grad_norm": 0.4047142565250397, + "learning_rate": 0.0006277871148459384, + "loss": 0.386, + "step": 13397 + }, + { + "epoch": 7.4849162011173185, + "grad_norm": 0.41901081800460815, + "learning_rate": 0.0006277591036414566, + "loss": 0.4516, + "step": 13398 + }, + { + "epoch": 7.485474860335196, + "grad_norm": 0.450962632894516, + "learning_rate": 0.0006277310924369748, + "loss": 0.4742, + "step": 13399 + }, + { + "epoch": 7.4860335195530725, + "grad_norm": 1.0303248167037964, + "learning_rate": 0.000627703081232493, + "loss": 0.5202, + "step": 13400 + }, + { + "epoch": 7.48659217877095, + "grad_norm": 1.1069631576538086, + "learning_rate": 0.0006276750700280111, + "loss": 0.3502, + "step": 13401 + }, + { + "epoch": 7.4871508379888265, + "grad_norm": 0.3482000529766083, + "learning_rate": 0.0006276470588235294, + "loss": 0.4233, + "step": 13402 + }, + { + "epoch": 7.487709497206704, + "grad_norm": 1.8103731870651245, + "learning_rate": 0.0006276190476190476, + "loss": 0.4433, + "step": 13403 + }, + { + "epoch": 7.488268156424581, + "grad_norm": 0.9675346612930298, + "learning_rate": 0.0006275910364145659, + "loss": 0.404, + "step": 13404 + }, + { + "epoch": 7.488826815642458, + "grad_norm": 0.4502348303794861, + "learning_rate": 0.0006275630252100841, + "loss": 0.4194, + "step": 13405 + }, + { + "epoch": 7.4893854748603355, + "grad_norm": 0.3939608931541443, + "learning_rate": 0.0006275350140056022, + "loss": 0.4586, + "step": 13406 + }, + { + "epoch": 7.489944134078212, + "grad_norm": 0.4570101797580719, + "learning_rate": 0.0006275070028011205, + "loss": 0.4836, + "step": 13407 + }, + { + "epoch": 7.4905027932960895, + "grad_norm": 0.4995557367801666, + "learning_rate": 0.0006274789915966387, + "loss": 0.4504, + "step": 13408 + }, + { + "epoch": 7.491061452513966, + "grad_norm": 0.46654728055000305, + "learning_rate": 0.0006274509803921569, + "loss": 0.372, + "step": 13409 + }, + { + "epoch": 7.4916201117318435, + "grad_norm": 1.703250765800476, + "learning_rate": 0.0006274229691876751, + "loss": 0.5072, + "step": 13410 + }, + { + "epoch": 7.492178770949721, + "grad_norm": 0.6606800556182861, + "learning_rate": 0.0006273949579831932, + "loss": 0.455, + "step": 13411 + }, + { + "epoch": 7.4927374301675975, + "grad_norm": 2.4163243770599365, + "learning_rate": 0.0006273669467787115, + "loss": 0.4573, + "step": 13412 + }, + { + "epoch": 7.493296089385475, + "grad_norm": 1.5161129236221313, + "learning_rate": 0.0006273389355742297, + "loss": 0.4091, + "step": 13413 + }, + { + "epoch": 7.4938547486033515, + "grad_norm": 0.51549232006073, + "learning_rate": 0.0006273109243697479, + "loss": 0.4583, + "step": 13414 + }, + { + "epoch": 7.494413407821229, + "grad_norm": 0.416126549243927, + "learning_rate": 0.0006272829131652661, + "loss": 0.445, + "step": 13415 + }, + { + "epoch": 7.4949720670391065, + "grad_norm": 0.5740472674369812, + "learning_rate": 0.0006272549019607843, + "loss": 0.4026, + "step": 13416 + }, + { + "epoch": 7.495530726256983, + "grad_norm": 0.4391186833381653, + "learning_rate": 0.0006272268907563025, + "loss": 0.3458, + "step": 13417 + }, + { + "epoch": 7.4960893854748605, + "grad_norm": 0.7656887173652649, + "learning_rate": 0.0006271988795518207, + "loss": 0.4026, + "step": 13418 + }, + { + "epoch": 7.496648044692737, + "grad_norm": 0.45423150062561035, + "learning_rate": 0.0006271708683473389, + "loss": 0.3774, + "step": 13419 + }, + { + "epoch": 7.4972067039106145, + "grad_norm": 0.5736512541770935, + "learning_rate": 0.0006271428571428571, + "loss": 0.5254, + "step": 13420 + }, + { + "epoch": 7.497765363128492, + "grad_norm": 0.40836790204048157, + "learning_rate": 0.0006271148459383754, + "loss": 0.3873, + "step": 13421 + }, + { + "epoch": 7.4983240223463685, + "grad_norm": 0.4353005886077881, + "learning_rate": 0.0006270868347338936, + "loss": 0.4329, + "step": 13422 + }, + { + "epoch": 7.498882681564246, + "grad_norm": 0.7054917216300964, + "learning_rate": 0.0006270588235294118, + "loss": 0.5089, + "step": 13423 + }, + { + "epoch": 7.4994413407821225, + "grad_norm": 0.4563477337360382, + "learning_rate": 0.00062703081232493, + "loss": 0.3622, + "step": 13424 + }, + { + "epoch": 7.5, + "grad_norm": 0.4830285906791687, + "learning_rate": 0.0006270028011204482, + "loss": 0.4478, + "step": 13425 + }, + { + "epoch": 7.5005586592178775, + "grad_norm": 0.4337867796421051, + "learning_rate": 0.0006269747899159664, + "loss": 0.4299, + "step": 13426 + }, + { + "epoch": 7.501117318435754, + "grad_norm": 1.2124769687652588, + "learning_rate": 0.0006269467787114847, + "loss": 0.4545, + "step": 13427 + }, + { + "epoch": 7.5016759776536315, + "grad_norm": 0.5080555081367493, + "learning_rate": 0.0006269187675070028, + "loss": 0.4935, + "step": 13428 + }, + { + "epoch": 7.502234636871508, + "grad_norm": 8.636521339416504, + "learning_rate": 0.000626890756302521, + "loss": 0.5313, + "step": 13429 + }, + { + "epoch": 7.5027932960893855, + "grad_norm": 0.9094671607017517, + "learning_rate": 0.0006268627450980392, + "loss": 0.7791, + "step": 13430 + }, + { + "epoch": 7.503351955307263, + "grad_norm": 0.7183095812797546, + "learning_rate": 0.0006268347338935574, + "loss": 0.3839, + "step": 13431 + }, + { + "epoch": 7.5039106145251395, + "grad_norm": 0.4022354185581207, + "learning_rate": 0.0006268067226890757, + "loss": 0.2891, + "step": 13432 + }, + { + "epoch": 7.504469273743017, + "grad_norm": 0.5049694180488586, + "learning_rate": 0.0006267787114845938, + "loss": 0.4637, + "step": 13433 + }, + { + "epoch": 7.5050279329608935, + "grad_norm": 0.5416582226753235, + "learning_rate": 0.000626750700280112, + "loss": 0.4444, + "step": 13434 + }, + { + "epoch": 7.505586592178771, + "grad_norm": 0.5205233097076416, + "learning_rate": 0.0006267226890756302, + "loss": 0.4978, + "step": 13435 + }, + { + "epoch": 7.506145251396648, + "grad_norm": 1.540658712387085, + "learning_rate": 0.0006266946778711484, + "loss": 0.4372, + "step": 13436 + }, + { + "epoch": 7.506703910614525, + "grad_norm": 0.5724209547042847, + "learning_rate": 0.0006266666666666668, + "loss": 0.4544, + "step": 13437 + }, + { + "epoch": 7.5072625698324025, + "grad_norm": 0.6350071430206299, + "learning_rate": 0.0006266386554621849, + "loss": 0.3814, + "step": 13438 + }, + { + "epoch": 7.507821229050279, + "grad_norm": 2.5332157611846924, + "learning_rate": 0.0006266106442577031, + "loss": 0.8797, + "step": 13439 + }, + { + "epoch": 7.5083798882681565, + "grad_norm": 0.6972220540046692, + "learning_rate": 0.0006265826330532213, + "loss": 0.4871, + "step": 13440 + }, + { + "epoch": 7.508938547486034, + "grad_norm": 0.3587109446525574, + "learning_rate": 0.0006265546218487395, + "loss": 0.377, + "step": 13441 + }, + { + "epoch": 7.5094972067039105, + "grad_norm": 0.9471145868301392, + "learning_rate": 0.0006265266106442578, + "loss": 0.3801, + "step": 13442 + }, + { + "epoch": 7.510055865921788, + "grad_norm": 0.5360437035560608, + "learning_rate": 0.000626498599439776, + "loss": 0.3852, + "step": 13443 + }, + { + "epoch": 7.5106145251396645, + "grad_norm": 0.3196997046470642, + "learning_rate": 0.0006264705882352941, + "loss": 0.4081, + "step": 13444 + }, + { + "epoch": 7.511173184357542, + "grad_norm": 0.46493256092071533, + "learning_rate": 0.0006264425770308123, + "loss": 0.3728, + "step": 13445 + }, + { + "epoch": 7.511731843575419, + "grad_norm": 0.5148099660873413, + "learning_rate": 0.0006264145658263305, + "loss": 0.4404, + "step": 13446 + }, + { + "epoch": 7.512290502793296, + "grad_norm": 0.41929781436920166, + "learning_rate": 0.0006263865546218488, + "loss": 0.3995, + "step": 13447 + }, + { + "epoch": 7.5128491620111735, + "grad_norm": 4.587903022766113, + "learning_rate": 0.000626358543417367, + "loss": 0.4701, + "step": 13448 + }, + { + "epoch": 7.51340782122905, + "grad_norm": 0.6348535418510437, + "learning_rate": 0.0006263305322128851, + "loss": 0.3999, + "step": 13449 + }, + { + "epoch": 7.5139664804469275, + "grad_norm": 0.47435715794563293, + "learning_rate": 0.0006263025210084033, + "loss": 0.4129, + "step": 13450 + }, + { + "epoch": 7.514525139664805, + "grad_norm": 0.5418163537979126, + "learning_rate": 0.0006262745098039215, + "loss": 0.4317, + "step": 13451 + }, + { + "epoch": 7.5150837988826815, + "grad_norm": 0.41280433535575867, + "learning_rate": 0.0006262464985994398, + "loss": 0.4728, + "step": 13452 + }, + { + "epoch": 7.515642458100559, + "grad_norm": 0.5352231860160828, + "learning_rate": 0.000626218487394958, + "loss": 0.4941, + "step": 13453 + }, + { + "epoch": 7.5162011173184355, + "grad_norm": 0.5814662575721741, + "learning_rate": 0.0006261904761904761, + "loss": 0.4841, + "step": 13454 + }, + { + "epoch": 7.516759776536313, + "grad_norm": 0.4405836760997772, + "learning_rate": 0.0006261624649859944, + "loss": 0.3607, + "step": 13455 + }, + { + "epoch": 7.51731843575419, + "grad_norm": 2.6674387454986572, + "learning_rate": 0.0006261344537815126, + "loss": 0.4573, + "step": 13456 + }, + { + "epoch": 7.517877094972067, + "grad_norm": 0.702226996421814, + "learning_rate": 0.0006261064425770309, + "loss": 0.5032, + "step": 13457 + }, + { + "epoch": 7.5184357541899445, + "grad_norm": 0.6695470809936523, + "learning_rate": 0.0006260784313725491, + "loss": 0.3487, + "step": 13458 + }, + { + "epoch": 7.518994413407821, + "grad_norm": 0.44072970747947693, + "learning_rate": 0.0006260504201680673, + "loss": 0.4149, + "step": 13459 + }, + { + "epoch": 7.5195530726256985, + "grad_norm": 0.4761718213558197, + "learning_rate": 0.0006260224089635854, + "loss": 0.4124, + "step": 13460 + }, + { + "epoch": 7.520111731843575, + "grad_norm": 0.4066667854785919, + "learning_rate": 0.0006259943977591036, + "loss": 0.388, + "step": 13461 + }, + { + "epoch": 7.5206703910614525, + "grad_norm": 0.49014201760292053, + "learning_rate": 0.0006259663865546219, + "loss": 0.4879, + "step": 13462 + }, + { + "epoch": 7.52122905027933, + "grad_norm": 0.5860188603401184, + "learning_rate": 0.0006259383753501401, + "loss": 0.439, + "step": 13463 + }, + { + "epoch": 7.5217877094972065, + "grad_norm": 0.7908342480659485, + "learning_rate": 0.0006259103641456583, + "loss": 0.3859, + "step": 13464 + }, + { + "epoch": 7.522346368715084, + "grad_norm": 0.4601080119609833, + "learning_rate": 0.0006258823529411764, + "loss": 0.4344, + "step": 13465 + }, + { + "epoch": 7.522905027932961, + "grad_norm": 1.624560832977295, + "learning_rate": 0.0006258543417366946, + "loss": 0.4217, + "step": 13466 + }, + { + "epoch": 7.523463687150838, + "grad_norm": 0.4940057098865509, + "learning_rate": 0.0006258263305322129, + "loss": 0.4054, + "step": 13467 + }, + { + "epoch": 7.5240223463687155, + "grad_norm": 0.4555158317089081, + "learning_rate": 0.0006257983193277311, + "loss": 0.4714, + "step": 13468 + }, + { + "epoch": 7.524581005586592, + "grad_norm": 0.7547162175178528, + "learning_rate": 0.0006257703081232493, + "loss": 0.4268, + "step": 13469 + }, + { + "epoch": 7.5251396648044695, + "grad_norm": 1.0132737159729004, + "learning_rate": 0.0006257422969187674, + "loss": 0.4763, + "step": 13470 + }, + { + "epoch": 7.525698324022346, + "grad_norm": 1.5819170475006104, + "learning_rate": 0.0006257142857142857, + "loss": 0.4103, + "step": 13471 + }, + { + "epoch": 7.5262569832402235, + "grad_norm": 0.4714759886264801, + "learning_rate": 0.000625686274509804, + "loss": 0.5127, + "step": 13472 + }, + { + "epoch": 7.5268156424581, + "grad_norm": 0.6439411640167236, + "learning_rate": 0.0006256582633053222, + "loss": 0.5208, + "step": 13473 + }, + { + "epoch": 7.5273743016759775, + "grad_norm": 0.4376458525657654, + "learning_rate": 0.0006256302521008404, + "loss": 0.4689, + "step": 13474 + }, + { + "epoch": 7.527932960893855, + "grad_norm": 1.8966127634048462, + "learning_rate": 0.0006256022408963586, + "loss": 0.4653, + "step": 13475 + }, + { + "epoch": 7.528491620111732, + "grad_norm": 0.509615957736969, + "learning_rate": 0.0006255742296918767, + "loss": 0.4716, + "step": 13476 + }, + { + "epoch": 7.529050279329609, + "grad_norm": 0.5375753045082092, + "learning_rate": 0.000625546218487395, + "loss": 0.5063, + "step": 13477 + }, + { + "epoch": 7.5296089385474865, + "grad_norm": 0.48832905292510986, + "learning_rate": 0.0006255182072829132, + "loss": 0.4141, + "step": 13478 + }, + { + "epoch": 7.530167597765363, + "grad_norm": 0.5039777755737305, + "learning_rate": 0.0006254901960784314, + "loss": 0.4793, + "step": 13479 + }, + { + "epoch": 7.5307262569832405, + "grad_norm": 0.7332884669303894, + "learning_rate": 0.0006254621848739496, + "loss": 0.4505, + "step": 13480 + }, + { + "epoch": 7.531284916201117, + "grad_norm": 0.4855954945087433, + "learning_rate": 0.0006254341736694677, + "loss": 0.4759, + "step": 13481 + }, + { + "epoch": 7.5318435754189945, + "grad_norm": 0.4676671624183655, + "learning_rate": 0.000625406162464986, + "loss": 0.4221, + "step": 13482 + }, + { + "epoch": 7.532402234636871, + "grad_norm": 0.44321921467781067, + "learning_rate": 0.0006253781512605042, + "loss": 0.4429, + "step": 13483 + }, + { + "epoch": 7.5329608938547485, + "grad_norm": 3.397167921066284, + "learning_rate": 0.0006253501400560224, + "loss": 0.4447, + "step": 13484 + }, + { + "epoch": 7.533519553072626, + "grad_norm": 0.6599857211112976, + "learning_rate": 0.0006253221288515406, + "loss": 0.393, + "step": 13485 + }, + { + "epoch": 7.534078212290503, + "grad_norm": 0.6302818059921265, + "learning_rate": 0.0006252941176470587, + "loss": 0.4711, + "step": 13486 + }, + { + "epoch": 7.53463687150838, + "grad_norm": 0.5066717267036438, + "learning_rate": 0.0006252661064425771, + "loss": 0.4334, + "step": 13487 + }, + { + "epoch": 7.5351955307262575, + "grad_norm": 0.6920668482780457, + "learning_rate": 0.0006252380952380953, + "loss": 0.4643, + "step": 13488 + }, + { + "epoch": 7.535754189944134, + "grad_norm": 4.822314262390137, + "learning_rate": 0.0006252100840336135, + "loss": 0.3331, + "step": 13489 + }, + { + "epoch": 7.5363128491620115, + "grad_norm": 0.3996482789516449, + "learning_rate": 0.0006251820728291317, + "loss": 0.4245, + "step": 13490 + }, + { + "epoch": 7.536871508379888, + "grad_norm": 0.6152485013008118, + "learning_rate": 0.0006251540616246499, + "loss": 0.4439, + "step": 13491 + }, + { + "epoch": 7.5374301675977655, + "grad_norm": 0.5713964700698853, + "learning_rate": 0.0006251260504201681, + "loss": 0.3872, + "step": 13492 + }, + { + "epoch": 7.537988826815642, + "grad_norm": 1.7460379600524902, + "learning_rate": 0.0006250980392156863, + "loss": 0.4409, + "step": 13493 + }, + { + "epoch": 7.5385474860335195, + "grad_norm": 0.44153743982315063, + "learning_rate": 0.0006250700280112045, + "loss": 0.4277, + "step": 13494 + }, + { + "epoch": 7.539106145251397, + "grad_norm": 0.5277459025382996, + "learning_rate": 0.0006250420168067227, + "loss": 0.512, + "step": 13495 + }, + { + "epoch": 7.539664804469274, + "grad_norm": 1.0423752069473267, + "learning_rate": 0.0006250140056022409, + "loss": 0.4216, + "step": 13496 + }, + { + "epoch": 7.540223463687151, + "grad_norm": 0.47321251034736633, + "learning_rate": 0.0006249859943977591, + "loss": 0.3874, + "step": 13497 + }, + { + "epoch": 7.540782122905028, + "grad_norm": 0.6798033714294434, + "learning_rate": 0.0006249579831932773, + "loss": 0.4549, + "step": 13498 + }, + { + "epoch": 7.541340782122905, + "grad_norm": 0.4562455415725708, + "learning_rate": 0.0006249299719887955, + "loss": 0.369, + "step": 13499 + }, + { + "epoch": 7.5418994413407825, + "grad_norm": 0.6898072957992554, + "learning_rate": 0.0006249019607843137, + "loss": 0.5776, + "step": 13500 + }, + { + "epoch": 7.5418994413407825, + "eval_cer": 0.0892878574623853, + "eval_loss": 0.3408820331096649, + "eval_runtime": 55.542, + "eval_samples_per_second": 81.704, + "eval_steps_per_second": 5.113, + "eval_wer": 0.3519175442026759, + "step": 13500 + }, + { + "epoch": 7.542458100558659, + "grad_norm": 2.380089521408081, + "learning_rate": 0.0006248739495798319, + "loss": 0.5645, + "step": 13501 + }, + { + "epoch": 7.5430167597765365, + "grad_norm": 0.7679510712623596, + "learning_rate": 0.0006248459383753501, + "loss": 0.6456, + "step": 13502 + }, + { + "epoch": 7.543575418994413, + "grad_norm": 0.54976886510849, + "learning_rate": 0.0006248179271708684, + "loss": 0.4245, + "step": 13503 + }, + { + "epoch": 7.5441340782122905, + "grad_norm": 0.7989188432693481, + "learning_rate": 0.0006247899159663866, + "loss": 0.3701, + "step": 13504 + }, + { + "epoch": 7.544692737430168, + "grad_norm": 0.8670735955238342, + "learning_rate": 0.0006247619047619048, + "loss": 0.5506, + "step": 13505 + }, + { + "epoch": 7.545251396648045, + "grad_norm": 0.46816739439964294, + "learning_rate": 0.000624733893557423, + "loss": 0.3936, + "step": 13506 + }, + { + "epoch": 7.545810055865922, + "grad_norm": 0.6991981863975525, + "learning_rate": 0.0006247058823529413, + "loss": 0.486, + "step": 13507 + }, + { + "epoch": 7.546368715083799, + "grad_norm": 0.5753023624420166, + "learning_rate": 0.0006246778711484594, + "loss": 0.538, + "step": 13508 + }, + { + "epoch": 7.546927374301676, + "grad_norm": 0.4900223910808563, + "learning_rate": 0.0006246498599439776, + "loss": 0.4206, + "step": 13509 + }, + { + "epoch": 7.547486033519553, + "grad_norm": 0.5980709791183472, + "learning_rate": 0.0006246218487394958, + "loss": 0.4568, + "step": 13510 + }, + { + "epoch": 7.54804469273743, + "grad_norm": 0.4128499925136566, + "learning_rate": 0.000624593837535014, + "loss": 0.414, + "step": 13511 + }, + { + "epoch": 7.5486033519553075, + "grad_norm": 0.5706645846366882, + "learning_rate": 0.0006245658263305323, + "loss": 0.3477, + "step": 13512 + }, + { + "epoch": 7.549162011173184, + "grad_norm": 0.550557017326355, + "learning_rate": 0.0006245378151260504, + "loss": 0.4958, + "step": 13513 + }, + { + "epoch": 7.5497206703910615, + "grad_norm": 0.32888537645339966, + "learning_rate": 0.0006245098039215686, + "loss": 0.3844, + "step": 13514 + }, + { + "epoch": 7.550279329608939, + "grad_norm": 0.9568273425102234, + "learning_rate": 0.0006244817927170868, + "loss": 0.5141, + "step": 13515 + }, + { + "epoch": 7.550837988826816, + "grad_norm": 0.3877546191215515, + "learning_rate": 0.000624453781512605, + "loss": 0.3862, + "step": 13516 + }, + { + "epoch": 7.551396648044693, + "grad_norm": 0.6546022891998291, + "learning_rate": 0.0006244257703081233, + "loss": 0.3884, + "step": 13517 + }, + { + "epoch": 7.55195530726257, + "grad_norm": 0.5250709652900696, + "learning_rate": 0.0006243977591036414, + "loss": 0.4772, + "step": 13518 + }, + { + "epoch": 7.552513966480447, + "grad_norm": 0.40196648240089417, + "learning_rate": 0.0006243697478991596, + "loss": 0.4018, + "step": 13519 + }, + { + "epoch": 7.553072625698324, + "grad_norm": 0.7349145412445068, + "learning_rate": 0.0006243417366946779, + "loss": 0.4227, + "step": 13520 + }, + { + "epoch": 7.553631284916201, + "grad_norm": 0.4224432408809662, + "learning_rate": 0.0006243137254901961, + "loss": 0.3324, + "step": 13521 + }, + { + "epoch": 7.5541899441340785, + "grad_norm": 0.5843234658241272, + "learning_rate": 0.0006242857142857144, + "loss": 0.4288, + "step": 13522 + }, + { + "epoch": 7.554748603351955, + "grad_norm": 0.5495042204856873, + "learning_rate": 0.0006242577030812326, + "loss": 0.4855, + "step": 13523 + }, + { + "epoch": 7.5553072625698325, + "grad_norm": 0.6440644264221191, + "learning_rate": 0.0006242296918767507, + "loss": 0.4254, + "step": 13524 + }, + { + "epoch": 7.55586592178771, + "grad_norm": 0.571147620677948, + "learning_rate": 0.0006242016806722689, + "loss": 0.4285, + "step": 13525 + }, + { + "epoch": 7.556424581005587, + "grad_norm": 0.6583604216575623, + "learning_rate": 0.0006241736694677871, + "loss": 0.4267, + "step": 13526 + }, + { + "epoch": 7.556983240223464, + "grad_norm": 0.5905967354774475, + "learning_rate": 0.0006241456582633054, + "loss": 0.3538, + "step": 13527 + }, + { + "epoch": 7.557541899441341, + "grad_norm": 0.5711806416511536, + "learning_rate": 0.0006241176470588236, + "loss": 0.4205, + "step": 13528 + }, + { + "epoch": 7.558100558659218, + "grad_norm": 5.9775848388671875, + "learning_rate": 0.0006240896358543417, + "loss": 0.3912, + "step": 13529 + }, + { + "epoch": 7.558659217877095, + "grad_norm": 0.4094686210155487, + "learning_rate": 0.0006240616246498599, + "loss": 0.4045, + "step": 13530 + }, + { + "epoch": 7.559217877094972, + "grad_norm": 0.4282020926475525, + "learning_rate": 0.0006240336134453781, + "loss": 0.3374, + "step": 13531 + }, + { + "epoch": 7.5597765363128495, + "grad_norm": 0.5447085499763489, + "learning_rate": 0.0006240056022408964, + "loss": 0.335, + "step": 13532 + }, + { + "epoch": 7.560335195530726, + "grad_norm": 0.48971641063690186, + "learning_rate": 0.0006239775910364146, + "loss": 0.4137, + "step": 13533 + }, + { + "epoch": 7.5608938547486035, + "grad_norm": 0.9005027413368225, + "learning_rate": 0.0006239495798319327, + "loss": 0.5634, + "step": 13534 + }, + { + "epoch": 7.56145251396648, + "grad_norm": 0.45324206352233887, + "learning_rate": 0.0006239215686274509, + "loss": 0.4517, + "step": 13535 + }, + { + "epoch": 7.562011173184358, + "grad_norm": 1.0080307722091675, + "learning_rate": 0.0006238935574229691, + "loss": 0.453, + "step": 13536 + }, + { + "epoch": 7.562569832402235, + "grad_norm": 0.8796266913414001, + "learning_rate": 0.0006238655462184875, + "loss": 0.4719, + "step": 13537 + }, + { + "epoch": 7.563128491620112, + "grad_norm": 1.5173414945602417, + "learning_rate": 0.0006238375350140057, + "loss": 0.4004, + "step": 13538 + }, + { + "epoch": 7.563687150837989, + "grad_norm": 0.6940330266952515, + "learning_rate": 0.0006238095238095239, + "loss": 0.4645, + "step": 13539 + }, + { + "epoch": 7.564245810055866, + "grad_norm": 0.8782386183738708, + "learning_rate": 0.000623781512605042, + "loss": 0.4195, + "step": 13540 + }, + { + "epoch": 7.564804469273743, + "grad_norm": 0.4251776337623596, + "learning_rate": 0.0006237535014005602, + "loss": 0.3882, + "step": 13541 + }, + { + "epoch": 7.5653631284916205, + "grad_norm": 0.6051243543624878, + "learning_rate": 0.0006237254901960785, + "loss": 0.3694, + "step": 13542 + }, + { + "epoch": 7.565921787709497, + "grad_norm": 0.4507146179676056, + "learning_rate": 0.0006236974789915967, + "loss": 0.4456, + "step": 13543 + }, + { + "epoch": 7.5664804469273745, + "grad_norm": 0.5082181692123413, + "learning_rate": 0.0006236694677871149, + "loss": 0.5975, + "step": 13544 + }, + { + "epoch": 7.567039106145251, + "grad_norm": 1.060773253440857, + "learning_rate": 0.000623641456582633, + "loss": 0.472, + "step": 13545 + }, + { + "epoch": 7.567597765363129, + "grad_norm": 0.7521158456802368, + "learning_rate": 0.0006236134453781512, + "loss": 0.3807, + "step": 13546 + }, + { + "epoch": 7.568156424581005, + "grad_norm": 0.4635113477706909, + "learning_rate": 0.0006235854341736695, + "loss": 0.388, + "step": 13547 + }, + { + "epoch": 7.568715083798883, + "grad_norm": 0.8785490989685059, + "learning_rate": 0.0006235574229691877, + "loss": 0.4587, + "step": 13548 + }, + { + "epoch": 7.56927374301676, + "grad_norm": 0.512169361114502, + "learning_rate": 0.0006235294117647059, + "loss": 0.3665, + "step": 13549 + }, + { + "epoch": 7.569832402234637, + "grad_norm": 0.3617511987686157, + "learning_rate": 0.000623501400560224, + "loss": 0.3619, + "step": 13550 + }, + { + "epoch": 7.570391061452514, + "grad_norm": 0.4750588536262512, + "learning_rate": 0.0006234733893557422, + "loss": 0.3763, + "step": 13551 + }, + { + "epoch": 7.5709497206703915, + "grad_norm": 1.6142890453338623, + "learning_rate": 0.0006234453781512606, + "loss": 0.4686, + "step": 13552 + }, + { + "epoch": 7.571508379888268, + "grad_norm": 0.4497030973434448, + "learning_rate": 0.0006234173669467788, + "loss": 0.4023, + "step": 13553 + }, + { + "epoch": 7.5720670391061455, + "grad_norm": 1.4944583177566528, + "learning_rate": 0.000623389355742297, + "loss": 0.4646, + "step": 13554 + }, + { + "epoch": 7.572625698324022, + "grad_norm": 2.2621238231658936, + "learning_rate": 0.0006233613445378152, + "loss": 0.3752, + "step": 13555 + }, + { + "epoch": 7.5731843575419, + "grad_norm": 0.45141011476516724, + "learning_rate": 0.0006233333333333333, + "loss": 0.3661, + "step": 13556 + }, + { + "epoch": 7.573743016759776, + "grad_norm": 1.2667438983917236, + "learning_rate": 0.0006233053221288516, + "loss": 0.4109, + "step": 13557 + }, + { + "epoch": 7.574301675977654, + "grad_norm": 0.44011032581329346, + "learning_rate": 0.0006232773109243698, + "loss": 0.4115, + "step": 13558 + }, + { + "epoch": 7.574860335195531, + "grad_norm": 0.9447159767150879, + "learning_rate": 0.000623249299719888, + "loss": 0.4522, + "step": 13559 + }, + { + "epoch": 7.575418994413408, + "grad_norm": 0.43916839361190796, + "learning_rate": 0.0006232212885154062, + "loss": 0.428, + "step": 13560 + }, + { + "epoch": 7.575977653631285, + "grad_norm": 0.40302279591560364, + "learning_rate": 0.0006231932773109243, + "loss": 0.4065, + "step": 13561 + }, + { + "epoch": 7.576536312849162, + "grad_norm": 0.480938196182251, + "learning_rate": 0.0006231652661064426, + "loss": 0.3824, + "step": 13562 + }, + { + "epoch": 7.577094972067039, + "grad_norm": 1.4348293542861938, + "learning_rate": 0.0006231372549019608, + "loss": 0.5577, + "step": 13563 + }, + { + "epoch": 7.5776536312849165, + "grad_norm": 0.37254858016967773, + "learning_rate": 0.000623109243697479, + "loss": 0.3684, + "step": 13564 + }, + { + "epoch": 7.578212290502793, + "grad_norm": 1.5769639015197754, + "learning_rate": 0.0006230812324929972, + "loss": 0.4637, + "step": 13565 + }, + { + "epoch": 7.578770949720671, + "grad_norm": 0.38803431391716003, + "learning_rate": 0.0006230532212885153, + "loss": 0.3209, + "step": 13566 + }, + { + "epoch": 7.579329608938547, + "grad_norm": 0.9016589522361755, + "learning_rate": 0.0006230252100840336, + "loss": 0.5289, + "step": 13567 + }, + { + "epoch": 7.579888268156425, + "grad_norm": 0.393205463886261, + "learning_rate": 0.0006229971988795519, + "loss": 0.3557, + "step": 13568 + }, + { + "epoch": 7.580446927374302, + "grad_norm": 1.1996532678604126, + "learning_rate": 0.0006229691876750701, + "loss": 0.5249, + "step": 13569 + }, + { + "epoch": 7.581005586592179, + "grad_norm": 0.5125927329063416, + "learning_rate": 0.0006229411764705883, + "loss": 0.4817, + "step": 13570 + }, + { + "epoch": 7.581564245810056, + "grad_norm": 0.7786597013473511, + "learning_rate": 0.0006229131652661065, + "loss": 0.4115, + "step": 13571 + }, + { + "epoch": 7.582122905027933, + "grad_norm": 0.6712331175804138, + "learning_rate": 0.0006228851540616247, + "loss": 0.5255, + "step": 13572 + }, + { + "epoch": 7.58268156424581, + "grad_norm": 1.7728350162506104, + "learning_rate": 0.0006228571428571429, + "loss": 0.3863, + "step": 13573 + }, + { + "epoch": 7.5832402234636875, + "grad_norm": 0.4781673550605774, + "learning_rate": 0.0006228291316526611, + "loss": 0.4006, + "step": 13574 + }, + { + "epoch": 7.583798882681564, + "grad_norm": 3.055995225906372, + "learning_rate": 0.0006228011204481793, + "loss": 0.4622, + "step": 13575 + }, + { + "epoch": 7.584357541899442, + "grad_norm": 2.026175022125244, + "learning_rate": 0.0006227731092436975, + "loss": 0.4294, + "step": 13576 + }, + { + "epoch": 7.584916201117318, + "grad_norm": 0.476855605840683, + "learning_rate": 0.0006227450980392157, + "loss": 0.4465, + "step": 13577 + }, + { + "epoch": 7.585474860335196, + "grad_norm": 1.9430747032165527, + "learning_rate": 0.0006227170868347339, + "loss": 0.4249, + "step": 13578 + }, + { + "epoch": 7.586033519553073, + "grad_norm": 0.47005364298820496, + "learning_rate": 0.0006226890756302521, + "loss": 0.433, + "step": 13579 + }, + { + "epoch": 7.58659217877095, + "grad_norm": 0.2990330755710602, + "learning_rate": 0.0006226610644257703, + "loss": 0.3467, + "step": 13580 + }, + { + "epoch": 7.587150837988827, + "grad_norm": 0.47463586926460266, + "learning_rate": 0.0006226330532212885, + "loss": 0.4612, + "step": 13581 + }, + { + "epoch": 7.587709497206704, + "grad_norm": 0.4742593765258789, + "learning_rate": 0.0006226050420168068, + "loss": 0.4915, + "step": 13582 + }, + { + "epoch": 7.588268156424581, + "grad_norm": 0.48226580023765564, + "learning_rate": 0.0006225770308123249, + "loss": 0.7002, + "step": 13583 + }, + { + "epoch": 7.588826815642458, + "grad_norm": 0.4428882598876953, + "learning_rate": 0.0006225490196078431, + "loss": 0.4133, + "step": 13584 + }, + { + "epoch": 7.589385474860335, + "grad_norm": 1.0538718700408936, + "learning_rate": 0.0006225210084033614, + "loss": 0.408, + "step": 13585 + }, + { + "epoch": 7.589944134078213, + "grad_norm": 1.024491310119629, + "learning_rate": 0.0006224929971988796, + "loss": 0.3832, + "step": 13586 + }, + { + "epoch": 7.590502793296089, + "grad_norm": 0.4697954058647156, + "learning_rate": 0.0006224649859943979, + "loss": 0.4656, + "step": 13587 + }, + { + "epoch": 7.591061452513967, + "grad_norm": 0.49036210775375366, + "learning_rate": 0.000622436974789916, + "loss": 0.4792, + "step": 13588 + }, + { + "epoch": 7.591620111731844, + "grad_norm": 1.243543267250061, + "learning_rate": 0.0006224089635854342, + "loss": 0.5612, + "step": 13589 + }, + { + "epoch": 7.592178770949721, + "grad_norm": 0.4247657060623169, + "learning_rate": 0.0006223809523809524, + "loss": 0.4548, + "step": 13590 + }, + { + "epoch": 7.592737430167598, + "grad_norm": 0.4095500409603119, + "learning_rate": 0.0006223529411764706, + "loss": 0.3511, + "step": 13591 + }, + { + "epoch": 7.593296089385475, + "grad_norm": 0.5707187652587891, + "learning_rate": 0.0006223249299719889, + "loss": 0.4413, + "step": 13592 + }, + { + "epoch": 7.593854748603352, + "grad_norm": 1.092761754989624, + "learning_rate": 0.000622296918767507, + "loss": 0.4691, + "step": 13593 + }, + { + "epoch": 7.594413407821229, + "grad_norm": 1.0158687829971313, + "learning_rate": 0.0006222689075630252, + "loss": 0.4498, + "step": 13594 + }, + { + "epoch": 7.594972067039106, + "grad_norm": 0.531940758228302, + "learning_rate": 0.0006222408963585434, + "loss": 0.5006, + "step": 13595 + }, + { + "epoch": 7.5955307262569836, + "grad_norm": 0.4816807508468628, + "learning_rate": 0.0006222128851540616, + "loss": 0.5206, + "step": 13596 + }, + { + "epoch": 7.59608938547486, + "grad_norm": 0.40765947103500366, + "learning_rate": 0.0006221848739495799, + "loss": 0.4152, + "step": 13597 + }, + { + "epoch": 7.596648044692738, + "grad_norm": 0.6307384371757507, + "learning_rate": 0.0006221568627450981, + "loss": 0.464, + "step": 13598 + }, + { + "epoch": 7.597206703910614, + "grad_norm": 0.5930641889572144, + "learning_rate": 0.0006221288515406162, + "loss": 0.4582, + "step": 13599 + }, + { + "epoch": 7.597765363128492, + "grad_norm": 0.6988797187805176, + "learning_rate": 0.0006221008403361344, + "loss": 0.3736, + "step": 13600 + }, + { + "epoch": 7.598324022346369, + "grad_norm": 0.4459543526172638, + "learning_rate": 0.0006220728291316526, + "loss": 0.4106, + "step": 13601 + }, + { + "epoch": 7.598882681564246, + "grad_norm": 0.5185832977294922, + "learning_rate": 0.000622044817927171, + "loss": 0.3804, + "step": 13602 + }, + { + "epoch": 7.599441340782123, + "grad_norm": 0.4887022376060486, + "learning_rate": 0.0006220168067226892, + "loss": 0.5291, + "step": 13603 + }, + { + "epoch": 7.6, + "grad_norm": 0.5255801677703857, + "learning_rate": 0.0006219887955182073, + "loss": 0.4708, + "step": 13604 + }, + { + "epoch": 7.600558659217877, + "grad_norm": 1.3447906970977783, + "learning_rate": 0.0006219607843137255, + "loss": 0.4201, + "step": 13605 + }, + { + "epoch": 7.6011173184357546, + "grad_norm": 0.4351365864276886, + "learning_rate": 0.0006219327731092437, + "loss": 0.3431, + "step": 13606 + }, + { + "epoch": 7.601675977653631, + "grad_norm": 0.5405589938163757, + "learning_rate": 0.000621904761904762, + "loss": 0.4339, + "step": 13607 + }, + { + "epoch": 7.602234636871509, + "grad_norm": 0.5554489493370056, + "learning_rate": 0.0006218767507002802, + "loss": 0.4816, + "step": 13608 + }, + { + "epoch": 7.602793296089385, + "grad_norm": 0.5024551153182983, + "learning_rate": 0.0006218487394957983, + "loss": 0.3799, + "step": 13609 + }, + { + "epoch": 7.603351955307263, + "grad_norm": 0.4021596908569336, + "learning_rate": 0.0006218207282913165, + "loss": 0.4172, + "step": 13610 + }, + { + "epoch": 7.603910614525139, + "grad_norm": 0.36767902970314026, + "learning_rate": 0.0006217927170868347, + "loss": 0.3499, + "step": 13611 + }, + { + "epoch": 7.604469273743017, + "grad_norm": 0.5077435374259949, + "learning_rate": 0.0006217647058823529, + "loss": 0.5479, + "step": 13612 + }, + { + "epoch": 7.605027932960894, + "grad_norm": 0.7681729793548584, + "learning_rate": 0.0006217366946778712, + "loss": 0.3696, + "step": 13613 + }, + { + "epoch": 7.605586592178771, + "grad_norm": 1.5435250997543335, + "learning_rate": 0.0006217086834733894, + "loss": 0.3915, + "step": 13614 + }, + { + "epoch": 7.606145251396648, + "grad_norm": 0.5356452465057373, + "learning_rate": 0.0006216806722689075, + "loss": 0.4438, + "step": 13615 + }, + { + "epoch": 7.6067039106145256, + "grad_norm": 0.41589537262916565, + "learning_rate": 0.0006216526610644257, + "loss": 0.4388, + "step": 13616 + }, + { + "epoch": 7.607262569832402, + "grad_norm": 0.4666784405708313, + "learning_rate": 0.0006216246498599439, + "loss": 0.5367, + "step": 13617 + }, + { + "epoch": 7.60782122905028, + "grad_norm": 1.6144362688064575, + "learning_rate": 0.0006215966386554623, + "loss": 0.3293, + "step": 13618 + }, + { + "epoch": 7.608379888268156, + "grad_norm": 0.39060524106025696, + "learning_rate": 0.0006215686274509805, + "loss": 0.3585, + "step": 13619 + }, + { + "epoch": 7.608938547486034, + "grad_norm": 0.898628294467926, + "learning_rate": 0.0006215406162464986, + "loss": 0.3869, + "step": 13620 + }, + { + "epoch": 7.60949720670391, + "grad_norm": 0.6355387568473816, + "learning_rate": 0.0006215126050420168, + "loss": 0.5001, + "step": 13621 + }, + { + "epoch": 7.610055865921788, + "grad_norm": 0.5170354843139648, + "learning_rate": 0.000621484593837535, + "loss": 0.4455, + "step": 13622 + }, + { + "epoch": 7.610614525139665, + "grad_norm": 0.31859737634658813, + "learning_rate": 0.0006214565826330533, + "loss": 0.3214, + "step": 13623 + }, + { + "epoch": 7.611173184357542, + "grad_norm": 0.8199390172958374, + "learning_rate": 0.0006214285714285715, + "loss": 0.6691, + "step": 13624 + }, + { + "epoch": 7.611731843575419, + "grad_norm": 3.0901899337768555, + "learning_rate": 0.0006214005602240896, + "loss": 0.4273, + "step": 13625 + }, + { + "epoch": 7.6122905027932966, + "grad_norm": 0.7332158088684082, + "learning_rate": 0.0006213725490196078, + "loss": 0.5764, + "step": 13626 + }, + { + "epoch": 7.612849162011173, + "grad_norm": 0.3383885622024536, + "learning_rate": 0.000621344537815126, + "loss": 0.3479, + "step": 13627 + }, + { + "epoch": 7.613407821229051, + "grad_norm": 0.6298010945320129, + "learning_rate": 0.0006213165266106443, + "loss": 0.3987, + "step": 13628 + }, + { + "epoch": 7.613966480446927, + "grad_norm": 0.6476728916168213, + "learning_rate": 0.0006212885154061625, + "loss": 0.53, + "step": 13629 + }, + { + "epoch": 7.614525139664805, + "grad_norm": 4.1088128089904785, + "learning_rate": 0.0006212605042016807, + "loss": 0.4242, + "step": 13630 + }, + { + "epoch": 7.615083798882681, + "grad_norm": 1.6146824359893799, + "learning_rate": 0.0006212324929971988, + "loss": 0.4315, + "step": 13631 + }, + { + "epoch": 7.615642458100559, + "grad_norm": 0.6278559565544128, + "learning_rate": 0.000621204481792717, + "loss": 0.457, + "step": 13632 + }, + { + "epoch": 7.616201117318436, + "grad_norm": 0.5750777125358582, + "learning_rate": 0.0006211764705882353, + "loss": 0.4914, + "step": 13633 + }, + { + "epoch": 7.616759776536313, + "grad_norm": 0.4993010461330414, + "learning_rate": 0.0006211484593837536, + "loss": 0.4251, + "step": 13634 + }, + { + "epoch": 7.61731843575419, + "grad_norm": 0.5032191276550293, + "learning_rate": 0.0006211204481792718, + "loss": 0.6733, + "step": 13635 + }, + { + "epoch": 7.617877094972067, + "grad_norm": 0.5283039808273315, + "learning_rate": 0.0006210924369747899, + "loss": 0.4457, + "step": 13636 + }, + { + "epoch": 7.618435754189944, + "grad_norm": 0.37390825152397156, + "learning_rate": 0.0006210644257703081, + "loss": 0.5707, + "step": 13637 + }, + { + "epoch": 7.618994413407822, + "grad_norm": 2.6217222213745117, + "learning_rate": 0.0006210364145658264, + "loss": 0.4876, + "step": 13638 + }, + { + "epoch": 7.619553072625698, + "grad_norm": 0.8103950023651123, + "learning_rate": 0.0006210084033613446, + "loss": 0.4258, + "step": 13639 + }, + { + "epoch": 7.620111731843576, + "grad_norm": 0.549705982208252, + "learning_rate": 0.0006209803921568628, + "loss": 0.2865, + "step": 13640 + }, + { + "epoch": 7.620670391061452, + "grad_norm": 0.5610224008560181, + "learning_rate": 0.0006209523809523809, + "loss": 0.4444, + "step": 13641 + }, + { + "epoch": 7.62122905027933, + "grad_norm": 0.3924856185913086, + "learning_rate": 0.0006209243697478991, + "loss": 0.3473, + "step": 13642 + }, + { + "epoch": 7.621787709497207, + "grad_norm": 0.5121474862098694, + "learning_rate": 0.0006208963585434174, + "loss": 0.344, + "step": 13643 + }, + { + "epoch": 7.622346368715084, + "grad_norm": 0.7494958639144897, + "learning_rate": 0.0006208683473389356, + "loss": 0.5047, + "step": 13644 + }, + { + "epoch": 7.622905027932961, + "grad_norm": 0.7541096210479736, + "learning_rate": 0.0006208403361344538, + "loss": 0.5364, + "step": 13645 + }, + { + "epoch": 7.623463687150838, + "grad_norm": 1.1734641790390015, + "learning_rate": 0.000620812324929972, + "loss": 0.4056, + "step": 13646 + }, + { + "epoch": 7.624022346368715, + "grad_norm": 0.47596099972724915, + "learning_rate": 0.0006207843137254901, + "loss": 0.347, + "step": 13647 + }, + { + "epoch": 7.624581005586592, + "grad_norm": 0.5886275768280029, + "learning_rate": 0.0006207563025210084, + "loss": 0.4595, + "step": 13648 + }, + { + "epoch": 7.625139664804469, + "grad_norm": 0.6051918268203735, + "learning_rate": 0.0006207282913165266, + "loss": 0.4868, + "step": 13649 + }, + { + "epoch": 7.625698324022347, + "grad_norm": 0.5815566778182983, + "learning_rate": 0.0006207002801120449, + "loss": 0.4056, + "step": 13650 + }, + { + "epoch": 7.626256983240223, + "grad_norm": 0.4260379374027252, + "learning_rate": 0.0006206722689075631, + "loss": 0.3437, + "step": 13651 + }, + { + "epoch": 7.626815642458101, + "grad_norm": 0.5471779704093933, + "learning_rate": 0.0006206442577030812, + "loss": 0.3978, + "step": 13652 + }, + { + "epoch": 7.627374301675978, + "grad_norm": 0.6251311898231506, + "learning_rate": 0.0006206162464985995, + "loss": 0.6238, + "step": 13653 + }, + { + "epoch": 7.627932960893855, + "grad_norm": 0.5866640210151672, + "learning_rate": 0.0006205882352941177, + "loss": 0.4578, + "step": 13654 + }, + { + "epoch": 7.628491620111732, + "grad_norm": 0.6820197105407715, + "learning_rate": 0.0006205602240896359, + "loss": 0.4633, + "step": 13655 + }, + { + "epoch": 7.629050279329609, + "grad_norm": 0.4338420033454895, + "learning_rate": 0.0006205322128851541, + "loss": 0.4273, + "step": 13656 + }, + { + "epoch": 7.629608938547486, + "grad_norm": 0.364718496799469, + "learning_rate": 0.0006205042016806722, + "loss": 0.3253, + "step": 13657 + }, + { + "epoch": 7.630167597765363, + "grad_norm": 0.505489706993103, + "learning_rate": 0.0006204761904761905, + "loss": 0.4642, + "step": 13658 + }, + { + "epoch": 7.63072625698324, + "grad_norm": 0.4781780540943146, + "learning_rate": 0.0006204481792717087, + "loss": 0.4871, + "step": 13659 + }, + { + "epoch": 7.631284916201118, + "grad_norm": 3.7457022666931152, + "learning_rate": 0.0006204201680672269, + "loss": 0.3851, + "step": 13660 + }, + { + "epoch": 7.631843575418994, + "grad_norm": 0.47109392285346985, + "learning_rate": 0.0006203921568627451, + "loss": 0.3423, + "step": 13661 + }, + { + "epoch": 7.632402234636872, + "grad_norm": 1.9330334663391113, + "learning_rate": 0.0006203641456582633, + "loss": 0.4179, + "step": 13662 + }, + { + "epoch": 7.632960893854749, + "grad_norm": 1.0891554355621338, + "learning_rate": 0.0006203361344537815, + "loss": 0.4047, + "step": 13663 + }, + { + "epoch": 7.633519553072626, + "grad_norm": 0.5759936571121216, + "learning_rate": 0.0006203081232492997, + "loss": 0.5828, + "step": 13664 + }, + { + "epoch": 7.634078212290503, + "grad_norm": 0.5856107473373413, + "learning_rate": 0.0006202801120448179, + "loss": 0.4522, + "step": 13665 + }, + { + "epoch": 7.63463687150838, + "grad_norm": 0.47771868109703064, + "learning_rate": 0.0006202521008403361, + "loss": 0.418, + "step": 13666 + }, + { + "epoch": 7.635195530726257, + "grad_norm": 0.5531140565872192, + "learning_rate": 0.0006202240896358544, + "loss": 0.3931, + "step": 13667 + }, + { + "epoch": 7.635754189944134, + "grad_norm": 0.43420663475990295, + "learning_rate": 0.0006201960784313726, + "loss": 0.3911, + "step": 13668 + }, + { + "epoch": 7.636312849162011, + "grad_norm": 0.6260654926300049, + "learning_rate": 0.0006201680672268908, + "loss": 0.3894, + "step": 13669 + }, + { + "epoch": 7.636871508379889, + "grad_norm": 1.5277457237243652, + "learning_rate": 0.000620140056022409, + "loss": 0.5073, + "step": 13670 + }, + { + "epoch": 7.637430167597765, + "grad_norm": 0.424627423286438, + "learning_rate": 0.0006201120448179272, + "loss": 0.4319, + "step": 13671 + }, + { + "epoch": 7.637988826815643, + "grad_norm": 1.800803542137146, + "learning_rate": 0.0006200840336134454, + "loss": 0.4499, + "step": 13672 + }, + { + "epoch": 7.638547486033519, + "grad_norm": 0.4708835184574127, + "learning_rate": 0.0006200560224089636, + "loss": 0.4628, + "step": 13673 + }, + { + "epoch": 7.639106145251397, + "grad_norm": 1.399085283279419, + "learning_rate": 0.0006200280112044818, + "loss": 0.529, + "step": 13674 + }, + { + "epoch": 7.639664804469274, + "grad_norm": 0.512787938117981, + "learning_rate": 0.00062, + "loss": 0.4991, + "step": 13675 + }, + { + "epoch": 7.640223463687151, + "grad_norm": 0.46335405111312866, + "learning_rate": 0.0006199719887955182, + "loss": 0.5069, + "step": 13676 + }, + { + "epoch": 7.640782122905028, + "grad_norm": 0.4293338656425476, + "learning_rate": 0.0006199439775910364, + "loss": 0.3096, + "step": 13677 + }, + { + "epoch": 7.641340782122905, + "grad_norm": 0.46071186661720276, + "learning_rate": 0.0006199159663865547, + "loss": 0.3379, + "step": 13678 + }, + { + "epoch": 7.641899441340782, + "grad_norm": 0.5742217898368835, + "learning_rate": 0.0006198879551820728, + "loss": 0.5081, + "step": 13679 + }, + { + "epoch": 7.64245810055866, + "grad_norm": 0.4921104609966278, + "learning_rate": 0.000619859943977591, + "loss": 0.4397, + "step": 13680 + }, + { + "epoch": 7.643016759776536, + "grad_norm": 0.5453481674194336, + "learning_rate": 0.0006198319327731092, + "loss": 0.4033, + "step": 13681 + }, + { + "epoch": 7.643575418994414, + "grad_norm": 2.258979320526123, + "learning_rate": 0.0006198039215686274, + "loss": 0.3496, + "step": 13682 + }, + { + "epoch": 7.64413407821229, + "grad_norm": 0.483030766248703, + "learning_rate": 0.0006197759103641458, + "loss": 0.4889, + "step": 13683 + }, + { + "epoch": 7.644692737430168, + "grad_norm": 0.5434699058532715, + "learning_rate": 0.0006197478991596639, + "loss": 0.5406, + "step": 13684 + }, + { + "epoch": 7.645251396648044, + "grad_norm": 0.5193633437156677, + "learning_rate": 0.0006197198879551821, + "loss": 0.5057, + "step": 13685 + }, + { + "epoch": 7.645810055865922, + "grad_norm": 0.4553646147251129, + "learning_rate": 0.0006196918767507003, + "loss": 0.4554, + "step": 13686 + }, + { + "epoch": 7.646368715083799, + "grad_norm": 0.9845026135444641, + "learning_rate": 0.0006196638655462185, + "loss": 0.4455, + "step": 13687 + }, + { + "epoch": 7.646927374301676, + "grad_norm": 0.3351835310459137, + "learning_rate": 0.0006196358543417368, + "loss": 0.4613, + "step": 13688 + }, + { + "epoch": 7.647486033519553, + "grad_norm": 0.3721778690814972, + "learning_rate": 0.0006196078431372549, + "loss": 0.4132, + "step": 13689 + }, + { + "epoch": 7.648044692737431, + "grad_norm": 0.5659770369529724, + "learning_rate": 0.0006195798319327731, + "loss": 0.396, + "step": 13690 + }, + { + "epoch": 7.648603351955307, + "grad_norm": 0.4577784538269043, + "learning_rate": 0.0006195518207282913, + "loss": 0.4694, + "step": 13691 + }, + { + "epoch": 7.649162011173185, + "grad_norm": 0.7744741439819336, + "learning_rate": 0.0006195238095238095, + "loss": 0.5063, + "step": 13692 + }, + { + "epoch": 7.649720670391061, + "grad_norm": 0.6406466364860535, + "learning_rate": 0.0006194957983193278, + "loss": 0.4249, + "step": 13693 + }, + { + "epoch": 7.650279329608939, + "grad_norm": 0.5525261759757996, + "learning_rate": 0.000619467787114846, + "loss": 0.4521, + "step": 13694 + }, + { + "epoch": 7.650837988826815, + "grad_norm": 0.5090089440345764, + "learning_rate": 0.0006194397759103641, + "loss": 0.3405, + "step": 13695 + }, + { + "epoch": 7.651396648044693, + "grad_norm": 0.6989052295684814, + "learning_rate": 0.0006194117647058823, + "loss": 0.4281, + "step": 13696 + }, + { + "epoch": 7.65195530726257, + "grad_norm": 0.6486166715621948, + "learning_rate": 0.0006193837535014005, + "loss": 0.5128, + "step": 13697 + }, + { + "epoch": 7.652513966480447, + "grad_norm": 0.6600151658058167, + "learning_rate": 0.0006193557422969188, + "loss": 0.4793, + "step": 13698 + }, + { + "epoch": 7.653072625698324, + "grad_norm": 0.601128876209259, + "learning_rate": 0.000619327731092437, + "loss": 0.5005, + "step": 13699 + }, + { + "epoch": 7.653631284916202, + "grad_norm": 0.4556083679199219, + "learning_rate": 0.0006192997198879551, + "loss": 0.4707, + "step": 13700 + }, + { + "epoch": 7.654189944134078, + "grad_norm": 0.47777992486953735, + "learning_rate": 0.0006192717086834734, + "loss": 0.4096, + "step": 13701 + }, + { + "epoch": 7.654748603351956, + "grad_norm": 0.5485050678253174, + "learning_rate": 0.0006192436974789916, + "loss": 0.4695, + "step": 13702 + }, + { + "epoch": 7.655307262569832, + "grad_norm": 2.2587099075317383, + "learning_rate": 0.0006192156862745099, + "loss": 0.4956, + "step": 13703 + }, + { + "epoch": 7.65586592178771, + "grad_norm": 0.3950224220752716, + "learning_rate": 0.0006191876750700281, + "loss": 0.4163, + "step": 13704 + }, + { + "epoch": 7.656424581005586, + "grad_norm": 0.43634507060050964, + "learning_rate": 0.0006191596638655462, + "loss": 0.4292, + "step": 13705 + }, + { + "epoch": 7.656983240223464, + "grad_norm": 1.0446161031723022, + "learning_rate": 0.0006191316526610644, + "loss": 0.4639, + "step": 13706 + }, + { + "epoch": 7.657541899441341, + "grad_norm": 0.5352032780647278, + "learning_rate": 0.0006191036414565826, + "loss": 0.488, + "step": 13707 + }, + { + "epoch": 7.658100558659218, + "grad_norm": 0.5349762439727783, + "learning_rate": 0.0006190756302521009, + "loss": 0.4923, + "step": 13708 + }, + { + "epoch": 7.658659217877095, + "grad_norm": 0.46903496980667114, + "learning_rate": 0.0006190476190476191, + "loss": 0.4169, + "step": 13709 + }, + { + "epoch": 7.659217877094972, + "grad_norm": 0.42552435398101807, + "learning_rate": 0.0006190196078431373, + "loss": 0.4271, + "step": 13710 + }, + { + "epoch": 7.659776536312849, + "grad_norm": 0.4879568815231323, + "learning_rate": 0.0006189915966386554, + "loss": 0.4185, + "step": 13711 + }, + { + "epoch": 7.660335195530727, + "grad_norm": 0.830924928188324, + "learning_rate": 0.0006189635854341736, + "loss": 0.4635, + "step": 13712 + }, + { + "epoch": 7.660893854748603, + "grad_norm": 0.43467411398887634, + "learning_rate": 0.0006189355742296919, + "loss": 0.3426, + "step": 13713 + }, + { + "epoch": 7.661452513966481, + "grad_norm": 0.41393110156059265, + "learning_rate": 0.0006189075630252101, + "loss": 0.4423, + "step": 13714 + }, + { + "epoch": 7.662011173184357, + "grad_norm": 0.7466068863868713, + "learning_rate": 0.0006188795518207283, + "loss": 0.3492, + "step": 13715 + }, + { + "epoch": 7.662569832402235, + "grad_norm": 2.6812796592712402, + "learning_rate": 0.0006188515406162464, + "loss": 0.5415, + "step": 13716 + }, + { + "epoch": 7.663128491620112, + "grad_norm": 0.42886239290237427, + "learning_rate": 0.0006188235294117647, + "loss": 0.5442, + "step": 13717 + }, + { + "epoch": 7.663687150837989, + "grad_norm": 0.9191313982009888, + "learning_rate": 0.000618795518207283, + "loss": 0.5346, + "step": 13718 + }, + { + "epoch": 7.664245810055866, + "grad_norm": 0.5040920972824097, + "learning_rate": 0.0006187675070028012, + "loss": 0.4269, + "step": 13719 + }, + { + "epoch": 7.664804469273743, + "grad_norm": 0.43607982993125916, + "learning_rate": 0.0006187394957983194, + "loss": 0.4148, + "step": 13720 + }, + { + "epoch": 7.66536312849162, + "grad_norm": 0.411907434463501, + "learning_rate": 0.0006187114845938375, + "loss": 0.4211, + "step": 13721 + }, + { + "epoch": 7.665921787709497, + "grad_norm": 0.8341143131256104, + "learning_rate": 0.0006186834733893557, + "loss": 0.5373, + "step": 13722 + }, + { + "epoch": 7.666480446927374, + "grad_norm": 0.8826488256454468, + "learning_rate": 0.000618655462184874, + "loss": 0.4897, + "step": 13723 + }, + { + "epoch": 7.667039106145252, + "grad_norm": 0.8334322571754456, + "learning_rate": 0.0006186274509803922, + "loss": 0.4648, + "step": 13724 + }, + { + "epoch": 7.667597765363128, + "grad_norm": 0.3868120014667511, + "learning_rate": 0.0006185994397759104, + "loss": 0.3887, + "step": 13725 + }, + { + "epoch": 7.668156424581006, + "grad_norm": 0.3800918757915497, + "learning_rate": 0.0006185714285714286, + "loss": 0.4395, + "step": 13726 + }, + { + "epoch": 7.668715083798883, + "grad_norm": 0.7623074054718018, + "learning_rate": 0.0006185434173669467, + "loss": 0.4298, + "step": 13727 + }, + { + "epoch": 7.66927374301676, + "grad_norm": 0.49751996994018555, + "learning_rate": 0.000618515406162465, + "loss": 0.6008, + "step": 13728 + }, + { + "epoch": 7.669832402234637, + "grad_norm": 0.34923678636550903, + "learning_rate": 0.0006184873949579832, + "loss": 0.4395, + "step": 13729 + }, + { + "epoch": 7.670391061452514, + "grad_norm": 0.6198163032531738, + "learning_rate": 0.0006184593837535014, + "loss": 0.3518, + "step": 13730 + }, + { + "epoch": 7.670949720670391, + "grad_norm": 0.5441950559616089, + "learning_rate": 0.0006184313725490196, + "loss": 0.5727, + "step": 13731 + }, + { + "epoch": 7.671508379888268, + "grad_norm": 1.0818986892700195, + "learning_rate": 0.0006184033613445377, + "loss": 0.3853, + "step": 13732 + }, + { + "epoch": 7.672067039106145, + "grad_norm": 0.7946770787239075, + "learning_rate": 0.0006183753501400561, + "loss": 0.445, + "step": 13733 + }, + { + "epoch": 7.672625698324023, + "grad_norm": 1.122252345085144, + "learning_rate": 0.0006183473389355743, + "loss": 0.5267, + "step": 13734 + }, + { + "epoch": 7.673184357541899, + "grad_norm": 0.7753942608833313, + "learning_rate": 0.0006183193277310925, + "loss": 0.4658, + "step": 13735 + }, + { + "epoch": 7.673743016759777, + "grad_norm": 0.46905720233917236, + "learning_rate": 0.0006182913165266107, + "loss": 0.4746, + "step": 13736 + }, + { + "epoch": 7.674301675977654, + "grad_norm": 1.0591672658920288, + "learning_rate": 0.0006182633053221288, + "loss": 0.5493, + "step": 13737 + }, + { + "epoch": 7.674860335195531, + "grad_norm": 0.3919164538383484, + "learning_rate": 0.0006182352941176471, + "loss": 0.3957, + "step": 13738 + }, + { + "epoch": 7.675418994413408, + "grad_norm": 0.5964024662971497, + "learning_rate": 0.0006182072829131653, + "loss": 0.4231, + "step": 13739 + }, + { + "epoch": 7.675977653631285, + "grad_norm": 0.37612593173980713, + "learning_rate": 0.0006181792717086835, + "loss": 0.4415, + "step": 13740 + }, + { + "epoch": 7.676536312849162, + "grad_norm": 1.1040171384811401, + "learning_rate": 0.0006181512605042017, + "loss": 0.4617, + "step": 13741 + }, + { + "epoch": 7.677094972067039, + "grad_norm": 0.6453474760055542, + "learning_rate": 0.0006181232492997199, + "loss": 0.3868, + "step": 13742 + }, + { + "epoch": 7.677653631284916, + "grad_norm": 0.42111751437187195, + "learning_rate": 0.0006180952380952381, + "loss": 0.4798, + "step": 13743 + }, + { + "epoch": 7.678212290502794, + "grad_norm": 0.36102917790412903, + "learning_rate": 0.0006180672268907563, + "loss": 0.4075, + "step": 13744 + }, + { + "epoch": 7.67877094972067, + "grad_norm": 0.6224989295005798, + "learning_rate": 0.0006180392156862745, + "loss": 0.4577, + "step": 13745 + }, + { + "epoch": 7.679329608938548, + "grad_norm": 0.6655389070510864, + "learning_rate": 0.0006180112044817927, + "loss": 0.4789, + "step": 13746 + }, + { + "epoch": 7.679888268156424, + "grad_norm": 2.1099164485931396, + "learning_rate": 0.0006179831932773109, + "loss": 0.5262, + "step": 13747 + }, + { + "epoch": 7.680446927374302, + "grad_norm": 0.5552861094474792, + "learning_rate": 0.0006179551820728291, + "loss": 0.4693, + "step": 13748 + }, + { + "epoch": 7.681005586592179, + "grad_norm": 0.7028999328613281, + "learning_rate": 0.0006179271708683474, + "loss": 0.5608, + "step": 13749 + }, + { + "epoch": 7.681564245810056, + "grad_norm": 6.781050682067871, + "learning_rate": 0.0006178991596638656, + "loss": 0.463, + "step": 13750 + }, + { + "epoch": 7.682122905027933, + "grad_norm": 0.5714400410652161, + "learning_rate": 0.0006178711484593838, + "loss": 0.5884, + "step": 13751 + }, + { + "epoch": 7.68268156424581, + "grad_norm": 1.9843143224716187, + "learning_rate": 0.000617843137254902, + "loss": 0.4781, + "step": 13752 + }, + { + "epoch": 7.683240223463687, + "grad_norm": 0.6092252135276794, + "learning_rate": 0.0006178151260504202, + "loss": 0.5105, + "step": 13753 + }, + { + "epoch": 7.683798882681565, + "grad_norm": 0.7522796392440796, + "learning_rate": 0.0006177871148459384, + "loss": 0.5926, + "step": 13754 + }, + { + "epoch": 7.684357541899441, + "grad_norm": 0.3750437796115875, + "learning_rate": 0.0006177591036414566, + "loss": 0.4542, + "step": 13755 + }, + { + "epoch": 7.684916201117319, + "grad_norm": 0.38676556944847107, + "learning_rate": 0.0006177310924369748, + "loss": 0.3416, + "step": 13756 + }, + { + "epoch": 7.685474860335195, + "grad_norm": 0.4673371911048889, + "learning_rate": 0.000617703081232493, + "loss": 0.3595, + "step": 13757 + }, + { + "epoch": 7.686033519553073, + "grad_norm": 0.9639487862586975, + "learning_rate": 0.0006176750700280113, + "loss": 0.4054, + "step": 13758 + }, + { + "epoch": 7.686592178770949, + "grad_norm": 2.0492560863494873, + "learning_rate": 0.0006176470588235294, + "loss": 0.4234, + "step": 13759 + }, + { + "epoch": 7.687150837988827, + "grad_norm": 0.47220635414123535, + "learning_rate": 0.0006176190476190476, + "loss": 0.4411, + "step": 13760 + }, + { + "epoch": 7.687709497206704, + "grad_norm": 0.9797062873840332, + "learning_rate": 0.0006175910364145658, + "loss": 0.4645, + "step": 13761 + }, + { + "epoch": 7.688268156424581, + "grad_norm": 0.522896945476532, + "learning_rate": 0.000617563025210084, + "loss": 0.3668, + "step": 13762 + }, + { + "epoch": 7.688826815642458, + "grad_norm": 0.6306886672973633, + "learning_rate": 0.0006175350140056023, + "loss": 0.5104, + "step": 13763 + }, + { + "epoch": 7.689385474860336, + "grad_norm": 0.8065402507781982, + "learning_rate": 0.0006175070028011204, + "loss": 0.4484, + "step": 13764 + }, + { + "epoch": 7.689944134078212, + "grad_norm": 0.6034227013587952, + "learning_rate": 0.0006174789915966386, + "loss": 0.396, + "step": 13765 + }, + { + "epoch": 7.69050279329609, + "grad_norm": 0.6344764232635498, + "learning_rate": 0.0006174509803921569, + "loss": 0.3911, + "step": 13766 + }, + { + "epoch": 7.691061452513966, + "grad_norm": 0.7747047543525696, + "learning_rate": 0.0006174229691876751, + "loss": 0.4212, + "step": 13767 + }, + { + "epoch": 7.691620111731844, + "grad_norm": 0.5411653518676758, + "learning_rate": 0.0006173949579831934, + "loss": 0.4052, + "step": 13768 + }, + { + "epoch": 7.69217877094972, + "grad_norm": 0.4202643930912018, + "learning_rate": 0.0006173669467787115, + "loss": 0.3904, + "step": 13769 + }, + { + "epoch": 7.692737430167598, + "grad_norm": 0.376617968082428, + "learning_rate": 0.0006173389355742297, + "loss": 0.3124, + "step": 13770 + }, + { + "epoch": 7.693296089385475, + "grad_norm": 0.5287662148475647, + "learning_rate": 0.0006173109243697479, + "loss": 0.4483, + "step": 13771 + }, + { + "epoch": 7.693854748603352, + "grad_norm": 0.4734002649784088, + "learning_rate": 0.0006172829131652661, + "loss": 0.44, + "step": 13772 + }, + { + "epoch": 7.694413407821229, + "grad_norm": 0.37186411023139954, + "learning_rate": 0.0006172549019607844, + "loss": 0.3672, + "step": 13773 + }, + { + "epoch": 7.694972067039107, + "grad_norm": 0.408000648021698, + "learning_rate": 0.0006172268907563026, + "loss": 0.3329, + "step": 13774 + }, + { + "epoch": 7.695530726256983, + "grad_norm": 0.7121781706809998, + "learning_rate": 0.0006171988795518207, + "loss": 0.5811, + "step": 13775 + }, + { + "epoch": 7.696089385474861, + "grad_norm": 0.38978955149650574, + "learning_rate": 0.0006171708683473389, + "loss": 0.4693, + "step": 13776 + }, + { + "epoch": 7.696648044692737, + "grad_norm": 0.5049538016319275, + "learning_rate": 0.0006171428571428571, + "loss": 0.3904, + "step": 13777 + }, + { + "epoch": 7.697206703910615, + "grad_norm": 0.8071524500846863, + "learning_rate": 0.0006171148459383754, + "loss": 0.4922, + "step": 13778 + }, + { + "epoch": 7.697765363128491, + "grad_norm": 0.5353125333786011, + "learning_rate": 0.0006170868347338936, + "loss": 0.5046, + "step": 13779 + }, + { + "epoch": 7.698324022346369, + "grad_norm": 0.583459734916687, + "learning_rate": 0.0006170588235294117, + "loss": 0.4739, + "step": 13780 + }, + { + "epoch": 7.698882681564246, + "grad_norm": 0.49595117568969727, + "learning_rate": 0.0006170308123249299, + "loss": 0.3648, + "step": 13781 + }, + { + "epoch": 7.699441340782123, + "grad_norm": 0.5471254587173462, + "learning_rate": 0.0006170028011204481, + "loss": 0.51, + "step": 13782 + }, + { + "epoch": 7.7, + "grad_norm": 0.5223873257637024, + "learning_rate": 0.0006169747899159665, + "loss": 0.4504, + "step": 13783 + }, + { + "epoch": 7.700558659217877, + "grad_norm": 0.4435765743255615, + "learning_rate": 0.0006169467787114847, + "loss": 0.4574, + "step": 13784 + }, + { + "epoch": 7.701117318435754, + "grad_norm": 0.8653535842895508, + "learning_rate": 0.0006169187675070028, + "loss": 0.4024, + "step": 13785 + }, + { + "epoch": 7.701675977653632, + "grad_norm": 0.3829124867916107, + "learning_rate": 0.000616890756302521, + "loss": 0.4495, + "step": 13786 + }, + { + "epoch": 7.702234636871508, + "grad_norm": 0.4708978831768036, + "learning_rate": 0.0006168627450980392, + "loss": 0.425, + "step": 13787 + }, + { + "epoch": 7.702793296089386, + "grad_norm": 0.5074251890182495, + "learning_rate": 0.0006168347338935575, + "loss": 0.4854, + "step": 13788 + }, + { + "epoch": 7.703351955307262, + "grad_norm": 0.46025800704956055, + "learning_rate": 0.0006168067226890757, + "loss": 0.3775, + "step": 13789 + }, + { + "epoch": 7.70391061452514, + "grad_norm": 0.8103023171424866, + "learning_rate": 0.0006167787114845939, + "loss": 0.451, + "step": 13790 + }, + { + "epoch": 7.704469273743017, + "grad_norm": 0.582000732421875, + "learning_rate": 0.000616750700280112, + "loss": 0.5085, + "step": 13791 + }, + { + "epoch": 7.705027932960894, + "grad_norm": 0.371208131313324, + "learning_rate": 0.0006167226890756302, + "loss": 0.3684, + "step": 13792 + }, + { + "epoch": 7.705586592178771, + "grad_norm": 0.3921825885772705, + "learning_rate": 0.0006166946778711485, + "loss": 0.4204, + "step": 13793 + }, + { + "epoch": 7.706145251396648, + "grad_norm": 1.094571590423584, + "learning_rate": 0.0006166666666666667, + "loss": 0.4288, + "step": 13794 + }, + { + "epoch": 7.706703910614525, + "grad_norm": 0.530928909778595, + "learning_rate": 0.0006166386554621849, + "loss": 0.3052, + "step": 13795 + }, + { + "epoch": 7.707262569832402, + "grad_norm": 0.5719661116600037, + "learning_rate": 0.000616610644257703, + "loss": 0.4633, + "step": 13796 + }, + { + "epoch": 7.707821229050279, + "grad_norm": 0.5013129711151123, + "learning_rate": 0.0006165826330532212, + "loss": 0.432, + "step": 13797 + }, + { + "epoch": 7.708379888268157, + "grad_norm": 0.7338741421699524, + "learning_rate": 0.0006165546218487396, + "loss": 0.5131, + "step": 13798 + }, + { + "epoch": 7.708938547486033, + "grad_norm": 2.6967313289642334, + "learning_rate": 0.0006165266106442578, + "loss": 0.4484, + "step": 13799 + }, + { + "epoch": 7.709497206703911, + "grad_norm": 0.46453040838241577, + "learning_rate": 0.000616498599439776, + "loss": 0.4021, + "step": 13800 + }, + { + "epoch": 7.710055865921788, + "grad_norm": 0.6692829728126526, + "learning_rate": 0.0006164705882352941, + "loss": 0.5783, + "step": 13801 + }, + { + "epoch": 7.710614525139665, + "grad_norm": 0.5053221583366394, + "learning_rate": 0.0006164425770308123, + "loss": 0.4154, + "step": 13802 + }, + { + "epoch": 7.711173184357542, + "grad_norm": 0.5044556856155396, + "learning_rate": 0.0006164145658263306, + "loss": 0.3968, + "step": 13803 + }, + { + "epoch": 7.711731843575419, + "grad_norm": 0.5830820798873901, + "learning_rate": 0.0006163865546218488, + "loss": 0.4953, + "step": 13804 + }, + { + "epoch": 7.712290502793296, + "grad_norm": 1.0309406518936157, + "learning_rate": 0.000616358543417367, + "loss": 0.3941, + "step": 13805 + }, + { + "epoch": 7.712849162011173, + "grad_norm": 2.83329439163208, + "learning_rate": 0.0006163305322128852, + "loss": 0.4314, + "step": 13806 + }, + { + "epoch": 7.71340782122905, + "grad_norm": 0.4441019594669342, + "learning_rate": 0.0006163025210084033, + "loss": 0.4172, + "step": 13807 + }, + { + "epoch": 7.713966480446928, + "grad_norm": 0.6268557906150818, + "learning_rate": 0.0006162745098039216, + "loss": 0.4257, + "step": 13808 + }, + { + "epoch": 7.714525139664804, + "grad_norm": 2.7386374473571777, + "learning_rate": 0.0006162464985994398, + "loss": 0.466, + "step": 13809 + }, + { + "epoch": 7.715083798882682, + "grad_norm": 0.45009568333625793, + "learning_rate": 0.000616218487394958, + "loss": 0.4323, + "step": 13810 + }, + { + "epoch": 7.715642458100559, + "grad_norm": 4.924854755401611, + "learning_rate": 0.0006161904761904762, + "loss": 0.6347, + "step": 13811 + }, + { + "epoch": 7.716201117318436, + "grad_norm": 0.4844169616699219, + "learning_rate": 0.0006161624649859943, + "loss": 0.4234, + "step": 13812 + }, + { + "epoch": 7.716759776536313, + "grad_norm": 0.5485273599624634, + "learning_rate": 0.0006161344537815126, + "loss": 0.4263, + "step": 13813 + }, + { + "epoch": 7.71731843575419, + "grad_norm": 0.5497127771377563, + "learning_rate": 0.0006161064425770308, + "loss": 0.3986, + "step": 13814 + }, + { + "epoch": 7.717877094972067, + "grad_norm": 0.5263583660125732, + "learning_rate": 0.0006160784313725491, + "loss": 0.5311, + "step": 13815 + }, + { + "epoch": 7.718435754189944, + "grad_norm": 0.36241477727890015, + "learning_rate": 0.0006160504201680673, + "loss": 0.3968, + "step": 13816 + }, + { + "epoch": 7.718994413407821, + "grad_norm": 0.4783209562301636, + "learning_rate": 0.0006160224089635854, + "loss": 0.4473, + "step": 13817 + }, + { + "epoch": 7.719553072625699, + "grad_norm": 1.1497730016708374, + "learning_rate": 0.0006159943977591037, + "loss": 0.4315, + "step": 13818 + }, + { + "epoch": 7.720111731843575, + "grad_norm": 0.391686350107193, + "learning_rate": 0.0006159663865546219, + "loss": 0.3978, + "step": 13819 + }, + { + "epoch": 7.720670391061453, + "grad_norm": 0.47612759470939636, + "learning_rate": 0.0006159383753501401, + "loss": 0.4918, + "step": 13820 + }, + { + "epoch": 7.721229050279329, + "grad_norm": 0.7015030384063721, + "learning_rate": 0.0006159103641456583, + "loss": 0.4627, + "step": 13821 + }, + { + "epoch": 7.721787709497207, + "grad_norm": 0.5020100474357605, + "learning_rate": 0.0006158823529411765, + "loss": 0.4659, + "step": 13822 + }, + { + "epoch": 7.722346368715084, + "grad_norm": 0.5628128051757812, + "learning_rate": 0.0006158543417366947, + "loss": 0.4695, + "step": 13823 + }, + { + "epoch": 7.722905027932961, + "grad_norm": 0.4184693992137909, + "learning_rate": 0.0006158263305322129, + "loss": 0.3769, + "step": 13824 + }, + { + "epoch": 7.723463687150838, + "grad_norm": 0.5309139490127563, + "learning_rate": 0.0006157983193277311, + "loss": 0.5206, + "step": 13825 + }, + { + "epoch": 7.724022346368715, + "grad_norm": 0.7914676070213318, + "learning_rate": 0.0006157703081232493, + "loss": 0.439, + "step": 13826 + }, + { + "epoch": 7.724581005586592, + "grad_norm": 0.555533230304718, + "learning_rate": 0.0006157422969187675, + "loss": 0.4721, + "step": 13827 + }, + { + "epoch": 7.72513966480447, + "grad_norm": 0.6769536137580872, + "learning_rate": 0.0006157142857142857, + "loss": 0.4511, + "step": 13828 + }, + { + "epoch": 7.725698324022346, + "grad_norm": 0.4408147931098938, + "learning_rate": 0.0006156862745098039, + "loss": 0.4181, + "step": 13829 + }, + { + "epoch": 7.726256983240224, + "grad_norm": 0.4702502489089966, + "learning_rate": 0.0006156582633053221, + "loss": 0.4911, + "step": 13830 + }, + { + "epoch": 7.7268156424581, + "grad_norm": 0.3909347951412201, + "learning_rate": 0.0006156302521008404, + "loss": 0.3978, + "step": 13831 + }, + { + "epoch": 7.727374301675978, + "grad_norm": 0.42259547114372253, + "learning_rate": 0.0006156022408963586, + "loss": 0.4038, + "step": 13832 + }, + { + "epoch": 7.727932960893854, + "grad_norm": 0.730812668800354, + "learning_rate": 0.0006155742296918767, + "loss": 0.4515, + "step": 13833 + }, + { + "epoch": 7.728491620111732, + "grad_norm": 0.6177743077278137, + "learning_rate": 0.000615546218487395, + "loss": 0.5445, + "step": 13834 + }, + { + "epoch": 7.729050279329609, + "grad_norm": 0.956550657749176, + "learning_rate": 0.0006155182072829132, + "loss": 0.4675, + "step": 13835 + }, + { + "epoch": 7.729608938547486, + "grad_norm": 1.4229481220245361, + "learning_rate": 0.0006154901960784314, + "loss": 0.4468, + "step": 13836 + }, + { + "epoch": 7.730167597765363, + "grad_norm": 0.45281124114990234, + "learning_rate": 0.0006154621848739496, + "loss": 0.3918, + "step": 13837 + }, + { + "epoch": 7.730726256983241, + "grad_norm": 1.5274964570999146, + "learning_rate": 0.0006154341736694678, + "loss": 0.3822, + "step": 13838 + }, + { + "epoch": 7.731284916201117, + "grad_norm": 1.0664663314819336, + "learning_rate": 0.000615406162464986, + "loss": 0.3988, + "step": 13839 + }, + { + "epoch": 7.731843575418995, + "grad_norm": 0.568108856678009, + "learning_rate": 0.0006153781512605042, + "loss": 0.5821, + "step": 13840 + }, + { + "epoch": 7.732402234636871, + "grad_norm": 0.426876038312912, + "learning_rate": 0.0006153501400560224, + "loss": 0.365, + "step": 13841 + }, + { + "epoch": 7.732960893854749, + "grad_norm": 0.7191917300224304, + "learning_rate": 0.0006153221288515406, + "loss": 0.5099, + "step": 13842 + }, + { + "epoch": 7.733519553072625, + "grad_norm": 1.9407932758331299, + "learning_rate": 0.0006152941176470588, + "loss": 0.6146, + "step": 13843 + }, + { + "epoch": 7.734078212290503, + "grad_norm": 0.8126131296157837, + "learning_rate": 0.000615266106442577, + "loss": 0.3946, + "step": 13844 + }, + { + "epoch": 7.73463687150838, + "grad_norm": 0.7253028154373169, + "learning_rate": 0.0006152380952380952, + "loss": 0.4626, + "step": 13845 + }, + { + "epoch": 7.735195530726257, + "grad_norm": 0.4987674057483673, + "learning_rate": 0.0006152100840336134, + "loss": 0.4327, + "step": 13846 + }, + { + "epoch": 7.735754189944134, + "grad_norm": 0.7138853073120117, + "learning_rate": 0.0006151820728291316, + "loss": 0.4799, + "step": 13847 + }, + { + "epoch": 7.736312849162011, + "grad_norm": 0.6187353134155273, + "learning_rate": 0.0006151540616246499, + "loss": 0.4471, + "step": 13848 + }, + { + "epoch": 7.736871508379888, + "grad_norm": 0.544151246547699, + "learning_rate": 0.0006151260504201682, + "loss": 0.4107, + "step": 13849 + }, + { + "epoch": 7.737430167597766, + "grad_norm": 0.531272292137146, + "learning_rate": 0.0006150980392156863, + "loss": 0.3343, + "step": 13850 + }, + { + "epoch": 7.737988826815642, + "grad_norm": 0.42586618661880493, + "learning_rate": 0.0006150700280112045, + "loss": 0.4436, + "step": 13851 + }, + { + "epoch": 7.73854748603352, + "grad_norm": 0.7693768739700317, + "learning_rate": 0.0006150420168067227, + "loss": 0.6109, + "step": 13852 + }, + { + "epoch": 7.739106145251396, + "grad_norm": 0.47911494970321655, + "learning_rate": 0.0006150140056022409, + "loss": 0.3975, + "step": 13853 + }, + { + "epoch": 7.739664804469274, + "grad_norm": 0.6103163361549377, + "learning_rate": 0.0006149859943977592, + "loss": 0.4093, + "step": 13854 + }, + { + "epoch": 7.740223463687151, + "grad_norm": 0.4705524742603302, + "learning_rate": 0.0006149579831932773, + "loss": 0.4595, + "step": 13855 + }, + { + "epoch": 7.740782122905028, + "grad_norm": 0.443238765001297, + "learning_rate": 0.0006149299719887955, + "loss": 0.5113, + "step": 13856 + }, + { + "epoch": 7.741340782122905, + "grad_norm": 0.6643243432044983, + "learning_rate": 0.0006149019607843137, + "loss": 0.5926, + "step": 13857 + }, + { + "epoch": 7.741899441340782, + "grad_norm": 0.3780375123023987, + "learning_rate": 0.0006148739495798319, + "loss": 0.3841, + "step": 13858 + }, + { + "epoch": 7.742458100558659, + "grad_norm": 0.4166991412639618, + "learning_rate": 0.0006148459383753502, + "loss": 0.4298, + "step": 13859 + }, + { + "epoch": 7.743016759776537, + "grad_norm": 0.49918797612190247, + "learning_rate": 0.0006148179271708683, + "loss": 0.3641, + "step": 13860 + }, + { + "epoch": 7.743575418994413, + "grad_norm": 1.1586129665374756, + "learning_rate": 0.0006147899159663865, + "loss": 0.5314, + "step": 13861 + }, + { + "epoch": 7.744134078212291, + "grad_norm": 0.3844227194786072, + "learning_rate": 0.0006147619047619047, + "loss": 0.4187, + "step": 13862 + }, + { + "epoch": 7.744692737430167, + "grad_norm": 0.8257869482040405, + "learning_rate": 0.0006147338935574229, + "loss": 0.4557, + "step": 13863 + }, + { + "epoch": 7.745251396648045, + "grad_norm": 0.4101067781448364, + "learning_rate": 0.0006147058823529413, + "loss": 0.4573, + "step": 13864 + }, + { + "epoch": 7.745810055865922, + "grad_norm": 0.6604530811309814, + "learning_rate": 0.0006146778711484595, + "loss": 0.4762, + "step": 13865 + }, + { + "epoch": 7.746368715083799, + "grad_norm": 0.4204275906085968, + "learning_rate": 0.0006146498599439776, + "loss": 0.5202, + "step": 13866 + }, + { + "epoch": 7.746927374301676, + "grad_norm": 0.6597739458084106, + "learning_rate": 0.0006146218487394958, + "loss": 0.4185, + "step": 13867 + }, + { + "epoch": 7.747486033519553, + "grad_norm": 0.43951407074928284, + "learning_rate": 0.000614593837535014, + "loss": 0.4793, + "step": 13868 + }, + { + "epoch": 7.74804469273743, + "grad_norm": 0.4143410325050354, + "learning_rate": 0.0006145658263305323, + "loss": 0.4153, + "step": 13869 + }, + { + "epoch": 7.748603351955307, + "grad_norm": 1.717625617980957, + "learning_rate": 0.0006145378151260505, + "loss": 0.444, + "step": 13870 + }, + { + "epoch": 7.749162011173184, + "grad_norm": 0.4756516218185425, + "learning_rate": 0.0006145098039215686, + "loss": 0.5483, + "step": 13871 + }, + { + "epoch": 7.749720670391062, + "grad_norm": 0.6400270462036133, + "learning_rate": 0.0006144817927170868, + "loss": 0.4511, + "step": 13872 + }, + { + "epoch": 7.750279329608938, + "grad_norm": 1.033082365989685, + "learning_rate": 0.000614453781512605, + "loss": 0.4038, + "step": 13873 + }, + { + "epoch": 7.750837988826816, + "grad_norm": 0.48727673292160034, + "learning_rate": 0.0006144257703081233, + "loss": 0.4063, + "step": 13874 + }, + { + "epoch": 7.751396648044693, + "grad_norm": 0.47570937871932983, + "learning_rate": 0.0006143977591036415, + "loss": 0.4392, + "step": 13875 + }, + { + "epoch": 7.75195530726257, + "grad_norm": 0.47843945026397705, + "learning_rate": 0.0006143697478991596, + "loss": 0.4342, + "step": 13876 + }, + { + "epoch": 7.752513966480447, + "grad_norm": 1.9364606142044067, + "learning_rate": 0.0006143417366946778, + "loss": 0.4192, + "step": 13877 + }, + { + "epoch": 7.753072625698324, + "grad_norm": 0.6362441182136536, + "learning_rate": 0.000614313725490196, + "loss": 0.6112, + "step": 13878 + }, + { + "epoch": 7.753631284916201, + "grad_norm": 0.4092274308204651, + "learning_rate": 0.0006142857142857143, + "loss": 0.3732, + "step": 13879 + }, + { + "epoch": 7.754189944134078, + "grad_norm": 0.5682749152183533, + "learning_rate": 0.0006142577030812326, + "loss": 0.5345, + "step": 13880 + }, + { + "epoch": 7.754748603351955, + "grad_norm": 0.8510589599609375, + "learning_rate": 0.0006142296918767508, + "loss": 0.3834, + "step": 13881 + }, + { + "epoch": 7.755307262569833, + "grad_norm": 0.5240651369094849, + "learning_rate": 0.0006142016806722689, + "loss": 0.4374, + "step": 13882 + }, + { + "epoch": 7.755865921787709, + "grad_norm": 0.6527811884880066, + "learning_rate": 0.0006141736694677871, + "loss": 0.4909, + "step": 13883 + }, + { + "epoch": 7.756424581005587, + "grad_norm": 0.790418803691864, + "learning_rate": 0.0006141456582633054, + "loss": 0.4613, + "step": 13884 + }, + { + "epoch": 7.756983240223463, + "grad_norm": 0.6083050966262817, + "learning_rate": 0.0006141176470588236, + "loss": 0.5396, + "step": 13885 + }, + { + "epoch": 7.757541899441341, + "grad_norm": 0.44148558378219604, + "learning_rate": 0.0006140896358543418, + "loss": 0.4566, + "step": 13886 + }, + { + "epoch": 7.758100558659218, + "grad_norm": 0.4408504366874695, + "learning_rate": 0.0006140616246498599, + "loss": 0.5184, + "step": 13887 + }, + { + "epoch": 7.758659217877095, + "grad_norm": 0.6519942879676819, + "learning_rate": 0.0006140336134453781, + "loss": 0.5159, + "step": 13888 + }, + { + "epoch": 7.759217877094972, + "grad_norm": 0.6187353730201721, + "learning_rate": 0.0006140056022408964, + "loss": 0.4056, + "step": 13889 + }, + { + "epoch": 7.759776536312849, + "grad_norm": 0.810020387172699, + "learning_rate": 0.0006139775910364146, + "loss": 0.624, + "step": 13890 + }, + { + "epoch": 7.760335195530726, + "grad_norm": 0.6215172410011292, + "learning_rate": 0.0006139495798319328, + "loss": 0.5284, + "step": 13891 + }, + { + "epoch": 7.760893854748604, + "grad_norm": 0.7776293158531189, + "learning_rate": 0.0006139215686274509, + "loss": 0.5887, + "step": 13892 + }, + { + "epoch": 7.76145251396648, + "grad_norm": 0.3514454662799835, + "learning_rate": 0.0006138935574229691, + "loss": 0.365, + "step": 13893 + }, + { + "epoch": 7.762011173184358, + "grad_norm": 0.6031270623207092, + "learning_rate": 0.0006138655462184874, + "loss": 0.5181, + "step": 13894 + }, + { + "epoch": 7.762569832402234, + "grad_norm": 0.8488714694976807, + "learning_rate": 0.0006138375350140056, + "loss": 0.3842, + "step": 13895 + }, + { + "epoch": 7.763128491620112, + "grad_norm": 1.634118676185608, + "learning_rate": 0.0006138095238095238, + "loss": 0.4515, + "step": 13896 + }, + { + "epoch": 7.763687150837989, + "grad_norm": 0.4528811573982239, + "learning_rate": 0.0006137815126050421, + "loss": 0.4765, + "step": 13897 + }, + { + "epoch": 7.764245810055866, + "grad_norm": 0.7632697820663452, + "learning_rate": 0.0006137535014005602, + "loss": 0.5145, + "step": 13898 + }, + { + "epoch": 7.764804469273743, + "grad_norm": 0.9000115394592285, + "learning_rate": 0.0006137254901960785, + "loss": 0.4056, + "step": 13899 + }, + { + "epoch": 7.76536312849162, + "grad_norm": 0.4654691815376282, + "learning_rate": 0.0006136974789915967, + "loss": 0.3723, + "step": 13900 + }, + { + "epoch": 7.765921787709497, + "grad_norm": 0.8328071236610413, + "learning_rate": 0.0006136694677871149, + "loss": 0.4179, + "step": 13901 + }, + { + "epoch": 7.766480446927375, + "grad_norm": 0.8510044813156128, + "learning_rate": 0.0006136414565826331, + "loss": 0.4124, + "step": 13902 + }, + { + "epoch": 7.767039106145251, + "grad_norm": 0.5416488647460938, + "learning_rate": 0.0006136134453781512, + "loss": 0.3962, + "step": 13903 + }, + { + "epoch": 7.767597765363129, + "grad_norm": 0.6769051551818848, + "learning_rate": 0.0006135854341736695, + "loss": 0.427, + "step": 13904 + }, + { + "epoch": 7.768156424581005, + "grad_norm": 0.5259209275245667, + "learning_rate": 0.0006135574229691877, + "loss": 0.4533, + "step": 13905 + }, + { + "epoch": 7.768715083798883, + "grad_norm": 0.7382765412330627, + "learning_rate": 0.0006135294117647059, + "loss": 0.3871, + "step": 13906 + }, + { + "epoch": 7.769273743016759, + "grad_norm": 0.7412647604942322, + "learning_rate": 0.0006135014005602241, + "loss": 0.5075, + "step": 13907 + }, + { + "epoch": 7.769832402234637, + "grad_norm": 4.510376453399658, + "learning_rate": 0.0006134733893557422, + "loss": 0.4838, + "step": 13908 + }, + { + "epoch": 7.770391061452514, + "grad_norm": 2.4300150871276855, + "learning_rate": 0.0006134453781512605, + "loss": 0.5708, + "step": 13909 + }, + { + "epoch": 7.770949720670391, + "grad_norm": 0.5524688959121704, + "learning_rate": 0.0006134173669467787, + "loss": 0.4641, + "step": 13910 + }, + { + "epoch": 7.771508379888268, + "grad_norm": 0.3958134651184082, + "learning_rate": 0.0006133893557422969, + "loss": 0.3993, + "step": 13911 + }, + { + "epoch": 7.772067039106146, + "grad_norm": 0.39522585272789, + "learning_rate": 0.0006133613445378151, + "loss": 0.4739, + "step": 13912 + }, + { + "epoch": 7.772625698324022, + "grad_norm": 0.5362353324890137, + "learning_rate": 0.0006133333333333334, + "loss": 0.4541, + "step": 13913 + }, + { + "epoch": 7.7731843575419, + "grad_norm": 0.866977870464325, + "learning_rate": 0.0006133053221288516, + "loss": 0.3998, + "step": 13914 + }, + { + "epoch": 7.773743016759776, + "grad_norm": 0.840677797794342, + "learning_rate": 0.0006132773109243698, + "loss": 0.4495, + "step": 13915 + }, + { + "epoch": 7.774301675977654, + "grad_norm": 1.0331155061721802, + "learning_rate": 0.000613249299719888, + "loss": 0.4686, + "step": 13916 + }, + { + "epoch": 7.77486033519553, + "grad_norm": 0.44592276215553284, + "learning_rate": 0.0006132212885154062, + "loss": 0.3728, + "step": 13917 + }, + { + "epoch": 7.775418994413408, + "grad_norm": 0.534802258014679, + "learning_rate": 0.0006131932773109244, + "loss": 0.3878, + "step": 13918 + }, + { + "epoch": 7.775977653631285, + "grad_norm": 0.6893162131309509, + "learning_rate": 0.0006131652661064426, + "loss": 0.3981, + "step": 13919 + }, + { + "epoch": 7.776536312849162, + "grad_norm": 0.46987706422805786, + "learning_rate": 0.0006131372549019608, + "loss": 0.4733, + "step": 13920 + }, + { + "epoch": 7.777094972067039, + "grad_norm": 0.39327356219291687, + "learning_rate": 0.000613109243697479, + "loss": 0.3758, + "step": 13921 + }, + { + "epoch": 7.777653631284916, + "grad_norm": 0.7337353229522705, + "learning_rate": 0.0006130812324929972, + "loss": 0.3359, + "step": 13922 + }, + { + "epoch": 7.778212290502793, + "grad_norm": 2.117324113845825, + "learning_rate": 0.0006130532212885154, + "loss": 0.3588, + "step": 13923 + }, + { + "epoch": 7.778770949720671, + "grad_norm": 0.4609636068344116, + "learning_rate": 0.0006130252100840336, + "loss": 0.4937, + "step": 13924 + }, + { + "epoch": 7.779329608938547, + "grad_norm": 0.4368499517440796, + "learning_rate": 0.0006129971988795518, + "loss": 0.4018, + "step": 13925 + }, + { + "epoch": 7.779888268156425, + "grad_norm": 0.34663155674934387, + "learning_rate": 0.00061296918767507, + "loss": 0.3147, + "step": 13926 + }, + { + "epoch": 7.780446927374301, + "grad_norm": 0.4378778636455536, + "learning_rate": 0.0006129411764705882, + "loss": 0.3358, + "step": 13927 + }, + { + "epoch": 7.781005586592179, + "grad_norm": 1.629021406173706, + "learning_rate": 0.0006129131652661064, + "loss": 0.5392, + "step": 13928 + }, + { + "epoch": 7.781564245810056, + "grad_norm": 1.9251132011413574, + "learning_rate": 0.0006128851540616248, + "loss": 0.4689, + "step": 13929 + }, + { + "epoch": 7.782122905027933, + "grad_norm": 0.49251148104667664, + "learning_rate": 0.0006128571428571429, + "loss": 0.4831, + "step": 13930 + }, + { + "epoch": 7.78268156424581, + "grad_norm": 1.156540870666504, + "learning_rate": 0.0006128291316526611, + "loss": 0.5427, + "step": 13931 + }, + { + "epoch": 7.783240223463687, + "grad_norm": 3.9049224853515625, + "learning_rate": 0.0006128011204481793, + "loss": 0.2848, + "step": 13932 + }, + { + "epoch": 7.783798882681564, + "grad_norm": 0.6031736135482788, + "learning_rate": 0.0006127731092436975, + "loss": 0.3101, + "step": 13933 + }, + { + "epoch": 7.784357541899441, + "grad_norm": 0.39596477150917053, + "learning_rate": 0.0006127450980392158, + "loss": 0.4133, + "step": 13934 + }, + { + "epoch": 7.784916201117318, + "grad_norm": 0.5235522985458374, + "learning_rate": 0.0006127170868347339, + "loss": 0.5971, + "step": 13935 + }, + { + "epoch": 7.785474860335196, + "grad_norm": 0.4688574969768524, + "learning_rate": 0.0006126890756302521, + "loss": 0.3608, + "step": 13936 + }, + { + "epoch": 7.786033519553072, + "grad_norm": 0.48614370822906494, + "learning_rate": 0.0006126610644257703, + "loss": 0.5324, + "step": 13937 + }, + { + "epoch": 7.78659217877095, + "grad_norm": 0.6135228872299194, + "learning_rate": 0.0006126330532212885, + "loss": 0.4906, + "step": 13938 + }, + { + "epoch": 7.787150837988827, + "grad_norm": 0.6341299414634705, + "learning_rate": 0.0006126050420168068, + "loss": 0.5944, + "step": 13939 + }, + { + "epoch": 7.787709497206704, + "grad_norm": 0.4454633593559265, + "learning_rate": 0.0006125770308123249, + "loss": 0.4093, + "step": 13940 + }, + { + "epoch": 7.788268156424581, + "grad_norm": 0.7565609812736511, + "learning_rate": 0.0006125490196078431, + "loss": 0.4687, + "step": 13941 + }, + { + "epoch": 7.788826815642458, + "grad_norm": 0.4242513179779053, + "learning_rate": 0.0006125210084033613, + "loss": 0.4787, + "step": 13942 + }, + { + "epoch": 7.789385474860335, + "grad_norm": 0.47021517157554626, + "learning_rate": 0.0006124929971988795, + "loss": 0.4099, + "step": 13943 + }, + { + "epoch": 7.789944134078212, + "grad_norm": 0.7617304921150208, + "learning_rate": 0.0006124649859943978, + "loss": 0.5548, + "step": 13944 + }, + { + "epoch": 7.790502793296089, + "grad_norm": 0.4995030164718628, + "learning_rate": 0.000612436974789916, + "loss": 0.3532, + "step": 13945 + }, + { + "epoch": 7.791061452513967, + "grad_norm": 0.38720646500587463, + "learning_rate": 0.0006124089635854341, + "loss": 0.3722, + "step": 13946 + }, + { + "epoch": 7.791620111731843, + "grad_norm": 0.5832614898681641, + "learning_rate": 0.0006123809523809524, + "loss": 0.5378, + "step": 13947 + }, + { + "epoch": 7.792178770949721, + "grad_norm": 1.8605190515518188, + "learning_rate": 0.0006123529411764706, + "loss": 0.4192, + "step": 13948 + }, + { + "epoch": 7.792737430167598, + "grad_norm": 0.48895007371902466, + "learning_rate": 0.0006123249299719889, + "loss": 0.4937, + "step": 13949 + }, + { + "epoch": 7.793296089385475, + "grad_norm": 0.38163071870803833, + "learning_rate": 0.0006122969187675071, + "loss": 0.351, + "step": 13950 + }, + { + "epoch": 7.793854748603352, + "grad_norm": 1.0164268016815186, + "learning_rate": 0.0006122689075630252, + "loss": 0.3331, + "step": 13951 + }, + { + "epoch": 7.794413407821229, + "grad_norm": 0.8862943649291992, + "learning_rate": 0.0006122408963585434, + "loss": 0.4055, + "step": 13952 + }, + { + "epoch": 7.794972067039106, + "grad_norm": 0.3968307077884674, + "learning_rate": 0.0006122128851540616, + "loss": 0.482, + "step": 13953 + }, + { + "epoch": 7.795530726256983, + "grad_norm": 0.5490445494651794, + "learning_rate": 0.0006121848739495799, + "loss": 0.4625, + "step": 13954 + }, + { + "epoch": 7.79608938547486, + "grad_norm": 1.4571541547775269, + "learning_rate": 0.0006121568627450981, + "loss": 0.3694, + "step": 13955 + }, + { + "epoch": 7.796648044692738, + "grad_norm": 0.5338897705078125, + "learning_rate": 0.0006121288515406162, + "loss": 0.4824, + "step": 13956 + }, + { + "epoch": 7.797206703910614, + "grad_norm": 5.459682464599609, + "learning_rate": 0.0006121008403361344, + "loss": 0.6282, + "step": 13957 + }, + { + "epoch": 7.797765363128492, + "grad_norm": 1.1113295555114746, + "learning_rate": 0.0006120728291316526, + "loss": 0.5635, + "step": 13958 + }, + { + "epoch": 7.798324022346368, + "grad_norm": 0.5434353947639465, + "learning_rate": 0.0006120448179271709, + "loss": 0.3491, + "step": 13959 + }, + { + "epoch": 7.798882681564246, + "grad_norm": 0.7481069564819336, + "learning_rate": 0.0006120168067226891, + "loss": 0.3484, + "step": 13960 + }, + { + "epoch": 7.799441340782123, + "grad_norm": 0.6251439452171326, + "learning_rate": 0.0006119887955182073, + "loss": 0.4828, + "step": 13961 + }, + { + "epoch": 7.8, + "grad_norm": 0.48824432492256165, + "learning_rate": 0.0006119607843137254, + "loss": 0.48, + "step": 13962 + }, + { + "epoch": 7.800558659217877, + "grad_norm": 0.5305771827697754, + "learning_rate": 0.0006119327731092437, + "loss": 0.3644, + "step": 13963 + }, + { + "epoch": 7.801117318435754, + "grad_norm": 0.5494493246078491, + "learning_rate": 0.000611904761904762, + "loss": 0.4209, + "step": 13964 + }, + { + "epoch": 7.801675977653631, + "grad_norm": 0.5097732543945312, + "learning_rate": 0.0006118767507002802, + "loss": 0.4117, + "step": 13965 + }, + { + "epoch": 7.802234636871509, + "grad_norm": 1.5923417806625366, + "learning_rate": 0.0006118487394957984, + "loss": 0.49, + "step": 13966 + }, + { + "epoch": 7.802793296089385, + "grad_norm": 0.523015022277832, + "learning_rate": 0.0006118207282913165, + "loss": 0.408, + "step": 13967 + }, + { + "epoch": 7.803351955307263, + "grad_norm": Infinity, + "learning_rate": 0.0006118207282913165, + "loss": 0.4448, + "step": 13968 + }, + { + "epoch": 7.803910614525139, + "grad_norm": 0.5710523724555969, + "learning_rate": 0.0006117927170868347, + "loss": 0.4922, + "step": 13969 + }, + { + "epoch": 7.804469273743017, + "grad_norm": 0.9690161943435669, + "learning_rate": 0.000611764705882353, + "loss": 0.3205, + "step": 13970 + }, + { + "epoch": 7.805027932960893, + "grad_norm": 0.4549117684364319, + "learning_rate": 0.0006117366946778712, + "loss": 0.4894, + "step": 13971 + }, + { + "epoch": 7.805586592178771, + "grad_norm": 0.4980126619338989, + "learning_rate": 0.0006117086834733894, + "loss": 0.4853, + "step": 13972 + }, + { + "epoch": 7.806145251396648, + "grad_norm": 0.5128645896911621, + "learning_rate": 0.0006116806722689075, + "loss": 0.4183, + "step": 13973 + }, + { + "epoch": 7.806703910614525, + "grad_norm": 3.2597928047180176, + "learning_rate": 0.0006116526610644257, + "loss": 0.3818, + "step": 13974 + }, + { + "epoch": 7.807262569832402, + "grad_norm": 0.4002695083618164, + "learning_rate": 0.000611624649859944, + "loss": 0.3737, + "step": 13975 + }, + { + "epoch": 7.80782122905028, + "grad_norm": 0.5315811634063721, + "learning_rate": 0.0006115966386554622, + "loss": 0.4243, + "step": 13976 + }, + { + "epoch": 7.808379888268156, + "grad_norm": 0.5031778812408447, + "learning_rate": 0.0006115686274509804, + "loss": 0.3221, + "step": 13977 + }, + { + "epoch": 7.808938547486034, + "grad_norm": 0.791736900806427, + "learning_rate": 0.0006115406162464986, + "loss": 0.5553, + "step": 13978 + }, + { + "epoch": 7.80949720670391, + "grad_norm": 0.5402506589889526, + "learning_rate": 0.0006115126050420167, + "loss": 0.4367, + "step": 13979 + }, + { + "epoch": 7.810055865921788, + "grad_norm": 0.8056809306144714, + "learning_rate": 0.0006114845938375351, + "loss": 0.6272, + "step": 13980 + }, + { + "epoch": 7.810614525139664, + "grad_norm": 1.1596993207931519, + "learning_rate": 0.0006114565826330533, + "loss": 0.3993, + "step": 13981 + }, + { + "epoch": 7.811173184357542, + "grad_norm": 0.7039431929588318, + "learning_rate": 0.0006114285714285715, + "loss": 0.3529, + "step": 13982 + }, + { + "epoch": 7.811731843575419, + "grad_norm": 0.4426352381706238, + "learning_rate": 0.0006114005602240897, + "loss": 0.4302, + "step": 13983 + }, + { + "epoch": 7.812290502793296, + "grad_norm": 0.49005192518234253, + "learning_rate": 0.0006113725490196078, + "loss": 0.4461, + "step": 13984 + }, + { + "epoch": 7.812849162011173, + "grad_norm": 0.37636321783065796, + "learning_rate": 0.0006113445378151261, + "loss": 0.4068, + "step": 13985 + }, + { + "epoch": 7.813407821229051, + "grad_norm": 0.5424584746360779, + "learning_rate": 0.0006113165266106443, + "loss": 0.4707, + "step": 13986 + }, + { + "epoch": 7.813966480446927, + "grad_norm": 0.4625190198421478, + "learning_rate": 0.0006112885154061625, + "loss": 0.3768, + "step": 13987 + }, + { + "epoch": 7.814525139664805, + "grad_norm": 1.3029929399490356, + "learning_rate": 0.0006112605042016807, + "loss": 0.4689, + "step": 13988 + }, + { + "epoch": 7.815083798882681, + "grad_norm": 0.627018392086029, + "learning_rate": 0.0006112324929971988, + "loss": 0.5588, + "step": 13989 + }, + { + "epoch": 7.815642458100559, + "grad_norm": 0.42316123843193054, + "learning_rate": 0.0006112044817927171, + "loss": 0.4039, + "step": 13990 + }, + { + "epoch": 7.816201117318435, + "grad_norm": 0.32366514205932617, + "learning_rate": 0.0006111764705882353, + "loss": 0.4052, + "step": 13991 + }, + { + "epoch": 7.816759776536313, + "grad_norm": 0.41852328181266785, + "learning_rate": 0.0006111484593837535, + "loss": 0.4181, + "step": 13992 + }, + { + "epoch": 7.81731843575419, + "grad_norm": 0.6539618372917175, + "learning_rate": 0.0006111204481792717, + "loss": 0.5097, + "step": 13993 + }, + { + "epoch": 7.817877094972067, + "grad_norm": 0.48087573051452637, + "learning_rate": 0.0006110924369747899, + "loss": 0.3821, + "step": 13994 + }, + { + "epoch": 7.818435754189944, + "grad_norm": 0.708215594291687, + "learning_rate": 0.0006110644257703081, + "loss": 0.3635, + "step": 13995 + }, + { + "epoch": 7.818994413407821, + "grad_norm": 0.38754674792289734, + "learning_rate": 0.0006110364145658264, + "loss": 0.3689, + "step": 13996 + }, + { + "epoch": 7.819553072625698, + "grad_norm": 0.39824309945106506, + "learning_rate": 0.0006110084033613446, + "loss": 0.4058, + "step": 13997 + }, + { + "epoch": 7.820111731843576, + "grad_norm": 0.6965510249137878, + "learning_rate": 0.0006109803921568628, + "loss": 0.4776, + "step": 13998 + }, + { + "epoch": 7.820670391061452, + "grad_norm": 0.501475989818573, + "learning_rate": 0.000610952380952381, + "loss": 0.4461, + "step": 13999 + }, + { + "epoch": 7.82122905027933, + "grad_norm": 0.6898168921470642, + "learning_rate": 0.0006109243697478992, + "loss": 0.4488, + "step": 14000 + }, + { + "epoch": 7.82122905027933, + "eval_cer": 0.09159547423434039, + "eval_loss": 0.3414621353149414, + "eval_runtime": 57.6769, + "eval_samples_per_second": 78.68, + "eval_steps_per_second": 4.924, + "eval_wer": 0.36518533001871456, + "step": 14000 + }, + { + "epoch": 7.821787709497206, + "grad_norm": 0.47615861892700195, + "learning_rate": 0.0006108963585434174, + "loss": 0.4425, + "step": 14001 + }, + { + "epoch": 7.822346368715084, + "grad_norm": 0.514773964881897, + "learning_rate": 0.0006108683473389356, + "loss": 0.46, + "step": 14002 + }, + { + "epoch": 7.822905027932961, + "grad_norm": 0.511339008808136, + "learning_rate": 0.0006108403361344538, + "loss": 0.4479, + "step": 14003 + }, + { + "epoch": 7.823463687150838, + "grad_norm": 0.39632734656333923, + "learning_rate": 0.000610812324929972, + "loss": 0.412, + "step": 14004 + }, + { + "epoch": 7.824022346368715, + "grad_norm": 0.37343478202819824, + "learning_rate": 0.0006107843137254902, + "loss": 0.3882, + "step": 14005 + }, + { + "epoch": 7.824581005586592, + "grad_norm": 0.609805703163147, + "learning_rate": 0.0006107563025210084, + "loss": 0.4255, + "step": 14006 + }, + { + "epoch": 7.825139664804469, + "grad_norm": 0.6717429757118225, + "learning_rate": 0.0006107282913165266, + "loss": 0.4501, + "step": 14007 + }, + { + "epoch": 7.825698324022346, + "grad_norm": 0.6591873168945312, + "learning_rate": 0.0006107002801120448, + "loss": 0.505, + "step": 14008 + }, + { + "epoch": 7.826256983240223, + "grad_norm": 0.4814958870410919, + "learning_rate": 0.000610672268907563, + "loss": 0.6377, + "step": 14009 + }, + { + "epoch": 7.826815642458101, + "grad_norm": 0.34744584560394287, + "learning_rate": 0.0006106442577030813, + "loss": 0.3312, + "step": 14010 + }, + { + "epoch": 7.827374301675977, + "grad_norm": 0.5938692688941956, + "learning_rate": 0.0006106162464985994, + "loss": 0.4223, + "step": 14011 + }, + { + "epoch": 7.827932960893855, + "grad_norm": 0.40614497661590576, + "learning_rate": 0.0006105882352941176, + "loss": 0.4203, + "step": 14012 + }, + { + "epoch": 7.828491620111732, + "grad_norm": 2.525165557861328, + "learning_rate": 0.0006105602240896359, + "loss": 0.3115, + "step": 14013 + }, + { + "epoch": 7.829050279329609, + "grad_norm": 0.38710471987724304, + "learning_rate": 0.0006105322128851541, + "loss": 0.4365, + "step": 14014 + }, + { + "epoch": 7.829608938547486, + "grad_norm": 0.6059528589248657, + "learning_rate": 0.0006105042016806724, + "loss": 0.4143, + "step": 14015 + }, + { + "epoch": 7.830167597765363, + "grad_norm": 0.5290810465812683, + "learning_rate": 0.0006104761904761905, + "loss": 0.44, + "step": 14016 + }, + { + "epoch": 7.83072625698324, + "grad_norm": 0.593165397644043, + "learning_rate": 0.0006104481792717087, + "loss": 0.4513, + "step": 14017 + }, + { + "epoch": 7.831284916201117, + "grad_norm": 0.39692336320877075, + "learning_rate": 0.0006104201680672269, + "loss": 0.4607, + "step": 14018 + }, + { + "epoch": 7.831843575418994, + "grad_norm": 0.40724900364875793, + "learning_rate": 0.0006103921568627451, + "loss": 0.3811, + "step": 14019 + }, + { + "epoch": 7.832402234636872, + "grad_norm": 0.5121590495109558, + "learning_rate": 0.0006103641456582634, + "loss": 0.4841, + "step": 14020 + }, + { + "epoch": 7.832960893854748, + "grad_norm": 0.6458014249801636, + "learning_rate": 0.0006103361344537815, + "loss": 0.5812, + "step": 14021 + }, + { + "epoch": 7.833519553072626, + "grad_norm": 0.5924685001373291, + "learning_rate": 0.0006103081232492997, + "loss": 0.4381, + "step": 14022 + }, + { + "epoch": 7.834078212290503, + "grad_norm": 0.46820512413978577, + "learning_rate": 0.0006102801120448179, + "loss": 0.3811, + "step": 14023 + }, + { + "epoch": 7.83463687150838, + "grad_norm": 0.4230148196220398, + "learning_rate": 0.0006102521008403361, + "loss": 0.4229, + "step": 14024 + }, + { + "epoch": 7.835195530726257, + "grad_norm": 0.4968454837799072, + "learning_rate": 0.0006102240896358544, + "loss": 0.3683, + "step": 14025 + }, + { + "epoch": 7.835754189944134, + "grad_norm": 0.6096512079238892, + "learning_rate": 0.0006101960784313726, + "loss": 0.4128, + "step": 14026 + }, + { + "epoch": 7.836312849162011, + "grad_norm": 0.4862516224384308, + "learning_rate": 0.0006101680672268907, + "loss": 0.4409, + "step": 14027 + }, + { + "epoch": 7.836871508379888, + "grad_norm": 2.0232625007629395, + "learning_rate": 0.0006101400560224089, + "loss": 0.4563, + "step": 14028 + }, + { + "epoch": 7.837430167597765, + "grad_norm": 0.43342125415802, + "learning_rate": 0.0006101120448179271, + "loss": 0.3823, + "step": 14029 + }, + { + "epoch": 7.837988826815643, + "grad_norm": 0.573444128036499, + "learning_rate": 0.0006100840336134455, + "loss": 0.3678, + "step": 14030 + }, + { + "epoch": 7.838547486033519, + "grad_norm": 0.4459739923477173, + "learning_rate": 0.0006100560224089637, + "loss": 0.4485, + "step": 14031 + }, + { + "epoch": 7.839106145251397, + "grad_norm": 0.8919585943222046, + "learning_rate": 0.0006100280112044818, + "loss": 0.4919, + "step": 14032 + }, + { + "epoch": 7.839664804469273, + "grad_norm": 0.42220550775527954, + "learning_rate": 0.00061, + "loss": 0.4151, + "step": 14033 + }, + { + "epoch": 7.840223463687151, + "grad_norm": 0.44479402899742126, + "learning_rate": 0.0006099719887955182, + "loss": 0.3601, + "step": 14034 + }, + { + "epoch": 7.840782122905028, + "grad_norm": 0.44729384779930115, + "learning_rate": 0.0006099439775910365, + "loss": 0.5094, + "step": 14035 + }, + { + "epoch": 7.841340782122905, + "grad_norm": 0.6608854532241821, + "learning_rate": 0.0006099159663865547, + "loss": 0.5859, + "step": 14036 + }, + { + "epoch": 7.841899441340782, + "grad_norm": 0.41493016481399536, + "learning_rate": 0.0006098879551820728, + "loss": 0.4069, + "step": 14037 + }, + { + "epoch": 7.842458100558659, + "grad_norm": 0.38937169313430786, + "learning_rate": 0.000609859943977591, + "loss": 0.3618, + "step": 14038 + }, + { + "epoch": 7.843016759776536, + "grad_norm": 0.5008031129837036, + "learning_rate": 0.0006098319327731092, + "loss": 0.3578, + "step": 14039 + }, + { + "epoch": 7.843575418994414, + "grad_norm": 0.4291411340236664, + "learning_rate": 0.0006098039215686275, + "loss": 0.3225, + "step": 14040 + }, + { + "epoch": 7.84413407821229, + "grad_norm": 0.5514256358146667, + "learning_rate": 0.0006097759103641457, + "loss": 0.4761, + "step": 14041 + }, + { + "epoch": 7.844692737430168, + "grad_norm": 0.37141722440719604, + "learning_rate": 0.0006097478991596639, + "loss": 0.3846, + "step": 14042 + }, + { + "epoch": 7.845251396648044, + "grad_norm": 0.5600492358207703, + "learning_rate": 0.000609719887955182, + "loss": 0.3408, + "step": 14043 + }, + { + "epoch": 7.845810055865922, + "grad_norm": 3.491687536239624, + "learning_rate": 0.0006096918767507002, + "loss": 0.4988, + "step": 14044 + }, + { + "epoch": 7.846368715083798, + "grad_norm": 0.4222472906112671, + "learning_rate": 0.0006096638655462186, + "loss": 0.3946, + "step": 14045 + }, + { + "epoch": 7.846927374301676, + "grad_norm": 0.3901357650756836, + "learning_rate": 0.0006096358543417368, + "loss": 0.4548, + "step": 14046 + }, + { + "epoch": 7.847486033519553, + "grad_norm": 0.33597564697265625, + "learning_rate": 0.000609607843137255, + "loss": 0.37, + "step": 14047 + }, + { + "epoch": 7.84804469273743, + "grad_norm": 0.4221252501010895, + "learning_rate": 0.0006095798319327731, + "loss": 0.3616, + "step": 14048 + }, + { + "epoch": 7.848603351955307, + "grad_norm": 2.5202057361602783, + "learning_rate": 0.0006095518207282913, + "loss": 0.4123, + "step": 14049 + }, + { + "epoch": 7.849162011173185, + "grad_norm": 0.458539217710495, + "learning_rate": 0.0006095238095238096, + "loss": 0.3724, + "step": 14050 + }, + { + "epoch": 7.849720670391061, + "grad_norm": 0.4702228307723999, + "learning_rate": 0.0006094957983193278, + "loss": 0.3862, + "step": 14051 + }, + { + "epoch": 7.850279329608939, + "grad_norm": 0.4447256624698639, + "learning_rate": 0.000609467787114846, + "loss": 0.464, + "step": 14052 + }, + { + "epoch": 7.850837988826815, + "grad_norm": 0.46216046810150146, + "learning_rate": 0.0006094397759103641, + "loss": 0.5121, + "step": 14053 + }, + { + "epoch": 7.851396648044693, + "grad_norm": 2.6394336223602295, + "learning_rate": 0.0006094117647058823, + "loss": 0.368, + "step": 14054 + }, + { + "epoch": 7.851955307262569, + "grad_norm": 1.201012134552002, + "learning_rate": 0.0006093837535014006, + "loss": 0.3659, + "step": 14055 + }, + { + "epoch": 7.852513966480447, + "grad_norm": 0.4681328237056732, + "learning_rate": 0.0006093557422969188, + "loss": 0.4, + "step": 14056 + }, + { + "epoch": 7.853072625698324, + "grad_norm": 2.520373582839966, + "learning_rate": 0.000609327731092437, + "loss": 0.6831, + "step": 14057 + }, + { + "epoch": 7.853631284916201, + "grad_norm": 0.8120802640914917, + "learning_rate": 0.0006092997198879552, + "loss": 0.4608, + "step": 14058 + }, + { + "epoch": 7.854189944134078, + "grad_norm": 0.6128634810447693, + "learning_rate": 0.0006092717086834733, + "loss": 0.3924, + "step": 14059 + }, + { + "epoch": 7.854748603351956, + "grad_norm": 0.5131211876869202, + "learning_rate": 0.0006092436974789915, + "loss": 0.4283, + "step": 14060 + }, + { + "epoch": 7.855307262569832, + "grad_norm": 0.5298057794570923, + "learning_rate": 0.0006092156862745098, + "loss": 0.5154, + "step": 14061 + }, + { + "epoch": 7.85586592178771, + "grad_norm": 0.5277855396270752, + "learning_rate": 0.0006091876750700281, + "loss": 0.4556, + "step": 14062 + }, + { + "epoch": 7.856424581005586, + "grad_norm": 1.4483544826507568, + "learning_rate": 0.0006091596638655463, + "loss": 0.5639, + "step": 14063 + }, + { + "epoch": 7.856983240223464, + "grad_norm": 0.6038522720336914, + "learning_rate": 0.0006091316526610644, + "loss": 0.4247, + "step": 14064 + }, + { + "epoch": 7.85754189944134, + "grad_norm": 0.4921419024467468, + "learning_rate": 0.0006091036414565826, + "loss": 0.4428, + "step": 14065 + }, + { + "epoch": 7.858100558659218, + "grad_norm": 0.6876301169395447, + "learning_rate": 0.0006090756302521009, + "loss": 0.4493, + "step": 14066 + }, + { + "epoch": 7.858659217877095, + "grad_norm": 0.5611335039138794, + "learning_rate": 0.0006090476190476191, + "loss": 0.4226, + "step": 14067 + }, + { + "epoch": 7.859217877094972, + "grad_norm": 0.6593359708786011, + "learning_rate": 0.0006090196078431373, + "loss": 0.6283, + "step": 14068 + }, + { + "epoch": 7.859776536312849, + "grad_norm": 0.45986607670783997, + "learning_rate": 0.0006089915966386554, + "loss": 0.3943, + "step": 14069 + }, + { + "epoch": 7.860335195530726, + "grad_norm": 0.5802178382873535, + "learning_rate": 0.0006089635854341736, + "loss": 0.4123, + "step": 14070 + }, + { + "epoch": 7.860893854748603, + "grad_norm": 1.6869314908981323, + "learning_rate": 0.0006089355742296919, + "loss": 0.4492, + "step": 14071 + }, + { + "epoch": 7.861452513966481, + "grad_norm": 1.0273398160934448, + "learning_rate": 0.0006089075630252101, + "loss": 0.4047, + "step": 14072 + }, + { + "epoch": 7.862011173184357, + "grad_norm": 0.6036380529403687, + "learning_rate": 0.0006088795518207283, + "loss": 0.5452, + "step": 14073 + }, + { + "epoch": 7.862569832402235, + "grad_norm": 0.590654730796814, + "learning_rate": 0.0006088515406162465, + "loss": 0.4417, + "step": 14074 + }, + { + "epoch": 7.863128491620111, + "grad_norm": 0.5519625544548035, + "learning_rate": 0.0006088235294117646, + "loss": 0.4764, + "step": 14075 + }, + { + "epoch": 7.863687150837989, + "grad_norm": 1.196094036102295, + "learning_rate": 0.0006087955182072829, + "loss": 0.4474, + "step": 14076 + }, + { + "epoch": 7.864245810055866, + "grad_norm": 0.5589098930358887, + "learning_rate": 0.0006087675070028011, + "loss": 0.4415, + "step": 14077 + }, + { + "epoch": 7.864804469273743, + "grad_norm": 0.4926721155643463, + "learning_rate": 0.0006087394957983194, + "loss": 0.4017, + "step": 14078 + }, + { + "epoch": 7.86536312849162, + "grad_norm": 0.42424827814102173, + "learning_rate": 0.0006087114845938376, + "loss": 0.4248, + "step": 14079 + }, + { + "epoch": 7.865921787709497, + "grad_norm": 0.4860125780105591, + "learning_rate": 0.0006086834733893557, + "loss": 0.4152, + "step": 14080 + }, + { + "epoch": 7.866480446927374, + "grad_norm": 0.6805820465087891, + "learning_rate": 0.000608655462184874, + "loss": 0.5482, + "step": 14081 + }, + { + "epoch": 7.867039106145251, + "grad_norm": 0.4036625921726227, + "learning_rate": 0.0006086274509803922, + "loss": 0.3668, + "step": 14082 + }, + { + "epoch": 7.867597765363128, + "grad_norm": 0.45382872223854065, + "learning_rate": 0.0006085994397759104, + "loss": 0.4688, + "step": 14083 + }, + { + "epoch": 7.868156424581006, + "grad_norm": 0.5808756351470947, + "learning_rate": 0.0006085714285714286, + "loss": 0.5037, + "step": 14084 + }, + { + "epoch": 7.868715083798882, + "grad_norm": 1.0217854976654053, + "learning_rate": 0.0006085434173669467, + "loss": 0.474, + "step": 14085 + }, + { + "epoch": 7.86927374301676, + "grad_norm": 0.4961625933647156, + "learning_rate": 0.000608515406162465, + "loss": 0.5245, + "step": 14086 + }, + { + "epoch": 7.869832402234637, + "grad_norm": 0.3712250590324402, + "learning_rate": 0.0006084873949579832, + "loss": 0.4348, + "step": 14087 + }, + { + "epoch": 7.870391061452514, + "grad_norm": 0.39424946904182434, + "learning_rate": 0.0006084593837535014, + "loss": 0.3369, + "step": 14088 + }, + { + "epoch": 7.870949720670391, + "grad_norm": 0.496334433555603, + "learning_rate": 0.0006084313725490196, + "loss": 0.4656, + "step": 14089 + }, + { + "epoch": 7.871508379888268, + "grad_norm": 0.6619940996170044, + "learning_rate": 0.0006084033613445378, + "loss": 0.469, + "step": 14090 + }, + { + "epoch": 7.872067039106145, + "grad_norm": 0.563731849193573, + "learning_rate": 0.000608375350140056, + "loss": 0.5851, + "step": 14091 + }, + { + "epoch": 7.872625698324022, + "grad_norm": 0.36841338872909546, + "learning_rate": 0.0006083473389355742, + "loss": 0.3926, + "step": 14092 + }, + { + "epoch": 7.873184357541899, + "grad_norm": 0.7273062467575073, + "learning_rate": 0.0006083193277310924, + "loss": 0.4798, + "step": 14093 + }, + { + "epoch": 7.873743016759777, + "grad_norm": 1.2027250528335571, + "learning_rate": 0.0006082913165266106, + "loss": 0.3554, + "step": 14094 + }, + { + "epoch": 7.874301675977653, + "grad_norm": 0.4926111102104187, + "learning_rate": 0.0006082633053221289, + "loss": 0.423, + "step": 14095 + }, + { + "epoch": 7.874860335195531, + "grad_norm": 0.4224153459072113, + "learning_rate": 0.0006082352941176471, + "loss": 0.3501, + "step": 14096 + }, + { + "epoch": 7.875418994413408, + "grad_norm": 0.9446578621864319, + "learning_rate": 0.0006082072829131653, + "loss": 0.4769, + "step": 14097 + }, + { + "epoch": 7.875977653631285, + "grad_norm": 0.9632366299629211, + "learning_rate": 0.0006081792717086835, + "loss": 0.4178, + "step": 14098 + }, + { + "epoch": 7.876536312849162, + "grad_norm": 0.6089878082275391, + "learning_rate": 0.0006081512605042017, + "loss": 0.383, + "step": 14099 + }, + { + "epoch": 7.877094972067039, + "grad_norm": 0.4659743905067444, + "learning_rate": 0.0006081232492997199, + "loss": 0.4408, + "step": 14100 + }, + { + "epoch": 7.877653631284916, + "grad_norm": 0.6155918836593628, + "learning_rate": 0.0006080952380952382, + "loss": 0.5053, + "step": 14101 + }, + { + "epoch": 7.878212290502793, + "grad_norm": 0.595474898815155, + "learning_rate": 0.0006080672268907563, + "loss": 0.3606, + "step": 14102 + }, + { + "epoch": 7.87877094972067, + "grad_norm": 0.5520682334899902, + "learning_rate": 0.0006080392156862745, + "loss": 0.5041, + "step": 14103 + }, + { + "epoch": 7.879329608938548, + "grad_norm": 0.4283052980899811, + "learning_rate": 0.0006080112044817927, + "loss": 0.306, + "step": 14104 + }, + { + "epoch": 7.879888268156424, + "grad_norm": 0.5950391292572021, + "learning_rate": 0.0006079831932773109, + "loss": 0.4656, + "step": 14105 + }, + { + "epoch": 7.880446927374302, + "grad_norm": 0.37386465072631836, + "learning_rate": 0.0006079551820728292, + "loss": 0.34, + "step": 14106 + }, + { + "epoch": 7.881005586592178, + "grad_norm": 0.45117971301078796, + "learning_rate": 0.0006079271708683473, + "loss": 0.4369, + "step": 14107 + }, + { + "epoch": 7.881564245810056, + "grad_norm": 0.45627790689468384, + "learning_rate": 0.0006078991596638655, + "loss": 0.5299, + "step": 14108 + }, + { + "epoch": 7.882122905027933, + "grad_norm": 0.5724135041236877, + "learning_rate": 0.0006078711484593837, + "loss": 0.3796, + "step": 14109 + }, + { + "epoch": 7.88268156424581, + "grad_norm": 0.6409525275230408, + "learning_rate": 0.0006078431372549019, + "loss": 0.3968, + "step": 14110 + }, + { + "epoch": 7.883240223463687, + "grad_norm": 0.4467275142669678, + "learning_rate": 0.0006078151260504203, + "loss": 0.4365, + "step": 14111 + }, + { + "epoch": 7.883798882681564, + "grad_norm": 0.6085941791534424, + "learning_rate": 0.0006077871148459384, + "loss": 0.6568, + "step": 14112 + }, + { + "epoch": 7.884357541899441, + "grad_norm": 0.6792378425598145, + "learning_rate": 0.0006077591036414566, + "loss": 0.4271, + "step": 14113 + }, + { + "epoch": 7.884916201117319, + "grad_norm": 0.6143374443054199, + "learning_rate": 0.0006077310924369748, + "loss": 0.4595, + "step": 14114 + }, + { + "epoch": 7.885474860335195, + "grad_norm": 0.5388185977935791, + "learning_rate": 0.000607703081232493, + "loss": 0.5783, + "step": 14115 + }, + { + "epoch": 7.886033519553073, + "grad_norm": 0.6613753437995911, + "learning_rate": 0.0006076750700280113, + "loss": 0.5576, + "step": 14116 + }, + { + "epoch": 7.886592178770949, + "grad_norm": 0.77463299036026, + "learning_rate": 0.0006076470588235295, + "loss": 0.4237, + "step": 14117 + }, + { + "epoch": 7.887150837988827, + "grad_norm": 0.5430334210395813, + "learning_rate": 0.0006076190476190476, + "loss": 0.3825, + "step": 14118 + }, + { + "epoch": 7.8877094972067034, + "grad_norm": 0.46947580575942993, + "learning_rate": 0.0006075910364145658, + "loss": 0.4381, + "step": 14119 + }, + { + "epoch": 7.888268156424581, + "grad_norm": 0.44095292687416077, + "learning_rate": 0.000607563025210084, + "loss": 0.394, + "step": 14120 + }, + { + "epoch": 7.888826815642458, + "grad_norm": 3.042280673980713, + "learning_rate": 0.0006075350140056023, + "loss": 0.4902, + "step": 14121 + }, + { + "epoch": 7.889385474860335, + "grad_norm": 0.6561064124107361, + "learning_rate": 0.0006075070028011205, + "loss": 0.4371, + "step": 14122 + }, + { + "epoch": 7.889944134078212, + "grad_norm": 0.41461801528930664, + "learning_rate": 0.0006074789915966386, + "loss": 0.4457, + "step": 14123 + }, + { + "epoch": 7.89050279329609, + "grad_norm": 0.45347532629966736, + "learning_rate": 0.0006074509803921568, + "loss": 0.4344, + "step": 14124 + }, + { + "epoch": 7.891061452513966, + "grad_norm": 1.6163285970687866, + "learning_rate": 0.000607422969187675, + "loss": 0.4911, + "step": 14125 + }, + { + "epoch": 7.891620111731844, + "grad_norm": 0.5072400569915771, + "learning_rate": 0.0006073949579831933, + "loss": 0.4782, + "step": 14126 + }, + { + "epoch": 7.89217877094972, + "grad_norm": 0.6784650683403015, + "learning_rate": 0.0006073669467787116, + "loss": 0.5995, + "step": 14127 + }, + { + "epoch": 7.892737430167598, + "grad_norm": 0.5433607697486877, + "learning_rate": 0.0006073389355742297, + "loss": 0.4732, + "step": 14128 + }, + { + "epoch": 7.8932960893854744, + "grad_norm": 1.1077600717544556, + "learning_rate": 0.0006073109243697479, + "loss": 0.4915, + "step": 14129 + }, + { + "epoch": 7.893854748603352, + "grad_norm": 0.4496769905090332, + "learning_rate": 0.0006072829131652661, + "loss": 0.4551, + "step": 14130 + }, + { + "epoch": 7.894413407821229, + "grad_norm": 1.2492722272872925, + "learning_rate": 0.0006072549019607844, + "loss": 0.4537, + "step": 14131 + }, + { + "epoch": 7.894972067039106, + "grad_norm": 0.4690402150154114, + "learning_rate": 0.0006072268907563026, + "loss": 0.5178, + "step": 14132 + }, + { + "epoch": 7.895530726256983, + "grad_norm": 0.6281135082244873, + "learning_rate": 0.0006071988795518208, + "loss": 0.4628, + "step": 14133 + }, + { + "epoch": 7.896089385474861, + "grad_norm": 1.967771053314209, + "learning_rate": 0.0006071708683473389, + "loss": 0.4882, + "step": 14134 + }, + { + "epoch": 7.896648044692737, + "grad_norm": 0.5978216528892517, + "learning_rate": 0.0006071428571428571, + "loss": 0.5313, + "step": 14135 + }, + { + "epoch": 7.897206703910615, + "grad_norm": 0.6420818567276001, + "learning_rate": 0.0006071148459383754, + "loss": 0.4625, + "step": 14136 + }, + { + "epoch": 7.897765363128491, + "grad_norm": 0.8040462732315063, + "learning_rate": 0.0006070868347338936, + "loss": 0.5419, + "step": 14137 + }, + { + "epoch": 7.898324022346369, + "grad_norm": 0.670534074306488, + "learning_rate": 0.0006070588235294118, + "loss": 0.4328, + "step": 14138 + }, + { + "epoch": 7.8988826815642454, + "grad_norm": 0.5469644665718079, + "learning_rate": 0.0006070308123249299, + "loss": 0.3776, + "step": 14139 + }, + { + "epoch": 7.899441340782123, + "grad_norm": 0.35390475392341614, + "learning_rate": 0.0006070028011204481, + "loss": 0.3148, + "step": 14140 + }, + { + "epoch": 7.9, + "grad_norm": 0.35059481859207153, + "learning_rate": 0.0006069747899159664, + "loss": 0.3497, + "step": 14141 + }, + { + "epoch": 7.900558659217877, + "grad_norm": 0.5717032551765442, + "learning_rate": 0.0006069467787114846, + "loss": 0.3873, + "step": 14142 + }, + { + "epoch": 7.901117318435754, + "grad_norm": 0.9199079275131226, + "learning_rate": 0.0006069187675070028, + "loss": 0.4667, + "step": 14143 + }, + { + "epoch": 7.901675977653631, + "grad_norm": 0.6967477202415466, + "learning_rate": 0.000606890756302521, + "loss": 0.4552, + "step": 14144 + }, + { + "epoch": 7.902234636871508, + "grad_norm": 0.5415611267089844, + "learning_rate": 0.0006068627450980392, + "loss": 0.4036, + "step": 14145 + }, + { + "epoch": 7.902793296089386, + "grad_norm": 0.5598707795143127, + "learning_rate": 0.0006068347338935575, + "loss": 0.4102, + "step": 14146 + }, + { + "epoch": 7.903351955307262, + "grad_norm": 0.4622749090194702, + "learning_rate": 0.0006068067226890757, + "loss": 0.4958, + "step": 14147 + }, + { + "epoch": 7.90391061452514, + "grad_norm": 1.8067772388458252, + "learning_rate": 0.0006067787114845939, + "loss": 0.6638, + "step": 14148 + }, + { + "epoch": 7.9044692737430164, + "grad_norm": 0.44978487491607666, + "learning_rate": 0.0006067507002801121, + "loss": 0.3963, + "step": 14149 + }, + { + "epoch": 7.905027932960894, + "grad_norm": 0.4657762944698334, + "learning_rate": 0.0006067226890756302, + "loss": 0.3393, + "step": 14150 + }, + { + "epoch": 7.905586592178771, + "grad_norm": 0.7096941471099854, + "learning_rate": 0.0006066946778711485, + "loss": 0.5014, + "step": 14151 + }, + { + "epoch": 7.906145251396648, + "grad_norm": 1.3319052457809448, + "learning_rate": 0.0006066666666666667, + "loss": 0.3297, + "step": 14152 + }, + { + "epoch": 7.906703910614525, + "grad_norm": 0.5790479183197021, + "learning_rate": 0.0006066386554621849, + "loss": 0.4536, + "step": 14153 + }, + { + "epoch": 7.907262569832402, + "grad_norm": 0.4217730760574341, + "learning_rate": 0.0006066106442577031, + "loss": 0.3818, + "step": 14154 + }, + { + "epoch": 7.907821229050279, + "grad_norm": 0.3936649560928345, + "learning_rate": 0.0006065826330532212, + "loss": 0.3547, + "step": 14155 + }, + { + "epoch": 7.908379888268156, + "grad_norm": 0.6060563325881958, + "learning_rate": 0.0006065546218487395, + "loss": 0.4621, + "step": 14156 + }, + { + "epoch": 7.908938547486033, + "grad_norm": 0.5636858940124512, + "learning_rate": 0.0006065266106442577, + "loss": 0.5161, + "step": 14157 + }, + { + "epoch": 7.909497206703911, + "grad_norm": 0.6794993877410889, + "learning_rate": 0.0006064985994397759, + "loss": 0.4597, + "step": 14158 + }, + { + "epoch": 7.910055865921787, + "grad_norm": 0.7797375321388245, + "learning_rate": 0.0006064705882352941, + "loss": 0.3909, + "step": 14159 + }, + { + "epoch": 7.910614525139665, + "grad_norm": 0.535366415977478, + "learning_rate": 0.0006064425770308122, + "loss": 0.4556, + "step": 14160 + }, + { + "epoch": 7.911173184357542, + "grad_norm": 0.45064032077789307, + "learning_rate": 0.0006064145658263306, + "loss": 0.4461, + "step": 14161 + }, + { + "epoch": 7.911731843575419, + "grad_norm": 0.3856082260608673, + "learning_rate": 0.0006063865546218488, + "loss": 0.4205, + "step": 14162 + }, + { + "epoch": 7.912290502793296, + "grad_norm": 0.879021942615509, + "learning_rate": 0.000606358543417367, + "loss": 0.3695, + "step": 14163 + }, + { + "epoch": 7.912849162011173, + "grad_norm": 1.0510387420654297, + "learning_rate": 0.0006063305322128852, + "loss": 0.489, + "step": 14164 + }, + { + "epoch": 7.91340782122905, + "grad_norm": 1.6861861944198608, + "learning_rate": 0.0006063025210084034, + "loss": 0.459, + "step": 14165 + }, + { + "epoch": 7.913966480446927, + "grad_norm": 0.4760240316390991, + "learning_rate": 0.0006062745098039216, + "loss": 0.4776, + "step": 14166 + }, + { + "epoch": 7.914525139664804, + "grad_norm": 0.5942274928092957, + "learning_rate": 0.0006062464985994398, + "loss": 0.4878, + "step": 14167 + }, + { + "epoch": 7.915083798882682, + "grad_norm": 0.5481272339820862, + "learning_rate": 0.000606218487394958, + "loss": 0.5073, + "step": 14168 + }, + { + "epoch": 7.915642458100558, + "grad_norm": 0.39387619495391846, + "learning_rate": 0.0006061904761904762, + "loss": 0.4309, + "step": 14169 + }, + { + "epoch": 7.916201117318436, + "grad_norm": 0.6636371612548828, + "learning_rate": 0.0006061624649859944, + "loss": 0.4852, + "step": 14170 + }, + { + "epoch": 7.9167597765363125, + "grad_norm": 0.4540569484233856, + "learning_rate": 0.0006061344537815126, + "loss": 0.4822, + "step": 14171 + }, + { + "epoch": 7.91731843575419, + "grad_norm": 4.315849781036377, + "learning_rate": 0.0006061064425770308, + "loss": 0.4356, + "step": 14172 + }, + { + "epoch": 7.917877094972067, + "grad_norm": 2.6544439792633057, + "learning_rate": 0.000606078431372549, + "loss": 0.4241, + "step": 14173 + }, + { + "epoch": 7.918435754189944, + "grad_norm": 0.6716331243515015, + "learning_rate": 0.0006060504201680672, + "loss": 0.4079, + "step": 14174 + }, + { + "epoch": 7.918994413407821, + "grad_norm": 0.6275873780250549, + "learning_rate": 0.0006060224089635854, + "loss": 0.4237, + "step": 14175 + }, + { + "epoch": 7.919553072625698, + "grad_norm": 0.668163537979126, + "learning_rate": 0.0006059943977591036, + "loss": 0.5494, + "step": 14176 + }, + { + "epoch": 7.920111731843575, + "grad_norm": 0.4181583821773529, + "learning_rate": 0.0006059663865546219, + "loss": 0.452, + "step": 14177 + }, + { + "epoch": 7.920670391061453, + "grad_norm": 0.6004208922386169, + "learning_rate": 0.0006059383753501401, + "loss": 0.4012, + "step": 14178 + }, + { + "epoch": 7.921229050279329, + "grad_norm": 0.5395012497901917, + "learning_rate": 0.0006059103641456583, + "loss": 0.3843, + "step": 14179 + }, + { + "epoch": 7.921787709497207, + "grad_norm": 0.5836853384971619, + "learning_rate": 0.0006058823529411765, + "loss": 0.4605, + "step": 14180 + }, + { + "epoch": 7.9223463687150835, + "grad_norm": 0.8164491653442383, + "learning_rate": 0.0006058543417366948, + "loss": 0.491, + "step": 14181 + }, + { + "epoch": 7.922905027932961, + "grad_norm": 0.6076754331588745, + "learning_rate": 0.0006058263305322129, + "loss": 0.4937, + "step": 14182 + }, + { + "epoch": 7.923463687150838, + "grad_norm": 0.8746344447135925, + "learning_rate": 0.0006057983193277311, + "loss": 0.4295, + "step": 14183 + }, + { + "epoch": 7.924022346368715, + "grad_norm": 0.43245530128479004, + "learning_rate": 0.0006057703081232493, + "loss": 0.3798, + "step": 14184 + }, + { + "epoch": 7.924581005586592, + "grad_norm": 0.4194217026233673, + "learning_rate": 0.0006057422969187675, + "loss": 0.2922, + "step": 14185 + }, + { + "epoch": 7.925139664804469, + "grad_norm": 0.45826587080955505, + "learning_rate": 0.0006057142857142858, + "loss": 0.3696, + "step": 14186 + }, + { + "epoch": 7.925698324022346, + "grad_norm": 0.579105019569397, + "learning_rate": 0.0006056862745098039, + "loss": 0.3938, + "step": 14187 + }, + { + "epoch": 7.926256983240224, + "grad_norm": 0.44036081433296204, + "learning_rate": 0.0006056582633053221, + "loss": 0.3296, + "step": 14188 + }, + { + "epoch": 7.9268156424581, + "grad_norm": 0.7250059247016907, + "learning_rate": 0.0006056302521008403, + "loss": 0.5079, + "step": 14189 + }, + { + "epoch": 7.927374301675978, + "grad_norm": 1.0376495122909546, + "learning_rate": 0.0006056022408963585, + "loss": 0.5475, + "step": 14190 + }, + { + "epoch": 7.9279329608938545, + "grad_norm": 0.47896936535835266, + "learning_rate": 0.0006055742296918768, + "loss": 0.5369, + "step": 14191 + }, + { + "epoch": 7.928491620111732, + "grad_norm": 1.2012934684753418, + "learning_rate": 0.0006055462184873949, + "loss": 0.421, + "step": 14192 + }, + { + "epoch": 7.9290502793296085, + "grad_norm": 0.976495087146759, + "learning_rate": 0.0006055182072829131, + "loss": 0.5422, + "step": 14193 + }, + { + "epoch": 7.929608938547486, + "grad_norm": 0.8057586550712585, + "learning_rate": 0.0006054901960784314, + "loss": 0.4739, + "step": 14194 + }, + { + "epoch": 7.930167597765363, + "grad_norm": 0.6573531031608582, + "learning_rate": 0.0006054621848739496, + "loss": 0.4128, + "step": 14195 + }, + { + "epoch": 7.93072625698324, + "grad_norm": 0.6205860376358032, + "learning_rate": 0.0006054341736694679, + "loss": 0.4637, + "step": 14196 + }, + { + "epoch": 7.931284916201117, + "grad_norm": 0.5603929162025452, + "learning_rate": 0.0006054061624649861, + "loss": 0.5366, + "step": 14197 + }, + { + "epoch": 7.931843575418995, + "grad_norm": 0.6897447109222412, + "learning_rate": 0.0006053781512605042, + "loss": 0.5378, + "step": 14198 + }, + { + "epoch": 7.932402234636871, + "grad_norm": 0.45130735635757446, + "learning_rate": 0.0006053501400560224, + "loss": 0.5279, + "step": 14199 + }, + { + "epoch": 7.932960893854749, + "grad_norm": 1.1561640501022339, + "learning_rate": 0.0006053221288515406, + "loss": 0.5776, + "step": 14200 + }, + { + "epoch": 7.9335195530726255, + "grad_norm": 0.5011961460113525, + "learning_rate": 0.0006052941176470589, + "loss": 0.4582, + "step": 14201 + }, + { + "epoch": 7.934078212290503, + "grad_norm": 0.4498681128025055, + "learning_rate": 0.0006052661064425771, + "loss": 0.4753, + "step": 14202 + }, + { + "epoch": 7.9346368715083795, + "grad_norm": 0.7079980373382568, + "learning_rate": 0.0006052380952380952, + "loss": 0.4491, + "step": 14203 + }, + { + "epoch": 7.935195530726257, + "grad_norm": 0.5874936580657959, + "learning_rate": 0.0006052100840336134, + "loss": 0.5496, + "step": 14204 + }, + { + "epoch": 7.935754189944134, + "grad_norm": 0.794951856136322, + "learning_rate": 0.0006051820728291316, + "loss": 0.5788, + "step": 14205 + }, + { + "epoch": 7.936312849162011, + "grad_norm": 0.5293338894844055, + "learning_rate": 0.0006051540616246499, + "loss": 0.4832, + "step": 14206 + }, + { + "epoch": 7.936871508379888, + "grad_norm": 0.5555122494697571, + "learning_rate": 0.0006051260504201681, + "loss": 0.4358, + "step": 14207 + }, + { + "epoch": 7.937430167597765, + "grad_norm": 0.4400259256362915, + "learning_rate": 0.0006050980392156862, + "loss": 0.5594, + "step": 14208 + }, + { + "epoch": 7.937988826815642, + "grad_norm": 0.5891273021697998, + "learning_rate": 0.0006050700280112044, + "loss": 0.3796, + "step": 14209 + }, + { + "epoch": 7.93854748603352, + "grad_norm": 0.5500006079673767, + "learning_rate": 0.0006050420168067227, + "loss": 0.4296, + "step": 14210 + }, + { + "epoch": 7.9391061452513965, + "grad_norm": 0.4392663538455963, + "learning_rate": 0.000605014005602241, + "loss": 0.4465, + "step": 14211 + }, + { + "epoch": 7.939664804469274, + "grad_norm": 0.5625128746032715, + "learning_rate": 0.0006049859943977592, + "loss": 0.5645, + "step": 14212 + }, + { + "epoch": 7.9402234636871505, + "grad_norm": 0.715496301651001, + "learning_rate": 0.0006049579831932774, + "loss": 0.5525, + "step": 14213 + }, + { + "epoch": 7.940782122905028, + "grad_norm": 1.6878738403320312, + "learning_rate": 0.0006049299719887955, + "loss": 0.4062, + "step": 14214 + }, + { + "epoch": 7.941340782122905, + "grad_norm": 0.39091014862060547, + "learning_rate": 0.0006049019607843137, + "loss": 0.4523, + "step": 14215 + }, + { + "epoch": 7.941899441340782, + "grad_norm": 0.45580625534057617, + "learning_rate": 0.000604873949579832, + "loss": 0.3868, + "step": 14216 + }, + { + "epoch": 7.942458100558659, + "grad_norm": 0.5214827656745911, + "learning_rate": 0.0006048459383753502, + "loss": 0.4109, + "step": 14217 + }, + { + "epoch": 7.943016759776536, + "grad_norm": 0.73016357421875, + "learning_rate": 0.0006048179271708684, + "loss": 0.5033, + "step": 14218 + }, + { + "epoch": 7.943575418994413, + "grad_norm": 0.36870265007019043, + "learning_rate": 0.0006047899159663865, + "loss": 0.4019, + "step": 14219 + }, + { + "epoch": 7.94413407821229, + "grad_norm": 0.4197850525379181, + "learning_rate": 0.0006047619047619047, + "loss": 0.448, + "step": 14220 + }, + { + "epoch": 7.9446927374301675, + "grad_norm": 0.4646594226360321, + "learning_rate": 0.000604733893557423, + "loss": 0.4483, + "step": 14221 + }, + { + "epoch": 7.945251396648045, + "grad_norm": 0.40932217240333557, + "learning_rate": 0.0006047058823529412, + "loss": 0.4873, + "step": 14222 + }, + { + "epoch": 7.9458100558659215, + "grad_norm": 0.4788350760936737, + "learning_rate": 0.0006046778711484594, + "loss": 0.4524, + "step": 14223 + }, + { + "epoch": 7.946368715083799, + "grad_norm": 0.620957612991333, + "learning_rate": 0.0006046498599439775, + "loss": 0.6384, + "step": 14224 + }, + { + "epoch": 7.946927374301676, + "grad_norm": 0.5223733186721802, + "learning_rate": 0.0006046218487394957, + "loss": 0.5105, + "step": 14225 + }, + { + "epoch": 7.947486033519553, + "grad_norm": 0.5959054827690125, + "learning_rate": 0.0006045938375350141, + "loss": 0.45, + "step": 14226 + }, + { + "epoch": 7.94804469273743, + "grad_norm": 0.40770187973976135, + "learning_rate": 0.0006045658263305323, + "loss": 0.4236, + "step": 14227 + }, + { + "epoch": 7.948603351955307, + "grad_norm": 0.7343426942825317, + "learning_rate": 0.0006045378151260505, + "loss": 0.4794, + "step": 14228 + }, + { + "epoch": 7.949162011173184, + "grad_norm": 0.9903770685195923, + "learning_rate": 0.0006045098039215687, + "loss": 0.4373, + "step": 14229 + }, + { + "epoch": 7.949720670391061, + "grad_norm": 0.4142523407936096, + "learning_rate": 0.0006044817927170868, + "loss": 0.3739, + "step": 14230 + }, + { + "epoch": 7.9502793296089385, + "grad_norm": 4.297020435333252, + "learning_rate": 0.0006044537815126051, + "loss": 0.4263, + "step": 14231 + }, + { + "epoch": 7.950837988826816, + "grad_norm": 0.7485659718513489, + "learning_rate": 0.0006044257703081233, + "loss": 0.4112, + "step": 14232 + }, + { + "epoch": 7.9513966480446925, + "grad_norm": 0.9614757299423218, + "learning_rate": 0.0006043977591036415, + "loss": 0.4796, + "step": 14233 + }, + { + "epoch": 7.95195530726257, + "grad_norm": 1.32154381275177, + "learning_rate": 0.0006043697478991597, + "loss": 0.4991, + "step": 14234 + }, + { + "epoch": 7.952513966480447, + "grad_norm": 0.4387473165988922, + "learning_rate": 0.0006043417366946778, + "loss": 0.4062, + "step": 14235 + }, + { + "epoch": 7.953072625698324, + "grad_norm": 0.6479970812797546, + "learning_rate": 0.0006043137254901961, + "loss": 0.3529, + "step": 14236 + }, + { + "epoch": 7.953631284916201, + "grad_norm": 0.3849358856678009, + "learning_rate": 0.0006042857142857143, + "loss": 0.4237, + "step": 14237 + }, + { + "epoch": 7.954189944134078, + "grad_norm": 0.5455474853515625, + "learning_rate": 0.0006042577030812325, + "loss": 0.3911, + "step": 14238 + }, + { + "epoch": 7.954748603351955, + "grad_norm": 0.43311941623687744, + "learning_rate": 0.0006042296918767507, + "loss": 0.3977, + "step": 14239 + }, + { + "epoch": 7.955307262569832, + "grad_norm": 1.1351542472839355, + "learning_rate": 0.0006042016806722688, + "loss": 0.3528, + "step": 14240 + }, + { + "epoch": 7.9558659217877095, + "grad_norm": 2.5995676517486572, + "learning_rate": 0.0006041736694677871, + "loss": 0.4128, + "step": 14241 + }, + { + "epoch": 7.956424581005587, + "grad_norm": 1.4863731861114502, + "learning_rate": 0.0006041456582633054, + "loss": 0.4887, + "step": 14242 + }, + { + "epoch": 7.9569832402234635, + "grad_norm": 0.34584298729896545, + "learning_rate": 0.0006041176470588236, + "loss": 0.401, + "step": 14243 + }, + { + "epoch": 7.957541899441341, + "grad_norm": 1.1386308670043945, + "learning_rate": 0.0006040896358543418, + "loss": 0.5095, + "step": 14244 + }, + { + "epoch": 7.9581005586592175, + "grad_norm": 0.5187742114067078, + "learning_rate": 0.00060406162464986, + "loss": 0.5606, + "step": 14245 + }, + { + "epoch": 7.958659217877095, + "grad_norm": 0.646282970905304, + "learning_rate": 0.0006040336134453782, + "loss": 0.4379, + "step": 14246 + }, + { + "epoch": 7.959217877094972, + "grad_norm": 0.39817845821380615, + "learning_rate": 0.0006040056022408964, + "loss": 0.4476, + "step": 14247 + }, + { + "epoch": 7.959776536312849, + "grad_norm": 1.0522841215133667, + "learning_rate": 0.0006039775910364146, + "loss": 0.6138, + "step": 14248 + }, + { + "epoch": 7.960335195530726, + "grad_norm": 0.3197142779827118, + "learning_rate": 0.0006039495798319328, + "loss": 0.3769, + "step": 14249 + }, + { + "epoch": 7.960893854748603, + "grad_norm": 0.913466215133667, + "learning_rate": 0.000603921568627451, + "loss": 0.4181, + "step": 14250 + }, + { + "epoch": 7.9614525139664805, + "grad_norm": 0.5283289551734924, + "learning_rate": 0.0006038935574229692, + "loss": 0.4581, + "step": 14251 + }, + { + "epoch": 7.962011173184358, + "grad_norm": 4.923588752746582, + "learning_rate": 0.0006038655462184874, + "loss": 0.6446, + "step": 14252 + }, + { + "epoch": 7.9625698324022345, + "grad_norm": 0.9310001134872437, + "learning_rate": 0.0006038375350140056, + "loss": 0.4058, + "step": 14253 + }, + { + "epoch": 7.963128491620112, + "grad_norm": 0.7428258657455444, + "learning_rate": 0.0006038095238095238, + "loss": 0.4027, + "step": 14254 + }, + { + "epoch": 7.9636871508379885, + "grad_norm": 0.82972252368927, + "learning_rate": 0.000603781512605042, + "loss": 0.5034, + "step": 14255 + }, + { + "epoch": 7.964245810055866, + "grad_norm": 1.3849762678146362, + "learning_rate": 0.0006037535014005602, + "loss": 0.5053, + "step": 14256 + }, + { + "epoch": 7.9648044692737425, + "grad_norm": 0.43868687748908997, + "learning_rate": 0.0006037254901960784, + "loss": 0.4129, + "step": 14257 + }, + { + "epoch": 7.96536312849162, + "grad_norm": 0.5154237151145935, + "learning_rate": 0.0006036974789915966, + "loss": 0.4137, + "step": 14258 + }, + { + "epoch": 7.965921787709497, + "grad_norm": 0.43162018060684204, + "learning_rate": 0.0006036694677871149, + "loss": 0.3895, + "step": 14259 + }, + { + "epoch": 7.966480446927374, + "grad_norm": 0.6027476787567139, + "learning_rate": 0.0006036414565826331, + "loss": 0.3962, + "step": 14260 + }, + { + "epoch": 7.9670391061452515, + "grad_norm": 0.6836234927177429, + "learning_rate": 0.0006036134453781514, + "loss": 0.5137, + "step": 14261 + }, + { + "epoch": 7.967597765363129, + "grad_norm": 0.4649446904659271, + "learning_rate": 0.0006035854341736695, + "loss": 0.3996, + "step": 14262 + }, + { + "epoch": 7.9681564245810055, + "grad_norm": 0.5667934417724609, + "learning_rate": 0.0006035574229691877, + "loss": 0.5143, + "step": 14263 + }, + { + "epoch": 7.968715083798883, + "grad_norm": 0.4855731427669525, + "learning_rate": 0.0006035294117647059, + "loss": 0.4689, + "step": 14264 + }, + { + "epoch": 7.9692737430167595, + "grad_norm": 0.5278502106666565, + "learning_rate": 0.0006035014005602241, + "loss": 0.4851, + "step": 14265 + }, + { + "epoch": 7.969832402234637, + "grad_norm": 10.22043514251709, + "learning_rate": 0.0006034733893557424, + "loss": 0.5294, + "step": 14266 + }, + { + "epoch": 7.9703910614525135, + "grad_norm": 0.5031912922859192, + "learning_rate": 0.0006034453781512605, + "loss": 0.4817, + "step": 14267 + }, + { + "epoch": 7.970949720670391, + "grad_norm": 0.985588788986206, + "learning_rate": 0.0006034173669467787, + "loss": 0.4337, + "step": 14268 + }, + { + "epoch": 7.971508379888268, + "grad_norm": 0.5934202075004578, + "learning_rate": 0.0006033893557422969, + "loss": 0.394, + "step": 14269 + }, + { + "epoch": 7.972067039106145, + "grad_norm": 0.39451858401298523, + "learning_rate": 0.0006033613445378151, + "loss": 0.4091, + "step": 14270 + }, + { + "epoch": 7.9726256983240225, + "grad_norm": 0.45145025849342346, + "learning_rate": 0.0006033333333333334, + "loss": 0.4962, + "step": 14271 + }, + { + "epoch": 7.9731843575419, + "grad_norm": 0.5986431837081909, + "learning_rate": 0.0006033053221288515, + "loss": 0.3873, + "step": 14272 + }, + { + "epoch": 7.9737430167597765, + "grad_norm": 0.6096537113189697, + "learning_rate": 0.0006032773109243697, + "loss": 0.4633, + "step": 14273 + }, + { + "epoch": 7.974301675977654, + "grad_norm": 0.8488360047340393, + "learning_rate": 0.0006032492997198879, + "loss": 0.5554, + "step": 14274 + }, + { + "epoch": 7.9748603351955305, + "grad_norm": 0.44510534405708313, + "learning_rate": 0.0006032212885154061, + "loss": 0.4214, + "step": 14275 + }, + { + "epoch": 7.975418994413408, + "grad_norm": 0.35789167881011963, + "learning_rate": 0.0006031932773109245, + "loss": 0.3912, + "step": 14276 + }, + { + "epoch": 7.9759776536312845, + "grad_norm": 0.8667805790901184, + "learning_rate": 0.0006031652661064427, + "loss": 0.4503, + "step": 14277 + }, + { + "epoch": 7.976536312849162, + "grad_norm": 0.4212509095668793, + "learning_rate": 0.0006031372549019608, + "loss": 0.4124, + "step": 14278 + }, + { + "epoch": 7.977094972067039, + "grad_norm": 0.495271772146225, + "learning_rate": 0.000603109243697479, + "loss": 0.4409, + "step": 14279 + }, + { + "epoch": 7.977653631284916, + "grad_norm": 0.5545368194580078, + "learning_rate": 0.0006030812324929972, + "loss": 0.4684, + "step": 14280 + }, + { + "epoch": 7.9782122905027935, + "grad_norm": 0.45595961809158325, + "learning_rate": 0.0006030532212885154, + "loss": 0.5211, + "step": 14281 + }, + { + "epoch": 7.97877094972067, + "grad_norm": 0.46896255016326904, + "learning_rate": 0.0006030252100840337, + "loss": 0.4067, + "step": 14282 + }, + { + "epoch": 7.9793296089385475, + "grad_norm": 4.56844425201416, + "learning_rate": 0.0006029971988795518, + "loss": 0.4198, + "step": 14283 + }, + { + "epoch": 7.979888268156425, + "grad_norm": 0.5166363716125488, + "learning_rate": 0.00060296918767507, + "loss": 0.3801, + "step": 14284 + }, + { + "epoch": 7.9804469273743015, + "grad_norm": 0.4709360897541046, + "learning_rate": 0.0006029411764705882, + "loss": 0.4743, + "step": 14285 + }, + { + "epoch": 7.981005586592179, + "grad_norm": 0.4801344573497772, + "learning_rate": 0.0006029131652661064, + "loss": 0.3977, + "step": 14286 + }, + { + "epoch": 7.9815642458100555, + "grad_norm": 0.5393338799476624, + "learning_rate": 0.0006028851540616247, + "loss": 0.4951, + "step": 14287 + }, + { + "epoch": 7.982122905027933, + "grad_norm": 2.87831974029541, + "learning_rate": 0.0006028571428571428, + "loss": 0.3686, + "step": 14288 + }, + { + "epoch": 7.98268156424581, + "grad_norm": 1.395812749862671, + "learning_rate": 0.000602829131652661, + "loss": 0.4826, + "step": 14289 + }, + { + "epoch": 7.983240223463687, + "grad_norm": 0.5166707038879395, + "learning_rate": 0.0006028011204481792, + "loss": 0.4475, + "step": 14290 + }, + { + "epoch": 7.9837988826815645, + "grad_norm": 0.4548737704753876, + "learning_rate": 0.0006027731092436974, + "loss": 0.3641, + "step": 14291 + }, + { + "epoch": 7.984357541899441, + "grad_norm": 0.4100731313228607, + "learning_rate": 0.0006027450980392158, + "loss": 0.4128, + "step": 14292 + }, + { + "epoch": 7.9849162011173185, + "grad_norm": 2.9874231815338135, + "learning_rate": 0.000602717086834734, + "loss": 0.4089, + "step": 14293 + }, + { + "epoch": 7.985474860335195, + "grad_norm": 0.525781512260437, + "learning_rate": 0.0006026890756302521, + "loss": 0.406, + "step": 14294 + }, + { + "epoch": 7.9860335195530725, + "grad_norm": 1.8277138471603394, + "learning_rate": 0.0006026610644257703, + "loss": 0.8354, + "step": 14295 + }, + { + "epoch": 7.98659217877095, + "grad_norm": 0.5151097774505615, + "learning_rate": 0.0006026330532212885, + "loss": 0.4598, + "step": 14296 + }, + { + "epoch": 7.9871508379888265, + "grad_norm": 0.6688753366470337, + "learning_rate": 0.0006026050420168068, + "loss": 0.5116, + "step": 14297 + }, + { + "epoch": 7.987709497206704, + "grad_norm": 0.40915465354919434, + "learning_rate": 0.000602577030812325, + "loss": 0.2826, + "step": 14298 + }, + { + "epoch": 7.988268156424581, + "grad_norm": 0.6947688460350037, + "learning_rate": 0.0006025490196078431, + "loss": 0.3685, + "step": 14299 + }, + { + "epoch": 7.988826815642458, + "grad_norm": 1.5197031497955322, + "learning_rate": 0.0006025210084033613, + "loss": 0.3787, + "step": 14300 + }, + { + "epoch": 7.9893854748603355, + "grad_norm": 0.38484910130500793, + "learning_rate": 0.0006024929971988795, + "loss": 0.3972, + "step": 14301 + }, + { + "epoch": 7.989944134078212, + "grad_norm": 0.5499008893966675, + "learning_rate": 0.0006024649859943978, + "loss": 0.4426, + "step": 14302 + }, + { + "epoch": 7.9905027932960895, + "grad_norm": 0.48515400290489197, + "learning_rate": 0.000602436974789916, + "loss": 0.4094, + "step": 14303 + }, + { + "epoch": 7.991061452513966, + "grad_norm": 0.5160617828369141, + "learning_rate": 0.0006024089635854341, + "loss": 0.4687, + "step": 14304 + }, + { + "epoch": 7.9916201117318435, + "grad_norm": 0.4319307506084442, + "learning_rate": 0.0006023809523809523, + "loss": 0.4441, + "step": 14305 + }, + { + "epoch": 7.992178770949721, + "grad_norm": 0.6816244721412659, + "learning_rate": 0.0006023529411764705, + "loss": 0.4317, + "step": 14306 + }, + { + "epoch": 7.9927374301675975, + "grad_norm": 1.7799586057662964, + "learning_rate": 0.0006023249299719888, + "loss": 0.4308, + "step": 14307 + }, + { + "epoch": 7.993296089385475, + "grad_norm": 1.1707338094711304, + "learning_rate": 0.0006022969187675071, + "loss": 0.5587, + "step": 14308 + }, + { + "epoch": 7.993854748603352, + "grad_norm": 0.988422691822052, + "learning_rate": 0.0006022689075630253, + "loss": 0.4772, + "step": 14309 + }, + { + "epoch": 7.994413407821229, + "grad_norm": 0.5512896180152893, + "learning_rate": 0.0006022408963585434, + "loss": 0.3473, + "step": 14310 + }, + { + "epoch": 7.9949720670391065, + "grad_norm": 0.7550446391105652, + "learning_rate": 0.0006022128851540616, + "loss": 0.4443, + "step": 14311 + }, + { + "epoch": 7.995530726256983, + "grad_norm": 0.6585484147071838, + "learning_rate": 0.0006021848739495799, + "loss": 0.4696, + "step": 14312 + }, + { + "epoch": 7.9960893854748605, + "grad_norm": 1.5440630912780762, + "learning_rate": 0.0006021568627450981, + "loss": 0.4739, + "step": 14313 + }, + { + "epoch": 7.996648044692737, + "grad_norm": 0.5748957395553589, + "learning_rate": 0.0006021288515406163, + "loss": 0.332, + "step": 14314 + }, + { + "epoch": 7.9972067039106145, + "grad_norm": 0.5756217837333679, + "learning_rate": 0.0006021008403361344, + "loss": 0.5738, + "step": 14315 + }, + { + "epoch": 7.997765363128492, + "grad_norm": 0.38247138261795044, + "learning_rate": 0.0006020728291316526, + "loss": 0.3749, + "step": 14316 + }, + { + "epoch": 7.9983240223463685, + "grad_norm": 0.42976677417755127, + "learning_rate": 0.0006020448179271709, + "loss": 0.4572, + "step": 14317 + }, + { + "epoch": 7.998882681564246, + "grad_norm": 0.5669113397598267, + "learning_rate": 0.0006020168067226891, + "loss": 0.4158, + "step": 14318 + }, + { + "epoch": 7.9994413407821225, + "grad_norm": 0.4263019561767578, + "learning_rate": 0.0006019887955182073, + "loss": 0.4493, + "step": 14319 + }, + { + "epoch": 8.0, + "grad_norm": 0.42987707257270813, + "learning_rate": 0.0006019607843137254, + "loss": 0.3959, + "step": 14320 + }, + { + "epoch": 8.000558659217877, + "grad_norm": 0.4581637680530548, + "learning_rate": 0.0006019327731092436, + "loss": 0.4532, + "step": 14321 + }, + { + "epoch": 8.001117318435755, + "grad_norm": 1.0903503894805908, + "learning_rate": 0.0006019047619047619, + "loss": 0.514, + "step": 14322 + }, + { + "epoch": 8.001675977653631, + "grad_norm": 0.393097460269928, + "learning_rate": 0.0006018767507002801, + "loss": 0.3976, + "step": 14323 + }, + { + "epoch": 8.002234636871508, + "grad_norm": 2.589627981185913, + "learning_rate": 0.0006018487394957984, + "loss": 0.5084, + "step": 14324 + }, + { + "epoch": 8.002793296089385, + "grad_norm": 0.4391290545463562, + "learning_rate": 0.0006018207282913166, + "loss": 0.4712, + "step": 14325 + }, + { + "epoch": 8.003351955307263, + "grad_norm": 0.5736142992973328, + "learning_rate": 0.0006017927170868347, + "loss": 0.3762, + "step": 14326 + }, + { + "epoch": 8.00391061452514, + "grad_norm": 0.49017664790153503, + "learning_rate": 0.000601764705882353, + "loss": 0.5359, + "step": 14327 + }, + { + "epoch": 8.004469273743016, + "grad_norm": 1.2210017442703247, + "learning_rate": 0.0006017366946778712, + "loss": 0.4317, + "step": 14328 + }, + { + "epoch": 8.005027932960894, + "grad_norm": 0.46017375588417053, + "learning_rate": 0.0006017086834733894, + "loss": 0.4174, + "step": 14329 + }, + { + "epoch": 8.005586592178771, + "grad_norm": 0.48255613446235657, + "learning_rate": 0.0006016806722689076, + "loss": 0.4545, + "step": 14330 + }, + { + "epoch": 8.006145251396648, + "grad_norm": 0.4368647336959839, + "learning_rate": 0.0006016526610644257, + "loss": 0.429, + "step": 14331 + }, + { + "epoch": 8.006703910614526, + "grad_norm": 0.5759446024894714, + "learning_rate": 0.000601624649859944, + "loss": 0.4202, + "step": 14332 + }, + { + "epoch": 8.007262569832402, + "grad_norm": 0.4744316637516022, + "learning_rate": 0.0006015966386554622, + "loss": 0.5632, + "step": 14333 + }, + { + "epoch": 8.007821229050279, + "grad_norm": 0.5011651515960693, + "learning_rate": 0.0006015686274509804, + "loss": 0.418, + "step": 14334 + }, + { + "epoch": 8.008379888268156, + "grad_norm": 0.3920082747936249, + "learning_rate": 0.0006015406162464986, + "loss": 0.4656, + "step": 14335 + }, + { + "epoch": 8.008938547486034, + "grad_norm": 0.9925408959388733, + "learning_rate": 0.0006015126050420167, + "loss": 0.4318, + "step": 14336 + }, + { + "epoch": 8.00949720670391, + "grad_norm": 0.3603690266609192, + "learning_rate": 0.000601484593837535, + "loss": 0.368, + "step": 14337 + }, + { + "epoch": 8.010055865921787, + "grad_norm": 2.0020430088043213, + "learning_rate": 0.0006014565826330532, + "loss": 0.4571, + "step": 14338 + }, + { + "epoch": 8.010614525139665, + "grad_norm": 0.6948301792144775, + "learning_rate": 0.0006014285714285714, + "loss": 0.4483, + "step": 14339 + }, + { + "epoch": 8.011173184357542, + "grad_norm": 0.5292946100234985, + "learning_rate": 0.0006014005602240896, + "loss": 0.46, + "step": 14340 + }, + { + "epoch": 8.011731843575419, + "grad_norm": 0.4924343526363373, + "learning_rate": 0.0006013725490196079, + "loss": 0.3763, + "step": 14341 + }, + { + "epoch": 8.012290502793297, + "grad_norm": 0.5706194639205933, + "learning_rate": 0.0006013445378151261, + "loss": 0.2917, + "step": 14342 + }, + { + "epoch": 8.012849162011173, + "grad_norm": 1.1087849140167236, + "learning_rate": 0.0006013165266106443, + "loss": 0.3821, + "step": 14343 + }, + { + "epoch": 8.01340782122905, + "grad_norm": 0.4120514690876007, + "learning_rate": 0.0006012885154061625, + "loss": 0.4046, + "step": 14344 + }, + { + "epoch": 8.013966480446927, + "grad_norm": 0.3571608066558838, + "learning_rate": 0.0006012605042016807, + "loss": 0.3373, + "step": 14345 + }, + { + "epoch": 8.014525139664805, + "grad_norm": 0.5736892223358154, + "learning_rate": 0.0006012324929971989, + "loss": 0.4238, + "step": 14346 + }, + { + "epoch": 8.015083798882682, + "grad_norm": 0.6381986737251282, + "learning_rate": 0.0006012044817927171, + "loss": 0.3924, + "step": 14347 + }, + { + "epoch": 8.015642458100558, + "grad_norm": 0.41436097025871277, + "learning_rate": 0.0006011764705882353, + "loss": 0.3639, + "step": 14348 + }, + { + "epoch": 8.016201117318436, + "grad_norm": 0.712432861328125, + "learning_rate": 0.0006011484593837535, + "loss": 0.4726, + "step": 14349 + }, + { + "epoch": 8.016759776536313, + "grad_norm": 0.6390038728713989, + "learning_rate": 0.0006011204481792717, + "loss": 0.4026, + "step": 14350 + }, + { + "epoch": 8.01731843575419, + "grad_norm": 0.8491302132606506, + "learning_rate": 0.0006010924369747899, + "loss": 0.4274, + "step": 14351 + }, + { + "epoch": 8.017877094972068, + "grad_norm": 0.717250645160675, + "learning_rate": 0.0006010644257703082, + "loss": 0.4014, + "step": 14352 + }, + { + "epoch": 8.018435754189944, + "grad_norm": 0.5589627027511597, + "learning_rate": 0.0006010364145658263, + "loss": 0.5026, + "step": 14353 + }, + { + "epoch": 8.018994413407821, + "grad_norm": 0.5699906349182129, + "learning_rate": 0.0006010084033613445, + "loss": 0.441, + "step": 14354 + }, + { + "epoch": 8.019553072625698, + "grad_norm": 0.5970957279205322, + "learning_rate": 0.0006009803921568627, + "loss": 0.3878, + "step": 14355 + }, + { + "epoch": 8.020111731843576, + "grad_norm": 0.5575460195541382, + "learning_rate": 0.0006009523809523809, + "loss": 0.4442, + "step": 14356 + }, + { + "epoch": 8.020670391061453, + "grad_norm": 0.6154443025588989, + "learning_rate": 0.0006009243697478993, + "loss": 0.3454, + "step": 14357 + }, + { + "epoch": 8.021229050279329, + "grad_norm": 0.32149264216423035, + "learning_rate": 0.0006008963585434174, + "loss": 0.3733, + "step": 14358 + }, + { + "epoch": 8.021787709497207, + "grad_norm": 0.748559296131134, + "learning_rate": 0.0006008683473389356, + "loss": 0.3792, + "step": 14359 + }, + { + "epoch": 8.022346368715084, + "grad_norm": 0.4470595121383667, + "learning_rate": 0.0006008403361344538, + "loss": 0.3553, + "step": 14360 + }, + { + "epoch": 8.02290502793296, + "grad_norm": 0.48731231689453125, + "learning_rate": 0.000600812324929972, + "loss": 0.4171, + "step": 14361 + }, + { + "epoch": 8.023463687150837, + "grad_norm": 0.8704900741577148, + "learning_rate": 0.0006007843137254903, + "loss": 0.4265, + "step": 14362 + }, + { + "epoch": 8.024022346368715, + "grad_norm": 0.39106935262680054, + "learning_rate": 0.0006007563025210084, + "loss": 0.3407, + "step": 14363 + }, + { + "epoch": 8.024581005586592, + "grad_norm": 0.5778666734695435, + "learning_rate": 0.0006007282913165266, + "loss": 0.5475, + "step": 14364 + }, + { + "epoch": 8.025139664804469, + "grad_norm": 0.5334743857383728, + "learning_rate": 0.0006007002801120448, + "loss": 0.4085, + "step": 14365 + }, + { + "epoch": 8.025698324022347, + "grad_norm": 1.2521016597747803, + "learning_rate": 0.000600672268907563, + "loss": 0.4243, + "step": 14366 + }, + { + "epoch": 8.026256983240224, + "grad_norm": 0.4407244920730591, + "learning_rate": 0.0006006442577030813, + "loss": 0.5232, + "step": 14367 + }, + { + "epoch": 8.0268156424581, + "grad_norm": 0.5303959846496582, + "learning_rate": 0.0006006162464985995, + "loss": 0.4063, + "step": 14368 + }, + { + "epoch": 8.027374301675978, + "grad_norm": 0.5556484460830688, + "learning_rate": 0.0006005882352941176, + "loss": 0.3528, + "step": 14369 + }, + { + "epoch": 8.027932960893855, + "grad_norm": 0.7826901078224182, + "learning_rate": 0.0006005602240896358, + "loss": 0.6155, + "step": 14370 + }, + { + "epoch": 8.028491620111732, + "grad_norm": 0.47236984968185425, + "learning_rate": 0.000600532212885154, + "loss": 0.4217, + "step": 14371 + }, + { + "epoch": 8.029050279329608, + "grad_norm": 0.5443273186683655, + "learning_rate": 0.0006005042016806723, + "loss": 0.4279, + "step": 14372 + }, + { + "epoch": 8.029608938547486, + "grad_norm": 0.5189141035079956, + "learning_rate": 0.0006004761904761906, + "loss": 0.4762, + "step": 14373 + }, + { + "epoch": 8.030167597765363, + "grad_norm": 0.5528595447540283, + "learning_rate": 0.0006004481792717087, + "loss": 0.3505, + "step": 14374 + }, + { + "epoch": 8.03072625698324, + "grad_norm": 0.5922738313674927, + "learning_rate": 0.0006004201680672269, + "loss": 0.4317, + "step": 14375 + }, + { + "epoch": 8.031284916201118, + "grad_norm": 0.5522679686546326, + "learning_rate": 0.0006003921568627451, + "loss": 0.4224, + "step": 14376 + }, + { + "epoch": 8.031843575418995, + "grad_norm": 0.5774714350700378, + "learning_rate": 0.0006003641456582634, + "loss": 0.6338, + "step": 14377 + }, + { + "epoch": 8.032402234636871, + "grad_norm": 1.2023193836212158, + "learning_rate": 0.0006003361344537816, + "loss": 0.5284, + "step": 14378 + }, + { + "epoch": 8.03296089385475, + "grad_norm": 0.5452879071235657, + "learning_rate": 0.0006003081232492997, + "loss": 0.5738, + "step": 14379 + }, + { + "epoch": 8.033519553072626, + "grad_norm": 0.4827750027179718, + "learning_rate": 0.0006002801120448179, + "loss": 0.5209, + "step": 14380 + }, + { + "epoch": 8.034078212290503, + "grad_norm": 0.3710368871688843, + "learning_rate": 0.0006002521008403361, + "loss": 0.4059, + "step": 14381 + }, + { + "epoch": 8.03463687150838, + "grad_norm": 0.4284456968307495, + "learning_rate": 0.0006002240896358544, + "loss": 0.4133, + "step": 14382 + }, + { + "epoch": 8.035195530726257, + "grad_norm": 0.40405189990997314, + "learning_rate": 0.0006001960784313726, + "loss": 0.3613, + "step": 14383 + }, + { + "epoch": 8.035754189944134, + "grad_norm": 0.71453857421875, + "learning_rate": 0.0006001680672268908, + "loss": 0.3838, + "step": 14384 + }, + { + "epoch": 8.03631284916201, + "grad_norm": 0.9455159306526184, + "learning_rate": 0.0006001400560224089, + "loss": 0.4732, + "step": 14385 + }, + { + "epoch": 8.036871508379889, + "grad_norm": 1.5728727579116821, + "learning_rate": 0.0006001120448179271, + "loss": 0.5204, + "step": 14386 + }, + { + "epoch": 8.037430167597766, + "grad_norm": 0.4212017357349396, + "learning_rate": 0.0006000840336134454, + "loss": 0.4458, + "step": 14387 + }, + { + "epoch": 8.037988826815642, + "grad_norm": 0.6370043158531189, + "learning_rate": 0.0006000560224089636, + "loss": 0.4711, + "step": 14388 + }, + { + "epoch": 8.03854748603352, + "grad_norm": 7.670841217041016, + "learning_rate": 0.0006000280112044818, + "loss": 0.3343, + "step": 14389 + }, + { + "epoch": 8.039106145251397, + "grad_norm": 0.5167133808135986, + "learning_rate": 0.0006, + "loss": 0.4198, + "step": 14390 + }, + { + "epoch": 8.039664804469274, + "grad_norm": 0.501754641532898, + "learning_rate": 0.0005999719887955182, + "loss": 0.4012, + "step": 14391 + }, + { + "epoch": 8.04022346368715, + "grad_norm": 0.8308843374252319, + "learning_rate": 0.0005999439775910365, + "loss": 0.4356, + "step": 14392 + }, + { + "epoch": 8.040782122905028, + "grad_norm": 0.5902323126792908, + "learning_rate": 0.0005999159663865547, + "loss": 0.4823, + "step": 14393 + }, + { + "epoch": 8.041340782122905, + "grad_norm": 0.8933522701263428, + "learning_rate": 0.0005998879551820729, + "loss": 0.5271, + "step": 14394 + }, + { + "epoch": 8.041899441340782, + "grad_norm": 0.5437177419662476, + "learning_rate": 0.000599859943977591, + "loss": 0.6006, + "step": 14395 + }, + { + "epoch": 8.04245810055866, + "grad_norm": 1.7194788455963135, + "learning_rate": 0.0005998319327731092, + "loss": 0.5133, + "step": 14396 + }, + { + "epoch": 8.043016759776537, + "grad_norm": 2.9548416137695312, + "learning_rate": 0.0005998039215686275, + "loss": 0.4619, + "step": 14397 + }, + { + "epoch": 8.043575418994413, + "grad_norm": 1.250157356262207, + "learning_rate": 0.0005997759103641457, + "loss": 0.4708, + "step": 14398 + }, + { + "epoch": 8.04413407821229, + "grad_norm": 0.5268141627311707, + "learning_rate": 0.0005997478991596639, + "loss": 0.4167, + "step": 14399 + }, + { + "epoch": 8.044692737430168, + "grad_norm": 0.40548741817474365, + "learning_rate": 0.0005997198879551821, + "loss": 0.4076, + "step": 14400 + }, + { + "epoch": 8.045251396648045, + "grad_norm": 2.5386669635772705, + "learning_rate": 0.0005996918767507002, + "loss": 0.3719, + "step": 14401 + }, + { + "epoch": 8.045810055865921, + "grad_norm": 0.5602366924285889, + "learning_rate": 0.0005996638655462185, + "loss": 0.4048, + "step": 14402 + }, + { + "epoch": 8.0463687150838, + "grad_norm": 0.3502439558506012, + "learning_rate": 0.0005996358543417367, + "loss": 0.3355, + "step": 14403 + }, + { + "epoch": 8.046927374301676, + "grad_norm": 0.5376352667808533, + "learning_rate": 0.0005996078431372549, + "loss": 0.4692, + "step": 14404 + }, + { + "epoch": 8.047486033519553, + "grad_norm": 1.0940701961517334, + "learning_rate": 0.0005995798319327731, + "loss": 0.3624, + "step": 14405 + }, + { + "epoch": 8.048044692737431, + "grad_norm": 0.4787084758281708, + "learning_rate": 0.0005995518207282912, + "loss": 0.4247, + "step": 14406 + }, + { + "epoch": 8.048603351955308, + "grad_norm": 1.0625576972961426, + "learning_rate": 0.0005995238095238096, + "loss": 0.4153, + "step": 14407 + }, + { + "epoch": 8.049162011173184, + "grad_norm": 1.6665457487106323, + "learning_rate": 0.0005994957983193278, + "loss": 0.3258, + "step": 14408 + }, + { + "epoch": 8.04972067039106, + "grad_norm": 0.48011183738708496, + "learning_rate": 0.000599467787114846, + "loss": 0.459, + "step": 14409 + }, + { + "epoch": 8.050279329608939, + "grad_norm": 0.5652352571487427, + "learning_rate": 0.0005994397759103642, + "loss": 0.3983, + "step": 14410 + }, + { + "epoch": 8.050837988826816, + "grad_norm": 0.6023281812667847, + "learning_rate": 0.0005994117647058823, + "loss": 0.424, + "step": 14411 + }, + { + "epoch": 8.051396648044692, + "grad_norm": 0.5585233569145203, + "learning_rate": 0.0005993837535014006, + "loss": 0.4479, + "step": 14412 + }, + { + "epoch": 8.05195530726257, + "grad_norm": 0.45445168018341064, + "learning_rate": 0.0005993557422969188, + "loss": 0.4882, + "step": 14413 + }, + { + "epoch": 8.052513966480447, + "grad_norm": 3.381598711013794, + "learning_rate": 0.000599327731092437, + "loss": 0.4316, + "step": 14414 + }, + { + "epoch": 8.053072625698324, + "grad_norm": 0.4645046591758728, + "learning_rate": 0.0005992997198879552, + "loss": 0.3061, + "step": 14415 + }, + { + "epoch": 8.053631284916202, + "grad_norm": 1.2431318759918213, + "learning_rate": 0.0005992717086834734, + "loss": 0.3166, + "step": 14416 + }, + { + "epoch": 8.054189944134079, + "grad_norm": 0.653498113155365, + "learning_rate": 0.0005992436974789916, + "loss": 0.5003, + "step": 14417 + }, + { + "epoch": 8.054748603351955, + "grad_norm": 0.644396185874939, + "learning_rate": 0.0005992156862745098, + "loss": 0.4213, + "step": 14418 + }, + { + "epoch": 8.055307262569832, + "grad_norm": 0.40860694646835327, + "learning_rate": 0.000599187675070028, + "loss": 0.4328, + "step": 14419 + }, + { + "epoch": 8.05586592178771, + "grad_norm": 0.5199441313743591, + "learning_rate": 0.0005991596638655462, + "loss": 0.4717, + "step": 14420 + }, + { + "epoch": 8.056424581005587, + "grad_norm": 0.5356389284133911, + "learning_rate": 0.0005991316526610644, + "loss": 0.446, + "step": 14421 + }, + { + "epoch": 8.056983240223463, + "grad_norm": 2.948608875274658, + "learning_rate": 0.0005991036414565826, + "loss": 0.4904, + "step": 14422 + }, + { + "epoch": 8.057541899441341, + "grad_norm": 0.40729671716690063, + "learning_rate": 0.0005990756302521009, + "loss": 0.4429, + "step": 14423 + }, + { + "epoch": 8.058100558659218, + "grad_norm": 0.49135828018188477, + "learning_rate": 0.0005990476190476191, + "loss": 0.4937, + "step": 14424 + }, + { + "epoch": 8.058659217877095, + "grad_norm": 0.6867174506187439, + "learning_rate": 0.0005990196078431373, + "loss": 0.4428, + "step": 14425 + }, + { + "epoch": 8.059217877094973, + "grad_norm": 1.6701709032058716, + "learning_rate": 0.0005989915966386555, + "loss": 0.6129, + "step": 14426 + }, + { + "epoch": 8.05977653631285, + "grad_norm": 0.5143259763717651, + "learning_rate": 0.0005989635854341737, + "loss": 0.4244, + "step": 14427 + }, + { + "epoch": 8.060335195530726, + "grad_norm": 0.8569937944412231, + "learning_rate": 0.0005989355742296919, + "loss": 0.5474, + "step": 14428 + }, + { + "epoch": 8.060893854748603, + "grad_norm": 0.7850049138069153, + "learning_rate": 0.0005989075630252101, + "loss": 0.5752, + "step": 14429 + }, + { + "epoch": 8.061452513966481, + "grad_norm": 1.0247721672058105, + "learning_rate": 0.0005988795518207283, + "loss": 0.42, + "step": 14430 + }, + { + "epoch": 8.062011173184358, + "grad_norm": 0.75715172290802, + "learning_rate": 0.0005988515406162465, + "loss": 0.4341, + "step": 14431 + }, + { + "epoch": 8.062569832402234, + "grad_norm": 0.5107081532478333, + "learning_rate": 0.0005988235294117648, + "loss": 0.4448, + "step": 14432 + }, + { + "epoch": 8.063128491620112, + "grad_norm": 0.5078255534172058, + "learning_rate": 0.0005987955182072829, + "loss": 0.5197, + "step": 14433 + }, + { + "epoch": 8.063687150837989, + "grad_norm": 0.668635904788971, + "learning_rate": 0.0005987675070028011, + "loss": 0.4056, + "step": 14434 + }, + { + "epoch": 8.064245810055866, + "grad_norm": 0.6517415046691895, + "learning_rate": 0.0005987394957983193, + "loss": 0.3851, + "step": 14435 + }, + { + "epoch": 8.064804469273742, + "grad_norm": 0.9981436133384705, + "learning_rate": 0.0005987114845938375, + "loss": 0.4224, + "step": 14436 + }, + { + "epoch": 8.06536312849162, + "grad_norm": 0.4603477716445923, + "learning_rate": 0.0005986834733893558, + "loss": 0.4494, + "step": 14437 + }, + { + "epoch": 8.065921787709497, + "grad_norm": 0.5559819936752319, + "learning_rate": 0.0005986554621848739, + "loss": 0.5596, + "step": 14438 + }, + { + "epoch": 8.066480446927374, + "grad_norm": 0.4180082678794861, + "learning_rate": 0.0005986274509803921, + "loss": 0.4175, + "step": 14439 + }, + { + "epoch": 8.067039106145252, + "grad_norm": 0.8038545250892639, + "learning_rate": 0.0005985994397759104, + "loss": 0.63, + "step": 14440 + }, + { + "epoch": 8.067597765363129, + "grad_norm": 0.6108267903327942, + "learning_rate": 0.0005985714285714286, + "loss": 0.5032, + "step": 14441 + }, + { + "epoch": 8.068156424581005, + "grad_norm": 0.49462592601776123, + "learning_rate": 0.0005985434173669469, + "loss": 0.4102, + "step": 14442 + }, + { + "epoch": 8.068715083798883, + "grad_norm": 2.032073497772217, + "learning_rate": 0.000598515406162465, + "loss": 0.4789, + "step": 14443 + }, + { + "epoch": 8.06927374301676, + "grad_norm": 0.48974502086639404, + "learning_rate": 0.0005984873949579832, + "loss": 0.5186, + "step": 14444 + }, + { + "epoch": 8.069832402234637, + "grad_norm": 0.8576788902282715, + "learning_rate": 0.0005984593837535014, + "loss": 0.5996, + "step": 14445 + }, + { + "epoch": 8.070391061452513, + "grad_norm": 12.471939086914062, + "learning_rate": 0.0005984313725490196, + "loss": 0.3605, + "step": 14446 + }, + { + "epoch": 8.070949720670392, + "grad_norm": 0.5204570293426514, + "learning_rate": 0.0005984033613445379, + "loss": 0.3861, + "step": 14447 + }, + { + "epoch": 8.071508379888268, + "grad_norm": 0.3632824122905731, + "learning_rate": 0.0005983753501400561, + "loss": 0.4516, + "step": 14448 + }, + { + "epoch": 8.072067039106145, + "grad_norm": 0.6021848917007446, + "learning_rate": 0.0005983473389355742, + "loss": 0.383, + "step": 14449 + }, + { + "epoch": 8.072625698324023, + "grad_norm": 0.9489167928695679, + "learning_rate": 0.0005983193277310924, + "loss": 0.4295, + "step": 14450 + }, + { + "epoch": 8.0731843575419, + "grad_norm": 0.5070911645889282, + "learning_rate": 0.0005982913165266106, + "loss": 0.444, + "step": 14451 + }, + { + "epoch": 8.073743016759776, + "grad_norm": 2.872343063354492, + "learning_rate": 0.0005982633053221289, + "loss": 0.3669, + "step": 14452 + }, + { + "epoch": 8.074301675977654, + "grad_norm": 0.9399489164352417, + "learning_rate": 0.0005982352941176471, + "loss": 0.3951, + "step": 14453 + }, + { + "epoch": 8.074860335195531, + "grad_norm": 1.8839850425720215, + "learning_rate": 0.0005982072829131652, + "loss": 0.4762, + "step": 14454 + }, + { + "epoch": 8.075418994413408, + "grad_norm": 0.6631466746330261, + "learning_rate": 0.0005981792717086834, + "loss": 0.4784, + "step": 14455 + }, + { + "epoch": 8.075977653631284, + "grad_norm": 0.7401856184005737, + "learning_rate": 0.0005981512605042017, + "loss": 0.4392, + "step": 14456 + }, + { + "epoch": 8.076536312849163, + "grad_norm": 0.65260249376297, + "learning_rate": 0.00059812324929972, + "loss": 0.3996, + "step": 14457 + }, + { + "epoch": 8.077094972067039, + "grad_norm": 0.6222133636474609, + "learning_rate": 0.0005980952380952382, + "loss": 0.3404, + "step": 14458 + }, + { + "epoch": 8.077653631284916, + "grad_norm": 0.9325791597366333, + "learning_rate": 0.0005980672268907563, + "loss": 0.3248, + "step": 14459 + }, + { + "epoch": 8.078212290502794, + "grad_norm": 3.357792854309082, + "learning_rate": 0.0005980392156862745, + "loss": 0.3724, + "step": 14460 + }, + { + "epoch": 8.07877094972067, + "grad_norm": 0.4824487268924713, + "learning_rate": 0.0005980112044817927, + "loss": 0.4733, + "step": 14461 + }, + { + "epoch": 8.079329608938547, + "grad_norm": 0.7129288911819458, + "learning_rate": 0.000597983193277311, + "loss": 0.5004, + "step": 14462 + }, + { + "epoch": 8.079888268156424, + "grad_norm": 0.4432324767112732, + "learning_rate": 0.0005979551820728292, + "loss": 0.4279, + "step": 14463 + }, + { + "epoch": 8.080446927374302, + "grad_norm": 0.4276464879512787, + "learning_rate": 0.0005979271708683474, + "loss": 0.352, + "step": 14464 + }, + { + "epoch": 8.081005586592179, + "grad_norm": 0.5981405973434448, + "learning_rate": 0.0005978991596638655, + "loss": 0.4644, + "step": 14465 + }, + { + "epoch": 8.081564245810055, + "grad_norm": 0.4256029427051544, + "learning_rate": 0.0005978711484593837, + "loss": 0.3899, + "step": 14466 + }, + { + "epoch": 8.082122905027934, + "grad_norm": 0.6515499353408813, + "learning_rate": 0.000597843137254902, + "loss": 0.4576, + "step": 14467 + }, + { + "epoch": 8.08268156424581, + "grad_norm": 0.500608503818512, + "learning_rate": 0.0005978151260504202, + "loss": 0.6457, + "step": 14468 + }, + { + "epoch": 8.083240223463687, + "grad_norm": 0.535923182964325, + "learning_rate": 0.0005977871148459384, + "loss": 0.4132, + "step": 14469 + }, + { + "epoch": 8.083798882681565, + "grad_norm": 0.5433781147003174, + "learning_rate": 0.0005977591036414565, + "loss": 0.5366, + "step": 14470 + }, + { + "epoch": 8.084357541899442, + "grad_norm": 0.8429211974143982, + "learning_rate": 0.0005977310924369747, + "loss": 0.542, + "step": 14471 + }, + { + "epoch": 8.084916201117318, + "grad_norm": 0.45777586102485657, + "learning_rate": 0.0005977030812324931, + "loss": 0.4971, + "step": 14472 + }, + { + "epoch": 8.085474860335195, + "grad_norm": 0.5182831287384033, + "learning_rate": 0.0005976750700280113, + "loss": 0.4184, + "step": 14473 + }, + { + "epoch": 8.086033519553073, + "grad_norm": 1.22562837600708, + "learning_rate": 0.0005976470588235295, + "loss": 0.5364, + "step": 14474 + }, + { + "epoch": 8.08659217877095, + "grad_norm": 0.9164565205574036, + "learning_rate": 0.0005976190476190476, + "loss": 0.3132, + "step": 14475 + }, + { + "epoch": 8.087150837988826, + "grad_norm": 0.4706647992134094, + "learning_rate": 0.0005975910364145658, + "loss": 0.3871, + "step": 14476 + }, + { + "epoch": 8.087709497206705, + "grad_norm": 0.45112869143486023, + "learning_rate": 0.0005975630252100841, + "loss": 0.3858, + "step": 14477 + }, + { + "epoch": 8.088268156424581, + "grad_norm": 0.5883400440216064, + "learning_rate": 0.0005975350140056023, + "loss": 0.3692, + "step": 14478 + }, + { + "epoch": 8.088826815642458, + "grad_norm": 0.43818268179893494, + "learning_rate": 0.0005975070028011205, + "loss": 0.3544, + "step": 14479 + }, + { + "epoch": 8.089385474860336, + "grad_norm": 0.43844330310821533, + "learning_rate": 0.0005974789915966387, + "loss": 0.4451, + "step": 14480 + }, + { + "epoch": 8.089944134078213, + "grad_norm": 0.379326194524765, + "learning_rate": 0.0005974509803921568, + "loss": 0.3861, + "step": 14481 + }, + { + "epoch": 8.09050279329609, + "grad_norm": 1.012355089187622, + "learning_rate": 0.0005974229691876751, + "loss": 0.6071, + "step": 14482 + }, + { + "epoch": 8.091061452513966, + "grad_norm": 1.403200387954712, + "learning_rate": 0.0005973949579831933, + "loss": 0.4221, + "step": 14483 + }, + { + "epoch": 8.091620111731844, + "grad_norm": 0.5278259515762329, + "learning_rate": 0.0005973669467787115, + "loss": 0.3276, + "step": 14484 + }, + { + "epoch": 8.09217877094972, + "grad_norm": 0.47505414485931396, + "learning_rate": 0.0005973389355742297, + "loss": 0.452, + "step": 14485 + }, + { + "epoch": 8.092737430167597, + "grad_norm": 0.5616818070411682, + "learning_rate": 0.0005973109243697478, + "loss": 0.3871, + "step": 14486 + }, + { + "epoch": 8.093296089385476, + "grad_norm": 0.5413137078285217, + "learning_rate": 0.0005972829131652661, + "loss": 0.4361, + "step": 14487 + }, + { + "epoch": 8.093854748603352, + "grad_norm": 0.5107696652412415, + "learning_rate": 0.0005972549019607844, + "loss": 0.4843, + "step": 14488 + }, + { + "epoch": 8.094413407821229, + "grad_norm": 0.7350447773933411, + "learning_rate": 0.0005972268907563026, + "loss": 0.5338, + "step": 14489 + }, + { + "epoch": 8.094972067039107, + "grad_norm": 0.7885605692863464, + "learning_rate": 0.0005971988795518208, + "loss": 0.4659, + "step": 14490 + }, + { + "epoch": 8.095530726256984, + "grad_norm": 0.49230703711509705, + "learning_rate": 0.0005971708683473389, + "loss": 0.3995, + "step": 14491 + }, + { + "epoch": 8.09608938547486, + "grad_norm": 13.724759101867676, + "learning_rate": 0.0005971428571428572, + "loss": 0.3574, + "step": 14492 + }, + { + "epoch": 8.096648044692737, + "grad_norm": 0.6826164126396179, + "learning_rate": 0.0005971148459383754, + "loss": 0.4435, + "step": 14493 + }, + { + "epoch": 8.097206703910615, + "grad_norm": 0.8319756388664246, + "learning_rate": 0.0005970868347338936, + "loss": 0.3703, + "step": 14494 + }, + { + "epoch": 8.097765363128492, + "grad_norm": 0.5146135091781616, + "learning_rate": 0.0005970588235294118, + "loss": 0.2916, + "step": 14495 + }, + { + "epoch": 8.098324022346368, + "grad_norm": 0.5947471261024475, + "learning_rate": 0.00059703081232493, + "loss": 0.5095, + "step": 14496 + }, + { + "epoch": 8.098882681564247, + "grad_norm": 0.5056770443916321, + "learning_rate": 0.0005970028011204482, + "loss": 0.3667, + "step": 14497 + }, + { + "epoch": 8.099441340782123, + "grad_norm": 0.39849406480789185, + "learning_rate": 0.0005969747899159664, + "loss": 0.393, + "step": 14498 + }, + { + "epoch": 8.1, + "grad_norm": 0.4292544424533844, + "learning_rate": 0.0005969467787114846, + "loss": 0.4284, + "step": 14499 + }, + { + "epoch": 8.100558659217878, + "grad_norm": 0.4043031334877014, + "learning_rate": 0.0005969187675070028, + "loss": 0.4377, + "step": 14500 + }, + { + "epoch": 8.100558659217878, + "eval_cer": 0.09223375121381733, + "eval_loss": 0.34655728936195374, + "eval_runtime": 55.558, + "eval_samples_per_second": 81.68, + "eval_steps_per_second": 5.112, + "eval_wer": 0.37174939247507055, + "step": 14500 + }, + { + "epoch": 8.101117318435755, + "grad_norm": 0.9145787358283997, + "learning_rate": 0.000596890756302521, + "loss": 0.5343, + "step": 14501 + }, + { + "epoch": 8.101675977653631, + "grad_norm": 1.1518689393997192, + "learning_rate": 0.0005968627450980391, + "loss": 0.4324, + "step": 14502 + }, + { + "epoch": 8.102234636871508, + "grad_norm": 0.5576058626174927, + "learning_rate": 0.0005968347338935574, + "loss": 0.4712, + "step": 14503 + }, + { + "epoch": 8.102793296089386, + "grad_norm": 1.422631025314331, + "learning_rate": 0.0005968067226890756, + "loss": 0.3519, + "step": 14504 + }, + { + "epoch": 8.103351955307263, + "grad_norm": 0.7505115270614624, + "learning_rate": 0.0005967787114845939, + "loss": 0.5005, + "step": 14505 + }, + { + "epoch": 8.10391061452514, + "grad_norm": 1.0042904615402222, + "learning_rate": 0.0005967507002801121, + "loss": 0.3961, + "step": 14506 + }, + { + "epoch": 8.104469273743018, + "grad_norm": 0.595643937587738, + "learning_rate": 0.0005967226890756302, + "loss": 0.6221, + "step": 14507 + }, + { + "epoch": 8.105027932960894, + "grad_norm": 1.31422758102417, + "learning_rate": 0.0005966946778711485, + "loss": 0.4962, + "step": 14508 + }, + { + "epoch": 8.10558659217877, + "grad_norm": 0.8656612634658813, + "learning_rate": 0.0005966666666666667, + "loss": 0.4151, + "step": 14509 + }, + { + "epoch": 8.106145251396647, + "grad_norm": 0.857059895992279, + "learning_rate": 0.0005966386554621849, + "loss": 0.5488, + "step": 14510 + }, + { + "epoch": 8.106703910614526, + "grad_norm": 0.5608837008476257, + "learning_rate": 0.0005966106442577031, + "loss": 0.4081, + "step": 14511 + }, + { + "epoch": 8.107262569832402, + "grad_norm": 0.5355949401855469, + "learning_rate": 0.0005965826330532213, + "loss": 0.4165, + "step": 14512 + }, + { + "epoch": 8.107821229050279, + "grad_norm": 0.7131775617599487, + "learning_rate": 0.0005965546218487395, + "loss": 0.3979, + "step": 14513 + }, + { + "epoch": 8.108379888268157, + "grad_norm": 0.6019068956375122, + "learning_rate": 0.0005965266106442577, + "loss": 0.4851, + "step": 14514 + }, + { + "epoch": 8.108938547486034, + "grad_norm": 0.9674901962280273, + "learning_rate": 0.0005964985994397759, + "loss": 0.4301, + "step": 14515 + }, + { + "epoch": 8.10949720670391, + "grad_norm": 0.5199589133262634, + "learning_rate": 0.0005964705882352941, + "loss": 0.2968, + "step": 14516 + }, + { + "epoch": 8.110055865921789, + "grad_norm": 0.44233837723731995, + "learning_rate": 0.0005964425770308123, + "loss": 0.4034, + "step": 14517 + }, + { + "epoch": 8.110614525139665, + "grad_norm": 0.5038004517555237, + "learning_rate": 0.0005964145658263305, + "loss": 0.535, + "step": 14518 + }, + { + "epoch": 8.111173184357542, + "grad_norm": 0.4609309136867523, + "learning_rate": 0.0005963865546218487, + "loss": 0.4152, + "step": 14519 + }, + { + "epoch": 8.111731843575418, + "grad_norm": 0.4320252239704132, + "learning_rate": 0.0005963585434173669, + "loss": 0.3917, + "step": 14520 + }, + { + "epoch": 8.112290502793297, + "grad_norm": 0.4233516454696655, + "learning_rate": 0.0005963305322128851, + "loss": 0.3793, + "step": 14521 + }, + { + "epoch": 8.112849162011173, + "grad_norm": 0.5657986402511597, + "learning_rate": 0.0005963025210084034, + "loss": 0.5482, + "step": 14522 + }, + { + "epoch": 8.11340782122905, + "grad_norm": 0.7307112812995911, + "learning_rate": 0.0005962745098039217, + "loss": 0.4447, + "step": 14523 + }, + { + "epoch": 8.113966480446928, + "grad_norm": 0.5015503764152527, + "learning_rate": 0.0005962464985994398, + "loss": 0.4389, + "step": 14524 + }, + { + "epoch": 8.114525139664805, + "grad_norm": 0.5661396384239197, + "learning_rate": 0.000596218487394958, + "loss": 0.4389, + "step": 14525 + }, + { + "epoch": 8.115083798882681, + "grad_norm": 0.5356577634811401, + "learning_rate": 0.0005961904761904762, + "loss": 0.4356, + "step": 14526 + }, + { + "epoch": 8.11564245810056, + "grad_norm": 0.5272166132926941, + "learning_rate": 0.0005961624649859944, + "loss": 0.3509, + "step": 14527 + }, + { + "epoch": 8.116201117318436, + "grad_norm": 0.6902635097503662, + "learning_rate": 0.0005961344537815127, + "loss": 0.3811, + "step": 14528 + }, + { + "epoch": 8.116759776536313, + "grad_norm": 0.4835710823535919, + "learning_rate": 0.0005961064425770308, + "loss": 0.5222, + "step": 14529 + }, + { + "epoch": 8.11731843575419, + "grad_norm": 0.6754847168922424, + "learning_rate": 0.000596078431372549, + "loss": 0.4314, + "step": 14530 + }, + { + "epoch": 8.117877094972068, + "grad_norm": 0.7182206511497498, + "learning_rate": 0.0005960504201680672, + "loss": 0.6087, + "step": 14531 + }, + { + "epoch": 8.118435754189944, + "grad_norm": 1.2911028861999512, + "learning_rate": 0.0005960224089635854, + "loss": 0.505, + "step": 14532 + }, + { + "epoch": 8.11899441340782, + "grad_norm": 0.4551449716091156, + "learning_rate": 0.0005959943977591037, + "loss": 0.3461, + "step": 14533 + }, + { + "epoch": 8.119553072625699, + "grad_norm": 0.3183300495147705, + "learning_rate": 0.0005959663865546218, + "loss": 0.376, + "step": 14534 + }, + { + "epoch": 8.120111731843576, + "grad_norm": 0.6173131465911865, + "learning_rate": 0.00059593837535014, + "loss": 0.5936, + "step": 14535 + }, + { + "epoch": 8.120670391061452, + "grad_norm": 1.7582385540008545, + "learning_rate": 0.0005959103641456582, + "loss": 0.3967, + "step": 14536 + }, + { + "epoch": 8.121229050279329, + "grad_norm": 0.6299956440925598, + "learning_rate": 0.0005958823529411764, + "loss": 0.4445, + "step": 14537 + }, + { + "epoch": 8.121787709497207, + "grad_norm": 0.4965883195400238, + "learning_rate": 0.0005958543417366948, + "loss": 0.6776, + "step": 14538 + }, + { + "epoch": 8.122346368715084, + "grad_norm": 0.46675586700439453, + "learning_rate": 0.000595826330532213, + "loss": 0.341, + "step": 14539 + }, + { + "epoch": 8.12290502793296, + "grad_norm": 2.13820743560791, + "learning_rate": 0.0005957983193277311, + "loss": 0.4011, + "step": 14540 + }, + { + "epoch": 8.123463687150839, + "grad_norm": 0.6492177844047546, + "learning_rate": 0.0005957703081232493, + "loss": 0.4528, + "step": 14541 + }, + { + "epoch": 8.124022346368715, + "grad_norm": 0.4511052966117859, + "learning_rate": 0.0005957422969187675, + "loss": 0.3951, + "step": 14542 + }, + { + "epoch": 8.124581005586592, + "grad_norm": 0.5533331036567688, + "learning_rate": 0.0005957142857142858, + "loss": 0.3492, + "step": 14543 + }, + { + "epoch": 8.12513966480447, + "grad_norm": 0.5376847982406616, + "learning_rate": 0.000595686274509804, + "loss": 0.3695, + "step": 14544 + }, + { + "epoch": 8.125698324022347, + "grad_norm": 0.4759940505027771, + "learning_rate": 0.0005956582633053221, + "loss": 0.3654, + "step": 14545 + }, + { + "epoch": 8.126256983240223, + "grad_norm": 0.8489264249801636, + "learning_rate": 0.0005956302521008403, + "loss": 0.4596, + "step": 14546 + }, + { + "epoch": 8.1268156424581, + "grad_norm": 0.4568347632884979, + "learning_rate": 0.0005956022408963585, + "loss": 0.397, + "step": 14547 + }, + { + "epoch": 8.127374301675978, + "grad_norm": 0.37949851155281067, + "learning_rate": 0.0005955742296918768, + "loss": 0.4906, + "step": 14548 + }, + { + "epoch": 8.127932960893855, + "grad_norm": 0.478251576423645, + "learning_rate": 0.000595546218487395, + "loss": 0.512, + "step": 14549 + }, + { + "epoch": 8.128491620111731, + "grad_norm": 11.636027336120605, + "learning_rate": 0.0005955182072829131, + "loss": 0.3584, + "step": 14550 + }, + { + "epoch": 8.12905027932961, + "grad_norm": 0.5634673833847046, + "learning_rate": 0.0005954901960784313, + "loss": 0.5212, + "step": 14551 + }, + { + "epoch": 8.129608938547486, + "grad_norm": 0.42000940442085266, + "learning_rate": 0.0005954621848739495, + "loss": 0.3316, + "step": 14552 + }, + { + "epoch": 8.130167597765363, + "grad_norm": 0.5818812847137451, + "learning_rate": 0.0005954341736694678, + "loss": 0.4819, + "step": 14553 + }, + { + "epoch": 8.130726256983241, + "grad_norm": 2.9924280643463135, + "learning_rate": 0.0005954061624649861, + "loss": 0.4644, + "step": 14554 + }, + { + "epoch": 8.131284916201118, + "grad_norm": 0.39869850873947144, + "learning_rate": 0.0005953781512605043, + "loss": 0.3366, + "step": 14555 + }, + { + "epoch": 8.131843575418994, + "grad_norm": 0.42670831084251404, + "learning_rate": 0.0005953501400560224, + "loss": 0.4306, + "step": 14556 + }, + { + "epoch": 8.13240223463687, + "grad_norm": 5.134698867797852, + "learning_rate": 0.0005953221288515406, + "loss": 0.3672, + "step": 14557 + }, + { + "epoch": 8.132960893854749, + "grad_norm": 0.529649019241333, + "learning_rate": 0.0005952941176470589, + "loss": 0.5841, + "step": 14558 + }, + { + "epoch": 8.133519553072626, + "grad_norm": 0.4746466279029846, + "learning_rate": 0.0005952661064425771, + "loss": 0.4608, + "step": 14559 + }, + { + "epoch": 8.134078212290502, + "grad_norm": 0.6194973587989807, + "learning_rate": 0.0005952380952380953, + "loss": 0.5004, + "step": 14560 + }, + { + "epoch": 8.13463687150838, + "grad_norm": 0.5827535390853882, + "learning_rate": 0.0005952100840336134, + "loss": 0.4828, + "step": 14561 + }, + { + "epoch": 8.135195530726257, + "grad_norm": 0.8640741109848022, + "learning_rate": 0.0005951820728291316, + "loss": 0.3779, + "step": 14562 + }, + { + "epoch": 8.135754189944134, + "grad_norm": 0.3662491738796234, + "learning_rate": 0.0005951540616246499, + "loss": 0.437, + "step": 14563 + }, + { + "epoch": 8.136312849162012, + "grad_norm": 0.5627434253692627, + "learning_rate": 0.0005951260504201681, + "loss": 0.4627, + "step": 14564 + }, + { + "epoch": 8.136871508379889, + "grad_norm": 4.679205894470215, + "learning_rate": 0.0005950980392156863, + "loss": 0.4268, + "step": 14565 + }, + { + "epoch": 8.137430167597765, + "grad_norm": 0.437671035528183, + "learning_rate": 0.0005950700280112044, + "loss": 0.3921, + "step": 14566 + }, + { + "epoch": 8.137988826815642, + "grad_norm": 0.4792025685310364, + "learning_rate": 0.0005950420168067226, + "loss": 0.3736, + "step": 14567 + }, + { + "epoch": 8.13854748603352, + "grad_norm": 0.7871202826499939, + "learning_rate": 0.0005950140056022409, + "loss": 0.4548, + "step": 14568 + }, + { + "epoch": 8.139106145251397, + "grad_norm": 0.5289686918258667, + "learning_rate": 0.0005949859943977591, + "loss": 0.4754, + "step": 14569 + }, + { + "epoch": 8.139664804469273, + "grad_norm": 0.7209805250167847, + "learning_rate": 0.0005949579831932774, + "loss": 0.4984, + "step": 14570 + }, + { + "epoch": 8.140223463687152, + "grad_norm": 0.5015156269073486, + "learning_rate": 0.0005949299719887956, + "loss": 0.385, + "step": 14571 + }, + { + "epoch": 8.140782122905028, + "grad_norm": 0.5621908903121948, + "learning_rate": 0.0005949019607843137, + "loss": 0.4219, + "step": 14572 + }, + { + "epoch": 8.141340782122905, + "grad_norm": 0.5540359020233154, + "learning_rate": 0.000594873949579832, + "loss": 0.5039, + "step": 14573 + }, + { + "epoch": 8.141899441340781, + "grad_norm": 0.9592664241790771, + "learning_rate": 0.0005948459383753502, + "loss": 0.4666, + "step": 14574 + }, + { + "epoch": 8.14245810055866, + "grad_norm": 0.4834812581539154, + "learning_rate": 0.0005948179271708684, + "loss": 0.3982, + "step": 14575 + }, + { + "epoch": 8.143016759776536, + "grad_norm": 0.596591055393219, + "learning_rate": 0.0005947899159663866, + "loss": 0.3617, + "step": 14576 + }, + { + "epoch": 8.143575418994413, + "grad_norm": 0.5861586332321167, + "learning_rate": 0.0005947619047619047, + "loss": 0.5164, + "step": 14577 + }, + { + "epoch": 8.144134078212291, + "grad_norm": 1.3980904817581177, + "learning_rate": 0.000594733893557423, + "loss": 0.4516, + "step": 14578 + }, + { + "epoch": 8.144692737430168, + "grad_norm": 0.5210471749305725, + "learning_rate": 0.0005947058823529412, + "loss": 0.4487, + "step": 14579 + }, + { + "epoch": 8.145251396648044, + "grad_norm": 0.5994924306869507, + "learning_rate": 0.0005946778711484594, + "loss": 0.4097, + "step": 14580 + }, + { + "epoch": 8.145810055865923, + "grad_norm": 0.6276729702949524, + "learning_rate": 0.0005946498599439776, + "loss": 0.5411, + "step": 14581 + }, + { + "epoch": 8.1463687150838, + "grad_norm": 0.8257356882095337, + "learning_rate": 0.0005946218487394957, + "loss": 0.4328, + "step": 14582 + }, + { + "epoch": 8.146927374301676, + "grad_norm": 0.470299631357193, + "learning_rate": 0.000594593837535014, + "loss": 0.4495, + "step": 14583 + }, + { + "epoch": 8.147486033519552, + "grad_norm": 0.47502925992012024, + "learning_rate": 0.0005945658263305322, + "loss": 0.4185, + "step": 14584 + }, + { + "epoch": 8.14804469273743, + "grad_norm": 0.3554491698741913, + "learning_rate": 0.0005945378151260504, + "loss": 0.3505, + "step": 14585 + }, + { + "epoch": 8.148603351955307, + "grad_norm": 0.5056933760643005, + "learning_rate": 0.0005945098039215686, + "loss": 0.4115, + "step": 14586 + }, + { + "epoch": 8.149162011173184, + "grad_norm": 2.021817922592163, + "learning_rate": 0.0005944817927170869, + "loss": 0.4242, + "step": 14587 + }, + { + "epoch": 8.149720670391062, + "grad_norm": 0.76506507396698, + "learning_rate": 0.0005944537815126051, + "loss": 0.4783, + "step": 14588 + }, + { + "epoch": 8.150279329608939, + "grad_norm": 0.3604593575000763, + "learning_rate": 0.0005944257703081233, + "loss": 0.3483, + "step": 14589 + }, + { + "epoch": 8.150837988826815, + "grad_norm": 0.5649810433387756, + "learning_rate": 0.0005943977591036415, + "loss": 0.4487, + "step": 14590 + }, + { + "epoch": 8.151396648044694, + "grad_norm": 0.6508882641792297, + "learning_rate": 0.0005943697478991597, + "loss": 0.4834, + "step": 14591 + }, + { + "epoch": 8.15195530726257, + "grad_norm": 0.46237534284591675, + "learning_rate": 0.0005943417366946779, + "loss": 0.4237, + "step": 14592 + }, + { + "epoch": 8.152513966480447, + "grad_norm": 0.7530639171600342, + "learning_rate": 0.0005943137254901961, + "loss": 0.3495, + "step": 14593 + }, + { + "epoch": 8.153072625698323, + "grad_norm": 0.5456220507621765, + "learning_rate": 0.0005942857142857143, + "loss": 0.4674, + "step": 14594 + }, + { + "epoch": 8.153631284916202, + "grad_norm": 0.57440584897995, + "learning_rate": 0.0005942577030812325, + "loss": 0.4725, + "step": 14595 + }, + { + "epoch": 8.154189944134078, + "grad_norm": 0.4562806487083435, + "learning_rate": 0.0005942296918767507, + "loss": 0.3635, + "step": 14596 + }, + { + "epoch": 8.154748603351955, + "grad_norm": 0.41335538029670715, + "learning_rate": 0.0005942016806722689, + "loss": 0.3356, + "step": 14597 + }, + { + "epoch": 8.155307262569833, + "grad_norm": 0.6414315104484558, + "learning_rate": 0.0005941736694677871, + "loss": 0.503, + "step": 14598 + }, + { + "epoch": 8.15586592178771, + "grad_norm": 0.917523205280304, + "learning_rate": 0.0005941456582633053, + "loss": 0.4008, + "step": 14599 + }, + { + "epoch": 8.156424581005586, + "grad_norm": 0.6649678349494934, + "learning_rate": 0.0005941176470588235, + "loss": 0.5333, + "step": 14600 + }, + { + "epoch": 8.156983240223465, + "grad_norm": 0.8768752813339233, + "learning_rate": 0.0005940896358543417, + "loss": 0.4267, + "step": 14601 + }, + { + "epoch": 8.157541899441341, + "grad_norm": 0.40516921877861023, + "learning_rate": 0.0005940616246498599, + "loss": 0.4035, + "step": 14602 + }, + { + "epoch": 8.158100558659218, + "grad_norm": 0.4275057017803192, + "learning_rate": 0.0005940336134453783, + "loss": 0.4139, + "step": 14603 + }, + { + "epoch": 8.158659217877094, + "grad_norm": 0.6677563786506653, + "learning_rate": 0.0005940056022408964, + "loss": 0.5749, + "step": 14604 + }, + { + "epoch": 8.159217877094973, + "grad_norm": 0.48846563696861267, + "learning_rate": 0.0005939775910364146, + "loss": 0.4933, + "step": 14605 + }, + { + "epoch": 8.15977653631285, + "grad_norm": 0.47140753269195557, + "learning_rate": 0.0005939495798319328, + "loss": 0.416, + "step": 14606 + }, + { + "epoch": 8.160335195530726, + "grad_norm": 0.3789716958999634, + "learning_rate": 0.000593921568627451, + "loss": 0.4539, + "step": 14607 + }, + { + "epoch": 8.160893854748604, + "grad_norm": 0.35282275080680847, + "learning_rate": 0.0005938935574229693, + "loss": 0.4277, + "step": 14608 + }, + { + "epoch": 8.16145251396648, + "grad_norm": 0.4037696123123169, + "learning_rate": 0.0005938655462184874, + "loss": 0.4295, + "step": 14609 + }, + { + "epoch": 8.162011173184357, + "grad_norm": 2.7429211139678955, + "learning_rate": 0.0005938375350140056, + "loss": 0.3659, + "step": 14610 + }, + { + "epoch": 8.162569832402234, + "grad_norm": 0.46585479378700256, + "learning_rate": 0.0005938095238095238, + "loss": 0.4222, + "step": 14611 + }, + { + "epoch": 8.163128491620112, + "grad_norm": 0.4178381562232971, + "learning_rate": 0.000593781512605042, + "loss": 0.4079, + "step": 14612 + }, + { + "epoch": 8.163687150837989, + "grad_norm": 0.6297781467437744, + "learning_rate": 0.0005937535014005603, + "loss": 0.3685, + "step": 14613 + }, + { + "epoch": 8.164245810055865, + "grad_norm": 0.45283186435699463, + "learning_rate": 0.0005937254901960784, + "loss": 0.3259, + "step": 14614 + }, + { + "epoch": 8.164804469273744, + "grad_norm": 1.1479166746139526, + "learning_rate": 0.0005936974789915966, + "loss": 0.3486, + "step": 14615 + }, + { + "epoch": 8.16536312849162, + "grad_norm": 1.191484808921814, + "learning_rate": 0.0005936694677871148, + "loss": 0.3828, + "step": 14616 + }, + { + "epoch": 8.165921787709497, + "grad_norm": 0.763431191444397, + "learning_rate": 0.000593641456582633, + "loss": 0.5039, + "step": 14617 + }, + { + "epoch": 8.166480446927375, + "grad_norm": 0.6036748290061951, + "learning_rate": 0.0005936134453781513, + "loss": 0.4643, + "step": 14618 + }, + { + "epoch": 8.167039106145252, + "grad_norm": 1.3252840042114258, + "learning_rate": 0.0005935854341736696, + "loss": 0.6149, + "step": 14619 + }, + { + "epoch": 8.167597765363128, + "grad_norm": 0.4684814214706421, + "learning_rate": 0.0005935574229691877, + "loss": 0.3629, + "step": 14620 + }, + { + "epoch": 8.168156424581005, + "grad_norm": 0.593262255191803, + "learning_rate": 0.0005935294117647059, + "loss": 0.3708, + "step": 14621 + }, + { + "epoch": 8.168715083798883, + "grad_norm": 1.5334376096725464, + "learning_rate": 0.0005935014005602241, + "loss": 0.4969, + "step": 14622 + }, + { + "epoch": 8.16927374301676, + "grad_norm": 0.44006046652793884, + "learning_rate": 0.0005934733893557424, + "loss": 0.4523, + "step": 14623 + }, + { + "epoch": 8.169832402234636, + "grad_norm": 0.604774534702301, + "learning_rate": 0.0005934453781512606, + "loss": 0.4958, + "step": 14624 + }, + { + "epoch": 8.170391061452515, + "grad_norm": 1.083076000213623, + "learning_rate": 0.0005934173669467787, + "loss": 0.3813, + "step": 14625 + }, + { + "epoch": 8.170949720670391, + "grad_norm": 0.9524022936820984, + "learning_rate": 0.0005933893557422969, + "loss": 0.4315, + "step": 14626 + }, + { + "epoch": 8.171508379888268, + "grad_norm": 1.3805259466171265, + "learning_rate": 0.0005933613445378151, + "loss": 0.5444, + "step": 14627 + }, + { + "epoch": 8.172067039106146, + "grad_norm": 0.612502932548523, + "learning_rate": 0.0005933333333333334, + "loss": 0.4183, + "step": 14628 + }, + { + "epoch": 8.172625698324023, + "grad_norm": 0.7701265215873718, + "learning_rate": 0.0005933053221288516, + "loss": 0.5105, + "step": 14629 + }, + { + "epoch": 8.1731843575419, + "grad_norm": 0.47687798738479614, + "learning_rate": 0.0005932773109243697, + "loss": 0.4961, + "step": 14630 + }, + { + "epoch": 8.173743016759776, + "grad_norm": 0.3955836594104767, + "learning_rate": 0.0005932492997198879, + "loss": 0.3551, + "step": 14631 + }, + { + "epoch": 8.174301675977654, + "grad_norm": 0.40008121728897095, + "learning_rate": 0.0005932212885154061, + "loss": 0.4104, + "step": 14632 + }, + { + "epoch": 8.17486033519553, + "grad_norm": 2.1390469074249268, + "learning_rate": 0.0005931932773109244, + "loss": 0.4783, + "step": 14633 + }, + { + "epoch": 8.175418994413407, + "grad_norm": 0.8249197006225586, + "learning_rate": 0.0005931652661064426, + "loss": 0.5323, + "step": 14634 + }, + { + "epoch": 8.175977653631286, + "grad_norm": 0.8177069425582886, + "learning_rate": 0.0005931372549019608, + "loss": 0.4727, + "step": 14635 + }, + { + "epoch": 8.176536312849162, + "grad_norm": 0.6845154762268066, + "learning_rate": 0.000593109243697479, + "loss": 0.4864, + "step": 14636 + }, + { + "epoch": 8.177094972067039, + "grad_norm": 0.6187852025032043, + "learning_rate": 0.0005930812324929972, + "loss": 0.4349, + "step": 14637 + }, + { + "epoch": 8.177653631284917, + "grad_norm": 0.4934499263763428, + "learning_rate": 0.0005930532212885155, + "loss": 0.5327, + "step": 14638 + }, + { + "epoch": 8.178212290502794, + "grad_norm": 0.8398798108100891, + "learning_rate": 0.0005930252100840337, + "loss": 0.5663, + "step": 14639 + }, + { + "epoch": 8.17877094972067, + "grad_norm": 0.7031766772270203, + "learning_rate": 0.0005929971988795519, + "loss": 0.418, + "step": 14640 + }, + { + "epoch": 8.179329608938547, + "grad_norm": 0.46452808380126953, + "learning_rate": 0.00059296918767507, + "loss": 0.3706, + "step": 14641 + }, + { + "epoch": 8.179888268156425, + "grad_norm": 0.5066023468971252, + "learning_rate": 0.0005929411764705882, + "loss": 0.4157, + "step": 14642 + }, + { + "epoch": 8.180446927374302, + "grad_norm": 1.4259673357009888, + "learning_rate": 0.0005929131652661065, + "loss": 0.5676, + "step": 14643 + }, + { + "epoch": 8.181005586592178, + "grad_norm": 0.5272063612937927, + "learning_rate": 0.0005928851540616247, + "loss": 0.4865, + "step": 14644 + }, + { + "epoch": 8.181564245810057, + "grad_norm": 0.7903665900230408, + "learning_rate": 0.0005928571428571429, + "loss": 0.5629, + "step": 14645 + }, + { + "epoch": 8.182122905027933, + "grad_norm": 0.9882017374038696, + "learning_rate": 0.000592829131652661, + "loss": 0.4886, + "step": 14646 + }, + { + "epoch": 8.18268156424581, + "grad_norm": 0.49136465787887573, + "learning_rate": 0.0005928011204481792, + "loss": 0.4844, + "step": 14647 + }, + { + "epoch": 8.183240223463686, + "grad_norm": 0.8579321503639221, + "learning_rate": 0.0005927731092436975, + "loss": 0.4557, + "step": 14648 + }, + { + "epoch": 8.183798882681565, + "grad_norm": 0.7831169366836548, + "learning_rate": 0.0005927450980392157, + "loss": 0.4687, + "step": 14649 + }, + { + "epoch": 8.184357541899441, + "grad_norm": 0.3897453248500824, + "learning_rate": 0.0005927170868347339, + "loss": 0.3899, + "step": 14650 + }, + { + "epoch": 8.184916201117318, + "grad_norm": 0.3552367687225342, + "learning_rate": 0.0005926890756302521, + "loss": 0.3602, + "step": 14651 + }, + { + "epoch": 8.185474860335196, + "grad_norm": 0.9231951236724854, + "learning_rate": 0.0005926610644257702, + "loss": 0.4286, + "step": 14652 + }, + { + "epoch": 8.186033519553073, + "grad_norm": 0.3408631980419159, + "learning_rate": 0.0005926330532212886, + "loss": 0.394, + "step": 14653 + }, + { + "epoch": 8.18659217877095, + "grad_norm": 0.5777474641799927, + "learning_rate": 0.0005926050420168068, + "loss": 0.4021, + "step": 14654 + }, + { + "epoch": 8.187150837988828, + "grad_norm": 1.9134604930877686, + "learning_rate": 0.000592577030812325, + "loss": 0.5811, + "step": 14655 + }, + { + "epoch": 8.187709497206704, + "grad_norm": 0.519878089427948, + "learning_rate": 0.0005925490196078432, + "loss": 0.3929, + "step": 14656 + }, + { + "epoch": 8.18826815642458, + "grad_norm": 0.46713653206825256, + "learning_rate": 0.0005925210084033613, + "loss": 0.482, + "step": 14657 + }, + { + "epoch": 8.188826815642457, + "grad_norm": 0.5428266525268555, + "learning_rate": 0.0005924929971988796, + "loss": 0.4266, + "step": 14658 + }, + { + "epoch": 8.189385474860336, + "grad_norm": 0.6040323376655579, + "learning_rate": 0.0005924649859943978, + "loss": 0.4549, + "step": 14659 + }, + { + "epoch": 8.189944134078212, + "grad_norm": 0.4560234546661377, + "learning_rate": 0.000592436974789916, + "loss": 0.4911, + "step": 14660 + }, + { + "epoch": 8.190502793296089, + "grad_norm": 0.6009203195571899, + "learning_rate": 0.0005924089635854342, + "loss": 0.4201, + "step": 14661 + }, + { + "epoch": 8.191061452513967, + "grad_norm": 0.42681869864463806, + "learning_rate": 0.0005923809523809523, + "loss": 0.4044, + "step": 14662 + }, + { + "epoch": 8.191620111731844, + "grad_norm": 0.6925278306007385, + "learning_rate": 0.0005923529411764706, + "loss": 0.5338, + "step": 14663 + }, + { + "epoch": 8.19217877094972, + "grad_norm": 0.5270254015922546, + "learning_rate": 0.0005923249299719888, + "loss": 0.6332, + "step": 14664 + }, + { + "epoch": 8.192737430167599, + "grad_norm": 0.4446677565574646, + "learning_rate": 0.000592296918767507, + "loss": 0.615, + "step": 14665 + }, + { + "epoch": 8.193296089385475, + "grad_norm": 1.4417424201965332, + "learning_rate": 0.0005922689075630252, + "loss": 0.4108, + "step": 14666 + }, + { + "epoch": 8.193854748603352, + "grad_norm": 0.6009916067123413, + "learning_rate": 0.0005922408963585434, + "loss": 0.406, + "step": 14667 + }, + { + "epoch": 8.194413407821228, + "grad_norm": 0.5024697184562683, + "learning_rate": 0.0005922128851540616, + "loss": 0.4359, + "step": 14668 + }, + { + "epoch": 8.194972067039107, + "grad_norm": 0.5052469372749329, + "learning_rate": 0.0005921848739495799, + "loss": 0.4521, + "step": 14669 + }, + { + "epoch": 8.195530726256983, + "grad_norm": 1.6570814847946167, + "learning_rate": 0.0005921568627450981, + "loss": 0.7235, + "step": 14670 + }, + { + "epoch": 8.19608938547486, + "grad_norm": 0.8787750005722046, + "learning_rate": 0.0005921288515406163, + "loss": 0.5639, + "step": 14671 + }, + { + "epoch": 8.196648044692738, + "grad_norm": 0.46569761633872986, + "learning_rate": 0.0005921008403361345, + "loss": 0.5512, + "step": 14672 + }, + { + "epoch": 8.197206703910615, + "grad_norm": 0.41483837366104126, + "learning_rate": 0.0005920728291316527, + "loss": 0.4842, + "step": 14673 + }, + { + "epoch": 8.197765363128491, + "grad_norm": 0.5686022043228149, + "learning_rate": 0.0005920448179271709, + "loss": 0.5086, + "step": 14674 + }, + { + "epoch": 8.19832402234637, + "grad_norm": 0.6898638010025024, + "learning_rate": 0.0005920168067226891, + "loss": 0.4715, + "step": 14675 + }, + { + "epoch": 8.198882681564246, + "grad_norm": 0.6545287370681763, + "learning_rate": 0.0005919887955182073, + "loss": 0.656, + "step": 14676 + }, + { + "epoch": 8.199441340782123, + "grad_norm": 0.5481773614883423, + "learning_rate": 0.0005919607843137255, + "loss": 0.3534, + "step": 14677 + }, + { + "epoch": 8.2, + "grad_norm": 0.7593606114387512, + "learning_rate": 0.0005919327731092437, + "loss": 0.343, + "step": 14678 + }, + { + "epoch": 8.200558659217878, + "grad_norm": 2.396329879760742, + "learning_rate": 0.0005919047619047619, + "loss": 0.4138, + "step": 14679 + }, + { + "epoch": 8.201117318435754, + "grad_norm": 0.4819258153438568, + "learning_rate": 0.0005918767507002801, + "loss": 0.3967, + "step": 14680 + }, + { + "epoch": 8.20167597765363, + "grad_norm": 0.4910144507884979, + "learning_rate": 0.0005918487394957983, + "loss": 0.4803, + "step": 14681 + }, + { + "epoch": 8.202234636871509, + "grad_norm": 0.713206946849823, + "learning_rate": 0.0005918207282913165, + "loss": 0.3215, + "step": 14682 + }, + { + "epoch": 8.202793296089386, + "grad_norm": 0.430527001619339, + "learning_rate": 0.0005917927170868348, + "loss": 0.3517, + "step": 14683 + }, + { + "epoch": 8.203351955307262, + "grad_norm": 5.664494514465332, + "learning_rate": 0.0005917647058823529, + "loss": 0.5239, + "step": 14684 + }, + { + "epoch": 8.203910614525139, + "grad_norm": 0.5775011777877808, + "learning_rate": 0.0005917366946778711, + "loss": 0.3743, + "step": 14685 + }, + { + "epoch": 8.204469273743017, + "grad_norm": 0.6236304044723511, + "learning_rate": 0.0005917086834733894, + "loss": 0.4016, + "step": 14686 + }, + { + "epoch": 8.205027932960894, + "grad_norm": 0.4228741526603699, + "learning_rate": 0.0005916806722689076, + "loss": 0.4528, + "step": 14687 + }, + { + "epoch": 8.20558659217877, + "grad_norm": 4.4908447265625, + "learning_rate": 0.0005916526610644259, + "loss": 0.4665, + "step": 14688 + }, + { + "epoch": 8.206145251396649, + "grad_norm": 0.7010394930839539, + "learning_rate": 0.000591624649859944, + "loss": 0.4896, + "step": 14689 + }, + { + "epoch": 8.206703910614525, + "grad_norm": 0.6479232311248779, + "learning_rate": 0.0005915966386554622, + "loss": 0.4491, + "step": 14690 + }, + { + "epoch": 8.207262569832402, + "grad_norm": 0.46199384331703186, + "learning_rate": 0.0005915686274509804, + "loss": 0.4074, + "step": 14691 + }, + { + "epoch": 8.20782122905028, + "grad_norm": 0.37624022364616394, + "learning_rate": 0.0005915406162464986, + "loss": 0.4328, + "step": 14692 + }, + { + "epoch": 8.208379888268157, + "grad_norm": 0.5073803663253784, + "learning_rate": 0.0005915126050420169, + "loss": 0.528, + "step": 14693 + }, + { + "epoch": 8.208938547486033, + "grad_norm": 0.5571429133415222, + "learning_rate": 0.000591484593837535, + "loss": 0.4453, + "step": 14694 + }, + { + "epoch": 8.20949720670391, + "grad_norm": 0.5724432468414307, + "learning_rate": 0.0005914565826330532, + "loss": 0.4459, + "step": 14695 + }, + { + "epoch": 8.210055865921788, + "grad_norm": 6.897602558135986, + "learning_rate": 0.0005914285714285714, + "loss": 0.5211, + "step": 14696 + }, + { + "epoch": 8.210614525139665, + "grad_norm": 0.4925957918167114, + "learning_rate": 0.0005914005602240896, + "loss": 0.4479, + "step": 14697 + }, + { + "epoch": 8.211173184357541, + "grad_norm": 0.5022900700569153, + "learning_rate": 0.0005913725490196079, + "loss": 0.4388, + "step": 14698 + }, + { + "epoch": 8.21173184357542, + "grad_norm": 0.42731478810310364, + "learning_rate": 0.0005913445378151261, + "loss": 0.4333, + "step": 14699 + }, + { + "epoch": 8.212290502793296, + "grad_norm": 0.9523333311080933, + "learning_rate": 0.0005913165266106442, + "loss": 0.6219, + "step": 14700 + }, + { + "epoch": 8.212849162011173, + "grad_norm": 0.6128084063529968, + "learning_rate": 0.0005912885154061624, + "loss": 0.3607, + "step": 14701 + }, + { + "epoch": 8.213407821229051, + "grad_norm": 0.7442939281463623, + "learning_rate": 0.0005912605042016807, + "loss": 0.4726, + "step": 14702 + }, + { + "epoch": 8.213966480446928, + "grad_norm": 1.232234239578247, + "learning_rate": 0.000591232492997199, + "loss": 0.4757, + "step": 14703 + }, + { + "epoch": 8.214525139664804, + "grad_norm": 0.7523201704025269, + "learning_rate": 0.0005912044817927172, + "loss": 0.5099, + "step": 14704 + }, + { + "epoch": 8.21508379888268, + "grad_norm": 0.9683617353439331, + "learning_rate": 0.0005911764705882353, + "loss": 0.5804, + "step": 14705 + }, + { + "epoch": 8.21564245810056, + "grad_norm": 0.4094729721546173, + "learning_rate": 0.0005911484593837535, + "loss": 0.4704, + "step": 14706 + }, + { + "epoch": 8.216201117318436, + "grad_norm": 2.586884021759033, + "learning_rate": 0.0005911204481792717, + "loss": 0.3714, + "step": 14707 + }, + { + "epoch": 8.216759776536312, + "grad_norm": 0.5156849026679993, + "learning_rate": 0.00059109243697479, + "loss": 0.4664, + "step": 14708 + }, + { + "epoch": 8.21731843575419, + "grad_norm": 0.4346437156200409, + "learning_rate": 0.0005910644257703082, + "loss": 0.4427, + "step": 14709 + }, + { + "epoch": 8.217877094972067, + "grad_norm": 0.5476559996604919, + "learning_rate": 0.0005910364145658263, + "loss": 0.503, + "step": 14710 + }, + { + "epoch": 8.218435754189944, + "grad_norm": 0.7687256932258606, + "learning_rate": 0.0005910084033613445, + "loss": 0.4863, + "step": 14711 + }, + { + "epoch": 8.21899441340782, + "grad_norm": 1.872087001800537, + "learning_rate": 0.0005909803921568627, + "loss": 0.4466, + "step": 14712 + }, + { + "epoch": 8.219553072625699, + "grad_norm": 0.40305033326148987, + "learning_rate": 0.000590952380952381, + "loss": 0.4211, + "step": 14713 + }, + { + "epoch": 8.220111731843575, + "grad_norm": 1.1250699758529663, + "learning_rate": 0.0005909243697478992, + "loss": 0.4989, + "step": 14714 + }, + { + "epoch": 8.220670391061452, + "grad_norm": 0.4963032007217407, + "learning_rate": 0.0005908963585434174, + "loss": 0.3835, + "step": 14715 + }, + { + "epoch": 8.22122905027933, + "grad_norm": 0.5300571918487549, + "learning_rate": 0.0005908683473389355, + "loss": 0.4368, + "step": 14716 + }, + { + "epoch": 8.221787709497207, + "grad_norm": 1.3161786794662476, + "learning_rate": 0.0005908403361344537, + "loss": 0.4646, + "step": 14717 + }, + { + "epoch": 8.222346368715083, + "grad_norm": 0.5102779269218445, + "learning_rate": 0.0005908123249299721, + "loss": 0.5128, + "step": 14718 + }, + { + "epoch": 8.222905027932962, + "grad_norm": 0.744270384311676, + "learning_rate": 0.0005907843137254903, + "loss": 0.5052, + "step": 14719 + }, + { + "epoch": 8.223463687150838, + "grad_norm": 0.4997437596321106, + "learning_rate": 0.0005907563025210085, + "loss": 0.4973, + "step": 14720 + }, + { + "epoch": 8.224022346368715, + "grad_norm": 2.8112409114837646, + "learning_rate": 0.0005907282913165266, + "loss": 0.3742, + "step": 14721 + }, + { + "epoch": 8.224581005586591, + "grad_norm": 0.4091721177101135, + "learning_rate": 0.0005907002801120448, + "loss": 0.4351, + "step": 14722 + }, + { + "epoch": 8.22513966480447, + "grad_norm": 0.5062128305435181, + "learning_rate": 0.0005906722689075631, + "loss": 0.5213, + "step": 14723 + }, + { + "epoch": 8.225698324022346, + "grad_norm": 0.7321590781211853, + "learning_rate": 0.0005906442577030813, + "loss": 0.4003, + "step": 14724 + }, + { + "epoch": 8.226256983240223, + "grad_norm": 0.5163283348083496, + "learning_rate": 0.0005906162464985995, + "loss": 0.4479, + "step": 14725 + }, + { + "epoch": 8.226815642458101, + "grad_norm": 0.44984593987464905, + "learning_rate": 0.0005905882352941176, + "loss": 0.3771, + "step": 14726 + }, + { + "epoch": 8.227374301675978, + "grad_norm": 0.48010358214378357, + "learning_rate": 0.0005905602240896358, + "loss": 0.438, + "step": 14727 + }, + { + "epoch": 8.227932960893854, + "grad_norm": 0.503609836101532, + "learning_rate": 0.000590532212885154, + "loss": 0.4221, + "step": 14728 + }, + { + "epoch": 8.228491620111733, + "grad_norm": 0.40139448642730713, + "learning_rate": 0.0005905042016806723, + "loss": 0.3252, + "step": 14729 + }, + { + "epoch": 8.22905027932961, + "grad_norm": 0.4938288629055023, + "learning_rate": 0.0005904761904761905, + "loss": 0.4279, + "step": 14730 + }, + { + "epoch": 8.229608938547486, + "grad_norm": 1.9487265348434448, + "learning_rate": 0.0005904481792717087, + "loss": 0.4552, + "step": 14731 + }, + { + "epoch": 8.230167597765362, + "grad_norm": 0.471813440322876, + "learning_rate": 0.0005904201680672268, + "loss": 0.426, + "step": 14732 + }, + { + "epoch": 8.23072625698324, + "grad_norm": 0.5210540294647217, + "learning_rate": 0.000590392156862745, + "loss": 0.428, + "step": 14733 + }, + { + "epoch": 8.231284916201117, + "grad_norm": 0.7190614342689514, + "learning_rate": 0.0005903641456582634, + "loss": 0.3844, + "step": 14734 + }, + { + "epoch": 8.231843575418994, + "grad_norm": 0.5396774411201477, + "learning_rate": 0.0005903361344537816, + "loss": 0.4167, + "step": 14735 + }, + { + "epoch": 8.232402234636872, + "grad_norm": 0.602973461151123, + "learning_rate": 0.0005903081232492998, + "loss": 0.3907, + "step": 14736 + }, + { + "epoch": 8.232960893854749, + "grad_norm": 0.39327582716941833, + "learning_rate": 0.0005902801120448179, + "loss": 0.4841, + "step": 14737 + }, + { + "epoch": 8.233519553072625, + "grad_norm": 0.4999508261680603, + "learning_rate": 0.0005902521008403361, + "loss": 0.5663, + "step": 14738 + }, + { + "epoch": 8.234078212290504, + "grad_norm": 0.6242808699607849, + "learning_rate": 0.0005902240896358544, + "loss": 0.3476, + "step": 14739 + }, + { + "epoch": 8.23463687150838, + "grad_norm": 0.8377295136451721, + "learning_rate": 0.0005901960784313726, + "loss": 0.4243, + "step": 14740 + }, + { + "epoch": 8.235195530726257, + "grad_norm": 0.4045666754245758, + "learning_rate": 0.0005901680672268908, + "loss": 0.4446, + "step": 14741 + }, + { + "epoch": 8.235754189944133, + "grad_norm": 0.7267147898674011, + "learning_rate": 0.0005901400560224089, + "loss": 0.6043, + "step": 14742 + }, + { + "epoch": 8.236312849162012, + "grad_norm": 0.53114253282547, + "learning_rate": 0.0005901120448179271, + "loss": 0.6431, + "step": 14743 + }, + { + "epoch": 8.236871508379888, + "grad_norm": 0.43891292810440063, + "learning_rate": 0.0005900840336134454, + "loss": 0.4504, + "step": 14744 + }, + { + "epoch": 8.237430167597765, + "grad_norm": 0.47547683119773865, + "learning_rate": 0.0005900560224089636, + "loss": 0.4518, + "step": 14745 + }, + { + "epoch": 8.237988826815643, + "grad_norm": 0.441685289144516, + "learning_rate": 0.0005900280112044818, + "loss": 0.4763, + "step": 14746 + }, + { + "epoch": 8.23854748603352, + "grad_norm": 0.7056431174278259, + "learning_rate": 0.00059, + "loss": 0.411, + "step": 14747 + }, + { + "epoch": 8.239106145251396, + "grad_norm": 0.7852954268455505, + "learning_rate": 0.0005899719887955181, + "loss": 0.4232, + "step": 14748 + }, + { + "epoch": 8.239664804469275, + "grad_norm": 0.6250525116920471, + "learning_rate": 0.0005899439775910364, + "loss": 0.4849, + "step": 14749 + }, + { + "epoch": 8.240223463687151, + "grad_norm": 0.42558979988098145, + "learning_rate": 0.0005899159663865546, + "loss": 0.4631, + "step": 14750 + }, + { + "epoch": 8.240782122905028, + "grad_norm": 2.365145683288574, + "learning_rate": 0.0005898879551820729, + "loss": 0.4091, + "step": 14751 + }, + { + "epoch": 8.241340782122904, + "grad_norm": 0.5177446603775024, + "learning_rate": 0.0005898599439775911, + "loss": 0.3935, + "step": 14752 + }, + { + "epoch": 8.241899441340783, + "grad_norm": 0.3864086866378784, + "learning_rate": 0.0005898319327731092, + "loss": 0.4146, + "step": 14753 + }, + { + "epoch": 8.24245810055866, + "grad_norm": 0.6299644708633423, + "learning_rate": 0.0005898039215686275, + "loss": 0.4555, + "step": 14754 + }, + { + "epoch": 8.243016759776536, + "grad_norm": 0.3918527662754059, + "learning_rate": 0.0005897759103641457, + "loss": 0.3623, + "step": 14755 + }, + { + "epoch": 8.243575418994414, + "grad_norm": 0.4721129834651947, + "learning_rate": 0.0005897478991596639, + "loss": 0.4337, + "step": 14756 + }, + { + "epoch": 8.24413407821229, + "grad_norm": 0.7677056789398193, + "learning_rate": 0.0005897198879551821, + "loss": 0.4758, + "step": 14757 + }, + { + "epoch": 8.244692737430167, + "grad_norm": 0.5721067786216736, + "learning_rate": 0.0005896918767507002, + "loss": 0.589, + "step": 14758 + }, + { + "epoch": 8.245251396648044, + "grad_norm": 1.1299498081207275, + "learning_rate": 0.0005896638655462185, + "loss": 0.5644, + "step": 14759 + }, + { + "epoch": 8.245810055865922, + "grad_norm": 0.5949721336364746, + "learning_rate": 0.0005896358543417367, + "loss": 0.4949, + "step": 14760 + }, + { + "epoch": 8.246368715083799, + "grad_norm": 0.5529798865318298, + "learning_rate": 0.0005896078431372549, + "loss": 0.5172, + "step": 14761 + }, + { + "epoch": 8.246927374301675, + "grad_norm": 0.615172266960144, + "learning_rate": 0.0005895798319327731, + "loss": 0.5837, + "step": 14762 + }, + { + "epoch": 8.247486033519554, + "grad_norm": 0.4608076810836792, + "learning_rate": 0.0005895518207282913, + "loss": 0.3914, + "step": 14763 + }, + { + "epoch": 8.24804469273743, + "grad_norm": 0.38453319668769836, + "learning_rate": 0.0005895238095238095, + "loss": 0.3675, + "step": 14764 + }, + { + "epoch": 8.248603351955307, + "grad_norm": 0.752778172492981, + "learning_rate": 0.0005894957983193277, + "loss": 0.4572, + "step": 14765 + }, + { + "epoch": 8.249162011173185, + "grad_norm": 0.5437651872634888, + "learning_rate": 0.0005894677871148459, + "loss": 0.6394, + "step": 14766 + }, + { + "epoch": 8.249720670391062, + "grad_norm": 0.6189334392547607, + "learning_rate": 0.0005894397759103641, + "loss": 0.3928, + "step": 14767 + }, + { + "epoch": 8.250279329608938, + "grad_norm": 0.9584677219390869, + "learning_rate": 0.0005894117647058824, + "loss": 0.4543, + "step": 14768 + }, + { + "epoch": 8.250837988826815, + "grad_norm": 0.5113201141357422, + "learning_rate": 0.0005893837535014006, + "loss": 0.4851, + "step": 14769 + }, + { + "epoch": 8.251396648044693, + "grad_norm": 0.41447630524635315, + "learning_rate": 0.0005893557422969188, + "loss": 0.4632, + "step": 14770 + }, + { + "epoch": 8.25195530726257, + "grad_norm": 0.5667605400085449, + "learning_rate": 0.000589327731092437, + "loss": 0.3922, + "step": 14771 + }, + { + "epoch": 8.252513966480446, + "grad_norm": 3.096233367919922, + "learning_rate": 0.0005892997198879552, + "loss": 0.5371, + "step": 14772 + }, + { + "epoch": 8.253072625698325, + "grad_norm": 0.4266369044780731, + "learning_rate": 0.0005892717086834734, + "loss": 0.368, + "step": 14773 + }, + { + "epoch": 8.253631284916201, + "grad_norm": 2.1888341903686523, + "learning_rate": 0.0005892436974789917, + "loss": 0.592, + "step": 14774 + }, + { + "epoch": 8.254189944134078, + "grad_norm": 0.5836003422737122, + "learning_rate": 0.0005892156862745098, + "loss": 0.4407, + "step": 14775 + }, + { + "epoch": 8.254748603351956, + "grad_norm": 0.6670942306518555, + "learning_rate": 0.000589187675070028, + "loss": 0.5716, + "step": 14776 + }, + { + "epoch": 8.255307262569833, + "grad_norm": 0.5108802914619446, + "learning_rate": 0.0005891596638655462, + "loss": 0.442, + "step": 14777 + }, + { + "epoch": 8.25586592178771, + "grad_norm": 0.8770310282707214, + "learning_rate": 0.0005891316526610644, + "loss": 0.5376, + "step": 14778 + }, + { + "epoch": 8.256424581005586, + "grad_norm": 0.49834105372428894, + "learning_rate": 0.0005891036414565827, + "loss": 0.4849, + "step": 14779 + }, + { + "epoch": 8.256983240223464, + "grad_norm": 0.538061261177063, + "learning_rate": 0.0005890756302521008, + "loss": 0.4243, + "step": 14780 + }, + { + "epoch": 8.25754189944134, + "grad_norm": 0.43380460143089294, + "learning_rate": 0.000589047619047619, + "loss": 0.4664, + "step": 14781 + }, + { + "epoch": 8.258100558659217, + "grad_norm": 0.5464326739311218, + "learning_rate": 0.0005890196078431372, + "loss": 0.4397, + "step": 14782 + }, + { + "epoch": 8.258659217877096, + "grad_norm": 2.4383933544158936, + "learning_rate": 0.0005889915966386554, + "loss": 0.2838, + "step": 14783 + }, + { + "epoch": 8.259217877094972, + "grad_norm": 0.6485145688056946, + "learning_rate": 0.0005889635854341738, + "loss": 0.5056, + "step": 14784 + }, + { + "epoch": 8.259776536312849, + "grad_norm": 0.4227007329463959, + "learning_rate": 0.0005889355742296919, + "loss": 0.3799, + "step": 14785 + }, + { + "epoch": 8.260335195530725, + "grad_norm": 0.6206688284873962, + "learning_rate": 0.0005889075630252101, + "loss": 0.3493, + "step": 14786 + }, + { + "epoch": 8.260893854748604, + "grad_norm": 0.7826471924781799, + "learning_rate": 0.0005888795518207283, + "loss": 0.4162, + "step": 14787 + }, + { + "epoch": 8.26145251396648, + "grad_norm": 2.4297897815704346, + "learning_rate": 0.0005888515406162465, + "loss": 0.506, + "step": 14788 + }, + { + "epoch": 8.262011173184357, + "grad_norm": 0.5957176089286804, + "learning_rate": 0.0005888235294117648, + "loss": 0.5489, + "step": 14789 + }, + { + "epoch": 8.262569832402235, + "grad_norm": 0.7456678748130798, + "learning_rate": 0.000588795518207283, + "loss": 0.5109, + "step": 14790 + }, + { + "epoch": 8.263128491620112, + "grad_norm": 0.3750740587711334, + "learning_rate": 0.0005887675070028011, + "loss": 0.3543, + "step": 14791 + }, + { + "epoch": 8.263687150837988, + "grad_norm": 1.6128426790237427, + "learning_rate": 0.0005887394957983193, + "loss": 0.4306, + "step": 14792 + }, + { + "epoch": 8.264245810055867, + "grad_norm": 0.40688377618789673, + "learning_rate": 0.0005887114845938375, + "loss": 0.415, + "step": 14793 + }, + { + "epoch": 8.264804469273743, + "grad_norm": 0.45110994577407837, + "learning_rate": 0.0005886834733893558, + "loss": 0.3979, + "step": 14794 + }, + { + "epoch": 8.26536312849162, + "grad_norm": 0.46282047033309937, + "learning_rate": 0.000588655462184874, + "loss": 0.3669, + "step": 14795 + }, + { + "epoch": 8.265921787709496, + "grad_norm": 0.7603232264518738, + "learning_rate": 0.0005886274509803921, + "loss": 0.5639, + "step": 14796 + }, + { + "epoch": 8.266480446927375, + "grad_norm": 0.641492486000061, + "learning_rate": 0.0005885994397759103, + "loss": 0.3854, + "step": 14797 + }, + { + "epoch": 8.267039106145251, + "grad_norm": 0.7381540536880493, + "learning_rate": 0.0005885714285714285, + "loss": 0.5072, + "step": 14798 + }, + { + "epoch": 8.267597765363128, + "grad_norm": 0.616783857345581, + "learning_rate": 0.0005885434173669468, + "loss": 0.4772, + "step": 14799 + }, + { + "epoch": 8.268156424581006, + "grad_norm": 0.6735701560974121, + "learning_rate": 0.0005885154061624651, + "loss": 0.3688, + "step": 14800 + }, + { + "epoch": 8.268715083798883, + "grad_norm": 1.3496677875518799, + "learning_rate": 0.0005884873949579832, + "loss": 0.4789, + "step": 14801 + }, + { + "epoch": 8.26927374301676, + "grad_norm": 0.4321037232875824, + "learning_rate": 0.0005884593837535014, + "loss": 0.4514, + "step": 14802 + }, + { + "epoch": 8.269832402234638, + "grad_norm": 0.38056668639183044, + "learning_rate": 0.0005884313725490196, + "loss": 0.3793, + "step": 14803 + }, + { + "epoch": 8.270391061452514, + "grad_norm": 0.44099777936935425, + "learning_rate": 0.0005884033613445379, + "loss": 0.4295, + "step": 14804 + }, + { + "epoch": 8.27094972067039, + "grad_norm": 0.3462850749492645, + "learning_rate": 0.0005883753501400561, + "loss": 0.4106, + "step": 14805 + }, + { + "epoch": 8.271508379888267, + "grad_norm": 0.5856468081474304, + "learning_rate": 0.0005883473389355743, + "loss": 0.4962, + "step": 14806 + }, + { + "epoch": 8.272067039106146, + "grad_norm": 0.4173562824726105, + "learning_rate": 0.0005883193277310924, + "loss": 0.4585, + "step": 14807 + }, + { + "epoch": 8.272625698324022, + "grad_norm": 0.4310797154903412, + "learning_rate": 0.0005882913165266106, + "loss": 0.3862, + "step": 14808 + }, + { + "epoch": 8.273184357541899, + "grad_norm": 0.461738258600235, + "learning_rate": 0.0005882633053221289, + "loss": 0.5116, + "step": 14809 + }, + { + "epoch": 8.273743016759777, + "grad_norm": 0.41237929463386536, + "learning_rate": 0.0005882352941176471, + "loss": 0.392, + "step": 14810 + }, + { + "epoch": 8.274301675977654, + "grad_norm": 0.46872952580451965, + "learning_rate": 0.0005882072829131653, + "loss": 0.4341, + "step": 14811 + }, + { + "epoch": 8.27486033519553, + "grad_norm": 0.49128663539886475, + "learning_rate": 0.0005881792717086834, + "loss": 0.5157, + "step": 14812 + }, + { + "epoch": 8.275418994413409, + "grad_norm": 0.5641964077949524, + "learning_rate": 0.0005881512605042016, + "loss": 0.5872, + "step": 14813 + }, + { + "epoch": 8.275977653631285, + "grad_norm": 0.379846453666687, + "learning_rate": 0.0005881232492997199, + "loss": 0.3761, + "step": 14814 + }, + { + "epoch": 8.276536312849162, + "grad_norm": 0.5453080534934998, + "learning_rate": 0.0005880952380952381, + "loss": 0.3727, + "step": 14815 + }, + { + "epoch": 8.277094972067038, + "grad_norm": 0.6268227696418762, + "learning_rate": 0.0005880672268907564, + "loss": 0.4792, + "step": 14816 + }, + { + "epoch": 8.277653631284917, + "grad_norm": 0.8663527965545654, + "learning_rate": 0.0005880392156862744, + "loss": 0.6346, + "step": 14817 + }, + { + "epoch": 8.278212290502793, + "grad_norm": 0.5720897912979126, + "learning_rate": 0.0005880112044817927, + "loss": 0.4205, + "step": 14818 + }, + { + "epoch": 8.27877094972067, + "grad_norm": 0.6963171362876892, + "learning_rate": 0.000587983193277311, + "loss": 0.4822, + "step": 14819 + }, + { + "epoch": 8.279329608938548, + "grad_norm": 0.766904354095459, + "learning_rate": 0.0005879551820728292, + "loss": 0.4264, + "step": 14820 + }, + { + "epoch": 8.279888268156425, + "grad_norm": 0.3742465674877167, + "learning_rate": 0.0005879271708683474, + "loss": 0.4017, + "step": 14821 + }, + { + "epoch": 8.280446927374301, + "grad_norm": 0.7821504473686218, + "learning_rate": 0.0005878991596638656, + "loss": 0.4799, + "step": 14822 + }, + { + "epoch": 8.28100558659218, + "grad_norm": 0.6238220930099487, + "learning_rate": 0.0005878711484593837, + "loss": 0.4299, + "step": 14823 + }, + { + "epoch": 8.281564245810056, + "grad_norm": 0.8606948852539062, + "learning_rate": 0.000587843137254902, + "loss": 0.4149, + "step": 14824 + }, + { + "epoch": 8.282122905027933, + "grad_norm": 1.495185136795044, + "learning_rate": 0.0005878151260504202, + "loss": 0.3506, + "step": 14825 + }, + { + "epoch": 8.28268156424581, + "grad_norm": 2.243377208709717, + "learning_rate": 0.0005877871148459384, + "loss": 0.5415, + "step": 14826 + }, + { + "epoch": 8.283240223463688, + "grad_norm": 0.3555644750595093, + "learning_rate": 0.0005877591036414566, + "loss": 0.4041, + "step": 14827 + }, + { + "epoch": 8.283798882681564, + "grad_norm": 0.5263892412185669, + "learning_rate": 0.0005877310924369747, + "loss": 0.4354, + "step": 14828 + }, + { + "epoch": 8.28435754189944, + "grad_norm": 0.5312854647636414, + "learning_rate": 0.000587703081232493, + "loss": 0.5255, + "step": 14829 + }, + { + "epoch": 8.28491620111732, + "grad_norm": 1.4136950969696045, + "learning_rate": 0.0005876750700280112, + "loss": 0.3816, + "step": 14830 + }, + { + "epoch": 8.285474860335196, + "grad_norm": 0.642829179763794, + "learning_rate": 0.0005876470588235294, + "loss": 0.4379, + "step": 14831 + }, + { + "epoch": 8.286033519553072, + "grad_norm": 0.49640750885009766, + "learning_rate": 0.0005876190476190476, + "loss": 0.5183, + "step": 14832 + }, + { + "epoch": 8.286592178770949, + "grad_norm": 1.241936206817627, + "learning_rate": 0.0005875910364145657, + "loss": 0.3919, + "step": 14833 + }, + { + "epoch": 8.287150837988827, + "grad_norm": 0.5289754867553711, + "learning_rate": 0.0005875630252100841, + "loss": 0.3893, + "step": 14834 + }, + { + "epoch": 8.287709497206704, + "grad_norm": 0.41866251826286316, + "learning_rate": 0.0005875350140056023, + "loss": 0.4189, + "step": 14835 + }, + { + "epoch": 8.28826815642458, + "grad_norm": 0.6389116644859314, + "learning_rate": 0.0005875070028011205, + "loss": 0.5844, + "step": 14836 + }, + { + "epoch": 8.288826815642459, + "grad_norm": 3.5144050121307373, + "learning_rate": 0.0005874789915966387, + "loss": 0.4533, + "step": 14837 + }, + { + "epoch": 8.289385474860335, + "grad_norm": 0.6351314187049866, + "learning_rate": 0.0005874509803921569, + "loss": 0.4714, + "step": 14838 + }, + { + "epoch": 8.289944134078212, + "grad_norm": 2.8534011840820312, + "learning_rate": 0.0005874229691876751, + "loss": 0.4773, + "step": 14839 + }, + { + "epoch": 8.29050279329609, + "grad_norm": 0.5138262510299683, + "learning_rate": 0.0005873949579831933, + "loss": 0.416, + "step": 14840 + }, + { + "epoch": 8.291061452513967, + "grad_norm": 3.2907907962799072, + "learning_rate": 0.0005873669467787115, + "loss": 0.4474, + "step": 14841 + }, + { + "epoch": 8.291620111731843, + "grad_norm": 0.3572208285331726, + "learning_rate": 0.0005873389355742297, + "loss": 0.3088, + "step": 14842 + }, + { + "epoch": 8.29217877094972, + "grad_norm": 0.5566766858100891, + "learning_rate": 0.0005873109243697479, + "loss": 0.3802, + "step": 14843 + }, + { + "epoch": 8.292737430167598, + "grad_norm": 0.753349244594574, + "learning_rate": 0.0005872829131652661, + "loss": 0.4487, + "step": 14844 + }, + { + "epoch": 8.293296089385475, + "grad_norm": 0.4712984561920166, + "learning_rate": 0.0005872549019607843, + "loss": 0.4232, + "step": 14845 + }, + { + "epoch": 8.293854748603351, + "grad_norm": 0.8797717094421387, + "learning_rate": 0.0005872268907563025, + "loss": 0.3483, + "step": 14846 + }, + { + "epoch": 8.29441340782123, + "grad_norm": 1.030199408531189, + "learning_rate": 0.0005871988795518207, + "loss": 0.5208, + "step": 14847 + }, + { + "epoch": 8.294972067039106, + "grad_norm": 0.41478410363197327, + "learning_rate": 0.0005871708683473389, + "loss": 0.4291, + "step": 14848 + }, + { + "epoch": 8.295530726256983, + "grad_norm": 0.40925636887550354, + "learning_rate": 0.0005871428571428571, + "loss": 0.4043, + "step": 14849 + }, + { + "epoch": 8.296089385474861, + "grad_norm": 0.9737657904624939, + "learning_rate": 0.0005871148459383754, + "loss": 0.4006, + "step": 14850 + }, + { + "epoch": 8.296648044692738, + "grad_norm": 0.5900642275810242, + "learning_rate": 0.0005870868347338936, + "loss": 0.4699, + "step": 14851 + }, + { + "epoch": 8.297206703910614, + "grad_norm": 0.6438400149345398, + "learning_rate": 0.0005870588235294118, + "loss": 0.4955, + "step": 14852 + }, + { + "epoch": 8.297765363128491, + "grad_norm": 0.652421236038208, + "learning_rate": 0.00058703081232493, + "loss": 0.4258, + "step": 14853 + }, + { + "epoch": 8.29832402234637, + "grad_norm": 0.7232846021652222, + "learning_rate": 0.0005870028011204483, + "loss": 0.4107, + "step": 14854 + }, + { + "epoch": 8.298882681564246, + "grad_norm": 0.534042477607727, + "learning_rate": 0.0005869747899159664, + "loss": 0.3472, + "step": 14855 + }, + { + "epoch": 8.299441340782122, + "grad_norm": 0.5714625716209412, + "learning_rate": 0.0005869467787114846, + "loss": 0.4111, + "step": 14856 + }, + { + "epoch": 8.3, + "grad_norm": 3.703117847442627, + "learning_rate": 0.0005869187675070028, + "loss": 0.4347, + "step": 14857 + }, + { + "epoch": 8.300558659217877, + "grad_norm": 0.47215965390205383, + "learning_rate": 0.000586890756302521, + "loss": 0.4054, + "step": 14858 + }, + { + "epoch": 8.301117318435754, + "grad_norm": 0.5239530801773071, + "learning_rate": 0.0005868627450980393, + "loss": 0.4525, + "step": 14859 + }, + { + "epoch": 8.30167597765363, + "grad_norm": 0.36858704686164856, + "learning_rate": 0.0005868347338935574, + "loss": 0.3215, + "step": 14860 + }, + { + "epoch": 8.302234636871509, + "grad_norm": 0.6091682314872742, + "learning_rate": 0.0005868067226890756, + "loss": 0.348, + "step": 14861 + }, + { + "epoch": 8.302793296089385, + "grad_norm": 0.39357200264930725, + "learning_rate": 0.0005867787114845938, + "loss": 0.3912, + "step": 14862 + }, + { + "epoch": 8.303351955307262, + "grad_norm": 0.45767244696617126, + "learning_rate": 0.000586750700280112, + "loss": 0.4689, + "step": 14863 + }, + { + "epoch": 8.30391061452514, + "grad_norm": 0.5937530994415283, + "learning_rate": 0.0005867226890756303, + "loss": 0.3904, + "step": 14864 + }, + { + "epoch": 8.304469273743017, + "grad_norm": 0.6735968589782715, + "learning_rate": 0.0005866946778711484, + "loss": 0.4025, + "step": 14865 + }, + { + "epoch": 8.305027932960893, + "grad_norm": 0.5939727425575256, + "learning_rate": 0.0005866666666666667, + "loss": 0.4157, + "step": 14866 + }, + { + "epoch": 8.305586592178772, + "grad_norm": 0.4244139492511749, + "learning_rate": 0.0005866386554621849, + "loss": 0.3759, + "step": 14867 + }, + { + "epoch": 8.306145251396648, + "grad_norm": 0.3585618734359741, + "learning_rate": 0.0005866106442577031, + "loss": 0.3625, + "step": 14868 + }, + { + "epoch": 8.306703910614525, + "grad_norm": 0.9539227485656738, + "learning_rate": 0.0005865826330532214, + "loss": 0.4714, + "step": 14869 + }, + { + "epoch": 8.307262569832401, + "grad_norm": 0.475507527589798, + "learning_rate": 0.0005865546218487396, + "loss": 0.3395, + "step": 14870 + }, + { + "epoch": 8.30782122905028, + "grad_norm": 0.6727422475814819, + "learning_rate": 0.0005865266106442577, + "loss": 0.5992, + "step": 14871 + }, + { + "epoch": 8.308379888268156, + "grad_norm": 0.47513654828071594, + "learning_rate": 0.0005864985994397759, + "loss": 0.356, + "step": 14872 + }, + { + "epoch": 8.308938547486033, + "grad_norm": Infinity, + "learning_rate": 0.0005864985994397759, + "loss": 0.5356, + "step": 14873 + }, + { + "epoch": 8.309497206703911, + "grad_norm": 0.4587685763835907, + "learning_rate": 0.0005864705882352941, + "loss": 0.3335, + "step": 14874 + }, + { + "epoch": 8.310055865921788, + "grad_norm": 0.6229572892189026, + "learning_rate": 0.0005864425770308124, + "loss": 0.4461, + "step": 14875 + }, + { + "epoch": 8.310614525139664, + "grad_norm": 0.4542607069015503, + "learning_rate": 0.0005864145658263306, + "loss": 0.4247, + "step": 14876 + }, + { + "epoch": 8.311173184357543, + "grad_norm": 1.3235516548156738, + "learning_rate": 0.0005863865546218487, + "loss": 0.3939, + "step": 14877 + }, + { + "epoch": 8.31173184357542, + "grad_norm": 1.194519281387329, + "learning_rate": 0.0005863585434173669, + "loss": 0.3328, + "step": 14878 + }, + { + "epoch": 8.312290502793296, + "grad_norm": 0.644889235496521, + "learning_rate": 0.0005863305322128851, + "loss": 0.4413, + "step": 14879 + }, + { + "epoch": 8.312849162011172, + "grad_norm": 0.4533769190311432, + "learning_rate": 0.0005863025210084034, + "loss": 0.5828, + "step": 14880 + }, + { + "epoch": 8.31340782122905, + "grad_norm": 0.47157952189445496, + "learning_rate": 0.0005862745098039216, + "loss": 0.4517, + "step": 14881 + }, + { + "epoch": 8.313966480446927, + "grad_norm": 0.47866547107696533, + "learning_rate": 0.0005862464985994397, + "loss": 0.5156, + "step": 14882 + }, + { + "epoch": 8.314525139664804, + "grad_norm": 0.5529776811599731, + "learning_rate": 0.000586218487394958, + "loss": 0.5069, + "step": 14883 + }, + { + "epoch": 8.315083798882682, + "grad_norm": 0.5036071538925171, + "learning_rate": 0.0005861904761904762, + "loss": 0.3871, + "step": 14884 + }, + { + "epoch": 8.315642458100559, + "grad_norm": 2.540558338165283, + "learning_rate": 0.0005861624649859945, + "loss": 0.4002, + "step": 14885 + }, + { + "epoch": 8.316201117318435, + "grad_norm": 0.5781753063201904, + "learning_rate": 0.0005861344537815127, + "loss": 0.4731, + "step": 14886 + }, + { + "epoch": 8.316759776536314, + "grad_norm": 2.816850423812866, + "learning_rate": 0.0005861064425770309, + "loss": 0.3895, + "step": 14887 + }, + { + "epoch": 8.31731843575419, + "grad_norm": 0.7096949219703674, + "learning_rate": 0.000586078431372549, + "loss": 0.4181, + "step": 14888 + }, + { + "epoch": 8.317877094972067, + "grad_norm": 1.2901709079742432, + "learning_rate": 0.0005860504201680672, + "loss": 0.4351, + "step": 14889 + }, + { + "epoch": 8.318435754189943, + "grad_norm": 0.412121057510376, + "learning_rate": 0.0005860224089635855, + "loss": 0.4472, + "step": 14890 + }, + { + "epoch": 8.318994413407822, + "grad_norm": 0.35975179076194763, + "learning_rate": 0.0005859943977591037, + "loss": 0.3562, + "step": 14891 + }, + { + "epoch": 8.319553072625698, + "grad_norm": 0.3908674120903015, + "learning_rate": 0.0005859663865546219, + "loss": 0.4021, + "step": 14892 + }, + { + "epoch": 8.320111731843575, + "grad_norm": 1.7003639936447144, + "learning_rate": 0.00058593837535014, + "loss": 0.4258, + "step": 14893 + }, + { + "epoch": 8.320670391061453, + "grad_norm": 1.2953040599822998, + "learning_rate": 0.0005859103641456582, + "loss": 0.4537, + "step": 14894 + }, + { + "epoch": 8.32122905027933, + "grad_norm": 0.3916996419429779, + "learning_rate": 0.0005858823529411765, + "loss": 0.458, + "step": 14895 + }, + { + "epoch": 8.321787709497206, + "grad_norm": 0.6036856174468994, + "learning_rate": 0.0005858543417366947, + "loss": 0.2812, + "step": 14896 + }, + { + "epoch": 8.322346368715085, + "grad_norm": 1.1489933729171753, + "learning_rate": 0.0005858263305322129, + "loss": 0.5304, + "step": 14897 + }, + { + "epoch": 8.322905027932961, + "grad_norm": 0.5643019676208496, + "learning_rate": 0.000585798319327731, + "loss": 0.4582, + "step": 14898 + }, + { + "epoch": 8.323463687150838, + "grad_norm": 0.5098575353622437, + "learning_rate": 0.0005857703081232492, + "loss": 0.4368, + "step": 14899 + }, + { + "epoch": 8.324022346368714, + "grad_norm": 0.8201267123222351, + "learning_rate": 0.0005857422969187676, + "loss": 0.3759, + "step": 14900 + }, + { + "epoch": 8.324581005586593, + "grad_norm": 0.7555643916130066, + "learning_rate": 0.0005857142857142858, + "loss": 0.5012, + "step": 14901 + }, + { + "epoch": 8.32513966480447, + "grad_norm": 0.8516841530799866, + "learning_rate": 0.000585686274509804, + "loss": 0.6133, + "step": 14902 + }, + { + "epoch": 8.325698324022346, + "grad_norm": 0.5094266533851624, + "learning_rate": 0.0005856582633053222, + "loss": 0.5723, + "step": 14903 + }, + { + "epoch": 8.326256983240224, + "grad_norm": 0.632649302482605, + "learning_rate": 0.0005856302521008403, + "loss": 0.4469, + "step": 14904 + }, + { + "epoch": 8.3268156424581, + "grad_norm": 0.5945995450019836, + "learning_rate": 0.0005856022408963586, + "loss": 0.3827, + "step": 14905 + }, + { + "epoch": 8.327374301675977, + "grad_norm": 0.4750930368900299, + "learning_rate": 0.0005855742296918768, + "loss": 0.3888, + "step": 14906 + }, + { + "epoch": 8.327932960893854, + "grad_norm": 0.46484503149986267, + "learning_rate": 0.000585546218487395, + "loss": 0.4446, + "step": 14907 + }, + { + "epoch": 8.328491620111732, + "grad_norm": 0.6619868278503418, + "learning_rate": 0.0005855182072829132, + "loss": 0.423, + "step": 14908 + }, + { + "epoch": 8.329050279329609, + "grad_norm": 0.7141128182411194, + "learning_rate": 0.0005854901960784313, + "loss": 0.4426, + "step": 14909 + }, + { + "epoch": 8.329608938547485, + "grad_norm": 0.48962703347206116, + "learning_rate": 0.0005854621848739496, + "loss": 0.5299, + "step": 14910 + }, + { + "epoch": 8.330167597765364, + "grad_norm": 0.7770978212356567, + "learning_rate": 0.0005854341736694678, + "loss": 0.4347, + "step": 14911 + }, + { + "epoch": 8.33072625698324, + "grad_norm": 0.47875073552131653, + "learning_rate": 0.000585406162464986, + "loss": 0.43, + "step": 14912 + }, + { + "epoch": 8.331284916201117, + "grad_norm": 0.4565463066101074, + "learning_rate": 0.0005853781512605042, + "loss": 0.5082, + "step": 14913 + }, + { + "epoch": 8.331843575418995, + "grad_norm": 0.39837974309921265, + "learning_rate": 0.0005853501400560223, + "loss": 0.4339, + "step": 14914 + }, + { + "epoch": 8.332402234636872, + "grad_norm": 0.7079082727432251, + "learning_rate": 0.0005853221288515406, + "loss": 0.6329, + "step": 14915 + }, + { + "epoch": 8.332960893854748, + "grad_norm": 0.5177701115608215, + "learning_rate": 0.0005852941176470589, + "loss": 0.4776, + "step": 14916 + }, + { + "epoch": 8.333519553072625, + "grad_norm": 0.5062240958213806, + "learning_rate": 0.0005852661064425771, + "loss": 0.475, + "step": 14917 + }, + { + "epoch": 8.334078212290503, + "grad_norm": 0.4818429946899414, + "learning_rate": 0.0005852380952380953, + "loss": 0.362, + "step": 14918 + }, + { + "epoch": 8.33463687150838, + "grad_norm": 0.6559048295021057, + "learning_rate": 0.0005852100840336135, + "loss": 0.5981, + "step": 14919 + }, + { + "epoch": 8.335195530726256, + "grad_norm": 0.4394361674785614, + "learning_rate": 0.0005851820728291317, + "loss": 0.4462, + "step": 14920 + }, + { + "epoch": 8.335754189944135, + "grad_norm": 0.5834078192710876, + "learning_rate": 0.0005851540616246499, + "loss": 0.466, + "step": 14921 + }, + { + "epoch": 8.336312849162011, + "grad_norm": 1.0570108890533447, + "learning_rate": 0.0005851260504201681, + "loss": 0.4706, + "step": 14922 + }, + { + "epoch": 8.336871508379888, + "grad_norm": 0.38948193192481995, + "learning_rate": 0.0005850980392156863, + "loss": 0.3212, + "step": 14923 + }, + { + "epoch": 8.337430167597766, + "grad_norm": 0.44940298795700073, + "learning_rate": 0.0005850700280112045, + "loss": 0.4889, + "step": 14924 + }, + { + "epoch": 8.337988826815643, + "grad_norm": 0.4493415057659149, + "learning_rate": 0.0005850420168067227, + "loss": 0.567, + "step": 14925 + }, + { + "epoch": 8.33854748603352, + "grad_norm": 0.39843258261680603, + "learning_rate": 0.0005850140056022409, + "loss": 0.3646, + "step": 14926 + }, + { + "epoch": 8.339106145251396, + "grad_norm": 1.4064139127731323, + "learning_rate": 0.0005849859943977591, + "loss": 0.4846, + "step": 14927 + }, + { + "epoch": 8.339664804469274, + "grad_norm": 0.6744639873504639, + "learning_rate": 0.0005849579831932773, + "loss": 0.5923, + "step": 14928 + }, + { + "epoch": 8.34022346368715, + "grad_norm": 0.4554484188556671, + "learning_rate": 0.0005849299719887955, + "loss": 0.3656, + "step": 14929 + }, + { + "epoch": 8.340782122905027, + "grad_norm": 0.4262297749519348, + "learning_rate": 0.0005849019607843137, + "loss": 0.419, + "step": 14930 + }, + { + "epoch": 8.341340782122906, + "grad_norm": 0.6004900932312012, + "learning_rate": 0.0005848739495798319, + "loss": 0.511, + "step": 14931 + }, + { + "epoch": 8.341899441340782, + "grad_norm": 0.46860986948013306, + "learning_rate": 0.0005848459383753501, + "loss": 0.4257, + "step": 14932 + }, + { + "epoch": 8.342458100558659, + "grad_norm": 0.4969500005245209, + "learning_rate": 0.0005848179271708684, + "loss": 0.3263, + "step": 14933 + }, + { + "epoch": 8.343016759776535, + "grad_norm": 0.6126229763031006, + "learning_rate": 0.0005847899159663866, + "loss": 0.5477, + "step": 14934 + }, + { + "epoch": 8.343575418994414, + "grad_norm": 0.5851977467536926, + "learning_rate": 0.0005847619047619049, + "loss": 0.507, + "step": 14935 + }, + { + "epoch": 8.34413407821229, + "grad_norm": 0.6072599291801453, + "learning_rate": 0.000584733893557423, + "loss": 0.3371, + "step": 14936 + }, + { + "epoch": 8.344692737430167, + "grad_norm": 0.4667683243751526, + "learning_rate": 0.0005847058823529412, + "loss": 0.4495, + "step": 14937 + }, + { + "epoch": 8.345251396648045, + "grad_norm": 0.7304673790931702, + "learning_rate": 0.0005846778711484594, + "loss": 0.5209, + "step": 14938 + }, + { + "epoch": 8.345810055865922, + "grad_norm": 1.4601686000823975, + "learning_rate": 0.0005846498599439776, + "loss": 0.4581, + "step": 14939 + }, + { + "epoch": 8.346368715083798, + "grad_norm": 0.5530440807342529, + "learning_rate": 0.0005846218487394959, + "loss": 0.4586, + "step": 14940 + }, + { + "epoch": 8.346927374301677, + "grad_norm": 0.4376341998577118, + "learning_rate": 0.000584593837535014, + "loss": 0.4564, + "step": 14941 + }, + { + "epoch": 8.347486033519553, + "grad_norm": 0.5981418490409851, + "learning_rate": 0.0005845658263305322, + "loss": 0.3566, + "step": 14942 + }, + { + "epoch": 8.34804469273743, + "grad_norm": 0.578819751739502, + "learning_rate": 0.0005845378151260504, + "loss": 0.4327, + "step": 14943 + }, + { + "epoch": 8.348603351955306, + "grad_norm": 0.6129190921783447, + "learning_rate": 0.0005845098039215686, + "loss": 0.5493, + "step": 14944 + }, + { + "epoch": 8.349162011173185, + "grad_norm": 0.4983977675437927, + "learning_rate": 0.0005844817927170869, + "loss": 0.5175, + "step": 14945 + }, + { + "epoch": 8.349720670391061, + "grad_norm": 0.5523300766944885, + "learning_rate": 0.000584453781512605, + "loss": 0.4596, + "step": 14946 + }, + { + "epoch": 8.350279329608938, + "grad_norm": 0.5708497762680054, + "learning_rate": 0.0005844257703081232, + "loss": 0.5217, + "step": 14947 + }, + { + "epoch": 8.350837988826816, + "grad_norm": 12.819463729858398, + "learning_rate": 0.0005843977591036414, + "loss": 0.5491, + "step": 14948 + }, + { + "epoch": 8.351396648044693, + "grad_norm": 0.46666884422302246, + "learning_rate": 0.0005843697478991597, + "loss": 0.454, + "step": 14949 + }, + { + "epoch": 8.35195530726257, + "grad_norm": 0.6145971417427063, + "learning_rate": 0.0005843417366946779, + "loss": 0.3775, + "step": 14950 + }, + { + "epoch": 8.352513966480448, + "grad_norm": 0.384982168674469, + "learning_rate": 0.0005843137254901962, + "loss": 0.3397, + "step": 14951 + }, + { + "epoch": 8.353072625698324, + "grad_norm": 1.0820281505584717, + "learning_rate": 0.0005842857142857143, + "loss": 0.4324, + "step": 14952 + }, + { + "epoch": 8.3536312849162, + "grad_norm": 0.5820189714431763, + "learning_rate": 0.0005842577030812325, + "loss": 0.4352, + "step": 14953 + }, + { + "epoch": 8.354189944134077, + "grad_norm": 1.0354069471359253, + "learning_rate": 0.0005842296918767507, + "loss": 0.6378, + "step": 14954 + }, + { + "epoch": 8.354748603351956, + "grad_norm": 0.48619651794433594, + "learning_rate": 0.0005842016806722689, + "loss": 0.4484, + "step": 14955 + }, + { + "epoch": 8.355307262569832, + "grad_norm": 0.6371716856956482, + "learning_rate": 0.0005841736694677872, + "loss": 0.6083, + "step": 14956 + }, + { + "epoch": 8.355865921787709, + "grad_norm": 0.9120499491691589, + "learning_rate": 0.0005841456582633053, + "loss": 0.4088, + "step": 14957 + }, + { + "epoch": 8.356424581005587, + "grad_norm": 0.4198913276195526, + "learning_rate": 0.0005841176470588235, + "loss": 0.5411, + "step": 14958 + }, + { + "epoch": 8.356983240223464, + "grad_norm": 0.40404555201530457, + "learning_rate": 0.0005840896358543417, + "loss": 0.3832, + "step": 14959 + }, + { + "epoch": 8.35754189944134, + "grad_norm": 0.8810367584228516, + "learning_rate": 0.0005840616246498599, + "loss": 0.6768, + "step": 14960 + }, + { + "epoch": 8.358100558659217, + "grad_norm": 0.4589516222476959, + "learning_rate": 0.0005840336134453782, + "loss": 0.4203, + "step": 14961 + }, + { + "epoch": 8.358659217877095, + "grad_norm": 0.6582610607147217, + "learning_rate": 0.0005840056022408963, + "loss": 0.4572, + "step": 14962 + }, + { + "epoch": 8.359217877094972, + "grad_norm": 0.5612242221832275, + "learning_rate": 0.0005839775910364145, + "loss": 0.5905, + "step": 14963 + }, + { + "epoch": 8.359776536312848, + "grad_norm": 0.36198267340660095, + "learning_rate": 0.0005839495798319327, + "loss": 0.3823, + "step": 14964 + }, + { + "epoch": 8.360335195530727, + "grad_norm": 0.4489588141441345, + "learning_rate": 0.000583921568627451, + "loss": 0.4179, + "step": 14965 + }, + { + "epoch": 8.360893854748603, + "grad_norm": 0.4734388291835785, + "learning_rate": 0.0005838935574229693, + "loss": 0.438, + "step": 14966 + }, + { + "epoch": 8.36145251396648, + "grad_norm": 0.5238151550292969, + "learning_rate": 0.0005838655462184875, + "loss": 0.53, + "step": 14967 + }, + { + "epoch": 8.362011173184358, + "grad_norm": 0.7804664373397827, + "learning_rate": 0.0005838375350140056, + "loss": 0.4179, + "step": 14968 + }, + { + "epoch": 8.362569832402235, + "grad_norm": 0.4341186285018921, + "learning_rate": 0.0005838095238095238, + "loss": 0.4099, + "step": 14969 + }, + { + "epoch": 8.363128491620111, + "grad_norm": 0.7319085001945496, + "learning_rate": 0.000583781512605042, + "loss": 0.4674, + "step": 14970 + }, + { + "epoch": 8.363687150837988, + "grad_norm": 0.8095124959945679, + "learning_rate": 0.0005837535014005603, + "loss": 0.6271, + "step": 14971 + }, + { + "epoch": 8.364245810055866, + "grad_norm": 2.2014153003692627, + "learning_rate": 0.0005837254901960785, + "loss": 0.3597, + "step": 14972 + }, + { + "epoch": 8.364804469273743, + "grad_norm": 0.5027381777763367, + "learning_rate": 0.0005836974789915966, + "loss": 0.3901, + "step": 14973 + }, + { + "epoch": 8.36536312849162, + "grad_norm": 0.4144800305366516, + "learning_rate": 0.0005836694677871148, + "loss": 0.4262, + "step": 14974 + }, + { + "epoch": 8.365921787709498, + "grad_norm": 0.8771882057189941, + "learning_rate": 0.000583641456582633, + "loss": 0.3776, + "step": 14975 + }, + { + "epoch": 8.366480446927374, + "grad_norm": 0.37773919105529785, + "learning_rate": 0.0005836134453781513, + "loss": 0.4941, + "step": 14976 + }, + { + "epoch": 8.367039106145251, + "grad_norm": 0.6087919473648071, + "learning_rate": 0.0005835854341736695, + "loss": 0.4216, + "step": 14977 + }, + { + "epoch": 8.36759776536313, + "grad_norm": 0.591992199420929, + "learning_rate": 0.0005835574229691876, + "loss": 0.4708, + "step": 14978 + }, + { + "epoch": 8.368156424581006, + "grad_norm": 0.5549662113189697, + "learning_rate": 0.0005835294117647058, + "loss": 0.4493, + "step": 14979 + }, + { + "epoch": 8.368715083798882, + "grad_norm": 0.43173640966415405, + "learning_rate": 0.000583501400560224, + "loss": 0.4514, + "step": 14980 + }, + { + "epoch": 8.369273743016759, + "grad_norm": 1.4267487525939941, + "learning_rate": 0.0005834733893557424, + "loss": 0.3851, + "step": 14981 + }, + { + "epoch": 8.369832402234637, + "grad_norm": 0.4697130024433136, + "learning_rate": 0.0005834453781512606, + "loss": 0.4601, + "step": 14982 + }, + { + "epoch": 8.370391061452514, + "grad_norm": 0.5327762961387634, + "learning_rate": 0.0005834173669467788, + "loss": 0.4097, + "step": 14983 + }, + { + "epoch": 8.37094972067039, + "grad_norm": 0.656789243221283, + "learning_rate": 0.0005833893557422969, + "loss": 0.4823, + "step": 14984 + }, + { + "epoch": 8.371508379888269, + "grad_norm": 4.2425994873046875, + "learning_rate": 0.0005833613445378151, + "loss": 0.4776, + "step": 14985 + }, + { + "epoch": 8.372067039106145, + "grad_norm": 0.42395198345184326, + "learning_rate": 0.0005833333333333334, + "loss": 0.3747, + "step": 14986 + }, + { + "epoch": 8.372625698324022, + "grad_norm": 0.4683852791786194, + "learning_rate": 0.0005833053221288516, + "loss": 0.5181, + "step": 14987 + }, + { + "epoch": 8.3731843575419, + "grad_norm": 0.5276800990104675, + "learning_rate": 0.0005832773109243698, + "loss": 0.4857, + "step": 14988 + }, + { + "epoch": 8.373743016759777, + "grad_norm": 0.6185788512229919, + "learning_rate": 0.0005832492997198879, + "loss": 0.4546, + "step": 14989 + }, + { + "epoch": 8.374301675977653, + "grad_norm": 0.6540019512176514, + "learning_rate": 0.0005832212885154061, + "loss": 0.5035, + "step": 14990 + }, + { + "epoch": 8.37486033519553, + "grad_norm": 0.4345349371433258, + "learning_rate": 0.0005831932773109244, + "loss": 0.4356, + "step": 14991 + }, + { + "epoch": 8.375418994413408, + "grad_norm": 0.7053734660148621, + "learning_rate": 0.0005831652661064426, + "loss": 0.5728, + "step": 14992 + }, + { + "epoch": 8.375977653631285, + "grad_norm": 0.4583536684513092, + "learning_rate": 0.0005831372549019608, + "loss": 0.4444, + "step": 14993 + }, + { + "epoch": 8.376536312849161, + "grad_norm": 0.5855910181999207, + "learning_rate": 0.0005831092436974789, + "loss": 0.2691, + "step": 14994 + }, + { + "epoch": 8.37709497206704, + "grad_norm": 0.5928325057029724, + "learning_rate": 0.0005830812324929971, + "loss": 0.4249, + "step": 14995 + }, + { + "epoch": 8.377653631284916, + "grad_norm": 0.5972592830657959, + "learning_rate": 0.0005830532212885154, + "loss": 0.3577, + "step": 14996 + }, + { + "epoch": 8.378212290502793, + "grad_norm": 0.5805910229682922, + "learning_rate": 0.0005830252100840336, + "loss": 0.4642, + "step": 14997 + }, + { + "epoch": 8.378770949720671, + "grad_norm": 0.5550611615180969, + "learning_rate": 0.0005829971988795519, + "loss": 0.4218, + "step": 14998 + }, + { + "epoch": 8.379329608938548, + "grad_norm": 0.6600235104560852, + "learning_rate": 0.0005829691876750701, + "loss": 0.4222, + "step": 14999 + }, + { + "epoch": 8.379888268156424, + "grad_norm": 1.6635565757751465, + "learning_rate": 0.0005829411764705882, + "loss": 0.4987, + "step": 15000 + }, + { + "epoch": 8.379888268156424, + "eval_cer": 0.09018253630541281, + "eval_loss": 0.33840733766555786, + "eval_runtime": 55.6651, + "eval_samples_per_second": 81.523, + "eval_steps_per_second": 5.102, + "eval_wer": 0.35859333538169325, + "step": 15000 + }, + { + "epoch": 8.380446927374301, + "grad_norm": 0.4069099426269531, + "learning_rate": 0.0005829131652661065, + "loss": 0.3955, + "step": 15001 + }, + { + "epoch": 8.38100558659218, + "grad_norm": 2.484233856201172, + "learning_rate": 0.0005828851540616247, + "loss": 0.455, + "step": 15002 + }, + { + "epoch": 8.381564245810056, + "grad_norm": 0.8413163423538208, + "learning_rate": 0.0005828571428571429, + "loss": 0.5025, + "step": 15003 + }, + { + "epoch": 8.382122905027932, + "grad_norm": 0.6786373257637024, + "learning_rate": 0.0005828291316526611, + "loss": 0.4657, + "step": 15004 + }, + { + "epoch": 8.38268156424581, + "grad_norm": 7.700829982757568, + "learning_rate": 0.0005828011204481792, + "loss": 0.4499, + "step": 15005 + }, + { + "epoch": 8.383240223463687, + "grad_norm": 0.4800693988800049, + "learning_rate": 0.0005827731092436975, + "loss": 0.4174, + "step": 15006 + }, + { + "epoch": 8.383798882681564, + "grad_norm": 0.4424120783805847, + "learning_rate": 0.0005827450980392157, + "loss": 0.5419, + "step": 15007 + }, + { + "epoch": 8.38435754189944, + "grad_norm": 0.6140261292457581, + "learning_rate": 0.0005827170868347339, + "loss": 0.6571, + "step": 15008 + }, + { + "epoch": 8.384916201117319, + "grad_norm": 0.6278320550918579, + "learning_rate": 0.0005826890756302521, + "loss": 0.4594, + "step": 15009 + }, + { + "epoch": 8.385474860335195, + "grad_norm": 1.014625906944275, + "learning_rate": 0.0005826610644257702, + "loss": 0.4305, + "step": 15010 + }, + { + "epoch": 8.386033519553072, + "grad_norm": 0.6238791346549988, + "learning_rate": 0.0005826330532212885, + "loss": 0.5134, + "step": 15011 + }, + { + "epoch": 8.38659217877095, + "grad_norm": 0.9392037391662598, + "learning_rate": 0.0005826050420168067, + "loss": 0.5098, + "step": 15012 + }, + { + "epoch": 8.387150837988827, + "grad_norm": 0.734483003616333, + "learning_rate": 0.0005825770308123249, + "loss": 0.4961, + "step": 15013 + }, + { + "epoch": 8.387709497206703, + "grad_norm": 0.3761874735355377, + "learning_rate": 0.0005825490196078431, + "loss": 0.3452, + "step": 15014 + }, + { + "epoch": 8.388268156424582, + "grad_norm": 0.5336946249008179, + "learning_rate": 0.0005825210084033614, + "loss": 0.4099, + "step": 15015 + }, + { + "epoch": 8.388826815642458, + "grad_norm": 0.9680999517440796, + "learning_rate": 0.0005824929971988796, + "loss": 0.5749, + "step": 15016 + }, + { + "epoch": 8.389385474860335, + "grad_norm": 0.6268932819366455, + "learning_rate": 0.0005824649859943978, + "loss": 0.4869, + "step": 15017 + }, + { + "epoch": 8.389944134078211, + "grad_norm": 0.48394903540611267, + "learning_rate": 0.000582436974789916, + "loss": 0.4142, + "step": 15018 + }, + { + "epoch": 8.39050279329609, + "grad_norm": 0.3707742691040039, + "learning_rate": 0.0005824089635854342, + "loss": 0.427, + "step": 15019 + }, + { + "epoch": 8.391061452513966, + "grad_norm": 0.6092075109481812, + "learning_rate": 0.0005823809523809524, + "loss": 0.3906, + "step": 15020 + }, + { + "epoch": 8.391620111731843, + "grad_norm": 0.4559633433818817, + "learning_rate": 0.0005823529411764706, + "loss": 0.5168, + "step": 15021 + }, + { + "epoch": 8.392178770949721, + "grad_norm": 0.36660459637641907, + "learning_rate": 0.0005823249299719888, + "loss": 0.399, + "step": 15022 + }, + { + "epoch": 8.392737430167598, + "grad_norm": 1.7564077377319336, + "learning_rate": 0.000582296918767507, + "loss": 0.3607, + "step": 15023 + }, + { + "epoch": 8.393296089385474, + "grad_norm": 0.6242188811302185, + "learning_rate": 0.0005822689075630252, + "loss": 0.3524, + "step": 15024 + }, + { + "epoch": 8.393854748603353, + "grad_norm": 2.5956006050109863, + "learning_rate": 0.0005822408963585434, + "loss": 0.4181, + "step": 15025 + }, + { + "epoch": 8.39441340782123, + "grad_norm": 0.3992879390716553, + "learning_rate": 0.0005822128851540617, + "loss": 0.4334, + "step": 15026 + }, + { + "epoch": 8.394972067039106, + "grad_norm": 0.5396314263343811, + "learning_rate": 0.0005821848739495798, + "loss": 0.4547, + "step": 15027 + }, + { + "epoch": 8.395530726256982, + "grad_norm": 0.49497008323669434, + "learning_rate": 0.000582156862745098, + "loss": 0.4041, + "step": 15028 + }, + { + "epoch": 8.39608938547486, + "grad_norm": 0.45197969675064087, + "learning_rate": 0.0005821288515406162, + "loss": 0.4061, + "step": 15029 + }, + { + "epoch": 8.396648044692737, + "grad_norm": 2.5198616981506348, + "learning_rate": 0.0005821008403361344, + "loss": 0.4119, + "step": 15030 + }, + { + "epoch": 8.397206703910614, + "grad_norm": 0.5039351582527161, + "learning_rate": 0.0005820728291316528, + "loss": 0.3602, + "step": 15031 + }, + { + "epoch": 8.397765363128492, + "grad_norm": 7.539263725280762, + "learning_rate": 0.0005820448179271709, + "loss": 0.4095, + "step": 15032 + }, + { + "epoch": 8.398324022346369, + "grad_norm": 0.7699964642524719, + "learning_rate": 0.0005820168067226891, + "loss": 0.5351, + "step": 15033 + }, + { + "epoch": 8.398882681564245, + "grad_norm": 1.0932117700576782, + "learning_rate": 0.0005819887955182073, + "loss": 0.5278, + "step": 15034 + }, + { + "epoch": 8.399441340782122, + "grad_norm": 0.6231299638748169, + "learning_rate": 0.0005819607843137255, + "loss": 0.413, + "step": 15035 + }, + { + "epoch": 8.4, + "grad_norm": 0.5333104729652405, + "learning_rate": 0.0005819327731092438, + "loss": 0.4203, + "step": 15036 + }, + { + "epoch": 8.400558659217877, + "grad_norm": 0.9620609283447266, + "learning_rate": 0.0005819047619047619, + "loss": 0.4379, + "step": 15037 + }, + { + "epoch": 8.401117318435753, + "grad_norm": 0.5910012125968933, + "learning_rate": 0.0005818767507002801, + "loss": 0.3737, + "step": 15038 + }, + { + "epoch": 8.401675977653632, + "grad_norm": 0.8014978766441345, + "learning_rate": 0.0005818487394957983, + "loss": 0.3765, + "step": 15039 + }, + { + "epoch": 8.402234636871508, + "grad_norm": 0.5707065463066101, + "learning_rate": 0.0005818207282913165, + "loss": 0.4146, + "step": 15040 + }, + { + "epoch": 8.402793296089385, + "grad_norm": 0.6289256811141968, + "learning_rate": 0.0005817927170868348, + "loss": 0.4353, + "step": 15041 + }, + { + "epoch": 8.403351955307263, + "grad_norm": 0.8036611080169678, + "learning_rate": 0.000581764705882353, + "loss": 0.577, + "step": 15042 + }, + { + "epoch": 8.40391061452514, + "grad_norm": 0.4298895001411438, + "learning_rate": 0.0005817366946778711, + "loss": 0.3891, + "step": 15043 + }, + { + "epoch": 8.404469273743016, + "grad_norm": 0.4730525314807892, + "learning_rate": 0.0005817086834733893, + "loss": 0.4311, + "step": 15044 + }, + { + "epoch": 8.405027932960893, + "grad_norm": 0.5966941714286804, + "learning_rate": 0.0005816806722689075, + "loss": 0.3262, + "step": 15045 + }, + { + "epoch": 8.405586592178771, + "grad_norm": 0.5134749412536621, + "learning_rate": 0.0005816526610644258, + "loss": 0.4683, + "step": 15046 + }, + { + "epoch": 8.406145251396648, + "grad_norm": 0.4108908176422119, + "learning_rate": 0.0005816246498599441, + "loss": 0.3255, + "step": 15047 + }, + { + "epoch": 8.406703910614524, + "grad_norm": 0.4103609621524811, + "learning_rate": 0.0005815966386554622, + "loss": 0.4047, + "step": 15048 + }, + { + "epoch": 8.407262569832403, + "grad_norm": 0.5080446600914001, + "learning_rate": 0.0005815686274509804, + "loss": 0.4178, + "step": 15049 + }, + { + "epoch": 8.40782122905028, + "grad_norm": 0.6712818145751953, + "learning_rate": 0.0005815406162464986, + "loss": 0.4931, + "step": 15050 + }, + { + "epoch": 8.408379888268156, + "grad_norm": 0.6717536449432373, + "learning_rate": 0.0005815126050420169, + "loss": 0.5419, + "step": 15051 + }, + { + "epoch": 8.408938547486034, + "grad_norm": 0.7146703600883484, + "learning_rate": 0.0005814845938375351, + "loss": 0.4064, + "step": 15052 + }, + { + "epoch": 8.40949720670391, + "grad_norm": 7.56820821762085, + "learning_rate": 0.0005814565826330532, + "loss": 0.5398, + "step": 15053 + }, + { + "epoch": 8.410055865921787, + "grad_norm": 0.44317319989204407, + "learning_rate": 0.0005814285714285714, + "loss": 0.3379, + "step": 15054 + }, + { + "epoch": 8.410614525139664, + "grad_norm": 0.5120975375175476, + "learning_rate": 0.0005814005602240896, + "loss": 0.4168, + "step": 15055 + }, + { + "epoch": 8.411173184357542, + "grad_norm": 1.1907446384429932, + "learning_rate": 0.0005813725490196079, + "loss": 0.3392, + "step": 15056 + }, + { + "epoch": 8.411731843575419, + "grad_norm": 0.5381262898445129, + "learning_rate": 0.0005813445378151261, + "loss": 0.3176, + "step": 15057 + }, + { + "epoch": 8.412290502793295, + "grad_norm": 0.43007978796958923, + "learning_rate": 0.0005813165266106443, + "loss": 0.4151, + "step": 15058 + }, + { + "epoch": 8.412849162011174, + "grad_norm": 0.505859911441803, + "learning_rate": 0.0005812885154061624, + "loss": 0.5643, + "step": 15059 + }, + { + "epoch": 8.41340782122905, + "grad_norm": 0.6560646891593933, + "learning_rate": 0.0005812605042016806, + "loss": 0.4442, + "step": 15060 + }, + { + "epoch": 8.413966480446927, + "grad_norm": 0.6167864203453064, + "learning_rate": 0.0005812324929971989, + "loss": 0.3798, + "step": 15061 + }, + { + "epoch": 8.414525139664805, + "grad_norm": 0.689209520816803, + "learning_rate": 0.0005812044817927171, + "loss": 0.4153, + "step": 15062 + }, + { + "epoch": 8.415083798882682, + "grad_norm": 0.8740217685699463, + "learning_rate": 0.0005811764705882354, + "loss": 0.4053, + "step": 15063 + }, + { + "epoch": 8.415642458100558, + "grad_norm": 0.5117918848991394, + "learning_rate": 0.0005811484593837534, + "loss": 0.4156, + "step": 15064 + }, + { + "epoch": 8.416201117318435, + "grad_norm": 0.4569648504257202, + "learning_rate": 0.0005811204481792717, + "loss": 0.4219, + "step": 15065 + }, + { + "epoch": 8.416759776536313, + "grad_norm": 2.984717607498169, + "learning_rate": 0.00058109243697479, + "loss": 0.3834, + "step": 15066 + }, + { + "epoch": 8.41731843575419, + "grad_norm": 0.4624974727630615, + "learning_rate": 0.0005810644257703082, + "loss": 0.3727, + "step": 15067 + }, + { + "epoch": 8.417877094972066, + "grad_norm": 2.9909536838531494, + "learning_rate": 0.0005810364145658264, + "loss": 0.5461, + "step": 15068 + }, + { + "epoch": 8.418435754189945, + "grad_norm": 0.44195353984832764, + "learning_rate": 0.0005810084033613445, + "loss": 0.363, + "step": 15069 + }, + { + "epoch": 8.418994413407821, + "grad_norm": 0.5280654430389404, + "learning_rate": 0.0005809803921568627, + "loss": 0.4627, + "step": 15070 + }, + { + "epoch": 8.419553072625698, + "grad_norm": 0.7593404650688171, + "learning_rate": 0.000580952380952381, + "loss": 0.4568, + "step": 15071 + }, + { + "epoch": 8.420111731843576, + "grad_norm": 0.4967724680900574, + "learning_rate": 0.0005809243697478992, + "loss": 0.5067, + "step": 15072 + }, + { + "epoch": 8.420670391061453, + "grad_norm": 0.41193273663520813, + "learning_rate": 0.0005808963585434174, + "loss": 0.4349, + "step": 15073 + }, + { + "epoch": 8.42122905027933, + "grad_norm": 0.5806664228439331, + "learning_rate": 0.0005808683473389356, + "loss": 0.4153, + "step": 15074 + }, + { + "epoch": 8.421787709497206, + "grad_norm": 0.43386971950531006, + "learning_rate": 0.0005808403361344537, + "loss": 0.4706, + "step": 15075 + }, + { + "epoch": 8.422346368715084, + "grad_norm": 0.6706230044364929, + "learning_rate": 0.000580812324929972, + "loss": 0.5027, + "step": 15076 + }, + { + "epoch": 8.422905027932961, + "grad_norm": 0.5473315715789795, + "learning_rate": 0.0005807843137254902, + "loss": 0.6938, + "step": 15077 + }, + { + "epoch": 8.423463687150837, + "grad_norm": 2.0396227836608887, + "learning_rate": 0.0005807563025210084, + "loss": 0.4742, + "step": 15078 + }, + { + "epoch": 8.424022346368716, + "grad_norm": 0.41608479619026184, + "learning_rate": 0.0005807282913165266, + "loss": 0.3652, + "step": 15079 + }, + { + "epoch": 8.424581005586592, + "grad_norm": 0.5501396059989929, + "learning_rate": 0.0005807002801120447, + "loss": 0.4825, + "step": 15080 + }, + { + "epoch": 8.425139664804469, + "grad_norm": 0.4052152931690216, + "learning_rate": 0.0005806722689075631, + "loss": 0.4553, + "step": 15081 + }, + { + "epoch": 8.425698324022346, + "grad_norm": 0.360068678855896, + "learning_rate": 0.0005806442577030813, + "loss": 0.4187, + "step": 15082 + }, + { + "epoch": 8.426256983240224, + "grad_norm": 0.3935549855232239, + "learning_rate": 0.0005806162464985995, + "loss": 0.4486, + "step": 15083 + }, + { + "epoch": 8.4268156424581, + "grad_norm": 0.6176988482475281, + "learning_rate": 0.0005805882352941177, + "loss": 0.4482, + "step": 15084 + }, + { + "epoch": 8.427374301675977, + "grad_norm": 4.438694953918457, + "learning_rate": 0.0005805602240896358, + "loss": 0.3855, + "step": 15085 + }, + { + "epoch": 8.427932960893855, + "grad_norm": 0.591834545135498, + "learning_rate": 0.0005805322128851541, + "loss": 0.4643, + "step": 15086 + }, + { + "epoch": 8.428491620111732, + "grad_norm": 0.48653504252433777, + "learning_rate": 0.0005805042016806723, + "loss": 0.3746, + "step": 15087 + }, + { + "epoch": 8.429050279329608, + "grad_norm": 0.43574297428131104, + "learning_rate": 0.0005804761904761905, + "loss": 0.4731, + "step": 15088 + }, + { + "epoch": 8.429608938547487, + "grad_norm": 0.9128918051719666, + "learning_rate": 0.0005804481792717087, + "loss": 0.4218, + "step": 15089 + }, + { + "epoch": 8.430167597765363, + "grad_norm": 0.4452519118785858, + "learning_rate": 0.0005804201680672269, + "loss": 0.4395, + "step": 15090 + }, + { + "epoch": 8.43072625698324, + "grad_norm": 0.5181838870048523, + "learning_rate": 0.0005803921568627451, + "loss": 0.3677, + "step": 15091 + }, + { + "epoch": 8.431284916201117, + "grad_norm": 0.4824831783771515, + "learning_rate": 0.0005803641456582633, + "loss": 0.4234, + "step": 15092 + }, + { + "epoch": 8.431843575418995, + "grad_norm": 0.5785755515098572, + "learning_rate": 0.0005803361344537815, + "loss": 0.6059, + "step": 15093 + }, + { + "epoch": 8.432402234636871, + "grad_norm": 0.42026451230049133, + "learning_rate": 0.0005803081232492997, + "loss": 0.4067, + "step": 15094 + }, + { + "epoch": 8.432960893854748, + "grad_norm": 0.5909212827682495, + "learning_rate": 0.0005802801120448179, + "loss": 0.4481, + "step": 15095 + }, + { + "epoch": 8.433519553072626, + "grad_norm": 0.44709333777427673, + "learning_rate": 0.0005802521008403361, + "loss": 0.5091, + "step": 15096 + }, + { + "epoch": 8.434078212290503, + "grad_norm": 2.2895781993865967, + "learning_rate": 0.0005802240896358544, + "loss": 0.4165, + "step": 15097 + }, + { + "epoch": 8.43463687150838, + "grad_norm": 1.7314233779907227, + "learning_rate": 0.0005801960784313726, + "loss": 0.3606, + "step": 15098 + }, + { + "epoch": 8.435195530726258, + "grad_norm": 0.47517508268356323, + "learning_rate": 0.0005801680672268908, + "loss": 0.4047, + "step": 15099 + }, + { + "epoch": 8.435754189944134, + "grad_norm": 0.511033296585083, + "learning_rate": 0.000580140056022409, + "loss": 0.4535, + "step": 15100 + }, + { + "epoch": 8.436312849162011, + "grad_norm": 0.5666338205337524, + "learning_rate": 0.0005801120448179272, + "loss": 0.4598, + "step": 15101 + }, + { + "epoch": 8.436871508379888, + "grad_norm": 1.1614675521850586, + "learning_rate": 0.0005800840336134454, + "loss": 0.4548, + "step": 15102 + }, + { + "epoch": 8.437430167597766, + "grad_norm": 0.4814290404319763, + "learning_rate": 0.0005800560224089636, + "loss": 0.4376, + "step": 15103 + }, + { + "epoch": 8.437988826815642, + "grad_norm": 0.8443406224250793, + "learning_rate": 0.0005800280112044818, + "loss": 0.3793, + "step": 15104 + }, + { + "epoch": 8.438547486033519, + "grad_norm": 0.709213376045227, + "learning_rate": 0.00058, + "loss": 0.3492, + "step": 15105 + }, + { + "epoch": 8.439106145251397, + "grad_norm": 0.6441038846969604, + "learning_rate": 0.0005799719887955183, + "loss": 0.4306, + "step": 15106 + }, + { + "epoch": 8.439664804469274, + "grad_norm": 0.6008891463279724, + "learning_rate": 0.0005799439775910364, + "loss": 0.4937, + "step": 15107 + }, + { + "epoch": 8.44022346368715, + "grad_norm": 0.5976288914680481, + "learning_rate": 0.0005799159663865546, + "loss": 0.4254, + "step": 15108 + }, + { + "epoch": 8.440782122905027, + "grad_norm": 1.7645634412765503, + "learning_rate": 0.0005798879551820728, + "loss": 0.4629, + "step": 15109 + }, + { + "epoch": 8.441340782122905, + "grad_norm": 0.43416234850883484, + "learning_rate": 0.000579859943977591, + "loss": 0.4221, + "step": 15110 + }, + { + "epoch": 8.441899441340782, + "grad_norm": 0.46453604102134705, + "learning_rate": 0.0005798319327731093, + "loss": 0.3954, + "step": 15111 + }, + { + "epoch": 8.442458100558659, + "grad_norm": 0.5179375410079956, + "learning_rate": 0.0005798039215686274, + "loss": 0.4671, + "step": 15112 + }, + { + "epoch": 8.443016759776537, + "grad_norm": 0.6036710739135742, + "learning_rate": 0.0005797759103641457, + "loss": 0.5123, + "step": 15113 + }, + { + "epoch": 8.443575418994413, + "grad_norm": 0.3649946451187134, + "learning_rate": 0.0005797478991596639, + "loss": 0.4524, + "step": 15114 + }, + { + "epoch": 8.44413407821229, + "grad_norm": 0.4086681008338928, + "learning_rate": 0.0005797198879551821, + "loss": 0.403, + "step": 15115 + }, + { + "epoch": 8.444692737430168, + "grad_norm": 0.4461531341075897, + "learning_rate": 0.0005796918767507004, + "loss": 0.5342, + "step": 15116 + }, + { + "epoch": 8.445251396648045, + "grad_norm": 0.4921421706676483, + "learning_rate": 0.0005796638655462185, + "loss": 0.4663, + "step": 15117 + }, + { + "epoch": 8.445810055865921, + "grad_norm": 1.8387770652770996, + "learning_rate": 0.0005796358543417367, + "loss": 0.5494, + "step": 15118 + }, + { + "epoch": 8.446368715083798, + "grad_norm": 0.4927188456058502, + "learning_rate": 0.0005796078431372549, + "loss": 0.5956, + "step": 15119 + }, + { + "epoch": 8.446927374301676, + "grad_norm": 0.5424768328666687, + "learning_rate": 0.0005795798319327731, + "loss": 0.4196, + "step": 15120 + }, + { + "epoch": 8.447486033519553, + "grad_norm": 0.9578824639320374, + "learning_rate": 0.0005795518207282914, + "loss": 0.4401, + "step": 15121 + }, + { + "epoch": 8.44804469273743, + "grad_norm": 0.6570304036140442, + "learning_rate": 0.0005795238095238096, + "loss": 0.4994, + "step": 15122 + }, + { + "epoch": 8.448603351955308, + "grad_norm": 0.5701783895492554, + "learning_rate": 0.0005794957983193277, + "loss": 0.3914, + "step": 15123 + }, + { + "epoch": 8.449162011173184, + "grad_norm": 0.4354120194911957, + "learning_rate": 0.0005794677871148459, + "loss": 0.4802, + "step": 15124 + }, + { + "epoch": 8.449720670391061, + "grad_norm": 0.9803155660629272, + "learning_rate": 0.0005794397759103641, + "loss": 0.3394, + "step": 15125 + }, + { + "epoch": 8.45027932960894, + "grad_norm": 0.715829074382782, + "learning_rate": 0.0005794117647058824, + "loss": 0.4679, + "step": 15126 + }, + { + "epoch": 8.450837988826816, + "grad_norm": 0.5965988636016846, + "learning_rate": 0.0005793837535014006, + "loss": 0.4591, + "step": 15127 + }, + { + "epoch": 8.451396648044692, + "grad_norm": 1.1053520441055298, + "learning_rate": 0.0005793557422969187, + "loss": 0.4367, + "step": 15128 + }, + { + "epoch": 8.451955307262569, + "grad_norm": 0.882635772228241, + "learning_rate": 0.000579327731092437, + "loss": 0.4614, + "step": 15129 + }, + { + "epoch": 8.452513966480447, + "grad_norm": 0.5637270212173462, + "learning_rate": 0.0005792997198879552, + "loss": 0.5258, + "step": 15130 + }, + { + "epoch": 8.453072625698324, + "grad_norm": 0.4686194360256195, + "learning_rate": 0.0005792717086834735, + "loss": 0.4381, + "step": 15131 + }, + { + "epoch": 8.4536312849162, + "grad_norm": 0.5487255454063416, + "learning_rate": 0.0005792436974789917, + "loss": 0.5203, + "step": 15132 + }, + { + "epoch": 8.454189944134079, + "grad_norm": 0.4387367367744446, + "learning_rate": 0.0005792156862745098, + "loss": 0.5721, + "step": 15133 + }, + { + "epoch": 8.454748603351955, + "grad_norm": 0.4858744442462921, + "learning_rate": 0.000579187675070028, + "loss": 0.5477, + "step": 15134 + }, + { + "epoch": 8.455307262569832, + "grad_norm": 0.6573995351791382, + "learning_rate": 0.0005791596638655462, + "loss": 0.4172, + "step": 15135 + }, + { + "epoch": 8.45586592178771, + "grad_norm": 0.7264791131019592, + "learning_rate": 0.0005791316526610645, + "loss": 0.3538, + "step": 15136 + }, + { + "epoch": 8.456424581005587, + "grad_norm": 0.5942917466163635, + "learning_rate": 0.0005791036414565827, + "loss": 0.4882, + "step": 15137 + }, + { + "epoch": 8.456983240223463, + "grad_norm": 0.6617839932441711, + "learning_rate": 0.0005790756302521009, + "loss": 0.4583, + "step": 15138 + }, + { + "epoch": 8.45754189944134, + "grad_norm": 0.8159322738647461, + "learning_rate": 0.000579047619047619, + "loss": 0.4415, + "step": 15139 + }, + { + "epoch": 8.458100558659218, + "grad_norm": 0.42285773158073425, + "learning_rate": 0.0005790196078431372, + "loss": 0.3736, + "step": 15140 + }, + { + "epoch": 8.458659217877095, + "grad_norm": 0.46349599957466125, + "learning_rate": 0.0005789915966386555, + "loss": 0.4708, + "step": 15141 + }, + { + "epoch": 8.459217877094972, + "grad_norm": 0.5299896597862244, + "learning_rate": 0.0005789635854341737, + "loss": 0.3909, + "step": 15142 + }, + { + "epoch": 8.45977653631285, + "grad_norm": 0.5893663167953491, + "learning_rate": 0.0005789355742296919, + "loss": 0.4049, + "step": 15143 + }, + { + "epoch": 8.460335195530726, + "grad_norm": 0.4479421079158783, + "learning_rate": 0.00057890756302521, + "loss": 0.4355, + "step": 15144 + }, + { + "epoch": 8.460893854748603, + "grad_norm": 0.4521818459033966, + "learning_rate": 0.0005788795518207282, + "loss": 0.4395, + "step": 15145 + }, + { + "epoch": 8.461452513966481, + "grad_norm": 0.8326586484909058, + "learning_rate": 0.0005788515406162466, + "loss": 0.5878, + "step": 15146 + }, + { + "epoch": 8.462011173184358, + "grad_norm": 2.5874366760253906, + "learning_rate": 0.0005788235294117648, + "loss": 0.3954, + "step": 15147 + }, + { + "epoch": 8.462569832402234, + "grad_norm": 0.4896738529205322, + "learning_rate": 0.000578795518207283, + "loss": 0.4108, + "step": 15148 + }, + { + "epoch": 8.463128491620111, + "grad_norm": 0.4276901185512543, + "learning_rate": 0.0005787675070028011, + "loss": 0.4182, + "step": 15149 + }, + { + "epoch": 8.46368715083799, + "grad_norm": 0.9241102337837219, + "learning_rate": 0.0005787394957983193, + "loss": 0.4939, + "step": 15150 + }, + { + "epoch": 8.464245810055866, + "grad_norm": 1.7278085947036743, + "learning_rate": 0.0005787114845938376, + "loss": 0.3167, + "step": 15151 + }, + { + "epoch": 8.464804469273743, + "grad_norm": 0.516864538192749, + "learning_rate": 0.0005786834733893558, + "loss": 0.369, + "step": 15152 + }, + { + "epoch": 8.46536312849162, + "grad_norm": 0.47903817892074585, + "learning_rate": 0.000578655462184874, + "loss": 0.4597, + "step": 15153 + }, + { + "epoch": 8.465921787709497, + "grad_norm": 0.6159040927886963, + "learning_rate": 0.0005786274509803922, + "loss": 0.3888, + "step": 15154 + }, + { + "epoch": 8.466480446927374, + "grad_norm": 0.35229989886283875, + "learning_rate": 0.0005785994397759103, + "loss": 0.323, + "step": 15155 + }, + { + "epoch": 8.46703910614525, + "grad_norm": 0.4384755492210388, + "learning_rate": 0.0005785714285714286, + "loss": 0.3895, + "step": 15156 + }, + { + "epoch": 8.467597765363129, + "grad_norm": 0.6398372054100037, + "learning_rate": 0.0005785434173669468, + "loss": 0.4142, + "step": 15157 + }, + { + "epoch": 8.468156424581005, + "grad_norm": 0.47204533219337463, + "learning_rate": 0.000578515406162465, + "loss": 0.3795, + "step": 15158 + }, + { + "epoch": 8.468715083798882, + "grad_norm": 0.6622194051742554, + "learning_rate": 0.0005784873949579832, + "loss": 0.4378, + "step": 15159 + }, + { + "epoch": 8.46927374301676, + "grad_norm": 1.2734688520431519, + "learning_rate": 0.0005784593837535013, + "loss": 0.4494, + "step": 15160 + }, + { + "epoch": 8.469832402234637, + "grad_norm": 0.9295127987861633, + "learning_rate": 0.0005784313725490196, + "loss": 0.6095, + "step": 15161 + }, + { + "epoch": 8.470391061452514, + "grad_norm": 0.6259679794311523, + "learning_rate": 0.0005784033613445379, + "loss": 0.4424, + "step": 15162 + }, + { + "epoch": 8.470949720670392, + "grad_norm": 1.059126853942871, + "learning_rate": 0.0005783753501400561, + "loss": 0.3123, + "step": 15163 + }, + { + "epoch": 8.471508379888268, + "grad_norm": 0.5010936260223389, + "learning_rate": 0.0005783473389355743, + "loss": 0.4049, + "step": 15164 + }, + { + "epoch": 8.472067039106145, + "grad_norm": 0.5045092701911926, + "learning_rate": 0.0005783193277310924, + "loss": 0.4235, + "step": 15165 + }, + { + "epoch": 8.472625698324022, + "grad_norm": 0.3551951050758362, + "learning_rate": 0.0005782913165266107, + "loss": 0.344, + "step": 15166 + }, + { + "epoch": 8.4731843575419, + "grad_norm": 0.6202253103256226, + "learning_rate": 0.0005782633053221289, + "loss": 0.4448, + "step": 15167 + }, + { + "epoch": 8.473743016759776, + "grad_norm": 0.5454356074333191, + "learning_rate": 0.0005782352941176471, + "loss": 0.5295, + "step": 15168 + }, + { + "epoch": 8.474301675977653, + "grad_norm": 0.41051018238067627, + "learning_rate": 0.0005782072829131653, + "loss": 0.3878, + "step": 15169 + }, + { + "epoch": 8.474860335195531, + "grad_norm": 0.4879007637500763, + "learning_rate": 0.0005781792717086835, + "loss": 0.4787, + "step": 15170 + }, + { + "epoch": 8.475418994413408, + "grad_norm": 0.7446616888046265, + "learning_rate": 0.0005781512605042017, + "loss": 0.4479, + "step": 15171 + }, + { + "epoch": 8.475977653631285, + "grad_norm": 1.3310225009918213, + "learning_rate": 0.0005781232492997199, + "loss": 0.4161, + "step": 15172 + }, + { + "epoch": 8.476536312849163, + "grad_norm": 0.4325145184993744, + "learning_rate": 0.0005780952380952381, + "loss": 0.3515, + "step": 15173 + }, + { + "epoch": 8.47709497206704, + "grad_norm": 0.5783650875091553, + "learning_rate": 0.0005780672268907563, + "loss": 0.5555, + "step": 15174 + }, + { + "epoch": 8.477653631284916, + "grad_norm": 0.6773547530174255, + "learning_rate": 0.0005780392156862745, + "loss": 0.5505, + "step": 15175 + }, + { + "epoch": 8.478212290502793, + "grad_norm": 0.47681915760040283, + "learning_rate": 0.0005780112044817926, + "loss": 0.3965, + "step": 15176 + }, + { + "epoch": 8.478770949720671, + "grad_norm": 0.8694610595703125, + "learning_rate": 0.0005779831932773109, + "loss": 0.4967, + "step": 15177 + }, + { + "epoch": 8.479329608938547, + "grad_norm": 0.7414939403533936, + "learning_rate": 0.0005779551820728291, + "loss": 0.3652, + "step": 15178 + }, + { + "epoch": 8.479888268156424, + "grad_norm": 0.7186941504478455, + "learning_rate": 0.0005779271708683474, + "loss": 0.5713, + "step": 15179 + }, + { + "epoch": 8.480446927374302, + "grad_norm": 0.6920742392539978, + "learning_rate": 0.0005778991596638656, + "loss": 0.5204, + "step": 15180 + }, + { + "epoch": 8.481005586592179, + "grad_norm": 0.4140740633010864, + "learning_rate": 0.0005778711484593837, + "loss": 0.4269, + "step": 15181 + }, + { + "epoch": 8.481564245810056, + "grad_norm": 0.7242079973220825, + "learning_rate": 0.000577843137254902, + "loss": 0.5297, + "step": 15182 + }, + { + "epoch": 8.482122905027932, + "grad_norm": 6.258434295654297, + "learning_rate": 0.0005778151260504202, + "loss": 0.4646, + "step": 15183 + }, + { + "epoch": 8.48268156424581, + "grad_norm": 0.4407448470592499, + "learning_rate": 0.0005777871148459384, + "loss": 0.2855, + "step": 15184 + }, + { + "epoch": 8.483240223463687, + "grad_norm": 0.5108163356781006, + "learning_rate": 0.0005777591036414566, + "loss": 0.4594, + "step": 15185 + }, + { + "epoch": 8.483798882681564, + "grad_norm": 0.4854128360748291, + "learning_rate": 0.0005777310924369748, + "loss": 0.4463, + "step": 15186 + }, + { + "epoch": 8.484357541899442, + "grad_norm": 0.5266446471214294, + "learning_rate": 0.000577703081232493, + "loss": 0.3882, + "step": 15187 + }, + { + "epoch": 8.484916201117318, + "grad_norm": 2.322105884552002, + "learning_rate": 0.0005776750700280112, + "loss": 0.3792, + "step": 15188 + }, + { + "epoch": 8.485474860335195, + "grad_norm": 0.515484631061554, + "learning_rate": 0.0005776470588235294, + "loss": 0.4347, + "step": 15189 + }, + { + "epoch": 8.486033519553073, + "grad_norm": 0.5316684246063232, + "learning_rate": 0.0005776190476190476, + "loss": 0.3438, + "step": 15190 + }, + { + "epoch": 8.48659217877095, + "grad_norm": 0.4150233268737793, + "learning_rate": 0.0005775910364145658, + "loss": 0.3617, + "step": 15191 + }, + { + "epoch": 8.487150837988827, + "grad_norm": 1.9137550592422485, + "learning_rate": 0.000577563025210084, + "loss": 0.6027, + "step": 15192 + }, + { + "epoch": 8.487709497206703, + "grad_norm": 0.5668200254440308, + "learning_rate": 0.0005775350140056022, + "loss": 0.4251, + "step": 15193 + }, + { + "epoch": 8.488268156424581, + "grad_norm": 1.2568035125732422, + "learning_rate": 0.0005775070028011204, + "loss": 0.36, + "step": 15194 + }, + { + "epoch": 8.488826815642458, + "grad_norm": 4.105048656463623, + "learning_rate": 0.0005774789915966387, + "loss": 0.4748, + "step": 15195 + }, + { + "epoch": 8.489385474860335, + "grad_norm": 0.5091862082481384, + "learning_rate": 0.0005774509803921569, + "loss": 0.3959, + "step": 15196 + }, + { + "epoch": 8.489944134078213, + "grad_norm": 0.6350382566452026, + "learning_rate": 0.0005774229691876752, + "loss": 0.7157, + "step": 15197 + }, + { + "epoch": 8.49050279329609, + "grad_norm": 0.47594332695007324, + "learning_rate": 0.0005773949579831933, + "loss": 0.4638, + "step": 15198 + }, + { + "epoch": 8.491061452513966, + "grad_norm": 0.5144283771514893, + "learning_rate": 0.0005773669467787115, + "loss": 0.4744, + "step": 15199 + }, + { + "epoch": 8.491620111731844, + "grad_norm": 0.9460113048553467, + "learning_rate": 0.0005773389355742297, + "loss": 0.3787, + "step": 15200 + }, + { + "epoch": 8.492178770949721, + "grad_norm": 0.5416004061698914, + "learning_rate": 0.0005773109243697479, + "loss": 0.4765, + "step": 15201 + }, + { + "epoch": 8.492737430167598, + "grad_norm": 0.3924878239631653, + "learning_rate": 0.0005772829131652662, + "loss": 0.4494, + "step": 15202 + }, + { + "epoch": 8.493296089385474, + "grad_norm": 0.4454866349697113, + "learning_rate": 0.0005772549019607843, + "loss": 0.3877, + "step": 15203 + }, + { + "epoch": 8.493854748603352, + "grad_norm": 0.9776567220687866, + "learning_rate": 0.0005772268907563025, + "loss": 0.3407, + "step": 15204 + }, + { + "epoch": 8.494413407821229, + "grad_norm": 0.49780526757240295, + "learning_rate": 0.0005771988795518207, + "loss": 0.5092, + "step": 15205 + }, + { + "epoch": 8.494972067039106, + "grad_norm": 0.4817385673522949, + "learning_rate": 0.0005771708683473389, + "loss": 0.4408, + "step": 15206 + }, + { + "epoch": 8.495530726256984, + "grad_norm": 1.1811668872833252, + "learning_rate": 0.0005771428571428572, + "loss": 0.4696, + "step": 15207 + }, + { + "epoch": 8.49608938547486, + "grad_norm": 0.5603366494178772, + "learning_rate": 0.0005771148459383753, + "loss": 0.4709, + "step": 15208 + }, + { + "epoch": 8.496648044692737, + "grad_norm": 0.4623156189918518, + "learning_rate": 0.0005770868347338935, + "loss": 0.3893, + "step": 15209 + }, + { + "epoch": 8.497206703910614, + "grad_norm": 0.47783979773521423, + "learning_rate": 0.0005770588235294117, + "loss": 0.4488, + "step": 15210 + }, + { + "epoch": 8.497765363128492, + "grad_norm": 0.365688294172287, + "learning_rate": 0.00057703081232493, + "loss": 0.3015, + "step": 15211 + }, + { + "epoch": 8.498324022346369, + "grad_norm": 0.3822142779827118, + "learning_rate": 0.0005770028011204483, + "loss": 0.422, + "step": 15212 + }, + { + "epoch": 8.498882681564245, + "grad_norm": 0.6450942754745483, + "learning_rate": 0.0005769747899159665, + "loss": 0.4827, + "step": 15213 + }, + { + "epoch": 8.499441340782123, + "grad_norm": 0.4435880780220032, + "learning_rate": 0.0005769467787114846, + "loss": 0.388, + "step": 15214 + }, + { + "epoch": 8.5, + "grad_norm": 0.49928146600723267, + "learning_rate": 0.0005769187675070028, + "loss": 0.3566, + "step": 15215 + }, + { + "epoch": 8.500558659217877, + "grad_norm": 0.7004232406616211, + "learning_rate": 0.000576890756302521, + "loss": 0.3701, + "step": 15216 + }, + { + "epoch": 8.501117318435755, + "grad_norm": 0.48921772837638855, + "learning_rate": 0.0005768627450980393, + "loss": 0.4805, + "step": 15217 + }, + { + "epoch": 8.501675977653631, + "grad_norm": 0.35773390531539917, + "learning_rate": 0.0005768347338935575, + "loss": 0.4482, + "step": 15218 + }, + { + "epoch": 8.502234636871508, + "grad_norm": 0.7257769107818604, + "learning_rate": 0.0005768067226890756, + "loss": 0.5911, + "step": 15219 + }, + { + "epoch": 8.502793296089386, + "grad_norm": 0.4031614363193512, + "learning_rate": 0.0005767787114845938, + "loss": 0.3966, + "step": 15220 + }, + { + "epoch": 8.503351955307263, + "grad_norm": 0.334354966878891, + "learning_rate": 0.000576750700280112, + "loss": 0.4081, + "step": 15221 + }, + { + "epoch": 8.50391061452514, + "grad_norm": 0.40169692039489746, + "learning_rate": 0.0005767226890756303, + "loss": 0.3978, + "step": 15222 + }, + { + "epoch": 8.504469273743016, + "grad_norm": 0.8172411322593689, + "learning_rate": 0.0005766946778711485, + "loss": 0.3922, + "step": 15223 + }, + { + "epoch": 8.505027932960894, + "grad_norm": 1.7994247674942017, + "learning_rate": 0.0005766666666666666, + "loss": 0.4577, + "step": 15224 + }, + { + "epoch": 8.505586592178771, + "grad_norm": 0.5939977169036865, + "learning_rate": 0.0005766386554621848, + "loss": 0.4589, + "step": 15225 + }, + { + "epoch": 8.506145251396648, + "grad_norm": 0.4617646038532257, + "learning_rate": 0.000576610644257703, + "loss": 0.5809, + "step": 15226 + }, + { + "epoch": 8.506703910614526, + "grad_norm": 0.7643673419952393, + "learning_rate": 0.0005765826330532214, + "loss": 0.4111, + "step": 15227 + }, + { + "epoch": 8.507262569832402, + "grad_norm": 0.6527523398399353, + "learning_rate": 0.0005765546218487396, + "loss": 0.5062, + "step": 15228 + }, + { + "epoch": 8.507821229050279, + "grad_norm": 0.7315698862075806, + "learning_rate": 0.0005765266106442578, + "loss": 0.4736, + "step": 15229 + }, + { + "epoch": 8.508379888268156, + "grad_norm": 0.92759108543396, + "learning_rate": 0.0005764985994397759, + "loss": 0.3637, + "step": 15230 + }, + { + "epoch": 8.508938547486034, + "grad_norm": 0.3782186210155487, + "learning_rate": 0.0005764705882352941, + "loss": 0.4743, + "step": 15231 + }, + { + "epoch": 8.50949720670391, + "grad_norm": 4.267871379852295, + "learning_rate": 0.0005764425770308124, + "loss": 0.4108, + "step": 15232 + }, + { + "epoch": 8.510055865921787, + "grad_norm": 0.7674300074577332, + "learning_rate": 0.0005764145658263306, + "loss": 0.3554, + "step": 15233 + }, + { + "epoch": 8.510614525139665, + "grad_norm": 0.5992588400840759, + "learning_rate": 0.0005763865546218488, + "loss": 0.5023, + "step": 15234 + }, + { + "epoch": 8.511173184357542, + "grad_norm": 0.7348669767379761, + "learning_rate": 0.0005763585434173669, + "loss": 0.4499, + "step": 15235 + }, + { + "epoch": 8.511731843575419, + "grad_norm": 0.8790232539176941, + "learning_rate": 0.0005763305322128851, + "loss": 0.3699, + "step": 15236 + }, + { + "epoch": 8.512290502793297, + "grad_norm": 0.4855366349220276, + "learning_rate": 0.0005763025210084034, + "loss": 0.393, + "step": 15237 + }, + { + "epoch": 8.512849162011173, + "grad_norm": 2.1212544441223145, + "learning_rate": 0.0005762745098039216, + "loss": 0.3952, + "step": 15238 + }, + { + "epoch": 8.51340782122905, + "grad_norm": 0.5433536171913147, + "learning_rate": 0.0005762464985994398, + "loss": 0.37, + "step": 15239 + }, + { + "epoch": 8.513966480446927, + "grad_norm": 0.7903736233711243, + "learning_rate": 0.0005762184873949579, + "loss": 0.4866, + "step": 15240 + }, + { + "epoch": 8.514525139664805, + "grad_norm": 1.2631672620773315, + "learning_rate": 0.0005761904761904761, + "loss": 0.6079, + "step": 15241 + }, + { + "epoch": 8.515083798882682, + "grad_norm": 3.1984617710113525, + "learning_rate": 0.0005761624649859944, + "loss": 0.386, + "step": 15242 + }, + { + "epoch": 8.515642458100558, + "grad_norm": 0.6026468873023987, + "learning_rate": 0.0005761344537815126, + "loss": 0.4322, + "step": 15243 + }, + { + "epoch": 8.516201117318436, + "grad_norm": 0.49546369910240173, + "learning_rate": 0.0005761064425770309, + "loss": 0.4695, + "step": 15244 + }, + { + "epoch": 8.516759776536313, + "grad_norm": 0.6571424007415771, + "learning_rate": 0.0005760784313725491, + "loss": 0.4463, + "step": 15245 + }, + { + "epoch": 8.51731843575419, + "grad_norm": 0.5202311277389526, + "learning_rate": 0.0005760504201680672, + "loss": 0.4609, + "step": 15246 + }, + { + "epoch": 8.517877094972068, + "grad_norm": 0.5377156138420105, + "learning_rate": 0.0005760224089635855, + "loss": 0.5627, + "step": 15247 + }, + { + "epoch": 8.518435754189944, + "grad_norm": 0.9329795837402344, + "learning_rate": 0.0005759943977591037, + "loss": 0.5134, + "step": 15248 + }, + { + "epoch": 8.518994413407821, + "grad_norm": 1.0959504842758179, + "learning_rate": 0.0005759663865546219, + "loss": 0.3847, + "step": 15249 + }, + { + "epoch": 8.519553072625698, + "grad_norm": 0.6080384254455566, + "learning_rate": 0.0005759383753501401, + "loss": 0.5454, + "step": 15250 + }, + { + "epoch": 8.520111731843576, + "grad_norm": 0.5090042352676392, + "learning_rate": 0.0005759103641456582, + "loss": 0.4289, + "step": 15251 + }, + { + "epoch": 8.520670391061453, + "grad_norm": 0.32931584119796753, + "learning_rate": 0.0005758823529411765, + "loss": 0.3426, + "step": 15252 + }, + { + "epoch": 8.521229050279329, + "grad_norm": 0.721106767654419, + "learning_rate": 0.0005758543417366947, + "loss": 0.4052, + "step": 15253 + }, + { + "epoch": 8.521787709497207, + "grad_norm": 0.5239723920822144, + "learning_rate": 0.0005758263305322129, + "loss": 0.4452, + "step": 15254 + }, + { + "epoch": 8.522346368715084, + "grad_norm": 0.5963440537452698, + "learning_rate": 0.0005757983193277311, + "loss": 0.3637, + "step": 15255 + }, + { + "epoch": 8.52290502793296, + "grad_norm": 0.709584653377533, + "learning_rate": 0.0005757703081232492, + "loss": 0.4127, + "step": 15256 + }, + { + "epoch": 8.523463687150837, + "grad_norm": 0.6826097369194031, + "learning_rate": 0.0005757422969187675, + "loss": 0.4371, + "step": 15257 + }, + { + "epoch": 8.524022346368715, + "grad_norm": 0.4796157479286194, + "learning_rate": 0.0005757142857142857, + "loss": 0.4362, + "step": 15258 + }, + { + "epoch": 8.524581005586592, + "grad_norm": 0.530832052230835, + "learning_rate": 0.0005756862745098039, + "loss": 0.447, + "step": 15259 + }, + { + "epoch": 8.525139664804469, + "grad_norm": 0.5507177114486694, + "learning_rate": 0.0005756582633053221, + "loss": 0.5301, + "step": 15260 + }, + { + "epoch": 8.525698324022347, + "grad_norm": 0.45665866136550903, + "learning_rate": 0.0005756302521008404, + "loss": 0.4246, + "step": 15261 + }, + { + "epoch": 8.526256983240224, + "grad_norm": 1.238662838935852, + "learning_rate": 0.0005756022408963586, + "loss": 0.3877, + "step": 15262 + }, + { + "epoch": 8.5268156424581, + "grad_norm": 0.5967044234275818, + "learning_rate": 0.0005755742296918768, + "loss": 0.4119, + "step": 15263 + }, + { + "epoch": 8.527374301675978, + "grad_norm": 1.104540467262268, + "learning_rate": 0.000575546218487395, + "loss": 0.3732, + "step": 15264 + }, + { + "epoch": 8.527932960893855, + "grad_norm": 0.39002472162246704, + "learning_rate": 0.0005755182072829132, + "loss": 0.4222, + "step": 15265 + }, + { + "epoch": 8.528491620111732, + "grad_norm": 1.3714148998260498, + "learning_rate": 0.0005754901960784314, + "loss": 0.3174, + "step": 15266 + }, + { + "epoch": 8.529050279329608, + "grad_norm": 0.40640711784362793, + "learning_rate": 0.0005754621848739496, + "loss": 0.4269, + "step": 15267 + }, + { + "epoch": 8.529608938547486, + "grad_norm": 0.4001014530658722, + "learning_rate": 0.0005754341736694678, + "loss": 0.4606, + "step": 15268 + }, + { + "epoch": 8.530167597765363, + "grad_norm": 0.6572843194007874, + "learning_rate": 0.000575406162464986, + "loss": 0.4876, + "step": 15269 + }, + { + "epoch": 8.53072625698324, + "grad_norm": 0.5424188375473022, + "learning_rate": 0.0005753781512605042, + "loss": 0.4822, + "step": 15270 + }, + { + "epoch": 8.531284916201118, + "grad_norm": 0.3950853943824768, + "learning_rate": 0.0005753501400560224, + "loss": 0.4627, + "step": 15271 + }, + { + "epoch": 8.531843575418995, + "grad_norm": 0.580479085445404, + "learning_rate": 0.0005753221288515406, + "loss": 0.5232, + "step": 15272 + }, + { + "epoch": 8.532402234636871, + "grad_norm": 0.5094438791275024, + "learning_rate": 0.0005752941176470588, + "loss": 0.5471, + "step": 15273 + }, + { + "epoch": 8.53296089385475, + "grad_norm": 2.7602362632751465, + "learning_rate": 0.000575266106442577, + "loss": 0.4116, + "step": 15274 + }, + { + "epoch": 8.533519553072626, + "grad_norm": 0.4748554527759552, + "learning_rate": 0.0005752380952380952, + "loss": 0.4031, + "step": 15275 + }, + { + "epoch": 8.534078212290503, + "grad_norm": 0.48180916905403137, + "learning_rate": 0.0005752100840336134, + "loss": 0.4343, + "step": 15276 + }, + { + "epoch": 8.53463687150838, + "grad_norm": 2.5633301734924316, + "learning_rate": 0.0005751820728291318, + "loss": 0.4275, + "step": 15277 + }, + { + "epoch": 8.535195530726257, + "grad_norm": 1.068892478942871, + "learning_rate": 0.0005751540616246499, + "loss": 0.418, + "step": 15278 + }, + { + "epoch": 8.535754189944134, + "grad_norm": 0.4912353456020355, + "learning_rate": 0.0005751260504201681, + "loss": 0.3105, + "step": 15279 + }, + { + "epoch": 8.53631284916201, + "grad_norm": 1.028026819229126, + "learning_rate": 0.0005750980392156863, + "loss": 0.4586, + "step": 15280 + }, + { + "epoch": 8.536871508379889, + "grad_norm": 0.45351430773735046, + "learning_rate": 0.0005750700280112045, + "loss": 0.5092, + "step": 15281 + }, + { + "epoch": 8.537430167597766, + "grad_norm": 0.5421093106269836, + "learning_rate": 0.0005750420168067228, + "loss": 0.5087, + "step": 15282 + }, + { + "epoch": 8.537988826815642, + "grad_norm": 0.706861674785614, + "learning_rate": 0.0005750140056022409, + "loss": 0.3875, + "step": 15283 + }, + { + "epoch": 8.538547486033519, + "grad_norm": 0.4213111698627472, + "learning_rate": 0.0005749859943977591, + "loss": 0.4406, + "step": 15284 + }, + { + "epoch": 8.539106145251397, + "grad_norm": 0.7430433630943298, + "learning_rate": 0.0005749579831932773, + "loss": 0.5058, + "step": 15285 + }, + { + "epoch": 8.539664804469274, + "grad_norm": 1.3065729141235352, + "learning_rate": 0.0005749299719887955, + "loss": 0.4532, + "step": 15286 + }, + { + "epoch": 8.54022346368715, + "grad_norm": 0.45534664392471313, + "learning_rate": 0.0005749019607843138, + "loss": 0.399, + "step": 15287 + }, + { + "epoch": 8.540782122905028, + "grad_norm": 0.6419287919998169, + "learning_rate": 0.0005748739495798319, + "loss": 0.4442, + "step": 15288 + }, + { + "epoch": 8.541340782122905, + "grad_norm": 0.8328287601470947, + "learning_rate": 0.0005748459383753501, + "loss": 0.3241, + "step": 15289 + }, + { + "epoch": 8.541899441340782, + "grad_norm": 0.6678518652915955, + "learning_rate": 0.0005748179271708683, + "loss": 0.4441, + "step": 15290 + }, + { + "epoch": 8.54245810055866, + "grad_norm": 0.4900563359260559, + "learning_rate": 0.0005747899159663865, + "loss": 0.5411, + "step": 15291 + }, + { + "epoch": 8.543016759776537, + "grad_norm": 0.415881484746933, + "learning_rate": 0.0005747619047619048, + "loss": 0.3548, + "step": 15292 + }, + { + "epoch": 8.543575418994413, + "grad_norm": 0.48712825775146484, + "learning_rate": 0.0005747338935574231, + "loss": 0.417, + "step": 15293 + }, + { + "epoch": 8.544134078212291, + "grad_norm": 0.5948696136474609, + "learning_rate": 0.0005747058823529412, + "loss": 0.5534, + "step": 15294 + }, + { + "epoch": 8.544692737430168, + "grad_norm": 0.6844908595085144, + "learning_rate": 0.0005746778711484594, + "loss": 0.3397, + "step": 15295 + }, + { + "epoch": 8.545251396648045, + "grad_norm": 0.516048014163971, + "learning_rate": 0.0005746498599439776, + "loss": 0.4157, + "step": 15296 + }, + { + "epoch": 8.545810055865921, + "grad_norm": 0.37359654903411865, + "learning_rate": 0.0005746218487394959, + "loss": 0.3686, + "step": 15297 + }, + { + "epoch": 8.5463687150838, + "grad_norm": 0.4231356680393219, + "learning_rate": 0.0005745938375350141, + "loss": 0.3044, + "step": 15298 + }, + { + "epoch": 8.546927374301676, + "grad_norm": 0.5538113117218018, + "learning_rate": 0.0005745658263305322, + "loss": 0.394, + "step": 15299 + }, + { + "epoch": 8.547486033519553, + "grad_norm": 0.44817057251930237, + "learning_rate": 0.0005745378151260504, + "loss": 0.4676, + "step": 15300 + }, + { + "epoch": 8.548044692737431, + "grad_norm": 0.5843313932418823, + "learning_rate": 0.0005745098039215686, + "loss": 0.4392, + "step": 15301 + }, + { + "epoch": 8.548603351955308, + "grad_norm": 0.5026608109474182, + "learning_rate": 0.0005744817927170869, + "loss": 0.4901, + "step": 15302 + }, + { + "epoch": 8.549162011173184, + "grad_norm": 0.5650805234909058, + "learning_rate": 0.0005744537815126051, + "loss": 0.4373, + "step": 15303 + }, + { + "epoch": 8.54972067039106, + "grad_norm": 0.4040001332759857, + "learning_rate": 0.0005744257703081232, + "loss": 0.323, + "step": 15304 + }, + { + "epoch": 8.550279329608939, + "grad_norm": 1.3389577865600586, + "learning_rate": 0.0005743977591036414, + "loss": 0.3031, + "step": 15305 + }, + { + "epoch": 8.550837988826816, + "grad_norm": 1.3753235340118408, + "learning_rate": 0.0005743697478991596, + "loss": 0.4942, + "step": 15306 + }, + { + "epoch": 8.551396648044692, + "grad_norm": 0.437633752822876, + "learning_rate": 0.0005743417366946779, + "loss": 0.4945, + "step": 15307 + }, + { + "epoch": 8.55195530726257, + "grad_norm": 1.9556528329849243, + "learning_rate": 0.0005743137254901961, + "loss": 0.4551, + "step": 15308 + }, + { + "epoch": 8.552513966480447, + "grad_norm": 0.35048234462738037, + "learning_rate": 0.0005742857142857144, + "loss": 0.4185, + "step": 15309 + }, + { + "epoch": 8.553072625698324, + "grad_norm": 0.44669902324676514, + "learning_rate": 0.0005742577030812324, + "loss": 0.4259, + "step": 15310 + }, + { + "epoch": 8.553631284916202, + "grad_norm": 3.903268814086914, + "learning_rate": 0.0005742296918767507, + "loss": 0.5387, + "step": 15311 + }, + { + "epoch": 8.554189944134079, + "grad_norm": 0.6867513656616211, + "learning_rate": 0.000574201680672269, + "loss": 0.4175, + "step": 15312 + }, + { + "epoch": 8.554748603351955, + "grad_norm": 0.5278183817863464, + "learning_rate": 0.0005741736694677872, + "loss": 0.5153, + "step": 15313 + }, + { + "epoch": 8.555307262569832, + "grad_norm": 2.99004864692688, + "learning_rate": 0.0005741456582633054, + "loss": 0.4439, + "step": 15314 + }, + { + "epoch": 8.55586592178771, + "grad_norm": 0.5834138989448547, + "learning_rate": 0.0005741176470588235, + "loss": 0.4667, + "step": 15315 + }, + { + "epoch": 8.556424581005587, + "grad_norm": 0.4948843717575073, + "learning_rate": 0.0005740896358543417, + "loss": 0.4084, + "step": 15316 + }, + { + "epoch": 8.556983240223463, + "grad_norm": 1.5831941366195679, + "learning_rate": 0.00057406162464986, + "loss": 0.5034, + "step": 15317 + }, + { + "epoch": 8.557541899441341, + "grad_norm": 0.47506552934646606, + "learning_rate": 0.0005740336134453782, + "loss": 0.4968, + "step": 15318 + }, + { + "epoch": 8.558100558659218, + "grad_norm": 0.40010133385658264, + "learning_rate": 0.0005740056022408964, + "loss": 0.4323, + "step": 15319 + }, + { + "epoch": 8.558659217877095, + "grad_norm": 0.46140024065971375, + "learning_rate": 0.0005739775910364145, + "loss": 0.522, + "step": 15320 + }, + { + "epoch": 8.559217877094973, + "grad_norm": 0.8153825402259827, + "learning_rate": 0.0005739495798319327, + "loss": 0.5142, + "step": 15321 + }, + { + "epoch": 8.55977653631285, + "grad_norm": 0.6610863208770752, + "learning_rate": 0.000573921568627451, + "loss": 0.4634, + "step": 15322 + }, + { + "epoch": 8.560335195530726, + "grad_norm": 0.4047147035598755, + "learning_rate": 0.0005738935574229692, + "loss": 0.4321, + "step": 15323 + }, + { + "epoch": 8.560893854748603, + "grad_norm": 0.5521747469902039, + "learning_rate": 0.0005738655462184874, + "loss": 0.3799, + "step": 15324 + }, + { + "epoch": 8.561452513966481, + "grad_norm": 0.47687071561813354, + "learning_rate": 0.0005738375350140056, + "loss": 0.4006, + "step": 15325 + }, + { + "epoch": 8.562011173184358, + "grad_norm": 0.38448652625083923, + "learning_rate": 0.0005738095238095237, + "loss": 0.3422, + "step": 15326 + }, + { + "epoch": 8.562569832402234, + "grad_norm": 0.4361882209777832, + "learning_rate": 0.0005737815126050421, + "loss": 0.4377, + "step": 15327 + }, + { + "epoch": 8.563128491620112, + "grad_norm": 0.544578492641449, + "learning_rate": 0.0005737535014005603, + "loss": 0.4607, + "step": 15328 + }, + { + "epoch": 8.563687150837989, + "grad_norm": 0.37169408798217773, + "learning_rate": 0.0005737254901960785, + "loss": 0.3756, + "step": 15329 + }, + { + "epoch": 8.564245810055866, + "grad_norm": 0.5007482767105103, + "learning_rate": 0.0005736974789915967, + "loss": 0.4422, + "step": 15330 + }, + { + "epoch": 8.564804469273742, + "grad_norm": 0.4904552400112152, + "learning_rate": 0.0005736694677871148, + "loss": 0.3253, + "step": 15331 + }, + { + "epoch": 8.56536312849162, + "grad_norm": 1.5046217441558838, + "learning_rate": 0.0005736414565826331, + "loss": 0.4161, + "step": 15332 + }, + { + "epoch": 8.565921787709497, + "grad_norm": 0.6295437216758728, + "learning_rate": 0.0005736134453781513, + "loss": 0.4532, + "step": 15333 + }, + { + "epoch": 8.566480446927374, + "grad_norm": 0.5816341042518616, + "learning_rate": 0.0005735854341736695, + "loss": 0.3213, + "step": 15334 + }, + { + "epoch": 8.567039106145252, + "grad_norm": 0.4504592716693878, + "learning_rate": 0.0005735574229691877, + "loss": 0.4142, + "step": 15335 + }, + { + "epoch": 8.567597765363129, + "grad_norm": 0.4077862799167633, + "learning_rate": 0.0005735294117647058, + "loss": 0.363, + "step": 15336 + }, + { + "epoch": 8.568156424581005, + "grad_norm": 0.47129207849502563, + "learning_rate": 0.0005735014005602241, + "loss": 0.372, + "step": 15337 + }, + { + "epoch": 8.568715083798883, + "grad_norm": 1.7956185340881348, + "learning_rate": 0.0005734733893557423, + "loss": 0.5204, + "step": 15338 + }, + { + "epoch": 8.56927374301676, + "grad_norm": 0.42570939660072327, + "learning_rate": 0.0005734453781512605, + "loss": 0.5525, + "step": 15339 + }, + { + "epoch": 8.569832402234637, + "grad_norm": 0.38253700733184814, + "learning_rate": 0.0005734173669467787, + "loss": 0.3981, + "step": 15340 + }, + { + "epoch": 8.570391061452513, + "grad_norm": 0.5737777948379517, + "learning_rate": 0.0005733893557422969, + "loss": 0.6334, + "step": 15341 + }, + { + "epoch": 8.570949720670392, + "grad_norm": 0.552773654460907, + "learning_rate": 0.0005733613445378151, + "loss": 0.4469, + "step": 15342 + }, + { + "epoch": 8.571508379888268, + "grad_norm": 0.5476149320602417, + "learning_rate": 0.0005733333333333334, + "loss": 0.3532, + "step": 15343 + }, + { + "epoch": 8.572067039106145, + "grad_norm": 0.7920732498168945, + "learning_rate": 0.0005733053221288516, + "loss": 0.5412, + "step": 15344 + }, + { + "epoch": 8.572625698324023, + "grad_norm": 0.4449453055858612, + "learning_rate": 0.0005732773109243698, + "loss": 0.4464, + "step": 15345 + }, + { + "epoch": 8.5731843575419, + "grad_norm": 0.5351168513298035, + "learning_rate": 0.000573249299719888, + "loss": 0.4083, + "step": 15346 + }, + { + "epoch": 8.573743016759776, + "grad_norm": 0.5519024729728699, + "learning_rate": 0.0005732212885154062, + "loss": 0.4925, + "step": 15347 + }, + { + "epoch": 8.574301675977654, + "grad_norm": 1.1811132431030273, + "learning_rate": 0.0005731932773109244, + "loss": 0.4455, + "step": 15348 + }, + { + "epoch": 8.574860335195531, + "grad_norm": 3.202483654022217, + "learning_rate": 0.0005731652661064426, + "loss": 0.5439, + "step": 15349 + }, + { + "epoch": 8.575418994413408, + "grad_norm": 0.559126079082489, + "learning_rate": 0.0005731372549019608, + "loss": 0.4076, + "step": 15350 + }, + { + "epoch": 8.575977653631284, + "grad_norm": 0.5949681997299194, + "learning_rate": 0.000573109243697479, + "loss": 0.5334, + "step": 15351 + }, + { + "epoch": 8.576536312849163, + "grad_norm": 1.6870633363723755, + "learning_rate": 0.0005730812324929972, + "loss": 0.43, + "step": 15352 + }, + { + "epoch": 8.577094972067039, + "grad_norm": 1.1216844320297241, + "learning_rate": 0.0005730532212885154, + "loss": 0.4933, + "step": 15353 + }, + { + "epoch": 8.577653631284916, + "grad_norm": 0.5340660810470581, + "learning_rate": 0.0005730252100840336, + "loss": 0.4221, + "step": 15354 + }, + { + "epoch": 8.578212290502794, + "grad_norm": 1.3156418800354004, + "learning_rate": 0.0005729971988795518, + "loss": 0.6627, + "step": 15355 + }, + { + "epoch": 8.57877094972067, + "grad_norm": 0.49273672699928284, + "learning_rate": 0.00057296918767507, + "loss": 0.4833, + "step": 15356 + }, + { + "epoch": 8.579329608938547, + "grad_norm": 6.776882648468018, + "learning_rate": 0.0005729411764705883, + "loss": 0.4101, + "step": 15357 + }, + { + "epoch": 8.579888268156424, + "grad_norm": 0.3707313537597656, + "learning_rate": 0.0005729131652661064, + "loss": 0.3967, + "step": 15358 + }, + { + "epoch": 8.580446927374302, + "grad_norm": 0.7379883527755737, + "learning_rate": 0.0005728851540616247, + "loss": 0.4327, + "step": 15359 + }, + { + "epoch": 8.581005586592179, + "grad_norm": 0.7171437740325928, + "learning_rate": 0.0005728571428571429, + "loss": 0.4237, + "step": 15360 + }, + { + "epoch": 8.581564245810055, + "grad_norm": 0.5564234256744385, + "learning_rate": 0.0005728291316526611, + "loss": 0.6196, + "step": 15361 + }, + { + "epoch": 8.582122905027934, + "grad_norm": 0.6342710852622986, + "learning_rate": 0.0005728011204481794, + "loss": 0.5136, + "step": 15362 + }, + { + "epoch": 8.58268156424581, + "grad_norm": 0.4498765170574188, + "learning_rate": 0.0005727731092436975, + "loss": 0.4119, + "step": 15363 + }, + { + "epoch": 8.583240223463687, + "grad_norm": 0.5579925179481506, + "learning_rate": 0.0005727450980392157, + "loss": 0.4959, + "step": 15364 + }, + { + "epoch": 8.583798882681565, + "grad_norm": 0.4603900611400604, + "learning_rate": 0.0005727170868347339, + "loss": 0.4239, + "step": 15365 + }, + { + "epoch": 8.584357541899442, + "grad_norm": 0.616898238658905, + "learning_rate": 0.0005726890756302521, + "loss": 0.3897, + "step": 15366 + }, + { + "epoch": 8.584916201117318, + "grad_norm": 0.8591034412384033, + "learning_rate": 0.0005726610644257704, + "loss": 0.4368, + "step": 15367 + }, + { + "epoch": 8.585474860335196, + "grad_norm": 1.5668877363204956, + "learning_rate": 0.0005726330532212885, + "loss": 0.4951, + "step": 15368 + }, + { + "epoch": 8.586033519553073, + "grad_norm": 0.7134303450584412, + "learning_rate": 0.0005726050420168067, + "loss": 0.3598, + "step": 15369 + }, + { + "epoch": 8.58659217877095, + "grad_norm": 0.6393819451332092, + "learning_rate": 0.0005725770308123249, + "loss": 0.4035, + "step": 15370 + }, + { + "epoch": 8.587150837988826, + "grad_norm": 0.3917793333530426, + "learning_rate": 0.0005725490196078431, + "loss": 0.4271, + "step": 15371 + }, + { + "epoch": 8.587709497206705, + "grad_norm": 0.3668496310710907, + "learning_rate": 0.0005725210084033614, + "loss": 0.4282, + "step": 15372 + }, + { + "epoch": 8.588268156424581, + "grad_norm": 0.48604726791381836, + "learning_rate": 0.0005724929971988796, + "loss": 0.3779, + "step": 15373 + }, + { + "epoch": 8.588826815642458, + "grad_norm": 0.469411700963974, + "learning_rate": 0.0005724649859943977, + "loss": 0.506, + "step": 15374 + }, + { + "epoch": 8.589385474860336, + "grad_norm": 1.0414034128189087, + "learning_rate": 0.000572436974789916, + "loss": 0.58, + "step": 15375 + }, + { + "epoch": 8.589944134078213, + "grad_norm": 0.650741696357727, + "learning_rate": 0.0005724089635854342, + "loss": 0.4933, + "step": 15376 + }, + { + "epoch": 8.59050279329609, + "grad_norm": 1.2917307615280151, + "learning_rate": 0.0005723809523809525, + "loss": 0.486, + "step": 15377 + }, + { + "epoch": 8.591061452513966, + "grad_norm": 0.5404037833213806, + "learning_rate": 0.0005723529411764707, + "loss": 0.5444, + "step": 15378 + }, + { + "epoch": 8.591620111731844, + "grad_norm": 0.5045173764228821, + "learning_rate": 0.0005723249299719888, + "loss": 0.4291, + "step": 15379 + }, + { + "epoch": 8.59217877094972, + "grad_norm": 0.4748923182487488, + "learning_rate": 0.000572296918767507, + "loss": 0.5593, + "step": 15380 + }, + { + "epoch": 8.592737430167597, + "grad_norm": 0.4588840901851654, + "learning_rate": 0.0005722689075630252, + "loss": 0.4488, + "step": 15381 + }, + { + "epoch": 8.593296089385476, + "grad_norm": 0.4534672200679779, + "learning_rate": 0.0005722408963585435, + "loss": 0.5363, + "step": 15382 + }, + { + "epoch": 8.593854748603352, + "grad_norm": 0.5587313771247864, + "learning_rate": 0.0005722128851540617, + "loss": 0.3772, + "step": 15383 + }, + { + "epoch": 8.594413407821229, + "grad_norm": 0.5406129956245422, + "learning_rate": 0.0005721848739495798, + "loss": 0.4719, + "step": 15384 + }, + { + "epoch": 8.594972067039105, + "grad_norm": 0.46785107254981995, + "learning_rate": 0.000572156862745098, + "loss": 0.5067, + "step": 15385 + }, + { + "epoch": 8.595530726256984, + "grad_norm": 0.3882419466972351, + "learning_rate": 0.0005721288515406162, + "loss": 0.3249, + "step": 15386 + }, + { + "epoch": 8.59608938547486, + "grad_norm": 0.696916401386261, + "learning_rate": 0.0005721008403361345, + "loss": 0.5048, + "step": 15387 + }, + { + "epoch": 8.596648044692737, + "grad_norm": 0.7198442220687866, + "learning_rate": 0.0005720728291316527, + "loss": 0.3699, + "step": 15388 + }, + { + "epoch": 8.597206703910615, + "grad_norm": 0.5968286395072937, + "learning_rate": 0.0005720448179271709, + "loss": 0.4135, + "step": 15389 + }, + { + "epoch": 8.597765363128492, + "grad_norm": 0.4334351122379303, + "learning_rate": 0.000572016806722689, + "loss": 0.3991, + "step": 15390 + }, + { + "epoch": 8.598324022346368, + "grad_norm": 4.028077602386475, + "learning_rate": 0.0005719887955182072, + "loss": 0.4556, + "step": 15391 + }, + { + "epoch": 8.598882681564247, + "grad_norm": 0.38883304595947266, + "learning_rate": 0.0005719607843137256, + "loss": 0.3998, + "step": 15392 + }, + { + "epoch": 8.599441340782123, + "grad_norm": 0.47652119398117065, + "learning_rate": 0.0005719327731092438, + "loss": 0.5709, + "step": 15393 + }, + { + "epoch": 8.6, + "grad_norm": 0.48822495341300964, + "learning_rate": 0.000571904761904762, + "loss": 0.3295, + "step": 15394 + }, + { + "epoch": 8.600558659217878, + "grad_norm": 0.5199114680290222, + "learning_rate": 0.0005718767507002801, + "loss": 0.4382, + "step": 15395 + }, + { + "epoch": 8.601117318435755, + "grad_norm": 0.4140242338180542, + "learning_rate": 0.0005718487394957983, + "loss": 0.3759, + "step": 15396 + }, + { + "epoch": 8.601675977653631, + "grad_norm": 9.742976188659668, + "learning_rate": 0.0005718207282913165, + "loss": 0.455, + "step": 15397 + }, + { + "epoch": 8.602234636871508, + "grad_norm": 0.5877695679664612, + "learning_rate": 0.0005717927170868348, + "loss": 0.3994, + "step": 15398 + }, + { + "epoch": 8.602793296089386, + "grad_norm": 0.5117267370223999, + "learning_rate": 0.000571764705882353, + "loss": 0.4464, + "step": 15399 + }, + { + "epoch": 8.603351955307263, + "grad_norm": 0.3696885108947754, + "learning_rate": 0.0005717366946778711, + "loss": 0.3576, + "step": 15400 + }, + { + "epoch": 8.60391061452514, + "grad_norm": 3.027294397354126, + "learning_rate": 0.0005717086834733893, + "loss": 0.4177, + "step": 15401 + }, + { + "epoch": 8.604469273743018, + "grad_norm": 0.7337448000907898, + "learning_rate": 0.0005716806722689075, + "loss": 0.2937, + "step": 15402 + }, + { + "epoch": 8.605027932960894, + "grad_norm": 1.5567195415496826, + "learning_rate": 0.0005716526610644258, + "loss": 0.3165, + "step": 15403 + }, + { + "epoch": 8.60558659217877, + "grad_norm": 0.7158293724060059, + "learning_rate": 0.000571624649859944, + "loss": 0.5073, + "step": 15404 + }, + { + "epoch": 8.606145251396647, + "grad_norm": 3.5723652839660645, + "learning_rate": 0.0005715966386554622, + "loss": 0.4221, + "step": 15405 + }, + { + "epoch": 8.606703910614526, + "grad_norm": 0.4587322175502777, + "learning_rate": 0.0005715686274509803, + "loss": 0.4709, + "step": 15406 + }, + { + "epoch": 8.607262569832402, + "grad_norm": 0.5292487740516663, + "learning_rate": 0.0005715406162464985, + "loss": 0.4456, + "step": 15407 + }, + { + "epoch": 8.607821229050279, + "grad_norm": 0.5739074945449829, + "learning_rate": 0.0005715126050420169, + "loss": 0.4395, + "step": 15408 + }, + { + "epoch": 8.608379888268157, + "grad_norm": 0.46167340874671936, + "learning_rate": 0.0005714845938375351, + "loss": 0.4064, + "step": 15409 + }, + { + "epoch": 8.608938547486034, + "grad_norm": 0.46245571970939636, + "learning_rate": 0.0005714565826330533, + "loss": 0.4121, + "step": 15410 + }, + { + "epoch": 8.60949720670391, + "grad_norm": 0.4056048095226288, + "learning_rate": 0.0005714285714285714, + "loss": 0.4483, + "step": 15411 + }, + { + "epoch": 8.610055865921789, + "grad_norm": 0.6184943914413452, + "learning_rate": 0.0005714005602240896, + "loss": 0.6366, + "step": 15412 + }, + { + "epoch": 8.610614525139665, + "grad_norm": 0.33188676834106445, + "learning_rate": 0.0005713725490196079, + "loss": 0.4143, + "step": 15413 + }, + { + "epoch": 8.611173184357542, + "grad_norm": 0.3528975546360016, + "learning_rate": 0.0005713445378151261, + "loss": 0.3827, + "step": 15414 + }, + { + "epoch": 8.611731843575418, + "grad_norm": 0.5689566135406494, + "learning_rate": 0.0005713165266106443, + "loss": 0.6413, + "step": 15415 + }, + { + "epoch": 8.612290502793297, + "grad_norm": 0.4984848201274872, + "learning_rate": 0.0005712885154061624, + "loss": 0.4425, + "step": 15416 + }, + { + "epoch": 8.612849162011173, + "grad_norm": 0.5701281428337097, + "learning_rate": 0.0005712605042016806, + "loss": 0.4005, + "step": 15417 + }, + { + "epoch": 8.61340782122905, + "grad_norm": 0.8601603507995605, + "learning_rate": 0.0005712324929971989, + "loss": 0.5124, + "step": 15418 + }, + { + "epoch": 8.613966480446928, + "grad_norm": 0.3771197199821472, + "learning_rate": 0.0005712044817927171, + "loss": 0.4134, + "step": 15419 + }, + { + "epoch": 8.614525139664805, + "grad_norm": 0.5102972984313965, + "learning_rate": 0.0005711764705882353, + "loss": 0.3771, + "step": 15420 + }, + { + "epoch": 8.615083798882681, + "grad_norm": 0.3841908872127533, + "learning_rate": 0.0005711484593837535, + "loss": 0.3402, + "step": 15421 + }, + { + "epoch": 8.61564245810056, + "grad_norm": 17.929323196411133, + "learning_rate": 0.0005711204481792716, + "loss": 0.3302, + "step": 15422 + }, + { + "epoch": 8.616201117318436, + "grad_norm": 0.5760158896446228, + "learning_rate": 0.0005710924369747899, + "loss": 0.498, + "step": 15423 + }, + { + "epoch": 8.616759776536313, + "grad_norm": 0.5124297738075256, + "learning_rate": 0.0005710644257703081, + "loss": 0.5373, + "step": 15424 + }, + { + "epoch": 8.61731843575419, + "grad_norm": 0.42361506819725037, + "learning_rate": 0.0005710364145658264, + "loss": 0.3442, + "step": 15425 + }, + { + "epoch": 8.617877094972068, + "grad_norm": 0.9741013646125793, + "learning_rate": 0.0005710084033613446, + "loss": 0.364, + "step": 15426 + }, + { + "epoch": 8.618435754189944, + "grad_norm": 0.663214385509491, + "learning_rate": 0.0005709803921568627, + "loss": 0.4551, + "step": 15427 + }, + { + "epoch": 8.61899441340782, + "grad_norm": 0.5509231686592102, + "learning_rate": 0.000570952380952381, + "loss": 0.3593, + "step": 15428 + }, + { + "epoch": 8.619553072625699, + "grad_norm": 0.5926401019096375, + "learning_rate": 0.0005709243697478992, + "loss": 0.5347, + "step": 15429 + }, + { + "epoch": 8.620111731843576, + "grad_norm": 0.42184609174728394, + "learning_rate": 0.0005708963585434174, + "loss": 0.4203, + "step": 15430 + }, + { + "epoch": 8.620670391061452, + "grad_norm": 0.5329546332359314, + "learning_rate": 0.0005708683473389356, + "loss": 0.4386, + "step": 15431 + }, + { + "epoch": 8.621229050279329, + "grad_norm": 0.77520751953125, + "learning_rate": 0.0005708403361344537, + "loss": 0.3368, + "step": 15432 + }, + { + "epoch": 8.621787709497207, + "grad_norm": 0.5103943943977356, + "learning_rate": 0.000570812324929972, + "loss": 0.5568, + "step": 15433 + }, + { + "epoch": 8.622346368715084, + "grad_norm": 0.48108652234077454, + "learning_rate": 0.0005707843137254902, + "loss": 0.345, + "step": 15434 + }, + { + "epoch": 8.62290502793296, + "grad_norm": 0.42470112442970276, + "learning_rate": 0.0005707563025210084, + "loss": 0.4091, + "step": 15435 + }, + { + "epoch": 8.623463687150839, + "grad_norm": 0.4808240234851837, + "learning_rate": 0.0005707282913165266, + "loss": 0.4693, + "step": 15436 + }, + { + "epoch": 8.624022346368715, + "grad_norm": 0.6123438477516174, + "learning_rate": 0.0005707002801120448, + "loss": 0.4776, + "step": 15437 + }, + { + "epoch": 8.624581005586592, + "grad_norm": 0.43505755066871643, + "learning_rate": 0.000570672268907563, + "loss": 0.4637, + "step": 15438 + }, + { + "epoch": 8.62513966480447, + "grad_norm": 0.5505017042160034, + "learning_rate": 0.0005706442577030812, + "loss": 0.4632, + "step": 15439 + }, + { + "epoch": 8.625698324022347, + "grad_norm": 0.5906779170036316, + "learning_rate": 0.0005706162464985994, + "loss": 0.4499, + "step": 15440 + }, + { + "epoch": 8.626256983240223, + "grad_norm": 0.45223239064216614, + "learning_rate": 0.0005705882352941177, + "loss": 0.3646, + "step": 15441 + }, + { + "epoch": 8.6268156424581, + "grad_norm": 0.7143236398696899, + "learning_rate": 0.0005705602240896359, + "loss": 0.3868, + "step": 15442 + }, + { + "epoch": 8.627374301675978, + "grad_norm": 0.39840784668922424, + "learning_rate": 0.0005705322128851541, + "loss": 0.4613, + "step": 15443 + }, + { + "epoch": 8.627932960893855, + "grad_norm": 1.2329320907592773, + "learning_rate": 0.0005705042016806723, + "loss": 0.5497, + "step": 15444 + }, + { + "epoch": 8.628491620111731, + "grad_norm": 0.6231057047843933, + "learning_rate": 0.0005704761904761905, + "loss": 0.4027, + "step": 15445 + }, + { + "epoch": 8.62905027932961, + "grad_norm": 0.7226108908653259, + "learning_rate": 0.0005704481792717087, + "loss": 0.4338, + "step": 15446 + }, + { + "epoch": 8.629608938547486, + "grad_norm": 0.5482200980186462, + "learning_rate": 0.0005704201680672269, + "loss": 0.3992, + "step": 15447 + }, + { + "epoch": 8.630167597765363, + "grad_norm": 0.8195366263389587, + "learning_rate": 0.0005703921568627452, + "loss": 0.4491, + "step": 15448 + }, + { + "epoch": 8.630726256983241, + "grad_norm": 0.5906659364700317, + "learning_rate": 0.0005703641456582633, + "loss": 0.4374, + "step": 15449 + }, + { + "epoch": 8.631284916201118, + "grad_norm": 0.44893544912338257, + "learning_rate": 0.0005703361344537815, + "loss": 0.3673, + "step": 15450 + }, + { + "epoch": 8.631843575418994, + "grad_norm": 1.057785153388977, + "learning_rate": 0.0005703081232492997, + "loss": 0.4845, + "step": 15451 + }, + { + "epoch": 8.63240223463687, + "grad_norm": 0.422775536775589, + "learning_rate": 0.0005702801120448179, + "loss": 0.4269, + "step": 15452 + }, + { + "epoch": 8.632960893854749, + "grad_norm": 0.4011528193950653, + "learning_rate": 0.0005702521008403362, + "loss": 0.4145, + "step": 15453 + }, + { + "epoch": 8.633519553072626, + "grad_norm": 1.4747127294540405, + "learning_rate": 0.0005702240896358543, + "loss": 0.5081, + "step": 15454 + }, + { + "epoch": 8.634078212290502, + "grad_norm": 0.5384272933006287, + "learning_rate": 0.0005701960784313725, + "loss": 0.4532, + "step": 15455 + }, + { + "epoch": 8.63463687150838, + "grad_norm": 0.4927268624305725, + "learning_rate": 0.0005701680672268907, + "loss": 0.6223, + "step": 15456 + }, + { + "epoch": 8.635195530726257, + "grad_norm": 0.8540382981300354, + "learning_rate": 0.000570140056022409, + "loss": 0.3853, + "step": 15457 + }, + { + "epoch": 8.635754189944134, + "grad_norm": 0.5726216435432434, + "learning_rate": 0.0005701120448179273, + "loss": 0.4688, + "step": 15458 + }, + { + "epoch": 8.63631284916201, + "grad_norm": 0.5496420860290527, + "learning_rate": 0.0005700840336134454, + "loss": 0.4294, + "step": 15459 + }, + { + "epoch": 8.636871508379889, + "grad_norm": 0.5806383490562439, + "learning_rate": 0.0005700560224089636, + "loss": 0.4136, + "step": 15460 + }, + { + "epoch": 8.637430167597765, + "grad_norm": 0.6385259628295898, + "learning_rate": 0.0005700280112044818, + "loss": 0.4054, + "step": 15461 + }, + { + "epoch": 8.637988826815642, + "grad_norm": 0.7559404969215393, + "learning_rate": 0.00057, + "loss": 0.5113, + "step": 15462 + }, + { + "epoch": 8.63854748603352, + "grad_norm": 0.4123435318470001, + "learning_rate": 0.0005699719887955183, + "loss": 0.3998, + "step": 15463 + }, + { + "epoch": 8.639106145251397, + "grad_norm": 0.47525525093078613, + "learning_rate": 0.0005699439775910365, + "loss": 0.477, + "step": 15464 + }, + { + "epoch": 8.639664804469273, + "grad_norm": 0.5204905867576599, + "learning_rate": 0.0005699159663865546, + "loss": 0.4073, + "step": 15465 + }, + { + "epoch": 8.640223463687152, + "grad_norm": 0.4411095678806305, + "learning_rate": 0.0005698879551820728, + "loss": 0.3632, + "step": 15466 + }, + { + "epoch": 8.640782122905028, + "grad_norm": 0.5747869610786438, + "learning_rate": 0.000569859943977591, + "loss": 0.3954, + "step": 15467 + }, + { + "epoch": 8.641340782122905, + "grad_norm": 0.5323585867881775, + "learning_rate": 0.0005698319327731093, + "loss": 0.3872, + "step": 15468 + }, + { + "epoch": 8.641899441340783, + "grad_norm": 0.46407270431518555, + "learning_rate": 0.0005698039215686275, + "loss": 0.3738, + "step": 15469 + }, + { + "epoch": 8.64245810055866, + "grad_norm": 0.45193350315093994, + "learning_rate": 0.0005697759103641456, + "loss": 0.4271, + "step": 15470 + }, + { + "epoch": 8.643016759776536, + "grad_norm": 0.526462197303772, + "learning_rate": 0.0005697478991596638, + "loss": 0.4023, + "step": 15471 + }, + { + "epoch": 8.643575418994413, + "grad_norm": 0.5079617500305176, + "learning_rate": 0.000569719887955182, + "loss": 0.346, + "step": 15472 + }, + { + "epoch": 8.644134078212291, + "grad_norm": 1.8635884523391724, + "learning_rate": 0.0005696918767507004, + "loss": 0.4141, + "step": 15473 + }, + { + "epoch": 8.644692737430168, + "grad_norm": 0.4533693790435791, + "learning_rate": 0.0005696638655462186, + "loss": 0.3465, + "step": 15474 + }, + { + "epoch": 8.645251396648044, + "grad_norm": 1.205929160118103, + "learning_rate": 0.0005696358543417367, + "loss": 0.5821, + "step": 15475 + }, + { + "epoch": 8.645810055865923, + "grad_norm": 0.6712439656257629, + "learning_rate": 0.0005696078431372549, + "loss": 0.4141, + "step": 15476 + }, + { + "epoch": 8.6463687150838, + "grad_norm": 0.4632731080055237, + "learning_rate": 0.0005695798319327731, + "loss": 0.4221, + "step": 15477 + }, + { + "epoch": 8.646927374301676, + "grad_norm": 0.6880497336387634, + "learning_rate": 0.0005695518207282914, + "loss": 0.4579, + "step": 15478 + }, + { + "epoch": 8.647486033519552, + "grad_norm": 1.4594767093658447, + "learning_rate": 0.0005695238095238096, + "loss": 0.4154, + "step": 15479 + }, + { + "epoch": 8.64804469273743, + "grad_norm": 0.5992676615715027, + "learning_rate": 0.0005694957983193278, + "loss": 0.4456, + "step": 15480 + }, + { + "epoch": 8.648603351955307, + "grad_norm": 0.331184446811676, + "learning_rate": 0.0005694677871148459, + "loss": 0.3202, + "step": 15481 + }, + { + "epoch": 8.649162011173184, + "grad_norm": 0.5219162702560425, + "learning_rate": 0.0005694397759103641, + "loss": 0.4742, + "step": 15482 + }, + { + "epoch": 8.649720670391062, + "grad_norm": 0.45459991693496704, + "learning_rate": 0.0005694117647058824, + "loss": 0.4612, + "step": 15483 + }, + { + "epoch": 8.650279329608939, + "grad_norm": 0.40991586446762085, + "learning_rate": 0.0005693837535014006, + "loss": 0.3984, + "step": 15484 + }, + { + "epoch": 8.650837988826815, + "grad_norm": 0.973129153251648, + "learning_rate": 0.0005693557422969188, + "loss": 0.7084, + "step": 15485 + }, + { + "epoch": 8.651396648044694, + "grad_norm": 0.4905858039855957, + "learning_rate": 0.0005693277310924369, + "loss": 0.4442, + "step": 15486 + }, + { + "epoch": 8.65195530726257, + "grad_norm": 0.5168609619140625, + "learning_rate": 0.0005692997198879551, + "loss": 0.4178, + "step": 15487 + }, + { + "epoch": 8.652513966480447, + "grad_norm": 0.45132094621658325, + "learning_rate": 0.0005692717086834734, + "loss": 0.3536, + "step": 15488 + }, + { + "epoch": 8.653072625698323, + "grad_norm": 0.48733070492744446, + "learning_rate": 0.0005692436974789916, + "loss": 0.3673, + "step": 15489 + }, + { + "epoch": 8.653631284916202, + "grad_norm": 0.5058335661888123, + "learning_rate": 0.0005692156862745099, + "loss": 0.5012, + "step": 15490 + }, + { + "epoch": 8.654189944134078, + "grad_norm": 0.6719403862953186, + "learning_rate": 0.000569187675070028, + "loss": 0.5382, + "step": 15491 + }, + { + "epoch": 8.654748603351955, + "grad_norm": 0.7544112801551819, + "learning_rate": 0.0005691596638655462, + "loss": 0.6047, + "step": 15492 + }, + { + "epoch": 8.655307262569833, + "grad_norm": 0.7219251990318298, + "learning_rate": 0.0005691316526610645, + "loss": 0.494, + "step": 15493 + }, + { + "epoch": 8.65586592178771, + "grad_norm": 0.8679959177970886, + "learning_rate": 0.0005691036414565827, + "loss": 0.5226, + "step": 15494 + }, + { + "epoch": 8.656424581005586, + "grad_norm": 0.5559894442558289, + "learning_rate": 0.0005690756302521009, + "loss": 0.4454, + "step": 15495 + }, + { + "epoch": 8.656983240223465, + "grad_norm": 0.4497896432876587, + "learning_rate": 0.0005690476190476191, + "loss": 0.4466, + "step": 15496 + }, + { + "epoch": 8.657541899441341, + "grad_norm": 0.46997177600860596, + "learning_rate": 0.0005690196078431372, + "loss": 0.4347, + "step": 15497 + }, + { + "epoch": 8.658100558659218, + "grad_norm": 0.6016250252723694, + "learning_rate": 0.0005689915966386555, + "loss": 0.4085, + "step": 15498 + }, + { + "epoch": 8.658659217877094, + "grad_norm": 0.8662602305412292, + "learning_rate": 0.0005689635854341737, + "loss": 0.3554, + "step": 15499 + }, + { + "epoch": 8.659217877094973, + "grad_norm": 0.5054382085800171, + "learning_rate": 0.0005689355742296919, + "loss": 0.4934, + "step": 15500 + }, + { + "epoch": 8.659217877094973, + "eval_cer": 0.09258834953574896, + "eval_loss": 0.33935561776161194, + "eval_runtime": 61.3442, + "eval_samples_per_second": 73.976, + "eval_steps_per_second": 4.63, + "eval_wer": 0.37049244434513, + "step": 15500 + }, + { + "epoch": 8.65977653631285, + "grad_norm": 0.5360488891601562, + "learning_rate": 0.0005689075630252101, + "loss": 0.4385, + "step": 15501 + }, + { + "epoch": 8.660335195530726, + "grad_norm": 0.8469676971435547, + "learning_rate": 0.0005688795518207282, + "loss": 0.6167, + "step": 15502 + }, + { + "epoch": 8.660893854748604, + "grad_norm": 0.471036434173584, + "learning_rate": 0.0005688515406162465, + "loss": 0.4706, + "step": 15503 + }, + { + "epoch": 8.66145251396648, + "grad_norm": 0.5919326543807983, + "learning_rate": 0.0005688235294117647, + "loss": 0.5583, + "step": 15504 + }, + { + "epoch": 8.662011173184357, + "grad_norm": 1.2199716567993164, + "learning_rate": 0.0005687955182072829, + "loss": 0.4125, + "step": 15505 + }, + { + "epoch": 8.662569832402234, + "grad_norm": 0.3077165484428406, + "learning_rate": 0.0005687675070028011, + "loss": 0.3382, + "step": 15506 + }, + { + "epoch": 8.663128491620112, + "grad_norm": 2.023994207382202, + "learning_rate": 0.0005687394957983192, + "loss": 0.3842, + "step": 15507 + }, + { + "epoch": 8.663687150837989, + "grad_norm": 0.7194250226020813, + "learning_rate": 0.0005687114845938376, + "loss": 0.4205, + "step": 15508 + }, + { + "epoch": 8.664245810055865, + "grad_norm": 0.5387637615203857, + "learning_rate": 0.0005686834733893558, + "loss": 0.3307, + "step": 15509 + }, + { + "epoch": 8.664804469273744, + "grad_norm": 0.398230642080307, + "learning_rate": 0.000568655462184874, + "loss": 0.4441, + "step": 15510 + }, + { + "epoch": 8.66536312849162, + "grad_norm": 1.3800987005233765, + "learning_rate": 0.0005686274509803922, + "loss": 0.492, + "step": 15511 + }, + { + "epoch": 8.665921787709497, + "grad_norm": 0.43140509724617004, + "learning_rate": 0.0005685994397759104, + "loss": 0.4437, + "step": 15512 + }, + { + "epoch": 8.666480446927375, + "grad_norm": 0.6863518357276917, + "learning_rate": 0.0005685714285714286, + "loss": 0.4814, + "step": 15513 + }, + { + "epoch": 8.667039106145252, + "grad_norm": 0.5488103628158569, + "learning_rate": 0.0005685434173669468, + "loss": 0.3479, + "step": 15514 + }, + { + "epoch": 8.667597765363128, + "grad_norm": 0.3921966850757599, + "learning_rate": 0.000568515406162465, + "loss": 0.4351, + "step": 15515 + }, + { + "epoch": 8.668156424581005, + "grad_norm": 0.5004563927650452, + "learning_rate": 0.0005684873949579832, + "loss": 0.411, + "step": 15516 + }, + { + "epoch": 8.668715083798883, + "grad_norm": 0.43397316336631775, + "learning_rate": 0.0005684593837535014, + "loss": 0.4898, + "step": 15517 + }, + { + "epoch": 8.66927374301676, + "grad_norm": 0.4364555776119232, + "learning_rate": 0.0005684313725490196, + "loss": 0.5329, + "step": 15518 + }, + { + "epoch": 8.669832402234636, + "grad_norm": 0.4831409454345703, + "learning_rate": 0.0005684033613445378, + "loss": 0.3991, + "step": 15519 + }, + { + "epoch": 8.670391061452515, + "grad_norm": 0.7217563986778259, + "learning_rate": 0.000568375350140056, + "loss": 0.4335, + "step": 15520 + }, + { + "epoch": 8.670949720670391, + "grad_norm": 0.412957102060318, + "learning_rate": 0.0005683473389355742, + "loss": 0.3641, + "step": 15521 + }, + { + "epoch": 8.671508379888268, + "grad_norm": 0.7792677879333496, + "learning_rate": 0.0005683193277310924, + "loss": 0.4475, + "step": 15522 + }, + { + "epoch": 8.672067039106146, + "grad_norm": 1.1043118238449097, + "learning_rate": 0.0005682913165266107, + "loss": 0.4727, + "step": 15523 + }, + { + "epoch": 8.672625698324023, + "grad_norm": 0.5187425017356873, + "learning_rate": 0.0005682633053221289, + "loss": 0.3608, + "step": 15524 + }, + { + "epoch": 8.6731843575419, + "grad_norm": 0.4371188282966614, + "learning_rate": 0.0005682352941176471, + "loss": 0.356, + "step": 15525 + }, + { + "epoch": 8.673743016759776, + "grad_norm": 0.9478253126144409, + "learning_rate": 0.0005682072829131653, + "loss": 0.4465, + "step": 15526 + }, + { + "epoch": 8.674301675977654, + "grad_norm": 0.9206333160400391, + "learning_rate": 0.0005681792717086835, + "loss": 0.4917, + "step": 15527 + }, + { + "epoch": 8.67486033519553, + "grad_norm": 1.079020380973816, + "learning_rate": 0.0005681512605042018, + "loss": 0.3557, + "step": 15528 + }, + { + "epoch": 8.675418994413407, + "grad_norm": 0.4506421983242035, + "learning_rate": 0.0005681232492997199, + "loss": 0.4713, + "step": 15529 + }, + { + "epoch": 8.675977653631286, + "grad_norm": 0.44211676716804504, + "learning_rate": 0.0005680952380952381, + "loss": 0.3573, + "step": 15530 + }, + { + "epoch": 8.676536312849162, + "grad_norm": 0.41927507519721985, + "learning_rate": 0.0005680672268907563, + "loss": 0.3704, + "step": 15531 + }, + { + "epoch": 8.677094972067039, + "grad_norm": 1.226069688796997, + "learning_rate": 0.0005680392156862745, + "loss": 0.7053, + "step": 15532 + }, + { + "epoch": 8.677653631284915, + "grad_norm": 1.1424435377120972, + "learning_rate": 0.0005680112044817928, + "loss": 0.335, + "step": 15533 + }, + { + "epoch": 8.678212290502794, + "grad_norm": 0.8386105298995972, + "learning_rate": 0.0005679831932773109, + "loss": 0.4867, + "step": 15534 + }, + { + "epoch": 8.67877094972067, + "grad_norm": 0.5779457092285156, + "learning_rate": 0.0005679551820728291, + "loss": 0.5181, + "step": 15535 + }, + { + "epoch": 8.679329608938547, + "grad_norm": 0.761139452457428, + "learning_rate": 0.0005679271708683473, + "loss": 0.4619, + "step": 15536 + }, + { + "epoch": 8.679888268156425, + "grad_norm": 0.5999415516853333, + "learning_rate": 0.0005678991596638655, + "loss": 0.505, + "step": 15537 + }, + { + "epoch": 8.680446927374302, + "grad_norm": 0.5625087022781372, + "learning_rate": 0.0005678711484593838, + "loss": 0.4821, + "step": 15538 + }, + { + "epoch": 8.681005586592178, + "grad_norm": 0.7208386659622192, + "learning_rate": 0.000567843137254902, + "loss": 0.5699, + "step": 15539 + }, + { + "epoch": 8.681564245810057, + "grad_norm": 0.5278449058532715, + "learning_rate": 0.0005678151260504202, + "loss": 0.4437, + "step": 15540 + }, + { + "epoch": 8.682122905027933, + "grad_norm": 0.5902572870254517, + "learning_rate": 0.0005677871148459384, + "loss": 0.4088, + "step": 15541 + }, + { + "epoch": 8.68268156424581, + "grad_norm": 0.7975791692733765, + "learning_rate": 0.0005677591036414566, + "loss": 0.5233, + "step": 15542 + }, + { + "epoch": 8.683240223463688, + "grad_norm": 0.44139623641967773, + "learning_rate": 0.0005677310924369749, + "loss": 0.4734, + "step": 15543 + }, + { + "epoch": 8.683798882681565, + "grad_norm": 2.1698899269104004, + "learning_rate": 0.0005677030812324931, + "loss": 0.4868, + "step": 15544 + }, + { + "epoch": 8.684357541899441, + "grad_norm": 0.4290623068809509, + "learning_rate": 0.0005676750700280112, + "loss": 0.3866, + "step": 15545 + }, + { + "epoch": 8.684916201117318, + "grad_norm": 0.5705886483192444, + "learning_rate": 0.0005676470588235294, + "loss": 0.5036, + "step": 15546 + }, + { + "epoch": 8.685474860335196, + "grad_norm": 0.4813299775123596, + "learning_rate": 0.0005676190476190476, + "loss": 0.3888, + "step": 15547 + }, + { + "epoch": 8.686033519553073, + "grad_norm": 1.1574420928955078, + "learning_rate": 0.0005675910364145659, + "loss": 0.4428, + "step": 15548 + }, + { + "epoch": 8.68659217877095, + "grad_norm": 0.3449275493621826, + "learning_rate": 0.0005675630252100841, + "loss": 0.3787, + "step": 15549 + }, + { + "epoch": 8.687150837988828, + "grad_norm": 0.6266234517097473, + "learning_rate": 0.0005675350140056022, + "loss": 0.7406, + "step": 15550 + }, + { + "epoch": 8.687709497206704, + "grad_norm": 0.7907277941703796, + "learning_rate": 0.0005675070028011204, + "loss": 0.4311, + "step": 15551 + }, + { + "epoch": 8.68826815642458, + "grad_norm": 2.236419677734375, + "learning_rate": 0.0005674789915966386, + "loss": 0.4123, + "step": 15552 + }, + { + "epoch": 8.688826815642457, + "grad_norm": 0.800818920135498, + "learning_rate": 0.0005674509803921569, + "loss": 0.6012, + "step": 15553 + }, + { + "epoch": 8.689385474860336, + "grad_norm": 0.6959667801856995, + "learning_rate": 0.0005674229691876751, + "loss": 0.4439, + "step": 15554 + }, + { + "epoch": 8.689944134078212, + "grad_norm": 0.6517641544342041, + "learning_rate": 0.0005673949579831932, + "loss": 0.4705, + "step": 15555 + }, + { + "epoch": 8.690502793296089, + "grad_norm": 0.4902878999710083, + "learning_rate": 0.0005673669467787114, + "loss": 0.45, + "step": 15556 + }, + { + "epoch": 8.691061452513967, + "grad_norm": 0.41238394379615784, + "learning_rate": 0.0005673389355742297, + "loss": 0.4023, + "step": 15557 + }, + { + "epoch": 8.691620111731844, + "grad_norm": 0.9365857839584351, + "learning_rate": 0.000567310924369748, + "loss": 0.46, + "step": 15558 + }, + { + "epoch": 8.69217877094972, + "grad_norm": 0.45045730471611023, + "learning_rate": 0.0005672829131652662, + "loss": 0.4269, + "step": 15559 + }, + { + "epoch": 8.692737430167599, + "grad_norm": 0.5339806079864502, + "learning_rate": 0.0005672549019607844, + "loss": 0.4217, + "step": 15560 + }, + { + "epoch": 8.693296089385475, + "grad_norm": 0.396564245223999, + "learning_rate": 0.0005672268907563025, + "loss": 0.3856, + "step": 15561 + }, + { + "epoch": 8.693854748603352, + "grad_norm": 0.6059952974319458, + "learning_rate": 0.0005671988795518207, + "loss": 0.6047, + "step": 15562 + }, + { + "epoch": 8.694413407821228, + "grad_norm": 2.8737268447875977, + "learning_rate": 0.000567170868347339, + "loss": 0.4055, + "step": 15563 + }, + { + "epoch": 8.694972067039107, + "grad_norm": 0.4673866033554077, + "learning_rate": 0.0005671428571428572, + "loss": 0.4483, + "step": 15564 + }, + { + "epoch": 8.695530726256983, + "grad_norm": 1.530030369758606, + "learning_rate": 0.0005671148459383754, + "loss": 0.4653, + "step": 15565 + }, + { + "epoch": 8.69608938547486, + "grad_norm": 0.5205464959144592, + "learning_rate": 0.0005670868347338935, + "loss": 0.4363, + "step": 15566 + }, + { + "epoch": 8.696648044692738, + "grad_norm": 0.5952099561691284, + "learning_rate": 0.0005670588235294117, + "loss": 0.4541, + "step": 15567 + }, + { + "epoch": 8.697206703910615, + "grad_norm": 0.3770199716091156, + "learning_rate": 0.00056703081232493, + "loss": 0.4282, + "step": 15568 + }, + { + "epoch": 8.697765363128491, + "grad_norm": 0.3640885055065155, + "learning_rate": 0.0005670028011204482, + "loss": 0.3999, + "step": 15569 + }, + { + "epoch": 8.69832402234637, + "grad_norm": 0.47385725378990173, + "learning_rate": 0.0005669747899159664, + "loss": 0.3928, + "step": 15570 + }, + { + "epoch": 8.698882681564246, + "grad_norm": 1.3914686441421509, + "learning_rate": 0.0005669467787114845, + "loss": 0.3759, + "step": 15571 + }, + { + "epoch": 8.699441340782123, + "grad_norm": 0.4102199077606201, + "learning_rate": 0.0005669187675070027, + "loss": 0.4028, + "step": 15572 + }, + { + "epoch": 8.7, + "grad_norm": 0.5234174728393555, + "learning_rate": 0.0005668907563025211, + "loss": 0.3744, + "step": 15573 + }, + { + "epoch": 8.700558659217878, + "grad_norm": 0.46223580837249756, + "learning_rate": 0.0005668627450980393, + "loss": 0.3884, + "step": 15574 + }, + { + "epoch": 8.701117318435754, + "grad_norm": 0.5250284075737, + "learning_rate": 0.0005668347338935575, + "loss": 0.4637, + "step": 15575 + }, + { + "epoch": 8.70167597765363, + "grad_norm": 0.8773692846298218, + "learning_rate": 0.0005668067226890757, + "loss": 0.4147, + "step": 15576 + }, + { + "epoch": 8.702234636871509, + "grad_norm": 0.5146695375442505, + "learning_rate": 0.0005667787114845938, + "loss": 0.3863, + "step": 15577 + }, + { + "epoch": 8.702793296089386, + "grad_norm": 0.48896458745002747, + "learning_rate": 0.0005667507002801121, + "loss": 0.4693, + "step": 15578 + }, + { + "epoch": 8.703351955307262, + "grad_norm": 0.49369266629219055, + "learning_rate": 0.0005667226890756303, + "loss": 0.5659, + "step": 15579 + }, + { + "epoch": 8.703910614525139, + "grad_norm": 0.43496614694595337, + "learning_rate": 0.0005666946778711485, + "loss": 0.4846, + "step": 15580 + }, + { + "epoch": 8.704469273743017, + "grad_norm": 3.0558488368988037, + "learning_rate": 0.0005666666666666667, + "loss": 0.4418, + "step": 15581 + }, + { + "epoch": 8.705027932960894, + "grad_norm": 0.5131064057350159, + "learning_rate": 0.0005666386554621848, + "loss": 0.4421, + "step": 15582 + }, + { + "epoch": 8.70558659217877, + "grad_norm": 0.4947444498538971, + "learning_rate": 0.0005666106442577031, + "loss": 0.5146, + "step": 15583 + }, + { + "epoch": 8.706145251396649, + "grad_norm": 0.7857218980789185, + "learning_rate": 0.0005665826330532213, + "loss": 0.4214, + "step": 15584 + }, + { + "epoch": 8.706703910614525, + "grad_norm": 0.4524097740650177, + "learning_rate": 0.0005665546218487395, + "loss": 0.3314, + "step": 15585 + }, + { + "epoch": 8.707262569832402, + "grad_norm": 0.5022789835929871, + "learning_rate": 0.0005665266106442577, + "loss": 0.3777, + "step": 15586 + }, + { + "epoch": 8.70782122905028, + "grad_norm": 0.7888333201408386, + "learning_rate": 0.0005664985994397758, + "loss": 0.486, + "step": 15587 + }, + { + "epoch": 8.708379888268157, + "grad_norm": 0.5010085105895996, + "learning_rate": 0.0005664705882352941, + "loss": 0.5468, + "step": 15588 + }, + { + "epoch": 8.708938547486033, + "grad_norm": 0.5213218927383423, + "learning_rate": 0.0005664425770308124, + "loss": 0.4705, + "step": 15589 + }, + { + "epoch": 8.70949720670391, + "grad_norm": 1.07518470287323, + "learning_rate": 0.0005664145658263306, + "loss": 0.6021, + "step": 15590 + }, + { + "epoch": 8.710055865921788, + "grad_norm": 0.5122137665748596, + "learning_rate": 0.0005663865546218488, + "loss": 0.351, + "step": 15591 + }, + { + "epoch": 8.710614525139665, + "grad_norm": 0.4663032591342926, + "learning_rate": 0.000566358543417367, + "loss": 0.4456, + "step": 15592 + }, + { + "epoch": 8.711173184357541, + "grad_norm": 0.7306776642799377, + "learning_rate": 0.0005663305322128852, + "loss": 0.4339, + "step": 15593 + }, + { + "epoch": 8.71173184357542, + "grad_norm": 0.7633882164955139, + "learning_rate": 0.0005663025210084034, + "loss": 0.6889, + "step": 15594 + }, + { + "epoch": 8.712290502793296, + "grad_norm": 1.062889575958252, + "learning_rate": 0.0005662745098039216, + "loss": 0.5849, + "step": 15595 + }, + { + "epoch": 8.712849162011173, + "grad_norm": 0.5082654356956482, + "learning_rate": 0.0005662464985994398, + "loss": 0.328, + "step": 15596 + }, + { + "epoch": 8.713407821229051, + "grad_norm": 0.4522295892238617, + "learning_rate": 0.000566218487394958, + "loss": 0.3886, + "step": 15597 + }, + { + "epoch": 8.713966480446928, + "grad_norm": 0.6530178785324097, + "learning_rate": 0.0005661904761904762, + "loss": 0.388, + "step": 15598 + }, + { + "epoch": 8.714525139664804, + "grad_norm": 0.5231906771659851, + "learning_rate": 0.0005661624649859944, + "loss": 0.4088, + "step": 15599 + }, + { + "epoch": 8.71508379888268, + "grad_norm": 0.7256081104278564, + "learning_rate": 0.0005661344537815126, + "loss": 0.4477, + "step": 15600 + }, + { + "epoch": 8.71564245810056, + "grad_norm": 0.5418559908866882, + "learning_rate": 0.0005661064425770308, + "loss": 0.4031, + "step": 15601 + }, + { + "epoch": 8.716201117318436, + "grad_norm": 9.977767944335938, + "learning_rate": 0.000566078431372549, + "loss": 0.4733, + "step": 15602 + }, + { + "epoch": 8.716759776536312, + "grad_norm": 0.5321292877197266, + "learning_rate": 0.0005660504201680672, + "loss": 0.4721, + "step": 15603 + }, + { + "epoch": 8.71731843575419, + "grad_norm": 1.3056150674819946, + "learning_rate": 0.0005660224089635854, + "loss": 0.5374, + "step": 15604 + }, + { + "epoch": 8.717877094972067, + "grad_norm": 0.4756324589252472, + "learning_rate": 0.0005659943977591037, + "loss": 0.4491, + "step": 15605 + }, + { + "epoch": 8.718435754189944, + "grad_norm": 0.5027273893356323, + "learning_rate": 0.0005659663865546219, + "loss": 0.4188, + "step": 15606 + }, + { + "epoch": 8.71899441340782, + "grad_norm": 0.7712967395782471, + "learning_rate": 0.0005659383753501401, + "loss": 0.4498, + "step": 15607 + }, + { + "epoch": 8.719553072625699, + "grad_norm": 0.8776791095733643, + "learning_rate": 0.0005659103641456584, + "loss": 0.6183, + "step": 15608 + }, + { + "epoch": 8.720111731843575, + "grad_norm": 0.38432690501213074, + "learning_rate": 0.0005658823529411765, + "loss": 0.3343, + "step": 15609 + }, + { + "epoch": 8.720670391061452, + "grad_norm": 0.531984806060791, + "learning_rate": 0.0005658543417366947, + "loss": 0.5629, + "step": 15610 + }, + { + "epoch": 8.72122905027933, + "grad_norm": 0.40593528747558594, + "learning_rate": 0.0005658263305322129, + "loss": 0.4715, + "step": 15611 + }, + { + "epoch": 8.721787709497207, + "grad_norm": 0.41519421339035034, + "learning_rate": 0.0005657983193277311, + "loss": 0.4198, + "step": 15612 + }, + { + "epoch": 8.722346368715083, + "grad_norm": 0.8204532265663147, + "learning_rate": 0.0005657703081232494, + "loss": 0.3999, + "step": 15613 + }, + { + "epoch": 8.722905027932962, + "grad_norm": 0.49945172667503357, + "learning_rate": 0.0005657422969187675, + "loss": 0.3515, + "step": 15614 + }, + { + "epoch": 8.723463687150838, + "grad_norm": 0.6600934863090515, + "learning_rate": 0.0005657142857142857, + "loss": 0.4702, + "step": 15615 + }, + { + "epoch": 8.724022346368715, + "grad_norm": 0.47808966040611267, + "learning_rate": 0.0005656862745098039, + "loss": 0.479, + "step": 15616 + }, + { + "epoch": 8.724581005586593, + "grad_norm": 0.6460592746734619, + "learning_rate": 0.0005656582633053221, + "loss": 0.4807, + "step": 15617 + }, + { + "epoch": 8.72513966480447, + "grad_norm": 0.48469334840774536, + "learning_rate": 0.0005656302521008404, + "loss": 0.4792, + "step": 15618 + }, + { + "epoch": 8.725698324022346, + "grad_norm": 0.5133102536201477, + "learning_rate": 0.0005656022408963585, + "loss": 0.3524, + "step": 15619 + }, + { + "epoch": 8.726256983240223, + "grad_norm": 2.818528890609741, + "learning_rate": 0.0005655742296918767, + "loss": 0.5097, + "step": 15620 + }, + { + "epoch": 8.726815642458101, + "grad_norm": 0.5690695643424988, + "learning_rate": 0.000565546218487395, + "loss": 0.5503, + "step": 15621 + }, + { + "epoch": 8.727374301675978, + "grad_norm": 0.5174113512039185, + "learning_rate": 0.0005655182072829132, + "loss": 0.5199, + "step": 15622 + }, + { + "epoch": 8.727932960893854, + "grad_norm": 0.5243664979934692, + "learning_rate": 0.0005654901960784314, + "loss": 0.4061, + "step": 15623 + }, + { + "epoch": 8.728491620111733, + "grad_norm": 0.5984207987785339, + "learning_rate": 0.0005654621848739497, + "loss": 0.4391, + "step": 15624 + }, + { + "epoch": 8.72905027932961, + "grad_norm": 0.485016405582428, + "learning_rate": 0.0005654341736694678, + "loss": 0.4019, + "step": 15625 + }, + { + "epoch": 8.729608938547486, + "grad_norm": 0.4126144051551819, + "learning_rate": 0.000565406162464986, + "loss": 0.4607, + "step": 15626 + }, + { + "epoch": 8.730167597765362, + "grad_norm": 0.45564934611320496, + "learning_rate": 0.0005653781512605042, + "loss": 0.3749, + "step": 15627 + }, + { + "epoch": 8.73072625698324, + "grad_norm": 0.3843534290790558, + "learning_rate": 0.0005653501400560224, + "loss": 0.4009, + "step": 15628 + }, + { + "epoch": 8.731284916201117, + "grad_norm": 0.6612676382064819, + "learning_rate": 0.0005653221288515407, + "loss": 0.3881, + "step": 15629 + }, + { + "epoch": 8.731843575418994, + "grad_norm": 0.7044295072555542, + "learning_rate": 0.0005652941176470588, + "loss": 0.3608, + "step": 15630 + }, + { + "epoch": 8.732402234636872, + "grad_norm": 0.4451577961444855, + "learning_rate": 0.000565266106442577, + "loss": 0.3588, + "step": 15631 + }, + { + "epoch": 8.732960893854749, + "grad_norm": 0.49735331535339355, + "learning_rate": 0.0005652380952380952, + "loss": 0.367, + "step": 15632 + }, + { + "epoch": 8.733519553072625, + "grad_norm": 2.1209583282470703, + "learning_rate": 0.0005652100840336134, + "loss": 0.3505, + "step": 15633 + }, + { + "epoch": 8.734078212290502, + "grad_norm": 0.9366016983985901, + "learning_rate": 0.0005651820728291317, + "loss": 0.4919, + "step": 15634 + }, + { + "epoch": 8.73463687150838, + "grad_norm": 0.49355000257492065, + "learning_rate": 0.0005651540616246498, + "loss": 0.4626, + "step": 15635 + }, + { + "epoch": 8.735195530726257, + "grad_norm": 0.3209354877471924, + "learning_rate": 0.000565126050420168, + "loss": 0.3048, + "step": 15636 + }, + { + "epoch": 8.735754189944133, + "grad_norm": 0.5550408363342285, + "learning_rate": 0.0005650980392156862, + "loss": 0.453, + "step": 15637 + }, + { + "epoch": 8.736312849162012, + "grad_norm": 0.5163437724113464, + "learning_rate": 0.0005650700280112044, + "loss": 0.4577, + "step": 15638 + }, + { + "epoch": 8.736871508379888, + "grad_norm": 0.43063703179359436, + "learning_rate": 0.0005650420168067228, + "loss": 0.364, + "step": 15639 + }, + { + "epoch": 8.737430167597765, + "grad_norm": 0.40728363394737244, + "learning_rate": 0.000565014005602241, + "loss": 0.362, + "step": 15640 + }, + { + "epoch": 8.737988826815643, + "grad_norm": 0.6699554324150085, + "learning_rate": 0.0005649859943977591, + "loss": 0.4406, + "step": 15641 + }, + { + "epoch": 8.73854748603352, + "grad_norm": 0.4548247158527374, + "learning_rate": 0.0005649579831932773, + "loss": 0.3391, + "step": 15642 + }, + { + "epoch": 8.739106145251396, + "grad_norm": 0.7346000075340271, + "learning_rate": 0.0005649299719887955, + "loss": 0.4069, + "step": 15643 + }, + { + "epoch": 8.739664804469275, + "grad_norm": 0.42775166034698486, + "learning_rate": 0.0005649019607843138, + "loss": 0.3878, + "step": 15644 + }, + { + "epoch": 8.740223463687151, + "grad_norm": 0.3822396993637085, + "learning_rate": 0.000564873949579832, + "loss": 0.3349, + "step": 15645 + }, + { + "epoch": 8.740782122905028, + "grad_norm": 0.5837403535842896, + "learning_rate": 0.0005648459383753501, + "loss": 0.4553, + "step": 15646 + }, + { + "epoch": 8.741340782122904, + "grad_norm": 0.36033493280410767, + "learning_rate": 0.0005648179271708683, + "loss": 0.3393, + "step": 15647 + }, + { + "epoch": 8.741899441340783, + "grad_norm": 0.4193015694618225, + "learning_rate": 0.0005647899159663865, + "loss": 0.4682, + "step": 15648 + }, + { + "epoch": 8.74245810055866, + "grad_norm": 0.500306248664856, + "learning_rate": 0.0005647619047619048, + "loss": 0.4939, + "step": 15649 + }, + { + "epoch": 8.743016759776536, + "grad_norm": 0.4655872583389282, + "learning_rate": 0.000564733893557423, + "loss": 0.3617, + "step": 15650 + }, + { + "epoch": 8.743575418994414, + "grad_norm": 8.034947395324707, + "learning_rate": 0.0005647058823529411, + "loss": 0.4282, + "step": 15651 + }, + { + "epoch": 8.74413407821229, + "grad_norm": 0.4258803427219391, + "learning_rate": 0.0005646778711484593, + "loss": 0.4751, + "step": 15652 + }, + { + "epoch": 8.744692737430167, + "grad_norm": 0.4732818603515625, + "learning_rate": 0.0005646498599439775, + "loss": 0.4379, + "step": 15653 + }, + { + "epoch": 8.745251396648044, + "grad_norm": 0.4753307104110718, + "learning_rate": 0.0005646218487394959, + "loss": 0.4345, + "step": 15654 + }, + { + "epoch": 8.745810055865922, + "grad_norm": 1.1873834133148193, + "learning_rate": 0.0005645938375350141, + "loss": 0.3901, + "step": 15655 + }, + { + "epoch": 8.746368715083799, + "grad_norm": 0.5162121653556824, + "learning_rate": 0.0005645658263305323, + "loss": 0.4958, + "step": 15656 + }, + { + "epoch": 8.746927374301675, + "grad_norm": 0.4946577250957489, + "learning_rate": 0.0005645378151260504, + "loss": 0.4076, + "step": 15657 + }, + { + "epoch": 8.747486033519554, + "grad_norm": 1.1167137622833252, + "learning_rate": 0.0005645098039215686, + "loss": 0.27, + "step": 15658 + }, + { + "epoch": 8.74804469273743, + "grad_norm": 1.0856842994689941, + "learning_rate": 0.0005644817927170869, + "loss": 0.5634, + "step": 15659 + }, + { + "epoch": 8.748603351955307, + "grad_norm": 1.3085237741470337, + "learning_rate": 0.0005644537815126051, + "loss": 0.5078, + "step": 15660 + }, + { + "epoch": 8.749162011173185, + "grad_norm": 0.3998778760433197, + "learning_rate": 0.0005644257703081233, + "loss": 0.3229, + "step": 15661 + }, + { + "epoch": 8.749720670391062, + "grad_norm": 0.9483353495597839, + "learning_rate": 0.0005643977591036414, + "loss": 0.3889, + "step": 15662 + }, + { + "epoch": 8.750279329608938, + "grad_norm": 0.38763555884361267, + "learning_rate": 0.0005643697478991596, + "loss": 0.4316, + "step": 15663 + }, + { + "epoch": 8.750837988826815, + "grad_norm": 0.5255081653594971, + "learning_rate": 0.0005643417366946779, + "loss": 0.4302, + "step": 15664 + }, + { + "epoch": 8.751396648044693, + "grad_norm": 0.6534271836280823, + "learning_rate": 0.0005643137254901961, + "loss": 0.398, + "step": 15665 + }, + { + "epoch": 8.75195530726257, + "grad_norm": 2.007716178894043, + "learning_rate": 0.0005642857142857143, + "loss": 0.4102, + "step": 15666 + }, + { + "epoch": 8.752513966480446, + "grad_norm": 0.42920124530792236, + "learning_rate": 0.0005642577030812324, + "loss": 0.5197, + "step": 15667 + }, + { + "epoch": 8.753072625698325, + "grad_norm": 0.7165144085884094, + "learning_rate": 0.0005642296918767506, + "loss": 0.4966, + "step": 15668 + }, + { + "epoch": 8.753631284916201, + "grad_norm": 0.5029805302619934, + "learning_rate": 0.0005642016806722689, + "loss": 0.4287, + "step": 15669 + }, + { + "epoch": 8.754189944134078, + "grad_norm": 0.5944901704788208, + "learning_rate": 0.0005641736694677871, + "loss": 0.5286, + "step": 15670 + }, + { + "epoch": 8.754748603351956, + "grad_norm": 0.5133156180381775, + "learning_rate": 0.0005641456582633054, + "loss": 0.2861, + "step": 15671 + }, + { + "epoch": 8.755307262569833, + "grad_norm": 0.3726537823677063, + "learning_rate": 0.0005641176470588236, + "loss": 0.4109, + "step": 15672 + }, + { + "epoch": 8.75586592178771, + "grad_norm": 0.5534281134605408, + "learning_rate": 0.0005640896358543417, + "loss": 0.4931, + "step": 15673 + }, + { + "epoch": 8.756424581005586, + "grad_norm": 0.5244345664978027, + "learning_rate": 0.00056406162464986, + "loss": 0.4225, + "step": 15674 + }, + { + "epoch": 8.756983240223464, + "grad_norm": 0.6160147190093994, + "learning_rate": 0.0005640336134453782, + "loss": 0.4799, + "step": 15675 + }, + { + "epoch": 8.75754189944134, + "grad_norm": 0.7252782583236694, + "learning_rate": 0.0005640056022408964, + "loss": 0.4367, + "step": 15676 + }, + { + "epoch": 8.758100558659217, + "grad_norm": 1.1900861263275146, + "learning_rate": 0.0005639775910364146, + "loss": 0.3935, + "step": 15677 + }, + { + "epoch": 8.758659217877096, + "grad_norm": 0.6065216064453125, + "learning_rate": 0.0005639495798319327, + "loss": 0.4505, + "step": 15678 + }, + { + "epoch": 8.759217877094972, + "grad_norm": 0.5070031881332397, + "learning_rate": 0.000563921568627451, + "loss": 0.479, + "step": 15679 + }, + { + "epoch": 8.759776536312849, + "grad_norm": 0.3886505365371704, + "learning_rate": 0.0005638935574229692, + "loss": 0.3378, + "step": 15680 + }, + { + "epoch": 8.760335195530725, + "grad_norm": 0.4132860600948334, + "learning_rate": 0.0005638655462184874, + "loss": 0.3514, + "step": 15681 + }, + { + "epoch": 8.760893854748604, + "grad_norm": 0.47031691670417786, + "learning_rate": 0.0005638375350140056, + "loss": 0.5206, + "step": 15682 + }, + { + "epoch": 8.76145251396648, + "grad_norm": 0.49106690287590027, + "learning_rate": 0.0005638095238095237, + "loss": 0.4429, + "step": 15683 + }, + { + "epoch": 8.762011173184357, + "grad_norm": 0.45995762944221497, + "learning_rate": 0.000563781512605042, + "loss": 0.3899, + "step": 15684 + }, + { + "epoch": 8.762569832402235, + "grad_norm": 0.35264644026756287, + "learning_rate": 0.0005637535014005602, + "loss": 0.3835, + "step": 15685 + }, + { + "epoch": 8.763128491620112, + "grad_norm": 0.42212116718292236, + "learning_rate": 0.0005637254901960784, + "loss": 0.4782, + "step": 15686 + }, + { + "epoch": 8.763687150837988, + "grad_norm": 0.7341591715812683, + "learning_rate": 0.0005636974789915967, + "loss": 0.4344, + "step": 15687 + }, + { + "epoch": 8.764245810055867, + "grad_norm": 0.578353762626648, + "learning_rate": 0.0005636694677871149, + "loss": 0.375, + "step": 15688 + }, + { + "epoch": 8.764804469273743, + "grad_norm": 11.390461921691895, + "learning_rate": 0.0005636414565826331, + "loss": 0.3373, + "step": 15689 + }, + { + "epoch": 8.76536312849162, + "grad_norm": 0.43578940629959106, + "learning_rate": 0.0005636134453781513, + "loss": 0.363, + "step": 15690 + }, + { + "epoch": 8.765921787709498, + "grad_norm": 0.6495600342750549, + "learning_rate": 0.0005635854341736695, + "loss": 0.3482, + "step": 15691 + }, + { + "epoch": 8.766480446927375, + "grad_norm": 0.4802863597869873, + "learning_rate": 0.0005635574229691877, + "loss": 0.5019, + "step": 15692 + }, + { + "epoch": 8.767039106145251, + "grad_norm": 0.49207985401153564, + "learning_rate": 0.0005635294117647059, + "loss": 0.4726, + "step": 15693 + }, + { + "epoch": 8.767597765363128, + "grad_norm": 0.8635436296463013, + "learning_rate": 0.0005635014005602241, + "loss": 0.5205, + "step": 15694 + }, + { + "epoch": 8.768156424581006, + "grad_norm": 0.3986515998840332, + "learning_rate": 0.0005634733893557423, + "loss": 0.3936, + "step": 15695 + }, + { + "epoch": 8.768715083798883, + "grad_norm": 0.46163687109947205, + "learning_rate": 0.0005634453781512605, + "loss": 0.4748, + "step": 15696 + }, + { + "epoch": 8.76927374301676, + "grad_norm": 1.6048675775527954, + "learning_rate": 0.0005634173669467787, + "loss": 0.5088, + "step": 15697 + }, + { + "epoch": 8.769832402234638, + "grad_norm": 2.0931811332702637, + "learning_rate": 0.0005633893557422969, + "loss": 0.3983, + "step": 15698 + }, + { + "epoch": 8.770391061452514, + "grad_norm": 0.5724558234214783, + "learning_rate": 0.0005633613445378152, + "loss": 0.3966, + "step": 15699 + }, + { + "epoch": 8.77094972067039, + "grad_norm": 0.5514177680015564, + "learning_rate": 0.0005633333333333333, + "loss": 0.4012, + "step": 15700 + }, + { + "epoch": 8.771508379888267, + "grad_norm": 0.438016802072525, + "learning_rate": 0.0005633053221288515, + "loss": 0.3459, + "step": 15701 + }, + { + "epoch": 8.772067039106146, + "grad_norm": 0.4400658905506134, + "learning_rate": 0.0005632773109243697, + "loss": 0.4276, + "step": 15702 + }, + { + "epoch": 8.772625698324022, + "grad_norm": 1.2535483837127686, + "learning_rate": 0.000563249299719888, + "loss": 0.5748, + "step": 15703 + }, + { + "epoch": 8.773184357541899, + "grad_norm": 1.3620244264602661, + "learning_rate": 0.0005632212885154063, + "loss": 0.4012, + "step": 15704 + }, + { + "epoch": 8.773743016759777, + "grad_norm": 0.4720567762851715, + "learning_rate": 0.0005631932773109244, + "loss": 0.3815, + "step": 15705 + }, + { + "epoch": 8.774301675977654, + "grad_norm": 0.36377981305122375, + "learning_rate": 0.0005631652661064426, + "loss": 0.4601, + "step": 15706 + }, + { + "epoch": 8.77486033519553, + "grad_norm": 1.499476671218872, + "learning_rate": 0.0005631372549019608, + "loss": 0.5913, + "step": 15707 + }, + { + "epoch": 8.775418994413407, + "grad_norm": 0.5073271989822388, + "learning_rate": 0.000563109243697479, + "loss": 0.4163, + "step": 15708 + }, + { + "epoch": 8.775977653631285, + "grad_norm": 0.5782145261764526, + "learning_rate": 0.0005630812324929973, + "loss": 0.4112, + "step": 15709 + }, + { + "epoch": 8.776536312849162, + "grad_norm": 0.566599428653717, + "learning_rate": 0.0005630532212885154, + "loss": 0.4603, + "step": 15710 + }, + { + "epoch": 8.777094972067038, + "grad_norm": 0.5575801134109497, + "learning_rate": 0.0005630252100840336, + "loss": 0.5307, + "step": 15711 + }, + { + "epoch": 8.777653631284917, + "grad_norm": 0.5349463224411011, + "learning_rate": 0.0005629971988795518, + "loss": 0.4837, + "step": 15712 + }, + { + "epoch": 8.778212290502793, + "grad_norm": 0.4377054274082184, + "learning_rate": 0.00056296918767507, + "loss": 0.3102, + "step": 15713 + }, + { + "epoch": 8.77877094972067, + "grad_norm": 0.345467209815979, + "learning_rate": 0.0005629411764705883, + "loss": 0.377, + "step": 15714 + }, + { + "epoch": 8.779329608938548, + "grad_norm": 0.9769173860549927, + "learning_rate": 0.0005629131652661065, + "loss": 0.4129, + "step": 15715 + }, + { + "epoch": 8.779888268156425, + "grad_norm": 0.761565625667572, + "learning_rate": 0.0005628851540616246, + "loss": 0.4586, + "step": 15716 + }, + { + "epoch": 8.780446927374301, + "grad_norm": 1.0414402484893799, + "learning_rate": 0.0005628571428571428, + "loss": 0.4703, + "step": 15717 + }, + { + "epoch": 8.78100558659218, + "grad_norm": 0.7781525254249573, + "learning_rate": 0.000562829131652661, + "loss": 0.4029, + "step": 15718 + }, + { + "epoch": 8.781564245810056, + "grad_norm": 0.46379581093788147, + "learning_rate": 0.0005628011204481794, + "loss": 0.4168, + "step": 15719 + }, + { + "epoch": 8.782122905027933, + "grad_norm": 0.30571869015693665, + "learning_rate": 0.0005627731092436976, + "loss": 0.2938, + "step": 15720 + }, + { + "epoch": 8.78268156424581, + "grad_norm": 0.5634928941726685, + "learning_rate": 0.0005627450980392157, + "loss": 0.4369, + "step": 15721 + }, + { + "epoch": 8.783240223463688, + "grad_norm": 0.71206134557724, + "learning_rate": 0.0005627170868347339, + "loss": 0.4624, + "step": 15722 + }, + { + "epoch": 8.783798882681564, + "grad_norm": 1.0042665004730225, + "learning_rate": 0.0005626890756302521, + "loss": 0.4571, + "step": 15723 + }, + { + "epoch": 8.78435754189944, + "grad_norm": 0.669148862361908, + "learning_rate": 0.0005626610644257704, + "loss": 0.5413, + "step": 15724 + }, + { + "epoch": 8.78491620111732, + "grad_norm": 0.4178120791912079, + "learning_rate": 0.0005626330532212886, + "loss": 0.4269, + "step": 15725 + }, + { + "epoch": 8.785474860335196, + "grad_norm": 0.4230409264564514, + "learning_rate": 0.0005626050420168067, + "loss": 0.3929, + "step": 15726 + }, + { + "epoch": 8.786033519553072, + "grad_norm": 0.5099184513092041, + "learning_rate": 0.0005625770308123249, + "loss": 0.504, + "step": 15727 + }, + { + "epoch": 8.786592178770949, + "grad_norm": 1.1595661640167236, + "learning_rate": 0.0005625490196078431, + "loss": 0.3423, + "step": 15728 + }, + { + "epoch": 8.787150837988827, + "grad_norm": 0.39395782351493835, + "learning_rate": 0.0005625210084033614, + "loss": 0.4581, + "step": 15729 + }, + { + "epoch": 8.787709497206704, + "grad_norm": 0.5747889280319214, + "learning_rate": 0.0005624929971988796, + "loss": 0.412, + "step": 15730 + }, + { + "epoch": 8.78826815642458, + "grad_norm": 0.42089641094207764, + "learning_rate": 0.0005624649859943978, + "loss": 0.4908, + "step": 15731 + }, + { + "epoch": 8.788826815642459, + "grad_norm": 0.4230941832065582, + "learning_rate": 0.0005624369747899159, + "loss": 0.4879, + "step": 15732 + }, + { + "epoch": 8.789385474860335, + "grad_norm": 0.6767685413360596, + "learning_rate": 0.0005624089635854341, + "loss": 0.4485, + "step": 15733 + }, + { + "epoch": 8.789944134078212, + "grad_norm": 3.443275213241577, + "learning_rate": 0.0005623809523809524, + "loss": 0.6271, + "step": 15734 + }, + { + "epoch": 8.79050279329609, + "grad_norm": 0.4694518744945526, + "learning_rate": 0.0005623529411764706, + "loss": 0.4284, + "step": 15735 + }, + { + "epoch": 8.791061452513967, + "grad_norm": 0.5341271162033081, + "learning_rate": 0.0005623249299719889, + "loss": 0.4376, + "step": 15736 + }, + { + "epoch": 8.791620111731843, + "grad_norm": 0.5891209244728088, + "learning_rate": 0.000562296918767507, + "loss": 0.3885, + "step": 15737 + }, + { + "epoch": 8.79217877094972, + "grad_norm": 0.4981018602848053, + "learning_rate": 0.0005622689075630252, + "loss": 0.5263, + "step": 15738 + }, + { + "epoch": 8.792737430167598, + "grad_norm": 0.5746358633041382, + "learning_rate": 0.0005622408963585435, + "loss": 0.4476, + "step": 15739 + }, + { + "epoch": 8.793296089385475, + "grad_norm": 1.7005538940429688, + "learning_rate": 0.0005622128851540617, + "loss": 0.6176, + "step": 15740 + }, + { + "epoch": 8.793854748603351, + "grad_norm": 0.5340922474861145, + "learning_rate": 0.0005621848739495799, + "loss": 0.3977, + "step": 15741 + }, + { + "epoch": 8.79441340782123, + "grad_norm": 0.5265353918075562, + "learning_rate": 0.000562156862745098, + "loss": 0.4059, + "step": 15742 + }, + { + "epoch": 8.794972067039106, + "grad_norm": 0.396685928106308, + "learning_rate": 0.0005621288515406162, + "loss": 0.2848, + "step": 15743 + }, + { + "epoch": 8.795530726256983, + "grad_norm": 2.166918992996216, + "learning_rate": 0.0005621008403361345, + "loss": 0.3943, + "step": 15744 + }, + { + "epoch": 8.796089385474861, + "grad_norm": 1.0985817909240723, + "learning_rate": 0.0005620728291316527, + "loss": 0.5232, + "step": 15745 + }, + { + "epoch": 8.796648044692738, + "grad_norm": 1.067770004272461, + "learning_rate": 0.0005620448179271709, + "loss": 0.5097, + "step": 15746 + }, + { + "epoch": 8.797206703910614, + "grad_norm": 0.7104836106300354, + "learning_rate": 0.0005620168067226891, + "loss": 0.4293, + "step": 15747 + }, + { + "epoch": 8.797765363128491, + "grad_norm": 0.34466931223869324, + "learning_rate": 0.0005619887955182072, + "loss": 0.32, + "step": 15748 + }, + { + "epoch": 8.79832402234637, + "grad_norm": 0.5265138149261475, + "learning_rate": 0.0005619607843137255, + "loss": 0.593, + "step": 15749 + }, + { + "epoch": 8.798882681564246, + "grad_norm": 1.250335931777954, + "learning_rate": 0.0005619327731092437, + "loss": 0.4092, + "step": 15750 + }, + { + "epoch": 8.799441340782122, + "grad_norm": 0.3780997097492218, + "learning_rate": 0.0005619047619047619, + "loss": 0.3767, + "step": 15751 + }, + { + "epoch": 8.8, + "grad_norm": 0.4873446226119995, + "learning_rate": 0.0005618767507002801, + "loss": 0.5189, + "step": 15752 + }, + { + "epoch": 8.800558659217877, + "grad_norm": 0.4541783332824707, + "learning_rate": 0.0005618487394957982, + "loss": 0.396, + "step": 15753 + }, + { + "epoch": 8.801117318435754, + "grad_norm": 0.7754843235015869, + "learning_rate": 0.0005618207282913166, + "loss": 0.4513, + "step": 15754 + }, + { + "epoch": 8.80167597765363, + "grad_norm": 2.0724496841430664, + "learning_rate": 0.0005617927170868348, + "loss": 0.4561, + "step": 15755 + }, + { + "epoch": 8.802234636871509, + "grad_norm": 0.7389378547668457, + "learning_rate": 0.000561764705882353, + "loss": 0.3614, + "step": 15756 + }, + { + "epoch": 8.802793296089385, + "grad_norm": 0.42319151759147644, + "learning_rate": 0.0005617366946778712, + "loss": 0.3707, + "step": 15757 + }, + { + "epoch": 8.803351955307262, + "grad_norm": 0.43397441506385803, + "learning_rate": 0.0005617086834733893, + "loss": 0.4021, + "step": 15758 + }, + { + "epoch": 8.80391061452514, + "grad_norm": 5.564876556396484, + "learning_rate": 0.0005616806722689076, + "loss": 0.3989, + "step": 15759 + }, + { + "epoch": 8.804469273743017, + "grad_norm": 0.6502397060394287, + "learning_rate": 0.0005616526610644258, + "loss": 0.4283, + "step": 15760 + }, + { + "epoch": 8.805027932960893, + "grad_norm": 0.8007463216781616, + "learning_rate": 0.000561624649859944, + "loss": 0.4429, + "step": 15761 + }, + { + "epoch": 8.805586592178772, + "grad_norm": 0.8617485761642456, + "learning_rate": 0.0005615966386554622, + "loss": 0.5963, + "step": 15762 + }, + { + "epoch": 8.806145251396648, + "grad_norm": 0.5161985158920288, + "learning_rate": 0.0005615686274509804, + "loss": 0.401, + "step": 15763 + }, + { + "epoch": 8.806703910614525, + "grad_norm": 0.5018290281295776, + "learning_rate": 0.0005615406162464986, + "loss": 0.4164, + "step": 15764 + }, + { + "epoch": 8.807262569832401, + "grad_norm": 0.5203245878219604, + "learning_rate": 0.0005615126050420168, + "loss": 0.4833, + "step": 15765 + }, + { + "epoch": 8.80782122905028, + "grad_norm": 0.7248677015304565, + "learning_rate": 0.000561484593837535, + "loss": 0.4828, + "step": 15766 + }, + { + "epoch": 8.808379888268156, + "grad_norm": 0.5944013595581055, + "learning_rate": 0.0005614565826330532, + "loss": 0.3959, + "step": 15767 + }, + { + "epoch": 8.808938547486033, + "grad_norm": 0.5159067511558533, + "learning_rate": 0.0005614285714285714, + "loss": 0.3531, + "step": 15768 + }, + { + "epoch": 8.809497206703911, + "grad_norm": 0.4834436774253845, + "learning_rate": 0.0005614005602240897, + "loss": 0.506, + "step": 15769 + }, + { + "epoch": 8.810055865921788, + "grad_norm": 0.5468249917030334, + "learning_rate": 0.0005613725490196079, + "loss": 0.4982, + "step": 15770 + }, + { + "epoch": 8.810614525139664, + "grad_norm": 1.7091220617294312, + "learning_rate": 0.0005613445378151261, + "loss": 0.4169, + "step": 15771 + }, + { + "epoch": 8.811173184357543, + "grad_norm": 0.613426923751831, + "learning_rate": 0.0005613165266106443, + "loss": 0.773, + "step": 15772 + }, + { + "epoch": 8.81173184357542, + "grad_norm": 0.6755386590957642, + "learning_rate": 0.0005612885154061625, + "loss": 0.3302, + "step": 15773 + }, + { + "epoch": 8.812290502793296, + "grad_norm": 0.6490817070007324, + "learning_rate": 0.0005612605042016807, + "loss": 0.4201, + "step": 15774 + }, + { + "epoch": 8.812849162011172, + "grad_norm": 0.8108155131340027, + "learning_rate": 0.0005612324929971989, + "loss": 0.4452, + "step": 15775 + }, + { + "epoch": 8.81340782122905, + "grad_norm": 0.4282110929489136, + "learning_rate": 0.0005612044817927171, + "loss": 0.3913, + "step": 15776 + }, + { + "epoch": 8.813966480446927, + "grad_norm": 0.38874006271362305, + "learning_rate": 0.0005611764705882353, + "loss": 0.3941, + "step": 15777 + }, + { + "epoch": 8.814525139664804, + "grad_norm": 0.5276618003845215, + "learning_rate": 0.0005611484593837535, + "loss": 0.5142, + "step": 15778 + }, + { + "epoch": 8.815083798882682, + "grad_norm": 0.504787802696228, + "learning_rate": 0.0005611204481792718, + "loss": 0.4565, + "step": 15779 + }, + { + "epoch": 8.815642458100559, + "grad_norm": 0.494769424200058, + "learning_rate": 0.0005610924369747899, + "loss": 0.4245, + "step": 15780 + }, + { + "epoch": 8.816201117318435, + "grad_norm": 0.3847169280052185, + "learning_rate": 0.0005610644257703081, + "loss": 0.4498, + "step": 15781 + }, + { + "epoch": 8.816759776536312, + "grad_norm": 0.6524850130081177, + "learning_rate": 0.0005610364145658263, + "loss": 0.4731, + "step": 15782 + }, + { + "epoch": 8.81731843575419, + "grad_norm": 0.6199373602867126, + "learning_rate": 0.0005610084033613445, + "loss": 0.4352, + "step": 15783 + }, + { + "epoch": 8.817877094972067, + "grad_norm": 0.8549118638038635, + "learning_rate": 0.0005609803921568628, + "loss": 0.4191, + "step": 15784 + }, + { + "epoch": 8.818435754189943, + "grad_norm": 0.969581127166748, + "learning_rate": 0.000560952380952381, + "loss": 0.5507, + "step": 15785 + }, + { + "epoch": 8.818994413407822, + "grad_norm": 0.43300533294677734, + "learning_rate": 0.0005609243697478992, + "loss": 0.5037, + "step": 15786 + }, + { + "epoch": 8.819553072625698, + "grad_norm": 0.648641049861908, + "learning_rate": 0.0005608963585434174, + "loss": 0.3187, + "step": 15787 + }, + { + "epoch": 8.820111731843575, + "grad_norm": 0.5054218769073486, + "learning_rate": 0.0005608683473389356, + "loss": 0.4771, + "step": 15788 + }, + { + "epoch": 8.820670391061453, + "grad_norm": 0.4416903555393219, + "learning_rate": 0.0005608403361344539, + "loss": 0.4213, + "step": 15789 + }, + { + "epoch": 8.82122905027933, + "grad_norm": 0.6103943586349487, + "learning_rate": 0.000560812324929972, + "loss": 0.4273, + "step": 15790 + }, + { + "epoch": 8.821787709497206, + "grad_norm": 0.5159399509429932, + "learning_rate": 0.0005607843137254902, + "loss": 0.4427, + "step": 15791 + }, + { + "epoch": 8.822346368715085, + "grad_norm": 0.5488128662109375, + "learning_rate": 0.0005607563025210084, + "loss": 0.4177, + "step": 15792 + }, + { + "epoch": 8.822905027932961, + "grad_norm": 0.6271675229072571, + "learning_rate": 0.0005607282913165266, + "loss": 0.4656, + "step": 15793 + }, + { + "epoch": 8.823463687150838, + "grad_norm": 1.1658530235290527, + "learning_rate": 0.0005607002801120449, + "loss": 0.405, + "step": 15794 + }, + { + "epoch": 8.824022346368714, + "grad_norm": 0.7646031975746155, + "learning_rate": 0.0005606722689075631, + "loss": 0.4228, + "step": 15795 + }, + { + "epoch": 8.824581005586593, + "grad_norm": 0.5794070363044739, + "learning_rate": 0.0005606442577030812, + "loss": 0.4064, + "step": 15796 + }, + { + "epoch": 8.82513966480447, + "grad_norm": 0.4996819794178009, + "learning_rate": 0.0005606162464985994, + "loss": 0.4272, + "step": 15797 + }, + { + "epoch": 8.825698324022346, + "grad_norm": 0.37560370564460754, + "learning_rate": 0.0005605882352941176, + "loss": 0.3945, + "step": 15798 + }, + { + "epoch": 8.826256983240224, + "grad_norm": 0.4004479646682739, + "learning_rate": 0.0005605602240896359, + "loss": 0.457, + "step": 15799 + }, + { + "epoch": 8.8268156424581, + "grad_norm": 0.5167516469955444, + "learning_rate": 0.0005605322128851541, + "loss": 0.4259, + "step": 15800 + }, + { + "epoch": 8.827374301675977, + "grad_norm": 0.47834140062332153, + "learning_rate": 0.0005605042016806722, + "loss": 0.3343, + "step": 15801 + }, + { + "epoch": 8.827932960893854, + "grad_norm": 0.5612804293632507, + "learning_rate": 0.0005604761904761904, + "loss": 0.344, + "step": 15802 + }, + { + "epoch": 8.828491620111732, + "grad_norm": 0.5487278699874878, + "learning_rate": 0.0005604481792717087, + "loss": 0.3641, + "step": 15803 + }, + { + "epoch": 8.829050279329609, + "grad_norm": 2.5715460777282715, + "learning_rate": 0.000560420168067227, + "loss": 0.3943, + "step": 15804 + }, + { + "epoch": 8.829608938547485, + "grad_norm": 0.8560015559196472, + "learning_rate": 0.0005603921568627452, + "loss": 0.5879, + "step": 15805 + }, + { + "epoch": 8.830167597765364, + "grad_norm": 0.5358844995498657, + "learning_rate": 0.0005603641456582633, + "loss": 0.4238, + "step": 15806 + }, + { + "epoch": 8.83072625698324, + "grad_norm": 0.6962670683860779, + "learning_rate": 0.0005603361344537815, + "loss": 0.5367, + "step": 15807 + }, + { + "epoch": 8.831284916201117, + "grad_norm": 1.5312117338180542, + "learning_rate": 0.0005603081232492997, + "loss": 0.5755, + "step": 15808 + }, + { + "epoch": 8.831843575418995, + "grad_norm": 2.177212715148926, + "learning_rate": 0.000560280112044818, + "loss": 0.3926, + "step": 15809 + }, + { + "epoch": 8.832402234636872, + "grad_norm": 2.0129446983337402, + "learning_rate": 0.0005602521008403362, + "loss": 0.5054, + "step": 15810 + }, + { + "epoch": 8.832960893854748, + "grad_norm": 0.6918999552726746, + "learning_rate": 0.0005602240896358544, + "loss": 0.4311, + "step": 15811 + }, + { + "epoch": 8.833519553072625, + "grad_norm": 0.4057055413722992, + "learning_rate": 0.0005601960784313725, + "loss": 0.3703, + "step": 15812 + }, + { + "epoch": 8.834078212290503, + "grad_norm": 0.5047594904899597, + "learning_rate": 0.0005601680672268907, + "loss": 0.4717, + "step": 15813 + }, + { + "epoch": 8.83463687150838, + "grad_norm": 1.7702102661132812, + "learning_rate": 0.000560140056022409, + "loss": 0.4751, + "step": 15814 + }, + { + "epoch": 8.835195530726256, + "grad_norm": 0.8252912163734436, + "learning_rate": 0.0005601120448179272, + "loss": 0.5057, + "step": 15815 + }, + { + "epoch": 8.835754189944135, + "grad_norm": 0.5314022302627563, + "learning_rate": 0.0005600840336134454, + "loss": 0.4176, + "step": 15816 + }, + { + "epoch": 8.836312849162011, + "grad_norm": 0.44567450881004333, + "learning_rate": 0.0005600560224089635, + "loss": 0.4846, + "step": 15817 + }, + { + "epoch": 8.836871508379888, + "grad_norm": 0.5232447981834412, + "learning_rate": 0.0005600280112044817, + "loss": 0.4326, + "step": 15818 + }, + { + "epoch": 8.837430167597766, + "grad_norm": 0.46586373448371887, + "learning_rate": 0.0005600000000000001, + "loss": 0.4706, + "step": 15819 + }, + { + "epoch": 8.837988826815643, + "grad_norm": 1.3477208614349365, + "learning_rate": 0.0005599719887955183, + "loss": 0.4725, + "step": 15820 + }, + { + "epoch": 8.83854748603352, + "grad_norm": 0.49078822135925293, + "learning_rate": 0.0005599439775910365, + "loss": 0.4124, + "step": 15821 + }, + { + "epoch": 8.839106145251396, + "grad_norm": 0.8453941345214844, + "learning_rate": 0.0005599159663865546, + "loss": 0.5441, + "step": 15822 + }, + { + "epoch": 8.839664804469274, + "grad_norm": 0.5877315998077393, + "learning_rate": 0.0005598879551820728, + "loss": 0.4284, + "step": 15823 + }, + { + "epoch": 8.84022346368715, + "grad_norm": 1.5403003692626953, + "learning_rate": 0.0005598599439775911, + "loss": 0.36, + "step": 15824 + }, + { + "epoch": 8.840782122905027, + "grad_norm": 0.3417825996875763, + "learning_rate": 0.0005598319327731093, + "loss": 0.3587, + "step": 15825 + }, + { + "epoch": 8.841340782122906, + "grad_norm": 1.1989957094192505, + "learning_rate": 0.0005598039215686275, + "loss": 0.4314, + "step": 15826 + }, + { + "epoch": 8.841899441340782, + "grad_norm": 0.8693377375602722, + "learning_rate": 0.0005597759103641457, + "loss": 0.5083, + "step": 15827 + }, + { + "epoch": 8.842458100558659, + "grad_norm": 0.7427593469619751, + "learning_rate": 0.0005597478991596638, + "loss": 0.4641, + "step": 15828 + }, + { + "epoch": 8.843016759776535, + "grad_norm": 0.5602688789367676, + "learning_rate": 0.0005597198879551821, + "loss": 0.4509, + "step": 15829 + }, + { + "epoch": 8.843575418994414, + "grad_norm": 0.9375371336936951, + "learning_rate": 0.0005596918767507003, + "loss": 0.4398, + "step": 15830 + }, + { + "epoch": 8.84413407821229, + "grad_norm": 0.3733755052089691, + "learning_rate": 0.0005596638655462185, + "loss": 0.4562, + "step": 15831 + }, + { + "epoch": 8.844692737430167, + "grad_norm": 0.8695487380027771, + "learning_rate": 0.0005596358543417367, + "loss": 0.4735, + "step": 15832 + }, + { + "epoch": 8.845251396648045, + "grad_norm": 6.599333763122559, + "learning_rate": 0.0005596078431372548, + "loss": 0.4304, + "step": 15833 + }, + { + "epoch": 8.845810055865922, + "grad_norm": 0.41975608468055725, + "learning_rate": 0.0005595798319327731, + "loss": 0.4463, + "step": 15834 + }, + { + "epoch": 8.846368715083798, + "grad_norm": 11.498644828796387, + "learning_rate": 0.0005595518207282914, + "loss": 0.4016, + "step": 15835 + }, + { + "epoch": 8.846927374301677, + "grad_norm": 0.5879505276679993, + "learning_rate": 0.0005595238095238096, + "loss": 0.372, + "step": 15836 + }, + { + "epoch": 8.847486033519553, + "grad_norm": 0.5154557824134827, + "learning_rate": 0.0005594957983193278, + "loss": 0.5299, + "step": 15837 + }, + { + "epoch": 8.84804469273743, + "grad_norm": 0.48858073353767395, + "learning_rate": 0.0005594677871148459, + "loss": 0.4914, + "step": 15838 + }, + { + "epoch": 8.848603351955306, + "grad_norm": 0.39561858773231506, + "learning_rate": 0.0005594397759103642, + "loss": 0.3889, + "step": 15839 + }, + { + "epoch": 8.849162011173185, + "grad_norm": 0.8169729113578796, + "learning_rate": 0.0005594117647058824, + "loss": 0.3588, + "step": 15840 + }, + { + "epoch": 8.849720670391061, + "grad_norm": 0.6235904097557068, + "learning_rate": 0.0005593837535014006, + "loss": 0.4483, + "step": 15841 + }, + { + "epoch": 8.850279329608938, + "grad_norm": 1.2435609102249146, + "learning_rate": 0.0005593557422969188, + "loss": 0.4193, + "step": 15842 + }, + { + "epoch": 8.850837988826816, + "grad_norm": 0.6560278534889221, + "learning_rate": 0.000559327731092437, + "loss": 0.3958, + "step": 15843 + }, + { + "epoch": 8.851396648044693, + "grad_norm": 0.5255511403083801, + "learning_rate": 0.0005592997198879551, + "loss": 0.3439, + "step": 15844 + }, + { + "epoch": 8.85195530726257, + "grad_norm": 0.4293096363544464, + "learning_rate": 0.0005592717086834734, + "loss": 0.4303, + "step": 15845 + }, + { + "epoch": 8.852513966480448, + "grad_norm": 0.5821239352226257, + "learning_rate": 0.0005592436974789916, + "loss": 0.5027, + "step": 15846 + }, + { + "epoch": 8.853072625698324, + "grad_norm": 0.5697384476661682, + "learning_rate": 0.0005592156862745098, + "loss": 0.5261, + "step": 15847 + }, + { + "epoch": 8.8536312849162, + "grad_norm": 0.44587865471839905, + "learning_rate": 0.000559187675070028, + "loss": 0.325, + "step": 15848 + }, + { + "epoch": 8.854189944134077, + "grad_norm": 3.178269624710083, + "learning_rate": 0.0005591596638655461, + "loss": 0.4744, + "step": 15849 + }, + { + "epoch": 8.854748603351956, + "grad_norm": 0.4200572669506073, + "learning_rate": 0.0005591316526610644, + "loss": 0.4273, + "step": 15850 + }, + { + "epoch": 8.855307262569832, + "grad_norm": 0.4692087769508362, + "learning_rate": 0.0005591036414565827, + "loss": 0.3946, + "step": 15851 + }, + { + "epoch": 8.855865921787709, + "grad_norm": 1.0718377828598022, + "learning_rate": 0.0005590756302521009, + "loss": 0.4253, + "step": 15852 + }, + { + "epoch": 8.856424581005587, + "grad_norm": 0.47882017493247986, + "learning_rate": 0.0005590476190476191, + "loss": 0.4089, + "step": 15853 + }, + { + "epoch": 8.856983240223464, + "grad_norm": 0.4071679711341858, + "learning_rate": 0.0005590196078431372, + "loss": 0.4351, + "step": 15854 + }, + { + "epoch": 8.85754189944134, + "grad_norm": 0.37174373865127563, + "learning_rate": 0.0005589915966386555, + "loss": 0.395, + "step": 15855 + }, + { + "epoch": 8.858100558659217, + "grad_norm": 0.5214393138885498, + "learning_rate": 0.0005589635854341737, + "loss": 0.491, + "step": 15856 + }, + { + "epoch": 8.858659217877095, + "grad_norm": 1.4114837646484375, + "learning_rate": 0.0005589355742296919, + "loss": 0.5245, + "step": 15857 + }, + { + "epoch": 8.859217877094972, + "grad_norm": 0.5743383169174194, + "learning_rate": 0.0005589075630252101, + "loss": 0.4338, + "step": 15858 + }, + { + "epoch": 8.859776536312848, + "grad_norm": 0.5833740830421448, + "learning_rate": 0.0005588795518207283, + "loss": 0.4578, + "step": 15859 + }, + { + "epoch": 8.860335195530727, + "grad_norm": 0.47941040992736816, + "learning_rate": 0.0005588515406162465, + "loss": 0.4404, + "step": 15860 + }, + { + "epoch": 8.860893854748603, + "grad_norm": 0.46471959352493286, + "learning_rate": 0.0005588235294117647, + "loss": 0.3843, + "step": 15861 + }, + { + "epoch": 8.86145251396648, + "grad_norm": 0.7174744009971619, + "learning_rate": 0.0005587955182072829, + "loss": 0.4565, + "step": 15862 + }, + { + "epoch": 8.862011173184358, + "grad_norm": 0.5444117784500122, + "learning_rate": 0.0005587675070028011, + "loss": 0.4772, + "step": 15863 + }, + { + "epoch": 8.862569832402235, + "grad_norm": 1.0654371976852417, + "learning_rate": 0.0005587394957983193, + "loss": 0.4805, + "step": 15864 + }, + { + "epoch": 8.863128491620111, + "grad_norm": 1.2941851615905762, + "learning_rate": 0.0005587114845938375, + "loss": 0.4403, + "step": 15865 + }, + { + "epoch": 8.86368715083799, + "grad_norm": 0.5275970697402954, + "learning_rate": 0.0005586834733893557, + "loss": 0.4753, + "step": 15866 + }, + { + "epoch": 8.864245810055866, + "grad_norm": 0.4790375530719757, + "learning_rate": 0.000558655462184874, + "loss": 0.3831, + "step": 15867 + }, + { + "epoch": 8.864804469273743, + "grad_norm": 0.368158221244812, + "learning_rate": 0.0005586274509803922, + "loss": 0.3218, + "step": 15868 + }, + { + "epoch": 8.86536312849162, + "grad_norm": 0.4192827343940735, + "learning_rate": 0.0005585994397759104, + "loss": 0.5125, + "step": 15869 + }, + { + "epoch": 8.865921787709498, + "grad_norm": 0.7649282217025757, + "learning_rate": 0.0005585714285714286, + "loss": 0.4307, + "step": 15870 + }, + { + "epoch": 8.866480446927374, + "grad_norm": 0.6270117163658142, + "learning_rate": 0.0005585434173669468, + "loss": 0.4363, + "step": 15871 + }, + { + "epoch": 8.867039106145251, + "grad_norm": 0.6097122430801392, + "learning_rate": 0.000558515406162465, + "loss": 0.418, + "step": 15872 + }, + { + "epoch": 8.86759776536313, + "grad_norm": 0.4520244896411896, + "learning_rate": 0.0005584873949579832, + "loss": 0.4579, + "step": 15873 + }, + { + "epoch": 8.868156424581006, + "grad_norm": 3.2079672813415527, + "learning_rate": 0.0005584593837535014, + "loss": 0.6236, + "step": 15874 + }, + { + "epoch": 8.868715083798882, + "grad_norm": 0.6120218634605408, + "learning_rate": 0.0005584313725490197, + "loss": 0.4458, + "step": 15875 + }, + { + "epoch": 8.869273743016759, + "grad_norm": 1.532673954963684, + "learning_rate": 0.0005584033613445378, + "loss": 0.3609, + "step": 15876 + }, + { + "epoch": 8.869832402234637, + "grad_norm": 0.43884918093681335, + "learning_rate": 0.000558375350140056, + "loss": 0.4103, + "step": 15877 + }, + { + "epoch": 8.870391061452514, + "grad_norm": 0.4125737249851227, + "learning_rate": 0.0005583473389355742, + "loss": 0.374, + "step": 15878 + }, + { + "epoch": 8.87094972067039, + "grad_norm": 0.4179338216781616, + "learning_rate": 0.0005583193277310924, + "loss": 0.3871, + "step": 15879 + }, + { + "epoch": 8.871508379888269, + "grad_norm": 0.4581180512905121, + "learning_rate": 0.0005582913165266107, + "loss": 0.4421, + "step": 15880 + }, + { + "epoch": 8.872067039106145, + "grad_norm": 0.5099257230758667, + "learning_rate": 0.0005582633053221288, + "loss": 0.3815, + "step": 15881 + }, + { + "epoch": 8.872625698324022, + "grad_norm": 0.5308847427368164, + "learning_rate": 0.000558235294117647, + "loss": 0.3722, + "step": 15882 + }, + { + "epoch": 8.8731843575419, + "grad_norm": 0.48744913935661316, + "learning_rate": 0.0005582072829131652, + "loss": 0.3683, + "step": 15883 + }, + { + "epoch": 8.873743016759777, + "grad_norm": 1.6459118127822876, + "learning_rate": 0.0005581792717086834, + "loss": 0.4929, + "step": 15884 + }, + { + "epoch": 8.874301675977653, + "grad_norm": 1.429837942123413, + "learning_rate": 0.0005581512605042018, + "loss": 0.4272, + "step": 15885 + }, + { + "epoch": 8.87486033519553, + "grad_norm": 0.599993109703064, + "learning_rate": 0.0005581232492997199, + "loss": 0.5076, + "step": 15886 + }, + { + "epoch": 8.875418994413408, + "grad_norm": 0.9193503856658936, + "learning_rate": 0.0005580952380952381, + "loss": 0.6935, + "step": 15887 + }, + { + "epoch": 8.875977653631285, + "grad_norm": 0.4025314748287201, + "learning_rate": 0.0005580672268907563, + "loss": 0.3597, + "step": 15888 + }, + { + "epoch": 8.876536312849161, + "grad_norm": 0.8811167478561401, + "learning_rate": 0.0005580392156862745, + "loss": 0.4429, + "step": 15889 + }, + { + "epoch": 8.87709497206704, + "grad_norm": 0.6658065319061279, + "learning_rate": 0.0005580112044817928, + "loss": 0.4109, + "step": 15890 + }, + { + "epoch": 8.877653631284916, + "grad_norm": 1.5478134155273438, + "learning_rate": 0.000557983193277311, + "loss": 0.505, + "step": 15891 + }, + { + "epoch": 8.878212290502793, + "grad_norm": 0.36211755871772766, + "learning_rate": 0.0005579551820728291, + "loss": 0.4024, + "step": 15892 + }, + { + "epoch": 8.878770949720671, + "grad_norm": 0.9082402586936951, + "learning_rate": 0.0005579271708683473, + "loss": 0.3936, + "step": 15893 + }, + { + "epoch": 8.879329608938548, + "grad_norm": 0.8617127537727356, + "learning_rate": 0.0005578991596638655, + "loss": 0.4812, + "step": 15894 + }, + { + "epoch": 8.879888268156424, + "grad_norm": 1.3491594791412354, + "learning_rate": 0.0005578711484593838, + "loss": 0.4301, + "step": 15895 + }, + { + "epoch": 8.880446927374301, + "grad_norm": 0.7563647031784058, + "learning_rate": 0.000557843137254902, + "loss": 0.458, + "step": 15896 + }, + { + "epoch": 8.88100558659218, + "grad_norm": 18.068227767944336, + "learning_rate": 0.0005578151260504201, + "loss": 0.5537, + "step": 15897 + }, + { + "epoch": 8.881564245810056, + "grad_norm": 1.4140278100967407, + "learning_rate": 0.0005577871148459383, + "loss": 0.5839, + "step": 15898 + }, + { + "epoch": 8.882122905027932, + "grad_norm": 1.5591644048690796, + "learning_rate": 0.0005577591036414565, + "loss": 0.457, + "step": 15899 + }, + { + "epoch": 8.88268156424581, + "grad_norm": 0.5397035479545593, + "learning_rate": 0.0005577310924369749, + "loss": 0.3799, + "step": 15900 + }, + { + "epoch": 8.883240223463687, + "grad_norm": 4.0931243896484375, + "learning_rate": 0.0005577030812324931, + "loss": 0.7184, + "step": 15901 + }, + { + "epoch": 8.883798882681564, + "grad_norm": 0.6044573783874512, + "learning_rate": 0.0005576750700280112, + "loss": 0.4119, + "step": 15902 + }, + { + "epoch": 8.88435754189944, + "grad_norm": 1.6572458744049072, + "learning_rate": 0.0005576470588235294, + "loss": 0.3577, + "step": 15903 + }, + { + "epoch": 8.884916201117319, + "grad_norm": 0.4851267337799072, + "learning_rate": 0.0005576190476190476, + "loss": 0.3608, + "step": 15904 + }, + { + "epoch": 8.885474860335195, + "grad_norm": 0.48618993163108826, + "learning_rate": 0.0005575910364145659, + "loss": 0.4901, + "step": 15905 + }, + { + "epoch": 8.886033519553072, + "grad_norm": 0.5448348522186279, + "learning_rate": 0.0005575630252100841, + "loss": 0.4887, + "step": 15906 + }, + { + "epoch": 8.88659217877095, + "grad_norm": 3.0821704864501953, + "learning_rate": 0.0005575350140056023, + "loss": 0.4261, + "step": 15907 + }, + { + "epoch": 8.887150837988827, + "grad_norm": 1.0967351198196411, + "learning_rate": 0.0005575070028011204, + "loss": 0.3791, + "step": 15908 + }, + { + "epoch": 8.887709497206703, + "grad_norm": 0.4589420557022095, + "learning_rate": 0.0005574789915966386, + "loss": 0.5087, + "step": 15909 + }, + { + "epoch": 8.888268156424582, + "grad_norm": 0.559450089931488, + "learning_rate": 0.0005574509803921569, + "loss": 0.3874, + "step": 15910 + }, + { + "epoch": 8.888826815642458, + "grad_norm": 0.5953260660171509, + "learning_rate": 0.0005574229691876751, + "loss": 0.4488, + "step": 15911 + }, + { + "epoch": 8.889385474860335, + "grad_norm": 0.7041288614273071, + "learning_rate": 0.0005573949579831933, + "loss": 0.4065, + "step": 15912 + }, + { + "epoch": 8.889944134078211, + "grad_norm": 0.45519065856933594, + "learning_rate": 0.0005573669467787114, + "loss": 0.4226, + "step": 15913 + }, + { + "epoch": 8.89050279329609, + "grad_norm": 0.41252848505973816, + "learning_rate": 0.0005573389355742296, + "loss": 0.3932, + "step": 15914 + }, + { + "epoch": 8.891061452513966, + "grad_norm": 0.6071236729621887, + "learning_rate": 0.0005573109243697479, + "loss": 0.4147, + "step": 15915 + }, + { + "epoch": 8.891620111731843, + "grad_norm": 0.4969694912433624, + "learning_rate": 0.0005572829131652661, + "loss": 0.399, + "step": 15916 + }, + { + "epoch": 8.892178770949721, + "grad_norm": 0.8268481492996216, + "learning_rate": 0.0005572549019607844, + "loss": 0.5349, + "step": 15917 + }, + { + "epoch": 8.892737430167598, + "grad_norm": 0.5528133511543274, + "learning_rate": 0.0005572268907563025, + "loss": 0.3975, + "step": 15918 + }, + { + "epoch": 8.893296089385474, + "grad_norm": 0.5288232564926147, + "learning_rate": 0.0005571988795518207, + "loss": 0.4877, + "step": 15919 + }, + { + "epoch": 8.893854748603353, + "grad_norm": 0.6598970293998718, + "learning_rate": 0.000557170868347339, + "loss": 0.3626, + "step": 15920 + }, + { + "epoch": 8.89441340782123, + "grad_norm": 0.36831241846084595, + "learning_rate": 0.0005571428571428572, + "loss": 0.3136, + "step": 15921 + }, + { + "epoch": 8.894972067039106, + "grad_norm": 1.1029791831970215, + "learning_rate": 0.0005571148459383754, + "loss": 0.5374, + "step": 15922 + }, + { + "epoch": 8.895530726256982, + "grad_norm": 0.5720617771148682, + "learning_rate": 0.0005570868347338936, + "loss": 0.4872, + "step": 15923 + }, + { + "epoch": 8.89608938547486, + "grad_norm": 0.397910475730896, + "learning_rate": 0.0005570588235294117, + "loss": 0.3855, + "step": 15924 + }, + { + "epoch": 8.896648044692737, + "grad_norm": 0.981838583946228, + "learning_rate": 0.00055703081232493, + "loss": 0.4459, + "step": 15925 + }, + { + "epoch": 8.897206703910614, + "grad_norm": 0.5285311937332153, + "learning_rate": 0.0005570028011204482, + "loss": 0.3435, + "step": 15926 + }, + { + "epoch": 8.897765363128492, + "grad_norm": 0.6074533462524414, + "learning_rate": 0.0005569747899159664, + "loss": 0.4337, + "step": 15927 + }, + { + "epoch": 8.898324022346369, + "grad_norm": 0.7320986390113831, + "learning_rate": 0.0005569467787114846, + "loss": 0.5181, + "step": 15928 + }, + { + "epoch": 8.898882681564245, + "grad_norm": 0.6360154151916504, + "learning_rate": 0.0005569187675070027, + "loss": 0.6402, + "step": 15929 + }, + { + "epoch": 8.899441340782122, + "grad_norm": 0.6049297451972961, + "learning_rate": 0.000556890756302521, + "loss": 0.4989, + "step": 15930 + }, + { + "epoch": 8.9, + "grad_norm": 1.0205636024475098, + "learning_rate": 0.0005568627450980392, + "loss": 0.5307, + "step": 15931 + }, + { + "epoch": 8.900558659217877, + "grad_norm": 0.656028151512146, + "learning_rate": 0.0005568347338935574, + "loss": 0.4034, + "step": 15932 + }, + { + "epoch": 8.901117318435753, + "grad_norm": 0.5328769683837891, + "learning_rate": 0.0005568067226890757, + "loss": 0.5032, + "step": 15933 + }, + { + "epoch": 8.901675977653632, + "grad_norm": 0.5702680349349976, + "learning_rate": 0.0005567787114845937, + "loss": 0.4325, + "step": 15934 + }, + { + "epoch": 8.902234636871508, + "grad_norm": 0.48292964696884155, + "learning_rate": 0.0005567507002801121, + "loss": 0.4008, + "step": 15935 + }, + { + "epoch": 8.902793296089385, + "grad_norm": 0.6621236205101013, + "learning_rate": 0.0005567226890756303, + "loss": 0.502, + "step": 15936 + }, + { + "epoch": 8.903351955307263, + "grad_norm": 0.43324798345565796, + "learning_rate": 0.0005566946778711485, + "loss": 0.4301, + "step": 15937 + }, + { + "epoch": 8.90391061452514, + "grad_norm": 7.006535530090332, + "learning_rate": 0.0005566666666666667, + "loss": 0.4692, + "step": 15938 + }, + { + "epoch": 8.904469273743016, + "grad_norm": 0.6407557129859924, + "learning_rate": 0.0005566386554621849, + "loss": 0.6233, + "step": 15939 + }, + { + "epoch": 8.905027932960895, + "grad_norm": 0.6427462100982666, + "learning_rate": 0.0005566106442577031, + "loss": 0.3878, + "step": 15940 + }, + { + "epoch": 8.905586592178771, + "grad_norm": 1.8678460121154785, + "learning_rate": 0.0005565826330532213, + "loss": 0.4084, + "step": 15941 + }, + { + "epoch": 8.906145251396648, + "grad_norm": 0.40178200602531433, + "learning_rate": 0.0005565546218487395, + "loss": 0.3604, + "step": 15942 + }, + { + "epoch": 8.906703910614524, + "grad_norm": 0.4935017228126526, + "learning_rate": 0.0005565266106442577, + "loss": 0.4652, + "step": 15943 + }, + { + "epoch": 8.907262569832403, + "grad_norm": 1.5065382719039917, + "learning_rate": 0.0005564985994397759, + "loss": 0.5144, + "step": 15944 + }, + { + "epoch": 8.90782122905028, + "grad_norm": 0.5022974610328674, + "learning_rate": 0.0005564705882352941, + "loss": 0.5253, + "step": 15945 + }, + { + "epoch": 8.908379888268156, + "grad_norm": 0.5157644748687744, + "learning_rate": 0.0005564425770308123, + "loss": 0.5352, + "step": 15946 + }, + { + "epoch": 8.908938547486034, + "grad_norm": 1.0089850425720215, + "learning_rate": 0.0005564145658263305, + "loss": 0.5403, + "step": 15947 + }, + { + "epoch": 8.90949720670391, + "grad_norm": 0.6091569662094116, + "learning_rate": 0.0005563865546218487, + "loss": 0.4576, + "step": 15948 + }, + { + "epoch": 8.910055865921787, + "grad_norm": 0.5464770793914795, + "learning_rate": 0.000556358543417367, + "loss": 0.346, + "step": 15949 + }, + { + "epoch": 8.910614525139664, + "grad_norm": 1.5108433961868286, + "learning_rate": 0.0005563305322128853, + "loss": 0.4025, + "step": 15950 + }, + { + "epoch": 8.911173184357542, + "grad_norm": 0.8324965238571167, + "learning_rate": 0.0005563025210084034, + "loss": 0.5877, + "step": 15951 + }, + { + "epoch": 8.911731843575419, + "grad_norm": 0.7443004846572876, + "learning_rate": 0.0005562745098039216, + "loss": 0.4558, + "step": 15952 + }, + { + "epoch": 8.912290502793295, + "grad_norm": 0.6209191679954529, + "learning_rate": 0.0005562464985994398, + "loss": 0.4783, + "step": 15953 + }, + { + "epoch": 8.912849162011174, + "grad_norm": 0.5537817478179932, + "learning_rate": 0.000556218487394958, + "loss": 0.4153, + "step": 15954 + }, + { + "epoch": 8.91340782122905, + "grad_norm": 0.6437365412712097, + "learning_rate": 0.0005561904761904763, + "loss": 0.4323, + "step": 15955 + }, + { + "epoch": 8.913966480446927, + "grad_norm": 0.5271468758583069, + "learning_rate": 0.0005561624649859944, + "loss": 0.426, + "step": 15956 + }, + { + "epoch": 8.914525139664804, + "grad_norm": 1.0343917608261108, + "learning_rate": 0.0005561344537815126, + "loss": 0.5515, + "step": 15957 + }, + { + "epoch": 8.915083798882682, + "grad_norm": 1.8202875852584839, + "learning_rate": 0.0005561064425770308, + "loss": 0.4184, + "step": 15958 + }, + { + "epoch": 8.915642458100558, + "grad_norm": 0.5937560200691223, + "learning_rate": 0.000556078431372549, + "loss": 0.4586, + "step": 15959 + }, + { + "epoch": 8.916201117318435, + "grad_norm": 1.5224997997283936, + "learning_rate": 0.0005560504201680673, + "loss": 0.4259, + "step": 15960 + }, + { + "epoch": 8.916759776536313, + "grad_norm": 0.7288526296615601, + "learning_rate": 0.0005560224089635854, + "loss": 0.4676, + "step": 15961 + }, + { + "epoch": 8.91731843575419, + "grad_norm": 0.4420980215072632, + "learning_rate": 0.0005559943977591036, + "loss": 0.4393, + "step": 15962 + }, + { + "epoch": 8.917877094972066, + "grad_norm": 0.5820836424827576, + "learning_rate": 0.0005559663865546218, + "loss": 0.3767, + "step": 15963 + }, + { + "epoch": 8.918435754189945, + "grad_norm": 0.5217689871788025, + "learning_rate": 0.00055593837535014, + "loss": 0.379, + "step": 15964 + }, + { + "epoch": 8.918994413407821, + "grad_norm": 0.3979574143886566, + "learning_rate": 0.0005559103641456584, + "loss": 0.4204, + "step": 15965 + }, + { + "epoch": 8.919553072625698, + "grad_norm": 0.5714628100395203, + "learning_rate": 0.0005558823529411766, + "loss": 0.5348, + "step": 15966 + }, + { + "epoch": 8.920111731843576, + "grad_norm": 1.7164430618286133, + "learning_rate": 0.0005558543417366947, + "loss": 0.5723, + "step": 15967 + }, + { + "epoch": 8.920670391061453, + "grad_norm": 0.4394230544567108, + "learning_rate": 0.0005558263305322129, + "loss": 0.3848, + "step": 15968 + }, + { + "epoch": 8.92122905027933, + "grad_norm": 0.6123559474945068, + "learning_rate": 0.0005557983193277311, + "loss": 0.4742, + "step": 15969 + }, + { + "epoch": 8.921787709497206, + "grad_norm": 1.2475509643554688, + "learning_rate": 0.0005557703081232494, + "loss": 0.4408, + "step": 15970 + }, + { + "epoch": 8.922346368715084, + "grad_norm": 0.471023291349411, + "learning_rate": 0.0005557422969187676, + "loss": 0.4743, + "step": 15971 + }, + { + "epoch": 8.922905027932961, + "grad_norm": 0.6953405141830444, + "learning_rate": 0.0005557142857142857, + "loss": 0.4367, + "step": 15972 + }, + { + "epoch": 8.923463687150837, + "grad_norm": 0.681628406047821, + "learning_rate": 0.0005556862745098039, + "loss": 0.4739, + "step": 15973 + }, + { + "epoch": 8.924022346368716, + "grad_norm": 0.6660521030426025, + "learning_rate": 0.0005556582633053221, + "loss": 0.3252, + "step": 15974 + }, + { + "epoch": 8.924581005586592, + "grad_norm": 0.42581498622894287, + "learning_rate": 0.0005556302521008404, + "loss": 0.4024, + "step": 15975 + }, + { + "epoch": 8.925139664804469, + "grad_norm": 0.4710841476917267, + "learning_rate": 0.0005556022408963586, + "loss": 0.3389, + "step": 15976 + }, + { + "epoch": 8.925698324022346, + "grad_norm": 0.423613578081131, + "learning_rate": 0.0005555742296918767, + "loss": 0.4655, + "step": 15977 + }, + { + "epoch": 8.926256983240224, + "grad_norm": 1.524860143661499, + "learning_rate": 0.0005555462184873949, + "loss": 0.4266, + "step": 15978 + }, + { + "epoch": 8.9268156424581, + "grad_norm": 0.8713619709014893, + "learning_rate": 0.0005555182072829131, + "loss": 0.4722, + "step": 15979 + }, + { + "epoch": 8.927374301675977, + "grad_norm": 0.7668302059173584, + "learning_rate": 0.0005554901960784314, + "loss": 0.5078, + "step": 15980 + }, + { + "epoch": 8.927932960893855, + "grad_norm": 0.6430107951164246, + "learning_rate": 0.0005554621848739496, + "loss": 0.4857, + "step": 15981 + }, + { + "epoch": 8.928491620111732, + "grad_norm": 0.4193095862865448, + "learning_rate": 0.0005554341736694679, + "loss": 0.4686, + "step": 15982 + }, + { + "epoch": 8.929050279329608, + "grad_norm": 1.3140244483947754, + "learning_rate": 0.000555406162464986, + "loss": 0.3749, + "step": 15983 + }, + { + "epoch": 8.929608938547487, + "grad_norm": 0.3882800340652466, + "learning_rate": 0.0005553781512605042, + "loss": 0.4179, + "step": 15984 + }, + { + "epoch": 8.930167597765363, + "grad_norm": 0.4938243329524994, + "learning_rate": 0.0005553501400560225, + "loss": 0.3634, + "step": 15985 + }, + { + "epoch": 8.93072625698324, + "grad_norm": 0.5283041596412659, + "learning_rate": 0.0005553221288515407, + "loss": 0.3488, + "step": 15986 + }, + { + "epoch": 8.931284916201117, + "grad_norm": 0.862450122833252, + "learning_rate": 0.0005552941176470589, + "loss": 0.54, + "step": 15987 + }, + { + "epoch": 8.931843575418995, + "grad_norm": 1.287535309791565, + "learning_rate": 0.000555266106442577, + "loss": 0.4541, + "step": 15988 + }, + { + "epoch": 8.932402234636871, + "grad_norm": 0.6303169131278992, + "learning_rate": 0.0005552380952380952, + "loss": 0.5074, + "step": 15989 + }, + { + "epoch": 8.932960893854748, + "grad_norm": 0.38338854908943176, + "learning_rate": 0.0005552100840336135, + "loss": 0.4274, + "step": 15990 + }, + { + "epoch": 8.933519553072626, + "grad_norm": 0.7298899292945862, + "learning_rate": 0.0005551820728291317, + "loss": 0.5025, + "step": 15991 + }, + { + "epoch": 8.934078212290503, + "grad_norm": 0.4986940920352936, + "learning_rate": 0.0005551540616246499, + "loss": 0.4693, + "step": 15992 + }, + { + "epoch": 8.93463687150838, + "grad_norm": 0.4189278483390808, + "learning_rate": 0.000555126050420168, + "loss": 0.4832, + "step": 15993 + }, + { + "epoch": 8.935195530726258, + "grad_norm": 0.601718544960022, + "learning_rate": 0.0005550980392156862, + "loss": 0.5076, + "step": 15994 + }, + { + "epoch": 8.935754189944134, + "grad_norm": 0.5314221978187561, + "learning_rate": 0.0005550700280112045, + "loss": 0.4861, + "step": 15995 + }, + { + "epoch": 8.936312849162011, + "grad_norm": 0.3965783417224884, + "learning_rate": 0.0005550420168067227, + "loss": 0.3447, + "step": 15996 + }, + { + "epoch": 8.936871508379888, + "grad_norm": 0.48908886313438416, + "learning_rate": 0.0005550140056022409, + "loss": 0.4221, + "step": 15997 + }, + { + "epoch": 8.937430167597766, + "grad_norm": 0.4414675235748291, + "learning_rate": 0.0005549859943977591, + "loss": 0.5481, + "step": 15998 + }, + { + "epoch": 8.937988826815642, + "grad_norm": 0.449522465467453, + "learning_rate": 0.0005549579831932772, + "loss": 0.4437, + "step": 15999 + }, + { + "epoch": 8.938547486033519, + "grad_norm": 0.687625527381897, + "learning_rate": 0.0005549299719887956, + "loss": 0.3812, + "step": 16000 + }, + { + "epoch": 8.938547486033519, + "eval_cer": 0.08986067013627486, + "eval_loss": 0.338677316904068, + "eval_runtime": 59.3212, + "eval_samples_per_second": 76.499, + "eval_steps_per_second": 4.787, + "eval_wer": 0.3547945588112064, + "step": 16000 + }, + { + "epoch": 8.939106145251397, + "grad_norm": 0.4118690490722656, + "learning_rate": 0.0005549019607843138, + "loss": 0.4363, + "step": 16001 + }, + { + "epoch": 8.939664804469274, + "grad_norm": 0.7898194789886475, + "learning_rate": 0.000554873949579832, + "loss": 0.4652, + "step": 16002 + }, + { + "epoch": 8.94022346368715, + "grad_norm": 0.6410890817642212, + "learning_rate": 0.0005548459383753502, + "loss": 0.5963, + "step": 16003 + }, + { + "epoch": 8.940782122905027, + "grad_norm": 0.6669525504112244, + "learning_rate": 0.0005548179271708683, + "loss": 0.4695, + "step": 16004 + }, + { + "epoch": 8.941340782122905, + "grad_norm": 0.4108714759349823, + "learning_rate": 0.0005547899159663866, + "loss": 0.414, + "step": 16005 + }, + { + "epoch": 8.941899441340782, + "grad_norm": 2.204586982727051, + "learning_rate": 0.0005547619047619048, + "loss": 0.4554, + "step": 16006 + }, + { + "epoch": 8.942458100558659, + "grad_norm": 0.6554802060127258, + "learning_rate": 0.000554733893557423, + "loss": 0.5453, + "step": 16007 + }, + { + "epoch": 8.943016759776537, + "grad_norm": 1.2897108793258667, + "learning_rate": 0.0005547058823529412, + "loss": 0.5655, + "step": 16008 + }, + { + "epoch": 8.943575418994413, + "grad_norm": 0.6379144787788391, + "learning_rate": 0.0005546778711484593, + "loss": 0.4617, + "step": 16009 + }, + { + "epoch": 8.94413407821229, + "grad_norm": 0.41212019324302673, + "learning_rate": 0.0005546498599439776, + "loss": 0.3335, + "step": 16010 + }, + { + "epoch": 8.944692737430168, + "grad_norm": 7.185947418212891, + "learning_rate": 0.0005546218487394958, + "loss": 0.4866, + "step": 16011 + }, + { + "epoch": 8.945251396648045, + "grad_norm": 0.4587954580783844, + "learning_rate": 0.000554593837535014, + "loss": 0.4626, + "step": 16012 + }, + { + "epoch": 8.945810055865921, + "grad_norm": 2.7650115489959717, + "learning_rate": 0.0005545658263305322, + "loss": 0.4682, + "step": 16013 + }, + { + "epoch": 8.946368715083798, + "grad_norm": 1.127885341644287, + "learning_rate": 0.0005545378151260504, + "loss": 0.3877, + "step": 16014 + }, + { + "epoch": 8.946927374301676, + "grad_norm": 0.8281756043434143, + "learning_rate": 0.0005545098039215687, + "loss": 0.4863, + "step": 16015 + }, + { + "epoch": 8.947486033519553, + "grad_norm": 0.5242045521736145, + "learning_rate": 0.0005544817927170869, + "loss": 0.4594, + "step": 16016 + }, + { + "epoch": 8.94804469273743, + "grad_norm": 0.6643726229667664, + "learning_rate": 0.0005544537815126051, + "loss": 0.5377, + "step": 16017 + }, + { + "epoch": 8.948603351955308, + "grad_norm": 0.4242427349090576, + "learning_rate": 0.0005544257703081233, + "loss": 0.4764, + "step": 16018 + }, + { + "epoch": 8.949162011173184, + "grad_norm": 0.6682928800582886, + "learning_rate": 0.0005543977591036415, + "loss": 0.481, + "step": 16019 + }, + { + "epoch": 8.949720670391061, + "grad_norm": 0.3470328748226166, + "learning_rate": 0.0005543697478991597, + "loss": 0.4326, + "step": 16020 + }, + { + "epoch": 8.95027932960894, + "grad_norm": 0.46993130445480347, + "learning_rate": 0.0005543417366946779, + "loss": 0.4656, + "step": 16021 + }, + { + "epoch": 8.950837988826816, + "grad_norm": 0.49074608087539673, + "learning_rate": 0.0005543137254901961, + "loss": 0.4227, + "step": 16022 + }, + { + "epoch": 8.951396648044692, + "grad_norm": 0.39384642243385315, + "learning_rate": 0.0005542857142857143, + "loss": 0.385, + "step": 16023 + }, + { + "epoch": 8.951955307262569, + "grad_norm": 0.42947933077812195, + "learning_rate": 0.0005542577030812325, + "loss": 0.3922, + "step": 16024 + }, + { + "epoch": 8.952513966480447, + "grad_norm": 0.5113601684570312, + "learning_rate": 0.0005542296918767507, + "loss": 0.4506, + "step": 16025 + }, + { + "epoch": 8.953072625698324, + "grad_norm": 0.8232506513595581, + "learning_rate": 0.0005542016806722689, + "loss": 0.5329, + "step": 16026 + }, + { + "epoch": 8.9536312849162, + "grad_norm": 0.8880576491355896, + "learning_rate": 0.0005541736694677871, + "loss": 0.4885, + "step": 16027 + }, + { + "epoch": 8.954189944134079, + "grad_norm": 0.8181654810905457, + "learning_rate": 0.0005541456582633053, + "loss": 0.5341, + "step": 16028 + }, + { + "epoch": 8.954748603351955, + "grad_norm": 0.6600484848022461, + "learning_rate": 0.0005541176470588235, + "loss": 0.4963, + "step": 16029 + }, + { + "epoch": 8.955307262569832, + "grad_norm": 0.5024636387825012, + "learning_rate": 0.0005540896358543418, + "loss": 0.4829, + "step": 16030 + }, + { + "epoch": 8.955865921787709, + "grad_norm": 0.7213287353515625, + "learning_rate": 0.00055406162464986, + "loss": 0.3964, + "step": 16031 + }, + { + "epoch": 8.956424581005587, + "grad_norm": 0.6209570169448853, + "learning_rate": 0.0005540336134453782, + "loss": 0.4451, + "step": 16032 + }, + { + "epoch": 8.956983240223463, + "grad_norm": 0.5490360856056213, + "learning_rate": 0.0005540056022408964, + "loss": 0.3946, + "step": 16033 + }, + { + "epoch": 8.95754189944134, + "grad_norm": 0.43646201491355896, + "learning_rate": 0.0005539775910364146, + "loss": 0.451, + "step": 16034 + }, + { + "epoch": 8.958100558659218, + "grad_norm": 3.0522241592407227, + "learning_rate": 0.0005539495798319329, + "loss": 0.6898, + "step": 16035 + }, + { + "epoch": 8.958659217877095, + "grad_norm": 1.25969398021698, + "learning_rate": 0.000553921568627451, + "loss": 0.3744, + "step": 16036 + }, + { + "epoch": 8.959217877094972, + "grad_norm": 0.4619274139404297, + "learning_rate": 0.0005538935574229692, + "loss": 0.4291, + "step": 16037 + }, + { + "epoch": 8.95977653631285, + "grad_norm": 0.9642127752304077, + "learning_rate": 0.0005538655462184874, + "loss": 0.3878, + "step": 16038 + }, + { + "epoch": 8.960335195530726, + "grad_norm": 0.614017903804779, + "learning_rate": 0.0005538375350140056, + "loss": 0.4823, + "step": 16039 + }, + { + "epoch": 8.960893854748603, + "grad_norm": 0.9552125930786133, + "learning_rate": 0.0005538095238095239, + "loss": 0.4313, + "step": 16040 + }, + { + "epoch": 8.961452513966481, + "grad_norm": 0.49200764298439026, + "learning_rate": 0.000553781512605042, + "loss": 0.4594, + "step": 16041 + }, + { + "epoch": 8.962011173184358, + "grad_norm": 0.751595139503479, + "learning_rate": 0.0005537535014005602, + "loss": 0.4385, + "step": 16042 + }, + { + "epoch": 8.962569832402234, + "grad_norm": 0.6982694268226624, + "learning_rate": 0.0005537254901960784, + "loss": 0.3801, + "step": 16043 + }, + { + "epoch": 8.963128491620111, + "grad_norm": 1.366797924041748, + "learning_rate": 0.0005536974789915966, + "loss": 0.4394, + "step": 16044 + }, + { + "epoch": 8.96368715083799, + "grad_norm": 1.549389362335205, + "learning_rate": 0.0005536694677871149, + "loss": 0.4521, + "step": 16045 + }, + { + "epoch": 8.964245810055866, + "grad_norm": 0.3726401925086975, + "learning_rate": 0.0005536414565826331, + "loss": 0.3531, + "step": 16046 + }, + { + "epoch": 8.964804469273743, + "grad_norm": 0.4503166675567627, + "learning_rate": 0.0005536134453781512, + "loss": 0.4028, + "step": 16047 + }, + { + "epoch": 8.96536312849162, + "grad_norm": 0.6732364892959595, + "learning_rate": 0.0005535854341736694, + "loss": 0.4026, + "step": 16048 + }, + { + "epoch": 8.965921787709497, + "grad_norm": 4.926407337188721, + "learning_rate": 0.0005535574229691877, + "loss": 0.3675, + "step": 16049 + }, + { + "epoch": 8.966480446927374, + "grad_norm": 0.6232845187187195, + "learning_rate": 0.000553529411764706, + "loss": 0.6547, + "step": 16050 + }, + { + "epoch": 8.96703910614525, + "grad_norm": 0.5606221556663513, + "learning_rate": 0.0005535014005602242, + "loss": 0.5447, + "step": 16051 + }, + { + "epoch": 8.967597765363129, + "grad_norm": 1.1458344459533691, + "learning_rate": 0.0005534733893557423, + "loss": 0.5058, + "step": 16052 + }, + { + "epoch": 8.968156424581005, + "grad_norm": 0.7482420206069946, + "learning_rate": 0.0005534453781512605, + "loss": 0.4652, + "step": 16053 + }, + { + "epoch": 8.968715083798882, + "grad_norm": 0.4654027223587036, + "learning_rate": 0.0005534173669467787, + "loss": 0.5068, + "step": 16054 + }, + { + "epoch": 8.96927374301676, + "grad_norm": 0.5109484791755676, + "learning_rate": 0.000553389355742297, + "loss": 0.4004, + "step": 16055 + }, + { + "epoch": 8.969832402234637, + "grad_norm": 0.4242281913757324, + "learning_rate": 0.0005533613445378152, + "loss": 0.367, + "step": 16056 + }, + { + "epoch": 8.970391061452514, + "grad_norm": 0.46628156304359436, + "learning_rate": 0.0005533333333333333, + "loss": 0.5677, + "step": 16057 + }, + { + "epoch": 8.970949720670392, + "grad_norm": 0.6533046364784241, + "learning_rate": 0.0005533053221288515, + "loss": 0.6275, + "step": 16058 + }, + { + "epoch": 8.971508379888268, + "grad_norm": 0.3921302258968353, + "learning_rate": 0.0005532773109243697, + "loss": 0.2887, + "step": 16059 + }, + { + "epoch": 8.972067039106145, + "grad_norm": 0.5315089225769043, + "learning_rate": 0.000553249299719888, + "loss": 0.4757, + "step": 16060 + }, + { + "epoch": 8.972625698324022, + "grad_norm": 0.6399401426315308, + "learning_rate": 0.0005532212885154062, + "loss": 0.3979, + "step": 16061 + }, + { + "epoch": 8.9731843575419, + "grad_norm": 0.5833736658096313, + "learning_rate": 0.0005531932773109244, + "loss": 0.4549, + "step": 16062 + }, + { + "epoch": 8.973743016759776, + "grad_norm": 0.5431896448135376, + "learning_rate": 0.0005531652661064425, + "loss": 0.4373, + "step": 16063 + }, + { + "epoch": 8.974301675977653, + "grad_norm": 0.4164978265762329, + "learning_rate": 0.0005531372549019607, + "loss": 0.3365, + "step": 16064 + }, + { + "epoch": 8.974860335195531, + "grad_norm": 0.27505579590797424, + "learning_rate": 0.000553109243697479, + "loss": 0.2261, + "step": 16065 + }, + { + "epoch": 8.975418994413408, + "grad_norm": 0.4429325461387634, + "learning_rate": 0.0005530812324929973, + "loss": 0.3987, + "step": 16066 + }, + { + "epoch": 8.975977653631285, + "grad_norm": 0.5422666072845459, + "learning_rate": 0.0005530532212885155, + "loss": 0.5242, + "step": 16067 + }, + { + "epoch": 8.976536312849163, + "grad_norm": 0.4366532266139984, + "learning_rate": 0.0005530252100840336, + "loss": 0.3941, + "step": 16068 + }, + { + "epoch": 8.97709497206704, + "grad_norm": 3.298611640930176, + "learning_rate": 0.0005529971988795518, + "loss": 0.3795, + "step": 16069 + }, + { + "epoch": 8.977653631284916, + "grad_norm": 0.433038592338562, + "learning_rate": 0.00055296918767507, + "loss": 0.4503, + "step": 16070 + }, + { + "epoch": 8.978212290502793, + "grad_norm": 0.38121265172958374, + "learning_rate": 0.0005529411764705883, + "loss": 0.4205, + "step": 16071 + }, + { + "epoch": 8.978770949720671, + "grad_norm": 0.4107948839664459, + "learning_rate": 0.0005529131652661065, + "loss": 0.4777, + "step": 16072 + }, + { + "epoch": 8.979329608938547, + "grad_norm": 19.80625343322754, + "learning_rate": 0.0005528851540616246, + "loss": 0.4442, + "step": 16073 + }, + { + "epoch": 8.979888268156424, + "grad_norm": 0.3977179527282715, + "learning_rate": 0.0005528571428571428, + "loss": 0.3453, + "step": 16074 + }, + { + "epoch": 8.980446927374302, + "grad_norm": 0.5534055829048157, + "learning_rate": 0.000552829131652661, + "loss": 0.3843, + "step": 16075 + }, + { + "epoch": 8.981005586592179, + "grad_norm": 0.8614566922187805, + "learning_rate": 0.0005528011204481793, + "loss": 0.4216, + "step": 16076 + }, + { + "epoch": 8.981564245810056, + "grad_norm": 4.549533367156982, + "learning_rate": 0.0005527731092436975, + "loss": 0.37, + "step": 16077 + }, + { + "epoch": 8.982122905027932, + "grad_norm": 0.49782413244247437, + "learning_rate": 0.0005527450980392157, + "loss": 0.5538, + "step": 16078 + }, + { + "epoch": 8.98268156424581, + "grad_norm": 0.4126283824443817, + "learning_rate": 0.0005527170868347338, + "loss": 0.4421, + "step": 16079 + }, + { + "epoch": 8.983240223463687, + "grad_norm": 1.023959994316101, + "learning_rate": 0.000552689075630252, + "loss": 0.4062, + "step": 16080 + }, + { + "epoch": 8.983798882681564, + "grad_norm": 0.8352178931236267, + "learning_rate": 0.0005526610644257704, + "loss": 0.5029, + "step": 16081 + }, + { + "epoch": 8.984357541899442, + "grad_norm": 2.0720584392547607, + "learning_rate": 0.0005526330532212886, + "loss": 0.4871, + "step": 16082 + }, + { + "epoch": 8.984916201117318, + "grad_norm": 0.6406990885734558, + "learning_rate": 0.0005526050420168068, + "loss": 0.3551, + "step": 16083 + }, + { + "epoch": 8.985474860335195, + "grad_norm": 0.478938490152359, + "learning_rate": 0.0005525770308123249, + "loss": 0.497, + "step": 16084 + }, + { + "epoch": 8.986033519553073, + "grad_norm": 0.4386378824710846, + "learning_rate": 0.0005525490196078431, + "loss": 0.3771, + "step": 16085 + }, + { + "epoch": 8.98659217877095, + "grad_norm": 0.6012040972709656, + "learning_rate": 0.0005525210084033614, + "loss": 0.4388, + "step": 16086 + }, + { + "epoch": 8.987150837988827, + "grad_norm": 0.4123627245426178, + "learning_rate": 0.0005524929971988796, + "loss": 0.4408, + "step": 16087 + }, + { + "epoch": 8.987709497206703, + "grad_norm": 0.43089112639427185, + "learning_rate": 0.0005524649859943978, + "loss": 0.3995, + "step": 16088 + }, + { + "epoch": 8.988268156424581, + "grad_norm": 0.42207980155944824, + "learning_rate": 0.0005524369747899159, + "loss": 0.4455, + "step": 16089 + }, + { + "epoch": 8.988826815642458, + "grad_norm": 3.493142604827881, + "learning_rate": 0.0005524089635854341, + "loss": 0.3508, + "step": 16090 + }, + { + "epoch": 8.989385474860335, + "grad_norm": 0.535279393196106, + "learning_rate": 0.0005523809523809524, + "loss": 0.4705, + "step": 16091 + }, + { + "epoch": 8.989944134078213, + "grad_norm": 0.453834593296051, + "learning_rate": 0.0005523529411764706, + "loss": 0.3667, + "step": 16092 + }, + { + "epoch": 8.99050279329609, + "grad_norm": 1.633609652519226, + "learning_rate": 0.0005523249299719888, + "loss": 0.4161, + "step": 16093 + }, + { + "epoch": 8.991061452513966, + "grad_norm": 0.48708850145339966, + "learning_rate": 0.000552296918767507, + "loss": 0.4177, + "step": 16094 + }, + { + "epoch": 8.991620111731844, + "grad_norm": 0.6837671995162964, + "learning_rate": 0.0005522689075630251, + "loss": 0.545, + "step": 16095 + }, + { + "epoch": 8.992178770949721, + "grad_norm": 0.40899500250816345, + "learning_rate": 0.0005522408963585434, + "loss": 0.3441, + "step": 16096 + }, + { + "epoch": 8.992737430167598, + "grad_norm": 0.4136612117290497, + "learning_rate": 0.0005522128851540617, + "loss": 0.2907, + "step": 16097 + }, + { + "epoch": 8.993296089385474, + "grad_norm": 0.4239876866340637, + "learning_rate": 0.0005521848739495799, + "loss": 0.4237, + "step": 16098 + }, + { + "epoch": 8.993854748603352, + "grad_norm": 0.6510956883430481, + "learning_rate": 0.0005521568627450981, + "loss": 0.4515, + "step": 16099 + }, + { + "epoch": 8.994413407821229, + "grad_norm": 0.4541299641132355, + "learning_rate": 0.0005521288515406162, + "loss": 0.4396, + "step": 16100 + }, + { + "epoch": 8.994972067039106, + "grad_norm": 0.4164002537727356, + "learning_rate": 0.0005521008403361345, + "loss": 0.4, + "step": 16101 + }, + { + "epoch": 8.995530726256984, + "grad_norm": 0.35673004388809204, + "learning_rate": 0.0005520728291316527, + "loss": 0.4109, + "step": 16102 + }, + { + "epoch": 8.99608938547486, + "grad_norm": 0.8713533282279968, + "learning_rate": 0.0005520448179271709, + "loss": 0.4883, + "step": 16103 + }, + { + "epoch": 8.996648044692737, + "grad_norm": 0.5718778967857361, + "learning_rate": 0.0005520168067226891, + "loss": 0.4175, + "step": 16104 + }, + { + "epoch": 8.997206703910614, + "grad_norm": 1.1776643991470337, + "learning_rate": 0.0005519887955182072, + "loss": 0.3605, + "step": 16105 + }, + { + "epoch": 8.997765363128492, + "grad_norm": 0.8573605418205261, + "learning_rate": 0.0005519607843137255, + "loss": 0.4634, + "step": 16106 + }, + { + "epoch": 8.998324022346369, + "grad_norm": 0.37743502855300903, + "learning_rate": 0.0005519327731092437, + "loss": 0.4115, + "step": 16107 + }, + { + "epoch": 8.998882681564245, + "grad_norm": 0.4381525218486786, + "learning_rate": 0.0005519047619047619, + "loss": 0.4647, + "step": 16108 + }, + { + "epoch": 8.999441340782123, + "grad_norm": 0.5276052951812744, + "learning_rate": 0.0005518767507002801, + "loss": 0.3447, + "step": 16109 + }, + { + "epoch": 9.0, + "grad_norm": 0.45398926734924316, + "learning_rate": 0.0005518487394957983, + "loss": 0.4692, + "step": 16110 + }, + { + "epoch": 9.000558659217877, + "grad_norm": 0.471686989068985, + "learning_rate": 0.0005518207282913165, + "loss": 0.372, + "step": 16111 + }, + { + "epoch": 9.001117318435755, + "grad_norm": 0.48771005868911743, + "learning_rate": 0.0005517927170868347, + "loss": 0.5023, + "step": 16112 + }, + { + "epoch": 9.001675977653631, + "grad_norm": 0.49092793464660645, + "learning_rate": 0.000551764705882353, + "loss": 0.4116, + "step": 16113 + }, + { + "epoch": 9.002234636871508, + "grad_norm": 0.435975044965744, + "learning_rate": 0.0005517366946778712, + "loss": 0.3816, + "step": 16114 + }, + { + "epoch": 9.002793296089385, + "grad_norm": 0.45199039578437805, + "learning_rate": 0.0005517086834733894, + "loss": 0.3544, + "step": 16115 + }, + { + "epoch": 9.003351955307263, + "grad_norm": 7.319671630859375, + "learning_rate": 0.0005516806722689076, + "loss": 0.5074, + "step": 16116 + }, + { + "epoch": 9.00391061452514, + "grad_norm": 2.8106987476348877, + "learning_rate": 0.0005516526610644258, + "loss": 0.426, + "step": 16117 + }, + { + "epoch": 9.004469273743016, + "grad_norm": 0.6041256785392761, + "learning_rate": 0.000551624649859944, + "loss": 0.5083, + "step": 16118 + }, + { + "epoch": 9.005027932960894, + "grad_norm": 0.4483242332935333, + "learning_rate": 0.0005515966386554622, + "loss": 0.4408, + "step": 16119 + }, + { + "epoch": 9.005586592178771, + "grad_norm": 0.7909282445907593, + "learning_rate": 0.0005515686274509804, + "loss": 0.5326, + "step": 16120 + }, + { + "epoch": 9.006145251396648, + "grad_norm": 0.6927090883255005, + "learning_rate": 0.0005515406162464987, + "loss": 0.5765, + "step": 16121 + }, + { + "epoch": 9.006703910614526, + "grad_norm": 0.6927720904350281, + "learning_rate": 0.0005515126050420168, + "loss": 0.6415, + "step": 16122 + }, + { + "epoch": 9.007262569832402, + "grad_norm": 0.4872031509876251, + "learning_rate": 0.000551484593837535, + "loss": 0.3654, + "step": 16123 + }, + { + "epoch": 9.007821229050279, + "grad_norm": 0.7285237312316895, + "learning_rate": 0.0005514565826330532, + "loss": 0.4196, + "step": 16124 + }, + { + "epoch": 9.008379888268156, + "grad_norm": 0.3630661368370056, + "learning_rate": 0.0005514285714285714, + "loss": 0.3919, + "step": 16125 + }, + { + "epoch": 9.008938547486034, + "grad_norm": 0.44762906432151794, + "learning_rate": 0.0005514005602240897, + "loss": 0.3732, + "step": 16126 + }, + { + "epoch": 9.00949720670391, + "grad_norm": 0.6799202561378479, + "learning_rate": 0.0005513725490196078, + "loss": 0.436, + "step": 16127 + }, + { + "epoch": 9.010055865921787, + "grad_norm": 0.4219238758087158, + "learning_rate": 0.000551344537815126, + "loss": 0.4476, + "step": 16128 + }, + { + "epoch": 9.010614525139665, + "grad_norm": 0.7865345478057861, + "learning_rate": 0.0005513165266106442, + "loss": 0.3476, + "step": 16129 + }, + { + "epoch": 9.011173184357542, + "grad_norm": 2.6672730445861816, + "learning_rate": 0.0005512885154061624, + "loss": 0.4237, + "step": 16130 + }, + { + "epoch": 9.011731843575419, + "grad_norm": 0.6888644099235535, + "learning_rate": 0.0005512605042016808, + "loss": 0.429, + "step": 16131 + }, + { + "epoch": 9.012290502793297, + "grad_norm": 0.6142561435699463, + "learning_rate": 0.0005512324929971989, + "loss": 0.4408, + "step": 16132 + }, + { + "epoch": 9.012849162011173, + "grad_norm": 0.4682404398918152, + "learning_rate": 0.0005512044817927171, + "loss": 0.4748, + "step": 16133 + }, + { + "epoch": 9.01340782122905, + "grad_norm": 0.6848462224006653, + "learning_rate": 0.0005511764705882353, + "loss": 0.3949, + "step": 16134 + }, + { + "epoch": 9.013966480446927, + "grad_norm": 0.7022143006324768, + "learning_rate": 0.0005511484593837535, + "loss": 0.4867, + "step": 16135 + }, + { + "epoch": 9.014525139664805, + "grad_norm": 1.2872141599655151, + "learning_rate": 0.0005511204481792718, + "loss": 0.4714, + "step": 16136 + }, + { + "epoch": 9.015083798882682, + "grad_norm": 0.4741916060447693, + "learning_rate": 0.00055109243697479, + "loss": 0.3864, + "step": 16137 + }, + { + "epoch": 9.015642458100558, + "grad_norm": 0.7256016731262207, + "learning_rate": 0.0005510644257703081, + "loss": 0.4008, + "step": 16138 + }, + { + "epoch": 9.016201117318436, + "grad_norm": 0.46023839712142944, + "learning_rate": 0.0005510364145658263, + "loss": 0.4585, + "step": 16139 + }, + { + "epoch": 9.016759776536313, + "grad_norm": 0.6426053047180176, + "learning_rate": 0.0005510084033613445, + "loss": 0.4557, + "step": 16140 + }, + { + "epoch": 9.01731843575419, + "grad_norm": 0.40671733021736145, + "learning_rate": 0.0005509803921568628, + "loss": 0.335, + "step": 16141 + }, + { + "epoch": 9.017877094972068, + "grad_norm": 0.3892160654067993, + "learning_rate": 0.000550952380952381, + "loss": 0.3745, + "step": 16142 + }, + { + "epoch": 9.018435754189944, + "grad_norm": 1.191564679145813, + "learning_rate": 0.0005509243697478991, + "loss": 0.4222, + "step": 16143 + }, + { + "epoch": 9.018994413407821, + "grad_norm": 0.654286801815033, + "learning_rate": 0.0005508963585434173, + "loss": 0.3245, + "step": 16144 + }, + { + "epoch": 9.019553072625698, + "grad_norm": 0.7706868648529053, + "learning_rate": 0.0005508683473389355, + "loss": 0.5063, + "step": 16145 + }, + { + "epoch": 9.020111731843576, + "grad_norm": 0.5676131844520569, + "learning_rate": 0.0005508403361344539, + "loss": 0.5046, + "step": 16146 + }, + { + "epoch": 9.020670391061453, + "grad_norm": 0.6341442465782166, + "learning_rate": 0.0005508123249299721, + "loss": 0.4806, + "step": 16147 + }, + { + "epoch": 9.021229050279329, + "grad_norm": 0.35954737663269043, + "learning_rate": 0.0005507843137254902, + "loss": 0.4144, + "step": 16148 + }, + { + "epoch": 9.021787709497207, + "grad_norm": 0.6082434058189392, + "learning_rate": 0.0005507563025210084, + "loss": 0.544, + "step": 16149 + }, + { + "epoch": 9.022346368715084, + "grad_norm": 1.1365302801132202, + "learning_rate": 0.0005507282913165266, + "loss": 0.4561, + "step": 16150 + }, + { + "epoch": 9.02290502793296, + "grad_norm": 0.5052218437194824, + "learning_rate": 0.0005507002801120449, + "loss": 0.5296, + "step": 16151 + }, + { + "epoch": 9.023463687150837, + "grad_norm": 0.8568074703216553, + "learning_rate": 0.0005506722689075631, + "loss": 0.5617, + "step": 16152 + }, + { + "epoch": 9.024022346368715, + "grad_norm": 0.5012927055358887, + "learning_rate": 0.0005506442577030813, + "loss": 0.4135, + "step": 16153 + }, + { + "epoch": 9.024581005586592, + "grad_norm": 0.9031180739402771, + "learning_rate": 0.0005506162464985994, + "loss": 0.4345, + "step": 16154 + }, + { + "epoch": 9.025139664804469, + "grad_norm": 0.32558107376098633, + "learning_rate": 0.0005505882352941176, + "loss": 0.4025, + "step": 16155 + }, + { + "epoch": 9.025698324022347, + "grad_norm": 0.8359012603759766, + "learning_rate": 0.0005505602240896359, + "loss": 0.4153, + "step": 16156 + }, + { + "epoch": 9.026256983240224, + "grad_norm": 0.5309123396873474, + "learning_rate": 0.0005505322128851541, + "loss": 0.3908, + "step": 16157 + }, + { + "epoch": 9.0268156424581, + "grad_norm": 0.47509562969207764, + "learning_rate": 0.0005505042016806723, + "loss": 0.5155, + "step": 16158 + }, + { + "epoch": 9.027374301675978, + "grad_norm": 0.3786229193210602, + "learning_rate": 0.0005504761904761904, + "loss": 0.3358, + "step": 16159 + }, + { + "epoch": 9.027932960893855, + "grad_norm": 0.4620549976825714, + "learning_rate": 0.0005504481792717086, + "loss": 0.3888, + "step": 16160 + }, + { + "epoch": 9.028491620111732, + "grad_norm": 0.570759117603302, + "learning_rate": 0.0005504201680672269, + "loss": 0.4881, + "step": 16161 + }, + { + "epoch": 9.029050279329608, + "grad_norm": 0.4059472382068634, + "learning_rate": 0.0005503921568627451, + "loss": 0.4683, + "step": 16162 + }, + { + "epoch": 9.029608938547486, + "grad_norm": 0.6964138746261597, + "learning_rate": 0.0005503641456582634, + "loss": 0.4716, + "step": 16163 + }, + { + "epoch": 9.030167597765363, + "grad_norm": 0.4319922924041748, + "learning_rate": 0.0005503361344537815, + "loss": 0.3864, + "step": 16164 + }, + { + "epoch": 9.03072625698324, + "grad_norm": 0.4732145667076111, + "learning_rate": 0.0005503081232492997, + "loss": 0.4117, + "step": 16165 + }, + { + "epoch": 9.031284916201118, + "grad_norm": 0.7796878218650818, + "learning_rate": 0.000550280112044818, + "loss": 0.4702, + "step": 16166 + }, + { + "epoch": 9.031843575418995, + "grad_norm": 0.3250197470188141, + "learning_rate": 0.0005502521008403362, + "loss": 0.3937, + "step": 16167 + }, + { + "epoch": 9.032402234636871, + "grad_norm": 0.5737577080726624, + "learning_rate": 0.0005502240896358544, + "loss": 0.437, + "step": 16168 + }, + { + "epoch": 9.03296089385475, + "grad_norm": 0.4221270978450775, + "learning_rate": 0.0005501960784313726, + "loss": 0.4572, + "step": 16169 + }, + { + "epoch": 9.033519553072626, + "grad_norm": 0.36608004570007324, + "learning_rate": 0.0005501680672268907, + "loss": 0.3638, + "step": 16170 + }, + { + "epoch": 9.034078212290503, + "grad_norm": 1.0141520500183105, + "learning_rate": 0.000550140056022409, + "loss": 0.3616, + "step": 16171 + }, + { + "epoch": 9.03463687150838, + "grad_norm": 0.6397863030433655, + "learning_rate": 0.0005501120448179272, + "loss": 0.4809, + "step": 16172 + }, + { + "epoch": 9.035195530726257, + "grad_norm": 1.5604029893875122, + "learning_rate": 0.0005500840336134454, + "loss": 0.4924, + "step": 16173 + }, + { + "epoch": 9.035754189944134, + "grad_norm": 0.495320588350296, + "learning_rate": 0.0005500560224089636, + "loss": 0.4222, + "step": 16174 + }, + { + "epoch": 9.03631284916201, + "grad_norm": 0.5173612833023071, + "learning_rate": 0.0005500280112044817, + "loss": 0.4937, + "step": 16175 + }, + { + "epoch": 9.036871508379889, + "grad_norm": 0.6018291115760803, + "learning_rate": 0.00055, + "loss": 0.4473, + "step": 16176 + }, + { + "epoch": 9.037430167597766, + "grad_norm": 0.3838387429714203, + "learning_rate": 0.0005499719887955182, + "loss": 0.405, + "step": 16177 + }, + { + "epoch": 9.037988826815642, + "grad_norm": 0.4314991235733032, + "learning_rate": 0.0005499439775910364, + "loss": 0.4874, + "step": 16178 + }, + { + "epoch": 9.03854748603352, + "grad_norm": 1.260207176208496, + "learning_rate": 0.0005499159663865547, + "loss": 0.5416, + "step": 16179 + }, + { + "epoch": 9.039106145251397, + "grad_norm": 0.6589161157608032, + "learning_rate": 0.0005498879551820727, + "loss": 0.433, + "step": 16180 + }, + { + "epoch": 9.039664804469274, + "grad_norm": 0.6899911165237427, + "learning_rate": 0.0005498599439775911, + "loss": 0.4592, + "step": 16181 + }, + { + "epoch": 9.04022346368715, + "grad_norm": 0.5168753862380981, + "learning_rate": 0.0005498319327731093, + "loss": 0.4809, + "step": 16182 + }, + { + "epoch": 9.040782122905028, + "grad_norm": 0.7457990050315857, + "learning_rate": 0.0005498039215686275, + "loss": 0.4037, + "step": 16183 + }, + { + "epoch": 9.041340782122905, + "grad_norm": 0.4349231719970703, + "learning_rate": 0.0005497759103641457, + "loss": 0.3702, + "step": 16184 + }, + { + "epoch": 9.041899441340782, + "grad_norm": 0.7760615944862366, + "learning_rate": 0.0005497478991596639, + "loss": 0.3719, + "step": 16185 + }, + { + "epoch": 9.04245810055866, + "grad_norm": 0.5469179749488831, + "learning_rate": 0.0005497198879551821, + "loss": 0.458, + "step": 16186 + }, + { + "epoch": 9.043016759776537, + "grad_norm": 1.2574890851974487, + "learning_rate": 0.0005496918767507003, + "loss": 0.4447, + "step": 16187 + }, + { + "epoch": 9.043575418994413, + "grad_norm": 0.63707435131073, + "learning_rate": 0.0005496638655462185, + "loss": 0.4889, + "step": 16188 + }, + { + "epoch": 9.04413407821229, + "grad_norm": 0.43079155683517456, + "learning_rate": 0.0005496358543417367, + "loss": 0.3619, + "step": 16189 + }, + { + "epoch": 9.044692737430168, + "grad_norm": 0.6787812113761902, + "learning_rate": 0.0005496078431372549, + "loss": 0.4939, + "step": 16190 + }, + { + "epoch": 9.045251396648045, + "grad_norm": 0.4049622714519501, + "learning_rate": 0.0005495798319327731, + "loss": 0.349, + "step": 16191 + }, + { + "epoch": 9.045810055865921, + "grad_norm": 1.0897622108459473, + "learning_rate": 0.0005495518207282913, + "loss": 0.4392, + "step": 16192 + }, + { + "epoch": 9.0463687150838, + "grad_norm": 11.24601936340332, + "learning_rate": 0.0005495238095238095, + "loss": 0.5245, + "step": 16193 + }, + { + "epoch": 9.046927374301676, + "grad_norm": 0.8324848413467407, + "learning_rate": 0.0005494957983193277, + "loss": 0.4513, + "step": 16194 + }, + { + "epoch": 9.047486033519553, + "grad_norm": 0.5224907994270325, + "learning_rate": 0.000549467787114846, + "loss": 0.5327, + "step": 16195 + }, + { + "epoch": 9.048044692737431, + "grad_norm": 0.5830835700035095, + "learning_rate": 0.0005494397759103642, + "loss": 0.4373, + "step": 16196 + }, + { + "epoch": 9.048603351955308, + "grad_norm": 1.0531364679336548, + "learning_rate": 0.0005494117647058824, + "loss": 0.4228, + "step": 16197 + }, + { + "epoch": 9.049162011173184, + "grad_norm": 0.4486483037471771, + "learning_rate": 0.0005493837535014006, + "loss": 0.4906, + "step": 16198 + }, + { + "epoch": 9.04972067039106, + "grad_norm": 0.9480206966400146, + "learning_rate": 0.0005493557422969188, + "loss": 0.4164, + "step": 16199 + }, + { + "epoch": 9.050279329608939, + "grad_norm": 3.4200010299682617, + "learning_rate": 0.000549327731092437, + "loss": 0.3918, + "step": 16200 + }, + { + "epoch": 9.050837988826816, + "grad_norm": 0.5610361099243164, + "learning_rate": 0.0005492997198879553, + "loss": 0.5111, + "step": 16201 + }, + { + "epoch": 9.051396648044692, + "grad_norm": 0.5053939819335938, + "learning_rate": 0.0005492717086834734, + "loss": 0.5089, + "step": 16202 + }, + { + "epoch": 9.05195530726257, + "grad_norm": 0.40537121891975403, + "learning_rate": 0.0005492436974789916, + "loss": 0.3967, + "step": 16203 + }, + { + "epoch": 9.052513966480447, + "grad_norm": 0.4986492693424225, + "learning_rate": 0.0005492156862745098, + "loss": 0.3496, + "step": 16204 + }, + { + "epoch": 9.053072625698324, + "grad_norm": 0.39778822660446167, + "learning_rate": 0.000549187675070028, + "loss": 0.4901, + "step": 16205 + }, + { + "epoch": 9.053631284916202, + "grad_norm": 0.6032705307006836, + "learning_rate": 0.0005491596638655463, + "loss": 0.4693, + "step": 16206 + }, + { + "epoch": 9.054189944134079, + "grad_norm": 0.7003213167190552, + "learning_rate": 0.0005491316526610644, + "loss": 0.4886, + "step": 16207 + }, + { + "epoch": 9.054748603351955, + "grad_norm": 1.5134395360946655, + "learning_rate": 0.0005491036414565826, + "loss": 0.6274, + "step": 16208 + }, + { + "epoch": 9.055307262569832, + "grad_norm": 0.5603281259536743, + "learning_rate": 0.0005490756302521008, + "loss": 0.4845, + "step": 16209 + }, + { + "epoch": 9.05586592178771, + "grad_norm": 0.6256247758865356, + "learning_rate": 0.000549047619047619, + "loss": 0.4094, + "step": 16210 + }, + { + "epoch": 9.056424581005587, + "grad_norm": 0.582689106464386, + "learning_rate": 0.0005490196078431374, + "loss": 0.5259, + "step": 16211 + }, + { + "epoch": 9.056983240223463, + "grad_norm": 0.40499866008758545, + "learning_rate": 0.0005489915966386554, + "loss": 0.2875, + "step": 16212 + }, + { + "epoch": 9.057541899441341, + "grad_norm": 0.6198334693908691, + "learning_rate": 0.0005489635854341737, + "loss": 0.4245, + "step": 16213 + }, + { + "epoch": 9.058100558659218, + "grad_norm": 0.8410502076148987, + "learning_rate": 0.0005489355742296919, + "loss": 0.456, + "step": 16214 + }, + { + "epoch": 9.058659217877095, + "grad_norm": 0.6758708953857422, + "learning_rate": 0.0005489075630252101, + "loss": 0.4311, + "step": 16215 + }, + { + "epoch": 9.059217877094973, + "grad_norm": 0.4737738370895386, + "learning_rate": 0.0005488795518207284, + "loss": 0.468, + "step": 16216 + }, + { + "epoch": 9.05977653631285, + "grad_norm": 0.4956648647785187, + "learning_rate": 0.0005488515406162466, + "loss": 0.3793, + "step": 16217 + }, + { + "epoch": 9.060335195530726, + "grad_norm": 0.477300226688385, + "learning_rate": 0.0005488235294117647, + "loss": 0.3688, + "step": 16218 + }, + { + "epoch": 9.060893854748603, + "grad_norm": 0.49101436138153076, + "learning_rate": 0.0005487955182072829, + "loss": 0.3954, + "step": 16219 + }, + { + "epoch": 9.061452513966481, + "grad_norm": 0.6452205777168274, + "learning_rate": 0.0005487675070028011, + "loss": 0.3309, + "step": 16220 + }, + { + "epoch": 9.062011173184358, + "grad_norm": 5.432021617889404, + "learning_rate": 0.0005487394957983194, + "loss": 0.5069, + "step": 16221 + }, + { + "epoch": 9.062569832402234, + "grad_norm": 0.6202031970024109, + "learning_rate": 0.0005487114845938376, + "loss": 0.3848, + "step": 16222 + }, + { + "epoch": 9.063128491620112, + "grad_norm": 4.939457416534424, + "learning_rate": 0.0005486834733893557, + "loss": 0.4985, + "step": 16223 + }, + { + "epoch": 9.063687150837989, + "grad_norm": 0.7293437123298645, + "learning_rate": 0.0005486554621848739, + "loss": 0.449, + "step": 16224 + }, + { + "epoch": 9.064245810055866, + "grad_norm": 0.39150670170783997, + "learning_rate": 0.0005486274509803921, + "loss": 0.3321, + "step": 16225 + }, + { + "epoch": 9.064804469273742, + "grad_norm": 0.5872224569320679, + "learning_rate": 0.0005485994397759104, + "loss": 0.4727, + "step": 16226 + }, + { + "epoch": 9.06536312849162, + "grad_norm": 0.5569866299629211, + "learning_rate": 0.0005485714285714286, + "loss": 0.4433, + "step": 16227 + }, + { + "epoch": 9.065921787709497, + "grad_norm": 0.41548797488212585, + "learning_rate": 0.0005485434173669467, + "loss": 0.366, + "step": 16228 + }, + { + "epoch": 9.066480446927374, + "grad_norm": 0.5427729487419128, + "learning_rate": 0.000548515406162465, + "loss": 0.3932, + "step": 16229 + }, + { + "epoch": 9.067039106145252, + "grad_norm": 0.5051064491271973, + "learning_rate": 0.0005484873949579832, + "loss": 0.5481, + "step": 16230 + }, + { + "epoch": 9.067597765363129, + "grad_norm": 0.5244288444519043, + "learning_rate": 0.0005484593837535015, + "loss": 0.5337, + "step": 16231 + }, + { + "epoch": 9.068156424581005, + "grad_norm": 0.3912734389305115, + "learning_rate": 0.0005484313725490197, + "loss": 0.3239, + "step": 16232 + }, + { + "epoch": 9.068715083798883, + "grad_norm": 0.5021672248840332, + "learning_rate": 0.0005484033613445379, + "loss": 0.5159, + "step": 16233 + }, + { + "epoch": 9.06927374301676, + "grad_norm": 0.7317255139350891, + "learning_rate": 0.000548375350140056, + "loss": 0.4602, + "step": 16234 + }, + { + "epoch": 9.069832402234637, + "grad_norm": 0.3760683834552765, + "learning_rate": 0.0005483473389355742, + "loss": 0.3894, + "step": 16235 + }, + { + "epoch": 9.070391061452513, + "grad_norm": 0.4998409152030945, + "learning_rate": 0.0005483193277310925, + "loss": 0.4143, + "step": 16236 + }, + { + "epoch": 9.070949720670392, + "grad_norm": 0.5495596528053284, + "learning_rate": 0.0005482913165266107, + "loss": 0.4539, + "step": 16237 + }, + { + "epoch": 9.071508379888268, + "grad_norm": 0.5514495372772217, + "learning_rate": 0.0005482633053221289, + "loss": 0.4303, + "step": 16238 + }, + { + "epoch": 9.072067039106145, + "grad_norm": 0.582670271396637, + "learning_rate": 0.000548235294117647, + "loss": 0.4043, + "step": 16239 + }, + { + "epoch": 9.072625698324023, + "grad_norm": 0.35094568133354187, + "learning_rate": 0.0005482072829131652, + "loss": 0.4455, + "step": 16240 + }, + { + "epoch": 9.0731843575419, + "grad_norm": 0.6852053999900818, + "learning_rate": 0.0005481792717086835, + "loss": 0.5387, + "step": 16241 + }, + { + "epoch": 9.073743016759776, + "grad_norm": 0.3727850317955017, + "learning_rate": 0.0005481512605042017, + "loss": 0.3634, + "step": 16242 + }, + { + "epoch": 9.074301675977654, + "grad_norm": 0.45923852920532227, + "learning_rate": 0.0005481232492997199, + "loss": 0.5085, + "step": 16243 + }, + { + "epoch": 9.074860335195531, + "grad_norm": 0.6615126729011536, + "learning_rate": 0.000548095238095238, + "loss": 0.4763, + "step": 16244 + }, + { + "epoch": 9.075418994413408, + "grad_norm": 0.37014415860176086, + "learning_rate": 0.0005480672268907562, + "loss": 0.3241, + "step": 16245 + }, + { + "epoch": 9.075977653631284, + "grad_norm": 1.961401104927063, + "learning_rate": 0.0005480392156862746, + "loss": 0.3432, + "step": 16246 + }, + { + "epoch": 9.076536312849163, + "grad_norm": 0.5294041037559509, + "learning_rate": 0.0005480112044817928, + "loss": 0.487, + "step": 16247 + }, + { + "epoch": 9.077094972067039, + "grad_norm": 1.2390879392623901, + "learning_rate": 0.000547983193277311, + "loss": 0.4214, + "step": 16248 + }, + { + "epoch": 9.077653631284916, + "grad_norm": 0.5112480521202087, + "learning_rate": 0.0005479551820728292, + "loss": 0.5072, + "step": 16249 + }, + { + "epoch": 9.078212290502794, + "grad_norm": 0.5959158539772034, + "learning_rate": 0.0005479271708683473, + "loss": 0.4592, + "step": 16250 + }, + { + "epoch": 9.07877094972067, + "grad_norm": 0.44943875074386597, + "learning_rate": 0.0005478991596638656, + "loss": 0.352, + "step": 16251 + }, + { + "epoch": 9.079329608938547, + "grad_norm": 0.543141782283783, + "learning_rate": 0.0005478711484593838, + "loss": 0.4888, + "step": 16252 + }, + { + "epoch": 9.079888268156424, + "grad_norm": 0.5933079123497009, + "learning_rate": 0.000547843137254902, + "loss": 0.5776, + "step": 16253 + }, + { + "epoch": 9.080446927374302, + "grad_norm": 0.6944230198860168, + "learning_rate": 0.0005478151260504202, + "loss": 0.4523, + "step": 16254 + }, + { + "epoch": 9.081005586592179, + "grad_norm": 0.5095569491386414, + "learning_rate": 0.0005477871148459383, + "loss": 0.5002, + "step": 16255 + }, + { + "epoch": 9.081564245810055, + "grad_norm": 2.954151153564453, + "learning_rate": 0.0005477591036414566, + "loss": 0.5539, + "step": 16256 + }, + { + "epoch": 9.082122905027934, + "grad_norm": 0.5832377672195435, + "learning_rate": 0.0005477310924369748, + "loss": 0.4825, + "step": 16257 + }, + { + "epoch": 9.08268156424581, + "grad_norm": 0.7789512872695923, + "learning_rate": 0.000547703081232493, + "loss": 0.5376, + "step": 16258 + }, + { + "epoch": 9.083240223463687, + "grad_norm": 0.4797183573246002, + "learning_rate": 0.0005476750700280112, + "loss": 0.4311, + "step": 16259 + }, + { + "epoch": 9.083798882681565, + "grad_norm": 0.48299819231033325, + "learning_rate": 0.0005476470588235293, + "loss": 0.4117, + "step": 16260 + }, + { + "epoch": 9.084357541899442, + "grad_norm": 0.44618839025497437, + "learning_rate": 0.0005476190476190477, + "loss": 0.4477, + "step": 16261 + }, + { + "epoch": 9.084916201117318, + "grad_norm": 0.7945277690887451, + "learning_rate": 0.0005475910364145659, + "loss": 0.4485, + "step": 16262 + }, + { + "epoch": 9.085474860335195, + "grad_norm": 0.5267883539199829, + "learning_rate": 0.0005475630252100841, + "loss": 0.3896, + "step": 16263 + }, + { + "epoch": 9.086033519553073, + "grad_norm": 0.3976774215698242, + "learning_rate": 0.0005475350140056023, + "loss": 0.3821, + "step": 16264 + }, + { + "epoch": 9.08659217877095, + "grad_norm": 0.5658947825431824, + "learning_rate": 0.0005475070028011205, + "loss": 0.4726, + "step": 16265 + }, + { + "epoch": 9.087150837988826, + "grad_norm": 0.46620821952819824, + "learning_rate": 0.0005474789915966387, + "loss": 0.3273, + "step": 16266 + }, + { + "epoch": 9.087709497206705, + "grad_norm": 0.4152989983558655, + "learning_rate": 0.0005474509803921569, + "loss": 0.3535, + "step": 16267 + }, + { + "epoch": 9.088268156424581, + "grad_norm": 0.4430373013019562, + "learning_rate": 0.0005474229691876751, + "loss": 0.4914, + "step": 16268 + }, + { + "epoch": 9.088826815642458, + "grad_norm": 0.6288483142852783, + "learning_rate": 0.0005473949579831933, + "loss": 0.4947, + "step": 16269 + }, + { + "epoch": 9.089385474860336, + "grad_norm": 0.5687662959098816, + "learning_rate": 0.0005473669467787115, + "loss": 0.4216, + "step": 16270 + }, + { + "epoch": 9.089944134078213, + "grad_norm": 0.5555469393730164, + "learning_rate": 0.0005473389355742297, + "loss": 0.4108, + "step": 16271 + }, + { + "epoch": 9.09050279329609, + "grad_norm": 0.6849669814109802, + "learning_rate": 0.0005473109243697479, + "loss": 0.5279, + "step": 16272 + }, + { + "epoch": 9.091061452513966, + "grad_norm": 0.3727986514568329, + "learning_rate": 0.0005472829131652661, + "loss": 0.316, + "step": 16273 + }, + { + "epoch": 9.091620111731844, + "grad_norm": 0.6479452252388, + "learning_rate": 0.0005472549019607843, + "loss": 0.427, + "step": 16274 + }, + { + "epoch": 9.09217877094972, + "grad_norm": 1.1027398109436035, + "learning_rate": 0.0005472268907563025, + "loss": 0.4555, + "step": 16275 + }, + { + "epoch": 9.092737430167597, + "grad_norm": 0.7032692432403564, + "learning_rate": 0.0005471988795518207, + "loss": 0.4967, + "step": 16276 + }, + { + "epoch": 9.093296089385476, + "grad_norm": 1.9388405084609985, + "learning_rate": 0.000547170868347339, + "loss": 0.5312, + "step": 16277 + }, + { + "epoch": 9.093854748603352, + "grad_norm": 0.7738831639289856, + "learning_rate": 0.0005471428571428572, + "loss": 0.4959, + "step": 16278 + }, + { + "epoch": 9.094413407821229, + "grad_norm": 0.6083651185035706, + "learning_rate": 0.0005471148459383754, + "loss": 0.4799, + "step": 16279 + }, + { + "epoch": 9.094972067039107, + "grad_norm": 0.5386562943458557, + "learning_rate": 0.0005470868347338936, + "loss": 0.4181, + "step": 16280 + }, + { + "epoch": 9.095530726256984, + "grad_norm": 0.5231610536575317, + "learning_rate": 0.0005470588235294119, + "loss": 0.451, + "step": 16281 + }, + { + "epoch": 9.09608938547486, + "grad_norm": 0.4870569407939911, + "learning_rate": 0.00054703081232493, + "loss": 0.4537, + "step": 16282 + }, + { + "epoch": 9.096648044692737, + "grad_norm": 0.6581200361251831, + "learning_rate": 0.0005470028011204482, + "loss": 0.571, + "step": 16283 + }, + { + "epoch": 9.097206703910615, + "grad_norm": 0.41583675146102905, + "learning_rate": 0.0005469747899159664, + "loss": 0.4759, + "step": 16284 + }, + { + "epoch": 9.097765363128492, + "grad_norm": 0.5102593898773193, + "learning_rate": 0.0005469467787114846, + "loss": 0.4556, + "step": 16285 + }, + { + "epoch": 9.098324022346368, + "grad_norm": 0.3559733033180237, + "learning_rate": 0.0005469187675070029, + "loss": 0.341, + "step": 16286 + }, + { + "epoch": 9.098882681564247, + "grad_norm": 0.7939794063568115, + "learning_rate": 0.000546890756302521, + "loss": 0.4808, + "step": 16287 + }, + { + "epoch": 9.099441340782123, + "grad_norm": 0.4100939929485321, + "learning_rate": 0.0005468627450980392, + "loss": 0.3925, + "step": 16288 + }, + { + "epoch": 9.1, + "grad_norm": 0.45097044110298157, + "learning_rate": 0.0005468347338935574, + "loss": 0.4335, + "step": 16289 + }, + { + "epoch": 9.100558659217878, + "grad_norm": 0.4148938059806824, + "learning_rate": 0.0005468067226890756, + "loss": 0.4388, + "step": 16290 + }, + { + "epoch": 9.101117318435755, + "grad_norm": 0.5107712745666504, + "learning_rate": 0.0005467787114845938, + "loss": 0.39, + "step": 16291 + }, + { + "epoch": 9.101675977653631, + "grad_norm": 0.7259695529937744, + "learning_rate": 0.000546750700280112, + "loss": 0.5608, + "step": 16292 + }, + { + "epoch": 9.102234636871508, + "grad_norm": 0.8637893199920654, + "learning_rate": 0.0005467226890756302, + "loss": 0.4, + "step": 16293 + }, + { + "epoch": 9.102793296089386, + "grad_norm": 0.738400936126709, + "learning_rate": 0.0005466946778711484, + "loss": 0.4209, + "step": 16294 + }, + { + "epoch": 9.103351955307263, + "grad_norm": 0.3919229507446289, + "learning_rate": 0.0005466666666666667, + "loss": 0.3675, + "step": 16295 + }, + { + "epoch": 9.10391061452514, + "grad_norm": 1.9260445833206177, + "learning_rate": 0.0005466386554621849, + "loss": 0.4413, + "step": 16296 + }, + { + "epoch": 9.104469273743018, + "grad_norm": 0.8303425312042236, + "learning_rate": 0.0005466106442577032, + "loss": 0.398, + "step": 16297 + }, + { + "epoch": 9.105027932960894, + "grad_norm": 2.134166955947876, + "learning_rate": 0.0005465826330532213, + "loss": 0.561, + "step": 16298 + }, + { + "epoch": 9.10558659217877, + "grad_norm": 0.895537793636322, + "learning_rate": 0.0005465546218487395, + "loss": 0.5047, + "step": 16299 + }, + { + "epoch": 9.106145251396647, + "grad_norm": 0.4418204426765442, + "learning_rate": 0.0005465266106442577, + "loss": 0.4127, + "step": 16300 + }, + { + "epoch": 9.106703910614526, + "grad_norm": 0.43009352684020996, + "learning_rate": 0.0005464985994397759, + "loss": 0.3654, + "step": 16301 + }, + { + "epoch": 9.107262569832402, + "grad_norm": 0.4846309423446655, + "learning_rate": 0.0005464705882352942, + "loss": 0.4188, + "step": 16302 + }, + { + "epoch": 9.107821229050279, + "grad_norm": 0.6969563961029053, + "learning_rate": 0.0005464425770308123, + "loss": 0.5873, + "step": 16303 + }, + { + "epoch": 9.108379888268157, + "grad_norm": 0.5111419558525085, + "learning_rate": 0.0005464145658263305, + "loss": 0.3658, + "step": 16304 + }, + { + "epoch": 9.108938547486034, + "grad_norm": 0.5028784275054932, + "learning_rate": 0.0005463865546218487, + "loss": 0.5035, + "step": 16305 + }, + { + "epoch": 9.10949720670391, + "grad_norm": 0.4106832444667816, + "learning_rate": 0.0005463585434173669, + "loss": 0.3582, + "step": 16306 + }, + { + "epoch": 9.110055865921789, + "grad_norm": 0.45648127794265747, + "learning_rate": 0.0005463305322128852, + "loss": 0.3728, + "step": 16307 + }, + { + "epoch": 9.110614525139665, + "grad_norm": 1.281018853187561, + "learning_rate": 0.0005463025210084033, + "loss": 0.5196, + "step": 16308 + }, + { + "epoch": 9.111173184357542, + "grad_norm": 0.6316792964935303, + "learning_rate": 0.0005462745098039215, + "loss": 0.4413, + "step": 16309 + }, + { + "epoch": 9.111731843575418, + "grad_norm": 0.5648047924041748, + "learning_rate": 0.0005462464985994397, + "loss": 0.4137, + "step": 16310 + }, + { + "epoch": 9.112290502793297, + "grad_norm": 0.524459719657898, + "learning_rate": 0.000546218487394958, + "loss": 0.5085, + "step": 16311 + }, + { + "epoch": 9.112849162011173, + "grad_norm": 0.5050011277198792, + "learning_rate": 0.0005461904761904763, + "loss": 0.3521, + "step": 16312 + }, + { + "epoch": 9.11340782122905, + "grad_norm": 0.6483074426651001, + "learning_rate": 0.0005461624649859945, + "loss": 0.4539, + "step": 16313 + }, + { + "epoch": 9.113966480446928, + "grad_norm": 0.542499840259552, + "learning_rate": 0.0005461344537815126, + "loss": 0.4493, + "step": 16314 + }, + { + "epoch": 9.114525139664805, + "grad_norm": 0.3561041057109833, + "learning_rate": 0.0005461064425770308, + "loss": 0.4473, + "step": 16315 + }, + { + "epoch": 9.115083798882681, + "grad_norm": 2.08398699760437, + "learning_rate": 0.000546078431372549, + "loss": 0.5668, + "step": 16316 + }, + { + "epoch": 9.11564245810056, + "grad_norm": 0.523814857006073, + "learning_rate": 0.0005460504201680673, + "loss": 0.4261, + "step": 16317 + }, + { + "epoch": 9.116201117318436, + "grad_norm": 0.5716768503189087, + "learning_rate": 0.0005460224089635855, + "loss": 0.5053, + "step": 16318 + }, + { + "epoch": 9.116759776536313, + "grad_norm": 0.5685926675796509, + "learning_rate": 0.0005459943977591036, + "loss": 0.4171, + "step": 16319 + }, + { + "epoch": 9.11731843575419, + "grad_norm": 0.5755152106285095, + "learning_rate": 0.0005459663865546218, + "loss": 0.5121, + "step": 16320 + }, + { + "epoch": 9.117877094972068, + "grad_norm": 0.4981805086135864, + "learning_rate": 0.00054593837535014, + "loss": 0.4212, + "step": 16321 + }, + { + "epoch": 9.118435754189944, + "grad_norm": 0.4252035319805145, + "learning_rate": 0.0005459103641456583, + "loss": 0.4311, + "step": 16322 + }, + { + "epoch": 9.11899441340782, + "grad_norm": 0.42670243978500366, + "learning_rate": 0.0005458823529411765, + "loss": 0.4606, + "step": 16323 + }, + { + "epoch": 9.119553072625699, + "grad_norm": 0.6980792284011841, + "learning_rate": 0.0005458543417366946, + "loss": 0.5582, + "step": 16324 + }, + { + "epoch": 9.120111731843576, + "grad_norm": 2.64755916595459, + "learning_rate": 0.0005458263305322128, + "loss": 0.4501, + "step": 16325 + }, + { + "epoch": 9.120670391061452, + "grad_norm": 1.0233116149902344, + "learning_rate": 0.000545798319327731, + "loss": 0.393, + "step": 16326 + }, + { + "epoch": 9.121229050279329, + "grad_norm": 0.396380752325058, + "learning_rate": 0.0005457703081232494, + "loss": 0.3441, + "step": 16327 + }, + { + "epoch": 9.121787709497207, + "grad_norm": 0.512162983417511, + "learning_rate": 0.0005457422969187676, + "loss": 0.4543, + "step": 16328 + }, + { + "epoch": 9.122346368715084, + "grad_norm": 0.5223424434661865, + "learning_rate": 0.0005457142857142858, + "loss": 0.4577, + "step": 16329 + }, + { + "epoch": 9.12290502793296, + "grad_norm": 7.071179389953613, + "learning_rate": 0.0005456862745098039, + "loss": 0.468, + "step": 16330 + }, + { + "epoch": 9.123463687150839, + "grad_norm": 0.42750778794288635, + "learning_rate": 0.0005456582633053221, + "loss": 0.4144, + "step": 16331 + }, + { + "epoch": 9.124022346368715, + "grad_norm": 0.946935772895813, + "learning_rate": 0.0005456302521008404, + "loss": 0.5107, + "step": 16332 + }, + { + "epoch": 9.124581005586592, + "grad_norm": 0.4396444261074066, + "learning_rate": 0.0005456022408963586, + "loss": 0.3985, + "step": 16333 + }, + { + "epoch": 9.12513966480447, + "grad_norm": 0.7168882489204407, + "learning_rate": 0.0005455742296918768, + "loss": 0.4534, + "step": 16334 + }, + { + "epoch": 9.125698324022347, + "grad_norm": 0.48841142654418945, + "learning_rate": 0.0005455462184873949, + "loss": 0.5296, + "step": 16335 + }, + { + "epoch": 9.126256983240223, + "grad_norm": 0.6995057463645935, + "learning_rate": 0.0005455182072829131, + "loss": 0.4467, + "step": 16336 + }, + { + "epoch": 9.1268156424581, + "grad_norm": 4.894993305206299, + "learning_rate": 0.0005454901960784314, + "loss": 0.3613, + "step": 16337 + }, + { + "epoch": 9.127374301675978, + "grad_norm": 0.6327822804450989, + "learning_rate": 0.0005454621848739496, + "loss": 0.3516, + "step": 16338 + }, + { + "epoch": 9.127932960893855, + "grad_norm": 0.4114134907722473, + "learning_rate": 0.0005454341736694678, + "loss": 0.4014, + "step": 16339 + }, + { + "epoch": 9.128491620111731, + "grad_norm": 0.41836825013160706, + "learning_rate": 0.0005454061624649859, + "loss": 0.4139, + "step": 16340 + }, + { + "epoch": 9.12905027932961, + "grad_norm": 0.42986246943473816, + "learning_rate": 0.0005453781512605041, + "loss": 0.4561, + "step": 16341 + }, + { + "epoch": 9.129608938547486, + "grad_norm": 0.5191091895103455, + "learning_rate": 0.0005453501400560224, + "loss": 0.3996, + "step": 16342 + }, + { + "epoch": 9.130167597765363, + "grad_norm": 0.52755206823349, + "learning_rate": 0.0005453221288515407, + "loss": 0.3932, + "step": 16343 + }, + { + "epoch": 9.130726256983241, + "grad_norm": 0.34361231327056885, + "learning_rate": 0.0005452941176470589, + "loss": 0.3027, + "step": 16344 + }, + { + "epoch": 9.131284916201118, + "grad_norm": 0.3853558301925659, + "learning_rate": 0.0005452661064425771, + "loss": 0.4704, + "step": 16345 + }, + { + "epoch": 9.131843575418994, + "grad_norm": 0.5141075253486633, + "learning_rate": 0.0005452380952380952, + "loss": 0.3316, + "step": 16346 + }, + { + "epoch": 9.13240223463687, + "grad_norm": 1.0674580335617065, + "learning_rate": 0.0005452100840336135, + "loss": 0.5051, + "step": 16347 + }, + { + "epoch": 9.132960893854749, + "grad_norm": 2.7568652629852295, + "learning_rate": 0.0005451820728291317, + "loss": 0.7126, + "step": 16348 + }, + { + "epoch": 9.133519553072626, + "grad_norm": 0.5429947376251221, + "learning_rate": 0.0005451540616246499, + "loss": 0.5427, + "step": 16349 + }, + { + "epoch": 9.134078212290502, + "grad_norm": 0.45803746581077576, + "learning_rate": 0.0005451260504201681, + "loss": 0.465, + "step": 16350 + }, + { + "epoch": 9.13463687150838, + "grad_norm": 1.558245062828064, + "learning_rate": 0.0005450980392156862, + "loss": 0.5021, + "step": 16351 + }, + { + "epoch": 9.135195530726257, + "grad_norm": 0.4956776797771454, + "learning_rate": 0.0005450700280112045, + "loss": 0.4833, + "step": 16352 + }, + { + "epoch": 9.135754189944134, + "grad_norm": 0.40609315037727356, + "learning_rate": 0.0005450420168067227, + "loss": 0.3424, + "step": 16353 + }, + { + "epoch": 9.136312849162012, + "grad_norm": 0.5548574328422546, + "learning_rate": 0.0005450140056022409, + "loss": 0.3084, + "step": 16354 + }, + { + "epoch": 9.136871508379889, + "grad_norm": 0.5057917833328247, + "learning_rate": 0.0005449859943977591, + "loss": 0.5042, + "step": 16355 + }, + { + "epoch": 9.137430167597765, + "grad_norm": 0.5761809349060059, + "learning_rate": 0.0005449579831932772, + "loss": 0.4063, + "step": 16356 + }, + { + "epoch": 9.137988826815642, + "grad_norm": 0.4334220588207245, + "learning_rate": 0.0005449299719887955, + "loss": 0.5039, + "step": 16357 + }, + { + "epoch": 9.13854748603352, + "grad_norm": 0.8143592476844788, + "learning_rate": 0.0005449019607843137, + "loss": 0.4099, + "step": 16358 + }, + { + "epoch": 9.139106145251397, + "grad_norm": 0.7351080179214478, + "learning_rate": 0.000544873949579832, + "loss": 0.5194, + "step": 16359 + }, + { + "epoch": 9.139664804469273, + "grad_norm": 0.37233632802963257, + "learning_rate": 0.0005448459383753502, + "loss": 0.3753, + "step": 16360 + }, + { + "epoch": 9.140223463687152, + "grad_norm": 0.36705389618873596, + "learning_rate": 0.0005448179271708684, + "loss": 0.2991, + "step": 16361 + }, + { + "epoch": 9.140782122905028, + "grad_norm": 1.0420055389404297, + "learning_rate": 0.0005447899159663866, + "loss": 0.4184, + "step": 16362 + }, + { + "epoch": 9.141340782122905, + "grad_norm": 0.6832417249679565, + "learning_rate": 0.0005447619047619048, + "loss": 0.6216, + "step": 16363 + }, + { + "epoch": 9.141899441340781, + "grad_norm": 0.7023866772651672, + "learning_rate": 0.000544733893557423, + "loss": 0.4298, + "step": 16364 + }, + { + "epoch": 9.14245810055866, + "grad_norm": 0.4837002456188202, + "learning_rate": 0.0005447058823529412, + "loss": 0.5234, + "step": 16365 + }, + { + "epoch": 9.143016759776536, + "grad_norm": 0.45409059524536133, + "learning_rate": 0.0005446778711484594, + "loss": 0.3929, + "step": 16366 + }, + { + "epoch": 9.143575418994413, + "grad_norm": 6.828324317932129, + "learning_rate": 0.0005446498599439776, + "loss": 0.4482, + "step": 16367 + }, + { + "epoch": 9.144134078212291, + "grad_norm": 0.5492183566093445, + "learning_rate": 0.0005446218487394958, + "loss": 0.5474, + "step": 16368 + }, + { + "epoch": 9.144692737430168, + "grad_norm": 0.5949881672859192, + "learning_rate": 0.000544593837535014, + "loss": 0.5293, + "step": 16369 + }, + { + "epoch": 9.145251396648044, + "grad_norm": 0.5678888559341431, + "learning_rate": 0.0005445658263305322, + "loss": 0.4042, + "step": 16370 + }, + { + "epoch": 9.145810055865923, + "grad_norm": 0.3871781527996063, + "learning_rate": 0.0005445378151260504, + "loss": 0.3747, + "step": 16371 + }, + { + "epoch": 9.1463687150838, + "grad_norm": 0.5532249212265015, + "learning_rate": 0.0005445098039215687, + "loss": 0.422, + "step": 16372 + }, + { + "epoch": 9.146927374301676, + "grad_norm": 0.47664597630500793, + "learning_rate": 0.0005444817927170868, + "loss": 0.4691, + "step": 16373 + }, + { + "epoch": 9.147486033519552, + "grad_norm": 0.46502628922462463, + "learning_rate": 0.000544453781512605, + "loss": 0.4153, + "step": 16374 + }, + { + "epoch": 9.14804469273743, + "grad_norm": 0.535905659198761, + "learning_rate": 0.0005444257703081232, + "loss": 0.4587, + "step": 16375 + }, + { + "epoch": 9.148603351955307, + "grad_norm": 0.4472079575061798, + "learning_rate": 0.0005443977591036414, + "loss": 0.3934, + "step": 16376 + }, + { + "epoch": 9.149162011173184, + "grad_norm": 0.4134097397327423, + "learning_rate": 0.0005443697478991598, + "loss": 0.432, + "step": 16377 + }, + { + "epoch": 9.149720670391062, + "grad_norm": 0.5542639493942261, + "learning_rate": 0.0005443417366946779, + "loss": 0.3759, + "step": 16378 + }, + { + "epoch": 9.150279329608939, + "grad_norm": 0.8177208304405212, + "learning_rate": 0.0005443137254901961, + "loss": 0.4236, + "step": 16379 + }, + { + "epoch": 9.150837988826815, + "grad_norm": 0.514380156993866, + "learning_rate": 0.0005442857142857143, + "loss": 0.4575, + "step": 16380 + }, + { + "epoch": 9.151396648044694, + "grad_norm": 0.4157094359397888, + "learning_rate": 0.0005442577030812325, + "loss": 0.358, + "step": 16381 + }, + { + "epoch": 9.15195530726257, + "grad_norm": 0.4507540166378021, + "learning_rate": 0.0005442296918767508, + "loss": 0.3936, + "step": 16382 + }, + { + "epoch": 9.152513966480447, + "grad_norm": 0.7144804000854492, + "learning_rate": 0.0005442016806722689, + "loss": 0.4704, + "step": 16383 + }, + { + "epoch": 9.153072625698323, + "grad_norm": 0.33832982182502747, + "learning_rate": 0.0005441736694677871, + "loss": 0.4255, + "step": 16384 + }, + { + "epoch": 9.153631284916202, + "grad_norm": 0.6611616015434265, + "learning_rate": 0.0005441456582633053, + "loss": 0.413, + "step": 16385 + }, + { + "epoch": 9.154189944134078, + "grad_norm": 0.43906164169311523, + "learning_rate": 0.0005441176470588235, + "loss": 0.4903, + "step": 16386 + }, + { + "epoch": 9.154748603351955, + "grad_norm": 0.5179803967475891, + "learning_rate": 0.0005440896358543418, + "loss": 0.5272, + "step": 16387 + }, + { + "epoch": 9.155307262569833, + "grad_norm": 0.5361232757568359, + "learning_rate": 0.00054406162464986, + "loss": 0.3685, + "step": 16388 + }, + { + "epoch": 9.15586592178771, + "grad_norm": 0.6487807035446167, + "learning_rate": 0.0005440336134453781, + "loss": 0.4931, + "step": 16389 + }, + { + "epoch": 9.156424581005586, + "grad_norm": 0.3948875665664673, + "learning_rate": 0.0005440056022408963, + "loss": 0.3591, + "step": 16390 + }, + { + "epoch": 9.156983240223465, + "grad_norm": 0.4617760479450226, + "learning_rate": 0.0005439775910364145, + "loss": 0.4945, + "step": 16391 + }, + { + "epoch": 9.157541899441341, + "grad_norm": 0.5252341032028198, + "learning_rate": 0.0005439495798319329, + "loss": 0.4994, + "step": 16392 + }, + { + "epoch": 9.158100558659218, + "grad_norm": 0.3993321359157562, + "learning_rate": 0.0005439215686274511, + "loss": 0.535, + "step": 16393 + }, + { + "epoch": 9.158659217877094, + "grad_norm": 0.529315173625946, + "learning_rate": 0.0005438935574229692, + "loss": 0.3762, + "step": 16394 + }, + { + "epoch": 9.159217877094973, + "grad_norm": 0.7219976186752319, + "learning_rate": 0.0005438655462184874, + "loss": 0.5184, + "step": 16395 + }, + { + "epoch": 9.15977653631285, + "grad_norm": 0.7702376246452332, + "learning_rate": 0.0005438375350140056, + "loss": 0.4463, + "step": 16396 + }, + { + "epoch": 9.160335195530726, + "grad_norm": 0.4217546880245209, + "learning_rate": 0.0005438095238095239, + "loss": 0.4454, + "step": 16397 + }, + { + "epoch": 9.160893854748604, + "grad_norm": 0.5808950662612915, + "learning_rate": 0.0005437815126050421, + "loss": 0.3412, + "step": 16398 + }, + { + "epoch": 9.16145251396648, + "grad_norm": 0.3684050142765045, + "learning_rate": 0.0005437535014005602, + "loss": 0.3591, + "step": 16399 + }, + { + "epoch": 9.162011173184357, + "grad_norm": 0.5749147534370422, + "learning_rate": 0.0005437254901960784, + "loss": 0.4092, + "step": 16400 + }, + { + "epoch": 9.162569832402234, + "grad_norm": 0.43909958004951477, + "learning_rate": 0.0005436974789915966, + "loss": 0.4478, + "step": 16401 + }, + { + "epoch": 9.163128491620112, + "grad_norm": 0.5004404187202454, + "learning_rate": 0.0005436694677871149, + "loss": 0.4707, + "step": 16402 + }, + { + "epoch": 9.163687150837989, + "grad_norm": 0.3971354067325592, + "learning_rate": 0.0005436414565826331, + "loss": 0.336, + "step": 16403 + }, + { + "epoch": 9.164245810055865, + "grad_norm": 0.48322901129722595, + "learning_rate": 0.0005436134453781513, + "loss": 0.4363, + "step": 16404 + }, + { + "epoch": 9.164804469273744, + "grad_norm": 0.36465978622436523, + "learning_rate": 0.0005435854341736694, + "loss": 0.295, + "step": 16405 + }, + { + "epoch": 9.16536312849162, + "grad_norm": 13.25984001159668, + "learning_rate": 0.0005435574229691876, + "loss": 0.4929, + "step": 16406 + }, + { + "epoch": 9.165921787709497, + "grad_norm": 0.5462678074836731, + "learning_rate": 0.0005435294117647059, + "loss": 0.3636, + "step": 16407 + }, + { + "epoch": 9.166480446927375, + "grad_norm": 0.6351718902587891, + "learning_rate": 0.0005435014005602241, + "loss": 0.3828, + "step": 16408 + }, + { + "epoch": 9.167039106145252, + "grad_norm": 0.31751272082328796, + "learning_rate": 0.0005434733893557424, + "loss": 0.3198, + "step": 16409 + }, + { + "epoch": 9.167597765363128, + "grad_norm": 0.2950994670391083, + "learning_rate": 0.0005434453781512605, + "loss": 0.3686, + "step": 16410 + }, + { + "epoch": 9.168156424581005, + "grad_norm": 0.5025097727775574, + "learning_rate": 0.0005434173669467787, + "loss": 0.4293, + "step": 16411 + }, + { + "epoch": 9.168715083798883, + "grad_norm": 0.632214367389679, + "learning_rate": 0.000543389355742297, + "loss": 0.4881, + "step": 16412 + }, + { + "epoch": 9.16927374301676, + "grad_norm": 0.6147112846374512, + "learning_rate": 0.0005433613445378152, + "loss": 0.3965, + "step": 16413 + }, + { + "epoch": 9.169832402234636, + "grad_norm": 0.6025201678276062, + "learning_rate": 0.0005433333333333334, + "loss": 0.4268, + "step": 16414 + }, + { + "epoch": 9.170391061452515, + "grad_norm": 0.3863448202610016, + "learning_rate": 0.0005433053221288515, + "loss": 0.464, + "step": 16415 + }, + { + "epoch": 9.170949720670391, + "grad_norm": 0.41197606921195984, + "learning_rate": 0.0005432773109243697, + "loss": 0.4562, + "step": 16416 + }, + { + "epoch": 9.171508379888268, + "grad_norm": 0.3912128210067749, + "learning_rate": 0.000543249299719888, + "loss": 0.4079, + "step": 16417 + }, + { + "epoch": 9.172067039106146, + "grad_norm": 0.3998691439628601, + "learning_rate": 0.0005432212885154062, + "loss": 0.5347, + "step": 16418 + }, + { + "epoch": 9.172625698324023, + "grad_norm": 0.35939306020736694, + "learning_rate": 0.0005431932773109244, + "loss": 0.3342, + "step": 16419 + }, + { + "epoch": 9.1731843575419, + "grad_norm": 0.3489071726799011, + "learning_rate": 0.0005431652661064426, + "loss": 0.3803, + "step": 16420 + }, + { + "epoch": 9.173743016759776, + "grad_norm": 0.6111111640930176, + "learning_rate": 0.0005431372549019607, + "loss": 0.5551, + "step": 16421 + }, + { + "epoch": 9.174301675977654, + "grad_norm": 0.6878377199172974, + "learning_rate": 0.000543109243697479, + "loss": 0.3191, + "step": 16422 + }, + { + "epoch": 9.17486033519553, + "grad_norm": 0.908305287361145, + "learning_rate": 0.0005430812324929972, + "loss": 0.5914, + "step": 16423 + }, + { + "epoch": 9.175418994413407, + "grad_norm": 1.1644045114517212, + "learning_rate": 0.0005430532212885154, + "loss": 0.3874, + "step": 16424 + }, + { + "epoch": 9.175977653631286, + "grad_norm": 2.926616668701172, + "learning_rate": 0.0005430252100840337, + "loss": 0.3941, + "step": 16425 + }, + { + "epoch": 9.176536312849162, + "grad_norm": 1.587501049041748, + "learning_rate": 0.0005429971988795517, + "loss": 0.448, + "step": 16426 + }, + { + "epoch": 9.177094972067039, + "grad_norm": 0.8188040256500244, + "learning_rate": 0.0005429691876750701, + "loss": 0.3955, + "step": 16427 + }, + { + "epoch": 9.177653631284917, + "grad_norm": 0.5032440423965454, + "learning_rate": 0.0005429411764705883, + "loss": 0.4582, + "step": 16428 + }, + { + "epoch": 9.178212290502794, + "grad_norm": 0.5606140494346619, + "learning_rate": 0.0005429131652661065, + "loss": 0.5445, + "step": 16429 + }, + { + "epoch": 9.17877094972067, + "grad_norm": 0.5652497410774231, + "learning_rate": 0.0005428851540616247, + "loss": 0.4381, + "step": 16430 + }, + { + "epoch": 9.179329608938547, + "grad_norm": 0.40602239966392517, + "learning_rate": 0.0005428571428571428, + "loss": 0.4297, + "step": 16431 + }, + { + "epoch": 9.179888268156425, + "grad_norm": 0.519846498966217, + "learning_rate": 0.0005428291316526611, + "loss": 0.4639, + "step": 16432 + }, + { + "epoch": 9.180446927374302, + "grad_norm": 1.7221403121948242, + "learning_rate": 0.0005428011204481793, + "loss": 0.3747, + "step": 16433 + }, + { + "epoch": 9.181005586592178, + "grad_norm": 0.6046132445335388, + "learning_rate": 0.0005427731092436975, + "loss": 0.4141, + "step": 16434 + }, + { + "epoch": 9.181564245810057, + "grad_norm": 0.37778180837631226, + "learning_rate": 0.0005427450980392157, + "loss": 0.4471, + "step": 16435 + }, + { + "epoch": 9.182122905027933, + "grad_norm": 0.6673175692558289, + "learning_rate": 0.0005427170868347339, + "loss": 0.3409, + "step": 16436 + }, + { + "epoch": 9.18268156424581, + "grad_norm": 0.4877397418022156, + "learning_rate": 0.0005426890756302521, + "loss": 0.5284, + "step": 16437 + }, + { + "epoch": 9.183240223463686, + "grad_norm": 1.0286041498184204, + "learning_rate": 0.0005426610644257703, + "loss": 0.3731, + "step": 16438 + }, + { + "epoch": 9.183798882681565, + "grad_norm": 0.5005021095275879, + "learning_rate": 0.0005426330532212885, + "loss": 0.375, + "step": 16439 + }, + { + "epoch": 9.184357541899441, + "grad_norm": 0.9342933893203735, + "learning_rate": 0.0005426050420168067, + "loss": 0.5854, + "step": 16440 + }, + { + "epoch": 9.184916201117318, + "grad_norm": 0.5026494264602661, + "learning_rate": 0.000542577030812325, + "loss": 0.405, + "step": 16441 + }, + { + "epoch": 9.185474860335196, + "grad_norm": 0.4556749165058136, + "learning_rate": 0.0005425490196078432, + "loss": 0.427, + "step": 16442 + }, + { + "epoch": 9.186033519553073, + "grad_norm": 0.5704842805862427, + "learning_rate": 0.0005425210084033614, + "loss": 0.4343, + "step": 16443 + }, + { + "epoch": 9.18659217877095, + "grad_norm": 0.5358864665031433, + "learning_rate": 0.0005424929971988796, + "loss": 0.4051, + "step": 16444 + }, + { + "epoch": 9.187150837988828, + "grad_norm": 0.6559305191040039, + "learning_rate": 0.0005424649859943978, + "loss": 0.4203, + "step": 16445 + }, + { + "epoch": 9.187709497206704, + "grad_norm": 0.6133913993835449, + "learning_rate": 0.000542436974789916, + "loss": 0.4764, + "step": 16446 + }, + { + "epoch": 9.18826815642458, + "grad_norm": 0.39269503951072693, + "learning_rate": 0.0005424089635854342, + "loss": 0.3491, + "step": 16447 + }, + { + "epoch": 9.188826815642457, + "grad_norm": 0.4439775347709656, + "learning_rate": 0.0005423809523809524, + "loss": 0.4143, + "step": 16448 + }, + { + "epoch": 9.189385474860336, + "grad_norm": 0.49662068486213684, + "learning_rate": 0.0005423529411764706, + "loss": 0.3804, + "step": 16449 + }, + { + "epoch": 9.189944134078212, + "grad_norm": 0.5026037693023682, + "learning_rate": 0.0005423249299719888, + "loss": 0.3871, + "step": 16450 + }, + { + "epoch": 9.190502793296089, + "grad_norm": 0.3771837055683136, + "learning_rate": 0.000542296918767507, + "loss": 0.3394, + "step": 16451 + }, + { + "epoch": 9.191061452513967, + "grad_norm": 0.34782183170318604, + "learning_rate": 0.0005422689075630253, + "loss": 0.3255, + "step": 16452 + }, + { + "epoch": 9.191620111731844, + "grad_norm": 0.3992161452770233, + "learning_rate": 0.0005422408963585434, + "loss": 0.4664, + "step": 16453 + }, + { + "epoch": 9.19217877094972, + "grad_norm": 0.6021296977996826, + "learning_rate": 0.0005422128851540616, + "loss": 0.4732, + "step": 16454 + }, + { + "epoch": 9.192737430167599, + "grad_norm": 0.4611443877220154, + "learning_rate": 0.0005421848739495798, + "loss": 0.4681, + "step": 16455 + }, + { + "epoch": 9.193296089385475, + "grad_norm": 0.3801502585411072, + "learning_rate": 0.000542156862745098, + "loss": 0.3517, + "step": 16456 + }, + { + "epoch": 9.193854748603352, + "grad_norm": 0.43669968843460083, + "learning_rate": 0.0005421288515406164, + "loss": 0.4708, + "step": 16457 + }, + { + "epoch": 9.194413407821228, + "grad_norm": 0.44060125946998596, + "learning_rate": 0.0005421008403361344, + "loss": 0.4519, + "step": 16458 + }, + { + "epoch": 9.194972067039107, + "grad_norm": 4.516201019287109, + "learning_rate": 0.0005420728291316527, + "loss": 0.523, + "step": 16459 + }, + { + "epoch": 9.195530726256983, + "grad_norm": 0.7689078450202942, + "learning_rate": 0.0005420448179271709, + "loss": 0.3896, + "step": 16460 + }, + { + "epoch": 9.19608938547486, + "grad_norm": 2.603292465209961, + "learning_rate": 0.0005420168067226891, + "loss": 0.3577, + "step": 16461 + }, + { + "epoch": 9.196648044692738, + "grad_norm": 0.4897790551185608, + "learning_rate": 0.0005419887955182074, + "loss": 0.3882, + "step": 16462 + }, + { + "epoch": 9.197206703910615, + "grad_norm": 0.4763418436050415, + "learning_rate": 0.0005419607843137255, + "loss": 0.5867, + "step": 16463 + }, + { + "epoch": 9.197765363128491, + "grad_norm": 0.5391951203346252, + "learning_rate": 0.0005419327731092437, + "loss": 0.4056, + "step": 16464 + }, + { + "epoch": 9.19832402234637, + "grad_norm": 1.0460638999938965, + "learning_rate": 0.0005419047619047619, + "loss": 0.4438, + "step": 16465 + }, + { + "epoch": 9.198882681564246, + "grad_norm": 0.5877397656440735, + "learning_rate": 0.0005418767507002801, + "loss": 0.466, + "step": 16466 + }, + { + "epoch": 9.199441340782123, + "grad_norm": 0.4966689646244049, + "learning_rate": 0.0005418487394957984, + "loss": 0.4797, + "step": 16467 + }, + { + "epoch": 9.2, + "grad_norm": 0.5362020134925842, + "learning_rate": 0.0005418207282913166, + "loss": 0.4477, + "step": 16468 + }, + { + "epoch": 9.200558659217878, + "grad_norm": 0.4578613042831421, + "learning_rate": 0.0005417927170868347, + "loss": 0.4791, + "step": 16469 + }, + { + "epoch": 9.201117318435754, + "grad_norm": 1.344063639640808, + "learning_rate": 0.0005417647058823529, + "loss": 0.4641, + "step": 16470 + }, + { + "epoch": 9.20167597765363, + "grad_norm": 0.6875057220458984, + "learning_rate": 0.0005417366946778711, + "loss": 0.4328, + "step": 16471 + }, + { + "epoch": 9.202234636871509, + "grad_norm": 1.160433292388916, + "learning_rate": 0.0005417086834733894, + "loss": 0.6508, + "step": 16472 + }, + { + "epoch": 9.202793296089386, + "grad_norm": 1.4570869207382202, + "learning_rate": 0.0005416806722689076, + "loss": 0.3674, + "step": 16473 + }, + { + "epoch": 9.203351955307262, + "grad_norm": 2.2910892963409424, + "learning_rate": 0.0005416526610644257, + "loss": 0.4061, + "step": 16474 + }, + { + "epoch": 9.203910614525139, + "grad_norm": 0.5284371376037598, + "learning_rate": 0.000541624649859944, + "loss": 0.5464, + "step": 16475 + }, + { + "epoch": 9.204469273743017, + "grad_norm": 0.5682401061058044, + "learning_rate": 0.0005415966386554622, + "loss": 0.4537, + "step": 16476 + }, + { + "epoch": 9.205027932960894, + "grad_norm": 0.5928774476051331, + "learning_rate": 0.0005415686274509805, + "loss": 0.4038, + "step": 16477 + }, + { + "epoch": 9.20558659217877, + "grad_norm": 0.39407432079315186, + "learning_rate": 0.0005415406162464987, + "loss": 0.3553, + "step": 16478 + }, + { + "epoch": 9.206145251396649, + "grad_norm": 0.526796817779541, + "learning_rate": 0.0005415126050420168, + "loss": 0.4353, + "step": 16479 + }, + { + "epoch": 9.206703910614525, + "grad_norm": 0.47668054699897766, + "learning_rate": 0.000541484593837535, + "loss": 0.5527, + "step": 16480 + }, + { + "epoch": 9.207262569832402, + "grad_norm": 0.5429408550262451, + "learning_rate": 0.0005414565826330532, + "loss": 0.4561, + "step": 16481 + }, + { + "epoch": 9.20782122905028, + "grad_norm": 0.4693089425563812, + "learning_rate": 0.0005414285714285715, + "loss": 0.3852, + "step": 16482 + }, + { + "epoch": 9.208379888268157, + "grad_norm": 0.48932701349258423, + "learning_rate": 0.0005414005602240897, + "loss": 0.4439, + "step": 16483 + }, + { + "epoch": 9.208938547486033, + "grad_norm": 0.4219457507133484, + "learning_rate": 0.0005413725490196079, + "loss": 0.3291, + "step": 16484 + }, + { + "epoch": 9.20949720670391, + "grad_norm": 0.38331130146980286, + "learning_rate": 0.000541344537815126, + "loss": 0.3829, + "step": 16485 + }, + { + "epoch": 9.210055865921788, + "grad_norm": 1.3285800218582153, + "learning_rate": 0.0005413165266106442, + "loss": 0.4194, + "step": 16486 + }, + { + "epoch": 9.210614525139665, + "grad_norm": 0.6707630753517151, + "learning_rate": 0.0005412885154061625, + "loss": 0.4472, + "step": 16487 + }, + { + "epoch": 9.211173184357541, + "grad_norm": 0.9894908666610718, + "learning_rate": 0.0005412605042016807, + "loss": 0.3662, + "step": 16488 + }, + { + "epoch": 9.21173184357542, + "grad_norm": 0.3866855204105377, + "learning_rate": 0.0005412324929971989, + "loss": 0.3997, + "step": 16489 + }, + { + "epoch": 9.212290502793296, + "grad_norm": 0.6083716154098511, + "learning_rate": 0.000541204481792717, + "loss": 0.4733, + "step": 16490 + }, + { + "epoch": 9.212849162011173, + "grad_norm": 0.5836795568466187, + "learning_rate": 0.0005411764705882352, + "loss": 0.5028, + "step": 16491 + }, + { + "epoch": 9.213407821229051, + "grad_norm": 0.6724944114685059, + "learning_rate": 0.0005411484593837536, + "loss": 0.4246, + "step": 16492 + }, + { + "epoch": 9.213966480446928, + "grad_norm": 0.5771934390068054, + "learning_rate": 0.0005411204481792718, + "loss": 0.4776, + "step": 16493 + }, + { + "epoch": 9.214525139664804, + "grad_norm": 1.0224010944366455, + "learning_rate": 0.00054109243697479, + "loss": 0.4138, + "step": 16494 + }, + { + "epoch": 9.21508379888268, + "grad_norm": 0.4859912395477295, + "learning_rate": 0.0005410644257703081, + "loss": 0.345, + "step": 16495 + }, + { + "epoch": 9.21564245810056, + "grad_norm": 0.4043622314929962, + "learning_rate": 0.0005410364145658263, + "loss": 0.446, + "step": 16496 + }, + { + "epoch": 9.216201117318436, + "grad_norm": 0.5242271423339844, + "learning_rate": 0.0005410084033613446, + "loss": 0.496, + "step": 16497 + }, + { + "epoch": 9.216759776536312, + "grad_norm": 0.47943079471588135, + "learning_rate": 0.0005409803921568628, + "loss": 0.4433, + "step": 16498 + }, + { + "epoch": 9.21731843575419, + "grad_norm": 0.4939671754837036, + "learning_rate": 0.000540952380952381, + "loss": 0.5138, + "step": 16499 + }, + { + "epoch": 9.217877094972067, + "grad_norm": 0.604418158531189, + "learning_rate": 0.0005409243697478992, + "loss": 0.5597, + "step": 16500 + }, + { + "epoch": 9.217877094972067, + "eval_cer": 0.08833316967256936, + "eval_loss": 0.33648619055747986, + "eval_runtime": 55.6829, + "eval_samples_per_second": 81.497, + "eval_steps_per_second": 5.1, + "eval_wer": 0.3492360548588028, + "step": 16500 + }, + { + "epoch": 9.218435754189944, + "grad_norm": 0.4354100525379181, + "learning_rate": 0.0005408963585434173, + "loss": 0.3938, + "step": 16501 + }, + { + "epoch": 9.21899441340782, + "grad_norm": 1.2374842166900635, + "learning_rate": 0.0005408683473389356, + "loss": 0.4506, + "step": 16502 + }, + { + "epoch": 9.219553072625699, + "grad_norm": 0.39970171451568604, + "learning_rate": 0.0005408403361344538, + "loss": 0.4018, + "step": 16503 + }, + { + "epoch": 9.220111731843575, + "grad_norm": 0.45534956455230713, + "learning_rate": 0.000540812324929972, + "loss": 0.4006, + "step": 16504 + }, + { + "epoch": 9.220670391061452, + "grad_norm": 0.8092905282974243, + "learning_rate": 0.0005407843137254902, + "loss": 0.4661, + "step": 16505 + }, + { + "epoch": 9.22122905027933, + "grad_norm": 0.604893684387207, + "learning_rate": 0.0005407563025210083, + "loss": 0.4511, + "step": 16506 + }, + { + "epoch": 9.221787709497207, + "grad_norm": 0.4756384491920471, + "learning_rate": 0.0005407282913165267, + "loss": 0.4438, + "step": 16507 + }, + { + "epoch": 9.222346368715083, + "grad_norm": 0.572142481803894, + "learning_rate": 0.0005407002801120449, + "loss": 0.5343, + "step": 16508 + }, + { + "epoch": 9.222905027932962, + "grad_norm": 0.43380674719810486, + "learning_rate": 0.0005406722689075631, + "loss": 0.3714, + "step": 16509 + }, + { + "epoch": 9.223463687150838, + "grad_norm": 0.644777774810791, + "learning_rate": 0.0005406442577030813, + "loss": 0.4151, + "step": 16510 + }, + { + "epoch": 9.224022346368715, + "grad_norm": 0.357843279838562, + "learning_rate": 0.0005406162464985994, + "loss": 0.3504, + "step": 16511 + }, + { + "epoch": 9.224581005586591, + "grad_norm": 0.4684431254863739, + "learning_rate": 0.0005405882352941176, + "loss": 0.5099, + "step": 16512 + }, + { + "epoch": 9.22513966480447, + "grad_norm": 0.7435186505317688, + "learning_rate": 0.0005405602240896359, + "loss": 0.4077, + "step": 16513 + }, + { + "epoch": 9.225698324022346, + "grad_norm": 0.7387958765029907, + "learning_rate": 0.0005405322128851541, + "loss": 0.5861, + "step": 16514 + }, + { + "epoch": 9.226256983240223, + "grad_norm": 0.3787241578102112, + "learning_rate": 0.0005405042016806723, + "loss": 0.2862, + "step": 16515 + }, + { + "epoch": 9.226815642458101, + "grad_norm": 0.38561609387397766, + "learning_rate": 0.0005404761904761905, + "loss": 0.3696, + "step": 16516 + }, + { + "epoch": 9.227374301675978, + "grad_norm": 0.37068963050842285, + "learning_rate": 0.0005404481792717086, + "loss": 0.3752, + "step": 16517 + }, + { + "epoch": 9.227932960893854, + "grad_norm": 1.7013413906097412, + "learning_rate": 0.0005404201680672269, + "loss": 0.4507, + "step": 16518 + }, + { + "epoch": 9.228491620111733, + "grad_norm": 0.5270462036132812, + "learning_rate": 0.0005403921568627451, + "loss": 0.3809, + "step": 16519 + }, + { + "epoch": 9.22905027932961, + "grad_norm": 0.7093483805656433, + "learning_rate": 0.0005403641456582633, + "loss": 0.4598, + "step": 16520 + }, + { + "epoch": 9.229608938547486, + "grad_norm": 0.45013388991355896, + "learning_rate": 0.0005403361344537815, + "loss": 0.4179, + "step": 16521 + }, + { + "epoch": 9.230167597765362, + "grad_norm": 0.4635022282600403, + "learning_rate": 0.0005403081232492996, + "loss": 0.4092, + "step": 16522 + }, + { + "epoch": 9.23072625698324, + "grad_norm": 1.5607857704162598, + "learning_rate": 0.000540280112044818, + "loss": 0.3359, + "step": 16523 + }, + { + "epoch": 9.231284916201117, + "grad_norm": 0.6553241610527039, + "learning_rate": 0.0005402521008403362, + "loss": 0.3788, + "step": 16524 + }, + { + "epoch": 9.231843575418994, + "grad_norm": 0.3479006886482239, + "learning_rate": 0.0005402240896358544, + "loss": 0.3871, + "step": 16525 + }, + { + "epoch": 9.232402234636872, + "grad_norm": 0.6250573992729187, + "learning_rate": 0.0005401960784313726, + "loss": 0.5014, + "step": 16526 + }, + { + "epoch": 9.232960893854749, + "grad_norm": 0.7511268854141235, + "learning_rate": 0.0005401680672268907, + "loss": 0.4398, + "step": 16527 + }, + { + "epoch": 9.233519553072625, + "grad_norm": 0.8214605450630188, + "learning_rate": 0.000540140056022409, + "loss": 0.5629, + "step": 16528 + }, + { + "epoch": 9.234078212290504, + "grad_norm": 0.3612486720085144, + "learning_rate": 0.0005401120448179272, + "loss": 0.4069, + "step": 16529 + }, + { + "epoch": 9.23463687150838, + "grad_norm": 0.693700909614563, + "learning_rate": 0.0005400840336134454, + "loss": 0.4707, + "step": 16530 + }, + { + "epoch": 9.235195530726257, + "grad_norm": 0.935588538646698, + "learning_rate": 0.0005400560224089636, + "loss": 0.5702, + "step": 16531 + }, + { + "epoch": 9.235754189944133, + "grad_norm": 0.5711575150489807, + "learning_rate": 0.0005400280112044818, + "loss": 0.4543, + "step": 16532 + }, + { + "epoch": 9.236312849162012, + "grad_norm": 0.7414027452468872, + "learning_rate": 0.00054, + "loss": 0.5058, + "step": 16533 + }, + { + "epoch": 9.236871508379888, + "grad_norm": 0.549874484539032, + "learning_rate": 0.0005399719887955182, + "loss": 0.4346, + "step": 16534 + }, + { + "epoch": 9.237430167597765, + "grad_norm": 0.52960205078125, + "learning_rate": 0.0005399439775910364, + "loss": 0.5008, + "step": 16535 + }, + { + "epoch": 9.237988826815643, + "grad_norm": 0.4601619243621826, + "learning_rate": 0.0005399159663865546, + "loss": 0.472, + "step": 16536 + }, + { + "epoch": 9.23854748603352, + "grad_norm": 0.8490204811096191, + "learning_rate": 0.0005398879551820728, + "loss": 0.4669, + "step": 16537 + }, + { + "epoch": 9.239106145251396, + "grad_norm": 0.6143187284469604, + "learning_rate": 0.000539859943977591, + "loss": 0.4888, + "step": 16538 + }, + { + "epoch": 9.239664804469275, + "grad_norm": 1.2110612392425537, + "learning_rate": 0.0005398319327731092, + "loss": 0.3526, + "step": 16539 + }, + { + "epoch": 9.240223463687151, + "grad_norm": 0.5195451974868774, + "learning_rate": 0.0005398039215686274, + "loss": 0.3983, + "step": 16540 + }, + { + "epoch": 9.240782122905028, + "grad_norm": 0.6040216088294983, + "learning_rate": 0.0005397759103641457, + "loss": 0.5021, + "step": 16541 + }, + { + "epoch": 9.241340782122904, + "grad_norm": 0.4761215150356293, + "learning_rate": 0.0005397478991596639, + "loss": 0.3394, + "step": 16542 + }, + { + "epoch": 9.241899441340783, + "grad_norm": 0.3308653235435486, + "learning_rate": 0.0005397198879551821, + "loss": 0.3188, + "step": 16543 + }, + { + "epoch": 9.24245810055866, + "grad_norm": 0.7960337400436401, + "learning_rate": 0.0005396918767507003, + "loss": 0.4609, + "step": 16544 + }, + { + "epoch": 9.243016759776536, + "grad_norm": 0.7407571077346802, + "learning_rate": 0.0005396638655462185, + "loss": 0.477, + "step": 16545 + }, + { + "epoch": 9.243575418994414, + "grad_norm": 0.6363726258277893, + "learning_rate": 0.0005396358543417367, + "loss": 0.4224, + "step": 16546 + }, + { + "epoch": 9.24413407821229, + "grad_norm": 0.47694161534309387, + "learning_rate": 0.0005396078431372549, + "loss": 0.4466, + "step": 16547 + }, + { + "epoch": 9.244692737430167, + "grad_norm": 0.5374645590782166, + "learning_rate": 0.0005395798319327732, + "loss": 0.4513, + "step": 16548 + }, + { + "epoch": 9.245251396648044, + "grad_norm": 1.3158732652664185, + "learning_rate": 0.0005395518207282913, + "loss": 0.5985, + "step": 16549 + }, + { + "epoch": 9.245810055865922, + "grad_norm": 1.9134275913238525, + "learning_rate": 0.0005395238095238095, + "loss": 0.4046, + "step": 16550 + }, + { + "epoch": 9.246368715083799, + "grad_norm": 0.4959862232208252, + "learning_rate": 0.0005394957983193277, + "loss": 0.4526, + "step": 16551 + }, + { + "epoch": 9.246927374301675, + "grad_norm": 3.6215617656707764, + "learning_rate": 0.0005394677871148459, + "loss": 0.531, + "step": 16552 + }, + { + "epoch": 9.247486033519554, + "grad_norm": 0.6431394219398499, + "learning_rate": 0.0005394397759103642, + "loss": 0.4339, + "step": 16553 + }, + { + "epoch": 9.24804469273743, + "grad_norm": 0.5379325747489929, + "learning_rate": 0.0005394117647058823, + "loss": 0.4173, + "step": 16554 + }, + { + "epoch": 9.248603351955307, + "grad_norm": 1.1802860498428345, + "learning_rate": 0.0005393837535014005, + "loss": 0.4145, + "step": 16555 + }, + { + "epoch": 9.249162011173185, + "grad_norm": 0.5084881782531738, + "learning_rate": 0.0005393557422969187, + "loss": 0.4539, + "step": 16556 + }, + { + "epoch": 9.249720670391062, + "grad_norm": 0.7915683388710022, + "learning_rate": 0.000539327731092437, + "loss": 0.4085, + "step": 16557 + }, + { + "epoch": 9.250279329608938, + "grad_norm": 0.41347792744636536, + "learning_rate": 0.0005392997198879553, + "loss": 0.4164, + "step": 16558 + }, + { + "epoch": 9.250837988826815, + "grad_norm": 0.6693623065948486, + "learning_rate": 0.0005392717086834734, + "loss": 0.6032, + "step": 16559 + }, + { + "epoch": 9.251396648044693, + "grad_norm": 0.5454085469245911, + "learning_rate": 0.0005392436974789916, + "loss": 0.406, + "step": 16560 + }, + { + "epoch": 9.25195530726257, + "grad_norm": 0.9983789324760437, + "learning_rate": 0.0005392156862745098, + "loss": 0.4268, + "step": 16561 + }, + { + "epoch": 9.252513966480446, + "grad_norm": 0.6356601715087891, + "learning_rate": 0.000539187675070028, + "loss": 0.412, + "step": 16562 + }, + { + "epoch": 9.253072625698325, + "grad_norm": 0.6159977316856384, + "learning_rate": 0.0005391596638655463, + "loss": 0.4515, + "step": 16563 + }, + { + "epoch": 9.253631284916201, + "grad_norm": 0.6717731356620789, + "learning_rate": 0.0005391316526610645, + "loss": 0.4526, + "step": 16564 + }, + { + "epoch": 9.254189944134078, + "grad_norm": 0.4269741475582123, + "learning_rate": 0.0005391036414565826, + "loss": 0.3681, + "step": 16565 + }, + { + "epoch": 9.254748603351956, + "grad_norm": 0.5695484280586243, + "learning_rate": 0.0005390756302521008, + "loss": 0.4207, + "step": 16566 + }, + { + "epoch": 9.255307262569833, + "grad_norm": 0.4539811611175537, + "learning_rate": 0.000539047619047619, + "loss": 0.3735, + "step": 16567 + }, + { + "epoch": 9.25586592178771, + "grad_norm": 0.5337157845497131, + "learning_rate": 0.0005390196078431373, + "loss": 0.4526, + "step": 16568 + }, + { + "epoch": 9.256424581005586, + "grad_norm": 0.575912356376648, + "learning_rate": 0.0005389915966386555, + "loss": 0.4824, + "step": 16569 + }, + { + "epoch": 9.256983240223464, + "grad_norm": 0.48774194717407227, + "learning_rate": 0.0005389635854341736, + "loss": 0.4498, + "step": 16570 + }, + { + "epoch": 9.25754189944134, + "grad_norm": 1.373374104499817, + "learning_rate": 0.0005389355742296918, + "loss": 0.3093, + "step": 16571 + }, + { + "epoch": 9.258100558659217, + "grad_norm": 0.5435925126075745, + "learning_rate": 0.00053890756302521, + "loss": 0.4147, + "step": 16572 + }, + { + "epoch": 9.258659217877096, + "grad_norm": 0.5785366892814636, + "learning_rate": 0.0005388795518207284, + "loss": 0.3713, + "step": 16573 + }, + { + "epoch": 9.259217877094972, + "grad_norm": 0.43125826120376587, + "learning_rate": 0.0005388515406162466, + "loss": 0.4413, + "step": 16574 + }, + { + "epoch": 9.259776536312849, + "grad_norm": 0.5177059769630432, + "learning_rate": 0.0005388235294117647, + "loss": 0.4123, + "step": 16575 + }, + { + "epoch": 9.260335195530725, + "grad_norm": 0.9824718832969666, + "learning_rate": 0.0005387955182072829, + "loss": 0.3841, + "step": 16576 + }, + { + "epoch": 9.260893854748604, + "grad_norm": 0.7645853161811829, + "learning_rate": 0.0005387675070028011, + "loss": 0.3987, + "step": 16577 + }, + { + "epoch": 9.26145251396648, + "grad_norm": 0.44827255606651306, + "learning_rate": 0.0005387394957983194, + "loss": 0.4107, + "step": 16578 + }, + { + "epoch": 9.262011173184357, + "grad_norm": 1.3404701948165894, + "learning_rate": 0.0005387114845938376, + "loss": 0.402, + "step": 16579 + }, + { + "epoch": 9.262569832402235, + "grad_norm": 0.4581243395805359, + "learning_rate": 0.0005386834733893558, + "loss": 0.5066, + "step": 16580 + }, + { + "epoch": 9.263128491620112, + "grad_norm": 1.6659650802612305, + "learning_rate": 0.0005386554621848739, + "loss": 0.3811, + "step": 16581 + }, + { + "epoch": 9.263687150837988, + "grad_norm": 1.3342540264129639, + "learning_rate": 0.0005386274509803921, + "loss": 0.3909, + "step": 16582 + }, + { + "epoch": 9.264245810055867, + "grad_norm": 0.5435566902160645, + "learning_rate": 0.0005385994397759104, + "loss": 0.4321, + "step": 16583 + }, + { + "epoch": 9.264804469273743, + "grad_norm": 0.6568428874015808, + "learning_rate": 0.0005385714285714286, + "loss": 0.4963, + "step": 16584 + }, + { + "epoch": 9.26536312849162, + "grad_norm": 1.972517490386963, + "learning_rate": 0.0005385434173669468, + "loss": 0.4758, + "step": 16585 + }, + { + "epoch": 9.265921787709496, + "grad_norm": 0.3659202456474304, + "learning_rate": 0.0005385154061624649, + "loss": 0.4621, + "step": 16586 + }, + { + "epoch": 9.266480446927375, + "grad_norm": 0.46245095133781433, + "learning_rate": 0.0005384873949579831, + "loss": 0.4123, + "step": 16587 + }, + { + "epoch": 9.267039106145251, + "grad_norm": 0.892911970615387, + "learning_rate": 0.0005384593837535014, + "loss": 0.4285, + "step": 16588 + }, + { + "epoch": 9.267597765363128, + "grad_norm": 1.022056221961975, + "learning_rate": 0.0005384313725490197, + "loss": 0.556, + "step": 16589 + }, + { + "epoch": 9.268156424581006, + "grad_norm": 0.3725251853466034, + "learning_rate": 0.0005384033613445379, + "loss": 0.4063, + "step": 16590 + }, + { + "epoch": 9.268715083798883, + "grad_norm": 2.3333992958068848, + "learning_rate": 0.000538375350140056, + "loss": 0.3644, + "step": 16591 + }, + { + "epoch": 9.26927374301676, + "grad_norm": 0.5404777526855469, + "learning_rate": 0.0005383473389355742, + "loss": 0.5207, + "step": 16592 + }, + { + "epoch": 9.269832402234638, + "grad_norm": 0.646561861038208, + "learning_rate": 0.0005383193277310925, + "loss": 0.3858, + "step": 16593 + }, + { + "epoch": 9.270391061452514, + "grad_norm": 0.3435373306274414, + "learning_rate": 0.0005382913165266107, + "loss": 0.3408, + "step": 16594 + }, + { + "epoch": 9.27094972067039, + "grad_norm": 1.0020469427108765, + "learning_rate": 0.0005382633053221289, + "loss": 0.4863, + "step": 16595 + }, + { + "epoch": 9.271508379888267, + "grad_norm": 0.7925450801849365, + "learning_rate": 0.0005382352941176471, + "loss": 0.5796, + "step": 16596 + }, + { + "epoch": 9.272067039106146, + "grad_norm": 0.5109665393829346, + "learning_rate": 0.0005382072829131652, + "loss": 0.4686, + "step": 16597 + }, + { + "epoch": 9.272625698324022, + "grad_norm": 0.36900794506073, + "learning_rate": 0.0005381792717086835, + "loss": 0.3643, + "step": 16598 + }, + { + "epoch": 9.273184357541899, + "grad_norm": 0.6303600072860718, + "learning_rate": 0.0005381512605042017, + "loss": 0.478, + "step": 16599 + }, + { + "epoch": 9.273743016759777, + "grad_norm": 0.45135247707366943, + "learning_rate": 0.0005381232492997199, + "loss": 0.4186, + "step": 16600 + }, + { + "epoch": 9.274301675977654, + "grad_norm": 0.7109841108322144, + "learning_rate": 0.0005380952380952381, + "loss": 0.4153, + "step": 16601 + }, + { + "epoch": 9.27486033519553, + "grad_norm": 0.6614009737968445, + "learning_rate": 0.0005380672268907562, + "loss": 0.5569, + "step": 16602 + }, + { + "epoch": 9.275418994413409, + "grad_norm": 0.6850992441177368, + "learning_rate": 0.0005380392156862745, + "loss": 0.4905, + "step": 16603 + }, + { + "epoch": 9.275977653631285, + "grad_norm": 0.4450646638870239, + "learning_rate": 0.0005380112044817927, + "loss": 0.4828, + "step": 16604 + }, + { + "epoch": 9.276536312849162, + "grad_norm": 0.5977611541748047, + "learning_rate": 0.000537983193277311, + "loss": 0.4771, + "step": 16605 + }, + { + "epoch": 9.277094972067038, + "grad_norm": 1.1330811977386475, + "learning_rate": 0.0005379551820728292, + "loss": 0.3757, + "step": 16606 + }, + { + "epoch": 9.277653631284917, + "grad_norm": 0.6245313286781311, + "learning_rate": 0.0005379271708683473, + "loss": 0.4266, + "step": 16607 + }, + { + "epoch": 9.278212290502793, + "grad_norm": 0.5469295978546143, + "learning_rate": 0.0005378991596638656, + "loss": 0.4935, + "step": 16608 + }, + { + "epoch": 9.27877094972067, + "grad_norm": 0.6180469393730164, + "learning_rate": 0.0005378711484593838, + "loss": 0.5896, + "step": 16609 + }, + { + "epoch": 9.279329608938548, + "grad_norm": 0.4080008268356323, + "learning_rate": 0.000537843137254902, + "loss": 0.4612, + "step": 16610 + }, + { + "epoch": 9.279888268156425, + "grad_norm": 0.8717343211174011, + "learning_rate": 0.0005378151260504202, + "loss": 0.4475, + "step": 16611 + }, + { + "epoch": 9.280446927374301, + "grad_norm": 0.448574423789978, + "learning_rate": 0.0005377871148459384, + "loss": 0.3906, + "step": 16612 + }, + { + "epoch": 9.28100558659218, + "grad_norm": 0.5210317969322205, + "learning_rate": 0.0005377591036414566, + "loss": 0.4591, + "step": 16613 + }, + { + "epoch": 9.281564245810056, + "grad_norm": 4.129405498504639, + "learning_rate": 0.0005377310924369748, + "loss": 0.4641, + "step": 16614 + }, + { + "epoch": 9.282122905027933, + "grad_norm": 0.38154488801956177, + "learning_rate": 0.000537703081232493, + "loss": 0.3892, + "step": 16615 + }, + { + "epoch": 9.28268156424581, + "grad_norm": 0.49611175060272217, + "learning_rate": 0.0005376750700280112, + "loss": 0.5124, + "step": 16616 + }, + { + "epoch": 9.283240223463688, + "grad_norm": 0.8293722867965698, + "learning_rate": 0.0005376470588235294, + "loss": 0.4572, + "step": 16617 + }, + { + "epoch": 9.283798882681564, + "grad_norm": 0.5510809421539307, + "learning_rate": 0.0005376190476190476, + "loss": 0.4863, + "step": 16618 + }, + { + "epoch": 9.28435754189944, + "grad_norm": 0.6533995866775513, + "learning_rate": 0.0005375910364145658, + "loss": 0.3275, + "step": 16619 + }, + { + "epoch": 9.28491620111732, + "grad_norm": 0.9118890762329102, + "learning_rate": 0.000537563025210084, + "loss": 0.4116, + "step": 16620 + }, + { + "epoch": 9.285474860335196, + "grad_norm": 1.0961288213729858, + "learning_rate": 0.0005375350140056022, + "loss": 0.3776, + "step": 16621 + }, + { + "epoch": 9.286033519553072, + "grad_norm": 0.4982014298439026, + "learning_rate": 0.0005375070028011204, + "loss": 0.4194, + "step": 16622 + }, + { + "epoch": 9.286592178770949, + "grad_norm": 0.48637276887893677, + "learning_rate": 0.0005374789915966388, + "loss": 0.4334, + "step": 16623 + }, + { + "epoch": 9.287150837988827, + "grad_norm": 0.37924978137016296, + "learning_rate": 0.0005374509803921569, + "loss": 0.3464, + "step": 16624 + }, + { + "epoch": 9.287709497206704, + "grad_norm": 0.7492765188217163, + "learning_rate": 0.0005374229691876751, + "loss": 0.3901, + "step": 16625 + }, + { + "epoch": 9.28826815642458, + "grad_norm": 0.767127275466919, + "learning_rate": 0.0005373949579831933, + "loss": 0.3437, + "step": 16626 + }, + { + "epoch": 9.288826815642459, + "grad_norm": 0.6950533390045166, + "learning_rate": 0.0005373669467787115, + "loss": 0.3816, + "step": 16627 + }, + { + "epoch": 9.289385474860335, + "grad_norm": 0.4253644645214081, + "learning_rate": 0.0005373389355742298, + "loss": 0.3878, + "step": 16628 + }, + { + "epoch": 9.289944134078212, + "grad_norm": 0.39599257707595825, + "learning_rate": 0.0005373109243697479, + "loss": 0.4126, + "step": 16629 + }, + { + "epoch": 9.29050279329609, + "grad_norm": 0.38296350836753845, + "learning_rate": 0.0005372829131652661, + "loss": 0.3328, + "step": 16630 + }, + { + "epoch": 9.291061452513967, + "grad_norm": 0.5903850197792053, + "learning_rate": 0.0005372549019607843, + "loss": 0.4336, + "step": 16631 + }, + { + "epoch": 9.291620111731843, + "grad_norm": 0.52068692445755, + "learning_rate": 0.0005372268907563025, + "loss": 0.5449, + "step": 16632 + }, + { + "epoch": 9.29217877094972, + "grad_norm": 0.8957650065422058, + "learning_rate": 0.0005371988795518208, + "loss": 0.4016, + "step": 16633 + }, + { + "epoch": 9.292737430167598, + "grad_norm": 1.8552970886230469, + "learning_rate": 0.0005371708683473389, + "loss": 0.3548, + "step": 16634 + }, + { + "epoch": 9.293296089385475, + "grad_norm": 0.6319858431816101, + "learning_rate": 0.0005371428571428571, + "loss": 0.573, + "step": 16635 + }, + { + "epoch": 9.293854748603351, + "grad_norm": 0.6675060987472534, + "learning_rate": 0.0005371148459383753, + "loss": 0.4497, + "step": 16636 + }, + { + "epoch": 9.29441340782123, + "grad_norm": 0.7903112173080444, + "learning_rate": 0.0005370868347338935, + "loss": 0.5128, + "step": 16637 + }, + { + "epoch": 9.294972067039106, + "grad_norm": 0.824683666229248, + "learning_rate": 0.0005370588235294119, + "loss": 0.3936, + "step": 16638 + }, + { + "epoch": 9.295530726256983, + "grad_norm": 8.301406860351562, + "learning_rate": 0.0005370308123249301, + "loss": 0.3125, + "step": 16639 + }, + { + "epoch": 9.296089385474861, + "grad_norm": 0.3907196819782257, + "learning_rate": 0.0005370028011204482, + "loss": 0.3362, + "step": 16640 + }, + { + "epoch": 9.296648044692738, + "grad_norm": 0.6023927927017212, + "learning_rate": 0.0005369747899159664, + "loss": 0.6357, + "step": 16641 + }, + { + "epoch": 9.297206703910614, + "grad_norm": 0.366914302110672, + "learning_rate": 0.0005369467787114846, + "loss": 0.389, + "step": 16642 + }, + { + "epoch": 9.297765363128491, + "grad_norm": 0.5850542187690735, + "learning_rate": 0.0005369187675070029, + "loss": 0.344, + "step": 16643 + }, + { + "epoch": 9.29832402234637, + "grad_norm": 0.5472568869590759, + "learning_rate": 0.0005368907563025211, + "loss": 0.501, + "step": 16644 + }, + { + "epoch": 9.298882681564246, + "grad_norm": 0.6412962079048157, + "learning_rate": 0.0005368627450980392, + "loss": 0.3934, + "step": 16645 + }, + { + "epoch": 9.299441340782122, + "grad_norm": 1.08909010887146, + "learning_rate": 0.0005368347338935574, + "loss": 0.4727, + "step": 16646 + }, + { + "epoch": 9.3, + "grad_norm": 0.5555177330970764, + "learning_rate": 0.0005368067226890756, + "loss": 0.4778, + "step": 16647 + }, + { + "epoch": 9.300558659217877, + "grad_norm": 0.4491305649280548, + "learning_rate": 0.0005367787114845939, + "loss": 0.3367, + "step": 16648 + }, + { + "epoch": 9.301117318435754, + "grad_norm": 0.6214771866798401, + "learning_rate": 0.0005367507002801121, + "loss": 0.742, + "step": 16649 + }, + { + "epoch": 9.30167597765363, + "grad_norm": 0.69321608543396, + "learning_rate": 0.0005367226890756302, + "loss": 0.4564, + "step": 16650 + }, + { + "epoch": 9.302234636871509, + "grad_norm": 0.7769429683685303, + "learning_rate": 0.0005366946778711484, + "loss": 0.3556, + "step": 16651 + }, + { + "epoch": 9.302793296089385, + "grad_norm": 0.5233587622642517, + "learning_rate": 0.0005366666666666666, + "loss": 0.4942, + "step": 16652 + }, + { + "epoch": 9.303351955307262, + "grad_norm": 0.9653082489967346, + "learning_rate": 0.0005366386554621849, + "loss": 0.4555, + "step": 16653 + }, + { + "epoch": 9.30391061452514, + "grad_norm": 0.44297635555267334, + "learning_rate": 0.0005366106442577031, + "loss": 0.4801, + "step": 16654 + }, + { + "epoch": 9.304469273743017, + "grad_norm": 0.4555809199810028, + "learning_rate": 0.0005365826330532214, + "loss": 0.4131, + "step": 16655 + }, + { + "epoch": 9.305027932960893, + "grad_norm": 0.7620997428894043, + "learning_rate": 0.0005365546218487395, + "loss": 0.3868, + "step": 16656 + }, + { + "epoch": 9.305586592178772, + "grad_norm": 0.38635891675949097, + "learning_rate": 0.0005365266106442577, + "loss": 0.3834, + "step": 16657 + }, + { + "epoch": 9.306145251396648, + "grad_norm": 0.6984046697616577, + "learning_rate": 0.000536498599439776, + "loss": 0.3649, + "step": 16658 + }, + { + "epoch": 9.306703910614525, + "grad_norm": 0.5802876949310303, + "learning_rate": 0.0005364705882352942, + "loss": 0.4963, + "step": 16659 + }, + { + "epoch": 9.307262569832401, + "grad_norm": 1.2500238418579102, + "learning_rate": 0.0005364425770308124, + "loss": 0.3584, + "step": 16660 + }, + { + "epoch": 9.30782122905028, + "grad_norm": 0.49604320526123047, + "learning_rate": 0.0005364145658263305, + "loss": 0.3811, + "step": 16661 + }, + { + "epoch": 9.308379888268156, + "grad_norm": 0.5880340933799744, + "learning_rate": 0.0005363865546218487, + "loss": 0.5039, + "step": 16662 + }, + { + "epoch": 9.308938547486033, + "grad_norm": 0.3959980010986328, + "learning_rate": 0.000536358543417367, + "loss": 0.4416, + "step": 16663 + }, + { + "epoch": 9.309497206703911, + "grad_norm": 0.6716723442077637, + "learning_rate": 0.0005363305322128852, + "loss": 0.5665, + "step": 16664 + }, + { + "epoch": 9.310055865921788, + "grad_norm": 0.4211269021034241, + "learning_rate": 0.0005363025210084034, + "loss": 0.433, + "step": 16665 + }, + { + "epoch": 9.310614525139664, + "grad_norm": 0.8555530309677124, + "learning_rate": 0.0005362745098039215, + "loss": 0.3842, + "step": 16666 + }, + { + "epoch": 9.311173184357543, + "grad_norm": 0.8323864936828613, + "learning_rate": 0.0005362464985994397, + "loss": 0.4497, + "step": 16667 + }, + { + "epoch": 9.31173184357542, + "grad_norm": 0.5423151254653931, + "learning_rate": 0.000536218487394958, + "loss": 0.472, + "step": 16668 + }, + { + "epoch": 9.312290502793296, + "grad_norm": 0.5062610507011414, + "learning_rate": 0.0005361904761904762, + "loss": 0.5441, + "step": 16669 + }, + { + "epoch": 9.312849162011172, + "grad_norm": 0.4727415442466736, + "learning_rate": 0.0005361624649859944, + "loss": 0.4517, + "step": 16670 + }, + { + "epoch": 9.31340782122905, + "grad_norm": 0.6677928566932678, + "learning_rate": 0.0005361344537815127, + "loss": 0.4403, + "step": 16671 + }, + { + "epoch": 9.313966480446927, + "grad_norm": 0.5396201014518738, + "learning_rate": 0.0005361064425770307, + "loss": 0.37, + "step": 16672 + }, + { + "epoch": 9.314525139664804, + "grad_norm": 1.5581986904144287, + "learning_rate": 0.0005360784313725491, + "loss": 0.5874, + "step": 16673 + }, + { + "epoch": 9.315083798882682, + "grad_norm": 0.42642301321029663, + "learning_rate": 0.0005360504201680673, + "loss": 0.4968, + "step": 16674 + }, + { + "epoch": 9.315642458100559, + "grad_norm": 0.41232115030288696, + "learning_rate": 0.0005360224089635855, + "loss": 0.3941, + "step": 16675 + }, + { + "epoch": 9.316201117318435, + "grad_norm": 0.7357610464096069, + "learning_rate": 0.0005359943977591037, + "loss": 0.3497, + "step": 16676 + }, + { + "epoch": 9.316759776536314, + "grad_norm": 0.6841923594474792, + "learning_rate": 0.0005359663865546218, + "loss": 0.4366, + "step": 16677 + }, + { + "epoch": 9.31731843575419, + "grad_norm": 0.6401249170303345, + "learning_rate": 0.0005359383753501401, + "loss": 0.6711, + "step": 16678 + }, + { + "epoch": 9.317877094972067, + "grad_norm": 0.5170163512229919, + "learning_rate": 0.0005359103641456583, + "loss": 0.4026, + "step": 16679 + }, + { + "epoch": 9.318435754189943, + "grad_norm": 0.46179407835006714, + "learning_rate": 0.0005358823529411765, + "loss": 0.4657, + "step": 16680 + }, + { + "epoch": 9.318994413407822, + "grad_norm": 0.35712406039237976, + "learning_rate": 0.0005358543417366947, + "loss": 0.3633, + "step": 16681 + }, + { + "epoch": 9.319553072625698, + "grad_norm": 0.8107830882072449, + "learning_rate": 0.0005358263305322128, + "loss": 0.5316, + "step": 16682 + }, + { + "epoch": 9.320111731843575, + "grad_norm": 0.5190209746360779, + "learning_rate": 0.0005357983193277311, + "loss": 0.4528, + "step": 16683 + }, + { + "epoch": 9.320670391061453, + "grad_norm": 0.45393043756484985, + "learning_rate": 0.0005357703081232493, + "loss": 0.3212, + "step": 16684 + }, + { + "epoch": 9.32122905027933, + "grad_norm": 0.5104181170463562, + "learning_rate": 0.0005357422969187675, + "loss": 0.3212, + "step": 16685 + }, + { + "epoch": 9.321787709497206, + "grad_norm": 0.7569250464439392, + "learning_rate": 0.0005357142857142857, + "loss": 0.4558, + "step": 16686 + }, + { + "epoch": 9.322346368715085, + "grad_norm": 0.42169100046157837, + "learning_rate": 0.000535686274509804, + "loss": 0.354, + "step": 16687 + }, + { + "epoch": 9.322905027932961, + "grad_norm": 0.39165931940078735, + "learning_rate": 0.0005356582633053222, + "loss": 0.3489, + "step": 16688 + }, + { + "epoch": 9.323463687150838, + "grad_norm": 0.4005206823348999, + "learning_rate": 0.0005356302521008404, + "loss": 0.4199, + "step": 16689 + }, + { + "epoch": 9.324022346368714, + "grad_norm": 4.686884880065918, + "learning_rate": 0.0005356022408963586, + "loss": 0.4251, + "step": 16690 + }, + { + "epoch": 9.324581005586593, + "grad_norm": 0.8771541118621826, + "learning_rate": 0.0005355742296918768, + "loss": 0.5771, + "step": 16691 + }, + { + "epoch": 9.32513966480447, + "grad_norm": 0.8012189269065857, + "learning_rate": 0.000535546218487395, + "loss": 0.4466, + "step": 16692 + }, + { + "epoch": 9.325698324022346, + "grad_norm": 0.4651605784893036, + "learning_rate": 0.0005355182072829132, + "loss": 0.4002, + "step": 16693 + }, + { + "epoch": 9.326256983240224, + "grad_norm": 0.5822586417198181, + "learning_rate": 0.0005354901960784314, + "loss": 0.4188, + "step": 16694 + }, + { + "epoch": 9.3268156424581, + "grad_norm": 0.3681603968143463, + "learning_rate": 0.0005354621848739496, + "loss": 0.3724, + "step": 16695 + }, + { + "epoch": 9.327374301675977, + "grad_norm": 0.565109133720398, + "learning_rate": 0.0005354341736694678, + "loss": 0.4867, + "step": 16696 + }, + { + "epoch": 9.327932960893854, + "grad_norm": 0.5700971484184265, + "learning_rate": 0.000535406162464986, + "loss": 0.4823, + "step": 16697 + }, + { + "epoch": 9.328491620111732, + "grad_norm": 0.940841555595398, + "learning_rate": 0.0005353781512605042, + "loss": 0.5465, + "step": 16698 + }, + { + "epoch": 9.329050279329609, + "grad_norm": 0.577726423740387, + "learning_rate": 0.0005353501400560224, + "loss": 0.4409, + "step": 16699 + }, + { + "epoch": 9.329608938547485, + "grad_norm": 0.4471920430660248, + "learning_rate": 0.0005353221288515406, + "loss": 0.5679, + "step": 16700 + }, + { + "epoch": 9.330167597765364, + "grad_norm": 0.9432402849197388, + "learning_rate": 0.0005352941176470588, + "loss": 0.4979, + "step": 16701 + }, + { + "epoch": 9.33072625698324, + "grad_norm": 0.36435049772262573, + "learning_rate": 0.000535266106442577, + "loss": 0.3961, + "step": 16702 + }, + { + "epoch": 9.331284916201117, + "grad_norm": 0.6592210531234741, + "learning_rate": 0.0005352380952380954, + "loss": 0.4546, + "step": 16703 + }, + { + "epoch": 9.331843575418995, + "grad_norm": 0.5919036269187927, + "learning_rate": 0.0005352100840336134, + "loss": 0.3894, + "step": 16704 + }, + { + "epoch": 9.332402234636872, + "grad_norm": 0.5600491762161255, + "learning_rate": 0.0005351820728291317, + "loss": 0.3748, + "step": 16705 + }, + { + "epoch": 9.332960893854748, + "grad_norm": 0.7861679792404175, + "learning_rate": 0.0005351540616246499, + "loss": 0.3913, + "step": 16706 + }, + { + "epoch": 9.333519553072625, + "grad_norm": 0.3903673589229584, + "learning_rate": 0.0005351260504201681, + "loss": 0.5132, + "step": 16707 + }, + { + "epoch": 9.334078212290503, + "grad_norm": 0.6793580651283264, + "learning_rate": 0.0005350980392156864, + "loss": 0.4321, + "step": 16708 + }, + { + "epoch": 9.33463687150838, + "grad_norm": 0.5604198575019836, + "learning_rate": 0.0005350700280112045, + "loss": 0.4345, + "step": 16709 + }, + { + "epoch": 9.335195530726256, + "grad_norm": 1.4051264524459839, + "learning_rate": 0.0005350420168067227, + "loss": 0.3798, + "step": 16710 + }, + { + "epoch": 9.335754189944135, + "grad_norm": 0.49308064579963684, + "learning_rate": 0.0005350140056022409, + "loss": 0.405, + "step": 16711 + }, + { + "epoch": 9.336312849162011, + "grad_norm": 0.8990593552589417, + "learning_rate": 0.0005349859943977591, + "loss": 0.4412, + "step": 16712 + }, + { + "epoch": 9.336871508379888, + "grad_norm": 0.6738452911376953, + "learning_rate": 0.0005349579831932774, + "loss": 0.4535, + "step": 16713 + }, + { + "epoch": 9.337430167597766, + "grad_norm": 0.3531082570552826, + "learning_rate": 0.0005349299719887955, + "loss": 0.3906, + "step": 16714 + }, + { + "epoch": 9.337988826815643, + "grad_norm": 2.3532865047454834, + "learning_rate": 0.0005349019607843137, + "loss": 0.3796, + "step": 16715 + }, + { + "epoch": 9.33854748603352, + "grad_norm": 0.5781932473182678, + "learning_rate": 0.0005348739495798319, + "loss": 0.4727, + "step": 16716 + }, + { + "epoch": 9.339106145251396, + "grad_norm": 0.4092429280281067, + "learning_rate": 0.0005348459383753501, + "loss": 0.3558, + "step": 16717 + }, + { + "epoch": 9.339664804469274, + "grad_norm": 1.010851263999939, + "learning_rate": 0.0005348179271708684, + "loss": 0.4183, + "step": 16718 + }, + { + "epoch": 9.34022346368715, + "grad_norm": 0.5052831768989563, + "learning_rate": 0.0005347899159663866, + "loss": 0.4556, + "step": 16719 + }, + { + "epoch": 9.340782122905027, + "grad_norm": 0.3975076377391815, + "learning_rate": 0.0005347619047619047, + "loss": 0.3884, + "step": 16720 + }, + { + "epoch": 9.341340782122906, + "grad_norm": 0.38268956542015076, + "learning_rate": 0.000534733893557423, + "loss": 0.3694, + "step": 16721 + }, + { + "epoch": 9.341899441340782, + "grad_norm": 0.721857488155365, + "learning_rate": 0.0005347058823529412, + "loss": 0.5405, + "step": 16722 + }, + { + "epoch": 9.342458100558659, + "grad_norm": 0.6094126105308533, + "learning_rate": 0.0005346778711484595, + "loss": 0.4913, + "step": 16723 + }, + { + "epoch": 9.343016759776535, + "grad_norm": 0.5790989398956299, + "learning_rate": 0.0005346498599439777, + "loss": 0.535, + "step": 16724 + }, + { + "epoch": 9.343575418994414, + "grad_norm": 0.4490100145339966, + "learning_rate": 0.0005346218487394958, + "loss": 0.4619, + "step": 16725 + }, + { + "epoch": 9.34413407821229, + "grad_norm": 0.5182918310165405, + "learning_rate": 0.000534593837535014, + "loss": 0.5676, + "step": 16726 + }, + { + "epoch": 9.344692737430167, + "grad_norm": 0.9785728454589844, + "learning_rate": 0.0005345658263305322, + "loss": 0.5234, + "step": 16727 + }, + { + "epoch": 9.345251396648045, + "grad_norm": 0.4276009202003479, + "learning_rate": 0.0005345378151260505, + "loss": 0.4458, + "step": 16728 + }, + { + "epoch": 9.345810055865922, + "grad_norm": 0.5573778748512268, + "learning_rate": 0.0005345098039215687, + "loss": 0.3893, + "step": 16729 + }, + { + "epoch": 9.346368715083798, + "grad_norm": 0.5121758580207825, + "learning_rate": 0.0005344817927170868, + "loss": 0.4946, + "step": 16730 + }, + { + "epoch": 9.346927374301677, + "grad_norm": 0.3346363306045532, + "learning_rate": 0.000534453781512605, + "loss": 0.3682, + "step": 16731 + }, + { + "epoch": 9.347486033519553, + "grad_norm": 0.47509777545928955, + "learning_rate": 0.0005344257703081232, + "loss": 0.435, + "step": 16732 + }, + { + "epoch": 9.34804469273743, + "grad_norm": 0.5008212924003601, + "learning_rate": 0.0005343977591036415, + "loss": 0.4415, + "step": 16733 + }, + { + "epoch": 9.348603351955306, + "grad_norm": 0.6001195907592773, + "learning_rate": 0.0005343697478991597, + "loss": 0.3558, + "step": 16734 + }, + { + "epoch": 9.349162011173185, + "grad_norm": 0.3894767761230469, + "learning_rate": 0.0005343417366946779, + "loss": 0.3785, + "step": 16735 + }, + { + "epoch": 9.349720670391061, + "grad_norm": 0.5864819884300232, + "learning_rate": 0.000534313725490196, + "loss": 0.4, + "step": 16736 + }, + { + "epoch": 9.350279329608938, + "grad_norm": 3.946179151535034, + "learning_rate": 0.0005342857142857142, + "loss": 0.4666, + "step": 16737 + }, + { + "epoch": 9.350837988826816, + "grad_norm": 0.5504373908042908, + "learning_rate": 0.0005342577030812325, + "loss": 0.5113, + "step": 16738 + }, + { + "epoch": 9.351396648044693, + "grad_norm": 0.540897786617279, + "learning_rate": 0.0005342296918767508, + "loss": 0.4218, + "step": 16739 + }, + { + "epoch": 9.35195530726257, + "grad_norm": 1.406132459640503, + "learning_rate": 0.000534201680672269, + "loss": 0.3711, + "step": 16740 + }, + { + "epoch": 9.352513966480448, + "grad_norm": 0.4389044940471649, + "learning_rate": 0.0005341736694677871, + "loss": 0.4069, + "step": 16741 + }, + { + "epoch": 9.353072625698324, + "grad_norm": 0.39769864082336426, + "learning_rate": 0.0005341456582633053, + "loss": 0.3836, + "step": 16742 + }, + { + "epoch": 9.3536312849162, + "grad_norm": 0.8196011781692505, + "learning_rate": 0.0005341176470588235, + "loss": 0.4535, + "step": 16743 + }, + { + "epoch": 9.354189944134077, + "grad_norm": 0.4231727123260498, + "learning_rate": 0.0005340896358543418, + "loss": 0.4003, + "step": 16744 + }, + { + "epoch": 9.354748603351956, + "grad_norm": 0.35200005769729614, + "learning_rate": 0.00053406162464986, + "loss": 0.3886, + "step": 16745 + }, + { + "epoch": 9.355307262569832, + "grad_norm": 0.6663103699684143, + "learning_rate": 0.0005340336134453781, + "loss": 0.46, + "step": 16746 + }, + { + "epoch": 9.355865921787709, + "grad_norm": 0.44087740778923035, + "learning_rate": 0.0005340056022408963, + "loss": 0.4076, + "step": 16747 + }, + { + "epoch": 9.356424581005587, + "grad_norm": 0.45731955766677856, + "learning_rate": 0.0005339775910364145, + "loss": 0.3895, + "step": 16748 + }, + { + "epoch": 9.356983240223464, + "grad_norm": 0.5980762243270874, + "learning_rate": 0.0005339495798319328, + "loss": 0.3999, + "step": 16749 + }, + { + "epoch": 9.35754189944134, + "grad_norm": 0.6291030049324036, + "learning_rate": 0.000533921568627451, + "loss": 0.394, + "step": 16750 + }, + { + "epoch": 9.358100558659217, + "grad_norm": 0.5832840800285339, + "learning_rate": 0.0005338935574229692, + "loss": 0.3709, + "step": 16751 + }, + { + "epoch": 9.358659217877095, + "grad_norm": 0.5454499125480652, + "learning_rate": 0.0005338655462184873, + "loss": 0.5831, + "step": 16752 + }, + { + "epoch": 9.359217877094972, + "grad_norm": 3.0721566677093506, + "learning_rate": 0.0005338375350140055, + "loss": 0.6549, + "step": 16753 + }, + { + "epoch": 9.359776536312848, + "grad_norm": 1.4152270555496216, + "learning_rate": 0.0005338095238095239, + "loss": 0.3353, + "step": 16754 + }, + { + "epoch": 9.360335195530727, + "grad_norm": 0.5737860798835754, + "learning_rate": 0.0005337815126050421, + "loss": 0.3669, + "step": 16755 + }, + { + "epoch": 9.360893854748603, + "grad_norm": 0.5673283934593201, + "learning_rate": 0.0005337535014005603, + "loss": 0.3832, + "step": 16756 + }, + { + "epoch": 9.36145251396648, + "grad_norm": 0.5884286165237427, + "learning_rate": 0.0005337254901960784, + "loss": 0.3579, + "step": 16757 + }, + { + "epoch": 9.362011173184358, + "grad_norm": 0.493373841047287, + "learning_rate": 0.0005336974789915966, + "loss": 0.3934, + "step": 16758 + }, + { + "epoch": 9.362569832402235, + "grad_norm": 0.5414632558822632, + "learning_rate": 0.0005336694677871149, + "loss": 0.5144, + "step": 16759 + }, + { + "epoch": 9.363128491620111, + "grad_norm": 0.6368604302406311, + "learning_rate": 0.0005336414565826331, + "loss": 0.3807, + "step": 16760 + }, + { + "epoch": 9.363687150837988, + "grad_norm": 0.4669869542121887, + "learning_rate": 0.0005336134453781513, + "loss": 0.4277, + "step": 16761 + }, + { + "epoch": 9.364245810055866, + "grad_norm": 0.3774166703224182, + "learning_rate": 0.0005335854341736694, + "loss": 0.3903, + "step": 16762 + }, + { + "epoch": 9.364804469273743, + "grad_norm": 0.774614155292511, + "learning_rate": 0.0005335574229691876, + "loss": 0.4535, + "step": 16763 + }, + { + "epoch": 9.36536312849162, + "grad_norm": 0.42842450737953186, + "learning_rate": 0.0005335294117647059, + "loss": 0.3665, + "step": 16764 + }, + { + "epoch": 9.365921787709498, + "grad_norm": 0.40375590324401855, + "learning_rate": 0.0005335014005602241, + "loss": 0.383, + "step": 16765 + }, + { + "epoch": 9.366480446927374, + "grad_norm": 0.7816968560218811, + "learning_rate": 0.0005334733893557423, + "loss": 0.5081, + "step": 16766 + }, + { + "epoch": 9.367039106145251, + "grad_norm": 0.619130551815033, + "learning_rate": 0.0005334453781512605, + "loss": 0.356, + "step": 16767 + }, + { + "epoch": 9.36759776536313, + "grad_norm": 0.44995802640914917, + "learning_rate": 0.0005334173669467786, + "loss": 0.4221, + "step": 16768 + }, + { + "epoch": 9.368156424581006, + "grad_norm": 0.5371401309967041, + "learning_rate": 0.000533389355742297, + "loss": 0.5107, + "step": 16769 + }, + { + "epoch": 9.368715083798882, + "grad_norm": 1.8955756425857544, + "learning_rate": 0.0005333613445378152, + "loss": 0.3834, + "step": 16770 + }, + { + "epoch": 9.369273743016759, + "grad_norm": 0.3807872235774994, + "learning_rate": 0.0005333333333333334, + "loss": 0.3927, + "step": 16771 + }, + { + "epoch": 9.369832402234637, + "grad_norm": 0.6134088635444641, + "learning_rate": 0.0005333053221288516, + "loss": 0.4867, + "step": 16772 + }, + { + "epoch": 9.370391061452514, + "grad_norm": 0.6308655142784119, + "learning_rate": 0.0005332773109243697, + "loss": 0.4624, + "step": 16773 + }, + { + "epoch": 9.37094972067039, + "grad_norm": 3.3655457496643066, + "learning_rate": 0.000533249299719888, + "loss": 0.3889, + "step": 16774 + }, + { + "epoch": 9.371508379888269, + "grad_norm": 0.8009724617004395, + "learning_rate": 0.0005332212885154062, + "loss": 0.3911, + "step": 16775 + }, + { + "epoch": 9.372067039106145, + "grad_norm": 0.41596177220344543, + "learning_rate": 0.0005331932773109244, + "loss": 0.456, + "step": 16776 + }, + { + "epoch": 9.372625698324022, + "grad_norm": 0.4668046534061432, + "learning_rate": 0.0005331652661064426, + "loss": 0.4022, + "step": 16777 + }, + { + "epoch": 9.3731843575419, + "grad_norm": 0.7238803505897522, + "learning_rate": 0.0005331372549019607, + "loss": 0.4643, + "step": 16778 + }, + { + "epoch": 9.373743016759777, + "grad_norm": 2.1968822479248047, + "learning_rate": 0.000533109243697479, + "loss": 0.4543, + "step": 16779 + }, + { + "epoch": 9.374301675977653, + "grad_norm": 0.4328564405441284, + "learning_rate": 0.0005330812324929972, + "loss": 0.4487, + "step": 16780 + }, + { + "epoch": 9.37486033519553, + "grad_norm": 0.46303772926330566, + "learning_rate": 0.0005330532212885154, + "loss": 0.4249, + "step": 16781 + }, + { + "epoch": 9.375418994413408, + "grad_norm": 0.40519464015960693, + "learning_rate": 0.0005330252100840336, + "loss": 0.412, + "step": 16782 + }, + { + "epoch": 9.375977653631285, + "grad_norm": 0.555400550365448, + "learning_rate": 0.0005329971988795518, + "loss": 0.4956, + "step": 16783 + }, + { + "epoch": 9.376536312849161, + "grad_norm": 0.6737897992134094, + "learning_rate": 0.00053296918767507, + "loss": 0.4742, + "step": 16784 + }, + { + "epoch": 9.37709497206704, + "grad_norm": 0.5997216105461121, + "learning_rate": 0.0005329411764705882, + "loss": 0.4937, + "step": 16785 + }, + { + "epoch": 9.377653631284916, + "grad_norm": 0.6114674806594849, + "learning_rate": 0.0005329131652661064, + "loss": 0.4871, + "step": 16786 + }, + { + "epoch": 9.378212290502793, + "grad_norm": 0.4657524526119232, + "learning_rate": 0.0005328851540616247, + "loss": 0.4668, + "step": 16787 + }, + { + "epoch": 9.378770949720671, + "grad_norm": 0.4247683882713318, + "learning_rate": 0.0005328571428571429, + "loss": 0.4095, + "step": 16788 + }, + { + "epoch": 9.379329608938548, + "grad_norm": 0.3457503616809845, + "learning_rate": 0.0005328291316526611, + "loss": 0.3695, + "step": 16789 + }, + { + "epoch": 9.379888268156424, + "grad_norm": 0.4852539300918579, + "learning_rate": 0.0005328011204481793, + "loss": 0.3954, + "step": 16790 + }, + { + "epoch": 9.380446927374301, + "grad_norm": 0.4772719144821167, + "learning_rate": 0.0005327731092436975, + "loss": 0.4913, + "step": 16791 + }, + { + "epoch": 9.38100558659218, + "grad_norm": 0.4078742265701294, + "learning_rate": 0.0005327450980392157, + "loss": 0.3472, + "step": 16792 + }, + { + "epoch": 9.381564245810056, + "grad_norm": 0.7070629000663757, + "learning_rate": 0.0005327170868347339, + "loss": 0.6128, + "step": 16793 + }, + { + "epoch": 9.382122905027932, + "grad_norm": 0.652103841304779, + "learning_rate": 0.0005326890756302522, + "loss": 0.3886, + "step": 16794 + }, + { + "epoch": 9.38268156424581, + "grad_norm": 0.6479032635688782, + "learning_rate": 0.0005326610644257703, + "loss": 0.4171, + "step": 16795 + }, + { + "epoch": 9.383240223463687, + "grad_norm": 0.7084524035453796, + "learning_rate": 0.0005326330532212885, + "loss": 0.4825, + "step": 16796 + }, + { + "epoch": 9.383798882681564, + "grad_norm": 0.9993151426315308, + "learning_rate": 0.0005326050420168067, + "loss": 0.3809, + "step": 16797 + }, + { + "epoch": 9.38435754189944, + "grad_norm": 1.4311127662658691, + "learning_rate": 0.0005325770308123249, + "loss": 0.3121, + "step": 16798 + }, + { + "epoch": 9.384916201117319, + "grad_norm": 0.5914056897163391, + "learning_rate": 0.0005325490196078432, + "loss": 0.4658, + "step": 16799 + }, + { + "epoch": 9.385474860335195, + "grad_norm": 3.6157281398773193, + "learning_rate": 0.0005325210084033613, + "loss": 0.622, + "step": 16800 + }, + { + "epoch": 9.386033519553072, + "grad_norm": 1.3350077867507935, + "learning_rate": 0.0005324929971988795, + "loss": 0.5424, + "step": 16801 + }, + { + "epoch": 9.38659217877095, + "grad_norm": 0.6850478649139404, + "learning_rate": 0.0005324649859943977, + "loss": 0.4787, + "step": 16802 + }, + { + "epoch": 9.387150837988827, + "grad_norm": 0.7428293228149414, + "learning_rate": 0.000532436974789916, + "loss": 0.4541, + "step": 16803 + }, + { + "epoch": 9.387709497206703, + "grad_norm": 2.18629789352417, + "learning_rate": 0.0005324089635854343, + "loss": 0.3548, + "step": 16804 + }, + { + "epoch": 9.388268156424582, + "grad_norm": 0.7521222233772278, + "learning_rate": 0.0005323809523809524, + "loss": 0.512, + "step": 16805 + }, + { + "epoch": 9.388826815642458, + "grad_norm": 0.6294946074485779, + "learning_rate": 0.0005323529411764706, + "loss": 0.5323, + "step": 16806 + }, + { + "epoch": 9.389385474860335, + "grad_norm": 0.5802832245826721, + "learning_rate": 0.0005323249299719888, + "loss": 0.2995, + "step": 16807 + }, + { + "epoch": 9.389944134078211, + "grad_norm": 2.8059754371643066, + "learning_rate": 0.000532296918767507, + "loss": 0.4557, + "step": 16808 + }, + { + "epoch": 9.39050279329609, + "grad_norm": 0.657271146774292, + "learning_rate": 0.0005322689075630253, + "loss": 0.5192, + "step": 16809 + }, + { + "epoch": 9.391061452513966, + "grad_norm": 0.7676663994789124, + "learning_rate": 0.0005322408963585435, + "loss": 0.4827, + "step": 16810 + }, + { + "epoch": 9.391620111731843, + "grad_norm": 0.6108667850494385, + "learning_rate": 0.0005322128851540616, + "loss": 0.4051, + "step": 16811 + }, + { + "epoch": 9.392178770949721, + "grad_norm": 0.3898201882839203, + "learning_rate": 0.0005321848739495798, + "loss": 0.4939, + "step": 16812 + }, + { + "epoch": 9.392737430167598, + "grad_norm": 1.2757179737091064, + "learning_rate": 0.000532156862745098, + "loss": 0.6895, + "step": 16813 + }, + { + "epoch": 9.393296089385474, + "grad_norm": 0.5117576718330383, + "learning_rate": 0.0005321288515406163, + "loss": 0.4027, + "step": 16814 + }, + { + "epoch": 9.393854748603353, + "grad_norm": 0.44022637605667114, + "learning_rate": 0.0005321008403361345, + "loss": 0.3671, + "step": 16815 + }, + { + "epoch": 9.39441340782123, + "grad_norm": 0.3825591802597046, + "learning_rate": 0.0005320728291316526, + "loss": 0.3846, + "step": 16816 + }, + { + "epoch": 9.394972067039106, + "grad_norm": 0.5309603810310364, + "learning_rate": 0.0005320448179271708, + "loss": 0.513, + "step": 16817 + }, + { + "epoch": 9.395530726256982, + "grad_norm": 0.43423765897750854, + "learning_rate": 0.000532016806722689, + "loss": 0.4442, + "step": 16818 + }, + { + "epoch": 9.39608938547486, + "grad_norm": 0.5367094278335571, + "learning_rate": 0.0005319887955182074, + "loss": 0.4505, + "step": 16819 + }, + { + "epoch": 9.396648044692737, + "grad_norm": 0.5798840522766113, + "learning_rate": 0.0005319607843137256, + "loss": 0.4677, + "step": 16820 + }, + { + "epoch": 9.397206703910614, + "grad_norm": 0.4283909201622009, + "learning_rate": 0.0005319327731092437, + "loss": 0.4188, + "step": 16821 + }, + { + "epoch": 9.397765363128492, + "grad_norm": 0.47862565517425537, + "learning_rate": 0.0005319047619047619, + "loss": 0.341, + "step": 16822 + }, + { + "epoch": 9.398324022346369, + "grad_norm": 0.5590355396270752, + "learning_rate": 0.0005318767507002801, + "loss": 0.3156, + "step": 16823 + }, + { + "epoch": 9.398882681564245, + "grad_norm": 0.4354957342147827, + "learning_rate": 0.0005318487394957984, + "loss": 0.2996, + "step": 16824 + }, + { + "epoch": 9.399441340782122, + "grad_norm": 0.5914400815963745, + "learning_rate": 0.0005318207282913166, + "loss": 0.448, + "step": 16825 + }, + { + "epoch": 9.4, + "grad_norm": 0.7777588963508606, + "learning_rate": 0.0005317927170868348, + "loss": 0.4169, + "step": 16826 + }, + { + "epoch": 9.400558659217877, + "grad_norm": 0.5501382350921631, + "learning_rate": 0.0005317647058823529, + "loss": 0.475, + "step": 16827 + }, + { + "epoch": 9.401117318435753, + "grad_norm": 0.39645954966545105, + "learning_rate": 0.0005317366946778711, + "loss": 0.5077, + "step": 16828 + }, + { + "epoch": 9.401675977653632, + "grad_norm": 0.9203370809555054, + "learning_rate": 0.0005317086834733894, + "loss": 0.4567, + "step": 16829 + }, + { + "epoch": 9.402234636871508, + "grad_norm": 0.6831592917442322, + "learning_rate": 0.0005316806722689076, + "loss": 0.4235, + "step": 16830 + }, + { + "epoch": 9.402793296089385, + "grad_norm": 1.0460988283157349, + "learning_rate": 0.0005316526610644258, + "loss": 0.5288, + "step": 16831 + }, + { + "epoch": 9.403351955307263, + "grad_norm": 0.6364946365356445, + "learning_rate": 0.0005316246498599439, + "loss": 0.4513, + "step": 16832 + }, + { + "epoch": 9.40391061452514, + "grad_norm": 0.515491783618927, + "learning_rate": 0.0005315966386554621, + "loss": 0.471, + "step": 16833 + }, + { + "epoch": 9.404469273743016, + "grad_norm": 1.3763272762298584, + "learning_rate": 0.0005315686274509804, + "loss": 0.3567, + "step": 16834 + }, + { + "epoch": 9.405027932960893, + "grad_norm": 1.1162863969802856, + "learning_rate": 0.0005315406162464987, + "loss": 0.5599, + "step": 16835 + }, + { + "epoch": 9.405586592178771, + "grad_norm": 0.44892141222953796, + "learning_rate": 0.0005315126050420169, + "loss": 0.4461, + "step": 16836 + }, + { + "epoch": 9.406145251396648, + "grad_norm": 0.42992711067199707, + "learning_rate": 0.000531484593837535, + "loss": 0.4084, + "step": 16837 + }, + { + "epoch": 9.406703910614524, + "grad_norm": 0.6918512582778931, + "learning_rate": 0.0005314565826330532, + "loss": 0.4327, + "step": 16838 + }, + { + "epoch": 9.407262569832403, + "grad_norm": 0.40214061737060547, + "learning_rate": 0.0005314285714285715, + "loss": 0.2999, + "step": 16839 + }, + { + "epoch": 9.40782122905028, + "grad_norm": 0.48275989294052124, + "learning_rate": 0.0005314005602240897, + "loss": 0.4804, + "step": 16840 + }, + { + "epoch": 9.408379888268156, + "grad_norm": 0.42350292205810547, + "learning_rate": 0.0005313725490196079, + "loss": 0.375, + "step": 16841 + }, + { + "epoch": 9.408938547486034, + "grad_norm": 0.7236517071723938, + "learning_rate": 0.0005313445378151261, + "loss": 0.5642, + "step": 16842 + }, + { + "epoch": 9.40949720670391, + "grad_norm": 0.4190064072608948, + "learning_rate": 0.0005313165266106442, + "loss": 0.3392, + "step": 16843 + }, + { + "epoch": 9.410055865921787, + "grad_norm": 0.42308881878852844, + "learning_rate": 0.0005312885154061625, + "loss": 0.3061, + "step": 16844 + }, + { + "epoch": 9.410614525139664, + "grad_norm": 0.3551797568798065, + "learning_rate": 0.0005312605042016807, + "loss": 0.3581, + "step": 16845 + }, + { + "epoch": 9.411173184357542, + "grad_norm": 0.5283050537109375, + "learning_rate": 0.0005312324929971989, + "loss": 0.4388, + "step": 16846 + }, + { + "epoch": 9.411731843575419, + "grad_norm": 0.6589013338088989, + "learning_rate": 0.0005312044817927171, + "loss": 0.3644, + "step": 16847 + }, + { + "epoch": 9.412290502793295, + "grad_norm": 0.556256890296936, + "learning_rate": 0.0005311764705882352, + "loss": 0.4125, + "step": 16848 + }, + { + "epoch": 9.412849162011174, + "grad_norm": 0.6540932059288025, + "learning_rate": 0.0005311484593837535, + "loss": 0.4097, + "step": 16849 + }, + { + "epoch": 9.41340782122905, + "grad_norm": 0.5505632162094116, + "learning_rate": 0.0005311204481792717, + "loss": 0.4131, + "step": 16850 + }, + { + "epoch": 9.413966480446927, + "grad_norm": 0.3756787180900574, + "learning_rate": 0.00053109243697479, + "loss": 0.3572, + "step": 16851 + }, + { + "epoch": 9.414525139664805, + "grad_norm": 0.7743303179740906, + "learning_rate": 0.0005310644257703082, + "loss": 0.5434, + "step": 16852 + }, + { + "epoch": 9.415083798882682, + "grad_norm": 0.5008177161216736, + "learning_rate": 0.0005310364145658263, + "loss": 0.4358, + "step": 16853 + }, + { + "epoch": 9.415642458100558, + "grad_norm": 0.5340622663497925, + "learning_rate": 0.0005310084033613446, + "loss": 0.4683, + "step": 16854 + }, + { + "epoch": 9.416201117318435, + "grad_norm": 0.4869961440563202, + "learning_rate": 0.0005309803921568628, + "loss": 0.494, + "step": 16855 + }, + { + "epoch": 9.416759776536313, + "grad_norm": 0.47588032484054565, + "learning_rate": 0.000530952380952381, + "loss": 0.47, + "step": 16856 + }, + { + "epoch": 9.41731843575419, + "grad_norm": 0.48139962553977966, + "learning_rate": 0.0005309243697478992, + "loss": 0.4264, + "step": 16857 + }, + { + "epoch": 9.417877094972066, + "grad_norm": 1.1310714483261108, + "learning_rate": 0.0005308963585434174, + "loss": 0.3937, + "step": 16858 + }, + { + "epoch": 9.418435754189945, + "grad_norm": 2.103879690170288, + "learning_rate": 0.0005308683473389356, + "loss": 0.4049, + "step": 16859 + }, + { + "epoch": 9.418994413407821, + "grad_norm": 0.450005441904068, + "learning_rate": 0.0005308403361344538, + "loss": 0.4111, + "step": 16860 + }, + { + "epoch": 9.419553072625698, + "grad_norm": 0.41125884652137756, + "learning_rate": 0.000530812324929972, + "loss": 0.3421, + "step": 16861 + }, + { + "epoch": 9.420111731843576, + "grad_norm": 0.4717095196247101, + "learning_rate": 0.0005307843137254902, + "loss": 0.3288, + "step": 16862 + }, + { + "epoch": 9.420670391061453, + "grad_norm": 0.5915235877037048, + "learning_rate": 0.0005307563025210084, + "loss": 0.6243, + "step": 16863 + }, + { + "epoch": 9.42122905027933, + "grad_norm": 0.638978123664856, + "learning_rate": 0.0005307282913165266, + "loss": 0.5166, + "step": 16864 + }, + { + "epoch": 9.421787709497206, + "grad_norm": 0.7382144927978516, + "learning_rate": 0.0005307002801120448, + "loss": 0.4204, + "step": 16865 + }, + { + "epoch": 9.422346368715084, + "grad_norm": 1.3231853246688843, + "learning_rate": 0.000530672268907563, + "loss": 0.3653, + "step": 16866 + }, + { + "epoch": 9.422905027932961, + "grad_norm": 0.513291597366333, + "learning_rate": 0.0005306442577030812, + "loss": 0.5272, + "step": 16867 + }, + { + "epoch": 9.423463687150837, + "grad_norm": 0.4303114116191864, + "learning_rate": 0.0005306162464985994, + "loss": 0.3788, + "step": 16868 + }, + { + "epoch": 9.424022346368716, + "grad_norm": 0.5129906535148621, + "learning_rate": 0.0005305882352941177, + "loss": 0.4965, + "step": 16869 + }, + { + "epoch": 9.424581005586592, + "grad_norm": 0.7556983828544617, + "learning_rate": 0.0005305602240896359, + "loss": 0.468, + "step": 16870 + }, + { + "epoch": 9.425139664804469, + "grad_norm": 0.4534633755683899, + "learning_rate": 0.0005305322128851541, + "loss": 0.422, + "step": 16871 + }, + { + "epoch": 9.425698324022346, + "grad_norm": 0.7855282425880432, + "learning_rate": 0.0005305042016806723, + "loss": 0.4841, + "step": 16872 + }, + { + "epoch": 9.426256983240224, + "grad_norm": 0.6528869867324829, + "learning_rate": 0.0005304761904761905, + "loss": 0.4319, + "step": 16873 + }, + { + "epoch": 9.4268156424581, + "grad_norm": 1.577876091003418, + "learning_rate": 0.0005304481792717088, + "loss": 0.5527, + "step": 16874 + }, + { + "epoch": 9.427374301675977, + "grad_norm": 0.40677517652511597, + "learning_rate": 0.0005304201680672269, + "loss": 0.4119, + "step": 16875 + }, + { + "epoch": 9.427932960893855, + "grad_norm": 0.4042321741580963, + "learning_rate": 0.0005303921568627451, + "loss": 0.3946, + "step": 16876 + }, + { + "epoch": 9.428491620111732, + "grad_norm": 3.2340877056121826, + "learning_rate": 0.0005303641456582633, + "loss": 0.392, + "step": 16877 + }, + { + "epoch": 9.429050279329608, + "grad_norm": 0.6412645578384399, + "learning_rate": 0.0005303361344537815, + "loss": 0.4745, + "step": 16878 + }, + { + "epoch": 9.429608938547487, + "grad_norm": 0.38093122839927673, + "learning_rate": 0.0005303081232492998, + "loss": 0.3173, + "step": 16879 + }, + { + "epoch": 9.430167597765363, + "grad_norm": 0.5791457295417786, + "learning_rate": 0.0005302801120448179, + "loss": 0.4417, + "step": 16880 + }, + { + "epoch": 9.43072625698324, + "grad_norm": 0.6296137571334839, + "learning_rate": 0.0005302521008403361, + "loss": 0.3854, + "step": 16881 + }, + { + "epoch": 9.431284916201117, + "grad_norm": 0.8443319201469421, + "learning_rate": 0.0005302240896358543, + "loss": 0.4034, + "step": 16882 + }, + { + "epoch": 9.431843575418995, + "grad_norm": 1.5449734926223755, + "learning_rate": 0.0005301960784313725, + "loss": 0.4717, + "step": 16883 + }, + { + "epoch": 9.432402234636871, + "grad_norm": 0.7071899175643921, + "learning_rate": 0.0005301680672268909, + "loss": 0.3662, + "step": 16884 + }, + { + "epoch": 9.432960893854748, + "grad_norm": 0.5175532102584839, + "learning_rate": 0.000530140056022409, + "loss": 0.4511, + "step": 16885 + }, + { + "epoch": 9.433519553072626, + "grad_norm": 0.435304194688797, + "learning_rate": 0.0005301120448179272, + "loss": 0.4855, + "step": 16886 + }, + { + "epoch": 9.434078212290503, + "grad_norm": 1.1449611186981201, + "learning_rate": 0.0005300840336134454, + "loss": 0.4142, + "step": 16887 + }, + { + "epoch": 9.43463687150838, + "grad_norm": 0.3325803875923157, + "learning_rate": 0.0005300560224089636, + "loss": 0.3357, + "step": 16888 + }, + { + "epoch": 9.435195530726258, + "grad_norm": 0.7689087986946106, + "learning_rate": 0.0005300280112044819, + "loss": 0.5358, + "step": 16889 + }, + { + "epoch": 9.435754189944134, + "grad_norm": 0.5600887537002563, + "learning_rate": 0.0005300000000000001, + "loss": 0.4531, + "step": 16890 + }, + { + "epoch": 9.436312849162011, + "grad_norm": 2.0872435569763184, + "learning_rate": 0.0005299719887955182, + "loss": 0.4624, + "step": 16891 + }, + { + "epoch": 9.436871508379888, + "grad_norm": 0.8863978385925293, + "learning_rate": 0.0005299439775910364, + "loss": 0.438, + "step": 16892 + }, + { + "epoch": 9.437430167597766, + "grad_norm": 0.569903552532196, + "learning_rate": 0.0005299159663865546, + "loss": 0.5174, + "step": 16893 + }, + { + "epoch": 9.437988826815642, + "grad_norm": 0.7479849457740784, + "learning_rate": 0.0005298879551820729, + "loss": 0.4118, + "step": 16894 + }, + { + "epoch": 9.438547486033519, + "grad_norm": 0.5508785843849182, + "learning_rate": 0.0005298599439775911, + "loss": 0.3754, + "step": 16895 + }, + { + "epoch": 9.439106145251397, + "grad_norm": 5.470061302185059, + "learning_rate": 0.0005298319327731092, + "loss": 0.3891, + "step": 16896 + }, + { + "epoch": 9.439664804469274, + "grad_norm": 0.4785255789756775, + "learning_rate": 0.0005298039215686274, + "loss": 0.4148, + "step": 16897 + }, + { + "epoch": 9.44022346368715, + "grad_norm": 0.7331400513648987, + "learning_rate": 0.0005297759103641456, + "loss": 0.4232, + "step": 16898 + }, + { + "epoch": 9.440782122905027, + "grad_norm": 2.038973808288574, + "learning_rate": 0.0005297478991596639, + "loss": 0.5461, + "step": 16899 + }, + { + "epoch": 9.441340782122905, + "grad_norm": 0.40712592005729675, + "learning_rate": 0.0005297198879551821, + "loss": 0.428, + "step": 16900 + }, + { + "epoch": 9.441899441340782, + "grad_norm": 0.7654996514320374, + "learning_rate": 0.0005296918767507002, + "loss": 0.3192, + "step": 16901 + }, + { + "epoch": 9.442458100558659, + "grad_norm": 0.5104309916496277, + "learning_rate": 0.0005296638655462185, + "loss": 0.4549, + "step": 16902 + }, + { + "epoch": 9.443016759776537, + "grad_norm": 0.6030893921852112, + "learning_rate": 0.0005296358543417367, + "loss": 0.3589, + "step": 16903 + }, + { + "epoch": 9.443575418994413, + "grad_norm": 0.5829979777336121, + "learning_rate": 0.000529607843137255, + "loss": 0.4501, + "step": 16904 + }, + { + "epoch": 9.44413407821229, + "grad_norm": 0.7452841997146606, + "learning_rate": 0.0005295798319327732, + "loss": 0.5193, + "step": 16905 + }, + { + "epoch": 9.444692737430168, + "grad_norm": 0.5121172070503235, + "learning_rate": 0.0005295518207282914, + "loss": 0.5095, + "step": 16906 + }, + { + "epoch": 9.445251396648045, + "grad_norm": 0.8370568752288818, + "learning_rate": 0.0005295238095238095, + "loss": 0.4063, + "step": 16907 + }, + { + "epoch": 9.445810055865921, + "grad_norm": 0.58674556016922, + "learning_rate": 0.0005294957983193277, + "loss": 0.4101, + "step": 16908 + }, + { + "epoch": 9.446368715083798, + "grad_norm": 0.44542819261550903, + "learning_rate": 0.000529467787114846, + "loss": 0.3733, + "step": 16909 + }, + { + "epoch": 9.446927374301676, + "grad_norm": 0.6354761719703674, + "learning_rate": 0.0005294397759103642, + "loss": 0.4817, + "step": 16910 + }, + { + "epoch": 9.447486033519553, + "grad_norm": 0.797371506690979, + "learning_rate": 0.0005294117647058824, + "loss": 0.6478, + "step": 16911 + }, + { + "epoch": 9.44804469273743, + "grad_norm": 1.0392273664474487, + "learning_rate": 0.0005293837535014005, + "loss": 0.5313, + "step": 16912 + }, + { + "epoch": 9.448603351955308, + "grad_norm": 0.454172819852829, + "learning_rate": 0.0005293557422969187, + "loss": 0.4089, + "step": 16913 + }, + { + "epoch": 9.449162011173184, + "grad_norm": 0.40602946281433105, + "learning_rate": 0.000529327731092437, + "loss": 0.3903, + "step": 16914 + }, + { + "epoch": 9.449720670391061, + "grad_norm": 0.4775692820549011, + "learning_rate": 0.0005292997198879552, + "loss": 0.3848, + "step": 16915 + }, + { + "epoch": 9.45027932960894, + "grad_norm": 1.2314726114273071, + "learning_rate": 0.0005292717086834734, + "loss": 0.3862, + "step": 16916 + }, + { + "epoch": 9.450837988826816, + "grad_norm": 0.5477046966552734, + "learning_rate": 0.0005292436974789915, + "loss": 0.3504, + "step": 16917 + }, + { + "epoch": 9.451396648044692, + "grad_norm": 0.9424893260002136, + "learning_rate": 0.0005292156862745097, + "loss": 0.4473, + "step": 16918 + }, + { + "epoch": 9.451955307262569, + "grad_norm": 0.6837775111198425, + "learning_rate": 0.0005291876750700281, + "loss": 0.5215, + "step": 16919 + }, + { + "epoch": 9.452513966480447, + "grad_norm": 0.43599507212638855, + "learning_rate": 0.0005291596638655463, + "loss": 0.405, + "step": 16920 + }, + { + "epoch": 9.453072625698324, + "grad_norm": 0.609921932220459, + "learning_rate": 0.0005291316526610645, + "loss": 0.494, + "step": 16921 + }, + { + "epoch": 9.4536312849162, + "grad_norm": 0.7851957082748413, + "learning_rate": 0.0005291036414565827, + "loss": 0.4908, + "step": 16922 + }, + { + "epoch": 9.454189944134079, + "grad_norm": 1.0331196784973145, + "learning_rate": 0.0005290756302521008, + "loss": 0.4542, + "step": 16923 + }, + { + "epoch": 9.454748603351955, + "grad_norm": 0.45207321643829346, + "learning_rate": 0.0005290476190476191, + "loss": 0.5175, + "step": 16924 + }, + { + "epoch": 9.455307262569832, + "grad_norm": 0.6439725756645203, + "learning_rate": 0.0005290196078431373, + "loss": 0.3814, + "step": 16925 + }, + { + "epoch": 9.45586592178771, + "grad_norm": 0.7960869073867798, + "learning_rate": 0.0005289915966386555, + "loss": 0.3217, + "step": 16926 + }, + { + "epoch": 9.456424581005587, + "grad_norm": 0.49538466334342957, + "learning_rate": 0.0005289635854341737, + "loss": 0.604, + "step": 16927 + }, + { + "epoch": 9.456983240223463, + "grad_norm": 0.40449148416519165, + "learning_rate": 0.0005289355742296918, + "loss": 0.3721, + "step": 16928 + }, + { + "epoch": 9.45754189944134, + "grad_norm": 0.48132526874542236, + "learning_rate": 0.0005289075630252101, + "loss": 0.4298, + "step": 16929 + }, + { + "epoch": 9.458100558659218, + "grad_norm": 0.49563801288604736, + "learning_rate": 0.0005288795518207283, + "loss": 0.3348, + "step": 16930 + }, + { + "epoch": 9.458659217877095, + "grad_norm": 0.67368483543396, + "learning_rate": 0.0005288515406162465, + "loss": 0.4975, + "step": 16931 + }, + { + "epoch": 9.459217877094972, + "grad_norm": 0.7780280113220215, + "learning_rate": 0.0005288235294117647, + "loss": 0.4339, + "step": 16932 + }, + { + "epoch": 9.45977653631285, + "grad_norm": 1.3457205295562744, + "learning_rate": 0.0005287955182072828, + "loss": 0.4638, + "step": 16933 + }, + { + "epoch": 9.460335195530726, + "grad_norm": 0.7658423185348511, + "learning_rate": 0.0005287675070028012, + "loss": 0.3708, + "step": 16934 + }, + { + "epoch": 9.460893854748603, + "grad_norm": 0.7547198534011841, + "learning_rate": 0.0005287394957983194, + "loss": 0.2732, + "step": 16935 + }, + { + "epoch": 9.461452513966481, + "grad_norm": 0.4372788667678833, + "learning_rate": 0.0005287114845938376, + "loss": 0.3738, + "step": 16936 + }, + { + "epoch": 9.462011173184358, + "grad_norm": 0.4659584164619446, + "learning_rate": 0.0005286834733893558, + "loss": 0.3969, + "step": 16937 + }, + { + "epoch": 9.462569832402234, + "grad_norm": 2.6904969215393066, + "learning_rate": 0.000528655462184874, + "loss": 0.4689, + "step": 16938 + }, + { + "epoch": 9.463128491620111, + "grad_norm": 0.6364842057228088, + "learning_rate": 0.0005286274509803922, + "loss": 0.5047, + "step": 16939 + }, + { + "epoch": 9.46368715083799, + "grad_norm": 6.599497318267822, + "learning_rate": 0.0005285994397759104, + "loss": 0.4468, + "step": 16940 + }, + { + "epoch": 9.464245810055866, + "grad_norm": 0.45465609431266785, + "learning_rate": 0.0005285714285714286, + "loss": 0.4386, + "step": 16941 + }, + { + "epoch": 9.464804469273743, + "grad_norm": 0.5392580032348633, + "learning_rate": 0.0005285434173669468, + "loss": 0.419, + "step": 16942 + }, + { + "epoch": 9.46536312849162, + "grad_norm": 0.5984982252120972, + "learning_rate": 0.000528515406162465, + "loss": 0.4554, + "step": 16943 + }, + { + "epoch": 9.465921787709497, + "grad_norm": 1.6093237400054932, + "learning_rate": 0.0005284873949579832, + "loss": 0.4048, + "step": 16944 + }, + { + "epoch": 9.466480446927374, + "grad_norm": 0.7827469706535339, + "learning_rate": 0.0005284593837535014, + "loss": 0.5575, + "step": 16945 + }, + { + "epoch": 9.46703910614525, + "grad_norm": 0.6151099801063538, + "learning_rate": 0.0005284313725490196, + "loss": 0.3772, + "step": 16946 + }, + { + "epoch": 9.467597765363129, + "grad_norm": 0.4412667453289032, + "learning_rate": 0.0005284033613445378, + "loss": 0.3938, + "step": 16947 + }, + { + "epoch": 9.468156424581005, + "grad_norm": 0.4706019163131714, + "learning_rate": 0.000528375350140056, + "loss": 0.3906, + "step": 16948 + }, + { + "epoch": 9.468715083798882, + "grad_norm": 0.5834082365036011, + "learning_rate": 0.0005283473389355742, + "loss": 0.5105, + "step": 16949 + }, + { + "epoch": 9.46927374301676, + "grad_norm": 0.5690367221832275, + "learning_rate": 0.0005283193277310924, + "loss": 0.4442, + "step": 16950 + }, + { + "epoch": 9.469832402234637, + "grad_norm": 0.49667978286743164, + "learning_rate": 0.0005282913165266107, + "loss": 0.4298, + "step": 16951 + }, + { + "epoch": 9.470391061452514, + "grad_norm": 0.42091116309165955, + "learning_rate": 0.0005282633053221289, + "loss": 0.3867, + "step": 16952 + }, + { + "epoch": 9.470949720670392, + "grad_norm": 1.3130115270614624, + "learning_rate": 0.0005282352941176471, + "loss": 0.5803, + "step": 16953 + }, + { + "epoch": 9.471508379888268, + "grad_norm": 0.6040555834770203, + "learning_rate": 0.0005282072829131654, + "loss": 0.3901, + "step": 16954 + }, + { + "epoch": 9.472067039106145, + "grad_norm": 0.4838673770427704, + "learning_rate": 0.0005281792717086835, + "loss": 0.4193, + "step": 16955 + }, + { + "epoch": 9.472625698324022, + "grad_norm": 0.5954147577285767, + "learning_rate": 0.0005281512605042017, + "loss": 0.349, + "step": 16956 + }, + { + "epoch": 9.4731843575419, + "grad_norm": 0.597293496131897, + "learning_rate": 0.0005281232492997199, + "loss": 0.4784, + "step": 16957 + }, + { + "epoch": 9.473743016759776, + "grad_norm": 0.39264634251594543, + "learning_rate": 0.0005280952380952381, + "loss": 0.3667, + "step": 16958 + }, + { + "epoch": 9.474301675977653, + "grad_norm": 0.47603321075439453, + "learning_rate": 0.0005280672268907563, + "loss": 0.4177, + "step": 16959 + }, + { + "epoch": 9.474860335195531, + "grad_norm": 4.853092193603516, + "learning_rate": 0.0005280392156862745, + "loss": 0.508, + "step": 16960 + }, + { + "epoch": 9.475418994413408, + "grad_norm": 0.4145950973033905, + "learning_rate": 0.0005280112044817927, + "loss": 0.424, + "step": 16961 + }, + { + "epoch": 9.475977653631285, + "grad_norm": 7.030914306640625, + "learning_rate": 0.0005279831932773109, + "loss": 0.4347, + "step": 16962 + }, + { + "epoch": 9.476536312849163, + "grad_norm": 0.7101901769638062, + "learning_rate": 0.0005279551820728291, + "loss": 0.4203, + "step": 16963 + }, + { + "epoch": 9.47709497206704, + "grad_norm": 0.40349534153938293, + "learning_rate": 0.0005279271708683473, + "loss": 0.481, + "step": 16964 + }, + { + "epoch": 9.477653631284916, + "grad_norm": 0.4451386332511902, + "learning_rate": 0.0005278991596638655, + "loss": 0.4566, + "step": 16965 + }, + { + "epoch": 9.478212290502793, + "grad_norm": 0.986737847328186, + "learning_rate": 0.0005278711484593837, + "loss": 0.4478, + "step": 16966 + }, + { + "epoch": 9.478770949720671, + "grad_norm": 0.5118899941444397, + "learning_rate": 0.000527843137254902, + "loss": 0.5561, + "step": 16967 + }, + { + "epoch": 9.479329608938547, + "grad_norm": 0.5588580369949341, + "learning_rate": 0.0005278151260504202, + "loss": 0.4867, + "step": 16968 + }, + { + "epoch": 9.479888268156424, + "grad_norm": 0.4059693515300751, + "learning_rate": 0.0005277871148459384, + "loss": 0.3931, + "step": 16969 + }, + { + "epoch": 9.480446927374302, + "grad_norm": 0.464345782995224, + "learning_rate": 0.0005277591036414567, + "loss": 0.3336, + "step": 16970 + }, + { + "epoch": 9.481005586592179, + "grad_norm": 0.4442157447338104, + "learning_rate": 0.0005277310924369748, + "loss": 0.478, + "step": 16971 + }, + { + "epoch": 9.481564245810056, + "grad_norm": 0.5415831804275513, + "learning_rate": 0.000527703081232493, + "loss": 0.3304, + "step": 16972 + }, + { + "epoch": 9.482122905027932, + "grad_norm": 0.48225876688957214, + "learning_rate": 0.0005276750700280112, + "loss": 0.3886, + "step": 16973 + }, + { + "epoch": 9.48268156424581, + "grad_norm": 0.5581726431846619, + "learning_rate": 0.0005276470588235294, + "loss": 0.4473, + "step": 16974 + }, + { + "epoch": 9.483240223463687, + "grad_norm": 0.6811268329620361, + "learning_rate": 0.0005276190476190477, + "loss": 0.4646, + "step": 16975 + }, + { + "epoch": 9.483798882681564, + "grad_norm": 0.6459947824478149, + "learning_rate": 0.0005275910364145658, + "loss": 0.4539, + "step": 16976 + }, + { + "epoch": 9.484357541899442, + "grad_norm": 0.5975168347358704, + "learning_rate": 0.000527563025210084, + "loss": 0.4654, + "step": 16977 + }, + { + "epoch": 9.484916201117318, + "grad_norm": 0.38554954528808594, + "learning_rate": 0.0005275350140056022, + "loss": 0.3777, + "step": 16978 + }, + { + "epoch": 9.485474860335195, + "grad_norm": 0.6111862063407898, + "learning_rate": 0.0005275070028011204, + "loss": 0.4024, + "step": 16979 + }, + { + "epoch": 9.486033519553073, + "grad_norm": 0.8880608081817627, + "learning_rate": 0.0005274789915966387, + "loss": 0.5205, + "step": 16980 + }, + { + "epoch": 9.48659217877095, + "grad_norm": 0.6676656603813171, + "learning_rate": 0.0005274509803921568, + "loss": 0.439, + "step": 16981 + }, + { + "epoch": 9.487150837988827, + "grad_norm": 0.5042214393615723, + "learning_rate": 0.000527422969187675, + "loss": 0.5277, + "step": 16982 + }, + { + "epoch": 9.487709497206703, + "grad_norm": 0.5696197152137756, + "learning_rate": 0.0005273949579831932, + "loss": 0.4675, + "step": 16983 + }, + { + "epoch": 9.488268156424581, + "grad_norm": 0.3811723291873932, + "learning_rate": 0.0005273669467787115, + "loss": 0.381, + "step": 16984 + }, + { + "epoch": 9.488826815642458, + "grad_norm": 2.059382438659668, + "learning_rate": 0.0005273389355742298, + "loss": 0.3652, + "step": 16985 + }, + { + "epoch": 9.489385474860335, + "grad_norm": 0.6013665795326233, + "learning_rate": 0.000527310924369748, + "loss": 0.2809, + "step": 16986 + }, + { + "epoch": 9.489944134078213, + "grad_norm": 1.5045030117034912, + "learning_rate": 0.0005272829131652661, + "loss": 0.5084, + "step": 16987 + }, + { + "epoch": 9.49050279329609, + "grad_norm": 0.9651634097099304, + "learning_rate": 0.0005272549019607843, + "loss": 0.5106, + "step": 16988 + }, + { + "epoch": 9.491061452513966, + "grad_norm": 0.580617368221283, + "learning_rate": 0.0005272268907563025, + "loss": 0.4587, + "step": 16989 + }, + { + "epoch": 9.491620111731844, + "grad_norm": 0.5041201114654541, + "learning_rate": 0.0005271988795518208, + "loss": 0.3886, + "step": 16990 + }, + { + "epoch": 9.492178770949721, + "grad_norm": 0.5230264663696289, + "learning_rate": 0.000527170868347339, + "loss": 0.4946, + "step": 16991 + }, + { + "epoch": 9.492737430167598, + "grad_norm": 0.358958899974823, + "learning_rate": 0.0005271428571428571, + "loss": 0.4651, + "step": 16992 + }, + { + "epoch": 9.493296089385474, + "grad_norm": 0.5267083048820496, + "learning_rate": 0.0005271148459383753, + "loss": 0.6409, + "step": 16993 + }, + { + "epoch": 9.493854748603352, + "grad_norm": 0.46035170555114746, + "learning_rate": 0.0005270868347338935, + "loss": 0.4324, + "step": 16994 + }, + { + "epoch": 9.494413407821229, + "grad_norm": 0.466959148645401, + "learning_rate": 0.0005270588235294118, + "loss": 0.4284, + "step": 16995 + }, + { + "epoch": 9.494972067039106, + "grad_norm": 0.6545068025588989, + "learning_rate": 0.00052703081232493, + "loss": 0.5019, + "step": 16996 + }, + { + "epoch": 9.495530726256984, + "grad_norm": 0.6382648348808289, + "learning_rate": 0.0005270028011204481, + "loss": 0.5205, + "step": 16997 + }, + { + "epoch": 9.49608938547486, + "grad_norm": 2.8890585899353027, + "learning_rate": 0.0005269747899159663, + "loss": 0.4159, + "step": 16998 + }, + { + "epoch": 9.496648044692737, + "grad_norm": 2.189188241958618, + "learning_rate": 0.0005269467787114845, + "loss": 0.4453, + "step": 16999 + }, + { + "epoch": 9.497206703910614, + "grad_norm": 0.39940744638442993, + "learning_rate": 0.0005269187675070029, + "loss": 0.3932, + "step": 17000 + }, + { + "epoch": 9.497206703910614, + "eval_cer": 0.0885622947421252, + "eval_loss": 0.3346686065196991, + "eval_runtime": 55.465, + "eval_samples_per_second": 81.817, + "eval_steps_per_second": 5.12, + "eval_wer": 0.34962710538811764, + "step": 17000 + }, + { + "epoch": 9.497765363128492, + "grad_norm": 0.8588578701019287, + "learning_rate": 0.0005268907563025211, + "loss": 0.4771, + "step": 17001 + }, + { + "epoch": 9.498324022346369, + "grad_norm": 2.3360366821289062, + "learning_rate": 0.0005268627450980393, + "loss": 0.4208, + "step": 17002 + }, + { + "epoch": 9.498882681564245, + "grad_norm": 0.4767686426639557, + "learning_rate": 0.0005268347338935574, + "loss": 0.4254, + "step": 17003 + }, + { + "epoch": 9.499441340782123, + "grad_norm": 0.5392374396324158, + "learning_rate": 0.0005268067226890756, + "loss": 0.3953, + "step": 17004 + }, + { + "epoch": 9.5, + "grad_norm": 1.5540601015090942, + "learning_rate": 0.0005267787114845939, + "loss": 0.4818, + "step": 17005 + }, + { + "epoch": 9.500558659217877, + "grad_norm": 0.3668883442878723, + "learning_rate": 0.0005267507002801121, + "loss": 0.415, + "step": 17006 + }, + { + "epoch": 9.501117318435755, + "grad_norm": 0.4730549454689026, + "learning_rate": 0.0005267226890756303, + "loss": 0.4503, + "step": 17007 + }, + { + "epoch": 9.501675977653631, + "grad_norm": 0.4590531587600708, + "learning_rate": 0.0005266946778711484, + "loss": 0.4928, + "step": 17008 + }, + { + "epoch": 9.502234636871508, + "grad_norm": 0.5151906609535217, + "learning_rate": 0.0005266666666666666, + "loss": 0.4737, + "step": 17009 + }, + { + "epoch": 9.502793296089386, + "grad_norm": 0.5052680969238281, + "learning_rate": 0.0005266386554621849, + "loss": 0.4757, + "step": 17010 + }, + { + "epoch": 9.503351955307263, + "grad_norm": 1.080153226852417, + "learning_rate": 0.0005266106442577031, + "loss": 0.4144, + "step": 17011 + }, + { + "epoch": 9.50391061452514, + "grad_norm": 0.3803623616695404, + "learning_rate": 0.0005265826330532213, + "loss": 0.355, + "step": 17012 + }, + { + "epoch": 9.504469273743016, + "grad_norm": 1.0341953039169312, + "learning_rate": 0.0005265546218487394, + "loss": 0.335, + "step": 17013 + }, + { + "epoch": 9.505027932960894, + "grad_norm": 0.5581839680671692, + "learning_rate": 0.0005265266106442576, + "loss": 0.3988, + "step": 17014 + }, + { + "epoch": 9.505586592178771, + "grad_norm": 0.5371859073638916, + "learning_rate": 0.000526498599439776, + "loss": 0.5534, + "step": 17015 + }, + { + "epoch": 9.506145251396648, + "grad_norm": 0.7883785963058472, + "learning_rate": 0.0005264705882352942, + "loss": 0.3991, + "step": 17016 + }, + { + "epoch": 9.506703910614526, + "grad_norm": 1.2164338827133179, + "learning_rate": 0.0005264425770308124, + "loss": 0.3963, + "step": 17017 + }, + { + "epoch": 9.507262569832402, + "grad_norm": 0.7702980637550354, + "learning_rate": 0.0005264145658263306, + "loss": 0.4526, + "step": 17018 + }, + { + "epoch": 9.507821229050279, + "grad_norm": 1.0856456756591797, + "learning_rate": 0.0005263865546218487, + "loss": 0.4469, + "step": 17019 + }, + { + "epoch": 9.508379888268156, + "grad_norm": 0.5283039212226868, + "learning_rate": 0.000526358543417367, + "loss": 0.4258, + "step": 17020 + }, + { + "epoch": 9.508938547486034, + "grad_norm": 0.706710696220398, + "learning_rate": 0.0005263305322128852, + "loss": 0.4735, + "step": 17021 + }, + { + "epoch": 9.50949720670391, + "grad_norm": 0.49506470561027527, + "learning_rate": 0.0005263025210084034, + "loss": 0.4785, + "step": 17022 + }, + { + "epoch": 9.510055865921787, + "grad_norm": 0.48370540142059326, + "learning_rate": 0.0005262745098039216, + "loss": 0.4778, + "step": 17023 + }, + { + "epoch": 9.510614525139665, + "grad_norm": 0.5205458402633667, + "learning_rate": 0.0005262464985994397, + "loss": 0.38, + "step": 17024 + }, + { + "epoch": 9.511173184357542, + "grad_norm": 5.673548698425293, + "learning_rate": 0.000526218487394958, + "loss": 0.4341, + "step": 17025 + }, + { + "epoch": 9.511731843575419, + "grad_norm": 0.855028510093689, + "learning_rate": 0.0005261904761904762, + "loss": 0.4253, + "step": 17026 + }, + { + "epoch": 9.512290502793297, + "grad_norm": 0.4802238345146179, + "learning_rate": 0.0005261624649859944, + "loss": 0.606, + "step": 17027 + }, + { + "epoch": 9.512849162011173, + "grad_norm": 0.623631477355957, + "learning_rate": 0.0005261344537815126, + "loss": 0.4405, + "step": 17028 + }, + { + "epoch": 9.51340782122905, + "grad_norm": 0.8969951272010803, + "learning_rate": 0.0005261064425770307, + "loss": 0.4085, + "step": 17029 + }, + { + "epoch": 9.513966480446927, + "grad_norm": 1.3782315254211426, + "learning_rate": 0.000526078431372549, + "loss": 0.5027, + "step": 17030 + }, + { + "epoch": 9.514525139664805, + "grad_norm": 0.5698773860931396, + "learning_rate": 0.0005260504201680672, + "loss": 0.4724, + "step": 17031 + }, + { + "epoch": 9.515083798882682, + "grad_norm": 0.5380564332008362, + "learning_rate": 0.0005260224089635854, + "loss": 0.4539, + "step": 17032 + }, + { + "epoch": 9.515642458100558, + "grad_norm": 0.5562434792518616, + "learning_rate": 0.0005259943977591037, + "loss": 0.4309, + "step": 17033 + }, + { + "epoch": 9.516201117318436, + "grad_norm": 0.5673577189445496, + "learning_rate": 0.0005259663865546219, + "loss": 0.4182, + "step": 17034 + }, + { + "epoch": 9.516759776536313, + "grad_norm": 0.38618579506874084, + "learning_rate": 0.0005259383753501401, + "loss": 0.3485, + "step": 17035 + }, + { + "epoch": 9.51731843575419, + "grad_norm": 0.4398757219314575, + "learning_rate": 0.0005259103641456583, + "loss": 0.5243, + "step": 17036 + }, + { + "epoch": 9.517877094972068, + "grad_norm": 1.4248276948928833, + "learning_rate": 0.0005258823529411765, + "loss": 0.4168, + "step": 17037 + }, + { + "epoch": 9.518435754189944, + "grad_norm": 0.502399742603302, + "learning_rate": 0.0005258543417366947, + "loss": 0.3848, + "step": 17038 + }, + { + "epoch": 9.518994413407821, + "grad_norm": 0.4897719621658325, + "learning_rate": 0.0005258263305322129, + "loss": 0.5654, + "step": 17039 + }, + { + "epoch": 9.519553072625698, + "grad_norm": 1.2701818943023682, + "learning_rate": 0.0005257983193277311, + "loss": 0.4242, + "step": 17040 + }, + { + "epoch": 9.520111731843576, + "grad_norm": 0.4697544574737549, + "learning_rate": 0.0005257703081232493, + "loss": 0.4251, + "step": 17041 + }, + { + "epoch": 9.520670391061453, + "grad_norm": 0.7305477261543274, + "learning_rate": 0.0005257422969187675, + "loss": 0.5089, + "step": 17042 + }, + { + "epoch": 9.521229050279329, + "grad_norm": 0.7140673995018005, + "learning_rate": 0.0005257142857142857, + "loss": 0.4771, + "step": 17043 + }, + { + "epoch": 9.521787709497207, + "grad_norm": 0.4751395881175995, + "learning_rate": 0.0005256862745098039, + "loss": 0.4024, + "step": 17044 + }, + { + "epoch": 9.522346368715084, + "grad_norm": 0.8670386672019958, + "learning_rate": 0.0005256582633053222, + "loss": 0.5812, + "step": 17045 + }, + { + "epoch": 9.52290502793296, + "grad_norm": 0.5611286759376526, + "learning_rate": 0.0005256302521008403, + "loss": 0.4133, + "step": 17046 + }, + { + "epoch": 9.523463687150837, + "grad_norm": 0.6129143238067627, + "learning_rate": 0.0005256022408963585, + "loss": 0.3555, + "step": 17047 + }, + { + "epoch": 9.524022346368715, + "grad_norm": 0.5914970636367798, + "learning_rate": 0.0005255742296918767, + "loss": 0.3675, + "step": 17048 + }, + { + "epoch": 9.524581005586592, + "grad_norm": 0.3674723207950592, + "learning_rate": 0.000525546218487395, + "loss": 0.3392, + "step": 17049 + }, + { + "epoch": 9.525139664804469, + "grad_norm": 0.45090529322624207, + "learning_rate": 0.0005255182072829133, + "loss": 0.3598, + "step": 17050 + }, + { + "epoch": 9.525698324022347, + "grad_norm": 1.039182424545288, + "learning_rate": 0.0005254901960784314, + "loss": 0.5497, + "step": 17051 + }, + { + "epoch": 9.526256983240224, + "grad_norm": 0.5035821199417114, + "learning_rate": 0.0005254621848739496, + "loss": 0.3655, + "step": 17052 + }, + { + "epoch": 9.5268156424581, + "grad_norm": 0.7699378728866577, + "learning_rate": 0.0005254341736694678, + "loss": 0.4663, + "step": 17053 + }, + { + "epoch": 9.527374301675978, + "grad_norm": 1.0760384798049927, + "learning_rate": 0.000525406162464986, + "loss": 0.4905, + "step": 17054 + }, + { + "epoch": 9.527932960893855, + "grad_norm": 0.41430097818374634, + "learning_rate": 0.0005253781512605043, + "loss": 0.4142, + "step": 17055 + }, + { + "epoch": 9.528491620111732, + "grad_norm": 0.4487805664539337, + "learning_rate": 0.0005253501400560224, + "loss": 0.3556, + "step": 17056 + }, + { + "epoch": 9.529050279329608, + "grad_norm": 1.30894136428833, + "learning_rate": 0.0005253221288515406, + "loss": 0.4198, + "step": 17057 + }, + { + "epoch": 9.529608938547486, + "grad_norm": 0.39597150683403015, + "learning_rate": 0.0005252941176470588, + "loss": 0.382, + "step": 17058 + }, + { + "epoch": 9.530167597765363, + "grad_norm": 0.49407562613487244, + "learning_rate": 0.000525266106442577, + "loss": 0.472, + "step": 17059 + }, + { + "epoch": 9.53072625698324, + "grad_norm": 1.321121335029602, + "learning_rate": 0.0005252380952380953, + "loss": 0.5272, + "step": 17060 + }, + { + "epoch": 9.531284916201118, + "grad_norm": 0.5465450286865234, + "learning_rate": 0.0005252100840336135, + "loss": 0.4492, + "step": 17061 + }, + { + "epoch": 9.531843575418995, + "grad_norm": 0.4928973913192749, + "learning_rate": 0.0005251820728291316, + "loss": 0.3634, + "step": 17062 + }, + { + "epoch": 9.532402234636871, + "grad_norm": 0.5599426627159119, + "learning_rate": 0.0005251540616246498, + "loss": 0.5228, + "step": 17063 + }, + { + "epoch": 9.53296089385475, + "grad_norm": 0.40625089406967163, + "learning_rate": 0.000525126050420168, + "loss": 0.3379, + "step": 17064 + }, + { + "epoch": 9.533519553072626, + "grad_norm": 0.6190981268882751, + "learning_rate": 0.0005250980392156864, + "loss": 0.3809, + "step": 17065 + }, + { + "epoch": 9.534078212290503, + "grad_norm": 0.8735527396202087, + "learning_rate": 0.0005250700280112046, + "loss": 0.3654, + "step": 17066 + }, + { + "epoch": 9.53463687150838, + "grad_norm": 0.6221041679382324, + "learning_rate": 0.0005250420168067227, + "loss": 0.5481, + "step": 17067 + }, + { + "epoch": 9.535195530726257, + "grad_norm": 0.8592880964279175, + "learning_rate": 0.0005250140056022409, + "loss": 0.4099, + "step": 17068 + }, + { + "epoch": 9.535754189944134, + "grad_norm": 0.4455025792121887, + "learning_rate": 0.0005249859943977591, + "loss": 0.3824, + "step": 17069 + }, + { + "epoch": 9.53631284916201, + "grad_norm": 0.5507128238677979, + "learning_rate": 0.0005249579831932774, + "loss": 0.4256, + "step": 17070 + }, + { + "epoch": 9.536871508379889, + "grad_norm": 0.3528655171394348, + "learning_rate": 0.0005249299719887956, + "loss": 0.38, + "step": 17071 + }, + { + "epoch": 9.537430167597766, + "grad_norm": 0.5165811777114868, + "learning_rate": 0.0005249019607843137, + "loss": 0.4086, + "step": 17072 + }, + { + "epoch": 9.537988826815642, + "grad_norm": 0.43222153186798096, + "learning_rate": 0.0005248739495798319, + "loss": 0.4373, + "step": 17073 + }, + { + "epoch": 9.538547486033519, + "grad_norm": 0.5648389458656311, + "learning_rate": 0.0005248459383753501, + "loss": 0.5987, + "step": 17074 + }, + { + "epoch": 9.539106145251397, + "grad_norm": 1.7469907999038696, + "learning_rate": 0.0005248179271708684, + "loss": 0.4046, + "step": 17075 + }, + { + "epoch": 9.539664804469274, + "grad_norm": 0.5755770206451416, + "learning_rate": 0.0005247899159663866, + "loss": 0.3985, + "step": 17076 + }, + { + "epoch": 9.54022346368715, + "grad_norm": 1.8374018669128418, + "learning_rate": 0.0005247619047619048, + "loss": 0.6532, + "step": 17077 + }, + { + "epoch": 9.540782122905028, + "grad_norm": 0.4430503249168396, + "learning_rate": 0.0005247338935574229, + "loss": 0.4077, + "step": 17078 + }, + { + "epoch": 9.541340782122905, + "grad_norm": 0.5844117403030396, + "learning_rate": 0.0005247058823529411, + "loss": 0.4748, + "step": 17079 + }, + { + "epoch": 9.541899441340782, + "grad_norm": 0.4782406687736511, + "learning_rate": 0.0005246778711484594, + "loss": 0.4011, + "step": 17080 + }, + { + "epoch": 9.54245810055866, + "grad_norm": 0.9458624720573425, + "learning_rate": 0.0005246498599439777, + "loss": 0.578, + "step": 17081 + }, + { + "epoch": 9.543016759776537, + "grad_norm": 0.7406259775161743, + "learning_rate": 0.0005246218487394959, + "loss": 0.4464, + "step": 17082 + }, + { + "epoch": 9.543575418994413, + "grad_norm": 0.7134833335876465, + "learning_rate": 0.000524593837535014, + "loss": 0.5599, + "step": 17083 + }, + { + "epoch": 9.544134078212291, + "grad_norm": 0.4752214848995209, + "learning_rate": 0.0005245658263305322, + "loss": 0.4252, + "step": 17084 + }, + { + "epoch": 9.544692737430168, + "grad_norm": 0.48502466082572937, + "learning_rate": 0.0005245378151260505, + "loss": 0.4379, + "step": 17085 + }, + { + "epoch": 9.545251396648045, + "grad_norm": 0.7046025395393372, + "learning_rate": 0.0005245098039215687, + "loss": 0.4014, + "step": 17086 + }, + { + "epoch": 9.545810055865921, + "grad_norm": 1.863295555114746, + "learning_rate": 0.0005244817927170869, + "loss": 0.5312, + "step": 17087 + }, + { + "epoch": 9.5463687150838, + "grad_norm": 0.7555891871452332, + "learning_rate": 0.000524453781512605, + "loss": 0.5204, + "step": 17088 + }, + { + "epoch": 9.546927374301676, + "grad_norm": 0.774724006652832, + "learning_rate": 0.0005244257703081232, + "loss": 0.4239, + "step": 17089 + }, + { + "epoch": 9.547486033519553, + "grad_norm": 1.4700146913528442, + "learning_rate": 0.0005243977591036415, + "loss": 0.3992, + "step": 17090 + }, + { + "epoch": 9.548044692737431, + "grad_norm": 1.0987331867218018, + "learning_rate": 0.0005243697478991597, + "loss": 0.5973, + "step": 17091 + }, + { + "epoch": 9.548603351955308, + "grad_norm": 0.526216983795166, + "learning_rate": 0.0005243417366946779, + "loss": 0.4423, + "step": 17092 + }, + { + "epoch": 9.549162011173184, + "grad_norm": 4.453642845153809, + "learning_rate": 0.0005243137254901961, + "loss": 0.5893, + "step": 17093 + }, + { + "epoch": 9.54972067039106, + "grad_norm": 0.529809832572937, + "learning_rate": 0.0005242857142857142, + "loss": 0.525, + "step": 17094 + }, + { + "epoch": 9.550279329608939, + "grad_norm": 0.45865899324417114, + "learning_rate": 0.0005242577030812325, + "loss": 0.2893, + "step": 17095 + }, + { + "epoch": 9.550837988826816, + "grad_norm": 0.36625370383262634, + "learning_rate": 0.0005242296918767507, + "loss": 0.3691, + "step": 17096 + }, + { + "epoch": 9.551396648044692, + "grad_norm": 2.300689220428467, + "learning_rate": 0.000524201680672269, + "loss": 0.5251, + "step": 17097 + }, + { + "epoch": 9.55195530726257, + "grad_norm": 0.7351963520050049, + "learning_rate": 0.0005241736694677872, + "loss": 0.4333, + "step": 17098 + }, + { + "epoch": 9.552513966480447, + "grad_norm": 0.47708648443222046, + "learning_rate": 0.0005241456582633053, + "loss": 0.5072, + "step": 17099 + }, + { + "epoch": 9.553072625698324, + "grad_norm": 0.6173315048217773, + "learning_rate": 0.0005241176470588236, + "loss": 0.5266, + "step": 17100 + }, + { + "epoch": 9.553631284916202, + "grad_norm": 0.5904353260993958, + "learning_rate": 0.0005240896358543418, + "loss": 0.4647, + "step": 17101 + }, + { + "epoch": 9.554189944134079, + "grad_norm": 0.5267318487167358, + "learning_rate": 0.00052406162464986, + "loss": 0.5438, + "step": 17102 + }, + { + "epoch": 9.554748603351955, + "grad_norm": 0.7505891919136047, + "learning_rate": 0.0005240336134453782, + "loss": 0.4739, + "step": 17103 + }, + { + "epoch": 9.555307262569832, + "grad_norm": 2.939276695251465, + "learning_rate": 0.0005240056022408963, + "loss": 0.3609, + "step": 17104 + }, + { + "epoch": 9.55586592178771, + "grad_norm": 0.6280815601348877, + "learning_rate": 0.0005239775910364146, + "loss": 0.4767, + "step": 17105 + }, + { + "epoch": 9.556424581005587, + "grad_norm": 0.7200734615325928, + "learning_rate": 0.0005239495798319328, + "loss": 0.3892, + "step": 17106 + }, + { + "epoch": 9.556983240223463, + "grad_norm": 0.49386996030807495, + "learning_rate": 0.000523921568627451, + "loss": 0.3605, + "step": 17107 + }, + { + "epoch": 9.557541899441341, + "grad_norm": 0.5270695686340332, + "learning_rate": 0.0005238935574229692, + "loss": 0.4552, + "step": 17108 + }, + { + "epoch": 9.558100558659218, + "grad_norm": 0.3849281370639801, + "learning_rate": 0.0005238655462184874, + "loss": 0.3747, + "step": 17109 + }, + { + "epoch": 9.558659217877095, + "grad_norm": 0.9354639649391174, + "learning_rate": 0.0005238375350140056, + "loss": 0.5575, + "step": 17110 + }, + { + "epoch": 9.559217877094973, + "grad_norm": 1.1783463954925537, + "learning_rate": 0.0005238095238095238, + "loss": 0.4527, + "step": 17111 + }, + { + "epoch": 9.55977653631285, + "grad_norm": 0.47555312514305115, + "learning_rate": 0.000523781512605042, + "loss": 0.3406, + "step": 17112 + }, + { + "epoch": 9.560335195530726, + "grad_norm": 1.0341235399246216, + "learning_rate": 0.0005237535014005602, + "loss": 0.5136, + "step": 17113 + }, + { + "epoch": 9.560893854748603, + "grad_norm": 0.45691102743148804, + "learning_rate": 0.0005237254901960784, + "loss": 0.3725, + "step": 17114 + }, + { + "epoch": 9.561452513966481, + "grad_norm": 0.5268739461898804, + "learning_rate": 0.0005236974789915967, + "loss": 0.416, + "step": 17115 + }, + { + "epoch": 9.562011173184358, + "grad_norm": 0.7065299153327942, + "learning_rate": 0.0005236694677871149, + "loss": 0.5814, + "step": 17116 + }, + { + "epoch": 9.562569832402234, + "grad_norm": 0.5363345742225647, + "learning_rate": 0.0005236414565826331, + "loss": 0.46, + "step": 17117 + }, + { + "epoch": 9.563128491620112, + "grad_norm": 0.5881487131118774, + "learning_rate": 0.0005236134453781513, + "loss": 0.4344, + "step": 17118 + }, + { + "epoch": 9.563687150837989, + "grad_norm": 0.5991069674491882, + "learning_rate": 0.0005235854341736695, + "loss": 0.4994, + "step": 17119 + }, + { + "epoch": 9.564245810055866, + "grad_norm": 0.540647566318512, + "learning_rate": 0.0005235574229691877, + "loss": 0.4531, + "step": 17120 + }, + { + "epoch": 9.564804469273742, + "grad_norm": 0.6199446320533752, + "learning_rate": 0.0005235294117647059, + "loss": 0.4049, + "step": 17121 + }, + { + "epoch": 9.56536312849162, + "grad_norm": 0.9489259123802185, + "learning_rate": 0.0005235014005602241, + "loss": 0.4576, + "step": 17122 + }, + { + "epoch": 9.565921787709497, + "grad_norm": 1.1844803094863892, + "learning_rate": 0.0005234733893557423, + "loss": 0.4348, + "step": 17123 + }, + { + "epoch": 9.566480446927374, + "grad_norm": 0.6842337846755981, + "learning_rate": 0.0005234453781512605, + "loss": 0.5164, + "step": 17124 + }, + { + "epoch": 9.567039106145252, + "grad_norm": 0.9702425599098206, + "learning_rate": 0.0005234173669467788, + "loss": 0.5053, + "step": 17125 + }, + { + "epoch": 9.567597765363129, + "grad_norm": 0.4444902539253235, + "learning_rate": 0.0005233893557422969, + "loss": 0.3998, + "step": 17126 + }, + { + "epoch": 9.568156424581005, + "grad_norm": 0.39580947160720825, + "learning_rate": 0.0005233613445378151, + "loss": 0.4246, + "step": 17127 + }, + { + "epoch": 9.568715083798883, + "grad_norm": 0.38838502764701843, + "learning_rate": 0.0005233333333333333, + "loss": 0.3448, + "step": 17128 + }, + { + "epoch": 9.56927374301676, + "grad_norm": 0.529678463935852, + "learning_rate": 0.0005233053221288515, + "loss": 0.5969, + "step": 17129 + }, + { + "epoch": 9.569832402234637, + "grad_norm": 0.5953384041786194, + "learning_rate": 0.0005232773109243699, + "loss": 0.3602, + "step": 17130 + }, + { + "epoch": 9.570391061452513, + "grad_norm": 0.4435705542564392, + "learning_rate": 0.000523249299719888, + "loss": 0.3981, + "step": 17131 + }, + { + "epoch": 9.570949720670392, + "grad_norm": 0.5549498200416565, + "learning_rate": 0.0005232212885154062, + "loss": 0.413, + "step": 17132 + }, + { + "epoch": 9.571508379888268, + "grad_norm": 0.462931364774704, + "learning_rate": 0.0005231932773109244, + "loss": 0.4177, + "step": 17133 + }, + { + "epoch": 9.572067039106145, + "grad_norm": 0.3711908161640167, + "learning_rate": 0.0005231652661064426, + "loss": 0.4648, + "step": 17134 + }, + { + "epoch": 9.572625698324023, + "grad_norm": 0.4939176142215729, + "learning_rate": 0.0005231372549019609, + "loss": 0.4048, + "step": 17135 + }, + { + "epoch": 9.5731843575419, + "grad_norm": 1.4673243761062622, + "learning_rate": 0.000523109243697479, + "loss": 0.5333, + "step": 17136 + }, + { + "epoch": 9.573743016759776, + "grad_norm": 0.443583220243454, + "learning_rate": 0.0005230812324929972, + "loss": 0.305, + "step": 17137 + }, + { + "epoch": 9.574301675977654, + "grad_norm": 0.5470555424690247, + "learning_rate": 0.0005230532212885154, + "loss": 0.4449, + "step": 17138 + }, + { + "epoch": 9.574860335195531, + "grad_norm": 0.40456125140190125, + "learning_rate": 0.0005230252100840336, + "loss": 0.4328, + "step": 17139 + }, + { + "epoch": 9.575418994413408, + "grad_norm": 0.7064840197563171, + "learning_rate": 0.0005229971988795519, + "loss": 0.5424, + "step": 17140 + }, + { + "epoch": 9.575977653631284, + "grad_norm": 0.4466869831085205, + "learning_rate": 0.0005229691876750701, + "loss": 0.4153, + "step": 17141 + }, + { + "epoch": 9.576536312849163, + "grad_norm": 0.5580495595932007, + "learning_rate": 0.0005229411764705882, + "loss": 0.5362, + "step": 17142 + }, + { + "epoch": 9.577094972067039, + "grad_norm": 0.3946962356567383, + "learning_rate": 0.0005229131652661064, + "loss": 0.3823, + "step": 17143 + }, + { + "epoch": 9.577653631284916, + "grad_norm": 0.44697222113609314, + "learning_rate": 0.0005228851540616246, + "loss": 0.4547, + "step": 17144 + }, + { + "epoch": 9.578212290502794, + "grad_norm": 0.6752206087112427, + "learning_rate": 0.0005228571428571429, + "loss": 0.435, + "step": 17145 + }, + { + "epoch": 9.57877094972067, + "grad_norm": 0.4149279296398163, + "learning_rate": 0.0005228291316526611, + "loss": 0.3296, + "step": 17146 + }, + { + "epoch": 9.579329608938547, + "grad_norm": 0.48432645201683044, + "learning_rate": 0.0005228011204481792, + "loss": 0.4652, + "step": 17147 + }, + { + "epoch": 9.579888268156424, + "grad_norm": 0.49763989448547363, + "learning_rate": 0.0005227731092436975, + "loss": 0.4825, + "step": 17148 + }, + { + "epoch": 9.580446927374302, + "grad_norm": 0.5180466771125793, + "learning_rate": 0.0005227450980392157, + "loss": 0.4342, + "step": 17149 + }, + { + "epoch": 9.581005586592179, + "grad_norm": 1.0930254459381104, + "learning_rate": 0.000522717086834734, + "loss": 0.4871, + "step": 17150 + }, + { + "epoch": 9.581564245810055, + "grad_norm": 0.6297570466995239, + "learning_rate": 0.0005226890756302522, + "loss": 0.3721, + "step": 17151 + }, + { + "epoch": 9.582122905027934, + "grad_norm": 0.49666717648506165, + "learning_rate": 0.0005226610644257703, + "loss": 0.4522, + "step": 17152 + }, + { + "epoch": 9.58268156424581, + "grad_norm": 0.4924290180206299, + "learning_rate": 0.0005226330532212885, + "loss": 0.4572, + "step": 17153 + }, + { + "epoch": 9.583240223463687, + "grad_norm": 5.374730587005615, + "learning_rate": 0.0005226050420168067, + "loss": 0.4623, + "step": 17154 + }, + { + "epoch": 9.583798882681565, + "grad_norm": 0.49165159463882446, + "learning_rate": 0.000522577030812325, + "loss": 0.4094, + "step": 17155 + }, + { + "epoch": 9.584357541899442, + "grad_norm": 0.6179744601249695, + "learning_rate": 0.0005225490196078432, + "loss": 0.4683, + "step": 17156 + }, + { + "epoch": 9.584916201117318, + "grad_norm": 0.5643314123153687, + "learning_rate": 0.0005225210084033614, + "loss": 0.3381, + "step": 17157 + }, + { + "epoch": 9.585474860335196, + "grad_norm": 0.4656287431716919, + "learning_rate": 0.0005224929971988795, + "loss": 0.4767, + "step": 17158 + }, + { + "epoch": 9.586033519553073, + "grad_norm": 0.4668651223182678, + "learning_rate": 0.0005224649859943977, + "loss": 0.3958, + "step": 17159 + }, + { + "epoch": 9.58659217877095, + "grad_norm": 0.6359231472015381, + "learning_rate": 0.000522436974789916, + "loss": 0.4359, + "step": 17160 + }, + { + "epoch": 9.587150837988826, + "grad_norm": 0.9883036017417908, + "learning_rate": 0.0005224089635854342, + "loss": 0.5551, + "step": 17161 + }, + { + "epoch": 9.587709497206705, + "grad_norm": 0.4519002139568329, + "learning_rate": 0.0005223809523809524, + "loss": 0.3913, + "step": 17162 + }, + { + "epoch": 9.588268156424581, + "grad_norm": 1.19994056224823, + "learning_rate": 0.0005223529411764705, + "loss": 0.3671, + "step": 17163 + }, + { + "epoch": 9.588826815642458, + "grad_norm": 0.43731117248535156, + "learning_rate": 0.0005223249299719887, + "loss": 0.4964, + "step": 17164 + }, + { + "epoch": 9.589385474860336, + "grad_norm": 1.8858529329299927, + "learning_rate": 0.0005222969187675071, + "loss": 0.3982, + "step": 17165 + }, + { + "epoch": 9.589944134078213, + "grad_norm": 0.5303972363471985, + "learning_rate": 0.0005222689075630253, + "loss": 0.4732, + "step": 17166 + }, + { + "epoch": 9.59050279329609, + "grad_norm": 0.4779343903064728, + "learning_rate": 0.0005222408963585435, + "loss": 0.447, + "step": 17167 + }, + { + "epoch": 9.591061452513966, + "grad_norm": 42.68880844116211, + "learning_rate": 0.0005222128851540616, + "loss": 0.4795, + "step": 17168 + }, + { + "epoch": 9.591620111731844, + "grad_norm": 0.3906610906124115, + "learning_rate": 0.0005221848739495798, + "loss": 0.3446, + "step": 17169 + }, + { + "epoch": 9.59217877094972, + "grad_norm": 0.7117505073547363, + "learning_rate": 0.0005221568627450981, + "loss": 0.4275, + "step": 17170 + }, + { + "epoch": 9.592737430167597, + "grad_norm": 0.7052455544471741, + "learning_rate": 0.0005221288515406163, + "loss": 0.4334, + "step": 17171 + }, + { + "epoch": 9.593296089385476, + "grad_norm": 1.207653522491455, + "learning_rate": 0.0005221008403361345, + "loss": 0.3687, + "step": 17172 + }, + { + "epoch": 9.593854748603352, + "grad_norm": 0.5037881135940552, + "learning_rate": 0.0005220728291316527, + "loss": 0.448, + "step": 17173 + }, + { + "epoch": 9.594413407821229, + "grad_norm": 0.4416349530220032, + "learning_rate": 0.0005220448179271708, + "loss": 0.4845, + "step": 17174 + }, + { + "epoch": 9.594972067039105, + "grad_norm": 0.5492764115333557, + "learning_rate": 0.0005220168067226891, + "loss": 0.4785, + "step": 17175 + }, + { + "epoch": 9.595530726256984, + "grad_norm": 0.4624919891357422, + "learning_rate": 0.0005219887955182073, + "loss": 0.4793, + "step": 17176 + }, + { + "epoch": 9.59608938547486, + "grad_norm": 1.1057995557785034, + "learning_rate": 0.0005219607843137255, + "loss": 0.4647, + "step": 17177 + }, + { + "epoch": 9.596648044692737, + "grad_norm": 0.5127424001693726, + "learning_rate": 0.0005219327731092437, + "loss": 0.4357, + "step": 17178 + }, + { + "epoch": 9.597206703910615, + "grad_norm": 0.39835458993911743, + "learning_rate": 0.0005219047619047618, + "loss": 0.357, + "step": 17179 + }, + { + "epoch": 9.597765363128492, + "grad_norm": 0.816056489944458, + "learning_rate": 0.0005218767507002802, + "loss": 0.3523, + "step": 17180 + }, + { + "epoch": 9.598324022346368, + "grad_norm": 0.8206701874732971, + "learning_rate": 0.0005218487394957984, + "loss": 0.2964, + "step": 17181 + }, + { + "epoch": 9.598882681564247, + "grad_norm": 0.6436958909034729, + "learning_rate": 0.0005218207282913166, + "loss": 0.455, + "step": 17182 + }, + { + "epoch": 9.599441340782123, + "grad_norm": 0.5124699473381042, + "learning_rate": 0.0005217927170868348, + "loss": 0.4362, + "step": 17183 + }, + { + "epoch": 9.6, + "grad_norm": 0.5424758195877075, + "learning_rate": 0.0005217647058823529, + "loss": 0.3933, + "step": 17184 + }, + { + "epoch": 9.600558659217878, + "grad_norm": 0.413993239402771, + "learning_rate": 0.0005217366946778711, + "loss": 0.3832, + "step": 17185 + }, + { + "epoch": 9.601117318435755, + "grad_norm": 0.526695966720581, + "learning_rate": 0.0005217086834733894, + "loss": 0.4115, + "step": 17186 + }, + { + "epoch": 9.601675977653631, + "grad_norm": 0.43858984112739563, + "learning_rate": 0.0005216806722689076, + "loss": 0.472, + "step": 17187 + }, + { + "epoch": 9.602234636871508, + "grad_norm": 0.5752077698707581, + "learning_rate": 0.0005216526610644258, + "loss": 0.5045, + "step": 17188 + }, + { + "epoch": 9.602793296089386, + "grad_norm": 0.6708674430847168, + "learning_rate": 0.000521624649859944, + "loss": 0.4964, + "step": 17189 + }, + { + "epoch": 9.603351955307263, + "grad_norm": 1.5958267450332642, + "learning_rate": 0.0005215966386554621, + "loss": 0.4032, + "step": 17190 + }, + { + "epoch": 9.60391061452514, + "grad_norm": 1.4779036045074463, + "learning_rate": 0.0005215686274509804, + "loss": 0.4673, + "step": 17191 + }, + { + "epoch": 9.604469273743018, + "grad_norm": 0.4911649525165558, + "learning_rate": 0.0005215406162464986, + "loss": 0.4242, + "step": 17192 + }, + { + "epoch": 9.605027932960894, + "grad_norm": 0.37525609135627747, + "learning_rate": 0.0005215126050420168, + "loss": 0.4277, + "step": 17193 + }, + { + "epoch": 9.60558659217877, + "grad_norm": 0.4698725938796997, + "learning_rate": 0.000521484593837535, + "loss": 0.4921, + "step": 17194 + }, + { + "epoch": 9.606145251396647, + "grad_norm": 0.6171064376831055, + "learning_rate": 0.0005214565826330531, + "loss": 0.5934, + "step": 17195 + }, + { + "epoch": 9.606703910614526, + "grad_norm": 0.9733958840370178, + "learning_rate": 0.0005214285714285714, + "loss": 0.5253, + "step": 17196 + }, + { + "epoch": 9.607262569832402, + "grad_norm": 0.3835925757884979, + "learning_rate": 0.0005214005602240897, + "loss": 0.396, + "step": 17197 + }, + { + "epoch": 9.607821229050279, + "grad_norm": 0.6190245151519775, + "learning_rate": 0.0005213725490196079, + "loss": 0.4192, + "step": 17198 + }, + { + "epoch": 9.608379888268157, + "grad_norm": 0.8092804551124573, + "learning_rate": 0.0005213445378151261, + "loss": 0.398, + "step": 17199 + }, + { + "epoch": 9.608938547486034, + "grad_norm": 0.3974584937095642, + "learning_rate": 0.0005213165266106442, + "loss": 0.4114, + "step": 17200 + }, + { + "epoch": 9.60949720670391, + "grad_norm": 0.5593611598014832, + "learning_rate": 0.0005212885154061625, + "loss": 0.3764, + "step": 17201 + }, + { + "epoch": 9.610055865921789, + "grad_norm": 0.6625487804412842, + "learning_rate": 0.0005212605042016807, + "loss": 0.522, + "step": 17202 + }, + { + "epoch": 9.610614525139665, + "grad_norm": 0.46181565523147583, + "learning_rate": 0.0005212324929971989, + "loss": 0.386, + "step": 17203 + }, + { + "epoch": 9.611173184357542, + "grad_norm": 0.5924334526062012, + "learning_rate": 0.0005212044817927171, + "loss": 0.5211, + "step": 17204 + }, + { + "epoch": 9.611731843575418, + "grad_norm": 0.38895201683044434, + "learning_rate": 0.0005211764705882353, + "loss": 0.4185, + "step": 17205 + }, + { + "epoch": 9.612290502793297, + "grad_norm": 0.5148903131484985, + "learning_rate": 0.0005211484593837535, + "loss": 0.3631, + "step": 17206 + }, + { + "epoch": 9.612849162011173, + "grad_norm": 0.6946407556533813, + "learning_rate": 0.0005211204481792717, + "loss": 0.4007, + "step": 17207 + }, + { + "epoch": 9.61340782122905, + "grad_norm": 1.7788351774215698, + "learning_rate": 0.0005210924369747899, + "loss": 0.3872, + "step": 17208 + }, + { + "epoch": 9.613966480446928, + "grad_norm": 0.3721999228000641, + "learning_rate": 0.0005210644257703081, + "loss": 0.4727, + "step": 17209 + }, + { + "epoch": 9.614525139664805, + "grad_norm": 0.7394548654556274, + "learning_rate": 0.0005210364145658263, + "loss": 0.4723, + "step": 17210 + }, + { + "epoch": 9.615083798882681, + "grad_norm": 3.182971239089966, + "learning_rate": 0.0005210084033613445, + "loss": 0.4694, + "step": 17211 + }, + { + "epoch": 9.61564245810056, + "grad_norm": 3.233119010925293, + "learning_rate": 0.0005209803921568627, + "loss": 0.4268, + "step": 17212 + }, + { + "epoch": 9.616201117318436, + "grad_norm": 0.8366853594779968, + "learning_rate": 0.000520952380952381, + "loss": 0.7175, + "step": 17213 + }, + { + "epoch": 9.616759776536313, + "grad_norm": 0.37128975987434387, + "learning_rate": 0.0005209243697478992, + "loss": 0.3626, + "step": 17214 + }, + { + "epoch": 9.61731843575419, + "grad_norm": 0.625723659992218, + "learning_rate": 0.0005208963585434174, + "loss": 0.3871, + "step": 17215 + }, + { + "epoch": 9.617877094972068, + "grad_norm": 0.6446060538291931, + "learning_rate": 0.0005208683473389356, + "loss": 0.3854, + "step": 17216 + }, + { + "epoch": 9.618435754189944, + "grad_norm": 0.935449481010437, + "learning_rate": 0.0005208403361344538, + "loss": 0.3154, + "step": 17217 + }, + { + "epoch": 9.61899441340782, + "grad_norm": 1.6552786827087402, + "learning_rate": 0.000520812324929972, + "loss": 0.3992, + "step": 17218 + }, + { + "epoch": 9.619553072625699, + "grad_norm": 0.48644429445266724, + "learning_rate": 0.0005207843137254902, + "loss": 0.4763, + "step": 17219 + }, + { + "epoch": 9.620111731843576, + "grad_norm": 0.6879404783248901, + "learning_rate": 0.0005207563025210084, + "loss": 0.3049, + "step": 17220 + }, + { + "epoch": 9.620670391061452, + "grad_norm": 0.5403700470924377, + "learning_rate": 0.0005207282913165267, + "loss": 0.4361, + "step": 17221 + }, + { + "epoch": 9.621229050279329, + "grad_norm": 0.7189821004867554, + "learning_rate": 0.0005207002801120448, + "loss": 0.4991, + "step": 17222 + }, + { + "epoch": 9.621787709497207, + "grad_norm": 0.4064416289329529, + "learning_rate": 0.000520672268907563, + "loss": 0.4204, + "step": 17223 + }, + { + "epoch": 9.622346368715084, + "grad_norm": 0.5660318732261658, + "learning_rate": 0.0005206442577030812, + "loss": 0.4846, + "step": 17224 + }, + { + "epoch": 9.62290502793296, + "grad_norm": 0.44163334369659424, + "learning_rate": 0.0005206162464985994, + "loss": 0.3166, + "step": 17225 + }, + { + "epoch": 9.623463687150839, + "grad_norm": 0.4681432843208313, + "learning_rate": 0.0005205882352941177, + "loss": 0.4286, + "step": 17226 + }, + { + "epoch": 9.624022346368715, + "grad_norm": 0.48604559898376465, + "learning_rate": 0.0005205602240896358, + "loss": 0.3907, + "step": 17227 + }, + { + "epoch": 9.624581005586592, + "grad_norm": 12.079716682434082, + "learning_rate": 0.000520532212885154, + "loss": 0.5273, + "step": 17228 + }, + { + "epoch": 9.62513966480447, + "grad_norm": 0.7280187010765076, + "learning_rate": 0.0005205042016806722, + "loss": 0.3899, + "step": 17229 + }, + { + "epoch": 9.625698324022347, + "grad_norm": 0.5819796919822693, + "learning_rate": 0.0005204761904761905, + "loss": 0.4096, + "step": 17230 + }, + { + "epoch": 9.626256983240223, + "grad_norm": 0.588688313961029, + "learning_rate": 0.0005204481792717088, + "loss": 0.4111, + "step": 17231 + }, + { + "epoch": 9.6268156424581, + "grad_norm": 1.4041037559509277, + "learning_rate": 0.0005204201680672269, + "loss": 0.346, + "step": 17232 + }, + { + "epoch": 9.627374301675978, + "grad_norm": 0.6162841320037842, + "learning_rate": 0.0005203921568627451, + "loss": 0.4711, + "step": 17233 + }, + { + "epoch": 9.627932960893855, + "grad_norm": 0.4820128083229065, + "learning_rate": 0.0005203641456582633, + "loss": 0.4651, + "step": 17234 + }, + { + "epoch": 9.628491620111731, + "grad_norm": 0.6277658939361572, + "learning_rate": 0.0005203361344537815, + "loss": 0.4503, + "step": 17235 + }, + { + "epoch": 9.62905027932961, + "grad_norm": 0.44564491510391235, + "learning_rate": 0.0005203081232492998, + "loss": 0.4209, + "step": 17236 + }, + { + "epoch": 9.629608938547486, + "grad_norm": 0.4967097342014313, + "learning_rate": 0.000520280112044818, + "loss": 0.4725, + "step": 17237 + }, + { + "epoch": 9.630167597765363, + "grad_norm": 0.9343072175979614, + "learning_rate": 0.0005202521008403361, + "loss": 0.5744, + "step": 17238 + }, + { + "epoch": 9.630726256983241, + "grad_norm": 0.5217739343643188, + "learning_rate": 0.0005202240896358543, + "loss": 0.3833, + "step": 17239 + }, + { + "epoch": 9.631284916201118, + "grad_norm": 0.458037793636322, + "learning_rate": 0.0005201960784313725, + "loss": 0.3939, + "step": 17240 + }, + { + "epoch": 9.631843575418994, + "grad_norm": 1.8534164428710938, + "learning_rate": 0.0005201680672268908, + "loss": 0.57, + "step": 17241 + }, + { + "epoch": 9.63240223463687, + "grad_norm": 0.7761991024017334, + "learning_rate": 0.000520140056022409, + "loss": 0.5147, + "step": 17242 + }, + { + "epoch": 9.632960893854749, + "grad_norm": 0.6547293663024902, + "learning_rate": 0.0005201120448179271, + "loss": 0.5333, + "step": 17243 + }, + { + "epoch": 9.633519553072626, + "grad_norm": 2.3486833572387695, + "learning_rate": 0.0005200840336134453, + "loss": 0.3496, + "step": 17244 + }, + { + "epoch": 9.634078212290502, + "grad_norm": 0.8718780875205994, + "learning_rate": 0.0005200560224089635, + "loss": 0.4624, + "step": 17245 + }, + { + "epoch": 9.63463687150838, + "grad_norm": 0.4057908058166504, + "learning_rate": 0.0005200280112044819, + "loss": 0.3739, + "step": 17246 + }, + { + "epoch": 9.635195530726257, + "grad_norm": 0.525743305683136, + "learning_rate": 0.0005200000000000001, + "loss": 0.4772, + "step": 17247 + }, + { + "epoch": 9.635754189944134, + "grad_norm": 0.3417477309703827, + "learning_rate": 0.0005199719887955182, + "loss": 0.3156, + "step": 17248 + }, + { + "epoch": 9.63631284916201, + "grad_norm": 0.7415743470191956, + "learning_rate": 0.0005199439775910364, + "loss": 0.3991, + "step": 17249 + }, + { + "epoch": 9.636871508379889, + "grad_norm": 0.6087435483932495, + "learning_rate": 0.0005199159663865546, + "loss": 0.6012, + "step": 17250 + }, + { + "epoch": 9.637430167597765, + "grad_norm": 0.5264108180999756, + "learning_rate": 0.0005198879551820729, + "loss": 0.4477, + "step": 17251 + }, + { + "epoch": 9.637988826815642, + "grad_norm": 0.5705776810646057, + "learning_rate": 0.0005198599439775911, + "loss": 0.375, + "step": 17252 + }, + { + "epoch": 9.63854748603352, + "grad_norm": 3.4019694328308105, + "learning_rate": 0.0005198319327731093, + "loss": 0.5642, + "step": 17253 + }, + { + "epoch": 9.639106145251397, + "grad_norm": 0.4593721330165863, + "learning_rate": 0.0005198039215686274, + "loss": 0.3923, + "step": 17254 + }, + { + "epoch": 9.639664804469273, + "grad_norm": 0.4560600519180298, + "learning_rate": 0.0005197759103641456, + "loss": 0.4236, + "step": 17255 + }, + { + "epoch": 9.640223463687152, + "grad_norm": 0.5203753113746643, + "learning_rate": 0.0005197478991596639, + "loss": 0.3902, + "step": 17256 + }, + { + "epoch": 9.640782122905028, + "grad_norm": 0.9243482351303101, + "learning_rate": 0.0005197198879551821, + "loss": 0.4105, + "step": 17257 + }, + { + "epoch": 9.641340782122905, + "grad_norm": 0.3731384575366974, + "learning_rate": 0.0005196918767507003, + "loss": 0.4162, + "step": 17258 + }, + { + "epoch": 9.641899441340783, + "grad_norm": 0.593356192111969, + "learning_rate": 0.0005196638655462184, + "loss": 0.3857, + "step": 17259 + }, + { + "epoch": 9.64245810055866, + "grad_norm": 0.41375938057899475, + "learning_rate": 0.0005196358543417366, + "loss": 0.4508, + "step": 17260 + }, + { + "epoch": 9.643016759776536, + "grad_norm": 0.42015644907951355, + "learning_rate": 0.000519607843137255, + "loss": 0.3475, + "step": 17261 + }, + { + "epoch": 9.643575418994413, + "grad_norm": 0.6232340931892395, + "learning_rate": 0.0005195798319327732, + "loss": 0.453, + "step": 17262 + }, + { + "epoch": 9.644134078212291, + "grad_norm": 0.8085982203483582, + "learning_rate": 0.0005195518207282914, + "loss": 0.6541, + "step": 17263 + }, + { + "epoch": 9.644692737430168, + "grad_norm": 0.6777638792991638, + "learning_rate": 0.0005195238095238095, + "loss": 0.4783, + "step": 17264 + }, + { + "epoch": 9.645251396648044, + "grad_norm": 0.4959794282913208, + "learning_rate": 0.0005194957983193277, + "loss": 0.5392, + "step": 17265 + }, + { + "epoch": 9.645810055865923, + "grad_norm": 0.49847304821014404, + "learning_rate": 0.000519467787114846, + "loss": 0.4279, + "step": 17266 + }, + { + "epoch": 9.6463687150838, + "grad_norm": 0.38611137866973877, + "learning_rate": 0.0005194397759103642, + "loss": 0.4752, + "step": 17267 + }, + { + "epoch": 9.646927374301676, + "grad_norm": 0.8109422326087952, + "learning_rate": 0.0005194117647058824, + "loss": 0.443, + "step": 17268 + }, + { + "epoch": 9.647486033519552, + "grad_norm": 0.45339465141296387, + "learning_rate": 0.0005193837535014006, + "loss": 0.3606, + "step": 17269 + }, + { + "epoch": 9.64804469273743, + "grad_norm": 0.8560296297073364, + "learning_rate": 0.0005193557422969187, + "loss": 0.4049, + "step": 17270 + }, + { + "epoch": 9.648603351955307, + "grad_norm": 0.48132944107055664, + "learning_rate": 0.000519327731092437, + "loss": 0.526, + "step": 17271 + }, + { + "epoch": 9.649162011173184, + "grad_norm": 1.4822685718536377, + "learning_rate": 0.0005192997198879552, + "loss": 0.6057, + "step": 17272 + }, + { + "epoch": 9.649720670391062, + "grad_norm": 0.43631210923194885, + "learning_rate": 0.0005192717086834734, + "loss": 0.4402, + "step": 17273 + }, + { + "epoch": 9.650279329608939, + "grad_norm": 0.4125272333621979, + "learning_rate": 0.0005192436974789916, + "loss": 0.4166, + "step": 17274 + }, + { + "epoch": 9.650837988826815, + "grad_norm": 0.38469111919403076, + "learning_rate": 0.0005192156862745097, + "loss": 0.4541, + "step": 17275 + }, + { + "epoch": 9.651396648044694, + "grad_norm": 0.4103352725505829, + "learning_rate": 0.000519187675070028, + "loss": 0.2842, + "step": 17276 + }, + { + "epoch": 9.65195530726257, + "grad_norm": 1.4568179845809937, + "learning_rate": 0.0005191596638655462, + "loss": 0.3357, + "step": 17277 + }, + { + "epoch": 9.652513966480447, + "grad_norm": 0.5212461352348328, + "learning_rate": 0.0005191316526610644, + "loss": 0.3724, + "step": 17278 + }, + { + "epoch": 9.653072625698323, + "grad_norm": 0.577918529510498, + "learning_rate": 0.0005191036414565827, + "loss": 0.3594, + "step": 17279 + }, + { + "epoch": 9.653631284916202, + "grad_norm": 0.39440223574638367, + "learning_rate": 0.0005190756302521008, + "loss": 0.4137, + "step": 17280 + }, + { + "epoch": 9.654189944134078, + "grad_norm": 0.4893341362476349, + "learning_rate": 0.0005190476190476191, + "loss": 0.4233, + "step": 17281 + }, + { + "epoch": 9.654748603351955, + "grad_norm": 2.2492904663085938, + "learning_rate": 0.0005190196078431373, + "loss": 0.3848, + "step": 17282 + }, + { + "epoch": 9.655307262569833, + "grad_norm": 0.588296115398407, + "learning_rate": 0.0005189915966386555, + "loss": 0.4553, + "step": 17283 + }, + { + "epoch": 9.65586592178771, + "grad_norm": 0.8972691297531128, + "learning_rate": 0.0005189635854341737, + "loss": 0.457, + "step": 17284 + }, + { + "epoch": 9.656424581005586, + "grad_norm": 0.5906715393066406, + "learning_rate": 0.0005189355742296919, + "loss": 0.4258, + "step": 17285 + }, + { + "epoch": 9.656983240223465, + "grad_norm": 0.42371079325675964, + "learning_rate": 0.0005189075630252101, + "loss": 0.3763, + "step": 17286 + }, + { + "epoch": 9.657541899441341, + "grad_norm": 0.4780693054199219, + "learning_rate": 0.0005188795518207283, + "loss": 0.345, + "step": 17287 + }, + { + "epoch": 9.658100558659218, + "grad_norm": 0.8028094172477722, + "learning_rate": 0.0005188515406162465, + "loss": 0.4291, + "step": 17288 + }, + { + "epoch": 9.658659217877094, + "grad_norm": 0.44257014989852905, + "learning_rate": 0.0005188235294117647, + "loss": 0.4354, + "step": 17289 + }, + { + "epoch": 9.659217877094973, + "grad_norm": 0.7343131303787231, + "learning_rate": 0.0005187955182072829, + "loss": 0.3915, + "step": 17290 + }, + { + "epoch": 9.65977653631285, + "grad_norm": 0.4098905622959137, + "learning_rate": 0.0005187675070028011, + "loss": 0.3703, + "step": 17291 + }, + { + "epoch": 9.660335195530726, + "grad_norm": 0.5763243436813354, + "learning_rate": 0.0005187394957983193, + "loss": 0.3786, + "step": 17292 + }, + { + "epoch": 9.660893854748604, + "grad_norm": 0.4634205996990204, + "learning_rate": 0.0005187114845938375, + "loss": 0.4564, + "step": 17293 + }, + { + "epoch": 9.66145251396648, + "grad_norm": 0.7018575668334961, + "learning_rate": 0.0005186834733893557, + "loss": 0.3865, + "step": 17294 + }, + { + "epoch": 9.662011173184357, + "grad_norm": 0.8497735261917114, + "learning_rate": 0.000518655462184874, + "loss": 0.4725, + "step": 17295 + }, + { + "epoch": 9.662569832402234, + "grad_norm": 1.325561285018921, + "learning_rate": 0.0005186274509803923, + "loss": 0.4575, + "step": 17296 + }, + { + "epoch": 9.663128491620112, + "grad_norm": 0.39585602283477783, + "learning_rate": 0.0005185994397759104, + "loss": 0.4186, + "step": 17297 + }, + { + "epoch": 9.663687150837989, + "grad_norm": 0.9237062931060791, + "learning_rate": 0.0005185714285714286, + "loss": 0.5237, + "step": 17298 + }, + { + "epoch": 9.664245810055865, + "grad_norm": 0.3961396813392639, + "learning_rate": 0.0005185434173669468, + "loss": 0.3538, + "step": 17299 + }, + { + "epoch": 9.664804469273744, + "grad_norm": 0.5346406698226929, + "learning_rate": 0.000518515406162465, + "loss": 0.5811, + "step": 17300 + }, + { + "epoch": 9.66536312849162, + "grad_norm": 0.42963486909866333, + "learning_rate": 0.0005184873949579833, + "loss": 0.3283, + "step": 17301 + }, + { + "epoch": 9.665921787709497, + "grad_norm": 0.6642529368400574, + "learning_rate": 0.0005184593837535014, + "loss": 0.409, + "step": 17302 + }, + { + "epoch": 9.666480446927375, + "grad_norm": 0.5658999085426331, + "learning_rate": 0.0005184313725490196, + "loss": 0.5139, + "step": 17303 + }, + { + "epoch": 9.667039106145252, + "grad_norm": 0.5177510380744934, + "learning_rate": 0.0005184033613445378, + "loss": 0.4558, + "step": 17304 + }, + { + "epoch": 9.667597765363128, + "grad_norm": 0.3965812623500824, + "learning_rate": 0.000518375350140056, + "loss": 0.4795, + "step": 17305 + }, + { + "epoch": 9.668156424581005, + "grad_norm": 0.5368238687515259, + "learning_rate": 0.0005183473389355743, + "loss": 0.5326, + "step": 17306 + }, + { + "epoch": 9.668715083798883, + "grad_norm": 0.4565581679344177, + "learning_rate": 0.0005183193277310924, + "loss": 0.3878, + "step": 17307 + }, + { + "epoch": 9.66927374301676, + "grad_norm": 0.6998317837715149, + "learning_rate": 0.0005182913165266106, + "loss": 0.4608, + "step": 17308 + }, + { + "epoch": 9.669832402234636, + "grad_norm": 1.0472875833511353, + "learning_rate": 0.0005182633053221288, + "loss": 0.4721, + "step": 17309 + }, + { + "epoch": 9.670391061452515, + "grad_norm": 0.7842323184013367, + "learning_rate": 0.000518235294117647, + "loss": 0.5487, + "step": 17310 + }, + { + "epoch": 9.670949720670391, + "grad_norm": 0.49156635999679565, + "learning_rate": 0.0005182072829131654, + "loss": 0.4039, + "step": 17311 + }, + { + "epoch": 9.671508379888268, + "grad_norm": 1.769970417022705, + "learning_rate": 0.0005181792717086836, + "loss": 0.4403, + "step": 17312 + }, + { + "epoch": 9.672067039106146, + "grad_norm": 0.557755172252655, + "learning_rate": 0.0005181512605042017, + "loss": 0.5044, + "step": 17313 + }, + { + "epoch": 9.672625698324023, + "grad_norm": 0.5054857730865479, + "learning_rate": 0.0005181232492997199, + "loss": 0.4111, + "step": 17314 + }, + { + "epoch": 9.6731843575419, + "grad_norm": 0.708062469959259, + "learning_rate": 0.0005180952380952381, + "loss": 0.3802, + "step": 17315 + }, + { + "epoch": 9.673743016759776, + "grad_norm": 0.42805802822113037, + "learning_rate": 0.0005180672268907564, + "loss": 0.36, + "step": 17316 + }, + { + "epoch": 9.674301675977654, + "grad_norm": 0.4069773554801941, + "learning_rate": 0.0005180392156862746, + "loss": 0.4357, + "step": 17317 + }, + { + "epoch": 9.67486033519553, + "grad_norm": 0.676276683807373, + "learning_rate": 0.0005180112044817927, + "loss": 0.4035, + "step": 17318 + }, + { + "epoch": 9.675418994413407, + "grad_norm": 0.4525884985923767, + "learning_rate": 0.0005179831932773109, + "loss": 0.3963, + "step": 17319 + }, + { + "epoch": 9.675977653631286, + "grad_norm": 0.6365451812744141, + "learning_rate": 0.0005179551820728291, + "loss": 0.4527, + "step": 17320 + }, + { + "epoch": 9.676536312849162, + "grad_norm": 0.8492831587791443, + "learning_rate": 0.0005179271708683474, + "loss": 0.5113, + "step": 17321 + }, + { + "epoch": 9.677094972067039, + "grad_norm": 1.2222706079483032, + "learning_rate": 0.0005178991596638656, + "loss": 0.3895, + "step": 17322 + }, + { + "epoch": 9.677653631284915, + "grad_norm": 0.6792795658111572, + "learning_rate": 0.0005178711484593837, + "loss": 0.4578, + "step": 17323 + }, + { + "epoch": 9.678212290502794, + "grad_norm": 0.45879504084587097, + "learning_rate": 0.0005178431372549019, + "loss": 0.5174, + "step": 17324 + }, + { + "epoch": 9.67877094972067, + "grad_norm": 0.6171234846115112, + "learning_rate": 0.0005178151260504201, + "loss": 0.4221, + "step": 17325 + }, + { + "epoch": 9.679329608938547, + "grad_norm": 0.6004194617271423, + "learning_rate": 0.0005177871148459384, + "loss": 0.4285, + "step": 17326 + }, + { + "epoch": 9.679888268156425, + "grad_norm": 0.44632893800735474, + "learning_rate": 0.0005177591036414567, + "loss": 0.3802, + "step": 17327 + }, + { + "epoch": 9.680446927374302, + "grad_norm": 2.143056869506836, + "learning_rate": 0.0005177310924369749, + "loss": 0.4039, + "step": 17328 + }, + { + "epoch": 9.681005586592178, + "grad_norm": 2.0904133319854736, + "learning_rate": 0.000517703081232493, + "loss": 0.4136, + "step": 17329 + }, + { + "epoch": 9.681564245810057, + "grad_norm": 1.096933364868164, + "learning_rate": 0.0005176750700280112, + "loss": 0.3925, + "step": 17330 + }, + { + "epoch": 9.682122905027933, + "grad_norm": 0.3774789869785309, + "learning_rate": 0.0005176470588235295, + "loss": 0.4076, + "step": 17331 + }, + { + "epoch": 9.68268156424581, + "grad_norm": 1.2568482160568237, + "learning_rate": 0.0005176190476190477, + "loss": 0.4481, + "step": 17332 + }, + { + "epoch": 9.683240223463688, + "grad_norm": 0.6713975667953491, + "learning_rate": 0.0005175910364145659, + "loss": 0.4465, + "step": 17333 + }, + { + "epoch": 9.683798882681565, + "grad_norm": 0.47389307618141174, + "learning_rate": 0.000517563025210084, + "loss": 0.4224, + "step": 17334 + }, + { + "epoch": 9.684357541899441, + "grad_norm": 0.4750920832157135, + "learning_rate": 0.0005175350140056022, + "loss": 0.4169, + "step": 17335 + }, + { + "epoch": 9.684916201117318, + "grad_norm": 0.5598821043968201, + "learning_rate": 0.0005175070028011205, + "loss": 0.4247, + "step": 17336 + }, + { + "epoch": 9.685474860335196, + "grad_norm": 0.5696367621421814, + "learning_rate": 0.0005174789915966387, + "loss": 0.4178, + "step": 17337 + }, + { + "epoch": 9.686033519553073, + "grad_norm": 0.6177706718444824, + "learning_rate": 0.0005174509803921569, + "loss": 0.547, + "step": 17338 + }, + { + "epoch": 9.68659217877095, + "grad_norm": 0.40286052227020264, + "learning_rate": 0.000517422969187675, + "loss": 0.3373, + "step": 17339 + }, + { + "epoch": 9.687150837988828, + "grad_norm": 1.664947271347046, + "learning_rate": 0.0005173949579831932, + "loss": 0.4093, + "step": 17340 + }, + { + "epoch": 9.687709497206704, + "grad_norm": 0.4349423348903656, + "learning_rate": 0.0005173669467787115, + "loss": 0.4371, + "step": 17341 + }, + { + "epoch": 9.68826815642458, + "grad_norm": 0.4237612783908844, + "learning_rate": 0.0005173389355742297, + "loss": 0.3985, + "step": 17342 + }, + { + "epoch": 9.688826815642457, + "grad_norm": 0.45765864849090576, + "learning_rate": 0.000517310924369748, + "loss": 0.473, + "step": 17343 + }, + { + "epoch": 9.689385474860336, + "grad_norm": 3.4241554737091064, + "learning_rate": 0.0005172829131652662, + "loss": 0.5189, + "step": 17344 + }, + { + "epoch": 9.689944134078212, + "grad_norm": 0.6101832985877991, + "learning_rate": 0.0005172549019607843, + "loss": 0.3334, + "step": 17345 + }, + { + "epoch": 9.690502793296089, + "grad_norm": 0.43298497796058655, + "learning_rate": 0.0005172268907563026, + "loss": 0.3926, + "step": 17346 + }, + { + "epoch": 9.691061452513967, + "grad_norm": 0.544924795627594, + "learning_rate": 0.0005171988795518208, + "loss": 0.3795, + "step": 17347 + }, + { + "epoch": 9.691620111731844, + "grad_norm": 2.2309019565582275, + "learning_rate": 0.000517170868347339, + "loss": 0.363, + "step": 17348 + }, + { + "epoch": 9.69217877094972, + "grad_norm": 0.43780797719955444, + "learning_rate": 0.0005171428571428572, + "loss": 0.3341, + "step": 17349 + }, + { + "epoch": 9.692737430167599, + "grad_norm": 0.5309510231018066, + "learning_rate": 0.0005171148459383753, + "loss": 0.5498, + "step": 17350 + }, + { + "epoch": 9.693296089385475, + "grad_norm": 0.4866096079349518, + "learning_rate": 0.0005170868347338936, + "loss": 0.4216, + "step": 17351 + }, + { + "epoch": 9.693854748603352, + "grad_norm": 0.4058377742767334, + "learning_rate": 0.0005170588235294118, + "loss": 0.4583, + "step": 17352 + }, + { + "epoch": 9.694413407821228, + "grad_norm": 0.3556528389453888, + "learning_rate": 0.00051703081232493, + "loss": 0.3125, + "step": 17353 + }, + { + "epoch": 9.694972067039107, + "grad_norm": 0.598050057888031, + "learning_rate": 0.0005170028011204482, + "loss": 0.3957, + "step": 17354 + }, + { + "epoch": 9.695530726256983, + "grad_norm": 7.379345893859863, + "learning_rate": 0.0005169747899159663, + "loss": 0.4797, + "step": 17355 + }, + { + "epoch": 9.69608938547486, + "grad_norm": 0.8071146607398987, + "learning_rate": 0.0005169467787114846, + "loss": 0.6773, + "step": 17356 + }, + { + "epoch": 9.696648044692738, + "grad_norm": 0.4843011200428009, + "learning_rate": 0.0005169187675070028, + "loss": 0.4337, + "step": 17357 + }, + { + "epoch": 9.697206703910615, + "grad_norm": 0.4166673421859741, + "learning_rate": 0.000516890756302521, + "loss": 0.4847, + "step": 17358 + }, + { + "epoch": 9.697765363128491, + "grad_norm": 0.6579785346984863, + "learning_rate": 0.0005168627450980392, + "loss": 0.4721, + "step": 17359 + }, + { + "epoch": 9.69832402234637, + "grad_norm": 0.6555850505828857, + "learning_rate": 0.0005168347338935574, + "loss": 0.511, + "step": 17360 + }, + { + "epoch": 9.698882681564246, + "grad_norm": 0.40956735610961914, + "learning_rate": 0.0005168067226890757, + "loss": 0.3491, + "step": 17361 + }, + { + "epoch": 9.699441340782123, + "grad_norm": 0.42788368463516235, + "learning_rate": 0.0005167787114845939, + "loss": 0.3996, + "step": 17362 + }, + { + "epoch": 9.7, + "grad_norm": 0.5142825841903687, + "learning_rate": 0.0005167507002801121, + "loss": 0.4358, + "step": 17363 + }, + { + "epoch": 9.700558659217878, + "grad_norm": 0.4930938184261322, + "learning_rate": 0.0005167226890756303, + "loss": 0.3458, + "step": 17364 + }, + { + "epoch": 9.701117318435754, + "grad_norm": 2.621616840362549, + "learning_rate": 0.0005166946778711485, + "loss": 0.5364, + "step": 17365 + }, + { + "epoch": 9.70167597765363, + "grad_norm": 1.3645325899124146, + "learning_rate": 0.0005166666666666667, + "loss": 0.3791, + "step": 17366 + }, + { + "epoch": 9.702234636871509, + "grad_norm": 0.51255863904953, + "learning_rate": 0.0005166386554621849, + "loss": 0.4757, + "step": 17367 + }, + { + "epoch": 9.702793296089386, + "grad_norm": 0.45767876505851746, + "learning_rate": 0.0005166106442577031, + "loss": 0.5137, + "step": 17368 + }, + { + "epoch": 9.703351955307262, + "grad_norm": 1.3031829595565796, + "learning_rate": 0.0005165826330532213, + "loss": 0.3678, + "step": 17369 + }, + { + "epoch": 9.703910614525139, + "grad_norm": 0.502005934715271, + "learning_rate": 0.0005165546218487395, + "loss": 0.3858, + "step": 17370 + }, + { + "epoch": 9.704469273743017, + "grad_norm": 0.3450477421283722, + "learning_rate": 0.0005165266106442577, + "loss": 0.4116, + "step": 17371 + }, + { + "epoch": 9.705027932960894, + "grad_norm": 0.9023796916007996, + "learning_rate": 0.0005164985994397759, + "loss": 0.3769, + "step": 17372 + }, + { + "epoch": 9.70558659217877, + "grad_norm": 0.36074599623680115, + "learning_rate": 0.0005164705882352941, + "loss": 0.3729, + "step": 17373 + }, + { + "epoch": 9.706145251396649, + "grad_norm": 0.5085729360580444, + "learning_rate": 0.0005164425770308123, + "loss": 0.5042, + "step": 17374 + }, + { + "epoch": 9.706703910614525, + "grad_norm": 0.5425418615341187, + "learning_rate": 0.0005164145658263305, + "loss": 0.4355, + "step": 17375 + }, + { + "epoch": 9.707262569832402, + "grad_norm": 0.4391874074935913, + "learning_rate": 0.0005163865546218489, + "loss": 0.5507, + "step": 17376 + }, + { + "epoch": 9.70782122905028, + "grad_norm": 0.5718971490859985, + "learning_rate": 0.000516358543417367, + "loss": 0.5251, + "step": 17377 + }, + { + "epoch": 9.708379888268157, + "grad_norm": 0.38186365365982056, + "learning_rate": 0.0005163305322128852, + "loss": 0.3621, + "step": 17378 + }, + { + "epoch": 9.708938547486033, + "grad_norm": 0.5581526756286621, + "learning_rate": 0.0005163025210084034, + "loss": 0.4716, + "step": 17379 + }, + { + "epoch": 9.70949720670391, + "grad_norm": 0.6507248282432556, + "learning_rate": 0.0005162745098039216, + "loss": 0.4515, + "step": 17380 + }, + { + "epoch": 9.710055865921788, + "grad_norm": 0.515035092830658, + "learning_rate": 0.0005162464985994399, + "loss": 0.5512, + "step": 17381 + }, + { + "epoch": 9.710614525139665, + "grad_norm": 0.7244021892547607, + "learning_rate": 0.000516218487394958, + "loss": 0.55, + "step": 17382 + }, + { + "epoch": 9.711173184357541, + "grad_norm": 0.5577474236488342, + "learning_rate": 0.0005161904761904762, + "loss": 0.4144, + "step": 17383 + }, + { + "epoch": 9.71173184357542, + "grad_norm": 0.3789259195327759, + "learning_rate": 0.0005161624649859944, + "loss": 0.3352, + "step": 17384 + }, + { + "epoch": 9.712290502793296, + "grad_norm": 0.5367281436920166, + "learning_rate": 0.0005161344537815126, + "loss": 0.4809, + "step": 17385 + }, + { + "epoch": 9.712849162011173, + "grad_norm": 0.37130919098854065, + "learning_rate": 0.0005161064425770309, + "loss": 0.3973, + "step": 17386 + }, + { + "epoch": 9.713407821229051, + "grad_norm": 0.5869512557983398, + "learning_rate": 0.000516078431372549, + "loss": 0.5635, + "step": 17387 + }, + { + "epoch": 9.713966480446928, + "grad_norm": 0.4152495861053467, + "learning_rate": 0.0005160504201680672, + "loss": 0.4131, + "step": 17388 + }, + { + "epoch": 9.714525139664804, + "grad_norm": 0.4453161954879761, + "learning_rate": 0.0005160224089635854, + "loss": 0.4033, + "step": 17389 + }, + { + "epoch": 9.71508379888268, + "grad_norm": 0.4040379226207733, + "learning_rate": 0.0005159943977591036, + "loss": 0.5094, + "step": 17390 + }, + { + "epoch": 9.71564245810056, + "grad_norm": 0.5727993845939636, + "learning_rate": 0.0005159663865546219, + "loss": 0.3878, + "step": 17391 + }, + { + "epoch": 9.716201117318436, + "grad_norm": 0.5577274560928345, + "learning_rate": 0.0005159383753501401, + "loss": 0.4524, + "step": 17392 + }, + { + "epoch": 9.716759776536312, + "grad_norm": 0.6149861812591553, + "learning_rate": 0.0005159103641456582, + "loss": 0.5505, + "step": 17393 + }, + { + "epoch": 9.71731843575419, + "grad_norm": 0.4324537515640259, + "learning_rate": 0.0005158823529411765, + "loss": 0.3578, + "step": 17394 + }, + { + "epoch": 9.717877094972067, + "grad_norm": 0.5104597806930542, + "learning_rate": 0.0005158543417366947, + "loss": 0.3794, + "step": 17395 + }, + { + "epoch": 9.718435754189944, + "grad_norm": 0.7242637872695923, + "learning_rate": 0.000515826330532213, + "loss": 0.5203, + "step": 17396 + }, + { + "epoch": 9.71899441340782, + "grad_norm": 0.49718573689460754, + "learning_rate": 0.0005157983193277312, + "loss": 0.3944, + "step": 17397 + }, + { + "epoch": 9.719553072625699, + "grad_norm": 0.536837100982666, + "learning_rate": 0.0005157703081232493, + "loss": 0.3955, + "step": 17398 + }, + { + "epoch": 9.720111731843575, + "grad_norm": 0.6168854832649231, + "learning_rate": 0.0005157422969187675, + "loss": 0.4174, + "step": 17399 + }, + { + "epoch": 9.720670391061452, + "grad_norm": 0.5200393795967102, + "learning_rate": 0.0005157142857142857, + "loss": 0.4442, + "step": 17400 + }, + { + "epoch": 9.72122905027933, + "grad_norm": 0.3781569302082062, + "learning_rate": 0.000515686274509804, + "loss": 0.3348, + "step": 17401 + }, + { + "epoch": 9.721787709497207, + "grad_norm": 0.3797781765460968, + "learning_rate": 0.0005156582633053222, + "loss": 0.3355, + "step": 17402 + }, + { + "epoch": 9.722346368715083, + "grad_norm": 1.1978373527526855, + "learning_rate": 0.0005156302521008403, + "loss": 0.4886, + "step": 17403 + }, + { + "epoch": 9.722905027932962, + "grad_norm": 3.591677665710449, + "learning_rate": 0.0005156022408963585, + "loss": 0.3189, + "step": 17404 + }, + { + "epoch": 9.723463687150838, + "grad_norm": 0.8176594376564026, + "learning_rate": 0.0005155742296918767, + "loss": 0.4319, + "step": 17405 + }, + { + "epoch": 9.724022346368715, + "grad_norm": 0.7716394662857056, + "learning_rate": 0.0005155462184873949, + "loss": 0.4223, + "step": 17406 + }, + { + "epoch": 9.724581005586593, + "grad_norm": 0.7487101554870605, + "learning_rate": 0.0005155182072829132, + "loss": 0.4126, + "step": 17407 + }, + { + "epoch": 9.72513966480447, + "grad_norm": 0.5277567505836487, + "learning_rate": 0.0005154901960784314, + "loss": 0.373, + "step": 17408 + }, + { + "epoch": 9.725698324022346, + "grad_norm": 0.7035177946090698, + "learning_rate": 0.0005154621848739495, + "loss": 0.4986, + "step": 17409 + }, + { + "epoch": 9.726256983240223, + "grad_norm": 0.4386945366859436, + "learning_rate": 0.0005154341736694677, + "loss": 0.4114, + "step": 17410 + }, + { + "epoch": 9.726815642458101, + "grad_norm": 0.46786728501319885, + "learning_rate": 0.000515406162464986, + "loss": 0.386, + "step": 17411 + }, + { + "epoch": 9.727374301675978, + "grad_norm": 0.76334547996521, + "learning_rate": 0.0005153781512605043, + "loss": 0.4074, + "step": 17412 + }, + { + "epoch": 9.727932960893854, + "grad_norm": 0.8161738514900208, + "learning_rate": 0.0005153501400560225, + "loss": 0.4366, + "step": 17413 + }, + { + "epoch": 9.728491620111733, + "grad_norm": 2.1180057525634766, + "learning_rate": 0.0005153221288515406, + "loss": 0.4547, + "step": 17414 + }, + { + "epoch": 9.72905027932961, + "grad_norm": 1.3848165273666382, + "learning_rate": 0.0005152941176470588, + "loss": 0.4559, + "step": 17415 + }, + { + "epoch": 9.729608938547486, + "grad_norm": 0.6294699311256409, + "learning_rate": 0.000515266106442577, + "loss": 0.6064, + "step": 17416 + }, + { + "epoch": 9.730167597765362, + "grad_norm": 0.6414377093315125, + "learning_rate": 0.0005152380952380953, + "loss": 0.376, + "step": 17417 + }, + { + "epoch": 9.73072625698324, + "grad_norm": 0.5367869138717651, + "learning_rate": 0.0005152100840336135, + "loss": 0.4111, + "step": 17418 + }, + { + "epoch": 9.731284916201117, + "grad_norm": 0.863456130027771, + "learning_rate": 0.0005151820728291316, + "loss": 0.4863, + "step": 17419 + }, + { + "epoch": 9.731843575418994, + "grad_norm": 0.7863894701004028, + "learning_rate": 0.0005151540616246498, + "loss": 0.5944, + "step": 17420 + }, + { + "epoch": 9.732402234636872, + "grad_norm": 0.6741968989372253, + "learning_rate": 0.000515126050420168, + "loss": 0.4391, + "step": 17421 + }, + { + "epoch": 9.732960893854749, + "grad_norm": 1.422831416130066, + "learning_rate": 0.0005150980392156863, + "loss": 0.4722, + "step": 17422 + }, + { + "epoch": 9.733519553072625, + "grad_norm": 0.4514162540435791, + "learning_rate": 0.0005150700280112045, + "loss": 0.4898, + "step": 17423 + }, + { + "epoch": 9.734078212290502, + "grad_norm": 0.6525101661682129, + "learning_rate": 0.0005150420168067227, + "loss": 0.4869, + "step": 17424 + }, + { + "epoch": 9.73463687150838, + "grad_norm": 2.357609272003174, + "learning_rate": 0.0005150140056022408, + "loss": 0.4184, + "step": 17425 + }, + { + "epoch": 9.735195530726257, + "grad_norm": 0.46039775013923645, + "learning_rate": 0.000514985994397759, + "loss": 0.3712, + "step": 17426 + }, + { + "epoch": 9.735754189944133, + "grad_norm": 0.4177815616130829, + "learning_rate": 0.0005149579831932774, + "loss": 0.3431, + "step": 17427 + }, + { + "epoch": 9.736312849162012, + "grad_norm": 0.6000614166259766, + "learning_rate": 0.0005149299719887956, + "loss": 0.4305, + "step": 17428 + }, + { + "epoch": 9.736871508379888, + "grad_norm": 1.6972154378890991, + "learning_rate": 0.0005149019607843138, + "loss": 0.4688, + "step": 17429 + }, + { + "epoch": 9.737430167597765, + "grad_norm": 0.48101353645324707, + "learning_rate": 0.0005148739495798319, + "loss": 0.6671, + "step": 17430 + }, + { + "epoch": 9.737988826815643, + "grad_norm": 0.5486119985580444, + "learning_rate": 0.0005148459383753501, + "loss": 0.4086, + "step": 17431 + }, + { + "epoch": 9.73854748603352, + "grad_norm": 0.8113036751747131, + "learning_rate": 0.0005148179271708684, + "loss": 0.3624, + "step": 17432 + }, + { + "epoch": 9.739106145251396, + "grad_norm": 0.4654565155506134, + "learning_rate": 0.0005147899159663866, + "loss": 0.4304, + "step": 17433 + }, + { + "epoch": 9.739664804469275, + "grad_norm": 0.4594738185405731, + "learning_rate": 0.0005147619047619048, + "loss": 0.4001, + "step": 17434 + }, + { + "epoch": 9.740223463687151, + "grad_norm": 0.40408098697662354, + "learning_rate": 0.0005147338935574229, + "loss": 0.3278, + "step": 17435 + }, + { + "epoch": 9.740782122905028, + "grad_norm": 1.3653596639633179, + "learning_rate": 0.0005147058823529411, + "loss": 0.4319, + "step": 17436 + }, + { + "epoch": 9.741340782122904, + "grad_norm": 0.5705268383026123, + "learning_rate": 0.0005146778711484594, + "loss": 0.4371, + "step": 17437 + }, + { + "epoch": 9.741899441340783, + "grad_norm": 0.5972915887832642, + "learning_rate": 0.0005146498599439776, + "loss": 0.5505, + "step": 17438 + }, + { + "epoch": 9.74245810055866, + "grad_norm": 0.46979445219039917, + "learning_rate": 0.0005146218487394958, + "loss": 0.3926, + "step": 17439 + }, + { + "epoch": 9.743016759776536, + "grad_norm": 0.6100826263427734, + "learning_rate": 0.000514593837535014, + "loss": 0.4634, + "step": 17440 + }, + { + "epoch": 9.743575418994414, + "grad_norm": 0.3186141550540924, + "learning_rate": 0.0005145658263305321, + "loss": 0.3184, + "step": 17441 + }, + { + "epoch": 9.74413407821229, + "grad_norm": 0.3863731622695923, + "learning_rate": 0.0005145378151260504, + "loss": 0.4622, + "step": 17442 + }, + { + "epoch": 9.744692737430167, + "grad_norm": 0.6097526550292969, + "learning_rate": 0.0005145098039215687, + "loss": 0.3044, + "step": 17443 + }, + { + "epoch": 9.745251396648044, + "grad_norm": 1.0645852088928223, + "learning_rate": 0.0005144817927170869, + "loss": 0.3744, + "step": 17444 + }, + { + "epoch": 9.745810055865922, + "grad_norm": 0.6177852153778076, + "learning_rate": 0.0005144537815126051, + "loss": 0.4153, + "step": 17445 + }, + { + "epoch": 9.746368715083799, + "grad_norm": 3.6419007778167725, + "learning_rate": 0.0005144257703081232, + "loss": 0.4129, + "step": 17446 + }, + { + "epoch": 9.746927374301675, + "grad_norm": 0.5785045027732849, + "learning_rate": 0.0005143977591036415, + "loss": 0.3975, + "step": 17447 + }, + { + "epoch": 9.747486033519554, + "grad_norm": 0.6446021795272827, + "learning_rate": 0.0005143697478991597, + "loss": 0.4617, + "step": 17448 + }, + { + "epoch": 9.74804469273743, + "grad_norm": 0.6637579202651978, + "learning_rate": 0.0005143417366946779, + "loss": 0.4362, + "step": 17449 + }, + { + "epoch": 9.748603351955307, + "grad_norm": 0.7729748487472534, + "learning_rate": 0.0005143137254901961, + "loss": 0.5518, + "step": 17450 + }, + { + "epoch": 9.749162011173185, + "grad_norm": 0.3719259798526764, + "learning_rate": 0.0005142857142857142, + "loss": 0.3733, + "step": 17451 + }, + { + "epoch": 9.749720670391062, + "grad_norm": 0.6170015931129456, + "learning_rate": 0.0005142577030812325, + "loss": 0.5065, + "step": 17452 + }, + { + "epoch": 9.750279329608938, + "grad_norm": 6.93314790725708, + "learning_rate": 0.0005142296918767507, + "loss": 0.4768, + "step": 17453 + }, + { + "epoch": 9.750837988826815, + "grad_norm": 1.0931603908538818, + "learning_rate": 0.0005142016806722689, + "loss": 0.504, + "step": 17454 + }, + { + "epoch": 9.751396648044693, + "grad_norm": 1.4975506067276, + "learning_rate": 0.0005141736694677871, + "loss": 0.5809, + "step": 17455 + }, + { + "epoch": 9.75195530726257, + "grad_norm": 0.6435559988021851, + "learning_rate": 0.0005141456582633053, + "loss": 0.4173, + "step": 17456 + }, + { + "epoch": 9.752513966480446, + "grad_norm": 0.7482951283454895, + "learning_rate": 0.0005141176470588235, + "loss": 0.486, + "step": 17457 + }, + { + "epoch": 9.753072625698325, + "grad_norm": 0.4143964946269989, + "learning_rate": 0.0005140896358543417, + "loss": 0.3543, + "step": 17458 + }, + { + "epoch": 9.753631284916201, + "grad_norm": 0.6078479290008545, + "learning_rate": 0.00051406162464986, + "loss": 0.4561, + "step": 17459 + }, + { + "epoch": 9.754189944134078, + "grad_norm": 0.5232478976249695, + "learning_rate": 0.0005140336134453782, + "loss": 0.5092, + "step": 17460 + }, + { + "epoch": 9.754748603351956, + "grad_norm": 0.4944794774055481, + "learning_rate": 0.0005140056022408964, + "loss": 0.3744, + "step": 17461 + }, + { + "epoch": 9.755307262569833, + "grad_norm": 1.5352438688278198, + "learning_rate": 0.0005139775910364146, + "loss": 0.5147, + "step": 17462 + }, + { + "epoch": 9.75586592178771, + "grad_norm": 0.44495660066604614, + "learning_rate": 0.0005139495798319328, + "loss": 0.4241, + "step": 17463 + }, + { + "epoch": 9.756424581005586, + "grad_norm": 0.42115840315818787, + "learning_rate": 0.000513921568627451, + "loss": 0.4605, + "step": 17464 + }, + { + "epoch": 9.756983240223464, + "grad_norm": 0.43991491198539734, + "learning_rate": 0.0005138935574229692, + "loss": 0.3346, + "step": 17465 + }, + { + "epoch": 9.75754189944134, + "grad_norm": 1.1453438997268677, + "learning_rate": 0.0005138655462184874, + "loss": 0.494, + "step": 17466 + }, + { + "epoch": 9.758100558659217, + "grad_norm": 0.45045948028564453, + "learning_rate": 0.0005138375350140056, + "loss": 0.4267, + "step": 17467 + }, + { + "epoch": 9.758659217877096, + "grad_norm": 0.5257158875465393, + "learning_rate": 0.0005138095238095238, + "loss": 0.3169, + "step": 17468 + }, + { + "epoch": 9.759217877094972, + "grad_norm": 0.5546936392784119, + "learning_rate": 0.000513781512605042, + "loss": 0.3687, + "step": 17469 + }, + { + "epoch": 9.759776536312849, + "grad_norm": 0.9328604936599731, + "learning_rate": 0.0005137535014005602, + "loss": 0.3868, + "step": 17470 + }, + { + "epoch": 9.760335195530725, + "grad_norm": 0.4966212511062622, + "learning_rate": 0.0005137254901960784, + "loss": 0.3948, + "step": 17471 + }, + { + "epoch": 9.760893854748604, + "grad_norm": 0.4861254394054413, + "learning_rate": 0.0005136974789915967, + "loss": 0.3569, + "step": 17472 + }, + { + "epoch": 9.76145251396648, + "grad_norm": 1.0003819465637207, + "learning_rate": 0.0005136694677871148, + "loss": 0.3406, + "step": 17473 + }, + { + "epoch": 9.762011173184357, + "grad_norm": 0.7772659659385681, + "learning_rate": 0.000513641456582633, + "loss": 0.4225, + "step": 17474 + }, + { + "epoch": 9.762569832402235, + "grad_norm": 0.4207322597503662, + "learning_rate": 0.0005136134453781512, + "loss": 0.3888, + "step": 17475 + }, + { + "epoch": 9.763128491620112, + "grad_norm": 0.42850035429000854, + "learning_rate": 0.0005135854341736695, + "loss": 0.4259, + "step": 17476 + }, + { + "epoch": 9.763687150837988, + "grad_norm": 0.745561420917511, + "learning_rate": 0.0005135574229691878, + "loss": 0.4161, + "step": 17477 + }, + { + "epoch": 9.764245810055867, + "grad_norm": 0.4545363783836365, + "learning_rate": 0.0005135294117647059, + "loss": 0.48, + "step": 17478 + }, + { + "epoch": 9.764804469273743, + "grad_norm": 0.6444083452224731, + "learning_rate": 0.0005135014005602241, + "loss": 0.3608, + "step": 17479 + }, + { + "epoch": 9.76536312849162, + "grad_norm": 0.5528808236122131, + "learning_rate": 0.0005134733893557423, + "loss": 0.409, + "step": 17480 + }, + { + "epoch": 9.765921787709498, + "grad_norm": 0.4951499402523041, + "learning_rate": 0.0005134453781512605, + "loss": 0.4347, + "step": 17481 + }, + { + "epoch": 9.766480446927375, + "grad_norm": 0.43165549635887146, + "learning_rate": 0.0005134173669467788, + "loss": 0.4015, + "step": 17482 + }, + { + "epoch": 9.767039106145251, + "grad_norm": 0.6858075857162476, + "learning_rate": 0.0005133893557422969, + "loss": 0.3906, + "step": 17483 + }, + { + "epoch": 9.767597765363128, + "grad_norm": 0.4866388440132141, + "learning_rate": 0.0005133613445378151, + "loss": 0.4811, + "step": 17484 + }, + { + "epoch": 9.768156424581006, + "grad_norm": 0.8450616598129272, + "learning_rate": 0.0005133333333333333, + "loss": 0.4078, + "step": 17485 + }, + { + "epoch": 9.768715083798883, + "grad_norm": 0.47369492053985596, + "learning_rate": 0.0005133053221288515, + "loss": 0.46, + "step": 17486 + }, + { + "epoch": 9.76927374301676, + "grad_norm": 0.4902918338775635, + "learning_rate": 0.0005132773109243698, + "loss": 0.3865, + "step": 17487 + }, + { + "epoch": 9.769832402234638, + "grad_norm": 2.542403221130371, + "learning_rate": 0.000513249299719888, + "loss": 0.4601, + "step": 17488 + }, + { + "epoch": 9.770391061452514, + "grad_norm": 0.458379328250885, + "learning_rate": 0.0005132212885154061, + "loss": 0.6388, + "step": 17489 + }, + { + "epoch": 9.77094972067039, + "grad_norm": 0.5277748107910156, + "learning_rate": 0.0005131932773109243, + "loss": 0.4498, + "step": 17490 + }, + { + "epoch": 9.771508379888267, + "grad_norm": 0.46168458461761475, + "learning_rate": 0.0005131652661064425, + "loss": 0.4498, + "step": 17491 + }, + { + "epoch": 9.772067039106146, + "grad_norm": 0.4343564510345459, + "learning_rate": 0.0005131372549019609, + "loss": 0.4392, + "step": 17492 + }, + { + "epoch": 9.772625698324022, + "grad_norm": 0.5236911177635193, + "learning_rate": 0.0005131092436974791, + "loss": 0.4222, + "step": 17493 + }, + { + "epoch": 9.773184357541899, + "grad_norm": 0.5493744015693665, + "learning_rate": 0.0005130812324929972, + "loss": 0.4464, + "step": 17494 + }, + { + "epoch": 9.773743016759777, + "grad_norm": 0.517719566822052, + "learning_rate": 0.0005130532212885154, + "loss": 0.4947, + "step": 17495 + }, + { + "epoch": 9.774301675977654, + "grad_norm": 0.5794305205345154, + "learning_rate": 0.0005130252100840336, + "loss": 0.4684, + "step": 17496 + }, + { + "epoch": 9.77486033519553, + "grad_norm": 0.4486599266529083, + "learning_rate": 0.0005129971988795519, + "loss": 0.3477, + "step": 17497 + }, + { + "epoch": 9.775418994413407, + "grad_norm": 0.7341869473457336, + "learning_rate": 0.0005129691876750701, + "loss": 0.4012, + "step": 17498 + }, + { + "epoch": 9.775977653631285, + "grad_norm": 0.6696038842201233, + "learning_rate": 0.0005129411764705882, + "loss": 0.422, + "step": 17499 + }, + { + "epoch": 9.776536312849162, + "grad_norm": 0.5272493362426758, + "learning_rate": 0.0005129131652661064, + "loss": 0.4425, + "step": 17500 + }, + { + "epoch": 9.776536312849162, + "eval_cer": 0.08913510741601476, + "eval_loss": 0.33675557374954224, + "eval_runtime": 55.6443, + "eval_samples_per_second": 81.554, + "eval_steps_per_second": 5.104, + "eval_wer": 0.35376106812658864, + "step": 17500 + }, + { + "epoch": 9.777094972067038, + "grad_norm": 5.756768226623535, + "learning_rate": 0.0005128851540616246, + "loss": 0.4709, + "step": 17501 + }, + { + "epoch": 9.777653631284917, + "grad_norm": 0.5121567249298096, + "learning_rate": 0.0005128571428571429, + "loss": 0.4846, + "step": 17502 + }, + { + "epoch": 9.778212290502793, + "grad_norm": 0.8362381458282471, + "learning_rate": 0.0005128291316526611, + "loss": 0.4251, + "step": 17503 + }, + { + "epoch": 9.77877094972067, + "grad_norm": 0.5548500418663025, + "learning_rate": 0.0005128011204481793, + "loss": 0.4297, + "step": 17504 + }, + { + "epoch": 9.779329608938548, + "grad_norm": 0.30815228819847107, + "learning_rate": 0.0005127731092436974, + "loss": 0.2953, + "step": 17505 + }, + { + "epoch": 9.779888268156425, + "grad_norm": 0.5616171360015869, + "learning_rate": 0.0005127450980392156, + "loss": 0.3667, + "step": 17506 + }, + { + "epoch": 9.780446927374301, + "grad_norm": 0.4028511047363281, + "learning_rate": 0.000512717086834734, + "loss": 0.4273, + "step": 17507 + }, + { + "epoch": 9.78100558659218, + "grad_norm": 2.9142491817474365, + "learning_rate": 0.0005126890756302522, + "loss": 0.4801, + "step": 17508 + }, + { + "epoch": 9.781564245810056, + "grad_norm": 0.4930936098098755, + "learning_rate": 0.0005126610644257704, + "loss": 0.4564, + "step": 17509 + }, + { + "epoch": 9.782122905027933, + "grad_norm": 0.47975635528564453, + "learning_rate": 0.0005126330532212885, + "loss": 0.467, + "step": 17510 + }, + { + "epoch": 9.78268156424581, + "grad_norm": 0.42697492241859436, + "learning_rate": 0.0005126050420168067, + "loss": 0.381, + "step": 17511 + }, + { + "epoch": 9.783240223463688, + "grad_norm": 1.0264018774032593, + "learning_rate": 0.000512577030812325, + "loss": 0.326, + "step": 17512 + }, + { + "epoch": 9.783798882681564, + "grad_norm": 0.7078700065612793, + "learning_rate": 0.0005125490196078432, + "loss": 0.3513, + "step": 17513 + }, + { + "epoch": 9.78435754189944, + "grad_norm": 0.4582546055316925, + "learning_rate": 0.0005125210084033614, + "loss": 0.3684, + "step": 17514 + }, + { + "epoch": 9.78491620111732, + "grad_norm": 0.4572550654411316, + "learning_rate": 0.0005124929971988795, + "loss": 0.3964, + "step": 17515 + }, + { + "epoch": 9.785474860335196, + "grad_norm": 0.7325780391693115, + "learning_rate": 0.0005124649859943977, + "loss": 0.5065, + "step": 17516 + }, + { + "epoch": 9.786033519553072, + "grad_norm": 0.7576181888580322, + "learning_rate": 0.000512436974789916, + "loss": 0.4888, + "step": 17517 + }, + { + "epoch": 9.786592178770949, + "grad_norm": 0.44088447093963623, + "learning_rate": 0.0005124089635854342, + "loss": 0.4157, + "step": 17518 + }, + { + "epoch": 9.787150837988827, + "grad_norm": 0.4319549798965454, + "learning_rate": 0.0005123809523809524, + "loss": 0.4053, + "step": 17519 + }, + { + "epoch": 9.787709497206704, + "grad_norm": 2.964113473892212, + "learning_rate": 0.0005123529411764706, + "loss": 0.3993, + "step": 17520 + }, + { + "epoch": 9.78826815642458, + "grad_norm": 0.5118347406387329, + "learning_rate": 0.0005123249299719887, + "loss": 0.4678, + "step": 17521 + }, + { + "epoch": 9.788826815642459, + "grad_norm": 0.8929596543312073, + "learning_rate": 0.000512296918767507, + "loss": 0.7208, + "step": 17522 + }, + { + "epoch": 9.789385474860335, + "grad_norm": 0.6069864630699158, + "learning_rate": 0.0005122689075630252, + "loss": 0.4929, + "step": 17523 + }, + { + "epoch": 9.789944134078212, + "grad_norm": 0.7654553651809692, + "learning_rate": 0.0005122408963585434, + "loss": 0.4634, + "step": 17524 + }, + { + "epoch": 9.79050279329609, + "grad_norm": 0.5669147968292236, + "learning_rate": 0.0005122128851540617, + "loss": 0.4851, + "step": 17525 + }, + { + "epoch": 9.791061452513967, + "grad_norm": 0.650214672088623, + "learning_rate": 0.0005121848739495798, + "loss": 0.5101, + "step": 17526 + }, + { + "epoch": 9.791620111731843, + "grad_norm": 0.6564046144485474, + "learning_rate": 0.0005121568627450981, + "loss": 0.4136, + "step": 17527 + }, + { + "epoch": 9.79217877094972, + "grad_norm": 0.5590186715126038, + "learning_rate": 0.0005121288515406163, + "loss": 0.5181, + "step": 17528 + }, + { + "epoch": 9.792737430167598, + "grad_norm": 0.6540253758430481, + "learning_rate": 0.0005121008403361345, + "loss": 0.5555, + "step": 17529 + }, + { + "epoch": 9.793296089385475, + "grad_norm": 1.2710096836090088, + "learning_rate": 0.0005120728291316527, + "loss": 0.5116, + "step": 17530 + }, + { + "epoch": 9.793854748603351, + "grad_norm": 0.5380557775497437, + "learning_rate": 0.0005120448179271708, + "loss": 0.4605, + "step": 17531 + }, + { + "epoch": 9.79441340782123, + "grad_norm": 0.8932152986526489, + "learning_rate": 0.0005120168067226891, + "loss": 0.4825, + "step": 17532 + }, + { + "epoch": 9.794972067039106, + "grad_norm": 0.41872331500053406, + "learning_rate": 0.0005119887955182073, + "loss": 0.4765, + "step": 17533 + }, + { + "epoch": 9.795530726256983, + "grad_norm": 2.593411445617676, + "learning_rate": 0.0005119607843137255, + "loss": 0.4638, + "step": 17534 + }, + { + "epoch": 9.796089385474861, + "grad_norm": 1.1170240640640259, + "learning_rate": 0.0005119327731092437, + "loss": 0.4862, + "step": 17535 + }, + { + "epoch": 9.796648044692738, + "grad_norm": 0.7942990064620972, + "learning_rate": 0.0005119047619047619, + "loss": 0.4025, + "step": 17536 + }, + { + "epoch": 9.797206703910614, + "grad_norm": 0.38618552684783936, + "learning_rate": 0.0005118767507002801, + "loss": 0.3998, + "step": 17537 + }, + { + "epoch": 9.797765363128491, + "grad_norm": 0.5663967132568359, + "learning_rate": 0.0005118487394957983, + "loss": 0.3549, + "step": 17538 + }, + { + "epoch": 9.79832402234637, + "grad_norm": 0.4338151812553406, + "learning_rate": 0.0005118207282913165, + "loss": 0.5271, + "step": 17539 + }, + { + "epoch": 9.798882681564246, + "grad_norm": 0.946584165096283, + "learning_rate": 0.0005117927170868347, + "loss": 0.5013, + "step": 17540 + }, + { + "epoch": 9.799441340782122, + "grad_norm": 0.5233994722366333, + "learning_rate": 0.000511764705882353, + "loss": 0.4657, + "step": 17541 + }, + { + "epoch": 9.8, + "grad_norm": 0.5305754542350769, + "learning_rate": 0.0005117366946778712, + "loss": 0.4774, + "step": 17542 + }, + { + "epoch": 9.800558659217877, + "grad_norm": 0.564389169216156, + "learning_rate": 0.0005117086834733894, + "loss": 0.4033, + "step": 17543 + }, + { + "epoch": 9.801117318435754, + "grad_norm": 0.37814152240753174, + "learning_rate": 0.0005116806722689076, + "loss": 0.3245, + "step": 17544 + }, + { + "epoch": 9.80167597765363, + "grad_norm": 0.4387277364730835, + "learning_rate": 0.0005116526610644258, + "loss": 0.4064, + "step": 17545 + }, + { + "epoch": 9.802234636871509, + "grad_norm": 0.5256638526916504, + "learning_rate": 0.000511624649859944, + "loss": 0.3621, + "step": 17546 + }, + { + "epoch": 9.802793296089385, + "grad_norm": 0.40469667315483093, + "learning_rate": 0.0005115966386554623, + "loss": 0.4084, + "step": 17547 + }, + { + "epoch": 9.803351955307262, + "grad_norm": 0.6191628575325012, + "learning_rate": 0.0005115686274509804, + "loss": 0.3368, + "step": 17548 + }, + { + "epoch": 9.80391061452514, + "grad_norm": 0.3349907100200653, + "learning_rate": 0.0005115406162464986, + "loss": 0.3558, + "step": 17549 + }, + { + "epoch": 9.804469273743017, + "grad_norm": 0.7227099537849426, + "learning_rate": 0.0005115126050420168, + "loss": 0.4658, + "step": 17550 + }, + { + "epoch": 9.805027932960893, + "grad_norm": 0.33272337913513184, + "learning_rate": 0.000511484593837535, + "loss": 0.3493, + "step": 17551 + }, + { + "epoch": 9.805586592178772, + "grad_norm": 0.6169582009315491, + "learning_rate": 0.0005114565826330533, + "loss": 0.4483, + "step": 17552 + }, + { + "epoch": 9.806145251396648, + "grad_norm": 0.842114269733429, + "learning_rate": 0.0005114285714285714, + "loss": 0.4186, + "step": 17553 + }, + { + "epoch": 9.806703910614525, + "grad_norm": 1.0288182497024536, + "learning_rate": 0.0005114005602240896, + "loss": 0.4989, + "step": 17554 + }, + { + "epoch": 9.807262569832401, + "grad_norm": 0.45129239559173584, + "learning_rate": 0.0005113725490196078, + "loss": 0.4613, + "step": 17555 + }, + { + "epoch": 9.80782122905028, + "grad_norm": 0.5221298336982727, + "learning_rate": 0.000511344537815126, + "loss": 0.428, + "step": 17556 + }, + { + "epoch": 9.808379888268156, + "grad_norm": 0.5256417989730835, + "learning_rate": 0.0005113165266106444, + "loss": 0.5703, + "step": 17557 + }, + { + "epoch": 9.808938547486033, + "grad_norm": 0.398550808429718, + "learning_rate": 0.0005112885154061625, + "loss": 0.4125, + "step": 17558 + }, + { + "epoch": 9.809497206703911, + "grad_norm": 1.0827676057815552, + "learning_rate": 0.0005112605042016807, + "loss": 0.3728, + "step": 17559 + }, + { + "epoch": 9.810055865921788, + "grad_norm": 0.6408460736274719, + "learning_rate": 0.0005112324929971989, + "loss": 0.3541, + "step": 17560 + }, + { + "epoch": 9.810614525139664, + "grad_norm": 0.4745464026927948, + "learning_rate": 0.0005112044817927171, + "loss": 0.481, + "step": 17561 + }, + { + "epoch": 9.811173184357543, + "grad_norm": 0.8619210720062256, + "learning_rate": 0.0005111764705882354, + "loss": 0.4815, + "step": 17562 + }, + { + "epoch": 9.81173184357542, + "grad_norm": 0.5621064305305481, + "learning_rate": 0.0005111484593837536, + "loss": 0.5229, + "step": 17563 + }, + { + "epoch": 9.812290502793296, + "grad_norm": 0.6314519643783569, + "learning_rate": 0.0005111204481792717, + "loss": 0.525, + "step": 17564 + }, + { + "epoch": 9.812849162011172, + "grad_norm": 0.5272702574729919, + "learning_rate": 0.0005110924369747899, + "loss": 0.4177, + "step": 17565 + }, + { + "epoch": 9.81340782122905, + "grad_norm": 1.2172542810440063, + "learning_rate": 0.0005110644257703081, + "loss": 0.5238, + "step": 17566 + }, + { + "epoch": 9.813966480446927, + "grad_norm": 1.2464280128479004, + "learning_rate": 0.0005110364145658264, + "loss": 0.522, + "step": 17567 + }, + { + "epoch": 9.814525139664804, + "grad_norm": 0.6002777218818665, + "learning_rate": 0.0005110084033613446, + "loss": 0.4432, + "step": 17568 + }, + { + "epoch": 9.815083798882682, + "grad_norm": 1.5450071096420288, + "learning_rate": 0.0005109803921568627, + "loss": 0.4427, + "step": 17569 + }, + { + "epoch": 9.815642458100559, + "grad_norm": 0.8230819702148438, + "learning_rate": 0.0005109523809523809, + "loss": 0.539, + "step": 17570 + }, + { + "epoch": 9.816201117318435, + "grad_norm": 0.7993802428245544, + "learning_rate": 0.0005109243697478991, + "loss": 0.4118, + "step": 17571 + }, + { + "epoch": 9.816759776536312, + "grad_norm": 0.5523188710212708, + "learning_rate": 0.0005108963585434174, + "loss": 0.5507, + "step": 17572 + }, + { + "epoch": 9.81731843575419, + "grad_norm": 0.6646541357040405, + "learning_rate": 0.0005108683473389357, + "loss": 0.5483, + "step": 17573 + }, + { + "epoch": 9.817877094972067, + "grad_norm": 0.49421900510787964, + "learning_rate": 0.0005108403361344537, + "loss": 0.4018, + "step": 17574 + }, + { + "epoch": 9.818435754189943, + "grad_norm": 0.4183419644832611, + "learning_rate": 0.000510812324929972, + "loss": 0.3837, + "step": 17575 + }, + { + "epoch": 9.818994413407822, + "grad_norm": 0.6737042665481567, + "learning_rate": 0.0005107843137254902, + "loss": 0.5087, + "step": 17576 + }, + { + "epoch": 9.819553072625698, + "grad_norm": 1.118836522102356, + "learning_rate": 0.0005107563025210085, + "loss": 0.503, + "step": 17577 + }, + { + "epoch": 9.820111731843575, + "grad_norm": 0.5079019069671631, + "learning_rate": 0.0005107282913165267, + "loss": 0.4871, + "step": 17578 + }, + { + "epoch": 9.820670391061453, + "grad_norm": 0.6374537944793701, + "learning_rate": 0.0005107002801120449, + "loss": 0.4988, + "step": 17579 + }, + { + "epoch": 9.82122905027933, + "grad_norm": 0.443655401468277, + "learning_rate": 0.000510672268907563, + "loss": 0.4187, + "step": 17580 + }, + { + "epoch": 9.821787709497206, + "grad_norm": 0.44134342670440674, + "learning_rate": 0.0005106442577030812, + "loss": 0.4351, + "step": 17581 + }, + { + "epoch": 9.822346368715085, + "grad_norm": 0.5324344635009766, + "learning_rate": 0.0005106162464985995, + "loss": 0.4203, + "step": 17582 + }, + { + "epoch": 9.822905027932961, + "grad_norm": 0.5665680766105652, + "learning_rate": 0.0005105882352941177, + "loss": 0.5292, + "step": 17583 + }, + { + "epoch": 9.823463687150838, + "grad_norm": 1.1033124923706055, + "learning_rate": 0.0005105602240896359, + "loss": 0.4381, + "step": 17584 + }, + { + "epoch": 9.824022346368714, + "grad_norm": 0.8590704202651978, + "learning_rate": 0.000510532212885154, + "loss": 0.5774, + "step": 17585 + }, + { + "epoch": 9.824581005586593, + "grad_norm": 0.9189087152481079, + "learning_rate": 0.0005105042016806722, + "loss": 0.4903, + "step": 17586 + }, + { + "epoch": 9.82513966480447, + "grad_norm": 0.6272187232971191, + "learning_rate": 0.0005104761904761905, + "loss": 0.4192, + "step": 17587 + }, + { + "epoch": 9.825698324022346, + "grad_norm": 1.2049518823623657, + "learning_rate": 0.0005104481792717087, + "loss": 0.4588, + "step": 17588 + }, + { + "epoch": 9.826256983240224, + "grad_norm": 0.714314877986908, + "learning_rate": 0.000510420168067227, + "loss": 0.4723, + "step": 17589 + }, + { + "epoch": 9.8268156424581, + "grad_norm": 2.792663335800171, + "learning_rate": 0.000510392156862745, + "loss": 0.4281, + "step": 17590 + }, + { + "epoch": 9.827374301675977, + "grad_norm": 0.49839502573013306, + "learning_rate": 0.0005103641456582633, + "loss": 0.4482, + "step": 17591 + }, + { + "epoch": 9.827932960893854, + "grad_norm": 0.45128345489501953, + "learning_rate": 0.0005103361344537816, + "loss": 0.4066, + "step": 17592 + }, + { + "epoch": 9.828491620111732, + "grad_norm": 2.272136926651001, + "learning_rate": 0.0005103081232492998, + "loss": 0.6903, + "step": 17593 + }, + { + "epoch": 9.829050279329609, + "grad_norm": 0.6582977175712585, + "learning_rate": 0.000510280112044818, + "loss": 0.5057, + "step": 17594 + }, + { + "epoch": 9.829608938547485, + "grad_norm": 0.5356632471084595, + "learning_rate": 0.0005102521008403362, + "loss": 0.3518, + "step": 17595 + }, + { + "epoch": 9.830167597765364, + "grad_norm": 0.5593933463096619, + "learning_rate": 0.0005102240896358543, + "loss": 0.4968, + "step": 17596 + }, + { + "epoch": 9.83072625698324, + "grad_norm": 0.382621705532074, + "learning_rate": 0.0005101960784313726, + "loss": 0.3852, + "step": 17597 + }, + { + "epoch": 9.831284916201117, + "grad_norm": 0.6879888772964478, + "learning_rate": 0.0005101680672268908, + "loss": 0.4928, + "step": 17598 + }, + { + "epoch": 9.831843575418995, + "grad_norm": 1.9961494207382202, + "learning_rate": 0.000510140056022409, + "loss": 0.442, + "step": 17599 + }, + { + "epoch": 9.832402234636872, + "grad_norm": 0.42393195629119873, + "learning_rate": 0.0005101120448179272, + "loss": 0.3717, + "step": 17600 + }, + { + "epoch": 9.832960893854748, + "grad_norm": 0.7105714678764343, + "learning_rate": 0.0005100840336134453, + "loss": 0.6116, + "step": 17601 + }, + { + "epoch": 9.833519553072625, + "grad_norm": 0.38168278336524963, + "learning_rate": 0.0005100560224089636, + "loss": 0.4296, + "step": 17602 + }, + { + "epoch": 9.834078212290503, + "grad_norm": 3.837827444076538, + "learning_rate": 0.0005100280112044818, + "loss": 0.5037, + "step": 17603 + }, + { + "epoch": 9.83463687150838, + "grad_norm": 0.43454086780548096, + "learning_rate": 0.00051, + "loss": 0.4355, + "step": 17604 + }, + { + "epoch": 9.835195530726256, + "grad_norm": 0.48860910534858704, + "learning_rate": 0.0005099719887955182, + "loss": 0.4094, + "step": 17605 + }, + { + "epoch": 9.835754189944135, + "grad_norm": 1.1021904945373535, + "learning_rate": 0.0005099439775910363, + "loss": 0.4156, + "step": 17606 + }, + { + "epoch": 9.836312849162011, + "grad_norm": 0.42669644951820374, + "learning_rate": 0.0005099159663865547, + "loss": 0.4312, + "step": 17607 + }, + { + "epoch": 9.836871508379888, + "grad_norm": 1.3057827949523926, + "learning_rate": 0.0005098879551820729, + "loss": 0.3712, + "step": 17608 + }, + { + "epoch": 9.837430167597766, + "grad_norm": 0.5693615674972534, + "learning_rate": 0.0005098599439775911, + "loss": 0.4627, + "step": 17609 + }, + { + "epoch": 9.837988826815643, + "grad_norm": 0.47908318042755127, + "learning_rate": 0.0005098319327731093, + "loss": 0.4758, + "step": 17610 + }, + { + "epoch": 9.83854748603352, + "grad_norm": 1.0456914901733398, + "learning_rate": 0.0005098039215686275, + "loss": 0.4804, + "step": 17611 + }, + { + "epoch": 9.839106145251396, + "grad_norm": 0.5318778157234192, + "learning_rate": 0.0005097759103641457, + "loss": 0.4948, + "step": 17612 + }, + { + "epoch": 9.839664804469274, + "grad_norm": 0.7408348321914673, + "learning_rate": 0.0005097478991596639, + "loss": 0.5706, + "step": 17613 + }, + { + "epoch": 9.84022346368715, + "grad_norm": 0.38129833340644836, + "learning_rate": 0.0005097198879551821, + "loss": 0.4359, + "step": 17614 + }, + { + "epoch": 9.840782122905027, + "grad_norm": 0.4278886020183563, + "learning_rate": 0.0005096918767507003, + "loss": 0.4706, + "step": 17615 + }, + { + "epoch": 9.841340782122906, + "grad_norm": 0.5908858776092529, + "learning_rate": 0.0005096638655462185, + "loss": 0.4086, + "step": 17616 + }, + { + "epoch": 9.841899441340782, + "grad_norm": 0.463882714509964, + "learning_rate": 0.0005096358543417367, + "loss": 0.4633, + "step": 17617 + }, + { + "epoch": 9.842458100558659, + "grad_norm": 0.5576109290122986, + "learning_rate": 0.0005096078431372549, + "loss": 0.4767, + "step": 17618 + }, + { + "epoch": 9.843016759776535, + "grad_norm": 2.875582695007324, + "learning_rate": 0.0005095798319327731, + "loss": 0.4883, + "step": 17619 + }, + { + "epoch": 9.843575418994414, + "grad_norm": 1.7783489227294922, + "learning_rate": 0.0005095518207282913, + "loss": 0.5467, + "step": 17620 + }, + { + "epoch": 9.84413407821229, + "grad_norm": 0.44130298495292664, + "learning_rate": 0.0005095238095238095, + "loss": 0.3833, + "step": 17621 + }, + { + "epoch": 9.844692737430167, + "grad_norm": 0.5307114124298096, + "learning_rate": 0.0005094957983193277, + "loss": 0.4824, + "step": 17622 + }, + { + "epoch": 9.845251396648045, + "grad_norm": 0.5682210326194763, + "learning_rate": 0.000509467787114846, + "loss": 0.393, + "step": 17623 + }, + { + "epoch": 9.845810055865922, + "grad_norm": 0.5218472480773926, + "learning_rate": 0.0005094397759103642, + "loss": 0.3546, + "step": 17624 + }, + { + "epoch": 9.846368715083798, + "grad_norm": 0.38048240542411804, + "learning_rate": 0.0005094117647058824, + "loss": 0.3798, + "step": 17625 + }, + { + "epoch": 9.846927374301677, + "grad_norm": 0.3587186932563782, + "learning_rate": 0.0005093837535014006, + "loss": 0.4206, + "step": 17626 + }, + { + "epoch": 9.847486033519553, + "grad_norm": 0.9400306940078735, + "learning_rate": 0.0005093557422969188, + "loss": 0.5953, + "step": 17627 + }, + { + "epoch": 9.84804469273743, + "grad_norm": 0.46474337577819824, + "learning_rate": 0.000509327731092437, + "loss": 0.3993, + "step": 17628 + }, + { + "epoch": 9.848603351955306, + "grad_norm": 0.6591094732284546, + "learning_rate": 0.0005092997198879552, + "loss": 0.4134, + "step": 17629 + }, + { + "epoch": 9.849162011173185, + "grad_norm": 2.574821710586548, + "learning_rate": 0.0005092717086834734, + "loss": 0.4574, + "step": 17630 + }, + { + "epoch": 9.849720670391061, + "grad_norm": 0.8347105979919434, + "learning_rate": 0.0005092436974789916, + "loss": 0.5108, + "step": 17631 + }, + { + "epoch": 9.850279329608938, + "grad_norm": 0.49660438299179077, + "learning_rate": 0.0005092156862745098, + "loss": 0.4522, + "step": 17632 + }, + { + "epoch": 9.850837988826816, + "grad_norm": 0.4733448028564453, + "learning_rate": 0.000509187675070028, + "loss": 0.5828, + "step": 17633 + }, + { + "epoch": 9.851396648044693, + "grad_norm": 0.5776231288909912, + "learning_rate": 0.0005091596638655462, + "loss": 0.382, + "step": 17634 + }, + { + "epoch": 9.85195530726257, + "grad_norm": 0.44750189781188965, + "learning_rate": 0.0005091316526610644, + "loss": 0.4602, + "step": 17635 + }, + { + "epoch": 9.852513966480448, + "grad_norm": 0.5970128178596497, + "learning_rate": 0.0005091036414565826, + "loss": 0.4939, + "step": 17636 + }, + { + "epoch": 9.853072625698324, + "grad_norm": 0.9238325953483582, + "learning_rate": 0.0005090756302521008, + "loss": 0.4096, + "step": 17637 + }, + { + "epoch": 9.8536312849162, + "grad_norm": 1.023821234703064, + "learning_rate": 0.000509047619047619, + "loss": 0.4228, + "step": 17638 + }, + { + "epoch": 9.854189944134077, + "grad_norm": 0.6652180552482605, + "learning_rate": 0.0005090196078431372, + "loss": 0.5411, + "step": 17639 + }, + { + "epoch": 9.854748603351956, + "grad_norm": 0.41319939494132996, + "learning_rate": 0.0005089915966386555, + "loss": 0.416, + "step": 17640 + }, + { + "epoch": 9.855307262569832, + "grad_norm": 1.6668133735656738, + "learning_rate": 0.0005089635854341737, + "loss": 0.5576, + "step": 17641 + }, + { + "epoch": 9.855865921787709, + "grad_norm": 0.5754125714302063, + "learning_rate": 0.0005089355742296919, + "loss": 0.4303, + "step": 17642 + }, + { + "epoch": 9.856424581005587, + "grad_norm": 0.8854314684867859, + "learning_rate": 0.0005089075630252102, + "loss": 0.4667, + "step": 17643 + }, + { + "epoch": 9.856983240223464, + "grad_norm": 1.496046781539917, + "learning_rate": 0.0005088795518207283, + "loss": 0.4558, + "step": 17644 + }, + { + "epoch": 9.85754189944134, + "grad_norm": 0.5817050933837891, + "learning_rate": 0.0005088515406162465, + "loss": 0.3912, + "step": 17645 + }, + { + "epoch": 9.858100558659217, + "grad_norm": 0.7718605995178223, + "learning_rate": 0.0005088235294117647, + "loss": 0.4674, + "step": 17646 + }, + { + "epoch": 9.858659217877095, + "grad_norm": 0.791430652141571, + "learning_rate": 0.0005087955182072829, + "loss": 0.3752, + "step": 17647 + }, + { + "epoch": 9.859217877094972, + "grad_norm": 0.718021035194397, + "learning_rate": 0.0005087675070028012, + "loss": 0.4314, + "step": 17648 + }, + { + "epoch": 9.859776536312848, + "grad_norm": 0.7523576021194458, + "learning_rate": 0.0005087394957983193, + "loss": 0.3345, + "step": 17649 + }, + { + "epoch": 9.860335195530727, + "grad_norm": 0.6502276062965393, + "learning_rate": 0.0005087114845938375, + "loss": 0.4004, + "step": 17650 + }, + { + "epoch": 9.860893854748603, + "grad_norm": 0.5218349099159241, + "learning_rate": 0.0005086834733893557, + "loss": 0.4006, + "step": 17651 + }, + { + "epoch": 9.86145251396648, + "grad_norm": 14.275579452514648, + "learning_rate": 0.0005086554621848739, + "loss": 0.417, + "step": 17652 + }, + { + "epoch": 9.862011173184358, + "grad_norm": 0.4533391296863556, + "learning_rate": 0.0005086274509803922, + "loss": 0.3713, + "step": 17653 + }, + { + "epoch": 9.862569832402235, + "grad_norm": 0.5260671973228455, + "learning_rate": 0.0005085994397759103, + "loss": 0.3648, + "step": 17654 + }, + { + "epoch": 9.863128491620111, + "grad_norm": 0.684863269329071, + "learning_rate": 0.0005085714285714285, + "loss": 0.3956, + "step": 17655 + }, + { + "epoch": 9.86368715083799, + "grad_norm": 0.43199965357780457, + "learning_rate": 0.0005085434173669467, + "loss": 0.4028, + "step": 17656 + }, + { + "epoch": 9.864245810055866, + "grad_norm": 0.48945263028144836, + "learning_rate": 0.000508515406162465, + "loss": 0.4304, + "step": 17657 + }, + { + "epoch": 9.864804469273743, + "grad_norm": 0.4334677755832672, + "learning_rate": 0.0005084873949579833, + "loss": 0.4946, + "step": 17658 + }, + { + "epoch": 9.86536312849162, + "grad_norm": 0.6097220778465271, + "learning_rate": 0.0005084593837535015, + "loss": 0.4646, + "step": 17659 + }, + { + "epoch": 9.865921787709498, + "grad_norm": 0.5355980396270752, + "learning_rate": 0.0005084313725490196, + "loss": 0.5117, + "step": 17660 + }, + { + "epoch": 9.866480446927374, + "grad_norm": 0.7096639275550842, + "learning_rate": 0.0005084033613445378, + "loss": 0.3179, + "step": 17661 + }, + { + "epoch": 9.867039106145251, + "grad_norm": 1.6744120121002197, + "learning_rate": 0.000508375350140056, + "loss": 0.5419, + "step": 17662 + }, + { + "epoch": 9.86759776536313, + "grad_norm": 0.6097776889801025, + "learning_rate": 0.0005083473389355743, + "loss": 0.4162, + "step": 17663 + }, + { + "epoch": 9.868156424581006, + "grad_norm": 3.0255117416381836, + "learning_rate": 0.0005083193277310925, + "loss": 0.4343, + "step": 17664 + }, + { + "epoch": 9.868715083798882, + "grad_norm": 0.4438476860523224, + "learning_rate": 0.0005082913165266106, + "loss": 0.4919, + "step": 17665 + }, + { + "epoch": 9.869273743016759, + "grad_norm": 0.6422849893569946, + "learning_rate": 0.0005082633053221288, + "loss": 0.5293, + "step": 17666 + }, + { + "epoch": 9.869832402234637, + "grad_norm": 1.6900489330291748, + "learning_rate": 0.000508235294117647, + "loss": 0.397, + "step": 17667 + }, + { + "epoch": 9.870391061452514, + "grad_norm": 0.5288065075874329, + "learning_rate": 0.0005082072829131653, + "loss": 0.413, + "step": 17668 + }, + { + "epoch": 9.87094972067039, + "grad_norm": 0.9630688428878784, + "learning_rate": 0.0005081792717086835, + "loss": 0.3634, + "step": 17669 + }, + { + "epoch": 9.871508379888269, + "grad_norm": 0.4045056700706482, + "learning_rate": 0.0005081512605042016, + "loss": 0.3873, + "step": 17670 + }, + { + "epoch": 9.872067039106145, + "grad_norm": 0.6144238114356995, + "learning_rate": 0.0005081232492997198, + "loss": 0.4214, + "step": 17671 + }, + { + "epoch": 9.872625698324022, + "grad_norm": 0.7358296513557434, + "learning_rate": 0.000508095238095238, + "loss": 0.5928, + "step": 17672 + }, + { + "epoch": 9.8731843575419, + "grad_norm": 0.37702804803848267, + "learning_rate": 0.0005080672268907564, + "loss": 0.3447, + "step": 17673 + }, + { + "epoch": 9.873743016759777, + "grad_norm": 0.3421069085597992, + "learning_rate": 0.0005080392156862746, + "loss": 0.4447, + "step": 17674 + }, + { + "epoch": 9.874301675977653, + "grad_norm": 1.2603915929794312, + "learning_rate": 0.0005080112044817928, + "loss": 0.3705, + "step": 17675 + }, + { + "epoch": 9.87486033519553, + "grad_norm": 0.4427424967288971, + "learning_rate": 0.0005079831932773109, + "loss": 0.3877, + "step": 17676 + }, + { + "epoch": 9.875418994413408, + "grad_norm": 0.4370991885662079, + "learning_rate": 0.0005079551820728291, + "loss": 0.3291, + "step": 17677 + }, + { + "epoch": 9.875977653631285, + "grad_norm": 4.742631435394287, + "learning_rate": 0.0005079271708683474, + "loss": 0.3813, + "step": 17678 + }, + { + "epoch": 9.876536312849161, + "grad_norm": 0.4913809597492218, + "learning_rate": 0.0005078991596638656, + "loss": 0.4501, + "step": 17679 + }, + { + "epoch": 9.87709497206704, + "grad_norm": 0.40640562772750854, + "learning_rate": 0.0005078711484593838, + "loss": 0.48, + "step": 17680 + }, + { + "epoch": 9.877653631284916, + "grad_norm": 0.5571386814117432, + "learning_rate": 0.0005078431372549019, + "loss": 0.5487, + "step": 17681 + }, + { + "epoch": 9.878212290502793, + "grad_norm": 0.6430655121803284, + "learning_rate": 0.0005078151260504201, + "loss": 0.435, + "step": 17682 + }, + { + "epoch": 9.878770949720671, + "grad_norm": 0.8957878351211548, + "learning_rate": 0.0005077871148459384, + "loss": 0.432, + "step": 17683 + }, + { + "epoch": 9.879329608938548, + "grad_norm": 0.3793388605117798, + "learning_rate": 0.0005077591036414566, + "loss": 0.4078, + "step": 17684 + }, + { + "epoch": 9.879888268156424, + "grad_norm": 0.4135552644729614, + "learning_rate": 0.0005077310924369748, + "loss": 0.3916, + "step": 17685 + }, + { + "epoch": 9.880446927374301, + "grad_norm": 0.6391292810440063, + "learning_rate": 0.0005077030812324929, + "loss": 0.4032, + "step": 17686 + }, + { + "epoch": 9.88100558659218, + "grad_norm": 3.5902090072631836, + "learning_rate": 0.0005076750700280111, + "loss": 0.5199, + "step": 17687 + }, + { + "epoch": 9.881564245810056, + "grad_norm": 0.6722155809402466, + "learning_rate": 0.0005076470588235294, + "loss": 0.4596, + "step": 17688 + }, + { + "epoch": 9.882122905027932, + "grad_norm": 0.5453365445137024, + "learning_rate": 0.0005076190476190477, + "loss": 0.3791, + "step": 17689 + }, + { + "epoch": 9.88268156424581, + "grad_norm": 0.49773773550987244, + "learning_rate": 0.0005075910364145659, + "loss": 0.3436, + "step": 17690 + }, + { + "epoch": 9.883240223463687, + "grad_norm": 1.6311397552490234, + "learning_rate": 0.0005075630252100841, + "loss": 0.4237, + "step": 17691 + }, + { + "epoch": 9.883798882681564, + "grad_norm": 0.4438299238681793, + "learning_rate": 0.0005075350140056022, + "loss": 0.3747, + "step": 17692 + }, + { + "epoch": 9.88435754189944, + "grad_norm": 0.45684701204299927, + "learning_rate": 0.0005075070028011205, + "loss": 0.4337, + "step": 17693 + }, + { + "epoch": 9.884916201117319, + "grad_norm": 1.1657993793487549, + "learning_rate": 0.0005074789915966387, + "loss": 0.5087, + "step": 17694 + }, + { + "epoch": 9.885474860335195, + "grad_norm": 0.5270035266876221, + "learning_rate": 0.0005074509803921569, + "loss": 0.499, + "step": 17695 + }, + { + "epoch": 9.886033519553072, + "grad_norm": 0.397850900888443, + "learning_rate": 0.0005074229691876751, + "loss": 0.4688, + "step": 17696 + }, + { + "epoch": 9.88659217877095, + "grad_norm": 0.4842422902584076, + "learning_rate": 0.0005073949579831932, + "loss": 0.456, + "step": 17697 + }, + { + "epoch": 9.887150837988827, + "grad_norm": 0.6004319190979004, + "learning_rate": 0.0005073669467787115, + "loss": 0.3446, + "step": 17698 + }, + { + "epoch": 9.887709497206703, + "grad_norm": 0.41606444120407104, + "learning_rate": 0.0005073389355742297, + "loss": 0.3644, + "step": 17699 + }, + { + "epoch": 9.888268156424582, + "grad_norm": 0.5245066285133362, + "learning_rate": 0.0005073109243697479, + "loss": 0.4748, + "step": 17700 + }, + { + "epoch": 9.888826815642458, + "grad_norm": 0.5773292183876038, + "learning_rate": 0.0005072829131652661, + "loss": 0.4035, + "step": 17701 + }, + { + "epoch": 9.889385474860335, + "grad_norm": 0.5196065902709961, + "learning_rate": 0.0005072549019607842, + "loss": 0.5584, + "step": 17702 + }, + { + "epoch": 9.889944134078211, + "grad_norm": 0.6403607130050659, + "learning_rate": 0.0005072268907563025, + "loss": 0.4928, + "step": 17703 + }, + { + "epoch": 9.89050279329609, + "grad_norm": 1.6563434600830078, + "learning_rate": 0.0005071988795518207, + "loss": 0.3383, + "step": 17704 + }, + { + "epoch": 9.891061452513966, + "grad_norm": 0.5171833038330078, + "learning_rate": 0.000507170868347339, + "loss": 0.3978, + "step": 17705 + }, + { + "epoch": 9.891620111731843, + "grad_norm": 0.5451369881629944, + "learning_rate": 0.0005071428571428572, + "loss": 0.4419, + "step": 17706 + }, + { + "epoch": 9.892178770949721, + "grad_norm": 0.7001411318778992, + "learning_rate": 0.0005071148459383754, + "loss": 0.4389, + "step": 17707 + }, + { + "epoch": 9.892737430167598, + "grad_norm": 0.5099281072616577, + "learning_rate": 0.0005070868347338936, + "loss": 0.4806, + "step": 17708 + }, + { + "epoch": 9.893296089385474, + "grad_norm": 0.9550559520721436, + "learning_rate": 0.0005070588235294118, + "loss": 0.3437, + "step": 17709 + }, + { + "epoch": 9.893854748603353, + "grad_norm": 1.1679702997207642, + "learning_rate": 0.00050703081232493, + "loss": 0.5546, + "step": 17710 + }, + { + "epoch": 9.89441340782123, + "grad_norm": 0.42507076263427734, + "learning_rate": 0.0005070028011204482, + "loss": 0.4687, + "step": 17711 + }, + { + "epoch": 9.894972067039106, + "grad_norm": 0.6230679750442505, + "learning_rate": 0.0005069747899159664, + "loss": 0.4307, + "step": 17712 + }, + { + "epoch": 9.895530726256982, + "grad_norm": 0.46311771869659424, + "learning_rate": 0.0005069467787114846, + "loss": 0.4823, + "step": 17713 + }, + { + "epoch": 9.89608938547486, + "grad_norm": 0.2997567057609558, + "learning_rate": 0.0005069187675070028, + "loss": 0.3435, + "step": 17714 + }, + { + "epoch": 9.896648044692737, + "grad_norm": 0.5212675333023071, + "learning_rate": 0.000506890756302521, + "loss": 0.3867, + "step": 17715 + }, + { + "epoch": 9.897206703910614, + "grad_norm": 1.2878317832946777, + "learning_rate": 0.0005068627450980392, + "loss": 0.4853, + "step": 17716 + }, + { + "epoch": 9.897765363128492, + "grad_norm": 1.5933674573898315, + "learning_rate": 0.0005068347338935574, + "loss": 0.4322, + "step": 17717 + }, + { + "epoch": 9.898324022346369, + "grad_norm": 0.6506137847900391, + "learning_rate": 0.0005068067226890757, + "loss": 0.5069, + "step": 17718 + }, + { + "epoch": 9.898882681564245, + "grad_norm": 0.5195789933204651, + "learning_rate": 0.0005067787114845938, + "loss": 0.3979, + "step": 17719 + }, + { + "epoch": 9.899441340782122, + "grad_norm": 0.635689914226532, + "learning_rate": 0.000506750700280112, + "loss": 0.3611, + "step": 17720 + }, + { + "epoch": 9.9, + "grad_norm": 0.41705521941185, + "learning_rate": 0.0005067226890756302, + "loss": 0.4093, + "step": 17721 + }, + { + "epoch": 9.900558659217877, + "grad_norm": 0.5046812891960144, + "learning_rate": 0.0005066946778711485, + "loss": 0.4071, + "step": 17722 + }, + { + "epoch": 9.901117318435753, + "grad_norm": 0.5913689136505127, + "learning_rate": 0.0005066666666666668, + "loss": 0.5093, + "step": 17723 + }, + { + "epoch": 9.901675977653632, + "grad_norm": 0.5863388180732727, + "learning_rate": 0.0005066386554621849, + "loss": 0.495, + "step": 17724 + }, + { + "epoch": 9.902234636871508, + "grad_norm": 0.4121083617210388, + "learning_rate": 0.0005066106442577031, + "loss": 0.5517, + "step": 17725 + }, + { + "epoch": 9.902793296089385, + "grad_norm": 0.4791109263896942, + "learning_rate": 0.0005065826330532213, + "loss": 0.4433, + "step": 17726 + }, + { + "epoch": 9.903351955307263, + "grad_norm": 0.557059109210968, + "learning_rate": 0.0005065546218487395, + "loss": 0.3982, + "step": 17727 + }, + { + "epoch": 9.90391061452514, + "grad_norm": 0.4848213195800781, + "learning_rate": 0.0005065266106442578, + "loss": 0.5228, + "step": 17728 + }, + { + "epoch": 9.904469273743016, + "grad_norm": 0.439617782831192, + "learning_rate": 0.0005064985994397759, + "loss": 0.3029, + "step": 17729 + }, + { + "epoch": 9.905027932960895, + "grad_norm": 0.6478733420372009, + "learning_rate": 0.0005064705882352941, + "loss": 0.5194, + "step": 17730 + }, + { + "epoch": 9.905586592178771, + "grad_norm": 0.3450714945793152, + "learning_rate": 0.0005064425770308123, + "loss": 0.419, + "step": 17731 + }, + { + "epoch": 9.906145251396648, + "grad_norm": 0.6491840481758118, + "learning_rate": 0.0005064145658263305, + "loss": 0.6409, + "step": 17732 + }, + { + "epoch": 9.906703910614524, + "grad_norm": 0.6130540370941162, + "learning_rate": 0.0005063865546218488, + "loss": 0.3175, + "step": 17733 + }, + { + "epoch": 9.907262569832403, + "grad_norm": 0.4240414798259735, + "learning_rate": 0.000506358543417367, + "loss": 0.3694, + "step": 17734 + }, + { + "epoch": 9.90782122905028, + "grad_norm": 0.44559845328330994, + "learning_rate": 0.0005063305322128851, + "loss": 0.5907, + "step": 17735 + }, + { + "epoch": 9.908379888268156, + "grad_norm": 0.8267097473144531, + "learning_rate": 0.0005063025210084033, + "loss": 0.4164, + "step": 17736 + }, + { + "epoch": 9.908938547486034, + "grad_norm": 0.5043236017227173, + "learning_rate": 0.0005062745098039215, + "loss": 0.4432, + "step": 17737 + }, + { + "epoch": 9.90949720670391, + "grad_norm": 0.6198955178260803, + "learning_rate": 0.0005062464985994399, + "loss": 0.3964, + "step": 17738 + }, + { + "epoch": 9.910055865921787, + "grad_norm": 0.41644683480262756, + "learning_rate": 0.0005062184873949581, + "loss": 0.4098, + "step": 17739 + }, + { + "epoch": 9.910614525139664, + "grad_norm": 0.6375578045845032, + "learning_rate": 0.0005061904761904762, + "loss": 0.3837, + "step": 17740 + }, + { + "epoch": 9.911173184357542, + "grad_norm": 0.6016095876693726, + "learning_rate": 0.0005061624649859944, + "loss": 0.4679, + "step": 17741 + }, + { + "epoch": 9.911731843575419, + "grad_norm": 0.9189779758453369, + "learning_rate": 0.0005061344537815126, + "loss": 0.4224, + "step": 17742 + }, + { + "epoch": 9.912290502793295, + "grad_norm": 0.5817536115646362, + "learning_rate": 0.0005061064425770309, + "loss": 0.4614, + "step": 17743 + }, + { + "epoch": 9.912849162011174, + "grad_norm": 0.528851330280304, + "learning_rate": 0.0005060784313725491, + "loss": 0.479, + "step": 17744 + }, + { + "epoch": 9.91340782122905, + "grad_norm": 0.576828122138977, + "learning_rate": 0.0005060504201680672, + "loss": 0.4654, + "step": 17745 + }, + { + "epoch": 9.913966480446927, + "grad_norm": 0.39457982778549194, + "learning_rate": 0.0005060224089635854, + "loss": 0.4541, + "step": 17746 + }, + { + "epoch": 9.914525139664804, + "grad_norm": 1.1841816902160645, + "learning_rate": 0.0005059943977591036, + "loss": 0.3473, + "step": 17747 + }, + { + "epoch": 9.915083798882682, + "grad_norm": 4.332278728485107, + "learning_rate": 0.0005059663865546219, + "loss": 0.5184, + "step": 17748 + }, + { + "epoch": 9.915642458100558, + "grad_norm": 0.4184720814228058, + "learning_rate": 0.0005059383753501401, + "loss": 0.4291, + "step": 17749 + }, + { + "epoch": 9.916201117318435, + "grad_norm": 0.4247817099094391, + "learning_rate": 0.0005059103641456583, + "loss": 0.5183, + "step": 17750 + }, + { + "epoch": 9.916759776536313, + "grad_norm": 0.6020652651786804, + "learning_rate": 0.0005058823529411764, + "loss": 0.3901, + "step": 17751 + }, + { + "epoch": 9.91731843575419, + "grad_norm": 0.3913191258907318, + "learning_rate": 0.0005058543417366946, + "loss": 0.3606, + "step": 17752 + }, + { + "epoch": 9.917877094972066, + "grad_norm": 0.7320433855056763, + "learning_rate": 0.000505826330532213, + "loss": 0.4539, + "step": 17753 + }, + { + "epoch": 9.918435754189945, + "grad_norm": 0.4420003890991211, + "learning_rate": 0.0005057983193277312, + "loss": 0.4493, + "step": 17754 + }, + { + "epoch": 9.918994413407821, + "grad_norm": 0.504277765750885, + "learning_rate": 0.0005057703081232494, + "loss": 0.3642, + "step": 17755 + }, + { + "epoch": 9.919553072625698, + "grad_norm": 0.5234881043434143, + "learning_rate": 0.0005057422969187675, + "loss": 0.3795, + "step": 17756 + }, + { + "epoch": 9.920111731843576, + "grad_norm": 0.9350456595420837, + "learning_rate": 0.0005057142857142857, + "loss": 0.378, + "step": 17757 + }, + { + "epoch": 9.920670391061453, + "grad_norm": 0.3998241722583771, + "learning_rate": 0.000505686274509804, + "loss": 0.3876, + "step": 17758 + }, + { + "epoch": 9.92122905027933, + "grad_norm": 0.5045170783996582, + "learning_rate": 0.0005056582633053222, + "loss": 0.4123, + "step": 17759 + }, + { + "epoch": 9.921787709497206, + "grad_norm": 2.2669641971588135, + "learning_rate": 0.0005056302521008404, + "loss": 0.3603, + "step": 17760 + }, + { + "epoch": 9.922346368715084, + "grad_norm": 0.5163483619689941, + "learning_rate": 0.0005056022408963585, + "loss": 0.4299, + "step": 17761 + }, + { + "epoch": 9.922905027932961, + "grad_norm": 1.0533607006072998, + "learning_rate": 0.0005055742296918767, + "loss": 0.4692, + "step": 17762 + }, + { + "epoch": 9.923463687150837, + "grad_norm": 0.44947054982185364, + "learning_rate": 0.000505546218487395, + "loss": 0.4444, + "step": 17763 + }, + { + "epoch": 9.924022346368716, + "grad_norm": 0.4417736530303955, + "learning_rate": 0.0005055182072829132, + "loss": 0.3622, + "step": 17764 + }, + { + "epoch": 9.924581005586592, + "grad_norm": 0.6334666609764099, + "learning_rate": 0.0005054901960784314, + "loss": 0.5589, + "step": 17765 + }, + { + "epoch": 9.925139664804469, + "grad_norm": 0.33501386642456055, + "learning_rate": 0.0005054621848739496, + "loss": 0.2728, + "step": 17766 + }, + { + "epoch": 9.925698324022346, + "grad_norm": 0.445417582988739, + "learning_rate": 0.0005054341736694677, + "loss": 0.532, + "step": 17767 + }, + { + "epoch": 9.926256983240224, + "grad_norm": 0.4315336346626282, + "learning_rate": 0.000505406162464986, + "loss": 0.477, + "step": 17768 + }, + { + "epoch": 9.9268156424581, + "grad_norm": 0.543921947479248, + "learning_rate": 0.0005053781512605042, + "loss": 0.3535, + "step": 17769 + }, + { + "epoch": 9.927374301675977, + "grad_norm": 0.3253154158592224, + "learning_rate": 0.0005053501400560224, + "loss": 0.3135, + "step": 17770 + }, + { + "epoch": 9.927932960893855, + "grad_norm": 0.5165511965751648, + "learning_rate": 0.0005053221288515407, + "loss": 0.3982, + "step": 17771 + }, + { + "epoch": 9.928491620111732, + "grad_norm": 0.4735526442527771, + "learning_rate": 0.0005052941176470588, + "loss": 0.4189, + "step": 17772 + }, + { + "epoch": 9.929050279329608, + "grad_norm": 0.3920416235923767, + "learning_rate": 0.0005052661064425771, + "loss": 0.3485, + "step": 17773 + }, + { + "epoch": 9.929608938547487, + "grad_norm": 0.41889169812202454, + "learning_rate": 0.0005052380952380953, + "loss": 0.47, + "step": 17774 + }, + { + "epoch": 9.930167597765363, + "grad_norm": 0.5417645573616028, + "learning_rate": 0.0005052100840336135, + "loss": 0.3252, + "step": 17775 + }, + { + "epoch": 9.93072625698324, + "grad_norm": 1.3262546062469482, + "learning_rate": 0.0005051820728291317, + "loss": 0.3659, + "step": 17776 + }, + { + "epoch": 9.931284916201117, + "grad_norm": 0.5170767307281494, + "learning_rate": 0.0005051540616246498, + "loss": 0.3634, + "step": 17777 + }, + { + "epoch": 9.931843575418995, + "grad_norm": 0.41425251960754395, + "learning_rate": 0.0005051260504201681, + "loss": 0.3723, + "step": 17778 + }, + { + "epoch": 9.932402234636871, + "grad_norm": 0.4712209701538086, + "learning_rate": 0.0005050980392156863, + "loss": 0.4487, + "step": 17779 + }, + { + "epoch": 9.932960893854748, + "grad_norm": 0.43091684579849243, + "learning_rate": 0.0005050700280112045, + "loss": 0.397, + "step": 17780 + }, + { + "epoch": 9.933519553072626, + "grad_norm": 0.6392014026641846, + "learning_rate": 0.0005050420168067227, + "loss": 0.4536, + "step": 17781 + }, + { + "epoch": 9.934078212290503, + "grad_norm": 5.059877872467041, + "learning_rate": 0.0005050140056022409, + "loss": 0.5352, + "step": 17782 + }, + { + "epoch": 9.93463687150838, + "grad_norm": 0.5430557131767273, + "learning_rate": 0.0005049859943977591, + "loss": 0.4014, + "step": 17783 + }, + { + "epoch": 9.935195530726258, + "grad_norm": 0.5238717794418335, + "learning_rate": 0.0005049579831932773, + "loss": 0.4468, + "step": 17784 + }, + { + "epoch": 9.935754189944134, + "grad_norm": 0.6257060170173645, + "learning_rate": 0.0005049299719887955, + "loss": 0.5419, + "step": 17785 + }, + { + "epoch": 9.936312849162011, + "grad_norm": 0.6622463464736938, + "learning_rate": 0.0005049019607843137, + "loss": 0.4419, + "step": 17786 + }, + { + "epoch": 9.936871508379888, + "grad_norm": 0.6698209643363953, + "learning_rate": 0.000504873949579832, + "loss": 0.4104, + "step": 17787 + }, + { + "epoch": 9.937430167597766, + "grad_norm": 0.6647917032241821, + "learning_rate": 0.0005048459383753502, + "loss": 0.4414, + "step": 17788 + }, + { + "epoch": 9.937988826815642, + "grad_norm": 1.059608817100525, + "learning_rate": 0.0005048179271708684, + "loss": 0.3558, + "step": 17789 + }, + { + "epoch": 9.938547486033519, + "grad_norm": 1.3215471506118774, + "learning_rate": 0.0005047899159663866, + "loss": 0.4392, + "step": 17790 + }, + { + "epoch": 9.939106145251397, + "grad_norm": 0.42416998744010925, + "learning_rate": 0.0005047619047619048, + "loss": 0.4676, + "step": 17791 + }, + { + "epoch": 9.939664804469274, + "grad_norm": 0.5323529839515686, + "learning_rate": 0.000504733893557423, + "loss": 0.4977, + "step": 17792 + }, + { + "epoch": 9.94022346368715, + "grad_norm": 0.4561229348182678, + "learning_rate": 0.0005047058823529412, + "loss": 0.3788, + "step": 17793 + }, + { + "epoch": 9.940782122905027, + "grad_norm": 0.550279974937439, + "learning_rate": 0.0005046778711484594, + "loss": 0.4523, + "step": 17794 + }, + { + "epoch": 9.941340782122905, + "grad_norm": 0.47205206751823425, + "learning_rate": 0.0005046498599439776, + "loss": 0.4351, + "step": 17795 + }, + { + "epoch": 9.941899441340782, + "grad_norm": 0.3215457499027252, + "learning_rate": 0.0005046218487394958, + "loss": 0.3374, + "step": 17796 + }, + { + "epoch": 9.942458100558659, + "grad_norm": 0.7078421711921692, + "learning_rate": 0.000504593837535014, + "loss": 0.368, + "step": 17797 + }, + { + "epoch": 9.943016759776537, + "grad_norm": 0.777804434299469, + "learning_rate": 0.0005045658263305323, + "loss": 0.4869, + "step": 17798 + }, + { + "epoch": 9.943575418994413, + "grad_norm": 0.4482170343399048, + "learning_rate": 0.0005045378151260504, + "loss": 0.3924, + "step": 17799 + }, + { + "epoch": 9.94413407821229, + "grad_norm": 0.5563918352127075, + "learning_rate": 0.0005045098039215686, + "loss": 0.4068, + "step": 17800 + }, + { + "epoch": 9.944692737430168, + "grad_norm": 1.5845192670822144, + "learning_rate": 0.0005044817927170868, + "loss": 0.5286, + "step": 17801 + }, + { + "epoch": 9.945251396648045, + "grad_norm": 0.4471222460269928, + "learning_rate": 0.000504453781512605, + "loss": 0.3576, + "step": 17802 + }, + { + "epoch": 9.945810055865921, + "grad_norm": 1.390992283821106, + "learning_rate": 0.0005044257703081234, + "loss": 0.6128, + "step": 17803 + }, + { + "epoch": 9.946368715083798, + "grad_norm": 0.4919310510158539, + "learning_rate": 0.0005043977591036415, + "loss": 0.4478, + "step": 17804 + }, + { + "epoch": 9.946927374301676, + "grad_norm": 0.6390125155448914, + "learning_rate": 0.0005043697478991597, + "loss": 0.5085, + "step": 17805 + }, + { + "epoch": 9.947486033519553, + "grad_norm": 0.7367087006568909, + "learning_rate": 0.0005043417366946779, + "loss": 0.3303, + "step": 17806 + }, + { + "epoch": 9.94804469273743, + "grad_norm": 0.4817686676979065, + "learning_rate": 0.0005043137254901961, + "loss": 0.5774, + "step": 17807 + }, + { + "epoch": 9.948603351955308, + "grad_norm": 0.4301929771900177, + "learning_rate": 0.0005042857142857144, + "loss": 0.4102, + "step": 17808 + }, + { + "epoch": 9.949162011173184, + "grad_norm": 0.44621533155441284, + "learning_rate": 0.0005042577030812325, + "loss": 0.3929, + "step": 17809 + }, + { + "epoch": 9.949720670391061, + "grad_norm": 0.5734425783157349, + "learning_rate": 0.0005042296918767507, + "loss": 0.4805, + "step": 17810 + }, + { + "epoch": 9.95027932960894, + "grad_norm": 0.4563545882701874, + "learning_rate": 0.0005042016806722689, + "loss": 0.4574, + "step": 17811 + }, + { + "epoch": 9.950837988826816, + "grad_norm": 0.6577954888343811, + "learning_rate": 0.0005041736694677871, + "loss": 0.3443, + "step": 17812 + }, + { + "epoch": 9.951396648044692, + "grad_norm": 1.6626604795455933, + "learning_rate": 0.0005041456582633054, + "loss": 0.4839, + "step": 17813 + }, + { + "epoch": 9.951955307262569, + "grad_norm": 0.44711384177207947, + "learning_rate": 0.0005041176470588236, + "loss": 0.4409, + "step": 17814 + }, + { + "epoch": 9.952513966480447, + "grad_norm": 0.5573927760124207, + "learning_rate": 0.0005040896358543417, + "loss": 0.5416, + "step": 17815 + }, + { + "epoch": 9.953072625698324, + "grad_norm": 0.5896942019462585, + "learning_rate": 0.0005040616246498599, + "loss": 0.4672, + "step": 17816 + }, + { + "epoch": 9.9536312849162, + "grad_norm": 0.5028023719787598, + "learning_rate": 0.0005040336134453781, + "loss": 0.3652, + "step": 17817 + }, + { + "epoch": 9.954189944134079, + "grad_norm": 0.5095908045768738, + "learning_rate": 0.0005040056022408964, + "loss": 0.3766, + "step": 17818 + }, + { + "epoch": 9.954748603351955, + "grad_norm": 0.6158854365348816, + "learning_rate": 0.0005039775910364147, + "loss": 0.4199, + "step": 17819 + }, + { + "epoch": 9.955307262569832, + "grad_norm": 0.6902062296867371, + "learning_rate": 0.0005039495798319327, + "loss": 0.4139, + "step": 17820 + }, + { + "epoch": 9.955865921787709, + "grad_norm": 0.5172967910766602, + "learning_rate": 0.000503921568627451, + "loss": 0.5466, + "step": 17821 + }, + { + "epoch": 9.956424581005587, + "grad_norm": 0.618057906627655, + "learning_rate": 0.0005038935574229692, + "loss": 0.502, + "step": 17822 + }, + { + "epoch": 9.956983240223463, + "grad_norm": 5.883916854858398, + "learning_rate": 0.0005038655462184875, + "loss": 0.4512, + "step": 17823 + }, + { + "epoch": 9.95754189944134, + "grad_norm": 0.43965721130371094, + "learning_rate": 0.0005038375350140057, + "loss": 0.3345, + "step": 17824 + }, + { + "epoch": 9.958100558659218, + "grad_norm": 1.9393320083618164, + "learning_rate": 0.0005038095238095238, + "loss": 0.5221, + "step": 17825 + }, + { + "epoch": 9.958659217877095, + "grad_norm": 0.5314610600471497, + "learning_rate": 0.000503781512605042, + "loss": 0.3104, + "step": 17826 + }, + { + "epoch": 9.959217877094972, + "grad_norm": 0.6425861716270447, + "learning_rate": 0.0005037535014005602, + "loss": 0.4868, + "step": 17827 + }, + { + "epoch": 9.95977653631285, + "grad_norm": 0.35783296823501587, + "learning_rate": 0.0005037254901960785, + "loss": 0.3919, + "step": 17828 + }, + { + "epoch": 9.960335195530726, + "grad_norm": 1.5035312175750732, + "learning_rate": 0.0005036974789915967, + "loss": 0.4951, + "step": 17829 + }, + { + "epoch": 9.960893854748603, + "grad_norm": 9.320940017700195, + "learning_rate": 0.0005036694677871149, + "loss": 0.3845, + "step": 17830 + }, + { + "epoch": 9.961452513966481, + "grad_norm": 0.3904717266559601, + "learning_rate": 0.000503641456582633, + "loss": 0.4452, + "step": 17831 + }, + { + "epoch": 9.962011173184358, + "grad_norm": 0.5966362357139587, + "learning_rate": 0.0005036134453781512, + "loss": 0.4985, + "step": 17832 + }, + { + "epoch": 9.962569832402234, + "grad_norm": 0.7172439098358154, + "learning_rate": 0.0005035854341736695, + "loss": 0.4255, + "step": 17833 + }, + { + "epoch": 9.963128491620111, + "grad_norm": 0.5624261498451233, + "learning_rate": 0.0005035574229691877, + "loss": 0.4191, + "step": 17834 + }, + { + "epoch": 9.96368715083799, + "grad_norm": 1.0373311042785645, + "learning_rate": 0.000503529411764706, + "loss": 0.3993, + "step": 17835 + }, + { + "epoch": 9.964245810055866, + "grad_norm": 0.47587740421295166, + "learning_rate": 0.000503501400560224, + "loss": 0.5601, + "step": 17836 + }, + { + "epoch": 9.964804469273743, + "grad_norm": 0.8644841313362122, + "learning_rate": 0.0005034733893557423, + "loss": 0.4302, + "step": 17837 + }, + { + "epoch": 9.96536312849162, + "grad_norm": 0.6737934947013855, + "learning_rate": 0.0005034453781512606, + "loss": 0.4372, + "step": 17838 + }, + { + "epoch": 9.965921787709497, + "grad_norm": 0.8161765933036804, + "learning_rate": 0.0005034173669467788, + "loss": 0.3883, + "step": 17839 + }, + { + "epoch": 9.966480446927374, + "grad_norm": 0.9269985556602478, + "learning_rate": 0.000503389355742297, + "loss": 0.497, + "step": 17840 + }, + { + "epoch": 9.96703910614525, + "grad_norm": 0.40938255190849304, + "learning_rate": 0.0005033613445378151, + "loss": 0.4569, + "step": 17841 + }, + { + "epoch": 9.967597765363129, + "grad_norm": 1.0076881647109985, + "learning_rate": 0.0005033333333333333, + "loss": 0.3871, + "step": 17842 + }, + { + "epoch": 9.968156424581005, + "grad_norm": 0.7609528303146362, + "learning_rate": 0.0005033053221288516, + "loss": 0.4331, + "step": 17843 + }, + { + "epoch": 9.968715083798882, + "grad_norm": 0.44980940222740173, + "learning_rate": 0.0005032773109243698, + "loss": 0.3597, + "step": 17844 + }, + { + "epoch": 9.96927374301676, + "grad_norm": 0.3942873477935791, + "learning_rate": 0.000503249299719888, + "loss": 0.4042, + "step": 17845 + }, + { + "epoch": 9.969832402234637, + "grad_norm": 0.5557286143302917, + "learning_rate": 0.0005032212885154062, + "loss": 0.5173, + "step": 17846 + }, + { + "epoch": 9.970391061452514, + "grad_norm": 0.5568078756332397, + "learning_rate": 0.0005031932773109243, + "loss": 0.3698, + "step": 17847 + }, + { + "epoch": 9.970949720670392, + "grad_norm": 0.3488023281097412, + "learning_rate": 0.0005031652661064426, + "loss": 0.3522, + "step": 17848 + }, + { + "epoch": 9.971508379888268, + "grad_norm": 0.5862781405448914, + "learning_rate": 0.0005031372549019608, + "loss": 0.4666, + "step": 17849 + }, + { + "epoch": 9.972067039106145, + "grad_norm": 0.4708919823169708, + "learning_rate": 0.000503109243697479, + "loss": 0.4491, + "step": 17850 + }, + { + "epoch": 9.972625698324022, + "grad_norm": 0.46804797649383545, + "learning_rate": 0.0005030812324929972, + "loss": 0.3982, + "step": 17851 + }, + { + "epoch": 9.9731843575419, + "grad_norm": 0.48031699657440186, + "learning_rate": 0.0005030532212885153, + "loss": 0.3883, + "step": 17852 + }, + { + "epoch": 9.973743016759776, + "grad_norm": 0.429153174161911, + "learning_rate": 0.0005030252100840335, + "loss": 0.3737, + "step": 17853 + }, + { + "epoch": 9.974301675977653, + "grad_norm": 0.539635956287384, + "learning_rate": 0.0005029971988795519, + "loss": 0.4289, + "step": 17854 + }, + { + "epoch": 9.974860335195531, + "grad_norm": 0.797146201133728, + "learning_rate": 0.0005029691876750701, + "loss": 0.293, + "step": 17855 + }, + { + "epoch": 9.975418994413408, + "grad_norm": 1.0138105154037476, + "learning_rate": 0.0005029411764705883, + "loss": 0.4333, + "step": 17856 + }, + { + "epoch": 9.975977653631285, + "grad_norm": 2.880821704864502, + "learning_rate": 0.0005029131652661064, + "loss": 0.5098, + "step": 17857 + }, + { + "epoch": 9.976536312849163, + "grad_norm": 0.42172062397003174, + "learning_rate": 0.0005028851540616246, + "loss": 0.4094, + "step": 17858 + }, + { + "epoch": 9.97709497206704, + "grad_norm": 0.4181463420391083, + "learning_rate": 0.0005028571428571429, + "loss": 0.3671, + "step": 17859 + }, + { + "epoch": 9.977653631284916, + "grad_norm": 0.4769952893257141, + "learning_rate": 0.0005028291316526611, + "loss": 0.448, + "step": 17860 + }, + { + "epoch": 9.978212290502793, + "grad_norm": 0.47520503401756287, + "learning_rate": 0.0005028011204481793, + "loss": 0.4918, + "step": 17861 + }, + { + "epoch": 9.978770949720671, + "grad_norm": 0.5936439633369446, + "learning_rate": 0.0005027731092436975, + "loss": 0.432, + "step": 17862 + }, + { + "epoch": 9.979329608938547, + "grad_norm": 2.3392996788024902, + "learning_rate": 0.0005027450980392156, + "loss": 0.4427, + "step": 17863 + }, + { + "epoch": 9.979888268156424, + "grad_norm": 0.4882737100124359, + "learning_rate": 0.0005027170868347339, + "loss": 0.4121, + "step": 17864 + }, + { + "epoch": 9.980446927374302, + "grad_norm": 0.35804474353790283, + "learning_rate": 0.0005026890756302521, + "loss": 0.3434, + "step": 17865 + }, + { + "epoch": 9.981005586592179, + "grad_norm": 0.4074746072292328, + "learning_rate": 0.0005026610644257703, + "loss": 0.3969, + "step": 17866 + }, + { + "epoch": 9.981564245810056, + "grad_norm": 0.6635401844978333, + "learning_rate": 0.0005026330532212885, + "loss": 0.482, + "step": 17867 + }, + { + "epoch": 9.982122905027932, + "grad_norm": 0.5572589039802551, + "learning_rate": 0.0005026050420168066, + "loss": 0.5156, + "step": 17868 + }, + { + "epoch": 9.98268156424581, + "grad_norm": 0.8025408983230591, + "learning_rate": 0.000502577030812325, + "loss": 0.4049, + "step": 17869 + }, + { + "epoch": 9.983240223463687, + "grad_norm": 0.8880243897438049, + "learning_rate": 0.0005025490196078432, + "loss": 0.4613, + "step": 17870 + }, + { + "epoch": 9.983798882681564, + "grad_norm": 0.34688395261764526, + "learning_rate": 0.0005025210084033614, + "loss": 0.306, + "step": 17871 + }, + { + "epoch": 9.984357541899442, + "grad_norm": 1.3696792125701904, + "learning_rate": 0.0005024929971988796, + "loss": 0.3963, + "step": 17872 + }, + { + "epoch": 9.984916201117318, + "grad_norm": 0.5034725069999695, + "learning_rate": 0.0005024649859943977, + "loss": 0.3283, + "step": 17873 + }, + { + "epoch": 9.985474860335195, + "grad_norm": 0.44516313076019287, + "learning_rate": 0.000502436974789916, + "loss": 0.4339, + "step": 17874 + }, + { + "epoch": 9.986033519553073, + "grad_norm": 0.5296297669410706, + "learning_rate": 0.0005024089635854342, + "loss": 0.4944, + "step": 17875 + }, + { + "epoch": 9.98659217877095, + "grad_norm": 0.5013049840927124, + "learning_rate": 0.0005023809523809524, + "loss": 0.3323, + "step": 17876 + }, + { + "epoch": 9.987150837988827, + "grad_norm": 0.5601192712783813, + "learning_rate": 0.0005023529411764706, + "loss": 0.3613, + "step": 17877 + }, + { + "epoch": 9.987709497206703, + "grad_norm": 0.5455748438835144, + "learning_rate": 0.0005023249299719888, + "loss": 0.4636, + "step": 17878 + }, + { + "epoch": 9.988268156424581, + "grad_norm": 0.42147040367126465, + "learning_rate": 0.000502296918767507, + "loss": 0.4245, + "step": 17879 + }, + { + "epoch": 9.988826815642458, + "grad_norm": 3.4742698669433594, + "learning_rate": 0.0005022689075630252, + "loss": 0.4802, + "step": 17880 + }, + { + "epoch": 9.989385474860335, + "grad_norm": 2.0615384578704834, + "learning_rate": 0.0005022408963585434, + "loss": 0.4587, + "step": 17881 + }, + { + "epoch": 9.989944134078213, + "grad_norm": 0.7011036276817322, + "learning_rate": 0.0005022128851540616, + "loss": 0.5258, + "step": 17882 + }, + { + "epoch": 9.99050279329609, + "grad_norm": 0.6774871349334717, + "learning_rate": 0.0005021848739495798, + "loss": 0.4676, + "step": 17883 + }, + { + "epoch": 9.991061452513966, + "grad_norm": 0.48109954595565796, + "learning_rate": 0.000502156862745098, + "loss": 0.4101, + "step": 17884 + }, + { + "epoch": 9.991620111731844, + "grad_norm": 0.5987567901611328, + "learning_rate": 0.0005021288515406162, + "loss": 0.3873, + "step": 17885 + }, + { + "epoch": 9.992178770949721, + "grad_norm": 0.5523086786270142, + "learning_rate": 0.0005021008403361345, + "loss": 0.4494, + "step": 17886 + }, + { + "epoch": 9.992737430167598, + "grad_norm": 0.44415542483329773, + "learning_rate": 0.0005020728291316527, + "loss": 0.5245, + "step": 17887 + }, + { + "epoch": 9.993296089385474, + "grad_norm": 0.5693686008453369, + "learning_rate": 0.0005020448179271709, + "loss": 0.4991, + "step": 17888 + }, + { + "epoch": 9.993854748603352, + "grad_norm": 1.1321344375610352, + "learning_rate": 0.0005020168067226891, + "loss": 0.3953, + "step": 17889 + }, + { + "epoch": 9.994413407821229, + "grad_norm": 0.4679591953754425, + "learning_rate": 0.0005019887955182073, + "loss": 0.4459, + "step": 17890 + }, + { + "epoch": 9.994972067039106, + "grad_norm": 1.1107648611068726, + "learning_rate": 0.0005019607843137255, + "loss": 0.4412, + "step": 17891 + }, + { + "epoch": 9.995530726256984, + "grad_norm": 0.7567646503448486, + "learning_rate": 0.0005019327731092437, + "loss": 0.4893, + "step": 17892 + }, + { + "epoch": 9.99608938547486, + "grad_norm": 0.42347589135169983, + "learning_rate": 0.0005019047619047619, + "loss": 0.4143, + "step": 17893 + }, + { + "epoch": 9.996648044692737, + "grad_norm": 0.5419673323631287, + "learning_rate": 0.0005018767507002802, + "loss": 0.4728, + "step": 17894 + }, + { + "epoch": 9.997206703910614, + "grad_norm": 0.40730077028274536, + "learning_rate": 0.0005018487394957983, + "loss": 0.4279, + "step": 17895 + }, + { + "epoch": 9.997765363128492, + "grad_norm": 0.6806232333183289, + "learning_rate": 0.0005018207282913165, + "loss": 0.4555, + "step": 17896 + }, + { + "epoch": 9.998324022346369, + "grad_norm": 0.440498948097229, + "learning_rate": 0.0005017927170868347, + "loss": 0.4593, + "step": 17897 + }, + { + "epoch": 9.998882681564245, + "grad_norm": 0.4612373411655426, + "learning_rate": 0.0005017647058823529, + "loss": 0.3831, + "step": 17898 + }, + { + "epoch": 9.999441340782123, + "grad_norm": 0.5210971236228943, + "learning_rate": 0.0005017366946778712, + "loss": 0.3804, + "step": 17899 + }, + { + "epoch": 10.0, + "grad_norm": 0.4399031400680542, + "learning_rate": 0.0005017086834733893, + "loss": 0.4062, + "step": 17900 + }, + { + "epoch": 10.000558659217877, + "grad_norm": 0.345892071723938, + "learning_rate": 0.0005016806722689075, + "loss": 0.3255, + "step": 17901 + }, + { + "epoch": 10.001117318435755, + "grad_norm": 0.49039262533187866, + "learning_rate": 0.0005016526610644257, + "loss": 0.3856, + "step": 17902 + }, + { + "epoch": 10.001675977653631, + "grad_norm": 0.4026343822479248, + "learning_rate": 0.000501624649859944, + "loss": 0.3857, + "step": 17903 + }, + { + "epoch": 10.002234636871508, + "grad_norm": 1.6864407062530518, + "learning_rate": 0.0005015966386554623, + "loss": 0.463, + "step": 17904 + }, + { + "epoch": 10.002793296089385, + "grad_norm": 0.46433213353157043, + "learning_rate": 0.0005015686274509804, + "loss": 0.4358, + "step": 17905 + }, + { + "epoch": 10.003351955307263, + "grad_norm": 0.4463176131248474, + "learning_rate": 0.0005015406162464986, + "loss": 0.4161, + "step": 17906 + }, + { + "epoch": 10.00391061452514, + "grad_norm": 0.8651337623596191, + "learning_rate": 0.0005015126050420168, + "loss": 0.5041, + "step": 17907 + }, + { + "epoch": 10.004469273743016, + "grad_norm": 0.5406264066696167, + "learning_rate": 0.000501484593837535, + "loss": 0.4205, + "step": 17908 + }, + { + "epoch": 10.005027932960894, + "grad_norm": 0.593603253364563, + "learning_rate": 0.0005014565826330533, + "loss": 0.4034, + "step": 17909 + }, + { + "epoch": 10.005586592178771, + "grad_norm": 0.5999714136123657, + "learning_rate": 0.0005014285714285715, + "loss": 0.4496, + "step": 17910 + }, + { + "epoch": 10.006145251396648, + "grad_norm": 0.6581023335456848, + "learning_rate": 0.0005014005602240896, + "loss": 0.4811, + "step": 17911 + }, + { + "epoch": 10.006703910614526, + "grad_norm": 0.7229752540588379, + "learning_rate": 0.0005013725490196078, + "loss": 0.5404, + "step": 17912 + }, + { + "epoch": 10.007262569832402, + "grad_norm": 0.414296954870224, + "learning_rate": 0.000501344537815126, + "loss": 0.4058, + "step": 17913 + }, + { + "epoch": 10.007821229050279, + "grad_norm": 0.5042400360107422, + "learning_rate": 0.0005013165266106443, + "loss": 0.4155, + "step": 17914 + }, + { + "epoch": 10.008379888268156, + "grad_norm": 1.791998028755188, + "learning_rate": 0.0005012885154061625, + "loss": 0.3966, + "step": 17915 + }, + { + "epoch": 10.008938547486034, + "grad_norm": 0.42034295201301575, + "learning_rate": 0.0005012605042016806, + "loss": 0.4131, + "step": 17916 + }, + { + "epoch": 10.00949720670391, + "grad_norm": 0.6980493068695068, + "learning_rate": 0.0005012324929971988, + "loss": 0.4552, + "step": 17917 + }, + { + "epoch": 10.010055865921787, + "grad_norm": 0.4247693419456482, + "learning_rate": 0.000501204481792717, + "loss": 0.3665, + "step": 17918 + }, + { + "epoch": 10.010614525139665, + "grad_norm": 0.8249256014823914, + "learning_rate": 0.0005011764705882354, + "loss": 0.426, + "step": 17919 + }, + { + "epoch": 10.011173184357542, + "grad_norm": 0.44600003957748413, + "learning_rate": 0.0005011484593837536, + "loss": 0.4388, + "step": 17920 + }, + { + "epoch": 10.011731843575419, + "grad_norm": 0.5670394897460938, + "learning_rate": 0.0005011204481792717, + "loss": 0.4706, + "step": 17921 + }, + { + "epoch": 10.012290502793297, + "grad_norm": 0.47556865215301514, + "learning_rate": 0.0005010924369747899, + "loss": 0.4874, + "step": 17922 + }, + { + "epoch": 10.012849162011173, + "grad_norm": 1.188564419746399, + "learning_rate": 0.0005010644257703081, + "loss": 0.4724, + "step": 17923 + }, + { + "epoch": 10.01340782122905, + "grad_norm": 0.4579189121723175, + "learning_rate": 0.0005010364145658264, + "loss": 0.3863, + "step": 17924 + }, + { + "epoch": 10.013966480446927, + "grad_norm": 0.500243604183197, + "learning_rate": 0.0005010084033613446, + "loss": 0.3734, + "step": 17925 + }, + { + "epoch": 10.014525139664805, + "grad_norm": 0.5416630506515503, + "learning_rate": 0.0005009803921568628, + "loss": 0.4144, + "step": 17926 + }, + { + "epoch": 10.015083798882682, + "grad_norm": 0.4887044429779053, + "learning_rate": 0.0005009523809523809, + "loss": 0.484, + "step": 17927 + }, + { + "epoch": 10.015642458100558, + "grad_norm": 16.893583297729492, + "learning_rate": 0.0005009243697478991, + "loss": 0.3519, + "step": 17928 + }, + { + "epoch": 10.016201117318436, + "grad_norm": 0.46560680866241455, + "learning_rate": 0.0005008963585434174, + "loss": 0.4451, + "step": 17929 + }, + { + "epoch": 10.016759776536313, + "grad_norm": 1.105960726737976, + "learning_rate": 0.0005008683473389356, + "loss": 0.4774, + "step": 17930 + }, + { + "epoch": 10.01731843575419, + "grad_norm": 1.2146358489990234, + "learning_rate": 0.0005008403361344538, + "loss": 0.3292, + "step": 17931 + }, + { + "epoch": 10.017877094972068, + "grad_norm": 0.44622984528541565, + "learning_rate": 0.0005008123249299719, + "loss": 0.4929, + "step": 17932 + }, + { + "epoch": 10.018435754189944, + "grad_norm": 0.5632505416870117, + "learning_rate": 0.0005007843137254901, + "loss": 0.3551, + "step": 17933 + }, + { + "epoch": 10.018994413407821, + "grad_norm": 0.43393099308013916, + "learning_rate": 0.0005007563025210084, + "loss": 0.5032, + "step": 17934 + }, + { + "epoch": 10.019553072625698, + "grad_norm": 1.2526994943618774, + "learning_rate": 0.0005007282913165267, + "loss": 0.4211, + "step": 17935 + }, + { + "epoch": 10.020111731843576, + "grad_norm": 0.4087871313095093, + "learning_rate": 0.0005007002801120449, + "loss": 0.3924, + "step": 17936 + }, + { + "epoch": 10.020670391061453, + "grad_norm": 0.5228903889656067, + "learning_rate": 0.000500672268907563, + "loss": 0.409, + "step": 17937 + }, + { + "epoch": 10.021229050279329, + "grad_norm": 1.160319209098816, + "learning_rate": 0.0005006442577030812, + "loss": 0.3508, + "step": 17938 + }, + { + "epoch": 10.021787709497207, + "grad_norm": 0.39838096499443054, + "learning_rate": 0.0005006162464985995, + "loss": 0.4804, + "step": 17939 + }, + { + "epoch": 10.022346368715084, + "grad_norm": 0.40692901611328125, + "learning_rate": 0.0005005882352941177, + "loss": 0.346, + "step": 17940 + }, + { + "epoch": 10.02290502793296, + "grad_norm": 0.7101580500602722, + "learning_rate": 0.0005005602240896359, + "loss": 0.5936, + "step": 17941 + }, + { + "epoch": 10.023463687150837, + "grad_norm": 0.4281863272190094, + "learning_rate": 0.0005005322128851541, + "loss": 0.3857, + "step": 17942 + }, + { + "epoch": 10.024022346368715, + "grad_norm": 0.3620590269565582, + "learning_rate": 0.0005005042016806722, + "loss": 0.3799, + "step": 17943 + }, + { + "epoch": 10.024581005586592, + "grad_norm": 0.3812369704246521, + "learning_rate": 0.0005004761904761905, + "loss": 0.3059, + "step": 17944 + }, + { + "epoch": 10.025139664804469, + "grad_norm": 0.4740908741950989, + "learning_rate": 0.0005004481792717087, + "loss": 0.3452, + "step": 17945 + }, + { + "epoch": 10.025698324022347, + "grad_norm": 0.6070629954338074, + "learning_rate": 0.0005004201680672269, + "loss": 0.4658, + "step": 17946 + }, + { + "epoch": 10.026256983240224, + "grad_norm": 0.4376888871192932, + "learning_rate": 0.0005003921568627451, + "loss": 0.4862, + "step": 17947 + }, + { + "epoch": 10.0268156424581, + "grad_norm": 0.5635068416595459, + "learning_rate": 0.0005003641456582632, + "loss": 0.4731, + "step": 17948 + }, + { + "epoch": 10.027374301675978, + "grad_norm": 0.6688268184661865, + "learning_rate": 0.0005003361344537815, + "loss": 0.4297, + "step": 17949 + }, + { + "epoch": 10.027932960893855, + "grad_norm": 0.3915763199329376, + "learning_rate": 0.0005003081232492997, + "loss": 0.2974, + "step": 17950 + }, + { + "epoch": 10.028491620111732, + "grad_norm": 0.731935441493988, + "learning_rate": 0.000500280112044818, + "loss": 0.6455, + "step": 17951 + }, + { + "epoch": 10.029050279329608, + "grad_norm": 0.5909689664840698, + "learning_rate": 0.0005002521008403362, + "loss": 0.4373, + "step": 17952 + }, + { + "epoch": 10.029608938547486, + "grad_norm": 0.6215769052505493, + "learning_rate": 0.0005002240896358543, + "loss": 0.3597, + "step": 17953 + }, + { + "epoch": 10.030167597765363, + "grad_norm": 0.739921510219574, + "learning_rate": 0.0005001960784313726, + "loss": 0.4298, + "step": 17954 + }, + { + "epoch": 10.03072625698324, + "grad_norm": 2.083953619003296, + "learning_rate": 0.0005001680672268908, + "loss": 0.3057, + "step": 17955 + }, + { + "epoch": 10.031284916201118, + "grad_norm": 0.5171045660972595, + "learning_rate": 0.000500140056022409, + "loss": 0.4428, + "step": 17956 + }, + { + "epoch": 10.031843575418995, + "grad_norm": 1.0370972156524658, + "learning_rate": 0.0005001120448179272, + "loss": 0.5112, + "step": 17957 + }, + { + "epoch": 10.032402234636871, + "grad_norm": 3.2398455142974854, + "learning_rate": 0.0005000840336134454, + "loss": 0.3883, + "step": 17958 + }, + { + "epoch": 10.03296089385475, + "grad_norm": 0.5504884719848633, + "learning_rate": 0.0005000560224089636, + "loss": 0.4238, + "step": 17959 + }, + { + "epoch": 10.033519553072626, + "grad_norm": 0.6857721209526062, + "learning_rate": 0.0005000280112044818, + "loss": 0.4023, + "step": 17960 + }, + { + "epoch": 10.034078212290503, + "grad_norm": 0.4622032940387726, + "learning_rate": 0.0005, + "loss": 0.4926, + "step": 17961 + }, + { + "epoch": 10.03463687150838, + "grad_norm": 0.6019774675369263, + "learning_rate": 0.0004999719887955182, + "loss": 0.4645, + "step": 17962 + }, + { + "epoch": 10.035195530726257, + "grad_norm": 0.4611119031906128, + "learning_rate": 0.0004999439775910364, + "loss": 0.5331, + "step": 17963 + }, + { + "epoch": 10.035754189944134, + "grad_norm": 0.5036234855651855, + "learning_rate": 0.0004999159663865546, + "loss": 0.3573, + "step": 17964 + }, + { + "epoch": 10.03631284916201, + "grad_norm": 0.5459470748901367, + "learning_rate": 0.0004998879551820728, + "loss": 0.4795, + "step": 17965 + }, + { + "epoch": 10.036871508379889, + "grad_norm": 1.5565532445907593, + "learning_rate": 0.000499859943977591, + "loss": 0.5216, + "step": 17966 + }, + { + "epoch": 10.037430167597766, + "grad_norm": 0.38636332750320435, + "learning_rate": 0.0004998319327731092, + "loss": 0.3909, + "step": 17967 + }, + { + "epoch": 10.037988826815642, + "grad_norm": 0.3940342962741852, + "learning_rate": 0.0004998039215686275, + "loss": 0.3195, + "step": 17968 + }, + { + "epoch": 10.03854748603352, + "grad_norm": 0.7279254794120789, + "learning_rate": 0.0004997759103641457, + "loss": 0.4641, + "step": 17969 + }, + { + "epoch": 10.039106145251397, + "grad_norm": 0.40888601541519165, + "learning_rate": 0.0004997478991596639, + "loss": 0.5189, + "step": 17970 + }, + { + "epoch": 10.039664804469274, + "grad_norm": 0.47585687041282654, + "learning_rate": 0.0004997198879551821, + "loss": 0.4612, + "step": 17971 + }, + { + "epoch": 10.04022346368715, + "grad_norm": 0.9656063914299011, + "learning_rate": 0.0004996918767507003, + "loss": 0.4296, + "step": 17972 + }, + { + "epoch": 10.040782122905028, + "grad_norm": 0.397722989320755, + "learning_rate": 0.0004996638655462185, + "loss": 0.3917, + "step": 17973 + }, + { + "epoch": 10.041340782122905, + "grad_norm": 0.8667500019073486, + "learning_rate": 0.0004996358543417367, + "loss": 0.4959, + "step": 17974 + }, + { + "epoch": 10.041899441340782, + "grad_norm": 0.4051498472690582, + "learning_rate": 0.0004996078431372549, + "loss": 0.4592, + "step": 17975 + }, + { + "epoch": 10.04245810055866, + "grad_norm": 1.0580335855484009, + "learning_rate": 0.0004995798319327731, + "loss": 0.3187, + "step": 17976 + }, + { + "epoch": 10.043016759776537, + "grad_norm": 0.48696503043174744, + "learning_rate": 0.0004995518207282913, + "loss": 0.4534, + "step": 17977 + }, + { + "epoch": 10.043575418994413, + "grad_norm": 8.047385215759277, + "learning_rate": 0.0004995238095238095, + "loss": 0.3802, + "step": 17978 + }, + { + "epoch": 10.04413407821229, + "grad_norm": 1.4092398881912231, + "learning_rate": 0.0004994957983193277, + "loss": 0.4651, + "step": 17979 + }, + { + "epoch": 10.044692737430168, + "grad_norm": 0.4118942320346832, + "learning_rate": 0.0004994677871148459, + "loss": 0.4148, + "step": 17980 + }, + { + "epoch": 10.045251396648045, + "grad_norm": 0.46698030829429626, + "learning_rate": 0.0004994397759103641, + "loss": 0.4624, + "step": 17981 + }, + { + "epoch": 10.045810055865921, + "grad_norm": 0.6111294627189636, + "learning_rate": 0.0004994117647058823, + "loss": 0.3908, + "step": 17982 + }, + { + "epoch": 10.0463687150838, + "grad_norm": 0.5152270197868347, + "learning_rate": 0.0004993837535014005, + "loss": 0.3396, + "step": 17983 + }, + { + "epoch": 10.046927374301676, + "grad_norm": 4.6959452629089355, + "learning_rate": 0.0004993557422969187, + "loss": 0.4675, + "step": 17984 + }, + { + "epoch": 10.047486033519553, + "grad_norm": 0.8851702213287354, + "learning_rate": 0.0004993277310924371, + "loss": 0.4131, + "step": 17985 + }, + { + "epoch": 10.048044692737431, + "grad_norm": 0.4822857081890106, + "learning_rate": 0.0004992997198879552, + "loss": 0.3688, + "step": 17986 + }, + { + "epoch": 10.048603351955308, + "grad_norm": 1.4531407356262207, + "learning_rate": 0.0004992717086834734, + "loss": 0.4738, + "step": 17987 + }, + { + "epoch": 10.049162011173184, + "grad_norm": 0.46136125922203064, + "learning_rate": 0.0004992436974789916, + "loss": 0.5313, + "step": 17988 + }, + { + "epoch": 10.04972067039106, + "grad_norm": 1.3123393058776855, + "learning_rate": 0.0004992156862745098, + "loss": 0.5028, + "step": 17989 + }, + { + "epoch": 10.050279329608939, + "grad_norm": 0.5183739066123962, + "learning_rate": 0.0004991876750700281, + "loss": 0.3755, + "step": 17990 + }, + { + "epoch": 10.050837988826816, + "grad_norm": 0.8639532327651978, + "learning_rate": 0.0004991596638655462, + "loss": 0.3771, + "step": 17991 + }, + { + "epoch": 10.051396648044692, + "grad_norm": 0.4637674391269684, + "learning_rate": 0.0004991316526610644, + "loss": 0.3649, + "step": 17992 + }, + { + "epoch": 10.05195530726257, + "grad_norm": 0.7903505563735962, + "learning_rate": 0.0004991036414565827, + "loss": 0.4078, + "step": 17993 + }, + { + "epoch": 10.052513966480447, + "grad_norm": 0.49363213777542114, + "learning_rate": 0.0004990756302521008, + "loss": 0.5178, + "step": 17994 + }, + { + "epoch": 10.053072625698324, + "grad_norm": 5.578216075897217, + "learning_rate": 0.0004990476190476191, + "loss": 0.4068, + "step": 17995 + }, + { + "epoch": 10.053631284916202, + "grad_norm": 0.7987362146377563, + "learning_rate": 0.0004990196078431372, + "loss": 0.6, + "step": 17996 + }, + { + "epoch": 10.054189944134079, + "grad_norm": 0.4944694936275482, + "learning_rate": 0.0004989915966386554, + "loss": 0.5439, + "step": 17997 + }, + { + "epoch": 10.054748603351955, + "grad_norm": 0.5824446678161621, + "learning_rate": 0.0004989635854341737, + "loss": 0.5314, + "step": 17998 + }, + { + "epoch": 10.055307262569832, + "grad_norm": 0.6227288842201233, + "learning_rate": 0.0004989355742296918, + "loss": 0.4775, + "step": 17999 + }, + { + "epoch": 10.05586592178771, + "grad_norm": 0.5089334845542908, + "learning_rate": 0.0004989075630252102, + "loss": 0.3765, + "step": 18000 + }, + { + "epoch": 10.05586592178771, + "eval_cer": 0.08866049120050626, + "eval_loss": 0.3333585262298584, + "eval_runtime": 55.6479, + "eval_samples_per_second": 81.548, + "eval_steps_per_second": 5.104, + "eval_wer": 0.3523644590933214, + "step": 18000 + }, + { + "epoch": 10.056424581005587, + "grad_norm": 0.3654351830482483, + "learning_rate": 0.0004988795518207284, + "loss": 0.2883, + "step": 18001 + }, + { + "epoch": 10.056983240223463, + "grad_norm": 0.4010741114616394, + "learning_rate": 0.0004988515406162465, + "loss": 0.367, + "step": 18002 + }, + { + "epoch": 10.057541899441341, + "grad_norm": 0.7492659687995911, + "learning_rate": 0.0004988235294117648, + "loss": 0.5268, + "step": 18003 + }, + { + "epoch": 10.058100558659218, + "grad_norm": 3.2686333656311035, + "learning_rate": 0.0004987955182072829, + "loss": 0.4709, + "step": 18004 + }, + { + "epoch": 10.058659217877095, + "grad_norm": 0.394586980342865, + "learning_rate": 0.0004987675070028012, + "loss": 0.4133, + "step": 18005 + }, + { + "epoch": 10.059217877094973, + "grad_norm": 0.6349456310272217, + "learning_rate": 0.0004987394957983194, + "loss": 0.4667, + "step": 18006 + }, + { + "epoch": 10.05977653631285, + "grad_norm": 0.3994496464729309, + "learning_rate": 0.0004987114845938375, + "loss": 0.4202, + "step": 18007 + }, + { + "epoch": 10.060335195530726, + "grad_norm": 0.43270522356033325, + "learning_rate": 0.0004986834733893558, + "loss": 0.3797, + "step": 18008 + }, + { + "epoch": 10.060893854748603, + "grad_norm": 0.7027539014816284, + "learning_rate": 0.000498655462184874, + "loss": 0.3529, + "step": 18009 + }, + { + "epoch": 10.061452513966481, + "grad_norm": 0.5405945777893066, + "learning_rate": 0.0004986274509803922, + "loss": 0.4125, + "step": 18010 + }, + { + "epoch": 10.062011173184358, + "grad_norm": 0.4811776578426361, + "learning_rate": 0.0004985994397759104, + "loss": 0.4808, + "step": 18011 + }, + { + "epoch": 10.062569832402234, + "grad_norm": 0.407446026802063, + "learning_rate": 0.0004985714285714285, + "loss": 0.3681, + "step": 18012 + }, + { + "epoch": 10.063128491620112, + "grad_norm": 0.3555853068828583, + "learning_rate": 0.0004985434173669468, + "loss": 0.3904, + "step": 18013 + }, + { + "epoch": 10.063687150837989, + "grad_norm": 0.66319739818573, + "learning_rate": 0.000498515406162465, + "loss": 0.65, + "step": 18014 + }, + { + "epoch": 10.064245810055866, + "grad_norm": 0.46124690771102905, + "learning_rate": 0.0004984873949579832, + "loss": 0.3195, + "step": 18015 + }, + { + "epoch": 10.064804469273742, + "grad_norm": 0.4796826243400574, + "learning_rate": 0.0004984593837535014, + "loss": 0.4868, + "step": 18016 + }, + { + "epoch": 10.06536312849162, + "grad_norm": 0.34285181760787964, + "learning_rate": 0.0004984313725490197, + "loss": 0.3275, + "step": 18017 + }, + { + "epoch": 10.065921787709497, + "grad_norm": 0.45430615544319153, + "learning_rate": 0.0004984033613445379, + "loss": 0.3633, + "step": 18018 + }, + { + "epoch": 10.066480446927374, + "grad_norm": 0.46112847328186035, + "learning_rate": 0.0004983753501400561, + "loss": 0.6426, + "step": 18019 + }, + { + "epoch": 10.067039106145252, + "grad_norm": 0.6184660196304321, + "learning_rate": 0.0004983473389355742, + "loss": 0.5031, + "step": 18020 + }, + { + "epoch": 10.067597765363129, + "grad_norm": 0.7644140720367432, + "learning_rate": 0.0004983193277310925, + "loss": 0.4417, + "step": 18021 + }, + { + "epoch": 10.068156424581005, + "grad_norm": 0.6032038331031799, + "learning_rate": 0.0004982913165266107, + "loss": 0.4618, + "step": 18022 + }, + { + "epoch": 10.068715083798883, + "grad_norm": 0.6615059971809387, + "learning_rate": 0.0004982633053221289, + "loss": 0.3599, + "step": 18023 + }, + { + "epoch": 10.06927374301676, + "grad_norm": 0.6413350701332092, + "learning_rate": 0.0004982352941176471, + "loss": 0.3698, + "step": 18024 + }, + { + "epoch": 10.069832402234637, + "grad_norm": 1.8722656965255737, + "learning_rate": 0.0004982072829131653, + "loss": 0.4012, + "step": 18025 + }, + { + "epoch": 10.070391061452513, + "grad_norm": 1.1173092126846313, + "learning_rate": 0.0004981792717086835, + "loss": 0.4373, + "step": 18026 + }, + { + "epoch": 10.070949720670392, + "grad_norm": 0.45847341418266296, + "learning_rate": 0.0004981512605042017, + "loss": 0.3597, + "step": 18027 + }, + { + "epoch": 10.071508379888268, + "grad_norm": 0.4837469160556793, + "learning_rate": 0.0004981232492997199, + "loss": 0.3547, + "step": 18028 + }, + { + "epoch": 10.072067039106145, + "grad_norm": 0.4144695997238159, + "learning_rate": 0.0004980952380952381, + "loss": 0.3277, + "step": 18029 + }, + { + "epoch": 10.072625698324023, + "grad_norm": 0.5408447980880737, + "learning_rate": 0.0004980672268907563, + "loss": 0.4625, + "step": 18030 + }, + { + "epoch": 10.0731843575419, + "grad_norm": 0.5198730230331421, + "learning_rate": 0.0004980392156862745, + "loss": 0.4248, + "step": 18031 + }, + { + "epoch": 10.073743016759776, + "grad_norm": 0.993948221206665, + "learning_rate": 0.0004980112044817927, + "loss": 0.5049, + "step": 18032 + }, + { + "epoch": 10.074301675977654, + "grad_norm": 0.35296565294265747, + "learning_rate": 0.000497983193277311, + "loss": 0.3798, + "step": 18033 + }, + { + "epoch": 10.074860335195531, + "grad_norm": 1.2401721477508545, + "learning_rate": 0.0004979551820728292, + "loss": 0.376, + "step": 18034 + }, + { + "epoch": 10.075418994413408, + "grad_norm": 0.432939738035202, + "learning_rate": 0.0004979271708683474, + "loss": 0.5478, + "step": 18035 + }, + { + "epoch": 10.075977653631284, + "grad_norm": 0.5400649309158325, + "learning_rate": 0.0004978991596638656, + "loss": 0.3319, + "step": 18036 + }, + { + "epoch": 10.076536312849163, + "grad_norm": 1.9363116025924683, + "learning_rate": 0.0004978711484593838, + "loss": 0.5767, + "step": 18037 + }, + { + "epoch": 10.077094972067039, + "grad_norm": 0.5125581622123718, + "learning_rate": 0.000497843137254902, + "loss": 0.3716, + "step": 18038 + }, + { + "epoch": 10.077653631284916, + "grad_norm": 16.39263153076172, + "learning_rate": 0.0004978151260504202, + "loss": 0.4797, + "step": 18039 + }, + { + "epoch": 10.078212290502794, + "grad_norm": 0.44551587104797363, + "learning_rate": 0.0004977871148459384, + "loss": 0.4268, + "step": 18040 + }, + { + "epoch": 10.07877094972067, + "grad_norm": 0.6325133442878723, + "learning_rate": 0.0004977591036414566, + "loss": 0.4095, + "step": 18041 + }, + { + "epoch": 10.079329608938547, + "grad_norm": 0.44222506880760193, + "learning_rate": 0.0004977310924369748, + "loss": 0.4128, + "step": 18042 + }, + { + "epoch": 10.079888268156424, + "grad_norm": 0.41075995564460754, + "learning_rate": 0.000497703081232493, + "loss": 0.4313, + "step": 18043 + }, + { + "epoch": 10.080446927374302, + "grad_norm": 0.43985557556152344, + "learning_rate": 0.0004976750700280112, + "loss": 0.395, + "step": 18044 + }, + { + "epoch": 10.081005586592179, + "grad_norm": 0.48438480496406555, + "learning_rate": 0.0004976470588235294, + "loss": 0.4277, + "step": 18045 + }, + { + "epoch": 10.081564245810055, + "grad_norm": 0.3173558712005615, + "learning_rate": 0.0004976190476190476, + "loss": 0.4226, + "step": 18046 + }, + { + "epoch": 10.082122905027934, + "grad_norm": 0.9954471588134766, + "learning_rate": 0.0004975910364145658, + "loss": 0.3688, + "step": 18047 + }, + { + "epoch": 10.08268156424581, + "grad_norm": 0.5256673097610474, + "learning_rate": 0.000497563025210084, + "loss": 0.4816, + "step": 18048 + }, + { + "epoch": 10.083240223463687, + "grad_norm": 0.34848225116729736, + "learning_rate": 0.0004975350140056022, + "loss": 0.3846, + "step": 18049 + }, + { + "epoch": 10.083798882681565, + "grad_norm": 0.46043679118156433, + "learning_rate": 0.0004975070028011205, + "loss": 0.5424, + "step": 18050 + }, + { + "epoch": 10.084357541899442, + "grad_norm": 0.5182067155838013, + "learning_rate": 0.0004974789915966387, + "loss": 0.4828, + "step": 18051 + }, + { + "epoch": 10.084916201117318, + "grad_norm": 0.6389055848121643, + "learning_rate": 0.0004974509803921569, + "loss": 0.3969, + "step": 18052 + }, + { + "epoch": 10.085474860335195, + "grad_norm": 0.6957405805587769, + "learning_rate": 0.0004974229691876751, + "loss": 0.4516, + "step": 18053 + }, + { + "epoch": 10.086033519553073, + "grad_norm": 0.761493444442749, + "learning_rate": 0.0004973949579831933, + "loss": 0.7, + "step": 18054 + }, + { + "epoch": 10.08659217877095, + "grad_norm": 0.34772759675979614, + "learning_rate": 0.0004973669467787115, + "loss": 0.3788, + "step": 18055 + }, + { + "epoch": 10.087150837988826, + "grad_norm": 0.6579962968826294, + "learning_rate": 0.0004973389355742297, + "loss": 0.4843, + "step": 18056 + }, + { + "epoch": 10.087709497206705, + "grad_norm": 1.1136418581008911, + "learning_rate": 0.0004973109243697479, + "loss": 0.4985, + "step": 18057 + }, + { + "epoch": 10.088268156424581, + "grad_norm": 0.5118082761764526, + "learning_rate": 0.0004972829131652661, + "loss": 0.4457, + "step": 18058 + }, + { + "epoch": 10.088826815642458, + "grad_norm": 0.40080875158309937, + "learning_rate": 0.0004972549019607843, + "loss": 0.5929, + "step": 18059 + }, + { + "epoch": 10.089385474860336, + "grad_norm": 0.5157171487808228, + "learning_rate": 0.0004972268907563025, + "loss": 0.4551, + "step": 18060 + }, + { + "epoch": 10.089944134078213, + "grad_norm": 0.9593400955200195, + "learning_rate": 0.0004971988795518207, + "loss": 0.5289, + "step": 18061 + }, + { + "epoch": 10.09050279329609, + "grad_norm": 2.8089518547058105, + "learning_rate": 0.0004971708683473389, + "loss": 0.378, + "step": 18062 + }, + { + "epoch": 10.091061452513966, + "grad_norm": 3.404726266860962, + "learning_rate": 0.0004971428571428571, + "loss": 0.3314, + "step": 18063 + }, + { + "epoch": 10.091620111731844, + "grad_norm": 0.4110618829727173, + "learning_rate": 0.0004971148459383753, + "loss": 0.4634, + "step": 18064 + }, + { + "epoch": 10.09217877094972, + "grad_norm": 0.5282087326049805, + "learning_rate": 0.0004970868347338935, + "loss": 0.3968, + "step": 18065 + }, + { + "epoch": 10.092737430167597, + "grad_norm": 0.5879765152931213, + "learning_rate": 0.0004970588235294117, + "loss": 0.594, + "step": 18066 + }, + { + "epoch": 10.093296089385476, + "grad_norm": 0.5131786465644836, + "learning_rate": 0.00049703081232493, + "loss": 0.4439, + "step": 18067 + }, + { + "epoch": 10.093854748603352, + "grad_norm": 0.5974563360214233, + "learning_rate": 0.0004970028011204482, + "loss": 0.3734, + "step": 18068 + }, + { + "epoch": 10.094413407821229, + "grad_norm": 0.4999324083328247, + "learning_rate": 0.0004969747899159664, + "loss": 0.4895, + "step": 18069 + }, + { + "epoch": 10.094972067039107, + "grad_norm": 0.5282110571861267, + "learning_rate": 0.0004969467787114846, + "loss": 0.4574, + "step": 18070 + }, + { + "epoch": 10.095530726256984, + "grad_norm": 0.37837883830070496, + "learning_rate": 0.0004969187675070028, + "loss": 0.4464, + "step": 18071 + }, + { + "epoch": 10.09608938547486, + "grad_norm": 0.5894453525543213, + "learning_rate": 0.000496890756302521, + "loss": 0.4037, + "step": 18072 + }, + { + "epoch": 10.096648044692737, + "grad_norm": 0.49189305305480957, + "learning_rate": 0.0004968627450980393, + "loss": 0.4571, + "step": 18073 + }, + { + "epoch": 10.097206703910615, + "grad_norm": 0.5240940451622009, + "learning_rate": 0.0004968347338935574, + "loss": 0.3701, + "step": 18074 + }, + { + "epoch": 10.097765363128492, + "grad_norm": 0.5711789727210999, + "learning_rate": 0.0004968067226890756, + "loss": 0.4576, + "step": 18075 + }, + { + "epoch": 10.098324022346368, + "grad_norm": 0.5005568265914917, + "learning_rate": 0.0004967787114845938, + "loss": 0.5472, + "step": 18076 + }, + { + "epoch": 10.098882681564247, + "grad_norm": 0.43374624848365784, + "learning_rate": 0.000496750700280112, + "loss": 0.3925, + "step": 18077 + }, + { + "epoch": 10.099441340782123, + "grad_norm": 0.6369708180427551, + "learning_rate": 0.0004967226890756303, + "loss": 0.4185, + "step": 18078 + }, + { + "epoch": 10.1, + "grad_norm": 0.49052974581718445, + "learning_rate": 0.0004966946778711484, + "loss": 0.4878, + "step": 18079 + }, + { + "epoch": 10.100558659217878, + "grad_norm": 0.6738065481185913, + "learning_rate": 0.0004966666666666666, + "loss": 0.4841, + "step": 18080 + }, + { + "epoch": 10.101117318435755, + "grad_norm": 1.1961359977722168, + "learning_rate": 0.000496638655462185, + "loss": 0.4584, + "step": 18081 + }, + { + "epoch": 10.101675977653631, + "grad_norm": 0.5539975166320801, + "learning_rate": 0.000496610644257703, + "loss": 0.4395, + "step": 18082 + }, + { + "epoch": 10.102234636871508, + "grad_norm": 0.9628750085830688, + "learning_rate": 0.0004965826330532214, + "loss": 0.4173, + "step": 18083 + }, + { + "epoch": 10.102793296089386, + "grad_norm": 0.4032144844532013, + "learning_rate": 0.0004965546218487395, + "loss": 0.3792, + "step": 18084 + }, + { + "epoch": 10.103351955307263, + "grad_norm": 1.255530834197998, + "learning_rate": 0.0004965266106442577, + "loss": 0.3715, + "step": 18085 + }, + { + "epoch": 10.10391061452514, + "grad_norm": 0.3632354438304901, + "learning_rate": 0.000496498599439776, + "loss": 0.4655, + "step": 18086 + }, + { + "epoch": 10.104469273743018, + "grad_norm": 0.7581839561462402, + "learning_rate": 0.0004964705882352941, + "loss": 0.3759, + "step": 18087 + }, + { + "epoch": 10.105027932960894, + "grad_norm": 0.8161591291427612, + "learning_rate": 0.0004964425770308124, + "loss": 0.455, + "step": 18088 + }, + { + "epoch": 10.10558659217877, + "grad_norm": 0.6192954778671265, + "learning_rate": 0.0004964145658263306, + "loss": 0.3898, + "step": 18089 + }, + { + "epoch": 10.106145251396647, + "grad_norm": 0.4983682334423065, + "learning_rate": 0.0004963865546218487, + "loss": 0.4853, + "step": 18090 + }, + { + "epoch": 10.106703910614526, + "grad_norm": 0.5258122682571411, + "learning_rate": 0.000496358543417367, + "loss": 0.4513, + "step": 18091 + }, + { + "epoch": 10.107262569832402, + "grad_norm": 0.6247334480285645, + "learning_rate": 0.0004963305322128851, + "loss": 0.421, + "step": 18092 + }, + { + "epoch": 10.107821229050279, + "grad_norm": 0.5308733582496643, + "learning_rate": 0.0004963025210084034, + "loss": 0.4366, + "step": 18093 + }, + { + "epoch": 10.108379888268157, + "grad_norm": 0.3999197483062744, + "learning_rate": 0.0004962745098039216, + "loss": 0.3886, + "step": 18094 + }, + { + "epoch": 10.108938547486034, + "grad_norm": 0.47458192706108093, + "learning_rate": 0.0004962464985994397, + "loss": 0.3928, + "step": 18095 + }, + { + "epoch": 10.10949720670391, + "grad_norm": 0.5964575409889221, + "learning_rate": 0.000496218487394958, + "loss": 0.479, + "step": 18096 + }, + { + "epoch": 10.110055865921789, + "grad_norm": 0.5020251870155334, + "learning_rate": 0.0004961904761904762, + "loss": 0.4364, + "step": 18097 + }, + { + "epoch": 10.110614525139665, + "grad_norm": 1.2431504726409912, + "learning_rate": 0.0004961624649859944, + "loss": 0.5645, + "step": 18098 + }, + { + "epoch": 10.111173184357542, + "grad_norm": 0.5170184373855591, + "learning_rate": 0.0004961344537815127, + "loss": 0.4006, + "step": 18099 + }, + { + "epoch": 10.111731843575418, + "grad_norm": 0.4978105425834656, + "learning_rate": 0.0004961064425770308, + "loss": 0.3498, + "step": 18100 + }, + { + "epoch": 10.112290502793297, + "grad_norm": 0.5204949378967285, + "learning_rate": 0.0004960784313725491, + "loss": 0.5475, + "step": 18101 + }, + { + "epoch": 10.112849162011173, + "grad_norm": 0.5680293440818787, + "learning_rate": 0.0004960504201680673, + "loss": 0.4506, + "step": 18102 + }, + { + "epoch": 10.11340782122905, + "grad_norm": 0.5670507550239563, + "learning_rate": 0.0004960224089635855, + "loss": 0.5376, + "step": 18103 + }, + { + "epoch": 10.113966480446928, + "grad_norm": 0.41433772444725037, + "learning_rate": 0.0004959943977591037, + "loss": 0.5161, + "step": 18104 + }, + { + "epoch": 10.114525139664805, + "grad_norm": 0.44655802845954895, + "learning_rate": 0.0004959663865546219, + "loss": 0.4676, + "step": 18105 + }, + { + "epoch": 10.115083798882681, + "grad_norm": 0.6903935670852661, + "learning_rate": 0.0004959383753501401, + "loss": 0.5333, + "step": 18106 + }, + { + "epoch": 10.11564245810056, + "grad_norm": 0.4442330300807953, + "learning_rate": 0.0004959103641456583, + "loss": 0.434, + "step": 18107 + }, + { + "epoch": 10.116201117318436, + "grad_norm": 0.8517215251922607, + "learning_rate": 0.0004958823529411765, + "loss": 0.4592, + "step": 18108 + }, + { + "epoch": 10.116759776536313, + "grad_norm": 0.5763704776763916, + "learning_rate": 0.0004958543417366947, + "loss": 0.4281, + "step": 18109 + }, + { + "epoch": 10.11731843575419, + "grad_norm": 0.7409971952438354, + "learning_rate": 0.0004958263305322129, + "loss": 0.5307, + "step": 18110 + }, + { + "epoch": 10.117877094972068, + "grad_norm": 0.3384193480014801, + "learning_rate": 0.0004957983193277311, + "loss": 0.3736, + "step": 18111 + }, + { + "epoch": 10.118435754189944, + "grad_norm": 0.5298641920089722, + "learning_rate": 0.0004957703081232493, + "loss": 0.4543, + "step": 18112 + }, + { + "epoch": 10.11899441340782, + "grad_norm": 1.018093228340149, + "learning_rate": 0.0004957422969187675, + "loss": 0.397, + "step": 18113 + }, + { + "epoch": 10.119553072625699, + "grad_norm": 0.7656188011169434, + "learning_rate": 0.0004957142857142857, + "loss": 0.4782, + "step": 18114 + }, + { + "epoch": 10.120111731843576, + "grad_norm": 0.4250064790248871, + "learning_rate": 0.000495686274509804, + "loss": 0.4174, + "step": 18115 + }, + { + "epoch": 10.120670391061452, + "grad_norm": 3.176320791244507, + "learning_rate": 0.0004956582633053222, + "loss": 0.4966, + "step": 18116 + }, + { + "epoch": 10.121229050279329, + "grad_norm": 0.9759548306465149, + "learning_rate": 0.0004956302521008404, + "loss": 0.4804, + "step": 18117 + }, + { + "epoch": 10.121787709497207, + "grad_norm": 1.0502034425735474, + "learning_rate": 0.0004956022408963586, + "loss": 0.4608, + "step": 18118 + }, + { + "epoch": 10.122346368715084, + "grad_norm": 0.6199468374252319, + "learning_rate": 0.0004955742296918768, + "loss": 0.6233, + "step": 18119 + }, + { + "epoch": 10.12290502793296, + "grad_norm": 0.7264186143875122, + "learning_rate": 0.000495546218487395, + "loss": 0.4783, + "step": 18120 + }, + { + "epoch": 10.123463687150839, + "grad_norm": 0.5781332850456238, + "learning_rate": 0.0004955182072829132, + "loss": 0.5594, + "step": 18121 + }, + { + "epoch": 10.124022346368715, + "grad_norm": 0.8736269474029541, + "learning_rate": 0.0004954901960784314, + "loss": 0.4358, + "step": 18122 + }, + { + "epoch": 10.124581005586592, + "grad_norm": 0.6616122722625732, + "learning_rate": 0.0004954621848739496, + "loss": 0.4337, + "step": 18123 + }, + { + "epoch": 10.12513966480447, + "grad_norm": 0.4661155343055725, + "learning_rate": 0.0004954341736694678, + "loss": 0.4205, + "step": 18124 + }, + { + "epoch": 10.125698324022347, + "grad_norm": 0.3954992890357971, + "learning_rate": 0.000495406162464986, + "loss": 0.3641, + "step": 18125 + }, + { + "epoch": 10.126256983240223, + "grad_norm": 0.42161041498184204, + "learning_rate": 0.0004953781512605042, + "loss": 0.3659, + "step": 18126 + }, + { + "epoch": 10.1268156424581, + "grad_norm": 0.6067443490028381, + "learning_rate": 0.0004953501400560224, + "loss": 0.3581, + "step": 18127 + }, + { + "epoch": 10.127374301675978, + "grad_norm": 0.5355249047279358, + "learning_rate": 0.0004953221288515406, + "loss": 0.3928, + "step": 18128 + }, + { + "epoch": 10.127932960893855, + "grad_norm": 0.619052529335022, + "learning_rate": 0.0004952941176470588, + "loss": 0.402, + "step": 18129 + }, + { + "epoch": 10.128491620111731, + "grad_norm": 2.500716209411621, + "learning_rate": 0.000495266106442577, + "loss": 0.44, + "step": 18130 + }, + { + "epoch": 10.12905027932961, + "grad_norm": 0.6444413065910339, + "learning_rate": 0.0004952380952380952, + "loss": 0.5018, + "step": 18131 + }, + { + "epoch": 10.129608938547486, + "grad_norm": 0.4753556251525879, + "learning_rate": 0.0004952100840336135, + "loss": 0.394, + "step": 18132 + }, + { + "epoch": 10.130167597765363, + "grad_norm": 0.44528928399086, + "learning_rate": 0.0004951820728291317, + "loss": 0.3956, + "step": 18133 + }, + { + "epoch": 10.130726256983241, + "grad_norm": 0.471708744764328, + "learning_rate": 0.0004951540616246499, + "loss": 0.4736, + "step": 18134 + }, + { + "epoch": 10.131284916201118, + "grad_norm": 2.1290929317474365, + "learning_rate": 0.0004951260504201681, + "loss": 0.427, + "step": 18135 + }, + { + "epoch": 10.131843575418994, + "grad_norm": 1.0335098505020142, + "learning_rate": 0.0004950980392156863, + "loss": 0.4109, + "step": 18136 + }, + { + "epoch": 10.13240223463687, + "grad_norm": 0.49380993843078613, + "learning_rate": 0.0004950700280112045, + "loss": 0.4965, + "step": 18137 + }, + { + "epoch": 10.132960893854749, + "grad_norm": 1.5063607692718506, + "learning_rate": 0.0004950420168067227, + "loss": 0.3847, + "step": 18138 + }, + { + "epoch": 10.133519553072626, + "grad_norm": 1.4352372884750366, + "learning_rate": 0.0004950140056022409, + "loss": 0.4098, + "step": 18139 + }, + { + "epoch": 10.134078212290502, + "grad_norm": 0.48557960987091064, + "learning_rate": 0.0004949859943977591, + "loss": 0.3645, + "step": 18140 + }, + { + "epoch": 10.13463687150838, + "grad_norm": 1.1111335754394531, + "learning_rate": 0.0004949579831932773, + "loss": 0.4738, + "step": 18141 + }, + { + "epoch": 10.135195530726257, + "grad_norm": 0.3614799678325653, + "learning_rate": 0.0004949299719887955, + "loss": 0.4305, + "step": 18142 + }, + { + "epoch": 10.135754189944134, + "grad_norm": 0.6348438858985901, + "learning_rate": 0.0004949019607843137, + "loss": 0.4955, + "step": 18143 + }, + { + "epoch": 10.136312849162012, + "grad_norm": 2.1298398971557617, + "learning_rate": 0.0004948739495798319, + "loss": 0.4701, + "step": 18144 + }, + { + "epoch": 10.136871508379889, + "grad_norm": 0.5491769313812256, + "learning_rate": 0.0004948459383753501, + "loss": 0.389, + "step": 18145 + }, + { + "epoch": 10.137430167597765, + "grad_norm": 0.5336526036262512, + "learning_rate": 0.0004948179271708683, + "loss": 0.5628, + "step": 18146 + }, + { + "epoch": 10.137988826815642, + "grad_norm": 0.47665512561798096, + "learning_rate": 0.0004947899159663865, + "loss": 0.41, + "step": 18147 + }, + { + "epoch": 10.13854748603352, + "grad_norm": 7.293799877166748, + "learning_rate": 0.0004947619047619047, + "loss": 0.472, + "step": 18148 + }, + { + "epoch": 10.139106145251397, + "grad_norm": 12.360549926757812, + "learning_rate": 0.000494733893557423, + "loss": 0.3851, + "step": 18149 + }, + { + "epoch": 10.139664804469273, + "grad_norm": 0.47572776675224304, + "learning_rate": 0.0004947058823529412, + "loss": 0.4082, + "step": 18150 + }, + { + "epoch": 10.140223463687152, + "grad_norm": 0.5174591541290283, + "learning_rate": 0.0004946778711484594, + "loss": 0.364, + "step": 18151 + }, + { + "epoch": 10.140782122905028, + "grad_norm": 0.5342432260513306, + "learning_rate": 0.0004946498599439776, + "loss": 0.4121, + "step": 18152 + }, + { + "epoch": 10.141340782122905, + "grad_norm": 0.6754600405693054, + "learning_rate": 0.0004946218487394958, + "loss": 0.6592, + "step": 18153 + }, + { + "epoch": 10.141899441340781, + "grad_norm": 0.37145087122917175, + "learning_rate": 0.000494593837535014, + "loss": 0.4249, + "step": 18154 + }, + { + "epoch": 10.14245810055866, + "grad_norm": 0.541826069355011, + "learning_rate": 0.0004945658263305322, + "loss": 0.3715, + "step": 18155 + }, + { + "epoch": 10.143016759776536, + "grad_norm": 2.7477400302886963, + "learning_rate": 0.0004945378151260504, + "loss": 0.3865, + "step": 18156 + }, + { + "epoch": 10.143575418994413, + "grad_norm": 0.45038044452667236, + "learning_rate": 0.0004945098039215686, + "loss": 0.3855, + "step": 18157 + }, + { + "epoch": 10.144134078212291, + "grad_norm": 0.7185829877853394, + "learning_rate": 0.0004944817927170868, + "loss": 0.3504, + "step": 18158 + }, + { + "epoch": 10.144692737430168, + "grad_norm": 0.5916851758956909, + "learning_rate": 0.000494453781512605, + "loss": 0.4265, + "step": 18159 + }, + { + "epoch": 10.145251396648044, + "grad_norm": 3.196746826171875, + "learning_rate": 0.0004944257703081232, + "loss": 0.3565, + "step": 18160 + }, + { + "epoch": 10.145810055865923, + "grad_norm": 0.41470780968666077, + "learning_rate": 0.0004943977591036415, + "loss": 0.4461, + "step": 18161 + }, + { + "epoch": 10.1463687150838, + "grad_norm": 0.6514317989349365, + "learning_rate": 0.0004943697478991596, + "loss": 0.3879, + "step": 18162 + }, + { + "epoch": 10.146927374301676, + "grad_norm": 0.4813372492790222, + "learning_rate": 0.0004943417366946778, + "loss": 0.5368, + "step": 18163 + }, + { + "epoch": 10.147486033519552, + "grad_norm": 0.8580808043479919, + "learning_rate": 0.000494313725490196, + "loss": 0.4005, + "step": 18164 + }, + { + "epoch": 10.14804469273743, + "grad_norm": 0.5529158711433411, + "learning_rate": 0.0004942857142857143, + "loss": 0.4396, + "step": 18165 + }, + { + "epoch": 10.148603351955307, + "grad_norm": 0.4545688033103943, + "learning_rate": 0.0004942577030812326, + "loss": 0.4924, + "step": 18166 + }, + { + "epoch": 10.149162011173184, + "grad_norm": 0.526366651058197, + "learning_rate": 0.0004942296918767507, + "loss": 0.3854, + "step": 18167 + }, + { + "epoch": 10.149720670391062, + "grad_norm": 0.353569895029068, + "learning_rate": 0.0004942016806722689, + "loss": 0.4371, + "step": 18168 + }, + { + "epoch": 10.150279329608939, + "grad_norm": 0.8198625445365906, + "learning_rate": 0.0004941736694677872, + "loss": 0.4525, + "step": 18169 + }, + { + "epoch": 10.150837988826815, + "grad_norm": 3.6741857528686523, + "learning_rate": 0.0004941456582633053, + "loss": 0.4214, + "step": 18170 + }, + { + "epoch": 10.151396648044694, + "grad_norm": 0.4922366142272949, + "learning_rate": 0.0004941176470588236, + "loss": 0.4973, + "step": 18171 + }, + { + "epoch": 10.15195530726257, + "grad_norm": 0.4238635301589966, + "learning_rate": 0.0004940896358543417, + "loss": 0.5431, + "step": 18172 + }, + { + "epoch": 10.152513966480447, + "grad_norm": 2.6393191814422607, + "learning_rate": 0.0004940616246498599, + "loss": 0.3844, + "step": 18173 + }, + { + "epoch": 10.153072625698323, + "grad_norm": 0.5375121831893921, + "learning_rate": 0.0004940336134453782, + "loss": 0.4498, + "step": 18174 + }, + { + "epoch": 10.153631284916202, + "grad_norm": 0.47014671564102173, + "learning_rate": 0.0004940056022408963, + "loss": 0.3736, + "step": 18175 + }, + { + "epoch": 10.154189944134078, + "grad_norm": 3.2533578872680664, + "learning_rate": 0.0004939775910364146, + "loss": 0.4734, + "step": 18176 + }, + { + "epoch": 10.154748603351955, + "grad_norm": 0.6932738423347473, + "learning_rate": 0.0004939495798319328, + "loss": 0.4479, + "step": 18177 + }, + { + "epoch": 10.155307262569833, + "grad_norm": 0.6455178260803223, + "learning_rate": 0.0004939215686274509, + "loss": 0.5127, + "step": 18178 + }, + { + "epoch": 10.15586592178771, + "grad_norm": 0.7665501236915588, + "learning_rate": 0.0004938935574229692, + "loss": 0.3283, + "step": 18179 + }, + { + "epoch": 10.156424581005586, + "grad_norm": 0.49217841029167175, + "learning_rate": 0.0004938655462184873, + "loss": 0.3913, + "step": 18180 + }, + { + "epoch": 10.156983240223465, + "grad_norm": 0.5359972715377808, + "learning_rate": 0.0004938375350140057, + "loss": 0.507, + "step": 18181 + }, + { + "epoch": 10.157541899441341, + "grad_norm": 0.5304698944091797, + "learning_rate": 0.0004938095238095239, + "loss": 0.523, + "step": 18182 + }, + { + "epoch": 10.158100558659218, + "grad_norm": 0.4691931903362274, + "learning_rate": 0.000493781512605042, + "loss": 0.4527, + "step": 18183 + }, + { + "epoch": 10.158659217877094, + "grad_norm": 0.6343221664428711, + "learning_rate": 0.0004937535014005603, + "loss": 0.4865, + "step": 18184 + }, + { + "epoch": 10.159217877094973, + "grad_norm": 0.47323688864707947, + "learning_rate": 0.0004937254901960785, + "loss": 0.4604, + "step": 18185 + }, + { + "epoch": 10.15977653631285, + "grad_norm": 0.4352110028266907, + "learning_rate": 0.0004936974789915967, + "loss": 0.4303, + "step": 18186 + }, + { + "epoch": 10.160335195530726, + "grad_norm": 0.6599476933479309, + "learning_rate": 0.0004936694677871149, + "loss": 0.4235, + "step": 18187 + }, + { + "epoch": 10.160893854748604, + "grad_norm": 0.42270833253860474, + "learning_rate": 0.000493641456582633, + "loss": 0.4375, + "step": 18188 + }, + { + "epoch": 10.16145251396648, + "grad_norm": 0.5312519073486328, + "learning_rate": 0.0004936134453781513, + "loss": 0.3648, + "step": 18189 + }, + { + "epoch": 10.162011173184357, + "grad_norm": 0.9655329585075378, + "learning_rate": 0.0004935854341736695, + "loss": 0.4165, + "step": 18190 + }, + { + "epoch": 10.162569832402234, + "grad_norm": 0.4983498156070709, + "learning_rate": 0.0004935574229691877, + "loss": 0.4158, + "step": 18191 + }, + { + "epoch": 10.163128491620112, + "grad_norm": 0.561797022819519, + "learning_rate": 0.0004935294117647059, + "loss": 0.4084, + "step": 18192 + }, + { + "epoch": 10.163687150837989, + "grad_norm": 0.41133561730384827, + "learning_rate": 0.0004935014005602241, + "loss": 0.3845, + "step": 18193 + }, + { + "epoch": 10.164245810055865, + "grad_norm": 0.489717036485672, + "learning_rate": 0.0004934733893557423, + "loss": 0.4187, + "step": 18194 + }, + { + "epoch": 10.164804469273744, + "grad_norm": 2.4858293533325195, + "learning_rate": 0.0004934453781512605, + "loss": 0.4201, + "step": 18195 + }, + { + "epoch": 10.16536312849162, + "grad_norm": 0.39093923568725586, + "learning_rate": 0.0004934173669467787, + "loss": 0.4491, + "step": 18196 + }, + { + "epoch": 10.165921787709497, + "grad_norm": 0.605914294719696, + "learning_rate": 0.000493389355742297, + "loss": 0.4077, + "step": 18197 + }, + { + "epoch": 10.166480446927375, + "grad_norm": 0.6784054040908813, + "learning_rate": 0.0004933613445378152, + "loss": 0.4893, + "step": 18198 + }, + { + "epoch": 10.167039106145252, + "grad_norm": 1.2724392414093018, + "learning_rate": 0.0004933333333333334, + "loss": 0.3767, + "step": 18199 + }, + { + "epoch": 10.167597765363128, + "grad_norm": 0.4819677174091339, + "learning_rate": 0.0004933053221288516, + "loss": 0.4753, + "step": 18200 + }, + { + "epoch": 10.168156424581005, + "grad_norm": 0.5700948238372803, + "learning_rate": 0.0004932773109243698, + "loss": 0.6043, + "step": 18201 + }, + { + "epoch": 10.168715083798883, + "grad_norm": 1.1814908981323242, + "learning_rate": 0.000493249299719888, + "loss": 0.4522, + "step": 18202 + }, + { + "epoch": 10.16927374301676, + "grad_norm": 0.47052881121635437, + "learning_rate": 0.0004932212885154062, + "loss": 0.4532, + "step": 18203 + }, + { + "epoch": 10.169832402234636, + "grad_norm": 1.132826566696167, + "learning_rate": 0.0004931932773109244, + "loss": 0.5304, + "step": 18204 + }, + { + "epoch": 10.170391061452515, + "grad_norm": 0.5704160928726196, + "learning_rate": 0.0004931652661064426, + "loss": 0.4389, + "step": 18205 + }, + { + "epoch": 10.170949720670391, + "grad_norm": 0.5240380764007568, + "learning_rate": 0.0004931372549019608, + "loss": 0.4061, + "step": 18206 + }, + { + "epoch": 10.171508379888268, + "grad_norm": 0.42711037397384644, + "learning_rate": 0.000493109243697479, + "loss": 0.3504, + "step": 18207 + }, + { + "epoch": 10.172067039106146, + "grad_norm": 0.5761241316795349, + "learning_rate": 0.0004930812324929972, + "loss": 0.4013, + "step": 18208 + }, + { + "epoch": 10.172625698324023, + "grad_norm": 0.369295597076416, + "learning_rate": 0.0004930532212885154, + "loss": 0.3438, + "step": 18209 + }, + { + "epoch": 10.1731843575419, + "grad_norm": 0.42897453904151917, + "learning_rate": 0.0004930252100840336, + "loss": 0.366, + "step": 18210 + }, + { + "epoch": 10.173743016759776, + "grad_norm": 0.7355853915214539, + "learning_rate": 0.0004929971988795518, + "loss": 0.4373, + "step": 18211 + }, + { + "epoch": 10.174301675977654, + "grad_norm": 1.1857928037643433, + "learning_rate": 0.00049296918767507, + "loss": 0.4844, + "step": 18212 + }, + { + "epoch": 10.17486033519553, + "grad_norm": 1.0118834972381592, + "learning_rate": 0.0004929411764705882, + "loss": 0.4156, + "step": 18213 + }, + { + "epoch": 10.175418994413407, + "grad_norm": 0.5024769306182861, + "learning_rate": 0.0004929131652661065, + "loss": 0.4816, + "step": 18214 + }, + { + "epoch": 10.175977653631286, + "grad_norm": 0.45484137535095215, + "learning_rate": 0.0004928851540616247, + "loss": 0.3951, + "step": 18215 + }, + { + "epoch": 10.176536312849162, + "grad_norm": 0.5874055624008179, + "learning_rate": 0.0004928571428571429, + "loss": 0.4298, + "step": 18216 + }, + { + "epoch": 10.177094972067039, + "grad_norm": 0.6035540103912354, + "learning_rate": 0.0004928291316526611, + "loss": 0.5326, + "step": 18217 + }, + { + "epoch": 10.177653631284917, + "grad_norm": 1.7438260316848755, + "learning_rate": 0.0004928011204481793, + "loss": 0.4025, + "step": 18218 + }, + { + "epoch": 10.178212290502794, + "grad_norm": 0.37224647402763367, + "learning_rate": 0.0004927731092436975, + "loss": 0.3906, + "step": 18219 + }, + { + "epoch": 10.17877094972067, + "grad_norm": 0.6038146018981934, + "learning_rate": 0.0004927450980392157, + "loss": 0.4161, + "step": 18220 + }, + { + "epoch": 10.179329608938547, + "grad_norm": 0.4241250157356262, + "learning_rate": 0.0004927170868347339, + "loss": 0.4258, + "step": 18221 + }, + { + "epoch": 10.179888268156425, + "grad_norm": 0.45891451835632324, + "learning_rate": 0.0004926890756302521, + "loss": 0.4263, + "step": 18222 + }, + { + "epoch": 10.180446927374302, + "grad_norm": 0.47805875539779663, + "learning_rate": 0.0004926610644257703, + "loss": 0.4186, + "step": 18223 + }, + { + "epoch": 10.181005586592178, + "grad_norm": 0.4869680404663086, + "learning_rate": 0.0004926330532212885, + "loss": 0.3178, + "step": 18224 + }, + { + "epoch": 10.181564245810057, + "grad_norm": 0.49226653575897217, + "learning_rate": 0.0004926050420168067, + "loss": 0.4549, + "step": 18225 + }, + { + "epoch": 10.182122905027933, + "grad_norm": 1.3692060708999634, + "learning_rate": 0.0004925770308123249, + "loss": 0.3805, + "step": 18226 + }, + { + "epoch": 10.18268156424581, + "grad_norm": 0.5086750388145447, + "learning_rate": 0.0004925490196078431, + "loss": 0.4788, + "step": 18227 + }, + { + "epoch": 10.183240223463686, + "grad_norm": 0.4098435938358307, + "learning_rate": 0.0004925210084033613, + "loss": 0.4599, + "step": 18228 + }, + { + "epoch": 10.183798882681565, + "grad_norm": 0.38413023948669434, + "learning_rate": 0.0004924929971988795, + "loss": 0.3367, + "step": 18229 + }, + { + "epoch": 10.184357541899441, + "grad_norm": 0.5165103673934937, + "learning_rate": 0.0004924649859943977, + "loss": 0.4078, + "step": 18230 + }, + { + "epoch": 10.184916201117318, + "grad_norm": 0.6591992378234863, + "learning_rate": 0.000492436974789916, + "loss": 0.5216, + "step": 18231 + }, + { + "epoch": 10.185474860335196, + "grad_norm": 0.6246803402900696, + "learning_rate": 0.0004924089635854342, + "loss": 0.4171, + "step": 18232 + }, + { + "epoch": 10.186033519553073, + "grad_norm": 0.41472601890563965, + "learning_rate": 0.0004923809523809524, + "loss": 0.3166, + "step": 18233 + }, + { + "epoch": 10.18659217877095, + "grad_norm": 0.5151275992393494, + "learning_rate": 0.0004923529411764706, + "loss": 0.3946, + "step": 18234 + }, + { + "epoch": 10.187150837988828, + "grad_norm": 0.6874570250511169, + "learning_rate": 0.0004923249299719888, + "loss": 0.4903, + "step": 18235 + }, + { + "epoch": 10.187709497206704, + "grad_norm": 1.259940266609192, + "learning_rate": 0.0004922969187675071, + "loss": 0.4883, + "step": 18236 + }, + { + "epoch": 10.18826815642458, + "grad_norm": 0.5133796334266663, + "learning_rate": 0.0004922689075630252, + "loss": 0.4804, + "step": 18237 + }, + { + "epoch": 10.188826815642457, + "grad_norm": 0.4293467402458191, + "learning_rate": 0.0004922408963585434, + "loss": 0.3459, + "step": 18238 + }, + { + "epoch": 10.189385474860336, + "grad_norm": 0.5919665098190308, + "learning_rate": 0.0004922128851540616, + "loss": 0.3801, + "step": 18239 + }, + { + "epoch": 10.189944134078212, + "grad_norm": 1.5704176425933838, + "learning_rate": 0.0004921848739495798, + "loss": 0.3732, + "step": 18240 + }, + { + "epoch": 10.190502793296089, + "grad_norm": 0.5676593780517578, + "learning_rate": 0.000492156862745098, + "loss": 0.4393, + "step": 18241 + }, + { + "epoch": 10.191061452513967, + "grad_norm": 0.45903074741363525, + "learning_rate": 0.0004921288515406162, + "loss": 0.3448, + "step": 18242 + }, + { + "epoch": 10.191620111731844, + "grad_norm": 0.8279778361320496, + "learning_rate": 0.0004921008403361344, + "loss": 0.5311, + "step": 18243 + }, + { + "epoch": 10.19217877094972, + "grad_norm": 0.39877593517303467, + "learning_rate": 0.0004920728291316527, + "loss": 0.4688, + "step": 18244 + }, + { + "epoch": 10.192737430167599, + "grad_norm": 0.393664687871933, + "learning_rate": 0.0004920448179271708, + "loss": 0.3654, + "step": 18245 + }, + { + "epoch": 10.193296089385475, + "grad_norm": 0.5351702570915222, + "learning_rate": 0.000492016806722689, + "loss": 0.5736, + "step": 18246 + }, + { + "epoch": 10.193854748603352, + "grad_norm": 0.535545825958252, + "learning_rate": 0.0004919887955182073, + "loss": 0.4901, + "step": 18247 + }, + { + "epoch": 10.194413407821228, + "grad_norm": 0.6228219866752625, + "learning_rate": 0.0004919607843137255, + "loss": 0.8606, + "step": 18248 + }, + { + "epoch": 10.194972067039107, + "grad_norm": 0.823955237865448, + "learning_rate": 0.0004919327731092438, + "loss": 0.5484, + "step": 18249 + }, + { + "epoch": 10.195530726256983, + "grad_norm": 0.5338960289955139, + "learning_rate": 0.0004919047619047619, + "loss": 0.3941, + "step": 18250 + }, + { + "epoch": 10.19608938547486, + "grad_norm": 0.9280003905296326, + "learning_rate": 0.0004918767507002801, + "loss": 0.4034, + "step": 18251 + }, + { + "epoch": 10.196648044692738, + "grad_norm": 0.5905791521072388, + "learning_rate": 0.0004918487394957984, + "loss": 0.4744, + "step": 18252 + }, + { + "epoch": 10.197206703910615, + "grad_norm": 1.1923887729644775, + "learning_rate": 0.0004918207282913165, + "loss": 0.4744, + "step": 18253 + }, + { + "epoch": 10.197765363128491, + "grad_norm": 0.5870530605316162, + "learning_rate": 0.0004917927170868348, + "loss": 0.4276, + "step": 18254 + }, + { + "epoch": 10.19832402234637, + "grad_norm": 1.8262437582015991, + "learning_rate": 0.0004917647058823529, + "loss": 0.6013, + "step": 18255 + }, + { + "epoch": 10.198882681564246, + "grad_norm": 0.5767238140106201, + "learning_rate": 0.0004917366946778711, + "loss": 0.4003, + "step": 18256 + }, + { + "epoch": 10.199441340782123, + "grad_norm": 0.6672751903533936, + "learning_rate": 0.0004917086834733894, + "loss": 0.5047, + "step": 18257 + }, + { + "epoch": 10.2, + "grad_norm": 1.7866566181182861, + "learning_rate": 0.0004916806722689075, + "loss": 0.3972, + "step": 18258 + }, + { + "epoch": 10.200558659217878, + "grad_norm": 0.651545524597168, + "learning_rate": 0.0004916526610644258, + "loss": 0.3761, + "step": 18259 + }, + { + "epoch": 10.201117318435754, + "grad_norm": 0.5305073261260986, + "learning_rate": 0.000491624649859944, + "loss": 0.4314, + "step": 18260 + }, + { + "epoch": 10.20167597765363, + "grad_norm": 0.6352785229682922, + "learning_rate": 0.0004915966386554621, + "loss": 0.3676, + "step": 18261 + }, + { + "epoch": 10.202234636871509, + "grad_norm": 0.5113483667373657, + "learning_rate": 0.0004915686274509804, + "loss": 0.4431, + "step": 18262 + }, + { + "epoch": 10.202793296089386, + "grad_norm": 8.376091957092285, + "learning_rate": 0.0004915406162464985, + "loss": 0.5128, + "step": 18263 + }, + { + "epoch": 10.203351955307262, + "grad_norm": 0.7416399121284485, + "learning_rate": 0.0004915126050420169, + "loss": 0.6542, + "step": 18264 + }, + { + "epoch": 10.203910614525139, + "grad_norm": 0.8436415791511536, + "learning_rate": 0.0004914845938375351, + "loss": 0.5969, + "step": 18265 + }, + { + "epoch": 10.204469273743017, + "grad_norm": 1.1047639846801758, + "learning_rate": 0.0004914565826330532, + "loss": 0.4549, + "step": 18266 + }, + { + "epoch": 10.205027932960894, + "grad_norm": 0.5435006618499756, + "learning_rate": 0.0004914285714285715, + "loss": 0.4351, + "step": 18267 + }, + { + "epoch": 10.20558659217877, + "grad_norm": 0.6689305901527405, + "learning_rate": 0.0004914005602240897, + "loss": 0.4636, + "step": 18268 + }, + { + "epoch": 10.206145251396649, + "grad_norm": 0.5237941145896912, + "learning_rate": 0.0004913725490196079, + "loss": 0.3409, + "step": 18269 + }, + { + "epoch": 10.206703910614525, + "grad_norm": 0.4465031921863556, + "learning_rate": 0.0004913445378151261, + "loss": 0.4917, + "step": 18270 + }, + { + "epoch": 10.207262569832402, + "grad_norm": 0.5103542804718018, + "learning_rate": 0.0004913165266106442, + "loss": 0.4172, + "step": 18271 + }, + { + "epoch": 10.20782122905028, + "grad_norm": 0.7021303176879883, + "learning_rate": 0.0004912885154061625, + "loss": 0.5499, + "step": 18272 + }, + { + "epoch": 10.208379888268157, + "grad_norm": 0.6792715787887573, + "learning_rate": 0.0004912605042016807, + "loss": 0.5376, + "step": 18273 + }, + { + "epoch": 10.208938547486033, + "grad_norm": 0.455497682094574, + "learning_rate": 0.0004912324929971989, + "loss": 0.6352, + "step": 18274 + }, + { + "epoch": 10.20949720670391, + "grad_norm": 0.4163501262664795, + "learning_rate": 0.0004912044817927171, + "loss": 0.365, + "step": 18275 + }, + { + "epoch": 10.210055865921788, + "grad_norm": 0.5540227293968201, + "learning_rate": 0.0004911764705882353, + "loss": 0.4643, + "step": 18276 + }, + { + "epoch": 10.210614525139665, + "grad_norm": 1.2980217933654785, + "learning_rate": 0.0004911484593837535, + "loss": 0.5005, + "step": 18277 + }, + { + "epoch": 10.211173184357541, + "grad_norm": 0.4069842994213104, + "learning_rate": 0.0004911204481792717, + "loss": 0.4948, + "step": 18278 + }, + { + "epoch": 10.21173184357542, + "grad_norm": 0.4946157932281494, + "learning_rate": 0.00049109243697479, + "loss": 0.4181, + "step": 18279 + }, + { + "epoch": 10.212290502793296, + "grad_norm": 1.5342533588409424, + "learning_rate": 0.0004910644257703082, + "loss": 0.4167, + "step": 18280 + }, + { + "epoch": 10.212849162011173, + "grad_norm": 1.0761616230010986, + "learning_rate": 0.0004910364145658264, + "loss": 0.3983, + "step": 18281 + }, + { + "epoch": 10.213407821229051, + "grad_norm": 0.5338460803031921, + "learning_rate": 0.0004910084033613446, + "loss": 0.4254, + "step": 18282 + }, + { + "epoch": 10.213966480446928, + "grad_norm": 0.46749648451805115, + "learning_rate": 0.0004909803921568628, + "loss": 0.3791, + "step": 18283 + }, + { + "epoch": 10.214525139664804, + "grad_norm": 0.43255990743637085, + "learning_rate": 0.000490952380952381, + "loss": 0.5062, + "step": 18284 + }, + { + "epoch": 10.21508379888268, + "grad_norm": 1.0770831108093262, + "learning_rate": 0.0004909243697478992, + "loss": 0.475, + "step": 18285 + }, + { + "epoch": 10.21564245810056, + "grad_norm": 0.4636324942111969, + "learning_rate": 0.0004908963585434174, + "loss": 0.4113, + "step": 18286 + }, + { + "epoch": 10.216201117318436, + "grad_norm": 0.6983132362365723, + "learning_rate": 0.0004908683473389356, + "loss": 0.4709, + "step": 18287 + }, + { + "epoch": 10.216759776536312, + "grad_norm": 0.6466900706291199, + "learning_rate": 0.0004908403361344538, + "loss": 0.4279, + "step": 18288 + }, + { + "epoch": 10.21731843575419, + "grad_norm": 0.5919768810272217, + "learning_rate": 0.000490812324929972, + "loss": 0.3915, + "step": 18289 + }, + { + "epoch": 10.217877094972067, + "grad_norm": 0.48180440068244934, + "learning_rate": 0.0004907843137254902, + "loss": 0.3838, + "step": 18290 + }, + { + "epoch": 10.218435754189944, + "grad_norm": 0.5758348107337952, + "learning_rate": 0.0004907563025210084, + "loss": 0.425, + "step": 18291 + }, + { + "epoch": 10.21899441340782, + "grad_norm": 2.158036708831787, + "learning_rate": 0.0004907282913165266, + "loss": 0.6896, + "step": 18292 + }, + { + "epoch": 10.219553072625699, + "grad_norm": 0.3848896324634552, + "learning_rate": 0.0004907002801120448, + "loss": 0.357, + "step": 18293 + }, + { + "epoch": 10.220111731843575, + "grad_norm": 0.6241127848625183, + "learning_rate": 0.000490672268907563, + "loss": 0.4611, + "step": 18294 + }, + { + "epoch": 10.220670391061452, + "grad_norm": 0.588411271572113, + "learning_rate": 0.0004906442577030812, + "loss": 0.4577, + "step": 18295 + }, + { + "epoch": 10.22122905027933, + "grad_norm": 0.4453670382499695, + "learning_rate": 0.0004906162464985995, + "loss": 0.5436, + "step": 18296 + }, + { + "epoch": 10.221787709497207, + "grad_norm": 0.7782946825027466, + "learning_rate": 0.0004905882352941177, + "loss": 0.4541, + "step": 18297 + }, + { + "epoch": 10.222346368715083, + "grad_norm": 0.4636748433113098, + "learning_rate": 0.0004905602240896359, + "loss": 0.4391, + "step": 18298 + }, + { + "epoch": 10.222905027932962, + "grad_norm": 1.5460546016693115, + "learning_rate": 0.0004905322128851541, + "loss": 0.4155, + "step": 18299 + }, + { + "epoch": 10.223463687150838, + "grad_norm": 1.421724796295166, + "learning_rate": 0.0004905042016806723, + "loss": 0.3777, + "step": 18300 + }, + { + "epoch": 10.224022346368715, + "grad_norm": 0.4560874402523041, + "learning_rate": 0.0004904761904761905, + "loss": 0.4149, + "step": 18301 + }, + { + "epoch": 10.224581005586591, + "grad_norm": 0.40369710326194763, + "learning_rate": 0.0004904481792717087, + "loss": 0.4196, + "step": 18302 + }, + { + "epoch": 10.22513966480447, + "grad_norm": 1.113968014717102, + "learning_rate": 0.0004904201680672269, + "loss": 0.413, + "step": 18303 + }, + { + "epoch": 10.225698324022346, + "grad_norm": 1.7583075761795044, + "learning_rate": 0.0004903921568627451, + "loss": 0.503, + "step": 18304 + }, + { + "epoch": 10.226256983240223, + "grad_norm": 0.33197975158691406, + "learning_rate": 0.0004903641456582633, + "loss": 0.3293, + "step": 18305 + }, + { + "epoch": 10.226815642458101, + "grad_norm": 0.4173920154571533, + "learning_rate": 0.0004903361344537815, + "loss": 0.3964, + "step": 18306 + }, + { + "epoch": 10.227374301675978, + "grad_norm": 3.0750677585601807, + "learning_rate": 0.0004903081232492997, + "loss": 0.3213, + "step": 18307 + }, + { + "epoch": 10.227932960893854, + "grad_norm": 1.424278974533081, + "learning_rate": 0.0004902801120448179, + "loss": 0.437, + "step": 18308 + }, + { + "epoch": 10.228491620111733, + "grad_norm": 0.42078617215156555, + "learning_rate": 0.0004902521008403361, + "loss": 0.3803, + "step": 18309 + }, + { + "epoch": 10.22905027932961, + "grad_norm": 0.6340539455413818, + "learning_rate": 0.0004902240896358543, + "loss": 0.4872, + "step": 18310 + }, + { + "epoch": 10.229608938547486, + "grad_norm": 0.629845380783081, + "learning_rate": 0.0004901960784313725, + "loss": 0.4366, + "step": 18311 + }, + { + "epoch": 10.230167597765362, + "grad_norm": 0.8952680826187134, + "learning_rate": 0.0004901680672268907, + "loss": 0.4015, + "step": 18312 + }, + { + "epoch": 10.23072625698324, + "grad_norm": 0.5406718850135803, + "learning_rate": 0.000490140056022409, + "loss": 0.466, + "step": 18313 + }, + { + "epoch": 10.231284916201117, + "grad_norm": 2.296178102493286, + "learning_rate": 0.0004901120448179272, + "loss": 0.4153, + "step": 18314 + }, + { + "epoch": 10.231843575418994, + "grad_norm": 1.01360285282135, + "learning_rate": 0.0004900840336134454, + "loss": 0.5564, + "step": 18315 + }, + { + "epoch": 10.232402234636872, + "grad_norm": 0.4351801574230194, + "learning_rate": 0.0004900560224089636, + "loss": 0.4249, + "step": 18316 + }, + { + "epoch": 10.232960893854749, + "grad_norm": 0.6425051689147949, + "learning_rate": 0.0004900280112044818, + "loss": 0.387, + "step": 18317 + }, + { + "epoch": 10.233519553072625, + "grad_norm": 0.5039723515510559, + "learning_rate": 0.00049, + "loss": 0.4677, + "step": 18318 + }, + { + "epoch": 10.234078212290504, + "grad_norm": 0.41770872473716736, + "learning_rate": 0.0004899719887955182, + "loss": 0.3289, + "step": 18319 + }, + { + "epoch": 10.23463687150838, + "grad_norm": 0.497715026140213, + "learning_rate": 0.0004899439775910364, + "loss": 0.4723, + "step": 18320 + }, + { + "epoch": 10.235195530726257, + "grad_norm": 0.5172687768936157, + "learning_rate": 0.0004899159663865546, + "loss": 0.4855, + "step": 18321 + }, + { + "epoch": 10.235754189944133, + "grad_norm": 0.37893110513687134, + "learning_rate": 0.0004898879551820728, + "loss": 0.4661, + "step": 18322 + }, + { + "epoch": 10.236312849162012, + "grad_norm": 0.4221186935901642, + "learning_rate": 0.000489859943977591, + "loss": 0.3704, + "step": 18323 + }, + { + "epoch": 10.236871508379888, + "grad_norm": 0.5625306963920593, + "learning_rate": 0.0004898319327731093, + "loss": 0.4644, + "step": 18324 + }, + { + "epoch": 10.237430167597765, + "grad_norm": 0.4371757209300995, + "learning_rate": 0.0004898039215686274, + "loss": 0.4999, + "step": 18325 + }, + { + "epoch": 10.237988826815643, + "grad_norm": 0.46273258328437805, + "learning_rate": 0.0004897759103641456, + "loss": 0.5286, + "step": 18326 + }, + { + "epoch": 10.23854748603352, + "grad_norm": 0.4261535406112671, + "learning_rate": 0.0004897478991596638, + "loss": 0.3816, + "step": 18327 + }, + { + "epoch": 10.239106145251396, + "grad_norm": 0.4743141829967499, + "learning_rate": 0.000489719887955182, + "loss": 0.4516, + "step": 18328 + }, + { + "epoch": 10.239664804469275, + "grad_norm": 1.27565598487854, + "learning_rate": 0.0004896918767507004, + "loss": 0.4046, + "step": 18329 + }, + { + "epoch": 10.240223463687151, + "grad_norm": 0.3932933509349823, + "learning_rate": 0.0004896638655462185, + "loss": 0.4082, + "step": 18330 + }, + { + "epoch": 10.240782122905028, + "grad_norm": 0.5737097263336182, + "learning_rate": 0.0004896358543417367, + "loss": 0.5647, + "step": 18331 + }, + { + "epoch": 10.241340782122904, + "grad_norm": 0.47776395082473755, + "learning_rate": 0.000489607843137255, + "loss": 0.3503, + "step": 18332 + }, + { + "epoch": 10.241899441340783, + "grad_norm": 0.3598838746547699, + "learning_rate": 0.0004895798319327731, + "loss": 0.4211, + "step": 18333 + }, + { + "epoch": 10.24245810055866, + "grad_norm": 0.5205954313278198, + "learning_rate": 0.0004895518207282914, + "loss": 0.3933, + "step": 18334 + }, + { + "epoch": 10.243016759776536, + "grad_norm": 0.45160582661628723, + "learning_rate": 0.0004895238095238095, + "loss": 0.3835, + "step": 18335 + }, + { + "epoch": 10.243575418994414, + "grad_norm": 0.8252968788146973, + "learning_rate": 0.0004894957983193277, + "loss": 0.3551, + "step": 18336 + }, + { + "epoch": 10.24413407821229, + "grad_norm": 0.5469398498535156, + "learning_rate": 0.000489467787114846, + "loss": 0.4355, + "step": 18337 + }, + { + "epoch": 10.244692737430167, + "grad_norm": 1.1218631267547607, + "learning_rate": 0.0004894397759103641, + "loss": 0.3217, + "step": 18338 + }, + { + "epoch": 10.245251396648044, + "grad_norm": 0.48319295048713684, + "learning_rate": 0.0004894117647058824, + "loss": 0.4432, + "step": 18339 + }, + { + "epoch": 10.245810055865922, + "grad_norm": 0.3665015399456024, + "learning_rate": 0.0004893837535014006, + "loss": 0.3908, + "step": 18340 + }, + { + "epoch": 10.246368715083799, + "grad_norm": 0.892652690410614, + "learning_rate": 0.0004893557422969187, + "loss": 0.4797, + "step": 18341 + }, + { + "epoch": 10.246927374301675, + "grad_norm": 0.6330655813217163, + "learning_rate": 0.000489327731092437, + "loss": 0.3845, + "step": 18342 + }, + { + "epoch": 10.247486033519554, + "grad_norm": 0.40359973907470703, + "learning_rate": 0.0004892997198879551, + "loss": 0.4087, + "step": 18343 + }, + { + "epoch": 10.24804469273743, + "grad_norm": 0.6576468348503113, + "learning_rate": 0.0004892717086834734, + "loss": 0.4535, + "step": 18344 + }, + { + "epoch": 10.248603351955307, + "grad_norm": 0.5093878507614136, + "learning_rate": 0.0004892436974789917, + "loss": 0.3408, + "step": 18345 + }, + { + "epoch": 10.249162011173185, + "grad_norm": 0.4071648120880127, + "learning_rate": 0.0004892156862745098, + "loss": 0.3672, + "step": 18346 + }, + { + "epoch": 10.249720670391062, + "grad_norm": 0.4684867858886719, + "learning_rate": 0.0004891876750700281, + "loss": 0.4411, + "step": 18347 + }, + { + "epoch": 10.250279329608938, + "grad_norm": 1.053542971611023, + "learning_rate": 0.0004891596638655463, + "loss": 0.39, + "step": 18348 + }, + { + "epoch": 10.250837988826815, + "grad_norm": 1.0055053234100342, + "learning_rate": 0.0004891316526610645, + "loss": 0.7021, + "step": 18349 + }, + { + "epoch": 10.251396648044693, + "grad_norm": 3.780545711517334, + "learning_rate": 0.0004891036414565827, + "loss": 0.3463, + "step": 18350 + }, + { + "epoch": 10.25195530726257, + "grad_norm": 0.3567473590373993, + "learning_rate": 0.0004890756302521008, + "loss": 0.3226, + "step": 18351 + }, + { + "epoch": 10.252513966480446, + "grad_norm": 0.38406747579574585, + "learning_rate": 0.0004890476190476191, + "loss": 0.4224, + "step": 18352 + }, + { + "epoch": 10.253072625698325, + "grad_norm": 0.6070484519004822, + "learning_rate": 0.0004890196078431373, + "loss": 0.4863, + "step": 18353 + }, + { + "epoch": 10.253631284916201, + "grad_norm": 0.5813414454460144, + "learning_rate": 0.0004889915966386554, + "loss": 0.4126, + "step": 18354 + }, + { + "epoch": 10.254189944134078, + "grad_norm": 0.6065589189529419, + "learning_rate": 0.0004889635854341737, + "loss": 0.4598, + "step": 18355 + }, + { + "epoch": 10.254748603351956, + "grad_norm": 0.5930861830711365, + "learning_rate": 0.0004889355742296919, + "loss": 0.4485, + "step": 18356 + }, + { + "epoch": 10.255307262569833, + "grad_norm": 0.6899927854537964, + "learning_rate": 0.0004889075630252101, + "loss": 0.6009, + "step": 18357 + }, + { + "epoch": 10.25586592178771, + "grad_norm": 0.5042360424995422, + "learning_rate": 0.0004888795518207283, + "loss": 0.4671, + "step": 18358 + }, + { + "epoch": 10.256424581005586, + "grad_norm": 0.7181597352027893, + "learning_rate": 0.0004888515406162464, + "loss": 0.4897, + "step": 18359 + }, + { + "epoch": 10.256983240223464, + "grad_norm": 1.2735143899917603, + "learning_rate": 0.0004888235294117647, + "loss": 0.5099, + "step": 18360 + }, + { + "epoch": 10.25754189944134, + "grad_norm": 0.6729240417480469, + "learning_rate": 0.000488795518207283, + "loss": 0.3862, + "step": 18361 + }, + { + "epoch": 10.258100558659217, + "grad_norm": 0.6995847821235657, + "learning_rate": 0.0004887675070028012, + "loss": 0.484, + "step": 18362 + }, + { + "epoch": 10.258659217877096, + "grad_norm": 0.42959994077682495, + "learning_rate": 0.0004887394957983194, + "loss": 0.4264, + "step": 18363 + }, + { + "epoch": 10.259217877094972, + "grad_norm": 1.6488803625106812, + "learning_rate": 0.0004887114845938376, + "loss": 0.3879, + "step": 18364 + }, + { + "epoch": 10.259776536312849, + "grad_norm": 0.4537966549396515, + "learning_rate": 0.0004886834733893558, + "loss": 0.4116, + "step": 18365 + }, + { + "epoch": 10.260335195530725, + "grad_norm": 0.5976909399032593, + "learning_rate": 0.000488655462184874, + "loss": 0.3789, + "step": 18366 + }, + { + "epoch": 10.260893854748604, + "grad_norm": 0.6692116856575012, + "learning_rate": 0.0004886274509803922, + "loss": 0.4396, + "step": 18367 + }, + { + "epoch": 10.26145251396648, + "grad_norm": 2.3390302658081055, + "learning_rate": 0.0004885994397759104, + "loss": 0.4978, + "step": 18368 + }, + { + "epoch": 10.262011173184357, + "grad_norm": 1.1843814849853516, + "learning_rate": 0.0004885714285714286, + "loss": 0.5256, + "step": 18369 + }, + { + "epoch": 10.262569832402235, + "grad_norm": 1.6812130212783813, + "learning_rate": 0.0004885434173669468, + "loss": 0.4063, + "step": 18370 + }, + { + "epoch": 10.263128491620112, + "grad_norm": 0.6371800899505615, + "learning_rate": 0.000488515406162465, + "loss": 0.4366, + "step": 18371 + }, + { + "epoch": 10.263687150837988, + "grad_norm": 0.45672523975372314, + "learning_rate": 0.0004884873949579832, + "loss": 0.5356, + "step": 18372 + }, + { + "epoch": 10.264245810055867, + "grad_norm": 0.44220802187919617, + "learning_rate": 0.0004884593837535014, + "loss": 0.6256, + "step": 18373 + }, + { + "epoch": 10.264804469273743, + "grad_norm": 0.472271591424942, + "learning_rate": 0.0004884313725490196, + "loss": 0.4116, + "step": 18374 + }, + { + "epoch": 10.26536312849162, + "grad_norm": 0.6216697096824646, + "learning_rate": 0.0004884033613445378, + "loss": 0.3601, + "step": 18375 + }, + { + "epoch": 10.265921787709496, + "grad_norm": 0.5687991380691528, + "learning_rate": 0.000488375350140056, + "loss": 0.3816, + "step": 18376 + }, + { + "epoch": 10.266480446927375, + "grad_norm": 0.49071210622787476, + "learning_rate": 0.0004883473389355742, + "loss": 0.3691, + "step": 18377 + }, + { + "epoch": 10.267039106145251, + "grad_norm": 0.5811916589736938, + "learning_rate": 0.0004883193277310925, + "loss": 0.4555, + "step": 18378 + }, + { + "epoch": 10.267597765363128, + "grad_norm": 0.4637194275856018, + "learning_rate": 0.0004882913165266107, + "loss": 0.4222, + "step": 18379 + }, + { + "epoch": 10.268156424581006, + "grad_norm": 0.9916163086891174, + "learning_rate": 0.00048826330532212886, + "loss": 0.3469, + "step": 18380 + }, + { + "epoch": 10.268715083798883, + "grad_norm": 0.4936084747314453, + "learning_rate": 0.00048823529411764707, + "loss": 0.3552, + "step": 18381 + }, + { + "epoch": 10.26927374301676, + "grad_norm": 0.4554215371608734, + "learning_rate": 0.0004882072829131653, + "loss": 0.4944, + "step": 18382 + }, + { + "epoch": 10.269832402234638, + "grad_norm": 0.5257346034049988, + "learning_rate": 0.0004881792717086835, + "loss": 0.4905, + "step": 18383 + }, + { + "epoch": 10.270391061452514, + "grad_norm": 0.661540150642395, + "learning_rate": 0.0004881512605042017, + "loss": 0.4703, + "step": 18384 + }, + { + "epoch": 10.27094972067039, + "grad_norm": 1.3188999891281128, + "learning_rate": 0.0004881232492997199, + "loss": 0.4724, + "step": 18385 + }, + { + "epoch": 10.271508379888267, + "grad_norm": 2.559964418411255, + "learning_rate": 0.0004880952380952381, + "loss": 0.4642, + "step": 18386 + }, + { + "epoch": 10.272067039106146, + "grad_norm": 5.632225036621094, + "learning_rate": 0.0004880672268907563, + "loss": 0.3893, + "step": 18387 + }, + { + "epoch": 10.272625698324022, + "grad_norm": 0.5761430859565735, + "learning_rate": 0.0004880392156862745, + "loss": 0.4852, + "step": 18388 + }, + { + "epoch": 10.273184357541899, + "grad_norm": 0.4436241090297699, + "learning_rate": 0.0004880112044817927, + "loss": 0.456, + "step": 18389 + }, + { + "epoch": 10.273743016759777, + "grad_norm": 0.41078269481658936, + "learning_rate": 0.0004879831932773109, + "loss": 0.4756, + "step": 18390 + }, + { + "epoch": 10.274301675977654, + "grad_norm": 0.8542408347129822, + "learning_rate": 0.00048795518207282913, + "loss": 0.5041, + "step": 18391 + }, + { + "epoch": 10.27486033519553, + "grad_norm": 0.7590973377227783, + "learning_rate": 0.0004879271708683474, + "loss": 0.6663, + "step": 18392 + }, + { + "epoch": 10.275418994413409, + "grad_norm": 0.45177537202835083, + "learning_rate": 0.00048789915966386554, + "loss": 0.4165, + "step": 18393 + }, + { + "epoch": 10.275977653631285, + "grad_norm": 4.279130458831787, + "learning_rate": 0.00048787114845938375, + "loss": 0.3249, + "step": 18394 + }, + { + "epoch": 10.276536312849162, + "grad_norm": 0.5268144607543945, + "learning_rate": 0.00048784313725490195, + "loss": 0.4678, + "step": 18395 + }, + { + "epoch": 10.277094972067038, + "grad_norm": 0.38767245411872864, + "learning_rate": 0.00048781512605042016, + "loss": 0.3663, + "step": 18396 + }, + { + "epoch": 10.277653631284917, + "grad_norm": 0.642672061920166, + "learning_rate": 0.0004877871148459384, + "loss": 0.49, + "step": 18397 + }, + { + "epoch": 10.278212290502793, + "grad_norm": 0.39219579100608826, + "learning_rate": 0.00048775910364145657, + "loss": 0.4031, + "step": 18398 + }, + { + "epoch": 10.27877094972067, + "grad_norm": 0.4188181757926941, + "learning_rate": 0.0004877310924369748, + "loss": 0.4243, + "step": 18399 + }, + { + "epoch": 10.279329608938548, + "grad_norm": 0.4049554467201233, + "learning_rate": 0.00048770308123249304, + "loss": 0.4487, + "step": 18400 + }, + { + "epoch": 10.279888268156425, + "grad_norm": 2.1493453979492188, + "learning_rate": 0.0004876750700280112, + "loss": 0.411, + "step": 18401 + }, + { + "epoch": 10.280446927374301, + "grad_norm": 5.6451897621154785, + "learning_rate": 0.00048764705882352945, + "loss": 0.4399, + "step": 18402 + }, + { + "epoch": 10.28100558659218, + "grad_norm": 0.40940535068511963, + "learning_rate": 0.0004876190476190476, + "loss": 0.4728, + "step": 18403 + }, + { + "epoch": 10.281564245810056, + "grad_norm": 0.3710485100746155, + "learning_rate": 0.0004875910364145658, + "loss": 0.3848, + "step": 18404 + }, + { + "epoch": 10.282122905027933, + "grad_norm": 1.3781269788742065, + "learning_rate": 0.00048756302521008407, + "loss": 0.3992, + "step": 18405 + }, + { + "epoch": 10.28268156424581, + "grad_norm": 0.5431393980979919, + "learning_rate": 0.0004875350140056022, + "loss": 0.392, + "step": 18406 + }, + { + "epoch": 10.283240223463688, + "grad_norm": 0.5656924247741699, + "learning_rate": 0.0004875070028011205, + "loss": 0.4264, + "step": 18407 + }, + { + "epoch": 10.283798882681564, + "grad_norm": 0.6770188808441162, + "learning_rate": 0.0004874789915966387, + "loss": 0.4653, + "step": 18408 + }, + { + "epoch": 10.28435754189944, + "grad_norm": 0.6118614077568054, + "learning_rate": 0.00048745098039215684, + "loss": 0.3346, + "step": 18409 + }, + { + "epoch": 10.28491620111732, + "grad_norm": 0.6598475575447083, + "learning_rate": 0.0004874229691876751, + "loss": 0.3794, + "step": 18410 + }, + { + "epoch": 10.285474860335196, + "grad_norm": 0.5067694783210754, + "learning_rate": 0.00048739495798319325, + "loss": 0.3673, + "step": 18411 + }, + { + "epoch": 10.286033519553072, + "grad_norm": 0.4660192131996155, + "learning_rate": 0.0004873669467787115, + "loss": 0.4083, + "step": 18412 + }, + { + "epoch": 10.286592178770949, + "grad_norm": 0.38352593779563904, + "learning_rate": 0.0004873389355742297, + "loss": 0.3487, + "step": 18413 + }, + { + "epoch": 10.287150837988827, + "grad_norm": 0.45538339018821716, + "learning_rate": 0.00048731092436974787, + "loss": 0.3654, + "step": 18414 + }, + { + "epoch": 10.287709497206704, + "grad_norm": 1.3965487480163574, + "learning_rate": 0.00048728291316526613, + "loss": 0.4017, + "step": 18415 + }, + { + "epoch": 10.28826815642458, + "grad_norm": 0.620423436164856, + "learning_rate": 0.00048725490196078433, + "loss": 0.4182, + "step": 18416 + }, + { + "epoch": 10.288826815642459, + "grad_norm": 0.575762927532196, + "learning_rate": 0.00048722689075630254, + "loss": 0.5285, + "step": 18417 + }, + { + "epoch": 10.289385474860335, + "grad_norm": 0.6503027081489563, + "learning_rate": 0.00048719887955182075, + "loss": 0.4045, + "step": 18418 + }, + { + "epoch": 10.289944134078212, + "grad_norm": 0.463217556476593, + "learning_rate": 0.0004871708683473389, + "loss": 0.4092, + "step": 18419 + }, + { + "epoch": 10.29050279329609, + "grad_norm": 0.5537541508674622, + "learning_rate": 0.00048714285714285716, + "loss": 0.3835, + "step": 18420 + }, + { + "epoch": 10.291061452513967, + "grad_norm": 0.41254428029060364, + "learning_rate": 0.00048711484593837536, + "loss": 0.4602, + "step": 18421 + }, + { + "epoch": 10.291620111731843, + "grad_norm": 1.661500334739685, + "learning_rate": 0.00048708683473389357, + "loss": 0.4145, + "step": 18422 + }, + { + "epoch": 10.29217877094972, + "grad_norm": 2.934598684310913, + "learning_rate": 0.0004870588235294118, + "loss": 0.4143, + "step": 18423 + }, + { + "epoch": 10.292737430167598, + "grad_norm": 0.48016679286956787, + "learning_rate": 0.00048703081232493, + "loss": 0.4883, + "step": 18424 + }, + { + "epoch": 10.293296089385475, + "grad_norm": 0.5752936601638794, + "learning_rate": 0.0004870028011204482, + "loss": 0.4257, + "step": 18425 + }, + { + "epoch": 10.293854748603351, + "grad_norm": 0.3991956412792206, + "learning_rate": 0.0004869747899159664, + "loss": 0.4346, + "step": 18426 + }, + { + "epoch": 10.29441340782123, + "grad_norm": 0.3808521330356598, + "learning_rate": 0.00048694677871148465, + "loss": 0.3322, + "step": 18427 + }, + { + "epoch": 10.294972067039106, + "grad_norm": 6.706089973449707, + "learning_rate": 0.0004869187675070028, + "loss": 0.3817, + "step": 18428 + }, + { + "epoch": 10.295530726256983, + "grad_norm": 0.5070836544036865, + "learning_rate": 0.000486890756302521, + "loss": 0.4568, + "step": 18429 + }, + { + "epoch": 10.296089385474861, + "grad_norm": 2.9301469326019287, + "learning_rate": 0.0004868627450980392, + "loss": 0.4551, + "step": 18430 + }, + { + "epoch": 10.296648044692738, + "grad_norm": 0.682773768901825, + "learning_rate": 0.0004868347338935574, + "loss": 0.5501, + "step": 18431 + }, + { + "epoch": 10.297206703910614, + "grad_norm": 0.9472801685333252, + "learning_rate": 0.0004868067226890757, + "loss": 0.5857, + "step": 18432 + }, + { + "epoch": 10.297765363128491, + "grad_norm": 0.5117703676223755, + "learning_rate": 0.00048677871148459384, + "loss": 0.4218, + "step": 18433 + }, + { + "epoch": 10.29832402234637, + "grad_norm": 0.6095899939537048, + "learning_rate": 0.00048675070028011204, + "loss": 0.3983, + "step": 18434 + }, + { + "epoch": 10.298882681564246, + "grad_norm": 0.5017976760864258, + "learning_rate": 0.0004867226890756303, + "loss": 0.4705, + "step": 18435 + }, + { + "epoch": 10.299441340782122, + "grad_norm": 0.3897111415863037, + "learning_rate": 0.00048669467787114845, + "loss": 0.3911, + "step": 18436 + }, + { + "epoch": 10.3, + "grad_norm": 0.43075549602508545, + "learning_rate": 0.0004866666666666667, + "loss": 0.3405, + "step": 18437 + }, + { + "epoch": 10.300558659217877, + "grad_norm": 0.5940316915512085, + "learning_rate": 0.00048663865546218487, + "loss": 0.4006, + "step": 18438 + }, + { + "epoch": 10.301117318435754, + "grad_norm": 0.49035781621932983, + "learning_rate": 0.00048661064425770307, + "loss": 0.6055, + "step": 18439 + }, + { + "epoch": 10.30167597765363, + "grad_norm": 0.6272029280662537, + "learning_rate": 0.00048658263305322133, + "loss": 0.4145, + "step": 18440 + }, + { + "epoch": 10.302234636871509, + "grad_norm": 0.3923509120941162, + "learning_rate": 0.0004865546218487395, + "loss": 0.4174, + "step": 18441 + }, + { + "epoch": 10.302793296089385, + "grad_norm": 0.5060518980026245, + "learning_rate": 0.00048652661064425774, + "loss": 0.4013, + "step": 18442 + }, + { + "epoch": 10.303351955307262, + "grad_norm": 1.4095302820205688, + "learning_rate": 0.00048649859943977595, + "loss": 0.4449, + "step": 18443 + }, + { + "epoch": 10.30391061452514, + "grad_norm": 0.36465132236480713, + "learning_rate": 0.0004864705882352941, + "loss": 0.4214, + "step": 18444 + }, + { + "epoch": 10.304469273743017, + "grad_norm": 0.5210620760917664, + "learning_rate": 0.00048644257703081236, + "loss": 0.3518, + "step": 18445 + }, + { + "epoch": 10.305027932960893, + "grad_norm": 1.245948314666748, + "learning_rate": 0.0004864145658263305, + "loss": 0.4379, + "step": 18446 + }, + { + "epoch": 10.305586592178772, + "grad_norm": 0.48197925090789795, + "learning_rate": 0.0004863865546218488, + "loss": 0.4393, + "step": 18447 + }, + { + "epoch": 10.306145251396648, + "grad_norm": 0.5753238201141357, + "learning_rate": 0.000486358543417367, + "loss": 0.6433, + "step": 18448 + }, + { + "epoch": 10.306703910614525, + "grad_norm": 0.45251309871673584, + "learning_rate": 0.00048633053221288513, + "loss": 0.4302, + "step": 18449 + }, + { + "epoch": 10.307262569832401, + "grad_norm": 0.5044090151786804, + "learning_rate": 0.0004863025210084034, + "loss": 0.4201, + "step": 18450 + }, + { + "epoch": 10.30782122905028, + "grad_norm": 0.45618823170661926, + "learning_rate": 0.0004862745098039216, + "loss": 0.4309, + "step": 18451 + }, + { + "epoch": 10.308379888268156, + "grad_norm": 0.466362863779068, + "learning_rate": 0.0004862464985994398, + "loss": 0.3217, + "step": 18452 + }, + { + "epoch": 10.308938547486033, + "grad_norm": 6.978814601898193, + "learning_rate": 0.000486218487394958, + "loss": 0.5725, + "step": 18453 + }, + { + "epoch": 10.309497206703911, + "grad_norm": 0.7230717539787292, + "learning_rate": 0.00048619047619047616, + "loss": 0.5104, + "step": 18454 + }, + { + "epoch": 10.310055865921788, + "grad_norm": 0.44195520877838135, + "learning_rate": 0.0004861624649859944, + "loss": 0.4972, + "step": 18455 + }, + { + "epoch": 10.310614525139664, + "grad_norm": 0.47468531131744385, + "learning_rate": 0.00048613445378151263, + "loss": 0.4006, + "step": 18456 + }, + { + "epoch": 10.311173184357543, + "grad_norm": 1.8223645687103271, + "learning_rate": 0.00048610644257703083, + "loss": 0.4071, + "step": 18457 + }, + { + "epoch": 10.31173184357542, + "grad_norm": 0.46808022260665894, + "learning_rate": 0.00048607843137254904, + "loss": 0.5316, + "step": 18458 + }, + { + "epoch": 10.312290502793296, + "grad_norm": 0.5461525917053223, + "learning_rate": 0.00048605042016806725, + "loss": 0.3873, + "step": 18459 + }, + { + "epoch": 10.312849162011172, + "grad_norm": 0.4563542306423187, + "learning_rate": 0.00048602240896358545, + "loss": 0.437, + "step": 18460 + }, + { + "epoch": 10.31340782122905, + "grad_norm": 1.3422718048095703, + "learning_rate": 0.00048599439775910366, + "loss": 0.3769, + "step": 18461 + }, + { + "epoch": 10.313966480446927, + "grad_norm": 0.399704247713089, + "learning_rate": 0.00048596638655462186, + "loss": 0.4078, + "step": 18462 + }, + { + "epoch": 10.314525139664804, + "grad_norm": 0.5637335777282715, + "learning_rate": 0.00048593837535014007, + "loss": 0.4454, + "step": 18463 + }, + { + "epoch": 10.315083798882682, + "grad_norm": 0.5965197086334229, + "learning_rate": 0.0004859103641456583, + "loss": 0.3185, + "step": 18464 + }, + { + "epoch": 10.315642458100559, + "grad_norm": 0.4701148569583893, + "learning_rate": 0.0004858823529411765, + "loss": 0.3932, + "step": 18465 + }, + { + "epoch": 10.316201117318435, + "grad_norm": 1.511795163154602, + "learning_rate": 0.0004858543417366947, + "loss": 0.4163, + "step": 18466 + }, + { + "epoch": 10.316759776536314, + "grad_norm": 0.4629707336425781, + "learning_rate": 0.0004858263305322129, + "loss": 0.4453, + "step": 18467 + }, + { + "epoch": 10.31731843575419, + "grad_norm": 2.6410837173461914, + "learning_rate": 0.0004857983193277311, + "loss": 0.4609, + "step": 18468 + }, + { + "epoch": 10.317877094972067, + "grad_norm": 1.1350764036178589, + "learning_rate": 0.0004857703081232493, + "loss": 0.3947, + "step": 18469 + }, + { + "epoch": 10.318435754189943, + "grad_norm": 0.9224898219108582, + "learning_rate": 0.0004857422969187675, + "loss": 0.355, + "step": 18470 + }, + { + "epoch": 10.318994413407822, + "grad_norm": 0.710609495639801, + "learning_rate": 0.0004857142857142857, + "loss": 0.4634, + "step": 18471 + }, + { + "epoch": 10.319553072625698, + "grad_norm": 1.5205860137939453, + "learning_rate": 0.0004856862745098039, + "loss": 0.5903, + "step": 18472 + }, + { + "epoch": 10.320111731843575, + "grad_norm": 0.6428022384643555, + "learning_rate": 0.00048565826330532213, + "loss": 0.4682, + "step": 18473 + }, + { + "epoch": 10.320670391061453, + "grad_norm": 0.4200070798397064, + "learning_rate": 0.00048563025210084034, + "loss": 0.35, + "step": 18474 + }, + { + "epoch": 10.32122905027933, + "grad_norm": 0.4596361219882965, + "learning_rate": 0.0004856022408963586, + "loss": 0.4756, + "step": 18475 + }, + { + "epoch": 10.321787709497206, + "grad_norm": 0.37173566222190857, + "learning_rate": 0.00048557422969187675, + "loss": 0.435, + "step": 18476 + }, + { + "epoch": 10.322346368715085, + "grad_norm": 0.6648851037025452, + "learning_rate": 0.00048554621848739495, + "loss": 0.3565, + "step": 18477 + }, + { + "epoch": 10.322905027932961, + "grad_norm": 1.3423856496810913, + "learning_rate": 0.00048551820728291316, + "loss": 0.4056, + "step": 18478 + }, + { + "epoch": 10.323463687150838, + "grad_norm": 0.48765912652015686, + "learning_rate": 0.00048549019607843137, + "loss": 0.5069, + "step": 18479 + }, + { + "epoch": 10.324022346368714, + "grad_norm": 0.9991682767868042, + "learning_rate": 0.0004854621848739496, + "loss": 0.4077, + "step": 18480 + }, + { + "epoch": 10.324581005586593, + "grad_norm": 3.380387544631958, + "learning_rate": 0.0004854341736694678, + "loss": 0.5988, + "step": 18481 + }, + { + "epoch": 10.32513966480447, + "grad_norm": 0.43948987126350403, + "learning_rate": 0.000485406162464986, + "loss": 0.4611, + "step": 18482 + }, + { + "epoch": 10.325698324022346, + "grad_norm": 1.0265700817108154, + "learning_rate": 0.00048537815126050424, + "loss": 0.4076, + "step": 18483 + }, + { + "epoch": 10.326256983240224, + "grad_norm": 0.6248056292533875, + "learning_rate": 0.0004853501400560224, + "loss": 0.5667, + "step": 18484 + }, + { + "epoch": 10.3268156424581, + "grad_norm": 0.428129106760025, + "learning_rate": 0.00048532212885154066, + "loss": 0.399, + "step": 18485 + }, + { + "epoch": 10.327374301675977, + "grad_norm": 1.3516876697540283, + "learning_rate": 0.0004852941176470588, + "loss": 0.3419, + "step": 18486 + }, + { + "epoch": 10.327932960893854, + "grad_norm": 0.41521528363227844, + "learning_rate": 0.000485266106442577, + "loss": 0.2776, + "step": 18487 + }, + { + "epoch": 10.328491620111732, + "grad_norm": 0.5115822553634644, + "learning_rate": 0.0004852380952380953, + "loss": 0.4015, + "step": 18488 + }, + { + "epoch": 10.329050279329609, + "grad_norm": 3.43815279006958, + "learning_rate": 0.0004852100840336134, + "loss": 0.3941, + "step": 18489 + }, + { + "epoch": 10.329608938547485, + "grad_norm": 0.5160900354385376, + "learning_rate": 0.0004851820728291317, + "loss": 0.3538, + "step": 18490 + }, + { + "epoch": 10.330167597765364, + "grad_norm": 0.5784170627593994, + "learning_rate": 0.0004851540616246499, + "loss": 0.3808, + "step": 18491 + }, + { + "epoch": 10.33072625698324, + "grad_norm": 0.5866448283195496, + "learning_rate": 0.00048512605042016804, + "loss": 0.4005, + "step": 18492 + }, + { + "epoch": 10.331284916201117, + "grad_norm": 0.6335739493370056, + "learning_rate": 0.0004850980392156863, + "loss": 0.4965, + "step": 18493 + }, + { + "epoch": 10.331843575418995, + "grad_norm": 0.9090087413787842, + "learning_rate": 0.00048507002801120446, + "loss": 0.4821, + "step": 18494 + }, + { + "epoch": 10.332402234636872, + "grad_norm": 0.6544280648231506, + "learning_rate": 0.0004850420168067227, + "loss": 0.5083, + "step": 18495 + }, + { + "epoch": 10.332960893854748, + "grad_norm": 0.5462419390678406, + "learning_rate": 0.0004850140056022409, + "loss": 0.4255, + "step": 18496 + }, + { + "epoch": 10.333519553072625, + "grad_norm": 0.4685802757740021, + "learning_rate": 0.0004849859943977591, + "loss": 0.4342, + "step": 18497 + }, + { + "epoch": 10.334078212290503, + "grad_norm": 0.46307775378227234, + "learning_rate": 0.00048495798319327733, + "loss": 0.4785, + "step": 18498 + }, + { + "epoch": 10.33463687150838, + "grad_norm": 0.6195305585861206, + "learning_rate": 0.00048492997198879554, + "loss": 0.4477, + "step": 18499 + }, + { + "epoch": 10.335195530726256, + "grad_norm": 0.4749375581741333, + "learning_rate": 0.00048490196078431375, + "loss": 0.4459, + "step": 18500 + }, + { + "epoch": 10.335195530726256, + "eval_cer": 0.08912419669841685, + "eval_loss": 0.33553820848464966, + "eval_runtime": 55.512, + "eval_samples_per_second": 81.748, + "eval_steps_per_second": 5.116, + "eval_wer": 0.3543755761012262, + "step": 18500 + }, + { + "epoch": 10.335754189944135, + "grad_norm": 1.1466726064682007, + "learning_rate": 0.00048487394957983195, + "loss": 0.2767, + "step": 18501 + }, + { + "epoch": 10.336312849162011, + "grad_norm": 0.44482168555259705, + "learning_rate": 0.0004848459383753501, + "loss": 0.3529, + "step": 18502 + }, + { + "epoch": 10.336871508379888, + "grad_norm": 0.5936769843101501, + "learning_rate": 0.00048481792717086836, + "loss": 0.4692, + "step": 18503 + }, + { + "epoch": 10.337430167597766, + "grad_norm": 0.42303466796875, + "learning_rate": 0.00048478991596638657, + "loss": 0.3935, + "step": 18504 + }, + { + "epoch": 10.337988826815643, + "grad_norm": 0.43544089794158936, + "learning_rate": 0.0004847619047619048, + "loss": 0.4149, + "step": 18505 + }, + { + "epoch": 10.33854748603352, + "grad_norm": 0.4810880124568939, + "learning_rate": 0.000484733893557423, + "loss": 0.4738, + "step": 18506 + }, + { + "epoch": 10.339106145251396, + "grad_norm": 0.6476143598556519, + "learning_rate": 0.0004847058823529412, + "loss": 0.3405, + "step": 18507 + }, + { + "epoch": 10.339664804469274, + "grad_norm": 0.5902200937271118, + "learning_rate": 0.0004846778711484594, + "loss": 0.5164, + "step": 18508 + }, + { + "epoch": 10.34022346368715, + "grad_norm": 0.8516894578933716, + "learning_rate": 0.0004846498599439776, + "loss": 0.4093, + "step": 18509 + }, + { + "epoch": 10.340782122905027, + "grad_norm": 2.0323398113250732, + "learning_rate": 0.0004846218487394958, + "loss": 0.4778, + "step": 18510 + }, + { + "epoch": 10.341340782122906, + "grad_norm": 0.7313889265060425, + "learning_rate": 0.000484593837535014, + "loss": 0.4381, + "step": 18511 + }, + { + "epoch": 10.341899441340782, + "grad_norm": 0.482281357049942, + "learning_rate": 0.0004845658263305322, + "loss": 0.3419, + "step": 18512 + }, + { + "epoch": 10.342458100558659, + "grad_norm": 0.5999189615249634, + "learning_rate": 0.0004845378151260504, + "loss": 0.4298, + "step": 18513 + }, + { + "epoch": 10.343016759776535, + "grad_norm": 0.43831872940063477, + "learning_rate": 0.00048450980392156863, + "loss": 0.4432, + "step": 18514 + }, + { + "epoch": 10.343575418994414, + "grad_norm": 0.4799043536186218, + "learning_rate": 0.0004844817927170869, + "loss": 0.414, + "step": 18515 + }, + { + "epoch": 10.34413407821229, + "grad_norm": 0.569431483745575, + "learning_rate": 0.00048445378151260504, + "loss": 0.3974, + "step": 18516 + }, + { + "epoch": 10.344692737430167, + "grad_norm": 0.47372135519981384, + "learning_rate": 0.00048442577030812325, + "loss": 0.3766, + "step": 18517 + }, + { + "epoch": 10.345251396648045, + "grad_norm": 0.685141384601593, + "learning_rate": 0.00048439775910364145, + "loss": 0.4488, + "step": 18518 + }, + { + "epoch": 10.345810055865922, + "grad_norm": 0.7046328186988831, + "learning_rate": 0.00048436974789915966, + "loss": 0.5682, + "step": 18519 + }, + { + "epoch": 10.346368715083798, + "grad_norm": 0.4307580590248108, + "learning_rate": 0.0004843417366946779, + "loss": 0.4466, + "step": 18520 + }, + { + "epoch": 10.346927374301677, + "grad_norm": 0.41561874747276306, + "learning_rate": 0.00048431372549019607, + "loss": 0.4341, + "step": 18521 + }, + { + "epoch": 10.347486033519553, + "grad_norm": 0.5324827432632446, + "learning_rate": 0.0004842857142857143, + "loss": 0.531, + "step": 18522 + }, + { + "epoch": 10.34804469273743, + "grad_norm": 0.49715253710746765, + "learning_rate": 0.00048425770308123254, + "loss": 0.3066, + "step": 18523 + }, + { + "epoch": 10.348603351955306, + "grad_norm": 0.6423181891441345, + "learning_rate": 0.0004842296918767507, + "loss": 0.7424, + "step": 18524 + }, + { + "epoch": 10.349162011173185, + "grad_norm": 0.7908527851104736, + "learning_rate": 0.00048420168067226895, + "loss": 0.4363, + "step": 18525 + }, + { + "epoch": 10.349720670391061, + "grad_norm": 0.6770151853561401, + "learning_rate": 0.0004841736694677871, + "loss": 0.4476, + "step": 18526 + }, + { + "epoch": 10.350279329608938, + "grad_norm": 0.4739961624145508, + "learning_rate": 0.0004841456582633053, + "loss": 0.3642, + "step": 18527 + }, + { + "epoch": 10.350837988826816, + "grad_norm": 1.1014668941497803, + "learning_rate": 0.00048411764705882357, + "loss": 0.4062, + "step": 18528 + }, + { + "epoch": 10.351396648044693, + "grad_norm": 2.1694178581237793, + "learning_rate": 0.0004840896358543417, + "loss": 0.4481, + "step": 18529 + }, + { + "epoch": 10.35195530726257, + "grad_norm": 0.5644750595092773, + "learning_rate": 0.00048406162464986, + "loss": 0.4766, + "step": 18530 + }, + { + "epoch": 10.352513966480448, + "grad_norm": 0.4566260576248169, + "learning_rate": 0.0004840336134453782, + "loss": 0.5553, + "step": 18531 + }, + { + "epoch": 10.353072625698324, + "grad_norm": 0.4291819930076599, + "learning_rate": 0.00048400560224089634, + "loss": 0.3907, + "step": 18532 + }, + { + "epoch": 10.3536312849162, + "grad_norm": 0.6987180709838867, + "learning_rate": 0.0004839775910364146, + "loss": 0.3594, + "step": 18533 + }, + { + "epoch": 10.354189944134077, + "grad_norm": 0.44914641976356506, + "learning_rate": 0.00048394957983193275, + "loss": 0.3452, + "step": 18534 + }, + { + "epoch": 10.354748603351956, + "grad_norm": 0.760997474193573, + "learning_rate": 0.000483921568627451, + "loss": 0.4537, + "step": 18535 + }, + { + "epoch": 10.355307262569832, + "grad_norm": 0.6706992387771606, + "learning_rate": 0.0004838935574229692, + "loss": 0.5267, + "step": 18536 + }, + { + "epoch": 10.355865921787709, + "grad_norm": 0.6490182280540466, + "learning_rate": 0.00048386554621848737, + "loss": 0.4754, + "step": 18537 + }, + { + "epoch": 10.356424581005587, + "grad_norm": 0.5547512769699097, + "learning_rate": 0.00048383753501400563, + "loss": 0.4393, + "step": 18538 + }, + { + "epoch": 10.356983240223464, + "grad_norm": 0.47428181767463684, + "learning_rate": 0.00048380952380952383, + "loss": 0.4836, + "step": 18539 + }, + { + "epoch": 10.35754189944134, + "grad_norm": 0.48504653573036194, + "learning_rate": 0.00048378151260504204, + "loss": 0.4846, + "step": 18540 + }, + { + "epoch": 10.358100558659217, + "grad_norm": 0.5197486281394958, + "learning_rate": 0.00048375350140056025, + "loss": 0.5641, + "step": 18541 + }, + { + "epoch": 10.358659217877095, + "grad_norm": 0.5134580135345459, + "learning_rate": 0.0004837254901960784, + "loss": 0.3856, + "step": 18542 + }, + { + "epoch": 10.359217877094972, + "grad_norm": 0.9766239523887634, + "learning_rate": 0.00048369747899159666, + "loss": 0.3663, + "step": 18543 + }, + { + "epoch": 10.359776536312848, + "grad_norm": 0.423331081867218, + "learning_rate": 0.00048366946778711486, + "loss": 0.4179, + "step": 18544 + }, + { + "epoch": 10.360335195530727, + "grad_norm": 0.5242456793785095, + "learning_rate": 0.00048364145658263307, + "loss": 0.4284, + "step": 18545 + }, + { + "epoch": 10.360893854748603, + "grad_norm": 0.4409019947052002, + "learning_rate": 0.0004836134453781513, + "loss": 0.4368, + "step": 18546 + }, + { + "epoch": 10.36145251396648, + "grad_norm": 0.4317437708377838, + "learning_rate": 0.0004835854341736695, + "loss": 0.3708, + "step": 18547 + }, + { + "epoch": 10.362011173184358, + "grad_norm": 0.7175754308700562, + "learning_rate": 0.0004835574229691877, + "loss": 0.4632, + "step": 18548 + }, + { + "epoch": 10.362569832402235, + "grad_norm": 0.5007825493812561, + "learning_rate": 0.0004835294117647059, + "loss": 0.4097, + "step": 18549 + }, + { + "epoch": 10.363128491620111, + "grad_norm": 0.6789500117301941, + "learning_rate": 0.0004835014005602241, + "loss": 0.5448, + "step": 18550 + }, + { + "epoch": 10.363687150837988, + "grad_norm": 0.434480220079422, + "learning_rate": 0.0004834733893557423, + "loss": 0.5772, + "step": 18551 + }, + { + "epoch": 10.364245810055866, + "grad_norm": 0.38877207040786743, + "learning_rate": 0.0004834453781512605, + "loss": 0.3669, + "step": 18552 + }, + { + "epoch": 10.364804469273743, + "grad_norm": 0.6788049936294556, + "learning_rate": 0.0004834173669467787, + "loss": 0.6052, + "step": 18553 + }, + { + "epoch": 10.36536312849162, + "grad_norm": 8.106139183044434, + "learning_rate": 0.0004833893557422969, + "loss": 0.4574, + "step": 18554 + }, + { + "epoch": 10.365921787709498, + "grad_norm": 0.6326924562454224, + "learning_rate": 0.0004833613445378152, + "loss": 0.5245, + "step": 18555 + }, + { + "epoch": 10.366480446927374, + "grad_norm": 0.4364500939846039, + "learning_rate": 0.00048333333333333334, + "loss": 0.2962, + "step": 18556 + }, + { + "epoch": 10.367039106145251, + "grad_norm": 0.3921828269958496, + "learning_rate": 0.00048330532212885154, + "loss": 0.3731, + "step": 18557 + }, + { + "epoch": 10.36759776536313, + "grad_norm": 0.33790871500968933, + "learning_rate": 0.00048327731092436975, + "loss": 0.3375, + "step": 18558 + }, + { + "epoch": 10.368156424581006, + "grad_norm": 0.4161319434642792, + "learning_rate": 0.00048324929971988795, + "loss": 0.3653, + "step": 18559 + }, + { + "epoch": 10.368715083798882, + "grad_norm": 0.3491482436656952, + "learning_rate": 0.0004832212885154062, + "loss": 0.3125, + "step": 18560 + }, + { + "epoch": 10.369273743016759, + "grad_norm": 0.4830874502658844, + "learning_rate": 0.00048319327731092437, + "loss": 0.4309, + "step": 18561 + }, + { + "epoch": 10.369832402234637, + "grad_norm": 0.5594037771224976, + "learning_rate": 0.00048316526610644257, + "loss": 0.4668, + "step": 18562 + }, + { + "epoch": 10.370391061452514, + "grad_norm": 0.45328056812286377, + "learning_rate": 0.00048313725490196083, + "loss": 0.3663, + "step": 18563 + }, + { + "epoch": 10.37094972067039, + "grad_norm": 0.550655722618103, + "learning_rate": 0.000483109243697479, + "loss": 0.3791, + "step": 18564 + }, + { + "epoch": 10.371508379888269, + "grad_norm": 0.6805617213249207, + "learning_rate": 0.00048308123249299724, + "loss": 0.3942, + "step": 18565 + }, + { + "epoch": 10.372067039106145, + "grad_norm": 0.5872673988342285, + "learning_rate": 0.0004830532212885154, + "loss": 0.3954, + "step": 18566 + }, + { + "epoch": 10.372625698324022, + "grad_norm": 0.5242465734481812, + "learning_rate": 0.0004830252100840336, + "loss": 0.5089, + "step": 18567 + }, + { + "epoch": 10.3731843575419, + "grad_norm": 0.5024154186248779, + "learning_rate": 0.00048299719887955186, + "loss": 0.495, + "step": 18568 + }, + { + "epoch": 10.373743016759777, + "grad_norm": 0.5108323097229004, + "learning_rate": 0.00048296918767507, + "loss": 0.3692, + "step": 18569 + }, + { + "epoch": 10.374301675977653, + "grad_norm": 0.6018432378768921, + "learning_rate": 0.0004829411764705883, + "loss": 0.4887, + "step": 18570 + }, + { + "epoch": 10.37486033519553, + "grad_norm": 0.9265491366386414, + "learning_rate": 0.0004829131652661065, + "loss": 0.4313, + "step": 18571 + }, + { + "epoch": 10.375418994413408, + "grad_norm": 1.1394156217575073, + "learning_rate": 0.00048288515406162463, + "loss": 0.3922, + "step": 18572 + }, + { + "epoch": 10.375977653631285, + "grad_norm": 0.6537106037139893, + "learning_rate": 0.0004828571428571429, + "loss": 0.6844, + "step": 18573 + }, + { + "epoch": 10.376536312849161, + "grad_norm": 0.6739285588264465, + "learning_rate": 0.00048282913165266104, + "loss": 0.3817, + "step": 18574 + }, + { + "epoch": 10.37709497206704, + "grad_norm": 0.49427366256713867, + "learning_rate": 0.00048280112044817925, + "loss": 0.4028, + "step": 18575 + }, + { + "epoch": 10.377653631284916, + "grad_norm": 0.856814444065094, + "learning_rate": 0.0004827731092436975, + "loss": 0.393, + "step": 18576 + }, + { + "epoch": 10.378212290502793, + "grad_norm": 0.9378678798675537, + "learning_rate": 0.00048274509803921566, + "loss": 0.5244, + "step": 18577 + }, + { + "epoch": 10.378770949720671, + "grad_norm": 0.4787735044956207, + "learning_rate": 0.0004827170868347339, + "loss": 0.3893, + "step": 18578 + }, + { + "epoch": 10.379329608938548, + "grad_norm": 1.0707627534866333, + "learning_rate": 0.00048268907563025213, + "loss": 0.475, + "step": 18579 + }, + { + "epoch": 10.379888268156424, + "grad_norm": 0.9720556139945984, + "learning_rate": 0.0004826610644257703, + "loss": 0.3682, + "step": 18580 + }, + { + "epoch": 10.380446927374301, + "grad_norm": 0.3987704813480377, + "learning_rate": 0.00048263305322128854, + "loss": 0.3484, + "step": 18581 + }, + { + "epoch": 10.38100558659218, + "grad_norm": 1.6980855464935303, + "learning_rate": 0.0004826050420168067, + "loss": 0.3772, + "step": 18582 + }, + { + "epoch": 10.381564245810056, + "grad_norm": 0.5145918726921082, + "learning_rate": 0.00048257703081232495, + "loss": 0.4045, + "step": 18583 + }, + { + "epoch": 10.382122905027932, + "grad_norm": 0.4570493698120117, + "learning_rate": 0.00048254901960784316, + "loss": 0.3807, + "step": 18584 + }, + { + "epoch": 10.38268156424581, + "grad_norm": 0.5919678807258606, + "learning_rate": 0.0004825210084033613, + "loss": 0.488, + "step": 18585 + }, + { + "epoch": 10.383240223463687, + "grad_norm": 0.9404540061950684, + "learning_rate": 0.00048249299719887957, + "loss": 0.4843, + "step": 18586 + }, + { + "epoch": 10.383798882681564, + "grad_norm": 0.5159085988998413, + "learning_rate": 0.0004824649859943978, + "loss": 0.4021, + "step": 18587 + }, + { + "epoch": 10.38435754189944, + "grad_norm": 0.47732096910476685, + "learning_rate": 0.000482436974789916, + "loss": 0.5265, + "step": 18588 + }, + { + "epoch": 10.384916201117319, + "grad_norm": 0.3719688057899475, + "learning_rate": 0.0004824089635854342, + "loss": 0.3489, + "step": 18589 + }, + { + "epoch": 10.385474860335195, + "grad_norm": 1.1730204820632935, + "learning_rate": 0.00048238095238095234, + "loss": 0.5116, + "step": 18590 + }, + { + "epoch": 10.386033519553072, + "grad_norm": 0.3958281874656677, + "learning_rate": 0.0004823529411764706, + "loss": 0.3393, + "step": 18591 + }, + { + "epoch": 10.38659217877095, + "grad_norm": 0.48720407485961914, + "learning_rate": 0.0004823249299719888, + "loss": 0.6112, + "step": 18592 + }, + { + "epoch": 10.387150837988827, + "grad_norm": 0.8982740640640259, + "learning_rate": 0.000482296918767507, + "loss": 0.5717, + "step": 18593 + }, + { + "epoch": 10.387709497206703, + "grad_norm": 0.46810615062713623, + "learning_rate": 0.0004822689075630252, + "loss": 0.5008, + "step": 18594 + }, + { + "epoch": 10.388268156424582, + "grad_norm": 0.5596848130226135, + "learning_rate": 0.0004822408963585434, + "loss": 0.403, + "step": 18595 + }, + { + "epoch": 10.388826815642458, + "grad_norm": 1.020903468132019, + "learning_rate": 0.00048221288515406163, + "loss": 0.398, + "step": 18596 + }, + { + "epoch": 10.389385474860335, + "grad_norm": 0.5042295455932617, + "learning_rate": 0.00048218487394957984, + "loss": 0.4673, + "step": 18597 + }, + { + "epoch": 10.389944134078211, + "grad_norm": 0.7460339665412903, + "learning_rate": 0.00048215686274509804, + "loss": 0.5537, + "step": 18598 + }, + { + "epoch": 10.39050279329609, + "grad_norm": 0.40921106934547424, + "learning_rate": 0.00048212885154061625, + "loss": 0.3676, + "step": 18599 + }, + { + "epoch": 10.391061452513966, + "grad_norm": 0.5893543362617493, + "learning_rate": 0.00048210084033613445, + "loss": 0.5622, + "step": 18600 + }, + { + "epoch": 10.391620111731843, + "grad_norm": 0.4187788665294647, + "learning_rate": 0.00048207282913165266, + "loss": 0.3712, + "step": 18601 + }, + { + "epoch": 10.392178770949721, + "grad_norm": 0.4277508556842804, + "learning_rate": 0.00048204481792717087, + "loss": 0.5834, + "step": 18602 + }, + { + "epoch": 10.392737430167598, + "grad_norm": 0.6084835529327393, + "learning_rate": 0.0004820168067226891, + "loss": 0.4529, + "step": 18603 + }, + { + "epoch": 10.393296089385474, + "grad_norm": 1.1256463527679443, + "learning_rate": 0.0004819887955182073, + "loss": 0.3722, + "step": 18604 + }, + { + "epoch": 10.393854748603353, + "grad_norm": 0.6930166482925415, + "learning_rate": 0.0004819607843137255, + "loss": 0.4092, + "step": 18605 + }, + { + "epoch": 10.39441340782123, + "grad_norm": 0.5441730618476868, + "learning_rate": 0.0004819327731092437, + "loss": 0.3744, + "step": 18606 + }, + { + "epoch": 10.394972067039106, + "grad_norm": 0.44403278827667236, + "learning_rate": 0.0004819047619047619, + "loss": 0.3954, + "step": 18607 + }, + { + "epoch": 10.395530726256982, + "grad_norm": 0.528443455696106, + "learning_rate": 0.00048187675070028016, + "loss": 0.4874, + "step": 18608 + }, + { + "epoch": 10.39608938547486, + "grad_norm": 0.47436845302581787, + "learning_rate": 0.0004818487394957983, + "loss": 0.4757, + "step": 18609 + }, + { + "epoch": 10.396648044692737, + "grad_norm": 0.640788733959198, + "learning_rate": 0.0004818207282913165, + "loss": 0.4745, + "step": 18610 + }, + { + "epoch": 10.397206703910614, + "grad_norm": 0.6771900653839111, + "learning_rate": 0.0004817927170868348, + "loss": 0.5779, + "step": 18611 + }, + { + "epoch": 10.397765363128492, + "grad_norm": 0.5182864665985107, + "learning_rate": 0.0004817647058823529, + "loss": 0.4725, + "step": 18612 + }, + { + "epoch": 10.398324022346369, + "grad_norm": 0.5506117343902588, + "learning_rate": 0.0004817366946778712, + "loss": 0.4565, + "step": 18613 + }, + { + "epoch": 10.398882681564245, + "grad_norm": 0.6980795860290527, + "learning_rate": 0.00048170868347338934, + "loss": 0.5009, + "step": 18614 + }, + { + "epoch": 10.399441340782122, + "grad_norm": 0.5445689558982849, + "learning_rate": 0.00048168067226890754, + "loss": 0.4513, + "step": 18615 + }, + { + "epoch": 10.4, + "grad_norm": 0.5470642447471619, + "learning_rate": 0.0004816526610644258, + "loss": 0.4577, + "step": 18616 + }, + { + "epoch": 10.400558659217877, + "grad_norm": 0.49638262391090393, + "learning_rate": 0.00048162464985994396, + "loss": 0.5327, + "step": 18617 + }, + { + "epoch": 10.401117318435753, + "grad_norm": 0.4458141028881073, + "learning_rate": 0.0004815966386554622, + "loss": 0.5097, + "step": 18618 + }, + { + "epoch": 10.401675977653632, + "grad_norm": 8.243219375610352, + "learning_rate": 0.0004815686274509804, + "loss": 0.5264, + "step": 18619 + }, + { + "epoch": 10.402234636871508, + "grad_norm": 0.3683255612850189, + "learning_rate": 0.0004815406162464986, + "loss": 0.3233, + "step": 18620 + }, + { + "epoch": 10.402793296089385, + "grad_norm": 0.5535828471183777, + "learning_rate": 0.00048151260504201683, + "loss": 0.4949, + "step": 18621 + }, + { + "epoch": 10.403351955307263, + "grad_norm": 0.44889092445373535, + "learning_rate": 0.000481484593837535, + "loss": 0.3159, + "step": 18622 + }, + { + "epoch": 10.40391061452514, + "grad_norm": 0.5467238426208496, + "learning_rate": 0.00048145658263305325, + "loss": 0.4271, + "step": 18623 + }, + { + "epoch": 10.404469273743016, + "grad_norm": 0.5221225619316101, + "learning_rate": 0.00048142857142857145, + "loss": 0.4192, + "step": 18624 + }, + { + "epoch": 10.405027932960893, + "grad_norm": 0.4691750407218933, + "learning_rate": 0.0004814005602240896, + "loss": 0.385, + "step": 18625 + }, + { + "epoch": 10.405586592178771, + "grad_norm": 0.7881584167480469, + "learning_rate": 0.00048137254901960786, + "loss": 0.3605, + "step": 18626 + }, + { + "epoch": 10.406145251396648, + "grad_norm": 0.4600408971309662, + "learning_rate": 0.00048134453781512607, + "loss": 0.4444, + "step": 18627 + }, + { + "epoch": 10.406703910614524, + "grad_norm": 0.5457239747047424, + "learning_rate": 0.0004813165266106443, + "loss": 0.4614, + "step": 18628 + }, + { + "epoch": 10.407262569832403, + "grad_norm": 0.4991118013858795, + "learning_rate": 0.0004812885154061625, + "loss": 0.4409, + "step": 18629 + }, + { + "epoch": 10.40782122905028, + "grad_norm": 3.9297842979431152, + "learning_rate": 0.00048126050420168063, + "loss": 0.4899, + "step": 18630 + }, + { + "epoch": 10.408379888268156, + "grad_norm": 1.6745059490203857, + "learning_rate": 0.0004812324929971989, + "loss": 0.5218, + "step": 18631 + }, + { + "epoch": 10.408938547486034, + "grad_norm": 0.3986119329929352, + "learning_rate": 0.0004812044817927171, + "loss": 0.4616, + "step": 18632 + }, + { + "epoch": 10.40949720670391, + "grad_norm": 0.3511456549167633, + "learning_rate": 0.0004811764705882353, + "loss": 0.3976, + "step": 18633 + }, + { + "epoch": 10.410055865921787, + "grad_norm": 0.4709932804107666, + "learning_rate": 0.0004811484593837535, + "loss": 0.4354, + "step": 18634 + }, + { + "epoch": 10.410614525139664, + "grad_norm": 0.7571269869804382, + "learning_rate": 0.0004811204481792717, + "loss": 0.4053, + "step": 18635 + }, + { + "epoch": 10.411173184357542, + "grad_norm": 0.3287263512611389, + "learning_rate": 0.0004810924369747899, + "loss": 0.3821, + "step": 18636 + }, + { + "epoch": 10.411731843575419, + "grad_norm": 0.6286267638206482, + "learning_rate": 0.00048106442577030813, + "loss": 0.5083, + "step": 18637 + }, + { + "epoch": 10.412290502793295, + "grad_norm": 0.5040934085845947, + "learning_rate": 0.0004810364145658264, + "loss": 0.5267, + "step": 18638 + }, + { + "epoch": 10.412849162011174, + "grad_norm": 0.9080308079719543, + "learning_rate": 0.00048100840336134454, + "loss": 0.49, + "step": 18639 + }, + { + "epoch": 10.41340782122905, + "grad_norm": 0.49590161442756653, + "learning_rate": 0.00048098039215686275, + "loss": 0.4509, + "step": 18640 + }, + { + "epoch": 10.413966480446927, + "grad_norm": 0.42925822734832764, + "learning_rate": 0.00048095238095238095, + "loss": 0.3837, + "step": 18641 + }, + { + "epoch": 10.414525139664805, + "grad_norm": 0.4432879388332367, + "learning_rate": 0.00048092436974789916, + "loss": 0.5905, + "step": 18642 + }, + { + "epoch": 10.415083798882682, + "grad_norm": 0.4242720305919647, + "learning_rate": 0.0004808963585434174, + "loss": 0.432, + "step": 18643 + }, + { + "epoch": 10.415642458100558, + "grad_norm": 0.4358902871608734, + "learning_rate": 0.00048086834733893557, + "loss": 0.3908, + "step": 18644 + }, + { + "epoch": 10.416201117318435, + "grad_norm": 0.45682021975517273, + "learning_rate": 0.0004808403361344538, + "loss": 0.4088, + "step": 18645 + }, + { + "epoch": 10.416759776536313, + "grad_norm": 0.41442611813545227, + "learning_rate": 0.00048081232492997204, + "loss": 0.3955, + "step": 18646 + }, + { + "epoch": 10.41731843575419, + "grad_norm": 1.1255244016647339, + "learning_rate": 0.0004807843137254902, + "loss": 0.4282, + "step": 18647 + }, + { + "epoch": 10.417877094972066, + "grad_norm": 0.8667262196540833, + "learning_rate": 0.00048075630252100845, + "loss": 0.4312, + "step": 18648 + }, + { + "epoch": 10.418435754189945, + "grad_norm": 0.41436898708343506, + "learning_rate": 0.0004807282913165266, + "loss": 0.4442, + "step": 18649 + }, + { + "epoch": 10.418994413407821, + "grad_norm": 0.638525128364563, + "learning_rate": 0.0004807002801120448, + "loss": 0.4361, + "step": 18650 + }, + { + "epoch": 10.419553072625698, + "grad_norm": 1.5867396593093872, + "learning_rate": 0.00048067226890756307, + "loss": 0.4266, + "step": 18651 + }, + { + "epoch": 10.420111731843576, + "grad_norm": 0.5097740292549133, + "learning_rate": 0.0004806442577030812, + "loss": 0.3818, + "step": 18652 + }, + { + "epoch": 10.420670391061453, + "grad_norm": 0.49930477142333984, + "learning_rate": 0.0004806162464985995, + "loss": 0.3885, + "step": 18653 + }, + { + "epoch": 10.42122905027933, + "grad_norm": 0.49806830286979675, + "learning_rate": 0.0004805882352941177, + "loss": 0.3605, + "step": 18654 + }, + { + "epoch": 10.421787709497206, + "grad_norm": 0.37628641724586487, + "learning_rate": 0.00048056022408963584, + "loss": 0.3592, + "step": 18655 + }, + { + "epoch": 10.422346368715084, + "grad_norm": 0.3944506347179413, + "learning_rate": 0.0004805322128851541, + "loss": 0.4147, + "step": 18656 + }, + { + "epoch": 10.422905027932961, + "grad_norm": 1.5315886735916138, + "learning_rate": 0.00048050420168067225, + "loss": 0.4642, + "step": 18657 + }, + { + "epoch": 10.423463687150837, + "grad_norm": 0.3762303590774536, + "learning_rate": 0.0004804761904761905, + "loss": 0.4504, + "step": 18658 + }, + { + "epoch": 10.424022346368716, + "grad_norm": 0.3893774747848511, + "learning_rate": 0.0004804481792717087, + "loss": 0.3136, + "step": 18659 + }, + { + "epoch": 10.424581005586592, + "grad_norm": 0.48417970538139343, + "learning_rate": 0.00048042016806722687, + "loss": 0.4474, + "step": 18660 + }, + { + "epoch": 10.425139664804469, + "grad_norm": 0.6687957644462585, + "learning_rate": 0.00048039215686274513, + "loss": 0.5093, + "step": 18661 + }, + { + "epoch": 10.425698324022346, + "grad_norm": 0.550114631652832, + "learning_rate": 0.00048036414565826333, + "loss": 0.6183, + "step": 18662 + }, + { + "epoch": 10.426256983240224, + "grad_norm": 0.48493045568466187, + "learning_rate": 0.00048033613445378154, + "loss": 0.4293, + "step": 18663 + }, + { + "epoch": 10.4268156424581, + "grad_norm": 0.6532963514328003, + "learning_rate": 0.00048030812324929975, + "loss": 0.4161, + "step": 18664 + }, + { + "epoch": 10.427374301675977, + "grad_norm": 0.5330410599708557, + "learning_rate": 0.0004802801120448179, + "loss": 0.3695, + "step": 18665 + }, + { + "epoch": 10.427932960893855, + "grad_norm": 0.5917158126831055, + "learning_rate": 0.00048025210084033616, + "loss": 0.4935, + "step": 18666 + }, + { + "epoch": 10.428491620111732, + "grad_norm": 0.4779118299484253, + "learning_rate": 0.00048022408963585436, + "loss": 0.3319, + "step": 18667 + }, + { + "epoch": 10.429050279329608, + "grad_norm": 0.6365251541137695, + "learning_rate": 0.00048019607843137257, + "loss": 0.4135, + "step": 18668 + }, + { + "epoch": 10.429608938547487, + "grad_norm": 0.42598897218704224, + "learning_rate": 0.0004801680672268908, + "loss": 0.2796, + "step": 18669 + }, + { + "epoch": 10.430167597765363, + "grad_norm": 0.44862398505210876, + "learning_rate": 0.000480140056022409, + "loss": 0.402, + "step": 18670 + }, + { + "epoch": 10.43072625698324, + "grad_norm": 0.5557389855384827, + "learning_rate": 0.0004801120448179272, + "loss": 0.535, + "step": 18671 + }, + { + "epoch": 10.431284916201117, + "grad_norm": 0.41991230845451355, + "learning_rate": 0.0004800840336134454, + "loss": 0.3874, + "step": 18672 + }, + { + "epoch": 10.431843575418995, + "grad_norm": 0.8289154767990112, + "learning_rate": 0.0004800560224089636, + "loss": 0.463, + "step": 18673 + }, + { + "epoch": 10.432402234636871, + "grad_norm": 0.47423654794692993, + "learning_rate": 0.0004800280112044818, + "loss": 0.4877, + "step": 18674 + }, + { + "epoch": 10.432960893854748, + "grad_norm": 1.3550552129745483, + "learning_rate": 0.00048, + "loss": 0.3922, + "step": 18675 + }, + { + "epoch": 10.433519553072626, + "grad_norm": 0.518094003200531, + "learning_rate": 0.0004799719887955182, + "loss": 0.4311, + "step": 18676 + }, + { + "epoch": 10.434078212290503, + "grad_norm": 0.7114032506942749, + "learning_rate": 0.0004799439775910364, + "loss": 0.4145, + "step": 18677 + }, + { + "epoch": 10.43463687150838, + "grad_norm": 0.5246087908744812, + "learning_rate": 0.0004799159663865547, + "loss": 0.4299, + "step": 18678 + }, + { + "epoch": 10.435195530726258, + "grad_norm": 0.7234489321708679, + "learning_rate": 0.00047988795518207284, + "loss": 0.5988, + "step": 18679 + }, + { + "epoch": 10.435754189944134, + "grad_norm": 1.3226077556610107, + "learning_rate": 0.00047985994397759104, + "loss": 0.4171, + "step": 18680 + }, + { + "epoch": 10.436312849162011, + "grad_norm": 0.8794574737548828, + "learning_rate": 0.00047983193277310925, + "loss": 0.51, + "step": 18681 + }, + { + "epoch": 10.436871508379888, + "grad_norm": 0.6875120997428894, + "learning_rate": 0.00047980392156862745, + "loss": 0.4661, + "step": 18682 + }, + { + "epoch": 10.437430167597766, + "grad_norm": 0.5675374865531921, + "learning_rate": 0.0004797759103641457, + "loss": 0.4183, + "step": 18683 + }, + { + "epoch": 10.437988826815642, + "grad_norm": 0.4331020712852478, + "learning_rate": 0.00047974789915966387, + "loss": 0.3466, + "step": 18684 + }, + { + "epoch": 10.438547486033519, + "grad_norm": 0.4964342713356018, + "learning_rate": 0.00047971988795518207, + "loss": 0.473, + "step": 18685 + }, + { + "epoch": 10.439106145251397, + "grad_norm": 0.3521219491958618, + "learning_rate": 0.00047969187675070033, + "loss": 0.4089, + "step": 18686 + }, + { + "epoch": 10.439664804469274, + "grad_norm": 0.7384117841720581, + "learning_rate": 0.0004796638655462185, + "loss": 0.4892, + "step": 18687 + }, + { + "epoch": 10.44022346368715, + "grad_norm": 0.3999738097190857, + "learning_rate": 0.0004796358543417367, + "loss": 0.4303, + "step": 18688 + }, + { + "epoch": 10.440782122905027, + "grad_norm": 0.6329213380813599, + "learning_rate": 0.0004796078431372549, + "loss": 0.5215, + "step": 18689 + }, + { + "epoch": 10.441340782122905, + "grad_norm": 0.4546065330505371, + "learning_rate": 0.0004795798319327731, + "loss": 0.42, + "step": 18690 + }, + { + "epoch": 10.441899441340782, + "grad_norm": 0.5098863840103149, + "learning_rate": 0.00047955182072829136, + "loss": 0.353, + "step": 18691 + }, + { + "epoch": 10.442458100558659, + "grad_norm": 0.455161452293396, + "learning_rate": 0.0004795238095238095, + "loss": 0.3969, + "step": 18692 + }, + { + "epoch": 10.443016759776537, + "grad_norm": 1.7095807790756226, + "learning_rate": 0.0004794957983193277, + "loss": 0.5034, + "step": 18693 + }, + { + "epoch": 10.443575418994413, + "grad_norm": 0.44077131152153015, + "learning_rate": 0.000479467787114846, + "loss": 0.3477, + "step": 18694 + }, + { + "epoch": 10.44413407821229, + "grad_norm": 0.5123345851898193, + "learning_rate": 0.00047943977591036413, + "loss": 0.3586, + "step": 18695 + }, + { + "epoch": 10.444692737430168, + "grad_norm": 0.6031753420829773, + "learning_rate": 0.0004794117647058824, + "loss": 0.5442, + "step": 18696 + }, + { + "epoch": 10.445251396648045, + "grad_norm": 0.49947431683540344, + "learning_rate": 0.00047938375350140054, + "loss": 0.3883, + "step": 18697 + }, + { + "epoch": 10.445810055865921, + "grad_norm": 0.5277776122093201, + "learning_rate": 0.00047935574229691875, + "loss": 0.4561, + "step": 18698 + }, + { + "epoch": 10.446368715083798, + "grad_norm": 0.680150032043457, + "learning_rate": 0.000479327731092437, + "loss": 0.5531, + "step": 18699 + }, + { + "epoch": 10.446927374301676, + "grad_norm": 0.8029719591140747, + "learning_rate": 0.00047929971988795516, + "loss": 0.3867, + "step": 18700 + }, + { + "epoch": 10.447486033519553, + "grad_norm": 1.9289336204528809, + "learning_rate": 0.0004792717086834734, + "loss": 0.4727, + "step": 18701 + }, + { + "epoch": 10.44804469273743, + "grad_norm": 0.9800059199333191, + "learning_rate": 0.00047924369747899163, + "loss": 0.4341, + "step": 18702 + }, + { + "epoch": 10.448603351955308, + "grad_norm": 0.39864057302474976, + "learning_rate": 0.0004792156862745098, + "loss": 0.3679, + "step": 18703 + }, + { + "epoch": 10.449162011173184, + "grad_norm": 0.4866752326488495, + "learning_rate": 0.00047918767507002804, + "loss": 0.4556, + "step": 18704 + }, + { + "epoch": 10.449720670391061, + "grad_norm": 0.8988238573074341, + "learning_rate": 0.0004791596638655462, + "loss": 0.4758, + "step": 18705 + }, + { + "epoch": 10.45027932960894, + "grad_norm": 0.39659643173217773, + "learning_rate": 0.00047913165266106445, + "loss": 0.3788, + "step": 18706 + }, + { + "epoch": 10.450837988826816, + "grad_norm": 0.4448933005332947, + "learning_rate": 0.00047910364145658266, + "loss": 0.434, + "step": 18707 + }, + { + "epoch": 10.451396648044692, + "grad_norm": 0.45187652111053467, + "learning_rate": 0.0004790756302521008, + "loss": 0.3899, + "step": 18708 + }, + { + "epoch": 10.451955307262569, + "grad_norm": 0.406544953584671, + "learning_rate": 0.00047904761904761907, + "loss": 0.4306, + "step": 18709 + }, + { + "epoch": 10.452513966480447, + "grad_norm": 0.4826112687587738, + "learning_rate": 0.0004790196078431373, + "loss": 0.5228, + "step": 18710 + }, + { + "epoch": 10.453072625698324, + "grad_norm": 1.4266471862792969, + "learning_rate": 0.0004789915966386555, + "loss": 0.4651, + "step": 18711 + }, + { + "epoch": 10.4536312849162, + "grad_norm": 0.47993677854537964, + "learning_rate": 0.0004789635854341737, + "loss": 0.3941, + "step": 18712 + }, + { + "epoch": 10.454189944134079, + "grad_norm": 0.4121052026748657, + "learning_rate": 0.00047893557422969184, + "loss": 0.4249, + "step": 18713 + }, + { + "epoch": 10.454748603351955, + "grad_norm": 0.5191360116004944, + "learning_rate": 0.0004789075630252101, + "loss": 0.4523, + "step": 18714 + }, + { + "epoch": 10.455307262569832, + "grad_norm": 2.145153522491455, + "learning_rate": 0.0004788795518207283, + "loss": 0.3553, + "step": 18715 + }, + { + "epoch": 10.45586592178771, + "grad_norm": 0.5439107418060303, + "learning_rate": 0.0004788515406162465, + "loss": 0.4296, + "step": 18716 + }, + { + "epoch": 10.456424581005587, + "grad_norm": 0.5652459263801575, + "learning_rate": 0.0004788235294117647, + "loss": 0.4462, + "step": 18717 + }, + { + "epoch": 10.456983240223463, + "grad_norm": 0.9325499534606934, + "learning_rate": 0.0004787955182072829, + "loss": 0.4919, + "step": 18718 + }, + { + "epoch": 10.45754189944134, + "grad_norm": 0.6296287178993225, + "learning_rate": 0.00047876750700280113, + "loss": 0.5705, + "step": 18719 + }, + { + "epoch": 10.458100558659218, + "grad_norm": 0.44084757566452026, + "learning_rate": 0.00047873949579831934, + "loss": 0.4614, + "step": 18720 + }, + { + "epoch": 10.458659217877095, + "grad_norm": 0.40304630994796753, + "learning_rate": 0.00047871148459383754, + "loss": 0.3985, + "step": 18721 + }, + { + "epoch": 10.459217877094972, + "grad_norm": 0.9078214168548584, + "learning_rate": 0.00047868347338935575, + "loss": 0.4369, + "step": 18722 + }, + { + "epoch": 10.45977653631285, + "grad_norm": 0.7024386525154114, + "learning_rate": 0.00047865546218487395, + "loss": 0.3923, + "step": 18723 + }, + { + "epoch": 10.460335195530726, + "grad_norm": 0.49723172187805176, + "learning_rate": 0.00047862745098039216, + "loss": 0.5408, + "step": 18724 + }, + { + "epoch": 10.460893854748603, + "grad_norm": 0.626876711845398, + "learning_rate": 0.00047859943977591037, + "loss": 0.6996, + "step": 18725 + }, + { + "epoch": 10.461452513966481, + "grad_norm": 0.4237803518772125, + "learning_rate": 0.0004785714285714286, + "loss": 0.3362, + "step": 18726 + }, + { + "epoch": 10.462011173184358, + "grad_norm": 0.4565712809562683, + "learning_rate": 0.0004785434173669468, + "loss": 0.3611, + "step": 18727 + }, + { + "epoch": 10.462569832402234, + "grad_norm": 2.418124198913574, + "learning_rate": 0.000478515406162465, + "loss": 0.3753, + "step": 18728 + }, + { + "epoch": 10.463128491620111, + "grad_norm": 0.3883202373981476, + "learning_rate": 0.0004784873949579832, + "loss": 0.3862, + "step": 18729 + }, + { + "epoch": 10.46368715083799, + "grad_norm": 0.3922578692436218, + "learning_rate": 0.0004784593837535014, + "loss": 0.3851, + "step": 18730 + }, + { + "epoch": 10.464245810055866, + "grad_norm": 0.4642290771007538, + "learning_rate": 0.00047843137254901966, + "loss": 0.3958, + "step": 18731 + }, + { + "epoch": 10.464804469273743, + "grad_norm": 0.47723084688186646, + "learning_rate": 0.0004784033613445378, + "loss": 0.4896, + "step": 18732 + }, + { + "epoch": 10.46536312849162, + "grad_norm": 0.41552916169166565, + "learning_rate": 0.000478375350140056, + "loss": 0.3977, + "step": 18733 + }, + { + "epoch": 10.465921787709497, + "grad_norm": 0.572848916053772, + "learning_rate": 0.0004783473389355743, + "loss": 0.3607, + "step": 18734 + }, + { + "epoch": 10.466480446927374, + "grad_norm": 0.4564259648323059, + "learning_rate": 0.0004783193277310924, + "loss": 0.4351, + "step": 18735 + }, + { + "epoch": 10.46703910614525, + "grad_norm": 0.9085177779197693, + "learning_rate": 0.0004782913165266107, + "loss": 0.4619, + "step": 18736 + }, + { + "epoch": 10.467597765363129, + "grad_norm": 0.9216148257255554, + "learning_rate": 0.00047826330532212884, + "loss": 0.4802, + "step": 18737 + }, + { + "epoch": 10.468156424581005, + "grad_norm": 0.6707174181938171, + "learning_rate": 0.00047823529411764704, + "loss": 0.3866, + "step": 18738 + }, + { + "epoch": 10.468715083798882, + "grad_norm": 0.699027955532074, + "learning_rate": 0.0004782072829131653, + "loss": 0.4813, + "step": 18739 + }, + { + "epoch": 10.46927374301676, + "grad_norm": 0.6003559231758118, + "learning_rate": 0.00047817927170868346, + "loss": 0.5355, + "step": 18740 + }, + { + "epoch": 10.469832402234637, + "grad_norm": 0.5621857643127441, + "learning_rate": 0.0004781512605042017, + "loss": 0.3269, + "step": 18741 + }, + { + "epoch": 10.470391061452514, + "grad_norm": 0.7335500121116638, + "learning_rate": 0.0004781232492997199, + "loss": 0.406, + "step": 18742 + }, + { + "epoch": 10.470949720670392, + "grad_norm": 2.226038694381714, + "learning_rate": 0.0004780952380952381, + "loss": 0.4694, + "step": 18743 + }, + { + "epoch": 10.471508379888268, + "grad_norm": 1.3616586923599243, + "learning_rate": 0.00047806722689075633, + "loss": 0.42, + "step": 18744 + }, + { + "epoch": 10.472067039106145, + "grad_norm": 0.562004566192627, + "learning_rate": 0.0004780392156862745, + "loss": 0.4225, + "step": 18745 + }, + { + "epoch": 10.472625698324022, + "grad_norm": 0.7751689553260803, + "learning_rate": 0.00047801120448179275, + "loss": 0.4119, + "step": 18746 + }, + { + "epoch": 10.4731843575419, + "grad_norm": 0.6460176110267639, + "learning_rate": 0.00047798319327731095, + "loss": 0.5587, + "step": 18747 + }, + { + "epoch": 10.473743016759776, + "grad_norm": 0.6039220690727234, + "learning_rate": 0.0004779551820728291, + "loss": 0.5091, + "step": 18748 + }, + { + "epoch": 10.474301675977653, + "grad_norm": 0.5342893004417419, + "learning_rate": 0.00047792717086834736, + "loss": 0.3973, + "step": 18749 + }, + { + "epoch": 10.474860335195531, + "grad_norm": 1.0211554765701294, + "learning_rate": 0.00047789915966386557, + "loss": 0.4281, + "step": 18750 + }, + { + "epoch": 10.475418994413408, + "grad_norm": 0.4941284954547882, + "learning_rate": 0.0004778711484593838, + "loss": 0.3737, + "step": 18751 + }, + { + "epoch": 10.475977653631285, + "grad_norm": 0.4917150139808655, + "learning_rate": 0.000477843137254902, + "loss": 0.4507, + "step": 18752 + }, + { + "epoch": 10.476536312849163, + "grad_norm": 0.3774113953113556, + "learning_rate": 0.00047781512605042013, + "loss": 0.3939, + "step": 18753 + }, + { + "epoch": 10.47709497206704, + "grad_norm": 0.6499609351158142, + "learning_rate": 0.0004777871148459384, + "loss": 0.4722, + "step": 18754 + }, + { + "epoch": 10.477653631284916, + "grad_norm": 0.9482568502426147, + "learning_rate": 0.0004777591036414566, + "loss": 0.5944, + "step": 18755 + }, + { + "epoch": 10.478212290502793, + "grad_norm": 0.6117006540298462, + "learning_rate": 0.0004777310924369748, + "loss": 0.4002, + "step": 18756 + }, + { + "epoch": 10.478770949720671, + "grad_norm": 0.4160557687282562, + "learning_rate": 0.000477703081232493, + "loss": 0.4083, + "step": 18757 + }, + { + "epoch": 10.479329608938547, + "grad_norm": 0.5017178654670715, + "learning_rate": 0.0004776750700280112, + "loss": 0.3784, + "step": 18758 + }, + { + "epoch": 10.479888268156424, + "grad_norm": 0.6609714031219482, + "learning_rate": 0.0004776470588235294, + "loss": 0.4072, + "step": 18759 + }, + { + "epoch": 10.480446927374302, + "grad_norm": 0.6508843302726746, + "learning_rate": 0.00047761904761904763, + "loss": 0.3628, + "step": 18760 + }, + { + "epoch": 10.481005586592179, + "grad_norm": 0.9584119915962219, + "learning_rate": 0.00047759103641456584, + "loss": 0.5122, + "step": 18761 + }, + { + "epoch": 10.481564245810056, + "grad_norm": 0.5339064002037048, + "learning_rate": 0.00047756302521008404, + "loss": 0.509, + "step": 18762 + }, + { + "epoch": 10.482122905027932, + "grad_norm": 0.3849220275878906, + "learning_rate": 0.00047753501400560225, + "loss": 0.4129, + "step": 18763 + }, + { + "epoch": 10.48268156424581, + "grad_norm": 0.5345190167427063, + "learning_rate": 0.00047750700280112045, + "loss": 0.3398, + "step": 18764 + }, + { + "epoch": 10.483240223463687, + "grad_norm": 1.053317666053772, + "learning_rate": 0.00047747899159663866, + "loss": 0.3873, + "step": 18765 + }, + { + "epoch": 10.483798882681564, + "grad_norm": 0.554428219795227, + "learning_rate": 0.0004774509803921569, + "loss": 0.3478, + "step": 18766 + }, + { + "epoch": 10.484357541899442, + "grad_norm": 0.624885082244873, + "learning_rate": 0.00047742296918767507, + "loss": 0.5484, + "step": 18767 + }, + { + "epoch": 10.484916201117318, + "grad_norm": 0.4288087785243988, + "learning_rate": 0.0004773949579831933, + "loss": 0.4341, + "step": 18768 + }, + { + "epoch": 10.485474860335195, + "grad_norm": 0.4760594964027405, + "learning_rate": 0.0004773669467787115, + "loss": 0.3866, + "step": 18769 + }, + { + "epoch": 10.486033519553073, + "grad_norm": 0.9241908192634583, + "learning_rate": 0.0004773389355742297, + "loss": 0.5935, + "step": 18770 + }, + { + "epoch": 10.48659217877095, + "grad_norm": 0.5899703502655029, + "learning_rate": 0.00047731092436974795, + "loss": 0.4963, + "step": 18771 + }, + { + "epoch": 10.487150837988827, + "grad_norm": 0.5740683078765869, + "learning_rate": 0.0004772829131652661, + "loss": 0.4553, + "step": 18772 + }, + { + "epoch": 10.487709497206703, + "grad_norm": 0.6702711582183838, + "learning_rate": 0.0004772549019607843, + "loss": 0.5234, + "step": 18773 + }, + { + "epoch": 10.488268156424581, + "grad_norm": 1.3503100872039795, + "learning_rate": 0.00047722689075630257, + "loss": 0.5251, + "step": 18774 + }, + { + "epoch": 10.488826815642458, + "grad_norm": 0.42755404114723206, + "learning_rate": 0.0004771988795518207, + "loss": 0.4722, + "step": 18775 + }, + { + "epoch": 10.489385474860335, + "grad_norm": 1.348813772201538, + "learning_rate": 0.000477170868347339, + "loss": 0.4844, + "step": 18776 + }, + { + "epoch": 10.489944134078213, + "grad_norm": 0.3657236397266388, + "learning_rate": 0.00047714285714285713, + "loss": 0.3683, + "step": 18777 + }, + { + "epoch": 10.49050279329609, + "grad_norm": 0.42276760935783386, + "learning_rate": 0.00047711484593837534, + "loss": 0.438, + "step": 18778 + }, + { + "epoch": 10.491061452513966, + "grad_norm": 0.5383284687995911, + "learning_rate": 0.0004770868347338936, + "loss": 0.6893, + "step": 18779 + }, + { + "epoch": 10.491620111731844, + "grad_norm": 0.3489207625389099, + "learning_rate": 0.00047705882352941175, + "loss": 0.3293, + "step": 18780 + }, + { + "epoch": 10.492178770949721, + "grad_norm": 0.3855345547199249, + "learning_rate": 0.00047703081232493, + "loss": 0.3083, + "step": 18781 + }, + { + "epoch": 10.492737430167598, + "grad_norm": 0.6857742667198181, + "learning_rate": 0.0004770028011204482, + "loss": 0.4679, + "step": 18782 + }, + { + "epoch": 10.493296089385474, + "grad_norm": 0.6259848475456238, + "learning_rate": 0.00047697478991596637, + "loss": 0.4955, + "step": 18783 + }, + { + "epoch": 10.493854748603352, + "grad_norm": 0.3654603064060211, + "learning_rate": 0.00047694677871148463, + "loss": 0.4107, + "step": 18784 + }, + { + "epoch": 10.494413407821229, + "grad_norm": 0.9129793643951416, + "learning_rate": 0.0004769187675070028, + "loss": 0.3774, + "step": 18785 + }, + { + "epoch": 10.494972067039106, + "grad_norm": 0.4993589520454407, + "learning_rate": 0.00047689075630252104, + "loss": 0.3951, + "step": 18786 + }, + { + "epoch": 10.495530726256984, + "grad_norm": 0.4195460081100464, + "learning_rate": 0.00047686274509803925, + "loss": 0.5221, + "step": 18787 + }, + { + "epoch": 10.49608938547486, + "grad_norm": 0.5069847106933594, + "learning_rate": 0.0004768347338935574, + "loss": 0.3223, + "step": 18788 + }, + { + "epoch": 10.496648044692737, + "grad_norm": 0.47360026836395264, + "learning_rate": 0.00047680672268907566, + "loss": 0.3753, + "step": 18789 + }, + { + "epoch": 10.497206703910614, + "grad_norm": 0.45140454173088074, + "learning_rate": 0.00047677871148459386, + "loss": 0.5297, + "step": 18790 + }, + { + "epoch": 10.497765363128492, + "grad_norm": 0.47519221901893616, + "learning_rate": 0.00047675070028011207, + "loss": 0.516, + "step": 18791 + }, + { + "epoch": 10.498324022346369, + "grad_norm": 0.4776628017425537, + "learning_rate": 0.0004767226890756303, + "loss": 0.5112, + "step": 18792 + }, + { + "epoch": 10.498882681564245, + "grad_norm": 0.4391259253025055, + "learning_rate": 0.00047669467787114843, + "loss": 0.4241, + "step": 18793 + }, + { + "epoch": 10.499441340782123, + "grad_norm": Infinity, + "learning_rate": 0.00047669467787114843, + "loss": 0.44, + "step": 18794 + }, + { + "epoch": 10.5, + "grad_norm": 1.7532165050506592, + "learning_rate": 0.0004766666666666667, + "loss": 0.5259, + "step": 18795 + }, + { + "epoch": 10.500558659217877, + "grad_norm": 0.8160237669944763, + "learning_rate": 0.0004766386554621849, + "loss": 0.4555, + "step": 18796 + }, + { + "epoch": 10.501117318435755, + "grad_norm": 0.5965665578842163, + "learning_rate": 0.0004766106442577031, + "loss": 0.4871, + "step": 18797 + }, + { + "epoch": 10.501675977653631, + "grad_norm": 0.39612504839897156, + "learning_rate": 0.0004765826330532213, + "loss": 0.3631, + "step": 18798 + }, + { + "epoch": 10.502234636871508, + "grad_norm": 0.5372137427330017, + "learning_rate": 0.0004765546218487395, + "loss": 0.4, + "step": 18799 + }, + { + "epoch": 10.502793296089386, + "grad_norm": 0.531446099281311, + "learning_rate": 0.0004765266106442577, + "loss": 0.3325, + "step": 18800 + }, + { + "epoch": 10.503351955307263, + "grad_norm": 0.6240618228912354, + "learning_rate": 0.0004764985994397759, + "loss": 0.3937, + "step": 18801 + }, + { + "epoch": 10.50391061452514, + "grad_norm": 0.5450451970100403, + "learning_rate": 0.0004764705882352941, + "loss": 0.4917, + "step": 18802 + }, + { + "epoch": 10.504469273743016, + "grad_norm": 14.657256126403809, + "learning_rate": 0.00047644257703081234, + "loss": 0.4132, + "step": 18803 + }, + { + "epoch": 10.505027932960894, + "grad_norm": 0.6727287769317627, + "learning_rate": 0.00047641456582633054, + "loss": 0.4998, + "step": 18804 + }, + { + "epoch": 10.505586592178771, + "grad_norm": 0.4053119719028473, + "learning_rate": 0.00047638655462184875, + "loss": 0.4285, + "step": 18805 + }, + { + "epoch": 10.506145251396648, + "grad_norm": 1.2318850755691528, + "learning_rate": 0.00047635854341736695, + "loss": 0.3816, + "step": 18806 + }, + { + "epoch": 10.506703910614526, + "grad_norm": 0.5370670557022095, + "learning_rate": 0.00047633053221288516, + "loss": 0.443, + "step": 18807 + }, + { + "epoch": 10.507262569832402, + "grad_norm": 0.5725485682487488, + "learning_rate": 0.00047630252100840337, + "loss": 0.3975, + "step": 18808 + }, + { + "epoch": 10.507821229050279, + "grad_norm": 1.87415611743927, + "learning_rate": 0.00047627450980392157, + "loss": 0.4543, + "step": 18809 + }, + { + "epoch": 10.508379888268156, + "grad_norm": 0.5333583354949951, + "learning_rate": 0.0004762464985994398, + "loss": 0.4313, + "step": 18810 + }, + { + "epoch": 10.508938547486034, + "grad_norm": 0.4548228085041046, + "learning_rate": 0.000476218487394958, + "loss": 0.4386, + "step": 18811 + }, + { + "epoch": 10.50949720670391, + "grad_norm": 0.6111850142478943, + "learning_rate": 0.0004761904761904762, + "loss": 0.4234, + "step": 18812 + }, + { + "epoch": 10.510055865921787, + "grad_norm": 0.4857129156589508, + "learning_rate": 0.0004761624649859944, + "loss": 0.4298, + "step": 18813 + }, + { + "epoch": 10.510614525139665, + "grad_norm": 1.2505449056625366, + "learning_rate": 0.0004761344537815126, + "loss": 0.3958, + "step": 18814 + }, + { + "epoch": 10.511173184357542, + "grad_norm": 0.5724055171012878, + "learning_rate": 0.00047610644257703086, + "loss": 0.4273, + "step": 18815 + }, + { + "epoch": 10.511731843575419, + "grad_norm": 0.46530163288116455, + "learning_rate": 0.000476078431372549, + "loss": 0.3641, + "step": 18816 + }, + { + "epoch": 10.512290502793297, + "grad_norm": 5.036036491394043, + "learning_rate": 0.0004760504201680672, + "loss": 0.4276, + "step": 18817 + }, + { + "epoch": 10.512849162011173, + "grad_norm": 0.4140670895576477, + "learning_rate": 0.0004760224089635854, + "loss": 0.4816, + "step": 18818 + }, + { + "epoch": 10.51340782122905, + "grad_norm": 0.5452653169631958, + "learning_rate": 0.00047599439775910363, + "loss": 0.4379, + "step": 18819 + }, + { + "epoch": 10.513966480446927, + "grad_norm": 0.5115152597427368, + "learning_rate": 0.0004759663865546219, + "loss": 0.476, + "step": 18820 + }, + { + "epoch": 10.514525139664805, + "grad_norm": 0.4841180443763733, + "learning_rate": 0.00047593837535014004, + "loss": 0.4552, + "step": 18821 + }, + { + "epoch": 10.515083798882682, + "grad_norm": 0.4810382127761841, + "learning_rate": 0.00047591036414565825, + "loss": 0.3098, + "step": 18822 + }, + { + "epoch": 10.515642458100558, + "grad_norm": 0.7918359637260437, + "learning_rate": 0.0004758823529411765, + "loss": 0.4614, + "step": 18823 + }, + { + "epoch": 10.516201117318436, + "grad_norm": 0.5926598310470581, + "learning_rate": 0.00047585434173669466, + "loss": 0.3873, + "step": 18824 + }, + { + "epoch": 10.516759776536313, + "grad_norm": 3.9362900257110596, + "learning_rate": 0.0004758263305322129, + "loss": 0.4953, + "step": 18825 + }, + { + "epoch": 10.51731843575419, + "grad_norm": 0.5332849621772766, + "learning_rate": 0.0004757983193277311, + "loss": 0.4205, + "step": 18826 + }, + { + "epoch": 10.517877094972068, + "grad_norm": 2.4900104999542236, + "learning_rate": 0.0004757703081232493, + "loss": 0.4053, + "step": 18827 + }, + { + "epoch": 10.518435754189944, + "grad_norm": 0.5412886738777161, + "learning_rate": 0.00047574229691876754, + "loss": 0.3219, + "step": 18828 + }, + { + "epoch": 10.518994413407821, + "grad_norm": 0.33584755659103394, + "learning_rate": 0.0004757142857142857, + "loss": 0.3529, + "step": 18829 + }, + { + "epoch": 10.519553072625698, + "grad_norm": 0.39209020137786865, + "learning_rate": 0.00047568627450980395, + "loss": 0.3778, + "step": 18830 + }, + { + "epoch": 10.520111731843576, + "grad_norm": 0.4009786546230316, + "learning_rate": 0.00047565826330532216, + "loss": 0.3357, + "step": 18831 + }, + { + "epoch": 10.520670391061453, + "grad_norm": 0.4256860911846161, + "learning_rate": 0.0004756302521008403, + "loss": 0.437, + "step": 18832 + }, + { + "epoch": 10.521229050279329, + "grad_norm": 1.2227140665054321, + "learning_rate": 0.00047560224089635857, + "loss": 0.4364, + "step": 18833 + }, + { + "epoch": 10.521787709497207, + "grad_norm": 0.4340318441390991, + "learning_rate": 0.0004755742296918767, + "loss": 0.3346, + "step": 18834 + }, + { + "epoch": 10.522346368715084, + "grad_norm": 1.3686751127243042, + "learning_rate": 0.000475546218487395, + "loss": 0.4971, + "step": 18835 + }, + { + "epoch": 10.52290502793296, + "grad_norm": 4.517106056213379, + "learning_rate": 0.0004755182072829132, + "loss": 0.5161, + "step": 18836 + }, + { + "epoch": 10.523463687150837, + "grad_norm": 0.4715244472026825, + "learning_rate": 0.00047549019607843134, + "loss": 0.3535, + "step": 18837 + }, + { + "epoch": 10.524022346368715, + "grad_norm": 0.5791744589805603, + "learning_rate": 0.0004754621848739496, + "loss": 0.3698, + "step": 18838 + }, + { + "epoch": 10.524581005586592, + "grad_norm": 1.0592786073684692, + "learning_rate": 0.0004754341736694678, + "loss": 0.3173, + "step": 18839 + }, + { + "epoch": 10.525139664804469, + "grad_norm": 0.5131112933158875, + "learning_rate": 0.000475406162464986, + "loss": 0.4733, + "step": 18840 + }, + { + "epoch": 10.525698324022347, + "grad_norm": 0.48055174946784973, + "learning_rate": 0.0004753781512605042, + "loss": 0.4621, + "step": 18841 + }, + { + "epoch": 10.526256983240224, + "grad_norm": 0.5353572964668274, + "learning_rate": 0.00047535014005602237, + "loss": 0.5686, + "step": 18842 + }, + { + "epoch": 10.5268156424581, + "grad_norm": 0.5716177225112915, + "learning_rate": 0.00047532212885154063, + "loss": 0.5158, + "step": 18843 + }, + { + "epoch": 10.527374301675978, + "grad_norm": 1.1526488065719604, + "learning_rate": 0.00047529411764705884, + "loss": 0.3879, + "step": 18844 + }, + { + "epoch": 10.527932960893855, + "grad_norm": 0.5159631967544556, + "learning_rate": 0.00047526610644257704, + "loss": 0.428, + "step": 18845 + }, + { + "epoch": 10.528491620111732, + "grad_norm": 0.3464567959308624, + "learning_rate": 0.00047523809523809525, + "loss": 0.4076, + "step": 18846 + }, + { + "epoch": 10.529050279329608, + "grad_norm": 0.5847417712211609, + "learning_rate": 0.00047521008403361345, + "loss": 0.4315, + "step": 18847 + }, + { + "epoch": 10.529608938547486, + "grad_norm": 0.43284904956817627, + "learning_rate": 0.00047518207282913166, + "loss": 0.3904, + "step": 18848 + }, + { + "epoch": 10.530167597765363, + "grad_norm": 0.5853347778320312, + "learning_rate": 0.00047515406162464987, + "loss": 0.5065, + "step": 18849 + }, + { + "epoch": 10.53072625698324, + "grad_norm": 8.45334243774414, + "learning_rate": 0.00047512605042016807, + "loss": 0.4646, + "step": 18850 + }, + { + "epoch": 10.531284916201118, + "grad_norm": 0.42541107535362244, + "learning_rate": 0.0004750980392156863, + "loss": 0.4477, + "step": 18851 + }, + { + "epoch": 10.531843575418995, + "grad_norm": 0.36790648102760315, + "learning_rate": 0.0004750700280112045, + "loss": 0.4281, + "step": 18852 + }, + { + "epoch": 10.532402234636871, + "grad_norm": 0.46153897047042847, + "learning_rate": 0.0004750420168067227, + "loss": 0.417, + "step": 18853 + }, + { + "epoch": 10.53296089385475, + "grad_norm": 0.6567018628120422, + "learning_rate": 0.0004750140056022409, + "loss": 0.5118, + "step": 18854 + }, + { + "epoch": 10.533519553072626, + "grad_norm": 0.5465989112854004, + "learning_rate": 0.00047498599439775916, + "loss": 0.3924, + "step": 18855 + }, + { + "epoch": 10.534078212290503, + "grad_norm": 0.8438539505004883, + "learning_rate": 0.0004749579831932773, + "loss": 0.5306, + "step": 18856 + }, + { + "epoch": 10.53463687150838, + "grad_norm": 0.49790751934051514, + "learning_rate": 0.0004749299719887955, + "loss": 0.482, + "step": 18857 + }, + { + "epoch": 10.535195530726257, + "grad_norm": 0.4106896221637726, + "learning_rate": 0.0004749019607843137, + "loss": 0.4669, + "step": 18858 + }, + { + "epoch": 10.535754189944134, + "grad_norm": 0.42277494072914124, + "learning_rate": 0.0004748739495798319, + "loss": 0.4363, + "step": 18859 + }, + { + "epoch": 10.53631284916201, + "grad_norm": 0.712372362613678, + "learning_rate": 0.0004748459383753502, + "loss": 0.3562, + "step": 18860 + }, + { + "epoch": 10.536871508379889, + "grad_norm": 0.6095969676971436, + "learning_rate": 0.00047481792717086834, + "loss": 0.4157, + "step": 18861 + }, + { + "epoch": 10.537430167597766, + "grad_norm": 0.4902176558971405, + "learning_rate": 0.00047478991596638654, + "loss": 0.4956, + "step": 18862 + }, + { + "epoch": 10.537988826815642, + "grad_norm": 0.5600163340568542, + "learning_rate": 0.0004747619047619048, + "loss": 0.4604, + "step": 18863 + }, + { + "epoch": 10.538547486033519, + "grad_norm": 0.53652024269104, + "learning_rate": 0.00047473389355742296, + "loss": 0.3888, + "step": 18864 + }, + { + "epoch": 10.539106145251397, + "grad_norm": 0.950908899307251, + "learning_rate": 0.0004747058823529412, + "loss": 0.6144, + "step": 18865 + }, + { + "epoch": 10.539664804469274, + "grad_norm": 1.8051820993423462, + "learning_rate": 0.00047467787114845937, + "loss": 0.3821, + "step": 18866 + }, + { + "epoch": 10.54022346368715, + "grad_norm": 0.35427772998809814, + "learning_rate": 0.0004746498599439776, + "loss": 0.3084, + "step": 18867 + }, + { + "epoch": 10.540782122905028, + "grad_norm": 0.42422544956207275, + "learning_rate": 0.00047462184873949583, + "loss": 0.5019, + "step": 18868 + }, + { + "epoch": 10.541340782122905, + "grad_norm": 0.6202530860900879, + "learning_rate": 0.000474593837535014, + "loss": 0.583, + "step": 18869 + }, + { + "epoch": 10.541899441340782, + "grad_norm": 0.4086197316646576, + "learning_rate": 0.00047456582633053225, + "loss": 0.441, + "step": 18870 + }, + { + "epoch": 10.54245810055866, + "grad_norm": 0.4702913165092468, + "learning_rate": 0.00047453781512605045, + "loss": 0.3333, + "step": 18871 + }, + { + "epoch": 10.543016759776537, + "grad_norm": 0.420767217874527, + "learning_rate": 0.0004745098039215686, + "loss": 0.4638, + "step": 18872 + }, + { + "epoch": 10.543575418994413, + "grad_norm": 0.43494337797164917, + "learning_rate": 0.00047448179271708686, + "loss": 0.4266, + "step": 18873 + }, + { + "epoch": 10.544134078212291, + "grad_norm": 0.4264630973339081, + "learning_rate": 0.000474453781512605, + "loss": 0.4766, + "step": 18874 + }, + { + "epoch": 10.544692737430168, + "grad_norm": 0.693032443523407, + "learning_rate": 0.0004744257703081233, + "loss": 0.4934, + "step": 18875 + }, + { + "epoch": 10.545251396648045, + "grad_norm": 0.4030987024307251, + "learning_rate": 0.0004743977591036415, + "loss": 0.4482, + "step": 18876 + }, + { + "epoch": 10.545810055865921, + "grad_norm": 0.4038350284099579, + "learning_rate": 0.00047436974789915963, + "loss": 0.4302, + "step": 18877 + }, + { + "epoch": 10.5463687150838, + "grad_norm": 3.9995579719543457, + "learning_rate": 0.0004743417366946779, + "loss": 0.3512, + "step": 18878 + }, + { + "epoch": 10.546927374301676, + "grad_norm": 0.6617869138717651, + "learning_rate": 0.0004743137254901961, + "loss": 0.5645, + "step": 18879 + }, + { + "epoch": 10.547486033519553, + "grad_norm": 0.6665805578231812, + "learning_rate": 0.0004742857142857143, + "loss": 0.4271, + "step": 18880 + }, + { + "epoch": 10.548044692737431, + "grad_norm": 0.43734288215637207, + "learning_rate": 0.0004742577030812325, + "loss": 0.396, + "step": 18881 + }, + { + "epoch": 10.548603351955308, + "grad_norm": 0.5726372003555298, + "learning_rate": 0.00047422969187675066, + "loss": 0.4307, + "step": 18882 + }, + { + "epoch": 10.549162011173184, + "grad_norm": 0.8013854026794434, + "learning_rate": 0.0004742016806722689, + "loss": 0.4529, + "step": 18883 + }, + { + "epoch": 10.54972067039106, + "grad_norm": 0.3717445433139801, + "learning_rate": 0.00047417366946778713, + "loss": 0.3774, + "step": 18884 + }, + { + "epoch": 10.550279329608939, + "grad_norm": 0.5159956812858582, + "learning_rate": 0.00047414565826330534, + "loss": 0.3561, + "step": 18885 + }, + { + "epoch": 10.550837988826816, + "grad_norm": 0.44090867042541504, + "learning_rate": 0.00047411764705882354, + "loss": 0.4146, + "step": 18886 + }, + { + "epoch": 10.551396648044692, + "grad_norm": 0.4632580578327179, + "learning_rate": 0.00047408963585434175, + "loss": 0.428, + "step": 18887 + }, + { + "epoch": 10.55195530726257, + "grad_norm": 0.5727682113647461, + "learning_rate": 0.00047406162464985995, + "loss": 0.5145, + "step": 18888 + }, + { + "epoch": 10.552513966480447, + "grad_norm": 0.5884736180305481, + "learning_rate": 0.00047403361344537816, + "loss": 0.4905, + "step": 18889 + }, + { + "epoch": 10.553072625698324, + "grad_norm": 0.8395491242408752, + "learning_rate": 0.0004740056022408964, + "loss": 0.455, + "step": 18890 + }, + { + "epoch": 10.553631284916202, + "grad_norm": 0.5693738460540771, + "learning_rate": 0.00047397759103641457, + "loss": 0.5395, + "step": 18891 + }, + { + "epoch": 10.554189944134079, + "grad_norm": 0.6569846272468567, + "learning_rate": 0.0004739495798319328, + "loss": 0.5621, + "step": 18892 + }, + { + "epoch": 10.554748603351955, + "grad_norm": 0.7034730315208435, + "learning_rate": 0.000473921568627451, + "loss": 0.5091, + "step": 18893 + }, + { + "epoch": 10.555307262569832, + "grad_norm": 0.8309515714645386, + "learning_rate": 0.0004738935574229692, + "loss": 0.4025, + "step": 18894 + }, + { + "epoch": 10.55586592178771, + "grad_norm": 1.3515961170196533, + "learning_rate": 0.00047386554621848745, + "loss": 0.4052, + "step": 18895 + }, + { + "epoch": 10.556424581005587, + "grad_norm": 0.4653107821941376, + "learning_rate": 0.0004738375350140056, + "loss": 0.3627, + "step": 18896 + }, + { + "epoch": 10.556983240223463, + "grad_norm": 0.6703113317489624, + "learning_rate": 0.0004738095238095238, + "loss": 0.4167, + "step": 18897 + }, + { + "epoch": 10.557541899441341, + "grad_norm": 0.6055869460105896, + "learning_rate": 0.00047378151260504207, + "loss": 0.5314, + "step": 18898 + }, + { + "epoch": 10.558100558659218, + "grad_norm": 0.47751203179359436, + "learning_rate": 0.0004737535014005602, + "loss": 0.4254, + "step": 18899 + }, + { + "epoch": 10.558659217877095, + "grad_norm": 0.5802996754646301, + "learning_rate": 0.0004737254901960785, + "loss": 0.3882, + "step": 18900 + }, + { + "epoch": 10.559217877094973, + "grad_norm": 0.5866131782531738, + "learning_rate": 0.00047369747899159663, + "loss": 0.4328, + "step": 18901 + }, + { + "epoch": 10.55977653631285, + "grad_norm": 0.6149768233299255, + "learning_rate": 0.00047366946778711484, + "loss": 0.4349, + "step": 18902 + }, + { + "epoch": 10.560335195530726, + "grad_norm": 0.4731472432613373, + "learning_rate": 0.0004736414565826331, + "loss": 0.5065, + "step": 18903 + }, + { + "epoch": 10.560893854748603, + "grad_norm": 1.0369417667388916, + "learning_rate": 0.00047361344537815125, + "loss": 0.5574, + "step": 18904 + }, + { + "epoch": 10.561452513966481, + "grad_norm": 0.5057957768440247, + "learning_rate": 0.0004735854341736695, + "loss": 0.497, + "step": 18905 + }, + { + "epoch": 10.562011173184358, + "grad_norm": 0.7955731153488159, + "learning_rate": 0.0004735574229691877, + "loss": 0.5806, + "step": 18906 + }, + { + "epoch": 10.562569832402234, + "grad_norm": 1.3785350322723389, + "learning_rate": 0.00047352941176470587, + "loss": 0.6099, + "step": 18907 + }, + { + "epoch": 10.563128491620112, + "grad_norm": 1.5240799188613892, + "learning_rate": 0.00047350140056022413, + "loss": 0.493, + "step": 18908 + }, + { + "epoch": 10.563687150837989, + "grad_norm": 0.989829421043396, + "learning_rate": 0.0004734733893557423, + "loss": 0.5169, + "step": 18909 + }, + { + "epoch": 10.564245810055866, + "grad_norm": 0.41497549414634705, + "learning_rate": 0.00047344537815126054, + "loss": 0.4873, + "step": 18910 + }, + { + "epoch": 10.564804469273742, + "grad_norm": 0.40413084626197815, + "learning_rate": 0.00047341736694677875, + "loss": 0.3753, + "step": 18911 + }, + { + "epoch": 10.56536312849162, + "grad_norm": 0.36074674129486084, + "learning_rate": 0.0004733893557422969, + "loss": 0.3289, + "step": 18912 + }, + { + "epoch": 10.565921787709497, + "grad_norm": 0.8511361479759216, + "learning_rate": 0.00047336134453781516, + "loss": 0.4386, + "step": 18913 + }, + { + "epoch": 10.566480446927374, + "grad_norm": 0.5242519378662109, + "learning_rate": 0.00047333333333333336, + "loss": 0.4989, + "step": 18914 + }, + { + "epoch": 10.567039106145252, + "grad_norm": 0.5416566729545593, + "learning_rate": 0.0004733053221288515, + "loss": 0.4007, + "step": 18915 + }, + { + "epoch": 10.567597765363129, + "grad_norm": 0.49826931953430176, + "learning_rate": 0.0004732773109243698, + "loss": 0.4561, + "step": 18916 + }, + { + "epoch": 10.568156424581005, + "grad_norm": 0.6546459197998047, + "learning_rate": 0.00047324929971988793, + "loss": 0.3232, + "step": 18917 + }, + { + "epoch": 10.568715083798883, + "grad_norm": 0.6619869470596313, + "learning_rate": 0.0004732212885154062, + "loss": 0.3627, + "step": 18918 + }, + { + "epoch": 10.56927374301676, + "grad_norm": 0.720614492893219, + "learning_rate": 0.0004731932773109244, + "loss": 0.4592, + "step": 18919 + }, + { + "epoch": 10.569832402234637, + "grad_norm": 0.6503860950469971, + "learning_rate": 0.00047316526610644255, + "loss": 0.3995, + "step": 18920 + }, + { + "epoch": 10.570391061452513, + "grad_norm": 0.45763805508613586, + "learning_rate": 0.0004731372549019608, + "loss": 0.3341, + "step": 18921 + }, + { + "epoch": 10.570949720670392, + "grad_norm": 0.5539149045944214, + "learning_rate": 0.000473109243697479, + "loss": 0.4985, + "step": 18922 + }, + { + "epoch": 10.571508379888268, + "grad_norm": 0.3869083523750305, + "learning_rate": 0.0004730812324929972, + "loss": 0.3519, + "step": 18923 + }, + { + "epoch": 10.572067039106145, + "grad_norm": 0.45173728466033936, + "learning_rate": 0.0004730532212885154, + "loss": 0.5502, + "step": 18924 + }, + { + "epoch": 10.572625698324023, + "grad_norm": 0.47461721301078796, + "learning_rate": 0.0004730252100840336, + "loss": 0.3923, + "step": 18925 + }, + { + "epoch": 10.5731843575419, + "grad_norm": 0.7339091897010803, + "learning_rate": 0.00047299719887955184, + "loss": 0.4437, + "step": 18926 + }, + { + "epoch": 10.573743016759776, + "grad_norm": 0.4360916316509247, + "learning_rate": 0.00047296918767507004, + "loss": 0.354, + "step": 18927 + }, + { + "epoch": 10.574301675977654, + "grad_norm": 0.882824718952179, + "learning_rate": 0.00047294117647058825, + "loss": 0.417, + "step": 18928 + }, + { + "epoch": 10.574860335195531, + "grad_norm": 0.5574464797973633, + "learning_rate": 0.00047291316526610645, + "loss": 0.5044, + "step": 18929 + }, + { + "epoch": 10.575418994413408, + "grad_norm": 0.8435484170913696, + "learning_rate": 0.00047288515406162466, + "loss": 0.4042, + "step": 18930 + }, + { + "epoch": 10.575977653631284, + "grad_norm": 0.6394529938697815, + "learning_rate": 0.00047285714285714287, + "loss": 0.3711, + "step": 18931 + }, + { + "epoch": 10.576536312849163, + "grad_norm": 0.9052165746688843, + "learning_rate": 0.00047282913165266107, + "loss": 0.4115, + "step": 18932 + }, + { + "epoch": 10.577094972067039, + "grad_norm": 0.5005930066108704, + "learning_rate": 0.0004728011204481793, + "loss": 0.4264, + "step": 18933 + }, + { + "epoch": 10.577653631284916, + "grad_norm": 0.8237519264221191, + "learning_rate": 0.0004727731092436975, + "loss": 0.4763, + "step": 18934 + }, + { + "epoch": 10.578212290502794, + "grad_norm": 1.2540061473846436, + "learning_rate": 0.0004727450980392157, + "loss": 0.6188, + "step": 18935 + }, + { + "epoch": 10.57877094972067, + "grad_norm": 1.9022924900054932, + "learning_rate": 0.0004727170868347339, + "loss": 0.4723, + "step": 18936 + }, + { + "epoch": 10.579329608938547, + "grad_norm": 0.49300217628479004, + "learning_rate": 0.0004726890756302521, + "loss": 0.3986, + "step": 18937 + }, + { + "epoch": 10.579888268156424, + "grad_norm": 5.604837417602539, + "learning_rate": 0.00047266106442577036, + "loss": 0.4071, + "step": 18938 + }, + { + "epoch": 10.580446927374302, + "grad_norm": 0.6023059487342834, + "learning_rate": 0.0004726330532212885, + "loss": 0.466, + "step": 18939 + }, + { + "epoch": 10.581005586592179, + "grad_norm": 0.8436830043792725, + "learning_rate": 0.0004726050420168067, + "loss": 0.3965, + "step": 18940 + }, + { + "epoch": 10.581564245810055, + "grad_norm": 0.4717503488063812, + "learning_rate": 0.0004725770308123249, + "loss": 0.4241, + "step": 18941 + }, + { + "epoch": 10.582122905027934, + "grad_norm": 0.818313717842102, + "learning_rate": 0.00047254901960784313, + "loss": 0.4704, + "step": 18942 + }, + { + "epoch": 10.58268156424581, + "grad_norm": 0.45900532603263855, + "learning_rate": 0.0004725210084033614, + "loss": 0.46, + "step": 18943 + }, + { + "epoch": 10.583240223463687, + "grad_norm": 0.47023844718933105, + "learning_rate": 0.00047249299719887954, + "loss": 0.4157, + "step": 18944 + }, + { + "epoch": 10.583798882681565, + "grad_norm": 0.5904989838600159, + "learning_rate": 0.00047246498599439775, + "loss": 0.4479, + "step": 18945 + }, + { + "epoch": 10.584357541899442, + "grad_norm": 0.5782806873321533, + "learning_rate": 0.000472436974789916, + "loss": 0.4258, + "step": 18946 + }, + { + "epoch": 10.584916201117318, + "grad_norm": 0.9767335653305054, + "learning_rate": 0.00047240896358543416, + "loss": 0.4605, + "step": 18947 + }, + { + "epoch": 10.585474860335196, + "grad_norm": 1.2483271360397339, + "learning_rate": 0.0004723809523809524, + "loss": 0.5678, + "step": 18948 + }, + { + "epoch": 10.586033519553073, + "grad_norm": 0.3802148997783661, + "learning_rate": 0.0004723529411764706, + "loss": 0.4135, + "step": 18949 + }, + { + "epoch": 10.58659217877095, + "grad_norm": 0.6582540273666382, + "learning_rate": 0.0004723249299719888, + "loss": 0.4118, + "step": 18950 + }, + { + "epoch": 10.587150837988826, + "grad_norm": 0.6382595896720886, + "learning_rate": 0.00047229691876750704, + "loss": 0.43, + "step": 18951 + }, + { + "epoch": 10.587709497206705, + "grad_norm": 0.43404585123062134, + "learning_rate": 0.0004722689075630252, + "loss": 0.4691, + "step": 18952 + }, + { + "epoch": 10.588268156424581, + "grad_norm": 1.5711432695388794, + "learning_rate": 0.00047224089635854345, + "loss": 0.4473, + "step": 18953 + }, + { + "epoch": 10.588826815642458, + "grad_norm": 0.5249388217926025, + "learning_rate": 0.00047221288515406166, + "loss": 0.3789, + "step": 18954 + }, + { + "epoch": 10.589385474860336, + "grad_norm": 1.7224370241165161, + "learning_rate": 0.0004721848739495798, + "loss": 0.4746, + "step": 18955 + }, + { + "epoch": 10.589944134078213, + "grad_norm": 0.6495590806007385, + "learning_rate": 0.00047215686274509807, + "loss": 0.4295, + "step": 18956 + }, + { + "epoch": 10.59050279329609, + "grad_norm": 0.6040468215942383, + "learning_rate": 0.0004721288515406162, + "loss": 0.5346, + "step": 18957 + }, + { + "epoch": 10.591061452513966, + "grad_norm": 0.6303696036338806, + "learning_rate": 0.0004721008403361345, + "loss": 0.38, + "step": 18958 + }, + { + "epoch": 10.591620111731844, + "grad_norm": 0.5952167510986328, + "learning_rate": 0.0004720728291316527, + "loss": 0.4043, + "step": 18959 + }, + { + "epoch": 10.59217877094972, + "grad_norm": 0.42092835903167725, + "learning_rate": 0.00047204481792717084, + "loss": 0.4298, + "step": 18960 + }, + { + "epoch": 10.592737430167597, + "grad_norm": 3.261221408843994, + "learning_rate": 0.0004720168067226891, + "loss": 0.4501, + "step": 18961 + }, + { + "epoch": 10.593296089385476, + "grad_norm": 1.8740071058273315, + "learning_rate": 0.0004719887955182073, + "loss": 0.4757, + "step": 18962 + }, + { + "epoch": 10.593854748603352, + "grad_norm": 0.4286591708660126, + "learning_rate": 0.0004719607843137255, + "loss": 0.3307, + "step": 18963 + }, + { + "epoch": 10.594413407821229, + "grad_norm": 1.3903018236160278, + "learning_rate": 0.0004719327731092437, + "loss": 0.4135, + "step": 18964 + }, + { + "epoch": 10.594972067039105, + "grad_norm": 0.5239282846450806, + "learning_rate": 0.00047190476190476187, + "loss": 0.3613, + "step": 18965 + }, + { + "epoch": 10.595530726256984, + "grad_norm": 0.5262951850891113, + "learning_rate": 0.00047187675070028013, + "loss": 0.4185, + "step": 18966 + }, + { + "epoch": 10.59608938547486, + "grad_norm": 0.7183969616889954, + "learning_rate": 0.00047184873949579834, + "loss": 0.3164, + "step": 18967 + }, + { + "epoch": 10.596648044692737, + "grad_norm": 0.7471548318862915, + "learning_rate": 0.00047182072829131654, + "loss": 0.4525, + "step": 18968 + }, + { + "epoch": 10.597206703910615, + "grad_norm": 0.4025716483592987, + "learning_rate": 0.00047179271708683475, + "loss": 0.4073, + "step": 18969 + }, + { + "epoch": 10.597765363128492, + "grad_norm": 0.33605897426605225, + "learning_rate": 0.00047176470588235295, + "loss": 0.3077, + "step": 18970 + }, + { + "epoch": 10.598324022346368, + "grad_norm": 0.38188832998275757, + "learning_rate": 0.00047173669467787116, + "loss": 0.4213, + "step": 18971 + }, + { + "epoch": 10.598882681564247, + "grad_norm": 0.3840387165546417, + "learning_rate": 0.00047170868347338937, + "loss": 0.4146, + "step": 18972 + }, + { + "epoch": 10.599441340782123, + "grad_norm": 0.32684099674224854, + "learning_rate": 0.00047168067226890757, + "loss": 0.3573, + "step": 18973 + }, + { + "epoch": 10.6, + "grad_norm": 0.4086560308933258, + "learning_rate": 0.0004716526610644258, + "loss": 0.3264, + "step": 18974 + }, + { + "epoch": 10.600558659217878, + "grad_norm": 0.953455924987793, + "learning_rate": 0.000471624649859944, + "loss": 0.3764, + "step": 18975 + }, + { + "epoch": 10.601117318435755, + "grad_norm": 1.29454505443573, + "learning_rate": 0.0004715966386554622, + "loss": 0.4365, + "step": 18976 + }, + { + "epoch": 10.601675977653631, + "grad_norm": 0.34539467096328735, + "learning_rate": 0.0004715686274509804, + "loss": 0.3728, + "step": 18977 + }, + { + "epoch": 10.602234636871508, + "grad_norm": 0.7120475769042969, + "learning_rate": 0.00047154061624649866, + "loss": 0.5695, + "step": 18978 + }, + { + "epoch": 10.602793296089386, + "grad_norm": 1.0958791971206665, + "learning_rate": 0.0004715126050420168, + "loss": 0.4471, + "step": 18979 + }, + { + "epoch": 10.603351955307263, + "grad_norm": 0.5674501657485962, + "learning_rate": 0.000471484593837535, + "loss": 0.3892, + "step": 18980 + }, + { + "epoch": 10.60391061452514, + "grad_norm": 2.854255199432373, + "learning_rate": 0.0004714565826330532, + "loss": 0.4022, + "step": 18981 + }, + { + "epoch": 10.604469273743018, + "grad_norm": 0.5068051815032959, + "learning_rate": 0.0004714285714285714, + "loss": 0.4523, + "step": 18982 + }, + { + "epoch": 10.605027932960894, + "grad_norm": 0.44000405073165894, + "learning_rate": 0.0004714005602240897, + "loss": 0.458, + "step": 18983 + }, + { + "epoch": 10.60558659217877, + "grad_norm": 0.508782148361206, + "learning_rate": 0.00047137254901960784, + "loss": 0.3501, + "step": 18984 + }, + { + "epoch": 10.606145251396647, + "grad_norm": 0.6872757077217102, + "learning_rate": 0.00047134453781512604, + "loss": 0.4488, + "step": 18985 + }, + { + "epoch": 10.606703910614526, + "grad_norm": 0.7772918939590454, + "learning_rate": 0.0004713165266106443, + "loss": 0.34, + "step": 18986 + }, + { + "epoch": 10.607262569832402, + "grad_norm": 1.1493984460830688, + "learning_rate": 0.00047128851540616246, + "loss": 0.3969, + "step": 18987 + }, + { + "epoch": 10.607821229050279, + "grad_norm": 0.5101087689399719, + "learning_rate": 0.0004712605042016807, + "loss": 0.4444, + "step": 18988 + }, + { + "epoch": 10.608379888268157, + "grad_norm": 0.354507178068161, + "learning_rate": 0.00047123249299719887, + "loss": 0.384, + "step": 18989 + }, + { + "epoch": 10.608938547486034, + "grad_norm": 0.5652009844779968, + "learning_rate": 0.0004712044817927171, + "loss": 0.4034, + "step": 18990 + }, + { + "epoch": 10.60949720670391, + "grad_norm": 1.1361922025680542, + "learning_rate": 0.00047117647058823533, + "loss": 0.5921, + "step": 18991 + }, + { + "epoch": 10.610055865921789, + "grad_norm": 0.4728354215621948, + "learning_rate": 0.0004711484593837535, + "loss": 0.4261, + "step": 18992 + }, + { + "epoch": 10.610614525139665, + "grad_norm": 0.40010306239128113, + "learning_rate": 0.00047112044817927175, + "loss": 0.4103, + "step": 18993 + }, + { + "epoch": 10.611173184357542, + "grad_norm": 13.812115669250488, + "learning_rate": 0.00047109243697478995, + "loss": 0.4396, + "step": 18994 + }, + { + "epoch": 10.611731843575418, + "grad_norm": 0.5026459693908691, + "learning_rate": 0.0004710644257703081, + "loss": 0.4042, + "step": 18995 + }, + { + "epoch": 10.612290502793297, + "grad_norm": 0.4620753824710846, + "learning_rate": 0.00047103641456582636, + "loss": 0.4696, + "step": 18996 + }, + { + "epoch": 10.612849162011173, + "grad_norm": 0.864603579044342, + "learning_rate": 0.0004710084033613445, + "loss": 0.5172, + "step": 18997 + }, + { + "epoch": 10.61340782122905, + "grad_norm": 2.830671787261963, + "learning_rate": 0.0004709803921568628, + "loss": 0.3802, + "step": 18998 + }, + { + "epoch": 10.613966480446928, + "grad_norm": 0.5160785913467407, + "learning_rate": 0.000470952380952381, + "loss": 0.5051, + "step": 18999 + }, + { + "epoch": 10.614525139664805, + "grad_norm": 8.048046112060547, + "learning_rate": 0.00047092436974789913, + "loss": 0.417, + "step": 19000 + }, + { + "epoch": 10.614525139664805, + "eval_cer": 0.08926603602718951, + "eval_loss": 0.3345629572868347, + "eval_runtime": 55.3272, + "eval_samples_per_second": 82.021, + "eval_steps_per_second": 5.133, + "eval_wer": 0.35300689924862433, + "step": 19000 + }, + { + "epoch": 10.615083798882681, + "grad_norm": 0.415183961391449, + "learning_rate": 0.0004708963585434174, + "loss": 0.5224, + "step": 19001 + }, + { + "epoch": 10.61564245810056, + "grad_norm": 0.5513841509819031, + "learning_rate": 0.0004708683473389356, + "loss": 0.4743, + "step": 19002 + }, + { + "epoch": 10.616201117318436, + "grad_norm": 1.6740401983261108, + "learning_rate": 0.0004708403361344538, + "loss": 0.5792, + "step": 19003 + }, + { + "epoch": 10.616759776536313, + "grad_norm": 0.4966926574707031, + "learning_rate": 0.000470812324929972, + "loss": 0.4092, + "step": 19004 + }, + { + "epoch": 10.61731843575419, + "grad_norm": 0.5723060369491577, + "learning_rate": 0.00047078431372549016, + "loss": 0.4056, + "step": 19005 + }, + { + "epoch": 10.617877094972068, + "grad_norm": 0.4851645231246948, + "learning_rate": 0.0004707563025210084, + "loss": 0.4239, + "step": 19006 + }, + { + "epoch": 10.618435754189944, + "grad_norm": 1.4907587766647339, + "learning_rate": 0.00047072829131652663, + "loss": 0.4596, + "step": 19007 + }, + { + "epoch": 10.61899441340782, + "grad_norm": 0.5339739918708801, + "learning_rate": 0.00047070028011204484, + "loss": 0.3565, + "step": 19008 + }, + { + "epoch": 10.619553072625699, + "grad_norm": 0.6172865033149719, + "learning_rate": 0.00047067226890756304, + "loss": 0.3941, + "step": 19009 + }, + { + "epoch": 10.620111731843576, + "grad_norm": 0.7645795941352844, + "learning_rate": 0.00047064425770308125, + "loss": 0.5722, + "step": 19010 + }, + { + "epoch": 10.620670391061452, + "grad_norm": 0.5453724265098572, + "learning_rate": 0.00047061624649859945, + "loss": 0.5565, + "step": 19011 + }, + { + "epoch": 10.621229050279329, + "grad_norm": 0.6668143272399902, + "learning_rate": 0.00047058823529411766, + "loss": 0.4093, + "step": 19012 + }, + { + "epoch": 10.621787709497207, + "grad_norm": 2.8203282356262207, + "learning_rate": 0.00047056022408963587, + "loss": 0.5476, + "step": 19013 + }, + { + "epoch": 10.622346368715084, + "grad_norm": 0.5626693367958069, + "learning_rate": 0.00047053221288515407, + "loss": 0.4818, + "step": 19014 + }, + { + "epoch": 10.62290502793296, + "grad_norm": 0.4832860827445984, + "learning_rate": 0.0004705042016806723, + "loss": 0.4534, + "step": 19015 + }, + { + "epoch": 10.623463687150839, + "grad_norm": 0.6268319487571716, + "learning_rate": 0.0004704761904761905, + "loss": 0.3492, + "step": 19016 + }, + { + "epoch": 10.624022346368715, + "grad_norm": 0.43191006779670715, + "learning_rate": 0.0004704481792717087, + "loss": 0.4319, + "step": 19017 + }, + { + "epoch": 10.624581005586592, + "grad_norm": 0.6733386516571045, + "learning_rate": 0.00047042016806722695, + "loss": 0.546, + "step": 19018 + }, + { + "epoch": 10.62513966480447, + "grad_norm": 0.3711135983467102, + "learning_rate": 0.0004703921568627451, + "loss": 0.3334, + "step": 19019 + }, + { + "epoch": 10.625698324022347, + "grad_norm": 0.37633016705513, + "learning_rate": 0.0004703641456582633, + "loss": 0.374, + "step": 19020 + }, + { + "epoch": 10.626256983240223, + "grad_norm": 0.6560848951339722, + "learning_rate": 0.0004703361344537815, + "loss": 0.4801, + "step": 19021 + }, + { + "epoch": 10.6268156424581, + "grad_norm": 0.5283510088920593, + "learning_rate": 0.0004703081232492997, + "loss": 0.4622, + "step": 19022 + }, + { + "epoch": 10.627374301675978, + "grad_norm": 0.62628573179245, + "learning_rate": 0.0004702801120448179, + "loss": 0.4052, + "step": 19023 + }, + { + "epoch": 10.627932960893855, + "grad_norm": 0.9499527215957642, + "learning_rate": 0.00047025210084033613, + "loss": 0.4045, + "step": 19024 + }, + { + "epoch": 10.628491620111731, + "grad_norm": 0.4918128550052643, + "learning_rate": 0.00047022408963585434, + "loss": 0.3417, + "step": 19025 + }, + { + "epoch": 10.62905027932961, + "grad_norm": 2.0121538639068604, + "learning_rate": 0.0004701960784313726, + "loss": 0.4088, + "step": 19026 + }, + { + "epoch": 10.629608938547486, + "grad_norm": 0.3911835253238678, + "learning_rate": 0.00047016806722689075, + "loss": 0.3493, + "step": 19027 + }, + { + "epoch": 10.630167597765363, + "grad_norm": 0.519016444683075, + "learning_rate": 0.00047014005602240896, + "loss": 0.4779, + "step": 19028 + }, + { + "epoch": 10.630726256983241, + "grad_norm": 2.3829596042633057, + "learning_rate": 0.00047011204481792716, + "loss": 0.4247, + "step": 19029 + }, + { + "epoch": 10.631284916201118, + "grad_norm": 0.5463372468948364, + "learning_rate": 0.00047008403361344537, + "loss": 0.4618, + "step": 19030 + }, + { + "epoch": 10.631843575418994, + "grad_norm": 0.564556896686554, + "learning_rate": 0.00047005602240896363, + "loss": 0.4602, + "step": 19031 + }, + { + "epoch": 10.63240223463687, + "grad_norm": 1.4305754899978638, + "learning_rate": 0.0004700280112044818, + "loss": 0.4028, + "step": 19032 + }, + { + "epoch": 10.632960893854749, + "grad_norm": 0.5458186864852905, + "learning_rate": 0.00047, + "loss": 0.4983, + "step": 19033 + }, + { + "epoch": 10.633519553072626, + "grad_norm": 1.3629544973373413, + "learning_rate": 0.00046997198879551825, + "loss": 0.7202, + "step": 19034 + }, + { + "epoch": 10.634078212290502, + "grad_norm": 0.40971869230270386, + "learning_rate": 0.0004699439775910364, + "loss": 0.4323, + "step": 19035 + }, + { + "epoch": 10.63463687150838, + "grad_norm": 0.5169001221656799, + "learning_rate": 0.00046991596638655466, + "loss": 0.3726, + "step": 19036 + }, + { + "epoch": 10.635195530726257, + "grad_norm": 5.221506118774414, + "learning_rate": 0.0004698879551820728, + "loss": 0.4903, + "step": 19037 + }, + { + "epoch": 10.635754189944134, + "grad_norm": 0.4050253629684448, + "learning_rate": 0.000469859943977591, + "loss": 0.4436, + "step": 19038 + }, + { + "epoch": 10.63631284916201, + "grad_norm": 0.3868137300014496, + "learning_rate": 0.0004698319327731093, + "loss": 0.366, + "step": 19039 + }, + { + "epoch": 10.636871508379889, + "grad_norm": 55.83261489868164, + "learning_rate": 0.00046980392156862743, + "loss": 0.4528, + "step": 19040 + }, + { + "epoch": 10.637430167597765, + "grad_norm": 1.1053935289382935, + "learning_rate": 0.0004697759103641457, + "loss": 0.4221, + "step": 19041 + }, + { + "epoch": 10.637988826815642, + "grad_norm": 1.2547225952148438, + "learning_rate": 0.0004697478991596639, + "loss": 0.3847, + "step": 19042 + }, + { + "epoch": 10.63854748603352, + "grad_norm": 0.48490244150161743, + "learning_rate": 0.00046971988795518205, + "loss": 0.3517, + "step": 19043 + }, + { + "epoch": 10.639106145251397, + "grad_norm": 0.3545001149177551, + "learning_rate": 0.0004696918767507003, + "loss": 0.4162, + "step": 19044 + }, + { + "epoch": 10.639664804469273, + "grad_norm": 0.5452660322189331, + "learning_rate": 0.00046966386554621846, + "loss": 0.3535, + "step": 19045 + }, + { + "epoch": 10.640223463687152, + "grad_norm": 0.3560701906681061, + "learning_rate": 0.0004696358543417367, + "loss": 0.4084, + "step": 19046 + }, + { + "epoch": 10.640782122905028, + "grad_norm": 0.32805687189102173, + "learning_rate": 0.0004696078431372549, + "loss": 0.4022, + "step": 19047 + }, + { + "epoch": 10.641340782122905, + "grad_norm": 0.5181145668029785, + "learning_rate": 0.0004695798319327731, + "loss": 0.4451, + "step": 19048 + }, + { + "epoch": 10.641899441340783, + "grad_norm": 0.49063727259635925, + "learning_rate": 0.00046955182072829134, + "loss": 0.5393, + "step": 19049 + }, + { + "epoch": 10.64245810055866, + "grad_norm": 0.6111874580383301, + "learning_rate": 0.00046952380952380954, + "loss": 0.385, + "step": 19050 + }, + { + "epoch": 10.643016759776536, + "grad_norm": 0.5436881184577942, + "learning_rate": 0.00046949579831932775, + "loss": 0.4127, + "step": 19051 + }, + { + "epoch": 10.643575418994413, + "grad_norm": 0.8494965434074402, + "learning_rate": 0.00046946778711484595, + "loss": 0.4366, + "step": 19052 + }, + { + "epoch": 10.644134078212291, + "grad_norm": 0.4573822021484375, + "learning_rate": 0.0004694397759103641, + "loss": 0.4255, + "step": 19053 + }, + { + "epoch": 10.644692737430168, + "grad_norm": 0.3451094925403595, + "learning_rate": 0.00046941176470588237, + "loss": 0.3417, + "step": 19054 + }, + { + "epoch": 10.645251396648044, + "grad_norm": 0.36737358570098877, + "learning_rate": 0.00046938375350140057, + "loss": 0.4192, + "step": 19055 + }, + { + "epoch": 10.645810055865923, + "grad_norm": 0.5293173789978027, + "learning_rate": 0.0004693557422969188, + "loss": 0.4484, + "step": 19056 + }, + { + "epoch": 10.6463687150838, + "grad_norm": 0.44499993324279785, + "learning_rate": 0.000469327731092437, + "loss": 0.4155, + "step": 19057 + }, + { + "epoch": 10.646927374301676, + "grad_norm": 0.6773020625114441, + "learning_rate": 0.0004692997198879552, + "loss": 0.4605, + "step": 19058 + }, + { + "epoch": 10.647486033519552, + "grad_norm": 0.4028243124485016, + "learning_rate": 0.0004692717086834734, + "loss": 0.3437, + "step": 19059 + }, + { + "epoch": 10.64804469273743, + "grad_norm": 0.6818606853485107, + "learning_rate": 0.0004692436974789916, + "loss": 0.4542, + "step": 19060 + }, + { + "epoch": 10.648603351955307, + "grad_norm": 2.3583338260650635, + "learning_rate": 0.0004692156862745098, + "loss": 0.5257, + "step": 19061 + }, + { + "epoch": 10.649162011173184, + "grad_norm": 0.4159480929374695, + "learning_rate": 0.000469187675070028, + "loss": 0.4626, + "step": 19062 + }, + { + "epoch": 10.649720670391062, + "grad_norm": 0.44404101371765137, + "learning_rate": 0.0004691596638655462, + "loss": 0.3446, + "step": 19063 + }, + { + "epoch": 10.650279329608939, + "grad_norm": 1.7233636379241943, + "learning_rate": 0.0004691316526610644, + "loss": 0.292, + "step": 19064 + }, + { + "epoch": 10.650837988826815, + "grad_norm": 1.3191317319869995, + "learning_rate": 0.00046910364145658263, + "loss": 0.4955, + "step": 19065 + }, + { + "epoch": 10.651396648044694, + "grad_norm": 0.7556615471839905, + "learning_rate": 0.0004690756302521009, + "loss": 0.4689, + "step": 19066 + }, + { + "epoch": 10.65195530726257, + "grad_norm": 0.43262651562690735, + "learning_rate": 0.00046904761904761904, + "loss": 0.4419, + "step": 19067 + }, + { + "epoch": 10.652513966480447, + "grad_norm": 0.4974231421947479, + "learning_rate": 0.00046901960784313725, + "loss": 0.3429, + "step": 19068 + }, + { + "epoch": 10.653072625698323, + "grad_norm": 0.5129490494728088, + "learning_rate": 0.00046899159663865546, + "loss": 0.5127, + "step": 19069 + }, + { + "epoch": 10.653631284916202, + "grad_norm": 0.5030393004417419, + "learning_rate": 0.00046896358543417366, + "loss": 0.4539, + "step": 19070 + }, + { + "epoch": 10.654189944134078, + "grad_norm": 0.4107878506183624, + "learning_rate": 0.0004689355742296919, + "loss": 0.3833, + "step": 19071 + }, + { + "epoch": 10.654748603351955, + "grad_norm": 0.46170878410339355, + "learning_rate": 0.0004689075630252101, + "loss": 0.3309, + "step": 19072 + }, + { + "epoch": 10.655307262569833, + "grad_norm": 0.8438941240310669, + "learning_rate": 0.0004688795518207283, + "loss": 0.5224, + "step": 19073 + }, + { + "epoch": 10.65586592178771, + "grad_norm": 0.7511799335479736, + "learning_rate": 0.00046885154061624654, + "loss": 0.4567, + "step": 19074 + }, + { + "epoch": 10.656424581005586, + "grad_norm": 2.3624446392059326, + "learning_rate": 0.0004688235294117647, + "loss": 0.4491, + "step": 19075 + }, + { + "epoch": 10.656983240223465, + "grad_norm": 0.5792214274406433, + "learning_rate": 0.00046879551820728295, + "loss": 0.4607, + "step": 19076 + }, + { + "epoch": 10.657541899441341, + "grad_norm": 0.8547051548957825, + "learning_rate": 0.0004687675070028011, + "loss": 0.4094, + "step": 19077 + }, + { + "epoch": 10.658100558659218, + "grad_norm": 0.6848317980766296, + "learning_rate": 0.0004687394957983193, + "loss": 0.4159, + "step": 19078 + }, + { + "epoch": 10.658659217877094, + "grad_norm": 0.5106977224349976, + "learning_rate": 0.00046871148459383757, + "loss": 0.5057, + "step": 19079 + }, + { + "epoch": 10.659217877094973, + "grad_norm": 0.6351894736289978, + "learning_rate": 0.0004686834733893557, + "loss": 0.369, + "step": 19080 + }, + { + "epoch": 10.65977653631285, + "grad_norm": 0.5743239521980286, + "learning_rate": 0.000468655462184874, + "loss": 0.5266, + "step": 19081 + }, + { + "epoch": 10.660335195530726, + "grad_norm": 0.5067649483680725, + "learning_rate": 0.0004686274509803922, + "loss": 0.4256, + "step": 19082 + }, + { + "epoch": 10.660893854748604, + "grad_norm": 0.41858527064323425, + "learning_rate": 0.00046859943977591034, + "loss": 0.4271, + "step": 19083 + }, + { + "epoch": 10.66145251396648, + "grad_norm": 0.7740573883056641, + "learning_rate": 0.0004685714285714286, + "loss": 0.5517, + "step": 19084 + }, + { + "epoch": 10.662011173184357, + "grad_norm": 0.49993398785591125, + "learning_rate": 0.00046854341736694675, + "loss": 0.4022, + "step": 19085 + }, + { + "epoch": 10.662569832402234, + "grad_norm": 0.49791771173477173, + "learning_rate": 0.000468515406162465, + "loss": 0.4383, + "step": 19086 + }, + { + "epoch": 10.663128491620112, + "grad_norm": 0.8350551128387451, + "learning_rate": 0.0004684873949579832, + "loss": 0.4659, + "step": 19087 + }, + { + "epoch": 10.663687150837989, + "grad_norm": 1.5306137800216675, + "learning_rate": 0.00046845938375350137, + "loss": 0.3581, + "step": 19088 + }, + { + "epoch": 10.664245810055865, + "grad_norm": 0.6567550301551819, + "learning_rate": 0.00046843137254901963, + "loss": 0.3781, + "step": 19089 + }, + { + "epoch": 10.664804469273744, + "grad_norm": 0.8038487434387207, + "learning_rate": 0.00046840336134453784, + "loss": 0.5969, + "step": 19090 + }, + { + "epoch": 10.66536312849162, + "grad_norm": 0.6921828389167786, + "learning_rate": 0.00046837535014005604, + "loss": 0.3682, + "step": 19091 + }, + { + "epoch": 10.665921787709497, + "grad_norm": 0.4302246868610382, + "learning_rate": 0.00046834733893557425, + "loss": 0.3922, + "step": 19092 + }, + { + "epoch": 10.666480446927375, + "grad_norm": 0.5351213812828064, + "learning_rate": 0.0004683193277310924, + "loss": 0.5228, + "step": 19093 + }, + { + "epoch": 10.667039106145252, + "grad_norm": 0.4133875370025635, + "learning_rate": 0.00046829131652661066, + "loss": 0.4169, + "step": 19094 + }, + { + "epoch": 10.667597765363128, + "grad_norm": 0.6462990045547485, + "learning_rate": 0.00046826330532212887, + "loss": 0.5254, + "step": 19095 + }, + { + "epoch": 10.668156424581005, + "grad_norm": 1.2833349704742432, + "learning_rate": 0.00046823529411764707, + "loss": 0.3988, + "step": 19096 + }, + { + "epoch": 10.668715083798883, + "grad_norm": 0.7234786152839661, + "learning_rate": 0.0004682072829131653, + "loss": 0.4706, + "step": 19097 + }, + { + "epoch": 10.66927374301676, + "grad_norm": 0.7720224857330322, + "learning_rate": 0.0004681792717086835, + "loss": 0.5365, + "step": 19098 + }, + { + "epoch": 10.669832402234636, + "grad_norm": 0.4070813059806824, + "learning_rate": 0.0004681512605042017, + "loss": 0.3656, + "step": 19099 + }, + { + "epoch": 10.670391061452515, + "grad_norm": 0.45597848296165466, + "learning_rate": 0.0004681232492997199, + "loss": 0.4102, + "step": 19100 + }, + { + "epoch": 10.670949720670391, + "grad_norm": 0.5677588582038879, + "learning_rate": 0.00046809523809523816, + "loss": 0.5528, + "step": 19101 + }, + { + "epoch": 10.671508379888268, + "grad_norm": 0.5414113402366638, + "learning_rate": 0.0004680672268907563, + "loss": 0.4445, + "step": 19102 + }, + { + "epoch": 10.672067039106146, + "grad_norm": 0.47070086002349854, + "learning_rate": 0.0004680392156862745, + "loss": 0.4174, + "step": 19103 + }, + { + "epoch": 10.672625698324023, + "grad_norm": 0.6463845372200012, + "learning_rate": 0.0004680112044817927, + "loss": 0.3636, + "step": 19104 + }, + { + "epoch": 10.6731843575419, + "grad_norm": 0.34276607632637024, + "learning_rate": 0.0004679831932773109, + "loss": 0.3433, + "step": 19105 + }, + { + "epoch": 10.673743016759776, + "grad_norm": 0.47840774059295654, + "learning_rate": 0.0004679551820728292, + "loss": 0.3954, + "step": 19106 + }, + { + "epoch": 10.674301675977654, + "grad_norm": 0.7085204720497131, + "learning_rate": 0.00046792717086834734, + "loss": 0.3905, + "step": 19107 + }, + { + "epoch": 10.67486033519553, + "grad_norm": 1.1861666440963745, + "learning_rate": 0.00046789915966386554, + "loss": 0.4145, + "step": 19108 + }, + { + "epoch": 10.675418994413407, + "grad_norm": 0.3899077773094177, + "learning_rate": 0.0004678711484593838, + "loss": 0.3976, + "step": 19109 + }, + { + "epoch": 10.675977653631286, + "grad_norm": 0.45743659138679504, + "learning_rate": 0.00046784313725490196, + "loss": 0.3251, + "step": 19110 + }, + { + "epoch": 10.676536312849162, + "grad_norm": 0.45469725131988525, + "learning_rate": 0.0004678151260504202, + "loss": 0.463, + "step": 19111 + }, + { + "epoch": 10.677094972067039, + "grad_norm": 0.3300894498825073, + "learning_rate": 0.00046778711484593837, + "loss": 0.385, + "step": 19112 + }, + { + "epoch": 10.677653631284915, + "grad_norm": 0.4054047465324402, + "learning_rate": 0.0004677591036414566, + "loss": 0.4341, + "step": 19113 + }, + { + "epoch": 10.678212290502794, + "grad_norm": 0.47632908821105957, + "learning_rate": 0.00046773109243697483, + "loss": 0.2723, + "step": 19114 + }, + { + "epoch": 10.67877094972067, + "grad_norm": 1.205341100692749, + "learning_rate": 0.000467703081232493, + "loss": 0.5243, + "step": 19115 + }, + { + "epoch": 10.679329608938547, + "grad_norm": 0.3325849771499634, + "learning_rate": 0.00046767507002801125, + "loss": 0.3289, + "step": 19116 + }, + { + "epoch": 10.679888268156425, + "grad_norm": 0.5517097115516663, + "learning_rate": 0.00046764705882352945, + "loss": 0.3591, + "step": 19117 + }, + { + "epoch": 10.680446927374302, + "grad_norm": 2.2804384231567383, + "learning_rate": 0.0004676190476190476, + "loss": 0.3, + "step": 19118 + }, + { + "epoch": 10.681005586592178, + "grad_norm": 0.45297345519065857, + "learning_rate": 0.00046759103641456586, + "loss": 0.3275, + "step": 19119 + }, + { + "epoch": 10.681564245810057, + "grad_norm": 0.47264283895492554, + "learning_rate": 0.000467563025210084, + "loss": 0.4513, + "step": 19120 + }, + { + "epoch": 10.682122905027933, + "grad_norm": 0.6037371754646301, + "learning_rate": 0.0004675350140056023, + "loss": 0.3924, + "step": 19121 + }, + { + "epoch": 10.68268156424581, + "grad_norm": 0.44970548152923584, + "learning_rate": 0.0004675070028011205, + "loss": 0.4976, + "step": 19122 + }, + { + "epoch": 10.683240223463688, + "grad_norm": 0.38732364773750305, + "learning_rate": 0.00046747899159663863, + "loss": 0.423, + "step": 19123 + }, + { + "epoch": 10.683798882681565, + "grad_norm": 0.6231667995452881, + "learning_rate": 0.0004674509803921569, + "loss": 0.5034, + "step": 19124 + }, + { + "epoch": 10.684357541899441, + "grad_norm": 0.4319717288017273, + "learning_rate": 0.0004674229691876751, + "loss": 0.3598, + "step": 19125 + }, + { + "epoch": 10.684916201117318, + "grad_norm": 4.4659013748168945, + "learning_rate": 0.0004673949579831933, + "loss": 0.4826, + "step": 19126 + }, + { + "epoch": 10.685474860335196, + "grad_norm": 0.4170956611633301, + "learning_rate": 0.0004673669467787115, + "loss": 0.4104, + "step": 19127 + }, + { + "epoch": 10.686033519553073, + "grad_norm": 0.554125964641571, + "learning_rate": 0.00046733893557422966, + "loss": 0.4121, + "step": 19128 + }, + { + "epoch": 10.68659217877095, + "grad_norm": 0.7523870468139648, + "learning_rate": 0.0004673109243697479, + "loss": 0.7048, + "step": 19129 + }, + { + "epoch": 10.687150837988828, + "grad_norm": 0.44762739539146423, + "learning_rate": 0.00046728291316526613, + "loss": 0.4358, + "step": 19130 + }, + { + "epoch": 10.687709497206704, + "grad_norm": 0.5005160570144653, + "learning_rate": 0.00046725490196078434, + "loss": 0.4308, + "step": 19131 + }, + { + "epoch": 10.68826815642458, + "grad_norm": 0.3986276388168335, + "learning_rate": 0.00046722689075630254, + "loss": 0.3784, + "step": 19132 + }, + { + "epoch": 10.688826815642457, + "grad_norm": 0.41815945506095886, + "learning_rate": 0.00046719887955182075, + "loss": 0.3808, + "step": 19133 + }, + { + "epoch": 10.689385474860336, + "grad_norm": 0.47712522745132446, + "learning_rate": 0.00046717086834733895, + "loss": 0.4194, + "step": 19134 + }, + { + "epoch": 10.689944134078212, + "grad_norm": 0.534307599067688, + "learning_rate": 0.00046714285714285716, + "loss": 0.3459, + "step": 19135 + }, + { + "epoch": 10.690502793296089, + "grad_norm": 0.37046006321907043, + "learning_rate": 0.0004671148459383753, + "loss": 0.4017, + "step": 19136 + }, + { + "epoch": 10.691061452513967, + "grad_norm": 0.6235345602035522, + "learning_rate": 0.00046708683473389357, + "loss": 0.3615, + "step": 19137 + }, + { + "epoch": 10.691620111731844, + "grad_norm": 1.132794976234436, + "learning_rate": 0.0004670588235294118, + "loss": 0.4584, + "step": 19138 + }, + { + "epoch": 10.69217877094972, + "grad_norm": 0.5851292610168457, + "learning_rate": 0.00046703081232493, + "loss": 0.3411, + "step": 19139 + }, + { + "epoch": 10.692737430167599, + "grad_norm": 0.5423732995986938, + "learning_rate": 0.0004670028011204482, + "loss": 0.4639, + "step": 19140 + }, + { + "epoch": 10.693296089385475, + "grad_norm": 1.5003374814987183, + "learning_rate": 0.0004669747899159664, + "loss": 0.3638, + "step": 19141 + }, + { + "epoch": 10.693854748603352, + "grad_norm": 0.5174238085746765, + "learning_rate": 0.0004669467787114846, + "loss": 0.5133, + "step": 19142 + }, + { + "epoch": 10.694413407821228, + "grad_norm": 0.5892506837844849, + "learning_rate": 0.0004669187675070028, + "loss": 0.5287, + "step": 19143 + }, + { + "epoch": 10.694972067039107, + "grad_norm": 0.4213191866874695, + "learning_rate": 0.000466890756302521, + "loss": 0.4115, + "step": 19144 + }, + { + "epoch": 10.695530726256983, + "grad_norm": 0.41962504386901855, + "learning_rate": 0.0004668627450980392, + "loss": 0.4542, + "step": 19145 + }, + { + "epoch": 10.69608938547486, + "grad_norm": 0.6039366722106934, + "learning_rate": 0.0004668347338935574, + "loss": 0.4822, + "step": 19146 + }, + { + "epoch": 10.696648044692738, + "grad_norm": 1.4804936647415161, + "learning_rate": 0.00046680672268907563, + "loss": 0.586, + "step": 19147 + }, + { + "epoch": 10.697206703910615, + "grad_norm": 0.5975735187530518, + "learning_rate": 0.00046677871148459384, + "loss": 0.4225, + "step": 19148 + }, + { + "epoch": 10.697765363128491, + "grad_norm": 0.9269648790359497, + "learning_rate": 0.0004667507002801121, + "loss": 0.6056, + "step": 19149 + }, + { + "epoch": 10.69832402234637, + "grad_norm": 4.277854919433594, + "learning_rate": 0.00046672268907563025, + "loss": 0.4429, + "step": 19150 + }, + { + "epoch": 10.698882681564246, + "grad_norm": 0.5252466797828674, + "learning_rate": 0.00046669467787114846, + "loss": 0.4758, + "step": 19151 + }, + { + "epoch": 10.699441340782123, + "grad_norm": 0.4072783291339874, + "learning_rate": 0.00046666666666666666, + "loss": 0.4086, + "step": 19152 + }, + { + "epoch": 10.7, + "grad_norm": 0.5791612863540649, + "learning_rate": 0.00046663865546218487, + "loss": 0.4137, + "step": 19153 + }, + { + "epoch": 10.700558659217878, + "grad_norm": 0.841476321220398, + "learning_rate": 0.00046661064425770313, + "loss": 0.4458, + "step": 19154 + }, + { + "epoch": 10.701117318435754, + "grad_norm": 4.3205790519714355, + "learning_rate": 0.0004665826330532213, + "loss": 0.3965, + "step": 19155 + }, + { + "epoch": 10.70167597765363, + "grad_norm": 1.5909810066223145, + "learning_rate": 0.0004665546218487395, + "loss": 0.3965, + "step": 19156 + }, + { + "epoch": 10.702234636871509, + "grad_norm": 0.4131898880004883, + "learning_rate": 0.00046652661064425775, + "loss": 0.3952, + "step": 19157 + }, + { + "epoch": 10.702793296089386, + "grad_norm": 0.9349154233932495, + "learning_rate": 0.0004664985994397759, + "loss": 0.39, + "step": 19158 + }, + { + "epoch": 10.703351955307262, + "grad_norm": 0.5243114233016968, + "learning_rate": 0.00046647058823529416, + "loss": 0.4094, + "step": 19159 + }, + { + "epoch": 10.703910614525139, + "grad_norm": 0.3909927308559418, + "learning_rate": 0.0004664425770308123, + "loss": 0.4031, + "step": 19160 + }, + { + "epoch": 10.704469273743017, + "grad_norm": 0.38771721720695496, + "learning_rate": 0.0004664145658263305, + "loss": 0.3625, + "step": 19161 + }, + { + "epoch": 10.705027932960894, + "grad_norm": 0.5744292736053467, + "learning_rate": 0.0004663865546218488, + "loss": 0.4711, + "step": 19162 + }, + { + "epoch": 10.70558659217877, + "grad_norm": 0.6499484181404114, + "learning_rate": 0.00046635854341736693, + "loss": 0.322, + "step": 19163 + }, + { + "epoch": 10.706145251396649, + "grad_norm": 0.4552537500858307, + "learning_rate": 0.0004663305322128852, + "loss": 0.3913, + "step": 19164 + }, + { + "epoch": 10.706703910614525, + "grad_norm": 3.466280460357666, + "learning_rate": 0.0004663025210084034, + "loss": 0.54, + "step": 19165 + }, + { + "epoch": 10.707262569832402, + "grad_norm": 0.6018717288970947, + "learning_rate": 0.00046627450980392155, + "loss": 0.4178, + "step": 19166 + }, + { + "epoch": 10.70782122905028, + "grad_norm": 1.7603206634521484, + "learning_rate": 0.0004662464985994398, + "loss": 0.404, + "step": 19167 + }, + { + "epoch": 10.708379888268157, + "grad_norm": 0.4864920377731323, + "learning_rate": 0.00046621848739495796, + "loss": 0.4645, + "step": 19168 + }, + { + "epoch": 10.708938547486033, + "grad_norm": 0.49840936064720154, + "learning_rate": 0.0004661904761904762, + "loss": 0.4351, + "step": 19169 + }, + { + "epoch": 10.70949720670391, + "grad_norm": 0.36510297656059265, + "learning_rate": 0.0004661624649859944, + "loss": 0.4488, + "step": 19170 + }, + { + "epoch": 10.710055865921788, + "grad_norm": 1.0267553329467773, + "learning_rate": 0.0004661344537815126, + "loss": 0.3793, + "step": 19171 + }, + { + "epoch": 10.710614525139665, + "grad_norm": 1.7075783014297485, + "learning_rate": 0.00046610644257703084, + "loss": 0.4351, + "step": 19172 + }, + { + "epoch": 10.711173184357541, + "grad_norm": 0.41012072563171387, + "learning_rate": 0.00046607843137254904, + "loss": 0.3765, + "step": 19173 + }, + { + "epoch": 10.71173184357542, + "grad_norm": 0.4107798635959625, + "learning_rate": 0.00046605042016806725, + "loss": 0.4827, + "step": 19174 + }, + { + "epoch": 10.712290502793296, + "grad_norm": 0.5288774967193604, + "learning_rate": 0.00046602240896358545, + "loss": 0.4132, + "step": 19175 + }, + { + "epoch": 10.712849162011173, + "grad_norm": 0.34799283742904663, + "learning_rate": 0.0004659943977591036, + "loss": 0.3065, + "step": 19176 + }, + { + "epoch": 10.713407821229051, + "grad_norm": 0.5456564426422119, + "learning_rate": 0.00046596638655462187, + "loss": 0.419, + "step": 19177 + }, + { + "epoch": 10.713966480446928, + "grad_norm": 0.4334261417388916, + "learning_rate": 0.00046593837535014007, + "loss": 0.3324, + "step": 19178 + }, + { + "epoch": 10.714525139664804, + "grad_norm": 2.0333592891693115, + "learning_rate": 0.0004659103641456583, + "loss": 0.3161, + "step": 19179 + }, + { + "epoch": 10.71508379888268, + "grad_norm": 0.40038445591926575, + "learning_rate": 0.0004658823529411765, + "loss": 0.3818, + "step": 19180 + }, + { + "epoch": 10.71564245810056, + "grad_norm": 0.3658725321292877, + "learning_rate": 0.0004658543417366947, + "loss": 0.377, + "step": 19181 + }, + { + "epoch": 10.716201117318436, + "grad_norm": 1.070756435394287, + "learning_rate": 0.0004658263305322129, + "loss": 0.3694, + "step": 19182 + }, + { + "epoch": 10.716759776536312, + "grad_norm": 0.588098406791687, + "learning_rate": 0.0004657983193277311, + "loss": 0.3662, + "step": 19183 + }, + { + "epoch": 10.71731843575419, + "grad_norm": 0.6097560524940491, + "learning_rate": 0.0004657703081232493, + "loss": 0.3477, + "step": 19184 + }, + { + "epoch": 10.717877094972067, + "grad_norm": 0.42431163787841797, + "learning_rate": 0.0004657422969187675, + "loss": 0.3821, + "step": 19185 + }, + { + "epoch": 10.718435754189944, + "grad_norm": 0.48769694566726685, + "learning_rate": 0.0004657142857142857, + "loss": 0.363, + "step": 19186 + }, + { + "epoch": 10.71899441340782, + "grad_norm": 0.5009562373161316, + "learning_rate": 0.0004656862745098039, + "loss": 0.3721, + "step": 19187 + }, + { + "epoch": 10.719553072625699, + "grad_norm": 0.5543577671051025, + "learning_rate": 0.00046565826330532213, + "loss": 0.3801, + "step": 19188 + }, + { + "epoch": 10.720111731843575, + "grad_norm": 0.3637104630470276, + "learning_rate": 0.0004656302521008404, + "loss": 0.3793, + "step": 19189 + }, + { + "epoch": 10.720670391061452, + "grad_norm": 0.6036620140075684, + "learning_rate": 0.00046560224089635854, + "loss": 0.3778, + "step": 19190 + }, + { + "epoch": 10.72122905027933, + "grad_norm": 0.4180900752544403, + "learning_rate": 0.00046557422969187675, + "loss": 0.5679, + "step": 19191 + }, + { + "epoch": 10.721787709497207, + "grad_norm": 0.7165327072143555, + "learning_rate": 0.00046554621848739496, + "loss": 0.454, + "step": 19192 + }, + { + "epoch": 10.722346368715083, + "grad_norm": 0.34337225556373596, + "learning_rate": 0.00046551820728291316, + "loss": 0.4537, + "step": 19193 + }, + { + "epoch": 10.722905027932962, + "grad_norm": 0.47642239928245544, + "learning_rate": 0.0004654901960784314, + "loss": 0.464, + "step": 19194 + }, + { + "epoch": 10.723463687150838, + "grad_norm": 0.4889824390411377, + "learning_rate": 0.0004654621848739496, + "loss": 0.4008, + "step": 19195 + }, + { + "epoch": 10.724022346368715, + "grad_norm": 0.6504570245742798, + "learning_rate": 0.0004654341736694678, + "loss": 0.4218, + "step": 19196 + }, + { + "epoch": 10.724581005586593, + "grad_norm": 0.5067644715309143, + "learning_rate": 0.00046540616246498604, + "loss": 0.467, + "step": 19197 + }, + { + "epoch": 10.72513966480447, + "grad_norm": 0.6029285192489624, + "learning_rate": 0.0004653781512605042, + "loss": 0.3726, + "step": 19198 + }, + { + "epoch": 10.725698324022346, + "grad_norm": 0.4918954372406006, + "learning_rate": 0.00046535014005602245, + "loss": 0.4649, + "step": 19199 + }, + { + "epoch": 10.726256983240223, + "grad_norm": 0.5521659851074219, + "learning_rate": 0.0004653221288515406, + "loss": 0.5541, + "step": 19200 + }, + { + "epoch": 10.726815642458101, + "grad_norm": 0.3751126825809479, + "learning_rate": 0.0004652941176470588, + "loss": 0.4037, + "step": 19201 + }, + { + "epoch": 10.727374301675978, + "grad_norm": 0.4620317816734314, + "learning_rate": 0.00046526610644257707, + "loss": 0.3312, + "step": 19202 + }, + { + "epoch": 10.727932960893854, + "grad_norm": 2.065751075744629, + "learning_rate": 0.0004652380952380952, + "loss": 0.5624, + "step": 19203 + }, + { + "epoch": 10.728491620111733, + "grad_norm": 0.4712981879711151, + "learning_rate": 0.0004652100840336135, + "loss": 0.3809, + "step": 19204 + }, + { + "epoch": 10.72905027932961, + "grad_norm": 0.5137621164321899, + "learning_rate": 0.0004651820728291317, + "loss": 0.5207, + "step": 19205 + }, + { + "epoch": 10.729608938547486, + "grad_norm": 0.5231024622917175, + "learning_rate": 0.00046515406162464984, + "loss": 0.424, + "step": 19206 + }, + { + "epoch": 10.730167597765362, + "grad_norm": 0.3687092065811157, + "learning_rate": 0.0004651260504201681, + "loss": 0.415, + "step": 19207 + }, + { + "epoch": 10.73072625698324, + "grad_norm": 0.4202491343021393, + "learning_rate": 0.00046509803921568625, + "loss": 0.4009, + "step": 19208 + }, + { + "epoch": 10.731284916201117, + "grad_norm": 1.6730176210403442, + "learning_rate": 0.0004650700280112045, + "loss": 0.3533, + "step": 19209 + }, + { + "epoch": 10.731843575418994, + "grad_norm": 1.897058129310608, + "learning_rate": 0.0004650420168067227, + "loss": 0.4782, + "step": 19210 + }, + { + "epoch": 10.732402234636872, + "grad_norm": 0.7021287679672241, + "learning_rate": 0.00046501400560224087, + "loss": 0.387, + "step": 19211 + }, + { + "epoch": 10.732960893854749, + "grad_norm": 0.29879865050315857, + "learning_rate": 0.00046498599439775913, + "loss": 0.3061, + "step": 19212 + }, + { + "epoch": 10.733519553072625, + "grad_norm": 1.0450514554977417, + "learning_rate": 0.00046495798319327734, + "loss": 0.3492, + "step": 19213 + }, + { + "epoch": 10.734078212290502, + "grad_norm": 1.3027139902114868, + "learning_rate": 0.00046492997198879554, + "loss": 0.4735, + "step": 19214 + }, + { + "epoch": 10.73463687150838, + "grad_norm": 0.501135528087616, + "learning_rate": 0.00046490196078431375, + "loss": 0.3938, + "step": 19215 + }, + { + "epoch": 10.735195530726257, + "grad_norm": 0.43792861700057983, + "learning_rate": 0.0004648739495798319, + "loss": 0.5522, + "step": 19216 + }, + { + "epoch": 10.735754189944133, + "grad_norm": 0.8639389276504517, + "learning_rate": 0.00046484593837535016, + "loss": 0.4747, + "step": 19217 + }, + { + "epoch": 10.736312849162012, + "grad_norm": 0.5563386082649231, + "learning_rate": 0.00046481792717086837, + "loss": 0.4683, + "step": 19218 + }, + { + "epoch": 10.736871508379888, + "grad_norm": 0.5339515209197998, + "learning_rate": 0.00046478991596638657, + "loss": 0.4707, + "step": 19219 + }, + { + "epoch": 10.737430167597765, + "grad_norm": 0.8274433612823486, + "learning_rate": 0.0004647619047619048, + "loss": 0.5157, + "step": 19220 + }, + { + "epoch": 10.737988826815643, + "grad_norm": 0.5010893940925598, + "learning_rate": 0.000464733893557423, + "loss": 0.4804, + "step": 19221 + }, + { + "epoch": 10.73854748603352, + "grad_norm": 0.42481040954589844, + "learning_rate": 0.0004647058823529412, + "loss": 0.459, + "step": 19222 + }, + { + "epoch": 10.739106145251396, + "grad_norm": 0.4648483395576477, + "learning_rate": 0.0004646778711484594, + "loss": 0.4011, + "step": 19223 + }, + { + "epoch": 10.739664804469275, + "grad_norm": 0.5737546682357788, + "learning_rate": 0.0004646498599439776, + "loss": 0.539, + "step": 19224 + }, + { + "epoch": 10.740223463687151, + "grad_norm": 0.9265821576118469, + "learning_rate": 0.0004646218487394958, + "loss": 0.4534, + "step": 19225 + }, + { + "epoch": 10.740782122905028, + "grad_norm": 0.5134668946266174, + "learning_rate": 0.000464593837535014, + "loss": 0.4256, + "step": 19226 + }, + { + "epoch": 10.741340782122904, + "grad_norm": 0.49933183193206787, + "learning_rate": 0.0004645658263305322, + "loss": 0.4049, + "step": 19227 + }, + { + "epoch": 10.741899441340783, + "grad_norm": 0.49328574538230896, + "learning_rate": 0.0004645378151260504, + "loss": 0.5104, + "step": 19228 + }, + { + "epoch": 10.74245810055866, + "grad_norm": 3.4530794620513916, + "learning_rate": 0.0004645098039215687, + "loss": 0.4194, + "step": 19229 + }, + { + "epoch": 10.743016759776536, + "grad_norm": 0.7529295086860657, + "learning_rate": 0.00046448179271708684, + "loss": 0.3637, + "step": 19230 + }, + { + "epoch": 10.743575418994414, + "grad_norm": 0.9591073989868164, + "learning_rate": 0.00046445378151260504, + "loss": 0.3951, + "step": 19231 + }, + { + "epoch": 10.74413407821229, + "grad_norm": 0.643728494644165, + "learning_rate": 0.00046442577030812325, + "loss": 0.5053, + "step": 19232 + }, + { + "epoch": 10.744692737430167, + "grad_norm": 0.8256698846817017, + "learning_rate": 0.00046439775910364146, + "loss": 0.4126, + "step": 19233 + }, + { + "epoch": 10.745251396648044, + "grad_norm": 0.3897912800312042, + "learning_rate": 0.0004643697478991597, + "loss": 0.3792, + "step": 19234 + }, + { + "epoch": 10.745810055865922, + "grad_norm": 1.7647591829299927, + "learning_rate": 0.00046434173669467787, + "loss": 0.4318, + "step": 19235 + }, + { + "epoch": 10.746368715083799, + "grad_norm": 0.5318904519081116, + "learning_rate": 0.0004643137254901961, + "loss": 0.3695, + "step": 19236 + }, + { + "epoch": 10.746927374301675, + "grad_norm": 0.6479517221450806, + "learning_rate": 0.00046428571428571433, + "loss": 0.3973, + "step": 19237 + }, + { + "epoch": 10.747486033519554, + "grad_norm": 0.38081344962120056, + "learning_rate": 0.0004642577030812325, + "loss": 0.458, + "step": 19238 + }, + { + "epoch": 10.74804469273743, + "grad_norm": 0.5555077791213989, + "learning_rate": 0.00046422969187675075, + "loss": 0.4492, + "step": 19239 + }, + { + "epoch": 10.748603351955307, + "grad_norm": 0.8116068243980408, + "learning_rate": 0.0004642016806722689, + "loss": 0.3887, + "step": 19240 + }, + { + "epoch": 10.749162011173185, + "grad_norm": 0.46870726346969604, + "learning_rate": 0.0004641736694677871, + "loss": 0.4963, + "step": 19241 + }, + { + "epoch": 10.749720670391062, + "grad_norm": 0.424879789352417, + "learning_rate": 0.00046414565826330536, + "loss": 0.4673, + "step": 19242 + }, + { + "epoch": 10.750279329608938, + "grad_norm": 4.5148820877075195, + "learning_rate": 0.0004641176470588235, + "loss": 0.5258, + "step": 19243 + }, + { + "epoch": 10.750837988826815, + "grad_norm": 0.6333886981010437, + "learning_rate": 0.0004640896358543418, + "loss": 0.4546, + "step": 19244 + }, + { + "epoch": 10.751396648044693, + "grad_norm": 2.736905097961426, + "learning_rate": 0.00046406162464986, + "loss": 0.3824, + "step": 19245 + }, + { + "epoch": 10.75195530726257, + "grad_norm": 0.4120141863822937, + "learning_rate": 0.00046403361344537813, + "loss": 0.3597, + "step": 19246 + }, + { + "epoch": 10.752513966480446, + "grad_norm": 0.709510862827301, + "learning_rate": 0.0004640056022408964, + "loss": 0.4432, + "step": 19247 + }, + { + "epoch": 10.753072625698325, + "grad_norm": 0.4020548164844513, + "learning_rate": 0.00046397759103641455, + "loss": 0.4354, + "step": 19248 + }, + { + "epoch": 10.753631284916201, + "grad_norm": 6.594186305999756, + "learning_rate": 0.00046394957983193275, + "loss": 0.3966, + "step": 19249 + }, + { + "epoch": 10.754189944134078, + "grad_norm": 0.4017973840236664, + "learning_rate": 0.000463921568627451, + "loss": 0.4345, + "step": 19250 + }, + { + "epoch": 10.754748603351956, + "grad_norm": 1.7600725889205933, + "learning_rate": 0.00046389355742296916, + "loss": 0.5708, + "step": 19251 + }, + { + "epoch": 10.755307262569833, + "grad_norm": 0.4477103054523468, + "learning_rate": 0.0004638655462184874, + "loss": 0.5661, + "step": 19252 + }, + { + "epoch": 10.75586592178771, + "grad_norm": 0.923815131187439, + "learning_rate": 0.00046383753501400563, + "loss": 0.5779, + "step": 19253 + }, + { + "epoch": 10.756424581005586, + "grad_norm": 0.395528644323349, + "learning_rate": 0.0004638095238095238, + "loss": 0.4131, + "step": 19254 + }, + { + "epoch": 10.756983240223464, + "grad_norm": 0.5090433359146118, + "learning_rate": 0.00046378151260504204, + "loss": 0.4368, + "step": 19255 + }, + { + "epoch": 10.75754189944134, + "grad_norm": 0.7328028678894043, + "learning_rate": 0.0004637535014005602, + "loss": 0.6518, + "step": 19256 + }, + { + "epoch": 10.758100558659217, + "grad_norm": 0.7842195630073547, + "learning_rate": 0.00046372549019607845, + "loss": 0.4425, + "step": 19257 + }, + { + "epoch": 10.758659217877096, + "grad_norm": 0.4387664198875427, + "learning_rate": 0.00046369747899159666, + "loss": 0.3423, + "step": 19258 + }, + { + "epoch": 10.759217877094972, + "grad_norm": 0.6029638051986694, + "learning_rate": 0.0004636694677871148, + "loss": 0.5207, + "step": 19259 + }, + { + "epoch": 10.759776536312849, + "grad_norm": 0.43201056122779846, + "learning_rate": 0.00046364145658263307, + "loss": 0.4241, + "step": 19260 + }, + { + "epoch": 10.760335195530725, + "grad_norm": 2.0606930255889893, + "learning_rate": 0.0004636134453781513, + "loss": 0.4359, + "step": 19261 + }, + { + "epoch": 10.760893854748604, + "grad_norm": 0.5689659714698792, + "learning_rate": 0.0004635854341736695, + "loss": 0.3455, + "step": 19262 + }, + { + "epoch": 10.76145251396648, + "grad_norm": 1.1604888439178467, + "learning_rate": 0.0004635574229691877, + "loss": 0.4831, + "step": 19263 + }, + { + "epoch": 10.762011173184357, + "grad_norm": 0.467805951833725, + "learning_rate": 0.00046352941176470584, + "loss": 0.4345, + "step": 19264 + }, + { + "epoch": 10.762569832402235, + "grad_norm": 13.2154541015625, + "learning_rate": 0.0004635014005602241, + "loss": 0.4018, + "step": 19265 + }, + { + "epoch": 10.763128491620112, + "grad_norm": 0.435615599155426, + "learning_rate": 0.0004634733893557423, + "loss": 0.4652, + "step": 19266 + }, + { + "epoch": 10.763687150837988, + "grad_norm": 0.39532455801963806, + "learning_rate": 0.0004634453781512605, + "loss": 0.3569, + "step": 19267 + }, + { + "epoch": 10.764245810055867, + "grad_norm": 0.4446870982646942, + "learning_rate": 0.0004634173669467787, + "loss": 0.3963, + "step": 19268 + }, + { + "epoch": 10.764804469273743, + "grad_norm": 0.7825211882591248, + "learning_rate": 0.0004633893557422969, + "loss": 0.4749, + "step": 19269 + }, + { + "epoch": 10.76536312849162, + "grad_norm": 0.6228837370872498, + "learning_rate": 0.00046336134453781513, + "loss": 0.3874, + "step": 19270 + }, + { + "epoch": 10.765921787709498, + "grad_norm": 1.3253917694091797, + "learning_rate": 0.00046333333333333334, + "loss": 0.5808, + "step": 19271 + }, + { + "epoch": 10.766480446927375, + "grad_norm": 3.4840729236602783, + "learning_rate": 0.00046330532212885154, + "loss": 0.4553, + "step": 19272 + }, + { + "epoch": 10.767039106145251, + "grad_norm": 0.46659964323043823, + "learning_rate": 0.00046327731092436975, + "loss": 0.4037, + "step": 19273 + }, + { + "epoch": 10.767597765363128, + "grad_norm": 0.37165331840515137, + "learning_rate": 0.00046324929971988796, + "loss": 0.4175, + "step": 19274 + }, + { + "epoch": 10.768156424581006, + "grad_norm": 0.6347272992134094, + "learning_rate": 0.00046322128851540616, + "loss": 0.497, + "step": 19275 + }, + { + "epoch": 10.768715083798883, + "grad_norm": 1.4658212661743164, + "learning_rate": 0.00046319327731092437, + "loss": 0.4301, + "step": 19276 + }, + { + "epoch": 10.76927374301676, + "grad_norm": 0.3680577874183655, + "learning_rate": 0.00046316526610644263, + "loss": 0.3865, + "step": 19277 + }, + { + "epoch": 10.769832402234638, + "grad_norm": 0.7368124127388, + "learning_rate": 0.0004631372549019608, + "loss": 0.3583, + "step": 19278 + }, + { + "epoch": 10.770391061452514, + "grad_norm": 0.4230891466140747, + "learning_rate": 0.000463109243697479, + "loss": 0.5101, + "step": 19279 + }, + { + "epoch": 10.77094972067039, + "grad_norm": 1.1111137866973877, + "learning_rate": 0.0004630812324929972, + "loss": 0.397, + "step": 19280 + }, + { + "epoch": 10.771508379888267, + "grad_norm": 0.42726680636405945, + "learning_rate": 0.0004630532212885154, + "loss": 0.4537, + "step": 19281 + }, + { + "epoch": 10.772067039106146, + "grad_norm": 0.4583140015602112, + "learning_rate": 0.00046302521008403366, + "loss": 0.5424, + "step": 19282 + }, + { + "epoch": 10.772625698324022, + "grad_norm": 0.6325045824050903, + "learning_rate": 0.0004629971988795518, + "loss": 0.4279, + "step": 19283 + }, + { + "epoch": 10.773184357541899, + "grad_norm": 0.8716033101081848, + "learning_rate": 0.00046296918767507, + "loss": 0.5713, + "step": 19284 + }, + { + "epoch": 10.773743016759777, + "grad_norm": 0.6241661310195923, + "learning_rate": 0.0004629411764705883, + "loss": 0.4899, + "step": 19285 + }, + { + "epoch": 10.774301675977654, + "grad_norm": 0.7427290678024292, + "learning_rate": 0.00046291316526610643, + "loss": 0.4353, + "step": 19286 + }, + { + "epoch": 10.77486033519553, + "grad_norm": 0.5163410902023315, + "learning_rate": 0.0004628851540616247, + "loss": 0.5552, + "step": 19287 + }, + { + "epoch": 10.775418994413407, + "grad_norm": 0.4569401741027832, + "learning_rate": 0.00046285714285714284, + "loss": 0.3789, + "step": 19288 + }, + { + "epoch": 10.775977653631285, + "grad_norm": 0.5449912548065186, + "learning_rate": 0.00046282913165266105, + "loss": 0.3555, + "step": 19289 + }, + { + "epoch": 10.776536312849162, + "grad_norm": 0.4345206022262573, + "learning_rate": 0.0004628011204481793, + "loss": 0.3725, + "step": 19290 + }, + { + "epoch": 10.777094972067038, + "grad_norm": 0.40944766998291016, + "learning_rate": 0.00046277310924369746, + "loss": 0.4106, + "step": 19291 + }, + { + "epoch": 10.777653631284917, + "grad_norm": 0.4483919143676758, + "learning_rate": 0.0004627450980392157, + "loss": 0.4336, + "step": 19292 + }, + { + "epoch": 10.778212290502793, + "grad_norm": 0.4667958617210388, + "learning_rate": 0.0004627170868347339, + "loss": 0.4359, + "step": 19293 + }, + { + "epoch": 10.77877094972067, + "grad_norm": 0.5636364817619324, + "learning_rate": 0.0004626890756302521, + "loss": 0.4771, + "step": 19294 + }, + { + "epoch": 10.779329608938548, + "grad_norm": 0.619242250919342, + "learning_rate": 0.00046266106442577034, + "loss": 0.5751, + "step": 19295 + }, + { + "epoch": 10.779888268156425, + "grad_norm": 0.5496019721031189, + "learning_rate": 0.0004626330532212885, + "loss": 0.3757, + "step": 19296 + }, + { + "epoch": 10.780446927374301, + "grad_norm": 0.5858065485954285, + "learning_rate": 0.00046260504201680675, + "loss": 0.5946, + "step": 19297 + }, + { + "epoch": 10.78100558659218, + "grad_norm": 0.7319847941398621, + "learning_rate": 0.00046257703081232495, + "loss": 0.4712, + "step": 19298 + }, + { + "epoch": 10.781564245810056, + "grad_norm": 0.6593323945999146, + "learning_rate": 0.0004625490196078431, + "loss": 0.4029, + "step": 19299 + }, + { + "epoch": 10.782122905027933, + "grad_norm": 0.4027957320213318, + "learning_rate": 0.00046252100840336137, + "loss": 0.4971, + "step": 19300 + }, + { + "epoch": 10.78268156424581, + "grad_norm": 0.4077528417110443, + "learning_rate": 0.00046249299719887957, + "loss": 0.4621, + "step": 19301 + }, + { + "epoch": 10.783240223463688, + "grad_norm": 0.3862953186035156, + "learning_rate": 0.0004624649859943978, + "loss": 0.3546, + "step": 19302 + }, + { + "epoch": 10.783798882681564, + "grad_norm": 0.414044052362442, + "learning_rate": 0.000462436974789916, + "loss": 0.4204, + "step": 19303 + }, + { + "epoch": 10.78435754189944, + "grad_norm": 0.5123696327209473, + "learning_rate": 0.00046240896358543414, + "loss": 0.5273, + "step": 19304 + }, + { + "epoch": 10.78491620111732, + "grad_norm": 0.47156015038490295, + "learning_rate": 0.0004623809523809524, + "loss": 0.4362, + "step": 19305 + }, + { + "epoch": 10.785474860335196, + "grad_norm": 0.9246543645858765, + "learning_rate": 0.0004623529411764706, + "loss": 0.4105, + "step": 19306 + }, + { + "epoch": 10.786033519553072, + "grad_norm": 1.2858933210372925, + "learning_rate": 0.0004623249299719888, + "loss": 0.5014, + "step": 19307 + }, + { + "epoch": 10.786592178770949, + "grad_norm": 0.6493561863899231, + "learning_rate": 0.000462296918767507, + "loss": 0.5241, + "step": 19308 + }, + { + "epoch": 10.787150837988827, + "grad_norm": 0.48784035444259644, + "learning_rate": 0.0004622689075630252, + "loss": 0.4331, + "step": 19309 + }, + { + "epoch": 10.787709497206704, + "grad_norm": 0.39675384759902954, + "learning_rate": 0.0004622408963585434, + "loss": 0.3875, + "step": 19310 + }, + { + "epoch": 10.78826815642458, + "grad_norm": 0.49113771319389343, + "learning_rate": 0.00046221288515406163, + "loss": 0.3633, + "step": 19311 + }, + { + "epoch": 10.788826815642459, + "grad_norm": 0.6159145832061768, + "learning_rate": 0.00046218487394957984, + "loss": 0.4832, + "step": 19312 + }, + { + "epoch": 10.789385474860335, + "grad_norm": 0.8310093283653259, + "learning_rate": 0.00046215686274509804, + "loss": 0.4591, + "step": 19313 + }, + { + "epoch": 10.789944134078212, + "grad_norm": 0.3561941385269165, + "learning_rate": 0.00046212885154061625, + "loss": 0.304, + "step": 19314 + }, + { + "epoch": 10.79050279329609, + "grad_norm": 0.4736616909503937, + "learning_rate": 0.00046210084033613446, + "loss": 0.4513, + "step": 19315 + }, + { + "epoch": 10.791061452513967, + "grad_norm": 0.5063034892082214, + "learning_rate": 0.00046207282913165266, + "loss": 0.4467, + "step": 19316 + }, + { + "epoch": 10.791620111731843, + "grad_norm": 0.45543110370635986, + "learning_rate": 0.0004620448179271709, + "loss": 0.4513, + "step": 19317 + }, + { + "epoch": 10.79217877094972, + "grad_norm": 0.6345983743667603, + "learning_rate": 0.0004620168067226891, + "loss": 0.5543, + "step": 19318 + }, + { + "epoch": 10.792737430167598, + "grad_norm": 0.3974752128124237, + "learning_rate": 0.0004619887955182073, + "loss": 0.3946, + "step": 19319 + }, + { + "epoch": 10.793296089385475, + "grad_norm": 1.067287802696228, + "learning_rate": 0.0004619607843137255, + "loss": 0.4421, + "step": 19320 + }, + { + "epoch": 10.793854748603351, + "grad_norm": 1.379459261894226, + "learning_rate": 0.0004619327731092437, + "loss": 0.3857, + "step": 19321 + }, + { + "epoch": 10.79441340782123, + "grad_norm": 1.1232118606567383, + "learning_rate": 0.00046190476190476195, + "loss": 0.4589, + "step": 19322 + }, + { + "epoch": 10.794972067039106, + "grad_norm": 0.5565237998962402, + "learning_rate": 0.0004618767507002801, + "loss": 0.3886, + "step": 19323 + }, + { + "epoch": 10.795530726256983, + "grad_norm": 2.12746524810791, + "learning_rate": 0.0004618487394957983, + "loss": 0.4434, + "step": 19324 + }, + { + "epoch": 10.796089385474861, + "grad_norm": 0.4889336824417114, + "learning_rate": 0.00046182072829131657, + "loss": 0.5011, + "step": 19325 + }, + { + "epoch": 10.796648044692738, + "grad_norm": 1.116256594657898, + "learning_rate": 0.0004617927170868347, + "loss": 0.3395, + "step": 19326 + }, + { + "epoch": 10.797206703910614, + "grad_norm": 1.0063740015029907, + "learning_rate": 0.000461764705882353, + "loss": 0.5212, + "step": 19327 + }, + { + "epoch": 10.797765363128491, + "grad_norm": 0.5698748230934143, + "learning_rate": 0.00046173669467787113, + "loss": 0.4851, + "step": 19328 + }, + { + "epoch": 10.79832402234637, + "grad_norm": 0.4347343444824219, + "learning_rate": 0.00046170868347338934, + "loss": 0.3816, + "step": 19329 + }, + { + "epoch": 10.798882681564246, + "grad_norm": 1.0486772060394287, + "learning_rate": 0.0004616806722689076, + "loss": 0.4106, + "step": 19330 + }, + { + "epoch": 10.799441340782122, + "grad_norm": 0.5287651419639587, + "learning_rate": 0.00046165266106442575, + "loss": 0.3629, + "step": 19331 + }, + { + "epoch": 10.8, + "grad_norm": 0.48144909739494324, + "learning_rate": 0.000461624649859944, + "loss": 0.4464, + "step": 19332 + }, + { + "epoch": 10.800558659217877, + "grad_norm": 0.40220141410827637, + "learning_rate": 0.0004615966386554622, + "loss": 0.4883, + "step": 19333 + }, + { + "epoch": 10.801117318435754, + "grad_norm": 1.6693894863128662, + "learning_rate": 0.00046156862745098037, + "loss": 0.3668, + "step": 19334 + }, + { + "epoch": 10.80167597765363, + "grad_norm": 0.47027599811553955, + "learning_rate": 0.00046154061624649863, + "loss": 0.4018, + "step": 19335 + }, + { + "epoch": 10.802234636871509, + "grad_norm": 0.5801828503608704, + "learning_rate": 0.0004615126050420168, + "loss": 0.4979, + "step": 19336 + }, + { + "epoch": 10.802793296089385, + "grad_norm": 0.42702654004096985, + "learning_rate": 0.00046148459383753504, + "loss": 0.3765, + "step": 19337 + }, + { + "epoch": 10.803351955307262, + "grad_norm": 0.5157850384712219, + "learning_rate": 0.00046145658263305325, + "loss": 0.4205, + "step": 19338 + }, + { + "epoch": 10.80391061452514, + "grad_norm": 0.6636309027671814, + "learning_rate": 0.0004614285714285714, + "loss": 0.5256, + "step": 19339 + }, + { + "epoch": 10.804469273743017, + "grad_norm": 1.0942429304122925, + "learning_rate": 0.00046140056022408966, + "loss": 0.4275, + "step": 19340 + }, + { + "epoch": 10.805027932960893, + "grad_norm": 1.4030613899230957, + "learning_rate": 0.00046137254901960787, + "loss": 0.3863, + "step": 19341 + }, + { + "epoch": 10.805586592178772, + "grad_norm": 0.5275806784629822, + "learning_rate": 0.00046134453781512607, + "loss": 0.4348, + "step": 19342 + }, + { + "epoch": 10.806145251396648, + "grad_norm": 0.44086840748786926, + "learning_rate": 0.0004613165266106443, + "loss": 0.5261, + "step": 19343 + }, + { + "epoch": 10.806703910614525, + "grad_norm": 0.5196534395217896, + "learning_rate": 0.00046128851540616243, + "loss": 0.33, + "step": 19344 + }, + { + "epoch": 10.807262569832401, + "grad_norm": 0.9620259404182434, + "learning_rate": 0.0004612605042016807, + "loss": 0.3992, + "step": 19345 + }, + { + "epoch": 10.80782122905028, + "grad_norm": 1.5872310400009155, + "learning_rate": 0.0004612324929971989, + "loss": 0.3695, + "step": 19346 + }, + { + "epoch": 10.808379888268156, + "grad_norm": 0.544471800327301, + "learning_rate": 0.0004612044817927171, + "loss": 0.5001, + "step": 19347 + }, + { + "epoch": 10.808938547486033, + "grad_norm": 0.3735467195510864, + "learning_rate": 0.0004611764705882353, + "loss": 0.4871, + "step": 19348 + }, + { + "epoch": 10.809497206703911, + "grad_norm": 0.38875165581703186, + "learning_rate": 0.0004611484593837535, + "loss": 0.4097, + "step": 19349 + }, + { + "epoch": 10.810055865921788, + "grad_norm": 0.36824342608451843, + "learning_rate": 0.0004611204481792717, + "loss": 0.4045, + "step": 19350 + }, + { + "epoch": 10.810614525139664, + "grad_norm": 1.3030104637145996, + "learning_rate": 0.0004610924369747899, + "loss": 0.4166, + "step": 19351 + }, + { + "epoch": 10.811173184357543, + "grad_norm": 0.5162912607192993, + "learning_rate": 0.0004610644257703082, + "loss": 0.4594, + "step": 19352 + }, + { + "epoch": 10.81173184357542, + "grad_norm": 0.402105450630188, + "learning_rate": 0.00046103641456582634, + "loss": 0.3883, + "step": 19353 + }, + { + "epoch": 10.812290502793296, + "grad_norm": 0.4113750159740448, + "learning_rate": 0.00046100840336134454, + "loss": 0.3873, + "step": 19354 + }, + { + "epoch": 10.812849162011172, + "grad_norm": 1.1792447566986084, + "learning_rate": 0.00046098039215686275, + "loss": 0.4022, + "step": 19355 + }, + { + "epoch": 10.81340782122905, + "grad_norm": 0.4565427601337433, + "learning_rate": 0.00046095238095238096, + "loss": 0.3777, + "step": 19356 + }, + { + "epoch": 10.813966480446927, + "grad_norm": 0.473994642496109, + "learning_rate": 0.00046092436974789916, + "loss": 0.4238, + "step": 19357 + }, + { + "epoch": 10.814525139664804, + "grad_norm": 0.43914365768432617, + "learning_rate": 0.00046089635854341737, + "loss": 0.4176, + "step": 19358 + }, + { + "epoch": 10.815083798882682, + "grad_norm": 0.37854260206222534, + "learning_rate": 0.0004608683473389356, + "loss": 0.4129, + "step": 19359 + }, + { + "epoch": 10.815642458100559, + "grad_norm": 1.0585401058197021, + "learning_rate": 0.00046084033613445383, + "loss": 0.3534, + "step": 19360 + }, + { + "epoch": 10.816201117318435, + "grad_norm": 0.938851535320282, + "learning_rate": 0.000460812324929972, + "loss": 0.4993, + "step": 19361 + }, + { + "epoch": 10.816759776536312, + "grad_norm": 0.45085254311561584, + "learning_rate": 0.0004607843137254902, + "loss": 0.3952, + "step": 19362 + }, + { + "epoch": 10.81731843575419, + "grad_norm": 0.7993432879447937, + "learning_rate": 0.0004607563025210084, + "loss": 0.6465, + "step": 19363 + }, + { + "epoch": 10.817877094972067, + "grad_norm": 0.38208821415901184, + "learning_rate": 0.0004607282913165266, + "loss": 0.4511, + "step": 19364 + }, + { + "epoch": 10.818435754189943, + "grad_norm": 0.5797911286354065, + "learning_rate": 0.00046070028011204486, + "loss": 0.4004, + "step": 19365 + }, + { + "epoch": 10.818994413407822, + "grad_norm": 0.3544953465461731, + "learning_rate": 0.000460672268907563, + "loss": 0.3816, + "step": 19366 + }, + { + "epoch": 10.819553072625698, + "grad_norm": 0.37194350361824036, + "learning_rate": 0.0004606442577030812, + "loss": 0.3402, + "step": 19367 + }, + { + "epoch": 10.820111731843575, + "grad_norm": 0.45848625898361206, + "learning_rate": 0.0004606162464985995, + "loss": 0.4818, + "step": 19368 + }, + { + "epoch": 10.820670391061453, + "grad_norm": 0.5389447212219238, + "learning_rate": 0.00046058823529411763, + "loss": 0.4482, + "step": 19369 + }, + { + "epoch": 10.82122905027933, + "grad_norm": 0.6187888383865356, + "learning_rate": 0.0004605602240896359, + "loss": 0.5049, + "step": 19370 + }, + { + "epoch": 10.821787709497206, + "grad_norm": 0.4174392521381378, + "learning_rate": 0.00046053221288515405, + "loss": 0.3571, + "step": 19371 + }, + { + "epoch": 10.822346368715085, + "grad_norm": 1.2650152444839478, + "learning_rate": 0.00046050420168067225, + "loss": 0.3678, + "step": 19372 + }, + { + "epoch": 10.822905027932961, + "grad_norm": 0.38347044587135315, + "learning_rate": 0.0004604761904761905, + "loss": 0.3014, + "step": 19373 + }, + { + "epoch": 10.823463687150838, + "grad_norm": 1.2803605794906616, + "learning_rate": 0.00046044817927170866, + "loss": 0.3693, + "step": 19374 + }, + { + "epoch": 10.824022346368714, + "grad_norm": 0.41358181834220886, + "learning_rate": 0.0004604201680672269, + "loss": 0.3828, + "step": 19375 + }, + { + "epoch": 10.824581005586593, + "grad_norm": 0.46894052624702454, + "learning_rate": 0.00046039215686274513, + "loss": 0.5347, + "step": 19376 + }, + { + "epoch": 10.82513966480447, + "grad_norm": 0.49601656198501587, + "learning_rate": 0.0004603641456582633, + "loss": 0.4682, + "step": 19377 + }, + { + "epoch": 10.825698324022346, + "grad_norm": 0.3628762364387512, + "learning_rate": 0.00046033613445378154, + "loss": 0.3663, + "step": 19378 + }, + { + "epoch": 10.826256983240224, + "grad_norm": 0.9737803936004639, + "learning_rate": 0.0004603081232492997, + "loss": 0.3428, + "step": 19379 + }, + { + "epoch": 10.8268156424581, + "grad_norm": 0.43125107884407043, + "learning_rate": 0.00046028011204481795, + "loss": 0.4197, + "step": 19380 + }, + { + "epoch": 10.827374301675977, + "grad_norm": 0.8552989959716797, + "learning_rate": 0.00046025210084033616, + "loss": 0.4036, + "step": 19381 + }, + { + "epoch": 10.827932960893854, + "grad_norm": 0.3894343078136444, + "learning_rate": 0.0004602240896358543, + "loss": 0.3402, + "step": 19382 + }, + { + "epoch": 10.828491620111732, + "grad_norm": 2.2663543224334717, + "learning_rate": 0.00046019607843137257, + "loss": 0.3912, + "step": 19383 + }, + { + "epoch": 10.829050279329609, + "grad_norm": 0.489553838968277, + "learning_rate": 0.0004601680672268908, + "loss": 0.4027, + "step": 19384 + }, + { + "epoch": 10.829608938547485, + "grad_norm": 0.4080771207809448, + "learning_rate": 0.000460140056022409, + "loss": 0.3698, + "step": 19385 + }, + { + "epoch": 10.830167597765364, + "grad_norm": 0.4230101704597473, + "learning_rate": 0.0004601120448179272, + "loss": 0.4366, + "step": 19386 + }, + { + "epoch": 10.83072625698324, + "grad_norm": 0.5856796503067017, + "learning_rate": 0.00046008403361344534, + "loss": 0.5126, + "step": 19387 + }, + { + "epoch": 10.831284916201117, + "grad_norm": 0.4532826840877533, + "learning_rate": 0.0004600560224089636, + "loss": 0.4678, + "step": 19388 + }, + { + "epoch": 10.831843575418995, + "grad_norm": 1.1305073499679565, + "learning_rate": 0.0004600280112044818, + "loss": 0.3604, + "step": 19389 + }, + { + "epoch": 10.832402234636872, + "grad_norm": 8.677497863769531, + "learning_rate": 0.00046, + "loss": 0.4249, + "step": 19390 + }, + { + "epoch": 10.832960893854748, + "grad_norm": 0.4060731530189514, + "learning_rate": 0.0004599719887955182, + "loss": 0.4867, + "step": 19391 + }, + { + "epoch": 10.833519553072625, + "grad_norm": 0.5433281660079956, + "learning_rate": 0.0004599439775910364, + "loss": 0.4187, + "step": 19392 + }, + { + "epoch": 10.834078212290503, + "grad_norm": 0.7483600378036499, + "learning_rate": 0.00045991596638655463, + "loss": 0.4404, + "step": 19393 + }, + { + "epoch": 10.83463687150838, + "grad_norm": 0.4292261004447937, + "learning_rate": 0.00045988795518207284, + "loss": 0.4674, + "step": 19394 + }, + { + "epoch": 10.835195530726256, + "grad_norm": 3.1649255752563477, + "learning_rate": 0.00045985994397759104, + "loss": 0.3724, + "step": 19395 + }, + { + "epoch": 10.835754189944135, + "grad_norm": 1.2901256084442139, + "learning_rate": 0.00045983193277310925, + "loss": 0.4943, + "step": 19396 + }, + { + "epoch": 10.836312849162011, + "grad_norm": 0.6612398028373718, + "learning_rate": 0.00045980392156862746, + "loss": 0.3116, + "step": 19397 + }, + { + "epoch": 10.836871508379888, + "grad_norm": 0.5664133429527283, + "learning_rate": 0.00045977591036414566, + "loss": 0.5129, + "step": 19398 + }, + { + "epoch": 10.837430167597766, + "grad_norm": 0.501022458076477, + "learning_rate": 0.00045974789915966387, + "loss": 0.4489, + "step": 19399 + }, + { + "epoch": 10.837988826815643, + "grad_norm": 0.6660512685775757, + "learning_rate": 0.00045971988795518213, + "loss": 0.3566, + "step": 19400 + }, + { + "epoch": 10.83854748603352, + "grad_norm": 0.5224534869194031, + "learning_rate": 0.0004596918767507003, + "loss": 0.4298, + "step": 19401 + }, + { + "epoch": 10.839106145251396, + "grad_norm": 0.6409733891487122, + "learning_rate": 0.0004596638655462185, + "loss": 0.4639, + "step": 19402 + }, + { + "epoch": 10.839664804469274, + "grad_norm": 0.5712777972221375, + "learning_rate": 0.0004596358543417367, + "loss": 0.4727, + "step": 19403 + }, + { + "epoch": 10.84022346368715, + "grad_norm": 0.7224316000938416, + "learning_rate": 0.0004596078431372549, + "loss": 0.4512, + "step": 19404 + }, + { + "epoch": 10.840782122905027, + "grad_norm": 1.6012191772460938, + "learning_rate": 0.00045957983193277316, + "loss": 0.6956, + "step": 19405 + }, + { + "epoch": 10.841340782122906, + "grad_norm": 0.41232651472091675, + "learning_rate": 0.0004595518207282913, + "loss": 0.3873, + "step": 19406 + }, + { + "epoch": 10.841899441340782, + "grad_norm": 0.4890575408935547, + "learning_rate": 0.0004595238095238095, + "loss": 0.3495, + "step": 19407 + }, + { + "epoch": 10.842458100558659, + "grad_norm": 0.3697237968444824, + "learning_rate": 0.0004594957983193278, + "loss": 0.3459, + "step": 19408 + }, + { + "epoch": 10.843016759776535, + "grad_norm": 0.40435993671417236, + "learning_rate": 0.00045946778711484593, + "loss": 0.384, + "step": 19409 + }, + { + "epoch": 10.843575418994414, + "grad_norm": 0.6315296292304993, + "learning_rate": 0.0004594397759103642, + "loss": 0.4325, + "step": 19410 + }, + { + "epoch": 10.84413407821229, + "grad_norm": 1.816961646080017, + "learning_rate": 0.00045941176470588234, + "loss": 0.4791, + "step": 19411 + }, + { + "epoch": 10.844692737430167, + "grad_norm": 1.417698860168457, + "learning_rate": 0.00045938375350140055, + "loss": 0.44, + "step": 19412 + }, + { + "epoch": 10.845251396648045, + "grad_norm": 0.5425516366958618, + "learning_rate": 0.0004593557422969188, + "loss": 0.5434, + "step": 19413 + }, + { + "epoch": 10.845810055865922, + "grad_norm": 0.5822159051895142, + "learning_rate": 0.00045932773109243696, + "loss": 0.3976, + "step": 19414 + }, + { + "epoch": 10.846368715083798, + "grad_norm": 0.5106617212295532, + "learning_rate": 0.0004592997198879552, + "loss": 0.4486, + "step": 19415 + }, + { + "epoch": 10.846927374301677, + "grad_norm": 0.5801783204078674, + "learning_rate": 0.0004592717086834734, + "loss": 0.3663, + "step": 19416 + }, + { + "epoch": 10.847486033519553, + "grad_norm": 0.5556785464286804, + "learning_rate": 0.0004592436974789916, + "loss": 0.4236, + "step": 19417 + }, + { + "epoch": 10.84804469273743, + "grad_norm": 0.39270102977752686, + "learning_rate": 0.00045921568627450984, + "loss": 0.4367, + "step": 19418 + }, + { + "epoch": 10.848603351955306, + "grad_norm": 0.7479550838470459, + "learning_rate": 0.000459187675070028, + "loss": 0.4018, + "step": 19419 + }, + { + "epoch": 10.849162011173185, + "grad_norm": 0.3583800792694092, + "learning_rate": 0.00045915966386554625, + "loss": 0.3817, + "step": 19420 + }, + { + "epoch": 10.849720670391061, + "grad_norm": 0.5465046167373657, + "learning_rate": 0.00045913165266106445, + "loss": 0.4551, + "step": 19421 + }, + { + "epoch": 10.850279329608938, + "grad_norm": 0.46662572026252747, + "learning_rate": 0.0004591036414565826, + "loss": 0.42, + "step": 19422 + }, + { + "epoch": 10.850837988826816, + "grad_norm": 0.5551797151565552, + "learning_rate": 0.00045907563025210087, + "loss": 0.3999, + "step": 19423 + }, + { + "epoch": 10.851396648044693, + "grad_norm": 0.4343171715736389, + "learning_rate": 0.00045904761904761907, + "loss": 0.367, + "step": 19424 + }, + { + "epoch": 10.85195530726257, + "grad_norm": 0.5589561462402344, + "learning_rate": 0.0004590196078431373, + "loss": 0.4261, + "step": 19425 + }, + { + "epoch": 10.852513966480448, + "grad_norm": 0.7213286757469177, + "learning_rate": 0.0004589915966386555, + "loss": 0.4521, + "step": 19426 + }, + { + "epoch": 10.853072625698324, + "grad_norm": 0.4499867856502533, + "learning_rate": 0.00045896358543417364, + "loss": 0.4668, + "step": 19427 + }, + { + "epoch": 10.8536312849162, + "grad_norm": 1.3919888734817505, + "learning_rate": 0.0004589355742296919, + "loss": 0.4661, + "step": 19428 + }, + { + "epoch": 10.854189944134077, + "grad_norm": 1.2904208898544312, + "learning_rate": 0.0004589075630252101, + "loss": 0.3783, + "step": 19429 + }, + { + "epoch": 10.854748603351956, + "grad_norm": 1.404299020767212, + "learning_rate": 0.0004588795518207283, + "loss": 0.3654, + "step": 19430 + }, + { + "epoch": 10.855307262569832, + "grad_norm": 0.7671793103218079, + "learning_rate": 0.0004588515406162465, + "loss": 0.5028, + "step": 19431 + }, + { + "epoch": 10.855865921787709, + "grad_norm": 0.6422199606895447, + "learning_rate": 0.0004588235294117647, + "loss": 0.3595, + "step": 19432 + }, + { + "epoch": 10.856424581005587, + "grad_norm": 0.48911821842193604, + "learning_rate": 0.0004587955182072829, + "loss": 0.452, + "step": 19433 + }, + { + "epoch": 10.856983240223464, + "grad_norm": 0.3717813491821289, + "learning_rate": 0.00045876750700280113, + "loss": 0.3661, + "step": 19434 + }, + { + "epoch": 10.85754189944134, + "grad_norm": 0.5459839105606079, + "learning_rate": 0.00045873949579831934, + "loss": 0.3389, + "step": 19435 + }, + { + "epoch": 10.858100558659217, + "grad_norm": 0.5253312587738037, + "learning_rate": 0.00045871148459383754, + "loss": 0.5371, + "step": 19436 + }, + { + "epoch": 10.858659217877095, + "grad_norm": 0.44571197032928467, + "learning_rate": 0.00045868347338935575, + "loss": 0.5183, + "step": 19437 + }, + { + "epoch": 10.859217877094972, + "grad_norm": 0.3728795647621155, + "learning_rate": 0.00045865546218487396, + "loss": 0.3295, + "step": 19438 + }, + { + "epoch": 10.859776536312848, + "grad_norm": 0.5091723203659058, + "learning_rate": 0.00045862745098039216, + "loss": 0.4881, + "step": 19439 + }, + { + "epoch": 10.860335195530727, + "grad_norm": 12.88525390625, + "learning_rate": 0.0004585994397759104, + "loss": 0.3995, + "step": 19440 + }, + { + "epoch": 10.860893854748603, + "grad_norm": 0.40066832304000854, + "learning_rate": 0.0004585714285714286, + "loss": 0.4227, + "step": 19441 + }, + { + "epoch": 10.86145251396648, + "grad_norm": 1.599927544593811, + "learning_rate": 0.0004585434173669468, + "loss": 0.4424, + "step": 19442 + }, + { + "epoch": 10.862011173184358, + "grad_norm": 4.741647720336914, + "learning_rate": 0.000458515406162465, + "loss": 0.4309, + "step": 19443 + }, + { + "epoch": 10.862569832402235, + "grad_norm": 0.7968552708625793, + "learning_rate": 0.0004584873949579832, + "loss": 0.5603, + "step": 19444 + }, + { + "epoch": 10.863128491620111, + "grad_norm": 0.5232609510421753, + "learning_rate": 0.00045845938375350145, + "loss": 0.4461, + "step": 19445 + }, + { + "epoch": 10.86368715083799, + "grad_norm": 0.892772376537323, + "learning_rate": 0.0004584313725490196, + "loss": 0.3901, + "step": 19446 + }, + { + "epoch": 10.864245810055866, + "grad_norm": 0.7650253772735596, + "learning_rate": 0.0004584033613445378, + "loss": 0.4357, + "step": 19447 + }, + { + "epoch": 10.864804469273743, + "grad_norm": 0.8138840794563293, + "learning_rate": 0.00045837535014005607, + "loss": 0.4213, + "step": 19448 + }, + { + "epoch": 10.86536312849162, + "grad_norm": 0.7876865863800049, + "learning_rate": 0.0004583473389355742, + "loss": 0.4421, + "step": 19449 + }, + { + "epoch": 10.865921787709498, + "grad_norm": 0.5165186524391174, + "learning_rate": 0.0004583193277310925, + "loss": 0.408, + "step": 19450 + }, + { + "epoch": 10.866480446927374, + "grad_norm": 0.8853394985198975, + "learning_rate": 0.00045829131652661063, + "loss": 0.5033, + "step": 19451 + }, + { + "epoch": 10.867039106145251, + "grad_norm": 0.5527562499046326, + "learning_rate": 0.00045826330532212884, + "loss": 0.4613, + "step": 19452 + }, + { + "epoch": 10.86759776536313, + "grad_norm": 0.45241275429725647, + "learning_rate": 0.0004582352941176471, + "loss": 0.4677, + "step": 19453 + }, + { + "epoch": 10.868156424581006, + "grad_norm": 0.39702826738357544, + "learning_rate": 0.00045820728291316525, + "loss": 0.3307, + "step": 19454 + }, + { + "epoch": 10.868715083798882, + "grad_norm": 0.9053224325180054, + "learning_rate": 0.0004581792717086835, + "loss": 0.3657, + "step": 19455 + }, + { + "epoch": 10.869273743016759, + "grad_norm": 2.9940106868743896, + "learning_rate": 0.0004581512605042017, + "loss": 0.4027, + "step": 19456 + }, + { + "epoch": 10.869832402234637, + "grad_norm": 0.6413319706916809, + "learning_rate": 0.00045812324929971987, + "loss": 0.4932, + "step": 19457 + }, + { + "epoch": 10.870391061452514, + "grad_norm": 0.735305666923523, + "learning_rate": 0.00045809523809523813, + "loss": 0.5561, + "step": 19458 + }, + { + "epoch": 10.87094972067039, + "grad_norm": 0.5540690422058105, + "learning_rate": 0.0004580672268907563, + "loss": 0.3939, + "step": 19459 + }, + { + "epoch": 10.871508379888269, + "grad_norm": 0.48930805921554565, + "learning_rate": 0.00045803921568627454, + "loss": 0.3325, + "step": 19460 + }, + { + "epoch": 10.872067039106145, + "grad_norm": 0.5332046747207642, + "learning_rate": 0.00045801120448179275, + "loss": 0.4251, + "step": 19461 + }, + { + "epoch": 10.872625698324022, + "grad_norm": 0.4967195689678192, + "learning_rate": 0.0004579831932773109, + "loss": 0.3769, + "step": 19462 + }, + { + "epoch": 10.8731843575419, + "grad_norm": 0.8713586330413818, + "learning_rate": 0.00045795518207282916, + "loss": 0.324, + "step": 19463 + }, + { + "epoch": 10.873743016759777, + "grad_norm": 0.5224945545196533, + "learning_rate": 0.00045792717086834737, + "loss": 0.4552, + "step": 19464 + }, + { + "epoch": 10.874301675977653, + "grad_norm": 0.5018672943115234, + "learning_rate": 0.00045789915966386557, + "loss": 0.4526, + "step": 19465 + }, + { + "epoch": 10.87486033519553, + "grad_norm": 0.4567224681377411, + "learning_rate": 0.0004578711484593838, + "loss": 0.518, + "step": 19466 + }, + { + "epoch": 10.875418994413408, + "grad_norm": 0.6916138529777527, + "learning_rate": 0.00045784313725490193, + "loss": 0.4227, + "step": 19467 + }, + { + "epoch": 10.875977653631285, + "grad_norm": 0.40818580985069275, + "learning_rate": 0.0004578151260504202, + "loss": 0.4148, + "step": 19468 + }, + { + "epoch": 10.876536312849161, + "grad_norm": 0.5612096786499023, + "learning_rate": 0.0004577871148459384, + "loss": 0.4683, + "step": 19469 + }, + { + "epoch": 10.87709497206704, + "grad_norm": 0.4822520315647125, + "learning_rate": 0.00045775910364145655, + "loss": 0.4671, + "step": 19470 + }, + { + "epoch": 10.877653631284916, + "grad_norm": 0.7350682020187378, + "learning_rate": 0.0004577310924369748, + "loss": 0.4428, + "step": 19471 + }, + { + "epoch": 10.878212290502793, + "grad_norm": 0.4277208149433136, + "learning_rate": 0.000457703081232493, + "loss": 0.4075, + "step": 19472 + }, + { + "epoch": 10.878770949720671, + "grad_norm": 0.9943501353263855, + "learning_rate": 0.0004576750700280112, + "loss": 0.4584, + "step": 19473 + }, + { + "epoch": 10.879329608938548, + "grad_norm": 2.121504545211792, + "learning_rate": 0.0004576470588235294, + "loss": 0.3357, + "step": 19474 + }, + { + "epoch": 10.879888268156424, + "grad_norm": 0.5127196311950684, + "learning_rate": 0.0004576190476190476, + "loss": 0.3678, + "step": 19475 + }, + { + "epoch": 10.880446927374301, + "grad_norm": 0.5205025672912598, + "learning_rate": 0.00045759103641456584, + "loss": 0.3766, + "step": 19476 + }, + { + "epoch": 10.88100558659218, + "grad_norm": 0.768817126750946, + "learning_rate": 0.00045756302521008404, + "loss": 0.3878, + "step": 19477 + }, + { + "epoch": 10.881564245810056, + "grad_norm": 0.3431500792503357, + "learning_rate": 0.00045753501400560225, + "loss": 0.3893, + "step": 19478 + }, + { + "epoch": 10.882122905027932, + "grad_norm": 0.3882765769958496, + "learning_rate": 0.00045750700280112046, + "loss": 0.3951, + "step": 19479 + }, + { + "epoch": 10.88268156424581, + "grad_norm": 1.219213604927063, + "learning_rate": 0.00045747899159663866, + "loss": 0.4234, + "step": 19480 + }, + { + "epoch": 10.883240223463687, + "grad_norm": 0.40456315875053406, + "learning_rate": 0.00045745098039215687, + "loss": 0.4455, + "step": 19481 + }, + { + "epoch": 10.883798882681564, + "grad_norm": 0.43122079968452454, + "learning_rate": 0.0004574229691876751, + "loss": 0.4727, + "step": 19482 + }, + { + "epoch": 10.88435754189944, + "grad_norm": 0.5437071323394775, + "learning_rate": 0.0004573949579831933, + "loss": 0.3954, + "step": 19483 + }, + { + "epoch": 10.884916201117319, + "grad_norm": 0.4350295662879944, + "learning_rate": 0.0004573669467787115, + "loss": 0.4002, + "step": 19484 + }, + { + "epoch": 10.885474860335195, + "grad_norm": 0.5720260143280029, + "learning_rate": 0.0004573389355742297, + "loss": 0.5156, + "step": 19485 + }, + { + "epoch": 10.886033519553072, + "grad_norm": 0.686155378818512, + "learning_rate": 0.0004573109243697479, + "loss": 0.6256, + "step": 19486 + }, + { + "epoch": 10.88659217877095, + "grad_norm": 0.7230165004730225, + "learning_rate": 0.0004572829131652661, + "loss": 0.5493, + "step": 19487 + }, + { + "epoch": 10.887150837988827, + "grad_norm": 0.4863119125366211, + "learning_rate": 0.00045725490196078436, + "loss": 0.5854, + "step": 19488 + }, + { + "epoch": 10.887709497206703, + "grad_norm": 1.0747337341308594, + "learning_rate": 0.0004572268907563025, + "loss": 0.5878, + "step": 19489 + }, + { + "epoch": 10.888268156424582, + "grad_norm": 0.9444344639778137, + "learning_rate": 0.0004571988795518207, + "loss": 0.4221, + "step": 19490 + }, + { + "epoch": 10.888826815642458, + "grad_norm": 0.5995955467224121, + "learning_rate": 0.00045717086834733893, + "loss": 0.5155, + "step": 19491 + }, + { + "epoch": 10.889385474860335, + "grad_norm": 0.9069985747337341, + "learning_rate": 0.00045714285714285713, + "loss": 0.4391, + "step": 19492 + }, + { + "epoch": 10.889944134078211, + "grad_norm": 0.4465606212615967, + "learning_rate": 0.0004571148459383754, + "loss": 0.5045, + "step": 19493 + }, + { + "epoch": 10.89050279329609, + "grad_norm": 0.38655421137809753, + "learning_rate": 0.00045708683473389355, + "loss": 0.3466, + "step": 19494 + }, + { + "epoch": 10.891061452513966, + "grad_norm": 1.4529123306274414, + "learning_rate": 0.00045705882352941175, + "loss": 0.4651, + "step": 19495 + }, + { + "epoch": 10.891620111731843, + "grad_norm": 0.7777120471000671, + "learning_rate": 0.00045703081232493, + "loss": 0.4409, + "step": 19496 + }, + { + "epoch": 10.892178770949721, + "grad_norm": 0.43112510442733765, + "learning_rate": 0.00045700280112044816, + "loss": 0.5001, + "step": 19497 + }, + { + "epoch": 10.892737430167598, + "grad_norm": 0.581341564655304, + "learning_rate": 0.0004569747899159664, + "loss": 0.527, + "step": 19498 + }, + { + "epoch": 10.893296089385474, + "grad_norm": 0.5028506517410278, + "learning_rate": 0.0004569467787114846, + "loss": 0.494, + "step": 19499 + }, + { + "epoch": 10.893854748603353, + "grad_norm": 0.3946606516838074, + "learning_rate": 0.0004569187675070028, + "loss": 0.3967, + "step": 19500 + }, + { + "epoch": 10.893854748603353, + "eval_cer": 0.0880767678090188, + "eval_loss": 0.3288307785987854, + "eval_runtime": 55.5374, + "eval_samples_per_second": 81.711, + "eval_steps_per_second": 5.114, + "eval_wer": 0.3508561213373928, + "step": 19500 + }, + { + "epoch": 10.89441340782123, + "grad_norm": 0.41130995750427246, + "learning_rate": 0.00045689075630252104, + "loss": 0.3713, + "step": 19501 + }, + { + "epoch": 10.894972067039106, + "grad_norm": 0.5164546966552734, + "learning_rate": 0.0004568627450980392, + "loss": 0.3377, + "step": 19502 + }, + { + "epoch": 10.895530726256982, + "grad_norm": 0.5026293992996216, + "learning_rate": 0.00045683473389355745, + "loss": 0.4165, + "step": 19503 + }, + { + "epoch": 10.89608938547486, + "grad_norm": 0.49078041315078735, + "learning_rate": 0.00045680672268907566, + "loss": 0.395, + "step": 19504 + }, + { + "epoch": 10.896648044692737, + "grad_norm": 0.43103906512260437, + "learning_rate": 0.0004567787114845938, + "loss": 0.4234, + "step": 19505 + }, + { + "epoch": 10.897206703910614, + "grad_norm": 0.5015199780464172, + "learning_rate": 0.00045675070028011207, + "loss": 0.6125, + "step": 19506 + }, + { + "epoch": 10.897765363128492, + "grad_norm": 0.401161253452301, + "learning_rate": 0.0004567226890756302, + "loss": 0.4157, + "step": 19507 + }, + { + "epoch": 10.898324022346369, + "grad_norm": 2.0513744354248047, + "learning_rate": 0.0004566946778711485, + "loss": 0.5296, + "step": 19508 + }, + { + "epoch": 10.898882681564245, + "grad_norm": 0.6313964128494263, + "learning_rate": 0.0004566666666666667, + "loss": 0.27, + "step": 19509 + }, + { + "epoch": 10.899441340782122, + "grad_norm": 0.3549821972846985, + "learning_rate": 0.00045663865546218484, + "loss": 0.4157, + "step": 19510 + }, + { + "epoch": 10.9, + "grad_norm": 0.33573731780052185, + "learning_rate": 0.0004566106442577031, + "loss": 0.2892, + "step": 19511 + }, + { + "epoch": 10.900558659217877, + "grad_norm": 0.8451279401779175, + "learning_rate": 0.0004565826330532213, + "loss": 0.4456, + "step": 19512 + }, + { + "epoch": 10.901117318435753, + "grad_norm": 0.5604871511459351, + "learning_rate": 0.0004565546218487395, + "loss": 0.5025, + "step": 19513 + }, + { + "epoch": 10.901675977653632, + "grad_norm": 1.018762230873108, + "learning_rate": 0.0004565266106442577, + "loss": 0.4146, + "step": 19514 + }, + { + "epoch": 10.902234636871508, + "grad_norm": 0.459634393453598, + "learning_rate": 0.00045649859943977587, + "loss": 0.4496, + "step": 19515 + }, + { + "epoch": 10.902793296089385, + "grad_norm": 0.47295621037483215, + "learning_rate": 0.00045647058823529413, + "loss": 0.523, + "step": 19516 + }, + { + "epoch": 10.903351955307263, + "grad_norm": 0.4551648795604706, + "learning_rate": 0.00045644257703081234, + "loss": 0.5013, + "step": 19517 + }, + { + "epoch": 10.90391061452514, + "grad_norm": 0.41703560948371887, + "learning_rate": 0.00045641456582633054, + "loss": 0.3904, + "step": 19518 + }, + { + "epoch": 10.904469273743016, + "grad_norm": 7.310099124908447, + "learning_rate": 0.00045638655462184875, + "loss": 0.5181, + "step": 19519 + }, + { + "epoch": 10.905027932960895, + "grad_norm": 0.4674391746520996, + "learning_rate": 0.00045635854341736696, + "loss": 0.3536, + "step": 19520 + }, + { + "epoch": 10.905586592178771, + "grad_norm": 0.5428652167320251, + "learning_rate": 0.00045633053221288516, + "loss": 0.5864, + "step": 19521 + }, + { + "epoch": 10.906145251396648, + "grad_norm": 0.9354425668716431, + "learning_rate": 0.00045630252100840337, + "loss": 0.4639, + "step": 19522 + }, + { + "epoch": 10.906703910614524, + "grad_norm": 0.4676138162612915, + "learning_rate": 0.0004562745098039216, + "loss": 0.3484, + "step": 19523 + }, + { + "epoch": 10.907262569832403, + "grad_norm": 0.5535160899162292, + "learning_rate": 0.0004562464985994398, + "loss": 0.4784, + "step": 19524 + }, + { + "epoch": 10.90782122905028, + "grad_norm": 0.7905294299125671, + "learning_rate": 0.000456218487394958, + "loss": 0.4046, + "step": 19525 + }, + { + "epoch": 10.908379888268156, + "grad_norm": 0.8160191178321838, + "learning_rate": 0.0004561904761904762, + "loss": 0.4403, + "step": 19526 + }, + { + "epoch": 10.908938547486034, + "grad_norm": 0.6045393943786621, + "learning_rate": 0.0004561624649859944, + "loss": 0.3802, + "step": 19527 + }, + { + "epoch": 10.90949720670391, + "grad_norm": 0.9762187004089355, + "learning_rate": 0.00045613445378151266, + "loss": 0.4623, + "step": 19528 + }, + { + "epoch": 10.910055865921787, + "grad_norm": 0.8041545152664185, + "learning_rate": 0.0004561064425770308, + "loss": 0.3273, + "step": 19529 + }, + { + "epoch": 10.910614525139664, + "grad_norm": 2.0979504585266113, + "learning_rate": 0.000456078431372549, + "loss": 0.3865, + "step": 19530 + }, + { + "epoch": 10.911173184357542, + "grad_norm": 0.32649150490760803, + "learning_rate": 0.0004560504201680672, + "loss": 0.3553, + "step": 19531 + }, + { + "epoch": 10.911731843575419, + "grad_norm": 0.44879236817359924, + "learning_rate": 0.00045602240896358543, + "loss": 0.3794, + "step": 19532 + }, + { + "epoch": 10.912290502793295, + "grad_norm": 0.5523907542228699, + "learning_rate": 0.0004559943977591037, + "loss": 0.4494, + "step": 19533 + }, + { + "epoch": 10.912849162011174, + "grad_norm": 0.7766351699829102, + "learning_rate": 0.00045596638655462184, + "loss": 0.4794, + "step": 19534 + }, + { + "epoch": 10.91340782122905, + "grad_norm": 0.47743871808052063, + "learning_rate": 0.00045593837535014005, + "loss": 0.4017, + "step": 19535 + }, + { + "epoch": 10.913966480446927, + "grad_norm": 0.4808497726917267, + "learning_rate": 0.0004559103641456583, + "loss": 0.5614, + "step": 19536 + }, + { + "epoch": 10.914525139664804, + "grad_norm": 0.6021647453308105, + "learning_rate": 0.00045588235294117646, + "loss": 0.3827, + "step": 19537 + }, + { + "epoch": 10.915083798882682, + "grad_norm": 0.435291588306427, + "learning_rate": 0.0004558543417366947, + "loss": 0.4085, + "step": 19538 + }, + { + "epoch": 10.915642458100558, + "grad_norm": 0.38414716720581055, + "learning_rate": 0.00045582633053221287, + "loss": 0.3872, + "step": 19539 + }, + { + "epoch": 10.916201117318435, + "grad_norm": 1.3766226768493652, + "learning_rate": 0.0004557983193277311, + "loss": 0.4004, + "step": 19540 + }, + { + "epoch": 10.916759776536313, + "grad_norm": 0.4242534637451172, + "learning_rate": 0.00045577030812324934, + "loss": 0.377, + "step": 19541 + }, + { + "epoch": 10.91731843575419, + "grad_norm": 0.5262918472290039, + "learning_rate": 0.0004557422969187675, + "loss": 0.4333, + "step": 19542 + }, + { + "epoch": 10.917877094972066, + "grad_norm": 0.7887991070747375, + "learning_rate": 0.00045571428571428575, + "loss": 0.3603, + "step": 19543 + }, + { + "epoch": 10.918435754189945, + "grad_norm": 0.7010632753372192, + "learning_rate": 0.00045568627450980395, + "loss": 0.4901, + "step": 19544 + }, + { + "epoch": 10.918994413407821, + "grad_norm": 1.2077034711837769, + "learning_rate": 0.0004556582633053221, + "loss": 0.348, + "step": 19545 + }, + { + "epoch": 10.919553072625698, + "grad_norm": 0.41414928436279297, + "learning_rate": 0.00045563025210084037, + "loss": 0.3163, + "step": 19546 + }, + { + "epoch": 10.920111731843576, + "grad_norm": 0.4330168068408966, + "learning_rate": 0.0004556022408963585, + "loss": 0.4114, + "step": 19547 + }, + { + "epoch": 10.920670391061453, + "grad_norm": 0.7332313060760498, + "learning_rate": 0.0004555742296918768, + "loss": 0.4734, + "step": 19548 + }, + { + "epoch": 10.92122905027933, + "grad_norm": 0.815731942653656, + "learning_rate": 0.000455546218487395, + "loss": 0.4167, + "step": 19549 + }, + { + "epoch": 10.921787709497206, + "grad_norm": 0.41119226813316345, + "learning_rate": 0.00045551820728291314, + "loss": 0.4512, + "step": 19550 + }, + { + "epoch": 10.922346368715084, + "grad_norm": 0.38646405935287476, + "learning_rate": 0.0004554901960784314, + "loss": 0.3779, + "step": 19551 + }, + { + "epoch": 10.922905027932961, + "grad_norm": 2.6060380935668945, + "learning_rate": 0.0004554621848739496, + "loss": 0.4122, + "step": 19552 + }, + { + "epoch": 10.923463687150837, + "grad_norm": 0.403250515460968, + "learning_rate": 0.0004554341736694678, + "loss": 0.4977, + "step": 19553 + }, + { + "epoch": 10.924022346368716, + "grad_norm": 3.6118671894073486, + "learning_rate": 0.000455406162464986, + "loss": 0.4153, + "step": 19554 + }, + { + "epoch": 10.924581005586592, + "grad_norm": 0.34593239426612854, + "learning_rate": 0.00045537815126050417, + "loss": 0.4242, + "step": 19555 + }, + { + "epoch": 10.925139664804469, + "grad_norm": 2.0285918712615967, + "learning_rate": 0.0004553501400560224, + "loss": 0.3982, + "step": 19556 + }, + { + "epoch": 10.925698324022346, + "grad_norm": 0.35781940817832947, + "learning_rate": 0.00045532212885154063, + "loss": 0.3475, + "step": 19557 + }, + { + "epoch": 10.926256983240224, + "grad_norm": 0.5869525074958801, + "learning_rate": 0.00045529411764705884, + "loss": 0.4845, + "step": 19558 + }, + { + "epoch": 10.9268156424581, + "grad_norm": 0.390906423330307, + "learning_rate": 0.00045526610644257704, + "loss": 0.4572, + "step": 19559 + }, + { + "epoch": 10.927374301675977, + "grad_norm": 0.6141411662101746, + "learning_rate": 0.00045523809523809525, + "loss": 0.4154, + "step": 19560 + }, + { + "epoch": 10.927932960893855, + "grad_norm": 0.4375908672809601, + "learning_rate": 0.00045521008403361346, + "loss": 0.3124, + "step": 19561 + }, + { + "epoch": 10.928491620111732, + "grad_norm": 0.5885324478149414, + "learning_rate": 0.00045518207282913166, + "loss": 0.5413, + "step": 19562 + }, + { + "epoch": 10.929050279329608, + "grad_norm": 0.5527775287628174, + "learning_rate": 0.0004551540616246499, + "loss": 0.4553, + "step": 19563 + }, + { + "epoch": 10.929608938547487, + "grad_norm": 0.4902518689632416, + "learning_rate": 0.0004551260504201681, + "loss": 0.3416, + "step": 19564 + }, + { + "epoch": 10.930167597765363, + "grad_norm": 0.5091426372528076, + "learning_rate": 0.0004550980392156863, + "loss": 0.4116, + "step": 19565 + }, + { + "epoch": 10.93072625698324, + "grad_norm": 0.5823405385017395, + "learning_rate": 0.0004550700280112045, + "loss": 0.4464, + "step": 19566 + }, + { + "epoch": 10.931284916201117, + "grad_norm": 1.0579520463943481, + "learning_rate": 0.0004550420168067227, + "loss": 0.3851, + "step": 19567 + }, + { + "epoch": 10.931843575418995, + "grad_norm": 0.5456339120864868, + "learning_rate": 0.00045501400560224095, + "loss": 0.4308, + "step": 19568 + }, + { + "epoch": 10.932402234636871, + "grad_norm": 0.4888848662376404, + "learning_rate": 0.0004549859943977591, + "loss": 0.4242, + "step": 19569 + }, + { + "epoch": 10.932960893854748, + "grad_norm": 0.5546827912330627, + "learning_rate": 0.0004549579831932773, + "loss": 0.478, + "step": 19570 + }, + { + "epoch": 10.933519553072626, + "grad_norm": 0.5477078557014465, + "learning_rate": 0.00045492997198879557, + "loss": 0.531, + "step": 19571 + }, + { + "epoch": 10.934078212290503, + "grad_norm": 0.3058905601501465, + "learning_rate": 0.0004549019607843137, + "loss": 0.3468, + "step": 19572 + }, + { + "epoch": 10.93463687150838, + "grad_norm": 1.571875810623169, + "learning_rate": 0.000454873949579832, + "loss": 0.4637, + "step": 19573 + }, + { + "epoch": 10.935195530726258, + "grad_norm": 0.4084053337574005, + "learning_rate": 0.00045484593837535013, + "loss": 0.4177, + "step": 19574 + }, + { + "epoch": 10.935754189944134, + "grad_norm": 0.48023900389671326, + "learning_rate": 0.00045481792717086834, + "loss": 0.4431, + "step": 19575 + }, + { + "epoch": 10.936312849162011, + "grad_norm": 0.36626118421554565, + "learning_rate": 0.0004547899159663866, + "loss": 0.4176, + "step": 19576 + }, + { + "epoch": 10.936871508379888, + "grad_norm": 0.6702055335044861, + "learning_rate": 0.00045476190476190475, + "loss": 0.4911, + "step": 19577 + }, + { + "epoch": 10.937430167597766, + "grad_norm": 0.6162322759628296, + "learning_rate": 0.000454733893557423, + "loss": 0.4082, + "step": 19578 + }, + { + "epoch": 10.937988826815642, + "grad_norm": 4.33035135269165, + "learning_rate": 0.0004547058823529412, + "loss": 0.355, + "step": 19579 + }, + { + "epoch": 10.938547486033519, + "grad_norm": 0.5423091650009155, + "learning_rate": 0.00045467787114845937, + "loss": 0.4879, + "step": 19580 + }, + { + "epoch": 10.939106145251397, + "grad_norm": 1.243164300918579, + "learning_rate": 0.00045464985994397763, + "loss": 0.3925, + "step": 19581 + }, + { + "epoch": 10.939664804469274, + "grad_norm": 0.675317645072937, + "learning_rate": 0.0004546218487394958, + "loss": 0.5342, + "step": 19582 + }, + { + "epoch": 10.94022346368715, + "grad_norm": 0.5734110474586487, + "learning_rate": 0.000454593837535014, + "loss": 0.4463, + "step": 19583 + }, + { + "epoch": 10.940782122905027, + "grad_norm": 0.4258284866809845, + "learning_rate": 0.00045456582633053225, + "loss": 0.3748, + "step": 19584 + }, + { + "epoch": 10.941340782122905, + "grad_norm": 2.7716588973999023, + "learning_rate": 0.0004545378151260504, + "loss": 0.4513, + "step": 19585 + }, + { + "epoch": 10.941899441340782, + "grad_norm": 0.9812430739402771, + "learning_rate": 0.00045450980392156866, + "loss": 0.3437, + "step": 19586 + }, + { + "epoch": 10.942458100558659, + "grad_norm": 0.4784356951713562, + "learning_rate": 0.00045448179271708687, + "loss": 0.4679, + "step": 19587 + }, + { + "epoch": 10.943016759776537, + "grad_norm": 0.6685768961906433, + "learning_rate": 0.000454453781512605, + "loss": 0.5758, + "step": 19588 + }, + { + "epoch": 10.943575418994413, + "grad_norm": 5.183349132537842, + "learning_rate": 0.0004544257703081233, + "loss": 0.4057, + "step": 19589 + }, + { + "epoch": 10.94413407821229, + "grad_norm": 0.5789082646369934, + "learning_rate": 0.00045439775910364143, + "loss": 0.5449, + "step": 19590 + }, + { + "epoch": 10.944692737430168, + "grad_norm": 1.1220394372940063, + "learning_rate": 0.0004543697478991597, + "loss": 0.3977, + "step": 19591 + }, + { + "epoch": 10.945251396648045, + "grad_norm": 0.7104514241218567, + "learning_rate": 0.0004543417366946779, + "loss": 0.4354, + "step": 19592 + }, + { + "epoch": 10.945810055865921, + "grad_norm": 1.457007646560669, + "learning_rate": 0.00045431372549019605, + "loss": 0.4375, + "step": 19593 + }, + { + "epoch": 10.946368715083798, + "grad_norm": 0.5466610789299011, + "learning_rate": 0.0004542857142857143, + "loss": 0.4628, + "step": 19594 + }, + { + "epoch": 10.946927374301676, + "grad_norm": 0.7462083697319031, + "learning_rate": 0.0004542577030812325, + "loss": 0.4249, + "step": 19595 + }, + { + "epoch": 10.947486033519553, + "grad_norm": 0.43555358052253723, + "learning_rate": 0.0004542296918767507, + "loss": 0.3507, + "step": 19596 + }, + { + "epoch": 10.94804469273743, + "grad_norm": 0.45313170552253723, + "learning_rate": 0.0004542016806722689, + "loss": 0.431, + "step": 19597 + }, + { + "epoch": 10.948603351955308, + "grad_norm": 1.132649540901184, + "learning_rate": 0.0004541736694677871, + "loss": 0.4257, + "step": 19598 + }, + { + "epoch": 10.949162011173184, + "grad_norm": 0.3683547377586365, + "learning_rate": 0.00045414565826330534, + "loss": 0.4093, + "step": 19599 + }, + { + "epoch": 10.949720670391061, + "grad_norm": 0.6518691778182983, + "learning_rate": 0.00045411764705882354, + "loss": 0.4432, + "step": 19600 + }, + { + "epoch": 10.95027932960894, + "grad_norm": 0.47441366314888, + "learning_rate": 0.00045408963585434175, + "loss": 0.3596, + "step": 19601 + }, + { + "epoch": 10.950837988826816, + "grad_norm": 0.3779560327529907, + "learning_rate": 0.00045406162464985996, + "loss": 0.3943, + "step": 19602 + }, + { + "epoch": 10.951396648044692, + "grad_norm": 0.5354276299476624, + "learning_rate": 0.00045403361344537816, + "loss": 0.4064, + "step": 19603 + }, + { + "epoch": 10.951955307262569, + "grad_norm": 6.9575300216674805, + "learning_rate": 0.00045400560224089637, + "loss": 0.4225, + "step": 19604 + }, + { + "epoch": 10.952513966480447, + "grad_norm": 0.33621883392333984, + "learning_rate": 0.0004539775910364146, + "loss": 0.3562, + "step": 19605 + }, + { + "epoch": 10.953072625698324, + "grad_norm": 1.0441874265670776, + "learning_rate": 0.0004539495798319328, + "loss": 0.5699, + "step": 19606 + }, + { + "epoch": 10.9536312849162, + "grad_norm": 0.7098649144172668, + "learning_rate": 0.000453921568627451, + "loss": 0.5251, + "step": 19607 + }, + { + "epoch": 10.954189944134079, + "grad_norm": 0.6170877814292908, + "learning_rate": 0.0004538935574229692, + "loss": 0.4174, + "step": 19608 + }, + { + "epoch": 10.954748603351955, + "grad_norm": 0.6251637935638428, + "learning_rate": 0.0004538655462184874, + "loss": 0.4187, + "step": 19609 + }, + { + "epoch": 10.955307262569832, + "grad_norm": 0.5573031306266785, + "learning_rate": 0.0004538375350140056, + "loss": 0.447, + "step": 19610 + }, + { + "epoch": 10.955865921787709, + "grad_norm": 2.5758886337280273, + "learning_rate": 0.00045380952380952386, + "loss": 0.3442, + "step": 19611 + }, + { + "epoch": 10.956424581005587, + "grad_norm": 0.5471271276473999, + "learning_rate": 0.000453781512605042, + "loss": 0.4465, + "step": 19612 + }, + { + "epoch": 10.956983240223463, + "grad_norm": 0.5629642605781555, + "learning_rate": 0.0004537535014005602, + "loss": 0.3749, + "step": 19613 + }, + { + "epoch": 10.95754189944134, + "grad_norm": 0.7012425065040588, + "learning_rate": 0.00045372549019607843, + "loss": 0.4484, + "step": 19614 + }, + { + "epoch": 10.958100558659218, + "grad_norm": 0.5014681220054626, + "learning_rate": 0.00045369747899159663, + "loss": 0.3617, + "step": 19615 + }, + { + "epoch": 10.958659217877095, + "grad_norm": 0.4093373119831085, + "learning_rate": 0.0004536694677871149, + "loss": 0.44, + "step": 19616 + }, + { + "epoch": 10.959217877094972, + "grad_norm": 0.44349992275238037, + "learning_rate": 0.00045364145658263305, + "loss": 0.3918, + "step": 19617 + }, + { + "epoch": 10.95977653631285, + "grad_norm": 0.4647926688194275, + "learning_rate": 0.00045361344537815125, + "loss": 0.5832, + "step": 19618 + }, + { + "epoch": 10.960335195530726, + "grad_norm": 0.5177651047706604, + "learning_rate": 0.0004535854341736695, + "loss": 0.3546, + "step": 19619 + }, + { + "epoch": 10.960893854748603, + "grad_norm": 3.884995698928833, + "learning_rate": 0.00045355742296918766, + "loss": 0.4733, + "step": 19620 + }, + { + "epoch": 10.961452513966481, + "grad_norm": 0.3276974558830261, + "learning_rate": 0.0004535294117647059, + "loss": 0.3642, + "step": 19621 + }, + { + "epoch": 10.962011173184358, + "grad_norm": 0.4807997941970825, + "learning_rate": 0.0004535014005602241, + "loss": 0.3803, + "step": 19622 + }, + { + "epoch": 10.962569832402234, + "grad_norm": 0.40378236770629883, + "learning_rate": 0.0004534733893557423, + "loss": 0.4202, + "step": 19623 + }, + { + "epoch": 10.963128491620111, + "grad_norm": 0.7819784879684448, + "learning_rate": 0.00045344537815126054, + "loss": 0.4873, + "step": 19624 + }, + { + "epoch": 10.96368715083799, + "grad_norm": 0.6481210589408875, + "learning_rate": 0.0004534173669467787, + "loss": 0.5176, + "step": 19625 + }, + { + "epoch": 10.964245810055866, + "grad_norm": 0.4645385146141052, + "learning_rate": 0.00045338935574229695, + "loss": 0.3849, + "step": 19626 + }, + { + "epoch": 10.964804469273743, + "grad_norm": 0.40287497639656067, + "learning_rate": 0.00045336134453781516, + "loss": 0.375, + "step": 19627 + }, + { + "epoch": 10.96536312849162, + "grad_norm": 0.507799506187439, + "learning_rate": 0.0004533333333333333, + "loss": 0.4873, + "step": 19628 + }, + { + "epoch": 10.965921787709497, + "grad_norm": 1.1175541877746582, + "learning_rate": 0.00045330532212885157, + "loss": 0.5788, + "step": 19629 + }, + { + "epoch": 10.966480446927374, + "grad_norm": 0.6667212843894958, + "learning_rate": 0.0004532773109243697, + "loss": 0.4776, + "step": 19630 + }, + { + "epoch": 10.96703910614525, + "grad_norm": 0.5277851819992065, + "learning_rate": 0.000453249299719888, + "loss": 0.4856, + "step": 19631 + }, + { + "epoch": 10.967597765363129, + "grad_norm": 1.7395663261413574, + "learning_rate": 0.0004532212885154062, + "loss": 0.4985, + "step": 19632 + }, + { + "epoch": 10.968156424581005, + "grad_norm": 0.7707605957984924, + "learning_rate": 0.00045319327731092434, + "loss": 0.3627, + "step": 19633 + }, + { + "epoch": 10.968715083798882, + "grad_norm": 0.462990403175354, + "learning_rate": 0.0004531652661064426, + "loss": 0.4145, + "step": 19634 + }, + { + "epoch": 10.96927374301676, + "grad_norm": 0.7646647095680237, + "learning_rate": 0.0004531372549019608, + "loss": 0.4486, + "step": 19635 + }, + { + "epoch": 10.969832402234637, + "grad_norm": 0.7143821120262146, + "learning_rate": 0.000453109243697479, + "loss": 0.4108, + "step": 19636 + }, + { + "epoch": 10.970391061452514, + "grad_norm": 0.3491571247577667, + "learning_rate": 0.0004530812324929972, + "loss": 0.3908, + "step": 19637 + }, + { + "epoch": 10.970949720670392, + "grad_norm": 0.5693480968475342, + "learning_rate": 0.00045305322128851537, + "loss": 0.4161, + "step": 19638 + }, + { + "epoch": 10.971508379888268, + "grad_norm": 0.4876122772693634, + "learning_rate": 0.00045302521008403363, + "loss": 0.582, + "step": 19639 + }, + { + "epoch": 10.972067039106145, + "grad_norm": 1.2239868640899658, + "learning_rate": 0.00045299719887955184, + "loss": 0.394, + "step": 19640 + }, + { + "epoch": 10.972625698324022, + "grad_norm": 0.4342758357524872, + "learning_rate": 0.00045296918767507004, + "loss": 0.4103, + "step": 19641 + }, + { + "epoch": 10.9731843575419, + "grad_norm": 0.7984012961387634, + "learning_rate": 0.00045294117647058825, + "loss": 0.5361, + "step": 19642 + }, + { + "epoch": 10.973743016759776, + "grad_norm": 0.4931205213069916, + "learning_rate": 0.00045291316526610646, + "loss": 0.3766, + "step": 19643 + }, + { + "epoch": 10.974301675977653, + "grad_norm": 0.36186158657073975, + "learning_rate": 0.00045288515406162466, + "loss": 0.3871, + "step": 19644 + }, + { + "epoch": 10.974860335195531, + "grad_norm": 0.4351528286933899, + "learning_rate": 0.00045285714285714287, + "loss": 0.4285, + "step": 19645 + }, + { + "epoch": 10.975418994413408, + "grad_norm": 0.44537338614463806, + "learning_rate": 0.0004528291316526611, + "loss": 0.3881, + "step": 19646 + }, + { + "epoch": 10.975977653631285, + "grad_norm": 0.4209382236003876, + "learning_rate": 0.0004528011204481793, + "loss": 0.3895, + "step": 19647 + }, + { + "epoch": 10.976536312849163, + "grad_norm": 3.2156825065612793, + "learning_rate": 0.0004527731092436975, + "loss": 0.4198, + "step": 19648 + }, + { + "epoch": 10.97709497206704, + "grad_norm": 0.36199796199798584, + "learning_rate": 0.0004527450980392157, + "loss": 0.399, + "step": 19649 + }, + { + "epoch": 10.977653631284916, + "grad_norm": 1.9104856252670288, + "learning_rate": 0.0004527170868347339, + "loss": 0.429, + "step": 19650 + }, + { + "epoch": 10.978212290502793, + "grad_norm": 0.6647889614105225, + "learning_rate": 0.00045268907563025216, + "loss": 0.3044, + "step": 19651 + }, + { + "epoch": 10.978770949720671, + "grad_norm": 0.3544287383556366, + "learning_rate": 0.0004526610644257703, + "loss": 0.3189, + "step": 19652 + }, + { + "epoch": 10.979329608938547, + "grad_norm": 0.489487886428833, + "learning_rate": 0.0004526330532212885, + "loss": 0.4892, + "step": 19653 + }, + { + "epoch": 10.979888268156424, + "grad_norm": 0.4223870635032654, + "learning_rate": 0.0004526050420168067, + "loss": 0.4181, + "step": 19654 + }, + { + "epoch": 10.980446927374302, + "grad_norm": 0.6175500750541687, + "learning_rate": 0.00045257703081232493, + "loss": 0.4078, + "step": 19655 + }, + { + "epoch": 10.981005586592179, + "grad_norm": 1.7539653778076172, + "learning_rate": 0.0004525490196078432, + "loss": 0.4003, + "step": 19656 + }, + { + "epoch": 10.981564245810056, + "grad_norm": 0.4580148160457611, + "learning_rate": 0.00045252100840336134, + "loss": 0.5029, + "step": 19657 + }, + { + "epoch": 10.982122905027932, + "grad_norm": 14.965265274047852, + "learning_rate": 0.00045249299719887955, + "loss": 0.4467, + "step": 19658 + }, + { + "epoch": 10.98268156424581, + "grad_norm": 0.43277639150619507, + "learning_rate": 0.0004524649859943978, + "loss": 0.4291, + "step": 19659 + }, + { + "epoch": 10.983240223463687, + "grad_norm": 0.610205888748169, + "learning_rate": 0.00045243697478991596, + "loss": 0.3418, + "step": 19660 + }, + { + "epoch": 10.983798882681564, + "grad_norm": 2.1606523990631104, + "learning_rate": 0.0004524089635854342, + "loss": 0.4099, + "step": 19661 + }, + { + "epoch": 10.984357541899442, + "grad_norm": 1.0042824745178223, + "learning_rate": 0.00045238095238095237, + "loss": 0.3864, + "step": 19662 + }, + { + "epoch": 10.984916201117318, + "grad_norm": 0.5823230147361755, + "learning_rate": 0.0004523529411764706, + "loss": 0.4438, + "step": 19663 + }, + { + "epoch": 10.985474860335195, + "grad_norm": 0.5017644762992859, + "learning_rate": 0.00045232492997198884, + "loss": 0.3367, + "step": 19664 + }, + { + "epoch": 10.986033519553073, + "grad_norm": 0.3199414312839508, + "learning_rate": 0.000452296918767507, + "loss": 0.3371, + "step": 19665 + }, + { + "epoch": 10.98659217877095, + "grad_norm": 0.5106126666069031, + "learning_rate": 0.00045226890756302525, + "loss": 0.41, + "step": 19666 + }, + { + "epoch": 10.987150837988827, + "grad_norm": 0.4596010744571686, + "learning_rate": 0.00045224089635854345, + "loss": 0.3919, + "step": 19667 + }, + { + "epoch": 10.987709497206703, + "grad_norm": 3.0303168296813965, + "learning_rate": 0.0004522128851540616, + "loss": 0.5195, + "step": 19668 + }, + { + "epoch": 10.988268156424581, + "grad_norm": 0.38064494729042053, + "learning_rate": 0.00045218487394957987, + "loss": 0.4028, + "step": 19669 + }, + { + "epoch": 10.988826815642458, + "grad_norm": 0.44305285811424255, + "learning_rate": 0.000452156862745098, + "loss": 0.423, + "step": 19670 + }, + { + "epoch": 10.989385474860335, + "grad_norm": 0.4474530816078186, + "learning_rate": 0.0004521288515406163, + "loss": 0.5321, + "step": 19671 + }, + { + "epoch": 10.989944134078213, + "grad_norm": 0.5138105154037476, + "learning_rate": 0.0004521008403361345, + "loss": 0.4436, + "step": 19672 + }, + { + "epoch": 10.99050279329609, + "grad_norm": 1.440447211265564, + "learning_rate": 0.00045207282913165264, + "loss": 0.4462, + "step": 19673 + }, + { + "epoch": 10.991061452513966, + "grad_norm": 1.712256669998169, + "learning_rate": 0.0004520448179271709, + "loss": 0.5729, + "step": 19674 + }, + { + "epoch": 10.991620111731844, + "grad_norm": 0.6162176132202148, + "learning_rate": 0.0004520168067226891, + "loss": 0.4236, + "step": 19675 + }, + { + "epoch": 10.992178770949721, + "grad_norm": 0.46638524532318115, + "learning_rate": 0.0004519887955182073, + "loss": 0.3226, + "step": 19676 + }, + { + "epoch": 10.992737430167598, + "grad_norm": 0.39813655614852905, + "learning_rate": 0.0004519607843137255, + "loss": 0.4374, + "step": 19677 + }, + { + "epoch": 10.993296089385474, + "grad_norm": 0.6996338367462158, + "learning_rate": 0.00045193277310924367, + "loss": 0.4331, + "step": 19678 + }, + { + "epoch": 10.993854748603352, + "grad_norm": 0.5107343196868896, + "learning_rate": 0.0004519047619047619, + "loss": 0.4094, + "step": 19679 + }, + { + "epoch": 10.994413407821229, + "grad_norm": 0.5090128183364868, + "learning_rate": 0.00045187675070028013, + "loss": 0.4573, + "step": 19680 + }, + { + "epoch": 10.994972067039106, + "grad_norm": 0.5859510898590088, + "learning_rate": 0.00045184873949579834, + "loss": 0.3451, + "step": 19681 + }, + { + "epoch": 10.995530726256984, + "grad_norm": 0.7955729961395264, + "learning_rate": 0.00045182072829131654, + "loss": 0.3594, + "step": 19682 + }, + { + "epoch": 10.99608938547486, + "grad_norm": 0.3718481957912445, + "learning_rate": 0.00045179271708683475, + "loss": 0.4275, + "step": 19683 + }, + { + "epoch": 10.996648044692737, + "grad_norm": 0.4856449067592621, + "learning_rate": 0.00045176470588235296, + "loss": 0.4408, + "step": 19684 + }, + { + "epoch": 10.997206703910614, + "grad_norm": 0.3392998278141022, + "learning_rate": 0.00045173669467787116, + "loss": 0.4583, + "step": 19685 + }, + { + "epoch": 10.997765363128492, + "grad_norm": 0.3918527066707611, + "learning_rate": 0.00045170868347338937, + "loss": 0.4144, + "step": 19686 + }, + { + "epoch": 10.998324022346369, + "grad_norm": 0.43196964263916016, + "learning_rate": 0.0004516806722689076, + "loss": 0.4474, + "step": 19687 + }, + { + "epoch": 10.998882681564245, + "grad_norm": 0.5626681447029114, + "learning_rate": 0.0004516526610644258, + "loss": 0.3745, + "step": 19688 + }, + { + "epoch": 10.999441340782123, + "grad_norm": 0.3662945628166199, + "learning_rate": 0.000451624649859944, + "loss": 0.3981, + "step": 19689 + }, + { + "epoch": 11.0, + "grad_norm": 0.3969745934009552, + "learning_rate": 0.0004515966386554622, + "loss": 0.4305, + "step": 19690 + }, + { + "epoch": 11.000558659217877, + "grad_norm": 0.5482631325721741, + "learning_rate": 0.00045156862745098045, + "loss": 0.5046, + "step": 19691 + }, + { + "epoch": 11.001117318435755, + "grad_norm": 0.6897459626197815, + "learning_rate": 0.0004515406162464986, + "loss": 0.6416, + "step": 19692 + }, + { + "epoch": 11.001675977653631, + "grad_norm": 0.9451212882995605, + "learning_rate": 0.0004515126050420168, + "loss": 0.3999, + "step": 19693 + }, + { + "epoch": 11.002234636871508, + "grad_norm": 0.8297941088676453, + "learning_rate": 0.000451484593837535, + "loss": 0.4482, + "step": 19694 + }, + { + "epoch": 11.002793296089385, + "grad_norm": 1.6589319705963135, + "learning_rate": 0.0004514565826330532, + "loss": 0.4373, + "step": 19695 + }, + { + "epoch": 11.003351955307263, + "grad_norm": 0.4920943081378937, + "learning_rate": 0.00045142857142857143, + "loss": 0.3615, + "step": 19696 + }, + { + "epoch": 11.00391061452514, + "grad_norm": 0.5959135890007019, + "learning_rate": 0.00045140056022408963, + "loss": 0.5231, + "step": 19697 + }, + { + "epoch": 11.004469273743016, + "grad_norm": 0.527924120426178, + "learning_rate": 0.00045137254901960784, + "loss": 0.58, + "step": 19698 + }, + { + "epoch": 11.005027932960894, + "grad_norm": 0.3551040291786194, + "learning_rate": 0.0004513445378151261, + "loss": 0.3689, + "step": 19699 + }, + { + "epoch": 11.005586592178771, + "grad_norm": 2.1573057174682617, + "learning_rate": 0.00045131652661064425, + "loss": 0.3492, + "step": 19700 + }, + { + "epoch": 11.006145251396648, + "grad_norm": 0.8560137748718262, + "learning_rate": 0.00045128851540616246, + "loss": 0.5545, + "step": 19701 + }, + { + "epoch": 11.006703910614526, + "grad_norm": 0.7332513332366943, + "learning_rate": 0.00045126050420168066, + "loss": 0.5847, + "step": 19702 + }, + { + "epoch": 11.007262569832402, + "grad_norm": 0.4452804625034332, + "learning_rate": 0.00045123249299719887, + "loss": 0.5877, + "step": 19703 + }, + { + "epoch": 11.007821229050279, + "grad_norm": 0.48486676812171936, + "learning_rate": 0.00045120448179271713, + "loss": 0.3465, + "step": 19704 + }, + { + "epoch": 11.008379888268156, + "grad_norm": 0.5134686827659607, + "learning_rate": 0.0004511764705882353, + "loss": 0.4179, + "step": 19705 + }, + { + "epoch": 11.008938547486034, + "grad_norm": 0.5725839138031006, + "learning_rate": 0.0004511484593837535, + "loss": 0.65, + "step": 19706 + }, + { + "epoch": 11.00949720670391, + "grad_norm": 0.54770827293396, + "learning_rate": 0.00045112044817927175, + "loss": 0.4343, + "step": 19707 + }, + { + "epoch": 11.010055865921787, + "grad_norm": 0.6473379731178284, + "learning_rate": 0.0004510924369747899, + "loss": 0.4204, + "step": 19708 + }, + { + "epoch": 11.010614525139665, + "grad_norm": 0.4246036112308502, + "learning_rate": 0.00045106442577030816, + "loss": 0.476, + "step": 19709 + }, + { + "epoch": 11.011173184357542, + "grad_norm": 0.4953240156173706, + "learning_rate": 0.0004510364145658263, + "loss": 0.427, + "step": 19710 + }, + { + "epoch": 11.011731843575419, + "grad_norm": 0.6312206387519836, + "learning_rate": 0.0004510084033613445, + "loss": 0.3995, + "step": 19711 + }, + { + "epoch": 11.012290502793297, + "grad_norm": 0.5234584808349609, + "learning_rate": 0.0004509803921568628, + "loss": 0.4363, + "step": 19712 + }, + { + "epoch": 11.012849162011173, + "grad_norm": 0.6393572092056274, + "learning_rate": 0.00045095238095238093, + "loss": 0.463, + "step": 19713 + }, + { + "epoch": 11.01340782122905, + "grad_norm": 0.5865660905838013, + "learning_rate": 0.0004509243697478992, + "loss": 0.4178, + "step": 19714 + }, + { + "epoch": 11.013966480446927, + "grad_norm": 0.49153652787208557, + "learning_rate": 0.0004508963585434174, + "loss": 0.4386, + "step": 19715 + }, + { + "epoch": 11.014525139664805, + "grad_norm": 1.3606570959091187, + "learning_rate": 0.00045086834733893555, + "loss": 0.3972, + "step": 19716 + }, + { + "epoch": 11.015083798882682, + "grad_norm": 0.44186320900917053, + "learning_rate": 0.0004508403361344538, + "loss": 0.522, + "step": 19717 + }, + { + "epoch": 11.015642458100558, + "grad_norm": 0.355182021856308, + "learning_rate": 0.00045081232492997196, + "loss": 0.3898, + "step": 19718 + }, + { + "epoch": 11.016201117318436, + "grad_norm": 0.8190295696258545, + "learning_rate": 0.0004507843137254902, + "loss": 0.4114, + "step": 19719 + }, + { + "epoch": 11.016759776536313, + "grad_norm": 0.5568543076515198, + "learning_rate": 0.0004507563025210084, + "loss": 0.4222, + "step": 19720 + }, + { + "epoch": 11.01731843575419, + "grad_norm": 0.7755473256111145, + "learning_rate": 0.0004507282913165266, + "loss": 0.4088, + "step": 19721 + }, + { + "epoch": 11.017877094972068, + "grad_norm": 0.7063185572624207, + "learning_rate": 0.00045070028011204484, + "loss": 0.5097, + "step": 19722 + }, + { + "epoch": 11.018435754189944, + "grad_norm": 1.5051993131637573, + "learning_rate": 0.00045067226890756304, + "loss": 0.4066, + "step": 19723 + }, + { + "epoch": 11.018994413407821, + "grad_norm": 0.6611260771751404, + "learning_rate": 0.00045064425770308125, + "loss": 0.3836, + "step": 19724 + }, + { + "epoch": 11.019553072625698, + "grad_norm": 0.6900299191474915, + "learning_rate": 0.00045061624649859946, + "loss": 0.3303, + "step": 19725 + }, + { + "epoch": 11.020111731843576, + "grad_norm": 0.8698602914810181, + "learning_rate": 0.0004505882352941176, + "loss": 0.5044, + "step": 19726 + }, + { + "epoch": 11.020670391061453, + "grad_norm": 0.5802265405654907, + "learning_rate": 0.00045056022408963587, + "loss": 0.578, + "step": 19727 + }, + { + "epoch": 11.021229050279329, + "grad_norm": 0.4247480034828186, + "learning_rate": 0.0004505322128851541, + "loss": 0.4857, + "step": 19728 + }, + { + "epoch": 11.021787709497207, + "grad_norm": 0.601398229598999, + "learning_rate": 0.0004505042016806723, + "loss": 0.5832, + "step": 19729 + }, + { + "epoch": 11.022346368715084, + "grad_norm": 0.6081108450889587, + "learning_rate": 0.0004504761904761905, + "loss": 0.3957, + "step": 19730 + }, + { + "epoch": 11.02290502793296, + "grad_norm": 0.5422305464744568, + "learning_rate": 0.0004504481792717087, + "loss": 0.3656, + "step": 19731 + }, + { + "epoch": 11.023463687150837, + "grad_norm": 0.5673686861991882, + "learning_rate": 0.0004504201680672269, + "loss": 0.3877, + "step": 19732 + }, + { + "epoch": 11.024022346368715, + "grad_norm": 0.42768824100494385, + "learning_rate": 0.0004503921568627451, + "loss": 0.3224, + "step": 19733 + }, + { + "epoch": 11.024581005586592, + "grad_norm": 0.4438234865665436, + "learning_rate": 0.0004503641456582633, + "loss": 0.4193, + "step": 19734 + }, + { + "epoch": 11.025139664804469, + "grad_norm": 3.423943519592285, + "learning_rate": 0.0004503361344537815, + "loss": 0.4265, + "step": 19735 + }, + { + "epoch": 11.025698324022347, + "grad_norm": 0.4046982526779175, + "learning_rate": 0.0004503081232492997, + "loss": 0.416, + "step": 19736 + }, + { + "epoch": 11.026256983240224, + "grad_norm": 0.8301132917404175, + "learning_rate": 0.00045028011204481793, + "loss": 0.4521, + "step": 19737 + }, + { + "epoch": 11.0268156424581, + "grad_norm": 0.7326377630233765, + "learning_rate": 0.00045025210084033613, + "loss": 0.4588, + "step": 19738 + }, + { + "epoch": 11.027374301675978, + "grad_norm": 0.7532413005828857, + "learning_rate": 0.0004502240896358544, + "loss": 0.4204, + "step": 19739 + }, + { + "epoch": 11.027932960893855, + "grad_norm": 1.4342658519744873, + "learning_rate": 0.00045019607843137255, + "loss": 0.3627, + "step": 19740 + }, + { + "epoch": 11.028491620111732, + "grad_norm": 0.39807072281837463, + "learning_rate": 0.00045016806722689075, + "loss": 0.4182, + "step": 19741 + }, + { + "epoch": 11.029050279329608, + "grad_norm": 0.6088711023330688, + "learning_rate": 0.00045014005602240896, + "loss": 0.5232, + "step": 19742 + }, + { + "epoch": 11.029608938547486, + "grad_norm": 0.6009140014648438, + "learning_rate": 0.00045011204481792716, + "loss": 0.4527, + "step": 19743 + }, + { + "epoch": 11.030167597765363, + "grad_norm": 0.3853987753391266, + "learning_rate": 0.0004500840336134454, + "loss": 0.3275, + "step": 19744 + }, + { + "epoch": 11.03072625698324, + "grad_norm": 0.5026534795761108, + "learning_rate": 0.0004500560224089636, + "loss": 0.4687, + "step": 19745 + }, + { + "epoch": 11.031284916201118, + "grad_norm": 3.3531880378723145, + "learning_rate": 0.0004500280112044818, + "loss": 0.4827, + "step": 19746 + }, + { + "epoch": 11.031843575418995, + "grad_norm": 0.41678839921951294, + "learning_rate": 0.00045000000000000004, + "loss": 0.45, + "step": 19747 + }, + { + "epoch": 11.032402234636871, + "grad_norm": 1.356294870376587, + "learning_rate": 0.0004499719887955182, + "loss": 0.456, + "step": 19748 + }, + { + "epoch": 11.03296089385475, + "grad_norm": 0.5405905842781067, + "learning_rate": 0.00044994397759103645, + "loss": 0.4509, + "step": 19749 + }, + { + "epoch": 11.033519553072626, + "grad_norm": 1.7592824697494507, + "learning_rate": 0.0004499159663865546, + "loss": 0.4872, + "step": 19750 + }, + { + "epoch": 11.034078212290503, + "grad_norm": 0.8724478483200073, + "learning_rate": 0.0004498879551820728, + "loss": 0.4421, + "step": 19751 + }, + { + "epoch": 11.03463687150838, + "grad_norm": 1.4287068843841553, + "learning_rate": 0.00044985994397759107, + "loss": 0.4321, + "step": 19752 + }, + { + "epoch": 11.035195530726257, + "grad_norm": 0.5304585695266724, + "learning_rate": 0.0004498319327731092, + "loss": 0.4313, + "step": 19753 + }, + { + "epoch": 11.035754189944134, + "grad_norm": 0.44525033235549927, + "learning_rate": 0.0004498039215686275, + "loss": 0.4617, + "step": 19754 + }, + { + "epoch": 11.03631284916201, + "grad_norm": 0.4671137034893036, + "learning_rate": 0.0004497759103641457, + "loss": 0.4735, + "step": 19755 + }, + { + "epoch": 11.036871508379889, + "grad_norm": 0.4294011890888214, + "learning_rate": 0.00044974789915966384, + "loss": 0.5439, + "step": 19756 + }, + { + "epoch": 11.037430167597766, + "grad_norm": 2.162557363510132, + "learning_rate": 0.0004497198879551821, + "loss": 0.4158, + "step": 19757 + }, + { + "epoch": 11.037988826815642, + "grad_norm": 0.5015937685966492, + "learning_rate": 0.00044969187675070025, + "loss": 0.4842, + "step": 19758 + }, + { + "epoch": 11.03854748603352, + "grad_norm": 0.3503091037273407, + "learning_rate": 0.0004496638655462185, + "loss": 0.3708, + "step": 19759 + }, + { + "epoch": 11.039106145251397, + "grad_norm": 0.6573071479797363, + "learning_rate": 0.0004496358543417367, + "loss": 0.4353, + "step": 19760 + }, + { + "epoch": 11.039664804469274, + "grad_norm": 0.672507643699646, + "learning_rate": 0.00044960784313725487, + "loss": 0.4525, + "step": 19761 + }, + { + "epoch": 11.04022346368715, + "grad_norm": 0.5295575261116028, + "learning_rate": 0.00044957983193277313, + "loss": 0.4883, + "step": 19762 + }, + { + "epoch": 11.040782122905028, + "grad_norm": 6.157355308532715, + "learning_rate": 0.00044955182072829134, + "loss": 0.3086, + "step": 19763 + }, + { + "epoch": 11.041340782122905, + "grad_norm": 2.1904854774475098, + "learning_rate": 0.00044952380952380954, + "loss": 0.3571, + "step": 19764 + }, + { + "epoch": 11.041899441340782, + "grad_norm": 0.5147450566291809, + "learning_rate": 0.00044949579831932775, + "loss": 0.3984, + "step": 19765 + }, + { + "epoch": 11.04245810055866, + "grad_norm": 6.060114860534668, + "learning_rate": 0.0004494677871148459, + "loss": 0.6153, + "step": 19766 + }, + { + "epoch": 11.043016759776537, + "grad_norm": 0.7461329698562622, + "learning_rate": 0.00044943977591036416, + "loss": 0.3992, + "step": 19767 + }, + { + "epoch": 11.043575418994413, + "grad_norm": 0.9306190609931946, + "learning_rate": 0.00044941176470588237, + "loss": 0.3516, + "step": 19768 + }, + { + "epoch": 11.04413407821229, + "grad_norm": 0.4361603260040283, + "learning_rate": 0.0004493837535014006, + "loss": 0.4271, + "step": 19769 + }, + { + "epoch": 11.044692737430168, + "grad_norm": 0.7545832991600037, + "learning_rate": 0.0004493557422969188, + "loss": 0.4637, + "step": 19770 + }, + { + "epoch": 11.045251396648045, + "grad_norm": 0.4830663502216339, + "learning_rate": 0.000449327731092437, + "loss": 0.4796, + "step": 19771 + }, + { + "epoch": 11.045810055865921, + "grad_norm": 2.228134870529175, + "learning_rate": 0.0004492997198879552, + "loss": 0.4392, + "step": 19772 + }, + { + "epoch": 11.0463687150838, + "grad_norm": 0.4732452929019928, + "learning_rate": 0.0004492717086834734, + "loss": 0.3396, + "step": 19773 + }, + { + "epoch": 11.046927374301676, + "grad_norm": 0.631354808807373, + "learning_rate": 0.00044924369747899166, + "loss": 0.444, + "step": 19774 + }, + { + "epoch": 11.047486033519553, + "grad_norm": 0.6586755514144897, + "learning_rate": 0.0004492156862745098, + "loss": 0.4068, + "step": 19775 + }, + { + "epoch": 11.048044692737431, + "grad_norm": 0.7469296455383301, + "learning_rate": 0.000449187675070028, + "loss": 0.3993, + "step": 19776 + }, + { + "epoch": 11.048603351955308, + "grad_norm": 0.6499115824699402, + "learning_rate": 0.0004491596638655462, + "loss": 0.5036, + "step": 19777 + }, + { + "epoch": 11.049162011173184, + "grad_norm": 0.4841870367527008, + "learning_rate": 0.00044913165266106443, + "loss": 0.3586, + "step": 19778 + }, + { + "epoch": 11.04972067039106, + "grad_norm": 0.34010013937950134, + "learning_rate": 0.0004491036414565827, + "loss": 0.3428, + "step": 19779 + }, + { + "epoch": 11.050279329608939, + "grad_norm": 0.48760271072387695, + "learning_rate": 0.00044907563025210084, + "loss": 0.4189, + "step": 19780 + }, + { + "epoch": 11.050837988826816, + "grad_norm": 0.5678296685218811, + "learning_rate": 0.00044904761904761905, + "loss": 0.4008, + "step": 19781 + }, + { + "epoch": 11.051396648044692, + "grad_norm": 0.6407414078712463, + "learning_rate": 0.0004490196078431373, + "loss": 0.4333, + "step": 19782 + }, + { + "epoch": 11.05195530726257, + "grad_norm": 0.7218817472457886, + "learning_rate": 0.00044899159663865546, + "loss": 0.4397, + "step": 19783 + }, + { + "epoch": 11.052513966480447, + "grad_norm": 0.6650747060775757, + "learning_rate": 0.0004489635854341737, + "loss": 0.5319, + "step": 19784 + }, + { + "epoch": 11.053072625698324, + "grad_norm": 0.4200950860977173, + "learning_rate": 0.00044893557422969187, + "loss": 0.3557, + "step": 19785 + }, + { + "epoch": 11.053631284916202, + "grad_norm": 0.52342289686203, + "learning_rate": 0.0004489075630252101, + "loss": 0.4107, + "step": 19786 + }, + { + "epoch": 11.054189944134079, + "grad_norm": 0.4744264483451843, + "learning_rate": 0.00044887955182072834, + "loss": 0.3897, + "step": 19787 + }, + { + "epoch": 11.054748603351955, + "grad_norm": 0.43665143847465515, + "learning_rate": 0.0004488515406162465, + "loss": 0.3561, + "step": 19788 + }, + { + "epoch": 11.055307262569832, + "grad_norm": 0.3322528898715973, + "learning_rate": 0.00044882352941176475, + "loss": 0.3173, + "step": 19789 + }, + { + "epoch": 11.05586592178771, + "grad_norm": 0.7613691687583923, + "learning_rate": 0.00044879551820728295, + "loss": 0.4838, + "step": 19790 + }, + { + "epoch": 11.056424581005587, + "grad_norm": 0.5752625465393066, + "learning_rate": 0.0004487675070028011, + "loss": 0.5196, + "step": 19791 + }, + { + "epoch": 11.056983240223463, + "grad_norm": 0.4546201229095459, + "learning_rate": 0.00044873949579831937, + "loss": 0.4134, + "step": 19792 + }, + { + "epoch": 11.057541899441341, + "grad_norm": 0.46799811720848083, + "learning_rate": 0.0004487114845938375, + "loss": 0.3838, + "step": 19793 + }, + { + "epoch": 11.058100558659218, + "grad_norm": 0.6219011545181274, + "learning_rate": 0.0004486834733893558, + "loss": 0.3627, + "step": 19794 + }, + { + "epoch": 11.058659217877095, + "grad_norm": 0.5327205061912537, + "learning_rate": 0.000448655462184874, + "loss": 0.4026, + "step": 19795 + }, + { + "epoch": 11.059217877094973, + "grad_norm": 0.46416011452674866, + "learning_rate": 0.00044862745098039214, + "loss": 0.444, + "step": 19796 + }, + { + "epoch": 11.05977653631285, + "grad_norm": 0.4102841913700104, + "learning_rate": 0.0004485994397759104, + "loss": 0.4167, + "step": 19797 + }, + { + "epoch": 11.060335195530726, + "grad_norm": 0.8700477480888367, + "learning_rate": 0.0004485714285714286, + "loss": 0.4323, + "step": 19798 + }, + { + "epoch": 11.060893854748603, + "grad_norm": 0.7467467784881592, + "learning_rate": 0.0004485434173669468, + "loss": 0.3808, + "step": 19799 + }, + { + "epoch": 11.061452513966481, + "grad_norm": 1.0814316272735596, + "learning_rate": 0.000448515406162465, + "loss": 0.5042, + "step": 19800 + }, + { + "epoch": 11.062011173184358, + "grad_norm": 0.6582698822021484, + "learning_rate": 0.00044848739495798317, + "loss": 0.3572, + "step": 19801 + }, + { + "epoch": 11.062569832402234, + "grad_norm": 0.7060786485671997, + "learning_rate": 0.0004484593837535014, + "loss": 0.4106, + "step": 19802 + }, + { + "epoch": 11.063128491620112, + "grad_norm": 1.318556785583496, + "learning_rate": 0.00044843137254901963, + "loss": 0.5086, + "step": 19803 + }, + { + "epoch": 11.063687150837989, + "grad_norm": 0.6132367253303528, + "learning_rate": 0.0004484033613445378, + "loss": 0.4418, + "step": 19804 + }, + { + "epoch": 11.064245810055866, + "grad_norm": 0.5333831906318665, + "learning_rate": 0.00044837535014005604, + "loss": 0.3852, + "step": 19805 + }, + { + "epoch": 11.064804469273742, + "grad_norm": 0.5851463675498962, + "learning_rate": 0.00044834733893557425, + "loss": 0.5034, + "step": 19806 + }, + { + "epoch": 11.06536312849162, + "grad_norm": 0.399917334318161, + "learning_rate": 0.00044831932773109246, + "loss": 0.434, + "step": 19807 + }, + { + "epoch": 11.065921787709497, + "grad_norm": 2.6709144115448, + "learning_rate": 0.00044829131652661066, + "loss": 0.3053, + "step": 19808 + }, + { + "epoch": 11.066480446927374, + "grad_norm": 0.646639347076416, + "learning_rate": 0.0004482633053221288, + "loss": 0.5963, + "step": 19809 + }, + { + "epoch": 11.067039106145252, + "grad_norm": 0.4972863793373108, + "learning_rate": 0.0004482352941176471, + "loss": 0.4244, + "step": 19810 + }, + { + "epoch": 11.067597765363129, + "grad_norm": 0.39430907368659973, + "learning_rate": 0.0004482072829131653, + "loss": 0.4065, + "step": 19811 + }, + { + "epoch": 11.068156424581005, + "grad_norm": 0.3925905227661133, + "learning_rate": 0.0004481792717086835, + "loss": 0.4313, + "step": 19812 + }, + { + "epoch": 11.068715083798883, + "grad_norm": 0.36944013833999634, + "learning_rate": 0.0004481512605042017, + "loss": 0.3659, + "step": 19813 + }, + { + "epoch": 11.06927374301676, + "grad_norm": 0.49206021428108215, + "learning_rate": 0.0004481232492997199, + "loss": 0.3996, + "step": 19814 + }, + { + "epoch": 11.069832402234637, + "grad_norm": 0.38169413805007935, + "learning_rate": 0.0004480952380952381, + "loss": 0.3325, + "step": 19815 + }, + { + "epoch": 11.070391061452513, + "grad_norm": 0.38647326827049255, + "learning_rate": 0.0004480672268907563, + "loss": 0.3054, + "step": 19816 + }, + { + "epoch": 11.070949720670392, + "grad_norm": 0.6288229823112488, + "learning_rate": 0.0004480392156862745, + "loss": 0.4725, + "step": 19817 + }, + { + "epoch": 11.071508379888268, + "grad_norm": 0.43710818886756897, + "learning_rate": 0.0004480112044817927, + "loss": 0.2975, + "step": 19818 + }, + { + "epoch": 11.072067039106145, + "grad_norm": 0.536785364151001, + "learning_rate": 0.00044798319327731093, + "loss": 0.3972, + "step": 19819 + }, + { + "epoch": 11.072625698324023, + "grad_norm": 0.6924615502357483, + "learning_rate": 0.00044795518207282913, + "loss": 0.5361, + "step": 19820 + }, + { + "epoch": 11.0731843575419, + "grad_norm": 1.0973560810089111, + "learning_rate": 0.00044792717086834734, + "loss": 0.3645, + "step": 19821 + }, + { + "epoch": 11.073743016759776, + "grad_norm": 0.40427523851394653, + "learning_rate": 0.0004478991596638656, + "loss": 0.4499, + "step": 19822 + }, + { + "epoch": 11.074301675977654, + "grad_norm": 0.7949575185775757, + "learning_rate": 0.00044787114845938375, + "loss": 0.4101, + "step": 19823 + }, + { + "epoch": 11.074860335195531, + "grad_norm": 0.7031083703041077, + "learning_rate": 0.00044784313725490196, + "loss": 0.6343, + "step": 19824 + }, + { + "epoch": 11.075418994413408, + "grad_norm": 0.7106063961982727, + "learning_rate": 0.00044781512605042016, + "loss": 0.4775, + "step": 19825 + }, + { + "epoch": 11.075977653631284, + "grad_norm": 0.37123459577560425, + "learning_rate": 0.00044778711484593837, + "loss": 0.4577, + "step": 19826 + }, + { + "epoch": 11.076536312849163, + "grad_norm": 0.4597998857498169, + "learning_rate": 0.00044775910364145663, + "loss": 0.4196, + "step": 19827 + }, + { + "epoch": 11.077094972067039, + "grad_norm": 0.742706298828125, + "learning_rate": 0.0004477310924369748, + "loss": 0.4793, + "step": 19828 + }, + { + "epoch": 11.077653631284916, + "grad_norm": 1.2470176219940186, + "learning_rate": 0.000447703081232493, + "loss": 0.4176, + "step": 19829 + }, + { + "epoch": 11.078212290502794, + "grad_norm": 0.43885621428489685, + "learning_rate": 0.00044767507002801125, + "loss": 0.5263, + "step": 19830 + }, + { + "epoch": 11.07877094972067, + "grad_norm": 0.5389963984489441, + "learning_rate": 0.0004476470588235294, + "loss": 0.3911, + "step": 19831 + }, + { + "epoch": 11.079329608938547, + "grad_norm": 0.5724214911460876, + "learning_rate": 0.00044761904761904766, + "loss": 0.3604, + "step": 19832 + }, + { + "epoch": 11.079888268156424, + "grad_norm": 6.444626331329346, + "learning_rate": 0.0004475910364145658, + "loss": 0.4119, + "step": 19833 + }, + { + "epoch": 11.080446927374302, + "grad_norm": 0.6999254822731018, + "learning_rate": 0.000447563025210084, + "loss": 0.4683, + "step": 19834 + }, + { + "epoch": 11.081005586592179, + "grad_norm": 0.30982500314712524, + "learning_rate": 0.0004475350140056023, + "loss": 0.2887, + "step": 19835 + }, + { + "epoch": 11.081564245810055, + "grad_norm": 0.548913300037384, + "learning_rate": 0.00044750700280112043, + "loss": 0.2762, + "step": 19836 + }, + { + "epoch": 11.082122905027934, + "grad_norm": 0.7421655058860779, + "learning_rate": 0.0004474789915966387, + "loss": 0.4327, + "step": 19837 + }, + { + "epoch": 11.08268156424581, + "grad_norm": 0.6502736210823059, + "learning_rate": 0.0004474509803921569, + "loss": 0.4731, + "step": 19838 + }, + { + "epoch": 11.083240223463687, + "grad_norm": 0.6469879746437073, + "learning_rate": 0.00044742296918767505, + "loss": 0.5146, + "step": 19839 + }, + { + "epoch": 11.083798882681565, + "grad_norm": 0.7766003012657166, + "learning_rate": 0.0004473949579831933, + "loss": 0.4411, + "step": 19840 + }, + { + "epoch": 11.084357541899442, + "grad_norm": 0.6554601788520813, + "learning_rate": 0.00044736694677871146, + "loss": 0.5981, + "step": 19841 + }, + { + "epoch": 11.084916201117318, + "grad_norm": 0.3707920014858246, + "learning_rate": 0.0004473389355742297, + "loss": 0.3617, + "step": 19842 + }, + { + "epoch": 11.085474860335195, + "grad_norm": 0.6389076709747314, + "learning_rate": 0.0004473109243697479, + "loss": 0.4439, + "step": 19843 + }, + { + "epoch": 11.086033519553073, + "grad_norm": 0.5022822022438049, + "learning_rate": 0.0004472829131652661, + "loss": 0.4366, + "step": 19844 + }, + { + "epoch": 11.08659217877095, + "grad_norm": 0.6682159900665283, + "learning_rate": 0.00044725490196078434, + "loss": 0.3834, + "step": 19845 + }, + { + "epoch": 11.087150837988826, + "grad_norm": 0.43023720383644104, + "learning_rate": 0.00044722689075630254, + "loss": 0.447, + "step": 19846 + }, + { + "epoch": 11.087709497206705, + "grad_norm": 0.5825912952423096, + "learning_rate": 0.00044719887955182075, + "loss": 0.5824, + "step": 19847 + }, + { + "epoch": 11.088268156424581, + "grad_norm": 0.41087085008621216, + "learning_rate": 0.00044717086834733896, + "loss": 0.5243, + "step": 19848 + }, + { + "epoch": 11.088826815642458, + "grad_norm": 0.40642911195755005, + "learning_rate": 0.0004471428571428571, + "loss": 0.3839, + "step": 19849 + }, + { + "epoch": 11.089385474860336, + "grad_norm": 0.3995497524738312, + "learning_rate": 0.00044711484593837537, + "loss": 0.3825, + "step": 19850 + }, + { + "epoch": 11.089944134078213, + "grad_norm": 0.7660233378410339, + "learning_rate": 0.0004470868347338936, + "loss": 0.4725, + "step": 19851 + }, + { + "epoch": 11.09050279329609, + "grad_norm": 0.4556220769882202, + "learning_rate": 0.0004470588235294118, + "loss": 0.3796, + "step": 19852 + }, + { + "epoch": 11.091061452513966, + "grad_norm": 0.47020071744918823, + "learning_rate": 0.00044703081232493, + "loss": 0.4272, + "step": 19853 + }, + { + "epoch": 11.091620111731844, + "grad_norm": 0.46526384353637695, + "learning_rate": 0.0004470028011204482, + "loss": 0.4588, + "step": 19854 + }, + { + "epoch": 11.09217877094972, + "grad_norm": 0.7570987939834595, + "learning_rate": 0.0004469747899159664, + "loss": 0.5176, + "step": 19855 + }, + { + "epoch": 11.092737430167597, + "grad_norm": 0.37922248244285583, + "learning_rate": 0.0004469467787114846, + "loss": 0.3792, + "step": 19856 + }, + { + "epoch": 11.093296089385476, + "grad_norm": 0.45460763573646545, + "learning_rate": 0.0004469187675070028, + "loss": 0.4851, + "step": 19857 + }, + { + "epoch": 11.093854748603352, + "grad_norm": 1.700627326965332, + "learning_rate": 0.000446890756302521, + "loss": 0.3735, + "step": 19858 + }, + { + "epoch": 11.094413407821229, + "grad_norm": 0.5089961290359497, + "learning_rate": 0.0004468627450980392, + "loss": 0.4219, + "step": 19859 + }, + { + "epoch": 11.094972067039107, + "grad_norm": 0.4787507653236389, + "learning_rate": 0.00044683473389355743, + "loss": 0.3761, + "step": 19860 + }, + { + "epoch": 11.095530726256984, + "grad_norm": 0.6233643293380737, + "learning_rate": 0.00044680672268907563, + "loss": 0.5126, + "step": 19861 + }, + { + "epoch": 11.09608938547486, + "grad_norm": 1.026381015777588, + "learning_rate": 0.0004467787114845939, + "loss": 0.3771, + "step": 19862 + }, + { + "epoch": 11.096648044692737, + "grad_norm": 0.3337191939353943, + "learning_rate": 0.00044675070028011205, + "loss": 0.3513, + "step": 19863 + }, + { + "epoch": 11.097206703910615, + "grad_norm": 0.37014496326446533, + "learning_rate": 0.00044672268907563025, + "loss": 0.3559, + "step": 19864 + }, + { + "epoch": 11.097765363128492, + "grad_norm": 0.41693469882011414, + "learning_rate": 0.00044669467787114846, + "loss": 0.4258, + "step": 19865 + }, + { + "epoch": 11.098324022346368, + "grad_norm": 0.5736404657363892, + "learning_rate": 0.00044666666666666666, + "loss": 0.4507, + "step": 19866 + }, + { + "epoch": 11.098882681564247, + "grad_norm": 0.50935298204422, + "learning_rate": 0.0004466386554621849, + "loss": 0.3954, + "step": 19867 + }, + { + "epoch": 11.099441340782123, + "grad_norm": 0.9463884830474854, + "learning_rate": 0.0004466106442577031, + "loss": 0.3927, + "step": 19868 + }, + { + "epoch": 11.1, + "grad_norm": 0.5462876558303833, + "learning_rate": 0.0004465826330532213, + "loss": 0.4026, + "step": 19869 + }, + { + "epoch": 11.100558659217878, + "grad_norm": 2.7741801738739014, + "learning_rate": 0.00044655462184873954, + "loss": 0.3712, + "step": 19870 + }, + { + "epoch": 11.101117318435755, + "grad_norm": 0.3957228660583496, + "learning_rate": 0.0004465266106442577, + "loss": 0.4742, + "step": 19871 + }, + { + "epoch": 11.101675977653631, + "grad_norm": 0.664828896522522, + "learning_rate": 0.00044649859943977595, + "loss": 0.5621, + "step": 19872 + }, + { + "epoch": 11.102234636871508, + "grad_norm": 0.9680612087249756, + "learning_rate": 0.0004464705882352941, + "loss": 0.4785, + "step": 19873 + }, + { + "epoch": 11.102793296089386, + "grad_norm": 0.6311147809028625, + "learning_rate": 0.0004464425770308123, + "loss": 0.4577, + "step": 19874 + }, + { + "epoch": 11.103351955307263, + "grad_norm": 0.8508023023605347, + "learning_rate": 0.00044641456582633057, + "loss": 0.376, + "step": 19875 + }, + { + "epoch": 11.10391061452514, + "grad_norm": 13.140066146850586, + "learning_rate": 0.0004463865546218487, + "loss": 0.4167, + "step": 19876 + }, + { + "epoch": 11.104469273743018, + "grad_norm": 0.689651608467102, + "learning_rate": 0.000446358543417367, + "loss": 0.5557, + "step": 19877 + }, + { + "epoch": 11.105027932960894, + "grad_norm": 0.40504926443099976, + "learning_rate": 0.0004463305322128852, + "loss": 0.4504, + "step": 19878 + }, + { + "epoch": 11.10558659217877, + "grad_norm": 0.4543781578540802, + "learning_rate": 0.00044630252100840334, + "loss": 0.4336, + "step": 19879 + }, + { + "epoch": 11.106145251396647, + "grad_norm": 0.6363242268562317, + "learning_rate": 0.0004462745098039216, + "loss": 0.3772, + "step": 19880 + }, + { + "epoch": 11.106703910614526, + "grad_norm": 0.4519440233707428, + "learning_rate": 0.00044624649859943975, + "loss": 0.527, + "step": 19881 + }, + { + "epoch": 11.107262569832402, + "grad_norm": 0.3944387435913086, + "learning_rate": 0.000446218487394958, + "loss": 0.3321, + "step": 19882 + }, + { + "epoch": 11.107821229050279, + "grad_norm": 0.45843812823295593, + "learning_rate": 0.0004461904761904762, + "loss": 0.4609, + "step": 19883 + }, + { + "epoch": 11.108379888268157, + "grad_norm": 0.5077905654907227, + "learning_rate": 0.00044616246498599437, + "loss": 0.4625, + "step": 19884 + }, + { + "epoch": 11.108938547486034, + "grad_norm": 0.6277848482131958, + "learning_rate": 0.00044613445378151263, + "loss": 0.444, + "step": 19885 + }, + { + "epoch": 11.10949720670391, + "grad_norm": 4.500689506530762, + "learning_rate": 0.00044610644257703084, + "loss": 0.4412, + "step": 19886 + }, + { + "epoch": 11.110055865921789, + "grad_norm": 0.6838209629058838, + "learning_rate": 0.00044607843137254904, + "loss": 0.5411, + "step": 19887 + }, + { + "epoch": 11.110614525139665, + "grad_norm": 0.45014968514442444, + "learning_rate": 0.00044605042016806725, + "loss": 0.397, + "step": 19888 + }, + { + "epoch": 11.111173184357542, + "grad_norm": 0.4000174403190613, + "learning_rate": 0.0004460224089635854, + "loss": 0.3811, + "step": 19889 + }, + { + "epoch": 11.111731843575418, + "grad_norm": 3.1476850509643555, + "learning_rate": 0.00044599439775910366, + "loss": 0.5675, + "step": 19890 + }, + { + "epoch": 11.112290502793297, + "grad_norm": 0.4931184947490692, + "learning_rate": 0.00044596638655462187, + "loss": 0.3352, + "step": 19891 + }, + { + "epoch": 11.112849162011173, + "grad_norm": 0.38772642612457275, + "learning_rate": 0.0004459383753501401, + "loss": 0.3462, + "step": 19892 + }, + { + "epoch": 11.11340782122905, + "grad_norm": 0.37878426909446716, + "learning_rate": 0.0004459103641456583, + "loss": 0.3791, + "step": 19893 + }, + { + "epoch": 11.113966480446928, + "grad_norm": 0.36854350566864014, + "learning_rate": 0.0004458823529411765, + "loss": 0.4479, + "step": 19894 + }, + { + "epoch": 11.114525139664805, + "grad_norm": 0.5654700994491577, + "learning_rate": 0.0004458543417366947, + "loss": 0.4475, + "step": 19895 + }, + { + "epoch": 11.115083798882681, + "grad_norm": 1.123820185661316, + "learning_rate": 0.0004458263305322129, + "loss": 0.4679, + "step": 19896 + }, + { + "epoch": 11.11564245810056, + "grad_norm": 0.5729300379753113, + "learning_rate": 0.0004457983193277311, + "loss": 0.4097, + "step": 19897 + }, + { + "epoch": 11.116201117318436, + "grad_norm": 0.3579021096229553, + "learning_rate": 0.0004457703081232493, + "loss": 0.3735, + "step": 19898 + }, + { + "epoch": 11.116759776536313, + "grad_norm": 0.3928564190864563, + "learning_rate": 0.0004457422969187675, + "loss": 0.3832, + "step": 19899 + }, + { + "epoch": 11.11731843575419, + "grad_norm": 0.4491223394870758, + "learning_rate": 0.0004457142857142857, + "loss": 0.5005, + "step": 19900 + }, + { + "epoch": 11.117877094972068, + "grad_norm": 1.066507339477539, + "learning_rate": 0.00044568627450980393, + "loss": 0.414, + "step": 19901 + }, + { + "epoch": 11.118435754189944, + "grad_norm": 0.4069530665874481, + "learning_rate": 0.0004456582633053222, + "loss": 0.4845, + "step": 19902 + }, + { + "epoch": 11.11899441340782, + "grad_norm": 0.3738228678703308, + "learning_rate": 0.00044563025210084034, + "loss": 0.3443, + "step": 19903 + }, + { + "epoch": 11.119553072625699, + "grad_norm": 0.5073080658912659, + "learning_rate": 0.00044560224089635855, + "loss": 0.3996, + "step": 19904 + }, + { + "epoch": 11.120111731843576, + "grad_norm": 0.34824588894844055, + "learning_rate": 0.00044557422969187675, + "loss": 0.3185, + "step": 19905 + }, + { + "epoch": 11.120670391061452, + "grad_norm": 0.7932087182998657, + "learning_rate": 0.00044554621848739496, + "loss": 0.4965, + "step": 19906 + }, + { + "epoch": 11.121229050279329, + "grad_norm": 0.8209249973297119, + "learning_rate": 0.0004455182072829132, + "loss": 0.394, + "step": 19907 + }, + { + "epoch": 11.121787709497207, + "grad_norm": 0.4790240228176117, + "learning_rate": 0.00044549019607843137, + "loss": 0.4401, + "step": 19908 + }, + { + "epoch": 11.122346368715084, + "grad_norm": 0.48526036739349365, + "learning_rate": 0.0004454621848739496, + "loss": 0.4143, + "step": 19909 + }, + { + "epoch": 11.12290502793296, + "grad_norm": 0.4222531020641327, + "learning_rate": 0.00044543417366946784, + "loss": 0.4277, + "step": 19910 + }, + { + "epoch": 11.123463687150839, + "grad_norm": 0.5396952629089355, + "learning_rate": 0.000445406162464986, + "loss": 0.5434, + "step": 19911 + }, + { + "epoch": 11.124022346368715, + "grad_norm": 0.3793957233428955, + "learning_rate": 0.00044537815126050425, + "loss": 0.404, + "step": 19912 + }, + { + "epoch": 11.124581005586592, + "grad_norm": 0.7523414492607117, + "learning_rate": 0.0004453501400560224, + "loss": 0.4877, + "step": 19913 + }, + { + "epoch": 11.12513966480447, + "grad_norm": 0.48998716473579407, + "learning_rate": 0.0004453221288515406, + "loss": 0.3564, + "step": 19914 + }, + { + "epoch": 11.125698324022347, + "grad_norm": 2.81388521194458, + "learning_rate": 0.00044529411764705887, + "loss": 0.5428, + "step": 19915 + }, + { + "epoch": 11.126256983240223, + "grad_norm": 0.9800290465354919, + "learning_rate": 0.000445266106442577, + "loss": 0.5023, + "step": 19916 + }, + { + "epoch": 11.1268156424581, + "grad_norm": 0.6452310681343079, + "learning_rate": 0.0004452380952380952, + "loss": 0.7114, + "step": 19917 + }, + { + "epoch": 11.127374301675978, + "grad_norm": 0.4753224849700928, + "learning_rate": 0.0004452100840336135, + "loss": 0.4169, + "step": 19918 + }, + { + "epoch": 11.127932960893855, + "grad_norm": 2.317497730255127, + "learning_rate": 0.00044518207282913164, + "loss": 0.4244, + "step": 19919 + }, + { + "epoch": 11.128491620111731, + "grad_norm": 0.6090908050537109, + "learning_rate": 0.0004451540616246499, + "loss": 0.461, + "step": 19920 + }, + { + "epoch": 11.12905027932961, + "grad_norm": 0.556962251663208, + "learning_rate": 0.00044512605042016805, + "loss": 0.3918, + "step": 19921 + }, + { + "epoch": 11.129608938547486, + "grad_norm": 0.7319069504737854, + "learning_rate": 0.00044509803921568625, + "loss": 0.3506, + "step": 19922 + }, + { + "epoch": 11.130167597765363, + "grad_norm": 0.4105057716369629, + "learning_rate": 0.0004450700280112045, + "loss": 0.4249, + "step": 19923 + }, + { + "epoch": 11.130726256983241, + "grad_norm": 1.0867705345153809, + "learning_rate": 0.00044504201680672267, + "loss": 0.4178, + "step": 19924 + }, + { + "epoch": 11.131284916201118, + "grad_norm": 0.7936561703681946, + "learning_rate": 0.0004450140056022409, + "loss": 0.3833, + "step": 19925 + }, + { + "epoch": 11.131843575418994, + "grad_norm": 12.50045394897461, + "learning_rate": 0.00044498599439775913, + "loss": 0.5131, + "step": 19926 + }, + { + "epoch": 11.13240223463687, + "grad_norm": 0.46691498160362244, + "learning_rate": 0.0004449579831932773, + "loss": 0.4661, + "step": 19927 + }, + { + "epoch": 11.132960893854749, + "grad_norm": 0.5960050821304321, + "learning_rate": 0.00044492997198879554, + "loss": 0.4234, + "step": 19928 + }, + { + "epoch": 11.133519553072626, + "grad_norm": 1.6895347833633423, + "learning_rate": 0.0004449019607843137, + "loss": 0.4954, + "step": 19929 + }, + { + "epoch": 11.134078212290502, + "grad_norm": 0.5937805771827698, + "learning_rate": 0.00044487394957983196, + "loss": 0.4785, + "step": 19930 + }, + { + "epoch": 11.13463687150838, + "grad_norm": 0.4939204752445221, + "learning_rate": 0.00044484593837535016, + "loss": 0.4696, + "step": 19931 + }, + { + "epoch": 11.135195530726257, + "grad_norm": 0.3809947967529297, + "learning_rate": 0.0004448179271708683, + "loss": 0.4102, + "step": 19932 + }, + { + "epoch": 11.135754189944134, + "grad_norm": 8.647324562072754, + "learning_rate": 0.0004447899159663866, + "loss": 0.4694, + "step": 19933 + }, + { + "epoch": 11.136312849162012, + "grad_norm": 0.41407257318496704, + "learning_rate": 0.0004447619047619048, + "loss": 0.4039, + "step": 19934 + }, + { + "epoch": 11.136871508379889, + "grad_norm": 0.47642093896865845, + "learning_rate": 0.000444733893557423, + "loss": 0.387, + "step": 19935 + }, + { + "epoch": 11.137430167597765, + "grad_norm": 0.5021941661834717, + "learning_rate": 0.0004447058823529412, + "loss": 0.3938, + "step": 19936 + }, + { + "epoch": 11.137988826815642, + "grad_norm": 0.5977472066879272, + "learning_rate": 0.00044467787114845934, + "loss": 0.5128, + "step": 19937 + }, + { + "epoch": 11.13854748603352, + "grad_norm": 2.3964288234710693, + "learning_rate": 0.0004446498599439776, + "loss": 0.3804, + "step": 19938 + }, + { + "epoch": 11.139106145251397, + "grad_norm": 0.9563775062561035, + "learning_rate": 0.0004446218487394958, + "loss": 0.4082, + "step": 19939 + }, + { + "epoch": 11.139664804469273, + "grad_norm": 0.4272240400314331, + "learning_rate": 0.000444593837535014, + "loss": 0.4287, + "step": 19940 + }, + { + "epoch": 11.140223463687152, + "grad_norm": 1.8745205402374268, + "learning_rate": 0.0004445658263305322, + "loss": 0.3488, + "step": 19941 + }, + { + "epoch": 11.140782122905028, + "grad_norm": 5.201748371124268, + "learning_rate": 0.00044453781512605043, + "loss": 0.436, + "step": 19942 + }, + { + "epoch": 11.141340782122905, + "grad_norm": 0.5420041680335999, + "learning_rate": 0.00044450980392156863, + "loss": 0.4879, + "step": 19943 + }, + { + "epoch": 11.141899441340781, + "grad_norm": 0.4293597340583801, + "learning_rate": 0.00044448179271708684, + "loss": 0.3684, + "step": 19944 + }, + { + "epoch": 11.14245810055866, + "grad_norm": 0.4900834262371063, + "learning_rate": 0.00044445378151260505, + "loss": 0.6012, + "step": 19945 + }, + { + "epoch": 11.143016759776536, + "grad_norm": 1.0333806276321411, + "learning_rate": 0.00044442577030812325, + "loss": 0.5209, + "step": 19946 + }, + { + "epoch": 11.143575418994413, + "grad_norm": 0.36754944920539856, + "learning_rate": 0.00044439775910364146, + "loss": 0.3517, + "step": 19947 + }, + { + "epoch": 11.144134078212291, + "grad_norm": 0.37449154257774353, + "learning_rate": 0.00044436974789915966, + "loss": 0.4389, + "step": 19948 + }, + { + "epoch": 11.144692737430168, + "grad_norm": 0.6703065633773804, + "learning_rate": 0.00044434173669467787, + "loss": 0.4326, + "step": 19949 + }, + { + "epoch": 11.145251396648044, + "grad_norm": 0.6214983463287354, + "learning_rate": 0.00044431372549019613, + "loss": 0.4155, + "step": 19950 + }, + { + "epoch": 11.145810055865923, + "grad_norm": 0.4014701247215271, + "learning_rate": 0.0004442857142857143, + "loss": 0.3575, + "step": 19951 + }, + { + "epoch": 11.1463687150838, + "grad_norm": 0.5953041911125183, + "learning_rate": 0.0004442577030812325, + "loss": 0.4555, + "step": 19952 + }, + { + "epoch": 11.146927374301676, + "grad_norm": 0.4547383189201355, + "learning_rate": 0.0004442296918767507, + "loss": 0.4249, + "step": 19953 + }, + { + "epoch": 11.147486033519552, + "grad_norm": 0.4030759334564209, + "learning_rate": 0.0004442016806722689, + "loss": 0.418, + "step": 19954 + }, + { + "epoch": 11.14804469273743, + "grad_norm": 0.48749279975891113, + "learning_rate": 0.00044417366946778716, + "loss": 0.5147, + "step": 19955 + }, + { + "epoch": 11.148603351955307, + "grad_norm": 0.43067467212677, + "learning_rate": 0.0004441456582633053, + "loss": 0.3918, + "step": 19956 + }, + { + "epoch": 11.149162011173184, + "grad_norm": 0.5205173492431641, + "learning_rate": 0.0004441176470588235, + "loss": 0.4978, + "step": 19957 + }, + { + "epoch": 11.149720670391062, + "grad_norm": 1.1520217657089233, + "learning_rate": 0.0004440896358543418, + "loss": 0.4844, + "step": 19958 + }, + { + "epoch": 11.150279329608939, + "grad_norm": 0.4912845194339752, + "learning_rate": 0.00044406162464985993, + "loss": 0.5776, + "step": 19959 + }, + { + "epoch": 11.150837988826815, + "grad_norm": 0.6092059016227722, + "learning_rate": 0.0004440336134453782, + "loss": 0.5862, + "step": 19960 + }, + { + "epoch": 11.151396648044694, + "grad_norm": 0.7184321880340576, + "learning_rate": 0.00044400560224089634, + "loss": 0.4388, + "step": 19961 + }, + { + "epoch": 11.15195530726257, + "grad_norm": 0.40275368094444275, + "learning_rate": 0.00044397759103641455, + "loss": 0.3847, + "step": 19962 + }, + { + "epoch": 11.152513966480447, + "grad_norm": 0.3934175968170166, + "learning_rate": 0.0004439495798319328, + "loss": 0.4448, + "step": 19963 + }, + { + "epoch": 11.153072625698323, + "grad_norm": 0.8231940269470215, + "learning_rate": 0.00044392156862745096, + "loss": 0.4, + "step": 19964 + }, + { + "epoch": 11.153631284916202, + "grad_norm": 0.714524507522583, + "learning_rate": 0.0004438935574229692, + "loss": 0.4606, + "step": 19965 + }, + { + "epoch": 11.154189944134078, + "grad_norm": 0.44008609652519226, + "learning_rate": 0.0004438655462184874, + "loss": 0.4001, + "step": 19966 + }, + { + "epoch": 11.154748603351955, + "grad_norm": 0.44089022278785706, + "learning_rate": 0.0004438375350140056, + "loss": 0.4447, + "step": 19967 + }, + { + "epoch": 11.155307262569833, + "grad_norm": 0.370942622423172, + "learning_rate": 0.00044380952380952384, + "loss": 0.3814, + "step": 19968 + }, + { + "epoch": 11.15586592178771, + "grad_norm": 1.131296157836914, + "learning_rate": 0.000443781512605042, + "loss": 0.3927, + "step": 19969 + }, + { + "epoch": 11.156424581005586, + "grad_norm": 0.5789841413497925, + "learning_rate": 0.00044375350140056025, + "loss": 0.4624, + "step": 19970 + }, + { + "epoch": 11.156983240223465, + "grad_norm": 0.5582478642463684, + "learning_rate": 0.00044372549019607846, + "loss": 0.5091, + "step": 19971 + }, + { + "epoch": 11.157541899441341, + "grad_norm": 0.5246102809906006, + "learning_rate": 0.0004436974789915966, + "loss": 0.414, + "step": 19972 + }, + { + "epoch": 11.158100558659218, + "grad_norm": 0.9045739769935608, + "learning_rate": 0.00044366946778711487, + "loss": 0.4948, + "step": 19973 + }, + { + "epoch": 11.158659217877094, + "grad_norm": 0.38677120208740234, + "learning_rate": 0.0004436414565826331, + "loss": 0.421, + "step": 19974 + }, + { + "epoch": 11.159217877094973, + "grad_norm": 1.924946665763855, + "learning_rate": 0.0004436134453781513, + "loss": 0.3525, + "step": 19975 + }, + { + "epoch": 11.15977653631285, + "grad_norm": 0.3892768621444702, + "learning_rate": 0.0004435854341736695, + "loss": 0.4033, + "step": 19976 + }, + { + "epoch": 11.160335195530726, + "grad_norm": 0.4049968719482422, + "learning_rate": 0.00044355742296918764, + "loss": 0.3679, + "step": 19977 + }, + { + "epoch": 11.160893854748604, + "grad_norm": 0.45830294489860535, + "learning_rate": 0.0004435294117647059, + "loss": 0.4137, + "step": 19978 + }, + { + "epoch": 11.16145251396648, + "grad_norm": 0.7053163647651672, + "learning_rate": 0.0004435014005602241, + "loss": 0.4338, + "step": 19979 + }, + { + "epoch": 11.162011173184357, + "grad_norm": 0.4060651659965515, + "learning_rate": 0.0004434733893557423, + "loss": 0.4131, + "step": 19980 + }, + { + "epoch": 11.162569832402234, + "grad_norm": 0.5380635857582092, + "learning_rate": 0.0004434453781512605, + "loss": 0.54, + "step": 19981 + }, + { + "epoch": 11.163128491620112, + "grad_norm": 0.4502077102661133, + "learning_rate": 0.0004434173669467787, + "loss": 0.3791, + "step": 19982 + }, + { + "epoch": 11.163687150837989, + "grad_norm": 0.4709026515483856, + "learning_rate": 0.00044338935574229693, + "loss": 0.3722, + "step": 19983 + }, + { + "epoch": 11.164245810055865, + "grad_norm": 0.53351891040802, + "learning_rate": 0.00044336134453781513, + "loss": 0.4523, + "step": 19984 + }, + { + "epoch": 11.164804469273744, + "grad_norm": 0.4962504208087921, + "learning_rate": 0.00044333333333333334, + "loss": 0.4393, + "step": 19985 + }, + { + "epoch": 11.16536312849162, + "grad_norm": 0.4671856760978699, + "learning_rate": 0.00044330532212885155, + "loss": 0.3741, + "step": 19986 + }, + { + "epoch": 11.165921787709497, + "grad_norm": 0.44896411895751953, + "learning_rate": 0.00044327731092436975, + "loss": 0.4095, + "step": 19987 + }, + { + "epoch": 11.166480446927375, + "grad_norm": 0.6326718330383301, + "learning_rate": 0.00044324929971988796, + "loss": 0.3112, + "step": 19988 + }, + { + "epoch": 11.167039106145252, + "grad_norm": 0.4235628843307495, + "learning_rate": 0.00044322128851540616, + "loss": 0.3399, + "step": 19989 + }, + { + "epoch": 11.167597765363128, + "grad_norm": 0.48997122049331665, + "learning_rate": 0.0004431932773109244, + "loss": 0.3867, + "step": 19990 + }, + { + "epoch": 11.168156424581005, + "grad_norm": 0.4693921208381653, + "learning_rate": 0.0004431652661064426, + "loss": 0.5065, + "step": 19991 + }, + { + "epoch": 11.168715083798883, + "grad_norm": 0.6533905267715454, + "learning_rate": 0.0004431372549019608, + "loss": 0.4556, + "step": 19992 + }, + { + "epoch": 11.16927374301676, + "grad_norm": 0.6487933397293091, + "learning_rate": 0.000443109243697479, + "loss": 0.3861, + "step": 19993 + }, + { + "epoch": 11.169832402234636, + "grad_norm": 0.5712664127349854, + "learning_rate": 0.0004430812324929972, + "loss": 0.3827, + "step": 19994 + }, + { + "epoch": 11.170391061452515, + "grad_norm": 0.8969149589538574, + "learning_rate": 0.00044305322128851545, + "loss": 0.421, + "step": 19995 + }, + { + "epoch": 11.170949720670391, + "grad_norm": 0.3209882080554962, + "learning_rate": 0.0004430252100840336, + "loss": 0.3328, + "step": 19996 + }, + { + "epoch": 11.171508379888268, + "grad_norm": 0.42669442296028137, + "learning_rate": 0.0004429971988795518, + "loss": 0.3969, + "step": 19997 + }, + { + "epoch": 11.172067039106146, + "grad_norm": 0.42877790331840515, + "learning_rate": 0.00044296918767507007, + "loss": 0.3806, + "step": 19998 + }, + { + "epoch": 11.172625698324023, + "grad_norm": 0.7065102458000183, + "learning_rate": 0.0004429411764705882, + "loss": 0.4662, + "step": 19999 + }, + { + "epoch": 11.1731843575419, + "grad_norm": 1.6102545261383057, + "learning_rate": 0.0004429131652661065, + "loss": 0.3371, + "step": 20000 + }, + { + "epoch": 11.1731843575419, + "eval_cer": 0.08795674991544195, + "eval_loss": 0.3355470299720764, + "eval_runtime": 55.5822, + "eval_samples_per_second": 81.645, + "eval_steps_per_second": 5.11, + "eval_wer": 0.34795117454819696, + "step": 20000 + }, + { + "epoch": 11.173743016759776, + "grad_norm": 1.0497506856918335, + "learning_rate": 0.00044288515406162464, + "loss": 0.4268, + "step": 20001 + }, + { + "epoch": 11.174301675977654, + "grad_norm": 0.4438062906265259, + "learning_rate": 0.00044285714285714284, + "loss": 0.3994, + "step": 20002 + }, + { + "epoch": 11.17486033519553, + "grad_norm": 0.4353538751602173, + "learning_rate": 0.0004428291316526611, + "loss": 0.4089, + "step": 20003 + }, + { + "epoch": 11.175418994413407, + "grad_norm": 0.7603114247322083, + "learning_rate": 0.00044280112044817925, + "loss": 0.471, + "step": 20004 + }, + { + "epoch": 11.175977653631286, + "grad_norm": 0.5623716115951538, + "learning_rate": 0.0004427731092436975, + "loss": 0.5002, + "step": 20005 + }, + { + "epoch": 11.176536312849162, + "grad_norm": 0.34662359952926636, + "learning_rate": 0.0004427450980392157, + "loss": 0.2911, + "step": 20006 + }, + { + "epoch": 11.177094972067039, + "grad_norm": 0.4288632869720459, + "learning_rate": 0.00044271708683473387, + "loss": 0.3179, + "step": 20007 + }, + { + "epoch": 11.177653631284917, + "grad_norm": 0.8289805054664612, + "learning_rate": 0.00044268907563025213, + "loss": 0.4186, + "step": 20008 + }, + { + "epoch": 11.178212290502794, + "grad_norm": 0.5202844738960266, + "learning_rate": 0.0004426610644257703, + "loss": 0.3907, + "step": 20009 + }, + { + "epoch": 11.17877094972067, + "grad_norm": 0.4844478368759155, + "learning_rate": 0.00044263305322128854, + "loss": 0.3755, + "step": 20010 + }, + { + "epoch": 11.179329608938547, + "grad_norm": 0.45475879311561584, + "learning_rate": 0.00044260504201680675, + "loss": 0.4255, + "step": 20011 + }, + { + "epoch": 11.179888268156425, + "grad_norm": 0.6086148619651794, + "learning_rate": 0.0004425770308123249, + "loss": 0.482, + "step": 20012 + }, + { + "epoch": 11.180446927374302, + "grad_norm": 0.5118682980537415, + "learning_rate": 0.00044254901960784316, + "loss": 0.4234, + "step": 20013 + }, + { + "epoch": 11.181005586592178, + "grad_norm": 0.44856396317481995, + "learning_rate": 0.00044252100840336137, + "loss": 0.3857, + "step": 20014 + }, + { + "epoch": 11.181564245810057, + "grad_norm": 0.6991450190544128, + "learning_rate": 0.0004424929971988796, + "loss": 0.4822, + "step": 20015 + }, + { + "epoch": 11.182122905027933, + "grad_norm": 0.4700288474559784, + "learning_rate": 0.0004424649859943978, + "loss": 0.3751, + "step": 20016 + }, + { + "epoch": 11.18268156424581, + "grad_norm": 5.777561187744141, + "learning_rate": 0.00044243697478991593, + "loss": 0.4723, + "step": 20017 + }, + { + "epoch": 11.183240223463686, + "grad_norm": 1.6418412923812866, + "learning_rate": 0.0004424089635854342, + "loss": 0.4915, + "step": 20018 + }, + { + "epoch": 11.183798882681565, + "grad_norm": 0.7723186612129211, + "learning_rate": 0.0004423809523809524, + "loss": 0.4334, + "step": 20019 + }, + { + "epoch": 11.184357541899441, + "grad_norm": 2.2807466983795166, + "learning_rate": 0.0004423529411764706, + "loss": 0.5472, + "step": 20020 + }, + { + "epoch": 11.184916201117318, + "grad_norm": 9.813264846801758, + "learning_rate": 0.0004423249299719888, + "loss": 0.4824, + "step": 20021 + }, + { + "epoch": 11.185474860335196, + "grad_norm": 1.8531826734542847, + "learning_rate": 0.000442296918767507, + "loss": 0.437, + "step": 20022 + }, + { + "epoch": 11.186033519553073, + "grad_norm": 0.5479413866996765, + "learning_rate": 0.0004422689075630252, + "loss": 0.6534, + "step": 20023 + }, + { + "epoch": 11.18659217877095, + "grad_norm": 0.4835755527019501, + "learning_rate": 0.00044224089635854343, + "loss": 0.3739, + "step": 20024 + }, + { + "epoch": 11.187150837988828, + "grad_norm": 0.8052713871002197, + "learning_rate": 0.0004422128851540617, + "loss": 0.4332, + "step": 20025 + }, + { + "epoch": 11.187709497206704, + "grad_norm": 0.46861356496810913, + "learning_rate": 0.00044218487394957984, + "loss": 0.4136, + "step": 20026 + }, + { + "epoch": 11.18826815642458, + "grad_norm": 0.5138416290283203, + "learning_rate": 0.00044215686274509805, + "loss": 0.4212, + "step": 20027 + }, + { + "epoch": 11.188826815642457, + "grad_norm": 0.4061744213104248, + "learning_rate": 0.00044212885154061625, + "loss": 0.3101, + "step": 20028 + }, + { + "epoch": 11.189385474860336, + "grad_norm": 0.5533204078674316, + "learning_rate": 0.00044210084033613446, + "loss": 0.3935, + "step": 20029 + }, + { + "epoch": 11.189944134078212, + "grad_norm": 0.8668876886367798, + "learning_rate": 0.00044207282913165266, + "loss": 0.4078, + "step": 20030 + }, + { + "epoch": 11.190502793296089, + "grad_norm": 0.4431772530078888, + "learning_rate": 0.00044204481792717087, + "loss": 0.3593, + "step": 20031 + }, + { + "epoch": 11.191061452513967, + "grad_norm": 0.5002390146255493, + "learning_rate": 0.0004420168067226891, + "loss": 0.4997, + "step": 20032 + }, + { + "epoch": 11.191620111731844, + "grad_norm": 0.5600509643554688, + "learning_rate": 0.00044198879551820734, + "loss": 0.5281, + "step": 20033 + }, + { + "epoch": 11.19217877094972, + "grad_norm": 0.41320517659187317, + "learning_rate": 0.0004419607843137255, + "loss": 0.3685, + "step": 20034 + }, + { + "epoch": 11.192737430167599, + "grad_norm": 0.46651437878608704, + "learning_rate": 0.0004419327731092437, + "loss": 0.3518, + "step": 20035 + }, + { + "epoch": 11.193296089385475, + "grad_norm": 0.649560809135437, + "learning_rate": 0.0004419047619047619, + "loss": 0.3718, + "step": 20036 + }, + { + "epoch": 11.193854748603352, + "grad_norm": 1.3389005661010742, + "learning_rate": 0.0004418767507002801, + "loss": 0.4594, + "step": 20037 + }, + { + "epoch": 11.194413407821228, + "grad_norm": 3.4143636226654053, + "learning_rate": 0.00044184873949579837, + "loss": 0.4759, + "step": 20038 + }, + { + "epoch": 11.194972067039107, + "grad_norm": 1.9919544458389282, + "learning_rate": 0.0004418207282913165, + "loss": 0.3858, + "step": 20039 + }, + { + "epoch": 11.195530726256983, + "grad_norm": 1.37981379032135, + "learning_rate": 0.0004417927170868347, + "loss": 0.4451, + "step": 20040 + }, + { + "epoch": 11.19608938547486, + "grad_norm": 0.38582009077072144, + "learning_rate": 0.000441764705882353, + "loss": 0.428, + "step": 20041 + }, + { + "epoch": 11.196648044692738, + "grad_norm": 0.3976221978664398, + "learning_rate": 0.00044173669467787114, + "loss": 0.4256, + "step": 20042 + }, + { + "epoch": 11.197206703910615, + "grad_norm": 0.8136613965034485, + "learning_rate": 0.0004417086834733894, + "loss": 0.5439, + "step": 20043 + }, + { + "epoch": 11.197765363128491, + "grad_norm": 0.49499455094337463, + "learning_rate": 0.00044168067226890755, + "loss": 0.4703, + "step": 20044 + }, + { + "epoch": 11.19832402234637, + "grad_norm": 0.4361111521720886, + "learning_rate": 0.00044165266106442575, + "loss": 0.3807, + "step": 20045 + }, + { + "epoch": 11.198882681564246, + "grad_norm": 0.5261693000793457, + "learning_rate": 0.000441624649859944, + "loss": 0.2976, + "step": 20046 + }, + { + "epoch": 11.199441340782123, + "grad_norm": 0.6489242315292358, + "learning_rate": 0.00044159663865546217, + "loss": 0.418, + "step": 20047 + }, + { + "epoch": 11.2, + "grad_norm": 4.530487060546875, + "learning_rate": 0.0004415686274509804, + "loss": 0.7306, + "step": 20048 + }, + { + "epoch": 11.200558659217878, + "grad_norm": 0.4683583676815033, + "learning_rate": 0.00044154061624649863, + "loss": 0.4467, + "step": 20049 + }, + { + "epoch": 11.201117318435754, + "grad_norm": 2.0726184844970703, + "learning_rate": 0.0004415126050420168, + "loss": 0.3234, + "step": 20050 + }, + { + "epoch": 11.20167597765363, + "grad_norm": 0.5123502612113953, + "learning_rate": 0.00044148459383753504, + "loss": 0.5108, + "step": 20051 + }, + { + "epoch": 11.202234636871509, + "grad_norm": 0.4397718012332916, + "learning_rate": 0.0004414565826330532, + "loss": 0.4451, + "step": 20052 + }, + { + "epoch": 11.202793296089386, + "grad_norm": 0.4337514638900757, + "learning_rate": 0.00044142857142857146, + "loss": 0.4575, + "step": 20053 + }, + { + "epoch": 11.203351955307262, + "grad_norm": 0.39273107051849365, + "learning_rate": 0.00044140056022408966, + "loss": 0.498, + "step": 20054 + }, + { + "epoch": 11.203910614525139, + "grad_norm": 0.5106762647628784, + "learning_rate": 0.0004413725490196078, + "loss": 0.4031, + "step": 20055 + }, + { + "epoch": 11.204469273743017, + "grad_norm": 0.6422686576843262, + "learning_rate": 0.0004413445378151261, + "loss": 0.3635, + "step": 20056 + }, + { + "epoch": 11.205027932960894, + "grad_norm": 0.5077261924743652, + "learning_rate": 0.0004413165266106443, + "loss": 0.4331, + "step": 20057 + }, + { + "epoch": 11.20558659217877, + "grad_norm": 0.5573102831840515, + "learning_rate": 0.0004412885154061625, + "loss": 0.3637, + "step": 20058 + }, + { + "epoch": 11.206145251396649, + "grad_norm": 0.48876655101776123, + "learning_rate": 0.0004412605042016807, + "loss": 0.4376, + "step": 20059 + }, + { + "epoch": 11.206703910614525, + "grad_norm": 0.47710248827934265, + "learning_rate": 0.00044123249299719884, + "loss": 0.3698, + "step": 20060 + }, + { + "epoch": 11.207262569832402, + "grad_norm": 0.5610468983650208, + "learning_rate": 0.0004412044817927171, + "loss": 0.5781, + "step": 20061 + }, + { + "epoch": 11.20782122905028, + "grad_norm": 0.3984878957271576, + "learning_rate": 0.0004411764705882353, + "loss": 0.4171, + "step": 20062 + }, + { + "epoch": 11.208379888268157, + "grad_norm": 0.557945966720581, + "learning_rate": 0.0004411484593837535, + "loss": 0.3684, + "step": 20063 + }, + { + "epoch": 11.208938547486033, + "grad_norm": 2.785871982574463, + "learning_rate": 0.0004411204481792717, + "loss": 0.4418, + "step": 20064 + }, + { + "epoch": 11.20949720670391, + "grad_norm": 2.873032569885254, + "learning_rate": 0.00044109243697478993, + "loss": 0.4568, + "step": 20065 + }, + { + "epoch": 11.210055865921788, + "grad_norm": 0.5205723643302917, + "learning_rate": 0.00044106442577030813, + "loss": 0.4417, + "step": 20066 + }, + { + "epoch": 11.210614525139665, + "grad_norm": 0.5439926385879517, + "learning_rate": 0.00044103641456582634, + "loss": 0.4095, + "step": 20067 + }, + { + "epoch": 11.211173184357541, + "grad_norm": 0.46510207653045654, + "learning_rate": 0.00044100840336134455, + "loss": 0.49, + "step": 20068 + }, + { + "epoch": 11.21173184357542, + "grad_norm": 0.4424624741077423, + "learning_rate": 0.00044098039215686275, + "loss": 0.3856, + "step": 20069 + }, + { + "epoch": 11.212290502793296, + "grad_norm": 0.6569352746009827, + "learning_rate": 0.00044095238095238096, + "loss": 0.3589, + "step": 20070 + }, + { + "epoch": 11.212849162011173, + "grad_norm": 0.3509271740913391, + "learning_rate": 0.00044092436974789916, + "loss": 0.407, + "step": 20071 + }, + { + "epoch": 11.213407821229051, + "grad_norm": 0.45570892095565796, + "learning_rate": 0.00044089635854341737, + "loss": 0.4675, + "step": 20072 + }, + { + "epoch": 11.213966480446928, + "grad_norm": 0.8975301384925842, + "learning_rate": 0.00044086834733893563, + "loss": 0.448, + "step": 20073 + }, + { + "epoch": 11.214525139664804, + "grad_norm": 0.4648612141609192, + "learning_rate": 0.0004408403361344538, + "loss": 0.4848, + "step": 20074 + }, + { + "epoch": 11.21508379888268, + "grad_norm": 0.689265787601471, + "learning_rate": 0.000440812324929972, + "loss": 0.3949, + "step": 20075 + }, + { + "epoch": 11.21564245810056, + "grad_norm": 0.6003746390342712, + "learning_rate": 0.0004407843137254902, + "loss": 0.4982, + "step": 20076 + }, + { + "epoch": 11.216201117318436, + "grad_norm": 0.6151201128959656, + "learning_rate": 0.0004407563025210084, + "loss": 0.4687, + "step": 20077 + }, + { + "epoch": 11.216759776536312, + "grad_norm": 0.428731769323349, + "learning_rate": 0.00044072829131652666, + "loss": 0.4462, + "step": 20078 + }, + { + "epoch": 11.21731843575419, + "grad_norm": 0.5962857604026794, + "learning_rate": 0.0004407002801120448, + "loss": 0.3454, + "step": 20079 + }, + { + "epoch": 11.217877094972067, + "grad_norm": 0.35154932737350464, + "learning_rate": 0.000440672268907563, + "loss": 0.4113, + "step": 20080 + }, + { + "epoch": 11.218435754189944, + "grad_norm": 0.5738747715950012, + "learning_rate": 0.0004406442577030813, + "loss": 0.4235, + "step": 20081 + }, + { + "epoch": 11.21899441340782, + "grad_norm": 0.4769916236400604, + "learning_rate": 0.00044061624649859943, + "loss": 0.4319, + "step": 20082 + }, + { + "epoch": 11.219553072625699, + "grad_norm": 0.5487616658210754, + "learning_rate": 0.0004405882352941177, + "loss": 0.3855, + "step": 20083 + }, + { + "epoch": 11.220111731843575, + "grad_norm": 3.416422128677368, + "learning_rate": 0.00044056022408963584, + "loss": 0.4535, + "step": 20084 + }, + { + "epoch": 11.220670391061452, + "grad_norm": 0.45399385690689087, + "learning_rate": 0.00044053221288515405, + "loss": 0.3677, + "step": 20085 + }, + { + "epoch": 11.22122905027933, + "grad_norm": 0.3997017443180084, + "learning_rate": 0.0004405042016806723, + "loss": 0.4031, + "step": 20086 + }, + { + "epoch": 11.221787709497207, + "grad_norm": 0.6133577823638916, + "learning_rate": 0.00044047619047619046, + "loss": 0.4233, + "step": 20087 + }, + { + "epoch": 11.222346368715083, + "grad_norm": 0.5656393766403198, + "learning_rate": 0.0004404481792717087, + "loss": 0.4745, + "step": 20088 + }, + { + "epoch": 11.222905027932962, + "grad_norm": 0.42285895347595215, + "learning_rate": 0.0004404201680672269, + "loss": 0.3373, + "step": 20089 + }, + { + "epoch": 11.223463687150838, + "grad_norm": 0.747688353061676, + "learning_rate": 0.0004403921568627451, + "loss": 0.4509, + "step": 20090 + }, + { + "epoch": 11.224022346368715, + "grad_norm": 0.5643873810768127, + "learning_rate": 0.00044036414565826334, + "loss": 0.4337, + "step": 20091 + }, + { + "epoch": 11.224581005586591, + "grad_norm": 0.4267115592956543, + "learning_rate": 0.0004403361344537815, + "loss": 0.3616, + "step": 20092 + }, + { + "epoch": 11.22513966480447, + "grad_norm": 0.5259825587272644, + "learning_rate": 0.00044030812324929975, + "loss": 0.4448, + "step": 20093 + }, + { + "epoch": 11.225698324022346, + "grad_norm": 0.6425420641899109, + "learning_rate": 0.00044028011204481796, + "loss": 0.439, + "step": 20094 + }, + { + "epoch": 11.226256983240223, + "grad_norm": 0.5163062810897827, + "learning_rate": 0.0004402521008403361, + "loss": 0.498, + "step": 20095 + }, + { + "epoch": 11.226815642458101, + "grad_norm": 0.4639945924282074, + "learning_rate": 0.00044022408963585437, + "loss": 0.3496, + "step": 20096 + }, + { + "epoch": 11.227374301675978, + "grad_norm": 0.5147891044616699, + "learning_rate": 0.0004401960784313726, + "loss": 0.3996, + "step": 20097 + }, + { + "epoch": 11.227932960893854, + "grad_norm": 0.6439340114593506, + "learning_rate": 0.0004401680672268908, + "loss": 0.3965, + "step": 20098 + }, + { + "epoch": 11.228491620111733, + "grad_norm": 0.8488362431526184, + "learning_rate": 0.000440140056022409, + "loss": 0.475, + "step": 20099 + }, + { + "epoch": 11.22905027932961, + "grad_norm": 0.4989221394062042, + "learning_rate": 0.00044011204481792714, + "loss": 0.4403, + "step": 20100 + }, + { + "epoch": 11.229608938547486, + "grad_norm": 0.9705707430839539, + "learning_rate": 0.0004400840336134454, + "loss": 0.3873, + "step": 20101 + }, + { + "epoch": 11.230167597765362, + "grad_norm": 0.7108225226402283, + "learning_rate": 0.0004400560224089636, + "loss": 0.3668, + "step": 20102 + }, + { + "epoch": 11.23072625698324, + "grad_norm": 0.5318310856819153, + "learning_rate": 0.0004400280112044818, + "loss": 0.4534, + "step": 20103 + }, + { + "epoch": 11.231284916201117, + "grad_norm": 0.5211368799209595, + "learning_rate": 0.00044, + "loss": 0.6341, + "step": 20104 + }, + { + "epoch": 11.231843575418994, + "grad_norm": 0.5484535694122314, + "learning_rate": 0.0004399719887955182, + "loss": 0.4812, + "step": 20105 + }, + { + "epoch": 11.232402234636872, + "grad_norm": 0.5162181258201599, + "learning_rate": 0.00043994397759103643, + "loss": 0.6242, + "step": 20106 + }, + { + "epoch": 11.232960893854749, + "grad_norm": 0.5498502254486084, + "learning_rate": 0.00043991596638655463, + "loss": 0.5494, + "step": 20107 + }, + { + "epoch": 11.233519553072625, + "grad_norm": 0.9606100916862488, + "learning_rate": 0.00043988795518207284, + "loss": 0.4041, + "step": 20108 + }, + { + "epoch": 11.234078212290504, + "grad_norm": 0.7064275145530701, + "learning_rate": 0.00043985994397759105, + "loss": 0.4119, + "step": 20109 + }, + { + "epoch": 11.23463687150838, + "grad_norm": 0.5120725631713867, + "learning_rate": 0.00043983193277310925, + "loss": 0.4046, + "step": 20110 + }, + { + "epoch": 11.235195530726257, + "grad_norm": 0.4233984351158142, + "learning_rate": 0.00043980392156862746, + "loss": 0.4601, + "step": 20111 + }, + { + "epoch": 11.235754189944133, + "grad_norm": 0.47705525159835815, + "learning_rate": 0.00043977591036414566, + "loss": 0.4644, + "step": 20112 + }, + { + "epoch": 11.236312849162012, + "grad_norm": 0.5872893929481506, + "learning_rate": 0.0004397478991596639, + "loss": 0.5413, + "step": 20113 + }, + { + "epoch": 11.236871508379888, + "grad_norm": 0.4676712453365326, + "learning_rate": 0.0004397198879551821, + "loss": 0.3803, + "step": 20114 + }, + { + "epoch": 11.237430167597765, + "grad_norm": 2.7982687950134277, + "learning_rate": 0.0004396918767507003, + "loss": 0.3956, + "step": 20115 + }, + { + "epoch": 11.237988826815643, + "grad_norm": 0.3906034827232361, + "learning_rate": 0.0004396638655462185, + "loss": 0.3147, + "step": 20116 + }, + { + "epoch": 11.23854748603352, + "grad_norm": 0.4676041901111603, + "learning_rate": 0.0004396358543417367, + "loss": 0.4466, + "step": 20117 + }, + { + "epoch": 11.239106145251396, + "grad_norm": 1.093482494354248, + "learning_rate": 0.00043960784313725495, + "loss": 0.4907, + "step": 20118 + }, + { + "epoch": 11.239664804469275, + "grad_norm": 0.583071231842041, + "learning_rate": 0.0004395798319327731, + "loss": 0.403, + "step": 20119 + }, + { + "epoch": 11.240223463687151, + "grad_norm": 0.45677652955055237, + "learning_rate": 0.0004395518207282913, + "loss": 0.3397, + "step": 20120 + }, + { + "epoch": 11.240782122905028, + "grad_norm": 0.7649537324905396, + "learning_rate": 0.00043952380952380957, + "loss": 0.4074, + "step": 20121 + }, + { + "epoch": 11.241340782122904, + "grad_norm": 2.905608892440796, + "learning_rate": 0.0004394957983193277, + "loss": 0.4069, + "step": 20122 + }, + { + "epoch": 11.241899441340783, + "grad_norm": 2.4164202213287354, + "learning_rate": 0.000439467787114846, + "loss": 0.4268, + "step": 20123 + }, + { + "epoch": 11.24245810055866, + "grad_norm": 0.44573885202407837, + "learning_rate": 0.00043943977591036414, + "loss": 0.4055, + "step": 20124 + }, + { + "epoch": 11.243016759776536, + "grad_norm": 0.41281893849372864, + "learning_rate": 0.00043941176470588234, + "loss": 0.4484, + "step": 20125 + }, + { + "epoch": 11.243575418994414, + "grad_norm": 1.6746652126312256, + "learning_rate": 0.0004393837535014006, + "loss": 0.5086, + "step": 20126 + }, + { + "epoch": 11.24413407821229, + "grad_norm": 0.4273971617221832, + "learning_rate": 0.00043935574229691875, + "loss": 0.3718, + "step": 20127 + }, + { + "epoch": 11.244692737430167, + "grad_norm": 1.4274877309799194, + "learning_rate": 0.000439327731092437, + "loss": 0.4878, + "step": 20128 + }, + { + "epoch": 11.245251396648044, + "grad_norm": 0.44678208231925964, + "learning_rate": 0.0004392997198879552, + "loss": 0.3721, + "step": 20129 + }, + { + "epoch": 11.245810055865922, + "grad_norm": 0.4419698715209961, + "learning_rate": 0.00043927170868347337, + "loss": 0.3436, + "step": 20130 + }, + { + "epoch": 11.246368715083799, + "grad_norm": 0.4058476686477661, + "learning_rate": 0.00043924369747899163, + "loss": 0.3984, + "step": 20131 + }, + { + "epoch": 11.246927374301675, + "grad_norm": 0.4887247383594513, + "learning_rate": 0.0004392156862745098, + "loss": 0.3782, + "step": 20132 + }, + { + "epoch": 11.247486033519554, + "grad_norm": 0.6937375664710999, + "learning_rate": 0.00043918767507002804, + "loss": 0.4758, + "step": 20133 + }, + { + "epoch": 11.24804469273743, + "grad_norm": 0.49732986092567444, + "learning_rate": 0.00043915966386554625, + "loss": 0.5085, + "step": 20134 + }, + { + "epoch": 11.248603351955307, + "grad_norm": 0.6458745002746582, + "learning_rate": 0.0004391316526610644, + "loss": 0.3339, + "step": 20135 + }, + { + "epoch": 11.249162011173185, + "grad_norm": 0.555120587348938, + "learning_rate": 0.00043910364145658266, + "loss": 0.4745, + "step": 20136 + }, + { + "epoch": 11.249720670391062, + "grad_norm": 3.8175039291381836, + "learning_rate": 0.00043907563025210087, + "loss": 0.4063, + "step": 20137 + }, + { + "epoch": 11.250279329608938, + "grad_norm": 0.4628995358943939, + "learning_rate": 0.000439047619047619, + "loss": 0.4174, + "step": 20138 + }, + { + "epoch": 11.250837988826815, + "grad_norm": 0.44974204897880554, + "learning_rate": 0.0004390196078431373, + "loss": 0.3945, + "step": 20139 + }, + { + "epoch": 11.251396648044693, + "grad_norm": 6.7041425704956055, + "learning_rate": 0.00043899159663865543, + "loss": 0.3884, + "step": 20140 + }, + { + "epoch": 11.25195530726257, + "grad_norm": 0.4144757091999054, + "learning_rate": 0.0004389635854341737, + "loss": 0.4072, + "step": 20141 + }, + { + "epoch": 11.252513966480446, + "grad_norm": 0.5192364454269409, + "learning_rate": 0.0004389355742296919, + "loss": 0.4504, + "step": 20142 + }, + { + "epoch": 11.253072625698325, + "grad_norm": 0.8089028000831604, + "learning_rate": 0.00043890756302521005, + "loss": 0.3536, + "step": 20143 + }, + { + "epoch": 11.253631284916201, + "grad_norm": 0.42806190252304077, + "learning_rate": 0.0004388795518207283, + "loss": 0.3846, + "step": 20144 + }, + { + "epoch": 11.254189944134078, + "grad_norm": 0.6048259735107422, + "learning_rate": 0.0004388515406162465, + "loss": 0.2979, + "step": 20145 + }, + { + "epoch": 11.254748603351956, + "grad_norm": 0.4058989882469177, + "learning_rate": 0.0004388235294117647, + "loss": 0.4547, + "step": 20146 + }, + { + "epoch": 11.255307262569833, + "grad_norm": 0.3987780213356018, + "learning_rate": 0.00043879551820728293, + "loss": 0.3823, + "step": 20147 + }, + { + "epoch": 11.25586592178771, + "grad_norm": 0.41179659962654114, + "learning_rate": 0.0004387675070028011, + "loss": 0.4133, + "step": 20148 + }, + { + "epoch": 11.256424581005586, + "grad_norm": 0.5493605136871338, + "learning_rate": 0.00043873949579831934, + "loss": 0.4084, + "step": 20149 + }, + { + "epoch": 11.256983240223464, + "grad_norm": 1.3462769985198975, + "learning_rate": 0.00043871148459383755, + "loss": 0.4169, + "step": 20150 + }, + { + "epoch": 11.25754189944134, + "grad_norm": 0.4494118392467499, + "learning_rate": 0.00043868347338935575, + "loss": 0.4997, + "step": 20151 + }, + { + "epoch": 11.258100558659217, + "grad_norm": 0.842294454574585, + "learning_rate": 0.00043865546218487396, + "loss": 0.4013, + "step": 20152 + }, + { + "epoch": 11.258659217877096, + "grad_norm": 0.5919236540794373, + "learning_rate": 0.00043862745098039216, + "loss": 0.4914, + "step": 20153 + }, + { + "epoch": 11.259217877094972, + "grad_norm": 0.4723402261734009, + "learning_rate": 0.00043859943977591037, + "loss": 0.4451, + "step": 20154 + }, + { + "epoch": 11.259776536312849, + "grad_norm": 1.2802542448043823, + "learning_rate": 0.0004385714285714286, + "loss": 0.4232, + "step": 20155 + }, + { + "epoch": 11.260335195530725, + "grad_norm": 0.5456537008285522, + "learning_rate": 0.0004385434173669468, + "loss": 0.3986, + "step": 20156 + }, + { + "epoch": 11.260893854748604, + "grad_norm": 0.42404642701148987, + "learning_rate": 0.000438515406162465, + "loss": 0.4002, + "step": 20157 + }, + { + "epoch": 11.26145251396648, + "grad_norm": 0.4189004600048065, + "learning_rate": 0.0004384873949579832, + "loss": 0.4459, + "step": 20158 + }, + { + "epoch": 11.262011173184357, + "grad_norm": 0.43470510840415955, + "learning_rate": 0.0004384593837535014, + "loss": 0.4298, + "step": 20159 + }, + { + "epoch": 11.262569832402235, + "grad_norm": 0.787608802318573, + "learning_rate": 0.0004384313725490196, + "loss": 0.399, + "step": 20160 + }, + { + "epoch": 11.263128491620112, + "grad_norm": 0.509988009929657, + "learning_rate": 0.00043840336134453787, + "loss": 0.4686, + "step": 20161 + }, + { + "epoch": 11.263687150837988, + "grad_norm": 0.6208260655403137, + "learning_rate": 0.000438375350140056, + "loss": 0.4392, + "step": 20162 + }, + { + "epoch": 11.264245810055867, + "grad_norm": 0.40109121799468994, + "learning_rate": 0.0004383473389355742, + "loss": 0.3987, + "step": 20163 + }, + { + "epoch": 11.264804469273743, + "grad_norm": 2.0710699558258057, + "learning_rate": 0.00043831932773109243, + "loss": 0.723, + "step": 20164 + }, + { + "epoch": 11.26536312849162, + "grad_norm": 2.471428871154785, + "learning_rate": 0.00043829131652661064, + "loss": 0.403, + "step": 20165 + }, + { + "epoch": 11.265921787709496, + "grad_norm": 0.6072873473167419, + "learning_rate": 0.0004382633053221289, + "loss": 0.456, + "step": 20166 + }, + { + "epoch": 11.266480446927375, + "grad_norm": 0.38765987753868103, + "learning_rate": 0.00043823529411764705, + "loss": 0.3871, + "step": 20167 + }, + { + "epoch": 11.267039106145251, + "grad_norm": 0.4405870735645294, + "learning_rate": 0.00043820728291316525, + "loss": 0.5005, + "step": 20168 + }, + { + "epoch": 11.267597765363128, + "grad_norm": 1.8352257013320923, + "learning_rate": 0.0004381792717086835, + "loss": 0.3751, + "step": 20169 + }, + { + "epoch": 11.268156424581006, + "grad_norm": 1.0349719524383545, + "learning_rate": 0.00043815126050420167, + "loss": 0.4266, + "step": 20170 + }, + { + "epoch": 11.268715083798883, + "grad_norm": 0.558402419090271, + "learning_rate": 0.0004381232492997199, + "loss": 0.3411, + "step": 20171 + }, + { + "epoch": 11.26927374301676, + "grad_norm": 0.4799840450286865, + "learning_rate": 0.0004380952380952381, + "loss": 0.4086, + "step": 20172 + }, + { + "epoch": 11.269832402234638, + "grad_norm": 0.611286997795105, + "learning_rate": 0.0004380672268907563, + "loss": 0.5257, + "step": 20173 + }, + { + "epoch": 11.270391061452514, + "grad_norm": 0.5433061122894287, + "learning_rate": 0.00043803921568627454, + "loss": 0.4249, + "step": 20174 + }, + { + "epoch": 11.27094972067039, + "grad_norm": 0.6384894251823425, + "learning_rate": 0.0004380112044817927, + "loss": 0.4175, + "step": 20175 + }, + { + "epoch": 11.271508379888267, + "grad_norm": 0.6537908911705017, + "learning_rate": 0.00043798319327731096, + "loss": 0.6883, + "step": 20176 + }, + { + "epoch": 11.272067039106146, + "grad_norm": 0.8897542953491211, + "learning_rate": 0.00043795518207282916, + "loss": 0.4312, + "step": 20177 + }, + { + "epoch": 11.272625698324022, + "grad_norm": 0.3867975175380707, + "learning_rate": 0.0004379271708683473, + "loss": 0.4023, + "step": 20178 + }, + { + "epoch": 11.273184357541899, + "grad_norm": 0.8229814767837524, + "learning_rate": 0.0004378991596638656, + "loss": 0.3449, + "step": 20179 + }, + { + "epoch": 11.273743016759777, + "grad_norm": 1.3613076210021973, + "learning_rate": 0.0004378711484593837, + "loss": 0.3285, + "step": 20180 + }, + { + "epoch": 11.274301675977654, + "grad_norm": 0.5792914628982544, + "learning_rate": 0.000437843137254902, + "loss": 0.4653, + "step": 20181 + }, + { + "epoch": 11.27486033519553, + "grad_norm": 0.6109485030174255, + "learning_rate": 0.0004378151260504202, + "loss": 0.4795, + "step": 20182 + }, + { + "epoch": 11.275418994413409, + "grad_norm": 0.4183536469936371, + "learning_rate": 0.00043778711484593834, + "loss": 0.3641, + "step": 20183 + }, + { + "epoch": 11.275977653631285, + "grad_norm": 0.41660964488983154, + "learning_rate": 0.0004377591036414566, + "loss": 0.4267, + "step": 20184 + }, + { + "epoch": 11.276536312849162, + "grad_norm": 7.3677144050598145, + "learning_rate": 0.0004377310924369748, + "loss": 0.4426, + "step": 20185 + }, + { + "epoch": 11.277094972067038, + "grad_norm": 0.575406014919281, + "learning_rate": 0.000437703081232493, + "loss": 0.4732, + "step": 20186 + }, + { + "epoch": 11.277653631284917, + "grad_norm": 0.39118093252182007, + "learning_rate": 0.0004376750700280112, + "loss": 0.4539, + "step": 20187 + }, + { + "epoch": 11.278212290502793, + "grad_norm": 0.5141621828079224, + "learning_rate": 0.0004376470588235294, + "loss": 0.4559, + "step": 20188 + }, + { + "epoch": 11.27877094972067, + "grad_norm": 0.818881094455719, + "learning_rate": 0.00043761904761904763, + "loss": 0.4649, + "step": 20189 + }, + { + "epoch": 11.279329608938548, + "grad_norm": 0.6279159188270569, + "learning_rate": 0.00043759103641456584, + "loss": 0.4411, + "step": 20190 + }, + { + "epoch": 11.279888268156425, + "grad_norm": 0.4050810635089874, + "learning_rate": 0.00043756302521008405, + "loss": 0.4597, + "step": 20191 + }, + { + "epoch": 11.280446927374301, + "grad_norm": 0.7124335169792175, + "learning_rate": 0.00043753501400560225, + "loss": 0.4177, + "step": 20192 + }, + { + "epoch": 11.28100558659218, + "grad_norm": 0.6197972893714905, + "learning_rate": 0.00043750700280112046, + "loss": 0.3634, + "step": 20193 + }, + { + "epoch": 11.281564245810056, + "grad_norm": 0.637822687625885, + "learning_rate": 0.00043747899159663866, + "loss": 0.5034, + "step": 20194 + }, + { + "epoch": 11.282122905027933, + "grad_norm": 0.563567042350769, + "learning_rate": 0.00043745098039215687, + "loss": 0.4783, + "step": 20195 + }, + { + "epoch": 11.28268156424581, + "grad_norm": 0.8783479332923889, + "learning_rate": 0.0004374229691876751, + "loss": 0.382, + "step": 20196 + }, + { + "epoch": 11.283240223463688, + "grad_norm": 0.854576587677002, + "learning_rate": 0.0004373949579831933, + "loss": 0.6866, + "step": 20197 + }, + { + "epoch": 11.283798882681564, + "grad_norm": 0.5092287659645081, + "learning_rate": 0.0004373669467787115, + "loss": 0.4377, + "step": 20198 + }, + { + "epoch": 11.28435754189944, + "grad_norm": 0.37525826692581177, + "learning_rate": 0.0004373389355742297, + "loss": 0.4989, + "step": 20199 + }, + { + "epoch": 11.28491620111732, + "grad_norm": 0.5437232851982117, + "learning_rate": 0.0004373109243697479, + "loss": 0.6531, + "step": 20200 + }, + { + "epoch": 11.285474860335196, + "grad_norm": 0.7121294736862183, + "learning_rate": 0.00043728291316526616, + "loss": 0.6267, + "step": 20201 + }, + { + "epoch": 11.286033519553072, + "grad_norm": 0.41168221831321716, + "learning_rate": 0.0004372549019607843, + "loss": 0.437, + "step": 20202 + }, + { + "epoch": 11.286592178770949, + "grad_norm": 0.4956684112548828, + "learning_rate": 0.0004372268907563025, + "loss": 0.4571, + "step": 20203 + }, + { + "epoch": 11.287150837988827, + "grad_norm": 0.46635955572128296, + "learning_rate": 0.0004371988795518207, + "loss": 0.3708, + "step": 20204 + }, + { + "epoch": 11.287709497206704, + "grad_norm": 0.5189955234527588, + "learning_rate": 0.00043717086834733893, + "loss": 0.4038, + "step": 20205 + }, + { + "epoch": 11.28826815642458, + "grad_norm": 0.6829469799995422, + "learning_rate": 0.0004371428571428572, + "loss": 0.7769, + "step": 20206 + }, + { + "epoch": 11.288826815642459, + "grad_norm": 0.3898891806602478, + "learning_rate": 0.00043711484593837534, + "loss": 0.3301, + "step": 20207 + }, + { + "epoch": 11.289385474860335, + "grad_norm": 1.0952848196029663, + "learning_rate": 0.00043708683473389355, + "loss": 0.575, + "step": 20208 + }, + { + "epoch": 11.289944134078212, + "grad_norm": 0.4770362675189972, + "learning_rate": 0.0004370588235294118, + "loss": 0.3375, + "step": 20209 + }, + { + "epoch": 11.29050279329609, + "grad_norm": 0.46579399704933167, + "learning_rate": 0.00043703081232492996, + "loss": 0.3755, + "step": 20210 + }, + { + "epoch": 11.291061452513967, + "grad_norm": 4.949335098266602, + "learning_rate": 0.0004370028011204482, + "loss": 0.4253, + "step": 20211 + }, + { + "epoch": 11.291620111731843, + "grad_norm": 0.48682326078414917, + "learning_rate": 0.00043697478991596637, + "loss": 0.4022, + "step": 20212 + }, + { + "epoch": 11.29217877094972, + "grad_norm": 0.573822557926178, + "learning_rate": 0.0004369467787114846, + "loss": 0.4693, + "step": 20213 + }, + { + "epoch": 11.292737430167598, + "grad_norm": 0.4340304732322693, + "learning_rate": 0.00043691876750700284, + "loss": 0.3648, + "step": 20214 + }, + { + "epoch": 11.293296089385475, + "grad_norm": 0.8414255976676941, + "learning_rate": 0.000436890756302521, + "loss": 0.4972, + "step": 20215 + }, + { + "epoch": 11.293854748603351, + "grad_norm": 0.6106770634651184, + "learning_rate": 0.00043686274509803925, + "loss": 0.4185, + "step": 20216 + }, + { + "epoch": 11.29441340782123, + "grad_norm": 0.47612854838371277, + "learning_rate": 0.00043683473389355746, + "loss": 0.4969, + "step": 20217 + }, + { + "epoch": 11.294972067039106, + "grad_norm": 0.5907502174377441, + "learning_rate": 0.0004368067226890756, + "loss": 0.5458, + "step": 20218 + }, + { + "epoch": 11.295530726256983, + "grad_norm": 0.7037196755409241, + "learning_rate": 0.00043677871148459387, + "loss": 0.3775, + "step": 20219 + }, + { + "epoch": 11.296089385474861, + "grad_norm": 0.8624817728996277, + "learning_rate": 0.000436750700280112, + "loss": 0.3347, + "step": 20220 + }, + { + "epoch": 11.296648044692738, + "grad_norm": 0.42462775111198425, + "learning_rate": 0.0004367226890756303, + "loss": 0.4554, + "step": 20221 + }, + { + "epoch": 11.297206703910614, + "grad_norm": 0.4273512661457062, + "learning_rate": 0.0004366946778711485, + "loss": 0.4585, + "step": 20222 + }, + { + "epoch": 11.297765363128491, + "grad_norm": 0.8187879323959351, + "learning_rate": 0.00043666666666666664, + "loss": 0.4186, + "step": 20223 + }, + { + "epoch": 11.29832402234637, + "grad_norm": 1.2574671506881714, + "learning_rate": 0.0004366386554621849, + "loss": 0.4469, + "step": 20224 + }, + { + "epoch": 11.298882681564246, + "grad_norm": 0.5985570549964905, + "learning_rate": 0.0004366106442577031, + "loss": 0.3576, + "step": 20225 + }, + { + "epoch": 11.299441340782122, + "grad_norm": 0.47140201926231384, + "learning_rate": 0.0004365826330532213, + "loss": 0.4792, + "step": 20226 + }, + { + "epoch": 11.3, + "grad_norm": 0.8115884065628052, + "learning_rate": 0.0004365546218487395, + "loss": 0.5552, + "step": 20227 + }, + { + "epoch": 11.300558659217877, + "grad_norm": 0.4647372364997864, + "learning_rate": 0.00043652661064425767, + "loss": 0.4706, + "step": 20228 + }, + { + "epoch": 11.301117318435754, + "grad_norm": 1.0155870914459229, + "learning_rate": 0.00043649859943977593, + "loss": 0.4154, + "step": 20229 + }, + { + "epoch": 11.30167597765363, + "grad_norm": 0.7085390686988831, + "learning_rate": 0.00043647058823529413, + "loss": 0.446, + "step": 20230 + }, + { + "epoch": 11.302234636871509, + "grad_norm": 1.5133395195007324, + "learning_rate": 0.00043644257703081234, + "loss": 0.3566, + "step": 20231 + }, + { + "epoch": 11.302793296089385, + "grad_norm": 1.619756817817688, + "learning_rate": 0.00043641456582633055, + "loss": 0.5994, + "step": 20232 + }, + { + "epoch": 11.303351955307262, + "grad_norm": 0.45566222071647644, + "learning_rate": 0.00043638655462184875, + "loss": 0.5035, + "step": 20233 + }, + { + "epoch": 11.30391061452514, + "grad_norm": 0.4176664352416992, + "learning_rate": 0.00043635854341736696, + "loss": 0.4385, + "step": 20234 + }, + { + "epoch": 11.304469273743017, + "grad_norm": 0.9250991940498352, + "learning_rate": 0.00043633053221288516, + "loss": 0.4199, + "step": 20235 + }, + { + "epoch": 11.305027932960893, + "grad_norm": 0.34724947810173035, + "learning_rate": 0.0004363025210084034, + "loss": 0.3563, + "step": 20236 + }, + { + "epoch": 11.305586592178772, + "grad_norm": 0.7343859076499939, + "learning_rate": 0.0004362745098039216, + "loss": 0.5057, + "step": 20237 + }, + { + "epoch": 11.306145251396648, + "grad_norm": 0.3646984100341797, + "learning_rate": 0.0004362464985994398, + "loss": 0.317, + "step": 20238 + }, + { + "epoch": 11.306703910614525, + "grad_norm": 0.4432050585746765, + "learning_rate": 0.000436218487394958, + "loss": 0.4004, + "step": 20239 + }, + { + "epoch": 11.307262569832401, + "grad_norm": 1.527118444442749, + "learning_rate": 0.0004361904761904762, + "loss": 0.4745, + "step": 20240 + }, + { + "epoch": 11.30782122905028, + "grad_norm": 0.5312463641166687, + "learning_rate": 0.00043616246498599445, + "loss": 0.564, + "step": 20241 + }, + { + "epoch": 11.308379888268156, + "grad_norm": 0.4000355005264282, + "learning_rate": 0.0004361344537815126, + "loss": 0.3696, + "step": 20242 + }, + { + "epoch": 11.308938547486033, + "grad_norm": 0.7436264753341675, + "learning_rate": 0.0004361064425770308, + "loss": 0.333, + "step": 20243 + }, + { + "epoch": 11.309497206703911, + "grad_norm": 0.4654131233692169, + "learning_rate": 0.00043607843137254907, + "loss": 0.4086, + "step": 20244 + }, + { + "epoch": 11.310055865921788, + "grad_norm": 0.5024712681770325, + "learning_rate": 0.0004360504201680672, + "loss": 0.3552, + "step": 20245 + }, + { + "epoch": 11.310614525139664, + "grad_norm": 0.2869241237640381, + "learning_rate": 0.0004360224089635855, + "loss": 0.2277, + "step": 20246 + }, + { + "epoch": 11.311173184357543, + "grad_norm": 0.431352436542511, + "learning_rate": 0.00043599439775910364, + "loss": 0.4093, + "step": 20247 + }, + { + "epoch": 11.31173184357542, + "grad_norm": 0.5880358815193176, + "learning_rate": 0.00043596638655462184, + "loss": 0.4375, + "step": 20248 + }, + { + "epoch": 11.312290502793296, + "grad_norm": 1.8696519136428833, + "learning_rate": 0.0004359383753501401, + "loss": 0.5504, + "step": 20249 + }, + { + "epoch": 11.312849162011172, + "grad_norm": 1.9979270696640015, + "learning_rate": 0.00043591036414565825, + "loss": 0.3871, + "step": 20250 + }, + { + "epoch": 11.31340782122905, + "grad_norm": 0.6203386187553406, + "learning_rate": 0.00043588235294117646, + "loss": 0.437, + "step": 20251 + }, + { + "epoch": 11.313966480446927, + "grad_norm": 0.926114559173584, + "learning_rate": 0.0004358543417366947, + "loss": 0.4037, + "step": 20252 + }, + { + "epoch": 11.314525139664804, + "grad_norm": 0.5830777883529663, + "learning_rate": 0.00043582633053221287, + "loss": 0.4406, + "step": 20253 + }, + { + "epoch": 11.315083798882682, + "grad_norm": 0.6230756640434265, + "learning_rate": 0.00043579831932773113, + "loss": 0.3607, + "step": 20254 + }, + { + "epoch": 11.315642458100559, + "grad_norm": 0.730209231376648, + "learning_rate": 0.0004357703081232493, + "loss": 0.4044, + "step": 20255 + }, + { + "epoch": 11.316201117318435, + "grad_norm": 0.36078113317489624, + "learning_rate": 0.0004357422969187675, + "loss": 0.4099, + "step": 20256 + }, + { + "epoch": 11.316759776536314, + "grad_norm": 0.9884338974952698, + "learning_rate": 0.00043571428571428575, + "loss": 0.4508, + "step": 20257 + }, + { + "epoch": 11.31731843575419, + "grad_norm": 25.895471572875977, + "learning_rate": 0.0004356862745098039, + "loss": 0.3458, + "step": 20258 + }, + { + "epoch": 11.317877094972067, + "grad_norm": 0.5229678750038147, + "learning_rate": 0.00043565826330532216, + "loss": 0.4585, + "step": 20259 + }, + { + "epoch": 11.318435754189943, + "grad_norm": 0.4289412200450897, + "learning_rate": 0.00043563025210084037, + "loss": 0.521, + "step": 20260 + }, + { + "epoch": 11.318994413407822, + "grad_norm": 0.6651564836502075, + "learning_rate": 0.0004356022408963585, + "loss": 0.4817, + "step": 20261 + }, + { + "epoch": 11.319553072625698, + "grad_norm": 0.5189133286476135, + "learning_rate": 0.0004355742296918768, + "loss": 0.5234, + "step": 20262 + }, + { + "epoch": 11.320111731843575, + "grad_norm": 1.1015655994415283, + "learning_rate": 0.00043554621848739493, + "loss": 0.4493, + "step": 20263 + }, + { + "epoch": 11.320670391061453, + "grad_norm": 0.3143925666809082, + "learning_rate": 0.0004355182072829132, + "loss": 0.3268, + "step": 20264 + }, + { + "epoch": 11.32122905027933, + "grad_norm": 0.7230599522590637, + "learning_rate": 0.0004354901960784314, + "loss": 0.3853, + "step": 20265 + }, + { + "epoch": 11.321787709497206, + "grad_norm": 1.5055962800979614, + "learning_rate": 0.00043546218487394955, + "loss": 0.4847, + "step": 20266 + }, + { + "epoch": 11.322346368715085, + "grad_norm": 0.4309387505054474, + "learning_rate": 0.0004354341736694678, + "loss": 0.3525, + "step": 20267 + }, + { + "epoch": 11.322905027932961, + "grad_norm": 0.5553687810897827, + "learning_rate": 0.000435406162464986, + "loss": 0.4343, + "step": 20268 + }, + { + "epoch": 11.323463687150838, + "grad_norm": 5.23468017578125, + "learning_rate": 0.0004353781512605042, + "loss": 0.4469, + "step": 20269 + }, + { + "epoch": 11.324022346368714, + "grad_norm": 0.4122627377510071, + "learning_rate": 0.00043535014005602243, + "loss": 0.3864, + "step": 20270 + }, + { + "epoch": 11.324581005586593, + "grad_norm": 0.9588366150856018, + "learning_rate": 0.0004353221288515406, + "loss": 0.4719, + "step": 20271 + }, + { + "epoch": 11.32513966480447, + "grad_norm": 0.5181857943534851, + "learning_rate": 0.00043529411764705884, + "loss": 0.4342, + "step": 20272 + }, + { + "epoch": 11.325698324022346, + "grad_norm": 0.8975045680999756, + "learning_rate": 0.00043526610644257705, + "loss": 0.4456, + "step": 20273 + }, + { + "epoch": 11.326256983240224, + "grad_norm": 0.5405422449111938, + "learning_rate": 0.00043523809523809525, + "loss": 0.3905, + "step": 20274 + }, + { + "epoch": 11.3268156424581, + "grad_norm": 0.8306060433387756, + "learning_rate": 0.00043521008403361346, + "loss": 0.3984, + "step": 20275 + }, + { + "epoch": 11.327374301675977, + "grad_norm": 0.504985511302948, + "learning_rate": 0.00043518207282913166, + "loss": 0.4037, + "step": 20276 + }, + { + "epoch": 11.327932960893854, + "grad_norm": 0.4910610318183899, + "learning_rate": 0.00043515406162464987, + "loss": 0.4551, + "step": 20277 + }, + { + "epoch": 11.328491620111732, + "grad_norm": 0.712019681930542, + "learning_rate": 0.0004351260504201681, + "loss": 0.506, + "step": 20278 + }, + { + "epoch": 11.329050279329609, + "grad_norm": 0.44327428936958313, + "learning_rate": 0.0004350980392156863, + "loss": 0.3396, + "step": 20279 + }, + { + "epoch": 11.329608938547485, + "grad_norm": 2.1232106685638428, + "learning_rate": 0.0004350700280112045, + "loss": 0.3874, + "step": 20280 + }, + { + "epoch": 11.330167597765364, + "grad_norm": 0.4614521265029907, + "learning_rate": 0.0004350420168067227, + "loss": 0.4225, + "step": 20281 + }, + { + "epoch": 11.33072625698324, + "grad_norm": 0.3407973647117615, + "learning_rate": 0.0004350140056022409, + "loss": 0.4285, + "step": 20282 + }, + { + "epoch": 11.331284916201117, + "grad_norm": 0.46013057231903076, + "learning_rate": 0.0004349859943977591, + "loss": 0.4959, + "step": 20283 + }, + { + "epoch": 11.331843575418995, + "grad_norm": 0.5374694466590881, + "learning_rate": 0.00043495798319327737, + "loss": 0.4444, + "step": 20284 + }, + { + "epoch": 11.332402234636872, + "grad_norm": 0.4497166872024536, + "learning_rate": 0.0004349299719887955, + "loss": 0.4405, + "step": 20285 + }, + { + "epoch": 11.332960893854748, + "grad_norm": 4.576594352722168, + "learning_rate": 0.0004349019607843137, + "loss": 0.3169, + "step": 20286 + }, + { + "epoch": 11.333519553072625, + "grad_norm": 0.4949341416358948, + "learning_rate": 0.00043487394957983193, + "loss": 0.4425, + "step": 20287 + }, + { + "epoch": 11.334078212290503, + "grad_norm": 0.5314093828201294, + "learning_rate": 0.00043484593837535014, + "loss": 0.4037, + "step": 20288 + }, + { + "epoch": 11.33463687150838, + "grad_norm": 0.5761443376541138, + "learning_rate": 0.0004348179271708684, + "loss": 0.3584, + "step": 20289 + }, + { + "epoch": 11.335195530726256, + "grad_norm": 0.7590041756629944, + "learning_rate": 0.00043478991596638655, + "loss": 0.4299, + "step": 20290 + }, + { + "epoch": 11.335754189944135, + "grad_norm": 0.8453172445297241, + "learning_rate": 0.00043476190476190475, + "loss": 0.4523, + "step": 20291 + }, + { + "epoch": 11.336312849162011, + "grad_norm": 0.3963601291179657, + "learning_rate": 0.000434733893557423, + "loss": 0.527, + "step": 20292 + }, + { + "epoch": 11.336871508379888, + "grad_norm": 0.4410884976387024, + "learning_rate": 0.00043470588235294117, + "loss": 0.4116, + "step": 20293 + }, + { + "epoch": 11.337430167597766, + "grad_norm": 0.6327182054519653, + "learning_rate": 0.0004346778711484594, + "loss": 0.3284, + "step": 20294 + }, + { + "epoch": 11.337988826815643, + "grad_norm": 0.5186989903450012, + "learning_rate": 0.0004346498599439776, + "loss": 0.3263, + "step": 20295 + }, + { + "epoch": 11.33854748603352, + "grad_norm": 0.8051605820655823, + "learning_rate": 0.0004346218487394958, + "loss": 0.399, + "step": 20296 + }, + { + "epoch": 11.339106145251396, + "grad_norm": 0.4688992500305176, + "learning_rate": 0.00043459383753501404, + "loss": 0.4468, + "step": 20297 + }, + { + "epoch": 11.339664804469274, + "grad_norm": 0.47650954127311707, + "learning_rate": 0.0004345658263305322, + "loss": 0.4508, + "step": 20298 + }, + { + "epoch": 11.34022346368715, + "grad_norm": 0.42817622423171997, + "learning_rate": 0.00043453781512605046, + "loss": 0.3343, + "step": 20299 + }, + { + "epoch": 11.340782122905027, + "grad_norm": 0.42224690318107605, + "learning_rate": 0.00043450980392156866, + "loss": 0.5413, + "step": 20300 + }, + { + "epoch": 11.341340782122906, + "grad_norm": 0.8090122938156128, + "learning_rate": 0.0004344817927170868, + "loss": 0.4035, + "step": 20301 + }, + { + "epoch": 11.341899441340782, + "grad_norm": 0.5524550080299377, + "learning_rate": 0.0004344537815126051, + "loss": 0.4138, + "step": 20302 + }, + { + "epoch": 11.342458100558659, + "grad_norm": 0.550574779510498, + "learning_rate": 0.0004344257703081232, + "loss": 0.4671, + "step": 20303 + }, + { + "epoch": 11.343016759776535, + "grad_norm": 0.560962438583374, + "learning_rate": 0.0004343977591036415, + "loss": 0.3193, + "step": 20304 + }, + { + "epoch": 11.343575418994414, + "grad_norm": 1.720365285873413, + "learning_rate": 0.0004343697478991597, + "loss": 0.3345, + "step": 20305 + }, + { + "epoch": 11.34413407821229, + "grad_norm": 0.6047013998031616, + "learning_rate": 0.00043434173669467784, + "loss": 0.4359, + "step": 20306 + }, + { + "epoch": 11.344692737430167, + "grad_norm": 0.4994995892047882, + "learning_rate": 0.0004343137254901961, + "loss": 0.3999, + "step": 20307 + }, + { + "epoch": 11.345251396648045, + "grad_norm": 1.1507071256637573, + "learning_rate": 0.0004342857142857143, + "loss": 0.3648, + "step": 20308 + }, + { + "epoch": 11.345810055865922, + "grad_norm": 0.5431200265884399, + "learning_rate": 0.0004342577030812325, + "loss": 0.483, + "step": 20309 + }, + { + "epoch": 11.346368715083798, + "grad_norm": 1.2475614547729492, + "learning_rate": 0.0004342296918767507, + "loss": 0.3695, + "step": 20310 + }, + { + "epoch": 11.346927374301677, + "grad_norm": 0.4749751091003418, + "learning_rate": 0.0004342016806722689, + "loss": 0.3798, + "step": 20311 + }, + { + "epoch": 11.347486033519553, + "grad_norm": 0.4504222869873047, + "learning_rate": 0.00043417366946778713, + "loss": 0.3549, + "step": 20312 + }, + { + "epoch": 11.34804469273743, + "grad_norm": 1.5910853147506714, + "learning_rate": 0.00043414565826330534, + "loss": 0.4002, + "step": 20313 + }, + { + "epoch": 11.348603351955306, + "grad_norm": 0.6830914616584778, + "learning_rate": 0.00043411764705882355, + "loss": 0.3999, + "step": 20314 + }, + { + "epoch": 11.349162011173185, + "grad_norm": 0.5797119140625, + "learning_rate": 0.00043408963585434175, + "loss": 0.4553, + "step": 20315 + }, + { + "epoch": 11.349720670391061, + "grad_norm": 0.6772024631500244, + "learning_rate": 0.00043406162464985996, + "loss": 0.3756, + "step": 20316 + }, + { + "epoch": 11.350279329608938, + "grad_norm": 0.4456321895122528, + "learning_rate": 0.00043403361344537816, + "loss": 0.3555, + "step": 20317 + }, + { + "epoch": 11.350837988826816, + "grad_norm": 0.5499882698059082, + "learning_rate": 0.00043400560224089637, + "loss": 0.357, + "step": 20318 + }, + { + "epoch": 11.351396648044693, + "grad_norm": 0.7099266648292542, + "learning_rate": 0.0004339775910364146, + "loss": 0.4934, + "step": 20319 + }, + { + "epoch": 11.35195530726257, + "grad_norm": 0.48719528317451477, + "learning_rate": 0.0004339495798319328, + "loss": 0.3907, + "step": 20320 + }, + { + "epoch": 11.352513966480448, + "grad_norm": 0.48918482661247253, + "learning_rate": 0.000433921568627451, + "loss": 0.4743, + "step": 20321 + }, + { + "epoch": 11.353072625698324, + "grad_norm": 0.4085235893726349, + "learning_rate": 0.0004338935574229692, + "loss": 0.4434, + "step": 20322 + }, + { + "epoch": 11.3536312849162, + "grad_norm": 0.7279258370399475, + "learning_rate": 0.0004338655462184874, + "loss": 0.5687, + "step": 20323 + }, + { + "epoch": 11.354189944134077, + "grad_norm": 0.48025304079055786, + "learning_rate": 0.00043383753501400566, + "loss": 0.3214, + "step": 20324 + }, + { + "epoch": 11.354748603351956, + "grad_norm": 0.4051593840122223, + "learning_rate": 0.0004338095238095238, + "loss": 0.4089, + "step": 20325 + }, + { + "epoch": 11.355307262569832, + "grad_norm": 1.1692802906036377, + "learning_rate": 0.000433781512605042, + "loss": 0.3266, + "step": 20326 + }, + { + "epoch": 11.355865921787709, + "grad_norm": 0.6688805818557739, + "learning_rate": 0.0004337535014005602, + "loss": 0.6353, + "step": 20327 + }, + { + "epoch": 11.356424581005587, + "grad_norm": 0.4597741365432739, + "learning_rate": 0.00043372549019607843, + "loss": 0.4367, + "step": 20328 + }, + { + "epoch": 11.356983240223464, + "grad_norm": 0.45323848724365234, + "learning_rate": 0.0004336974789915967, + "loss": 0.4132, + "step": 20329 + }, + { + "epoch": 11.35754189944134, + "grad_norm": 0.5354863405227661, + "learning_rate": 0.00043366946778711484, + "loss": 0.5552, + "step": 20330 + }, + { + "epoch": 11.358100558659217, + "grad_norm": 0.46319136023521423, + "learning_rate": 0.00043364145658263305, + "loss": 0.3433, + "step": 20331 + }, + { + "epoch": 11.358659217877095, + "grad_norm": 3.5432674884796143, + "learning_rate": 0.0004336134453781513, + "loss": 0.3622, + "step": 20332 + }, + { + "epoch": 11.359217877094972, + "grad_norm": 0.3947017192840576, + "learning_rate": 0.00043358543417366946, + "loss": 0.4026, + "step": 20333 + }, + { + "epoch": 11.359776536312848, + "grad_norm": 0.6255600452423096, + "learning_rate": 0.0004335574229691877, + "loss": 0.5444, + "step": 20334 + }, + { + "epoch": 11.360335195530727, + "grad_norm": 0.32233691215515137, + "learning_rate": 0.00043352941176470587, + "loss": 0.4493, + "step": 20335 + }, + { + "epoch": 11.360893854748603, + "grad_norm": 0.4895597994327545, + "learning_rate": 0.0004335014005602241, + "loss": 0.4253, + "step": 20336 + }, + { + "epoch": 11.36145251396648, + "grad_norm": 0.7000439167022705, + "learning_rate": 0.00043347338935574234, + "loss": 0.5388, + "step": 20337 + }, + { + "epoch": 11.362011173184358, + "grad_norm": 0.5233309864997864, + "learning_rate": 0.0004334453781512605, + "loss": 0.3871, + "step": 20338 + }, + { + "epoch": 11.362569832402235, + "grad_norm": 0.5473671555519104, + "learning_rate": 0.00043341736694677875, + "loss": 0.4168, + "step": 20339 + }, + { + "epoch": 11.363128491620111, + "grad_norm": 0.4244515597820282, + "learning_rate": 0.00043338935574229696, + "loss": 0.4895, + "step": 20340 + }, + { + "epoch": 11.363687150837988, + "grad_norm": 0.461105078458786, + "learning_rate": 0.0004333613445378151, + "loss": 0.4639, + "step": 20341 + }, + { + "epoch": 11.364245810055866, + "grad_norm": 0.9202731251716614, + "learning_rate": 0.00043333333333333337, + "loss": 0.3506, + "step": 20342 + }, + { + "epoch": 11.364804469273743, + "grad_norm": 0.577087938785553, + "learning_rate": 0.0004333053221288515, + "loss": 0.3851, + "step": 20343 + }, + { + "epoch": 11.36536312849162, + "grad_norm": 0.5346361398696899, + "learning_rate": 0.0004332773109243698, + "loss": 0.4443, + "step": 20344 + }, + { + "epoch": 11.365921787709498, + "grad_norm": 0.4487300217151642, + "learning_rate": 0.000433249299719888, + "loss": 0.413, + "step": 20345 + }, + { + "epoch": 11.366480446927374, + "grad_norm": 0.48424363136291504, + "learning_rate": 0.00043322128851540614, + "loss": 0.3914, + "step": 20346 + }, + { + "epoch": 11.367039106145251, + "grad_norm": 0.38616514205932617, + "learning_rate": 0.0004331932773109244, + "loss": 0.3891, + "step": 20347 + }, + { + "epoch": 11.36759776536313, + "grad_norm": 0.7378391623497009, + "learning_rate": 0.0004331652661064426, + "loss": 0.449, + "step": 20348 + }, + { + "epoch": 11.368156424581006, + "grad_norm": 1.0481181144714355, + "learning_rate": 0.0004331372549019608, + "loss": 0.5323, + "step": 20349 + }, + { + "epoch": 11.368715083798882, + "grad_norm": 0.5536805987358093, + "learning_rate": 0.000433109243697479, + "loss": 0.4726, + "step": 20350 + }, + { + "epoch": 11.369273743016759, + "grad_norm": 0.5338822603225708, + "learning_rate": 0.00043308123249299717, + "loss": 0.4927, + "step": 20351 + }, + { + "epoch": 11.369832402234637, + "grad_norm": 0.3848881125450134, + "learning_rate": 0.00043305322128851543, + "loss": 0.4178, + "step": 20352 + }, + { + "epoch": 11.370391061452514, + "grad_norm": 0.5526819825172424, + "learning_rate": 0.00043302521008403363, + "loss": 0.4285, + "step": 20353 + }, + { + "epoch": 11.37094972067039, + "grad_norm": 0.44540566205978394, + "learning_rate": 0.00043299719887955184, + "loss": 0.3048, + "step": 20354 + }, + { + "epoch": 11.371508379888269, + "grad_norm": 0.8788999319076538, + "learning_rate": 0.00043296918767507005, + "loss": 0.3888, + "step": 20355 + }, + { + "epoch": 11.372067039106145, + "grad_norm": 0.5624000430107117, + "learning_rate": 0.00043294117647058825, + "loss": 0.369, + "step": 20356 + }, + { + "epoch": 11.372625698324022, + "grad_norm": 0.9151301383972168, + "learning_rate": 0.00043291316526610646, + "loss": 0.6925, + "step": 20357 + }, + { + "epoch": 11.3731843575419, + "grad_norm": 0.5433090925216675, + "learning_rate": 0.00043288515406162466, + "loss": 0.4438, + "step": 20358 + }, + { + "epoch": 11.373743016759777, + "grad_norm": 0.5443361401557922, + "learning_rate": 0.00043285714285714287, + "loss": 0.2824, + "step": 20359 + }, + { + "epoch": 11.374301675977653, + "grad_norm": 2.2954258918762207, + "learning_rate": 0.0004328291316526611, + "loss": 0.5234, + "step": 20360 + }, + { + "epoch": 11.37486033519553, + "grad_norm": 0.5968257784843445, + "learning_rate": 0.0004328011204481793, + "loss": 0.417, + "step": 20361 + }, + { + "epoch": 11.375418994413408, + "grad_norm": 0.9542478322982788, + "learning_rate": 0.0004327731092436975, + "loss": 0.3149, + "step": 20362 + }, + { + "epoch": 11.375977653631285, + "grad_norm": 0.5495153665542603, + "learning_rate": 0.0004327450980392157, + "loss": 0.3557, + "step": 20363 + }, + { + "epoch": 11.376536312849161, + "grad_norm": 2.898282766342163, + "learning_rate": 0.0004327170868347339, + "loss": 0.6315, + "step": 20364 + }, + { + "epoch": 11.37709497206704, + "grad_norm": 1.5180634260177612, + "learning_rate": 0.0004326890756302521, + "loss": 0.4758, + "step": 20365 + }, + { + "epoch": 11.377653631284916, + "grad_norm": 0.4966793358325958, + "learning_rate": 0.0004326610644257703, + "loss": 0.4433, + "step": 20366 + }, + { + "epoch": 11.378212290502793, + "grad_norm": 0.4382321536540985, + "learning_rate": 0.0004326330532212885, + "loss": 0.4331, + "step": 20367 + }, + { + "epoch": 11.378770949720671, + "grad_norm": 0.4112316966056824, + "learning_rate": 0.0004326050420168067, + "loss": 0.3642, + "step": 20368 + }, + { + "epoch": 11.379329608938548, + "grad_norm": 0.5049471259117126, + "learning_rate": 0.00043257703081232493, + "loss": 0.4079, + "step": 20369 + }, + { + "epoch": 11.379888268156424, + "grad_norm": 0.467467337846756, + "learning_rate": 0.00043254901960784314, + "loss": 0.3643, + "step": 20370 + }, + { + "epoch": 11.380446927374301, + "grad_norm": 0.3777334988117218, + "learning_rate": 0.00043252100840336134, + "loss": 0.3886, + "step": 20371 + }, + { + "epoch": 11.38100558659218, + "grad_norm": 0.4159626364707947, + "learning_rate": 0.0004324929971988796, + "loss": 0.4021, + "step": 20372 + }, + { + "epoch": 11.381564245810056, + "grad_norm": 0.5214095115661621, + "learning_rate": 0.00043246498599439775, + "loss": 0.4936, + "step": 20373 + }, + { + "epoch": 11.382122905027932, + "grad_norm": 0.48910412192344666, + "learning_rate": 0.00043243697478991596, + "loss": 0.3898, + "step": 20374 + }, + { + "epoch": 11.38268156424581, + "grad_norm": 0.30974388122558594, + "learning_rate": 0.00043240896358543417, + "loss": 0.3703, + "step": 20375 + }, + { + "epoch": 11.383240223463687, + "grad_norm": 0.3718155026435852, + "learning_rate": 0.00043238095238095237, + "loss": 0.3675, + "step": 20376 + }, + { + "epoch": 11.383798882681564, + "grad_norm": 1.363289713859558, + "learning_rate": 0.00043235294117647063, + "loss": 0.3934, + "step": 20377 + }, + { + "epoch": 11.38435754189944, + "grad_norm": 1.0593390464782715, + "learning_rate": 0.0004323249299719888, + "loss": 0.3917, + "step": 20378 + }, + { + "epoch": 11.384916201117319, + "grad_norm": 1.3262704610824585, + "learning_rate": 0.000432296918767507, + "loss": 0.2937, + "step": 20379 + }, + { + "epoch": 11.385474860335195, + "grad_norm": 0.8163474202156067, + "learning_rate": 0.00043226890756302525, + "loss": 0.515, + "step": 20380 + }, + { + "epoch": 11.386033519553072, + "grad_norm": 0.37344738841056824, + "learning_rate": 0.0004322408963585434, + "loss": 0.3521, + "step": 20381 + }, + { + "epoch": 11.38659217877095, + "grad_norm": 0.6237058639526367, + "learning_rate": 0.00043221288515406166, + "loss": 0.4177, + "step": 20382 + }, + { + "epoch": 11.387150837988827, + "grad_norm": 0.7022706270217896, + "learning_rate": 0.0004321848739495798, + "loss": 0.4078, + "step": 20383 + }, + { + "epoch": 11.387709497206703, + "grad_norm": 0.7716498374938965, + "learning_rate": 0.000432156862745098, + "loss": 0.4292, + "step": 20384 + }, + { + "epoch": 11.388268156424582, + "grad_norm": 0.46022769808769226, + "learning_rate": 0.0004321288515406163, + "loss": 0.3802, + "step": 20385 + }, + { + "epoch": 11.388826815642458, + "grad_norm": 0.5086798667907715, + "learning_rate": 0.00043210084033613443, + "loss": 0.3647, + "step": 20386 + }, + { + "epoch": 11.389385474860335, + "grad_norm": 0.40174588561058044, + "learning_rate": 0.0004320728291316527, + "loss": 0.4151, + "step": 20387 + }, + { + "epoch": 11.389944134078211, + "grad_norm": 0.5052075982093811, + "learning_rate": 0.0004320448179271709, + "loss": 0.4673, + "step": 20388 + }, + { + "epoch": 11.39050279329609, + "grad_norm": 0.4945640563964844, + "learning_rate": 0.00043201680672268905, + "loss": 0.4161, + "step": 20389 + }, + { + "epoch": 11.391061452513966, + "grad_norm": 1.064143419265747, + "learning_rate": 0.0004319887955182073, + "loss": 0.4443, + "step": 20390 + }, + { + "epoch": 11.391620111731843, + "grad_norm": 1.2059634923934937, + "learning_rate": 0.00043196078431372546, + "loss": 0.448, + "step": 20391 + }, + { + "epoch": 11.392178770949721, + "grad_norm": 0.5958070755004883, + "learning_rate": 0.0004319327731092437, + "loss": 0.4951, + "step": 20392 + }, + { + "epoch": 11.392737430167598, + "grad_norm": 0.45267143845558167, + "learning_rate": 0.00043190476190476193, + "loss": 0.3704, + "step": 20393 + }, + { + "epoch": 11.393296089385474, + "grad_norm": 0.41555067896842957, + "learning_rate": 0.0004318767507002801, + "loss": 0.3979, + "step": 20394 + }, + { + "epoch": 11.393854748603353, + "grad_norm": 0.44813475012779236, + "learning_rate": 0.00043184873949579834, + "loss": 0.4527, + "step": 20395 + }, + { + "epoch": 11.39441340782123, + "grad_norm": 0.5134300589561462, + "learning_rate": 0.00043182072829131655, + "loss": 0.5213, + "step": 20396 + }, + { + "epoch": 11.394972067039106, + "grad_norm": 0.4372962713241577, + "learning_rate": 0.00043179271708683475, + "loss": 0.3296, + "step": 20397 + }, + { + "epoch": 11.395530726256982, + "grad_norm": 5.657855033874512, + "learning_rate": 0.00043176470588235296, + "loss": 0.3748, + "step": 20398 + }, + { + "epoch": 11.39608938547486, + "grad_norm": 0.7380702495574951, + "learning_rate": 0.0004317366946778711, + "loss": 0.4264, + "step": 20399 + }, + { + "epoch": 11.396648044692737, + "grad_norm": 0.5794607400894165, + "learning_rate": 0.00043170868347338937, + "loss": 0.4823, + "step": 20400 + }, + { + "epoch": 11.397206703910614, + "grad_norm": 0.602072536945343, + "learning_rate": 0.0004316806722689076, + "loss": 0.3035, + "step": 20401 + }, + { + "epoch": 11.397765363128492, + "grad_norm": 0.5261669158935547, + "learning_rate": 0.0004316526610644258, + "loss": 0.4707, + "step": 20402 + }, + { + "epoch": 11.398324022346369, + "grad_norm": 0.4762754440307617, + "learning_rate": 0.000431624649859944, + "loss": 0.3798, + "step": 20403 + }, + { + "epoch": 11.398882681564245, + "grad_norm": 0.6748090982437134, + "learning_rate": 0.0004315966386554622, + "loss": 0.5468, + "step": 20404 + }, + { + "epoch": 11.399441340782122, + "grad_norm": 0.5334838032722473, + "learning_rate": 0.0004315686274509804, + "loss": 0.4193, + "step": 20405 + }, + { + "epoch": 11.4, + "grad_norm": 0.68052077293396, + "learning_rate": 0.0004315406162464986, + "loss": 0.502, + "step": 20406 + }, + { + "epoch": 11.400558659217877, + "grad_norm": 0.6950442790985107, + "learning_rate": 0.0004315126050420168, + "loss": 0.3883, + "step": 20407 + }, + { + "epoch": 11.401117318435753, + "grad_norm": 0.6587467193603516, + "learning_rate": 0.000431484593837535, + "loss": 0.4802, + "step": 20408 + }, + { + "epoch": 11.401675977653632, + "grad_norm": 0.5734756588935852, + "learning_rate": 0.0004314565826330532, + "loss": 0.4865, + "step": 20409 + }, + { + "epoch": 11.402234636871508, + "grad_norm": 0.7394313812255859, + "learning_rate": 0.00043142857142857143, + "loss": 0.5259, + "step": 20410 + }, + { + "epoch": 11.402793296089385, + "grad_norm": 0.46636223793029785, + "learning_rate": 0.00043140056022408964, + "loss": 0.519, + "step": 20411 + }, + { + "epoch": 11.403351955307263, + "grad_norm": 0.3804307281970978, + "learning_rate": 0.0004313725490196079, + "loss": 0.3641, + "step": 20412 + }, + { + "epoch": 11.40391061452514, + "grad_norm": 0.5207472443580627, + "learning_rate": 0.00043134453781512605, + "loss": 0.4119, + "step": 20413 + }, + { + "epoch": 11.404469273743016, + "grad_norm": 0.687709629535675, + "learning_rate": 0.00043131652661064425, + "loss": 0.4611, + "step": 20414 + }, + { + "epoch": 11.405027932960893, + "grad_norm": 0.7759419679641724, + "learning_rate": 0.00043128851540616246, + "loss": 0.3967, + "step": 20415 + }, + { + "epoch": 11.405586592178771, + "grad_norm": 0.8317773342132568, + "learning_rate": 0.00043126050420168067, + "loss": 0.3992, + "step": 20416 + }, + { + "epoch": 11.406145251396648, + "grad_norm": 0.47342318296432495, + "learning_rate": 0.0004312324929971989, + "loss": 0.4551, + "step": 20417 + }, + { + "epoch": 11.406703910614524, + "grad_norm": 0.519398033618927, + "learning_rate": 0.0004312044817927171, + "loss": 0.4847, + "step": 20418 + }, + { + "epoch": 11.407262569832403, + "grad_norm": 0.641997218132019, + "learning_rate": 0.0004311764705882353, + "loss": 0.3771, + "step": 20419 + }, + { + "epoch": 11.40782122905028, + "grad_norm": 0.7098687291145325, + "learning_rate": 0.00043114845938375354, + "loss": 0.48, + "step": 20420 + }, + { + "epoch": 11.408379888268156, + "grad_norm": 0.6648222208023071, + "learning_rate": 0.0004311204481792717, + "loss": 0.3898, + "step": 20421 + }, + { + "epoch": 11.408938547486034, + "grad_norm": 0.734363853931427, + "learning_rate": 0.00043109243697478996, + "loss": 0.5481, + "step": 20422 + }, + { + "epoch": 11.40949720670391, + "grad_norm": 1.2085016965866089, + "learning_rate": 0.0004310644257703081, + "loss": 0.3938, + "step": 20423 + }, + { + "epoch": 11.410055865921787, + "grad_norm": 0.6026791930198669, + "learning_rate": 0.0004310364145658263, + "loss": 0.668, + "step": 20424 + }, + { + "epoch": 11.410614525139664, + "grad_norm": 0.46169158816337585, + "learning_rate": 0.0004310084033613446, + "loss": 0.4686, + "step": 20425 + }, + { + "epoch": 11.411173184357542, + "grad_norm": 0.519842803478241, + "learning_rate": 0.0004309803921568627, + "loss": 0.3675, + "step": 20426 + }, + { + "epoch": 11.411731843575419, + "grad_norm": 0.5539289116859436, + "learning_rate": 0.000430952380952381, + "loss": 0.3825, + "step": 20427 + }, + { + "epoch": 11.412290502793295, + "grad_norm": 0.5642621517181396, + "learning_rate": 0.0004309243697478992, + "loss": 0.4798, + "step": 20428 + }, + { + "epoch": 11.412849162011174, + "grad_norm": 0.4757636487483978, + "learning_rate": 0.00043089635854341734, + "loss": 0.3381, + "step": 20429 + }, + { + "epoch": 11.41340782122905, + "grad_norm": 0.7182319760322571, + "learning_rate": 0.0004308683473389356, + "loss": 0.517, + "step": 20430 + }, + { + "epoch": 11.413966480446927, + "grad_norm": 1.3319298028945923, + "learning_rate": 0.00043084033613445376, + "loss": 0.3738, + "step": 20431 + }, + { + "epoch": 11.414525139664805, + "grad_norm": 0.9071935415267944, + "learning_rate": 0.000430812324929972, + "loss": 0.4779, + "step": 20432 + }, + { + "epoch": 11.415083798882682, + "grad_norm": 0.42323726415634155, + "learning_rate": 0.0004307843137254902, + "loss": 0.4384, + "step": 20433 + }, + { + "epoch": 11.415642458100558, + "grad_norm": 0.5242413282394409, + "learning_rate": 0.0004307563025210084, + "loss": 0.4699, + "step": 20434 + }, + { + "epoch": 11.416201117318435, + "grad_norm": 0.4536595046520233, + "learning_rate": 0.00043072829131652663, + "loss": 0.4191, + "step": 20435 + }, + { + "epoch": 11.416759776536313, + "grad_norm": 0.5623801946640015, + "learning_rate": 0.00043070028011204484, + "loss": 0.4698, + "step": 20436 + }, + { + "epoch": 11.41731843575419, + "grad_norm": 0.37475845217704773, + "learning_rate": 0.00043067226890756305, + "loss": 0.3837, + "step": 20437 + }, + { + "epoch": 11.417877094972066, + "grad_norm": 0.6459819674491882, + "learning_rate": 0.00043064425770308125, + "loss": 0.4446, + "step": 20438 + }, + { + "epoch": 11.418435754189945, + "grad_norm": 0.4501931965351105, + "learning_rate": 0.0004306162464985994, + "loss": 0.4452, + "step": 20439 + }, + { + "epoch": 11.418994413407821, + "grad_norm": 1.2586090564727783, + "learning_rate": 0.00043058823529411766, + "loss": 0.4561, + "step": 20440 + }, + { + "epoch": 11.419553072625698, + "grad_norm": 0.4796806275844574, + "learning_rate": 0.00043056022408963587, + "loss": 0.5221, + "step": 20441 + }, + { + "epoch": 11.420111731843576, + "grad_norm": 0.485820472240448, + "learning_rate": 0.0004305322128851541, + "loss": 0.4164, + "step": 20442 + }, + { + "epoch": 11.420670391061453, + "grad_norm": 0.6834775805473328, + "learning_rate": 0.0004305042016806723, + "loss": 0.448, + "step": 20443 + }, + { + "epoch": 11.42122905027933, + "grad_norm": 0.45969659090042114, + "learning_rate": 0.0004304761904761905, + "loss": 0.4925, + "step": 20444 + }, + { + "epoch": 11.421787709497206, + "grad_norm": 0.3783314526081085, + "learning_rate": 0.0004304481792717087, + "loss": 0.3769, + "step": 20445 + }, + { + "epoch": 11.422346368715084, + "grad_norm": 0.3382120430469513, + "learning_rate": 0.0004304201680672269, + "loss": 0.3181, + "step": 20446 + }, + { + "epoch": 11.422905027932961, + "grad_norm": 1.0975066423416138, + "learning_rate": 0.0004303921568627451, + "loss": 0.3762, + "step": 20447 + }, + { + "epoch": 11.423463687150837, + "grad_norm": 1.7357056140899658, + "learning_rate": 0.0004303641456582633, + "loss": 0.3632, + "step": 20448 + }, + { + "epoch": 11.424022346368716, + "grad_norm": 0.511623203754425, + "learning_rate": 0.0004303361344537815, + "loss": 0.499, + "step": 20449 + }, + { + "epoch": 11.424581005586592, + "grad_norm": 1.1244245767593384, + "learning_rate": 0.0004303081232492997, + "loss": 0.3796, + "step": 20450 + }, + { + "epoch": 11.425139664804469, + "grad_norm": 0.4435977339744568, + "learning_rate": 0.00043028011204481793, + "loss": 0.4031, + "step": 20451 + }, + { + "epoch": 11.425698324022346, + "grad_norm": 0.35999220609664917, + "learning_rate": 0.0004302521008403362, + "loss": 0.3835, + "step": 20452 + }, + { + "epoch": 11.426256983240224, + "grad_norm": 0.4780190885066986, + "learning_rate": 0.00043022408963585434, + "loss": 0.3443, + "step": 20453 + }, + { + "epoch": 11.4268156424581, + "grad_norm": 0.4865695536136627, + "learning_rate": 0.00043019607843137255, + "loss": 0.3709, + "step": 20454 + }, + { + "epoch": 11.427374301675977, + "grad_norm": 0.44215822219848633, + "learning_rate": 0.00043016806722689075, + "loss": 0.3697, + "step": 20455 + }, + { + "epoch": 11.427932960893855, + "grad_norm": 0.6452834010124207, + "learning_rate": 0.00043014005602240896, + "loss": 0.5452, + "step": 20456 + }, + { + "epoch": 11.428491620111732, + "grad_norm": 2.594749927520752, + "learning_rate": 0.0004301120448179272, + "loss": 0.3503, + "step": 20457 + }, + { + "epoch": 11.429050279329608, + "grad_norm": 1.3245359659194946, + "learning_rate": 0.00043008403361344537, + "loss": 0.5101, + "step": 20458 + }, + { + "epoch": 11.429608938547487, + "grad_norm": 0.6216909289360046, + "learning_rate": 0.0004300560224089636, + "loss": 0.5023, + "step": 20459 + }, + { + "epoch": 11.430167597765363, + "grad_norm": 0.3987646698951721, + "learning_rate": 0.00043002801120448184, + "loss": 0.5, + "step": 20460 + }, + { + "epoch": 11.43072625698324, + "grad_norm": 0.8627544045448303, + "learning_rate": 0.00043, + "loss": 0.4369, + "step": 20461 + }, + { + "epoch": 11.431284916201117, + "grad_norm": 1.1489179134368896, + "learning_rate": 0.00042997198879551825, + "loss": 0.3656, + "step": 20462 + }, + { + "epoch": 11.431843575418995, + "grad_norm": 0.4449630379676819, + "learning_rate": 0.0004299439775910364, + "loss": 0.4089, + "step": 20463 + }, + { + "epoch": 11.432402234636871, + "grad_norm": 2.871068239212036, + "learning_rate": 0.0004299159663865546, + "loss": 0.3754, + "step": 20464 + }, + { + "epoch": 11.432960893854748, + "grad_norm": 0.8406011462211609, + "learning_rate": 0.00042988795518207287, + "loss": 0.4799, + "step": 20465 + }, + { + "epoch": 11.433519553072626, + "grad_norm": 0.5874283909797668, + "learning_rate": 0.000429859943977591, + "loss": 0.4136, + "step": 20466 + }, + { + "epoch": 11.434078212290503, + "grad_norm": 0.6754739284515381, + "learning_rate": 0.0004298319327731093, + "loss": 0.6977, + "step": 20467 + }, + { + "epoch": 11.43463687150838, + "grad_norm": 0.4682725667953491, + "learning_rate": 0.0004298039215686275, + "loss": 0.4273, + "step": 20468 + }, + { + "epoch": 11.435195530726258, + "grad_norm": 2.1621012687683105, + "learning_rate": 0.00042977591036414564, + "loss": 0.3661, + "step": 20469 + }, + { + "epoch": 11.435754189944134, + "grad_norm": 0.34660494327545166, + "learning_rate": 0.0004297478991596639, + "loss": 0.4264, + "step": 20470 + }, + { + "epoch": 11.436312849162011, + "grad_norm": 0.622973620891571, + "learning_rate": 0.00042971988795518205, + "loss": 0.3835, + "step": 20471 + }, + { + "epoch": 11.436871508379888, + "grad_norm": 0.5168856978416443, + "learning_rate": 0.0004296918767507003, + "loss": 0.4199, + "step": 20472 + }, + { + "epoch": 11.437430167597766, + "grad_norm": 5.298727035522461, + "learning_rate": 0.0004296638655462185, + "loss": 0.5567, + "step": 20473 + }, + { + "epoch": 11.437988826815642, + "grad_norm": 0.8622888326644897, + "learning_rate": 0.00042963585434173667, + "loss": 0.4301, + "step": 20474 + }, + { + "epoch": 11.438547486033519, + "grad_norm": 0.3953629732131958, + "learning_rate": 0.00042960784313725493, + "loss": 0.3623, + "step": 20475 + }, + { + "epoch": 11.439106145251397, + "grad_norm": 0.4186992943286896, + "learning_rate": 0.00042957983193277313, + "loss": 0.345, + "step": 20476 + }, + { + "epoch": 11.439664804469274, + "grad_norm": 0.3621259033679962, + "learning_rate": 0.0004295518207282913, + "loss": 0.3733, + "step": 20477 + }, + { + "epoch": 11.44022346368715, + "grad_norm": 0.4001814126968384, + "learning_rate": 0.00042952380952380955, + "loss": 0.4272, + "step": 20478 + }, + { + "epoch": 11.440782122905027, + "grad_norm": 0.45869895815849304, + "learning_rate": 0.0004294957983193277, + "loss": 0.498, + "step": 20479 + }, + { + "epoch": 11.441340782122905, + "grad_norm": 1.2094117403030396, + "learning_rate": 0.00042946778711484596, + "loss": 0.4492, + "step": 20480 + }, + { + "epoch": 11.441899441340782, + "grad_norm": 0.46167707443237305, + "learning_rate": 0.00042943977591036416, + "loss": 0.3967, + "step": 20481 + }, + { + "epoch": 11.442458100558659, + "grad_norm": 0.35813504457473755, + "learning_rate": 0.0004294117647058823, + "loss": 0.4525, + "step": 20482 + }, + { + "epoch": 11.443016759776537, + "grad_norm": 0.4364907741546631, + "learning_rate": 0.0004293837535014006, + "loss": 0.3388, + "step": 20483 + }, + { + "epoch": 11.443575418994413, + "grad_norm": 1.050809383392334, + "learning_rate": 0.0004293557422969188, + "loss": 0.46, + "step": 20484 + }, + { + "epoch": 11.44413407821229, + "grad_norm": 0.44603869318962097, + "learning_rate": 0.000429327731092437, + "loss": 0.4162, + "step": 20485 + }, + { + "epoch": 11.444692737430168, + "grad_norm": 0.40829548239707947, + "learning_rate": 0.0004292997198879552, + "loss": 0.3276, + "step": 20486 + }, + { + "epoch": 11.445251396648045, + "grad_norm": 0.3749415874481201, + "learning_rate": 0.00042927170868347335, + "loss": 0.3637, + "step": 20487 + }, + { + "epoch": 11.445810055865921, + "grad_norm": 0.4271961748600006, + "learning_rate": 0.0004292436974789916, + "loss": 0.3762, + "step": 20488 + }, + { + "epoch": 11.446368715083798, + "grad_norm": 0.44431793689727783, + "learning_rate": 0.0004292156862745098, + "loss": 0.3894, + "step": 20489 + }, + { + "epoch": 11.446927374301676, + "grad_norm": 2.429311513900757, + "learning_rate": 0.000429187675070028, + "loss": 0.5131, + "step": 20490 + }, + { + "epoch": 11.447486033519553, + "grad_norm": 0.6587164998054504, + "learning_rate": 0.0004291596638655462, + "loss": 0.4741, + "step": 20491 + }, + { + "epoch": 11.44804469273743, + "grad_norm": 4.382190227508545, + "learning_rate": 0.00042913165266106443, + "loss": 0.3677, + "step": 20492 + }, + { + "epoch": 11.448603351955308, + "grad_norm": 0.566003680229187, + "learning_rate": 0.00042910364145658264, + "loss": 0.4863, + "step": 20493 + }, + { + "epoch": 11.449162011173184, + "grad_norm": 0.5580727458000183, + "learning_rate": 0.00042907563025210084, + "loss": 0.3704, + "step": 20494 + }, + { + "epoch": 11.449720670391061, + "grad_norm": 0.5175549387931824, + "learning_rate": 0.0004290476190476191, + "loss": 0.3775, + "step": 20495 + }, + { + "epoch": 11.45027932960894, + "grad_norm": 0.674576997756958, + "learning_rate": 0.00042901960784313725, + "loss": 0.4498, + "step": 20496 + }, + { + "epoch": 11.450837988826816, + "grad_norm": 0.5521635413169861, + "learning_rate": 0.00042899159663865546, + "loss": 0.444, + "step": 20497 + }, + { + "epoch": 11.451396648044692, + "grad_norm": 0.7391864657402039, + "learning_rate": 0.00042896358543417367, + "loss": 0.3582, + "step": 20498 + }, + { + "epoch": 11.451955307262569, + "grad_norm": 0.4824247360229492, + "learning_rate": 0.00042893557422969187, + "loss": 0.2599, + "step": 20499 + }, + { + "epoch": 11.452513966480447, + "grad_norm": 0.3248860239982605, + "learning_rate": 0.00042890756302521013, + "loss": 0.3387, + "step": 20500 + }, + { + "epoch": 11.452513966480447, + "eval_cer": 0.08737302652395448, + "eval_loss": 0.33205243945121765, + "eval_runtime": 55.7602, + "eval_samples_per_second": 81.384, + "eval_steps_per_second": 5.093, + "eval_wer": 0.345437278288316, + "step": 20500 + }, + { + "epoch": 11.453072625698324, + "grad_norm": 0.7746720314025879, + "learning_rate": 0.0004288795518207283, + "loss": 0.3719, + "step": 20501 + }, + { + "epoch": 11.4536312849162, + "grad_norm": 0.37366852164268494, + "learning_rate": 0.0004288515406162465, + "loss": 0.4067, + "step": 20502 + }, + { + "epoch": 11.454189944134079, + "grad_norm": 0.5273412466049194, + "learning_rate": 0.00042882352941176475, + "loss": 0.3561, + "step": 20503 + }, + { + "epoch": 11.454748603351955, + "grad_norm": 8.741072654724121, + "learning_rate": 0.0004287955182072829, + "loss": 0.4188, + "step": 20504 + }, + { + "epoch": 11.455307262569832, + "grad_norm": 0.7163769602775574, + "learning_rate": 0.00042876750700280116, + "loss": 0.38, + "step": 20505 + }, + { + "epoch": 11.45586592178771, + "grad_norm": 0.36931779980659485, + "learning_rate": 0.0004287394957983193, + "loss": 0.3797, + "step": 20506 + }, + { + "epoch": 11.456424581005587, + "grad_norm": 0.5512630343437195, + "learning_rate": 0.0004287114845938375, + "loss": 0.4726, + "step": 20507 + }, + { + "epoch": 11.456983240223463, + "grad_norm": 0.4833839237689972, + "learning_rate": 0.0004286834733893558, + "loss": 0.45, + "step": 20508 + }, + { + "epoch": 11.45754189944134, + "grad_norm": 0.6387859582901001, + "learning_rate": 0.00042865546218487393, + "loss": 0.4567, + "step": 20509 + }, + { + "epoch": 11.458100558659218, + "grad_norm": 0.34747910499572754, + "learning_rate": 0.0004286274509803922, + "loss": 0.2904, + "step": 20510 + }, + { + "epoch": 11.458659217877095, + "grad_norm": 0.4075853228569031, + "learning_rate": 0.0004285994397759104, + "loss": 0.4634, + "step": 20511 + }, + { + "epoch": 11.459217877094972, + "grad_norm": 0.5226553082466125, + "learning_rate": 0.00042857142857142855, + "loss": 0.3698, + "step": 20512 + }, + { + "epoch": 11.45977653631285, + "grad_norm": 0.7752663493156433, + "learning_rate": 0.0004285434173669468, + "loss": 0.5457, + "step": 20513 + }, + { + "epoch": 11.460335195530726, + "grad_norm": 0.7579894065856934, + "learning_rate": 0.00042851540616246496, + "loss": 0.3865, + "step": 20514 + }, + { + "epoch": 11.460893854748603, + "grad_norm": 0.5567745566368103, + "learning_rate": 0.0004284873949579832, + "loss": 0.3046, + "step": 20515 + }, + { + "epoch": 11.461452513966481, + "grad_norm": 0.6356061697006226, + "learning_rate": 0.00042845938375350143, + "loss": 0.4375, + "step": 20516 + }, + { + "epoch": 11.462011173184358, + "grad_norm": 0.5166113972663879, + "learning_rate": 0.0004284313725490196, + "loss": 0.4939, + "step": 20517 + }, + { + "epoch": 11.462569832402234, + "grad_norm": 0.3722655177116394, + "learning_rate": 0.00042840336134453784, + "loss": 0.3356, + "step": 20518 + }, + { + "epoch": 11.463128491620111, + "grad_norm": 0.5552321076393127, + "learning_rate": 0.00042837535014005605, + "loss": 0.417, + "step": 20519 + }, + { + "epoch": 11.46368715083799, + "grad_norm": 0.8605543971061707, + "learning_rate": 0.00042834733893557425, + "loss": 0.4364, + "step": 20520 + }, + { + "epoch": 11.464245810055866, + "grad_norm": 0.9286550283432007, + "learning_rate": 0.00042831932773109246, + "loss": 0.3419, + "step": 20521 + }, + { + "epoch": 11.464804469273743, + "grad_norm": 2.1893012523651123, + "learning_rate": 0.0004282913165266106, + "loss": 0.4016, + "step": 20522 + }, + { + "epoch": 11.46536312849162, + "grad_norm": 0.5527318120002747, + "learning_rate": 0.00042826330532212887, + "loss": 0.4268, + "step": 20523 + }, + { + "epoch": 11.465921787709497, + "grad_norm": 1.7358183860778809, + "learning_rate": 0.0004282352941176471, + "loss": 0.4319, + "step": 20524 + }, + { + "epoch": 11.466480446927374, + "grad_norm": 0.8993737101554871, + "learning_rate": 0.0004282072829131653, + "loss": 0.4777, + "step": 20525 + }, + { + "epoch": 11.46703910614525, + "grad_norm": 0.5329033136367798, + "learning_rate": 0.0004281792717086835, + "loss": 0.4646, + "step": 20526 + }, + { + "epoch": 11.467597765363129, + "grad_norm": 0.5158475041389465, + "learning_rate": 0.0004281512605042017, + "loss": 0.4838, + "step": 20527 + }, + { + "epoch": 11.468156424581005, + "grad_norm": 2.5568575859069824, + "learning_rate": 0.0004281232492997199, + "loss": 0.5044, + "step": 20528 + }, + { + "epoch": 11.468715083798882, + "grad_norm": 0.389017254114151, + "learning_rate": 0.0004280952380952381, + "loss": 0.3942, + "step": 20529 + }, + { + "epoch": 11.46927374301676, + "grad_norm": 0.5271598100662231, + "learning_rate": 0.0004280672268907563, + "loss": 0.3744, + "step": 20530 + }, + { + "epoch": 11.469832402234637, + "grad_norm": 0.8620637059211731, + "learning_rate": 0.0004280392156862745, + "loss": 0.4213, + "step": 20531 + }, + { + "epoch": 11.470391061452514, + "grad_norm": 0.5783917903900146, + "learning_rate": 0.0004280112044817927, + "loss": 0.4436, + "step": 20532 + }, + { + "epoch": 11.470949720670392, + "grad_norm": 0.5613778233528137, + "learning_rate": 0.00042798319327731093, + "loss": 0.4387, + "step": 20533 + }, + { + "epoch": 11.471508379888268, + "grad_norm": 0.6934236288070679, + "learning_rate": 0.00042795518207282914, + "loss": 0.5286, + "step": 20534 + }, + { + "epoch": 11.472067039106145, + "grad_norm": 0.502943217754364, + "learning_rate": 0.0004279271708683474, + "loss": 0.4772, + "step": 20535 + }, + { + "epoch": 11.472625698324022, + "grad_norm": 0.4045088589191437, + "learning_rate": 0.00042789915966386555, + "loss": 0.41, + "step": 20536 + }, + { + "epoch": 11.4731843575419, + "grad_norm": 0.675239622592926, + "learning_rate": 0.00042787114845938375, + "loss": 0.4631, + "step": 20537 + }, + { + "epoch": 11.473743016759776, + "grad_norm": 0.9886194467544556, + "learning_rate": 0.00042784313725490196, + "loss": 0.3601, + "step": 20538 + }, + { + "epoch": 11.474301675977653, + "grad_norm": 1.2970094680786133, + "learning_rate": 0.00042781512605042017, + "loss": 0.4427, + "step": 20539 + }, + { + "epoch": 11.474860335195531, + "grad_norm": 0.7917665243148804, + "learning_rate": 0.0004277871148459384, + "loss": 0.5053, + "step": 20540 + }, + { + "epoch": 11.475418994413408, + "grad_norm": 0.47673848271369934, + "learning_rate": 0.0004277591036414566, + "loss": 0.4823, + "step": 20541 + }, + { + "epoch": 11.475977653631285, + "grad_norm": 0.9772157669067383, + "learning_rate": 0.0004277310924369748, + "loss": 0.4771, + "step": 20542 + }, + { + "epoch": 11.476536312849163, + "grad_norm": 0.4587653577327728, + "learning_rate": 0.00042770308123249304, + "loss": 0.3909, + "step": 20543 + }, + { + "epoch": 11.47709497206704, + "grad_norm": 1.3717995882034302, + "learning_rate": 0.0004276750700280112, + "loss": 0.4589, + "step": 20544 + }, + { + "epoch": 11.477653631284916, + "grad_norm": 0.5064394474029541, + "learning_rate": 0.00042764705882352946, + "loss": 0.4088, + "step": 20545 + }, + { + "epoch": 11.478212290502793, + "grad_norm": 0.451831191778183, + "learning_rate": 0.0004276190476190476, + "loss": 0.4928, + "step": 20546 + }, + { + "epoch": 11.478770949720671, + "grad_norm": 0.49596506357192993, + "learning_rate": 0.0004275910364145658, + "loss": 0.4663, + "step": 20547 + }, + { + "epoch": 11.479329608938547, + "grad_norm": 0.5681385397911072, + "learning_rate": 0.0004275630252100841, + "loss": 0.509, + "step": 20548 + }, + { + "epoch": 11.479888268156424, + "grad_norm": 0.4054035544395447, + "learning_rate": 0.0004275350140056022, + "loss": 0.3317, + "step": 20549 + }, + { + "epoch": 11.480446927374302, + "grad_norm": 0.9036941528320312, + "learning_rate": 0.0004275070028011205, + "loss": 0.4982, + "step": 20550 + }, + { + "epoch": 11.481005586592179, + "grad_norm": 0.4798491895198822, + "learning_rate": 0.0004274789915966387, + "loss": 0.411, + "step": 20551 + }, + { + "epoch": 11.481564245810056, + "grad_norm": 0.6063157916069031, + "learning_rate": 0.00042745098039215684, + "loss": 0.5997, + "step": 20552 + }, + { + "epoch": 11.482122905027932, + "grad_norm": 0.5110703706741333, + "learning_rate": 0.0004274229691876751, + "loss": 0.4079, + "step": 20553 + }, + { + "epoch": 11.48268156424581, + "grad_norm": 0.5609031319618225, + "learning_rate": 0.00042739495798319326, + "loss": 0.5443, + "step": 20554 + }, + { + "epoch": 11.483240223463687, + "grad_norm": 0.5750454068183899, + "learning_rate": 0.0004273669467787115, + "loss": 0.4339, + "step": 20555 + }, + { + "epoch": 11.483798882681564, + "grad_norm": 0.4941383898258209, + "learning_rate": 0.0004273389355742297, + "loss": 0.523, + "step": 20556 + }, + { + "epoch": 11.484357541899442, + "grad_norm": 3.4129364490509033, + "learning_rate": 0.0004273109243697479, + "loss": 0.3486, + "step": 20557 + }, + { + "epoch": 11.484916201117318, + "grad_norm": 0.5427618622779846, + "learning_rate": 0.00042728291316526613, + "loss": 0.4667, + "step": 20558 + }, + { + "epoch": 11.485474860335195, + "grad_norm": 0.7671644687652588, + "learning_rate": 0.00042725490196078434, + "loss": 0.3694, + "step": 20559 + }, + { + "epoch": 11.486033519553073, + "grad_norm": 0.6954537034034729, + "learning_rate": 0.00042722689075630255, + "loss": 0.4228, + "step": 20560 + }, + { + "epoch": 11.48659217877095, + "grad_norm": 0.810337245464325, + "learning_rate": 0.00042719887955182075, + "loss": 0.3424, + "step": 20561 + }, + { + "epoch": 11.487150837988827, + "grad_norm": 0.5496257543563843, + "learning_rate": 0.0004271708683473389, + "loss": 0.5107, + "step": 20562 + }, + { + "epoch": 11.487709497206703, + "grad_norm": 2.4249541759490967, + "learning_rate": 0.00042714285714285716, + "loss": 0.3634, + "step": 20563 + }, + { + "epoch": 11.488268156424581, + "grad_norm": 0.6596747040748596, + "learning_rate": 0.00042711484593837537, + "loss": 0.5431, + "step": 20564 + }, + { + "epoch": 11.488826815642458, + "grad_norm": 6.44435977935791, + "learning_rate": 0.0004270868347338936, + "loss": 0.4203, + "step": 20565 + }, + { + "epoch": 11.489385474860335, + "grad_norm": 0.5959506630897522, + "learning_rate": 0.0004270588235294118, + "loss": 0.493, + "step": 20566 + }, + { + "epoch": 11.489944134078213, + "grad_norm": 0.5207118988037109, + "learning_rate": 0.00042703081232493, + "loss": 0.4061, + "step": 20567 + }, + { + "epoch": 11.49050279329609, + "grad_norm": 1.3582121133804321, + "learning_rate": 0.0004270028011204482, + "loss": 0.4175, + "step": 20568 + }, + { + "epoch": 11.491061452513966, + "grad_norm": 0.551295816898346, + "learning_rate": 0.0004269747899159664, + "loss": 0.4301, + "step": 20569 + }, + { + "epoch": 11.491620111731844, + "grad_norm": 0.3976898193359375, + "learning_rate": 0.0004269467787114846, + "loss": 0.4097, + "step": 20570 + }, + { + "epoch": 11.492178770949721, + "grad_norm": 0.6724882125854492, + "learning_rate": 0.0004269187675070028, + "loss": 0.4524, + "step": 20571 + }, + { + "epoch": 11.492737430167598, + "grad_norm": 0.4446307122707367, + "learning_rate": 0.000426890756302521, + "loss": 0.3682, + "step": 20572 + }, + { + "epoch": 11.493296089385474, + "grad_norm": 0.6518819332122803, + "learning_rate": 0.0004268627450980392, + "loss": 0.5342, + "step": 20573 + }, + { + "epoch": 11.493854748603352, + "grad_norm": 0.38295605778694153, + "learning_rate": 0.00042683473389355743, + "loss": 0.4583, + "step": 20574 + }, + { + "epoch": 11.494413407821229, + "grad_norm": 2.055450677871704, + "learning_rate": 0.0004268067226890757, + "loss": 0.412, + "step": 20575 + }, + { + "epoch": 11.494972067039106, + "grad_norm": 0.41317152976989746, + "learning_rate": 0.00042677871148459384, + "loss": 0.4511, + "step": 20576 + }, + { + "epoch": 11.495530726256984, + "grad_norm": 0.42795634269714355, + "learning_rate": 0.00042675070028011205, + "loss": 0.3204, + "step": 20577 + }, + { + "epoch": 11.49608938547486, + "grad_norm": 4.33543586730957, + "learning_rate": 0.00042672268907563025, + "loss": 0.336, + "step": 20578 + }, + { + "epoch": 11.496648044692737, + "grad_norm": 0.3578462600708008, + "learning_rate": 0.00042669467787114846, + "loss": 0.3054, + "step": 20579 + }, + { + "epoch": 11.497206703910614, + "grad_norm": 2.628692150115967, + "learning_rate": 0.0004266666666666667, + "loss": 0.4077, + "step": 20580 + }, + { + "epoch": 11.497765363128492, + "grad_norm": 0.5811374187469482, + "learning_rate": 0.00042663865546218487, + "loss": 0.3342, + "step": 20581 + }, + { + "epoch": 11.498324022346369, + "grad_norm": 0.4065400958061218, + "learning_rate": 0.0004266106442577031, + "loss": 0.389, + "step": 20582 + }, + { + "epoch": 11.498882681564245, + "grad_norm": 0.543093740940094, + "learning_rate": 0.00042658263305322134, + "loss": 0.4708, + "step": 20583 + }, + { + "epoch": 11.499441340782123, + "grad_norm": 8.488954544067383, + "learning_rate": 0.0004265546218487395, + "loss": 0.368, + "step": 20584 + }, + { + "epoch": 11.5, + "grad_norm": 0.43113794922828674, + "learning_rate": 0.0004265266106442577, + "loss": 0.4222, + "step": 20585 + }, + { + "epoch": 11.500558659217877, + "grad_norm": 0.4199307858943939, + "learning_rate": 0.0004264985994397759, + "loss": 0.3772, + "step": 20586 + }, + { + "epoch": 11.501117318435755, + "grad_norm": 1.419294834136963, + "learning_rate": 0.0004264705882352941, + "loss": 0.3452, + "step": 20587 + }, + { + "epoch": 11.501675977653631, + "grad_norm": 0.38322269916534424, + "learning_rate": 0.00042644257703081237, + "loss": 0.4025, + "step": 20588 + }, + { + "epoch": 11.502234636871508, + "grad_norm": 0.4934639632701874, + "learning_rate": 0.0004264145658263305, + "loss": 0.3242, + "step": 20589 + }, + { + "epoch": 11.502793296089386, + "grad_norm": 0.41777971386909485, + "learning_rate": 0.0004263865546218487, + "loss": 0.4451, + "step": 20590 + }, + { + "epoch": 11.503351955307263, + "grad_norm": 0.32535412907600403, + "learning_rate": 0.000426358543417367, + "loss": 0.345, + "step": 20591 + }, + { + "epoch": 11.50391061452514, + "grad_norm": 0.4173828661441803, + "learning_rate": 0.00042633053221288514, + "loss": 0.3212, + "step": 20592 + }, + { + "epoch": 11.504469273743016, + "grad_norm": 0.5191079378128052, + "learning_rate": 0.0004263025210084034, + "loss": 0.5303, + "step": 20593 + }, + { + "epoch": 11.505027932960894, + "grad_norm": 0.4365408718585968, + "learning_rate": 0.00042627450980392155, + "loss": 0.2847, + "step": 20594 + }, + { + "epoch": 11.505586592178771, + "grad_norm": 0.6085261702537537, + "learning_rate": 0.00042624649859943976, + "loss": 0.406, + "step": 20595 + }, + { + "epoch": 11.506145251396648, + "grad_norm": 0.9300827980041504, + "learning_rate": 0.000426218487394958, + "loss": 0.3382, + "step": 20596 + }, + { + "epoch": 11.506703910614526, + "grad_norm": 0.6728081703186035, + "learning_rate": 0.00042619047619047617, + "loss": 0.4516, + "step": 20597 + }, + { + "epoch": 11.507262569832402, + "grad_norm": 0.8834640979766846, + "learning_rate": 0.00042616246498599443, + "loss": 0.3371, + "step": 20598 + }, + { + "epoch": 11.507821229050279, + "grad_norm": 0.40679025650024414, + "learning_rate": 0.00042613445378151263, + "loss": 0.2974, + "step": 20599 + }, + { + "epoch": 11.508379888268156, + "grad_norm": 1.0970028638839722, + "learning_rate": 0.0004261064425770308, + "loss": 0.4262, + "step": 20600 + }, + { + "epoch": 11.508938547486034, + "grad_norm": 0.42507368326187134, + "learning_rate": 0.00042607843137254905, + "loss": 0.4422, + "step": 20601 + }, + { + "epoch": 11.50949720670391, + "grad_norm": 0.7584292888641357, + "learning_rate": 0.0004260504201680672, + "loss": 0.4584, + "step": 20602 + }, + { + "epoch": 11.510055865921787, + "grad_norm": 2.2101552486419678, + "learning_rate": 0.00042602240896358546, + "loss": 0.448, + "step": 20603 + }, + { + "epoch": 11.510614525139665, + "grad_norm": 0.7655834555625916, + "learning_rate": 0.00042599439775910366, + "loss": 0.5018, + "step": 20604 + }, + { + "epoch": 11.511173184357542, + "grad_norm": 0.8183310627937317, + "learning_rate": 0.0004259663865546218, + "loss": 0.5154, + "step": 20605 + }, + { + "epoch": 11.511731843575419, + "grad_norm": 0.45593780279159546, + "learning_rate": 0.0004259383753501401, + "loss": 0.3614, + "step": 20606 + }, + { + "epoch": 11.512290502793297, + "grad_norm": 0.4433230459690094, + "learning_rate": 0.0004259103641456583, + "loss": 0.2986, + "step": 20607 + }, + { + "epoch": 11.512849162011173, + "grad_norm": 0.4816119968891144, + "learning_rate": 0.0004258823529411765, + "loss": 0.4038, + "step": 20608 + }, + { + "epoch": 11.51340782122905, + "grad_norm": 0.4995867609977722, + "learning_rate": 0.0004258543417366947, + "loss": 0.4413, + "step": 20609 + }, + { + "epoch": 11.513966480446927, + "grad_norm": 0.39168286323547363, + "learning_rate": 0.00042582633053221285, + "loss": 0.4418, + "step": 20610 + }, + { + "epoch": 11.514525139664805, + "grad_norm": 0.34210118651390076, + "learning_rate": 0.0004257983193277311, + "loss": 0.3392, + "step": 20611 + }, + { + "epoch": 11.515083798882682, + "grad_norm": 0.39874303340911865, + "learning_rate": 0.0004257703081232493, + "loss": 0.3339, + "step": 20612 + }, + { + "epoch": 11.515642458100558, + "grad_norm": 0.34214985370635986, + "learning_rate": 0.0004257422969187675, + "loss": 0.4134, + "step": 20613 + }, + { + "epoch": 11.516201117318436, + "grad_norm": 0.40317386388778687, + "learning_rate": 0.0004257142857142857, + "loss": 0.3663, + "step": 20614 + }, + { + "epoch": 11.516759776536313, + "grad_norm": 0.4051573872566223, + "learning_rate": 0.00042568627450980393, + "loss": 0.3973, + "step": 20615 + }, + { + "epoch": 11.51731843575419, + "grad_norm": 0.6257272362709045, + "learning_rate": 0.00042565826330532214, + "loss": 0.3988, + "step": 20616 + }, + { + "epoch": 11.517877094972068, + "grad_norm": 0.5876336693763733, + "learning_rate": 0.00042563025210084034, + "loss": 0.4521, + "step": 20617 + }, + { + "epoch": 11.518435754189944, + "grad_norm": 0.3877696096897125, + "learning_rate": 0.00042560224089635855, + "loss": 0.3665, + "step": 20618 + }, + { + "epoch": 11.518994413407821, + "grad_norm": 0.803636372089386, + "learning_rate": 0.00042557422969187675, + "loss": 0.4727, + "step": 20619 + }, + { + "epoch": 11.519553072625698, + "grad_norm": 0.5632331967353821, + "learning_rate": 0.00042554621848739496, + "loss": 0.3677, + "step": 20620 + }, + { + "epoch": 11.520111731843576, + "grad_norm": 0.6445800065994263, + "learning_rate": 0.00042551820728291317, + "loss": 0.4206, + "step": 20621 + }, + { + "epoch": 11.520670391061453, + "grad_norm": 0.7842410802841187, + "learning_rate": 0.00042549019607843137, + "loss": 0.3914, + "step": 20622 + }, + { + "epoch": 11.521229050279329, + "grad_norm": 0.3753604292869568, + "learning_rate": 0.00042546218487394963, + "loss": 0.349, + "step": 20623 + }, + { + "epoch": 11.521787709497207, + "grad_norm": 0.5364634990692139, + "learning_rate": 0.0004254341736694678, + "loss": 0.4359, + "step": 20624 + }, + { + "epoch": 11.522346368715084, + "grad_norm": 0.41717830300331116, + "learning_rate": 0.000425406162464986, + "loss": 0.4191, + "step": 20625 + }, + { + "epoch": 11.52290502793296, + "grad_norm": 0.4487048387527466, + "learning_rate": 0.0004253781512605042, + "loss": 0.4097, + "step": 20626 + }, + { + "epoch": 11.523463687150837, + "grad_norm": 0.5949258208274841, + "learning_rate": 0.0004253501400560224, + "loss": 0.4337, + "step": 20627 + }, + { + "epoch": 11.524022346368715, + "grad_norm": 0.49007198214530945, + "learning_rate": 0.00042532212885154066, + "loss": 0.5302, + "step": 20628 + }, + { + "epoch": 11.524581005586592, + "grad_norm": 0.5633420944213867, + "learning_rate": 0.0004252941176470588, + "loss": 0.3146, + "step": 20629 + }, + { + "epoch": 11.525139664804469, + "grad_norm": 0.5080938339233398, + "learning_rate": 0.000425266106442577, + "loss": 0.3896, + "step": 20630 + }, + { + "epoch": 11.525698324022347, + "grad_norm": 0.5473659038543701, + "learning_rate": 0.0004252380952380953, + "loss": 0.465, + "step": 20631 + }, + { + "epoch": 11.526256983240224, + "grad_norm": 0.43636035919189453, + "learning_rate": 0.00042521008403361343, + "loss": 0.4344, + "step": 20632 + }, + { + "epoch": 11.5268156424581, + "grad_norm": 0.46388959884643555, + "learning_rate": 0.0004251820728291317, + "loss": 0.414, + "step": 20633 + }, + { + "epoch": 11.527374301675978, + "grad_norm": 0.41416123509407043, + "learning_rate": 0.00042515406162464984, + "loss": 0.3742, + "step": 20634 + }, + { + "epoch": 11.527932960893855, + "grad_norm": 0.5078537464141846, + "learning_rate": 0.00042512605042016805, + "loss": 0.4377, + "step": 20635 + }, + { + "epoch": 11.528491620111732, + "grad_norm": 0.4472281038761139, + "learning_rate": 0.0004250980392156863, + "loss": 0.3161, + "step": 20636 + }, + { + "epoch": 11.529050279329608, + "grad_norm": 0.4106791317462921, + "learning_rate": 0.00042507002801120446, + "loss": 0.4361, + "step": 20637 + }, + { + "epoch": 11.529608938547486, + "grad_norm": 0.48135775327682495, + "learning_rate": 0.0004250420168067227, + "loss": 0.4131, + "step": 20638 + }, + { + "epoch": 11.530167597765363, + "grad_norm": 0.4622468054294586, + "learning_rate": 0.00042501400560224093, + "loss": 0.3848, + "step": 20639 + }, + { + "epoch": 11.53072625698324, + "grad_norm": 0.3971022069454193, + "learning_rate": 0.0004249859943977591, + "loss": 0.3508, + "step": 20640 + }, + { + "epoch": 11.531284916201118, + "grad_norm": 0.5203205347061157, + "learning_rate": 0.00042495798319327734, + "loss": 0.389, + "step": 20641 + }, + { + "epoch": 11.531843575418995, + "grad_norm": 0.38899949193000793, + "learning_rate": 0.0004249299719887955, + "loss": 0.3718, + "step": 20642 + }, + { + "epoch": 11.532402234636871, + "grad_norm": 0.40717244148254395, + "learning_rate": 0.00042490196078431375, + "loss": 0.3621, + "step": 20643 + }, + { + "epoch": 11.53296089385475, + "grad_norm": 0.7253324389457703, + "learning_rate": 0.00042487394957983196, + "loss": 0.3396, + "step": 20644 + }, + { + "epoch": 11.533519553072626, + "grad_norm": 0.4019280970096588, + "learning_rate": 0.0004248459383753501, + "loss": 0.3684, + "step": 20645 + }, + { + "epoch": 11.534078212290503, + "grad_norm": 0.41470420360565186, + "learning_rate": 0.00042481792717086837, + "loss": 0.3996, + "step": 20646 + }, + { + "epoch": 11.53463687150838, + "grad_norm": 0.9583471417427063, + "learning_rate": 0.0004247899159663866, + "loss": 0.596, + "step": 20647 + }, + { + "epoch": 11.535195530726257, + "grad_norm": 0.8679529428482056, + "learning_rate": 0.0004247619047619048, + "loss": 0.4855, + "step": 20648 + }, + { + "epoch": 11.535754189944134, + "grad_norm": 5.495964527130127, + "learning_rate": 0.000424733893557423, + "loss": 0.3894, + "step": 20649 + }, + { + "epoch": 11.53631284916201, + "grad_norm": 0.39126309752464294, + "learning_rate": 0.00042470588235294114, + "loss": 0.3242, + "step": 20650 + }, + { + "epoch": 11.536871508379889, + "grad_norm": 0.5814370512962341, + "learning_rate": 0.0004246778711484594, + "loss": 0.4276, + "step": 20651 + }, + { + "epoch": 11.537430167597766, + "grad_norm": 0.5931792855262756, + "learning_rate": 0.0004246498599439776, + "loss": 0.4052, + "step": 20652 + }, + { + "epoch": 11.537988826815642, + "grad_norm": 0.7188152074813843, + "learning_rate": 0.0004246218487394958, + "loss": 0.3873, + "step": 20653 + }, + { + "epoch": 11.538547486033519, + "grad_norm": 0.28363490104675293, + "learning_rate": 0.000424593837535014, + "loss": 0.3008, + "step": 20654 + }, + { + "epoch": 11.539106145251397, + "grad_norm": 0.5203449130058289, + "learning_rate": 0.0004245658263305322, + "loss": 0.4843, + "step": 20655 + }, + { + "epoch": 11.539664804469274, + "grad_norm": 0.43934735655784607, + "learning_rate": 0.00042453781512605043, + "loss": 0.4092, + "step": 20656 + }, + { + "epoch": 11.54022346368715, + "grad_norm": 0.48860037326812744, + "learning_rate": 0.00042450980392156864, + "loss": 0.4898, + "step": 20657 + }, + { + "epoch": 11.540782122905028, + "grad_norm": 0.4542331397533417, + "learning_rate": 0.00042448179271708684, + "loss": 0.4047, + "step": 20658 + }, + { + "epoch": 11.541340782122905, + "grad_norm": 0.4746894836425781, + "learning_rate": 0.00042445378151260505, + "loss": 0.5379, + "step": 20659 + }, + { + "epoch": 11.541899441340782, + "grad_norm": Infinity, + "learning_rate": 0.00042445378151260505, + "loss": 0.547, + "step": 20660 + }, + { + "epoch": 11.54245810055866, + "grad_norm": 0.4919686019420624, + "learning_rate": 0.00042442577030812325, + "loss": 0.5296, + "step": 20661 + }, + { + "epoch": 11.543016759776537, + "grad_norm": 0.30281633138656616, + "learning_rate": 0.00042439775910364146, + "loss": 0.3203, + "step": 20662 + }, + { + "epoch": 11.543575418994413, + "grad_norm": 0.9004039168357849, + "learning_rate": 0.00042436974789915967, + "loss": 0.4759, + "step": 20663 + }, + { + "epoch": 11.544134078212291, + "grad_norm": 0.6114302277565002, + "learning_rate": 0.0004243417366946779, + "loss": 0.5435, + "step": 20664 + }, + { + "epoch": 11.544692737430168, + "grad_norm": 0.4828857183456421, + "learning_rate": 0.0004243137254901961, + "loss": 0.4714, + "step": 20665 + }, + { + "epoch": 11.545251396648045, + "grad_norm": 0.6116296052932739, + "learning_rate": 0.0004242857142857143, + "loss": 0.3286, + "step": 20666 + }, + { + "epoch": 11.545810055865921, + "grad_norm": 0.8764639496803284, + "learning_rate": 0.0004242577030812325, + "loss": 0.4248, + "step": 20667 + }, + { + "epoch": 11.5463687150838, + "grad_norm": 0.55197674036026, + "learning_rate": 0.0004242296918767507, + "loss": 0.5273, + "step": 20668 + }, + { + "epoch": 11.546927374301676, + "grad_norm": 0.8051908612251282, + "learning_rate": 0.00042420168067226896, + "loss": 0.4746, + "step": 20669 + }, + { + "epoch": 11.547486033519553, + "grad_norm": 0.625347375869751, + "learning_rate": 0.0004241736694677871, + "loss": 0.4496, + "step": 20670 + }, + { + "epoch": 11.548044692737431, + "grad_norm": 0.7284619808197021, + "learning_rate": 0.0004241456582633053, + "loss": 0.6122, + "step": 20671 + }, + { + "epoch": 11.548603351955308, + "grad_norm": 0.6920905709266663, + "learning_rate": 0.0004241176470588236, + "loss": 0.4777, + "step": 20672 + }, + { + "epoch": 11.549162011173184, + "grad_norm": 0.565899670124054, + "learning_rate": 0.0004240896358543417, + "loss": 0.3905, + "step": 20673 + }, + { + "epoch": 11.54972067039106, + "grad_norm": 0.4656364619731903, + "learning_rate": 0.00042406162464986, + "loss": 0.4232, + "step": 20674 + }, + { + "epoch": 11.550279329608939, + "grad_norm": 0.4026971161365509, + "learning_rate": 0.00042403361344537814, + "loss": 0.385, + "step": 20675 + }, + { + "epoch": 11.550837988826816, + "grad_norm": 0.4009784460067749, + "learning_rate": 0.00042400560224089634, + "loss": 0.3341, + "step": 20676 + }, + { + "epoch": 11.551396648044692, + "grad_norm": 1.2701104879379272, + "learning_rate": 0.0004239775910364146, + "loss": 0.3925, + "step": 20677 + }, + { + "epoch": 11.55195530726257, + "grad_norm": 0.5902116298675537, + "learning_rate": 0.00042394957983193276, + "loss": 0.4186, + "step": 20678 + }, + { + "epoch": 11.552513966480447, + "grad_norm": 1.1804386377334595, + "learning_rate": 0.000423921568627451, + "loss": 0.3698, + "step": 20679 + }, + { + "epoch": 11.553072625698324, + "grad_norm": 1.080917239189148, + "learning_rate": 0.0004238935574229692, + "loss": 0.5306, + "step": 20680 + }, + { + "epoch": 11.553631284916202, + "grad_norm": 0.5245029926300049, + "learning_rate": 0.0004238655462184874, + "loss": 0.5989, + "step": 20681 + }, + { + "epoch": 11.554189944134079, + "grad_norm": 0.5329370498657227, + "learning_rate": 0.00042383753501400563, + "loss": 0.3679, + "step": 20682 + }, + { + "epoch": 11.554748603351955, + "grad_norm": 1.1451594829559326, + "learning_rate": 0.0004238095238095238, + "loss": 0.4497, + "step": 20683 + }, + { + "epoch": 11.555307262569832, + "grad_norm": 0.4891843795776367, + "learning_rate": 0.00042378151260504205, + "loss": 0.3997, + "step": 20684 + }, + { + "epoch": 11.55586592178771, + "grad_norm": 0.42326244711875916, + "learning_rate": 0.00042375350140056025, + "loss": 0.4702, + "step": 20685 + }, + { + "epoch": 11.556424581005587, + "grad_norm": 0.688256025314331, + "learning_rate": 0.0004237254901960784, + "loss": 0.402, + "step": 20686 + }, + { + "epoch": 11.556983240223463, + "grad_norm": 0.4372667074203491, + "learning_rate": 0.00042369747899159666, + "loss": 0.5072, + "step": 20687 + }, + { + "epoch": 11.557541899441341, + "grad_norm": 0.7290706038475037, + "learning_rate": 0.00042366946778711487, + "loss": 0.5547, + "step": 20688 + }, + { + "epoch": 11.558100558659218, + "grad_norm": 0.5263441205024719, + "learning_rate": 0.0004236414565826331, + "loss": 0.4542, + "step": 20689 + }, + { + "epoch": 11.558659217877095, + "grad_norm": 0.573900043964386, + "learning_rate": 0.0004236134453781513, + "loss": 0.4285, + "step": 20690 + }, + { + "epoch": 11.559217877094973, + "grad_norm": 0.5307414531707764, + "learning_rate": 0.00042358543417366943, + "loss": 0.4331, + "step": 20691 + }, + { + "epoch": 11.55977653631285, + "grad_norm": 0.41378021240234375, + "learning_rate": 0.0004235574229691877, + "loss": 0.48, + "step": 20692 + }, + { + "epoch": 11.560335195530726, + "grad_norm": 0.5707724690437317, + "learning_rate": 0.0004235294117647059, + "loss": 0.2765, + "step": 20693 + }, + { + "epoch": 11.560893854748603, + "grad_norm": 0.7068626880645752, + "learning_rate": 0.0004235014005602241, + "loss": 0.4568, + "step": 20694 + }, + { + "epoch": 11.561452513966481, + "grad_norm": 0.60820472240448, + "learning_rate": 0.0004234733893557423, + "loss": 0.3903, + "step": 20695 + }, + { + "epoch": 11.562011173184358, + "grad_norm": 0.622393012046814, + "learning_rate": 0.0004234453781512605, + "loss": 0.3972, + "step": 20696 + }, + { + "epoch": 11.562569832402234, + "grad_norm": 0.593914270401001, + "learning_rate": 0.0004234173669467787, + "loss": 0.5067, + "step": 20697 + }, + { + "epoch": 11.563128491620112, + "grad_norm": 0.39853671193122864, + "learning_rate": 0.00042338935574229693, + "loss": 0.4047, + "step": 20698 + }, + { + "epoch": 11.563687150837989, + "grad_norm": 0.5406050682067871, + "learning_rate": 0.0004233613445378151, + "loss": 0.4034, + "step": 20699 + }, + { + "epoch": 11.564245810055866, + "grad_norm": 0.503292441368103, + "learning_rate": 0.00042333333333333334, + "loss": 0.3445, + "step": 20700 + }, + { + "epoch": 11.564804469273742, + "grad_norm": 0.41019079089164734, + "learning_rate": 0.00042330532212885155, + "loss": 0.4172, + "step": 20701 + }, + { + "epoch": 11.56536312849162, + "grad_norm": 0.4900561571121216, + "learning_rate": 0.00042327731092436975, + "loss": 0.6158, + "step": 20702 + }, + { + "epoch": 11.565921787709497, + "grad_norm": 5.624237537384033, + "learning_rate": 0.00042324929971988796, + "loss": 0.4684, + "step": 20703 + }, + { + "epoch": 11.566480446927374, + "grad_norm": 0.7807705998420715, + "learning_rate": 0.00042322128851540617, + "loss": 0.4705, + "step": 20704 + }, + { + "epoch": 11.567039106145252, + "grad_norm": 0.4776996970176697, + "learning_rate": 0.00042319327731092437, + "loss": 0.2959, + "step": 20705 + }, + { + "epoch": 11.567597765363129, + "grad_norm": 0.679042398929596, + "learning_rate": 0.0004231652661064426, + "loss": 0.4381, + "step": 20706 + }, + { + "epoch": 11.568156424581005, + "grad_norm": 0.44969356060028076, + "learning_rate": 0.00042313725490196084, + "loss": 0.4179, + "step": 20707 + }, + { + "epoch": 11.568715083798883, + "grad_norm": 0.48446929454803467, + "learning_rate": 0.000423109243697479, + "loss": 0.403, + "step": 20708 + }, + { + "epoch": 11.56927374301676, + "grad_norm": 0.36746180057525635, + "learning_rate": 0.0004230812324929972, + "loss": 0.3835, + "step": 20709 + }, + { + "epoch": 11.569832402234637, + "grad_norm": 0.34233328700065613, + "learning_rate": 0.0004230532212885154, + "loss": 0.3753, + "step": 20710 + }, + { + "epoch": 11.570391061452513, + "grad_norm": 0.7545540928840637, + "learning_rate": 0.0004230252100840336, + "loss": 0.3771, + "step": 20711 + }, + { + "epoch": 11.570949720670392, + "grad_norm": 0.7465047240257263, + "learning_rate": 0.00042299719887955187, + "loss": 0.424, + "step": 20712 + }, + { + "epoch": 11.571508379888268, + "grad_norm": 2.300227642059326, + "learning_rate": 0.00042296918767507, + "loss": 0.3609, + "step": 20713 + }, + { + "epoch": 11.572067039106145, + "grad_norm": 0.4362581670284271, + "learning_rate": 0.0004229411764705882, + "loss": 0.4067, + "step": 20714 + }, + { + "epoch": 11.572625698324023, + "grad_norm": 0.38785016536712646, + "learning_rate": 0.0004229131652661065, + "loss": 0.3483, + "step": 20715 + }, + { + "epoch": 11.5731843575419, + "grad_norm": 0.9845558404922485, + "learning_rate": 0.00042288515406162464, + "loss": 0.3439, + "step": 20716 + }, + { + "epoch": 11.573743016759776, + "grad_norm": 0.3465559184551239, + "learning_rate": 0.0004228571428571429, + "loss": 0.3822, + "step": 20717 + }, + { + "epoch": 11.574301675977654, + "grad_norm": 1.5967515707015991, + "learning_rate": 0.00042282913165266105, + "loss": 0.5245, + "step": 20718 + }, + { + "epoch": 11.574860335195531, + "grad_norm": 0.6658174991607666, + "learning_rate": 0.00042280112044817926, + "loss": 0.431, + "step": 20719 + }, + { + "epoch": 11.575418994413408, + "grad_norm": 1.569270133972168, + "learning_rate": 0.0004227731092436975, + "loss": 0.4943, + "step": 20720 + }, + { + "epoch": 11.575977653631284, + "grad_norm": 0.5644358396530151, + "learning_rate": 0.00042274509803921567, + "loss": 0.454, + "step": 20721 + }, + { + "epoch": 11.576536312849163, + "grad_norm": 2.7591288089752197, + "learning_rate": 0.00042271708683473393, + "loss": 0.4999, + "step": 20722 + }, + { + "epoch": 11.577094972067039, + "grad_norm": 1.3475178480148315, + "learning_rate": 0.00042268907563025213, + "loss": 0.5448, + "step": 20723 + }, + { + "epoch": 11.577653631284916, + "grad_norm": 0.6630269289016724, + "learning_rate": 0.0004226610644257703, + "loss": 0.4617, + "step": 20724 + }, + { + "epoch": 11.578212290502794, + "grad_norm": 6.117388725280762, + "learning_rate": 0.00042263305322128855, + "loss": 0.5274, + "step": 20725 + }, + { + "epoch": 11.57877094972067, + "grad_norm": 0.6152281761169434, + "learning_rate": 0.0004226050420168067, + "loss": 0.5729, + "step": 20726 + }, + { + "epoch": 11.579329608938547, + "grad_norm": 0.48115888237953186, + "learning_rate": 0.00042257703081232496, + "loss": 0.531, + "step": 20727 + }, + { + "epoch": 11.579888268156424, + "grad_norm": 0.46707797050476074, + "learning_rate": 0.00042254901960784316, + "loss": 0.4932, + "step": 20728 + }, + { + "epoch": 11.580446927374302, + "grad_norm": 0.41524726152420044, + "learning_rate": 0.0004225210084033613, + "loss": 0.3432, + "step": 20729 + }, + { + "epoch": 11.581005586592179, + "grad_norm": 0.4186629354953766, + "learning_rate": 0.0004224929971988796, + "loss": 0.4565, + "step": 20730 + }, + { + "epoch": 11.581564245810055, + "grad_norm": 0.39581429958343506, + "learning_rate": 0.0004224649859943978, + "loss": 0.3454, + "step": 20731 + }, + { + "epoch": 11.582122905027934, + "grad_norm": 0.4746589958667755, + "learning_rate": 0.000422436974789916, + "loss": 0.3529, + "step": 20732 + }, + { + "epoch": 11.58268156424581, + "grad_norm": 0.7876299619674683, + "learning_rate": 0.0004224089635854342, + "loss": 0.4922, + "step": 20733 + }, + { + "epoch": 11.583240223463687, + "grad_norm": 0.5797983407974243, + "learning_rate": 0.00042238095238095235, + "loss": 0.3779, + "step": 20734 + }, + { + "epoch": 11.583798882681565, + "grad_norm": 0.5217782258987427, + "learning_rate": 0.0004223529411764706, + "loss": 0.4248, + "step": 20735 + }, + { + "epoch": 11.584357541899442, + "grad_norm": 0.637617290019989, + "learning_rate": 0.0004223249299719888, + "loss": 0.4861, + "step": 20736 + }, + { + "epoch": 11.584916201117318, + "grad_norm": 0.5186054706573486, + "learning_rate": 0.000422296918767507, + "loss": 0.4451, + "step": 20737 + }, + { + "epoch": 11.585474860335196, + "grad_norm": 4.0225324630737305, + "learning_rate": 0.0004222689075630252, + "loss": 0.4671, + "step": 20738 + }, + { + "epoch": 11.586033519553073, + "grad_norm": 1.0567153692245483, + "learning_rate": 0.00042224089635854343, + "loss": 0.4672, + "step": 20739 + }, + { + "epoch": 11.58659217877095, + "grad_norm": 0.6557429432868958, + "learning_rate": 0.00042221288515406164, + "loss": 0.3473, + "step": 20740 + }, + { + "epoch": 11.587150837988826, + "grad_norm": 1.4269450902938843, + "learning_rate": 0.00042218487394957984, + "loss": 0.4872, + "step": 20741 + }, + { + "epoch": 11.587709497206705, + "grad_norm": 0.6591324210166931, + "learning_rate": 0.00042215686274509805, + "loss": 0.4492, + "step": 20742 + }, + { + "epoch": 11.588268156424581, + "grad_norm": 0.4599229395389557, + "learning_rate": 0.00042212885154061625, + "loss": 0.4154, + "step": 20743 + }, + { + "epoch": 11.588826815642458, + "grad_norm": 0.5963237285614014, + "learning_rate": 0.00042210084033613446, + "loss": 0.4679, + "step": 20744 + }, + { + "epoch": 11.589385474860336, + "grad_norm": 0.44597873091697693, + "learning_rate": 0.00042207282913165267, + "loss": 0.4093, + "step": 20745 + }, + { + "epoch": 11.589944134078213, + "grad_norm": 0.6215892434120178, + "learning_rate": 0.00042204481792717087, + "loss": 0.3445, + "step": 20746 + }, + { + "epoch": 11.59050279329609, + "grad_norm": 0.44218289852142334, + "learning_rate": 0.00042201680672268913, + "loss": 0.3988, + "step": 20747 + }, + { + "epoch": 11.591061452513966, + "grad_norm": 0.30041131377220154, + "learning_rate": 0.0004219887955182073, + "loss": 0.3357, + "step": 20748 + }, + { + "epoch": 11.591620111731844, + "grad_norm": 0.5011849999427795, + "learning_rate": 0.0004219607843137255, + "loss": 0.3558, + "step": 20749 + }, + { + "epoch": 11.59217877094972, + "grad_norm": 0.5519015789031982, + "learning_rate": 0.0004219327731092437, + "loss": 0.4369, + "step": 20750 + }, + { + "epoch": 11.592737430167597, + "grad_norm": 0.4621015787124634, + "learning_rate": 0.0004219047619047619, + "loss": 0.3814, + "step": 20751 + }, + { + "epoch": 11.593296089385476, + "grad_norm": 0.4769175350666046, + "learning_rate": 0.00042187675070028016, + "loss": 0.3719, + "step": 20752 + }, + { + "epoch": 11.593854748603352, + "grad_norm": 0.9842724204063416, + "learning_rate": 0.0004218487394957983, + "loss": 0.4707, + "step": 20753 + }, + { + "epoch": 11.594413407821229, + "grad_norm": 0.5011136531829834, + "learning_rate": 0.0004218207282913165, + "loss": 0.3874, + "step": 20754 + }, + { + "epoch": 11.594972067039105, + "grad_norm": 0.5640696287155151, + "learning_rate": 0.0004217927170868348, + "loss": 0.4147, + "step": 20755 + }, + { + "epoch": 11.595530726256984, + "grad_norm": 0.6944668292999268, + "learning_rate": 0.00042176470588235293, + "loss": 0.4971, + "step": 20756 + }, + { + "epoch": 11.59608938547486, + "grad_norm": 3.0826215744018555, + "learning_rate": 0.0004217366946778712, + "loss": 0.4951, + "step": 20757 + }, + { + "epoch": 11.596648044692737, + "grad_norm": 0.6589071154594421, + "learning_rate": 0.00042170868347338934, + "loss": 0.6799, + "step": 20758 + }, + { + "epoch": 11.597206703910615, + "grad_norm": 0.49689385294914246, + "learning_rate": 0.00042168067226890755, + "loss": 0.665, + "step": 20759 + }, + { + "epoch": 11.597765363128492, + "grad_norm": 2.7962138652801514, + "learning_rate": 0.0004216526610644258, + "loss": 0.378, + "step": 20760 + }, + { + "epoch": 11.598324022346368, + "grad_norm": 1.0508677959442139, + "learning_rate": 0.00042162464985994396, + "loss": 0.5141, + "step": 20761 + }, + { + "epoch": 11.598882681564247, + "grad_norm": 0.9208343029022217, + "learning_rate": 0.0004215966386554622, + "loss": 0.403, + "step": 20762 + }, + { + "epoch": 11.599441340782123, + "grad_norm": 0.4969636797904968, + "learning_rate": 0.00042156862745098043, + "loss": 0.4329, + "step": 20763 + }, + { + "epoch": 11.6, + "grad_norm": 0.597708523273468, + "learning_rate": 0.0004215406162464986, + "loss": 0.3487, + "step": 20764 + }, + { + "epoch": 11.600558659217878, + "grad_norm": 3.5610849857330322, + "learning_rate": 0.00042151260504201684, + "loss": 0.4488, + "step": 20765 + }, + { + "epoch": 11.601117318435755, + "grad_norm": 0.4976906478404999, + "learning_rate": 0.000421484593837535, + "loss": 0.4382, + "step": 20766 + }, + { + "epoch": 11.601675977653631, + "grad_norm": 1.0895826816558838, + "learning_rate": 0.00042145658263305325, + "loss": 0.4961, + "step": 20767 + }, + { + "epoch": 11.602234636871508, + "grad_norm": 0.38939765095710754, + "learning_rate": 0.00042142857142857146, + "loss": 0.4, + "step": 20768 + }, + { + "epoch": 11.602793296089386, + "grad_norm": 0.8267750144004822, + "learning_rate": 0.0004214005602240896, + "loss": 0.5018, + "step": 20769 + }, + { + "epoch": 11.603351955307263, + "grad_norm": 0.5503513216972351, + "learning_rate": 0.00042137254901960787, + "loss": 0.464, + "step": 20770 + }, + { + "epoch": 11.60391061452514, + "grad_norm": 0.45620280504226685, + "learning_rate": 0.0004213445378151261, + "loss": 0.4632, + "step": 20771 + }, + { + "epoch": 11.604469273743018, + "grad_norm": 0.4537382125854492, + "learning_rate": 0.0004213165266106443, + "loss": 0.312, + "step": 20772 + }, + { + "epoch": 11.605027932960894, + "grad_norm": 0.5610388517379761, + "learning_rate": 0.0004212885154061625, + "loss": 0.4445, + "step": 20773 + }, + { + "epoch": 11.60558659217877, + "grad_norm": 0.7527524828910828, + "learning_rate": 0.00042126050420168064, + "loss": 0.3672, + "step": 20774 + }, + { + "epoch": 11.606145251396647, + "grad_norm": 2.493600845336914, + "learning_rate": 0.0004212324929971989, + "loss": 0.4032, + "step": 20775 + }, + { + "epoch": 11.606703910614526, + "grad_norm": 0.4892067313194275, + "learning_rate": 0.0004212044817927171, + "loss": 0.2884, + "step": 20776 + }, + { + "epoch": 11.607262569832402, + "grad_norm": 0.8390831351280212, + "learning_rate": 0.0004211764705882353, + "loss": 0.6152, + "step": 20777 + }, + { + "epoch": 11.607821229050279, + "grad_norm": 0.4732486307621002, + "learning_rate": 0.0004211484593837535, + "loss": 0.4295, + "step": 20778 + }, + { + "epoch": 11.608379888268157, + "grad_norm": 0.8024941086769104, + "learning_rate": 0.0004211204481792717, + "loss": 0.4585, + "step": 20779 + }, + { + "epoch": 11.608938547486034, + "grad_norm": 0.5505977869033813, + "learning_rate": 0.00042109243697478993, + "loss": 0.3713, + "step": 20780 + }, + { + "epoch": 11.60949720670391, + "grad_norm": 0.7352420687675476, + "learning_rate": 0.00042106442577030814, + "loss": 0.5876, + "step": 20781 + }, + { + "epoch": 11.610055865921789, + "grad_norm": 0.5054692029953003, + "learning_rate": 0.00042103641456582634, + "loss": 0.4342, + "step": 20782 + }, + { + "epoch": 11.610614525139665, + "grad_norm": 4.087777614593506, + "learning_rate": 0.00042100840336134455, + "loss": 0.5471, + "step": 20783 + }, + { + "epoch": 11.611173184357542, + "grad_norm": 0.6946427226066589, + "learning_rate": 0.00042098039215686275, + "loss": 0.4172, + "step": 20784 + }, + { + "epoch": 11.611731843575418, + "grad_norm": 0.4584228992462158, + "learning_rate": 0.00042095238095238096, + "loss": 0.369, + "step": 20785 + }, + { + "epoch": 11.612290502793297, + "grad_norm": 0.6100547909736633, + "learning_rate": 0.00042092436974789917, + "loss": 0.4526, + "step": 20786 + }, + { + "epoch": 11.612849162011173, + "grad_norm": 1.6176875829696655, + "learning_rate": 0.0004208963585434174, + "loss": 0.4305, + "step": 20787 + }, + { + "epoch": 11.61340782122905, + "grad_norm": 0.5075684189796448, + "learning_rate": 0.0004208683473389356, + "loss": 0.3849, + "step": 20788 + }, + { + "epoch": 11.613966480446928, + "grad_norm": 0.5725739002227783, + "learning_rate": 0.0004208403361344538, + "loss": 0.3565, + "step": 20789 + }, + { + "epoch": 11.614525139664805, + "grad_norm": 0.5863872170448303, + "learning_rate": 0.000420812324929972, + "loss": 0.5095, + "step": 20790 + }, + { + "epoch": 11.615083798882681, + "grad_norm": 0.8431749939918518, + "learning_rate": 0.0004207843137254902, + "loss": 0.3658, + "step": 20791 + }, + { + "epoch": 11.61564245810056, + "grad_norm": 0.5147542357444763, + "learning_rate": 0.00042075630252100846, + "loss": 0.5801, + "step": 20792 + }, + { + "epoch": 11.616201117318436, + "grad_norm": 0.5380616188049316, + "learning_rate": 0.0004207282913165266, + "loss": 0.6028, + "step": 20793 + }, + { + "epoch": 11.616759776536313, + "grad_norm": 0.7210173010826111, + "learning_rate": 0.0004207002801120448, + "loss": 0.5253, + "step": 20794 + }, + { + "epoch": 11.61731843575419, + "grad_norm": 0.34263548254966736, + "learning_rate": 0.0004206722689075631, + "loss": 0.4054, + "step": 20795 + }, + { + "epoch": 11.617877094972068, + "grad_norm": 0.4094128906726837, + "learning_rate": 0.0004206442577030812, + "loss": 0.3966, + "step": 20796 + }, + { + "epoch": 11.618435754189944, + "grad_norm": 0.4657425284385681, + "learning_rate": 0.0004206162464985995, + "loss": 0.4782, + "step": 20797 + }, + { + "epoch": 11.61899441340782, + "grad_norm": 0.5022130012512207, + "learning_rate": 0.00042058823529411764, + "loss": 0.4481, + "step": 20798 + }, + { + "epoch": 11.619553072625699, + "grad_norm": 0.5468015670776367, + "learning_rate": 0.00042056022408963584, + "loss": 0.4901, + "step": 20799 + }, + { + "epoch": 11.620111731843576, + "grad_norm": 0.5638743042945862, + "learning_rate": 0.0004205322128851541, + "loss": 0.5493, + "step": 20800 + }, + { + "epoch": 11.620670391061452, + "grad_norm": 0.5176316499710083, + "learning_rate": 0.00042050420168067226, + "loss": 0.4455, + "step": 20801 + }, + { + "epoch": 11.621229050279329, + "grad_norm": 0.4255404770374298, + "learning_rate": 0.0004204761904761905, + "loss": 0.4306, + "step": 20802 + }, + { + "epoch": 11.621787709497207, + "grad_norm": 0.39125552773475647, + "learning_rate": 0.0004204481792717087, + "loss": 0.4491, + "step": 20803 + }, + { + "epoch": 11.622346368715084, + "grad_norm": 0.46689918637275696, + "learning_rate": 0.0004204201680672269, + "loss": 0.4205, + "step": 20804 + }, + { + "epoch": 11.62290502793296, + "grad_norm": 0.5403127074241638, + "learning_rate": 0.00042039215686274513, + "loss": 0.4873, + "step": 20805 + }, + { + "epoch": 11.623463687150839, + "grad_norm": 0.3441616892814636, + "learning_rate": 0.0004203641456582633, + "loss": 0.3205, + "step": 20806 + }, + { + "epoch": 11.624022346368715, + "grad_norm": 0.7106980681419373, + "learning_rate": 0.00042033613445378155, + "loss": 0.4552, + "step": 20807 + }, + { + "epoch": 11.624581005586592, + "grad_norm": 1.0218454599380493, + "learning_rate": 0.00042030812324929975, + "loss": 0.4789, + "step": 20808 + }, + { + "epoch": 11.62513966480447, + "grad_norm": 0.7296574115753174, + "learning_rate": 0.0004202801120448179, + "loss": 0.3677, + "step": 20809 + }, + { + "epoch": 11.625698324022347, + "grad_norm": 0.42564648389816284, + "learning_rate": 0.00042025210084033616, + "loss": 0.4097, + "step": 20810 + }, + { + "epoch": 11.626256983240223, + "grad_norm": 0.7909795641899109, + "learning_rate": 0.00042022408963585437, + "loss": 0.3307, + "step": 20811 + }, + { + "epoch": 11.6268156424581, + "grad_norm": 0.4138389527797699, + "learning_rate": 0.0004201960784313725, + "loss": 0.4124, + "step": 20812 + }, + { + "epoch": 11.627374301675978, + "grad_norm": 0.3971346616744995, + "learning_rate": 0.0004201680672268908, + "loss": 0.4456, + "step": 20813 + }, + { + "epoch": 11.627932960893855, + "grad_norm": 0.5548760890960693, + "learning_rate": 0.00042014005602240893, + "loss": 0.3383, + "step": 20814 + }, + { + "epoch": 11.628491620111731, + "grad_norm": 0.38403066992759705, + "learning_rate": 0.0004201120448179272, + "loss": 0.3961, + "step": 20815 + }, + { + "epoch": 11.62905027932961, + "grad_norm": 1.153908133506775, + "learning_rate": 0.0004200840336134454, + "loss": 0.4114, + "step": 20816 + }, + { + "epoch": 11.629608938547486, + "grad_norm": 0.4572281241416931, + "learning_rate": 0.00042005602240896355, + "loss": 0.3735, + "step": 20817 + }, + { + "epoch": 11.630167597765363, + "grad_norm": 1.088218331336975, + "learning_rate": 0.0004200280112044818, + "loss": 0.5125, + "step": 20818 + }, + { + "epoch": 11.630726256983241, + "grad_norm": 0.4902636706829071, + "learning_rate": 0.00042, + "loss": 0.4338, + "step": 20819 + }, + { + "epoch": 11.631284916201118, + "grad_norm": 0.828087329864502, + "learning_rate": 0.0004199719887955182, + "loss": 0.4828, + "step": 20820 + }, + { + "epoch": 11.631843575418994, + "grad_norm": 0.4409154951572418, + "learning_rate": 0.00041994397759103643, + "loss": 0.4241, + "step": 20821 + }, + { + "epoch": 11.63240223463687, + "grad_norm": 3.852677583694458, + "learning_rate": 0.0004199159663865546, + "loss": 0.521, + "step": 20822 + }, + { + "epoch": 11.632960893854749, + "grad_norm": 0.43827804923057556, + "learning_rate": 0.00041988795518207284, + "loss": 0.3276, + "step": 20823 + }, + { + "epoch": 11.633519553072626, + "grad_norm": 0.8542401194572449, + "learning_rate": 0.00041985994397759105, + "loss": 0.5294, + "step": 20824 + }, + { + "epoch": 11.634078212290502, + "grad_norm": 0.6204919219017029, + "learning_rate": 0.00041983193277310925, + "loss": 0.4354, + "step": 20825 + }, + { + "epoch": 11.63463687150838, + "grad_norm": 1.001447081565857, + "learning_rate": 0.00041980392156862746, + "loss": 0.4689, + "step": 20826 + }, + { + "epoch": 11.635195530726257, + "grad_norm": 0.4870937168598175, + "learning_rate": 0.00041977591036414567, + "loss": 0.4152, + "step": 20827 + }, + { + "epoch": 11.635754189944134, + "grad_norm": 0.5477774739265442, + "learning_rate": 0.00041974789915966387, + "loss": 0.3665, + "step": 20828 + }, + { + "epoch": 11.63631284916201, + "grad_norm": 0.4083455502986908, + "learning_rate": 0.0004197198879551821, + "loss": 0.4155, + "step": 20829 + }, + { + "epoch": 11.636871508379889, + "grad_norm": 0.32426324486732483, + "learning_rate": 0.0004196918767507003, + "loss": 0.3393, + "step": 20830 + }, + { + "epoch": 11.637430167597765, + "grad_norm": 0.446688175201416, + "learning_rate": 0.0004196638655462185, + "loss": 0.4328, + "step": 20831 + }, + { + "epoch": 11.637988826815642, + "grad_norm": 0.44993913173675537, + "learning_rate": 0.0004196358543417367, + "loss": 0.5483, + "step": 20832 + }, + { + "epoch": 11.63854748603352, + "grad_norm": 0.5969384908676147, + "learning_rate": 0.0004196078431372549, + "loss": 0.4346, + "step": 20833 + }, + { + "epoch": 11.639106145251397, + "grad_norm": 0.4580068290233612, + "learning_rate": 0.0004195798319327731, + "loss": 0.3369, + "step": 20834 + }, + { + "epoch": 11.639664804469273, + "grad_norm": 3.9589877128601074, + "learning_rate": 0.00041955182072829137, + "loss": 0.4299, + "step": 20835 + }, + { + "epoch": 11.640223463687152, + "grad_norm": 0.41882380843162537, + "learning_rate": 0.0004195238095238095, + "loss": 0.364, + "step": 20836 + }, + { + "epoch": 11.640782122905028, + "grad_norm": 6.274776458740234, + "learning_rate": 0.0004194957983193277, + "loss": 0.421, + "step": 20837 + }, + { + "epoch": 11.641340782122905, + "grad_norm": 0.3929876983165741, + "learning_rate": 0.00041946778711484593, + "loss": 0.4225, + "step": 20838 + }, + { + "epoch": 11.641899441340783, + "grad_norm": 3.386552333831787, + "learning_rate": 0.00041943977591036414, + "loss": 0.6064, + "step": 20839 + }, + { + "epoch": 11.64245810055866, + "grad_norm": 0.498435914516449, + "learning_rate": 0.0004194117647058824, + "loss": 0.5534, + "step": 20840 + }, + { + "epoch": 11.643016759776536, + "grad_norm": 0.45741790533065796, + "learning_rate": 0.00041938375350140055, + "loss": 0.3762, + "step": 20841 + }, + { + "epoch": 11.643575418994413, + "grad_norm": 0.742429256439209, + "learning_rate": 0.00041935574229691876, + "loss": 0.5266, + "step": 20842 + }, + { + "epoch": 11.644134078212291, + "grad_norm": 0.9417940378189087, + "learning_rate": 0.000419327731092437, + "loss": 0.439, + "step": 20843 + }, + { + "epoch": 11.644692737430168, + "grad_norm": 1.004478096961975, + "learning_rate": 0.00041929971988795517, + "loss": 0.4475, + "step": 20844 + }, + { + "epoch": 11.645251396648044, + "grad_norm": 0.962350606918335, + "learning_rate": 0.00041927170868347343, + "loss": 0.5052, + "step": 20845 + }, + { + "epoch": 11.645810055865923, + "grad_norm": 0.5542930364608765, + "learning_rate": 0.0004192436974789916, + "loss": 0.4201, + "step": 20846 + }, + { + "epoch": 11.6463687150838, + "grad_norm": 2.6939172744750977, + "learning_rate": 0.0004192156862745098, + "loss": 0.5011, + "step": 20847 + }, + { + "epoch": 11.646927374301676, + "grad_norm": 0.44292253255844116, + "learning_rate": 0.00041918767507002805, + "loss": 0.3903, + "step": 20848 + }, + { + "epoch": 11.647486033519552, + "grad_norm": 0.510962963104248, + "learning_rate": 0.0004191596638655462, + "loss": 0.5295, + "step": 20849 + }, + { + "epoch": 11.64804469273743, + "grad_norm": 0.5129163861274719, + "learning_rate": 0.00041913165266106446, + "loss": 0.4276, + "step": 20850 + }, + { + "epoch": 11.648603351955307, + "grad_norm": 0.4973587989807129, + "learning_rate": 0.00041910364145658266, + "loss": 0.4428, + "step": 20851 + }, + { + "epoch": 11.649162011173184, + "grad_norm": 0.5627951622009277, + "learning_rate": 0.0004190756302521008, + "loss": 0.4555, + "step": 20852 + }, + { + "epoch": 11.649720670391062, + "grad_norm": 2.3687398433685303, + "learning_rate": 0.0004190476190476191, + "loss": 0.4367, + "step": 20853 + }, + { + "epoch": 11.650279329608939, + "grad_norm": 22.65110206604004, + "learning_rate": 0.00041901960784313723, + "loss": 0.5558, + "step": 20854 + }, + { + "epoch": 11.650837988826815, + "grad_norm": 0.5034905076026917, + "learning_rate": 0.0004189915966386555, + "loss": 0.3427, + "step": 20855 + }, + { + "epoch": 11.651396648044694, + "grad_norm": 0.4828323721885681, + "learning_rate": 0.0004189635854341737, + "loss": 0.3909, + "step": 20856 + }, + { + "epoch": 11.65195530726257, + "grad_norm": 0.3465307652950287, + "learning_rate": 0.00041893557422969185, + "loss": 0.3121, + "step": 20857 + }, + { + "epoch": 11.652513966480447, + "grad_norm": 0.6500255465507507, + "learning_rate": 0.0004189075630252101, + "loss": 0.5043, + "step": 20858 + }, + { + "epoch": 11.653072625698323, + "grad_norm": 0.4145534336566925, + "learning_rate": 0.0004188795518207283, + "loss": 0.4569, + "step": 20859 + }, + { + "epoch": 11.653631284916202, + "grad_norm": 0.4527547061443329, + "learning_rate": 0.0004188515406162465, + "loss": 0.4054, + "step": 20860 + }, + { + "epoch": 11.654189944134078, + "grad_norm": 0.5085048675537109, + "learning_rate": 0.0004188235294117647, + "loss": 0.5424, + "step": 20861 + }, + { + "epoch": 11.654748603351955, + "grad_norm": 0.6608612537384033, + "learning_rate": 0.0004187955182072829, + "loss": 0.5669, + "step": 20862 + }, + { + "epoch": 11.655307262569833, + "grad_norm": 0.45789891481399536, + "learning_rate": 0.00041876750700280114, + "loss": 0.3768, + "step": 20863 + }, + { + "epoch": 11.65586592178771, + "grad_norm": 0.3989100456237793, + "learning_rate": 0.00041873949579831934, + "loss": 0.3732, + "step": 20864 + }, + { + "epoch": 11.656424581005586, + "grad_norm": 0.4907222092151642, + "learning_rate": 0.00041871148459383755, + "loss": 0.4106, + "step": 20865 + }, + { + "epoch": 11.656983240223465, + "grad_norm": 0.9251715540885925, + "learning_rate": 0.00041868347338935575, + "loss": 0.4696, + "step": 20866 + }, + { + "epoch": 11.657541899441341, + "grad_norm": 1.0267250537872314, + "learning_rate": 0.00041865546218487396, + "loss": 0.5998, + "step": 20867 + }, + { + "epoch": 11.658100558659218, + "grad_norm": 4.684356689453125, + "learning_rate": 0.00041862745098039217, + "loss": 0.3843, + "step": 20868 + }, + { + "epoch": 11.658659217877094, + "grad_norm": 0.4320904612541199, + "learning_rate": 0.00041859943977591037, + "loss": 0.4287, + "step": 20869 + }, + { + "epoch": 11.659217877094973, + "grad_norm": 5.6273627281188965, + "learning_rate": 0.0004185714285714286, + "loss": 0.3861, + "step": 20870 + }, + { + "epoch": 11.65977653631285, + "grad_norm": 0.40378838777542114, + "learning_rate": 0.0004185434173669468, + "loss": 0.3187, + "step": 20871 + }, + { + "epoch": 11.660335195530726, + "grad_norm": 0.5093754529953003, + "learning_rate": 0.000418515406162465, + "loss": 0.363, + "step": 20872 + }, + { + "epoch": 11.660893854748604, + "grad_norm": 0.7717400789260864, + "learning_rate": 0.0004184873949579832, + "loss": 0.4281, + "step": 20873 + }, + { + "epoch": 11.66145251396648, + "grad_norm": 0.6055014729499817, + "learning_rate": 0.0004184593837535014, + "loss": 0.4512, + "step": 20874 + }, + { + "epoch": 11.662011173184357, + "grad_norm": 0.4012812674045563, + "learning_rate": 0.00041843137254901966, + "loss": 0.3706, + "step": 20875 + }, + { + "epoch": 11.662569832402234, + "grad_norm": 0.3984925150871277, + "learning_rate": 0.0004184033613445378, + "loss": 0.4649, + "step": 20876 + }, + { + "epoch": 11.663128491620112, + "grad_norm": 0.5753477215766907, + "learning_rate": 0.000418375350140056, + "loss": 0.31, + "step": 20877 + }, + { + "epoch": 11.663687150837989, + "grad_norm": 2.1902623176574707, + "learning_rate": 0.0004183473389355742, + "loss": 0.5273, + "step": 20878 + }, + { + "epoch": 11.664245810055865, + "grad_norm": 0.4382067024707794, + "learning_rate": 0.00041831932773109243, + "loss": 0.4245, + "step": 20879 + }, + { + "epoch": 11.664804469273744, + "grad_norm": 0.8010841608047485, + "learning_rate": 0.0004182913165266107, + "loss": 0.4593, + "step": 20880 + }, + { + "epoch": 11.66536312849162, + "grad_norm": 0.46820876002311707, + "learning_rate": 0.00041826330532212884, + "loss": 0.4346, + "step": 20881 + }, + { + "epoch": 11.665921787709497, + "grad_norm": 0.5519371032714844, + "learning_rate": 0.00041823529411764705, + "loss": 0.4786, + "step": 20882 + }, + { + "epoch": 11.666480446927375, + "grad_norm": 0.45646730065345764, + "learning_rate": 0.0004182072829131653, + "loss": 0.3799, + "step": 20883 + }, + { + "epoch": 11.667039106145252, + "grad_norm": 0.5841661691665649, + "learning_rate": 0.00041817927170868346, + "loss": 0.4647, + "step": 20884 + }, + { + "epoch": 11.667597765363128, + "grad_norm": 0.5996007919311523, + "learning_rate": 0.0004181512605042017, + "loss": 0.3582, + "step": 20885 + }, + { + "epoch": 11.668156424581005, + "grad_norm": 0.4924525022506714, + "learning_rate": 0.0004181232492997199, + "loss": 0.334, + "step": 20886 + }, + { + "epoch": 11.668715083798883, + "grad_norm": 0.5998673439025879, + "learning_rate": 0.0004180952380952381, + "loss": 0.4265, + "step": 20887 + }, + { + "epoch": 11.66927374301676, + "grad_norm": 0.4154660999774933, + "learning_rate": 0.00041806722689075634, + "loss": 0.3879, + "step": 20888 + }, + { + "epoch": 11.669832402234636, + "grad_norm": 0.7532088756561279, + "learning_rate": 0.0004180392156862745, + "loss": 0.4445, + "step": 20889 + }, + { + "epoch": 11.670391061452515, + "grad_norm": 0.4688023626804352, + "learning_rate": 0.00041801120448179275, + "loss": 0.4407, + "step": 20890 + }, + { + "epoch": 11.670949720670391, + "grad_norm": 1.3237495422363281, + "learning_rate": 0.00041798319327731096, + "loss": 0.4689, + "step": 20891 + }, + { + "epoch": 11.671508379888268, + "grad_norm": 1.004561424255371, + "learning_rate": 0.0004179551820728291, + "loss": 0.5923, + "step": 20892 + }, + { + "epoch": 11.672067039106146, + "grad_norm": 0.4932515621185303, + "learning_rate": 0.00041792717086834737, + "loss": 0.4147, + "step": 20893 + }, + { + "epoch": 11.672625698324023, + "grad_norm": 0.406093567609787, + "learning_rate": 0.0004178991596638655, + "loss": 0.4448, + "step": 20894 + }, + { + "epoch": 11.6731843575419, + "grad_norm": 1.1091928482055664, + "learning_rate": 0.0004178711484593838, + "loss": 0.4067, + "step": 20895 + }, + { + "epoch": 11.673743016759776, + "grad_norm": 0.699067234992981, + "learning_rate": 0.000417843137254902, + "loss": 0.3972, + "step": 20896 + }, + { + "epoch": 11.674301675977654, + "grad_norm": 1.3785072565078735, + "learning_rate": 0.00041781512605042014, + "loss": 0.4695, + "step": 20897 + }, + { + "epoch": 11.67486033519553, + "grad_norm": 0.5263012647628784, + "learning_rate": 0.0004177871148459384, + "loss": 0.3302, + "step": 20898 + }, + { + "epoch": 11.675418994413407, + "grad_norm": 0.4251067638397217, + "learning_rate": 0.0004177591036414566, + "loss": 0.4764, + "step": 20899 + }, + { + "epoch": 11.675977653631286, + "grad_norm": 0.47318360209465027, + "learning_rate": 0.0004177310924369748, + "loss": 0.5472, + "step": 20900 + }, + { + "epoch": 11.676536312849162, + "grad_norm": 0.3908814489841461, + "learning_rate": 0.000417703081232493, + "loss": 0.5137, + "step": 20901 + }, + { + "epoch": 11.677094972067039, + "grad_norm": 0.6362344622612, + "learning_rate": 0.00041767507002801117, + "loss": 0.4494, + "step": 20902 + }, + { + "epoch": 11.677653631284915, + "grad_norm": 0.5055546164512634, + "learning_rate": 0.00041764705882352943, + "loss": 0.488, + "step": 20903 + }, + { + "epoch": 11.678212290502794, + "grad_norm": 0.5171534419059753, + "learning_rate": 0.00041761904761904764, + "loss": 0.484, + "step": 20904 + }, + { + "epoch": 11.67877094972067, + "grad_norm": 2.00119686126709, + "learning_rate": 0.00041759103641456584, + "loss": 0.4824, + "step": 20905 + }, + { + "epoch": 11.679329608938547, + "grad_norm": 0.6595990657806396, + "learning_rate": 0.00041756302521008405, + "loss": 0.4405, + "step": 20906 + }, + { + "epoch": 11.679888268156425, + "grad_norm": 0.5611948370933533, + "learning_rate": 0.00041753501400560225, + "loss": 0.4683, + "step": 20907 + }, + { + "epoch": 11.680446927374302, + "grad_norm": 0.37523218989372253, + "learning_rate": 0.00041750700280112046, + "loss": 0.3346, + "step": 20908 + }, + { + "epoch": 11.681005586592178, + "grad_norm": 0.5734403729438782, + "learning_rate": 0.00041747899159663867, + "loss": 0.4778, + "step": 20909 + }, + { + "epoch": 11.681564245810057, + "grad_norm": 0.5474722385406494, + "learning_rate": 0.00041745098039215687, + "loss": 0.4319, + "step": 20910 + }, + { + "epoch": 11.682122905027933, + "grad_norm": 0.5411948561668396, + "learning_rate": 0.0004174229691876751, + "loss": 0.4802, + "step": 20911 + }, + { + "epoch": 11.68268156424581, + "grad_norm": 0.5044373273849487, + "learning_rate": 0.0004173949579831933, + "loss": 0.3734, + "step": 20912 + }, + { + "epoch": 11.683240223463688, + "grad_norm": 0.3963293135166168, + "learning_rate": 0.0004173669467787115, + "loss": 0.3806, + "step": 20913 + }, + { + "epoch": 11.683798882681565, + "grad_norm": 0.5499990582466125, + "learning_rate": 0.0004173389355742297, + "loss": 0.4071, + "step": 20914 + }, + { + "epoch": 11.684357541899441, + "grad_norm": 3.9386253356933594, + "learning_rate": 0.00041731092436974796, + "loss": 0.5305, + "step": 20915 + }, + { + "epoch": 11.684916201117318, + "grad_norm": 1.509673833847046, + "learning_rate": 0.0004172829131652661, + "loss": 0.4179, + "step": 20916 + }, + { + "epoch": 11.685474860335196, + "grad_norm": 0.8844584226608276, + "learning_rate": 0.0004172549019607843, + "loss": 0.4317, + "step": 20917 + }, + { + "epoch": 11.686033519553073, + "grad_norm": 0.43616244196891785, + "learning_rate": 0.0004172268907563025, + "loss": 0.4323, + "step": 20918 + }, + { + "epoch": 11.68659217877095, + "grad_norm": 0.4146277904510498, + "learning_rate": 0.0004171988795518207, + "loss": 0.3762, + "step": 20919 + }, + { + "epoch": 11.687150837988828, + "grad_norm": 0.6163322925567627, + "learning_rate": 0.00041717086834733893, + "loss": 0.4465, + "step": 20920 + }, + { + "epoch": 11.687709497206704, + "grad_norm": 0.5647704005241394, + "learning_rate": 0.00041714285714285714, + "loss": 0.4416, + "step": 20921 + }, + { + "epoch": 11.68826815642458, + "grad_norm": 0.5847733020782471, + "learning_rate": 0.00041711484593837534, + "loss": 0.4299, + "step": 20922 + }, + { + "epoch": 11.688826815642457, + "grad_norm": 0.4575791358947754, + "learning_rate": 0.0004170868347338936, + "loss": 0.4421, + "step": 20923 + }, + { + "epoch": 11.689385474860336, + "grad_norm": 0.7809575796127319, + "learning_rate": 0.00041705882352941176, + "loss": 0.4015, + "step": 20924 + }, + { + "epoch": 11.689944134078212, + "grad_norm": 0.4937390685081482, + "learning_rate": 0.00041703081232492996, + "loss": 0.4945, + "step": 20925 + }, + { + "epoch": 11.690502793296089, + "grad_norm": 0.518347442150116, + "learning_rate": 0.00041700280112044817, + "loss": 0.3909, + "step": 20926 + }, + { + "epoch": 11.691061452513967, + "grad_norm": 0.4789871573448181, + "learning_rate": 0.0004169747899159664, + "loss": 0.538, + "step": 20927 + }, + { + "epoch": 11.691620111731844, + "grad_norm": 0.5728869438171387, + "learning_rate": 0.00041694677871148463, + "loss": 0.4441, + "step": 20928 + }, + { + "epoch": 11.69217877094972, + "grad_norm": 0.4812524914741516, + "learning_rate": 0.0004169187675070028, + "loss": 0.5193, + "step": 20929 + }, + { + "epoch": 11.692737430167599, + "grad_norm": 0.5269397497177124, + "learning_rate": 0.000416890756302521, + "loss": 0.4327, + "step": 20930 + }, + { + "epoch": 11.693296089385475, + "grad_norm": 0.5200487971305847, + "learning_rate": 0.00041686274509803925, + "loss": 0.4149, + "step": 20931 + }, + { + "epoch": 11.693854748603352, + "grad_norm": 0.5177311301231384, + "learning_rate": 0.0004168347338935574, + "loss": 0.5222, + "step": 20932 + }, + { + "epoch": 11.694413407821228, + "grad_norm": 1.0114914178848267, + "learning_rate": 0.00041680672268907566, + "loss": 0.4473, + "step": 20933 + }, + { + "epoch": 11.694972067039107, + "grad_norm": 0.39139434695243835, + "learning_rate": 0.0004167787114845938, + "loss": 0.4353, + "step": 20934 + }, + { + "epoch": 11.695530726256983, + "grad_norm": 0.671846866607666, + "learning_rate": 0.000416750700280112, + "loss": 0.5476, + "step": 20935 + }, + { + "epoch": 11.69608938547486, + "grad_norm": 2.2795498371124268, + "learning_rate": 0.0004167226890756303, + "loss": 0.4468, + "step": 20936 + }, + { + "epoch": 11.696648044692738, + "grad_norm": 0.4834044277667999, + "learning_rate": 0.00041669467787114843, + "loss": 0.4177, + "step": 20937 + }, + { + "epoch": 11.697206703910615, + "grad_norm": 0.7285804748535156, + "learning_rate": 0.0004166666666666667, + "loss": 0.5578, + "step": 20938 + }, + { + "epoch": 11.697765363128491, + "grad_norm": 4.1155104637146, + "learning_rate": 0.0004166386554621849, + "loss": 0.4506, + "step": 20939 + }, + { + "epoch": 11.69832402234637, + "grad_norm": 0.43977445363998413, + "learning_rate": 0.00041661064425770305, + "loss": 0.4407, + "step": 20940 + }, + { + "epoch": 11.698882681564246, + "grad_norm": 0.363665372133255, + "learning_rate": 0.0004165826330532213, + "loss": 0.2814, + "step": 20941 + }, + { + "epoch": 11.699441340782123, + "grad_norm": 1.0198465585708618, + "learning_rate": 0.00041655462184873946, + "loss": 0.3967, + "step": 20942 + }, + { + "epoch": 11.7, + "grad_norm": 1.0242189168930054, + "learning_rate": 0.0004165266106442577, + "loss": 0.4392, + "step": 20943 + }, + { + "epoch": 11.700558659217878, + "grad_norm": 0.815778911113739, + "learning_rate": 0.00041649859943977593, + "loss": 0.4214, + "step": 20944 + }, + { + "epoch": 11.701117318435754, + "grad_norm": 1.8759633302688599, + "learning_rate": 0.0004164705882352941, + "loss": 0.4485, + "step": 20945 + }, + { + "epoch": 11.70167597765363, + "grad_norm": 1.4267017841339111, + "learning_rate": 0.00041644257703081234, + "loss": 0.3579, + "step": 20946 + }, + { + "epoch": 11.702234636871509, + "grad_norm": 0.706054151058197, + "learning_rate": 0.00041641456582633055, + "loss": 0.3676, + "step": 20947 + }, + { + "epoch": 11.702793296089386, + "grad_norm": 1.0912322998046875, + "learning_rate": 0.00041638655462184875, + "loss": 0.457, + "step": 20948 + }, + { + "epoch": 11.703351955307262, + "grad_norm": 0.4204034209251404, + "learning_rate": 0.00041635854341736696, + "loss": 0.4386, + "step": 20949 + }, + { + "epoch": 11.703910614525139, + "grad_norm": 1.5568163394927979, + "learning_rate": 0.0004163305322128851, + "loss": 0.4777, + "step": 20950 + }, + { + "epoch": 11.704469273743017, + "grad_norm": 0.7590762972831726, + "learning_rate": 0.00041630252100840337, + "loss": 0.4947, + "step": 20951 + }, + { + "epoch": 11.705027932960894, + "grad_norm": 0.5208397507667542, + "learning_rate": 0.0004162745098039216, + "loss": 0.3341, + "step": 20952 + }, + { + "epoch": 11.70558659217877, + "grad_norm": 0.5330596566200256, + "learning_rate": 0.0004162464985994398, + "loss": 0.5348, + "step": 20953 + }, + { + "epoch": 11.706145251396649, + "grad_norm": 0.5282301306724548, + "learning_rate": 0.000416218487394958, + "loss": 0.4584, + "step": 20954 + }, + { + "epoch": 11.706703910614525, + "grad_norm": 0.6201488375663757, + "learning_rate": 0.0004161904761904762, + "loss": 0.462, + "step": 20955 + }, + { + "epoch": 11.707262569832402, + "grad_norm": 0.5473606586456299, + "learning_rate": 0.0004161624649859944, + "loss": 0.4445, + "step": 20956 + }, + { + "epoch": 11.70782122905028, + "grad_norm": 0.42217910289764404, + "learning_rate": 0.0004161344537815126, + "loss": 0.4231, + "step": 20957 + }, + { + "epoch": 11.708379888268157, + "grad_norm": 0.5057543516159058, + "learning_rate": 0.00041610644257703087, + "loss": 0.3717, + "step": 20958 + }, + { + "epoch": 11.708938547486033, + "grad_norm": 0.35181501507759094, + "learning_rate": 0.000416078431372549, + "loss": 0.3805, + "step": 20959 + }, + { + "epoch": 11.70949720670391, + "grad_norm": 0.3671259582042694, + "learning_rate": 0.0004160504201680672, + "loss": 0.3887, + "step": 20960 + }, + { + "epoch": 11.710055865921788, + "grad_norm": 0.5064424872398376, + "learning_rate": 0.00041602240896358543, + "loss": 0.5408, + "step": 20961 + }, + { + "epoch": 11.710614525139665, + "grad_norm": 0.49026134610176086, + "learning_rate": 0.00041599439775910364, + "loss": 0.3025, + "step": 20962 + }, + { + "epoch": 11.711173184357541, + "grad_norm": 3.980346202850342, + "learning_rate": 0.0004159663865546219, + "loss": 0.3122, + "step": 20963 + }, + { + "epoch": 11.71173184357542, + "grad_norm": 0.5637076497077942, + "learning_rate": 0.00041593837535014005, + "loss": 0.5266, + "step": 20964 + }, + { + "epoch": 11.712290502793296, + "grad_norm": 0.6552236080169678, + "learning_rate": 0.00041591036414565826, + "loss": 0.4445, + "step": 20965 + }, + { + "epoch": 11.712849162011173, + "grad_norm": 0.8326232433319092, + "learning_rate": 0.0004158823529411765, + "loss": 0.3617, + "step": 20966 + }, + { + "epoch": 11.713407821229051, + "grad_norm": 0.4824349582195282, + "learning_rate": 0.00041585434173669467, + "loss": 0.3881, + "step": 20967 + }, + { + "epoch": 11.713966480446928, + "grad_norm": 3.46415114402771, + "learning_rate": 0.00041582633053221293, + "loss": 0.3791, + "step": 20968 + }, + { + "epoch": 11.714525139664804, + "grad_norm": 0.4594406485557556, + "learning_rate": 0.0004157983193277311, + "loss": 0.3782, + "step": 20969 + }, + { + "epoch": 11.71508379888268, + "grad_norm": 0.47127431631088257, + "learning_rate": 0.0004157703081232493, + "loss": 0.3632, + "step": 20970 + }, + { + "epoch": 11.71564245810056, + "grad_norm": 0.6570918560028076, + "learning_rate": 0.00041574229691876755, + "loss": 0.4198, + "step": 20971 + }, + { + "epoch": 11.716201117318436, + "grad_norm": 0.4378843605518341, + "learning_rate": 0.0004157142857142857, + "loss": 0.3927, + "step": 20972 + }, + { + "epoch": 11.716759776536312, + "grad_norm": 0.540259599685669, + "learning_rate": 0.00041568627450980396, + "loss": 0.445, + "step": 20973 + }, + { + "epoch": 11.71731843575419, + "grad_norm": 1.033901572227478, + "learning_rate": 0.00041565826330532216, + "loss": 0.4615, + "step": 20974 + }, + { + "epoch": 11.717877094972067, + "grad_norm": 0.7708370685577393, + "learning_rate": 0.0004156302521008403, + "loss": 0.5225, + "step": 20975 + }, + { + "epoch": 11.718435754189944, + "grad_norm": 0.7920206189155579, + "learning_rate": 0.0004156022408963586, + "loss": 0.4068, + "step": 20976 + }, + { + "epoch": 11.71899441340782, + "grad_norm": 0.9534438848495483, + "learning_rate": 0.00041557422969187673, + "loss": 0.4742, + "step": 20977 + }, + { + "epoch": 11.719553072625699, + "grad_norm": 0.5032855272293091, + "learning_rate": 0.000415546218487395, + "loss": 0.4401, + "step": 20978 + }, + { + "epoch": 11.720111731843575, + "grad_norm": 0.3207739293575287, + "learning_rate": 0.0004155182072829132, + "loss": 0.2921, + "step": 20979 + }, + { + "epoch": 11.720670391061452, + "grad_norm": 0.3972409665584564, + "learning_rate": 0.00041549019607843135, + "loss": 0.4261, + "step": 20980 + }, + { + "epoch": 11.72122905027933, + "grad_norm": 1.6158636808395386, + "learning_rate": 0.0004154621848739496, + "loss": 0.5182, + "step": 20981 + }, + { + "epoch": 11.721787709497207, + "grad_norm": 0.5116075873374939, + "learning_rate": 0.0004154341736694678, + "loss": 0.4188, + "step": 20982 + }, + { + "epoch": 11.722346368715083, + "grad_norm": 0.6547574996948242, + "learning_rate": 0.000415406162464986, + "loss": 0.5221, + "step": 20983 + }, + { + "epoch": 11.722905027932962, + "grad_norm": 9.015817642211914, + "learning_rate": 0.0004153781512605042, + "loss": 0.4194, + "step": 20984 + }, + { + "epoch": 11.723463687150838, + "grad_norm": 0.5121548175811768, + "learning_rate": 0.0004153501400560224, + "loss": 0.4588, + "step": 20985 + }, + { + "epoch": 11.724022346368715, + "grad_norm": 0.6617794036865234, + "learning_rate": 0.00041532212885154064, + "loss": 0.5173, + "step": 20986 + }, + { + "epoch": 11.724581005586593, + "grad_norm": 0.4122159779071808, + "learning_rate": 0.00041529411764705884, + "loss": 0.5111, + "step": 20987 + }, + { + "epoch": 11.72513966480447, + "grad_norm": 1.077163815498352, + "learning_rate": 0.00041526610644257705, + "loss": 0.4942, + "step": 20988 + }, + { + "epoch": 11.725698324022346, + "grad_norm": 0.9594558477401733, + "learning_rate": 0.00041523809523809525, + "loss": 0.6044, + "step": 20989 + }, + { + "epoch": 11.726256983240223, + "grad_norm": 0.5694657564163208, + "learning_rate": 0.00041521008403361346, + "loss": 0.4677, + "step": 20990 + }, + { + "epoch": 11.726815642458101, + "grad_norm": 0.5785322785377502, + "learning_rate": 0.00041518207282913167, + "loss": 0.5567, + "step": 20991 + }, + { + "epoch": 11.727374301675978, + "grad_norm": 0.449042946100235, + "learning_rate": 0.00041515406162464987, + "loss": 0.4259, + "step": 20992 + }, + { + "epoch": 11.727932960893854, + "grad_norm": 0.5365514159202576, + "learning_rate": 0.0004151260504201681, + "loss": 0.4718, + "step": 20993 + }, + { + "epoch": 11.728491620111733, + "grad_norm": 1.9636155366897583, + "learning_rate": 0.0004150980392156863, + "loss": 0.451, + "step": 20994 + }, + { + "epoch": 11.72905027932961, + "grad_norm": 0.8794490098953247, + "learning_rate": 0.0004150700280112045, + "loss": 0.3745, + "step": 20995 + }, + { + "epoch": 11.729608938547486, + "grad_norm": 1.2629022598266602, + "learning_rate": 0.0004150420168067227, + "loss": 0.4083, + "step": 20996 + }, + { + "epoch": 11.730167597765362, + "grad_norm": 0.4553702473640442, + "learning_rate": 0.0004150140056022409, + "loss": 0.3784, + "step": 20997 + }, + { + "epoch": 11.73072625698324, + "grad_norm": 0.5279976725578308, + "learning_rate": 0.00041498599439775916, + "loss": 0.3578, + "step": 20998 + }, + { + "epoch": 11.731284916201117, + "grad_norm": 1.3618521690368652, + "learning_rate": 0.0004149579831932773, + "loss": 0.3737, + "step": 20999 + }, + { + "epoch": 11.731843575418994, + "grad_norm": 0.4541051685810089, + "learning_rate": 0.0004149299719887955, + "loss": 0.4536, + "step": 21000 + }, + { + "epoch": 11.731843575418994, + "eval_cer": 0.08786400881585982, + "eval_loss": 0.33108505606651306, + "eval_runtime": 56.2558, + "eval_samples_per_second": 80.667, + "eval_steps_per_second": 5.048, + "eval_wer": 0.34750425965755144, + "step": 21000 + }, + { + "epoch": 11.732402234636872, + "grad_norm": 0.7673790454864502, + "learning_rate": 0.0004149019607843137, + "loss": 0.5946, + "step": 21001 + }, + { + "epoch": 11.732960893854749, + "grad_norm": 0.3800128400325775, + "learning_rate": 0.00041487394957983193, + "loss": 0.4643, + "step": 21002 + }, + { + "epoch": 11.733519553072625, + "grad_norm": 0.4540000557899475, + "learning_rate": 0.0004148459383753502, + "loss": 0.4183, + "step": 21003 + }, + { + "epoch": 11.734078212290502, + "grad_norm": 0.7685021162033081, + "learning_rate": 0.00041481792717086834, + "loss": 0.4576, + "step": 21004 + }, + { + "epoch": 11.73463687150838, + "grad_norm": 0.3957034647464752, + "learning_rate": 0.00041478991596638655, + "loss": 0.425, + "step": 21005 + }, + { + "epoch": 11.735195530726257, + "grad_norm": 0.866580605506897, + "learning_rate": 0.0004147619047619048, + "loss": 0.4591, + "step": 21006 + }, + { + "epoch": 11.735754189944133, + "grad_norm": 0.8084816336631775, + "learning_rate": 0.00041473389355742296, + "loss": 0.4333, + "step": 21007 + }, + { + "epoch": 11.736312849162012, + "grad_norm": 0.5898215770721436, + "learning_rate": 0.0004147058823529412, + "loss": 0.4038, + "step": 21008 + }, + { + "epoch": 11.736871508379888, + "grad_norm": 0.387378454208374, + "learning_rate": 0.0004146778711484594, + "loss": 0.4309, + "step": 21009 + }, + { + "epoch": 11.737430167597765, + "grad_norm": 0.38650548458099365, + "learning_rate": 0.0004146498599439776, + "loss": 0.4005, + "step": 21010 + }, + { + "epoch": 11.737988826815643, + "grad_norm": 0.5134881734848022, + "learning_rate": 0.00041462184873949584, + "loss": 0.4904, + "step": 21011 + }, + { + "epoch": 11.73854748603352, + "grad_norm": 1.4361058473587036, + "learning_rate": 0.000414593837535014, + "loss": 0.3718, + "step": 21012 + }, + { + "epoch": 11.739106145251396, + "grad_norm": 0.3456028699874878, + "learning_rate": 0.00041456582633053225, + "loss": 0.3193, + "step": 21013 + }, + { + "epoch": 11.739664804469275, + "grad_norm": 0.49402251839637756, + "learning_rate": 0.00041453781512605046, + "loss": 0.3571, + "step": 21014 + }, + { + "epoch": 11.740223463687151, + "grad_norm": 0.615706205368042, + "learning_rate": 0.0004145098039215686, + "loss": 0.6151, + "step": 21015 + }, + { + "epoch": 11.740782122905028, + "grad_norm": 0.43488016724586487, + "learning_rate": 0.00041448179271708687, + "loss": 0.4743, + "step": 21016 + }, + { + "epoch": 11.741340782122904, + "grad_norm": 0.7332260608673096, + "learning_rate": 0.000414453781512605, + "loss": 0.4763, + "step": 21017 + }, + { + "epoch": 11.741899441340783, + "grad_norm": 0.49806851148605347, + "learning_rate": 0.0004144257703081233, + "loss": 0.449, + "step": 21018 + }, + { + "epoch": 11.74245810055866, + "grad_norm": 0.622251570224762, + "learning_rate": 0.0004143977591036415, + "loss": 0.4362, + "step": 21019 + }, + { + "epoch": 11.743016759776536, + "grad_norm": 3.3908772468566895, + "learning_rate": 0.00041436974789915964, + "loss": 0.3371, + "step": 21020 + }, + { + "epoch": 11.743575418994414, + "grad_norm": 3.059241771697998, + "learning_rate": 0.0004143417366946779, + "loss": 0.4249, + "step": 21021 + }, + { + "epoch": 11.74413407821229, + "grad_norm": 2.3904757499694824, + "learning_rate": 0.0004143137254901961, + "loss": 0.4964, + "step": 21022 + }, + { + "epoch": 11.744692737430167, + "grad_norm": 0.6678722500801086, + "learning_rate": 0.0004142857142857143, + "loss": 0.3736, + "step": 21023 + }, + { + "epoch": 11.745251396648044, + "grad_norm": 0.9124069213867188, + "learning_rate": 0.0004142577030812325, + "loss": 0.4052, + "step": 21024 + }, + { + "epoch": 11.745810055865922, + "grad_norm": 2.4555346965789795, + "learning_rate": 0.00041422969187675067, + "loss": 0.3867, + "step": 21025 + }, + { + "epoch": 11.746368715083799, + "grad_norm": 0.7680983543395996, + "learning_rate": 0.00041420168067226893, + "loss": 0.4906, + "step": 21026 + }, + { + "epoch": 11.746927374301675, + "grad_norm": 0.5838503241539001, + "learning_rate": 0.00041417366946778714, + "loss": 0.4238, + "step": 21027 + }, + { + "epoch": 11.747486033519554, + "grad_norm": 0.4185757339000702, + "learning_rate": 0.00041414565826330534, + "loss": 0.4598, + "step": 21028 + }, + { + "epoch": 11.74804469273743, + "grad_norm": 0.45984694361686707, + "learning_rate": 0.00041411764705882355, + "loss": 0.4286, + "step": 21029 + }, + { + "epoch": 11.748603351955307, + "grad_norm": 0.5626301169395447, + "learning_rate": 0.00041408963585434175, + "loss": 0.42, + "step": 21030 + }, + { + "epoch": 11.749162011173185, + "grad_norm": 0.6233865022659302, + "learning_rate": 0.00041406162464985996, + "loss": 0.4049, + "step": 21031 + }, + { + "epoch": 11.749720670391062, + "grad_norm": 0.483267217874527, + "learning_rate": 0.00041403361344537817, + "loss": 0.3646, + "step": 21032 + }, + { + "epoch": 11.750279329608938, + "grad_norm": 0.4188838005065918, + "learning_rate": 0.0004140056022408963, + "loss": 0.4286, + "step": 21033 + }, + { + "epoch": 11.750837988826815, + "grad_norm": 0.43827927112579346, + "learning_rate": 0.0004139775910364146, + "loss": 0.3417, + "step": 21034 + }, + { + "epoch": 11.751396648044693, + "grad_norm": 0.5048608779907227, + "learning_rate": 0.0004139495798319328, + "loss": 0.4599, + "step": 21035 + }, + { + "epoch": 11.75195530726257, + "grad_norm": 1.21855890750885, + "learning_rate": 0.000413921568627451, + "loss": 0.4923, + "step": 21036 + }, + { + "epoch": 11.752513966480446, + "grad_norm": 0.6267884373664856, + "learning_rate": 0.0004138935574229692, + "loss": 0.4708, + "step": 21037 + }, + { + "epoch": 11.753072625698325, + "grad_norm": 0.6575756669044495, + "learning_rate": 0.0004138655462184874, + "loss": 0.3788, + "step": 21038 + }, + { + "epoch": 11.753631284916201, + "grad_norm": 0.6032079458236694, + "learning_rate": 0.0004138375350140056, + "loss": 0.4088, + "step": 21039 + }, + { + "epoch": 11.754189944134078, + "grad_norm": 0.3609331250190735, + "learning_rate": 0.0004138095238095238, + "loss": 0.4419, + "step": 21040 + }, + { + "epoch": 11.754748603351956, + "grad_norm": 0.4986215829849243, + "learning_rate": 0.000413781512605042, + "loss": 0.4019, + "step": 21041 + }, + { + "epoch": 11.755307262569833, + "grad_norm": 0.626741886138916, + "learning_rate": 0.0004137535014005602, + "loss": 0.6113, + "step": 21042 + }, + { + "epoch": 11.75586592178771, + "grad_norm": 0.9297717809677124, + "learning_rate": 0.00041372549019607843, + "loss": 0.42, + "step": 21043 + }, + { + "epoch": 11.756424581005586, + "grad_norm": 0.4717349112033844, + "learning_rate": 0.00041369747899159664, + "loss": 0.3961, + "step": 21044 + }, + { + "epoch": 11.756983240223464, + "grad_norm": 0.48344510793685913, + "learning_rate": 0.00041366946778711484, + "loss": 0.4034, + "step": 21045 + }, + { + "epoch": 11.75754189944134, + "grad_norm": 0.7169546484947205, + "learning_rate": 0.0004136414565826331, + "loss": 0.4604, + "step": 21046 + }, + { + "epoch": 11.758100558659217, + "grad_norm": 0.5718005299568176, + "learning_rate": 0.00041361344537815126, + "loss": 0.3963, + "step": 21047 + }, + { + "epoch": 11.758659217877096, + "grad_norm": 0.4658730626106262, + "learning_rate": 0.00041358543417366946, + "loss": 0.4585, + "step": 21048 + }, + { + "epoch": 11.759217877094972, + "grad_norm": 0.47685444355010986, + "learning_rate": 0.00041355742296918767, + "loss": 0.3827, + "step": 21049 + }, + { + "epoch": 11.759776536312849, + "grad_norm": 0.47769901156425476, + "learning_rate": 0.0004135294117647059, + "loss": 0.4263, + "step": 21050 + }, + { + "epoch": 11.760335195530725, + "grad_norm": 0.4674915373325348, + "learning_rate": 0.00041350140056022413, + "loss": 0.3213, + "step": 21051 + }, + { + "epoch": 11.760893854748604, + "grad_norm": 0.7401136755943298, + "learning_rate": 0.0004134733893557423, + "loss": 0.3675, + "step": 21052 + }, + { + "epoch": 11.76145251396648, + "grad_norm": 0.5001193881034851, + "learning_rate": 0.0004134453781512605, + "loss": 0.4973, + "step": 21053 + }, + { + "epoch": 11.762011173184357, + "grad_norm": 0.4197474420070648, + "learning_rate": 0.00041341736694677875, + "loss": 0.3978, + "step": 21054 + }, + { + "epoch": 11.762569832402235, + "grad_norm": 0.6338546276092529, + "learning_rate": 0.0004133893557422969, + "loss": 0.4277, + "step": 21055 + }, + { + "epoch": 11.763128491620112, + "grad_norm": 0.5549630522727966, + "learning_rate": 0.00041336134453781516, + "loss": 0.3819, + "step": 21056 + }, + { + "epoch": 11.763687150837988, + "grad_norm": 0.35421422123908997, + "learning_rate": 0.0004133333333333333, + "loss": 0.4265, + "step": 21057 + }, + { + "epoch": 11.764245810055867, + "grad_norm": 0.637296736240387, + "learning_rate": 0.0004133053221288515, + "loss": 0.4988, + "step": 21058 + }, + { + "epoch": 11.764804469273743, + "grad_norm": 0.7246748805046082, + "learning_rate": 0.0004132773109243698, + "loss": 0.4287, + "step": 21059 + }, + { + "epoch": 11.76536312849162, + "grad_norm": 0.7421596050262451, + "learning_rate": 0.00041324929971988793, + "loss": 0.4299, + "step": 21060 + }, + { + "epoch": 11.765921787709498, + "grad_norm": 0.4843403398990631, + "learning_rate": 0.0004132212885154062, + "loss": 0.4074, + "step": 21061 + }, + { + "epoch": 11.766480446927375, + "grad_norm": 0.4219281077384949, + "learning_rate": 0.0004131932773109244, + "loss": 0.3862, + "step": 21062 + }, + { + "epoch": 11.767039106145251, + "grad_norm": 0.40698567032814026, + "learning_rate": 0.00041316526610644255, + "loss": 0.3779, + "step": 21063 + }, + { + "epoch": 11.767597765363128, + "grad_norm": 0.3869565427303314, + "learning_rate": 0.0004131372549019608, + "loss": 0.4651, + "step": 21064 + }, + { + "epoch": 11.768156424581006, + "grad_norm": 0.49842697381973267, + "learning_rate": 0.00041310924369747896, + "loss": 0.4472, + "step": 21065 + }, + { + "epoch": 11.768715083798883, + "grad_norm": 0.7378855347633362, + "learning_rate": 0.0004130812324929972, + "loss": 0.6714, + "step": 21066 + }, + { + "epoch": 11.76927374301676, + "grad_norm": 0.5357637405395508, + "learning_rate": 0.00041305322128851543, + "loss": 0.3991, + "step": 21067 + }, + { + "epoch": 11.769832402234638, + "grad_norm": 2.3240857124328613, + "learning_rate": 0.0004130252100840336, + "loss": 0.4187, + "step": 21068 + }, + { + "epoch": 11.770391061452514, + "grad_norm": 0.42079317569732666, + "learning_rate": 0.00041299719887955184, + "loss": 0.3602, + "step": 21069 + }, + { + "epoch": 11.77094972067039, + "grad_norm": 0.4476144313812256, + "learning_rate": 0.00041296918767507005, + "loss": 0.3387, + "step": 21070 + }, + { + "epoch": 11.771508379888267, + "grad_norm": 0.5272865295410156, + "learning_rate": 0.00041294117647058825, + "loss": 0.4694, + "step": 21071 + }, + { + "epoch": 11.772067039106146, + "grad_norm": 0.48232021927833557, + "learning_rate": 0.00041291316526610646, + "loss": 0.3466, + "step": 21072 + }, + { + "epoch": 11.772625698324022, + "grad_norm": 0.8820331692695618, + "learning_rate": 0.0004128851540616246, + "loss": 0.4046, + "step": 21073 + }, + { + "epoch": 11.773184357541899, + "grad_norm": 0.5438117980957031, + "learning_rate": 0.00041285714285714287, + "loss": 0.4588, + "step": 21074 + }, + { + "epoch": 11.773743016759777, + "grad_norm": 0.38747113943099976, + "learning_rate": 0.0004128291316526611, + "loss": 0.44, + "step": 21075 + }, + { + "epoch": 11.774301675977654, + "grad_norm": 4.289586544036865, + "learning_rate": 0.0004128011204481793, + "loss": 0.5024, + "step": 21076 + }, + { + "epoch": 11.77486033519553, + "grad_norm": 0.5946493744850159, + "learning_rate": 0.0004127731092436975, + "loss": 0.3977, + "step": 21077 + }, + { + "epoch": 11.775418994413407, + "grad_norm": 2.347083330154419, + "learning_rate": 0.0004127450980392157, + "loss": 0.4292, + "step": 21078 + }, + { + "epoch": 11.775977653631285, + "grad_norm": 0.6738131046295166, + "learning_rate": 0.0004127170868347339, + "loss": 0.5922, + "step": 21079 + }, + { + "epoch": 11.776536312849162, + "grad_norm": 0.3936261534690857, + "learning_rate": 0.0004126890756302521, + "loss": 0.3009, + "step": 21080 + }, + { + "epoch": 11.777094972067038, + "grad_norm": 0.854155421257019, + "learning_rate": 0.0004126610644257703, + "loss": 0.3567, + "step": 21081 + }, + { + "epoch": 11.777653631284917, + "grad_norm": 0.4538637101650238, + "learning_rate": 0.0004126330532212885, + "loss": 0.3475, + "step": 21082 + }, + { + "epoch": 11.778212290502793, + "grad_norm": 0.5938655734062195, + "learning_rate": 0.0004126050420168067, + "loss": 0.3873, + "step": 21083 + }, + { + "epoch": 11.77877094972067, + "grad_norm": 0.5250483155250549, + "learning_rate": 0.00041257703081232493, + "loss": 0.4788, + "step": 21084 + }, + { + "epoch": 11.779329608938548, + "grad_norm": 1.1067556142807007, + "learning_rate": 0.00041254901960784314, + "loss": 0.5936, + "step": 21085 + }, + { + "epoch": 11.779888268156425, + "grad_norm": 0.4631839692592621, + "learning_rate": 0.0004125210084033614, + "loss": 0.4123, + "step": 21086 + }, + { + "epoch": 11.780446927374301, + "grad_norm": 0.3276774287223816, + "learning_rate": 0.00041249299719887955, + "loss": 0.3682, + "step": 21087 + }, + { + "epoch": 11.78100558659218, + "grad_norm": 0.4596749246120453, + "learning_rate": 0.00041246498599439776, + "loss": 0.3901, + "step": 21088 + }, + { + "epoch": 11.781564245810056, + "grad_norm": 0.5934166312217712, + "learning_rate": 0.00041243697478991596, + "loss": 0.454, + "step": 21089 + }, + { + "epoch": 11.782122905027933, + "grad_norm": 0.5500884652137756, + "learning_rate": 0.00041240896358543417, + "loss": 0.4513, + "step": 21090 + }, + { + "epoch": 11.78268156424581, + "grad_norm": 0.7103753685951233, + "learning_rate": 0.00041238095238095243, + "loss": 0.3383, + "step": 21091 + }, + { + "epoch": 11.783240223463688, + "grad_norm": 0.6784194111824036, + "learning_rate": 0.0004123529411764706, + "loss": 0.3965, + "step": 21092 + }, + { + "epoch": 11.783798882681564, + "grad_norm": 0.5971254706382751, + "learning_rate": 0.0004123249299719888, + "loss": 0.68, + "step": 21093 + }, + { + "epoch": 11.78435754189944, + "grad_norm": 0.5012035369873047, + "learning_rate": 0.00041229691876750705, + "loss": 0.5393, + "step": 21094 + }, + { + "epoch": 11.78491620111732, + "grad_norm": 0.3639828562736511, + "learning_rate": 0.0004122689075630252, + "loss": 0.3184, + "step": 21095 + }, + { + "epoch": 11.785474860335196, + "grad_norm": 0.4151918292045593, + "learning_rate": 0.00041224089635854346, + "loss": 0.3936, + "step": 21096 + }, + { + "epoch": 11.786033519553072, + "grad_norm": 0.6073093414306641, + "learning_rate": 0.0004122128851540616, + "loss": 0.4088, + "step": 21097 + }, + { + "epoch": 11.786592178770949, + "grad_norm": 0.9797057509422302, + "learning_rate": 0.0004121848739495798, + "loss": 0.3837, + "step": 21098 + }, + { + "epoch": 11.787150837988827, + "grad_norm": 0.3993837237358093, + "learning_rate": 0.0004121568627450981, + "loss": 0.405, + "step": 21099 + }, + { + "epoch": 11.787709497206704, + "grad_norm": 1.075048804283142, + "learning_rate": 0.00041212885154061623, + "loss": 0.5063, + "step": 21100 + }, + { + "epoch": 11.78826815642458, + "grad_norm": 0.6167490482330322, + "learning_rate": 0.0004121008403361345, + "loss": 0.5004, + "step": 21101 + }, + { + "epoch": 11.788826815642459, + "grad_norm": 0.38762179017066956, + "learning_rate": 0.0004120728291316527, + "loss": 0.3749, + "step": 21102 + }, + { + "epoch": 11.789385474860335, + "grad_norm": 0.832881510257721, + "learning_rate": 0.00041204481792717085, + "loss": 0.3566, + "step": 21103 + }, + { + "epoch": 11.789944134078212, + "grad_norm": 0.5114133358001709, + "learning_rate": 0.0004120168067226891, + "loss": 0.4031, + "step": 21104 + }, + { + "epoch": 11.79050279329609, + "grad_norm": 0.47123122215270996, + "learning_rate": 0.00041198879551820726, + "loss": 0.5073, + "step": 21105 + }, + { + "epoch": 11.791061452513967, + "grad_norm": 0.4798649549484253, + "learning_rate": 0.0004119607843137255, + "loss": 0.5166, + "step": 21106 + }, + { + "epoch": 11.791620111731843, + "grad_norm": 0.3645530343055725, + "learning_rate": 0.0004119327731092437, + "loss": 0.3972, + "step": 21107 + }, + { + "epoch": 11.79217877094972, + "grad_norm": 0.8375932574272156, + "learning_rate": 0.0004119047619047619, + "loss": 0.5054, + "step": 21108 + }, + { + "epoch": 11.792737430167598, + "grad_norm": 0.41399019956588745, + "learning_rate": 0.00041187675070028014, + "loss": 0.4554, + "step": 21109 + }, + { + "epoch": 11.793296089385475, + "grad_norm": 0.3531157076358795, + "learning_rate": 0.00041184873949579834, + "loss": 0.3153, + "step": 21110 + }, + { + "epoch": 11.793854748603351, + "grad_norm": 0.4534890651702881, + "learning_rate": 0.00041182072829131655, + "loss": 0.6091, + "step": 21111 + }, + { + "epoch": 11.79441340782123, + "grad_norm": 0.47167131304740906, + "learning_rate": 0.00041179271708683475, + "loss": 0.5476, + "step": 21112 + }, + { + "epoch": 11.794972067039106, + "grad_norm": 0.553837239742279, + "learning_rate": 0.0004117647058823529, + "loss": 0.5244, + "step": 21113 + }, + { + "epoch": 11.795530726256983, + "grad_norm": 0.5114364624023438, + "learning_rate": 0.00041173669467787117, + "loss": 0.3889, + "step": 21114 + }, + { + "epoch": 11.796089385474861, + "grad_norm": 5.881773471832275, + "learning_rate": 0.00041170868347338937, + "loss": 0.4724, + "step": 21115 + }, + { + "epoch": 11.796648044692738, + "grad_norm": 0.4173142910003662, + "learning_rate": 0.0004116806722689076, + "loss": 0.3378, + "step": 21116 + }, + { + "epoch": 11.797206703910614, + "grad_norm": 0.4318006634712219, + "learning_rate": 0.0004116526610644258, + "loss": 0.448, + "step": 21117 + }, + { + "epoch": 11.797765363128491, + "grad_norm": 1.1033917665481567, + "learning_rate": 0.000411624649859944, + "loss": 0.3458, + "step": 21118 + }, + { + "epoch": 11.79832402234637, + "grad_norm": 0.3786972165107727, + "learning_rate": 0.0004115966386554622, + "loss": 0.4512, + "step": 21119 + }, + { + "epoch": 11.798882681564246, + "grad_norm": 0.3894638419151306, + "learning_rate": 0.0004115686274509804, + "loss": 0.3944, + "step": 21120 + }, + { + "epoch": 11.799441340782122, + "grad_norm": 0.4841580092906952, + "learning_rate": 0.0004115406162464986, + "loss": 0.421, + "step": 21121 + }, + { + "epoch": 11.8, + "grad_norm": 0.3173466920852661, + "learning_rate": 0.0004115126050420168, + "loss": 0.2402, + "step": 21122 + }, + { + "epoch": 11.800558659217877, + "grad_norm": 0.5621463656425476, + "learning_rate": 0.000411484593837535, + "loss": 0.4858, + "step": 21123 + }, + { + "epoch": 11.801117318435754, + "grad_norm": 0.5451611876487732, + "learning_rate": 0.0004114565826330532, + "loss": 0.3647, + "step": 21124 + }, + { + "epoch": 11.80167597765363, + "grad_norm": 0.4159115254878998, + "learning_rate": 0.00041142857142857143, + "loss": 0.3247, + "step": 21125 + }, + { + "epoch": 11.802234636871509, + "grad_norm": 0.8276838660240173, + "learning_rate": 0.0004114005602240897, + "loss": 0.5077, + "step": 21126 + }, + { + "epoch": 11.802793296089385, + "grad_norm": 0.45653706789016724, + "learning_rate": 0.00041137254901960784, + "loss": 0.3926, + "step": 21127 + }, + { + "epoch": 11.803351955307262, + "grad_norm": 0.9907598495483398, + "learning_rate": 0.00041134453781512605, + "loss": 0.4088, + "step": 21128 + }, + { + "epoch": 11.80391061452514, + "grad_norm": 0.9811581373214722, + "learning_rate": 0.00041131652661064426, + "loss": 0.3878, + "step": 21129 + }, + { + "epoch": 11.804469273743017, + "grad_norm": 0.5454853773117065, + "learning_rate": 0.00041128851540616246, + "loss": 0.401, + "step": 21130 + }, + { + "epoch": 11.805027932960893, + "grad_norm": 0.6509480476379395, + "learning_rate": 0.0004112605042016807, + "loss": 0.4322, + "step": 21131 + }, + { + "epoch": 11.805586592178772, + "grad_norm": 0.4455661475658417, + "learning_rate": 0.0004112324929971989, + "loss": 0.414, + "step": 21132 + }, + { + "epoch": 11.806145251396648, + "grad_norm": 0.8075137734413147, + "learning_rate": 0.0004112044817927171, + "loss": 0.4282, + "step": 21133 + }, + { + "epoch": 11.806703910614525, + "grad_norm": 1.9101922512054443, + "learning_rate": 0.00041117647058823534, + "loss": 0.4059, + "step": 21134 + }, + { + "epoch": 11.807262569832401, + "grad_norm": 1.0270003080368042, + "learning_rate": 0.0004111484593837535, + "loss": 0.336, + "step": 21135 + }, + { + "epoch": 11.80782122905028, + "grad_norm": 0.5320864319801331, + "learning_rate": 0.00041112044817927175, + "loss": 0.5753, + "step": 21136 + }, + { + "epoch": 11.808379888268156, + "grad_norm": 0.4468209445476532, + "learning_rate": 0.0004110924369747899, + "loss": 0.397, + "step": 21137 + }, + { + "epoch": 11.808938547486033, + "grad_norm": 0.5232101678848267, + "learning_rate": 0.0004110644257703081, + "loss": 0.4884, + "step": 21138 + }, + { + "epoch": 11.809497206703911, + "grad_norm": 0.6122629046440125, + "learning_rate": 0.00041103641456582637, + "loss": 0.4768, + "step": 21139 + }, + { + "epoch": 11.810055865921788, + "grad_norm": 0.5297895669937134, + "learning_rate": 0.0004110084033613445, + "loss": 0.4466, + "step": 21140 + }, + { + "epoch": 11.810614525139664, + "grad_norm": 0.7453494071960449, + "learning_rate": 0.0004109803921568628, + "loss": 0.6733, + "step": 21141 + }, + { + "epoch": 11.811173184357543, + "grad_norm": 0.488770991563797, + "learning_rate": 0.000410952380952381, + "loss": 0.3585, + "step": 21142 + }, + { + "epoch": 11.81173184357542, + "grad_norm": 0.5269141793251038, + "learning_rate": 0.00041092436974789914, + "loss": 0.4053, + "step": 21143 + }, + { + "epoch": 11.812290502793296, + "grad_norm": 0.6079568266868591, + "learning_rate": 0.0004108963585434174, + "loss": 0.4506, + "step": 21144 + }, + { + "epoch": 11.812849162011172, + "grad_norm": 0.7228178381919861, + "learning_rate": 0.00041086834733893555, + "loss": 0.4481, + "step": 21145 + }, + { + "epoch": 11.81340782122905, + "grad_norm": 0.5010695457458496, + "learning_rate": 0.00041084033613445376, + "loss": 0.3148, + "step": 21146 + }, + { + "epoch": 11.813966480446927, + "grad_norm": 0.43662238121032715, + "learning_rate": 0.000410812324929972, + "loss": 0.3595, + "step": 21147 + }, + { + "epoch": 11.814525139664804, + "grad_norm": 4.003963947296143, + "learning_rate": 0.00041078431372549017, + "loss": 0.3745, + "step": 21148 + }, + { + "epoch": 11.815083798882682, + "grad_norm": 0.37572428584098816, + "learning_rate": 0.00041075630252100843, + "loss": 0.4348, + "step": 21149 + }, + { + "epoch": 11.815642458100559, + "grad_norm": 0.5043729543685913, + "learning_rate": 0.00041072829131652664, + "loss": 0.5413, + "step": 21150 + }, + { + "epoch": 11.816201117318435, + "grad_norm": 0.8976406455039978, + "learning_rate": 0.0004107002801120448, + "loss": 0.4063, + "step": 21151 + }, + { + "epoch": 11.816759776536312, + "grad_norm": 0.3893214464187622, + "learning_rate": 0.00041067226890756305, + "loss": 0.37, + "step": 21152 + }, + { + "epoch": 11.81731843575419, + "grad_norm": 0.7534773945808411, + "learning_rate": 0.0004106442577030812, + "loss": 0.5862, + "step": 21153 + }, + { + "epoch": 11.817877094972067, + "grad_norm": 0.48971685767173767, + "learning_rate": 0.00041061624649859946, + "loss": 0.5326, + "step": 21154 + }, + { + "epoch": 11.818435754189943, + "grad_norm": 0.8797467947006226, + "learning_rate": 0.00041058823529411767, + "loss": 0.5168, + "step": 21155 + }, + { + "epoch": 11.818994413407822, + "grad_norm": 0.5101203918457031, + "learning_rate": 0.0004105602240896358, + "loss": 0.4865, + "step": 21156 + }, + { + "epoch": 11.819553072625698, + "grad_norm": 0.9895678758621216, + "learning_rate": 0.0004105322128851541, + "loss": 0.7454, + "step": 21157 + }, + { + "epoch": 11.820111731843575, + "grad_norm": 1.2504035234451294, + "learning_rate": 0.0004105042016806723, + "loss": 0.4513, + "step": 21158 + }, + { + "epoch": 11.820670391061453, + "grad_norm": 0.8474510312080383, + "learning_rate": 0.0004104761904761905, + "loss": 0.4064, + "step": 21159 + }, + { + "epoch": 11.82122905027933, + "grad_norm": 0.44312119483947754, + "learning_rate": 0.0004104481792717087, + "loss": 0.4153, + "step": 21160 + }, + { + "epoch": 11.821787709497206, + "grad_norm": 0.6110444664955139, + "learning_rate": 0.00041042016806722685, + "loss": 0.4067, + "step": 21161 + }, + { + "epoch": 11.822346368715085, + "grad_norm": 0.4869738221168518, + "learning_rate": 0.0004103921568627451, + "loss": 0.453, + "step": 21162 + }, + { + "epoch": 11.822905027932961, + "grad_norm": 0.8458188772201538, + "learning_rate": 0.0004103641456582633, + "loss": 0.4881, + "step": 21163 + }, + { + "epoch": 11.823463687150838, + "grad_norm": 1.248897910118103, + "learning_rate": 0.0004103361344537815, + "loss": 0.3721, + "step": 21164 + }, + { + "epoch": 11.824022346368714, + "grad_norm": 0.9057021737098694, + "learning_rate": 0.0004103081232492997, + "loss": 0.245, + "step": 21165 + }, + { + "epoch": 11.824581005586593, + "grad_norm": 0.45632514357566833, + "learning_rate": 0.00041028011204481793, + "loss": 0.3486, + "step": 21166 + }, + { + "epoch": 11.82513966480447, + "grad_norm": 0.6380709409713745, + "learning_rate": 0.00041025210084033614, + "loss": 0.4601, + "step": 21167 + }, + { + "epoch": 11.825698324022346, + "grad_norm": 1.0564601421356201, + "learning_rate": 0.00041022408963585434, + "loss": 0.3373, + "step": 21168 + }, + { + "epoch": 11.826256983240224, + "grad_norm": 0.43833258748054504, + "learning_rate": 0.0004101960784313726, + "loss": 0.3901, + "step": 21169 + }, + { + "epoch": 11.8268156424581, + "grad_norm": 0.38899490237236023, + "learning_rate": 0.00041016806722689076, + "loss": 0.4047, + "step": 21170 + }, + { + "epoch": 11.827374301675977, + "grad_norm": 0.4540770351886749, + "learning_rate": 0.00041014005602240896, + "loss": 0.3086, + "step": 21171 + }, + { + "epoch": 11.827932960893854, + "grad_norm": 1.2066417932510376, + "learning_rate": 0.00041011204481792717, + "loss": 0.4631, + "step": 21172 + }, + { + "epoch": 11.828491620111732, + "grad_norm": 0.9145241975784302, + "learning_rate": 0.0004100840336134454, + "loss": 0.3756, + "step": 21173 + }, + { + "epoch": 11.829050279329609, + "grad_norm": 0.7616272568702698, + "learning_rate": 0.00041005602240896363, + "loss": 0.3951, + "step": 21174 + }, + { + "epoch": 11.829608938547485, + "grad_norm": 2.4852514266967773, + "learning_rate": 0.0004100280112044818, + "loss": 0.4663, + "step": 21175 + }, + { + "epoch": 11.830167597765364, + "grad_norm": 0.3470335006713867, + "learning_rate": 0.00041, + "loss": 0.3709, + "step": 21176 + }, + { + "epoch": 11.83072625698324, + "grad_norm": 0.4376351237297058, + "learning_rate": 0.00040997198879551825, + "loss": 0.4154, + "step": 21177 + }, + { + "epoch": 11.831284916201117, + "grad_norm": 0.5533244609832764, + "learning_rate": 0.0004099439775910364, + "loss": 0.5036, + "step": 21178 + }, + { + "epoch": 11.831843575418995, + "grad_norm": 0.47184041142463684, + "learning_rate": 0.00040991596638655466, + "loss": 0.4714, + "step": 21179 + }, + { + "epoch": 11.832402234636872, + "grad_norm": 0.64646315574646, + "learning_rate": 0.0004098879551820728, + "loss": 0.5599, + "step": 21180 + }, + { + "epoch": 11.832960893854748, + "grad_norm": 0.9532169699668884, + "learning_rate": 0.000409859943977591, + "loss": 0.4352, + "step": 21181 + }, + { + "epoch": 11.833519553072625, + "grad_norm": 0.4523061513900757, + "learning_rate": 0.0004098319327731093, + "loss": 0.3722, + "step": 21182 + }, + { + "epoch": 11.834078212290503, + "grad_norm": 0.7187719941139221, + "learning_rate": 0.00040980392156862743, + "loss": 0.3737, + "step": 21183 + }, + { + "epoch": 11.83463687150838, + "grad_norm": 0.5003236532211304, + "learning_rate": 0.0004097759103641457, + "loss": 0.5246, + "step": 21184 + }, + { + "epoch": 11.835195530726256, + "grad_norm": 0.6886492371559143, + "learning_rate": 0.0004097478991596639, + "loss": 0.5718, + "step": 21185 + }, + { + "epoch": 11.835754189944135, + "grad_norm": 0.39473968744277954, + "learning_rate": 0.00040971988795518205, + "loss": 0.3909, + "step": 21186 + }, + { + "epoch": 11.836312849162011, + "grad_norm": 0.4980083405971527, + "learning_rate": 0.0004096918767507003, + "loss": 0.3819, + "step": 21187 + }, + { + "epoch": 11.836871508379888, + "grad_norm": 0.4633629620075226, + "learning_rate": 0.00040966386554621846, + "loss": 0.3969, + "step": 21188 + }, + { + "epoch": 11.837430167597766, + "grad_norm": 0.425487220287323, + "learning_rate": 0.0004096358543417367, + "loss": 0.39, + "step": 21189 + }, + { + "epoch": 11.837988826815643, + "grad_norm": 6.8655829429626465, + "learning_rate": 0.00040960784313725493, + "loss": 0.5207, + "step": 21190 + }, + { + "epoch": 11.83854748603352, + "grad_norm": 0.4440138041973114, + "learning_rate": 0.0004095798319327731, + "loss": 0.4067, + "step": 21191 + }, + { + "epoch": 11.839106145251396, + "grad_norm": 0.5871977210044861, + "learning_rate": 0.00040955182072829134, + "loss": 0.5089, + "step": 21192 + }, + { + "epoch": 11.839664804469274, + "grad_norm": 0.3644394874572754, + "learning_rate": 0.00040952380952380955, + "loss": 0.3505, + "step": 21193 + }, + { + "epoch": 11.84022346368715, + "grad_norm": 0.6120206117630005, + "learning_rate": 0.00040949579831932775, + "loss": 0.4932, + "step": 21194 + }, + { + "epoch": 11.840782122905027, + "grad_norm": 0.6395864486694336, + "learning_rate": 0.00040946778711484596, + "loss": 0.5225, + "step": 21195 + }, + { + "epoch": 11.841340782122906, + "grad_norm": 0.5474733710289001, + "learning_rate": 0.0004094397759103641, + "loss": 0.3859, + "step": 21196 + }, + { + "epoch": 11.841899441340782, + "grad_norm": 0.6462367177009583, + "learning_rate": 0.00040941176470588237, + "loss": 0.5595, + "step": 21197 + }, + { + "epoch": 11.842458100558659, + "grad_norm": 0.6597458720207214, + "learning_rate": 0.0004093837535014006, + "loss": 0.3896, + "step": 21198 + }, + { + "epoch": 11.843016759776535, + "grad_norm": 0.7283481359481812, + "learning_rate": 0.0004093557422969188, + "loss": 0.4628, + "step": 21199 + }, + { + "epoch": 11.843575418994414, + "grad_norm": 0.5071769952774048, + "learning_rate": 0.000409327731092437, + "loss": 0.4068, + "step": 21200 + }, + { + "epoch": 11.84413407821229, + "grad_norm": 0.7466905117034912, + "learning_rate": 0.0004092997198879552, + "loss": 0.528, + "step": 21201 + }, + { + "epoch": 11.844692737430167, + "grad_norm": 0.5412207245826721, + "learning_rate": 0.0004092717086834734, + "loss": 0.4866, + "step": 21202 + }, + { + "epoch": 11.845251396648045, + "grad_norm": 0.445582777261734, + "learning_rate": 0.0004092436974789916, + "loss": 0.3945, + "step": 21203 + }, + { + "epoch": 11.845810055865922, + "grad_norm": 0.7843878269195557, + "learning_rate": 0.0004092156862745098, + "loss": 0.4853, + "step": 21204 + }, + { + "epoch": 11.846368715083798, + "grad_norm": 0.7705541849136353, + "learning_rate": 0.000409187675070028, + "loss": 0.383, + "step": 21205 + }, + { + "epoch": 11.846927374301677, + "grad_norm": 0.700949490070343, + "learning_rate": 0.0004091596638655462, + "loss": 0.3974, + "step": 21206 + }, + { + "epoch": 11.847486033519553, + "grad_norm": 1.0537673234939575, + "learning_rate": 0.00040913165266106443, + "loss": 0.4724, + "step": 21207 + }, + { + "epoch": 11.84804469273743, + "grad_norm": 0.6811145544052124, + "learning_rate": 0.00040910364145658264, + "loss": 0.5616, + "step": 21208 + }, + { + "epoch": 11.848603351955306, + "grad_norm": 0.5758842825889587, + "learning_rate": 0.0004090756302521009, + "loss": 0.4885, + "step": 21209 + }, + { + "epoch": 11.849162011173185, + "grad_norm": 0.5112441778182983, + "learning_rate": 0.00040904761904761905, + "loss": 0.5056, + "step": 21210 + }, + { + "epoch": 11.849720670391061, + "grad_norm": 0.3904726803302765, + "learning_rate": 0.00040901960784313726, + "loss": 0.4036, + "step": 21211 + }, + { + "epoch": 11.850279329608938, + "grad_norm": 1.1251496076583862, + "learning_rate": 0.00040899159663865546, + "loss": 0.5042, + "step": 21212 + }, + { + "epoch": 11.850837988826816, + "grad_norm": 0.48130232095718384, + "learning_rate": 0.00040896358543417367, + "loss": 0.4126, + "step": 21213 + }, + { + "epoch": 11.851396648044693, + "grad_norm": 0.6811621785163879, + "learning_rate": 0.00040893557422969193, + "loss": 0.4029, + "step": 21214 + }, + { + "epoch": 11.85195530726257, + "grad_norm": 0.46350258588790894, + "learning_rate": 0.0004089075630252101, + "loss": 0.3203, + "step": 21215 + }, + { + "epoch": 11.852513966480448, + "grad_norm": 0.5114133954048157, + "learning_rate": 0.0004088795518207283, + "loss": 0.3945, + "step": 21216 + }, + { + "epoch": 11.853072625698324, + "grad_norm": 0.49909162521362305, + "learning_rate": 0.00040885154061624655, + "loss": 0.5361, + "step": 21217 + }, + { + "epoch": 11.8536312849162, + "grad_norm": 0.3986165225505829, + "learning_rate": 0.0004088235294117647, + "loss": 0.4049, + "step": 21218 + }, + { + "epoch": 11.854189944134077, + "grad_norm": 0.8127726912498474, + "learning_rate": 0.00040879551820728296, + "loss": 0.4125, + "step": 21219 + }, + { + "epoch": 11.854748603351956, + "grad_norm": 0.4818750023841858, + "learning_rate": 0.0004087675070028011, + "loss": 0.3824, + "step": 21220 + }, + { + "epoch": 11.855307262569832, + "grad_norm": 0.43652278184890747, + "learning_rate": 0.0004087394957983193, + "loss": 0.4454, + "step": 21221 + }, + { + "epoch": 11.855865921787709, + "grad_norm": 0.6294071078300476, + "learning_rate": 0.0004087114845938376, + "loss": 0.5113, + "step": 21222 + }, + { + "epoch": 11.856424581005587, + "grad_norm": 0.7095273733139038, + "learning_rate": 0.00040868347338935573, + "loss": 0.3583, + "step": 21223 + }, + { + "epoch": 11.856983240223464, + "grad_norm": 0.453199565410614, + "learning_rate": 0.000408655462184874, + "loss": 0.3652, + "step": 21224 + }, + { + "epoch": 11.85754189944134, + "grad_norm": 0.64100182056427, + "learning_rate": 0.0004086274509803922, + "loss": 0.3838, + "step": 21225 + }, + { + "epoch": 11.858100558659217, + "grad_norm": 0.42354270815849304, + "learning_rate": 0.00040859943977591035, + "loss": 0.4516, + "step": 21226 + }, + { + "epoch": 11.858659217877095, + "grad_norm": 0.3842369019985199, + "learning_rate": 0.0004085714285714286, + "loss": 0.3885, + "step": 21227 + }, + { + "epoch": 11.859217877094972, + "grad_norm": 0.6809428930282593, + "learning_rate": 0.00040854341736694676, + "loss": 0.4357, + "step": 21228 + }, + { + "epoch": 11.859776536312848, + "grad_norm": 0.5415769219398499, + "learning_rate": 0.000408515406162465, + "loss": 0.5023, + "step": 21229 + }, + { + "epoch": 11.860335195530727, + "grad_norm": 0.3625841438770294, + "learning_rate": 0.0004084873949579832, + "loss": 0.329, + "step": 21230 + }, + { + "epoch": 11.860893854748603, + "grad_norm": 0.5092974305152893, + "learning_rate": 0.0004084593837535014, + "loss": 0.4724, + "step": 21231 + }, + { + "epoch": 11.86145251396648, + "grad_norm": 0.6173868179321289, + "learning_rate": 0.00040843137254901964, + "loss": 0.5882, + "step": 21232 + }, + { + "epoch": 11.862011173184358, + "grad_norm": 0.331398069858551, + "learning_rate": 0.00040840336134453784, + "loss": 0.3486, + "step": 21233 + }, + { + "epoch": 11.862569832402235, + "grad_norm": 0.7325568199157715, + "learning_rate": 0.00040837535014005605, + "loss": 0.466, + "step": 21234 + }, + { + "epoch": 11.863128491620111, + "grad_norm": 0.318058580160141, + "learning_rate": 0.00040834733893557425, + "loss": 0.2464, + "step": 21235 + }, + { + "epoch": 11.86368715083799, + "grad_norm": 0.7323331832885742, + "learning_rate": 0.0004083193277310924, + "loss": 0.4964, + "step": 21236 + }, + { + "epoch": 11.864245810055866, + "grad_norm": 0.5039665102958679, + "learning_rate": 0.00040829131652661067, + "loss": 0.3552, + "step": 21237 + }, + { + "epoch": 11.864804469273743, + "grad_norm": 1.4152615070343018, + "learning_rate": 0.00040826330532212887, + "loss": 0.4769, + "step": 21238 + }, + { + "epoch": 11.86536312849162, + "grad_norm": 0.324214369058609, + "learning_rate": 0.0004082352941176471, + "loss": 0.3355, + "step": 21239 + }, + { + "epoch": 11.865921787709498, + "grad_norm": 2.453305721282959, + "learning_rate": 0.0004082072829131653, + "loss": 0.4643, + "step": 21240 + }, + { + "epoch": 11.866480446927374, + "grad_norm": 0.7970339059829712, + "learning_rate": 0.0004081792717086835, + "loss": 0.6719, + "step": 21241 + }, + { + "epoch": 11.867039106145251, + "grad_norm": 0.7285621166229248, + "learning_rate": 0.0004081512605042017, + "loss": 0.3942, + "step": 21242 + }, + { + "epoch": 11.86759776536313, + "grad_norm": 0.8324102759361267, + "learning_rate": 0.0004081232492997199, + "loss": 0.535, + "step": 21243 + }, + { + "epoch": 11.868156424581006, + "grad_norm": 0.7698560953140259, + "learning_rate": 0.0004080952380952381, + "loss": 0.5079, + "step": 21244 + }, + { + "epoch": 11.868715083798882, + "grad_norm": 1.0058929920196533, + "learning_rate": 0.0004080672268907563, + "loss": 0.323, + "step": 21245 + }, + { + "epoch": 11.869273743016759, + "grad_norm": 0.5254631638526917, + "learning_rate": 0.0004080392156862745, + "loss": 0.3405, + "step": 21246 + }, + { + "epoch": 11.869832402234637, + "grad_norm": 0.41968396306037903, + "learning_rate": 0.0004080112044817927, + "loss": 0.4673, + "step": 21247 + }, + { + "epoch": 11.870391061452514, + "grad_norm": 0.4429382383823395, + "learning_rate": 0.00040798319327731093, + "loss": 0.3835, + "step": 21248 + }, + { + "epoch": 11.87094972067039, + "grad_norm": 0.4723098576068878, + "learning_rate": 0.0004079551820728292, + "loss": 0.3512, + "step": 21249 + }, + { + "epoch": 11.871508379888269, + "grad_norm": 0.5303624272346497, + "learning_rate": 0.00040792717086834734, + "loss": 0.4501, + "step": 21250 + }, + { + "epoch": 11.872067039106145, + "grad_norm": 1.0976248979568481, + "learning_rate": 0.00040789915966386555, + "loss": 0.5495, + "step": 21251 + }, + { + "epoch": 11.872625698324022, + "grad_norm": 0.831189751625061, + "learning_rate": 0.00040787114845938376, + "loss": 0.4487, + "step": 21252 + }, + { + "epoch": 11.8731843575419, + "grad_norm": 0.47820669412612915, + "learning_rate": 0.00040784313725490196, + "loss": 0.4165, + "step": 21253 + }, + { + "epoch": 11.873743016759777, + "grad_norm": 0.3727686107158661, + "learning_rate": 0.0004078151260504202, + "loss": 0.4361, + "step": 21254 + }, + { + "epoch": 11.874301675977653, + "grad_norm": 3.9249637126922607, + "learning_rate": 0.0004077871148459384, + "loss": 0.3182, + "step": 21255 + }, + { + "epoch": 11.87486033519553, + "grad_norm": 0.540917158126831, + "learning_rate": 0.0004077591036414566, + "loss": 0.3422, + "step": 21256 + }, + { + "epoch": 11.875418994413408, + "grad_norm": 0.6028260588645935, + "learning_rate": 0.00040773109243697484, + "loss": 0.4084, + "step": 21257 + }, + { + "epoch": 11.875977653631285, + "grad_norm": 0.842046856880188, + "learning_rate": 0.000407703081232493, + "loss": 0.5078, + "step": 21258 + }, + { + "epoch": 11.876536312849161, + "grad_norm": 0.5099661946296692, + "learning_rate": 0.0004076750700280112, + "loss": 0.3774, + "step": 21259 + }, + { + "epoch": 11.87709497206704, + "grad_norm": 0.5762701630592346, + "learning_rate": 0.0004076470588235294, + "loss": 0.4432, + "step": 21260 + }, + { + "epoch": 11.877653631284916, + "grad_norm": 1.1062768697738647, + "learning_rate": 0.0004076190476190476, + "loss": 0.4896, + "step": 21261 + }, + { + "epoch": 11.878212290502793, + "grad_norm": 0.42448118329048157, + "learning_rate": 0.00040759103641456587, + "loss": 0.3162, + "step": 21262 + }, + { + "epoch": 11.878770949720671, + "grad_norm": 0.8198091983795166, + "learning_rate": 0.000407563025210084, + "loss": 0.5053, + "step": 21263 + }, + { + "epoch": 11.879329608938548, + "grad_norm": 0.612362265586853, + "learning_rate": 0.00040753501400560223, + "loss": 0.4886, + "step": 21264 + }, + { + "epoch": 11.879888268156424, + "grad_norm": 0.33380112051963806, + "learning_rate": 0.0004075070028011205, + "loss": 0.2727, + "step": 21265 + }, + { + "epoch": 11.880446927374301, + "grad_norm": 1.6635680198669434, + "learning_rate": 0.00040747899159663864, + "loss": 0.4065, + "step": 21266 + }, + { + "epoch": 11.88100558659218, + "grad_norm": 1.0300723314285278, + "learning_rate": 0.0004074509803921569, + "loss": 0.3884, + "step": 21267 + }, + { + "epoch": 11.881564245810056, + "grad_norm": 0.5811362862586975, + "learning_rate": 0.00040742296918767505, + "loss": 0.3729, + "step": 21268 + }, + { + "epoch": 11.882122905027932, + "grad_norm": 1.3218542337417603, + "learning_rate": 0.00040739495798319326, + "loss": 0.4302, + "step": 21269 + }, + { + "epoch": 11.88268156424581, + "grad_norm": 0.4359176754951477, + "learning_rate": 0.0004073669467787115, + "loss": 0.2741, + "step": 21270 + }, + { + "epoch": 11.883240223463687, + "grad_norm": 0.5745676159858704, + "learning_rate": 0.00040733893557422967, + "loss": 0.5311, + "step": 21271 + }, + { + "epoch": 11.883798882681564, + "grad_norm": 0.4885067343711853, + "learning_rate": 0.00040731092436974793, + "loss": 0.4657, + "step": 21272 + }, + { + "epoch": 11.88435754189944, + "grad_norm": 0.3968677520751953, + "learning_rate": 0.00040728291316526614, + "loss": 0.3218, + "step": 21273 + }, + { + "epoch": 11.884916201117319, + "grad_norm": 0.5480905175209045, + "learning_rate": 0.0004072549019607843, + "loss": 0.3741, + "step": 21274 + }, + { + "epoch": 11.885474860335195, + "grad_norm": 1.2292654514312744, + "learning_rate": 0.00040722689075630255, + "loss": 0.3282, + "step": 21275 + }, + { + "epoch": 11.886033519553072, + "grad_norm": 0.8086422085762024, + "learning_rate": 0.0004071988795518207, + "loss": 0.3245, + "step": 21276 + }, + { + "epoch": 11.88659217877095, + "grad_norm": 1.569286584854126, + "learning_rate": 0.00040717086834733896, + "loss": 0.4709, + "step": 21277 + }, + { + "epoch": 11.887150837988827, + "grad_norm": 1.3830845355987549, + "learning_rate": 0.00040714285714285717, + "loss": 0.3265, + "step": 21278 + }, + { + "epoch": 11.887709497206703, + "grad_norm": 0.3987477123737335, + "learning_rate": 0.0004071148459383753, + "loss": 0.4086, + "step": 21279 + }, + { + "epoch": 11.888268156424582, + "grad_norm": 0.622778594493866, + "learning_rate": 0.0004070868347338936, + "loss": 0.51, + "step": 21280 + }, + { + "epoch": 11.888826815642458, + "grad_norm": 0.6695683598518372, + "learning_rate": 0.0004070588235294118, + "loss": 0.5908, + "step": 21281 + }, + { + "epoch": 11.889385474860335, + "grad_norm": 0.6142347455024719, + "learning_rate": 0.00040703081232493, + "loss": 0.48, + "step": 21282 + }, + { + "epoch": 11.889944134078211, + "grad_norm": 0.6269024014472961, + "learning_rate": 0.0004070028011204482, + "loss": 0.5048, + "step": 21283 + }, + { + "epoch": 11.89050279329609, + "grad_norm": 0.4405671954154968, + "learning_rate": 0.00040697478991596635, + "loss": 0.3683, + "step": 21284 + }, + { + "epoch": 11.891061452513966, + "grad_norm": 0.46253782510757446, + "learning_rate": 0.0004069467787114846, + "loss": 0.3849, + "step": 21285 + }, + { + "epoch": 11.891620111731843, + "grad_norm": 0.8965322375297546, + "learning_rate": 0.0004069187675070028, + "loss": 0.4877, + "step": 21286 + }, + { + "epoch": 11.892178770949721, + "grad_norm": 11.137365341186523, + "learning_rate": 0.000406890756302521, + "loss": 0.5581, + "step": 21287 + }, + { + "epoch": 11.892737430167598, + "grad_norm": 0.5854904651641846, + "learning_rate": 0.0004068627450980392, + "loss": 0.3538, + "step": 21288 + }, + { + "epoch": 11.893296089385474, + "grad_norm": 0.427871972322464, + "learning_rate": 0.00040683473389355743, + "loss": 0.4615, + "step": 21289 + }, + { + "epoch": 11.893854748603353, + "grad_norm": 0.40088075399398804, + "learning_rate": 0.00040680672268907564, + "loss": 0.3706, + "step": 21290 + }, + { + "epoch": 11.89441340782123, + "grad_norm": 0.6828593611717224, + "learning_rate": 0.00040677871148459384, + "loss": 0.4686, + "step": 21291 + }, + { + "epoch": 11.894972067039106, + "grad_norm": 0.9827330708503723, + "learning_rate": 0.00040675070028011205, + "loss": 0.4403, + "step": 21292 + }, + { + "epoch": 11.895530726256982, + "grad_norm": 0.4057422876358032, + "learning_rate": 0.00040672268907563026, + "loss": 0.3575, + "step": 21293 + }, + { + "epoch": 11.89608938547486, + "grad_norm": 0.5105399489402771, + "learning_rate": 0.00040669467787114846, + "loss": 0.5119, + "step": 21294 + }, + { + "epoch": 11.896648044692737, + "grad_norm": 1.131582260131836, + "learning_rate": 0.00040666666666666667, + "loss": 0.3716, + "step": 21295 + }, + { + "epoch": 11.897206703910614, + "grad_norm": 0.990439236164093, + "learning_rate": 0.0004066386554621849, + "loss": 0.3286, + "step": 21296 + }, + { + "epoch": 11.897765363128492, + "grad_norm": 0.4834293723106384, + "learning_rate": 0.00040661064425770313, + "loss": 0.5017, + "step": 21297 + }, + { + "epoch": 11.898324022346369, + "grad_norm": 0.4201735258102417, + "learning_rate": 0.0004065826330532213, + "loss": 0.3577, + "step": 21298 + }, + { + "epoch": 11.898882681564245, + "grad_norm": 0.3657025992870331, + "learning_rate": 0.0004065546218487395, + "loss": 0.4109, + "step": 21299 + }, + { + "epoch": 11.899441340782122, + "grad_norm": 1.0979835987091064, + "learning_rate": 0.0004065266106442577, + "loss": 0.3041, + "step": 21300 + }, + { + "epoch": 11.9, + "grad_norm": 0.5186835527420044, + "learning_rate": 0.0004064985994397759, + "loss": 0.3369, + "step": 21301 + }, + { + "epoch": 11.900558659217877, + "grad_norm": 0.5685844421386719, + "learning_rate": 0.00040647058823529416, + "loss": 0.4358, + "step": 21302 + }, + { + "epoch": 11.901117318435753, + "grad_norm": 0.5636425018310547, + "learning_rate": 0.0004064425770308123, + "loss": 0.4475, + "step": 21303 + }, + { + "epoch": 11.901675977653632, + "grad_norm": 0.4192626178264618, + "learning_rate": 0.0004064145658263305, + "loss": 0.3539, + "step": 21304 + }, + { + "epoch": 11.902234636871508, + "grad_norm": 11.530969619750977, + "learning_rate": 0.0004063865546218488, + "loss": 0.5548, + "step": 21305 + }, + { + "epoch": 11.902793296089385, + "grad_norm": 3.8384783267974854, + "learning_rate": 0.00040635854341736693, + "loss": 0.3482, + "step": 21306 + }, + { + "epoch": 11.903351955307263, + "grad_norm": 1.4641417264938354, + "learning_rate": 0.0004063305322128852, + "loss": 0.4488, + "step": 21307 + }, + { + "epoch": 11.90391061452514, + "grad_norm": 0.6003350019454956, + "learning_rate": 0.00040630252100840335, + "loss": 0.4896, + "step": 21308 + }, + { + "epoch": 11.904469273743016, + "grad_norm": 0.5757750272750854, + "learning_rate": 0.00040627450980392155, + "loss": 0.5085, + "step": 21309 + }, + { + "epoch": 11.905027932960895, + "grad_norm": 0.4466097354888916, + "learning_rate": 0.0004062464985994398, + "loss": 0.3448, + "step": 21310 + }, + { + "epoch": 11.905586592178771, + "grad_norm": 16.73419189453125, + "learning_rate": 0.00040621848739495796, + "loss": 0.4516, + "step": 21311 + }, + { + "epoch": 11.906145251396648, + "grad_norm": 0.5923506617546082, + "learning_rate": 0.0004061904761904762, + "loss": 0.4252, + "step": 21312 + }, + { + "epoch": 11.906703910614524, + "grad_norm": 2.602841377258301, + "learning_rate": 0.00040616246498599443, + "loss": 0.322, + "step": 21313 + }, + { + "epoch": 11.907262569832403, + "grad_norm": 1.155871868133545, + "learning_rate": 0.0004061344537815126, + "loss": 0.4191, + "step": 21314 + }, + { + "epoch": 11.90782122905028, + "grad_norm": 0.4974883198738098, + "learning_rate": 0.00040610644257703084, + "loss": 0.4063, + "step": 21315 + }, + { + "epoch": 11.908379888268156, + "grad_norm": 3.6762585639953613, + "learning_rate": 0.000406078431372549, + "loss": 0.3998, + "step": 21316 + }, + { + "epoch": 11.908938547486034, + "grad_norm": 0.6080918908119202, + "learning_rate": 0.00040605042016806725, + "loss": 0.5263, + "step": 21317 + }, + { + "epoch": 11.90949720670391, + "grad_norm": 2.4871015548706055, + "learning_rate": 0.00040602240896358546, + "loss": 0.4439, + "step": 21318 + }, + { + "epoch": 11.910055865921787, + "grad_norm": 0.39962974190711975, + "learning_rate": 0.0004059943977591036, + "loss": 0.3624, + "step": 21319 + }, + { + "epoch": 11.910614525139664, + "grad_norm": 0.4608193635940552, + "learning_rate": 0.00040596638655462187, + "loss": 0.3423, + "step": 21320 + }, + { + "epoch": 11.911173184357542, + "grad_norm": 0.41747844219207764, + "learning_rate": 0.0004059383753501401, + "loss": 0.3799, + "step": 21321 + }, + { + "epoch": 11.911731843575419, + "grad_norm": 0.624941885471344, + "learning_rate": 0.0004059103641456583, + "loss": 0.5725, + "step": 21322 + }, + { + "epoch": 11.912290502793295, + "grad_norm": 0.41783246397972107, + "learning_rate": 0.0004058823529411765, + "loss": 0.4539, + "step": 21323 + }, + { + "epoch": 11.912849162011174, + "grad_norm": 0.37872254848480225, + "learning_rate": 0.00040585434173669464, + "loss": 0.4947, + "step": 21324 + }, + { + "epoch": 11.91340782122905, + "grad_norm": 0.5078139901161194, + "learning_rate": 0.0004058263305322129, + "loss": 0.4066, + "step": 21325 + }, + { + "epoch": 11.913966480446927, + "grad_norm": 0.6185089349746704, + "learning_rate": 0.0004057983193277311, + "loss": 0.3433, + "step": 21326 + }, + { + "epoch": 11.914525139664804, + "grad_norm": 0.5614469647407532, + "learning_rate": 0.0004057703081232493, + "loss": 0.4078, + "step": 21327 + }, + { + "epoch": 11.915083798882682, + "grad_norm": 0.3830602467060089, + "learning_rate": 0.0004057422969187675, + "loss": 0.2841, + "step": 21328 + }, + { + "epoch": 11.915642458100558, + "grad_norm": 0.43531695008277893, + "learning_rate": 0.0004057142857142857, + "loss": 0.4895, + "step": 21329 + }, + { + "epoch": 11.916201117318435, + "grad_norm": 1.2865535020828247, + "learning_rate": 0.00040568627450980393, + "loss": 0.4536, + "step": 21330 + }, + { + "epoch": 11.916759776536313, + "grad_norm": 0.7823388576507568, + "learning_rate": 0.00040565826330532214, + "loss": 0.4322, + "step": 21331 + }, + { + "epoch": 11.91731843575419, + "grad_norm": 0.9831047654151917, + "learning_rate": 0.00040563025210084034, + "loss": 0.4312, + "step": 21332 + }, + { + "epoch": 11.917877094972066, + "grad_norm": 0.4631880223751068, + "learning_rate": 0.00040560224089635855, + "loss": 0.3827, + "step": 21333 + }, + { + "epoch": 11.918435754189945, + "grad_norm": 0.48576414585113525, + "learning_rate": 0.00040557422969187676, + "loss": 0.4894, + "step": 21334 + }, + { + "epoch": 11.918994413407821, + "grad_norm": 0.5551460981369019, + "learning_rate": 0.00040554621848739496, + "loss": 0.4013, + "step": 21335 + }, + { + "epoch": 11.919553072625698, + "grad_norm": 0.39262422919273376, + "learning_rate": 0.00040551820728291317, + "loss": 0.331, + "step": 21336 + }, + { + "epoch": 11.920111731843576, + "grad_norm": 0.4389094412326813, + "learning_rate": 0.00040549019607843143, + "loss": 0.3676, + "step": 21337 + }, + { + "epoch": 11.920670391061453, + "grad_norm": 0.4645405411720276, + "learning_rate": 0.0004054621848739496, + "loss": 0.522, + "step": 21338 + }, + { + "epoch": 11.92122905027933, + "grad_norm": 0.6992455124855042, + "learning_rate": 0.0004054341736694678, + "loss": 0.4609, + "step": 21339 + }, + { + "epoch": 11.921787709497206, + "grad_norm": 1.2163187265396118, + "learning_rate": 0.000405406162464986, + "loss": 0.4299, + "step": 21340 + }, + { + "epoch": 11.922346368715084, + "grad_norm": 0.548793375492096, + "learning_rate": 0.0004053781512605042, + "loss": 0.544, + "step": 21341 + }, + { + "epoch": 11.922905027932961, + "grad_norm": 0.9195559024810791, + "learning_rate": 0.00040535014005602246, + "loss": 0.4118, + "step": 21342 + }, + { + "epoch": 11.923463687150837, + "grad_norm": 0.4788474440574646, + "learning_rate": 0.0004053221288515406, + "loss": 0.4488, + "step": 21343 + }, + { + "epoch": 11.924022346368716, + "grad_norm": 0.5037703514099121, + "learning_rate": 0.0004052941176470588, + "loss": 0.3988, + "step": 21344 + }, + { + "epoch": 11.924581005586592, + "grad_norm": 0.3578023314476013, + "learning_rate": 0.0004052661064425771, + "loss": 0.4265, + "step": 21345 + }, + { + "epoch": 11.925139664804469, + "grad_norm": 0.6640678644180298, + "learning_rate": 0.00040523809523809523, + "loss": 0.4028, + "step": 21346 + }, + { + "epoch": 11.925698324022346, + "grad_norm": 0.41473472118377686, + "learning_rate": 0.0004052100840336135, + "loss": 0.4429, + "step": 21347 + }, + { + "epoch": 11.926256983240224, + "grad_norm": 0.44315069913864136, + "learning_rate": 0.00040518207282913164, + "loss": 0.3701, + "step": 21348 + }, + { + "epoch": 11.9268156424581, + "grad_norm": 0.43728509545326233, + "learning_rate": 0.00040515406162464985, + "loss": 0.4413, + "step": 21349 + }, + { + "epoch": 11.927374301675977, + "grad_norm": 0.7392879128456116, + "learning_rate": 0.0004051260504201681, + "loss": 0.4249, + "step": 21350 + }, + { + "epoch": 11.927932960893855, + "grad_norm": 0.4067839980125427, + "learning_rate": 0.00040509803921568626, + "loss": 0.3119, + "step": 21351 + }, + { + "epoch": 11.928491620111732, + "grad_norm": 0.4650818407535553, + "learning_rate": 0.0004050700280112045, + "loss": 0.458, + "step": 21352 + }, + { + "epoch": 11.929050279329608, + "grad_norm": 0.9499150514602661, + "learning_rate": 0.0004050420168067227, + "loss": 0.4244, + "step": 21353 + }, + { + "epoch": 11.929608938547487, + "grad_norm": 0.5892748832702637, + "learning_rate": 0.0004050140056022409, + "loss": 0.423, + "step": 21354 + }, + { + "epoch": 11.930167597765363, + "grad_norm": 0.6039199233055115, + "learning_rate": 0.00040498599439775914, + "loss": 0.5248, + "step": 21355 + }, + { + "epoch": 11.93072625698324, + "grad_norm": 0.9062708616256714, + "learning_rate": 0.0004049579831932773, + "loss": 0.5105, + "step": 21356 + }, + { + "epoch": 11.931284916201117, + "grad_norm": 0.5044767260551453, + "learning_rate": 0.00040492997198879555, + "loss": 0.3958, + "step": 21357 + }, + { + "epoch": 11.931843575418995, + "grad_norm": 0.7158891558647156, + "learning_rate": 0.00040490196078431375, + "loss": 0.513, + "step": 21358 + }, + { + "epoch": 11.932402234636871, + "grad_norm": 2.45357084274292, + "learning_rate": 0.0004048739495798319, + "loss": 0.3832, + "step": 21359 + }, + { + "epoch": 11.932960893854748, + "grad_norm": 0.657284677028656, + "learning_rate": 0.00040484593837535017, + "loss": 0.4027, + "step": 21360 + }, + { + "epoch": 11.933519553072626, + "grad_norm": 1.0045700073242188, + "learning_rate": 0.00040481792717086837, + "loss": 0.552, + "step": 21361 + }, + { + "epoch": 11.934078212290503, + "grad_norm": 0.3644641935825348, + "learning_rate": 0.0004047899159663866, + "loss": 0.3589, + "step": 21362 + }, + { + "epoch": 11.93463687150838, + "grad_norm": 0.9093109965324402, + "learning_rate": 0.0004047619047619048, + "loss": 0.4456, + "step": 21363 + }, + { + "epoch": 11.935195530726258, + "grad_norm": 0.4375036656856537, + "learning_rate": 0.00040473389355742294, + "loss": 0.4686, + "step": 21364 + }, + { + "epoch": 11.935754189944134, + "grad_norm": 0.4587719142436981, + "learning_rate": 0.0004047058823529412, + "loss": 0.4899, + "step": 21365 + }, + { + "epoch": 11.936312849162011, + "grad_norm": 0.6220703125, + "learning_rate": 0.0004046778711484594, + "loss": 0.4008, + "step": 21366 + }, + { + "epoch": 11.936871508379888, + "grad_norm": 1.466020107269287, + "learning_rate": 0.00040464985994397755, + "loss": 0.4538, + "step": 21367 + }, + { + "epoch": 11.937430167597766, + "grad_norm": 0.5692799091339111, + "learning_rate": 0.0004046218487394958, + "loss": 0.5368, + "step": 21368 + }, + { + "epoch": 11.937988826815642, + "grad_norm": 0.45667150616645813, + "learning_rate": 0.000404593837535014, + "loss": 0.4653, + "step": 21369 + }, + { + "epoch": 11.938547486033519, + "grad_norm": 0.4480198919773102, + "learning_rate": 0.0004045658263305322, + "loss": 0.5272, + "step": 21370 + }, + { + "epoch": 11.939106145251397, + "grad_norm": 1.0969327688217163, + "learning_rate": 0.00040453781512605043, + "loss": 0.3963, + "step": 21371 + }, + { + "epoch": 11.939664804469274, + "grad_norm": 0.6369842886924744, + "learning_rate": 0.0004045098039215686, + "loss": 0.5154, + "step": 21372 + }, + { + "epoch": 11.94022346368715, + "grad_norm": 0.5188466310501099, + "learning_rate": 0.00040448179271708684, + "loss": 0.4026, + "step": 21373 + }, + { + "epoch": 11.940782122905027, + "grad_norm": 0.471169650554657, + "learning_rate": 0.00040445378151260505, + "loss": 0.4585, + "step": 21374 + }, + { + "epoch": 11.941340782122905, + "grad_norm": 0.44957196712493896, + "learning_rate": 0.00040442577030812326, + "loss": 0.49, + "step": 21375 + }, + { + "epoch": 11.941899441340782, + "grad_norm": 5.944665431976318, + "learning_rate": 0.00040439775910364146, + "loss": 0.447, + "step": 21376 + }, + { + "epoch": 11.942458100558659, + "grad_norm": 0.48717060685157776, + "learning_rate": 0.00040436974789915967, + "loss": 0.3551, + "step": 21377 + }, + { + "epoch": 11.943016759776537, + "grad_norm": 0.6072143912315369, + "learning_rate": 0.0004043417366946779, + "loss": 0.4667, + "step": 21378 + }, + { + "epoch": 11.943575418994413, + "grad_norm": 1.032568335533142, + "learning_rate": 0.0004043137254901961, + "loss": 0.3677, + "step": 21379 + }, + { + "epoch": 11.94413407821229, + "grad_norm": 0.4335152208805084, + "learning_rate": 0.0004042857142857143, + "loss": 0.4538, + "step": 21380 + }, + { + "epoch": 11.944692737430168, + "grad_norm": 0.4862251877784729, + "learning_rate": 0.0004042577030812325, + "loss": 0.5147, + "step": 21381 + }, + { + "epoch": 11.945251396648045, + "grad_norm": 0.840763509273529, + "learning_rate": 0.0004042296918767507, + "loss": 0.4164, + "step": 21382 + }, + { + "epoch": 11.945810055865921, + "grad_norm": 0.4739285111427307, + "learning_rate": 0.0004042016806722689, + "loss": 0.4818, + "step": 21383 + }, + { + "epoch": 11.946368715083798, + "grad_norm": 23.829708099365234, + "learning_rate": 0.0004041736694677871, + "loss": 0.4786, + "step": 21384 + }, + { + "epoch": 11.946927374301676, + "grad_norm": 0.6557050943374634, + "learning_rate": 0.00040414565826330537, + "loss": 0.4389, + "step": 21385 + }, + { + "epoch": 11.947486033519553, + "grad_norm": 0.3654273748397827, + "learning_rate": 0.0004041176470588235, + "loss": 0.3677, + "step": 21386 + }, + { + "epoch": 11.94804469273743, + "grad_norm": 0.552470862865448, + "learning_rate": 0.00040408963585434173, + "loss": 0.4975, + "step": 21387 + }, + { + "epoch": 11.948603351955308, + "grad_norm": 0.39677512645721436, + "learning_rate": 0.00040406162464985993, + "loss": 0.4021, + "step": 21388 + }, + { + "epoch": 11.949162011173184, + "grad_norm": 0.4119769334793091, + "learning_rate": 0.00040403361344537814, + "loss": 0.4177, + "step": 21389 + }, + { + "epoch": 11.949720670391061, + "grad_norm": 0.40992507338523865, + "learning_rate": 0.0004040056022408964, + "loss": 0.427, + "step": 21390 + }, + { + "epoch": 11.95027932960894, + "grad_norm": 0.8193026781082153, + "learning_rate": 0.00040397759103641455, + "loss": 0.4525, + "step": 21391 + }, + { + "epoch": 11.950837988826816, + "grad_norm": 0.4050707519054413, + "learning_rate": 0.00040394957983193276, + "loss": 0.4839, + "step": 21392 + }, + { + "epoch": 11.951396648044692, + "grad_norm": 0.4837300181388855, + "learning_rate": 0.000403921568627451, + "loss": 0.3684, + "step": 21393 + }, + { + "epoch": 11.951955307262569, + "grad_norm": 0.38019701838493347, + "learning_rate": 0.00040389355742296917, + "loss": 0.3643, + "step": 21394 + }, + { + "epoch": 11.952513966480447, + "grad_norm": 0.535209059715271, + "learning_rate": 0.00040386554621848743, + "loss": 0.3914, + "step": 21395 + }, + { + "epoch": 11.953072625698324, + "grad_norm": 0.5260859727859497, + "learning_rate": 0.0004038375350140056, + "loss": 0.4598, + "step": 21396 + }, + { + "epoch": 11.9536312849162, + "grad_norm": 0.870768666267395, + "learning_rate": 0.0004038095238095238, + "loss": 0.3948, + "step": 21397 + }, + { + "epoch": 11.954189944134079, + "grad_norm": 0.44812509417533875, + "learning_rate": 0.00040378151260504205, + "loss": 0.4447, + "step": 21398 + }, + { + "epoch": 11.954748603351955, + "grad_norm": 0.49573367834091187, + "learning_rate": 0.0004037535014005602, + "loss": 0.404, + "step": 21399 + }, + { + "epoch": 11.955307262569832, + "grad_norm": 0.3819088041782379, + "learning_rate": 0.00040372549019607846, + "loss": 0.4229, + "step": 21400 + }, + { + "epoch": 11.955865921787709, + "grad_norm": 0.7327035665512085, + "learning_rate": 0.00040369747899159667, + "loss": 0.4963, + "step": 21401 + }, + { + "epoch": 11.956424581005587, + "grad_norm": 0.7574234008789062, + "learning_rate": 0.0004036694677871148, + "loss": 0.3949, + "step": 21402 + }, + { + "epoch": 11.956983240223463, + "grad_norm": 0.5388967394828796, + "learning_rate": 0.0004036414565826331, + "loss": 0.4076, + "step": 21403 + }, + { + "epoch": 11.95754189944134, + "grad_norm": 1.0600173473358154, + "learning_rate": 0.00040361344537815123, + "loss": 0.4323, + "step": 21404 + }, + { + "epoch": 11.958100558659218, + "grad_norm": 0.3955949544906616, + "learning_rate": 0.0004035854341736695, + "loss": 0.3476, + "step": 21405 + }, + { + "epoch": 11.958659217877095, + "grad_norm": 0.7843801975250244, + "learning_rate": 0.0004035574229691877, + "loss": 0.5156, + "step": 21406 + }, + { + "epoch": 11.959217877094972, + "grad_norm": 0.563435971736908, + "learning_rate": 0.00040352941176470585, + "loss": 0.4189, + "step": 21407 + }, + { + "epoch": 11.95977653631285, + "grad_norm": 0.8962976336479187, + "learning_rate": 0.0004035014005602241, + "loss": 0.3651, + "step": 21408 + }, + { + "epoch": 11.960335195530726, + "grad_norm": 0.659602165222168, + "learning_rate": 0.0004034733893557423, + "loss": 0.4733, + "step": 21409 + }, + { + "epoch": 11.960893854748603, + "grad_norm": 0.6104586720466614, + "learning_rate": 0.0004034453781512605, + "loss": 0.3869, + "step": 21410 + }, + { + "epoch": 11.961452513966481, + "grad_norm": 0.952088475227356, + "learning_rate": 0.0004034173669467787, + "loss": 0.5045, + "step": 21411 + }, + { + "epoch": 11.962011173184358, + "grad_norm": 1.0782089233398438, + "learning_rate": 0.0004033893557422969, + "loss": 0.4553, + "step": 21412 + }, + { + "epoch": 11.962569832402234, + "grad_norm": 0.8350033164024353, + "learning_rate": 0.00040336134453781514, + "loss": 0.5616, + "step": 21413 + }, + { + "epoch": 11.963128491620111, + "grad_norm": 1.7913035154342651, + "learning_rate": 0.00040333333333333334, + "loss": 0.571, + "step": 21414 + }, + { + "epoch": 11.96368715083799, + "grad_norm": 0.5928846597671509, + "learning_rate": 0.00040330532212885155, + "loss": 0.4653, + "step": 21415 + }, + { + "epoch": 11.964245810055866, + "grad_norm": 0.3542138934135437, + "learning_rate": 0.00040327731092436976, + "loss": 0.3622, + "step": 21416 + }, + { + "epoch": 11.964804469273743, + "grad_norm": 0.855762243270874, + "learning_rate": 0.00040324929971988796, + "loss": 0.3851, + "step": 21417 + }, + { + "epoch": 11.96536312849162, + "grad_norm": 1.241692066192627, + "learning_rate": 0.00040322128851540617, + "loss": 0.3584, + "step": 21418 + }, + { + "epoch": 11.965921787709497, + "grad_norm": 0.5246260762214661, + "learning_rate": 0.0004031932773109244, + "loss": 0.4798, + "step": 21419 + }, + { + "epoch": 11.966480446927374, + "grad_norm": 0.5350035429000854, + "learning_rate": 0.00040316526610644263, + "loss": 0.424, + "step": 21420 + }, + { + "epoch": 11.96703910614525, + "grad_norm": 0.48071035742759705, + "learning_rate": 0.0004031372549019608, + "loss": 0.4337, + "step": 21421 + }, + { + "epoch": 11.967597765363129, + "grad_norm": 0.6035120487213135, + "learning_rate": 0.000403109243697479, + "loss": 0.4308, + "step": 21422 + }, + { + "epoch": 11.968156424581005, + "grad_norm": 0.481770783662796, + "learning_rate": 0.0004030812324929972, + "loss": 0.4139, + "step": 21423 + }, + { + "epoch": 11.968715083798882, + "grad_norm": 0.6137310266494751, + "learning_rate": 0.0004030532212885154, + "loss": 0.5393, + "step": 21424 + }, + { + "epoch": 11.96927374301676, + "grad_norm": 0.5707393288612366, + "learning_rate": 0.00040302521008403366, + "loss": 0.3689, + "step": 21425 + }, + { + "epoch": 11.969832402234637, + "grad_norm": 0.6308573484420776, + "learning_rate": 0.0004029971988795518, + "loss": 0.4094, + "step": 21426 + }, + { + "epoch": 11.970391061452514, + "grad_norm": 0.6052660942077637, + "learning_rate": 0.00040296918767507, + "loss": 0.3737, + "step": 21427 + }, + { + "epoch": 11.970949720670392, + "grad_norm": 0.5102715492248535, + "learning_rate": 0.0004029411764705883, + "loss": 0.4508, + "step": 21428 + }, + { + "epoch": 11.971508379888268, + "grad_norm": 0.4994828999042511, + "learning_rate": 0.00040291316526610643, + "loss": 0.3651, + "step": 21429 + }, + { + "epoch": 11.972067039106145, + "grad_norm": 0.48235514760017395, + "learning_rate": 0.0004028851540616247, + "loss": 0.4765, + "step": 21430 + }, + { + "epoch": 11.972625698324022, + "grad_norm": 0.6641901135444641, + "learning_rate": 0.00040285714285714285, + "loss": 0.4346, + "step": 21431 + }, + { + "epoch": 11.9731843575419, + "grad_norm": 0.5821069478988647, + "learning_rate": 0.00040282913165266105, + "loss": 0.526, + "step": 21432 + }, + { + "epoch": 11.973743016759776, + "grad_norm": 0.9090389609336853, + "learning_rate": 0.0004028011204481793, + "loss": 0.308, + "step": 21433 + }, + { + "epoch": 11.974301675977653, + "grad_norm": 0.5727768540382385, + "learning_rate": 0.00040277310924369746, + "loss": 0.4178, + "step": 21434 + }, + { + "epoch": 11.974860335195531, + "grad_norm": 0.4237106442451477, + "learning_rate": 0.0004027450980392157, + "loss": 0.3878, + "step": 21435 + }, + { + "epoch": 11.975418994413408, + "grad_norm": 0.4511132538318634, + "learning_rate": 0.00040271708683473393, + "loss": 0.4352, + "step": 21436 + }, + { + "epoch": 11.975977653631285, + "grad_norm": 0.9254518747329712, + "learning_rate": 0.0004026890756302521, + "loss": 0.494, + "step": 21437 + }, + { + "epoch": 11.976536312849163, + "grad_norm": 1.2570074796676636, + "learning_rate": 0.00040266106442577034, + "loss": 0.4191, + "step": 21438 + }, + { + "epoch": 11.97709497206704, + "grad_norm": 0.35035431385040283, + "learning_rate": 0.0004026330532212885, + "loss": 0.401, + "step": 21439 + }, + { + "epoch": 11.977653631284916, + "grad_norm": 0.5531349778175354, + "learning_rate": 0.00040260504201680675, + "loss": 0.5512, + "step": 21440 + }, + { + "epoch": 11.978212290502793, + "grad_norm": 0.7655909061431885, + "learning_rate": 0.00040257703081232496, + "loss": 0.563, + "step": 21441 + }, + { + "epoch": 11.978770949720671, + "grad_norm": 0.44826754927635193, + "learning_rate": 0.0004025490196078431, + "loss": 0.4718, + "step": 21442 + }, + { + "epoch": 11.979329608938547, + "grad_norm": 0.5259056091308594, + "learning_rate": 0.00040252100840336137, + "loss": 0.4015, + "step": 21443 + }, + { + "epoch": 11.979888268156424, + "grad_norm": 0.5748715400695801, + "learning_rate": 0.0004024929971988796, + "loss": 0.4365, + "step": 21444 + }, + { + "epoch": 11.980446927374302, + "grad_norm": 0.8144130706787109, + "learning_rate": 0.0004024649859943978, + "loss": 0.3662, + "step": 21445 + }, + { + "epoch": 11.981005586592179, + "grad_norm": 0.3493744432926178, + "learning_rate": 0.000402436974789916, + "loss": 0.379, + "step": 21446 + }, + { + "epoch": 11.981564245810056, + "grad_norm": 0.7372487187385559, + "learning_rate": 0.00040240896358543414, + "loss": 0.6159, + "step": 21447 + }, + { + "epoch": 11.982122905027932, + "grad_norm": 1.4101358652114868, + "learning_rate": 0.0004023809523809524, + "loss": 0.3666, + "step": 21448 + }, + { + "epoch": 11.98268156424581, + "grad_norm": 8.36441421508789, + "learning_rate": 0.0004023529411764706, + "loss": 0.349, + "step": 21449 + }, + { + "epoch": 11.983240223463687, + "grad_norm": 0.5477078557014465, + "learning_rate": 0.0004023249299719888, + "loss": 0.445, + "step": 21450 + }, + { + "epoch": 11.983798882681564, + "grad_norm": 0.5610117316246033, + "learning_rate": 0.000402296918767507, + "loss": 0.3944, + "step": 21451 + }, + { + "epoch": 11.984357541899442, + "grad_norm": 0.3802027702331543, + "learning_rate": 0.0004022689075630252, + "loss": 0.3331, + "step": 21452 + }, + { + "epoch": 11.984916201117318, + "grad_norm": 0.58562171459198, + "learning_rate": 0.00040224089635854343, + "loss": 0.4818, + "step": 21453 + }, + { + "epoch": 11.985474860335195, + "grad_norm": 0.4802526831626892, + "learning_rate": 0.00040221288515406164, + "loss": 0.5043, + "step": 21454 + }, + { + "epoch": 11.986033519553073, + "grad_norm": 0.9251011610031128, + "learning_rate": 0.00040218487394957984, + "loss": 0.4765, + "step": 21455 + }, + { + "epoch": 11.98659217877095, + "grad_norm": 0.7091112732887268, + "learning_rate": 0.00040215686274509805, + "loss": 0.478, + "step": 21456 + }, + { + "epoch": 11.987150837988827, + "grad_norm": 0.9871140718460083, + "learning_rate": 0.00040212885154061626, + "loss": 0.4323, + "step": 21457 + }, + { + "epoch": 11.987709497206703, + "grad_norm": 0.9973250031471252, + "learning_rate": 0.00040210084033613446, + "loss": 0.3803, + "step": 21458 + }, + { + "epoch": 11.988268156424581, + "grad_norm": 0.6726212501525879, + "learning_rate": 0.00040207282913165267, + "loss": 0.4414, + "step": 21459 + }, + { + "epoch": 11.988826815642458, + "grad_norm": 0.35632383823394775, + "learning_rate": 0.00040204481792717093, + "loss": 0.3801, + "step": 21460 + }, + { + "epoch": 11.989385474860335, + "grad_norm": 0.5990647673606873, + "learning_rate": 0.0004020168067226891, + "loss": 0.5631, + "step": 21461 + }, + { + "epoch": 11.989944134078213, + "grad_norm": 1.3093101978302002, + "learning_rate": 0.0004019887955182073, + "loss": 0.4107, + "step": 21462 + }, + { + "epoch": 11.99050279329609, + "grad_norm": 0.6561041474342346, + "learning_rate": 0.0004019607843137255, + "loss": 0.6508, + "step": 21463 + }, + { + "epoch": 11.991061452513966, + "grad_norm": 0.5097846984863281, + "learning_rate": 0.0004019327731092437, + "loss": 0.4593, + "step": 21464 + }, + { + "epoch": 11.991620111731844, + "grad_norm": 0.41337135434150696, + "learning_rate": 0.00040190476190476196, + "loss": 0.5292, + "step": 21465 + }, + { + "epoch": 11.992178770949721, + "grad_norm": 0.46019884943962097, + "learning_rate": 0.0004018767507002801, + "loss": 0.3881, + "step": 21466 + }, + { + "epoch": 11.992737430167598, + "grad_norm": 0.5083510875701904, + "learning_rate": 0.0004018487394957983, + "loss": 0.4658, + "step": 21467 + }, + { + "epoch": 11.993296089385474, + "grad_norm": 0.43040454387664795, + "learning_rate": 0.0004018207282913166, + "loss": 0.404, + "step": 21468 + }, + { + "epoch": 11.993854748603352, + "grad_norm": 0.7379305958747864, + "learning_rate": 0.0004017927170868347, + "loss": 0.3714, + "step": 21469 + }, + { + "epoch": 11.994413407821229, + "grad_norm": 0.49640461802482605, + "learning_rate": 0.000401764705882353, + "loss": 0.411, + "step": 21470 + }, + { + "epoch": 11.994972067039106, + "grad_norm": 0.8307761549949646, + "learning_rate": 0.00040173669467787114, + "loss": 0.3684, + "step": 21471 + }, + { + "epoch": 11.995530726256984, + "grad_norm": 1.5874541997909546, + "learning_rate": 0.00040170868347338935, + "loss": 0.3373, + "step": 21472 + }, + { + "epoch": 11.99608938547486, + "grad_norm": 0.43602287769317627, + "learning_rate": 0.0004016806722689076, + "loss": 0.4466, + "step": 21473 + }, + { + "epoch": 11.996648044692737, + "grad_norm": 0.5021793246269226, + "learning_rate": 0.00040165266106442576, + "loss": 0.3648, + "step": 21474 + }, + { + "epoch": 11.997206703910614, + "grad_norm": 0.47018346190452576, + "learning_rate": 0.000401624649859944, + "loss": 0.4914, + "step": 21475 + }, + { + "epoch": 11.997765363128492, + "grad_norm": 0.8784552812576294, + "learning_rate": 0.0004015966386554622, + "loss": 0.5085, + "step": 21476 + }, + { + "epoch": 11.998324022346369, + "grad_norm": 0.5817292928695679, + "learning_rate": 0.0004015686274509804, + "loss": 0.4076, + "step": 21477 + }, + { + "epoch": 11.998882681564245, + "grad_norm": 0.6286437511444092, + "learning_rate": 0.00040154061624649864, + "loss": 0.4085, + "step": 21478 + }, + { + "epoch": 11.999441340782123, + "grad_norm": 0.47691115736961365, + "learning_rate": 0.0004015126050420168, + "loss": 0.4291, + "step": 21479 + }, + { + "epoch": 12.0, + "grad_norm": 0.44007375836372375, + "learning_rate": 0.000401484593837535, + "loss": 0.3999, + "step": 21480 + }, + { + "epoch": 12.000558659217877, + "grad_norm": 4.054800510406494, + "learning_rate": 0.00040145658263305325, + "loss": 0.4104, + "step": 21481 + }, + { + "epoch": 12.001117318435755, + "grad_norm": 0.5809652805328369, + "learning_rate": 0.0004014285714285714, + "loss": 0.4753, + "step": 21482 + }, + { + "epoch": 12.001675977653631, + "grad_norm": 7.96524715423584, + "learning_rate": 0.00040140056022408967, + "loss": 0.4473, + "step": 21483 + }, + { + "epoch": 12.002234636871508, + "grad_norm": 0.8553932905197144, + "learning_rate": 0.00040137254901960787, + "loss": 0.4088, + "step": 21484 + }, + { + "epoch": 12.002793296089385, + "grad_norm": 1.017467975616455, + "learning_rate": 0.000401344537815126, + "loss": 0.4812, + "step": 21485 + }, + { + "epoch": 12.003351955307263, + "grad_norm": 0.414492666721344, + "learning_rate": 0.0004013165266106443, + "loss": 0.3337, + "step": 21486 + }, + { + "epoch": 12.00391061452514, + "grad_norm": 1.8006995916366577, + "learning_rate": 0.00040128851540616243, + "loss": 0.5128, + "step": 21487 + }, + { + "epoch": 12.004469273743016, + "grad_norm": 0.5660163164138794, + "learning_rate": 0.0004012605042016807, + "loss": 0.3852, + "step": 21488 + }, + { + "epoch": 12.005027932960894, + "grad_norm": 0.5661418437957764, + "learning_rate": 0.0004012324929971989, + "loss": 0.3802, + "step": 21489 + }, + { + "epoch": 12.005586592178771, + "grad_norm": 0.6057156324386597, + "learning_rate": 0.00040120448179271705, + "loss": 0.3801, + "step": 21490 + }, + { + "epoch": 12.006145251396648, + "grad_norm": 0.5141134262084961, + "learning_rate": 0.0004011764705882353, + "loss": 0.5154, + "step": 21491 + }, + { + "epoch": 12.006703910614526, + "grad_norm": 0.8683204054832458, + "learning_rate": 0.0004011484593837535, + "loss": 0.4275, + "step": 21492 + }, + { + "epoch": 12.007262569832402, + "grad_norm": 0.576714277267456, + "learning_rate": 0.0004011204481792717, + "loss": 0.3941, + "step": 21493 + }, + { + "epoch": 12.007821229050279, + "grad_norm": 0.6502825021743774, + "learning_rate": 0.00040109243697478993, + "loss": 0.5747, + "step": 21494 + }, + { + "epoch": 12.008379888268156, + "grad_norm": 0.5278416275978088, + "learning_rate": 0.0004010644257703081, + "loss": 0.4673, + "step": 21495 + }, + { + "epoch": 12.008938547486034, + "grad_norm": 0.3412959575653076, + "learning_rate": 0.00040103641456582634, + "loss": 0.3165, + "step": 21496 + }, + { + "epoch": 12.00949720670391, + "grad_norm": 0.6991682052612305, + "learning_rate": 0.00040100840336134455, + "loss": 0.4227, + "step": 21497 + }, + { + "epoch": 12.010055865921787, + "grad_norm": 1.3554376363754272, + "learning_rate": 0.00040098039215686276, + "loss": 0.4333, + "step": 21498 + }, + { + "epoch": 12.010614525139665, + "grad_norm": 0.7261898517608643, + "learning_rate": 0.00040095238095238096, + "loss": 0.4795, + "step": 21499 + }, + { + "epoch": 12.011173184357542, + "grad_norm": 0.6800056099891663, + "learning_rate": 0.00040092436974789917, + "loss": 0.3799, + "step": 21500 + }, + { + "epoch": 12.011173184357542, + "eval_cer": 0.08738393724155237, + "eval_loss": 0.33092448115348816, + "eval_runtime": 55.623, + "eval_samples_per_second": 81.585, + "eval_steps_per_second": 5.106, + "eval_wer": 0.34373341526772994, + "step": 21500 + }, + { + "epoch": 12.011731843575419, + "grad_norm": 3.178755283355713, + "learning_rate": 0.0004008963585434174, + "loss": 0.4439, + "step": 21501 + }, + { + "epoch": 12.012290502793297, + "grad_norm": 0.40753012895584106, + "learning_rate": 0.0004008683473389356, + "loss": 0.3713, + "step": 21502 + }, + { + "epoch": 12.012849162011173, + "grad_norm": 0.44675058126449585, + "learning_rate": 0.0004008403361344538, + "loss": 0.3506, + "step": 21503 + }, + { + "epoch": 12.01340782122905, + "grad_norm": 0.8386898040771484, + "learning_rate": 0.000400812324929972, + "loss": 0.5338, + "step": 21504 + }, + { + "epoch": 12.013966480446927, + "grad_norm": 0.3318725526332855, + "learning_rate": 0.0004007843137254902, + "loss": 0.291, + "step": 21505 + }, + { + "epoch": 12.014525139664805, + "grad_norm": 0.6348558068275452, + "learning_rate": 0.0004007563025210084, + "loss": 0.5524, + "step": 21506 + }, + { + "epoch": 12.015083798882682, + "grad_norm": 1.133283257484436, + "learning_rate": 0.0004007282913165266, + "loss": 0.5196, + "step": 21507 + }, + { + "epoch": 12.015642458100558, + "grad_norm": 0.4056191146373749, + "learning_rate": 0.00040070028011204487, + "loss": 0.2897, + "step": 21508 + }, + { + "epoch": 12.016201117318436, + "grad_norm": 0.5457218289375305, + "learning_rate": 0.000400672268907563, + "loss": 0.3484, + "step": 21509 + }, + { + "epoch": 12.016759776536313, + "grad_norm": 0.4435984790325165, + "learning_rate": 0.0004006442577030812, + "loss": 0.4898, + "step": 21510 + }, + { + "epoch": 12.01731843575419, + "grad_norm": 0.3934905529022217, + "learning_rate": 0.00040061624649859943, + "loss": 0.4794, + "step": 21511 + }, + { + "epoch": 12.017877094972068, + "grad_norm": 1.8118194341659546, + "learning_rate": 0.00040058823529411764, + "loss": 0.3515, + "step": 21512 + }, + { + "epoch": 12.018435754189944, + "grad_norm": 0.626777172088623, + "learning_rate": 0.0004005602240896359, + "loss": 0.4214, + "step": 21513 + }, + { + "epoch": 12.018994413407821, + "grad_norm": 1.8108199834823608, + "learning_rate": 0.00040053221288515405, + "loss": 0.5368, + "step": 21514 + }, + { + "epoch": 12.019553072625698, + "grad_norm": 0.5234801173210144, + "learning_rate": 0.00040050420168067226, + "loss": 0.5102, + "step": 21515 + }, + { + "epoch": 12.020111731843576, + "grad_norm": 0.612450122833252, + "learning_rate": 0.0004004761904761905, + "loss": 0.3754, + "step": 21516 + }, + { + "epoch": 12.020670391061453, + "grad_norm": 0.5905392169952393, + "learning_rate": 0.00040044817927170867, + "loss": 0.4769, + "step": 21517 + }, + { + "epoch": 12.021229050279329, + "grad_norm": 0.5143051147460938, + "learning_rate": 0.00040042016806722693, + "loss": 0.371, + "step": 21518 + }, + { + "epoch": 12.021787709497207, + "grad_norm": 0.8932301998138428, + "learning_rate": 0.0004003921568627451, + "loss": 0.4304, + "step": 21519 + }, + { + "epoch": 12.022346368715084, + "grad_norm": 0.407192200422287, + "learning_rate": 0.0004003641456582633, + "loss": 0.3741, + "step": 21520 + }, + { + "epoch": 12.02290502793296, + "grad_norm": 2.492185115814209, + "learning_rate": 0.00040033613445378155, + "loss": 0.3627, + "step": 21521 + }, + { + "epoch": 12.023463687150837, + "grad_norm": 0.5570883750915527, + "learning_rate": 0.0004003081232492997, + "loss": 0.4403, + "step": 21522 + }, + { + "epoch": 12.024022346368715, + "grad_norm": 0.6157993674278259, + "learning_rate": 0.00040028011204481796, + "loss": 0.3439, + "step": 21523 + }, + { + "epoch": 12.024581005586592, + "grad_norm": 0.5208712220191956, + "learning_rate": 0.00040025210084033617, + "loss": 0.6999, + "step": 21524 + }, + { + "epoch": 12.025139664804469, + "grad_norm": 0.46682968735694885, + "learning_rate": 0.0004002240896358543, + "loss": 0.4223, + "step": 21525 + }, + { + "epoch": 12.025698324022347, + "grad_norm": 0.6710468530654907, + "learning_rate": 0.0004001960784313726, + "loss": 0.3785, + "step": 21526 + }, + { + "epoch": 12.026256983240224, + "grad_norm": 0.38457804918289185, + "learning_rate": 0.00040016806722689073, + "loss": 0.4177, + "step": 21527 + }, + { + "epoch": 12.0268156424581, + "grad_norm": 0.7539198994636536, + "learning_rate": 0.000400140056022409, + "loss": 0.5561, + "step": 21528 + }, + { + "epoch": 12.027374301675978, + "grad_norm": 1.5708000659942627, + "learning_rate": 0.0004001120448179272, + "loss": 0.4673, + "step": 21529 + }, + { + "epoch": 12.027932960893855, + "grad_norm": 0.4048934876918793, + "learning_rate": 0.00040008403361344535, + "loss": 0.5231, + "step": 21530 + }, + { + "epoch": 12.028491620111732, + "grad_norm": 0.3864879608154297, + "learning_rate": 0.0004000560224089636, + "loss": 0.3949, + "step": 21531 + }, + { + "epoch": 12.029050279329608, + "grad_norm": 0.38573333621025085, + "learning_rate": 0.0004000280112044818, + "loss": 0.3977, + "step": 21532 + }, + { + "epoch": 12.029608938547486, + "grad_norm": 0.5003073811531067, + "learning_rate": 0.0004, + "loss": 0.4605, + "step": 21533 + }, + { + "epoch": 12.030167597765363, + "grad_norm": 6.38754415512085, + "learning_rate": 0.0003999719887955182, + "loss": 0.4038, + "step": 21534 + }, + { + "epoch": 12.03072625698324, + "grad_norm": 11.37295913696289, + "learning_rate": 0.0003999439775910364, + "loss": 0.4571, + "step": 21535 + }, + { + "epoch": 12.031284916201118, + "grad_norm": 4.7879438400268555, + "learning_rate": 0.00039991596638655464, + "loss": 0.4373, + "step": 21536 + }, + { + "epoch": 12.031843575418995, + "grad_norm": 0.9730218648910522, + "learning_rate": 0.00039988795518207284, + "loss": 0.3795, + "step": 21537 + }, + { + "epoch": 12.032402234636871, + "grad_norm": 0.5231620073318481, + "learning_rate": 0.00039985994397759105, + "loss": 0.4207, + "step": 21538 + }, + { + "epoch": 12.03296089385475, + "grad_norm": 0.5141884684562683, + "learning_rate": 0.00039983193277310926, + "loss": 0.3889, + "step": 21539 + }, + { + "epoch": 12.033519553072626, + "grad_norm": 0.4972028434276581, + "learning_rate": 0.00039980392156862746, + "loss": 0.4673, + "step": 21540 + }, + { + "epoch": 12.034078212290503, + "grad_norm": 0.9289790987968445, + "learning_rate": 0.00039977591036414567, + "loss": 0.4573, + "step": 21541 + }, + { + "epoch": 12.03463687150838, + "grad_norm": 3.2927439212799072, + "learning_rate": 0.0003997478991596639, + "loss": 0.3546, + "step": 21542 + }, + { + "epoch": 12.035195530726257, + "grad_norm": 0.7546213269233704, + "learning_rate": 0.0003997198879551821, + "loss": 0.4986, + "step": 21543 + }, + { + "epoch": 12.035754189944134, + "grad_norm": 0.4614641070365906, + "learning_rate": 0.0003996918767507003, + "loss": 0.4107, + "step": 21544 + }, + { + "epoch": 12.03631284916201, + "grad_norm": 1.9791784286499023, + "learning_rate": 0.0003996638655462185, + "loss": 0.4615, + "step": 21545 + }, + { + "epoch": 12.036871508379889, + "grad_norm": 0.5705390572547913, + "learning_rate": 0.0003996358543417367, + "loss": 0.4557, + "step": 21546 + }, + { + "epoch": 12.037430167597766, + "grad_norm": 0.6609106659889221, + "learning_rate": 0.0003996078431372549, + "loss": 0.5273, + "step": 21547 + }, + { + "epoch": 12.037988826815642, + "grad_norm": 1.7435355186462402, + "learning_rate": 0.00039957983193277316, + "loss": 0.4297, + "step": 21548 + }, + { + "epoch": 12.03854748603352, + "grad_norm": 0.6595619320869446, + "learning_rate": 0.0003995518207282913, + "loss": 0.5128, + "step": 21549 + }, + { + "epoch": 12.039106145251397, + "grad_norm": 0.5085327625274658, + "learning_rate": 0.0003995238095238095, + "loss": 0.4025, + "step": 21550 + }, + { + "epoch": 12.039664804469274, + "grad_norm": 0.43308913707733154, + "learning_rate": 0.0003994957983193277, + "loss": 0.3433, + "step": 21551 + }, + { + "epoch": 12.04022346368715, + "grad_norm": 0.5915241837501526, + "learning_rate": 0.00039946778711484593, + "loss": 0.3883, + "step": 21552 + }, + { + "epoch": 12.040782122905028, + "grad_norm": 0.49047043919563293, + "learning_rate": 0.0003994397759103642, + "loss": 0.4529, + "step": 21553 + }, + { + "epoch": 12.041340782122905, + "grad_norm": 0.9699426889419556, + "learning_rate": 0.00039941176470588235, + "loss": 0.5972, + "step": 21554 + }, + { + "epoch": 12.041899441340782, + "grad_norm": 0.330026239156723, + "learning_rate": 0.00039938375350140055, + "loss": 0.4121, + "step": 21555 + }, + { + "epoch": 12.04245810055866, + "grad_norm": 0.7895104885101318, + "learning_rate": 0.0003993557422969188, + "loss": 0.4655, + "step": 21556 + }, + { + "epoch": 12.043016759776537, + "grad_norm": 0.419988214969635, + "learning_rate": 0.00039932773109243696, + "loss": 0.393, + "step": 21557 + }, + { + "epoch": 12.043575418994413, + "grad_norm": 0.5506265163421631, + "learning_rate": 0.0003992997198879552, + "loss": 0.3533, + "step": 21558 + }, + { + "epoch": 12.04413407821229, + "grad_norm": 0.49021315574645996, + "learning_rate": 0.0003992717086834734, + "loss": 0.3966, + "step": 21559 + }, + { + "epoch": 12.044692737430168, + "grad_norm": 0.6519712209701538, + "learning_rate": 0.0003992436974789916, + "loss": 0.4206, + "step": 21560 + }, + { + "epoch": 12.045251396648045, + "grad_norm": 0.6331649422645569, + "learning_rate": 0.00039921568627450984, + "loss": 0.4176, + "step": 21561 + }, + { + "epoch": 12.045810055865921, + "grad_norm": 0.794137179851532, + "learning_rate": 0.000399187675070028, + "loss": 0.429, + "step": 21562 + }, + { + "epoch": 12.0463687150838, + "grad_norm": 1.9765437841415405, + "learning_rate": 0.00039915966386554625, + "loss": 0.4986, + "step": 21563 + }, + { + "epoch": 12.046927374301676, + "grad_norm": 1.6925710439682007, + "learning_rate": 0.00039913165266106446, + "loss": 0.423, + "step": 21564 + }, + { + "epoch": 12.047486033519553, + "grad_norm": 1.5677542686462402, + "learning_rate": 0.0003991036414565826, + "loss": 0.4193, + "step": 21565 + }, + { + "epoch": 12.048044692737431, + "grad_norm": 0.7156217098236084, + "learning_rate": 0.00039907563025210087, + "loss": 0.4831, + "step": 21566 + }, + { + "epoch": 12.048603351955308, + "grad_norm": 0.6260681748390198, + "learning_rate": 0.000399047619047619, + "loss": 0.4537, + "step": 21567 + }, + { + "epoch": 12.049162011173184, + "grad_norm": 0.48465362191200256, + "learning_rate": 0.0003990196078431373, + "loss": 0.4521, + "step": 21568 + }, + { + "epoch": 12.04972067039106, + "grad_norm": 0.753529965877533, + "learning_rate": 0.0003989915966386555, + "loss": 0.4158, + "step": 21569 + }, + { + "epoch": 12.050279329608939, + "grad_norm": 0.4164171814918518, + "learning_rate": 0.00039896358543417364, + "loss": 0.4175, + "step": 21570 + }, + { + "epoch": 12.050837988826816, + "grad_norm": 0.733792781829834, + "learning_rate": 0.0003989355742296919, + "loss": 0.5642, + "step": 21571 + }, + { + "epoch": 12.051396648044692, + "grad_norm": 0.3996419906616211, + "learning_rate": 0.0003989075630252101, + "loss": 0.399, + "step": 21572 + }, + { + "epoch": 12.05195530726257, + "grad_norm": 0.8819694519042969, + "learning_rate": 0.0003988795518207283, + "loss": 0.392, + "step": 21573 + }, + { + "epoch": 12.052513966480447, + "grad_norm": 3.5123233795166016, + "learning_rate": 0.0003988515406162465, + "loss": 0.4584, + "step": 21574 + }, + { + "epoch": 12.053072625698324, + "grad_norm": 0.358894944190979, + "learning_rate": 0.00039882352941176467, + "loss": 0.403, + "step": 21575 + }, + { + "epoch": 12.053631284916202, + "grad_norm": 0.4959317445755005, + "learning_rate": 0.00039879551820728293, + "loss": 0.4753, + "step": 21576 + }, + { + "epoch": 12.054189944134079, + "grad_norm": 0.3478318750858307, + "learning_rate": 0.00039876750700280114, + "loss": 0.3805, + "step": 21577 + }, + { + "epoch": 12.054748603351955, + "grad_norm": 0.6848805546760559, + "learning_rate": 0.00039873949579831934, + "loss": 0.5441, + "step": 21578 + }, + { + "epoch": 12.055307262569832, + "grad_norm": 0.5436789989471436, + "learning_rate": 0.00039871148459383755, + "loss": 0.428, + "step": 21579 + }, + { + "epoch": 12.05586592178771, + "grad_norm": 0.3651781380176544, + "learning_rate": 0.00039868347338935576, + "loss": 0.3435, + "step": 21580 + }, + { + "epoch": 12.056424581005587, + "grad_norm": 0.4721587002277374, + "learning_rate": 0.00039865546218487396, + "loss": 0.5477, + "step": 21581 + }, + { + "epoch": 12.056983240223463, + "grad_norm": 0.6732951402664185, + "learning_rate": 0.00039862745098039217, + "loss": 0.4458, + "step": 21582 + }, + { + "epoch": 12.057541899441341, + "grad_norm": 0.8742517232894897, + "learning_rate": 0.0003985994397759104, + "loss": 0.3513, + "step": 21583 + }, + { + "epoch": 12.058100558659218, + "grad_norm": 0.68865966796875, + "learning_rate": 0.0003985714285714286, + "loss": 0.7124, + "step": 21584 + }, + { + "epoch": 12.058659217877095, + "grad_norm": 0.5192102789878845, + "learning_rate": 0.0003985434173669468, + "loss": 0.3528, + "step": 21585 + }, + { + "epoch": 12.059217877094973, + "grad_norm": 0.4324816167354584, + "learning_rate": 0.000398515406162465, + "loss": 0.3938, + "step": 21586 + }, + { + "epoch": 12.05977653631285, + "grad_norm": 0.5224592685699463, + "learning_rate": 0.0003984873949579832, + "loss": 0.4083, + "step": 21587 + }, + { + "epoch": 12.060335195530726, + "grad_norm": 0.3568752706050873, + "learning_rate": 0.00039845938375350146, + "loss": 0.3534, + "step": 21588 + }, + { + "epoch": 12.060893854748603, + "grad_norm": 3.1321470737457275, + "learning_rate": 0.0003984313725490196, + "loss": 0.3082, + "step": 21589 + }, + { + "epoch": 12.061452513966481, + "grad_norm": 0.6076870560646057, + "learning_rate": 0.0003984033613445378, + "loss": 0.5191, + "step": 21590 + }, + { + "epoch": 12.062011173184358, + "grad_norm": 0.8609892129898071, + "learning_rate": 0.000398375350140056, + "loss": 0.4518, + "step": 21591 + }, + { + "epoch": 12.062569832402234, + "grad_norm": 0.5587970018386841, + "learning_rate": 0.0003983473389355742, + "loss": 0.4176, + "step": 21592 + }, + { + "epoch": 12.063128491620112, + "grad_norm": 0.4230564534664154, + "learning_rate": 0.00039831932773109243, + "loss": 0.5093, + "step": 21593 + }, + { + "epoch": 12.063687150837989, + "grad_norm": 0.5761800408363342, + "learning_rate": 0.00039829131652661064, + "loss": 0.4522, + "step": 21594 + }, + { + "epoch": 12.064245810055866, + "grad_norm": 3.0153326988220215, + "learning_rate": 0.00039826330532212885, + "loss": 0.5196, + "step": 21595 + }, + { + "epoch": 12.064804469273742, + "grad_norm": 0.5321272611618042, + "learning_rate": 0.0003982352941176471, + "loss": 0.3619, + "step": 21596 + }, + { + "epoch": 12.06536312849162, + "grad_norm": 0.48106786608695984, + "learning_rate": 0.00039820728291316526, + "loss": 0.4905, + "step": 21597 + }, + { + "epoch": 12.065921787709497, + "grad_norm": 0.7509476542472839, + "learning_rate": 0.00039817927170868346, + "loss": 0.35, + "step": 21598 + }, + { + "epoch": 12.066480446927374, + "grad_norm": 0.6596659421920776, + "learning_rate": 0.00039815126050420167, + "loss": 0.4304, + "step": 21599 + }, + { + "epoch": 12.067039106145252, + "grad_norm": 0.503322422504425, + "learning_rate": 0.0003981232492997199, + "loss": 0.4014, + "step": 21600 + }, + { + "epoch": 12.067597765363129, + "grad_norm": 1.1880236864089966, + "learning_rate": 0.00039809523809523814, + "loss": 0.4543, + "step": 21601 + }, + { + "epoch": 12.068156424581005, + "grad_norm": 0.39636269211769104, + "learning_rate": 0.0003980672268907563, + "loss": 0.3756, + "step": 21602 + }, + { + "epoch": 12.068715083798883, + "grad_norm": 7.900854110717773, + "learning_rate": 0.0003980392156862745, + "loss": 0.3677, + "step": 21603 + }, + { + "epoch": 12.06927374301676, + "grad_norm": 0.7973554134368896, + "learning_rate": 0.00039801120448179275, + "loss": 0.3423, + "step": 21604 + }, + { + "epoch": 12.069832402234637, + "grad_norm": 0.6639948487281799, + "learning_rate": 0.0003979831932773109, + "loss": 0.4873, + "step": 21605 + }, + { + "epoch": 12.070391061452513, + "grad_norm": 0.4054809510707855, + "learning_rate": 0.00039795518207282917, + "loss": 0.4029, + "step": 21606 + }, + { + "epoch": 12.070949720670392, + "grad_norm": 0.8257238268852234, + "learning_rate": 0.0003979271708683473, + "loss": 0.4838, + "step": 21607 + }, + { + "epoch": 12.071508379888268, + "grad_norm": 0.6885493993759155, + "learning_rate": 0.0003978991596638655, + "loss": 0.5635, + "step": 21608 + }, + { + "epoch": 12.072067039106145, + "grad_norm": 0.4674849510192871, + "learning_rate": 0.0003978711484593838, + "loss": 0.4613, + "step": 21609 + }, + { + "epoch": 12.072625698324023, + "grad_norm": 1.5057246685028076, + "learning_rate": 0.00039784313725490193, + "loss": 0.419, + "step": 21610 + }, + { + "epoch": 12.0731843575419, + "grad_norm": 0.5516594052314758, + "learning_rate": 0.0003978151260504202, + "loss": 0.5002, + "step": 21611 + }, + { + "epoch": 12.073743016759776, + "grad_norm": 0.37535205483436584, + "learning_rate": 0.0003977871148459384, + "loss": 0.4082, + "step": 21612 + }, + { + "epoch": 12.074301675977654, + "grad_norm": 0.9043879508972168, + "learning_rate": 0.00039775910364145655, + "loss": 0.5307, + "step": 21613 + }, + { + "epoch": 12.074860335195531, + "grad_norm": 0.8339896202087402, + "learning_rate": 0.0003977310924369748, + "loss": 0.4068, + "step": 21614 + }, + { + "epoch": 12.075418994413408, + "grad_norm": 0.8790621757507324, + "learning_rate": 0.00039770308123249296, + "loss": 0.5259, + "step": 21615 + }, + { + "epoch": 12.075977653631284, + "grad_norm": 0.5176562070846558, + "learning_rate": 0.0003976750700280112, + "loss": 0.5019, + "step": 21616 + }, + { + "epoch": 12.076536312849163, + "grad_norm": 2.765859842300415, + "learning_rate": 0.00039764705882352943, + "loss": 0.6903, + "step": 21617 + }, + { + "epoch": 12.077094972067039, + "grad_norm": 0.49033039808273315, + "learning_rate": 0.0003976190476190476, + "loss": 0.3566, + "step": 21618 + }, + { + "epoch": 12.077653631284916, + "grad_norm": 0.596653938293457, + "learning_rate": 0.00039759103641456584, + "loss": 0.492, + "step": 21619 + }, + { + "epoch": 12.078212290502794, + "grad_norm": 0.877156138420105, + "learning_rate": 0.00039756302521008405, + "loss": 0.3871, + "step": 21620 + }, + { + "epoch": 12.07877094972067, + "grad_norm": 3.1856210231781006, + "learning_rate": 0.00039753501400560226, + "loss": 0.3289, + "step": 21621 + }, + { + "epoch": 12.079329608938547, + "grad_norm": 0.5671728849411011, + "learning_rate": 0.00039750700280112046, + "loss": 0.4278, + "step": 21622 + }, + { + "epoch": 12.079888268156424, + "grad_norm": 0.9842540621757507, + "learning_rate": 0.0003974789915966386, + "loss": 0.4035, + "step": 21623 + }, + { + "epoch": 12.080446927374302, + "grad_norm": 0.413460910320282, + "learning_rate": 0.0003974509803921569, + "loss": 0.3879, + "step": 21624 + }, + { + "epoch": 12.081005586592179, + "grad_norm": 0.5376746654510498, + "learning_rate": 0.0003974229691876751, + "loss": 0.4039, + "step": 21625 + }, + { + "epoch": 12.081564245810055, + "grad_norm": 0.6759371757507324, + "learning_rate": 0.0003973949579831933, + "loss": 0.4685, + "step": 21626 + }, + { + "epoch": 12.082122905027934, + "grad_norm": 0.4697682857513428, + "learning_rate": 0.0003973669467787115, + "loss": 0.3497, + "step": 21627 + }, + { + "epoch": 12.08268156424581, + "grad_norm": 0.5227476954460144, + "learning_rate": 0.0003973389355742297, + "loss": 0.4569, + "step": 21628 + }, + { + "epoch": 12.083240223463687, + "grad_norm": 0.5758565664291382, + "learning_rate": 0.0003973109243697479, + "loss": 0.3827, + "step": 21629 + }, + { + "epoch": 12.083798882681565, + "grad_norm": 0.4359157979488373, + "learning_rate": 0.0003972829131652661, + "loss": 0.4406, + "step": 21630 + }, + { + "epoch": 12.084357541899442, + "grad_norm": 0.6225452423095703, + "learning_rate": 0.00039725490196078437, + "loss": 0.4388, + "step": 21631 + }, + { + "epoch": 12.084916201117318, + "grad_norm": 0.6389302015304565, + "learning_rate": 0.0003972268907563025, + "loss": 0.3945, + "step": 21632 + }, + { + "epoch": 12.085474860335195, + "grad_norm": 1.9121235609054565, + "learning_rate": 0.0003971988795518207, + "loss": 0.4263, + "step": 21633 + }, + { + "epoch": 12.086033519553073, + "grad_norm": 0.5296115875244141, + "learning_rate": 0.00039717086834733893, + "loss": 0.3678, + "step": 21634 + }, + { + "epoch": 12.08659217877095, + "grad_norm": 0.4797914922237396, + "learning_rate": 0.00039714285714285714, + "loss": 0.4622, + "step": 21635 + }, + { + "epoch": 12.087150837988826, + "grad_norm": 0.5294155478477478, + "learning_rate": 0.0003971148459383754, + "loss": 0.3964, + "step": 21636 + }, + { + "epoch": 12.087709497206705, + "grad_norm": 1.4047764539718628, + "learning_rate": 0.00039708683473389355, + "loss": 0.4143, + "step": 21637 + }, + { + "epoch": 12.088268156424581, + "grad_norm": 0.738347589969635, + "learning_rate": 0.00039705882352941176, + "loss": 0.3618, + "step": 21638 + }, + { + "epoch": 12.088826815642458, + "grad_norm": 0.47197914123535156, + "learning_rate": 0.00039703081232493, + "loss": 0.4172, + "step": 21639 + }, + { + "epoch": 12.089385474860336, + "grad_norm": 0.41236546635627747, + "learning_rate": 0.00039700280112044817, + "loss": 0.4524, + "step": 21640 + }, + { + "epoch": 12.089944134078213, + "grad_norm": 0.9284387230873108, + "learning_rate": 0.00039697478991596643, + "loss": 0.3191, + "step": 21641 + }, + { + "epoch": 12.09050279329609, + "grad_norm": 0.40866610407829285, + "learning_rate": 0.0003969467787114846, + "loss": 0.4294, + "step": 21642 + }, + { + "epoch": 12.091061452513966, + "grad_norm": 0.3907952308654785, + "learning_rate": 0.0003969187675070028, + "loss": 0.3367, + "step": 21643 + }, + { + "epoch": 12.091620111731844, + "grad_norm": 0.6081427335739136, + "learning_rate": 0.00039689075630252105, + "loss": 0.3373, + "step": 21644 + }, + { + "epoch": 12.09217877094972, + "grad_norm": 0.525584876537323, + "learning_rate": 0.0003968627450980392, + "loss": 0.4389, + "step": 21645 + }, + { + "epoch": 12.092737430167597, + "grad_norm": 0.3987991511821747, + "learning_rate": 0.00039683473389355746, + "loss": 0.3855, + "step": 21646 + }, + { + "epoch": 12.093296089385476, + "grad_norm": 1.655609130859375, + "learning_rate": 0.00039680672268907567, + "loss": 0.4978, + "step": 21647 + }, + { + "epoch": 12.093854748603352, + "grad_norm": 1.1261324882507324, + "learning_rate": 0.0003967787114845938, + "loss": 0.3963, + "step": 21648 + }, + { + "epoch": 12.094413407821229, + "grad_norm": 0.34029123187065125, + "learning_rate": 0.0003967507002801121, + "loss": 0.322, + "step": 21649 + }, + { + "epoch": 12.094972067039107, + "grad_norm": 0.4872231185436249, + "learning_rate": 0.00039672268907563023, + "loss": 0.4012, + "step": 21650 + }, + { + "epoch": 12.095530726256984, + "grad_norm": 1.4977905750274658, + "learning_rate": 0.0003966946778711485, + "loss": 0.481, + "step": 21651 + }, + { + "epoch": 12.09608938547486, + "grad_norm": 0.47866091132164, + "learning_rate": 0.0003966666666666667, + "loss": 0.4402, + "step": 21652 + }, + { + "epoch": 12.096648044692737, + "grad_norm": 1.3456724882125854, + "learning_rate": 0.00039663865546218485, + "loss": 0.391, + "step": 21653 + }, + { + "epoch": 12.097206703910615, + "grad_norm": 0.5541782379150391, + "learning_rate": 0.0003966106442577031, + "loss": 0.5628, + "step": 21654 + }, + { + "epoch": 12.097765363128492, + "grad_norm": 0.42005544900894165, + "learning_rate": 0.0003965826330532213, + "loss": 0.4257, + "step": 21655 + }, + { + "epoch": 12.098324022346368, + "grad_norm": 0.5318379402160645, + "learning_rate": 0.0003965546218487395, + "loss": 0.5421, + "step": 21656 + }, + { + "epoch": 12.098882681564247, + "grad_norm": 0.4454403817653656, + "learning_rate": 0.0003965266106442577, + "loss": 0.3015, + "step": 21657 + }, + { + "epoch": 12.099441340782123, + "grad_norm": 0.3577471971511841, + "learning_rate": 0.0003964985994397759, + "loss": 0.3636, + "step": 21658 + }, + { + "epoch": 12.1, + "grad_norm": 1.3671613931655884, + "learning_rate": 0.00039647058823529414, + "loss": 0.3838, + "step": 21659 + }, + { + "epoch": 12.100558659217878, + "grad_norm": 0.6704123616218567, + "learning_rate": 0.00039644257703081234, + "loss": 0.3494, + "step": 21660 + }, + { + "epoch": 12.101117318435755, + "grad_norm": 0.5111764669418335, + "learning_rate": 0.00039641456582633055, + "loss": 0.4476, + "step": 21661 + }, + { + "epoch": 12.101675977653631, + "grad_norm": 0.5964788794517517, + "learning_rate": 0.00039638655462184876, + "loss": 0.5387, + "step": 21662 + }, + { + "epoch": 12.102234636871508, + "grad_norm": 1.0383455753326416, + "learning_rate": 0.00039635854341736696, + "loss": 0.4288, + "step": 21663 + }, + { + "epoch": 12.102793296089386, + "grad_norm": 0.9435370564460754, + "learning_rate": 0.00039633053221288517, + "loss": 0.3505, + "step": 21664 + }, + { + "epoch": 12.103351955307263, + "grad_norm": 0.5997491478919983, + "learning_rate": 0.0003963025210084034, + "loss": 0.458, + "step": 21665 + }, + { + "epoch": 12.10391061452514, + "grad_norm": 1.4295728206634521, + "learning_rate": 0.0003962745098039216, + "loss": 0.4292, + "step": 21666 + }, + { + "epoch": 12.104469273743018, + "grad_norm": 0.6672996878623962, + "learning_rate": 0.0003962464985994398, + "loss": 0.4339, + "step": 21667 + }, + { + "epoch": 12.105027932960894, + "grad_norm": 2.0946834087371826, + "learning_rate": 0.000396218487394958, + "loss": 0.3834, + "step": 21668 + }, + { + "epoch": 12.10558659217877, + "grad_norm": 0.5532956719398499, + "learning_rate": 0.0003961904761904762, + "loss": 0.4761, + "step": 21669 + }, + { + "epoch": 12.106145251396647, + "grad_norm": 0.5710166096687317, + "learning_rate": 0.0003961624649859944, + "loss": 0.4348, + "step": 21670 + }, + { + "epoch": 12.106703910614526, + "grad_norm": 0.5889133810997009, + "learning_rate": 0.00039613445378151266, + "loss": 0.484, + "step": 21671 + }, + { + "epoch": 12.107262569832402, + "grad_norm": 0.40772464871406555, + "learning_rate": 0.0003961064425770308, + "loss": 0.3391, + "step": 21672 + }, + { + "epoch": 12.107821229050279, + "grad_norm": 0.5061005353927612, + "learning_rate": 0.000396078431372549, + "loss": 0.4248, + "step": 21673 + }, + { + "epoch": 12.108379888268157, + "grad_norm": 0.36669039726257324, + "learning_rate": 0.0003960504201680672, + "loss": 0.4144, + "step": 21674 + }, + { + "epoch": 12.108938547486034, + "grad_norm": 1.1601930856704712, + "learning_rate": 0.00039602240896358543, + "loss": 0.4211, + "step": 21675 + }, + { + "epoch": 12.10949720670391, + "grad_norm": 0.5963717103004456, + "learning_rate": 0.0003959943977591037, + "loss": 0.426, + "step": 21676 + }, + { + "epoch": 12.110055865921789, + "grad_norm": 0.6810588836669922, + "learning_rate": 0.00039596638655462185, + "loss": 0.4785, + "step": 21677 + }, + { + "epoch": 12.110614525139665, + "grad_norm": 1.5960572957992554, + "learning_rate": 0.00039593837535014005, + "loss": 0.45, + "step": 21678 + }, + { + "epoch": 12.111173184357542, + "grad_norm": 0.4219338893890381, + "learning_rate": 0.0003959103641456583, + "loss": 0.3176, + "step": 21679 + }, + { + "epoch": 12.111731843575418, + "grad_norm": 0.6799542307853699, + "learning_rate": 0.00039588235294117646, + "loss": 0.4181, + "step": 21680 + }, + { + "epoch": 12.112290502793297, + "grad_norm": 0.6910079121589661, + "learning_rate": 0.0003958543417366947, + "loss": 0.495, + "step": 21681 + }, + { + "epoch": 12.112849162011173, + "grad_norm": 0.6280542016029358, + "learning_rate": 0.0003958263305322129, + "loss": 0.512, + "step": 21682 + }, + { + "epoch": 12.11340782122905, + "grad_norm": 0.615943968296051, + "learning_rate": 0.0003957983193277311, + "loss": 0.358, + "step": 21683 + }, + { + "epoch": 12.113966480446928, + "grad_norm": 0.6539126038551331, + "learning_rate": 0.00039577030812324934, + "loss": 0.3918, + "step": 21684 + }, + { + "epoch": 12.114525139664805, + "grad_norm": 0.5996832847595215, + "learning_rate": 0.0003957422969187675, + "loss": 0.4776, + "step": 21685 + }, + { + "epoch": 12.115083798882681, + "grad_norm": 0.5373105406761169, + "learning_rate": 0.00039571428571428575, + "loss": 0.3535, + "step": 21686 + }, + { + "epoch": 12.11564245810056, + "grad_norm": 5.450588703155518, + "learning_rate": 0.00039568627450980396, + "loss": 0.3515, + "step": 21687 + }, + { + "epoch": 12.116201117318436, + "grad_norm": 0.6423935294151306, + "learning_rate": 0.0003956582633053221, + "loss": 0.3753, + "step": 21688 + }, + { + "epoch": 12.116759776536313, + "grad_norm": 0.48174917697906494, + "learning_rate": 0.00039563025210084037, + "loss": 0.4535, + "step": 21689 + }, + { + "epoch": 12.11731843575419, + "grad_norm": 0.41791272163391113, + "learning_rate": 0.0003956022408963585, + "loss": 0.4169, + "step": 21690 + }, + { + "epoch": 12.117877094972068, + "grad_norm": 1.2938919067382812, + "learning_rate": 0.0003955742296918768, + "loss": 0.5359, + "step": 21691 + }, + { + "epoch": 12.118435754189944, + "grad_norm": 0.713721752166748, + "learning_rate": 0.000395546218487395, + "loss": 0.5202, + "step": 21692 + }, + { + "epoch": 12.11899441340782, + "grad_norm": 0.35895228385925293, + "learning_rate": 0.00039551820728291314, + "loss": 0.4554, + "step": 21693 + }, + { + "epoch": 12.119553072625699, + "grad_norm": 0.7618927359580994, + "learning_rate": 0.0003954901960784314, + "loss": 0.5268, + "step": 21694 + }, + { + "epoch": 12.120111731843576, + "grad_norm": 0.9843438863754272, + "learning_rate": 0.0003954621848739496, + "loss": 0.4075, + "step": 21695 + }, + { + "epoch": 12.120670391061452, + "grad_norm": 0.4600014090538025, + "learning_rate": 0.0003954341736694678, + "loss": 0.4734, + "step": 21696 + }, + { + "epoch": 12.121229050279329, + "grad_norm": 1.582340955734253, + "learning_rate": 0.000395406162464986, + "loss": 0.3683, + "step": 21697 + }, + { + "epoch": 12.121787709497207, + "grad_norm": 0.5499454140663147, + "learning_rate": 0.00039537815126050417, + "loss": 0.3954, + "step": 21698 + }, + { + "epoch": 12.122346368715084, + "grad_norm": 0.5066421031951904, + "learning_rate": 0.00039535014005602243, + "loss": 0.3895, + "step": 21699 + }, + { + "epoch": 12.12290502793296, + "grad_norm": 1.064005732536316, + "learning_rate": 0.00039532212885154064, + "loss": 0.3704, + "step": 21700 + }, + { + "epoch": 12.123463687150839, + "grad_norm": 0.516848087310791, + "learning_rate": 0.0003952941176470588, + "loss": 0.4433, + "step": 21701 + }, + { + "epoch": 12.124022346368715, + "grad_norm": 0.8953462839126587, + "learning_rate": 0.00039526610644257705, + "loss": 0.479, + "step": 21702 + }, + { + "epoch": 12.124581005586592, + "grad_norm": 3.992640733718872, + "learning_rate": 0.00039523809523809526, + "loss": 0.3705, + "step": 21703 + }, + { + "epoch": 12.12513966480447, + "grad_norm": 0.8260511159896851, + "learning_rate": 0.00039521008403361346, + "loss": 0.4977, + "step": 21704 + }, + { + "epoch": 12.125698324022347, + "grad_norm": 0.5871229767799377, + "learning_rate": 0.00039518207282913167, + "loss": 0.4156, + "step": 21705 + }, + { + "epoch": 12.126256983240223, + "grad_norm": 1.9053579568862915, + "learning_rate": 0.0003951540616246498, + "loss": 0.5071, + "step": 21706 + }, + { + "epoch": 12.1268156424581, + "grad_norm": 0.5244836211204529, + "learning_rate": 0.0003951260504201681, + "loss": 0.4865, + "step": 21707 + }, + { + "epoch": 12.127374301675978, + "grad_norm": 0.4213542938232422, + "learning_rate": 0.0003950980392156863, + "loss": 0.3706, + "step": 21708 + }, + { + "epoch": 12.127932960893855, + "grad_norm": 1.2047513723373413, + "learning_rate": 0.0003950700280112045, + "loss": 0.5565, + "step": 21709 + }, + { + "epoch": 12.128491620111731, + "grad_norm": 0.4915350675582886, + "learning_rate": 0.0003950420168067227, + "loss": 0.3024, + "step": 21710 + }, + { + "epoch": 12.12905027932961, + "grad_norm": 1.2328293323516846, + "learning_rate": 0.0003950140056022409, + "loss": 0.3636, + "step": 21711 + }, + { + "epoch": 12.129608938547486, + "grad_norm": 0.5793976783752441, + "learning_rate": 0.0003949859943977591, + "loss": 0.339, + "step": 21712 + }, + { + "epoch": 12.130167597765363, + "grad_norm": 0.5257045030593872, + "learning_rate": 0.0003949579831932773, + "loss": 0.6747, + "step": 21713 + }, + { + "epoch": 12.130726256983241, + "grad_norm": 0.45344027876853943, + "learning_rate": 0.0003949299719887955, + "loss": 0.3846, + "step": 21714 + }, + { + "epoch": 12.131284916201118, + "grad_norm": 0.4222092628479004, + "learning_rate": 0.0003949019607843137, + "loss": 0.3812, + "step": 21715 + }, + { + "epoch": 12.131843575418994, + "grad_norm": 0.43243011832237244, + "learning_rate": 0.00039487394957983193, + "loss": 0.3886, + "step": 21716 + }, + { + "epoch": 12.13240223463687, + "grad_norm": 0.40126949548721313, + "learning_rate": 0.00039484593837535014, + "loss": 0.4995, + "step": 21717 + }, + { + "epoch": 12.132960893854749, + "grad_norm": 0.6041075587272644, + "learning_rate": 0.00039481792717086835, + "loss": 0.4616, + "step": 21718 + }, + { + "epoch": 12.133519553072626, + "grad_norm": 0.4662020206451416, + "learning_rate": 0.0003947899159663866, + "loss": 0.4267, + "step": 21719 + }, + { + "epoch": 12.134078212290502, + "grad_norm": 0.5810329914093018, + "learning_rate": 0.00039476190476190476, + "loss": 0.3977, + "step": 21720 + }, + { + "epoch": 12.13463687150838, + "grad_norm": 0.4164109528064728, + "learning_rate": 0.00039473389355742296, + "loss": 0.3633, + "step": 21721 + }, + { + "epoch": 12.135195530726257, + "grad_norm": 0.7491818070411682, + "learning_rate": 0.00039470588235294117, + "loss": 0.3673, + "step": 21722 + }, + { + "epoch": 12.135754189944134, + "grad_norm": 0.7716153264045715, + "learning_rate": 0.0003946778711484594, + "loss": 0.4072, + "step": 21723 + }, + { + "epoch": 12.136312849162012, + "grad_norm": 2.2200896739959717, + "learning_rate": 0.00039464985994397764, + "loss": 0.4544, + "step": 21724 + }, + { + "epoch": 12.136871508379889, + "grad_norm": 0.6296133995056152, + "learning_rate": 0.0003946218487394958, + "loss": 0.4959, + "step": 21725 + }, + { + "epoch": 12.137430167597765, + "grad_norm": 0.7471591830253601, + "learning_rate": 0.000394593837535014, + "loss": 0.4906, + "step": 21726 + }, + { + "epoch": 12.137988826815642, + "grad_norm": 0.7727543115615845, + "learning_rate": 0.00039456582633053225, + "loss": 0.6243, + "step": 21727 + }, + { + "epoch": 12.13854748603352, + "grad_norm": 0.5286030173301697, + "learning_rate": 0.0003945378151260504, + "loss": 0.4391, + "step": 21728 + }, + { + "epoch": 12.139106145251397, + "grad_norm": 0.6781374216079712, + "learning_rate": 0.00039450980392156867, + "loss": 0.4692, + "step": 21729 + }, + { + "epoch": 12.139664804469273, + "grad_norm": 0.5865398049354553, + "learning_rate": 0.0003944817927170868, + "loss": 0.2876, + "step": 21730 + }, + { + "epoch": 12.140223463687152, + "grad_norm": 0.36739417910575867, + "learning_rate": 0.000394453781512605, + "loss": 0.3569, + "step": 21731 + }, + { + "epoch": 12.140782122905028, + "grad_norm": 2.0289204120635986, + "learning_rate": 0.0003944257703081233, + "loss": 0.4431, + "step": 21732 + }, + { + "epoch": 12.141340782122905, + "grad_norm": 0.4773618280887604, + "learning_rate": 0.00039439775910364143, + "loss": 0.5413, + "step": 21733 + }, + { + "epoch": 12.141899441340781, + "grad_norm": 0.438223659992218, + "learning_rate": 0.0003943697478991597, + "loss": 0.3063, + "step": 21734 + }, + { + "epoch": 12.14245810055866, + "grad_norm": 9.199708938598633, + "learning_rate": 0.0003943417366946779, + "loss": 0.3716, + "step": 21735 + }, + { + "epoch": 12.143016759776536, + "grad_norm": 0.41729608178138733, + "learning_rate": 0.00039431372549019605, + "loss": 0.4976, + "step": 21736 + }, + { + "epoch": 12.143575418994413, + "grad_norm": 0.7056612372398376, + "learning_rate": 0.0003942857142857143, + "loss": 0.4576, + "step": 21737 + }, + { + "epoch": 12.144134078212291, + "grad_norm": 0.42185676097869873, + "learning_rate": 0.00039425770308123246, + "loss": 0.3542, + "step": 21738 + }, + { + "epoch": 12.144692737430168, + "grad_norm": 3.568018913269043, + "learning_rate": 0.0003942296918767507, + "loss": 0.4505, + "step": 21739 + }, + { + "epoch": 12.145251396648044, + "grad_norm": 0.6252919435501099, + "learning_rate": 0.00039420168067226893, + "loss": 0.5047, + "step": 21740 + }, + { + "epoch": 12.145810055865923, + "grad_norm": 1.604628324508667, + "learning_rate": 0.0003941736694677871, + "loss": 0.3986, + "step": 21741 + }, + { + "epoch": 12.1463687150838, + "grad_norm": 0.4350687265396118, + "learning_rate": 0.00039414565826330534, + "loss": 0.3925, + "step": 21742 + }, + { + "epoch": 12.146927374301676, + "grad_norm": 0.41298022866249084, + "learning_rate": 0.00039411764705882355, + "loss": 0.4036, + "step": 21743 + }, + { + "epoch": 12.147486033519552, + "grad_norm": 3.243593692779541, + "learning_rate": 0.00039408963585434176, + "loss": 0.4881, + "step": 21744 + }, + { + "epoch": 12.14804469273743, + "grad_norm": 0.48016592860221863, + "learning_rate": 0.00039406162464985996, + "loss": 0.421, + "step": 21745 + }, + { + "epoch": 12.148603351955307, + "grad_norm": 0.4243204891681671, + "learning_rate": 0.0003940336134453781, + "loss": 0.4075, + "step": 21746 + }, + { + "epoch": 12.149162011173184, + "grad_norm": 0.37057727575302124, + "learning_rate": 0.0003940056022408964, + "loss": 0.3608, + "step": 21747 + }, + { + "epoch": 12.149720670391062, + "grad_norm": 0.9835109710693359, + "learning_rate": 0.0003939775910364146, + "loss": 0.4239, + "step": 21748 + }, + { + "epoch": 12.150279329608939, + "grad_norm": 0.6910563707351685, + "learning_rate": 0.0003939495798319328, + "loss": 0.4765, + "step": 21749 + }, + { + "epoch": 12.150837988826815, + "grad_norm": 0.5976811051368713, + "learning_rate": 0.000393921568627451, + "loss": 0.4454, + "step": 21750 + }, + { + "epoch": 12.151396648044694, + "grad_norm": 0.5772486329078674, + "learning_rate": 0.0003938935574229692, + "loss": 0.3372, + "step": 21751 + }, + { + "epoch": 12.15195530726257, + "grad_norm": 0.9358725547790527, + "learning_rate": 0.0003938655462184874, + "loss": 0.4362, + "step": 21752 + }, + { + "epoch": 12.152513966480447, + "grad_norm": 1.1724869012832642, + "learning_rate": 0.0003938375350140056, + "loss": 0.462, + "step": 21753 + }, + { + "epoch": 12.153072625698323, + "grad_norm": 0.5281693339347839, + "learning_rate": 0.0003938095238095238, + "loss": 0.4371, + "step": 21754 + }, + { + "epoch": 12.153631284916202, + "grad_norm": 0.579055905342102, + "learning_rate": 0.000393781512605042, + "loss": 0.4903, + "step": 21755 + }, + { + "epoch": 12.154189944134078, + "grad_norm": 0.33133259415626526, + "learning_rate": 0.0003937535014005602, + "loss": 0.378, + "step": 21756 + }, + { + "epoch": 12.154748603351955, + "grad_norm": 0.46535757184028625, + "learning_rate": 0.00039372549019607843, + "loss": 0.4663, + "step": 21757 + }, + { + "epoch": 12.155307262569833, + "grad_norm": 1.664006233215332, + "learning_rate": 0.00039369747899159664, + "loss": 0.3621, + "step": 21758 + }, + { + "epoch": 12.15586592178771, + "grad_norm": 0.6977758407592773, + "learning_rate": 0.0003936694677871149, + "loss": 0.4525, + "step": 21759 + }, + { + "epoch": 12.156424581005586, + "grad_norm": 0.4795726537704468, + "learning_rate": 0.00039364145658263305, + "loss": 0.3693, + "step": 21760 + }, + { + "epoch": 12.156983240223465, + "grad_norm": 0.39268234372138977, + "learning_rate": 0.00039361344537815126, + "loss": 0.3372, + "step": 21761 + }, + { + "epoch": 12.157541899441341, + "grad_norm": 0.4742097556591034, + "learning_rate": 0.00039358543417366946, + "loss": 0.3502, + "step": 21762 + }, + { + "epoch": 12.158100558659218, + "grad_norm": 0.5306321978569031, + "learning_rate": 0.00039355742296918767, + "loss": 0.401, + "step": 21763 + }, + { + "epoch": 12.158659217877094, + "grad_norm": 0.4546867311000824, + "learning_rate": 0.00039352941176470593, + "loss": 0.4108, + "step": 21764 + }, + { + "epoch": 12.159217877094973, + "grad_norm": 0.4598774015903473, + "learning_rate": 0.0003935014005602241, + "loss": 0.4106, + "step": 21765 + }, + { + "epoch": 12.15977653631285, + "grad_norm": 1.1472396850585938, + "learning_rate": 0.0003934733893557423, + "loss": 0.329, + "step": 21766 + }, + { + "epoch": 12.160335195530726, + "grad_norm": 2.278527021408081, + "learning_rate": 0.00039344537815126055, + "loss": 0.3068, + "step": 21767 + }, + { + "epoch": 12.160893854748604, + "grad_norm": 0.5542152523994446, + "learning_rate": 0.0003934173669467787, + "loss": 0.5449, + "step": 21768 + }, + { + "epoch": 12.16145251396648, + "grad_norm": 0.5256080627441406, + "learning_rate": 0.00039338935574229696, + "loss": 0.3507, + "step": 21769 + }, + { + "epoch": 12.162011173184357, + "grad_norm": 0.41452229022979736, + "learning_rate": 0.0003933613445378151, + "loss": 0.376, + "step": 21770 + }, + { + "epoch": 12.162569832402234, + "grad_norm": 1.3607451915740967, + "learning_rate": 0.0003933333333333333, + "loss": 0.505, + "step": 21771 + }, + { + "epoch": 12.163128491620112, + "grad_norm": 0.38324180245399475, + "learning_rate": 0.0003933053221288516, + "loss": 0.2948, + "step": 21772 + }, + { + "epoch": 12.163687150837989, + "grad_norm": 0.9718806147575378, + "learning_rate": 0.00039327731092436973, + "loss": 0.3485, + "step": 21773 + }, + { + "epoch": 12.164245810055865, + "grad_norm": 0.6692209839820862, + "learning_rate": 0.000393249299719888, + "loss": 0.4018, + "step": 21774 + }, + { + "epoch": 12.164804469273744, + "grad_norm": 0.3984488248825073, + "learning_rate": 0.0003932212885154062, + "loss": 0.434, + "step": 21775 + }, + { + "epoch": 12.16536312849162, + "grad_norm": 0.5287492275238037, + "learning_rate": 0.00039319327731092435, + "loss": 0.5105, + "step": 21776 + }, + { + "epoch": 12.165921787709497, + "grad_norm": 0.7724483609199524, + "learning_rate": 0.0003931652661064426, + "loss": 0.4274, + "step": 21777 + }, + { + "epoch": 12.166480446927375, + "grad_norm": 1.267682433128357, + "learning_rate": 0.00039313725490196076, + "loss": 0.3106, + "step": 21778 + }, + { + "epoch": 12.167039106145252, + "grad_norm": 0.48012182116508484, + "learning_rate": 0.000393109243697479, + "loss": 0.4039, + "step": 21779 + }, + { + "epoch": 12.167597765363128, + "grad_norm": 0.4909907281398773, + "learning_rate": 0.0003930812324929972, + "loss": 0.3256, + "step": 21780 + }, + { + "epoch": 12.168156424581005, + "grad_norm": 0.7097208499908447, + "learning_rate": 0.0003930532212885154, + "loss": 0.6106, + "step": 21781 + }, + { + "epoch": 12.168715083798883, + "grad_norm": 0.5284512042999268, + "learning_rate": 0.00039302521008403364, + "loss": 0.2588, + "step": 21782 + }, + { + "epoch": 12.16927374301676, + "grad_norm": 0.7502999305725098, + "learning_rate": 0.00039299719887955184, + "loss": 0.358, + "step": 21783 + }, + { + "epoch": 12.169832402234636, + "grad_norm": 0.5383977890014648, + "learning_rate": 0.00039296918767507005, + "loss": 0.521, + "step": 21784 + }, + { + "epoch": 12.170391061452515, + "grad_norm": 1.2534509897232056, + "learning_rate": 0.00039294117647058826, + "loss": 0.55, + "step": 21785 + }, + { + "epoch": 12.170949720670391, + "grad_norm": 0.3851598799228668, + "learning_rate": 0.0003929131652661064, + "loss": 0.3642, + "step": 21786 + }, + { + "epoch": 12.171508379888268, + "grad_norm": 0.6332042813301086, + "learning_rate": 0.00039288515406162467, + "loss": 0.4268, + "step": 21787 + }, + { + "epoch": 12.172067039106146, + "grad_norm": 0.592551589012146, + "learning_rate": 0.0003928571428571429, + "loss": 0.3564, + "step": 21788 + }, + { + "epoch": 12.172625698324023, + "grad_norm": 1.0004148483276367, + "learning_rate": 0.0003928291316526611, + "loss": 0.4341, + "step": 21789 + }, + { + "epoch": 12.1731843575419, + "grad_norm": 0.4812932014465332, + "learning_rate": 0.0003928011204481793, + "loss": 0.4541, + "step": 21790 + }, + { + "epoch": 12.173743016759776, + "grad_norm": 0.41774073243141174, + "learning_rate": 0.0003927731092436975, + "loss": 0.3839, + "step": 21791 + }, + { + "epoch": 12.174301675977654, + "grad_norm": 0.4575801193714142, + "learning_rate": 0.0003927450980392157, + "loss": 0.4468, + "step": 21792 + }, + { + "epoch": 12.17486033519553, + "grad_norm": 0.47126543521881104, + "learning_rate": 0.0003927170868347339, + "loss": 0.3536, + "step": 21793 + }, + { + "epoch": 12.175418994413407, + "grad_norm": 0.7147666811943054, + "learning_rate": 0.0003926890756302521, + "loss": 0.4329, + "step": 21794 + }, + { + "epoch": 12.175977653631286, + "grad_norm": 1.9386667013168335, + "learning_rate": 0.0003926610644257703, + "loss": 0.4113, + "step": 21795 + }, + { + "epoch": 12.176536312849162, + "grad_norm": 0.49150145053863525, + "learning_rate": 0.0003926330532212885, + "loss": 0.3867, + "step": 21796 + }, + { + "epoch": 12.177094972067039, + "grad_norm": 0.478884220123291, + "learning_rate": 0.0003926050420168067, + "loss": 0.4083, + "step": 21797 + }, + { + "epoch": 12.177653631284917, + "grad_norm": 1.033188819885254, + "learning_rate": 0.00039257703081232493, + "loss": 0.422, + "step": 21798 + }, + { + "epoch": 12.178212290502794, + "grad_norm": 0.6465140581130981, + "learning_rate": 0.0003925490196078432, + "loss": 0.4363, + "step": 21799 + }, + { + "epoch": 12.17877094972067, + "grad_norm": 1.037026286125183, + "learning_rate": 0.00039252100840336135, + "loss": 0.4185, + "step": 21800 + }, + { + "epoch": 12.179329608938547, + "grad_norm": 0.36252695322036743, + "learning_rate": 0.00039249299719887955, + "loss": 0.3977, + "step": 21801 + }, + { + "epoch": 12.179888268156425, + "grad_norm": 0.7120213508605957, + "learning_rate": 0.00039246498599439776, + "loss": 0.381, + "step": 21802 + }, + { + "epoch": 12.180446927374302, + "grad_norm": 1.418805480003357, + "learning_rate": 0.00039243697478991596, + "loss": 0.3741, + "step": 21803 + }, + { + "epoch": 12.181005586592178, + "grad_norm": 0.3128635287284851, + "learning_rate": 0.0003924089635854342, + "loss": 0.312, + "step": 21804 + }, + { + "epoch": 12.181564245810057, + "grad_norm": 1.3979904651641846, + "learning_rate": 0.0003923809523809524, + "loss": 0.3047, + "step": 21805 + }, + { + "epoch": 12.182122905027933, + "grad_norm": 0.5204529166221619, + "learning_rate": 0.0003923529411764706, + "loss": 0.4432, + "step": 21806 + }, + { + "epoch": 12.18268156424581, + "grad_norm": 0.5199275016784668, + "learning_rate": 0.00039232492997198884, + "loss": 0.5098, + "step": 21807 + }, + { + "epoch": 12.183240223463686, + "grad_norm": 1.4085497856140137, + "learning_rate": 0.000392296918767507, + "loss": 0.3794, + "step": 21808 + }, + { + "epoch": 12.183798882681565, + "grad_norm": 0.40714210271835327, + "learning_rate": 0.00039226890756302525, + "loss": 0.4829, + "step": 21809 + }, + { + "epoch": 12.184357541899441, + "grad_norm": 0.6840947270393372, + "learning_rate": 0.0003922408963585434, + "loss": 0.5658, + "step": 21810 + }, + { + "epoch": 12.184916201117318, + "grad_norm": 0.3958841562271118, + "learning_rate": 0.0003922128851540616, + "loss": 0.2873, + "step": 21811 + }, + { + "epoch": 12.185474860335196, + "grad_norm": 0.5028731822967529, + "learning_rate": 0.00039218487394957987, + "loss": 0.3564, + "step": 21812 + }, + { + "epoch": 12.186033519553073, + "grad_norm": 0.3707486391067505, + "learning_rate": 0.000392156862745098, + "loss": 0.3626, + "step": 21813 + }, + { + "epoch": 12.18659217877095, + "grad_norm": 0.4465099573135376, + "learning_rate": 0.00039212885154061623, + "loss": 0.3649, + "step": 21814 + }, + { + "epoch": 12.187150837988828, + "grad_norm": 0.5002136826515198, + "learning_rate": 0.0003921008403361345, + "loss": 0.45, + "step": 21815 + }, + { + "epoch": 12.187709497206704, + "grad_norm": 0.4699725806713104, + "learning_rate": 0.00039207282913165264, + "loss": 0.3947, + "step": 21816 + }, + { + "epoch": 12.18826815642458, + "grad_norm": 0.3924983739852905, + "learning_rate": 0.0003920448179271709, + "loss": 0.4159, + "step": 21817 + }, + { + "epoch": 12.188826815642457, + "grad_norm": 0.4827716648578644, + "learning_rate": 0.00039201680672268905, + "loss": 0.5022, + "step": 21818 + }, + { + "epoch": 12.189385474860336, + "grad_norm": 1.0485687255859375, + "learning_rate": 0.00039198879551820726, + "loss": 0.4069, + "step": 21819 + }, + { + "epoch": 12.189944134078212, + "grad_norm": 0.6386340856552124, + "learning_rate": 0.0003919607843137255, + "loss": 0.2849, + "step": 21820 + }, + { + "epoch": 12.190502793296089, + "grad_norm": 0.9350286722183228, + "learning_rate": 0.00039193277310924367, + "loss": 0.3835, + "step": 21821 + }, + { + "epoch": 12.191061452513967, + "grad_norm": 1.1977585554122925, + "learning_rate": 0.00039190476190476193, + "loss": 0.4274, + "step": 21822 + }, + { + "epoch": 12.191620111731844, + "grad_norm": 0.6727390289306641, + "learning_rate": 0.00039187675070028014, + "loss": 0.4387, + "step": 21823 + }, + { + "epoch": 12.19217877094972, + "grad_norm": 1.412946343421936, + "learning_rate": 0.0003918487394957983, + "loss": 0.4394, + "step": 21824 + }, + { + "epoch": 12.192737430167599, + "grad_norm": 1.2503514289855957, + "learning_rate": 0.00039182072829131655, + "loss": 0.3633, + "step": 21825 + }, + { + "epoch": 12.193296089385475, + "grad_norm": 0.476701945066452, + "learning_rate": 0.0003917927170868347, + "loss": 0.43, + "step": 21826 + }, + { + "epoch": 12.193854748603352, + "grad_norm": 0.4076438248157501, + "learning_rate": 0.00039176470588235296, + "loss": 0.4223, + "step": 21827 + }, + { + "epoch": 12.194413407821228, + "grad_norm": 0.4849054515361786, + "learning_rate": 0.00039173669467787117, + "loss": 0.4347, + "step": 21828 + }, + { + "epoch": 12.194972067039107, + "grad_norm": 0.4874263107776642, + "learning_rate": 0.0003917086834733893, + "loss": 0.4409, + "step": 21829 + }, + { + "epoch": 12.195530726256983, + "grad_norm": 0.4042741060256958, + "learning_rate": 0.0003916806722689076, + "loss": 0.3468, + "step": 21830 + }, + { + "epoch": 12.19608938547486, + "grad_norm": 0.4822596311569214, + "learning_rate": 0.0003916526610644258, + "loss": 0.4527, + "step": 21831 + }, + { + "epoch": 12.196648044692738, + "grad_norm": 0.6793760657310486, + "learning_rate": 0.000391624649859944, + "loss": 0.5699, + "step": 21832 + }, + { + "epoch": 12.197206703910615, + "grad_norm": 0.532418429851532, + "learning_rate": 0.0003915966386554622, + "loss": 0.4835, + "step": 21833 + }, + { + "epoch": 12.197765363128491, + "grad_norm": 0.36411771178245544, + "learning_rate": 0.00039156862745098035, + "loss": 0.3571, + "step": 21834 + }, + { + "epoch": 12.19832402234637, + "grad_norm": 0.7971248030662537, + "learning_rate": 0.0003915406162464986, + "loss": 0.4263, + "step": 21835 + }, + { + "epoch": 12.198882681564246, + "grad_norm": 0.8669003248214722, + "learning_rate": 0.0003915126050420168, + "loss": 0.4788, + "step": 21836 + }, + { + "epoch": 12.199441340782123, + "grad_norm": 0.4638930559158325, + "learning_rate": 0.000391484593837535, + "loss": 0.3206, + "step": 21837 + }, + { + "epoch": 12.2, + "grad_norm": 0.5332055687904358, + "learning_rate": 0.0003914565826330532, + "loss": 0.4596, + "step": 21838 + }, + { + "epoch": 12.200558659217878, + "grad_norm": 0.8995439410209656, + "learning_rate": 0.00039142857142857143, + "loss": 0.3574, + "step": 21839 + }, + { + "epoch": 12.201117318435754, + "grad_norm": 2.44504714012146, + "learning_rate": 0.00039140056022408964, + "loss": 0.3655, + "step": 21840 + }, + { + "epoch": 12.20167597765363, + "grad_norm": 2.5652449131011963, + "learning_rate": 0.00039137254901960784, + "loss": 0.3832, + "step": 21841 + }, + { + "epoch": 12.202234636871509, + "grad_norm": 0.5657622814178467, + "learning_rate": 0.0003913445378151261, + "loss": 0.5945, + "step": 21842 + }, + { + "epoch": 12.202793296089386, + "grad_norm": 0.4920353591442108, + "learning_rate": 0.00039131652661064426, + "loss": 0.4678, + "step": 21843 + }, + { + "epoch": 12.203351955307262, + "grad_norm": 1.2122029066085815, + "learning_rate": 0.00039128851540616246, + "loss": 0.3435, + "step": 21844 + }, + { + "epoch": 12.203910614525139, + "grad_norm": 0.500775933265686, + "learning_rate": 0.00039126050420168067, + "loss": 0.3498, + "step": 21845 + }, + { + "epoch": 12.204469273743017, + "grad_norm": 0.45466670393943787, + "learning_rate": 0.0003912324929971989, + "loss": 0.3981, + "step": 21846 + }, + { + "epoch": 12.205027932960894, + "grad_norm": 0.37359780073165894, + "learning_rate": 0.00039120448179271714, + "loss": 0.4142, + "step": 21847 + }, + { + "epoch": 12.20558659217877, + "grad_norm": 0.5335162281990051, + "learning_rate": 0.0003911764705882353, + "loss": 0.4375, + "step": 21848 + }, + { + "epoch": 12.206145251396649, + "grad_norm": 0.5312724113464355, + "learning_rate": 0.0003911484593837535, + "loss": 0.3921, + "step": 21849 + }, + { + "epoch": 12.206703910614525, + "grad_norm": 0.44479164481163025, + "learning_rate": 0.00039112044817927175, + "loss": 0.4252, + "step": 21850 + }, + { + "epoch": 12.207262569832402, + "grad_norm": 0.35407671332359314, + "learning_rate": 0.0003910924369747899, + "loss": 0.2967, + "step": 21851 + }, + { + "epoch": 12.20782122905028, + "grad_norm": 4.332983493804932, + "learning_rate": 0.00039106442577030817, + "loss": 0.3265, + "step": 21852 + }, + { + "epoch": 12.208379888268157, + "grad_norm": 0.4648149907588959, + "learning_rate": 0.0003910364145658263, + "loss": 0.4087, + "step": 21853 + }, + { + "epoch": 12.208938547486033, + "grad_norm": 0.6404945254325867, + "learning_rate": 0.0003910084033613445, + "loss": 0.4507, + "step": 21854 + }, + { + "epoch": 12.20949720670391, + "grad_norm": 0.8594422340393066, + "learning_rate": 0.0003909803921568628, + "loss": 0.3098, + "step": 21855 + }, + { + "epoch": 12.210055865921788, + "grad_norm": 7.797704696655273, + "learning_rate": 0.00039095238095238093, + "loss": 0.409, + "step": 21856 + }, + { + "epoch": 12.210614525139665, + "grad_norm": 0.5687383413314819, + "learning_rate": 0.0003909243697478992, + "loss": 0.6765, + "step": 21857 + }, + { + "epoch": 12.211173184357541, + "grad_norm": 7.988146781921387, + "learning_rate": 0.0003908963585434174, + "loss": 0.4515, + "step": 21858 + }, + { + "epoch": 12.21173184357542, + "grad_norm": 1.9037461280822754, + "learning_rate": 0.00039086834733893555, + "loss": 0.3847, + "step": 21859 + }, + { + "epoch": 12.212290502793296, + "grad_norm": 0.4811187982559204, + "learning_rate": 0.0003908403361344538, + "loss": 0.6082, + "step": 21860 + }, + { + "epoch": 12.212849162011173, + "grad_norm": 0.5357823967933655, + "learning_rate": 0.00039081232492997196, + "loss": 0.4657, + "step": 21861 + }, + { + "epoch": 12.213407821229051, + "grad_norm": 0.6308609247207642, + "learning_rate": 0.0003907843137254902, + "loss": 0.3716, + "step": 21862 + }, + { + "epoch": 12.213966480446928, + "grad_norm": 0.4544468820095062, + "learning_rate": 0.00039075630252100843, + "loss": 0.4571, + "step": 21863 + }, + { + "epoch": 12.214525139664804, + "grad_norm": 0.5269650816917419, + "learning_rate": 0.0003907282913165266, + "loss": 0.3746, + "step": 21864 + }, + { + "epoch": 12.21508379888268, + "grad_norm": 0.6236300468444824, + "learning_rate": 0.00039070028011204484, + "loss": 0.4047, + "step": 21865 + }, + { + "epoch": 12.21564245810056, + "grad_norm": 0.5601384043693542, + "learning_rate": 0.00039067226890756305, + "loss": 0.4223, + "step": 21866 + }, + { + "epoch": 12.216201117318436, + "grad_norm": 0.44696560502052307, + "learning_rate": 0.00039064425770308126, + "loss": 0.4471, + "step": 21867 + }, + { + "epoch": 12.216759776536312, + "grad_norm": 1.1227378845214844, + "learning_rate": 0.00039061624649859946, + "loss": 0.4578, + "step": 21868 + }, + { + "epoch": 12.21731843575419, + "grad_norm": 0.6919159293174744, + "learning_rate": 0.0003905882352941176, + "loss": 0.4738, + "step": 21869 + }, + { + "epoch": 12.217877094972067, + "grad_norm": 1.015834927558899, + "learning_rate": 0.0003905602240896359, + "loss": 0.5356, + "step": 21870 + }, + { + "epoch": 12.218435754189944, + "grad_norm": 2.636415719985962, + "learning_rate": 0.0003905322128851541, + "loss": 0.4774, + "step": 21871 + }, + { + "epoch": 12.21899441340782, + "grad_norm": 0.4044748544692993, + "learning_rate": 0.0003905042016806723, + "loss": 0.3946, + "step": 21872 + }, + { + "epoch": 12.219553072625699, + "grad_norm": 0.6548739075660706, + "learning_rate": 0.0003904761904761905, + "loss": 0.4648, + "step": 21873 + }, + { + "epoch": 12.220111731843575, + "grad_norm": 0.42191267013549805, + "learning_rate": 0.0003904481792717087, + "loss": 0.5553, + "step": 21874 + }, + { + "epoch": 12.220670391061452, + "grad_norm": 0.9451420903205872, + "learning_rate": 0.0003904201680672269, + "loss": 0.452, + "step": 21875 + }, + { + "epoch": 12.22122905027933, + "grad_norm": 0.572333037853241, + "learning_rate": 0.0003903921568627451, + "loss": 0.414, + "step": 21876 + }, + { + "epoch": 12.221787709497207, + "grad_norm": 0.39203354716300964, + "learning_rate": 0.0003903641456582633, + "loss": 0.526, + "step": 21877 + }, + { + "epoch": 12.222346368715083, + "grad_norm": 8.167726516723633, + "learning_rate": 0.0003903361344537815, + "loss": 0.4928, + "step": 21878 + }, + { + "epoch": 12.222905027932962, + "grad_norm": 0.468376100063324, + "learning_rate": 0.0003903081232492997, + "loss": 0.4226, + "step": 21879 + }, + { + "epoch": 12.223463687150838, + "grad_norm": 0.5577556490898132, + "learning_rate": 0.00039028011204481793, + "loss": 0.3841, + "step": 21880 + }, + { + "epoch": 12.224022346368715, + "grad_norm": 0.6264350414276123, + "learning_rate": 0.00039025210084033614, + "loss": 0.4877, + "step": 21881 + }, + { + "epoch": 12.224581005586591, + "grad_norm": 0.5471148490905762, + "learning_rate": 0.0003902240896358544, + "loss": 0.4355, + "step": 21882 + }, + { + "epoch": 12.22513966480447, + "grad_norm": 0.4771871566772461, + "learning_rate": 0.00039019607843137255, + "loss": 0.3246, + "step": 21883 + }, + { + "epoch": 12.225698324022346, + "grad_norm": 0.6908655166625977, + "learning_rate": 0.00039016806722689076, + "loss": 0.3336, + "step": 21884 + }, + { + "epoch": 12.226256983240223, + "grad_norm": 0.4203045666217804, + "learning_rate": 0.00039014005602240896, + "loss": 0.3613, + "step": 21885 + }, + { + "epoch": 12.226815642458101, + "grad_norm": 0.522285521030426, + "learning_rate": 0.00039011204481792717, + "loss": 0.4583, + "step": 21886 + }, + { + "epoch": 12.227374301675978, + "grad_norm": 0.6972352266311646, + "learning_rate": 0.00039008403361344543, + "loss": 0.326, + "step": 21887 + }, + { + "epoch": 12.227932960893854, + "grad_norm": 1.22415030002594, + "learning_rate": 0.0003900560224089636, + "loss": 0.4579, + "step": 21888 + }, + { + "epoch": 12.228491620111733, + "grad_norm": 2.0768392086029053, + "learning_rate": 0.0003900280112044818, + "loss": 0.341, + "step": 21889 + }, + { + "epoch": 12.22905027932961, + "grad_norm": 0.5469762682914734, + "learning_rate": 0.00039000000000000005, + "loss": 0.5079, + "step": 21890 + }, + { + "epoch": 12.229608938547486, + "grad_norm": 1.2877812385559082, + "learning_rate": 0.0003899719887955182, + "loss": 0.3778, + "step": 21891 + }, + { + "epoch": 12.230167597765362, + "grad_norm": 1.5677930116653442, + "learning_rate": 0.00038994397759103646, + "loss": 0.4174, + "step": 21892 + }, + { + "epoch": 12.23072625698324, + "grad_norm": 0.7079290747642517, + "learning_rate": 0.0003899159663865546, + "loss": 0.5094, + "step": 21893 + }, + { + "epoch": 12.231284916201117, + "grad_norm": 0.4397013485431671, + "learning_rate": 0.0003898879551820728, + "loss": 0.4256, + "step": 21894 + }, + { + "epoch": 12.231843575418994, + "grad_norm": 0.3371831476688385, + "learning_rate": 0.0003898599439775911, + "loss": 0.382, + "step": 21895 + }, + { + "epoch": 12.232402234636872, + "grad_norm": 0.5015655755996704, + "learning_rate": 0.00038983193277310923, + "loss": 0.464, + "step": 21896 + }, + { + "epoch": 12.232960893854749, + "grad_norm": 0.4748333990573883, + "learning_rate": 0.0003898039215686275, + "loss": 0.5532, + "step": 21897 + }, + { + "epoch": 12.233519553072625, + "grad_norm": 0.40756455063819885, + "learning_rate": 0.0003897759103641457, + "loss": 0.383, + "step": 21898 + }, + { + "epoch": 12.234078212290504, + "grad_norm": 0.4960795044898987, + "learning_rate": 0.00038974789915966385, + "loss": 0.3991, + "step": 21899 + }, + { + "epoch": 12.23463687150838, + "grad_norm": 0.4264710247516632, + "learning_rate": 0.0003897198879551821, + "loss": 0.4124, + "step": 21900 + }, + { + "epoch": 12.235195530726257, + "grad_norm": 1.2038625478744507, + "learning_rate": 0.00038969187675070026, + "loss": 0.335, + "step": 21901 + }, + { + "epoch": 12.235754189944133, + "grad_norm": 0.4595504403114319, + "learning_rate": 0.0003896638655462185, + "loss": 0.2935, + "step": 21902 + }, + { + "epoch": 12.236312849162012, + "grad_norm": 0.36419862508773804, + "learning_rate": 0.0003896358543417367, + "loss": 0.4935, + "step": 21903 + }, + { + "epoch": 12.236871508379888, + "grad_norm": 0.47434210777282715, + "learning_rate": 0.0003896078431372549, + "loss": 0.3734, + "step": 21904 + }, + { + "epoch": 12.237430167597765, + "grad_norm": 0.4589269757270813, + "learning_rate": 0.00038957983193277314, + "loss": 0.4204, + "step": 21905 + }, + { + "epoch": 12.237988826815643, + "grad_norm": 0.527160108089447, + "learning_rate": 0.00038955182072829134, + "loss": 0.4681, + "step": 21906 + }, + { + "epoch": 12.23854748603352, + "grad_norm": 0.3831530511379242, + "learning_rate": 0.00038952380952380955, + "loss": 0.3478, + "step": 21907 + }, + { + "epoch": 12.239106145251396, + "grad_norm": 2.4585824012756348, + "learning_rate": 0.00038949579831932776, + "loss": 0.4246, + "step": 21908 + }, + { + "epoch": 12.239664804469275, + "grad_norm": 1.2817026376724243, + "learning_rate": 0.0003894677871148459, + "loss": 0.3695, + "step": 21909 + }, + { + "epoch": 12.240223463687151, + "grad_norm": 3.9584319591522217, + "learning_rate": 0.00038943977591036417, + "loss": 0.4669, + "step": 21910 + }, + { + "epoch": 12.240782122905028, + "grad_norm": 0.5206130146980286, + "learning_rate": 0.0003894117647058824, + "loss": 0.4447, + "step": 21911 + }, + { + "epoch": 12.241340782122904, + "grad_norm": 0.6187414526939392, + "learning_rate": 0.0003893837535014006, + "loss": 0.3849, + "step": 21912 + }, + { + "epoch": 12.241899441340783, + "grad_norm": 0.6064357161521912, + "learning_rate": 0.0003893557422969188, + "loss": 0.4901, + "step": 21913 + }, + { + "epoch": 12.24245810055866, + "grad_norm": 0.4968681037425995, + "learning_rate": 0.000389327731092437, + "loss": 0.5264, + "step": 21914 + }, + { + "epoch": 12.243016759776536, + "grad_norm": 0.47898945212364197, + "learning_rate": 0.0003892997198879552, + "loss": 0.3699, + "step": 21915 + }, + { + "epoch": 12.243575418994414, + "grad_norm": 0.6146661639213562, + "learning_rate": 0.0003892717086834734, + "loss": 0.5232, + "step": 21916 + }, + { + "epoch": 12.24413407821229, + "grad_norm": 1.489929437637329, + "learning_rate": 0.0003892436974789916, + "loss": 0.4723, + "step": 21917 + }, + { + "epoch": 12.244692737430167, + "grad_norm": 9.172572135925293, + "learning_rate": 0.0003892156862745098, + "loss": 0.3544, + "step": 21918 + }, + { + "epoch": 12.245251396648044, + "grad_norm": 0.6824479103088379, + "learning_rate": 0.000389187675070028, + "loss": 0.5045, + "step": 21919 + }, + { + "epoch": 12.245810055865922, + "grad_norm": 0.6482643485069275, + "learning_rate": 0.0003891596638655462, + "loss": 0.4159, + "step": 21920 + }, + { + "epoch": 12.246368715083799, + "grad_norm": 0.8398950099945068, + "learning_rate": 0.00038913165266106443, + "loss": 0.4938, + "step": 21921 + }, + { + "epoch": 12.246927374301675, + "grad_norm": 0.48368746042251587, + "learning_rate": 0.0003891036414565827, + "loss": 0.486, + "step": 21922 + }, + { + "epoch": 12.247486033519554, + "grad_norm": 1.9520163536071777, + "learning_rate": 0.00038907563025210084, + "loss": 0.3875, + "step": 21923 + }, + { + "epoch": 12.24804469273743, + "grad_norm": 0.3982439935207367, + "learning_rate": 0.00038904761904761905, + "loss": 0.3845, + "step": 21924 + }, + { + "epoch": 12.248603351955307, + "grad_norm": 0.6793547868728638, + "learning_rate": 0.00038901960784313726, + "loss": 0.6255, + "step": 21925 + }, + { + "epoch": 12.249162011173185, + "grad_norm": 1.007798194885254, + "learning_rate": 0.00038899159663865546, + "loss": 0.4387, + "step": 21926 + }, + { + "epoch": 12.249720670391062, + "grad_norm": 0.501794159412384, + "learning_rate": 0.00038896358543417367, + "loss": 0.4539, + "step": 21927 + }, + { + "epoch": 12.250279329608938, + "grad_norm": 0.7932931780815125, + "learning_rate": 0.0003889355742296919, + "loss": 0.6936, + "step": 21928 + }, + { + "epoch": 12.250837988826815, + "grad_norm": 0.478601336479187, + "learning_rate": 0.0003889075630252101, + "loss": 0.5219, + "step": 21929 + }, + { + "epoch": 12.251396648044693, + "grad_norm": 0.8970796465873718, + "learning_rate": 0.00038887955182072834, + "loss": 0.6378, + "step": 21930 + }, + { + "epoch": 12.25195530726257, + "grad_norm": 0.4218604266643524, + "learning_rate": 0.0003888515406162465, + "loss": 0.4637, + "step": 21931 + }, + { + "epoch": 12.252513966480446, + "grad_norm": 0.5470377802848816, + "learning_rate": 0.0003888235294117647, + "loss": 0.3953, + "step": 21932 + }, + { + "epoch": 12.253072625698325, + "grad_norm": 0.6472342014312744, + "learning_rate": 0.0003887955182072829, + "loss": 0.4612, + "step": 21933 + }, + { + "epoch": 12.253631284916201, + "grad_norm": 0.6655737161636353, + "learning_rate": 0.0003887675070028011, + "loss": 0.4249, + "step": 21934 + }, + { + "epoch": 12.254189944134078, + "grad_norm": 0.4794478714466095, + "learning_rate": 0.00038873949579831937, + "loss": 0.4755, + "step": 21935 + }, + { + "epoch": 12.254748603351956, + "grad_norm": 0.37384161353111267, + "learning_rate": 0.0003887114845938375, + "loss": 0.3093, + "step": 21936 + }, + { + "epoch": 12.255307262569833, + "grad_norm": 0.47591012716293335, + "learning_rate": 0.00038868347338935573, + "loss": 0.5251, + "step": 21937 + }, + { + "epoch": 12.25586592178771, + "grad_norm": 1.1966019868850708, + "learning_rate": 0.000388655462184874, + "loss": 0.5843, + "step": 21938 + }, + { + "epoch": 12.256424581005586, + "grad_norm": 0.5154739618301392, + "learning_rate": 0.00038862745098039214, + "loss": 0.4434, + "step": 21939 + }, + { + "epoch": 12.256983240223464, + "grad_norm": 0.43514811992645264, + "learning_rate": 0.0003885994397759104, + "loss": 0.4456, + "step": 21940 + }, + { + "epoch": 12.25754189944134, + "grad_norm": 0.9524070620536804, + "learning_rate": 0.00038857142857142855, + "loss": 0.4501, + "step": 21941 + }, + { + "epoch": 12.258100558659217, + "grad_norm": 0.5993633270263672, + "learning_rate": 0.00038854341736694676, + "loss": 0.3877, + "step": 21942 + }, + { + "epoch": 12.258659217877096, + "grad_norm": 7.890388488769531, + "learning_rate": 0.000388515406162465, + "loss": 0.418, + "step": 21943 + }, + { + "epoch": 12.259217877094972, + "grad_norm": 0.47539111971855164, + "learning_rate": 0.00038848739495798317, + "loss": 0.412, + "step": 21944 + }, + { + "epoch": 12.259776536312849, + "grad_norm": 0.5194594860076904, + "learning_rate": 0.00038845938375350143, + "loss": 0.4101, + "step": 21945 + }, + { + "epoch": 12.260335195530725, + "grad_norm": 1.1685597896575928, + "learning_rate": 0.00038843137254901964, + "loss": 0.4934, + "step": 21946 + }, + { + "epoch": 12.260893854748604, + "grad_norm": 0.3871569037437439, + "learning_rate": 0.0003884033613445378, + "loss": 0.4218, + "step": 21947 + }, + { + "epoch": 12.26145251396648, + "grad_norm": 0.44887304306030273, + "learning_rate": 0.00038837535014005605, + "loss": 0.4318, + "step": 21948 + }, + { + "epoch": 12.262011173184357, + "grad_norm": 0.5415121912956238, + "learning_rate": 0.0003883473389355742, + "loss": 0.6098, + "step": 21949 + }, + { + "epoch": 12.262569832402235, + "grad_norm": 0.5002323389053345, + "learning_rate": 0.00038831932773109246, + "loss": 0.3703, + "step": 21950 + }, + { + "epoch": 12.263128491620112, + "grad_norm": 2.433748960494995, + "learning_rate": 0.00038829131652661067, + "loss": 0.4141, + "step": 21951 + }, + { + "epoch": 12.263687150837988, + "grad_norm": 1.0336363315582275, + "learning_rate": 0.0003882633053221288, + "loss": 0.4337, + "step": 21952 + }, + { + "epoch": 12.264245810055867, + "grad_norm": 2.1985156536102295, + "learning_rate": 0.0003882352941176471, + "loss": 0.3889, + "step": 21953 + }, + { + "epoch": 12.264804469273743, + "grad_norm": 0.4408109486103058, + "learning_rate": 0.0003882072829131653, + "loss": 0.3566, + "step": 21954 + }, + { + "epoch": 12.26536312849162, + "grad_norm": 0.7503076791763306, + "learning_rate": 0.0003881792717086835, + "loss": 0.5297, + "step": 21955 + }, + { + "epoch": 12.265921787709496, + "grad_norm": 0.5883480310440063, + "learning_rate": 0.0003881512605042017, + "loss": 0.4237, + "step": 21956 + }, + { + "epoch": 12.266480446927375, + "grad_norm": 0.5162433981895447, + "learning_rate": 0.00038812324929971985, + "loss": 0.5023, + "step": 21957 + }, + { + "epoch": 12.267039106145251, + "grad_norm": 0.5030010342597961, + "learning_rate": 0.0003880952380952381, + "loss": 0.4108, + "step": 21958 + }, + { + "epoch": 12.267597765363128, + "grad_norm": 0.8635503649711609, + "learning_rate": 0.0003880672268907563, + "loss": 0.4756, + "step": 21959 + }, + { + "epoch": 12.268156424581006, + "grad_norm": 1.5629216432571411, + "learning_rate": 0.0003880392156862745, + "loss": 0.478, + "step": 21960 + }, + { + "epoch": 12.268715083798883, + "grad_norm": 0.45288270711898804, + "learning_rate": 0.0003880112044817927, + "loss": 0.4164, + "step": 21961 + }, + { + "epoch": 12.26927374301676, + "grad_norm": 0.37146326899528503, + "learning_rate": 0.00038798319327731093, + "loss": 0.3652, + "step": 21962 + }, + { + "epoch": 12.269832402234638, + "grad_norm": 0.7616936564445496, + "learning_rate": 0.00038795518207282914, + "loss": 0.4825, + "step": 21963 + }, + { + "epoch": 12.270391061452514, + "grad_norm": 0.4957352876663208, + "learning_rate": 0.00038792717086834734, + "loss": 0.4551, + "step": 21964 + }, + { + "epoch": 12.27094972067039, + "grad_norm": 0.47230660915374756, + "learning_rate": 0.00038789915966386555, + "loss": 0.4663, + "step": 21965 + }, + { + "epoch": 12.271508379888267, + "grad_norm": 0.6230714321136475, + "learning_rate": 0.00038787114845938376, + "loss": 0.4603, + "step": 21966 + }, + { + "epoch": 12.272067039106146, + "grad_norm": 0.35418015718460083, + "learning_rate": 0.00038784313725490196, + "loss": 0.3535, + "step": 21967 + }, + { + "epoch": 12.272625698324022, + "grad_norm": 0.4451562762260437, + "learning_rate": 0.00038781512605042017, + "loss": 0.4465, + "step": 21968 + }, + { + "epoch": 12.273184357541899, + "grad_norm": 0.4882161319255829, + "learning_rate": 0.0003877871148459384, + "loss": 0.3678, + "step": 21969 + }, + { + "epoch": 12.273743016759777, + "grad_norm": 4.364351749420166, + "learning_rate": 0.00038775910364145664, + "loss": 0.339, + "step": 21970 + }, + { + "epoch": 12.274301675977654, + "grad_norm": 0.4866425395011902, + "learning_rate": 0.0003877310924369748, + "loss": 0.4275, + "step": 21971 + }, + { + "epoch": 12.27486033519553, + "grad_norm": 0.48463326692581177, + "learning_rate": 0.000387703081232493, + "loss": 0.4512, + "step": 21972 + }, + { + "epoch": 12.275418994413409, + "grad_norm": 0.8256913423538208, + "learning_rate": 0.0003876750700280112, + "loss": 0.4908, + "step": 21973 + }, + { + "epoch": 12.275977653631285, + "grad_norm": 0.4348258674144745, + "learning_rate": 0.0003876470588235294, + "loss": 0.419, + "step": 21974 + }, + { + "epoch": 12.276536312849162, + "grad_norm": 0.44900065660476685, + "learning_rate": 0.00038761904761904767, + "loss": 0.4138, + "step": 21975 + }, + { + "epoch": 12.277094972067038, + "grad_norm": 0.6732671856880188, + "learning_rate": 0.0003875910364145658, + "loss": 0.4057, + "step": 21976 + }, + { + "epoch": 12.277653631284917, + "grad_norm": 0.7130894064903259, + "learning_rate": 0.000387563025210084, + "loss": 0.4044, + "step": 21977 + }, + { + "epoch": 12.278212290502793, + "grad_norm": 0.3504762649536133, + "learning_rate": 0.0003875350140056023, + "loss": 0.3917, + "step": 21978 + }, + { + "epoch": 12.27877094972067, + "grad_norm": 1.5566868782043457, + "learning_rate": 0.00038750700280112043, + "loss": 0.3734, + "step": 21979 + }, + { + "epoch": 12.279329608938548, + "grad_norm": 0.8719233870506287, + "learning_rate": 0.0003874789915966387, + "loss": 0.4389, + "step": 21980 + }, + { + "epoch": 12.279888268156425, + "grad_norm": 4.940595626831055, + "learning_rate": 0.00038745098039215685, + "loss": 0.4811, + "step": 21981 + }, + { + "epoch": 12.280446927374301, + "grad_norm": 0.39729028940200806, + "learning_rate": 0.00038742296918767505, + "loss": 0.4077, + "step": 21982 + }, + { + "epoch": 12.28100558659218, + "grad_norm": 0.6331236362457275, + "learning_rate": 0.0003873949579831933, + "loss": 0.5556, + "step": 21983 + }, + { + "epoch": 12.281564245810056, + "grad_norm": 2.016129493713379, + "learning_rate": 0.00038736694677871146, + "loss": 0.3951, + "step": 21984 + }, + { + "epoch": 12.282122905027933, + "grad_norm": 0.6197336316108704, + "learning_rate": 0.0003873389355742297, + "loss": 0.3922, + "step": 21985 + }, + { + "epoch": 12.28268156424581, + "grad_norm": 0.38644084334373474, + "learning_rate": 0.00038731092436974793, + "loss": 0.4405, + "step": 21986 + }, + { + "epoch": 12.283240223463688, + "grad_norm": 0.29793092608451843, + "learning_rate": 0.0003872829131652661, + "loss": 0.2778, + "step": 21987 + }, + { + "epoch": 12.283798882681564, + "grad_norm": 0.39753955602645874, + "learning_rate": 0.00038725490196078434, + "loss": 0.3941, + "step": 21988 + }, + { + "epoch": 12.28435754189944, + "grad_norm": 0.5762141346931458, + "learning_rate": 0.0003872268907563025, + "loss": 0.415, + "step": 21989 + }, + { + "epoch": 12.28491620111732, + "grad_norm": 0.44524964690208435, + "learning_rate": 0.00038719887955182076, + "loss": 0.3884, + "step": 21990 + }, + { + "epoch": 12.285474860335196, + "grad_norm": 0.4412602186203003, + "learning_rate": 0.00038717086834733896, + "loss": 0.4031, + "step": 21991 + }, + { + "epoch": 12.286033519553072, + "grad_norm": 0.4033774733543396, + "learning_rate": 0.0003871428571428571, + "loss": 0.3993, + "step": 21992 + }, + { + "epoch": 12.286592178770949, + "grad_norm": 0.4507732093334198, + "learning_rate": 0.0003871148459383754, + "loss": 0.4622, + "step": 21993 + }, + { + "epoch": 12.287150837988827, + "grad_norm": 0.3614402711391449, + "learning_rate": 0.0003870868347338936, + "loss": 0.3849, + "step": 21994 + }, + { + "epoch": 12.287709497206704, + "grad_norm": 0.6344165205955505, + "learning_rate": 0.0003870588235294118, + "loss": 0.4199, + "step": 21995 + }, + { + "epoch": 12.28826815642458, + "grad_norm": 0.44309067726135254, + "learning_rate": 0.00038703081232493, + "loss": 0.3603, + "step": 21996 + }, + { + "epoch": 12.288826815642459, + "grad_norm": 0.5011874437332153, + "learning_rate": 0.00038700280112044814, + "loss": 0.5027, + "step": 21997 + }, + { + "epoch": 12.289385474860335, + "grad_norm": 0.40383729338645935, + "learning_rate": 0.0003869747899159664, + "loss": 0.3343, + "step": 21998 + }, + { + "epoch": 12.289944134078212, + "grad_norm": 2.4490933418273926, + "learning_rate": 0.0003869467787114846, + "loss": 0.6019, + "step": 21999 + }, + { + "epoch": 12.29050279329609, + "grad_norm": 0.5161100625991821, + "learning_rate": 0.0003869187675070028, + "loss": 0.3194, + "step": 22000 + }, + { + "epoch": 12.29050279329609, + "eval_cer": 0.08725846398917657, + "eval_loss": 0.33356931805610657, + "eval_runtime": 55.2916, + "eval_samples_per_second": 82.074, + "eval_steps_per_second": 5.136, + "eval_wer": 0.3433702969190805, + "step": 22000 + }, + { + "epoch": 12.291061452513967, + "grad_norm": 0.5101909637451172, + "learning_rate": 0.000386890756302521, + "loss": 0.5044, + "step": 22001 + }, + { + "epoch": 12.291620111731843, + "grad_norm": 0.5509570837020874, + "learning_rate": 0.0003868627450980392, + "loss": 0.503, + "step": 22002 + }, + { + "epoch": 12.29217877094972, + "grad_norm": 0.4949786365032196, + "learning_rate": 0.00038683473389355743, + "loss": 0.4503, + "step": 22003 + }, + { + "epoch": 12.292737430167598, + "grad_norm": 0.41839534044265747, + "learning_rate": 0.00038680672268907564, + "loss": 0.4616, + "step": 22004 + }, + { + "epoch": 12.293296089385475, + "grad_norm": 0.4735906422138214, + "learning_rate": 0.00038677871148459384, + "loss": 0.3397, + "step": 22005 + }, + { + "epoch": 12.293854748603351, + "grad_norm": 0.569817841053009, + "learning_rate": 0.00038675070028011205, + "loss": 0.389, + "step": 22006 + }, + { + "epoch": 12.29441340782123, + "grad_norm": 0.5748640298843384, + "learning_rate": 0.00038672268907563026, + "loss": 0.4819, + "step": 22007 + }, + { + "epoch": 12.294972067039106, + "grad_norm": 0.34630876779556274, + "learning_rate": 0.00038669467787114846, + "loss": 0.3394, + "step": 22008 + }, + { + "epoch": 12.295530726256983, + "grad_norm": 0.30318352580070496, + "learning_rate": 0.00038666666666666667, + "loss": 0.3045, + "step": 22009 + }, + { + "epoch": 12.296089385474861, + "grad_norm": 20.493606567382812, + "learning_rate": 0.00038663865546218493, + "loss": 0.4628, + "step": 22010 + }, + { + "epoch": 12.296648044692738, + "grad_norm": 0.6786620616912842, + "learning_rate": 0.0003866106442577031, + "loss": 0.403, + "step": 22011 + }, + { + "epoch": 12.297206703910614, + "grad_norm": 1.9632556438446045, + "learning_rate": 0.0003865826330532213, + "loss": 0.54, + "step": 22012 + }, + { + "epoch": 12.297765363128491, + "grad_norm": 0.435391902923584, + "learning_rate": 0.0003865546218487395, + "loss": 0.3931, + "step": 22013 + }, + { + "epoch": 12.29832402234637, + "grad_norm": 0.5964452624320984, + "learning_rate": 0.0003865266106442577, + "loss": 0.4574, + "step": 22014 + }, + { + "epoch": 12.298882681564246, + "grad_norm": 0.6367564797401428, + "learning_rate": 0.00038649859943977596, + "loss": 0.4706, + "step": 22015 + }, + { + "epoch": 12.299441340782122, + "grad_norm": 0.4371486306190491, + "learning_rate": 0.0003864705882352941, + "loss": 0.3904, + "step": 22016 + }, + { + "epoch": 12.3, + "grad_norm": 1.2134084701538086, + "learning_rate": 0.0003864425770308123, + "loss": 0.3511, + "step": 22017 + }, + { + "epoch": 12.300558659217877, + "grad_norm": 1.6539559364318848, + "learning_rate": 0.0003864145658263306, + "loss": 0.4262, + "step": 22018 + }, + { + "epoch": 12.301117318435754, + "grad_norm": 0.42026272416114807, + "learning_rate": 0.00038638655462184873, + "loss": 0.4589, + "step": 22019 + }, + { + "epoch": 12.30167597765363, + "grad_norm": 0.5202992558479309, + "learning_rate": 0.000386358543417367, + "loss": 0.4315, + "step": 22020 + }, + { + "epoch": 12.302234636871509, + "grad_norm": 0.467745840549469, + "learning_rate": 0.00038633053221288514, + "loss": 0.3924, + "step": 22021 + }, + { + "epoch": 12.302793296089385, + "grad_norm": 2.2003281116485596, + "learning_rate": 0.00038630252100840335, + "loss": 0.4182, + "step": 22022 + }, + { + "epoch": 12.303351955307262, + "grad_norm": 0.9660027027130127, + "learning_rate": 0.0003862745098039216, + "loss": 0.3344, + "step": 22023 + }, + { + "epoch": 12.30391061452514, + "grad_norm": 1.4834377765655518, + "learning_rate": 0.00038624649859943976, + "loss": 0.4439, + "step": 22024 + }, + { + "epoch": 12.304469273743017, + "grad_norm": 0.4662884473800659, + "learning_rate": 0.000386218487394958, + "loss": 0.4835, + "step": 22025 + }, + { + "epoch": 12.305027932960893, + "grad_norm": 0.4322328269481659, + "learning_rate": 0.0003861904761904762, + "loss": 0.5046, + "step": 22026 + }, + { + "epoch": 12.305586592178772, + "grad_norm": 1.4548461437225342, + "learning_rate": 0.0003861624649859944, + "loss": 0.5176, + "step": 22027 + }, + { + "epoch": 12.306145251396648, + "grad_norm": 1.0186183452606201, + "learning_rate": 0.00038613445378151264, + "loss": 0.3485, + "step": 22028 + }, + { + "epoch": 12.306703910614525, + "grad_norm": 0.4916955232620239, + "learning_rate": 0.0003861064425770308, + "loss": 0.4494, + "step": 22029 + }, + { + "epoch": 12.307262569832401, + "grad_norm": 0.4791578948497772, + "learning_rate": 0.00038607843137254905, + "loss": 0.3474, + "step": 22030 + }, + { + "epoch": 12.30782122905028, + "grad_norm": 0.718127965927124, + "learning_rate": 0.00038605042016806726, + "loss": 0.5104, + "step": 22031 + }, + { + "epoch": 12.308379888268156, + "grad_norm": 0.7696358561515808, + "learning_rate": 0.0003860224089635854, + "loss": 0.6354, + "step": 22032 + }, + { + "epoch": 12.308938547486033, + "grad_norm": 0.46793481707572937, + "learning_rate": 0.00038599439775910367, + "loss": 0.4074, + "step": 22033 + }, + { + "epoch": 12.309497206703911, + "grad_norm": 0.6133039593696594, + "learning_rate": 0.0003859663865546219, + "loss": 0.3894, + "step": 22034 + }, + { + "epoch": 12.310055865921788, + "grad_norm": 0.8257023692131042, + "learning_rate": 0.0003859383753501401, + "loss": 0.3952, + "step": 22035 + }, + { + "epoch": 12.310614525139664, + "grad_norm": 0.5027741193771362, + "learning_rate": 0.0003859103641456583, + "loss": 0.572, + "step": 22036 + }, + { + "epoch": 12.311173184357543, + "grad_norm": 6.1996612548828125, + "learning_rate": 0.00038588235294117644, + "loss": 0.5828, + "step": 22037 + }, + { + "epoch": 12.31173184357542, + "grad_norm": 0.4067029356956482, + "learning_rate": 0.0003858543417366947, + "loss": 0.4379, + "step": 22038 + }, + { + "epoch": 12.312290502793296, + "grad_norm": 2.2580740451812744, + "learning_rate": 0.0003858263305322129, + "loss": 0.4404, + "step": 22039 + }, + { + "epoch": 12.312849162011172, + "grad_norm": 0.3362097442150116, + "learning_rate": 0.00038579831932773105, + "loss": 0.4378, + "step": 22040 + }, + { + "epoch": 12.31340782122905, + "grad_norm": 0.5648675560951233, + "learning_rate": 0.0003857703081232493, + "loss": 0.5131, + "step": 22041 + }, + { + "epoch": 12.313966480446927, + "grad_norm": 0.8387340307235718, + "learning_rate": 0.0003857422969187675, + "loss": 0.336, + "step": 22042 + }, + { + "epoch": 12.314525139664804, + "grad_norm": 0.5210297107696533, + "learning_rate": 0.0003857142857142857, + "loss": 0.4216, + "step": 22043 + }, + { + "epoch": 12.315083798882682, + "grad_norm": 1.334911584854126, + "learning_rate": 0.00038568627450980393, + "loss": 0.3932, + "step": 22044 + }, + { + "epoch": 12.315642458100559, + "grad_norm": 0.5634598135948181, + "learning_rate": 0.0003856582633053221, + "loss": 0.432, + "step": 22045 + }, + { + "epoch": 12.316201117318435, + "grad_norm": 0.526818573474884, + "learning_rate": 0.00038563025210084034, + "loss": 0.4862, + "step": 22046 + }, + { + "epoch": 12.316759776536314, + "grad_norm": 0.41560494899749756, + "learning_rate": 0.00038560224089635855, + "loss": 0.4336, + "step": 22047 + }, + { + "epoch": 12.31731843575419, + "grad_norm": 0.676376461982727, + "learning_rate": 0.00038557422969187676, + "loss": 0.6179, + "step": 22048 + }, + { + "epoch": 12.317877094972067, + "grad_norm": 1.035886287689209, + "learning_rate": 0.00038554621848739496, + "loss": 0.5195, + "step": 22049 + }, + { + "epoch": 12.318435754189943, + "grad_norm": 1.0377691984176636, + "learning_rate": 0.00038551820728291317, + "loss": 0.3766, + "step": 22050 + }, + { + "epoch": 12.318994413407822, + "grad_norm": 1.0605108737945557, + "learning_rate": 0.0003854901960784314, + "loss": 0.5, + "step": 22051 + }, + { + "epoch": 12.319553072625698, + "grad_norm": 1.2051405906677246, + "learning_rate": 0.0003854621848739496, + "loss": 0.4617, + "step": 22052 + }, + { + "epoch": 12.320111731843575, + "grad_norm": 1.3574597835540771, + "learning_rate": 0.0003854341736694678, + "loss": 0.4602, + "step": 22053 + }, + { + "epoch": 12.320670391061453, + "grad_norm": 0.4773907959461212, + "learning_rate": 0.000385406162464986, + "loss": 0.3053, + "step": 22054 + }, + { + "epoch": 12.32122905027933, + "grad_norm": 0.4621227979660034, + "learning_rate": 0.0003853781512605042, + "loss": 0.4155, + "step": 22055 + }, + { + "epoch": 12.321787709497206, + "grad_norm": 1.189773440361023, + "learning_rate": 0.0003853501400560224, + "loss": 0.3165, + "step": 22056 + }, + { + "epoch": 12.322346368715085, + "grad_norm": 0.571053147315979, + "learning_rate": 0.0003853221288515406, + "loss": 0.3451, + "step": 22057 + }, + { + "epoch": 12.322905027932961, + "grad_norm": 0.6405549645423889, + "learning_rate": 0.00038529411764705887, + "loss": 0.4913, + "step": 22058 + }, + { + "epoch": 12.323463687150838, + "grad_norm": 0.7246174812316895, + "learning_rate": 0.000385266106442577, + "loss": 0.5107, + "step": 22059 + }, + { + "epoch": 12.324022346368714, + "grad_norm": 0.36921757459640503, + "learning_rate": 0.00038523809523809523, + "loss": 0.5043, + "step": 22060 + }, + { + "epoch": 12.324581005586593, + "grad_norm": 0.785773754119873, + "learning_rate": 0.00038521008403361343, + "loss": 0.417, + "step": 22061 + }, + { + "epoch": 12.32513966480447, + "grad_norm": 0.33367887139320374, + "learning_rate": 0.00038518207282913164, + "loss": 0.35, + "step": 22062 + }, + { + "epoch": 12.325698324022346, + "grad_norm": 3.0431160926818848, + "learning_rate": 0.0003851540616246499, + "loss": 0.4313, + "step": 22063 + }, + { + "epoch": 12.326256983240224, + "grad_norm": 0.46786046028137207, + "learning_rate": 0.00038512605042016805, + "loss": 0.3928, + "step": 22064 + }, + { + "epoch": 12.3268156424581, + "grad_norm": 1.3510156869888306, + "learning_rate": 0.00038509803921568626, + "loss": 0.4261, + "step": 22065 + }, + { + "epoch": 12.327374301675977, + "grad_norm": 0.6182519197463989, + "learning_rate": 0.0003850700280112045, + "loss": 0.3767, + "step": 22066 + }, + { + "epoch": 12.327932960893854, + "grad_norm": 0.49069443345069885, + "learning_rate": 0.00038504201680672267, + "loss": 0.4449, + "step": 22067 + }, + { + "epoch": 12.328491620111732, + "grad_norm": 0.427541583776474, + "learning_rate": 0.00038501400560224093, + "loss": 0.3877, + "step": 22068 + }, + { + "epoch": 12.329050279329609, + "grad_norm": 0.44517752528190613, + "learning_rate": 0.0003849859943977591, + "loss": 0.404, + "step": 22069 + }, + { + "epoch": 12.329608938547485, + "grad_norm": 0.4926365315914154, + "learning_rate": 0.0003849579831932773, + "loss": 0.4675, + "step": 22070 + }, + { + "epoch": 12.330167597765364, + "grad_norm": 0.6203492283821106, + "learning_rate": 0.00038492997198879555, + "loss": 0.4113, + "step": 22071 + }, + { + "epoch": 12.33072625698324, + "grad_norm": 0.61383455991745, + "learning_rate": 0.0003849019607843137, + "loss": 0.5191, + "step": 22072 + }, + { + "epoch": 12.331284916201117, + "grad_norm": 0.5258894562721252, + "learning_rate": 0.00038487394957983196, + "loss": 0.4, + "step": 22073 + }, + { + "epoch": 12.331843575418995, + "grad_norm": 0.5182380676269531, + "learning_rate": 0.00038484593837535017, + "loss": 0.4227, + "step": 22074 + }, + { + "epoch": 12.332402234636872, + "grad_norm": 0.5613095760345459, + "learning_rate": 0.0003848179271708683, + "loss": 0.5018, + "step": 22075 + }, + { + "epoch": 12.332960893854748, + "grad_norm": 0.46325626969337463, + "learning_rate": 0.0003847899159663866, + "loss": 0.3538, + "step": 22076 + }, + { + "epoch": 12.333519553072625, + "grad_norm": 0.7671961188316345, + "learning_rate": 0.00038476190476190473, + "loss": 0.6775, + "step": 22077 + }, + { + "epoch": 12.334078212290503, + "grad_norm": 0.3868919312953949, + "learning_rate": 0.000384733893557423, + "loss": 0.326, + "step": 22078 + }, + { + "epoch": 12.33463687150838, + "grad_norm": 0.49969127774238586, + "learning_rate": 0.0003847058823529412, + "loss": 0.6402, + "step": 22079 + }, + { + "epoch": 12.335195530726256, + "grad_norm": 0.73891681432724, + "learning_rate": 0.00038467787114845935, + "loss": 0.3978, + "step": 22080 + }, + { + "epoch": 12.335754189944135, + "grad_norm": 1.2865787744522095, + "learning_rate": 0.0003846498599439776, + "loss": 0.5412, + "step": 22081 + }, + { + "epoch": 12.336312849162011, + "grad_norm": 1.1106302738189697, + "learning_rate": 0.0003846218487394958, + "loss": 0.368, + "step": 22082 + }, + { + "epoch": 12.336871508379888, + "grad_norm": 0.5012155771255493, + "learning_rate": 0.000384593837535014, + "loss": 0.3491, + "step": 22083 + }, + { + "epoch": 12.337430167597766, + "grad_norm": 0.40300559997558594, + "learning_rate": 0.0003845658263305322, + "loss": 0.4637, + "step": 22084 + }, + { + "epoch": 12.337988826815643, + "grad_norm": 0.3870534300804138, + "learning_rate": 0.0003845378151260504, + "loss": 0.4074, + "step": 22085 + }, + { + "epoch": 12.33854748603352, + "grad_norm": 0.32733431458473206, + "learning_rate": 0.00038450980392156864, + "loss": 0.3158, + "step": 22086 + }, + { + "epoch": 12.339106145251396, + "grad_norm": 0.4238924980163574, + "learning_rate": 0.00038448179271708684, + "loss": 0.5426, + "step": 22087 + }, + { + "epoch": 12.339664804469274, + "grad_norm": 0.7052682042121887, + "learning_rate": 0.00038445378151260505, + "loss": 0.3858, + "step": 22088 + }, + { + "epoch": 12.34022346368715, + "grad_norm": 0.3622893989086151, + "learning_rate": 0.00038442577030812326, + "loss": 0.3668, + "step": 22089 + }, + { + "epoch": 12.340782122905027, + "grad_norm": 4.846566200256348, + "learning_rate": 0.00038439775910364146, + "loss": 0.3675, + "step": 22090 + }, + { + "epoch": 12.341340782122906, + "grad_norm": 0.4018767178058624, + "learning_rate": 0.00038436974789915967, + "loss": 0.4772, + "step": 22091 + }, + { + "epoch": 12.341899441340782, + "grad_norm": 0.7569673657417297, + "learning_rate": 0.0003843417366946779, + "loss": 0.5905, + "step": 22092 + }, + { + "epoch": 12.342458100558659, + "grad_norm": 0.47620949149131775, + "learning_rate": 0.00038431372549019614, + "loss": 0.4197, + "step": 22093 + }, + { + "epoch": 12.343016759776535, + "grad_norm": 0.4854866862297058, + "learning_rate": 0.0003842857142857143, + "loss": 0.3195, + "step": 22094 + }, + { + "epoch": 12.343575418994414, + "grad_norm": 0.44752511382102966, + "learning_rate": 0.0003842577030812325, + "loss": 0.3688, + "step": 22095 + }, + { + "epoch": 12.34413407821229, + "grad_norm": 1.077589511871338, + "learning_rate": 0.0003842296918767507, + "loss": 0.5776, + "step": 22096 + }, + { + "epoch": 12.344692737430167, + "grad_norm": 0.7049968242645264, + "learning_rate": 0.0003842016806722689, + "loss": 0.4006, + "step": 22097 + }, + { + "epoch": 12.345251396648045, + "grad_norm": 0.5065101981163025, + "learning_rate": 0.00038417366946778717, + "loss": 0.3826, + "step": 22098 + }, + { + "epoch": 12.345810055865922, + "grad_norm": 0.43125924468040466, + "learning_rate": 0.0003841456582633053, + "loss": 0.4319, + "step": 22099 + }, + { + "epoch": 12.346368715083798, + "grad_norm": 0.756377637386322, + "learning_rate": 0.0003841176470588235, + "loss": 0.4943, + "step": 22100 + }, + { + "epoch": 12.346927374301677, + "grad_norm": 11.611139297485352, + "learning_rate": 0.0003840896358543418, + "loss": 0.4376, + "step": 22101 + }, + { + "epoch": 12.347486033519553, + "grad_norm": 3.8073177337646484, + "learning_rate": 0.00038406162464985993, + "loss": 0.2788, + "step": 22102 + }, + { + "epoch": 12.34804469273743, + "grad_norm": 0.471903920173645, + "learning_rate": 0.0003840336134453782, + "loss": 0.4464, + "step": 22103 + }, + { + "epoch": 12.348603351955306, + "grad_norm": 3.4419453144073486, + "learning_rate": 0.00038400560224089635, + "loss": 0.4921, + "step": 22104 + }, + { + "epoch": 12.349162011173185, + "grad_norm": 0.4697394073009491, + "learning_rate": 0.00038397759103641455, + "loss": 0.5058, + "step": 22105 + }, + { + "epoch": 12.349720670391061, + "grad_norm": 0.41084542870521545, + "learning_rate": 0.0003839495798319328, + "loss": 0.3971, + "step": 22106 + }, + { + "epoch": 12.350279329608938, + "grad_norm": 0.5376641750335693, + "learning_rate": 0.00038392156862745096, + "loss": 0.5377, + "step": 22107 + }, + { + "epoch": 12.350837988826816, + "grad_norm": 0.4319959282875061, + "learning_rate": 0.0003838935574229692, + "loss": 0.4161, + "step": 22108 + }, + { + "epoch": 12.351396648044693, + "grad_norm": 0.427133172750473, + "learning_rate": 0.00038386554621848743, + "loss": 0.4302, + "step": 22109 + }, + { + "epoch": 12.35195530726257, + "grad_norm": 1.3137071132659912, + "learning_rate": 0.0003838375350140056, + "loss": 0.4051, + "step": 22110 + }, + { + "epoch": 12.352513966480448, + "grad_norm": 0.4548395276069641, + "learning_rate": 0.00038380952380952384, + "loss": 0.3753, + "step": 22111 + }, + { + "epoch": 12.353072625698324, + "grad_norm": 0.4807973802089691, + "learning_rate": 0.000383781512605042, + "loss": 0.3288, + "step": 22112 + }, + { + "epoch": 12.3536312849162, + "grad_norm": 0.49493372440338135, + "learning_rate": 0.00038375350140056026, + "loss": 0.3117, + "step": 22113 + }, + { + "epoch": 12.354189944134077, + "grad_norm": 5.739772319793701, + "learning_rate": 0.00038372549019607846, + "loss": 0.3862, + "step": 22114 + }, + { + "epoch": 12.354748603351956, + "grad_norm": 0.5182005763053894, + "learning_rate": 0.0003836974789915966, + "loss": 0.4909, + "step": 22115 + }, + { + "epoch": 12.355307262569832, + "grad_norm": 0.3472490608692169, + "learning_rate": 0.0003836694677871149, + "loss": 0.3603, + "step": 22116 + }, + { + "epoch": 12.355865921787709, + "grad_norm": 0.408651202917099, + "learning_rate": 0.0003836414565826331, + "loss": 0.5001, + "step": 22117 + }, + { + "epoch": 12.356424581005587, + "grad_norm": 0.540635883808136, + "learning_rate": 0.0003836134453781513, + "loss": 0.4973, + "step": 22118 + }, + { + "epoch": 12.356983240223464, + "grad_norm": 0.48477354645729065, + "learning_rate": 0.0003835854341736695, + "loss": 0.3761, + "step": 22119 + }, + { + "epoch": 12.35754189944134, + "grad_norm": 11.91270637512207, + "learning_rate": 0.00038355742296918764, + "loss": 0.4147, + "step": 22120 + }, + { + "epoch": 12.358100558659217, + "grad_norm": 0.9062607288360596, + "learning_rate": 0.0003835294117647059, + "loss": 0.4661, + "step": 22121 + }, + { + "epoch": 12.358659217877095, + "grad_norm": 0.4032188951969147, + "learning_rate": 0.0003835014005602241, + "loss": 0.4517, + "step": 22122 + }, + { + "epoch": 12.359217877094972, + "grad_norm": 0.41880401968955994, + "learning_rate": 0.0003834733893557423, + "loss": 0.3438, + "step": 22123 + }, + { + "epoch": 12.359776536312848, + "grad_norm": 2.1132102012634277, + "learning_rate": 0.0003834453781512605, + "loss": 0.4405, + "step": 22124 + }, + { + "epoch": 12.360335195530727, + "grad_norm": 0.496634304523468, + "learning_rate": 0.0003834173669467787, + "loss": 0.3625, + "step": 22125 + }, + { + "epoch": 12.360893854748603, + "grad_norm": 0.46067485213279724, + "learning_rate": 0.00038338935574229693, + "loss": 0.4481, + "step": 22126 + }, + { + "epoch": 12.36145251396648, + "grad_norm": 0.4707186222076416, + "learning_rate": 0.00038336134453781514, + "loss": 0.4023, + "step": 22127 + }, + { + "epoch": 12.362011173184358, + "grad_norm": 2.328150749206543, + "learning_rate": 0.00038333333333333334, + "loss": 0.4081, + "step": 22128 + }, + { + "epoch": 12.362569832402235, + "grad_norm": 0.6449905037879944, + "learning_rate": 0.00038330532212885155, + "loss": 0.4883, + "step": 22129 + }, + { + "epoch": 12.363128491620111, + "grad_norm": 0.5139626860618591, + "learning_rate": 0.00038327731092436976, + "loss": 0.4341, + "step": 22130 + }, + { + "epoch": 12.363687150837988, + "grad_norm": 1.5703986883163452, + "learning_rate": 0.00038324929971988796, + "loss": 0.461, + "step": 22131 + }, + { + "epoch": 12.364245810055866, + "grad_norm": 0.4426910877227783, + "learning_rate": 0.00038322128851540617, + "loss": 0.4139, + "step": 22132 + }, + { + "epoch": 12.364804469273743, + "grad_norm": 0.49806609749794006, + "learning_rate": 0.00038319327731092443, + "loss": 0.3995, + "step": 22133 + }, + { + "epoch": 12.36536312849162, + "grad_norm": 0.48100659251213074, + "learning_rate": 0.0003831652661064426, + "loss": 0.4737, + "step": 22134 + }, + { + "epoch": 12.365921787709498, + "grad_norm": 0.3108961284160614, + "learning_rate": 0.0003831372549019608, + "loss": 0.3132, + "step": 22135 + }, + { + "epoch": 12.366480446927374, + "grad_norm": 0.49499815702438354, + "learning_rate": 0.000383109243697479, + "loss": 0.3311, + "step": 22136 + }, + { + "epoch": 12.367039106145251, + "grad_norm": 0.7148417234420776, + "learning_rate": 0.0003830812324929972, + "loss": 0.3893, + "step": 22137 + }, + { + "epoch": 12.36759776536313, + "grad_norm": 0.7804326415061951, + "learning_rate": 0.00038305322128851546, + "loss": 0.3815, + "step": 22138 + }, + { + "epoch": 12.368156424581006, + "grad_norm": 0.6722231507301331, + "learning_rate": 0.0003830252100840336, + "loss": 0.4323, + "step": 22139 + }, + { + "epoch": 12.368715083798882, + "grad_norm": 0.645348846912384, + "learning_rate": 0.0003829971988795518, + "loss": 0.569, + "step": 22140 + }, + { + "epoch": 12.369273743016759, + "grad_norm": 0.6104491353034973, + "learning_rate": 0.0003829691876750701, + "loss": 0.5616, + "step": 22141 + }, + { + "epoch": 12.369832402234637, + "grad_norm": 1.1796618700027466, + "learning_rate": 0.00038294117647058823, + "loss": 0.4672, + "step": 22142 + }, + { + "epoch": 12.370391061452514, + "grad_norm": 0.8682754635810852, + "learning_rate": 0.0003829131652661065, + "loss": 0.4328, + "step": 22143 + }, + { + "epoch": 12.37094972067039, + "grad_norm": 0.5486952662467957, + "learning_rate": 0.00038288515406162464, + "loss": 0.4437, + "step": 22144 + }, + { + "epoch": 12.371508379888269, + "grad_norm": 0.5833659172058105, + "learning_rate": 0.00038285714285714285, + "loss": 0.4126, + "step": 22145 + }, + { + "epoch": 12.372067039106145, + "grad_norm": 0.6327529549598694, + "learning_rate": 0.0003828291316526611, + "loss": 0.3344, + "step": 22146 + }, + { + "epoch": 12.372625698324022, + "grad_norm": 0.6232379674911499, + "learning_rate": 0.00038280112044817926, + "loss": 0.485, + "step": 22147 + }, + { + "epoch": 12.3731843575419, + "grad_norm": 0.5624763369560242, + "learning_rate": 0.00038277310924369746, + "loss": 0.4345, + "step": 22148 + }, + { + "epoch": 12.373743016759777, + "grad_norm": 3.373488664627075, + "learning_rate": 0.0003827450980392157, + "loss": 0.5175, + "step": 22149 + }, + { + "epoch": 12.374301675977653, + "grad_norm": 10.65363883972168, + "learning_rate": 0.0003827170868347339, + "loss": 0.4708, + "step": 22150 + }, + { + "epoch": 12.37486033519553, + "grad_norm": 0.5036160349845886, + "learning_rate": 0.00038268907563025214, + "loss": 0.3163, + "step": 22151 + }, + { + "epoch": 12.375418994413408, + "grad_norm": 0.47631320357322693, + "learning_rate": 0.0003826610644257703, + "loss": 0.5565, + "step": 22152 + }, + { + "epoch": 12.375977653631285, + "grad_norm": 0.6353400945663452, + "learning_rate": 0.0003826330532212885, + "loss": 0.4484, + "step": 22153 + }, + { + "epoch": 12.376536312849161, + "grad_norm": 0.46554893255233765, + "learning_rate": 0.00038260504201680675, + "loss": 0.4478, + "step": 22154 + }, + { + "epoch": 12.37709497206704, + "grad_norm": 0.6046351790428162, + "learning_rate": 0.0003825770308123249, + "loss": 0.4124, + "step": 22155 + }, + { + "epoch": 12.377653631284916, + "grad_norm": 0.5346139073371887, + "learning_rate": 0.00038254901960784317, + "loss": 0.5369, + "step": 22156 + }, + { + "epoch": 12.378212290502793, + "grad_norm": 0.4957559108734131, + "learning_rate": 0.0003825210084033614, + "loss": 0.4417, + "step": 22157 + }, + { + "epoch": 12.378770949720671, + "grad_norm": 0.5550747513771057, + "learning_rate": 0.0003824929971988795, + "loss": 0.418, + "step": 22158 + }, + { + "epoch": 12.379329608938548, + "grad_norm": 0.42560771107673645, + "learning_rate": 0.0003824649859943978, + "loss": 0.2678, + "step": 22159 + }, + { + "epoch": 12.379888268156424, + "grad_norm": 0.62616366147995, + "learning_rate": 0.00038243697478991594, + "loss": 0.3804, + "step": 22160 + }, + { + "epoch": 12.380446927374301, + "grad_norm": 0.5516969561576843, + "learning_rate": 0.0003824089635854342, + "loss": 0.4018, + "step": 22161 + }, + { + "epoch": 12.38100558659218, + "grad_norm": 0.9069478511810303, + "learning_rate": 0.0003823809523809524, + "loss": 0.5022, + "step": 22162 + }, + { + "epoch": 12.381564245810056, + "grad_norm": 0.45761433243751526, + "learning_rate": 0.00038235294117647055, + "loss": 0.5687, + "step": 22163 + }, + { + "epoch": 12.382122905027932, + "grad_norm": 0.6873829960823059, + "learning_rate": 0.0003823249299719888, + "loss": 0.4693, + "step": 22164 + }, + { + "epoch": 12.38268156424581, + "grad_norm": 0.3658673167228699, + "learning_rate": 0.000382296918767507, + "loss": 0.381, + "step": 22165 + }, + { + "epoch": 12.383240223463687, + "grad_norm": 2.664102077484131, + "learning_rate": 0.0003822689075630252, + "loss": 0.4665, + "step": 22166 + }, + { + "epoch": 12.383798882681564, + "grad_norm": 4.503544807434082, + "learning_rate": 0.00038224089635854343, + "loss": 0.5338, + "step": 22167 + }, + { + "epoch": 12.38435754189944, + "grad_norm": 0.5606874227523804, + "learning_rate": 0.0003822128851540616, + "loss": 0.4669, + "step": 22168 + }, + { + "epoch": 12.384916201117319, + "grad_norm": 0.6650618314743042, + "learning_rate": 0.00038218487394957984, + "loss": 0.455, + "step": 22169 + }, + { + "epoch": 12.385474860335195, + "grad_norm": 0.6762911677360535, + "learning_rate": 0.00038215686274509805, + "loss": 0.4971, + "step": 22170 + }, + { + "epoch": 12.386033519553072, + "grad_norm": 0.6417225003242493, + "learning_rate": 0.00038212885154061626, + "loss": 0.6135, + "step": 22171 + }, + { + "epoch": 12.38659217877095, + "grad_norm": 0.7029066681861877, + "learning_rate": 0.00038210084033613446, + "loss": 0.4607, + "step": 22172 + }, + { + "epoch": 12.387150837988827, + "grad_norm": 0.49348974227905273, + "learning_rate": 0.00038207282913165267, + "loss": 0.4209, + "step": 22173 + }, + { + "epoch": 12.387709497206703, + "grad_norm": 0.5835763216018677, + "learning_rate": 0.0003820448179271709, + "loss": 0.45, + "step": 22174 + }, + { + "epoch": 12.388268156424582, + "grad_norm": 1.3397552967071533, + "learning_rate": 0.0003820168067226891, + "loss": 0.3866, + "step": 22175 + }, + { + "epoch": 12.388826815642458, + "grad_norm": 3.6497960090637207, + "learning_rate": 0.0003819887955182073, + "loss": 0.4991, + "step": 22176 + }, + { + "epoch": 12.389385474860335, + "grad_norm": 0.478920578956604, + "learning_rate": 0.0003819607843137255, + "loss": 0.4764, + "step": 22177 + }, + { + "epoch": 12.389944134078211, + "grad_norm": 0.5885761976242065, + "learning_rate": 0.0003819327731092437, + "loss": 0.4812, + "step": 22178 + }, + { + "epoch": 12.39050279329609, + "grad_norm": 0.38666030764579773, + "learning_rate": 0.0003819047619047619, + "loss": 0.3301, + "step": 22179 + }, + { + "epoch": 12.391061452513966, + "grad_norm": 0.5020449757575989, + "learning_rate": 0.0003818767507002801, + "loss": 0.4738, + "step": 22180 + }, + { + "epoch": 12.391620111731843, + "grad_norm": 0.4787828326225281, + "learning_rate": 0.00038184873949579837, + "loss": 0.434, + "step": 22181 + }, + { + "epoch": 12.392178770949721, + "grad_norm": 3.974708318710327, + "learning_rate": 0.0003818207282913165, + "loss": 0.4115, + "step": 22182 + }, + { + "epoch": 12.392737430167598, + "grad_norm": 0.6274554133415222, + "learning_rate": 0.00038179271708683473, + "loss": 0.479, + "step": 22183 + }, + { + "epoch": 12.393296089385474, + "grad_norm": 0.41928815841674805, + "learning_rate": 0.00038176470588235293, + "loss": 0.4133, + "step": 22184 + }, + { + "epoch": 12.393854748603353, + "grad_norm": 0.42631253600120544, + "learning_rate": 0.00038173669467787114, + "loss": 0.4161, + "step": 22185 + }, + { + "epoch": 12.39441340782123, + "grad_norm": 0.3683593273162842, + "learning_rate": 0.0003817086834733894, + "loss": 0.4397, + "step": 22186 + }, + { + "epoch": 12.394972067039106, + "grad_norm": 0.6384584307670593, + "learning_rate": 0.00038168067226890755, + "loss": 0.4684, + "step": 22187 + }, + { + "epoch": 12.395530726256982, + "grad_norm": 0.3789200484752655, + "learning_rate": 0.00038165266106442576, + "loss": 0.3977, + "step": 22188 + }, + { + "epoch": 12.39608938547486, + "grad_norm": 0.8033278584480286, + "learning_rate": 0.000381624649859944, + "loss": 0.3437, + "step": 22189 + }, + { + "epoch": 12.396648044692737, + "grad_norm": 0.7364555597305298, + "learning_rate": 0.00038159663865546217, + "loss": 0.4174, + "step": 22190 + }, + { + "epoch": 12.397206703910614, + "grad_norm": 0.5501554012298584, + "learning_rate": 0.00038156862745098043, + "loss": 0.5531, + "step": 22191 + }, + { + "epoch": 12.397765363128492, + "grad_norm": 0.5520037412643433, + "learning_rate": 0.0003815406162464986, + "loss": 0.5227, + "step": 22192 + }, + { + "epoch": 12.398324022346369, + "grad_norm": 0.43416768312454224, + "learning_rate": 0.0003815126050420168, + "loss": 0.4131, + "step": 22193 + }, + { + "epoch": 12.398882681564245, + "grad_norm": 0.6907766461372375, + "learning_rate": 0.00038148459383753505, + "loss": 0.4374, + "step": 22194 + }, + { + "epoch": 12.399441340782122, + "grad_norm": 0.6432982683181763, + "learning_rate": 0.0003814565826330532, + "loss": 0.4399, + "step": 22195 + }, + { + "epoch": 12.4, + "grad_norm": 0.3986281454563141, + "learning_rate": 0.00038142857142857146, + "loss": 0.3806, + "step": 22196 + }, + { + "epoch": 12.400558659217877, + "grad_norm": 0.6119687557220459, + "learning_rate": 0.00038140056022408967, + "loss": 0.6667, + "step": 22197 + }, + { + "epoch": 12.401117318435753, + "grad_norm": 0.397361695766449, + "learning_rate": 0.0003813725490196078, + "loss": 0.4009, + "step": 22198 + }, + { + "epoch": 12.401675977653632, + "grad_norm": 0.5475913286209106, + "learning_rate": 0.0003813445378151261, + "loss": 0.4018, + "step": 22199 + }, + { + "epoch": 12.402234636871508, + "grad_norm": 2.4565999507904053, + "learning_rate": 0.00038131652661064423, + "loss": 0.5485, + "step": 22200 + }, + { + "epoch": 12.402793296089385, + "grad_norm": 0.49206361174583435, + "learning_rate": 0.0003812885154061625, + "loss": 0.485, + "step": 22201 + }, + { + "epoch": 12.403351955307263, + "grad_norm": 0.689730703830719, + "learning_rate": 0.0003812605042016807, + "loss": 0.5028, + "step": 22202 + }, + { + "epoch": 12.40391061452514, + "grad_norm": 0.4874515235424042, + "learning_rate": 0.00038123249299719885, + "loss": 0.5252, + "step": 22203 + }, + { + "epoch": 12.404469273743016, + "grad_norm": 0.663447380065918, + "learning_rate": 0.0003812044817927171, + "loss": 0.5117, + "step": 22204 + }, + { + "epoch": 12.405027932960893, + "grad_norm": 0.7527303099632263, + "learning_rate": 0.0003811764705882353, + "loss": 0.2986, + "step": 22205 + }, + { + "epoch": 12.405586592178771, + "grad_norm": 0.650242805480957, + "learning_rate": 0.0003811484593837535, + "loss": 0.5176, + "step": 22206 + }, + { + "epoch": 12.406145251396648, + "grad_norm": 0.8362577557563782, + "learning_rate": 0.0003811204481792717, + "loss": 0.366, + "step": 22207 + }, + { + "epoch": 12.406703910614524, + "grad_norm": 0.5000497698783875, + "learning_rate": 0.0003810924369747899, + "loss": 0.4623, + "step": 22208 + }, + { + "epoch": 12.407262569832403, + "grad_norm": 0.43475061655044556, + "learning_rate": 0.00038106442577030814, + "loss": 0.3866, + "step": 22209 + }, + { + "epoch": 12.40782122905028, + "grad_norm": 0.4988260567188263, + "learning_rate": 0.00038103641456582634, + "loss": 0.3685, + "step": 22210 + }, + { + "epoch": 12.408379888268156, + "grad_norm": 0.4560915231704712, + "learning_rate": 0.00038100840336134455, + "loss": 0.4678, + "step": 22211 + }, + { + "epoch": 12.408938547486034, + "grad_norm": 0.929999589920044, + "learning_rate": 0.00038098039215686276, + "loss": 0.3801, + "step": 22212 + }, + { + "epoch": 12.40949720670391, + "grad_norm": 0.3650863766670227, + "learning_rate": 0.00038095238095238096, + "loss": 0.4155, + "step": 22213 + }, + { + "epoch": 12.410055865921787, + "grad_norm": 0.593860387802124, + "learning_rate": 0.00038092436974789917, + "loss": 0.3116, + "step": 22214 + }, + { + "epoch": 12.410614525139664, + "grad_norm": 0.3912438154220581, + "learning_rate": 0.0003808963585434174, + "loss": 0.4783, + "step": 22215 + }, + { + "epoch": 12.411173184357542, + "grad_norm": 0.5202620625495911, + "learning_rate": 0.0003808683473389356, + "loss": 0.4655, + "step": 22216 + }, + { + "epoch": 12.411731843575419, + "grad_norm": 0.445864200592041, + "learning_rate": 0.0003808403361344538, + "loss": 0.5187, + "step": 22217 + }, + { + "epoch": 12.412290502793295, + "grad_norm": 0.4446728229522705, + "learning_rate": 0.000380812324929972, + "loss": 0.3209, + "step": 22218 + }, + { + "epoch": 12.412849162011174, + "grad_norm": 3.3452131748199463, + "learning_rate": 0.0003807843137254902, + "loss": 0.6367, + "step": 22219 + }, + { + "epoch": 12.41340782122905, + "grad_norm": 0.5621975064277649, + "learning_rate": 0.0003807563025210084, + "loss": 0.4413, + "step": 22220 + }, + { + "epoch": 12.413966480446927, + "grad_norm": 0.41152825951576233, + "learning_rate": 0.00038072829131652667, + "loss": 0.4247, + "step": 22221 + }, + { + "epoch": 12.414525139664805, + "grad_norm": 0.4380573034286499, + "learning_rate": 0.0003807002801120448, + "loss": 0.4063, + "step": 22222 + }, + { + "epoch": 12.415083798882682, + "grad_norm": 0.4403732419013977, + "learning_rate": 0.000380672268907563, + "loss": 0.3676, + "step": 22223 + }, + { + "epoch": 12.415642458100558, + "grad_norm": 0.4695834219455719, + "learning_rate": 0.00038064425770308123, + "loss": 0.4188, + "step": 22224 + }, + { + "epoch": 12.416201117318435, + "grad_norm": 4.389756202697754, + "learning_rate": 0.00038061624649859943, + "loss": 0.3637, + "step": 22225 + }, + { + "epoch": 12.416759776536313, + "grad_norm": 0.4033837914466858, + "learning_rate": 0.0003805882352941177, + "loss": 0.3565, + "step": 22226 + }, + { + "epoch": 12.41731843575419, + "grad_norm": 0.4082547724246979, + "learning_rate": 0.00038056022408963585, + "loss": 0.4156, + "step": 22227 + }, + { + "epoch": 12.417877094972066, + "grad_norm": 0.4367124140262604, + "learning_rate": 0.00038053221288515405, + "loss": 0.3405, + "step": 22228 + }, + { + "epoch": 12.418435754189945, + "grad_norm": 0.7928216457366943, + "learning_rate": 0.0003805042016806723, + "loss": 0.5301, + "step": 22229 + }, + { + "epoch": 12.418994413407821, + "grad_norm": 0.5452948212623596, + "learning_rate": 0.00038047619047619046, + "loss": 0.4489, + "step": 22230 + }, + { + "epoch": 12.419553072625698, + "grad_norm": 0.44494733214378357, + "learning_rate": 0.0003804481792717087, + "loss": 0.3705, + "step": 22231 + }, + { + "epoch": 12.420111731843576, + "grad_norm": 0.5175156593322754, + "learning_rate": 0.0003804201680672269, + "loss": 0.397, + "step": 22232 + }, + { + "epoch": 12.420670391061453, + "grad_norm": 0.6962254643440247, + "learning_rate": 0.0003803921568627451, + "loss": 0.3377, + "step": 22233 + }, + { + "epoch": 12.42122905027933, + "grad_norm": 2.20906662940979, + "learning_rate": 0.00038036414565826334, + "loss": 0.4602, + "step": 22234 + }, + { + "epoch": 12.421787709497206, + "grad_norm": 0.5983732342720032, + "learning_rate": 0.0003803361344537815, + "loss": 0.4225, + "step": 22235 + }, + { + "epoch": 12.422346368715084, + "grad_norm": 0.6306063532829285, + "learning_rate": 0.00038030812324929975, + "loss": 0.486, + "step": 22236 + }, + { + "epoch": 12.422905027932961, + "grad_norm": 0.6764522194862366, + "learning_rate": 0.00038028011204481796, + "loss": 0.4634, + "step": 22237 + }, + { + "epoch": 12.423463687150837, + "grad_norm": 0.40836602449417114, + "learning_rate": 0.0003802521008403361, + "loss": 0.432, + "step": 22238 + }, + { + "epoch": 12.424022346368716, + "grad_norm": 0.32523348927497864, + "learning_rate": 0.0003802240896358544, + "loss": 0.3161, + "step": 22239 + }, + { + "epoch": 12.424581005586592, + "grad_norm": 0.603829562664032, + "learning_rate": 0.0003801960784313725, + "loss": 0.4671, + "step": 22240 + }, + { + "epoch": 12.425139664804469, + "grad_norm": 1.2986538410186768, + "learning_rate": 0.0003801680672268908, + "loss": 0.5123, + "step": 22241 + }, + { + "epoch": 12.425698324022346, + "grad_norm": 0.513359010219574, + "learning_rate": 0.000380140056022409, + "loss": 0.3993, + "step": 22242 + }, + { + "epoch": 12.426256983240224, + "grad_norm": 0.4499400556087494, + "learning_rate": 0.00038011204481792714, + "loss": 0.3895, + "step": 22243 + }, + { + "epoch": 12.4268156424581, + "grad_norm": 0.8446559906005859, + "learning_rate": 0.0003800840336134454, + "loss": 0.434, + "step": 22244 + }, + { + "epoch": 12.427374301675977, + "grad_norm": 1.5230028629302979, + "learning_rate": 0.0003800560224089636, + "loss": 0.4332, + "step": 22245 + }, + { + "epoch": 12.427932960893855, + "grad_norm": 5.320984363555908, + "learning_rate": 0.0003800280112044818, + "loss": 0.3902, + "step": 22246 + }, + { + "epoch": 12.428491620111732, + "grad_norm": 0.39092665910720825, + "learning_rate": 0.00038, + "loss": 0.4122, + "step": 22247 + }, + { + "epoch": 12.429050279329608, + "grad_norm": 2.8248467445373535, + "learning_rate": 0.00037997198879551817, + "loss": 0.4669, + "step": 22248 + }, + { + "epoch": 12.429608938547487, + "grad_norm": 0.5952622890472412, + "learning_rate": 0.00037994397759103643, + "loss": 0.4862, + "step": 22249 + }, + { + "epoch": 12.430167597765363, + "grad_norm": 0.46401551365852356, + "learning_rate": 0.00037991596638655464, + "loss": 0.3724, + "step": 22250 + }, + { + "epoch": 12.43072625698324, + "grad_norm": 0.49525415897369385, + "learning_rate": 0.00037988795518207284, + "loss": 0.3572, + "step": 22251 + }, + { + "epoch": 12.431284916201117, + "grad_norm": 0.678400993347168, + "learning_rate": 0.00037985994397759105, + "loss": 0.4558, + "step": 22252 + }, + { + "epoch": 12.431843575418995, + "grad_norm": 5.064430236816406, + "learning_rate": 0.00037983193277310926, + "loss": 0.31, + "step": 22253 + }, + { + "epoch": 12.432402234636871, + "grad_norm": 0.5133535861968994, + "learning_rate": 0.00037980392156862746, + "loss": 0.5087, + "step": 22254 + }, + { + "epoch": 12.432960893854748, + "grad_norm": 0.4685398042201996, + "learning_rate": 0.00037977591036414567, + "loss": 0.4688, + "step": 22255 + }, + { + "epoch": 12.433519553072626, + "grad_norm": 0.6730914115905762, + "learning_rate": 0.0003797478991596639, + "loss": 0.5012, + "step": 22256 + }, + { + "epoch": 12.434078212290503, + "grad_norm": 0.5985603332519531, + "learning_rate": 0.0003797198879551821, + "loss": 0.4412, + "step": 22257 + }, + { + "epoch": 12.43463687150838, + "grad_norm": 0.8360685110092163, + "learning_rate": 0.0003796918767507003, + "loss": 0.4078, + "step": 22258 + }, + { + "epoch": 12.435195530726258, + "grad_norm": 0.5260396003723145, + "learning_rate": 0.0003796638655462185, + "loss": 0.4636, + "step": 22259 + }, + { + "epoch": 12.435754189944134, + "grad_norm": 1.8112053871154785, + "learning_rate": 0.0003796358543417367, + "loss": 0.4253, + "step": 22260 + }, + { + "epoch": 12.436312849162011, + "grad_norm": 0.4029355049133301, + "learning_rate": 0.0003796078431372549, + "loss": 0.4499, + "step": 22261 + }, + { + "epoch": 12.436871508379888, + "grad_norm": 0.3339584171772003, + "learning_rate": 0.0003795798319327731, + "loss": 0.2828, + "step": 22262 + }, + { + "epoch": 12.437430167597766, + "grad_norm": 1.3417764902114868, + "learning_rate": 0.0003795518207282913, + "loss": 0.4251, + "step": 22263 + }, + { + "epoch": 12.437988826815642, + "grad_norm": 0.4353804290294647, + "learning_rate": 0.0003795238095238095, + "loss": 0.4752, + "step": 22264 + }, + { + "epoch": 12.438547486033519, + "grad_norm": 0.6685983538627625, + "learning_rate": 0.00037949579831932773, + "loss": 0.3468, + "step": 22265 + }, + { + "epoch": 12.439106145251397, + "grad_norm": 0.45141395926475525, + "learning_rate": 0.00037946778711484593, + "loss": 0.4109, + "step": 22266 + }, + { + "epoch": 12.439664804469274, + "grad_norm": 0.41326484084129333, + "learning_rate": 0.00037943977591036414, + "loss": 0.4098, + "step": 22267 + }, + { + "epoch": 12.44022346368715, + "grad_norm": 0.5875102281570435, + "learning_rate": 0.00037941176470588235, + "loss": 0.4737, + "step": 22268 + }, + { + "epoch": 12.440782122905027, + "grad_norm": 0.4500752389431, + "learning_rate": 0.0003793837535014006, + "loss": 0.4482, + "step": 22269 + }, + { + "epoch": 12.441340782122905, + "grad_norm": 3.4752230644226074, + "learning_rate": 0.00037935574229691876, + "loss": 0.4877, + "step": 22270 + }, + { + "epoch": 12.441899441340782, + "grad_norm": 1.4074817895889282, + "learning_rate": 0.00037932773109243696, + "loss": 0.544, + "step": 22271 + }, + { + "epoch": 12.442458100558659, + "grad_norm": 1.156740427017212, + "learning_rate": 0.00037929971988795517, + "loss": 0.3871, + "step": 22272 + }, + { + "epoch": 12.443016759776537, + "grad_norm": 0.43119776248931885, + "learning_rate": 0.0003792717086834734, + "loss": 0.3439, + "step": 22273 + }, + { + "epoch": 12.443575418994413, + "grad_norm": 0.5617696046829224, + "learning_rate": 0.00037924369747899164, + "loss": 0.4565, + "step": 22274 + }, + { + "epoch": 12.44413407821229, + "grad_norm": 0.5131112337112427, + "learning_rate": 0.0003792156862745098, + "loss": 0.4534, + "step": 22275 + }, + { + "epoch": 12.444692737430168, + "grad_norm": 0.48486822843551636, + "learning_rate": 0.000379187675070028, + "loss": 0.4206, + "step": 22276 + }, + { + "epoch": 12.445251396648045, + "grad_norm": 0.6418407559394836, + "learning_rate": 0.00037915966386554625, + "loss": 0.4712, + "step": 22277 + }, + { + "epoch": 12.445810055865921, + "grad_norm": 0.7483553886413574, + "learning_rate": 0.0003791316526610644, + "loss": 0.5092, + "step": 22278 + }, + { + "epoch": 12.446368715083798, + "grad_norm": 0.641573965549469, + "learning_rate": 0.00037910364145658267, + "loss": 0.4297, + "step": 22279 + }, + { + "epoch": 12.446927374301676, + "grad_norm": 1.379223346710205, + "learning_rate": 0.0003790756302521008, + "loss": 0.3879, + "step": 22280 + }, + { + "epoch": 12.447486033519553, + "grad_norm": 0.7533921003341675, + "learning_rate": 0.000379047619047619, + "loss": 0.5088, + "step": 22281 + }, + { + "epoch": 12.44804469273743, + "grad_norm": 0.5609806180000305, + "learning_rate": 0.0003790196078431373, + "loss": 0.4506, + "step": 22282 + }, + { + "epoch": 12.448603351955308, + "grad_norm": 0.42809367179870605, + "learning_rate": 0.00037899159663865544, + "loss": 0.4643, + "step": 22283 + }, + { + "epoch": 12.449162011173184, + "grad_norm": 0.8922832608222961, + "learning_rate": 0.0003789635854341737, + "loss": 0.4519, + "step": 22284 + }, + { + "epoch": 12.449720670391061, + "grad_norm": 0.6599184274673462, + "learning_rate": 0.0003789355742296919, + "loss": 0.3862, + "step": 22285 + }, + { + "epoch": 12.45027932960894, + "grad_norm": 0.6763696670532227, + "learning_rate": 0.00037890756302521005, + "loss": 0.5211, + "step": 22286 + }, + { + "epoch": 12.450837988826816, + "grad_norm": 0.4426666498184204, + "learning_rate": 0.0003788795518207283, + "loss": 0.3664, + "step": 22287 + }, + { + "epoch": 12.451396648044692, + "grad_norm": 0.6292835474014282, + "learning_rate": 0.00037885154061624647, + "loss": 0.4443, + "step": 22288 + }, + { + "epoch": 12.451955307262569, + "grad_norm": 0.6607323288917542, + "learning_rate": 0.0003788235294117647, + "loss": 0.3689, + "step": 22289 + }, + { + "epoch": 12.452513966480447, + "grad_norm": 0.46668827533721924, + "learning_rate": 0.00037879551820728293, + "loss": 0.307, + "step": 22290 + }, + { + "epoch": 12.453072625698324, + "grad_norm": 0.4630614221096039, + "learning_rate": 0.0003787675070028011, + "loss": 0.3133, + "step": 22291 + }, + { + "epoch": 12.4536312849162, + "grad_norm": 0.4242039918899536, + "learning_rate": 0.00037873949579831934, + "loss": 0.3941, + "step": 22292 + }, + { + "epoch": 12.454189944134079, + "grad_norm": 0.45228031277656555, + "learning_rate": 0.00037871148459383755, + "loss": 0.4242, + "step": 22293 + }, + { + "epoch": 12.454748603351955, + "grad_norm": 0.6884719133377075, + "learning_rate": 0.00037868347338935576, + "loss": 0.383, + "step": 22294 + }, + { + "epoch": 12.455307262569832, + "grad_norm": 0.3652999997138977, + "learning_rate": 0.00037865546218487396, + "loss": 0.3999, + "step": 22295 + }, + { + "epoch": 12.45586592178771, + "grad_norm": 0.43142572045326233, + "learning_rate": 0.0003786274509803921, + "loss": 0.4736, + "step": 22296 + }, + { + "epoch": 12.456424581005587, + "grad_norm": 0.5138300061225891, + "learning_rate": 0.0003785994397759104, + "loss": 0.4097, + "step": 22297 + }, + { + "epoch": 12.456983240223463, + "grad_norm": 0.5320408344268799, + "learning_rate": 0.0003785714285714286, + "loss": 0.4147, + "step": 22298 + }, + { + "epoch": 12.45754189944134, + "grad_norm": 1.596933364868164, + "learning_rate": 0.0003785434173669468, + "loss": 0.3769, + "step": 22299 + }, + { + "epoch": 12.458100558659218, + "grad_norm": 1.4512317180633545, + "learning_rate": 0.000378515406162465, + "loss": 0.6775, + "step": 22300 + }, + { + "epoch": 12.458659217877095, + "grad_norm": 4.665525436401367, + "learning_rate": 0.0003784873949579832, + "loss": 0.3617, + "step": 22301 + }, + { + "epoch": 12.459217877094972, + "grad_norm": 0.6398403644561768, + "learning_rate": 0.0003784593837535014, + "loss": 0.4885, + "step": 22302 + }, + { + "epoch": 12.45977653631285, + "grad_norm": 0.6020791530609131, + "learning_rate": 0.0003784313725490196, + "loss": 0.4942, + "step": 22303 + }, + { + "epoch": 12.460335195530726, + "grad_norm": 0.5313255786895752, + "learning_rate": 0.00037840336134453787, + "loss": 0.3656, + "step": 22304 + }, + { + "epoch": 12.460893854748603, + "grad_norm": 0.37811189889907837, + "learning_rate": 0.000378375350140056, + "loss": 0.4451, + "step": 22305 + }, + { + "epoch": 12.461452513966481, + "grad_norm": 0.5424933433532715, + "learning_rate": 0.00037834733893557423, + "loss": 0.4903, + "step": 22306 + }, + { + "epoch": 12.462011173184358, + "grad_norm": 0.631622314453125, + "learning_rate": 0.00037831932773109243, + "loss": 0.3897, + "step": 22307 + }, + { + "epoch": 12.462569832402234, + "grad_norm": 0.46100568771362305, + "learning_rate": 0.00037829131652661064, + "loss": 0.4792, + "step": 22308 + }, + { + "epoch": 12.463128491620111, + "grad_norm": 0.6238300800323486, + "learning_rate": 0.0003782633053221289, + "loss": 0.3555, + "step": 22309 + }, + { + "epoch": 12.46368715083799, + "grad_norm": 2.455458164215088, + "learning_rate": 0.00037823529411764705, + "loss": 0.5361, + "step": 22310 + }, + { + "epoch": 12.464245810055866, + "grad_norm": 0.5340656638145447, + "learning_rate": 0.00037820728291316526, + "loss": 0.405, + "step": 22311 + }, + { + "epoch": 12.464804469273743, + "grad_norm": 0.4993051588535309, + "learning_rate": 0.0003781792717086835, + "loss": 0.3895, + "step": 22312 + }, + { + "epoch": 12.46536312849162, + "grad_norm": 0.4735506474971771, + "learning_rate": 0.00037815126050420167, + "loss": 0.429, + "step": 22313 + }, + { + "epoch": 12.465921787709497, + "grad_norm": 0.43139177560806274, + "learning_rate": 0.00037812324929971993, + "loss": 0.4543, + "step": 22314 + }, + { + "epoch": 12.466480446927374, + "grad_norm": 0.7186444401741028, + "learning_rate": 0.0003780952380952381, + "loss": 0.3158, + "step": 22315 + }, + { + "epoch": 12.46703910614525, + "grad_norm": 0.37876617908477783, + "learning_rate": 0.0003780672268907563, + "loss": 0.3068, + "step": 22316 + }, + { + "epoch": 12.467597765363129, + "grad_norm": 0.6739128232002258, + "learning_rate": 0.00037803921568627455, + "loss": 0.395, + "step": 22317 + }, + { + "epoch": 12.468156424581005, + "grad_norm": 0.4372479021549225, + "learning_rate": 0.0003780112044817927, + "loss": 0.4616, + "step": 22318 + }, + { + "epoch": 12.468715083798882, + "grad_norm": 0.476572722196579, + "learning_rate": 0.00037798319327731096, + "loss": 0.3813, + "step": 22319 + }, + { + "epoch": 12.46927374301676, + "grad_norm": 0.313006192445755, + "learning_rate": 0.00037795518207282917, + "loss": 0.3633, + "step": 22320 + }, + { + "epoch": 12.469832402234637, + "grad_norm": 0.42260926961898804, + "learning_rate": 0.0003779271708683473, + "loss": 0.379, + "step": 22321 + }, + { + "epoch": 12.470391061452514, + "grad_norm": 0.547232449054718, + "learning_rate": 0.0003778991596638656, + "loss": 0.5649, + "step": 22322 + }, + { + "epoch": 12.470949720670392, + "grad_norm": 0.5282040238380432, + "learning_rate": 0.00037787114845938373, + "loss": 0.6735, + "step": 22323 + }, + { + "epoch": 12.471508379888268, + "grad_norm": 0.9079980254173279, + "learning_rate": 0.000377843137254902, + "loss": 0.4699, + "step": 22324 + }, + { + "epoch": 12.472067039106145, + "grad_norm": 0.5286577939987183, + "learning_rate": 0.0003778151260504202, + "loss": 0.5274, + "step": 22325 + }, + { + "epoch": 12.472625698324022, + "grad_norm": 0.5055443048477173, + "learning_rate": 0.00037778711484593835, + "loss": 0.427, + "step": 22326 + }, + { + "epoch": 12.4731843575419, + "grad_norm": 0.5971497893333435, + "learning_rate": 0.0003777591036414566, + "loss": 0.3526, + "step": 22327 + }, + { + "epoch": 12.473743016759776, + "grad_norm": 0.8135359287261963, + "learning_rate": 0.0003777310924369748, + "loss": 0.5703, + "step": 22328 + }, + { + "epoch": 12.474301675977653, + "grad_norm": 0.4614641070365906, + "learning_rate": 0.000377703081232493, + "loss": 0.5145, + "step": 22329 + }, + { + "epoch": 12.474860335195531, + "grad_norm": 0.5052471160888672, + "learning_rate": 0.0003776750700280112, + "loss": 0.4039, + "step": 22330 + }, + { + "epoch": 12.475418994413408, + "grad_norm": 0.4049806296825409, + "learning_rate": 0.0003776470588235294, + "loss": 0.3715, + "step": 22331 + }, + { + "epoch": 12.475977653631285, + "grad_norm": 0.5324528217315674, + "learning_rate": 0.00037761904761904764, + "loss": 0.3494, + "step": 22332 + }, + { + "epoch": 12.476536312849163, + "grad_norm": 0.5156274437904358, + "learning_rate": 0.00037759103641456584, + "loss": 0.3729, + "step": 22333 + }, + { + "epoch": 12.47709497206704, + "grad_norm": 0.34459835290908813, + "learning_rate": 0.00037756302521008405, + "loss": 0.3578, + "step": 22334 + }, + { + "epoch": 12.477653631284916, + "grad_norm": 0.46284016966819763, + "learning_rate": 0.00037753501400560226, + "loss": 0.4223, + "step": 22335 + }, + { + "epoch": 12.478212290502793, + "grad_norm": 0.6220923066139221, + "learning_rate": 0.00037750700280112046, + "loss": 0.4281, + "step": 22336 + }, + { + "epoch": 12.478770949720671, + "grad_norm": 0.6622411012649536, + "learning_rate": 0.00037747899159663867, + "loss": 0.4348, + "step": 22337 + }, + { + "epoch": 12.479329608938547, + "grad_norm": 0.43630436062812805, + "learning_rate": 0.0003774509803921569, + "loss": 0.4421, + "step": 22338 + }, + { + "epoch": 12.479888268156424, + "grad_norm": 0.4875757396221161, + "learning_rate": 0.0003774229691876751, + "loss": 0.4511, + "step": 22339 + }, + { + "epoch": 12.480446927374302, + "grad_norm": 0.5452539324760437, + "learning_rate": 0.0003773949579831933, + "loss": 0.4978, + "step": 22340 + }, + { + "epoch": 12.481005586592179, + "grad_norm": 0.7059219479560852, + "learning_rate": 0.0003773669467787115, + "loss": 0.6353, + "step": 22341 + }, + { + "epoch": 12.481564245810056, + "grad_norm": 0.3773597478866577, + "learning_rate": 0.0003773389355742297, + "loss": 0.3723, + "step": 22342 + }, + { + "epoch": 12.482122905027932, + "grad_norm": 0.4117157757282257, + "learning_rate": 0.0003773109243697479, + "loss": 0.4021, + "step": 22343 + }, + { + "epoch": 12.48268156424581, + "grad_norm": 0.7017507553100586, + "learning_rate": 0.00037728291316526617, + "loss": 0.4756, + "step": 22344 + }, + { + "epoch": 12.483240223463687, + "grad_norm": 0.7124935984611511, + "learning_rate": 0.0003772549019607843, + "loss": 0.5007, + "step": 22345 + }, + { + "epoch": 12.483798882681564, + "grad_norm": 0.5178902745246887, + "learning_rate": 0.0003772268907563025, + "loss": 0.3957, + "step": 22346 + }, + { + "epoch": 12.484357541899442, + "grad_norm": 0.8460575938224792, + "learning_rate": 0.00037719887955182073, + "loss": 0.4692, + "step": 22347 + }, + { + "epoch": 12.484916201117318, + "grad_norm": 0.48622167110443115, + "learning_rate": 0.00037717086834733893, + "loss": 0.4074, + "step": 22348 + }, + { + "epoch": 12.485474860335195, + "grad_norm": 0.5220047235488892, + "learning_rate": 0.0003771428571428572, + "loss": 0.3927, + "step": 22349 + }, + { + "epoch": 12.486033519553073, + "grad_norm": 0.6508392095565796, + "learning_rate": 0.00037711484593837535, + "loss": 0.381, + "step": 22350 + }, + { + "epoch": 12.48659217877095, + "grad_norm": 1.6685177087783813, + "learning_rate": 0.00037708683473389355, + "loss": 0.4279, + "step": 22351 + }, + { + "epoch": 12.487150837988827, + "grad_norm": 0.5280059576034546, + "learning_rate": 0.0003770588235294118, + "loss": 0.3787, + "step": 22352 + }, + { + "epoch": 12.487709497206703, + "grad_norm": 1.572139024734497, + "learning_rate": 0.00037703081232492996, + "loss": 0.4289, + "step": 22353 + }, + { + "epoch": 12.488268156424581, + "grad_norm": 0.7184972167015076, + "learning_rate": 0.0003770028011204482, + "loss": 0.3445, + "step": 22354 + }, + { + "epoch": 12.488826815642458, + "grad_norm": 0.5787521004676819, + "learning_rate": 0.0003769747899159664, + "loss": 0.4458, + "step": 22355 + }, + { + "epoch": 12.489385474860335, + "grad_norm": 1.8154767751693726, + "learning_rate": 0.0003769467787114846, + "loss": 0.3592, + "step": 22356 + }, + { + "epoch": 12.489944134078213, + "grad_norm": 0.5421994924545288, + "learning_rate": 0.00037691876750700284, + "loss": 0.5095, + "step": 22357 + }, + { + "epoch": 12.49050279329609, + "grad_norm": 0.4253360629081726, + "learning_rate": 0.000376890756302521, + "loss": 0.4143, + "step": 22358 + }, + { + "epoch": 12.491061452513966, + "grad_norm": 0.4831967055797577, + "learning_rate": 0.00037686274509803925, + "loss": 0.3922, + "step": 22359 + }, + { + "epoch": 12.491620111731844, + "grad_norm": 0.5803911685943604, + "learning_rate": 0.00037683473389355746, + "loss": 0.3825, + "step": 22360 + }, + { + "epoch": 12.492178770949721, + "grad_norm": 0.5731937289237976, + "learning_rate": 0.0003768067226890756, + "loss": 0.3661, + "step": 22361 + }, + { + "epoch": 12.492737430167598, + "grad_norm": 4.877864837646484, + "learning_rate": 0.0003767787114845939, + "loss": 0.4392, + "step": 22362 + }, + { + "epoch": 12.493296089385474, + "grad_norm": 0.8301437497138977, + "learning_rate": 0.000376750700280112, + "loss": 0.5388, + "step": 22363 + }, + { + "epoch": 12.493854748603352, + "grad_norm": 0.5814093947410583, + "learning_rate": 0.0003767226890756303, + "loss": 0.4255, + "step": 22364 + }, + { + "epoch": 12.494413407821229, + "grad_norm": 0.8469949960708618, + "learning_rate": 0.0003766946778711485, + "loss": 0.5702, + "step": 22365 + }, + { + "epoch": 12.494972067039106, + "grad_norm": 0.40682846307754517, + "learning_rate": 0.00037666666666666664, + "loss": 0.3448, + "step": 22366 + }, + { + "epoch": 12.495530726256984, + "grad_norm": 0.5928486585617065, + "learning_rate": 0.0003766386554621849, + "loss": 0.4481, + "step": 22367 + }, + { + "epoch": 12.49608938547486, + "grad_norm": 0.5610290765762329, + "learning_rate": 0.0003766106442577031, + "loss": 0.4467, + "step": 22368 + }, + { + "epoch": 12.496648044692737, + "grad_norm": 0.6019773483276367, + "learning_rate": 0.0003765826330532213, + "loss": 0.3853, + "step": 22369 + }, + { + "epoch": 12.497206703910614, + "grad_norm": 0.8053200244903564, + "learning_rate": 0.0003765546218487395, + "loss": 0.4132, + "step": 22370 + }, + { + "epoch": 12.497765363128492, + "grad_norm": 0.7026535868644714, + "learning_rate": 0.00037652661064425767, + "loss": 0.3959, + "step": 22371 + }, + { + "epoch": 12.498324022346369, + "grad_norm": 0.3878311812877655, + "learning_rate": 0.00037649859943977593, + "loss": 0.312, + "step": 22372 + }, + { + "epoch": 12.498882681564245, + "grad_norm": 1.6654926538467407, + "learning_rate": 0.00037647058823529414, + "loss": 0.4812, + "step": 22373 + }, + { + "epoch": 12.499441340782123, + "grad_norm": 0.6538389325141907, + "learning_rate": 0.0003764425770308123, + "loss": 0.4063, + "step": 22374 + }, + { + "epoch": 12.5, + "grad_norm": 0.7435426712036133, + "learning_rate": 0.00037641456582633055, + "loss": 0.3508, + "step": 22375 + }, + { + "epoch": 12.500558659217877, + "grad_norm": 6.916824817657471, + "learning_rate": 0.00037638655462184876, + "loss": 0.4492, + "step": 22376 + }, + { + "epoch": 12.501117318435755, + "grad_norm": 0.5683846473693848, + "learning_rate": 0.00037635854341736696, + "loss": 0.4936, + "step": 22377 + }, + { + "epoch": 12.501675977653631, + "grad_norm": 0.5606112480163574, + "learning_rate": 0.00037633053221288517, + "loss": 0.415, + "step": 22378 + }, + { + "epoch": 12.502234636871508, + "grad_norm": 0.6635780334472656, + "learning_rate": 0.0003763025210084033, + "loss": 0.3643, + "step": 22379 + }, + { + "epoch": 12.502793296089386, + "grad_norm": 2.184018611907959, + "learning_rate": 0.0003762745098039216, + "loss": 0.4611, + "step": 22380 + }, + { + "epoch": 12.503351955307263, + "grad_norm": 0.44623345136642456, + "learning_rate": 0.0003762464985994398, + "loss": 0.4458, + "step": 22381 + }, + { + "epoch": 12.50391061452514, + "grad_norm": 0.3886384665966034, + "learning_rate": 0.000376218487394958, + "loss": 0.3932, + "step": 22382 + }, + { + "epoch": 12.504469273743016, + "grad_norm": 0.5578534603118896, + "learning_rate": 0.0003761904761904762, + "loss": 0.4275, + "step": 22383 + }, + { + "epoch": 12.505027932960894, + "grad_norm": 0.6590132713317871, + "learning_rate": 0.0003761624649859944, + "loss": 0.4364, + "step": 22384 + }, + { + "epoch": 12.505586592178771, + "grad_norm": 0.48075932264328003, + "learning_rate": 0.0003761344537815126, + "loss": 0.3414, + "step": 22385 + }, + { + "epoch": 12.506145251396648, + "grad_norm": 0.38701489567756653, + "learning_rate": 0.0003761064425770308, + "loss": 0.439, + "step": 22386 + }, + { + "epoch": 12.506703910614526, + "grad_norm": 3.3040475845336914, + "learning_rate": 0.000376078431372549, + "loss": 0.5619, + "step": 22387 + }, + { + "epoch": 12.507262569832402, + "grad_norm": 0.5540610551834106, + "learning_rate": 0.00037605042016806723, + "loss": 0.5202, + "step": 22388 + }, + { + "epoch": 12.507821229050279, + "grad_norm": 0.45256611704826355, + "learning_rate": 0.00037602240896358543, + "loss": 0.4175, + "step": 22389 + }, + { + "epoch": 12.508379888268156, + "grad_norm": 4.7016377449035645, + "learning_rate": 0.00037599439775910364, + "loss": 0.4584, + "step": 22390 + }, + { + "epoch": 12.508938547486034, + "grad_norm": 0.37993180751800537, + "learning_rate": 0.00037596638655462185, + "loss": 0.3877, + "step": 22391 + }, + { + "epoch": 12.50949720670391, + "grad_norm": 0.37144502997398376, + "learning_rate": 0.0003759383753501401, + "loss": 0.3715, + "step": 22392 + }, + { + "epoch": 12.510055865921787, + "grad_norm": 0.3228665888309479, + "learning_rate": 0.00037591036414565826, + "loss": 0.2991, + "step": 22393 + }, + { + "epoch": 12.510614525139665, + "grad_norm": 0.39813941717147827, + "learning_rate": 0.00037588235294117646, + "loss": 0.3389, + "step": 22394 + }, + { + "epoch": 12.511173184357542, + "grad_norm": 0.7362692952156067, + "learning_rate": 0.00037585434173669467, + "loss": 0.441, + "step": 22395 + }, + { + "epoch": 12.511731843575419, + "grad_norm": 0.6402891874313354, + "learning_rate": 0.0003758263305322129, + "loss": 0.471, + "step": 22396 + }, + { + "epoch": 12.512290502793297, + "grad_norm": 0.9931805729866028, + "learning_rate": 0.00037579831932773114, + "loss": 0.4023, + "step": 22397 + }, + { + "epoch": 12.512849162011173, + "grad_norm": 0.9622296094894409, + "learning_rate": 0.0003757703081232493, + "loss": 0.3927, + "step": 22398 + }, + { + "epoch": 12.51340782122905, + "grad_norm": 0.521777331829071, + "learning_rate": 0.0003757422969187675, + "loss": 0.3422, + "step": 22399 + }, + { + "epoch": 12.513966480446927, + "grad_norm": 0.49033617973327637, + "learning_rate": 0.00037571428571428575, + "loss": 0.3294, + "step": 22400 + }, + { + "epoch": 12.514525139664805, + "grad_norm": 0.5281840562820435, + "learning_rate": 0.0003756862745098039, + "loss": 0.3958, + "step": 22401 + }, + { + "epoch": 12.515083798882682, + "grad_norm": 0.4699585735797882, + "learning_rate": 0.00037565826330532217, + "loss": 0.4798, + "step": 22402 + }, + { + "epoch": 12.515642458100558, + "grad_norm": 0.8030683994293213, + "learning_rate": 0.0003756302521008403, + "loss": 0.4972, + "step": 22403 + }, + { + "epoch": 12.516201117318436, + "grad_norm": 1.1272097826004028, + "learning_rate": 0.0003756022408963585, + "loss": 0.3031, + "step": 22404 + }, + { + "epoch": 12.516759776536313, + "grad_norm": 0.4536973237991333, + "learning_rate": 0.0003755742296918768, + "loss": 0.3132, + "step": 22405 + }, + { + "epoch": 12.51731843575419, + "grad_norm": 0.7675778269767761, + "learning_rate": 0.00037554621848739494, + "loss": 0.4384, + "step": 22406 + }, + { + "epoch": 12.517877094972068, + "grad_norm": 1.0920019149780273, + "learning_rate": 0.0003755182072829132, + "loss": 0.4312, + "step": 22407 + }, + { + "epoch": 12.518435754189944, + "grad_norm": 1.5458168983459473, + "learning_rate": 0.0003754901960784314, + "loss": 0.4865, + "step": 22408 + }, + { + "epoch": 12.518994413407821, + "grad_norm": 1.7475107908248901, + "learning_rate": 0.00037546218487394955, + "loss": 0.4396, + "step": 22409 + }, + { + "epoch": 12.519553072625698, + "grad_norm": 0.39102673530578613, + "learning_rate": 0.0003754341736694678, + "loss": 0.2953, + "step": 22410 + }, + { + "epoch": 12.520111731843576, + "grad_norm": 0.3776918649673462, + "learning_rate": 0.00037540616246498597, + "loss": 0.4415, + "step": 22411 + }, + { + "epoch": 12.520670391061453, + "grad_norm": 0.48570331931114197, + "learning_rate": 0.0003753781512605042, + "loss": 0.511, + "step": 22412 + }, + { + "epoch": 12.521229050279329, + "grad_norm": 1.24405038356781, + "learning_rate": 0.00037535014005602243, + "loss": 0.4123, + "step": 22413 + }, + { + "epoch": 12.521787709497207, + "grad_norm": 0.7005947232246399, + "learning_rate": 0.0003753221288515406, + "loss": 0.4271, + "step": 22414 + }, + { + "epoch": 12.522346368715084, + "grad_norm": 0.5677045583724976, + "learning_rate": 0.00037529411764705884, + "loss": 0.5057, + "step": 22415 + }, + { + "epoch": 12.52290502793296, + "grad_norm": 0.41748666763305664, + "learning_rate": 0.00037526610644257705, + "loss": 0.4178, + "step": 22416 + }, + { + "epoch": 12.523463687150837, + "grad_norm": 0.4275798201560974, + "learning_rate": 0.00037523809523809526, + "loss": 0.4096, + "step": 22417 + }, + { + "epoch": 12.524022346368715, + "grad_norm": 0.4180930554866791, + "learning_rate": 0.00037521008403361346, + "loss": 0.4446, + "step": 22418 + }, + { + "epoch": 12.524581005586592, + "grad_norm": 9.960973739624023, + "learning_rate": 0.0003751820728291316, + "loss": 0.5341, + "step": 22419 + }, + { + "epoch": 12.525139664804469, + "grad_norm": 3.246262311935425, + "learning_rate": 0.0003751540616246499, + "loss": 0.4872, + "step": 22420 + }, + { + "epoch": 12.525698324022347, + "grad_norm": 0.584867537021637, + "learning_rate": 0.0003751260504201681, + "loss": 0.5785, + "step": 22421 + }, + { + "epoch": 12.526256983240224, + "grad_norm": 0.5116361379623413, + "learning_rate": 0.0003750980392156863, + "loss": 0.3505, + "step": 22422 + }, + { + "epoch": 12.5268156424581, + "grad_norm": 0.3268989026546478, + "learning_rate": 0.0003750700280112045, + "loss": 0.3381, + "step": 22423 + }, + { + "epoch": 12.527374301675978, + "grad_norm": 0.40259113907814026, + "learning_rate": 0.0003750420168067227, + "loss": 0.3822, + "step": 22424 + }, + { + "epoch": 12.527932960893855, + "grad_norm": 0.48108726739883423, + "learning_rate": 0.0003750140056022409, + "loss": 0.4306, + "step": 22425 + }, + { + "epoch": 12.528491620111732, + "grad_norm": 0.601531982421875, + "learning_rate": 0.0003749859943977591, + "loss": 0.5006, + "step": 22426 + }, + { + "epoch": 12.529050279329608, + "grad_norm": 0.47578126192092896, + "learning_rate": 0.0003749579831932773, + "loss": 0.4961, + "step": 22427 + }, + { + "epoch": 12.529608938547486, + "grad_norm": 0.5015585422515869, + "learning_rate": 0.0003749299719887955, + "loss": 0.4548, + "step": 22428 + }, + { + "epoch": 12.530167597765363, + "grad_norm": 0.7206169962882996, + "learning_rate": 0.00037490196078431373, + "loss": 0.4622, + "step": 22429 + }, + { + "epoch": 12.53072625698324, + "grad_norm": 0.4819530248641968, + "learning_rate": 0.00037487394957983193, + "loss": 0.4867, + "step": 22430 + }, + { + "epoch": 12.531284916201118, + "grad_norm": 0.8435218930244446, + "learning_rate": 0.00037484593837535014, + "loss": 0.3721, + "step": 22431 + }, + { + "epoch": 12.531843575418995, + "grad_norm": 0.6111214756965637, + "learning_rate": 0.0003748179271708684, + "loss": 0.457, + "step": 22432 + }, + { + "epoch": 12.532402234636871, + "grad_norm": 0.44760215282440186, + "learning_rate": 0.00037478991596638655, + "loss": 0.3866, + "step": 22433 + }, + { + "epoch": 12.53296089385475, + "grad_norm": 0.5960481762886047, + "learning_rate": 0.00037476190476190476, + "loss": 0.415, + "step": 22434 + }, + { + "epoch": 12.533519553072626, + "grad_norm": 0.9160645604133606, + "learning_rate": 0.00037473389355742296, + "loss": 0.3975, + "step": 22435 + }, + { + "epoch": 12.534078212290503, + "grad_norm": 1.0403597354888916, + "learning_rate": 0.00037470588235294117, + "loss": 0.3687, + "step": 22436 + }, + { + "epoch": 12.53463687150838, + "grad_norm": 1.976467490196228, + "learning_rate": 0.00037467787114845943, + "loss": 0.4413, + "step": 22437 + }, + { + "epoch": 12.535195530726257, + "grad_norm": 0.39749184250831604, + "learning_rate": 0.0003746498599439776, + "loss": 0.5165, + "step": 22438 + }, + { + "epoch": 12.535754189944134, + "grad_norm": 0.4099052846431732, + "learning_rate": 0.0003746218487394958, + "loss": 0.3771, + "step": 22439 + }, + { + "epoch": 12.53631284916201, + "grad_norm": 0.491862028837204, + "learning_rate": 0.00037459383753501405, + "loss": 0.3967, + "step": 22440 + }, + { + "epoch": 12.536871508379889, + "grad_norm": 0.5626294612884521, + "learning_rate": 0.0003745658263305322, + "loss": 0.3193, + "step": 22441 + }, + { + "epoch": 12.537430167597766, + "grad_norm": 0.3762354552745819, + "learning_rate": 0.00037453781512605046, + "loss": 0.4102, + "step": 22442 + }, + { + "epoch": 12.537988826815642, + "grad_norm": 0.7769125699996948, + "learning_rate": 0.0003745098039215686, + "loss": 0.5664, + "step": 22443 + }, + { + "epoch": 12.538547486033519, + "grad_norm": 0.40960627794265747, + "learning_rate": 0.0003744817927170868, + "loss": 0.4173, + "step": 22444 + }, + { + "epoch": 12.539106145251397, + "grad_norm": 0.40965136885643005, + "learning_rate": 0.0003744537815126051, + "loss": 0.4408, + "step": 22445 + }, + { + "epoch": 12.539664804469274, + "grad_norm": 0.7106426358222961, + "learning_rate": 0.00037442577030812323, + "loss": 0.4431, + "step": 22446 + }, + { + "epoch": 12.54022346368715, + "grad_norm": 1.4433718919754028, + "learning_rate": 0.0003743977591036415, + "loss": 0.4828, + "step": 22447 + }, + { + "epoch": 12.540782122905028, + "grad_norm": 0.38598471879959106, + "learning_rate": 0.0003743697478991597, + "loss": 0.3986, + "step": 22448 + }, + { + "epoch": 12.541340782122905, + "grad_norm": 0.33879542350769043, + "learning_rate": 0.00037434173669467785, + "loss": 0.3909, + "step": 22449 + }, + { + "epoch": 12.541899441340782, + "grad_norm": 0.4508078396320343, + "learning_rate": 0.0003743137254901961, + "loss": 0.4385, + "step": 22450 + }, + { + "epoch": 12.54245810055866, + "grad_norm": 0.7343797087669373, + "learning_rate": 0.00037428571428571426, + "loss": 0.4046, + "step": 22451 + }, + { + "epoch": 12.543016759776537, + "grad_norm": 3.512624740600586, + "learning_rate": 0.0003742577030812325, + "loss": 0.3836, + "step": 22452 + }, + { + "epoch": 12.543575418994413, + "grad_norm": 0.4800110459327698, + "learning_rate": 0.0003742296918767507, + "loss": 0.3622, + "step": 22453 + }, + { + "epoch": 12.544134078212291, + "grad_norm": 0.4378245770931244, + "learning_rate": 0.0003742016806722689, + "loss": 0.3643, + "step": 22454 + }, + { + "epoch": 12.544692737430168, + "grad_norm": 0.5712656378746033, + "learning_rate": 0.00037417366946778714, + "loss": 0.4772, + "step": 22455 + }, + { + "epoch": 12.545251396648045, + "grad_norm": 0.40475600957870483, + "learning_rate": 0.00037414565826330534, + "loss": 0.3983, + "step": 22456 + }, + { + "epoch": 12.545810055865921, + "grad_norm": 0.3769834041595459, + "learning_rate": 0.00037411764705882355, + "loss": 0.4161, + "step": 22457 + }, + { + "epoch": 12.5463687150838, + "grad_norm": 0.45288193225860596, + "learning_rate": 0.00037408963585434176, + "loss": 0.4712, + "step": 22458 + }, + { + "epoch": 12.546927374301676, + "grad_norm": 0.6904294490814209, + "learning_rate": 0.0003740616246498599, + "loss": 0.467, + "step": 22459 + }, + { + "epoch": 12.547486033519553, + "grad_norm": 0.667701780796051, + "learning_rate": 0.00037403361344537817, + "loss": 0.4746, + "step": 22460 + }, + { + "epoch": 12.548044692737431, + "grad_norm": 0.4270884692668915, + "learning_rate": 0.0003740056022408964, + "loss": 0.4712, + "step": 22461 + }, + { + "epoch": 12.548603351955308, + "grad_norm": 0.6284533143043518, + "learning_rate": 0.0003739775910364146, + "loss": 0.3833, + "step": 22462 + }, + { + "epoch": 12.549162011173184, + "grad_norm": 1.2914358377456665, + "learning_rate": 0.0003739495798319328, + "loss": 0.3785, + "step": 22463 + }, + { + "epoch": 12.54972067039106, + "grad_norm": 1.172956943511963, + "learning_rate": 0.000373921568627451, + "loss": 0.3974, + "step": 22464 + }, + { + "epoch": 12.550279329608939, + "grad_norm": 0.7549270391464233, + "learning_rate": 0.0003738935574229692, + "loss": 0.372, + "step": 22465 + }, + { + "epoch": 12.550837988826816, + "grad_norm": 0.4130135774612427, + "learning_rate": 0.0003738655462184874, + "loss": 0.3578, + "step": 22466 + }, + { + "epoch": 12.551396648044692, + "grad_norm": 0.40527695417404175, + "learning_rate": 0.0003738375350140056, + "loss": 0.3401, + "step": 22467 + }, + { + "epoch": 12.55195530726257, + "grad_norm": 0.6868615746498108, + "learning_rate": 0.0003738095238095238, + "loss": 0.3728, + "step": 22468 + }, + { + "epoch": 12.552513966480447, + "grad_norm": 0.6799153089523315, + "learning_rate": 0.000373781512605042, + "loss": 0.3624, + "step": 22469 + }, + { + "epoch": 12.553072625698324, + "grad_norm": 0.5361668467521667, + "learning_rate": 0.00037375350140056023, + "loss": 0.4261, + "step": 22470 + }, + { + "epoch": 12.553631284916202, + "grad_norm": 0.5403088331222534, + "learning_rate": 0.00037372549019607843, + "loss": 0.4815, + "step": 22471 + }, + { + "epoch": 12.554189944134079, + "grad_norm": 1.6084001064300537, + "learning_rate": 0.0003736974789915967, + "loss": 0.4416, + "step": 22472 + }, + { + "epoch": 12.554748603351955, + "grad_norm": 0.41897672414779663, + "learning_rate": 0.00037366946778711485, + "loss": 0.3341, + "step": 22473 + }, + { + "epoch": 12.555307262569832, + "grad_norm": 0.4434489607810974, + "learning_rate": 0.00037364145658263305, + "loss": 0.4018, + "step": 22474 + }, + { + "epoch": 12.55586592178771, + "grad_norm": 0.3878321349620819, + "learning_rate": 0.00037361344537815126, + "loss": 0.3278, + "step": 22475 + }, + { + "epoch": 12.556424581005587, + "grad_norm": 0.3136431872844696, + "learning_rate": 0.00037358543417366946, + "loss": 0.4089, + "step": 22476 + }, + { + "epoch": 12.556983240223463, + "grad_norm": 0.42402777075767517, + "learning_rate": 0.0003735574229691877, + "loss": 0.3563, + "step": 22477 + }, + { + "epoch": 12.557541899441341, + "grad_norm": 0.49588528275489807, + "learning_rate": 0.0003735294117647059, + "loss": 0.3614, + "step": 22478 + }, + { + "epoch": 12.558100558659218, + "grad_norm": 0.48343682289123535, + "learning_rate": 0.0003735014005602241, + "loss": 0.4303, + "step": 22479 + }, + { + "epoch": 12.558659217877095, + "grad_norm": 0.5517815351486206, + "learning_rate": 0.00037347338935574234, + "loss": 0.4309, + "step": 22480 + }, + { + "epoch": 12.559217877094973, + "grad_norm": 0.588929295539856, + "learning_rate": 0.0003734453781512605, + "loss": 0.5073, + "step": 22481 + }, + { + "epoch": 12.55977653631285, + "grad_norm": 0.5685851573944092, + "learning_rate": 0.0003734173669467787, + "loss": 0.36, + "step": 22482 + }, + { + "epoch": 12.560335195530726, + "grad_norm": 1.3220291137695312, + "learning_rate": 0.0003733893557422969, + "loss": 0.4066, + "step": 22483 + }, + { + "epoch": 12.560893854748603, + "grad_norm": 0.5837740302085876, + "learning_rate": 0.0003733613445378151, + "loss": 0.5779, + "step": 22484 + }, + { + "epoch": 12.561452513966481, + "grad_norm": 0.596911609172821, + "learning_rate": 0.0003733333333333334, + "loss": 0.3997, + "step": 22485 + }, + { + "epoch": 12.562011173184358, + "grad_norm": 1.013767957687378, + "learning_rate": 0.0003733053221288515, + "loss": 0.5074, + "step": 22486 + }, + { + "epoch": 12.562569832402234, + "grad_norm": 0.6253725290298462, + "learning_rate": 0.00037327731092436973, + "loss": 0.3561, + "step": 22487 + }, + { + "epoch": 12.563128491620112, + "grad_norm": 0.47042229771614075, + "learning_rate": 0.000373249299719888, + "loss": 0.3947, + "step": 22488 + }, + { + "epoch": 12.563687150837989, + "grad_norm": 0.41710326075553894, + "learning_rate": 0.00037322128851540614, + "loss": 0.3406, + "step": 22489 + }, + { + "epoch": 12.564245810055866, + "grad_norm": 0.46124324202537537, + "learning_rate": 0.0003731932773109244, + "loss": 0.313, + "step": 22490 + }, + { + "epoch": 12.564804469273742, + "grad_norm": 0.8389171361923218, + "learning_rate": 0.00037316526610644255, + "loss": 0.4081, + "step": 22491 + }, + { + "epoch": 12.56536312849162, + "grad_norm": 0.8235301971435547, + "learning_rate": 0.00037313725490196076, + "loss": 0.4478, + "step": 22492 + }, + { + "epoch": 12.565921787709497, + "grad_norm": 0.7095931172370911, + "learning_rate": 0.000373109243697479, + "loss": 0.5667, + "step": 22493 + }, + { + "epoch": 12.566480446927374, + "grad_norm": 0.5218303799629211, + "learning_rate": 0.00037308123249299717, + "loss": 0.3155, + "step": 22494 + }, + { + "epoch": 12.567039106145252, + "grad_norm": 0.41921132802963257, + "learning_rate": 0.00037305322128851543, + "loss": 0.373, + "step": 22495 + }, + { + "epoch": 12.567597765363129, + "grad_norm": 0.46741050481796265, + "learning_rate": 0.00037302521008403364, + "loss": 0.4563, + "step": 22496 + }, + { + "epoch": 12.568156424581005, + "grad_norm": 0.6967788338661194, + "learning_rate": 0.0003729971988795518, + "loss": 0.522, + "step": 22497 + }, + { + "epoch": 12.568715083798883, + "grad_norm": 0.8631910085678101, + "learning_rate": 0.00037296918767507005, + "loss": 0.4022, + "step": 22498 + }, + { + "epoch": 12.56927374301676, + "grad_norm": 0.4822239279747009, + "learning_rate": 0.0003729411764705882, + "loss": 0.4814, + "step": 22499 + }, + { + "epoch": 12.569832402234637, + "grad_norm": 1.4286555051803589, + "learning_rate": 0.00037291316526610646, + "loss": 0.3527, + "step": 22500 + }, + { + "epoch": 12.569832402234637, + "eval_cer": 0.0869911514080281, + "eval_loss": 0.33226311206817627, + "eval_runtime": 55.5702, + "eval_samples_per_second": 81.662, + "eval_steps_per_second": 5.111, + "eval_wer": 0.3423368062344627, + "step": 22500 + }, + { + "epoch": 12.570391061452513, + "grad_norm": 0.33531805872917175, + "learning_rate": 0.00037288515406162467, + "loss": 0.4203, + "step": 22501 + }, + { + "epoch": 12.570949720670392, + "grad_norm": 0.40229159593582153, + "learning_rate": 0.0003728571428571428, + "loss": 0.3863, + "step": 22502 + }, + { + "epoch": 12.571508379888268, + "grad_norm": 1.1792356967926025, + "learning_rate": 0.0003728291316526611, + "loss": 0.4736, + "step": 22503 + }, + { + "epoch": 12.572067039106145, + "grad_norm": 0.5437342524528503, + "learning_rate": 0.0003728011204481793, + "loss": 0.3849, + "step": 22504 + }, + { + "epoch": 12.572625698324023, + "grad_norm": 0.7495285272598267, + "learning_rate": 0.0003727731092436975, + "loss": 0.4733, + "step": 22505 + }, + { + "epoch": 12.5731843575419, + "grad_norm": 0.6106339693069458, + "learning_rate": 0.0003727450980392157, + "loss": 0.3952, + "step": 22506 + }, + { + "epoch": 12.573743016759776, + "grad_norm": 0.6154392957687378, + "learning_rate": 0.00037271708683473385, + "loss": 0.3716, + "step": 22507 + }, + { + "epoch": 12.574301675977654, + "grad_norm": 0.5503832697868347, + "learning_rate": 0.0003726890756302521, + "loss": 0.4636, + "step": 22508 + }, + { + "epoch": 12.574860335195531, + "grad_norm": 0.4107510447502136, + "learning_rate": 0.0003726610644257703, + "loss": 0.364, + "step": 22509 + }, + { + "epoch": 12.575418994413408, + "grad_norm": 0.470242440700531, + "learning_rate": 0.0003726330532212885, + "loss": 0.3888, + "step": 22510 + }, + { + "epoch": 12.575977653631284, + "grad_norm": 0.6612895131111145, + "learning_rate": 0.00037260504201680673, + "loss": 0.7543, + "step": 22511 + }, + { + "epoch": 12.576536312849163, + "grad_norm": 0.5041276216506958, + "learning_rate": 0.00037257703081232493, + "loss": 0.3962, + "step": 22512 + }, + { + "epoch": 12.577094972067039, + "grad_norm": 0.7390826940536499, + "learning_rate": 0.00037254901960784314, + "loss": 0.4902, + "step": 22513 + }, + { + "epoch": 12.577653631284916, + "grad_norm": 0.9431338906288147, + "learning_rate": 0.00037252100840336135, + "loss": 0.4666, + "step": 22514 + }, + { + "epoch": 12.578212290502794, + "grad_norm": 0.5420820713043213, + "learning_rate": 0.00037249299719887955, + "loss": 0.3939, + "step": 22515 + }, + { + "epoch": 12.57877094972067, + "grad_norm": 0.48682859539985657, + "learning_rate": 0.00037246498599439776, + "loss": 0.3618, + "step": 22516 + }, + { + "epoch": 12.579329608938547, + "grad_norm": 0.39323559403419495, + "learning_rate": 0.00037243697478991596, + "loss": 0.4543, + "step": 22517 + }, + { + "epoch": 12.579888268156424, + "grad_norm": 0.5573785305023193, + "learning_rate": 0.00037240896358543417, + "loss": 0.4314, + "step": 22518 + }, + { + "epoch": 12.580446927374302, + "grad_norm": 1.7468023300170898, + "learning_rate": 0.0003723809523809524, + "loss": 0.451, + "step": 22519 + }, + { + "epoch": 12.581005586592179, + "grad_norm": 0.8025546073913574, + "learning_rate": 0.00037235294117647064, + "loss": 0.5438, + "step": 22520 + }, + { + "epoch": 12.581564245810055, + "grad_norm": 0.4134397506713867, + "learning_rate": 0.0003723249299719888, + "loss": 0.3929, + "step": 22521 + }, + { + "epoch": 12.582122905027934, + "grad_norm": 0.47350767254829407, + "learning_rate": 0.000372296918767507, + "loss": 0.4415, + "step": 22522 + }, + { + "epoch": 12.58268156424581, + "grad_norm": 0.48985230922698975, + "learning_rate": 0.0003722689075630252, + "loss": 0.4311, + "step": 22523 + }, + { + "epoch": 12.583240223463687, + "grad_norm": 0.3546047806739807, + "learning_rate": 0.0003722408963585434, + "loss": 0.4008, + "step": 22524 + }, + { + "epoch": 12.583798882681565, + "grad_norm": 0.8964436650276184, + "learning_rate": 0.00037221288515406167, + "loss": 0.3502, + "step": 22525 + }, + { + "epoch": 12.584357541899442, + "grad_norm": 0.3794797956943512, + "learning_rate": 0.0003721848739495798, + "loss": 0.3348, + "step": 22526 + }, + { + "epoch": 12.584916201117318, + "grad_norm": 0.6527863144874573, + "learning_rate": 0.000372156862745098, + "loss": 0.5589, + "step": 22527 + }, + { + "epoch": 12.585474860335196, + "grad_norm": 0.3487827181816101, + "learning_rate": 0.0003721288515406163, + "loss": 0.3433, + "step": 22528 + }, + { + "epoch": 12.586033519553073, + "grad_norm": 0.43349307775497437, + "learning_rate": 0.00037210084033613444, + "loss": 0.4171, + "step": 22529 + }, + { + "epoch": 12.58659217877095, + "grad_norm": 0.39403101801872253, + "learning_rate": 0.0003720728291316527, + "loss": 0.3671, + "step": 22530 + }, + { + "epoch": 12.587150837988826, + "grad_norm": 0.6479273438453674, + "learning_rate": 0.00037204481792717085, + "loss": 0.3234, + "step": 22531 + }, + { + "epoch": 12.587709497206705, + "grad_norm": 0.379276305437088, + "learning_rate": 0.00037201680672268905, + "loss": 0.3705, + "step": 22532 + }, + { + "epoch": 12.588268156424581, + "grad_norm": 0.608769416809082, + "learning_rate": 0.0003719887955182073, + "loss": 0.5333, + "step": 22533 + }, + { + "epoch": 12.588826815642458, + "grad_norm": 4.696609973907471, + "learning_rate": 0.00037196078431372547, + "loss": 0.3334, + "step": 22534 + }, + { + "epoch": 12.589385474860336, + "grad_norm": 2.129253625869751, + "learning_rate": 0.0003719327731092437, + "loss": 0.4207, + "step": 22535 + }, + { + "epoch": 12.589944134078213, + "grad_norm": 0.5442612767219543, + "learning_rate": 0.00037190476190476193, + "loss": 0.4343, + "step": 22536 + }, + { + "epoch": 12.59050279329609, + "grad_norm": 0.8003788590431213, + "learning_rate": 0.0003718767507002801, + "loss": 0.4659, + "step": 22537 + }, + { + "epoch": 12.591061452513966, + "grad_norm": 0.6233965158462524, + "learning_rate": 0.00037184873949579834, + "loss": 0.4769, + "step": 22538 + }, + { + "epoch": 12.591620111731844, + "grad_norm": 0.30588507652282715, + "learning_rate": 0.0003718207282913165, + "loss": 0.3835, + "step": 22539 + }, + { + "epoch": 12.59217877094972, + "grad_norm": 0.6887941360473633, + "learning_rate": 0.00037179271708683476, + "loss": 0.4423, + "step": 22540 + }, + { + "epoch": 12.592737430167597, + "grad_norm": 0.6773592233657837, + "learning_rate": 0.00037176470588235296, + "loss": 0.6422, + "step": 22541 + }, + { + "epoch": 12.593296089385476, + "grad_norm": 0.466437965631485, + "learning_rate": 0.0003717366946778711, + "loss": 0.3935, + "step": 22542 + }, + { + "epoch": 12.593854748603352, + "grad_norm": 0.45137104392051697, + "learning_rate": 0.0003717086834733894, + "loss": 0.343, + "step": 22543 + }, + { + "epoch": 12.594413407821229, + "grad_norm": 0.5953842997550964, + "learning_rate": 0.0003716806722689076, + "loss": 0.4223, + "step": 22544 + }, + { + "epoch": 12.594972067039105, + "grad_norm": 0.5215619802474976, + "learning_rate": 0.0003716526610644258, + "loss": 0.5252, + "step": 22545 + }, + { + "epoch": 12.595530726256984, + "grad_norm": 0.4695204794406891, + "learning_rate": 0.000371624649859944, + "loss": 0.4253, + "step": 22546 + }, + { + "epoch": 12.59608938547486, + "grad_norm": 0.4120956063270569, + "learning_rate": 0.00037159663865546214, + "loss": 0.3966, + "step": 22547 + }, + { + "epoch": 12.596648044692737, + "grad_norm": 1.2609736919403076, + "learning_rate": 0.0003715686274509804, + "loss": 0.4993, + "step": 22548 + }, + { + "epoch": 12.597206703910615, + "grad_norm": 0.3601849675178528, + "learning_rate": 0.0003715406162464986, + "loss": 0.4309, + "step": 22549 + }, + { + "epoch": 12.597765363128492, + "grad_norm": 0.44100022315979004, + "learning_rate": 0.0003715126050420168, + "loss": 0.446, + "step": 22550 + }, + { + "epoch": 12.598324022346368, + "grad_norm": 0.5519166588783264, + "learning_rate": 0.000371484593837535, + "loss": 0.4023, + "step": 22551 + }, + { + "epoch": 12.598882681564247, + "grad_norm": 0.4104040563106537, + "learning_rate": 0.00037145658263305323, + "loss": 0.3614, + "step": 22552 + }, + { + "epoch": 12.599441340782123, + "grad_norm": 2.154728412628174, + "learning_rate": 0.00037142857142857143, + "loss": 0.3741, + "step": 22553 + }, + { + "epoch": 12.6, + "grad_norm": 0.4629014730453491, + "learning_rate": 0.00037140056022408964, + "loss": 0.4071, + "step": 22554 + }, + { + "epoch": 12.600558659217878, + "grad_norm": 0.45616215467453003, + "learning_rate": 0.0003713725490196079, + "loss": 0.3089, + "step": 22555 + }, + { + "epoch": 12.601117318435755, + "grad_norm": 0.5728338956832886, + "learning_rate": 0.00037134453781512605, + "loss": 0.4909, + "step": 22556 + }, + { + "epoch": 12.601675977653631, + "grad_norm": 0.5634094476699829, + "learning_rate": 0.00037131652661064426, + "loss": 0.3893, + "step": 22557 + }, + { + "epoch": 12.602234636871508, + "grad_norm": 3.124847650527954, + "learning_rate": 0.00037128851540616246, + "loss": 0.6828, + "step": 22558 + }, + { + "epoch": 12.602793296089386, + "grad_norm": 0.39011862874031067, + "learning_rate": 0.00037126050420168067, + "loss": 0.3127, + "step": 22559 + }, + { + "epoch": 12.603351955307263, + "grad_norm": 0.37355750799179077, + "learning_rate": 0.00037123249299719893, + "loss": 0.4067, + "step": 22560 + }, + { + "epoch": 12.60391061452514, + "grad_norm": 0.5913501381874084, + "learning_rate": 0.0003712044817927171, + "loss": 0.3871, + "step": 22561 + }, + { + "epoch": 12.604469273743018, + "grad_norm": 0.6833252906799316, + "learning_rate": 0.0003711764705882353, + "loss": 0.4781, + "step": 22562 + }, + { + "epoch": 12.605027932960894, + "grad_norm": 0.44945746660232544, + "learning_rate": 0.00037114845938375355, + "loss": 0.3824, + "step": 22563 + }, + { + "epoch": 12.60558659217877, + "grad_norm": 0.6070159077644348, + "learning_rate": 0.0003711204481792717, + "loss": 0.3528, + "step": 22564 + }, + { + "epoch": 12.606145251396647, + "grad_norm": 0.7241159677505493, + "learning_rate": 0.00037109243697478996, + "loss": 0.5414, + "step": 22565 + }, + { + "epoch": 12.606703910614526, + "grad_norm": 0.4528590738773346, + "learning_rate": 0.0003710644257703081, + "loss": 0.3055, + "step": 22566 + }, + { + "epoch": 12.607262569832402, + "grad_norm": 1.4933909177780151, + "learning_rate": 0.0003710364145658263, + "loss": 0.3507, + "step": 22567 + }, + { + "epoch": 12.607821229050279, + "grad_norm": 0.5839433670043945, + "learning_rate": 0.0003710084033613446, + "loss": 0.504, + "step": 22568 + }, + { + "epoch": 12.608379888268157, + "grad_norm": 0.37618908286094666, + "learning_rate": 0.00037098039215686273, + "loss": 0.3388, + "step": 22569 + }, + { + "epoch": 12.608938547486034, + "grad_norm": 0.5161586999893188, + "learning_rate": 0.000370952380952381, + "loss": 0.3468, + "step": 22570 + }, + { + "epoch": 12.60949720670391, + "grad_norm": 0.5248129963874817, + "learning_rate": 0.0003709243697478992, + "loss": 0.4103, + "step": 22571 + }, + { + "epoch": 12.610055865921789, + "grad_norm": 0.5380997657775879, + "learning_rate": 0.00037089635854341735, + "loss": 0.3879, + "step": 22572 + }, + { + "epoch": 12.610614525139665, + "grad_norm": 0.7444873452186584, + "learning_rate": 0.0003708683473389356, + "loss": 0.6215, + "step": 22573 + }, + { + "epoch": 12.611173184357542, + "grad_norm": 0.3966597020626068, + "learning_rate": 0.00037084033613445376, + "loss": 0.4008, + "step": 22574 + }, + { + "epoch": 12.611731843575418, + "grad_norm": 0.9485647678375244, + "learning_rate": 0.000370812324929972, + "loss": 0.421, + "step": 22575 + }, + { + "epoch": 12.612290502793297, + "grad_norm": 0.5700395703315735, + "learning_rate": 0.0003707843137254902, + "loss": 0.4223, + "step": 22576 + }, + { + "epoch": 12.612849162011173, + "grad_norm": 0.6104342937469482, + "learning_rate": 0.0003707563025210084, + "loss": 0.3854, + "step": 22577 + }, + { + "epoch": 12.61340782122905, + "grad_norm": 0.5377269387245178, + "learning_rate": 0.00037072829131652664, + "loss": 0.4307, + "step": 22578 + }, + { + "epoch": 12.613966480446928, + "grad_norm": 0.696079671382904, + "learning_rate": 0.00037070028011204484, + "loss": 0.3261, + "step": 22579 + }, + { + "epoch": 12.614525139664805, + "grad_norm": 0.4880560040473938, + "learning_rate": 0.00037067226890756305, + "loss": 0.4687, + "step": 22580 + }, + { + "epoch": 12.615083798882681, + "grad_norm": 0.4921223223209381, + "learning_rate": 0.00037064425770308126, + "loss": 0.4409, + "step": 22581 + }, + { + "epoch": 12.61564245810056, + "grad_norm": 0.6960219740867615, + "learning_rate": 0.0003706162464985994, + "loss": 0.4429, + "step": 22582 + }, + { + "epoch": 12.616201117318436, + "grad_norm": 0.5629876255989075, + "learning_rate": 0.00037058823529411767, + "loss": 0.5916, + "step": 22583 + }, + { + "epoch": 12.616759776536313, + "grad_norm": 0.4182674288749695, + "learning_rate": 0.0003705602240896359, + "loss": 0.4767, + "step": 22584 + }, + { + "epoch": 12.61731843575419, + "grad_norm": 9.708643913269043, + "learning_rate": 0.0003705322128851541, + "loss": 0.4888, + "step": 22585 + }, + { + "epoch": 12.617877094972068, + "grad_norm": 0.5720090270042419, + "learning_rate": 0.0003705042016806723, + "loss": 0.4589, + "step": 22586 + }, + { + "epoch": 12.618435754189944, + "grad_norm": 0.673067033290863, + "learning_rate": 0.0003704761904761905, + "loss": 0.3328, + "step": 22587 + }, + { + "epoch": 12.61899441340782, + "grad_norm": 0.6160628795623779, + "learning_rate": 0.0003704481792717087, + "loss": 0.4705, + "step": 22588 + }, + { + "epoch": 12.619553072625699, + "grad_norm": 0.4490644931793213, + "learning_rate": 0.0003704201680672269, + "loss": 0.3173, + "step": 22589 + }, + { + "epoch": 12.620111731843576, + "grad_norm": 1.8294893503189087, + "learning_rate": 0.0003703921568627451, + "loss": 0.421, + "step": 22590 + }, + { + "epoch": 12.620670391061452, + "grad_norm": 0.4117777943611145, + "learning_rate": 0.0003703641456582633, + "loss": 0.3146, + "step": 22591 + }, + { + "epoch": 12.621229050279329, + "grad_norm": 0.5099352598190308, + "learning_rate": 0.0003703361344537815, + "loss": 0.4525, + "step": 22592 + }, + { + "epoch": 12.621787709497207, + "grad_norm": 0.6766747832298279, + "learning_rate": 0.00037030812324929973, + "loss": 0.5214, + "step": 22593 + }, + { + "epoch": 12.622346368715084, + "grad_norm": 0.47820141911506653, + "learning_rate": 0.00037028011204481793, + "loss": 0.3575, + "step": 22594 + }, + { + "epoch": 12.62290502793296, + "grad_norm": 0.9130155444145203, + "learning_rate": 0.00037025210084033614, + "loss": 0.4422, + "step": 22595 + }, + { + "epoch": 12.623463687150839, + "grad_norm": 0.4260018765926361, + "learning_rate": 0.00037022408963585435, + "loss": 0.467, + "step": 22596 + }, + { + "epoch": 12.624022346368715, + "grad_norm": 0.9684244394302368, + "learning_rate": 0.00037019607843137255, + "loss": 0.4052, + "step": 22597 + }, + { + "epoch": 12.624581005586592, + "grad_norm": 0.7404807209968567, + "learning_rate": 0.00037016806722689076, + "loss": 0.3478, + "step": 22598 + }, + { + "epoch": 12.62513966480447, + "grad_norm": 0.5057596564292908, + "learning_rate": 0.00037014005602240896, + "loss": 0.3585, + "step": 22599 + }, + { + "epoch": 12.625698324022347, + "grad_norm": 0.4183289110660553, + "learning_rate": 0.00037011204481792717, + "loss": 0.3725, + "step": 22600 + }, + { + "epoch": 12.626256983240223, + "grad_norm": 2.5891125202178955, + "learning_rate": 0.0003700840336134454, + "loss": 0.4142, + "step": 22601 + }, + { + "epoch": 12.6268156424581, + "grad_norm": 2.5536437034606934, + "learning_rate": 0.0003700560224089636, + "loss": 0.4403, + "step": 22602 + }, + { + "epoch": 12.627374301675978, + "grad_norm": 0.4288052022457123, + "learning_rate": 0.00037002801120448184, + "loss": 0.4337, + "step": 22603 + }, + { + "epoch": 12.627932960893855, + "grad_norm": 1.304640293121338, + "learning_rate": 0.00037, + "loss": 0.4519, + "step": 22604 + }, + { + "epoch": 12.628491620111731, + "grad_norm": 0.4733700752258301, + "learning_rate": 0.0003699719887955182, + "loss": 0.3852, + "step": 22605 + }, + { + "epoch": 12.62905027932961, + "grad_norm": 0.6350199580192566, + "learning_rate": 0.0003699439775910364, + "loss": 0.4585, + "step": 22606 + }, + { + "epoch": 12.629608938547486, + "grad_norm": 0.5748293995857239, + "learning_rate": 0.0003699159663865546, + "loss": 0.4301, + "step": 22607 + }, + { + "epoch": 12.630167597765363, + "grad_norm": 3.6047136783599854, + "learning_rate": 0.00036988795518207287, + "loss": 0.5009, + "step": 22608 + }, + { + "epoch": 12.630726256983241, + "grad_norm": 0.392987996339798, + "learning_rate": 0.000369859943977591, + "loss": 0.358, + "step": 22609 + }, + { + "epoch": 12.631284916201118, + "grad_norm": 0.5424877405166626, + "learning_rate": 0.00036983193277310923, + "loss": 0.3397, + "step": 22610 + }, + { + "epoch": 12.631843575418994, + "grad_norm": 0.4962330758571625, + "learning_rate": 0.0003698039215686275, + "loss": 0.3657, + "step": 22611 + }, + { + "epoch": 12.63240223463687, + "grad_norm": 0.5226778984069824, + "learning_rate": 0.00036977591036414564, + "loss": 0.4884, + "step": 22612 + }, + { + "epoch": 12.632960893854749, + "grad_norm": 0.630646288394928, + "learning_rate": 0.0003697478991596639, + "loss": 0.4892, + "step": 22613 + }, + { + "epoch": 12.633519553072626, + "grad_norm": 0.4850040674209595, + "learning_rate": 0.00036971988795518205, + "loss": 0.512, + "step": 22614 + }, + { + "epoch": 12.634078212290502, + "grad_norm": 0.4644118547439575, + "learning_rate": 0.00036969187675070026, + "loss": 0.3383, + "step": 22615 + }, + { + "epoch": 12.63463687150838, + "grad_norm": 1.8969917297363281, + "learning_rate": 0.0003696638655462185, + "loss": 0.4529, + "step": 22616 + }, + { + "epoch": 12.635195530726257, + "grad_norm": 0.4594047963619232, + "learning_rate": 0.00036963585434173667, + "loss": 0.4195, + "step": 22617 + }, + { + "epoch": 12.635754189944134, + "grad_norm": 0.5045561194419861, + "learning_rate": 0.00036960784313725493, + "loss": 0.49, + "step": 22618 + }, + { + "epoch": 12.63631284916201, + "grad_norm": 0.8643025159835815, + "learning_rate": 0.00036957983193277314, + "loss": 0.426, + "step": 22619 + }, + { + "epoch": 12.636871508379889, + "grad_norm": 0.38268062472343445, + "learning_rate": 0.0003695518207282913, + "loss": 0.4288, + "step": 22620 + }, + { + "epoch": 12.637430167597765, + "grad_norm": 0.477268785238266, + "learning_rate": 0.00036952380952380955, + "loss": 0.3189, + "step": 22621 + }, + { + "epoch": 12.637988826815642, + "grad_norm": 0.5417362451553345, + "learning_rate": 0.0003694957983193277, + "loss": 0.4972, + "step": 22622 + }, + { + "epoch": 12.63854748603352, + "grad_norm": 3.8287198543548584, + "learning_rate": 0.00036946778711484596, + "loss": 0.4789, + "step": 22623 + }, + { + "epoch": 12.639106145251397, + "grad_norm": 0.36878421902656555, + "learning_rate": 0.00036943977591036417, + "loss": 0.3783, + "step": 22624 + }, + { + "epoch": 12.639664804469273, + "grad_norm": 0.6996465921401978, + "learning_rate": 0.0003694117647058823, + "loss": 0.4531, + "step": 22625 + }, + { + "epoch": 12.640223463687152, + "grad_norm": 0.6451622247695923, + "learning_rate": 0.0003693837535014006, + "loss": 0.6835, + "step": 22626 + }, + { + "epoch": 12.640782122905028, + "grad_norm": 2.6806771755218506, + "learning_rate": 0.0003693557422969188, + "loss": 0.3358, + "step": 22627 + }, + { + "epoch": 12.641340782122905, + "grad_norm": 0.44551214575767517, + "learning_rate": 0.000369327731092437, + "loss": 0.5094, + "step": 22628 + }, + { + "epoch": 12.641899441340783, + "grad_norm": 1.0081738233566284, + "learning_rate": 0.0003692997198879552, + "loss": 0.5406, + "step": 22629 + }, + { + "epoch": 12.64245810055866, + "grad_norm": 0.9619008898735046, + "learning_rate": 0.00036927170868347335, + "loss": 0.4743, + "step": 22630 + }, + { + "epoch": 12.643016759776536, + "grad_norm": 0.9496386051177979, + "learning_rate": 0.0003692436974789916, + "loss": 0.7209, + "step": 22631 + }, + { + "epoch": 12.643575418994413, + "grad_norm": 0.5079296827316284, + "learning_rate": 0.0003692156862745098, + "loss": 0.4111, + "step": 22632 + }, + { + "epoch": 12.644134078212291, + "grad_norm": 0.5236098170280457, + "learning_rate": 0.000369187675070028, + "loss": 0.3777, + "step": 22633 + }, + { + "epoch": 12.644692737430168, + "grad_norm": 0.4201522767543793, + "learning_rate": 0.00036915966386554623, + "loss": 0.401, + "step": 22634 + }, + { + "epoch": 12.645251396648044, + "grad_norm": 1.2196553945541382, + "learning_rate": 0.00036913165266106443, + "loss": 0.4771, + "step": 22635 + }, + { + "epoch": 12.645810055865923, + "grad_norm": 0.37933313846588135, + "learning_rate": 0.00036910364145658264, + "loss": 0.3819, + "step": 22636 + }, + { + "epoch": 12.6463687150838, + "grad_norm": 0.6158190965652466, + "learning_rate": 0.00036907563025210085, + "loss": 0.7561, + "step": 22637 + }, + { + "epoch": 12.646927374301676, + "grad_norm": 0.8624254465103149, + "learning_rate": 0.00036904761904761905, + "loss": 0.5712, + "step": 22638 + }, + { + "epoch": 12.647486033519552, + "grad_norm": 0.6276279091835022, + "learning_rate": 0.00036901960784313726, + "loss": 0.3783, + "step": 22639 + }, + { + "epoch": 12.64804469273743, + "grad_norm": 0.5077083706855774, + "learning_rate": 0.00036899159663865546, + "loss": 0.4371, + "step": 22640 + }, + { + "epoch": 12.648603351955307, + "grad_norm": 0.8366996049880981, + "learning_rate": 0.00036896358543417367, + "loss": 0.4396, + "step": 22641 + }, + { + "epoch": 12.649162011173184, + "grad_norm": 0.6168509721755981, + "learning_rate": 0.0003689355742296919, + "loss": 0.4076, + "step": 22642 + }, + { + "epoch": 12.649720670391062, + "grad_norm": 0.4172072410583496, + "learning_rate": 0.00036890756302521014, + "loss": 0.3869, + "step": 22643 + }, + { + "epoch": 12.650279329608939, + "grad_norm": 0.5623812675476074, + "learning_rate": 0.0003688795518207283, + "loss": 0.3181, + "step": 22644 + }, + { + "epoch": 12.650837988826815, + "grad_norm": 0.4591881036758423, + "learning_rate": 0.0003688515406162465, + "loss": 0.3614, + "step": 22645 + }, + { + "epoch": 12.651396648044694, + "grad_norm": 0.47652843594551086, + "learning_rate": 0.0003688235294117647, + "loss": 0.3753, + "step": 22646 + }, + { + "epoch": 12.65195530726257, + "grad_norm": 0.42527127265930176, + "learning_rate": 0.0003687955182072829, + "loss": 0.4329, + "step": 22647 + }, + { + "epoch": 12.652513966480447, + "grad_norm": 0.4065473675727844, + "learning_rate": 0.00036876750700280117, + "loss": 0.4287, + "step": 22648 + }, + { + "epoch": 12.653072625698323, + "grad_norm": 0.5770248770713806, + "learning_rate": 0.0003687394957983193, + "loss": 0.3847, + "step": 22649 + }, + { + "epoch": 12.653631284916202, + "grad_norm": 0.4943188428878784, + "learning_rate": 0.0003687114845938375, + "loss": 0.4073, + "step": 22650 + }, + { + "epoch": 12.654189944134078, + "grad_norm": 0.7674317955970764, + "learning_rate": 0.0003686834733893558, + "loss": 0.3779, + "step": 22651 + }, + { + "epoch": 12.654748603351955, + "grad_norm": 0.45545870065689087, + "learning_rate": 0.00036865546218487394, + "loss": 0.5403, + "step": 22652 + }, + { + "epoch": 12.655307262569833, + "grad_norm": 0.4823314845561981, + "learning_rate": 0.0003686274509803922, + "loss": 0.4919, + "step": 22653 + }, + { + "epoch": 12.65586592178771, + "grad_norm": 0.4444320797920227, + "learning_rate": 0.00036859943977591035, + "loss": 0.4732, + "step": 22654 + }, + { + "epoch": 12.656424581005586, + "grad_norm": 0.48194339871406555, + "learning_rate": 0.00036857142857142855, + "loss": 0.371, + "step": 22655 + }, + { + "epoch": 12.656983240223465, + "grad_norm": 0.9455886483192444, + "learning_rate": 0.0003685434173669468, + "loss": 0.4799, + "step": 22656 + }, + { + "epoch": 12.657541899441341, + "grad_norm": 0.5675630569458008, + "learning_rate": 0.00036851540616246497, + "loss": 0.5449, + "step": 22657 + }, + { + "epoch": 12.658100558659218, + "grad_norm": 1.1131759881973267, + "learning_rate": 0.0003684873949579832, + "loss": 0.5092, + "step": 22658 + }, + { + "epoch": 12.658659217877094, + "grad_norm": 0.3302125632762909, + "learning_rate": 0.00036845938375350143, + "loss": 0.4138, + "step": 22659 + }, + { + "epoch": 12.659217877094973, + "grad_norm": 1.188501238822937, + "learning_rate": 0.0003684313725490196, + "loss": 0.3869, + "step": 22660 + }, + { + "epoch": 12.65977653631285, + "grad_norm": 0.4855312705039978, + "learning_rate": 0.00036840336134453784, + "loss": 0.4653, + "step": 22661 + }, + { + "epoch": 12.660335195530726, + "grad_norm": 2.120654344558716, + "learning_rate": 0.000368375350140056, + "loss": 0.4393, + "step": 22662 + }, + { + "epoch": 12.660893854748604, + "grad_norm": 0.40007635951042175, + "learning_rate": 0.00036834733893557426, + "loss": 0.3682, + "step": 22663 + }, + { + "epoch": 12.66145251396648, + "grad_norm": 0.43272078037261963, + "learning_rate": 0.00036831932773109246, + "loss": 0.4097, + "step": 22664 + }, + { + "epoch": 12.662011173184357, + "grad_norm": 0.8415060639381409, + "learning_rate": 0.0003682913165266106, + "loss": 0.4651, + "step": 22665 + }, + { + "epoch": 12.662569832402234, + "grad_norm": 0.5167807936668396, + "learning_rate": 0.0003682633053221289, + "loss": 0.437, + "step": 22666 + }, + { + "epoch": 12.663128491620112, + "grad_norm": 0.5113229751586914, + "learning_rate": 0.0003682352941176471, + "loss": 0.5192, + "step": 22667 + }, + { + "epoch": 12.663687150837989, + "grad_norm": 0.4238426983356476, + "learning_rate": 0.0003682072829131653, + "loss": 0.3454, + "step": 22668 + }, + { + "epoch": 12.664245810055865, + "grad_norm": 4.335702896118164, + "learning_rate": 0.0003681792717086835, + "loss": 0.4291, + "step": 22669 + }, + { + "epoch": 12.664804469273744, + "grad_norm": 0.4370681047439575, + "learning_rate": 0.00036815126050420164, + "loss": 0.4076, + "step": 22670 + }, + { + "epoch": 12.66536312849162, + "grad_norm": 0.570739209651947, + "learning_rate": 0.0003681232492997199, + "loss": 0.3993, + "step": 22671 + }, + { + "epoch": 12.665921787709497, + "grad_norm": 0.9036211371421814, + "learning_rate": 0.0003680952380952381, + "loss": 0.587, + "step": 22672 + }, + { + "epoch": 12.666480446927375, + "grad_norm": 0.5403720140457153, + "learning_rate": 0.0003680672268907563, + "loss": 0.4692, + "step": 22673 + }, + { + "epoch": 12.667039106145252, + "grad_norm": 0.4469272792339325, + "learning_rate": 0.0003680392156862745, + "loss": 0.558, + "step": 22674 + }, + { + "epoch": 12.667597765363128, + "grad_norm": 2.5028650760650635, + "learning_rate": 0.00036801120448179273, + "loss": 0.4619, + "step": 22675 + }, + { + "epoch": 12.668156424581005, + "grad_norm": 0.8241000175476074, + "learning_rate": 0.00036798319327731093, + "loss": 0.4284, + "step": 22676 + }, + { + "epoch": 12.668715083798883, + "grad_norm": 0.4155273139476776, + "learning_rate": 0.00036795518207282914, + "loss": 0.4793, + "step": 22677 + }, + { + "epoch": 12.66927374301676, + "grad_norm": 0.8346865177154541, + "learning_rate": 0.00036792717086834735, + "loss": 0.4944, + "step": 22678 + }, + { + "epoch": 12.669832402234636, + "grad_norm": 0.3694761395454407, + "learning_rate": 0.00036789915966386555, + "loss": 0.4222, + "step": 22679 + }, + { + "epoch": 12.670391061452515, + "grad_norm": 0.6826328635215759, + "learning_rate": 0.00036787114845938376, + "loss": 0.5012, + "step": 22680 + }, + { + "epoch": 12.670949720670391, + "grad_norm": 0.7971574068069458, + "learning_rate": 0.00036784313725490196, + "loss": 0.4402, + "step": 22681 + }, + { + "epoch": 12.671508379888268, + "grad_norm": 0.4051872193813324, + "learning_rate": 0.00036781512605042017, + "loss": 0.3355, + "step": 22682 + }, + { + "epoch": 12.672067039106146, + "grad_norm": 0.44159600138664246, + "learning_rate": 0.00036778711484593843, + "loss": 0.3613, + "step": 22683 + }, + { + "epoch": 12.672625698324023, + "grad_norm": 0.6788287162780762, + "learning_rate": 0.0003677591036414566, + "loss": 0.4363, + "step": 22684 + }, + { + "epoch": 12.6731843575419, + "grad_norm": 0.4831417500972748, + "learning_rate": 0.0003677310924369748, + "loss": 0.4171, + "step": 22685 + }, + { + "epoch": 12.673743016759776, + "grad_norm": 0.4618101418018341, + "learning_rate": 0.000367703081232493, + "loss": 0.5157, + "step": 22686 + }, + { + "epoch": 12.674301675977654, + "grad_norm": 0.4800151586532593, + "learning_rate": 0.0003676750700280112, + "loss": 0.4015, + "step": 22687 + }, + { + "epoch": 12.67486033519553, + "grad_norm": 0.6734005212783813, + "learning_rate": 0.00036764705882352946, + "loss": 0.5868, + "step": 22688 + }, + { + "epoch": 12.675418994413407, + "grad_norm": 13.141242027282715, + "learning_rate": 0.0003676190476190476, + "loss": 0.365, + "step": 22689 + }, + { + "epoch": 12.675977653631286, + "grad_norm": 0.6268458962440491, + "learning_rate": 0.0003675910364145658, + "loss": 0.3227, + "step": 22690 + }, + { + "epoch": 12.676536312849162, + "grad_norm": 0.5545414686203003, + "learning_rate": 0.0003675630252100841, + "loss": 0.4361, + "step": 22691 + }, + { + "epoch": 12.677094972067039, + "grad_norm": 0.40543267130851746, + "learning_rate": 0.00036753501400560223, + "loss": 0.3819, + "step": 22692 + }, + { + "epoch": 12.677653631284915, + "grad_norm": 0.5659895539283752, + "learning_rate": 0.0003675070028011205, + "loss": 0.5817, + "step": 22693 + }, + { + "epoch": 12.678212290502794, + "grad_norm": 0.38993850350379944, + "learning_rate": 0.00036747899159663864, + "loss": 0.4236, + "step": 22694 + }, + { + "epoch": 12.67877094972067, + "grad_norm": 0.4423362910747528, + "learning_rate": 0.00036745098039215685, + "loss": 0.3407, + "step": 22695 + }, + { + "epoch": 12.679329608938547, + "grad_norm": 0.432059645652771, + "learning_rate": 0.0003674229691876751, + "loss": 0.4736, + "step": 22696 + }, + { + "epoch": 12.679888268156425, + "grad_norm": 0.6562801003456116, + "learning_rate": 0.00036739495798319326, + "loss": 0.389, + "step": 22697 + }, + { + "epoch": 12.680446927374302, + "grad_norm": 0.9713101387023926, + "learning_rate": 0.0003673669467787115, + "loss": 0.502, + "step": 22698 + }, + { + "epoch": 12.681005586592178, + "grad_norm": 0.83583664894104, + "learning_rate": 0.0003673389355742297, + "loss": 0.4063, + "step": 22699 + }, + { + "epoch": 12.681564245810057, + "grad_norm": 0.3693397343158722, + "learning_rate": 0.0003673109243697479, + "loss": 0.3886, + "step": 22700 + }, + { + "epoch": 12.682122905027933, + "grad_norm": 0.5915853977203369, + "learning_rate": 0.00036728291316526614, + "loss": 0.4183, + "step": 22701 + }, + { + "epoch": 12.68268156424581, + "grad_norm": 1.1918507814407349, + "learning_rate": 0.0003672549019607843, + "loss": 0.6949, + "step": 22702 + }, + { + "epoch": 12.683240223463688, + "grad_norm": 1.0061208009719849, + "learning_rate": 0.00036722689075630255, + "loss": 0.5751, + "step": 22703 + }, + { + "epoch": 12.683798882681565, + "grad_norm": 0.4901619851589203, + "learning_rate": 0.00036719887955182076, + "loss": 0.4106, + "step": 22704 + }, + { + "epoch": 12.684357541899441, + "grad_norm": 0.4705338478088379, + "learning_rate": 0.0003671708683473389, + "loss": 0.4329, + "step": 22705 + }, + { + "epoch": 12.684916201117318, + "grad_norm": 0.4751470386981964, + "learning_rate": 0.00036714285714285717, + "loss": 0.406, + "step": 22706 + }, + { + "epoch": 12.685474860335196, + "grad_norm": 0.5742958188056946, + "learning_rate": 0.0003671148459383754, + "loss": 0.4762, + "step": 22707 + }, + { + "epoch": 12.686033519553073, + "grad_norm": 0.42881226539611816, + "learning_rate": 0.0003670868347338935, + "loss": 0.3553, + "step": 22708 + }, + { + "epoch": 12.68659217877095, + "grad_norm": 0.44101089239120483, + "learning_rate": 0.0003670588235294118, + "loss": 0.4532, + "step": 22709 + }, + { + "epoch": 12.687150837988828, + "grad_norm": 0.4362788200378418, + "learning_rate": 0.00036703081232492994, + "loss": 0.4415, + "step": 22710 + }, + { + "epoch": 12.687709497206704, + "grad_norm": 0.46455127000808716, + "learning_rate": 0.0003670028011204482, + "loss": 0.4227, + "step": 22711 + }, + { + "epoch": 12.68826815642458, + "grad_norm": 1.557179570198059, + "learning_rate": 0.0003669747899159664, + "loss": 0.3986, + "step": 22712 + }, + { + "epoch": 12.688826815642457, + "grad_norm": 1.4577324390411377, + "learning_rate": 0.00036694677871148456, + "loss": 0.4091, + "step": 22713 + }, + { + "epoch": 12.689385474860336, + "grad_norm": 0.4985298812389374, + "learning_rate": 0.0003669187675070028, + "loss": 0.4474, + "step": 22714 + }, + { + "epoch": 12.689944134078212, + "grad_norm": 1.6187843084335327, + "learning_rate": 0.000366890756302521, + "loss": 0.5188, + "step": 22715 + }, + { + "epoch": 12.690502793296089, + "grad_norm": 0.47952544689178467, + "learning_rate": 0.00036686274509803923, + "loss": 0.4026, + "step": 22716 + }, + { + "epoch": 12.691061452513967, + "grad_norm": 0.5362875461578369, + "learning_rate": 0.00036683473389355743, + "loss": 0.4839, + "step": 22717 + }, + { + "epoch": 12.691620111731844, + "grad_norm": 0.48344671726226807, + "learning_rate": 0.0003668067226890756, + "loss": 0.4729, + "step": 22718 + }, + { + "epoch": 12.69217877094972, + "grad_norm": 0.4527988135814667, + "learning_rate": 0.00036677871148459385, + "loss": 0.4319, + "step": 22719 + }, + { + "epoch": 12.692737430167599, + "grad_norm": 0.45833876729011536, + "learning_rate": 0.00036675070028011205, + "loss": 0.4193, + "step": 22720 + }, + { + "epoch": 12.693296089385475, + "grad_norm": 0.4064957797527313, + "learning_rate": 0.00036672268907563026, + "loss": 0.4351, + "step": 22721 + }, + { + "epoch": 12.693854748603352, + "grad_norm": 0.5433568954467773, + "learning_rate": 0.00036669467787114846, + "loss": 0.5695, + "step": 22722 + }, + { + "epoch": 12.694413407821228, + "grad_norm": 0.40819454193115234, + "learning_rate": 0.00036666666666666667, + "loss": 0.3407, + "step": 22723 + }, + { + "epoch": 12.694972067039107, + "grad_norm": 1.3608016967773438, + "learning_rate": 0.0003666386554621849, + "loss": 0.3421, + "step": 22724 + }, + { + "epoch": 12.695530726256983, + "grad_norm": 0.4803275465965271, + "learning_rate": 0.0003666106442577031, + "loss": 0.4272, + "step": 22725 + }, + { + "epoch": 12.69608938547486, + "grad_norm": 0.6079450249671936, + "learning_rate": 0.0003665826330532213, + "loss": 0.3481, + "step": 22726 + }, + { + "epoch": 12.696648044692738, + "grad_norm": 0.3614291250705719, + "learning_rate": 0.0003665546218487395, + "loss": 0.4092, + "step": 22727 + }, + { + "epoch": 12.697206703910615, + "grad_norm": 0.3670042157173157, + "learning_rate": 0.0003665266106442577, + "loss": 0.3459, + "step": 22728 + }, + { + "epoch": 12.697765363128491, + "grad_norm": 0.3679581880569458, + "learning_rate": 0.0003664985994397759, + "loss": 0.3265, + "step": 22729 + }, + { + "epoch": 12.69832402234637, + "grad_norm": 0.47507980465888977, + "learning_rate": 0.0003664705882352941, + "loss": 0.4102, + "step": 22730 + }, + { + "epoch": 12.698882681564246, + "grad_norm": 0.6242567896842957, + "learning_rate": 0.00036644257703081237, + "loss": 0.3666, + "step": 22731 + }, + { + "epoch": 12.699441340782123, + "grad_norm": 0.5092839598655701, + "learning_rate": 0.0003664145658263305, + "loss": 0.3389, + "step": 22732 + }, + { + "epoch": 12.7, + "grad_norm": 0.6447473168373108, + "learning_rate": 0.00036638655462184873, + "loss": 0.5652, + "step": 22733 + }, + { + "epoch": 12.700558659217878, + "grad_norm": 1.5840729475021362, + "learning_rate": 0.00036635854341736694, + "loss": 0.3944, + "step": 22734 + }, + { + "epoch": 12.701117318435754, + "grad_norm": 0.4584546685218811, + "learning_rate": 0.00036633053221288514, + "loss": 0.3827, + "step": 22735 + }, + { + "epoch": 12.70167597765363, + "grad_norm": 0.44648319482803345, + "learning_rate": 0.0003663025210084034, + "loss": 0.4522, + "step": 22736 + }, + { + "epoch": 12.702234636871509, + "grad_norm": 1.1005603075027466, + "learning_rate": 0.00036627450980392155, + "loss": 0.4222, + "step": 22737 + }, + { + "epoch": 12.702793296089386, + "grad_norm": 0.7060168385505676, + "learning_rate": 0.00036624649859943976, + "loss": 0.4703, + "step": 22738 + }, + { + "epoch": 12.703351955307262, + "grad_norm": 0.42375731468200684, + "learning_rate": 0.000366218487394958, + "loss": 0.488, + "step": 22739 + }, + { + "epoch": 12.703910614525139, + "grad_norm": 0.5345640778541565, + "learning_rate": 0.00036619047619047617, + "loss": 0.5145, + "step": 22740 + }, + { + "epoch": 12.704469273743017, + "grad_norm": 0.3759945034980774, + "learning_rate": 0.00036616246498599443, + "loss": 0.3848, + "step": 22741 + }, + { + "epoch": 12.705027932960894, + "grad_norm": 0.5260798335075378, + "learning_rate": 0.0003661344537815126, + "loss": 0.3616, + "step": 22742 + }, + { + "epoch": 12.70558659217877, + "grad_norm": 0.6330503821372986, + "learning_rate": 0.0003661064425770308, + "loss": 0.5406, + "step": 22743 + }, + { + "epoch": 12.706145251396649, + "grad_norm": 0.4426876902580261, + "learning_rate": 0.00036607843137254905, + "loss": 0.3502, + "step": 22744 + }, + { + "epoch": 12.706703910614525, + "grad_norm": 0.37959563732147217, + "learning_rate": 0.0003660504201680672, + "loss": 0.3262, + "step": 22745 + }, + { + "epoch": 12.707262569832402, + "grad_norm": 0.4771775007247925, + "learning_rate": 0.00036602240896358546, + "loss": 0.4442, + "step": 22746 + }, + { + "epoch": 12.70782122905028, + "grad_norm": 1.4068987369537354, + "learning_rate": 0.00036599439775910367, + "loss": 0.4878, + "step": 22747 + }, + { + "epoch": 12.708379888268157, + "grad_norm": 0.649357795715332, + "learning_rate": 0.0003659663865546218, + "loss": 0.4428, + "step": 22748 + }, + { + "epoch": 12.708938547486033, + "grad_norm": 0.5546964406967163, + "learning_rate": 0.0003659383753501401, + "loss": 0.4378, + "step": 22749 + }, + { + "epoch": 12.70949720670391, + "grad_norm": 0.5037041306495667, + "learning_rate": 0.00036591036414565823, + "loss": 0.3744, + "step": 22750 + }, + { + "epoch": 12.710055865921788, + "grad_norm": 0.4368305504322052, + "learning_rate": 0.0003658823529411765, + "loss": 0.443, + "step": 22751 + }, + { + "epoch": 12.710614525139665, + "grad_norm": 0.42577028274536133, + "learning_rate": 0.0003658543417366947, + "loss": 0.4505, + "step": 22752 + }, + { + "epoch": 12.711173184357541, + "grad_norm": 0.48462414741516113, + "learning_rate": 0.00036582633053221285, + "loss": 0.3704, + "step": 22753 + }, + { + "epoch": 12.71173184357542, + "grad_norm": 1.2744355201721191, + "learning_rate": 0.0003657983193277311, + "loss": 0.3861, + "step": 22754 + }, + { + "epoch": 12.712290502793296, + "grad_norm": 7.955973148345947, + "learning_rate": 0.0003657703081232493, + "loss": 0.4129, + "step": 22755 + }, + { + "epoch": 12.712849162011173, + "grad_norm": 0.5515907406806946, + "learning_rate": 0.0003657422969187675, + "loss": 0.4737, + "step": 22756 + }, + { + "epoch": 12.713407821229051, + "grad_norm": 0.41590380668640137, + "learning_rate": 0.00036571428571428573, + "loss": 0.4009, + "step": 22757 + }, + { + "epoch": 12.713966480446928, + "grad_norm": 0.5429763793945312, + "learning_rate": 0.0003656862745098039, + "loss": 0.4156, + "step": 22758 + }, + { + "epoch": 12.714525139664804, + "grad_norm": 0.43287140130996704, + "learning_rate": 0.00036565826330532214, + "loss": 0.4542, + "step": 22759 + }, + { + "epoch": 12.71508379888268, + "grad_norm": 0.3885638415813446, + "learning_rate": 0.00036563025210084035, + "loss": 0.3889, + "step": 22760 + }, + { + "epoch": 12.71564245810056, + "grad_norm": 1.02476966381073, + "learning_rate": 0.00036560224089635855, + "loss": 0.4165, + "step": 22761 + }, + { + "epoch": 12.716201117318436, + "grad_norm": 0.34999823570251465, + "learning_rate": 0.00036557422969187676, + "loss": 0.3533, + "step": 22762 + }, + { + "epoch": 12.716759776536312, + "grad_norm": 0.6705333590507507, + "learning_rate": 0.00036554621848739496, + "loss": 0.3529, + "step": 22763 + }, + { + "epoch": 12.71731843575419, + "grad_norm": 0.5039358735084534, + "learning_rate": 0.00036551820728291317, + "loss": 0.3938, + "step": 22764 + }, + { + "epoch": 12.717877094972067, + "grad_norm": 0.34424564242362976, + "learning_rate": 0.0003654901960784314, + "loss": 0.2597, + "step": 22765 + }, + { + "epoch": 12.718435754189944, + "grad_norm": 1.6279664039611816, + "learning_rate": 0.00036546218487394964, + "loss": 0.4814, + "step": 22766 + }, + { + "epoch": 12.71899441340782, + "grad_norm": 1.525254726409912, + "learning_rate": 0.0003654341736694678, + "loss": 0.4829, + "step": 22767 + }, + { + "epoch": 12.719553072625699, + "grad_norm": 0.4523603022098541, + "learning_rate": 0.000365406162464986, + "loss": 0.4156, + "step": 22768 + }, + { + "epoch": 12.720111731843575, + "grad_norm": 0.43614131212234497, + "learning_rate": 0.0003653781512605042, + "loss": 0.4833, + "step": 22769 + }, + { + "epoch": 12.720670391061452, + "grad_norm": 0.34730708599090576, + "learning_rate": 0.0003653501400560224, + "loss": 0.3606, + "step": 22770 + }, + { + "epoch": 12.72122905027933, + "grad_norm": 0.40569254755973816, + "learning_rate": 0.00036532212885154067, + "loss": 0.3819, + "step": 22771 + }, + { + "epoch": 12.721787709497207, + "grad_norm": 0.916185200214386, + "learning_rate": 0.0003652941176470588, + "loss": 0.458, + "step": 22772 + }, + { + "epoch": 12.722346368715083, + "grad_norm": 0.6924963593482971, + "learning_rate": 0.000365266106442577, + "loss": 0.4698, + "step": 22773 + }, + { + "epoch": 12.722905027932962, + "grad_norm": 0.5590592622756958, + "learning_rate": 0.0003652380952380953, + "loss": 0.3562, + "step": 22774 + }, + { + "epoch": 12.723463687150838, + "grad_norm": 0.42891693115234375, + "learning_rate": 0.00036521008403361344, + "loss": 0.3461, + "step": 22775 + }, + { + "epoch": 12.724022346368715, + "grad_norm": 0.7148812413215637, + "learning_rate": 0.0003651820728291317, + "loss": 0.354, + "step": 22776 + }, + { + "epoch": 12.724581005586593, + "grad_norm": 5.826832294464111, + "learning_rate": 0.00036515406162464985, + "loss": 0.454, + "step": 22777 + }, + { + "epoch": 12.72513966480447, + "grad_norm": 0.6415731310844421, + "learning_rate": 0.00036512605042016805, + "loss": 0.4511, + "step": 22778 + }, + { + "epoch": 12.725698324022346, + "grad_norm": 0.7285545468330383, + "learning_rate": 0.0003650980392156863, + "loss": 0.5046, + "step": 22779 + }, + { + "epoch": 12.726256983240223, + "grad_norm": 0.5698167681694031, + "learning_rate": 0.00036507002801120447, + "loss": 0.4213, + "step": 22780 + }, + { + "epoch": 12.726815642458101, + "grad_norm": 0.5560302734375, + "learning_rate": 0.0003650420168067227, + "loss": 0.409, + "step": 22781 + }, + { + "epoch": 12.727374301675978, + "grad_norm": 0.3795311748981476, + "learning_rate": 0.00036501400560224093, + "loss": 0.3622, + "step": 22782 + }, + { + "epoch": 12.727932960893854, + "grad_norm": 0.4959107041358948, + "learning_rate": 0.0003649859943977591, + "loss": 0.5039, + "step": 22783 + }, + { + "epoch": 12.728491620111733, + "grad_norm": 1.0322513580322266, + "learning_rate": 0.00036495798319327734, + "loss": 0.5891, + "step": 22784 + }, + { + "epoch": 12.72905027932961, + "grad_norm": 0.4532834589481354, + "learning_rate": 0.0003649299719887955, + "loss": 0.3771, + "step": 22785 + }, + { + "epoch": 12.729608938547486, + "grad_norm": 0.43512043356895447, + "learning_rate": 0.00036490196078431376, + "loss": 0.3772, + "step": 22786 + }, + { + "epoch": 12.730167597765362, + "grad_norm": 0.45097917318344116, + "learning_rate": 0.00036487394957983196, + "loss": 0.4052, + "step": 22787 + }, + { + "epoch": 12.73072625698324, + "grad_norm": 10.076991081237793, + "learning_rate": 0.0003648459383753501, + "loss": 0.4625, + "step": 22788 + }, + { + "epoch": 12.731284916201117, + "grad_norm": 0.5700581669807434, + "learning_rate": 0.0003648179271708684, + "loss": 0.3814, + "step": 22789 + }, + { + "epoch": 12.731843575418994, + "grad_norm": 0.8671738505363464, + "learning_rate": 0.0003647899159663866, + "loss": 0.4473, + "step": 22790 + }, + { + "epoch": 12.732402234636872, + "grad_norm": 0.39406871795654297, + "learning_rate": 0.0003647619047619048, + "loss": 0.374, + "step": 22791 + }, + { + "epoch": 12.732960893854749, + "grad_norm": 0.3909502625465393, + "learning_rate": 0.000364733893557423, + "loss": 0.4332, + "step": 22792 + }, + { + "epoch": 12.733519553072625, + "grad_norm": 0.561922550201416, + "learning_rate": 0.00036470588235294114, + "loss": 0.388, + "step": 22793 + }, + { + "epoch": 12.734078212290502, + "grad_norm": 1.209590196609497, + "learning_rate": 0.0003646778711484594, + "loss": 0.3889, + "step": 22794 + }, + { + "epoch": 12.73463687150838, + "grad_norm": 0.591681182384491, + "learning_rate": 0.0003646498599439776, + "loss": 0.3201, + "step": 22795 + }, + { + "epoch": 12.735195530726257, + "grad_norm": 2.4374759197235107, + "learning_rate": 0.0003646218487394958, + "loss": 0.3817, + "step": 22796 + }, + { + "epoch": 12.735754189944133, + "grad_norm": 0.6187816858291626, + "learning_rate": 0.000364593837535014, + "loss": 0.3821, + "step": 22797 + }, + { + "epoch": 12.736312849162012, + "grad_norm": 0.3471083343029022, + "learning_rate": 0.00036456582633053223, + "loss": 0.4019, + "step": 22798 + }, + { + "epoch": 12.736871508379888, + "grad_norm": 0.3987346589565277, + "learning_rate": 0.00036453781512605043, + "loss": 0.4526, + "step": 22799 + }, + { + "epoch": 12.737430167597765, + "grad_norm": 0.5573232173919678, + "learning_rate": 0.00036450980392156864, + "loss": 0.5068, + "step": 22800 + }, + { + "epoch": 12.737988826815643, + "grad_norm": 0.7180553674697876, + "learning_rate": 0.00036448179271708685, + "loss": 0.4944, + "step": 22801 + }, + { + "epoch": 12.73854748603352, + "grad_norm": 0.45446667075157166, + "learning_rate": 0.00036445378151260505, + "loss": 0.431, + "step": 22802 + }, + { + "epoch": 12.739106145251396, + "grad_norm": 4.7218708992004395, + "learning_rate": 0.00036442577030812326, + "loss": 0.3109, + "step": 22803 + }, + { + "epoch": 12.739664804469275, + "grad_norm": 0.5455268025398254, + "learning_rate": 0.00036439775910364146, + "loss": 0.5835, + "step": 22804 + }, + { + "epoch": 12.740223463687151, + "grad_norm": 0.6029795408248901, + "learning_rate": 0.00036436974789915967, + "loss": 0.4175, + "step": 22805 + }, + { + "epoch": 12.740782122905028, + "grad_norm": 1.880365014076233, + "learning_rate": 0.00036434173669467793, + "loss": 0.4707, + "step": 22806 + }, + { + "epoch": 12.741340782122904, + "grad_norm": 0.50532066822052, + "learning_rate": 0.0003643137254901961, + "loss": 0.4525, + "step": 22807 + }, + { + "epoch": 12.741899441340783, + "grad_norm": 0.5116075277328491, + "learning_rate": 0.0003642857142857143, + "loss": 0.473, + "step": 22808 + }, + { + "epoch": 12.74245810055866, + "grad_norm": 0.6967884302139282, + "learning_rate": 0.0003642577030812325, + "loss": 0.6258, + "step": 22809 + }, + { + "epoch": 12.743016759776536, + "grad_norm": 0.6727312803268433, + "learning_rate": 0.0003642296918767507, + "loss": 0.3504, + "step": 22810 + }, + { + "epoch": 12.743575418994414, + "grad_norm": 0.6625656485557556, + "learning_rate": 0.00036420168067226896, + "loss": 0.6394, + "step": 22811 + }, + { + "epoch": 12.74413407821229, + "grad_norm": 0.567778468132019, + "learning_rate": 0.0003641736694677871, + "loss": 0.5532, + "step": 22812 + }, + { + "epoch": 12.744692737430167, + "grad_norm": 2.000669240951538, + "learning_rate": 0.0003641456582633053, + "loss": 0.3604, + "step": 22813 + }, + { + "epoch": 12.745251396648044, + "grad_norm": 1.4496207237243652, + "learning_rate": 0.0003641176470588236, + "loss": 0.455, + "step": 22814 + }, + { + "epoch": 12.745810055865922, + "grad_norm": 0.5075682997703552, + "learning_rate": 0.00036408963585434173, + "loss": 0.3656, + "step": 22815 + }, + { + "epoch": 12.746368715083799, + "grad_norm": 0.41381144523620605, + "learning_rate": 0.00036406162464985994, + "loss": 0.4398, + "step": 22816 + }, + { + "epoch": 12.746927374301675, + "grad_norm": 0.4415242075920105, + "learning_rate": 0.00036403361344537814, + "loss": 0.443, + "step": 22817 + }, + { + "epoch": 12.747486033519554, + "grad_norm": 1.0416975021362305, + "learning_rate": 0.00036400560224089635, + "loss": 0.4336, + "step": 22818 + }, + { + "epoch": 12.74804469273743, + "grad_norm": 0.6818392276763916, + "learning_rate": 0.0003639775910364146, + "loss": 0.4606, + "step": 22819 + }, + { + "epoch": 12.748603351955307, + "grad_norm": 0.4275626838207245, + "learning_rate": 0.00036394957983193276, + "loss": 0.4179, + "step": 22820 + }, + { + "epoch": 12.749162011173185, + "grad_norm": 0.6371297240257263, + "learning_rate": 0.00036392156862745097, + "loss": 0.4707, + "step": 22821 + }, + { + "epoch": 12.749720670391062, + "grad_norm": 0.6007310152053833, + "learning_rate": 0.0003638935574229692, + "loss": 0.5097, + "step": 22822 + }, + { + "epoch": 12.750279329608938, + "grad_norm": 0.7302827835083008, + "learning_rate": 0.0003638655462184874, + "loss": 0.4123, + "step": 22823 + }, + { + "epoch": 12.750837988826815, + "grad_norm": 0.4860741198062897, + "learning_rate": 0.00036383753501400564, + "loss": 0.4368, + "step": 22824 + }, + { + "epoch": 12.751396648044693, + "grad_norm": 0.377386212348938, + "learning_rate": 0.0003638095238095238, + "loss": 0.4169, + "step": 22825 + }, + { + "epoch": 12.75195530726257, + "grad_norm": 0.7121120691299438, + "learning_rate": 0.000363781512605042, + "loss": 0.4321, + "step": 22826 + }, + { + "epoch": 12.752513966480446, + "grad_norm": 0.44249460101127625, + "learning_rate": 0.00036375350140056026, + "loss": 0.4036, + "step": 22827 + }, + { + "epoch": 12.753072625698325, + "grad_norm": 0.6371538639068604, + "learning_rate": 0.0003637254901960784, + "loss": 0.5075, + "step": 22828 + }, + { + "epoch": 12.753631284916201, + "grad_norm": 1.1038780212402344, + "learning_rate": 0.00036369747899159667, + "loss": 0.3929, + "step": 22829 + }, + { + "epoch": 12.754189944134078, + "grad_norm": 0.6869402527809143, + "learning_rate": 0.0003636694677871149, + "loss": 0.3606, + "step": 22830 + }, + { + "epoch": 12.754748603351956, + "grad_norm": 0.5216014981269836, + "learning_rate": 0.000363641456582633, + "loss": 0.3238, + "step": 22831 + }, + { + "epoch": 12.755307262569833, + "grad_norm": 0.3676243722438812, + "learning_rate": 0.0003636134453781513, + "loss": 0.3371, + "step": 22832 + }, + { + "epoch": 12.75586592178771, + "grad_norm": 0.4803237020969391, + "learning_rate": 0.00036358543417366944, + "loss": 0.4185, + "step": 22833 + }, + { + "epoch": 12.756424581005586, + "grad_norm": 0.4397362768650055, + "learning_rate": 0.0003635574229691877, + "loss": 0.4349, + "step": 22834 + }, + { + "epoch": 12.756983240223464, + "grad_norm": 0.9142764210700989, + "learning_rate": 0.0003635294117647059, + "loss": 0.4274, + "step": 22835 + }, + { + "epoch": 12.75754189944134, + "grad_norm": 0.6187541484832764, + "learning_rate": 0.00036350140056022406, + "loss": 0.2993, + "step": 22836 + }, + { + "epoch": 12.758100558659217, + "grad_norm": 0.6531397104263306, + "learning_rate": 0.0003634733893557423, + "loss": 0.4021, + "step": 22837 + }, + { + "epoch": 12.758659217877096, + "grad_norm": 1.6469725370407104, + "learning_rate": 0.0003634453781512605, + "loss": 0.4299, + "step": 22838 + }, + { + "epoch": 12.759217877094972, + "grad_norm": 0.42806869745254517, + "learning_rate": 0.00036341736694677873, + "loss": 0.3581, + "step": 22839 + }, + { + "epoch": 12.759776536312849, + "grad_norm": 0.6976181268692017, + "learning_rate": 0.00036338935574229693, + "loss": 0.4947, + "step": 22840 + }, + { + "epoch": 12.760335195530725, + "grad_norm": 0.469889760017395, + "learning_rate": 0.0003633613445378151, + "loss": 0.3771, + "step": 22841 + }, + { + "epoch": 12.760893854748604, + "grad_norm": 0.5360656380653381, + "learning_rate": 0.00036333333333333335, + "loss": 0.4259, + "step": 22842 + }, + { + "epoch": 12.76145251396648, + "grad_norm": 0.8430083394050598, + "learning_rate": 0.00036330532212885155, + "loss": 0.4319, + "step": 22843 + }, + { + "epoch": 12.762011173184357, + "grad_norm": 0.5406758189201355, + "learning_rate": 0.00036327731092436976, + "loss": 0.4406, + "step": 22844 + }, + { + "epoch": 12.762569832402235, + "grad_norm": 0.5195489525794983, + "learning_rate": 0.00036324929971988796, + "loss": 0.2963, + "step": 22845 + }, + { + "epoch": 12.763128491620112, + "grad_norm": 0.5169768333435059, + "learning_rate": 0.00036322128851540617, + "loss": 0.3281, + "step": 22846 + }, + { + "epoch": 12.763687150837988, + "grad_norm": 0.39881834387779236, + "learning_rate": 0.0003631932773109244, + "loss": 0.3603, + "step": 22847 + }, + { + "epoch": 12.764245810055867, + "grad_norm": 0.4261707365512848, + "learning_rate": 0.0003631652661064426, + "loss": 0.449, + "step": 22848 + }, + { + "epoch": 12.764804469273743, + "grad_norm": 0.41769784688949585, + "learning_rate": 0.0003631372549019608, + "loss": 0.3626, + "step": 22849 + }, + { + "epoch": 12.76536312849162, + "grad_norm": 0.4486564099788666, + "learning_rate": 0.000363109243697479, + "loss": 0.4275, + "step": 22850 + }, + { + "epoch": 12.765921787709498, + "grad_norm": 0.4733645021915436, + "learning_rate": 0.0003630812324929972, + "loss": 0.3359, + "step": 22851 + }, + { + "epoch": 12.766480446927375, + "grad_norm": 0.5320999622344971, + "learning_rate": 0.0003630532212885154, + "loss": 0.4027, + "step": 22852 + }, + { + "epoch": 12.767039106145251, + "grad_norm": 0.48368027806282043, + "learning_rate": 0.0003630252100840336, + "loss": 0.3766, + "step": 22853 + }, + { + "epoch": 12.767597765363128, + "grad_norm": 0.5685689449310303, + "learning_rate": 0.00036299719887955187, + "loss": 0.4203, + "step": 22854 + }, + { + "epoch": 12.768156424581006, + "grad_norm": 0.40302178263664246, + "learning_rate": 0.00036296918767507, + "loss": 0.3017, + "step": 22855 + }, + { + "epoch": 12.768715083798883, + "grad_norm": 0.7145180702209473, + "learning_rate": 0.00036294117647058823, + "loss": 0.3863, + "step": 22856 + }, + { + "epoch": 12.76927374301676, + "grad_norm": 0.48648586869239807, + "learning_rate": 0.00036291316526610644, + "loss": 0.3131, + "step": 22857 + }, + { + "epoch": 12.769832402234638, + "grad_norm": 0.574644148349762, + "learning_rate": 0.00036288515406162464, + "loss": 0.3018, + "step": 22858 + }, + { + "epoch": 12.770391061452514, + "grad_norm": 0.4302155673503876, + "learning_rate": 0.0003628571428571429, + "loss": 0.4593, + "step": 22859 + }, + { + "epoch": 12.77094972067039, + "grad_norm": 0.49935561418533325, + "learning_rate": 0.00036282913165266105, + "loss": 0.333, + "step": 22860 + }, + { + "epoch": 12.771508379888267, + "grad_norm": 0.5210148692131042, + "learning_rate": 0.00036280112044817926, + "loss": 0.3756, + "step": 22861 + }, + { + "epoch": 12.772067039106146, + "grad_norm": 0.4947070777416229, + "learning_rate": 0.0003627731092436975, + "loss": 0.4755, + "step": 22862 + }, + { + "epoch": 12.772625698324022, + "grad_norm": 0.3514373302459717, + "learning_rate": 0.00036274509803921567, + "loss": 0.3161, + "step": 22863 + }, + { + "epoch": 12.773184357541899, + "grad_norm": 0.4451432228088379, + "learning_rate": 0.00036271708683473393, + "loss": 0.4335, + "step": 22864 + }, + { + "epoch": 12.773743016759777, + "grad_norm": 0.7320363521575928, + "learning_rate": 0.0003626890756302521, + "loss": 0.4721, + "step": 22865 + }, + { + "epoch": 12.774301675977654, + "grad_norm": 0.41618093848228455, + "learning_rate": 0.0003626610644257703, + "loss": 0.3359, + "step": 22866 + }, + { + "epoch": 12.77486033519553, + "grad_norm": 1.3386528491973877, + "learning_rate": 0.00036263305322128855, + "loss": 0.517, + "step": 22867 + }, + { + "epoch": 12.775418994413407, + "grad_norm": 0.38059747219085693, + "learning_rate": 0.0003626050420168067, + "loss": 0.3733, + "step": 22868 + }, + { + "epoch": 12.775977653631285, + "grad_norm": 0.8537725806236267, + "learning_rate": 0.00036257703081232496, + "loss": 0.3632, + "step": 22869 + }, + { + "epoch": 12.776536312849162, + "grad_norm": 0.7909241318702698, + "learning_rate": 0.00036254901960784317, + "loss": 0.5222, + "step": 22870 + }, + { + "epoch": 12.777094972067038, + "grad_norm": 0.5452353358268738, + "learning_rate": 0.0003625210084033613, + "loss": 0.4052, + "step": 22871 + }, + { + "epoch": 12.777653631284917, + "grad_norm": 0.5343122482299805, + "learning_rate": 0.0003624929971988796, + "loss": 0.4686, + "step": 22872 + }, + { + "epoch": 12.778212290502793, + "grad_norm": 0.4161309003829956, + "learning_rate": 0.00036246498599439773, + "loss": 0.3909, + "step": 22873 + }, + { + "epoch": 12.77877094972067, + "grad_norm": 0.5088701248168945, + "learning_rate": 0.000362436974789916, + "loss": 0.4229, + "step": 22874 + }, + { + "epoch": 12.779329608938548, + "grad_norm": 0.6684455275535583, + "learning_rate": 0.0003624089635854342, + "loss": 0.4262, + "step": 22875 + }, + { + "epoch": 12.779888268156425, + "grad_norm": 0.39843887090682983, + "learning_rate": 0.00036238095238095235, + "loss": 0.3187, + "step": 22876 + }, + { + "epoch": 12.780446927374301, + "grad_norm": 0.4985075294971466, + "learning_rate": 0.0003623529411764706, + "loss": 0.3945, + "step": 22877 + }, + { + "epoch": 12.78100558659218, + "grad_norm": 0.33783870935440063, + "learning_rate": 0.0003623249299719888, + "loss": 0.3238, + "step": 22878 + }, + { + "epoch": 12.781564245810056, + "grad_norm": 0.5383481383323669, + "learning_rate": 0.000362296918767507, + "loss": 0.3598, + "step": 22879 + }, + { + "epoch": 12.782122905027933, + "grad_norm": 0.40740877389907837, + "learning_rate": 0.00036226890756302523, + "loss": 0.3878, + "step": 22880 + }, + { + "epoch": 12.78268156424581, + "grad_norm": 3.1352524757385254, + "learning_rate": 0.0003622408963585434, + "loss": 0.4102, + "step": 22881 + }, + { + "epoch": 12.783240223463688, + "grad_norm": 0.5671041011810303, + "learning_rate": 0.00036221288515406164, + "loss": 0.5092, + "step": 22882 + }, + { + "epoch": 12.783798882681564, + "grad_norm": 0.6266922950744629, + "learning_rate": 0.00036218487394957985, + "loss": 0.4234, + "step": 22883 + }, + { + "epoch": 12.78435754189944, + "grad_norm": 4.076899528503418, + "learning_rate": 0.00036215686274509805, + "loss": 0.3562, + "step": 22884 + }, + { + "epoch": 12.78491620111732, + "grad_norm": 0.8048804402351379, + "learning_rate": 0.00036212885154061626, + "loss": 0.581, + "step": 22885 + }, + { + "epoch": 12.785474860335196, + "grad_norm": 0.4180176854133606, + "learning_rate": 0.00036210084033613446, + "loss": 0.4466, + "step": 22886 + }, + { + "epoch": 12.786033519553072, + "grad_norm": 1.5027180910110474, + "learning_rate": 0.00036207282913165267, + "loss": 0.4641, + "step": 22887 + }, + { + "epoch": 12.786592178770949, + "grad_norm": 0.49694186449050903, + "learning_rate": 0.0003620448179271709, + "loss": 0.3743, + "step": 22888 + }, + { + "epoch": 12.787150837988827, + "grad_norm": 0.569611668586731, + "learning_rate": 0.0003620168067226891, + "loss": 0.4991, + "step": 22889 + }, + { + "epoch": 12.787709497206704, + "grad_norm": 0.4208122491836548, + "learning_rate": 0.0003619887955182073, + "loss": 0.4373, + "step": 22890 + }, + { + "epoch": 12.78826815642458, + "grad_norm": 0.350697785615921, + "learning_rate": 0.0003619607843137255, + "loss": 0.336, + "step": 22891 + }, + { + "epoch": 12.788826815642459, + "grad_norm": 0.39987149834632874, + "learning_rate": 0.0003619327731092437, + "loss": 0.3555, + "step": 22892 + }, + { + "epoch": 12.789385474860335, + "grad_norm": 0.4770813584327698, + "learning_rate": 0.0003619047619047619, + "loss": 0.3663, + "step": 22893 + }, + { + "epoch": 12.789944134078212, + "grad_norm": 0.4965968728065491, + "learning_rate": 0.00036187675070028017, + "loss": 0.3886, + "step": 22894 + }, + { + "epoch": 12.79050279329609, + "grad_norm": 0.5654256939888, + "learning_rate": 0.0003618487394957983, + "loss": 0.4519, + "step": 22895 + }, + { + "epoch": 12.791061452513967, + "grad_norm": 0.39105844497680664, + "learning_rate": 0.0003618207282913165, + "loss": 0.3464, + "step": 22896 + }, + { + "epoch": 12.791620111731843, + "grad_norm": 0.39361634850502014, + "learning_rate": 0.00036179271708683473, + "loss": 0.3567, + "step": 22897 + }, + { + "epoch": 12.79217877094972, + "grad_norm": 0.5732298493385315, + "learning_rate": 0.00036176470588235294, + "loss": 0.3563, + "step": 22898 + }, + { + "epoch": 12.792737430167598, + "grad_norm": 0.8701691627502441, + "learning_rate": 0.0003617366946778712, + "loss": 0.4507, + "step": 22899 + }, + { + "epoch": 12.793296089385475, + "grad_norm": 0.83237624168396, + "learning_rate": 0.00036170868347338935, + "loss": 0.4936, + "step": 22900 + }, + { + "epoch": 12.793854748603351, + "grad_norm": 0.666620135307312, + "learning_rate": 0.00036168067226890755, + "loss": 0.465, + "step": 22901 + }, + { + "epoch": 12.79441340782123, + "grad_norm": 0.42541056871414185, + "learning_rate": 0.0003616526610644258, + "loss": 0.3368, + "step": 22902 + }, + { + "epoch": 12.794972067039106, + "grad_norm": 0.462836891412735, + "learning_rate": 0.00036162464985994397, + "loss": 0.4149, + "step": 22903 + }, + { + "epoch": 12.795530726256983, + "grad_norm": 2.050976514816284, + "learning_rate": 0.0003615966386554622, + "loss": 0.8036, + "step": 22904 + }, + { + "epoch": 12.796089385474861, + "grad_norm": 0.47639819979667664, + "learning_rate": 0.0003615686274509804, + "loss": 0.5363, + "step": 22905 + }, + { + "epoch": 12.796648044692738, + "grad_norm": 3.898019790649414, + "learning_rate": 0.0003615406162464986, + "loss": 0.4147, + "step": 22906 + }, + { + "epoch": 12.797206703910614, + "grad_norm": 0.48763981461524963, + "learning_rate": 0.00036151260504201684, + "loss": 0.5244, + "step": 22907 + }, + { + "epoch": 12.797765363128491, + "grad_norm": 0.6783661246299744, + "learning_rate": 0.000361484593837535, + "loss": 0.3365, + "step": 22908 + }, + { + "epoch": 12.79832402234637, + "grad_norm": 0.5876408219337463, + "learning_rate": 0.00036145658263305326, + "loss": 0.4194, + "step": 22909 + }, + { + "epoch": 12.798882681564246, + "grad_norm": 0.44247081875801086, + "learning_rate": 0.00036142857142857146, + "loss": 0.4621, + "step": 22910 + }, + { + "epoch": 12.799441340782122, + "grad_norm": 0.3946887254714966, + "learning_rate": 0.0003614005602240896, + "loss": 0.3529, + "step": 22911 + }, + { + "epoch": 12.8, + "grad_norm": 0.6611924171447754, + "learning_rate": 0.0003613725490196079, + "loss": 0.4283, + "step": 22912 + }, + { + "epoch": 12.800558659217877, + "grad_norm": 0.8747823238372803, + "learning_rate": 0.000361344537815126, + "loss": 0.6711, + "step": 22913 + }, + { + "epoch": 12.801117318435754, + "grad_norm": 0.8581296801567078, + "learning_rate": 0.0003613165266106443, + "loss": 0.3636, + "step": 22914 + }, + { + "epoch": 12.80167597765363, + "grad_norm": 0.4482992887496948, + "learning_rate": 0.0003612885154061625, + "loss": 0.4435, + "step": 22915 + }, + { + "epoch": 12.802234636871509, + "grad_norm": 0.4412821829319, + "learning_rate": 0.00036126050420168064, + "loss": 0.4349, + "step": 22916 + }, + { + "epoch": 12.802793296089385, + "grad_norm": 0.5532223582267761, + "learning_rate": 0.0003612324929971989, + "loss": 0.3913, + "step": 22917 + }, + { + "epoch": 12.803351955307262, + "grad_norm": 3.270271062850952, + "learning_rate": 0.0003612044817927171, + "loss": 0.4952, + "step": 22918 + }, + { + "epoch": 12.80391061452514, + "grad_norm": 0.8067111372947693, + "learning_rate": 0.0003611764705882353, + "loss": 0.5013, + "step": 22919 + }, + { + "epoch": 12.804469273743017, + "grad_norm": 0.7011149525642395, + "learning_rate": 0.0003611484593837535, + "loss": 0.496, + "step": 22920 + }, + { + "epoch": 12.805027932960893, + "grad_norm": 0.6165128946304321, + "learning_rate": 0.0003611204481792717, + "loss": 0.4906, + "step": 22921 + }, + { + "epoch": 12.805586592178772, + "grad_norm": 0.3503299653530121, + "learning_rate": 0.00036109243697478993, + "loss": 0.2899, + "step": 22922 + }, + { + "epoch": 12.806145251396648, + "grad_norm": 0.5084133744239807, + "learning_rate": 0.00036106442577030814, + "loss": 0.4163, + "step": 22923 + }, + { + "epoch": 12.806703910614525, + "grad_norm": 0.7758415341377258, + "learning_rate": 0.00036103641456582635, + "loss": 0.3803, + "step": 22924 + }, + { + "epoch": 12.807262569832401, + "grad_norm": 0.43046730756759644, + "learning_rate": 0.00036100840336134455, + "loss": 0.3943, + "step": 22925 + }, + { + "epoch": 12.80782122905028, + "grad_norm": 0.7915734052658081, + "learning_rate": 0.00036098039215686276, + "loss": 0.3862, + "step": 22926 + }, + { + "epoch": 12.808379888268156, + "grad_norm": 0.8833620548248291, + "learning_rate": 0.00036095238095238096, + "loss": 0.407, + "step": 22927 + }, + { + "epoch": 12.808938547486033, + "grad_norm": 2.662686824798584, + "learning_rate": 0.00036092436974789917, + "loss": 0.5277, + "step": 22928 + }, + { + "epoch": 12.809497206703911, + "grad_norm": 0.7573880553245544, + "learning_rate": 0.0003608963585434173, + "loss": 0.4419, + "step": 22929 + }, + { + "epoch": 12.810055865921788, + "grad_norm": 0.4178950786590576, + "learning_rate": 0.0003608683473389356, + "loss": 0.4042, + "step": 22930 + }, + { + "epoch": 12.810614525139664, + "grad_norm": 2.9414188861846924, + "learning_rate": 0.0003608403361344538, + "loss": 0.4143, + "step": 22931 + }, + { + "epoch": 12.811173184357543, + "grad_norm": 0.43912386894226074, + "learning_rate": 0.000360812324929972, + "loss": 0.3899, + "step": 22932 + }, + { + "epoch": 12.81173184357542, + "grad_norm": 1.2010248899459839, + "learning_rate": 0.0003607843137254902, + "loss": 0.4636, + "step": 22933 + }, + { + "epoch": 12.812290502793296, + "grad_norm": 0.5587520599365234, + "learning_rate": 0.0003607563025210084, + "loss": 0.329, + "step": 22934 + }, + { + "epoch": 12.812849162011172, + "grad_norm": 0.44351106882095337, + "learning_rate": 0.0003607282913165266, + "loss": 0.4201, + "step": 22935 + }, + { + "epoch": 12.81340782122905, + "grad_norm": 0.5449080467224121, + "learning_rate": 0.0003607002801120448, + "loss": 0.3573, + "step": 22936 + }, + { + "epoch": 12.813966480446927, + "grad_norm": 1.6266264915466309, + "learning_rate": 0.000360672268907563, + "loss": 0.485, + "step": 22937 + }, + { + "epoch": 12.814525139664804, + "grad_norm": 0.44367125630378723, + "learning_rate": 0.00036064425770308123, + "loss": 0.3097, + "step": 22938 + }, + { + "epoch": 12.815083798882682, + "grad_norm": 0.37211188673973083, + "learning_rate": 0.00036061624649859944, + "loss": 0.3652, + "step": 22939 + }, + { + "epoch": 12.815642458100559, + "grad_norm": 0.8515840768814087, + "learning_rate": 0.00036058823529411764, + "loss": 0.5518, + "step": 22940 + }, + { + "epoch": 12.816201117318435, + "grad_norm": 0.7962684035301208, + "learning_rate": 0.00036056022408963585, + "loss": 0.3652, + "step": 22941 + }, + { + "epoch": 12.816759776536312, + "grad_norm": 0.37861326336860657, + "learning_rate": 0.0003605322128851541, + "loss": 0.4161, + "step": 22942 + }, + { + "epoch": 12.81731843575419, + "grad_norm": 0.3585122525691986, + "learning_rate": 0.00036050420168067226, + "loss": 0.3956, + "step": 22943 + }, + { + "epoch": 12.817877094972067, + "grad_norm": 0.634141206741333, + "learning_rate": 0.00036047619047619047, + "loss": 0.3857, + "step": 22944 + }, + { + "epoch": 12.818435754189943, + "grad_norm": 0.5188333988189697, + "learning_rate": 0.00036044817927170867, + "loss": 0.4066, + "step": 22945 + }, + { + "epoch": 12.818994413407822, + "grad_norm": 0.5290367007255554, + "learning_rate": 0.0003604201680672269, + "loss": 0.3227, + "step": 22946 + }, + { + "epoch": 12.819553072625698, + "grad_norm": 0.7687883973121643, + "learning_rate": 0.00036039215686274514, + "loss": 0.4216, + "step": 22947 + }, + { + "epoch": 12.820111731843575, + "grad_norm": 0.4438192844390869, + "learning_rate": 0.0003603641456582633, + "loss": 0.5563, + "step": 22948 + }, + { + "epoch": 12.820670391061453, + "grad_norm": 0.5439226031303406, + "learning_rate": 0.0003603361344537815, + "loss": 0.3598, + "step": 22949 + }, + { + "epoch": 12.82122905027933, + "grad_norm": 0.5417304039001465, + "learning_rate": 0.00036030812324929976, + "loss": 0.4334, + "step": 22950 + }, + { + "epoch": 12.821787709497206, + "grad_norm": 0.4443223178386688, + "learning_rate": 0.0003602801120448179, + "loss": 0.4263, + "step": 22951 + }, + { + "epoch": 12.822346368715085, + "grad_norm": 0.3897508978843689, + "learning_rate": 0.00036025210084033617, + "loss": 0.3734, + "step": 22952 + }, + { + "epoch": 12.822905027932961, + "grad_norm": 0.4214841425418854, + "learning_rate": 0.0003602240896358543, + "loss": 0.4257, + "step": 22953 + }, + { + "epoch": 12.823463687150838, + "grad_norm": 0.4963054060935974, + "learning_rate": 0.0003601960784313725, + "loss": 0.3706, + "step": 22954 + }, + { + "epoch": 12.824022346368714, + "grad_norm": 0.8631716370582581, + "learning_rate": 0.0003601680672268908, + "loss": 0.3748, + "step": 22955 + }, + { + "epoch": 12.824581005586593, + "grad_norm": 0.49519115686416626, + "learning_rate": 0.00036014005602240894, + "loss": 0.3873, + "step": 22956 + }, + { + "epoch": 12.82513966480447, + "grad_norm": 1.7033321857452393, + "learning_rate": 0.0003601120448179272, + "loss": 0.5147, + "step": 22957 + }, + { + "epoch": 12.825698324022346, + "grad_norm": 0.5086231827735901, + "learning_rate": 0.0003600840336134454, + "loss": 0.3751, + "step": 22958 + }, + { + "epoch": 12.826256983240224, + "grad_norm": 2.3101048469543457, + "learning_rate": 0.00036005602240896356, + "loss": 0.3818, + "step": 22959 + }, + { + "epoch": 12.8268156424581, + "grad_norm": 0.3681088387966156, + "learning_rate": 0.0003600280112044818, + "loss": 0.345, + "step": 22960 + }, + { + "epoch": 12.827374301675977, + "grad_norm": 0.560340166091919, + "learning_rate": 0.00035999999999999997, + "loss": 0.3633, + "step": 22961 + }, + { + "epoch": 12.827932960893854, + "grad_norm": 0.9640902280807495, + "learning_rate": 0.00035997198879551823, + "loss": 0.426, + "step": 22962 + }, + { + "epoch": 12.828491620111732, + "grad_norm": 0.5351158380508423, + "learning_rate": 0.00035994397759103643, + "loss": 0.3972, + "step": 22963 + }, + { + "epoch": 12.829050279329609, + "grad_norm": 0.5665839910507202, + "learning_rate": 0.0003599159663865546, + "loss": 0.4003, + "step": 22964 + }, + { + "epoch": 12.829608938547485, + "grad_norm": 0.48218879103660583, + "learning_rate": 0.00035988795518207285, + "loss": 0.3298, + "step": 22965 + }, + { + "epoch": 12.830167597765364, + "grad_norm": 0.5230398178100586, + "learning_rate": 0.00035985994397759105, + "loss": 0.5165, + "step": 22966 + }, + { + "epoch": 12.83072625698324, + "grad_norm": 0.6515301465988159, + "learning_rate": 0.00035983193277310926, + "loss": 0.4461, + "step": 22967 + }, + { + "epoch": 12.831284916201117, + "grad_norm": 0.8557658195495605, + "learning_rate": 0.00035980392156862746, + "loss": 0.4364, + "step": 22968 + }, + { + "epoch": 12.831843575418995, + "grad_norm": 0.4500856101512909, + "learning_rate": 0.0003597759103641456, + "loss": 0.3393, + "step": 22969 + }, + { + "epoch": 12.832402234636872, + "grad_norm": 0.5157396793365479, + "learning_rate": 0.0003597478991596639, + "loss": 0.3789, + "step": 22970 + }, + { + "epoch": 12.832960893854748, + "grad_norm": 1.061504602432251, + "learning_rate": 0.0003597198879551821, + "loss": 0.4093, + "step": 22971 + }, + { + "epoch": 12.833519553072625, + "grad_norm": 1.2061611413955688, + "learning_rate": 0.0003596918767507003, + "loss": 0.3858, + "step": 22972 + }, + { + "epoch": 12.834078212290503, + "grad_norm": 0.369796484708786, + "learning_rate": 0.0003596638655462185, + "loss": 0.3654, + "step": 22973 + }, + { + "epoch": 12.83463687150838, + "grad_norm": 0.5278509259223938, + "learning_rate": 0.0003596358543417367, + "loss": 0.4444, + "step": 22974 + }, + { + "epoch": 12.835195530726256, + "grad_norm": 0.7993066906929016, + "learning_rate": 0.0003596078431372549, + "loss": 0.435, + "step": 22975 + }, + { + "epoch": 12.835754189944135, + "grad_norm": 0.4012240171432495, + "learning_rate": 0.0003595798319327731, + "loss": 0.3085, + "step": 22976 + }, + { + "epoch": 12.836312849162011, + "grad_norm": 0.49167600274086, + "learning_rate": 0.0003595518207282913, + "loss": 0.4289, + "step": 22977 + }, + { + "epoch": 12.836871508379888, + "grad_norm": 0.6099218130111694, + "learning_rate": 0.0003595238095238095, + "loss": 0.4413, + "step": 22978 + }, + { + "epoch": 12.837430167597766, + "grad_norm": 0.38181930780410767, + "learning_rate": 0.00035949579831932773, + "loss": 0.341, + "step": 22979 + }, + { + "epoch": 12.837988826815643, + "grad_norm": 0.3030468821525574, + "learning_rate": 0.00035946778711484594, + "loss": 0.3439, + "step": 22980 + }, + { + "epoch": 12.83854748603352, + "grad_norm": 0.5551451444625854, + "learning_rate": 0.00035943977591036414, + "loss": 0.3628, + "step": 22981 + }, + { + "epoch": 12.839106145251396, + "grad_norm": 0.5391368269920349, + "learning_rate": 0.0003594117647058824, + "loss": 0.4453, + "step": 22982 + }, + { + "epoch": 12.839664804469274, + "grad_norm": 0.4426036477088928, + "learning_rate": 0.00035938375350140055, + "loss": 0.4381, + "step": 22983 + }, + { + "epoch": 12.84022346368715, + "grad_norm": 0.7820568680763245, + "learning_rate": 0.00035935574229691876, + "loss": 0.4928, + "step": 22984 + }, + { + "epoch": 12.840782122905027, + "grad_norm": 0.5407456755638123, + "learning_rate": 0.00035932773109243697, + "loss": 0.4277, + "step": 22985 + }, + { + "epoch": 12.841340782122906, + "grad_norm": 0.44331902265548706, + "learning_rate": 0.00035929971988795517, + "loss": 0.3223, + "step": 22986 + }, + { + "epoch": 12.841899441340782, + "grad_norm": 0.6615450382232666, + "learning_rate": 0.00035927170868347343, + "loss": 0.5106, + "step": 22987 + }, + { + "epoch": 12.842458100558659, + "grad_norm": 0.6417344212532043, + "learning_rate": 0.0003592436974789916, + "loss": 0.4632, + "step": 22988 + }, + { + "epoch": 12.843016759776535, + "grad_norm": 0.4910949170589447, + "learning_rate": 0.0003592156862745098, + "loss": 0.4686, + "step": 22989 + }, + { + "epoch": 12.843575418994414, + "grad_norm": 0.6603783965110779, + "learning_rate": 0.00035918767507002805, + "loss": 0.4865, + "step": 22990 + }, + { + "epoch": 12.84413407821229, + "grad_norm": 0.36142870783805847, + "learning_rate": 0.0003591596638655462, + "loss": 0.3994, + "step": 22991 + }, + { + "epoch": 12.844692737430167, + "grad_norm": 0.3348754048347473, + "learning_rate": 0.00035913165266106446, + "loss": 0.2924, + "step": 22992 + }, + { + "epoch": 12.845251396648045, + "grad_norm": 0.38223209977149963, + "learning_rate": 0.0003591036414565826, + "loss": 0.3638, + "step": 22993 + }, + { + "epoch": 12.845810055865922, + "grad_norm": 4.185999393463135, + "learning_rate": 0.0003590756302521008, + "loss": 0.3598, + "step": 22994 + }, + { + "epoch": 12.846368715083798, + "grad_norm": 0.5923586487770081, + "learning_rate": 0.0003590476190476191, + "loss": 0.5234, + "step": 22995 + }, + { + "epoch": 12.846927374301677, + "grad_norm": 0.4538070857524872, + "learning_rate": 0.00035901960784313723, + "loss": 0.3444, + "step": 22996 + }, + { + "epoch": 12.847486033519553, + "grad_norm": 0.4295232892036438, + "learning_rate": 0.0003589915966386555, + "loss": 0.3595, + "step": 22997 + }, + { + "epoch": 12.84804469273743, + "grad_norm": 0.4248203933238983, + "learning_rate": 0.0003589635854341737, + "loss": 0.4056, + "step": 22998 + }, + { + "epoch": 12.848603351955306, + "grad_norm": 0.4672292470932007, + "learning_rate": 0.00035893557422969185, + "loss": 0.4036, + "step": 22999 + }, + { + "epoch": 12.849162011173185, + "grad_norm": 0.5464761257171631, + "learning_rate": 0.0003589075630252101, + "loss": 0.4748, + "step": 23000 + }, + { + "epoch": 12.849162011173185, + "eval_cer": 0.08653290126891645, + "eval_loss": 0.3306316137313843, + "eval_runtime": 55.5835, + "eval_samples_per_second": 81.643, + "eval_steps_per_second": 5.109, + "eval_wer": 0.34015809614256587, + "step": 23000 + }, + { + "epoch": 12.849720670391061, + "grad_norm": 0.40832653641700745, + "learning_rate": 0.00035887955182072826, + "loss": 0.3351, + "step": 23001 + }, + { + "epoch": 12.850279329608938, + "grad_norm": 0.419050395488739, + "learning_rate": 0.0003588515406162465, + "loss": 0.3767, + "step": 23002 + }, + { + "epoch": 12.850837988826816, + "grad_norm": 0.4054844379425049, + "learning_rate": 0.00035882352941176473, + "loss": 0.3843, + "step": 23003 + }, + { + "epoch": 12.851396648044693, + "grad_norm": 0.979784369468689, + "learning_rate": 0.0003587955182072829, + "loss": 0.3915, + "step": 23004 + }, + { + "epoch": 12.85195530726257, + "grad_norm": 0.8679656982421875, + "learning_rate": 0.00035876750700280114, + "loss": 0.3773, + "step": 23005 + }, + { + "epoch": 12.852513966480448, + "grad_norm": 0.4106649160385132, + "learning_rate": 0.00035873949579831935, + "loss": 0.4321, + "step": 23006 + }, + { + "epoch": 12.853072625698324, + "grad_norm": 0.4178391695022583, + "learning_rate": 0.00035871148459383755, + "loss": 0.4791, + "step": 23007 + }, + { + "epoch": 12.8536312849162, + "grad_norm": 1.3904122114181519, + "learning_rate": 0.00035868347338935576, + "loss": 0.3309, + "step": 23008 + }, + { + "epoch": 12.854189944134077, + "grad_norm": 0.5001409649848938, + "learning_rate": 0.0003586554621848739, + "loss": 0.4192, + "step": 23009 + }, + { + "epoch": 12.854748603351956, + "grad_norm": 0.5209900736808777, + "learning_rate": 0.00035862745098039217, + "loss": 0.4104, + "step": 23010 + }, + { + "epoch": 12.855307262569832, + "grad_norm": 0.4196328818798065, + "learning_rate": 0.0003585994397759104, + "loss": 0.3401, + "step": 23011 + }, + { + "epoch": 12.855865921787709, + "grad_norm": 0.5003356337547302, + "learning_rate": 0.0003585714285714286, + "loss": 0.5244, + "step": 23012 + }, + { + "epoch": 12.856424581005587, + "grad_norm": 0.46145743131637573, + "learning_rate": 0.0003585434173669468, + "loss": 0.3921, + "step": 23013 + }, + { + "epoch": 12.856983240223464, + "grad_norm": 0.5887634754180908, + "learning_rate": 0.000358515406162465, + "loss": 0.4759, + "step": 23014 + }, + { + "epoch": 12.85754189944134, + "grad_norm": 0.9846682548522949, + "learning_rate": 0.0003584873949579832, + "loss": 0.4528, + "step": 23015 + }, + { + "epoch": 12.858100558659217, + "grad_norm": 2.030968427658081, + "learning_rate": 0.0003584593837535014, + "loss": 0.4218, + "step": 23016 + }, + { + "epoch": 12.858659217877095, + "grad_norm": 0.9259823560714722, + "learning_rate": 0.00035843137254901967, + "loss": 0.6358, + "step": 23017 + }, + { + "epoch": 12.859217877094972, + "grad_norm": 0.5755375623703003, + "learning_rate": 0.0003584033613445378, + "loss": 0.4643, + "step": 23018 + }, + { + "epoch": 12.859776536312848, + "grad_norm": 2.5767340660095215, + "learning_rate": 0.000358375350140056, + "loss": 0.4682, + "step": 23019 + }, + { + "epoch": 12.860335195530727, + "grad_norm": 0.5438586473464966, + "learning_rate": 0.00035834733893557423, + "loss": 0.575, + "step": 23020 + }, + { + "epoch": 12.860893854748603, + "grad_norm": 1.0188018083572388, + "learning_rate": 0.00035831932773109244, + "loss": 0.5526, + "step": 23021 + }, + { + "epoch": 12.86145251396648, + "grad_norm": 0.6130913496017456, + "learning_rate": 0.0003582913165266107, + "loss": 0.4491, + "step": 23022 + }, + { + "epoch": 12.862011173184358, + "grad_norm": 0.6643819212913513, + "learning_rate": 0.00035826330532212885, + "loss": 0.5539, + "step": 23023 + }, + { + "epoch": 12.862569832402235, + "grad_norm": 0.3844551146030426, + "learning_rate": 0.00035823529411764705, + "loss": 0.3479, + "step": 23024 + }, + { + "epoch": 12.863128491620111, + "grad_norm": 1.7012395858764648, + "learning_rate": 0.0003582072829131653, + "loss": 0.4918, + "step": 23025 + }, + { + "epoch": 12.86368715083799, + "grad_norm": 0.6724205613136292, + "learning_rate": 0.00035817927170868347, + "loss": 0.3504, + "step": 23026 + }, + { + "epoch": 12.864245810055866, + "grad_norm": 0.6052526235580444, + "learning_rate": 0.0003581512605042017, + "loss": 0.5405, + "step": 23027 + }, + { + "epoch": 12.864804469273743, + "grad_norm": 0.707908570766449, + "learning_rate": 0.0003581232492997199, + "loss": 0.3254, + "step": 23028 + }, + { + "epoch": 12.86536312849162, + "grad_norm": 0.3449500799179077, + "learning_rate": 0.0003580952380952381, + "loss": 0.3633, + "step": 23029 + }, + { + "epoch": 12.865921787709498, + "grad_norm": 6.125621795654297, + "learning_rate": 0.00035806722689075634, + "loss": 0.5264, + "step": 23030 + }, + { + "epoch": 12.866480446927374, + "grad_norm": 0.4929667115211487, + "learning_rate": 0.0003580392156862745, + "loss": 0.37, + "step": 23031 + }, + { + "epoch": 12.867039106145251, + "grad_norm": 0.6589158177375793, + "learning_rate": 0.00035801120448179276, + "loss": 0.4408, + "step": 23032 + }, + { + "epoch": 12.86759776536313, + "grad_norm": 0.5033377408981323, + "learning_rate": 0.00035798319327731096, + "loss": 0.4684, + "step": 23033 + }, + { + "epoch": 12.868156424581006, + "grad_norm": 0.9584864974021912, + "learning_rate": 0.0003579551820728291, + "loss": 0.3205, + "step": 23034 + }, + { + "epoch": 12.868715083798882, + "grad_norm": 2.052773952484131, + "learning_rate": 0.0003579271708683474, + "loss": 0.3615, + "step": 23035 + }, + { + "epoch": 12.869273743016759, + "grad_norm": 0.3121645450592041, + "learning_rate": 0.0003578991596638655, + "loss": 0.2918, + "step": 23036 + }, + { + "epoch": 12.869832402234637, + "grad_norm": 0.501052737236023, + "learning_rate": 0.0003578711484593838, + "loss": 0.3291, + "step": 23037 + }, + { + "epoch": 12.870391061452514, + "grad_norm": 0.699521541595459, + "learning_rate": 0.000357843137254902, + "loss": 0.3831, + "step": 23038 + }, + { + "epoch": 12.87094972067039, + "grad_norm": 0.5161980986595154, + "learning_rate": 0.00035781512605042014, + "loss": 0.4041, + "step": 23039 + }, + { + "epoch": 12.871508379888269, + "grad_norm": 1.0380103588104248, + "learning_rate": 0.0003577871148459384, + "loss": 0.4158, + "step": 23040 + }, + { + "epoch": 12.872067039106145, + "grad_norm": 0.4340762495994568, + "learning_rate": 0.0003577591036414566, + "loss": 0.3768, + "step": 23041 + }, + { + "epoch": 12.872625698324022, + "grad_norm": 1.3014675378799438, + "learning_rate": 0.00035773109243697476, + "loss": 0.4367, + "step": 23042 + }, + { + "epoch": 12.8731843575419, + "grad_norm": 0.4287246763706207, + "learning_rate": 0.000357703081232493, + "loss": 0.3806, + "step": 23043 + }, + { + "epoch": 12.873743016759777, + "grad_norm": 0.43329617381095886, + "learning_rate": 0.0003576750700280112, + "loss": 0.3864, + "step": 23044 + }, + { + "epoch": 12.874301675977653, + "grad_norm": 0.40220052003860474, + "learning_rate": 0.00035764705882352943, + "loss": 0.3513, + "step": 23045 + }, + { + "epoch": 12.87486033519553, + "grad_norm": 11.074256896972656, + "learning_rate": 0.00035761904761904764, + "loss": 0.3399, + "step": 23046 + }, + { + "epoch": 12.875418994413408, + "grad_norm": 0.5627591609954834, + "learning_rate": 0.0003575910364145658, + "loss": 0.402, + "step": 23047 + }, + { + "epoch": 12.875977653631285, + "grad_norm": 0.3989485204219818, + "learning_rate": 0.00035756302521008405, + "loss": 0.4485, + "step": 23048 + }, + { + "epoch": 12.876536312849161, + "grad_norm": 0.8534471988677979, + "learning_rate": 0.00035753501400560226, + "loss": 0.4582, + "step": 23049 + }, + { + "epoch": 12.87709497206704, + "grad_norm": 1.5509145259857178, + "learning_rate": 0.00035750700280112046, + "loss": 0.3113, + "step": 23050 + }, + { + "epoch": 12.877653631284916, + "grad_norm": 0.5349453687667847, + "learning_rate": 0.00035747899159663867, + "loss": 0.3604, + "step": 23051 + }, + { + "epoch": 12.878212290502793, + "grad_norm": 0.5117002725601196, + "learning_rate": 0.0003574509803921568, + "loss": 0.5092, + "step": 23052 + }, + { + "epoch": 12.878770949720671, + "grad_norm": 2.3256473541259766, + "learning_rate": 0.0003574229691876751, + "loss": 0.4547, + "step": 23053 + }, + { + "epoch": 12.879329608938548, + "grad_norm": 0.5117779970169067, + "learning_rate": 0.0003573949579831933, + "loss": 0.483, + "step": 23054 + }, + { + "epoch": 12.879888268156424, + "grad_norm": 0.48029854893684387, + "learning_rate": 0.0003573669467787115, + "loss": 0.3635, + "step": 23055 + }, + { + "epoch": 12.880446927374301, + "grad_norm": 0.4011361300945282, + "learning_rate": 0.0003573389355742297, + "loss": 0.4259, + "step": 23056 + }, + { + "epoch": 12.88100558659218, + "grad_norm": 0.5677130818367004, + "learning_rate": 0.0003573109243697479, + "loss": 0.5313, + "step": 23057 + }, + { + "epoch": 12.881564245810056, + "grad_norm": 0.48748213052749634, + "learning_rate": 0.0003572829131652661, + "loss": 0.4961, + "step": 23058 + }, + { + "epoch": 12.882122905027932, + "grad_norm": 0.4813239574432373, + "learning_rate": 0.0003572549019607843, + "loss": 0.3753, + "step": 23059 + }, + { + "epoch": 12.88268156424581, + "grad_norm": 0.44387978315353394, + "learning_rate": 0.0003572268907563025, + "loss": 0.4695, + "step": 23060 + }, + { + "epoch": 12.883240223463687, + "grad_norm": 0.43491995334625244, + "learning_rate": 0.00035719887955182073, + "loss": 0.3946, + "step": 23061 + }, + { + "epoch": 12.883798882681564, + "grad_norm": 0.5463988780975342, + "learning_rate": 0.00035717086834733894, + "loss": 0.4322, + "step": 23062 + }, + { + "epoch": 12.88435754189944, + "grad_norm": 0.6533722877502441, + "learning_rate": 0.00035714285714285714, + "loss": 0.3946, + "step": 23063 + }, + { + "epoch": 12.884916201117319, + "grad_norm": 0.43979984521865845, + "learning_rate": 0.00035711484593837535, + "loss": 0.4218, + "step": 23064 + }, + { + "epoch": 12.885474860335195, + "grad_norm": 0.4594962000846863, + "learning_rate": 0.0003570868347338936, + "loss": 0.5339, + "step": 23065 + }, + { + "epoch": 12.886033519553072, + "grad_norm": 0.5154690146446228, + "learning_rate": 0.00035705882352941176, + "loss": 0.3145, + "step": 23066 + }, + { + "epoch": 12.88659217877095, + "grad_norm": 0.5820730328559875, + "learning_rate": 0.00035703081232492997, + "loss": 0.4964, + "step": 23067 + }, + { + "epoch": 12.887150837988827, + "grad_norm": 0.5742661952972412, + "learning_rate": 0.00035700280112044817, + "loss": 0.436, + "step": 23068 + }, + { + "epoch": 12.887709497206703, + "grad_norm": 0.45133382081985474, + "learning_rate": 0.0003569747899159664, + "loss": 0.3889, + "step": 23069 + }, + { + "epoch": 12.888268156424582, + "grad_norm": 2.184577703475952, + "learning_rate": 0.00035694677871148464, + "loss": 0.4346, + "step": 23070 + }, + { + "epoch": 12.888826815642458, + "grad_norm": 0.5054942965507507, + "learning_rate": 0.0003569187675070028, + "loss": 0.3372, + "step": 23071 + }, + { + "epoch": 12.889385474860335, + "grad_norm": 0.5646455883979797, + "learning_rate": 0.000356890756302521, + "loss": 0.4239, + "step": 23072 + }, + { + "epoch": 12.889944134078211, + "grad_norm": 0.47510045766830444, + "learning_rate": 0.00035686274509803926, + "loss": 0.3814, + "step": 23073 + }, + { + "epoch": 12.89050279329609, + "grad_norm": 0.3632448613643646, + "learning_rate": 0.0003568347338935574, + "loss": 0.4174, + "step": 23074 + }, + { + "epoch": 12.891061452513966, + "grad_norm": 0.5906867384910583, + "learning_rate": 0.00035680672268907567, + "loss": 0.5197, + "step": 23075 + }, + { + "epoch": 12.891620111731843, + "grad_norm": 0.4474334120750427, + "learning_rate": 0.0003567787114845938, + "loss": 0.4522, + "step": 23076 + }, + { + "epoch": 12.892178770949721, + "grad_norm": 0.7047088742256165, + "learning_rate": 0.000356750700280112, + "loss": 0.3908, + "step": 23077 + }, + { + "epoch": 12.892737430167598, + "grad_norm": 4.913631916046143, + "learning_rate": 0.0003567226890756303, + "loss": 0.4105, + "step": 23078 + }, + { + "epoch": 12.893296089385474, + "grad_norm": 0.3191641569137573, + "learning_rate": 0.00035669467787114844, + "loss": 0.2841, + "step": 23079 + }, + { + "epoch": 12.893854748603353, + "grad_norm": 0.5091173052787781, + "learning_rate": 0.0003566666666666667, + "loss": 0.5345, + "step": 23080 + }, + { + "epoch": 12.89441340782123, + "grad_norm": 0.9211945533752441, + "learning_rate": 0.0003566386554621849, + "loss": 0.4845, + "step": 23081 + }, + { + "epoch": 12.894972067039106, + "grad_norm": 0.5055422186851501, + "learning_rate": 0.00035661064425770306, + "loss": 0.4182, + "step": 23082 + }, + { + "epoch": 12.895530726256982, + "grad_norm": 0.8235592842102051, + "learning_rate": 0.0003565826330532213, + "loss": 0.4562, + "step": 23083 + }, + { + "epoch": 12.89608938547486, + "grad_norm": 0.4405900835990906, + "learning_rate": 0.00035655462184873947, + "loss": 0.4622, + "step": 23084 + }, + { + "epoch": 12.896648044692737, + "grad_norm": 2.4743685722351074, + "learning_rate": 0.00035652661064425773, + "loss": 0.3816, + "step": 23085 + }, + { + "epoch": 12.897206703910614, + "grad_norm": 0.6617835164070129, + "learning_rate": 0.00035649859943977593, + "loss": 0.4124, + "step": 23086 + }, + { + "epoch": 12.897765363128492, + "grad_norm": 0.36478501558303833, + "learning_rate": 0.0003564705882352941, + "loss": 0.3631, + "step": 23087 + }, + { + "epoch": 12.898324022346369, + "grad_norm": 0.7991409301757812, + "learning_rate": 0.00035644257703081235, + "loss": 0.4525, + "step": 23088 + }, + { + "epoch": 12.898882681564245, + "grad_norm": 0.7258668541908264, + "learning_rate": 0.00035641456582633055, + "loss": 0.4349, + "step": 23089 + }, + { + "epoch": 12.899441340782122, + "grad_norm": 0.33859020471572876, + "learning_rate": 0.00035638655462184876, + "loss": 0.4368, + "step": 23090 + }, + { + "epoch": 12.9, + "grad_norm": 0.3822377920150757, + "learning_rate": 0.00035635854341736696, + "loss": 0.4163, + "step": 23091 + }, + { + "epoch": 12.900558659217877, + "grad_norm": 0.7589262127876282, + "learning_rate": 0.0003563305322128851, + "loss": 0.4327, + "step": 23092 + }, + { + "epoch": 12.901117318435753, + "grad_norm": 0.7192412614822388, + "learning_rate": 0.0003563025210084034, + "loss": 0.3176, + "step": 23093 + }, + { + "epoch": 12.901675977653632, + "grad_norm": 1.1496564149856567, + "learning_rate": 0.0003562745098039216, + "loss": 0.4879, + "step": 23094 + }, + { + "epoch": 12.902234636871508, + "grad_norm": 0.40294763445854187, + "learning_rate": 0.0003562464985994398, + "loss": 0.3654, + "step": 23095 + }, + { + "epoch": 12.902793296089385, + "grad_norm": 0.5239118933677673, + "learning_rate": 0.000356218487394958, + "loss": 0.479, + "step": 23096 + }, + { + "epoch": 12.903351955307263, + "grad_norm": 0.43779295682907104, + "learning_rate": 0.0003561904761904762, + "loss": 0.3349, + "step": 23097 + }, + { + "epoch": 12.90391061452514, + "grad_norm": 0.6125030517578125, + "learning_rate": 0.0003561624649859944, + "loss": 0.3977, + "step": 23098 + }, + { + "epoch": 12.904469273743016, + "grad_norm": 0.39839741587638855, + "learning_rate": 0.0003561344537815126, + "loss": 0.2895, + "step": 23099 + }, + { + "epoch": 12.905027932960895, + "grad_norm": 5.303694248199463, + "learning_rate": 0.0003561064425770308, + "loss": 0.4634, + "step": 23100 + }, + { + "epoch": 12.905586592178771, + "grad_norm": 1.5205161571502686, + "learning_rate": 0.000356078431372549, + "loss": 0.4677, + "step": 23101 + }, + { + "epoch": 12.906145251396648, + "grad_norm": 0.39830920100212097, + "learning_rate": 0.00035605042016806723, + "loss": 0.384, + "step": 23102 + }, + { + "epoch": 12.906703910614524, + "grad_norm": 1.0327866077423096, + "learning_rate": 0.00035602240896358544, + "loss": 0.3333, + "step": 23103 + }, + { + "epoch": 12.907262569832403, + "grad_norm": 0.9274595379829407, + "learning_rate": 0.00035599439775910364, + "loss": 0.5955, + "step": 23104 + }, + { + "epoch": 12.90782122905028, + "grad_norm": 0.45628514885902405, + "learning_rate": 0.0003559663865546219, + "loss": 0.3501, + "step": 23105 + }, + { + "epoch": 12.908379888268156, + "grad_norm": 0.4545165002346039, + "learning_rate": 0.00035593837535014005, + "loss": 0.3669, + "step": 23106 + }, + { + "epoch": 12.908938547486034, + "grad_norm": 3.164350748062134, + "learning_rate": 0.00035591036414565826, + "loss": 0.3939, + "step": 23107 + }, + { + "epoch": 12.90949720670391, + "grad_norm": 0.5116806626319885, + "learning_rate": 0.00035588235294117647, + "loss": 0.4668, + "step": 23108 + }, + { + "epoch": 12.910055865921787, + "grad_norm": 0.3244902491569519, + "learning_rate": 0.00035585434173669467, + "loss": 0.3115, + "step": 23109 + }, + { + "epoch": 12.910614525139664, + "grad_norm": 0.6180371046066284, + "learning_rate": 0.00035582633053221293, + "loss": 0.4601, + "step": 23110 + }, + { + "epoch": 12.911173184357542, + "grad_norm": 0.40666645765304565, + "learning_rate": 0.0003557983193277311, + "loss": 0.35, + "step": 23111 + }, + { + "epoch": 12.911731843575419, + "grad_norm": 0.6671972274780273, + "learning_rate": 0.0003557703081232493, + "loss": 0.3457, + "step": 23112 + }, + { + "epoch": 12.912290502793295, + "grad_norm": 0.4347480237483978, + "learning_rate": 0.00035574229691876755, + "loss": 0.3041, + "step": 23113 + }, + { + "epoch": 12.912849162011174, + "grad_norm": 0.39989516139030457, + "learning_rate": 0.0003557142857142857, + "loss": 0.4196, + "step": 23114 + }, + { + "epoch": 12.91340782122905, + "grad_norm": 0.44127461314201355, + "learning_rate": 0.00035568627450980396, + "loss": 0.3872, + "step": 23115 + }, + { + "epoch": 12.913966480446927, + "grad_norm": 0.5842620134353638, + "learning_rate": 0.0003556582633053221, + "loss": 0.489, + "step": 23116 + }, + { + "epoch": 12.914525139664804, + "grad_norm": 0.9391373991966248, + "learning_rate": 0.0003556302521008403, + "loss": 0.5214, + "step": 23117 + }, + { + "epoch": 12.915083798882682, + "grad_norm": 0.7196061611175537, + "learning_rate": 0.0003556022408963586, + "loss": 0.3244, + "step": 23118 + }, + { + "epoch": 12.915642458100558, + "grad_norm": 0.630444347858429, + "learning_rate": 0.00035557422969187673, + "loss": 0.5156, + "step": 23119 + }, + { + "epoch": 12.916201117318435, + "grad_norm": 0.7112671732902527, + "learning_rate": 0.000355546218487395, + "loss": 0.4456, + "step": 23120 + }, + { + "epoch": 12.916759776536313, + "grad_norm": 0.4061303436756134, + "learning_rate": 0.0003555182072829132, + "loss": 0.3914, + "step": 23121 + }, + { + "epoch": 12.91731843575419, + "grad_norm": 0.6440028548240662, + "learning_rate": 0.00035549019607843135, + "loss": 0.4317, + "step": 23122 + }, + { + "epoch": 12.917877094972066, + "grad_norm": 5.443751335144043, + "learning_rate": 0.0003554621848739496, + "loss": 0.3821, + "step": 23123 + }, + { + "epoch": 12.918435754189945, + "grad_norm": 0.4395512342453003, + "learning_rate": 0.00035543417366946776, + "loss": 0.4109, + "step": 23124 + }, + { + "epoch": 12.918994413407821, + "grad_norm": 0.6705725789070129, + "learning_rate": 0.000355406162464986, + "loss": 0.4778, + "step": 23125 + }, + { + "epoch": 12.919553072625698, + "grad_norm": 0.669812023639679, + "learning_rate": 0.00035537815126050423, + "loss": 0.3954, + "step": 23126 + }, + { + "epoch": 12.920111731843576, + "grad_norm": 6.2944865226745605, + "learning_rate": 0.0003553501400560224, + "loss": 0.5463, + "step": 23127 + }, + { + "epoch": 12.920670391061453, + "grad_norm": 0.34514120221138, + "learning_rate": 0.00035532212885154064, + "loss": 0.3698, + "step": 23128 + }, + { + "epoch": 12.92122905027933, + "grad_norm": 0.4975703954696655, + "learning_rate": 0.00035529411764705885, + "loss": 0.54, + "step": 23129 + }, + { + "epoch": 12.921787709497206, + "grad_norm": 0.5599039196968079, + "learning_rate": 0.00035526610644257705, + "loss": 0.4116, + "step": 23130 + }, + { + "epoch": 12.922346368715084, + "grad_norm": 0.5498924255371094, + "learning_rate": 0.00035523809523809526, + "loss": 0.4792, + "step": 23131 + }, + { + "epoch": 12.922905027932961, + "grad_norm": 0.5034617781639099, + "learning_rate": 0.0003552100840336134, + "loss": 0.4584, + "step": 23132 + }, + { + "epoch": 12.923463687150837, + "grad_norm": 0.6296890377998352, + "learning_rate": 0.00035518207282913167, + "loss": 0.4363, + "step": 23133 + }, + { + "epoch": 12.924022346368716, + "grad_norm": 5.106089115142822, + "learning_rate": 0.0003551540616246499, + "loss": 0.3957, + "step": 23134 + }, + { + "epoch": 12.924581005586592, + "grad_norm": 0.8492132425308228, + "learning_rate": 0.0003551260504201681, + "loss": 0.3406, + "step": 23135 + }, + { + "epoch": 12.925139664804469, + "grad_norm": 1.0980751514434814, + "learning_rate": 0.0003550980392156863, + "loss": 0.401, + "step": 23136 + }, + { + "epoch": 12.925698324022346, + "grad_norm": 0.5778860449790955, + "learning_rate": 0.0003550700280112045, + "loss": 0.4991, + "step": 23137 + }, + { + "epoch": 12.926256983240224, + "grad_norm": 0.5193675756454468, + "learning_rate": 0.0003550420168067227, + "loss": 0.3746, + "step": 23138 + }, + { + "epoch": 12.9268156424581, + "grad_norm": 1.41737699508667, + "learning_rate": 0.0003550140056022409, + "loss": 0.3229, + "step": 23139 + }, + { + "epoch": 12.927374301675977, + "grad_norm": 1.3138048648834229, + "learning_rate": 0.0003549859943977591, + "loss": 0.4481, + "step": 23140 + }, + { + "epoch": 12.927932960893855, + "grad_norm": 0.411640465259552, + "learning_rate": 0.0003549579831932773, + "loss": 0.4498, + "step": 23141 + }, + { + "epoch": 12.928491620111732, + "grad_norm": 0.6565712094306946, + "learning_rate": 0.0003549299719887955, + "loss": 0.5351, + "step": 23142 + }, + { + "epoch": 12.929050279329608, + "grad_norm": 0.5316324830055237, + "learning_rate": 0.00035490196078431373, + "loss": 0.3879, + "step": 23143 + }, + { + "epoch": 12.929608938547487, + "grad_norm": 0.4305028021335602, + "learning_rate": 0.00035487394957983194, + "loss": 0.3697, + "step": 23144 + }, + { + "epoch": 12.930167597765363, + "grad_norm": 0.5711145401000977, + "learning_rate": 0.0003548459383753502, + "loss": 0.3749, + "step": 23145 + }, + { + "epoch": 12.93072625698324, + "grad_norm": 0.6483446955680847, + "learning_rate": 0.00035481792717086835, + "loss": 0.5618, + "step": 23146 + }, + { + "epoch": 12.931284916201117, + "grad_norm": 0.4535178542137146, + "learning_rate": 0.00035478991596638655, + "loss": 0.4038, + "step": 23147 + }, + { + "epoch": 12.931843575418995, + "grad_norm": 0.6860631108283997, + "learning_rate": 0.00035476190476190476, + "loss": 0.5147, + "step": 23148 + }, + { + "epoch": 12.932402234636871, + "grad_norm": 0.5085126757621765, + "learning_rate": 0.00035473389355742297, + "loss": 0.4812, + "step": 23149 + }, + { + "epoch": 12.932960893854748, + "grad_norm": 0.37389346957206726, + "learning_rate": 0.0003547058823529412, + "loss": 0.3888, + "step": 23150 + }, + { + "epoch": 12.933519553072626, + "grad_norm": 0.4842715561389923, + "learning_rate": 0.0003546778711484594, + "loss": 0.5799, + "step": 23151 + }, + { + "epoch": 12.934078212290503, + "grad_norm": 0.3209175169467926, + "learning_rate": 0.0003546498599439776, + "loss": 0.3765, + "step": 23152 + }, + { + "epoch": 12.93463687150838, + "grad_norm": 0.371474951505661, + "learning_rate": 0.00035462184873949584, + "loss": 0.4479, + "step": 23153 + }, + { + "epoch": 12.935195530726258, + "grad_norm": 0.46804267168045044, + "learning_rate": 0.000354593837535014, + "loss": 0.3831, + "step": 23154 + }, + { + "epoch": 12.935754189944134, + "grad_norm": 0.43284279108047485, + "learning_rate": 0.0003545658263305322, + "loss": 0.4451, + "step": 23155 + }, + { + "epoch": 12.936312849162011, + "grad_norm": 0.41407787799835205, + "learning_rate": 0.0003545378151260504, + "loss": 0.3459, + "step": 23156 + }, + { + "epoch": 12.936871508379888, + "grad_norm": 0.39946722984313965, + "learning_rate": 0.0003545098039215686, + "loss": 0.3971, + "step": 23157 + }, + { + "epoch": 12.937430167597766, + "grad_norm": 2.627351760864258, + "learning_rate": 0.0003544817927170869, + "loss": 0.4448, + "step": 23158 + }, + { + "epoch": 12.937988826815642, + "grad_norm": 0.5211644172668457, + "learning_rate": 0.000354453781512605, + "loss": 0.4807, + "step": 23159 + }, + { + "epoch": 12.938547486033519, + "grad_norm": 0.4213368594646454, + "learning_rate": 0.00035442577030812323, + "loss": 0.4305, + "step": 23160 + }, + { + "epoch": 12.939106145251397, + "grad_norm": 0.49920469522476196, + "learning_rate": 0.0003543977591036415, + "loss": 0.4454, + "step": 23161 + }, + { + "epoch": 12.939664804469274, + "grad_norm": 0.6270092129707336, + "learning_rate": 0.00035436974789915964, + "loss": 0.3566, + "step": 23162 + }, + { + "epoch": 12.94022346368715, + "grad_norm": 0.40327441692352295, + "learning_rate": 0.0003543417366946779, + "loss": 0.3618, + "step": 23163 + }, + { + "epoch": 12.940782122905027, + "grad_norm": 0.3988189697265625, + "learning_rate": 0.00035431372549019606, + "loss": 0.4086, + "step": 23164 + }, + { + "epoch": 12.941340782122905, + "grad_norm": 0.41284316778182983, + "learning_rate": 0.00035428571428571426, + "loss": 0.406, + "step": 23165 + }, + { + "epoch": 12.941899441340782, + "grad_norm": 0.5242443680763245, + "learning_rate": 0.0003542577030812325, + "loss": 0.4835, + "step": 23166 + }, + { + "epoch": 12.942458100558659, + "grad_norm": 0.37576621770858765, + "learning_rate": 0.0003542296918767507, + "loss": 0.3588, + "step": 23167 + }, + { + "epoch": 12.943016759776537, + "grad_norm": 0.5130129456520081, + "learning_rate": 0.00035420168067226893, + "loss": 0.3783, + "step": 23168 + }, + { + "epoch": 12.943575418994413, + "grad_norm": 0.5981467962265015, + "learning_rate": 0.00035417366946778714, + "loss": 0.3826, + "step": 23169 + }, + { + "epoch": 12.94413407821229, + "grad_norm": 0.6164994835853577, + "learning_rate": 0.0003541456582633053, + "loss": 0.3734, + "step": 23170 + }, + { + "epoch": 12.944692737430168, + "grad_norm": 5.469063758850098, + "learning_rate": 0.00035411764705882355, + "loss": 0.3224, + "step": 23171 + }, + { + "epoch": 12.945251396648045, + "grad_norm": 0.7147475481033325, + "learning_rate": 0.0003540896358543417, + "loss": 0.4188, + "step": 23172 + }, + { + "epoch": 12.945810055865921, + "grad_norm": 0.38063937425613403, + "learning_rate": 0.00035406162464985996, + "loss": 0.3715, + "step": 23173 + }, + { + "epoch": 12.946368715083798, + "grad_norm": 0.748613178730011, + "learning_rate": 0.00035403361344537817, + "loss": 0.3842, + "step": 23174 + }, + { + "epoch": 12.946927374301676, + "grad_norm": 0.4692458510398865, + "learning_rate": 0.0003540056022408963, + "loss": 0.5025, + "step": 23175 + }, + { + "epoch": 12.947486033519553, + "grad_norm": 0.549880862236023, + "learning_rate": 0.0003539775910364146, + "loss": 0.4091, + "step": 23176 + }, + { + "epoch": 12.94804469273743, + "grad_norm": 0.6346397399902344, + "learning_rate": 0.0003539495798319328, + "loss": 0.4154, + "step": 23177 + }, + { + "epoch": 12.948603351955308, + "grad_norm": 0.5505090951919556, + "learning_rate": 0.000353921568627451, + "loss": 0.396, + "step": 23178 + }, + { + "epoch": 12.949162011173184, + "grad_norm": 0.6531315445899963, + "learning_rate": 0.0003538935574229692, + "loss": 0.6022, + "step": 23179 + }, + { + "epoch": 12.949720670391061, + "grad_norm": 0.48369401693344116, + "learning_rate": 0.00035386554621848735, + "loss": 0.4315, + "step": 23180 + }, + { + "epoch": 12.95027932960894, + "grad_norm": 0.6149060130119324, + "learning_rate": 0.0003538375350140056, + "loss": 0.5696, + "step": 23181 + }, + { + "epoch": 12.950837988826816, + "grad_norm": 0.6020926833152771, + "learning_rate": 0.0003538095238095238, + "loss": 0.4266, + "step": 23182 + }, + { + "epoch": 12.951396648044692, + "grad_norm": 0.8118475675582886, + "learning_rate": 0.000353781512605042, + "loss": 0.4989, + "step": 23183 + }, + { + "epoch": 12.951955307262569, + "grad_norm": 0.5111149549484253, + "learning_rate": 0.00035375350140056023, + "loss": 0.3074, + "step": 23184 + }, + { + "epoch": 12.952513966480447, + "grad_norm": 0.4930194318294525, + "learning_rate": 0.00035372549019607844, + "loss": 0.3905, + "step": 23185 + }, + { + "epoch": 12.953072625698324, + "grad_norm": 0.44921284914016724, + "learning_rate": 0.00035369747899159664, + "loss": 0.3908, + "step": 23186 + }, + { + "epoch": 12.9536312849162, + "grad_norm": 0.4587908983230591, + "learning_rate": 0.00035366946778711485, + "loss": 0.3576, + "step": 23187 + }, + { + "epoch": 12.954189944134079, + "grad_norm": 0.6486285328865051, + "learning_rate": 0.00035364145658263305, + "loss": 0.3579, + "step": 23188 + }, + { + "epoch": 12.954748603351955, + "grad_norm": 1.1239144802093506, + "learning_rate": 0.00035361344537815126, + "loss": 0.3341, + "step": 23189 + }, + { + "epoch": 12.955307262569832, + "grad_norm": 0.537642776966095, + "learning_rate": 0.00035358543417366947, + "loss": 0.4835, + "step": 23190 + }, + { + "epoch": 12.955865921787709, + "grad_norm": 0.7267135977745056, + "learning_rate": 0.00035355742296918767, + "loss": 0.3661, + "step": 23191 + }, + { + "epoch": 12.956424581005587, + "grad_norm": 0.5360385775566101, + "learning_rate": 0.0003535294117647059, + "loss": 0.4747, + "step": 23192 + }, + { + "epoch": 12.956983240223463, + "grad_norm": 1.1423780918121338, + "learning_rate": 0.00035350140056022414, + "loss": 0.4156, + "step": 23193 + }, + { + "epoch": 12.95754189944134, + "grad_norm": 0.48165175318717957, + "learning_rate": 0.0003534733893557423, + "loss": 0.4929, + "step": 23194 + }, + { + "epoch": 12.958100558659218, + "grad_norm": 0.5315850973129272, + "learning_rate": 0.0003534453781512605, + "loss": 0.463, + "step": 23195 + }, + { + "epoch": 12.958659217877095, + "grad_norm": 0.6171473264694214, + "learning_rate": 0.0003534173669467787, + "loss": 0.3475, + "step": 23196 + }, + { + "epoch": 12.959217877094972, + "grad_norm": 0.6063719391822815, + "learning_rate": 0.0003533893557422969, + "loss": 0.4375, + "step": 23197 + }, + { + "epoch": 12.95977653631285, + "grad_norm": 2.2525529861450195, + "learning_rate": 0.00035336134453781517, + "loss": 0.4534, + "step": 23198 + }, + { + "epoch": 12.960335195530726, + "grad_norm": 0.9050586223602295, + "learning_rate": 0.0003533333333333333, + "loss": 0.5288, + "step": 23199 + }, + { + "epoch": 12.960893854748603, + "grad_norm": 5.178910732269287, + "learning_rate": 0.0003533053221288515, + "loss": 0.4181, + "step": 23200 + }, + { + "epoch": 12.961452513966481, + "grad_norm": 0.6808676719665527, + "learning_rate": 0.0003532773109243698, + "loss": 0.4808, + "step": 23201 + }, + { + "epoch": 12.962011173184358, + "grad_norm": 0.49515166878700256, + "learning_rate": 0.00035324929971988794, + "loss": 0.458, + "step": 23202 + }, + { + "epoch": 12.962569832402234, + "grad_norm": 3.1959047317504883, + "learning_rate": 0.0003532212885154062, + "loss": 0.412, + "step": 23203 + }, + { + "epoch": 12.963128491620111, + "grad_norm": 0.556829571723938, + "learning_rate": 0.00035319327731092435, + "loss": 0.3096, + "step": 23204 + }, + { + "epoch": 12.96368715083799, + "grad_norm": 3.763230562210083, + "learning_rate": 0.00035316526610644256, + "loss": 0.3793, + "step": 23205 + }, + { + "epoch": 12.964245810055866, + "grad_norm": 0.4567667245864868, + "learning_rate": 0.0003531372549019608, + "loss": 0.424, + "step": 23206 + }, + { + "epoch": 12.964804469273743, + "grad_norm": 0.5227357149124146, + "learning_rate": 0.00035310924369747897, + "loss": 0.4248, + "step": 23207 + }, + { + "epoch": 12.96536312849162, + "grad_norm": 0.6566517353057861, + "learning_rate": 0.00035308123249299723, + "loss": 0.6314, + "step": 23208 + }, + { + "epoch": 12.965921787709497, + "grad_norm": 0.4390355050563812, + "learning_rate": 0.00035305322128851543, + "loss": 0.4633, + "step": 23209 + }, + { + "epoch": 12.966480446927374, + "grad_norm": 0.4011116623878479, + "learning_rate": 0.0003530252100840336, + "loss": 0.3925, + "step": 23210 + }, + { + "epoch": 12.96703910614525, + "grad_norm": 0.36842766404151917, + "learning_rate": 0.00035299719887955185, + "loss": 0.4092, + "step": 23211 + }, + { + "epoch": 12.967597765363129, + "grad_norm": 0.47220832109451294, + "learning_rate": 0.00035296918767507, + "loss": 0.3117, + "step": 23212 + }, + { + "epoch": 12.968156424581005, + "grad_norm": 3.219080924987793, + "learning_rate": 0.00035294117647058826, + "loss": 0.7143, + "step": 23213 + }, + { + "epoch": 12.968715083798882, + "grad_norm": 1.0819718837738037, + "learning_rate": 0.00035291316526610646, + "loss": 0.3358, + "step": 23214 + }, + { + "epoch": 12.96927374301676, + "grad_norm": 0.6638665795326233, + "learning_rate": 0.0003528851540616246, + "loss": 0.4422, + "step": 23215 + }, + { + "epoch": 12.969832402234637, + "grad_norm": 0.3828807771205902, + "learning_rate": 0.0003528571428571429, + "loss": 0.3964, + "step": 23216 + }, + { + "epoch": 12.970391061452514, + "grad_norm": 0.5035955309867859, + "learning_rate": 0.0003528291316526611, + "loss": 0.4449, + "step": 23217 + }, + { + "epoch": 12.970949720670392, + "grad_norm": 0.5694970488548279, + "learning_rate": 0.0003528011204481793, + "loss": 0.4365, + "step": 23218 + }, + { + "epoch": 12.971508379888268, + "grad_norm": 2.992596387863159, + "learning_rate": 0.0003527731092436975, + "loss": 0.3939, + "step": 23219 + }, + { + "epoch": 12.972067039106145, + "grad_norm": 0.407412052154541, + "learning_rate": 0.00035274509803921565, + "loss": 0.3656, + "step": 23220 + }, + { + "epoch": 12.972625698324022, + "grad_norm": 1.1889253854751587, + "learning_rate": 0.0003527170868347339, + "loss": 0.367, + "step": 23221 + }, + { + "epoch": 12.9731843575419, + "grad_norm": 0.5009036660194397, + "learning_rate": 0.0003526890756302521, + "loss": 0.3725, + "step": 23222 + }, + { + "epoch": 12.973743016759776, + "grad_norm": 0.4660651385784149, + "learning_rate": 0.0003526610644257703, + "loss": 0.3455, + "step": 23223 + }, + { + "epoch": 12.974301675977653, + "grad_norm": 0.540440559387207, + "learning_rate": 0.0003526330532212885, + "loss": 0.456, + "step": 23224 + }, + { + "epoch": 12.974860335195531, + "grad_norm": 0.46635663509368896, + "learning_rate": 0.00035260504201680673, + "loss": 0.3262, + "step": 23225 + }, + { + "epoch": 12.975418994413408, + "grad_norm": 0.5682308673858643, + "learning_rate": 0.00035257703081232494, + "loss": 0.3867, + "step": 23226 + }, + { + "epoch": 12.975977653631285, + "grad_norm": 3.2422878742218018, + "learning_rate": 0.00035254901960784314, + "loss": 0.4573, + "step": 23227 + }, + { + "epoch": 12.976536312849163, + "grad_norm": 0.4941241443157196, + "learning_rate": 0.0003525210084033614, + "loss": 0.3987, + "step": 23228 + }, + { + "epoch": 12.97709497206704, + "grad_norm": 0.7356036305427551, + "learning_rate": 0.00035249299719887955, + "loss": 0.4712, + "step": 23229 + }, + { + "epoch": 12.977653631284916, + "grad_norm": 0.45608341693878174, + "learning_rate": 0.00035246498599439776, + "loss": 0.4382, + "step": 23230 + }, + { + "epoch": 12.978212290502793, + "grad_norm": 0.449614018201828, + "learning_rate": 0.00035243697478991597, + "loss": 0.4004, + "step": 23231 + }, + { + "epoch": 12.978770949720671, + "grad_norm": 0.4191884994506836, + "learning_rate": 0.00035240896358543417, + "loss": 0.3354, + "step": 23232 + }, + { + "epoch": 12.979329608938547, + "grad_norm": 0.4265230596065521, + "learning_rate": 0.00035238095238095243, + "loss": 0.4674, + "step": 23233 + }, + { + "epoch": 12.979888268156424, + "grad_norm": 0.4582882225513458, + "learning_rate": 0.0003523529411764706, + "loss": 0.4423, + "step": 23234 + }, + { + "epoch": 12.980446927374302, + "grad_norm": 1.04096257686615, + "learning_rate": 0.0003523249299719888, + "loss": 0.3586, + "step": 23235 + }, + { + "epoch": 12.981005586592179, + "grad_norm": 0.38066405057907104, + "learning_rate": 0.00035229691876750705, + "loss": 0.4026, + "step": 23236 + }, + { + "epoch": 12.981564245810056, + "grad_norm": 0.6009653806686401, + "learning_rate": 0.0003522689075630252, + "loss": 0.5469, + "step": 23237 + }, + { + "epoch": 12.982122905027932, + "grad_norm": 0.38071930408477783, + "learning_rate": 0.00035224089635854346, + "loss": 0.4713, + "step": 23238 + }, + { + "epoch": 12.98268156424581, + "grad_norm": 0.620356559753418, + "learning_rate": 0.0003522128851540616, + "loss": 0.4746, + "step": 23239 + }, + { + "epoch": 12.983240223463687, + "grad_norm": 0.4416988492012024, + "learning_rate": 0.0003521848739495798, + "loss": 0.3741, + "step": 23240 + }, + { + "epoch": 12.983798882681564, + "grad_norm": 0.5249208807945251, + "learning_rate": 0.0003521568627450981, + "loss": 0.4015, + "step": 23241 + }, + { + "epoch": 12.984357541899442, + "grad_norm": 0.4516470730304718, + "learning_rate": 0.00035212885154061623, + "loss": 0.4788, + "step": 23242 + }, + { + "epoch": 12.984916201117318, + "grad_norm": 0.6191099882125854, + "learning_rate": 0.0003521008403361345, + "loss": 0.4432, + "step": 23243 + }, + { + "epoch": 12.985474860335195, + "grad_norm": 0.6576861143112183, + "learning_rate": 0.0003520728291316527, + "loss": 0.6266, + "step": 23244 + }, + { + "epoch": 12.986033519553073, + "grad_norm": 10.930916786193848, + "learning_rate": 0.00035204481792717085, + "loss": 0.3527, + "step": 23245 + }, + { + "epoch": 12.98659217877095, + "grad_norm": 0.44888150691986084, + "learning_rate": 0.0003520168067226891, + "loss": 0.4743, + "step": 23246 + }, + { + "epoch": 12.987150837988827, + "grad_norm": 0.3685510456562042, + "learning_rate": 0.00035198879551820726, + "loss": 0.3969, + "step": 23247 + }, + { + "epoch": 12.987709497206703, + "grad_norm": 0.6509757041931152, + "learning_rate": 0.0003519607843137255, + "loss": 0.4456, + "step": 23248 + }, + { + "epoch": 12.988268156424581, + "grad_norm": 2.5802578926086426, + "learning_rate": 0.00035193277310924373, + "loss": 0.4512, + "step": 23249 + }, + { + "epoch": 12.988826815642458, + "grad_norm": 0.4414239823818207, + "learning_rate": 0.0003519047619047619, + "loss": 0.4157, + "step": 23250 + }, + { + "epoch": 12.989385474860335, + "grad_norm": 0.5902441740036011, + "learning_rate": 0.00035187675070028014, + "loss": 0.5264, + "step": 23251 + }, + { + "epoch": 12.989944134078213, + "grad_norm": 0.6017928123474121, + "learning_rate": 0.00035184873949579835, + "loss": 0.3992, + "step": 23252 + }, + { + "epoch": 12.99050279329609, + "grad_norm": 0.4962887465953827, + "learning_rate": 0.00035182072829131655, + "loss": 0.4205, + "step": 23253 + }, + { + "epoch": 12.991061452513966, + "grad_norm": 1.87726628780365, + "learning_rate": 0.00035179271708683476, + "loss": 0.4861, + "step": 23254 + }, + { + "epoch": 12.991620111731844, + "grad_norm": 0.412777304649353, + "learning_rate": 0.0003517647058823529, + "loss": 0.3152, + "step": 23255 + }, + { + "epoch": 12.992178770949721, + "grad_norm": 0.5835171341896057, + "learning_rate": 0.00035173669467787117, + "loss": 0.555, + "step": 23256 + }, + { + "epoch": 12.992737430167598, + "grad_norm": 0.44555264711380005, + "learning_rate": 0.0003517086834733894, + "loss": 0.4496, + "step": 23257 + }, + { + "epoch": 12.993296089385474, + "grad_norm": 0.6477193236351013, + "learning_rate": 0.0003516806722689076, + "loss": 0.3407, + "step": 23258 + }, + { + "epoch": 12.993854748603352, + "grad_norm": 0.29367247223854065, + "learning_rate": 0.0003516526610644258, + "loss": 0.2749, + "step": 23259 + }, + { + "epoch": 12.994413407821229, + "grad_norm": 0.40685129165649414, + "learning_rate": 0.000351624649859944, + "loss": 0.3807, + "step": 23260 + }, + { + "epoch": 12.994972067039106, + "grad_norm": 1.059714436531067, + "learning_rate": 0.0003515966386554622, + "loss": 0.4873, + "step": 23261 + }, + { + "epoch": 12.995530726256984, + "grad_norm": 0.6272127628326416, + "learning_rate": 0.0003515686274509804, + "loss": 0.3504, + "step": 23262 + }, + { + "epoch": 12.99608938547486, + "grad_norm": 0.4392222464084625, + "learning_rate": 0.00035154061624649856, + "loss": 0.4357, + "step": 23263 + }, + { + "epoch": 12.996648044692737, + "grad_norm": 0.41262519359588623, + "learning_rate": 0.0003515126050420168, + "loss": 0.4543, + "step": 23264 + }, + { + "epoch": 12.997206703910614, + "grad_norm": 0.43222057819366455, + "learning_rate": 0.000351484593837535, + "loss": 0.4734, + "step": 23265 + }, + { + "epoch": 12.997765363128492, + "grad_norm": 0.5488288402557373, + "learning_rate": 0.00035145658263305323, + "loss": 0.3411, + "step": 23266 + }, + { + "epoch": 12.998324022346369, + "grad_norm": 0.6002556681632996, + "learning_rate": 0.00035142857142857144, + "loss": 0.3443, + "step": 23267 + }, + { + "epoch": 12.998882681564245, + "grad_norm": 1.0528843402862549, + "learning_rate": 0.00035140056022408964, + "loss": 0.4884, + "step": 23268 + }, + { + "epoch": 12.999441340782123, + "grad_norm": 2.401143789291382, + "learning_rate": 0.00035137254901960785, + "loss": 0.5111, + "step": 23269 + }, + { + "epoch": 13.0, + "grad_norm": 0.46162986755371094, + "learning_rate": 0.00035134453781512605, + "loss": 0.591, + "step": 23270 + }, + { + "epoch": 13.000558659217877, + "grad_norm": 0.47466105222702026, + "learning_rate": 0.00035131652661064426, + "loss": 0.3738, + "step": 23271 + }, + { + "epoch": 13.001117318435755, + "grad_norm": 0.35235080122947693, + "learning_rate": 0.00035128851540616247, + "loss": 0.3253, + "step": 23272 + }, + { + "epoch": 13.001675977653631, + "grad_norm": 0.9307236075401306, + "learning_rate": 0.00035126050420168067, + "loss": 0.5879, + "step": 23273 + }, + { + "epoch": 13.002234636871508, + "grad_norm": 0.590955913066864, + "learning_rate": 0.0003512324929971989, + "loss": 0.424, + "step": 23274 + }, + { + "epoch": 13.002793296089385, + "grad_norm": 0.47806814312934875, + "learning_rate": 0.0003512044817927171, + "loss": 0.4274, + "step": 23275 + }, + { + "epoch": 13.003351955307263, + "grad_norm": 0.35463306307792664, + "learning_rate": 0.00035117647058823534, + "loss": 0.3433, + "step": 23276 + }, + { + "epoch": 13.00391061452514, + "grad_norm": 1.6759039163589478, + "learning_rate": 0.0003511484593837535, + "loss": 0.3917, + "step": 23277 + }, + { + "epoch": 13.004469273743016, + "grad_norm": 1.6848140954971313, + "learning_rate": 0.0003511204481792717, + "loss": 0.4547, + "step": 23278 + }, + { + "epoch": 13.005027932960894, + "grad_norm": 0.5859642624855042, + "learning_rate": 0.0003510924369747899, + "loss": 0.3993, + "step": 23279 + }, + { + "epoch": 13.005586592178771, + "grad_norm": 0.40881794691085815, + "learning_rate": 0.0003510644257703081, + "loss": 0.4588, + "step": 23280 + }, + { + "epoch": 13.006145251396648, + "grad_norm": 0.548321008682251, + "learning_rate": 0.0003510364145658264, + "loss": 0.35, + "step": 23281 + }, + { + "epoch": 13.006703910614526, + "grad_norm": 0.6069298386573792, + "learning_rate": 0.0003510084033613445, + "loss": 0.3775, + "step": 23282 + }, + { + "epoch": 13.007262569832402, + "grad_norm": 4.028810977935791, + "learning_rate": 0.00035098039215686273, + "loss": 0.3571, + "step": 23283 + }, + { + "epoch": 13.007821229050279, + "grad_norm": 0.42840051651000977, + "learning_rate": 0.000350952380952381, + "loss": 0.3795, + "step": 23284 + }, + { + "epoch": 13.008379888268156, + "grad_norm": 9.171091079711914, + "learning_rate": 0.00035092436974789914, + "loss": 0.452, + "step": 23285 + }, + { + "epoch": 13.008938547486034, + "grad_norm": 0.3818887770175934, + "learning_rate": 0.0003508963585434174, + "loss": 0.3893, + "step": 23286 + }, + { + "epoch": 13.00949720670391, + "grad_norm": 0.4687611162662506, + "learning_rate": 0.00035086834733893556, + "loss": 0.4417, + "step": 23287 + }, + { + "epoch": 13.010055865921787, + "grad_norm": 0.41950637102127075, + "learning_rate": 0.00035084033613445376, + "loss": 0.4926, + "step": 23288 + }, + { + "epoch": 13.010614525139665, + "grad_norm": 0.44910523295402527, + "learning_rate": 0.000350812324929972, + "loss": 0.3968, + "step": 23289 + }, + { + "epoch": 13.011173184357542, + "grad_norm": 2.7767529487609863, + "learning_rate": 0.0003507843137254902, + "loss": 0.6029, + "step": 23290 + }, + { + "epoch": 13.011731843575419, + "grad_norm": 1.1533838510513306, + "learning_rate": 0.00035075630252100843, + "loss": 0.409, + "step": 23291 + }, + { + "epoch": 13.012290502793297, + "grad_norm": 0.5757137537002563, + "learning_rate": 0.00035072829131652664, + "loss": 0.5104, + "step": 23292 + }, + { + "epoch": 13.012849162011173, + "grad_norm": 0.4171714782714844, + "learning_rate": 0.0003507002801120448, + "loss": 0.4045, + "step": 23293 + }, + { + "epoch": 13.01340782122905, + "grad_norm": 0.49656033515930176, + "learning_rate": 0.00035067226890756305, + "loss": 0.4049, + "step": 23294 + }, + { + "epoch": 13.013966480446927, + "grad_norm": 0.5441023111343384, + "learning_rate": 0.0003506442577030812, + "loss": 0.4317, + "step": 23295 + }, + { + "epoch": 13.014525139664805, + "grad_norm": 0.6322762370109558, + "learning_rate": 0.00035061624649859946, + "loss": 0.4018, + "step": 23296 + }, + { + "epoch": 13.015083798882682, + "grad_norm": 0.4555903673171997, + "learning_rate": 0.00035058823529411767, + "loss": 0.405, + "step": 23297 + }, + { + "epoch": 13.015642458100558, + "grad_norm": 3.7376105785369873, + "learning_rate": 0.0003505602240896358, + "loss": 0.3974, + "step": 23298 + }, + { + "epoch": 13.016201117318436, + "grad_norm": 0.4773577153682709, + "learning_rate": 0.0003505322128851541, + "loss": 0.4286, + "step": 23299 + }, + { + "epoch": 13.016759776536313, + "grad_norm": 0.5603949427604675, + "learning_rate": 0.0003505042016806723, + "loss": 0.4018, + "step": 23300 + }, + { + "epoch": 13.01731843575419, + "grad_norm": 0.38223135471343994, + "learning_rate": 0.0003504761904761905, + "loss": 0.3693, + "step": 23301 + }, + { + "epoch": 13.017877094972068, + "grad_norm": 0.5161375999450684, + "learning_rate": 0.0003504481792717087, + "loss": 0.4125, + "step": 23302 + }, + { + "epoch": 13.018435754189944, + "grad_norm": 0.6084408760070801, + "learning_rate": 0.00035042016806722685, + "loss": 0.4995, + "step": 23303 + }, + { + "epoch": 13.018994413407821, + "grad_norm": 0.6102217435836792, + "learning_rate": 0.0003503921568627451, + "loss": 0.4814, + "step": 23304 + }, + { + "epoch": 13.019553072625698, + "grad_norm": 0.49996086955070496, + "learning_rate": 0.0003503641456582633, + "loss": 0.4424, + "step": 23305 + }, + { + "epoch": 13.020111731843576, + "grad_norm": 0.44509491324424744, + "learning_rate": 0.0003503361344537815, + "loss": 0.3695, + "step": 23306 + }, + { + "epoch": 13.020670391061453, + "grad_norm": 0.5539101362228394, + "learning_rate": 0.00035030812324929973, + "loss": 0.3544, + "step": 23307 + }, + { + "epoch": 13.021229050279329, + "grad_norm": 0.49441125988960266, + "learning_rate": 0.00035028011204481794, + "loss": 0.4398, + "step": 23308 + }, + { + "epoch": 13.021787709497207, + "grad_norm": 0.35702961683273315, + "learning_rate": 0.00035025210084033614, + "loss": 0.4556, + "step": 23309 + }, + { + "epoch": 13.022346368715084, + "grad_norm": 0.5351783037185669, + "learning_rate": 0.00035022408963585435, + "loss": 0.4357, + "step": 23310 + }, + { + "epoch": 13.02290502793296, + "grad_norm": 0.5267682671546936, + "learning_rate": 0.00035019607843137255, + "loss": 0.3789, + "step": 23311 + }, + { + "epoch": 13.023463687150837, + "grad_norm": 0.609285831451416, + "learning_rate": 0.00035016806722689076, + "loss": 0.4698, + "step": 23312 + }, + { + "epoch": 13.024022346368715, + "grad_norm": 0.47037866711616516, + "learning_rate": 0.00035014005602240897, + "loss": 0.3499, + "step": 23313 + }, + { + "epoch": 13.024581005586592, + "grad_norm": 0.7390353083610535, + "learning_rate": 0.00035011204481792717, + "loss": 0.413, + "step": 23314 + }, + { + "epoch": 13.025139664804469, + "grad_norm": 2.7140743732452393, + "learning_rate": 0.0003500840336134454, + "loss": 0.3623, + "step": 23315 + }, + { + "epoch": 13.025698324022347, + "grad_norm": 2.934598207473755, + "learning_rate": 0.00035005602240896364, + "loss": 0.5114, + "step": 23316 + }, + { + "epoch": 13.026256983240224, + "grad_norm": 2.202272653579712, + "learning_rate": 0.0003500280112044818, + "loss": 0.3985, + "step": 23317 + }, + { + "epoch": 13.0268156424581, + "grad_norm": 0.31507304310798645, + "learning_rate": 0.00035, + "loss": 0.3448, + "step": 23318 + }, + { + "epoch": 13.027374301675978, + "grad_norm": 0.53544020652771, + "learning_rate": 0.0003499719887955182, + "loss": 0.3566, + "step": 23319 + }, + { + "epoch": 13.027932960893855, + "grad_norm": 0.6698783040046692, + "learning_rate": 0.0003499439775910364, + "loss": 0.4161, + "step": 23320 + }, + { + "epoch": 13.028491620111732, + "grad_norm": 2.000349760055542, + "learning_rate": 0.00034991596638655467, + "loss": 0.4796, + "step": 23321 + }, + { + "epoch": 13.029050279329608, + "grad_norm": 0.45821481943130493, + "learning_rate": 0.0003498879551820728, + "loss": 0.4374, + "step": 23322 + }, + { + "epoch": 13.029608938547486, + "grad_norm": 0.44443824887275696, + "learning_rate": 0.000349859943977591, + "loss": 0.3859, + "step": 23323 + }, + { + "epoch": 13.030167597765363, + "grad_norm": 11.175480842590332, + "learning_rate": 0.0003498319327731093, + "loss": 0.3849, + "step": 23324 + }, + { + "epoch": 13.03072625698324, + "grad_norm": 0.8641663789749146, + "learning_rate": 0.00034980392156862744, + "loss": 0.5376, + "step": 23325 + }, + { + "epoch": 13.031284916201118, + "grad_norm": 0.8429691791534424, + "learning_rate": 0.0003497759103641457, + "loss": 0.3326, + "step": 23326 + }, + { + "epoch": 13.031843575418995, + "grad_norm": 0.4473841190338135, + "learning_rate": 0.00034974789915966385, + "loss": 0.4164, + "step": 23327 + }, + { + "epoch": 13.032402234636871, + "grad_norm": 0.45497047901153564, + "learning_rate": 0.00034971988795518206, + "loss": 0.3431, + "step": 23328 + }, + { + "epoch": 13.03296089385475, + "grad_norm": 0.4669978618621826, + "learning_rate": 0.0003496918767507003, + "loss": 0.365, + "step": 23329 + }, + { + "epoch": 13.033519553072626, + "grad_norm": 0.40230894088745117, + "learning_rate": 0.00034966386554621847, + "loss": 0.3178, + "step": 23330 + }, + { + "epoch": 13.034078212290503, + "grad_norm": 0.5160427689552307, + "learning_rate": 0.00034963585434173673, + "loss": 0.4612, + "step": 23331 + }, + { + "epoch": 13.03463687150838, + "grad_norm": 0.754610002040863, + "learning_rate": 0.00034960784313725493, + "loss": 0.5468, + "step": 23332 + }, + { + "epoch": 13.035195530726257, + "grad_norm": 0.3806002140045166, + "learning_rate": 0.0003495798319327731, + "loss": 0.4028, + "step": 23333 + }, + { + "epoch": 13.035754189944134, + "grad_norm": 0.3582924008369446, + "learning_rate": 0.00034955182072829135, + "loss": 0.3286, + "step": 23334 + }, + { + "epoch": 13.03631284916201, + "grad_norm": 0.5125653147697449, + "learning_rate": 0.0003495238095238095, + "loss": 0.4791, + "step": 23335 + }, + { + "epoch": 13.036871508379889, + "grad_norm": 0.47860410809516907, + "learning_rate": 0.00034949579831932776, + "loss": 0.4265, + "step": 23336 + }, + { + "epoch": 13.037430167597766, + "grad_norm": 0.38151678442955017, + "learning_rate": 0.00034946778711484596, + "loss": 0.3546, + "step": 23337 + }, + { + "epoch": 13.037988826815642, + "grad_norm": 0.37241029739379883, + "learning_rate": 0.0003494397759103641, + "loss": 0.4061, + "step": 23338 + }, + { + "epoch": 13.03854748603352, + "grad_norm": 0.47382745146751404, + "learning_rate": 0.0003494117647058824, + "loss": 0.4034, + "step": 23339 + }, + { + "epoch": 13.039106145251397, + "grad_norm": 0.5245199799537659, + "learning_rate": 0.0003493837535014006, + "loss": 0.4215, + "step": 23340 + }, + { + "epoch": 13.039664804469274, + "grad_norm": 0.40086638927459717, + "learning_rate": 0.0003493557422969188, + "loss": 0.5141, + "step": 23341 + }, + { + "epoch": 13.04022346368715, + "grad_norm": 0.4952842891216278, + "learning_rate": 0.000349327731092437, + "loss": 0.3414, + "step": 23342 + }, + { + "epoch": 13.040782122905028, + "grad_norm": 0.6562835574150085, + "learning_rate": 0.00034929971988795515, + "loss": 0.4555, + "step": 23343 + }, + { + "epoch": 13.041340782122905, + "grad_norm": 0.5985842943191528, + "learning_rate": 0.0003492717086834734, + "loss": 0.4716, + "step": 23344 + }, + { + "epoch": 13.041899441340782, + "grad_norm": 0.4215427339076996, + "learning_rate": 0.0003492436974789916, + "loss": 0.522, + "step": 23345 + }, + { + "epoch": 13.04245810055866, + "grad_norm": 0.7224948406219482, + "learning_rate": 0.0003492156862745098, + "loss": 0.5043, + "step": 23346 + }, + { + "epoch": 13.043016759776537, + "grad_norm": 0.7831788659095764, + "learning_rate": 0.000349187675070028, + "loss": 0.3888, + "step": 23347 + }, + { + "epoch": 13.043575418994413, + "grad_norm": 0.4009150266647339, + "learning_rate": 0.00034915966386554623, + "loss": 0.3701, + "step": 23348 + }, + { + "epoch": 13.04413407821229, + "grad_norm": 0.4335838854312897, + "learning_rate": 0.00034913165266106444, + "loss": 0.4256, + "step": 23349 + }, + { + "epoch": 13.044692737430168, + "grad_norm": 0.5390592813491821, + "learning_rate": 0.00034910364145658264, + "loss": 0.582, + "step": 23350 + }, + { + "epoch": 13.045251396648045, + "grad_norm": 0.48130103945732117, + "learning_rate": 0.00034907563025210085, + "loss": 0.3127, + "step": 23351 + }, + { + "epoch": 13.045810055865921, + "grad_norm": 0.518240213394165, + "learning_rate": 0.00034904761904761905, + "loss": 0.4259, + "step": 23352 + }, + { + "epoch": 13.0463687150838, + "grad_norm": 0.39265748858451843, + "learning_rate": 0.00034901960784313726, + "loss": 0.3201, + "step": 23353 + }, + { + "epoch": 13.046927374301676, + "grad_norm": 0.6923423409461975, + "learning_rate": 0.00034899159663865547, + "loss": 0.4541, + "step": 23354 + }, + { + "epoch": 13.047486033519553, + "grad_norm": 0.4615306258201599, + "learning_rate": 0.00034896358543417367, + "loss": 0.4092, + "step": 23355 + }, + { + "epoch": 13.048044692737431, + "grad_norm": 0.4727894961833954, + "learning_rate": 0.00034893557422969193, + "loss": 0.5077, + "step": 23356 + }, + { + "epoch": 13.048603351955308, + "grad_norm": 0.554659903049469, + "learning_rate": 0.0003489075630252101, + "loss": 0.4187, + "step": 23357 + }, + { + "epoch": 13.049162011173184, + "grad_norm": 0.9945338368415833, + "learning_rate": 0.0003488795518207283, + "loss": 0.3427, + "step": 23358 + }, + { + "epoch": 13.04972067039106, + "grad_norm": 0.5358831286430359, + "learning_rate": 0.0003488515406162465, + "loss": 0.4445, + "step": 23359 + }, + { + "epoch": 13.050279329608939, + "grad_norm": 0.527206540107727, + "learning_rate": 0.0003488235294117647, + "loss": 0.4761, + "step": 23360 + }, + { + "epoch": 13.050837988826816, + "grad_norm": 0.5932209491729736, + "learning_rate": 0.00034879551820728296, + "loss": 0.3437, + "step": 23361 + }, + { + "epoch": 13.051396648044692, + "grad_norm": 0.5323039293289185, + "learning_rate": 0.0003487675070028011, + "loss": 0.3247, + "step": 23362 + }, + { + "epoch": 13.05195530726257, + "grad_norm": 2.791893243789673, + "learning_rate": 0.0003487394957983193, + "loss": 0.4139, + "step": 23363 + }, + { + "epoch": 13.052513966480447, + "grad_norm": 1.1677329540252686, + "learning_rate": 0.0003487114845938376, + "loss": 0.4465, + "step": 23364 + }, + { + "epoch": 13.053072625698324, + "grad_norm": 0.35761281847953796, + "learning_rate": 0.00034868347338935573, + "loss": 0.3995, + "step": 23365 + }, + { + "epoch": 13.053631284916202, + "grad_norm": 4.081105709075928, + "learning_rate": 0.000348655462184874, + "loss": 0.3641, + "step": 23366 + }, + { + "epoch": 13.054189944134079, + "grad_norm": 0.6329918503761292, + "learning_rate": 0.00034862745098039214, + "loss": 0.3823, + "step": 23367 + }, + { + "epoch": 13.054748603351955, + "grad_norm": 0.5143391489982605, + "learning_rate": 0.00034859943977591035, + "loss": 0.3968, + "step": 23368 + }, + { + "epoch": 13.055307262569832, + "grad_norm": 0.3964098393917084, + "learning_rate": 0.0003485714285714286, + "loss": 0.3502, + "step": 23369 + }, + { + "epoch": 13.05586592178771, + "grad_norm": 0.49288180470466614, + "learning_rate": 0.00034854341736694676, + "loss": 0.442, + "step": 23370 + }, + { + "epoch": 13.056424581005587, + "grad_norm": 0.4393397569656372, + "learning_rate": 0.000348515406162465, + "loss": 0.305, + "step": 23371 + }, + { + "epoch": 13.056983240223463, + "grad_norm": 0.44056546688079834, + "learning_rate": 0.00034848739495798323, + "loss": 0.428, + "step": 23372 + }, + { + "epoch": 13.057541899441341, + "grad_norm": 0.5214044451713562, + "learning_rate": 0.0003484593837535014, + "loss": 0.4495, + "step": 23373 + }, + { + "epoch": 13.058100558659218, + "grad_norm": 0.5635223388671875, + "learning_rate": 0.00034843137254901964, + "loss": 0.504, + "step": 23374 + }, + { + "epoch": 13.058659217877095, + "grad_norm": 0.37817564606666565, + "learning_rate": 0.0003484033613445378, + "loss": 0.3689, + "step": 23375 + }, + { + "epoch": 13.059217877094973, + "grad_norm": 0.35967835783958435, + "learning_rate": 0.000348375350140056, + "loss": 0.372, + "step": 23376 + }, + { + "epoch": 13.05977653631285, + "grad_norm": 5.484134197235107, + "learning_rate": 0.00034834733893557426, + "loss": 0.4478, + "step": 23377 + }, + { + "epoch": 13.060335195530726, + "grad_norm": 0.40357133746147156, + "learning_rate": 0.0003483193277310924, + "loss": 0.3885, + "step": 23378 + }, + { + "epoch": 13.060893854748603, + "grad_norm": 0.401522696018219, + "learning_rate": 0.00034829131652661067, + "loss": 0.4024, + "step": 23379 + }, + { + "epoch": 13.061452513966481, + "grad_norm": 1.2607015371322632, + "learning_rate": 0.0003482633053221289, + "loss": 0.2955, + "step": 23380 + }, + { + "epoch": 13.062011173184358, + "grad_norm": 0.5900859832763672, + "learning_rate": 0.00034823529411764703, + "loss": 0.4783, + "step": 23381 + }, + { + "epoch": 13.062569832402234, + "grad_norm": 0.6093268990516663, + "learning_rate": 0.0003482072829131653, + "loss": 0.3935, + "step": 23382 + }, + { + "epoch": 13.063128491620112, + "grad_norm": 0.5110654234886169, + "learning_rate": 0.00034817927170868344, + "loss": 0.4844, + "step": 23383 + }, + { + "epoch": 13.063687150837989, + "grad_norm": 0.6068913340568542, + "learning_rate": 0.0003481512605042017, + "loss": 0.4082, + "step": 23384 + }, + { + "epoch": 13.064245810055866, + "grad_norm": 0.6574090719223022, + "learning_rate": 0.0003481232492997199, + "loss": 0.535, + "step": 23385 + }, + { + "epoch": 13.064804469273742, + "grad_norm": 0.5582006573677063, + "learning_rate": 0.00034809523809523806, + "loss": 0.44, + "step": 23386 + }, + { + "epoch": 13.06536312849162, + "grad_norm": 1.0597667694091797, + "learning_rate": 0.0003480672268907563, + "loss": 0.431, + "step": 23387 + }, + { + "epoch": 13.065921787709497, + "grad_norm": 0.6355597972869873, + "learning_rate": 0.0003480392156862745, + "loss": 0.4666, + "step": 23388 + }, + { + "epoch": 13.066480446927374, + "grad_norm": 3.466019630432129, + "learning_rate": 0.00034801120448179273, + "loss": 0.516, + "step": 23389 + }, + { + "epoch": 13.067039106145252, + "grad_norm": 0.421601265668869, + "learning_rate": 0.00034798319327731094, + "loss": 0.3844, + "step": 23390 + }, + { + "epoch": 13.067597765363129, + "grad_norm": 0.4610384404659271, + "learning_rate": 0.0003479551820728291, + "loss": 0.4689, + "step": 23391 + }, + { + "epoch": 13.068156424581005, + "grad_norm": 4.263872146606445, + "learning_rate": 0.00034792717086834735, + "loss": 0.4303, + "step": 23392 + }, + { + "epoch": 13.068715083798883, + "grad_norm": 0.8405833840370178, + "learning_rate": 0.00034789915966386555, + "loss": 0.355, + "step": 23393 + }, + { + "epoch": 13.06927374301676, + "grad_norm": 4.102142810821533, + "learning_rate": 0.00034787114845938376, + "loss": 0.4393, + "step": 23394 + }, + { + "epoch": 13.069832402234637, + "grad_norm": 0.5717090368270874, + "learning_rate": 0.00034784313725490197, + "loss": 0.5269, + "step": 23395 + }, + { + "epoch": 13.070391061452513, + "grad_norm": 0.5308569073677063, + "learning_rate": 0.00034781512605042017, + "loss": 0.4395, + "step": 23396 + }, + { + "epoch": 13.070949720670392, + "grad_norm": 0.40274500846862793, + "learning_rate": 0.0003477871148459384, + "loss": 0.3977, + "step": 23397 + }, + { + "epoch": 13.071508379888268, + "grad_norm": 0.6104032397270203, + "learning_rate": 0.0003477591036414566, + "loss": 0.4386, + "step": 23398 + }, + { + "epoch": 13.072067039106145, + "grad_norm": 0.3510996699333191, + "learning_rate": 0.0003477310924369748, + "loss": 0.4816, + "step": 23399 + }, + { + "epoch": 13.072625698324023, + "grad_norm": 0.4035705327987671, + "learning_rate": 0.000347703081232493, + "loss": 0.3293, + "step": 23400 + }, + { + "epoch": 13.0731843575419, + "grad_norm": 0.45566388964653015, + "learning_rate": 0.0003476750700280112, + "loss": 0.4375, + "step": 23401 + }, + { + "epoch": 13.073743016759776, + "grad_norm": 0.4309523403644562, + "learning_rate": 0.0003476470588235294, + "loss": 0.38, + "step": 23402 + }, + { + "epoch": 13.074301675977654, + "grad_norm": 0.3658255934715271, + "learning_rate": 0.0003476190476190476, + "loss": 0.3627, + "step": 23403 + }, + { + "epoch": 13.074860335195531, + "grad_norm": 0.3523833155632019, + "learning_rate": 0.0003475910364145659, + "loss": 0.4075, + "step": 23404 + }, + { + "epoch": 13.075418994413408, + "grad_norm": 0.4560738801956177, + "learning_rate": 0.000347563025210084, + "loss": 0.412, + "step": 23405 + }, + { + "epoch": 13.075977653631284, + "grad_norm": 1.2952686548233032, + "learning_rate": 0.00034753501400560223, + "loss": 0.3877, + "step": 23406 + }, + { + "epoch": 13.076536312849163, + "grad_norm": 0.3817651569843292, + "learning_rate": 0.00034750700280112044, + "loss": 0.3739, + "step": 23407 + }, + { + "epoch": 13.077094972067039, + "grad_norm": 0.5688372850418091, + "learning_rate": 0.00034747899159663864, + "loss": 0.402, + "step": 23408 + }, + { + "epoch": 13.077653631284916, + "grad_norm": 0.43046021461486816, + "learning_rate": 0.0003474509803921569, + "loss": 0.3634, + "step": 23409 + }, + { + "epoch": 13.078212290502794, + "grad_norm": 0.3656826317310333, + "learning_rate": 0.00034742296918767506, + "loss": 0.3151, + "step": 23410 + }, + { + "epoch": 13.07877094972067, + "grad_norm": 0.4676840305328369, + "learning_rate": 0.00034739495798319326, + "loss": 0.4161, + "step": 23411 + }, + { + "epoch": 13.079329608938547, + "grad_norm": 0.48933640122413635, + "learning_rate": 0.0003473669467787115, + "loss": 0.5191, + "step": 23412 + }, + { + "epoch": 13.079888268156424, + "grad_norm": 0.6550039649009705, + "learning_rate": 0.0003473389355742297, + "loss": 0.3779, + "step": 23413 + }, + { + "epoch": 13.080446927374302, + "grad_norm": 0.5361563563346863, + "learning_rate": 0.00034731092436974793, + "loss": 0.4868, + "step": 23414 + }, + { + "epoch": 13.081005586592179, + "grad_norm": 0.4756554961204529, + "learning_rate": 0.0003472829131652661, + "loss": 0.5032, + "step": 23415 + }, + { + "epoch": 13.081564245810055, + "grad_norm": 0.527836799621582, + "learning_rate": 0.0003472549019607843, + "loss": 0.5133, + "step": 23416 + }, + { + "epoch": 13.082122905027934, + "grad_norm": 0.7004073262214661, + "learning_rate": 0.00034722689075630255, + "loss": 0.3439, + "step": 23417 + }, + { + "epoch": 13.08268156424581, + "grad_norm": 0.5956624746322632, + "learning_rate": 0.0003471988795518207, + "loss": 0.3976, + "step": 23418 + }, + { + "epoch": 13.083240223463687, + "grad_norm": 0.4042103886604309, + "learning_rate": 0.00034717086834733896, + "loss": 0.3206, + "step": 23419 + }, + { + "epoch": 13.083798882681565, + "grad_norm": 0.4299623966217041, + "learning_rate": 0.00034714285714285717, + "loss": 0.3968, + "step": 23420 + }, + { + "epoch": 13.084357541899442, + "grad_norm": 2.804033041000366, + "learning_rate": 0.0003471148459383753, + "loss": 0.4656, + "step": 23421 + }, + { + "epoch": 13.084916201117318, + "grad_norm": 0.42832913994789124, + "learning_rate": 0.0003470868347338936, + "loss": 0.4292, + "step": 23422 + }, + { + "epoch": 13.085474860335195, + "grad_norm": 0.8991368412971497, + "learning_rate": 0.00034705882352941173, + "loss": 0.5082, + "step": 23423 + }, + { + "epoch": 13.086033519553073, + "grad_norm": 0.4675876498222351, + "learning_rate": 0.00034703081232493, + "loss": 0.4235, + "step": 23424 + }, + { + "epoch": 13.08659217877095, + "grad_norm": 0.5286150574684143, + "learning_rate": 0.0003470028011204482, + "loss": 0.3207, + "step": 23425 + }, + { + "epoch": 13.087150837988826, + "grad_norm": 0.5329135656356812, + "learning_rate": 0.00034697478991596635, + "loss": 0.4508, + "step": 23426 + }, + { + "epoch": 13.087709497206705, + "grad_norm": 0.48325949907302856, + "learning_rate": 0.0003469467787114846, + "loss": 0.4762, + "step": 23427 + }, + { + "epoch": 13.088268156424581, + "grad_norm": 0.776962161064148, + "learning_rate": 0.0003469187675070028, + "loss": 0.3713, + "step": 23428 + }, + { + "epoch": 13.088826815642458, + "grad_norm": 0.45045456290245056, + "learning_rate": 0.000346890756302521, + "loss": 0.4427, + "step": 23429 + }, + { + "epoch": 13.089385474860336, + "grad_norm": 0.6555449366569519, + "learning_rate": 0.00034686274509803923, + "loss": 0.3658, + "step": 23430 + }, + { + "epoch": 13.089944134078213, + "grad_norm": 0.5370076894760132, + "learning_rate": 0.0003468347338935574, + "loss": 0.3792, + "step": 23431 + }, + { + "epoch": 13.09050279329609, + "grad_norm": 0.7479352951049805, + "learning_rate": 0.00034680672268907564, + "loss": 0.5671, + "step": 23432 + }, + { + "epoch": 13.091061452513966, + "grad_norm": 1.0681031942367554, + "learning_rate": 0.00034677871148459385, + "loss": 0.4474, + "step": 23433 + }, + { + "epoch": 13.091620111731844, + "grad_norm": 0.4158102869987488, + "learning_rate": 0.00034675070028011205, + "loss": 0.3915, + "step": 23434 + }, + { + "epoch": 13.09217877094972, + "grad_norm": 8.41711711883545, + "learning_rate": 0.00034672268907563026, + "loss": 0.4198, + "step": 23435 + }, + { + "epoch": 13.092737430167597, + "grad_norm": 0.4590073227882385, + "learning_rate": 0.00034669467787114847, + "loss": 0.4278, + "step": 23436 + }, + { + "epoch": 13.093296089385476, + "grad_norm": 0.5667222738265991, + "learning_rate": 0.00034666666666666667, + "loss": 0.3699, + "step": 23437 + }, + { + "epoch": 13.093854748603352, + "grad_norm": 0.46746981143951416, + "learning_rate": 0.0003466386554621849, + "loss": 0.3222, + "step": 23438 + }, + { + "epoch": 13.094413407821229, + "grad_norm": 0.3760402500629425, + "learning_rate": 0.00034661064425770314, + "loss": 0.3534, + "step": 23439 + }, + { + "epoch": 13.094972067039107, + "grad_norm": 0.608992338180542, + "learning_rate": 0.0003465826330532213, + "loss": 0.4393, + "step": 23440 + }, + { + "epoch": 13.095530726256984, + "grad_norm": 0.749060332775116, + "learning_rate": 0.0003465546218487395, + "loss": 0.3955, + "step": 23441 + }, + { + "epoch": 13.09608938547486, + "grad_norm": 1.5715206861495972, + "learning_rate": 0.0003465266106442577, + "loss": 0.4733, + "step": 23442 + }, + { + "epoch": 13.096648044692737, + "grad_norm": 0.45165273547172546, + "learning_rate": 0.0003464985994397759, + "loss": 0.3453, + "step": 23443 + }, + { + "epoch": 13.097206703910615, + "grad_norm": 0.6978132128715515, + "learning_rate": 0.00034647058823529417, + "loss": 0.3558, + "step": 23444 + }, + { + "epoch": 13.097765363128492, + "grad_norm": 0.4665333330631256, + "learning_rate": 0.0003464425770308123, + "loss": 0.4694, + "step": 23445 + }, + { + "epoch": 13.098324022346368, + "grad_norm": 0.47941717505455017, + "learning_rate": 0.0003464145658263305, + "loss": 0.4267, + "step": 23446 + }, + { + "epoch": 13.098882681564247, + "grad_norm": 0.7825055122375488, + "learning_rate": 0.0003463865546218488, + "loss": 0.4784, + "step": 23447 + }, + { + "epoch": 13.099441340782123, + "grad_norm": 0.7211691737174988, + "learning_rate": 0.00034635854341736694, + "loss": 0.3982, + "step": 23448 + }, + { + "epoch": 13.1, + "grad_norm": 0.41671818494796753, + "learning_rate": 0.0003463305322128852, + "loss": 0.3724, + "step": 23449 + }, + { + "epoch": 13.100558659217878, + "grad_norm": 0.5795016288757324, + "learning_rate": 0.00034630252100840335, + "loss": 0.4754, + "step": 23450 + }, + { + "epoch": 13.101117318435755, + "grad_norm": 0.5600852966308594, + "learning_rate": 0.00034627450980392156, + "loss": 0.5074, + "step": 23451 + }, + { + "epoch": 13.101675977653631, + "grad_norm": 0.34581464529037476, + "learning_rate": 0.0003462464985994398, + "loss": 0.3788, + "step": 23452 + }, + { + "epoch": 13.102234636871508, + "grad_norm": 0.4762992858886719, + "learning_rate": 0.00034621848739495797, + "loss": 0.4919, + "step": 23453 + }, + { + "epoch": 13.102793296089386, + "grad_norm": 0.43008309602737427, + "learning_rate": 0.00034619047619047623, + "loss": 0.3592, + "step": 23454 + }, + { + "epoch": 13.103351955307263, + "grad_norm": 0.48411741852760315, + "learning_rate": 0.00034616246498599443, + "loss": 0.4635, + "step": 23455 + }, + { + "epoch": 13.10391061452514, + "grad_norm": 0.7330057621002197, + "learning_rate": 0.0003461344537815126, + "loss": 0.3535, + "step": 23456 + }, + { + "epoch": 13.104469273743018, + "grad_norm": 0.43695884943008423, + "learning_rate": 0.00034610644257703085, + "loss": 0.361, + "step": 23457 + }, + { + "epoch": 13.105027932960894, + "grad_norm": 1.7426176071166992, + "learning_rate": 0.000346078431372549, + "loss": 0.3413, + "step": 23458 + }, + { + "epoch": 13.10558659217877, + "grad_norm": 0.3531784117221832, + "learning_rate": 0.00034605042016806726, + "loss": 0.3509, + "step": 23459 + }, + { + "epoch": 13.106145251396647, + "grad_norm": 0.39489856362342834, + "learning_rate": 0.00034602240896358546, + "loss": 0.4728, + "step": 23460 + }, + { + "epoch": 13.106703910614526, + "grad_norm": 0.5669115781784058, + "learning_rate": 0.0003459943977591036, + "loss": 0.4042, + "step": 23461 + }, + { + "epoch": 13.107262569832402, + "grad_norm": 0.4808441698551178, + "learning_rate": 0.0003459663865546219, + "loss": 0.4154, + "step": 23462 + }, + { + "epoch": 13.107821229050279, + "grad_norm": 0.8627301454544067, + "learning_rate": 0.0003459383753501401, + "loss": 0.319, + "step": 23463 + }, + { + "epoch": 13.108379888268157, + "grad_norm": 0.3973948061466217, + "learning_rate": 0.0003459103641456583, + "loss": 0.4312, + "step": 23464 + }, + { + "epoch": 13.108938547486034, + "grad_norm": 0.6303219795227051, + "learning_rate": 0.0003458823529411765, + "loss": 0.3947, + "step": 23465 + }, + { + "epoch": 13.10949720670391, + "grad_norm": 0.44183576107025146, + "learning_rate": 0.00034585434173669465, + "loss": 0.4253, + "step": 23466 + }, + { + "epoch": 13.110055865921789, + "grad_norm": 0.3627544939517975, + "learning_rate": 0.0003458263305322129, + "loss": 0.445, + "step": 23467 + }, + { + "epoch": 13.110614525139665, + "grad_norm": 0.406851202249527, + "learning_rate": 0.0003457983193277311, + "loss": 0.3856, + "step": 23468 + }, + { + "epoch": 13.111173184357542, + "grad_norm": 1.479478120803833, + "learning_rate": 0.0003457703081232493, + "loss": 0.3414, + "step": 23469 + }, + { + "epoch": 13.111731843575418, + "grad_norm": 0.6937291622161865, + "learning_rate": 0.0003457422969187675, + "loss": 0.3947, + "step": 23470 + }, + { + "epoch": 13.112290502793297, + "grad_norm": 0.3560657799243927, + "learning_rate": 0.00034571428571428573, + "loss": 0.3749, + "step": 23471 + }, + { + "epoch": 13.112849162011173, + "grad_norm": 0.3498425781726837, + "learning_rate": 0.00034568627450980394, + "loss": 0.3003, + "step": 23472 + }, + { + "epoch": 13.11340782122905, + "grad_norm": 0.46245017647743225, + "learning_rate": 0.00034565826330532214, + "loss": 0.4369, + "step": 23473 + }, + { + "epoch": 13.113966480446928, + "grad_norm": 0.7830196022987366, + "learning_rate": 0.00034563025210084035, + "loss": 0.4534, + "step": 23474 + }, + { + "epoch": 13.114525139664805, + "grad_norm": 0.5229582786560059, + "learning_rate": 0.00034560224089635855, + "loss": 0.496, + "step": 23475 + }, + { + "epoch": 13.115083798882681, + "grad_norm": 0.7278667688369751, + "learning_rate": 0.00034557422969187676, + "loss": 0.4963, + "step": 23476 + }, + { + "epoch": 13.11564245810056, + "grad_norm": 2.452320098876953, + "learning_rate": 0.00034554621848739497, + "loss": 0.326, + "step": 23477 + }, + { + "epoch": 13.116201117318436, + "grad_norm": 2.0859055519104004, + "learning_rate": 0.00034551820728291317, + "loss": 0.5025, + "step": 23478 + }, + { + "epoch": 13.116759776536313, + "grad_norm": 0.6509073376655579, + "learning_rate": 0.00034549019607843143, + "loss": 0.4261, + "step": 23479 + }, + { + "epoch": 13.11731843575419, + "grad_norm": 0.4105704426765442, + "learning_rate": 0.0003454621848739496, + "loss": 0.4669, + "step": 23480 + }, + { + "epoch": 13.117877094972068, + "grad_norm": 0.3937210738658905, + "learning_rate": 0.0003454341736694678, + "loss": 0.3452, + "step": 23481 + }, + { + "epoch": 13.118435754189944, + "grad_norm": 0.5726685523986816, + "learning_rate": 0.000345406162464986, + "loss": 0.4207, + "step": 23482 + }, + { + "epoch": 13.11899441340782, + "grad_norm": 0.5402323007583618, + "learning_rate": 0.0003453781512605042, + "loss": 0.4147, + "step": 23483 + }, + { + "epoch": 13.119553072625699, + "grad_norm": 0.5098580121994019, + "learning_rate": 0.00034535014005602246, + "loss": 0.4349, + "step": 23484 + }, + { + "epoch": 13.120111731843576, + "grad_norm": 0.5631532073020935, + "learning_rate": 0.0003453221288515406, + "loss": 0.3586, + "step": 23485 + }, + { + "epoch": 13.120670391061452, + "grad_norm": 0.6478902101516724, + "learning_rate": 0.0003452941176470588, + "loss": 0.5709, + "step": 23486 + }, + { + "epoch": 13.121229050279329, + "grad_norm": 0.35418611764907837, + "learning_rate": 0.0003452661064425771, + "loss": 0.3667, + "step": 23487 + }, + { + "epoch": 13.121787709497207, + "grad_norm": 0.6710458397865295, + "learning_rate": 0.00034523809523809523, + "loss": 0.459, + "step": 23488 + }, + { + "epoch": 13.122346368715084, + "grad_norm": 0.9335286617279053, + "learning_rate": 0.00034521008403361344, + "loss": 0.4511, + "step": 23489 + }, + { + "epoch": 13.12290502793296, + "grad_norm": 0.39675742387771606, + "learning_rate": 0.00034518207282913164, + "loss": 0.4662, + "step": 23490 + }, + { + "epoch": 13.123463687150839, + "grad_norm": 2.5451738834381104, + "learning_rate": 0.00034515406162464985, + "loss": 0.4111, + "step": 23491 + }, + { + "epoch": 13.124022346368715, + "grad_norm": 0.6817076206207275, + "learning_rate": 0.0003451260504201681, + "loss": 0.4472, + "step": 23492 + }, + { + "epoch": 13.124581005586592, + "grad_norm": 0.597777783870697, + "learning_rate": 0.00034509803921568626, + "loss": 0.4242, + "step": 23493 + }, + { + "epoch": 13.12513966480447, + "grad_norm": 0.38116705417633057, + "learning_rate": 0.00034507002801120447, + "loss": 0.3524, + "step": 23494 + }, + { + "epoch": 13.125698324022347, + "grad_norm": 0.39808329939842224, + "learning_rate": 0.00034504201680672273, + "loss": 0.3632, + "step": 23495 + }, + { + "epoch": 13.126256983240223, + "grad_norm": 0.787678062915802, + "learning_rate": 0.0003450140056022409, + "loss": 0.5201, + "step": 23496 + }, + { + "epoch": 13.1268156424581, + "grad_norm": 0.4769321084022522, + "learning_rate": 0.00034498599439775914, + "loss": 0.4292, + "step": 23497 + }, + { + "epoch": 13.127374301675978, + "grad_norm": 0.8958873748779297, + "learning_rate": 0.0003449579831932773, + "loss": 0.4356, + "step": 23498 + }, + { + "epoch": 13.127932960893855, + "grad_norm": 0.38787928223609924, + "learning_rate": 0.0003449299719887955, + "loss": 0.4488, + "step": 23499 + }, + { + "epoch": 13.128491620111731, + "grad_norm": 3.619854688644409, + "learning_rate": 0.00034490196078431376, + "loss": 0.462, + "step": 23500 + }, + { + "epoch": 13.128491620111731, + "eval_cer": 0.08704024963721864, + "eval_loss": 0.32847243547439575, + "eval_runtime": 55.6347, + "eval_samples_per_second": 81.568, + "eval_steps_per_second": 5.105, + "eval_wer": 0.3444037876036982, + "step": 23500 + }, + { + "epoch": 13.12905027932961, + "grad_norm": 0.6810756325721741, + "learning_rate": 0.0003448739495798319, + "loss": 0.3625, + "step": 23501 + }, + { + "epoch": 13.129608938547486, + "grad_norm": 26.42056655883789, + "learning_rate": 0.00034484593837535017, + "loss": 0.3343, + "step": 23502 + }, + { + "epoch": 13.130167597765363, + "grad_norm": 0.6155726313591003, + "learning_rate": 0.0003448179271708684, + "loss": 0.4215, + "step": 23503 + }, + { + "epoch": 13.130726256983241, + "grad_norm": 0.585280179977417, + "learning_rate": 0.00034478991596638653, + "loss": 0.4785, + "step": 23504 + }, + { + "epoch": 13.131284916201118, + "grad_norm": 0.7052249908447266, + "learning_rate": 0.0003447619047619048, + "loss": 0.3592, + "step": 23505 + }, + { + "epoch": 13.131843575418994, + "grad_norm": 0.3439772129058838, + "learning_rate": 0.00034473389355742294, + "loss": 0.3636, + "step": 23506 + }, + { + "epoch": 13.13240223463687, + "grad_norm": 0.5768067836761475, + "learning_rate": 0.0003447058823529412, + "loss": 0.4817, + "step": 23507 + }, + { + "epoch": 13.132960893854749, + "grad_norm": 0.48665159940719604, + "learning_rate": 0.0003446778711484594, + "loss": 0.4066, + "step": 23508 + }, + { + "epoch": 13.133519553072626, + "grad_norm": 0.6033996939659119, + "learning_rate": 0.00034464985994397756, + "loss": 0.451, + "step": 23509 + }, + { + "epoch": 13.134078212290502, + "grad_norm": 0.35850170254707336, + "learning_rate": 0.0003446218487394958, + "loss": 0.3341, + "step": 23510 + }, + { + "epoch": 13.13463687150838, + "grad_norm": 0.34263238310813904, + "learning_rate": 0.000344593837535014, + "loss": 0.3625, + "step": 23511 + }, + { + "epoch": 13.135195530726257, + "grad_norm": 0.4308985769748688, + "learning_rate": 0.00034456582633053223, + "loss": 0.4469, + "step": 23512 + }, + { + "epoch": 13.135754189944134, + "grad_norm": 0.3533570170402527, + "learning_rate": 0.00034453781512605044, + "loss": 0.2687, + "step": 23513 + }, + { + "epoch": 13.136312849162012, + "grad_norm": 0.4198404550552368, + "learning_rate": 0.0003445098039215686, + "loss": 0.4803, + "step": 23514 + }, + { + "epoch": 13.136871508379889, + "grad_norm": 1.7108770608901978, + "learning_rate": 0.00034448179271708685, + "loss": 0.4725, + "step": 23515 + }, + { + "epoch": 13.137430167597765, + "grad_norm": 0.6219730377197266, + "learning_rate": 0.00034445378151260505, + "loss": 0.4563, + "step": 23516 + }, + { + "epoch": 13.137988826815642, + "grad_norm": 0.8826090097427368, + "learning_rate": 0.00034442577030812326, + "loss": 0.3904, + "step": 23517 + }, + { + "epoch": 13.13854748603352, + "grad_norm": 0.37981295585632324, + "learning_rate": 0.00034439775910364147, + "loss": 0.3292, + "step": 23518 + }, + { + "epoch": 13.139106145251397, + "grad_norm": 0.5105723142623901, + "learning_rate": 0.00034436974789915967, + "loss": 0.3904, + "step": 23519 + }, + { + "epoch": 13.139664804469273, + "grad_norm": 0.3931066393852234, + "learning_rate": 0.0003443417366946779, + "loss": 0.4326, + "step": 23520 + }, + { + "epoch": 13.140223463687152, + "grad_norm": 0.44986462593078613, + "learning_rate": 0.0003443137254901961, + "loss": 0.4258, + "step": 23521 + }, + { + "epoch": 13.140782122905028, + "grad_norm": 0.5707066655158997, + "learning_rate": 0.0003442857142857143, + "loss": 0.5597, + "step": 23522 + }, + { + "epoch": 13.141340782122905, + "grad_norm": 0.5863339304924011, + "learning_rate": 0.0003442577030812325, + "loss": 0.427, + "step": 23523 + }, + { + "epoch": 13.141899441340781, + "grad_norm": 0.615108847618103, + "learning_rate": 0.0003442296918767507, + "loss": 0.4913, + "step": 23524 + }, + { + "epoch": 13.14245810055866, + "grad_norm": 0.5812993049621582, + "learning_rate": 0.0003442016806722689, + "loss": 0.394, + "step": 23525 + }, + { + "epoch": 13.143016759776536, + "grad_norm": 1.0453574657440186, + "learning_rate": 0.0003441736694677871, + "loss": 0.4934, + "step": 23526 + }, + { + "epoch": 13.143575418994413, + "grad_norm": 0.7637466192245483, + "learning_rate": 0.0003441456582633054, + "loss": 0.4815, + "step": 23527 + }, + { + "epoch": 13.144134078212291, + "grad_norm": 0.7954033613204956, + "learning_rate": 0.0003441176470588235, + "loss": 0.3582, + "step": 23528 + }, + { + "epoch": 13.144692737430168, + "grad_norm": 4.8569016456604, + "learning_rate": 0.00034408963585434173, + "loss": 0.483, + "step": 23529 + }, + { + "epoch": 13.145251396648044, + "grad_norm": 0.3829418420791626, + "learning_rate": 0.00034406162464985994, + "loss": 0.4215, + "step": 23530 + }, + { + "epoch": 13.145810055865923, + "grad_norm": 0.4325534403324127, + "learning_rate": 0.00034403361344537814, + "loss": 0.4284, + "step": 23531 + }, + { + "epoch": 13.1463687150838, + "grad_norm": 0.8124381899833679, + "learning_rate": 0.0003440056022408964, + "loss": 0.4473, + "step": 23532 + }, + { + "epoch": 13.146927374301676, + "grad_norm": 0.35435134172439575, + "learning_rate": 0.00034397759103641456, + "loss": 0.3891, + "step": 23533 + }, + { + "epoch": 13.147486033519552, + "grad_norm": 0.4271470308303833, + "learning_rate": 0.00034394957983193276, + "loss": 0.4484, + "step": 23534 + }, + { + "epoch": 13.14804469273743, + "grad_norm": 0.5407982468605042, + "learning_rate": 0.000343921568627451, + "loss": 0.4232, + "step": 23535 + }, + { + "epoch": 13.148603351955307, + "grad_norm": 0.5933109521865845, + "learning_rate": 0.0003438935574229692, + "loss": 0.4179, + "step": 23536 + }, + { + "epoch": 13.149162011173184, + "grad_norm": 0.5191864967346191, + "learning_rate": 0.00034386554621848743, + "loss": 0.4349, + "step": 23537 + }, + { + "epoch": 13.149720670391062, + "grad_norm": 1.0307084321975708, + "learning_rate": 0.0003438375350140056, + "loss": 0.6456, + "step": 23538 + }, + { + "epoch": 13.150279329608939, + "grad_norm": 0.31632161140441895, + "learning_rate": 0.0003438095238095238, + "loss": 0.3445, + "step": 23539 + }, + { + "epoch": 13.150837988826815, + "grad_norm": 0.36762794852256775, + "learning_rate": 0.00034378151260504205, + "loss": 0.3837, + "step": 23540 + }, + { + "epoch": 13.151396648044694, + "grad_norm": 0.454878568649292, + "learning_rate": 0.0003437535014005602, + "loss": 0.4148, + "step": 23541 + }, + { + "epoch": 13.15195530726257, + "grad_norm": 0.7697305679321289, + "learning_rate": 0.00034372549019607846, + "loss": 0.5602, + "step": 23542 + }, + { + "epoch": 13.152513966480447, + "grad_norm": 0.44395360350608826, + "learning_rate": 0.00034369747899159667, + "loss": 0.3639, + "step": 23543 + }, + { + "epoch": 13.153072625698323, + "grad_norm": 0.47492894530296326, + "learning_rate": 0.0003436694677871148, + "loss": 0.4208, + "step": 23544 + }, + { + "epoch": 13.153631284916202, + "grad_norm": 0.36974403262138367, + "learning_rate": 0.0003436414565826331, + "loss": 0.3543, + "step": 23545 + }, + { + "epoch": 13.154189944134078, + "grad_norm": 0.7163898944854736, + "learning_rate": 0.00034361344537815123, + "loss": 0.5361, + "step": 23546 + }, + { + "epoch": 13.154748603351955, + "grad_norm": 0.7160236835479736, + "learning_rate": 0.0003435854341736695, + "loss": 0.4204, + "step": 23547 + }, + { + "epoch": 13.155307262569833, + "grad_norm": 0.4247840940952301, + "learning_rate": 0.0003435574229691877, + "loss": 0.3608, + "step": 23548 + }, + { + "epoch": 13.15586592178771, + "grad_norm": 0.3794199824333191, + "learning_rate": 0.00034352941176470585, + "loss": 0.3944, + "step": 23549 + }, + { + "epoch": 13.156424581005586, + "grad_norm": 0.6485399603843689, + "learning_rate": 0.0003435014005602241, + "loss": 0.4774, + "step": 23550 + }, + { + "epoch": 13.156983240223465, + "grad_norm": 0.4471887946128845, + "learning_rate": 0.0003434733893557423, + "loss": 0.38, + "step": 23551 + }, + { + "epoch": 13.157541899441341, + "grad_norm": 0.8795566558837891, + "learning_rate": 0.0003434453781512605, + "loss": 0.4458, + "step": 23552 + }, + { + "epoch": 13.158100558659218, + "grad_norm": 0.4335973560810089, + "learning_rate": 0.00034341736694677873, + "loss": 0.4366, + "step": 23553 + }, + { + "epoch": 13.158659217877094, + "grad_norm": 1.1662137508392334, + "learning_rate": 0.0003433893557422969, + "loss": 0.5791, + "step": 23554 + }, + { + "epoch": 13.159217877094973, + "grad_norm": 0.7169398069381714, + "learning_rate": 0.00034336134453781514, + "loss": 0.3939, + "step": 23555 + }, + { + "epoch": 13.15977653631285, + "grad_norm": 2.2903497219085693, + "learning_rate": 0.00034333333333333335, + "loss": 0.7638, + "step": 23556 + }, + { + "epoch": 13.160335195530726, + "grad_norm": 0.46845901012420654, + "learning_rate": 0.00034330532212885155, + "loss": 0.4014, + "step": 23557 + }, + { + "epoch": 13.160893854748604, + "grad_norm": 0.5294485092163086, + "learning_rate": 0.00034327731092436976, + "loss": 0.3926, + "step": 23558 + }, + { + "epoch": 13.16145251396648, + "grad_norm": 0.6986362338066101, + "learning_rate": 0.00034324929971988797, + "loss": 0.4263, + "step": 23559 + }, + { + "epoch": 13.162011173184357, + "grad_norm": 0.7757440805435181, + "learning_rate": 0.00034322128851540617, + "loss": 0.4805, + "step": 23560 + }, + { + "epoch": 13.162569832402234, + "grad_norm": 0.6362632513046265, + "learning_rate": 0.0003431932773109244, + "loss": 0.5545, + "step": 23561 + }, + { + "epoch": 13.163128491620112, + "grad_norm": 0.5044678449630737, + "learning_rate": 0.0003431652661064426, + "loss": 0.3777, + "step": 23562 + }, + { + "epoch": 13.163687150837989, + "grad_norm": 0.49659910798072815, + "learning_rate": 0.0003431372549019608, + "loss": 0.3936, + "step": 23563 + }, + { + "epoch": 13.164245810055865, + "grad_norm": 0.6571908593177795, + "learning_rate": 0.000343109243697479, + "loss": 0.5951, + "step": 23564 + }, + { + "epoch": 13.164804469273744, + "grad_norm": 0.972420334815979, + "learning_rate": 0.0003430812324929972, + "loss": 0.4665, + "step": 23565 + }, + { + "epoch": 13.16536312849162, + "grad_norm": 0.5162473917007446, + "learning_rate": 0.0003430532212885154, + "loss": 0.387, + "step": 23566 + }, + { + "epoch": 13.165921787709497, + "grad_norm": 0.5060432553291321, + "learning_rate": 0.00034302521008403367, + "loss": 0.3819, + "step": 23567 + }, + { + "epoch": 13.166480446927375, + "grad_norm": 3.235938787460327, + "learning_rate": 0.0003429971988795518, + "loss": 0.5093, + "step": 23568 + }, + { + "epoch": 13.167039106145252, + "grad_norm": 0.40038686990737915, + "learning_rate": 0.00034296918767507, + "loss": 0.3489, + "step": 23569 + }, + { + "epoch": 13.167597765363128, + "grad_norm": 0.4324215054512024, + "learning_rate": 0.00034294117647058823, + "loss": 0.3479, + "step": 23570 + }, + { + "epoch": 13.168156424581005, + "grad_norm": 0.3749639689922333, + "learning_rate": 0.00034291316526610644, + "loss": 0.3425, + "step": 23571 + }, + { + "epoch": 13.168715083798883, + "grad_norm": 0.42007380723953247, + "learning_rate": 0.0003428851540616247, + "loss": 0.4433, + "step": 23572 + }, + { + "epoch": 13.16927374301676, + "grad_norm": 3.701608657836914, + "learning_rate": 0.00034285714285714285, + "loss": 0.6659, + "step": 23573 + }, + { + "epoch": 13.169832402234636, + "grad_norm": 0.6986651420593262, + "learning_rate": 0.00034282913165266106, + "loss": 0.5085, + "step": 23574 + }, + { + "epoch": 13.170391061452515, + "grad_norm": 0.5721229910850525, + "learning_rate": 0.0003428011204481793, + "loss": 0.4654, + "step": 23575 + }, + { + "epoch": 13.170949720670391, + "grad_norm": 0.5399644374847412, + "learning_rate": 0.00034277310924369747, + "loss": 0.3784, + "step": 23576 + }, + { + "epoch": 13.171508379888268, + "grad_norm": 0.5080203413963318, + "learning_rate": 0.00034274509803921573, + "loss": 0.5532, + "step": 23577 + }, + { + "epoch": 13.172067039106146, + "grad_norm": 0.5806756615638733, + "learning_rate": 0.0003427170868347339, + "loss": 0.4165, + "step": 23578 + }, + { + "epoch": 13.172625698324023, + "grad_norm": 0.6362572312355042, + "learning_rate": 0.0003426890756302521, + "loss": 0.5603, + "step": 23579 + }, + { + "epoch": 13.1731843575419, + "grad_norm": 0.3594598174095154, + "learning_rate": 0.00034266106442577035, + "loss": 0.4381, + "step": 23580 + }, + { + "epoch": 13.173743016759776, + "grad_norm": 0.863064706325531, + "learning_rate": 0.0003426330532212885, + "loss": 0.5102, + "step": 23581 + }, + { + "epoch": 13.174301675977654, + "grad_norm": 36.3951530456543, + "learning_rate": 0.00034260504201680676, + "loss": 0.3892, + "step": 23582 + }, + { + "epoch": 13.17486033519553, + "grad_norm": 0.6443665623664856, + "learning_rate": 0.00034257703081232496, + "loss": 0.3876, + "step": 23583 + }, + { + "epoch": 13.175418994413407, + "grad_norm": 0.4082416594028473, + "learning_rate": 0.0003425490196078431, + "loss": 0.2843, + "step": 23584 + }, + { + "epoch": 13.175977653631286, + "grad_norm": 0.4457329511642456, + "learning_rate": 0.0003425210084033614, + "loss": 0.3766, + "step": 23585 + }, + { + "epoch": 13.176536312849162, + "grad_norm": 0.370343953371048, + "learning_rate": 0.00034249299719887953, + "loss": 0.4197, + "step": 23586 + }, + { + "epoch": 13.177094972067039, + "grad_norm": 0.3777053952217102, + "learning_rate": 0.0003424649859943978, + "loss": 0.4416, + "step": 23587 + }, + { + "epoch": 13.177653631284917, + "grad_norm": 0.454223096370697, + "learning_rate": 0.000342436974789916, + "loss": 0.3687, + "step": 23588 + }, + { + "epoch": 13.178212290502794, + "grad_norm": 0.4108923673629761, + "learning_rate": 0.00034240896358543415, + "loss": 0.379, + "step": 23589 + }, + { + "epoch": 13.17877094972067, + "grad_norm": 0.5979317426681519, + "learning_rate": 0.0003423809523809524, + "loss": 0.4423, + "step": 23590 + }, + { + "epoch": 13.179329608938547, + "grad_norm": 0.867979884147644, + "learning_rate": 0.0003423529411764706, + "loss": 0.4397, + "step": 23591 + }, + { + "epoch": 13.179888268156425, + "grad_norm": 0.5001507997512817, + "learning_rate": 0.0003423249299719888, + "loss": 0.4925, + "step": 23592 + }, + { + "epoch": 13.180446927374302, + "grad_norm": 0.5346710085868835, + "learning_rate": 0.000342296918767507, + "loss": 0.4079, + "step": 23593 + }, + { + "epoch": 13.181005586592178, + "grad_norm": 0.5234153866767883, + "learning_rate": 0.0003422689075630252, + "loss": 0.362, + "step": 23594 + }, + { + "epoch": 13.181564245810057, + "grad_norm": 0.6689703464508057, + "learning_rate": 0.00034224089635854344, + "loss": 0.43, + "step": 23595 + }, + { + "epoch": 13.182122905027933, + "grad_norm": 0.4779629111289978, + "learning_rate": 0.00034221288515406164, + "loss": 0.4067, + "step": 23596 + }, + { + "epoch": 13.18268156424581, + "grad_norm": 0.6178259253501892, + "learning_rate": 0.0003421848739495798, + "loss": 0.4587, + "step": 23597 + }, + { + "epoch": 13.183240223463686, + "grad_norm": 0.42503488063812256, + "learning_rate": 0.00034215686274509805, + "loss": 0.3733, + "step": 23598 + }, + { + "epoch": 13.183798882681565, + "grad_norm": 1.5793695449829102, + "learning_rate": 0.00034212885154061626, + "loss": 0.4202, + "step": 23599 + }, + { + "epoch": 13.184357541899441, + "grad_norm": 0.45896288752555847, + "learning_rate": 0.00034210084033613447, + "loss": 0.6113, + "step": 23600 + }, + { + "epoch": 13.184916201117318, + "grad_norm": 0.46592289209365845, + "learning_rate": 0.00034207282913165267, + "loss": 0.4403, + "step": 23601 + }, + { + "epoch": 13.185474860335196, + "grad_norm": 0.49267512559890747, + "learning_rate": 0.0003420448179271708, + "loss": 0.4694, + "step": 23602 + }, + { + "epoch": 13.186033519553073, + "grad_norm": 0.35972535610198975, + "learning_rate": 0.0003420168067226891, + "loss": 0.3231, + "step": 23603 + }, + { + "epoch": 13.18659217877095, + "grad_norm": 0.4554131031036377, + "learning_rate": 0.0003419887955182073, + "loss": 0.4257, + "step": 23604 + }, + { + "epoch": 13.187150837988828, + "grad_norm": 0.3862420916557312, + "learning_rate": 0.0003419607843137255, + "loss": 0.4125, + "step": 23605 + }, + { + "epoch": 13.187709497206704, + "grad_norm": 0.5158444046974182, + "learning_rate": 0.0003419327731092437, + "loss": 0.4389, + "step": 23606 + }, + { + "epoch": 13.18826815642458, + "grad_norm": 0.683224081993103, + "learning_rate": 0.0003419047619047619, + "loss": 0.4378, + "step": 23607 + }, + { + "epoch": 13.188826815642457, + "grad_norm": 1.318318247795105, + "learning_rate": 0.0003418767507002801, + "loss": 0.3757, + "step": 23608 + }, + { + "epoch": 13.189385474860336, + "grad_norm": 1.172843337059021, + "learning_rate": 0.0003418487394957983, + "loss": 0.3676, + "step": 23609 + }, + { + "epoch": 13.189944134078212, + "grad_norm": 0.3646121621131897, + "learning_rate": 0.0003418207282913165, + "loss": 0.3366, + "step": 23610 + }, + { + "epoch": 13.190502793296089, + "grad_norm": 0.5097604393959045, + "learning_rate": 0.00034179271708683473, + "loss": 0.433, + "step": 23611 + }, + { + "epoch": 13.191061452513967, + "grad_norm": 1.0712482929229736, + "learning_rate": 0.00034176470588235294, + "loss": 0.4321, + "step": 23612 + }, + { + "epoch": 13.191620111731844, + "grad_norm": 0.6039049625396729, + "learning_rate": 0.00034173669467787114, + "loss": 0.4369, + "step": 23613 + }, + { + "epoch": 13.19217877094972, + "grad_norm": 0.6225924491882324, + "learning_rate": 0.00034170868347338935, + "loss": 0.449, + "step": 23614 + }, + { + "epoch": 13.192737430167599, + "grad_norm": 0.44095805287361145, + "learning_rate": 0.0003416806722689076, + "loss": 0.2924, + "step": 23615 + }, + { + "epoch": 13.193296089385475, + "grad_norm": 0.5484685301780701, + "learning_rate": 0.00034165266106442576, + "loss": 0.4602, + "step": 23616 + }, + { + "epoch": 13.193854748603352, + "grad_norm": 0.37980830669403076, + "learning_rate": 0.00034162464985994397, + "loss": 0.4425, + "step": 23617 + }, + { + "epoch": 13.194413407821228, + "grad_norm": 0.3541870415210724, + "learning_rate": 0.0003415966386554622, + "loss": 0.388, + "step": 23618 + }, + { + "epoch": 13.194972067039107, + "grad_norm": 2.8225080966949463, + "learning_rate": 0.0003415686274509804, + "loss": 0.423, + "step": 23619 + }, + { + "epoch": 13.195530726256983, + "grad_norm": 0.4947691857814789, + "learning_rate": 0.00034154061624649864, + "loss": 0.4027, + "step": 23620 + }, + { + "epoch": 13.19608938547486, + "grad_norm": 0.5435184836387634, + "learning_rate": 0.0003415126050420168, + "loss": 0.3603, + "step": 23621 + }, + { + "epoch": 13.196648044692738, + "grad_norm": 0.5195969343185425, + "learning_rate": 0.000341484593837535, + "loss": 0.397, + "step": 23622 + }, + { + "epoch": 13.197206703910615, + "grad_norm": 0.3950599431991577, + "learning_rate": 0.00034145658263305326, + "loss": 0.4146, + "step": 23623 + }, + { + "epoch": 13.197765363128491, + "grad_norm": 0.5738654732704163, + "learning_rate": 0.0003414285714285714, + "loss": 0.3531, + "step": 23624 + }, + { + "epoch": 13.19832402234637, + "grad_norm": 0.593899667263031, + "learning_rate": 0.00034140056022408967, + "loss": 0.4783, + "step": 23625 + }, + { + "epoch": 13.198882681564246, + "grad_norm": 0.4480629861354828, + "learning_rate": 0.0003413725490196078, + "loss": 0.469, + "step": 23626 + }, + { + "epoch": 13.199441340782123, + "grad_norm": 1.3859435319900513, + "learning_rate": 0.00034134453781512603, + "loss": 0.4182, + "step": 23627 + }, + { + "epoch": 13.2, + "grad_norm": 0.7241740822792053, + "learning_rate": 0.0003413165266106443, + "loss": 0.4453, + "step": 23628 + }, + { + "epoch": 13.200558659217878, + "grad_norm": 0.4517875909805298, + "learning_rate": 0.00034128851540616244, + "loss": 0.5206, + "step": 23629 + }, + { + "epoch": 13.201117318435754, + "grad_norm": 0.37582507729530334, + "learning_rate": 0.0003412605042016807, + "loss": 0.4036, + "step": 23630 + }, + { + "epoch": 13.20167597765363, + "grad_norm": 0.5926438570022583, + "learning_rate": 0.0003412324929971989, + "loss": 0.4185, + "step": 23631 + }, + { + "epoch": 13.202234636871509, + "grad_norm": 0.4149555265903473, + "learning_rate": 0.00034120448179271706, + "loss": 0.3489, + "step": 23632 + }, + { + "epoch": 13.202793296089386, + "grad_norm": 0.8887465000152588, + "learning_rate": 0.0003411764705882353, + "loss": 0.4132, + "step": 23633 + }, + { + "epoch": 13.203351955307262, + "grad_norm": 2.309170961380005, + "learning_rate": 0.00034114845938375347, + "loss": 0.4776, + "step": 23634 + }, + { + "epoch": 13.203910614525139, + "grad_norm": 0.663011908531189, + "learning_rate": 0.00034112044817927173, + "loss": 0.4734, + "step": 23635 + }, + { + "epoch": 13.204469273743017, + "grad_norm": 0.8255983591079712, + "learning_rate": 0.00034109243697478994, + "loss": 0.4683, + "step": 23636 + }, + { + "epoch": 13.205027932960894, + "grad_norm": 0.48494043946266174, + "learning_rate": 0.0003410644257703081, + "loss": 0.4544, + "step": 23637 + }, + { + "epoch": 13.20558659217877, + "grad_norm": 0.544628381729126, + "learning_rate": 0.00034103641456582635, + "loss": 0.4623, + "step": 23638 + }, + { + "epoch": 13.206145251396649, + "grad_norm": 0.6140303611755371, + "learning_rate": 0.00034100840336134455, + "loss": 0.471, + "step": 23639 + }, + { + "epoch": 13.206703910614525, + "grad_norm": 0.3705027401447296, + "learning_rate": 0.00034098039215686276, + "loss": 0.4034, + "step": 23640 + }, + { + "epoch": 13.207262569832402, + "grad_norm": 0.39516913890838623, + "learning_rate": 0.00034095238095238097, + "loss": 0.4429, + "step": 23641 + }, + { + "epoch": 13.20782122905028, + "grad_norm": 0.4192878007888794, + "learning_rate": 0.0003409243697478991, + "loss": 0.4398, + "step": 23642 + }, + { + "epoch": 13.208379888268157, + "grad_norm": 0.7160155177116394, + "learning_rate": 0.0003408963585434174, + "loss": 0.4594, + "step": 23643 + }, + { + "epoch": 13.208938547486033, + "grad_norm": 1.7860888242721558, + "learning_rate": 0.0003408683473389356, + "loss": 0.3091, + "step": 23644 + }, + { + "epoch": 13.20949720670391, + "grad_norm": 0.7349260449409485, + "learning_rate": 0.0003408403361344538, + "loss": 0.496, + "step": 23645 + }, + { + "epoch": 13.210055865921788, + "grad_norm": 0.6736895442008972, + "learning_rate": 0.000340812324929972, + "loss": 0.4373, + "step": 23646 + }, + { + "epoch": 13.210614525139665, + "grad_norm": 0.7749654054641724, + "learning_rate": 0.0003407843137254902, + "loss": 0.5233, + "step": 23647 + }, + { + "epoch": 13.211173184357541, + "grad_norm": 1.0293306112289429, + "learning_rate": 0.0003407563025210084, + "loss": 0.4343, + "step": 23648 + }, + { + "epoch": 13.21173184357542, + "grad_norm": 0.5538996458053589, + "learning_rate": 0.0003407282913165266, + "loss": 0.3533, + "step": 23649 + }, + { + "epoch": 13.212290502793296, + "grad_norm": 0.3994947075843811, + "learning_rate": 0.0003407002801120448, + "loss": 0.4113, + "step": 23650 + }, + { + "epoch": 13.212849162011173, + "grad_norm": 0.44770339131355286, + "learning_rate": 0.000340672268907563, + "loss": 0.3631, + "step": 23651 + }, + { + "epoch": 13.213407821229051, + "grad_norm": 0.4695807695388794, + "learning_rate": 0.00034064425770308123, + "loss": 0.4945, + "step": 23652 + }, + { + "epoch": 13.213966480446928, + "grad_norm": 0.8152663111686707, + "learning_rate": 0.00034061624649859944, + "loss": 0.3494, + "step": 23653 + }, + { + "epoch": 13.214525139664804, + "grad_norm": 0.7754992842674255, + "learning_rate": 0.00034058823529411764, + "loss": 0.4156, + "step": 23654 + }, + { + "epoch": 13.21508379888268, + "grad_norm": 1.0545365810394287, + "learning_rate": 0.0003405602240896359, + "loss": 0.4503, + "step": 23655 + }, + { + "epoch": 13.21564245810056, + "grad_norm": 0.3590008020401001, + "learning_rate": 0.00034053221288515406, + "loss": 0.3142, + "step": 23656 + }, + { + "epoch": 13.216201117318436, + "grad_norm": 0.6116838455200195, + "learning_rate": 0.00034050420168067226, + "loss": 0.5446, + "step": 23657 + }, + { + "epoch": 13.216759776536312, + "grad_norm": 0.5947260856628418, + "learning_rate": 0.00034047619047619047, + "loss": 0.3706, + "step": 23658 + }, + { + "epoch": 13.21731843575419, + "grad_norm": 0.43871936202049255, + "learning_rate": 0.0003404481792717087, + "loss": 0.3462, + "step": 23659 + }, + { + "epoch": 13.217877094972067, + "grad_norm": 0.47355180978775024, + "learning_rate": 0.00034042016806722693, + "loss": 0.4153, + "step": 23660 + }, + { + "epoch": 13.218435754189944, + "grad_norm": 0.5599240660667419, + "learning_rate": 0.0003403921568627451, + "loss": 0.4207, + "step": 23661 + }, + { + "epoch": 13.21899441340782, + "grad_norm": 2.1302199363708496, + "learning_rate": 0.0003403641456582633, + "loss": 0.4603, + "step": 23662 + }, + { + "epoch": 13.219553072625699, + "grad_norm": 1.1983551979064941, + "learning_rate": 0.00034033613445378155, + "loss": 0.4666, + "step": 23663 + }, + { + "epoch": 13.220111731843575, + "grad_norm": 2.491521120071411, + "learning_rate": 0.0003403081232492997, + "loss": 0.7406, + "step": 23664 + }, + { + "epoch": 13.220670391061452, + "grad_norm": 0.5156349539756775, + "learning_rate": 0.00034028011204481796, + "loss": 0.6736, + "step": 23665 + }, + { + "epoch": 13.22122905027933, + "grad_norm": 1.0162335634231567, + "learning_rate": 0.0003402521008403361, + "loss": 0.4549, + "step": 23666 + }, + { + "epoch": 13.221787709497207, + "grad_norm": 2.56878662109375, + "learning_rate": 0.0003402240896358543, + "loss": 0.3905, + "step": 23667 + }, + { + "epoch": 13.222346368715083, + "grad_norm": 0.45578983426094055, + "learning_rate": 0.0003401960784313726, + "loss": 0.4045, + "step": 23668 + }, + { + "epoch": 13.222905027932962, + "grad_norm": 0.4824478030204773, + "learning_rate": 0.00034016806722689073, + "loss": 0.3391, + "step": 23669 + }, + { + "epoch": 13.223463687150838, + "grad_norm": 2.444816827774048, + "learning_rate": 0.000340140056022409, + "loss": 0.3401, + "step": 23670 + }, + { + "epoch": 13.224022346368715, + "grad_norm": 0.438673198223114, + "learning_rate": 0.0003401120448179272, + "loss": 0.3308, + "step": 23671 + }, + { + "epoch": 13.224581005586591, + "grad_norm": 0.6557748317718506, + "learning_rate": 0.00034008403361344535, + "loss": 0.4582, + "step": 23672 + }, + { + "epoch": 13.22513966480447, + "grad_norm": 0.43666499853134155, + "learning_rate": 0.0003400560224089636, + "loss": 0.4359, + "step": 23673 + }, + { + "epoch": 13.225698324022346, + "grad_norm": 0.4532405138015747, + "learning_rate": 0.00034002801120448176, + "loss": 0.4513, + "step": 23674 + }, + { + "epoch": 13.226256983240223, + "grad_norm": 0.5603696703910828, + "learning_rate": 0.00034, + "loss": 0.5223, + "step": 23675 + }, + { + "epoch": 13.226815642458101, + "grad_norm": 0.3744250535964966, + "learning_rate": 0.00033997198879551823, + "loss": 0.3511, + "step": 23676 + }, + { + "epoch": 13.227374301675978, + "grad_norm": 0.4103121757507324, + "learning_rate": 0.0003399439775910364, + "loss": 0.458, + "step": 23677 + }, + { + "epoch": 13.227932960893854, + "grad_norm": 3.3088722229003906, + "learning_rate": 0.00033991596638655464, + "loss": 0.5882, + "step": 23678 + }, + { + "epoch": 13.228491620111733, + "grad_norm": 0.3987123668193817, + "learning_rate": 0.00033988795518207285, + "loss": 0.3491, + "step": 23679 + }, + { + "epoch": 13.22905027932961, + "grad_norm": 0.8478341698646545, + "learning_rate": 0.00033985994397759105, + "loss": 0.4476, + "step": 23680 + }, + { + "epoch": 13.229608938547486, + "grad_norm": 0.8431574106216431, + "learning_rate": 0.00033983193277310926, + "loss": 0.4342, + "step": 23681 + }, + { + "epoch": 13.230167597765362, + "grad_norm": 0.41380441188812256, + "learning_rate": 0.0003398039215686274, + "loss": 0.4658, + "step": 23682 + }, + { + "epoch": 13.23072625698324, + "grad_norm": 1.3141186237335205, + "learning_rate": 0.00033977591036414567, + "loss": 0.379, + "step": 23683 + }, + { + "epoch": 13.231284916201117, + "grad_norm": 4.500753402709961, + "learning_rate": 0.0003397478991596639, + "loss": 0.429, + "step": 23684 + }, + { + "epoch": 13.231843575418994, + "grad_norm": 0.41592085361480713, + "learning_rate": 0.0003397198879551821, + "loss": 0.3795, + "step": 23685 + }, + { + "epoch": 13.232402234636872, + "grad_norm": 0.5778118371963501, + "learning_rate": 0.0003396918767507003, + "loss": 0.5639, + "step": 23686 + }, + { + "epoch": 13.232960893854749, + "grad_norm": 0.7451493144035339, + "learning_rate": 0.0003396638655462185, + "loss": 0.3903, + "step": 23687 + }, + { + "epoch": 13.233519553072625, + "grad_norm": 0.8031677603721619, + "learning_rate": 0.0003396358543417367, + "loss": 0.499, + "step": 23688 + }, + { + "epoch": 13.234078212290504, + "grad_norm": 0.3959466218948364, + "learning_rate": 0.0003396078431372549, + "loss": 0.3404, + "step": 23689 + }, + { + "epoch": 13.23463687150838, + "grad_norm": 0.4078703224658966, + "learning_rate": 0.00033957983193277317, + "loss": 0.4294, + "step": 23690 + }, + { + "epoch": 13.235195530726257, + "grad_norm": 0.49590003490448, + "learning_rate": 0.0003395518207282913, + "loss": 0.3412, + "step": 23691 + }, + { + "epoch": 13.235754189944133, + "grad_norm": 0.36874130368232727, + "learning_rate": 0.0003395238095238095, + "loss": 0.3443, + "step": 23692 + }, + { + "epoch": 13.236312849162012, + "grad_norm": 0.5423593521118164, + "learning_rate": 0.00033949579831932773, + "loss": 0.4818, + "step": 23693 + }, + { + "epoch": 13.236871508379888, + "grad_norm": 0.4223531186580658, + "learning_rate": 0.00033946778711484594, + "loss": 0.325, + "step": 23694 + }, + { + "epoch": 13.237430167597765, + "grad_norm": 0.4977760910987854, + "learning_rate": 0.0003394397759103642, + "loss": 0.404, + "step": 23695 + }, + { + "epoch": 13.237988826815643, + "grad_norm": 2.3316402435302734, + "learning_rate": 0.00033941176470588235, + "loss": 0.457, + "step": 23696 + }, + { + "epoch": 13.23854748603352, + "grad_norm": 0.37392234802246094, + "learning_rate": 0.00033938375350140056, + "loss": 0.4031, + "step": 23697 + }, + { + "epoch": 13.239106145251396, + "grad_norm": 2.352795362472534, + "learning_rate": 0.0003393557422969188, + "loss": 0.5183, + "step": 23698 + }, + { + "epoch": 13.239664804469275, + "grad_norm": 0.5329104661941528, + "learning_rate": 0.00033932773109243697, + "loss": 0.6079, + "step": 23699 + }, + { + "epoch": 13.240223463687151, + "grad_norm": 0.3084893822669983, + "learning_rate": 0.00033929971988795523, + "loss": 0.3625, + "step": 23700 + }, + { + "epoch": 13.240782122905028, + "grad_norm": 0.7818188071250916, + "learning_rate": 0.0003392717086834734, + "loss": 0.396, + "step": 23701 + }, + { + "epoch": 13.241340782122904, + "grad_norm": 0.3785753548145294, + "learning_rate": 0.0003392436974789916, + "loss": 0.38, + "step": 23702 + }, + { + "epoch": 13.241899441340783, + "grad_norm": 0.4625031054019928, + "learning_rate": 0.00033921568627450985, + "loss": 0.5357, + "step": 23703 + }, + { + "epoch": 13.24245810055866, + "grad_norm": 0.39104342460632324, + "learning_rate": 0.000339187675070028, + "loss": 0.3973, + "step": 23704 + }, + { + "epoch": 13.243016759776536, + "grad_norm": 0.4392891228199005, + "learning_rate": 0.00033915966386554626, + "loss": 0.4813, + "step": 23705 + }, + { + "epoch": 13.243575418994414, + "grad_norm": 0.6085929870605469, + "learning_rate": 0.00033913165266106446, + "loss": 0.3963, + "step": 23706 + }, + { + "epoch": 13.24413407821229, + "grad_norm": 0.380363792181015, + "learning_rate": 0.0003391036414565826, + "loss": 0.3603, + "step": 23707 + }, + { + "epoch": 13.244692737430167, + "grad_norm": 0.5867030620574951, + "learning_rate": 0.0003390756302521009, + "loss": 0.4762, + "step": 23708 + }, + { + "epoch": 13.245251396648044, + "grad_norm": 0.6263246536254883, + "learning_rate": 0.00033904761904761903, + "loss": 0.4342, + "step": 23709 + }, + { + "epoch": 13.245810055865922, + "grad_norm": 0.4621928334236145, + "learning_rate": 0.00033901960784313723, + "loss": 0.4349, + "step": 23710 + }, + { + "epoch": 13.246368715083799, + "grad_norm": 0.523230791091919, + "learning_rate": 0.0003389915966386555, + "loss": 0.4075, + "step": 23711 + }, + { + "epoch": 13.246927374301675, + "grad_norm": 0.40113189816474915, + "learning_rate": 0.00033896358543417365, + "loss": 0.4657, + "step": 23712 + }, + { + "epoch": 13.247486033519554, + "grad_norm": 0.946519672870636, + "learning_rate": 0.0003389355742296919, + "loss": 0.3781, + "step": 23713 + }, + { + "epoch": 13.24804469273743, + "grad_norm": 0.39682650566101074, + "learning_rate": 0.0003389075630252101, + "loss": 0.3894, + "step": 23714 + }, + { + "epoch": 13.248603351955307, + "grad_norm": 0.4995368421077728, + "learning_rate": 0.00033887955182072826, + "loss": 0.3952, + "step": 23715 + }, + { + "epoch": 13.249162011173185, + "grad_norm": 0.47496095299720764, + "learning_rate": 0.0003388515406162465, + "loss": 0.4611, + "step": 23716 + }, + { + "epoch": 13.249720670391062, + "grad_norm": 0.32721957564353943, + "learning_rate": 0.0003388235294117647, + "loss": 0.3812, + "step": 23717 + }, + { + "epoch": 13.250279329608938, + "grad_norm": 0.5261251926422119, + "learning_rate": 0.00033879551820728294, + "loss": 0.4817, + "step": 23718 + }, + { + "epoch": 13.250837988826815, + "grad_norm": 0.5836412906646729, + "learning_rate": 0.00033876750700280114, + "loss": 0.6486, + "step": 23719 + }, + { + "epoch": 13.251396648044693, + "grad_norm": 0.6512834429740906, + "learning_rate": 0.0003387394957983193, + "loss": 0.5161, + "step": 23720 + }, + { + "epoch": 13.25195530726257, + "grad_norm": 0.4200536608695984, + "learning_rate": 0.00033871148459383755, + "loss": 0.4173, + "step": 23721 + }, + { + "epoch": 13.252513966480446, + "grad_norm": 0.41412511467933655, + "learning_rate": 0.00033868347338935576, + "loss": 0.3841, + "step": 23722 + }, + { + "epoch": 13.253072625698325, + "grad_norm": 0.9450453519821167, + "learning_rate": 0.00033865546218487397, + "loss": 0.3596, + "step": 23723 + }, + { + "epoch": 13.253631284916201, + "grad_norm": 0.6311314105987549, + "learning_rate": 0.00033862745098039217, + "loss": 0.4675, + "step": 23724 + }, + { + "epoch": 13.254189944134078, + "grad_norm": 1.0011779069900513, + "learning_rate": 0.0003385994397759103, + "loss": 0.4963, + "step": 23725 + }, + { + "epoch": 13.254748603351956, + "grad_norm": 0.7265097498893738, + "learning_rate": 0.0003385714285714286, + "loss": 0.3648, + "step": 23726 + }, + { + "epoch": 13.255307262569833, + "grad_norm": 0.5350236892700195, + "learning_rate": 0.0003385434173669468, + "loss": 0.5527, + "step": 23727 + }, + { + "epoch": 13.25586592178771, + "grad_norm": 0.3682211935520172, + "learning_rate": 0.000338515406162465, + "loss": 0.4746, + "step": 23728 + }, + { + "epoch": 13.256424581005586, + "grad_norm": 0.3880104720592499, + "learning_rate": 0.0003384873949579832, + "loss": 0.546, + "step": 23729 + }, + { + "epoch": 13.256983240223464, + "grad_norm": 0.5494697093963623, + "learning_rate": 0.0003384593837535014, + "loss": 0.5341, + "step": 23730 + }, + { + "epoch": 13.25754189944134, + "grad_norm": 0.5074079036712646, + "learning_rate": 0.0003384313725490196, + "loss": 0.4142, + "step": 23731 + }, + { + "epoch": 13.258100558659217, + "grad_norm": 0.582423210144043, + "learning_rate": 0.0003384033613445378, + "loss": 0.5867, + "step": 23732 + }, + { + "epoch": 13.258659217877096, + "grad_norm": 0.9407914876937866, + "learning_rate": 0.000338375350140056, + "loss": 0.3889, + "step": 23733 + }, + { + "epoch": 13.259217877094972, + "grad_norm": 0.46638309955596924, + "learning_rate": 0.00033834733893557423, + "loss": 0.3626, + "step": 23734 + }, + { + "epoch": 13.259776536312849, + "grad_norm": 0.47919225692749023, + "learning_rate": 0.00033831932773109244, + "loss": 0.6177, + "step": 23735 + }, + { + "epoch": 13.260335195530725, + "grad_norm": 0.4282124638557434, + "learning_rate": 0.00033829131652661064, + "loss": 0.4576, + "step": 23736 + }, + { + "epoch": 13.260893854748604, + "grad_norm": 0.6756613254547119, + "learning_rate": 0.00033826330532212885, + "loss": 0.4171, + "step": 23737 + }, + { + "epoch": 13.26145251396648, + "grad_norm": 0.49720460176467896, + "learning_rate": 0.0003382352941176471, + "loss": 0.3788, + "step": 23738 + }, + { + "epoch": 13.262011173184357, + "grad_norm": 0.963648796081543, + "learning_rate": 0.00033820728291316526, + "loss": 0.3591, + "step": 23739 + }, + { + "epoch": 13.262569832402235, + "grad_norm": 0.5625825524330139, + "learning_rate": 0.00033817927170868347, + "loss": 0.4253, + "step": 23740 + }, + { + "epoch": 13.263128491620112, + "grad_norm": 0.6193555593490601, + "learning_rate": 0.0003381512605042017, + "loss": 0.4243, + "step": 23741 + }, + { + "epoch": 13.263687150837988, + "grad_norm": 0.48198628425598145, + "learning_rate": 0.0003381232492997199, + "loss": 0.3295, + "step": 23742 + }, + { + "epoch": 13.264245810055867, + "grad_norm": 0.3768001198768616, + "learning_rate": 0.00033809523809523814, + "loss": 0.3951, + "step": 23743 + }, + { + "epoch": 13.264804469273743, + "grad_norm": 7.285225868225098, + "learning_rate": 0.0003380672268907563, + "loss": 0.488, + "step": 23744 + }, + { + "epoch": 13.26536312849162, + "grad_norm": 0.5495749711990356, + "learning_rate": 0.0003380392156862745, + "loss": 0.4656, + "step": 23745 + }, + { + "epoch": 13.265921787709496, + "grad_norm": 0.3612942099571228, + "learning_rate": 0.00033801120448179276, + "loss": 0.4098, + "step": 23746 + }, + { + "epoch": 13.266480446927375, + "grad_norm": 0.411364883184433, + "learning_rate": 0.0003379831932773109, + "loss": 0.4022, + "step": 23747 + }, + { + "epoch": 13.267039106145251, + "grad_norm": 0.565970242023468, + "learning_rate": 0.00033795518207282917, + "loss": 0.5006, + "step": 23748 + }, + { + "epoch": 13.267597765363128, + "grad_norm": 0.4536724090576172, + "learning_rate": 0.0003379271708683473, + "loss": 0.5085, + "step": 23749 + }, + { + "epoch": 13.268156424581006, + "grad_norm": 0.29209285974502563, + "learning_rate": 0.00033789915966386553, + "loss": 0.3827, + "step": 23750 + }, + { + "epoch": 13.268715083798883, + "grad_norm": 0.5238004326820374, + "learning_rate": 0.0003378711484593838, + "loss": 0.5162, + "step": 23751 + }, + { + "epoch": 13.26927374301676, + "grad_norm": 0.5797522068023682, + "learning_rate": 0.00033784313725490194, + "loss": 0.3792, + "step": 23752 + }, + { + "epoch": 13.269832402234638, + "grad_norm": 3.573162317276001, + "learning_rate": 0.0003378151260504202, + "loss": 0.4015, + "step": 23753 + }, + { + "epoch": 13.270391061452514, + "grad_norm": 1.1292129755020142, + "learning_rate": 0.0003377871148459384, + "loss": 0.3854, + "step": 23754 + }, + { + "epoch": 13.27094972067039, + "grad_norm": 6.190539836883545, + "learning_rate": 0.00033775910364145656, + "loss": 0.5266, + "step": 23755 + }, + { + "epoch": 13.271508379888267, + "grad_norm": 0.41582804918289185, + "learning_rate": 0.0003377310924369748, + "loss": 0.2796, + "step": 23756 + }, + { + "epoch": 13.272067039106146, + "grad_norm": 2.211156129837036, + "learning_rate": 0.00033770308123249297, + "loss": 0.5429, + "step": 23757 + }, + { + "epoch": 13.272625698324022, + "grad_norm": 0.7692956924438477, + "learning_rate": 0.00033767507002801123, + "loss": 0.3791, + "step": 23758 + }, + { + "epoch": 13.273184357541899, + "grad_norm": 0.4514099955558777, + "learning_rate": 0.00033764705882352944, + "loss": 0.4525, + "step": 23759 + }, + { + "epoch": 13.273743016759777, + "grad_norm": 0.3639926612377167, + "learning_rate": 0.0003376190476190476, + "loss": 0.4728, + "step": 23760 + }, + { + "epoch": 13.274301675977654, + "grad_norm": 0.5912122130393982, + "learning_rate": 0.00033759103641456585, + "loss": 0.4062, + "step": 23761 + }, + { + "epoch": 13.27486033519553, + "grad_norm": 0.9222456216812134, + "learning_rate": 0.00033756302521008405, + "loss": 0.4132, + "step": 23762 + }, + { + "epoch": 13.275418994413409, + "grad_norm": 0.2927217185497284, + "learning_rate": 0.00033753501400560226, + "loss": 0.3699, + "step": 23763 + }, + { + "epoch": 13.275977653631285, + "grad_norm": 0.7229845523834229, + "learning_rate": 0.00033750700280112047, + "loss": 0.5858, + "step": 23764 + }, + { + "epoch": 13.276536312849162, + "grad_norm": 0.8083800673484802, + "learning_rate": 0.0003374789915966386, + "loss": 0.436, + "step": 23765 + }, + { + "epoch": 13.277094972067038, + "grad_norm": 0.42317280173301697, + "learning_rate": 0.0003374509803921569, + "loss": 0.4916, + "step": 23766 + }, + { + "epoch": 13.277653631284917, + "grad_norm": 0.5788655281066895, + "learning_rate": 0.0003374229691876751, + "loss": 0.4597, + "step": 23767 + }, + { + "epoch": 13.278212290502793, + "grad_norm": 0.4634612500667572, + "learning_rate": 0.0003373949579831933, + "loss": 0.3903, + "step": 23768 + }, + { + "epoch": 13.27877094972067, + "grad_norm": 0.3420405089855194, + "learning_rate": 0.0003373669467787115, + "loss": 0.3958, + "step": 23769 + }, + { + "epoch": 13.279329608938548, + "grad_norm": 0.5979591608047485, + "learning_rate": 0.0003373389355742297, + "loss": 0.4618, + "step": 23770 + }, + { + "epoch": 13.279888268156425, + "grad_norm": 0.5105229020118713, + "learning_rate": 0.0003373109243697479, + "loss": 0.4256, + "step": 23771 + }, + { + "epoch": 13.280446927374301, + "grad_norm": 0.45555105805397034, + "learning_rate": 0.0003372829131652661, + "loss": 0.3752, + "step": 23772 + }, + { + "epoch": 13.28100558659218, + "grad_norm": 0.5491973161697388, + "learning_rate": 0.0003372549019607843, + "loss": 0.5126, + "step": 23773 + }, + { + "epoch": 13.281564245810056, + "grad_norm": 0.4162377715110779, + "learning_rate": 0.0003372268907563025, + "loss": 0.3753, + "step": 23774 + }, + { + "epoch": 13.282122905027933, + "grad_norm": 0.5493960976600647, + "learning_rate": 0.00033719887955182073, + "loss": 0.4632, + "step": 23775 + }, + { + "epoch": 13.28268156424581, + "grad_norm": 0.4823024868965149, + "learning_rate": 0.00033717086834733894, + "loss": 0.4191, + "step": 23776 + }, + { + "epoch": 13.283240223463688, + "grad_norm": 0.608932614326477, + "learning_rate": 0.00033714285714285714, + "loss": 0.5072, + "step": 23777 + }, + { + "epoch": 13.283798882681564, + "grad_norm": 0.3941212296485901, + "learning_rate": 0.0003371148459383754, + "loss": 0.4176, + "step": 23778 + }, + { + "epoch": 13.28435754189944, + "grad_norm": 0.7590417861938477, + "learning_rate": 0.00033708683473389356, + "loss": 0.5109, + "step": 23779 + }, + { + "epoch": 13.28491620111732, + "grad_norm": 0.48589059710502625, + "learning_rate": 0.00033705882352941176, + "loss": 0.5136, + "step": 23780 + }, + { + "epoch": 13.285474860335196, + "grad_norm": 0.6259783506393433, + "learning_rate": 0.00033703081232492997, + "loss": 0.4997, + "step": 23781 + }, + { + "epoch": 13.286033519553072, + "grad_norm": 0.3962301015853882, + "learning_rate": 0.0003370028011204482, + "loss": 0.3443, + "step": 23782 + }, + { + "epoch": 13.286592178770949, + "grad_norm": 1.0924508571624756, + "learning_rate": 0.00033697478991596643, + "loss": 0.3593, + "step": 23783 + }, + { + "epoch": 13.287150837988827, + "grad_norm": 0.46605604887008667, + "learning_rate": 0.0003369467787114846, + "loss": 0.4109, + "step": 23784 + }, + { + "epoch": 13.287709497206704, + "grad_norm": 0.6827029585838318, + "learning_rate": 0.0003369187675070028, + "loss": 0.3317, + "step": 23785 + }, + { + "epoch": 13.28826815642458, + "grad_norm": 0.5276693105697632, + "learning_rate": 0.00033689075630252105, + "loss": 0.3585, + "step": 23786 + }, + { + "epoch": 13.288826815642459, + "grad_norm": 1.05462646484375, + "learning_rate": 0.0003368627450980392, + "loss": 0.3805, + "step": 23787 + }, + { + "epoch": 13.289385474860335, + "grad_norm": 0.34285661578178406, + "learning_rate": 0.00033683473389355746, + "loss": 0.3501, + "step": 23788 + }, + { + "epoch": 13.289944134078212, + "grad_norm": 0.5167961716651917, + "learning_rate": 0.0003368067226890756, + "loss": 0.5222, + "step": 23789 + }, + { + "epoch": 13.29050279329609, + "grad_norm": 0.611379861831665, + "learning_rate": 0.0003367787114845938, + "loss": 0.371, + "step": 23790 + }, + { + "epoch": 13.291061452513967, + "grad_norm": 1.8814785480499268, + "learning_rate": 0.0003367507002801121, + "loss": 0.4354, + "step": 23791 + }, + { + "epoch": 13.291620111731843, + "grad_norm": 0.5234327912330627, + "learning_rate": 0.00033672268907563023, + "loss": 0.3668, + "step": 23792 + }, + { + "epoch": 13.29217877094972, + "grad_norm": 0.5477795004844666, + "learning_rate": 0.0003366946778711485, + "loss": 0.627, + "step": 23793 + }, + { + "epoch": 13.292737430167598, + "grad_norm": 1.0924941301345825, + "learning_rate": 0.0003366666666666667, + "loss": 0.3692, + "step": 23794 + }, + { + "epoch": 13.293296089385475, + "grad_norm": 0.8735679984092712, + "learning_rate": 0.00033663865546218485, + "loss": 0.417, + "step": 23795 + }, + { + "epoch": 13.293854748603351, + "grad_norm": 0.5475407838821411, + "learning_rate": 0.0003366106442577031, + "loss": 0.463, + "step": 23796 + }, + { + "epoch": 13.29441340782123, + "grad_norm": 0.3959656357765198, + "learning_rate": 0.00033658263305322126, + "loss": 0.3657, + "step": 23797 + }, + { + "epoch": 13.294972067039106, + "grad_norm": 0.6313350200653076, + "learning_rate": 0.0003365546218487395, + "loss": 0.437, + "step": 23798 + }, + { + "epoch": 13.295530726256983, + "grad_norm": 2.0455846786499023, + "learning_rate": 0.00033652661064425773, + "loss": 0.4531, + "step": 23799 + }, + { + "epoch": 13.296089385474861, + "grad_norm": 1.8393518924713135, + "learning_rate": 0.0003364985994397759, + "loss": 0.3803, + "step": 23800 + }, + { + "epoch": 13.296648044692738, + "grad_norm": 0.45601314306259155, + "learning_rate": 0.00033647058823529414, + "loss": 0.3257, + "step": 23801 + }, + { + "epoch": 13.297206703910614, + "grad_norm": 0.5860977172851562, + "learning_rate": 0.00033644257703081235, + "loss": 0.4253, + "step": 23802 + }, + { + "epoch": 13.297765363128491, + "grad_norm": 0.5303998589515686, + "learning_rate": 0.00033641456582633055, + "loss": 0.3477, + "step": 23803 + }, + { + "epoch": 13.29832402234637, + "grad_norm": 0.4638810455799103, + "learning_rate": 0.00033638655462184876, + "loss": 0.3804, + "step": 23804 + }, + { + "epoch": 13.298882681564246, + "grad_norm": 0.6082403063774109, + "learning_rate": 0.0003363585434173669, + "loss": 0.5631, + "step": 23805 + }, + { + "epoch": 13.299441340782122, + "grad_norm": 0.41769352555274963, + "learning_rate": 0.00033633053221288517, + "loss": 0.3649, + "step": 23806 + }, + { + "epoch": 13.3, + "grad_norm": 0.4883352518081665, + "learning_rate": 0.0003363025210084034, + "loss": 0.3777, + "step": 23807 + }, + { + "epoch": 13.300558659217877, + "grad_norm": 0.37725093960762024, + "learning_rate": 0.0003362745098039216, + "loss": 0.424, + "step": 23808 + }, + { + "epoch": 13.301117318435754, + "grad_norm": 0.41677770018577576, + "learning_rate": 0.0003362464985994398, + "loss": 0.3543, + "step": 23809 + }, + { + "epoch": 13.30167597765363, + "grad_norm": 0.37945252656936646, + "learning_rate": 0.000336218487394958, + "loss": 0.3537, + "step": 23810 + }, + { + "epoch": 13.302234636871509, + "grad_norm": 0.5119186639785767, + "learning_rate": 0.0003361904761904762, + "loss": 0.4028, + "step": 23811 + }, + { + "epoch": 13.302793296089385, + "grad_norm": 0.49954649806022644, + "learning_rate": 0.0003361624649859944, + "loss": 0.4695, + "step": 23812 + }, + { + "epoch": 13.303351955307262, + "grad_norm": 1.107476830482483, + "learning_rate": 0.0003361344537815126, + "loss": 0.6901, + "step": 23813 + }, + { + "epoch": 13.30391061452514, + "grad_norm": 1.364524006843567, + "learning_rate": 0.0003361064425770308, + "loss": 0.4671, + "step": 23814 + }, + { + "epoch": 13.304469273743017, + "grad_norm": 0.6046756505966187, + "learning_rate": 0.000336078431372549, + "loss": 0.5035, + "step": 23815 + }, + { + "epoch": 13.305027932960893, + "grad_norm": 1.9447007179260254, + "learning_rate": 0.00033605042016806723, + "loss": 0.3759, + "step": 23816 + }, + { + "epoch": 13.305586592178772, + "grad_norm": 3.609431266784668, + "learning_rate": 0.00033602240896358544, + "loss": 0.4887, + "step": 23817 + }, + { + "epoch": 13.306145251396648, + "grad_norm": 0.5841872096061707, + "learning_rate": 0.0003359943977591037, + "loss": 0.4432, + "step": 23818 + }, + { + "epoch": 13.306703910614525, + "grad_norm": 0.4049026370048523, + "learning_rate": 0.00033596638655462185, + "loss": 0.4052, + "step": 23819 + }, + { + "epoch": 13.307262569832401, + "grad_norm": 0.37242940068244934, + "learning_rate": 0.00033593837535014006, + "loss": 0.3739, + "step": 23820 + }, + { + "epoch": 13.30782122905028, + "grad_norm": 0.434740275144577, + "learning_rate": 0.00033591036414565826, + "loss": 0.4814, + "step": 23821 + }, + { + "epoch": 13.308379888268156, + "grad_norm": 0.5802294015884399, + "learning_rate": 0.00033588235294117647, + "loss": 0.5231, + "step": 23822 + }, + { + "epoch": 13.308938547486033, + "grad_norm": 0.5402165651321411, + "learning_rate": 0.0003358543417366947, + "loss": 0.5756, + "step": 23823 + }, + { + "epoch": 13.309497206703911, + "grad_norm": 0.46558958292007446, + "learning_rate": 0.0003358263305322129, + "loss": 0.4299, + "step": 23824 + }, + { + "epoch": 13.310055865921788, + "grad_norm": 0.3648386001586914, + "learning_rate": 0.0003357983193277311, + "loss": 0.3439, + "step": 23825 + }, + { + "epoch": 13.310614525139664, + "grad_norm": 0.47914642095565796, + "learning_rate": 0.00033577030812324935, + "loss": 0.3986, + "step": 23826 + }, + { + "epoch": 13.311173184357543, + "grad_norm": 0.4992138743400574, + "learning_rate": 0.0003357422969187675, + "loss": 0.4313, + "step": 23827 + }, + { + "epoch": 13.31173184357542, + "grad_norm": 0.3443225622177124, + "learning_rate": 0.0003357142857142857, + "loss": 0.3626, + "step": 23828 + }, + { + "epoch": 13.312290502793296, + "grad_norm": 0.5787002444267273, + "learning_rate": 0.0003356862745098039, + "loss": 0.3574, + "step": 23829 + }, + { + "epoch": 13.312849162011172, + "grad_norm": 0.43956124782562256, + "learning_rate": 0.0003356582633053221, + "loss": 0.5116, + "step": 23830 + }, + { + "epoch": 13.31340782122905, + "grad_norm": 0.7588776350021362, + "learning_rate": 0.0003356302521008404, + "loss": 0.3789, + "step": 23831 + }, + { + "epoch": 13.313966480446927, + "grad_norm": 0.4662046432495117, + "learning_rate": 0.00033560224089635853, + "loss": 0.4886, + "step": 23832 + }, + { + "epoch": 13.314525139664804, + "grad_norm": 0.47922998666763306, + "learning_rate": 0.00033557422969187673, + "loss": 0.4323, + "step": 23833 + }, + { + "epoch": 13.315083798882682, + "grad_norm": 0.9394093751907349, + "learning_rate": 0.000335546218487395, + "loss": 0.3502, + "step": 23834 + }, + { + "epoch": 13.315642458100559, + "grad_norm": 0.35608112812042236, + "learning_rate": 0.00033551820728291315, + "loss": 0.4018, + "step": 23835 + }, + { + "epoch": 13.316201117318435, + "grad_norm": 0.3313712775707245, + "learning_rate": 0.0003354901960784314, + "loss": 0.3623, + "step": 23836 + }, + { + "epoch": 13.316759776536314, + "grad_norm": 0.44091737270355225, + "learning_rate": 0.00033546218487394956, + "loss": 0.3845, + "step": 23837 + }, + { + "epoch": 13.31731843575419, + "grad_norm": 1.1808913946151733, + "learning_rate": 0.00033543417366946776, + "loss": 0.4446, + "step": 23838 + }, + { + "epoch": 13.317877094972067, + "grad_norm": 0.6786538362503052, + "learning_rate": 0.000335406162464986, + "loss": 0.6643, + "step": 23839 + }, + { + "epoch": 13.318435754189943, + "grad_norm": 1.007463812828064, + "learning_rate": 0.0003353781512605042, + "loss": 0.4263, + "step": 23840 + }, + { + "epoch": 13.318994413407822, + "grad_norm": 0.4319334030151367, + "learning_rate": 0.00033535014005602244, + "loss": 0.3863, + "step": 23841 + }, + { + "epoch": 13.319553072625698, + "grad_norm": 0.8270582556724548, + "learning_rate": 0.00033532212885154064, + "loss": 0.5159, + "step": 23842 + }, + { + "epoch": 13.320111731843575, + "grad_norm": 1.0376777648925781, + "learning_rate": 0.0003352941176470588, + "loss": 0.4225, + "step": 23843 + }, + { + "epoch": 13.320670391061453, + "grad_norm": 0.47982004284858704, + "learning_rate": 0.00033526610644257705, + "loss": 0.3449, + "step": 23844 + }, + { + "epoch": 13.32122905027933, + "grad_norm": 0.40144264698028564, + "learning_rate": 0.0003352380952380952, + "loss": 0.3341, + "step": 23845 + }, + { + "epoch": 13.321787709497206, + "grad_norm": 0.6232517957687378, + "learning_rate": 0.00033521008403361347, + "loss": 0.3658, + "step": 23846 + }, + { + "epoch": 13.322346368715085, + "grad_norm": 0.5487639307975769, + "learning_rate": 0.00033518207282913167, + "loss": 0.4097, + "step": 23847 + }, + { + "epoch": 13.322905027932961, + "grad_norm": 0.31972163915634155, + "learning_rate": 0.0003351540616246498, + "loss": 0.3973, + "step": 23848 + }, + { + "epoch": 13.323463687150838, + "grad_norm": 0.45323118567466736, + "learning_rate": 0.0003351260504201681, + "loss": 0.5238, + "step": 23849 + }, + { + "epoch": 13.324022346368714, + "grad_norm": 0.5316075682640076, + "learning_rate": 0.0003350980392156863, + "loss": 0.4174, + "step": 23850 + }, + { + "epoch": 13.324581005586593, + "grad_norm": 0.5013684630393982, + "learning_rate": 0.0003350700280112045, + "loss": 0.4098, + "step": 23851 + }, + { + "epoch": 13.32513966480447, + "grad_norm": 0.5039564967155457, + "learning_rate": 0.0003350420168067227, + "loss": 0.3542, + "step": 23852 + }, + { + "epoch": 13.325698324022346, + "grad_norm": 0.5305153727531433, + "learning_rate": 0.00033501400560224085, + "loss": 0.4429, + "step": 23853 + }, + { + "epoch": 13.326256983240224, + "grad_norm": 0.7472844123840332, + "learning_rate": 0.0003349859943977591, + "loss": 0.3688, + "step": 23854 + }, + { + "epoch": 13.3268156424581, + "grad_norm": 1.5326067209243774, + "learning_rate": 0.0003349579831932773, + "loss": 0.4184, + "step": 23855 + }, + { + "epoch": 13.327374301675977, + "grad_norm": 0.3773473799228668, + "learning_rate": 0.0003349299719887955, + "loss": 0.3738, + "step": 23856 + }, + { + "epoch": 13.327932960893854, + "grad_norm": 0.39498671889305115, + "learning_rate": 0.00033490196078431373, + "loss": 0.4291, + "step": 23857 + }, + { + "epoch": 13.328491620111732, + "grad_norm": 0.4979417324066162, + "learning_rate": 0.00033487394957983194, + "loss": 0.4405, + "step": 23858 + }, + { + "epoch": 13.329050279329609, + "grad_norm": 0.5262969136238098, + "learning_rate": 0.00033484593837535014, + "loss": 0.5357, + "step": 23859 + }, + { + "epoch": 13.329608938547485, + "grad_norm": 0.3744189739227295, + "learning_rate": 0.00033481792717086835, + "loss": 0.3762, + "step": 23860 + }, + { + "epoch": 13.330167597765364, + "grad_norm": 0.3269403874874115, + "learning_rate": 0.00033478991596638656, + "loss": 0.3229, + "step": 23861 + }, + { + "epoch": 13.33072625698324, + "grad_norm": 0.6849920153617859, + "learning_rate": 0.00033476190476190476, + "loss": 0.5005, + "step": 23862 + }, + { + "epoch": 13.331284916201117, + "grad_norm": 1.2992875576019287, + "learning_rate": 0.00033473389355742297, + "loss": 0.4249, + "step": 23863 + }, + { + "epoch": 13.331843575418995, + "grad_norm": 0.7522691488265991, + "learning_rate": 0.0003347058823529412, + "loss": 0.485, + "step": 23864 + }, + { + "epoch": 13.332402234636872, + "grad_norm": 0.5542646050453186, + "learning_rate": 0.0003346778711484594, + "loss": 0.5863, + "step": 23865 + }, + { + "epoch": 13.332960893854748, + "grad_norm": 0.4124361574649811, + "learning_rate": 0.00033464985994397764, + "loss": 0.4196, + "step": 23866 + }, + { + "epoch": 13.333519553072625, + "grad_norm": 7.529343605041504, + "learning_rate": 0.0003346218487394958, + "loss": 0.3452, + "step": 23867 + }, + { + "epoch": 13.334078212290503, + "grad_norm": 0.6973263025283813, + "learning_rate": 0.000334593837535014, + "loss": 0.3848, + "step": 23868 + }, + { + "epoch": 13.33463687150838, + "grad_norm": 0.5479353070259094, + "learning_rate": 0.0003345658263305322, + "loss": 0.3435, + "step": 23869 + }, + { + "epoch": 13.335195530726256, + "grad_norm": 0.4201735556125641, + "learning_rate": 0.0003345378151260504, + "loss": 0.403, + "step": 23870 + }, + { + "epoch": 13.335754189944135, + "grad_norm": 0.3778197169303894, + "learning_rate": 0.00033450980392156867, + "loss": 0.4235, + "step": 23871 + }, + { + "epoch": 13.336312849162011, + "grad_norm": 0.43880122900009155, + "learning_rate": 0.0003344817927170868, + "loss": 0.3849, + "step": 23872 + }, + { + "epoch": 13.336871508379888, + "grad_norm": 0.38169172406196594, + "learning_rate": 0.00033445378151260503, + "loss": 0.4012, + "step": 23873 + }, + { + "epoch": 13.337430167597766, + "grad_norm": 0.46411970257759094, + "learning_rate": 0.0003344257703081233, + "loss": 0.4198, + "step": 23874 + }, + { + "epoch": 13.337988826815643, + "grad_norm": 0.6051220893859863, + "learning_rate": 0.00033439775910364144, + "loss": 0.484, + "step": 23875 + }, + { + "epoch": 13.33854748603352, + "grad_norm": 0.5200245976448059, + "learning_rate": 0.0003343697478991597, + "loss": 0.3722, + "step": 23876 + }, + { + "epoch": 13.339106145251396, + "grad_norm": 0.4520384669303894, + "learning_rate": 0.00033434173669467785, + "loss": 0.3846, + "step": 23877 + }, + { + "epoch": 13.339664804469274, + "grad_norm": 0.3697698414325714, + "learning_rate": 0.00033431372549019606, + "loss": 0.4308, + "step": 23878 + }, + { + "epoch": 13.34022346368715, + "grad_norm": 0.8896191716194153, + "learning_rate": 0.0003342857142857143, + "loss": 0.391, + "step": 23879 + }, + { + "epoch": 13.340782122905027, + "grad_norm": 0.4311777949333191, + "learning_rate": 0.00033425770308123247, + "loss": 0.3403, + "step": 23880 + }, + { + "epoch": 13.341340782122906, + "grad_norm": 1.1365503072738647, + "learning_rate": 0.00033422969187675073, + "loss": 0.5369, + "step": 23881 + }, + { + "epoch": 13.341899441340782, + "grad_norm": 1.3326804637908936, + "learning_rate": 0.00033420168067226894, + "loss": 0.3207, + "step": 23882 + }, + { + "epoch": 13.342458100558659, + "grad_norm": 0.3907860517501831, + "learning_rate": 0.0003341736694677871, + "loss": 0.4018, + "step": 23883 + }, + { + "epoch": 13.343016759776535, + "grad_norm": 0.40447160601615906, + "learning_rate": 0.00033414565826330535, + "loss": 0.4004, + "step": 23884 + }, + { + "epoch": 13.343575418994414, + "grad_norm": 1.5601035356521606, + "learning_rate": 0.0003341176470588235, + "loss": 0.4241, + "step": 23885 + }, + { + "epoch": 13.34413407821229, + "grad_norm": 0.5868250727653503, + "learning_rate": 0.00033408963585434176, + "loss": 0.379, + "step": 23886 + }, + { + "epoch": 13.344692737430167, + "grad_norm": 0.4469394385814667, + "learning_rate": 0.00033406162464985997, + "loss": 0.2715, + "step": 23887 + }, + { + "epoch": 13.345251396648045, + "grad_norm": 0.4678853452205658, + "learning_rate": 0.0003340336134453781, + "loss": 0.3468, + "step": 23888 + }, + { + "epoch": 13.345810055865922, + "grad_norm": 0.7323243618011475, + "learning_rate": 0.0003340056022408964, + "loss": 0.3656, + "step": 23889 + }, + { + "epoch": 13.346368715083798, + "grad_norm": 0.6104510426521301, + "learning_rate": 0.0003339775910364146, + "loss": 0.4768, + "step": 23890 + }, + { + "epoch": 13.346927374301677, + "grad_norm": 2.0912399291992188, + "learning_rate": 0.0003339495798319328, + "loss": 0.3526, + "step": 23891 + }, + { + "epoch": 13.347486033519553, + "grad_norm": 0.4993850886821747, + "learning_rate": 0.000333921568627451, + "loss": 0.4722, + "step": 23892 + }, + { + "epoch": 13.34804469273743, + "grad_norm": 0.3902897834777832, + "learning_rate": 0.00033389355742296915, + "loss": 0.4615, + "step": 23893 + }, + { + "epoch": 13.348603351955306, + "grad_norm": 0.40381699800491333, + "learning_rate": 0.0003338655462184874, + "loss": 0.4241, + "step": 23894 + }, + { + "epoch": 13.349162011173185, + "grad_norm": 0.5576342940330505, + "learning_rate": 0.0003338375350140056, + "loss": 0.3997, + "step": 23895 + }, + { + "epoch": 13.349720670391061, + "grad_norm": 0.4372580945491791, + "learning_rate": 0.0003338095238095238, + "loss": 0.5035, + "step": 23896 + }, + { + "epoch": 13.350279329608938, + "grad_norm": 0.42239633202552795, + "learning_rate": 0.000333781512605042, + "loss": 0.3994, + "step": 23897 + }, + { + "epoch": 13.350837988826816, + "grad_norm": 1.4621309041976929, + "learning_rate": 0.00033375350140056023, + "loss": 0.4073, + "step": 23898 + }, + { + "epoch": 13.351396648044693, + "grad_norm": 0.41640138626098633, + "learning_rate": 0.00033372549019607844, + "loss": 0.4049, + "step": 23899 + }, + { + "epoch": 13.35195530726257, + "grad_norm": 0.7657338380813599, + "learning_rate": 0.00033369747899159664, + "loss": 0.39, + "step": 23900 + }, + { + "epoch": 13.352513966480448, + "grad_norm": 0.7223778367042542, + "learning_rate": 0.0003336694677871149, + "loss": 0.5628, + "step": 23901 + }, + { + "epoch": 13.353072625698324, + "grad_norm": 0.48033207654953003, + "learning_rate": 0.00033364145658263306, + "loss": 0.4704, + "step": 23902 + }, + { + "epoch": 13.3536312849162, + "grad_norm": 0.4417782723903656, + "learning_rate": 0.00033361344537815126, + "loss": 0.3768, + "step": 23903 + }, + { + "epoch": 13.354189944134077, + "grad_norm": 0.5472639203071594, + "learning_rate": 0.00033358543417366947, + "loss": 0.3224, + "step": 23904 + }, + { + "epoch": 13.354748603351956, + "grad_norm": 0.4203943610191345, + "learning_rate": 0.0003335574229691877, + "loss": 0.4523, + "step": 23905 + }, + { + "epoch": 13.355307262569832, + "grad_norm": 0.40968528389930725, + "learning_rate": 0.00033352941176470593, + "loss": 0.348, + "step": 23906 + }, + { + "epoch": 13.355865921787709, + "grad_norm": 2.3902251720428467, + "learning_rate": 0.0003335014005602241, + "loss": 0.3167, + "step": 23907 + }, + { + "epoch": 13.356424581005587, + "grad_norm": 5.612401485443115, + "learning_rate": 0.0003334733893557423, + "loss": 0.4434, + "step": 23908 + }, + { + "epoch": 13.356983240223464, + "grad_norm": 0.6918250918388367, + "learning_rate": 0.00033344537815126055, + "loss": 0.477, + "step": 23909 + }, + { + "epoch": 13.35754189944134, + "grad_norm": 0.42146405577659607, + "learning_rate": 0.0003334173669467787, + "loss": 0.4734, + "step": 23910 + }, + { + "epoch": 13.358100558659217, + "grad_norm": 0.6265257000923157, + "learning_rate": 0.00033338935574229696, + "loss": 0.3636, + "step": 23911 + }, + { + "epoch": 13.358659217877095, + "grad_norm": 0.5151063799858093, + "learning_rate": 0.0003333613445378151, + "loss": 0.5935, + "step": 23912 + }, + { + "epoch": 13.359217877094972, + "grad_norm": 1.145356297492981, + "learning_rate": 0.0003333333333333333, + "loss": 0.4679, + "step": 23913 + }, + { + "epoch": 13.359776536312848, + "grad_norm": 0.34669482707977295, + "learning_rate": 0.0003333053221288516, + "loss": 0.363, + "step": 23914 + }, + { + "epoch": 13.360335195530727, + "grad_norm": 0.4645282030105591, + "learning_rate": 0.00033327731092436973, + "loss": 0.4035, + "step": 23915 + }, + { + "epoch": 13.360893854748603, + "grad_norm": 0.4767138361930847, + "learning_rate": 0.000333249299719888, + "loss": 0.4715, + "step": 23916 + }, + { + "epoch": 13.36145251396648, + "grad_norm": 0.5608267784118652, + "learning_rate": 0.0003332212885154062, + "loss": 0.3601, + "step": 23917 + }, + { + "epoch": 13.362011173184358, + "grad_norm": 0.5498733520507812, + "learning_rate": 0.00033319327731092435, + "loss": 0.6437, + "step": 23918 + }, + { + "epoch": 13.362569832402235, + "grad_norm": 0.5856558084487915, + "learning_rate": 0.0003331652661064426, + "loss": 0.5159, + "step": 23919 + }, + { + "epoch": 13.363128491620111, + "grad_norm": 0.6594109535217285, + "learning_rate": 0.00033313725490196076, + "loss": 0.4361, + "step": 23920 + }, + { + "epoch": 13.363687150837988, + "grad_norm": 1.2163207530975342, + "learning_rate": 0.000333109243697479, + "loss": 0.4369, + "step": 23921 + }, + { + "epoch": 13.364245810055866, + "grad_norm": 0.46256163716316223, + "learning_rate": 0.00033308123249299723, + "loss": 0.3462, + "step": 23922 + }, + { + "epoch": 13.364804469273743, + "grad_norm": 0.7978337407112122, + "learning_rate": 0.0003330532212885154, + "loss": 0.3853, + "step": 23923 + }, + { + "epoch": 13.36536312849162, + "grad_norm": 0.48576152324676514, + "learning_rate": 0.00033302521008403364, + "loss": 0.3777, + "step": 23924 + }, + { + "epoch": 13.365921787709498, + "grad_norm": 0.5249726176261902, + "learning_rate": 0.00033299719887955185, + "loss": 0.4314, + "step": 23925 + }, + { + "epoch": 13.366480446927374, + "grad_norm": 0.4241422116756439, + "learning_rate": 0.00033296918767507005, + "loss": 0.3921, + "step": 23926 + }, + { + "epoch": 13.367039106145251, + "grad_norm": 0.5657902956008911, + "learning_rate": 0.00033294117647058826, + "loss": 0.4705, + "step": 23927 + }, + { + "epoch": 13.36759776536313, + "grad_norm": 0.5743637084960938, + "learning_rate": 0.0003329131652661064, + "loss": 0.3546, + "step": 23928 + }, + { + "epoch": 13.368156424581006, + "grad_norm": 0.7750608325004578, + "learning_rate": 0.00033288515406162467, + "loss": 0.5385, + "step": 23929 + }, + { + "epoch": 13.368715083798882, + "grad_norm": 0.5233944058418274, + "learning_rate": 0.0003328571428571429, + "loss": 0.4268, + "step": 23930 + }, + { + "epoch": 13.369273743016759, + "grad_norm": 0.423409104347229, + "learning_rate": 0.0003328291316526611, + "loss": 0.3835, + "step": 23931 + }, + { + "epoch": 13.369832402234637, + "grad_norm": 0.4611133933067322, + "learning_rate": 0.0003328011204481793, + "loss": 0.4779, + "step": 23932 + }, + { + "epoch": 13.370391061452514, + "grad_norm": 0.6168695092201233, + "learning_rate": 0.0003327731092436975, + "loss": 0.4186, + "step": 23933 + }, + { + "epoch": 13.37094972067039, + "grad_norm": 0.5267783403396606, + "learning_rate": 0.0003327450980392157, + "loss": 0.3806, + "step": 23934 + }, + { + "epoch": 13.371508379888269, + "grad_norm": 0.728651225566864, + "learning_rate": 0.0003327170868347339, + "loss": 0.413, + "step": 23935 + }, + { + "epoch": 13.372067039106145, + "grad_norm": 0.5308983325958252, + "learning_rate": 0.00033268907563025206, + "loss": 0.5084, + "step": 23936 + }, + { + "epoch": 13.372625698324022, + "grad_norm": 0.9741657972335815, + "learning_rate": 0.0003326610644257703, + "loss": 0.506, + "step": 23937 + }, + { + "epoch": 13.3731843575419, + "grad_norm": 0.37411120533943176, + "learning_rate": 0.0003326330532212885, + "loss": 0.3495, + "step": 23938 + }, + { + "epoch": 13.373743016759777, + "grad_norm": 0.5017676949501038, + "learning_rate": 0.00033260504201680673, + "loss": 0.37, + "step": 23939 + }, + { + "epoch": 13.374301675977653, + "grad_norm": 0.8682034611701965, + "learning_rate": 0.00033257703081232494, + "loss": 0.5199, + "step": 23940 + }, + { + "epoch": 13.37486033519553, + "grad_norm": 0.5218703150749207, + "learning_rate": 0.00033254901960784314, + "loss": 0.5387, + "step": 23941 + }, + { + "epoch": 13.375418994413408, + "grad_norm": 0.5939416289329529, + "learning_rate": 0.00033252100840336135, + "loss": 0.4932, + "step": 23942 + }, + { + "epoch": 13.375977653631285, + "grad_norm": 0.6838260293006897, + "learning_rate": 0.00033249299719887956, + "loss": 0.3823, + "step": 23943 + }, + { + "epoch": 13.376536312849161, + "grad_norm": 0.4618566632270813, + "learning_rate": 0.00033246498599439776, + "loss": 0.5363, + "step": 23944 + }, + { + "epoch": 13.37709497206704, + "grad_norm": 0.6762872934341431, + "learning_rate": 0.00033243697478991597, + "loss": 0.4902, + "step": 23945 + }, + { + "epoch": 13.377653631284916, + "grad_norm": 0.39185744524002075, + "learning_rate": 0.0003324089635854342, + "loss": 0.4039, + "step": 23946 + }, + { + "epoch": 13.378212290502793, + "grad_norm": 0.4100474417209625, + "learning_rate": 0.0003323809523809524, + "loss": 0.3251, + "step": 23947 + }, + { + "epoch": 13.378770949720671, + "grad_norm": 0.4624863564968109, + "learning_rate": 0.0003323529411764706, + "loss": 0.3983, + "step": 23948 + }, + { + "epoch": 13.379329608938548, + "grad_norm": 0.9086792469024658, + "learning_rate": 0.00033232492997198885, + "loss": 0.4873, + "step": 23949 + }, + { + "epoch": 13.379888268156424, + "grad_norm": 1.9930933713912964, + "learning_rate": 0.000332296918767507, + "loss": 0.4458, + "step": 23950 + }, + { + "epoch": 13.380446927374301, + "grad_norm": 0.6225258708000183, + "learning_rate": 0.0003322689075630252, + "loss": 0.3734, + "step": 23951 + }, + { + "epoch": 13.38100558659218, + "grad_norm": 0.4544983208179474, + "learning_rate": 0.0003322408963585434, + "loss": 0.4438, + "step": 23952 + }, + { + "epoch": 13.381564245810056, + "grad_norm": 0.8222813606262207, + "learning_rate": 0.0003322128851540616, + "loss": 0.63, + "step": 23953 + }, + { + "epoch": 13.382122905027932, + "grad_norm": 0.5289430022239685, + "learning_rate": 0.0003321848739495799, + "loss": 0.3208, + "step": 23954 + }, + { + "epoch": 13.38268156424581, + "grad_norm": 0.3467426300048828, + "learning_rate": 0.00033215686274509803, + "loss": 0.3111, + "step": 23955 + }, + { + "epoch": 13.383240223463687, + "grad_norm": 2.535980463027954, + "learning_rate": 0.00033212885154061623, + "loss": 0.5181, + "step": 23956 + }, + { + "epoch": 13.383798882681564, + "grad_norm": 1.7109472751617432, + "learning_rate": 0.0003321008403361345, + "loss": 0.4645, + "step": 23957 + }, + { + "epoch": 13.38435754189944, + "grad_norm": 0.6140024065971375, + "learning_rate": 0.00033207282913165265, + "loss": 0.3611, + "step": 23958 + }, + { + "epoch": 13.384916201117319, + "grad_norm": 0.39160990715026855, + "learning_rate": 0.0003320448179271709, + "loss": 0.4454, + "step": 23959 + }, + { + "epoch": 13.385474860335195, + "grad_norm": 0.7750301957130432, + "learning_rate": 0.00033201680672268906, + "loss": 0.4169, + "step": 23960 + }, + { + "epoch": 13.386033519553072, + "grad_norm": 0.8618877530097961, + "learning_rate": 0.00033198879551820726, + "loss": 0.5585, + "step": 23961 + }, + { + "epoch": 13.38659217877095, + "grad_norm": 1.4556639194488525, + "learning_rate": 0.0003319607843137255, + "loss": 0.446, + "step": 23962 + }, + { + "epoch": 13.387150837988827, + "grad_norm": 0.37511712312698364, + "learning_rate": 0.0003319327731092437, + "loss": 0.4248, + "step": 23963 + }, + { + "epoch": 13.387709497206703, + "grad_norm": 0.6858104467391968, + "learning_rate": 0.00033190476190476194, + "loss": 0.4957, + "step": 23964 + }, + { + "epoch": 13.388268156424582, + "grad_norm": 0.6047878265380859, + "learning_rate": 0.00033187675070028014, + "loss": 0.4738, + "step": 23965 + }, + { + "epoch": 13.388826815642458, + "grad_norm": 0.44325828552246094, + "learning_rate": 0.0003318487394957983, + "loss": 0.3719, + "step": 23966 + }, + { + "epoch": 13.389385474860335, + "grad_norm": 0.724241316318512, + "learning_rate": 0.00033182072829131655, + "loss": 0.3483, + "step": 23967 + }, + { + "epoch": 13.389944134078211, + "grad_norm": 0.5017548203468323, + "learning_rate": 0.0003317927170868347, + "loss": 0.4482, + "step": 23968 + }, + { + "epoch": 13.39050279329609, + "grad_norm": 0.43123143911361694, + "learning_rate": 0.00033176470588235297, + "loss": 0.4151, + "step": 23969 + }, + { + "epoch": 13.391061452513966, + "grad_norm": 0.31168562173843384, + "learning_rate": 0.00033173669467787117, + "loss": 0.297, + "step": 23970 + }, + { + "epoch": 13.391620111731843, + "grad_norm": 0.49120715260505676, + "learning_rate": 0.0003317086834733893, + "loss": 0.3888, + "step": 23971 + }, + { + "epoch": 13.392178770949721, + "grad_norm": 1.9605176448822021, + "learning_rate": 0.0003316806722689076, + "loss": 0.3904, + "step": 23972 + }, + { + "epoch": 13.392737430167598, + "grad_norm": 0.5817265510559082, + "learning_rate": 0.0003316526610644258, + "loss": 0.4741, + "step": 23973 + }, + { + "epoch": 13.393296089385474, + "grad_norm": 0.42018619179725647, + "learning_rate": 0.000331624649859944, + "loss": 0.3869, + "step": 23974 + }, + { + "epoch": 13.393854748603353, + "grad_norm": 0.7574966549873352, + "learning_rate": 0.0003315966386554622, + "loss": 0.4373, + "step": 23975 + }, + { + "epoch": 13.39441340782123, + "grad_norm": 0.5552080869674683, + "learning_rate": 0.00033156862745098035, + "loss": 0.342, + "step": 23976 + }, + { + "epoch": 13.394972067039106, + "grad_norm": 0.42406103014945984, + "learning_rate": 0.0003315406162464986, + "loss": 0.4161, + "step": 23977 + }, + { + "epoch": 13.395530726256982, + "grad_norm": 1.149976134300232, + "learning_rate": 0.0003315126050420168, + "loss": 0.3485, + "step": 23978 + }, + { + "epoch": 13.39608938547486, + "grad_norm": 0.43855980038642883, + "learning_rate": 0.000331484593837535, + "loss": 0.4103, + "step": 23979 + }, + { + "epoch": 13.396648044692737, + "grad_norm": 0.5216570496559143, + "learning_rate": 0.00033145658263305323, + "loss": 0.3253, + "step": 23980 + }, + { + "epoch": 13.397206703910614, + "grad_norm": 0.408236563205719, + "learning_rate": 0.00033142857142857144, + "loss": 0.3827, + "step": 23981 + }, + { + "epoch": 13.397765363128492, + "grad_norm": 0.8788466453552246, + "learning_rate": 0.00033140056022408964, + "loss": 0.4947, + "step": 23982 + }, + { + "epoch": 13.398324022346369, + "grad_norm": 0.5359235405921936, + "learning_rate": 0.00033137254901960785, + "loss": 0.3661, + "step": 23983 + }, + { + "epoch": 13.398882681564245, + "grad_norm": 0.5464350581169128, + "learning_rate": 0.00033134453781512606, + "loss": 0.4578, + "step": 23984 + }, + { + "epoch": 13.399441340782122, + "grad_norm": 0.43618351221084595, + "learning_rate": 0.00033131652661064426, + "loss": 0.3375, + "step": 23985 + }, + { + "epoch": 13.4, + "grad_norm": 0.4241359531879425, + "learning_rate": 0.00033128851540616247, + "loss": 0.3676, + "step": 23986 + }, + { + "epoch": 13.400558659217877, + "grad_norm": 0.6320966482162476, + "learning_rate": 0.0003312605042016807, + "loss": 0.3961, + "step": 23987 + }, + { + "epoch": 13.401117318435753, + "grad_norm": 0.479246586561203, + "learning_rate": 0.0003312324929971989, + "loss": 0.4815, + "step": 23988 + }, + { + "epoch": 13.401675977653632, + "grad_norm": 0.34296807646751404, + "learning_rate": 0.00033120448179271714, + "loss": 0.2678, + "step": 23989 + }, + { + "epoch": 13.402234636871508, + "grad_norm": 0.5537387728691101, + "learning_rate": 0.0003311764705882353, + "loss": 0.5158, + "step": 23990 + }, + { + "epoch": 13.402793296089385, + "grad_norm": 0.7673983573913574, + "learning_rate": 0.0003311484593837535, + "loss": 0.5018, + "step": 23991 + }, + { + "epoch": 13.403351955307263, + "grad_norm": 0.3897479772567749, + "learning_rate": 0.0003311204481792717, + "loss": 0.3878, + "step": 23992 + }, + { + "epoch": 13.40391061452514, + "grad_norm": 1.1340206861495972, + "learning_rate": 0.0003310924369747899, + "loss": 0.3561, + "step": 23993 + }, + { + "epoch": 13.404469273743016, + "grad_norm": 0.5545897483825684, + "learning_rate": 0.00033106442577030817, + "loss": 0.3832, + "step": 23994 + }, + { + "epoch": 13.405027932960893, + "grad_norm": 0.9974201321601868, + "learning_rate": 0.0003310364145658263, + "loss": 0.3523, + "step": 23995 + }, + { + "epoch": 13.405586592178771, + "grad_norm": 0.4552745819091797, + "learning_rate": 0.00033100840336134453, + "loss": 0.4373, + "step": 23996 + }, + { + "epoch": 13.406145251396648, + "grad_norm": 0.3898215889930725, + "learning_rate": 0.0003309803921568628, + "loss": 0.4002, + "step": 23997 + }, + { + "epoch": 13.406703910614524, + "grad_norm": 0.8718129396438599, + "learning_rate": 0.00033095238095238094, + "loss": 0.6538, + "step": 23998 + }, + { + "epoch": 13.407262569832403, + "grad_norm": 0.4390983283519745, + "learning_rate": 0.0003309243697478992, + "loss": 0.3123, + "step": 23999 + }, + { + "epoch": 13.40782122905028, + "grad_norm": 0.5923282504081726, + "learning_rate": 0.00033089635854341735, + "loss": 0.5346, + "step": 24000 + }, + { + "epoch": 13.40782122905028, + "eval_cer": 0.08641833873413854, + "eval_loss": 0.33153823018074036, + "eval_runtime": 55.6683, + "eval_samples_per_second": 81.519, + "eval_steps_per_second": 5.102, + "eval_wer": 0.3398229099745817, + "step": 24000 + }, + { + "epoch": 13.408379888268156, + "grad_norm": 0.8365556001663208, + "learning_rate": 0.00033086834733893556, + "loss": 0.4261, + "step": 24001 + }, + { + "epoch": 13.408938547486034, + "grad_norm": 0.551045835018158, + "learning_rate": 0.0003308403361344538, + "loss": 0.3676, + "step": 24002 + }, + { + "epoch": 13.40949720670391, + "grad_norm": 0.628197431564331, + "learning_rate": 0.00033081232492997197, + "loss": 0.4124, + "step": 24003 + }, + { + "epoch": 13.410055865921787, + "grad_norm": 0.5075826644897461, + "learning_rate": 0.00033078431372549023, + "loss": 0.4317, + "step": 24004 + }, + { + "epoch": 13.410614525139664, + "grad_norm": 5.493040561676025, + "learning_rate": 0.00033075630252100844, + "loss": 0.6192, + "step": 24005 + }, + { + "epoch": 13.411173184357542, + "grad_norm": 0.4345131516456604, + "learning_rate": 0.0003307282913165266, + "loss": 0.3825, + "step": 24006 + }, + { + "epoch": 13.411731843575419, + "grad_norm": 0.37854650616645813, + "learning_rate": 0.00033070028011204485, + "loss": 0.3553, + "step": 24007 + }, + { + "epoch": 13.412290502793295, + "grad_norm": 0.629016101360321, + "learning_rate": 0.000330672268907563, + "loss": 0.3608, + "step": 24008 + }, + { + "epoch": 13.412849162011174, + "grad_norm": 0.48460161685943604, + "learning_rate": 0.00033064425770308126, + "loss": 0.4466, + "step": 24009 + }, + { + "epoch": 13.41340782122905, + "grad_norm": 0.5453964471817017, + "learning_rate": 0.00033061624649859947, + "loss": 0.3567, + "step": 24010 + }, + { + "epoch": 13.413966480446927, + "grad_norm": 0.5664440393447876, + "learning_rate": 0.0003305882352941176, + "loss": 0.4554, + "step": 24011 + }, + { + "epoch": 13.414525139664805, + "grad_norm": 0.5190845131874084, + "learning_rate": 0.0003305602240896359, + "loss": 0.4445, + "step": 24012 + }, + { + "epoch": 13.415083798882682, + "grad_norm": 0.49040234088897705, + "learning_rate": 0.0003305322128851541, + "loss": 0.3752, + "step": 24013 + }, + { + "epoch": 13.415642458100558, + "grad_norm": 0.5478666424751282, + "learning_rate": 0.0003305042016806723, + "loss": 0.476, + "step": 24014 + }, + { + "epoch": 13.416201117318435, + "grad_norm": 2.912370204925537, + "learning_rate": 0.0003304761904761905, + "loss": 0.5006, + "step": 24015 + }, + { + "epoch": 13.416759776536313, + "grad_norm": 0.6135877370834351, + "learning_rate": 0.00033044817927170865, + "loss": 0.5034, + "step": 24016 + }, + { + "epoch": 13.41731843575419, + "grad_norm": 0.8700762391090393, + "learning_rate": 0.0003304201680672269, + "loss": 0.6384, + "step": 24017 + }, + { + "epoch": 13.417877094972066, + "grad_norm": 0.47633153200149536, + "learning_rate": 0.0003303921568627451, + "loss": 0.3772, + "step": 24018 + }, + { + "epoch": 13.418435754189945, + "grad_norm": 0.4083118438720703, + "learning_rate": 0.0003303641456582633, + "loss": 0.3795, + "step": 24019 + }, + { + "epoch": 13.418994413407821, + "grad_norm": 0.45499464869499207, + "learning_rate": 0.0003303361344537815, + "loss": 0.4086, + "step": 24020 + }, + { + "epoch": 13.419553072625698, + "grad_norm": 0.46390077471733093, + "learning_rate": 0.00033030812324929973, + "loss": 0.445, + "step": 24021 + }, + { + "epoch": 13.420111731843576, + "grad_norm": 0.5300405025482178, + "learning_rate": 0.00033028011204481794, + "loss": 0.3861, + "step": 24022 + }, + { + "epoch": 13.420670391061453, + "grad_norm": 0.5467058420181274, + "learning_rate": 0.00033025210084033614, + "loss": 0.5471, + "step": 24023 + }, + { + "epoch": 13.42122905027933, + "grad_norm": 0.5927534699440002, + "learning_rate": 0.00033022408963585435, + "loss": 0.3758, + "step": 24024 + }, + { + "epoch": 13.421787709497206, + "grad_norm": 0.7953538298606873, + "learning_rate": 0.00033019607843137256, + "loss": 0.5591, + "step": 24025 + }, + { + "epoch": 13.422346368715084, + "grad_norm": 0.5789852738380432, + "learning_rate": 0.00033016806722689076, + "loss": 0.5116, + "step": 24026 + }, + { + "epoch": 13.422905027932961, + "grad_norm": 0.37751510739326477, + "learning_rate": 0.00033014005602240897, + "loss": 0.3924, + "step": 24027 + }, + { + "epoch": 13.423463687150837, + "grad_norm": 0.3508763909339905, + "learning_rate": 0.0003301120448179272, + "loss": 0.4381, + "step": 24028 + }, + { + "epoch": 13.424022346368716, + "grad_norm": 0.458859920501709, + "learning_rate": 0.00033008403361344543, + "loss": 0.4542, + "step": 24029 + }, + { + "epoch": 13.424581005586592, + "grad_norm": 0.45266950130462646, + "learning_rate": 0.0003300560224089636, + "loss": 0.3955, + "step": 24030 + }, + { + "epoch": 13.425139664804469, + "grad_norm": 0.5199288129806519, + "learning_rate": 0.0003300280112044818, + "loss": 0.319, + "step": 24031 + }, + { + "epoch": 13.425698324022346, + "grad_norm": 9.08499813079834, + "learning_rate": 0.00033, + "loss": 0.4953, + "step": 24032 + }, + { + "epoch": 13.426256983240224, + "grad_norm": 0.37150460481643677, + "learning_rate": 0.0003299719887955182, + "loss": 0.3951, + "step": 24033 + }, + { + "epoch": 13.4268156424581, + "grad_norm": 0.7141701579093933, + "learning_rate": 0.00032994397759103646, + "loss": 0.4892, + "step": 24034 + }, + { + "epoch": 13.427374301675977, + "grad_norm": 0.6042700409889221, + "learning_rate": 0.0003299159663865546, + "loss": 0.4408, + "step": 24035 + }, + { + "epoch": 13.427932960893855, + "grad_norm": 0.5827171206474304, + "learning_rate": 0.0003298879551820728, + "loss": 0.3516, + "step": 24036 + }, + { + "epoch": 13.428491620111732, + "grad_norm": 0.45501765608787537, + "learning_rate": 0.0003298599439775911, + "loss": 0.5182, + "step": 24037 + }, + { + "epoch": 13.429050279329608, + "grad_norm": 0.6984469890594482, + "learning_rate": 0.00032983193277310923, + "loss": 0.5069, + "step": 24038 + }, + { + "epoch": 13.429608938547487, + "grad_norm": 1.102880835533142, + "learning_rate": 0.0003298039215686275, + "loss": 0.357, + "step": 24039 + }, + { + "epoch": 13.430167597765363, + "grad_norm": 0.38930967450141907, + "learning_rate": 0.00032977591036414565, + "loss": 0.3601, + "step": 24040 + }, + { + "epoch": 13.43072625698324, + "grad_norm": 0.8170896172523499, + "learning_rate": 0.00032974789915966385, + "loss": 0.4151, + "step": 24041 + }, + { + "epoch": 13.431284916201117, + "grad_norm": 0.34871599078178406, + "learning_rate": 0.0003297198879551821, + "loss": 0.4465, + "step": 24042 + }, + { + "epoch": 13.431843575418995, + "grad_norm": 0.4154914915561676, + "learning_rate": 0.00032969187675070026, + "loss": 0.3849, + "step": 24043 + }, + { + "epoch": 13.432402234636871, + "grad_norm": 1.2494512796401978, + "learning_rate": 0.00032966386554621847, + "loss": 0.4804, + "step": 24044 + }, + { + "epoch": 13.432960893854748, + "grad_norm": 2.529845952987671, + "learning_rate": 0.00032963585434173673, + "loss": 0.4597, + "step": 24045 + }, + { + "epoch": 13.433519553072626, + "grad_norm": 0.6845638155937195, + "learning_rate": 0.0003296078431372549, + "loss": 0.4631, + "step": 24046 + }, + { + "epoch": 13.434078212290503, + "grad_norm": 0.7565985321998596, + "learning_rate": 0.00032957983193277314, + "loss": 0.3217, + "step": 24047 + }, + { + "epoch": 13.43463687150838, + "grad_norm": 0.3648121953010559, + "learning_rate": 0.0003295518207282913, + "loss": 0.4119, + "step": 24048 + }, + { + "epoch": 13.435195530726258, + "grad_norm": 0.5148236155509949, + "learning_rate": 0.0003295238095238095, + "loss": 0.4098, + "step": 24049 + }, + { + "epoch": 13.435754189944134, + "grad_norm": 6.18539571762085, + "learning_rate": 0.00032949579831932776, + "loss": 0.473, + "step": 24050 + }, + { + "epoch": 13.436312849162011, + "grad_norm": 0.45961225032806396, + "learning_rate": 0.0003294677871148459, + "loss": 0.4264, + "step": 24051 + }, + { + "epoch": 13.436871508379888, + "grad_norm": 0.5115296244621277, + "learning_rate": 0.00032943977591036417, + "loss": 0.3598, + "step": 24052 + }, + { + "epoch": 13.437430167597766, + "grad_norm": 0.45676034688949585, + "learning_rate": 0.0003294117647058824, + "loss": 0.4193, + "step": 24053 + }, + { + "epoch": 13.437988826815642, + "grad_norm": 0.6057080030441284, + "learning_rate": 0.00032938375350140053, + "loss": 0.4042, + "step": 24054 + }, + { + "epoch": 13.438547486033519, + "grad_norm": 0.5430832505226135, + "learning_rate": 0.0003293557422969188, + "loss": 0.3292, + "step": 24055 + }, + { + "epoch": 13.439106145251397, + "grad_norm": 0.4006073772907257, + "learning_rate": 0.00032932773109243694, + "loss": 0.3254, + "step": 24056 + }, + { + "epoch": 13.439664804469274, + "grad_norm": 3.4763219356536865, + "learning_rate": 0.0003292997198879552, + "loss": 0.4068, + "step": 24057 + }, + { + "epoch": 13.44022346368715, + "grad_norm": 0.4126766622066498, + "learning_rate": 0.0003292717086834734, + "loss": 0.3275, + "step": 24058 + }, + { + "epoch": 13.440782122905027, + "grad_norm": 0.3994097411632538, + "learning_rate": 0.00032924369747899156, + "loss": 0.3569, + "step": 24059 + }, + { + "epoch": 13.441340782122905, + "grad_norm": 0.49512559175491333, + "learning_rate": 0.0003292156862745098, + "loss": 0.3286, + "step": 24060 + }, + { + "epoch": 13.441899441340782, + "grad_norm": 0.6281354427337646, + "learning_rate": 0.000329187675070028, + "loss": 0.4282, + "step": 24061 + }, + { + "epoch": 13.442458100558659, + "grad_norm": 0.8131765723228455, + "learning_rate": 0.00032915966386554623, + "loss": 0.4747, + "step": 24062 + }, + { + "epoch": 13.443016759776537, + "grad_norm": 0.6101727485656738, + "learning_rate": 0.00032913165266106444, + "loss": 0.4062, + "step": 24063 + }, + { + "epoch": 13.443575418994413, + "grad_norm": 0.4951469600200653, + "learning_rate": 0.0003291036414565826, + "loss": 0.4507, + "step": 24064 + }, + { + "epoch": 13.44413407821229, + "grad_norm": 0.5197064876556396, + "learning_rate": 0.00032907563025210085, + "loss": 0.4306, + "step": 24065 + }, + { + "epoch": 13.444692737430168, + "grad_norm": 3.8119289875030518, + "learning_rate": 0.00032904761904761906, + "loss": 0.3867, + "step": 24066 + }, + { + "epoch": 13.445251396648045, + "grad_norm": 0.5981159806251526, + "learning_rate": 0.00032901960784313726, + "loss": 0.6226, + "step": 24067 + }, + { + "epoch": 13.445810055865921, + "grad_norm": 0.4323895275592804, + "learning_rate": 0.00032899159663865547, + "loss": 0.3518, + "step": 24068 + }, + { + "epoch": 13.446368715083798, + "grad_norm": 0.5299617052078247, + "learning_rate": 0.0003289635854341737, + "loss": 0.6003, + "step": 24069 + }, + { + "epoch": 13.446927374301676, + "grad_norm": 0.5881797671318054, + "learning_rate": 0.0003289355742296919, + "loss": 0.2923, + "step": 24070 + }, + { + "epoch": 13.447486033519553, + "grad_norm": 0.5459182262420654, + "learning_rate": 0.0003289075630252101, + "loss": 0.3705, + "step": 24071 + }, + { + "epoch": 13.44804469273743, + "grad_norm": 0.6495555639266968, + "learning_rate": 0.0003288795518207283, + "loss": 0.3372, + "step": 24072 + }, + { + "epoch": 13.448603351955308, + "grad_norm": 0.5846161842346191, + "learning_rate": 0.0003288515406162465, + "loss": 0.3325, + "step": 24073 + }, + { + "epoch": 13.449162011173184, + "grad_norm": 0.44714757800102234, + "learning_rate": 0.0003288235294117647, + "loss": 0.4131, + "step": 24074 + }, + { + "epoch": 13.449720670391061, + "grad_norm": 0.5594474077224731, + "learning_rate": 0.0003287955182072829, + "loss": 0.6322, + "step": 24075 + }, + { + "epoch": 13.45027932960894, + "grad_norm": 0.4771212637424469, + "learning_rate": 0.0003287675070028011, + "loss": 0.4221, + "step": 24076 + }, + { + "epoch": 13.450837988826816, + "grad_norm": 0.461176335811615, + "learning_rate": 0.0003287394957983194, + "loss": 0.3213, + "step": 24077 + }, + { + "epoch": 13.451396648044692, + "grad_norm": 1.8055278062820435, + "learning_rate": 0.00032871148459383753, + "loss": 0.3278, + "step": 24078 + }, + { + "epoch": 13.451955307262569, + "grad_norm": 1.654625415802002, + "learning_rate": 0.00032868347338935573, + "loss": 0.4666, + "step": 24079 + }, + { + "epoch": 13.452513966480447, + "grad_norm": 0.6786640882492065, + "learning_rate": 0.00032865546218487394, + "loss": 0.4018, + "step": 24080 + }, + { + "epoch": 13.453072625698324, + "grad_norm": 0.7715920805931091, + "learning_rate": 0.00032862745098039215, + "loss": 0.5213, + "step": 24081 + }, + { + "epoch": 13.4536312849162, + "grad_norm": 0.5421979427337646, + "learning_rate": 0.0003285994397759104, + "loss": 0.4043, + "step": 24082 + }, + { + "epoch": 13.454189944134079, + "grad_norm": 0.7266767024993896, + "learning_rate": 0.00032857142857142856, + "loss": 0.3895, + "step": 24083 + }, + { + "epoch": 13.454748603351955, + "grad_norm": 9.650465965270996, + "learning_rate": 0.00032854341736694676, + "loss": 0.378, + "step": 24084 + }, + { + "epoch": 13.455307262569832, + "grad_norm": 0.44124212861061096, + "learning_rate": 0.000328515406162465, + "loss": 0.3872, + "step": 24085 + }, + { + "epoch": 13.45586592178771, + "grad_norm": 0.9369888305664062, + "learning_rate": 0.0003284873949579832, + "loss": 0.4032, + "step": 24086 + }, + { + "epoch": 13.456424581005587, + "grad_norm": 0.4573241174221039, + "learning_rate": 0.00032845938375350144, + "loss": 0.4073, + "step": 24087 + }, + { + "epoch": 13.456983240223463, + "grad_norm": 0.35886985063552856, + "learning_rate": 0.0003284313725490196, + "loss": 0.3779, + "step": 24088 + }, + { + "epoch": 13.45754189944134, + "grad_norm": 0.7889412641525269, + "learning_rate": 0.0003284033613445378, + "loss": 0.756, + "step": 24089 + }, + { + "epoch": 13.458100558659218, + "grad_norm": 0.7551457285881042, + "learning_rate": 0.00032837535014005605, + "loss": 0.3321, + "step": 24090 + }, + { + "epoch": 13.458659217877095, + "grad_norm": 0.4197429120540619, + "learning_rate": 0.0003283473389355742, + "loss": 0.4102, + "step": 24091 + }, + { + "epoch": 13.459217877094972, + "grad_norm": 0.7796857953071594, + "learning_rate": 0.00032831932773109247, + "loss": 0.5002, + "step": 24092 + }, + { + "epoch": 13.45977653631285, + "grad_norm": 2.507544755935669, + "learning_rate": 0.00032829131652661067, + "loss": 0.4085, + "step": 24093 + }, + { + "epoch": 13.460335195530726, + "grad_norm": 2.926431894302368, + "learning_rate": 0.0003282633053221288, + "loss": 0.4222, + "step": 24094 + }, + { + "epoch": 13.460893854748603, + "grad_norm": 0.5870780348777771, + "learning_rate": 0.0003282352941176471, + "loss": 0.4732, + "step": 24095 + }, + { + "epoch": 13.461452513966481, + "grad_norm": 0.43072089552879333, + "learning_rate": 0.00032820728291316524, + "loss": 0.3823, + "step": 24096 + }, + { + "epoch": 13.462011173184358, + "grad_norm": 0.4563128650188446, + "learning_rate": 0.0003281792717086835, + "loss": 0.4411, + "step": 24097 + }, + { + "epoch": 13.462569832402234, + "grad_norm": 1.0716365575790405, + "learning_rate": 0.0003281512605042017, + "loss": 0.4848, + "step": 24098 + }, + { + "epoch": 13.463128491620111, + "grad_norm": 1.2820457220077515, + "learning_rate": 0.00032812324929971985, + "loss": 0.4763, + "step": 24099 + }, + { + "epoch": 13.46368715083799, + "grad_norm": 0.3371823728084564, + "learning_rate": 0.0003280952380952381, + "loss": 0.3929, + "step": 24100 + }, + { + "epoch": 13.464245810055866, + "grad_norm": 0.6280838847160339, + "learning_rate": 0.0003280672268907563, + "loss": 0.3704, + "step": 24101 + }, + { + "epoch": 13.464804469273743, + "grad_norm": 2.7057723999023438, + "learning_rate": 0.0003280392156862745, + "loss": 0.3824, + "step": 24102 + }, + { + "epoch": 13.46536312849162, + "grad_norm": 0.832786500453949, + "learning_rate": 0.00032801120448179273, + "loss": 0.5244, + "step": 24103 + }, + { + "epoch": 13.465921787709497, + "grad_norm": 0.4125325083732605, + "learning_rate": 0.0003279831932773109, + "loss": 0.3519, + "step": 24104 + }, + { + "epoch": 13.466480446927374, + "grad_norm": 0.5188695192337036, + "learning_rate": 0.00032795518207282914, + "loss": 0.5037, + "step": 24105 + }, + { + "epoch": 13.46703910614525, + "grad_norm": 0.517441987991333, + "learning_rate": 0.00032792717086834735, + "loss": 0.453, + "step": 24106 + }, + { + "epoch": 13.467597765363129, + "grad_norm": 0.890071451663971, + "learning_rate": 0.00032789915966386556, + "loss": 0.4698, + "step": 24107 + }, + { + "epoch": 13.468156424581005, + "grad_norm": 0.4595129191875458, + "learning_rate": 0.00032787114845938376, + "loss": 0.4612, + "step": 24108 + }, + { + "epoch": 13.468715083798882, + "grad_norm": 0.47704362869262695, + "learning_rate": 0.00032784313725490197, + "loss": 0.4529, + "step": 24109 + }, + { + "epoch": 13.46927374301676, + "grad_norm": 0.5455402135848999, + "learning_rate": 0.0003278151260504202, + "loss": 0.4508, + "step": 24110 + }, + { + "epoch": 13.469832402234637, + "grad_norm": 0.5782555937767029, + "learning_rate": 0.0003277871148459384, + "loss": 0.4033, + "step": 24111 + }, + { + "epoch": 13.470391061452514, + "grad_norm": 0.4718027412891388, + "learning_rate": 0.0003277591036414566, + "loss": 0.4518, + "step": 24112 + }, + { + "epoch": 13.470949720670392, + "grad_norm": 0.5594518780708313, + "learning_rate": 0.0003277310924369748, + "loss": 0.3742, + "step": 24113 + }, + { + "epoch": 13.471508379888268, + "grad_norm": 0.4130031168460846, + "learning_rate": 0.000327703081232493, + "loss": 0.402, + "step": 24114 + }, + { + "epoch": 13.472067039106145, + "grad_norm": 0.3438200354576111, + "learning_rate": 0.0003276750700280112, + "loss": 0.3988, + "step": 24115 + }, + { + "epoch": 13.472625698324022, + "grad_norm": 0.37586405873298645, + "learning_rate": 0.0003276470588235294, + "loss": 0.3869, + "step": 24116 + }, + { + "epoch": 13.4731843575419, + "grad_norm": 0.5238877534866333, + "learning_rate": 0.00032761904761904767, + "loss": 0.572, + "step": 24117 + }, + { + "epoch": 13.473743016759776, + "grad_norm": 0.5923094153404236, + "learning_rate": 0.0003275910364145658, + "loss": 0.3986, + "step": 24118 + }, + { + "epoch": 13.474301675977653, + "grad_norm": 2.852379322052002, + "learning_rate": 0.00032756302521008403, + "loss": 0.5165, + "step": 24119 + }, + { + "epoch": 13.474860335195531, + "grad_norm": 0.4096725583076477, + "learning_rate": 0.00032753501400560223, + "loss": 0.4266, + "step": 24120 + }, + { + "epoch": 13.475418994413408, + "grad_norm": 0.5331655144691467, + "learning_rate": 0.00032750700280112044, + "loss": 0.4184, + "step": 24121 + }, + { + "epoch": 13.475977653631285, + "grad_norm": 2.1263222694396973, + "learning_rate": 0.0003274789915966387, + "loss": 0.3689, + "step": 24122 + }, + { + "epoch": 13.476536312849163, + "grad_norm": 0.6322385668754578, + "learning_rate": 0.00032745098039215685, + "loss": 0.4613, + "step": 24123 + }, + { + "epoch": 13.47709497206704, + "grad_norm": 0.4124833941459656, + "learning_rate": 0.00032742296918767506, + "loss": 0.359, + "step": 24124 + }, + { + "epoch": 13.477653631284916, + "grad_norm": 0.5539673566818237, + "learning_rate": 0.0003273949579831933, + "loss": 0.5617, + "step": 24125 + }, + { + "epoch": 13.478212290502793, + "grad_norm": 2.7253148555755615, + "learning_rate": 0.00032736694677871147, + "loss": 0.2838, + "step": 24126 + }, + { + "epoch": 13.478770949720671, + "grad_norm": 0.44082266092300415, + "learning_rate": 0.00032733893557422973, + "loss": 0.4267, + "step": 24127 + }, + { + "epoch": 13.479329608938547, + "grad_norm": 0.3895180821418762, + "learning_rate": 0.0003273109243697479, + "loss": 0.3356, + "step": 24128 + }, + { + "epoch": 13.479888268156424, + "grad_norm": 0.4109691083431244, + "learning_rate": 0.0003272829131652661, + "loss": 0.4625, + "step": 24129 + }, + { + "epoch": 13.480446927374302, + "grad_norm": 0.46554839611053467, + "learning_rate": 0.00032725490196078435, + "loss": 0.4282, + "step": 24130 + }, + { + "epoch": 13.481005586592179, + "grad_norm": 0.3763386607170105, + "learning_rate": 0.0003272268907563025, + "loss": 0.4413, + "step": 24131 + }, + { + "epoch": 13.481564245810056, + "grad_norm": 0.4315921664237976, + "learning_rate": 0.00032719887955182076, + "loss": 0.4097, + "step": 24132 + }, + { + "epoch": 13.482122905027932, + "grad_norm": 0.4852002263069153, + "learning_rate": 0.00032717086834733897, + "loss": 0.4, + "step": 24133 + }, + { + "epoch": 13.48268156424581, + "grad_norm": 0.5241253972053528, + "learning_rate": 0.0003271428571428571, + "loss": 0.3607, + "step": 24134 + }, + { + "epoch": 13.483240223463687, + "grad_norm": 0.4029459059238434, + "learning_rate": 0.0003271148459383754, + "loss": 0.4342, + "step": 24135 + }, + { + "epoch": 13.483798882681564, + "grad_norm": 0.5683883428573608, + "learning_rate": 0.00032708683473389353, + "loss": 0.3894, + "step": 24136 + }, + { + "epoch": 13.484357541899442, + "grad_norm": 0.4287756085395813, + "learning_rate": 0.0003270588235294118, + "loss": 0.3678, + "step": 24137 + }, + { + "epoch": 13.484916201117318, + "grad_norm": 0.9509091377258301, + "learning_rate": 0.00032703081232493, + "loss": 0.4939, + "step": 24138 + }, + { + "epoch": 13.485474860335195, + "grad_norm": 0.5072173476219177, + "learning_rate": 0.00032700280112044815, + "loss": 0.3109, + "step": 24139 + }, + { + "epoch": 13.486033519553073, + "grad_norm": 0.36319008469581604, + "learning_rate": 0.0003269747899159664, + "loss": 0.2862, + "step": 24140 + }, + { + "epoch": 13.48659217877095, + "grad_norm": 0.6130936741828918, + "learning_rate": 0.0003269467787114846, + "loss": 0.4215, + "step": 24141 + }, + { + "epoch": 13.487150837988827, + "grad_norm": 0.380995512008667, + "learning_rate": 0.0003269187675070028, + "loss": 0.4091, + "step": 24142 + }, + { + "epoch": 13.487709497206703, + "grad_norm": 0.42395463585853577, + "learning_rate": 0.000326890756302521, + "loss": 0.5316, + "step": 24143 + }, + { + "epoch": 13.488268156424581, + "grad_norm": 0.40420690178871155, + "learning_rate": 0.0003268627450980392, + "loss": 0.3688, + "step": 24144 + }, + { + "epoch": 13.488826815642458, + "grad_norm": 0.5257751941680908, + "learning_rate": 0.00032683473389355744, + "loss": 0.3836, + "step": 24145 + }, + { + "epoch": 13.489385474860335, + "grad_norm": 0.8360605835914612, + "learning_rate": 0.00032680672268907564, + "loss": 0.3395, + "step": 24146 + }, + { + "epoch": 13.489944134078213, + "grad_norm": 0.627238929271698, + "learning_rate": 0.00032677871148459385, + "loss": 0.2734, + "step": 24147 + }, + { + "epoch": 13.49050279329609, + "grad_norm": 0.4085390269756317, + "learning_rate": 0.00032675070028011206, + "loss": 0.4635, + "step": 24148 + }, + { + "epoch": 13.491061452513966, + "grad_norm": 0.46530553698539734, + "learning_rate": 0.00032672268907563026, + "loss": 0.4391, + "step": 24149 + }, + { + "epoch": 13.491620111731844, + "grad_norm": 0.3951616883277893, + "learning_rate": 0.00032669467787114847, + "loss": 0.4111, + "step": 24150 + }, + { + "epoch": 13.492178770949721, + "grad_norm": 0.5235318541526794, + "learning_rate": 0.0003266666666666667, + "loss": 0.4743, + "step": 24151 + }, + { + "epoch": 13.492737430167598, + "grad_norm": 0.4140463173389435, + "learning_rate": 0.00032663865546218493, + "loss": 0.3888, + "step": 24152 + }, + { + "epoch": 13.493296089385474, + "grad_norm": 1.059981346130371, + "learning_rate": 0.0003266106442577031, + "loss": 0.4244, + "step": 24153 + }, + { + "epoch": 13.493854748603352, + "grad_norm": 0.4322214424610138, + "learning_rate": 0.0003265826330532213, + "loss": 0.3555, + "step": 24154 + }, + { + "epoch": 13.494413407821229, + "grad_norm": 0.4388740658760071, + "learning_rate": 0.0003265546218487395, + "loss": 0.4535, + "step": 24155 + }, + { + "epoch": 13.494972067039106, + "grad_norm": 0.39848706126213074, + "learning_rate": 0.0003265266106442577, + "loss": 0.3855, + "step": 24156 + }, + { + "epoch": 13.495530726256984, + "grad_norm": 0.5250911116600037, + "learning_rate": 0.0003264985994397759, + "loss": 0.3727, + "step": 24157 + }, + { + "epoch": 13.49608938547486, + "grad_norm": 0.837040901184082, + "learning_rate": 0.0003264705882352941, + "loss": 0.435, + "step": 24158 + }, + { + "epoch": 13.496648044692737, + "grad_norm": 0.575750470161438, + "learning_rate": 0.0003264425770308123, + "loss": 0.5221, + "step": 24159 + }, + { + "epoch": 13.497206703910614, + "grad_norm": 1.2366598844528198, + "learning_rate": 0.0003264145658263306, + "loss": 0.403, + "step": 24160 + }, + { + "epoch": 13.497765363128492, + "grad_norm": 0.4141302704811096, + "learning_rate": 0.00032638655462184873, + "loss": 0.5009, + "step": 24161 + }, + { + "epoch": 13.498324022346369, + "grad_norm": 0.6292719841003418, + "learning_rate": 0.00032635854341736694, + "loss": 0.4022, + "step": 24162 + }, + { + "epoch": 13.498882681564245, + "grad_norm": 0.4115258455276489, + "learning_rate": 0.00032633053221288515, + "loss": 0.4545, + "step": 24163 + }, + { + "epoch": 13.499441340782123, + "grad_norm": 0.43598470091819763, + "learning_rate": 0.00032630252100840335, + "loss": 0.3849, + "step": 24164 + }, + { + "epoch": 13.5, + "grad_norm": 0.4456072449684143, + "learning_rate": 0.0003262745098039216, + "loss": 0.3516, + "step": 24165 + }, + { + "epoch": 13.500558659217877, + "grad_norm": 0.3481283485889435, + "learning_rate": 0.00032624649859943976, + "loss": 0.3444, + "step": 24166 + }, + { + "epoch": 13.501117318435755, + "grad_norm": 0.45081251859664917, + "learning_rate": 0.00032621848739495797, + "loss": 0.3664, + "step": 24167 + }, + { + "epoch": 13.501675977653631, + "grad_norm": 0.39018774032592773, + "learning_rate": 0.00032619047619047623, + "loss": 0.5551, + "step": 24168 + }, + { + "epoch": 13.502234636871508, + "grad_norm": 0.5667728185653687, + "learning_rate": 0.0003261624649859944, + "loss": 0.425, + "step": 24169 + }, + { + "epoch": 13.502793296089386, + "grad_norm": 0.3822039067745209, + "learning_rate": 0.00032613445378151264, + "loss": 0.3998, + "step": 24170 + }, + { + "epoch": 13.503351955307263, + "grad_norm": 0.3626020550727844, + "learning_rate": 0.0003261064425770308, + "loss": 0.4132, + "step": 24171 + }, + { + "epoch": 13.50391061452514, + "grad_norm": 0.5741115212440491, + "learning_rate": 0.000326078431372549, + "loss": 0.4491, + "step": 24172 + }, + { + "epoch": 13.504469273743016, + "grad_norm": 10.957284927368164, + "learning_rate": 0.00032605042016806726, + "loss": 0.4135, + "step": 24173 + }, + { + "epoch": 13.505027932960894, + "grad_norm": 0.3592619299888611, + "learning_rate": 0.0003260224089635854, + "loss": 0.3967, + "step": 24174 + }, + { + "epoch": 13.505586592178771, + "grad_norm": 0.5009909868240356, + "learning_rate": 0.00032599439775910367, + "loss": 0.4171, + "step": 24175 + }, + { + "epoch": 13.506145251396648, + "grad_norm": 0.7640971541404724, + "learning_rate": 0.0003259663865546219, + "loss": 0.4059, + "step": 24176 + }, + { + "epoch": 13.506703910614526, + "grad_norm": 0.6335141062736511, + "learning_rate": 0.00032593837535014003, + "loss": 0.401, + "step": 24177 + }, + { + "epoch": 13.507262569832402, + "grad_norm": 0.4155081510543823, + "learning_rate": 0.0003259103641456583, + "loss": 0.39, + "step": 24178 + }, + { + "epoch": 13.507821229050279, + "grad_norm": 0.5095494985580444, + "learning_rate": 0.00032588235294117644, + "loss": 0.3249, + "step": 24179 + }, + { + "epoch": 13.508379888268156, + "grad_norm": 0.565673291683197, + "learning_rate": 0.0003258543417366947, + "loss": 0.5632, + "step": 24180 + }, + { + "epoch": 13.508938547486034, + "grad_norm": 0.6595878005027771, + "learning_rate": 0.0003258263305322129, + "loss": 0.4268, + "step": 24181 + }, + { + "epoch": 13.50949720670391, + "grad_norm": 0.6628794074058533, + "learning_rate": 0.00032579831932773106, + "loss": 0.5202, + "step": 24182 + }, + { + "epoch": 13.510055865921787, + "grad_norm": 0.37234166264533997, + "learning_rate": 0.0003257703081232493, + "loss": 0.4072, + "step": 24183 + }, + { + "epoch": 13.510614525139665, + "grad_norm": 0.42800527811050415, + "learning_rate": 0.0003257422969187675, + "loss": 0.3678, + "step": 24184 + }, + { + "epoch": 13.511173184357542, + "grad_norm": 0.603776752948761, + "learning_rate": 0.00032571428571428573, + "loss": 0.5497, + "step": 24185 + }, + { + "epoch": 13.511731843575419, + "grad_norm": 1.179127812385559, + "learning_rate": 0.00032568627450980394, + "loss": 0.4688, + "step": 24186 + }, + { + "epoch": 13.512290502793297, + "grad_norm": 8.016297340393066, + "learning_rate": 0.0003256582633053221, + "loss": 0.4433, + "step": 24187 + }, + { + "epoch": 13.512849162011173, + "grad_norm": 0.9016374945640564, + "learning_rate": 0.00032563025210084035, + "loss": 0.5232, + "step": 24188 + }, + { + "epoch": 13.51340782122905, + "grad_norm": 0.45515188574790955, + "learning_rate": 0.00032560224089635856, + "loss": 0.419, + "step": 24189 + }, + { + "epoch": 13.513966480446927, + "grad_norm": 0.5851801633834839, + "learning_rate": 0.00032557422969187676, + "loss": 0.4626, + "step": 24190 + }, + { + "epoch": 13.514525139664805, + "grad_norm": 0.6761608123779297, + "learning_rate": 0.00032554621848739497, + "loss": 0.4628, + "step": 24191 + }, + { + "epoch": 13.515083798882682, + "grad_norm": 0.4658829867839813, + "learning_rate": 0.0003255182072829132, + "loss": 0.3779, + "step": 24192 + }, + { + "epoch": 13.515642458100558, + "grad_norm": 0.3431715667247772, + "learning_rate": 0.0003254901960784314, + "loss": 0.3791, + "step": 24193 + }, + { + "epoch": 13.516201117318436, + "grad_norm": 0.32139307260513306, + "learning_rate": 0.0003254621848739496, + "loss": 0.3361, + "step": 24194 + }, + { + "epoch": 13.516759776536313, + "grad_norm": 0.49882182478904724, + "learning_rate": 0.0003254341736694678, + "loss": 0.4824, + "step": 24195 + }, + { + "epoch": 13.51731843575419, + "grad_norm": 0.391493022441864, + "learning_rate": 0.000325406162464986, + "loss": 0.3547, + "step": 24196 + }, + { + "epoch": 13.517877094972068, + "grad_norm": 0.7438197731971741, + "learning_rate": 0.0003253781512605042, + "loss": 0.5398, + "step": 24197 + }, + { + "epoch": 13.518435754189944, + "grad_norm": 0.5835862755775452, + "learning_rate": 0.0003253501400560224, + "loss": 0.5447, + "step": 24198 + }, + { + "epoch": 13.518994413407821, + "grad_norm": 0.5489450693130493, + "learning_rate": 0.0003253221288515406, + "loss": 0.4434, + "step": 24199 + }, + { + "epoch": 13.519553072625698, + "grad_norm": 1.1164907217025757, + "learning_rate": 0.0003252941176470589, + "loss": 0.5135, + "step": 24200 + }, + { + "epoch": 13.520111731843576, + "grad_norm": 0.532439112663269, + "learning_rate": 0.00032526610644257703, + "loss": 0.5785, + "step": 24201 + }, + { + "epoch": 13.520670391061453, + "grad_norm": 0.3819122910499573, + "learning_rate": 0.00032523809523809523, + "loss": 0.3747, + "step": 24202 + }, + { + "epoch": 13.521229050279329, + "grad_norm": 0.49957165122032166, + "learning_rate": 0.00032521008403361344, + "loss": 0.3758, + "step": 24203 + }, + { + "epoch": 13.521787709497207, + "grad_norm": 0.44187700748443604, + "learning_rate": 0.00032518207282913165, + "loss": 0.4464, + "step": 24204 + }, + { + "epoch": 13.522346368715084, + "grad_norm": 0.37818682193756104, + "learning_rate": 0.0003251540616246499, + "loss": 0.3839, + "step": 24205 + }, + { + "epoch": 13.52290502793296, + "grad_norm": 0.4456169903278351, + "learning_rate": 0.00032512605042016806, + "loss": 0.4275, + "step": 24206 + }, + { + "epoch": 13.523463687150837, + "grad_norm": 0.9655728936195374, + "learning_rate": 0.00032509803921568626, + "loss": 0.4851, + "step": 24207 + }, + { + "epoch": 13.524022346368715, + "grad_norm": 0.6421684622764587, + "learning_rate": 0.0003250700280112045, + "loss": 0.4451, + "step": 24208 + }, + { + "epoch": 13.524581005586592, + "grad_norm": 0.3974587917327881, + "learning_rate": 0.0003250420168067227, + "loss": 0.4256, + "step": 24209 + }, + { + "epoch": 13.525139664804469, + "grad_norm": 0.4115317165851593, + "learning_rate": 0.00032501400560224094, + "loss": 0.38, + "step": 24210 + }, + { + "epoch": 13.525698324022347, + "grad_norm": 0.4842468500137329, + "learning_rate": 0.0003249859943977591, + "loss": 0.5274, + "step": 24211 + }, + { + "epoch": 13.526256983240224, + "grad_norm": 0.3966115117073059, + "learning_rate": 0.0003249579831932773, + "loss": 0.3963, + "step": 24212 + }, + { + "epoch": 13.5268156424581, + "grad_norm": 2.8386292457580566, + "learning_rate": 0.00032492997198879555, + "loss": 0.3592, + "step": 24213 + }, + { + "epoch": 13.527374301675978, + "grad_norm": 0.8059162497520447, + "learning_rate": 0.0003249019607843137, + "loss": 0.5126, + "step": 24214 + }, + { + "epoch": 13.527932960893855, + "grad_norm": 0.6205608248710632, + "learning_rate": 0.00032487394957983197, + "loss": 0.4321, + "step": 24215 + }, + { + "epoch": 13.528491620111732, + "grad_norm": 0.4475083649158478, + "learning_rate": 0.00032484593837535017, + "loss": 0.3884, + "step": 24216 + }, + { + "epoch": 13.529050279329608, + "grad_norm": 0.5723434686660767, + "learning_rate": 0.0003248179271708683, + "loss": 0.4876, + "step": 24217 + }, + { + "epoch": 13.529608938547486, + "grad_norm": 1.1262656450271606, + "learning_rate": 0.0003247899159663866, + "loss": 0.4277, + "step": 24218 + }, + { + "epoch": 13.530167597765363, + "grad_norm": 0.6936318278312683, + "learning_rate": 0.00032476190476190474, + "loss": 0.4547, + "step": 24219 + }, + { + "epoch": 13.53072625698324, + "grad_norm": 0.5107802152633667, + "learning_rate": 0.000324733893557423, + "loss": 0.4598, + "step": 24220 + }, + { + "epoch": 13.531284916201118, + "grad_norm": 1.8162590265274048, + "learning_rate": 0.0003247058823529412, + "loss": 0.4705, + "step": 24221 + }, + { + "epoch": 13.531843575418995, + "grad_norm": 0.7601991295814514, + "learning_rate": 0.00032467787114845935, + "loss": 0.4054, + "step": 24222 + }, + { + "epoch": 13.532402234636871, + "grad_norm": 0.42457419633865356, + "learning_rate": 0.0003246498599439776, + "loss": 0.4487, + "step": 24223 + }, + { + "epoch": 13.53296089385475, + "grad_norm": 0.49485647678375244, + "learning_rate": 0.0003246218487394958, + "loss": 0.4042, + "step": 24224 + }, + { + "epoch": 13.533519553072626, + "grad_norm": 0.45416122674942017, + "learning_rate": 0.000324593837535014, + "loss": 0.4325, + "step": 24225 + }, + { + "epoch": 13.534078212290503, + "grad_norm": 0.43602144718170166, + "learning_rate": 0.00032456582633053223, + "loss": 0.3925, + "step": 24226 + }, + { + "epoch": 13.53463687150838, + "grad_norm": 0.6646022796630859, + "learning_rate": 0.0003245378151260504, + "loss": 0.4351, + "step": 24227 + }, + { + "epoch": 13.535195530726257, + "grad_norm": 0.6392934918403625, + "learning_rate": 0.00032450980392156864, + "loss": 0.339, + "step": 24228 + }, + { + "epoch": 13.535754189944134, + "grad_norm": 0.538756787776947, + "learning_rate": 0.00032448179271708685, + "loss": 0.4694, + "step": 24229 + }, + { + "epoch": 13.53631284916201, + "grad_norm": 0.35067659616470337, + "learning_rate": 0.00032445378151260506, + "loss": 0.2582, + "step": 24230 + }, + { + "epoch": 13.536871508379889, + "grad_norm": 0.4100213944911957, + "learning_rate": 0.00032442577030812326, + "loss": 0.4139, + "step": 24231 + }, + { + "epoch": 13.537430167597766, + "grad_norm": 0.979207456111908, + "learning_rate": 0.00032439775910364147, + "loss": 0.4613, + "step": 24232 + }, + { + "epoch": 13.537988826815642, + "grad_norm": 0.7991422414779663, + "learning_rate": 0.0003243697478991597, + "loss": 0.476, + "step": 24233 + }, + { + "epoch": 13.538547486033519, + "grad_norm": 0.5174069404602051, + "learning_rate": 0.0003243417366946779, + "loss": 0.4082, + "step": 24234 + }, + { + "epoch": 13.539106145251397, + "grad_norm": 0.90731281042099, + "learning_rate": 0.0003243137254901961, + "loss": 0.4528, + "step": 24235 + }, + { + "epoch": 13.539664804469274, + "grad_norm": 0.7654157876968384, + "learning_rate": 0.0003242857142857143, + "loss": 0.5313, + "step": 24236 + }, + { + "epoch": 13.54022346368715, + "grad_norm": 0.36062702536582947, + "learning_rate": 0.0003242577030812325, + "loss": 0.4262, + "step": 24237 + }, + { + "epoch": 13.540782122905028, + "grad_norm": 0.5776169300079346, + "learning_rate": 0.0003242296918767507, + "loss": 0.3506, + "step": 24238 + }, + { + "epoch": 13.541340782122905, + "grad_norm": 0.5203577876091003, + "learning_rate": 0.0003242016806722689, + "loss": 0.5883, + "step": 24239 + }, + { + "epoch": 13.541899441340782, + "grad_norm": 0.47661954164505005, + "learning_rate": 0.00032417366946778717, + "loss": 0.4938, + "step": 24240 + }, + { + "epoch": 13.54245810055866, + "grad_norm": 0.462184339761734, + "learning_rate": 0.0003241456582633053, + "loss": 0.4259, + "step": 24241 + }, + { + "epoch": 13.543016759776537, + "grad_norm": 0.6133396625518799, + "learning_rate": 0.00032411764705882353, + "loss": 0.4095, + "step": 24242 + }, + { + "epoch": 13.543575418994413, + "grad_norm": 0.7531257271766663, + "learning_rate": 0.00032408963585434173, + "loss": 0.5045, + "step": 24243 + }, + { + "epoch": 13.544134078212291, + "grad_norm": 0.5929487347602844, + "learning_rate": 0.00032406162464985994, + "loss": 0.4323, + "step": 24244 + }, + { + "epoch": 13.544692737430168, + "grad_norm": 0.3269234299659729, + "learning_rate": 0.0003240336134453782, + "loss": 0.3957, + "step": 24245 + }, + { + "epoch": 13.545251396648045, + "grad_norm": 1.4112555980682373, + "learning_rate": 0.00032400560224089635, + "loss": 0.4331, + "step": 24246 + }, + { + "epoch": 13.545810055865921, + "grad_norm": 0.5134170651435852, + "learning_rate": 0.00032397759103641456, + "loss": 0.4151, + "step": 24247 + }, + { + "epoch": 13.5463687150838, + "grad_norm": 0.5318816304206848, + "learning_rate": 0.0003239495798319328, + "loss": 0.4119, + "step": 24248 + }, + { + "epoch": 13.546927374301676, + "grad_norm": 0.6621688604354858, + "learning_rate": 0.00032392156862745097, + "loss": 0.6012, + "step": 24249 + }, + { + "epoch": 13.547486033519553, + "grad_norm": 0.7528454661369324, + "learning_rate": 0.00032389355742296923, + "loss": 0.5409, + "step": 24250 + }, + { + "epoch": 13.548044692737431, + "grad_norm": 0.4618639647960663, + "learning_rate": 0.0003238655462184874, + "loss": 0.4708, + "step": 24251 + }, + { + "epoch": 13.548603351955308, + "grad_norm": 0.4162149727344513, + "learning_rate": 0.0003238375350140056, + "loss": 0.3975, + "step": 24252 + }, + { + "epoch": 13.549162011173184, + "grad_norm": 0.975376307964325, + "learning_rate": 0.00032380952380952385, + "loss": 0.3584, + "step": 24253 + }, + { + "epoch": 13.54972067039106, + "grad_norm": 0.46845823526382446, + "learning_rate": 0.000323781512605042, + "loss": 0.5591, + "step": 24254 + }, + { + "epoch": 13.550279329608939, + "grad_norm": 1.4530911445617676, + "learning_rate": 0.00032375350140056026, + "loss": 0.5976, + "step": 24255 + }, + { + "epoch": 13.550837988826816, + "grad_norm": 0.39533713459968567, + "learning_rate": 0.00032372549019607847, + "loss": 0.4209, + "step": 24256 + }, + { + "epoch": 13.551396648044692, + "grad_norm": 1.1704386472702026, + "learning_rate": 0.0003236974789915966, + "loss": 0.6624, + "step": 24257 + }, + { + "epoch": 13.55195530726257, + "grad_norm": 0.8711075186729431, + "learning_rate": 0.0003236694677871149, + "loss": 0.4374, + "step": 24258 + }, + { + "epoch": 13.552513966480447, + "grad_norm": 0.32882529497146606, + "learning_rate": 0.00032364145658263303, + "loss": 0.4534, + "step": 24259 + }, + { + "epoch": 13.553072625698324, + "grad_norm": 0.644980251789093, + "learning_rate": 0.0003236134453781513, + "loss": 0.4965, + "step": 24260 + }, + { + "epoch": 13.553631284916202, + "grad_norm": 0.5811072587966919, + "learning_rate": 0.0003235854341736695, + "loss": 0.7347, + "step": 24261 + }, + { + "epoch": 13.554189944134079, + "grad_norm": 0.4524349570274353, + "learning_rate": 0.00032355742296918765, + "loss": 0.462, + "step": 24262 + }, + { + "epoch": 13.554748603351955, + "grad_norm": 0.5441485047340393, + "learning_rate": 0.0003235294117647059, + "loss": 0.479, + "step": 24263 + }, + { + "epoch": 13.555307262569832, + "grad_norm": 0.41202443838119507, + "learning_rate": 0.0003235014005602241, + "loss": 0.3806, + "step": 24264 + }, + { + "epoch": 13.55586592178771, + "grad_norm": 0.8137189745903015, + "learning_rate": 0.0003234733893557423, + "loss": 0.4186, + "step": 24265 + }, + { + "epoch": 13.556424581005587, + "grad_norm": 0.4145328402519226, + "learning_rate": 0.0003234453781512605, + "loss": 0.432, + "step": 24266 + }, + { + "epoch": 13.556983240223463, + "grad_norm": 0.38000452518463135, + "learning_rate": 0.0003234173669467787, + "loss": 0.4581, + "step": 24267 + }, + { + "epoch": 13.557541899441341, + "grad_norm": 0.43896549940109253, + "learning_rate": 0.00032338935574229694, + "loss": 0.4317, + "step": 24268 + }, + { + "epoch": 13.558100558659218, + "grad_norm": 1.0027886629104614, + "learning_rate": 0.00032336134453781514, + "loss": 0.398, + "step": 24269 + }, + { + "epoch": 13.558659217877095, + "grad_norm": 1.0002566576004028, + "learning_rate": 0.0003233333333333333, + "loss": 0.3899, + "step": 24270 + }, + { + "epoch": 13.559217877094973, + "grad_norm": 3.61797833442688, + "learning_rate": 0.00032330532212885156, + "loss": 0.5673, + "step": 24271 + }, + { + "epoch": 13.55977653631285, + "grad_norm": 0.6382842659950256, + "learning_rate": 0.00032327731092436976, + "loss": 0.4118, + "step": 24272 + }, + { + "epoch": 13.560335195530726, + "grad_norm": 1.1657254695892334, + "learning_rate": 0.00032324929971988797, + "loss": 0.4881, + "step": 24273 + }, + { + "epoch": 13.560893854748603, + "grad_norm": 0.410346120595932, + "learning_rate": 0.0003232212885154062, + "loss": 0.3876, + "step": 24274 + }, + { + "epoch": 13.561452513966481, + "grad_norm": 0.9743613600730896, + "learning_rate": 0.0003231932773109243, + "loss": 0.3926, + "step": 24275 + }, + { + "epoch": 13.562011173184358, + "grad_norm": 2.832747220993042, + "learning_rate": 0.0003231652661064426, + "loss": 0.368, + "step": 24276 + }, + { + "epoch": 13.562569832402234, + "grad_norm": 0.4697764813899994, + "learning_rate": 0.0003231372549019608, + "loss": 0.4078, + "step": 24277 + }, + { + "epoch": 13.563128491620112, + "grad_norm": 0.5244917869567871, + "learning_rate": 0.000323109243697479, + "loss": 0.4865, + "step": 24278 + }, + { + "epoch": 13.563687150837989, + "grad_norm": 0.6743093132972717, + "learning_rate": 0.0003230812324929972, + "loss": 0.4442, + "step": 24279 + }, + { + "epoch": 13.564245810055866, + "grad_norm": 0.4919312298297882, + "learning_rate": 0.0003230532212885154, + "loss": 0.4084, + "step": 24280 + }, + { + "epoch": 13.564804469273742, + "grad_norm": 0.5810346603393555, + "learning_rate": 0.0003230252100840336, + "loss": 0.4218, + "step": 24281 + }, + { + "epoch": 13.56536312849162, + "grad_norm": 0.5394034385681152, + "learning_rate": 0.0003229971988795518, + "loss": 0.5409, + "step": 24282 + }, + { + "epoch": 13.565921787709497, + "grad_norm": 1.5001826286315918, + "learning_rate": 0.00032296918767507003, + "loss": 0.4208, + "step": 24283 + }, + { + "epoch": 13.566480446927374, + "grad_norm": 0.4995175302028656, + "learning_rate": 0.00032294117647058823, + "loss": 0.4024, + "step": 24284 + }, + { + "epoch": 13.567039106145252, + "grad_norm": 0.9005759358406067, + "learning_rate": 0.00032291316526610644, + "loss": 0.5347, + "step": 24285 + }, + { + "epoch": 13.567597765363129, + "grad_norm": 0.3349456489086151, + "learning_rate": 0.00032288515406162465, + "loss": 0.2691, + "step": 24286 + }, + { + "epoch": 13.568156424581005, + "grad_norm": 0.4442967176437378, + "learning_rate": 0.00032285714285714285, + "loss": 0.4984, + "step": 24287 + }, + { + "epoch": 13.568715083798883, + "grad_norm": 0.5860431790351868, + "learning_rate": 0.0003228291316526611, + "loss": 0.442, + "step": 24288 + }, + { + "epoch": 13.56927374301676, + "grad_norm": 0.7263837456703186, + "learning_rate": 0.00032280112044817926, + "loss": 0.4136, + "step": 24289 + }, + { + "epoch": 13.569832402234637, + "grad_norm": 0.6092385649681091, + "learning_rate": 0.00032277310924369747, + "loss": 0.427, + "step": 24290 + }, + { + "epoch": 13.570391061452513, + "grad_norm": 0.3892316520214081, + "learning_rate": 0.0003227450980392157, + "loss": 0.4188, + "step": 24291 + }, + { + "epoch": 13.570949720670392, + "grad_norm": 0.8783763647079468, + "learning_rate": 0.0003227170868347339, + "loss": 0.3693, + "step": 24292 + }, + { + "epoch": 13.571508379888268, + "grad_norm": 0.4158749282360077, + "learning_rate": 0.00032268907563025214, + "loss": 0.4154, + "step": 24293 + }, + { + "epoch": 13.572067039106145, + "grad_norm": 0.3377149701118469, + "learning_rate": 0.0003226610644257703, + "loss": 0.4184, + "step": 24294 + }, + { + "epoch": 13.572625698324023, + "grad_norm": 0.430833101272583, + "learning_rate": 0.0003226330532212885, + "loss": 0.4628, + "step": 24295 + }, + { + "epoch": 13.5731843575419, + "grad_norm": 0.8284791707992554, + "learning_rate": 0.00032260504201680676, + "loss": 0.3164, + "step": 24296 + }, + { + "epoch": 13.573743016759776, + "grad_norm": 1.172363042831421, + "learning_rate": 0.0003225770308123249, + "loss": 0.4507, + "step": 24297 + }, + { + "epoch": 13.574301675977654, + "grad_norm": 0.7132647037506104, + "learning_rate": 0.00032254901960784317, + "loss": 0.3534, + "step": 24298 + }, + { + "epoch": 13.574860335195531, + "grad_norm": 0.47527194023132324, + "learning_rate": 0.0003225210084033613, + "loss": 0.3989, + "step": 24299 + }, + { + "epoch": 13.575418994413408, + "grad_norm": 0.6327033042907715, + "learning_rate": 0.00032249299719887953, + "loss": 0.4595, + "step": 24300 + }, + { + "epoch": 13.575977653631284, + "grad_norm": 0.41773825883865356, + "learning_rate": 0.0003224649859943978, + "loss": 0.3774, + "step": 24301 + }, + { + "epoch": 13.576536312849163, + "grad_norm": 0.41878068447113037, + "learning_rate": 0.00032243697478991594, + "loss": 0.3821, + "step": 24302 + }, + { + "epoch": 13.577094972067039, + "grad_norm": 0.6169646978378296, + "learning_rate": 0.0003224089635854342, + "loss": 0.4287, + "step": 24303 + }, + { + "epoch": 13.577653631284916, + "grad_norm": 0.5389190912246704, + "learning_rate": 0.0003223809523809524, + "loss": 0.4591, + "step": 24304 + }, + { + "epoch": 13.578212290502794, + "grad_norm": 0.6131604909896851, + "learning_rate": 0.00032235294117647056, + "loss": 0.3759, + "step": 24305 + }, + { + "epoch": 13.57877094972067, + "grad_norm": 0.5030089020729065, + "learning_rate": 0.0003223249299719888, + "loss": 0.4815, + "step": 24306 + }, + { + "epoch": 13.579329608938547, + "grad_norm": 0.5005931854248047, + "learning_rate": 0.00032229691876750697, + "loss": 0.53, + "step": 24307 + }, + { + "epoch": 13.579888268156424, + "grad_norm": 0.43076246976852417, + "learning_rate": 0.00032226890756302523, + "loss": 0.4261, + "step": 24308 + }, + { + "epoch": 13.580446927374302, + "grad_norm": 0.594862699508667, + "learning_rate": 0.00032224089635854344, + "loss": 0.3809, + "step": 24309 + }, + { + "epoch": 13.581005586592179, + "grad_norm": 0.33322015404701233, + "learning_rate": 0.0003222128851540616, + "loss": 0.3735, + "step": 24310 + }, + { + "epoch": 13.581564245810055, + "grad_norm": 5.958134174346924, + "learning_rate": 0.00032218487394957985, + "loss": 0.4254, + "step": 24311 + }, + { + "epoch": 13.582122905027934, + "grad_norm": 0.45530906319618225, + "learning_rate": 0.00032215686274509806, + "loss": 0.3942, + "step": 24312 + }, + { + "epoch": 13.58268156424581, + "grad_norm": 0.5442169904708862, + "learning_rate": 0.00032212885154061626, + "loss": 0.5347, + "step": 24313 + }, + { + "epoch": 13.583240223463687, + "grad_norm": 1.794421911239624, + "learning_rate": 0.00032210084033613447, + "loss": 0.4279, + "step": 24314 + }, + { + "epoch": 13.583798882681565, + "grad_norm": 0.48989158868789673, + "learning_rate": 0.0003220728291316526, + "loss": 0.387, + "step": 24315 + }, + { + "epoch": 13.584357541899442, + "grad_norm": 0.7065900564193726, + "learning_rate": 0.0003220448179271709, + "loss": 0.4707, + "step": 24316 + }, + { + "epoch": 13.584916201117318, + "grad_norm": 0.4515487253665924, + "learning_rate": 0.0003220168067226891, + "loss": 0.4152, + "step": 24317 + }, + { + "epoch": 13.585474860335196, + "grad_norm": 0.8893181681632996, + "learning_rate": 0.0003219887955182073, + "loss": 0.4448, + "step": 24318 + }, + { + "epoch": 13.586033519553073, + "grad_norm": 0.6195316314697266, + "learning_rate": 0.0003219607843137255, + "loss": 0.5344, + "step": 24319 + }, + { + "epoch": 13.58659217877095, + "grad_norm": 0.9954841136932373, + "learning_rate": 0.0003219327731092437, + "loss": 0.4736, + "step": 24320 + }, + { + "epoch": 13.587150837988826, + "grad_norm": 0.9266394972801208, + "learning_rate": 0.0003219047619047619, + "loss": 0.4849, + "step": 24321 + }, + { + "epoch": 13.587709497206705, + "grad_norm": 0.6230759620666504, + "learning_rate": 0.0003218767507002801, + "loss": 0.5423, + "step": 24322 + }, + { + "epoch": 13.588268156424581, + "grad_norm": 0.5001469850540161, + "learning_rate": 0.0003218487394957983, + "loss": 0.3997, + "step": 24323 + }, + { + "epoch": 13.588826815642458, + "grad_norm": 1.8093211650848389, + "learning_rate": 0.00032182072829131653, + "loss": 0.41, + "step": 24324 + }, + { + "epoch": 13.589385474860336, + "grad_norm": 0.37465524673461914, + "learning_rate": 0.00032179271708683473, + "loss": 0.4085, + "step": 24325 + }, + { + "epoch": 13.589944134078213, + "grad_norm": 0.423692911863327, + "learning_rate": 0.00032176470588235294, + "loss": 0.3438, + "step": 24326 + }, + { + "epoch": 13.59050279329609, + "grad_norm": 1.3446416854858398, + "learning_rate": 0.00032173669467787115, + "loss": 0.5167, + "step": 24327 + }, + { + "epoch": 13.591061452513966, + "grad_norm": 0.47073447704315186, + "learning_rate": 0.0003217086834733894, + "loss": 0.3583, + "step": 24328 + }, + { + "epoch": 13.591620111731844, + "grad_norm": 0.51042640209198, + "learning_rate": 0.00032168067226890756, + "loss": 0.3911, + "step": 24329 + }, + { + "epoch": 13.59217877094972, + "grad_norm": 0.45660144090652466, + "learning_rate": 0.00032165266106442576, + "loss": 0.57, + "step": 24330 + }, + { + "epoch": 13.592737430167597, + "grad_norm": 0.605329692363739, + "learning_rate": 0.00032162464985994397, + "loss": 0.5513, + "step": 24331 + }, + { + "epoch": 13.593296089385476, + "grad_norm": 0.4736612141132355, + "learning_rate": 0.0003215966386554622, + "loss": 0.3644, + "step": 24332 + }, + { + "epoch": 13.593854748603352, + "grad_norm": 0.5587393641471863, + "learning_rate": 0.00032156862745098044, + "loss": 0.4166, + "step": 24333 + }, + { + "epoch": 13.594413407821229, + "grad_norm": 0.36336472630500793, + "learning_rate": 0.0003215406162464986, + "loss": 0.3603, + "step": 24334 + }, + { + "epoch": 13.594972067039105, + "grad_norm": 1.369785189628601, + "learning_rate": 0.0003215126050420168, + "loss": 0.4314, + "step": 24335 + }, + { + "epoch": 13.595530726256984, + "grad_norm": 0.4977770149707794, + "learning_rate": 0.00032148459383753505, + "loss": 0.3219, + "step": 24336 + }, + { + "epoch": 13.59608938547486, + "grad_norm": 0.6435325741767883, + "learning_rate": 0.0003214565826330532, + "loss": 0.5181, + "step": 24337 + }, + { + "epoch": 13.596648044692737, + "grad_norm": 0.36096060276031494, + "learning_rate": 0.00032142857142857147, + "loss": 0.344, + "step": 24338 + }, + { + "epoch": 13.597206703910615, + "grad_norm": 0.3784453272819519, + "learning_rate": 0.0003214005602240896, + "loss": 0.4263, + "step": 24339 + }, + { + "epoch": 13.597765363128492, + "grad_norm": 0.4730847477912903, + "learning_rate": 0.0003213725490196078, + "loss": 0.367, + "step": 24340 + }, + { + "epoch": 13.598324022346368, + "grad_norm": 0.44273480772972107, + "learning_rate": 0.0003213445378151261, + "loss": 0.4394, + "step": 24341 + }, + { + "epoch": 13.598882681564247, + "grad_norm": 0.37002480030059814, + "learning_rate": 0.00032131652661064424, + "loss": 0.4155, + "step": 24342 + }, + { + "epoch": 13.599441340782123, + "grad_norm": 7.938429355621338, + "learning_rate": 0.0003212885154061625, + "loss": 0.459, + "step": 24343 + }, + { + "epoch": 13.6, + "grad_norm": 0.4473360478878021, + "learning_rate": 0.0003212605042016807, + "loss": 0.3792, + "step": 24344 + }, + { + "epoch": 13.600558659217878, + "grad_norm": 0.3718496859073639, + "learning_rate": 0.00032123249299719885, + "loss": 0.3437, + "step": 24345 + }, + { + "epoch": 13.601117318435755, + "grad_norm": 0.6617511510848999, + "learning_rate": 0.0003212044817927171, + "loss": 0.3902, + "step": 24346 + }, + { + "epoch": 13.601675977653631, + "grad_norm": 0.42920982837677, + "learning_rate": 0.00032117647058823527, + "loss": 0.5534, + "step": 24347 + }, + { + "epoch": 13.602234636871508, + "grad_norm": 1.6077117919921875, + "learning_rate": 0.0003211484593837535, + "loss": 0.3793, + "step": 24348 + }, + { + "epoch": 13.602793296089386, + "grad_norm": 0.37592613697052, + "learning_rate": 0.00032112044817927173, + "loss": 0.3851, + "step": 24349 + }, + { + "epoch": 13.603351955307263, + "grad_norm": 0.44684112071990967, + "learning_rate": 0.0003210924369747899, + "loss": 0.3179, + "step": 24350 + }, + { + "epoch": 13.60391061452514, + "grad_norm": 0.5253171324729919, + "learning_rate": 0.00032106442577030814, + "loss": 0.4973, + "step": 24351 + }, + { + "epoch": 13.604469273743018, + "grad_norm": 0.9252895712852478, + "learning_rate": 0.00032103641456582635, + "loss": 0.4572, + "step": 24352 + }, + { + "epoch": 13.605027932960894, + "grad_norm": 1.9745129346847534, + "learning_rate": 0.00032100840336134456, + "loss": 0.4288, + "step": 24353 + }, + { + "epoch": 13.60558659217877, + "grad_norm": 0.5356171727180481, + "learning_rate": 0.00032098039215686276, + "loss": 0.3872, + "step": 24354 + }, + { + "epoch": 13.606145251396647, + "grad_norm": 0.5641050338745117, + "learning_rate": 0.0003209523809523809, + "loss": 0.3483, + "step": 24355 + }, + { + "epoch": 13.606703910614526, + "grad_norm": 0.40061479806900024, + "learning_rate": 0.0003209243697478992, + "loss": 0.3435, + "step": 24356 + }, + { + "epoch": 13.607262569832402, + "grad_norm": 0.46367666125297546, + "learning_rate": 0.0003208963585434174, + "loss": 0.3087, + "step": 24357 + }, + { + "epoch": 13.607821229050279, + "grad_norm": 0.4955611228942871, + "learning_rate": 0.0003208683473389356, + "loss": 0.4245, + "step": 24358 + }, + { + "epoch": 13.608379888268157, + "grad_norm": 0.5085077881813049, + "learning_rate": 0.0003208403361344538, + "loss": 0.433, + "step": 24359 + }, + { + "epoch": 13.608938547486034, + "grad_norm": 0.36777132749557495, + "learning_rate": 0.000320812324929972, + "loss": 0.3491, + "step": 24360 + }, + { + "epoch": 13.60949720670391, + "grad_norm": 0.5223658084869385, + "learning_rate": 0.0003207843137254902, + "loss": 0.3592, + "step": 24361 + }, + { + "epoch": 13.610055865921789, + "grad_norm": 0.5470029711723328, + "learning_rate": 0.0003207563025210084, + "loss": 0.4424, + "step": 24362 + }, + { + "epoch": 13.610614525139665, + "grad_norm": 0.38278546929359436, + "learning_rate": 0.00032072829131652667, + "loss": 0.3623, + "step": 24363 + }, + { + "epoch": 13.611173184357542, + "grad_norm": 0.3846319019794464, + "learning_rate": 0.0003207002801120448, + "loss": 0.2749, + "step": 24364 + }, + { + "epoch": 13.611731843575418, + "grad_norm": 0.9957003593444824, + "learning_rate": 0.00032067226890756303, + "loss": 0.3384, + "step": 24365 + }, + { + "epoch": 13.612290502793297, + "grad_norm": 0.5243500471115112, + "learning_rate": 0.00032064425770308123, + "loss": 0.3717, + "step": 24366 + }, + { + "epoch": 13.612849162011173, + "grad_norm": 0.4746619164943695, + "learning_rate": 0.00032061624649859944, + "loss": 0.435, + "step": 24367 + }, + { + "epoch": 13.61340782122905, + "grad_norm": 1.226295828819275, + "learning_rate": 0.0003205882352941177, + "loss": 0.4555, + "step": 24368 + }, + { + "epoch": 13.613966480446928, + "grad_norm": 1.6121059656143188, + "learning_rate": 0.00032056022408963585, + "loss": 0.3599, + "step": 24369 + }, + { + "epoch": 13.614525139664805, + "grad_norm": 0.45315811038017273, + "learning_rate": 0.00032053221288515406, + "loss": 0.5751, + "step": 24370 + }, + { + "epoch": 13.615083798882681, + "grad_norm": 1.625594973564148, + "learning_rate": 0.0003205042016806723, + "loss": 0.4484, + "step": 24371 + }, + { + "epoch": 13.61564245810056, + "grad_norm": 0.5459540486335754, + "learning_rate": 0.00032047619047619047, + "loss": 0.3991, + "step": 24372 + }, + { + "epoch": 13.616201117318436, + "grad_norm": 0.35960251092910767, + "learning_rate": 0.00032044817927170873, + "loss": 0.3721, + "step": 24373 + }, + { + "epoch": 13.616759776536313, + "grad_norm": 0.4609623849391937, + "learning_rate": 0.0003204201680672269, + "loss": 0.4145, + "step": 24374 + }, + { + "epoch": 13.61731843575419, + "grad_norm": 0.4776493310928345, + "learning_rate": 0.0003203921568627451, + "loss": 0.4558, + "step": 24375 + }, + { + "epoch": 13.617877094972068, + "grad_norm": 0.6469979882240295, + "learning_rate": 0.00032036414565826335, + "loss": 0.3647, + "step": 24376 + }, + { + "epoch": 13.618435754189944, + "grad_norm": 1.4890589714050293, + "learning_rate": 0.0003203361344537815, + "loss": 0.5229, + "step": 24377 + }, + { + "epoch": 13.61899441340782, + "grad_norm": 0.6840735077857971, + "learning_rate": 0.0003203081232492997, + "loss": 0.4278, + "step": 24378 + }, + { + "epoch": 13.619553072625699, + "grad_norm": 0.4018539488315582, + "learning_rate": 0.00032028011204481797, + "loss": 0.3919, + "step": 24379 + }, + { + "epoch": 13.620111731843576, + "grad_norm": 0.37476009130477905, + "learning_rate": 0.0003202521008403361, + "loss": 0.456, + "step": 24380 + }, + { + "epoch": 13.620670391061452, + "grad_norm": 0.37753865122795105, + "learning_rate": 0.0003202240896358544, + "loss": 0.3466, + "step": 24381 + }, + { + "epoch": 13.621229050279329, + "grad_norm": 0.46557021141052246, + "learning_rate": 0.00032019607843137253, + "loss": 0.4142, + "step": 24382 + }, + { + "epoch": 13.621787709497207, + "grad_norm": 4.799345970153809, + "learning_rate": 0.00032016806722689074, + "loss": 0.4271, + "step": 24383 + }, + { + "epoch": 13.622346368715084, + "grad_norm": 0.6641995906829834, + "learning_rate": 0.000320140056022409, + "loss": 0.5034, + "step": 24384 + }, + { + "epoch": 13.62290502793296, + "grad_norm": 0.6856821179389954, + "learning_rate": 0.00032011204481792715, + "loss": 0.5518, + "step": 24385 + }, + { + "epoch": 13.623463687150839, + "grad_norm": 0.48791933059692383, + "learning_rate": 0.0003200840336134454, + "loss": 0.4211, + "step": 24386 + }, + { + "epoch": 13.624022346368715, + "grad_norm": 0.5542627573013306, + "learning_rate": 0.0003200560224089636, + "loss": 0.4499, + "step": 24387 + }, + { + "epoch": 13.624581005586592, + "grad_norm": 0.4851677715778351, + "learning_rate": 0.00032002801120448177, + "loss": 0.4039, + "step": 24388 + }, + { + "epoch": 13.62513966480447, + "grad_norm": 0.4310801327228546, + "learning_rate": 0.00032, + "loss": 0.3953, + "step": 24389 + }, + { + "epoch": 13.625698324022347, + "grad_norm": 0.4289790689945221, + "learning_rate": 0.0003199719887955182, + "loss": 0.4124, + "step": 24390 + }, + { + "epoch": 13.626256983240223, + "grad_norm": 0.3752892017364502, + "learning_rate": 0.00031994397759103644, + "loss": 0.4201, + "step": 24391 + }, + { + "epoch": 13.6268156424581, + "grad_norm": 0.5055199861526489, + "learning_rate": 0.00031991596638655464, + "loss": 0.442, + "step": 24392 + }, + { + "epoch": 13.627374301675978, + "grad_norm": 0.5431085824966431, + "learning_rate": 0.0003198879551820728, + "loss": 0.4966, + "step": 24393 + }, + { + "epoch": 13.627932960893855, + "grad_norm": 0.5029839873313904, + "learning_rate": 0.00031985994397759106, + "loss": 0.4532, + "step": 24394 + }, + { + "epoch": 13.628491620111731, + "grad_norm": 0.4633367955684662, + "learning_rate": 0.00031983193277310926, + "loss": 0.4598, + "step": 24395 + }, + { + "epoch": 13.62905027932961, + "grad_norm": 0.479522705078125, + "learning_rate": 0.00031980392156862747, + "loss": 0.3988, + "step": 24396 + }, + { + "epoch": 13.629608938547486, + "grad_norm": 22.345001220703125, + "learning_rate": 0.0003197759103641457, + "loss": 0.4134, + "step": 24397 + }, + { + "epoch": 13.630167597765363, + "grad_norm": 0.49456721544265747, + "learning_rate": 0.0003197478991596638, + "loss": 0.3621, + "step": 24398 + }, + { + "epoch": 13.630726256983241, + "grad_norm": 1.0717533826828003, + "learning_rate": 0.0003197198879551821, + "loss": 0.3406, + "step": 24399 + }, + { + "epoch": 13.631284916201118, + "grad_norm": 0.4595997929573059, + "learning_rate": 0.0003196918767507003, + "loss": 0.3349, + "step": 24400 + }, + { + "epoch": 13.631843575418994, + "grad_norm": 0.4430665075778961, + "learning_rate": 0.0003196638655462185, + "loss": 0.4916, + "step": 24401 + }, + { + "epoch": 13.63240223463687, + "grad_norm": 0.41209807991981506, + "learning_rate": 0.0003196358543417367, + "loss": 0.3897, + "step": 24402 + }, + { + "epoch": 13.632960893854749, + "grad_norm": 0.47212961316108704, + "learning_rate": 0.0003196078431372549, + "loss": 0.2838, + "step": 24403 + }, + { + "epoch": 13.633519553072626, + "grad_norm": 0.36477741599082947, + "learning_rate": 0.0003195798319327731, + "loss": 0.3098, + "step": 24404 + }, + { + "epoch": 13.634078212290502, + "grad_norm": 0.3986670672893524, + "learning_rate": 0.0003195518207282913, + "loss": 0.4196, + "step": 24405 + }, + { + "epoch": 13.63463687150838, + "grad_norm": 0.49524441361427307, + "learning_rate": 0.00031952380952380953, + "loss": 0.4196, + "step": 24406 + }, + { + "epoch": 13.635195530726257, + "grad_norm": 1.3900222778320312, + "learning_rate": 0.00031949579831932773, + "loss": 0.3833, + "step": 24407 + }, + { + "epoch": 13.635754189944134, + "grad_norm": 0.5785476565361023, + "learning_rate": 0.00031946778711484594, + "loss": 0.408, + "step": 24408 + }, + { + "epoch": 13.63631284916201, + "grad_norm": 0.5940086245536804, + "learning_rate": 0.00031943977591036415, + "loss": 0.4067, + "step": 24409 + }, + { + "epoch": 13.636871508379889, + "grad_norm": 2.657275676727295, + "learning_rate": 0.00031941176470588235, + "loss": 0.4608, + "step": 24410 + }, + { + "epoch": 13.637430167597765, + "grad_norm": 0.6407803297042847, + "learning_rate": 0.0003193837535014006, + "loss": 0.5222, + "step": 24411 + }, + { + "epoch": 13.637988826815642, + "grad_norm": 0.4277299642562866, + "learning_rate": 0.00031935574229691876, + "loss": 0.3432, + "step": 24412 + }, + { + "epoch": 13.63854748603352, + "grad_norm": 0.8213837742805481, + "learning_rate": 0.00031932773109243697, + "loss": 0.4603, + "step": 24413 + }, + { + "epoch": 13.639106145251397, + "grad_norm": 0.5171836614608765, + "learning_rate": 0.0003192997198879552, + "loss": 0.3866, + "step": 24414 + }, + { + "epoch": 13.639664804469273, + "grad_norm": 0.494873046875, + "learning_rate": 0.0003192717086834734, + "loss": 0.4306, + "step": 24415 + }, + { + "epoch": 13.640223463687152, + "grad_norm": 0.2952917814254761, + "learning_rate": 0.00031924369747899164, + "loss": 0.2794, + "step": 24416 + }, + { + "epoch": 13.640782122905028, + "grad_norm": 0.3938904404640198, + "learning_rate": 0.0003192156862745098, + "loss": 0.5472, + "step": 24417 + }, + { + "epoch": 13.641340782122905, + "grad_norm": 0.8669130206108093, + "learning_rate": 0.000319187675070028, + "loss": 0.3086, + "step": 24418 + }, + { + "epoch": 13.641899441340783, + "grad_norm": 0.39553967118263245, + "learning_rate": 0.00031915966386554626, + "loss": 0.3864, + "step": 24419 + }, + { + "epoch": 13.64245810055866, + "grad_norm": 0.40750664472579956, + "learning_rate": 0.0003191316526610644, + "loss": 0.5224, + "step": 24420 + }, + { + "epoch": 13.643016759776536, + "grad_norm": 0.6431204676628113, + "learning_rate": 0.00031910364145658267, + "loss": 0.4321, + "step": 24421 + }, + { + "epoch": 13.643575418994413, + "grad_norm": 0.5904327034950256, + "learning_rate": 0.0003190756302521008, + "loss": 0.6309, + "step": 24422 + }, + { + "epoch": 13.644134078212291, + "grad_norm": 0.40578797459602356, + "learning_rate": 0.00031904761904761903, + "loss": 0.3865, + "step": 24423 + }, + { + "epoch": 13.644692737430168, + "grad_norm": 0.4975600838661194, + "learning_rate": 0.0003190196078431373, + "loss": 0.4856, + "step": 24424 + }, + { + "epoch": 13.645251396648044, + "grad_norm": 1.1321818828582764, + "learning_rate": 0.00031899159663865544, + "loss": 0.3811, + "step": 24425 + }, + { + "epoch": 13.645810055865923, + "grad_norm": 0.7656422257423401, + "learning_rate": 0.0003189635854341737, + "loss": 0.444, + "step": 24426 + }, + { + "epoch": 13.6463687150838, + "grad_norm": 0.412936806678772, + "learning_rate": 0.0003189355742296919, + "loss": 0.3771, + "step": 24427 + }, + { + "epoch": 13.646927374301676, + "grad_norm": 0.5320417881011963, + "learning_rate": 0.00031890756302521006, + "loss": 0.6079, + "step": 24428 + }, + { + "epoch": 13.647486033519552, + "grad_norm": 0.3536711633205414, + "learning_rate": 0.0003188795518207283, + "loss": 0.4358, + "step": 24429 + }, + { + "epoch": 13.64804469273743, + "grad_norm": 0.5484802722930908, + "learning_rate": 0.00031885154061624647, + "loss": 0.5392, + "step": 24430 + }, + { + "epoch": 13.648603351955307, + "grad_norm": 0.4762323498725891, + "learning_rate": 0.00031882352941176473, + "loss": 0.3719, + "step": 24431 + }, + { + "epoch": 13.649162011173184, + "grad_norm": 0.7804234027862549, + "learning_rate": 0.00031879551820728294, + "loss": 0.3759, + "step": 24432 + }, + { + "epoch": 13.649720670391062, + "grad_norm": 0.4490956962108612, + "learning_rate": 0.0003187675070028011, + "loss": 0.4596, + "step": 24433 + }, + { + "epoch": 13.650279329608939, + "grad_norm": 0.5034146904945374, + "learning_rate": 0.00031873949579831935, + "loss": 0.3284, + "step": 24434 + }, + { + "epoch": 13.650837988826815, + "grad_norm": 0.5368829369544983, + "learning_rate": 0.00031871148459383756, + "loss": 0.3822, + "step": 24435 + }, + { + "epoch": 13.651396648044694, + "grad_norm": 0.48756930232048035, + "learning_rate": 0.00031868347338935576, + "loss": 0.413, + "step": 24436 + }, + { + "epoch": 13.65195530726257, + "grad_norm": 0.5027840733528137, + "learning_rate": 0.00031865546218487397, + "loss": 0.3747, + "step": 24437 + }, + { + "epoch": 13.652513966480447, + "grad_norm": 0.8145759105682373, + "learning_rate": 0.0003186274509803921, + "loss": 0.3451, + "step": 24438 + }, + { + "epoch": 13.653072625698323, + "grad_norm": 1.5784149169921875, + "learning_rate": 0.0003185994397759104, + "loss": 0.4663, + "step": 24439 + }, + { + "epoch": 13.653631284916202, + "grad_norm": 0.4120514392852783, + "learning_rate": 0.0003185714285714286, + "loss": 0.5175, + "step": 24440 + }, + { + "epoch": 13.654189944134078, + "grad_norm": 0.9323202967643738, + "learning_rate": 0.0003185434173669468, + "loss": 0.6399, + "step": 24441 + }, + { + "epoch": 13.654748603351955, + "grad_norm": 6.808210372924805, + "learning_rate": 0.000318515406162465, + "loss": 0.4985, + "step": 24442 + }, + { + "epoch": 13.655307262569833, + "grad_norm": 0.3559451699256897, + "learning_rate": 0.0003184873949579832, + "loss": 0.4026, + "step": 24443 + }, + { + "epoch": 13.65586592178771, + "grad_norm": 0.391656756401062, + "learning_rate": 0.0003184593837535014, + "loss": 0.4318, + "step": 24444 + }, + { + "epoch": 13.656424581005586, + "grad_norm": 0.9895601272583008, + "learning_rate": 0.0003184313725490196, + "loss": 0.4206, + "step": 24445 + }, + { + "epoch": 13.656983240223465, + "grad_norm": 0.7245948314666748, + "learning_rate": 0.0003184033613445378, + "loss": 0.3907, + "step": 24446 + }, + { + "epoch": 13.657541899441341, + "grad_norm": 6.7192792892456055, + "learning_rate": 0.00031837535014005603, + "loss": 0.3928, + "step": 24447 + }, + { + "epoch": 13.658100558659218, + "grad_norm": 1.7429684400558472, + "learning_rate": 0.00031834733893557423, + "loss": 0.4669, + "step": 24448 + }, + { + "epoch": 13.658659217877094, + "grad_norm": 0.46691322326660156, + "learning_rate": 0.00031831932773109244, + "loss": 0.4856, + "step": 24449 + }, + { + "epoch": 13.659217877094973, + "grad_norm": 0.47422346472740173, + "learning_rate": 0.00031829131652661065, + "loss": 0.3313, + "step": 24450 + }, + { + "epoch": 13.65977653631285, + "grad_norm": 0.8414809703826904, + "learning_rate": 0.0003182633053221289, + "loss": 0.3155, + "step": 24451 + }, + { + "epoch": 13.660335195530726, + "grad_norm": 0.3099393844604492, + "learning_rate": 0.00031823529411764706, + "loss": 0.341, + "step": 24452 + }, + { + "epoch": 13.660893854748604, + "grad_norm": 0.6694900393486023, + "learning_rate": 0.00031820728291316526, + "loss": 0.4794, + "step": 24453 + }, + { + "epoch": 13.66145251396648, + "grad_norm": 0.5306008458137512, + "learning_rate": 0.00031817927170868347, + "loss": 0.4684, + "step": 24454 + }, + { + "epoch": 13.662011173184357, + "grad_norm": 3.5476903915405273, + "learning_rate": 0.0003181512605042017, + "loss": 0.4257, + "step": 24455 + }, + { + "epoch": 13.662569832402234, + "grad_norm": 0.7237918376922607, + "learning_rate": 0.00031812324929971994, + "loss": 0.4807, + "step": 24456 + }, + { + "epoch": 13.663128491620112, + "grad_norm": 0.4164571464061737, + "learning_rate": 0.0003180952380952381, + "loss": 0.3361, + "step": 24457 + }, + { + "epoch": 13.663687150837989, + "grad_norm": 0.3449452817440033, + "learning_rate": 0.0003180672268907563, + "loss": 0.3426, + "step": 24458 + }, + { + "epoch": 13.664245810055865, + "grad_norm": 0.699264645576477, + "learning_rate": 0.00031803921568627455, + "loss": 0.4645, + "step": 24459 + }, + { + "epoch": 13.664804469273744, + "grad_norm": 0.6480334401130676, + "learning_rate": 0.0003180112044817927, + "loss": 0.4169, + "step": 24460 + }, + { + "epoch": 13.66536312849162, + "grad_norm": 0.3999210596084595, + "learning_rate": 0.00031798319327731097, + "loss": 0.4488, + "step": 24461 + }, + { + "epoch": 13.665921787709497, + "grad_norm": 0.42112091183662415, + "learning_rate": 0.0003179551820728291, + "loss": 0.3699, + "step": 24462 + }, + { + "epoch": 13.666480446927375, + "grad_norm": 0.38431501388549805, + "learning_rate": 0.0003179271708683473, + "loss": 0.3726, + "step": 24463 + }, + { + "epoch": 13.667039106145252, + "grad_norm": 0.6671479940414429, + "learning_rate": 0.0003178991596638656, + "loss": 0.3603, + "step": 24464 + }, + { + "epoch": 13.667597765363128, + "grad_norm": 0.49649402499198914, + "learning_rate": 0.00031787114845938374, + "loss": 0.5209, + "step": 24465 + }, + { + "epoch": 13.668156424581005, + "grad_norm": 0.5430819988250732, + "learning_rate": 0.000317843137254902, + "loss": 0.4982, + "step": 24466 + }, + { + "epoch": 13.668715083798883, + "grad_norm": 0.5576528310775757, + "learning_rate": 0.0003178151260504202, + "loss": 0.3561, + "step": 24467 + }, + { + "epoch": 13.66927374301676, + "grad_norm": 0.8587684631347656, + "learning_rate": 0.00031778711484593835, + "loss": 0.4854, + "step": 24468 + }, + { + "epoch": 13.669832402234636, + "grad_norm": 1.5889253616333008, + "learning_rate": 0.0003177591036414566, + "loss": 0.3674, + "step": 24469 + }, + { + "epoch": 13.670391061452515, + "grad_norm": 0.4303986430168152, + "learning_rate": 0.00031773109243697477, + "loss": 0.4575, + "step": 24470 + }, + { + "epoch": 13.670949720670391, + "grad_norm": 1.6989390850067139, + "learning_rate": 0.000317703081232493, + "loss": 0.4801, + "step": 24471 + }, + { + "epoch": 13.671508379888268, + "grad_norm": 0.4176202714443207, + "learning_rate": 0.00031767507002801123, + "loss": 0.4061, + "step": 24472 + }, + { + "epoch": 13.672067039106146, + "grad_norm": 0.48989418148994446, + "learning_rate": 0.0003176470588235294, + "loss": 0.3934, + "step": 24473 + }, + { + "epoch": 13.672625698324023, + "grad_norm": 0.5725489854812622, + "learning_rate": 0.00031761904761904764, + "loss": 0.5604, + "step": 24474 + }, + { + "epoch": 13.6731843575419, + "grad_norm": 0.5144396424293518, + "learning_rate": 0.00031759103641456585, + "loss": 0.4342, + "step": 24475 + }, + { + "epoch": 13.673743016759776, + "grad_norm": 0.4507734775543213, + "learning_rate": 0.00031756302521008406, + "loss": 0.3368, + "step": 24476 + }, + { + "epoch": 13.674301675977654, + "grad_norm": 0.4781704843044281, + "learning_rate": 0.00031753501400560226, + "loss": 0.4302, + "step": 24477 + }, + { + "epoch": 13.67486033519553, + "grad_norm": 0.532047688961029, + "learning_rate": 0.0003175070028011204, + "loss": 0.5614, + "step": 24478 + }, + { + "epoch": 13.675418994413407, + "grad_norm": 0.5552019476890564, + "learning_rate": 0.0003174789915966387, + "loss": 0.4627, + "step": 24479 + }, + { + "epoch": 13.675977653631286, + "grad_norm": 0.7362289428710938, + "learning_rate": 0.0003174509803921569, + "loss": 0.4887, + "step": 24480 + }, + { + "epoch": 13.676536312849162, + "grad_norm": 0.43260982632637024, + "learning_rate": 0.0003174229691876751, + "loss": 0.4835, + "step": 24481 + }, + { + "epoch": 13.677094972067039, + "grad_norm": 0.5450297594070435, + "learning_rate": 0.0003173949579831933, + "loss": 0.5921, + "step": 24482 + }, + { + "epoch": 13.677653631284915, + "grad_norm": 0.4004102051258087, + "learning_rate": 0.0003173669467787115, + "loss": 0.3766, + "step": 24483 + }, + { + "epoch": 13.678212290502794, + "grad_norm": 0.5877929329872131, + "learning_rate": 0.0003173389355742297, + "loss": 0.3767, + "step": 24484 + }, + { + "epoch": 13.67877094972067, + "grad_norm": 0.38973262906074524, + "learning_rate": 0.0003173109243697479, + "loss": 0.451, + "step": 24485 + }, + { + "epoch": 13.679329608938547, + "grad_norm": 0.6328718066215515, + "learning_rate": 0.0003172829131652661, + "loss": 0.5041, + "step": 24486 + }, + { + "epoch": 13.679888268156425, + "grad_norm": 0.6081767678260803, + "learning_rate": 0.0003172549019607843, + "loss": 0.3969, + "step": 24487 + }, + { + "epoch": 13.680446927374302, + "grad_norm": 0.3715354800224304, + "learning_rate": 0.00031722689075630253, + "loss": 0.3678, + "step": 24488 + }, + { + "epoch": 13.681005586592178, + "grad_norm": 0.39989131689071655, + "learning_rate": 0.00031719887955182073, + "loss": 0.3856, + "step": 24489 + }, + { + "epoch": 13.681564245810057, + "grad_norm": 0.4814515709877014, + "learning_rate": 0.00031717086834733894, + "loss": 0.3994, + "step": 24490 + }, + { + "epoch": 13.682122905027933, + "grad_norm": 0.4881913363933563, + "learning_rate": 0.00031714285714285715, + "loss": 0.371, + "step": 24491 + }, + { + "epoch": 13.68268156424581, + "grad_norm": 0.8198256492614746, + "learning_rate": 0.00031711484593837535, + "loss": 0.5498, + "step": 24492 + }, + { + "epoch": 13.683240223463688, + "grad_norm": 0.36178117990493774, + "learning_rate": 0.00031708683473389356, + "loss": 0.3272, + "step": 24493 + }, + { + "epoch": 13.683798882681565, + "grad_norm": 0.3471197187900543, + "learning_rate": 0.00031705882352941176, + "loss": 0.3755, + "step": 24494 + }, + { + "epoch": 13.684357541899441, + "grad_norm": 0.5136399269104004, + "learning_rate": 0.00031703081232492997, + "loss": 0.5028, + "step": 24495 + }, + { + "epoch": 13.684916201117318, + "grad_norm": 0.7959884405136108, + "learning_rate": 0.0003170028011204482, + "loss": 0.4318, + "step": 24496 + }, + { + "epoch": 13.685474860335196, + "grad_norm": 0.3880917727947235, + "learning_rate": 0.0003169747899159664, + "loss": 0.4967, + "step": 24497 + }, + { + "epoch": 13.686033519553073, + "grad_norm": 0.48721611499786377, + "learning_rate": 0.0003169467787114846, + "loss": 0.3599, + "step": 24498 + }, + { + "epoch": 13.68659217877095, + "grad_norm": 0.48447638750076294, + "learning_rate": 0.00031691876750700285, + "loss": 0.3612, + "step": 24499 + }, + { + "epoch": 13.687150837988828, + "grad_norm": 0.5575209856033325, + "learning_rate": 0.000316890756302521, + "loss": 0.3255, + "step": 24500 + }, + { + "epoch": 13.687150837988828, + "eval_cer": 0.08666928523889016, + "eval_loss": 0.32727304100990295, + "eval_runtime": 55.4243, + "eval_samples_per_second": 81.877, + "eval_steps_per_second": 5.124, + "eval_wer": 0.3433982290997458, + "step": 24500 + }, + { + "epoch": 13.687709497206704, + "grad_norm": 0.667323887348175, + "learning_rate": 0.0003168627450980392, + "loss": 0.4303, + "step": 24501 + }, + { + "epoch": 13.68826815642458, + "grad_norm": 0.6596452593803406, + "learning_rate": 0.0003168347338935574, + "loss": 0.4048, + "step": 24502 + }, + { + "epoch": 13.688826815642457, + "grad_norm": 0.6369730234146118, + "learning_rate": 0.0003168067226890756, + "loss": 0.4133, + "step": 24503 + }, + { + "epoch": 13.689385474860336, + "grad_norm": 0.3578794598579407, + "learning_rate": 0.0003167787114845939, + "loss": 0.3067, + "step": 24504 + }, + { + "epoch": 13.689944134078212, + "grad_norm": 0.7099540829658508, + "learning_rate": 0.00031675070028011203, + "loss": 0.5211, + "step": 24505 + }, + { + "epoch": 13.690502793296089, + "grad_norm": 0.6109747290611267, + "learning_rate": 0.00031672268907563024, + "loss": 0.6047, + "step": 24506 + }, + { + "epoch": 13.691061452513967, + "grad_norm": 0.45618754625320435, + "learning_rate": 0.0003166946778711485, + "loss": 0.3851, + "step": 24507 + }, + { + "epoch": 13.691620111731844, + "grad_norm": 1.5236531496047974, + "learning_rate": 0.00031666666666666665, + "loss": 0.4273, + "step": 24508 + }, + { + "epoch": 13.69217877094972, + "grad_norm": 3.017887830734253, + "learning_rate": 0.0003166386554621849, + "loss": 0.4465, + "step": 24509 + }, + { + "epoch": 13.692737430167599, + "grad_norm": 0.806997537612915, + "learning_rate": 0.00031661064425770306, + "loss": 0.3675, + "step": 24510 + }, + { + "epoch": 13.693296089385475, + "grad_norm": 0.5202308893203735, + "learning_rate": 0.00031658263305322127, + "loss": 0.4247, + "step": 24511 + }, + { + "epoch": 13.693854748603352, + "grad_norm": 0.4889890253543854, + "learning_rate": 0.0003165546218487395, + "loss": 0.4221, + "step": 24512 + }, + { + "epoch": 13.694413407821228, + "grad_norm": 0.32687246799468994, + "learning_rate": 0.0003165266106442577, + "loss": 0.397, + "step": 24513 + }, + { + "epoch": 13.694972067039107, + "grad_norm": 7.667118549346924, + "learning_rate": 0.00031649859943977594, + "loss": 0.4309, + "step": 24514 + }, + { + "epoch": 13.695530726256983, + "grad_norm": 0.41886138916015625, + "learning_rate": 0.00031647058823529414, + "loss": 0.472, + "step": 24515 + }, + { + "epoch": 13.69608938547486, + "grad_norm": 0.7411496639251709, + "learning_rate": 0.0003164425770308123, + "loss": 0.373, + "step": 24516 + }, + { + "epoch": 13.696648044692738, + "grad_norm": 0.5268422961235046, + "learning_rate": 0.00031641456582633056, + "loss": 0.3805, + "step": 24517 + }, + { + "epoch": 13.697206703910615, + "grad_norm": 0.6885746717453003, + "learning_rate": 0.0003163865546218487, + "loss": 0.4061, + "step": 24518 + }, + { + "epoch": 13.697765363128491, + "grad_norm": 0.5006151795387268, + "learning_rate": 0.00031635854341736697, + "loss": 0.3743, + "step": 24519 + }, + { + "epoch": 13.69832402234637, + "grad_norm": 0.3941054046154022, + "learning_rate": 0.0003163305322128852, + "loss": 0.405, + "step": 24520 + }, + { + "epoch": 13.698882681564246, + "grad_norm": 0.6299375295639038, + "learning_rate": 0.0003163025210084033, + "loss": 0.421, + "step": 24521 + }, + { + "epoch": 13.699441340782123, + "grad_norm": 0.8714436888694763, + "learning_rate": 0.0003162745098039216, + "loss": 0.4772, + "step": 24522 + }, + { + "epoch": 13.7, + "grad_norm": 0.32743626832962036, + "learning_rate": 0.0003162464985994398, + "loss": 0.2963, + "step": 24523 + }, + { + "epoch": 13.700558659217878, + "grad_norm": 0.4370838403701782, + "learning_rate": 0.000316218487394958, + "loss": 0.3863, + "step": 24524 + }, + { + "epoch": 13.701117318435754, + "grad_norm": 0.40452754497528076, + "learning_rate": 0.0003161904761904762, + "loss": 0.3614, + "step": 24525 + }, + { + "epoch": 13.70167597765363, + "grad_norm": 0.5632776618003845, + "learning_rate": 0.00031616246498599436, + "loss": 0.4264, + "step": 24526 + }, + { + "epoch": 13.702234636871509, + "grad_norm": 0.6831368803977966, + "learning_rate": 0.0003161344537815126, + "loss": 0.4612, + "step": 24527 + }, + { + "epoch": 13.702793296089386, + "grad_norm": 0.5946988463401794, + "learning_rate": 0.0003161064425770308, + "loss": 0.5087, + "step": 24528 + }, + { + "epoch": 13.703351955307262, + "grad_norm": 0.4070788323879242, + "learning_rate": 0.00031607843137254903, + "loss": 0.3535, + "step": 24529 + }, + { + "epoch": 13.703910614525139, + "grad_norm": 0.44555631279945374, + "learning_rate": 0.00031605042016806723, + "loss": 0.4467, + "step": 24530 + }, + { + "epoch": 13.704469273743017, + "grad_norm": 0.5728515386581421, + "learning_rate": 0.00031602240896358544, + "loss": 0.4443, + "step": 24531 + }, + { + "epoch": 13.705027932960894, + "grad_norm": 0.6138285994529724, + "learning_rate": 0.00031599439775910365, + "loss": 0.363, + "step": 24532 + }, + { + "epoch": 13.70558659217877, + "grad_norm": 0.39038747549057007, + "learning_rate": 0.00031596638655462185, + "loss": 0.4248, + "step": 24533 + }, + { + "epoch": 13.706145251396649, + "grad_norm": 0.4148624837398529, + "learning_rate": 0.00031593837535014006, + "loss": 0.4268, + "step": 24534 + }, + { + "epoch": 13.706703910614525, + "grad_norm": 0.4829498529434204, + "learning_rate": 0.00031591036414565826, + "loss": 0.4783, + "step": 24535 + }, + { + "epoch": 13.707262569832402, + "grad_norm": 0.4681767225265503, + "learning_rate": 0.00031588235294117647, + "loss": 0.4102, + "step": 24536 + }, + { + "epoch": 13.70782122905028, + "grad_norm": 0.39798927307128906, + "learning_rate": 0.0003158543417366947, + "loss": 0.3487, + "step": 24537 + }, + { + "epoch": 13.708379888268157, + "grad_norm": 0.4582329988479614, + "learning_rate": 0.0003158263305322129, + "loss": 0.3083, + "step": 24538 + }, + { + "epoch": 13.708938547486033, + "grad_norm": 0.48490220308303833, + "learning_rate": 0.00031579831932773114, + "loss": 0.4, + "step": 24539 + }, + { + "epoch": 13.70949720670391, + "grad_norm": 0.38172608613967896, + "learning_rate": 0.0003157703081232493, + "loss": 0.532, + "step": 24540 + }, + { + "epoch": 13.710055865921788, + "grad_norm": 0.35551315546035767, + "learning_rate": 0.0003157422969187675, + "loss": 0.3808, + "step": 24541 + }, + { + "epoch": 13.710614525139665, + "grad_norm": 0.288070946931839, + "learning_rate": 0.0003157142857142857, + "loss": 0.2339, + "step": 24542 + }, + { + "epoch": 13.711173184357541, + "grad_norm": 0.5457075238227844, + "learning_rate": 0.0003156862745098039, + "loss": 0.6342, + "step": 24543 + }, + { + "epoch": 13.71173184357542, + "grad_norm": 0.3962937295436859, + "learning_rate": 0.00031565826330532217, + "loss": 0.3717, + "step": 24544 + }, + { + "epoch": 13.712290502793296, + "grad_norm": 0.4008278548717499, + "learning_rate": 0.0003156302521008403, + "loss": 0.3572, + "step": 24545 + }, + { + "epoch": 13.712849162011173, + "grad_norm": 0.9183762073516846, + "learning_rate": 0.00031560224089635853, + "loss": 0.3559, + "step": 24546 + }, + { + "epoch": 13.713407821229051, + "grad_norm": 0.2623822093009949, + "learning_rate": 0.0003155742296918768, + "loss": 0.2553, + "step": 24547 + }, + { + "epoch": 13.713966480446928, + "grad_norm": 0.5116667151451111, + "learning_rate": 0.00031554621848739494, + "loss": 0.4813, + "step": 24548 + }, + { + "epoch": 13.714525139664804, + "grad_norm": 0.5181401968002319, + "learning_rate": 0.0003155182072829132, + "loss": 0.4459, + "step": 24549 + }, + { + "epoch": 13.71508379888268, + "grad_norm": 0.5862475037574768, + "learning_rate": 0.00031549019607843135, + "loss": 0.4588, + "step": 24550 + }, + { + "epoch": 13.71564245810056, + "grad_norm": 0.5037033557891846, + "learning_rate": 0.00031546218487394956, + "loss": 0.4587, + "step": 24551 + }, + { + "epoch": 13.716201117318436, + "grad_norm": 0.5580079555511475, + "learning_rate": 0.0003154341736694678, + "loss": 0.4885, + "step": 24552 + }, + { + "epoch": 13.716759776536312, + "grad_norm": 0.5429263114929199, + "learning_rate": 0.00031540616246498597, + "loss": 0.3158, + "step": 24553 + }, + { + "epoch": 13.71731843575419, + "grad_norm": 0.5533357858657837, + "learning_rate": 0.00031537815126050423, + "loss": 0.6756, + "step": 24554 + }, + { + "epoch": 13.717877094972067, + "grad_norm": 1.255815863609314, + "learning_rate": 0.00031535014005602244, + "loss": 0.3585, + "step": 24555 + }, + { + "epoch": 13.718435754189944, + "grad_norm": 0.5340338945388794, + "learning_rate": 0.0003153221288515406, + "loss": 0.3875, + "step": 24556 + }, + { + "epoch": 13.71899441340782, + "grad_norm": 0.4080115854740143, + "learning_rate": 0.00031529411764705885, + "loss": 0.316, + "step": 24557 + }, + { + "epoch": 13.719553072625699, + "grad_norm": 0.6611988544464111, + "learning_rate": 0.000315266106442577, + "loss": 0.4606, + "step": 24558 + }, + { + "epoch": 13.720111731843575, + "grad_norm": 0.3810964524745941, + "learning_rate": 0.00031523809523809526, + "loss": 0.4552, + "step": 24559 + }, + { + "epoch": 13.720670391061452, + "grad_norm": 1.0721698999404907, + "learning_rate": 0.00031521008403361347, + "loss": 0.4714, + "step": 24560 + }, + { + "epoch": 13.72122905027933, + "grad_norm": 0.7423052787780762, + "learning_rate": 0.0003151820728291316, + "loss": 0.3403, + "step": 24561 + }, + { + "epoch": 13.721787709497207, + "grad_norm": 0.49939483404159546, + "learning_rate": 0.0003151540616246499, + "loss": 0.5856, + "step": 24562 + }, + { + "epoch": 13.722346368715083, + "grad_norm": 0.44711172580718994, + "learning_rate": 0.0003151260504201681, + "loss": 0.3844, + "step": 24563 + }, + { + "epoch": 13.722905027932962, + "grad_norm": 0.4472387135028839, + "learning_rate": 0.0003150980392156863, + "loss": 0.4338, + "step": 24564 + }, + { + "epoch": 13.723463687150838, + "grad_norm": 0.4355800747871399, + "learning_rate": 0.0003150700280112045, + "loss": 0.3082, + "step": 24565 + }, + { + "epoch": 13.724022346368715, + "grad_norm": 0.4703373610973358, + "learning_rate": 0.00031504201680672265, + "loss": 0.4255, + "step": 24566 + }, + { + "epoch": 13.724581005586593, + "grad_norm": 0.6501026153564453, + "learning_rate": 0.0003150140056022409, + "loss": 0.3076, + "step": 24567 + }, + { + "epoch": 13.72513966480447, + "grad_norm": 0.5645608901977539, + "learning_rate": 0.0003149859943977591, + "loss": 0.5815, + "step": 24568 + }, + { + "epoch": 13.725698324022346, + "grad_norm": 0.46356844902038574, + "learning_rate": 0.0003149579831932773, + "loss": 0.313, + "step": 24569 + }, + { + "epoch": 13.726256983240223, + "grad_norm": 0.38465526700019836, + "learning_rate": 0.00031492997198879553, + "loss": 0.4331, + "step": 24570 + }, + { + "epoch": 13.726815642458101, + "grad_norm": 0.515557587146759, + "learning_rate": 0.00031490196078431373, + "loss": 0.3329, + "step": 24571 + }, + { + "epoch": 13.727374301675978, + "grad_norm": 0.5560299754142761, + "learning_rate": 0.00031487394957983194, + "loss": 0.5442, + "step": 24572 + }, + { + "epoch": 13.727932960893854, + "grad_norm": 0.5835617184638977, + "learning_rate": 0.00031484593837535015, + "loss": 0.3897, + "step": 24573 + }, + { + "epoch": 13.728491620111733, + "grad_norm": 0.5543268322944641, + "learning_rate": 0.00031481792717086835, + "loss": 0.413, + "step": 24574 + }, + { + "epoch": 13.72905027932961, + "grad_norm": 0.9479879140853882, + "learning_rate": 0.00031478991596638656, + "loss": 0.4342, + "step": 24575 + }, + { + "epoch": 13.729608938547486, + "grad_norm": 0.40735381841659546, + "learning_rate": 0.00031476190476190476, + "loss": 0.4029, + "step": 24576 + }, + { + "epoch": 13.730167597765362, + "grad_norm": 0.43447422981262207, + "learning_rate": 0.00031473389355742297, + "loss": 0.3302, + "step": 24577 + }, + { + "epoch": 13.73072625698324, + "grad_norm": 0.500577986240387, + "learning_rate": 0.0003147058823529412, + "loss": 0.3815, + "step": 24578 + }, + { + "epoch": 13.731284916201117, + "grad_norm": 0.39958199858665466, + "learning_rate": 0.00031467787114845944, + "loss": 0.378, + "step": 24579 + }, + { + "epoch": 13.731843575418994, + "grad_norm": 0.3292566239833832, + "learning_rate": 0.0003146498599439776, + "loss": 0.4458, + "step": 24580 + }, + { + "epoch": 13.732402234636872, + "grad_norm": 0.4399887025356293, + "learning_rate": 0.0003146218487394958, + "loss": 0.4577, + "step": 24581 + }, + { + "epoch": 13.732960893854749, + "grad_norm": 0.5361818671226501, + "learning_rate": 0.000314593837535014, + "loss": 0.4597, + "step": 24582 + }, + { + "epoch": 13.733519553072625, + "grad_norm": 0.3752124011516571, + "learning_rate": 0.0003145658263305322, + "loss": 0.3746, + "step": 24583 + }, + { + "epoch": 13.734078212290502, + "grad_norm": 0.8983011841773987, + "learning_rate": 0.00031453781512605047, + "loss": 0.4075, + "step": 24584 + }, + { + "epoch": 13.73463687150838, + "grad_norm": 0.42087504267692566, + "learning_rate": 0.0003145098039215686, + "loss": 0.4069, + "step": 24585 + }, + { + "epoch": 13.735195530726257, + "grad_norm": 0.684815526008606, + "learning_rate": 0.0003144817927170868, + "loss": 0.5309, + "step": 24586 + }, + { + "epoch": 13.735754189944133, + "grad_norm": 0.48172667622566223, + "learning_rate": 0.0003144537815126051, + "loss": 0.4075, + "step": 24587 + }, + { + "epoch": 13.736312849162012, + "grad_norm": 0.3652772307395935, + "learning_rate": 0.00031442577030812324, + "loss": 0.3708, + "step": 24588 + }, + { + "epoch": 13.736871508379888, + "grad_norm": 0.5555179119110107, + "learning_rate": 0.0003143977591036415, + "loss": 0.4989, + "step": 24589 + }, + { + "epoch": 13.737430167597765, + "grad_norm": 1.068904161453247, + "learning_rate": 0.00031436974789915965, + "loss": 0.4675, + "step": 24590 + }, + { + "epoch": 13.737988826815643, + "grad_norm": 0.6132938861846924, + "learning_rate": 0.00031434173669467785, + "loss": 0.4282, + "step": 24591 + }, + { + "epoch": 13.73854748603352, + "grad_norm": 0.41793474555015564, + "learning_rate": 0.0003143137254901961, + "loss": 0.3708, + "step": 24592 + }, + { + "epoch": 13.739106145251396, + "grad_norm": 0.47035282850265503, + "learning_rate": 0.00031428571428571427, + "loss": 0.4435, + "step": 24593 + }, + { + "epoch": 13.739664804469275, + "grad_norm": 0.4950382709503174, + "learning_rate": 0.0003142577030812325, + "loss": 0.3104, + "step": 24594 + }, + { + "epoch": 13.740223463687151, + "grad_norm": 0.5601530075073242, + "learning_rate": 0.00031422969187675073, + "loss": 0.3845, + "step": 24595 + }, + { + "epoch": 13.740782122905028, + "grad_norm": 0.7217606902122498, + "learning_rate": 0.0003142016806722689, + "loss": 0.38, + "step": 24596 + }, + { + "epoch": 13.741340782122904, + "grad_norm": 0.7079225182533264, + "learning_rate": 0.00031417366946778714, + "loss": 0.4124, + "step": 24597 + }, + { + "epoch": 13.741899441340783, + "grad_norm": 0.36488285660743713, + "learning_rate": 0.0003141456582633053, + "loss": 0.352, + "step": 24598 + }, + { + "epoch": 13.74245810055866, + "grad_norm": 0.6521407961845398, + "learning_rate": 0.00031411764705882356, + "loss": 0.3825, + "step": 24599 + }, + { + "epoch": 13.743016759776536, + "grad_norm": 0.9532288312911987, + "learning_rate": 0.00031408963585434176, + "loss": 0.3708, + "step": 24600 + }, + { + "epoch": 13.743575418994414, + "grad_norm": 1.2292375564575195, + "learning_rate": 0.0003140616246498599, + "loss": 0.4161, + "step": 24601 + }, + { + "epoch": 13.74413407821229, + "grad_norm": 0.7507383227348328, + "learning_rate": 0.0003140336134453782, + "loss": 0.3888, + "step": 24602 + }, + { + "epoch": 13.744692737430167, + "grad_norm": 0.4155767858028412, + "learning_rate": 0.0003140056022408964, + "loss": 0.4368, + "step": 24603 + }, + { + "epoch": 13.745251396648044, + "grad_norm": 0.4549953043460846, + "learning_rate": 0.00031397759103641453, + "loss": 0.4851, + "step": 24604 + }, + { + "epoch": 13.745810055865922, + "grad_norm": 0.5594121217727661, + "learning_rate": 0.0003139495798319328, + "loss": 0.3797, + "step": 24605 + }, + { + "epoch": 13.746368715083799, + "grad_norm": 0.7021816968917847, + "learning_rate": 0.00031392156862745094, + "loss": 0.5175, + "step": 24606 + }, + { + "epoch": 13.746927374301675, + "grad_norm": 1.0182263851165771, + "learning_rate": 0.0003138935574229692, + "loss": 0.3267, + "step": 24607 + }, + { + "epoch": 13.747486033519554, + "grad_norm": 0.379764586687088, + "learning_rate": 0.0003138655462184874, + "loss": 0.3446, + "step": 24608 + }, + { + "epoch": 13.74804469273743, + "grad_norm": 0.4251542389392853, + "learning_rate": 0.00031383753501400556, + "loss": 0.3929, + "step": 24609 + }, + { + "epoch": 13.748603351955307, + "grad_norm": 0.3923177421092987, + "learning_rate": 0.0003138095238095238, + "loss": 0.3921, + "step": 24610 + }, + { + "epoch": 13.749162011173185, + "grad_norm": 0.47387468814849854, + "learning_rate": 0.00031378151260504203, + "loss": 0.4049, + "step": 24611 + }, + { + "epoch": 13.749720670391062, + "grad_norm": 0.4466247856616974, + "learning_rate": 0.00031375350140056023, + "loss": 0.4826, + "step": 24612 + }, + { + "epoch": 13.750279329608938, + "grad_norm": 0.5341809391975403, + "learning_rate": 0.00031372549019607844, + "loss": 0.5005, + "step": 24613 + }, + { + "epoch": 13.750837988826815, + "grad_norm": 0.6019898653030396, + "learning_rate": 0.0003136974789915966, + "loss": 0.3905, + "step": 24614 + }, + { + "epoch": 13.751396648044693, + "grad_norm": 0.36196956038475037, + "learning_rate": 0.00031366946778711485, + "loss": 0.3505, + "step": 24615 + }, + { + "epoch": 13.75195530726257, + "grad_norm": 0.5079441070556641, + "learning_rate": 0.00031364145658263306, + "loss": 0.4109, + "step": 24616 + }, + { + "epoch": 13.752513966480446, + "grad_norm": 0.46043869853019714, + "learning_rate": 0.00031361344537815126, + "loss": 0.3734, + "step": 24617 + }, + { + "epoch": 13.753072625698325, + "grad_norm": 0.8851267695426941, + "learning_rate": 0.00031358543417366947, + "loss": 0.4339, + "step": 24618 + }, + { + "epoch": 13.753631284916201, + "grad_norm": 0.47812914848327637, + "learning_rate": 0.0003135574229691877, + "loss": 0.3874, + "step": 24619 + }, + { + "epoch": 13.754189944134078, + "grad_norm": 0.34665754437446594, + "learning_rate": 0.0003135294117647059, + "loss": 0.3923, + "step": 24620 + }, + { + "epoch": 13.754748603351956, + "grad_norm": 0.8204054832458496, + "learning_rate": 0.0003135014005602241, + "loss": 0.3932, + "step": 24621 + }, + { + "epoch": 13.755307262569833, + "grad_norm": 0.6049415469169617, + "learning_rate": 0.00031347338935574235, + "loss": 0.3562, + "step": 24622 + }, + { + "epoch": 13.75586592178771, + "grad_norm": 0.3725706934928894, + "learning_rate": 0.0003134453781512605, + "loss": 0.3359, + "step": 24623 + }, + { + "epoch": 13.756424581005586, + "grad_norm": 0.3988759517669678, + "learning_rate": 0.0003134173669467787, + "loss": 0.376, + "step": 24624 + }, + { + "epoch": 13.756983240223464, + "grad_norm": 0.4468362331390381, + "learning_rate": 0.0003133893557422969, + "loss": 0.4223, + "step": 24625 + }, + { + "epoch": 13.75754189944134, + "grad_norm": 5.098926067352295, + "learning_rate": 0.0003133613445378151, + "loss": 0.4952, + "step": 24626 + }, + { + "epoch": 13.758100558659217, + "grad_norm": 0.45199814438819885, + "learning_rate": 0.0003133333333333334, + "loss": 0.3626, + "step": 24627 + }, + { + "epoch": 13.758659217877096, + "grad_norm": 0.5131231546401978, + "learning_rate": 0.00031330532212885153, + "loss": 0.4385, + "step": 24628 + }, + { + "epoch": 13.759217877094972, + "grad_norm": 0.8979408740997314, + "learning_rate": 0.00031327731092436974, + "loss": 0.4279, + "step": 24629 + }, + { + "epoch": 13.759776536312849, + "grad_norm": 0.8850966691970825, + "learning_rate": 0.000313249299719888, + "loss": 0.4999, + "step": 24630 + }, + { + "epoch": 13.760335195530725, + "grad_norm": 0.44493335485458374, + "learning_rate": 0.00031322128851540615, + "loss": 0.4163, + "step": 24631 + }, + { + "epoch": 13.760893854748604, + "grad_norm": 0.5462016463279724, + "learning_rate": 0.0003131932773109244, + "loss": 0.3428, + "step": 24632 + }, + { + "epoch": 13.76145251396648, + "grad_norm": 0.4455803632736206, + "learning_rate": 0.00031316526610644256, + "loss": 0.4576, + "step": 24633 + }, + { + "epoch": 13.762011173184357, + "grad_norm": 0.49655285477638245, + "learning_rate": 0.00031313725490196077, + "loss": 0.353, + "step": 24634 + }, + { + "epoch": 13.762569832402235, + "grad_norm": 0.5043083429336548, + "learning_rate": 0.000313109243697479, + "loss": 0.4215, + "step": 24635 + }, + { + "epoch": 13.763128491620112, + "grad_norm": 0.4754570424556732, + "learning_rate": 0.0003130812324929972, + "loss": 0.3636, + "step": 24636 + }, + { + "epoch": 13.763687150837988, + "grad_norm": 0.9718914031982422, + "learning_rate": 0.00031305322128851544, + "loss": 0.498, + "step": 24637 + }, + { + "epoch": 13.764245810055867, + "grad_norm": 0.3998493254184723, + "learning_rate": 0.00031302521008403364, + "loss": 0.3539, + "step": 24638 + }, + { + "epoch": 13.764804469273743, + "grad_norm": 0.6009743809700012, + "learning_rate": 0.0003129971988795518, + "loss": 0.4535, + "step": 24639 + }, + { + "epoch": 13.76536312849162, + "grad_norm": 0.9182845950126648, + "learning_rate": 0.00031296918767507006, + "loss": 0.432, + "step": 24640 + }, + { + "epoch": 13.765921787709498, + "grad_norm": 0.6865869164466858, + "learning_rate": 0.0003129411764705882, + "loss": 0.342, + "step": 24641 + }, + { + "epoch": 13.766480446927375, + "grad_norm": 0.402852326631546, + "learning_rate": 0.00031291316526610647, + "loss": 0.4819, + "step": 24642 + }, + { + "epoch": 13.767039106145251, + "grad_norm": 0.4187225103378296, + "learning_rate": 0.0003128851540616247, + "loss": 0.4365, + "step": 24643 + }, + { + "epoch": 13.767597765363128, + "grad_norm": 2.5152461528778076, + "learning_rate": 0.0003128571428571428, + "loss": 0.4728, + "step": 24644 + }, + { + "epoch": 13.768156424581006, + "grad_norm": 0.4061499536037445, + "learning_rate": 0.0003128291316526611, + "loss": 0.4361, + "step": 24645 + }, + { + "epoch": 13.768715083798883, + "grad_norm": 0.4748583436012268, + "learning_rate": 0.0003128011204481793, + "loss": 0.4209, + "step": 24646 + }, + { + "epoch": 13.76927374301676, + "grad_norm": 0.5399193167686462, + "learning_rate": 0.0003127731092436975, + "loss": 0.3765, + "step": 24647 + }, + { + "epoch": 13.769832402234638, + "grad_norm": 0.417892724275589, + "learning_rate": 0.0003127450980392157, + "loss": 0.4622, + "step": 24648 + }, + { + "epoch": 13.770391061452514, + "grad_norm": 3.1574244499206543, + "learning_rate": 0.00031271708683473386, + "loss": 0.5254, + "step": 24649 + }, + { + "epoch": 13.77094972067039, + "grad_norm": 0.6645531058311462, + "learning_rate": 0.0003126890756302521, + "loss": 0.6908, + "step": 24650 + }, + { + "epoch": 13.771508379888267, + "grad_norm": 0.33625566959381104, + "learning_rate": 0.0003126610644257703, + "loss": 0.3904, + "step": 24651 + }, + { + "epoch": 13.772067039106146, + "grad_norm": 0.8538450002670288, + "learning_rate": 0.00031263305322128853, + "loss": 0.4866, + "step": 24652 + }, + { + "epoch": 13.772625698324022, + "grad_norm": 0.43366724252700806, + "learning_rate": 0.00031260504201680673, + "loss": 0.4575, + "step": 24653 + }, + { + "epoch": 13.773184357541899, + "grad_norm": 0.5008127093315125, + "learning_rate": 0.00031257703081232494, + "loss": 0.4043, + "step": 24654 + }, + { + "epoch": 13.773743016759777, + "grad_norm": 0.734473705291748, + "learning_rate": 0.00031254901960784315, + "loss": 0.4041, + "step": 24655 + }, + { + "epoch": 13.774301675977654, + "grad_norm": 0.6329787373542786, + "learning_rate": 0.00031252100840336135, + "loss": 0.456, + "step": 24656 + }, + { + "epoch": 13.77486033519553, + "grad_norm": 0.5221911668777466, + "learning_rate": 0.00031249299719887956, + "loss": 0.4338, + "step": 24657 + }, + { + "epoch": 13.775418994413407, + "grad_norm": 0.47905802726745605, + "learning_rate": 0.00031246498599439776, + "loss": 0.3983, + "step": 24658 + }, + { + "epoch": 13.775977653631285, + "grad_norm": 0.4423077702522278, + "learning_rate": 0.00031243697478991597, + "loss": 0.4912, + "step": 24659 + }, + { + "epoch": 13.776536312849162, + "grad_norm": 0.568717360496521, + "learning_rate": 0.0003124089635854342, + "loss": 0.5354, + "step": 24660 + }, + { + "epoch": 13.777094972067038, + "grad_norm": 0.6811214685440063, + "learning_rate": 0.0003123809523809524, + "loss": 0.4527, + "step": 24661 + }, + { + "epoch": 13.777653631284917, + "grad_norm": 0.3925004303455353, + "learning_rate": 0.00031235294117647064, + "loss": 0.3469, + "step": 24662 + }, + { + "epoch": 13.778212290502793, + "grad_norm": 0.4180448651313782, + "learning_rate": 0.0003123249299719888, + "loss": 0.4302, + "step": 24663 + }, + { + "epoch": 13.77877094972067, + "grad_norm": 0.35556793212890625, + "learning_rate": 0.000312296918767507, + "loss": 0.3462, + "step": 24664 + }, + { + "epoch": 13.779329608938548, + "grad_norm": 0.5247901678085327, + "learning_rate": 0.0003122689075630252, + "loss": 0.4785, + "step": 24665 + }, + { + "epoch": 13.779888268156425, + "grad_norm": 0.6282362341880798, + "learning_rate": 0.0003122408963585434, + "loss": 0.4286, + "step": 24666 + }, + { + "epoch": 13.780446927374301, + "grad_norm": 3.9609861373901367, + "learning_rate": 0.00031221288515406167, + "loss": 0.3743, + "step": 24667 + }, + { + "epoch": 13.78100558659218, + "grad_norm": 0.5430408716201782, + "learning_rate": 0.0003121848739495798, + "loss": 0.4937, + "step": 24668 + }, + { + "epoch": 13.781564245810056, + "grad_norm": 1.0549169778823853, + "learning_rate": 0.00031215686274509803, + "loss": 0.4234, + "step": 24669 + }, + { + "epoch": 13.782122905027933, + "grad_norm": 0.4465056359767914, + "learning_rate": 0.0003121288515406163, + "loss": 0.4192, + "step": 24670 + }, + { + "epoch": 13.78268156424581, + "grad_norm": 0.5028314590454102, + "learning_rate": 0.00031210084033613444, + "loss": 0.4091, + "step": 24671 + }, + { + "epoch": 13.783240223463688, + "grad_norm": 0.690117597579956, + "learning_rate": 0.0003120728291316527, + "loss": 0.3422, + "step": 24672 + }, + { + "epoch": 13.783798882681564, + "grad_norm": 2.79848051071167, + "learning_rate": 0.00031204481792717085, + "loss": 0.4305, + "step": 24673 + }, + { + "epoch": 13.78435754189944, + "grad_norm": 0.548388659954071, + "learning_rate": 0.00031201680672268906, + "loss": 0.4641, + "step": 24674 + }, + { + "epoch": 13.78491620111732, + "grad_norm": 0.41758206486701965, + "learning_rate": 0.0003119887955182073, + "loss": 0.4391, + "step": 24675 + }, + { + "epoch": 13.785474860335196, + "grad_norm": 0.5833352208137512, + "learning_rate": 0.00031196078431372547, + "loss": 0.3802, + "step": 24676 + }, + { + "epoch": 13.786033519553072, + "grad_norm": 0.5385997891426086, + "learning_rate": 0.00031193277310924373, + "loss": 0.3592, + "step": 24677 + }, + { + "epoch": 13.786592178770949, + "grad_norm": 0.40148165822029114, + "learning_rate": 0.00031190476190476194, + "loss": 0.3826, + "step": 24678 + }, + { + "epoch": 13.787150837988827, + "grad_norm": 0.4532122313976288, + "learning_rate": 0.0003118767507002801, + "loss": 0.3265, + "step": 24679 + }, + { + "epoch": 13.787709497206704, + "grad_norm": 0.6617640852928162, + "learning_rate": 0.00031184873949579835, + "loss": 0.3516, + "step": 24680 + }, + { + "epoch": 13.78826815642458, + "grad_norm": 0.3093830347061157, + "learning_rate": 0.0003118207282913165, + "loss": 0.3361, + "step": 24681 + }, + { + "epoch": 13.788826815642459, + "grad_norm": 0.629340410232544, + "learning_rate": 0.00031179271708683476, + "loss": 0.5207, + "step": 24682 + }, + { + "epoch": 13.789385474860335, + "grad_norm": 0.5637341141700745, + "learning_rate": 0.00031176470588235297, + "loss": 0.4489, + "step": 24683 + }, + { + "epoch": 13.789944134078212, + "grad_norm": 0.6393017172813416, + "learning_rate": 0.0003117366946778711, + "loss": 0.5139, + "step": 24684 + }, + { + "epoch": 13.79050279329609, + "grad_norm": 0.38375043869018555, + "learning_rate": 0.0003117086834733894, + "loss": 0.3241, + "step": 24685 + }, + { + "epoch": 13.791061452513967, + "grad_norm": 0.39735087752342224, + "learning_rate": 0.0003116806722689076, + "loss": 0.2916, + "step": 24686 + }, + { + "epoch": 13.791620111731843, + "grad_norm": 0.578385055065155, + "learning_rate": 0.0003116526610644258, + "loss": 0.406, + "step": 24687 + }, + { + "epoch": 13.79217877094972, + "grad_norm": 0.7574548721313477, + "learning_rate": 0.000311624649859944, + "loss": 0.4503, + "step": 24688 + }, + { + "epoch": 13.792737430167598, + "grad_norm": 0.621619462966919, + "learning_rate": 0.00031159663865546215, + "loss": 0.4245, + "step": 24689 + }, + { + "epoch": 13.793296089385475, + "grad_norm": 0.7814710736274719, + "learning_rate": 0.0003115686274509804, + "loss": 0.636, + "step": 24690 + }, + { + "epoch": 13.793854748603351, + "grad_norm": 0.6537702083587646, + "learning_rate": 0.0003115406162464986, + "loss": 0.4781, + "step": 24691 + }, + { + "epoch": 13.79441340782123, + "grad_norm": 0.5536796450614929, + "learning_rate": 0.0003115126050420168, + "loss": 0.4541, + "step": 24692 + }, + { + "epoch": 13.794972067039106, + "grad_norm": 0.557115912437439, + "learning_rate": 0.00031148459383753503, + "loss": 0.4299, + "step": 24693 + }, + { + "epoch": 13.795530726256983, + "grad_norm": 0.4359208047389984, + "learning_rate": 0.00031145658263305323, + "loss": 0.4022, + "step": 24694 + }, + { + "epoch": 13.796089385474861, + "grad_norm": 0.5477483868598938, + "learning_rate": 0.00031142857142857144, + "loss": 0.2924, + "step": 24695 + }, + { + "epoch": 13.796648044692738, + "grad_norm": 0.38812774419784546, + "learning_rate": 0.00031140056022408965, + "loss": 0.3855, + "step": 24696 + }, + { + "epoch": 13.797206703910614, + "grad_norm": 0.3890925943851471, + "learning_rate": 0.00031137254901960785, + "loss": 0.4509, + "step": 24697 + }, + { + "epoch": 13.797765363128491, + "grad_norm": 0.9076298475265503, + "learning_rate": 0.00031134453781512606, + "loss": 0.5379, + "step": 24698 + }, + { + "epoch": 13.79832402234637, + "grad_norm": 0.7020007371902466, + "learning_rate": 0.00031131652661064426, + "loss": 0.3944, + "step": 24699 + }, + { + "epoch": 13.798882681564246, + "grad_norm": 0.4714997112751007, + "learning_rate": 0.00031128851540616247, + "loss": 0.426, + "step": 24700 + }, + { + "epoch": 13.799441340782122, + "grad_norm": 0.39724090695381165, + "learning_rate": 0.0003112605042016807, + "loss": 0.4327, + "step": 24701 + }, + { + "epoch": 13.8, + "grad_norm": 0.4750833213329315, + "learning_rate": 0.00031123249299719894, + "loss": 0.4446, + "step": 24702 + }, + { + "epoch": 13.800558659217877, + "grad_norm": 0.5161691904067993, + "learning_rate": 0.0003112044817927171, + "loss": 0.3625, + "step": 24703 + }, + { + "epoch": 13.801117318435754, + "grad_norm": 0.4361058175563812, + "learning_rate": 0.0003111764705882353, + "loss": 0.4515, + "step": 24704 + }, + { + "epoch": 13.80167597765363, + "grad_norm": 0.38783717155456543, + "learning_rate": 0.0003111484593837535, + "loss": 0.3889, + "step": 24705 + }, + { + "epoch": 13.802234636871509, + "grad_norm": 0.4028116464614868, + "learning_rate": 0.0003111204481792717, + "loss": 0.3657, + "step": 24706 + }, + { + "epoch": 13.802793296089385, + "grad_norm": 0.5284596085548401, + "learning_rate": 0.00031109243697478997, + "loss": 0.4298, + "step": 24707 + }, + { + "epoch": 13.803351955307262, + "grad_norm": 0.5655832290649414, + "learning_rate": 0.0003110644257703081, + "loss": 0.4333, + "step": 24708 + }, + { + "epoch": 13.80391061452514, + "grad_norm": 0.5062133073806763, + "learning_rate": 0.0003110364145658263, + "loss": 0.5057, + "step": 24709 + }, + { + "epoch": 13.804469273743017, + "grad_norm": 0.43898701667785645, + "learning_rate": 0.0003110084033613446, + "loss": 0.3977, + "step": 24710 + }, + { + "epoch": 13.805027932960893, + "grad_norm": 0.8681417107582092, + "learning_rate": 0.00031098039215686274, + "loss": 0.5185, + "step": 24711 + }, + { + "epoch": 13.805586592178772, + "grad_norm": 0.4172993004322052, + "learning_rate": 0.000310952380952381, + "loss": 0.4523, + "step": 24712 + }, + { + "epoch": 13.806145251396648, + "grad_norm": 0.4801238775253296, + "learning_rate": 0.00031092436974789915, + "loss": 0.4043, + "step": 24713 + }, + { + "epoch": 13.806703910614525, + "grad_norm": 0.4862687587738037, + "learning_rate": 0.00031089635854341735, + "loss": 0.4016, + "step": 24714 + }, + { + "epoch": 13.807262569832401, + "grad_norm": 0.30294865369796753, + "learning_rate": 0.0003108683473389356, + "loss": 0.3559, + "step": 24715 + }, + { + "epoch": 13.80782122905028, + "grad_norm": 0.4659216105937958, + "learning_rate": 0.00031084033613445377, + "loss": 0.3914, + "step": 24716 + }, + { + "epoch": 13.808379888268156, + "grad_norm": 0.30654600262641907, + "learning_rate": 0.00031081232492997197, + "loss": 0.3372, + "step": 24717 + }, + { + "epoch": 13.808938547486033, + "grad_norm": 0.4536445736885071, + "learning_rate": 0.00031078431372549023, + "loss": 0.472, + "step": 24718 + }, + { + "epoch": 13.809497206703911, + "grad_norm": 1.4842451810836792, + "learning_rate": 0.0003107563025210084, + "loss": 0.4134, + "step": 24719 + }, + { + "epoch": 13.810055865921788, + "grad_norm": 0.5635777711868286, + "learning_rate": 0.00031072829131652664, + "loss": 0.3248, + "step": 24720 + }, + { + "epoch": 13.810614525139664, + "grad_norm": 0.42820364236831665, + "learning_rate": 0.0003107002801120448, + "loss": 0.3508, + "step": 24721 + }, + { + "epoch": 13.811173184357543, + "grad_norm": 0.4716918468475342, + "learning_rate": 0.000310672268907563, + "loss": 0.4834, + "step": 24722 + }, + { + "epoch": 13.81173184357542, + "grad_norm": 2.3274002075195312, + "learning_rate": 0.00031064425770308126, + "loss": 0.3529, + "step": 24723 + }, + { + "epoch": 13.812290502793296, + "grad_norm": 0.5653722882270813, + "learning_rate": 0.0003106162464985994, + "loss": 0.491, + "step": 24724 + }, + { + "epoch": 13.812849162011172, + "grad_norm": 1.42947256565094, + "learning_rate": 0.0003105882352941177, + "loss": 0.3504, + "step": 24725 + }, + { + "epoch": 13.81340782122905, + "grad_norm": 1.928516149520874, + "learning_rate": 0.0003105602240896359, + "loss": 0.4699, + "step": 24726 + }, + { + "epoch": 13.813966480446927, + "grad_norm": 0.40962472558021545, + "learning_rate": 0.00031053221288515403, + "loss": 0.3122, + "step": 24727 + }, + { + "epoch": 13.814525139664804, + "grad_norm": 1.3945581912994385, + "learning_rate": 0.0003105042016806723, + "loss": 0.4259, + "step": 24728 + }, + { + "epoch": 13.815083798882682, + "grad_norm": 1.3861957788467407, + "learning_rate": 0.00031047619047619044, + "loss": 0.3504, + "step": 24729 + }, + { + "epoch": 13.815642458100559, + "grad_norm": 0.7788215279579163, + "learning_rate": 0.0003104481792717087, + "loss": 0.3956, + "step": 24730 + }, + { + "epoch": 13.816201117318435, + "grad_norm": 0.4320719540119171, + "learning_rate": 0.0003104201680672269, + "loss": 0.3542, + "step": 24731 + }, + { + "epoch": 13.816759776536312, + "grad_norm": 0.3784499168395996, + "learning_rate": 0.00031039215686274506, + "loss": 0.4004, + "step": 24732 + }, + { + "epoch": 13.81731843575419, + "grad_norm": 0.4452926516532898, + "learning_rate": 0.0003103641456582633, + "loss": 0.3821, + "step": 24733 + }, + { + "epoch": 13.817877094972067, + "grad_norm": 1.0932354927062988, + "learning_rate": 0.00031033613445378153, + "loss": 0.5385, + "step": 24734 + }, + { + "epoch": 13.818435754189943, + "grad_norm": 0.4239197075366974, + "learning_rate": 0.00031030812324929973, + "loss": 0.4186, + "step": 24735 + }, + { + "epoch": 13.818994413407822, + "grad_norm": 0.5929141044616699, + "learning_rate": 0.00031028011204481794, + "loss": 0.4336, + "step": 24736 + }, + { + "epoch": 13.819553072625698, + "grad_norm": 0.36009687185287476, + "learning_rate": 0.0003102521008403361, + "loss": 0.3616, + "step": 24737 + }, + { + "epoch": 13.820111731843575, + "grad_norm": 3.307466983795166, + "learning_rate": 0.00031022408963585435, + "loss": 0.3815, + "step": 24738 + }, + { + "epoch": 13.820670391061453, + "grad_norm": 0.35800763964653015, + "learning_rate": 0.00031019607843137256, + "loss": 0.3141, + "step": 24739 + }, + { + "epoch": 13.82122905027933, + "grad_norm": 0.4880959391593933, + "learning_rate": 0.00031016806722689076, + "loss": 0.4333, + "step": 24740 + }, + { + "epoch": 13.821787709497206, + "grad_norm": 2.480283737182617, + "learning_rate": 0.00031014005602240897, + "loss": 0.4749, + "step": 24741 + }, + { + "epoch": 13.822346368715085, + "grad_norm": 0.6007377505302429, + "learning_rate": 0.0003101120448179272, + "loss": 0.5857, + "step": 24742 + }, + { + "epoch": 13.822905027932961, + "grad_norm": 1.4767669439315796, + "learning_rate": 0.0003100840336134454, + "loss": 0.4434, + "step": 24743 + }, + { + "epoch": 13.823463687150838, + "grad_norm": 0.39550453424453735, + "learning_rate": 0.0003100560224089636, + "loss": 0.4314, + "step": 24744 + }, + { + "epoch": 13.824022346368714, + "grad_norm": 1.0598410367965698, + "learning_rate": 0.0003100280112044818, + "loss": 0.3635, + "step": 24745 + }, + { + "epoch": 13.824581005586593, + "grad_norm": 0.9734326004981995, + "learning_rate": 0.00031, + "loss": 0.4689, + "step": 24746 + }, + { + "epoch": 13.82513966480447, + "grad_norm": 0.5384419560432434, + "learning_rate": 0.0003099719887955182, + "loss": 0.3902, + "step": 24747 + }, + { + "epoch": 13.825698324022346, + "grad_norm": 2.225046396255493, + "learning_rate": 0.0003099439775910364, + "loss": 0.3756, + "step": 24748 + }, + { + "epoch": 13.826256983240224, + "grad_norm": 10.570537567138672, + "learning_rate": 0.0003099159663865546, + "loss": 0.372, + "step": 24749 + }, + { + "epoch": 13.8268156424581, + "grad_norm": 0.43944209814071655, + "learning_rate": 0.0003098879551820729, + "loss": 0.376, + "step": 24750 + }, + { + "epoch": 13.827374301675977, + "grad_norm": 0.39851173758506775, + "learning_rate": 0.00030985994397759103, + "loss": 0.3766, + "step": 24751 + }, + { + "epoch": 13.827932960893854, + "grad_norm": 0.4189709424972534, + "learning_rate": 0.00030983193277310924, + "loss": 0.4647, + "step": 24752 + }, + { + "epoch": 13.828491620111732, + "grad_norm": 0.6913696527481079, + "learning_rate": 0.00030980392156862744, + "loss": 0.353, + "step": 24753 + }, + { + "epoch": 13.829050279329609, + "grad_norm": 0.4704824388027191, + "learning_rate": 0.00030977591036414565, + "loss": 0.3919, + "step": 24754 + }, + { + "epoch": 13.829608938547485, + "grad_norm": 0.49627575278282166, + "learning_rate": 0.0003097478991596639, + "loss": 0.3783, + "step": 24755 + }, + { + "epoch": 13.830167597765364, + "grad_norm": 0.574061393737793, + "learning_rate": 0.00030971988795518206, + "loss": 0.5715, + "step": 24756 + }, + { + "epoch": 13.83072625698324, + "grad_norm": 0.5746279954910278, + "learning_rate": 0.00030969187675070027, + "loss": 0.4117, + "step": 24757 + }, + { + "epoch": 13.831284916201117, + "grad_norm": 0.5381438136100769, + "learning_rate": 0.0003096638655462185, + "loss": 0.3408, + "step": 24758 + }, + { + "epoch": 13.831843575418995, + "grad_norm": 0.5281462669372559, + "learning_rate": 0.0003096358543417367, + "loss": 0.3985, + "step": 24759 + }, + { + "epoch": 13.832402234636872, + "grad_norm": 0.448641300201416, + "learning_rate": 0.00030960784313725494, + "loss": 0.3417, + "step": 24760 + }, + { + "epoch": 13.832960893854748, + "grad_norm": 0.5278318524360657, + "learning_rate": 0.0003095798319327731, + "loss": 0.3657, + "step": 24761 + }, + { + "epoch": 13.833519553072625, + "grad_norm": 0.5017572641372681, + "learning_rate": 0.0003095518207282913, + "loss": 0.5063, + "step": 24762 + }, + { + "epoch": 13.834078212290503, + "grad_norm": 0.5167214274406433, + "learning_rate": 0.00030952380952380956, + "loss": 0.5081, + "step": 24763 + }, + { + "epoch": 13.83463687150838, + "grad_norm": 0.38477057218551636, + "learning_rate": 0.0003094957983193277, + "loss": 0.2677, + "step": 24764 + }, + { + "epoch": 13.835195530726256, + "grad_norm": 0.5728855133056641, + "learning_rate": 0.00030946778711484597, + "loss": 0.3538, + "step": 24765 + }, + { + "epoch": 13.835754189944135, + "grad_norm": 0.5977369546890259, + "learning_rate": 0.0003094397759103642, + "loss": 0.6885, + "step": 24766 + }, + { + "epoch": 13.836312849162011, + "grad_norm": 0.3712436854839325, + "learning_rate": 0.0003094117647058823, + "loss": 0.3134, + "step": 24767 + }, + { + "epoch": 13.836871508379888, + "grad_norm": 0.4216599762439728, + "learning_rate": 0.0003093837535014006, + "loss": 0.3811, + "step": 24768 + }, + { + "epoch": 13.837430167597766, + "grad_norm": 0.6151779890060425, + "learning_rate": 0.00030935574229691874, + "loss": 0.3746, + "step": 24769 + }, + { + "epoch": 13.837988826815643, + "grad_norm": 0.36623167991638184, + "learning_rate": 0.000309327731092437, + "loss": 0.3668, + "step": 24770 + }, + { + "epoch": 13.83854748603352, + "grad_norm": 0.5187537670135498, + "learning_rate": 0.0003092997198879552, + "loss": 0.4323, + "step": 24771 + }, + { + "epoch": 13.839106145251396, + "grad_norm": 0.478880912065506, + "learning_rate": 0.00030927170868347336, + "loss": 0.4179, + "step": 24772 + }, + { + "epoch": 13.839664804469274, + "grad_norm": 1.3582699298858643, + "learning_rate": 0.0003092436974789916, + "loss": 0.3941, + "step": 24773 + }, + { + "epoch": 13.84022346368715, + "grad_norm": 0.3994075059890747, + "learning_rate": 0.0003092156862745098, + "loss": 0.429, + "step": 24774 + }, + { + "epoch": 13.840782122905027, + "grad_norm": 0.3768007159233093, + "learning_rate": 0.00030918767507002803, + "loss": 0.3608, + "step": 24775 + }, + { + "epoch": 13.841340782122906, + "grad_norm": 0.4348650574684143, + "learning_rate": 0.00030915966386554623, + "loss": 0.3659, + "step": 24776 + }, + { + "epoch": 13.841899441340782, + "grad_norm": 0.6999574303627014, + "learning_rate": 0.0003091316526610644, + "loss": 0.5593, + "step": 24777 + }, + { + "epoch": 13.842458100558659, + "grad_norm": 0.6640080809593201, + "learning_rate": 0.00030910364145658265, + "loss": 0.6605, + "step": 24778 + }, + { + "epoch": 13.843016759776535, + "grad_norm": 0.4254578948020935, + "learning_rate": 0.00030907563025210085, + "loss": 0.3679, + "step": 24779 + }, + { + "epoch": 13.843575418994414, + "grad_norm": 0.7168781161308289, + "learning_rate": 0.00030904761904761906, + "loss": 0.3735, + "step": 24780 + }, + { + "epoch": 13.84413407821229, + "grad_norm": 0.7321787476539612, + "learning_rate": 0.00030901960784313726, + "loss": 0.416, + "step": 24781 + }, + { + "epoch": 13.844692737430167, + "grad_norm": 0.8198249936103821, + "learning_rate": 0.00030899159663865547, + "loss": 0.3627, + "step": 24782 + }, + { + "epoch": 13.845251396648045, + "grad_norm": 0.4430847465991974, + "learning_rate": 0.0003089635854341737, + "loss": 0.4394, + "step": 24783 + }, + { + "epoch": 13.845810055865922, + "grad_norm": 0.4459068179130554, + "learning_rate": 0.0003089355742296919, + "loss": 0.3838, + "step": 24784 + }, + { + "epoch": 13.846368715083798, + "grad_norm": 0.3711383044719696, + "learning_rate": 0.0003089075630252101, + "loss": 0.4061, + "step": 24785 + }, + { + "epoch": 13.846927374301677, + "grad_norm": 1.381853699684143, + "learning_rate": 0.0003088795518207283, + "loss": 0.4183, + "step": 24786 + }, + { + "epoch": 13.847486033519553, + "grad_norm": 0.45864179730415344, + "learning_rate": 0.0003088515406162465, + "loss": 0.3913, + "step": 24787 + }, + { + "epoch": 13.84804469273743, + "grad_norm": 0.37968701124191284, + "learning_rate": 0.0003088235294117647, + "loss": 0.339, + "step": 24788 + }, + { + "epoch": 13.848603351955306, + "grad_norm": 0.5475267171859741, + "learning_rate": 0.0003087955182072829, + "loss": 0.3488, + "step": 24789 + }, + { + "epoch": 13.849162011173185, + "grad_norm": 0.6172983646392822, + "learning_rate": 0.00030876750700280117, + "loss": 0.419, + "step": 24790 + }, + { + "epoch": 13.849720670391061, + "grad_norm": 0.8614798784255981, + "learning_rate": 0.0003087394957983193, + "loss": 0.4723, + "step": 24791 + }, + { + "epoch": 13.850279329608938, + "grad_norm": 0.3572481572628021, + "learning_rate": 0.00030871148459383753, + "loss": 0.323, + "step": 24792 + }, + { + "epoch": 13.850837988826816, + "grad_norm": 0.7472407817840576, + "learning_rate": 0.00030868347338935574, + "loss": 0.5763, + "step": 24793 + }, + { + "epoch": 13.851396648044693, + "grad_norm": 1.253983736038208, + "learning_rate": 0.00030865546218487394, + "loss": 0.3707, + "step": 24794 + }, + { + "epoch": 13.85195530726257, + "grad_norm": 0.6515014171600342, + "learning_rate": 0.0003086274509803922, + "loss": 0.4965, + "step": 24795 + }, + { + "epoch": 13.852513966480448, + "grad_norm": 0.5535890460014343, + "learning_rate": 0.00030859943977591035, + "loss": 0.4612, + "step": 24796 + }, + { + "epoch": 13.853072625698324, + "grad_norm": 0.6500381231307983, + "learning_rate": 0.00030857142857142856, + "loss": 0.4783, + "step": 24797 + }, + { + "epoch": 13.8536312849162, + "grad_norm": 1.5299204587936401, + "learning_rate": 0.0003085434173669468, + "loss": 0.3689, + "step": 24798 + }, + { + "epoch": 13.854189944134077, + "grad_norm": 0.9100631475448608, + "learning_rate": 0.00030851540616246497, + "loss": 0.4797, + "step": 24799 + }, + { + "epoch": 13.854748603351956, + "grad_norm": 0.5053530931472778, + "learning_rate": 0.00030848739495798323, + "loss": 0.6667, + "step": 24800 + }, + { + "epoch": 13.855307262569832, + "grad_norm": 0.31468626856803894, + "learning_rate": 0.0003084593837535014, + "loss": 0.3313, + "step": 24801 + }, + { + "epoch": 13.855865921787709, + "grad_norm": 0.7528792023658752, + "learning_rate": 0.0003084313725490196, + "loss": 0.4396, + "step": 24802 + }, + { + "epoch": 13.856424581005587, + "grad_norm": 0.5894933342933655, + "learning_rate": 0.00030840336134453785, + "loss": 0.472, + "step": 24803 + }, + { + "epoch": 13.856983240223464, + "grad_norm": 0.3429962992668152, + "learning_rate": 0.000308375350140056, + "loss": 0.4024, + "step": 24804 + }, + { + "epoch": 13.85754189944134, + "grad_norm": 0.5356067419052124, + "learning_rate": 0.00030834733893557426, + "loss": 0.4173, + "step": 24805 + }, + { + "epoch": 13.858100558659217, + "grad_norm": 0.5016393065452576, + "learning_rate": 0.00030831932773109247, + "loss": 0.5019, + "step": 24806 + }, + { + "epoch": 13.858659217877095, + "grad_norm": 0.4478253722190857, + "learning_rate": 0.0003082913165266106, + "loss": 0.3586, + "step": 24807 + }, + { + "epoch": 13.859217877094972, + "grad_norm": 0.48870691657066345, + "learning_rate": 0.0003082633053221289, + "loss": 0.5072, + "step": 24808 + }, + { + "epoch": 13.859776536312848, + "grad_norm": 0.4953252077102661, + "learning_rate": 0.00030823529411764703, + "loss": 0.4422, + "step": 24809 + }, + { + "epoch": 13.860335195530727, + "grad_norm": 0.6184070110321045, + "learning_rate": 0.0003082072829131653, + "loss": 0.4862, + "step": 24810 + }, + { + "epoch": 13.860893854748603, + "grad_norm": 0.3630295693874359, + "learning_rate": 0.0003081792717086835, + "loss": 0.4107, + "step": 24811 + }, + { + "epoch": 13.86145251396648, + "grad_norm": 0.5265270471572876, + "learning_rate": 0.00030815126050420165, + "loss": 0.3247, + "step": 24812 + }, + { + "epoch": 13.862011173184358, + "grad_norm": 1.2840449810028076, + "learning_rate": 0.0003081232492997199, + "loss": 0.3798, + "step": 24813 + }, + { + "epoch": 13.862569832402235, + "grad_norm": 0.6314387917518616, + "learning_rate": 0.0003080952380952381, + "loss": 0.3891, + "step": 24814 + }, + { + "epoch": 13.863128491620111, + "grad_norm": 0.6427621245384216, + "learning_rate": 0.0003080672268907563, + "loss": 0.4861, + "step": 24815 + }, + { + "epoch": 13.86368715083799, + "grad_norm": 0.5162996053695679, + "learning_rate": 0.00030803921568627453, + "loss": 0.4164, + "step": 24816 + }, + { + "epoch": 13.864245810055866, + "grad_norm": 0.48597386479377747, + "learning_rate": 0.0003080112044817927, + "loss": 0.4909, + "step": 24817 + }, + { + "epoch": 13.864804469273743, + "grad_norm": 0.4007278084754944, + "learning_rate": 0.00030798319327731094, + "loss": 0.4153, + "step": 24818 + }, + { + "epoch": 13.86536312849162, + "grad_norm": 0.514100968837738, + "learning_rate": 0.00030795518207282915, + "loss": 0.3756, + "step": 24819 + }, + { + "epoch": 13.865921787709498, + "grad_norm": 0.4739219546318054, + "learning_rate": 0.00030792717086834735, + "loss": 0.4713, + "step": 24820 + }, + { + "epoch": 13.866480446927374, + "grad_norm": 0.42733922600746155, + "learning_rate": 0.00030789915966386556, + "loss": 0.3627, + "step": 24821 + }, + { + "epoch": 13.867039106145251, + "grad_norm": 0.7179304361343384, + "learning_rate": 0.00030787114845938376, + "loss": 0.4428, + "step": 24822 + }, + { + "epoch": 13.86759776536313, + "grad_norm": 0.9533480405807495, + "learning_rate": 0.00030784313725490197, + "loss": 0.3956, + "step": 24823 + }, + { + "epoch": 13.868156424581006, + "grad_norm": 0.47056421637535095, + "learning_rate": 0.0003078151260504202, + "loss": 0.4164, + "step": 24824 + }, + { + "epoch": 13.868715083798882, + "grad_norm": 0.6332651972770691, + "learning_rate": 0.00030778711484593833, + "loss": 0.4094, + "step": 24825 + }, + { + "epoch": 13.869273743016759, + "grad_norm": 0.5415308475494385, + "learning_rate": 0.0003077591036414566, + "loss": 0.4968, + "step": 24826 + }, + { + "epoch": 13.869832402234637, + "grad_norm": 0.4044300615787506, + "learning_rate": 0.0003077310924369748, + "loss": 0.3796, + "step": 24827 + }, + { + "epoch": 13.870391061452514, + "grad_norm": 0.37973079085350037, + "learning_rate": 0.000307703081232493, + "loss": 0.3064, + "step": 24828 + }, + { + "epoch": 13.87094972067039, + "grad_norm": 0.4267829954624176, + "learning_rate": 0.0003076750700280112, + "loss": 0.4511, + "step": 24829 + }, + { + "epoch": 13.871508379888269, + "grad_norm": 0.5870457291603088, + "learning_rate": 0.0003076470588235294, + "loss": 0.5604, + "step": 24830 + }, + { + "epoch": 13.872067039106145, + "grad_norm": 2.063067674636841, + "learning_rate": 0.0003076190476190476, + "loss": 0.3536, + "step": 24831 + }, + { + "epoch": 13.872625698324022, + "grad_norm": 0.49359625577926636, + "learning_rate": 0.0003075910364145658, + "loss": 0.4329, + "step": 24832 + }, + { + "epoch": 13.8731843575419, + "grad_norm": 0.7901489734649658, + "learning_rate": 0.0003075630252100841, + "loss": 0.4224, + "step": 24833 + }, + { + "epoch": 13.873743016759777, + "grad_norm": 0.366642564535141, + "learning_rate": 0.00030753501400560224, + "loss": 0.3397, + "step": 24834 + }, + { + "epoch": 13.874301675977653, + "grad_norm": 0.3388896584510803, + "learning_rate": 0.00030750700280112044, + "loss": 0.3714, + "step": 24835 + }, + { + "epoch": 13.87486033519553, + "grad_norm": 0.3611159324645996, + "learning_rate": 0.00030747899159663865, + "loss": 0.3747, + "step": 24836 + }, + { + "epoch": 13.875418994413408, + "grad_norm": 0.5765059590339661, + "learning_rate": 0.00030745098039215685, + "loss": 0.4235, + "step": 24837 + }, + { + "epoch": 13.875977653631285, + "grad_norm": 0.5157936811447144, + "learning_rate": 0.0003074229691876751, + "loss": 0.4584, + "step": 24838 + }, + { + "epoch": 13.876536312849161, + "grad_norm": 0.5755608081817627, + "learning_rate": 0.00030739495798319327, + "loss": 0.4803, + "step": 24839 + }, + { + "epoch": 13.87709497206704, + "grad_norm": 0.7004003524780273, + "learning_rate": 0.00030736694677871147, + "loss": 0.4791, + "step": 24840 + }, + { + "epoch": 13.877653631284916, + "grad_norm": 0.5604896545410156, + "learning_rate": 0.00030733893557422973, + "loss": 0.3583, + "step": 24841 + }, + { + "epoch": 13.878212290502793, + "grad_norm": 0.4400005042552948, + "learning_rate": 0.0003073109243697479, + "loss": 0.3697, + "step": 24842 + }, + { + "epoch": 13.878770949720671, + "grad_norm": 0.7079864144325256, + "learning_rate": 0.00030728291316526614, + "loss": 0.4265, + "step": 24843 + }, + { + "epoch": 13.879329608938548, + "grad_norm": 0.5235856175422668, + "learning_rate": 0.0003072549019607843, + "loss": 0.3183, + "step": 24844 + }, + { + "epoch": 13.879888268156424, + "grad_norm": 0.3624257743358612, + "learning_rate": 0.0003072268907563025, + "loss": 0.363, + "step": 24845 + }, + { + "epoch": 13.880446927374301, + "grad_norm": 0.5465137362480164, + "learning_rate": 0.00030719887955182076, + "loss": 0.4307, + "step": 24846 + }, + { + "epoch": 13.88100558659218, + "grad_norm": 0.45373231172561646, + "learning_rate": 0.0003071708683473389, + "loss": 0.5498, + "step": 24847 + }, + { + "epoch": 13.881564245810056, + "grad_norm": 0.5142821073532104, + "learning_rate": 0.0003071428571428572, + "loss": 0.4475, + "step": 24848 + }, + { + "epoch": 13.882122905027932, + "grad_norm": 0.37014129757881165, + "learning_rate": 0.0003071148459383754, + "loss": 0.4255, + "step": 24849 + }, + { + "epoch": 13.88268156424581, + "grad_norm": 0.3725886046886444, + "learning_rate": 0.00030708683473389353, + "loss": 0.3451, + "step": 24850 + }, + { + "epoch": 13.883240223463687, + "grad_norm": 0.45471274852752686, + "learning_rate": 0.0003070588235294118, + "loss": 0.4549, + "step": 24851 + }, + { + "epoch": 13.883798882681564, + "grad_norm": 0.3539445996284485, + "learning_rate": 0.00030703081232492994, + "loss": 0.3404, + "step": 24852 + }, + { + "epoch": 13.88435754189944, + "grad_norm": 1.4144705533981323, + "learning_rate": 0.0003070028011204482, + "loss": 0.4332, + "step": 24853 + }, + { + "epoch": 13.884916201117319, + "grad_norm": 0.5037001967430115, + "learning_rate": 0.0003069747899159664, + "loss": 0.3877, + "step": 24854 + }, + { + "epoch": 13.885474860335195, + "grad_norm": 0.4294300377368927, + "learning_rate": 0.00030694677871148456, + "loss": 0.3402, + "step": 24855 + }, + { + "epoch": 13.886033519553072, + "grad_norm": 0.542075514793396, + "learning_rate": 0.0003069187675070028, + "loss": 0.3218, + "step": 24856 + }, + { + "epoch": 13.88659217877095, + "grad_norm": 0.7178765535354614, + "learning_rate": 0.00030689075630252103, + "loss": 0.4026, + "step": 24857 + }, + { + "epoch": 13.887150837988827, + "grad_norm": 0.34286803007125854, + "learning_rate": 0.00030686274509803923, + "loss": 0.3226, + "step": 24858 + }, + { + "epoch": 13.887709497206703, + "grad_norm": 0.47864800691604614, + "learning_rate": 0.00030683473389355744, + "loss": 0.4391, + "step": 24859 + }, + { + "epoch": 13.888268156424582, + "grad_norm": 0.5917994976043701, + "learning_rate": 0.0003068067226890756, + "loss": 0.3859, + "step": 24860 + }, + { + "epoch": 13.888826815642458, + "grad_norm": 0.4243963360786438, + "learning_rate": 0.00030677871148459385, + "loss": 0.4309, + "step": 24861 + }, + { + "epoch": 13.889385474860335, + "grad_norm": 0.5597795248031616, + "learning_rate": 0.00030675070028011206, + "loss": 0.3925, + "step": 24862 + }, + { + "epoch": 13.889944134078211, + "grad_norm": 0.38727062940597534, + "learning_rate": 0.00030672268907563026, + "loss": 0.3336, + "step": 24863 + }, + { + "epoch": 13.89050279329609, + "grad_norm": 0.42713820934295654, + "learning_rate": 0.00030669467787114847, + "loss": 0.408, + "step": 24864 + }, + { + "epoch": 13.891061452513966, + "grad_norm": 0.6606540083885193, + "learning_rate": 0.0003066666666666667, + "loss": 0.2949, + "step": 24865 + }, + { + "epoch": 13.891620111731843, + "grad_norm": 0.5189084410667419, + "learning_rate": 0.0003066386554621849, + "loss": 0.4535, + "step": 24866 + }, + { + "epoch": 13.892178770949721, + "grad_norm": 0.3938692808151245, + "learning_rate": 0.0003066106442577031, + "loss": 0.3846, + "step": 24867 + }, + { + "epoch": 13.892737430167598, + "grad_norm": 0.5946329236030579, + "learning_rate": 0.0003065826330532213, + "loss": 0.4372, + "step": 24868 + }, + { + "epoch": 13.893296089385474, + "grad_norm": 0.41366398334503174, + "learning_rate": 0.0003065546218487395, + "loss": 0.4985, + "step": 24869 + }, + { + "epoch": 13.893854748603353, + "grad_norm": 0.34584948420524597, + "learning_rate": 0.0003065266106442577, + "loss": 0.3786, + "step": 24870 + }, + { + "epoch": 13.89441340782123, + "grad_norm": 0.6611112356185913, + "learning_rate": 0.0003064985994397759, + "loss": 0.3314, + "step": 24871 + }, + { + "epoch": 13.894972067039106, + "grad_norm": 0.4873482286930084, + "learning_rate": 0.0003064705882352941, + "loss": 0.4504, + "step": 24872 + }, + { + "epoch": 13.895530726256982, + "grad_norm": 0.3984193503856659, + "learning_rate": 0.0003064425770308124, + "loss": 0.3731, + "step": 24873 + }, + { + "epoch": 13.89608938547486, + "grad_norm": 0.6287557482719421, + "learning_rate": 0.00030641456582633053, + "loss": 0.463, + "step": 24874 + }, + { + "epoch": 13.896648044692737, + "grad_norm": 0.4631573259830475, + "learning_rate": 0.00030638655462184874, + "loss": 0.3476, + "step": 24875 + }, + { + "epoch": 13.897206703910614, + "grad_norm": 0.6214492321014404, + "learning_rate": 0.00030635854341736694, + "loss": 0.4585, + "step": 24876 + }, + { + "epoch": 13.897765363128492, + "grad_norm": 1.2695749998092651, + "learning_rate": 0.00030633053221288515, + "loss": 0.4332, + "step": 24877 + }, + { + "epoch": 13.898324022346369, + "grad_norm": 0.543743371963501, + "learning_rate": 0.0003063025210084034, + "loss": 0.4681, + "step": 24878 + }, + { + "epoch": 13.898882681564245, + "grad_norm": 0.6609926223754883, + "learning_rate": 0.00030627450980392156, + "loss": 0.4241, + "step": 24879 + }, + { + "epoch": 13.899441340782122, + "grad_norm": 0.44550999999046326, + "learning_rate": 0.00030624649859943977, + "loss": 0.4329, + "step": 24880 + }, + { + "epoch": 13.9, + "grad_norm": 0.6306529641151428, + "learning_rate": 0.000306218487394958, + "loss": 0.5281, + "step": 24881 + }, + { + "epoch": 13.900558659217877, + "grad_norm": 0.7403978705406189, + "learning_rate": 0.0003061904761904762, + "loss": 0.4134, + "step": 24882 + }, + { + "epoch": 13.901117318435753, + "grad_norm": 0.4836862087249756, + "learning_rate": 0.00030616246498599444, + "loss": 0.4792, + "step": 24883 + }, + { + "epoch": 13.901675977653632, + "grad_norm": 0.8319069743156433, + "learning_rate": 0.0003061344537815126, + "loss": 0.469, + "step": 24884 + }, + { + "epoch": 13.902234636871508, + "grad_norm": 0.5553714632987976, + "learning_rate": 0.0003061064425770308, + "loss": 0.4946, + "step": 24885 + }, + { + "epoch": 13.902793296089385, + "grad_norm": 0.36867955327033997, + "learning_rate": 0.00030607843137254906, + "loss": 0.4309, + "step": 24886 + }, + { + "epoch": 13.903351955307263, + "grad_norm": 0.3950061798095703, + "learning_rate": 0.0003060504201680672, + "loss": 0.4647, + "step": 24887 + }, + { + "epoch": 13.90391061452514, + "grad_norm": 0.42105087637901306, + "learning_rate": 0.00030602240896358547, + "loss": 0.4269, + "step": 24888 + }, + { + "epoch": 13.904469273743016, + "grad_norm": 0.4055873453617096, + "learning_rate": 0.0003059943977591037, + "loss": 0.4052, + "step": 24889 + }, + { + "epoch": 13.905027932960895, + "grad_norm": 0.3744962215423584, + "learning_rate": 0.0003059663865546218, + "loss": 0.3786, + "step": 24890 + }, + { + "epoch": 13.905586592178771, + "grad_norm": 0.48850616812705994, + "learning_rate": 0.0003059383753501401, + "loss": 0.4241, + "step": 24891 + }, + { + "epoch": 13.906145251396648, + "grad_norm": 0.3959115743637085, + "learning_rate": 0.00030591036414565824, + "loss": 0.3752, + "step": 24892 + }, + { + "epoch": 13.906703910614524, + "grad_norm": 0.39288461208343506, + "learning_rate": 0.0003058823529411765, + "loss": 0.428, + "step": 24893 + }, + { + "epoch": 13.907262569832403, + "grad_norm": 0.4193260967731476, + "learning_rate": 0.0003058543417366947, + "loss": 0.3372, + "step": 24894 + }, + { + "epoch": 13.90782122905028, + "grad_norm": 0.4617643654346466, + "learning_rate": 0.00030582633053221286, + "loss": 0.4536, + "step": 24895 + }, + { + "epoch": 13.908379888268156, + "grad_norm": 0.5762104988098145, + "learning_rate": 0.0003057983193277311, + "loss": 0.3844, + "step": 24896 + }, + { + "epoch": 13.908938547486034, + "grad_norm": 0.31485405564308167, + "learning_rate": 0.0003057703081232493, + "loss": 0.3736, + "step": 24897 + }, + { + "epoch": 13.90949720670391, + "grad_norm": 0.4969572424888611, + "learning_rate": 0.00030574229691876753, + "loss": 0.3703, + "step": 24898 + }, + { + "epoch": 13.910055865921787, + "grad_norm": 0.35041072964668274, + "learning_rate": 0.00030571428571428573, + "loss": 0.3233, + "step": 24899 + }, + { + "epoch": 13.910614525139664, + "grad_norm": 0.48628121614456177, + "learning_rate": 0.0003056862745098039, + "loss": 0.3707, + "step": 24900 + }, + { + "epoch": 13.911173184357542, + "grad_norm": 0.6263110041618347, + "learning_rate": 0.00030565826330532215, + "loss": 0.4527, + "step": 24901 + }, + { + "epoch": 13.911731843575419, + "grad_norm": 0.49387502670288086, + "learning_rate": 0.00030563025210084035, + "loss": 0.4809, + "step": 24902 + }, + { + "epoch": 13.912290502793295, + "grad_norm": 0.4616723954677582, + "learning_rate": 0.00030560224089635856, + "loss": 0.3766, + "step": 24903 + }, + { + "epoch": 13.912849162011174, + "grad_norm": 0.3935795724391937, + "learning_rate": 0.00030557422969187676, + "loss": 0.3186, + "step": 24904 + }, + { + "epoch": 13.91340782122905, + "grad_norm": 0.7374389171600342, + "learning_rate": 0.00030554621848739497, + "loss": 0.5969, + "step": 24905 + }, + { + "epoch": 13.913966480446927, + "grad_norm": 0.4294931888580322, + "learning_rate": 0.0003055182072829132, + "loss": 0.3713, + "step": 24906 + }, + { + "epoch": 13.914525139664804, + "grad_norm": 0.5246458053588867, + "learning_rate": 0.0003054901960784314, + "loss": 0.426, + "step": 24907 + }, + { + "epoch": 13.915083798882682, + "grad_norm": 0.35554391145706177, + "learning_rate": 0.0003054621848739496, + "loss": 0.3892, + "step": 24908 + }, + { + "epoch": 13.915642458100558, + "grad_norm": 0.3771948218345642, + "learning_rate": 0.0003054341736694678, + "loss": 0.4549, + "step": 24909 + }, + { + "epoch": 13.916201117318435, + "grad_norm": 0.4551900029182434, + "learning_rate": 0.000305406162464986, + "loss": 0.3678, + "step": 24910 + }, + { + "epoch": 13.916759776536313, + "grad_norm": 0.4652528762817383, + "learning_rate": 0.0003053781512605042, + "loss": 0.4989, + "step": 24911 + }, + { + "epoch": 13.91731843575419, + "grad_norm": 0.40176817774772644, + "learning_rate": 0.0003053501400560224, + "loss": 0.4417, + "step": 24912 + }, + { + "epoch": 13.917877094972066, + "grad_norm": 0.4590175449848175, + "learning_rate": 0.00030532212885154067, + "loss": 0.334, + "step": 24913 + }, + { + "epoch": 13.918435754189945, + "grad_norm": 0.25038766860961914, + "learning_rate": 0.0003052941176470588, + "loss": 0.2776, + "step": 24914 + }, + { + "epoch": 13.918994413407821, + "grad_norm": 1.717965006828308, + "learning_rate": 0.00030526610644257703, + "loss": 0.3425, + "step": 24915 + }, + { + "epoch": 13.919553072625698, + "grad_norm": 0.44046440720558167, + "learning_rate": 0.00030523809523809524, + "loss": 0.4157, + "step": 24916 + }, + { + "epoch": 13.920111731843576, + "grad_norm": 0.7244234681129456, + "learning_rate": 0.00030521008403361344, + "loss": 0.3469, + "step": 24917 + }, + { + "epoch": 13.920670391061453, + "grad_norm": 0.447412371635437, + "learning_rate": 0.0003051820728291317, + "loss": 0.4717, + "step": 24918 + }, + { + "epoch": 13.92122905027933, + "grad_norm": 0.41557204723358154, + "learning_rate": 0.00030515406162464985, + "loss": 0.3163, + "step": 24919 + }, + { + "epoch": 13.921787709497206, + "grad_norm": 0.39765432476997375, + "learning_rate": 0.00030512605042016806, + "loss": 0.3783, + "step": 24920 + }, + { + "epoch": 13.922346368715084, + "grad_norm": 0.4708682894706726, + "learning_rate": 0.0003050980392156863, + "loss": 0.4489, + "step": 24921 + }, + { + "epoch": 13.922905027932961, + "grad_norm": 1.637816071510315, + "learning_rate": 0.00030507002801120447, + "loss": 0.574, + "step": 24922 + }, + { + "epoch": 13.923463687150837, + "grad_norm": 0.5205154418945312, + "learning_rate": 0.00030504201680672273, + "loss": 0.3754, + "step": 24923 + }, + { + "epoch": 13.924022346368716, + "grad_norm": 0.4843134880065918, + "learning_rate": 0.0003050140056022409, + "loss": 0.4248, + "step": 24924 + }, + { + "epoch": 13.924581005586592, + "grad_norm": 5.135484218597412, + "learning_rate": 0.0003049859943977591, + "loss": 0.4979, + "step": 24925 + }, + { + "epoch": 13.925139664804469, + "grad_norm": 0.4674665033817291, + "learning_rate": 0.00030495798319327735, + "loss": 0.4985, + "step": 24926 + }, + { + "epoch": 13.925698324022346, + "grad_norm": 0.4445206820964813, + "learning_rate": 0.0003049299719887955, + "loss": 0.3612, + "step": 24927 + }, + { + "epoch": 13.926256983240224, + "grad_norm": 0.4883038103580475, + "learning_rate": 0.00030490196078431376, + "loss": 0.494, + "step": 24928 + }, + { + "epoch": 13.9268156424581, + "grad_norm": 0.6263957619667053, + "learning_rate": 0.00030487394957983197, + "loss": 0.4728, + "step": 24929 + }, + { + "epoch": 13.927374301675977, + "grad_norm": 7.5520524978637695, + "learning_rate": 0.0003048459383753501, + "loss": 0.3862, + "step": 24930 + }, + { + "epoch": 13.927932960893855, + "grad_norm": 0.44578176736831665, + "learning_rate": 0.0003048179271708684, + "loss": 0.4137, + "step": 24931 + }, + { + "epoch": 13.928491620111732, + "grad_norm": 0.9556331634521484, + "learning_rate": 0.00030478991596638653, + "loss": 0.4159, + "step": 24932 + }, + { + "epoch": 13.929050279329608, + "grad_norm": 0.48974746465682983, + "learning_rate": 0.0003047619047619048, + "loss": 0.3416, + "step": 24933 + }, + { + "epoch": 13.929608938547487, + "grad_norm": 0.931921660900116, + "learning_rate": 0.000304733893557423, + "loss": 0.4043, + "step": 24934 + }, + { + "epoch": 13.930167597765363, + "grad_norm": 2.0274505615234375, + "learning_rate": 0.00030470588235294115, + "loss": 0.3542, + "step": 24935 + }, + { + "epoch": 13.93072625698324, + "grad_norm": 1.0719321966171265, + "learning_rate": 0.0003046778711484594, + "loss": 0.3991, + "step": 24936 + }, + { + "epoch": 13.931284916201117, + "grad_norm": 0.3949093222618103, + "learning_rate": 0.0003046498599439776, + "loss": 0.3916, + "step": 24937 + }, + { + "epoch": 13.931843575418995, + "grad_norm": 2.330949068069458, + "learning_rate": 0.00030462184873949577, + "loss": 0.4245, + "step": 24938 + }, + { + "epoch": 13.932402234636871, + "grad_norm": 0.5379676818847656, + "learning_rate": 0.00030459383753501403, + "loss": 0.4337, + "step": 24939 + }, + { + "epoch": 13.932960893854748, + "grad_norm": 0.4951185882091522, + "learning_rate": 0.0003045658263305322, + "loss": 0.456, + "step": 24940 + }, + { + "epoch": 13.933519553072626, + "grad_norm": 0.6574488878250122, + "learning_rate": 0.00030453781512605044, + "loss": 0.4314, + "step": 24941 + }, + { + "epoch": 13.934078212290503, + "grad_norm": 0.38705697655677795, + "learning_rate": 0.00030450980392156865, + "loss": 0.4273, + "step": 24942 + }, + { + "epoch": 13.93463687150838, + "grad_norm": 0.840924859046936, + "learning_rate": 0.0003044817927170868, + "loss": 0.4625, + "step": 24943 + }, + { + "epoch": 13.935195530726258, + "grad_norm": 1.2168173789978027, + "learning_rate": 0.00030445378151260506, + "loss": 0.2766, + "step": 24944 + }, + { + "epoch": 13.935754189944134, + "grad_norm": 0.43147024512290955, + "learning_rate": 0.00030442577030812326, + "loss": 0.5715, + "step": 24945 + }, + { + "epoch": 13.936312849162011, + "grad_norm": 0.5921577215194702, + "learning_rate": 0.00030439775910364147, + "loss": 0.4653, + "step": 24946 + }, + { + "epoch": 13.936871508379888, + "grad_norm": 0.43765193223953247, + "learning_rate": 0.0003043697478991597, + "loss": 0.3826, + "step": 24947 + }, + { + "epoch": 13.937430167597766, + "grad_norm": 0.4704474210739136, + "learning_rate": 0.00030434173669467783, + "loss": 0.3918, + "step": 24948 + }, + { + "epoch": 13.937988826815642, + "grad_norm": 0.4440448582172394, + "learning_rate": 0.0003043137254901961, + "loss": 0.43, + "step": 24949 + }, + { + "epoch": 13.938547486033519, + "grad_norm": 0.40606898069381714, + "learning_rate": 0.0003042857142857143, + "loss": 0.4245, + "step": 24950 + }, + { + "epoch": 13.939106145251397, + "grad_norm": 1.4436674118041992, + "learning_rate": 0.0003042577030812325, + "loss": 0.4639, + "step": 24951 + }, + { + "epoch": 13.939664804469274, + "grad_norm": 0.485685259103775, + "learning_rate": 0.0003042296918767507, + "loss": 0.4152, + "step": 24952 + }, + { + "epoch": 13.94022346368715, + "grad_norm": 0.6809135675430298, + "learning_rate": 0.0003042016806722689, + "loss": 0.4493, + "step": 24953 + }, + { + "epoch": 13.940782122905027, + "grad_norm": 0.7552197575569153, + "learning_rate": 0.0003041736694677871, + "loss": 0.4958, + "step": 24954 + }, + { + "epoch": 13.941340782122905, + "grad_norm": 1.0604698657989502, + "learning_rate": 0.0003041456582633053, + "loss": 0.3753, + "step": 24955 + }, + { + "epoch": 13.941899441340782, + "grad_norm": 0.44197893142700195, + "learning_rate": 0.00030411764705882353, + "loss": 0.506, + "step": 24956 + }, + { + "epoch": 13.942458100558659, + "grad_norm": 0.34298622608184814, + "learning_rate": 0.00030408963585434174, + "loss": 0.2879, + "step": 24957 + }, + { + "epoch": 13.943016759776537, + "grad_norm": 0.7418647408485413, + "learning_rate": 0.00030406162464985994, + "loss": 0.6121, + "step": 24958 + }, + { + "epoch": 13.943575418994413, + "grad_norm": 1.0117508172988892, + "learning_rate": 0.00030403361344537815, + "loss": 0.3255, + "step": 24959 + }, + { + "epoch": 13.94413407821229, + "grad_norm": 0.8493671417236328, + "learning_rate": 0.00030400560224089635, + "loss": 0.3605, + "step": 24960 + }, + { + "epoch": 13.944692737430168, + "grad_norm": 0.3530765473842621, + "learning_rate": 0.0003039775910364146, + "loss": 0.3804, + "step": 24961 + }, + { + "epoch": 13.945251396648045, + "grad_norm": 0.6087768077850342, + "learning_rate": 0.00030394957983193277, + "loss": 0.5235, + "step": 24962 + }, + { + "epoch": 13.945810055865921, + "grad_norm": 0.7930576801300049, + "learning_rate": 0.00030392156862745097, + "loss": 0.3845, + "step": 24963 + }, + { + "epoch": 13.946368715083798, + "grad_norm": 0.44346028566360474, + "learning_rate": 0.0003038935574229692, + "loss": 0.4258, + "step": 24964 + }, + { + "epoch": 13.946927374301676, + "grad_norm": 2.600170850753784, + "learning_rate": 0.0003038655462184874, + "loss": 0.3171, + "step": 24965 + }, + { + "epoch": 13.947486033519553, + "grad_norm": 1.5927622318267822, + "learning_rate": 0.00030383753501400564, + "loss": 0.3892, + "step": 24966 + }, + { + "epoch": 13.94804469273743, + "grad_norm": 0.894919216632843, + "learning_rate": 0.0003038095238095238, + "loss": 0.4701, + "step": 24967 + }, + { + "epoch": 13.948603351955308, + "grad_norm": 0.4815196990966797, + "learning_rate": 0.000303781512605042, + "loss": 0.3697, + "step": 24968 + }, + { + "epoch": 13.949162011173184, + "grad_norm": 0.3941391408443451, + "learning_rate": 0.00030375350140056026, + "loss": 0.3169, + "step": 24969 + }, + { + "epoch": 13.949720670391061, + "grad_norm": 0.4568084478378296, + "learning_rate": 0.0003037254901960784, + "loss": 0.4355, + "step": 24970 + }, + { + "epoch": 13.95027932960894, + "grad_norm": 0.8687947988510132, + "learning_rate": 0.0003036974789915967, + "loss": 0.5036, + "step": 24971 + }, + { + "epoch": 13.950837988826816, + "grad_norm": 0.5170503854751587, + "learning_rate": 0.0003036694677871148, + "loss": 0.4282, + "step": 24972 + }, + { + "epoch": 13.951396648044692, + "grad_norm": 0.4385678768157959, + "learning_rate": 0.00030364145658263303, + "loss": 0.4937, + "step": 24973 + }, + { + "epoch": 13.951955307262569, + "grad_norm": 0.6030340194702148, + "learning_rate": 0.0003036134453781513, + "loss": 0.5574, + "step": 24974 + }, + { + "epoch": 13.952513966480447, + "grad_norm": 0.8687620759010315, + "learning_rate": 0.00030358543417366944, + "loss": 0.3401, + "step": 24975 + }, + { + "epoch": 13.953072625698324, + "grad_norm": 0.5189350247383118, + "learning_rate": 0.0003035574229691877, + "loss": 0.4109, + "step": 24976 + }, + { + "epoch": 13.9536312849162, + "grad_norm": 0.36550942063331604, + "learning_rate": 0.0003035294117647059, + "loss": 0.3059, + "step": 24977 + }, + { + "epoch": 13.954189944134079, + "grad_norm": 0.43972718715667725, + "learning_rate": 0.00030350140056022406, + "loss": 0.4123, + "step": 24978 + }, + { + "epoch": 13.954748603351955, + "grad_norm": 0.9455841183662415, + "learning_rate": 0.0003034733893557423, + "loss": 0.4193, + "step": 24979 + }, + { + "epoch": 13.955307262569832, + "grad_norm": 0.7085368037223816, + "learning_rate": 0.0003034453781512605, + "loss": 0.3641, + "step": 24980 + }, + { + "epoch": 13.955865921787709, + "grad_norm": 0.3997029960155487, + "learning_rate": 0.00030341736694677873, + "loss": 0.3421, + "step": 24981 + }, + { + "epoch": 13.956424581005587, + "grad_norm": 0.46714967489242554, + "learning_rate": 0.00030338935574229694, + "loss": 0.3272, + "step": 24982 + }, + { + "epoch": 13.956983240223463, + "grad_norm": 0.4718601107597351, + "learning_rate": 0.0003033613445378151, + "loss": 0.4274, + "step": 24983 + }, + { + "epoch": 13.95754189944134, + "grad_norm": 2.4217123985290527, + "learning_rate": 0.00030333333333333335, + "loss": 0.5249, + "step": 24984 + }, + { + "epoch": 13.958100558659218, + "grad_norm": 0.6807698011398315, + "learning_rate": 0.00030330532212885156, + "loss": 0.416, + "step": 24985 + }, + { + "epoch": 13.958659217877095, + "grad_norm": 0.5843945741653442, + "learning_rate": 0.00030327731092436976, + "loss": 0.3969, + "step": 24986 + }, + { + "epoch": 13.959217877094972, + "grad_norm": 0.5690789222717285, + "learning_rate": 0.00030324929971988797, + "loss": 0.4249, + "step": 24987 + }, + { + "epoch": 13.95977653631285, + "grad_norm": 0.3401868939399719, + "learning_rate": 0.0003032212885154061, + "loss": 0.3148, + "step": 24988 + }, + { + "epoch": 13.960335195530726, + "grad_norm": 0.9192000031471252, + "learning_rate": 0.0003031932773109244, + "loss": 0.3858, + "step": 24989 + }, + { + "epoch": 13.960893854748603, + "grad_norm": 1.3651618957519531, + "learning_rate": 0.0003031652661064426, + "loss": 0.4264, + "step": 24990 + }, + { + "epoch": 13.961452513966481, + "grad_norm": 0.4703902304172516, + "learning_rate": 0.0003031372549019608, + "loss": 0.3557, + "step": 24991 + }, + { + "epoch": 13.962011173184358, + "grad_norm": 1.926825761795044, + "learning_rate": 0.000303109243697479, + "loss": 0.4822, + "step": 24992 + }, + { + "epoch": 13.962569832402234, + "grad_norm": 0.5890985727310181, + "learning_rate": 0.0003030812324929972, + "loss": 0.3963, + "step": 24993 + }, + { + "epoch": 13.963128491620111, + "grad_norm": 0.3517052233219147, + "learning_rate": 0.0003030532212885154, + "loss": 0.4021, + "step": 24994 + }, + { + "epoch": 13.96368715083799, + "grad_norm": 0.6159536838531494, + "learning_rate": 0.0003030252100840336, + "loss": 0.4574, + "step": 24995 + }, + { + "epoch": 13.964245810055866, + "grad_norm": 0.4008338153362274, + "learning_rate": 0.0003029971988795518, + "loss": 0.4137, + "step": 24996 + }, + { + "epoch": 13.964804469273743, + "grad_norm": 0.3515673577785492, + "learning_rate": 0.00030296918767507003, + "loss": 0.3177, + "step": 24997 + }, + { + "epoch": 13.96536312849162, + "grad_norm": 0.3601021468639374, + "learning_rate": 0.00030294117647058824, + "loss": 0.3553, + "step": 24998 + }, + { + "epoch": 13.965921787709497, + "grad_norm": 2.735264778137207, + "learning_rate": 0.00030291316526610644, + "loss": 0.3803, + "step": 24999 + }, + { + "epoch": 13.966480446927374, + "grad_norm": 0.7660241723060608, + "learning_rate": 0.00030288515406162465, + "loss": 0.3471, + "step": 25000 + }, + { + "epoch": 13.966480446927374, + "eval_cer": 0.08627104404656694, + "eval_loss": 0.3287004232406616, + "eval_runtime": 55.6728, + "eval_samples_per_second": 81.512, + "eval_steps_per_second": 5.101, + "eval_wer": 0.33990670651657773, + "step": 25000 + }, + { + "epoch": 13.96703910614525, + "grad_norm": 0.5212233662605286, + "learning_rate": 0.0003028571428571429, + "loss": 0.3975, + "step": 25001 + }, + { + "epoch": 13.967597765363129, + "grad_norm": 0.5010490417480469, + "learning_rate": 0.00030282913165266106, + "loss": 0.268, + "step": 25002 + }, + { + "epoch": 13.968156424581005, + "grad_norm": 0.3625105321407318, + "learning_rate": 0.00030280112044817927, + "loss": 0.415, + "step": 25003 + }, + { + "epoch": 13.968715083798882, + "grad_norm": 0.544319212436676, + "learning_rate": 0.00030277310924369747, + "loss": 0.5051, + "step": 25004 + }, + { + "epoch": 13.96927374301676, + "grad_norm": 0.8994757533073425, + "learning_rate": 0.0003027450980392157, + "loss": 0.4631, + "step": 25005 + }, + { + "epoch": 13.969832402234637, + "grad_norm": 0.6352178454399109, + "learning_rate": 0.00030271708683473394, + "loss": 0.4535, + "step": 25006 + }, + { + "epoch": 13.970391061452514, + "grad_norm": 0.5342041850090027, + "learning_rate": 0.0003026890756302521, + "loss": 0.4037, + "step": 25007 + }, + { + "epoch": 13.970949720670392, + "grad_norm": 0.6092376112937927, + "learning_rate": 0.0003026610644257703, + "loss": 0.5237, + "step": 25008 + }, + { + "epoch": 13.971508379888268, + "grad_norm": 1.606554627418518, + "learning_rate": 0.00030263305322128856, + "loss": 0.479, + "step": 25009 + }, + { + "epoch": 13.972067039106145, + "grad_norm": 0.6056253910064697, + "learning_rate": 0.0003026050420168067, + "loss": 0.5034, + "step": 25010 + }, + { + "epoch": 13.972625698324022, + "grad_norm": 0.4374372661113739, + "learning_rate": 0.00030257703081232497, + "loss": 0.491, + "step": 25011 + }, + { + "epoch": 13.9731843575419, + "grad_norm": 0.5344729423522949, + "learning_rate": 0.0003025490196078431, + "loss": 0.3837, + "step": 25012 + }, + { + "epoch": 13.973743016759776, + "grad_norm": 0.8057110905647278, + "learning_rate": 0.0003025210084033613, + "loss": 0.42, + "step": 25013 + }, + { + "epoch": 13.974301675977653, + "grad_norm": 1.5922526121139526, + "learning_rate": 0.0003024929971988796, + "loss": 0.3962, + "step": 25014 + }, + { + "epoch": 13.974860335195531, + "grad_norm": 0.42678239941596985, + "learning_rate": 0.00030246498599439774, + "loss": 0.3899, + "step": 25015 + }, + { + "epoch": 13.975418994413408, + "grad_norm": 0.7659466862678528, + "learning_rate": 0.000302436974789916, + "loss": 0.3991, + "step": 25016 + }, + { + "epoch": 13.975977653631285, + "grad_norm": 0.702599048614502, + "learning_rate": 0.0003024089635854342, + "loss": 0.3449, + "step": 25017 + }, + { + "epoch": 13.976536312849163, + "grad_norm": 0.7584528923034668, + "learning_rate": 0.00030238095238095236, + "loss": 0.5014, + "step": 25018 + }, + { + "epoch": 13.97709497206704, + "grad_norm": 0.42104771733283997, + "learning_rate": 0.0003023529411764706, + "loss": 0.4854, + "step": 25019 + }, + { + "epoch": 13.977653631284916, + "grad_norm": 0.8232793211936951, + "learning_rate": 0.00030232492997198877, + "loss": 0.4124, + "step": 25020 + }, + { + "epoch": 13.978212290502793, + "grad_norm": 0.557181179523468, + "learning_rate": 0.00030229691876750703, + "loss": 0.3953, + "step": 25021 + }, + { + "epoch": 13.978770949720671, + "grad_norm": 0.4982050061225891, + "learning_rate": 0.00030226890756302523, + "loss": 0.3152, + "step": 25022 + }, + { + "epoch": 13.979329608938547, + "grad_norm": 0.41561731696128845, + "learning_rate": 0.0003022408963585434, + "loss": 0.3955, + "step": 25023 + }, + { + "epoch": 13.979888268156424, + "grad_norm": 0.5033066272735596, + "learning_rate": 0.00030221288515406165, + "loss": 0.432, + "step": 25024 + }, + { + "epoch": 13.980446927374302, + "grad_norm": 0.5226442217826843, + "learning_rate": 0.00030218487394957985, + "loss": 0.4477, + "step": 25025 + }, + { + "epoch": 13.981005586592179, + "grad_norm": 0.3596704304218292, + "learning_rate": 0.00030215686274509806, + "loss": 0.4072, + "step": 25026 + }, + { + "epoch": 13.981564245810056, + "grad_norm": 0.43071648478507996, + "learning_rate": 0.00030212885154061626, + "loss": 0.4094, + "step": 25027 + }, + { + "epoch": 13.982122905027932, + "grad_norm": 0.8383761048316956, + "learning_rate": 0.0003021008403361344, + "loss": 0.4696, + "step": 25028 + }, + { + "epoch": 13.98268156424581, + "grad_norm": 0.5035290718078613, + "learning_rate": 0.0003020728291316527, + "loss": 0.4363, + "step": 25029 + }, + { + "epoch": 13.983240223463687, + "grad_norm": 0.4246644973754883, + "learning_rate": 0.0003020448179271709, + "loss": 0.5145, + "step": 25030 + }, + { + "epoch": 13.983798882681564, + "grad_norm": 1.333490252494812, + "learning_rate": 0.0003020168067226891, + "loss": 0.5115, + "step": 25031 + }, + { + "epoch": 13.984357541899442, + "grad_norm": 0.4504704773426056, + "learning_rate": 0.0003019887955182073, + "loss": 0.3872, + "step": 25032 + }, + { + "epoch": 13.984916201117318, + "grad_norm": 0.35326847434043884, + "learning_rate": 0.0003019607843137255, + "loss": 0.3361, + "step": 25033 + }, + { + "epoch": 13.985474860335195, + "grad_norm": 0.41942453384399414, + "learning_rate": 0.0003019327731092437, + "loss": 0.3568, + "step": 25034 + }, + { + "epoch": 13.986033519553073, + "grad_norm": 0.574332058429718, + "learning_rate": 0.0003019047619047619, + "loss": 0.3928, + "step": 25035 + }, + { + "epoch": 13.98659217877095, + "grad_norm": 0.41776958107948303, + "learning_rate": 0.0003018767507002801, + "loss": 0.4517, + "step": 25036 + }, + { + "epoch": 13.987150837988827, + "grad_norm": 0.535555899143219, + "learning_rate": 0.0003018487394957983, + "loss": 0.3967, + "step": 25037 + }, + { + "epoch": 13.987709497206703, + "grad_norm": 0.5311372876167297, + "learning_rate": 0.00030182072829131653, + "loss": 0.4513, + "step": 25038 + }, + { + "epoch": 13.988268156424581, + "grad_norm": 1.0931395292282104, + "learning_rate": 0.00030179271708683474, + "loss": 0.7758, + "step": 25039 + }, + { + "epoch": 13.988826815642458, + "grad_norm": 0.4387125074863434, + "learning_rate": 0.00030176470588235294, + "loss": 0.3698, + "step": 25040 + }, + { + "epoch": 13.989385474860335, + "grad_norm": 0.8777655363082886, + "learning_rate": 0.0003017366946778712, + "loss": 0.4809, + "step": 25041 + }, + { + "epoch": 13.989944134078213, + "grad_norm": 0.6878267526626587, + "learning_rate": 0.00030170868347338935, + "loss": 0.3858, + "step": 25042 + }, + { + "epoch": 13.99050279329609, + "grad_norm": 0.3940977156162262, + "learning_rate": 0.00030168067226890756, + "loss": 0.4235, + "step": 25043 + }, + { + "epoch": 13.991061452513966, + "grad_norm": 0.7464665174484253, + "learning_rate": 0.00030165266106442577, + "loss": 0.5448, + "step": 25044 + }, + { + "epoch": 13.991620111731844, + "grad_norm": 0.45642080903053284, + "learning_rate": 0.00030162464985994397, + "loss": 0.4523, + "step": 25045 + }, + { + "epoch": 13.992178770949721, + "grad_norm": 0.7723276019096375, + "learning_rate": 0.00030159663865546223, + "loss": 0.3269, + "step": 25046 + }, + { + "epoch": 13.992737430167598, + "grad_norm": 0.41004931926727295, + "learning_rate": 0.0003015686274509804, + "loss": 0.3492, + "step": 25047 + }, + { + "epoch": 13.993296089385474, + "grad_norm": 0.5904877185821533, + "learning_rate": 0.0003015406162464986, + "loss": 0.3897, + "step": 25048 + }, + { + "epoch": 13.993854748603352, + "grad_norm": 0.4126007556915283, + "learning_rate": 0.00030151260504201685, + "loss": 0.3872, + "step": 25049 + }, + { + "epoch": 13.994413407821229, + "grad_norm": 0.4562183618545532, + "learning_rate": 0.000301484593837535, + "loss": 0.4694, + "step": 25050 + }, + { + "epoch": 13.994972067039106, + "grad_norm": 3.9524052143096924, + "learning_rate": 0.0003014565826330532, + "loss": 0.4037, + "step": 25051 + }, + { + "epoch": 13.995530726256984, + "grad_norm": 0.589347779750824, + "learning_rate": 0.0003014285714285714, + "loss": 0.4321, + "step": 25052 + }, + { + "epoch": 13.99608938547486, + "grad_norm": 0.43606725335121155, + "learning_rate": 0.0003014005602240896, + "loss": 0.2877, + "step": 25053 + }, + { + "epoch": 13.996648044692737, + "grad_norm": 0.4309036433696747, + "learning_rate": 0.0003013725490196079, + "loss": 0.4147, + "step": 25054 + }, + { + "epoch": 13.997206703910614, + "grad_norm": 0.5048011541366577, + "learning_rate": 0.00030134453781512603, + "loss": 0.3415, + "step": 25055 + }, + { + "epoch": 13.997765363128492, + "grad_norm": 0.37226057052612305, + "learning_rate": 0.00030131652661064424, + "loss": 0.2313, + "step": 25056 + }, + { + "epoch": 13.998324022346369, + "grad_norm": 0.4966714680194855, + "learning_rate": 0.0003012885154061625, + "loss": 0.3837, + "step": 25057 + }, + { + "epoch": 13.998882681564245, + "grad_norm": 0.6564077734947205, + "learning_rate": 0.00030126050420168065, + "loss": 0.4731, + "step": 25058 + }, + { + "epoch": 13.999441340782123, + "grad_norm": 0.7023360133171082, + "learning_rate": 0.0003012324929971989, + "loss": 0.4921, + "step": 25059 + }, + { + "epoch": 14.0, + "grad_norm": 0.42959120869636536, + "learning_rate": 0.00030120448179271706, + "loss": 0.3585, + "step": 25060 + }, + { + "epoch": 14.000558659217877, + "grad_norm": 0.4712943434715271, + "learning_rate": 0.00030117647058823527, + "loss": 0.4188, + "step": 25061 + }, + { + "epoch": 14.001117318435755, + "grad_norm": 0.5183335542678833, + "learning_rate": 0.00030114845938375353, + "loss": 0.3994, + "step": 25062 + }, + { + "epoch": 14.001675977653631, + "grad_norm": 18.417646408081055, + "learning_rate": 0.0003011204481792717, + "loss": 0.3296, + "step": 25063 + }, + { + "epoch": 14.002234636871508, + "grad_norm": 0.44175639748573303, + "learning_rate": 0.00030109243697478994, + "loss": 0.3516, + "step": 25064 + }, + { + "epoch": 14.002793296089385, + "grad_norm": 0.9682335257530212, + "learning_rate": 0.00030106442577030815, + "loss": 0.3866, + "step": 25065 + }, + { + "epoch": 14.003351955307263, + "grad_norm": 0.7077897191047668, + "learning_rate": 0.0003010364145658263, + "loss": 0.4485, + "step": 25066 + }, + { + "epoch": 14.00391061452514, + "grad_norm": 0.5629830360412598, + "learning_rate": 0.00030100840336134456, + "loss": 0.4298, + "step": 25067 + }, + { + "epoch": 14.004469273743016, + "grad_norm": 0.8316035270690918, + "learning_rate": 0.0003009803921568627, + "loss": 0.367, + "step": 25068 + }, + { + "epoch": 14.005027932960894, + "grad_norm": 0.399234414100647, + "learning_rate": 0.00030095238095238097, + "loss": 0.3361, + "step": 25069 + }, + { + "epoch": 14.005586592178771, + "grad_norm": 0.3416390120983124, + "learning_rate": 0.0003009243697478992, + "loss": 0.3499, + "step": 25070 + }, + { + "epoch": 14.006145251396648, + "grad_norm": 0.5388157963752747, + "learning_rate": 0.00030089635854341733, + "loss": 0.4587, + "step": 25071 + }, + { + "epoch": 14.006703910614526, + "grad_norm": 0.9829431176185608, + "learning_rate": 0.0003008683473389356, + "loss": 0.4418, + "step": 25072 + }, + { + "epoch": 14.007262569832402, + "grad_norm": 0.39367660880088806, + "learning_rate": 0.0003008403361344538, + "loss": 0.402, + "step": 25073 + }, + { + "epoch": 14.007821229050279, + "grad_norm": 0.3428211212158203, + "learning_rate": 0.000300812324929972, + "loss": 0.3656, + "step": 25074 + }, + { + "epoch": 14.008379888268156, + "grad_norm": 0.3543752133846283, + "learning_rate": 0.0003007843137254902, + "loss": 0.4425, + "step": 25075 + }, + { + "epoch": 14.008938547486034, + "grad_norm": 0.4108964204788208, + "learning_rate": 0.00030075630252100836, + "loss": 0.4204, + "step": 25076 + }, + { + "epoch": 14.00949720670391, + "grad_norm": 0.6321560144424438, + "learning_rate": 0.0003007282913165266, + "loss": 0.385, + "step": 25077 + }, + { + "epoch": 14.010055865921787, + "grad_norm": 0.4434875249862671, + "learning_rate": 0.0003007002801120448, + "loss": 0.3945, + "step": 25078 + }, + { + "epoch": 14.010614525139665, + "grad_norm": 0.3013342022895813, + "learning_rate": 0.00030067226890756303, + "loss": 0.3081, + "step": 25079 + }, + { + "epoch": 14.011173184357542, + "grad_norm": 0.5292012691497803, + "learning_rate": 0.00030064425770308124, + "loss": 0.4856, + "step": 25080 + }, + { + "epoch": 14.011731843575419, + "grad_norm": 0.38223472237586975, + "learning_rate": 0.00030061624649859944, + "loss": 0.4097, + "step": 25081 + }, + { + "epoch": 14.012290502793297, + "grad_norm": 0.3560939431190491, + "learning_rate": 0.00030058823529411765, + "loss": 0.4264, + "step": 25082 + }, + { + "epoch": 14.012849162011173, + "grad_norm": 0.5588728189468384, + "learning_rate": 0.00030056022408963585, + "loss": 0.3723, + "step": 25083 + }, + { + "epoch": 14.01340782122905, + "grad_norm": 0.4066706895828247, + "learning_rate": 0.0003005322128851541, + "loss": 0.4191, + "step": 25084 + }, + { + "epoch": 14.013966480446927, + "grad_norm": 0.4044496715068817, + "learning_rate": 0.00030050420168067227, + "loss": 0.3767, + "step": 25085 + }, + { + "epoch": 14.014525139664805, + "grad_norm": 0.43806135654449463, + "learning_rate": 0.00030047619047619047, + "loss": 0.4753, + "step": 25086 + }, + { + "epoch": 14.015083798882682, + "grad_norm": 0.42578983306884766, + "learning_rate": 0.0003004481792717087, + "loss": 0.3864, + "step": 25087 + }, + { + "epoch": 14.015642458100558, + "grad_norm": 0.4107792377471924, + "learning_rate": 0.0003004201680672269, + "loss": 0.4069, + "step": 25088 + }, + { + "epoch": 14.016201117318436, + "grad_norm": 0.4648132622241974, + "learning_rate": 0.00030039215686274514, + "loss": 0.4098, + "step": 25089 + }, + { + "epoch": 14.016759776536313, + "grad_norm": 0.3882954716682434, + "learning_rate": 0.0003003641456582633, + "loss": 0.3972, + "step": 25090 + }, + { + "epoch": 14.01731843575419, + "grad_norm": 0.4436319172382355, + "learning_rate": 0.0003003361344537815, + "loss": 0.4274, + "step": 25091 + }, + { + "epoch": 14.017877094972068, + "grad_norm": 0.521126925945282, + "learning_rate": 0.00030030812324929976, + "loss": 0.2999, + "step": 25092 + }, + { + "epoch": 14.018435754189944, + "grad_norm": 0.5451239943504333, + "learning_rate": 0.0003002801120448179, + "loss": 0.4994, + "step": 25093 + }, + { + "epoch": 14.018994413407821, + "grad_norm": 0.45273008942604065, + "learning_rate": 0.0003002521008403362, + "loss": 0.3398, + "step": 25094 + }, + { + "epoch": 14.019553072625698, + "grad_norm": 0.6346138715744019, + "learning_rate": 0.0003002240896358543, + "loss": 0.5223, + "step": 25095 + }, + { + "epoch": 14.020111731843576, + "grad_norm": 1.1171289682388306, + "learning_rate": 0.00030019607843137253, + "loss": 0.4852, + "step": 25096 + }, + { + "epoch": 14.020670391061453, + "grad_norm": 0.4387471675872803, + "learning_rate": 0.0003001680672268908, + "loss": 0.4918, + "step": 25097 + }, + { + "epoch": 14.021229050279329, + "grad_norm": 0.653271496295929, + "learning_rate": 0.00030014005602240894, + "loss": 0.4949, + "step": 25098 + }, + { + "epoch": 14.021787709497207, + "grad_norm": 0.519095778465271, + "learning_rate": 0.0003001120448179272, + "loss": 0.4279, + "step": 25099 + }, + { + "epoch": 14.022346368715084, + "grad_norm": 0.48979949951171875, + "learning_rate": 0.0003000840336134454, + "loss": 0.4172, + "step": 25100 + }, + { + "epoch": 14.02290502793296, + "grad_norm": 0.519146740436554, + "learning_rate": 0.00030005602240896356, + "loss": 0.4088, + "step": 25101 + }, + { + "epoch": 14.023463687150837, + "grad_norm": 0.3801347315311432, + "learning_rate": 0.0003000280112044818, + "loss": 0.3158, + "step": 25102 + }, + { + "epoch": 14.024022346368715, + "grad_norm": 0.8574338555335999, + "learning_rate": 0.0003, + "loss": 0.3592, + "step": 25103 + }, + { + "epoch": 14.024581005586592, + "grad_norm": 0.45281165838241577, + "learning_rate": 0.00029997198879551823, + "loss": 0.4392, + "step": 25104 + }, + { + "epoch": 14.025139664804469, + "grad_norm": 0.3741462528705597, + "learning_rate": 0.00029994397759103644, + "loss": 0.4564, + "step": 25105 + }, + { + "epoch": 14.025698324022347, + "grad_norm": 0.3896619975566864, + "learning_rate": 0.0002999159663865546, + "loss": 0.3316, + "step": 25106 + }, + { + "epoch": 14.026256983240224, + "grad_norm": 0.3635427951812744, + "learning_rate": 0.00029988795518207285, + "loss": 0.3794, + "step": 25107 + }, + { + "epoch": 14.0268156424581, + "grad_norm": 0.5178716778755188, + "learning_rate": 0.00029985994397759106, + "loss": 0.478, + "step": 25108 + }, + { + "epoch": 14.027374301675978, + "grad_norm": 1.5163228511810303, + "learning_rate": 0.00029983193277310926, + "loss": 0.3813, + "step": 25109 + }, + { + "epoch": 14.027932960893855, + "grad_norm": 0.41071194410324097, + "learning_rate": 0.00029980392156862747, + "loss": 0.389, + "step": 25110 + }, + { + "epoch": 14.028491620111732, + "grad_norm": 0.5828934907913208, + "learning_rate": 0.0002997759103641456, + "loss": 0.5433, + "step": 25111 + }, + { + "epoch": 14.029050279329608, + "grad_norm": 0.43605664372444153, + "learning_rate": 0.0002997478991596639, + "loss": 0.4031, + "step": 25112 + }, + { + "epoch": 14.029608938547486, + "grad_norm": 0.32368379831314087, + "learning_rate": 0.0002997198879551821, + "loss": 0.3372, + "step": 25113 + }, + { + "epoch": 14.030167597765363, + "grad_norm": 0.3229929208755493, + "learning_rate": 0.0002996918767507003, + "loss": 0.3311, + "step": 25114 + }, + { + "epoch": 14.03072625698324, + "grad_norm": 0.5266735553741455, + "learning_rate": 0.0002996638655462185, + "loss": 0.5404, + "step": 25115 + }, + { + "epoch": 14.031284916201118, + "grad_norm": 0.3858741521835327, + "learning_rate": 0.0002996358543417367, + "loss": 0.3795, + "step": 25116 + }, + { + "epoch": 14.031843575418995, + "grad_norm": 0.48159995675086975, + "learning_rate": 0.0002996078431372549, + "loss": 0.4463, + "step": 25117 + }, + { + "epoch": 14.032402234636871, + "grad_norm": 1.2363423109054565, + "learning_rate": 0.0002995798319327731, + "loss": 0.4726, + "step": 25118 + }, + { + "epoch": 14.03296089385475, + "grad_norm": 0.8779730796813965, + "learning_rate": 0.0002995518207282913, + "loss": 0.4443, + "step": 25119 + }, + { + "epoch": 14.033519553072626, + "grad_norm": 0.5478161573410034, + "learning_rate": 0.00029952380952380953, + "loss": 0.3233, + "step": 25120 + }, + { + "epoch": 14.034078212290503, + "grad_norm": 0.294427752494812, + "learning_rate": 0.00029949579831932774, + "loss": 0.2965, + "step": 25121 + }, + { + "epoch": 14.03463687150838, + "grad_norm": 0.5613181591033936, + "learning_rate": 0.00029946778711484594, + "loss": 0.4103, + "step": 25122 + }, + { + "epoch": 14.035195530726257, + "grad_norm": 0.3509247303009033, + "learning_rate": 0.00029943977591036415, + "loss": 0.399, + "step": 25123 + }, + { + "epoch": 14.035754189944134, + "grad_norm": 0.4378091096878052, + "learning_rate": 0.0002994117647058824, + "loss": 0.3974, + "step": 25124 + }, + { + "epoch": 14.03631284916201, + "grad_norm": 0.5401489734649658, + "learning_rate": 0.00029938375350140056, + "loss": 0.3839, + "step": 25125 + }, + { + "epoch": 14.036871508379889, + "grad_norm": 0.5004332065582275, + "learning_rate": 0.00029935574229691877, + "loss": 0.5193, + "step": 25126 + }, + { + "epoch": 14.037430167597766, + "grad_norm": 0.4608827531337738, + "learning_rate": 0.00029932773109243697, + "loss": 0.4458, + "step": 25127 + }, + { + "epoch": 14.037988826815642, + "grad_norm": 0.5581133365631104, + "learning_rate": 0.0002992997198879552, + "loss": 0.3701, + "step": 25128 + }, + { + "epoch": 14.03854748603352, + "grad_norm": 0.5472137928009033, + "learning_rate": 0.00029927170868347344, + "loss": 0.3666, + "step": 25129 + }, + { + "epoch": 14.039106145251397, + "grad_norm": 0.4342600405216217, + "learning_rate": 0.0002992436974789916, + "loss": 0.4168, + "step": 25130 + }, + { + "epoch": 14.039664804469274, + "grad_norm": 0.48705387115478516, + "learning_rate": 0.0002992156862745098, + "loss": 0.5495, + "step": 25131 + }, + { + "epoch": 14.04022346368715, + "grad_norm": 0.6571857929229736, + "learning_rate": 0.00029918767507002806, + "loss": 0.3733, + "step": 25132 + }, + { + "epoch": 14.040782122905028, + "grad_norm": 0.9972319006919861, + "learning_rate": 0.0002991596638655462, + "loss": 0.4395, + "step": 25133 + }, + { + "epoch": 14.041340782122905, + "grad_norm": 1.2861734628677368, + "learning_rate": 0.00029913165266106447, + "loss": 0.429, + "step": 25134 + }, + { + "epoch": 14.041899441340782, + "grad_norm": 0.6150367259979248, + "learning_rate": 0.0002991036414565826, + "loss": 0.4162, + "step": 25135 + }, + { + "epoch": 14.04245810055866, + "grad_norm": 0.5176380276679993, + "learning_rate": 0.0002990756302521008, + "loss": 0.4357, + "step": 25136 + }, + { + "epoch": 14.043016759776537, + "grad_norm": 0.46156901121139526, + "learning_rate": 0.0002990476190476191, + "loss": 0.3647, + "step": 25137 + }, + { + "epoch": 14.043575418994413, + "grad_norm": 0.4424963593482971, + "learning_rate": 0.00029901960784313724, + "loss": 0.2935, + "step": 25138 + }, + { + "epoch": 14.04413407821229, + "grad_norm": 0.5505861639976501, + "learning_rate": 0.0002989915966386555, + "loss": 0.6266, + "step": 25139 + }, + { + "epoch": 14.044692737430168, + "grad_norm": 0.5297809839248657, + "learning_rate": 0.0002989635854341737, + "loss": 0.4665, + "step": 25140 + }, + { + "epoch": 14.045251396648045, + "grad_norm": 0.3823828101158142, + "learning_rate": 0.00029893557422969186, + "loss": 0.3453, + "step": 25141 + }, + { + "epoch": 14.045810055865921, + "grad_norm": 0.6137543320655823, + "learning_rate": 0.0002989075630252101, + "loss": 0.4747, + "step": 25142 + }, + { + "epoch": 14.0463687150838, + "grad_norm": 0.5718567967414856, + "learning_rate": 0.00029887955182072827, + "loss": 0.6341, + "step": 25143 + }, + { + "epoch": 14.046927374301676, + "grad_norm": 0.5188270211219788, + "learning_rate": 0.00029885154061624653, + "loss": 0.5634, + "step": 25144 + }, + { + "epoch": 14.047486033519553, + "grad_norm": 0.44254305958747864, + "learning_rate": 0.00029882352941176473, + "loss": 0.3635, + "step": 25145 + }, + { + "epoch": 14.048044692737431, + "grad_norm": 0.6161893010139465, + "learning_rate": 0.0002987955182072829, + "loss": 0.491, + "step": 25146 + }, + { + "epoch": 14.048603351955308, + "grad_norm": 1.4533168077468872, + "learning_rate": 0.00029876750700280115, + "loss": 0.3027, + "step": 25147 + }, + { + "epoch": 14.049162011173184, + "grad_norm": 0.8176302909851074, + "learning_rate": 0.00029873949579831935, + "loss": 0.3143, + "step": 25148 + }, + { + "epoch": 14.04972067039106, + "grad_norm": 0.6239168047904968, + "learning_rate": 0.00029871148459383756, + "loss": 0.3537, + "step": 25149 + }, + { + "epoch": 14.050279329608939, + "grad_norm": 5.831343173980713, + "learning_rate": 0.00029868347338935576, + "loss": 0.3829, + "step": 25150 + }, + { + "epoch": 14.050837988826816, + "grad_norm": 0.6405991911888123, + "learning_rate": 0.0002986554621848739, + "loss": 0.4646, + "step": 25151 + }, + { + "epoch": 14.051396648044692, + "grad_norm": 0.3845250904560089, + "learning_rate": 0.0002986274509803922, + "loss": 0.3478, + "step": 25152 + }, + { + "epoch": 14.05195530726257, + "grad_norm": 0.7091770172119141, + "learning_rate": 0.0002985994397759104, + "loss": 0.4759, + "step": 25153 + }, + { + "epoch": 14.052513966480447, + "grad_norm": 0.4209301173686981, + "learning_rate": 0.0002985714285714286, + "loss": 0.4502, + "step": 25154 + }, + { + "epoch": 14.053072625698324, + "grad_norm": 1.1112370491027832, + "learning_rate": 0.0002985434173669468, + "loss": 0.4293, + "step": 25155 + }, + { + "epoch": 14.053631284916202, + "grad_norm": 0.5396345853805542, + "learning_rate": 0.000298515406162465, + "loss": 0.4657, + "step": 25156 + }, + { + "epoch": 14.054189944134079, + "grad_norm": 0.6367893218994141, + "learning_rate": 0.0002984873949579832, + "loss": 0.5284, + "step": 25157 + }, + { + "epoch": 14.054748603351955, + "grad_norm": 0.49026769399642944, + "learning_rate": 0.0002984593837535014, + "loss": 0.4015, + "step": 25158 + }, + { + "epoch": 14.055307262569832, + "grad_norm": 0.3736318051815033, + "learning_rate": 0.00029843137254901956, + "loss": 0.2683, + "step": 25159 + }, + { + "epoch": 14.05586592178771, + "grad_norm": 0.9663935303688049, + "learning_rate": 0.0002984033613445378, + "loss": 0.7196, + "step": 25160 + }, + { + "epoch": 14.056424581005587, + "grad_norm": 0.9790797829627991, + "learning_rate": 0.00029837535014005603, + "loss": 0.4882, + "step": 25161 + }, + { + "epoch": 14.056983240223463, + "grad_norm": 0.5219529271125793, + "learning_rate": 0.00029834733893557424, + "loss": 0.5522, + "step": 25162 + }, + { + "epoch": 14.057541899441341, + "grad_norm": 11.369020462036133, + "learning_rate": 0.00029831932773109244, + "loss": 0.3799, + "step": 25163 + }, + { + "epoch": 14.058100558659218, + "grad_norm": 0.7957562804222107, + "learning_rate": 0.00029829131652661065, + "loss": 0.3867, + "step": 25164 + }, + { + "epoch": 14.058659217877095, + "grad_norm": 0.592454195022583, + "learning_rate": 0.00029826330532212885, + "loss": 0.5343, + "step": 25165 + }, + { + "epoch": 14.059217877094973, + "grad_norm": 0.841611385345459, + "learning_rate": 0.00029823529411764706, + "loss": 0.3398, + "step": 25166 + }, + { + "epoch": 14.05977653631285, + "grad_norm": 0.6596607565879822, + "learning_rate": 0.00029820728291316527, + "loss": 0.3552, + "step": 25167 + }, + { + "epoch": 14.060335195530726, + "grad_norm": 0.8836166858673096, + "learning_rate": 0.00029817927170868347, + "loss": 0.376, + "step": 25168 + }, + { + "epoch": 14.060893854748603, + "grad_norm": 19.35335922241211, + "learning_rate": 0.0002981512605042017, + "loss": 0.3215, + "step": 25169 + }, + { + "epoch": 14.061452513966481, + "grad_norm": 0.5205940008163452, + "learning_rate": 0.0002981232492997199, + "loss": 0.4637, + "step": 25170 + }, + { + "epoch": 14.062011173184358, + "grad_norm": 0.5076216459274292, + "learning_rate": 0.0002980952380952381, + "loss": 0.5615, + "step": 25171 + }, + { + "epoch": 14.062569832402234, + "grad_norm": 0.40266963839530945, + "learning_rate": 0.00029806722689075635, + "loss": 0.3353, + "step": 25172 + }, + { + "epoch": 14.063128491620112, + "grad_norm": 0.5750336050987244, + "learning_rate": 0.0002980392156862745, + "loss": 0.4524, + "step": 25173 + }, + { + "epoch": 14.063687150837989, + "grad_norm": 0.525634229183197, + "learning_rate": 0.0002980112044817927, + "loss": 0.4119, + "step": 25174 + }, + { + "epoch": 14.064245810055866, + "grad_norm": 1.1602169275283813, + "learning_rate": 0.0002979831932773109, + "loss": 0.3954, + "step": 25175 + }, + { + "epoch": 14.064804469273742, + "grad_norm": 0.5978020429611206, + "learning_rate": 0.0002979551820728291, + "loss": 0.4465, + "step": 25176 + }, + { + "epoch": 14.06536312849162, + "grad_norm": 0.4733113646507263, + "learning_rate": 0.0002979271708683474, + "loss": 0.4775, + "step": 25177 + }, + { + "epoch": 14.065921787709497, + "grad_norm": 0.6798698902130127, + "learning_rate": 0.00029789915966386553, + "loss": 0.6171, + "step": 25178 + }, + { + "epoch": 14.066480446927374, + "grad_norm": 0.5701090097427368, + "learning_rate": 0.00029787114845938374, + "loss": 0.4569, + "step": 25179 + }, + { + "epoch": 14.067039106145252, + "grad_norm": 0.4863775670528412, + "learning_rate": 0.000297843137254902, + "loss": 0.38, + "step": 25180 + }, + { + "epoch": 14.067597765363129, + "grad_norm": 0.6646408438682556, + "learning_rate": 0.00029781512605042015, + "loss": 0.6488, + "step": 25181 + }, + { + "epoch": 14.068156424581005, + "grad_norm": 0.4765235483646393, + "learning_rate": 0.0002977871148459384, + "loss": 0.4397, + "step": 25182 + }, + { + "epoch": 14.068715083798883, + "grad_norm": 2.3887763023376465, + "learning_rate": 0.00029775910364145656, + "loss": 0.4544, + "step": 25183 + }, + { + "epoch": 14.06927374301676, + "grad_norm": 0.6155056953430176, + "learning_rate": 0.00029773109243697477, + "loss": 0.3791, + "step": 25184 + }, + { + "epoch": 14.069832402234637, + "grad_norm": 0.49900445342063904, + "learning_rate": 0.00029770308123249303, + "loss": 0.3861, + "step": 25185 + }, + { + "epoch": 14.070391061452513, + "grad_norm": 0.9303284883499146, + "learning_rate": 0.0002976750700280112, + "loss": 0.3847, + "step": 25186 + }, + { + "epoch": 14.070949720670392, + "grad_norm": 0.5667380094528198, + "learning_rate": 0.00029764705882352944, + "loss": 0.4252, + "step": 25187 + }, + { + "epoch": 14.071508379888268, + "grad_norm": 0.5309032201766968, + "learning_rate": 0.00029761904761904765, + "loss": 0.4725, + "step": 25188 + }, + { + "epoch": 14.072067039106145, + "grad_norm": 0.597527027130127, + "learning_rate": 0.0002975910364145658, + "loss": 0.4007, + "step": 25189 + }, + { + "epoch": 14.072625698324023, + "grad_norm": 0.5219390988349915, + "learning_rate": 0.00029756302521008406, + "loss": 0.5085, + "step": 25190 + }, + { + "epoch": 14.0731843575419, + "grad_norm": 0.35437649488449097, + "learning_rate": 0.0002975350140056022, + "loss": 0.3983, + "step": 25191 + }, + { + "epoch": 14.073743016759776, + "grad_norm": 0.3699656128883362, + "learning_rate": 0.00029750700280112047, + "loss": 0.3124, + "step": 25192 + }, + { + "epoch": 14.074301675977654, + "grad_norm": 0.4022884964942932, + "learning_rate": 0.0002974789915966387, + "loss": 0.3703, + "step": 25193 + }, + { + "epoch": 14.074860335195531, + "grad_norm": 0.37731048464775085, + "learning_rate": 0.00029745098039215683, + "loss": 0.4088, + "step": 25194 + }, + { + "epoch": 14.075418994413408, + "grad_norm": 0.5332436561584473, + "learning_rate": 0.0002974229691876751, + "loss": 0.4706, + "step": 25195 + }, + { + "epoch": 14.075977653631284, + "grad_norm": 0.4692066013813019, + "learning_rate": 0.0002973949579831933, + "loss": 0.4376, + "step": 25196 + }, + { + "epoch": 14.076536312849163, + "grad_norm": 0.5193145871162415, + "learning_rate": 0.0002973669467787115, + "loss": 0.3453, + "step": 25197 + }, + { + "epoch": 14.077094972067039, + "grad_norm": 0.5056783556938171, + "learning_rate": 0.0002973389355742297, + "loss": 0.4136, + "step": 25198 + }, + { + "epoch": 14.077653631284916, + "grad_norm": 0.3452467918395996, + "learning_rate": 0.00029731092436974786, + "loss": 0.3302, + "step": 25199 + }, + { + "epoch": 14.078212290502794, + "grad_norm": 0.8596594929695129, + "learning_rate": 0.0002972829131652661, + "loss": 0.4093, + "step": 25200 + }, + { + "epoch": 14.07877094972067, + "grad_norm": 0.6025142669677734, + "learning_rate": 0.0002972549019607843, + "loss": 0.4807, + "step": 25201 + }, + { + "epoch": 14.079329608938547, + "grad_norm": 0.48050639033317566, + "learning_rate": 0.00029722689075630253, + "loss": 0.5147, + "step": 25202 + }, + { + "epoch": 14.079888268156424, + "grad_norm": 0.6344686150550842, + "learning_rate": 0.00029719887955182074, + "loss": 0.4303, + "step": 25203 + }, + { + "epoch": 14.080446927374302, + "grad_norm": 0.6649397611618042, + "learning_rate": 0.00029717086834733894, + "loss": 0.4077, + "step": 25204 + }, + { + "epoch": 14.081005586592179, + "grad_norm": 0.4509686529636383, + "learning_rate": 0.00029714285714285715, + "loss": 0.3855, + "step": 25205 + }, + { + "epoch": 14.081564245810055, + "grad_norm": 0.4079494774341583, + "learning_rate": 0.00029711484593837535, + "loss": 0.3264, + "step": 25206 + }, + { + "epoch": 14.082122905027934, + "grad_norm": 0.4381329119205475, + "learning_rate": 0.00029708683473389356, + "loss": 0.4357, + "step": 25207 + }, + { + "epoch": 14.08268156424581, + "grad_norm": 0.4486906826496124, + "learning_rate": 0.00029705882352941177, + "loss": 0.4582, + "step": 25208 + }, + { + "epoch": 14.083240223463687, + "grad_norm": 0.38026419281959534, + "learning_rate": 0.00029703081232492997, + "loss": 0.3547, + "step": 25209 + }, + { + "epoch": 14.083798882681565, + "grad_norm": 6.7017822265625, + "learning_rate": 0.0002970028011204482, + "loss": 0.416, + "step": 25210 + }, + { + "epoch": 14.084357541899442, + "grad_norm": 0.46478271484375, + "learning_rate": 0.0002969747899159664, + "loss": 0.3734, + "step": 25211 + }, + { + "epoch": 14.084916201117318, + "grad_norm": 0.4860726594924927, + "learning_rate": 0.00029694677871148464, + "loss": 0.455, + "step": 25212 + }, + { + "epoch": 14.085474860335195, + "grad_norm": 0.5200951099395752, + "learning_rate": 0.0002969187675070028, + "loss": 0.3661, + "step": 25213 + }, + { + "epoch": 14.086033519553073, + "grad_norm": 0.4196780323982239, + "learning_rate": 0.000296890756302521, + "loss": 0.4539, + "step": 25214 + }, + { + "epoch": 14.08659217877095, + "grad_norm": 0.5268884897232056, + "learning_rate": 0.0002968627450980392, + "loss": 0.4419, + "step": 25215 + }, + { + "epoch": 14.087150837988826, + "grad_norm": 0.5414919853210449, + "learning_rate": 0.0002968347338935574, + "loss": 0.5052, + "step": 25216 + }, + { + "epoch": 14.087709497206705, + "grad_norm": 0.4272289276123047, + "learning_rate": 0.0002968067226890757, + "loss": 0.2713, + "step": 25217 + }, + { + "epoch": 14.088268156424581, + "grad_norm": 0.38965895771980286, + "learning_rate": 0.0002967787114845938, + "loss": 0.3698, + "step": 25218 + }, + { + "epoch": 14.088826815642458, + "grad_norm": 0.39560815691947937, + "learning_rate": 0.00029675070028011203, + "loss": 0.3644, + "step": 25219 + }, + { + "epoch": 14.089385474860336, + "grad_norm": 0.6007598042488098, + "learning_rate": 0.0002967226890756303, + "loss": 0.4668, + "step": 25220 + }, + { + "epoch": 14.089944134078213, + "grad_norm": 0.42788320779800415, + "learning_rate": 0.00029669467787114844, + "loss": 0.3759, + "step": 25221 + }, + { + "epoch": 14.09050279329609, + "grad_norm": 0.396360844373703, + "learning_rate": 0.0002966666666666667, + "loss": 0.3699, + "step": 25222 + }, + { + "epoch": 14.091061452513966, + "grad_norm": 3.4097747802734375, + "learning_rate": 0.00029663865546218486, + "loss": 0.5345, + "step": 25223 + }, + { + "epoch": 14.091620111731844, + "grad_norm": 0.4605167806148529, + "learning_rate": 0.00029661064425770306, + "loss": 0.3749, + "step": 25224 + }, + { + "epoch": 14.09217877094972, + "grad_norm": 0.5276898145675659, + "learning_rate": 0.0002965826330532213, + "loss": 0.4286, + "step": 25225 + }, + { + "epoch": 14.092737430167597, + "grad_norm": 0.417802095413208, + "learning_rate": 0.0002965546218487395, + "loss": 0.4423, + "step": 25226 + }, + { + "epoch": 14.093296089385476, + "grad_norm": 0.5498452186584473, + "learning_rate": 0.00029652661064425773, + "loss": 0.3645, + "step": 25227 + }, + { + "epoch": 14.093854748603352, + "grad_norm": 0.6556692719459534, + "learning_rate": 0.00029649859943977594, + "loss": 0.3947, + "step": 25228 + }, + { + "epoch": 14.094413407821229, + "grad_norm": 0.5161360502243042, + "learning_rate": 0.0002964705882352941, + "loss": 0.4575, + "step": 25229 + }, + { + "epoch": 14.094972067039107, + "grad_norm": 0.5679553151130676, + "learning_rate": 0.00029644257703081235, + "loss": 0.5336, + "step": 25230 + }, + { + "epoch": 14.095530726256984, + "grad_norm": 0.49548977613449097, + "learning_rate": 0.0002964145658263305, + "loss": 0.3869, + "step": 25231 + }, + { + "epoch": 14.09608938547486, + "grad_norm": 1.4515784978866577, + "learning_rate": 0.00029638655462184876, + "loss": 0.3497, + "step": 25232 + }, + { + "epoch": 14.096648044692737, + "grad_norm": 0.3503189980983734, + "learning_rate": 0.00029635854341736697, + "loss": 0.3812, + "step": 25233 + }, + { + "epoch": 14.097206703910615, + "grad_norm": 0.39500895142555237, + "learning_rate": 0.0002963305322128851, + "loss": 0.4219, + "step": 25234 + }, + { + "epoch": 14.097765363128492, + "grad_norm": 0.687580406665802, + "learning_rate": 0.0002963025210084034, + "loss": 0.3774, + "step": 25235 + }, + { + "epoch": 14.098324022346368, + "grad_norm": 0.6148037314414978, + "learning_rate": 0.0002962745098039216, + "loss": 0.4375, + "step": 25236 + }, + { + "epoch": 14.098882681564247, + "grad_norm": 0.46995699405670166, + "learning_rate": 0.0002962464985994398, + "loss": 0.4371, + "step": 25237 + }, + { + "epoch": 14.099441340782123, + "grad_norm": 0.4028025269508362, + "learning_rate": 0.000296218487394958, + "loss": 0.4175, + "step": 25238 + }, + { + "epoch": 14.1, + "grad_norm": 1.3604490756988525, + "learning_rate": 0.00029619047619047615, + "loss": 0.5418, + "step": 25239 + }, + { + "epoch": 14.100558659217878, + "grad_norm": 0.35674938559532166, + "learning_rate": 0.0002961624649859944, + "loss": 0.3923, + "step": 25240 + }, + { + "epoch": 14.101117318435755, + "grad_norm": 0.4669075310230255, + "learning_rate": 0.0002961344537815126, + "loss": 0.4426, + "step": 25241 + }, + { + "epoch": 14.101675977653631, + "grad_norm": 0.4117047190666199, + "learning_rate": 0.0002961064425770308, + "loss": 0.4196, + "step": 25242 + }, + { + "epoch": 14.102234636871508, + "grad_norm": 1.3927124738693237, + "learning_rate": 0.00029607843137254903, + "loss": 0.531, + "step": 25243 + }, + { + "epoch": 14.102793296089386, + "grad_norm": 0.42453867197036743, + "learning_rate": 0.00029605042016806724, + "loss": 0.4364, + "step": 25244 + }, + { + "epoch": 14.103351955307263, + "grad_norm": 0.44160357117652893, + "learning_rate": 0.00029602240896358544, + "loss": 0.3726, + "step": 25245 + }, + { + "epoch": 14.10391061452514, + "grad_norm": 0.4080667793750763, + "learning_rate": 0.00029599439775910365, + "loss": 0.3693, + "step": 25246 + }, + { + "epoch": 14.104469273743018, + "grad_norm": 0.5016002058982849, + "learning_rate": 0.00029596638655462185, + "loss": 0.4269, + "step": 25247 + }, + { + "epoch": 14.105027932960894, + "grad_norm": 0.5420082211494446, + "learning_rate": 0.00029593837535014006, + "loss": 0.6439, + "step": 25248 + }, + { + "epoch": 14.10558659217877, + "grad_norm": 0.4951266348361969, + "learning_rate": 0.00029591036414565827, + "loss": 0.3099, + "step": 25249 + }, + { + "epoch": 14.106145251396647, + "grad_norm": 0.4550786018371582, + "learning_rate": 0.00029588235294117647, + "loss": 0.3699, + "step": 25250 + }, + { + "epoch": 14.106703910614526, + "grad_norm": 0.6337207555770874, + "learning_rate": 0.0002958543417366947, + "loss": 0.4013, + "step": 25251 + }, + { + "epoch": 14.107262569832402, + "grad_norm": 2.4398343563079834, + "learning_rate": 0.00029582633053221294, + "loss": 0.4477, + "step": 25252 + }, + { + "epoch": 14.107821229050279, + "grad_norm": 0.4273487627506256, + "learning_rate": 0.0002957983193277311, + "loss": 0.4273, + "step": 25253 + }, + { + "epoch": 14.108379888268157, + "grad_norm": 0.5015369653701782, + "learning_rate": 0.0002957703081232493, + "loss": 0.4909, + "step": 25254 + }, + { + "epoch": 14.108938547486034, + "grad_norm": 0.47109749913215637, + "learning_rate": 0.0002957422969187675, + "loss": 0.4461, + "step": 25255 + }, + { + "epoch": 14.10949720670391, + "grad_norm": 0.5753278732299805, + "learning_rate": 0.0002957142857142857, + "loss": 0.6006, + "step": 25256 + }, + { + "epoch": 14.110055865921789, + "grad_norm": 0.39780813455581665, + "learning_rate": 0.00029568627450980397, + "loss": 0.4196, + "step": 25257 + }, + { + "epoch": 14.110614525139665, + "grad_norm": 0.6434287428855896, + "learning_rate": 0.0002956582633053221, + "loss": 0.408, + "step": 25258 + }, + { + "epoch": 14.111173184357542, + "grad_norm": 0.5035482048988342, + "learning_rate": 0.0002956302521008403, + "loss": 0.4585, + "step": 25259 + }, + { + "epoch": 14.111731843575418, + "grad_norm": 0.7466797232627869, + "learning_rate": 0.0002956022408963586, + "loss": 0.4973, + "step": 25260 + }, + { + "epoch": 14.112290502793297, + "grad_norm": 0.5112746953964233, + "learning_rate": 0.00029557422969187674, + "loss": 0.5799, + "step": 25261 + }, + { + "epoch": 14.112849162011173, + "grad_norm": 1.0010509490966797, + "learning_rate": 0.000295546218487395, + "loss": 0.4776, + "step": 25262 + }, + { + "epoch": 14.11340782122905, + "grad_norm": 0.7096441388130188, + "learning_rate": 0.00029551820728291315, + "loss": 0.4699, + "step": 25263 + }, + { + "epoch": 14.113966480446928, + "grad_norm": 1.4034569263458252, + "learning_rate": 0.00029549019607843136, + "loss": 0.4724, + "step": 25264 + }, + { + "epoch": 14.114525139664805, + "grad_norm": 1.9724229574203491, + "learning_rate": 0.0002954621848739496, + "loss": 0.4356, + "step": 25265 + }, + { + "epoch": 14.115083798882681, + "grad_norm": 1.2837929725646973, + "learning_rate": 0.00029543417366946777, + "loss": 0.4072, + "step": 25266 + }, + { + "epoch": 14.11564245810056, + "grad_norm": 0.4360194802284241, + "learning_rate": 0.00029540616246498603, + "loss": 0.4394, + "step": 25267 + }, + { + "epoch": 14.116201117318436, + "grad_norm": 0.653012216091156, + "learning_rate": 0.00029537815126050423, + "loss": 0.5023, + "step": 25268 + }, + { + "epoch": 14.116759776536313, + "grad_norm": 0.4755963981151581, + "learning_rate": 0.0002953501400560224, + "loss": 0.4031, + "step": 25269 + }, + { + "epoch": 14.11731843575419, + "grad_norm": 0.529951810836792, + "learning_rate": 0.00029532212885154065, + "loss": 0.3702, + "step": 25270 + }, + { + "epoch": 14.117877094972068, + "grad_norm": 0.985953688621521, + "learning_rate": 0.0002952941176470588, + "loss": 0.4476, + "step": 25271 + }, + { + "epoch": 14.118435754189944, + "grad_norm": 0.4121471345424652, + "learning_rate": 0.000295266106442577, + "loss": 0.3754, + "step": 25272 + }, + { + "epoch": 14.11899441340782, + "grad_norm": 0.3403964340686798, + "learning_rate": 0.00029523809523809526, + "loss": 0.4224, + "step": 25273 + }, + { + "epoch": 14.119553072625699, + "grad_norm": 0.44252824783325195, + "learning_rate": 0.0002952100840336134, + "loss": 0.3419, + "step": 25274 + }, + { + "epoch": 14.120111731843576, + "grad_norm": 3.4991135597229004, + "learning_rate": 0.0002951820728291317, + "loss": 0.3548, + "step": 25275 + }, + { + "epoch": 14.120670391061452, + "grad_norm": 0.35608962178230286, + "learning_rate": 0.0002951540616246499, + "loss": 0.4404, + "step": 25276 + }, + { + "epoch": 14.121229050279329, + "grad_norm": 0.47937682271003723, + "learning_rate": 0.00029512605042016803, + "loss": 0.4, + "step": 25277 + }, + { + "epoch": 14.121787709497207, + "grad_norm": 0.738074779510498, + "learning_rate": 0.0002950980392156863, + "loss": 0.4152, + "step": 25278 + }, + { + "epoch": 14.122346368715084, + "grad_norm": 0.778804361820221, + "learning_rate": 0.00029507002801120445, + "loss": 0.4486, + "step": 25279 + }, + { + "epoch": 14.12290502793296, + "grad_norm": 0.3959718644618988, + "learning_rate": 0.0002950420168067227, + "loss": 0.3151, + "step": 25280 + }, + { + "epoch": 14.123463687150839, + "grad_norm": 4.583657264709473, + "learning_rate": 0.0002950140056022409, + "loss": 0.3263, + "step": 25281 + }, + { + "epoch": 14.124022346368715, + "grad_norm": 2.1122794151306152, + "learning_rate": 0.00029498599439775906, + "loss": 0.4463, + "step": 25282 + }, + { + "epoch": 14.124581005586592, + "grad_norm": 0.45545968413352966, + "learning_rate": 0.0002949579831932773, + "loss": 0.4253, + "step": 25283 + }, + { + "epoch": 14.12513966480447, + "grad_norm": 0.47403988242149353, + "learning_rate": 0.00029492997198879553, + "loss": 0.4362, + "step": 25284 + }, + { + "epoch": 14.125698324022347, + "grad_norm": 1.6997489929199219, + "learning_rate": 0.00029490196078431374, + "loss": 0.3871, + "step": 25285 + }, + { + "epoch": 14.126256983240223, + "grad_norm": 0.9154916405677795, + "learning_rate": 0.00029487394957983194, + "loss": 0.4911, + "step": 25286 + }, + { + "epoch": 14.1268156424581, + "grad_norm": 1.0880929231643677, + "learning_rate": 0.0002948459383753501, + "loss": 0.4515, + "step": 25287 + }, + { + "epoch": 14.127374301675978, + "grad_norm": 0.3965624272823334, + "learning_rate": 0.00029481792717086835, + "loss": 0.3945, + "step": 25288 + }, + { + "epoch": 14.127932960893855, + "grad_norm": 0.5198319554328918, + "learning_rate": 0.00029478991596638656, + "loss": 0.4914, + "step": 25289 + }, + { + "epoch": 14.128491620111731, + "grad_norm": 0.4856901466846466, + "learning_rate": 0.00029476190476190477, + "loss": 0.3462, + "step": 25290 + }, + { + "epoch": 14.12905027932961, + "grad_norm": 1.0101051330566406, + "learning_rate": 0.00029473389355742297, + "loss": 0.3211, + "step": 25291 + }, + { + "epoch": 14.129608938547486, + "grad_norm": 1.641786813735962, + "learning_rate": 0.0002947058823529412, + "loss": 0.4097, + "step": 25292 + }, + { + "epoch": 14.130167597765363, + "grad_norm": 0.4728354215621948, + "learning_rate": 0.0002946778711484594, + "loss": 0.3946, + "step": 25293 + }, + { + "epoch": 14.130726256983241, + "grad_norm": 1.2953299283981323, + "learning_rate": 0.0002946498599439776, + "loss": 0.3475, + "step": 25294 + }, + { + "epoch": 14.131284916201118, + "grad_norm": 0.6651237607002258, + "learning_rate": 0.00029462184873949585, + "loss": 0.4749, + "step": 25295 + }, + { + "epoch": 14.131843575418994, + "grad_norm": 0.4762365520000458, + "learning_rate": 0.000294593837535014, + "loss": 0.3566, + "step": 25296 + }, + { + "epoch": 14.13240223463687, + "grad_norm": 0.3909949064254761, + "learning_rate": 0.0002945658263305322, + "loss": 0.4144, + "step": 25297 + }, + { + "epoch": 14.132960893854749, + "grad_norm": 0.5893816351890564, + "learning_rate": 0.0002945378151260504, + "loss": 0.4077, + "step": 25298 + }, + { + "epoch": 14.133519553072626, + "grad_norm": 0.4073117673397064, + "learning_rate": 0.0002945098039215686, + "loss": 0.4622, + "step": 25299 + }, + { + "epoch": 14.134078212290502, + "grad_norm": 0.41223081946372986, + "learning_rate": 0.0002944817927170869, + "loss": 0.4471, + "step": 25300 + }, + { + "epoch": 14.13463687150838, + "grad_norm": 0.2972390651702881, + "learning_rate": 0.00029445378151260503, + "loss": 0.2445, + "step": 25301 + }, + { + "epoch": 14.135195530726257, + "grad_norm": 0.4496445059776306, + "learning_rate": 0.00029442577030812324, + "loss": 0.3742, + "step": 25302 + }, + { + "epoch": 14.135754189944134, + "grad_norm": 0.5851024389266968, + "learning_rate": 0.0002943977591036415, + "loss": 0.3323, + "step": 25303 + }, + { + "epoch": 14.136312849162012, + "grad_norm": 0.5123774409294128, + "learning_rate": 0.00029436974789915965, + "loss": 0.4662, + "step": 25304 + }, + { + "epoch": 14.136871508379889, + "grad_norm": 0.5973889231681824, + "learning_rate": 0.0002943417366946779, + "loss": 0.533, + "step": 25305 + }, + { + "epoch": 14.137430167597765, + "grad_norm": 0.7023487687110901, + "learning_rate": 0.00029431372549019606, + "loss": 0.4516, + "step": 25306 + }, + { + "epoch": 14.137988826815642, + "grad_norm": 0.44419729709625244, + "learning_rate": 0.00029428571428571427, + "loss": 0.4859, + "step": 25307 + }, + { + "epoch": 14.13854748603352, + "grad_norm": 0.4277520775794983, + "learning_rate": 0.00029425770308123253, + "loss": 0.3727, + "step": 25308 + }, + { + "epoch": 14.139106145251397, + "grad_norm": 0.8788644075393677, + "learning_rate": 0.0002942296918767507, + "loss": 0.4663, + "step": 25309 + }, + { + "epoch": 14.139664804469273, + "grad_norm": 0.9085835218429565, + "learning_rate": 0.00029420168067226894, + "loss": 0.5411, + "step": 25310 + }, + { + "epoch": 14.140223463687152, + "grad_norm": 3.711705446243286, + "learning_rate": 0.00029417366946778715, + "loss": 0.4121, + "step": 25311 + }, + { + "epoch": 14.140782122905028, + "grad_norm": 0.5772017240524292, + "learning_rate": 0.0002941456582633053, + "loss": 0.5762, + "step": 25312 + }, + { + "epoch": 14.141340782122905, + "grad_norm": 0.5865617394447327, + "learning_rate": 0.00029411764705882356, + "loss": 0.3212, + "step": 25313 + }, + { + "epoch": 14.141899441340781, + "grad_norm": 0.6632071733474731, + "learning_rate": 0.0002940896358543417, + "loss": 0.5052, + "step": 25314 + }, + { + "epoch": 14.14245810055866, + "grad_norm": 0.6682087182998657, + "learning_rate": 0.00029406162464985997, + "loss": 0.5174, + "step": 25315 + }, + { + "epoch": 14.143016759776536, + "grad_norm": 0.3512507975101471, + "learning_rate": 0.0002940336134453782, + "loss": 0.3698, + "step": 25316 + }, + { + "epoch": 14.143575418994413, + "grad_norm": 0.3714883625507355, + "learning_rate": 0.00029400560224089633, + "loss": 0.2646, + "step": 25317 + }, + { + "epoch": 14.144134078212291, + "grad_norm": 0.4298558235168457, + "learning_rate": 0.0002939775910364146, + "loss": 0.3523, + "step": 25318 + }, + { + "epoch": 14.144692737430168, + "grad_norm": 0.49693670868873596, + "learning_rate": 0.0002939495798319328, + "loss": 0.4972, + "step": 25319 + }, + { + "epoch": 14.145251396648044, + "grad_norm": 0.5760936737060547, + "learning_rate": 0.000293921568627451, + "loss": 0.3955, + "step": 25320 + }, + { + "epoch": 14.145810055865923, + "grad_norm": 0.7711797952651978, + "learning_rate": 0.0002938935574229692, + "loss": 0.5422, + "step": 25321 + }, + { + "epoch": 14.1463687150838, + "grad_norm": 0.5558648705482483, + "learning_rate": 0.00029386554621848736, + "loss": 0.4265, + "step": 25322 + }, + { + "epoch": 14.146927374301676, + "grad_norm": 0.4073849022388458, + "learning_rate": 0.0002938375350140056, + "loss": 0.3977, + "step": 25323 + }, + { + "epoch": 14.147486033519552, + "grad_norm": 0.5167980790138245, + "learning_rate": 0.0002938095238095238, + "loss": 0.5153, + "step": 25324 + }, + { + "epoch": 14.14804469273743, + "grad_norm": 0.36196261644363403, + "learning_rate": 0.00029378151260504203, + "loss": 0.4237, + "step": 25325 + }, + { + "epoch": 14.148603351955307, + "grad_norm": 0.679480254650116, + "learning_rate": 0.00029375350140056024, + "loss": 0.4334, + "step": 25326 + }, + { + "epoch": 14.149162011173184, + "grad_norm": 0.2971903383731842, + "learning_rate": 0.00029372549019607844, + "loss": 0.3687, + "step": 25327 + }, + { + "epoch": 14.149720670391062, + "grad_norm": 1.4860907793045044, + "learning_rate": 0.00029369747899159665, + "loss": 0.3838, + "step": 25328 + }, + { + "epoch": 14.150279329608939, + "grad_norm": 0.48321208357810974, + "learning_rate": 0.00029366946778711485, + "loss": 0.3966, + "step": 25329 + }, + { + "epoch": 14.150837988826815, + "grad_norm": 0.749348521232605, + "learning_rate": 0.00029364145658263306, + "loss": 0.5299, + "step": 25330 + }, + { + "epoch": 14.151396648044694, + "grad_norm": 0.7418469786643982, + "learning_rate": 0.00029361344537815127, + "loss": 0.4166, + "step": 25331 + }, + { + "epoch": 14.15195530726257, + "grad_norm": 0.43929678201675415, + "learning_rate": 0.00029358543417366947, + "loss": 0.3452, + "step": 25332 + }, + { + "epoch": 14.152513966480447, + "grad_norm": 0.39026811718940735, + "learning_rate": 0.0002935574229691877, + "loss": 0.3996, + "step": 25333 + }, + { + "epoch": 14.153072625698323, + "grad_norm": 0.42382487654685974, + "learning_rate": 0.0002935294117647059, + "loss": 0.391, + "step": 25334 + }, + { + "epoch": 14.153631284916202, + "grad_norm": 0.4462238550186157, + "learning_rate": 0.00029350140056022414, + "loss": 0.4032, + "step": 25335 + }, + { + "epoch": 14.154189944134078, + "grad_norm": 0.43746742606163025, + "learning_rate": 0.0002934733893557423, + "loss": 0.3956, + "step": 25336 + }, + { + "epoch": 14.154748603351955, + "grad_norm": 0.4640008211135864, + "learning_rate": 0.0002934453781512605, + "loss": 0.4619, + "step": 25337 + }, + { + "epoch": 14.155307262569833, + "grad_norm": 0.5463054180145264, + "learning_rate": 0.0002934173669467787, + "loss": 0.4103, + "step": 25338 + }, + { + "epoch": 14.15586592178771, + "grad_norm": 0.4512375593185425, + "learning_rate": 0.0002933893557422969, + "loss": 0.3586, + "step": 25339 + }, + { + "epoch": 14.156424581005586, + "grad_norm": 0.6465870141983032, + "learning_rate": 0.0002933613445378152, + "loss": 0.3646, + "step": 25340 + }, + { + "epoch": 14.156983240223465, + "grad_norm": 0.4685108959674835, + "learning_rate": 0.0002933333333333333, + "loss": 0.5483, + "step": 25341 + }, + { + "epoch": 14.157541899441341, + "grad_norm": 0.5340759754180908, + "learning_rate": 0.00029330532212885153, + "loss": 0.3275, + "step": 25342 + }, + { + "epoch": 14.158100558659218, + "grad_norm": 0.46942225098609924, + "learning_rate": 0.0002932773109243698, + "loss": 0.5873, + "step": 25343 + }, + { + "epoch": 14.158659217877094, + "grad_norm": 0.5129939317703247, + "learning_rate": 0.00029324929971988794, + "loss": 0.4062, + "step": 25344 + }, + { + "epoch": 14.159217877094973, + "grad_norm": 0.3288388252258301, + "learning_rate": 0.0002932212885154062, + "loss": 0.3835, + "step": 25345 + }, + { + "epoch": 14.15977653631285, + "grad_norm": 0.4304754436016083, + "learning_rate": 0.00029319327731092436, + "loss": 0.543, + "step": 25346 + }, + { + "epoch": 14.160335195530726, + "grad_norm": 0.4583442509174347, + "learning_rate": 0.00029316526610644256, + "loss": 0.6142, + "step": 25347 + }, + { + "epoch": 14.160893854748604, + "grad_norm": 1.060774803161621, + "learning_rate": 0.0002931372549019608, + "loss": 0.4083, + "step": 25348 + }, + { + "epoch": 14.16145251396648, + "grad_norm": 0.4252772331237793, + "learning_rate": 0.000293109243697479, + "loss": 0.3002, + "step": 25349 + }, + { + "epoch": 14.162011173184357, + "grad_norm": 0.459568589925766, + "learning_rate": 0.00029308123249299723, + "loss": 0.4, + "step": 25350 + }, + { + "epoch": 14.162569832402234, + "grad_norm": 0.6705222725868225, + "learning_rate": 0.00029305322128851544, + "loss": 0.4242, + "step": 25351 + }, + { + "epoch": 14.163128491620112, + "grad_norm": 0.39991599321365356, + "learning_rate": 0.0002930252100840336, + "loss": 0.434, + "step": 25352 + }, + { + "epoch": 14.163687150837989, + "grad_norm": 0.41154345870018005, + "learning_rate": 0.00029299719887955185, + "loss": 0.3575, + "step": 25353 + }, + { + "epoch": 14.164245810055865, + "grad_norm": 0.7604439854621887, + "learning_rate": 0.00029296918767507, + "loss": 0.5188, + "step": 25354 + }, + { + "epoch": 14.164804469273744, + "grad_norm": 0.4710926413536072, + "learning_rate": 0.00029294117647058826, + "loss": 0.3541, + "step": 25355 + }, + { + "epoch": 14.16536312849162, + "grad_norm": 0.5217876434326172, + "learning_rate": 0.00029291316526610647, + "loss": 0.4285, + "step": 25356 + }, + { + "epoch": 14.165921787709497, + "grad_norm": 0.7819264531135559, + "learning_rate": 0.0002928851540616246, + "loss": 0.3806, + "step": 25357 + }, + { + "epoch": 14.166480446927375, + "grad_norm": 0.552517294883728, + "learning_rate": 0.0002928571428571429, + "loss": 0.3913, + "step": 25358 + }, + { + "epoch": 14.167039106145252, + "grad_norm": 0.40774044394493103, + "learning_rate": 0.0002928291316526611, + "loss": 0.4348, + "step": 25359 + }, + { + "epoch": 14.167597765363128, + "grad_norm": 1.6148439645767212, + "learning_rate": 0.0002928011204481793, + "loss": 0.4019, + "step": 25360 + }, + { + "epoch": 14.168156424581005, + "grad_norm": 0.7223689556121826, + "learning_rate": 0.0002927731092436975, + "loss": 0.5353, + "step": 25361 + }, + { + "epoch": 14.168715083798883, + "grad_norm": 0.5304924249649048, + "learning_rate": 0.00029274509803921565, + "loss": 0.4043, + "step": 25362 + }, + { + "epoch": 14.16927374301676, + "grad_norm": 0.28764864802360535, + "learning_rate": 0.0002927170868347339, + "loss": 0.2598, + "step": 25363 + }, + { + "epoch": 14.169832402234636, + "grad_norm": 0.5099817514419556, + "learning_rate": 0.0002926890756302521, + "loss": 0.3896, + "step": 25364 + }, + { + "epoch": 14.170391061452515, + "grad_norm": 1.29264497756958, + "learning_rate": 0.0002926610644257703, + "loss": 0.3744, + "step": 25365 + }, + { + "epoch": 14.170949720670391, + "grad_norm": 12.400774002075195, + "learning_rate": 0.00029263305322128853, + "loss": 0.3973, + "step": 25366 + }, + { + "epoch": 14.171508379888268, + "grad_norm": 1.8185019493103027, + "learning_rate": 0.00029260504201680674, + "loss": 0.3709, + "step": 25367 + }, + { + "epoch": 14.172067039106146, + "grad_norm": 0.6673020720481873, + "learning_rate": 0.00029257703081232494, + "loss": 0.385, + "step": 25368 + }, + { + "epoch": 14.172625698324023, + "grad_norm": 0.505962610244751, + "learning_rate": 0.00029254901960784315, + "loss": 0.4431, + "step": 25369 + }, + { + "epoch": 14.1731843575419, + "grad_norm": 0.815013587474823, + "learning_rate": 0.00029252100840336135, + "loss": 0.3551, + "step": 25370 + }, + { + "epoch": 14.173743016759776, + "grad_norm": 0.6648625731468201, + "learning_rate": 0.00029249299719887956, + "loss": 0.5268, + "step": 25371 + }, + { + "epoch": 14.174301675977654, + "grad_norm": 0.4008505940437317, + "learning_rate": 0.00029246498599439777, + "loss": 0.4143, + "step": 25372 + }, + { + "epoch": 14.17486033519553, + "grad_norm": 1.815342903137207, + "learning_rate": 0.00029243697478991597, + "loss": 0.4116, + "step": 25373 + }, + { + "epoch": 14.175418994413407, + "grad_norm": 1.129390835762024, + "learning_rate": 0.0002924089635854342, + "loss": 0.3374, + "step": 25374 + }, + { + "epoch": 14.175977653631286, + "grad_norm": 0.4192420542240143, + "learning_rate": 0.00029238095238095244, + "loss": 0.3758, + "step": 25375 + }, + { + "epoch": 14.176536312849162, + "grad_norm": 0.40174534916877747, + "learning_rate": 0.0002923529411764706, + "loss": 0.442, + "step": 25376 + }, + { + "epoch": 14.177094972067039, + "grad_norm": 1.539506435394287, + "learning_rate": 0.0002923249299719888, + "loss": 0.5041, + "step": 25377 + }, + { + "epoch": 14.177653631284917, + "grad_norm": 0.45347777009010315, + "learning_rate": 0.000292296918767507, + "loss": 0.5917, + "step": 25378 + }, + { + "epoch": 14.178212290502794, + "grad_norm": 0.4287858307361603, + "learning_rate": 0.0002922689075630252, + "loss": 0.4937, + "step": 25379 + }, + { + "epoch": 14.17877094972067, + "grad_norm": 0.603649377822876, + "learning_rate": 0.00029224089635854347, + "loss": 0.4027, + "step": 25380 + }, + { + "epoch": 14.179329608938547, + "grad_norm": 0.4463514983654022, + "learning_rate": 0.0002922128851540616, + "loss": 0.508, + "step": 25381 + }, + { + "epoch": 14.179888268156425, + "grad_norm": 0.43069174885749817, + "learning_rate": 0.0002921848739495798, + "loss": 0.3804, + "step": 25382 + }, + { + "epoch": 14.180446927374302, + "grad_norm": 0.4212243854999542, + "learning_rate": 0.0002921568627450981, + "loss": 0.4008, + "step": 25383 + }, + { + "epoch": 14.181005586592178, + "grad_norm": 0.43528735637664795, + "learning_rate": 0.00029212885154061624, + "loss": 0.2751, + "step": 25384 + }, + { + "epoch": 14.181564245810057, + "grad_norm": 0.3767814040184021, + "learning_rate": 0.00029210084033613444, + "loss": 0.4111, + "step": 25385 + }, + { + "epoch": 14.182122905027933, + "grad_norm": 0.393604576587677, + "learning_rate": 0.00029207282913165265, + "loss": 0.3957, + "step": 25386 + }, + { + "epoch": 14.18268156424581, + "grad_norm": 0.3708493411540985, + "learning_rate": 0.00029204481792717086, + "loss": 0.4047, + "step": 25387 + }, + { + "epoch": 14.183240223463686, + "grad_norm": 0.9301588535308838, + "learning_rate": 0.0002920168067226891, + "loss": 0.5294, + "step": 25388 + }, + { + "epoch": 14.183798882681565, + "grad_norm": 0.49596869945526123, + "learning_rate": 0.00029198879551820727, + "loss": 0.536, + "step": 25389 + }, + { + "epoch": 14.184357541899441, + "grad_norm": 2.0865442752838135, + "learning_rate": 0.0002919607843137255, + "loss": 0.4239, + "step": 25390 + }, + { + "epoch": 14.184916201117318, + "grad_norm": 2.171962261199951, + "learning_rate": 0.00029193277310924373, + "loss": 0.3256, + "step": 25391 + }, + { + "epoch": 14.185474860335196, + "grad_norm": 2.2568492889404297, + "learning_rate": 0.0002919047619047619, + "loss": 0.4235, + "step": 25392 + }, + { + "epoch": 14.186033519553073, + "grad_norm": 0.3644927442073822, + "learning_rate": 0.00029187675070028015, + "loss": 0.3703, + "step": 25393 + }, + { + "epoch": 14.18659217877095, + "grad_norm": 0.47552794218063354, + "learning_rate": 0.0002918487394957983, + "loss": 0.4287, + "step": 25394 + }, + { + "epoch": 14.187150837988828, + "grad_norm": 0.3807581961154938, + "learning_rate": 0.0002918207282913165, + "loss": 0.3197, + "step": 25395 + }, + { + "epoch": 14.187709497206704, + "grad_norm": 0.3990229666233063, + "learning_rate": 0.00029179271708683476, + "loss": 0.3832, + "step": 25396 + }, + { + "epoch": 14.18826815642458, + "grad_norm": 0.37736618518829346, + "learning_rate": 0.0002917647058823529, + "loss": 0.4209, + "step": 25397 + }, + { + "epoch": 14.188826815642457, + "grad_norm": 0.40924301743507385, + "learning_rate": 0.0002917366946778712, + "loss": 0.5158, + "step": 25398 + }, + { + "epoch": 14.189385474860336, + "grad_norm": 0.46802961826324463, + "learning_rate": 0.0002917086834733894, + "loss": 0.5239, + "step": 25399 + }, + { + "epoch": 14.189944134078212, + "grad_norm": 0.4457922577857971, + "learning_rate": 0.00029168067226890753, + "loss": 0.3584, + "step": 25400 + }, + { + "epoch": 14.190502793296089, + "grad_norm": 0.43479225039482117, + "learning_rate": 0.0002916526610644258, + "loss": 0.4077, + "step": 25401 + }, + { + "epoch": 14.191061452513967, + "grad_norm": 0.5818706154823303, + "learning_rate": 0.00029162464985994395, + "loss": 0.6792, + "step": 25402 + }, + { + "epoch": 14.191620111731844, + "grad_norm": 1.5353957414627075, + "learning_rate": 0.0002915966386554622, + "loss": 0.4007, + "step": 25403 + }, + { + "epoch": 14.19217877094972, + "grad_norm": 0.8117145895957947, + "learning_rate": 0.0002915686274509804, + "loss": 0.4783, + "step": 25404 + }, + { + "epoch": 14.192737430167599, + "grad_norm": 1.470420241355896, + "learning_rate": 0.00029154061624649856, + "loss": 0.4613, + "step": 25405 + }, + { + "epoch": 14.193296089385475, + "grad_norm": 0.3942779302597046, + "learning_rate": 0.0002915126050420168, + "loss": 0.3891, + "step": 25406 + }, + { + "epoch": 14.193854748603352, + "grad_norm": 2.4024617671966553, + "learning_rate": 0.00029148459383753503, + "loss": 0.4088, + "step": 25407 + }, + { + "epoch": 14.194413407821228, + "grad_norm": 0.6227216720581055, + "learning_rate": 0.00029145658263305324, + "loss": 0.3696, + "step": 25408 + }, + { + "epoch": 14.194972067039107, + "grad_norm": 0.540982186794281, + "learning_rate": 0.00029142857142857144, + "loss": 0.4258, + "step": 25409 + }, + { + "epoch": 14.195530726256983, + "grad_norm": 0.6376860737800598, + "learning_rate": 0.0002914005602240896, + "loss": 0.3395, + "step": 25410 + }, + { + "epoch": 14.19608938547486, + "grad_norm": 0.5503140091896057, + "learning_rate": 0.00029137254901960785, + "loss": 0.5466, + "step": 25411 + }, + { + "epoch": 14.196648044692738, + "grad_norm": 0.4306468963623047, + "learning_rate": 0.00029134453781512606, + "loss": 0.351, + "step": 25412 + }, + { + "epoch": 14.197206703910615, + "grad_norm": 0.5129110813140869, + "learning_rate": 0.00029131652661064427, + "loss": 0.4, + "step": 25413 + }, + { + "epoch": 14.197765363128491, + "grad_norm": 0.4232270419597626, + "learning_rate": 0.00029128851540616247, + "loss": 0.3758, + "step": 25414 + }, + { + "epoch": 14.19832402234637, + "grad_norm": 0.32299908995628357, + "learning_rate": 0.0002912605042016807, + "loss": 0.4056, + "step": 25415 + }, + { + "epoch": 14.198882681564246, + "grad_norm": 0.5747291445732117, + "learning_rate": 0.0002912324929971989, + "loss": 0.4724, + "step": 25416 + }, + { + "epoch": 14.199441340782123, + "grad_norm": 0.662925124168396, + "learning_rate": 0.0002912044817927171, + "loss": 0.4932, + "step": 25417 + }, + { + "epoch": 14.2, + "grad_norm": 0.5969512462615967, + "learning_rate": 0.0002911764705882353, + "loss": 0.4246, + "step": 25418 + }, + { + "epoch": 14.200558659217878, + "grad_norm": 0.35183316469192505, + "learning_rate": 0.0002911484593837535, + "loss": 0.3795, + "step": 25419 + }, + { + "epoch": 14.201117318435754, + "grad_norm": 0.8971480131149292, + "learning_rate": 0.0002911204481792717, + "loss": 0.4092, + "step": 25420 + }, + { + "epoch": 14.20167597765363, + "grad_norm": 0.5833323001861572, + "learning_rate": 0.0002910924369747899, + "loss": 0.3542, + "step": 25421 + }, + { + "epoch": 14.202234636871509, + "grad_norm": 0.6520475149154663, + "learning_rate": 0.0002910644257703081, + "loss": 0.4713, + "step": 25422 + }, + { + "epoch": 14.202793296089386, + "grad_norm": 0.6420779824256897, + "learning_rate": 0.0002910364145658264, + "loss": 0.482, + "step": 25423 + }, + { + "epoch": 14.203351955307262, + "grad_norm": 1.9195599555969238, + "learning_rate": 0.00029100840336134453, + "loss": 0.3527, + "step": 25424 + }, + { + "epoch": 14.203910614525139, + "grad_norm": 0.5164729952812195, + "learning_rate": 0.00029098039215686274, + "loss": 0.3458, + "step": 25425 + }, + { + "epoch": 14.204469273743017, + "grad_norm": 0.3451891541481018, + "learning_rate": 0.00029095238095238094, + "loss": 0.3937, + "step": 25426 + }, + { + "epoch": 14.205027932960894, + "grad_norm": 2.548146963119507, + "learning_rate": 0.00029092436974789915, + "loss": 0.3416, + "step": 25427 + }, + { + "epoch": 14.20558659217877, + "grad_norm": 0.5128240585327148, + "learning_rate": 0.0002908963585434174, + "loss": 0.523, + "step": 25428 + }, + { + "epoch": 14.206145251396649, + "grad_norm": 0.5100475549697876, + "learning_rate": 0.00029086834733893556, + "loss": 0.3806, + "step": 25429 + }, + { + "epoch": 14.206703910614525, + "grad_norm": 0.7261170148849487, + "learning_rate": 0.00029084033613445377, + "loss": 0.2921, + "step": 25430 + }, + { + "epoch": 14.207262569832402, + "grad_norm": 0.3787108361721039, + "learning_rate": 0.00029081232492997203, + "loss": 0.4531, + "step": 25431 + }, + { + "epoch": 14.20782122905028, + "grad_norm": 0.428067147731781, + "learning_rate": 0.0002907843137254902, + "loss": 0.3762, + "step": 25432 + }, + { + "epoch": 14.208379888268157, + "grad_norm": 0.759437084197998, + "learning_rate": 0.00029075630252100844, + "loss": 0.4209, + "step": 25433 + }, + { + "epoch": 14.208938547486033, + "grad_norm": 0.3542296290397644, + "learning_rate": 0.0002907282913165266, + "loss": 0.3556, + "step": 25434 + }, + { + "epoch": 14.20949720670391, + "grad_norm": 0.42031148076057434, + "learning_rate": 0.0002907002801120448, + "loss": 0.3731, + "step": 25435 + }, + { + "epoch": 14.210055865921788, + "grad_norm": 0.9290695190429688, + "learning_rate": 0.00029067226890756306, + "loss": 0.3655, + "step": 25436 + }, + { + "epoch": 14.210614525139665, + "grad_norm": 1.0854949951171875, + "learning_rate": 0.0002906442577030812, + "loss": 0.4211, + "step": 25437 + }, + { + "epoch": 14.211173184357541, + "grad_norm": 0.46389836072921753, + "learning_rate": 0.00029061624649859947, + "loss": 0.4988, + "step": 25438 + }, + { + "epoch": 14.21173184357542, + "grad_norm": 0.458755761384964, + "learning_rate": 0.0002905882352941177, + "loss": 0.4781, + "step": 25439 + }, + { + "epoch": 14.212290502793296, + "grad_norm": 0.6610090136528015, + "learning_rate": 0.00029056022408963583, + "loss": 0.3479, + "step": 25440 + }, + { + "epoch": 14.212849162011173, + "grad_norm": 0.4801051616668701, + "learning_rate": 0.0002905322128851541, + "loss": 0.3652, + "step": 25441 + }, + { + "epoch": 14.213407821229051, + "grad_norm": 1.0304220914840698, + "learning_rate": 0.00029050420168067224, + "loss": 0.3939, + "step": 25442 + }, + { + "epoch": 14.213966480446928, + "grad_norm": 0.5730315446853638, + "learning_rate": 0.0002904761904761905, + "loss": 0.5498, + "step": 25443 + }, + { + "epoch": 14.214525139664804, + "grad_norm": 0.5775060057640076, + "learning_rate": 0.0002904481792717087, + "loss": 0.4653, + "step": 25444 + }, + { + "epoch": 14.21508379888268, + "grad_norm": 0.7271833419799805, + "learning_rate": 0.00029042016806722686, + "loss": 0.4736, + "step": 25445 + }, + { + "epoch": 14.21564245810056, + "grad_norm": 0.3662284016609192, + "learning_rate": 0.0002903921568627451, + "loss": 0.3464, + "step": 25446 + }, + { + "epoch": 14.216201117318436, + "grad_norm": 0.6032304167747498, + "learning_rate": 0.0002903641456582633, + "loss": 0.3747, + "step": 25447 + }, + { + "epoch": 14.216759776536312, + "grad_norm": 0.42111560702323914, + "learning_rate": 0.00029033613445378153, + "loss": 0.4004, + "step": 25448 + }, + { + "epoch": 14.21731843575419, + "grad_norm": 0.4776552617549896, + "learning_rate": 0.00029030812324929974, + "loss": 0.3661, + "step": 25449 + }, + { + "epoch": 14.217877094972067, + "grad_norm": 0.7991938591003418, + "learning_rate": 0.0002902801120448179, + "loss": 0.5408, + "step": 25450 + }, + { + "epoch": 14.218435754189944, + "grad_norm": 0.4549073874950409, + "learning_rate": 0.00029025210084033615, + "loss": 0.3762, + "step": 25451 + }, + { + "epoch": 14.21899441340782, + "grad_norm": 0.3923225998878479, + "learning_rate": 0.00029022408963585435, + "loss": 0.4157, + "step": 25452 + }, + { + "epoch": 14.219553072625699, + "grad_norm": 0.8494863510131836, + "learning_rate": 0.00029019607843137256, + "loss": 0.3551, + "step": 25453 + }, + { + "epoch": 14.220111731843575, + "grad_norm": 0.4054602086544037, + "learning_rate": 0.00029016806722689077, + "loss": 0.3353, + "step": 25454 + }, + { + "epoch": 14.220670391061452, + "grad_norm": 0.5391204953193665, + "learning_rate": 0.00029014005602240897, + "loss": 0.5238, + "step": 25455 + }, + { + "epoch": 14.22122905027933, + "grad_norm": 0.5281268358230591, + "learning_rate": 0.0002901120448179272, + "loss": 0.3553, + "step": 25456 + }, + { + "epoch": 14.221787709497207, + "grad_norm": 0.5622842907905579, + "learning_rate": 0.0002900840336134454, + "loss": 0.446, + "step": 25457 + }, + { + "epoch": 14.222346368715083, + "grad_norm": 0.4416981041431427, + "learning_rate": 0.0002900560224089636, + "loss": 0.4989, + "step": 25458 + }, + { + "epoch": 14.222905027932962, + "grad_norm": 0.5113020539283752, + "learning_rate": 0.0002900280112044818, + "loss": 0.5118, + "step": 25459 + }, + { + "epoch": 14.223463687150838, + "grad_norm": 0.62982577085495, + "learning_rate": 0.00029, + "loss": 0.4557, + "step": 25460 + }, + { + "epoch": 14.224022346368715, + "grad_norm": 0.5984874367713928, + "learning_rate": 0.0002899719887955182, + "loss": 0.4455, + "step": 25461 + }, + { + "epoch": 14.224581005586591, + "grad_norm": 0.6335060596466064, + "learning_rate": 0.0002899439775910364, + "loss": 0.3539, + "step": 25462 + }, + { + "epoch": 14.22513966480447, + "grad_norm": 0.5951938033103943, + "learning_rate": 0.0002899159663865547, + "loss": 0.4757, + "step": 25463 + }, + { + "epoch": 14.225698324022346, + "grad_norm": 0.46993544697761536, + "learning_rate": 0.0002898879551820728, + "loss": 0.497, + "step": 25464 + }, + { + "epoch": 14.226256983240223, + "grad_norm": 0.5990579128265381, + "learning_rate": 0.00028985994397759103, + "loss": 0.4243, + "step": 25465 + }, + { + "epoch": 14.226815642458101, + "grad_norm": 0.685468316078186, + "learning_rate": 0.00028983193277310924, + "loss": 0.4395, + "step": 25466 + }, + { + "epoch": 14.227374301675978, + "grad_norm": 0.6801133155822754, + "learning_rate": 0.00028980392156862744, + "loss": 0.3139, + "step": 25467 + }, + { + "epoch": 14.227932960893854, + "grad_norm": 0.9943726062774658, + "learning_rate": 0.0002897759103641457, + "loss": 0.4103, + "step": 25468 + }, + { + "epoch": 14.228491620111733, + "grad_norm": 1.1428391933441162, + "learning_rate": 0.00028974789915966386, + "loss": 0.4004, + "step": 25469 + }, + { + "epoch": 14.22905027932961, + "grad_norm": 0.5141860842704773, + "learning_rate": 0.00028971988795518206, + "loss": 0.4099, + "step": 25470 + }, + { + "epoch": 14.229608938547486, + "grad_norm": 0.4192003905773163, + "learning_rate": 0.0002896918767507003, + "loss": 0.4695, + "step": 25471 + }, + { + "epoch": 14.230167597765362, + "grad_norm": 0.5188270807266235, + "learning_rate": 0.0002896638655462185, + "loss": 0.4887, + "step": 25472 + }, + { + "epoch": 14.23072625698324, + "grad_norm": 0.3818208873271942, + "learning_rate": 0.00028963585434173673, + "loss": 0.4184, + "step": 25473 + }, + { + "epoch": 14.231284916201117, + "grad_norm": 0.5171835422515869, + "learning_rate": 0.0002896078431372549, + "loss": 0.3886, + "step": 25474 + }, + { + "epoch": 14.231843575418994, + "grad_norm": 0.4434897005558014, + "learning_rate": 0.0002895798319327731, + "loss": 0.4194, + "step": 25475 + }, + { + "epoch": 14.232402234636872, + "grad_norm": 0.4490929841995239, + "learning_rate": 0.00028955182072829135, + "loss": 0.41, + "step": 25476 + }, + { + "epoch": 14.232960893854749, + "grad_norm": 0.5383500456809998, + "learning_rate": 0.0002895238095238095, + "loss": 0.4215, + "step": 25477 + }, + { + "epoch": 14.233519553072625, + "grad_norm": 2.6780238151550293, + "learning_rate": 0.00028949579831932776, + "loss": 0.5994, + "step": 25478 + }, + { + "epoch": 14.234078212290504, + "grad_norm": 1.0847309827804565, + "learning_rate": 0.00028946778711484597, + "loss": 0.4374, + "step": 25479 + }, + { + "epoch": 14.23463687150838, + "grad_norm": 0.5482456684112549, + "learning_rate": 0.0002894397759103641, + "loss": 0.4361, + "step": 25480 + }, + { + "epoch": 14.235195530726257, + "grad_norm": 3.7909281253814697, + "learning_rate": 0.0002894117647058824, + "loss": 0.4548, + "step": 25481 + }, + { + "epoch": 14.235754189944133, + "grad_norm": 2.6463077068328857, + "learning_rate": 0.00028938375350140053, + "loss": 0.4261, + "step": 25482 + }, + { + "epoch": 14.236312849162012, + "grad_norm": 0.4804808795452118, + "learning_rate": 0.0002893557422969188, + "loss": 0.4383, + "step": 25483 + }, + { + "epoch": 14.236871508379888, + "grad_norm": 1.7638521194458008, + "learning_rate": 0.000289327731092437, + "loss": 0.6639, + "step": 25484 + }, + { + "epoch": 14.237430167597765, + "grad_norm": 0.4069303870201111, + "learning_rate": 0.00028929971988795515, + "loss": 0.394, + "step": 25485 + }, + { + "epoch": 14.237988826815643, + "grad_norm": 2.883249044418335, + "learning_rate": 0.0002892717086834734, + "loss": 0.5772, + "step": 25486 + }, + { + "epoch": 14.23854748603352, + "grad_norm": 0.4020165205001831, + "learning_rate": 0.0002892436974789916, + "loss": 0.3719, + "step": 25487 + }, + { + "epoch": 14.239106145251396, + "grad_norm": 0.43882298469543457, + "learning_rate": 0.0002892156862745098, + "loss": 0.4669, + "step": 25488 + }, + { + "epoch": 14.239664804469275, + "grad_norm": 0.7697341442108154, + "learning_rate": 0.00028918767507002803, + "loss": 0.384, + "step": 25489 + }, + { + "epoch": 14.240223463687151, + "grad_norm": 0.4405876696109772, + "learning_rate": 0.0002891596638655462, + "loss": 0.4291, + "step": 25490 + }, + { + "epoch": 14.240782122905028, + "grad_norm": 0.5887637138366699, + "learning_rate": 0.00028913165266106444, + "loss": 0.4115, + "step": 25491 + }, + { + "epoch": 14.241340782122904, + "grad_norm": 0.4987044930458069, + "learning_rate": 0.00028910364145658265, + "loss": 0.2933, + "step": 25492 + }, + { + "epoch": 14.241899441340783, + "grad_norm": 0.38464289903640747, + "learning_rate": 0.00028907563025210085, + "loss": 0.4263, + "step": 25493 + }, + { + "epoch": 14.24245810055866, + "grad_norm": 0.42155346274375916, + "learning_rate": 0.00028904761904761906, + "loss": 0.3301, + "step": 25494 + }, + { + "epoch": 14.243016759776536, + "grad_norm": 0.5381590127944946, + "learning_rate": 0.00028901960784313727, + "loss": 0.5916, + "step": 25495 + }, + { + "epoch": 14.243575418994414, + "grad_norm": 0.4940604865550995, + "learning_rate": 0.00028899159663865547, + "loss": 0.3237, + "step": 25496 + }, + { + "epoch": 14.24413407821229, + "grad_norm": 0.4953722357749939, + "learning_rate": 0.0002889635854341737, + "loss": 0.4075, + "step": 25497 + }, + { + "epoch": 14.244692737430167, + "grad_norm": 0.58608078956604, + "learning_rate": 0.00028893557422969183, + "loss": 0.4047, + "step": 25498 + }, + { + "epoch": 14.245251396648044, + "grad_norm": 0.6618332862854004, + "learning_rate": 0.0002889075630252101, + "loss": 0.4907, + "step": 25499 + }, + { + "epoch": 14.245810055865922, + "grad_norm": 0.5740599632263184, + "learning_rate": 0.0002888795518207283, + "loss": 0.3338, + "step": 25500 + }, + { + "epoch": 14.245810055865922, + "eval_cer": 0.08616193687058799, + "eval_loss": 0.3270304501056671, + "eval_runtime": 55.6267, + "eval_samples_per_second": 81.58, + "eval_steps_per_second": 5.105, + "eval_wer": 0.34046535012988466, + "step": 25500 + }, + { + "epoch": 14.246368715083799, + "grad_norm": 0.752598226070404, + "learning_rate": 0.0002888515406162465, + "loss": 0.5034, + "step": 25501 + }, + { + "epoch": 14.246927374301675, + "grad_norm": 0.38785606622695923, + "learning_rate": 0.0002888235294117647, + "loss": 0.3528, + "step": 25502 + }, + { + "epoch": 14.247486033519554, + "grad_norm": 2.519864320755005, + "learning_rate": 0.0002887955182072829, + "loss": 0.4753, + "step": 25503 + }, + { + "epoch": 14.24804469273743, + "grad_norm": 0.5507903695106506, + "learning_rate": 0.0002887675070028011, + "loss": 0.3467, + "step": 25504 + }, + { + "epoch": 14.248603351955307, + "grad_norm": 1.0111862421035767, + "learning_rate": 0.0002887394957983193, + "loss": 0.4537, + "step": 25505 + }, + { + "epoch": 14.249162011173185, + "grad_norm": 0.5406572818756104, + "learning_rate": 0.0002887114845938376, + "loss": 0.3905, + "step": 25506 + }, + { + "epoch": 14.249720670391062, + "grad_norm": 0.638052225112915, + "learning_rate": 0.00028868347338935574, + "loss": 0.3996, + "step": 25507 + }, + { + "epoch": 14.250279329608938, + "grad_norm": 0.560427188873291, + "learning_rate": 0.00028865546218487394, + "loss": 0.422, + "step": 25508 + }, + { + "epoch": 14.250837988826815, + "grad_norm": 0.8192435503005981, + "learning_rate": 0.00028862745098039215, + "loss": 0.393, + "step": 25509 + }, + { + "epoch": 14.251396648044693, + "grad_norm": 0.43932002782821655, + "learning_rate": 0.00028859943977591036, + "loss": 0.4577, + "step": 25510 + }, + { + "epoch": 14.25195530726257, + "grad_norm": 0.422860324382782, + "learning_rate": 0.0002885714285714286, + "loss": 0.3799, + "step": 25511 + }, + { + "epoch": 14.252513966480446, + "grad_norm": 0.7656521201133728, + "learning_rate": 0.00028854341736694677, + "loss": 0.5352, + "step": 25512 + }, + { + "epoch": 14.253072625698325, + "grad_norm": 0.3323248028755188, + "learning_rate": 0.000288515406162465, + "loss": 0.3841, + "step": 25513 + }, + { + "epoch": 14.253631284916201, + "grad_norm": 0.4313906133174896, + "learning_rate": 0.00028848739495798323, + "loss": 0.4854, + "step": 25514 + }, + { + "epoch": 14.254189944134078, + "grad_norm": 10.918773651123047, + "learning_rate": 0.0002884593837535014, + "loss": 0.4138, + "step": 25515 + }, + { + "epoch": 14.254748603351956, + "grad_norm": 0.867256760597229, + "learning_rate": 0.00028843137254901965, + "loss": 0.543, + "step": 25516 + }, + { + "epoch": 14.255307262569833, + "grad_norm": 0.49338144063949585, + "learning_rate": 0.0002884033613445378, + "loss": 0.3523, + "step": 25517 + }, + { + "epoch": 14.25586592178771, + "grad_norm": 0.7883240580558777, + "learning_rate": 0.000288375350140056, + "loss": 0.3198, + "step": 25518 + }, + { + "epoch": 14.256424581005586, + "grad_norm": 0.5634401440620422, + "learning_rate": 0.00028834733893557426, + "loss": 0.4217, + "step": 25519 + }, + { + "epoch": 14.256983240223464, + "grad_norm": 0.37593311071395874, + "learning_rate": 0.0002883193277310924, + "loss": 0.349, + "step": 25520 + }, + { + "epoch": 14.25754189944134, + "grad_norm": 0.3959060609340668, + "learning_rate": 0.0002882913165266107, + "loss": 0.3389, + "step": 25521 + }, + { + "epoch": 14.258100558659217, + "grad_norm": 5.378292083740234, + "learning_rate": 0.0002882633053221289, + "loss": 0.4151, + "step": 25522 + }, + { + "epoch": 14.258659217877096, + "grad_norm": 0.5760030746459961, + "learning_rate": 0.00028823529411764703, + "loss": 0.4536, + "step": 25523 + }, + { + "epoch": 14.259217877094972, + "grad_norm": 0.46774008870124817, + "learning_rate": 0.0002882072829131653, + "loss": 0.4568, + "step": 25524 + }, + { + "epoch": 14.259776536312849, + "grad_norm": 0.6029528975486755, + "learning_rate": 0.00028817927170868345, + "loss": 0.3828, + "step": 25525 + }, + { + "epoch": 14.260335195530725, + "grad_norm": 0.5174428224563599, + "learning_rate": 0.0002881512605042017, + "loss": 0.375, + "step": 25526 + }, + { + "epoch": 14.260893854748604, + "grad_norm": 0.5050787329673767, + "learning_rate": 0.0002881232492997199, + "loss": 0.4171, + "step": 25527 + }, + { + "epoch": 14.26145251396648, + "grad_norm": 0.4592553675174713, + "learning_rate": 0.00028809523809523806, + "loss": 0.418, + "step": 25528 + }, + { + "epoch": 14.262011173184357, + "grad_norm": 1.9049334526062012, + "learning_rate": 0.0002880672268907563, + "loss": 0.3889, + "step": 25529 + }, + { + "epoch": 14.262569832402235, + "grad_norm": 0.40549713373184204, + "learning_rate": 0.00028803921568627453, + "loss": 0.3566, + "step": 25530 + }, + { + "epoch": 14.263128491620112, + "grad_norm": 0.9373443722724915, + "learning_rate": 0.00028801120448179274, + "loss": 0.415, + "step": 25531 + }, + { + "epoch": 14.263687150837988, + "grad_norm": 0.41337016224861145, + "learning_rate": 0.00028798319327731094, + "loss": 0.3713, + "step": 25532 + }, + { + "epoch": 14.264245810055867, + "grad_norm": 0.600817084312439, + "learning_rate": 0.0002879551820728291, + "loss": 0.4654, + "step": 25533 + }, + { + "epoch": 14.264804469273743, + "grad_norm": 4.329148769378662, + "learning_rate": 0.00028792717086834735, + "loss": 0.3514, + "step": 25534 + }, + { + "epoch": 14.26536312849162, + "grad_norm": 0.3803815543651581, + "learning_rate": 0.00028789915966386556, + "loss": 0.3725, + "step": 25535 + }, + { + "epoch": 14.265921787709496, + "grad_norm": 0.5153924822807312, + "learning_rate": 0.00028787114845938377, + "loss": 0.4011, + "step": 25536 + }, + { + "epoch": 14.266480446927375, + "grad_norm": 0.38321420550346375, + "learning_rate": 0.00028784313725490197, + "loss": 0.3501, + "step": 25537 + }, + { + "epoch": 14.267039106145251, + "grad_norm": 0.5944235324859619, + "learning_rate": 0.0002878151260504202, + "loss": 0.4261, + "step": 25538 + }, + { + "epoch": 14.267597765363128, + "grad_norm": 0.6112844944000244, + "learning_rate": 0.0002877871148459384, + "loss": 0.3842, + "step": 25539 + }, + { + "epoch": 14.268156424581006, + "grad_norm": 0.5148619413375854, + "learning_rate": 0.0002877591036414566, + "loss": 0.3592, + "step": 25540 + }, + { + "epoch": 14.268715083798883, + "grad_norm": 0.40505728125572205, + "learning_rate": 0.0002877310924369748, + "loss": 0.3265, + "step": 25541 + }, + { + "epoch": 14.26927374301676, + "grad_norm": 0.4315570294857025, + "learning_rate": 0.000287703081232493, + "loss": 0.3915, + "step": 25542 + }, + { + "epoch": 14.269832402234638, + "grad_norm": 1.0758757591247559, + "learning_rate": 0.0002876750700280112, + "loss": 0.8022, + "step": 25543 + }, + { + "epoch": 14.270391061452514, + "grad_norm": 0.3557911217212677, + "learning_rate": 0.0002876470588235294, + "loss": 0.3871, + "step": 25544 + }, + { + "epoch": 14.27094972067039, + "grad_norm": 0.6048030257225037, + "learning_rate": 0.0002876190476190476, + "loss": 0.4904, + "step": 25545 + }, + { + "epoch": 14.271508379888267, + "grad_norm": 0.5858570337295532, + "learning_rate": 0.0002875910364145659, + "loss": 0.4267, + "step": 25546 + }, + { + "epoch": 14.272067039106146, + "grad_norm": 0.5784070491790771, + "learning_rate": 0.00028756302521008403, + "loss": 0.5145, + "step": 25547 + }, + { + "epoch": 14.272625698324022, + "grad_norm": 0.42552152276039124, + "learning_rate": 0.00028753501400560224, + "loss": 0.3919, + "step": 25548 + }, + { + "epoch": 14.273184357541899, + "grad_norm": 0.4977792501449585, + "learning_rate": 0.00028750700280112044, + "loss": 0.5396, + "step": 25549 + }, + { + "epoch": 14.273743016759777, + "grad_norm": 0.3209351599216461, + "learning_rate": 0.00028747899159663865, + "loss": 0.3126, + "step": 25550 + }, + { + "epoch": 14.274301675977654, + "grad_norm": 0.5076764822006226, + "learning_rate": 0.0002874509803921569, + "loss": 0.4115, + "step": 25551 + }, + { + "epoch": 14.27486033519553, + "grad_norm": 0.4412117600440979, + "learning_rate": 0.00028742296918767506, + "loss": 0.3798, + "step": 25552 + }, + { + "epoch": 14.275418994413409, + "grad_norm": 0.5591453313827515, + "learning_rate": 0.00028739495798319327, + "loss": 0.3711, + "step": 25553 + }, + { + "epoch": 14.275977653631285, + "grad_norm": 0.41338402032852173, + "learning_rate": 0.00028736694677871153, + "loss": 0.4216, + "step": 25554 + }, + { + "epoch": 14.276536312849162, + "grad_norm": 0.8007129430770874, + "learning_rate": 0.0002873389355742297, + "loss": 0.5284, + "step": 25555 + }, + { + "epoch": 14.277094972067038, + "grad_norm": 0.3463747501373291, + "learning_rate": 0.00028731092436974794, + "loss": 0.3978, + "step": 25556 + }, + { + "epoch": 14.277653631284917, + "grad_norm": 0.4747637212276459, + "learning_rate": 0.0002872829131652661, + "loss": 0.4384, + "step": 25557 + }, + { + "epoch": 14.278212290502793, + "grad_norm": 0.3862406611442566, + "learning_rate": 0.0002872549019607843, + "loss": 0.3211, + "step": 25558 + }, + { + "epoch": 14.27877094972067, + "grad_norm": 0.47372931241989136, + "learning_rate": 0.00028722689075630256, + "loss": 0.3553, + "step": 25559 + }, + { + "epoch": 14.279329608938548, + "grad_norm": 1.2283505201339722, + "learning_rate": 0.0002871988795518207, + "loss": 0.4279, + "step": 25560 + }, + { + "epoch": 14.279888268156425, + "grad_norm": 0.7186576724052429, + "learning_rate": 0.00028717086834733897, + "loss": 0.3839, + "step": 25561 + }, + { + "epoch": 14.280446927374301, + "grad_norm": 0.49575498700141907, + "learning_rate": 0.0002871428571428572, + "loss": 0.3703, + "step": 25562 + }, + { + "epoch": 14.28100558659218, + "grad_norm": 0.9154319167137146, + "learning_rate": 0.00028711484593837533, + "loss": 0.4882, + "step": 25563 + }, + { + "epoch": 14.281564245810056, + "grad_norm": 1.0527592897415161, + "learning_rate": 0.0002870868347338936, + "loss": 0.4304, + "step": 25564 + }, + { + "epoch": 14.282122905027933, + "grad_norm": 0.6844486594200134, + "learning_rate": 0.00028705882352941174, + "loss": 0.4457, + "step": 25565 + }, + { + "epoch": 14.28268156424581, + "grad_norm": 0.5184553861618042, + "learning_rate": 0.00028703081232493, + "loss": 0.3888, + "step": 25566 + }, + { + "epoch": 14.283240223463688, + "grad_norm": 0.5193313360214233, + "learning_rate": 0.0002870028011204482, + "loss": 0.4474, + "step": 25567 + }, + { + "epoch": 14.283798882681564, + "grad_norm": 0.48808997869491577, + "learning_rate": 0.00028697478991596636, + "loss": 0.3456, + "step": 25568 + }, + { + "epoch": 14.28435754189944, + "grad_norm": 0.39619943499565125, + "learning_rate": 0.0002869467787114846, + "loss": 0.3287, + "step": 25569 + }, + { + "epoch": 14.28491620111732, + "grad_norm": 0.7508231997489929, + "learning_rate": 0.0002869187675070028, + "loss": 0.3599, + "step": 25570 + }, + { + "epoch": 14.285474860335196, + "grad_norm": 0.73044753074646, + "learning_rate": 0.00028689075630252103, + "loss": 0.4251, + "step": 25571 + }, + { + "epoch": 14.286033519553072, + "grad_norm": 1.4588490724563599, + "learning_rate": 0.00028686274509803924, + "loss": 0.4485, + "step": 25572 + }, + { + "epoch": 14.286592178770949, + "grad_norm": 0.46958646178245544, + "learning_rate": 0.0002868347338935574, + "loss": 0.3903, + "step": 25573 + }, + { + "epoch": 14.287150837988827, + "grad_norm": 0.43628770112991333, + "learning_rate": 0.00028680672268907565, + "loss": 0.5557, + "step": 25574 + }, + { + "epoch": 14.287709497206704, + "grad_norm": 0.645496129989624, + "learning_rate": 0.00028677871148459385, + "loss": 0.3772, + "step": 25575 + }, + { + "epoch": 14.28826815642458, + "grad_norm": 0.44594693183898926, + "learning_rate": 0.00028675070028011206, + "loss": 0.4574, + "step": 25576 + }, + { + "epoch": 14.288826815642459, + "grad_norm": 0.368770033121109, + "learning_rate": 0.00028672268907563027, + "loss": 0.3557, + "step": 25577 + }, + { + "epoch": 14.289385474860335, + "grad_norm": 0.521491289138794, + "learning_rate": 0.00028669467787114847, + "loss": 0.4678, + "step": 25578 + }, + { + "epoch": 14.289944134078212, + "grad_norm": 0.6467028856277466, + "learning_rate": 0.0002866666666666667, + "loss": 0.4318, + "step": 25579 + }, + { + "epoch": 14.29050279329609, + "grad_norm": 0.334685742855072, + "learning_rate": 0.0002866386554621849, + "loss": 0.3157, + "step": 25580 + }, + { + "epoch": 14.291061452513967, + "grad_norm": 0.4182282090187073, + "learning_rate": 0.0002866106442577031, + "loss": 0.4883, + "step": 25581 + }, + { + "epoch": 14.291620111731843, + "grad_norm": 0.575793445110321, + "learning_rate": 0.0002865826330532213, + "loss": 0.4264, + "step": 25582 + }, + { + "epoch": 14.29217877094972, + "grad_norm": 0.3642539978027344, + "learning_rate": 0.0002865546218487395, + "loss": 0.3273, + "step": 25583 + }, + { + "epoch": 14.292737430167598, + "grad_norm": 5.717525959014893, + "learning_rate": 0.0002865266106442577, + "loss": 0.3787, + "step": 25584 + }, + { + "epoch": 14.293296089385475, + "grad_norm": 0.7128199338912964, + "learning_rate": 0.0002864985994397759, + "loss": 0.5182, + "step": 25585 + }, + { + "epoch": 14.293854748603351, + "grad_norm": 0.45790258049964905, + "learning_rate": 0.0002864705882352942, + "loss": 0.405, + "step": 25586 + }, + { + "epoch": 14.29441340782123, + "grad_norm": 0.3644290268421173, + "learning_rate": 0.0002864425770308123, + "loss": 0.3695, + "step": 25587 + }, + { + "epoch": 14.294972067039106, + "grad_norm": 2.33027982711792, + "learning_rate": 0.00028641456582633053, + "loss": 0.3861, + "step": 25588 + }, + { + "epoch": 14.295530726256983, + "grad_norm": 0.4087968170642853, + "learning_rate": 0.00028638655462184874, + "loss": 0.4143, + "step": 25589 + }, + { + "epoch": 14.296089385474861, + "grad_norm": 0.43147313594818115, + "learning_rate": 0.00028635854341736694, + "loss": 0.3789, + "step": 25590 + }, + { + "epoch": 14.296648044692738, + "grad_norm": 0.37114840745925903, + "learning_rate": 0.0002863305322128852, + "loss": 0.3139, + "step": 25591 + }, + { + "epoch": 14.297206703910614, + "grad_norm": 0.431992769241333, + "learning_rate": 0.00028630252100840336, + "loss": 0.4288, + "step": 25592 + }, + { + "epoch": 14.297765363128491, + "grad_norm": 3.008676528930664, + "learning_rate": 0.00028627450980392156, + "loss": 0.2972, + "step": 25593 + }, + { + "epoch": 14.29832402234637, + "grad_norm": 0.6362246870994568, + "learning_rate": 0.0002862464985994398, + "loss": 0.4963, + "step": 25594 + }, + { + "epoch": 14.298882681564246, + "grad_norm": 0.8416968584060669, + "learning_rate": 0.000286218487394958, + "loss": 0.3519, + "step": 25595 + }, + { + "epoch": 14.299441340782122, + "grad_norm": 0.4806748628616333, + "learning_rate": 0.00028619047619047623, + "loss": 0.4421, + "step": 25596 + }, + { + "epoch": 14.3, + "grad_norm": 0.3764563202857971, + "learning_rate": 0.0002861624649859944, + "loss": 0.4204, + "step": 25597 + }, + { + "epoch": 14.300558659217877, + "grad_norm": 0.7359580993652344, + "learning_rate": 0.0002861344537815126, + "loss": 0.6259, + "step": 25598 + }, + { + "epoch": 14.301117318435754, + "grad_norm": 0.41040635108947754, + "learning_rate": 0.00028610644257703085, + "loss": 0.3854, + "step": 25599 + }, + { + "epoch": 14.30167597765363, + "grad_norm": 0.31745120882987976, + "learning_rate": 0.000286078431372549, + "loss": 0.392, + "step": 25600 + }, + { + "epoch": 14.302234636871509, + "grad_norm": 0.7701444029808044, + "learning_rate": 0.00028605042016806726, + "loss": 0.3671, + "step": 25601 + }, + { + "epoch": 14.302793296089385, + "grad_norm": 0.6117039322853088, + "learning_rate": 0.00028602240896358547, + "loss": 0.3445, + "step": 25602 + }, + { + "epoch": 14.303351955307262, + "grad_norm": 0.4762633442878723, + "learning_rate": 0.0002859943977591036, + "loss": 0.3223, + "step": 25603 + }, + { + "epoch": 14.30391061452514, + "grad_norm": 0.43308648467063904, + "learning_rate": 0.0002859663865546219, + "loss": 0.3964, + "step": 25604 + }, + { + "epoch": 14.304469273743017, + "grad_norm": 0.40438348054885864, + "learning_rate": 0.00028593837535014003, + "loss": 0.3421, + "step": 25605 + }, + { + "epoch": 14.305027932960893, + "grad_norm": 0.6806602478027344, + "learning_rate": 0.00028591036414565824, + "loss": 0.4041, + "step": 25606 + }, + { + "epoch": 14.305586592178772, + "grad_norm": 0.5020909309387207, + "learning_rate": 0.0002858823529411765, + "loss": 0.4181, + "step": 25607 + }, + { + "epoch": 14.306145251396648, + "grad_norm": 0.5129014849662781, + "learning_rate": 0.00028585434173669465, + "loss": 0.4666, + "step": 25608 + }, + { + "epoch": 14.306703910614525, + "grad_norm": 0.4376325011253357, + "learning_rate": 0.0002858263305322129, + "loss": 0.4673, + "step": 25609 + }, + { + "epoch": 14.307262569832401, + "grad_norm": 0.6989833116531372, + "learning_rate": 0.0002857983193277311, + "loss": 0.4142, + "step": 25610 + }, + { + "epoch": 14.30782122905028, + "grad_norm": 0.8938417434692383, + "learning_rate": 0.00028577030812324927, + "loss": 0.4017, + "step": 25611 + }, + { + "epoch": 14.308379888268156, + "grad_norm": 0.3371240794658661, + "learning_rate": 0.00028574229691876753, + "loss": 0.346, + "step": 25612 + }, + { + "epoch": 14.308938547486033, + "grad_norm": 0.57198566198349, + "learning_rate": 0.0002857142857142857, + "loss": 0.3265, + "step": 25613 + }, + { + "epoch": 14.309497206703911, + "grad_norm": 0.4454578161239624, + "learning_rate": 0.00028568627450980394, + "loss": 0.4923, + "step": 25614 + }, + { + "epoch": 14.310055865921788, + "grad_norm": 0.3879300355911255, + "learning_rate": 0.00028565826330532215, + "loss": 0.3488, + "step": 25615 + }, + { + "epoch": 14.310614525139664, + "grad_norm": 0.4444729685783386, + "learning_rate": 0.0002856302521008403, + "loss": 0.4609, + "step": 25616 + }, + { + "epoch": 14.311173184357543, + "grad_norm": 0.8942714929580688, + "learning_rate": 0.00028560224089635856, + "loss": 0.4281, + "step": 25617 + }, + { + "epoch": 14.31173184357542, + "grad_norm": 0.4284249544143677, + "learning_rate": 0.00028557422969187677, + "loss": 0.3466, + "step": 25618 + }, + { + "epoch": 14.312290502793296, + "grad_norm": 0.6086063385009766, + "learning_rate": 0.00028554621848739497, + "loss": 0.4871, + "step": 25619 + }, + { + "epoch": 14.312849162011172, + "grad_norm": 1.3937976360321045, + "learning_rate": 0.0002855182072829132, + "loss": 0.3765, + "step": 25620 + }, + { + "epoch": 14.31340782122905, + "grad_norm": 0.45092374086380005, + "learning_rate": 0.00028549019607843133, + "loss": 0.3977, + "step": 25621 + }, + { + "epoch": 14.313966480446927, + "grad_norm": 0.45951715111732483, + "learning_rate": 0.0002854621848739496, + "loss": 0.416, + "step": 25622 + }, + { + "epoch": 14.314525139664804, + "grad_norm": 0.9599772095680237, + "learning_rate": 0.0002854341736694678, + "loss": 0.4227, + "step": 25623 + }, + { + "epoch": 14.315083798882682, + "grad_norm": 0.7276136875152588, + "learning_rate": 0.000285406162464986, + "loss": 0.3497, + "step": 25624 + }, + { + "epoch": 14.315642458100559, + "grad_norm": 1.7187039852142334, + "learning_rate": 0.0002853781512605042, + "loss": 0.4396, + "step": 25625 + }, + { + "epoch": 14.316201117318435, + "grad_norm": 0.46159234642982483, + "learning_rate": 0.0002853501400560224, + "loss": 0.4092, + "step": 25626 + }, + { + "epoch": 14.316759776536314, + "grad_norm": 0.514132022857666, + "learning_rate": 0.0002853221288515406, + "loss": 0.4296, + "step": 25627 + }, + { + "epoch": 14.31731843575419, + "grad_norm": 0.47145363688468933, + "learning_rate": 0.0002852941176470588, + "loss": 0.3638, + "step": 25628 + }, + { + "epoch": 14.317877094972067, + "grad_norm": 0.40994134545326233, + "learning_rate": 0.00028526610644257703, + "loss": 0.3472, + "step": 25629 + }, + { + "epoch": 14.318435754189943, + "grad_norm": 0.7872180938720703, + "learning_rate": 0.00028523809523809524, + "loss": 0.4858, + "step": 25630 + }, + { + "epoch": 14.318994413407822, + "grad_norm": 0.5457873940467834, + "learning_rate": 0.00028521008403361344, + "loss": 0.3691, + "step": 25631 + }, + { + "epoch": 14.319553072625698, + "grad_norm": 0.3201582133769989, + "learning_rate": 0.00028518207282913165, + "loss": 0.3506, + "step": 25632 + }, + { + "epoch": 14.320111731843575, + "grad_norm": 0.6611894369125366, + "learning_rate": 0.00028515406162464986, + "loss": 0.4559, + "step": 25633 + }, + { + "epoch": 14.320670391061453, + "grad_norm": 0.3534206449985504, + "learning_rate": 0.0002851260504201681, + "loss": 0.3857, + "step": 25634 + }, + { + "epoch": 14.32122905027933, + "grad_norm": 0.6890993118286133, + "learning_rate": 0.00028509803921568627, + "loss": 0.4612, + "step": 25635 + }, + { + "epoch": 14.321787709497206, + "grad_norm": 0.6053663492202759, + "learning_rate": 0.0002850700280112045, + "loss": 0.4293, + "step": 25636 + }, + { + "epoch": 14.322346368715085, + "grad_norm": 0.8205382823944092, + "learning_rate": 0.0002850420168067227, + "loss": 0.4353, + "step": 25637 + }, + { + "epoch": 14.322905027932961, + "grad_norm": 0.4678000211715698, + "learning_rate": 0.0002850140056022409, + "loss": 0.4414, + "step": 25638 + }, + { + "epoch": 14.323463687150838, + "grad_norm": 0.9820969700813293, + "learning_rate": 0.00028498599439775915, + "loss": 0.3453, + "step": 25639 + }, + { + "epoch": 14.324022346368714, + "grad_norm": 0.6170769929885864, + "learning_rate": 0.0002849579831932773, + "loss": 0.4082, + "step": 25640 + }, + { + "epoch": 14.324581005586593, + "grad_norm": 0.3387000262737274, + "learning_rate": 0.0002849299719887955, + "loss": 0.343, + "step": 25641 + }, + { + "epoch": 14.32513966480447, + "grad_norm": 0.41351622343063354, + "learning_rate": 0.00028490196078431376, + "loss": 0.4604, + "step": 25642 + }, + { + "epoch": 14.325698324022346, + "grad_norm": 0.4139968454837799, + "learning_rate": 0.0002848739495798319, + "loss": 0.3738, + "step": 25643 + }, + { + "epoch": 14.326256983240224, + "grad_norm": 0.5786769986152649, + "learning_rate": 0.0002848459383753502, + "loss": 0.3749, + "step": 25644 + }, + { + "epoch": 14.3268156424581, + "grad_norm": 1.082821011543274, + "learning_rate": 0.00028481792717086833, + "loss": 0.5075, + "step": 25645 + }, + { + "epoch": 14.327374301675977, + "grad_norm": 0.8283697366714478, + "learning_rate": 0.00028478991596638653, + "loss": 0.4805, + "step": 25646 + }, + { + "epoch": 14.327932960893854, + "grad_norm": 0.4066693186759949, + "learning_rate": 0.0002847619047619048, + "loss": 0.5018, + "step": 25647 + }, + { + "epoch": 14.328491620111732, + "grad_norm": 0.43121862411499023, + "learning_rate": 0.00028473389355742295, + "loss": 0.371, + "step": 25648 + }, + { + "epoch": 14.329050279329609, + "grad_norm": 0.6932399272918701, + "learning_rate": 0.0002847058823529412, + "loss": 0.3533, + "step": 25649 + }, + { + "epoch": 14.329608938547485, + "grad_norm": 0.62120121717453, + "learning_rate": 0.0002846778711484594, + "loss": 0.4223, + "step": 25650 + }, + { + "epoch": 14.330167597765364, + "grad_norm": 1.1668896675109863, + "learning_rate": 0.00028464985994397756, + "loss": 0.4433, + "step": 25651 + }, + { + "epoch": 14.33072625698324, + "grad_norm": 0.7521544098854065, + "learning_rate": 0.0002846218487394958, + "loss": 0.4454, + "step": 25652 + }, + { + "epoch": 14.331284916201117, + "grad_norm": 0.4208326041698456, + "learning_rate": 0.000284593837535014, + "loss": 0.501, + "step": 25653 + }, + { + "epoch": 14.331843575418995, + "grad_norm": 0.5409586429595947, + "learning_rate": 0.00028456582633053224, + "loss": 0.4698, + "step": 25654 + }, + { + "epoch": 14.332402234636872, + "grad_norm": 0.597471296787262, + "learning_rate": 0.00028453781512605044, + "loss": 0.3547, + "step": 25655 + }, + { + "epoch": 14.332960893854748, + "grad_norm": 0.7143123149871826, + "learning_rate": 0.0002845098039215686, + "loss": 0.3982, + "step": 25656 + }, + { + "epoch": 14.333519553072625, + "grad_norm": 0.4022220969200134, + "learning_rate": 0.00028448179271708685, + "loss": 0.4146, + "step": 25657 + }, + { + "epoch": 14.334078212290503, + "grad_norm": 0.4564114511013031, + "learning_rate": 0.00028445378151260506, + "loss": 0.4246, + "step": 25658 + }, + { + "epoch": 14.33463687150838, + "grad_norm": 0.5670762658119202, + "learning_rate": 0.00028442577030812327, + "loss": 0.5039, + "step": 25659 + }, + { + "epoch": 14.335195530726256, + "grad_norm": 0.5910559892654419, + "learning_rate": 0.00028439775910364147, + "loss": 0.3681, + "step": 25660 + }, + { + "epoch": 14.335754189944135, + "grad_norm": 0.6039672493934631, + "learning_rate": 0.0002843697478991596, + "loss": 0.5049, + "step": 25661 + }, + { + "epoch": 14.336312849162011, + "grad_norm": 0.3645922541618347, + "learning_rate": 0.0002843417366946779, + "loss": 0.3103, + "step": 25662 + }, + { + "epoch": 14.336871508379888, + "grad_norm": 0.3067604601383209, + "learning_rate": 0.0002843137254901961, + "loss": 0.3533, + "step": 25663 + }, + { + "epoch": 14.337430167597766, + "grad_norm": 0.46452659368515015, + "learning_rate": 0.0002842857142857143, + "loss": 0.3276, + "step": 25664 + }, + { + "epoch": 14.337988826815643, + "grad_norm": 0.44438958168029785, + "learning_rate": 0.0002842577030812325, + "loss": 0.426, + "step": 25665 + }, + { + "epoch": 14.33854748603352, + "grad_norm": 0.492631196975708, + "learning_rate": 0.0002842296918767507, + "loss": 0.394, + "step": 25666 + }, + { + "epoch": 14.339106145251396, + "grad_norm": 0.7714545726776123, + "learning_rate": 0.0002842016806722689, + "loss": 0.5024, + "step": 25667 + }, + { + "epoch": 14.339664804469274, + "grad_norm": 0.4022531509399414, + "learning_rate": 0.0002841736694677871, + "loss": 0.4186, + "step": 25668 + }, + { + "epoch": 14.34022346368715, + "grad_norm": 5.050975322723389, + "learning_rate": 0.0002841456582633053, + "loss": 0.4058, + "step": 25669 + }, + { + "epoch": 14.340782122905027, + "grad_norm": 0.37994644045829773, + "learning_rate": 0.00028411764705882353, + "loss": 0.4224, + "step": 25670 + }, + { + "epoch": 14.341340782122906, + "grad_norm": 0.9045817852020264, + "learning_rate": 0.00028408963585434174, + "loss": 0.353, + "step": 25671 + }, + { + "epoch": 14.341899441340782, + "grad_norm": 0.49429816007614136, + "learning_rate": 0.00028406162464985994, + "loss": 0.4733, + "step": 25672 + }, + { + "epoch": 14.342458100558659, + "grad_norm": 0.3610917627811432, + "learning_rate": 0.00028403361344537815, + "loss": 0.2923, + "step": 25673 + }, + { + "epoch": 14.343016759776535, + "grad_norm": 2.681816816329956, + "learning_rate": 0.0002840056022408964, + "loss": 0.3733, + "step": 25674 + }, + { + "epoch": 14.343575418994414, + "grad_norm": 0.5871475338935852, + "learning_rate": 0.00028397759103641456, + "loss": 0.4014, + "step": 25675 + }, + { + "epoch": 14.34413407821229, + "grad_norm": 0.44170141220092773, + "learning_rate": 0.00028394957983193277, + "loss": 0.4157, + "step": 25676 + }, + { + "epoch": 14.344692737430167, + "grad_norm": 0.8048399090766907, + "learning_rate": 0.000283921568627451, + "loss": 0.4336, + "step": 25677 + }, + { + "epoch": 14.345251396648045, + "grad_norm": 0.5510793328285217, + "learning_rate": 0.0002838935574229692, + "loss": 0.4143, + "step": 25678 + }, + { + "epoch": 14.345810055865922, + "grad_norm": 0.6959816813468933, + "learning_rate": 0.00028386554621848744, + "loss": 0.3851, + "step": 25679 + }, + { + "epoch": 14.346368715083798, + "grad_norm": 0.46577441692352295, + "learning_rate": 0.0002838375350140056, + "loss": 0.2991, + "step": 25680 + }, + { + "epoch": 14.346927374301677, + "grad_norm": 0.44089773297309875, + "learning_rate": 0.0002838095238095238, + "loss": 0.4154, + "step": 25681 + }, + { + "epoch": 14.347486033519553, + "grad_norm": 0.5041240453720093, + "learning_rate": 0.00028378151260504206, + "loss": 0.414, + "step": 25682 + }, + { + "epoch": 14.34804469273743, + "grad_norm": 0.5451045632362366, + "learning_rate": 0.0002837535014005602, + "loss": 0.4117, + "step": 25683 + }, + { + "epoch": 14.348603351955306, + "grad_norm": 0.5523211359977722, + "learning_rate": 0.00028372549019607847, + "loss": 0.4144, + "step": 25684 + }, + { + "epoch": 14.349162011173185, + "grad_norm": 0.7593986392021179, + "learning_rate": 0.0002836974789915966, + "loss": 0.3524, + "step": 25685 + }, + { + "epoch": 14.349720670391061, + "grad_norm": 1.9889676570892334, + "learning_rate": 0.00028366946778711483, + "loss": 0.4046, + "step": 25686 + }, + { + "epoch": 14.350279329608938, + "grad_norm": 0.48265495896339417, + "learning_rate": 0.0002836414565826331, + "loss": 0.5398, + "step": 25687 + }, + { + "epoch": 14.350837988826816, + "grad_norm": 0.43482232093811035, + "learning_rate": 0.00028361344537815124, + "loss": 0.4597, + "step": 25688 + }, + { + "epoch": 14.351396648044693, + "grad_norm": 0.3500218689441681, + "learning_rate": 0.0002835854341736695, + "loss": 0.3708, + "step": 25689 + }, + { + "epoch": 14.35195530726257, + "grad_norm": 0.5408735275268555, + "learning_rate": 0.0002835574229691877, + "loss": 0.4819, + "step": 25690 + }, + { + "epoch": 14.352513966480448, + "grad_norm": 0.6805570721626282, + "learning_rate": 0.00028352941176470586, + "loss": 0.4361, + "step": 25691 + }, + { + "epoch": 14.353072625698324, + "grad_norm": 0.7888262271881104, + "learning_rate": 0.0002835014005602241, + "loss": 0.4249, + "step": 25692 + }, + { + "epoch": 14.3536312849162, + "grad_norm": 0.32939043641090393, + "learning_rate": 0.00028347338935574227, + "loss": 0.3919, + "step": 25693 + }, + { + "epoch": 14.354189944134077, + "grad_norm": 0.6589708924293518, + "learning_rate": 0.00028344537815126053, + "loss": 0.4001, + "step": 25694 + }, + { + "epoch": 14.354748603351956, + "grad_norm": 0.39256951212882996, + "learning_rate": 0.00028341736694677874, + "loss": 0.3093, + "step": 25695 + }, + { + "epoch": 14.355307262569832, + "grad_norm": 0.31894832849502563, + "learning_rate": 0.0002833893557422969, + "loss": 0.319, + "step": 25696 + }, + { + "epoch": 14.355865921787709, + "grad_norm": 0.4189417362213135, + "learning_rate": 0.00028336134453781515, + "loss": 0.4303, + "step": 25697 + }, + { + "epoch": 14.356424581005587, + "grad_norm": 0.49348950386047363, + "learning_rate": 0.00028333333333333335, + "loss": 0.4588, + "step": 25698 + }, + { + "epoch": 14.356983240223464, + "grad_norm": 0.380424439907074, + "learning_rate": 0.00028330532212885156, + "loss": 0.3469, + "step": 25699 + }, + { + "epoch": 14.35754189944134, + "grad_norm": 0.4521121680736542, + "learning_rate": 0.00028327731092436977, + "loss": 0.5093, + "step": 25700 + }, + { + "epoch": 14.358100558659217, + "grad_norm": 0.38366076350212097, + "learning_rate": 0.0002832492997198879, + "loss": 0.3765, + "step": 25701 + }, + { + "epoch": 14.358659217877095, + "grad_norm": 0.41023173928260803, + "learning_rate": 0.0002832212885154062, + "loss": 0.4308, + "step": 25702 + }, + { + "epoch": 14.359217877094972, + "grad_norm": 0.8428417444229126, + "learning_rate": 0.0002831932773109244, + "loss": 0.3418, + "step": 25703 + }, + { + "epoch": 14.359776536312848, + "grad_norm": 3.7292654514312744, + "learning_rate": 0.0002831652661064426, + "loss": 0.481, + "step": 25704 + }, + { + "epoch": 14.360335195530727, + "grad_norm": 0.7049896717071533, + "learning_rate": 0.0002831372549019608, + "loss": 0.3586, + "step": 25705 + }, + { + "epoch": 14.360893854748603, + "grad_norm": 0.5074928998947144, + "learning_rate": 0.000283109243697479, + "loss": 0.4894, + "step": 25706 + }, + { + "epoch": 14.36145251396648, + "grad_norm": 0.4286574721336365, + "learning_rate": 0.0002830812324929972, + "loss": 0.3388, + "step": 25707 + }, + { + "epoch": 14.362011173184358, + "grad_norm": 0.34454187750816345, + "learning_rate": 0.0002830532212885154, + "loss": 0.347, + "step": 25708 + }, + { + "epoch": 14.362569832402235, + "grad_norm": 0.7793242335319519, + "learning_rate": 0.0002830252100840336, + "loss": 0.3957, + "step": 25709 + }, + { + "epoch": 14.363128491620111, + "grad_norm": 0.41481536626815796, + "learning_rate": 0.0002829971988795518, + "loss": 0.4063, + "step": 25710 + }, + { + "epoch": 14.363687150837988, + "grad_norm": 0.5770106315612793, + "learning_rate": 0.00028296918767507003, + "loss": 0.3488, + "step": 25711 + }, + { + "epoch": 14.364245810055866, + "grad_norm": 0.4796459972858429, + "learning_rate": 0.00028294117647058824, + "loss": 0.4223, + "step": 25712 + }, + { + "epoch": 14.364804469273743, + "grad_norm": 0.4658036231994629, + "learning_rate": 0.00028291316526610644, + "loss": 0.3649, + "step": 25713 + }, + { + "epoch": 14.36536312849162, + "grad_norm": 0.4907352030277252, + "learning_rate": 0.0002828851540616247, + "loss": 0.4932, + "step": 25714 + }, + { + "epoch": 14.365921787709498, + "grad_norm": 0.43476834893226624, + "learning_rate": 0.00028285714285714286, + "loss": 0.5205, + "step": 25715 + }, + { + "epoch": 14.366480446927374, + "grad_norm": 0.3576247990131378, + "learning_rate": 0.00028282913165266106, + "loss": 0.3031, + "step": 25716 + }, + { + "epoch": 14.367039106145251, + "grad_norm": 0.405793696641922, + "learning_rate": 0.00028280112044817927, + "loss": 0.3406, + "step": 25717 + }, + { + "epoch": 14.36759776536313, + "grad_norm": 0.6483950018882751, + "learning_rate": 0.0002827731092436975, + "loss": 0.4059, + "step": 25718 + }, + { + "epoch": 14.368156424581006, + "grad_norm": 0.3726096749305725, + "learning_rate": 0.0002827450980392157, + "loss": 0.3593, + "step": 25719 + }, + { + "epoch": 14.368715083798882, + "grad_norm": 0.36157068610191345, + "learning_rate": 0.0002827170868347339, + "loss": 0.3736, + "step": 25720 + }, + { + "epoch": 14.369273743016759, + "grad_norm": 0.41069304943084717, + "learning_rate": 0.0002826890756302521, + "loss": 0.3953, + "step": 25721 + }, + { + "epoch": 14.369832402234637, + "grad_norm": 0.5591342449188232, + "learning_rate": 0.00028266106442577035, + "loss": 0.2942, + "step": 25722 + }, + { + "epoch": 14.370391061452514, + "grad_norm": 0.7573222517967224, + "learning_rate": 0.0002826330532212885, + "loss": 0.4293, + "step": 25723 + }, + { + "epoch": 14.37094972067039, + "grad_norm": 0.5397464632987976, + "learning_rate": 0.0002826050420168067, + "loss": 0.5697, + "step": 25724 + }, + { + "epoch": 14.371508379888269, + "grad_norm": 0.5947046875953674, + "learning_rate": 0.0002825770308123249, + "loss": 0.4275, + "step": 25725 + }, + { + "epoch": 14.372067039106145, + "grad_norm": 0.576654314994812, + "learning_rate": 0.0002825490196078431, + "loss": 0.4647, + "step": 25726 + }, + { + "epoch": 14.372625698324022, + "grad_norm": 0.4334881603717804, + "learning_rate": 0.0002825210084033614, + "loss": 0.4593, + "step": 25727 + }, + { + "epoch": 14.3731843575419, + "grad_norm": 0.37201419472694397, + "learning_rate": 0.00028249299719887953, + "loss": 0.3517, + "step": 25728 + }, + { + "epoch": 14.373743016759777, + "grad_norm": 0.8242093920707703, + "learning_rate": 0.00028246498599439774, + "loss": 0.3823, + "step": 25729 + }, + { + "epoch": 14.374301675977653, + "grad_norm": 0.7555853128433228, + "learning_rate": 0.000282436974789916, + "loss": 0.4939, + "step": 25730 + }, + { + "epoch": 14.37486033519553, + "grad_norm": 1.2133944034576416, + "learning_rate": 0.00028240896358543415, + "loss": 0.4774, + "step": 25731 + }, + { + "epoch": 14.375418994413408, + "grad_norm": 0.3893866240978241, + "learning_rate": 0.0002823809523809524, + "loss": 0.4104, + "step": 25732 + }, + { + "epoch": 14.375977653631285, + "grad_norm": 2.329319715499878, + "learning_rate": 0.00028235294117647056, + "loss": 0.4933, + "step": 25733 + }, + { + "epoch": 14.376536312849161, + "grad_norm": 5.495023727416992, + "learning_rate": 0.00028232492997198877, + "loss": 0.4191, + "step": 25734 + }, + { + "epoch": 14.37709497206704, + "grad_norm": 0.47597333788871765, + "learning_rate": 0.00028229691876750703, + "loss": 0.5126, + "step": 25735 + }, + { + "epoch": 14.377653631284916, + "grad_norm": 0.34918737411499023, + "learning_rate": 0.0002822689075630252, + "loss": 0.3895, + "step": 25736 + }, + { + "epoch": 14.378212290502793, + "grad_norm": 3.2512359619140625, + "learning_rate": 0.00028224089635854344, + "loss": 0.5718, + "step": 25737 + }, + { + "epoch": 14.378770949720671, + "grad_norm": 0.6092514395713806, + "learning_rate": 0.00028221288515406165, + "loss": 0.4242, + "step": 25738 + }, + { + "epoch": 14.379329608938548, + "grad_norm": 0.41264331340789795, + "learning_rate": 0.0002821848739495798, + "loss": 0.4264, + "step": 25739 + }, + { + "epoch": 14.379888268156424, + "grad_norm": 0.6225352883338928, + "learning_rate": 0.00028215686274509806, + "loss": 0.3416, + "step": 25740 + }, + { + "epoch": 14.380446927374301, + "grad_norm": 0.3158023953437805, + "learning_rate": 0.0002821288515406162, + "loss": 0.4095, + "step": 25741 + }, + { + "epoch": 14.38100558659218, + "grad_norm": 0.5627813935279846, + "learning_rate": 0.00028210084033613447, + "loss": 0.4102, + "step": 25742 + }, + { + "epoch": 14.381564245810056, + "grad_norm": 0.8792022466659546, + "learning_rate": 0.0002820728291316527, + "loss": 0.3798, + "step": 25743 + }, + { + "epoch": 14.382122905027932, + "grad_norm": 0.40875422954559326, + "learning_rate": 0.00028204481792717083, + "loss": 0.4946, + "step": 25744 + }, + { + "epoch": 14.38268156424581, + "grad_norm": 0.4875701665878296, + "learning_rate": 0.0002820168067226891, + "loss": 0.4873, + "step": 25745 + }, + { + "epoch": 14.383240223463687, + "grad_norm": 0.5964968204498291, + "learning_rate": 0.0002819887955182073, + "loss": 0.3611, + "step": 25746 + }, + { + "epoch": 14.383798882681564, + "grad_norm": 0.5391162633895874, + "learning_rate": 0.0002819607843137255, + "loss": 0.469, + "step": 25747 + }, + { + "epoch": 14.38435754189944, + "grad_norm": 0.49238142371177673, + "learning_rate": 0.0002819327731092437, + "loss": 0.4111, + "step": 25748 + }, + { + "epoch": 14.384916201117319, + "grad_norm": 0.42598405480384827, + "learning_rate": 0.00028190476190476186, + "loss": 0.4641, + "step": 25749 + }, + { + "epoch": 14.385474860335195, + "grad_norm": 0.5593146085739136, + "learning_rate": 0.0002818767507002801, + "loss": 0.4146, + "step": 25750 + }, + { + "epoch": 14.386033519553072, + "grad_norm": 0.6897799372673035, + "learning_rate": 0.0002818487394957983, + "loss": 0.4757, + "step": 25751 + }, + { + "epoch": 14.38659217877095, + "grad_norm": 0.33926549553871155, + "learning_rate": 0.00028182072829131653, + "loss": 0.3405, + "step": 25752 + }, + { + "epoch": 14.387150837988827, + "grad_norm": 0.38387221097946167, + "learning_rate": 0.00028179271708683474, + "loss": 0.3435, + "step": 25753 + }, + { + "epoch": 14.387709497206703, + "grad_norm": 0.8094502687454224, + "learning_rate": 0.00028176470588235294, + "loss": 0.5851, + "step": 25754 + }, + { + "epoch": 14.388268156424582, + "grad_norm": 1.2071977853775024, + "learning_rate": 0.00028173669467787115, + "loss": 0.4762, + "step": 25755 + }, + { + "epoch": 14.388826815642458, + "grad_norm": 0.3796028792858124, + "learning_rate": 0.00028170868347338936, + "loss": 0.4228, + "step": 25756 + }, + { + "epoch": 14.389385474860335, + "grad_norm": 0.6941390633583069, + "learning_rate": 0.0002816806722689076, + "loss": 0.4467, + "step": 25757 + }, + { + "epoch": 14.389944134078211, + "grad_norm": 0.6343345642089844, + "learning_rate": 0.00028165266106442577, + "loss": 0.4957, + "step": 25758 + }, + { + "epoch": 14.39050279329609, + "grad_norm": 0.5664315223693848, + "learning_rate": 0.000281624649859944, + "loss": 0.4144, + "step": 25759 + }, + { + "epoch": 14.391061452513966, + "grad_norm": 0.4608016908168793, + "learning_rate": 0.0002815966386554622, + "loss": 0.3555, + "step": 25760 + }, + { + "epoch": 14.391620111731843, + "grad_norm": 1.5384972095489502, + "learning_rate": 0.0002815686274509804, + "loss": 0.462, + "step": 25761 + }, + { + "epoch": 14.392178770949721, + "grad_norm": 0.4720909595489502, + "learning_rate": 0.00028154061624649865, + "loss": 0.4471, + "step": 25762 + }, + { + "epoch": 14.392737430167598, + "grad_norm": 0.36641809344291687, + "learning_rate": 0.0002815126050420168, + "loss": 0.2873, + "step": 25763 + }, + { + "epoch": 14.393296089385474, + "grad_norm": 0.4277470111846924, + "learning_rate": 0.000281484593837535, + "loss": 0.351, + "step": 25764 + }, + { + "epoch": 14.393854748603353, + "grad_norm": 0.5178866982460022, + "learning_rate": 0.00028145658263305326, + "loss": 0.4064, + "step": 25765 + }, + { + "epoch": 14.39441340782123, + "grad_norm": 0.532721996307373, + "learning_rate": 0.0002814285714285714, + "loss": 0.4609, + "step": 25766 + }, + { + "epoch": 14.394972067039106, + "grad_norm": 0.3775053024291992, + "learning_rate": 0.0002814005602240897, + "loss": 0.3408, + "step": 25767 + }, + { + "epoch": 14.395530726256982, + "grad_norm": 0.48296311497688293, + "learning_rate": 0.00028137254901960783, + "loss": 0.513, + "step": 25768 + }, + { + "epoch": 14.39608938547486, + "grad_norm": 0.6062502264976501, + "learning_rate": 0.00028134453781512603, + "loss": 0.5569, + "step": 25769 + }, + { + "epoch": 14.396648044692737, + "grad_norm": 1.783350944519043, + "learning_rate": 0.0002813165266106443, + "loss": 0.4486, + "step": 25770 + }, + { + "epoch": 14.397206703910614, + "grad_norm": 2.1369588375091553, + "learning_rate": 0.00028128851540616245, + "loss": 0.4258, + "step": 25771 + }, + { + "epoch": 14.397765363128492, + "grad_norm": 0.665153980255127, + "learning_rate": 0.0002812605042016807, + "loss": 0.3787, + "step": 25772 + }, + { + "epoch": 14.398324022346369, + "grad_norm": 0.6627637147903442, + "learning_rate": 0.0002812324929971989, + "loss": 0.5166, + "step": 25773 + }, + { + "epoch": 14.398882681564245, + "grad_norm": 0.4719976484775543, + "learning_rate": 0.00028120448179271706, + "loss": 0.4381, + "step": 25774 + }, + { + "epoch": 14.399441340782122, + "grad_norm": 0.40011176466941833, + "learning_rate": 0.0002811764705882353, + "loss": 0.4025, + "step": 25775 + }, + { + "epoch": 14.4, + "grad_norm": 0.3883362412452698, + "learning_rate": 0.0002811484593837535, + "loss": 0.4517, + "step": 25776 + }, + { + "epoch": 14.400558659217877, + "grad_norm": 0.5507895946502686, + "learning_rate": 0.00028112044817927174, + "loss": 0.3458, + "step": 25777 + }, + { + "epoch": 14.401117318435753, + "grad_norm": 0.39327993988990784, + "learning_rate": 0.00028109243697478994, + "loss": 0.4834, + "step": 25778 + }, + { + "epoch": 14.401675977653632, + "grad_norm": 0.4275578558444977, + "learning_rate": 0.0002810644257703081, + "loss": 0.3585, + "step": 25779 + }, + { + "epoch": 14.402234636871508, + "grad_norm": 0.7927836179733276, + "learning_rate": 0.00028103641456582635, + "loss": 0.4295, + "step": 25780 + }, + { + "epoch": 14.402793296089385, + "grad_norm": 0.6239094138145447, + "learning_rate": 0.00028100840336134456, + "loss": 0.2793, + "step": 25781 + }, + { + "epoch": 14.403351955307263, + "grad_norm": 0.5948808789253235, + "learning_rate": 0.00028098039215686277, + "loss": 0.6748, + "step": 25782 + }, + { + "epoch": 14.40391061452514, + "grad_norm": 0.579407274723053, + "learning_rate": 0.00028095238095238097, + "loss": 0.4861, + "step": 25783 + }, + { + "epoch": 14.404469273743016, + "grad_norm": 0.4269624650478363, + "learning_rate": 0.0002809243697478991, + "loss": 0.4071, + "step": 25784 + }, + { + "epoch": 14.405027932960893, + "grad_norm": 0.4624919295310974, + "learning_rate": 0.0002808963585434174, + "loss": 0.4408, + "step": 25785 + }, + { + "epoch": 14.405586592178771, + "grad_norm": 0.3998176157474518, + "learning_rate": 0.0002808683473389356, + "loss": 0.3213, + "step": 25786 + }, + { + "epoch": 14.406145251396648, + "grad_norm": 0.5063607692718506, + "learning_rate": 0.0002808403361344538, + "loss": 0.3633, + "step": 25787 + }, + { + "epoch": 14.406703910614524, + "grad_norm": 2.788221836090088, + "learning_rate": 0.000280812324929972, + "loss": 0.3837, + "step": 25788 + }, + { + "epoch": 14.407262569832403, + "grad_norm": 0.8124858140945435, + "learning_rate": 0.0002807843137254902, + "loss": 0.559, + "step": 25789 + }, + { + "epoch": 14.40782122905028, + "grad_norm": 0.4693353772163391, + "learning_rate": 0.0002807563025210084, + "loss": 0.4196, + "step": 25790 + }, + { + "epoch": 14.408379888268156, + "grad_norm": 1.7660160064697266, + "learning_rate": 0.0002807282913165266, + "loss": 0.3712, + "step": 25791 + }, + { + "epoch": 14.408938547486034, + "grad_norm": 0.8365939259529114, + "learning_rate": 0.0002807002801120448, + "loss": 0.5345, + "step": 25792 + }, + { + "epoch": 14.40949720670391, + "grad_norm": 0.37916257977485657, + "learning_rate": 0.00028067226890756303, + "loss": 0.3536, + "step": 25793 + }, + { + "epoch": 14.410055865921787, + "grad_norm": 0.5092312693595886, + "learning_rate": 0.00028064425770308124, + "loss": 0.4232, + "step": 25794 + }, + { + "epoch": 14.410614525139664, + "grad_norm": 0.6536540389060974, + "learning_rate": 0.00028061624649859944, + "loss": 0.358, + "step": 25795 + }, + { + "epoch": 14.411173184357542, + "grad_norm": 4.4830546379089355, + "learning_rate": 0.00028058823529411765, + "loss": 0.3305, + "step": 25796 + }, + { + "epoch": 14.411731843575419, + "grad_norm": 0.4286189377307892, + "learning_rate": 0.0002805602240896359, + "loss": 0.3636, + "step": 25797 + }, + { + "epoch": 14.412290502793295, + "grad_norm": 0.8312407732009888, + "learning_rate": 0.00028053221288515406, + "loss": 0.4554, + "step": 25798 + }, + { + "epoch": 14.412849162011174, + "grad_norm": 0.5654194951057434, + "learning_rate": 0.00028050420168067227, + "loss": 0.4181, + "step": 25799 + }, + { + "epoch": 14.41340782122905, + "grad_norm": 0.5863842964172363, + "learning_rate": 0.0002804761904761905, + "loss": 0.4628, + "step": 25800 + }, + { + "epoch": 14.413966480446927, + "grad_norm": 0.3270134925842285, + "learning_rate": 0.0002804481792717087, + "loss": 0.2687, + "step": 25801 + }, + { + "epoch": 14.414525139664805, + "grad_norm": 0.8150222897529602, + "learning_rate": 0.00028042016806722694, + "loss": 0.4096, + "step": 25802 + }, + { + "epoch": 14.415083798882682, + "grad_norm": 0.44506585597991943, + "learning_rate": 0.0002803921568627451, + "loss": 0.5477, + "step": 25803 + }, + { + "epoch": 14.415642458100558, + "grad_norm": 0.3880547881126404, + "learning_rate": 0.0002803641456582633, + "loss": 0.3942, + "step": 25804 + }, + { + "epoch": 14.416201117318435, + "grad_norm": 0.5805116295814514, + "learning_rate": 0.00028033613445378156, + "loss": 0.5443, + "step": 25805 + }, + { + "epoch": 14.416759776536313, + "grad_norm": 0.4237102270126343, + "learning_rate": 0.0002803081232492997, + "loss": 0.4433, + "step": 25806 + }, + { + "epoch": 14.41731843575419, + "grad_norm": 0.5702316761016846, + "learning_rate": 0.00028028011204481797, + "loss": 0.4432, + "step": 25807 + }, + { + "epoch": 14.417877094972066, + "grad_norm": 0.4755604565143585, + "learning_rate": 0.0002802521008403361, + "loss": 0.4003, + "step": 25808 + }, + { + "epoch": 14.418435754189945, + "grad_norm": 0.41497841477394104, + "learning_rate": 0.00028022408963585433, + "loss": 0.4138, + "step": 25809 + }, + { + "epoch": 14.418994413407821, + "grad_norm": 0.4244304299354553, + "learning_rate": 0.0002801960784313726, + "loss": 0.3975, + "step": 25810 + }, + { + "epoch": 14.419553072625698, + "grad_norm": 0.4146798253059387, + "learning_rate": 0.00028016806722689074, + "loss": 0.3978, + "step": 25811 + }, + { + "epoch": 14.420111731843576, + "grad_norm": 0.5190150737762451, + "learning_rate": 0.000280140056022409, + "loss": 0.4463, + "step": 25812 + }, + { + "epoch": 14.420670391061453, + "grad_norm": 0.6149578094482422, + "learning_rate": 0.0002801120448179272, + "loss": 0.5793, + "step": 25813 + }, + { + "epoch": 14.42122905027933, + "grad_norm": 0.6687585115432739, + "learning_rate": 0.00028008403361344536, + "loss": 0.3749, + "step": 25814 + }, + { + "epoch": 14.421787709497206, + "grad_norm": 1.8690475225448608, + "learning_rate": 0.0002800560224089636, + "loss": 0.4547, + "step": 25815 + }, + { + "epoch": 14.422346368715084, + "grad_norm": 0.42321157455444336, + "learning_rate": 0.00028002801120448177, + "loss": 0.4473, + "step": 25816 + }, + { + "epoch": 14.422905027932961, + "grad_norm": 2.1452319622039795, + "learning_rate": 0.00028000000000000003, + "loss": 0.3988, + "step": 25817 + }, + { + "epoch": 14.423463687150837, + "grad_norm": 0.5502938628196716, + "learning_rate": 0.00027997198879551824, + "loss": 0.4806, + "step": 25818 + }, + { + "epoch": 14.424022346368716, + "grad_norm": 0.49775195121765137, + "learning_rate": 0.0002799439775910364, + "loss": 0.4999, + "step": 25819 + }, + { + "epoch": 14.424581005586592, + "grad_norm": 0.4695853292942047, + "learning_rate": 0.00027991596638655465, + "loss": 0.3727, + "step": 25820 + }, + { + "epoch": 14.425139664804469, + "grad_norm": 0.7304810881614685, + "learning_rate": 0.00027988795518207285, + "loss": 0.4343, + "step": 25821 + }, + { + "epoch": 14.425698324022346, + "grad_norm": 1.0314608812332153, + "learning_rate": 0.00027985994397759106, + "loss": 0.3913, + "step": 25822 + }, + { + "epoch": 14.426256983240224, + "grad_norm": 2.5969812870025635, + "learning_rate": 0.00027983193277310927, + "loss": 0.4345, + "step": 25823 + }, + { + "epoch": 14.4268156424581, + "grad_norm": 0.6922586560249329, + "learning_rate": 0.0002798039215686274, + "loss": 0.3476, + "step": 25824 + }, + { + "epoch": 14.427374301675977, + "grad_norm": 1.0971225500106812, + "learning_rate": 0.0002797759103641457, + "loss": 0.4155, + "step": 25825 + }, + { + "epoch": 14.427932960893855, + "grad_norm": 0.6877304911613464, + "learning_rate": 0.0002797478991596639, + "loss": 0.5204, + "step": 25826 + }, + { + "epoch": 14.428491620111732, + "grad_norm": 0.7078418731689453, + "learning_rate": 0.0002797198879551821, + "loss": 0.4386, + "step": 25827 + }, + { + "epoch": 14.429050279329608, + "grad_norm": 1.0341423749923706, + "learning_rate": 0.0002796918767507003, + "loss": 0.5362, + "step": 25828 + }, + { + "epoch": 14.429608938547487, + "grad_norm": 0.5058976411819458, + "learning_rate": 0.0002796638655462185, + "loss": 0.3344, + "step": 25829 + }, + { + "epoch": 14.430167597765363, + "grad_norm": 1.0340579748153687, + "learning_rate": 0.0002796358543417367, + "loss": 0.385, + "step": 25830 + }, + { + "epoch": 14.43072625698324, + "grad_norm": 0.5209304094314575, + "learning_rate": 0.0002796078431372549, + "loss": 0.474, + "step": 25831 + }, + { + "epoch": 14.431284916201117, + "grad_norm": 0.6722506284713745, + "learning_rate": 0.00027957983193277307, + "loss": 0.5624, + "step": 25832 + }, + { + "epoch": 14.431843575418995, + "grad_norm": 2.0200576782226562, + "learning_rate": 0.0002795518207282913, + "loss": 0.403, + "step": 25833 + }, + { + "epoch": 14.432402234636871, + "grad_norm": 0.4758749306201935, + "learning_rate": 0.00027952380952380953, + "loss": 0.4527, + "step": 25834 + }, + { + "epoch": 14.432960893854748, + "grad_norm": 1.201953411102295, + "learning_rate": 0.00027949579831932774, + "loss": 0.3817, + "step": 25835 + }, + { + "epoch": 14.433519553072626, + "grad_norm": 0.4644990861415863, + "learning_rate": 0.00027946778711484594, + "loss": 0.4767, + "step": 25836 + }, + { + "epoch": 14.434078212290503, + "grad_norm": 3.7290847301483154, + "learning_rate": 0.00027943977591036415, + "loss": 0.3681, + "step": 25837 + }, + { + "epoch": 14.43463687150838, + "grad_norm": 0.7345566749572754, + "learning_rate": 0.00027941176470588236, + "loss": 0.4311, + "step": 25838 + }, + { + "epoch": 14.435195530726258, + "grad_norm": 0.684956431388855, + "learning_rate": 0.00027938375350140056, + "loss": 0.3851, + "step": 25839 + }, + { + "epoch": 14.435754189944134, + "grad_norm": 1.3335058689117432, + "learning_rate": 0.00027935574229691877, + "loss": 0.4385, + "step": 25840 + }, + { + "epoch": 14.436312849162011, + "grad_norm": 0.5183255076408386, + "learning_rate": 0.000279327731092437, + "loss": 0.3662, + "step": 25841 + }, + { + "epoch": 14.436871508379888, + "grad_norm": 0.40338483452796936, + "learning_rate": 0.0002792997198879552, + "loss": 0.4074, + "step": 25842 + }, + { + "epoch": 14.437430167597766, + "grad_norm": 0.40220940113067627, + "learning_rate": 0.0002792717086834734, + "loss": 0.2954, + "step": 25843 + }, + { + "epoch": 14.437988826815642, + "grad_norm": 0.47030654549598694, + "learning_rate": 0.0002792436974789916, + "loss": 0.3435, + "step": 25844 + }, + { + "epoch": 14.438547486033519, + "grad_norm": 0.3183976411819458, + "learning_rate": 0.00027921568627450985, + "loss": 0.2667, + "step": 25845 + }, + { + "epoch": 14.439106145251397, + "grad_norm": 0.5989043116569519, + "learning_rate": 0.000279187675070028, + "loss": 0.3914, + "step": 25846 + }, + { + "epoch": 14.439664804469274, + "grad_norm": 0.3644499182701111, + "learning_rate": 0.0002791596638655462, + "loss": 0.4763, + "step": 25847 + }, + { + "epoch": 14.44022346368715, + "grad_norm": 0.5210332870483398, + "learning_rate": 0.0002791316526610644, + "loss": 0.3578, + "step": 25848 + }, + { + "epoch": 14.440782122905027, + "grad_norm": 0.5334910750389099, + "learning_rate": 0.0002791036414565826, + "loss": 0.5267, + "step": 25849 + }, + { + "epoch": 14.441340782122905, + "grad_norm": 0.6800428628921509, + "learning_rate": 0.0002790756302521009, + "loss": 0.4044, + "step": 25850 + }, + { + "epoch": 14.441899441340782, + "grad_norm": 0.4861067235469818, + "learning_rate": 0.00027904761904761903, + "loss": 0.4885, + "step": 25851 + }, + { + "epoch": 14.442458100558659, + "grad_norm": 0.7069694995880127, + "learning_rate": 0.00027901960784313724, + "loss": 0.3833, + "step": 25852 + }, + { + "epoch": 14.443016759776537, + "grad_norm": 0.5879365801811218, + "learning_rate": 0.0002789915966386555, + "loss": 0.42, + "step": 25853 + }, + { + "epoch": 14.443575418994413, + "grad_norm": 1.8649920225143433, + "learning_rate": 0.00027896358543417365, + "loss": 0.4743, + "step": 25854 + }, + { + "epoch": 14.44413407821229, + "grad_norm": 0.3928588032722473, + "learning_rate": 0.0002789355742296919, + "loss": 0.4167, + "step": 25855 + }, + { + "epoch": 14.444692737430168, + "grad_norm": 0.4184569716453552, + "learning_rate": 0.00027890756302521006, + "loss": 0.386, + "step": 25856 + }, + { + "epoch": 14.445251396648045, + "grad_norm": 0.37697815895080566, + "learning_rate": 0.00027887955182072827, + "loss": 0.3515, + "step": 25857 + }, + { + "epoch": 14.445810055865921, + "grad_norm": 0.4627286195755005, + "learning_rate": 0.00027885154061624653, + "loss": 0.3727, + "step": 25858 + }, + { + "epoch": 14.446368715083798, + "grad_norm": 0.4251154363155365, + "learning_rate": 0.0002788235294117647, + "loss": 0.3987, + "step": 25859 + }, + { + "epoch": 14.446927374301676, + "grad_norm": 0.44241055846214294, + "learning_rate": 0.00027879551820728294, + "loss": 0.4119, + "step": 25860 + }, + { + "epoch": 14.447486033519553, + "grad_norm": 0.5731030702590942, + "learning_rate": 0.00027876750700280115, + "loss": 0.4532, + "step": 25861 + }, + { + "epoch": 14.44804469273743, + "grad_norm": 0.31942126154899597, + "learning_rate": 0.0002787394957983193, + "loss": 0.2581, + "step": 25862 + }, + { + "epoch": 14.448603351955308, + "grad_norm": 0.5858067870140076, + "learning_rate": 0.00027871148459383756, + "loss": 0.3868, + "step": 25863 + }, + { + "epoch": 14.449162011173184, + "grad_norm": 0.5176591277122498, + "learning_rate": 0.0002786834733893557, + "loss": 0.6416, + "step": 25864 + }, + { + "epoch": 14.449720670391061, + "grad_norm": 0.5952520370483398, + "learning_rate": 0.00027865546218487397, + "loss": 0.3012, + "step": 25865 + }, + { + "epoch": 14.45027932960894, + "grad_norm": 0.4955611228942871, + "learning_rate": 0.0002786274509803922, + "loss": 0.3949, + "step": 25866 + }, + { + "epoch": 14.450837988826816, + "grad_norm": 0.45805174112319946, + "learning_rate": 0.00027859943977591033, + "loss": 0.6283, + "step": 25867 + }, + { + "epoch": 14.451396648044692, + "grad_norm": 2.1716721057891846, + "learning_rate": 0.0002785714285714286, + "loss": 0.4934, + "step": 25868 + }, + { + "epoch": 14.451955307262569, + "grad_norm": 0.35805487632751465, + "learning_rate": 0.0002785434173669468, + "loss": 0.2824, + "step": 25869 + }, + { + "epoch": 14.452513966480447, + "grad_norm": 0.5593825578689575, + "learning_rate": 0.000278515406162465, + "loss": 0.464, + "step": 25870 + }, + { + "epoch": 14.453072625698324, + "grad_norm": 0.46954384446144104, + "learning_rate": 0.0002784873949579832, + "loss": 0.4405, + "step": 25871 + }, + { + "epoch": 14.4536312849162, + "grad_norm": 2.8768081665039062, + "learning_rate": 0.00027845938375350136, + "loss": 0.4671, + "step": 25872 + }, + { + "epoch": 14.454189944134079, + "grad_norm": 0.7915114164352417, + "learning_rate": 0.0002784313725490196, + "loss": 0.4529, + "step": 25873 + }, + { + "epoch": 14.454748603351955, + "grad_norm": 0.6863232254981995, + "learning_rate": 0.0002784033613445378, + "loss": 0.408, + "step": 25874 + }, + { + "epoch": 14.455307262569832, + "grad_norm": 0.47793740034103394, + "learning_rate": 0.00027837535014005603, + "loss": 0.3627, + "step": 25875 + }, + { + "epoch": 14.45586592178771, + "grad_norm": 0.39980530738830566, + "learning_rate": 0.00027834733893557424, + "loss": 0.3887, + "step": 25876 + }, + { + "epoch": 14.456424581005587, + "grad_norm": 1.248163104057312, + "learning_rate": 0.00027831932773109244, + "loss": 0.4542, + "step": 25877 + }, + { + "epoch": 14.456983240223463, + "grad_norm": 0.34120243787765503, + "learning_rate": 0.00027829131652661065, + "loss": 0.4245, + "step": 25878 + }, + { + "epoch": 14.45754189944134, + "grad_norm": 0.388263463973999, + "learning_rate": 0.00027826330532212886, + "loss": 0.454, + "step": 25879 + }, + { + "epoch": 14.458100558659218, + "grad_norm": 0.42011892795562744, + "learning_rate": 0.00027823529411764706, + "loss": 0.3985, + "step": 25880 + }, + { + "epoch": 14.458659217877095, + "grad_norm": 0.7509288787841797, + "learning_rate": 0.00027820728291316527, + "loss": 0.404, + "step": 25881 + }, + { + "epoch": 14.459217877094972, + "grad_norm": 0.600434422492981, + "learning_rate": 0.0002781792717086835, + "loss": 0.427, + "step": 25882 + }, + { + "epoch": 14.45977653631285, + "grad_norm": 0.6243824362754822, + "learning_rate": 0.0002781512605042017, + "loss": 0.3565, + "step": 25883 + }, + { + "epoch": 14.460335195530726, + "grad_norm": 0.5877174735069275, + "learning_rate": 0.0002781232492997199, + "loss": 0.3793, + "step": 25884 + }, + { + "epoch": 14.460893854748603, + "grad_norm": 0.6933143138885498, + "learning_rate": 0.00027809523809523815, + "loss": 0.3625, + "step": 25885 + }, + { + "epoch": 14.461452513966481, + "grad_norm": 0.3809208869934082, + "learning_rate": 0.0002780672268907563, + "loss": 0.3219, + "step": 25886 + }, + { + "epoch": 14.462011173184358, + "grad_norm": 0.4341868460178375, + "learning_rate": 0.0002780392156862745, + "loss": 0.4553, + "step": 25887 + }, + { + "epoch": 14.462569832402234, + "grad_norm": 0.3498751223087311, + "learning_rate": 0.0002780112044817927, + "loss": 0.3918, + "step": 25888 + }, + { + "epoch": 14.463128491620111, + "grad_norm": 0.5370792746543884, + "learning_rate": 0.0002779831932773109, + "loss": 0.4683, + "step": 25889 + }, + { + "epoch": 14.46368715083799, + "grad_norm": 2.3911874294281006, + "learning_rate": 0.0002779551820728292, + "loss": 0.3675, + "step": 25890 + }, + { + "epoch": 14.464245810055866, + "grad_norm": 0.4159843623638153, + "learning_rate": 0.00027792717086834733, + "loss": 0.518, + "step": 25891 + }, + { + "epoch": 14.464804469273743, + "grad_norm": 0.42622682452201843, + "learning_rate": 0.00027789915966386553, + "loss": 0.3819, + "step": 25892 + }, + { + "epoch": 14.46536312849162, + "grad_norm": 0.48557355999946594, + "learning_rate": 0.0002778711484593838, + "loss": 0.3972, + "step": 25893 + }, + { + "epoch": 14.465921787709497, + "grad_norm": 0.6152389049530029, + "learning_rate": 0.00027784313725490195, + "loss": 0.3862, + "step": 25894 + }, + { + "epoch": 14.466480446927374, + "grad_norm": 0.44748613238334656, + "learning_rate": 0.0002778151260504202, + "loss": 0.4744, + "step": 25895 + }, + { + "epoch": 14.46703910614525, + "grad_norm": 0.6692421436309814, + "learning_rate": 0.00027778711484593836, + "loss": 0.4342, + "step": 25896 + }, + { + "epoch": 14.467597765363129, + "grad_norm": 4.49300479888916, + "learning_rate": 0.00027775910364145656, + "loss": 0.3707, + "step": 25897 + }, + { + "epoch": 14.468156424581005, + "grad_norm": 0.5097029805183411, + "learning_rate": 0.0002777310924369748, + "loss": 0.5343, + "step": 25898 + }, + { + "epoch": 14.468715083798882, + "grad_norm": 0.55149245262146, + "learning_rate": 0.000277703081232493, + "loss": 0.4433, + "step": 25899 + }, + { + "epoch": 14.46927374301676, + "grad_norm": 0.44171226024627686, + "learning_rate": 0.00027767507002801124, + "loss": 0.4507, + "step": 25900 + }, + { + "epoch": 14.469832402234637, + "grad_norm": 0.4970145523548126, + "learning_rate": 0.00027764705882352944, + "loss": 0.4557, + "step": 25901 + }, + { + "epoch": 14.470391061452514, + "grad_norm": 0.5052556991577148, + "learning_rate": 0.0002776190476190476, + "loss": 0.4334, + "step": 25902 + }, + { + "epoch": 14.470949720670392, + "grad_norm": 1.618317723274231, + "learning_rate": 0.00027759103641456585, + "loss": 0.4407, + "step": 25903 + }, + { + "epoch": 14.471508379888268, + "grad_norm": 0.34399062395095825, + "learning_rate": 0.000277563025210084, + "loss": 0.2987, + "step": 25904 + }, + { + "epoch": 14.472067039106145, + "grad_norm": 0.36642301082611084, + "learning_rate": 0.00027753501400560227, + "loss": 0.3539, + "step": 25905 + }, + { + "epoch": 14.472625698324022, + "grad_norm": 0.8578351736068726, + "learning_rate": 0.00027750700280112047, + "loss": 0.3539, + "step": 25906 + }, + { + "epoch": 14.4731843575419, + "grad_norm": 0.413003146648407, + "learning_rate": 0.0002774789915966386, + "loss": 0.4346, + "step": 25907 + }, + { + "epoch": 14.473743016759776, + "grad_norm": 0.6205025911331177, + "learning_rate": 0.0002774509803921569, + "loss": 0.4073, + "step": 25908 + }, + { + "epoch": 14.474301675977653, + "grad_norm": 0.41772159934043884, + "learning_rate": 0.0002774229691876751, + "loss": 0.3453, + "step": 25909 + }, + { + "epoch": 14.474860335195531, + "grad_norm": 1.549654483795166, + "learning_rate": 0.0002773949579831933, + "loss": 0.388, + "step": 25910 + }, + { + "epoch": 14.475418994413408, + "grad_norm": 0.5830680727958679, + "learning_rate": 0.0002773669467787115, + "loss": 0.4695, + "step": 25911 + }, + { + "epoch": 14.475977653631285, + "grad_norm": 0.4966842830181122, + "learning_rate": 0.00027733893557422965, + "loss": 0.5402, + "step": 25912 + }, + { + "epoch": 14.476536312849163, + "grad_norm": 0.4180568754673004, + "learning_rate": 0.0002773109243697479, + "loss": 0.4398, + "step": 25913 + }, + { + "epoch": 14.47709497206704, + "grad_norm": 3.233367919921875, + "learning_rate": 0.0002772829131652661, + "loss": 0.357, + "step": 25914 + }, + { + "epoch": 14.477653631284916, + "grad_norm": 0.40725085139274597, + "learning_rate": 0.0002772549019607843, + "loss": 0.3687, + "step": 25915 + }, + { + "epoch": 14.478212290502793, + "grad_norm": 0.5402584075927734, + "learning_rate": 0.00027722689075630253, + "loss": 0.3921, + "step": 25916 + }, + { + "epoch": 14.478770949720671, + "grad_norm": 1.2117303609848022, + "learning_rate": 0.00027719887955182074, + "loss": 0.4623, + "step": 25917 + }, + { + "epoch": 14.479329608938547, + "grad_norm": 0.4490962326526642, + "learning_rate": 0.00027717086834733894, + "loss": 0.4384, + "step": 25918 + }, + { + "epoch": 14.479888268156424, + "grad_norm": 0.43865489959716797, + "learning_rate": 0.00027714285714285715, + "loss": 0.3899, + "step": 25919 + }, + { + "epoch": 14.480446927374302, + "grad_norm": 0.44409558176994324, + "learning_rate": 0.00027711484593837536, + "loss": 0.4186, + "step": 25920 + }, + { + "epoch": 14.481005586592179, + "grad_norm": 0.5111070275306702, + "learning_rate": 0.00027708683473389356, + "loss": 0.6634, + "step": 25921 + }, + { + "epoch": 14.481564245810056, + "grad_norm": 0.4883441925048828, + "learning_rate": 0.00027705882352941177, + "loss": 0.4135, + "step": 25922 + }, + { + "epoch": 14.482122905027932, + "grad_norm": 0.5916086435317993, + "learning_rate": 0.00027703081232493, + "loss": 0.3109, + "step": 25923 + }, + { + "epoch": 14.48268156424581, + "grad_norm": 0.4902220666408539, + "learning_rate": 0.0002770028011204482, + "loss": 0.3081, + "step": 25924 + }, + { + "epoch": 14.483240223463687, + "grad_norm": 0.3402598202228546, + "learning_rate": 0.00027697478991596644, + "loss": 0.3663, + "step": 25925 + }, + { + "epoch": 14.483798882681564, + "grad_norm": 0.46187663078308105, + "learning_rate": 0.0002769467787114846, + "loss": 0.4908, + "step": 25926 + }, + { + "epoch": 14.484357541899442, + "grad_norm": 1.052712321281433, + "learning_rate": 0.0002769187675070028, + "loss": 0.5474, + "step": 25927 + }, + { + "epoch": 14.484916201117318, + "grad_norm": 0.6220926642417908, + "learning_rate": 0.000276890756302521, + "loss": 0.6341, + "step": 25928 + }, + { + "epoch": 14.485474860335195, + "grad_norm": 1.2511452436447144, + "learning_rate": 0.0002768627450980392, + "loss": 0.4107, + "step": 25929 + }, + { + "epoch": 14.486033519553073, + "grad_norm": 0.5480039715766907, + "learning_rate": 0.00027683473389355747, + "loss": 0.3444, + "step": 25930 + }, + { + "epoch": 14.48659217877095, + "grad_norm": 0.8300253748893738, + "learning_rate": 0.0002768067226890756, + "loss": 0.3393, + "step": 25931 + }, + { + "epoch": 14.487150837988827, + "grad_norm": 0.577490508556366, + "learning_rate": 0.00027677871148459383, + "loss": 0.5192, + "step": 25932 + }, + { + "epoch": 14.487709497206703, + "grad_norm": 0.6029434204101562, + "learning_rate": 0.0002767507002801121, + "loss": 0.5011, + "step": 25933 + }, + { + "epoch": 14.488268156424581, + "grad_norm": 0.44405749440193176, + "learning_rate": 0.00027672268907563024, + "loss": 0.4414, + "step": 25934 + }, + { + "epoch": 14.488826815642458, + "grad_norm": 0.5729031562805176, + "learning_rate": 0.0002766946778711485, + "loss": 0.3957, + "step": 25935 + }, + { + "epoch": 14.489385474860335, + "grad_norm": 1.158888339996338, + "learning_rate": 0.00027666666666666665, + "loss": 0.4868, + "step": 25936 + }, + { + "epoch": 14.489944134078213, + "grad_norm": 0.42603081464767456, + "learning_rate": 0.00027663865546218486, + "loss": 0.4826, + "step": 25937 + }, + { + "epoch": 14.49050279329609, + "grad_norm": 1.5587899684906006, + "learning_rate": 0.0002766106442577031, + "loss": 0.3082, + "step": 25938 + }, + { + "epoch": 14.491061452513966, + "grad_norm": 0.7136504650115967, + "learning_rate": 0.00027658263305322127, + "loss": 0.3839, + "step": 25939 + }, + { + "epoch": 14.491620111731844, + "grad_norm": 0.4167581796646118, + "learning_rate": 0.0002765546218487395, + "loss": 0.3553, + "step": 25940 + }, + { + "epoch": 14.492178770949721, + "grad_norm": 1.5550014972686768, + "learning_rate": 0.00027652661064425774, + "loss": 0.416, + "step": 25941 + }, + { + "epoch": 14.492737430167598, + "grad_norm": 0.4727814793586731, + "learning_rate": 0.0002764985994397759, + "loss": 0.4125, + "step": 25942 + }, + { + "epoch": 14.493296089385474, + "grad_norm": 0.6300815939903259, + "learning_rate": 0.00027647058823529415, + "loss": 0.4168, + "step": 25943 + }, + { + "epoch": 14.493854748603352, + "grad_norm": 0.38475531339645386, + "learning_rate": 0.0002764425770308123, + "loss": 0.4369, + "step": 25944 + }, + { + "epoch": 14.494413407821229, + "grad_norm": 0.5899533033370972, + "learning_rate": 0.0002764145658263305, + "loss": 0.3714, + "step": 25945 + }, + { + "epoch": 14.494972067039106, + "grad_norm": 1.1216338872909546, + "learning_rate": 0.00027638655462184877, + "loss": 0.4638, + "step": 25946 + }, + { + "epoch": 14.495530726256984, + "grad_norm": 0.4806707799434662, + "learning_rate": 0.0002763585434173669, + "loss": 0.4203, + "step": 25947 + }, + { + "epoch": 14.49608938547486, + "grad_norm": 0.4335712194442749, + "learning_rate": 0.0002763305322128852, + "loss": 0.4131, + "step": 25948 + }, + { + "epoch": 14.496648044692737, + "grad_norm": 0.6214283108711243, + "learning_rate": 0.0002763025210084034, + "loss": 0.3542, + "step": 25949 + }, + { + "epoch": 14.497206703910614, + "grad_norm": 0.4234263598918915, + "learning_rate": 0.00027627450980392154, + "loss": 0.4529, + "step": 25950 + }, + { + "epoch": 14.497765363128492, + "grad_norm": 0.5320940613746643, + "learning_rate": 0.0002762464985994398, + "loss": 0.4522, + "step": 25951 + }, + { + "epoch": 14.498324022346369, + "grad_norm": 0.44390252232551575, + "learning_rate": 0.00027621848739495795, + "loss": 0.4595, + "step": 25952 + }, + { + "epoch": 14.498882681564245, + "grad_norm": 0.540688157081604, + "learning_rate": 0.0002761904761904762, + "loss": 0.3776, + "step": 25953 + }, + { + "epoch": 14.499441340782123, + "grad_norm": 0.6436562538146973, + "learning_rate": 0.0002761624649859944, + "loss": 0.6551, + "step": 25954 + }, + { + "epoch": 14.5, + "grad_norm": 1.3538720607757568, + "learning_rate": 0.00027613445378151257, + "loss": 0.3443, + "step": 25955 + }, + { + "epoch": 14.500558659217877, + "grad_norm": 0.9569316506385803, + "learning_rate": 0.0002761064425770308, + "loss": 0.4618, + "step": 25956 + }, + { + "epoch": 14.501117318435755, + "grad_norm": 0.5849122405052185, + "learning_rate": 0.00027607843137254903, + "loss": 0.3982, + "step": 25957 + }, + { + "epoch": 14.501675977653631, + "grad_norm": 1.7833435535430908, + "learning_rate": 0.00027605042016806724, + "loss": 0.3608, + "step": 25958 + }, + { + "epoch": 14.502234636871508, + "grad_norm": 0.9267893433570862, + "learning_rate": 0.00027602240896358544, + "loss": 0.4844, + "step": 25959 + }, + { + "epoch": 14.502793296089386, + "grad_norm": 0.586439847946167, + "learning_rate": 0.0002759943977591036, + "loss": 0.6485, + "step": 25960 + }, + { + "epoch": 14.503351955307263, + "grad_norm": 0.472647100687027, + "learning_rate": 0.00027596638655462186, + "loss": 0.4161, + "step": 25961 + }, + { + "epoch": 14.50391061452514, + "grad_norm": 0.46137022972106934, + "learning_rate": 0.00027593837535014006, + "loss": 0.3481, + "step": 25962 + }, + { + "epoch": 14.504469273743016, + "grad_norm": 0.4774845540523529, + "learning_rate": 0.00027591036414565827, + "loss": 0.374, + "step": 25963 + }, + { + "epoch": 14.505027932960894, + "grad_norm": 0.7174018025398254, + "learning_rate": 0.0002758823529411765, + "loss": 0.3755, + "step": 25964 + }, + { + "epoch": 14.505586592178771, + "grad_norm": 0.39748233556747437, + "learning_rate": 0.0002758543417366947, + "loss": 0.465, + "step": 25965 + }, + { + "epoch": 14.506145251396648, + "grad_norm": 0.5003325939178467, + "learning_rate": 0.0002758263305322129, + "loss": 0.534, + "step": 25966 + }, + { + "epoch": 14.506703910614526, + "grad_norm": 0.7247323989868164, + "learning_rate": 0.0002757983193277311, + "loss": 0.3158, + "step": 25967 + }, + { + "epoch": 14.507262569832402, + "grad_norm": 0.40612685680389404, + "learning_rate": 0.00027577030812324935, + "loss": 0.4028, + "step": 25968 + }, + { + "epoch": 14.507821229050279, + "grad_norm": 0.6557098627090454, + "learning_rate": 0.0002757422969187675, + "loss": 0.4125, + "step": 25969 + }, + { + "epoch": 14.508379888268156, + "grad_norm": 0.7481595277786255, + "learning_rate": 0.0002757142857142857, + "loss": 0.2999, + "step": 25970 + }, + { + "epoch": 14.508938547486034, + "grad_norm": 0.5338840484619141, + "learning_rate": 0.0002756862745098039, + "loss": 0.4391, + "step": 25971 + }, + { + "epoch": 14.50949720670391, + "grad_norm": 0.4046436548233032, + "learning_rate": 0.0002756582633053221, + "loss": 0.4779, + "step": 25972 + }, + { + "epoch": 14.510055865921787, + "grad_norm": 0.49413642287254333, + "learning_rate": 0.0002756302521008404, + "loss": 0.4386, + "step": 25973 + }, + { + "epoch": 14.510614525139665, + "grad_norm": 1.3335566520690918, + "learning_rate": 0.00027560224089635853, + "loss": 0.3907, + "step": 25974 + }, + { + "epoch": 14.511173184357542, + "grad_norm": 0.8605470657348633, + "learning_rate": 0.00027557422969187674, + "loss": 0.4689, + "step": 25975 + }, + { + "epoch": 14.511731843575419, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.000275546218487395, + "loss": 0.4374, + "step": 25976 + }, + { + "epoch": 14.512290502793297, + "grad_norm": 0.4011082351207733, + "learning_rate": 0.00027551820728291315, + "loss": 0.4145, + "step": 25977 + }, + { + "epoch": 14.512849162011173, + "grad_norm": 1.0278226137161255, + "learning_rate": 0.0002754901960784314, + "loss": 0.3024, + "step": 25978 + }, + { + "epoch": 14.51340782122905, + "grad_norm": 0.36048823595046997, + "learning_rate": 0.00027546218487394956, + "loss": 0.4077, + "step": 25979 + }, + { + "epoch": 14.513966480446927, + "grad_norm": 0.3824455440044403, + "learning_rate": 0.00027543417366946777, + "loss": 0.4161, + "step": 25980 + }, + { + "epoch": 14.514525139664805, + "grad_norm": 1.2201869487762451, + "learning_rate": 0.00027540616246498603, + "loss": 0.3678, + "step": 25981 + }, + { + "epoch": 14.515083798882682, + "grad_norm": 0.6133846044540405, + "learning_rate": 0.0002753781512605042, + "loss": 0.5822, + "step": 25982 + }, + { + "epoch": 14.515642458100558, + "grad_norm": 3.5307374000549316, + "learning_rate": 0.00027535014005602244, + "loss": 0.4668, + "step": 25983 + }, + { + "epoch": 14.516201117318436, + "grad_norm": 0.4548972249031067, + "learning_rate": 0.00027532212885154065, + "loss": 0.4188, + "step": 25984 + }, + { + "epoch": 14.516759776536313, + "grad_norm": 0.5998306274414062, + "learning_rate": 0.0002752941176470588, + "loss": 0.573, + "step": 25985 + }, + { + "epoch": 14.51731843575419, + "grad_norm": 0.5475751757621765, + "learning_rate": 0.00027526610644257706, + "loss": 0.3595, + "step": 25986 + }, + { + "epoch": 14.517877094972068, + "grad_norm": 0.3263009190559387, + "learning_rate": 0.0002752380952380952, + "loss": 0.3734, + "step": 25987 + }, + { + "epoch": 14.518435754189944, + "grad_norm": 0.6386430859565735, + "learning_rate": 0.00027521008403361347, + "loss": 0.4613, + "step": 25988 + }, + { + "epoch": 14.518994413407821, + "grad_norm": 0.5398362874984741, + "learning_rate": 0.0002751820728291317, + "loss": 0.5471, + "step": 25989 + }, + { + "epoch": 14.519553072625698, + "grad_norm": 0.3478747010231018, + "learning_rate": 0.00027515406162464983, + "loss": 0.3831, + "step": 25990 + }, + { + "epoch": 14.520111731843576, + "grad_norm": 0.48594221472740173, + "learning_rate": 0.0002751260504201681, + "loss": 0.3831, + "step": 25991 + }, + { + "epoch": 14.520670391061453, + "grad_norm": 0.31570881605148315, + "learning_rate": 0.0002750980392156863, + "loss": 0.3458, + "step": 25992 + }, + { + "epoch": 14.521229050279329, + "grad_norm": 0.4659821391105652, + "learning_rate": 0.0002750700280112045, + "loss": 0.421, + "step": 25993 + }, + { + "epoch": 14.521787709497207, + "grad_norm": 0.5973386168479919, + "learning_rate": 0.0002750420168067227, + "loss": 0.4089, + "step": 25994 + }, + { + "epoch": 14.522346368715084, + "grad_norm": 0.5446675419807434, + "learning_rate": 0.00027501400560224086, + "loss": 0.4108, + "step": 25995 + }, + { + "epoch": 14.52290502793296, + "grad_norm": 2.5869674682617188, + "learning_rate": 0.0002749859943977591, + "loss": 0.4668, + "step": 25996 + }, + { + "epoch": 14.523463687150837, + "grad_norm": 0.5437316298484802, + "learning_rate": 0.0002749579831932773, + "loss": 0.3679, + "step": 25997 + }, + { + "epoch": 14.524022346368715, + "grad_norm": 0.4237959682941437, + "learning_rate": 0.00027492997198879553, + "loss": 0.4254, + "step": 25998 + }, + { + "epoch": 14.524581005586592, + "grad_norm": 0.47599485516548157, + "learning_rate": 0.00027490196078431374, + "loss": 0.3897, + "step": 25999 + }, + { + "epoch": 14.525139664804469, + "grad_norm": 0.4000786244869232, + "learning_rate": 0.00027487394957983194, + "loss": 0.4103, + "step": 26000 + }, + { + "epoch": 14.525139664804469, + "eval_cer": 0.08566549921988369, + "eval_loss": 0.32592344284057617, + "eval_runtime": 55.5069, + "eval_samples_per_second": 81.756, + "eval_steps_per_second": 5.116, + "eval_wer": 0.33959945252925894, + "step": 26000 + }, + { + "epoch": 14.525698324022347, + "grad_norm": 1.9771556854248047, + "learning_rate": 0.00027484593837535015, + "loss": 0.3924, + "step": 26001 + }, + { + "epoch": 14.526256983240224, + "grad_norm": 0.522955596446991, + "learning_rate": 0.00027481792717086836, + "loss": 0.4107, + "step": 26002 + }, + { + "epoch": 14.5268156424581, + "grad_norm": 0.7025735974311829, + "learning_rate": 0.00027478991596638656, + "loss": 0.3578, + "step": 26003 + }, + { + "epoch": 14.527374301675978, + "grad_norm": 0.43271005153656006, + "learning_rate": 0.00027476190476190477, + "loss": 0.4055, + "step": 26004 + }, + { + "epoch": 14.527932960893855, + "grad_norm": 0.5428447127342224, + "learning_rate": 0.000274733893557423, + "loss": 0.498, + "step": 26005 + }, + { + "epoch": 14.528491620111732, + "grad_norm": 0.5781406760215759, + "learning_rate": 0.0002747058823529412, + "loss": 0.3292, + "step": 26006 + }, + { + "epoch": 14.529050279329608, + "grad_norm": 2.1886701583862305, + "learning_rate": 0.0002746778711484594, + "loss": 0.4734, + "step": 26007 + }, + { + "epoch": 14.529608938547486, + "grad_norm": 0.742739200592041, + "learning_rate": 0.00027464985994397765, + "loss": 0.3742, + "step": 26008 + }, + { + "epoch": 14.530167597765363, + "grad_norm": 1.6004784107208252, + "learning_rate": 0.0002746218487394958, + "loss": 0.4654, + "step": 26009 + }, + { + "epoch": 14.53072625698324, + "grad_norm": 0.40614578127861023, + "learning_rate": 0.000274593837535014, + "loss": 0.4035, + "step": 26010 + }, + { + "epoch": 14.531284916201118, + "grad_norm": 0.6169610023498535, + "learning_rate": 0.0002745658263305322, + "loss": 0.4775, + "step": 26011 + }, + { + "epoch": 14.531843575418995, + "grad_norm": 0.6450771689414978, + "learning_rate": 0.0002745378151260504, + "loss": 0.4256, + "step": 26012 + }, + { + "epoch": 14.532402234636871, + "grad_norm": 0.42523083090782166, + "learning_rate": 0.0002745098039215687, + "loss": 0.3226, + "step": 26013 + }, + { + "epoch": 14.53296089385475, + "grad_norm": 0.6148983240127563, + "learning_rate": 0.00027448179271708683, + "loss": 0.4975, + "step": 26014 + }, + { + "epoch": 14.533519553072626, + "grad_norm": 0.5217316150665283, + "learning_rate": 0.00027445378151260503, + "loss": 0.4175, + "step": 26015 + }, + { + "epoch": 14.534078212290503, + "grad_norm": 0.7151603698730469, + "learning_rate": 0.0002744257703081233, + "loss": 0.4726, + "step": 26016 + }, + { + "epoch": 14.53463687150838, + "grad_norm": 0.4330414831638336, + "learning_rate": 0.00027439775910364145, + "loss": 0.4007, + "step": 26017 + }, + { + "epoch": 14.535195530726257, + "grad_norm": 0.38777121901512146, + "learning_rate": 0.0002743697478991597, + "loss": 0.4533, + "step": 26018 + }, + { + "epoch": 14.535754189944134, + "grad_norm": 3.000777244567871, + "learning_rate": 0.00027434173669467786, + "loss": 0.4623, + "step": 26019 + }, + { + "epoch": 14.53631284916201, + "grad_norm": 1.1478471755981445, + "learning_rate": 0.00027431372549019606, + "loss": 0.4353, + "step": 26020 + }, + { + "epoch": 14.536871508379889, + "grad_norm": 1.5430406332015991, + "learning_rate": 0.0002742857142857143, + "loss": 0.3321, + "step": 26021 + }, + { + "epoch": 14.537430167597766, + "grad_norm": 0.4634488523006439, + "learning_rate": 0.0002742577030812325, + "loss": 0.405, + "step": 26022 + }, + { + "epoch": 14.537988826815642, + "grad_norm": 0.5155308246612549, + "learning_rate": 0.00027422969187675074, + "loss": 0.5002, + "step": 26023 + }, + { + "epoch": 14.538547486033519, + "grad_norm": 0.4073270559310913, + "learning_rate": 0.00027420168067226894, + "loss": 0.3, + "step": 26024 + }, + { + "epoch": 14.539106145251397, + "grad_norm": 0.6153872013092041, + "learning_rate": 0.0002741736694677871, + "loss": 0.3977, + "step": 26025 + }, + { + "epoch": 14.539664804469274, + "grad_norm": 0.6380208730697632, + "learning_rate": 0.00027414565826330535, + "loss": 0.5049, + "step": 26026 + }, + { + "epoch": 14.54022346368715, + "grad_norm": 0.41897475719451904, + "learning_rate": 0.0002741176470588235, + "loss": 0.3928, + "step": 26027 + }, + { + "epoch": 14.540782122905028, + "grad_norm": 1.5104429721832275, + "learning_rate": 0.00027408963585434177, + "loss": 0.3214, + "step": 26028 + }, + { + "epoch": 14.541340782122905, + "grad_norm": 1.6623996496200562, + "learning_rate": 0.00027406162464985997, + "loss": 0.4312, + "step": 26029 + }, + { + "epoch": 14.541899441340782, + "grad_norm": 0.3700019419193268, + "learning_rate": 0.0002740336134453781, + "loss": 0.3656, + "step": 26030 + }, + { + "epoch": 14.54245810055866, + "grad_norm": 0.4379046857357025, + "learning_rate": 0.0002740056022408964, + "loss": 0.3629, + "step": 26031 + }, + { + "epoch": 14.543016759776537, + "grad_norm": 0.8033010363578796, + "learning_rate": 0.0002739775910364146, + "loss": 0.4527, + "step": 26032 + }, + { + "epoch": 14.543575418994413, + "grad_norm": 0.7021380662918091, + "learning_rate": 0.0002739495798319328, + "loss": 0.3193, + "step": 26033 + }, + { + "epoch": 14.544134078212291, + "grad_norm": 0.8823875784873962, + "learning_rate": 0.000273921568627451, + "loss": 0.3757, + "step": 26034 + }, + { + "epoch": 14.544692737430168, + "grad_norm": 0.7063277959823608, + "learning_rate": 0.00027389355742296915, + "loss": 0.7309, + "step": 26035 + }, + { + "epoch": 14.545251396648045, + "grad_norm": 0.49394771456718445, + "learning_rate": 0.0002738655462184874, + "loss": 0.5467, + "step": 26036 + }, + { + "epoch": 14.545810055865921, + "grad_norm": 0.40190622210502625, + "learning_rate": 0.0002738375350140056, + "loss": 0.3621, + "step": 26037 + }, + { + "epoch": 14.5463687150838, + "grad_norm": 0.7106718420982361, + "learning_rate": 0.0002738095238095238, + "loss": 0.4182, + "step": 26038 + }, + { + "epoch": 14.546927374301676, + "grad_norm": 0.5196418166160583, + "learning_rate": 0.00027378151260504203, + "loss": 0.4571, + "step": 26039 + }, + { + "epoch": 14.547486033519553, + "grad_norm": 0.3982641100883484, + "learning_rate": 0.00027375350140056024, + "loss": 0.3458, + "step": 26040 + }, + { + "epoch": 14.548044692737431, + "grad_norm": 0.5754070281982422, + "learning_rate": 0.00027372549019607844, + "loss": 0.4574, + "step": 26041 + }, + { + "epoch": 14.548603351955308, + "grad_norm": 0.5434890389442444, + "learning_rate": 0.00027369747899159665, + "loss": 0.3786, + "step": 26042 + }, + { + "epoch": 14.549162011173184, + "grad_norm": 0.4313388764858246, + "learning_rate": 0.00027366946778711486, + "loss": 0.4345, + "step": 26043 + }, + { + "epoch": 14.54972067039106, + "grad_norm": 0.48550620675086975, + "learning_rate": 0.00027364145658263306, + "loss": 0.503, + "step": 26044 + }, + { + "epoch": 14.550279329608939, + "grad_norm": 0.40159523487091064, + "learning_rate": 0.00027361344537815127, + "loss": 0.4624, + "step": 26045 + }, + { + "epoch": 14.550837988826816, + "grad_norm": 0.37571531534194946, + "learning_rate": 0.0002735854341736695, + "loss": 0.4472, + "step": 26046 + }, + { + "epoch": 14.551396648044692, + "grad_norm": 0.5323110222816467, + "learning_rate": 0.0002735574229691877, + "loss": 0.3221, + "step": 26047 + }, + { + "epoch": 14.55195530726257, + "grad_norm": 0.7403799891471863, + "learning_rate": 0.00027352941176470594, + "loss": 0.4074, + "step": 26048 + }, + { + "epoch": 14.552513966480447, + "grad_norm": 0.32933682203292847, + "learning_rate": 0.0002735014005602241, + "loss": 0.3303, + "step": 26049 + }, + { + "epoch": 14.553072625698324, + "grad_norm": 0.38775405287742615, + "learning_rate": 0.0002734733893557423, + "loss": 0.4085, + "step": 26050 + }, + { + "epoch": 14.553631284916202, + "grad_norm": 0.83772212266922, + "learning_rate": 0.0002734453781512605, + "loss": 0.3432, + "step": 26051 + }, + { + "epoch": 14.554189944134079, + "grad_norm": 0.8909516334533691, + "learning_rate": 0.0002734173669467787, + "loss": 0.5251, + "step": 26052 + }, + { + "epoch": 14.554748603351955, + "grad_norm": 0.4273841977119446, + "learning_rate": 0.0002733893557422969, + "loss": 0.4467, + "step": 26053 + }, + { + "epoch": 14.555307262569832, + "grad_norm": 0.46706312894821167, + "learning_rate": 0.0002733613445378151, + "loss": 0.3305, + "step": 26054 + }, + { + "epoch": 14.55586592178771, + "grad_norm": 1.3802083730697632, + "learning_rate": 0.00027333333333333333, + "loss": 0.5234, + "step": 26055 + }, + { + "epoch": 14.556424581005587, + "grad_norm": 0.5829143524169922, + "learning_rate": 0.0002733053221288516, + "loss": 0.5211, + "step": 26056 + }, + { + "epoch": 14.556983240223463, + "grad_norm": 0.3526742160320282, + "learning_rate": 0.00027327731092436974, + "loss": 0.411, + "step": 26057 + }, + { + "epoch": 14.557541899441341, + "grad_norm": 0.6135927438735962, + "learning_rate": 0.00027324929971988795, + "loss": 0.4243, + "step": 26058 + }, + { + "epoch": 14.558100558659218, + "grad_norm": 0.4651064872741699, + "learning_rate": 0.00027322128851540615, + "loss": 0.492, + "step": 26059 + }, + { + "epoch": 14.558659217877095, + "grad_norm": 0.39348316192626953, + "learning_rate": 0.00027319327731092436, + "loss": 0.4161, + "step": 26060 + }, + { + "epoch": 14.559217877094973, + "grad_norm": 0.40608805418014526, + "learning_rate": 0.0002731652661064426, + "loss": 0.3723, + "step": 26061 + }, + { + "epoch": 14.55977653631285, + "grad_norm": 0.8289069533348083, + "learning_rate": 0.00027313725490196077, + "loss": 0.4473, + "step": 26062 + }, + { + "epoch": 14.560335195530726, + "grad_norm": 0.5005795359611511, + "learning_rate": 0.000273109243697479, + "loss": 0.4202, + "step": 26063 + }, + { + "epoch": 14.560893854748603, + "grad_norm": 0.4409407377243042, + "learning_rate": 0.00027308123249299724, + "loss": 0.405, + "step": 26064 + }, + { + "epoch": 14.561452513966481, + "grad_norm": 0.6019088625907898, + "learning_rate": 0.0002730532212885154, + "loss": 0.4844, + "step": 26065 + }, + { + "epoch": 14.562011173184358, + "grad_norm": 0.537895679473877, + "learning_rate": 0.00027302521008403365, + "loss": 0.357, + "step": 26066 + }, + { + "epoch": 14.562569832402234, + "grad_norm": 0.34745514392852783, + "learning_rate": 0.0002729971988795518, + "loss": 0.3399, + "step": 26067 + }, + { + "epoch": 14.563128491620112, + "grad_norm": 0.9350487589836121, + "learning_rate": 0.00027296918767507, + "loss": 0.3352, + "step": 26068 + }, + { + "epoch": 14.563687150837989, + "grad_norm": 0.6089915037155151, + "learning_rate": 0.00027294117647058827, + "loss": 0.4378, + "step": 26069 + }, + { + "epoch": 14.564245810055866, + "grad_norm": 0.47654271125793457, + "learning_rate": 0.0002729131652661064, + "loss": 0.3825, + "step": 26070 + }, + { + "epoch": 14.564804469273742, + "grad_norm": 0.52055424451828, + "learning_rate": 0.0002728851540616247, + "loss": 0.4719, + "step": 26071 + }, + { + "epoch": 14.56536312849162, + "grad_norm": 1.17324697971344, + "learning_rate": 0.0002728571428571429, + "loss": 0.5105, + "step": 26072 + }, + { + "epoch": 14.565921787709497, + "grad_norm": 0.3932095766067505, + "learning_rate": 0.00027282913165266104, + "loss": 0.445, + "step": 26073 + }, + { + "epoch": 14.566480446927374, + "grad_norm": 0.6846091747283936, + "learning_rate": 0.0002728011204481793, + "loss": 0.419, + "step": 26074 + }, + { + "epoch": 14.567039106145252, + "grad_norm": 0.37739360332489014, + "learning_rate": 0.00027277310924369745, + "loss": 0.4032, + "step": 26075 + }, + { + "epoch": 14.567597765363129, + "grad_norm": 0.36128780245780945, + "learning_rate": 0.0002727450980392157, + "loss": 0.327, + "step": 26076 + }, + { + "epoch": 14.568156424581005, + "grad_norm": 0.5941394567489624, + "learning_rate": 0.0002727170868347339, + "loss": 0.408, + "step": 26077 + }, + { + "epoch": 14.568715083798883, + "grad_norm": 2.184309720993042, + "learning_rate": 0.00027268907563025207, + "loss": 0.4819, + "step": 26078 + }, + { + "epoch": 14.56927374301676, + "grad_norm": 0.4476054012775421, + "learning_rate": 0.0002726610644257703, + "loss": 0.4246, + "step": 26079 + }, + { + "epoch": 14.569832402234637, + "grad_norm": 0.5083453059196472, + "learning_rate": 0.00027263305322128853, + "loss": 0.3647, + "step": 26080 + }, + { + "epoch": 14.570391061452513, + "grad_norm": 0.3089964985847473, + "learning_rate": 0.00027260504201680674, + "loss": 0.3666, + "step": 26081 + }, + { + "epoch": 14.570949720670392, + "grad_norm": 1.3355246782302856, + "learning_rate": 0.00027257703081232494, + "loss": 0.4315, + "step": 26082 + }, + { + "epoch": 14.571508379888268, + "grad_norm": 0.49300894141197205, + "learning_rate": 0.0002725490196078431, + "loss": 0.2956, + "step": 26083 + }, + { + "epoch": 14.572067039106145, + "grad_norm": 0.3968302011489868, + "learning_rate": 0.00027252100840336136, + "loss": 0.2935, + "step": 26084 + }, + { + "epoch": 14.572625698324023, + "grad_norm": 0.4763304591178894, + "learning_rate": 0.00027249299719887956, + "loss": 0.622, + "step": 26085 + }, + { + "epoch": 14.5731843575419, + "grad_norm": 0.3786044418811798, + "learning_rate": 0.00027246498599439777, + "loss": 0.4332, + "step": 26086 + }, + { + "epoch": 14.573743016759776, + "grad_norm": 0.46189144253730774, + "learning_rate": 0.000272436974789916, + "loss": 0.6368, + "step": 26087 + }, + { + "epoch": 14.574301675977654, + "grad_norm": 0.7109042406082153, + "learning_rate": 0.0002724089635854342, + "loss": 0.4257, + "step": 26088 + }, + { + "epoch": 14.574860335195531, + "grad_norm": 0.3672035336494446, + "learning_rate": 0.0002723809523809524, + "loss": 0.3468, + "step": 26089 + }, + { + "epoch": 14.575418994413408, + "grad_norm": 0.3219582438468933, + "learning_rate": 0.0002723529411764706, + "loss": 0.4181, + "step": 26090 + }, + { + "epoch": 14.575977653631284, + "grad_norm": 0.49108415842056274, + "learning_rate": 0.0002723249299719888, + "loss": 0.5212, + "step": 26091 + }, + { + "epoch": 14.576536312849163, + "grad_norm": 0.6956894397735596, + "learning_rate": 0.000272296918767507, + "loss": 0.3895, + "step": 26092 + }, + { + "epoch": 14.577094972067039, + "grad_norm": 0.5838638544082642, + "learning_rate": 0.0002722689075630252, + "loss": 0.6173, + "step": 26093 + }, + { + "epoch": 14.577653631284916, + "grad_norm": 0.3746076226234436, + "learning_rate": 0.0002722408963585434, + "loss": 0.4911, + "step": 26094 + }, + { + "epoch": 14.578212290502794, + "grad_norm": 1.9903069734573364, + "learning_rate": 0.0002722128851540616, + "loss": 0.4245, + "step": 26095 + }, + { + "epoch": 14.57877094972067, + "grad_norm": 1.221889615058899, + "learning_rate": 0.0002721848739495799, + "loss": 0.5497, + "step": 26096 + }, + { + "epoch": 14.579329608938547, + "grad_norm": 0.9008015990257263, + "learning_rate": 0.00027215686274509803, + "loss": 0.3349, + "step": 26097 + }, + { + "epoch": 14.579888268156424, + "grad_norm": 0.5886290073394775, + "learning_rate": 0.00027212885154061624, + "loss": 0.3793, + "step": 26098 + }, + { + "epoch": 14.580446927374302, + "grad_norm": 10.349678039550781, + "learning_rate": 0.00027210084033613445, + "loss": 0.3556, + "step": 26099 + }, + { + "epoch": 14.581005586592179, + "grad_norm": 0.7075499296188354, + "learning_rate": 0.00027207282913165265, + "loss": 0.4967, + "step": 26100 + }, + { + "epoch": 14.581564245810055, + "grad_norm": 0.659697949886322, + "learning_rate": 0.0002720448179271709, + "loss": 0.4812, + "step": 26101 + }, + { + "epoch": 14.582122905027934, + "grad_norm": 0.8686216473579407, + "learning_rate": 0.00027201680672268906, + "loss": 0.4624, + "step": 26102 + }, + { + "epoch": 14.58268156424581, + "grad_norm": 0.5101198554039001, + "learning_rate": 0.00027198879551820727, + "loss": 0.4314, + "step": 26103 + }, + { + "epoch": 14.583240223463687, + "grad_norm": 5.199033260345459, + "learning_rate": 0.00027196078431372553, + "loss": 0.3412, + "step": 26104 + }, + { + "epoch": 14.583798882681565, + "grad_norm": 0.5274847745895386, + "learning_rate": 0.0002719327731092437, + "loss": 0.3988, + "step": 26105 + }, + { + "epoch": 14.584357541899442, + "grad_norm": 0.45590347051620483, + "learning_rate": 0.00027190476190476194, + "loss": 0.4705, + "step": 26106 + }, + { + "epoch": 14.584916201117318, + "grad_norm": 1.5884829759597778, + "learning_rate": 0.0002718767507002801, + "loss": 0.3979, + "step": 26107 + }, + { + "epoch": 14.585474860335196, + "grad_norm": 0.39603689312934875, + "learning_rate": 0.0002718487394957983, + "loss": 0.3367, + "step": 26108 + }, + { + "epoch": 14.586033519553073, + "grad_norm": 0.552118718624115, + "learning_rate": 0.00027182072829131656, + "loss": 0.4641, + "step": 26109 + }, + { + "epoch": 14.58659217877095, + "grad_norm": 0.9286280274391174, + "learning_rate": 0.0002717927170868347, + "loss": 0.4111, + "step": 26110 + }, + { + "epoch": 14.587150837988826, + "grad_norm": 0.5307616591453552, + "learning_rate": 0.00027176470588235297, + "loss": 0.4479, + "step": 26111 + }, + { + "epoch": 14.587709497206705, + "grad_norm": 0.38906583189964294, + "learning_rate": 0.0002717366946778712, + "loss": 0.3181, + "step": 26112 + }, + { + "epoch": 14.588268156424581, + "grad_norm": 0.41692179441452026, + "learning_rate": 0.00027170868347338933, + "loss": 0.3959, + "step": 26113 + }, + { + "epoch": 14.588826815642458, + "grad_norm": 0.45565518736839294, + "learning_rate": 0.0002716806722689076, + "loss": 0.5004, + "step": 26114 + }, + { + "epoch": 14.589385474860336, + "grad_norm": 0.5528531074523926, + "learning_rate": 0.00027165266106442574, + "loss": 0.3311, + "step": 26115 + }, + { + "epoch": 14.589944134078213, + "grad_norm": 0.35898053646087646, + "learning_rate": 0.000271624649859944, + "loss": 0.3291, + "step": 26116 + }, + { + "epoch": 14.59050279329609, + "grad_norm": 0.4807465672492981, + "learning_rate": 0.0002715966386554622, + "loss": 0.4286, + "step": 26117 + }, + { + "epoch": 14.591061452513966, + "grad_norm": 0.42465221881866455, + "learning_rate": 0.00027156862745098036, + "loss": 0.4198, + "step": 26118 + }, + { + "epoch": 14.591620111731844, + "grad_norm": 0.41445204615592957, + "learning_rate": 0.0002715406162464986, + "loss": 0.3419, + "step": 26119 + }, + { + "epoch": 14.59217877094972, + "grad_norm": 0.32837793231010437, + "learning_rate": 0.0002715126050420168, + "loss": 0.4119, + "step": 26120 + }, + { + "epoch": 14.592737430167597, + "grad_norm": 0.3237442374229431, + "learning_rate": 0.00027148459383753503, + "loss": 0.3136, + "step": 26121 + }, + { + "epoch": 14.593296089385476, + "grad_norm": 0.42453381419181824, + "learning_rate": 0.00027145658263305324, + "loss": 0.3574, + "step": 26122 + }, + { + "epoch": 14.593854748603352, + "grad_norm": 1.4846601486206055, + "learning_rate": 0.0002714285714285714, + "loss": 0.3973, + "step": 26123 + }, + { + "epoch": 14.594413407821229, + "grad_norm": 0.4344806671142578, + "learning_rate": 0.00027140056022408965, + "loss": 0.4062, + "step": 26124 + }, + { + "epoch": 14.594972067039105, + "grad_norm": 0.38445907831192017, + "learning_rate": 0.00027137254901960786, + "loss": 0.3543, + "step": 26125 + }, + { + "epoch": 14.595530726256984, + "grad_norm": 0.5752466320991516, + "learning_rate": 0.00027134453781512606, + "loss": 0.4638, + "step": 26126 + }, + { + "epoch": 14.59608938547486, + "grad_norm": 0.8387674689292908, + "learning_rate": 0.00027131652661064427, + "loss": 0.3897, + "step": 26127 + }, + { + "epoch": 14.596648044692737, + "grad_norm": 0.705547034740448, + "learning_rate": 0.0002712885154061625, + "loss": 0.3764, + "step": 26128 + }, + { + "epoch": 14.597206703910615, + "grad_norm": 0.5294306874275208, + "learning_rate": 0.0002712605042016807, + "loss": 0.5623, + "step": 26129 + }, + { + "epoch": 14.597765363128492, + "grad_norm": 0.3328593373298645, + "learning_rate": 0.0002712324929971989, + "loss": 0.3736, + "step": 26130 + }, + { + "epoch": 14.598324022346368, + "grad_norm": 0.40915560722351074, + "learning_rate": 0.0002712044817927171, + "loss": 0.3548, + "step": 26131 + }, + { + "epoch": 14.598882681564247, + "grad_norm": 0.4741741716861725, + "learning_rate": 0.0002711764705882353, + "loss": 0.4751, + "step": 26132 + }, + { + "epoch": 14.599441340782123, + "grad_norm": 0.32831376791000366, + "learning_rate": 0.0002711484593837535, + "loss": 0.3642, + "step": 26133 + }, + { + "epoch": 14.6, + "grad_norm": 0.49157190322875977, + "learning_rate": 0.0002711204481792717, + "loss": 0.5136, + "step": 26134 + }, + { + "epoch": 14.600558659217878, + "grad_norm": 0.5309893488883972, + "learning_rate": 0.0002710924369747899, + "loss": 0.3769, + "step": 26135 + }, + { + "epoch": 14.601117318435755, + "grad_norm": 3.6335575580596924, + "learning_rate": 0.0002710644257703082, + "loss": 0.5501, + "step": 26136 + }, + { + "epoch": 14.601675977653631, + "grad_norm": 0.45638343691825867, + "learning_rate": 0.00027103641456582633, + "loss": 0.4078, + "step": 26137 + }, + { + "epoch": 14.602234636871508, + "grad_norm": 1.5849069356918335, + "learning_rate": 0.00027100840336134453, + "loss": 0.3802, + "step": 26138 + }, + { + "epoch": 14.602793296089386, + "grad_norm": 0.6146363019943237, + "learning_rate": 0.00027098039215686274, + "loss": 0.4778, + "step": 26139 + }, + { + "epoch": 14.603351955307263, + "grad_norm": 0.6562778353691101, + "learning_rate": 0.00027095238095238095, + "loss": 0.4588, + "step": 26140 + }, + { + "epoch": 14.60391061452514, + "grad_norm": 0.5333754420280457, + "learning_rate": 0.0002709243697478992, + "loss": 0.4616, + "step": 26141 + }, + { + "epoch": 14.604469273743018, + "grad_norm": 0.40937182307243347, + "learning_rate": 0.00027089635854341736, + "loss": 0.3876, + "step": 26142 + }, + { + "epoch": 14.605027932960894, + "grad_norm": 0.37132003903388977, + "learning_rate": 0.00027086834733893556, + "loss": 0.3164, + "step": 26143 + }, + { + "epoch": 14.60558659217877, + "grad_norm": 0.45623156428337097, + "learning_rate": 0.0002708403361344538, + "loss": 0.4682, + "step": 26144 + }, + { + "epoch": 14.606145251396647, + "grad_norm": 0.3326408267021179, + "learning_rate": 0.000270812324929972, + "loss": 0.4154, + "step": 26145 + }, + { + "epoch": 14.606703910614526, + "grad_norm": 0.4545988440513611, + "learning_rate": 0.00027078431372549024, + "loss": 0.4711, + "step": 26146 + }, + { + "epoch": 14.607262569832402, + "grad_norm": 0.4900790750980377, + "learning_rate": 0.0002707563025210084, + "loss": 0.5832, + "step": 26147 + }, + { + "epoch": 14.607821229050279, + "grad_norm": 0.7301142811775208, + "learning_rate": 0.0002707282913165266, + "loss": 0.3963, + "step": 26148 + }, + { + "epoch": 14.608379888268157, + "grad_norm": 0.3953668177127838, + "learning_rate": 0.00027070028011204485, + "loss": 0.4456, + "step": 26149 + }, + { + "epoch": 14.608938547486034, + "grad_norm": 0.5311002731323242, + "learning_rate": 0.000270672268907563, + "loss": 0.3338, + "step": 26150 + }, + { + "epoch": 14.60949720670391, + "grad_norm": 0.40936607122421265, + "learning_rate": 0.00027064425770308127, + "loss": 0.3963, + "step": 26151 + }, + { + "epoch": 14.610055865921789, + "grad_norm": 42.05873107910156, + "learning_rate": 0.00027061624649859947, + "loss": 0.4933, + "step": 26152 + }, + { + "epoch": 14.610614525139665, + "grad_norm": 0.7966561913490295, + "learning_rate": 0.0002705882352941176, + "loss": 0.4037, + "step": 26153 + }, + { + "epoch": 14.611173184357542, + "grad_norm": 0.49473223090171814, + "learning_rate": 0.0002705602240896359, + "loss": 0.4048, + "step": 26154 + }, + { + "epoch": 14.611731843575418, + "grad_norm": 0.5742973685264587, + "learning_rate": 0.00027053221288515404, + "loss": 0.5214, + "step": 26155 + }, + { + "epoch": 14.612290502793297, + "grad_norm": 0.4434870183467865, + "learning_rate": 0.0002705042016806723, + "loss": 0.3567, + "step": 26156 + }, + { + "epoch": 14.612849162011173, + "grad_norm": 0.6392762064933777, + "learning_rate": 0.0002704761904761905, + "loss": 0.4417, + "step": 26157 + }, + { + "epoch": 14.61340782122905, + "grad_norm": 0.4371165335178375, + "learning_rate": 0.00027044817927170865, + "loss": 0.398, + "step": 26158 + }, + { + "epoch": 14.613966480446928, + "grad_norm": 0.40892982482910156, + "learning_rate": 0.0002704201680672269, + "loss": 0.4626, + "step": 26159 + }, + { + "epoch": 14.614525139664805, + "grad_norm": 0.3397248387336731, + "learning_rate": 0.0002703921568627451, + "loss": 0.3824, + "step": 26160 + }, + { + "epoch": 14.615083798882681, + "grad_norm": 0.3726533353328705, + "learning_rate": 0.0002703641456582633, + "loss": 0.3862, + "step": 26161 + }, + { + "epoch": 14.61564245810056, + "grad_norm": 0.3600512146949768, + "learning_rate": 0.00027033613445378153, + "loss": 0.3747, + "step": 26162 + }, + { + "epoch": 14.616201117318436, + "grad_norm": 0.6502101421356201, + "learning_rate": 0.0002703081232492997, + "loss": 0.516, + "step": 26163 + }, + { + "epoch": 14.616759776536313, + "grad_norm": 0.40174487233161926, + "learning_rate": 0.00027028011204481794, + "loss": 0.4119, + "step": 26164 + }, + { + "epoch": 14.61731843575419, + "grad_norm": 0.8021479249000549, + "learning_rate": 0.00027025210084033615, + "loss": 0.4718, + "step": 26165 + }, + { + "epoch": 14.617877094972068, + "grad_norm": 0.8731991648674011, + "learning_rate": 0.0002702240896358543, + "loss": 0.5393, + "step": 26166 + }, + { + "epoch": 14.618435754189944, + "grad_norm": 0.4460891783237457, + "learning_rate": 0.00027019607843137256, + "loss": 0.2959, + "step": 26167 + }, + { + "epoch": 14.61899441340782, + "grad_norm": 0.4560711085796356, + "learning_rate": 0.00027016806722689077, + "loss": 0.3374, + "step": 26168 + }, + { + "epoch": 14.619553072625699, + "grad_norm": 0.45966285467147827, + "learning_rate": 0.000270140056022409, + "loss": 0.4145, + "step": 26169 + }, + { + "epoch": 14.620111731843576, + "grad_norm": 0.5002447962760925, + "learning_rate": 0.0002701120448179272, + "loss": 0.3981, + "step": 26170 + }, + { + "epoch": 14.620670391061452, + "grad_norm": 0.4015636146068573, + "learning_rate": 0.00027008403361344533, + "loss": 0.4184, + "step": 26171 + }, + { + "epoch": 14.621229050279329, + "grad_norm": 0.6292645335197449, + "learning_rate": 0.0002700560224089636, + "loss": 0.4649, + "step": 26172 + }, + { + "epoch": 14.621787709497207, + "grad_norm": 0.41048961877822876, + "learning_rate": 0.0002700280112044818, + "loss": 0.4324, + "step": 26173 + }, + { + "epoch": 14.622346368715084, + "grad_norm": 0.5327526926994324, + "learning_rate": 0.00027, + "loss": 0.403, + "step": 26174 + }, + { + "epoch": 14.62290502793296, + "grad_norm": 0.6859604716300964, + "learning_rate": 0.0002699719887955182, + "loss": 0.5822, + "step": 26175 + }, + { + "epoch": 14.623463687150839, + "grad_norm": 1.352912187576294, + "learning_rate": 0.0002699439775910364, + "loss": 0.3686, + "step": 26176 + }, + { + "epoch": 14.624022346368715, + "grad_norm": 0.5483441948890686, + "learning_rate": 0.0002699159663865546, + "loss": 0.3689, + "step": 26177 + }, + { + "epoch": 14.624581005586592, + "grad_norm": 0.5091087222099304, + "learning_rate": 0.00026988795518207283, + "loss": 0.3912, + "step": 26178 + }, + { + "epoch": 14.62513966480447, + "grad_norm": 0.5993852019309998, + "learning_rate": 0.00026985994397759103, + "loss": 0.3838, + "step": 26179 + }, + { + "epoch": 14.625698324022347, + "grad_norm": 0.45645418763160706, + "learning_rate": 0.00026983193277310924, + "loss": 0.3715, + "step": 26180 + }, + { + "epoch": 14.626256983240223, + "grad_norm": 0.4045490026473999, + "learning_rate": 0.00026980392156862745, + "loss": 0.484, + "step": 26181 + }, + { + "epoch": 14.6268156424581, + "grad_norm": 0.3664165735244751, + "learning_rate": 0.00026977591036414565, + "loss": 0.4915, + "step": 26182 + }, + { + "epoch": 14.627374301675978, + "grad_norm": 0.4844832122325897, + "learning_rate": 0.00026974789915966386, + "loss": 0.3243, + "step": 26183 + }, + { + "epoch": 14.627932960893855, + "grad_norm": 0.34051236510276794, + "learning_rate": 0.0002697198879551821, + "loss": 0.4092, + "step": 26184 + }, + { + "epoch": 14.628491620111731, + "grad_norm": 0.3885672986507416, + "learning_rate": 0.00026969187675070027, + "loss": 0.4048, + "step": 26185 + }, + { + "epoch": 14.62905027932961, + "grad_norm": 1.0388611555099487, + "learning_rate": 0.0002696638655462185, + "loss": 0.4823, + "step": 26186 + }, + { + "epoch": 14.629608938547486, + "grad_norm": 0.309492290019989, + "learning_rate": 0.0002696358543417367, + "loss": 0.3374, + "step": 26187 + }, + { + "epoch": 14.630167597765363, + "grad_norm": 0.5820735692977905, + "learning_rate": 0.0002696078431372549, + "loss": 0.4226, + "step": 26188 + }, + { + "epoch": 14.630726256983241, + "grad_norm": 0.557466447353363, + "learning_rate": 0.00026957983193277315, + "loss": 0.3548, + "step": 26189 + }, + { + "epoch": 14.631284916201118, + "grad_norm": 0.40924587845802307, + "learning_rate": 0.0002695518207282913, + "loss": 0.3518, + "step": 26190 + }, + { + "epoch": 14.631843575418994, + "grad_norm": 0.499137818813324, + "learning_rate": 0.0002695238095238095, + "loss": 0.3912, + "step": 26191 + }, + { + "epoch": 14.63240223463687, + "grad_norm": 0.3898703455924988, + "learning_rate": 0.00026949579831932777, + "loss": 0.3434, + "step": 26192 + }, + { + "epoch": 14.632960893854749, + "grad_norm": 0.40127143263816833, + "learning_rate": 0.0002694677871148459, + "loss": 0.42, + "step": 26193 + }, + { + "epoch": 14.633519553072626, + "grad_norm": 22.813966751098633, + "learning_rate": 0.0002694397759103642, + "loss": 0.3111, + "step": 26194 + }, + { + "epoch": 14.634078212290502, + "grad_norm": 1.0975518226623535, + "learning_rate": 0.00026941176470588233, + "loss": 0.3402, + "step": 26195 + }, + { + "epoch": 14.63463687150838, + "grad_norm": 0.4779471457004547, + "learning_rate": 0.00026938375350140054, + "loss": 0.4725, + "step": 26196 + }, + { + "epoch": 14.635195530726257, + "grad_norm": 0.6070505976676941, + "learning_rate": 0.0002693557422969188, + "loss": 0.432, + "step": 26197 + }, + { + "epoch": 14.635754189944134, + "grad_norm": 0.5175840854644775, + "learning_rate": 0.00026932773109243695, + "loss": 0.5037, + "step": 26198 + }, + { + "epoch": 14.63631284916201, + "grad_norm": 0.5430041551589966, + "learning_rate": 0.0002692997198879552, + "loss": 0.3568, + "step": 26199 + }, + { + "epoch": 14.636871508379889, + "grad_norm": 1.4578442573547363, + "learning_rate": 0.0002692717086834734, + "loss": 0.4893, + "step": 26200 + }, + { + "epoch": 14.637430167597765, + "grad_norm": 0.5916971564292908, + "learning_rate": 0.00026924369747899157, + "loss": 0.3631, + "step": 26201 + }, + { + "epoch": 14.637988826815642, + "grad_norm": 1.3853681087493896, + "learning_rate": 0.0002692156862745098, + "loss": 0.5891, + "step": 26202 + }, + { + "epoch": 14.63854748603352, + "grad_norm": 0.3974631726741791, + "learning_rate": 0.000269187675070028, + "loss": 0.3572, + "step": 26203 + }, + { + "epoch": 14.639106145251397, + "grad_norm": 0.6649497151374817, + "learning_rate": 0.00026915966386554624, + "loss": 0.4976, + "step": 26204 + }, + { + "epoch": 14.639664804469273, + "grad_norm": 0.4823826849460602, + "learning_rate": 0.00026913165266106444, + "loss": 0.437, + "step": 26205 + }, + { + "epoch": 14.640223463687152, + "grad_norm": 0.668759286403656, + "learning_rate": 0.0002691036414565826, + "loss": 0.4794, + "step": 26206 + }, + { + "epoch": 14.640782122905028, + "grad_norm": 1.0757184028625488, + "learning_rate": 0.00026907563025210086, + "loss": 0.3916, + "step": 26207 + }, + { + "epoch": 14.641340782122905, + "grad_norm": 2.5164954662323, + "learning_rate": 0.00026904761904761906, + "loss": 0.5758, + "step": 26208 + }, + { + "epoch": 14.641899441340783, + "grad_norm": 0.41021421551704407, + "learning_rate": 0.00026901960784313727, + "loss": 0.4502, + "step": 26209 + }, + { + "epoch": 14.64245810055866, + "grad_norm": 2.5434138774871826, + "learning_rate": 0.0002689915966386555, + "loss": 0.4813, + "step": 26210 + }, + { + "epoch": 14.643016759776536, + "grad_norm": 0.5545975565910339, + "learning_rate": 0.0002689635854341736, + "loss": 0.4348, + "step": 26211 + }, + { + "epoch": 14.643575418994413, + "grad_norm": 1.3623853921890259, + "learning_rate": 0.0002689355742296919, + "loss": 0.3345, + "step": 26212 + }, + { + "epoch": 14.644134078212291, + "grad_norm": 1.0093461275100708, + "learning_rate": 0.0002689075630252101, + "loss": 0.5207, + "step": 26213 + }, + { + "epoch": 14.644692737430168, + "grad_norm": 0.3987479507923126, + "learning_rate": 0.0002688795518207283, + "loss": 0.492, + "step": 26214 + }, + { + "epoch": 14.645251396648044, + "grad_norm": 0.4499443471431732, + "learning_rate": 0.0002688515406162465, + "loss": 0.4389, + "step": 26215 + }, + { + "epoch": 14.645810055865923, + "grad_norm": 0.8201692700386047, + "learning_rate": 0.0002688235294117647, + "loss": 0.4016, + "step": 26216 + }, + { + "epoch": 14.6463687150838, + "grad_norm": 0.6254597306251526, + "learning_rate": 0.0002687955182072829, + "loss": 0.4247, + "step": 26217 + }, + { + "epoch": 14.646927374301676, + "grad_norm": 0.32390525937080383, + "learning_rate": 0.0002687675070028011, + "loss": 0.3831, + "step": 26218 + }, + { + "epoch": 14.647486033519552, + "grad_norm": 0.4069139361381531, + "learning_rate": 0.0002687394957983194, + "loss": 0.4152, + "step": 26219 + }, + { + "epoch": 14.64804469273743, + "grad_norm": 0.37570512294769287, + "learning_rate": 0.00026871148459383753, + "loss": 0.4136, + "step": 26220 + }, + { + "epoch": 14.648603351955307, + "grad_norm": 0.6094589829444885, + "learning_rate": 0.00026868347338935574, + "loss": 0.5027, + "step": 26221 + }, + { + "epoch": 14.649162011173184, + "grad_norm": 0.39261001348495483, + "learning_rate": 0.00026865546218487395, + "loss": 0.4587, + "step": 26222 + }, + { + "epoch": 14.649720670391062, + "grad_norm": 0.5667718648910522, + "learning_rate": 0.00026862745098039215, + "loss": 0.5456, + "step": 26223 + }, + { + "epoch": 14.650279329608939, + "grad_norm": 0.7404841780662537, + "learning_rate": 0.0002685994397759104, + "loss": 0.4915, + "step": 26224 + }, + { + "epoch": 14.650837988826815, + "grad_norm": 0.6316376328468323, + "learning_rate": 0.00026857142857142856, + "loss": 0.4496, + "step": 26225 + }, + { + "epoch": 14.651396648044694, + "grad_norm": 0.4138798713684082, + "learning_rate": 0.00026854341736694677, + "loss": 0.4422, + "step": 26226 + }, + { + "epoch": 14.65195530726257, + "grad_norm": 0.5987324118614197, + "learning_rate": 0.00026851540616246503, + "loss": 0.3764, + "step": 26227 + }, + { + "epoch": 14.652513966480447, + "grad_norm": 0.5430089831352234, + "learning_rate": 0.0002684873949579832, + "loss": 0.4924, + "step": 26228 + }, + { + "epoch": 14.653072625698323, + "grad_norm": 0.37948867678642273, + "learning_rate": 0.00026845938375350144, + "loss": 0.3416, + "step": 26229 + }, + { + "epoch": 14.653631284916202, + "grad_norm": 1.177739143371582, + "learning_rate": 0.0002684313725490196, + "loss": 0.4222, + "step": 26230 + }, + { + "epoch": 14.654189944134078, + "grad_norm": 1.028996467590332, + "learning_rate": 0.0002684033613445378, + "loss": 0.403, + "step": 26231 + }, + { + "epoch": 14.654748603351955, + "grad_norm": 0.3851401209831238, + "learning_rate": 0.00026837535014005606, + "loss": 0.3784, + "step": 26232 + }, + { + "epoch": 14.655307262569833, + "grad_norm": 0.6418029069900513, + "learning_rate": 0.0002683473389355742, + "loss": 0.4123, + "step": 26233 + }, + { + "epoch": 14.65586592178771, + "grad_norm": 0.49686112999916077, + "learning_rate": 0.00026831932773109247, + "loss": 0.6794, + "step": 26234 + }, + { + "epoch": 14.656424581005586, + "grad_norm": 0.5954158902168274, + "learning_rate": 0.0002682913165266107, + "loss": 0.3666, + "step": 26235 + }, + { + "epoch": 14.656983240223465, + "grad_norm": 0.5749161243438721, + "learning_rate": 0.00026826330532212883, + "loss": 0.4424, + "step": 26236 + }, + { + "epoch": 14.657541899441341, + "grad_norm": 0.4638644754886627, + "learning_rate": 0.0002682352941176471, + "loss": 0.4241, + "step": 26237 + }, + { + "epoch": 14.658100558659218, + "grad_norm": 0.3733034133911133, + "learning_rate": 0.00026820728291316524, + "loss": 0.3891, + "step": 26238 + }, + { + "epoch": 14.658659217877094, + "grad_norm": 0.3820943832397461, + "learning_rate": 0.0002681792717086835, + "loss": 0.4636, + "step": 26239 + }, + { + "epoch": 14.659217877094973, + "grad_norm": 0.47946685552597046, + "learning_rate": 0.0002681512605042017, + "loss": 0.4383, + "step": 26240 + }, + { + "epoch": 14.65977653631285, + "grad_norm": 0.9269028306007385, + "learning_rate": 0.00026812324929971986, + "loss": 0.4399, + "step": 26241 + }, + { + "epoch": 14.660335195530726, + "grad_norm": 0.5003195405006409, + "learning_rate": 0.0002680952380952381, + "loss": 0.375, + "step": 26242 + }, + { + "epoch": 14.660893854748604, + "grad_norm": 0.5342738032341003, + "learning_rate": 0.0002680672268907563, + "loss": 0.6067, + "step": 26243 + }, + { + "epoch": 14.66145251396648, + "grad_norm": 0.47403308749198914, + "learning_rate": 0.00026803921568627453, + "loss": 0.4541, + "step": 26244 + }, + { + "epoch": 14.662011173184357, + "grad_norm": 0.6981449127197266, + "learning_rate": 0.00026801120448179274, + "loss": 0.4106, + "step": 26245 + }, + { + "epoch": 14.662569832402234, + "grad_norm": 0.3924463093280792, + "learning_rate": 0.0002679831932773109, + "loss": 0.4244, + "step": 26246 + }, + { + "epoch": 14.663128491620112, + "grad_norm": 0.5522960424423218, + "learning_rate": 0.00026795518207282915, + "loss": 0.4407, + "step": 26247 + }, + { + "epoch": 14.663687150837989, + "grad_norm": 0.5107319355010986, + "learning_rate": 0.00026792717086834736, + "loss": 0.4159, + "step": 26248 + }, + { + "epoch": 14.664245810055865, + "grad_norm": 4.181353569030762, + "learning_rate": 0.00026789915966386556, + "loss": 0.3792, + "step": 26249 + }, + { + "epoch": 14.664804469273744, + "grad_norm": 0.6626802086830139, + "learning_rate": 0.00026787114845938377, + "loss": 0.4671, + "step": 26250 + }, + { + "epoch": 14.66536312849162, + "grad_norm": 0.3738318085670471, + "learning_rate": 0.000267843137254902, + "loss": 0.4495, + "step": 26251 + }, + { + "epoch": 14.665921787709497, + "grad_norm": 0.5367510914802551, + "learning_rate": 0.0002678151260504202, + "loss": 0.3019, + "step": 26252 + }, + { + "epoch": 14.666480446927375, + "grad_norm": 0.5658826231956482, + "learning_rate": 0.0002677871148459384, + "loss": 0.5239, + "step": 26253 + }, + { + "epoch": 14.667039106145252, + "grad_norm": 0.4039609730243683, + "learning_rate": 0.0002677591036414566, + "loss": 0.4327, + "step": 26254 + }, + { + "epoch": 14.667597765363128, + "grad_norm": 0.42840760946273804, + "learning_rate": 0.0002677310924369748, + "loss": 0.4675, + "step": 26255 + }, + { + "epoch": 14.668156424581005, + "grad_norm": 0.47770094871520996, + "learning_rate": 0.000267703081232493, + "loss": 0.5503, + "step": 26256 + }, + { + "epoch": 14.668715083798883, + "grad_norm": 0.588168740272522, + "learning_rate": 0.0002676750700280112, + "loss": 0.437, + "step": 26257 + }, + { + "epoch": 14.66927374301676, + "grad_norm": 0.5643516182899475, + "learning_rate": 0.0002676470588235294, + "loss": 0.4007, + "step": 26258 + }, + { + "epoch": 14.669832402234636, + "grad_norm": 0.49603214859962463, + "learning_rate": 0.0002676190476190477, + "loss": 0.3827, + "step": 26259 + }, + { + "epoch": 14.670391061452515, + "grad_norm": 0.47800755500793457, + "learning_rate": 0.00026759103641456583, + "loss": 0.3589, + "step": 26260 + }, + { + "epoch": 14.670949720670391, + "grad_norm": 0.32600730657577515, + "learning_rate": 0.00026756302521008403, + "loss": 0.3427, + "step": 26261 + }, + { + "epoch": 14.671508379888268, + "grad_norm": 0.5929835438728333, + "learning_rate": 0.00026753501400560224, + "loss": 0.4558, + "step": 26262 + }, + { + "epoch": 14.672067039106146, + "grad_norm": 0.40991365909576416, + "learning_rate": 0.00026750700280112045, + "loss": 0.477, + "step": 26263 + }, + { + "epoch": 14.672625698324023, + "grad_norm": 0.5074556469917297, + "learning_rate": 0.0002674789915966387, + "loss": 0.4633, + "step": 26264 + }, + { + "epoch": 14.6731843575419, + "grad_norm": 0.5383959412574768, + "learning_rate": 0.00026745098039215686, + "loss": 0.4367, + "step": 26265 + }, + { + "epoch": 14.673743016759776, + "grad_norm": 0.6705188155174255, + "learning_rate": 0.00026742296918767506, + "loss": 0.393, + "step": 26266 + }, + { + "epoch": 14.674301675977654, + "grad_norm": 0.5704450607299805, + "learning_rate": 0.0002673949579831933, + "loss": 0.4979, + "step": 26267 + }, + { + "epoch": 14.67486033519553, + "grad_norm": 0.381169855594635, + "learning_rate": 0.0002673669467787115, + "loss": 0.3592, + "step": 26268 + }, + { + "epoch": 14.675418994413407, + "grad_norm": 0.4601171612739563, + "learning_rate": 0.00026733893557422974, + "loss": 0.4048, + "step": 26269 + }, + { + "epoch": 14.675977653631286, + "grad_norm": 0.4392685890197754, + "learning_rate": 0.0002673109243697479, + "loss": 0.4004, + "step": 26270 + }, + { + "epoch": 14.676536312849162, + "grad_norm": 0.32139235734939575, + "learning_rate": 0.0002672829131652661, + "loss": 0.3192, + "step": 26271 + }, + { + "epoch": 14.677094972067039, + "grad_norm": 0.3459639847278595, + "learning_rate": 0.00026725490196078435, + "loss": 0.3346, + "step": 26272 + }, + { + "epoch": 14.677653631284915, + "grad_norm": 0.37067097425460815, + "learning_rate": 0.0002672268907563025, + "loss": 0.3428, + "step": 26273 + }, + { + "epoch": 14.678212290502794, + "grad_norm": 0.578516960144043, + "learning_rate": 0.00026719887955182077, + "loss": 0.4317, + "step": 26274 + }, + { + "epoch": 14.67877094972067, + "grad_norm": 0.7438474893569946, + "learning_rate": 0.00026717086834733897, + "loss": 0.3312, + "step": 26275 + }, + { + "epoch": 14.679329608938547, + "grad_norm": 0.857010543346405, + "learning_rate": 0.0002671428571428571, + "loss": 0.3472, + "step": 26276 + }, + { + "epoch": 14.679888268156425, + "grad_norm": 0.4259175658226013, + "learning_rate": 0.0002671148459383754, + "loss": 0.4157, + "step": 26277 + }, + { + "epoch": 14.680446927374302, + "grad_norm": 0.5690827369689941, + "learning_rate": 0.00026708683473389354, + "loss": 0.5219, + "step": 26278 + }, + { + "epoch": 14.681005586592178, + "grad_norm": 0.3690986633300781, + "learning_rate": 0.00026705882352941174, + "loss": 0.3812, + "step": 26279 + }, + { + "epoch": 14.681564245810057, + "grad_norm": 0.5454613566398621, + "learning_rate": 0.00026703081232493, + "loss": 0.4324, + "step": 26280 + }, + { + "epoch": 14.682122905027933, + "grad_norm": 1.6613110303878784, + "learning_rate": 0.00026700280112044815, + "loss": 0.4891, + "step": 26281 + }, + { + "epoch": 14.68268156424581, + "grad_norm": 0.501953125, + "learning_rate": 0.0002669747899159664, + "loss": 0.2629, + "step": 26282 + }, + { + "epoch": 14.683240223463688, + "grad_norm": 0.483599454164505, + "learning_rate": 0.0002669467787114846, + "loss": 0.377, + "step": 26283 + }, + { + "epoch": 14.683798882681565, + "grad_norm": 0.4072704017162323, + "learning_rate": 0.00026691876750700277, + "loss": 0.4607, + "step": 26284 + }, + { + "epoch": 14.684357541899441, + "grad_norm": 0.5300388932228088, + "learning_rate": 0.00026689075630252103, + "loss": 0.4773, + "step": 26285 + }, + { + "epoch": 14.684916201117318, + "grad_norm": 0.7170148491859436, + "learning_rate": 0.0002668627450980392, + "loss": 0.5422, + "step": 26286 + }, + { + "epoch": 14.685474860335196, + "grad_norm": 0.45028436183929443, + "learning_rate": 0.00026683473389355744, + "loss": 0.308, + "step": 26287 + }, + { + "epoch": 14.686033519553073, + "grad_norm": 2.491564989089966, + "learning_rate": 0.00026680672268907565, + "loss": 0.4194, + "step": 26288 + }, + { + "epoch": 14.68659217877095, + "grad_norm": 1.1377471685409546, + "learning_rate": 0.0002667787114845938, + "loss": 0.3476, + "step": 26289 + }, + { + "epoch": 14.687150837988828, + "grad_norm": 1.7199230194091797, + "learning_rate": 0.00026675070028011206, + "loss": 0.3383, + "step": 26290 + }, + { + "epoch": 14.687709497206704, + "grad_norm": 0.4311559498310089, + "learning_rate": 0.00026672268907563027, + "loss": 0.4418, + "step": 26291 + }, + { + "epoch": 14.68826815642458, + "grad_norm": 0.3717029094696045, + "learning_rate": 0.0002666946778711485, + "loss": 0.396, + "step": 26292 + }, + { + "epoch": 14.688826815642457, + "grad_norm": 0.6051945686340332, + "learning_rate": 0.0002666666666666667, + "loss": 0.3519, + "step": 26293 + }, + { + "epoch": 14.689385474860336, + "grad_norm": 0.4016841650009155, + "learning_rate": 0.00026663865546218483, + "loss": 0.4889, + "step": 26294 + }, + { + "epoch": 14.689944134078212, + "grad_norm": 1.0330736637115479, + "learning_rate": 0.0002666106442577031, + "loss": 0.4406, + "step": 26295 + }, + { + "epoch": 14.690502793296089, + "grad_norm": 1.0698660612106323, + "learning_rate": 0.0002665826330532213, + "loss": 0.4468, + "step": 26296 + }, + { + "epoch": 14.691061452513967, + "grad_norm": 1.1493093967437744, + "learning_rate": 0.0002665546218487395, + "loss": 0.4364, + "step": 26297 + }, + { + "epoch": 14.691620111731844, + "grad_norm": 0.29861214756965637, + "learning_rate": 0.0002665266106442577, + "loss": 0.2787, + "step": 26298 + }, + { + "epoch": 14.69217877094972, + "grad_norm": 0.8863983154296875, + "learning_rate": 0.0002664985994397759, + "loss": 0.3846, + "step": 26299 + }, + { + "epoch": 14.692737430167599, + "grad_norm": 0.6042300462722778, + "learning_rate": 0.0002664705882352941, + "loss": 0.3744, + "step": 26300 + }, + { + "epoch": 14.693296089385475, + "grad_norm": 0.4075743556022644, + "learning_rate": 0.00026644257703081233, + "loss": 0.4425, + "step": 26301 + }, + { + "epoch": 14.693854748603352, + "grad_norm": 0.4086613655090332, + "learning_rate": 0.00026641456582633053, + "loss": 0.4512, + "step": 26302 + }, + { + "epoch": 14.694413407821228, + "grad_norm": 0.6771743297576904, + "learning_rate": 0.00026638655462184874, + "loss": 0.5386, + "step": 26303 + }, + { + "epoch": 14.694972067039107, + "grad_norm": 0.6762493252754211, + "learning_rate": 0.00026635854341736695, + "loss": 0.3726, + "step": 26304 + }, + { + "epoch": 14.695530726256983, + "grad_norm": 0.7674379944801331, + "learning_rate": 0.00026633053221288515, + "loss": 0.3424, + "step": 26305 + }, + { + "epoch": 14.69608938547486, + "grad_norm": 0.4134715497493744, + "learning_rate": 0.00026630252100840336, + "loss": 0.4478, + "step": 26306 + }, + { + "epoch": 14.696648044692738, + "grad_norm": 0.7435882091522217, + "learning_rate": 0.0002662745098039216, + "loss": 0.4822, + "step": 26307 + }, + { + "epoch": 14.697206703910615, + "grad_norm": 0.5122054219245911, + "learning_rate": 0.00026624649859943977, + "loss": 0.4371, + "step": 26308 + }, + { + "epoch": 14.697765363128491, + "grad_norm": 0.5374253988265991, + "learning_rate": 0.000266218487394958, + "loss": 0.5478, + "step": 26309 + }, + { + "epoch": 14.69832402234637, + "grad_norm": 0.392660528421402, + "learning_rate": 0.0002661904761904762, + "loss": 0.4816, + "step": 26310 + }, + { + "epoch": 14.698882681564246, + "grad_norm": 0.7365900874137878, + "learning_rate": 0.0002661624649859944, + "loss": 0.4212, + "step": 26311 + }, + { + "epoch": 14.699441340782123, + "grad_norm": 1.0071672201156616, + "learning_rate": 0.00026613445378151265, + "loss": 0.3028, + "step": 26312 + }, + { + "epoch": 14.7, + "grad_norm": 0.4007118344306946, + "learning_rate": 0.0002661064425770308, + "loss": 0.4014, + "step": 26313 + }, + { + "epoch": 14.700558659217878, + "grad_norm": 0.37256067991256714, + "learning_rate": 0.000266078431372549, + "loss": 0.3697, + "step": 26314 + }, + { + "epoch": 14.701117318435754, + "grad_norm": 0.4756200313568115, + "learning_rate": 0.00026605042016806727, + "loss": 0.4586, + "step": 26315 + }, + { + "epoch": 14.70167597765363, + "grad_norm": 0.5360227227210999, + "learning_rate": 0.0002660224089635854, + "loss": 0.4186, + "step": 26316 + }, + { + "epoch": 14.702234636871509, + "grad_norm": 0.46384015679359436, + "learning_rate": 0.0002659943977591037, + "loss": 0.3584, + "step": 26317 + }, + { + "epoch": 14.702793296089386, + "grad_norm": 0.4285893142223358, + "learning_rate": 0.00026596638655462183, + "loss": 0.3876, + "step": 26318 + }, + { + "epoch": 14.703351955307262, + "grad_norm": 0.46843311190605164, + "learning_rate": 0.00026593837535014004, + "loss": 0.386, + "step": 26319 + }, + { + "epoch": 14.703910614525139, + "grad_norm": 0.34277692437171936, + "learning_rate": 0.0002659103641456583, + "loss": 0.3437, + "step": 26320 + }, + { + "epoch": 14.704469273743017, + "grad_norm": 0.5339342355728149, + "learning_rate": 0.00026588235294117645, + "loss": 0.3317, + "step": 26321 + }, + { + "epoch": 14.705027932960894, + "grad_norm": 0.8739034533500671, + "learning_rate": 0.0002658543417366947, + "loss": 0.4804, + "step": 26322 + }, + { + "epoch": 14.70558659217877, + "grad_norm": 0.6812794804573059, + "learning_rate": 0.0002658263305322129, + "loss": 0.4258, + "step": 26323 + }, + { + "epoch": 14.706145251396649, + "grad_norm": 0.3642677366733551, + "learning_rate": 0.00026579831932773107, + "loss": 0.5258, + "step": 26324 + }, + { + "epoch": 14.706703910614525, + "grad_norm": 0.44004127383232117, + "learning_rate": 0.0002657703081232493, + "loss": 0.4812, + "step": 26325 + }, + { + "epoch": 14.707262569832402, + "grad_norm": 0.9689213037490845, + "learning_rate": 0.0002657422969187675, + "loss": 0.4773, + "step": 26326 + }, + { + "epoch": 14.70782122905028, + "grad_norm": 0.5342394709587097, + "learning_rate": 0.00026571428571428574, + "loss": 0.4071, + "step": 26327 + }, + { + "epoch": 14.708379888268157, + "grad_norm": 0.37837132811546326, + "learning_rate": 0.00026568627450980394, + "loss": 0.3604, + "step": 26328 + }, + { + "epoch": 14.708938547486033, + "grad_norm": 0.381149560213089, + "learning_rate": 0.0002656582633053221, + "loss": 0.3815, + "step": 26329 + }, + { + "epoch": 14.70949720670391, + "grad_norm": 0.5717688202857971, + "learning_rate": 0.00026563025210084036, + "loss": 0.3629, + "step": 26330 + }, + { + "epoch": 14.710055865921788, + "grad_norm": 23.346769332885742, + "learning_rate": 0.00026560224089635856, + "loss": 0.5919, + "step": 26331 + }, + { + "epoch": 14.710614525139665, + "grad_norm": 0.48594433069229126, + "learning_rate": 0.00026557422969187677, + "loss": 0.4452, + "step": 26332 + }, + { + "epoch": 14.711173184357541, + "grad_norm": 0.7142903208732605, + "learning_rate": 0.000265546218487395, + "loss": 0.4418, + "step": 26333 + }, + { + "epoch": 14.71173184357542, + "grad_norm": 0.5551584959030151, + "learning_rate": 0.0002655182072829131, + "loss": 0.4243, + "step": 26334 + }, + { + "epoch": 14.712290502793296, + "grad_norm": 0.41218405961990356, + "learning_rate": 0.0002654901960784314, + "loss": 0.469, + "step": 26335 + }, + { + "epoch": 14.712849162011173, + "grad_norm": 0.7614871859550476, + "learning_rate": 0.0002654621848739496, + "loss": 0.3932, + "step": 26336 + }, + { + "epoch": 14.713407821229051, + "grad_norm": 0.45670029520988464, + "learning_rate": 0.0002654341736694678, + "loss": 0.5043, + "step": 26337 + }, + { + "epoch": 14.713966480446928, + "grad_norm": 0.4401882290840149, + "learning_rate": 0.000265406162464986, + "loss": 0.3796, + "step": 26338 + }, + { + "epoch": 14.714525139664804, + "grad_norm": 0.6125187873840332, + "learning_rate": 0.0002653781512605042, + "loss": 0.3395, + "step": 26339 + }, + { + "epoch": 14.71508379888268, + "grad_norm": 0.5906593203544617, + "learning_rate": 0.0002653501400560224, + "loss": 0.5487, + "step": 26340 + }, + { + "epoch": 14.71564245810056, + "grad_norm": 0.7808915376663208, + "learning_rate": 0.0002653221288515406, + "loss": 0.4719, + "step": 26341 + }, + { + "epoch": 14.716201117318436, + "grad_norm": 0.3673170506954193, + "learning_rate": 0.00026529411764705883, + "loss": 0.399, + "step": 26342 + }, + { + "epoch": 14.716759776536312, + "grad_norm": 0.5387362241744995, + "learning_rate": 0.00026526610644257703, + "loss": 0.4279, + "step": 26343 + }, + { + "epoch": 14.71731843575419, + "grad_norm": 0.518286943435669, + "learning_rate": 0.00026523809523809524, + "loss": 0.4499, + "step": 26344 + }, + { + "epoch": 14.717877094972067, + "grad_norm": 1.57997727394104, + "learning_rate": 0.00026521008403361345, + "loss": 0.3926, + "step": 26345 + }, + { + "epoch": 14.718435754189944, + "grad_norm": 0.5612686276435852, + "learning_rate": 0.00026518207282913165, + "loss": 0.3542, + "step": 26346 + }, + { + "epoch": 14.71899441340782, + "grad_norm": 0.44491249322891235, + "learning_rate": 0.0002651540616246499, + "loss": 0.3814, + "step": 26347 + }, + { + "epoch": 14.719553072625699, + "grad_norm": 0.4358351528644562, + "learning_rate": 0.00026512605042016806, + "loss": 0.4099, + "step": 26348 + }, + { + "epoch": 14.720111731843575, + "grad_norm": 0.5698142051696777, + "learning_rate": 0.00026509803921568627, + "loss": 0.428, + "step": 26349 + }, + { + "epoch": 14.720670391061452, + "grad_norm": 1.27790367603302, + "learning_rate": 0.0002650700280112045, + "loss": 0.3452, + "step": 26350 + }, + { + "epoch": 14.72122905027933, + "grad_norm": 0.5082082748413086, + "learning_rate": 0.0002650420168067227, + "loss": 0.3512, + "step": 26351 + }, + { + "epoch": 14.721787709497207, + "grad_norm": 0.6949784755706787, + "learning_rate": 0.00026501400560224094, + "loss": 0.4782, + "step": 26352 + }, + { + "epoch": 14.722346368715083, + "grad_norm": 0.5166410803794861, + "learning_rate": 0.0002649859943977591, + "loss": 0.3365, + "step": 26353 + }, + { + "epoch": 14.722905027932962, + "grad_norm": 0.3750188946723938, + "learning_rate": 0.0002649579831932773, + "loss": 0.3802, + "step": 26354 + }, + { + "epoch": 14.723463687150838, + "grad_norm": 0.45565059781074524, + "learning_rate": 0.00026492997198879556, + "loss": 0.4176, + "step": 26355 + }, + { + "epoch": 14.724022346368715, + "grad_norm": 3.277662992477417, + "learning_rate": 0.0002649019607843137, + "loss": 0.3548, + "step": 26356 + }, + { + "epoch": 14.724581005586593, + "grad_norm": 0.9732744097709656, + "learning_rate": 0.00026487394957983197, + "loss": 0.4092, + "step": 26357 + }, + { + "epoch": 14.72513966480447, + "grad_norm": 0.38876616954803467, + "learning_rate": 0.0002648459383753501, + "loss": 0.4006, + "step": 26358 + }, + { + "epoch": 14.725698324022346, + "grad_norm": 0.5247053503990173, + "learning_rate": 0.00026481792717086833, + "loss": 0.4387, + "step": 26359 + }, + { + "epoch": 14.726256983240223, + "grad_norm": 0.5945095419883728, + "learning_rate": 0.0002647899159663866, + "loss": 0.3847, + "step": 26360 + }, + { + "epoch": 14.726815642458101, + "grad_norm": 0.5156911611557007, + "learning_rate": 0.00026476190476190474, + "loss": 0.3732, + "step": 26361 + }, + { + "epoch": 14.727374301675978, + "grad_norm": 0.4860561490058899, + "learning_rate": 0.000264733893557423, + "loss": 0.5057, + "step": 26362 + }, + { + "epoch": 14.727932960893854, + "grad_norm": 1.3079116344451904, + "learning_rate": 0.0002647058823529412, + "loss": 0.5015, + "step": 26363 + }, + { + "epoch": 14.728491620111733, + "grad_norm": 0.38649171590805054, + "learning_rate": 0.00026467787114845936, + "loss": 0.404, + "step": 26364 + }, + { + "epoch": 14.72905027932961, + "grad_norm": 0.42645263671875, + "learning_rate": 0.0002646498599439776, + "loss": 0.4418, + "step": 26365 + }, + { + "epoch": 14.729608938547486, + "grad_norm": 0.42595547437667847, + "learning_rate": 0.00026462184873949577, + "loss": 0.3842, + "step": 26366 + }, + { + "epoch": 14.730167597765362, + "grad_norm": 4.825133800506592, + "learning_rate": 0.00026459383753501403, + "loss": 0.5028, + "step": 26367 + }, + { + "epoch": 14.73072625698324, + "grad_norm": 0.40822741389274597, + "learning_rate": 0.00026456582633053224, + "loss": 0.4193, + "step": 26368 + }, + { + "epoch": 14.731284916201117, + "grad_norm": 0.7829367518424988, + "learning_rate": 0.0002645378151260504, + "loss": 0.5717, + "step": 26369 + }, + { + "epoch": 14.731843575418994, + "grad_norm": 1.4182761907577515, + "learning_rate": 0.00026450980392156865, + "loss": 0.4274, + "step": 26370 + }, + { + "epoch": 14.732402234636872, + "grad_norm": 2.1350560188293457, + "learning_rate": 0.00026448179271708686, + "loss": 0.3759, + "step": 26371 + }, + { + "epoch": 14.732960893854749, + "grad_norm": 0.42354243993759155, + "learning_rate": 0.00026445378151260506, + "loss": 0.4954, + "step": 26372 + }, + { + "epoch": 14.733519553072625, + "grad_norm": 0.35651227831840515, + "learning_rate": 0.00026442577030812327, + "loss": 0.3238, + "step": 26373 + }, + { + "epoch": 14.734078212290502, + "grad_norm": 0.6394851803779602, + "learning_rate": 0.0002643977591036414, + "loss": 0.4724, + "step": 26374 + }, + { + "epoch": 14.73463687150838, + "grad_norm": 0.38952717185020447, + "learning_rate": 0.0002643697478991597, + "loss": 0.4467, + "step": 26375 + }, + { + "epoch": 14.735195530726257, + "grad_norm": 0.7485803365707397, + "learning_rate": 0.0002643417366946779, + "loss": 0.4046, + "step": 26376 + }, + { + "epoch": 14.735754189944133, + "grad_norm": 0.5602471232414246, + "learning_rate": 0.0002643137254901961, + "loss": 0.3492, + "step": 26377 + }, + { + "epoch": 14.736312849162012, + "grad_norm": 0.9912500381469727, + "learning_rate": 0.0002642857142857143, + "loss": 0.3639, + "step": 26378 + }, + { + "epoch": 14.736871508379888, + "grad_norm": 0.6238000988960266, + "learning_rate": 0.0002642577030812325, + "loss": 0.3999, + "step": 26379 + }, + { + "epoch": 14.737430167597765, + "grad_norm": 0.7098654508590698, + "learning_rate": 0.0002642296918767507, + "loss": 0.6118, + "step": 26380 + }, + { + "epoch": 14.737988826815643, + "grad_norm": 0.38345661759376526, + "learning_rate": 0.0002642016806722689, + "loss": 0.3558, + "step": 26381 + }, + { + "epoch": 14.73854748603352, + "grad_norm": 0.5556296706199646, + "learning_rate": 0.0002641736694677871, + "loss": 0.4296, + "step": 26382 + }, + { + "epoch": 14.739106145251396, + "grad_norm": 0.5271759629249573, + "learning_rate": 0.00026414565826330533, + "loss": 0.4747, + "step": 26383 + }, + { + "epoch": 14.739664804469275, + "grad_norm": 0.5730955600738525, + "learning_rate": 0.00026411764705882353, + "loss": 0.4096, + "step": 26384 + }, + { + "epoch": 14.740223463687151, + "grad_norm": 0.5778507590293884, + "learning_rate": 0.00026408963585434174, + "loss": 0.4117, + "step": 26385 + }, + { + "epoch": 14.740782122905028, + "grad_norm": 0.3964424729347229, + "learning_rate": 0.00026406162464985995, + "loss": 0.3721, + "step": 26386 + }, + { + "epoch": 14.741340782122904, + "grad_norm": 0.7801333069801331, + "learning_rate": 0.00026403361344537815, + "loss": 0.4514, + "step": 26387 + }, + { + "epoch": 14.741899441340783, + "grad_norm": 0.42704489827156067, + "learning_rate": 0.00026400560224089636, + "loss": 0.4384, + "step": 26388 + }, + { + "epoch": 14.74245810055866, + "grad_norm": 0.6510639786720276, + "learning_rate": 0.00026397759103641456, + "loss": 0.5364, + "step": 26389 + }, + { + "epoch": 14.743016759776536, + "grad_norm": 0.4674013555049896, + "learning_rate": 0.00026394957983193277, + "loss": 0.5052, + "step": 26390 + }, + { + "epoch": 14.743575418994414, + "grad_norm": 0.3671068847179413, + "learning_rate": 0.000263921568627451, + "loss": 0.339, + "step": 26391 + }, + { + "epoch": 14.74413407821229, + "grad_norm": 0.6780019402503967, + "learning_rate": 0.0002638935574229692, + "loss": 0.4556, + "step": 26392 + }, + { + "epoch": 14.744692737430167, + "grad_norm": 1.0539917945861816, + "learning_rate": 0.0002638655462184874, + "loss": 0.5037, + "step": 26393 + }, + { + "epoch": 14.745251396648044, + "grad_norm": 0.7187385559082031, + "learning_rate": 0.0002638375350140056, + "loss": 0.414, + "step": 26394 + }, + { + "epoch": 14.745810055865922, + "grad_norm": 1.1761001348495483, + "learning_rate": 0.00026380952380952385, + "loss": 0.6764, + "step": 26395 + }, + { + "epoch": 14.746368715083799, + "grad_norm": 1.4172708988189697, + "learning_rate": 0.000263781512605042, + "loss": 0.3297, + "step": 26396 + }, + { + "epoch": 14.746927374301675, + "grad_norm": 1.9798983335494995, + "learning_rate": 0.0002637535014005602, + "loss": 0.4297, + "step": 26397 + }, + { + "epoch": 14.747486033519554, + "grad_norm": 1.5195233821868896, + "learning_rate": 0.0002637254901960784, + "loss": 0.3061, + "step": 26398 + }, + { + "epoch": 14.74804469273743, + "grad_norm": 0.36168909072875977, + "learning_rate": 0.0002636974789915966, + "loss": 0.396, + "step": 26399 + }, + { + "epoch": 14.748603351955307, + "grad_norm": 0.5995067358016968, + "learning_rate": 0.0002636694677871149, + "loss": 0.6228, + "step": 26400 + }, + { + "epoch": 14.749162011173185, + "grad_norm": 1.1359343528747559, + "learning_rate": 0.00026364145658263304, + "loss": 0.4636, + "step": 26401 + }, + { + "epoch": 14.749720670391062, + "grad_norm": 0.4448784589767456, + "learning_rate": 0.00026361344537815124, + "loss": 0.3464, + "step": 26402 + }, + { + "epoch": 14.750279329608938, + "grad_norm": 0.7094453573226929, + "learning_rate": 0.0002635854341736695, + "loss": 0.4707, + "step": 26403 + }, + { + "epoch": 14.750837988826815, + "grad_norm": 0.4814882278442383, + "learning_rate": 0.00026355742296918765, + "loss": 0.3963, + "step": 26404 + }, + { + "epoch": 14.751396648044693, + "grad_norm": 0.4036135971546173, + "learning_rate": 0.0002635294117647059, + "loss": 0.3986, + "step": 26405 + }, + { + "epoch": 14.75195530726257, + "grad_norm": 3.5774288177490234, + "learning_rate": 0.00026350140056022407, + "loss": 0.4137, + "step": 26406 + }, + { + "epoch": 14.752513966480446, + "grad_norm": 0.5589652061462402, + "learning_rate": 0.00026347338935574227, + "loss": 0.3292, + "step": 26407 + }, + { + "epoch": 14.753072625698325, + "grad_norm": 0.382071316242218, + "learning_rate": 0.00026344537815126053, + "loss": 0.3506, + "step": 26408 + }, + { + "epoch": 14.753631284916201, + "grad_norm": 0.3391832113265991, + "learning_rate": 0.0002634173669467787, + "loss": 0.4145, + "step": 26409 + }, + { + "epoch": 14.754189944134078, + "grad_norm": 0.37645387649536133, + "learning_rate": 0.00026338935574229694, + "loss": 0.4505, + "step": 26410 + }, + { + "epoch": 14.754748603351956, + "grad_norm": 0.4456816613674164, + "learning_rate": 0.00026336134453781515, + "loss": 0.4733, + "step": 26411 + }, + { + "epoch": 14.755307262569833, + "grad_norm": 1.2704954147338867, + "learning_rate": 0.0002633333333333333, + "loss": 0.2975, + "step": 26412 + }, + { + "epoch": 14.75586592178771, + "grad_norm": 0.5297966003417969, + "learning_rate": 0.00026330532212885156, + "loss": 0.6294, + "step": 26413 + }, + { + "epoch": 14.756424581005586, + "grad_norm": 0.36453700065612793, + "learning_rate": 0.0002632773109243697, + "loss": 0.4206, + "step": 26414 + }, + { + "epoch": 14.756983240223464, + "grad_norm": 0.45416614413261414, + "learning_rate": 0.000263249299719888, + "loss": 0.4805, + "step": 26415 + }, + { + "epoch": 14.75754189944134, + "grad_norm": 0.8847922086715698, + "learning_rate": 0.0002632212885154062, + "loss": 0.4523, + "step": 26416 + }, + { + "epoch": 14.758100558659217, + "grad_norm": 0.5305850505828857, + "learning_rate": 0.00026319327731092433, + "loss": 0.3817, + "step": 26417 + }, + { + "epoch": 14.758659217877096, + "grad_norm": 0.7760154604911804, + "learning_rate": 0.0002631652661064426, + "loss": 0.4457, + "step": 26418 + }, + { + "epoch": 14.759217877094972, + "grad_norm": 0.6772235631942749, + "learning_rate": 0.0002631372549019608, + "loss": 0.4001, + "step": 26419 + }, + { + "epoch": 14.759776536312849, + "grad_norm": 0.6633051633834839, + "learning_rate": 0.000263109243697479, + "loss": 0.4248, + "step": 26420 + }, + { + "epoch": 14.760335195530725, + "grad_norm": 0.2931738793849945, + "learning_rate": 0.0002630812324929972, + "loss": 0.297, + "step": 26421 + }, + { + "epoch": 14.760893854748604, + "grad_norm": 0.41645562648773193, + "learning_rate": 0.00026305322128851536, + "loss": 0.432, + "step": 26422 + }, + { + "epoch": 14.76145251396648, + "grad_norm": 0.4203241169452667, + "learning_rate": 0.0002630252100840336, + "loss": 0.3172, + "step": 26423 + }, + { + "epoch": 14.762011173184357, + "grad_norm": 0.4276534616947174, + "learning_rate": 0.00026299719887955183, + "loss": 0.3534, + "step": 26424 + }, + { + "epoch": 14.762569832402235, + "grad_norm": 1.4451985359191895, + "learning_rate": 0.00026296918767507003, + "loss": 0.5441, + "step": 26425 + }, + { + "epoch": 14.763128491620112, + "grad_norm": 0.41230452060699463, + "learning_rate": 0.00026294117647058824, + "loss": 0.3959, + "step": 26426 + }, + { + "epoch": 14.763687150837988, + "grad_norm": 0.39443573355674744, + "learning_rate": 0.00026291316526610645, + "loss": 0.2974, + "step": 26427 + }, + { + "epoch": 14.764245810055867, + "grad_norm": 0.4632950723171234, + "learning_rate": 0.00026288515406162465, + "loss": 0.3959, + "step": 26428 + }, + { + "epoch": 14.764804469273743, + "grad_norm": 0.4690471291542053, + "learning_rate": 0.00026285714285714286, + "loss": 0.4213, + "step": 26429 + }, + { + "epoch": 14.76536312849162, + "grad_norm": 0.8644055128097534, + "learning_rate": 0.0002628291316526611, + "loss": 0.3816, + "step": 26430 + }, + { + "epoch": 14.765921787709498, + "grad_norm": 0.5297317504882812, + "learning_rate": 0.00026280112044817927, + "loss": 0.3842, + "step": 26431 + }, + { + "epoch": 14.766480446927375, + "grad_norm": 0.36095932126045227, + "learning_rate": 0.0002627731092436975, + "loss": 0.38, + "step": 26432 + }, + { + "epoch": 14.767039106145251, + "grad_norm": 0.5994721055030823, + "learning_rate": 0.0002627450980392157, + "loss": 0.3954, + "step": 26433 + }, + { + "epoch": 14.767597765363128, + "grad_norm": 0.42066147923469543, + "learning_rate": 0.0002627170868347339, + "loss": 0.4586, + "step": 26434 + }, + { + "epoch": 14.768156424581006, + "grad_norm": 0.397690087556839, + "learning_rate": 0.00026268907563025215, + "loss": 0.4562, + "step": 26435 + }, + { + "epoch": 14.768715083798883, + "grad_norm": 0.431347131729126, + "learning_rate": 0.0002626610644257703, + "loss": 0.4554, + "step": 26436 + }, + { + "epoch": 14.76927374301676, + "grad_norm": 1.1868691444396973, + "learning_rate": 0.0002626330532212885, + "loss": 0.4128, + "step": 26437 + }, + { + "epoch": 14.769832402234638, + "grad_norm": 0.44891518354415894, + "learning_rate": 0.00026260504201680677, + "loss": 0.3847, + "step": 26438 + }, + { + "epoch": 14.770391061452514, + "grad_norm": 0.6369753479957581, + "learning_rate": 0.0002625770308123249, + "loss": 0.4246, + "step": 26439 + }, + { + "epoch": 14.77094972067039, + "grad_norm": 0.7375242710113525, + "learning_rate": 0.0002625490196078432, + "loss": 0.3811, + "step": 26440 + }, + { + "epoch": 14.771508379888267, + "grad_norm": 0.39455243945121765, + "learning_rate": 0.00026252100840336133, + "loss": 0.4143, + "step": 26441 + }, + { + "epoch": 14.772067039106146, + "grad_norm": 0.6123532652854919, + "learning_rate": 0.00026249299719887954, + "loss": 0.453, + "step": 26442 + }, + { + "epoch": 14.772625698324022, + "grad_norm": 0.3983652591705322, + "learning_rate": 0.0002624649859943978, + "loss": 0.3656, + "step": 26443 + }, + { + "epoch": 14.773184357541899, + "grad_norm": 0.43648362159729004, + "learning_rate": 0.00026243697478991595, + "loss": 0.447, + "step": 26444 + }, + { + "epoch": 14.773743016759777, + "grad_norm": 3.296373128890991, + "learning_rate": 0.0002624089635854342, + "loss": 0.5242, + "step": 26445 + }, + { + "epoch": 14.774301675977654, + "grad_norm": 0.40457525849342346, + "learning_rate": 0.0002623809523809524, + "loss": 0.3515, + "step": 26446 + }, + { + "epoch": 14.77486033519553, + "grad_norm": 0.5199089050292969, + "learning_rate": 0.00026235294117647057, + "loss": 0.429, + "step": 26447 + }, + { + "epoch": 14.775418994413407, + "grad_norm": 0.4300305247306824, + "learning_rate": 0.0002623249299719888, + "loss": 0.3581, + "step": 26448 + }, + { + "epoch": 14.775977653631285, + "grad_norm": 0.6626937985420227, + "learning_rate": 0.000262296918767507, + "loss": 0.4883, + "step": 26449 + }, + { + "epoch": 14.776536312849162, + "grad_norm": 0.5347988605499268, + "learning_rate": 0.00026226890756302524, + "loss": 0.4604, + "step": 26450 + }, + { + "epoch": 14.777094972067038, + "grad_norm": 0.42290830612182617, + "learning_rate": 0.00026224089635854344, + "loss": 0.4415, + "step": 26451 + }, + { + "epoch": 14.777653631284917, + "grad_norm": 0.4398691654205322, + "learning_rate": 0.0002622128851540616, + "loss": 0.4993, + "step": 26452 + }, + { + "epoch": 14.778212290502793, + "grad_norm": 0.4389786124229431, + "learning_rate": 0.00026218487394957986, + "loss": 0.4277, + "step": 26453 + }, + { + "epoch": 14.77877094972067, + "grad_norm": 0.5745500922203064, + "learning_rate": 0.00026215686274509806, + "loss": 0.3642, + "step": 26454 + }, + { + "epoch": 14.779329608938548, + "grad_norm": 6.254465579986572, + "learning_rate": 0.00026212885154061627, + "loss": 0.4274, + "step": 26455 + }, + { + "epoch": 14.779888268156425, + "grad_norm": 0.4897795021533966, + "learning_rate": 0.0002621008403361345, + "loss": 0.3883, + "step": 26456 + }, + { + "epoch": 14.780446927374301, + "grad_norm": 0.5450849533081055, + "learning_rate": 0.0002620728291316526, + "loss": 0.4014, + "step": 26457 + }, + { + "epoch": 14.78100558659218, + "grad_norm": 0.49721452593803406, + "learning_rate": 0.0002620448179271709, + "loss": 0.4233, + "step": 26458 + }, + { + "epoch": 14.781564245810056, + "grad_norm": 0.36960384249687195, + "learning_rate": 0.0002620168067226891, + "loss": 0.3521, + "step": 26459 + }, + { + "epoch": 14.782122905027933, + "grad_norm": 0.438244104385376, + "learning_rate": 0.0002619887955182073, + "loss": 0.3493, + "step": 26460 + }, + { + "epoch": 14.78268156424581, + "grad_norm": 0.3668515384197235, + "learning_rate": 0.0002619607843137255, + "loss": 0.378, + "step": 26461 + }, + { + "epoch": 14.783240223463688, + "grad_norm": 0.6461626887321472, + "learning_rate": 0.0002619327731092437, + "loss": 0.4054, + "step": 26462 + }, + { + "epoch": 14.783798882681564, + "grad_norm": 0.3935241997241974, + "learning_rate": 0.0002619047619047619, + "loss": 0.4462, + "step": 26463 + }, + { + "epoch": 14.78435754189944, + "grad_norm": 0.37472477555274963, + "learning_rate": 0.0002618767507002801, + "loss": 0.3536, + "step": 26464 + }, + { + "epoch": 14.78491620111732, + "grad_norm": 0.47504526376724243, + "learning_rate": 0.00026184873949579833, + "loss": 0.4127, + "step": 26465 + }, + { + "epoch": 14.785474860335196, + "grad_norm": 0.9019039273262024, + "learning_rate": 0.00026182072829131653, + "loss": 0.6654, + "step": 26466 + }, + { + "epoch": 14.786033519553072, + "grad_norm": 0.42635321617126465, + "learning_rate": 0.00026179271708683474, + "loss": 0.2354, + "step": 26467 + }, + { + "epoch": 14.786592178770949, + "grad_norm": 1.1489777565002441, + "learning_rate": 0.00026176470588235295, + "loss": 0.4523, + "step": 26468 + }, + { + "epoch": 14.787150837988827, + "grad_norm": 1.1180977821350098, + "learning_rate": 0.00026173669467787115, + "loss": 0.4413, + "step": 26469 + }, + { + "epoch": 14.787709497206704, + "grad_norm": 8.92027759552002, + "learning_rate": 0.0002617086834733894, + "loss": 0.3618, + "step": 26470 + }, + { + "epoch": 14.78826815642458, + "grad_norm": 0.5446832180023193, + "learning_rate": 0.00026168067226890756, + "loss": 0.3575, + "step": 26471 + }, + { + "epoch": 14.788826815642459, + "grad_norm": 0.32228612899780273, + "learning_rate": 0.00026165266106442577, + "loss": 0.3231, + "step": 26472 + }, + { + "epoch": 14.789385474860335, + "grad_norm": 1.173027753829956, + "learning_rate": 0.000261624649859944, + "loss": 0.4278, + "step": 26473 + }, + { + "epoch": 14.789944134078212, + "grad_norm": 0.5928820967674255, + "learning_rate": 0.0002615966386554622, + "loss": 0.393, + "step": 26474 + }, + { + "epoch": 14.79050279329609, + "grad_norm": 0.5773325562477112, + "learning_rate": 0.00026156862745098044, + "loss": 0.3756, + "step": 26475 + }, + { + "epoch": 14.791061452513967, + "grad_norm": 0.5384437441825867, + "learning_rate": 0.0002615406162464986, + "loss": 0.4622, + "step": 26476 + }, + { + "epoch": 14.791620111731843, + "grad_norm": 0.5310943126678467, + "learning_rate": 0.0002615126050420168, + "loss": 0.3923, + "step": 26477 + }, + { + "epoch": 14.79217877094972, + "grad_norm": 0.48452699184417725, + "learning_rate": 0.00026148459383753506, + "loss": 0.475, + "step": 26478 + }, + { + "epoch": 14.792737430167598, + "grad_norm": 1.5395199060440063, + "learning_rate": 0.0002614565826330532, + "loss": 0.3946, + "step": 26479 + }, + { + "epoch": 14.793296089385475, + "grad_norm": 0.4771752953529358, + "learning_rate": 0.00026142857142857147, + "loss": 0.4126, + "step": 26480 + }, + { + "epoch": 14.793854748603351, + "grad_norm": 0.5794165730476379, + "learning_rate": 0.0002614005602240896, + "loss": 0.4703, + "step": 26481 + }, + { + "epoch": 14.79441340782123, + "grad_norm": 0.4600181579589844, + "learning_rate": 0.00026137254901960783, + "loss": 0.456, + "step": 26482 + }, + { + "epoch": 14.794972067039106, + "grad_norm": 0.38606157898902893, + "learning_rate": 0.0002613445378151261, + "loss": 0.4349, + "step": 26483 + }, + { + "epoch": 14.795530726256983, + "grad_norm": 0.3928118348121643, + "learning_rate": 0.00026131652661064424, + "loss": 0.4459, + "step": 26484 + }, + { + "epoch": 14.796089385474861, + "grad_norm": 0.5269759297370911, + "learning_rate": 0.0002612885154061625, + "loss": 0.4311, + "step": 26485 + }, + { + "epoch": 14.796648044692738, + "grad_norm": 0.32078155875205994, + "learning_rate": 0.0002612605042016807, + "loss": 0.348, + "step": 26486 + }, + { + "epoch": 14.797206703910614, + "grad_norm": 0.6130132079124451, + "learning_rate": 0.00026123249299719886, + "loss": 0.4218, + "step": 26487 + }, + { + "epoch": 14.797765363128491, + "grad_norm": 0.5869680643081665, + "learning_rate": 0.0002612044817927171, + "loss": 0.4812, + "step": 26488 + }, + { + "epoch": 14.79832402234637, + "grad_norm": 0.8428412675857544, + "learning_rate": 0.00026117647058823527, + "loss": 0.393, + "step": 26489 + }, + { + "epoch": 14.798882681564246, + "grad_norm": 0.4352872967720032, + "learning_rate": 0.00026114845938375353, + "loss": 0.415, + "step": 26490 + }, + { + "epoch": 14.799441340782122, + "grad_norm": 0.4212241768836975, + "learning_rate": 0.00026112044817927174, + "loss": 0.5269, + "step": 26491 + }, + { + "epoch": 14.8, + "grad_norm": 0.5292167663574219, + "learning_rate": 0.0002610924369747899, + "loss": 0.4262, + "step": 26492 + }, + { + "epoch": 14.800558659217877, + "grad_norm": 0.5012488961219788, + "learning_rate": 0.00026106442577030815, + "loss": 0.4212, + "step": 26493 + }, + { + "epoch": 14.801117318435754, + "grad_norm": 0.35870733857154846, + "learning_rate": 0.00026103641456582636, + "loss": 0.3737, + "step": 26494 + }, + { + "epoch": 14.80167597765363, + "grad_norm": 5.093686103820801, + "learning_rate": 0.00026100840336134456, + "loss": 0.494, + "step": 26495 + }, + { + "epoch": 14.802234636871509, + "grad_norm": 0.4757404625415802, + "learning_rate": 0.00026098039215686277, + "loss": 0.4494, + "step": 26496 + }, + { + "epoch": 14.802793296089385, + "grad_norm": 0.5189264416694641, + "learning_rate": 0.0002609523809523809, + "loss": 0.3618, + "step": 26497 + }, + { + "epoch": 14.803351955307262, + "grad_norm": 0.41683512926101685, + "learning_rate": 0.0002609243697478992, + "loss": 0.395, + "step": 26498 + }, + { + "epoch": 14.80391061452514, + "grad_norm": 0.4827743172645569, + "learning_rate": 0.0002608963585434174, + "loss": 0.422, + "step": 26499 + }, + { + "epoch": 14.804469273743017, + "grad_norm": 0.545688271522522, + "learning_rate": 0.00026086834733893554, + "loss": 0.5109, + "step": 26500 + }, + { + "epoch": 14.804469273743017, + "eval_cer": 0.08619466902338167, + "eval_loss": 0.325408011674881, + "eval_runtime": 55.5061, + "eval_samples_per_second": 81.757, + "eval_steps_per_second": 5.117, + "eval_wer": 0.34066087539454204, + "step": 26500 + }, + { + "epoch": 14.805027932960893, + "grad_norm": 0.5007805228233337, + "learning_rate": 0.0002608403361344538, + "loss": 0.4515, + "step": 26501 + }, + { + "epoch": 14.805586592178772, + "grad_norm": 0.3946542739868164, + "learning_rate": 0.000260812324929972, + "loss": 0.364, + "step": 26502 + }, + { + "epoch": 14.806145251396648, + "grad_norm": 0.5230787992477417, + "learning_rate": 0.0002607843137254902, + "loss": 0.474, + "step": 26503 + }, + { + "epoch": 14.806703910614525, + "grad_norm": 0.43557125329971313, + "learning_rate": 0.0002607563025210084, + "loss": 0.4314, + "step": 26504 + }, + { + "epoch": 14.807262569832401, + "grad_norm": 0.41160711646080017, + "learning_rate": 0.00026072829131652657, + "loss": 0.3895, + "step": 26505 + }, + { + "epoch": 14.80782122905028, + "grad_norm": 0.4903390109539032, + "learning_rate": 0.00026070028011204483, + "loss": 0.4233, + "step": 26506 + }, + { + "epoch": 14.808379888268156, + "grad_norm": 0.40855300426483154, + "learning_rate": 0.00026067226890756303, + "loss": 0.4342, + "step": 26507 + }, + { + "epoch": 14.808938547486033, + "grad_norm": 0.466184139251709, + "learning_rate": 0.00026064425770308124, + "loss": 0.4318, + "step": 26508 + }, + { + "epoch": 14.809497206703911, + "grad_norm": 0.8855137228965759, + "learning_rate": 0.00026061624649859945, + "loss": 0.4194, + "step": 26509 + }, + { + "epoch": 14.810055865921788, + "grad_norm": 0.428856760263443, + "learning_rate": 0.00026058823529411765, + "loss": 0.4312, + "step": 26510 + }, + { + "epoch": 14.810614525139664, + "grad_norm": 0.386977881193161, + "learning_rate": 0.00026056022408963586, + "loss": 0.4247, + "step": 26511 + }, + { + "epoch": 14.811173184357543, + "grad_norm": 0.3859940767288208, + "learning_rate": 0.00026053221288515406, + "loss": 0.4209, + "step": 26512 + }, + { + "epoch": 14.81173184357542, + "grad_norm": 0.4737122058868408, + "learning_rate": 0.00026050420168067227, + "loss": 0.3469, + "step": 26513 + }, + { + "epoch": 14.812290502793296, + "grad_norm": 0.3521960973739624, + "learning_rate": 0.0002604761904761905, + "loss": 0.4099, + "step": 26514 + }, + { + "epoch": 14.812849162011172, + "grad_norm": 0.4565404951572418, + "learning_rate": 0.0002604481792717087, + "loss": 0.4718, + "step": 26515 + }, + { + "epoch": 14.81340782122905, + "grad_norm": 0.37796464562416077, + "learning_rate": 0.0002604201680672269, + "loss": 0.355, + "step": 26516 + }, + { + "epoch": 14.813966480446927, + "grad_norm": 0.9435752034187317, + "learning_rate": 0.0002603921568627451, + "loss": 0.5082, + "step": 26517 + }, + { + "epoch": 14.814525139664804, + "grad_norm": 0.4492003619670868, + "learning_rate": 0.00026036414565826335, + "loss": 0.4171, + "step": 26518 + }, + { + "epoch": 14.815083798882682, + "grad_norm": 0.39688143134117126, + "learning_rate": 0.0002603361344537815, + "loss": 0.3527, + "step": 26519 + }, + { + "epoch": 14.815642458100559, + "grad_norm": 0.48747146129608154, + "learning_rate": 0.0002603081232492997, + "loss": 0.4021, + "step": 26520 + }, + { + "epoch": 14.816201117318435, + "grad_norm": 0.42979180812835693, + "learning_rate": 0.0002602801120448179, + "loss": 0.4155, + "step": 26521 + }, + { + "epoch": 14.816759776536312, + "grad_norm": 0.5853160619735718, + "learning_rate": 0.0002602521008403361, + "loss": 0.3914, + "step": 26522 + }, + { + "epoch": 14.81731843575419, + "grad_norm": 0.3751629590988159, + "learning_rate": 0.0002602240896358544, + "loss": 0.428, + "step": 26523 + }, + { + "epoch": 14.817877094972067, + "grad_norm": 0.408683717250824, + "learning_rate": 0.00026019607843137254, + "loss": 0.4366, + "step": 26524 + }, + { + "epoch": 14.818435754189943, + "grad_norm": 0.5187716484069824, + "learning_rate": 0.00026016806722689074, + "loss": 0.5159, + "step": 26525 + }, + { + "epoch": 14.818994413407822, + "grad_norm": 0.6641644239425659, + "learning_rate": 0.000260140056022409, + "loss": 0.4877, + "step": 26526 + }, + { + "epoch": 14.819553072625698, + "grad_norm": 0.39999130368232727, + "learning_rate": 0.00026011204481792715, + "loss": 0.355, + "step": 26527 + }, + { + "epoch": 14.820111731843575, + "grad_norm": 0.3978792726993561, + "learning_rate": 0.0002600840336134454, + "loss": 0.2909, + "step": 26528 + }, + { + "epoch": 14.820670391061453, + "grad_norm": 0.5720070600509644, + "learning_rate": 0.00026005602240896357, + "loss": 0.4694, + "step": 26529 + }, + { + "epoch": 14.82122905027933, + "grad_norm": 0.6114780902862549, + "learning_rate": 0.00026002801120448177, + "loss": 0.4328, + "step": 26530 + }, + { + "epoch": 14.821787709497206, + "grad_norm": 0.38012170791625977, + "learning_rate": 0.00026000000000000003, + "loss": 0.3127, + "step": 26531 + }, + { + "epoch": 14.822346368715085, + "grad_norm": 0.579748272895813, + "learning_rate": 0.0002599719887955182, + "loss": 0.4388, + "step": 26532 + }, + { + "epoch": 14.822905027932961, + "grad_norm": 0.6629649996757507, + "learning_rate": 0.00025994397759103644, + "loss": 0.4505, + "step": 26533 + }, + { + "epoch": 14.823463687150838, + "grad_norm": 0.571614682674408, + "learning_rate": 0.00025991596638655465, + "loss": 0.4311, + "step": 26534 + }, + { + "epoch": 14.824022346368714, + "grad_norm": Infinity, + "learning_rate": 0.00025991596638655465, + "loss": 0.507, + "step": 26535 + }, + { + "epoch": 14.824581005586593, + "grad_norm": 0.5229687094688416, + "learning_rate": 0.0002598879551820728, + "loss": 0.3442, + "step": 26536 + }, + { + "epoch": 14.82513966480447, + "grad_norm": 0.5492231845855713, + "learning_rate": 0.00025985994397759106, + "loss": 0.4506, + "step": 26537 + }, + { + "epoch": 14.825698324022346, + "grad_norm": 0.3985249698162079, + "learning_rate": 0.0002598319327731092, + "loss": 0.3811, + "step": 26538 + }, + { + "epoch": 14.826256983240224, + "grad_norm": 0.4100886583328247, + "learning_rate": 0.0002598039215686275, + "loss": 0.5578, + "step": 26539 + }, + { + "epoch": 14.8268156424581, + "grad_norm": 0.4358460605144501, + "learning_rate": 0.0002597759103641457, + "loss": 0.4025, + "step": 26540 + }, + { + "epoch": 14.827374301675977, + "grad_norm": 1.138991355895996, + "learning_rate": 0.00025974789915966383, + "loss": 0.3615, + "step": 26541 + }, + { + "epoch": 14.827932960893854, + "grad_norm": 0.393299400806427, + "learning_rate": 0.0002597198879551821, + "loss": 0.2862, + "step": 26542 + }, + { + "epoch": 14.828491620111732, + "grad_norm": 0.4582339823246002, + "learning_rate": 0.0002596918767507003, + "loss": 0.4444, + "step": 26543 + }, + { + "epoch": 14.829050279329609, + "grad_norm": 0.704105794429779, + "learning_rate": 0.0002596638655462185, + "loss": 0.7172, + "step": 26544 + }, + { + "epoch": 14.829608938547485, + "grad_norm": 1.9063905477523804, + "learning_rate": 0.0002596358543417367, + "loss": 0.4766, + "step": 26545 + }, + { + "epoch": 14.830167597765364, + "grad_norm": 0.4756879508495331, + "learning_rate": 0.00025960784313725486, + "loss": 0.4232, + "step": 26546 + }, + { + "epoch": 14.83072625698324, + "grad_norm": 0.8283573389053345, + "learning_rate": 0.0002595798319327731, + "loss": 0.3762, + "step": 26547 + }, + { + "epoch": 14.831284916201117, + "grad_norm": 0.3527793884277344, + "learning_rate": 0.00025955182072829133, + "loss": 0.319, + "step": 26548 + }, + { + "epoch": 14.831843575418995, + "grad_norm": 0.5375033020973206, + "learning_rate": 0.00025952380952380953, + "loss": 0.4744, + "step": 26549 + }, + { + "epoch": 14.832402234636872, + "grad_norm": 0.4578782916069031, + "learning_rate": 0.00025949579831932774, + "loss": 0.4204, + "step": 26550 + }, + { + "epoch": 14.832960893854748, + "grad_norm": 0.5997194647789001, + "learning_rate": 0.00025946778711484595, + "loss": 0.464, + "step": 26551 + }, + { + "epoch": 14.833519553072625, + "grad_norm": 2.0308713912963867, + "learning_rate": 0.00025943977591036415, + "loss": 0.3984, + "step": 26552 + }, + { + "epoch": 14.834078212290503, + "grad_norm": 0.688795804977417, + "learning_rate": 0.00025941176470588236, + "loss": 0.5233, + "step": 26553 + }, + { + "epoch": 14.83463687150838, + "grad_norm": 0.4516580104827881, + "learning_rate": 0.00025938375350140056, + "loss": 0.4823, + "step": 26554 + }, + { + "epoch": 14.835195530726256, + "grad_norm": 0.4836987853050232, + "learning_rate": 0.00025935574229691877, + "loss": 0.4322, + "step": 26555 + }, + { + "epoch": 14.835754189944135, + "grad_norm": 1.104026198387146, + "learning_rate": 0.000259327731092437, + "loss": 0.3095, + "step": 26556 + }, + { + "epoch": 14.836312849162011, + "grad_norm": 0.36835381388664246, + "learning_rate": 0.0002592997198879552, + "loss": 0.4003, + "step": 26557 + }, + { + "epoch": 14.836871508379888, + "grad_norm": 0.4588301479816437, + "learning_rate": 0.0002592717086834734, + "loss": 0.4628, + "step": 26558 + }, + { + "epoch": 14.837430167597766, + "grad_norm": 0.4947979152202606, + "learning_rate": 0.00025924369747899165, + "loss": 0.3289, + "step": 26559 + }, + { + "epoch": 14.837988826815643, + "grad_norm": 0.2932858467102051, + "learning_rate": 0.0002592156862745098, + "loss": 0.2663, + "step": 26560 + }, + { + "epoch": 14.83854748603352, + "grad_norm": 0.44763603806495667, + "learning_rate": 0.000259187675070028, + "loss": 0.4778, + "step": 26561 + }, + { + "epoch": 14.839106145251396, + "grad_norm": 0.3396928906440735, + "learning_rate": 0.0002591596638655462, + "loss": 0.3839, + "step": 26562 + }, + { + "epoch": 14.839664804469274, + "grad_norm": 3.5120232105255127, + "learning_rate": 0.0002591316526610644, + "loss": 0.3143, + "step": 26563 + }, + { + "epoch": 14.84022346368715, + "grad_norm": 0.9616038203239441, + "learning_rate": 0.0002591036414565827, + "loss": 0.5485, + "step": 26564 + }, + { + "epoch": 14.840782122905027, + "grad_norm": 0.3105078637599945, + "learning_rate": 0.00025907563025210083, + "loss": 0.2576, + "step": 26565 + }, + { + "epoch": 14.841340782122906, + "grad_norm": 1.8114765882492065, + "learning_rate": 0.00025904761904761904, + "loss": 0.4819, + "step": 26566 + }, + { + "epoch": 14.841899441340782, + "grad_norm": 0.3861510455608368, + "learning_rate": 0.0002590196078431373, + "loss": 0.3238, + "step": 26567 + }, + { + "epoch": 14.842458100558659, + "grad_norm": 0.3394841253757477, + "learning_rate": 0.00025899159663865545, + "loss": 0.3101, + "step": 26568 + }, + { + "epoch": 14.843016759776535, + "grad_norm": 0.4576930105686188, + "learning_rate": 0.0002589635854341737, + "loss": 0.3933, + "step": 26569 + }, + { + "epoch": 14.843575418994414, + "grad_norm": 0.41548705101013184, + "learning_rate": 0.00025893557422969186, + "loss": 0.4054, + "step": 26570 + }, + { + "epoch": 14.84413407821229, + "grad_norm": 0.41307491064071655, + "learning_rate": 0.00025890756302521007, + "loss": 0.3915, + "step": 26571 + }, + { + "epoch": 14.844692737430167, + "grad_norm": 0.43612322211265564, + "learning_rate": 0.0002588795518207283, + "loss": 0.4657, + "step": 26572 + }, + { + "epoch": 14.845251396648045, + "grad_norm": 0.36599600315093994, + "learning_rate": 0.0002588515406162465, + "loss": 0.3577, + "step": 26573 + }, + { + "epoch": 14.845810055865922, + "grad_norm": 0.5370457768440247, + "learning_rate": 0.00025882352941176474, + "loss": 0.4093, + "step": 26574 + }, + { + "epoch": 14.846368715083798, + "grad_norm": 0.5047298073768616, + "learning_rate": 0.00025879551820728294, + "loss": 0.4968, + "step": 26575 + }, + { + "epoch": 14.846927374301677, + "grad_norm": 0.490090012550354, + "learning_rate": 0.0002587675070028011, + "loss": 0.4724, + "step": 26576 + }, + { + "epoch": 14.847486033519553, + "grad_norm": 0.4883776307106018, + "learning_rate": 0.00025873949579831936, + "loss": 0.33, + "step": 26577 + }, + { + "epoch": 14.84804469273743, + "grad_norm": 0.33621323108673096, + "learning_rate": 0.0002587114845938375, + "loss": 0.3866, + "step": 26578 + }, + { + "epoch": 14.848603351955306, + "grad_norm": 0.3998306691646576, + "learning_rate": 0.00025868347338935577, + "loss": 0.4417, + "step": 26579 + }, + { + "epoch": 14.849162011173185, + "grad_norm": 0.32000869512557983, + "learning_rate": 0.000258655462184874, + "loss": 0.3701, + "step": 26580 + }, + { + "epoch": 14.849720670391061, + "grad_norm": 0.4071613550186157, + "learning_rate": 0.0002586274509803921, + "loss": 0.4253, + "step": 26581 + }, + { + "epoch": 14.850279329608938, + "grad_norm": 2.027844190597534, + "learning_rate": 0.0002585994397759104, + "loss": 0.5256, + "step": 26582 + }, + { + "epoch": 14.850837988826816, + "grad_norm": 0.5331873297691345, + "learning_rate": 0.0002585714285714286, + "loss": 0.4831, + "step": 26583 + }, + { + "epoch": 14.851396648044693, + "grad_norm": 2.0115225315093994, + "learning_rate": 0.0002585434173669468, + "loss": 0.3547, + "step": 26584 + }, + { + "epoch": 14.85195530726257, + "grad_norm": 0.8113182187080383, + "learning_rate": 0.000258515406162465, + "loss": 0.3894, + "step": 26585 + }, + { + "epoch": 14.852513966480448, + "grad_norm": 0.43342965841293335, + "learning_rate": 0.00025848739495798316, + "loss": 0.4269, + "step": 26586 + }, + { + "epoch": 14.853072625698324, + "grad_norm": 0.3013548254966736, + "learning_rate": 0.0002584593837535014, + "loss": 0.3265, + "step": 26587 + }, + { + "epoch": 14.8536312849162, + "grad_norm": 0.40642988681793213, + "learning_rate": 0.0002584313725490196, + "loss": 0.4122, + "step": 26588 + }, + { + "epoch": 14.854189944134077, + "grad_norm": 0.3633423149585724, + "learning_rate": 0.00025840336134453783, + "loss": 0.3916, + "step": 26589 + }, + { + "epoch": 14.854748603351956, + "grad_norm": 0.3396340012550354, + "learning_rate": 0.00025837535014005603, + "loss": 0.3476, + "step": 26590 + }, + { + "epoch": 14.855307262569832, + "grad_norm": 0.47138839960098267, + "learning_rate": 0.00025834733893557424, + "loss": 0.3865, + "step": 26591 + }, + { + "epoch": 14.855865921787709, + "grad_norm": 1.229844093322754, + "learning_rate": 0.00025831932773109245, + "loss": 0.4567, + "step": 26592 + }, + { + "epoch": 14.856424581005587, + "grad_norm": 0.592532217502594, + "learning_rate": 0.00025829131652661065, + "loss": 0.4355, + "step": 26593 + }, + { + "epoch": 14.856983240223464, + "grad_norm": 0.5965381264686584, + "learning_rate": 0.00025826330532212886, + "loss": 0.3752, + "step": 26594 + }, + { + "epoch": 14.85754189944134, + "grad_norm": 0.5160749554634094, + "learning_rate": 0.00025823529411764706, + "loss": 0.5177, + "step": 26595 + }, + { + "epoch": 14.858100558659217, + "grad_norm": 0.41821354627609253, + "learning_rate": 0.00025820728291316527, + "loss": 0.4024, + "step": 26596 + }, + { + "epoch": 14.858659217877095, + "grad_norm": 0.5512333512306213, + "learning_rate": 0.0002581792717086835, + "loss": 0.3825, + "step": 26597 + }, + { + "epoch": 14.859217877094972, + "grad_norm": 1.4245907068252563, + "learning_rate": 0.0002581512605042017, + "loss": 0.3753, + "step": 26598 + }, + { + "epoch": 14.859776536312848, + "grad_norm": 1.4051717519760132, + "learning_rate": 0.00025812324929971994, + "loss": 0.3699, + "step": 26599 + }, + { + "epoch": 14.860335195530727, + "grad_norm": 0.4880584478378296, + "learning_rate": 0.0002580952380952381, + "loss": 0.3435, + "step": 26600 + }, + { + "epoch": 14.860893854748603, + "grad_norm": 0.40549179911613464, + "learning_rate": 0.0002580672268907563, + "loss": 0.4291, + "step": 26601 + }, + { + "epoch": 14.86145251396648, + "grad_norm": 0.6553271412849426, + "learning_rate": 0.0002580392156862745, + "loss": 0.6506, + "step": 26602 + }, + { + "epoch": 14.862011173184358, + "grad_norm": 0.3544323742389679, + "learning_rate": 0.0002580112044817927, + "loss": 0.3726, + "step": 26603 + }, + { + "epoch": 14.862569832402235, + "grad_norm": 2.8086049556732178, + "learning_rate": 0.00025798319327731097, + "loss": 0.408, + "step": 26604 + }, + { + "epoch": 14.863128491620111, + "grad_norm": 0.333035409450531, + "learning_rate": 0.0002579551820728291, + "loss": 0.2596, + "step": 26605 + }, + { + "epoch": 14.86368715083799, + "grad_norm": 0.4436178207397461, + "learning_rate": 0.00025792717086834733, + "loss": 0.3856, + "step": 26606 + }, + { + "epoch": 14.864245810055866, + "grad_norm": 0.5466976165771484, + "learning_rate": 0.0002578991596638656, + "loss": 0.4044, + "step": 26607 + }, + { + "epoch": 14.864804469273743, + "grad_norm": 0.37446752190589905, + "learning_rate": 0.00025787114845938374, + "loss": 0.4531, + "step": 26608 + }, + { + "epoch": 14.86536312849162, + "grad_norm": 0.4338454008102417, + "learning_rate": 0.000257843137254902, + "loss": 0.3605, + "step": 26609 + }, + { + "epoch": 14.865921787709498, + "grad_norm": 0.6497722864151001, + "learning_rate": 0.00025781512605042015, + "loss": 0.3824, + "step": 26610 + }, + { + "epoch": 14.866480446927374, + "grad_norm": 0.41234156489372253, + "learning_rate": 0.00025778711484593836, + "loss": 0.5167, + "step": 26611 + }, + { + "epoch": 14.867039106145251, + "grad_norm": 0.45685410499572754, + "learning_rate": 0.0002577591036414566, + "loss": 0.3876, + "step": 26612 + }, + { + "epoch": 14.86759776536313, + "grad_norm": 0.3604154586791992, + "learning_rate": 0.00025773109243697477, + "loss": 0.406, + "step": 26613 + }, + { + "epoch": 14.868156424581006, + "grad_norm": 0.4332268238067627, + "learning_rate": 0.000257703081232493, + "loss": 0.4247, + "step": 26614 + }, + { + "epoch": 14.868715083798882, + "grad_norm": 0.557467520236969, + "learning_rate": 0.00025767507002801124, + "loss": 0.4297, + "step": 26615 + }, + { + "epoch": 14.869273743016759, + "grad_norm": 0.41732969880104065, + "learning_rate": 0.0002576470588235294, + "loss": 0.3656, + "step": 26616 + }, + { + "epoch": 14.869832402234637, + "grad_norm": 0.42851725220680237, + "learning_rate": 0.00025761904761904765, + "loss": 0.3817, + "step": 26617 + }, + { + "epoch": 14.870391061452514, + "grad_norm": 0.5509878993034363, + "learning_rate": 0.0002575910364145658, + "loss": 0.5425, + "step": 26618 + }, + { + "epoch": 14.87094972067039, + "grad_norm": 0.7281415462493896, + "learning_rate": 0.000257563025210084, + "loss": 0.43, + "step": 26619 + }, + { + "epoch": 14.871508379888269, + "grad_norm": 0.40808019042015076, + "learning_rate": 0.00025753501400560227, + "loss": 0.3613, + "step": 26620 + }, + { + "epoch": 14.872067039106145, + "grad_norm": 0.6743311285972595, + "learning_rate": 0.0002575070028011204, + "loss": 0.4565, + "step": 26621 + }, + { + "epoch": 14.872625698324022, + "grad_norm": 1.0449023246765137, + "learning_rate": 0.0002574789915966387, + "loss": 0.4927, + "step": 26622 + }, + { + "epoch": 14.8731843575419, + "grad_norm": 0.3304521143436432, + "learning_rate": 0.0002574509803921569, + "loss": 0.3313, + "step": 26623 + }, + { + "epoch": 14.873743016759777, + "grad_norm": 0.476204514503479, + "learning_rate": 0.00025742296918767504, + "loss": 0.4571, + "step": 26624 + }, + { + "epoch": 14.874301675977653, + "grad_norm": 0.3807516098022461, + "learning_rate": 0.0002573949579831933, + "loss": 0.3532, + "step": 26625 + }, + { + "epoch": 14.87486033519553, + "grad_norm": 0.535419762134552, + "learning_rate": 0.00025736694677871145, + "loss": 0.454, + "step": 26626 + }, + { + "epoch": 14.875418994413408, + "grad_norm": 0.39224204421043396, + "learning_rate": 0.0002573389355742297, + "loss": 0.3741, + "step": 26627 + }, + { + "epoch": 14.875977653631285, + "grad_norm": 0.5574172735214233, + "learning_rate": 0.0002573109243697479, + "loss": 0.4204, + "step": 26628 + }, + { + "epoch": 14.876536312849161, + "grad_norm": 0.5498204231262207, + "learning_rate": 0.00025728291316526607, + "loss": 0.357, + "step": 26629 + }, + { + "epoch": 14.87709497206704, + "grad_norm": 0.34681206941604614, + "learning_rate": 0.00025725490196078433, + "loss": 0.3775, + "step": 26630 + }, + { + "epoch": 14.877653631284916, + "grad_norm": 0.3369964361190796, + "learning_rate": 0.00025722689075630253, + "loss": 0.3381, + "step": 26631 + }, + { + "epoch": 14.878212290502793, + "grad_norm": 0.3674883544445038, + "learning_rate": 0.00025719887955182074, + "loss": 0.3944, + "step": 26632 + }, + { + "epoch": 14.878770949720671, + "grad_norm": 0.4161687195301056, + "learning_rate": 0.00025717086834733895, + "loss": 0.3686, + "step": 26633 + }, + { + "epoch": 14.879329608938548, + "grad_norm": 0.3734763562679291, + "learning_rate": 0.0002571428571428571, + "loss": 0.3381, + "step": 26634 + }, + { + "epoch": 14.879888268156424, + "grad_norm": 0.6255685091018677, + "learning_rate": 0.00025711484593837536, + "loss": 0.3103, + "step": 26635 + }, + { + "epoch": 14.880446927374301, + "grad_norm": 0.4141809940338135, + "learning_rate": 0.00025708683473389356, + "loss": 0.3873, + "step": 26636 + }, + { + "epoch": 14.88100558659218, + "grad_norm": 0.5900827646255493, + "learning_rate": 0.00025705882352941177, + "loss": 0.5042, + "step": 26637 + }, + { + "epoch": 14.881564245810056, + "grad_norm": 0.5379325747489929, + "learning_rate": 0.00025703081232493, + "loss": 0.4259, + "step": 26638 + }, + { + "epoch": 14.882122905027932, + "grad_norm": 0.47491633892059326, + "learning_rate": 0.0002570028011204482, + "loss": 0.423, + "step": 26639 + }, + { + "epoch": 14.88268156424581, + "grad_norm": 0.49043571949005127, + "learning_rate": 0.0002569747899159664, + "loss": 0.4991, + "step": 26640 + }, + { + "epoch": 14.883240223463687, + "grad_norm": 0.8420180678367615, + "learning_rate": 0.0002569467787114846, + "loss": 0.4139, + "step": 26641 + }, + { + "epoch": 14.883798882681564, + "grad_norm": 0.6261454820632935, + "learning_rate": 0.0002569187675070028, + "loss": 0.5389, + "step": 26642 + }, + { + "epoch": 14.88435754189944, + "grad_norm": 1.0918277502059937, + "learning_rate": 0.000256890756302521, + "loss": 0.2714, + "step": 26643 + }, + { + "epoch": 14.884916201117319, + "grad_norm": 0.3174363374710083, + "learning_rate": 0.0002568627450980392, + "loss": 0.3878, + "step": 26644 + }, + { + "epoch": 14.885474860335195, + "grad_norm": 0.6370895504951477, + "learning_rate": 0.0002568347338935574, + "loss": 0.5015, + "step": 26645 + }, + { + "epoch": 14.886033519553072, + "grad_norm": 0.44760754704475403, + "learning_rate": 0.0002568067226890756, + "loss": 0.3652, + "step": 26646 + }, + { + "epoch": 14.88659217877095, + "grad_norm": 0.42924797534942627, + "learning_rate": 0.0002567787114845939, + "loss": 0.2802, + "step": 26647 + }, + { + "epoch": 14.887150837988827, + "grad_norm": 1.5820657014846802, + "learning_rate": 0.00025675070028011204, + "loss": 0.3777, + "step": 26648 + }, + { + "epoch": 14.887709497206703, + "grad_norm": 0.48219895362854004, + "learning_rate": 0.00025672268907563024, + "loss": 0.4461, + "step": 26649 + }, + { + "epoch": 14.888268156424582, + "grad_norm": 0.7413076162338257, + "learning_rate": 0.00025669467787114845, + "loss": 0.504, + "step": 26650 + }, + { + "epoch": 14.888826815642458, + "grad_norm": 4.097867488861084, + "learning_rate": 0.00025666666666666665, + "loss": 0.4297, + "step": 26651 + }, + { + "epoch": 14.889385474860335, + "grad_norm": 0.5052071809768677, + "learning_rate": 0.0002566386554621849, + "loss": 0.461, + "step": 26652 + }, + { + "epoch": 14.889944134078211, + "grad_norm": 0.43386533856391907, + "learning_rate": 0.00025661064425770307, + "loss": 0.411, + "step": 26653 + }, + { + "epoch": 14.89050279329609, + "grad_norm": 0.3635751008987427, + "learning_rate": 0.00025658263305322127, + "loss": 0.3706, + "step": 26654 + }, + { + "epoch": 14.891061452513966, + "grad_norm": 0.5499553084373474, + "learning_rate": 0.00025655462184873953, + "loss": 0.4579, + "step": 26655 + }, + { + "epoch": 14.891620111731843, + "grad_norm": 1.0164464712142944, + "learning_rate": 0.0002565266106442577, + "loss": 0.4046, + "step": 26656 + }, + { + "epoch": 14.892178770949721, + "grad_norm": 0.5983941555023193, + "learning_rate": 0.00025649859943977594, + "loss": 0.402, + "step": 26657 + }, + { + "epoch": 14.892737430167598, + "grad_norm": 0.49708226323127747, + "learning_rate": 0.0002564705882352941, + "loss": 0.355, + "step": 26658 + }, + { + "epoch": 14.893296089385474, + "grad_norm": 0.5279198288917542, + "learning_rate": 0.0002564425770308123, + "loss": 0.4718, + "step": 26659 + }, + { + "epoch": 14.893854748603353, + "grad_norm": 0.5096535682678223, + "learning_rate": 0.00025641456582633056, + "loss": 0.5149, + "step": 26660 + }, + { + "epoch": 14.89441340782123, + "grad_norm": 0.6974937319755554, + "learning_rate": 0.0002563865546218487, + "loss": 0.4487, + "step": 26661 + }, + { + "epoch": 14.894972067039106, + "grad_norm": 0.9840856790542603, + "learning_rate": 0.000256358543417367, + "loss": 0.3418, + "step": 26662 + }, + { + "epoch": 14.895530726256982, + "grad_norm": 0.42200687527656555, + "learning_rate": 0.0002563305322128852, + "loss": 0.4218, + "step": 26663 + }, + { + "epoch": 14.89608938547486, + "grad_norm": 0.5543202757835388, + "learning_rate": 0.00025630252100840333, + "loss": 0.3908, + "step": 26664 + }, + { + "epoch": 14.896648044692737, + "grad_norm": 0.42498838901519775, + "learning_rate": 0.0002562745098039216, + "loss": 0.4553, + "step": 26665 + }, + { + "epoch": 14.897206703910614, + "grad_norm": 2.4699130058288574, + "learning_rate": 0.00025624649859943974, + "loss": 0.5016, + "step": 26666 + }, + { + "epoch": 14.897765363128492, + "grad_norm": 0.5625272393226624, + "learning_rate": 0.000256218487394958, + "loss": 0.406, + "step": 26667 + }, + { + "epoch": 14.898324022346369, + "grad_norm": 0.5316697359085083, + "learning_rate": 0.0002561904761904762, + "loss": 0.2879, + "step": 26668 + }, + { + "epoch": 14.898882681564245, + "grad_norm": 0.3590381443500519, + "learning_rate": 0.00025616246498599436, + "loss": 0.5206, + "step": 26669 + }, + { + "epoch": 14.899441340782122, + "grad_norm": 0.4346022605895996, + "learning_rate": 0.0002561344537815126, + "loss": 0.4501, + "step": 26670 + }, + { + "epoch": 14.9, + "grad_norm": 0.47714418172836304, + "learning_rate": 0.00025610644257703083, + "loss": 0.4973, + "step": 26671 + }, + { + "epoch": 14.900558659217877, + "grad_norm": 0.48734551668167114, + "learning_rate": 0.00025607843137254903, + "loss": 0.4386, + "step": 26672 + }, + { + "epoch": 14.901117318435753, + "grad_norm": 0.5241233706474304, + "learning_rate": 0.00025605042016806724, + "loss": 0.3414, + "step": 26673 + }, + { + "epoch": 14.901675977653632, + "grad_norm": 0.5248932838439941, + "learning_rate": 0.0002560224089635854, + "loss": 0.4846, + "step": 26674 + }, + { + "epoch": 14.902234636871508, + "grad_norm": 0.4646275043487549, + "learning_rate": 0.00025599439775910365, + "loss": 0.4283, + "step": 26675 + }, + { + "epoch": 14.902793296089385, + "grad_norm": 0.4028272032737732, + "learning_rate": 0.00025596638655462186, + "loss": 0.342, + "step": 26676 + }, + { + "epoch": 14.903351955307263, + "grad_norm": 2.1062960624694824, + "learning_rate": 0.00025593837535014006, + "loss": 0.4469, + "step": 26677 + }, + { + "epoch": 14.90391061452514, + "grad_norm": 2.1968324184417725, + "learning_rate": 0.00025591036414565827, + "loss": 0.3279, + "step": 26678 + }, + { + "epoch": 14.904469273743016, + "grad_norm": 0.6872121095657349, + "learning_rate": 0.0002558823529411765, + "loss": 0.3232, + "step": 26679 + }, + { + "epoch": 14.905027932960895, + "grad_norm": 0.37037986516952515, + "learning_rate": 0.0002558543417366947, + "loss": 0.3142, + "step": 26680 + }, + { + "epoch": 14.905586592178771, + "grad_norm": 0.6516431570053101, + "learning_rate": 0.0002558263305322129, + "loss": 0.4806, + "step": 26681 + }, + { + "epoch": 14.906145251396648, + "grad_norm": 0.4694614112377167, + "learning_rate": 0.00025579831932773115, + "loss": 0.4179, + "step": 26682 + }, + { + "epoch": 14.906703910614524, + "grad_norm": 0.545961320400238, + "learning_rate": 0.0002557703081232493, + "loss": 0.3839, + "step": 26683 + }, + { + "epoch": 14.907262569832403, + "grad_norm": 0.40497687458992004, + "learning_rate": 0.0002557422969187675, + "loss": 0.2871, + "step": 26684 + }, + { + "epoch": 14.90782122905028, + "grad_norm": 0.8895620107650757, + "learning_rate": 0.0002557142857142857, + "loss": 0.3958, + "step": 26685 + }, + { + "epoch": 14.908379888268156, + "grad_norm": 0.6328848600387573, + "learning_rate": 0.0002556862745098039, + "loss": 0.4879, + "step": 26686 + }, + { + "epoch": 14.908938547486034, + "grad_norm": 0.37975776195526123, + "learning_rate": 0.0002556582633053222, + "loss": 0.4112, + "step": 26687 + }, + { + "epoch": 14.90949720670391, + "grad_norm": 0.5444965958595276, + "learning_rate": 0.00025563025210084033, + "loss": 0.4435, + "step": 26688 + }, + { + "epoch": 14.910055865921787, + "grad_norm": 0.8743622303009033, + "learning_rate": 0.00025560224089635854, + "loss": 0.3393, + "step": 26689 + }, + { + "epoch": 14.910614525139664, + "grad_norm": 0.40520724654197693, + "learning_rate": 0.0002555742296918768, + "loss": 0.4175, + "step": 26690 + }, + { + "epoch": 14.911173184357542, + "grad_norm": 0.507301926612854, + "learning_rate": 0.00025554621848739495, + "loss": 0.4894, + "step": 26691 + }, + { + "epoch": 14.911731843575419, + "grad_norm": 0.3801904320716858, + "learning_rate": 0.0002555182072829132, + "loss": 0.3553, + "step": 26692 + }, + { + "epoch": 14.912290502793295, + "grad_norm": 0.42334821820259094, + "learning_rate": 0.00025549019607843136, + "loss": 0.4115, + "step": 26693 + }, + { + "epoch": 14.912849162011174, + "grad_norm": 0.3584507703781128, + "learning_rate": 0.00025546218487394957, + "loss": 0.424, + "step": 26694 + }, + { + "epoch": 14.91340782122905, + "grad_norm": 0.44555243849754333, + "learning_rate": 0.0002554341736694678, + "loss": 0.4661, + "step": 26695 + }, + { + "epoch": 14.913966480446927, + "grad_norm": 0.47240155935287476, + "learning_rate": 0.000255406162464986, + "loss": 0.5293, + "step": 26696 + }, + { + "epoch": 14.914525139664804, + "grad_norm": 3.6318836212158203, + "learning_rate": 0.00025537815126050424, + "loss": 0.4274, + "step": 26697 + }, + { + "epoch": 14.915083798882682, + "grad_norm": 1.208521842956543, + "learning_rate": 0.00025535014005602244, + "loss": 0.502, + "step": 26698 + }, + { + "epoch": 14.915642458100558, + "grad_norm": 0.4105777442455292, + "learning_rate": 0.0002553221288515406, + "loss": 0.3547, + "step": 26699 + }, + { + "epoch": 14.916201117318435, + "grad_norm": 0.38693636655807495, + "learning_rate": 0.00025529411764705886, + "loss": 0.422, + "step": 26700 + }, + { + "epoch": 14.916759776536313, + "grad_norm": 0.5913032293319702, + "learning_rate": 0.000255266106442577, + "loss": 0.4376, + "step": 26701 + }, + { + "epoch": 14.91731843575419, + "grad_norm": 2.128818988800049, + "learning_rate": 0.00025523809523809527, + "loss": 0.337, + "step": 26702 + }, + { + "epoch": 14.917877094972066, + "grad_norm": 0.44014424085617065, + "learning_rate": 0.0002552100840336135, + "loss": 0.3936, + "step": 26703 + }, + { + "epoch": 14.918435754189945, + "grad_norm": 0.3193798065185547, + "learning_rate": 0.0002551820728291316, + "loss": 0.3646, + "step": 26704 + }, + { + "epoch": 14.918994413407821, + "grad_norm": 0.5399397015571594, + "learning_rate": 0.0002551540616246499, + "loss": 0.4451, + "step": 26705 + }, + { + "epoch": 14.919553072625698, + "grad_norm": 0.4297410845756531, + "learning_rate": 0.0002551260504201681, + "loss": 0.4076, + "step": 26706 + }, + { + "epoch": 14.920111731843576, + "grad_norm": 0.5189684629440308, + "learning_rate": 0.0002550980392156863, + "loss": 0.3843, + "step": 26707 + }, + { + "epoch": 14.920670391061453, + "grad_norm": 0.5604249238967896, + "learning_rate": 0.0002550700280112045, + "loss": 0.4171, + "step": 26708 + }, + { + "epoch": 14.92122905027933, + "grad_norm": 1.0083544254302979, + "learning_rate": 0.00025504201680672266, + "loss": 0.4521, + "step": 26709 + }, + { + "epoch": 14.921787709497206, + "grad_norm": 0.7064391374588013, + "learning_rate": 0.0002550140056022409, + "loss": 0.353, + "step": 26710 + }, + { + "epoch": 14.922346368715084, + "grad_norm": 0.3665640950202942, + "learning_rate": 0.0002549859943977591, + "loss": 0.3566, + "step": 26711 + }, + { + "epoch": 14.922905027932961, + "grad_norm": 0.4071871042251587, + "learning_rate": 0.00025495798319327733, + "loss": 0.3435, + "step": 26712 + }, + { + "epoch": 14.923463687150837, + "grad_norm": 0.44359061121940613, + "learning_rate": 0.00025492997198879553, + "loss": 0.4729, + "step": 26713 + }, + { + "epoch": 14.924022346368716, + "grad_norm": 0.6107977032661438, + "learning_rate": 0.00025490196078431374, + "loss": 0.4939, + "step": 26714 + }, + { + "epoch": 14.924581005586592, + "grad_norm": 0.5190136432647705, + "learning_rate": 0.00025487394957983195, + "loss": 0.3887, + "step": 26715 + }, + { + "epoch": 14.925139664804469, + "grad_norm": 0.38806429505348206, + "learning_rate": 0.00025484593837535015, + "loss": 0.3133, + "step": 26716 + }, + { + "epoch": 14.925698324022346, + "grad_norm": 0.42224401235580444, + "learning_rate": 0.00025481792717086836, + "loss": 0.4102, + "step": 26717 + }, + { + "epoch": 14.926256983240224, + "grad_norm": 0.5717446804046631, + "learning_rate": 0.00025478991596638656, + "loss": 0.4744, + "step": 26718 + }, + { + "epoch": 14.9268156424581, + "grad_norm": 0.6142580509185791, + "learning_rate": 0.00025476190476190477, + "loss": 0.4914, + "step": 26719 + }, + { + "epoch": 14.927374301675977, + "grad_norm": 0.3627018630504608, + "learning_rate": 0.000254733893557423, + "loss": 0.374, + "step": 26720 + }, + { + "epoch": 14.927932960893855, + "grad_norm": 0.3908005356788635, + "learning_rate": 0.0002547058823529412, + "loss": 0.4962, + "step": 26721 + }, + { + "epoch": 14.928491620111732, + "grad_norm": 0.4286330044269562, + "learning_rate": 0.0002546778711484594, + "loss": 0.4596, + "step": 26722 + }, + { + "epoch": 14.929050279329608, + "grad_norm": 0.41611284017562866, + "learning_rate": 0.0002546498599439776, + "loss": 0.4051, + "step": 26723 + }, + { + "epoch": 14.929608938547487, + "grad_norm": 0.531650722026825, + "learning_rate": 0.0002546218487394958, + "loss": 0.5239, + "step": 26724 + }, + { + "epoch": 14.930167597765363, + "grad_norm": 0.35428231954574585, + "learning_rate": 0.000254593837535014, + "loss": 0.4362, + "step": 26725 + }, + { + "epoch": 14.93072625698324, + "grad_norm": 0.4525468945503235, + "learning_rate": 0.0002545658263305322, + "loss": 0.3122, + "step": 26726 + }, + { + "epoch": 14.931284916201117, + "grad_norm": 0.6107298135757446, + "learning_rate": 0.0002545378151260504, + "loss": 0.4408, + "step": 26727 + }, + { + "epoch": 14.931843575418995, + "grad_norm": 0.38961026072502136, + "learning_rate": 0.0002545098039215686, + "loss": 0.444, + "step": 26728 + }, + { + "epoch": 14.932402234636871, + "grad_norm": 0.4186474680900574, + "learning_rate": 0.00025448179271708683, + "loss": 0.4527, + "step": 26729 + }, + { + "epoch": 14.932960893854748, + "grad_norm": 0.4926489293575287, + "learning_rate": 0.0002544537815126051, + "loss": 0.5086, + "step": 26730 + }, + { + "epoch": 14.933519553072626, + "grad_norm": 0.44995221495628357, + "learning_rate": 0.00025442577030812324, + "loss": 0.4627, + "step": 26731 + }, + { + "epoch": 14.934078212290503, + "grad_norm": 0.3999173045158386, + "learning_rate": 0.00025439775910364145, + "loss": 0.5113, + "step": 26732 + }, + { + "epoch": 14.93463687150838, + "grad_norm": 0.5775466561317444, + "learning_rate": 0.00025436974789915965, + "loss": 0.3226, + "step": 26733 + }, + { + "epoch": 14.935195530726258, + "grad_norm": 3.547893762588501, + "learning_rate": 0.00025434173669467786, + "loss": 0.3124, + "step": 26734 + }, + { + "epoch": 14.935754189944134, + "grad_norm": 0.4250103831291199, + "learning_rate": 0.0002543137254901961, + "loss": 0.3083, + "step": 26735 + }, + { + "epoch": 14.936312849162011, + "grad_norm": 0.585394561290741, + "learning_rate": 0.00025428571428571427, + "loss": 0.4156, + "step": 26736 + }, + { + "epoch": 14.936871508379888, + "grad_norm": 0.4130701422691345, + "learning_rate": 0.0002542577030812325, + "loss": 0.3007, + "step": 26737 + }, + { + "epoch": 14.937430167597766, + "grad_norm": 0.43073514103889465, + "learning_rate": 0.00025422969187675074, + "loss": 0.4547, + "step": 26738 + }, + { + "epoch": 14.937988826815642, + "grad_norm": 0.5746600031852722, + "learning_rate": 0.0002542016806722689, + "loss": 0.4421, + "step": 26739 + }, + { + "epoch": 14.938547486033519, + "grad_norm": 0.41955921053886414, + "learning_rate": 0.00025417366946778715, + "loss": 0.4599, + "step": 26740 + }, + { + "epoch": 14.939106145251397, + "grad_norm": 0.3666926324367523, + "learning_rate": 0.0002541456582633053, + "loss": 0.4538, + "step": 26741 + }, + { + "epoch": 14.939664804469274, + "grad_norm": 1.398512601852417, + "learning_rate": 0.0002541176470588235, + "loss": 0.6172, + "step": 26742 + }, + { + "epoch": 14.94022346368715, + "grad_norm": 0.4980083107948303, + "learning_rate": 0.00025408963585434177, + "loss": 0.4001, + "step": 26743 + }, + { + "epoch": 14.940782122905027, + "grad_norm": 0.34092041850090027, + "learning_rate": 0.0002540616246498599, + "loss": 0.3721, + "step": 26744 + }, + { + "epoch": 14.941340782122905, + "grad_norm": 0.37353256344795227, + "learning_rate": 0.0002540336134453782, + "loss": 0.3692, + "step": 26745 + }, + { + "epoch": 14.941899441340782, + "grad_norm": 0.3240382969379425, + "learning_rate": 0.0002540056022408964, + "loss": 0.3553, + "step": 26746 + }, + { + "epoch": 14.942458100558659, + "grad_norm": 0.568525493144989, + "learning_rate": 0.00025397759103641454, + "loss": 0.4068, + "step": 26747 + }, + { + "epoch": 14.943016759776537, + "grad_norm": 0.42363086342811584, + "learning_rate": 0.0002539495798319328, + "loss": 0.4224, + "step": 26748 + }, + { + "epoch": 14.943575418994413, + "grad_norm": 0.3492322564125061, + "learning_rate": 0.00025392156862745095, + "loss": 0.3949, + "step": 26749 + }, + { + "epoch": 14.94413407821229, + "grad_norm": 0.42833009362220764, + "learning_rate": 0.0002538935574229692, + "loss": 0.5427, + "step": 26750 + }, + { + "epoch": 14.944692737430168, + "grad_norm": 0.4058558940887451, + "learning_rate": 0.0002538655462184874, + "loss": 0.3925, + "step": 26751 + }, + { + "epoch": 14.945251396648045, + "grad_norm": 0.4290007948875427, + "learning_rate": 0.00025383753501400557, + "loss": 0.3184, + "step": 26752 + }, + { + "epoch": 14.945810055865921, + "grad_norm": 0.46052247285842896, + "learning_rate": 0.00025380952380952383, + "loss": 0.4291, + "step": 26753 + }, + { + "epoch": 14.946368715083798, + "grad_norm": 0.785688579082489, + "learning_rate": 0.00025378151260504203, + "loss": 0.3504, + "step": 26754 + }, + { + "epoch": 14.946927374301676, + "grad_norm": 0.3419393301010132, + "learning_rate": 0.00025375350140056024, + "loss": 0.4197, + "step": 26755 + }, + { + "epoch": 14.947486033519553, + "grad_norm": 0.4605628550052643, + "learning_rate": 0.00025372549019607845, + "loss": 0.4112, + "step": 26756 + }, + { + "epoch": 14.94804469273743, + "grad_norm": 0.5389098525047302, + "learning_rate": 0.0002536974789915966, + "loss": 0.4735, + "step": 26757 + }, + { + "epoch": 14.948603351955308, + "grad_norm": 1.1555759906768799, + "learning_rate": 0.00025366946778711486, + "loss": 0.3496, + "step": 26758 + }, + { + "epoch": 14.949162011173184, + "grad_norm": 0.44598883390426636, + "learning_rate": 0.00025364145658263306, + "loss": 0.4304, + "step": 26759 + }, + { + "epoch": 14.949720670391061, + "grad_norm": 0.5165014266967773, + "learning_rate": 0.00025361344537815127, + "loss": 0.3378, + "step": 26760 + }, + { + "epoch": 14.95027932960894, + "grad_norm": 0.4520178735256195, + "learning_rate": 0.0002535854341736695, + "loss": 0.508, + "step": 26761 + }, + { + "epoch": 14.950837988826816, + "grad_norm": 0.36179137229919434, + "learning_rate": 0.0002535574229691877, + "loss": 0.3353, + "step": 26762 + }, + { + "epoch": 14.951396648044692, + "grad_norm": 0.5026635527610779, + "learning_rate": 0.0002535294117647059, + "loss": 0.3965, + "step": 26763 + }, + { + "epoch": 14.951955307262569, + "grad_norm": 0.38730064034461975, + "learning_rate": 0.0002535014005602241, + "loss": 0.3938, + "step": 26764 + }, + { + "epoch": 14.952513966480447, + "grad_norm": 0.43364083766937256, + "learning_rate": 0.0002534733893557423, + "loss": 0.4268, + "step": 26765 + }, + { + "epoch": 14.953072625698324, + "grad_norm": 0.4257733225822449, + "learning_rate": 0.0002534453781512605, + "loss": 0.4203, + "step": 26766 + }, + { + "epoch": 14.9536312849162, + "grad_norm": 0.3969273567199707, + "learning_rate": 0.0002534173669467787, + "loss": 0.3456, + "step": 26767 + }, + { + "epoch": 14.954189944134079, + "grad_norm": 0.551905632019043, + "learning_rate": 0.0002533893557422969, + "loss": 0.4225, + "step": 26768 + }, + { + "epoch": 14.954748603351955, + "grad_norm": 0.4354240298271179, + "learning_rate": 0.0002533613445378151, + "loss": 0.4485, + "step": 26769 + }, + { + "epoch": 14.955307262569832, + "grad_norm": 0.5617095232009888, + "learning_rate": 0.0002533333333333334, + "loss": 0.3515, + "step": 26770 + }, + { + "epoch": 14.955865921787709, + "grad_norm": 0.4644061028957367, + "learning_rate": 0.00025330532212885154, + "loss": 0.4768, + "step": 26771 + }, + { + "epoch": 14.956424581005587, + "grad_norm": 0.48649847507476807, + "learning_rate": 0.00025327731092436974, + "loss": 0.4254, + "step": 26772 + }, + { + "epoch": 14.956983240223463, + "grad_norm": 0.5363298654556274, + "learning_rate": 0.00025324929971988795, + "loss": 0.4469, + "step": 26773 + }, + { + "epoch": 14.95754189944134, + "grad_norm": 1.2023565769195557, + "learning_rate": 0.00025322128851540615, + "loss": 0.2982, + "step": 26774 + }, + { + "epoch": 14.958100558659218, + "grad_norm": 0.6308413147926331, + "learning_rate": 0.0002531932773109244, + "loss": 0.4076, + "step": 26775 + }, + { + "epoch": 14.958659217877095, + "grad_norm": 0.484457403421402, + "learning_rate": 0.00025316526610644257, + "loss": 0.2778, + "step": 26776 + }, + { + "epoch": 14.959217877094972, + "grad_norm": 0.5184215903282166, + "learning_rate": 0.00025313725490196077, + "loss": 0.3792, + "step": 26777 + }, + { + "epoch": 14.95977653631285, + "grad_norm": 0.400098592042923, + "learning_rate": 0.00025310924369747903, + "loss": 0.4089, + "step": 26778 + }, + { + "epoch": 14.960335195530726, + "grad_norm": 0.6525869369506836, + "learning_rate": 0.0002530812324929972, + "loss": 0.5099, + "step": 26779 + }, + { + "epoch": 14.960893854748603, + "grad_norm": 0.5836340188980103, + "learning_rate": 0.00025305322128851544, + "loss": 0.458, + "step": 26780 + }, + { + "epoch": 14.961452513966481, + "grad_norm": 0.5297774076461792, + "learning_rate": 0.0002530252100840336, + "loss": 0.5553, + "step": 26781 + }, + { + "epoch": 14.962011173184358, + "grad_norm": 0.35775622725486755, + "learning_rate": 0.0002529971988795518, + "loss": 0.3315, + "step": 26782 + }, + { + "epoch": 14.962569832402234, + "grad_norm": 0.4294080436229706, + "learning_rate": 0.00025296918767507006, + "loss": 0.4052, + "step": 26783 + }, + { + "epoch": 14.963128491620111, + "grad_norm": 0.401673823595047, + "learning_rate": 0.0002529411764705882, + "loss": 0.3491, + "step": 26784 + }, + { + "epoch": 14.96368715083799, + "grad_norm": 0.5867675542831421, + "learning_rate": 0.0002529131652661065, + "loss": 0.4043, + "step": 26785 + }, + { + "epoch": 14.964245810055866, + "grad_norm": 0.46166130900382996, + "learning_rate": 0.0002528851540616247, + "loss": 0.5118, + "step": 26786 + }, + { + "epoch": 14.964804469273743, + "grad_norm": 0.9252844452857971, + "learning_rate": 0.00025285714285714283, + "loss": 0.3603, + "step": 26787 + }, + { + "epoch": 14.96536312849162, + "grad_norm": 0.9635888338088989, + "learning_rate": 0.0002528291316526611, + "loss": 0.2898, + "step": 26788 + }, + { + "epoch": 14.965921787709497, + "grad_norm": 0.4883910119533539, + "learning_rate": 0.00025280112044817924, + "loss": 0.5281, + "step": 26789 + }, + { + "epoch": 14.966480446927374, + "grad_norm": 0.5012143850326538, + "learning_rate": 0.0002527731092436975, + "loss": 0.3591, + "step": 26790 + }, + { + "epoch": 14.96703910614525, + "grad_norm": 0.38108256459236145, + "learning_rate": 0.0002527450980392157, + "loss": 0.4126, + "step": 26791 + }, + { + "epoch": 14.967597765363129, + "grad_norm": 0.6020956635475159, + "learning_rate": 0.00025271708683473386, + "loss": 0.4843, + "step": 26792 + }, + { + "epoch": 14.968156424581005, + "grad_norm": 0.6270828247070312, + "learning_rate": 0.0002526890756302521, + "loss": 0.5059, + "step": 26793 + }, + { + "epoch": 14.968715083798882, + "grad_norm": 2.904100179672241, + "learning_rate": 0.00025266106442577033, + "loss": 0.4719, + "step": 26794 + }, + { + "epoch": 14.96927374301676, + "grad_norm": 0.5739738345146179, + "learning_rate": 0.00025263305322128853, + "loss": 0.3722, + "step": 26795 + }, + { + "epoch": 14.969832402234637, + "grad_norm": 0.2860737442970276, + "learning_rate": 0.00025260504201680674, + "loss": 0.2672, + "step": 26796 + }, + { + "epoch": 14.970391061452514, + "grad_norm": 0.3919770419597626, + "learning_rate": 0.0002525770308123249, + "loss": 0.4256, + "step": 26797 + }, + { + "epoch": 14.970949720670392, + "grad_norm": 0.3273077607154846, + "learning_rate": 0.00025254901960784315, + "loss": 0.3114, + "step": 26798 + }, + { + "epoch": 14.971508379888268, + "grad_norm": 0.4783824384212494, + "learning_rate": 0.00025252100840336136, + "loss": 0.3996, + "step": 26799 + }, + { + "epoch": 14.972067039106145, + "grad_norm": 1.8253722190856934, + "learning_rate": 0.00025249299719887956, + "loss": 0.4797, + "step": 26800 + }, + { + "epoch": 14.972625698324022, + "grad_norm": 0.5969604253768921, + "learning_rate": 0.00025246498599439777, + "loss": 0.5129, + "step": 26801 + }, + { + "epoch": 14.9731843575419, + "grad_norm": 0.7188569903373718, + "learning_rate": 0.000252436974789916, + "loss": 0.4162, + "step": 26802 + }, + { + "epoch": 14.973743016759776, + "grad_norm": 0.38322216272354126, + "learning_rate": 0.0002524089635854342, + "loss": 0.3425, + "step": 26803 + }, + { + "epoch": 14.974301675977653, + "grad_norm": 0.5722339749336243, + "learning_rate": 0.0002523809523809524, + "loss": 0.5008, + "step": 26804 + }, + { + "epoch": 14.974860335195531, + "grad_norm": 0.37354356050491333, + "learning_rate": 0.0002523529411764706, + "loss": 0.3874, + "step": 26805 + }, + { + "epoch": 14.975418994413408, + "grad_norm": 0.40424275398254395, + "learning_rate": 0.0002523249299719888, + "loss": 0.4555, + "step": 26806 + }, + { + "epoch": 14.975977653631285, + "grad_norm": 0.5796392560005188, + "learning_rate": 0.000252296918767507, + "loss": 0.4723, + "step": 26807 + }, + { + "epoch": 14.976536312849163, + "grad_norm": 0.6225695610046387, + "learning_rate": 0.0002522689075630252, + "loss": 0.4004, + "step": 26808 + }, + { + "epoch": 14.97709497206704, + "grad_norm": 1.1140315532684326, + "learning_rate": 0.0002522408963585434, + "loss": 0.4631, + "step": 26809 + }, + { + "epoch": 14.977653631284916, + "grad_norm": 0.3727976083755493, + "learning_rate": 0.0002522128851540617, + "loss": 0.3779, + "step": 26810 + }, + { + "epoch": 14.978212290502793, + "grad_norm": 0.5018436908721924, + "learning_rate": 0.00025218487394957983, + "loss": 0.2911, + "step": 26811 + }, + { + "epoch": 14.978770949720671, + "grad_norm": 1.0696431398391724, + "learning_rate": 0.00025215686274509804, + "loss": 0.384, + "step": 26812 + }, + { + "epoch": 14.979329608938547, + "grad_norm": 0.5404345393180847, + "learning_rate": 0.00025212885154061624, + "loss": 0.425, + "step": 26813 + }, + { + "epoch": 14.979888268156424, + "grad_norm": 0.8666000366210938, + "learning_rate": 0.00025210084033613445, + "loss": 0.589, + "step": 26814 + }, + { + "epoch": 14.980446927374302, + "grad_norm": 0.4966289699077606, + "learning_rate": 0.0002520728291316527, + "loss": 0.4294, + "step": 26815 + }, + { + "epoch": 14.981005586592179, + "grad_norm": 0.7560771107673645, + "learning_rate": 0.00025204481792717086, + "loss": 0.4236, + "step": 26816 + }, + { + "epoch": 14.981564245810056, + "grad_norm": 0.6763277053833008, + "learning_rate": 0.00025201680672268907, + "loss": 0.4047, + "step": 26817 + }, + { + "epoch": 14.982122905027932, + "grad_norm": 0.5242845416069031, + "learning_rate": 0.0002519887955182073, + "loss": 0.3426, + "step": 26818 + }, + { + "epoch": 14.98268156424581, + "grad_norm": 0.6580713987350464, + "learning_rate": 0.0002519607843137255, + "loss": 0.4821, + "step": 26819 + }, + { + "epoch": 14.983240223463687, + "grad_norm": 0.3427670896053314, + "learning_rate": 0.00025193277310924374, + "loss": 0.3155, + "step": 26820 + }, + { + "epoch": 14.983798882681564, + "grad_norm": 0.5841524004936218, + "learning_rate": 0.0002519047619047619, + "loss": 0.4119, + "step": 26821 + }, + { + "epoch": 14.984357541899442, + "grad_norm": 0.6364976167678833, + "learning_rate": 0.0002518767507002801, + "loss": 0.4236, + "step": 26822 + }, + { + "epoch": 14.984916201117318, + "grad_norm": 0.3674076795578003, + "learning_rate": 0.00025184873949579836, + "loss": 0.4101, + "step": 26823 + }, + { + "epoch": 14.985474860335195, + "grad_norm": 1.5305808782577515, + "learning_rate": 0.0002518207282913165, + "loss": 0.4783, + "step": 26824 + }, + { + "epoch": 14.986033519553073, + "grad_norm": 0.5475207567214966, + "learning_rate": 0.00025179271708683477, + "loss": 0.3689, + "step": 26825 + }, + { + "epoch": 14.98659217877095, + "grad_norm": 0.5333097577095032, + "learning_rate": 0.000251764705882353, + "loss": 0.4149, + "step": 26826 + }, + { + "epoch": 14.987150837988827, + "grad_norm": 0.4091620147228241, + "learning_rate": 0.0002517366946778711, + "loss": 0.41, + "step": 26827 + }, + { + "epoch": 14.987709497206703, + "grad_norm": 0.40512973070144653, + "learning_rate": 0.0002517086834733894, + "loss": 0.3662, + "step": 26828 + }, + { + "epoch": 14.988268156424581, + "grad_norm": 0.6344616413116455, + "learning_rate": 0.00025168067226890754, + "loss": 0.3644, + "step": 26829 + }, + { + "epoch": 14.988826815642458, + "grad_norm": 0.6171712875366211, + "learning_rate": 0.0002516526610644258, + "loss": 0.4622, + "step": 26830 + }, + { + "epoch": 14.989385474860335, + "grad_norm": 0.48157447576522827, + "learning_rate": 0.000251624649859944, + "loss": 0.4254, + "step": 26831 + }, + { + "epoch": 14.989944134078213, + "grad_norm": 1.4633336067199707, + "learning_rate": 0.00025159663865546216, + "loss": 0.406, + "step": 26832 + }, + { + "epoch": 14.99050279329609, + "grad_norm": 0.4426704943180084, + "learning_rate": 0.0002515686274509804, + "loss": 0.3601, + "step": 26833 + }, + { + "epoch": 14.991061452513966, + "grad_norm": 0.9304420351982117, + "learning_rate": 0.0002515406162464986, + "loss": 0.3857, + "step": 26834 + }, + { + "epoch": 14.991620111731844, + "grad_norm": 0.47383683919906616, + "learning_rate": 0.0002515126050420168, + "loss": 0.4666, + "step": 26835 + }, + { + "epoch": 14.992178770949721, + "grad_norm": 0.8339270353317261, + "learning_rate": 0.00025148459383753503, + "loss": 0.4184, + "step": 26836 + }, + { + "epoch": 14.992737430167598, + "grad_norm": 4.5467329025268555, + "learning_rate": 0.0002514565826330532, + "loss": 0.4058, + "step": 26837 + }, + { + "epoch": 14.993296089385474, + "grad_norm": 1.2365411520004272, + "learning_rate": 0.00025142857142857145, + "loss": 0.3475, + "step": 26838 + }, + { + "epoch": 14.993854748603352, + "grad_norm": 0.7254947423934937, + "learning_rate": 0.00025140056022408965, + "loss": 0.4221, + "step": 26839 + }, + { + "epoch": 14.994413407821229, + "grad_norm": 0.3923681676387787, + "learning_rate": 0.0002513725490196078, + "loss": 0.3795, + "step": 26840 + }, + { + "epoch": 14.994972067039106, + "grad_norm": 1.1441880464553833, + "learning_rate": 0.00025134453781512606, + "loss": 0.3237, + "step": 26841 + }, + { + "epoch": 14.995530726256984, + "grad_norm": 0.613080620765686, + "learning_rate": 0.00025131652661064427, + "loss": 0.3942, + "step": 26842 + }, + { + "epoch": 14.99608938547486, + "grad_norm": 0.47400137782096863, + "learning_rate": 0.0002512885154061625, + "loss": 0.3835, + "step": 26843 + }, + { + "epoch": 14.996648044692737, + "grad_norm": 0.8768868446350098, + "learning_rate": 0.0002512605042016807, + "loss": 0.593, + "step": 26844 + }, + { + "epoch": 14.997206703910614, + "grad_norm": 0.6574596762657166, + "learning_rate": 0.00025123249299719883, + "loss": 0.5053, + "step": 26845 + }, + { + "epoch": 14.997765363128492, + "grad_norm": 1.4516459703445435, + "learning_rate": 0.0002512044817927171, + "loss": 0.3991, + "step": 26846 + }, + { + "epoch": 14.998324022346369, + "grad_norm": 0.4115583300590515, + "learning_rate": 0.0002511764705882353, + "loss": 0.4428, + "step": 26847 + }, + { + "epoch": 14.998882681564245, + "grad_norm": 3.324688673019409, + "learning_rate": 0.0002511484593837535, + "loss": 0.3352, + "step": 26848 + }, + { + "epoch": 14.999441340782123, + "grad_norm": 0.33019959926605225, + "learning_rate": 0.0002511204481792717, + "loss": 0.3327, + "step": 26849 + }, + { + "epoch": 15.0, + "grad_norm": 0.45191580057144165, + "learning_rate": 0.0002510924369747899, + "loss": 0.3564, + "step": 26850 + }, + { + "epoch": 15.000558659217877, + "grad_norm": 0.4028445780277252, + "learning_rate": 0.0002510644257703081, + "loss": 0.4243, + "step": 26851 + }, + { + "epoch": 15.001117318435755, + "grad_norm": 0.6346094012260437, + "learning_rate": 0.00025103641456582633, + "loss": 0.5177, + "step": 26852 + }, + { + "epoch": 15.001675977653631, + "grad_norm": 0.36450257897377014, + "learning_rate": 0.00025100840336134454, + "loss": 0.3586, + "step": 26853 + }, + { + "epoch": 15.002234636871508, + "grad_norm": 0.3948482573032379, + "learning_rate": 0.00025098039215686274, + "loss": 0.4466, + "step": 26854 + }, + { + "epoch": 15.002793296089385, + "grad_norm": 0.6200559139251709, + "learning_rate": 0.00025095238095238095, + "loss": 0.428, + "step": 26855 + }, + { + "epoch": 15.003351955307263, + "grad_norm": 0.372334361076355, + "learning_rate": 0.00025092436974789915, + "loss": 0.3406, + "step": 26856 + }, + { + "epoch": 15.00391061452514, + "grad_norm": 0.5261492133140564, + "learning_rate": 0.00025089635854341736, + "loss": 0.5081, + "step": 26857 + }, + { + "epoch": 15.004469273743016, + "grad_norm": 0.4601307809352875, + "learning_rate": 0.0002508683473389356, + "loss": 0.4763, + "step": 26858 + }, + { + "epoch": 15.005027932960894, + "grad_norm": 0.877083957195282, + "learning_rate": 0.00025084033613445377, + "loss": 0.4749, + "step": 26859 + }, + { + "epoch": 15.005586592178771, + "grad_norm": 0.3876457214355469, + "learning_rate": 0.000250812324929972, + "loss": 0.3666, + "step": 26860 + }, + { + "epoch": 15.006145251396648, + "grad_norm": 0.4225273132324219, + "learning_rate": 0.0002507843137254902, + "loss": 0.3234, + "step": 26861 + }, + { + "epoch": 15.006703910614526, + "grad_norm": 0.4907268285751343, + "learning_rate": 0.0002507563025210084, + "loss": 0.4391, + "step": 26862 + }, + { + "epoch": 15.007262569832402, + "grad_norm": 0.5842012166976929, + "learning_rate": 0.00025072829131652665, + "loss": 0.3832, + "step": 26863 + }, + { + "epoch": 15.007821229050279, + "grad_norm": 1.8074244260787964, + "learning_rate": 0.0002507002801120448, + "loss": 0.3827, + "step": 26864 + }, + { + "epoch": 15.008379888268156, + "grad_norm": 0.5633991360664368, + "learning_rate": 0.000250672268907563, + "loss": 0.3771, + "step": 26865 + }, + { + "epoch": 15.008938547486034, + "grad_norm": 0.47548773884773254, + "learning_rate": 0.00025064425770308127, + "loss": 0.3853, + "step": 26866 + }, + { + "epoch": 15.00949720670391, + "grad_norm": 0.3436129689216614, + "learning_rate": 0.0002506162464985994, + "loss": 0.4163, + "step": 26867 + }, + { + "epoch": 15.010055865921787, + "grad_norm": 0.37591785192489624, + "learning_rate": 0.0002505882352941177, + "loss": 0.3768, + "step": 26868 + }, + { + "epoch": 15.010614525139665, + "grad_norm": 0.6241902709007263, + "learning_rate": 0.00025056022408963583, + "loss": 0.4407, + "step": 26869 + }, + { + "epoch": 15.011173184357542, + "grad_norm": 0.4844948947429657, + "learning_rate": 0.00025053221288515404, + "loss": 0.3645, + "step": 26870 + }, + { + "epoch": 15.011731843575419, + "grad_norm": 0.44438520073890686, + "learning_rate": 0.0002505042016806723, + "loss": 0.4263, + "step": 26871 + }, + { + "epoch": 15.012290502793297, + "grad_norm": 0.32489725947380066, + "learning_rate": 0.00025047619047619045, + "loss": 0.3289, + "step": 26872 + }, + { + "epoch": 15.012849162011173, + "grad_norm": 0.4891221523284912, + "learning_rate": 0.0002504481792717087, + "loss": 0.4708, + "step": 26873 + }, + { + "epoch": 15.01340782122905, + "grad_norm": 0.45323818922042847, + "learning_rate": 0.0002504201680672269, + "loss": 0.3345, + "step": 26874 + }, + { + "epoch": 15.013966480446927, + "grad_norm": 0.43679043650627136, + "learning_rate": 0.00025039215686274507, + "loss": 0.3651, + "step": 26875 + }, + { + "epoch": 15.014525139664805, + "grad_norm": 0.5581892728805542, + "learning_rate": 0.00025036414565826333, + "loss": 0.3524, + "step": 26876 + }, + { + "epoch": 15.015083798882682, + "grad_norm": 0.4508143663406372, + "learning_rate": 0.0002503361344537815, + "loss": 0.4639, + "step": 26877 + }, + { + "epoch": 15.015642458100558, + "grad_norm": 0.7554802298545837, + "learning_rate": 0.00025030812324929974, + "loss": 0.3419, + "step": 26878 + }, + { + "epoch": 15.016201117318436, + "grad_norm": 1.1287566423416138, + "learning_rate": 0.00025028011204481795, + "loss": 0.3986, + "step": 26879 + }, + { + "epoch": 15.016759776536313, + "grad_norm": 0.4794694781303406, + "learning_rate": 0.0002502521008403361, + "loss": 0.4132, + "step": 26880 + }, + { + "epoch": 15.01731843575419, + "grad_norm": 0.4565139710903168, + "learning_rate": 0.00025022408963585436, + "loss": 0.4118, + "step": 26881 + }, + { + "epoch": 15.017877094972068, + "grad_norm": 0.44634249806404114, + "learning_rate": 0.00025019607843137256, + "loss": 0.4228, + "step": 26882 + }, + { + "epoch": 15.018435754189944, + "grad_norm": 0.5857980847358704, + "learning_rate": 0.00025016806722689077, + "loss": 0.3714, + "step": 26883 + }, + { + "epoch": 15.018994413407821, + "grad_norm": 0.4245086908340454, + "learning_rate": 0.000250140056022409, + "loss": 0.3279, + "step": 26884 + }, + { + "epoch": 15.019553072625698, + "grad_norm": 0.44360044598579407, + "learning_rate": 0.0002501120448179271, + "loss": 0.4227, + "step": 26885 + }, + { + "epoch": 15.020111731843576, + "grad_norm": 1.079078197479248, + "learning_rate": 0.0002500840336134454, + "loss": 0.434, + "step": 26886 + }, + { + "epoch": 15.020670391061453, + "grad_norm": 0.5442601442337036, + "learning_rate": 0.0002500560224089636, + "loss": 0.5015, + "step": 26887 + }, + { + "epoch": 15.021229050279329, + "grad_norm": 0.7010372877120972, + "learning_rate": 0.0002500280112044818, + "loss": 0.4554, + "step": 26888 + }, + { + "epoch": 15.021787709497207, + "grad_norm": 0.36108502745628357, + "learning_rate": 0.00025, + "loss": 0.352, + "step": 26889 + }, + { + "epoch": 15.022346368715084, + "grad_norm": 2.8384745121002197, + "learning_rate": 0.0002499719887955182, + "loss": 0.5172, + "step": 26890 + }, + { + "epoch": 15.02290502793296, + "grad_norm": 0.5143072605133057, + "learning_rate": 0.0002499439775910364, + "loss": 0.3654, + "step": 26891 + }, + { + "epoch": 15.023463687150837, + "grad_norm": 0.46261823177337646, + "learning_rate": 0.0002499159663865546, + "loss": 0.3173, + "step": 26892 + }, + { + "epoch": 15.024022346368715, + "grad_norm": 0.4659637212753296, + "learning_rate": 0.00024988795518207283, + "loss": 0.3228, + "step": 26893 + }, + { + "epoch": 15.024581005586592, + "grad_norm": 0.4876371920108795, + "learning_rate": 0.00024985994397759104, + "loss": 0.4118, + "step": 26894 + }, + { + "epoch": 15.025139664804469, + "grad_norm": 1.7116811275482178, + "learning_rate": 0.00024983193277310924, + "loss": 0.4531, + "step": 26895 + }, + { + "epoch": 15.025698324022347, + "grad_norm": 10.244782447814941, + "learning_rate": 0.00024980392156862745, + "loss": 0.4112, + "step": 26896 + }, + { + "epoch": 15.026256983240224, + "grad_norm": 0.5708367824554443, + "learning_rate": 0.00024977591036414565, + "loss": 0.5848, + "step": 26897 + }, + { + "epoch": 15.0268156424581, + "grad_norm": 0.37682056427001953, + "learning_rate": 0.00024974789915966386, + "loss": 0.3565, + "step": 26898 + }, + { + "epoch": 15.027374301675978, + "grad_norm": 0.43208223581314087, + "learning_rate": 0.00024971988795518207, + "loss": 0.4851, + "step": 26899 + }, + { + "epoch": 15.027932960893855, + "grad_norm": 0.5319404602050781, + "learning_rate": 0.00024969187675070027, + "loss": 0.5145, + "step": 26900 + }, + { + "epoch": 15.028491620111732, + "grad_norm": 0.48241758346557617, + "learning_rate": 0.00024966386554621853, + "loss": 0.3877, + "step": 26901 + }, + { + "epoch": 15.029050279329608, + "grad_norm": 1.4814679622650146, + "learning_rate": 0.0002496358543417367, + "loss": 0.2605, + "step": 26902 + }, + { + "epoch": 15.029608938547486, + "grad_norm": 0.37490302324295044, + "learning_rate": 0.0002496078431372549, + "loss": 0.3707, + "step": 26903 + }, + { + "epoch": 15.030167597765363, + "grad_norm": 0.8533223867416382, + "learning_rate": 0.0002495798319327731, + "loss": 0.3715, + "step": 26904 + }, + { + "epoch": 15.03072625698324, + "grad_norm": 0.45826980471611023, + "learning_rate": 0.00024955182072829136, + "loss": 0.3716, + "step": 26905 + }, + { + "epoch": 15.031284916201118, + "grad_norm": 0.6275171041488647, + "learning_rate": 0.00024952380952380956, + "loss": 0.3928, + "step": 26906 + }, + { + "epoch": 15.031843575418995, + "grad_norm": 0.8012722134590149, + "learning_rate": 0.0002494957983193277, + "loss": 0.3866, + "step": 26907 + }, + { + "epoch": 15.032402234636871, + "grad_norm": 0.5740479230880737, + "learning_rate": 0.0002494677871148459, + "loss": 0.4378, + "step": 26908 + }, + { + "epoch": 15.03296089385475, + "grad_norm": 0.5773026347160339, + "learning_rate": 0.0002494397759103642, + "loss": 0.3659, + "step": 26909 + }, + { + "epoch": 15.033519553072626, + "grad_norm": 0.49632546305656433, + "learning_rate": 0.0002494117647058824, + "loss": 0.5214, + "step": 26910 + }, + { + "epoch": 15.034078212290503, + "grad_norm": 0.5711356401443481, + "learning_rate": 0.0002493837535014006, + "loss": 0.4924, + "step": 26911 + }, + { + "epoch": 15.03463687150838, + "grad_norm": 0.8759272694587708, + "learning_rate": 0.00024935574229691874, + "loss": 0.3887, + "step": 26912 + }, + { + "epoch": 15.035195530726257, + "grad_norm": 0.649247944355011, + "learning_rate": 0.000249327731092437, + "loss": 0.2928, + "step": 26913 + }, + { + "epoch": 15.035754189944134, + "grad_norm": 1.206963300704956, + "learning_rate": 0.0002492997198879552, + "loss": 0.4736, + "step": 26914 + }, + { + "epoch": 15.03631284916201, + "grad_norm": 0.9956198930740356, + "learning_rate": 0.0002492717086834734, + "loss": 0.4638, + "step": 26915 + }, + { + "epoch": 15.036871508379889, + "grad_norm": 0.5089482665061951, + "learning_rate": 0.0002492436974789916, + "loss": 0.4372, + "step": 26916 + }, + { + "epoch": 15.037430167597766, + "grad_norm": 0.4803427755832672, + "learning_rate": 0.00024921568627450983, + "loss": 0.6179, + "step": 26917 + }, + { + "epoch": 15.037988826815642, + "grad_norm": 0.4337618947029114, + "learning_rate": 0.00024918767507002803, + "loss": 0.4552, + "step": 26918 + }, + { + "epoch": 15.03854748603352, + "grad_norm": 0.3171527087688446, + "learning_rate": 0.00024915966386554624, + "loss": 0.2976, + "step": 26919 + }, + { + "epoch": 15.039106145251397, + "grad_norm": 0.443774938583374, + "learning_rate": 0.00024913165266106445, + "loss": 0.5081, + "step": 26920 + }, + { + "epoch": 15.039664804469274, + "grad_norm": 0.431959331035614, + "learning_rate": 0.00024910364145658265, + "loss": 0.3502, + "step": 26921 + }, + { + "epoch": 15.04022346368715, + "grad_norm": 2.338465452194214, + "learning_rate": 0.00024907563025210086, + "loss": 0.4069, + "step": 26922 + }, + { + "epoch": 15.040782122905028, + "grad_norm": 0.37305769324302673, + "learning_rate": 0.00024904761904761906, + "loss": 0.3302, + "step": 26923 + }, + { + "epoch": 15.041340782122905, + "grad_norm": 0.4058387875556946, + "learning_rate": 0.00024901960784313727, + "loss": 0.3688, + "step": 26924 + }, + { + "epoch": 15.041899441340782, + "grad_norm": 0.3835179805755615, + "learning_rate": 0.0002489915966386555, + "loss": 0.3507, + "step": 26925 + }, + { + "epoch": 15.04245810055866, + "grad_norm": 0.764991283416748, + "learning_rate": 0.0002489635854341737, + "loss": 0.4481, + "step": 26926 + }, + { + "epoch": 15.043016759776537, + "grad_norm": 0.4784719944000244, + "learning_rate": 0.0002489355742296919, + "loss": 0.4478, + "step": 26927 + }, + { + "epoch": 15.043575418994413, + "grad_norm": 0.3947021961212158, + "learning_rate": 0.0002489075630252101, + "loss": 0.3699, + "step": 26928 + }, + { + "epoch": 15.04413407821229, + "grad_norm": 0.40625664591789246, + "learning_rate": 0.0002488795518207283, + "loss": 0.3561, + "step": 26929 + }, + { + "epoch": 15.044692737430168, + "grad_norm": 0.7485128045082092, + "learning_rate": 0.0002488515406162465, + "loss": 0.4737, + "step": 26930 + }, + { + "epoch": 15.045251396648045, + "grad_norm": 0.5016077756881714, + "learning_rate": 0.0002488235294117647, + "loss": 0.4761, + "step": 26931 + }, + { + "epoch": 15.045810055865921, + "grad_norm": 0.40478381514549255, + "learning_rate": 0.0002487955182072829, + "loss": 0.4094, + "step": 26932 + }, + { + "epoch": 15.0463687150838, + "grad_norm": 0.5221636295318604, + "learning_rate": 0.0002487675070028011, + "loss": 0.4465, + "step": 26933 + }, + { + "epoch": 15.046927374301676, + "grad_norm": 0.5584438443183899, + "learning_rate": 0.00024873949579831933, + "loss": 0.3425, + "step": 26934 + }, + { + "epoch": 15.047486033519553, + "grad_norm": 0.3996504545211792, + "learning_rate": 0.00024871148459383754, + "loss": 0.454, + "step": 26935 + }, + { + "epoch": 15.048044692737431, + "grad_norm": 0.5384317636489868, + "learning_rate": 0.00024868347338935574, + "loss": 0.5427, + "step": 26936 + }, + { + "epoch": 15.048603351955308, + "grad_norm": 1.0164932012557983, + "learning_rate": 0.00024865546218487395, + "loss": 0.4771, + "step": 26937 + }, + { + "epoch": 15.049162011173184, + "grad_norm": 0.7885221838951111, + "learning_rate": 0.00024862745098039215, + "loss": 0.3507, + "step": 26938 + }, + { + "epoch": 15.04972067039106, + "grad_norm": 0.5892857313156128, + "learning_rate": 0.00024859943977591036, + "loss": 0.4302, + "step": 26939 + }, + { + "epoch": 15.050279329608939, + "grad_norm": 0.40044379234313965, + "learning_rate": 0.00024857142857142857, + "loss": 0.4147, + "step": 26940 + }, + { + "epoch": 15.050837988826816, + "grad_norm": 0.4539005756378174, + "learning_rate": 0.00024854341736694677, + "loss": 0.3913, + "step": 26941 + }, + { + "epoch": 15.051396648044692, + "grad_norm": 0.6547682881355286, + "learning_rate": 0.000248515406162465, + "loss": 0.4286, + "step": 26942 + }, + { + "epoch": 15.05195530726257, + "grad_norm": 0.40776658058166504, + "learning_rate": 0.0002484873949579832, + "loss": 0.4446, + "step": 26943 + }, + { + "epoch": 15.052513966480447, + "grad_norm": 8.208586692810059, + "learning_rate": 0.0002484593837535014, + "loss": 0.3818, + "step": 26944 + }, + { + "epoch": 15.053072625698324, + "grad_norm": 0.5905068516731262, + "learning_rate": 0.00024843137254901965, + "loss": 0.4897, + "step": 26945 + }, + { + "epoch": 15.053631284916202, + "grad_norm": 0.5988060235977173, + "learning_rate": 0.0002484033613445378, + "loss": 0.3098, + "step": 26946 + }, + { + "epoch": 15.054189944134079, + "grad_norm": 0.5745384693145752, + "learning_rate": 0.000248375350140056, + "loss": 0.5645, + "step": 26947 + }, + { + "epoch": 15.054748603351955, + "grad_norm": 1.0670571327209473, + "learning_rate": 0.0002483473389355742, + "loss": 0.455, + "step": 26948 + }, + { + "epoch": 15.055307262569832, + "grad_norm": 0.544528603553772, + "learning_rate": 0.0002483193277310925, + "loss": 0.399, + "step": 26949 + }, + { + "epoch": 15.05586592178771, + "grad_norm": 0.3786354959011078, + "learning_rate": 0.0002482913165266107, + "loss": 0.3785, + "step": 26950 + }, + { + "epoch": 15.056424581005587, + "grad_norm": 0.6824361085891724, + "learning_rate": 0.00024826330532212883, + "loss": 0.4394, + "step": 26951 + }, + { + "epoch": 15.056983240223463, + "grad_norm": 0.3922429084777832, + "learning_rate": 0.00024823529411764704, + "loss": 0.4059, + "step": 26952 + }, + { + "epoch": 15.057541899441341, + "grad_norm": 0.4455968737602234, + "learning_rate": 0.0002482072829131653, + "loss": 0.3817, + "step": 26953 + }, + { + "epoch": 15.058100558659218, + "grad_norm": 0.467678964138031, + "learning_rate": 0.0002481792717086835, + "loss": 0.3993, + "step": 26954 + }, + { + "epoch": 15.058659217877095, + "grad_norm": 0.3912041187286377, + "learning_rate": 0.0002481512605042017, + "loss": 0.3281, + "step": 26955 + }, + { + "epoch": 15.059217877094973, + "grad_norm": 0.828059196472168, + "learning_rate": 0.00024812324929971986, + "loss": 0.3237, + "step": 26956 + }, + { + "epoch": 15.05977653631285, + "grad_norm": 0.37380895018577576, + "learning_rate": 0.0002480952380952381, + "loss": 0.3674, + "step": 26957 + }, + { + "epoch": 15.060335195530726, + "grad_norm": 0.4876594841480255, + "learning_rate": 0.00024806722689075633, + "loss": 0.369, + "step": 26958 + }, + { + "epoch": 15.060893854748603, + "grad_norm": 0.2946512997150421, + "learning_rate": 0.00024803921568627453, + "loss": 0.2426, + "step": 26959 + }, + { + "epoch": 15.061452513966481, + "grad_norm": 0.4818108081817627, + "learning_rate": 0.00024801120448179274, + "loss": 0.367, + "step": 26960 + }, + { + "epoch": 15.062011173184358, + "grad_norm": 4.84298849105835, + "learning_rate": 0.00024798319327731095, + "loss": 0.3869, + "step": 26961 + }, + { + "epoch": 15.062569832402234, + "grad_norm": 0.45844560861587524, + "learning_rate": 0.00024795518207282915, + "loss": 0.3902, + "step": 26962 + }, + { + "epoch": 15.063128491620112, + "grad_norm": 0.3915700912475586, + "learning_rate": 0.00024792717086834736, + "loss": 0.4014, + "step": 26963 + }, + { + "epoch": 15.063687150837989, + "grad_norm": 0.8484997153282166, + "learning_rate": 0.00024789915966386556, + "loss": 0.3755, + "step": 26964 + }, + { + "epoch": 15.064245810055866, + "grad_norm": 0.397550493478775, + "learning_rate": 0.00024787114845938377, + "loss": 0.4255, + "step": 26965 + }, + { + "epoch": 15.064804469273742, + "grad_norm": 0.36226579546928406, + "learning_rate": 0.000247843137254902, + "loss": 0.3431, + "step": 26966 + }, + { + "epoch": 15.06536312849162, + "grad_norm": 0.586125910282135, + "learning_rate": 0.0002478151260504202, + "loss": 0.4467, + "step": 26967 + }, + { + "epoch": 15.065921787709497, + "grad_norm": 0.42878472805023193, + "learning_rate": 0.0002477871148459384, + "loss": 0.3871, + "step": 26968 + }, + { + "epoch": 15.066480446927374, + "grad_norm": 1.2628026008605957, + "learning_rate": 0.0002477591036414566, + "loss": 0.4338, + "step": 26969 + }, + { + "epoch": 15.067039106145252, + "grad_norm": 0.5514110326766968, + "learning_rate": 0.0002477310924369748, + "loss": 0.4237, + "step": 26970 + }, + { + "epoch": 15.067597765363129, + "grad_norm": 0.3650316298007965, + "learning_rate": 0.000247703081232493, + "loss": 0.3568, + "step": 26971 + }, + { + "epoch": 15.068156424581005, + "grad_norm": 0.4012395143508911, + "learning_rate": 0.0002476750700280112, + "loss": 0.3276, + "step": 26972 + }, + { + "epoch": 15.068715083798883, + "grad_norm": 0.4632839560508728, + "learning_rate": 0.0002476470588235294, + "loss": 0.4044, + "step": 26973 + }, + { + "epoch": 15.06927374301676, + "grad_norm": 0.9237446188926697, + "learning_rate": 0.0002476190476190476, + "loss": 0.4136, + "step": 26974 + }, + { + "epoch": 15.069832402234637, + "grad_norm": 0.4552759528160095, + "learning_rate": 0.00024759103641456583, + "loss": 0.4752, + "step": 26975 + }, + { + "epoch": 15.070391061452513, + "grad_norm": 0.48059314489364624, + "learning_rate": 0.00024756302521008404, + "loss": 0.3667, + "step": 26976 + }, + { + "epoch": 15.070949720670392, + "grad_norm": 0.4038139283657074, + "learning_rate": 0.00024753501400560224, + "loss": 0.3644, + "step": 26977 + }, + { + "epoch": 15.071508379888268, + "grad_norm": 0.4506329894065857, + "learning_rate": 0.00024750700280112045, + "loss": 0.5779, + "step": 26978 + }, + { + "epoch": 15.072067039106145, + "grad_norm": 0.3530978560447693, + "learning_rate": 0.00024747899159663865, + "loss": 0.4101, + "step": 26979 + }, + { + "epoch": 15.072625698324023, + "grad_norm": 1.1197973489761353, + "learning_rate": 0.00024745098039215686, + "loss": 0.4629, + "step": 26980 + }, + { + "epoch": 15.0731843575419, + "grad_norm": 1.083927869796753, + "learning_rate": 0.00024742296918767507, + "loss": 0.5189, + "step": 26981 + }, + { + "epoch": 15.073743016759776, + "grad_norm": 0.579332709312439, + "learning_rate": 0.00024739495798319327, + "loss": 0.3133, + "step": 26982 + }, + { + "epoch": 15.074301675977654, + "grad_norm": 0.6353923678398132, + "learning_rate": 0.0002473669467787115, + "loss": 0.4556, + "step": 26983 + }, + { + "epoch": 15.074860335195531, + "grad_norm": 0.6581987142562866, + "learning_rate": 0.0002473389355742297, + "loss": 0.5038, + "step": 26984 + }, + { + "epoch": 15.075418994413408, + "grad_norm": 0.682462751865387, + "learning_rate": 0.0002473109243697479, + "loss": 0.4109, + "step": 26985 + }, + { + "epoch": 15.075977653631284, + "grad_norm": 1.4290266036987305, + "learning_rate": 0.0002472829131652661, + "loss": 0.3351, + "step": 26986 + }, + { + "epoch": 15.076536312849163, + "grad_norm": 1.4346846342086792, + "learning_rate": 0.0002472549019607843, + "loss": 0.5061, + "step": 26987 + }, + { + "epoch": 15.077094972067039, + "grad_norm": 0.32376420497894287, + "learning_rate": 0.0002472268907563025, + "loss": 0.3993, + "step": 26988 + }, + { + "epoch": 15.077653631284916, + "grad_norm": 0.7088323831558228, + "learning_rate": 0.00024719887955182077, + "loss": 0.3565, + "step": 26989 + }, + { + "epoch": 15.078212290502794, + "grad_norm": 0.5038853287696838, + "learning_rate": 0.0002471708683473389, + "loss": 0.425, + "step": 26990 + }, + { + "epoch": 15.07877094972067, + "grad_norm": 1.1923103332519531, + "learning_rate": 0.0002471428571428571, + "loss": 0.485, + "step": 26991 + }, + { + "epoch": 15.079329608938547, + "grad_norm": 0.3794712424278259, + "learning_rate": 0.00024711484593837533, + "loss": 0.4284, + "step": 26992 + }, + { + "epoch": 15.079888268156424, + "grad_norm": 0.36067497730255127, + "learning_rate": 0.0002470868347338936, + "loss": 0.2901, + "step": 26993 + }, + { + "epoch": 15.080446927374302, + "grad_norm": 0.4367261230945587, + "learning_rate": 0.0002470588235294118, + "loss": 0.4183, + "step": 26994 + }, + { + "epoch": 15.081005586592179, + "grad_norm": 0.3977581262588501, + "learning_rate": 0.00024703081232492995, + "loss": 0.3706, + "step": 26995 + }, + { + "epoch": 15.081564245810055, + "grad_norm": 0.4364907145500183, + "learning_rate": 0.00024700280112044816, + "loss": 0.3747, + "step": 26996 + }, + { + "epoch": 15.082122905027934, + "grad_norm": 0.5650056004524231, + "learning_rate": 0.0002469747899159664, + "loss": 0.4664, + "step": 26997 + }, + { + "epoch": 15.08268156424581, + "grad_norm": 0.4446296989917755, + "learning_rate": 0.0002469467787114846, + "loss": 0.584, + "step": 26998 + }, + { + "epoch": 15.083240223463687, + "grad_norm": 0.45611563324928284, + "learning_rate": 0.00024691876750700283, + "loss": 0.501, + "step": 26999 + }, + { + "epoch": 15.083798882681565, + "grad_norm": 0.4472367465496063, + "learning_rate": 0.000246890756302521, + "loss": 0.473, + "step": 27000 + }, + { + "epoch": 15.083798882681565, + "eval_cer": 0.0859709993126248, + "eval_loss": 0.32471296191215515, + "eval_runtime": 55.5103, + "eval_samples_per_second": 81.751, + "eval_steps_per_second": 5.116, + "eval_wer": 0.3408843328398648, + "step": 27000 + }, + { + "epoch": 15.084357541899442, + "grad_norm": 0.46555376052856445, + "learning_rate": 0.00024686274509803924, + "loss": 0.5431, + "step": 27001 + }, + { + "epoch": 15.084916201117318, + "grad_norm": 0.9119547605514526, + "learning_rate": 0.00024683473389355745, + "loss": 0.4241, + "step": 27002 + }, + { + "epoch": 15.085474860335195, + "grad_norm": 0.42160847783088684, + "learning_rate": 0.00024680672268907565, + "loss": 0.3458, + "step": 27003 + }, + { + "epoch": 15.086033519553073, + "grad_norm": 0.486910343170166, + "learning_rate": 0.00024677871148459386, + "loss": 0.4357, + "step": 27004 + }, + { + "epoch": 15.08659217877095, + "grad_norm": 0.3392389714717865, + "learning_rate": 0.00024675070028011206, + "loss": 0.2862, + "step": 27005 + }, + { + "epoch": 15.087150837988826, + "grad_norm": 0.9551404714584351, + "learning_rate": 0.00024672268907563027, + "loss": 0.306, + "step": 27006 + }, + { + "epoch": 15.087709497206705, + "grad_norm": 0.6245831847190857, + "learning_rate": 0.0002466946778711485, + "loss": 0.4347, + "step": 27007 + }, + { + "epoch": 15.088268156424581, + "grad_norm": 0.46229642629623413, + "learning_rate": 0.0002466666666666667, + "loss": 0.4521, + "step": 27008 + }, + { + "epoch": 15.088826815642458, + "grad_norm": 0.3591441512107849, + "learning_rate": 0.0002466386554621849, + "loss": 0.4022, + "step": 27009 + }, + { + "epoch": 15.089385474860336, + "grad_norm": 0.5125793814659119, + "learning_rate": 0.0002466106442577031, + "loss": 0.3479, + "step": 27010 + }, + { + "epoch": 15.089944134078213, + "grad_norm": 0.5043748617172241, + "learning_rate": 0.0002465826330532213, + "loss": 0.4097, + "step": 27011 + }, + { + "epoch": 15.09050279329609, + "grad_norm": 0.346449613571167, + "learning_rate": 0.0002465546218487395, + "loss": 0.3676, + "step": 27012 + }, + { + "epoch": 15.091061452513966, + "grad_norm": 0.5347242951393127, + "learning_rate": 0.0002465266106442577, + "loss": 0.5437, + "step": 27013 + }, + { + "epoch": 15.091620111731844, + "grad_norm": 0.5479172468185425, + "learning_rate": 0.0002464985994397759, + "loss": 0.4798, + "step": 27014 + }, + { + "epoch": 15.09217877094972, + "grad_norm": 2.947571277618408, + "learning_rate": 0.0002464705882352941, + "loss": 0.3597, + "step": 27015 + }, + { + "epoch": 15.092737430167597, + "grad_norm": 0.442832350730896, + "learning_rate": 0.00024644257703081233, + "loss": 0.3518, + "step": 27016 + }, + { + "epoch": 15.093296089385476, + "grad_norm": 0.3627994656562805, + "learning_rate": 0.00024641456582633054, + "loss": 0.3948, + "step": 27017 + }, + { + "epoch": 15.093854748603352, + "grad_norm": 0.7864519953727722, + "learning_rate": 0.00024638655462184874, + "loss": 0.4945, + "step": 27018 + }, + { + "epoch": 15.094413407821229, + "grad_norm": 0.4636199474334717, + "learning_rate": 0.00024635854341736695, + "loss": 0.359, + "step": 27019 + }, + { + "epoch": 15.094972067039107, + "grad_norm": 0.9081053137779236, + "learning_rate": 0.00024633053221288515, + "loss": 0.3362, + "step": 27020 + }, + { + "epoch": 15.095530726256984, + "grad_norm": 0.44295617938041687, + "learning_rate": 0.00024630252100840336, + "loss": 0.4572, + "step": 27021 + }, + { + "epoch": 15.09608938547486, + "grad_norm": 0.48747849464416504, + "learning_rate": 0.00024627450980392157, + "loss": 0.5052, + "step": 27022 + }, + { + "epoch": 15.096648044692737, + "grad_norm": 8.42707347869873, + "learning_rate": 0.00024624649859943977, + "loss": 0.371, + "step": 27023 + }, + { + "epoch": 15.097206703910615, + "grad_norm": 0.6480430364608765, + "learning_rate": 0.000246218487394958, + "loss": 0.4103, + "step": 27024 + }, + { + "epoch": 15.097765363128492, + "grad_norm": 0.3670368790626526, + "learning_rate": 0.0002461904761904762, + "loss": 0.4084, + "step": 27025 + }, + { + "epoch": 15.098324022346368, + "grad_norm": 0.6102190017700195, + "learning_rate": 0.0002461624649859944, + "loss": 0.3836, + "step": 27026 + }, + { + "epoch": 15.098882681564247, + "grad_norm": 0.46493402123451233, + "learning_rate": 0.0002461344537815126, + "loss": 0.4032, + "step": 27027 + }, + { + "epoch": 15.099441340782123, + "grad_norm": 3.027873992919922, + "learning_rate": 0.0002461064425770308, + "loss": 0.3891, + "step": 27028 + }, + { + "epoch": 15.1, + "grad_norm": 0.4689657390117645, + "learning_rate": 0.000246078431372549, + "loss": 0.3921, + "step": 27029 + }, + { + "epoch": 15.100558659217878, + "grad_norm": 0.34698885679244995, + "learning_rate": 0.0002460504201680672, + "loss": 0.3879, + "step": 27030 + }, + { + "epoch": 15.101117318435755, + "grad_norm": 0.371849924325943, + "learning_rate": 0.0002460224089635854, + "loss": 0.3345, + "step": 27031 + }, + { + "epoch": 15.101675977653631, + "grad_norm": 0.6494749784469604, + "learning_rate": 0.0002459943977591036, + "loss": 0.3818, + "step": 27032 + }, + { + "epoch": 15.102234636871508, + "grad_norm": 0.4158179759979248, + "learning_rate": 0.0002459663865546219, + "loss": 0.3808, + "step": 27033 + }, + { + "epoch": 15.102793296089386, + "grad_norm": 0.45474332571029663, + "learning_rate": 0.00024593837535014004, + "loss": 0.4842, + "step": 27034 + }, + { + "epoch": 15.103351955307263, + "grad_norm": 0.5171681046485901, + "learning_rate": 0.00024591036414565824, + "loss": 0.3572, + "step": 27035 + }, + { + "epoch": 15.10391061452514, + "grad_norm": 0.4828847050666809, + "learning_rate": 0.00024588235294117645, + "loss": 0.4643, + "step": 27036 + }, + { + "epoch": 15.104469273743018, + "grad_norm": 0.45485252141952515, + "learning_rate": 0.0002458543417366947, + "loss": 0.5088, + "step": 27037 + }, + { + "epoch": 15.105027932960894, + "grad_norm": 0.42376044392585754, + "learning_rate": 0.0002458263305322129, + "loss": 0.5013, + "step": 27038 + }, + { + "epoch": 15.10558659217877, + "grad_norm": 0.5620428323745728, + "learning_rate": 0.00024579831932773107, + "loss": 0.2976, + "step": 27039 + }, + { + "epoch": 15.106145251396647, + "grad_norm": 0.721748411655426, + "learning_rate": 0.0002457703081232493, + "loss": 0.3624, + "step": 27040 + }, + { + "epoch": 15.106703910614526, + "grad_norm": 0.34397101402282715, + "learning_rate": 0.00024574229691876753, + "loss": 0.3228, + "step": 27041 + }, + { + "epoch": 15.107262569832402, + "grad_norm": 3.018893241882324, + "learning_rate": 0.00024571428571428574, + "loss": 0.5769, + "step": 27042 + }, + { + "epoch": 15.107821229050279, + "grad_norm": 0.6660287976264954, + "learning_rate": 0.00024568627450980395, + "loss": 0.435, + "step": 27043 + }, + { + "epoch": 15.108379888268157, + "grad_norm": 0.43279436230659485, + "learning_rate": 0.0002456582633053221, + "loss": 0.5081, + "step": 27044 + }, + { + "epoch": 15.108938547486034, + "grad_norm": 1.3323110342025757, + "learning_rate": 0.00024563025210084036, + "loss": 0.4175, + "step": 27045 + }, + { + "epoch": 15.10949720670391, + "grad_norm": 0.6359979510307312, + "learning_rate": 0.00024560224089635856, + "loss": 0.3153, + "step": 27046 + }, + { + "epoch": 15.110055865921789, + "grad_norm": 0.4616943299770355, + "learning_rate": 0.00024557422969187677, + "loss": 0.4214, + "step": 27047 + }, + { + "epoch": 15.110614525139665, + "grad_norm": 0.5856008529663086, + "learning_rate": 0.000245546218487395, + "loss": 0.3628, + "step": 27048 + }, + { + "epoch": 15.111173184357542, + "grad_norm": 0.40157410502433777, + "learning_rate": 0.0002455182072829132, + "loss": 0.2982, + "step": 27049 + }, + { + "epoch": 15.111731843575418, + "grad_norm": 0.6337383389472961, + "learning_rate": 0.0002454901960784314, + "loss": 0.3596, + "step": 27050 + }, + { + "epoch": 15.112290502793297, + "grad_norm": 0.45262759923934937, + "learning_rate": 0.0002454621848739496, + "loss": 0.3678, + "step": 27051 + }, + { + "epoch": 15.112849162011173, + "grad_norm": 0.825549840927124, + "learning_rate": 0.0002454341736694678, + "loss": 0.3288, + "step": 27052 + }, + { + "epoch": 15.11340782122905, + "grad_norm": 0.39222854375839233, + "learning_rate": 0.000245406162464986, + "loss": 0.3562, + "step": 27053 + }, + { + "epoch": 15.113966480446928, + "grad_norm": 0.41247180104255676, + "learning_rate": 0.0002453781512605042, + "loss": 0.3624, + "step": 27054 + }, + { + "epoch": 15.114525139664805, + "grad_norm": 0.7672168016433716, + "learning_rate": 0.0002453501400560224, + "loss": 0.501, + "step": 27055 + }, + { + "epoch": 15.115083798882681, + "grad_norm": 0.41790902614593506, + "learning_rate": 0.0002453221288515406, + "loss": 0.3563, + "step": 27056 + }, + { + "epoch": 15.11564245810056, + "grad_norm": 0.4824005365371704, + "learning_rate": 0.00024529411764705883, + "loss": 0.388, + "step": 27057 + }, + { + "epoch": 15.116201117318436, + "grad_norm": 0.49185648560523987, + "learning_rate": 0.00024526610644257704, + "loss": 0.5412, + "step": 27058 + }, + { + "epoch": 15.116759776536313, + "grad_norm": 0.5381485819816589, + "learning_rate": 0.00024523809523809524, + "loss": 0.4175, + "step": 27059 + }, + { + "epoch": 15.11731843575419, + "grad_norm": 0.41351935267448425, + "learning_rate": 0.00024521008403361345, + "loss": 0.4051, + "step": 27060 + }, + { + "epoch": 15.117877094972068, + "grad_norm": 0.5445733666419983, + "learning_rate": 0.00024518207282913165, + "loss": 0.4901, + "step": 27061 + }, + { + "epoch": 15.118435754189944, + "grad_norm": 0.5667095184326172, + "learning_rate": 0.00024515406162464986, + "loss": 0.4413, + "step": 27062 + }, + { + "epoch": 15.11899441340782, + "grad_norm": 0.44915780425071716, + "learning_rate": 0.00024512605042016807, + "loss": 0.492, + "step": 27063 + }, + { + "epoch": 15.119553072625699, + "grad_norm": 1.5476468801498413, + "learning_rate": 0.00024509803921568627, + "loss": 0.5613, + "step": 27064 + }, + { + "epoch": 15.120111731843576, + "grad_norm": 0.32369574904441833, + "learning_rate": 0.0002450700280112045, + "loss": 0.3763, + "step": 27065 + }, + { + "epoch": 15.120670391061452, + "grad_norm": 0.5431506037712097, + "learning_rate": 0.0002450420168067227, + "loss": 0.4661, + "step": 27066 + }, + { + "epoch": 15.121229050279329, + "grad_norm": 0.4893670976161957, + "learning_rate": 0.0002450140056022409, + "loss": 0.3427, + "step": 27067 + }, + { + "epoch": 15.121787709497207, + "grad_norm": 1.858221411705017, + "learning_rate": 0.0002449859943977591, + "loss": 0.3877, + "step": 27068 + }, + { + "epoch": 15.122346368715084, + "grad_norm": 0.31977301836013794, + "learning_rate": 0.0002449579831932773, + "loss": 0.3144, + "step": 27069 + }, + { + "epoch": 15.12290502793296, + "grad_norm": 1.4009097814559937, + "learning_rate": 0.0002449299719887955, + "loss": 0.3541, + "step": 27070 + }, + { + "epoch": 15.123463687150839, + "grad_norm": 0.44302475452423096, + "learning_rate": 0.0002449019607843137, + "loss": 0.3841, + "step": 27071 + }, + { + "epoch": 15.124022346368715, + "grad_norm": 0.3800410330295563, + "learning_rate": 0.0002448739495798319, + "loss": 0.3564, + "step": 27072 + }, + { + "epoch": 15.124581005586592, + "grad_norm": 0.7728914022445679, + "learning_rate": 0.0002448459383753502, + "loss": 0.3359, + "step": 27073 + }, + { + "epoch": 15.12513966480447, + "grad_norm": 0.4465992748737335, + "learning_rate": 0.00024481792717086833, + "loss": 0.4417, + "step": 27074 + }, + { + "epoch": 15.125698324022347, + "grad_norm": 0.4095946252346039, + "learning_rate": 0.00024478991596638654, + "loss": 0.3654, + "step": 27075 + }, + { + "epoch": 15.126256983240223, + "grad_norm": 0.5216981768608093, + "learning_rate": 0.00024476190476190474, + "loss": 0.4827, + "step": 27076 + }, + { + "epoch": 15.1268156424581, + "grad_norm": 0.5526641011238098, + "learning_rate": 0.000244733893557423, + "loss": 0.4068, + "step": 27077 + }, + { + "epoch": 15.127374301675978, + "grad_norm": 0.572192907333374, + "learning_rate": 0.0002447058823529412, + "loss": 0.5303, + "step": 27078 + }, + { + "epoch": 15.127932960893855, + "grad_norm": 1.3528988361358643, + "learning_rate": 0.00024467787114845936, + "loss": 0.5299, + "step": 27079 + }, + { + "epoch": 15.128491620111731, + "grad_norm": 0.3897814452648163, + "learning_rate": 0.00024464985994397757, + "loss": 0.3441, + "step": 27080 + }, + { + "epoch": 15.12905027932961, + "grad_norm": 0.49050432443618774, + "learning_rate": 0.00024462184873949583, + "loss": 0.5119, + "step": 27081 + }, + { + "epoch": 15.129608938547486, + "grad_norm": 0.4133954644203186, + "learning_rate": 0.00024459383753501403, + "loss": 0.3326, + "step": 27082 + }, + { + "epoch": 15.130167597765363, + "grad_norm": 0.47314685583114624, + "learning_rate": 0.00024456582633053224, + "loss": 0.4209, + "step": 27083 + }, + { + "epoch": 15.130726256983241, + "grad_norm": 0.3983404338359833, + "learning_rate": 0.0002445378151260504, + "loss": 0.4779, + "step": 27084 + }, + { + "epoch": 15.131284916201118, + "grad_norm": 0.4799702763557434, + "learning_rate": 0.00024450980392156865, + "loss": 0.4399, + "step": 27085 + }, + { + "epoch": 15.131843575418994, + "grad_norm": 0.38889190554618835, + "learning_rate": 0.00024448179271708686, + "loss": 0.4156, + "step": 27086 + }, + { + "epoch": 15.13240223463687, + "grad_norm": 0.34335896372795105, + "learning_rate": 0.00024445378151260506, + "loss": 0.3567, + "step": 27087 + }, + { + "epoch": 15.132960893854749, + "grad_norm": 1.9918476343154907, + "learning_rate": 0.0002444257703081232, + "loss": 0.2787, + "step": 27088 + }, + { + "epoch": 15.133519553072626, + "grad_norm": 0.39316263794898987, + "learning_rate": 0.0002443977591036415, + "loss": 0.3886, + "step": 27089 + }, + { + "epoch": 15.134078212290502, + "grad_norm": 0.4902094900608063, + "learning_rate": 0.0002443697478991597, + "loss": 0.4207, + "step": 27090 + }, + { + "epoch": 15.13463687150838, + "grad_norm": 0.5925796627998352, + "learning_rate": 0.0002443417366946779, + "loss": 0.7048, + "step": 27091 + }, + { + "epoch": 15.135195530726257, + "grad_norm": 0.6203573346138, + "learning_rate": 0.0002443137254901961, + "loss": 0.4067, + "step": 27092 + }, + { + "epoch": 15.135754189944134, + "grad_norm": 0.4595261812210083, + "learning_rate": 0.0002442857142857143, + "loss": 0.4124, + "step": 27093 + }, + { + "epoch": 15.136312849162012, + "grad_norm": 0.5828721523284912, + "learning_rate": 0.0002442577030812325, + "loss": 0.4914, + "step": 27094 + }, + { + "epoch": 15.136871508379889, + "grad_norm": 0.45298030972480774, + "learning_rate": 0.0002442296918767507, + "loss": 0.4889, + "step": 27095 + }, + { + "epoch": 15.137430167597765, + "grad_norm": 0.6941516399383545, + "learning_rate": 0.0002442016806722689, + "loss": 0.4544, + "step": 27096 + }, + { + "epoch": 15.137988826815642, + "grad_norm": 0.48597705364227295, + "learning_rate": 0.0002441736694677871, + "loss": 0.4048, + "step": 27097 + }, + { + "epoch": 15.13854748603352, + "grad_norm": 0.5737661719322205, + "learning_rate": 0.00024414565826330533, + "loss": 0.5196, + "step": 27098 + }, + { + "epoch": 15.139106145251397, + "grad_norm": 0.37921103835105896, + "learning_rate": 0.00024411764705882354, + "loss": 0.3723, + "step": 27099 + }, + { + "epoch": 15.139664804469273, + "grad_norm": 0.5729641318321228, + "learning_rate": 0.00024408963585434174, + "loss": 0.3805, + "step": 27100 + }, + { + "epoch": 15.140223463687152, + "grad_norm": 0.3232637047767639, + "learning_rate": 0.00024406162464985995, + "loss": 0.3659, + "step": 27101 + }, + { + "epoch": 15.140782122905028, + "grad_norm": 6.596498966217041, + "learning_rate": 0.00024403361344537815, + "loss": 0.3631, + "step": 27102 + }, + { + "epoch": 15.141340782122905, + "grad_norm": 0.7614020705223083, + "learning_rate": 0.00024400560224089636, + "loss": 0.4226, + "step": 27103 + }, + { + "epoch": 15.141899441340781, + "grad_norm": 0.3891860246658325, + "learning_rate": 0.00024397759103641457, + "loss": 0.4035, + "step": 27104 + }, + { + "epoch": 15.14245810055866, + "grad_norm": 1.087943196296692, + "learning_rate": 0.00024394957983193277, + "loss": 0.4083, + "step": 27105 + }, + { + "epoch": 15.143016759776536, + "grad_norm": 0.33843058347702026, + "learning_rate": 0.00024392156862745098, + "loss": 0.3387, + "step": 27106 + }, + { + "epoch": 15.143575418994413, + "grad_norm": 0.5082100629806519, + "learning_rate": 0.0002438935574229692, + "loss": 0.3525, + "step": 27107 + }, + { + "epoch": 15.144134078212291, + "grad_norm": 0.40680164098739624, + "learning_rate": 0.0002438655462184874, + "loss": 0.381, + "step": 27108 + }, + { + "epoch": 15.144692737430168, + "grad_norm": 0.333112508058548, + "learning_rate": 0.0002438375350140056, + "loss": 0.3623, + "step": 27109 + }, + { + "epoch": 15.145251396648044, + "grad_norm": 0.4140739142894745, + "learning_rate": 0.0002438095238095238, + "loss": 0.3258, + "step": 27110 + }, + { + "epoch": 15.145810055865923, + "grad_norm": 0.4597927927970886, + "learning_rate": 0.00024378151260504203, + "loss": 0.3499, + "step": 27111 + }, + { + "epoch": 15.1463687150838, + "grad_norm": 0.43684789538383484, + "learning_rate": 0.00024375350140056024, + "loss": 0.3495, + "step": 27112 + }, + { + "epoch": 15.146927374301676, + "grad_norm": 0.3556741774082184, + "learning_rate": 0.00024372549019607842, + "loss": 0.4257, + "step": 27113 + }, + { + "epoch": 15.147486033519552, + "grad_norm": 0.46280211210250854, + "learning_rate": 0.00024369747899159663, + "loss": 0.4655, + "step": 27114 + }, + { + "epoch": 15.14804469273743, + "grad_norm": 0.4690878391265869, + "learning_rate": 0.00024366946778711486, + "loss": 0.3436, + "step": 27115 + }, + { + "epoch": 15.148603351955307, + "grad_norm": 0.477507621049881, + "learning_rate": 0.00024364145658263306, + "loss": 0.447, + "step": 27116 + }, + { + "epoch": 15.149162011173184, + "grad_norm": 0.3387720584869385, + "learning_rate": 0.00024361344537815127, + "loss": 0.3915, + "step": 27117 + }, + { + "epoch": 15.149720670391062, + "grad_norm": 0.476091593503952, + "learning_rate": 0.00024358543417366945, + "loss": 0.4431, + "step": 27118 + }, + { + "epoch": 15.150279329608939, + "grad_norm": 2.260544776916504, + "learning_rate": 0.00024355742296918768, + "loss": 0.384, + "step": 27119 + }, + { + "epoch": 15.150837988826815, + "grad_norm": 0.5228288173675537, + "learning_rate": 0.0002435294117647059, + "loss": 0.5477, + "step": 27120 + }, + { + "epoch": 15.151396648044694, + "grad_norm": 0.6211914420127869, + "learning_rate": 0.0002435014005602241, + "loss": 0.5798, + "step": 27121 + }, + { + "epoch": 15.15195530726257, + "grad_norm": 0.35126590728759766, + "learning_rate": 0.00024347338935574233, + "loss": 0.3928, + "step": 27122 + }, + { + "epoch": 15.152513966480447, + "grad_norm": 0.48009130358695984, + "learning_rate": 0.0002434453781512605, + "loss": 0.5798, + "step": 27123 + }, + { + "epoch": 15.153072625698323, + "grad_norm": 0.6149377226829529, + "learning_rate": 0.0002434173669467787, + "loss": 0.4288, + "step": 27124 + }, + { + "epoch": 15.153631284916202, + "grad_norm": 0.798728883266449, + "learning_rate": 0.00024338935574229692, + "loss": 0.5143, + "step": 27125 + }, + { + "epoch": 15.154189944134078, + "grad_norm": 0.5272912383079529, + "learning_rate": 0.00024336134453781515, + "loss": 0.4576, + "step": 27126 + }, + { + "epoch": 15.154748603351955, + "grad_norm": 0.3481975793838501, + "learning_rate": 0.00024333333333333336, + "loss": 0.446, + "step": 27127 + }, + { + "epoch": 15.155307262569833, + "grad_norm": 0.7295636534690857, + "learning_rate": 0.00024330532212885154, + "loss": 0.4959, + "step": 27128 + }, + { + "epoch": 15.15586592178771, + "grad_norm": 0.5239155888557434, + "learning_rate": 0.00024327731092436974, + "loss": 0.504, + "step": 27129 + }, + { + "epoch": 15.156424581005586, + "grad_norm": 0.8161219358444214, + "learning_rate": 0.00024324929971988798, + "loss": 0.3815, + "step": 27130 + }, + { + "epoch": 15.156983240223465, + "grad_norm": 0.3975829780101776, + "learning_rate": 0.00024322128851540618, + "loss": 0.4839, + "step": 27131 + }, + { + "epoch": 15.157541899441341, + "grad_norm": 0.35358983278274536, + "learning_rate": 0.0002431932773109244, + "loss": 0.3193, + "step": 27132 + }, + { + "epoch": 15.158100558659218, + "grad_norm": 0.6780710220336914, + "learning_rate": 0.00024316526610644257, + "loss": 0.4728, + "step": 27133 + }, + { + "epoch": 15.158659217877094, + "grad_norm": 0.5312126278877258, + "learning_rate": 0.0002431372549019608, + "loss": 0.3756, + "step": 27134 + }, + { + "epoch": 15.159217877094973, + "grad_norm": 1.957397699356079, + "learning_rate": 0.000243109243697479, + "loss": 0.3745, + "step": 27135 + }, + { + "epoch": 15.15977653631285, + "grad_norm": 0.42384931445121765, + "learning_rate": 0.0002430812324929972, + "loss": 0.3645, + "step": 27136 + }, + { + "epoch": 15.160335195530726, + "grad_norm": 0.4002234935760498, + "learning_rate": 0.00024305322128851542, + "loss": 0.2955, + "step": 27137 + }, + { + "epoch": 15.160893854748604, + "grad_norm": 1.1644526720046997, + "learning_rate": 0.00024302521008403362, + "loss": 0.4545, + "step": 27138 + }, + { + "epoch": 15.16145251396648, + "grad_norm": 0.4228382110595703, + "learning_rate": 0.00024299719887955183, + "loss": 0.3317, + "step": 27139 + }, + { + "epoch": 15.162011173184357, + "grad_norm": 0.4425305724143982, + "learning_rate": 0.00024296918767507004, + "loss": 0.4211, + "step": 27140 + }, + { + "epoch": 15.162569832402234, + "grad_norm": 12.102898597717285, + "learning_rate": 0.00024294117647058824, + "loss": 0.4857, + "step": 27141 + }, + { + "epoch": 15.163128491620112, + "grad_norm": 0.46449530124664307, + "learning_rate": 0.00024291316526610645, + "loss": 0.3903, + "step": 27142 + }, + { + "epoch": 15.163687150837989, + "grad_norm": 0.4231211245059967, + "learning_rate": 0.00024288515406162465, + "loss": 0.4803, + "step": 27143 + }, + { + "epoch": 15.164245810055865, + "grad_norm": 0.5800387859344482, + "learning_rate": 0.00024285714285714286, + "loss": 0.4655, + "step": 27144 + }, + { + "epoch": 15.164804469273744, + "grad_norm": 0.4049888253211975, + "learning_rate": 0.00024282913165266107, + "loss": 0.4367, + "step": 27145 + }, + { + "epoch": 15.16536312849162, + "grad_norm": 0.39952030777931213, + "learning_rate": 0.0002428011204481793, + "loss": 0.5176, + "step": 27146 + }, + { + "epoch": 15.165921787709497, + "grad_norm": 0.42050716280937195, + "learning_rate": 0.00024277310924369748, + "loss": 0.3216, + "step": 27147 + }, + { + "epoch": 15.166480446927375, + "grad_norm": 0.38417303562164307, + "learning_rate": 0.00024274509803921568, + "loss": 0.5151, + "step": 27148 + }, + { + "epoch": 15.167039106145252, + "grad_norm": 0.4076318144798279, + "learning_rate": 0.0002427170868347339, + "loss": 0.3785, + "step": 27149 + }, + { + "epoch": 15.167597765363128, + "grad_norm": 0.6365754008293152, + "learning_rate": 0.00024268907563025212, + "loss": 0.411, + "step": 27150 + }, + { + "epoch": 15.168156424581005, + "grad_norm": 2.771817922592163, + "learning_rate": 0.00024266106442577033, + "loss": 0.4113, + "step": 27151 + }, + { + "epoch": 15.168715083798883, + "grad_norm": 0.7303066253662109, + "learning_rate": 0.0002426330532212885, + "loss": 0.3963, + "step": 27152 + }, + { + "epoch": 15.16927374301676, + "grad_norm": 0.42182886600494385, + "learning_rate": 0.0002426050420168067, + "loss": 0.4615, + "step": 27153 + }, + { + "epoch": 15.169832402234636, + "grad_norm": 0.8525797724723816, + "learning_rate": 0.00024257703081232495, + "loss": 0.482, + "step": 27154 + }, + { + "epoch": 15.170391061452515, + "grad_norm": 0.6056182980537415, + "learning_rate": 0.00024254901960784315, + "loss": 0.5641, + "step": 27155 + }, + { + "epoch": 15.170949720670391, + "grad_norm": 0.4830930233001709, + "learning_rate": 0.00024252100840336136, + "loss": 0.4042, + "step": 27156 + }, + { + "epoch": 15.171508379888268, + "grad_norm": 0.42725157737731934, + "learning_rate": 0.00024249299719887954, + "loss": 0.3956, + "step": 27157 + }, + { + "epoch": 15.172067039106146, + "grad_norm": 0.4476284682750702, + "learning_rate": 0.00024246498599439777, + "loss": 0.3605, + "step": 27158 + }, + { + "epoch": 15.172625698324023, + "grad_norm": 0.3446992337703705, + "learning_rate": 0.00024243697478991598, + "loss": 0.3235, + "step": 27159 + }, + { + "epoch": 15.1731843575419, + "grad_norm": 0.43621525168418884, + "learning_rate": 0.00024240896358543418, + "loss": 0.4406, + "step": 27160 + }, + { + "epoch": 15.173743016759776, + "grad_norm": 4.17518424987793, + "learning_rate": 0.0002423809523809524, + "loss": 0.3035, + "step": 27161 + }, + { + "epoch": 15.174301675977654, + "grad_norm": 0.5337277054786682, + "learning_rate": 0.0002423529411764706, + "loss": 0.3987, + "step": 27162 + }, + { + "epoch": 15.17486033519553, + "grad_norm": 0.6094560623168945, + "learning_rate": 0.0002423249299719888, + "loss": 0.4167, + "step": 27163 + }, + { + "epoch": 15.175418994413407, + "grad_norm": 0.4864679276943207, + "learning_rate": 0.000242296918767507, + "loss": 0.3847, + "step": 27164 + }, + { + "epoch": 15.175977653631286, + "grad_norm": 0.3789011836051941, + "learning_rate": 0.0002422689075630252, + "loss": 0.3829, + "step": 27165 + }, + { + "epoch": 15.176536312849162, + "grad_norm": 0.38551798462867737, + "learning_rate": 0.00024224089635854345, + "loss": 0.3448, + "step": 27166 + }, + { + "epoch": 15.177094972067039, + "grad_norm": 0.3869551122188568, + "learning_rate": 0.00024221288515406162, + "loss": 0.42, + "step": 27167 + }, + { + "epoch": 15.177653631284917, + "grad_norm": 0.6148666739463806, + "learning_rate": 0.00024218487394957983, + "loss": 0.4083, + "step": 27168 + }, + { + "epoch": 15.178212290502794, + "grad_norm": 0.6997018456459045, + "learning_rate": 0.00024215686274509804, + "loss": 0.4197, + "step": 27169 + }, + { + "epoch": 15.17877094972067, + "grad_norm": 0.3802711069583893, + "learning_rate": 0.00024212885154061627, + "loss": 0.3537, + "step": 27170 + }, + { + "epoch": 15.179329608938547, + "grad_norm": 0.5877300500869751, + "learning_rate": 0.00024210084033613448, + "loss": 0.3941, + "step": 27171 + }, + { + "epoch": 15.179888268156425, + "grad_norm": 0.6996586918830872, + "learning_rate": 0.00024207282913165265, + "loss": 0.4312, + "step": 27172 + }, + { + "epoch": 15.180446927374302, + "grad_norm": 0.6761998534202576, + "learning_rate": 0.00024204481792717086, + "loss": 0.4189, + "step": 27173 + }, + { + "epoch": 15.181005586592178, + "grad_norm": 0.3972788155078888, + "learning_rate": 0.0002420168067226891, + "loss": 0.4239, + "step": 27174 + }, + { + "epoch": 15.181564245810057, + "grad_norm": 0.4193545877933502, + "learning_rate": 0.0002419887955182073, + "loss": 0.3773, + "step": 27175 + }, + { + "epoch": 15.182122905027933, + "grad_norm": 0.5859658718109131, + "learning_rate": 0.0002419607843137255, + "loss": 0.3704, + "step": 27176 + }, + { + "epoch": 15.18268156424581, + "grad_norm": 0.46136415004730225, + "learning_rate": 0.00024193277310924368, + "loss": 0.3605, + "step": 27177 + }, + { + "epoch": 15.183240223463686, + "grad_norm": 0.7353127598762512, + "learning_rate": 0.00024190476190476192, + "loss": 0.4138, + "step": 27178 + }, + { + "epoch": 15.183798882681565, + "grad_norm": 0.43678024411201477, + "learning_rate": 0.00024187675070028012, + "loss": 0.3953, + "step": 27179 + }, + { + "epoch": 15.184357541899441, + "grad_norm": 0.5883775353431702, + "learning_rate": 0.00024184873949579833, + "loss": 0.4165, + "step": 27180 + }, + { + "epoch": 15.184916201117318, + "grad_norm": 0.36661794781684875, + "learning_rate": 0.00024182072829131654, + "loss": 0.3876, + "step": 27181 + }, + { + "epoch": 15.185474860335196, + "grad_norm": 0.48775869607925415, + "learning_rate": 0.00024179271708683474, + "loss": 0.4548, + "step": 27182 + }, + { + "epoch": 15.186033519553073, + "grad_norm": 0.34850120544433594, + "learning_rate": 0.00024176470588235295, + "loss": 0.4022, + "step": 27183 + }, + { + "epoch": 15.18659217877095, + "grad_norm": 0.34781205654144287, + "learning_rate": 0.00024173669467787115, + "loss": 0.4063, + "step": 27184 + }, + { + "epoch": 15.187150837988828, + "grad_norm": 0.49330341815948486, + "learning_rate": 0.00024170868347338936, + "loss": 0.4032, + "step": 27185 + }, + { + "epoch": 15.187709497206704, + "grad_norm": 0.48733147978782654, + "learning_rate": 0.0002416806722689076, + "loss": 0.3515, + "step": 27186 + }, + { + "epoch": 15.18826815642458, + "grad_norm": 0.9458677768707275, + "learning_rate": 0.00024165266106442577, + "loss": 0.3293, + "step": 27187 + }, + { + "epoch": 15.188826815642457, + "grad_norm": 0.43216556310653687, + "learning_rate": 0.00024162464985994398, + "loss": 0.4078, + "step": 27188 + }, + { + "epoch": 15.189385474860336, + "grad_norm": 19.454490661621094, + "learning_rate": 0.00024159663865546218, + "loss": 0.4541, + "step": 27189 + }, + { + "epoch": 15.189944134078212, + "grad_norm": 0.4774804711341858, + "learning_rate": 0.00024156862745098042, + "loss": 0.344, + "step": 27190 + }, + { + "epoch": 15.190502793296089, + "grad_norm": 0.5158601999282837, + "learning_rate": 0.00024154061624649862, + "loss": 0.4167, + "step": 27191 + }, + { + "epoch": 15.191061452513967, + "grad_norm": 1.0138782262802124, + "learning_rate": 0.0002415126050420168, + "loss": 0.4612, + "step": 27192 + }, + { + "epoch": 15.191620111731844, + "grad_norm": 0.5057836771011353, + "learning_rate": 0.000241484593837535, + "loss": 0.5445, + "step": 27193 + }, + { + "epoch": 15.19217877094972, + "grad_norm": 0.3696221113204956, + "learning_rate": 0.00024145658263305324, + "loss": 0.4381, + "step": 27194 + }, + { + "epoch": 15.192737430167599, + "grad_norm": 0.41393572092056274, + "learning_rate": 0.00024142857142857145, + "loss": 0.2883, + "step": 27195 + }, + { + "epoch": 15.193296089385475, + "grad_norm": 0.35773953795433044, + "learning_rate": 0.00024140056022408963, + "loss": 0.3677, + "step": 27196 + }, + { + "epoch": 15.193854748603352, + "grad_norm": 1.09688138961792, + "learning_rate": 0.00024137254901960783, + "loss": 0.4206, + "step": 27197 + }, + { + "epoch": 15.194413407821228, + "grad_norm": 0.44415491819381714, + "learning_rate": 0.00024134453781512606, + "loss": 0.5319, + "step": 27198 + }, + { + "epoch": 15.194972067039107, + "grad_norm": 0.5444587469100952, + "learning_rate": 0.00024131652661064427, + "loss": 0.3968, + "step": 27199 + }, + { + "epoch": 15.195530726256983, + "grad_norm": 0.5443543791770935, + "learning_rate": 0.00024128851540616248, + "loss": 0.4141, + "step": 27200 + }, + { + "epoch": 15.19608938547486, + "grad_norm": 0.714827299118042, + "learning_rate": 0.00024126050420168066, + "loss": 0.5278, + "step": 27201 + }, + { + "epoch": 15.196648044692738, + "grad_norm": 0.3461098372936249, + "learning_rate": 0.0002412324929971989, + "loss": 0.3499, + "step": 27202 + }, + { + "epoch": 15.197206703910615, + "grad_norm": 0.5639128088951111, + "learning_rate": 0.0002412044817927171, + "loss": 0.4245, + "step": 27203 + }, + { + "epoch": 15.197765363128491, + "grad_norm": 0.504483699798584, + "learning_rate": 0.0002411764705882353, + "loss": 0.4451, + "step": 27204 + }, + { + "epoch": 15.19832402234637, + "grad_norm": 0.4270140826702118, + "learning_rate": 0.0002411484593837535, + "loss": 0.3925, + "step": 27205 + }, + { + "epoch": 15.198882681564246, + "grad_norm": 0.40988901257514954, + "learning_rate": 0.0002411204481792717, + "loss": 0.3553, + "step": 27206 + }, + { + "epoch": 15.199441340782123, + "grad_norm": 0.3633969724178314, + "learning_rate": 0.00024109243697478992, + "loss": 0.3032, + "step": 27207 + }, + { + "epoch": 15.2, + "grad_norm": 0.3978801965713501, + "learning_rate": 0.00024106442577030812, + "loss": 0.4529, + "step": 27208 + }, + { + "epoch": 15.200558659217878, + "grad_norm": 0.36953550577163696, + "learning_rate": 0.00024103641456582633, + "loss": 0.3472, + "step": 27209 + }, + { + "epoch": 15.201117318435754, + "grad_norm": 0.7324294447898865, + "learning_rate": 0.00024100840336134456, + "loss": 0.4125, + "step": 27210 + }, + { + "epoch": 15.20167597765363, + "grad_norm": 0.38765665888786316, + "learning_rate": 0.00024098039215686274, + "loss": 0.3249, + "step": 27211 + }, + { + "epoch": 15.202234636871509, + "grad_norm": 0.37759941816329956, + "learning_rate": 0.00024095238095238095, + "loss": 0.343, + "step": 27212 + }, + { + "epoch": 15.202793296089386, + "grad_norm": 0.5258983969688416, + "learning_rate": 0.00024092436974789915, + "loss": 0.4944, + "step": 27213 + }, + { + "epoch": 15.203351955307262, + "grad_norm": 0.5161382555961609, + "learning_rate": 0.0002408963585434174, + "loss": 0.4012, + "step": 27214 + }, + { + "epoch": 15.203910614525139, + "grad_norm": 0.5770924687385559, + "learning_rate": 0.0002408683473389356, + "loss": 0.3351, + "step": 27215 + }, + { + "epoch": 15.204469273743017, + "grad_norm": 0.4455718994140625, + "learning_rate": 0.00024084033613445377, + "loss": 0.4163, + "step": 27216 + }, + { + "epoch": 15.205027932960894, + "grad_norm": 0.7514647841453552, + "learning_rate": 0.00024081232492997198, + "loss": 0.4094, + "step": 27217 + }, + { + "epoch": 15.20558659217877, + "grad_norm": 0.4450298249721527, + "learning_rate": 0.0002407843137254902, + "loss": 0.3284, + "step": 27218 + }, + { + "epoch": 15.206145251396649, + "grad_norm": 0.3868100941181183, + "learning_rate": 0.00024075630252100842, + "loss": 0.328, + "step": 27219 + }, + { + "epoch": 15.206703910614525, + "grad_norm": 0.6516246199607849, + "learning_rate": 0.00024072829131652662, + "loss": 0.369, + "step": 27220 + }, + { + "epoch": 15.207262569832402, + "grad_norm": 0.43434417247772217, + "learning_rate": 0.0002407002801120448, + "loss": 0.4005, + "step": 27221 + }, + { + "epoch": 15.20782122905028, + "grad_norm": 0.770481526851654, + "learning_rate": 0.00024067226890756304, + "loss": 0.4172, + "step": 27222 + }, + { + "epoch": 15.208379888268157, + "grad_norm": 0.38107791543006897, + "learning_rate": 0.00024064425770308124, + "loss": 0.3336, + "step": 27223 + }, + { + "epoch": 15.208938547486033, + "grad_norm": 1.1834373474121094, + "learning_rate": 0.00024061624649859945, + "loss": 0.3957, + "step": 27224 + }, + { + "epoch": 15.20949720670391, + "grad_norm": 0.6148368120193481, + "learning_rate": 0.00024058823529411765, + "loss": 0.5837, + "step": 27225 + }, + { + "epoch": 15.210055865921788, + "grad_norm": 0.43569695949554443, + "learning_rate": 0.00024056022408963586, + "loss": 0.4507, + "step": 27226 + }, + { + "epoch": 15.210614525139665, + "grad_norm": 0.4305015206336975, + "learning_rate": 0.00024053221288515407, + "loss": 0.3597, + "step": 27227 + }, + { + "epoch": 15.211173184357541, + "grad_norm": 3.1079533100128174, + "learning_rate": 0.00024050420168067227, + "loss": 0.6983, + "step": 27228 + }, + { + "epoch": 15.21173184357542, + "grad_norm": 0.4841807782649994, + "learning_rate": 0.00024047619047619048, + "loss": 0.3871, + "step": 27229 + }, + { + "epoch": 15.212290502793296, + "grad_norm": 0.4471410810947418, + "learning_rate": 0.0002404481792717087, + "loss": 0.3148, + "step": 27230 + }, + { + "epoch": 15.212849162011173, + "grad_norm": 0.639410138130188, + "learning_rate": 0.0002404201680672269, + "loss": 0.4899, + "step": 27231 + }, + { + "epoch": 15.213407821229051, + "grad_norm": 0.5526193976402283, + "learning_rate": 0.0002403921568627451, + "loss": 0.4907, + "step": 27232 + }, + { + "epoch": 15.213966480446928, + "grad_norm": 0.4611920416355133, + "learning_rate": 0.0002403641456582633, + "loss": 0.4557, + "step": 27233 + }, + { + "epoch": 15.214525139664804, + "grad_norm": 0.3367268741130829, + "learning_rate": 0.00024033613445378153, + "loss": 0.4035, + "step": 27234 + }, + { + "epoch": 15.21508379888268, + "grad_norm": 0.3978441655635834, + "learning_rate": 0.00024030812324929974, + "loss": 0.4656, + "step": 27235 + }, + { + "epoch": 15.21564245810056, + "grad_norm": 0.43446311354637146, + "learning_rate": 0.00024028011204481792, + "loss": 0.4488, + "step": 27236 + }, + { + "epoch": 15.216201117318436, + "grad_norm": 0.43057745695114136, + "learning_rate": 0.00024025210084033613, + "loss": 0.4285, + "step": 27237 + }, + { + "epoch": 15.216759776536312, + "grad_norm": 0.3746228814125061, + "learning_rate": 0.00024022408963585436, + "loss": 0.385, + "step": 27238 + }, + { + "epoch": 15.21731843575419, + "grad_norm": 0.5968604683876038, + "learning_rate": 0.00024019607843137256, + "loss": 0.3689, + "step": 27239 + }, + { + "epoch": 15.217877094972067, + "grad_norm": 0.4818195402622223, + "learning_rate": 0.00024016806722689077, + "loss": 0.3884, + "step": 27240 + }, + { + "epoch": 15.218435754189944, + "grad_norm": 0.41954466700553894, + "learning_rate": 0.00024014005602240895, + "loss": 0.3841, + "step": 27241 + }, + { + "epoch": 15.21899441340782, + "grad_norm": 0.41375160217285156, + "learning_rate": 0.00024011204481792718, + "loss": 0.4826, + "step": 27242 + }, + { + "epoch": 15.219553072625699, + "grad_norm": 0.44006311893463135, + "learning_rate": 0.0002400840336134454, + "loss": 0.3743, + "step": 27243 + }, + { + "epoch": 15.220111731843575, + "grad_norm": 0.44837290048599243, + "learning_rate": 0.0002400560224089636, + "loss": 0.4754, + "step": 27244 + }, + { + "epoch": 15.220670391061452, + "grad_norm": 0.5105819702148438, + "learning_rate": 0.0002400280112044818, + "loss": 0.3687, + "step": 27245 + }, + { + "epoch": 15.22122905027933, + "grad_norm": 0.4841049909591675, + "learning_rate": 0.00024, + "loss": 0.4296, + "step": 27246 + }, + { + "epoch": 15.221787709497207, + "grad_norm": 0.37922903895378113, + "learning_rate": 0.0002399719887955182, + "loss": 0.4071, + "step": 27247 + }, + { + "epoch": 15.222346368715083, + "grad_norm": 0.43594953417778015, + "learning_rate": 0.00023994397759103642, + "loss": 0.3984, + "step": 27248 + }, + { + "epoch": 15.222905027932962, + "grad_norm": 0.512876570224762, + "learning_rate": 0.00023991596638655462, + "loss": 0.4027, + "step": 27249 + }, + { + "epoch": 15.223463687150838, + "grad_norm": 0.7205897569656372, + "learning_rate": 0.00023988795518207286, + "loss": 0.463, + "step": 27250 + }, + { + "epoch": 15.224022346368715, + "grad_norm": 0.43102848529815674, + "learning_rate": 0.00023985994397759104, + "loss": 0.5187, + "step": 27251 + }, + { + "epoch": 15.224581005586591, + "grad_norm": 0.47895359992980957, + "learning_rate": 0.00023983193277310924, + "loss": 0.4228, + "step": 27252 + }, + { + "epoch": 15.22513966480447, + "grad_norm": 0.5614460110664368, + "learning_rate": 0.00023980392156862745, + "loss": 0.4512, + "step": 27253 + }, + { + "epoch": 15.225698324022346, + "grad_norm": 0.5633761882781982, + "learning_rate": 0.00023977591036414568, + "loss": 0.5342, + "step": 27254 + }, + { + "epoch": 15.226256983240223, + "grad_norm": 0.5918115377426147, + "learning_rate": 0.00023974789915966386, + "loss": 0.3865, + "step": 27255 + }, + { + "epoch": 15.226815642458101, + "grad_norm": 0.8485002517700195, + "learning_rate": 0.00023971988795518207, + "loss": 0.6571, + "step": 27256 + }, + { + "epoch": 15.227374301675978, + "grad_norm": 0.42712152004241943, + "learning_rate": 0.00023969187675070027, + "loss": 0.3879, + "step": 27257 + }, + { + "epoch": 15.227932960893854, + "grad_norm": 0.47626161575317383, + "learning_rate": 0.0002396638655462185, + "loss": 0.4658, + "step": 27258 + }, + { + "epoch": 15.228491620111733, + "grad_norm": 0.5027827620506287, + "learning_rate": 0.0002396358543417367, + "loss": 0.559, + "step": 27259 + }, + { + "epoch": 15.22905027932961, + "grad_norm": 2.392272472381592, + "learning_rate": 0.0002396078431372549, + "loss": 0.2938, + "step": 27260 + }, + { + "epoch": 15.229608938547486, + "grad_norm": 0.49962446093559265, + "learning_rate": 0.0002395798319327731, + "loss": 0.428, + "step": 27261 + }, + { + "epoch": 15.230167597765362, + "grad_norm": 0.4339938163757324, + "learning_rate": 0.00023955182072829133, + "loss": 0.4497, + "step": 27262 + }, + { + "epoch": 15.23072625698324, + "grad_norm": 0.6094764471054077, + "learning_rate": 0.00023952380952380954, + "loss": 0.3842, + "step": 27263 + }, + { + "epoch": 15.231284916201117, + "grad_norm": 0.4525226354598999, + "learning_rate": 0.00023949579831932774, + "loss": 0.4778, + "step": 27264 + }, + { + "epoch": 15.231843575418994, + "grad_norm": 0.4125398099422455, + "learning_rate": 0.00023946778711484592, + "loss": 0.3265, + "step": 27265 + }, + { + "epoch": 15.232402234636872, + "grad_norm": 0.5147817134857178, + "learning_rate": 0.00023943977591036415, + "loss": 0.4861, + "step": 27266 + }, + { + "epoch": 15.232960893854749, + "grad_norm": 1.3364759683609009, + "learning_rate": 0.00023941176470588236, + "loss": 0.4358, + "step": 27267 + }, + { + "epoch": 15.233519553072625, + "grad_norm": 0.44241103529930115, + "learning_rate": 0.00023938375350140057, + "loss": 0.4308, + "step": 27268 + }, + { + "epoch": 15.234078212290504, + "grad_norm": 0.3694785535335541, + "learning_rate": 0.00023935574229691877, + "loss": 0.3838, + "step": 27269 + }, + { + "epoch": 15.23463687150838, + "grad_norm": 0.41772425174713135, + "learning_rate": 0.00023932773109243698, + "loss": 0.3951, + "step": 27270 + }, + { + "epoch": 15.235195530726257, + "grad_norm": 0.7389469742774963, + "learning_rate": 0.00023929971988795518, + "loss": 0.7359, + "step": 27271 + }, + { + "epoch": 15.235754189944133, + "grad_norm": 0.49500662088394165, + "learning_rate": 0.0002392717086834734, + "loss": 0.4697, + "step": 27272 + }, + { + "epoch": 15.236312849162012, + "grad_norm": 0.5220922231674194, + "learning_rate": 0.0002392436974789916, + "loss": 0.468, + "step": 27273 + }, + { + "epoch": 15.236871508379888, + "grad_norm": 0.4546958804130554, + "learning_rate": 0.00023921568627450983, + "loss": 0.4194, + "step": 27274 + }, + { + "epoch": 15.237430167597765, + "grad_norm": 0.3412879705429077, + "learning_rate": 0.000239187675070028, + "loss": 0.4142, + "step": 27275 + }, + { + "epoch": 15.237988826815643, + "grad_norm": 0.40437960624694824, + "learning_rate": 0.0002391596638655462, + "loss": 0.4745, + "step": 27276 + }, + { + "epoch": 15.23854748603352, + "grad_norm": 0.5808608531951904, + "learning_rate": 0.00023913165266106442, + "loss": 0.3694, + "step": 27277 + }, + { + "epoch": 15.239106145251396, + "grad_norm": 0.46674829721450806, + "learning_rate": 0.00023910364145658265, + "loss": 0.3589, + "step": 27278 + }, + { + "epoch": 15.239664804469275, + "grad_norm": 0.8124476075172424, + "learning_rate": 0.00023907563025210086, + "loss": 0.3476, + "step": 27279 + }, + { + "epoch": 15.240223463687151, + "grad_norm": 0.35851016640663147, + "learning_rate": 0.00023904761904761904, + "loss": 0.3176, + "step": 27280 + }, + { + "epoch": 15.240782122905028, + "grad_norm": 0.5494292378425598, + "learning_rate": 0.00023901960784313724, + "loss": 0.4843, + "step": 27281 + }, + { + "epoch": 15.241340782122904, + "grad_norm": 0.38169774413108826, + "learning_rate": 0.00023899159663865548, + "loss": 0.4697, + "step": 27282 + }, + { + "epoch": 15.241899441340783, + "grad_norm": 0.476744145154953, + "learning_rate": 0.00023896358543417368, + "loss": 0.4574, + "step": 27283 + }, + { + "epoch": 15.24245810055866, + "grad_norm": 0.5318389534950256, + "learning_rate": 0.0002389355742296919, + "loss": 0.3775, + "step": 27284 + }, + { + "epoch": 15.243016759776536, + "grad_norm": 0.6290057301521301, + "learning_rate": 0.00023890756302521007, + "loss": 0.3823, + "step": 27285 + }, + { + "epoch": 15.243575418994414, + "grad_norm": 4.899540424346924, + "learning_rate": 0.0002388795518207283, + "loss": 0.4312, + "step": 27286 + }, + { + "epoch": 15.24413407821229, + "grad_norm": 0.3728036880493164, + "learning_rate": 0.0002388515406162465, + "loss": 0.3267, + "step": 27287 + }, + { + "epoch": 15.244692737430167, + "grad_norm": 0.38292205333709717, + "learning_rate": 0.0002388235294117647, + "loss": 0.3905, + "step": 27288 + }, + { + "epoch": 15.245251396648044, + "grad_norm": 0.4266759157180786, + "learning_rate": 0.00023879551820728292, + "loss": 0.4193, + "step": 27289 + }, + { + "epoch": 15.245810055865922, + "grad_norm": 0.47007545828819275, + "learning_rate": 0.00023876750700280112, + "loss": 0.3629, + "step": 27290 + }, + { + "epoch": 15.246368715083799, + "grad_norm": 1.3062148094177246, + "learning_rate": 0.00023873949579831933, + "loss": 0.4357, + "step": 27291 + }, + { + "epoch": 15.246927374301675, + "grad_norm": 0.6552423238754272, + "learning_rate": 0.00023871148459383754, + "loss": 0.4535, + "step": 27292 + }, + { + "epoch": 15.247486033519554, + "grad_norm": 0.4915752708911896, + "learning_rate": 0.00023868347338935574, + "loss": 0.5503, + "step": 27293 + }, + { + "epoch": 15.24804469273743, + "grad_norm": 0.382807195186615, + "learning_rate": 0.00023865546218487398, + "loss": 0.3209, + "step": 27294 + }, + { + "epoch": 15.248603351955307, + "grad_norm": 0.4989481270313263, + "learning_rate": 0.00023862745098039215, + "loss": 0.5095, + "step": 27295 + }, + { + "epoch": 15.249162011173185, + "grad_norm": 0.34435802698135376, + "learning_rate": 0.00023859943977591036, + "loss": 0.3609, + "step": 27296 + }, + { + "epoch": 15.249720670391062, + "grad_norm": 0.39178192615509033, + "learning_rate": 0.00023857142857142857, + "loss": 0.4913, + "step": 27297 + }, + { + "epoch": 15.250279329608938, + "grad_norm": 0.39510196447372437, + "learning_rate": 0.0002385434173669468, + "loss": 0.3389, + "step": 27298 + }, + { + "epoch": 15.250837988826815, + "grad_norm": 0.289592444896698, + "learning_rate": 0.000238515406162465, + "loss": 0.2906, + "step": 27299 + }, + { + "epoch": 15.251396648044693, + "grad_norm": 0.4477510154247284, + "learning_rate": 0.00023848739495798318, + "loss": 0.4611, + "step": 27300 + }, + { + "epoch": 15.25195530726257, + "grad_norm": 0.4839337170124054, + "learning_rate": 0.0002384593837535014, + "loss": 0.3498, + "step": 27301 + }, + { + "epoch": 15.252513966480446, + "grad_norm": 0.7617443799972534, + "learning_rate": 0.00023843137254901962, + "loss": 0.3765, + "step": 27302 + }, + { + "epoch": 15.253072625698325, + "grad_norm": 0.6577860713005066, + "learning_rate": 0.00023840336134453783, + "loss": 0.4719, + "step": 27303 + }, + { + "epoch": 15.253631284916201, + "grad_norm": 0.4975976347923279, + "learning_rate": 0.00023837535014005604, + "loss": 0.3287, + "step": 27304 + }, + { + "epoch": 15.254189944134078, + "grad_norm": 0.44093024730682373, + "learning_rate": 0.00023834733893557421, + "loss": 0.3799, + "step": 27305 + }, + { + "epoch": 15.254748603351956, + "grad_norm": 1.389951229095459, + "learning_rate": 0.00023831932773109245, + "loss": 0.3356, + "step": 27306 + }, + { + "epoch": 15.255307262569833, + "grad_norm": 0.3966330289840698, + "learning_rate": 0.00023829131652661065, + "loss": 0.3848, + "step": 27307 + }, + { + "epoch": 15.25586592178771, + "grad_norm": 0.4381747543811798, + "learning_rate": 0.00023826330532212886, + "loss": 0.4327, + "step": 27308 + }, + { + "epoch": 15.256424581005586, + "grad_norm": 0.7367750406265259, + "learning_rate": 0.00023823529411764704, + "loss": 0.3501, + "step": 27309 + }, + { + "epoch": 15.256983240223464, + "grad_norm": 0.34817299246788025, + "learning_rate": 0.00023820728291316527, + "loss": 0.3811, + "step": 27310 + }, + { + "epoch": 15.25754189944134, + "grad_norm": 0.3882730007171631, + "learning_rate": 0.00023817927170868348, + "loss": 0.3842, + "step": 27311 + }, + { + "epoch": 15.258100558659217, + "grad_norm": 0.49196770787239075, + "learning_rate": 0.00023815126050420168, + "loss": 0.3593, + "step": 27312 + }, + { + "epoch": 15.258659217877096, + "grad_norm": 1.9919036626815796, + "learning_rate": 0.0002381232492997199, + "loss": 0.5058, + "step": 27313 + }, + { + "epoch": 15.259217877094972, + "grad_norm": 0.6281132698059082, + "learning_rate": 0.0002380952380952381, + "loss": 0.4263, + "step": 27314 + }, + { + "epoch": 15.259776536312849, + "grad_norm": 0.45565271377563477, + "learning_rate": 0.0002380672268907563, + "loss": 0.449, + "step": 27315 + }, + { + "epoch": 15.260335195530725, + "grad_norm": 0.5083568096160889, + "learning_rate": 0.0002380392156862745, + "loss": 0.4098, + "step": 27316 + }, + { + "epoch": 15.260893854748604, + "grad_norm": 0.5633454918861389, + "learning_rate": 0.0002380112044817927, + "loss": 0.3126, + "step": 27317 + }, + { + "epoch": 15.26145251396648, + "grad_norm": 0.5010203123092651, + "learning_rate": 0.00023798319327731095, + "loss": 0.4285, + "step": 27318 + }, + { + "epoch": 15.262011173184357, + "grad_norm": 0.3600017726421356, + "learning_rate": 0.00023795518207282913, + "loss": 0.3855, + "step": 27319 + }, + { + "epoch": 15.262569832402235, + "grad_norm": 0.5928458571434021, + "learning_rate": 0.00023792717086834733, + "loss": 0.4163, + "step": 27320 + }, + { + "epoch": 15.263128491620112, + "grad_norm": 0.34623974561691284, + "learning_rate": 0.00023789915966386554, + "loss": 0.3586, + "step": 27321 + }, + { + "epoch": 15.263687150837988, + "grad_norm": 0.4261847138404846, + "learning_rate": 0.00023787114845938377, + "loss": 0.3601, + "step": 27322 + }, + { + "epoch": 15.264245810055867, + "grad_norm": 0.45708662271499634, + "learning_rate": 0.00023784313725490198, + "loss": 0.4331, + "step": 27323 + }, + { + "epoch": 15.264804469273743, + "grad_norm": 0.4622381925582886, + "learning_rate": 0.00023781512605042015, + "loss": 0.4676, + "step": 27324 + }, + { + "epoch": 15.26536312849162, + "grad_norm": 0.4170280694961548, + "learning_rate": 0.00023778711484593836, + "loss": 0.4359, + "step": 27325 + }, + { + "epoch": 15.265921787709496, + "grad_norm": 0.6590286493301392, + "learning_rate": 0.0002377591036414566, + "loss": 0.5291, + "step": 27326 + }, + { + "epoch": 15.266480446927375, + "grad_norm": 0.7865777015686035, + "learning_rate": 0.0002377310924369748, + "loss": 0.3311, + "step": 27327 + }, + { + "epoch": 15.267039106145251, + "grad_norm": 0.4268190264701843, + "learning_rate": 0.000237703081232493, + "loss": 0.498, + "step": 27328 + }, + { + "epoch": 15.267597765363128, + "grad_norm": 0.41497716307640076, + "learning_rate": 0.00023767507002801118, + "loss": 0.4136, + "step": 27329 + }, + { + "epoch": 15.268156424581006, + "grad_norm": 0.34212756156921387, + "learning_rate": 0.00023764705882352942, + "loss": 0.4468, + "step": 27330 + }, + { + "epoch": 15.268715083798883, + "grad_norm": 0.5231161117553711, + "learning_rate": 0.00023761904761904762, + "loss": 0.4593, + "step": 27331 + }, + { + "epoch": 15.26927374301676, + "grad_norm": 0.5548396110534668, + "learning_rate": 0.00023759103641456583, + "loss": 0.5669, + "step": 27332 + }, + { + "epoch": 15.269832402234638, + "grad_norm": 0.4548458158969879, + "learning_rate": 0.00023756302521008404, + "loss": 0.4052, + "step": 27333 + }, + { + "epoch": 15.270391061452514, + "grad_norm": 0.5036672949790955, + "learning_rate": 0.00023753501400560224, + "loss": 0.4673, + "step": 27334 + }, + { + "epoch": 15.27094972067039, + "grad_norm": 0.9956867694854736, + "learning_rate": 0.00023750700280112045, + "loss": 0.3387, + "step": 27335 + }, + { + "epoch": 15.271508379888267, + "grad_norm": 0.3966291844844818, + "learning_rate": 0.00023747899159663865, + "loss": 0.42, + "step": 27336 + }, + { + "epoch": 15.272067039106146, + "grad_norm": 0.36904725432395935, + "learning_rate": 0.00023745098039215686, + "loss": 0.4877, + "step": 27337 + }, + { + "epoch": 15.272625698324022, + "grad_norm": 0.46454742550849915, + "learning_rate": 0.0002374229691876751, + "loss": 0.4637, + "step": 27338 + }, + { + "epoch": 15.273184357541899, + "grad_norm": 0.3837496042251587, + "learning_rate": 0.00023739495798319327, + "loss": 0.3795, + "step": 27339 + }, + { + "epoch": 15.273743016759777, + "grad_norm": 0.41461503505706787, + "learning_rate": 0.00023736694677871148, + "loss": 0.4349, + "step": 27340 + }, + { + "epoch": 15.274301675977654, + "grad_norm": 0.47250935435295105, + "learning_rate": 0.00023733893557422968, + "loss": 0.459, + "step": 27341 + }, + { + "epoch": 15.27486033519553, + "grad_norm": 0.42125189304351807, + "learning_rate": 0.00023731092436974792, + "loss": 0.4918, + "step": 27342 + }, + { + "epoch": 15.275418994413409, + "grad_norm": 2.8062045574188232, + "learning_rate": 0.00023728291316526612, + "loss": 0.4098, + "step": 27343 + }, + { + "epoch": 15.275977653631285, + "grad_norm": 0.7166767716407776, + "learning_rate": 0.0002372549019607843, + "loss": 0.4191, + "step": 27344 + }, + { + "epoch": 15.276536312849162, + "grad_norm": 0.6807219982147217, + "learning_rate": 0.0002372268907563025, + "loss": 0.4433, + "step": 27345 + }, + { + "epoch": 15.277094972067038, + "grad_norm": 0.4246392250061035, + "learning_rate": 0.00023719887955182074, + "loss": 0.452, + "step": 27346 + }, + { + "epoch": 15.277653631284917, + "grad_norm": 0.4423331022262573, + "learning_rate": 0.00023717086834733895, + "loss": 0.3894, + "step": 27347 + }, + { + "epoch": 15.278212290502793, + "grad_norm": 0.7535455822944641, + "learning_rate": 0.00023714285714285715, + "loss": 0.4578, + "step": 27348 + }, + { + "epoch": 15.27877094972067, + "grad_norm": 0.4807535707950592, + "learning_rate": 0.00023711484593837533, + "loss": 0.3998, + "step": 27349 + }, + { + "epoch": 15.279329608938548, + "grad_norm": 2.586956262588501, + "learning_rate": 0.00023708683473389357, + "loss": 0.4998, + "step": 27350 + }, + { + "epoch": 15.279888268156425, + "grad_norm": 0.40266501903533936, + "learning_rate": 0.00023705882352941177, + "loss": 0.3149, + "step": 27351 + }, + { + "epoch": 15.280446927374301, + "grad_norm": 0.6439770460128784, + "learning_rate": 0.00023703081232492998, + "loss": 0.4691, + "step": 27352 + }, + { + "epoch": 15.28100558659218, + "grad_norm": 1.7154874801635742, + "learning_rate": 0.0002370028011204482, + "loss": 0.3783, + "step": 27353 + }, + { + "epoch": 15.281564245810056, + "grad_norm": 0.5013905763626099, + "learning_rate": 0.0002369747899159664, + "loss": 0.366, + "step": 27354 + }, + { + "epoch": 15.282122905027933, + "grad_norm": 0.4838288426399231, + "learning_rate": 0.0002369467787114846, + "loss": 0.4225, + "step": 27355 + }, + { + "epoch": 15.28268156424581, + "grad_norm": 0.48759859800338745, + "learning_rate": 0.0002369187675070028, + "loss": 0.4907, + "step": 27356 + }, + { + "epoch": 15.283240223463688, + "grad_norm": 0.636256217956543, + "learning_rate": 0.00023689075630252103, + "loss": 0.4596, + "step": 27357 + }, + { + "epoch": 15.283798882681564, + "grad_norm": 0.3870348632335663, + "learning_rate": 0.00023686274509803924, + "loss": 0.3714, + "step": 27358 + }, + { + "epoch": 15.28435754189944, + "grad_norm": 0.6482207775115967, + "learning_rate": 0.00023683473389355742, + "loss": 0.5904, + "step": 27359 + }, + { + "epoch": 15.28491620111732, + "grad_norm": 0.3728819787502289, + "learning_rate": 0.00023680672268907562, + "loss": 0.432, + "step": 27360 + }, + { + "epoch": 15.285474860335196, + "grad_norm": 1.8959358930587769, + "learning_rate": 0.00023677871148459386, + "loss": 0.4745, + "step": 27361 + }, + { + "epoch": 15.286033519553072, + "grad_norm": 2.601167917251587, + "learning_rate": 0.00023675070028011206, + "loss": 0.3805, + "step": 27362 + }, + { + "epoch": 15.286592178770949, + "grad_norm": 0.6279693841934204, + "learning_rate": 0.00023672268907563027, + "loss": 0.3181, + "step": 27363 + }, + { + "epoch": 15.287150837988827, + "grad_norm": 0.6313248872756958, + "learning_rate": 0.00023669467787114845, + "loss": 0.394, + "step": 27364 + }, + { + "epoch": 15.287709497206704, + "grad_norm": 0.569640576839447, + "learning_rate": 0.00023666666666666668, + "loss": 0.4009, + "step": 27365 + }, + { + "epoch": 15.28826815642458, + "grad_norm": 0.45173370838165283, + "learning_rate": 0.0002366386554621849, + "loss": 0.3805, + "step": 27366 + }, + { + "epoch": 15.288826815642459, + "grad_norm": 0.9054532051086426, + "learning_rate": 0.0002366106442577031, + "loss": 0.3985, + "step": 27367 + }, + { + "epoch": 15.289385474860335, + "grad_norm": 0.37061354517936707, + "learning_rate": 0.00023658263305322127, + "loss": 0.531, + "step": 27368 + }, + { + "epoch": 15.289944134078212, + "grad_norm": 0.35593268275260925, + "learning_rate": 0.0002365546218487395, + "loss": 0.3574, + "step": 27369 + }, + { + "epoch": 15.29050279329609, + "grad_norm": 0.3411157727241516, + "learning_rate": 0.0002365266106442577, + "loss": 0.3868, + "step": 27370 + }, + { + "epoch": 15.291061452513967, + "grad_norm": 0.45763590931892395, + "learning_rate": 0.00023649859943977592, + "loss": 0.4686, + "step": 27371 + }, + { + "epoch": 15.291620111731843, + "grad_norm": 0.42536401748657227, + "learning_rate": 0.00023647058823529412, + "loss": 0.4015, + "step": 27372 + }, + { + "epoch": 15.29217877094972, + "grad_norm": 0.5526837706565857, + "learning_rate": 0.00023644257703081233, + "loss": 0.4297, + "step": 27373 + }, + { + "epoch": 15.292737430167598, + "grad_norm": 0.3557010889053345, + "learning_rate": 0.00023641456582633054, + "loss": 0.3044, + "step": 27374 + }, + { + "epoch": 15.293296089385475, + "grad_norm": 0.43715643882751465, + "learning_rate": 0.00023638655462184874, + "loss": 0.4324, + "step": 27375 + }, + { + "epoch": 15.293854748603351, + "grad_norm": 0.40180277824401855, + "learning_rate": 0.00023635854341736695, + "loss": 0.3925, + "step": 27376 + }, + { + "epoch": 15.29441340782123, + "grad_norm": 0.3401314616203308, + "learning_rate": 0.00023633053221288518, + "loss": 0.3272, + "step": 27377 + }, + { + "epoch": 15.294972067039106, + "grad_norm": 3.161470413208008, + "learning_rate": 0.00023630252100840336, + "loss": 0.4517, + "step": 27378 + }, + { + "epoch": 15.295530726256983, + "grad_norm": 0.34121567010879517, + "learning_rate": 0.00023627450980392157, + "loss": 0.2791, + "step": 27379 + }, + { + "epoch": 15.296089385474861, + "grad_norm": 0.5235820412635803, + "learning_rate": 0.00023624649859943977, + "loss": 0.5952, + "step": 27380 + }, + { + "epoch": 15.296648044692738, + "grad_norm": 0.47562819719314575, + "learning_rate": 0.000236218487394958, + "loss": 0.4544, + "step": 27381 + }, + { + "epoch": 15.297206703910614, + "grad_norm": 0.5631003379821777, + "learning_rate": 0.0002361904761904762, + "loss": 0.4569, + "step": 27382 + }, + { + "epoch": 15.297765363128491, + "grad_norm": 0.4859277606010437, + "learning_rate": 0.0002361624649859944, + "loss": 0.5742, + "step": 27383 + }, + { + "epoch": 15.29832402234637, + "grad_norm": 0.36334100365638733, + "learning_rate": 0.0002361344537815126, + "loss": 0.4525, + "step": 27384 + }, + { + "epoch": 15.298882681564246, + "grad_norm": 0.3904019594192505, + "learning_rate": 0.00023610644257703083, + "loss": 0.417, + "step": 27385 + }, + { + "epoch": 15.299441340782122, + "grad_norm": 0.43609458208084106, + "learning_rate": 0.00023607843137254904, + "loss": 0.3987, + "step": 27386 + }, + { + "epoch": 15.3, + "grad_norm": 0.48538318276405334, + "learning_rate": 0.00023605042016806724, + "loss": 0.3534, + "step": 27387 + }, + { + "epoch": 15.300558659217877, + "grad_norm": 2.9375693798065186, + "learning_rate": 0.00023602240896358542, + "loss": 0.3829, + "step": 27388 + }, + { + "epoch": 15.301117318435754, + "grad_norm": 0.4311273992061615, + "learning_rate": 0.00023599439775910365, + "loss": 0.4089, + "step": 27389 + }, + { + "epoch": 15.30167597765363, + "grad_norm": 0.3711971938610077, + "learning_rate": 0.00023596638655462186, + "loss": 0.3456, + "step": 27390 + }, + { + "epoch": 15.302234636871509, + "grad_norm": 0.764815628528595, + "learning_rate": 0.00023593837535014007, + "loss": 0.4087, + "step": 27391 + }, + { + "epoch": 15.302793296089385, + "grad_norm": 0.3980580270290375, + "learning_rate": 0.00023591036414565827, + "loss": 0.3805, + "step": 27392 + }, + { + "epoch": 15.303351955307262, + "grad_norm": 0.5145567655563354, + "learning_rate": 0.00023588235294117648, + "loss": 0.3964, + "step": 27393 + }, + { + "epoch": 15.30391061452514, + "grad_norm": 2.1635966300964355, + "learning_rate": 0.00023585434173669468, + "loss": 0.4846, + "step": 27394 + }, + { + "epoch": 15.304469273743017, + "grad_norm": 0.7815500497817993, + "learning_rate": 0.0002358263305322129, + "loss": 0.4059, + "step": 27395 + }, + { + "epoch": 15.305027932960893, + "grad_norm": 1.560543179512024, + "learning_rate": 0.0002357983193277311, + "loss": 0.4489, + "step": 27396 + }, + { + "epoch": 15.305586592178772, + "grad_norm": 0.39423465728759766, + "learning_rate": 0.00023577030812324933, + "loss": 0.3675, + "step": 27397 + }, + { + "epoch": 15.306145251396648, + "grad_norm": 0.39616483449935913, + "learning_rate": 0.0002357422969187675, + "loss": 0.4807, + "step": 27398 + }, + { + "epoch": 15.306703910614525, + "grad_norm": 0.5009031891822815, + "learning_rate": 0.0002357142857142857, + "loss": 0.4109, + "step": 27399 + }, + { + "epoch": 15.307262569832401, + "grad_norm": 0.5291558504104614, + "learning_rate": 0.00023568627450980392, + "loss": 0.4489, + "step": 27400 + }, + { + "epoch": 15.30782122905028, + "grad_norm": 0.7831292152404785, + "learning_rate": 0.00023565826330532215, + "loss": 0.3388, + "step": 27401 + }, + { + "epoch": 15.308379888268156, + "grad_norm": 0.5438055992126465, + "learning_rate": 0.00023563025210084036, + "loss": 0.4295, + "step": 27402 + }, + { + "epoch": 15.308938547486033, + "grad_norm": 0.48060014843940735, + "learning_rate": 0.00023560224089635854, + "loss": 0.4782, + "step": 27403 + }, + { + "epoch": 15.309497206703911, + "grad_norm": 0.5952041745185852, + "learning_rate": 0.00023557422969187674, + "loss": 0.3667, + "step": 27404 + }, + { + "epoch": 15.310055865921788, + "grad_norm": 0.5067158937454224, + "learning_rate": 0.00023554621848739498, + "loss": 0.401, + "step": 27405 + }, + { + "epoch": 15.310614525139664, + "grad_norm": 1.7469717264175415, + "learning_rate": 0.00023551820728291318, + "loss": 0.4151, + "step": 27406 + }, + { + "epoch": 15.311173184357543, + "grad_norm": 0.43438252806663513, + "learning_rate": 0.0002354901960784314, + "loss": 0.4133, + "step": 27407 + }, + { + "epoch": 15.31173184357542, + "grad_norm": 0.6350743174552917, + "learning_rate": 0.00023546218487394957, + "loss": 0.4113, + "step": 27408 + }, + { + "epoch": 15.312290502793296, + "grad_norm": 0.4433327317237854, + "learning_rate": 0.0002354341736694678, + "loss": 0.3555, + "step": 27409 + }, + { + "epoch": 15.312849162011172, + "grad_norm": 0.4220203161239624, + "learning_rate": 0.000235406162464986, + "loss": 0.4514, + "step": 27410 + }, + { + "epoch": 15.31340782122905, + "grad_norm": 1.5481551885604858, + "learning_rate": 0.0002353781512605042, + "loss": 0.3857, + "step": 27411 + }, + { + "epoch": 15.313966480446927, + "grad_norm": 0.5273675918579102, + "learning_rate": 0.00023535014005602242, + "loss": 0.5126, + "step": 27412 + }, + { + "epoch": 15.314525139664804, + "grad_norm": 0.4492024779319763, + "learning_rate": 0.00023532212885154062, + "loss": 0.3989, + "step": 27413 + }, + { + "epoch": 15.315083798882682, + "grad_norm": 1.1539660692214966, + "learning_rate": 0.00023529411764705883, + "loss": 0.4852, + "step": 27414 + }, + { + "epoch": 15.315642458100559, + "grad_norm": 3.3286173343658447, + "learning_rate": 0.00023526610644257704, + "loss": 0.3654, + "step": 27415 + }, + { + "epoch": 15.316201117318435, + "grad_norm": 0.43987223505973816, + "learning_rate": 0.00023523809523809524, + "loss": 0.4506, + "step": 27416 + }, + { + "epoch": 15.316759776536314, + "grad_norm": 0.4094041883945465, + "learning_rate": 0.00023521008403361348, + "loss": 0.4581, + "step": 27417 + }, + { + "epoch": 15.31731843575419, + "grad_norm": 0.5308538675308228, + "learning_rate": 0.00023518207282913165, + "loss": 0.4487, + "step": 27418 + }, + { + "epoch": 15.317877094972067, + "grad_norm": 0.4632507264614105, + "learning_rate": 0.00023515406162464986, + "loss": 0.4111, + "step": 27419 + }, + { + "epoch": 15.318435754189943, + "grad_norm": 0.370576947927475, + "learning_rate": 0.00023512605042016807, + "loss": 0.3633, + "step": 27420 + }, + { + "epoch": 15.318994413407822, + "grad_norm": 0.5131366848945618, + "learning_rate": 0.0002350980392156863, + "loss": 0.4594, + "step": 27421 + }, + { + "epoch": 15.319553072625698, + "grad_norm": 1.1148682832717896, + "learning_rate": 0.00023507002801120448, + "loss": 0.3624, + "step": 27422 + }, + { + "epoch": 15.320111731843575, + "grad_norm": 0.47584646940231323, + "learning_rate": 0.00023504201680672268, + "loss": 0.4187, + "step": 27423 + }, + { + "epoch": 15.320670391061453, + "grad_norm": 0.6456692218780518, + "learning_rate": 0.0002350140056022409, + "loss": 0.5032, + "step": 27424 + }, + { + "epoch": 15.32122905027933, + "grad_norm": 0.5331834554672241, + "learning_rate": 0.00023498599439775912, + "loss": 0.4274, + "step": 27425 + }, + { + "epoch": 15.321787709497206, + "grad_norm": 0.4325638711452484, + "learning_rate": 0.00023495798319327733, + "loss": 0.391, + "step": 27426 + }, + { + "epoch": 15.322346368715085, + "grad_norm": 0.4980892539024353, + "learning_rate": 0.0002349299719887955, + "loss": 0.4272, + "step": 27427 + }, + { + "epoch": 15.322905027932961, + "grad_norm": 0.6630984544754028, + "learning_rate": 0.00023490196078431371, + "loss": 0.471, + "step": 27428 + }, + { + "epoch": 15.323463687150838, + "grad_norm": 0.5613689422607422, + "learning_rate": 0.00023487394957983195, + "loss": 0.4831, + "step": 27429 + }, + { + "epoch": 15.324022346368714, + "grad_norm": 0.5826461315155029, + "learning_rate": 0.00023484593837535015, + "loss": 0.3759, + "step": 27430 + }, + { + "epoch": 15.324581005586593, + "grad_norm": 0.38186895847320557, + "learning_rate": 0.00023481792717086836, + "loss": 0.3279, + "step": 27431 + }, + { + "epoch": 15.32513966480447, + "grad_norm": 0.8778761625289917, + "learning_rate": 0.00023478991596638654, + "loss": 0.4397, + "step": 27432 + }, + { + "epoch": 15.325698324022346, + "grad_norm": 0.40523669123649597, + "learning_rate": 0.00023476190476190477, + "loss": 0.3948, + "step": 27433 + }, + { + "epoch": 15.326256983240224, + "grad_norm": 0.4401901364326477, + "learning_rate": 0.00023473389355742298, + "loss": 0.51, + "step": 27434 + }, + { + "epoch": 15.3268156424581, + "grad_norm": 0.32560133934020996, + "learning_rate": 0.00023470588235294118, + "loss": 0.3507, + "step": 27435 + }, + { + "epoch": 15.327374301675977, + "grad_norm": 0.9157285094261169, + "learning_rate": 0.0002346778711484594, + "loss": 0.37, + "step": 27436 + }, + { + "epoch": 15.327932960893854, + "grad_norm": 0.4645446836948395, + "learning_rate": 0.0002346498599439776, + "loss": 0.4232, + "step": 27437 + }, + { + "epoch": 15.328491620111732, + "grad_norm": 0.48706427216529846, + "learning_rate": 0.0002346218487394958, + "loss": 0.4315, + "step": 27438 + }, + { + "epoch": 15.329050279329609, + "grad_norm": 0.38807767629623413, + "learning_rate": 0.000234593837535014, + "loss": 0.4093, + "step": 27439 + }, + { + "epoch": 15.329608938547485, + "grad_norm": 0.40641501545906067, + "learning_rate": 0.0002345658263305322, + "loss": 0.2849, + "step": 27440 + }, + { + "epoch": 15.330167597765364, + "grad_norm": 0.5182761549949646, + "learning_rate": 0.00023453781512605045, + "loss": 0.4397, + "step": 27441 + }, + { + "epoch": 15.33072625698324, + "grad_norm": 0.4478176236152649, + "learning_rate": 0.00023450980392156862, + "loss": 0.4226, + "step": 27442 + }, + { + "epoch": 15.331284916201117, + "grad_norm": 0.6990725994110107, + "learning_rate": 0.00023448179271708683, + "loss": 0.4019, + "step": 27443 + }, + { + "epoch": 15.331843575418995, + "grad_norm": 0.3783870041370392, + "learning_rate": 0.00023445378151260504, + "loss": 0.3643, + "step": 27444 + }, + { + "epoch": 15.332402234636872, + "grad_norm": 1.6086512804031372, + "learning_rate": 0.00023442577030812327, + "loss": 0.4534, + "step": 27445 + }, + { + "epoch": 15.332960893854748, + "grad_norm": 0.444332480430603, + "learning_rate": 0.00023439775910364148, + "loss": 0.4341, + "step": 27446 + }, + { + "epoch": 15.333519553072625, + "grad_norm": 0.40880537033081055, + "learning_rate": 0.00023436974789915965, + "loss": 0.388, + "step": 27447 + }, + { + "epoch": 15.334078212290503, + "grad_norm": 0.40572336316108704, + "learning_rate": 0.00023434173669467786, + "loss": 0.447, + "step": 27448 + }, + { + "epoch": 15.33463687150838, + "grad_norm": 0.5504960417747498, + "learning_rate": 0.0002343137254901961, + "loss": 0.4271, + "step": 27449 + }, + { + "epoch": 15.335195530726256, + "grad_norm": 0.3967116177082062, + "learning_rate": 0.0002342857142857143, + "loss": 0.442, + "step": 27450 + }, + { + "epoch": 15.335754189944135, + "grad_norm": 1.4559507369995117, + "learning_rate": 0.0002342577030812325, + "loss": 0.3804, + "step": 27451 + }, + { + "epoch": 15.336312849162011, + "grad_norm": 0.5857447981834412, + "learning_rate": 0.00023422969187675068, + "loss": 0.4832, + "step": 27452 + }, + { + "epoch": 15.336871508379888, + "grad_norm": 1.276358962059021, + "learning_rate": 0.00023420168067226892, + "loss": 0.4484, + "step": 27453 + }, + { + "epoch": 15.337430167597766, + "grad_norm": 0.47999775409698486, + "learning_rate": 0.00023417366946778712, + "loss": 0.3877, + "step": 27454 + }, + { + "epoch": 15.337988826815643, + "grad_norm": 0.459094762802124, + "learning_rate": 0.00023414565826330533, + "loss": 0.3145, + "step": 27455 + }, + { + "epoch": 15.33854748603352, + "grad_norm": 3.09444522857666, + "learning_rate": 0.00023411764705882354, + "loss": 0.4672, + "step": 27456 + }, + { + "epoch": 15.339106145251396, + "grad_norm": 0.9074259996414185, + "learning_rate": 0.00023408963585434174, + "loss": 0.4762, + "step": 27457 + }, + { + "epoch": 15.339664804469274, + "grad_norm": 0.49666330218315125, + "learning_rate": 0.00023406162464985995, + "loss": 0.5451, + "step": 27458 + }, + { + "epoch": 15.34022346368715, + "grad_norm": 0.6739406585693359, + "learning_rate": 0.00023403361344537815, + "loss": 0.3897, + "step": 27459 + }, + { + "epoch": 15.340782122905027, + "grad_norm": 0.49086982011795044, + "learning_rate": 0.00023400560224089636, + "loss": 0.3274, + "step": 27460 + }, + { + "epoch": 15.341340782122906, + "grad_norm": 0.3382643163204193, + "learning_rate": 0.0002339775910364146, + "loss": 0.3374, + "step": 27461 + }, + { + "epoch": 15.341899441340782, + "grad_norm": 0.5720822811126709, + "learning_rate": 0.00023394957983193277, + "loss": 0.447, + "step": 27462 + }, + { + "epoch": 15.342458100558659, + "grad_norm": 0.39611977338790894, + "learning_rate": 0.00023392156862745098, + "loss": 0.4714, + "step": 27463 + }, + { + "epoch": 15.343016759776535, + "grad_norm": 0.6396960020065308, + "learning_rate": 0.00023389355742296918, + "loss": 0.527, + "step": 27464 + }, + { + "epoch": 15.343575418994414, + "grad_norm": 0.38685473799705505, + "learning_rate": 0.00023386554621848742, + "loss": 0.3946, + "step": 27465 + }, + { + "epoch": 15.34413407821229, + "grad_norm": 0.36341413855552673, + "learning_rate": 0.00023383753501400562, + "loss": 0.3138, + "step": 27466 + }, + { + "epoch": 15.344692737430167, + "grad_norm": 0.7965563535690308, + "learning_rate": 0.0002338095238095238, + "loss": 0.5117, + "step": 27467 + }, + { + "epoch": 15.345251396648045, + "grad_norm": 0.41985607147216797, + "learning_rate": 0.000233781512605042, + "loss": 0.542, + "step": 27468 + }, + { + "epoch": 15.345810055865922, + "grad_norm": 0.38819757103919983, + "learning_rate": 0.00023375350140056024, + "loss": 0.383, + "step": 27469 + }, + { + "epoch": 15.346368715083798, + "grad_norm": 0.5193238258361816, + "learning_rate": 0.00023372549019607845, + "loss": 0.4748, + "step": 27470 + }, + { + "epoch": 15.346927374301677, + "grad_norm": 0.4518737494945526, + "learning_rate": 0.00023369747899159665, + "loss": 0.3561, + "step": 27471 + }, + { + "epoch": 15.347486033519553, + "grad_norm": 0.8575886487960815, + "learning_rate": 0.00023366946778711483, + "loss": 0.4776, + "step": 27472 + }, + { + "epoch": 15.34804469273743, + "grad_norm": 0.3266448378562927, + "learning_rate": 0.00023364145658263307, + "loss": 0.2543, + "step": 27473 + }, + { + "epoch": 15.348603351955306, + "grad_norm": 1.4699512720108032, + "learning_rate": 0.00023361344537815127, + "loss": 0.5686, + "step": 27474 + }, + { + "epoch": 15.349162011173185, + "grad_norm": 0.35907161235809326, + "learning_rate": 0.00023358543417366948, + "loss": 0.3322, + "step": 27475 + }, + { + "epoch": 15.349720670391061, + "grad_norm": 0.5248056054115295, + "learning_rate": 0.00023355742296918766, + "loss": 0.3766, + "step": 27476 + }, + { + "epoch": 15.350279329608938, + "grad_norm": 0.447727769613266, + "learning_rate": 0.0002335294117647059, + "loss": 0.4298, + "step": 27477 + }, + { + "epoch": 15.350837988826816, + "grad_norm": 0.6774367094039917, + "learning_rate": 0.0002335014005602241, + "loss": 0.4092, + "step": 27478 + }, + { + "epoch": 15.351396648044693, + "grad_norm": 0.385184109210968, + "learning_rate": 0.0002334733893557423, + "loss": 0.3924, + "step": 27479 + }, + { + "epoch": 15.35195530726257, + "grad_norm": 0.5629845857620239, + "learning_rate": 0.0002334453781512605, + "loss": 0.4709, + "step": 27480 + }, + { + "epoch": 15.352513966480448, + "grad_norm": 2.9537107944488525, + "learning_rate": 0.0002334173669467787, + "loss": 0.4158, + "step": 27481 + }, + { + "epoch": 15.353072625698324, + "grad_norm": 2.4394400119781494, + "learning_rate": 0.00023338935574229692, + "loss": 0.3627, + "step": 27482 + }, + { + "epoch": 15.3536312849162, + "grad_norm": 0.49361085891723633, + "learning_rate": 0.00023336134453781512, + "loss": 0.4637, + "step": 27483 + }, + { + "epoch": 15.354189944134077, + "grad_norm": 0.40729281306266785, + "learning_rate": 0.00023333333333333333, + "loss": 0.3397, + "step": 27484 + }, + { + "epoch": 15.354748603351956, + "grad_norm": 0.41827327013015747, + "learning_rate": 0.00023330532212885156, + "loss": 0.3922, + "step": 27485 + }, + { + "epoch": 15.355307262569832, + "grad_norm": 0.7532612681388855, + "learning_rate": 0.00023327731092436974, + "loss": 0.456, + "step": 27486 + }, + { + "epoch": 15.355865921787709, + "grad_norm": 0.654574453830719, + "learning_rate": 0.00023324929971988795, + "loss": 0.4426, + "step": 27487 + }, + { + "epoch": 15.356424581005587, + "grad_norm": 0.7738398909568787, + "learning_rate": 0.00023322128851540615, + "loss": 0.3934, + "step": 27488 + }, + { + "epoch": 15.356983240223464, + "grad_norm": 0.4268757700920105, + "learning_rate": 0.0002331932773109244, + "loss": 0.4565, + "step": 27489 + }, + { + "epoch": 15.35754189944134, + "grad_norm": 0.5913812518119812, + "learning_rate": 0.0002331652661064426, + "loss": 0.4944, + "step": 27490 + }, + { + "epoch": 15.358100558659217, + "grad_norm": 0.9433100819587708, + "learning_rate": 0.00023313725490196077, + "loss": 0.4831, + "step": 27491 + }, + { + "epoch": 15.358659217877095, + "grad_norm": 0.3646106719970703, + "learning_rate": 0.00023310924369747898, + "loss": 0.3796, + "step": 27492 + }, + { + "epoch": 15.359217877094972, + "grad_norm": 1.3304804563522339, + "learning_rate": 0.0002330812324929972, + "loss": 0.6214, + "step": 27493 + }, + { + "epoch": 15.359776536312848, + "grad_norm": 0.6899111270904541, + "learning_rate": 0.00023305322128851542, + "loss": 0.4288, + "step": 27494 + }, + { + "epoch": 15.360335195530727, + "grad_norm": 0.5661269426345825, + "learning_rate": 0.00023302521008403362, + "loss": 0.4828, + "step": 27495 + }, + { + "epoch": 15.360893854748603, + "grad_norm": 0.3339625597000122, + "learning_rate": 0.0002329971988795518, + "loss": 0.3689, + "step": 27496 + }, + { + "epoch": 15.36145251396648, + "grad_norm": 0.4951328933238983, + "learning_rate": 0.00023296918767507004, + "loss": 0.4463, + "step": 27497 + }, + { + "epoch": 15.362011173184358, + "grad_norm": 0.44402211904525757, + "learning_rate": 0.00023294117647058824, + "loss": 0.4084, + "step": 27498 + }, + { + "epoch": 15.362569832402235, + "grad_norm": 0.6998577117919922, + "learning_rate": 0.00023291316526610645, + "loss": 0.6134, + "step": 27499 + }, + { + "epoch": 15.363128491620111, + "grad_norm": 0.39879918098449707, + "learning_rate": 0.00023288515406162465, + "loss": 0.3641, + "step": 27500 + }, + { + "epoch": 15.363128491620111, + "eval_cer": 0.08602009754181532, + "eval_loss": 0.3251490890979767, + "eval_runtime": 55.761, + "eval_samples_per_second": 81.383, + "eval_steps_per_second": 5.093, + "eval_wer": 0.33909667327728277, + "step": 27500 + }, + { + "epoch": 15.363687150837988, + "grad_norm": 0.44014763832092285, + "learning_rate": 0.00023285714285714286, + "loss": 0.3847, + "step": 27501 + }, + { + "epoch": 15.364245810055866, + "grad_norm": 0.450112909078598, + "learning_rate": 0.00023282913165266107, + "loss": 0.5513, + "step": 27502 + }, + { + "epoch": 15.364804469273743, + "grad_norm": 0.42415347695350647, + "learning_rate": 0.00023280112044817927, + "loss": 0.4425, + "step": 27503 + }, + { + "epoch": 15.36536312849162, + "grad_norm": 0.5702974200248718, + "learning_rate": 0.00023277310924369748, + "loss": 0.2753, + "step": 27504 + }, + { + "epoch": 15.365921787709498, + "grad_norm": 7.577267169952393, + "learning_rate": 0.0002327450980392157, + "loss": 0.47, + "step": 27505 + }, + { + "epoch": 15.366480446927374, + "grad_norm": 0.6406427025794983, + "learning_rate": 0.0002327170868347339, + "loss": 0.4608, + "step": 27506 + }, + { + "epoch": 15.367039106145251, + "grad_norm": 0.6958872079849243, + "learning_rate": 0.0002326890756302521, + "loss": 0.5317, + "step": 27507 + }, + { + "epoch": 15.36759776536313, + "grad_norm": 0.751998782157898, + "learning_rate": 0.0002326610644257703, + "loss": 0.4889, + "step": 27508 + }, + { + "epoch": 15.368156424581006, + "grad_norm": 0.7922876477241516, + "learning_rate": 0.00023263305322128854, + "loss": 0.3869, + "step": 27509 + }, + { + "epoch": 15.368715083798882, + "grad_norm": 0.4760020673274994, + "learning_rate": 0.00023260504201680674, + "loss": 0.4252, + "step": 27510 + }, + { + "epoch": 15.369273743016759, + "grad_norm": 0.6070950031280518, + "learning_rate": 0.00023257703081232492, + "loss": 0.529, + "step": 27511 + }, + { + "epoch": 15.369832402234637, + "grad_norm": 0.4527904689311981, + "learning_rate": 0.00023254901960784313, + "loss": 0.4788, + "step": 27512 + }, + { + "epoch": 15.370391061452514, + "grad_norm": 0.5024074912071228, + "learning_rate": 0.00023252100840336136, + "loss": 0.4286, + "step": 27513 + }, + { + "epoch": 15.37094972067039, + "grad_norm": 0.6211361885070801, + "learning_rate": 0.00023249299719887957, + "loss": 0.5245, + "step": 27514 + }, + { + "epoch": 15.371508379888269, + "grad_norm": 0.38150015473365784, + "learning_rate": 0.00023246498599439777, + "loss": 0.3711, + "step": 27515 + }, + { + "epoch": 15.372067039106145, + "grad_norm": 0.42352208495140076, + "learning_rate": 0.00023243697478991595, + "loss": 0.4494, + "step": 27516 + }, + { + "epoch": 15.372625698324022, + "grad_norm": 0.7814956903457642, + "learning_rate": 0.00023240896358543418, + "loss": 0.4101, + "step": 27517 + }, + { + "epoch": 15.3731843575419, + "grad_norm": 0.4737066924571991, + "learning_rate": 0.0002323809523809524, + "loss": 0.401, + "step": 27518 + }, + { + "epoch": 15.373743016759777, + "grad_norm": 0.6003987789154053, + "learning_rate": 0.0002323529411764706, + "loss": 0.4746, + "step": 27519 + }, + { + "epoch": 15.374301675977653, + "grad_norm": 0.4094335436820984, + "learning_rate": 0.0002323249299719888, + "loss": 0.4024, + "step": 27520 + }, + { + "epoch": 15.37486033519553, + "grad_norm": 0.33603543043136597, + "learning_rate": 0.000232296918767507, + "loss": 0.3844, + "step": 27521 + }, + { + "epoch": 15.375418994413408, + "grad_norm": 0.5068058967590332, + "learning_rate": 0.0002322689075630252, + "loss": 0.4591, + "step": 27522 + }, + { + "epoch": 15.375977653631285, + "grad_norm": 0.48952800035476685, + "learning_rate": 0.00023224089635854342, + "loss": 0.5202, + "step": 27523 + }, + { + "epoch": 15.376536312849161, + "grad_norm": 1.1990879774093628, + "learning_rate": 0.00023221288515406162, + "loss": 0.4939, + "step": 27524 + }, + { + "epoch": 15.37709497206704, + "grad_norm": 0.3486270308494568, + "learning_rate": 0.00023218487394957986, + "loss": 0.4496, + "step": 27525 + }, + { + "epoch": 15.377653631284916, + "grad_norm": 0.41330480575561523, + "learning_rate": 0.00023215686274509804, + "loss": 0.3874, + "step": 27526 + }, + { + "epoch": 15.378212290502793, + "grad_norm": 0.42523783445358276, + "learning_rate": 0.00023212885154061624, + "loss": 0.4283, + "step": 27527 + }, + { + "epoch": 15.378770949720671, + "grad_norm": 4.902891635894775, + "learning_rate": 0.00023210084033613445, + "loss": 0.5188, + "step": 27528 + }, + { + "epoch": 15.379329608938548, + "grad_norm": 0.5284238457679749, + "learning_rate": 0.00023207282913165268, + "loss": 0.495, + "step": 27529 + }, + { + "epoch": 15.379888268156424, + "grad_norm": 0.6698698997497559, + "learning_rate": 0.0002320448179271709, + "loss": 0.4149, + "step": 27530 + }, + { + "epoch": 15.380446927374301, + "grad_norm": 0.4035865366458893, + "learning_rate": 0.00023201680672268907, + "loss": 0.3462, + "step": 27531 + }, + { + "epoch": 15.38100558659218, + "grad_norm": 0.5787323713302612, + "learning_rate": 0.00023198879551820727, + "loss": 0.4516, + "step": 27532 + }, + { + "epoch": 15.381564245810056, + "grad_norm": 0.4580431878566742, + "learning_rate": 0.0002319607843137255, + "loss": 0.4168, + "step": 27533 + }, + { + "epoch": 15.382122905027932, + "grad_norm": 0.4119787812232971, + "learning_rate": 0.0002319327731092437, + "loss": 0.3618, + "step": 27534 + }, + { + "epoch": 15.38268156424581, + "grad_norm": 0.5767946243286133, + "learning_rate": 0.0002319047619047619, + "loss": 0.4409, + "step": 27535 + }, + { + "epoch": 15.383240223463687, + "grad_norm": 0.3838423788547516, + "learning_rate": 0.0002318767507002801, + "loss": 0.3623, + "step": 27536 + }, + { + "epoch": 15.383798882681564, + "grad_norm": 0.47653728723526, + "learning_rate": 0.00023184873949579833, + "loss": 0.4988, + "step": 27537 + }, + { + "epoch": 15.38435754189944, + "grad_norm": 0.670563280582428, + "learning_rate": 0.00023182072829131654, + "loss": 0.3875, + "step": 27538 + }, + { + "epoch": 15.384916201117319, + "grad_norm": 0.7059043645858765, + "learning_rate": 0.00023179271708683474, + "loss": 0.5393, + "step": 27539 + }, + { + "epoch": 15.385474860335195, + "grad_norm": 0.5428305864334106, + "learning_rate": 0.00023176470588235292, + "loss": 0.3491, + "step": 27540 + }, + { + "epoch": 15.386033519553072, + "grad_norm": 1.0532699823379517, + "learning_rate": 0.00023173669467787115, + "loss": 0.3515, + "step": 27541 + }, + { + "epoch": 15.38659217877095, + "grad_norm": 0.8265397548675537, + "learning_rate": 0.00023170868347338936, + "loss": 0.4631, + "step": 27542 + }, + { + "epoch": 15.387150837988827, + "grad_norm": 0.30975237488746643, + "learning_rate": 0.00023168067226890757, + "loss": 0.3779, + "step": 27543 + }, + { + "epoch": 15.387709497206703, + "grad_norm": 0.5013826489448547, + "learning_rate": 0.00023165266106442577, + "loss": 0.494, + "step": 27544 + }, + { + "epoch": 15.388268156424582, + "grad_norm": 0.5017936825752258, + "learning_rate": 0.00023162464985994398, + "loss": 0.4105, + "step": 27545 + }, + { + "epoch": 15.388826815642458, + "grad_norm": 0.543145477771759, + "learning_rate": 0.00023159663865546218, + "loss": 0.3445, + "step": 27546 + }, + { + "epoch": 15.389385474860335, + "grad_norm": 2.5300023555755615, + "learning_rate": 0.0002315686274509804, + "loss": 0.3829, + "step": 27547 + }, + { + "epoch": 15.389944134078211, + "grad_norm": 0.4205876886844635, + "learning_rate": 0.0002315406162464986, + "loss": 0.4657, + "step": 27548 + }, + { + "epoch": 15.39050279329609, + "grad_norm": 0.30096274614334106, + "learning_rate": 0.00023151260504201683, + "loss": 0.352, + "step": 27549 + }, + { + "epoch": 15.391061452513966, + "grad_norm": 0.42650893330574036, + "learning_rate": 0.000231484593837535, + "loss": 0.3451, + "step": 27550 + }, + { + "epoch": 15.391620111731843, + "grad_norm": 0.3083764314651489, + "learning_rate": 0.00023145658263305321, + "loss": 0.3978, + "step": 27551 + }, + { + "epoch": 15.392178770949721, + "grad_norm": 0.4789768159389496, + "learning_rate": 0.00023142857142857142, + "loss": 0.3737, + "step": 27552 + }, + { + "epoch": 15.392737430167598, + "grad_norm": 0.5054426789283752, + "learning_rate": 0.00023140056022408965, + "loss": 0.41, + "step": 27553 + }, + { + "epoch": 15.393296089385474, + "grad_norm": 2.1758203506469727, + "learning_rate": 0.00023137254901960786, + "loss": 0.4607, + "step": 27554 + }, + { + "epoch": 15.393854748603353, + "grad_norm": 0.39306533336639404, + "learning_rate": 0.00023134453781512604, + "loss": 0.3265, + "step": 27555 + }, + { + "epoch": 15.39441340782123, + "grad_norm": 0.7310125827789307, + "learning_rate": 0.00023131652661064424, + "loss": 0.481, + "step": 27556 + }, + { + "epoch": 15.394972067039106, + "grad_norm": 0.674607515335083, + "learning_rate": 0.00023128851540616248, + "loss": 0.3696, + "step": 27557 + }, + { + "epoch": 15.395530726256982, + "grad_norm": 0.637444257736206, + "learning_rate": 0.00023126050420168068, + "loss": 0.5676, + "step": 27558 + }, + { + "epoch": 15.39608938547486, + "grad_norm": 0.2863808572292328, + "learning_rate": 0.0002312324929971989, + "loss": 0.2788, + "step": 27559 + }, + { + "epoch": 15.396648044692737, + "grad_norm": 0.5081846714019775, + "learning_rate": 0.00023120448179271707, + "loss": 0.4052, + "step": 27560 + }, + { + "epoch": 15.397206703910614, + "grad_norm": 0.33326709270477295, + "learning_rate": 0.0002311764705882353, + "loss": 0.377, + "step": 27561 + }, + { + "epoch": 15.397765363128492, + "grad_norm": 0.33083057403564453, + "learning_rate": 0.0002311484593837535, + "loss": 0.2984, + "step": 27562 + }, + { + "epoch": 15.398324022346369, + "grad_norm": 0.45589569211006165, + "learning_rate": 0.0002311204481792717, + "loss": 0.4344, + "step": 27563 + }, + { + "epoch": 15.398882681564245, + "grad_norm": 0.38245394825935364, + "learning_rate": 0.00023109243697478992, + "loss": 0.3602, + "step": 27564 + }, + { + "epoch": 15.399441340782122, + "grad_norm": 0.39443495869636536, + "learning_rate": 0.00023106442577030812, + "loss": 0.3611, + "step": 27565 + }, + { + "epoch": 15.4, + "grad_norm": 0.4073416590690613, + "learning_rate": 0.00023103641456582633, + "loss": 0.4346, + "step": 27566 + }, + { + "epoch": 15.400558659217877, + "grad_norm": 0.5925625562667847, + "learning_rate": 0.00023100840336134454, + "loss": 0.476, + "step": 27567 + }, + { + "epoch": 15.401117318435753, + "grad_norm": 0.522760808467865, + "learning_rate": 0.00023098039215686274, + "loss": 0.4629, + "step": 27568 + }, + { + "epoch": 15.401675977653632, + "grad_norm": 0.737259030342102, + "learning_rate": 0.00023095238095238098, + "loss": 0.3871, + "step": 27569 + }, + { + "epoch": 15.402234636871508, + "grad_norm": 0.42474088072776794, + "learning_rate": 0.00023092436974789915, + "loss": 0.4824, + "step": 27570 + }, + { + "epoch": 15.402793296089385, + "grad_norm": 0.5107524394989014, + "learning_rate": 0.00023089635854341736, + "loss": 0.4556, + "step": 27571 + }, + { + "epoch": 15.403351955307263, + "grad_norm": 0.7858421802520752, + "learning_rate": 0.00023086834733893557, + "loss": 0.4925, + "step": 27572 + }, + { + "epoch": 15.40391061452514, + "grad_norm": 0.5118883848190308, + "learning_rate": 0.0002308403361344538, + "loss": 0.3384, + "step": 27573 + }, + { + "epoch": 15.404469273743016, + "grad_norm": 0.5859266519546509, + "learning_rate": 0.000230812324929972, + "loss": 0.3735, + "step": 27574 + }, + { + "epoch": 15.405027932960893, + "grad_norm": 0.6551598906517029, + "learning_rate": 0.00023078431372549018, + "loss": 0.4377, + "step": 27575 + }, + { + "epoch": 15.405586592178771, + "grad_norm": 1.1171528100967407, + "learning_rate": 0.0002307563025210084, + "loss": 0.4845, + "step": 27576 + }, + { + "epoch": 15.406145251396648, + "grad_norm": 0.39609792828559875, + "learning_rate": 0.00023072829131652662, + "loss": 0.5033, + "step": 27577 + }, + { + "epoch": 15.406703910614524, + "grad_norm": 0.39364534616470337, + "learning_rate": 0.00023070028011204483, + "loss": 0.3599, + "step": 27578 + }, + { + "epoch": 15.407262569832403, + "grad_norm": 0.5504368543624878, + "learning_rate": 0.00023067226890756304, + "loss": 0.751, + "step": 27579 + }, + { + "epoch": 15.40782122905028, + "grad_norm": 0.3931092917919159, + "learning_rate": 0.00023064425770308121, + "loss": 0.2672, + "step": 27580 + }, + { + "epoch": 15.408379888268156, + "grad_norm": 0.55729740858078, + "learning_rate": 0.00023061624649859945, + "loss": 0.4177, + "step": 27581 + }, + { + "epoch": 15.408938547486034, + "grad_norm": 0.4239175319671631, + "learning_rate": 0.00023058823529411765, + "loss": 0.451, + "step": 27582 + }, + { + "epoch": 15.40949720670391, + "grad_norm": 0.4770985245704651, + "learning_rate": 0.00023056022408963586, + "loss": 0.3901, + "step": 27583 + }, + { + "epoch": 15.410055865921787, + "grad_norm": 0.6786672472953796, + "learning_rate": 0.0002305322128851541, + "loss": 0.4976, + "step": 27584 + }, + { + "epoch": 15.410614525139664, + "grad_norm": 0.582874596118927, + "learning_rate": 0.00023050420168067227, + "loss": 0.3703, + "step": 27585 + }, + { + "epoch": 15.411173184357542, + "grad_norm": 0.8771749138832092, + "learning_rate": 0.00023047619047619048, + "loss": 0.3139, + "step": 27586 + }, + { + "epoch": 15.411731843575419, + "grad_norm": 0.5681930780410767, + "learning_rate": 0.00023044817927170868, + "loss": 0.3656, + "step": 27587 + }, + { + "epoch": 15.412290502793295, + "grad_norm": 0.3547438383102417, + "learning_rate": 0.00023042016806722692, + "loss": 0.3947, + "step": 27588 + }, + { + "epoch": 15.412849162011174, + "grad_norm": 0.42386239767074585, + "learning_rate": 0.0002303921568627451, + "loss": 0.3042, + "step": 27589 + }, + { + "epoch": 15.41340782122905, + "grad_norm": 2.890068531036377, + "learning_rate": 0.0002303641456582633, + "loss": 0.3073, + "step": 27590 + }, + { + "epoch": 15.413966480446927, + "grad_norm": 2.2874836921691895, + "learning_rate": 0.0002303361344537815, + "loss": 0.618, + "step": 27591 + }, + { + "epoch": 15.414525139664805, + "grad_norm": 0.43156924843788147, + "learning_rate": 0.00023030812324929974, + "loss": 0.331, + "step": 27592 + }, + { + "epoch": 15.415083798882682, + "grad_norm": 2.0707414150238037, + "learning_rate": 0.00023028011204481795, + "loss": 0.3955, + "step": 27593 + }, + { + "epoch": 15.415642458100558, + "grad_norm": 0.8385738730430603, + "learning_rate": 0.00023025210084033613, + "loss": 0.3018, + "step": 27594 + }, + { + "epoch": 15.416201117318435, + "grad_norm": 0.5015643239021301, + "learning_rate": 0.00023022408963585433, + "loss": 0.3306, + "step": 27595 + }, + { + "epoch": 15.416759776536313, + "grad_norm": 1.2832459211349487, + "learning_rate": 0.00023019607843137256, + "loss": 0.3434, + "step": 27596 + }, + { + "epoch": 15.41731843575419, + "grad_norm": 0.4637226462364197, + "learning_rate": 0.00023016806722689077, + "loss": 0.4113, + "step": 27597 + }, + { + "epoch": 15.417877094972066, + "grad_norm": 0.6637462973594666, + "learning_rate": 0.00023014005602240898, + "loss": 0.4451, + "step": 27598 + }, + { + "epoch": 15.418435754189945, + "grad_norm": 0.3724496364593506, + "learning_rate": 0.00023011204481792716, + "loss": 0.3291, + "step": 27599 + }, + { + "epoch": 15.418994413407821, + "grad_norm": 0.3903222382068634, + "learning_rate": 0.0002300840336134454, + "loss": 0.3271, + "step": 27600 + }, + { + "epoch": 15.419553072625698, + "grad_norm": 4.25506067276001, + "learning_rate": 0.0002300560224089636, + "loss": 0.4898, + "step": 27601 + }, + { + "epoch": 15.420111731843576, + "grad_norm": 0.41180357336997986, + "learning_rate": 0.0002300280112044818, + "loss": 0.322, + "step": 27602 + }, + { + "epoch": 15.420670391061453, + "grad_norm": 0.6076601147651672, + "learning_rate": 0.00023, + "loss": 0.4253, + "step": 27603 + }, + { + "epoch": 15.42122905027933, + "grad_norm": 0.7646205425262451, + "learning_rate": 0.0002299719887955182, + "loss": 0.4349, + "step": 27604 + }, + { + "epoch": 15.421787709497206, + "grad_norm": 0.3251868784427643, + "learning_rate": 0.00022994397759103642, + "loss": 0.2879, + "step": 27605 + }, + { + "epoch": 15.422346368715084, + "grad_norm": 0.6212793588638306, + "learning_rate": 0.00022991596638655462, + "loss": 0.3858, + "step": 27606 + }, + { + "epoch": 15.422905027932961, + "grad_norm": 0.46519187092781067, + "learning_rate": 0.00022988795518207283, + "loss": 0.4382, + "step": 27607 + }, + { + "epoch": 15.423463687150837, + "grad_norm": 0.8211582899093628, + "learning_rate": 0.00022985994397759106, + "loss": 0.3877, + "step": 27608 + }, + { + "epoch": 15.424022346368716, + "grad_norm": 0.4072891175746918, + "learning_rate": 0.00022983193277310924, + "loss": 0.3507, + "step": 27609 + }, + { + "epoch": 15.424581005586592, + "grad_norm": 0.5349559783935547, + "learning_rate": 0.00022980392156862745, + "loss": 0.4666, + "step": 27610 + }, + { + "epoch": 15.425139664804469, + "grad_norm": 0.7876636981964111, + "learning_rate": 0.00022977591036414565, + "loss": 0.4664, + "step": 27611 + }, + { + "epoch": 15.425698324022346, + "grad_norm": 0.3492244482040405, + "learning_rate": 0.0002297478991596639, + "loss": 0.4843, + "step": 27612 + }, + { + "epoch": 15.426256983240224, + "grad_norm": 0.3406340777873993, + "learning_rate": 0.0002297198879551821, + "loss": 0.3334, + "step": 27613 + }, + { + "epoch": 15.4268156424581, + "grad_norm": 0.7022618055343628, + "learning_rate": 0.00022969187675070027, + "loss": 0.4076, + "step": 27614 + }, + { + "epoch": 15.427374301675977, + "grad_norm": 0.7164230346679688, + "learning_rate": 0.00022966386554621848, + "loss": 0.3755, + "step": 27615 + }, + { + "epoch": 15.427932960893855, + "grad_norm": 0.5902921557426453, + "learning_rate": 0.0002296358543417367, + "loss": 0.3245, + "step": 27616 + }, + { + "epoch": 15.428491620111732, + "grad_norm": 0.38953524827957153, + "learning_rate": 0.00022960784313725492, + "loss": 0.3491, + "step": 27617 + }, + { + "epoch": 15.429050279329608, + "grad_norm": 0.48069486021995544, + "learning_rate": 0.00022957983193277312, + "loss": 0.3791, + "step": 27618 + }, + { + "epoch": 15.429608938547487, + "grad_norm": 0.7019156217575073, + "learning_rate": 0.0002295518207282913, + "loss": 0.4645, + "step": 27619 + }, + { + "epoch": 15.430167597765363, + "grad_norm": 0.4355953335762024, + "learning_rate": 0.00022952380952380954, + "loss": 0.3615, + "step": 27620 + }, + { + "epoch": 15.43072625698324, + "grad_norm": 1.006739854812622, + "learning_rate": 0.00022949579831932774, + "loss": 0.7369, + "step": 27621 + }, + { + "epoch": 15.431284916201117, + "grad_norm": 0.4114830493927002, + "learning_rate": 0.00022946778711484595, + "loss": 0.4501, + "step": 27622 + }, + { + "epoch": 15.431843575418995, + "grad_norm": 0.5296586155891418, + "learning_rate": 0.00022943977591036415, + "loss": 0.3829, + "step": 27623 + }, + { + "epoch": 15.432402234636871, + "grad_norm": 0.6578084230422974, + "learning_rate": 0.00022941176470588236, + "loss": 0.4193, + "step": 27624 + }, + { + "epoch": 15.432960893854748, + "grad_norm": 0.4390453100204468, + "learning_rate": 0.00022938375350140057, + "loss": 0.4166, + "step": 27625 + }, + { + "epoch": 15.433519553072626, + "grad_norm": 0.3733435273170471, + "learning_rate": 0.00022935574229691877, + "loss": 0.3638, + "step": 27626 + }, + { + "epoch": 15.434078212290503, + "grad_norm": 0.7661147713661194, + "learning_rate": 0.00022932773109243698, + "loss": 0.4204, + "step": 27627 + }, + { + "epoch": 15.43463687150838, + "grad_norm": 0.46562889218330383, + "learning_rate": 0.0002292997198879552, + "loss": 0.3369, + "step": 27628 + }, + { + "epoch": 15.435195530726258, + "grad_norm": 0.38845834136009216, + "learning_rate": 0.0002292717086834734, + "loss": 0.4245, + "step": 27629 + }, + { + "epoch": 15.435754189944134, + "grad_norm": 0.3744841516017914, + "learning_rate": 0.0002292436974789916, + "loss": 0.3507, + "step": 27630 + }, + { + "epoch": 15.436312849162011, + "grad_norm": 0.8743610978126526, + "learning_rate": 0.0002292156862745098, + "loss": 0.362, + "step": 27631 + }, + { + "epoch": 15.436871508379888, + "grad_norm": 0.4833931028842926, + "learning_rate": 0.00022918767507002804, + "loss": 0.4614, + "step": 27632 + }, + { + "epoch": 15.437430167597766, + "grad_norm": 0.42429786920547485, + "learning_rate": 0.00022915966386554624, + "loss": 0.4175, + "step": 27633 + }, + { + "epoch": 15.437988826815642, + "grad_norm": 0.6156449317932129, + "learning_rate": 0.00022913165266106442, + "loss": 0.418, + "step": 27634 + }, + { + "epoch": 15.438547486033519, + "grad_norm": 1.6276191473007202, + "learning_rate": 0.00022910364145658263, + "loss": 0.4153, + "step": 27635 + }, + { + "epoch": 15.439106145251397, + "grad_norm": 0.5266149044036865, + "learning_rate": 0.00022907563025210086, + "loss": 0.3583, + "step": 27636 + }, + { + "epoch": 15.439664804469274, + "grad_norm": 11.407163619995117, + "learning_rate": 0.00022904761904761906, + "loss": 0.4658, + "step": 27637 + }, + { + "epoch": 15.44022346368715, + "grad_norm": 0.41940897703170776, + "learning_rate": 0.00022901960784313727, + "loss": 0.4096, + "step": 27638 + }, + { + "epoch": 15.440782122905027, + "grad_norm": 2.3815360069274902, + "learning_rate": 0.00022899159663865545, + "loss": 0.4761, + "step": 27639 + }, + { + "epoch": 15.441340782122905, + "grad_norm": 0.427706241607666, + "learning_rate": 0.00022896358543417368, + "loss": 0.35, + "step": 27640 + }, + { + "epoch": 15.441899441340782, + "grad_norm": 0.4956286549568176, + "learning_rate": 0.0002289355742296919, + "loss": 0.5104, + "step": 27641 + }, + { + "epoch": 15.442458100558659, + "grad_norm": 0.99051433801651, + "learning_rate": 0.0002289075630252101, + "loss": 0.4523, + "step": 27642 + }, + { + "epoch": 15.443016759776537, + "grad_norm": 0.5525454878807068, + "learning_rate": 0.00022887955182072827, + "loss": 0.3786, + "step": 27643 + }, + { + "epoch": 15.443575418994413, + "grad_norm": 1.143302083015442, + "learning_rate": 0.0002288515406162465, + "loss": 0.7081, + "step": 27644 + }, + { + "epoch": 15.44413407821229, + "grad_norm": 0.40622884035110474, + "learning_rate": 0.0002288235294117647, + "loss": 0.3518, + "step": 27645 + }, + { + "epoch": 15.444692737430168, + "grad_norm": 0.5262824892997742, + "learning_rate": 0.00022879551820728292, + "loss": 0.551, + "step": 27646 + }, + { + "epoch": 15.445251396648045, + "grad_norm": 0.5108913779258728, + "learning_rate": 0.00022876750700280112, + "loss": 0.4627, + "step": 27647 + }, + { + "epoch": 15.445810055865921, + "grad_norm": 0.4353354871273041, + "learning_rate": 0.00022873949579831933, + "loss": 0.3024, + "step": 27648 + }, + { + "epoch": 15.446368715083798, + "grad_norm": 0.36251866817474365, + "learning_rate": 0.00022871148459383754, + "loss": 0.3678, + "step": 27649 + }, + { + "epoch": 15.446927374301676, + "grad_norm": 0.43140944838523865, + "learning_rate": 0.00022868347338935574, + "loss": 0.3017, + "step": 27650 + }, + { + "epoch": 15.447486033519553, + "grad_norm": 0.6407912969589233, + "learning_rate": 0.00022865546218487395, + "loss": 0.4401, + "step": 27651 + }, + { + "epoch": 15.44804469273743, + "grad_norm": 0.49221348762512207, + "learning_rate": 0.00022862745098039218, + "loss": 0.4441, + "step": 27652 + }, + { + "epoch": 15.448603351955308, + "grad_norm": 0.3979596197605133, + "learning_rate": 0.00022859943977591036, + "loss": 0.3996, + "step": 27653 + }, + { + "epoch": 15.449162011173184, + "grad_norm": 0.4379895329475403, + "learning_rate": 0.00022857142857142857, + "loss": 0.3407, + "step": 27654 + }, + { + "epoch": 15.449720670391061, + "grad_norm": 0.4559272825717926, + "learning_rate": 0.00022854341736694677, + "loss": 0.4442, + "step": 27655 + }, + { + "epoch": 15.45027932960894, + "grad_norm": 0.4939734637737274, + "learning_rate": 0.000228515406162465, + "loss": 0.351, + "step": 27656 + }, + { + "epoch": 15.450837988826816, + "grad_norm": 0.6032205820083618, + "learning_rate": 0.0002284873949579832, + "loss": 0.3944, + "step": 27657 + }, + { + "epoch": 15.451396648044692, + "grad_norm": 0.45249828696250916, + "learning_rate": 0.0002284593837535014, + "loss": 0.4493, + "step": 27658 + }, + { + "epoch": 15.451955307262569, + "grad_norm": 0.6539236903190613, + "learning_rate": 0.0002284313725490196, + "loss": 0.3956, + "step": 27659 + }, + { + "epoch": 15.452513966480447, + "grad_norm": 1.2492121458053589, + "learning_rate": 0.00022840336134453783, + "loss": 0.3869, + "step": 27660 + }, + { + "epoch": 15.453072625698324, + "grad_norm": 0.627498984336853, + "learning_rate": 0.00022837535014005604, + "loss": 0.4193, + "step": 27661 + }, + { + "epoch": 15.4536312849162, + "grad_norm": 0.33169886469841003, + "learning_rate": 0.00022834733893557424, + "loss": 0.2996, + "step": 27662 + }, + { + "epoch": 15.454189944134079, + "grad_norm": 0.3716643452644348, + "learning_rate": 0.00022831932773109242, + "loss": 0.4158, + "step": 27663 + }, + { + "epoch": 15.454748603351955, + "grad_norm": 0.4901297986507416, + "learning_rate": 0.00022829131652661065, + "loss": 0.3603, + "step": 27664 + }, + { + "epoch": 15.455307262569832, + "grad_norm": 0.5403488278388977, + "learning_rate": 0.00022826330532212886, + "loss": 0.3732, + "step": 27665 + }, + { + "epoch": 15.45586592178771, + "grad_norm": 0.4044787883758545, + "learning_rate": 0.00022823529411764707, + "loss": 0.3437, + "step": 27666 + }, + { + "epoch": 15.456424581005587, + "grad_norm": 0.5171582698822021, + "learning_rate": 0.00022820728291316527, + "loss": 0.4816, + "step": 27667 + }, + { + "epoch": 15.456983240223463, + "grad_norm": 3.2613813877105713, + "learning_rate": 0.00022817927170868348, + "loss": 0.3555, + "step": 27668 + }, + { + "epoch": 15.45754189944134, + "grad_norm": 0.6673614978790283, + "learning_rate": 0.00022815126050420168, + "loss": 0.3789, + "step": 27669 + }, + { + "epoch": 15.458100558659218, + "grad_norm": 0.581941545009613, + "learning_rate": 0.0002281232492997199, + "loss": 0.4786, + "step": 27670 + }, + { + "epoch": 15.458659217877095, + "grad_norm": 0.5647519826889038, + "learning_rate": 0.0002280952380952381, + "loss": 0.4133, + "step": 27671 + }, + { + "epoch": 15.459217877094972, + "grad_norm": 1.0003045797348022, + "learning_rate": 0.00022806722689075633, + "loss": 0.4272, + "step": 27672 + }, + { + "epoch": 15.45977653631285, + "grad_norm": 0.779039204120636, + "learning_rate": 0.0002280392156862745, + "loss": 0.5387, + "step": 27673 + }, + { + "epoch": 15.460335195530726, + "grad_norm": 0.47317731380462646, + "learning_rate": 0.00022801120448179271, + "loss": 0.5029, + "step": 27674 + }, + { + "epoch": 15.460893854748603, + "grad_norm": 1.9812017679214478, + "learning_rate": 0.00022798319327731092, + "loss": 0.3512, + "step": 27675 + }, + { + "epoch": 15.461452513966481, + "grad_norm": 0.5163731575012207, + "learning_rate": 0.00022795518207282915, + "loss": 0.3531, + "step": 27676 + }, + { + "epoch": 15.462011173184358, + "grad_norm": 0.4579041302204132, + "learning_rate": 0.00022792717086834736, + "loss": 0.4395, + "step": 27677 + }, + { + "epoch": 15.462569832402234, + "grad_norm": 0.4563809633255005, + "learning_rate": 0.00022789915966386554, + "loss": 0.4023, + "step": 27678 + }, + { + "epoch": 15.463128491620111, + "grad_norm": 0.7590668201446533, + "learning_rate": 0.00022787114845938374, + "loss": 0.4024, + "step": 27679 + }, + { + "epoch": 15.46368715083799, + "grad_norm": 7.259964942932129, + "learning_rate": 0.00022784313725490198, + "loss": 0.3955, + "step": 27680 + }, + { + "epoch": 15.464245810055866, + "grad_norm": 0.4260775148868561, + "learning_rate": 0.00022781512605042018, + "loss": 0.5114, + "step": 27681 + }, + { + "epoch": 15.464804469273743, + "grad_norm": 0.5505006909370422, + "learning_rate": 0.0002277871148459384, + "loss": 0.4232, + "step": 27682 + }, + { + "epoch": 15.46536312849162, + "grad_norm": 0.5028451681137085, + "learning_rate": 0.00022775910364145657, + "loss": 0.3986, + "step": 27683 + }, + { + "epoch": 15.465921787709497, + "grad_norm": 0.5197755694389343, + "learning_rate": 0.0002277310924369748, + "loss": 0.4804, + "step": 27684 + }, + { + "epoch": 15.466480446927374, + "grad_norm": 0.5973348021507263, + "learning_rate": 0.000227703081232493, + "loss": 0.4683, + "step": 27685 + }, + { + "epoch": 15.46703910614525, + "grad_norm": 0.5457064509391785, + "learning_rate": 0.0002276750700280112, + "loss": 0.4634, + "step": 27686 + }, + { + "epoch": 15.467597765363129, + "grad_norm": 0.47269371151924133, + "learning_rate": 0.00022764705882352942, + "loss": 0.3978, + "step": 27687 + }, + { + "epoch": 15.468156424581005, + "grad_norm": 7.242918491363525, + "learning_rate": 0.00022761904761904762, + "loss": 0.4553, + "step": 27688 + }, + { + "epoch": 15.468715083798882, + "grad_norm": 0.5599194169044495, + "learning_rate": 0.00022759103641456583, + "loss": 0.4651, + "step": 27689 + }, + { + "epoch": 15.46927374301676, + "grad_norm": 0.43262985348701477, + "learning_rate": 0.00022756302521008404, + "loss": 0.429, + "step": 27690 + }, + { + "epoch": 15.469832402234637, + "grad_norm": 0.322427362203598, + "learning_rate": 0.00022753501400560224, + "loss": 0.3351, + "step": 27691 + }, + { + "epoch": 15.470391061452514, + "grad_norm": 0.43876051902770996, + "learning_rate": 0.00022750700280112048, + "loss": 0.3816, + "step": 27692 + }, + { + "epoch": 15.470949720670392, + "grad_norm": 0.4143429696559906, + "learning_rate": 0.00022747899159663865, + "loss": 0.6289, + "step": 27693 + }, + { + "epoch": 15.471508379888268, + "grad_norm": 0.46920105814933777, + "learning_rate": 0.00022745098039215686, + "loss": 0.4933, + "step": 27694 + }, + { + "epoch": 15.472067039106145, + "grad_norm": 0.522365152835846, + "learning_rate": 0.00022742296918767507, + "loss": 0.3471, + "step": 27695 + }, + { + "epoch": 15.472625698324022, + "grad_norm": 0.6207307577133179, + "learning_rate": 0.0002273949579831933, + "loss": 0.4391, + "step": 27696 + }, + { + "epoch": 15.4731843575419, + "grad_norm": 0.4017002582550049, + "learning_rate": 0.0002273669467787115, + "loss": 0.408, + "step": 27697 + }, + { + "epoch": 15.473743016759776, + "grad_norm": 0.5419505834579468, + "learning_rate": 0.00022733893557422968, + "loss": 0.3766, + "step": 27698 + }, + { + "epoch": 15.474301675977653, + "grad_norm": 0.43403494358062744, + "learning_rate": 0.0002273109243697479, + "loss": 0.4243, + "step": 27699 + }, + { + "epoch": 15.474860335195531, + "grad_norm": 0.6490126252174377, + "learning_rate": 0.00022728291316526612, + "loss": 0.4771, + "step": 27700 + }, + { + "epoch": 15.475418994413408, + "grad_norm": 0.8881585597991943, + "learning_rate": 0.00022725490196078433, + "loss": 0.4652, + "step": 27701 + }, + { + "epoch": 15.475977653631285, + "grad_norm": 0.3979702293872833, + "learning_rate": 0.0002272268907563025, + "loss": 0.3543, + "step": 27702 + }, + { + "epoch": 15.476536312849163, + "grad_norm": 2.262089490890503, + "learning_rate": 0.00022719887955182071, + "loss": 0.383, + "step": 27703 + }, + { + "epoch": 15.47709497206704, + "grad_norm": 0.49367737770080566, + "learning_rate": 0.00022717086834733895, + "loss": 0.4739, + "step": 27704 + }, + { + "epoch": 15.477653631284916, + "grad_norm": 0.5165473818778992, + "learning_rate": 0.00022714285714285715, + "loss": 0.3618, + "step": 27705 + }, + { + "epoch": 15.478212290502793, + "grad_norm": 0.5490630269050598, + "learning_rate": 0.00022711484593837536, + "loss": 0.4626, + "step": 27706 + }, + { + "epoch": 15.478770949720671, + "grad_norm": 0.67984938621521, + "learning_rate": 0.00022708683473389354, + "loss": 0.4194, + "step": 27707 + }, + { + "epoch": 15.479329608938547, + "grad_norm": 0.48702386021614075, + "learning_rate": 0.00022705882352941177, + "loss": 0.4674, + "step": 27708 + }, + { + "epoch": 15.479888268156424, + "grad_norm": 0.7793223261833191, + "learning_rate": 0.00022703081232492998, + "loss": 0.4446, + "step": 27709 + }, + { + "epoch": 15.480446927374302, + "grad_norm": 0.749777615070343, + "learning_rate": 0.00022700280112044818, + "loss": 0.467, + "step": 27710 + }, + { + "epoch": 15.481005586592179, + "grad_norm": 1.3378573656082153, + "learning_rate": 0.0002269747899159664, + "loss": 0.3866, + "step": 27711 + }, + { + "epoch": 15.481564245810056, + "grad_norm": 0.41602206230163574, + "learning_rate": 0.0002269467787114846, + "loss": 0.4069, + "step": 27712 + }, + { + "epoch": 15.482122905027932, + "grad_norm": 0.5027822256088257, + "learning_rate": 0.0002269187675070028, + "loss": 0.3923, + "step": 27713 + }, + { + "epoch": 15.48268156424581, + "grad_norm": 6.99078893661499, + "learning_rate": 0.000226890756302521, + "loss": 0.4555, + "step": 27714 + }, + { + "epoch": 15.483240223463687, + "grad_norm": 0.3848678469657898, + "learning_rate": 0.00022686274509803921, + "loss": 0.32, + "step": 27715 + }, + { + "epoch": 15.483798882681564, + "grad_norm": 0.742813229560852, + "learning_rate": 0.00022683473389355745, + "loss": 0.3903, + "step": 27716 + }, + { + "epoch": 15.484357541899442, + "grad_norm": 0.7288814783096313, + "learning_rate": 0.00022680672268907563, + "loss": 0.3523, + "step": 27717 + }, + { + "epoch": 15.484916201117318, + "grad_norm": 0.46087270975112915, + "learning_rate": 0.00022677871148459383, + "loss": 0.47, + "step": 27718 + }, + { + "epoch": 15.485474860335195, + "grad_norm": 4.229316711425781, + "learning_rate": 0.00022675070028011204, + "loss": 0.3998, + "step": 27719 + }, + { + "epoch": 15.486033519553073, + "grad_norm": 0.8021203875541687, + "learning_rate": 0.00022672268907563027, + "loss": 0.4289, + "step": 27720 + }, + { + "epoch": 15.48659217877095, + "grad_norm": 0.3956725001335144, + "learning_rate": 0.00022669467787114848, + "loss": 0.3374, + "step": 27721 + }, + { + "epoch": 15.487150837988827, + "grad_norm": 0.6719968318939209, + "learning_rate": 0.00022666666666666666, + "loss": 0.4233, + "step": 27722 + }, + { + "epoch": 15.487709497206703, + "grad_norm": 0.4739748537540436, + "learning_rate": 0.00022663865546218486, + "loss": 0.4605, + "step": 27723 + }, + { + "epoch": 15.488268156424581, + "grad_norm": 0.6410335302352905, + "learning_rate": 0.0002266106442577031, + "loss": 0.4411, + "step": 27724 + }, + { + "epoch": 15.488826815642458, + "grad_norm": 0.5351026654243469, + "learning_rate": 0.0002265826330532213, + "loss": 0.3417, + "step": 27725 + }, + { + "epoch": 15.489385474860335, + "grad_norm": 0.40366125106811523, + "learning_rate": 0.0002265546218487395, + "loss": 0.4236, + "step": 27726 + }, + { + "epoch": 15.489944134078213, + "grad_norm": 0.5542619228363037, + "learning_rate": 0.00022652661064425769, + "loss": 0.3679, + "step": 27727 + }, + { + "epoch": 15.49050279329609, + "grad_norm": 0.5849606394767761, + "learning_rate": 0.00022649859943977592, + "loss": 0.3836, + "step": 27728 + }, + { + "epoch": 15.491061452513966, + "grad_norm": 0.4847210645675659, + "learning_rate": 0.00022647058823529412, + "loss": 0.3922, + "step": 27729 + }, + { + "epoch": 15.491620111731844, + "grad_norm": 0.6907528042793274, + "learning_rate": 0.00022644257703081233, + "loss": 0.3566, + "step": 27730 + }, + { + "epoch": 15.492178770949721, + "grad_norm": 0.4100522994995117, + "learning_rate": 0.00022641456582633054, + "loss": 0.4029, + "step": 27731 + }, + { + "epoch": 15.492737430167598, + "grad_norm": 0.4595806300640106, + "learning_rate": 0.00022638655462184874, + "loss": 0.4208, + "step": 27732 + }, + { + "epoch": 15.493296089385474, + "grad_norm": 0.3797292709350586, + "learning_rate": 0.00022635854341736695, + "loss": 0.3549, + "step": 27733 + }, + { + "epoch": 15.493854748603352, + "grad_norm": 0.42252305150032043, + "learning_rate": 0.00022633053221288515, + "loss": 0.355, + "step": 27734 + }, + { + "epoch": 15.494413407821229, + "grad_norm": 0.90904700756073, + "learning_rate": 0.00022630252100840336, + "loss": 0.3713, + "step": 27735 + }, + { + "epoch": 15.494972067039106, + "grad_norm": 0.5891319513320923, + "learning_rate": 0.0002262745098039216, + "loss": 0.4912, + "step": 27736 + }, + { + "epoch": 15.495530726256984, + "grad_norm": 0.4711788296699524, + "learning_rate": 0.00022624649859943977, + "loss": 0.3273, + "step": 27737 + }, + { + "epoch": 15.49608938547486, + "grad_norm": 0.9140803813934326, + "learning_rate": 0.00022621848739495798, + "loss": 0.4454, + "step": 27738 + }, + { + "epoch": 15.496648044692737, + "grad_norm": 1.2518978118896484, + "learning_rate": 0.00022619047619047618, + "loss": 0.3853, + "step": 27739 + }, + { + "epoch": 15.497206703910614, + "grad_norm": 4.439642429351807, + "learning_rate": 0.00022616246498599442, + "loss": 0.6196, + "step": 27740 + }, + { + "epoch": 15.497765363128492, + "grad_norm": 0.5441060662269592, + "learning_rate": 0.00022613445378151262, + "loss": 0.4056, + "step": 27741 + }, + { + "epoch": 15.498324022346369, + "grad_norm": 0.46169713139533997, + "learning_rate": 0.0002261064425770308, + "loss": 0.3582, + "step": 27742 + }, + { + "epoch": 15.498882681564245, + "grad_norm": 0.42566612362861633, + "learning_rate": 0.000226078431372549, + "loss": 0.3208, + "step": 27743 + }, + { + "epoch": 15.499441340782123, + "grad_norm": 0.5000613927841187, + "learning_rate": 0.00022605042016806724, + "loss": 0.3902, + "step": 27744 + }, + { + "epoch": 15.5, + "grad_norm": 1.6427463293075562, + "learning_rate": 0.00022602240896358545, + "loss": 0.4906, + "step": 27745 + }, + { + "epoch": 15.500558659217877, + "grad_norm": 1.1477903127670288, + "learning_rate": 0.00022599439775910365, + "loss": 0.4149, + "step": 27746 + }, + { + "epoch": 15.501117318435755, + "grad_norm": 0.33115386962890625, + "learning_rate": 0.00022596638655462183, + "loss": 0.3167, + "step": 27747 + }, + { + "epoch": 15.501675977653631, + "grad_norm": 0.40444689989089966, + "learning_rate": 0.00022593837535014007, + "loss": 0.3464, + "step": 27748 + }, + { + "epoch": 15.502234636871508, + "grad_norm": 0.7617838382720947, + "learning_rate": 0.00022591036414565827, + "loss": 0.4017, + "step": 27749 + }, + { + "epoch": 15.502793296089386, + "grad_norm": 0.4813630282878876, + "learning_rate": 0.00022588235294117648, + "loss": 0.3969, + "step": 27750 + }, + { + "epoch": 15.503351955307263, + "grad_norm": 0.3566363453865051, + "learning_rate": 0.00022585434173669468, + "loss": 0.3564, + "step": 27751 + }, + { + "epoch": 15.50391061452514, + "grad_norm": 0.4510447382926941, + "learning_rate": 0.0002258263305322129, + "loss": 0.4681, + "step": 27752 + }, + { + "epoch": 15.504469273743016, + "grad_norm": 0.5560593008995056, + "learning_rate": 0.0002257983193277311, + "loss": 0.5418, + "step": 27753 + }, + { + "epoch": 15.505027932960894, + "grad_norm": 0.48497235774993896, + "learning_rate": 0.0002257703081232493, + "loss": 0.2578, + "step": 27754 + }, + { + "epoch": 15.505586592178771, + "grad_norm": 0.7682862281799316, + "learning_rate": 0.0002257422969187675, + "loss": 0.3326, + "step": 27755 + }, + { + "epoch": 15.506145251396648, + "grad_norm": 0.49692872166633606, + "learning_rate": 0.00022571428571428571, + "loss": 0.3487, + "step": 27756 + }, + { + "epoch": 15.506703910614526, + "grad_norm": 0.5352974534034729, + "learning_rate": 0.00022568627450980392, + "loss": 0.3662, + "step": 27757 + }, + { + "epoch": 15.507262569832402, + "grad_norm": 0.4714043140411377, + "learning_rate": 0.00022565826330532213, + "loss": 0.3866, + "step": 27758 + }, + { + "epoch": 15.507821229050279, + "grad_norm": 0.39776623249053955, + "learning_rate": 0.00022563025210084033, + "loss": 0.5038, + "step": 27759 + }, + { + "epoch": 15.508379888268156, + "grad_norm": 0.9410495162010193, + "learning_rate": 0.00022560224089635856, + "loss": 0.3989, + "step": 27760 + }, + { + "epoch": 15.508938547486034, + "grad_norm": 0.47694334387779236, + "learning_rate": 0.00022557422969187674, + "loss": 0.3357, + "step": 27761 + }, + { + "epoch": 15.50949720670391, + "grad_norm": 0.4025542736053467, + "learning_rate": 0.00022554621848739495, + "loss": 0.4093, + "step": 27762 + }, + { + "epoch": 15.510055865921787, + "grad_norm": 6.801734447479248, + "learning_rate": 0.00022551820728291316, + "loss": 0.4862, + "step": 27763 + }, + { + "epoch": 15.510614525139665, + "grad_norm": 0.7451063394546509, + "learning_rate": 0.0002254901960784314, + "loss": 0.791, + "step": 27764 + }, + { + "epoch": 15.511173184357542, + "grad_norm": 0.6250303387641907, + "learning_rate": 0.0002254621848739496, + "loss": 0.4077, + "step": 27765 + }, + { + "epoch": 15.511731843575419, + "grad_norm": 0.6530276536941528, + "learning_rate": 0.00022543417366946777, + "loss": 0.5877, + "step": 27766 + }, + { + "epoch": 15.512290502793297, + "grad_norm": 0.4405488073825836, + "learning_rate": 0.00022540616246498598, + "loss": 0.5036, + "step": 27767 + }, + { + "epoch": 15.512849162011173, + "grad_norm": 1.9687271118164062, + "learning_rate": 0.0002253781512605042, + "loss": 0.4081, + "step": 27768 + }, + { + "epoch": 15.51340782122905, + "grad_norm": 0.9267881512641907, + "learning_rate": 0.00022535014005602242, + "loss": 0.3863, + "step": 27769 + }, + { + "epoch": 15.513966480446927, + "grad_norm": 0.3652385473251343, + "learning_rate": 0.00022532212885154062, + "loss": 0.3228, + "step": 27770 + }, + { + "epoch": 15.514525139664805, + "grad_norm": 0.46286118030548096, + "learning_rate": 0.0002252941176470588, + "loss": 0.4435, + "step": 27771 + }, + { + "epoch": 15.515083798882682, + "grad_norm": 0.5767965912818909, + "learning_rate": 0.00022526610644257704, + "loss": 0.3195, + "step": 27772 + }, + { + "epoch": 15.515642458100558, + "grad_norm": 0.5513387322425842, + "learning_rate": 0.00022523809523809524, + "loss": 0.4673, + "step": 27773 + }, + { + "epoch": 15.516201117318436, + "grad_norm": 2.7823116779327393, + "learning_rate": 0.00022521008403361345, + "loss": 0.415, + "step": 27774 + }, + { + "epoch": 15.516759776536313, + "grad_norm": 0.5855982899665833, + "learning_rate": 0.00022518207282913165, + "loss": 0.6397, + "step": 27775 + }, + { + "epoch": 15.51731843575419, + "grad_norm": 0.6598750948905945, + "learning_rate": 0.00022515406162464986, + "loss": 0.4776, + "step": 27776 + }, + { + "epoch": 15.517877094972068, + "grad_norm": 0.8199924826622009, + "learning_rate": 0.00022512605042016807, + "loss": 0.554, + "step": 27777 + }, + { + "epoch": 15.518435754189944, + "grad_norm": 0.4444223642349243, + "learning_rate": 0.00022509803921568627, + "loss": 0.3705, + "step": 27778 + }, + { + "epoch": 15.518994413407821, + "grad_norm": 0.45687365531921387, + "learning_rate": 0.00022507002801120448, + "loss": 0.3419, + "step": 27779 + }, + { + "epoch": 15.519553072625698, + "grad_norm": 0.40366584062576294, + "learning_rate": 0.0002250420168067227, + "loss": 0.5157, + "step": 27780 + }, + { + "epoch": 15.520111731843576, + "grad_norm": 0.7116847634315491, + "learning_rate": 0.0002250140056022409, + "loss": 0.3981, + "step": 27781 + }, + { + "epoch": 15.520670391061453, + "grad_norm": 0.3483697772026062, + "learning_rate": 0.0002249859943977591, + "loss": 0.4021, + "step": 27782 + }, + { + "epoch": 15.521229050279329, + "grad_norm": 0.419830858707428, + "learning_rate": 0.0002249579831932773, + "loss": 0.331, + "step": 27783 + }, + { + "epoch": 15.521787709497207, + "grad_norm": 0.39615291357040405, + "learning_rate": 0.00022492997198879554, + "loss": 0.4366, + "step": 27784 + }, + { + "epoch": 15.522346368715084, + "grad_norm": 0.667262077331543, + "learning_rate": 0.00022490196078431374, + "loss": 0.5225, + "step": 27785 + }, + { + "epoch": 15.52290502793296, + "grad_norm": 0.3409097194671631, + "learning_rate": 0.00022487394957983192, + "loss": 0.4403, + "step": 27786 + }, + { + "epoch": 15.523463687150837, + "grad_norm": 1.2697312831878662, + "learning_rate": 0.00022484593837535013, + "loss": 0.4086, + "step": 27787 + }, + { + "epoch": 15.524022346368715, + "grad_norm": 0.39927807450294495, + "learning_rate": 0.00022481792717086836, + "loss": 0.3901, + "step": 27788 + }, + { + "epoch": 15.524581005586592, + "grad_norm": 0.7561572194099426, + "learning_rate": 0.00022478991596638657, + "loss": 0.4048, + "step": 27789 + }, + { + "epoch": 15.525139664804469, + "grad_norm": 1.0124382972717285, + "learning_rate": 0.00022476190476190477, + "loss": 0.4465, + "step": 27790 + }, + { + "epoch": 15.525698324022347, + "grad_norm": 0.7594790458679199, + "learning_rate": 0.00022473389355742295, + "loss": 0.4258, + "step": 27791 + }, + { + "epoch": 15.526256983240224, + "grad_norm": 0.46628737449645996, + "learning_rate": 0.00022470588235294118, + "loss": 0.4281, + "step": 27792 + }, + { + "epoch": 15.5268156424581, + "grad_norm": 0.9521386623382568, + "learning_rate": 0.0002246778711484594, + "loss": 0.4199, + "step": 27793 + }, + { + "epoch": 15.527374301675978, + "grad_norm": 0.3501659631729126, + "learning_rate": 0.0002246498599439776, + "loss": 0.4267, + "step": 27794 + }, + { + "epoch": 15.527932960893855, + "grad_norm": 0.3712787330150604, + "learning_rate": 0.00022462184873949583, + "loss": 0.3779, + "step": 27795 + }, + { + "epoch": 15.528491620111732, + "grad_norm": 0.44694897532463074, + "learning_rate": 0.000224593837535014, + "loss": 0.3585, + "step": 27796 + }, + { + "epoch": 15.529050279329608, + "grad_norm": 0.46219462156295776, + "learning_rate": 0.00022456582633053221, + "loss": 0.3315, + "step": 27797 + }, + { + "epoch": 15.529608938547486, + "grad_norm": 0.46868833899497986, + "learning_rate": 0.00022453781512605042, + "loss": 0.3413, + "step": 27798 + }, + { + "epoch": 15.530167597765363, + "grad_norm": 0.3779137134552002, + "learning_rate": 0.00022450980392156865, + "loss": 0.3353, + "step": 27799 + }, + { + "epoch": 15.53072625698324, + "grad_norm": 0.320011168718338, + "learning_rate": 0.00022448179271708686, + "loss": 0.338, + "step": 27800 + }, + { + "epoch": 15.531284916201118, + "grad_norm": 0.47712403535842896, + "learning_rate": 0.00022445378151260504, + "loss": 0.4771, + "step": 27801 + }, + { + "epoch": 15.531843575418995, + "grad_norm": 0.955322802066803, + "learning_rate": 0.00022442577030812324, + "loss": 0.3127, + "step": 27802 + }, + { + "epoch": 15.532402234636871, + "grad_norm": 0.43112605810165405, + "learning_rate": 0.00022439775910364148, + "loss": 0.4107, + "step": 27803 + }, + { + "epoch": 15.53296089385475, + "grad_norm": 0.5075454115867615, + "learning_rate": 0.00022436974789915968, + "loss": 0.4834, + "step": 27804 + }, + { + "epoch": 15.533519553072626, + "grad_norm": 0.46545740962028503, + "learning_rate": 0.0002243417366946779, + "loss": 0.4086, + "step": 27805 + }, + { + "epoch": 15.534078212290503, + "grad_norm": 0.6279367804527283, + "learning_rate": 0.00022431372549019607, + "loss": 0.3753, + "step": 27806 + }, + { + "epoch": 15.53463687150838, + "grad_norm": 1.413002848625183, + "learning_rate": 0.0002242857142857143, + "loss": 0.3897, + "step": 27807 + }, + { + "epoch": 15.535195530726257, + "grad_norm": 0.43945133686065674, + "learning_rate": 0.0002242577030812325, + "loss": 0.473, + "step": 27808 + }, + { + "epoch": 15.535754189944134, + "grad_norm": 0.4629015326499939, + "learning_rate": 0.0002242296918767507, + "loss": 0.4382, + "step": 27809 + }, + { + "epoch": 15.53631284916201, + "grad_norm": 0.4480507969856262, + "learning_rate": 0.0002242016806722689, + "loss": 0.4411, + "step": 27810 + }, + { + "epoch": 15.536871508379889, + "grad_norm": 0.39406436681747437, + "learning_rate": 0.00022417366946778712, + "loss": 0.3934, + "step": 27811 + }, + { + "epoch": 15.537430167597766, + "grad_norm": 0.4244726300239563, + "learning_rate": 0.00022414565826330533, + "loss": 0.4296, + "step": 27812 + }, + { + "epoch": 15.537988826815642, + "grad_norm": 0.7130715250968933, + "learning_rate": 0.00022411764705882354, + "loss": 0.4231, + "step": 27813 + }, + { + "epoch": 15.538547486033519, + "grad_norm": 0.37719523906707764, + "learning_rate": 0.00022408963585434174, + "loss": 0.3713, + "step": 27814 + }, + { + "epoch": 15.539106145251397, + "grad_norm": 0.4484434723854065, + "learning_rate": 0.00022406162464985995, + "loss": 0.4769, + "step": 27815 + }, + { + "epoch": 15.539664804469274, + "grad_norm": 0.507165253162384, + "learning_rate": 0.00022403361344537815, + "loss": 0.4794, + "step": 27816 + }, + { + "epoch": 15.54022346368715, + "grad_norm": 0.3663724958896637, + "learning_rate": 0.00022400560224089636, + "loss": 0.3152, + "step": 27817 + }, + { + "epoch": 15.540782122905028, + "grad_norm": 0.5082835555076599, + "learning_rate": 0.00022397759103641457, + "loss": 0.3894, + "step": 27818 + }, + { + "epoch": 15.541340782122905, + "grad_norm": 0.5757226943969727, + "learning_rate": 0.0002239495798319328, + "loss": 0.3396, + "step": 27819 + }, + { + "epoch": 15.541899441340782, + "grad_norm": 2.986664295196533, + "learning_rate": 0.00022392156862745098, + "loss": 0.5302, + "step": 27820 + }, + { + "epoch": 15.54245810055866, + "grad_norm": 0.5417391061782837, + "learning_rate": 0.00022389355742296918, + "loss": 0.3914, + "step": 27821 + }, + { + "epoch": 15.543016759776537, + "grad_norm": 0.5475592017173767, + "learning_rate": 0.0002238655462184874, + "loss": 0.5516, + "step": 27822 + }, + { + "epoch": 15.543575418994413, + "grad_norm": 0.9667094349861145, + "learning_rate": 0.00022383753501400562, + "loss": 0.4107, + "step": 27823 + }, + { + "epoch": 15.544134078212291, + "grad_norm": 0.8609445095062256, + "learning_rate": 0.00022380952380952383, + "loss": 0.4726, + "step": 27824 + }, + { + "epoch": 15.544692737430168, + "grad_norm": 0.7537353038787842, + "learning_rate": 0.000223781512605042, + "loss": 0.4655, + "step": 27825 + }, + { + "epoch": 15.545251396648045, + "grad_norm": 1.0021028518676758, + "learning_rate": 0.00022375350140056021, + "loss": 0.4195, + "step": 27826 + }, + { + "epoch": 15.545810055865921, + "grad_norm": 1.3827221393585205, + "learning_rate": 0.00022372549019607845, + "loss": 0.4152, + "step": 27827 + }, + { + "epoch": 15.5463687150838, + "grad_norm": 0.3408071994781494, + "learning_rate": 0.00022369747899159665, + "loss": 0.325, + "step": 27828 + }, + { + "epoch": 15.546927374301676, + "grad_norm": 0.7138128280639648, + "learning_rate": 0.00022366946778711486, + "loss": 0.3637, + "step": 27829 + }, + { + "epoch": 15.547486033519553, + "grad_norm": 0.4434405267238617, + "learning_rate": 0.00022364145658263304, + "loss": 0.4552, + "step": 27830 + }, + { + "epoch": 15.548044692737431, + "grad_norm": 0.7046041488647461, + "learning_rate": 0.00022361344537815127, + "loss": 0.4774, + "step": 27831 + }, + { + "epoch": 15.548603351955308, + "grad_norm": 0.5138440728187561, + "learning_rate": 0.00022358543417366948, + "loss": 0.4148, + "step": 27832 + }, + { + "epoch": 15.549162011173184, + "grad_norm": 0.5399837493896484, + "learning_rate": 0.00022355742296918768, + "loss": 0.3205, + "step": 27833 + }, + { + "epoch": 15.54972067039106, + "grad_norm": 0.5108838081359863, + "learning_rate": 0.0002235294117647059, + "loss": 0.433, + "step": 27834 + }, + { + "epoch": 15.550279329608939, + "grad_norm": 0.5395323634147644, + "learning_rate": 0.0002235014005602241, + "loss": 0.4069, + "step": 27835 + }, + { + "epoch": 15.550837988826816, + "grad_norm": 0.4377346336841583, + "learning_rate": 0.0002234733893557423, + "loss": 0.4045, + "step": 27836 + }, + { + "epoch": 15.551396648044692, + "grad_norm": 4.8756103515625, + "learning_rate": 0.0002234453781512605, + "loss": 0.3721, + "step": 27837 + }, + { + "epoch": 15.55195530726257, + "grad_norm": 0.4428098499774933, + "learning_rate": 0.00022341736694677871, + "loss": 0.3483, + "step": 27838 + }, + { + "epoch": 15.552513966480447, + "grad_norm": 0.8837556838989258, + "learning_rate": 0.00022338935574229695, + "loss": 0.3811, + "step": 27839 + }, + { + "epoch": 15.553072625698324, + "grad_norm": 1.031833291053772, + "learning_rate": 0.00022336134453781513, + "loss": 0.4121, + "step": 27840 + }, + { + "epoch": 15.553631284916202, + "grad_norm": 0.5359768867492676, + "learning_rate": 0.00022333333333333333, + "loss": 0.4345, + "step": 27841 + }, + { + "epoch": 15.554189944134079, + "grad_norm": 0.4339999854564667, + "learning_rate": 0.00022330532212885154, + "loss": 0.3908, + "step": 27842 + }, + { + "epoch": 15.554748603351955, + "grad_norm": 0.49947911500930786, + "learning_rate": 0.00022327731092436977, + "loss": 0.4705, + "step": 27843 + }, + { + "epoch": 15.555307262569832, + "grad_norm": 0.7107603549957275, + "learning_rate": 0.00022324929971988798, + "loss": 0.4691, + "step": 27844 + }, + { + "epoch": 15.55586592178771, + "grad_norm": 0.3989611566066742, + "learning_rate": 0.00022322128851540616, + "loss": 0.448, + "step": 27845 + }, + { + "epoch": 15.556424581005587, + "grad_norm": 0.6919142603874207, + "learning_rate": 0.00022319327731092436, + "loss": 0.3891, + "step": 27846 + }, + { + "epoch": 15.556983240223463, + "grad_norm": 0.4450211524963379, + "learning_rate": 0.0002231652661064426, + "loss": 0.3426, + "step": 27847 + }, + { + "epoch": 15.557541899441341, + "grad_norm": 0.5051398873329163, + "learning_rate": 0.0002231372549019608, + "loss": 0.3624, + "step": 27848 + }, + { + "epoch": 15.558100558659218, + "grad_norm": 0.3704480826854706, + "learning_rate": 0.000223109243697479, + "loss": 0.4175, + "step": 27849 + }, + { + "epoch": 15.558659217877095, + "grad_norm": 0.4108850955963135, + "learning_rate": 0.00022308123249299719, + "loss": 0.4215, + "step": 27850 + }, + { + "epoch": 15.559217877094973, + "grad_norm": 0.3740463852882385, + "learning_rate": 0.00022305322128851542, + "loss": 0.436, + "step": 27851 + }, + { + "epoch": 15.55977653631285, + "grad_norm": 0.577150821685791, + "learning_rate": 0.00022302521008403362, + "loss": 0.3592, + "step": 27852 + }, + { + "epoch": 15.560335195530726, + "grad_norm": 0.5524119138717651, + "learning_rate": 0.00022299719887955183, + "loss": 0.3251, + "step": 27853 + }, + { + "epoch": 15.560893854748603, + "grad_norm": 0.4201485514640808, + "learning_rate": 0.00022296918767507004, + "loss": 0.3999, + "step": 27854 + }, + { + "epoch": 15.561452513966481, + "grad_norm": 0.45837342739105225, + "learning_rate": 0.00022294117647058824, + "loss": 0.4121, + "step": 27855 + }, + { + "epoch": 15.562011173184358, + "grad_norm": 0.43424102663993835, + "learning_rate": 0.00022291316526610645, + "loss": 0.4174, + "step": 27856 + }, + { + "epoch": 15.562569832402234, + "grad_norm": 0.7538143992424011, + "learning_rate": 0.00022288515406162465, + "loss": 0.3682, + "step": 27857 + }, + { + "epoch": 15.563128491620112, + "grad_norm": 0.41381534934043884, + "learning_rate": 0.00022285714285714286, + "loss": 0.3739, + "step": 27858 + }, + { + "epoch": 15.563687150837989, + "grad_norm": 0.4533923268318176, + "learning_rate": 0.0002228291316526611, + "loss": 0.3686, + "step": 27859 + }, + { + "epoch": 15.564245810055866, + "grad_norm": 0.34350019693374634, + "learning_rate": 0.00022280112044817927, + "loss": 0.3715, + "step": 27860 + }, + { + "epoch": 15.564804469273742, + "grad_norm": 0.5557906031608582, + "learning_rate": 0.00022277310924369748, + "loss": 0.5958, + "step": 27861 + }, + { + "epoch": 15.56536312849162, + "grad_norm": 0.6481451988220215, + "learning_rate": 0.00022274509803921568, + "loss": 0.3957, + "step": 27862 + }, + { + "epoch": 15.565921787709497, + "grad_norm": 0.5459092259407043, + "learning_rate": 0.00022271708683473392, + "loss": 0.3216, + "step": 27863 + }, + { + "epoch": 15.566480446927374, + "grad_norm": 0.7596459984779358, + "learning_rate": 0.00022268907563025212, + "loss": 0.3183, + "step": 27864 + }, + { + "epoch": 15.567039106145252, + "grad_norm": 0.6075207591056824, + "learning_rate": 0.0002226610644257703, + "loss": 0.5228, + "step": 27865 + }, + { + "epoch": 15.567597765363129, + "grad_norm": 0.8762142658233643, + "learning_rate": 0.0002226330532212885, + "loss": 0.3887, + "step": 27866 + }, + { + "epoch": 15.568156424581005, + "grad_norm": 0.4284616708755493, + "learning_rate": 0.00022260504201680674, + "loss": 0.6057, + "step": 27867 + }, + { + "epoch": 15.568715083798883, + "grad_norm": 0.5207067131996155, + "learning_rate": 0.00022257703081232495, + "loss": 0.3693, + "step": 27868 + }, + { + "epoch": 15.56927374301676, + "grad_norm": 0.4390489459037781, + "learning_rate": 0.00022254901960784313, + "loss": 0.3953, + "step": 27869 + }, + { + "epoch": 15.569832402234637, + "grad_norm": 2.9083826541900635, + "learning_rate": 0.00022252100840336133, + "loss": 0.3678, + "step": 27870 + }, + { + "epoch": 15.570391061452513, + "grad_norm": 0.5766394138336182, + "learning_rate": 0.00022249299719887957, + "loss": 0.5632, + "step": 27871 + }, + { + "epoch": 15.570949720670392, + "grad_norm": 0.4161745011806488, + "learning_rate": 0.00022246498599439777, + "loss": 0.4148, + "step": 27872 + }, + { + "epoch": 15.571508379888268, + "grad_norm": 1.5840708017349243, + "learning_rate": 0.00022243697478991598, + "loss": 0.3985, + "step": 27873 + }, + { + "epoch": 15.572067039106145, + "grad_norm": 0.6482981443405151, + "learning_rate": 0.00022240896358543416, + "loss": 0.3563, + "step": 27874 + }, + { + "epoch": 15.572625698324023, + "grad_norm": 0.596347987651825, + "learning_rate": 0.0002223809523809524, + "loss": 0.3524, + "step": 27875 + }, + { + "epoch": 15.5731843575419, + "grad_norm": 0.4068700075149536, + "learning_rate": 0.0002223529411764706, + "loss": 0.415, + "step": 27876 + }, + { + "epoch": 15.573743016759776, + "grad_norm": 0.394394189119339, + "learning_rate": 0.0002223249299719888, + "loss": 0.3586, + "step": 27877 + }, + { + "epoch": 15.574301675977654, + "grad_norm": 0.4117099940776825, + "learning_rate": 0.000222296918767507, + "loss": 0.3585, + "step": 27878 + }, + { + "epoch": 15.574860335195531, + "grad_norm": 0.5155060887336731, + "learning_rate": 0.00022226890756302521, + "loss": 0.4698, + "step": 27879 + }, + { + "epoch": 15.575418994413408, + "grad_norm": 0.4252316355705261, + "learning_rate": 0.00022224089635854342, + "loss": 0.4955, + "step": 27880 + }, + { + "epoch": 15.575977653631284, + "grad_norm": 0.3797030448913574, + "learning_rate": 0.00022221288515406163, + "loss": 0.4122, + "step": 27881 + }, + { + "epoch": 15.576536312849163, + "grad_norm": 0.33882996439933777, + "learning_rate": 0.00022218487394957983, + "loss": 0.339, + "step": 27882 + }, + { + "epoch": 15.577094972067039, + "grad_norm": 0.37392720580101013, + "learning_rate": 0.00022215686274509806, + "loss": 0.356, + "step": 27883 + }, + { + "epoch": 15.577653631284916, + "grad_norm": 0.4503822922706604, + "learning_rate": 0.00022212885154061624, + "loss": 0.3986, + "step": 27884 + }, + { + "epoch": 15.578212290502794, + "grad_norm": 0.45349517464637756, + "learning_rate": 0.00022210084033613445, + "loss": 0.3729, + "step": 27885 + }, + { + "epoch": 15.57877094972067, + "grad_norm": 0.6705023646354675, + "learning_rate": 0.00022207282913165266, + "loss": 0.4701, + "step": 27886 + }, + { + "epoch": 15.579329608938547, + "grad_norm": 0.6574171781539917, + "learning_rate": 0.0002220448179271709, + "loss": 0.5386, + "step": 27887 + }, + { + "epoch": 15.579888268156424, + "grad_norm": 1.4103906154632568, + "learning_rate": 0.0002220168067226891, + "loss": 0.3012, + "step": 27888 + }, + { + "epoch": 15.580446927374302, + "grad_norm": 0.3313217759132385, + "learning_rate": 0.00022198879551820727, + "loss": 0.3573, + "step": 27889 + }, + { + "epoch": 15.581005586592179, + "grad_norm": 0.32307830452919006, + "learning_rate": 0.00022196078431372548, + "loss": 0.3587, + "step": 27890 + }, + { + "epoch": 15.581564245810055, + "grad_norm": 0.5686635971069336, + "learning_rate": 0.0002219327731092437, + "loss": 0.4455, + "step": 27891 + }, + { + "epoch": 15.582122905027934, + "grad_norm": 4.520925521850586, + "learning_rate": 0.00022190476190476192, + "loss": 0.4614, + "step": 27892 + }, + { + "epoch": 15.58268156424581, + "grad_norm": 0.4929114878177643, + "learning_rate": 0.00022187675070028012, + "loss": 0.3922, + "step": 27893 + }, + { + "epoch": 15.583240223463687, + "grad_norm": 7.434657096862793, + "learning_rate": 0.0002218487394957983, + "loss": 0.3705, + "step": 27894 + }, + { + "epoch": 15.583798882681565, + "grad_norm": 0.6490254998207092, + "learning_rate": 0.00022182072829131654, + "loss": 0.4126, + "step": 27895 + }, + { + "epoch": 15.584357541899442, + "grad_norm": 3.9909751415252686, + "learning_rate": 0.00022179271708683474, + "loss": 0.3589, + "step": 27896 + }, + { + "epoch": 15.584916201117318, + "grad_norm": 0.41026800870895386, + "learning_rate": 0.00022176470588235295, + "loss": 0.3939, + "step": 27897 + }, + { + "epoch": 15.585474860335196, + "grad_norm": 0.6016550660133362, + "learning_rate": 0.00022173669467787115, + "loss": 0.4498, + "step": 27898 + }, + { + "epoch": 15.586033519553073, + "grad_norm": 1.143162488937378, + "learning_rate": 0.00022170868347338936, + "loss": 0.6263, + "step": 27899 + }, + { + "epoch": 15.58659217877095, + "grad_norm": 0.8973687887191772, + "learning_rate": 0.00022168067226890757, + "loss": 0.4008, + "step": 27900 + }, + { + "epoch": 15.587150837988826, + "grad_norm": 1.216586947441101, + "learning_rate": 0.00022165266106442577, + "loss": 0.4845, + "step": 27901 + }, + { + "epoch": 15.587709497206705, + "grad_norm": 0.4037569761276245, + "learning_rate": 0.00022162464985994398, + "loss": 0.3714, + "step": 27902 + }, + { + "epoch": 15.588268156424581, + "grad_norm": 0.4583616852760315, + "learning_rate": 0.0002215966386554622, + "loss": 0.4034, + "step": 27903 + }, + { + "epoch": 15.588826815642458, + "grad_norm": 0.3377223610877991, + "learning_rate": 0.0002215686274509804, + "loss": 0.3138, + "step": 27904 + }, + { + "epoch": 15.589385474860336, + "grad_norm": 0.4331744909286499, + "learning_rate": 0.0002215406162464986, + "loss": 0.5142, + "step": 27905 + }, + { + "epoch": 15.589944134078213, + "grad_norm": 0.4325265884399414, + "learning_rate": 0.0002215126050420168, + "loss": 0.453, + "step": 27906 + }, + { + "epoch": 15.59050279329609, + "grad_norm": 0.422418475151062, + "learning_rate": 0.00022148459383753504, + "loss": 0.3625, + "step": 27907 + }, + { + "epoch": 15.591061452513966, + "grad_norm": 0.4533475637435913, + "learning_rate": 0.00022145658263305324, + "loss": 0.3877, + "step": 27908 + }, + { + "epoch": 15.591620111731844, + "grad_norm": 0.43079784512519836, + "learning_rate": 0.00022142857142857142, + "loss": 0.4155, + "step": 27909 + }, + { + "epoch": 15.59217877094972, + "grad_norm": 0.4105457067489624, + "learning_rate": 0.00022140056022408963, + "loss": 0.3429, + "step": 27910 + }, + { + "epoch": 15.592737430167597, + "grad_norm": 0.34282201528549194, + "learning_rate": 0.00022137254901960786, + "loss": 0.4353, + "step": 27911 + }, + { + "epoch": 15.593296089385476, + "grad_norm": 0.38764122128486633, + "learning_rate": 0.00022134453781512607, + "loss": 0.3986, + "step": 27912 + }, + { + "epoch": 15.593854748603352, + "grad_norm": 0.9187127351760864, + "learning_rate": 0.00022131652661064427, + "loss": 0.3587, + "step": 27913 + }, + { + "epoch": 15.594413407821229, + "grad_norm": 0.37504643201828003, + "learning_rate": 0.00022128851540616245, + "loss": 0.3478, + "step": 27914 + }, + { + "epoch": 15.594972067039105, + "grad_norm": 1.2339084148406982, + "learning_rate": 0.00022126050420168068, + "loss": 0.4146, + "step": 27915 + }, + { + "epoch": 15.595530726256984, + "grad_norm": 0.6316666603088379, + "learning_rate": 0.0002212324929971989, + "loss": 0.345, + "step": 27916 + }, + { + "epoch": 15.59608938547486, + "grad_norm": 0.5449196100234985, + "learning_rate": 0.0002212044817927171, + "loss": 0.5052, + "step": 27917 + }, + { + "epoch": 15.596648044692737, + "grad_norm": 0.5349736213684082, + "learning_rate": 0.0002211764705882353, + "loss": 0.6448, + "step": 27918 + }, + { + "epoch": 15.597206703910615, + "grad_norm": 0.37893494963645935, + "learning_rate": 0.0002211484593837535, + "loss": 0.3733, + "step": 27919 + }, + { + "epoch": 15.597765363128492, + "grad_norm": 0.3856014311313629, + "learning_rate": 0.00022112044817927171, + "loss": 0.3823, + "step": 27920 + }, + { + "epoch": 15.598324022346368, + "grad_norm": 0.8640221357345581, + "learning_rate": 0.00022109243697478992, + "loss": 0.4212, + "step": 27921 + }, + { + "epoch": 15.598882681564247, + "grad_norm": 0.5253803730010986, + "learning_rate": 0.00022106442577030813, + "loss": 0.5596, + "step": 27922 + }, + { + "epoch": 15.599441340782123, + "grad_norm": 0.4020250141620636, + "learning_rate": 0.00022103641456582633, + "loss": 0.3587, + "step": 27923 + }, + { + "epoch": 15.6, + "grad_norm": 0.41350749135017395, + "learning_rate": 0.00022100840336134454, + "loss": 0.3881, + "step": 27924 + }, + { + "epoch": 15.600558659217878, + "grad_norm": 0.48249930143356323, + "learning_rate": 0.00022098039215686274, + "loss": 0.4405, + "step": 27925 + }, + { + "epoch": 15.601117318435755, + "grad_norm": 0.47828221321105957, + "learning_rate": 0.00022095238095238095, + "loss": 0.466, + "step": 27926 + }, + { + "epoch": 15.601675977653631, + "grad_norm": 0.45245322585105896, + "learning_rate": 0.00022092436974789918, + "loss": 0.5831, + "step": 27927 + }, + { + "epoch": 15.602234636871508, + "grad_norm": 0.9160994291305542, + "learning_rate": 0.00022089635854341736, + "loss": 0.5035, + "step": 27928 + }, + { + "epoch": 15.602793296089386, + "grad_norm": 0.5818053483963013, + "learning_rate": 0.00022086834733893557, + "loss": 0.3685, + "step": 27929 + }, + { + "epoch": 15.603351955307263, + "grad_norm": 23.26161003112793, + "learning_rate": 0.00022084033613445377, + "loss": 0.3014, + "step": 27930 + }, + { + "epoch": 15.60391061452514, + "grad_norm": 0.2935068905353546, + "learning_rate": 0.000220812324929972, + "loss": 0.3349, + "step": 27931 + }, + { + "epoch": 15.604469273743018, + "grad_norm": 0.327740341424942, + "learning_rate": 0.0002207843137254902, + "loss": 0.3889, + "step": 27932 + }, + { + "epoch": 15.605027932960894, + "grad_norm": 0.4711613357067108, + "learning_rate": 0.0002207563025210084, + "loss": 0.3951, + "step": 27933 + }, + { + "epoch": 15.60558659217877, + "grad_norm": 0.5951714515686035, + "learning_rate": 0.0002207282913165266, + "loss": 0.5704, + "step": 27934 + }, + { + "epoch": 15.606145251396647, + "grad_norm": 0.6404430270195007, + "learning_rate": 0.00022070028011204483, + "loss": 0.5611, + "step": 27935 + }, + { + "epoch": 15.606703910614526, + "grad_norm": 0.7200407385826111, + "learning_rate": 0.00022067226890756304, + "loss": 0.4001, + "step": 27936 + }, + { + "epoch": 15.607262569832402, + "grad_norm": 0.5036531686782837, + "learning_rate": 0.00022064425770308124, + "loss": 0.3539, + "step": 27937 + }, + { + "epoch": 15.607821229050279, + "grad_norm": 0.409136027097702, + "learning_rate": 0.00022061624649859942, + "loss": 0.3768, + "step": 27938 + }, + { + "epoch": 15.608379888268157, + "grad_norm": 0.4895537197589874, + "learning_rate": 0.00022058823529411765, + "loss": 0.4071, + "step": 27939 + }, + { + "epoch": 15.608938547486034, + "grad_norm": 0.437532514333725, + "learning_rate": 0.00022056022408963586, + "loss": 0.3595, + "step": 27940 + }, + { + "epoch": 15.60949720670391, + "grad_norm": 0.3874495327472687, + "learning_rate": 0.00022053221288515407, + "loss": 0.2997, + "step": 27941 + }, + { + "epoch": 15.610055865921789, + "grad_norm": 0.41679462790489197, + "learning_rate": 0.00022050420168067227, + "loss": 0.3278, + "step": 27942 + }, + { + "epoch": 15.610614525139665, + "grad_norm": 0.39926114678382874, + "learning_rate": 0.00022047619047619048, + "loss": 0.4615, + "step": 27943 + }, + { + "epoch": 15.611173184357542, + "grad_norm": 0.5115527510643005, + "learning_rate": 0.00022044817927170868, + "loss": 0.4595, + "step": 27944 + }, + { + "epoch": 15.611731843575418, + "grad_norm": 0.4073830246925354, + "learning_rate": 0.0002204201680672269, + "loss": 0.3783, + "step": 27945 + }, + { + "epoch": 15.612290502793297, + "grad_norm": 0.452438086271286, + "learning_rate": 0.0002203921568627451, + "loss": 0.3064, + "step": 27946 + }, + { + "epoch": 15.612849162011173, + "grad_norm": 0.44046100974082947, + "learning_rate": 0.00022036414565826333, + "loss": 0.4469, + "step": 27947 + }, + { + "epoch": 15.61340782122905, + "grad_norm": 0.38222116231918335, + "learning_rate": 0.0002203361344537815, + "loss": 0.2998, + "step": 27948 + }, + { + "epoch": 15.613966480446928, + "grad_norm": 0.586675226688385, + "learning_rate": 0.00022030812324929971, + "loss": 0.3807, + "step": 27949 + }, + { + "epoch": 15.614525139664805, + "grad_norm": 0.3349973261356354, + "learning_rate": 0.00022028011204481792, + "loss": 0.3483, + "step": 27950 + }, + { + "epoch": 15.615083798882681, + "grad_norm": 0.7216817736625671, + "learning_rate": 0.00022025210084033615, + "loss": 0.3771, + "step": 27951 + }, + { + "epoch": 15.61564245810056, + "grad_norm": 0.768157958984375, + "learning_rate": 0.00022022408963585436, + "loss": 0.356, + "step": 27952 + }, + { + "epoch": 15.616201117318436, + "grad_norm": 0.8046454191207886, + "learning_rate": 0.00022019607843137254, + "loss": 0.4699, + "step": 27953 + }, + { + "epoch": 15.616759776536313, + "grad_norm": 0.5023537278175354, + "learning_rate": 0.00022016806722689074, + "loss": 0.3096, + "step": 27954 + }, + { + "epoch": 15.61731843575419, + "grad_norm": 0.3565543293952942, + "learning_rate": 0.00022014005602240898, + "loss": 0.3845, + "step": 27955 + }, + { + "epoch": 15.617877094972068, + "grad_norm": 0.44465214014053345, + "learning_rate": 0.00022011204481792718, + "loss": 0.4387, + "step": 27956 + }, + { + "epoch": 15.618435754189944, + "grad_norm": 0.5596281290054321, + "learning_rate": 0.0002200840336134454, + "loss": 0.4045, + "step": 27957 + }, + { + "epoch": 15.61899441340782, + "grad_norm": 0.54207444190979, + "learning_rate": 0.00022005602240896357, + "loss": 0.5789, + "step": 27958 + }, + { + "epoch": 15.619553072625699, + "grad_norm": 0.4174506664276123, + "learning_rate": 0.0002200280112044818, + "loss": 0.3504, + "step": 27959 + }, + { + "epoch": 15.620111731843576, + "grad_norm": 0.5959535241127014, + "learning_rate": 0.00022, + "loss": 0.4715, + "step": 27960 + }, + { + "epoch": 15.620670391061452, + "grad_norm": 0.3997792601585388, + "learning_rate": 0.00021997198879551821, + "loss": 0.4131, + "step": 27961 + }, + { + "epoch": 15.621229050279329, + "grad_norm": 0.7493446469306946, + "learning_rate": 0.00021994397759103642, + "loss": 0.3882, + "step": 27962 + }, + { + "epoch": 15.621787709497207, + "grad_norm": 0.5321084260940552, + "learning_rate": 0.00021991596638655463, + "loss": 0.4481, + "step": 27963 + }, + { + "epoch": 15.622346368715084, + "grad_norm": 0.5192870497703552, + "learning_rate": 0.00021988795518207283, + "loss": 0.4059, + "step": 27964 + }, + { + "epoch": 15.62290502793296, + "grad_norm": 0.9302773475646973, + "learning_rate": 0.00021985994397759104, + "loss": 0.3524, + "step": 27965 + }, + { + "epoch": 15.623463687150839, + "grad_norm": 0.7968913316726685, + "learning_rate": 0.00021983193277310924, + "loss": 0.3187, + "step": 27966 + }, + { + "epoch": 15.624022346368715, + "grad_norm": 0.3571932911872864, + "learning_rate": 0.00021980392156862748, + "loss": 0.3511, + "step": 27967 + }, + { + "epoch": 15.624581005586592, + "grad_norm": 1.3209444284439087, + "learning_rate": 0.00021977591036414566, + "loss": 0.5391, + "step": 27968 + }, + { + "epoch": 15.62513966480447, + "grad_norm": 0.7576371431350708, + "learning_rate": 0.00021974789915966386, + "loss": 0.4232, + "step": 27969 + }, + { + "epoch": 15.625698324022347, + "grad_norm": 3.180560350418091, + "learning_rate": 0.00021971988795518207, + "loss": 0.3515, + "step": 27970 + }, + { + "epoch": 15.626256983240223, + "grad_norm": 0.5635356903076172, + "learning_rate": 0.0002196918767507003, + "loss": 0.4208, + "step": 27971 + }, + { + "epoch": 15.6268156424581, + "grad_norm": 0.8229154348373413, + "learning_rate": 0.0002196638655462185, + "loss": 0.3811, + "step": 27972 + }, + { + "epoch": 15.627374301675978, + "grad_norm": 0.6761255860328674, + "learning_rate": 0.00021963585434173669, + "loss": 0.5865, + "step": 27973 + }, + { + "epoch": 15.627932960893855, + "grad_norm": 0.4207446873188019, + "learning_rate": 0.0002196078431372549, + "loss": 0.3162, + "step": 27974 + }, + { + "epoch": 15.628491620111731, + "grad_norm": 0.5189700126647949, + "learning_rate": 0.00021957983193277312, + "loss": 0.4833, + "step": 27975 + }, + { + "epoch": 15.62905027932961, + "grad_norm": 0.4031204879283905, + "learning_rate": 0.00021955182072829133, + "loss": 0.4975, + "step": 27976 + }, + { + "epoch": 15.629608938547486, + "grad_norm": 0.6120550036430359, + "learning_rate": 0.0002195238095238095, + "loss": 0.4854, + "step": 27977 + }, + { + "epoch": 15.630167597765363, + "grad_norm": 0.3992401361465454, + "learning_rate": 0.00021949579831932772, + "loss": 0.353, + "step": 27978 + }, + { + "epoch": 15.630726256983241, + "grad_norm": 1.417589783668518, + "learning_rate": 0.00021946778711484595, + "loss": 0.4176, + "step": 27979 + }, + { + "epoch": 15.631284916201118, + "grad_norm": 0.673062801361084, + "learning_rate": 0.00021943977591036415, + "loss": 0.3865, + "step": 27980 + }, + { + "epoch": 15.631843575418994, + "grad_norm": 0.6185072064399719, + "learning_rate": 0.00021941176470588236, + "loss": 0.3642, + "step": 27981 + }, + { + "epoch": 15.63240223463687, + "grad_norm": 0.4298047423362732, + "learning_rate": 0.00021938375350140054, + "loss": 0.4035, + "step": 27982 + }, + { + "epoch": 15.632960893854749, + "grad_norm": 0.8356462717056274, + "learning_rate": 0.00021935574229691877, + "loss": 0.4261, + "step": 27983 + }, + { + "epoch": 15.633519553072626, + "grad_norm": 0.5218881368637085, + "learning_rate": 0.00021932773109243698, + "loss": 0.4292, + "step": 27984 + }, + { + "epoch": 15.634078212290502, + "grad_norm": 0.6895882487297058, + "learning_rate": 0.00021929971988795518, + "loss": 0.3489, + "step": 27985 + }, + { + "epoch": 15.63463687150838, + "grad_norm": 0.5630412697792053, + "learning_rate": 0.0002192717086834734, + "loss": 0.4754, + "step": 27986 + }, + { + "epoch": 15.635195530726257, + "grad_norm": 2.5687711238861084, + "learning_rate": 0.0002192436974789916, + "loss": 0.337, + "step": 27987 + }, + { + "epoch": 15.635754189944134, + "grad_norm": 0.38433295488357544, + "learning_rate": 0.0002192156862745098, + "loss": 0.3681, + "step": 27988 + }, + { + "epoch": 15.63631284916201, + "grad_norm": 0.4970787465572357, + "learning_rate": 0.000219187675070028, + "loss": 0.4212, + "step": 27989 + }, + { + "epoch": 15.636871508379889, + "grad_norm": 0.7202358841896057, + "learning_rate": 0.00021915966386554621, + "loss": 0.414, + "step": 27990 + }, + { + "epoch": 15.637430167597765, + "grad_norm": 0.4074876606464386, + "learning_rate": 0.00021913165266106445, + "loss": 0.3623, + "step": 27991 + }, + { + "epoch": 15.637988826815642, + "grad_norm": 0.5222742557525635, + "learning_rate": 0.00021910364145658263, + "loss": 0.342, + "step": 27992 + }, + { + "epoch": 15.63854748603352, + "grad_norm": 1.1061675548553467, + "learning_rate": 0.00021907563025210083, + "loss": 0.6238, + "step": 27993 + }, + { + "epoch": 15.639106145251397, + "grad_norm": 0.3918350040912628, + "learning_rate": 0.00021904761904761904, + "loss": 0.5253, + "step": 27994 + }, + { + "epoch": 15.639664804469273, + "grad_norm": 0.33751118183135986, + "learning_rate": 0.00021901960784313727, + "loss": 0.3473, + "step": 27995 + }, + { + "epoch": 15.640223463687152, + "grad_norm": 0.4809604585170746, + "learning_rate": 0.00021899159663865548, + "loss": 0.5169, + "step": 27996 + }, + { + "epoch": 15.640782122905028, + "grad_norm": 0.4874924421310425, + "learning_rate": 0.00021896358543417366, + "loss": 0.3319, + "step": 27997 + }, + { + "epoch": 15.641340782122905, + "grad_norm": 0.38321277499198914, + "learning_rate": 0.00021893557422969186, + "loss": 0.3139, + "step": 27998 + }, + { + "epoch": 15.641899441340783, + "grad_norm": 0.6106351017951965, + "learning_rate": 0.0002189075630252101, + "loss": 0.5398, + "step": 27999 + }, + { + "epoch": 15.64245810055866, + "grad_norm": 0.4765563905239105, + "learning_rate": 0.0002188795518207283, + "loss": 0.4245, + "step": 28000 + }, + { + "epoch": 15.64245810055866, + "eval_cer": 0.08562731170829105, + "eval_loss": 0.3225233256816864, + "eval_runtime": 55.5941, + "eval_samples_per_second": 81.627, + "eval_steps_per_second": 5.108, + "eval_wer": 0.33839836876064916, + "step": 28000 + }, + { + "epoch": 15.643016759776536, + "grad_norm": 0.5473787784576416, + "learning_rate": 0.0002188515406162465, + "loss": 0.3087, + "step": 28001 + }, + { + "epoch": 15.643575418994413, + "grad_norm": 0.32055461406707764, + "learning_rate": 0.0002188235294117647, + "loss": 0.3049, + "step": 28002 + }, + { + "epoch": 15.644134078212291, + "grad_norm": 0.3655672073364258, + "learning_rate": 0.00021879551820728292, + "loss": 0.3107, + "step": 28003 + }, + { + "epoch": 15.644692737430168, + "grad_norm": 0.35586389899253845, + "learning_rate": 0.00021876750700280113, + "loss": 0.3541, + "step": 28004 + }, + { + "epoch": 15.645251396648044, + "grad_norm": 0.33578261733055115, + "learning_rate": 0.00021873949579831933, + "loss": 0.3611, + "step": 28005 + }, + { + "epoch": 15.645810055865923, + "grad_norm": 2.0723326206207275, + "learning_rate": 0.00021871148459383754, + "loss": 0.4157, + "step": 28006 + }, + { + "epoch": 15.6463687150838, + "grad_norm": 0.4603142738342285, + "learning_rate": 0.00021868347338935574, + "loss": 0.4801, + "step": 28007 + }, + { + "epoch": 15.646927374301676, + "grad_norm": 0.5859696269035339, + "learning_rate": 0.00021865546218487395, + "loss": 0.3282, + "step": 28008 + }, + { + "epoch": 15.647486033519552, + "grad_norm": 0.6254460215568542, + "learning_rate": 0.00021862745098039216, + "loss": 0.4269, + "step": 28009 + }, + { + "epoch": 15.64804469273743, + "grad_norm": 0.618705153465271, + "learning_rate": 0.00021859943977591036, + "loss": 0.3526, + "step": 28010 + }, + { + "epoch": 15.648603351955307, + "grad_norm": 0.6330431699752808, + "learning_rate": 0.0002185714285714286, + "loss": 0.6561, + "step": 28011 + }, + { + "epoch": 15.649162011173184, + "grad_norm": 0.6152411699295044, + "learning_rate": 0.00021854341736694677, + "loss": 0.359, + "step": 28012 + }, + { + "epoch": 15.649720670391062, + "grad_norm": 0.520406186580658, + "learning_rate": 0.00021851540616246498, + "loss": 0.4962, + "step": 28013 + }, + { + "epoch": 15.650279329608939, + "grad_norm": 0.459607869386673, + "learning_rate": 0.00021848739495798319, + "loss": 0.4322, + "step": 28014 + }, + { + "epoch": 15.650837988826815, + "grad_norm": 0.7368574738502502, + "learning_rate": 0.00021845938375350142, + "loss": 0.4397, + "step": 28015 + }, + { + "epoch": 15.651396648044694, + "grad_norm": 0.4270786941051483, + "learning_rate": 0.00021843137254901962, + "loss": 0.3793, + "step": 28016 + }, + { + "epoch": 15.65195530726257, + "grad_norm": 0.4606087803840637, + "learning_rate": 0.0002184033613445378, + "loss": 0.3899, + "step": 28017 + }, + { + "epoch": 15.652513966480447, + "grad_norm": 0.5311670303344727, + "learning_rate": 0.000218375350140056, + "loss": 0.3743, + "step": 28018 + }, + { + "epoch": 15.653072625698323, + "grad_norm": 0.3264901041984558, + "learning_rate": 0.00021834733893557424, + "loss": 0.3546, + "step": 28019 + }, + { + "epoch": 15.653631284916202, + "grad_norm": 0.7805134057998657, + "learning_rate": 0.00021831932773109245, + "loss": 0.4282, + "step": 28020 + }, + { + "epoch": 15.654189944134078, + "grad_norm": 0.6998165249824524, + "learning_rate": 0.00021829131652661065, + "loss": 0.4396, + "step": 28021 + }, + { + "epoch": 15.654748603351955, + "grad_norm": 0.33380597829818726, + "learning_rate": 0.00021826330532212883, + "loss": 0.3619, + "step": 28022 + }, + { + "epoch": 15.655307262569833, + "grad_norm": 0.40164363384246826, + "learning_rate": 0.00021823529411764707, + "loss": 0.3855, + "step": 28023 + }, + { + "epoch": 15.65586592178771, + "grad_norm": 0.9354643821716309, + "learning_rate": 0.00021820728291316527, + "loss": 0.4513, + "step": 28024 + }, + { + "epoch": 15.656424581005586, + "grad_norm": 10.054797172546387, + "learning_rate": 0.00021817927170868348, + "loss": 0.4721, + "step": 28025 + }, + { + "epoch": 15.656983240223465, + "grad_norm": 0.40198424458503723, + "learning_rate": 0.0002181512605042017, + "loss": 0.4358, + "step": 28026 + }, + { + "epoch": 15.657541899441341, + "grad_norm": 0.5293678045272827, + "learning_rate": 0.0002181232492997199, + "loss": 0.4397, + "step": 28027 + }, + { + "epoch": 15.658100558659218, + "grad_norm": 0.7725968956947327, + "learning_rate": 0.0002180952380952381, + "loss": 0.4925, + "step": 28028 + }, + { + "epoch": 15.658659217877094, + "grad_norm": 0.4105333089828491, + "learning_rate": 0.0002180672268907563, + "loss": 0.3911, + "step": 28029 + }, + { + "epoch": 15.659217877094973, + "grad_norm": 0.3805825412273407, + "learning_rate": 0.00021803921568627454, + "loss": 0.3913, + "step": 28030 + }, + { + "epoch": 15.65977653631285, + "grad_norm": 0.43646717071533203, + "learning_rate": 0.00021801120448179274, + "loss": 0.4768, + "step": 28031 + }, + { + "epoch": 15.660335195530726, + "grad_norm": 0.6447239518165588, + "learning_rate": 0.00021798319327731092, + "loss": 0.4063, + "step": 28032 + }, + { + "epoch": 15.660893854748604, + "grad_norm": 0.79170161485672, + "learning_rate": 0.00021795518207282913, + "loss": 0.4174, + "step": 28033 + }, + { + "epoch": 15.66145251396648, + "grad_norm": 0.3485768437385559, + "learning_rate": 0.00021792717086834736, + "loss": 0.3205, + "step": 28034 + }, + { + "epoch": 15.662011173184357, + "grad_norm": 0.6954795122146606, + "learning_rate": 0.00021789915966386557, + "loss": 0.5848, + "step": 28035 + }, + { + "epoch": 15.662569832402234, + "grad_norm": 0.7661239504814148, + "learning_rate": 0.00021787114845938374, + "loss": 0.346, + "step": 28036 + }, + { + "epoch": 15.663128491620112, + "grad_norm": 0.5549522638320923, + "learning_rate": 0.00021784313725490195, + "loss": 0.4464, + "step": 28037 + }, + { + "epoch": 15.663687150837989, + "grad_norm": 0.4942370653152466, + "learning_rate": 0.00021781512605042018, + "loss": 0.3929, + "step": 28038 + }, + { + "epoch": 15.664245810055865, + "grad_norm": 0.40910083055496216, + "learning_rate": 0.0002177871148459384, + "loss": 0.3208, + "step": 28039 + }, + { + "epoch": 15.664804469273744, + "grad_norm": 0.3546142578125, + "learning_rate": 0.0002177591036414566, + "loss": 0.4221, + "step": 28040 + }, + { + "epoch": 15.66536312849162, + "grad_norm": 0.4588821828365326, + "learning_rate": 0.00021773109243697477, + "loss": 0.4319, + "step": 28041 + }, + { + "epoch": 15.665921787709497, + "grad_norm": 0.4927888810634613, + "learning_rate": 0.000217703081232493, + "loss": 0.3312, + "step": 28042 + }, + { + "epoch": 15.666480446927375, + "grad_norm": 1.637519121170044, + "learning_rate": 0.00021767507002801121, + "loss": 0.4159, + "step": 28043 + }, + { + "epoch": 15.667039106145252, + "grad_norm": 0.6051217317581177, + "learning_rate": 0.00021764705882352942, + "loss": 0.4074, + "step": 28044 + }, + { + "epoch": 15.667597765363128, + "grad_norm": 0.4173748791217804, + "learning_rate": 0.00021761904761904763, + "loss": 0.4952, + "step": 28045 + }, + { + "epoch": 15.668156424581005, + "grad_norm": 0.39418622851371765, + "learning_rate": 0.00021759103641456583, + "loss": 0.3816, + "step": 28046 + }, + { + "epoch": 15.668715083798883, + "grad_norm": 0.3564317524433136, + "learning_rate": 0.00021756302521008404, + "loss": 0.3327, + "step": 28047 + }, + { + "epoch": 15.66927374301676, + "grad_norm": 0.9859527945518494, + "learning_rate": 0.00021753501400560224, + "loss": 0.4629, + "step": 28048 + }, + { + "epoch": 15.669832402234636, + "grad_norm": 3.0253565311431885, + "learning_rate": 0.00021750700280112045, + "loss": 0.4269, + "step": 28049 + }, + { + "epoch": 15.670391061452515, + "grad_norm": 2.299283266067505, + "learning_rate": 0.00021747899159663868, + "loss": 0.3088, + "step": 28050 + }, + { + "epoch": 15.670949720670391, + "grad_norm": 0.9328839182853699, + "learning_rate": 0.00021745098039215686, + "loss": 0.4022, + "step": 28051 + }, + { + "epoch": 15.671508379888268, + "grad_norm": 0.44723692536354065, + "learning_rate": 0.00021742296918767507, + "loss": 0.3944, + "step": 28052 + }, + { + "epoch": 15.672067039106146, + "grad_norm": 0.39097368717193604, + "learning_rate": 0.00021739495798319327, + "loss": 0.435, + "step": 28053 + }, + { + "epoch": 15.672625698324023, + "grad_norm": 0.5249546766281128, + "learning_rate": 0.0002173669467787115, + "loss": 0.3699, + "step": 28054 + }, + { + "epoch": 15.6731843575419, + "grad_norm": 0.932850182056427, + "learning_rate": 0.0002173389355742297, + "loss": 0.4172, + "step": 28055 + }, + { + "epoch": 15.673743016759776, + "grad_norm": 1.1159703731536865, + "learning_rate": 0.0002173109243697479, + "loss": 0.4471, + "step": 28056 + }, + { + "epoch": 15.674301675977654, + "grad_norm": 0.36395493149757385, + "learning_rate": 0.0002172829131652661, + "loss": 0.38, + "step": 28057 + }, + { + "epoch": 15.67486033519553, + "grad_norm": 0.5341206192970276, + "learning_rate": 0.00021725490196078433, + "loss": 0.4892, + "step": 28058 + }, + { + "epoch": 15.675418994413407, + "grad_norm": 0.5427635908126831, + "learning_rate": 0.00021722689075630254, + "loss": 0.4812, + "step": 28059 + }, + { + "epoch": 15.675977653631286, + "grad_norm": 0.5830546617507935, + "learning_rate": 0.00021719887955182074, + "loss": 0.481, + "step": 28060 + }, + { + "epoch": 15.676536312849162, + "grad_norm": 0.5074461102485657, + "learning_rate": 0.00021717086834733892, + "loss": 0.457, + "step": 28061 + }, + { + "epoch": 15.677094972067039, + "grad_norm": 0.3944685757160187, + "learning_rate": 0.00021714285714285715, + "loss": 0.3819, + "step": 28062 + }, + { + "epoch": 15.677653631284915, + "grad_norm": 0.34566086530685425, + "learning_rate": 0.00021711484593837536, + "loss": 0.3471, + "step": 28063 + }, + { + "epoch": 15.678212290502794, + "grad_norm": 0.4306388199329376, + "learning_rate": 0.00021708683473389357, + "loss": 0.4118, + "step": 28064 + }, + { + "epoch": 15.67877094972067, + "grad_norm": 0.4727766811847687, + "learning_rate": 0.00021705882352941177, + "loss": 0.4856, + "step": 28065 + }, + { + "epoch": 15.679329608938547, + "grad_norm": 0.36203840374946594, + "learning_rate": 0.00021703081232492998, + "loss": 0.3532, + "step": 28066 + }, + { + "epoch": 15.679888268156425, + "grad_norm": 0.6653933525085449, + "learning_rate": 0.00021700280112044818, + "loss": 0.4496, + "step": 28067 + }, + { + "epoch": 15.680446927374302, + "grad_norm": 0.5298588275909424, + "learning_rate": 0.0002169747899159664, + "loss": 0.4147, + "step": 28068 + }, + { + "epoch": 15.681005586592178, + "grad_norm": 7.28579568862915, + "learning_rate": 0.0002169467787114846, + "loss": 0.4534, + "step": 28069 + }, + { + "epoch": 15.681564245810057, + "grad_norm": 0.7363472580909729, + "learning_rate": 0.00021691876750700283, + "loss": 0.5128, + "step": 28070 + }, + { + "epoch": 15.682122905027933, + "grad_norm": 0.9153317213058472, + "learning_rate": 0.000216890756302521, + "loss": 0.5014, + "step": 28071 + }, + { + "epoch": 15.68268156424581, + "grad_norm": 1.3480807542800903, + "learning_rate": 0.00021686274509803921, + "loss": 0.4138, + "step": 28072 + }, + { + "epoch": 15.683240223463688, + "grad_norm": 0.648037850856781, + "learning_rate": 0.00021683473389355742, + "loss": 0.4395, + "step": 28073 + }, + { + "epoch": 15.683798882681565, + "grad_norm": 0.44283750653266907, + "learning_rate": 0.00021680672268907565, + "loss": 0.4652, + "step": 28074 + }, + { + "epoch": 15.684357541899441, + "grad_norm": 0.4221786558628082, + "learning_rate": 0.00021677871148459386, + "loss": 0.3767, + "step": 28075 + }, + { + "epoch": 15.684916201117318, + "grad_norm": 0.5067715644836426, + "learning_rate": 0.00021675070028011204, + "loss": 0.4935, + "step": 28076 + }, + { + "epoch": 15.685474860335196, + "grad_norm": 0.46660977602005005, + "learning_rate": 0.00021672268907563024, + "loss": 0.2776, + "step": 28077 + }, + { + "epoch": 15.686033519553073, + "grad_norm": 0.41411831974983215, + "learning_rate": 0.00021669467787114848, + "loss": 0.4438, + "step": 28078 + }, + { + "epoch": 15.68659217877095, + "grad_norm": 0.9013859629631042, + "learning_rate": 0.00021666666666666668, + "loss": 0.6201, + "step": 28079 + }, + { + "epoch": 15.687150837988828, + "grad_norm": 0.5020959377288818, + "learning_rate": 0.0002166386554621849, + "loss": 0.4511, + "step": 28080 + }, + { + "epoch": 15.687709497206704, + "grad_norm": 0.6807861924171448, + "learning_rate": 0.00021661064425770307, + "loss": 0.3768, + "step": 28081 + }, + { + "epoch": 15.68826815642458, + "grad_norm": 1.3946880102157593, + "learning_rate": 0.0002165826330532213, + "loss": 0.397, + "step": 28082 + }, + { + "epoch": 15.688826815642457, + "grad_norm": 0.4447402060031891, + "learning_rate": 0.0002165546218487395, + "loss": 0.4648, + "step": 28083 + }, + { + "epoch": 15.689385474860336, + "grad_norm": 0.5153530240058899, + "learning_rate": 0.00021652661064425771, + "loss": 0.3659, + "step": 28084 + }, + { + "epoch": 15.689944134078212, + "grad_norm": 0.47520315647125244, + "learning_rate": 0.00021649859943977592, + "loss": 0.4042, + "step": 28085 + }, + { + "epoch": 15.690502793296089, + "grad_norm": 0.3638395071029663, + "learning_rate": 0.00021647058823529413, + "loss": 0.3662, + "step": 28086 + }, + { + "epoch": 15.691061452513967, + "grad_norm": 0.3841032385826111, + "learning_rate": 0.00021644257703081233, + "loss": 0.4512, + "step": 28087 + }, + { + "epoch": 15.691620111731844, + "grad_norm": 1.059695839881897, + "learning_rate": 0.00021641456582633054, + "loss": 0.3874, + "step": 28088 + }, + { + "epoch": 15.69217877094972, + "grad_norm": 0.48372459411621094, + "learning_rate": 0.00021638655462184874, + "loss": 0.4759, + "step": 28089 + }, + { + "epoch": 15.692737430167599, + "grad_norm": 0.7638027667999268, + "learning_rate": 0.00021635854341736695, + "loss": 0.4961, + "step": 28090 + }, + { + "epoch": 15.693296089385475, + "grad_norm": 0.5282920598983765, + "learning_rate": 0.00021633053221288516, + "loss": 0.4017, + "step": 28091 + }, + { + "epoch": 15.693854748603352, + "grad_norm": 1.376887321472168, + "learning_rate": 0.00021630252100840336, + "loss": 0.4267, + "step": 28092 + }, + { + "epoch": 15.694413407821228, + "grad_norm": 0.8021776080131531, + "learning_rate": 0.00021627450980392157, + "loss": 0.3466, + "step": 28093 + }, + { + "epoch": 15.694972067039107, + "grad_norm": 0.40849369764328003, + "learning_rate": 0.0002162464985994398, + "loss": 0.4051, + "step": 28094 + }, + { + "epoch": 15.695530726256983, + "grad_norm": 0.41417962312698364, + "learning_rate": 0.00021621848739495798, + "loss": 0.3558, + "step": 28095 + }, + { + "epoch": 15.69608938547486, + "grad_norm": 0.4883786141872406, + "learning_rate": 0.00021619047619047619, + "loss": 0.3759, + "step": 28096 + }, + { + "epoch": 15.696648044692738, + "grad_norm": 0.4189356565475464, + "learning_rate": 0.0002161624649859944, + "loss": 0.3405, + "step": 28097 + }, + { + "epoch": 15.697206703910615, + "grad_norm": 0.3570777475833893, + "learning_rate": 0.00021613445378151262, + "loss": 0.3945, + "step": 28098 + }, + { + "epoch": 15.697765363128491, + "grad_norm": 0.4587579369544983, + "learning_rate": 0.00021610644257703083, + "loss": 0.3518, + "step": 28099 + }, + { + "epoch": 15.69832402234637, + "grad_norm": 0.6413249373435974, + "learning_rate": 0.000216078431372549, + "loss": 0.3829, + "step": 28100 + }, + { + "epoch": 15.698882681564246, + "grad_norm": 0.5313349366188049, + "learning_rate": 0.00021605042016806722, + "loss": 0.3975, + "step": 28101 + }, + { + "epoch": 15.699441340782123, + "grad_norm": 1.1013466119766235, + "learning_rate": 0.00021602240896358545, + "loss": 0.5159, + "step": 28102 + }, + { + "epoch": 15.7, + "grad_norm": 3.016965627670288, + "learning_rate": 0.00021599439775910365, + "loss": 0.4462, + "step": 28103 + }, + { + "epoch": 15.700558659217878, + "grad_norm": 5.754360198974609, + "learning_rate": 0.00021596638655462186, + "loss": 0.4059, + "step": 28104 + }, + { + "epoch": 15.701117318435754, + "grad_norm": 0.36752769351005554, + "learning_rate": 0.00021593837535014004, + "loss": 0.311, + "step": 28105 + }, + { + "epoch": 15.70167597765363, + "grad_norm": 0.5617859959602356, + "learning_rate": 0.00021591036414565827, + "loss": 0.4049, + "step": 28106 + }, + { + "epoch": 15.702234636871509, + "grad_norm": 0.8458712100982666, + "learning_rate": 0.00021588235294117648, + "loss": 0.4099, + "step": 28107 + }, + { + "epoch": 15.702793296089386, + "grad_norm": 0.44229528307914734, + "learning_rate": 0.00021585434173669468, + "loss": 0.4034, + "step": 28108 + }, + { + "epoch": 15.703351955307262, + "grad_norm": 0.4449937045574188, + "learning_rate": 0.0002158263305322129, + "loss": 0.3363, + "step": 28109 + }, + { + "epoch": 15.703910614525139, + "grad_norm": 0.49023550748825073, + "learning_rate": 0.0002157983193277311, + "loss": 0.4212, + "step": 28110 + }, + { + "epoch": 15.704469273743017, + "grad_norm": 0.415931761264801, + "learning_rate": 0.0002157703081232493, + "loss": 0.4225, + "step": 28111 + }, + { + "epoch": 15.705027932960894, + "grad_norm": 0.42796462774276733, + "learning_rate": 0.0002157422969187675, + "loss": 0.3966, + "step": 28112 + }, + { + "epoch": 15.70558659217877, + "grad_norm": 0.45776891708374023, + "learning_rate": 0.00021571428571428571, + "loss": 0.5357, + "step": 28113 + }, + { + "epoch": 15.706145251396649, + "grad_norm": 0.607927143573761, + "learning_rate": 0.00021568627450980395, + "loss": 0.5675, + "step": 28114 + }, + { + "epoch": 15.706703910614525, + "grad_norm": 0.4122005105018616, + "learning_rate": 0.00021565826330532213, + "loss": 0.4158, + "step": 28115 + }, + { + "epoch": 15.707262569832402, + "grad_norm": 0.3309580087661743, + "learning_rate": 0.00021563025210084033, + "loss": 0.2945, + "step": 28116 + }, + { + "epoch": 15.70782122905028, + "grad_norm": 1.7135727405548096, + "learning_rate": 0.00021560224089635854, + "loss": 0.5501, + "step": 28117 + }, + { + "epoch": 15.708379888268157, + "grad_norm": 0.3926011919975281, + "learning_rate": 0.00021557422969187677, + "loss": 0.3926, + "step": 28118 + }, + { + "epoch": 15.708938547486033, + "grad_norm": 1.5358628034591675, + "learning_rate": 0.00021554621848739498, + "loss": 0.3873, + "step": 28119 + }, + { + "epoch": 15.70949720670391, + "grad_norm": 0.4119482934474945, + "learning_rate": 0.00021551820728291316, + "loss": 0.3999, + "step": 28120 + }, + { + "epoch": 15.710055865921788, + "grad_norm": 4.981225967407227, + "learning_rate": 0.00021549019607843136, + "loss": 0.4542, + "step": 28121 + }, + { + "epoch": 15.710614525139665, + "grad_norm": 0.48065581917762756, + "learning_rate": 0.0002154621848739496, + "loss": 0.3545, + "step": 28122 + }, + { + "epoch": 15.711173184357541, + "grad_norm": 0.454269677400589, + "learning_rate": 0.0002154341736694678, + "loss": 0.4192, + "step": 28123 + }, + { + "epoch": 15.71173184357542, + "grad_norm": 0.4448856711387634, + "learning_rate": 0.000215406162464986, + "loss": 0.4102, + "step": 28124 + }, + { + "epoch": 15.712290502793296, + "grad_norm": 0.44434693455696106, + "learning_rate": 0.0002153781512605042, + "loss": 0.3084, + "step": 28125 + }, + { + "epoch": 15.712849162011173, + "grad_norm": 0.8203147649765015, + "learning_rate": 0.00021535014005602242, + "loss": 0.3925, + "step": 28126 + }, + { + "epoch": 15.713407821229051, + "grad_norm": 0.42819273471832275, + "learning_rate": 0.00021532212885154063, + "loss": 0.5081, + "step": 28127 + }, + { + "epoch": 15.713966480446928, + "grad_norm": 0.40749242901802063, + "learning_rate": 0.00021529411764705883, + "loss": 0.4591, + "step": 28128 + }, + { + "epoch": 15.714525139664804, + "grad_norm": 0.42136573791503906, + "learning_rate": 0.00021526610644257704, + "loss": 0.3978, + "step": 28129 + }, + { + "epoch": 15.71508379888268, + "grad_norm": 0.37534379959106445, + "learning_rate": 0.00021523809523809524, + "loss": 0.4432, + "step": 28130 + }, + { + "epoch": 15.71564245810056, + "grad_norm": 0.6140737533569336, + "learning_rate": 0.00021521008403361345, + "loss": 0.4645, + "step": 28131 + }, + { + "epoch": 15.716201117318436, + "grad_norm": 0.43461814522743225, + "learning_rate": 0.00021518207282913166, + "loss": 0.4257, + "step": 28132 + }, + { + "epoch": 15.716759776536312, + "grad_norm": 0.5296183228492737, + "learning_rate": 0.00021515406162464986, + "loss": 0.429, + "step": 28133 + }, + { + "epoch": 15.71731843575419, + "grad_norm": 0.4991846978664398, + "learning_rate": 0.0002151260504201681, + "loss": 0.3247, + "step": 28134 + }, + { + "epoch": 15.717877094972067, + "grad_norm": 0.6997066140174866, + "learning_rate": 0.00021509803921568627, + "loss": 0.3217, + "step": 28135 + }, + { + "epoch": 15.718435754189944, + "grad_norm": 0.54915851354599, + "learning_rate": 0.00021507002801120448, + "loss": 0.3599, + "step": 28136 + }, + { + "epoch": 15.71899441340782, + "grad_norm": 0.49522387981414795, + "learning_rate": 0.00021504201680672269, + "loss": 0.6005, + "step": 28137 + }, + { + "epoch": 15.719553072625699, + "grad_norm": 1.1229275465011597, + "learning_rate": 0.00021501400560224092, + "loss": 0.4658, + "step": 28138 + }, + { + "epoch": 15.720111731843575, + "grad_norm": 3.6634035110473633, + "learning_rate": 0.00021498599439775912, + "loss": 0.4077, + "step": 28139 + }, + { + "epoch": 15.720670391061452, + "grad_norm": 0.48935940861701965, + "learning_rate": 0.0002149579831932773, + "loss": 0.4222, + "step": 28140 + }, + { + "epoch": 15.72122905027933, + "grad_norm": 0.5736417770385742, + "learning_rate": 0.0002149299719887955, + "loss": 0.461, + "step": 28141 + }, + { + "epoch": 15.721787709497207, + "grad_norm": 0.8085067868232727, + "learning_rate": 0.00021490196078431374, + "loss": 0.4278, + "step": 28142 + }, + { + "epoch": 15.722346368715083, + "grad_norm": 0.39364534616470337, + "learning_rate": 0.00021487394957983195, + "loss": 0.4347, + "step": 28143 + }, + { + "epoch": 15.722905027932962, + "grad_norm": 0.35484471917152405, + "learning_rate": 0.00021484593837535015, + "loss": 0.3141, + "step": 28144 + }, + { + "epoch": 15.723463687150838, + "grad_norm": 0.384712815284729, + "learning_rate": 0.00021481792717086833, + "loss": 0.4192, + "step": 28145 + }, + { + "epoch": 15.724022346368715, + "grad_norm": 1.2662056684494019, + "learning_rate": 0.00021478991596638657, + "loss": 0.5098, + "step": 28146 + }, + { + "epoch": 15.724581005586593, + "grad_norm": 0.5253733992576599, + "learning_rate": 0.00021476190476190477, + "loss": 0.3821, + "step": 28147 + }, + { + "epoch": 15.72513966480447, + "grad_norm": 0.5488473176956177, + "learning_rate": 0.00021473389355742298, + "loss": 0.3729, + "step": 28148 + }, + { + "epoch": 15.725698324022346, + "grad_norm": 0.5362403988838196, + "learning_rate": 0.00021470588235294116, + "loss": 0.355, + "step": 28149 + }, + { + "epoch": 15.726256983240223, + "grad_norm": 0.48557308316230774, + "learning_rate": 0.0002146778711484594, + "loss": 0.475, + "step": 28150 + }, + { + "epoch": 15.726815642458101, + "grad_norm": 1.0692726373672485, + "learning_rate": 0.0002146498599439776, + "loss": 0.3537, + "step": 28151 + }, + { + "epoch": 15.727374301675978, + "grad_norm": 1.6404802799224854, + "learning_rate": 0.0002146218487394958, + "loss": 0.4396, + "step": 28152 + }, + { + "epoch": 15.727932960893854, + "grad_norm": 1.2937458753585815, + "learning_rate": 0.000214593837535014, + "loss": 0.3433, + "step": 28153 + }, + { + "epoch": 15.728491620111733, + "grad_norm": 0.7173959016799927, + "learning_rate": 0.00021456582633053221, + "loss": 0.3316, + "step": 28154 + }, + { + "epoch": 15.72905027932961, + "grad_norm": 3.468048334121704, + "learning_rate": 0.00021453781512605042, + "loss": 0.3969, + "step": 28155 + }, + { + "epoch": 15.729608938547486, + "grad_norm": 0.7456526160240173, + "learning_rate": 0.00021450980392156863, + "loss": 0.385, + "step": 28156 + }, + { + "epoch": 15.730167597765362, + "grad_norm": 25.07265281677246, + "learning_rate": 0.00021448179271708683, + "loss": 0.3852, + "step": 28157 + }, + { + "epoch": 15.73072625698324, + "grad_norm": 0.5502532124519348, + "learning_rate": 0.00021445378151260507, + "loss": 0.5549, + "step": 28158 + }, + { + "epoch": 15.731284916201117, + "grad_norm": 0.4713345170021057, + "learning_rate": 0.00021442577030812324, + "loss": 0.4081, + "step": 28159 + }, + { + "epoch": 15.731843575418994, + "grad_norm": 2.0924980640411377, + "learning_rate": 0.00021439775910364145, + "loss": 0.4937, + "step": 28160 + }, + { + "epoch": 15.732402234636872, + "grad_norm": 0.9878963232040405, + "learning_rate": 0.00021436974789915966, + "loss": 0.4308, + "step": 28161 + }, + { + "epoch": 15.732960893854749, + "grad_norm": 1.0944411754608154, + "learning_rate": 0.0002143417366946779, + "loss": 0.4274, + "step": 28162 + }, + { + "epoch": 15.733519553072625, + "grad_norm": 0.387623131275177, + "learning_rate": 0.0002143137254901961, + "loss": 0.3801, + "step": 28163 + }, + { + "epoch": 15.734078212290502, + "grad_norm": 0.4180534780025482, + "learning_rate": 0.00021428571428571427, + "loss": 0.3458, + "step": 28164 + }, + { + "epoch": 15.73463687150838, + "grad_norm": 0.3599781394004822, + "learning_rate": 0.00021425770308123248, + "loss": 0.3793, + "step": 28165 + }, + { + "epoch": 15.735195530726257, + "grad_norm": 0.5250738263130188, + "learning_rate": 0.00021422969187675071, + "loss": 0.5147, + "step": 28166 + }, + { + "epoch": 15.735754189944133, + "grad_norm": 0.37296831607818604, + "learning_rate": 0.00021420168067226892, + "loss": 0.4184, + "step": 28167 + }, + { + "epoch": 15.736312849162012, + "grad_norm": 0.8904727101325989, + "learning_rate": 0.00021417366946778713, + "loss": 0.58, + "step": 28168 + }, + { + "epoch": 15.736871508379888, + "grad_norm": 0.6159899830818176, + "learning_rate": 0.0002141456582633053, + "loss": 0.3904, + "step": 28169 + }, + { + "epoch": 15.737430167597765, + "grad_norm": 0.48762694001197815, + "learning_rate": 0.00021411764705882354, + "loss": 0.4652, + "step": 28170 + }, + { + "epoch": 15.737988826815643, + "grad_norm": 0.49352702498435974, + "learning_rate": 0.00021408963585434174, + "loss": 0.5325, + "step": 28171 + }, + { + "epoch": 15.73854748603352, + "grad_norm": 0.45881393551826477, + "learning_rate": 0.00021406162464985995, + "loss": 0.365, + "step": 28172 + }, + { + "epoch": 15.739106145251396, + "grad_norm": 0.4733392000198364, + "learning_rate": 0.00021403361344537816, + "loss": 0.3535, + "step": 28173 + }, + { + "epoch": 15.739664804469275, + "grad_norm": 0.5747881531715393, + "learning_rate": 0.00021400560224089636, + "loss": 0.3286, + "step": 28174 + }, + { + "epoch": 15.740223463687151, + "grad_norm": 0.5354949831962585, + "learning_rate": 0.00021397759103641457, + "loss": 0.4344, + "step": 28175 + }, + { + "epoch": 15.740782122905028, + "grad_norm": 0.6348032355308533, + "learning_rate": 0.00021394957983193277, + "loss": 0.5637, + "step": 28176 + }, + { + "epoch": 15.741340782122904, + "grad_norm": 0.45050469040870667, + "learning_rate": 0.00021392156862745098, + "loss": 0.4639, + "step": 28177 + }, + { + "epoch": 15.741899441340783, + "grad_norm": 1.7818286418914795, + "learning_rate": 0.0002138935574229692, + "loss": 0.3982, + "step": 28178 + }, + { + "epoch": 15.74245810055866, + "grad_norm": 0.37337547540664673, + "learning_rate": 0.0002138655462184874, + "loss": 0.4243, + "step": 28179 + }, + { + "epoch": 15.743016759776536, + "grad_norm": 0.6851099133491516, + "learning_rate": 0.0002138375350140056, + "loss": 0.5206, + "step": 28180 + }, + { + "epoch": 15.743575418994414, + "grad_norm": 0.7506327629089355, + "learning_rate": 0.0002138095238095238, + "loss": 0.3762, + "step": 28181 + }, + { + "epoch": 15.74413407821229, + "grad_norm": 0.44795480370521545, + "learning_rate": 0.00021378151260504204, + "loss": 0.38, + "step": 28182 + }, + { + "epoch": 15.744692737430167, + "grad_norm": 0.7723276019096375, + "learning_rate": 0.00021375350140056024, + "loss": 0.4501, + "step": 28183 + }, + { + "epoch": 15.745251396648044, + "grad_norm": 0.49385908246040344, + "learning_rate": 0.00021372549019607842, + "loss": 0.3646, + "step": 28184 + }, + { + "epoch": 15.745810055865922, + "grad_norm": 1.3371938467025757, + "learning_rate": 0.00021369747899159663, + "loss": 0.4058, + "step": 28185 + }, + { + "epoch": 15.746368715083799, + "grad_norm": 0.8355948328971863, + "learning_rate": 0.00021366946778711486, + "loss": 0.3837, + "step": 28186 + }, + { + "epoch": 15.746927374301675, + "grad_norm": 0.5903971791267395, + "learning_rate": 0.00021364145658263307, + "loss": 0.537, + "step": 28187 + }, + { + "epoch": 15.747486033519554, + "grad_norm": 0.47570276260375977, + "learning_rate": 0.00021361344537815127, + "loss": 0.4582, + "step": 28188 + }, + { + "epoch": 15.74804469273743, + "grad_norm": 0.46108779311180115, + "learning_rate": 0.00021358543417366945, + "loss": 0.3613, + "step": 28189 + }, + { + "epoch": 15.748603351955307, + "grad_norm": 0.3710097074508667, + "learning_rate": 0.00021355742296918768, + "loss": 0.4422, + "step": 28190 + }, + { + "epoch": 15.749162011173185, + "grad_norm": 0.616888701915741, + "learning_rate": 0.0002135294117647059, + "loss": 0.4114, + "step": 28191 + }, + { + "epoch": 15.749720670391062, + "grad_norm": 0.5004860162734985, + "learning_rate": 0.0002135014005602241, + "loss": 0.5068, + "step": 28192 + }, + { + "epoch": 15.750279329608938, + "grad_norm": 0.38989344239234924, + "learning_rate": 0.0002134733893557423, + "loss": 0.4164, + "step": 28193 + }, + { + "epoch": 15.750837988826815, + "grad_norm": 1.829546570777893, + "learning_rate": 0.0002134453781512605, + "loss": 0.3906, + "step": 28194 + }, + { + "epoch": 15.751396648044693, + "grad_norm": 1.5404324531555176, + "learning_rate": 0.00021341736694677871, + "loss": 0.3922, + "step": 28195 + }, + { + "epoch": 15.75195530726257, + "grad_norm": 0.541739821434021, + "learning_rate": 0.00021338935574229692, + "loss": 0.3306, + "step": 28196 + }, + { + "epoch": 15.752513966480446, + "grad_norm": 2.049353837966919, + "learning_rate": 0.00021336134453781513, + "loss": 0.4648, + "step": 28197 + }, + { + "epoch": 15.753072625698325, + "grad_norm": 0.5836424827575684, + "learning_rate": 0.00021333333333333336, + "loss": 0.3823, + "step": 28198 + }, + { + "epoch": 15.753631284916201, + "grad_norm": 0.33068305253982544, + "learning_rate": 0.00021330532212885154, + "loss": 0.3958, + "step": 28199 + }, + { + "epoch": 15.754189944134078, + "grad_norm": 0.325416624546051, + "learning_rate": 0.00021327731092436974, + "loss": 0.3946, + "step": 28200 + }, + { + "epoch": 15.754748603351956, + "grad_norm": 0.38331055641174316, + "learning_rate": 0.00021324929971988795, + "loss": 0.436, + "step": 28201 + }, + { + "epoch": 15.755307262569833, + "grad_norm": 1.1591451168060303, + "learning_rate": 0.00021322128851540618, + "loss": 0.3803, + "step": 28202 + }, + { + "epoch": 15.75586592178771, + "grad_norm": 0.4944038689136505, + "learning_rate": 0.00021319327731092436, + "loss": 0.4557, + "step": 28203 + }, + { + "epoch": 15.756424581005586, + "grad_norm": 0.3757638931274414, + "learning_rate": 0.00021316526610644257, + "loss": 0.2937, + "step": 28204 + }, + { + "epoch": 15.756983240223464, + "grad_norm": 5.244970321655273, + "learning_rate": 0.00021313725490196077, + "loss": 0.3399, + "step": 28205 + }, + { + "epoch": 15.75754189944134, + "grad_norm": 0.33353477716445923, + "learning_rate": 0.000213109243697479, + "loss": 0.305, + "step": 28206 + }, + { + "epoch": 15.758100558659217, + "grad_norm": 0.5653721690177917, + "learning_rate": 0.00021308123249299721, + "loss": 0.4854, + "step": 28207 + }, + { + "epoch": 15.758659217877096, + "grad_norm": 0.4542103409767151, + "learning_rate": 0.0002130532212885154, + "loss": 0.3424, + "step": 28208 + }, + { + "epoch": 15.759217877094972, + "grad_norm": 0.4675729274749756, + "learning_rate": 0.0002130252100840336, + "loss": 0.3804, + "step": 28209 + }, + { + "epoch": 15.759776536312849, + "grad_norm": 0.7025383114814758, + "learning_rate": 0.00021299719887955183, + "loss": 0.4995, + "step": 28210 + }, + { + "epoch": 15.760335195530725, + "grad_norm": 0.32371261715888977, + "learning_rate": 0.00021296918767507004, + "loss": 0.3095, + "step": 28211 + }, + { + "epoch": 15.760893854748604, + "grad_norm": 1.2456889152526855, + "learning_rate": 0.00021294117647058824, + "loss": 0.3937, + "step": 28212 + }, + { + "epoch": 15.76145251396648, + "grad_norm": 0.4105624854564667, + "learning_rate": 0.00021291316526610642, + "loss": 0.3572, + "step": 28213 + }, + { + "epoch": 15.762011173184357, + "grad_norm": 4.164335250854492, + "learning_rate": 0.00021288515406162466, + "loss": 0.4034, + "step": 28214 + }, + { + "epoch": 15.762569832402235, + "grad_norm": 0.7588809728622437, + "learning_rate": 0.00021285714285714286, + "loss": 0.4862, + "step": 28215 + }, + { + "epoch": 15.763128491620112, + "grad_norm": 0.38554051518440247, + "learning_rate": 0.00021282913165266107, + "loss": 0.3176, + "step": 28216 + }, + { + "epoch": 15.763687150837988, + "grad_norm": 0.6313602328300476, + "learning_rate": 0.00021280112044817927, + "loss": 0.5529, + "step": 28217 + }, + { + "epoch": 15.764245810055867, + "grad_norm": 0.37413835525512695, + "learning_rate": 0.00021277310924369748, + "loss": 0.4657, + "step": 28218 + }, + { + "epoch": 15.764804469273743, + "grad_norm": 0.567621648311615, + "learning_rate": 0.00021274509803921569, + "loss": 0.4256, + "step": 28219 + }, + { + "epoch": 15.76536312849162, + "grad_norm": 0.7270967364311218, + "learning_rate": 0.0002127170868347339, + "loss": 0.4057, + "step": 28220 + }, + { + "epoch": 15.765921787709498, + "grad_norm": 0.5513010621070862, + "learning_rate": 0.0002126890756302521, + "loss": 0.4731, + "step": 28221 + }, + { + "epoch": 15.766480446927375, + "grad_norm": 0.38643893599510193, + "learning_rate": 0.00021266106442577033, + "loss": 0.4341, + "step": 28222 + }, + { + "epoch": 15.767039106145251, + "grad_norm": 0.6488580107688904, + "learning_rate": 0.0002126330532212885, + "loss": 0.3671, + "step": 28223 + }, + { + "epoch": 15.767597765363128, + "grad_norm": 0.6248840093612671, + "learning_rate": 0.00021260504201680672, + "loss": 0.4997, + "step": 28224 + }, + { + "epoch": 15.768156424581006, + "grad_norm": 0.3813264071941376, + "learning_rate": 0.00021257703081232492, + "loss": 0.3437, + "step": 28225 + }, + { + "epoch": 15.768715083798883, + "grad_norm": 0.4814385771751404, + "learning_rate": 0.00021254901960784315, + "loss": 0.3562, + "step": 28226 + }, + { + "epoch": 15.76927374301676, + "grad_norm": 0.3675447702407837, + "learning_rate": 0.00021252100840336136, + "loss": 0.3952, + "step": 28227 + }, + { + "epoch": 15.769832402234638, + "grad_norm": 0.6319012641906738, + "learning_rate": 0.00021249299719887954, + "loss": 0.4785, + "step": 28228 + }, + { + "epoch": 15.770391061452514, + "grad_norm": 4.534488201141357, + "learning_rate": 0.00021246498599439775, + "loss": 0.3524, + "step": 28229 + }, + { + "epoch": 15.77094972067039, + "grad_norm": 0.4247336685657501, + "learning_rate": 0.00021243697478991598, + "loss": 0.3756, + "step": 28230 + }, + { + "epoch": 15.771508379888267, + "grad_norm": 0.5720586180686951, + "learning_rate": 0.00021240896358543418, + "loss": 0.4587, + "step": 28231 + }, + { + "epoch": 15.772067039106146, + "grad_norm": 0.42110154032707214, + "learning_rate": 0.0002123809523809524, + "loss": 0.3331, + "step": 28232 + }, + { + "epoch": 15.772625698324022, + "grad_norm": 0.5136241316795349, + "learning_rate": 0.00021235294117647057, + "loss": 0.4474, + "step": 28233 + }, + { + "epoch": 15.773184357541899, + "grad_norm": 0.6020116806030273, + "learning_rate": 0.0002123249299719888, + "loss": 0.4761, + "step": 28234 + }, + { + "epoch": 15.773743016759777, + "grad_norm": 0.5410043001174927, + "learning_rate": 0.000212296918767507, + "loss": 0.4962, + "step": 28235 + }, + { + "epoch": 15.774301675977654, + "grad_norm": 0.471684068441391, + "learning_rate": 0.00021226890756302521, + "loss": 0.4167, + "step": 28236 + }, + { + "epoch": 15.77486033519553, + "grad_norm": 0.44435322284698486, + "learning_rate": 0.00021224089635854342, + "loss": 0.4428, + "step": 28237 + }, + { + "epoch": 15.775418994413407, + "grad_norm": 0.5288370251655579, + "learning_rate": 0.00021221288515406163, + "loss": 0.325, + "step": 28238 + }, + { + "epoch": 15.775977653631285, + "grad_norm": 0.5087246298789978, + "learning_rate": 0.00021218487394957983, + "loss": 0.4106, + "step": 28239 + }, + { + "epoch": 15.776536312849162, + "grad_norm": 0.4656437933444977, + "learning_rate": 0.00021215686274509804, + "loss": 0.4707, + "step": 28240 + }, + { + "epoch": 15.777094972067038, + "grad_norm": 0.41650253534317017, + "learning_rate": 0.00021212885154061624, + "loss": 0.4246, + "step": 28241 + }, + { + "epoch": 15.777653631284917, + "grad_norm": 0.543643057346344, + "learning_rate": 0.00021210084033613448, + "loss": 0.3894, + "step": 28242 + }, + { + "epoch": 15.778212290502793, + "grad_norm": 0.9209433197975159, + "learning_rate": 0.00021207282913165266, + "loss": 0.3783, + "step": 28243 + }, + { + "epoch": 15.77877094972067, + "grad_norm": 0.6223825216293335, + "learning_rate": 0.00021204481792717086, + "loss": 0.4041, + "step": 28244 + }, + { + "epoch": 15.779329608938548, + "grad_norm": 0.3460981845855713, + "learning_rate": 0.00021201680672268907, + "loss": 0.3028, + "step": 28245 + }, + { + "epoch": 15.779888268156425, + "grad_norm": 0.7730225324630737, + "learning_rate": 0.0002119887955182073, + "loss": 0.4101, + "step": 28246 + }, + { + "epoch": 15.780446927374301, + "grad_norm": 0.42222270369529724, + "learning_rate": 0.0002119607843137255, + "loss": 0.4014, + "step": 28247 + }, + { + "epoch": 15.78100558659218, + "grad_norm": 0.4425014555454254, + "learning_rate": 0.0002119327731092437, + "loss": 0.4345, + "step": 28248 + }, + { + "epoch": 15.781564245810056, + "grad_norm": 0.7121943831443787, + "learning_rate": 0.0002119047619047619, + "loss": 0.5221, + "step": 28249 + }, + { + "epoch": 15.782122905027933, + "grad_norm": 0.6115889549255371, + "learning_rate": 0.00021187675070028013, + "loss": 0.5173, + "step": 28250 + }, + { + "epoch": 15.78268156424581, + "grad_norm": 0.44649675488471985, + "learning_rate": 0.00021184873949579833, + "loss": 0.4346, + "step": 28251 + }, + { + "epoch": 15.783240223463688, + "grad_norm": 0.3846137523651123, + "learning_rate": 0.00021182072829131654, + "loss": 0.4231, + "step": 28252 + }, + { + "epoch": 15.783798882681564, + "grad_norm": 2.3018479347229004, + "learning_rate": 0.00021179271708683472, + "loss": 0.3659, + "step": 28253 + }, + { + "epoch": 15.78435754189944, + "grad_norm": 0.5861935615539551, + "learning_rate": 0.00021176470588235295, + "loss": 0.41, + "step": 28254 + }, + { + "epoch": 15.78491620111732, + "grad_norm": 5.909586429595947, + "learning_rate": 0.00021173669467787116, + "loss": 0.4884, + "step": 28255 + }, + { + "epoch": 15.785474860335196, + "grad_norm": 0.3388601839542389, + "learning_rate": 0.00021170868347338936, + "loss": 0.283, + "step": 28256 + }, + { + "epoch": 15.786033519553072, + "grad_norm": 0.5771458148956299, + "learning_rate": 0.00021168067226890754, + "loss": 0.3824, + "step": 28257 + }, + { + "epoch": 15.786592178770949, + "grad_norm": 0.5275522470474243, + "learning_rate": 0.00021165266106442577, + "loss": 0.3793, + "step": 28258 + }, + { + "epoch": 15.787150837988827, + "grad_norm": 0.6667299270629883, + "learning_rate": 0.00021162464985994398, + "loss": 0.5232, + "step": 28259 + }, + { + "epoch": 15.787709497206704, + "grad_norm": 0.45518627762794495, + "learning_rate": 0.00021159663865546219, + "loss": 0.4345, + "step": 28260 + }, + { + "epoch": 15.78826815642458, + "grad_norm": 0.7462858557701111, + "learning_rate": 0.00021156862745098042, + "loss": 0.412, + "step": 28261 + }, + { + "epoch": 15.788826815642459, + "grad_norm": 1.8235708475112915, + "learning_rate": 0.0002115406162464986, + "loss": 0.4048, + "step": 28262 + }, + { + "epoch": 15.789385474860335, + "grad_norm": 0.5864009261131287, + "learning_rate": 0.0002115126050420168, + "loss": 0.3942, + "step": 28263 + }, + { + "epoch": 15.789944134078212, + "grad_norm": 0.4313289225101471, + "learning_rate": 0.000211484593837535, + "loss": 0.4062, + "step": 28264 + }, + { + "epoch": 15.79050279329609, + "grad_norm": 0.7372799515724182, + "learning_rate": 0.00021145658263305324, + "loss": 0.4278, + "step": 28265 + }, + { + "epoch": 15.791061452513967, + "grad_norm": 0.4913404583930969, + "learning_rate": 0.00021142857142857145, + "loss": 0.4769, + "step": 28266 + }, + { + "epoch": 15.791620111731843, + "grad_norm": 0.4150962233543396, + "learning_rate": 0.00021140056022408963, + "loss": 0.3899, + "step": 28267 + }, + { + "epoch": 15.79217877094972, + "grad_norm": 0.3714819550514221, + "learning_rate": 0.00021137254901960783, + "loss": 0.415, + "step": 28268 + }, + { + "epoch": 15.792737430167598, + "grad_norm": 0.44067323207855225, + "learning_rate": 0.00021134453781512607, + "loss": 0.3846, + "step": 28269 + }, + { + "epoch": 15.793296089385475, + "grad_norm": 0.48309043049812317, + "learning_rate": 0.00021131652661064427, + "loss": 0.4095, + "step": 28270 + }, + { + "epoch": 15.793854748603351, + "grad_norm": 0.9010888338088989, + "learning_rate": 0.00021128851540616248, + "loss": 0.6459, + "step": 28271 + }, + { + "epoch": 15.79441340782123, + "grad_norm": 0.463115394115448, + "learning_rate": 0.00021126050420168066, + "loss": 0.4062, + "step": 28272 + }, + { + "epoch": 15.794972067039106, + "grad_norm": 0.3948679566383362, + "learning_rate": 0.0002112324929971989, + "loss": 0.4239, + "step": 28273 + }, + { + "epoch": 15.795530726256983, + "grad_norm": 0.6633087396621704, + "learning_rate": 0.0002112044817927171, + "loss": 0.3746, + "step": 28274 + }, + { + "epoch": 15.796089385474861, + "grad_norm": 0.4883731007575989, + "learning_rate": 0.0002111764705882353, + "loss": 0.3441, + "step": 28275 + }, + { + "epoch": 15.796648044692738, + "grad_norm": 0.45647338032722473, + "learning_rate": 0.0002111484593837535, + "loss": 0.4526, + "step": 28276 + }, + { + "epoch": 15.797206703910614, + "grad_norm": 0.38070204854011536, + "learning_rate": 0.00021112044817927171, + "loss": 0.364, + "step": 28277 + }, + { + "epoch": 15.797765363128491, + "grad_norm": 0.4591529667377472, + "learning_rate": 0.00021109243697478992, + "loss": 0.377, + "step": 28278 + }, + { + "epoch": 15.79832402234637, + "grad_norm": 0.5029637217521667, + "learning_rate": 0.00021106442577030813, + "loss": 0.4278, + "step": 28279 + }, + { + "epoch": 15.798882681564246, + "grad_norm": 0.4572776257991791, + "learning_rate": 0.00021103641456582633, + "loss": 0.4632, + "step": 28280 + }, + { + "epoch": 15.799441340782122, + "grad_norm": 0.5541945695877075, + "learning_rate": 0.00021100840336134457, + "loss": 0.4269, + "step": 28281 + }, + { + "epoch": 15.8, + "grad_norm": 0.5428129434585571, + "learning_rate": 0.00021098039215686274, + "loss": 0.4126, + "step": 28282 + }, + { + "epoch": 15.800558659217877, + "grad_norm": 0.3842190206050873, + "learning_rate": 0.00021095238095238095, + "loss": 0.3171, + "step": 28283 + }, + { + "epoch": 15.801117318435754, + "grad_norm": 3.594968557357788, + "learning_rate": 0.00021092436974789916, + "loss": 0.3944, + "step": 28284 + }, + { + "epoch": 15.80167597765363, + "grad_norm": 0.45112496614456177, + "learning_rate": 0.0002108963585434174, + "loss": 0.3855, + "step": 28285 + }, + { + "epoch": 15.802234636871509, + "grad_norm": 0.7227235436439514, + "learning_rate": 0.0002108683473389356, + "loss": 0.5105, + "step": 28286 + }, + { + "epoch": 15.802793296089385, + "grad_norm": 0.685396671295166, + "learning_rate": 0.00021084033613445377, + "loss": 0.5296, + "step": 28287 + }, + { + "epoch": 15.803351955307262, + "grad_norm": 0.3453178405761719, + "learning_rate": 0.00021081232492997198, + "loss": 0.3888, + "step": 28288 + }, + { + "epoch": 15.80391061452514, + "grad_norm": 0.4115144908428192, + "learning_rate": 0.00021078431372549021, + "loss": 0.4003, + "step": 28289 + }, + { + "epoch": 15.804469273743017, + "grad_norm": 0.36292344331741333, + "learning_rate": 0.00021075630252100842, + "loss": 0.3397, + "step": 28290 + }, + { + "epoch": 15.805027932960893, + "grad_norm": 0.8145294189453125, + "learning_rate": 0.00021072829131652663, + "loss": 0.4216, + "step": 28291 + }, + { + "epoch": 15.805586592178772, + "grad_norm": 2.1440043449401855, + "learning_rate": 0.0002107002801120448, + "loss": 0.3865, + "step": 28292 + }, + { + "epoch": 15.806145251396648, + "grad_norm": 0.4146145284175873, + "learning_rate": 0.00021067226890756304, + "loss": 0.3604, + "step": 28293 + }, + { + "epoch": 15.806703910614525, + "grad_norm": 0.34399181604385376, + "learning_rate": 0.00021064425770308124, + "loss": 0.2795, + "step": 28294 + }, + { + "epoch": 15.807262569832401, + "grad_norm": 0.5868501663208008, + "learning_rate": 0.00021061624649859945, + "loss": 0.3938, + "step": 28295 + }, + { + "epoch": 15.80782122905028, + "grad_norm": 0.5085304975509644, + "learning_rate": 0.00021058823529411766, + "loss": 0.5137, + "step": 28296 + }, + { + "epoch": 15.808379888268156, + "grad_norm": 0.46111851930618286, + "learning_rate": 0.00021056022408963586, + "loss": 0.3755, + "step": 28297 + }, + { + "epoch": 15.808938547486033, + "grad_norm": 0.6453214287757874, + "learning_rate": 0.00021053221288515407, + "loss": 0.3752, + "step": 28298 + }, + { + "epoch": 15.809497206703911, + "grad_norm": 0.410430371761322, + "learning_rate": 0.00021050420168067227, + "loss": 0.2885, + "step": 28299 + }, + { + "epoch": 15.810055865921788, + "grad_norm": 0.40357017517089844, + "learning_rate": 0.00021047619047619048, + "loss": 0.3957, + "step": 28300 + }, + { + "epoch": 15.810614525139664, + "grad_norm": 0.5667738318443298, + "learning_rate": 0.0002104481792717087, + "loss": 0.3917, + "step": 28301 + }, + { + "epoch": 15.811173184357543, + "grad_norm": 8.480268478393555, + "learning_rate": 0.0002104201680672269, + "loss": 0.3927, + "step": 28302 + }, + { + "epoch": 15.81173184357542, + "grad_norm": 0.5042136907577515, + "learning_rate": 0.0002103921568627451, + "loss": 0.4054, + "step": 28303 + }, + { + "epoch": 15.812290502793296, + "grad_norm": 0.43359267711639404, + "learning_rate": 0.0002103641456582633, + "loss": 0.2964, + "step": 28304 + }, + { + "epoch": 15.812849162011172, + "grad_norm": 0.46846503019332886, + "learning_rate": 0.00021033613445378154, + "loss": 0.4553, + "step": 28305 + }, + { + "epoch": 15.81340782122905, + "grad_norm": 0.43995222449302673, + "learning_rate": 0.00021030812324929974, + "loss": 0.4142, + "step": 28306 + }, + { + "epoch": 15.813966480446927, + "grad_norm": 0.43687763810157776, + "learning_rate": 0.00021028011204481792, + "loss": 0.4013, + "step": 28307 + }, + { + "epoch": 15.814525139664804, + "grad_norm": 1.1201419830322266, + "learning_rate": 0.00021025210084033613, + "loss": 0.3124, + "step": 28308 + }, + { + "epoch": 15.815083798882682, + "grad_norm": 0.4475044310092926, + "learning_rate": 0.00021022408963585436, + "loss": 0.5161, + "step": 28309 + }, + { + "epoch": 15.815642458100559, + "grad_norm": 1.9716365337371826, + "learning_rate": 0.00021019607843137257, + "loss": 0.3639, + "step": 28310 + }, + { + "epoch": 15.816201117318435, + "grad_norm": 0.5290347933769226, + "learning_rate": 0.00021016806722689077, + "loss": 0.4378, + "step": 28311 + }, + { + "epoch": 15.816759776536312, + "grad_norm": 0.4921661615371704, + "learning_rate": 0.00021014005602240895, + "loss": 0.4831, + "step": 28312 + }, + { + "epoch": 15.81731843575419, + "grad_norm": 0.47815224528312683, + "learning_rate": 0.00021011204481792718, + "loss": 0.3903, + "step": 28313 + }, + { + "epoch": 15.817877094972067, + "grad_norm": 0.5091875195503235, + "learning_rate": 0.0002100840336134454, + "loss": 0.3688, + "step": 28314 + }, + { + "epoch": 15.818435754189943, + "grad_norm": 0.6837301254272461, + "learning_rate": 0.0002100560224089636, + "loss": 0.4595, + "step": 28315 + }, + { + "epoch": 15.818994413407822, + "grad_norm": 0.6364619135856628, + "learning_rate": 0.00021002801120448178, + "loss": 0.4679, + "step": 28316 + }, + { + "epoch": 15.819553072625698, + "grad_norm": 0.6363828778266907, + "learning_rate": 0.00021, + "loss": 0.4398, + "step": 28317 + }, + { + "epoch": 15.820111731843575, + "grad_norm": 0.40923407673835754, + "learning_rate": 0.00020997198879551821, + "loss": 0.3526, + "step": 28318 + }, + { + "epoch": 15.820670391061453, + "grad_norm": 0.6104234457015991, + "learning_rate": 0.00020994397759103642, + "loss": 0.4608, + "step": 28319 + }, + { + "epoch": 15.82122905027933, + "grad_norm": 1.1124622821807861, + "learning_rate": 0.00020991596638655463, + "loss": 0.3707, + "step": 28320 + }, + { + "epoch": 15.821787709497206, + "grad_norm": 0.427314817905426, + "learning_rate": 0.00020988795518207283, + "loss": 0.4528, + "step": 28321 + }, + { + "epoch": 15.822346368715085, + "grad_norm": 0.381928414106369, + "learning_rate": 0.00020985994397759104, + "loss": 0.3779, + "step": 28322 + }, + { + "epoch": 15.822905027932961, + "grad_norm": 0.4302935302257538, + "learning_rate": 0.00020983193277310924, + "loss": 0.3966, + "step": 28323 + }, + { + "epoch": 15.823463687150838, + "grad_norm": 0.6560373902320862, + "learning_rate": 0.00020980392156862745, + "loss": 0.4123, + "step": 28324 + }, + { + "epoch": 15.824022346368714, + "grad_norm": 0.47805914282798767, + "learning_rate": 0.00020977591036414568, + "loss": 0.402, + "step": 28325 + }, + { + "epoch": 15.824581005586593, + "grad_norm": 0.4818778336048126, + "learning_rate": 0.00020974789915966386, + "loss": 0.3634, + "step": 28326 + }, + { + "epoch": 15.82513966480447, + "grad_norm": 0.3727802634239197, + "learning_rate": 0.00020971988795518207, + "loss": 0.3497, + "step": 28327 + }, + { + "epoch": 15.825698324022346, + "grad_norm": 0.46278542280197144, + "learning_rate": 0.00020969187675070027, + "loss": 0.3682, + "step": 28328 + }, + { + "epoch": 15.826256983240224, + "grad_norm": 0.39895567297935486, + "learning_rate": 0.0002096638655462185, + "loss": 0.3823, + "step": 28329 + }, + { + "epoch": 15.8268156424581, + "grad_norm": 0.5496333241462708, + "learning_rate": 0.00020963585434173671, + "loss": 0.4496, + "step": 28330 + }, + { + "epoch": 15.827374301675977, + "grad_norm": 0.8188040256500244, + "learning_rate": 0.0002096078431372549, + "loss": 0.3558, + "step": 28331 + }, + { + "epoch": 15.827932960893854, + "grad_norm": 0.4682527780532837, + "learning_rate": 0.0002095798319327731, + "loss": 0.4711, + "step": 28332 + }, + { + "epoch": 15.828491620111732, + "grad_norm": 0.47295722365379333, + "learning_rate": 0.00020955182072829133, + "loss": 0.407, + "step": 28333 + }, + { + "epoch": 15.829050279329609, + "grad_norm": 0.5079707503318787, + "learning_rate": 0.00020952380952380954, + "loss": 0.3624, + "step": 28334 + }, + { + "epoch": 15.829608938547485, + "grad_norm": 0.3931850492954254, + "learning_rate": 0.00020949579831932774, + "loss": 0.3333, + "step": 28335 + }, + { + "epoch": 15.830167597765364, + "grad_norm": 0.36695098876953125, + "learning_rate": 0.00020946778711484592, + "loss": 0.3201, + "step": 28336 + }, + { + "epoch": 15.83072625698324, + "grad_norm": 0.4932332932949066, + "learning_rate": 0.00020943977591036416, + "loss": 0.4186, + "step": 28337 + }, + { + "epoch": 15.831284916201117, + "grad_norm": 0.37566468119621277, + "learning_rate": 0.00020941176470588236, + "loss": 0.3577, + "step": 28338 + }, + { + "epoch": 15.831843575418995, + "grad_norm": 0.4285581409931183, + "learning_rate": 0.00020938375350140057, + "loss": 0.406, + "step": 28339 + }, + { + "epoch": 15.832402234636872, + "grad_norm": 0.3985428810119629, + "learning_rate": 0.00020935574229691877, + "loss": 0.406, + "step": 28340 + }, + { + "epoch": 15.832960893854748, + "grad_norm": 1.2113806009292603, + "learning_rate": 0.00020932773109243698, + "loss": 0.4083, + "step": 28341 + }, + { + "epoch": 15.833519553072625, + "grad_norm": 1.4710288047790527, + "learning_rate": 0.00020929971988795519, + "loss": 0.4782, + "step": 28342 + }, + { + "epoch": 15.834078212290503, + "grad_norm": 0.350473552942276, + "learning_rate": 0.0002092717086834734, + "loss": 0.403, + "step": 28343 + }, + { + "epoch": 15.83463687150838, + "grad_norm": 0.4966563880443573, + "learning_rate": 0.0002092436974789916, + "loss": 0.4612, + "step": 28344 + }, + { + "epoch": 15.835195530726256, + "grad_norm": 0.3245506286621094, + "learning_rate": 0.00020921568627450983, + "loss": 0.3364, + "step": 28345 + }, + { + "epoch": 15.835754189944135, + "grad_norm": 0.3747156262397766, + "learning_rate": 0.000209187675070028, + "loss": 0.4897, + "step": 28346 + }, + { + "epoch": 15.836312849162011, + "grad_norm": 1.2024601697921753, + "learning_rate": 0.00020915966386554622, + "loss": 0.3126, + "step": 28347 + }, + { + "epoch": 15.836871508379888, + "grad_norm": 0.4942377805709839, + "learning_rate": 0.00020913165266106442, + "loss": 0.3626, + "step": 28348 + }, + { + "epoch": 15.837430167597766, + "grad_norm": 0.4200831949710846, + "learning_rate": 0.00020910364145658265, + "loss": 0.3739, + "step": 28349 + }, + { + "epoch": 15.837988826815643, + "grad_norm": 0.4163464307785034, + "learning_rate": 0.00020907563025210086, + "loss": 0.4032, + "step": 28350 + }, + { + "epoch": 15.83854748603352, + "grad_norm": 0.48109230399131775, + "learning_rate": 0.00020904761904761904, + "loss": 0.3262, + "step": 28351 + }, + { + "epoch": 15.839106145251396, + "grad_norm": 0.3733949661254883, + "learning_rate": 0.00020901960784313725, + "loss": 0.4544, + "step": 28352 + }, + { + "epoch": 15.839664804469274, + "grad_norm": 0.3351680040359497, + "learning_rate": 0.00020899159663865548, + "loss": 0.4556, + "step": 28353 + }, + { + "epoch": 15.84022346368715, + "grad_norm": 0.5531509518623352, + "learning_rate": 0.00020896358543417368, + "loss": 0.3882, + "step": 28354 + }, + { + "epoch": 15.840782122905027, + "grad_norm": 0.6404728889465332, + "learning_rate": 0.0002089355742296919, + "loss": 0.3724, + "step": 28355 + }, + { + "epoch": 15.841340782122906, + "grad_norm": 0.3734499216079712, + "learning_rate": 0.00020890756302521007, + "loss": 0.4391, + "step": 28356 + }, + { + "epoch": 15.841899441340782, + "grad_norm": 0.39347684383392334, + "learning_rate": 0.0002088795518207283, + "loss": 0.4276, + "step": 28357 + }, + { + "epoch": 15.842458100558659, + "grad_norm": 0.5413459539413452, + "learning_rate": 0.0002088515406162465, + "loss": 0.486, + "step": 28358 + }, + { + "epoch": 15.843016759776535, + "grad_norm": 1.31234610080719, + "learning_rate": 0.00020882352941176471, + "loss": 0.3833, + "step": 28359 + }, + { + "epoch": 15.843575418994414, + "grad_norm": 0.7396610975265503, + "learning_rate": 0.00020879551820728292, + "loss": 0.3721, + "step": 28360 + }, + { + "epoch": 15.84413407821229, + "grad_norm": 0.45909854769706726, + "learning_rate": 0.00020876750700280113, + "loss": 0.4952, + "step": 28361 + }, + { + "epoch": 15.844692737430167, + "grad_norm": 0.8502050042152405, + "learning_rate": 0.00020873949579831933, + "loss": 0.4997, + "step": 28362 + }, + { + "epoch": 15.845251396648045, + "grad_norm": 0.8959946036338806, + "learning_rate": 0.00020871148459383754, + "loss": 0.5762, + "step": 28363 + }, + { + "epoch": 15.845810055865922, + "grad_norm": 0.5829812288284302, + "learning_rate": 0.00020868347338935574, + "loss": 0.5075, + "step": 28364 + }, + { + "epoch": 15.846368715083798, + "grad_norm": 0.4223109185695648, + "learning_rate": 0.00020865546218487398, + "loss": 0.3932, + "step": 28365 + }, + { + "epoch": 15.846927374301677, + "grad_norm": 0.46028634905815125, + "learning_rate": 0.00020862745098039216, + "loss": 0.4617, + "step": 28366 + }, + { + "epoch": 15.847486033519553, + "grad_norm": 0.5849369764328003, + "learning_rate": 0.00020859943977591036, + "loss": 0.3527, + "step": 28367 + }, + { + "epoch": 15.84804469273743, + "grad_norm": 0.5332579612731934, + "learning_rate": 0.00020857142857142857, + "loss": 0.4275, + "step": 28368 + }, + { + "epoch": 15.848603351955306, + "grad_norm": 0.5800167322158813, + "learning_rate": 0.0002085434173669468, + "loss": 0.336, + "step": 28369 + }, + { + "epoch": 15.849162011173185, + "grad_norm": 0.33230382204055786, + "learning_rate": 0.00020851540616246498, + "loss": 0.366, + "step": 28370 + }, + { + "epoch": 15.849720670391061, + "grad_norm": 0.5483872890472412, + "learning_rate": 0.0002084873949579832, + "loss": 0.3585, + "step": 28371 + }, + { + "epoch": 15.850279329608938, + "grad_norm": 0.5005813837051392, + "learning_rate": 0.0002084593837535014, + "loss": 0.3808, + "step": 28372 + }, + { + "epoch": 15.850837988826816, + "grad_norm": 11.123309135437012, + "learning_rate": 0.00020843137254901963, + "loss": 0.408, + "step": 28373 + }, + { + "epoch": 15.851396648044693, + "grad_norm": 0.489734411239624, + "learning_rate": 0.00020840336134453783, + "loss": 0.4529, + "step": 28374 + }, + { + "epoch": 15.85195530726257, + "grad_norm": 0.36484530568122864, + "learning_rate": 0.000208375350140056, + "loss": 0.3717, + "step": 28375 + }, + { + "epoch": 15.852513966480448, + "grad_norm": 0.4622708559036255, + "learning_rate": 0.00020834733893557422, + "loss": 0.4871, + "step": 28376 + }, + { + "epoch": 15.853072625698324, + "grad_norm": 0.4826257526874542, + "learning_rate": 0.00020831932773109245, + "loss": 0.4698, + "step": 28377 + }, + { + "epoch": 15.8536312849162, + "grad_norm": 0.4868282377719879, + "learning_rate": 0.00020829131652661066, + "loss": 0.4612, + "step": 28378 + }, + { + "epoch": 15.854189944134077, + "grad_norm": 0.4474319517612457, + "learning_rate": 0.00020826330532212886, + "loss": 0.5293, + "step": 28379 + }, + { + "epoch": 15.854748603351956, + "grad_norm": 0.6853349208831787, + "learning_rate": 0.00020823529411764704, + "loss": 0.3449, + "step": 28380 + }, + { + "epoch": 15.855307262569832, + "grad_norm": 0.5232422351837158, + "learning_rate": 0.00020820728291316527, + "loss": 0.4543, + "step": 28381 + }, + { + "epoch": 15.855865921787709, + "grad_norm": 0.4874227046966553, + "learning_rate": 0.00020817927170868348, + "loss": 0.44, + "step": 28382 + }, + { + "epoch": 15.856424581005587, + "grad_norm": 0.3655456006526947, + "learning_rate": 0.00020815126050420169, + "loss": 0.3741, + "step": 28383 + }, + { + "epoch": 15.856983240223464, + "grad_norm": 0.3759661912918091, + "learning_rate": 0.0002081232492997199, + "loss": 0.3301, + "step": 28384 + }, + { + "epoch": 15.85754189944134, + "grad_norm": 0.6729076504707336, + "learning_rate": 0.0002080952380952381, + "loss": 0.4547, + "step": 28385 + }, + { + "epoch": 15.858100558659217, + "grad_norm": 0.5717796087265015, + "learning_rate": 0.0002080672268907563, + "loss": 0.3132, + "step": 28386 + }, + { + "epoch": 15.858659217877095, + "grad_norm": 0.616753876209259, + "learning_rate": 0.0002080392156862745, + "loss": 0.4034, + "step": 28387 + }, + { + "epoch": 15.859217877094972, + "grad_norm": 0.48488298058509827, + "learning_rate": 0.00020801120448179272, + "loss": 0.4667, + "step": 28388 + }, + { + "epoch": 15.859776536312848, + "grad_norm": 0.3956802785396576, + "learning_rate": 0.00020798319327731095, + "loss": 0.3445, + "step": 28389 + }, + { + "epoch": 15.860335195530727, + "grad_norm": 0.3456871211528778, + "learning_rate": 0.00020795518207282913, + "loss": 0.3577, + "step": 28390 + }, + { + "epoch": 15.860893854748603, + "grad_norm": 0.4096421003341675, + "learning_rate": 0.00020792717086834733, + "loss": 0.4156, + "step": 28391 + }, + { + "epoch": 15.86145251396648, + "grad_norm": 0.385881632566452, + "learning_rate": 0.00020789915966386554, + "loss": 0.3469, + "step": 28392 + }, + { + "epoch": 15.862011173184358, + "grad_norm": 0.3828909397125244, + "learning_rate": 0.00020787114845938377, + "loss": 0.3857, + "step": 28393 + }, + { + "epoch": 15.862569832402235, + "grad_norm": 0.36904406547546387, + "learning_rate": 0.00020784313725490198, + "loss": 0.3603, + "step": 28394 + }, + { + "epoch": 15.863128491620111, + "grad_norm": 0.3796258568763733, + "learning_rate": 0.00020781512605042016, + "loss": 0.3879, + "step": 28395 + }, + { + "epoch": 15.86368715083799, + "grad_norm": 0.3728046417236328, + "learning_rate": 0.00020778711484593836, + "loss": 0.4918, + "step": 28396 + }, + { + "epoch": 15.864245810055866, + "grad_norm": 1.1431180238723755, + "learning_rate": 0.0002077591036414566, + "loss": 0.4027, + "step": 28397 + }, + { + "epoch": 15.864804469273743, + "grad_norm": 2.3157832622528076, + "learning_rate": 0.0002077310924369748, + "loss": 0.4078, + "step": 28398 + }, + { + "epoch": 15.86536312849162, + "grad_norm": 0.3106030225753784, + "learning_rate": 0.000207703081232493, + "loss": 0.3404, + "step": 28399 + }, + { + "epoch": 15.865921787709498, + "grad_norm": 0.6065434813499451, + "learning_rate": 0.0002076750700280112, + "loss": 0.7019, + "step": 28400 + }, + { + "epoch": 15.866480446927374, + "grad_norm": 0.46118006110191345, + "learning_rate": 0.00020764705882352942, + "loss": 0.3393, + "step": 28401 + }, + { + "epoch": 15.867039106145251, + "grad_norm": 0.5767713785171509, + "learning_rate": 0.00020761904761904763, + "loss": 0.4381, + "step": 28402 + }, + { + "epoch": 15.86759776536313, + "grad_norm": 0.4891776740550995, + "learning_rate": 0.00020759103641456583, + "loss": 0.3676, + "step": 28403 + }, + { + "epoch": 15.868156424581006, + "grad_norm": 0.3744615912437439, + "learning_rate": 0.00020756302521008404, + "loss": 0.3022, + "step": 28404 + }, + { + "epoch": 15.868715083798882, + "grad_norm": 0.7736131548881531, + "learning_rate": 0.00020753501400560224, + "loss": 0.4942, + "step": 28405 + }, + { + "epoch": 15.869273743016759, + "grad_norm": 1.4315904378890991, + "learning_rate": 0.00020750700280112045, + "loss": 0.3647, + "step": 28406 + }, + { + "epoch": 15.869832402234637, + "grad_norm": 0.3626892864704132, + "learning_rate": 0.00020747899159663866, + "loss": 0.4165, + "step": 28407 + }, + { + "epoch": 15.870391061452514, + "grad_norm": 0.7442923784255981, + "learning_rate": 0.00020745098039215686, + "loss": 0.4806, + "step": 28408 + }, + { + "epoch": 15.87094972067039, + "grad_norm": 2.0274085998535156, + "learning_rate": 0.0002074229691876751, + "loss": 0.3876, + "step": 28409 + }, + { + "epoch": 15.871508379888269, + "grad_norm": 0.5515741109848022, + "learning_rate": 0.00020739495798319327, + "loss": 0.3988, + "step": 28410 + }, + { + "epoch": 15.872067039106145, + "grad_norm": 0.44420647621154785, + "learning_rate": 0.00020736694677871148, + "loss": 0.4475, + "step": 28411 + }, + { + "epoch": 15.872625698324022, + "grad_norm": 0.42270877957344055, + "learning_rate": 0.0002073389355742297, + "loss": 0.4018, + "step": 28412 + }, + { + "epoch": 15.8731843575419, + "grad_norm": 0.8761048913002014, + "learning_rate": 0.00020731092436974792, + "loss": 0.3788, + "step": 28413 + }, + { + "epoch": 15.873743016759777, + "grad_norm": 0.7267293930053711, + "learning_rate": 0.00020728291316526613, + "loss": 0.4004, + "step": 28414 + }, + { + "epoch": 15.874301675977653, + "grad_norm": 0.4068256616592407, + "learning_rate": 0.0002072549019607843, + "loss": 0.3254, + "step": 28415 + }, + { + "epoch": 15.87486033519553, + "grad_norm": 0.4016488194465637, + "learning_rate": 0.0002072268907563025, + "loss": 0.4844, + "step": 28416 + }, + { + "epoch": 15.875418994413408, + "grad_norm": 0.3279385566711426, + "learning_rate": 0.00020719887955182074, + "loss": 0.3282, + "step": 28417 + }, + { + "epoch": 15.875977653631285, + "grad_norm": 0.5887762904167175, + "learning_rate": 0.00020717086834733895, + "loss": 0.4306, + "step": 28418 + }, + { + "epoch": 15.876536312849161, + "grad_norm": 0.350024938583374, + "learning_rate": 0.00020714285714285716, + "loss": 0.3529, + "step": 28419 + }, + { + "epoch": 15.87709497206704, + "grad_norm": 0.5459478497505188, + "learning_rate": 0.00020711484593837533, + "loss": 0.3658, + "step": 28420 + }, + { + "epoch": 15.877653631284916, + "grad_norm": 0.6664366722106934, + "learning_rate": 0.00020708683473389357, + "loss": 0.3459, + "step": 28421 + }, + { + "epoch": 15.878212290502793, + "grad_norm": 15.537731170654297, + "learning_rate": 0.00020705882352941177, + "loss": 0.4829, + "step": 28422 + }, + { + "epoch": 15.878770949720671, + "grad_norm": 0.4693450927734375, + "learning_rate": 0.00020703081232492998, + "loss": 0.3265, + "step": 28423 + }, + { + "epoch": 15.879329608938548, + "grad_norm": 0.46675968170166016, + "learning_rate": 0.00020700280112044816, + "loss": 0.3966, + "step": 28424 + }, + { + "epoch": 15.879888268156424, + "grad_norm": 0.39274975657463074, + "learning_rate": 0.0002069747899159664, + "loss": 0.3731, + "step": 28425 + }, + { + "epoch": 15.880446927374301, + "grad_norm": 0.6091746687889099, + "learning_rate": 0.0002069467787114846, + "loss": 0.4976, + "step": 28426 + }, + { + "epoch": 15.88100558659218, + "grad_norm": 0.4482857882976532, + "learning_rate": 0.0002069187675070028, + "loss": 0.3346, + "step": 28427 + }, + { + "epoch": 15.881564245810056, + "grad_norm": 1.0685772895812988, + "learning_rate": 0.000206890756302521, + "loss": 0.4871, + "step": 28428 + }, + { + "epoch": 15.882122905027932, + "grad_norm": 0.57100909948349, + "learning_rate": 0.00020686274509803922, + "loss": 0.3607, + "step": 28429 + }, + { + "epoch": 15.88268156424581, + "grad_norm": 0.440228134393692, + "learning_rate": 0.00020683473389355742, + "loss": 0.4409, + "step": 28430 + }, + { + "epoch": 15.883240223463687, + "grad_norm": 1.2606966495513916, + "learning_rate": 0.00020680672268907563, + "loss": 0.6222, + "step": 28431 + }, + { + "epoch": 15.883798882681564, + "grad_norm": 0.5349351763725281, + "learning_rate": 0.00020677871148459383, + "loss": 0.3509, + "step": 28432 + }, + { + "epoch": 15.88435754189944, + "grad_norm": 0.33402013778686523, + "learning_rate": 0.00020675070028011207, + "loss": 0.3488, + "step": 28433 + }, + { + "epoch": 15.884916201117319, + "grad_norm": 2.497631072998047, + "learning_rate": 0.00020672268907563025, + "loss": 0.3966, + "step": 28434 + }, + { + "epoch": 15.885474860335195, + "grad_norm": 0.5972374677658081, + "learning_rate": 0.00020669467787114845, + "loss": 0.616, + "step": 28435 + }, + { + "epoch": 15.886033519553072, + "grad_norm": 0.5541279315948486, + "learning_rate": 0.00020666666666666666, + "loss": 0.3521, + "step": 28436 + }, + { + "epoch": 15.88659217877095, + "grad_norm": 0.8467921018600464, + "learning_rate": 0.0002066386554621849, + "loss": 0.2775, + "step": 28437 + }, + { + "epoch": 15.887150837988827, + "grad_norm": 0.6436362266540527, + "learning_rate": 0.0002066106442577031, + "loss": 0.3813, + "step": 28438 + }, + { + "epoch": 15.887709497206703, + "grad_norm": 0.44945985078811646, + "learning_rate": 0.00020658263305322128, + "loss": 0.5514, + "step": 28439 + }, + { + "epoch": 15.888268156424582, + "grad_norm": 0.3350696563720703, + "learning_rate": 0.00020655462184873948, + "loss": 0.3674, + "step": 28440 + }, + { + "epoch": 15.888826815642458, + "grad_norm": 0.3166445195674896, + "learning_rate": 0.00020652661064425771, + "loss": 0.3327, + "step": 28441 + }, + { + "epoch": 15.889385474860335, + "grad_norm": 0.6495802402496338, + "learning_rate": 0.00020649859943977592, + "loss": 0.4819, + "step": 28442 + }, + { + "epoch": 15.889944134078211, + "grad_norm": 2.76810359954834, + "learning_rate": 0.00020647058823529413, + "loss": 0.5147, + "step": 28443 + }, + { + "epoch": 15.89050279329609, + "grad_norm": 0.5136860013008118, + "learning_rate": 0.0002064425770308123, + "loss": 0.3636, + "step": 28444 + }, + { + "epoch": 15.891061452513966, + "grad_norm": 0.43074607849121094, + "learning_rate": 0.00020641456582633054, + "loss": 0.4142, + "step": 28445 + }, + { + "epoch": 15.891620111731843, + "grad_norm": 0.4100888967514038, + "learning_rate": 0.00020638655462184874, + "loss": 0.4511, + "step": 28446 + }, + { + "epoch": 15.892178770949721, + "grad_norm": 0.6936303377151489, + "learning_rate": 0.00020635854341736695, + "loss": 0.4032, + "step": 28447 + }, + { + "epoch": 15.892737430167598, + "grad_norm": 0.38012710213661194, + "learning_rate": 0.00020633053221288516, + "loss": 0.3854, + "step": 28448 + }, + { + "epoch": 15.893296089385474, + "grad_norm": 0.7620035409927368, + "learning_rate": 0.00020630252100840336, + "loss": 0.4582, + "step": 28449 + }, + { + "epoch": 15.893854748603353, + "grad_norm": 0.4981892704963684, + "learning_rate": 0.00020627450980392157, + "loss": 0.569, + "step": 28450 + }, + { + "epoch": 15.89441340782123, + "grad_norm": 0.5805944800376892, + "learning_rate": 0.00020624649859943977, + "loss": 0.474, + "step": 28451 + }, + { + "epoch": 15.894972067039106, + "grad_norm": 0.37508833408355713, + "learning_rate": 0.00020621848739495798, + "loss": 0.4374, + "step": 28452 + }, + { + "epoch": 15.895530726256982, + "grad_norm": 1.0173367261886597, + "learning_rate": 0.00020619047619047621, + "loss": 0.4204, + "step": 28453 + }, + { + "epoch": 15.89608938547486, + "grad_norm": 0.7939687967300415, + "learning_rate": 0.0002061624649859944, + "loss": 0.365, + "step": 28454 + }, + { + "epoch": 15.896648044692737, + "grad_norm": 0.8371119499206543, + "learning_rate": 0.0002061344537815126, + "loss": 0.4268, + "step": 28455 + }, + { + "epoch": 15.897206703910614, + "grad_norm": 0.4625296890735626, + "learning_rate": 0.0002061064425770308, + "loss": 0.4179, + "step": 28456 + }, + { + "epoch": 15.897765363128492, + "grad_norm": 0.34698665142059326, + "learning_rate": 0.00020607843137254904, + "loss": 0.3675, + "step": 28457 + }, + { + "epoch": 15.898324022346369, + "grad_norm": 6.101527690887451, + "learning_rate": 0.00020605042016806724, + "loss": 0.4231, + "step": 28458 + }, + { + "epoch": 15.898882681564245, + "grad_norm": 0.5622072815895081, + "learning_rate": 0.00020602240896358542, + "loss": 0.4838, + "step": 28459 + }, + { + "epoch": 15.899441340782122, + "grad_norm": 0.924141526222229, + "learning_rate": 0.00020599439775910363, + "loss": 0.3551, + "step": 28460 + }, + { + "epoch": 15.9, + "grad_norm": 4.007843017578125, + "learning_rate": 0.00020596638655462186, + "loss": 0.4115, + "step": 28461 + }, + { + "epoch": 15.900558659217877, + "grad_norm": 1.081787347793579, + "learning_rate": 0.00020593837535014007, + "loss": 0.6797, + "step": 28462 + }, + { + "epoch": 15.901117318435753, + "grad_norm": 0.4717196822166443, + "learning_rate": 0.00020591036414565827, + "loss": 0.2993, + "step": 28463 + }, + { + "epoch": 15.901675977653632, + "grad_norm": 0.9129984974861145, + "learning_rate": 0.00020588235294117645, + "loss": 0.5495, + "step": 28464 + }, + { + "epoch": 15.902234636871508, + "grad_norm": 0.7169912457466125, + "learning_rate": 0.00020585434173669469, + "loss": 0.5595, + "step": 28465 + }, + { + "epoch": 15.902793296089385, + "grad_norm": 0.4903715252876282, + "learning_rate": 0.0002058263305322129, + "loss": 0.4882, + "step": 28466 + }, + { + "epoch": 15.903351955307263, + "grad_norm": 0.5238422155380249, + "learning_rate": 0.0002057983193277311, + "loss": 0.4049, + "step": 28467 + }, + { + "epoch": 15.90391061452514, + "grad_norm": 0.6813641786575317, + "learning_rate": 0.0002057703081232493, + "loss": 0.3379, + "step": 28468 + }, + { + "epoch": 15.904469273743016, + "grad_norm": 0.9063427448272705, + "learning_rate": 0.0002057422969187675, + "loss": 0.4149, + "step": 28469 + }, + { + "epoch": 15.905027932960895, + "grad_norm": 0.43170052766799927, + "learning_rate": 0.00020571428571428572, + "loss": 0.3482, + "step": 28470 + }, + { + "epoch": 15.905586592178771, + "grad_norm": 0.3269258141517639, + "learning_rate": 0.00020568627450980392, + "loss": 0.3935, + "step": 28471 + }, + { + "epoch": 15.906145251396648, + "grad_norm": 0.5334846377372742, + "learning_rate": 0.00020565826330532213, + "loss": 0.3952, + "step": 28472 + }, + { + "epoch": 15.906703910614524, + "grad_norm": 0.8405916094779968, + "learning_rate": 0.00020563025210084036, + "loss": 0.4517, + "step": 28473 + }, + { + "epoch": 15.907262569832403, + "grad_norm": 0.6547182202339172, + "learning_rate": 0.00020560224089635854, + "loss": 0.4063, + "step": 28474 + }, + { + "epoch": 15.90782122905028, + "grad_norm": 0.45974281430244446, + "learning_rate": 0.00020557422969187675, + "loss": 0.3768, + "step": 28475 + }, + { + "epoch": 15.908379888268156, + "grad_norm": 2.6047375202178955, + "learning_rate": 0.00020554621848739495, + "loss": 0.4796, + "step": 28476 + }, + { + "epoch": 15.908938547486034, + "grad_norm": 1.2541228532791138, + "learning_rate": 0.00020551820728291318, + "loss": 0.4766, + "step": 28477 + }, + { + "epoch": 15.90949720670391, + "grad_norm": 1.216201901435852, + "learning_rate": 0.0002054901960784314, + "loss": 0.485, + "step": 28478 + }, + { + "epoch": 15.910055865921787, + "grad_norm": 16.248367309570312, + "learning_rate": 0.00020546218487394957, + "loss": 0.3789, + "step": 28479 + }, + { + "epoch": 15.910614525139664, + "grad_norm": 0.6656233072280884, + "learning_rate": 0.00020543417366946778, + "loss": 0.4885, + "step": 28480 + }, + { + "epoch": 15.911173184357542, + "grad_norm": 0.43571439385414124, + "learning_rate": 0.000205406162464986, + "loss": 0.4797, + "step": 28481 + }, + { + "epoch": 15.911731843575419, + "grad_norm": 1.9898213148117065, + "learning_rate": 0.00020537815126050421, + "loss": 0.423, + "step": 28482 + }, + { + "epoch": 15.912290502793295, + "grad_norm": 0.5237147212028503, + "learning_rate": 0.0002053501400560224, + "loss": 0.3472, + "step": 28483 + }, + { + "epoch": 15.912849162011174, + "grad_norm": 0.4679156541824341, + "learning_rate": 0.0002053221288515406, + "loss": 0.5599, + "step": 28484 + }, + { + "epoch": 15.91340782122905, + "grad_norm": 0.5738799571990967, + "learning_rate": 0.00020529411764705883, + "loss": 0.5695, + "step": 28485 + }, + { + "epoch": 15.913966480446927, + "grad_norm": 0.385433167219162, + "learning_rate": 0.00020526610644257704, + "loss": 0.4009, + "step": 28486 + }, + { + "epoch": 15.914525139664804, + "grad_norm": 0.4707374572753906, + "learning_rate": 0.00020523809523809524, + "loss": 0.4851, + "step": 28487 + }, + { + "epoch": 15.915083798882682, + "grad_norm": 0.5555111169815063, + "learning_rate": 0.00020521008403361342, + "loss": 0.4917, + "step": 28488 + }, + { + "epoch": 15.915642458100558, + "grad_norm": 0.4589138627052307, + "learning_rate": 0.00020518207282913166, + "loss": 0.3896, + "step": 28489 + }, + { + "epoch": 15.916201117318435, + "grad_norm": 0.42126110196113586, + "learning_rate": 0.00020515406162464986, + "loss": 0.4569, + "step": 28490 + }, + { + "epoch": 15.916759776536313, + "grad_norm": 0.4824746251106262, + "learning_rate": 0.00020512605042016807, + "loss": 0.3677, + "step": 28491 + }, + { + "epoch": 15.91731843575419, + "grad_norm": 0.6603912711143494, + "learning_rate": 0.0002050980392156863, + "loss": 0.4111, + "step": 28492 + }, + { + "epoch": 15.917877094972066, + "grad_norm": 1.5749597549438477, + "learning_rate": 0.00020507002801120448, + "loss": 0.3415, + "step": 28493 + }, + { + "epoch": 15.918435754189945, + "grad_norm": 0.4526945650577545, + "learning_rate": 0.0002050420168067227, + "loss": 0.2991, + "step": 28494 + }, + { + "epoch": 15.918994413407821, + "grad_norm": 0.4534764289855957, + "learning_rate": 0.0002050140056022409, + "loss": 0.3299, + "step": 28495 + }, + { + "epoch": 15.919553072625698, + "grad_norm": 0.5210875272750854, + "learning_rate": 0.00020498599439775913, + "loss": 0.4253, + "step": 28496 + }, + { + "epoch": 15.920111731843576, + "grad_norm": 0.5250996351242065, + "learning_rate": 0.00020495798319327733, + "loss": 0.4201, + "step": 28497 + }, + { + "epoch": 15.920670391061453, + "grad_norm": 0.6987993717193604, + "learning_rate": 0.0002049299719887955, + "loss": 0.5055, + "step": 28498 + }, + { + "epoch": 15.92122905027933, + "grad_norm": 0.9810032844543457, + "learning_rate": 0.00020490196078431372, + "loss": 0.3642, + "step": 28499 + }, + { + "epoch": 15.921787709497206, + "grad_norm": 0.4533756971359253, + "learning_rate": 0.00020487394957983195, + "loss": 0.3648, + "step": 28500 + }, + { + "epoch": 15.921787709497206, + "eval_cer": 0.08613466007659323, + "eval_loss": 0.3235432505607605, + "eval_runtime": 58.068, + "eval_samples_per_second": 78.15, + "eval_steps_per_second": 4.891, + "eval_wer": 0.3415547051758331, + "step": 28500 + }, + { + "epoch": 15.922346368715084, + "grad_norm": 0.5654794573783875, + "learning_rate": 0.00020484593837535016, + "loss": 0.426, + "step": 28501 + }, + { + "epoch": 15.922905027932961, + "grad_norm": 0.5199263095855713, + "learning_rate": 0.00020481792717086836, + "loss": 0.3793, + "step": 28502 + }, + { + "epoch": 15.923463687150837, + "grad_norm": 2.2113380432128906, + "learning_rate": 0.00020478991596638654, + "loss": 0.4895, + "step": 28503 + }, + { + "epoch": 15.924022346368716, + "grad_norm": 0.6097524166107178, + "learning_rate": 0.00020476190476190477, + "loss": 0.4814, + "step": 28504 + }, + { + "epoch": 15.924581005586592, + "grad_norm": 0.6493017077445984, + "learning_rate": 0.00020473389355742298, + "loss": 0.3894, + "step": 28505 + }, + { + "epoch": 15.925139664804469, + "grad_norm": 0.4637574255466461, + "learning_rate": 0.00020470588235294119, + "loss": 0.3997, + "step": 28506 + }, + { + "epoch": 15.925698324022346, + "grad_norm": 0.4985971450805664, + "learning_rate": 0.0002046778711484594, + "loss": 0.4104, + "step": 28507 + }, + { + "epoch": 15.926256983240224, + "grad_norm": 1.911176323890686, + "learning_rate": 0.0002046498599439776, + "loss": 0.3783, + "step": 28508 + }, + { + "epoch": 15.9268156424581, + "grad_norm": 0.5623754262924194, + "learning_rate": 0.0002046218487394958, + "loss": 0.4578, + "step": 28509 + }, + { + "epoch": 15.927374301675977, + "grad_norm": 0.7828503847122192, + "learning_rate": 0.000204593837535014, + "loss": 0.6944, + "step": 28510 + }, + { + "epoch": 15.927932960893855, + "grad_norm": 0.3718717694282532, + "learning_rate": 0.00020456582633053222, + "loss": 0.3955, + "step": 28511 + }, + { + "epoch": 15.928491620111732, + "grad_norm": 0.6445087790489197, + "learning_rate": 0.00020453781512605045, + "loss": 0.6679, + "step": 28512 + }, + { + "epoch": 15.929050279329608, + "grad_norm": 0.47356128692626953, + "learning_rate": 0.00020450980392156863, + "loss": 0.3253, + "step": 28513 + }, + { + "epoch": 15.929608938547487, + "grad_norm": 0.4766961932182312, + "learning_rate": 0.00020448179271708683, + "loss": 0.544, + "step": 28514 + }, + { + "epoch": 15.930167597765363, + "grad_norm": 0.8592237830162048, + "learning_rate": 0.00020445378151260504, + "loss": 0.374, + "step": 28515 + }, + { + "epoch": 15.93072625698324, + "grad_norm": 1.002304196357727, + "learning_rate": 0.00020442577030812327, + "loss": 0.4401, + "step": 28516 + }, + { + "epoch": 15.931284916201117, + "grad_norm": 0.5882537961006165, + "learning_rate": 0.00020439775910364148, + "loss": 0.375, + "step": 28517 + }, + { + "epoch": 15.931843575418995, + "grad_norm": 2.5620806217193604, + "learning_rate": 0.00020436974789915966, + "loss": 0.505, + "step": 28518 + }, + { + "epoch": 15.932402234636871, + "grad_norm": 0.40555885434150696, + "learning_rate": 0.00020434173669467786, + "loss": 0.4169, + "step": 28519 + }, + { + "epoch": 15.932960893854748, + "grad_norm": 0.6523237824440002, + "learning_rate": 0.0002043137254901961, + "loss": 0.4334, + "step": 28520 + }, + { + "epoch": 15.933519553072626, + "grad_norm": 0.5478083491325378, + "learning_rate": 0.0002042857142857143, + "loss": 0.5454, + "step": 28521 + }, + { + "epoch": 15.934078212290503, + "grad_norm": 0.4292314052581787, + "learning_rate": 0.0002042577030812325, + "loss": 0.3919, + "step": 28522 + }, + { + "epoch": 15.93463687150838, + "grad_norm": 0.4856822192668915, + "learning_rate": 0.0002042296918767507, + "loss": 0.3842, + "step": 28523 + }, + { + "epoch": 15.935195530726258, + "grad_norm": 0.4929467439651489, + "learning_rate": 0.00020420168067226892, + "loss": 0.4751, + "step": 28524 + }, + { + "epoch": 15.935754189944134, + "grad_norm": 0.3763562738895416, + "learning_rate": 0.00020417366946778713, + "loss": 0.3052, + "step": 28525 + }, + { + "epoch": 15.936312849162011, + "grad_norm": 0.484101265668869, + "learning_rate": 0.00020414565826330533, + "loss": 0.4347, + "step": 28526 + }, + { + "epoch": 15.936871508379888, + "grad_norm": 0.5910486578941345, + "learning_rate": 0.00020411764705882354, + "loss": 0.4826, + "step": 28527 + }, + { + "epoch": 15.937430167597766, + "grad_norm": 0.660768985748291, + "learning_rate": 0.00020408963585434174, + "loss": 0.3767, + "step": 28528 + }, + { + "epoch": 15.937988826815642, + "grad_norm": 0.47091758251190186, + "learning_rate": 0.00020406162464985995, + "loss": 0.5084, + "step": 28529 + }, + { + "epoch": 15.938547486033519, + "grad_norm": 0.46020060777664185, + "learning_rate": 0.00020403361344537816, + "loss": 0.3624, + "step": 28530 + }, + { + "epoch": 15.939106145251397, + "grad_norm": 0.4867001175880432, + "learning_rate": 0.00020400560224089636, + "loss": 0.3611, + "step": 28531 + }, + { + "epoch": 15.939664804469274, + "grad_norm": 0.392189621925354, + "learning_rate": 0.0002039775910364146, + "loss": 0.4259, + "step": 28532 + }, + { + "epoch": 15.94022346368715, + "grad_norm": 0.4954374134540558, + "learning_rate": 0.00020394957983193277, + "loss": 0.4123, + "step": 28533 + }, + { + "epoch": 15.940782122905027, + "grad_norm": 0.7099990844726562, + "learning_rate": 0.00020392156862745098, + "loss": 0.4378, + "step": 28534 + }, + { + "epoch": 15.941340782122905, + "grad_norm": 4.950645446777344, + "learning_rate": 0.0002038935574229692, + "loss": 0.5561, + "step": 28535 + }, + { + "epoch": 15.941899441340782, + "grad_norm": 0.4951377511024475, + "learning_rate": 0.00020386554621848742, + "loss": 0.3797, + "step": 28536 + }, + { + "epoch": 15.942458100558659, + "grad_norm": 0.953783392906189, + "learning_rate": 0.0002038375350140056, + "loss": 0.3479, + "step": 28537 + }, + { + "epoch": 15.943016759776537, + "grad_norm": 0.38955867290496826, + "learning_rate": 0.0002038095238095238, + "loss": 0.388, + "step": 28538 + }, + { + "epoch": 15.943575418994413, + "grad_norm": 0.6025633215904236, + "learning_rate": 0.000203781512605042, + "loss": 0.4626, + "step": 28539 + }, + { + "epoch": 15.94413407821229, + "grad_norm": 0.39215004444122314, + "learning_rate": 0.00020375350140056024, + "loss": 0.3648, + "step": 28540 + }, + { + "epoch": 15.944692737430168, + "grad_norm": 0.416963130235672, + "learning_rate": 0.00020372549019607845, + "loss": 0.4433, + "step": 28541 + }, + { + "epoch": 15.945251396648045, + "grad_norm": 0.6851239800453186, + "learning_rate": 0.00020369747899159663, + "loss": 0.5279, + "step": 28542 + }, + { + "epoch": 15.945810055865921, + "grad_norm": 0.37526416778564453, + "learning_rate": 0.00020366946778711483, + "loss": 0.3689, + "step": 28543 + }, + { + "epoch": 15.946368715083798, + "grad_norm": 0.4368479251861572, + "learning_rate": 0.00020364145658263307, + "loss": 0.4656, + "step": 28544 + }, + { + "epoch": 15.946927374301676, + "grad_norm": 0.9038490653038025, + "learning_rate": 0.00020361344537815127, + "loss": 0.5051, + "step": 28545 + }, + { + "epoch": 15.947486033519553, + "grad_norm": 0.7822328805923462, + "learning_rate": 0.00020358543417366948, + "loss": 0.4648, + "step": 28546 + }, + { + "epoch": 15.94804469273743, + "grad_norm": 1.0376384258270264, + "learning_rate": 0.00020355742296918766, + "loss": 0.5006, + "step": 28547 + }, + { + "epoch": 15.948603351955308, + "grad_norm": 1.508028507232666, + "learning_rate": 0.0002035294117647059, + "loss": 0.4648, + "step": 28548 + }, + { + "epoch": 15.949162011173184, + "grad_norm": 0.49336063861846924, + "learning_rate": 0.0002035014005602241, + "loss": 0.3744, + "step": 28549 + }, + { + "epoch": 15.949720670391061, + "grad_norm": 1.7102192640304565, + "learning_rate": 0.0002034733893557423, + "loss": 0.3585, + "step": 28550 + }, + { + "epoch": 15.95027932960894, + "grad_norm": 0.5059553384780884, + "learning_rate": 0.0002034453781512605, + "loss": 0.4075, + "step": 28551 + }, + { + "epoch": 15.950837988826816, + "grad_norm": 0.35427260398864746, + "learning_rate": 0.00020341736694677872, + "loss": 0.3601, + "step": 28552 + }, + { + "epoch": 15.951396648044692, + "grad_norm": 0.5888104438781738, + "learning_rate": 0.00020338935574229692, + "loss": 0.5204, + "step": 28553 + }, + { + "epoch": 15.951955307262569, + "grad_norm": 1.101461410522461, + "learning_rate": 0.00020336134453781513, + "loss": 0.2755, + "step": 28554 + }, + { + "epoch": 15.952513966480447, + "grad_norm": 0.36998382210731506, + "learning_rate": 0.00020333333333333333, + "loss": 0.4071, + "step": 28555 + }, + { + "epoch": 15.953072625698324, + "grad_norm": 0.44662296772003174, + "learning_rate": 0.00020330532212885157, + "loss": 0.4485, + "step": 28556 + }, + { + "epoch": 15.9536312849162, + "grad_norm": 0.4298152029514313, + "learning_rate": 0.00020327731092436975, + "loss": 0.5044, + "step": 28557 + }, + { + "epoch": 15.954189944134079, + "grad_norm": 0.7141160368919373, + "learning_rate": 0.00020324929971988795, + "loss": 0.3431, + "step": 28558 + }, + { + "epoch": 15.954748603351955, + "grad_norm": 0.35678035020828247, + "learning_rate": 0.00020322128851540616, + "loss": 0.2986, + "step": 28559 + }, + { + "epoch": 15.955307262569832, + "grad_norm": 0.435062438249588, + "learning_rate": 0.0002031932773109244, + "loss": 0.3811, + "step": 28560 + }, + { + "epoch": 15.955865921787709, + "grad_norm": 0.40991783142089844, + "learning_rate": 0.0002031652661064426, + "loss": 0.441, + "step": 28561 + }, + { + "epoch": 15.956424581005587, + "grad_norm": 0.4384680688381195, + "learning_rate": 0.00020313725490196078, + "loss": 0.4559, + "step": 28562 + }, + { + "epoch": 15.956983240223463, + "grad_norm": 0.5405470728874207, + "learning_rate": 0.00020310924369747898, + "loss": 0.4067, + "step": 28563 + }, + { + "epoch": 15.95754189944134, + "grad_norm": 0.3395642042160034, + "learning_rate": 0.00020308123249299721, + "loss": 0.336, + "step": 28564 + }, + { + "epoch": 15.958100558659218, + "grad_norm": 0.3948809504508972, + "learning_rate": 0.00020305322128851542, + "loss": 0.4156, + "step": 28565 + }, + { + "epoch": 15.958659217877095, + "grad_norm": 0.42704781889915466, + "learning_rate": 0.00020302521008403363, + "loss": 0.3176, + "step": 28566 + }, + { + "epoch": 15.959217877094972, + "grad_norm": 1.9850234985351562, + "learning_rate": 0.0002029971988795518, + "loss": 0.4029, + "step": 28567 + }, + { + "epoch": 15.95977653631285, + "grad_norm": 0.42603111267089844, + "learning_rate": 0.00020296918767507004, + "loss": 0.3581, + "step": 28568 + }, + { + "epoch": 15.960335195530726, + "grad_norm": 0.452487975358963, + "learning_rate": 0.00020294117647058824, + "loss": 0.4134, + "step": 28569 + }, + { + "epoch": 15.960893854748603, + "grad_norm": 0.7981101870536804, + "learning_rate": 0.00020291316526610645, + "loss": 0.4401, + "step": 28570 + }, + { + "epoch": 15.961452513966481, + "grad_norm": 0.9477629661560059, + "learning_rate": 0.00020288515406162466, + "loss": 0.3739, + "step": 28571 + }, + { + "epoch": 15.962011173184358, + "grad_norm": 0.5518683195114136, + "learning_rate": 0.00020285714285714286, + "loss": 0.5674, + "step": 28572 + }, + { + "epoch": 15.962569832402234, + "grad_norm": 0.43295642733573914, + "learning_rate": 0.00020282913165266107, + "loss": 0.307, + "step": 28573 + }, + { + "epoch": 15.963128491620111, + "grad_norm": 0.35875412821769714, + "learning_rate": 0.00020280112044817927, + "loss": 0.3457, + "step": 28574 + }, + { + "epoch": 15.96368715083799, + "grad_norm": 0.5405577421188354, + "learning_rate": 0.00020277310924369748, + "loss": 0.3574, + "step": 28575 + }, + { + "epoch": 15.964245810055866, + "grad_norm": 0.44104719161987305, + "learning_rate": 0.00020274509803921571, + "loss": 0.4515, + "step": 28576 + }, + { + "epoch": 15.964804469273743, + "grad_norm": 9.683589935302734, + "learning_rate": 0.0002027170868347339, + "loss": 0.3122, + "step": 28577 + }, + { + "epoch": 15.96536312849162, + "grad_norm": 1.4077775478363037, + "learning_rate": 0.0002026890756302521, + "loss": 0.3856, + "step": 28578 + }, + { + "epoch": 15.965921787709497, + "grad_norm": 0.3799853026866913, + "learning_rate": 0.0002026610644257703, + "loss": 0.4183, + "step": 28579 + }, + { + "epoch": 15.966480446927374, + "grad_norm": 1.7218818664550781, + "learning_rate": 0.00020263305322128854, + "loss": 0.3339, + "step": 28580 + }, + { + "epoch": 15.96703910614525, + "grad_norm": 8.140228271484375, + "learning_rate": 0.00020260504201680674, + "loss": 0.3363, + "step": 28581 + }, + { + "epoch": 15.967597765363129, + "grad_norm": 0.7309381365776062, + "learning_rate": 0.00020257703081232492, + "loss": 0.4697, + "step": 28582 + }, + { + "epoch": 15.968156424581005, + "grad_norm": 0.5796902179718018, + "learning_rate": 0.00020254901960784313, + "loss": 0.4133, + "step": 28583 + }, + { + "epoch": 15.968715083798882, + "grad_norm": 0.4000356197357178, + "learning_rate": 0.00020252100840336136, + "loss": 0.3832, + "step": 28584 + }, + { + "epoch": 15.96927374301676, + "grad_norm": 0.7208417057991028, + "learning_rate": 0.00020249299719887957, + "loss": 0.3891, + "step": 28585 + }, + { + "epoch": 15.969832402234637, + "grad_norm": 1.0270278453826904, + "learning_rate": 0.00020246498599439777, + "loss": 0.4176, + "step": 28586 + }, + { + "epoch": 15.970391061452514, + "grad_norm": 0.4895149767398834, + "learning_rate": 0.00020243697478991595, + "loss": 0.4114, + "step": 28587 + }, + { + "epoch": 15.970949720670392, + "grad_norm": 0.6412506699562073, + "learning_rate": 0.00020240896358543419, + "loss": 0.4179, + "step": 28588 + }, + { + "epoch": 15.971508379888268, + "grad_norm": 0.42856088280677795, + "learning_rate": 0.0002023809523809524, + "loss": 0.3925, + "step": 28589 + }, + { + "epoch": 15.972067039106145, + "grad_norm": 0.6094840168952942, + "learning_rate": 0.0002023529411764706, + "loss": 0.4559, + "step": 28590 + }, + { + "epoch": 15.972625698324022, + "grad_norm": 0.4153468608856201, + "learning_rate": 0.00020232492997198878, + "loss": 0.3764, + "step": 28591 + }, + { + "epoch": 15.9731843575419, + "grad_norm": 0.6538140177726746, + "learning_rate": 0.000202296918767507, + "loss": 0.7245, + "step": 28592 + }, + { + "epoch": 15.973743016759776, + "grad_norm": 0.9328212738037109, + "learning_rate": 0.00020226890756302522, + "loss": 0.8007, + "step": 28593 + }, + { + "epoch": 15.974301675977653, + "grad_norm": 0.4586169421672821, + "learning_rate": 0.00020224089635854342, + "loss": 0.3397, + "step": 28594 + }, + { + "epoch": 15.974860335195531, + "grad_norm": 30.752887725830078, + "learning_rate": 0.00020221288515406163, + "loss": 0.4215, + "step": 28595 + }, + { + "epoch": 15.975418994413408, + "grad_norm": 0.48116356134414673, + "learning_rate": 0.00020218487394957983, + "loss": 0.4404, + "step": 28596 + }, + { + "epoch": 15.975977653631285, + "grad_norm": 0.9074668884277344, + "learning_rate": 0.00020215686274509804, + "loss": 0.4648, + "step": 28597 + }, + { + "epoch": 15.976536312849163, + "grad_norm": 0.6799079775810242, + "learning_rate": 0.00020212885154061625, + "loss": 0.5253, + "step": 28598 + }, + { + "epoch": 15.97709497206704, + "grad_norm": 0.6789929866790771, + "learning_rate": 0.00020210084033613445, + "loss": 0.3991, + "step": 28599 + }, + { + "epoch": 15.977653631284916, + "grad_norm": 0.43006426095962524, + "learning_rate": 0.00020207282913165268, + "loss": 0.3803, + "step": 28600 + }, + { + "epoch": 15.978212290502793, + "grad_norm": 1.7170820236206055, + "learning_rate": 0.00020204481792717086, + "loss": 0.4897, + "step": 28601 + }, + { + "epoch": 15.978770949720671, + "grad_norm": 0.6101322174072266, + "learning_rate": 0.00020201680672268907, + "loss": 0.3751, + "step": 28602 + }, + { + "epoch": 15.979329608938547, + "grad_norm": 0.4622575044631958, + "learning_rate": 0.00020198879551820728, + "loss": 0.3692, + "step": 28603 + }, + { + "epoch": 15.979888268156424, + "grad_norm": 3.59499454498291, + "learning_rate": 0.0002019607843137255, + "loss": 0.3856, + "step": 28604 + }, + { + "epoch": 15.980446927374302, + "grad_norm": 0.3898753225803375, + "learning_rate": 0.00020193277310924371, + "loss": 0.4428, + "step": 28605 + }, + { + "epoch": 15.981005586592179, + "grad_norm": 0.4102843403816223, + "learning_rate": 0.0002019047619047619, + "loss": 0.4631, + "step": 28606 + }, + { + "epoch": 15.981564245810056, + "grad_norm": 0.38137349486351013, + "learning_rate": 0.0002018767507002801, + "loss": 0.4184, + "step": 28607 + }, + { + "epoch": 15.982122905027932, + "grad_norm": 0.5560269951820374, + "learning_rate": 0.00020184873949579833, + "loss": 0.4638, + "step": 28608 + }, + { + "epoch": 15.98268156424581, + "grad_norm": 1.2141250371932983, + "learning_rate": 0.00020182072829131654, + "loss": 0.416, + "step": 28609 + }, + { + "epoch": 15.983240223463687, + "grad_norm": 0.8336212635040283, + "learning_rate": 0.00020179271708683474, + "loss": 0.4315, + "step": 28610 + }, + { + "epoch": 15.983798882681564, + "grad_norm": 0.7413162589073181, + "learning_rate": 0.00020176470588235292, + "loss": 0.3989, + "step": 28611 + }, + { + "epoch": 15.984357541899442, + "grad_norm": 0.5239019393920898, + "learning_rate": 0.00020173669467787116, + "loss": 0.3504, + "step": 28612 + }, + { + "epoch": 15.984916201117318, + "grad_norm": 0.5917267203330994, + "learning_rate": 0.00020170868347338936, + "loss": 0.3389, + "step": 28613 + }, + { + "epoch": 15.985474860335195, + "grad_norm": 0.4086204767227173, + "learning_rate": 0.00020168067226890757, + "loss": 0.3238, + "step": 28614 + }, + { + "epoch": 15.986033519553073, + "grad_norm": 0.3828534185886383, + "learning_rate": 0.00020165266106442577, + "loss": 0.4007, + "step": 28615 + }, + { + "epoch": 15.98659217877095, + "grad_norm": 0.45286455750465393, + "learning_rate": 0.00020162464985994398, + "loss": 0.4141, + "step": 28616 + }, + { + "epoch": 15.987150837988827, + "grad_norm": 1.5492680072784424, + "learning_rate": 0.0002015966386554622, + "loss": 0.4797, + "step": 28617 + }, + { + "epoch": 15.987709497206703, + "grad_norm": 0.4872051477432251, + "learning_rate": 0.0002015686274509804, + "loss": 0.3872, + "step": 28618 + }, + { + "epoch": 15.988268156424581, + "grad_norm": 0.4941769540309906, + "learning_rate": 0.0002015406162464986, + "loss": 0.3462, + "step": 28619 + }, + { + "epoch": 15.988826815642458, + "grad_norm": 0.4956414997577667, + "learning_rate": 0.00020151260504201683, + "loss": 0.3027, + "step": 28620 + }, + { + "epoch": 15.989385474860335, + "grad_norm": 0.4612584412097931, + "learning_rate": 0.000201484593837535, + "loss": 0.4028, + "step": 28621 + }, + { + "epoch": 15.989944134078213, + "grad_norm": 0.4827750027179718, + "learning_rate": 0.00020145658263305322, + "loss": 0.3857, + "step": 28622 + }, + { + "epoch": 15.99050279329609, + "grad_norm": 0.40966519713401794, + "learning_rate": 0.00020142857142857142, + "loss": 0.4001, + "step": 28623 + }, + { + "epoch": 15.991061452513966, + "grad_norm": 0.38131317496299744, + "learning_rate": 0.00020140056022408966, + "loss": 0.3941, + "step": 28624 + }, + { + "epoch": 15.991620111731844, + "grad_norm": 0.4684199094772339, + "learning_rate": 0.00020137254901960786, + "loss": 0.4498, + "step": 28625 + }, + { + "epoch": 15.992178770949721, + "grad_norm": 0.4183073341846466, + "learning_rate": 0.00020134453781512604, + "loss": 0.394, + "step": 28626 + }, + { + "epoch": 15.992737430167598, + "grad_norm": 0.4430552124977112, + "learning_rate": 0.00020131652661064425, + "loss": 0.3761, + "step": 28627 + }, + { + "epoch": 15.993296089385474, + "grad_norm": 0.4162876605987549, + "learning_rate": 0.00020128851540616248, + "loss": 0.3759, + "step": 28628 + }, + { + "epoch": 15.993854748603352, + "grad_norm": 0.48116421699523926, + "learning_rate": 0.00020126050420168069, + "loss": 0.2989, + "step": 28629 + }, + { + "epoch": 15.994413407821229, + "grad_norm": 0.48768338561058044, + "learning_rate": 0.0002012324929971989, + "loss": 0.4296, + "step": 28630 + }, + { + "epoch": 15.994972067039106, + "grad_norm": 1.4282417297363281, + "learning_rate": 0.00020120448179271707, + "loss": 0.3903, + "step": 28631 + }, + { + "epoch": 15.995530726256984, + "grad_norm": 4.254456520080566, + "learning_rate": 0.0002011764705882353, + "loss": 0.3804, + "step": 28632 + }, + { + "epoch": 15.99608938547486, + "grad_norm": 0.8755951523780823, + "learning_rate": 0.0002011484593837535, + "loss": 0.3919, + "step": 28633 + }, + { + "epoch": 15.996648044692737, + "grad_norm": 0.480814129114151, + "learning_rate": 0.00020112044817927172, + "loss": 0.3455, + "step": 28634 + }, + { + "epoch": 15.997206703910614, + "grad_norm": 0.4574722349643707, + "learning_rate": 0.00020109243697478992, + "loss": 0.3223, + "step": 28635 + }, + { + "epoch": 15.997765363128492, + "grad_norm": 0.31181037425994873, + "learning_rate": 0.00020106442577030813, + "loss": 0.3316, + "step": 28636 + }, + { + "epoch": 15.998324022346369, + "grad_norm": 0.5989413261413574, + "learning_rate": 0.00020103641456582633, + "loss": 0.4521, + "step": 28637 + }, + { + "epoch": 15.998882681564245, + "grad_norm": 2.2446444034576416, + "learning_rate": 0.00020100840336134454, + "loss": 0.438, + "step": 28638 + }, + { + "epoch": 15.999441340782123, + "grad_norm": 37.44776916503906, + "learning_rate": 0.00020098039215686275, + "loss": 0.3661, + "step": 28639 + }, + { + "epoch": 16.0, + "grad_norm": 0.43943220376968384, + "learning_rate": 0.00020095238095238098, + "loss": 0.4565, + "step": 28640 + }, + { + "epoch": 16.000558659217877, + "grad_norm": 0.7048699855804443, + "learning_rate": 0.00020092436974789916, + "loss": 0.3605, + "step": 28641 + }, + { + "epoch": 16.001117318435753, + "grad_norm": 0.5793527960777283, + "learning_rate": 0.00020089635854341736, + "loss": 0.4319, + "step": 28642 + }, + { + "epoch": 16.00167597765363, + "grad_norm": 0.9626024961471558, + "learning_rate": 0.00020086834733893557, + "loss": 0.3648, + "step": 28643 + }, + { + "epoch": 16.00223463687151, + "grad_norm": 0.37225890159606934, + "learning_rate": 0.0002008403361344538, + "loss": 0.413, + "step": 28644 + }, + { + "epoch": 16.002793296089386, + "grad_norm": 0.39024972915649414, + "learning_rate": 0.000200812324929972, + "loss": 0.3994, + "step": 28645 + }, + { + "epoch": 16.003351955307263, + "grad_norm": 0.72527015209198, + "learning_rate": 0.0002007843137254902, + "loss": 0.3831, + "step": 28646 + }, + { + "epoch": 16.00391061452514, + "grad_norm": 0.38242390751838684, + "learning_rate": 0.0002007563025210084, + "loss": 0.4909, + "step": 28647 + }, + { + "epoch": 16.004469273743016, + "grad_norm": 0.34653082489967346, + "learning_rate": 0.00020072829131652663, + "loss": 0.2554, + "step": 28648 + }, + { + "epoch": 16.005027932960893, + "grad_norm": 0.4286128878593445, + "learning_rate": 0.00020070028011204483, + "loss": 0.416, + "step": 28649 + }, + { + "epoch": 16.00558659217877, + "grad_norm": 0.8708988428115845, + "learning_rate": 0.000200672268907563, + "loss": 0.4155, + "step": 28650 + }, + { + "epoch": 16.00614525139665, + "grad_norm": 0.34916868805885315, + "learning_rate": 0.00020064425770308122, + "loss": 0.3637, + "step": 28651 + }, + { + "epoch": 16.006703910614526, + "grad_norm": 1.1638103723526, + "learning_rate": 0.00020061624649859945, + "loss": 0.4343, + "step": 28652 + }, + { + "epoch": 16.007262569832402, + "grad_norm": 1.565123200416565, + "learning_rate": 0.00020058823529411766, + "loss": 0.4478, + "step": 28653 + }, + { + "epoch": 16.00782122905028, + "grad_norm": 0.43218564987182617, + "learning_rate": 0.00020056022408963586, + "loss": 0.435, + "step": 28654 + }, + { + "epoch": 16.008379888268156, + "grad_norm": 0.587452232837677, + "learning_rate": 0.00020053221288515404, + "loss": 0.547, + "step": 28655 + }, + { + "epoch": 16.008938547486032, + "grad_norm": 0.5117811560630798, + "learning_rate": 0.00020050420168067227, + "loss": 0.4295, + "step": 28656 + }, + { + "epoch": 16.009497206703912, + "grad_norm": 0.49994993209838867, + "learning_rate": 0.00020047619047619048, + "loss": 0.4401, + "step": 28657 + }, + { + "epoch": 16.01005586592179, + "grad_norm": 0.5301174521446228, + "learning_rate": 0.0002004481792717087, + "loss": 0.3344, + "step": 28658 + }, + { + "epoch": 16.010614525139665, + "grad_norm": 0.39890173077583313, + "learning_rate": 0.0002004201680672269, + "loss": 0.3962, + "step": 28659 + }, + { + "epoch": 16.011173184357542, + "grad_norm": 0.8612424731254578, + "learning_rate": 0.0002003921568627451, + "loss": 0.3873, + "step": 28660 + }, + { + "epoch": 16.01173184357542, + "grad_norm": 0.46927180886268616, + "learning_rate": 0.0002003641456582633, + "loss": 0.4122, + "step": 28661 + }, + { + "epoch": 16.012290502793295, + "grad_norm": 0.4179631769657135, + "learning_rate": 0.0002003361344537815, + "loss": 0.4842, + "step": 28662 + }, + { + "epoch": 16.01284916201117, + "grad_norm": 0.5834910273551941, + "learning_rate": 0.00020030812324929972, + "loss": 0.4836, + "step": 28663 + }, + { + "epoch": 16.013407821229052, + "grad_norm": 0.555940568447113, + "learning_rate": 0.00020028011204481795, + "loss": 0.3847, + "step": 28664 + }, + { + "epoch": 16.01396648044693, + "grad_norm": 0.5529289245605469, + "learning_rate": 0.00020025210084033613, + "loss": 0.355, + "step": 28665 + }, + { + "epoch": 16.014525139664805, + "grad_norm": 0.5286394953727722, + "learning_rate": 0.00020022408963585433, + "loss": 0.4983, + "step": 28666 + }, + { + "epoch": 16.01508379888268, + "grad_norm": 0.5944266319274902, + "learning_rate": 0.00020019607843137254, + "loss": 0.4084, + "step": 28667 + }, + { + "epoch": 16.015642458100558, + "grad_norm": 0.6405765414237976, + "learning_rate": 0.00020016806722689077, + "loss": 0.3636, + "step": 28668 + }, + { + "epoch": 16.016201117318435, + "grad_norm": 0.4832776188850403, + "learning_rate": 0.00020014005602240898, + "loss": 0.3652, + "step": 28669 + }, + { + "epoch": 16.01675977653631, + "grad_norm": 0.43088850378990173, + "learning_rate": 0.00020011204481792716, + "loss": 0.3936, + "step": 28670 + }, + { + "epoch": 16.01731843575419, + "grad_norm": 2.6670849323272705, + "learning_rate": 0.00020008403361344536, + "loss": 0.4082, + "step": 28671 + }, + { + "epoch": 16.017877094972068, + "grad_norm": 0.3858379125595093, + "learning_rate": 0.0002000560224089636, + "loss": 0.336, + "step": 28672 + }, + { + "epoch": 16.018435754189944, + "grad_norm": 0.4809342622756958, + "learning_rate": 0.0002000280112044818, + "loss": 0.4306, + "step": 28673 + }, + { + "epoch": 16.01899441340782, + "grad_norm": 0.40753260254859924, + "learning_rate": 0.0002, + "loss": 0.4286, + "step": 28674 + }, + { + "epoch": 16.019553072625698, + "grad_norm": 0.5665051341056824, + "learning_rate": 0.0001999719887955182, + "loss": 0.4737, + "step": 28675 + }, + { + "epoch": 16.020111731843574, + "grad_norm": 1.057637095451355, + "learning_rate": 0.00019994397759103642, + "loss": 0.4571, + "step": 28676 + }, + { + "epoch": 16.02067039106145, + "grad_norm": 0.49633321166038513, + "learning_rate": 0.00019991596638655463, + "loss": 0.439, + "step": 28677 + }, + { + "epoch": 16.02122905027933, + "grad_norm": 0.33009234070777893, + "learning_rate": 0.00019988795518207283, + "loss": 0.4024, + "step": 28678 + }, + { + "epoch": 16.021787709497207, + "grad_norm": 1.06159245967865, + "learning_rate": 0.00019985994397759104, + "loss": 0.5011, + "step": 28679 + }, + { + "epoch": 16.022346368715084, + "grad_norm": 0.5506209135055542, + "learning_rate": 0.00019983193277310925, + "loss": 0.4544, + "step": 28680 + }, + { + "epoch": 16.02290502793296, + "grad_norm": 0.4042088985443115, + "learning_rate": 0.00019980392156862745, + "loss": 0.4188, + "step": 28681 + }, + { + "epoch": 16.023463687150837, + "grad_norm": 0.8546011447906494, + "learning_rate": 0.00019977591036414566, + "loss": 0.3441, + "step": 28682 + }, + { + "epoch": 16.024022346368714, + "grad_norm": 0.4159787595272064, + "learning_rate": 0.00019974789915966386, + "loss": 0.4095, + "step": 28683 + }, + { + "epoch": 16.024581005586594, + "grad_norm": 0.34117525815963745, + "learning_rate": 0.0001997198879551821, + "loss": 0.3602, + "step": 28684 + }, + { + "epoch": 16.02513966480447, + "grad_norm": 0.6552532911300659, + "learning_rate": 0.00019969187675070028, + "loss": 0.424, + "step": 28685 + }, + { + "epoch": 16.025698324022347, + "grad_norm": 0.4073229134082794, + "learning_rate": 0.00019966386554621848, + "loss": 0.3652, + "step": 28686 + }, + { + "epoch": 16.026256983240224, + "grad_norm": 0.351581335067749, + "learning_rate": 0.0001996358543417367, + "loss": 0.3459, + "step": 28687 + }, + { + "epoch": 16.0268156424581, + "grad_norm": 0.9441070556640625, + "learning_rate": 0.00019960784313725492, + "loss": 0.5215, + "step": 28688 + }, + { + "epoch": 16.027374301675977, + "grad_norm": 0.5241767764091492, + "learning_rate": 0.00019957983193277313, + "loss": 0.4037, + "step": 28689 + }, + { + "epoch": 16.027932960893853, + "grad_norm": 0.46423688530921936, + "learning_rate": 0.0001995518207282913, + "loss": 0.3779, + "step": 28690 + }, + { + "epoch": 16.028491620111733, + "grad_norm": 5.712769031524658, + "learning_rate": 0.0001995238095238095, + "loss": 0.3881, + "step": 28691 + }, + { + "epoch": 16.02905027932961, + "grad_norm": 0.6212025880813599, + "learning_rate": 0.00019949579831932774, + "loss": 0.4035, + "step": 28692 + }, + { + "epoch": 16.029608938547486, + "grad_norm": 0.6730415225028992, + "learning_rate": 0.00019946778711484595, + "loss": 0.3349, + "step": 28693 + }, + { + "epoch": 16.030167597765363, + "grad_norm": 0.37275049090385437, + "learning_rate": 0.00019943977591036416, + "loss": 0.3216, + "step": 28694 + }, + { + "epoch": 16.03072625698324, + "grad_norm": 2.095198154449463, + "learning_rate": 0.00019941176470588234, + "loss": 0.3938, + "step": 28695 + }, + { + "epoch": 16.031284916201116, + "grad_norm": 0.7965613007545471, + "learning_rate": 0.00019938375350140057, + "loss": 0.4118, + "step": 28696 + }, + { + "epoch": 16.031843575418993, + "grad_norm": 0.38307318091392517, + "learning_rate": 0.00019935574229691877, + "loss": 0.3259, + "step": 28697 + }, + { + "epoch": 16.032402234636873, + "grad_norm": 0.7534868121147156, + "learning_rate": 0.00019932773109243698, + "loss": 0.3655, + "step": 28698 + }, + { + "epoch": 16.03296089385475, + "grad_norm": 0.310043066740036, + "learning_rate": 0.0001992997198879552, + "loss": 0.3398, + "step": 28699 + }, + { + "epoch": 16.033519553072626, + "grad_norm": 1.157518744468689, + "learning_rate": 0.0001992717086834734, + "loss": 0.3228, + "step": 28700 + }, + { + "epoch": 16.034078212290503, + "grad_norm": 2.2935333251953125, + "learning_rate": 0.0001992436974789916, + "loss": 0.4583, + "step": 28701 + }, + { + "epoch": 16.03463687150838, + "grad_norm": 0.5491610169410706, + "learning_rate": 0.0001992156862745098, + "loss": 0.4437, + "step": 28702 + }, + { + "epoch": 16.035195530726256, + "grad_norm": 0.49655038118362427, + "learning_rate": 0.000199187675070028, + "loss": 0.448, + "step": 28703 + }, + { + "epoch": 16.035754189944136, + "grad_norm": 1.1616835594177246, + "learning_rate": 0.00019915966386554622, + "loss": 0.3663, + "step": 28704 + }, + { + "epoch": 16.036312849162012, + "grad_norm": 0.48957470059394836, + "learning_rate": 0.00019913165266106442, + "loss": 0.3926, + "step": 28705 + }, + { + "epoch": 16.03687150837989, + "grad_norm": 0.40391018986701965, + "learning_rate": 0.00019910364145658263, + "loss": 0.4804, + "step": 28706 + }, + { + "epoch": 16.037430167597766, + "grad_norm": 0.4524936378002167, + "learning_rate": 0.00019907563025210083, + "loss": 0.2925, + "step": 28707 + }, + { + "epoch": 16.037988826815642, + "grad_norm": 0.31568586826324463, + "learning_rate": 0.00019904761904761907, + "loss": 0.3227, + "step": 28708 + }, + { + "epoch": 16.03854748603352, + "grad_norm": 0.5016072392463684, + "learning_rate": 0.00019901960784313725, + "loss": 0.4659, + "step": 28709 + }, + { + "epoch": 16.039106145251395, + "grad_norm": 0.524066686630249, + "learning_rate": 0.00019899159663865545, + "loss": 0.4895, + "step": 28710 + }, + { + "epoch": 16.039664804469275, + "grad_norm": 0.5096714496612549, + "learning_rate": 0.00019896358543417366, + "loss": 0.4801, + "step": 28711 + }, + { + "epoch": 16.040223463687152, + "grad_norm": 0.5921947360038757, + "learning_rate": 0.0001989355742296919, + "loss": 0.4793, + "step": 28712 + }, + { + "epoch": 16.04078212290503, + "grad_norm": 0.8980619311332703, + "learning_rate": 0.0001989075630252101, + "loss": 0.4051, + "step": 28713 + }, + { + "epoch": 16.041340782122905, + "grad_norm": 0.6189582943916321, + "learning_rate": 0.00019887955182072828, + "loss": 0.3373, + "step": 28714 + }, + { + "epoch": 16.04189944134078, + "grad_norm": 0.5464439392089844, + "learning_rate": 0.00019885154061624648, + "loss": 0.4072, + "step": 28715 + }, + { + "epoch": 16.042458100558658, + "grad_norm": 3.057891368865967, + "learning_rate": 0.00019882352941176472, + "loss": 0.3639, + "step": 28716 + }, + { + "epoch": 16.043016759776535, + "grad_norm": 0.4201343059539795, + "learning_rate": 0.00019879551820728292, + "loss": 0.426, + "step": 28717 + }, + { + "epoch": 16.043575418994415, + "grad_norm": 0.808809757232666, + "learning_rate": 0.00019876750700280113, + "loss": 0.4774, + "step": 28718 + }, + { + "epoch": 16.04413407821229, + "grad_norm": 0.42565008997917175, + "learning_rate": 0.0001987394957983193, + "loss": 0.3174, + "step": 28719 + }, + { + "epoch": 16.044692737430168, + "grad_norm": 0.6673728823661804, + "learning_rate": 0.00019871148459383754, + "loss": 0.3693, + "step": 28720 + }, + { + "epoch": 16.045251396648045, + "grad_norm": 0.4921528398990631, + "learning_rate": 0.00019868347338935575, + "loss": 0.3409, + "step": 28721 + }, + { + "epoch": 16.04581005586592, + "grad_norm": 0.9265057444572449, + "learning_rate": 0.00019865546218487395, + "loss": 0.4993, + "step": 28722 + }, + { + "epoch": 16.046368715083798, + "grad_norm": 14.272374153137207, + "learning_rate": 0.00019862745098039218, + "loss": 0.4597, + "step": 28723 + }, + { + "epoch": 16.046927374301674, + "grad_norm": 0.8591921329498291, + "learning_rate": 0.00019859943977591036, + "loss": 0.3781, + "step": 28724 + }, + { + "epoch": 16.047486033519554, + "grad_norm": 0.45600244402885437, + "learning_rate": 0.00019857142857142857, + "loss": 0.3751, + "step": 28725 + }, + { + "epoch": 16.04804469273743, + "grad_norm": 1.3724937438964844, + "learning_rate": 0.00019854341736694678, + "loss": 0.5224, + "step": 28726 + }, + { + "epoch": 16.048603351955308, + "grad_norm": 0.5757851600646973, + "learning_rate": 0.000198515406162465, + "loss": 0.431, + "step": 28727 + }, + { + "epoch": 16.049162011173184, + "grad_norm": 0.5403292179107666, + "learning_rate": 0.00019848739495798321, + "loss": 0.475, + "step": 28728 + }, + { + "epoch": 16.04972067039106, + "grad_norm": 1.0260902643203735, + "learning_rate": 0.0001984593837535014, + "loss": 0.3883, + "step": 28729 + }, + { + "epoch": 16.050279329608937, + "grad_norm": 0.469302773475647, + "learning_rate": 0.0001984313725490196, + "loss": 0.3064, + "step": 28730 + }, + { + "epoch": 16.050837988826817, + "grad_norm": 0.5761861801147461, + "learning_rate": 0.00019840336134453783, + "loss": 0.4788, + "step": 28731 + }, + { + "epoch": 16.051396648044694, + "grad_norm": 0.5332999229431152, + "learning_rate": 0.00019837535014005604, + "loss": 0.3847, + "step": 28732 + }, + { + "epoch": 16.05195530726257, + "grad_norm": 0.35240307450294495, + "learning_rate": 0.00019834733893557424, + "loss": 0.4331, + "step": 28733 + }, + { + "epoch": 16.052513966480447, + "grad_norm": 0.5284339785575867, + "learning_rate": 0.00019831932773109242, + "loss": 0.5328, + "step": 28734 + }, + { + "epoch": 16.053072625698324, + "grad_norm": 0.7995294332504272, + "learning_rate": 0.00019829131652661066, + "loss": 0.4426, + "step": 28735 + }, + { + "epoch": 16.0536312849162, + "grad_norm": 0.4651493430137634, + "learning_rate": 0.00019826330532212886, + "loss": 0.332, + "step": 28736 + }, + { + "epoch": 16.054189944134077, + "grad_norm": 0.4823979437351227, + "learning_rate": 0.00019823529411764707, + "loss": 0.4345, + "step": 28737 + }, + { + "epoch": 16.054748603351957, + "grad_norm": 1.1699432134628296, + "learning_rate": 0.00019820728291316527, + "loss": 0.4165, + "step": 28738 + }, + { + "epoch": 16.055307262569833, + "grad_norm": 0.48355185985565186, + "learning_rate": 0.00019817927170868348, + "loss": 0.379, + "step": 28739 + }, + { + "epoch": 16.05586592178771, + "grad_norm": 0.8748696446418762, + "learning_rate": 0.0001981512605042017, + "loss": 0.8124, + "step": 28740 + }, + { + "epoch": 16.056424581005587, + "grad_norm": 0.3409229815006256, + "learning_rate": 0.0001981232492997199, + "loss": 0.3976, + "step": 28741 + }, + { + "epoch": 16.056983240223463, + "grad_norm": 0.3500695824623108, + "learning_rate": 0.0001980952380952381, + "loss": 0.3491, + "step": 28742 + }, + { + "epoch": 16.05754189944134, + "grad_norm": 0.5360079407691956, + "learning_rate": 0.00019806722689075633, + "loss": 0.4036, + "step": 28743 + }, + { + "epoch": 16.058100558659216, + "grad_norm": 0.9907260537147522, + "learning_rate": 0.0001980392156862745, + "loss": 0.3982, + "step": 28744 + }, + { + "epoch": 16.058659217877096, + "grad_norm": 0.3772161900997162, + "learning_rate": 0.00019801120448179272, + "loss": 0.4175, + "step": 28745 + }, + { + "epoch": 16.059217877094973, + "grad_norm": 0.48105260729789734, + "learning_rate": 0.00019798319327731092, + "loss": 0.4074, + "step": 28746 + }, + { + "epoch": 16.05977653631285, + "grad_norm": 0.4142693877220154, + "learning_rate": 0.00019795518207282916, + "loss": 0.4006, + "step": 28747 + }, + { + "epoch": 16.060335195530726, + "grad_norm": 11.138873100280762, + "learning_rate": 0.00019792717086834736, + "loss": 0.4017, + "step": 28748 + }, + { + "epoch": 16.060893854748603, + "grad_norm": 0.47995811700820923, + "learning_rate": 0.00019789915966386554, + "loss": 0.4257, + "step": 28749 + }, + { + "epoch": 16.06145251396648, + "grad_norm": 0.4389776885509491, + "learning_rate": 0.00019787114845938375, + "loss": 0.3558, + "step": 28750 + }, + { + "epoch": 16.062011173184356, + "grad_norm": 2.0461952686309814, + "learning_rate": 0.00019784313725490198, + "loss": 0.624, + "step": 28751 + }, + { + "epoch": 16.062569832402236, + "grad_norm": 0.6170380115509033, + "learning_rate": 0.00019781512605042019, + "loss": 0.4538, + "step": 28752 + }, + { + "epoch": 16.063128491620112, + "grad_norm": 0.5412031412124634, + "learning_rate": 0.0001977871148459384, + "loss": 0.3106, + "step": 28753 + }, + { + "epoch": 16.06368715083799, + "grad_norm": 0.4140807092189789, + "learning_rate": 0.00019775910364145657, + "loss": 0.3162, + "step": 28754 + }, + { + "epoch": 16.064245810055866, + "grad_norm": 2.8879294395446777, + "learning_rate": 0.0001977310924369748, + "loss": 0.4743, + "step": 28755 + }, + { + "epoch": 16.064804469273742, + "grad_norm": 0.49047958850860596, + "learning_rate": 0.000197703081232493, + "loss": 0.427, + "step": 28756 + }, + { + "epoch": 16.06536312849162, + "grad_norm": 0.6948040127754211, + "learning_rate": 0.00019767507002801122, + "loss": 0.3439, + "step": 28757 + }, + { + "epoch": 16.0659217877095, + "grad_norm": 1.8124775886535645, + "learning_rate": 0.0001976470588235294, + "loss": 0.3917, + "step": 28758 + }, + { + "epoch": 16.066480446927375, + "grad_norm": 0.424328476190567, + "learning_rate": 0.00019761904761904763, + "loss": 0.4055, + "step": 28759 + }, + { + "epoch": 16.067039106145252, + "grad_norm": 0.5531435012817383, + "learning_rate": 0.00019759103641456583, + "loss": 0.417, + "step": 28760 + }, + { + "epoch": 16.06759776536313, + "grad_norm": 0.34919580817222595, + "learning_rate": 0.00019756302521008404, + "loss": 0.3044, + "step": 28761 + }, + { + "epoch": 16.068156424581005, + "grad_norm": 0.42807140946388245, + "learning_rate": 0.00019753501400560225, + "loss": 0.372, + "step": 28762 + }, + { + "epoch": 16.06871508379888, + "grad_norm": 0.5811235904693604, + "learning_rate": 0.00019750700280112045, + "loss": 0.8069, + "step": 28763 + }, + { + "epoch": 16.06927374301676, + "grad_norm": 0.47905027866363525, + "learning_rate": 0.00019747899159663866, + "loss": 0.5167, + "step": 28764 + }, + { + "epoch": 16.06983240223464, + "grad_norm": 0.41478869318962097, + "learning_rate": 0.00019745098039215686, + "loss": 0.4004, + "step": 28765 + }, + { + "epoch": 16.070391061452515, + "grad_norm": 1.1165934801101685, + "learning_rate": 0.00019742296918767507, + "loss": 0.3736, + "step": 28766 + }, + { + "epoch": 16.07094972067039, + "grad_norm": 0.39963629841804504, + "learning_rate": 0.0001973949579831933, + "loss": 0.439, + "step": 28767 + }, + { + "epoch": 16.071508379888268, + "grad_norm": 0.950663685798645, + "learning_rate": 0.00019736694677871148, + "loss": 0.6034, + "step": 28768 + }, + { + "epoch": 16.072067039106145, + "grad_norm": 0.6046480536460876, + "learning_rate": 0.0001973389355742297, + "loss": 0.4055, + "step": 28769 + }, + { + "epoch": 16.07262569832402, + "grad_norm": 0.3774864375591278, + "learning_rate": 0.0001973109243697479, + "loss": 0.4279, + "step": 28770 + }, + { + "epoch": 16.073184357541898, + "grad_norm": 0.4944782257080078, + "learning_rate": 0.00019728291316526613, + "loss": 0.3541, + "step": 28771 + }, + { + "epoch": 16.073743016759778, + "grad_norm": 0.8915618062019348, + "learning_rate": 0.00019725490196078433, + "loss": 0.4295, + "step": 28772 + }, + { + "epoch": 16.074301675977654, + "grad_norm": 0.48576030135154724, + "learning_rate": 0.0001972268907563025, + "loss": 0.4015, + "step": 28773 + }, + { + "epoch": 16.07486033519553, + "grad_norm": 0.4235306978225708, + "learning_rate": 0.00019719887955182072, + "loss": 0.5565, + "step": 28774 + }, + { + "epoch": 16.075418994413408, + "grad_norm": 0.3478255569934845, + "learning_rate": 0.00019717086834733895, + "loss": 0.39, + "step": 28775 + }, + { + "epoch": 16.075977653631284, + "grad_norm": 0.44412699341773987, + "learning_rate": 0.00019714285714285716, + "loss": 0.4141, + "step": 28776 + }, + { + "epoch": 16.07653631284916, + "grad_norm": 1.8947107791900635, + "learning_rate": 0.00019711484593837536, + "loss": 0.4653, + "step": 28777 + }, + { + "epoch": 16.07709497206704, + "grad_norm": 0.7067160606384277, + "learning_rate": 0.00019708683473389354, + "loss": 0.4177, + "step": 28778 + }, + { + "epoch": 16.077653631284917, + "grad_norm": 0.5734338164329529, + "learning_rate": 0.00019705882352941177, + "loss": 0.4436, + "step": 28779 + }, + { + "epoch": 16.078212290502794, + "grad_norm": 2.026653289794922, + "learning_rate": 0.00019703081232492998, + "loss": 0.364, + "step": 28780 + }, + { + "epoch": 16.07877094972067, + "grad_norm": 0.3616013824939728, + "learning_rate": 0.0001970028011204482, + "loss": 0.3936, + "step": 28781 + }, + { + "epoch": 16.079329608938547, + "grad_norm": 0.4312297999858856, + "learning_rate": 0.0001969747899159664, + "loss": 0.3756, + "step": 28782 + }, + { + "epoch": 16.079888268156424, + "grad_norm": 0.45226210355758667, + "learning_rate": 0.0001969467787114846, + "loss": 0.3535, + "step": 28783 + }, + { + "epoch": 16.0804469273743, + "grad_norm": 0.4514426290988922, + "learning_rate": 0.0001969187675070028, + "loss": 0.4161, + "step": 28784 + }, + { + "epoch": 16.08100558659218, + "grad_norm": 0.42622092366218567, + "learning_rate": 0.000196890756302521, + "loss": 0.4053, + "step": 28785 + }, + { + "epoch": 16.081564245810057, + "grad_norm": 0.6194861531257629, + "learning_rate": 0.00019686274509803922, + "loss": 0.3239, + "step": 28786 + }, + { + "epoch": 16.082122905027934, + "grad_norm": 0.6120905876159668, + "learning_rate": 0.00019683473389355745, + "loss": 0.3877, + "step": 28787 + }, + { + "epoch": 16.08268156424581, + "grad_norm": 4.206735134124756, + "learning_rate": 0.00019680672268907563, + "loss": 0.4098, + "step": 28788 + }, + { + "epoch": 16.083240223463687, + "grad_norm": 0.33900880813598633, + "learning_rate": 0.00019677871148459383, + "loss": 0.3221, + "step": 28789 + }, + { + "epoch": 16.083798882681563, + "grad_norm": 1.4210306406021118, + "learning_rate": 0.00019675070028011204, + "loss": 0.6155, + "step": 28790 + }, + { + "epoch": 16.08435754189944, + "grad_norm": 0.5302776098251343, + "learning_rate": 0.00019672268907563027, + "loss": 0.6129, + "step": 28791 + }, + { + "epoch": 16.08491620111732, + "grad_norm": 0.7050266861915588, + "learning_rate": 0.00019669467787114848, + "loss": 0.447, + "step": 28792 + }, + { + "epoch": 16.085474860335196, + "grad_norm": 0.3712307810783386, + "learning_rate": 0.00019666666666666666, + "loss": 0.3587, + "step": 28793 + }, + { + "epoch": 16.086033519553073, + "grad_norm": 0.5986878275871277, + "learning_rate": 0.00019663865546218486, + "loss": 0.4713, + "step": 28794 + }, + { + "epoch": 16.08659217877095, + "grad_norm": 0.4047383964061737, + "learning_rate": 0.0001966106442577031, + "loss": 0.3765, + "step": 28795 + }, + { + "epoch": 16.087150837988826, + "grad_norm": 0.4967782497406006, + "learning_rate": 0.0001965826330532213, + "loss": 0.3973, + "step": 28796 + }, + { + "epoch": 16.087709497206703, + "grad_norm": 0.45842745900154114, + "learning_rate": 0.0001965546218487395, + "loss": 0.4939, + "step": 28797 + }, + { + "epoch": 16.08826815642458, + "grad_norm": 0.3373325765132904, + "learning_rate": 0.0001965266106442577, + "loss": 0.3848, + "step": 28798 + }, + { + "epoch": 16.08882681564246, + "grad_norm": 0.4641217291355133, + "learning_rate": 0.00019649859943977592, + "loss": 0.3516, + "step": 28799 + }, + { + "epoch": 16.089385474860336, + "grad_norm": 0.4103059470653534, + "learning_rate": 0.00019647058823529413, + "loss": 0.5263, + "step": 28800 + }, + { + "epoch": 16.089944134078213, + "grad_norm": 0.4553210437297821, + "learning_rate": 0.00019644257703081233, + "loss": 0.4864, + "step": 28801 + }, + { + "epoch": 16.09050279329609, + "grad_norm": 0.33146676421165466, + "learning_rate": 0.00019641456582633054, + "loss": 0.3926, + "step": 28802 + }, + { + "epoch": 16.091061452513966, + "grad_norm": 5.626728534698486, + "learning_rate": 0.00019638655462184875, + "loss": 0.3473, + "step": 28803 + }, + { + "epoch": 16.091620111731842, + "grad_norm": 0.4232625961303711, + "learning_rate": 0.00019635854341736695, + "loss": 0.4914, + "step": 28804 + }, + { + "epoch": 16.092178770949722, + "grad_norm": 1.3321236371994019, + "learning_rate": 0.00019633053221288516, + "loss": 0.3322, + "step": 28805 + }, + { + "epoch": 16.0927374301676, + "grad_norm": 0.7122716307640076, + "learning_rate": 0.00019630252100840336, + "loss": 0.3935, + "step": 28806 + }, + { + "epoch": 16.093296089385476, + "grad_norm": 0.46890321373939514, + "learning_rate": 0.0001962745098039216, + "loss": 0.4412, + "step": 28807 + }, + { + "epoch": 16.093854748603352, + "grad_norm": 0.48771265149116516, + "learning_rate": 0.00019624649859943978, + "loss": 0.5041, + "step": 28808 + }, + { + "epoch": 16.09441340782123, + "grad_norm": 0.4522497057914734, + "learning_rate": 0.00019621848739495798, + "loss": 0.4605, + "step": 28809 + }, + { + "epoch": 16.094972067039105, + "grad_norm": 0.743918240070343, + "learning_rate": 0.0001961904761904762, + "loss": 0.3463, + "step": 28810 + }, + { + "epoch": 16.095530726256982, + "grad_norm": 6.346516132354736, + "learning_rate": 0.00019616246498599442, + "loss": 0.3904, + "step": 28811 + }, + { + "epoch": 16.096089385474862, + "grad_norm": 0.45202475786209106, + "learning_rate": 0.00019613445378151263, + "loss": 0.3986, + "step": 28812 + }, + { + "epoch": 16.09664804469274, + "grad_norm": 5.965319633483887, + "learning_rate": 0.0001961064425770308, + "loss": 0.4171, + "step": 28813 + }, + { + "epoch": 16.097206703910615, + "grad_norm": 0.3890116810798645, + "learning_rate": 0.000196078431372549, + "loss": 0.4416, + "step": 28814 + }, + { + "epoch": 16.09776536312849, + "grad_norm": 0.3330911099910736, + "learning_rate": 0.00019605042016806724, + "loss": 0.4189, + "step": 28815 + }, + { + "epoch": 16.098324022346368, + "grad_norm": 0.3990189731121063, + "learning_rate": 0.00019602240896358545, + "loss": 0.382, + "step": 28816 + }, + { + "epoch": 16.098882681564245, + "grad_norm": 0.5373325347900391, + "learning_rate": 0.00019599439775910363, + "loss": 0.4148, + "step": 28817 + }, + { + "epoch": 16.09944134078212, + "grad_norm": 0.6801446080207825, + "learning_rate": 0.00019596638655462184, + "loss": 0.4598, + "step": 28818 + }, + { + "epoch": 16.1, + "grad_norm": 0.3618169128894806, + "learning_rate": 0.00019593837535014007, + "loss": 0.2463, + "step": 28819 + }, + { + "epoch": 16.100558659217878, + "grad_norm": 0.5345287919044495, + "learning_rate": 0.00019591036414565827, + "loss": 0.4374, + "step": 28820 + }, + { + "epoch": 16.101117318435755, + "grad_norm": 0.5600844025611877, + "learning_rate": 0.00019588235294117648, + "loss": 0.3632, + "step": 28821 + }, + { + "epoch": 16.10167597765363, + "grad_norm": 0.42011335492134094, + "learning_rate": 0.00019585434173669466, + "loss": 0.405, + "step": 28822 + }, + { + "epoch": 16.102234636871508, + "grad_norm": 0.775844931602478, + "learning_rate": 0.0001958263305322129, + "loss": 0.4732, + "step": 28823 + }, + { + "epoch": 16.102793296089384, + "grad_norm": 0.47399938106536865, + "learning_rate": 0.0001957983193277311, + "loss": 0.3936, + "step": 28824 + }, + { + "epoch": 16.10335195530726, + "grad_norm": 0.42256438732147217, + "learning_rate": 0.0001957703081232493, + "loss": 0.4568, + "step": 28825 + }, + { + "epoch": 16.10391061452514, + "grad_norm": 0.570817232131958, + "learning_rate": 0.0001957422969187675, + "loss": 0.5531, + "step": 28826 + }, + { + "epoch": 16.104469273743018, + "grad_norm": 0.4960199296474457, + "learning_rate": 0.00019571428571428572, + "loss": 0.3889, + "step": 28827 + }, + { + "epoch": 16.105027932960894, + "grad_norm": 0.5583524107933044, + "learning_rate": 0.00019568627450980392, + "loss": 0.4265, + "step": 28828 + }, + { + "epoch": 16.10558659217877, + "grad_norm": 0.43021684885025024, + "learning_rate": 0.00019565826330532213, + "loss": 0.4337, + "step": 28829 + }, + { + "epoch": 16.106145251396647, + "grad_norm": 0.7041333317756653, + "learning_rate": 0.00019563025210084033, + "loss": 0.4424, + "step": 28830 + }, + { + "epoch": 16.106703910614524, + "grad_norm": 0.4659338891506195, + "learning_rate": 0.00019560224089635857, + "loss": 0.4191, + "step": 28831 + }, + { + "epoch": 16.107262569832404, + "grad_norm": 0.5400794148445129, + "learning_rate": 0.00019557422969187675, + "loss": 0.4044, + "step": 28832 + }, + { + "epoch": 16.10782122905028, + "grad_norm": 0.4359588921070099, + "learning_rate": 0.00019554621848739495, + "loss": 0.3595, + "step": 28833 + }, + { + "epoch": 16.108379888268157, + "grad_norm": 0.9698153138160706, + "learning_rate": 0.00019551820728291316, + "loss": 0.3382, + "step": 28834 + }, + { + "epoch": 16.108938547486034, + "grad_norm": 0.6758460402488708, + "learning_rate": 0.0001954901960784314, + "loss": 0.4053, + "step": 28835 + }, + { + "epoch": 16.10949720670391, + "grad_norm": 0.49740511178970337, + "learning_rate": 0.0001954621848739496, + "loss": 0.4474, + "step": 28836 + }, + { + "epoch": 16.110055865921787, + "grad_norm": 0.5072407126426697, + "learning_rate": 0.00019543417366946778, + "loss": 0.3999, + "step": 28837 + }, + { + "epoch": 16.110614525139663, + "grad_norm": 0.5797218084335327, + "learning_rate": 0.00019540616246498598, + "loss": 0.3937, + "step": 28838 + }, + { + "epoch": 16.111173184357543, + "grad_norm": 0.48711439967155457, + "learning_rate": 0.00019537815126050422, + "loss": 0.3525, + "step": 28839 + }, + { + "epoch": 16.11173184357542, + "grad_norm": 0.2812398374080658, + "learning_rate": 0.00019535014005602242, + "loss": 0.3679, + "step": 28840 + }, + { + "epoch": 16.112290502793297, + "grad_norm": 0.4230554699897766, + "learning_rate": 0.00019532212885154063, + "loss": 0.3439, + "step": 28841 + }, + { + "epoch": 16.112849162011173, + "grad_norm": 0.8616529107093811, + "learning_rate": 0.0001952941176470588, + "loss": 0.3844, + "step": 28842 + }, + { + "epoch": 16.11340782122905, + "grad_norm": 0.8501707315444946, + "learning_rate": 0.00019526610644257704, + "loss": 0.4702, + "step": 28843 + }, + { + "epoch": 16.113966480446926, + "grad_norm": 0.6084223389625549, + "learning_rate": 0.00019523809523809525, + "loss": 0.7092, + "step": 28844 + }, + { + "epoch": 16.114525139664803, + "grad_norm": 0.7039982080459595, + "learning_rate": 0.00019521008403361345, + "loss": 0.3849, + "step": 28845 + }, + { + "epoch": 16.115083798882683, + "grad_norm": 0.3611606955528259, + "learning_rate": 0.00019518207282913166, + "loss": 0.3713, + "step": 28846 + }, + { + "epoch": 16.11564245810056, + "grad_norm": 0.4002581238746643, + "learning_rate": 0.00019515406162464986, + "loss": 0.4499, + "step": 28847 + }, + { + "epoch": 16.116201117318436, + "grad_norm": 4.773829460144043, + "learning_rate": 0.00019512605042016807, + "loss": 0.3019, + "step": 28848 + }, + { + "epoch": 16.116759776536313, + "grad_norm": 0.5328097343444824, + "learning_rate": 0.00019509803921568628, + "loss": 0.4014, + "step": 28849 + }, + { + "epoch": 16.11731843575419, + "grad_norm": 0.5776244401931763, + "learning_rate": 0.00019507002801120448, + "loss": 0.4332, + "step": 28850 + }, + { + "epoch": 16.117877094972066, + "grad_norm": 1.8821289539337158, + "learning_rate": 0.00019504201680672271, + "loss": 0.4398, + "step": 28851 + }, + { + "epoch": 16.118435754189946, + "grad_norm": 0.5329691767692566, + "learning_rate": 0.0001950140056022409, + "loss": 0.4845, + "step": 28852 + }, + { + "epoch": 16.118994413407822, + "grad_norm": 0.6767594218254089, + "learning_rate": 0.0001949859943977591, + "loss": 0.4634, + "step": 28853 + }, + { + "epoch": 16.1195530726257, + "grad_norm": 1.8163594007492065, + "learning_rate": 0.0001949579831932773, + "loss": 0.5032, + "step": 28854 + }, + { + "epoch": 16.120111731843576, + "grad_norm": 0.36950555443763733, + "learning_rate": 0.00019492997198879554, + "loss": 0.3995, + "step": 28855 + }, + { + "epoch": 16.120670391061452, + "grad_norm": 0.47612613439559937, + "learning_rate": 0.00019490196078431374, + "loss": 0.4771, + "step": 28856 + }, + { + "epoch": 16.12122905027933, + "grad_norm": 0.419756680727005, + "learning_rate": 0.00019487394957983192, + "loss": 0.3311, + "step": 28857 + }, + { + "epoch": 16.121787709497205, + "grad_norm": 1.678883671760559, + "learning_rate": 0.00019484593837535013, + "loss": 0.3767, + "step": 28858 + }, + { + "epoch": 16.122346368715085, + "grad_norm": 0.36759650707244873, + "learning_rate": 0.00019481792717086836, + "loss": 0.3838, + "step": 28859 + }, + { + "epoch": 16.122905027932962, + "grad_norm": 0.39833423495292664, + "learning_rate": 0.00019478991596638657, + "loss": 0.3711, + "step": 28860 + }, + { + "epoch": 16.12346368715084, + "grad_norm": 0.4569665491580963, + "learning_rate": 0.00019476190476190477, + "loss": 0.3828, + "step": 28861 + }, + { + "epoch": 16.124022346368715, + "grad_norm": 0.6211580038070679, + "learning_rate": 0.00019473389355742295, + "loss": 0.3842, + "step": 28862 + }, + { + "epoch": 16.12458100558659, + "grad_norm": 0.41862139105796814, + "learning_rate": 0.0001947058823529412, + "loss": 0.4467, + "step": 28863 + }, + { + "epoch": 16.12513966480447, + "grad_norm": 0.41824907064437866, + "learning_rate": 0.0001946778711484594, + "loss": 0.393, + "step": 28864 + }, + { + "epoch": 16.125698324022345, + "grad_norm": 0.3791068494319916, + "learning_rate": 0.0001946498599439776, + "loss": 0.4825, + "step": 28865 + }, + { + "epoch": 16.126256983240225, + "grad_norm": 1.915249228477478, + "learning_rate": 0.0001946218487394958, + "loss": 0.4422, + "step": 28866 + }, + { + "epoch": 16.1268156424581, + "grad_norm": 0.7629830241203308, + "learning_rate": 0.000194593837535014, + "loss": 0.4596, + "step": 28867 + }, + { + "epoch": 16.127374301675978, + "grad_norm": 0.4331672191619873, + "learning_rate": 0.00019456582633053222, + "loss": 0.3667, + "step": 28868 + }, + { + "epoch": 16.127932960893855, + "grad_norm": 0.524337887763977, + "learning_rate": 0.00019453781512605042, + "loss": 0.455, + "step": 28869 + }, + { + "epoch": 16.12849162011173, + "grad_norm": 0.3574155867099762, + "learning_rate": 0.00019450980392156863, + "loss": 0.3914, + "step": 28870 + }, + { + "epoch": 16.129050279329608, + "grad_norm": 0.535084068775177, + "learning_rate": 0.00019448179271708683, + "loss": 0.5184, + "step": 28871 + }, + { + "epoch": 16.129608938547484, + "grad_norm": 0.3467373549938202, + "learning_rate": 0.00019445378151260504, + "loss": 0.3454, + "step": 28872 + }, + { + "epoch": 16.130167597765364, + "grad_norm": 0.4609588384628296, + "learning_rate": 0.00019442577030812325, + "loss": 0.4708, + "step": 28873 + }, + { + "epoch": 16.13072625698324, + "grad_norm": 0.9235809445381165, + "learning_rate": 0.00019439775910364145, + "loss": 0.3322, + "step": 28874 + }, + { + "epoch": 16.131284916201118, + "grad_norm": 0.6614897847175598, + "learning_rate": 0.00019436974789915969, + "loss": 0.377, + "step": 28875 + }, + { + "epoch": 16.131843575418994, + "grad_norm": 0.42744559049606323, + "learning_rate": 0.00019434173669467786, + "loss": 0.422, + "step": 28876 + }, + { + "epoch": 16.13240223463687, + "grad_norm": 0.4130760431289673, + "learning_rate": 0.00019431372549019607, + "loss": 0.4242, + "step": 28877 + }, + { + "epoch": 16.132960893854747, + "grad_norm": 0.5075344443321228, + "learning_rate": 0.00019428571428571428, + "loss": 0.4315, + "step": 28878 + }, + { + "epoch": 16.133519553072627, + "grad_norm": 0.8356727361679077, + "learning_rate": 0.0001942577030812325, + "loss": 0.3516, + "step": 28879 + }, + { + "epoch": 16.134078212290504, + "grad_norm": 2.4321515560150146, + "learning_rate": 0.00019422969187675072, + "loss": 0.4694, + "step": 28880 + }, + { + "epoch": 16.13463687150838, + "grad_norm": 1.457574486732483, + "learning_rate": 0.0001942016806722689, + "loss": 0.3689, + "step": 28881 + }, + { + "epoch": 16.135195530726257, + "grad_norm": 0.3981468975543976, + "learning_rate": 0.0001941736694677871, + "loss": 0.3455, + "step": 28882 + }, + { + "epoch": 16.135754189944134, + "grad_norm": 1.4101759195327759, + "learning_rate": 0.00019414565826330533, + "loss": 0.4785, + "step": 28883 + }, + { + "epoch": 16.13631284916201, + "grad_norm": 0.7086668014526367, + "learning_rate": 0.00019411764705882354, + "loss": 0.4477, + "step": 28884 + }, + { + "epoch": 16.136871508379887, + "grad_norm": 0.6071313619613647, + "learning_rate": 0.00019408963585434175, + "loss": 0.5448, + "step": 28885 + }, + { + "epoch": 16.137430167597767, + "grad_norm": 0.3937363922595978, + "learning_rate": 0.00019406162464985992, + "loss": 0.382, + "step": 28886 + }, + { + "epoch": 16.137988826815644, + "grad_norm": 0.34242311120033264, + "learning_rate": 0.00019403361344537816, + "loss": 0.3983, + "step": 28887 + }, + { + "epoch": 16.13854748603352, + "grad_norm": 0.6401903629302979, + "learning_rate": 0.00019400560224089636, + "loss": 0.392, + "step": 28888 + }, + { + "epoch": 16.139106145251397, + "grad_norm": 0.712629497051239, + "learning_rate": 0.00019397759103641457, + "loss": 0.3884, + "step": 28889 + }, + { + "epoch": 16.139664804469273, + "grad_norm": 0.500311017036438, + "learning_rate": 0.00019394957983193278, + "loss": 0.367, + "step": 28890 + }, + { + "epoch": 16.14022346368715, + "grad_norm": 0.5342960357666016, + "learning_rate": 0.00019392156862745098, + "loss": 0.4055, + "step": 28891 + }, + { + "epoch": 16.140782122905026, + "grad_norm": 0.543070375919342, + "learning_rate": 0.0001938935574229692, + "loss": 0.3509, + "step": 28892 + }, + { + "epoch": 16.141340782122906, + "grad_norm": 0.3987923264503479, + "learning_rate": 0.0001938655462184874, + "loss": 0.3665, + "step": 28893 + }, + { + "epoch": 16.141899441340783, + "grad_norm": 0.5370165705680847, + "learning_rate": 0.0001938375350140056, + "loss": 0.4208, + "step": 28894 + }, + { + "epoch": 16.14245810055866, + "grad_norm": 0.38203269243240356, + "learning_rate": 0.00019380952380952383, + "loss": 0.3763, + "step": 28895 + }, + { + "epoch": 16.143016759776536, + "grad_norm": 0.4971391260623932, + "learning_rate": 0.000193781512605042, + "loss": 0.4206, + "step": 28896 + }, + { + "epoch": 16.143575418994413, + "grad_norm": 1.1526046991348267, + "learning_rate": 0.00019375350140056022, + "loss": 0.3644, + "step": 28897 + }, + { + "epoch": 16.14413407821229, + "grad_norm": 0.33874574303627014, + "learning_rate": 0.00019372549019607842, + "loss": 0.3382, + "step": 28898 + }, + { + "epoch": 16.144692737430166, + "grad_norm": 0.35815566778182983, + "learning_rate": 0.00019369747899159666, + "loss": 0.3562, + "step": 28899 + }, + { + "epoch": 16.145251396648046, + "grad_norm": 0.36037832498550415, + "learning_rate": 0.00019366946778711486, + "loss": 0.3481, + "step": 28900 + }, + { + "epoch": 16.145810055865923, + "grad_norm": 0.6234013438224792, + "learning_rate": 0.00019364145658263304, + "loss": 0.4623, + "step": 28901 + }, + { + "epoch": 16.1463687150838, + "grad_norm": 0.4487958252429962, + "learning_rate": 0.00019361344537815125, + "loss": 0.3994, + "step": 28902 + }, + { + "epoch": 16.146927374301676, + "grad_norm": 0.45689263939857483, + "learning_rate": 0.00019358543417366948, + "loss": 0.5541, + "step": 28903 + }, + { + "epoch": 16.147486033519552, + "grad_norm": 0.45454445481300354, + "learning_rate": 0.0001935574229691877, + "loss": 0.4564, + "step": 28904 + }, + { + "epoch": 16.14804469273743, + "grad_norm": 0.592432975769043, + "learning_rate": 0.0001935294117647059, + "loss": 0.4554, + "step": 28905 + }, + { + "epoch": 16.14860335195531, + "grad_norm": 0.8670316934585571, + "learning_rate": 0.00019350140056022407, + "loss": 0.3391, + "step": 28906 + }, + { + "epoch": 16.149162011173186, + "grad_norm": 1.1139332056045532, + "learning_rate": 0.0001934733893557423, + "loss": 0.4316, + "step": 28907 + }, + { + "epoch": 16.149720670391062, + "grad_norm": 0.34945961833000183, + "learning_rate": 0.0001934453781512605, + "loss": 0.2629, + "step": 28908 + }, + { + "epoch": 16.15027932960894, + "grad_norm": 0.3951212167739868, + "learning_rate": 0.00019341736694677872, + "loss": 0.3899, + "step": 28909 + }, + { + "epoch": 16.150837988826815, + "grad_norm": 0.34655168652534485, + "learning_rate": 0.00019338935574229692, + "loss": 0.3558, + "step": 28910 + }, + { + "epoch": 16.15139664804469, + "grad_norm": 0.3671613335609436, + "learning_rate": 0.00019336134453781513, + "loss": 0.3536, + "step": 28911 + }, + { + "epoch": 16.15195530726257, + "grad_norm": 0.3306417465209961, + "learning_rate": 0.00019333333333333333, + "loss": 0.3765, + "step": 28912 + }, + { + "epoch": 16.15251396648045, + "grad_norm": 0.4065249264240265, + "learning_rate": 0.00019330532212885154, + "loss": 0.3667, + "step": 28913 + }, + { + "epoch": 16.153072625698325, + "grad_norm": 0.4873636066913605, + "learning_rate": 0.00019327731092436975, + "loss": 0.361, + "step": 28914 + }, + { + "epoch": 16.1536312849162, + "grad_norm": 0.42552927136421204, + "learning_rate": 0.00019324929971988798, + "loss": 0.5713, + "step": 28915 + }, + { + "epoch": 16.154189944134078, + "grad_norm": 1.208526849746704, + "learning_rate": 0.00019322128851540616, + "loss": 0.3693, + "step": 28916 + }, + { + "epoch": 16.154748603351955, + "grad_norm": 0.660973072052002, + "learning_rate": 0.00019319327731092436, + "loss": 0.3673, + "step": 28917 + }, + { + "epoch": 16.15530726256983, + "grad_norm": 0.35808494687080383, + "learning_rate": 0.00019316526610644257, + "loss": 0.3344, + "step": 28918 + }, + { + "epoch": 16.155865921787708, + "grad_norm": 0.38039055466651917, + "learning_rate": 0.0001931372549019608, + "loss": 0.4074, + "step": 28919 + }, + { + "epoch": 16.156424581005588, + "grad_norm": 0.5032056570053101, + "learning_rate": 0.000193109243697479, + "loss": 0.4776, + "step": 28920 + }, + { + "epoch": 16.156983240223465, + "grad_norm": 0.5298901200294495, + "learning_rate": 0.0001930812324929972, + "loss": 0.4434, + "step": 28921 + }, + { + "epoch": 16.15754189944134, + "grad_norm": 0.6850790977478027, + "learning_rate": 0.0001930532212885154, + "loss": 0.3908, + "step": 28922 + }, + { + "epoch": 16.158100558659218, + "grad_norm": 0.8807922601699829, + "learning_rate": 0.00019302521008403363, + "loss": 0.4267, + "step": 28923 + }, + { + "epoch": 16.158659217877094, + "grad_norm": 0.4602443277835846, + "learning_rate": 0.00019299719887955183, + "loss": 0.4627, + "step": 28924 + }, + { + "epoch": 16.15921787709497, + "grad_norm": 0.34916195273399353, + "learning_rate": 0.00019296918767507004, + "loss": 0.2844, + "step": 28925 + }, + { + "epoch": 16.159776536312847, + "grad_norm": 0.4090387523174286, + "learning_rate": 0.00019294117647058822, + "loss": 0.5318, + "step": 28926 + }, + { + "epoch": 16.160335195530728, + "grad_norm": 1.464445948600769, + "learning_rate": 0.00019291316526610645, + "loss": 0.3715, + "step": 28927 + }, + { + "epoch": 16.160893854748604, + "grad_norm": 0.383338987827301, + "learning_rate": 0.00019288515406162466, + "loss": 0.4827, + "step": 28928 + }, + { + "epoch": 16.16145251396648, + "grad_norm": 2.73823618888855, + "learning_rate": 0.00019285714285714286, + "loss": 0.443, + "step": 28929 + }, + { + "epoch": 16.162011173184357, + "grad_norm": 1.5407174825668335, + "learning_rate": 0.00019282913165266104, + "loss": 0.3322, + "step": 28930 + }, + { + "epoch": 16.162569832402234, + "grad_norm": 0.46490007638931274, + "learning_rate": 0.00019280112044817928, + "loss": 0.3855, + "step": 28931 + }, + { + "epoch": 16.16312849162011, + "grad_norm": 0.4543697237968445, + "learning_rate": 0.00019277310924369748, + "loss": 0.4277, + "step": 28932 + }, + { + "epoch": 16.16368715083799, + "grad_norm": 0.36964547634124756, + "learning_rate": 0.0001927450980392157, + "loss": 0.3994, + "step": 28933 + }, + { + "epoch": 16.164245810055867, + "grad_norm": 0.533072292804718, + "learning_rate": 0.0001927170868347339, + "loss": 0.3803, + "step": 28934 + }, + { + "epoch": 16.164804469273744, + "grad_norm": 0.471686989068985, + "learning_rate": 0.0001926890756302521, + "loss": 0.4216, + "step": 28935 + }, + { + "epoch": 16.16536312849162, + "grad_norm": 0.4997316300868988, + "learning_rate": 0.0001926610644257703, + "loss": 0.3474, + "step": 28936 + }, + { + "epoch": 16.165921787709497, + "grad_norm": 0.49094322323799133, + "learning_rate": 0.0001926330532212885, + "loss": 0.4135, + "step": 28937 + }, + { + "epoch": 16.166480446927373, + "grad_norm": 0.43086546659469604, + "learning_rate": 0.00019260504201680672, + "loss": 0.5377, + "step": 28938 + }, + { + "epoch": 16.16703910614525, + "grad_norm": 4.299013614654541, + "learning_rate": 0.00019257703081232495, + "loss": 0.4876, + "step": 28939 + }, + { + "epoch": 16.16759776536313, + "grad_norm": 0.48508575558662415, + "learning_rate": 0.00019254901960784313, + "loss": 0.413, + "step": 28940 + }, + { + "epoch": 16.168156424581007, + "grad_norm": 0.4109722375869751, + "learning_rate": 0.00019252100840336134, + "loss": 0.3512, + "step": 28941 + }, + { + "epoch": 16.168715083798883, + "grad_norm": 0.4435434639453888, + "learning_rate": 0.00019249299719887954, + "loss": 0.3758, + "step": 28942 + }, + { + "epoch": 16.16927374301676, + "grad_norm": 2.2694084644317627, + "learning_rate": 0.00019246498599439777, + "loss": 0.4886, + "step": 28943 + }, + { + "epoch": 16.169832402234636, + "grad_norm": 0.3850485682487488, + "learning_rate": 0.00019243697478991598, + "loss": 0.38, + "step": 28944 + }, + { + "epoch": 16.170391061452513, + "grad_norm": 0.5183414220809937, + "learning_rate": 0.00019240896358543416, + "loss": 0.4263, + "step": 28945 + }, + { + "epoch": 16.17094972067039, + "grad_norm": 1.2390666007995605, + "learning_rate": 0.00019238095238095237, + "loss": 0.3359, + "step": 28946 + }, + { + "epoch": 16.17150837988827, + "grad_norm": 0.6483945846557617, + "learning_rate": 0.0001923529411764706, + "loss": 0.4244, + "step": 28947 + }, + { + "epoch": 16.172067039106146, + "grad_norm": 5.2322258949279785, + "learning_rate": 0.0001923249299719888, + "loss": 0.4502, + "step": 28948 + }, + { + "epoch": 16.172625698324023, + "grad_norm": 0.7982333898544312, + "learning_rate": 0.000192296918767507, + "loss": 0.4099, + "step": 28949 + }, + { + "epoch": 16.1731843575419, + "grad_norm": 0.4285476505756378, + "learning_rate": 0.0001922689075630252, + "loss": 0.4088, + "step": 28950 + }, + { + "epoch": 16.173743016759776, + "grad_norm": 5.1269989013671875, + "learning_rate": 0.00019224089635854342, + "loss": 0.3385, + "step": 28951 + }, + { + "epoch": 16.174301675977652, + "grad_norm": 1.214417576789856, + "learning_rate": 0.00019221288515406163, + "loss": 0.413, + "step": 28952 + }, + { + "epoch": 16.174860335195532, + "grad_norm": 0.46083489060401917, + "learning_rate": 0.00019218487394957983, + "loss": 0.3782, + "step": 28953 + }, + { + "epoch": 16.17541899441341, + "grad_norm": 0.366047203540802, + "learning_rate": 0.00019215686274509807, + "loss": 0.3492, + "step": 28954 + }, + { + "epoch": 16.175977653631286, + "grad_norm": 3.1637935638427734, + "learning_rate": 0.00019212885154061625, + "loss": 0.3387, + "step": 28955 + }, + { + "epoch": 16.176536312849162, + "grad_norm": 0.5342099666595459, + "learning_rate": 0.00019210084033613445, + "loss": 0.334, + "step": 28956 + }, + { + "epoch": 16.17709497206704, + "grad_norm": 0.6804115176200867, + "learning_rate": 0.00019207282913165266, + "loss": 0.437, + "step": 28957 + }, + { + "epoch": 16.177653631284915, + "grad_norm": 0.574478805065155, + "learning_rate": 0.0001920448179271709, + "loss": 0.4401, + "step": 28958 + }, + { + "epoch": 16.178212290502792, + "grad_norm": 0.4467751383781433, + "learning_rate": 0.0001920168067226891, + "loss": 0.5054, + "step": 28959 + }, + { + "epoch": 16.178770949720672, + "grad_norm": 0.35941118001937866, + "learning_rate": 0.00019198879551820728, + "loss": 0.396, + "step": 28960 + }, + { + "epoch": 16.17932960893855, + "grad_norm": 0.3976753354072571, + "learning_rate": 0.00019196078431372548, + "loss": 0.3442, + "step": 28961 + }, + { + "epoch": 16.179888268156425, + "grad_norm": 1.8833200931549072, + "learning_rate": 0.00019193277310924372, + "loss": 0.4503, + "step": 28962 + }, + { + "epoch": 16.1804469273743, + "grad_norm": 0.5222572088241577, + "learning_rate": 0.00019190476190476192, + "loss": 0.4895, + "step": 28963 + }, + { + "epoch": 16.18100558659218, + "grad_norm": 1.4167481660842896, + "learning_rate": 0.00019187675070028013, + "loss": 0.3876, + "step": 28964 + }, + { + "epoch": 16.181564245810055, + "grad_norm": 0.4568377733230591, + "learning_rate": 0.0001918487394957983, + "loss": 0.4048, + "step": 28965 + }, + { + "epoch": 16.18212290502793, + "grad_norm": 0.351442813873291, + "learning_rate": 0.00019182072829131654, + "loss": 0.3562, + "step": 28966 + }, + { + "epoch": 16.18268156424581, + "grad_norm": 0.32675302028656006, + "learning_rate": 0.00019179271708683475, + "loss": 0.362, + "step": 28967 + }, + { + "epoch": 16.183240223463688, + "grad_norm": 0.44293713569641113, + "learning_rate": 0.00019176470588235295, + "loss": 0.4205, + "step": 28968 + }, + { + "epoch": 16.183798882681565, + "grad_norm": 0.36270028352737427, + "learning_rate": 0.00019173669467787116, + "loss": 0.4336, + "step": 28969 + }, + { + "epoch": 16.18435754189944, + "grad_norm": 1.8359348773956299, + "learning_rate": 0.00019170868347338936, + "loss": 0.3976, + "step": 28970 + }, + { + "epoch": 16.184916201117318, + "grad_norm": 0.5431550145149231, + "learning_rate": 0.00019168067226890757, + "loss": 0.452, + "step": 28971 + }, + { + "epoch": 16.185474860335194, + "grad_norm": 0.4765489995479584, + "learning_rate": 0.00019165266106442578, + "loss": 0.4306, + "step": 28972 + }, + { + "epoch": 16.18603351955307, + "grad_norm": 0.5705583095550537, + "learning_rate": 0.00019162464985994398, + "loss": 0.5865, + "step": 28973 + }, + { + "epoch": 16.18659217877095, + "grad_norm": 0.6350952386856079, + "learning_rate": 0.00019159663865546221, + "loss": 0.3381, + "step": 28974 + }, + { + "epoch": 16.187150837988828, + "grad_norm": 0.5249627232551575, + "learning_rate": 0.0001915686274509804, + "loss": 0.4851, + "step": 28975 + }, + { + "epoch": 16.187709497206704, + "grad_norm": 3.483253002166748, + "learning_rate": 0.0001915406162464986, + "loss": 0.4001, + "step": 28976 + }, + { + "epoch": 16.18826815642458, + "grad_norm": 0.6546013355255127, + "learning_rate": 0.0001915126050420168, + "loss": 0.4291, + "step": 28977 + }, + { + "epoch": 16.188826815642457, + "grad_norm": 0.4770772457122803, + "learning_rate": 0.00019148459383753504, + "loss": 0.4826, + "step": 28978 + }, + { + "epoch": 16.189385474860334, + "grad_norm": 0.6366468667984009, + "learning_rate": 0.00019145658263305324, + "loss": 0.4469, + "step": 28979 + }, + { + "epoch": 16.189944134078214, + "grad_norm": 0.6755110621452332, + "learning_rate": 0.00019142857142857142, + "loss": 0.4744, + "step": 28980 + }, + { + "epoch": 16.19050279329609, + "grad_norm": 0.8556031584739685, + "learning_rate": 0.00019140056022408963, + "loss": 0.5351, + "step": 28981 + }, + { + "epoch": 16.191061452513967, + "grad_norm": 0.6915291547775269, + "learning_rate": 0.00019137254901960786, + "loss": 0.6405, + "step": 28982 + }, + { + "epoch": 16.191620111731844, + "grad_norm": 0.6408063769340515, + "learning_rate": 0.00019134453781512607, + "loss": 0.6071, + "step": 28983 + }, + { + "epoch": 16.19217877094972, + "grad_norm": 0.39270055294036865, + "learning_rate": 0.00019131652661064425, + "loss": 0.4022, + "step": 28984 + }, + { + "epoch": 16.192737430167597, + "grad_norm": 0.4455922842025757, + "learning_rate": 0.00019128851540616245, + "loss": 0.4118, + "step": 28985 + }, + { + "epoch": 16.193296089385473, + "grad_norm": 0.4017958343029022, + "learning_rate": 0.0001912605042016807, + "loss": 0.4036, + "step": 28986 + }, + { + "epoch": 16.193854748603353, + "grad_norm": 0.4968282878398895, + "learning_rate": 0.0001912324929971989, + "loss": 0.4791, + "step": 28987 + }, + { + "epoch": 16.19441340782123, + "grad_norm": 0.3548322021961212, + "learning_rate": 0.0001912044817927171, + "loss": 0.3629, + "step": 28988 + }, + { + "epoch": 16.194972067039107, + "grad_norm": 0.3978058993816376, + "learning_rate": 0.00019117647058823528, + "loss": 0.408, + "step": 28989 + }, + { + "epoch": 16.195530726256983, + "grad_norm": 0.4985060393810272, + "learning_rate": 0.0001911484593837535, + "loss": 0.4459, + "step": 28990 + }, + { + "epoch": 16.19608938547486, + "grad_norm": 1.926979422569275, + "learning_rate": 0.00019112044817927172, + "loss": 0.478, + "step": 28991 + }, + { + "epoch": 16.196648044692736, + "grad_norm": 0.41042885184288025, + "learning_rate": 0.00019109243697478992, + "loss": 0.2723, + "step": 28992 + }, + { + "epoch": 16.197206703910613, + "grad_norm": 0.3781493604183197, + "learning_rate": 0.00019106442577030813, + "loss": 0.385, + "step": 28993 + }, + { + "epoch": 16.197765363128493, + "grad_norm": 0.36626923084259033, + "learning_rate": 0.00019103641456582633, + "loss": 0.3516, + "step": 28994 + }, + { + "epoch": 16.19832402234637, + "grad_norm": 0.45216286182403564, + "learning_rate": 0.00019100840336134454, + "loss": 0.4302, + "step": 28995 + }, + { + "epoch": 16.198882681564246, + "grad_norm": 0.8124073147773743, + "learning_rate": 0.00019098039215686275, + "loss": 0.3794, + "step": 28996 + }, + { + "epoch": 16.199441340782123, + "grad_norm": 0.7732542753219604, + "learning_rate": 0.00019095238095238095, + "loss": 0.3908, + "step": 28997 + }, + { + "epoch": 16.2, + "grad_norm": 0.6467917561531067, + "learning_rate": 0.00019092436974789919, + "loss": 0.4249, + "step": 28998 + }, + { + "epoch": 16.200558659217876, + "grad_norm": 0.3322466313838959, + "learning_rate": 0.00019089635854341736, + "loss": 0.3154, + "step": 28999 + }, + { + "epoch": 16.201117318435756, + "grad_norm": 0.7633682489395142, + "learning_rate": 0.00019086834733893557, + "loss": 0.339, + "step": 29000 + }, + { + "epoch": 16.201117318435756, + "eval_cer": 0.08562731170829105, + "eval_loss": 0.3217138350009918, + "eval_runtime": 55.4967, + "eval_samples_per_second": 81.771, + "eval_steps_per_second": 5.117, + "eval_wer": 0.3395156559872629, + "step": 29000 + }, + { + "epoch": 16.201675977653633, + "grad_norm": 0.49037250876426697, + "learning_rate": 0.00019084033613445378, + "loss": 0.425, + "step": 29001 + }, + { + "epoch": 16.20223463687151, + "grad_norm": 0.3645758032798767, + "learning_rate": 0.000190812324929972, + "loss": 0.3603, + "step": 29002 + }, + { + "epoch": 16.202793296089386, + "grad_norm": 0.41297295689582825, + "learning_rate": 0.00019078431372549022, + "loss": 0.4849, + "step": 29003 + }, + { + "epoch": 16.203351955307262, + "grad_norm": 0.36511531472206116, + "learning_rate": 0.0001907563025210084, + "loss": 0.3893, + "step": 29004 + }, + { + "epoch": 16.20391061452514, + "grad_norm": 0.5171051621437073, + "learning_rate": 0.0001907282913165266, + "loss": 0.3123, + "step": 29005 + }, + { + "epoch": 16.204469273743015, + "grad_norm": 0.4663669466972351, + "learning_rate": 0.00019070028011204483, + "loss": 0.3906, + "step": 29006 + }, + { + "epoch": 16.205027932960895, + "grad_norm": 0.49591711163520813, + "learning_rate": 0.00019067226890756304, + "loss": 0.4604, + "step": 29007 + }, + { + "epoch": 16.205586592178772, + "grad_norm": 0.7694417834281921, + "learning_rate": 0.00019064425770308125, + "loss": 0.3802, + "step": 29008 + }, + { + "epoch": 16.20614525139665, + "grad_norm": 1.1275218725204468, + "learning_rate": 0.00019061624649859942, + "loss": 0.5457, + "step": 29009 + }, + { + "epoch": 16.206703910614525, + "grad_norm": 0.6767483949661255, + "learning_rate": 0.00019058823529411766, + "loss": 0.4056, + "step": 29010 + }, + { + "epoch": 16.2072625698324, + "grad_norm": 1.1713223457336426, + "learning_rate": 0.00019056022408963586, + "loss": 0.32, + "step": 29011 + }, + { + "epoch": 16.20782122905028, + "grad_norm": 0.41864219307899475, + "learning_rate": 0.00019053221288515407, + "loss": 0.4077, + "step": 29012 + }, + { + "epoch": 16.208379888268155, + "grad_norm": 0.6184785962104797, + "learning_rate": 0.00019050420168067228, + "loss": 0.3934, + "step": 29013 + }, + { + "epoch": 16.208938547486035, + "grad_norm": 0.449820339679718, + "learning_rate": 0.00019047619047619048, + "loss": 0.4662, + "step": 29014 + }, + { + "epoch": 16.20949720670391, + "grad_norm": 0.6108324527740479, + "learning_rate": 0.0001904481792717087, + "loss": 0.6847, + "step": 29015 + }, + { + "epoch": 16.210055865921788, + "grad_norm": 0.45040029287338257, + "learning_rate": 0.0001904201680672269, + "loss": 0.379, + "step": 29016 + }, + { + "epoch": 16.210614525139665, + "grad_norm": 0.6352062821388245, + "learning_rate": 0.0001903921568627451, + "loss": 0.4382, + "step": 29017 + }, + { + "epoch": 16.21117318435754, + "grad_norm": 0.6224572658538818, + "learning_rate": 0.00019036414565826333, + "loss": 0.4887, + "step": 29018 + }, + { + "epoch": 16.211731843575418, + "grad_norm": 0.4349299967288971, + "learning_rate": 0.0001903361344537815, + "loss": 0.3016, + "step": 29019 + }, + { + "epoch": 16.212290502793294, + "grad_norm": 0.5045689344406128, + "learning_rate": 0.00019030812324929972, + "loss": 0.4532, + "step": 29020 + }, + { + "epoch": 16.212849162011175, + "grad_norm": 0.6894789338111877, + "learning_rate": 0.00019028011204481792, + "loss": 0.5491, + "step": 29021 + }, + { + "epoch": 16.21340782122905, + "grad_norm": 0.5629291534423828, + "learning_rate": 0.00019025210084033616, + "loss": 0.3674, + "step": 29022 + }, + { + "epoch": 16.213966480446928, + "grad_norm": 0.49713167548179626, + "learning_rate": 0.00019022408963585436, + "loss": 0.439, + "step": 29023 + }, + { + "epoch": 16.214525139664804, + "grad_norm": 0.9506420493125916, + "learning_rate": 0.00019019607843137254, + "loss": 0.5599, + "step": 29024 + }, + { + "epoch": 16.21508379888268, + "grad_norm": 0.39019033312797546, + "learning_rate": 0.00019016806722689075, + "loss": 0.4051, + "step": 29025 + }, + { + "epoch": 16.215642458100557, + "grad_norm": 0.3452981114387512, + "learning_rate": 0.00019014005602240898, + "loss": 0.4062, + "step": 29026 + }, + { + "epoch": 16.216201117318437, + "grad_norm": 1.3234254121780396, + "learning_rate": 0.0001901120448179272, + "loss": 0.3791, + "step": 29027 + }, + { + "epoch": 16.216759776536314, + "grad_norm": 0.46408969163894653, + "learning_rate": 0.0001900840336134454, + "loss": 0.3053, + "step": 29028 + }, + { + "epoch": 16.21731843575419, + "grad_norm": 0.6138445734977722, + "learning_rate": 0.00019005602240896357, + "loss": 0.2913, + "step": 29029 + }, + { + "epoch": 16.217877094972067, + "grad_norm": 0.5210884809494019, + "learning_rate": 0.0001900280112044818, + "loss": 0.4327, + "step": 29030 + }, + { + "epoch": 16.218435754189944, + "grad_norm": 0.4872722625732422, + "learning_rate": 0.00019, + "loss": 0.3336, + "step": 29031 + }, + { + "epoch": 16.21899441340782, + "grad_norm": 0.3628700077533722, + "learning_rate": 0.00018997198879551822, + "loss": 0.3824, + "step": 29032 + }, + { + "epoch": 16.219553072625697, + "grad_norm": 0.3308964967727661, + "learning_rate": 0.00018994397759103642, + "loss": 0.3566, + "step": 29033 + }, + { + "epoch": 16.220111731843577, + "grad_norm": 0.918548047542572, + "learning_rate": 0.00018991596638655463, + "loss": 0.5195, + "step": 29034 + }, + { + "epoch": 16.220670391061454, + "grad_norm": 0.7363218069076538, + "learning_rate": 0.00018988795518207283, + "loss": 0.3971, + "step": 29035 + }, + { + "epoch": 16.22122905027933, + "grad_norm": 0.39213788509368896, + "learning_rate": 0.00018985994397759104, + "loss": 0.3311, + "step": 29036 + }, + { + "epoch": 16.221787709497207, + "grad_norm": 0.42629334330558777, + "learning_rate": 0.00018983193277310925, + "loss": 0.4387, + "step": 29037 + }, + { + "epoch": 16.222346368715083, + "grad_norm": 0.7424758672714233, + "learning_rate": 0.00018980392156862745, + "loss": 0.5691, + "step": 29038 + }, + { + "epoch": 16.22290502793296, + "grad_norm": 0.7804065346717834, + "learning_rate": 0.00018977591036414566, + "loss": 0.441, + "step": 29039 + }, + { + "epoch": 16.223463687150836, + "grad_norm": 0.43857377767562866, + "learning_rate": 0.00018974789915966386, + "loss": 0.4612, + "step": 29040 + }, + { + "epoch": 16.224022346368717, + "grad_norm": 0.5992130637168884, + "learning_rate": 0.00018971988795518207, + "loss": 0.4271, + "step": 29041 + }, + { + "epoch": 16.224581005586593, + "grad_norm": 0.4632066786289215, + "learning_rate": 0.0001896918767507003, + "loss": 0.4237, + "step": 29042 + }, + { + "epoch": 16.22513966480447, + "grad_norm": 0.4995499551296234, + "learning_rate": 0.00018966386554621848, + "loss": 0.3592, + "step": 29043 + }, + { + "epoch": 16.225698324022346, + "grad_norm": 0.40829551219940186, + "learning_rate": 0.0001896358543417367, + "loss": 0.3492, + "step": 29044 + }, + { + "epoch": 16.226256983240223, + "grad_norm": 0.34807339310646057, + "learning_rate": 0.0001896078431372549, + "loss": 0.3914, + "step": 29045 + }, + { + "epoch": 16.2268156424581, + "grad_norm": 5.844367027282715, + "learning_rate": 0.00018957983193277313, + "loss": 0.4812, + "step": 29046 + }, + { + "epoch": 16.227374301675976, + "grad_norm": 0.4908343255519867, + "learning_rate": 0.00018955182072829133, + "loss": 0.4931, + "step": 29047 + }, + { + "epoch": 16.227932960893856, + "grad_norm": 1.6199932098388672, + "learning_rate": 0.0001895238095238095, + "loss": 0.6624, + "step": 29048 + }, + { + "epoch": 16.228491620111733, + "grad_norm": 0.5040088295936584, + "learning_rate": 0.00018949579831932772, + "loss": 0.4234, + "step": 29049 + }, + { + "epoch": 16.22905027932961, + "grad_norm": 0.40048477053642273, + "learning_rate": 0.00018946778711484595, + "loss": 0.4307, + "step": 29050 + }, + { + "epoch": 16.229608938547486, + "grad_norm": 0.46609658002853394, + "learning_rate": 0.00018943977591036416, + "loss": 0.5186, + "step": 29051 + }, + { + "epoch": 16.230167597765362, + "grad_norm": 0.37821826338768005, + "learning_rate": 0.00018941176470588236, + "loss": 0.3089, + "step": 29052 + }, + { + "epoch": 16.23072625698324, + "grad_norm": 0.5133246779441833, + "learning_rate": 0.00018938375350140054, + "loss": 0.5561, + "step": 29053 + }, + { + "epoch": 16.23128491620112, + "grad_norm": 0.6027588844299316, + "learning_rate": 0.00018935574229691878, + "loss": 0.6304, + "step": 29054 + }, + { + "epoch": 16.231843575418996, + "grad_norm": 2.0489094257354736, + "learning_rate": 0.00018932773109243698, + "loss": 0.4385, + "step": 29055 + }, + { + "epoch": 16.232402234636872, + "grad_norm": 0.45070499181747437, + "learning_rate": 0.0001892997198879552, + "loss": 0.4004, + "step": 29056 + }, + { + "epoch": 16.23296089385475, + "grad_norm": 0.32728439569473267, + "learning_rate": 0.0001892717086834734, + "loss": 0.3516, + "step": 29057 + }, + { + "epoch": 16.233519553072625, + "grad_norm": 0.45389971137046814, + "learning_rate": 0.0001892436974789916, + "loss": 0.4836, + "step": 29058 + }, + { + "epoch": 16.234078212290502, + "grad_norm": 2.1861369609832764, + "learning_rate": 0.0001892156862745098, + "loss": 0.3212, + "step": 29059 + }, + { + "epoch": 16.23463687150838, + "grad_norm": 0.5019142031669617, + "learning_rate": 0.000189187675070028, + "loss": 0.3969, + "step": 29060 + }, + { + "epoch": 16.23519553072626, + "grad_norm": 0.9010175466537476, + "learning_rate": 0.00018915966386554622, + "loss": 0.3848, + "step": 29061 + }, + { + "epoch": 16.235754189944135, + "grad_norm": 0.3512330949306488, + "learning_rate": 0.00018913165266106445, + "loss": 0.3354, + "step": 29062 + }, + { + "epoch": 16.23631284916201, + "grad_norm": 0.4336191713809967, + "learning_rate": 0.00018910364145658263, + "loss": 0.4292, + "step": 29063 + }, + { + "epoch": 16.23687150837989, + "grad_norm": 0.4691496789455414, + "learning_rate": 0.00018907563025210084, + "loss": 0.4035, + "step": 29064 + }, + { + "epoch": 16.237430167597765, + "grad_norm": 0.3892434537410736, + "learning_rate": 0.00018904761904761904, + "loss": 0.3894, + "step": 29065 + }, + { + "epoch": 16.23798882681564, + "grad_norm": 0.4409516751766205, + "learning_rate": 0.00018901960784313727, + "loss": 0.3611, + "step": 29066 + }, + { + "epoch": 16.238547486033518, + "grad_norm": 1.963951826095581, + "learning_rate": 0.00018899159663865548, + "loss": 0.376, + "step": 29067 + }, + { + "epoch": 16.239106145251398, + "grad_norm": 0.48292070627212524, + "learning_rate": 0.00018896358543417366, + "loss": 0.4659, + "step": 29068 + }, + { + "epoch": 16.239664804469275, + "grad_norm": 0.4007108211517334, + "learning_rate": 0.00018893557422969187, + "loss": 0.4485, + "step": 29069 + }, + { + "epoch": 16.24022346368715, + "grad_norm": 0.32865309715270996, + "learning_rate": 0.0001889075630252101, + "loss": 0.2822, + "step": 29070 + }, + { + "epoch": 16.240782122905028, + "grad_norm": 2.792288064956665, + "learning_rate": 0.0001888795518207283, + "loss": 0.531, + "step": 29071 + }, + { + "epoch": 16.241340782122904, + "grad_norm": 1.0920830965042114, + "learning_rate": 0.0001888515406162465, + "loss": 0.4286, + "step": 29072 + }, + { + "epoch": 16.24189944134078, + "grad_norm": 0.43305540084838867, + "learning_rate": 0.0001888235294117647, + "loss": 0.4685, + "step": 29073 + }, + { + "epoch": 16.242458100558657, + "grad_norm": 0.45835575461387634, + "learning_rate": 0.00018879551820728292, + "loss": 0.4401, + "step": 29074 + }, + { + "epoch": 16.243016759776538, + "grad_norm": 0.45942288637161255, + "learning_rate": 0.00018876750700280113, + "loss": 0.3581, + "step": 29075 + }, + { + "epoch": 16.243575418994414, + "grad_norm": 1.5817164182662964, + "learning_rate": 0.00018873949579831933, + "loss": 0.3596, + "step": 29076 + }, + { + "epoch": 16.24413407821229, + "grad_norm": 0.3791969120502472, + "learning_rate": 0.00018871148459383754, + "loss": 0.317, + "step": 29077 + }, + { + "epoch": 16.244692737430167, + "grad_norm": 0.6134214401245117, + "learning_rate": 0.00018868347338935575, + "loss": 0.4473, + "step": 29078 + }, + { + "epoch": 16.245251396648044, + "grad_norm": 0.9240784645080566, + "learning_rate": 0.00018865546218487395, + "loss": 0.4651, + "step": 29079 + }, + { + "epoch": 16.24581005586592, + "grad_norm": 0.42309319972991943, + "learning_rate": 0.00018862745098039216, + "loss": 0.3474, + "step": 29080 + }, + { + "epoch": 16.2463687150838, + "grad_norm": 0.6950482726097107, + "learning_rate": 0.00018859943977591036, + "loss": 0.45, + "step": 29081 + }, + { + "epoch": 16.246927374301677, + "grad_norm": 0.48081183433532715, + "learning_rate": 0.0001885714285714286, + "loss": 0.2722, + "step": 29082 + }, + { + "epoch": 16.247486033519554, + "grad_norm": 0.9204301238059998, + "learning_rate": 0.00018854341736694678, + "loss": 0.4694, + "step": 29083 + }, + { + "epoch": 16.24804469273743, + "grad_norm": 0.5251319408416748, + "learning_rate": 0.00018851540616246498, + "loss": 0.4333, + "step": 29084 + }, + { + "epoch": 16.248603351955307, + "grad_norm": 1.100652813911438, + "learning_rate": 0.0001884873949579832, + "loss": 0.5262, + "step": 29085 + }, + { + "epoch": 16.249162011173183, + "grad_norm": 1.8652902841567993, + "learning_rate": 0.00018845938375350142, + "loss": 0.4302, + "step": 29086 + }, + { + "epoch": 16.24972067039106, + "grad_norm": 1.2497718334197998, + "learning_rate": 0.00018843137254901963, + "loss": 0.404, + "step": 29087 + }, + { + "epoch": 16.25027932960894, + "grad_norm": 0.4833080768585205, + "learning_rate": 0.0001884033613445378, + "loss": 0.4255, + "step": 29088 + }, + { + "epoch": 16.250837988826817, + "grad_norm": 0.5544809699058533, + "learning_rate": 0.000188375350140056, + "loss": 0.365, + "step": 29089 + }, + { + "epoch": 16.251396648044693, + "grad_norm": 0.3277098536491394, + "learning_rate": 0.00018834733893557425, + "loss": 0.3109, + "step": 29090 + }, + { + "epoch": 16.25195530726257, + "grad_norm": 2.0763208866119385, + "learning_rate": 0.00018831932773109245, + "loss": 0.3628, + "step": 29091 + }, + { + "epoch": 16.252513966480446, + "grad_norm": 5.025925159454346, + "learning_rate": 0.00018829131652661066, + "loss": 0.347, + "step": 29092 + }, + { + "epoch": 16.253072625698323, + "grad_norm": 0.4993504583835602, + "learning_rate": 0.00018826330532212884, + "loss": 0.4132, + "step": 29093 + }, + { + "epoch": 16.2536312849162, + "grad_norm": 0.3727685213088989, + "learning_rate": 0.00018823529411764707, + "loss": 0.3961, + "step": 29094 + }, + { + "epoch": 16.25418994413408, + "grad_norm": 0.5742563605308533, + "learning_rate": 0.00018820728291316528, + "loss": 0.4418, + "step": 29095 + }, + { + "epoch": 16.254748603351956, + "grad_norm": 0.35151973366737366, + "learning_rate": 0.00018817927170868348, + "loss": 0.4017, + "step": 29096 + }, + { + "epoch": 16.255307262569833, + "grad_norm": 0.515477180480957, + "learning_rate": 0.00018815126050420166, + "loss": 0.5386, + "step": 29097 + }, + { + "epoch": 16.25586592178771, + "grad_norm": 0.4228178858757019, + "learning_rate": 0.0001881232492997199, + "loss": 0.3413, + "step": 29098 + }, + { + "epoch": 16.256424581005586, + "grad_norm": 0.40588104724884033, + "learning_rate": 0.0001880952380952381, + "loss": 0.3689, + "step": 29099 + }, + { + "epoch": 16.256983240223462, + "grad_norm": 1.015671730041504, + "learning_rate": 0.0001880672268907563, + "loss": 0.3922, + "step": 29100 + }, + { + "epoch": 16.257541899441343, + "grad_norm": 0.3605565130710602, + "learning_rate": 0.0001880392156862745, + "loss": 0.3401, + "step": 29101 + }, + { + "epoch": 16.25810055865922, + "grad_norm": 0.9874143600463867, + "learning_rate": 0.00018801120448179272, + "loss": 0.4106, + "step": 29102 + }, + { + "epoch": 16.258659217877096, + "grad_norm": 0.4626317620277405, + "learning_rate": 0.00018798319327731092, + "loss": 0.3945, + "step": 29103 + }, + { + "epoch": 16.259217877094972, + "grad_norm": 0.3932786285877228, + "learning_rate": 0.00018795518207282913, + "loss": 0.4554, + "step": 29104 + }, + { + "epoch": 16.25977653631285, + "grad_norm": 0.5947979688644409, + "learning_rate": 0.00018792717086834734, + "loss": 0.4747, + "step": 29105 + }, + { + "epoch": 16.260335195530725, + "grad_norm": 0.45465967059135437, + "learning_rate": 0.00018789915966386557, + "loss": 0.4238, + "step": 29106 + }, + { + "epoch": 16.260893854748602, + "grad_norm": 0.38727518916130066, + "learning_rate": 0.00018787114845938375, + "loss": 0.3378, + "step": 29107 + }, + { + "epoch": 16.261452513966482, + "grad_norm": 0.546984076499939, + "learning_rate": 0.00018784313725490195, + "loss": 0.3193, + "step": 29108 + }, + { + "epoch": 16.26201117318436, + "grad_norm": 0.6047913432121277, + "learning_rate": 0.00018781512605042016, + "loss": 0.3548, + "step": 29109 + }, + { + "epoch": 16.262569832402235, + "grad_norm": 0.5609403848648071, + "learning_rate": 0.0001877871148459384, + "loss": 0.44, + "step": 29110 + }, + { + "epoch": 16.26312849162011, + "grad_norm": 0.3945479393005371, + "learning_rate": 0.0001877591036414566, + "loss": 0.428, + "step": 29111 + }, + { + "epoch": 16.26368715083799, + "grad_norm": 3.5814199447631836, + "learning_rate": 0.00018773109243697478, + "loss": 0.3967, + "step": 29112 + }, + { + "epoch": 16.264245810055865, + "grad_norm": 0.5166468024253845, + "learning_rate": 0.00018770308123249298, + "loss": 0.4152, + "step": 29113 + }, + { + "epoch": 16.26480446927374, + "grad_norm": 0.6188220381736755, + "learning_rate": 0.00018767507002801122, + "loss": 0.4333, + "step": 29114 + }, + { + "epoch": 16.26536312849162, + "grad_norm": 0.43525469303131104, + "learning_rate": 0.00018764705882352942, + "loss": 0.3802, + "step": 29115 + }, + { + "epoch": 16.265921787709498, + "grad_norm": 0.6135135889053345, + "learning_rate": 0.00018761904761904763, + "loss": 0.4858, + "step": 29116 + }, + { + "epoch": 16.266480446927375, + "grad_norm": 1.1723921298980713, + "learning_rate": 0.0001875910364145658, + "loss": 0.4417, + "step": 29117 + }, + { + "epoch": 16.26703910614525, + "grad_norm": 0.39319777488708496, + "learning_rate": 0.00018756302521008404, + "loss": 0.4916, + "step": 29118 + }, + { + "epoch": 16.267597765363128, + "grad_norm": 0.5077816247940063, + "learning_rate": 0.00018753501400560225, + "loss": 0.3119, + "step": 29119 + }, + { + "epoch": 16.268156424581004, + "grad_norm": 0.5423672199249268, + "learning_rate": 0.00018750700280112045, + "loss": 0.3795, + "step": 29120 + }, + { + "epoch": 16.26871508379888, + "grad_norm": 0.5061722993850708, + "learning_rate": 0.00018747899159663866, + "loss": 0.4888, + "step": 29121 + }, + { + "epoch": 16.26927374301676, + "grad_norm": 16.067617416381836, + "learning_rate": 0.00018745098039215686, + "loss": 0.3969, + "step": 29122 + }, + { + "epoch": 16.269832402234638, + "grad_norm": 0.3667023777961731, + "learning_rate": 0.00018742296918767507, + "loss": 0.3164, + "step": 29123 + }, + { + "epoch": 16.270391061452514, + "grad_norm": 0.3833708167076111, + "learning_rate": 0.00018739495798319328, + "loss": 0.3643, + "step": 29124 + }, + { + "epoch": 16.27094972067039, + "grad_norm": 0.42458459734916687, + "learning_rate": 0.00018736694677871148, + "loss": 0.3894, + "step": 29125 + }, + { + "epoch": 16.271508379888267, + "grad_norm": 0.8859478235244751, + "learning_rate": 0.00018733893557422972, + "loss": 0.4356, + "step": 29126 + }, + { + "epoch": 16.272067039106144, + "grad_norm": 0.3674730360507965, + "learning_rate": 0.0001873109243697479, + "loss": 0.3278, + "step": 29127 + }, + { + "epoch": 16.272625698324024, + "grad_norm": 0.3688463568687439, + "learning_rate": 0.0001872829131652661, + "loss": 0.4002, + "step": 29128 + }, + { + "epoch": 16.2731843575419, + "grad_norm": 2.6958277225494385, + "learning_rate": 0.0001872549019607843, + "loss": 0.5277, + "step": 29129 + }, + { + "epoch": 16.273743016759777, + "grad_norm": 0.4835015535354614, + "learning_rate": 0.00018722689075630254, + "loss": 0.5684, + "step": 29130 + }, + { + "epoch": 16.274301675977654, + "grad_norm": 0.46925756335258484, + "learning_rate": 0.00018719887955182075, + "loss": 0.5322, + "step": 29131 + }, + { + "epoch": 16.27486033519553, + "grad_norm": 0.8536108136177063, + "learning_rate": 0.00018717086834733892, + "loss": 0.3429, + "step": 29132 + }, + { + "epoch": 16.275418994413407, + "grad_norm": 0.42519521713256836, + "learning_rate": 0.00018714285714285713, + "loss": 0.3869, + "step": 29133 + }, + { + "epoch": 16.275977653631283, + "grad_norm": 0.5542572140693665, + "learning_rate": 0.00018711484593837536, + "loss": 0.5038, + "step": 29134 + }, + { + "epoch": 16.276536312849164, + "grad_norm": 0.4148680567741394, + "learning_rate": 0.00018708683473389357, + "loss": 0.4158, + "step": 29135 + }, + { + "epoch": 16.27709497206704, + "grad_norm": 0.39515072107315063, + "learning_rate": 0.00018705882352941178, + "loss": 0.3911, + "step": 29136 + }, + { + "epoch": 16.277653631284917, + "grad_norm": 0.637635350227356, + "learning_rate": 0.00018703081232492995, + "loss": 0.3914, + "step": 29137 + }, + { + "epoch": 16.278212290502793, + "grad_norm": 0.5663643479347229, + "learning_rate": 0.0001870028011204482, + "loss": 0.3522, + "step": 29138 + }, + { + "epoch": 16.27877094972067, + "grad_norm": 0.47910791635513306, + "learning_rate": 0.0001869747899159664, + "loss": 0.412, + "step": 29139 + }, + { + "epoch": 16.279329608938546, + "grad_norm": 0.3700598478317261, + "learning_rate": 0.0001869467787114846, + "loss": 0.4322, + "step": 29140 + }, + { + "epoch": 16.279888268156423, + "grad_norm": 0.4291694760322571, + "learning_rate": 0.0001869187675070028, + "loss": 0.5087, + "step": 29141 + }, + { + "epoch": 16.280446927374303, + "grad_norm": 0.4926217198371887, + "learning_rate": 0.000186890756302521, + "loss": 0.3972, + "step": 29142 + }, + { + "epoch": 16.28100558659218, + "grad_norm": 0.39969736337661743, + "learning_rate": 0.00018686274509803922, + "loss": 0.4101, + "step": 29143 + }, + { + "epoch": 16.281564245810056, + "grad_norm": 0.3326007127761841, + "learning_rate": 0.00018683473389355742, + "loss": 0.3498, + "step": 29144 + }, + { + "epoch": 16.282122905027933, + "grad_norm": 0.43563219904899597, + "learning_rate": 0.00018680672268907563, + "loss": 0.388, + "step": 29145 + }, + { + "epoch": 16.28268156424581, + "grad_norm": 0.3399113118648529, + "learning_rate": 0.00018677871148459386, + "loss": 0.3723, + "step": 29146 + }, + { + "epoch": 16.283240223463686, + "grad_norm": 0.5271950960159302, + "learning_rate": 0.00018675070028011204, + "loss": 0.3757, + "step": 29147 + }, + { + "epoch": 16.283798882681563, + "grad_norm": 0.4606555998325348, + "learning_rate": 0.00018672268907563025, + "loss": 0.4286, + "step": 29148 + }, + { + "epoch": 16.284357541899443, + "grad_norm": 0.49080440402030945, + "learning_rate": 0.00018669467787114845, + "loss": 0.3785, + "step": 29149 + }, + { + "epoch": 16.28491620111732, + "grad_norm": 0.8866600394248962, + "learning_rate": 0.0001866666666666667, + "loss": 0.4363, + "step": 29150 + }, + { + "epoch": 16.285474860335196, + "grad_norm": 0.8600183725357056, + "learning_rate": 0.00018663865546218487, + "loss": 0.4588, + "step": 29151 + }, + { + "epoch": 16.286033519553072, + "grad_norm": 0.3469720780849457, + "learning_rate": 0.00018661064425770307, + "loss": 0.421, + "step": 29152 + }, + { + "epoch": 16.28659217877095, + "grad_norm": 0.44809409976005554, + "learning_rate": 0.00018658263305322128, + "loss": 0.3602, + "step": 29153 + }, + { + "epoch": 16.287150837988825, + "grad_norm": 1.6205464601516724, + "learning_rate": 0.0001865546218487395, + "loss": 0.5199, + "step": 29154 + }, + { + "epoch": 16.287709497206706, + "grad_norm": 0.6207685470581055, + "learning_rate": 0.00018652661064425772, + "loss": 0.3592, + "step": 29155 + }, + { + "epoch": 16.288268156424582, + "grad_norm": 0.4336671531200409, + "learning_rate": 0.0001864985994397759, + "loss": 0.3146, + "step": 29156 + }, + { + "epoch": 16.28882681564246, + "grad_norm": 0.504715085029602, + "learning_rate": 0.0001864705882352941, + "loss": 0.4937, + "step": 29157 + }, + { + "epoch": 16.289385474860335, + "grad_norm": 0.930429995059967, + "learning_rate": 0.00018644257703081233, + "loss": 0.4622, + "step": 29158 + }, + { + "epoch": 16.289944134078212, + "grad_norm": 0.38197097182273865, + "learning_rate": 0.00018641456582633054, + "loss": 0.4488, + "step": 29159 + }, + { + "epoch": 16.29050279329609, + "grad_norm": 0.36801236867904663, + "learning_rate": 0.00018638655462184875, + "loss": 0.3556, + "step": 29160 + }, + { + "epoch": 16.291061452513965, + "grad_norm": 0.38204339146614075, + "learning_rate": 0.00018635854341736693, + "loss": 0.3351, + "step": 29161 + }, + { + "epoch": 16.291620111731845, + "grad_norm": 0.47354158759117126, + "learning_rate": 0.00018633053221288516, + "loss": 0.4342, + "step": 29162 + }, + { + "epoch": 16.29217877094972, + "grad_norm": 0.5487963557243347, + "learning_rate": 0.00018630252100840336, + "loss": 0.535, + "step": 29163 + }, + { + "epoch": 16.2927374301676, + "grad_norm": 0.4645974636077881, + "learning_rate": 0.00018627450980392157, + "loss": 0.3823, + "step": 29164 + }, + { + "epoch": 16.293296089385475, + "grad_norm": 0.6940818428993225, + "learning_rate": 0.00018624649859943978, + "loss": 0.4547, + "step": 29165 + }, + { + "epoch": 16.29385474860335, + "grad_norm": 0.3313988745212555, + "learning_rate": 0.00018621848739495798, + "loss": 0.3685, + "step": 29166 + }, + { + "epoch": 16.294413407821228, + "grad_norm": 0.542900025844574, + "learning_rate": 0.0001861904761904762, + "loss": 0.3499, + "step": 29167 + }, + { + "epoch": 16.294972067039105, + "grad_norm": 0.3856269419193268, + "learning_rate": 0.0001861624649859944, + "loss": 0.4374, + "step": 29168 + }, + { + "epoch": 16.295530726256985, + "grad_norm": 0.5980980396270752, + "learning_rate": 0.0001861344537815126, + "loss": 0.5256, + "step": 29169 + }, + { + "epoch": 16.29608938547486, + "grad_norm": 0.4706897437572479, + "learning_rate": 0.00018610644257703083, + "loss": 0.4057, + "step": 29170 + }, + { + "epoch": 16.296648044692738, + "grad_norm": 0.3908962309360504, + "learning_rate": 0.000186078431372549, + "loss": 0.3912, + "step": 29171 + }, + { + "epoch": 16.297206703910614, + "grad_norm": 5.841452598571777, + "learning_rate": 0.00018605042016806722, + "loss": 0.5153, + "step": 29172 + }, + { + "epoch": 16.29776536312849, + "grad_norm": 0.48757871985435486, + "learning_rate": 0.00018602240896358542, + "loss": 0.6541, + "step": 29173 + }, + { + "epoch": 16.298324022346367, + "grad_norm": 0.7620923519134521, + "learning_rate": 0.00018599439775910366, + "loss": 0.3578, + "step": 29174 + }, + { + "epoch": 16.298882681564244, + "grad_norm": 0.33920931816101074, + "learning_rate": 0.00018596638655462186, + "loss": 0.272, + "step": 29175 + }, + { + "epoch": 16.299441340782124, + "grad_norm": 1.5870195627212524, + "learning_rate": 0.00018593837535014004, + "loss": 0.3656, + "step": 29176 + }, + { + "epoch": 16.3, + "grad_norm": 0.9805966019630432, + "learning_rate": 0.00018591036414565825, + "loss": 0.3682, + "step": 29177 + }, + { + "epoch": 16.300558659217877, + "grad_norm": 0.379983127117157, + "learning_rate": 0.00018588235294117648, + "loss": 0.3775, + "step": 29178 + }, + { + "epoch": 16.301117318435754, + "grad_norm": 0.6545946002006531, + "learning_rate": 0.0001858543417366947, + "loss": 0.541, + "step": 29179 + }, + { + "epoch": 16.30167597765363, + "grad_norm": 0.37663015723228455, + "learning_rate": 0.0001858263305322129, + "loss": 0.3774, + "step": 29180 + }, + { + "epoch": 16.302234636871507, + "grad_norm": 0.3958780765533447, + "learning_rate": 0.00018579831932773107, + "loss": 0.4102, + "step": 29181 + }, + { + "epoch": 16.302793296089387, + "grad_norm": 0.5670944452285767, + "learning_rate": 0.0001857703081232493, + "loss": 0.3944, + "step": 29182 + }, + { + "epoch": 16.303351955307264, + "grad_norm": 0.4776594042778015, + "learning_rate": 0.0001857422969187675, + "loss": 0.3873, + "step": 29183 + }, + { + "epoch": 16.30391061452514, + "grad_norm": 0.4428180754184723, + "learning_rate": 0.00018571428571428572, + "loss": 0.4654, + "step": 29184 + }, + { + "epoch": 16.304469273743017, + "grad_norm": 0.5051175355911255, + "learning_rate": 0.00018568627450980395, + "loss": 0.4051, + "step": 29185 + }, + { + "epoch": 16.305027932960893, + "grad_norm": 0.39730992913246155, + "learning_rate": 0.00018565826330532213, + "loss": 0.3407, + "step": 29186 + }, + { + "epoch": 16.30558659217877, + "grad_norm": 0.4471661448478699, + "learning_rate": 0.00018563025210084034, + "loss": 0.4246, + "step": 29187 + }, + { + "epoch": 16.306145251396647, + "grad_norm": 0.6260039210319519, + "learning_rate": 0.00018560224089635854, + "loss": 0.4354, + "step": 29188 + }, + { + "epoch": 16.306703910614527, + "grad_norm": 0.4061700403690338, + "learning_rate": 0.00018557422969187677, + "loss": 0.3366, + "step": 29189 + }, + { + "epoch": 16.307262569832403, + "grad_norm": 0.8906798362731934, + "learning_rate": 0.00018554621848739498, + "loss": 0.465, + "step": 29190 + }, + { + "epoch": 16.30782122905028, + "grad_norm": 0.3904813826084137, + "learning_rate": 0.00018551820728291316, + "loss": 0.4831, + "step": 29191 + }, + { + "epoch": 16.308379888268156, + "grad_norm": 0.3188614547252655, + "learning_rate": 0.00018549019607843137, + "loss": 0.378, + "step": 29192 + }, + { + "epoch": 16.308938547486033, + "grad_norm": 0.4340461790561676, + "learning_rate": 0.0001854621848739496, + "loss": 0.3103, + "step": 29193 + }, + { + "epoch": 16.30949720670391, + "grad_norm": 0.5216889977455139, + "learning_rate": 0.0001854341736694678, + "loss": 0.4056, + "step": 29194 + }, + { + "epoch": 16.310055865921786, + "grad_norm": 0.40086278319358826, + "learning_rate": 0.000185406162464986, + "loss": 0.3585, + "step": 29195 + }, + { + "epoch": 16.310614525139666, + "grad_norm": 0.42081013321876526, + "learning_rate": 0.0001853781512605042, + "loss": 0.3651, + "step": 29196 + }, + { + "epoch": 16.311173184357543, + "grad_norm": 0.4767129123210907, + "learning_rate": 0.00018535014005602242, + "loss": 0.3725, + "step": 29197 + }, + { + "epoch": 16.31173184357542, + "grad_norm": 0.5047635436058044, + "learning_rate": 0.00018532212885154063, + "loss": 0.4219, + "step": 29198 + }, + { + "epoch": 16.312290502793296, + "grad_norm": 0.3997997045516968, + "learning_rate": 0.00018529411764705883, + "loss": 0.4053, + "step": 29199 + }, + { + "epoch": 16.312849162011172, + "grad_norm": 0.4263671040534973, + "learning_rate": 0.00018526610644257704, + "loss": 0.3475, + "step": 29200 + }, + { + "epoch": 16.31340782122905, + "grad_norm": 0.4201339781284332, + "learning_rate": 0.00018523809523809525, + "loss": 0.3649, + "step": 29201 + }, + { + "epoch": 16.31396648044693, + "grad_norm": 0.4680660665035248, + "learning_rate": 0.00018521008403361345, + "loss": 0.3479, + "step": 29202 + }, + { + "epoch": 16.314525139664806, + "grad_norm": 0.4918220043182373, + "learning_rate": 0.00018518207282913166, + "loss": 0.4292, + "step": 29203 + }, + { + "epoch": 16.315083798882682, + "grad_norm": 0.6135198473930359, + "learning_rate": 0.00018515406162464986, + "loss": 0.4139, + "step": 29204 + }, + { + "epoch": 16.31564245810056, + "grad_norm": 0.4507567882537842, + "learning_rate": 0.00018512605042016807, + "loss": 0.3462, + "step": 29205 + }, + { + "epoch": 16.316201117318435, + "grad_norm": 0.33337104320526123, + "learning_rate": 0.00018509803921568628, + "loss": 0.3737, + "step": 29206 + }, + { + "epoch": 16.316759776536312, + "grad_norm": 0.4143950641155243, + "learning_rate": 0.00018507002801120448, + "loss": 0.3631, + "step": 29207 + }, + { + "epoch": 16.31731843575419, + "grad_norm": 0.39553341269493103, + "learning_rate": 0.0001850420168067227, + "loss": 0.3884, + "step": 29208 + }, + { + "epoch": 16.31787709497207, + "grad_norm": 0.4273497462272644, + "learning_rate": 0.00018501400560224092, + "loss": 0.4903, + "step": 29209 + }, + { + "epoch": 16.318435754189945, + "grad_norm": 0.8866323232650757, + "learning_rate": 0.0001849859943977591, + "loss": 0.5057, + "step": 29210 + }, + { + "epoch": 16.31899441340782, + "grad_norm": 0.5312885642051697, + "learning_rate": 0.0001849579831932773, + "loss": 0.4901, + "step": 29211 + }, + { + "epoch": 16.3195530726257, + "grad_norm": 0.3801111876964569, + "learning_rate": 0.0001849299719887955, + "loss": 0.3127, + "step": 29212 + }, + { + "epoch": 16.320111731843575, + "grad_norm": 0.42810311913490295, + "learning_rate": 0.00018490196078431375, + "loss": 0.5119, + "step": 29213 + }, + { + "epoch": 16.32067039106145, + "grad_norm": 0.6221145987510681, + "learning_rate": 0.00018487394957983195, + "loss": 0.3868, + "step": 29214 + }, + { + "epoch": 16.321229050279328, + "grad_norm": 2.6309096813201904, + "learning_rate": 0.00018484593837535013, + "loss": 0.4351, + "step": 29215 + }, + { + "epoch": 16.321787709497208, + "grad_norm": 0.46629124879837036, + "learning_rate": 0.00018481792717086834, + "loss": 0.4432, + "step": 29216 + }, + { + "epoch": 16.322346368715085, + "grad_norm": 0.4800971746444702, + "learning_rate": 0.00018478991596638657, + "loss": 0.3808, + "step": 29217 + }, + { + "epoch": 16.32290502793296, + "grad_norm": 0.45208385586738586, + "learning_rate": 0.00018476190476190478, + "loss": 0.3619, + "step": 29218 + }, + { + "epoch": 16.323463687150838, + "grad_norm": 0.36703911423683167, + "learning_rate": 0.00018473389355742298, + "loss": 0.3848, + "step": 29219 + }, + { + "epoch": 16.324022346368714, + "grad_norm": 0.38982218503952026, + "learning_rate": 0.00018470588235294116, + "loss": 0.4822, + "step": 29220 + }, + { + "epoch": 16.32458100558659, + "grad_norm": 0.3825012147426605, + "learning_rate": 0.0001846778711484594, + "loss": 0.3929, + "step": 29221 + }, + { + "epoch": 16.325139664804468, + "grad_norm": 0.526433527469635, + "learning_rate": 0.0001846498599439776, + "loss": 0.4149, + "step": 29222 + }, + { + "epoch": 16.325698324022348, + "grad_norm": 0.3424449563026428, + "learning_rate": 0.0001846218487394958, + "loss": 0.5157, + "step": 29223 + }, + { + "epoch": 16.326256983240224, + "grad_norm": 0.397943913936615, + "learning_rate": 0.000184593837535014, + "loss": 0.4104, + "step": 29224 + }, + { + "epoch": 16.3268156424581, + "grad_norm": 0.5023960471153259, + "learning_rate": 0.00018456582633053222, + "loss": 0.3657, + "step": 29225 + }, + { + "epoch": 16.327374301675977, + "grad_norm": 3.254836082458496, + "learning_rate": 0.00018453781512605042, + "loss": 0.4406, + "step": 29226 + }, + { + "epoch": 16.327932960893854, + "grad_norm": 1.1098432540893555, + "learning_rate": 0.00018450980392156863, + "loss": 0.6464, + "step": 29227 + }, + { + "epoch": 16.32849162011173, + "grad_norm": 0.46687057614326477, + "learning_rate": 0.00018448179271708684, + "loss": 0.4494, + "step": 29228 + }, + { + "epoch": 16.32905027932961, + "grad_norm": 0.5460361838340759, + "learning_rate": 0.00018445378151260507, + "loss": 0.4602, + "step": 29229 + }, + { + "epoch": 16.329608938547487, + "grad_norm": 0.44901618361473083, + "learning_rate": 0.00018442577030812325, + "loss": 0.3199, + "step": 29230 + }, + { + "epoch": 16.330167597765364, + "grad_norm": 0.40359994769096375, + "learning_rate": 0.00018439775910364145, + "loss": 0.3447, + "step": 29231 + }, + { + "epoch": 16.33072625698324, + "grad_norm": 3.2685322761535645, + "learning_rate": 0.00018436974789915966, + "loss": 0.44, + "step": 29232 + }, + { + "epoch": 16.331284916201117, + "grad_norm": 0.3578014671802521, + "learning_rate": 0.0001843417366946779, + "loss": 0.3838, + "step": 29233 + }, + { + "epoch": 16.331843575418993, + "grad_norm": 0.5542742609977722, + "learning_rate": 0.0001843137254901961, + "loss": 0.4238, + "step": 29234 + }, + { + "epoch": 16.33240223463687, + "grad_norm": 0.3870325982570648, + "learning_rate": 0.00018428571428571428, + "loss": 0.4147, + "step": 29235 + }, + { + "epoch": 16.33296089385475, + "grad_norm": 0.563289225101471, + "learning_rate": 0.00018425770308123248, + "loss": 0.4619, + "step": 29236 + }, + { + "epoch": 16.333519553072627, + "grad_norm": 0.42293792963027954, + "learning_rate": 0.00018422969187675072, + "loss": 0.3667, + "step": 29237 + }, + { + "epoch": 16.334078212290503, + "grad_norm": 0.906248927116394, + "learning_rate": 0.00018420168067226892, + "loss": 0.4891, + "step": 29238 + }, + { + "epoch": 16.33463687150838, + "grad_norm": 0.4047188460826874, + "learning_rate": 0.00018417366946778713, + "loss": 0.3169, + "step": 29239 + }, + { + "epoch": 16.335195530726256, + "grad_norm": 0.6282626390457153, + "learning_rate": 0.0001841456582633053, + "loss": 0.3237, + "step": 29240 + }, + { + "epoch": 16.335754189944133, + "grad_norm": 0.6191372275352478, + "learning_rate": 0.00018411764705882354, + "loss": 0.3981, + "step": 29241 + }, + { + "epoch": 16.33631284916201, + "grad_norm": 0.9407033920288086, + "learning_rate": 0.00018408963585434175, + "loss": 0.4274, + "step": 29242 + }, + { + "epoch": 16.33687150837989, + "grad_norm": 0.3497813940048218, + "learning_rate": 0.00018406162464985995, + "loss": 0.4113, + "step": 29243 + }, + { + "epoch": 16.337430167597766, + "grad_norm": 0.4571531116962433, + "learning_rate": 0.00018403361344537816, + "loss": 0.3937, + "step": 29244 + }, + { + "epoch": 16.337988826815643, + "grad_norm": 0.4238032102584839, + "learning_rate": 0.00018400560224089636, + "loss": 0.3561, + "step": 29245 + }, + { + "epoch": 16.33854748603352, + "grad_norm": 0.6335416436195374, + "learning_rate": 0.00018397759103641457, + "loss": 0.55, + "step": 29246 + }, + { + "epoch": 16.339106145251396, + "grad_norm": 1.5848215818405151, + "learning_rate": 0.00018394957983193278, + "loss": 0.4107, + "step": 29247 + }, + { + "epoch": 16.339664804469272, + "grad_norm": 0.47834599018096924, + "learning_rate": 0.00018392156862745098, + "loss": 0.3934, + "step": 29248 + }, + { + "epoch": 16.340223463687153, + "grad_norm": 3.7983195781707764, + "learning_rate": 0.00018389355742296922, + "loss": 0.5218, + "step": 29249 + }, + { + "epoch": 16.34078212290503, + "grad_norm": 0.4493592381477356, + "learning_rate": 0.0001838655462184874, + "loss": 0.5024, + "step": 29250 + }, + { + "epoch": 16.341340782122906, + "grad_norm": 0.41889163851737976, + "learning_rate": 0.0001838375350140056, + "loss": 0.3673, + "step": 29251 + }, + { + "epoch": 16.341899441340782, + "grad_norm": 0.42403653264045715, + "learning_rate": 0.0001838095238095238, + "loss": 0.3908, + "step": 29252 + }, + { + "epoch": 16.34245810055866, + "grad_norm": 0.44208356738090515, + "learning_rate": 0.00018378151260504204, + "loss": 0.4522, + "step": 29253 + }, + { + "epoch": 16.343016759776535, + "grad_norm": 0.5989401340484619, + "learning_rate": 0.00018375350140056025, + "loss": 0.38, + "step": 29254 + }, + { + "epoch": 16.343575418994412, + "grad_norm": 0.31500527262687683, + "learning_rate": 0.00018372549019607842, + "loss": 0.3827, + "step": 29255 + }, + { + "epoch": 16.344134078212292, + "grad_norm": 1.2640317678451538, + "learning_rate": 0.00018369747899159663, + "loss": 0.4623, + "step": 29256 + }, + { + "epoch": 16.34469273743017, + "grad_norm": 0.45024389028549194, + "learning_rate": 0.00018366946778711486, + "loss": 0.4668, + "step": 29257 + }, + { + "epoch": 16.345251396648045, + "grad_norm": 0.34603625535964966, + "learning_rate": 0.00018364145658263307, + "loss": 0.4083, + "step": 29258 + }, + { + "epoch": 16.345810055865922, + "grad_norm": 1.5504465103149414, + "learning_rate": 0.00018361344537815128, + "loss": 0.4711, + "step": 29259 + }, + { + "epoch": 16.3463687150838, + "grad_norm": 0.3613249957561493, + "learning_rate": 0.00018358543417366945, + "loss": 0.3456, + "step": 29260 + }, + { + "epoch": 16.346927374301675, + "grad_norm": 0.6513382196426392, + "learning_rate": 0.0001835574229691877, + "loss": 0.4329, + "step": 29261 + }, + { + "epoch": 16.34748603351955, + "grad_norm": 0.35389411449432373, + "learning_rate": 0.0001835294117647059, + "loss": 0.3403, + "step": 29262 + }, + { + "epoch": 16.34804469273743, + "grad_norm": 0.6265043020248413, + "learning_rate": 0.0001835014005602241, + "loss": 0.6289, + "step": 29263 + }, + { + "epoch": 16.34860335195531, + "grad_norm": 0.5129328370094299, + "learning_rate": 0.00018347338935574228, + "loss": 0.4474, + "step": 29264 + }, + { + "epoch": 16.349162011173185, + "grad_norm": 0.5313289761543274, + "learning_rate": 0.0001834453781512605, + "loss": 0.3894, + "step": 29265 + }, + { + "epoch": 16.34972067039106, + "grad_norm": 0.44400593638420105, + "learning_rate": 0.00018341736694677872, + "loss": 0.3965, + "step": 29266 + }, + { + "epoch": 16.350279329608938, + "grad_norm": 0.5046951770782471, + "learning_rate": 0.00018338935574229692, + "loss": 0.51, + "step": 29267 + }, + { + "epoch": 16.350837988826814, + "grad_norm": 0.38331663608551025, + "learning_rate": 0.00018336134453781513, + "loss": 0.3484, + "step": 29268 + }, + { + "epoch": 16.35139664804469, + "grad_norm": 0.38684171438217163, + "learning_rate": 0.00018333333333333334, + "loss": 0.4309, + "step": 29269 + }, + { + "epoch": 16.35195530726257, + "grad_norm": 0.2986755073070526, + "learning_rate": 0.00018330532212885154, + "loss": 0.2516, + "step": 29270 + }, + { + "epoch": 16.352513966480448, + "grad_norm": 0.38292479515075684, + "learning_rate": 0.00018327731092436975, + "loss": 0.4192, + "step": 29271 + }, + { + "epoch": 16.353072625698324, + "grad_norm": 0.428937703371048, + "learning_rate": 0.00018324929971988795, + "loss": 0.3361, + "step": 29272 + }, + { + "epoch": 16.3536312849162, + "grad_norm": 1.4777461290359497, + "learning_rate": 0.00018322128851540619, + "loss": 0.4556, + "step": 29273 + }, + { + "epoch": 16.354189944134077, + "grad_norm": 0.3609444200992584, + "learning_rate": 0.00018319327731092437, + "loss": 0.3869, + "step": 29274 + }, + { + "epoch": 16.354748603351954, + "grad_norm": 0.29203441739082336, + "learning_rate": 0.00018316526610644257, + "loss": 0.3545, + "step": 29275 + }, + { + "epoch": 16.355307262569834, + "grad_norm": 0.5496408343315125, + "learning_rate": 0.00018313725490196078, + "loss": 0.42, + "step": 29276 + }, + { + "epoch": 16.35586592178771, + "grad_norm": 0.42638736963272095, + "learning_rate": 0.000183109243697479, + "loss": 0.4328, + "step": 29277 + }, + { + "epoch": 16.356424581005587, + "grad_norm": 0.44181400537490845, + "learning_rate": 0.00018308123249299722, + "loss": 0.3923, + "step": 29278 + }, + { + "epoch": 16.356983240223464, + "grad_norm": 0.48514965176582336, + "learning_rate": 0.0001830532212885154, + "loss": 0.4978, + "step": 29279 + }, + { + "epoch": 16.35754189944134, + "grad_norm": 0.5489807724952698, + "learning_rate": 0.0001830252100840336, + "loss": 0.452, + "step": 29280 + }, + { + "epoch": 16.358100558659217, + "grad_norm": 0.4797283411026001, + "learning_rate": 0.00018299719887955183, + "loss": 0.4269, + "step": 29281 + }, + { + "epoch": 16.358659217877094, + "grad_norm": 1.541698694229126, + "learning_rate": 0.00018296918767507004, + "loss": 0.6974, + "step": 29282 + }, + { + "epoch": 16.359217877094974, + "grad_norm": 0.4843508005142212, + "learning_rate": 0.00018294117647058825, + "loss": 0.4573, + "step": 29283 + }, + { + "epoch": 16.35977653631285, + "grad_norm": 0.6121612787246704, + "learning_rate": 0.00018291316526610643, + "loss": 0.4276, + "step": 29284 + }, + { + "epoch": 16.360335195530727, + "grad_norm": 0.4368060231208801, + "learning_rate": 0.00018288515406162466, + "loss": 0.4165, + "step": 29285 + }, + { + "epoch": 16.360893854748603, + "grad_norm": 0.4716063141822815, + "learning_rate": 0.00018285714285714286, + "loss": 0.4793, + "step": 29286 + }, + { + "epoch": 16.36145251396648, + "grad_norm": 0.4299421012401581, + "learning_rate": 0.00018282913165266107, + "loss": 0.4326, + "step": 29287 + }, + { + "epoch": 16.362011173184356, + "grad_norm": 0.40455934405326843, + "learning_rate": 0.00018280112044817928, + "loss": 0.3515, + "step": 29288 + }, + { + "epoch": 16.362569832402233, + "grad_norm": 0.5767871141433716, + "learning_rate": 0.00018277310924369748, + "loss": 0.4018, + "step": 29289 + }, + { + "epoch": 16.363128491620113, + "grad_norm": 0.37597858905792236, + "learning_rate": 0.0001827450980392157, + "loss": 0.3562, + "step": 29290 + }, + { + "epoch": 16.36368715083799, + "grad_norm": 2.1146843433380127, + "learning_rate": 0.0001827170868347339, + "loss": 0.4456, + "step": 29291 + }, + { + "epoch": 16.364245810055866, + "grad_norm": 0.4962692856788635, + "learning_rate": 0.0001826890756302521, + "loss": 0.4764, + "step": 29292 + }, + { + "epoch": 16.364804469273743, + "grad_norm": 0.314089298248291, + "learning_rate": 0.00018266106442577033, + "loss": 0.3385, + "step": 29293 + }, + { + "epoch": 16.36536312849162, + "grad_norm": 0.6499539017677307, + "learning_rate": 0.0001826330532212885, + "loss": 0.3931, + "step": 29294 + }, + { + "epoch": 16.365921787709496, + "grad_norm": 0.43791627883911133, + "learning_rate": 0.00018260504201680672, + "loss": 0.4438, + "step": 29295 + }, + { + "epoch": 16.366480446927373, + "grad_norm": 0.3982679843902588, + "learning_rate": 0.00018257703081232492, + "loss": 0.3195, + "step": 29296 + }, + { + "epoch": 16.367039106145253, + "grad_norm": 2.6786413192749023, + "learning_rate": 0.00018254901960784316, + "loss": 0.3847, + "step": 29297 + }, + { + "epoch": 16.36759776536313, + "grad_norm": 0.6924843192100525, + "learning_rate": 0.00018252100840336136, + "loss": 0.4085, + "step": 29298 + }, + { + "epoch": 16.368156424581006, + "grad_norm": 0.5009807348251343, + "learning_rate": 0.00018249299719887954, + "loss": 0.4202, + "step": 29299 + }, + { + "epoch": 16.368715083798882, + "grad_norm": 0.4068930149078369, + "learning_rate": 0.00018246498599439775, + "loss": 0.3709, + "step": 29300 + }, + { + "epoch": 16.36927374301676, + "grad_norm": 0.414574533700943, + "learning_rate": 0.00018243697478991598, + "loss": 0.5027, + "step": 29301 + }, + { + "epoch": 16.369832402234636, + "grad_norm": 0.38836005330085754, + "learning_rate": 0.0001824089635854342, + "loss": 0.2532, + "step": 29302 + }, + { + "epoch": 16.370391061452516, + "grad_norm": 0.36547163128852844, + "learning_rate": 0.0001823809523809524, + "loss": 0.3456, + "step": 29303 + }, + { + "epoch": 16.370949720670392, + "grad_norm": 1.076255202293396, + "learning_rate": 0.00018235294117647057, + "loss": 0.639, + "step": 29304 + }, + { + "epoch": 16.37150837988827, + "grad_norm": 1.2171363830566406, + "learning_rate": 0.0001823249299719888, + "loss": 0.3471, + "step": 29305 + }, + { + "epoch": 16.372067039106145, + "grad_norm": 0.3890760540962219, + "learning_rate": 0.000182296918767507, + "loss": 0.4504, + "step": 29306 + }, + { + "epoch": 16.372625698324022, + "grad_norm": 0.4601249694824219, + "learning_rate": 0.00018226890756302522, + "loss": 0.4602, + "step": 29307 + }, + { + "epoch": 16.3731843575419, + "grad_norm": 0.37602803111076355, + "learning_rate": 0.00018224089635854342, + "loss": 0.414, + "step": 29308 + }, + { + "epoch": 16.373743016759775, + "grad_norm": 0.3871692419052124, + "learning_rate": 0.00018221288515406163, + "loss": 0.3911, + "step": 29309 + }, + { + "epoch": 16.374301675977655, + "grad_norm": 0.681891143321991, + "learning_rate": 0.00018218487394957984, + "loss": 0.5098, + "step": 29310 + }, + { + "epoch": 16.37486033519553, + "grad_norm": 0.45180290937423706, + "learning_rate": 0.00018215686274509804, + "loss": 0.4606, + "step": 29311 + }, + { + "epoch": 16.37541899441341, + "grad_norm": 0.31876417994499207, + "learning_rate": 0.00018212885154061625, + "loss": 0.4339, + "step": 29312 + }, + { + "epoch": 16.375977653631285, + "grad_norm": 0.5003113150596619, + "learning_rate": 0.00018210084033613448, + "loss": 0.4175, + "step": 29313 + }, + { + "epoch": 16.37653631284916, + "grad_norm": 0.3924187421798706, + "learning_rate": 0.00018207282913165266, + "loss": 0.3779, + "step": 29314 + }, + { + "epoch": 16.377094972067038, + "grad_norm": 1.0952647924423218, + "learning_rate": 0.00018204481792717087, + "loss": 0.3918, + "step": 29315 + }, + { + "epoch": 16.377653631284915, + "grad_norm": 0.36226415634155273, + "learning_rate": 0.00018201680672268907, + "loss": 0.354, + "step": 29316 + }, + { + "epoch": 16.378212290502795, + "grad_norm": 0.7102446556091309, + "learning_rate": 0.0001819887955182073, + "loss": 0.3695, + "step": 29317 + }, + { + "epoch": 16.37877094972067, + "grad_norm": 0.4620143175125122, + "learning_rate": 0.00018196078431372548, + "loss": 0.4195, + "step": 29318 + }, + { + "epoch": 16.379329608938548, + "grad_norm": 0.5799338817596436, + "learning_rate": 0.0001819327731092437, + "loss": 0.3947, + "step": 29319 + }, + { + "epoch": 16.379888268156424, + "grad_norm": 0.6911343932151794, + "learning_rate": 0.0001819047619047619, + "loss": 0.3326, + "step": 29320 + }, + { + "epoch": 16.3804469273743, + "grad_norm": 0.547643780708313, + "learning_rate": 0.00018187675070028013, + "loss": 0.5668, + "step": 29321 + }, + { + "epoch": 16.381005586592178, + "grad_norm": 0.6426225304603577, + "learning_rate": 0.00018184873949579833, + "loss": 0.4453, + "step": 29322 + }, + { + "epoch": 16.381564245810054, + "grad_norm": 0.501176118850708, + "learning_rate": 0.0001818207282913165, + "loss": 0.3989, + "step": 29323 + }, + { + "epoch": 16.382122905027934, + "grad_norm": 0.7337033748626709, + "learning_rate": 0.00018179271708683472, + "loss": 0.4347, + "step": 29324 + }, + { + "epoch": 16.38268156424581, + "grad_norm": 0.3888240158557892, + "learning_rate": 0.00018176470588235295, + "loss": 0.3696, + "step": 29325 + }, + { + "epoch": 16.383240223463687, + "grad_norm": 2.0745716094970703, + "learning_rate": 0.00018173669467787116, + "loss": 0.4853, + "step": 29326 + }, + { + "epoch": 16.383798882681564, + "grad_norm": 1.0249732732772827, + "learning_rate": 0.00018170868347338936, + "loss": 0.3249, + "step": 29327 + }, + { + "epoch": 16.38435754189944, + "grad_norm": 0.35450994968414307, + "learning_rate": 0.00018168067226890754, + "loss": 0.3556, + "step": 29328 + }, + { + "epoch": 16.384916201117317, + "grad_norm": 0.8636338710784912, + "learning_rate": 0.00018165266106442578, + "loss": 0.3969, + "step": 29329 + }, + { + "epoch": 16.385474860335197, + "grad_norm": 0.39495307207107544, + "learning_rate": 0.00018162464985994398, + "loss": 0.3413, + "step": 29330 + }, + { + "epoch": 16.386033519553074, + "grad_norm": 0.8098691701889038, + "learning_rate": 0.0001815966386554622, + "loss": 0.3852, + "step": 29331 + }, + { + "epoch": 16.38659217877095, + "grad_norm": 0.47679540514945984, + "learning_rate": 0.0001815686274509804, + "loss": 0.3945, + "step": 29332 + }, + { + "epoch": 16.387150837988827, + "grad_norm": 1.0485734939575195, + "learning_rate": 0.0001815406162464986, + "loss": 0.3792, + "step": 29333 + }, + { + "epoch": 16.387709497206703, + "grad_norm": 0.5419967770576477, + "learning_rate": 0.0001815126050420168, + "loss": 0.4015, + "step": 29334 + }, + { + "epoch": 16.38826815642458, + "grad_norm": 0.38929253816604614, + "learning_rate": 0.000181484593837535, + "loss": 0.343, + "step": 29335 + }, + { + "epoch": 16.388826815642457, + "grad_norm": 0.41058382391929626, + "learning_rate": 0.00018145658263305322, + "loss": 0.4496, + "step": 29336 + }, + { + "epoch": 16.389385474860337, + "grad_norm": 0.7400023341178894, + "learning_rate": 0.00018142857142857145, + "loss": 0.3882, + "step": 29337 + }, + { + "epoch": 16.389944134078213, + "grad_norm": 0.44622188806533813, + "learning_rate": 0.00018140056022408963, + "loss": 0.4985, + "step": 29338 + }, + { + "epoch": 16.39050279329609, + "grad_norm": 0.43652012944221497, + "learning_rate": 0.00018137254901960784, + "loss": 0.3915, + "step": 29339 + }, + { + "epoch": 16.391061452513966, + "grad_norm": 0.4128139913082123, + "learning_rate": 0.00018134453781512604, + "loss": 0.363, + "step": 29340 + }, + { + "epoch": 16.391620111731843, + "grad_norm": 0.33831775188446045, + "learning_rate": 0.00018131652661064428, + "loss": 0.382, + "step": 29341 + }, + { + "epoch": 16.39217877094972, + "grad_norm": 0.5503801107406616, + "learning_rate": 0.00018128851540616248, + "loss": 0.3605, + "step": 29342 + }, + { + "epoch": 16.392737430167596, + "grad_norm": 0.391715943813324, + "learning_rate": 0.00018126050420168066, + "loss": 0.4508, + "step": 29343 + }, + { + "epoch": 16.393296089385476, + "grad_norm": 0.37690484523773193, + "learning_rate": 0.00018123249299719887, + "loss": 0.3141, + "step": 29344 + }, + { + "epoch": 16.393854748603353, + "grad_norm": 0.46313005685806274, + "learning_rate": 0.0001812044817927171, + "loss": 0.365, + "step": 29345 + }, + { + "epoch": 16.39441340782123, + "grad_norm": 0.3934359848499298, + "learning_rate": 0.0001811764705882353, + "loss": 0.3592, + "step": 29346 + }, + { + "epoch": 16.394972067039106, + "grad_norm": 0.3994815945625305, + "learning_rate": 0.0001811484593837535, + "loss": 0.3421, + "step": 29347 + }, + { + "epoch": 16.395530726256982, + "grad_norm": 0.49727535247802734, + "learning_rate": 0.0001811204481792717, + "loss": 0.3814, + "step": 29348 + }, + { + "epoch": 16.39608938547486, + "grad_norm": 0.4964869022369385, + "learning_rate": 0.00018109243697478992, + "loss": 0.3946, + "step": 29349 + }, + { + "epoch": 16.39664804469274, + "grad_norm": 2.4565200805664062, + "learning_rate": 0.00018106442577030813, + "loss": 0.3296, + "step": 29350 + }, + { + "epoch": 16.397206703910616, + "grad_norm": 0.46368148922920227, + "learning_rate": 0.00018103641456582634, + "loss": 0.4344, + "step": 29351 + }, + { + "epoch": 16.397765363128492, + "grad_norm": 0.9674256443977356, + "learning_rate": 0.00018100840336134454, + "loss": 0.4751, + "step": 29352 + }, + { + "epoch": 16.39832402234637, + "grad_norm": 0.5346485376358032, + "learning_rate": 0.00018098039215686275, + "loss": 0.3908, + "step": 29353 + }, + { + "epoch": 16.398882681564245, + "grad_norm": 0.401483416557312, + "learning_rate": 0.00018095238095238095, + "loss": 0.3718, + "step": 29354 + }, + { + "epoch": 16.399441340782122, + "grad_norm": 1.0723282098770142, + "learning_rate": 0.00018092436974789916, + "loss": 0.459, + "step": 29355 + }, + { + "epoch": 16.4, + "grad_norm": 0.6425642371177673, + "learning_rate": 0.00018089635854341737, + "loss": 0.4133, + "step": 29356 + }, + { + "epoch": 16.40055865921788, + "grad_norm": 0.46530115604400635, + "learning_rate": 0.0001808683473389356, + "loss": 0.4477, + "step": 29357 + }, + { + "epoch": 16.401117318435755, + "grad_norm": 0.5972253084182739, + "learning_rate": 0.00018084033613445378, + "loss": 0.3748, + "step": 29358 + }, + { + "epoch": 16.401675977653632, + "grad_norm": 0.564058244228363, + "learning_rate": 0.00018081232492997198, + "loss": 0.5235, + "step": 29359 + }, + { + "epoch": 16.40223463687151, + "grad_norm": 0.46438759565353394, + "learning_rate": 0.0001807843137254902, + "loss": 0.4149, + "step": 29360 + }, + { + "epoch": 16.402793296089385, + "grad_norm": 0.5858387351036072, + "learning_rate": 0.00018075630252100842, + "loss": 0.6283, + "step": 29361 + }, + { + "epoch": 16.40335195530726, + "grad_norm": 1.0293725728988647, + "learning_rate": 0.00018072829131652663, + "loss": 0.4761, + "step": 29362 + }, + { + "epoch": 16.403910614525138, + "grad_norm": 0.5233083963394165, + "learning_rate": 0.0001807002801120448, + "loss": 0.5338, + "step": 29363 + }, + { + "epoch": 16.404469273743018, + "grad_norm": 0.39073577523231506, + "learning_rate": 0.000180672268907563, + "loss": 0.4256, + "step": 29364 + }, + { + "epoch": 16.405027932960895, + "grad_norm": 1.2213108539581299, + "learning_rate": 0.00018064425770308125, + "loss": 0.5129, + "step": 29365 + }, + { + "epoch": 16.40558659217877, + "grad_norm": 0.42700281739234924, + "learning_rate": 0.00018061624649859945, + "loss": 0.4128, + "step": 29366 + }, + { + "epoch": 16.406145251396648, + "grad_norm": 0.4969269037246704, + "learning_rate": 0.00018058823529411766, + "loss": 0.3885, + "step": 29367 + }, + { + "epoch": 16.406703910614524, + "grad_norm": 6.9765238761901855, + "learning_rate": 0.00018056022408963584, + "loss": 0.3337, + "step": 29368 + }, + { + "epoch": 16.4072625698324, + "grad_norm": 0.6105775237083435, + "learning_rate": 0.00018053221288515407, + "loss": 0.4606, + "step": 29369 + }, + { + "epoch": 16.407821229050278, + "grad_norm": 0.47373780608177185, + "learning_rate": 0.00018050420168067228, + "loss": 0.4018, + "step": 29370 + }, + { + "epoch": 16.408379888268158, + "grad_norm": 0.48614877462387085, + "learning_rate": 0.00018047619047619048, + "loss": 0.3678, + "step": 29371 + }, + { + "epoch": 16.408938547486034, + "grad_norm": 0.5047931671142578, + "learning_rate": 0.00018044817927170866, + "loss": 0.4625, + "step": 29372 + }, + { + "epoch": 16.40949720670391, + "grad_norm": 0.4170670211315155, + "learning_rate": 0.0001804201680672269, + "loss": 0.3593, + "step": 29373 + }, + { + "epoch": 16.410055865921787, + "grad_norm": 0.3998320996761322, + "learning_rate": 0.0001803921568627451, + "loss": 0.3599, + "step": 29374 + }, + { + "epoch": 16.410614525139664, + "grad_norm": 0.4551156461238861, + "learning_rate": 0.0001803641456582633, + "loss": 0.4183, + "step": 29375 + }, + { + "epoch": 16.41117318435754, + "grad_norm": 1.0150521993637085, + "learning_rate": 0.0001803361344537815, + "loss": 0.3646, + "step": 29376 + }, + { + "epoch": 16.41173184357542, + "grad_norm": 1.6198301315307617, + "learning_rate": 0.00018030812324929972, + "loss": 0.5356, + "step": 29377 + }, + { + "epoch": 16.412290502793297, + "grad_norm": 0.46991240978240967, + "learning_rate": 0.00018028011204481792, + "loss": 0.4629, + "step": 29378 + }, + { + "epoch": 16.412849162011174, + "grad_norm": 0.4913531243801117, + "learning_rate": 0.00018025210084033613, + "loss": 0.4256, + "step": 29379 + }, + { + "epoch": 16.41340782122905, + "grad_norm": 0.8520797491073608, + "learning_rate": 0.00018022408963585434, + "loss": 0.3958, + "step": 29380 + }, + { + "epoch": 16.413966480446927, + "grad_norm": 1.7837859392166138, + "learning_rate": 0.00018019607843137257, + "loss": 0.4377, + "step": 29381 + }, + { + "epoch": 16.414525139664804, + "grad_norm": 0.35356056690216064, + "learning_rate": 0.00018016806722689075, + "loss": 0.3857, + "step": 29382 + }, + { + "epoch": 16.41508379888268, + "grad_norm": 1.4621249437332153, + "learning_rate": 0.00018014005602240895, + "loss": 0.4112, + "step": 29383 + }, + { + "epoch": 16.41564245810056, + "grad_norm": 0.5419853925704956, + "learning_rate": 0.00018011204481792716, + "loss": 0.4548, + "step": 29384 + }, + { + "epoch": 16.416201117318437, + "grad_norm": 0.347685843706131, + "learning_rate": 0.0001800840336134454, + "loss": 0.3322, + "step": 29385 + }, + { + "epoch": 16.416759776536313, + "grad_norm": 0.4668864607810974, + "learning_rate": 0.0001800560224089636, + "loss": 0.3549, + "step": 29386 + }, + { + "epoch": 16.41731843575419, + "grad_norm": 0.8550448417663574, + "learning_rate": 0.00018002801120448178, + "loss": 0.4192, + "step": 29387 + }, + { + "epoch": 16.417877094972066, + "grad_norm": 0.4354788362979889, + "learning_rate": 0.00017999999999999998, + "loss": 0.3668, + "step": 29388 + }, + { + "epoch": 16.418435754189943, + "grad_norm": 0.3606705367565155, + "learning_rate": 0.00017997198879551822, + "loss": 0.3758, + "step": 29389 + }, + { + "epoch": 16.41899441340782, + "grad_norm": 0.43496471643447876, + "learning_rate": 0.00017994397759103642, + "loss": 0.3764, + "step": 29390 + }, + { + "epoch": 16.4195530726257, + "grad_norm": 1.3305996656417847, + "learning_rate": 0.00017991596638655463, + "loss": 0.5198, + "step": 29391 + }, + { + "epoch": 16.420111731843576, + "grad_norm": 0.4233115613460541, + "learning_rate": 0.0001798879551820728, + "loss": 0.4372, + "step": 29392 + }, + { + "epoch": 16.420670391061453, + "grad_norm": 0.35455557703971863, + "learning_rate": 0.00017985994397759104, + "loss": 0.3625, + "step": 29393 + }, + { + "epoch": 16.42122905027933, + "grad_norm": 0.6690486669540405, + "learning_rate": 0.00017983193277310925, + "loss": 0.6452, + "step": 29394 + }, + { + "epoch": 16.421787709497206, + "grad_norm": 0.43061983585357666, + "learning_rate": 0.00017980392156862745, + "loss": 0.3574, + "step": 29395 + }, + { + "epoch": 16.422346368715083, + "grad_norm": 0.6338505744934082, + "learning_rate": 0.00017977591036414566, + "loss": 0.3963, + "step": 29396 + }, + { + "epoch": 16.422905027932963, + "grad_norm": 0.4692417085170746, + "learning_rate": 0.00017974789915966387, + "loss": 0.4116, + "step": 29397 + }, + { + "epoch": 16.42346368715084, + "grad_norm": 1.1381837129592896, + "learning_rate": 0.00017971988795518207, + "loss": 0.3987, + "step": 29398 + }, + { + "epoch": 16.424022346368716, + "grad_norm": 0.44889065623283386, + "learning_rate": 0.00017969187675070028, + "loss": 0.3823, + "step": 29399 + }, + { + "epoch": 16.424581005586592, + "grad_norm": 0.37629449367523193, + "learning_rate": 0.00017966386554621848, + "loss": 0.3933, + "step": 29400 + }, + { + "epoch": 16.42513966480447, + "grad_norm": 0.380341112613678, + "learning_rate": 0.00017963585434173672, + "loss": 0.3203, + "step": 29401 + }, + { + "epoch": 16.425698324022346, + "grad_norm": 0.3731003701686859, + "learning_rate": 0.0001796078431372549, + "loss": 0.3838, + "step": 29402 + }, + { + "epoch": 16.426256983240222, + "grad_norm": 0.40624523162841797, + "learning_rate": 0.0001795798319327731, + "loss": 0.4334, + "step": 29403 + }, + { + "epoch": 16.426815642458102, + "grad_norm": 0.4896109104156494, + "learning_rate": 0.0001795518207282913, + "loss": 0.3675, + "step": 29404 + }, + { + "epoch": 16.42737430167598, + "grad_norm": 0.7210534811019897, + "learning_rate": 0.00017952380952380954, + "loss": 0.4412, + "step": 29405 + }, + { + "epoch": 16.427932960893855, + "grad_norm": 0.4975736737251282, + "learning_rate": 0.00017949579831932775, + "loss": 0.4504, + "step": 29406 + }, + { + "epoch": 16.428491620111732, + "grad_norm": 0.7467525601387024, + "learning_rate": 0.00017946778711484593, + "loss": 0.4592, + "step": 29407 + }, + { + "epoch": 16.42905027932961, + "grad_norm": 0.5787555575370789, + "learning_rate": 0.00017943977591036413, + "loss": 0.4144, + "step": 29408 + }, + { + "epoch": 16.429608938547485, + "grad_norm": 0.9575775265693665, + "learning_rate": 0.00017941176470588236, + "loss": 0.5782, + "step": 29409 + }, + { + "epoch": 16.43016759776536, + "grad_norm": 0.45134004950523376, + "learning_rate": 0.00017938375350140057, + "loss": 0.4362, + "step": 29410 + }, + { + "epoch": 16.43072625698324, + "grad_norm": 0.3954046368598938, + "learning_rate": 0.00017935574229691878, + "loss": 0.3687, + "step": 29411 + }, + { + "epoch": 16.43128491620112, + "grad_norm": 0.39862683415412903, + "learning_rate": 0.00017932773109243696, + "loss": 0.3575, + "step": 29412 + }, + { + "epoch": 16.431843575418995, + "grad_norm": 5.257641792297363, + "learning_rate": 0.0001792997198879552, + "loss": 0.6435, + "step": 29413 + }, + { + "epoch": 16.43240223463687, + "grad_norm": 0.47216475009918213, + "learning_rate": 0.0001792717086834734, + "loss": 0.4379, + "step": 29414 + }, + { + "epoch": 16.432960893854748, + "grad_norm": 0.41238969564437866, + "learning_rate": 0.0001792436974789916, + "loss": 0.387, + "step": 29415 + }, + { + "epoch": 16.433519553072625, + "grad_norm": 0.3795824348926544, + "learning_rate": 0.00017921568627450983, + "loss": 0.3687, + "step": 29416 + }, + { + "epoch": 16.4340782122905, + "grad_norm": 1.1302629709243774, + "learning_rate": 0.000179187675070028, + "loss": 0.3983, + "step": 29417 + }, + { + "epoch": 16.43463687150838, + "grad_norm": 0.4332362115383148, + "learning_rate": 0.00017915966386554622, + "loss": 0.4238, + "step": 29418 + }, + { + "epoch": 16.435195530726258, + "grad_norm": 0.36636883020401, + "learning_rate": 0.00017913165266106442, + "loss": 0.3809, + "step": 29419 + }, + { + "epoch": 16.435754189944134, + "grad_norm": 0.39438527822494507, + "learning_rate": 0.00017910364145658266, + "loss": 0.3385, + "step": 29420 + }, + { + "epoch": 16.43631284916201, + "grad_norm": 0.4470037519931793, + "learning_rate": 0.00017907563025210086, + "loss": 0.4038, + "step": 29421 + }, + { + "epoch": 16.436871508379888, + "grad_norm": 0.35678696632385254, + "learning_rate": 0.00017904761904761904, + "loss": 0.4445, + "step": 29422 + }, + { + "epoch": 16.437430167597764, + "grad_norm": 39.401817321777344, + "learning_rate": 0.00017901960784313725, + "loss": 0.3245, + "step": 29423 + }, + { + "epoch": 16.43798882681564, + "grad_norm": 0.47048747539520264, + "learning_rate": 0.00017899159663865548, + "loss": 0.3636, + "step": 29424 + }, + { + "epoch": 16.43854748603352, + "grad_norm": 0.60615473985672, + "learning_rate": 0.0001789635854341737, + "loss": 0.3999, + "step": 29425 + }, + { + "epoch": 16.439106145251397, + "grad_norm": 0.49049490690231323, + "learning_rate": 0.0001789355742296919, + "loss": 0.3878, + "step": 29426 + }, + { + "epoch": 16.439664804469274, + "grad_norm": 0.5932494401931763, + "learning_rate": 0.00017890756302521007, + "loss": 0.5057, + "step": 29427 + }, + { + "epoch": 16.44022346368715, + "grad_norm": 0.41386163234710693, + "learning_rate": 0.0001788795518207283, + "loss": 0.3113, + "step": 29428 + }, + { + "epoch": 16.440782122905027, + "grad_norm": 0.39143890142440796, + "learning_rate": 0.0001788515406162465, + "loss": 0.3211, + "step": 29429 + }, + { + "epoch": 16.441340782122904, + "grad_norm": 0.47978535294532776, + "learning_rate": 0.00017882352941176472, + "loss": 0.4291, + "step": 29430 + }, + { + "epoch": 16.441899441340784, + "grad_norm": 0.4087378680706024, + "learning_rate": 0.0001787955182072829, + "loss": 0.4357, + "step": 29431 + }, + { + "epoch": 16.44245810055866, + "grad_norm": 0.4008702337741852, + "learning_rate": 0.00017876750700280113, + "loss": 0.4008, + "step": 29432 + }, + { + "epoch": 16.443016759776537, + "grad_norm": 0.666835606098175, + "learning_rate": 0.00017873949579831934, + "loss": 0.4823, + "step": 29433 + }, + { + "epoch": 16.443575418994413, + "grad_norm": 0.3378714323043823, + "learning_rate": 0.00017871148459383754, + "loss": 0.311, + "step": 29434 + }, + { + "epoch": 16.44413407821229, + "grad_norm": 6.9305100440979, + "learning_rate": 0.00017868347338935575, + "loss": 0.8419, + "step": 29435 + }, + { + "epoch": 16.444692737430167, + "grad_norm": 0.5294859409332275, + "learning_rate": 0.00017865546218487395, + "loss": 0.3849, + "step": 29436 + }, + { + "epoch": 16.445251396648043, + "grad_norm": 0.47561389207839966, + "learning_rate": 0.00017862745098039216, + "loss": 0.4249, + "step": 29437 + }, + { + "epoch": 16.445810055865923, + "grad_norm": 0.9982561469078064, + "learning_rate": 0.00017859943977591037, + "loss": 0.4407, + "step": 29438 + }, + { + "epoch": 16.4463687150838, + "grad_norm": 0.5340924859046936, + "learning_rate": 0.00017857142857142857, + "loss": 0.4228, + "step": 29439 + }, + { + "epoch": 16.446927374301676, + "grad_norm": 0.661406397819519, + "learning_rate": 0.0001785434173669468, + "loss": 0.4038, + "step": 29440 + }, + { + "epoch": 16.447486033519553, + "grad_norm": 0.5189449787139893, + "learning_rate": 0.00017851540616246498, + "loss": 0.4325, + "step": 29441 + }, + { + "epoch": 16.44804469273743, + "grad_norm": 0.45457956194877625, + "learning_rate": 0.0001784873949579832, + "loss": 0.3436, + "step": 29442 + }, + { + "epoch": 16.448603351955306, + "grad_norm": 0.5822194218635559, + "learning_rate": 0.0001784593837535014, + "loss": 0.5424, + "step": 29443 + }, + { + "epoch": 16.449162011173183, + "grad_norm": 0.6378493309020996, + "learning_rate": 0.00017843137254901963, + "loss": 0.5246, + "step": 29444 + }, + { + "epoch": 16.449720670391063, + "grad_norm": 0.967460572719574, + "learning_rate": 0.00017840336134453783, + "loss": 0.3991, + "step": 29445 + }, + { + "epoch": 16.45027932960894, + "grad_norm": 8.89358139038086, + "learning_rate": 0.000178375350140056, + "loss": 0.3686, + "step": 29446 + }, + { + "epoch": 16.450837988826816, + "grad_norm": 0.5115857720375061, + "learning_rate": 0.00017834733893557422, + "loss": 0.5709, + "step": 29447 + }, + { + "epoch": 16.451396648044692, + "grad_norm": 1.2989612817764282, + "learning_rate": 0.00017831932773109245, + "loss": 0.544, + "step": 29448 + }, + { + "epoch": 16.45195530726257, + "grad_norm": 0.4012458324432373, + "learning_rate": 0.00017829131652661066, + "loss": 0.4386, + "step": 29449 + }, + { + "epoch": 16.452513966480446, + "grad_norm": 0.4779501259326935, + "learning_rate": 0.00017826330532212886, + "loss": 0.4002, + "step": 29450 + }, + { + "epoch": 16.453072625698326, + "grad_norm": 0.36083459854125977, + "learning_rate": 0.00017823529411764704, + "loss": 0.3723, + "step": 29451 + }, + { + "epoch": 16.453631284916202, + "grad_norm": 2.731084108352661, + "learning_rate": 0.00017820728291316528, + "loss": 0.48, + "step": 29452 + }, + { + "epoch": 16.45418994413408, + "grad_norm": 0.5182414650917053, + "learning_rate": 0.00017817927170868348, + "loss": 0.449, + "step": 29453 + }, + { + "epoch": 16.454748603351955, + "grad_norm": 0.32445117831230164, + "learning_rate": 0.0001781512605042017, + "loss": 0.319, + "step": 29454 + }, + { + "epoch": 16.455307262569832, + "grad_norm": 0.6372569799423218, + "learning_rate": 0.0001781232492997199, + "loss": 0.3511, + "step": 29455 + }, + { + "epoch": 16.45586592178771, + "grad_norm": 0.4452030658721924, + "learning_rate": 0.0001780952380952381, + "loss": 0.3397, + "step": 29456 + }, + { + "epoch": 16.456424581005585, + "grad_norm": 0.4560225307941437, + "learning_rate": 0.0001780672268907563, + "loss": 0.394, + "step": 29457 + }, + { + "epoch": 16.456983240223465, + "grad_norm": 0.5394875407218933, + "learning_rate": 0.0001780392156862745, + "loss": 0.5881, + "step": 29458 + }, + { + "epoch": 16.457541899441342, + "grad_norm": 0.48844999074935913, + "learning_rate": 0.00017801120448179272, + "loss": 0.3734, + "step": 29459 + }, + { + "epoch": 16.45810055865922, + "grad_norm": 0.4158102869987488, + "learning_rate": 0.00017798319327731095, + "loss": 0.2226, + "step": 29460 + }, + { + "epoch": 16.458659217877095, + "grad_norm": 0.6215737462043762, + "learning_rate": 0.00017795518207282913, + "loss": 0.4187, + "step": 29461 + }, + { + "epoch": 16.45921787709497, + "grad_norm": 1.918904423713684, + "learning_rate": 0.00017792717086834734, + "loss": 0.3467, + "step": 29462 + }, + { + "epoch": 16.459776536312848, + "grad_norm": 0.4171788692474365, + "learning_rate": 0.00017789915966386554, + "loss": 0.3431, + "step": 29463 + }, + { + "epoch": 16.460335195530725, + "grad_norm": 0.36005398631095886, + "learning_rate": 0.00017787114845938378, + "loss": 0.3756, + "step": 29464 + }, + { + "epoch": 16.460893854748605, + "grad_norm": 0.528662919998169, + "learning_rate": 0.00017784313725490198, + "loss": 0.5389, + "step": 29465 + }, + { + "epoch": 16.46145251396648, + "grad_norm": 0.44890251755714417, + "learning_rate": 0.00017781512605042016, + "loss": 0.3353, + "step": 29466 + }, + { + "epoch": 16.462011173184358, + "grad_norm": 0.33808767795562744, + "learning_rate": 0.00017778711484593837, + "loss": 0.4475, + "step": 29467 + }, + { + "epoch": 16.462569832402234, + "grad_norm": 0.44032731652259827, + "learning_rate": 0.0001777591036414566, + "loss": 0.4351, + "step": 29468 + }, + { + "epoch": 16.46312849162011, + "grad_norm": 3.4745051860809326, + "learning_rate": 0.0001777310924369748, + "loss": 0.4358, + "step": 29469 + }, + { + "epoch": 16.463687150837988, + "grad_norm": 0.4250917434692383, + "learning_rate": 0.000177703081232493, + "loss": 0.5355, + "step": 29470 + }, + { + "epoch": 16.464245810055864, + "grad_norm": 0.33331140875816345, + "learning_rate": 0.0001776750700280112, + "loss": 0.3472, + "step": 29471 + }, + { + "epoch": 16.464804469273744, + "grad_norm": 1.1574751138687134, + "learning_rate": 0.00017764705882352942, + "loss": 0.4325, + "step": 29472 + }, + { + "epoch": 16.46536312849162, + "grad_norm": 0.48738908767700195, + "learning_rate": 0.00017761904761904763, + "loss": 0.3956, + "step": 29473 + }, + { + "epoch": 16.465921787709497, + "grad_norm": 0.41552281379699707, + "learning_rate": 0.00017759103641456584, + "loss": 0.3715, + "step": 29474 + }, + { + "epoch": 16.466480446927374, + "grad_norm": 1.4205689430236816, + "learning_rate": 0.00017756302521008404, + "loss": 0.354, + "step": 29475 + }, + { + "epoch": 16.46703910614525, + "grad_norm": 0.4148227274417877, + "learning_rate": 0.00017753501400560225, + "loss": 0.2938, + "step": 29476 + }, + { + "epoch": 16.467597765363127, + "grad_norm": 0.4061793386936188, + "learning_rate": 0.00017750700280112045, + "loss": 0.4087, + "step": 29477 + }, + { + "epoch": 16.468156424581007, + "grad_norm": 0.43411895632743835, + "learning_rate": 0.00017747899159663866, + "loss": 0.3314, + "step": 29478 + }, + { + "epoch": 16.468715083798884, + "grad_norm": 0.4680617153644562, + "learning_rate": 0.00017745098039215687, + "loss": 0.4388, + "step": 29479 + }, + { + "epoch": 16.46927374301676, + "grad_norm": 0.424232542514801, + "learning_rate": 0.0001774229691876751, + "loss": 0.5143, + "step": 29480 + }, + { + "epoch": 16.469832402234637, + "grad_norm": 0.5051302313804626, + "learning_rate": 0.00017739495798319328, + "loss": 0.6205, + "step": 29481 + }, + { + "epoch": 16.470391061452514, + "grad_norm": 0.48615598678588867, + "learning_rate": 0.00017736694677871148, + "loss": 0.3775, + "step": 29482 + }, + { + "epoch": 16.47094972067039, + "grad_norm": 0.42735230922698975, + "learning_rate": 0.0001773389355742297, + "loss": 0.3789, + "step": 29483 + }, + { + "epoch": 16.471508379888267, + "grad_norm": 1.5181976556777954, + "learning_rate": 0.00017731092436974792, + "loss": 0.4173, + "step": 29484 + }, + { + "epoch": 16.472067039106147, + "grad_norm": 0.8731113076210022, + "learning_rate": 0.0001772829131652661, + "loss": 0.4225, + "step": 29485 + }, + { + "epoch": 16.472625698324023, + "grad_norm": 0.46482938528060913, + "learning_rate": 0.0001772549019607843, + "loss": 0.452, + "step": 29486 + }, + { + "epoch": 16.4731843575419, + "grad_norm": 0.36842676997184753, + "learning_rate": 0.0001772268907563025, + "loss": 0.4023, + "step": 29487 + }, + { + "epoch": 16.473743016759776, + "grad_norm": 0.3433108627796173, + "learning_rate": 0.00017719887955182075, + "loss": 0.3581, + "step": 29488 + }, + { + "epoch": 16.474301675977653, + "grad_norm": 0.4484669268131256, + "learning_rate": 0.00017717086834733895, + "loss": 0.4462, + "step": 29489 + }, + { + "epoch": 16.47486033519553, + "grad_norm": 20.87667465209961, + "learning_rate": 0.00017714285714285713, + "loss": 0.5043, + "step": 29490 + }, + { + "epoch": 16.475418994413406, + "grad_norm": 0.6367224454879761, + "learning_rate": 0.00017711484593837534, + "loss": 0.4368, + "step": 29491 + }, + { + "epoch": 16.475977653631286, + "grad_norm": 1.043039083480835, + "learning_rate": 0.00017708683473389357, + "loss": 0.5384, + "step": 29492 + }, + { + "epoch": 16.476536312849163, + "grad_norm": 0.3519802391529083, + "learning_rate": 0.00017705882352941178, + "loss": 0.3477, + "step": 29493 + }, + { + "epoch": 16.47709497206704, + "grad_norm": 0.3984992802143097, + "learning_rate": 0.00017703081232492998, + "loss": 0.4305, + "step": 29494 + }, + { + "epoch": 16.477653631284916, + "grad_norm": 1.0287542343139648, + "learning_rate": 0.00017700280112044816, + "loss": 0.6056, + "step": 29495 + }, + { + "epoch": 16.478212290502793, + "grad_norm": 0.530757486820221, + "learning_rate": 0.0001769747899159664, + "loss": 0.3759, + "step": 29496 + }, + { + "epoch": 16.47877094972067, + "grad_norm": 0.3497740626335144, + "learning_rate": 0.0001769467787114846, + "loss": 0.3848, + "step": 29497 + }, + { + "epoch": 16.47932960893855, + "grad_norm": 0.6007401347160339, + "learning_rate": 0.0001769187675070028, + "loss": 0.4708, + "step": 29498 + }, + { + "epoch": 16.479888268156426, + "grad_norm": 0.4128912091255188, + "learning_rate": 0.000176890756302521, + "loss": 0.3202, + "step": 29499 + }, + { + "epoch": 16.480446927374302, + "grad_norm": 0.3892780542373657, + "learning_rate": 0.00017686274509803922, + "loss": 0.3687, + "step": 29500 + }, + { + "epoch": 16.480446927374302, + "eval_cer": 0.08526180266876153, + "eval_loss": 0.32208189368247986, + "eval_runtime": 55.4806, + "eval_samples_per_second": 81.794, + "eval_steps_per_second": 5.119, + "eval_wer": 0.3367783022820592, + "step": 29500 + }, + { + "epoch": 16.48100558659218, + "grad_norm": 0.5070006847381592, + "learning_rate": 0.00017683473389355742, + "loss": 0.5553, + "step": 29501 + }, + { + "epoch": 16.481564245810056, + "grad_norm": 0.5133941173553467, + "learning_rate": 0.00017680672268907563, + "loss": 0.3917, + "step": 29502 + }, + { + "epoch": 16.482122905027932, + "grad_norm": 0.4133491516113281, + "learning_rate": 0.00017677871148459384, + "loss": 0.4881, + "step": 29503 + }, + { + "epoch": 16.48268156424581, + "grad_norm": 1.5901294946670532, + "learning_rate": 0.00017675070028011207, + "loss": 0.3937, + "step": 29504 + }, + { + "epoch": 16.48324022346369, + "grad_norm": 0.3636549711227417, + "learning_rate": 0.00017672268907563025, + "loss": 0.2974, + "step": 29505 + }, + { + "epoch": 16.483798882681565, + "grad_norm": 0.8664739727973938, + "learning_rate": 0.00017669467787114845, + "loss": 0.561, + "step": 29506 + }, + { + "epoch": 16.484357541899442, + "grad_norm": 0.9924991726875305, + "learning_rate": 0.00017666666666666666, + "loss": 0.5394, + "step": 29507 + }, + { + "epoch": 16.48491620111732, + "grad_norm": 0.3601953089237213, + "learning_rate": 0.0001766386554621849, + "loss": 0.3947, + "step": 29508 + }, + { + "epoch": 16.485474860335195, + "grad_norm": 0.39514946937561035, + "learning_rate": 0.0001766106442577031, + "loss": 0.3079, + "step": 29509 + }, + { + "epoch": 16.48603351955307, + "grad_norm": 0.4760145843029022, + "learning_rate": 0.00017658263305322128, + "loss": 0.3853, + "step": 29510 + }, + { + "epoch": 16.486592178770948, + "grad_norm": 0.9267053008079529, + "learning_rate": 0.00017655462184873948, + "loss": 0.4347, + "step": 29511 + }, + { + "epoch": 16.48715083798883, + "grad_norm": 0.5790505409240723, + "learning_rate": 0.00017652661064425772, + "loss": 0.4244, + "step": 29512 + }, + { + "epoch": 16.487709497206705, + "grad_norm": 0.5494601726531982, + "learning_rate": 0.00017649859943977592, + "loss": 0.5681, + "step": 29513 + }, + { + "epoch": 16.48826815642458, + "grad_norm": 0.43294307589530945, + "learning_rate": 0.00017647058823529413, + "loss": 0.404, + "step": 29514 + }, + { + "epoch": 16.488826815642458, + "grad_norm": 0.5143566131591797, + "learning_rate": 0.0001764425770308123, + "loss": 0.4754, + "step": 29515 + }, + { + "epoch": 16.489385474860335, + "grad_norm": 1.9302611351013184, + "learning_rate": 0.00017641456582633054, + "loss": 0.3845, + "step": 29516 + }, + { + "epoch": 16.48994413407821, + "grad_norm": 0.5820269584655762, + "learning_rate": 0.00017638655462184875, + "loss": 0.3088, + "step": 29517 + }, + { + "epoch": 16.490502793296088, + "grad_norm": 0.37830305099487305, + "learning_rate": 0.00017635854341736695, + "loss": 0.4489, + "step": 29518 + }, + { + "epoch": 16.491061452513968, + "grad_norm": 0.4058821201324463, + "learning_rate": 0.00017633053221288516, + "loss": 0.429, + "step": 29519 + }, + { + "epoch": 16.491620111731844, + "grad_norm": 0.3129950761795044, + "learning_rate": 0.00017630252100840337, + "loss": 0.3669, + "step": 29520 + }, + { + "epoch": 16.49217877094972, + "grad_norm": 0.4125111401081085, + "learning_rate": 0.00017627450980392157, + "loss": 0.3638, + "step": 29521 + }, + { + "epoch": 16.492737430167598, + "grad_norm": 0.850448727607727, + "learning_rate": 0.00017624649859943978, + "loss": 0.3668, + "step": 29522 + }, + { + "epoch": 16.493296089385474, + "grad_norm": 1.0357177257537842, + "learning_rate": 0.00017621848739495798, + "loss": 0.3691, + "step": 29523 + }, + { + "epoch": 16.49385474860335, + "grad_norm": 0.6750121116638184, + "learning_rate": 0.00017619047619047622, + "loss": 0.4634, + "step": 29524 + }, + { + "epoch": 16.49441340782123, + "grad_norm": 0.40457144379615784, + "learning_rate": 0.0001761624649859944, + "loss": 0.4212, + "step": 29525 + }, + { + "epoch": 16.494972067039107, + "grad_norm": 0.3802253007888794, + "learning_rate": 0.0001761344537815126, + "loss": 0.4377, + "step": 29526 + }, + { + "epoch": 16.495530726256984, + "grad_norm": 0.5436435341835022, + "learning_rate": 0.0001761064425770308, + "loss": 0.3979, + "step": 29527 + }, + { + "epoch": 16.49608938547486, + "grad_norm": 0.8033078908920288, + "learning_rate": 0.00017607843137254904, + "loss": 0.4785, + "step": 29528 + }, + { + "epoch": 16.496648044692737, + "grad_norm": 0.46778449416160583, + "learning_rate": 0.00017605042016806725, + "loss": 0.4041, + "step": 29529 + }, + { + "epoch": 16.497206703910614, + "grad_norm": 0.38254237174987793, + "learning_rate": 0.00017602240896358543, + "loss": 0.4057, + "step": 29530 + }, + { + "epoch": 16.49776536312849, + "grad_norm": 0.4245761036872864, + "learning_rate": 0.00017599439775910363, + "loss": 0.3903, + "step": 29531 + }, + { + "epoch": 16.49832402234637, + "grad_norm": 0.8905037045478821, + "learning_rate": 0.00017596638655462186, + "loss": 0.3991, + "step": 29532 + }, + { + "epoch": 16.498882681564247, + "grad_norm": 1.5270904302597046, + "learning_rate": 0.00017593837535014007, + "loss": 0.3446, + "step": 29533 + }, + { + "epoch": 16.499441340782123, + "grad_norm": 1.6669657230377197, + "learning_rate": 0.00017591036414565828, + "loss": 0.3426, + "step": 29534 + }, + { + "epoch": 16.5, + "grad_norm": 0.34264275431632996, + "learning_rate": 0.00017588235294117646, + "loss": 0.3028, + "step": 29535 + }, + { + "epoch": 16.500558659217877, + "grad_norm": 0.4609209895133972, + "learning_rate": 0.0001758543417366947, + "loss": 0.3911, + "step": 29536 + }, + { + "epoch": 16.501117318435753, + "grad_norm": 0.8710081577301025, + "learning_rate": 0.0001758263305322129, + "loss": 0.4124, + "step": 29537 + }, + { + "epoch": 16.50167597765363, + "grad_norm": 0.4963414967060089, + "learning_rate": 0.0001757983193277311, + "loss": 0.365, + "step": 29538 + }, + { + "epoch": 16.50223463687151, + "grad_norm": 0.8065279126167297, + "learning_rate": 0.00017577030812324928, + "loss": 0.367, + "step": 29539 + }, + { + "epoch": 16.502793296089386, + "grad_norm": 0.4079006016254425, + "learning_rate": 0.0001757422969187675, + "loss": 0.4052, + "step": 29540 + }, + { + "epoch": 16.503351955307263, + "grad_norm": 0.47505417466163635, + "learning_rate": 0.00017571428571428572, + "loss": 0.5816, + "step": 29541 + }, + { + "epoch": 16.50391061452514, + "grad_norm": 0.49536171555519104, + "learning_rate": 0.00017568627450980392, + "loss": 0.4312, + "step": 29542 + }, + { + "epoch": 16.504469273743016, + "grad_norm": 11.939494132995605, + "learning_rate": 0.00017565826330532213, + "loss": 0.4046, + "step": 29543 + }, + { + "epoch": 16.505027932960893, + "grad_norm": 0.44392749667167664, + "learning_rate": 0.00017563025210084034, + "loss": 0.3759, + "step": 29544 + }, + { + "epoch": 16.505586592178773, + "grad_norm": 0.3380763828754425, + "learning_rate": 0.00017560224089635854, + "loss": 0.3819, + "step": 29545 + }, + { + "epoch": 16.50614525139665, + "grad_norm": 0.44330930709838867, + "learning_rate": 0.00017557422969187675, + "loss": 0.3929, + "step": 29546 + }, + { + "epoch": 16.506703910614526, + "grad_norm": 0.5489369034767151, + "learning_rate": 0.00017554621848739495, + "loss": 0.4064, + "step": 29547 + }, + { + "epoch": 16.507262569832402, + "grad_norm": 0.46115559339523315, + "learning_rate": 0.0001755182072829132, + "loss": 0.4147, + "step": 29548 + }, + { + "epoch": 16.50782122905028, + "grad_norm": 0.9333981275558472, + "learning_rate": 0.00017549019607843137, + "loss": 0.4228, + "step": 29549 + }, + { + "epoch": 16.508379888268156, + "grad_norm": 0.512912929058075, + "learning_rate": 0.00017546218487394957, + "loss": 0.396, + "step": 29550 + }, + { + "epoch": 16.508938547486032, + "grad_norm": 0.383973628282547, + "learning_rate": 0.00017543417366946778, + "loss": 0.3502, + "step": 29551 + }, + { + "epoch": 16.509497206703912, + "grad_norm": 0.7826922535896301, + "learning_rate": 0.000175406162464986, + "loss": 0.5093, + "step": 29552 + }, + { + "epoch": 16.51005586592179, + "grad_norm": 1.2715567350387573, + "learning_rate": 0.00017537815126050422, + "loss": 0.4078, + "step": 29553 + }, + { + "epoch": 16.510614525139665, + "grad_norm": 0.7431827783584595, + "learning_rate": 0.0001753501400560224, + "loss": 0.3956, + "step": 29554 + }, + { + "epoch": 16.511173184357542, + "grad_norm": 0.6131908893585205, + "learning_rate": 0.0001753221288515406, + "loss": 0.4175, + "step": 29555 + }, + { + "epoch": 16.51173184357542, + "grad_norm": 0.3459623157978058, + "learning_rate": 0.00017529411764705884, + "loss": 0.382, + "step": 29556 + }, + { + "epoch": 16.512290502793295, + "grad_norm": 0.3784160912036896, + "learning_rate": 0.00017526610644257704, + "loss": 0.4012, + "step": 29557 + }, + { + "epoch": 16.51284916201117, + "grad_norm": 0.8570274710655212, + "learning_rate": 0.00017523809523809525, + "loss": 0.4445, + "step": 29558 + }, + { + "epoch": 16.513407821229052, + "grad_norm": 0.3129422962665558, + "learning_rate": 0.00017521008403361343, + "loss": 0.4027, + "step": 29559 + }, + { + "epoch": 16.51396648044693, + "grad_norm": 0.5028800964355469, + "learning_rate": 0.00017518207282913166, + "loss": 0.347, + "step": 29560 + }, + { + "epoch": 16.514525139664805, + "grad_norm": 0.45034411549568176, + "learning_rate": 0.00017515406162464987, + "loss": 0.3658, + "step": 29561 + }, + { + "epoch": 16.51508379888268, + "grad_norm": 0.49262264370918274, + "learning_rate": 0.00017512605042016807, + "loss": 0.3676, + "step": 29562 + }, + { + "epoch": 16.515642458100558, + "grad_norm": 0.549745500087738, + "learning_rate": 0.00017509803921568628, + "loss": 0.3593, + "step": 29563 + }, + { + "epoch": 16.516201117318435, + "grad_norm": 0.7150022387504578, + "learning_rate": 0.00017507002801120448, + "loss": 0.3858, + "step": 29564 + }, + { + "epoch": 16.51675977653631, + "grad_norm": 0.3563117980957031, + "learning_rate": 0.0001750420168067227, + "loss": 0.3856, + "step": 29565 + }, + { + "epoch": 16.51731843575419, + "grad_norm": 0.44275540113449097, + "learning_rate": 0.0001750140056022409, + "loss": 0.3878, + "step": 29566 + }, + { + "epoch": 16.517877094972068, + "grad_norm": 0.6995197534561157, + "learning_rate": 0.0001749859943977591, + "loss": 0.3762, + "step": 29567 + }, + { + "epoch": 16.518435754189944, + "grad_norm": 0.5018677115440369, + "learning_rate": 0.00017495798319327733, + "loss": 0.4436, + "step": 29568 + }, + { + "epoch": 16.51899441340782, + "grad_norm": 0.5029260516166687, + "learning_rate": 0.0001749299719887955, + "loss": 0.4151, + "step": 29569 + }, + { + "epoch": 16.519553072625698, + "grad_norm": 0.4033641815185547, + "learning_rate": 0.00017490196078431372, + "loss": 0.4366, + "step": 29570 + }, + { + "epoch": 16.520111731843574, + "grad_norm": 0.777968168258667, + "learning_rate": 0.00017487394957983193, + "loss": 0.3825, + "step": 29571 + }, + { + "epoch": 16.52067039106145, + "grad_norm": 0.4416103959083557, + "learning_rate": 0.00017484593837535016, + "loss": 0.4224, + "step": 29572 + }, + { + "epoch": 16.52122905027933, + "grad_norm": 11.347146987915039, + "learning_rate": 0.00017481792717086836, + "loss": 0.4416, + "step": 29573 + }, + { + "epoch": 16.521787709497207, + "grad_norm": 0.4550493061542511, + "learning_rate": 0.00017478991596638654, + "loss": 0.3259, + "step": 29574 + }, + { + "epoch": 16.522346368715084, + "grad_norm": 0.6317253112792969, + "learning_rate": 0.00017476190476190475, + "loss": 0.3822, + "step": 29575 + }, + { + "epoch": 16.52290502793296, + "grad_norm": 0.6731358170509338, + "learning_rate": 0.00017473389355742298, + "loss": 0.4104, + "step": 29576 + }, + { + "epoch": 16.523463687150837, + "grad_norm": 0.7160224914550781, + "learning_rate": 0.0001747058823529412, + "loss": 0.2865, + "step": 29577 + }, + { + "epoch": 16.524022346368714, + "grad_norm": 0.4603045880794525, + "learning_rate": 0.0001746778711484594, + "loss": 0.3875, + "step": 29578 + }, + { + "epoch": 16.524581005586594, + "grad_norm": 0.43759676814079285, + "learning_rate": 0.00017464985994397757, + "loss": 0.4154, + "step": 29579 + }, + { + "epoch": 16.52513966480447, + "grad_norm": 0.39633890986442566, + "learning_rate": 0.0001746218487394958, + "loss": 0.4358, + "step": 29580 + }, + { + "epoch": 16.525698324022347, + "grad_norm": 0.3954674303531647, + "learning_rate": 0.000174593837535014, + "loss": 0.4003, + "step": 29581 + }, + { + "epoch": 16.526256983240224, + "grad_norm": 0.39999040961265564, + "learning_rate": 0.00017456582633053222, + "loss": 0.4105, + "step": 29582 + }, + { + "epoch": 16.5268156424581, + "grad_norm": 4.218301773071289, + "learning_rate": 0.00017453781512605042, + "loss": 0.4567, + "step": 29583 + }, + { + "epoch": 16.527374301675977, + "grad_norm": 2.76589035987854, + "learning_rate": 0.00017450980392156863, + "loss": 0.5069, + "step": 29584 + }, + { + "epoch": 16.527932960893853, + "grad_norm": 0.4792419970035553, + "learning_rate": 0.00017448179271708684, + "loss": 0.399, + "step": 29585 + }, + { + "epoch": 16.528491620111733, + "grad_norm": 0.5354159474372864, + "learning_rate": 0.00017445378151260504, + "loss": 0.4727, + "step": 29586 + }, + { + "epoch": 16.52905027932961, + "grad_norm": 0.38616445660591125, + "learning_rate": 0.00017442577030812325, + "loss": 0.3983, + "step": 29587 + }, + { + "epoch": 16.529608938547486, + "grad_norm": 0.4955047369003296, + "learning_rate": 0.00017439775910364148, + "loss": 0.394, + "step": 29588 + }, + { + "epoch": 16.530167597765363, + "grad_norm": 0.3584757149219513, + "learning_rate": 0.00017436974789915966, + "loss": 0.362, + "step": 29589 + }, + { + "epoch": 16.53072625698324, + "grad_norm": 0.40841972827911377, + "learning_rate": 0.00017434173669467787, + "loss": 0.3846, + "step": 29590 + }, + { + "epoch": 16.531284916201116, + "grad_norm": 0.9143375158309937, + "learning_rate": 0.00017431372549019607, + "loss": 0.4077, + "step": 29591 + }, + { + "epoch": 16.531843575418993, + "grad_norm": 0.4993152916431427, + "learning_rate": 0.0001742857142857143, + "loss": 0.4283, + "step": 29592 + }, + { + "epoch": 16.532402234636873, + "grad_norm": 15.13156795501709, + "learning_rate": 0.0001742577030812325, + "loss": 0.417, + "step": 29593 + }, + { + "epoch": 16.53296089385475, + "grad_norm": 0.5883806943893433, + "learning_rate": 0.0001742296918767507, + "loss": 0.3737, + "step": 29594 + }, + { + "epoch": 16.533519553072626, + "grad_norm": 0.4294358193874359, + "learning_rate": 0.0001742016806722689, + "loss": 0.4663, + "step": 29595 + }, + { + "epoch": 16.534078212290503, + "grad_norm": 0.5711474418640137, + "learning_rate": 0.00017417366946778713, + "loss": 0.3758, + "step": 29596 + }, + { + "epoch": 16.53463687150838, + "grad_norm": 0.45019978284835815, + "learning_rate": 0.00017414565826330534, + "loss": 0.4607, + "step": 29597 + }, + { + "epoch": 16.535195530726256, + "grad_norm": 0.45127925276756287, + "learning_rate": 0.00017411764705882351, + "loss": 0.5163, + "step": 29598 + }, + { + "epoch": 16.535754189944136, + "grad_norm": 7.641594886779785, + "learning_rate": 0.00017408963585434172, + "loss": 0.3904, + "step": 29599 + }, + { + "epoch": 16.536312849162012, + "grad_norm": 0.4126541316509247, + "learning_rate": 0.00017406162464985995, + "loss": 0.4359, + "step": 29600 + }, + { + "epoch": 16.53687150837989, + "grad_norm": 0.4351097643375397, + "learning_rate": 0.00017403361344537816, + "loss": 0.4096, + "step": 29601 + }, + { + "epoch": 16.537430167597766, + "grad_norm": 3.101602077484131, + "learning_rate": 0.00017400560224089637, + "loss": 0.4188, + "step": 29602 + }, + { + "epoch": 16.537988826815642, + "grad_norm": 0.5362922549247742, + "learning_rate": 0.00017397759103641454, + "loss": 0.3835, + "step": 29603 + }, + { + "epoch": 16.53854748603352, + "grad_norm": 0.44642752408981323, + "learning_rate": 0.00017394957983193278, + "loss": 0.373, + "step": 29604 + }, + { + "epoch": 16.539106145251395, + "grad_norm": 0.4679783284664154, + "learning_rate": 0.00017392156862745098, + "loss": 0.3426, + "step": 29605 + }, + { + "epoch": 16.539664804469275, + "grad_norm": 1.766141414642334, + "learning_rate": 0.0001738935574229692, + "loss": 0.3562, + "step": 29606 + }, + { + "epoch": 16.540223463687152, + "grad_norm": 1.2823249101638794, + "learning_rate": 0.0001738655462184874, + "loss": 0.4688, + "step": 29607 + }, + { + "epoch": 16.54078212290503, + "grad_norm": 1.7663451433181763, + "learning_rate": 0.0001738375350140056, + "loss": 0.4044, + "step": 29608 + }, + { + "epoch": 16.541340782122905, + "grad_norm": 1.6318271160125732, + "learning_rate": 0.0001738095238095238, + "loss": 0.5122, + "step": 29609 + }, + { + "epoch": 16.54189944134078, + "grad_norm": 0.9168058633804321, + "learning_rate": 0.000173781512605042, + "loss": 0.3594, + "step": 29610 + }, + { + "epoch": 16.542458100558658, + "grad_norm": 0.5199082493782043, + "learning_rate": 0.00017375350140056022, + "loss": 0.4907, + "step": 29611 + }, + { + "epoch": 16.543016759776535, + "grad_norm": 0.6399844288825989, + "learning_rate": 0.00017372549019607845, + "loss": 0.5804, + "step": 29612 + }, + { + "epoch": 16.543575418994415, + "grad_norm": 0.9436054825782776, + "learning_rate": 0.00017369747899159663, + "loss": 0.2925, + "step": 29613 + }, + { + "epoch": 16.54413407821229, + "grad_norm": 0.544022262096405, + "learning_rate": 0.00017366946778711484, + "loss": 0.4792, + "step": 29614 + }, + { + "epoch": 16.544692737430168, + "grad_norm": 0.4976023733615875, + "learning_rate": 0.00017364145658263304, + "loss": 0.4114, + "step": 29615 + }, + { + "epoch": 16.545251396648045, + "grad_norm": 1.0800968408584595, + "learning_rate": 0.00017361344537815128, + "loss": 0.3403, + "step": 29616 + }, + { + "epoch": 16.54581005586592, + "grad_norm": 0.517621636390686, + "learning_rate": 0.00017358543417366948, + "loss": 0.3453, + "step": 29617 + }, + { + "epoch": 16.546368715083798, + "grad_norm": 1.2114115953445435, + "learning_rate": 0.00017355742296918766, + "loss": 0.3807, + "step": 29618 + }, + { + "epoch": 16.546927374301674, + "grad_norm": 0.39131876826286316, + "learning_rate": 0.00017352941176470587, + "loss": 0.3461, + "step": 29619 + }, + { + "epoch": 16.547486033519554, + "grad_norm": 1.1608264446258545, + "learning_rate": 0.0001735014005602241, + "loss": 0.5039, + "step": 29620 + }, + { + "epoch": 16.54804469273743, + "grad_norm": 0.32489052414894104, + "learning_rate": 0.0001734733893557423, + "loss": 0.4094, + "step": 29621 + }, + { + "epoch": 16.548603351955308, + "grad_norm": 0.4785040318965912, + "learning_rate": 0.0001734453781512605, + "loss": 0.4733, + "step": 29622 + }, + { + "epoch": 16.549162011173184, + "grad_norm": 0.37516117095947266, + "learning_rate": 0.0001734173669467787, + "loss": 0.3945, + "step": 29623 + }, + { + "epoch": 16.54972067039106, + "grad_norm": 0.4778275191783905, + "learning_rate": 0.00017338935574229692, + "loss": 0.4298, + "step": 29624 + }, + { + "epoch": 16.550279329608937, + "grad_norm": 0.4768771231174469, + "learning_rate": 0.00017336134453781513, + "loss": 0.3231, + "step": 29625 + }, + { + "epoch": 16.550837988826817, + "grad_norm": 0.48715725541114807, + "learning_rate": 0.00017333333333333334, + "loss": 0.4273, + "step": 29626 + }, + { + "epoch": 16.551396648044694, + "grad_norm": 0.6316798329353333, + "learning_rate": 0.00017330532212885157, + "loss": 0.4241, + "step": 29627 + }, + { + "epoch": 16.55195530726257, + "grad_norm": 1.118650197982788, + "learning_rate": 0.00017327731092436975, + "loss": 0.4511, + "step": 29628 + }, + { + "epoch": 16.552513966480447, + "grad_norm": 2.3481647968292236, + "learning_rate": 0.00017324929971988795, + "loss": 0.4329, + "step": 29629 + }, + { + "epoch": 16.553072625698324, + "grad_norm": 0.3932090103626251, + "learning_rate": 0.00017322128851540616, + "loss": 0.4335, + "step": 29630 + }, + { + "epoch": 16.5536312849162, + "grad_norm": 0.5164644718170166, + "learning_rate": 0.0001731932773109244, + "loss": 0.514, + "step": 29631 + }, + { + "epoch": 16.554189944134077, + "grad_norm": 0.6813526153564453, + "learning_rate": 0.0001731652661064426, + "loss": 0.4278, + "step": 29632 + }, + { + "epoch": 16.554748603351957, + "grad_norm": 0.5018157958984375, + "learning_rate": 0.00017313725490196078, + "loss": 0.4374, + "step": 29633 + }, + { + "epoch": 16.555307262569833, + "grad_norm": 0.4966731369495392, + "learning_rate": 0.00017310924369747898, + "loss": 0.5462, + "step": 29634 + }, + { + "epoch": 16.55586592178771, + "grad_norm": 0.307049959897995, + "learning_rate": 0.00017308123249299722, + "loss": 0.3618, + "step": 29635 + }, + { + "epoch": 16.556424581005587, + "grad_norm": 0.5479479432106018, + "learning_rate": 0.00017305322128851542, + "loss": 0.355, + "step": 29636 + }, + { + "epoch": 16.556983240223463, + "grad_norm": 1.2515720129013062, + "learning_rate": 0.00017302521008403363, + "loss": 0.3501, + "step": 29637 + }, + { + "epoch": 16.55754189944134, + "grad_norm": 0.7564569711685181, + "learning_rate": 0.0001729971988795518, + "loss": 0.4521, + "step": 29638 + }, + { + "epoch": 16.558100558659216, + "grad_norm": 0.6315253376960754, + "learning_rate": 0.00017296918767507004, + "loss": 0.4293, + "step": 29639 + }, + { + "epoch": 16.558659217877096, + "grad_norm": 0.4353763163089752, + "learning_rate": 0.00017294117647058825, + "loss": 0.2829, + "step": 29640 + }, + { + "epoch": 16.559217877094973, + "grad_norm": 0.518596887588501, + "learning_rate": 0.00017291316526610645, + "loss": 0.3557, + "step": 29641 + }, + { + "epoch": 16.55977653631285, + "grad_norm": 0.5975028872489929, + "learning_rate": 0.00017288515406162466, + "loss": 0.3438, + "step": 29642 + }, + { + "epoch": 16.560335195530726, + "grad_norm": 0.4311278164386749, + "learning_rate": 0.00017285714285714287, + "loss": 0.3413, + "step": 29643 + }, + { + "epoch": 16.560893854748603, + "grad_norm": 0.42580920457839966, + "learning_rate": 0.00017282913165266107, + "loss": 0.4041, + "step": 29644 + }, + { + "epoch": 16.56145251396648, + "grad_norm": 0.40019556879997253, + "learning_rate": 0.00017280112044817928, + "loss": 0.4361, + "step": 29645 + }, + { + "epoch": 16.56201117318436, + "grad_norm": 1.3777707815170288, + "learning_rate": 0.00017277310924369748, + "loss": 0.3613, + "step": 29646 + }, + { + "epoch": 16.562569832402236, + "grad_norm": 0.45521843433380127, + "learning_rate": 0.00017274509803921572, + "loss": 0.4192, + "step": 29647 + }, + { + "epoch": 16.563128491620112, + "grad_norm": 0.9244868159294128, + "learning_rate": 0.0001727170868347339, + "loss": 0.501, + "step": 29648 + }, + { + "epoch": 16.56368715083799, + "grad_norm": 1.7845511436462402, + "learning_rate": 0.0001726890756302521, + "loss": 0.432, + "step": 29649 + }, + { + "epoch": 16.564245810055866, + "grad_norm": 0.7052385210990906, + "learning_rate": 0.0001726610644257703, + "loss": 0.5091, + "step": 29650 + }, + { + "epoch": 16.564804469273742, + "grad_norm": 1.2290674448013306, + "learning_rate": 0.00017263305322128854, + "loss": 0.5519, + "step": 29651 + }, + { + "epoch": 16.56536312849162, + "grad_norm": 2.1546733379364014, + "learning_rate": 0.00017260504201680672, + "loss": 0.4083, + "step": 29652 + }, + { + "epoch": 16.5659217877095, + "grad_norm": 0.5240933299064636, + "learning_rate": 0.00017257703081232493, + "loss": 0.4367, + "step": 29653 + }, + { + "epoch": 16.566480446927375, + "grad_norm": 1.0836304426193237, + "learning_rate": 0.00017254901960784313, + "loss": 0.5049, + "step": 29654 + }, + { + "epoch": 16.567039106145252, + "grad_norm": 0.48378872871398926, + "learning_rate": 0.00017252100840336136, + "loss": 0.4602, + "step": 29655 + }, + { + "epoch": 16.56759776536313, + "grad_norm": 0.6799153089523315, + "learning_rate": 0.00017249299719887957, + "loss": 0.4034, + "step": 29656 + }, + { + "epoch": 16.568156424581005, + "grad_norm": 1.4692351818084717, + "learning_rate": 0.00017246498599439775, + "loss": 0.5495, + "step": 29657 + }, + { + "epoch": 16.56871508379888, + "grad_norm": 0.4837232828140259, + "learning_rate": 0.00017243697478991596, + "loss": 0.3757, + "step": 29658 + }, + { + "epoch": 16.56927374301676, + "grad_norm": 0.39382028579711914, + "learning_rate": 0.0001724089635854342, + "loss": 0.3576, + "step": 29659 + }, + { + "epoch": 16.56983240223464, + "grad_norm": 0.4457899034023285, + "learning_rate": 0.0001723809523809524, + "loss": 0.3659, + "step": 29660 + }, + { + "epoch": 16.570391061452515, + "grad_norm": 0.5575380325317383, + "learning_rate": 0.0001723529411764706, + "loss": 0.478, + "step": 29661 + }, + { + "epoch": 16.57094972067039, + "grad_norm": 0.9752169847488403, + "learning_rate": 0.00017232492997198878, + "loss": 0.3735, + "step": 29662 + }, + { + "epoch": 16.571508379888268, + "grad_norm": 0.40215227007865906, + "learning_rate": 0.000172296918767507, + "loss": 0.4444, + "step": 29663 + }, + { + "epoch": 16.572067039106145, + "grad_norm": 0.5985216498374939, + "learning_rate": 0.00017226890756302522, + "loss": 0.3131, + "step": 29664 + }, + { + "epoch": 16.57262569832402, + "grad_norm": 0.5278451442718506, + "learning_rate": 0.00017224089635854342, + "loss": 0.3822, + "step": 29665 + }, + { + "epoch": 16.573184357541898, + "grad_norm": 0.4382477104663849, + "learning_rate": 0.00017221288515406163, + "loss": 0.4592, + "step": 29666 + }, + { + "epoch": 16.573743016759778, + "grad_norm": 0.9168773889541626, + "learning_rate": 0.00017218487394957984, + "loss": 0.5365, + "step": 29667 + }, + { + "epoch": 16.574301675977654, + "grad_norm": 0.41254740953445435, + "learning_rate": 0.00017215686274509804, + "loss": 0.3662, + "step": 29668 + }, + { + "epoch": 16.57486033519553, + "grad_norm": 0.7508476972579956, + "learning_rate": 0.00017212885154061625, + "loss": 0.6102, + "step": 29669 + }, + { + "epoch": 16.575418994413408, + "grad_norm": 0.4705517590045929, + "learning_rate": 0.00017210084033613445, + "loss": 0.4243, + "step": 29670 + }, + { + "epoch": 16.575977653631284, + "grad_norm": 0.5159240365028381, + "learning_rate": 0.0001720728291316527, + "loss": 0.3668, + "step": 29671 + }, + { + "epoch": 16.57653631284916, + "grad_norm": 2.45832896232605, + "learning_rate": 0.00017204481792717087, + "loss": 0.3968, + "step": 29672 + }, + { + "epoch": 16.577094972067037, + "grad_norm": 0.47152748703956604, + "learning_rate": 0.00017201680672268907, + "loss": 0.3763, + "step": 29673 + }, + { + "epoch": 16.577653631284917, + "grad_norm": 0.49372634291648865, + "learning_rate": 0.00017198879551820728, + "loss": 0.3271, + "step": 29674 + }, + { + "epoch": 16.578212290502794, + "grad_norm": 0.37817874550819397, + "learning_rate": 0.0001719607843137255, + "loss": 0.4885, + "step": 29675 + }, + { + "epoch": 16.57877094972067, + "grad_norm": 0.7956822514533997, + "learning_rate": 0.00017193277310924372, + "loss": 0.4818, + "step": 29676 + }, + { + "epoch": 16.579329608938547, + "grad_norm": 0.4049944281578064, + "learning_rate": 0.0001719047619047619, + "loss": 0.3986, + "step": 29677 + }, + { + "epoch": 16.579888268156424, + "grad_norm": 0.4858841001987457, + "learning_rate": 0.0001718767507002801, + "loss": 0.3722, + "step": 29678 + }, + { + "epoch": 16.5804469273743, + "grad_norm": 0.5916111469268799, + "learning_rate": 0.00017184873949579834, + "loss": 0.4235, + "step": 29679 + }, + { + "epoch": 16.58100558659218, + "grad_norm": 0.6348965167999268, + "learning_rate": 0.00017182072829131654, + "loss": 0.4009, + "step": 29680 + }, + { + "epoch": 16.581564245810057, + "grad_norm": 0.45960313081741333, + "learning_rate": 0.00017179271708683475, + "loss": 0.4042, + "step": 29681 + }, + { + "epoch": 16.582122905027934, + "grad_norm": 0.4433102309703827, + "learning_rate": 0.00017176470588235293, + "loss": 0.3529, + "step": 29682 + }, + { + "epoch": 16.58268156424581, + "grad_norm": 0.34756675362586975, + "learning_rate": 0.00017173669467787116, + "loss": 0.3749, + "step": 29683 + }, + { + "epoch": 16.583240223463687, + "grad_norm": 0.4158267676830292, + "learning_rate": 0.00017170868347338937, + "loss": 0.44, + "step": 29684 + }, + { + "epoch": 16.583798882681563, + "grad_norm": 0.6636767983436584, + "learning_rate": 0.00017168067226890757, + "loss": 0.3907, + "step": 29685 + }, + { + "epoch": 16.58435754189944, + "grad_norm": 3.3351364135742188, + "learning_rate": 0.00017165266106442578, + "loss": 0.3563, + "step": 29686 + }, + { + "epoch": 16.58491620111732, + "grad_norm": 0.5583810210227966, + "learning_rate": 0.00017162464985994398, + "loss": 0.3593, + "step": 29687 + }, + { + "epoch": 16.585474860335196, + "grad_norm": 0.4328242540359497, + "learning_rate": 0.0001715966386554622, + "loss": 0.4542, + "step": 29688 + }, + { + "epoch": 16.586033519553073, + "grad_norm": 2.0626401901245117, + "learning_rate": 0.0001715686274509804, + "loss": 0.7094, + "step": 29689 + }, + { + "epoch": 16.58659217877095, + "grad_norm": 0.37423473596572876, + "learning_rate": 0.0001715406162464986, + "loss": 0.3501, + "step": 29690 + }, + { + "epoch": 16.587150837988826, + "grad_norm": 0.5140286087989807, + "learning_rate": 0.00017151260504201683, + "loss": 0.3606, + "step": 29691 + }, + { + "epoch": 16.587709497206703, + "grad_norm": 0.5087933540344238, + "learning_rate": 0.000171484593837535, + "loss": 0.3522, + "step": 29692 + }, + { + "epoch": 16.58826815642458, + "grad_norm": 0.5980960726737976, + "learning_rate": 0.00017145658263305322, + "loss": 0.4719, + "step": 29693 + }, + { + "epoch": 16.58882681564246, + "grad_norm": 1.4996987581253052, + "learning_rate": 0.00017142857142857143, + "loss": 0.512, + "step": 29694 + }, + { + "epoch": 16.589385474860336, + "grad_norm": 0.6197682619094849, + "learning_rate": 0.00017140056022408966, + "loss": 0.6934, + "step": 29695 + }, + { + "epoch": 16.589944134078213, + "grad_norm": 1.5981677770614624, + "learning_rate": 0.00017137254901960786, + "loss": 0.5227, + "step": 29696 + }, + { + "epoch": 16.59050279329609, + "grad_norm": 1.542922854423523, + "learning_rate": 0.00017134453781512604, + "loss": 0.5042, + "step": 29697 + }, + { + "epoch": 16.591061452513966, + "grad_norm": 0.36248210072517395, + "learning_rate": 0.00017131652661064425, + "loss": 0.3431, + "step": 29698 + }, + { + "epoch": 16.591620111731842, + "grad_norm": 0.4703643023967743, + "learning_rate": 0.00017128851540616248, + "loss": 0.4712, + "step": 29699 + }, + { + "epoch": 16.592178770949722, + "grad_norm": 0.5619949102401733, + "learning_rate": 0.0001712605042016807, + "loss": 0.3888, + "step": 29700 + }, + { + "epoch": 16.5927374301676, + "grad_norm": 0.42303141951560974, + "learning_rate": 0.0001712324929971989, + "loss": 0.4616, + "step": 29701 + }, + { + "epoch": 16.593296089385476, + "grad_norm": 0.44406846165657043, + "learning_rate": 0.00017120448179271707, + "loss": 0.4879, + "step": 29702 + }, + { + "epoch": 16.593854748603352, + "grad_norm": 0.8510697484016418, + "learning_rate": 0.0001711764705882353, + "loss": 0.4943, + "step": 29703 + }, + { + "epoch": 16.59441340782123, + "grad_norm": 0.3986489772796631, + "learning_rate": 0.0001711484593837535, + "loss": 0.3103, + "step": 29704 + }, + { + "epoch": 16.594972067039105, + "grad_norm": 0.4285847246646881, + "learning_rate": 0.00017112044817927172, + "loss": 0.4788, + "step": 29705 + }, + { + "epoch": 16.595530726256982, + "grad_norm": 0.33925896883010864, + "learning_rate": 0.0001710924369747899, + "loss": 0.4475, + "step": 29706 + }, + { + "epoch": 16.596089385474862, + "grad_norm": 4.6819586753845215, + "learning_rate": 0.00017106442577030813, + "loss": 0.4485, + "step": 29707 + }, + { + "epoch": 16.59664804469274, + "grad_norm": 0.4595019519329071, + "learning_rate": 0.00017103641456582634, + "loss": 0.3516, + "step": 29708 + }, + { + "epoch": 16.597206703910615, + "grad_norm": 0.4340721368789673, + "learning_rate": 0.00017100840336134454, + "loss": 0.2841, + "step": 29709 + }, + { + "epoch": 16.59776536312849, + "grad_norm": 0.5420017838478088, + "learning_rate": 0.00017098039215686275, + "loss": 0.3986, + "step": 29710 + }, + { + "epoch": 16.598324022346368, + "grad_norm": 0.5166442394256592, + "learning_rate": 0.00017095238095238095, + "loss": 0.3902, + "step": 29711 + }, + { + "epoch": 16.598882681564245, + "grad_norm": 0.38179582357406616, + "learning_rate": 0.00017092436974789916, + "loss": 0.3904, + "step": 29712 + }, + { + "epoch": 16.59944134078212, + "grad_norm": 0.36862191557884216, + "learning_rate": 0.00017089635854341737, + "loss": 0.3877, + "step": 29713 + }, + { + "epoch": 16.6, + "grad_norm": 0.6554352641105652, + "learning_rate": 0.00017086834733893557, + "loss": 0.4739, + "step": 29714 + }, + { + "epoch": 16.600558659217878, + "grad_norm": 1.3777062892913818, + "learning_rate": 0.0001708403361344538, + "loss": 0.8374, + "step": 29715 + }, + { + "epoch": 16.601117318435755, + "grad_norm": 0.5427790880203247, + "learning_rate": 0.00017081232492997198, + "loss": 0.4328, + "step": 29716 + }, + { + "epoch": 16.60167597765363, + "grad_norm": 0.4491827189922333, + "learning_rate": 0.0001707843137254902, + "loss": 0.3879, + "step": 29717 + }, + { + "epoch": 16.602234636871508, + "grad_norm": 0.4406115710735321, + "learning_rate": 0.0001707563025210084, + "loss": 0.3991, + "step": 29718 + }, + { + "epoch": 16.602793296089384, + "grad_norm": 0.5698709487915039, + "learning_rate": 0.00017072829131652663, + "loss": 0.6151, + "step": 29719 + }, + { + "epoch": 16.60335195530726, + "grad_norm": 0.4022897183895111, + "learning_rate": 0.00017070028011204484, + "loss": 0.3703, + "step": 29720 + }, + { + "epoch": 16.60391061452514, + "grad_norm": 0.3980019986629486, + "learning_rate": 0.00017067226890756301, + "loss": 0.4458, + "step": 29721 + }, + { + "epoch": 16.604469273743018, + "grad_norm": 3.4253838062286377, + "learning_rate": 0.00017064425770308122, + "loss": 0.4251, + "step": 29722 + }, + { + "epoch": 16.605027932960894, + "grad_norm": 0.35719001293182373, + "learning_rate": 0.00017061624649859945, + "loss": 0.37, + "step": 29723 + }, + { + "epoch": 16.60558659217877, + "grad_norm": 0.5619191527366638, + "learning_rate": 0.00017058823529411766, + "loss": 0.352, + "step": 29724 + }, + { + "epoch": 16.606145251396647, + "grad_norm": 0.34911277890205383, + "learning_rate": 0.00017056022408963587, + "loss": 0.3919, + "step": 29725 + }, + { + "epoch": 16.606703910614524, + "grad_norm": 0.5471906661987305, + "learning_rate": 0.00017053221288515404, + "loss": 0.5871, + "step": 29726 + }, + { + "epoch": 16.607262569832404, + "grad_norm": 0.4728212058544159, + "learning_rate": 0.00017050420168067228, + "loss": 0.3922, + "step": 29727 + }, + { + "epoch": 16.60782122905028, + "grad_norm": 0.344344824552536, + "learning_rate": 0.00017047619047619048, + "loss": 0.3806, + "step": 29728 + }, + { + "epoch": 16.608379888268157, + "grad_norm": 0.9261954426765442, + "learning_rate": 0.0001704481792717087, + "loss": 0.4156, + "step": 29729 + }, + { + "epoch": 16.608938547486034, + "grad_norm": 0.3845740556716919, + "learning_rate": 0.0001704201680672269, + "loss": 0.3719, + "step": 29730 + }, + { + "epoch": 16.60949720670391, + "grad_norm": 0.3786081373691559, + "learning_rate": 0.0001703921568627451, + "loss": 0.4619, + "step": 29731 + }, + { + "epoch": 16.610055865921787, + "grad_norm": 9.380817413330078, + "learning_rate": 0.0001703641456582633, + "loss": 0.4, + "step": 29732 + }, + { + "epoch": 16.610614525139663, + "grad_norm": 1.563375473022461, + "learning_rate": 0.0001703361344537815, + "loss": 0.5395, + "step": 29733 + }, + { + "epoch": 16.611173184357543, + "grad_norm": 0.7948753237724304, + "learning_rate": 0.00017030812324929972, + "loss": 0.3641, + "step": 29734 + }, + { + "epoch": 16.61173184357542, + "grad_norm": 0.8591349124908447, + "learning_rate": 0.00017028011204481795, + "loss": 0.4049, + "step": 29735 + }, + { + "epoch": 16.612290502793297, + "grad_norm": 0.8327168822288513, + "learning_rate": 0.00017025210084033613, + "loss": 0.5116, + "step": 29736 + }, + { + "epoch": 16.612849162011173, + "grad_norm": 0.4438212215900421, + "learning_rate": 0.00017022408963585434, + "loss": 0.3541, + "step": 29737 + }, + { + "epoch": 16.61340782122905, + "grad_norm": 0.5014461278915405, + "learning_rate": 0.00017019607843137254, + "loss": 0.3421, + "step": 29738 + }, + { + "epoch": 16.613966480446926, + "grad_norm": 1.445834994316101, + "learning_rate": 0.00017016806722689078, + "loss": 0.3864, + "step": 29739 + }, + { + "epoch": 16.614525139664803, + "grad_norm": 0.4010203778743744, + "learning_rate": 0.00017014005602240898, + "loss": 0.4058, + "step": 29740 + }, + { + "epoch": 16.615083798882683, + "grad_norm": 1.4026222229003906, + "learning_rate": 0.00017011204481792716, + "loss": 0.3821, + "step": 29741 + }, + { + "epoch": 16.61564245810056, + "grad_norm": 1.3032084703445435, + "learning_rate": 0.00017008403361344537, + "loss": 0.5126, + "step": 29742 + }, + { + "epoch": 16.616201117318436, + "grad_norm": 0.4361550509929657, + "learning_rate": 0.0001700560224089636, + "loss": 0.4642, + "step": 29743 + }, + { + "epoch": 16.616759776536313, + "grad_norm": 0.4836312234401703, + "learning_rate": 0.0001700280112044818, + "loss": 0.3256, + "step": 29744 + }, + { + "epoch": 16.61731843575419, + "grad_norm": 0.4750947654247284, + "learning_rate": 0.00017, + "loss": 0.4166, + "step": 29745 + }, + { + "epoch": 16.617877094972066, + "grad_norm": 1.03825044631958, + "learning_rate": 0.0001699719887955182, + "loss": 0.4849, + "step": 29746 + }, + { + "epoch": 16.618435754189946, + "grad_norm": 0.4988580346107483, + "learning_rate": 0.00016994397759103642, + "loss": 0.4587, + "step": 29747 + }, + { + "epoch": 16.618994413407822, + "grad_norm": 0.8459121584892273, + "learning_rate": 0.00016991596638655463, + "loss": 0.4798, + "step": 29748 + }, + { + "epoch": 16.6195530726257, + "grad_norm": 0.3887941837310791, + "learning_rate": 0.00016988795518207284, + "loss": 0.3233, + "step": 29749 + }, + { + "epoch": 16.620111731843576, + "grad_norm": 0.6093942523002625, + "learning_rate": 0.00016985994397759104, + "loss": 0.3486, + "step": 29750 + }, + { + "epoch": 16.620670391061452, + "grad_norm": 0.3763768970966339, + "learning_rate": 0.00016983193277310925, + "loss": 0.4905, + "step": 29751 + }, + { + "epoch": 16.62122905027933, + "grad_norm": 0.47057199478149414, + "learning_rate": 0.00016980392156862745, + "loss": 0.4818, + "step": 29752 + }, + { + "epoch": 16.621787709497205, + "grad_norm": 0.46999314427375793, + "learning_rate": 0.00016977591036414566, + "loss": 0.3309, + "step": 29753 + }, + { + "epoch": 16.622346368715085, + "grad_norm": 0.4252774715423584, + "learning_rate": 0.00016974789915966387, + "loss": 0.3791, + "step": 29754 + }, + { + "epoch": 16.622905027932962, + "grad_norm": 0.33144935965538025, + "learning_rate": 0.0001697198879551821, + "loss": 0.3416, + "step": 29755 + }, + { + "epoch": 16.62346368715084, + "grad_norm": 1.0748701095581055, + "learning_rate": 0.00016969187675070028, + "loss": 0.4465, + "step": 29756 + }, + { + "epoch": 16.624022346368715, + "grad_norm": 0.46357840299606323, + "learning_rate": 0.00016966386554621848, + "loss": 0.5094, + "step": 29757 + }, + { + "epoch": 16.62458100558659, + "grad_norm": 1.2137809991836548, + "learning_rate": 0.0001696358543417367, + "loss": 0.4933, + "step": 29758 + }, + { + "epoch": 16.62513966480447, + "grad_norm": 0.48791569471359253, + "learning_rate": 0.00016960784313725492, + "loss": 0.4891, + "step": 29759 + }, + { + "epoch": 16.625698324022345, + "grad_norm": 0.6240214705467224, + "learning_rate": 0.00016957983193277313, + "loss": 0.462, + "step": 29760 + }, + { + "epoch": 16.626256983240225, + "grad_norm": 0.4574168920516968, + "learning_rate": 0.0001695518207282913, + "loss": 0.4368, + "step": 29761 + }, + { + "epoch": 16.6268156424581, + "grad_norm": 2.8725969791412354, + "learning_rate": 0.00016952380952380951, + "loss": 0.3746, + "step": 29762 + }, + { + "epoch": 16.627374301675978, + "grad_norm": 0.3899432420730591, + "learning_rate": 0.00016949579831932775, + "loss": 0.3437, + "step": 29763 + }, + { + "epoch": 16.627932960893855, + "grad_norm": 0.48070505261421204, + "learning_rate": 0.00016946778711484595, + "loss": 0.4328, + "step": 29764 + }, + { + "epoch": 16.62849162011173, + "grad_norm": 0.9653042554855347, + "learning_rate": 0.00016943977591036413, + "loss": 0.4131, + "step": 29765 + }, + { + "epoch": 16.629050279329608, + "grad_norm": 0.5093980431556702, + "learning_rate": 0.00016941176470588234, + "loss": 0.3805, + "step": 29766 + }, + { + "epoch": 16.629608938547484, + "grad_norm": 0.5078274011611938, + "learning_rate": 0.00016938375350140057, + "loss": 0.5362, + "step": 29767 + }, + { + "epoch": 16.630167597765364, + "grad_norm": 0.3415151834487915, + "learning_rate": 0.00016935574229691878, + "loss": 0.3357, + "step": 29768 + }, + { + "epoch": 16.63072625698324, + "grad_norm": 1.2921910285949707, + "learning_rate": 0.00016932773109243698, + "loss": 0.4118, + "step": 29769 + }, + { + "epoch": 16.631284916201118, + "grad_norm": 0.8225568532943726, + "learning_rate": 0.00016929971988795516, + "loss": 0.6127, + "step": 29770 + }, + { + "epoch": 16.631843575418994, + "grad_norm": 0.41550955176353455, + "learning_rate": 0.0001692717086834734, + "loss": 0.4673, + "step": 29771 + }, + { + "epoch": 16.63240223463687, + "grad_norm": 0.5405202507972717, + "learning_rate": 0.0001692436974789916, + "loss": 0.3966, + "step": 29772 + }, + { + "epoch": 16.632960893854747, + "grad_norm": 0.2869426906108856, + "learning_rate": 0.0001692156862745098, + "loss": 0.3843, + "step": 29773 + }, + { + "epoch": 16.633519553072627, + "grad_norm": 0.6287881135940552, + "learning_rate": 0.000169187675070028, + "loss": 0.4232, + "step": 29774 + }, + { + "epoch": 16.634078212290504, + "grad_norm": 0.6040198802947998, + "learning_rate": 0.00016915966386554622, + "loss": 0.3733, + "step": 29775 + }, + { + "epoch": 16.63463687150838, + "grad_norm": 0.47310498356819153, + "learning_rate": 0.00016913165266106443, + "loss": 0.5953, + "step": 29776 + }, + { + "epoch": 16.635195530726257, + "grad_norm": 0.2993278205394745, + "learning_rate": 0.00016910364145658263, + "loss": 0.3394, + "step": 29777 + }, + { + "epoch": 16.635754189944134, + "grad_norm": 0.4342913031578064, + "learning_rate": 0.00016907563025210084, + "loss": 0.3745, + "step": 29778 + }, + { + "epoch": 16.63631284916201, + "grad_norm": 0.4923255145549774, + "learning_rate": 0.00016904761904761907, + "loss": 0.5028, + "step": 29779 + }, + { + "epoch": 16.636871508379887, + "grad_norm": 0.4501801133155823, + "learning_rate": 0.00016901960784313725, + "loss": 0.3889, + "step": 29780 + }, + { + "epoch": 16.637430167597767, + "grad_norm": 0.31149882078170776, + "learning_rate": 0.00016899159663865546, + "loss": 0.3579, + "step": 29781 + }, + { + "epoch": 16.637988826815644, + "grad_norm": 1.1276620626449585, + "learning_rate": 0.00016896358543417366, + "loss": 0.3462, + "step": 29782 + }, + { + "epoch": 16.63854748603352, + "grad_norm": 0.6334343552589417, + "learning_rate": 0.0001689355742296919, + "loss": 0.4647, + "step": 29783 + }, + { + "epoch": 16.639106145251397, + "grad_norm": 0.36434948444366455, + "learning_rate": 0.0001689075630252101, + "loss": 0.3855, + "step": 29784 + }, + { + "epoch": 16.639664804469273, + "grad_norm": 2.5770599842071533, + "learning_rate": 0.00016887955182072828, + "loss": 0.3896, + "step": 29785 + }, + { + "epoch": 16.64022346368715, + "grad_norm": 0.4854937195777893, + "learning_rate": 0.00016885154061624649, + "loss": 0.4692, + "step": 29786 + }, + { + "epoch": 16.640782122905026, + "grad_norm": 0.4809033274650574, + "learning_rate": 0.00016882352941176472, + "loss": 0.4189, + "step": 29787 + }, + { + "epoch": 16.641340782122906, + "grad_norm": 0.5006118416786194, + "learning_rate": 0.00016879551820728292, + "loss": 0.4356, + "step": 29788 + }, + { + "epoch": 16.641899441340783, + "grad_norm": 0.5263915657997131, + "learning_rate": 0.00016876750700280113, + "loss": 0.3806, + "step": 29789 + }, + { + "epoch": 16.64245810055866, + "grad_norm": 0.6017934679985046, + "learning_rate": 0.0001687394957983193, + "loss": 0.2537, + "step": 29790 + }, + { + "epoch": 16.643016759776536, + "grad_norm": 0.6212185621261597, + "learning_rate": 0.00016871148459383754, + "loss": 0.5214, + "step": 29791 + }, + { + "epoch": 16.643575418994413, + "grad_norm": 0.4498072862625122, + "learning_rate": 0.00016868347338935575, + "loss": 0.3417, + "step": 29792 + }, + { + "epoch": 16.64413407821229, + "grad_norm": 0.418268084526062, + "learning_rate": 0.00016865546218487395, + "loss": 0.4021, + "step": 29793 + }, + { + "epoch": 16.64469273743017, + "grad_norm": 0.3742298483848572, + "learning_rate": 0.00016862745098039216, + "loss": 0.3386, + "step": 29794 + }, + { + "epoch": 16.645251396648046, + "grad_norm": 0.4824741780757904, + "learning_rate": 0.00016859943977591037, + "loss": 0.3532, + "step": 29795 + }, + { + "epoch": 16.645810055865923, + "grad_norm": 0.3796785771846771, + "learning_rate": 0.00016857142857142857, + "loss": 0.3778, + "step": 29796 + }, + { + "epoch": 16.6463687150838, + "grad_norm": 0.3775938153266907, + "learning_rate": 0.00016854341736694678, + "loss": 0.4137, + "step": 29797 + }, + { + "epoch": 16.646927374301676, + "grad_norm": 0.42650899291038513, + "learning_rate": 0.00016851540616246498, + "loss": 0.3709, + "step": 29798 + }, + { + "epoch": 16.647486033519552, + "grad_norm": 0.4216354787349701, + "learning_rate": 0.00016848739495798322, + "loss": 0.3708, + "step": 29799 + }, + { + "epoch": 16.64804469273743, + "grad_norm": 0.2871125042438507, + "learning_rate": 0.0001684593837535014, + "loss": 0.3613, + "step": 29800 + }, + { + "epoch": 16.64860335195531, + "grad_norm": 0.393095999956131, + "learning_rate": 0.0001684313725490196, + "loss": 0.3516, + "step": 29801 + }, + { + "epoch": 16.649162011173186, + "grad_norm": 0.3895621597766876, + "learning_rate": 0.0001684033613445378, + "loss": 0.4114, + "step": 29802 + }, + { + "epoch": 16.649720670391062, + "grad_norm": 0.35037121176719666, + "learning_rate": 0.00016837535014005604, + "loss": 0.4198, + "step": 29803 + }, + { + "epoch": 16.65027932960894, + "grad_norm": 0.5421853065490723, + "learning_rate": 0.00016834733893557425, + "loss": 0.5941, + "step": 29804 + }, + { + "epoch": 16.650837988826815, + "grad_norm": 0.845901370048523, + "learning_rate": 0.00016831932773109243, + "loss": 0.4613, + "step": 29805 + }, + { + "epoch": 16.65139664804469, + "grad_norm": 0.485249787569046, + "learning_rate": 0.00016829131652661063, + "loss": 0.3706, + "step": 29806 + }, + { + "epoch": 16.65195530726257, + "grad_norm": 0.45605310797691345, + "learning_rate": 0.00016826330532212887, + "loss": 0.3898, + "step": 29807 + }, + { + "epoch": 16.65251396648045, + "grad_norm": 0.4363083839416504, + "learning_rate": 0.00016823529411764707, + "loss": 0.3734, + "step": 29808 + }, + { + "epoch": 16.653072625698325, + "grad_norm": 0.3861304223537445, + "learning_rate": 0.00016820728291316528, + "loss": 0.3746, + "step": 29809 + }, + { + "epoch": 16.6536312849162, + "grad_norm": 0.939969003200531, + "learning_rate": 0.00016817927170868346, + "loss": 0.332, + "step": 29810 + }, + { + "epoch": 16.654189944134078, + "grad_norm": 0.42532482743263245, + "learning_rate": 0.0001681512605042017, + "loss": 0.3903, + "step": 29811 + }, + { + "epoch": 16.654748603351955, + "grad_norm": 0.3521217703819275, + "learning_rate": 0.0001681232492997199, + "loss": 0.3263, + "step": 29812 + }, + { + "epoch": 16.65530726256983, + "grad_norm": 3.500936508178711, + "learning_rate": 0.0001680952380952381, + "loss": 0.4264, + "step": 29813 + }, + { + "epoch": 16.655865921787708, + "grad_norm": 0.46213197708129883, + "learning_rate": 0.0001680672268907563, + "loss": 0.4305, + "step": 29814 + }, + { + "epoch": 16.656424581005588, + "grad_norm": 0.6043916344642639, + "learning_rate": 0.0001680392156862745, + "loss": 0.3263, + "step": 29815 + }, + { + "epoch": 16.656983240223465, + "grad_norm": 0.3290908634662628, + "learning_rate": 0.00016801120448179272, + "loss": 0.2953, + "step": 29816 + }, + { + "epoch": 16.65754189944134, + "grad_norm": 0.45729339122772217, + "learning_rate": 0.00016798319327731093, + "loss": 0.483, + "step": 29817 + }, + { + "epoch": 16.658100558659218, + "grad_norm": 0.4936226010322571, + "learning_rate": 0.00016795518207282913, + "loss": 0.4003, + "step": 29818 + }, + { + "epoch": 16.658659217877094, + "grad_norm": 0.5775694847106934, + "learning_rate": 0.00016792717086834734, + "loss": 0.3776, + "step": 29819 + }, + { + "epoch": 16.65921787709497, + "grad_norm": 0.38737642765045166, + "learning_rate": 0.00016789915966386554, + "loss": 0.3505, + "step": 29820 + }, + { + "epoch": 16.659776536312847, + "grad_norm": 0.4338393807411194, + "learning_rate": 0.00016787114845938375, + "loss": 0.391, + "step": 29821 + }, + { + "epoch": 16.660335195530728, + "grad_norm": 0.39069968461990356, + "learning_rate": 0.00016784313725490196, + "loss": 0.3365, + "step": 29822 + }, + { + "epoch": 16.660893854748604, + "grad_norm": 0.5240722298622131, + "learning_rate": 0.0001678151260504202, + "loss": 0.4258, + "step": 29823 + }, + { + "epoch": 16.66145251396648, + "grad_norm": 2.5170700550079346, + "learning_rate": 0.00016778711484593837, + "loss": 0.3693, + "step": 29824 + }, + { + "epoch": 16.662011173184357, + "grad_norm": 0.520591139793396, + "learning_rate": 0.00016775910364145657, + "loss": 0.4493, + "step": 29825 + }, + { + "epoch": 16.662569832402234, + "grad_norm": 0.37775611877441406, + "learning_rate": 0.00016773109243697478, + "loss": 0.4546, + "step": 29826 + }, + { + "epoch": 16.66312849162011, + "grad_norm": 0.9459759593009949, + "learning_rate": 0.000167703081232493, + "loss": 0.4518, + "step": 29827 + }, + { + "epoch": 16.66368715083799, + "grad_norm": 0.3614331781864166, + "learning_rate": 0.00016767507002801122, + "loss": 0.3635, + "step": 29828 + }, + { + "epoch": 16.664245810055867, + "grad_norm": 0.4394908547401428, + "learning_rate": 0.0001676470588235294, + "loss": 0.262, + "step": 29829 + }, + { + "epoch": 16.664804469273744, + "grad_norm": 4.644527912139893, + "learning_rate": 0.0001676190476190476, + "loss": 0.4004, + "step": 29830 + }, + { + "epoch": 16.66536312849162, + "grad_norm": 0.4521452784538269, + "learning_rate": 0.00016759103641456584, + "loss": 0.4257, + "step": 29831 + }, + { + "epoch": 16.665921787709497, + "grad_norm": 0.4307730197906494, + "learning_rate": 0.00016756302521008404, + "loss": 0.3873, + "step": 29832 + }, + { + "epoch": 16.666480446927373, + "grad_norm": 0.4104289412498474, + "learning_rate": 0.00016753501400560225, + "loss": 0.3279, + "step": 29833 + }, + { + "epoch": 16.66703910614525, + "grad_norm": 1.1417956352233887, + "learning_rate": 0.00016750700280112043, + "loss": 0.3868, + "step": 29834 + }, + { + "epoch": 16.66759776536313, + "grad_norm": 0.3976530134677887, + "learning_rate": 0.00016747899159663866, + "loss": 0.3356, + "step": 29835 + }, + { + "epoch": 16.668156424581007, + "grad_norm": 4.729708671569824, + "learning_rate": 0.00016745098039215687, + "loss": 0.4438, + "step": 29836 + }, + { + "epoch": 16.668715083798883, + "grad_norm": 0.4817903935909271, + "learning_rate": 0.00016742296918767507, + "loss": 0.3501, + "step": 29837 + }, + { + "epoch": 16.66927374301676, + "grad_norm": 0.38200893998146057, + "learning_rate": 0.00016739495798319328, + "loss": 0.4531, + "step": 29838 + }, + { + "epoch": 16.669832402234636, + "grad_norm": 0.48971059918403625, + "learning_rate": 0.00016736694677871148, + "loss": 0.3622, + "step": 29839 + }, + { + "epoch": 16.670391061452513, + "grad_norm": 0.9083506464958191, + "learning_rate": 0.0001673389355742297, + "loss": 0.4264, + "step": 29840 + }, + { + "epoch": 16.67094972067039, + "grad_norm": 0.3592814803123474, + "learning_rate": 0.0001673109243697479, + "loss": 0.3323, + "step": 29841 + }, + { + "epoch": 16.67150837988827, + "grad_norm": 1.5707166194915771, + "learning_rate": 0.0001672829131652661, + "loss": 0.4032, + "step": 29842 + }, + { + "epoch": 16.672067039106146, + "grad_norm": 0.3624243438243866, + "learning_rate": 0.00016725490196078434, + "loss": 0.4164, + "step": 29843 + }, + { + "epoch": 16.672625698324023, + "grad_norm": 0.5553063750267029, + "learning_rate": 0.00016722689075630251, + "loss": 0.3849, + "step": 29844 + }, + { + "epoch": 16.6731843575419, + "grad_norm": 0.3137574791908264, + "learning_rate": 0.00016719887955182072, + "loss": 0.3201, + "step": 29845 + }, + { + "epoch": 16.673743016759776, + "grad_norm": 0.39533865451812744, + "learning_rate": 0.00016717086834733893, + "loss": 0.3956, + "step": 29846 + }, + { + "epoch": 16.674301675977652, + "grad_norm": 1.0763453245162964, + "learning_rate": 0.00016714285714285716, + "loss": 0.4178, + "step": 29847 + }, + { + "epoch": 16.674860335195532, + "grad_norm": 0.4174930453300476, + "learning_rate": 0.00016711484593837537, + "loss": 0.4033, + "step": 29848 + }, + { + "epoch": 16.67541899441341, + "grad_norm": 3.591134548187256, + "learning_rate": 0.00016708683473389354, + "loss": 0.4537, + "step": 29849 + }, + { + "epoch": 16.675977653631286, + "grad_norm": 0.5874401926994324, + "learning_rate": 0.00016705882352941175, + "loss": 0.4534, + "step": 29850 + }, + { + "epoch": 16.676536312849162, + "grad_norm": 0.5107418894767761, + "learning_rate": 0.00016703081232492998, + "loss": 0.4688, + "step": 29851 + }, + { + "epoch": 16.67709497206704, + "grad_norm": 0.3686980903148651, + "learning_rate": 0.0001670028011204482, + "loss": 0.2746, + "step": 29852 + }, + { + "epoch": 16.677653631284915, + "grad_norm": 0.3866333067417145, + "learning_rate": 0.0001669747899159664, + "loss": 0.3548, + "step": 29853 + }, + { + "epoch": 16.678212290502792, + "grad_norm": 0.42212390899658203, + "learning_rate": 0.00016694677871148457, + "loss": 0.4262, + "step": 29854 + }, + { + "epoch": 16.678770949720672, + "grad_norm": 13.590853691101074, + "learning_rate": 0.0001669187675070028, + "loss": 0.426, + "step": 29855 + }, + { + "epoch": 16.67932960893855, + "grad_norm": 0.5182271003723145, + "learning_rate": 0.000166890756302521, + "loss": 0.3961, + "step": 29856 + }, + { + "epoch": 16.679888268156425, + "grad_norm": 0.4262494444847107, + "learning_rate": 0.00016686274509803922, + "loss": 0.3991, + "step": 29857 + }, + { + "epoch": 16.6804469273743, + "grad_norm": 0.3989584743976593, + "learning_rate": 0.00016683473389355745, + "loss": 0.2982, + "step": 29858 + }, + { + "epoch": 16.68100558659218, + "grad_norm": 0.48111674189567566, + "learning_rate": 0.00016680672268907563, + "loss": 0.4127, + "step": 29859 + }, + { + "epoch": 16.681564245810055, + "grad_norm": 0.7447097301483154, + "learning_rate": 0.00016677871148459384, + "loss": 0.373, + "step": 29860 + }, + { + "epoch": 16.68212290502793, + "grad_norm": 0.48700830340385437, + "learning_rate": 0.00016675070028011204, + "loss": 0.3883, + "step": 29861 + }, + { + "epoch": 16.68268156424581, + "grad_norm": 0.40347445011138916, + "learning_rate": 0.00016672268907563028, + "loss": 0.4454, + "step": 29862 + }, + { + "epoch": 16.683240223463688, + "grad_norm": 0.5050028562545776, + "learning_rate": 0.00016669467787114848, + "loss": 0.4196, + "step": 29863 + }, + { + "epoch": 16.683798882681565, + "grad_norm": 0.4193721115589142, + "learning_rate": 0.00016666666666666666, + "loss": 0.3724, + "step": 29864 + }, + { + "epoch": 16.68435754189944, + "grad_norm": 0.37018245458602905, + "learning_rate": 0.00016663865546218487, + "loss": 0.3552, + "step": 29865 + }, + { + "epoch": 16.684916201117318, + "grad_norm": 0.407537043094635, + "learning_rate": 0.0001666106442577031, + "loss": 0.4217, + "step": 29866 + }, + { + "epoch": 16.685474860335194, + "grad_norm": 7.895242691040039, + "learning_rate": 0.0001665826330532213, + "loss": 0.3836, + "step": 29867 + }, + { + "epoch": 16.68603351955307, + "grad_norm": 0.5751171112060547, + "learning_rate": 0.0001665546218487395, + "loss": 0.3033, + "step": 29868 + }, + { + "epoch": 16.68659217877095, + "grad_norm": 0.5942823886871338, + "learning_rate": 0.0001665266106442577, + "loss": 0.4354, + "step": 29869 + }, + { + "epoch": 16.687150837988828, + "grad_norm": 0.30008044838905334, + "learning_rate": 0.00016649859943977592, + "loss": 0.3086, + "step": 29870 + }, + { + "epoch": 16.687709497206704, + "grad_norm": 0.34768733382225037, + "learning_rate": 0.00016647058823529413, + "loss": 0.3689, + "step": 29871 + }, + { + "epoch": 16.68826815642458, + "grad_norm": 0.45363518595695496, + "learning_rate": 0.00016644257703081234, + "loss": 0.3591, + "step": 29872 + }, + { + "epoch": 16.688826815642457, + "grad_norm": 0.4299505054950714, + "learning_rate": 0.00016641456582633054, + "loss": 0.4411, + "step": 29873 + }, + { + "epoch": 16.689385474860334, + "grad_norm": 1.0740561485290527, + "learning_rate": 0.00016638655462184875, + "loss": 0.4172, + "step": 29874 + }, + { + "epoch": 16.689944134078214, + "grad_norm": 0.6301817893981934, + "learning_rate": 0.00016635854341736695, + "loss": 0.4125, + "step": 29875 + }, + { + "epoch": 16.69050279329609, + "grad_norm": 0.6941254138946533, + "learning_rate": 0.00016633053221288516, + "loss": 0.5804, + "step": 29876 + }, + { + "epoch": 16.691061452513967, + "grad_norm": 0.46025583148002625, + "learning_rate": 0.00016630252100840337, + "loss": 0.4331, + "step": 29877 + }, + { + "epoch": 16.691620111731844, + "grad_norm": 0.6945750117301941, + "learning_rate": 0.00016627450980392157, + "loss": 0.4416, + "step": 29878 + }, + { + "epoch": 16.69217877094972, + "grad_norm": 0.375480055809021, + "learning_rate": 0.00016624649859943978, + "loss": 0.3182, + "step": 29879 + }, + { + "epoch": 16.692737430167597, + "grad_norm": 0.5400424003601074, + "learning_rate": 0.00016621848739495798, + "loss": 0.3523, + "step": 29880 + }, + { + "epoch": 16.693296089385473, + "grad_norm": 0.4541155993938446, + "learning_rate": 0.0001661904761904762, + "loss": 0.4909, + "step": 29881 + }, + { + "epoch": 16.693854748603353, + "grad_norm": 0.27958911657333374, + "learning_rate": 0.00016616246498599442, + "loss": 0.2921, + "step": 29882 + }, + { + "epoch": 16.69441340782123, + "grad_norm": 1.3257254362106323, + "learning_rate": 0.0001661344537815126, + "loss": 0.4412, + "step": 29883 + }, + { + "epoch": 16.694972067039107, + "grad_norm": 0.45688945055007935, + "learning_rate": 0.0001661064425770308, + "loss": 0.4096, + "step": 29884 + }, + { + "epoch": 16.695530726256983, + "grad_norm": 0.9085695147514343, + "learning_rate": 0.00016607843137254901, + "loss": 0.3687, + "step": 29885 + }, + { + "epoch": 16.69608938547486, + "grad_norm": 0.5619651079177856, + "learning_rate": 0.00016605042016806725, + "loss": 0.3901, + "step": 29886 + }, + { + "epoch": 16.696648044692736, + "grad_norm": 0.5347960591316223, + "learning_rate": 0.00016602240896358545, + "loss": 0.4193, + "step": 29887 + }, + { + "epoch": 16.697206703910613, + "grad_norm": 1.0598918199539185, + "learning_rate": 0.00016599439775910363, + "loss": 0.5168, + "step": 29888 + }, + { + "epoch": 16.697765363128493, + "grad_norm": 0.4130520522594452, + "learning_rate": 0.00016596638655462184, + "loss": 0.4165, + "step": 29889 + }, + { + "epoch": 16.69832402234637, + "grad_norm": 0.3421899974346161, + "learning_rate": 0.00016593837535014007, + "loss": 0.3401, + "step": 29890 + }, + { + "epoch": 16.698882681564246, + "grad_norm": 0.3486146032810211, + "learning_rate": 0.00016591036414565828, + "loss": 0.3371, + "step": 29891 + }, + { + "epoch": 16.699441340782123, + "grad_norm": 0.43327903747558594, + "learning_rate": 0.00016588235294117648, + "loss": 0.4299, + "step": 29892 + }, + { + "epoch": 16.7, + "grad_norm": 0.6202404499053955, + "learning_rate": 0.00016585434173669466, + "loss": 0.6182, + "step": 29893 + }, + { + "epoch": 16.700558659217876, + "grad_norm": 0.42607107758522034, + "learning_rate": 0.0001658263305322129, + "loss": 0.4443, + "step": 29894 + }, + { + "epoch": 16.701117318435756, + "grad_norm": 0.40272432565689087, + "learning_rate": 0.0001657983193277311, + "loss": 0.4303, + "step": 29895 + }, + { + "epoch": 16.701675977653633, + "grad_norm": 0.47172215580940247, + "learning_rate": 0.0001657703081232493, + "loss": 0.3609, + "step": 29896 + }, + { + "epoch": 16.70223463687151, + "grad_norm": 0.38696345686912537, + "learning_rate": 0.0001657422969187675, + "loss": 0.3198, + "step": 29897 + }, + { + "epoch": 16.702793296089386, + "grad_norm": 1.5591745376586914, + "learning_rate": 0.00016571428571428572, + "loss": 0.3752, + "step": 29898 + }, + { + "epoch": 16.703351955307262, + "grad_norm": 0.9605649709701538, + "learning_rate": 0.00016568627450980393, + "loss": 0.4239, + "step": 29899 + }, + { + "epoch": 16.70391061452514, + "grad_norm": 0.44754770398139954, + "learning_rate": 0.00016565826330532213, + "loss": 0.4025, + "step": 29900 + }, + { + "epoch": 16.704469273743015, + "grad_norm": 0.4150021970272064, + "learning_rate": 0.00016563025210084034, + "loss": 0.4145, + "step": 29901 + }, + { + "epoch": 16.705027932960895, + "grad_norm": 6.039492607116699, + "learning_rate": 0.00016560224089635857, + "loss": 0.3618, + "step": 29902 + }, + { + "epoch": 16.705586592178772, + "grad_norm": 1.223336100578308, + "learning_rate": 0.00016557422969187675, + "loss": 0.5, + "step": 29903 + }, + { + "epoch": 16.70614525139665, + "grad_norm": 4.616772174835205, + "learning_rate": 0.00016554621848739496, + "loss": 0.3983, + "step": 29904 + }, + { + "epoch": 16.706703910614525, + "grad_norm": 0.8068564534187317, + "learning_rate": 0.00016551820728291316, + "loss": 0.3283, + "step": 29905 + }, + { + "epoch": 16.7072625698324, + "grad_norm": 0.38786745071411133, + "learning_rate": 0.0001654901960784314, + "loss": 0.4067, + "step": 29906 + }, + { + "epoch": 16.70782122905028, + "grad_norm": 1.4356131553649902, + "learning_rate": 0.0001654621848739496, + "loss": 0.3633, + "step": 29907 + }, + { + "epoch": 16.708379888268155, + "grad_norm": 0.4727135896682739, + "learning_rate": 0.00016543417366946778, + "loss": 0.47, + "step": 29908 + }, + { + "epoch": 16.708938547486035, + "grad_norm": 0.402080237865448, + "learning_rate": 0.00016540616246498599, + "loss": 0.3399, + "step": 29909 + }, + { + "epoch": 16.70949720670391, + "grad_norm": 0.5463772416114807, + "learning_rate": 0.00016537815126050422, + "loss": 0.3137, + "step": 29910 + }, + { + "epoch": 16.710055865921788, + "grad_norm": 0.4279260039329529, + "learning_rate": 0.00016535014005602242, + "loss": 0.4554, + "step": 29911 + }, + { + "epoch": 16.710614525139665, + "grad_norm": 0.44774922728538513, + "learning_rate": 0.00016532212885154063, + "loss": 0.4614, + "step": 29912 + }, + { + "epoch": 16.71117318435754, + "grad_norm": 0.3404584228992462, + "learning_rate": 0.0001652941176470588, + "loss": 0.3071, + "step": 29913 + }, + { + "epoch": 16.711731843575418, + "grad_norm": 0.4860134720802307, + "learning_rate": 0.00016526610644257704, + "loss": 0.3868, + "step": 29914 + }, + { + "epoch": 16.712290502793294, + "grad_norm": 0.6186960935592651, + "learning_rate": 0.00016523809523809525, + "loss": 0.4002, + "step": 29915 + }, + { + "epoch": 16.712849162011175, + "grad_norm": 0.486005574464798, + "learning_rate": 0.00016521008403361345, + "loss": 0.4736, + "step": 29916 + }, + { + "epoch": 16.71340782122905, + "grad_norm": 0.9943147301673889, + "learning_rate": 0.00016518207282913166, + "loss": 0.4369, + "step": 29917 + }, + { + "epoch": 16.713966480446928, + "grad_norm": 6.732193946838379, + "learning_rate": 0.00016515406162464987, + "loss": 0.3906, + "step": 29918 + }, + { + "epoch": 16.714525139664804, + "grad_norm": 0.5688100457191467, + "learning_rate": 0.00016512605042016807, + "loss": 0.4562, + "step": 29919 + }, + { + "epoch": 16.71508379888268, + "grad_norm": 0.6026413440704346, + "learning_rate": 0.00016509803921568628, + "loss": 0.4667, + "step": 29920 + }, + { + "epoch": 16.715642458100557, + "grad_norm": 0.8118782043457031, + "learning_rate": 0.00016507002801120448, + "loss": 0.3906, + "step": 29921 + }, + { + "epoch": 16.716201117318434, + "grad_norm": 1.3486980199813843, + "learning_rate": 0.00016504201680672272, + "loss": 0.4707, + "step": 29922 + }, + { + "epoch": 16.716759776536314, + "grad_norm": 0.6077152490615845, + "learning_rate": 0.0001650140056022409, + "loss": 0.4093, + "step": 29923 + }, + { + "epoch": 16.71731843575419, + "grad_norm": 0.8175409436225891, + "learning_rate": 0.0001649859943977591, + "loss": 0.4454, + "step": 29924 + }, + { + "epoch": 16.717877094972067, + "grad_norm": 0.39631110429763794, + "learning_rate": 0.0001649579831932773, + "loss": 0.2738, + "step": 29925 + }, + { + "epoch": 16.718435754189944, + "grad_norm": 1.8991374969482422, + "learning_rate": 0.00016492997198879554, + "loss": 0.3533, + "step": 29926 + }, + { + "epoch": 16.71899441340782, + "grad_norm": 0.4475594460964203, + "learning_rate": 0.00016490196078431375, + "loss": 0.3457, + "step": 29927 + }, + { + "epoch": 16.719553072625697, + "grad_norm": 0.7632883787155151, + "learning_rate": 0.00016487394957983193, + "loss": 0.396, + "step": 29928 + }, + { + "epoch": 16.720111731843577, + "grad_norm": 0.7284535765647888, + "learning_rate": 0.00016484593837535013, + "loss": 0.5019, + "step": 29929 + }, + { + "epoch": 16.720670391061454, + "grad_norm": 0.5170512795448303, + "learning_rate": 0.00016481792717086837, + "loss": 0.4317, + "step": 29930 + }, + { + "epoch": 16.72122905027933, + "grad_norm": 0.7504491806030273, + "learning_rate": 0.00016478991596638657, + "loss": 0.4665, + "step": 29931 + }, + { + "epoch": 16.721787709497207, + "grad_norm": 0.31554993987083435, + "learning_rate": 0.00016476190476190475, + "loss": 0.3206, + "step": 29932 + }, + { + "epoch": 16.722346368715083, + "grad_norm": 0.4275408685207367, + "learning_rate": 0.00016473389355742296, + "loss": 0.424, + "step": 29933 + }, + { + "epoch": 16.72290502793296, + "grad_norm": 0.40644797682762146, + "learning_rate": 0.0001647058823529412, + "loss": 0.4387, + "step": 29934 + }, + { + "epoch": 16.723463687150836, + "grad_norm": 0.5162314772605896, + "learning_rate": 0.0001646778711484594, + "loss": 0.3359, + "step": 29935 + }, + { + "epoch": 16.724022346368717, + "grad_norm": 0.3356877267360687, + "learning_rate": 0.0001646498599439776, + "loss": 0.2943, + "step": 29936 + }, + { + "epoch": 16.724581005586593, + "grad_norm": 0.3958683907985687, + "learning_rate": 0.00016462184873949578, + "loss": 0.4764, + "step": 29937 + }, + { + "epoch": 16.72513966480447, + "grad_norm": 0.3934794068336487, + "learning_rate": 0.000164593837535014, + "loss": 0.4396, + "step": 29938 + }, + { + "epoch": 16.725698324022346, + "grad_norm": 0.43442875146865845, + "learning_rate": 0.00016456582633053222, + "loss": 0.4255, + "step": 29939 + }, + { + "epoch": 16.726256983240223, + "grad_norm": 0.3928596079349518, + "learning_rate": 0.00016453781512605043, + "loss": 0.4206, + "step": 29940 + }, + { + "epoch": 16.7268156424581, + "grad_norm": 0.6808320879936218, + "learning_rate": 0.00016450980392156863, + "loss": 0.4186, + "step": 29941 + }, + { + "epoch": 16.727374301675976, + "grad_norm": 0.4956154227256775, + "learning_rate": 0.00016448179271708684, + "loss": 0.3829, + "step": 29942 + }, + { + "epoch": 16.727932960893856, + "grad_norm": 0.577748715877533, + "learning_rate": 0.00016445378151260504, + "loss": 0.497, + "step": 29943 + }, + { + "epoch": 16.728491620111733, + "grad_norm": 0.38291576504707336, + "learning_rate": 0.00016442577030812325, + "loss": 0.3831, + "step": 29944 + }, + { + "epoch": 16.72905027932961, + "grad_norm": 0.6551175713539124, + "learning_rate": 0.00016439775910364146, + "loss": 0.4421, + "step": 29945 + }, + { + "epoch": 16.729608938547486, + "grad_norm": 0.4217638075351715, + "learning_rate": 0.0001643697478991597, + "loss": 0.3422, + "step": 29946 + }, + { + "epoch": 16.730167597765362, + "grad_norm": 0.39045143127441406, + "learning_rate": 0.00016434173669467787, + "loss": 0.3302, + "step": 29947 + }, + { + "epoch": 16.73072625698324, + "grad_norm": 1.9984890222549438, + "learning_rate": 0.00016431372549019607, + "loss": 0.4164, + "step": 29948 + }, + { + "epoch": 16.73128491620112, + "grad_norm": 0.5829303860664368, + "learning_rate": 0.00016428571428571428, + "loss": 0.3996, + "step": 29949 + }, + { + "epoch": 16.731843575418996, + "grad_norm": 0.6386532783508301, + "learning_rate": 0.0001642577030812325, + "loss": 0.4314, + "step": 29950 + }, + { + "epoch": 16.732402234636872, + "grad_norm": 0.790768027305603, + "learning_rate": 0.00016422969187675072, + "loss": 0.4433, + "step": 29951 + }, + { + "epoch": 16.73296089385475, + "grad_norm": 0.5277051329612732, + "learning_rate": 0.0001642016806722689, + "loss": 0.4143, + "step": 29952 + }, + { + "epoch": 16.733519553072625, + "grad_norm": 0.4073973596096039, + "learning_rate": 0.0001641736694677871, + "loss": 0.412, + "step": 29953 + }, + { + "epoch": 16.734078212290502, + "grad_norm": 0.4140506684780121, + "learning_rate": 0.00016414565826330534, + "loss": 0.3921, + "step": 29954 + }, + { + "epoch": 16.73463687150838, + "grad_norm": 0.4317987263202667, + "learning_rate": 0.00016411764705882354, + "loss": 0.3557, + "step": 29955 + }, + { + "epoch": 16.73519553072626, + "grad_norm": 0.5608370304107666, + "learning_rate": 0.00016408963585434175, + "loss": 0.4136, + "step": 29956 + }, + { + "epoch": 16.735754189944135, + "grad_norm": 1.1222494840621948, + "learning_rate": 0.00016406162464985993, + "loss": 0.4794, + "step": 29957 + }, + { + "epoch": 16.73631284916201, + "grad_norm": 0.3921917676925659, + "learning_rate": 0.00016403361344537816, + "loss": 0.4389, + "step": 29958 + }, + { + "epoch": 16.73687150837989, + "grad_norm": 0.40632399916648865, + "learning_rate": 0.00016400560224089637, + "loss": 0.4393, + "step": 29959 + }, + { + "epoch": 16.737430167597765, + "grad_norm": 0.3970141112804413, + "learning_rate": 0.00016397759103641457, + "loss": 0.421, + "step": 29960 + }, + { + "epoch": 16.73798882681564, + "grad_norm": 0.5261026620864868, + "learning_rate": 0.00016394957983193278, + "loss": 0.3715, + "step": 29961 + }, + { + "epoch": 16.738547486033518, + "grad_norm": 0.7265753746032715, + "learning_rate": 0.00016392156862745098, + "loss": 0.4421, + "step": 29962 + }, + { + "epoch": 16.739106145251398, + "grad_norm": 1.650052785873413, + "learning_rate": 0.0001638935574229692, + "loss": 0.5266, + "step": 29963 + }, + { + "epoch": 16.739664804469275, + "grad_norm": 0.5999802350997925, + "learning_rate": 0.0001638655462184874, + "loss": 0.306, + "step": 29964 + }, + { + "epoch": 16.74022346368715, + "grad_norm": 1.1444833278656006, + "learning_rate": 0.0001638375350140056, + "loss": 0.2927, + "step": 29965 + }, + { + "epoch": 16.740782122905028, + "grad_norm": 0.4256893992424011, + "learning_rate": 0.00016380952380952384, + "loss": 0.4032, + "step": 29966 + }, + { + "epoch": 16.741340782122904, + "grad_norm": 1.132124662399292, + "learning_rate": 0.00016378151260504201, + "loss": 0.3511, + "step": 29967 + }, + { + "epoch": 16.74189944134078, + "grad_norm": 0.3812745213508606, + "learning_rate": 0.00016375350140056022, + "loss": 0.3724, + "step": 29968 + }, + { + "epoch": 16.742458100558657, + "grad_norm": 0.4091469347476959, + "learning_rate": 0.00016372549019607843, + "loss": 0.4531, + "step": 29969 + }, + { + "epoch": 16.743016759776538, + "grad_norm": 0.5499061942100525, + "learning_rate": 0.00016369747899159666, + "loss": 0.3622, + "step": 29970 + }, + { + "epoch": 16.743575418994414, + "grad_norm": 0.35281631350517273, + "learning_rate": 0.00016366946778711487, + "loss": 0.4009, + "step": 29971 + }, + { + "epoch": 16.74413407821229, + "grad_norm": 0.4959126114845276, + "learning_rate": 0.00016364145658263304, + "loss": 0.4931, + "step": 29972 + }, + { + "epoch": 16.744692737430167, + "grad_norm": 0.37805166840553284, + "learning_rate": 0.00016361344537815125, + "loss": 0.4348, + "step": 29973 + }, + { + "epoch": 16.745251396648044, + "grad_norm": 0.7124146819114685, + "learning_rate": 0.00016358543417366948, + "loss": 0.392, + "step": 29974 + }, + { + "epoch": 16.74581005586592, + "grad_norm": 0.5156207084655762, + "learning_rate": 0.0001635574229691877, + "loss": 0.3511, + "step": 29975 + }, + { + "epoch": 16.7463687150838, + "grad_norm": 0.4222303330898285, + "learning_rate": 0.0001635294117647059, + "loss": 0.3414, + "step": 29976 + }, + { + "epoch": 16.746927374301677, + "grad_norm": 0.97475266456604, + "learning_rate": 0.00016350140056022407, + "loss": 0.3733, + "step": 29977 + }, + { + "epoch": 16.747486033519554, + "grad_norm": 0.4579882025718689, + "learning_rate": 0.0001634733893557423, + "loss": 0.3679, + "step": 29978 + }, + { + "epoch": 16.74804469273743, + "grad_norm": 0.4228186309337616, + "learning_rate": 0.0001634453781512605, + "loss": 0.4364, + "step": 29979 + }, + { + "epoch": 16.748603351955307, + "grad_norm": 0.4264308512210846, + "learning_rate": 0.00016341736694677872, + "loss": 0.5012, + "step": 29980 + }, + { + "epoch": 16.749162011173183, + "grad_norm": 0.40354928374290466, + "learning_rate": 0.00016338935574229693, + "loss": 0.3544, + "step": 29981 + }, + { + "epoch": 16.74972067039106, + "grad_norm": 0.42568692564964294, + "learning_rate": 0.00016336134453781513, + "loss": 0.3594, + "step": 29982 + }, + { + "epoch": 16.75027932960894, + "grad_norm": 0.5387842059135437, + "learning_rate": 0.00016333333333333334, + "loss": 0.5039, + "step": 29983 + }, + { + "epoch": 16.750837988826817, + "grad_norm": 0.5158648490905762, + "learning_rate": 0.00016330532212885154, + "loss": 0.4734, + "step": 29984 + }, + { + "epoch": 16.751396648044693, + "grad_norm": 0.5180208086967468, + "learning_rate": 0.00016327731092436975, + "loss": 0.2838, + "step": 29985 + }, + { + "epoch": 16.75195530726257, + "grad_norm": 0.7454515695571899, + "learning_rate": 0.00016324929971988796, + "loss": 0.4632, + "step": 29986 + }, + { + "epoch": 16.752513966480446, + "grad_norm": 0.5076193809509277, + "learning_rate": 0.00016322128851540616, + "loss": 0.5489, + "step": 29987 + }, + { + "epoch": 16.753072625698323, + "grad_norm": 0.4938351511955261, + "learning_rate": 0.00016319327731092437, + "loss": 0.3702, + "step": 29988 + }, + { + "epoch": 16.7536312849162, + "grad_norm": 0.9141116142272949, + "learning_rate": 0.00016316526610644257, + "loss": 0.4474, + "step": 29989 + }, + { + "epoch": 16.75418994413408, + "grad_norm": 0.48057761788368225, + "learning_rate": 0.0001631372549019608, + "loss": 0.3621, + "step": 29990 + }, + { + "epoch": 16.754748603351956, + "grad_norm": 0.4921472370624542, + "learning_rate": 0.00016310924369747899, + "loss": 0.2904, + "step": 29991 + }, + { + "epoch": 16.755307262569833, + "grad_norm": 1.0851823091506958, + "learning_rate": 0.0001630812324929972, + "loss": 0.4379, + "step": 29992 + }, + { + "epoch": 16.75586592178771, + "grad_norm": 0.5022264719009399, + "learning_rate": 0.0001630532212885154, + "loss": 0.4459, + "step": 29993 + }, + { + "epoch": 16.756424581005586, + "grad_norm": 0.42536184191703796, + "learning_rate": 0.00016302521008403363, + "loss": 0.4271, + "step": 29994 + }, + { + "epoch": 16.756983240223462, + "grad_norm": 0.41575920581817627, + "learning_rate": 0.00016299719887955184, + "loss": 0.5693, + "step": 29995 + }, + { + "epoch": 16.757541899441343, + "grad_norm": 0.5380123853683472, + "learning_rate": 0.00016296918767507002, + "loss": 0.3829, + "step": 29996 + }, + { + "epoch": 16.75810055865922, + "grad_norm": 0.4218633770942688, + "learning_rate": 0.00016294117647058822, + "loss": 0.3902, + "step": 29997 + }, + { + "epoch": 16.758659217877096, + "grad_norm": 0.3621273934841156, + "learning_rate": 0.00016291316526610645, + "loss": 0.3344, + "step": 29998 + }, + { + "epoch": 16.759217877094972, + "grad_norm": 0.37574502825737, + "learning_rate": 0.00016288515406162466, + "loss": 0.4674, + "step": 29999 + }, + { + "epoch": 16.75977653631285, + "grad_norm": 0.5005525350570679, + "learning_rate": 0.00016285714285714287, + "loss": 0.4329, + "step": 30000 + }, + { + "epoch": 16.75977653631285, + "eval_cer": 0.08611283864139745, + "eval_loss": 0.32325825095176697, + "eval_runtime": 55.6706, + "eval_samples_per_second": 81.515, + "eval_steps_per_second": 5.101, + "eval_wer": 0.34105192592385686, + "step": 30000 + }, + { + "epoch": 16.760335195530725, + "grad_norm": 0.4052548408508301, + "learning_rate": 0.00016282913165266104, + "loss": 0.363, + "step": 30001 + }, + { + "epoch": 16.760893854748602, + "grad_norm": 0.49155187606811523, + "learning_rate": 0.00016280112044817928, + "loss": 0.4062, + "step": 30002 + }, + { + "epoch": 16.761452513966482, + "grad_norm": 0.39574292302131653, + "learning_rate": 0.00016277310924369748, + "loss": 0.3745, + "step": 30003 + }, + { + "epoch": 16.76201117318436, + "grad_norm": 3.2734153270721436, + "learning_rate": 0.0001627450980392157, + "loss": 0.429, + "step": 30004 + }, + { + "epoch": 16.762569832402235, + "grad_norm": 0.6843207478523254, + "learning_rate": 0.0001627170868347339, + "loss": 0.4213, + "step": 30005 + }, + { + "epoch": 16.76312849162011, + "grad_norm": 0.6496402621269226, + "learning_rate": 0.0001626890756302521, + "loss": 0.4037, + "step": 30006 + }, + { + "epoch": 16.76368715083799, + "grad_norm": 0.38369643688201904, + "learning_rate": 0.0001626610644257703, + "loss": 0.3781, + "step": 30007 + }, + { + "epoch": 16.764245810055865, + "grad_norm": 0.789936900138855, + "learning_rate": 0.00016263305322128851, + "loss": 0.3429, + "step": 30008 + }, + { + "epoch": 16.76480446927374, + "grad_norm": 0.3861772418022156, + "learning_rate": 0.00016260504201680672, + "loss": 0.3419, + "step": 30009 + }, + { + "epoch": 16.76536312849162, + "grad_norm": 0.38788917660713196, + "learning_rate": 0.00016257703081232495, + "loss": 0.4038, + "step": 30010 + }, + { + "epoch": 16.765921787709498, + "grad_norm": 0.4419642984867096, + "learning_rate": 0.00016254901960784313, + "loss": 0.2792, + "step": 30011 + }, + { + "epoch": 16.766480446927375, + "grad_norm": 0.4525263011455536, + "learning_rate": 0.00016252100840336134, + "loss": 0.3405, + "step": 30012 + }, + { + "epoch": 16.76703910614525, + "grad_norm": 4.382625579833984, + "learning_rate": 0.00016249299719887954, + "loss": 0.4036, + "step": 30013 + }, + { + "epoch": 16.767597765363128, + "grad_norm": 0.444553405046463, + "learning_rate": 0.00016246498599439778, + "loss": 0.3746, + "step": 30014 + }, + { + "epoch": 16.768156424581004, + "grad_norm": 1.0176643133163452, + "learning_rate": 0.00016243697478991598, + "loss": 0.4712, + "step": 30015 + }, + { + "epoch": 16.76871508379888, + "grad_norm": 0.3747853934764862, + "learning_rate": 0.00016240896358543416, + "loss": 0.342, + "step": 30016 + }, + { + "epoch": 16.76927374301676, + "grad_norm": 0.48713281750679016, + "learning_rate": 0.00016238095238095237, + "loss": 0.462, + "step": 30017 + }, + { + "epoch": 16.769832402234638, + "grad_norm": 0.5657044053077698, + "learning_rate": 0.0001623529411764706, + "loss": 0.4339, + "step": 30018 + }, + { + "epoch": 16.770391061452514, + "grad_norm": 0.40071308612823486, + "learning_rate": 0.0001623249299719888, + "loss": 0.3091, + "step": 30019 + }, + { + "epoch": 16.77094972067039, + "grad_norm": 0.44520989060401917, + "learning_rate": 0.000162296918767507, + "loss": 0.3461, + "step": 30020 + }, + { + "epoch": 16.771508379888267, + "grad_norm": 1.4213981628417969, + "learning_rate": 0.0001622689075630252, + "loss": 0.4164, + "step": 30021 + }, + { + "epoch": 16.772067039106144, + "grad_norm": 0.5215101838111877, + "learning_rate": 0.00016224089635854343, + "loss": 0.4527, + "step": 30022 + }, + { + "epoch": 16.772625698324024, + "grad_norm": 0.4174841046333313, + "learning_rate": 0.00016221288515406163, + "loss": 0.4103, + "step": 30023 + }, + { + "epoch": 16.7731843575419, + "grad_norm": 0.47039493918418884, + "learning_rate": 0.00016218487394957984, + "loss": 0.3948, + "step": 30024 + }, + { + "epoch": 16.773743016759777, + "grad_norm": 0.5664065480232239, + "learning_rate": 0.00016215686274509804, + "loss": 0.3949, + "step": 30025 + }, + { + "epoch": 16.774301675977654, + "grad_norm": 0.7055777311325073, + "learning_rate": 0.00016212885154061625, + "loss": 0.3805, + "step": 30026 + }, + { + "epoch": 16.77486033519553, + "grad_norm": 0.4658617377281189, + "learning_rate": 0.00016210084033613446, + "loss": 0.3533, + "step": 30027 + }, + { + "epoch": 16.775418994413407, + "grad_norm": 1.507992148399353, + "learning_rate": 0.00016207282913165266, + "loss": 0.3788, + "step": 30028 + }, + { + "epoch": 16.775977653631283, + "grad_norm": 0.31773972511291504, + "learning_rate": 0.00016204481792717087, + "loss": 0.374, + "step": 30029 + }, + { + "epoch": 16.776536312849164, + "grad_norm": 0.4965648651123047, + "learning_rate": 0.0001620168067226891, + "loss": 0.5032, + "step": 30030 + }, + { + "epoch": 16.77709497206704, + "grad_norm": 0.45034071803092957, + "learning_rate": 0.00016198879551820728, + "loss": 0.4356, + "step": 30031 + }, + { + "epoch": 16.777653631284917, + "grad_norm": 0.5066399574279785, + "learning_rate": 0.00016196078431372549, + "loss": 0.4806, + "step": 30032 + }, + { + "epoch": 16.778212290502793, + "grad_norm": 0.4164505898952484, + "learning_rate": 0.0001619327731092437, + "loss": 0.3765, + "step": 30033 + }, + { + "epoch": 16.77877094972067, + "grad_norm": 1.2076771259307861, + "learning_rate": 0.00016190476190476192, + "loss": 0.3275, + "step": 30034 + }, + { + "epoch": 16.779329608938546, + "grad_norm": 0.3843424618244171, + "learning_rate": 0.00016187675070028013, + "loss": 0.3921, + "step": 30035 + }, + { + "epoch": 16.779888268156423, + "grad_norm": 0.5174873471260071, + "learning_rate": 0.0001618487394957983, + "loss": 0.4649, + "step": 30036 + }, + { + "epoch": 16.780446927374303, + "grad_norm": 0.5740247368812561, + "learning_rate": 0.00016182072829131652, + "loss": 0.4952, + "step": 30037 + }, + { + "epoch": 16.78100558659218, + "grad_norm": 0.6851624250411987, + "learning_rate": 0.00016179271708683475, + "loss": 0.4165, + "step": 30038 + }, + { + "epoch": 16.781564245810056, + "grad_norm": 0.4838239252567291, + "learning_rate": 0.00016176470588235295, + "loss": 0.5026, + "step": 30039 + }, + { + "epoch": 16.782122905027933, + "grad_norm": 0.5639671683311462, + "learning_rate": 0.00016173669467787116, + "loss": 0.5074, + "step": 30040 + }, + { + "epoch": 16.78268156424581, + "grad_norm": 0.5394155979156494, + "learning_rate": 0.00016170868347338934, + "loss": 0.3443, + "step": 30041 + }, + { + "epoch": 16.783240223463686, + "grad_norm": 0.4927348494529724, + "learning_rate": 0.00016168067226890757, + "loss": 0.4462, + "step": 30042 + }, + { + "epoch": 16.783798882681566, + "grad_norm": 0.5866169333457947, + "learning_rate": 0.00016165266106442578, + "loss": 0.3407, + "step": 30043 + }, + { + "epoch": 16.784357541899443, + "grad_norm": 0.6173146963119507, + "learning_rate": 0.00016162464985994398, + "loss": 0.2543, + "step": 30044 + }, + { + "epoch": 16.78491620111732, + "grad_norm": 1.2588061094284058, + "learning_rate": 0.00016159663865546216, + "loss": 0.5018, + "step": 30045 + }, + { + "epoch": 16.785474860335196, + "grad_norm": 0.5672191381454468, + "learning_rate": 0.0001615686274509804, + "loss": 0.4313, + "step": 30046 + }, + { + "epoch": 16.786033519553072, + "grad_norm": 0.4832766354084015, + "learning_rate": 0.0001615406162464986, + "loss": 0.2464, + "step": 30047 + }, + { + "epoch": 16.78659217877095, + "grad_norm": 1.892886757850647, + "learning_rate": 0.0001615126050420168, + "loss": 0.283, + "step": 30048 + }, + { + "epoch": 16.787150837988825, + "grad_norm": 0.6519088745117188, + "learning_rate": 0.00016148459383753501, + "loss": 0.4246, + "step": 30049 + }, + { + "epoch": 16.787709497206706, + "grad_norm": 0.44339805841445923, + "learning_rate": 0.00016145658263305322, + "loss": 0.3872, + "step": 30050 + }, + { + "epoch": 16.788268156424582, + "grad_norm": 0.6117792129516602, + "learning_rate": 0.00016142857142857143, + "loss": 0.3754, + "step": 30051 + }, + { + "epoch": 16.78882681564246, + "grad_norm": 0.39605194330215454, + "learning_rate": 0.00016140056022408963, + "loss": 0.4391, + "step": 30052 + }, + { + "epoch": 16.789385474860335, + "grad_norm": 0.5022445917129517, + "learning_rate": 0.00016137254901960784, + "loss": 0.3587, + "step": 30053 + }, + { + "epoch": 16.789944134078212, + "grad_norm": 0.5878540873527527, + "learning_rate": 0.00016134453781512607, + "loss": 0.5111, + "step": 30054 + }, + { + "epoch": 16.79050279329609, + "grad_norm": 0.33780530095100403, + "learning_rate": 0.00016131652661064425, + "loss": 0.366, + "step": 30055 + }, + { + "epoch": 16.791061452513965, + "grad_norm": 2.6099483966827393, + "learning_rate": 0.00016128851540616246, + "loss": 0.3523, + "step": 30056 + }, + { + "epoch": 16.791620111731845, + "grad_norm": 0.5597836971282959, + "learning_rate": 0.00016126050420168066, + "loss": 0.511, + "step": 30057 + }, + { + "epoch": 16.79217877094972, + "grad_norm": 0.6871532201766968, + "learning_rate": 0.0001612324929971989, + "loss": 0.4292, + "step": 30058 + }, + { + "epoch": 16.7927374301676, + "grad_norm": 0.4103935658931732, + "learning_rate": 0.0001612044817927171, + "loss": 0.4976, + "step": 30059 + }, + { + "epoch": 16.793296089385475, + "grad_norm": 0.6675010323524475, + "learning_rate": 0.00016117647058823528, + "loss": 0.4376, + "step": 30060 + }, + { + "epoch": 16.79385474860335, + "grad_norm": 0.41142186522483826, + "learning_rate": 0.00016114845938375349, + "loss": 0.3611, + "step": 30061 + }, + { + "epoch": 16.794413407821228, + "grad_norm": 0.511282742023468, + "learning_rate": 0.00016112044817927172, + "loss": 0.3495, + "step": 30062 + }, + { + "epoch": 16.794972067039105, + "grad_norm": 0.6248022317886353, + "learning_rate": 0.00016109243697478993, + "loss": 0.415, + "step": 30063 + }, + { + "epoch": 16.795530726256985, + "grad_norm": 0.4924640953540802, + "learning_rate": 0.00016106442577030813, + "loss": 0.4743, + "step": 30064 + }, + { + "epoch": 16.79608938547486, + "grad_norm": 0.603979229927063, + "learning_rate": 0.0001610364145658263, + "loss": 0.3523, + "step": 30065 + }, + { + "epoch": 16.796648044692738, + "grad_norm": 0.5137979984283447, + "learning_rate": 0.00016100840336134454, + "loss": 0.5143, + "step": 30066 + }, + { + "epoch": 16.797206703910614, + "grad_norm": 0.5229576230049133, + "learning_rate": 0.00016098039215686275, + "loss": 0.452, + "step": 30067 + }, + { + "epoch": 16.79776536312849, + "grad_norm": 0.49414879083633423, + "learning_rate": 0.00016095238095238096, + "loss": 0.4395, + "step": 30068 + }, + { + "epoch": 16.798324022346367, + "grad_norm": 0.4139558970928192, + "learning_rate": 0.00016092436974789916, + "loss": 0.365, + "step": 30069 + }, + { + "epoch": 16.798882681564244, + "grad_norm": 0.41835296154022217, + "learning_rate": 0.00016089635854341737, + "loss": 0.3999, + "step": 30070 + }, + { + "epoch": 16.799441340782124, + "grad_norm": 0.4176795482635498, + "learning_rate": 0.00016086834733893557, + "loss": 0.4125, + "step": 30071 + }, + { + "epoch": 16.8, + "grad_norm": 0.44688260555267334, + "learning_rate": 0.00016084033613445378, + "loss": 0.3575, + "step": 30072 + }, + { + "epoch": 16.800558659217877, + "grad_norm": 0.4810837507247925, + "learning_rate": 0.00016081232492997199, + "loss": 0.451, + "step": 30073 + }, + { + "epoch": 16.801117318435754, + "grad_norm": 0.5799554586410522, + "learning_rate": 0.00016078431372549022, + "loss": 0.4201, + "step": 30074 + }, + { + "epoch": 16.80167597765363, + "grad_norm": 0.35912781953811646, + "learning_rate": 0.0001607563025210084, + "loss": 0.4278, + "step": 30075 + }, + { + "epoch": 16.802234636871507, + "grad_norm": 0.3864504098892212, + "learning_rate": 0.0001607282913165266, + "loss": 0.4273, + "step": 30076 + }, + { + "epoch": 16.802793296089387, + "grad_norm": 0.5275647640228271, + "learning_rate": 0.0001607002801120448, + "loss": 0.3651, + "step": 30077 + }, + { + "epoch": 16.803351955307264, + "grad_norm": 0.41697949171066284, + "learning_rate": 0.00016067226890756304, + "loss": 0.4145, + "step": 30078 + }, + { + "epoch": 16.80391061452514, + "grad_norm": 0.433200478553772, + "learning_rate": 0.00016064425770308125, + "loss": 0.3225, + "step": 30079 + }, + { + "epoch": 16.804469273743017, + "grad_norm": 0.3597385883331299, + "learning_rate": 0.00016061624649859943, + "loss": 0.3647, + "step": 30080 + }, + { + "epoch": 16.805027932960893, + "grad_norm": 0.6093882322311401, + "learning_rate": 0.00016058823529411763, + "loss": 0.6441, + "step": 30081 + }, + { + "epoch": 16.80558659217877, + "grad_norm": 0.9391186833381653, + "learning_rate": 0.00016056022408963587, + "loss": 0.3832, + "step": 30082 + }, + { + "epoch": 16.806145251396647, + "grad_norm": 0.3506004512310028, + "learning_rate": 0.00016053221288515407, + "loss": 0.4148, + "step": 30083 + }, + { + "epoch": 16.806703910614527, + "grad_norm": 0.36887064576148987, + "learning_rate": 0.00016050420168067228, + "loss": 0.3311, + "step": 30084 + }, + { + "epoch": 16.807262569832403, + "grad_norm": 6.3074727058410645, + "learning_rate": 0.00016047619047619046, + "loss": 0.4321, + "step": 30085 + }, + { + "epoch": 16.80782122905028, + "grad_norm": 0.5146413445472717, + "learning_rate": 0.0001604481792717087, + "loss": 0.5419, + "step": 30086 + }, + { + "epoch": 16.808379888268156, + "grad_norm": 0.41056182980537415, + "learning_rate": 0.0001604201680672269, + "loss": 0.3547, + "step": 30087 + }, + { + "epoch": 16.808938547486033, + "grad_norm": 1.769498586654663, + "learning_rate": 0.0001603921568627451, + "loss": 0.4257, + "step": 30088 + }, + { + "epoch": 16.80949720670391, + "grad_norm": 0.35737600922584534, + "learning_rate": 0.00016036414565826334, + "loss": 0.3283, + "step": 30089 + }, + { + "epoch": 16.810055865921786, + "grad_norm": 3.807839870452881, + "learning_rate": 0.00016033613445378151, + "loss": 0.4933, + "step": 30090 + }, + { + "epoch": 16.810614525139666, + "grad_norm": 0.40298232436180115, + "learning_rate": 0.00016030812324929972, + "loss": 0.3827, + "step": 30091 + }, + { + "epoch": 16.811173184357543, + "grad_norm": 0.34603506326675415, + "learning_rate": 0.00016028011204481793, + "loss": 0.37, + "step": 30092 + }, + { + "epoch": 16.81173184357542, + "grad_norm": 0.4428398609161377, + "learning_rate": 0.00016025210084033616, + "loss": 0.4672, + "step": 30093 + }, + { + "epoch": 16.812290502793296, + "grad_norm": 0.3467986285686493, + "learning_rate": 0.00016022408963585437, + "loss": 0.3648, + "step": 30094 + }, + { + "epoch": 16.812849162011172, + "grad_norm": 0.38408535718917847, + "learning_rate": 0.00016019607843137254, + "loss": 0.392, + "step": 30095 + }, + { + "epoch": 16.81340782122905, + "grad_norm": 0.40280744433403015, + "learning_rate": 0.00016016806722689075, + "loss": 0.3634, + "step": 30096 + }, + { + "epoch": 16.81396648044693, + "grad_norm": 0.5805290937423706, + "learning_rate": 0.00016014005602240898, + "loss": 0.4533, + "step": 30097 + }, + { + "epoch": 16.814525139664806, + "grad_norm": 0.3980065882205963, + "learning_rate": 0.0001601120448179272, + "loss": 0.3685, + "step": 30098 + }, + { + "epoch": 16.815083798882682, + "grad_norm": 0.40838387608528137, + "learning_rate": 0.00016008403361344537, + "loss": 0.4539, + "step": 30099 + }, + { + "epoch": 16.81564245810056, + "grad_norm": 0.4192756712436676, + "learning_rate": 0.00016005602240896357, + "loss": 0.3942, + "step": 30100 + }, + { + "epoch": 16.816201117318435, + "grad_norm": 0.4256346523761749, + "learning_rate": 0.0001600280112044818, + "loss": 0.3048, + "step": 30101 + }, + { + "epoch": 16.816759776536312, + "grad_norm": 0.4652436077594757, + "learning_rate": 0.00016, + "loss": 0.3463, + "step": 30102 + }, + { + "epoch": 16.81731843575419, + "grad_norm": 0.4597277343273163, + "learning_rate": 0.00015997198879551822, + "loss": 0.3428, + "step": 30103 + }, + { + "epoch": 16.81787709497207, + "grad_norm": 0.3622557520866394, + "learning_rate": 0.0001599439775910364, + "loss": 0.3968, + "step": 30104 + }, + { + "epoch": 16.818435754189945, + "grad_norm": 0.4852827489376068, + "learning_rate": 0.00015991596638655463, + "loss": 0.4643, + "step": 30105 + }, + { + "epoch": 16.81899441340782, + "grad_norm": 0.5440395474433899, + "learning_rate": 0.00015988795518207284, + "loss": 0.534, + "step": 30106 + }, + { + "epoch": 16.8195530726257, + "grad_norm": 0.3743683695793152, + "learning_rate": 0.00015985994397759104, + "loss": 0.3925, + "step": 30107 + }, + { + "epoch": 16.820111731843575, + "grad_norm": 4.565816879272461, + "learning_rate": 0.00015983193277310925, + "loss": 0.2968, + "step": 30108 + }, + { + "epoch": 16.82067039106145, + "grad_norm": 0.4137653410434723, + "learning_rate": 0.00015980392156862746, + "loss": 0.373, + "step": 30109 + }, + { + "epoch": 16.821229050279328, + "grad_norm": 0.3782500624656677, + "learning_rate": 0.00015977591036414566, + "loss": 0.4872, + "step": 30110 + }, + { + "epoch": 16.821787709497208, + "grad_norm": 0.374213308095932, + "learning_rate": 0.00015974789915966387, + "loss": 0.3315, + "step": 30111 + }, + { + "epoch": 16.822346368715085, + "grad_norm": 0.7055694460868835, + "learning_rate": 0.00015971988795518207, + "loss": 0.4757, + "step": 30112 + }, + { + "epoch": 16.82290502793296, + "grad_norm": 0.478340744972229, + "learning_rate": 0.0001596918767507003, + "loss": 0.4939, + "step": 30113 + }, + { + "epoch": 16.823463687150838, + "grad_norm": 0.3142969310283661, + "learning_rate": 0.00015966386554621849, + "loss": 0.259, + "step": 30114 + }, + { + "epoch": 16.824022346368714, + "grad_norm": 0.5844735503196716, + "learning_rate": 0.0001596358543417367, + "loss": 0.4245, + "step": 30115 + }, + { + "epoch": 16.82458100558659, + "grad_norm": 0.3984036147594452, + "learning_rate": 0.0001596078431372549, + "loss": 0.3528, + "step": 30116 + }, + { + "epoch": 16.825139664804468, + "grad_norm": 4.019460201263428, + "learning_rate": 0.00015957983193277313, + "loss": 0.4194, + "step": 30117 + }, + { + "epoch": 16.825698324022348, + "grad_norm": 0.42472344636917114, + "learning_rate": 0.00015955182072829134, + "loss": 0.3925, + "step": 30118 + }, + { + "epoch": 16.826256983240224, + "grad_norm": 0.6465110182762146, + "learning_rate": 0.00015952380952380951, + "loss": 0.4051, + "step": 30119 + }, + { + "epoch": 16.8268156424581, + "grad_norm": 0.5962534546852112, + "learning_rate": 0.00015949579831932772, + "loss": 0.4239, + "step": 30120 + }, + { + "epoch": 16.827374301675977, + "grad_norm": 0.6170143485069275, + "learning_rate": 0.00015946778711484595, + "loss": 0.4946, + "step": 30121 + }, + { + "epoch": 16.827932960893854, + "grad_norm": 0.5440694689750671, + "learning_rate": 0.00015943977591036416, + "loss": 0.4483, + "step": 30122 + }, + { + "epoch": 16.82849162011173, + "grad_norm": 0.4368651211261749, + "learning_rate": 0.00015941176470588237, + "loss": 0.3626, + "step": 30123 + }, + { + "epoch": 16.82905027932961, + "grad_norm": 0.7729046940803528, + "learning_rate": 0.00015938375350140054, + "loss": 0.334, + "step": 30124 + }, + { + "epoch": 16.829608938547487, + "grad_norm": 0.5720878839492798, + "learning_rate": 0.00015935574229691878, + "loss": 0.377, + "step": 30125 + }, + { + "epoch": 16.830167597765364, + "grad_norm": 1.4981003999710083, + "learning_rate": 0.00015932773109243698, + "loss": 0.3569, + "step": 30126 + }, + { + "epoch": 16.83072625698324, + "grad_norm": 0.4393386244773865, + "learning_rate": 0.0001592997198879552, + "loss": 0.398, + "step": 30127 + }, + { + "epoch": 16.831284916201117, + "grad_norm": 0.40800952911376953, + "learning_rate": 0.0001592717086834734, + "loss": 0.4818, + "step": 30128 + }, + { + "epoch": 16.831843575418993, + "grad_norm": 1.3066201210021973, + "learning_rate": 0.0001592436974789916, + "loss": 0.3661, + "step": 30129 + }, + { + "epoch": 16.83240223463687, + "grad_norm": 0.43286192417144775, + "learning_rate": 0.0001592156862745098, + "loss": 0.3214, + "step": 30130 + }, + { + "epoch": 16.83296089385475, + "grad_norm": 0.4904375970363617, + "learning_rate": 0.00015918767507002801, + "loss": 0.3733, + "step": 30131 + }, + { + "epoch": 16.833519553072627, + "grad_norm": 0.44405966997146606, + "learning_rate": 0.00015915966386554622, + "loss": 0.3916, + "step": 30132 + }, + { + "epoch": 16.834078212290503, + "grad_norm": 0.5740193128585815, + "learning_rate": 0.00015913165266106445, + "loss": 0.4114, + "step": 30133 + }, + { + "epoch": 16.83463687150838, + "grad_norm": 0.4858759939670563, + "learning_rate": 0.00015910364145658263, + "loss": 0.365, + "step": 30134 + }, + { + "epoch": 16.835195530726256, + "grad_norm": 0.5334282517433167, + "learning_rate": 0.00015907563025210084, + "loss": 0.4353, + "step": 30135 + }, + { + "epoch": 16.835754189944133, + "grad_norm": 0.5499429106712341, + "learning_rate": 0.00015904761904761904, + "loss": 0.3757, + "step": 30136 + }, + { + "epoch": 16.83631284916201, + "grad_norm": 1.394912600517273, + "learning_rate": 0.00015901960784313728, + "loss": 0.634, + "step": 30137 + }, + { + "epoch": 16.83687150837989, + "grad_norm": 0.4118703603744507, + "learning_rate": 0.00015899159663865548, + "loss": 0.3574, + "step": 30138 + }, + { + "epoch": 16.837430167597766, + "grad_norm": 0.6004096269607544, + "learning_rate": 0.00015896358543417366, + "loss": 0.4295, + "step": 30139 + }, + { + "epoch": 16.837988826815643, + "grad_norm": 0.40506094694137573, + "learning_rate": 0.00015893557422969187, + "loss": 0.3435, + "step": 30140 + }, + { + "epoch": 16.83854748603352, + "grad_norm": 0.3610875904560089, + "learning_rate": 0.0001589075630252101, + "loss": 0.4478, + "step": 30141 + }, + { + "epoch": 16.839106145251396, + "grad_norm": 0.39817145466804504, + "learning_rate": 0.0001588795518207283, + "loss": 0.3696, + "step": 30142 + }, + { + "epoch": 16.839664804469272, + "grad_norm": 0.5740687847137451, + "learning_rate": 0.0001588515406162465, + "loss": 0.39, + "step": 30143 + }, + { + "epoch": 16.840223463687153, + "grad_norm": 0.4253646731376648, + "learning_rate": 0.0001588235294117647, + "loss": 0.313, + "step": 30144 + }, + { + "epoch": 16.84078212290503, + "grad_norm": 1.1679350137710571, + "learning_rate": 0.00015879551820728293, + "loss": 0.572, + "step": 30145 + }, + { + "epoch": 16.841340782122906, + "grad_norm": 0.3970259130001068, + "learning_rate": 0.00015876750700280113, + "loss": 0.3984, + "step": 30146 + }, + { + "epoch": 16.841899441340782, + "grad_norm": 0.5139328837394714, + "learning_rate": 0.00015873949579831934, + "loss": 0.4816, + "step": 30147 + }, + { + "epoch": 16.84245810055866, + "grad_norm": 0.7918537259101868, + "learning_rate": 0.00015871148459383754, + "loss": 0.5831, + "step": 30148 + }, + { + "epoch": 16.843016759776535, + "grad_norm": 0.4558841288089752, + "learning_rate": 0.00015868347338935575, + "loss": 0.3435, + "step": 30149 + }, + { + "epoch": 16.843575418994412, + "grad_norm": 0.3359465003013611, + "learning_rate": 0.00015865546218487396, + "loss": 0.3466, + "step": 30150 + }, + { + "epoch": 16.844134078212292, + "grad_norm": 0.38350799679756165, + "learning_rate": 0.00015862745098039216, + "loss": 0.4271, + "step": 30151 + }, + { + "epoch": 16.84469273743017, + "grad_norm": 0.3991387188434601, + "learning_rate": 0.00015859943977591037, + "loss": 0.39, + "step": 30152 + }, + { + "epoch": 16.845251396648045, + "grad_norm": 0.3865835964679718, + "learning_rate": 0.00015857142857142857, + "loss": 0.4485, + "step": 30153 + }, + { + "epoch": 16.845810055865922, + "grad_norm": 0.4502410888671875, + "learning_rate": 0.00015854341736694678, + "loss": 0.4842, + "step": 30154 + }, + { + "epoch": 16.8463687150838, + "grad_norm": 0.4129258990287781, + "learning_rate": 0.00015851540616246499, + "loss": 0.4452, + "step": 30155 + }, + { + "epoch": 16.846927374301675, + "grad_norm": 1.3322232961654663, + "learning_rate": 0.0001584873949579832, + "loss": 0.3218, + "step": 30156 + }, + { + "epoch": 16.84748603351955, + "grad_norm": 0.5254073739051819, + "learning_rate": 0.00015845938375350142, + "loss": 0.4675, + "step": 30157 + }, + { + "epoch": 16.84804469273743, + "grad_norm": 0.580143392086029, + "learning_rate": 0.0001584313725490196, + "loss": 0.4845, + "step": 30158 + }, + { + "epoch": 16.84860335195531, + "grad_norm": 0.3895910978317261, + "learning_rate": 0.0001584033613445378, + "loss": 0.3993, + "step": 30159 + }, + { + "epoch": 16.849162011173185, + "grad_norm": 0.36416247487068176, + "learning_rate": 0.00015837535014005601, + "loss": 0.3807, + "step": 30160 + }, + { + "epoch": 16.84972067039106, + "grad_norm": 0.5567619204521179, + "learning_rate": 0.00015834733893557425, + "loss": 0.4068, + "step": 30161 + }, + { + "epoch": 16.850279329608938, + "grad_norm": 0.444638729095459, + "learning_rate": 0.00015831932773109245, + "loss": 0.4625, + "step": 30162 + }, + { + "epoch": 16.850837988826814, + "grad_norm": 1.1667346954345703, + "learning_rate": 0.00015829131652661063, + "loss": 0.4223, + "step": 30163 + }, + { + "epoch": 16.85139664804469, + "grad_norm": 0.5068751573562622, + "learning_rate": 0.00015826330532212884, + "loss": 0.5173, + "step": 30164 + }, + { + "epoch": 16.85195530726257, + "grad_norm": 0.44086870551109314, + "learning_rate": 0.00015823529411764707, + "loss": 0.4567, + "step": 30165 + }, + { + "epoch": 16.852513966480448, + "grad_norm": 0.6275837421417236, + "learning_rate": 0.00015820728291316528, + "loss": 0.5035, + "step": 30166 + }, + { + "epoch": 16.853072625698324, + "grad_norm": 0.5532208681106567, + "learning_rate": 0.00015817927170868348, + "loss": 0.4662, + "step": 30167 + }, + { + "epoch": 16.8536312849162, + "grad_norm": 13.293482780456543, + "learning_rate": 0.00015815126050420166, + "loss": 0.331, + "step": 30168 + }, + { + "epoch": 16.854189944134077, + "grad_norm": 1.044973373413086, + "learning_rate": 0.0001581232492997199, + "loss": 0.3656, + "step": 30169 + }, + { + "epoch": 16.854748603351954, + "grad_norm": 0.456415057182312, + "learning_rate": 0.0001580952380952381, + "loss": 0.3218, + "step": 30170 + }, + { + "epoch": 16.85530726256983, + "grad_norm": 0.5060498714447021, + "learning_rate": 0.0001580672268907563, + "loss": 0.5017, + "step": 30171 + }, + { + "epoch": 16.85586592178771, + "grad_norm": 0.6787229776382446, + "learning_rate": 0.00015803921568627451, + "loss": 0.5713, + "step": 30172 + }, + { + "epoch": 16.856424581005587, + "grad_norm": 0.8607981204986572, + "learning_rate": 0.00015801120448179272, + "loss": 0.421, + "step": 30173 + }, + { + "epoch": 16.856983240223464, + "grad_norm": 1.5900905132293701, + "learning_rate": 0.00015798319327731093, + "loss": 0.4379, + "step": 30174 + }, + { + "epoch": 16.85754189944134, + "grad_norm": 0.7126103639602661, + "learning_rate": 0.00015795518207282913, + "loss": 0.3682, + "step": 30175 + }, + { + "epoch": 16.858100558659217, + "grad_norm": 1.5982716083526611, + "learning_rate": 0.00015792717086834734, + "loss": 0.3406, + "step": 30176 + }, + { + "epoch": 16.858659217877094, + "grad_norm": 0.5052145719528198, + "learning_rate": 0.00015789915966386557, + "loss": 0.3449, + "step": 30177 + }, + { + "epoch": 16.859217877094974, + "grad_norm": 0.41603681445121765, + "learning_rate": 0.00015787114845938375, + "loss": 0.4301, + "step": 30178 + }, + { + "epoch": 16.85977653631285, + "grad_norm": 0.468876451253891, + "learning_rate": 0.00015784313725490196, + "loss": 0.4057, + "step": 30179 + }, + { + "epoch": 16.860335195530727, + "grad_norm": 0.3810173571109772, + "learning_rate": 0.00015781512605042016, + "loss": 0.446, + "step": 30180 + }, + { + "epoch": 16.860893854748603, + "grad_norm": 0.3950745165348053, + "learning_rate": 0.0001577871148459384, + "loss": 0.3828, + "step": 30181 + }, + { + "epoch": 16.86145251396648, + "grad_norm": 0.4344445765018463, + "learning_rate": 0.0001577591036414566, + "loss": 0.4228, + "step": 30182 + }, + { + "epoch": 16.862011173184356, + "grad_norm": 0.488749623298645, + "learning_rate": 0.00015773109243697478, + "loss": 0.395, + "step": 30183 + }, + { + "epoch": 16.862569832402233, + "grad_norm": 1.1128497123718262, + "learning_rate": 0.00015770308123249299, + "loss": 0.5235, + "step": 30184 + }, + { + "epoch": 16.863128491620113, + "grad_norm": 0.8008594512939453, + "learning_rate": 0.00015767507002801122, + "loss": 0.3943, + "step": 30185 + }, + { + "epoch": 16.86368715083799, + "grad_norm": 0.4214664399623871, + "learning_rate": 0.00015764705882352943, + "loss": 0.4004, + "step": 30186 + }, + { + "epoch": 16.864245810055866, + "grad_norm": 0.7226083874702454, + "learning_rate": 0.00015761904761904763, + "loss": 0.4682, + "step": 30187 + }, + { + "epoch": 16.864804469273743, + "grad_norm": 0.7479549050331116, + "learning_rate": 0.0001575910364145658, + "loss": 0.4372, + "step": 30188 + }, + { + "epoch": 16.86536312849162, + "grad_norm": 1.2650318145751953, + "learning_rate": 0.00015756302521008404, + "loss": 0.4072, + "step": 30189 + }, + { + "epoch": 16.865921787709496, + "grad_norm": 1.8159083127975464, + "learning_rate": 0.00015753501400560225, + "loss": 0.4335, + "step": 30190 + }, + { + "epoch": 16.866480446927373, + "grad_norm": 0.424147367477417, + "learning_rate": 0.00015750700280112046, + "loss": 0.4016, + "step": 30191 + }, + { + "epoch": 16.867039106145253, + "grad_norm": 0.4480076730251312, + "learning_rate": 0.00015747899159663866, + "loss": 0.347, + "step": 30192 + }, + { + "epoch": 16.86759776536313, + "grad_norm": 0.44959908723831177, + "learning_rate": 0.00015745098039215687, + "loss": 0.342, + "step": 30193 + }, + { + "epoch": 16.868156424581006, + "grad_norm": 0.7538745403289795, + "learning_rate": 0.00015742296918767507, + "loss": 0.6072, + "step": 30194 + }, + { + "epoch": 16.868715083798882, + "grad_norm": 0.34363195300102234, + "learning_rate": 0.00015739495798319328, + "loss": 0.3381, + "step": 30195 + }, + { + "epoch": 16.86927374301676, + "grad_norm": 0.5263931155204773, + "learning_rate": 0.00015736694677871149, + "loss": 0.4159, + "step": 30196 + }, + { + "epoch": 16.869832402234636, + "grad_norm": 0.6510556936264038, + "learning_rate": 0.00015733893557422972, + "loss": 0.4327, + "step": 30197 + }, + { + "epoch": 16.870391061452516, + "grad_norm": 0.40254583954811096, + "learning_rate": 0.0001573109243697479, + "loss": 0.3398, + "step": 30198 + }, + { + "epoch": 16.870949720670392, + "grad_norm": 0.492439329624176, + "learning_rate": 0.0001572829131652661, + "loss": 0.5842, + "step": 30199 + }, + { + "epoch": 16.87150837988827, + "grad_norm": 0.5890607833862305, + "learning_rate": 0.0001572549019607843, + "loss": 0.4168, + "step": 30200 + }, + { + "epoch": 16.872067039106145, + "grad_norm": 0.44668495655059814, + "learning_rate": 0.00015722689075630254, + "loss": 0.3928, + "step": 30201 + }, + { + "epoch": 16.872625698324022, + "grad_norm": 0.5565999746322632, + "learning_rate": 0.00015719887955182075, + "loss": 0.4087, + "step": 30202 + }, + { + "epoch": 16.8731843575419, + "grad_norm": 0.44597819447517395, + "learning_rate": 0.00015717086834733893, + "loss": 0.4349, + "step": 30203 + }, + { + "epoch": 16.873743016759775, + "grad_norm": 0.35262104868888855, + "learning_rate": 0.00015714285714285713, + "loss": 0.3524, + "step": 30204 + }, + { + "epoch": 16.874301675977655, + "grad_norm": 0.37330934405326843, + "learning_rate": 0.00015711484593837537, + "loss": 0.3336, + "step": 30205 + }, + { + "epoch": 16.87486033519553, + "grad_norm": 0.4421864151954651, + "learning_rate": 0.00015708683473389357, + "loss": 0.3197, + "step": 30206 + }, + { + "epoch": 16.87541899441341, + "grad_norm": 0.4005056321620941, + "learning_rate": 0.00015705882352941178, + "loss": 0.5008, + "step": 30207 + }, + { + "epoch": 16.875977653631285, + "grad_norm": 0.6955085396766663, + "learning_rate": 0.00015703081232492996, + "loss": 0.4083, + "step": 30208 + }, + { + "epoch": 16.87653631284916, + "grad_norm": 0.48278653621673584, + "learning_rate": 0.0001570028011204482, + "loss": 0.4086, + "step": 30209 + }, + { + "epoch": 16.877094972067038, + "grad_norm": 0.38105377554893494, + "learning_rate": 0.0001569747899159664, + "loss": 0.3203, + "step": 30210 + }, + { + "epoch": 16.877653631284915, + "grad_norm": 0.3725895881652832, + "learning_rate": 0.0001569467787114846, + "loss": 0.3453, + "step": 30211 + }, + { + "epoch": 16.878212290502795, + "grad_norm": 0.7716445326805115, + "learning_rate": 0.00015691876750700278, + "loss": 0.4119, + "step": 30212 + }, + { + "epoch": 16.87877094972067, + "grad_norm": 0.8801575899124146, + "learning_rate": 0.00015689075630252101, + "loss": 0.4677, + "step": 30213 + }, + { + "epoch": 16.879329608938548, + "grad_norm": 0.3447099030017853, + "learning_rate": 0.00015686274509803922, + "loss": 0.3712, + "step": 30214 + }, + { + "epoch": 16.879888268156424, + "grad_norm": 0.5271446108818054, + "learning_rate": 0.00015683473389355743, + "loss": 0.4254, + "step": 30215 + }, + { + "epoch": 16.8804469273743, + "grad_norm": 0.4342610239982605, + "learning_rate": 0.00015680672268907563, + "loss": 0.4123, + "step": 30216 + }, + { + "epoch": 16.881005586592178, + "grad_norm": 0.4275245666503906, + "learning_rate": 0.00015677871148459384, + "loss": 0.4504, + "step": 30217 + }, + { + "epoch": 16.881564245810054, + "grad_norm": 5.342784881591797, + "learning_rate": 0.00015675070028011204, + "loss": 0.41, + "step": 30218 + }, + { + "epoch": 16.882122905027934, + "grad_norm": 0.35894110798835754, + "learning_rate": 0.00015672268907563025, + "loss": 0.3602, + "step": 30219 + }, + { + "epoch": 16.88268156424581, + "grad_norm": 0.5169207453727722, + "learning_rate": 0.00015669467787114846, + "loss": 0.5066, + "step": 30220 + }, + { + "epoch": 16.883240223463687, + "grad_norm": 1.1114915609359741, + "learning_rate": 0.0001566666666666667, + "loss": 0.3083, + "step": 30221 + }, + { + "epoch": 16.883798882681564, + "grad_norm": 0.4362267851829529, + "learning_rate": 0.00015663865546218487, + "loss": 0.3687, + "step": 30222 + }, + { + "epoch": 16.88435754189944, + "grad_norm": 0.5804873704910278, + "learning_rate": 0.00015661064425770307, + "loss": 0.4614, + "step": 30223 + }, + { + "epoch": 16.884916201117317, + "grad_norm": 0.45495694875717163, + "learning_rate": 0.00015658263305322128, + "loss": 0.4684, + "step": 30224 + }, + { + "epoch": 16.885474860335197, + "grad_norm": 2.383920192718506, + "learning_rate": 0.0001565546218487395, + "loss": 0.712, + "step": 30225 + }, + { + "epoch": 16.886033519553074, + "grad_norm": 0.9333776831626892, + "learning_rate": 0.00015652661064425772, + "loss": 0.3584, + "step": 30226 + }, + { + "epoch": 16.88659217877095, + "grad_norm": 0.8368858695030212, + "learning_rate": 0.0001564985994397759, + "loss": 0.4805, + "step": 30227 + }, + { + "epoch": 16.887150837988827, + "grad_norm": 0.38262975215911865, + "learning_rate": 0.0001564705882352941, + "loss": 0.3999, + "step": 30228 + }, + { + "epoch": 16.887709497206703, + "grad_norm": 0.7130293846130371, + "learning_rate": 0.00015644257703081234, + "loss": 0.4329, + "step": 30229 + }, + { + "epoch": 16.88826815642458, + "grad_norm": 0.3868766725063324, + "learning_rate": 0.00015641456582633054, + "loss": 0.3311, + "step": 30230 + }, + { + "epoch": 16.888826815642457, + "grad_norm": 0.6025223731994629, + "learning_rate": 0.00015638655462184875, + "loss": 0.612, + "step": 30231 + }, + { + "epoch": 16.889385474860337, + "grad_norm": 0.39621472358703613, + "learning_rate": 0.00015635854341736693, + "loss": 0.3461, + "step": 30232 + }, + { + "epoch": 16.889944134078213, + "grad_norm": 0.5080248713493347, + "learning_rate": 0.00015633053221288516, + "loss": 0.4839, + "step": 30233 + }, + { + "epoch": 16.89050279329609, + "grad_norm": 0.8841781616210938, + "learning_rate": 0.00015630252100840337, + "loss": 0.4909, + "step": 30234 + }, + { + "epoch": 16.891061452513966, + "grad_norm": 0.48106616735458374, + "learning_rate": 0.00015627450980392157, + "loss": 0.4152, + "step": 30235 + }, + { + "epoch": 16.891620111731843, + "grad_norm": 0.4235338568687439, + "learning_rate": 0.00015624649859943978, + "loss": 0.4636, + "step": 30236 + }, + { + "epoch": 16.89217877094972, + "grad_norm": 0.4886437654495239, + "learning_rate": 0.00015621848739495798, + "loss": 0.3888, + "step": 30237 + }, + { + "epoch": 16.892737430167596, + "grad_norm": 0.6446411609649658, + "learning_rate": 0.0001561904761904762, + "loss": 0.413, + "step": 30238 + }, + { + "epoch": 16.893296089385476, + "grad_norm": 0.4011625647544861, + "learning_rate": 0.0001561624649859944, + "loss": 0.4109, + "step": 30239 + }, + { + "epoch": 16.893854748603353, + "grad_norm": 0.39117443561553955, + "learning_rate": 0.0001561344537815126, + "loss": 0.3616, + "step": 30240 + }, + { + "epoch": 16.89441340782123, + "grad_norm": 0.38766980171203613, + "learning_rate": 0.00015610644257703084, + "loss": 0.3917, + "step": 30241 + }, + { + "epoch": 16.894972067039106, + "grad_norm": 0.40482330322265625, + "learning_rate": 0.00015607843137254901, + "loss": 0.4667, + "step": 30242 + }, + { + "epoch": 16.895530726256982, + "grad_norm": 0.5429258346557617, + "learning_rate": 0.00015605042016806722, + "loss": 0.3795, + "step": 30243 + }, + { + "epoch": 16.89608938547486, + "grad_norm": 2.321565628051758, + "learning_rate": 0.00015602240896358543, + "loss": 0.3792, + "step": 30244 + }, + { + "epoch": 16.89664804469274, + "grad_norm": 0.38894128799438477, + "learning_rate": 0.00015599439775910366, + "loss": 0.3649, + "step": 30245 + }, + { + "epoch": 16.897206703910616, + "grad_norm": 0.5999763011932373, + "learning_rate": 0.00015596638655462187, + "loss": 0.3026, + "step": 30246 + }, + { + "epoch": 16.897765363128492, + "grad_norm": 0.6594188213348389, + "learning_rate": 0.00015593837535014004, + "loss": 0.3825, + "step": 30247 + }, + { + "epoch": 16.89832402234637, + "grad_norm": 0.48847028613090515, + "learning_rate": 0.00015591036414565825, + "loss": 0.4173, + "step": 30248 + }, + { + "epoch": 16.898882681564245, + "grad_norm": 0.3831975758075714, + "learning_rate": 0.00015588235294117648, + "loss": 0.316, + "step": 30249 + }, + { + "epoch": 16.899441340782122, + "grad_norm": 0.547111988067627, + "learning_rate": 0.0001558543417366947, + "loss": 0.4064, + "step": 30250 + }, + { + "epoch": 16.9, + "grad_norm": 0.3626510798931122, + "learning_rate": 0.0001558263305322129, + "loss": 0.3189, + "step": 30251 + }, + { + "epoch": 16.90055865921788, + "grad_norm": 0.40763407945632935, + "learning_rate": 0.00015579831932773107, + "loss": 0.4641, + "step": 30252 + }, + { + "epoch": 16.901117318435755, + "grad_norm": 1.1521021127700806, + "learning_rate": 0.0001557703081232493, + "loss": 0.4325, + "step": 30253 + }, + { + "epoch": 16.901675977653632, + "grad_norm": 0.4214644730091095, + "learning_rate": 0.00015574229691876751, + "loss": 0.3507, + "step": 30254 + }, + { + "epoch": 16.90223463687151, + "grad_norm": 0.3495875895023346, + "learning_rate": 0.00015571428571428572, + "loss": 0.3517, + "step": 30255 + }, + { + "epoch": 16.902793296089385, + "grad_norm": 0.31127431988716125, + "learning_rate": 0.00015568627450980393, + "loss": 0.2604, + "step": 30256 + }, + { + "epoch": 16.90335195530726, + "grad_norm": 0.3663713335990906, + "learning_rate": 0.00015565826330532213, + "loss": 0.3759, + "step": 30257 + }, + { + "epoch": 16.903910614525138, + "grad_norm": 0.7469764351844788, + "learning_rate": 0.00015563025210084034, + "loss": 0.6233, + "step": 30258 + }, + { + "epoch": 16.904469273743018, + "grad_norm": 0.4186480939388275, + "learning_rate": 0.00015560224089635854, + "loss": 0.3789, + "step": 30259 + }, + { + "epoch": 16.905027932960895, + "grad_norm": 0.47238972783088684, + "learning_rate": 0.00015557422969187675, + "loss": 0.429, + "step": 30260 + }, + { + "epoch": 16.90558659217877, + "grad_norm": 0.6932469010353088, + "learning_rate": 0.00015554621848739498, + "loss": 0.4313, + "step": 30261 + }, + { + "epoch": 16.906145251396648, + "grad_norm": 0.8271285891532898, + "learning_rate": 0.00015551820728291316, + "loss": 0.3734, + "step": 30262 + }, + { + "epoch": 16.906703910614524, + "grad_norm": 0.5990018248558044, + "learning_rate": 0.00015549019607843137, + "loss": 0.3683, + "step": 30263 + }, + { + "epoch": 16.9072625698324, + "grad_norm": 0.625999391078949, + "learning_rate": 0.00015546218487394957, + "loss": 0.4034, + "step": 30264 + }, + { + "epoch": 16.907821229050278, + "grad_norm": 0.4199620187282562, + "learning_rate": 0.0001554341736694678, + "loss": 0.3603, + "step": 30265 + }, + { + "epoch": 16.908379888268158, + "grad_norm": 2.2511370182037354, + "learning_rate": 0.00015540616246498599, + "loss": 0.5155, + "step": 30266 + }, + { + "epoch": 16.908938547486034, + "grad_norm": 0.45186132192611694, + "learning_rate": 0.0001553781512605042, + "loss": 0.4384, + "step": 30267 + }, + { + "epoch": 16.90949720670391, + "grad_norm": 0.4772550165653229, + "learning_rate": 0.0001553501400560224, + "loss": 0.4245, + "step": 30268 + }, + { + "epoch": 16.910055865921787, + "grad_norm": 0.6822510361671448, + "learning_rate": 0.00015532212885154063, + "loss": 0.4253, + "step": 30269 + }, + { + "epoch": 16.910614525139664, + "grad_norm": 1.310558795928955, + "learning_rate": 0.00015529411764705884, + "loss": 0.3596, + "step": 30270 + }, + { + "epoch": 16.91117318435754, + "grad_norm": 0.42638224363327026, + "learning_rate": 0.00015526610644257702, + "loss": 0.4199, + "step": 30271 + }, + { + "epoch": 16.91173184357542, + "grad_norm": 0.44095078110694885, + "learning_rate": 0.00015523809523809522, + "loss": 0.475, + "step": 30272 + }, + { + "epoch": 16.912290502793297, + "grad_norm": 1.823797583580017, + "learning_rate": 0.00015521008403361346, + "loss": 0.4956, + "step": 30273 + }, + { + "epoch": 16.912849162011174, + "grad_norm": 0.5275580286979675, + "learning_rate": 0.00015518207282913166, + "loss": 0.4656, + "step": 30274 + }, + { + "epoch": 16.91340782122905, + "grad_norm": 0.8070825934410095, + "learning_rate": 0.00015515406162464987, + "loss": 0.382, + "step": 30275 + }, + { + "epoch": 16.913966480446927, + "grad_norm": 0.40196824073791504, + "learning_rate": 0.00015512605042016805, + "loss": 0.5042, + "step": 30276 + }, + { + "epoch": 16.914525139664804, + "grad_norm": 0.5920143127441406, + "learning_rate": 0.00015509803921568628, + "loss": 0.5957, + "step": 30277 + }, + { + "epoch": 16.91508379888268, + "grad_norm": 0.5059862732887268, + "learning_rate": 0.00015507002801120448, + "loss": 0.495, + "step": 30278 + }, + { + "epoch": 16.91564245810056, + "grad_norm": 0.372800350189209, + "learning_rate": 0.0001550420168067227, + "loss": 0.3789, + "step": 30279 + }, + { + "epoch": 16.916201117318437, + "grad_norm": 0.5092388391494751, + "learning_rate": 0.0001550140056022409, + "loss": 0.363, + "step": 30280 + }, + { + "epoch": 16.916759776536313, + "grad_norm": 0.5395246744155884, + "learning_rate": 0.0001549859943977591, + "loss": 0.5217, + "step": 30281 + }, + { + "epoch": 16.91731843575419, + "grad_norm": 0.8115045428276062, + "learning_rate": 0.0001549579831932773, + "loss": 0.42, + "step": 30282 + }, + { + "epoch": 16.917877094972066, + "grad_norm": 0.4640989899635315, + "learning_rate": 0.00015492997198879551, + "loss": 0.3768, + "step": 30283 + }, + { + "epoch": 16.918435754189943, + "grad_norm": 0.4571166932582855, + "learning_rate": 0.00015490196078431372, + "loss": 0.4004, + "step": 30284 + }, + { + "epoch": 16.91899441340782, + "grad_norm": 0.34246352314949036, + "learning_rate": 0.00015487394957983195, + "loss": 0.2978, + "step": 30285 + }, + { + "epoch": 16.9195530726257, + "grad_norm": 0.510298490524292, + "learning_rate": 0.00015484593837535013, + "loss": 0.4448, + "step": 30286 + }, + { + "epoch": 16.920111731843576, + "grad_norm": 0.375177264213562, + "learning_rate": 0.00015481792717086834, + "loss": 0.3154, + "step": 30287 + }, + { + "epoch": 16.920670391061453, + "grad_norm": 0.48991602659225464, + "learning_rate": 0.00015478991596638654, + "loss": 0.3548, + "step": 30288 + }, + { + "epoch": 16.92122905027933, + "grad_norm": 0.3705797493457794, + "learning_rate": 0.00015476190476190478, + "loss": 0.4276, + "step": 30289 + }, + { + "epoch": 16.921787709497206, + "grad_norm": 0.40698692202568054, + "learning_rate": 0.00015473389355742298, + "loss": 0.3508, + "step": 30290 + }, + { + "epoch": 16.922346368715083, + "grad_norm": 0.5141226053237915, + "learning_rate": 0.00015470588235294116, + "loss": 0.3915, + "step": 30291 + }, + { + "epoch": 16.922905027932963, + "grad_norm": 0.4309077262878418, + "learning_rate": 0.00015467787114845937, + "loss": 0.4075, + "step": 30292 + }, + { + "epoch": 16.92346368715084, + "grad_norm": 0.6018301844596863, + "learning_rate": 0.0001546498599439776, + "loss": 0.4251, + "step": 30293 + }, + { + "epoch": 16.924022346368716, + "grad_norm": 0.4072677493095398, + "learning_rate": 0.0001546218487394958, + "loss": 0.473, + "step": 30294 + }, + { + "epoch": 16.924581005586592, + "grad_norm": 0.5321144461631775, + "learning_rate": 0.00015459383753501401, + "loss": 0.4535, + "step": 30295 + }, + { + "epoch": 16.92513966480447, + "grad_norm": 0.4155421257019043, + "learning_rate": 0.0001545658263305322, + "loss": 0.4496, + "step": 30296 + }, + { + "epoch": 16.925698324022346, + "grad_norm": 0.4917810261249542, + "learning_rate": 0.00015453781512605043, + "loss": 0.3631, + "step": 30297 + }, + { + "epoch": 16.926256983240222, + "grad_norm": 0.9854460954666138, + "learning_rate": 0.00015450980392156863, + "loss": 0.392, + "step": 30298 + }, + { + "epoch": 16.926815642458102, + "grad_norm": 0.4849650263786316, + "learning_rate": 0.00015448179271708684, + "loss": 0.3442, + "step": 30299 + }, + { + "epoch": 16.92737430167598, + "grad_norm": 0.49455657601356506, + "learning_rate": 0.00015445378151260504, + "loss": 0.3862, + "step": 30300 + }, + { + "epoch": 16.927932960893855, + "grad_norm": 0.3992047607898712, + "learning_rate": 0.00015442577030812325, + "loss": 0.3025, + "step": 30301 + }, + { + "epoch": 16.928491620111732, + "grad_norm": 0.5470736026763916, + "learning_rate": 0.00015439775910364146, + "loss": 0.4432, + "step": 30302 + }, + { + "epoch": 16.92905027932961, + "grad_norm": 0.3519626557826996, + "learning_rate": 0.00015436974789915966, + "loss": 0.3512, + "step": 30303 + }, + { + "epoch": 16.929608938547485, + "grad_norm": 0.3889520764350891, + "learning_rate": 0.00015434173669467787, + "loss": 0.3501, + "step": 30304 + }, + { + "epoch": 16.93016759776536, + "grad_norm": 0.34396892786026, + "learning_rate": 0.0001543137254901961, + "loss": 0.3762, + "step": 30305 + }, + { + "epoch": 16.93072625698324, + "grad_norm": 0.6689139604568481, + "learning_rate": 0.00015428571428571428, + "loss": 0.4965, + "step": 30306 + }, + { + "epoch": 16.93128491620112, + "grad_norm": 0.41385045647621155, + "learning_rate": 0.00015425770308123249, + "loss": 0.4382, + "step": 30307 + }, + { + "epoch": 16.931843575418995, + "grad_norm": 0.38909074664115906, + "learning_rate": 0.0001542296918767507, + "loss": 0.387, + "step": 30308 + }, + { + "epoch": 16.93240223463687, + "grad_norm": 0.3993358016014099, + "learning_rate": 0.00015420168067226893, + "loss": 0.4423, + "step": 30309 + }, + { + "epoch": 16.932960893854748, + "grad_norm": 0.4193381667137146, + "learning_rate": 0.00015417366946778713, + "loss": 0.3351, + "step": 30310 + }, + { + "epoch": 16.933519553072625, + "grad_norm": 0.461887001991272, + "learning_rate": 0.0001541456582633053, + "loss": 0.5152, + "step": 30311 + }, + { + "epoch": 16.9340782122905, + "grad_norm": 0.4148399233818054, + "learning_rate": 0.00015411764705882352, + "loss": 0.4208, + "step": 30312 + }, + { + "epoch": 16.93463687150838, + "grad_norm": 0.6164987087249756, + "learning_rate": 0.00015408963585434175, + "loss": 0.3846, + "step": 30313 + }, + { + "epoch": 16.935195530726258, + "grad_norm": 0.4818493127822876, + "learning_rate": 0.00015406162464985996, + "loss": 0.2934, + "step": 30314 + }, + { + "epoch": 16.935754189944134, + "grad_norm": 0.4571642279624939, + "learning_rate": 0.00015403361344537816, + "loss": 0.3984, + "step": 30315 + }, + { + "epoch": 16.93631284916201, + "grad_norm": 0.5050204396247864, + "learning_rate": 0.00015400560224089634, + "loss": 0.3245, + "step": 30316 + }, + { + "epoch": 16.936871508379888, + "grad_norm": 0.3212690055370331, + "learning_rate": 0.00015397759103641457, + "loss": 0.3152, + "step": 30317 + }, + { + "epoch": 16.937430167597764, + "grad_norm": 0.46357548236846924, + "learning_rate": 0.00015394957983193278, + "loss": 0.4281, + "step": 30318 + }, + { + "epoch": 16.93798882681564, + "grad_norm": 0.4582130014896393, + "learning_rate": 0.00015392156862745098, + "loss": 0.447, + "step": 30319 + }, + { + "epoch": 16.93854748603352, + "grad_norm": 0.4889698624610901, + "learning_rate": 0.00015389355742296916, + "loss": 0.4215, + "step": 30320 + }, + { + "epoch": 16.939106145251397, + "grad_norm": 0.67316073179245, + "learning_rate": 0.0001538655462184874, + "loss": 0.5077, + "step": 30321 + }, + { + "epoch": 16.939664804469274, + "grad_norm": 0.5229091048240662, + "learning_rate": 0.0001538375350140056, + "loss": 0.5161, + "step": 30322 + }, + { + "epoch": 16.94022346368715, + "grad_norm": 0.7249429225921631, + "learning_rate": 0.0001538095238095238, + "loss": 0.4171, + "step": 30323 + }, + { + "epoch": 16.940782122905027, + "grad_norm": 0.5083596110343933, + "learning_rate": 0.00015378151260504204, + "loss": 0.5654, + "step": 30324 + }, + { + "epoch": 16.941340782122904, + "grad_norm": 0.3124542236328125, + "learning_rate": 0.00015375350140056022, + "loss": 0.299, + "step": 30325 + }, + { + "epoch": 16.941899441340784, + "grad_norm": 0.41659268736839294, + "learning_rate": 0.00015372549019607843, + "loss": 0.3377, + "step": 30326 + }, + { + "epoch": 16.94245810055866, + "grad_norm": 0.42169126868247986, + "learning_rate": 0.00015369747899159663, + "loss": 0.4597, + "step": 30327 + }, + { + "epoch": 16.943016759776537, + "grad_norm": 0.4793253242969513, + "learning_rate": 0.00015366946778711487, + "loss": 0.4167, + "step": 30328 + }, + { + "epoch": 16.943575418994413, + "grad_norm": 0.4617081582546234, + "learning_rate": 0.00015364145658263307, + "loss": 0.373, + "step": 30329 + }, + { + "epoch": 16.94413407821229, + "grad_norm": 0.5602740049362183, + "learning_rate": 0.00015361344537815125, + "loss": 0.3198, + "step": 30330 + }, + { + "epoch": 16.944692737430167, + "grad_norm": 1.0916534662246704, + "learning_rate": 0.00015358543417366946, + "loss": 0.4097, + "step": 30331 + }, + { + "epoch": 16.945251396648043, + "grad_norm": 1.9058390855789185, + "learning_rate": 0.0001535574229691877, + "loss": 0.5161, + "step": 30332 + }, + { + "epoch": 16.945810055865923, + "grad_norm": 0.44394323229789734, + "learning_rate": 0.0001535294117647059, + "loss": 0.4215, + "step": 30333 + }, + { + "epoch": 16.9463687150838, + "grad_norm": 0.33620184659957886, + "learning_rate": 0.0001535014005602241, + "loss": 0.3181, + "step": 30334 + }, + { + "epoch": 16.946927374301676, + "grad_norm": 0.7556557655334473, + "learning_rate": 0.00015347338935574228, + "loss": 0.4124, + "step": 30335 + }, + { + "epoch": 16.947486033519553, + "grad_norm": 0.6048688292503357, + "learning_rate": 0.00015344537815126051, + "loss": 0.409, + "step": 30336 + }, + { + "epoch": 16.94804469273743, + "grad_norm": 0.983925461769104, + "learning_rate": 0.00015341736694677872, + "loss": 0.3841, + "step": 30337 + }, + { + "epoch": 16.948603351955306, + "grad_norm": 0.5724963545799255, + "learning_rate": 0.00015338935574229693, + "loss": 0.4612, + "step": 30338 + }, + { + "epoch": 16.949162011173183, + "grad_norm": 0.45660504698753357, + "learning_rate": 0.00015336134453781513, + "loss": 0.2795, + "step": 30339 + }, + { + "epoch": 16.949720670391063, + "grad_norm": 0.40456148982048035, + "learning_rate": 0.00015333333333333334, + "loss": 0.5845, + "step": 30340 + }, + { + "epoch": 16.95027932960894, + "grad_norm": 0.5004631876945496, + "learning_rate": 0.00015330532212885154, + "loss": 0.3564, + "step": 30341 + }, + { + "epoch": 16.950837988826816, + "grad_norm": 0.3486461937427521, + "learning_rate": 0.00015327731092436975, + "loss": 0.3205, + "step": 30342 + }, + { + "epoch": 16.951396648044692, + "grad_norm": 0.7220545411109924, + "learning_rate": 0.00015324929971988796, + "loss": 0.4924, + "step": 30343 + }, + { + "epoch": 16.95195530726257, + "grad_norm": 5.430187702178955, + "learning_rate": 0.0001532212885154062, + "loss": 0.4761, + "step": 30344 + }, + { + "epoch": 16.952513966480446, + "grad_norm": 0.47808152437210083, + "learning_rate": 0.00015319327731092437, + "loss": 0.4208, + "step": 30345 + }, + { + "epoch": 16.953072625698326, + "grad_norm": 0.5832805633544922, + "learning_rate": 0.00015316526610644257, + "loss": 0.2934, + "step": 30346 + }, + { + "epoch": 16.953631284916202, + "grad_norm": 0.5149906873703003, + "learning_rate": 0.00015313725490196078, + "loss": 0.3609, + "step": 30347 + }, + { + "epoch": 16.95418994413408, + "grad_norm": 0.5477957725524902, + "learning_rate": 0.000153109243697479, + "loss": 0.4756, + "step": 30348 + }, + { + "epoch": 16.954748603351955, + "grad_norm": 0.7830228805541992, + "learning_rate": 0.00015308123249299722, + "loss": 0.3902, + "step": 30349 + }, + { + "epoch": 16.955307262569832, + "grad_norm": 0.4080927073955536, + "learning_rate": 0.0001530532212885154, + "loss": 0.4187, + "step": 30350 + }, + { + "epoch": 16.95586592178771, + "grad_norm": 0.5628142356872559, + "learning_rate": 0.0001530252100840336, + "loss": 0.4375, + "step": 30351 + }, + { + "epoch": 16.956424581005585, + "grad_norm": 0.5363519787788391, + "learning_rate": 0.00015299719887955184, + "loss": 0.4159, + "step": 30352 + }, + { + "epoch": 16.956983240223465, + "grad_norm": 0.5226016044616699, + "learning_rate": 0.00015296918767507004, + "loss": 0.407, + "step": 30353 + }, + { + "epoch": 16.957541899441342, + "grad_norm": 0.4318486452102661, + "learning_rate": 0.00015294117647058825, + "loss": 0.3876, + "step": 30354 + }, + { + "epoch": 16.95810055865922, + "grad_norm": 0.5389750599861145, + "learning_rate": 0.00015291316526610643, + "loss": 0.4849, + "step": 30355 + }, + { + "epoch": 16.958659217877095, + "grad_norm": 0.45806819200515747, + "learning_rate": 0.00015288515406162466, + "loss": 0.3435, + "step": 30356 + }, + { + "epoch": 16.95921787709497, + "grad_norm": 5.623819828033447, + "learning_rate": 0.00015285714285714287, + "loss": 0.4569, + "step": 30357 + }, + { + "epoch": 16.959776536312848, + "grad_norm": 0.4806057810783386, + "learning_rate": 0.00015282913165266107, + "loss": 0.4605, + "step": 30358 + }, + { + "epoch": 16.960335195530725, + "grad_norm": 0.49418771266937256, + "learning_rate": 0.00015280112044817928, + "loss": 0.3411, + "step": 30359 + }, + { + "epoch": 16.960893854748605, + "grad_norm": 0.532681405544281, + "learning_rate": 0.00015277310924369748, + "loss": 0.4761, + "step": 30360 + }, + { + "epoch": 16.96145251396648, + "grad_norm": 0.3914929926395416, + "learning_rate": 0.0001527450980392157, + "loss": 0.3678, + "step": 30361 + }, + { + "epoch": 16.962011173184358, + "grad_norm": 0.3841231167316437, + "learning_rate": 0.0001527170868347339, + "loss": 0.3611, + "step": 30362 + }, + { + "epoch": 16.962569832402234, + "grad_norm": 0.6541624069213867, + "learning_rate": 0.0001526890756302521, + "loss": 0.3907, + "step": 30363 + }, + { + "epoch": 16.96312849162011, + "grad_norm": 0.46786296367645264, + "learning_rate": 0.00015266106442577034, + "loss": 0.4288, + "step": 30364 + }, + { + "epoch": 16.963687150837988, + "grad_norm": 0.5675880312919617, + "learning_rate": 0.00015263305322128851, + "loss": 0.4288, + "step": 30365 + }, + { + "epoch": 16.964245810055864, + "grad_norm": 0.38062313199043274, + "learning_rate": 0.00015260504201680672, + "loss": 0.4316, + "step": 30366 + }, + { + "epoch": 16.964804469273744, + "grad_norm": 0.7660247087478638, + "learning_rate": 0.00015257703081232493, + "loss": 0.4691, + "step": 30367 + }, + { + "epoch": 16.96536312849162, + "grad_norm": 0.272076815366745, + "learning_rate": 0.00015254901960784316, + "loss": 0.3178, + "step": 30368 + }, + { + "epoch": 16.965921787709497, + "grad_norm": 0.7283949255943298, + "learning_rate": 0.00015252100840336137, + "loss": 0.4403, + "step": 30369 + }, + { + "epoch": 16.966480446927374, + "grad_norm": 0.391897588968277, + "learning_rate": 0.00015249299719887954, + "loss": 0.3679, + "step": 30370 + }, + { + "epoch": 16.96703910614525, + "grad_norm": 0.410977303981781, + "learning_rate": 0.00015246498599439775, + "loss": 0.3527, + "step": 30371 + }, + { + "epoch": 16.967597765363127, + "grad_norm": 0.48569798469543457, + "learning_rate": 0.00015243697478991598, + "loss": 0.3172, + "step": 30372 + }, + { + "epoch": 16.968156424581007, + "grad_norm": 0.6881005764007568, + "learning_rate": 0.0001524089635854342, + "loss": 0.3689, + "step": 30373 + }, + { + "epoch": 16.968715083798884, + "grad_norm": 0.5051879286766052, + "learning_rate": 0.0001523809523809524, + "loss": 0.4217, + "step": 30374 + }, + { + "epoch": 16.96927374301676, + "grad_norm": 0.7523027658462524, + "learning_rate": 0.00015235294117647057, + "loss": 0.4059, + "step": 30375 + }, + { + "epoch": 16.969832402234637, + "grad_norm": 0.3337520956993103, + "learning_rate": 0.0001523249299719888, + "loss": 0.4102, + "step": 30376 + }, + { + "epoch": 16.970391061452514, + "grad_norm": 0.6148476600646973, + "learning_rate": 0.00015229691876750701, + "loss": 0.3719, + "step": 30377 + }, + { + "epoch": 16.97094972067039, + "grad_norm": 0.5315216779708862, + "learning_rate": 0.00015226890756302522, + "loss": 0.4742, + "step": 30378 + }, + { + "epoch": 16.971508379888267, + "grad_norm": 0.42165207862854004, + "learning_rate": 0.0001522408963585434, + "loss": 0.389, + "step": 30379 + }, + { + "epoch": 16.972067039106147, + "grad_norm": 0.5636337399482727, + "learning_rate": 0.00015221288515406163, + "loss": 0.4001, + "step": 30380 + }, + { + "epoch": 16.972625698324023, + "grad_norm": 0.4556921124458313, + "learning_rate": 0.00015218487394957984, + "loss": 0.374, + "step": 30381 + }, + { + "epoch": 16.9731843575419, + "grad_norm": 0.5564090609550476, + "learning_rate": 0.00015215686274509804, + "loss": 0.4175, + "step": 30382 + }, + { + "epoch": 16.973743016759776, + "grad_norm": 0.40993061661720276, + "learning_rate": 0.00015212885154061625, + "loss": 0.426, + "step": 30383 + }, + { + "epoch": 16.974301675977653, + "grad_norm": 0.4953514337539673, + "learning_rate": 0.00015210084033613446, + "loss": 0.4035, + "step": 30384 + }, + { + "epoch": 16.97486033519553, + "grad_norm": 0.45152053236961365, + "learning_rate": 0.00015207282913165266, + "loss": 0.3084, + "step": 30385 + }, + { + "epoch": 16.975418994413406, + "grad_norm": 1.408462643623352, + "learning_rate": 0.00015204481792717087, + "loss": 0.4266, + "step": 30386 + }, + { + "epoch": 16.975977653631286, + "grad_norm": 2.6643075942993164, + "learning_rate": 0.00015201680672268907, + "loss": 0.3615, + "step": 30387 + }, + { + "epoch": 16.976536312849163, + "grad_norm": 0.40733802318573, + "learning_rate": 0.0001519887955182073, + "loss": 0.3587, + "step": 30388 + }, + { + "epoch": 16.97709497206704, + "grad_norm": 0.4195544123649597, + "learning_rate": 0.00015196078431372549, + "loss": 0.3177, + "step": 30389 + }, + { + "epoch": 16.977653631284916, + "grad_norm": 0.48221084475517273, + "learning_rate": 0.0001519327731092437, + "loss": 0.5664, + "step": 30390 + }, + { + "epoch": 16.978212290502793, + "grad_norm": 0.494863361120224, + "learning_rate": 0.0001519047619047619, + "loss": 0.4674, + "step": 30391 + }, + { + "epoch": 16.97877094972067, + "grad_norm": 0.303216814994812, + "learning_rate": 0.00015187675070028013, + "loss": 0.3052, + "step": 30392 + }, + { + "epoch": 16.97932960893855, + "grad_norm": 0.5198042392730713, + "learning_rate": 0.00015184873949579834, + "loss": 0.3815, + "step": 30393 + }, + { + "epoch": 16.979888268156426, + "grad_norm": 0.5553575158119202, + "learning_rate": 0.00015182072829131652, + "loss": 0.3854, + "step": 30394 + }, + { + "epoch": 16.980446927374302, + "grad_norm": 0.5964141488075256, + "learning_rate": 0.00015179271708683472, + "loss": 0.4508, + "step": 30395 + }, + { + "epoch": 16.98100558659218, + "grad_norm": 0.44770148396492004, + "learning_rate": 0.00015176470588235295, + "loss": 0.4239, + "step": 30396 + }, + { + "epoch": 16.981564245810056, + "grad_norm": 0.43785104155540466, + "learning_rate": 0.00015173669467787116, + "loss": 0.3668, + "step": 30397 + }, + { + "epoch": 16.982122905027932, + "grad_norm": 0.4443635046482086, + "learning_rate": 0.00015170868347338937, + "loss": 0.4358, + "step": 30398 + }, + { + "epoch": 16.98268156424581, + "grad_norm": 0.5695051550865173, + "learning_rate": 0.00015168067226890755, + "loss": 0.4331, + "step": 30399 + }, + { + "epoch": 16.98324022346369, + "grad_norm": 0.49616923928260803, + "learning_rate": 0.00015165266106442578, + "loss": 0.3587, + "step": 30400 + }, + { + "epoch": 16.983798882681565, + "grad_norm": 0.4713943898677826, + "learning_rate": 0.00015162464985994398, + "loss": 0.3646, + "step": 30401 + }, + { + "epoch": 16.984357541899442, + "grad_norm": 0.45692238211631775, + "learning_rate": 0.0001515966386554622, + "loss": 0.3509, + "step": 30402 + }, + { + "epoch": 16.98491620111732, + "grad_norm": 0.3453557789325714, + "learning_rate": 0.0001515686274509804, + "loss": 0.3372, + "step": 30403 + }, + { + "epoch": 16.985474860335195, + "grad_norm": 0.789562463760376, + "learning_rate": 0.0001515406162464986, + "loss": 0.4327, + "step": 30404 + }, + { + "epoch": 16.98603351955307, + "grad_norm": 0.3741300404071808, + "learning_rate": 0.0001515126050420168, + "loss": 0.3358, + "step": 30405 + }, + { + "epoch": 16.986592178770948, + "grad_norm": 0.5981579422950745, + "learning_rate": 0.00015148459383753501, + "loss": 0.3589, + "step": 30406 + }, + { + "epoch": 16.98715083798883, + "grad_norm": 0.4594210088253021, + "learning_rate": 0.00015145658263305322, + "loss": 0.4083, + "step": 30407 + }, + { + "epoch": 16.987709497206705, + "grad_norm": 0.5064340233802795, + "learning_rate": 0.00015142857142857145, + "loss": 0.4726, + "step": 30408 + }, + { + "epoch": 16.98826815642458, + "grad_norm": 8.44332218170166, + "learning_rate": 0.00015140056022408963, + "loss": 0.4203, + "step": 30409 + }, + { + "epoch": 16.988826815642458, + "grad_norm": 0.5397350788116455, + "learning_rate": 0.00015137254901960784, + "loss": 0.4103, + "step": 30410 + }, + { + "epoch": 16.989385474860335, + "grad_norm": 0.5714200735092163, + "learning_rate": 0.00015134453781512604, + "loss": 0.5161, + "step": 30411 + }, + { + "epoch": 16.98994413407821, + "grad_norm": 0.8399409651756287, + "learning_rate": 0.00015131652661064428, + "loss": 0.4342, + "step": 30412 + }, + { + "epoch": 16.990502793296088, + "grad_norm": 1.0706945657730103, + "learning_rate": 0.00015128851540616248, + "loss": 0.3302, + "step": 30413 + }, + { + "epoch": 16.991061452513968, + "grad_norm": 0.47315743565559387, + "learning_rate": 0.00015126050420168066, + "loss": 0.4044, + "step": 30414 + }, + { + "epoch": 16.991620111731844, + "grad_norm": 0.5722789764404297, + "learning_rate": 0.00015123249299719887, + "loss": 0.4494, + "step": 30415 + }, + { + "epoch": 16.99217877094972, + "grad_norm": 0.5723875761032104, + "learning_rate": 0.0001512044817927171, + "loss": 0.4076, + "step": 30416 + }, + { + "epoch": 16.992737430167598, + "grad_norm": 0.6388134360313416, + "learning_rate": 0.0001511764705882353, + "loss": 0.4836, + "step": 30417 + }, + { + "epoch": 16.993296089385474, + "grad_norm": 0.40612682700157166, + "learning_rate": 0.00015114845938375351, + "loss": 0.4231, + "step": 30418 + }, + { + "epoch": 16.99385474860335, + "grad_norm": 0.7671379446983337, + "learning_rate": 0.0001511204481792717, + "loss": 0.4081, + "step": 30419 + }, + { + "epoch": 16.994413407821227, + "grad_norm": 0.45651549100875854, + "learning_rate": 0.00015109243697478993, + "loss": 0.3477, + "step": 30420 + }, + { + "epoch": 16.994972067039107, + "grad_norm": 0.5423356890678406, + "learning_rate": 0.00015106442577030813, + "loss": 0.4501, + "step": 30421 + }, + { + "epoch": 16.995530726256984, + "grad_norm": 0.42410463094711304, + "learning_rate": 0.00015103641456582634, + "loss": 0.3896, + "step": 30422 + }, + { + "epoch": 16.99608938547486, + "grad_norm": 0.4299747347831726, + "learning_rate": 0.00015100840336134454, + "loss": 0.4271, + "step": 30423 + }, + { + "epoch": 16.996648044692737, + "grad_norm": 0.41651228070259094, + "learning_rate": 0.00015098039215686275, + "loss": 0.3995, + "step": 30424 + }, + { + "epoch": 16.997206703910614, + "grad_norm": 0.44958311319351196, + "learning_rate": 0.00015095238095238096, + "loss": 0.4121, + "step": 30425 + }, + { + "epoch": 16.99776536312849, + "grad_norm": 0.46983492374420166, + "learning_rate": 0.00015092436974789916, + "loss": 0.4017, + "step": 30426 + }, + { + "epoch": 16.99832402234637, + "grad_norm": 0.5891770124435425, + "learning_rate": 0.00015089635854341737, + "loss": 0.458, + "step": 30427 + }, + { + "epoch": 16.998882681564247, + "grad_norm": 0.5173524022102356, + "learning_rate": 0.0001508683473389356, + "loss": 0.5606, + "step": 30428 + }, + { + "epoch": 16.999441340782123, + "grad_norm": 1.2778003215789795, + "learning_rate": 0.00015084033613445378, + "loss": 0.4813, + "step": 30429 + }, + { + "epoch": 17.0, + "grad_norm": 0.662997841835022, + "learning_rate": 0.00015081232492997199, + "loss": 0.3612, + "step": 30430 + }, + { + "epoch": 17.000558659217877, + "grad_norm": 0.36778971552848816, + "learning_rate": 0.0001507843137254902, + "loss": 0.2869, + "step": 30431 + }, + { + "epoch": 17.001117318435753, + "grad_norm": 0.5440883636474609, + "learning_rate": 0.00015075630252100843, + "loss": 0.4328, + "step": 30432 + }, + { + "epoch": 17.00167597765363, + "grad_norm": 0.33694374561309814, + "learning_rate": 0.0001507282913165266, + "loss": 0.4027, + "step": 30433 + }, + { + "epoch": 17.00223463687151, + "grad_norm": 0.5987569093704224, + "learning_rate": 0.0001507002801120448, + "loss": 0.3475, + "step": 30434 + }, + { + "epoch": 17.002793296089386, + "grad_norm": 0.39774081110954285, + "learning_rate": 0.00015067226890756302, + "loss": 0.3922, + "step": 30435 + }, + { + "epoch": 17.003351955307263, + "grad_norm": 2.299609661102295, + "learning_rate": 0.00015064425770308125, + "loss": 0.3822, + "step": 30436 + }, + { + "epoch": 17.00391061452514, + "grad_norm": 4.180192470550537, + "learning_rate": 0.00015061624649859945, + "loss": 0.4177, + "step": 30437 + }, + { + "epoch": 17.004469273743016, + "grad_norm": 0.46989870071411133, + "learning_rate": 0.00015058823529411763, + "loss": 0.3707, + "step": 30438 + }, + { + "epoch": 17.005027932960893, + "grad_norm": 0.37000805139541626, + "learning_rate": 0.00015056022408963584, + "loss": 0.3177, + "step": 30439 + }, + { + "epoch": 17.00558659217877, + "grad_norm": 0.44988906383514404, + "learning_rate": 0.00015053221288515407, + "loss": 0.3763, + "step": 30440 + }, + { + "epoch": 17.00614525139665, + "grad_norm": 0.4812779128551483, + "learning_rate": 0.00015050420168067228, + "loss": 0.3948, + "step": 30441 + }, + { + "epoch": 17.006703910614526, + "grad_norm": 0.7407799363136292, + "learning_rate": 0.00015047619047619048, + "loss": 0.3995, + "step": 30442 + }, + { + "epoch": 17.007262569832402, + "grad_norm": 0.3529624044895172, + "learning_rate": 0.00015044817927170866, + "loss": 0.385, + "step": 30443 + }, + { + "epoch": 17.00782122905028, + "grad_norm": 0.3553699254989624, + "learning_rate": 0.0001504201680672269, + "loss": 0.4685, + "step": 30444 + }, + { + "epoch": 17.008379888268156, + "grad_norm": 0.4063166081905365, + "learning_rate": 0.0001503921568627451, + "loss": 0.4675, + "step": 30445 + }, + { + "epoch": 17.008938547486032, + "grad_norm": 1.3298100233078003, + "learning_rate": 0.0001503641456582633, + "loss": 0.3456, + "step": 30446 + }, + { + "epoch": 17.009497206703912, + "grad_norm": 3.5663909912109375, + "learning_rate": 0.00015033613445378151, + "loss": 0.4187, + "step": 30447 + }, + { + "epoch": 17.01005586592179, + "grad_norm": 1.5872750282287598, + "learning_rate": 0.00015030812324929972, + "loss": 0.353, + "step": 30448 + }, + { + "epoch": 17.010614525139665, + "grad_norm": 0.4045100510120392, + "learning_rate": 0.00015028011204481793, + "loss": 0.2705, + "step": 30449 + }, + { + "epoch": 17.011173184357542, + "grad_norm": 1.197712779045105, + "learning_rate": 0.00015025210084033613, + "loss": 0.5144, + "step": 30450 + }, + { + "epoch": 17.01173184357542, + "grad_norm": 0.615078330039978, + "learning_rate": 0.00015022408963585434, + "loss": 0.4217, + "step": 30451 + }, + { + "epoch": 17.012290502793295, + "grad_norm": 0.7409213781356812, + "learning_rate": 0.00015019607843137257, + "loss": 0.3797, + "step": 30452 + }, + { + "epoch": 17.01284916201117, + "grad_norm": 3.4061059951782227, + "learning_rate": 0.00015016806722689075, + "loss": 0.5353, + "step": 30453 + }, + { + "epoch": 17.013407821229052, + "grad_norm": 0.5427066087722778, + "learning_rate": 0.00015014005602240896, + "loss": 0.3286, + "step": 30454 + }, + { + "epoch": 17.01396648044693, + "grad_norm": 8.338967323303223, + "learning_rate": 0.00015011204481792716, + "loss": 0.4225, + "step": 30455 + }, + { + "epoch": 17.014525139664805, + "grad_norm": 0.6157800555229187, + "learning_rate": 0.0001500840336134454, + "loss": 0.4572, + "step": 30456 + }, + { + "epoch": 17.01508379888268, + "grad_norm": 1.7625092267990112, + "learning_rate": 0.0001500560224089636, + "loss": 0.4084, + "step": 30457 + }, + { + "epoch": 17.015642458100558, + "grad_norm": 0.6848914623260498, + "learning_rate": 0.00015002801120448178, + "loss": 0.4316, + "step": 30458 + }, + { + "epoch": 17.016201117318435, + "grad_norm": 1.7069450616836548, + "learning_rate": 0.00015, + "loss": 0.3977, + "step": 30459 + }, + { + "epoch": 17.01675977653631, + "grad_norm": 0.8589963912963867, + "learning_rate": 0.00014997198879551822, + "loss": 0.4355, + "step": 30460 + }, + { + "epoch": 17.01731843575419, + "grad_norm": 0.471879243850708, + "learning_rate": 0.00014994397759103643, + "loss": 0.3755, + "step": 30461 + }, + { + "epoch": 17.017877094972068, + "grad_norm": 4.113809585571289, + "learning_rate": 0.00014991596638655463, + "loss": 0.507, + "step": 30462 + }, + { + "epoch": 17.018435754189944, + "grad_norm": 0.3370513617992401, + "learning_rate": 0.0001498879551820728, + "loss": 0.3231, + "step": 30463 + }, + { + "epoch": 17.01899441340782, + "grad_norm": 0.3638036847114563, + "learning_rate": 0.00014985994397759104, + "loss": 0.31, + "step": 30464 + }, + { + "epoch": 17.019553072625698, + "grad_norm": 0.3504454493522644, + "learning_rate": 0.00014983193277310925, + "loss": 0.3139, + "step": 30465 + }, + { + "epoch": 17.020111731843574, + "grad_norm": 0.3819427192211151, + "learning_rate": 0.00014980392156862746, + "loss": 0.3707, + "step": 30466 + }, + { + "epoch": 17.02067039106145, + "grad_norm": 0.5636350512504578, + "learning_rate": 0.00014977591036414566, + "loss": 0.3306, + "step": 30467 + }, + { + "epoch": 17.02122905027933, + "grad_norm": 1.5958689451217651, + "learning_rate": 0.00014974789915966387, + "loss": 0.4532, + "step": 30468 + }, + { + "epoch": 17.021787709497207, + "grad_norm": 0.3493449091911316, + "learning_rate": 0.00014971988795518207, + "loss": 0.422, + "step": 30469 + }, + { + "epoch": 17.022346368715084, + "grad_norm": 2.130363702774048, + "learning_rate": 0.00014969187675070028, + "loss": 0.633, + "step": 30470 + }, + { + "epoch": 17.02290502793296, + "grad_norm": 0.5868925452232361, + "learning_rate": 0.00014966386554621849, + "loss": 0.4658, + "step": 30471 + }, + { + "epoch": 17.023463687150837, + "grad_norm": 0.40896233916282654, + "learning_rate": 0.00014963585434173672, + "loss": 0.3636, + "step": 30472 + }, + { + "epoch": 17.024022346368714, + "grad_norm": 0.3784361779689789, + "learning_rate": 0.0001496078431372549, + "loss": 0.3997, + "step": 30473 + }, + { + "epoch": 17.024581005586594, + "grad_norm": 0.5266556143760681, + "learning_rate": 0.0001495798319327731, + "loss": 0.4098, + "step": 30474 + }, + { + "epoch": 17.02513966480447, + "grad_norm": 0.4023912847042084, + "learning_rate": 0.0001495518207282913, + "loss": 0.4364, + "step": 30475 + }, + { + "epoch": 17.025698324022347, + "grad_norm": 1.3155419826507568, + "learning_rate": 0.00014952380952380954, + "loss": 0.4446, + "step": 30476 + }, + { + "epoch": 17.026256983240224, + "grad_norm": 0.3635464906692505, + "learning_rate": 0.00014949579831932775, + "loss": 0.4578, + "step": 30477 + }, + { + "epoch": 17.0268156424581, + "grad_norm": 1.4340434074401855, + "learning_rate": 0.00014946778711484593, + "loss": 0.3187, + "step": 30478 + }, + { + "epoch": 17.027374301675977, + "grad_norm": 0.6198726892471313, + "learning_rate": 0.00014943977591036413, + "loss": 0.4259, + "step": 30479 + }, + { + "epoch": 17.027932960893853, + "grad_norm": 1.1278038024902344, + "learning_rate": 0.00014941176470588237, + "loss": 0.3197, + "step": 30480 + }, + { + "epoch": 17.028491620111733, + "grad_norm": 0.46070167422294617, + "learning_rate": 0.00014938375350140057, + "loss": 0.3955, + "step": 30481 + }, + { + "epoch": 17.02905027932961, + "grad_norm": 0.5326356291770935, + "learning_rate": 0.00014935574229691878, + "loss": 0.4687, + "step": 30482 + }, + { + "epoch": 17.029608938547486, + "grad_norm": 0.42663297057151794, + "learning_rate": 0.00014932773109243696, + "loss": 0.4572, + "step": 30483 + }, + { + "epoch": 17.030167597765363, + "grad_norm": 0.36611148715019226, + "learning_rate": 0.0001492997198879552, + "loss": 0.3564, + "step": 30484 + }, + { + "epoch": 17.03072625698324, + "grad_norm": 0.4728602468967438, + "learning_rate": 0.0001492717086834734, + "loss": 0.4383, + "step": 30485 + }, + { + "epoch": 17.031284916201116, + "grad_norm": 0.4630252718925476, + "learning_rate": 0.0001492436974789916, + "loss": 0.3734, + "step": 30486 + }, + { + "epoch": 17.031843575418993, + "grad_norm": 0.4830436110496521, + "learning_rate": 0.00014921568627450978, + "loss": 0.4712, + "step": 30487 + }, + { + "epoch": 17.032402234636873, + "grad_norm": 0.3279505670070648, + "learning_rate": 0.00014918767507002801, + "loss": 0.2992, + "step": 30488 + }, + { + "epoch": 17.03296089385475, + "grad_norm": 0.4427335858345032, + "learning_rate": 0.00014915966386554622, + "loss": 0.4434, + "step": 30489 + }, + { + "epoch": 17.033519553072626, + "grad_norm": 0.5309723019599915, + "learning_rate": 0.00014913165266106443, + "loss": 0.4698, + "step": 30490 + }, + { + "epoch": 17.034078212290503, + "grad_norm": 0.5725134611129761, + "learning_rate": 0.00014910364145658263, + "loss": 0.4241, + "step": 30491 + }, + { + "epoch": 17.03463687150838, + "grad_norm": 14.767130851745605, + "learning_rate": 0.00014907563025210084, + "loss": 0.5801, + "step": 30492 + }, + { + "epoch": 17.035195530726256, + "grad_norm": 0.3527759611606598, + "learning_rate": 0.00014904761904761904, + "loss": 0.3567, + "step": 30493 + }, + { + "epoch": 17.035754189944136, + "grad_norm": 0.3585748076438904, + "learning_rate": 0.00014901960784313725, + "loss": 0.3253, + "step": 30494 + }, + { + "epoch": 17.036312849162012, + "grad_norm": 0.6002820730209351, + "learning_rate": 0.00014899159663865546, + "loss": 0.3623, + "step": 30495 + }, + { + "epoch": 17.03687150837989, + "grad_norm": 0.6385857462882996, + "learning_rate": 0.0001489635854341737, + "loss": 0.4929, + "step": 30496 + }, + { + "epoch": 17.037430167597766, + "grad_norm": 1.726499080657959, + "learning_rate": 0.00014893557422969187, + "loss": 0.4868, + "step": 30497 + }, + { + "epoch": 17.037988826815642, + "grad_norm": 0.41995203495025635, + "learning_rate": 0.00014890756302521007, + "loss": 0.3407, + "step": 30498 + }, + { + "epoch": 17.03854748603352, + "grad_norm": 0.5569109320640564, + "learning_rate": 0.00014887955182072828, + "loss": 0.5235, + "step": 30499 + }, + { + "epoch": 17.039106145251395, + "grad_norm": 2.3928635120391846, + "learning_rate": 0.00014885154061624651, + "loss": 0.5613, + "step": 30500 + }, + { + "epoch": 17.039106145251395, + "eval_cer": 0.08478718645325303, + "eval_loss": 0.32105767726898193, + "eval_runtime": 55.2041, + "eval_samples_per_second": 82.204, + "eval_steps_per_second": 5.145, + "eval_wer": 0.3347113209128237, + "step": 30500 + }, + { + "epoch": 17.039664804469275, + "grad_norm": 0.4282025992870331, + "learning_rate": 0.00014882352941176472, + "loss": 0.3997, + "step": 30501 + }, + { + "epoch": 17.040223463687152, + "grad_norm": 1.9419450759887695, + "learning_rate": 0.0001487955182072829, + "loss": 0.3019, + "step": 30502 + }, + { + "epoch": 17.04078212290503, + "grad_norm": 1.7139935493469238, + "learning_rate": 0.0001487675070028011, + "loss": 0.6062, + "step": 30503 + }, + { + "epoch": 17.041340782122905, + "grad_norm": 0.3762113153934479, + "learning_rate": 0.00014873949579831934, + "loss": 0.3172, + "step": 30504 + }, + { + "epoch": 17.04189944134078, + "grad_norm": 0.3709942698478699, + "learning_rate": 0.00014871148459383754, + "loss": 0.3543, + "step": 30505 + }, + { + "epoch": 17.042458100558658, + "grad_norm": 0.5424813032150269, + "learning_rate": 0.00014868347338935575, + "loss": 0.3489, + "step": 30506 + }, + { + "epoch": 17.043016759776535, + "grad_norm": 0.35648950934410095, + "learning_rate": 0.00014865546218487393, + "loss": 0.3092, + "step": 30507 + }, + { + "epoch": 17.043575418994415, + "grad_norm": 0.5092266798019409, + "learning_rate": 0.00014862745098039216, + "loss": 0.404, + "step": 30508 + }, + { + "epoch": 17.04413407821229, + "grad_norm": 0.8773375153541565, + "learning_rate": 0.00014859943977591037, + "loss": 0.4841, + "step": 30509 + }, + { + "epoch": 17.044692737430168, + "grad_norm": 0.4752204120159149, + "learning_rate": 0.00014857142857142857, + "loss": 0.3815, + "step": 30510 + }, + { + "epoch": 17.045251396648045, + "grad_norm": 0.7021821141242981, + "learning_rate": 0.00014854341736694678, + "loss": 0.4805, + "step": 30511 + }, + { + "epoch": 17.04581005586592, + "grad_norm": 0.40230700373649597, + "learning_rate": 0.00014851540616246499, + "loss": 0.4182, + "step": 30512 + }, + { + "epoch": 17.046368715083798, + "grad_norm": 0.5683800578117371, + "learning_rate": 0.0001484873949579832, + "loss": 0.351, + "step": 30513 + }, + { + "epoch": 17.046927374301674, + "grad_norm": 0.4263964295387268, + "learning_rate": 0.0001484593837535014, + "loss": 0.4341, + "step": 30514 + }, + { + "epoch": 17.047486033519554, + "grad_norm": 0.5452007055282593, + "learning_rate": 0.0001484313725490196, + "loss": 0.4949, + "step": 30515 + }, + { + "epoch": 17.04804469273743, + "grad_norm": 0.40486037731170654, + "learning_rate": 0.00014840336134453784, + "loss": 0.3668, + "step": 30516 + }, + { + "epoch": 17.048603351955308, + "grad_norm": 0.4338291883468628, + "learning_rate": 0.00014837535014005602, + "loss": 0.4316, + "step": 30517 + }, + { + "epoch": 17.049162011173184, + "grad_norm": 0.4577411711215973, + "learning_rate": 0.00014834733893557422, + "loss": 0.3992, + "step": 30518 + }, + { + "epoch": 17.04972067039106, + "grad_norm": 0.37868183851242065, + "learning_rate": 0.00014831932773109243, + "loss": 0.4336, + "step": 30519 + }, + { + "epoch": 17.050279329608937, + "grad_norm": 0.3750998079776764, + "learning_rate": 0.00014829131652661066, + "loss": 0.3244, + "step": 30520 + }, + { + "epoch": 17.050837988826817, + "grad_norm": 0.5478485226631165, + "learning_rate": 0.00014826330532212887, + "loss": 0.5612, + "step": 30521 + }, + { + "epoch": 17.051396648044694, + "grad_norm": 0.4271056056022644, + "learning_rate": 0.00014823529411764705, + "loss": 0.4541, + "step": 30522 + }, + { + "epoch": 17.05195530726257, + "grad_norm": 0.9498957395553589, + "learning_rate": 0.00014820728291316525, + "loss": 0.38, + "step": 30523 + }, + { + "epoch": 17.052513966480447, + "grad_norm": 0.38722601532936096, + "learning_rate": 0.00014817927170868348, + "loss": 0.3858, + "step": 30524 + }, + { + "epoch": 17.053072625698324, + "grad_norm": 0.3773421049118042, + "learning_rate": 0.0001481512605042017, + "loss": 0.391, + "step": 30525 + }, + { + "epoch": 17.0536312849162, + "grad_norm": 2.046898365020752, + "learning_rate": 0.0001481232492997199, + "loss": 0.4374, + "step": 30526 + }, + { + "epoch": 17.054189944134077, + "grad_norm": 0.44194281101226807, + "learning_rate": 0.00014809523809523808, + "loss": 0.4443, + "step": 30527 + }, + { + "epoch": 17.054748603351957, + "grad_norm": 0.961051344871521, + "learning_rate": 0.0001480672268907563, + "loss": 0.5131, + "step": 30528 + }, + { + "epoch": 17.055307262569833, + "grad_norm": 0.498801052570343, + "learning_rate": 0.00014803921568627451, + "loss": 0.506, + "step": 30529 + }, + { + "epoch": 17.05586592178771, + "grad_norm": 0.5123061537742615, + "learning_rate": 0.00014801120448179272, + "loss": 0.4112, + "step": 30530 + }, + { + "epoch": 17.056424581005587, + "grad_norm": 0.38034579157829285, + "learning_rate": 0.00014798319327731093, + "loss": 0.4355, + "step": 30531 + }, + { + "epoch": 17.056983240223463, + "grad_norm": 0.38187503814697266, + "learning_rate": 0.00014795518207282913, + "loss": 0.4039, + "step": 30532 + }, + { + "epoch": 17.05754189944134, + "grad_norm": 0.43583863973617554, + "learning_rate": 0.00014792717086834734, + "loss": 0.581, + "step": 30533 + }, + { + "epoch": 17.058100558659216, + "grad_norm": 0.3752821981906891, + "learning_rate": 0.00014789915966386554, + "loss": 0.4265, + "step": 30534 + }, + { + "epoch": 17.058659217877096, + "grad_norm": 2.105147361755371, + "learning_rate": 0.00014787114845938375, + "loss": 0.4563, + "step": 30535 + }, + { + "epoch": 17.059217877094973, + "grad_norm": 0.5207562446594238, + "learning_rate": 0.00014784313725490198, + "loss": 0.3956, + "step": 30536 + }, + { + "epoch": 17.05977653631285, + "grad_norm": 0.627870500087738, + "learning_rate": 0.00014781512605042016, + "loss": 0.516, + "step": 30537 + }, + { + "epoch": 17.060335195530726, + "grad_norm": 0.4861336052417755, + "learning_rate": 0.00014778711484593837, + "loss": 0.4406, + "step": 30538 + }, + { + "epoch": 17.060893854748603, + "grad_norm": 3.0468103885650635, + "learning_rate": 0.00014775910364145657, + "loss": 0.4418, + "step": 30539 + }, + { + "epoch": 17.06145251396648, + "grad_norm": 0.49870914220809937, + "learning_rate": 0.0001477310924369748, + "loss": 0.4393, + "step": 30540 + }, + { + "epoch": 17.062011173184356, + "grad_norm": 2.6195478439331055, + "learning_rate": 0.00014770308123249301, + "loss": 0.4129, + "step": 30541 + }, + { + "epoch": 17.062569832402236, + "grad_norm": 0.5501329302787781, + "learning_rate": 0.0001476750700280112, + "loss": 0.321, + "step": 30542 + }, + { + "epoch": 17.063128491620112, + "grad_norm": 0.3972786068916321, + "learning_rate": 0.0001476470588235294, + "loss": 0.3804, + "step": 30543 + }, + { + "epoch": 17.06368715083799, + "grad_norm": 0.5347886085510254, + "learning_rate": 0.00014761904761904763, + "loss": 0.4034, + "step": 30544 + }, + { + "epoch": 17.064245810055866, + "grad_norm": 0.36315909028053284, + "learning_rate": 0.00014759103641456584, + "loss": 0.5294, + "step": 30545 + }, + { + "epoch": 17.064804469273742, + "grad_norm": 1.4645538330078125, + "learning_rate": 0.00014756302521008402, + "loss": 0.4326, + "step": 30546 + }, + { + "epoch": 17.06536312849162, + "grad_norm": 0.48635053634643555, + "learning_rate": 0.00014753501400560222, + "loss": 0.4484, + "step": 30547 + }, + { + "epoch": 17.0659217877095, + "grad_norm": 0.38818925619125366, + "learning_rate": 0.00014750700280112046, + "loss": 0.4492, + "step": 30548 + }, + { + "epoch": 17.066480446927375, + "grad_norm": 1.262762188911438, + "learning_rate": 0.00014747899159663866, + "loss": 0.4299, + "step": 30549 + }, + { + "epoch": 17.067039106145252, + "grad_norm": 0.48113033175468445, + "learning_rate": 0.00014745098039215687, + "loss": 0.4097, + "step": 30550 + }, + { + "epoch": 17.06759776536313, + "grad_norm": 0.6917311549186707, + "learning_rate": 0.00014742296918767505, + "loss": 0.3696, + "step": 30551 + }, + { + "epoch": 17.068156424581005, + "grad_norm": 0.5821406841278076, + "learning_rate": 0.00014739495798319328, + "loss": 0.3911, + "step": 30552 + }, + { + "epoch": 17.06871508379888, + "grad_norm": 0.6156346797943115, + "learning_rate": 0.00014736694677871149, + "loss": 0.4394, + "step": 30553 + }, + { + "epoch": 17.06927374301676, + "grad_norm": 18.05360221862793, + "learning_rate": 0.0001473389355742297, + "loss": 0.3847, + "step": 30554 + }, + { + "epoch": 17.06983240223464, + "grad_norm": 0.5106489658355713, + "learning_rate": 0.00014731092436974792, + "loss": 0.435, + "step": 30555 + }, + { + "epoch": 17.070391061452515, + "grad_norm": 0.3439846336841583, + "learning_rate": 0.0001472829131652661, + "loss": 0.3495, + "step": 30556 + }, + { + "epoch": 17.07094972067039, + "grad_norm": 0.3897092640399933, + "learning_rate": 0.0001472549019607843, + "loss": 0.4916, + "step": 30557 + }, + { + "epoch": 17.071508379888268, + "grad_norm": 0.5481124520301819, + "learning_rate": 0.00014722689075630252, + "loss": 0.5462, + "step": 30558 + }, + { + "epoch": 17.072067039106145, + "grad_norm": 0.43974775075912476, + "learning_rate": 0.00014719887955182075, + "loss": 0.4289, + "step": 30559 + }, + { + "epoch": 17.07262569832402, + "grad_norm": 0.602907657623291, + "learning_rate": 0.00014717086834733895, + "loss": 0.5071, + "step": 30560 + }, + { + "epoch": 17.073184357541898, + "grad_norm": 0.35608187317848206, + "learning_rate": 0.00014714285714285713, + "loss": 0.3086, + "step": 30561 + }, + { + "epoch": 17.073743016759778, + "grad_norm": 0.3388363718986511, + "learning_rate": 0.00014711484593837534, + "loss": 0.3729, + "step": 30562 + }, + { + "epoch": 17.074301675977654, + "grad_norm": 0.4461972713470459, + "learning_rate": 0.00014708683473389357, + "loss": 0.3941, + "step": 30563 + }, + { + "epoch": 17.07486033519553, + "grad_norm": 0.9617152214050293, + "learning_rate": 0.00014705882352941178, + "loss": 0.4159, + "step": 30564 + }, + { + "epoch": 17.075418994413408, + "grad_norm": 0.4029093384742737, + "learning_rate": 0.00014703081232492998, + "loss": 0.3948, + "step": 30565 + }, + { + "epoch": 17.075977653631284, + "grad_norm": 0.495516300201416, + "learning_rate": 0.00014700280112044816, + "loss": 0.4716, + "step": 30566 + }, + { + "epoch": 17.07653631284916, + "grad_norm": 0.8004716634750366, + "learning_rate": 0.0001469747899159664, + "loss": 0.5289, + "step": 30567 + }, + { + "epoch": 17.07709497206704, + "grad_norm": 0.5013948678970337, + "learning_rate": 0.0001469467787114846, + "loss": 0.4269, + "step": 30568 + }, + { + "epoch": 17.077653631284917, + "grad_norm": 0.42532581090927124, + "learning_rate": 0.0001469187675070028, + "loss": 0.4482, + "step": 30569 + }, + { + "epoch": 17.078212290502794, + "grad_norm": 0.7256349325180054, + "learning_rate": 0.00014689075630252101, + "loss": 0.5398, + "step": 30570 + }, + { + "epoch": 17.07877094972067, + "grad_norm": 0.5493570566177368, + "learning_rate": 0.00014686274509803922, + "loss": 0.3284, + "step": 30571 + }, + { + "epoch": 17.079329608938547, + "grad_norm": 0.5636252760887146, + "learning_rate": 0.00014683473389355743, + "loss": 0.4947, + "step": 30572 + }, + { + "epoch": 17.079888268156424, + "grad_norm": 1.7464534044265747, + "learning_rate": 0.00014680672268907563, + "loss": 0.3855, + "step": 30573 + }, + { + "epoch": 17.0804469273743, + "grad_norm": 0.3875367343425751, + "learning_rate": 0.00014677871148459384, + "loss": 0.3412, + "step": 30574 + }, + { + "epoch": 17.08100558659218, + "grad_norm": 0.5242041945457458, + "learning_rate": 0.00014675070028011207, + "loss": 0.4053, + "step": 30575 + }, + { + "epoch": 17.081564245810057, + "grad_norm": 0.40674248337745667, + "learning_rate": 0.00014672268907563025, + "loss": 0.4532, + "step": 30576 + }, + { + "epoch": 17.082122905027934, + "grad_norm": 0.7609793543815613, + "learning_rate": 0.00014669467787114846, + "loss": 0.3148, + "step": 30577 + }, + { + "epoch": 17.08268156424581, + "grad_norm": 0.7364414930343628, + "learning_rate": 0.00014666666666666666, + "loss": 0.4908, + "step": 30578 + }, + { + "epoch": 17.083240223463687, + "grad_norm": 0.4483233094215393, + "learning_rate": 0.0001466386554621849, + "loss": 0.3973, + "step": 30579 + }, + { + "epoch": 17.083798882681563, + "grad_norm": 0.3973104655742645, + "learning_rate": 0.0001466106442577031, + "loss": 0.4524, + "step": 30580 + }, + { + "epoch": 17.08435754189944, + "grad_norm": 0.38178759813308716, + "learning_rate": 0.00014658263305322128, + "loss": 0.3727, + "step": 30581 + }, + { + "epoch": 17.08491620111732, + "grad_norm": 5.578076362609863, + "learning_rate": 0.0001465546218487395, + "loss": 0.4283, + "step": 30582 + }, + { + "epoch": 17.085474860335196, + "grad_norm": 0.3670594096183777, + "learning_rate": 0.00014652661064425772, + "loss": 0.4127, + "step": 30583 + }, + { + "epoch": 17.086033519553073, + "grad_norm": 0.5761085748672485, + "learning_rate": 0.00014649859943977593, + "loss": 0.3824, + "step": 30584 + }, + { + "epoch": 17.08659217877095, + "grad_norm": 0.4312434792518616, + "learning_rate": 0.00014647058823529413, + "loss": 0.3376, + "step": 30585 + }, + { + "epoch": 17.087150837988826, + "grad_norm": 0.4243375062942505, + "learning_rate": 0.0001464425770308123, + "loss": 0.3348, + "step": 30586 + }, + { + "epoch": 17.087709497206703, + "grad_norm": 0.4164976477622986, + "learning_rate": 0.00014641456582633054, + "loss": 0.3754, + "step": 30587 + }, + { + "epoch": 17.08826815642458, + "grad_norm": 0.8961573839187622, + "learning_rate": 0.00014638655462184875, + "loss": 0.3887, + "step": 30588 + }, + { + "epoch": 17.08882681564246, + "grad_norm": 0.4520336389541626, + "learning_rate": 0.00014635854341736696, + "loss": 0.3161, + "step": 30589 + }, + { + "epoch": 17.089385474860336, + "grad_norm": 0.6728535890579224, + "learning_rate": 0.00014633053221288516, + "loss": 0.3789, + "step": 30590 + }, + { + "epoch": 17.089944134078213, + "grad_norm": 0.6951990723609924, + "learning_rate": 0.00014630252100840337, + "loss": 0.4237, + "step": 30591 + }, + { + "epoch": 17.09050279329609, + "grad_norm": 0.4518861472606659, + "learning_rate": 0.00014627450980392157, + "loss": 0.2842, + "step": 30592 + }, + { + "epoch": 17.091061452513966, + "grad_norm": 0.3565356731414795, + "learning_rate": 0.00014624649859943978, + "loss": 0.3337, + "step": 30593 + }, + { + "epoch": 17.091620111731842, + "grad_norm": 0.32045552134513855, + "learning_rate": 0.00014621848739495799, + "loss": 0.3258, + "step": 30594 + }, + { + "epoch": 17.092178770949722, + "grad_norm": 0.3928930163383484, + "learning_rate": 0.00014619047619047622, + "loss": 0.4039, + "step": 30595 + }, + { + "epoch": 17.0927374301676, + "grad_norm": 0.4180894196033478, + "learning_rate": 0.0001461624649859944, + "loss": 0.2897, + "step": 30596 + }, + { + "epoch": 17.093296089385476, + "grad_norm": 0.568274974822998, + "learning_rate": 0.0001461344537815126, + "loss": 0.4662, + "step": 30597 + }, + { + "epoch": 17.093854748603352, + "grad_norm": 0.3471246659755707, + "learning_rate": 0.0001461064425770308, + "loss": 0.3321, + "step": 30598 + }, + { + "epoch": 17.09441340782123, + "grad_norm": 0.41662198305130005, + "learning_rate": 0.00014607843137254904, + "loss": 0.3683, + "step": 30599 + }, + { + "epoch": 17.094972067039105, + "grad_norm": 0.6780016422271729, + "learning_rate": 0.00014605042016806722, + "loss": 0.519, + "step": 30600 + }, + { + "epoch": 17.095530726256982, + "grad_norm": 0.8851748108863831, + "learning_rate": 0.00014602240896358543, + "loss": 0.3277, + "step": 30601 + }, + { + "epoch": 17.096089385474862, + "grad_norm": 0.6956737041473389, + "learning_rate": 0.00014599439775910363, + "loss": 0.4344, + "step": 30602 + }, + { + "epoch": 17.09664804469274, + "grad_norm": 0.6809481382369995, + "learning_rate": 0.00014596638655462187, + "loss": 0.3304, + "step": 30603 + }, + { + "epoch": 17.097206703910615, + "grad_norm": 0.7596341967582703, + "learning_rate": 0.00014593837535014007, + "loss": 0.4337, + "step": 30604 + }, + { + "epoch": 17.09776536312849, + "grad_norm": 0.43310680985450745, + "learning_rate": 0.00014591036414565825, + "loss": 0.5794, + "step": 30605 + }, + { + "epoch": 17.098324022346368, + "grad_norm": 0.6859529614448547, + "learning_rate": 0.00014588235294117646, + "loss": 0.3938, + "step": 30606 + }, + { + "epoch": 17.098882681564245, + "grad_norm": 1.169474482536316, + "learning_rate": 0.0001458543417366947, + "loss": 0.4018, + "step": 30607 + }, + { + "epoch": 17.09944134078212, + "grad_norm": 0.4618262052536011, + "learning_rate": 0.0001458263305322129, + "loss": 0.5233, + "step": 30608 + }, + { + "epoch": 17.1, + "grad_norm": 0.4496903419494629, + "learning_rate": 0.0001457983193277311, + "loss": 0.5017, + "step": 30609 + }, + { + "epoch": 17.100558659217878, + "grad_norm": 0.9016445279121399, + "learning_rate": 0.00014577030812324928, + "loss": 0.3914, + "step": 30610 + }, + { + "epoch": 17.101117318435755, + "grad_norm": 0.554819643497467, + "learning_rate": 0.00014574229691876751, + "loss": 0.5955, + "step": 30611 + }, + { + "epoch": 17.10167597765363, + "grad_norm": 0.4067293703556061, + "learning_rate": 0.00014571428571428572, + "loss": 0.5351, + "step": 30612 + }, + { + "epoch": 17.102234636871508, + "grad_norm": 0.5183228850364685, + "learning_rate": 0.00014568627450980393, + "loss": 0.4086, + "step": 30613 + }, + { + "epoch": 17.102793296089384, + "grad_norm": 6.587244987487793, + "learning_rate": 0.00014565826330532213, + "loss": 0.4892, + "step": 30614 + }, + { + "epoch": 17.10335195530726, + "grad_norm": 0.9795680642127991, + "learning_rate": 0.00014563025210084034, + "loss": 0.4459, + "step": 30615 + }, + { + "epoch": 17.10391061452514, + "grad_norm": 0.5369141101837158, + "learning_rate": 0.00014560224089635854, + "loss": 0.4167, + "step": 30616 + }, + { + "epoch": 17.104469273743018, + "grad_norm": 0.36952686309814453, + "learning_rate": 0.00014557422969187675, + "loss": 0.3498, + "step": 30617 + }, + { + "epoch": 17.105027932960894, + "grad_norm": 0.3905816674232483, + "learning_rate": 0.00014554621848739496, + "loss": 0.3489, + "step": 30618 + }, + { + "epoch": 17.10558659217877, + "grad_norm": 0.517412006855011, + "learning_rate": 0.0001455182072829132, + "loss": 0.373, + "step": 30619 + }, + { + "epoch": 17.106145251396647, + "grad_norm": 0.5572271943092346, + "learning_rate": 0.00014549019607843137, + "loss": 0.5167, + "step": 30620 + }, + { + "epoch": 17.106703910614524, + "grad_norm": 0.601365327835083, + "learning_rate": 0.00014546218487394957, + "loss": 0.3882, + "step": 30621 + }, + { + "epoch": 17.107262569832404, + "grad_norm": 0.49694252014160156, + "learning_rate": 0.00014543417366946778, + "loss": 0.4317, + "step": 30622 + }, + { + "epoch": 17.10782122905028, + "grad_norm": 0.7042227387428284, + "learning_rate": 0.00014540616246498601, + "loss": 0.424, + "step": 30623 + }, + { + "epoch": 17.108379888268157, + "grad_norm": 2.348402738571167, + "learning_rate": 0.00014537815126050422, + "loss": 0.3399, + "step": 30624 + }, + { + "epoch": 17.108938547486034, + "grad_norm": 0.357658326625824, + "learning_rate": 0.0001453501400560224, + "loss": 0.3377, + "step": 30625 + }, + { + "epoch": 17.10949720670391, + "grad_norm": 3.7513959407806396, + "learning_rate": 0.0001453221288515406, + "loss": 0.3313, + "step": 30626 + }, + { + "epoch": 17.110055865921787, + "grad_norm": 0.4794785678386688, + "learning_rate": 0.00014529411764705884, + "loss": 0.4224, + "step": 30627 + }, + { + "epoch": 17.110614525139663, + "grad_norm": 0.36617326736450195, + "learning_rate": 0.00014526610644257704, + "loss": 0.4576, + "step": 30628 + }, + { + "epoch": 17.111173184357543, + "grad_norm": 0.435846745967865, + "learning_rate": 0.00014523809523809525, + "loss": 0.3935, + "step": 30629 + }, + { + "epoch": 17.11173184357542, + "grad_norm": 0.4789201021194458, + "learning_rate": 0.00014521008403361343, + "loss": 0.3303, + "step": 30630 + }, + { + "epoch": 17.112290502793297, + "grad_norm": 0.6044398546218872, + "learning_rate": 0.00014518207282913166, + "loss": 0.4645, + "step": 30631 + }, + { + "epoch": 17.112849162011173, + "grad_norm": 5.2347540855407715, + "learning_rate": 0.00014515406162464987, + "loss": 0.4486, + "step": 30632 + }, + { + "epoch": 17.11340782122905, + "grad_norm": 0.3528764545917511, + "learning_rate": 0.00014512605042016807, + "loss": 0.3535, + "step": 30633 + }, + { + "epoch": 17.113966480446926, + "grad_norm": 0.3694852888584137, + "learning_rate": 0.00014509803921568628, + "loss": 0.385, + "step": 30634 + }, + { + "epoch": 17.114525139664803, + "grad_norm": 0.2871161699295044, + "learning_rate": 0.00014507002801120449, + "loss": 0.3449, + "step": 30635 + }, + { + "epoch": 17.115083798882683, + "grad_norm": 0.3888263404369354, + "learning_rate": 0.0001450420168067227, + "loss": 0.4752, + "step": 30636 + }, + { + "epoch": 17.11564245810056, + "grad_norm": 0.44928330183029175, + "learning_rate": 0.0001450140056022409, + "loss": 0.3576, + "step": 30637 + }, + { + "epoch": 17.116201117318436, + "grad_norm": 0.44116848707199097, + "learning_rate": 0.0001449859943977591, + "loss": 0.4291, + "step": 30638 + }, + { + "epoch": 17.116759776536313, + "grad_norm": 0.8819090723991394, + "learning_rate": 0.00014495798319327734, + "loss": 0.3813, + "step": 30639 + }, + { + "epoch": 17.11731843575419, + "grad_norm": 0.4356851279735565, + "learning_rate": 0.00014492997198879552, + "loss": 0.3799, + "step": 30640 + }, + { + "epoch": 17.117877094972066, + "grad_norm": 0.6298952698707581, + "learning_rate": 0.00014490196078431372, + "loss": 0.5793, + "step": 30641 + }, + { + "epoch": 17.118435754189946, + "grad_norm": 0.6416264176368713, + "learning_rate": 0.00014487394957983193, + "loss": 0.3043, + "step": 30642 + }, + { + "epoch": 17.118994413407822, + "grad_norm": 0.4306201934814453, + "learning_rate": 0.00014484593837535016, + "loss": 0.4532, + "step": 30643 + }, + { + "epoch": 17.1195530726257, + "grad_norm": 0.6793591380119324, + "learning_rate": 0.00014481792717086837, + "loss": 0.4275, + "step": 30644 + }, + { + "epoch": 17.120111731843576, + "grad_norm": 0.41066646575927734, + "learning_rate": 0.00014478991596638655, + "loss": 0.463, + "step": 30645 + }, + { + "epoch": 17.120670391061452, + "grad_norm": 0.7772185802459717, + "learning_rate": 0.00014476190476190475, + "loss": 0.4528, + "step": 30646 + }, + { + "epoch": 17.12122905027933, + "grad_norm": 0.43935853242874146, + "learning_rate": 0.00014473389355742298, + "loss": 0.364, + "step": 30647 + }, + { + "epoch": 17.121787709497205, + "grad_norm": 0.5706481337547302, + "learning_rate": 0.0001447058823529412, + "loss": 0.445, + "step": 30648 + }, + { + "epoch": 17.122346368715085, + "grad_norm": 0.4210303723812103, + "learning_rate": 0.0001446778711484594, + "loss": 0.3841, + "step": 30649 + }, + { + "epoch": 17.122905027932962, + "grad_norm": 0.4442594647407532, + "learning_rate": 0.00014464985994397758, + "loss": 0.3377, + "step": 30650 + }, + { + "epoch": 17.12346368715084, + "grad_norm": 0.8115764856338501, + "learning_rate": 0.0001446218487394958, + "loss": 0.328, + "step": 30651 + }, + { + "epoch": 17.124022346368715, + "grad_norm": 3.739281177520752, + "learning_rate": 0.00014459383753501401, + "loss": 0.4641, + "step": 30652 + }, + { + "epoch": 17.12458100558659, + "grad_norm": 0.4313853979110718, + "learning_rate": 0.00014456582633053222, + "loss": 0.4049, + "step": 30653 + }, + { + "epoch": 17.12513966480447, + "grad_norm": 0.4225478172302246, + "learning_rate": 0.00014453781512605043, + "loss": 0.4381, + "step": 30654 + }, + { + "epoch": 17.125698324022345, + "grad_norm": 0.557887852191925, + "learning_rate": 0.00014450980392156863, + "loss": 0.389, + "step": 30655 + }, + { + "epoch": 17.126256983240225, + "grad_norm": 0.6848803162574768, + "learning_rate": 0.00014448179271708684, + "loss": 0.388, + "step": 30656 + }, + { + "epoch": 17.1268156424581, + "grad_norm": 0.5033578872680664, + "learning_rate": 0.00014445378151260504, + "loss": 0.4012, + "step": 30657 + }, + { + "epoch": 17.127374301675978, + "grad_norm": 0.6004784107208252, + "learning_rate": 0.00014442577030812325, + "loss": 0.4827, + "step": 30658 + }, + { + "epoch": 17.127932960893855, + "grad_norm": 0.4511112570762634, + "learning_rate": 0.00014439775910364146, + "loss": 0.372, + "step": 30659 + }, + { + "epoch": 17.12849162011173, + "grad_norm": 0.701365053653717, + "learning_rate": 0.00014436974789915966, + "loss": 0.4164, + "step": 30660 + }, + { + "epoch": 17.129050279329608, + "grad_norm": 0.43382611870765686, + "learning_rate": 0.00014434173669467787, + "loss": 0.3811, + "step": 30661 + }, + { + "epoch": 17.129608938547484, + "grad_norm": 0.4533500373363495, + "learning_rate": 0.00014431372549019607, + "loss": 0.3504, + "step": 30662 + }, + { + "epoch": 17.130167597765364, + "grad_norm": 0.385133296251297, + "learning_rate": 0.0001442857142857143, + "loss": 0.3699, + "step": 30663 + }, + { + "epoch": 17.13072625698324, + "grad_norm": 0.7855821847915649, + "learning_rate": 0.0001442577030812325, + "loss": 0.4541, + "step": 30664 + }, + { + "epoch": 17.131284916201118, + "grad_norm": 4.786759376525879, + "learning_rate": 0.0001442296918767507, + "loss": 0.4434, + "step": 30665 + }, + { + "epoch": 17.131843575418994, + "grad_norm": 0.9977697730064392, + "learning_rate": 0.0001442016806722689, + "loss": 0.4855, + "step": 30666 + }, + { + "epoch": 17.13240223463687, + "grad_norm": 0.5389133095741272, + "learning_rate": 0.00014417366946778713, + "loss": 0.4693, + "step": 30667 + }, + { + "epoch": 17.132960893854747, + "grad_norm": 0.7849056720733643, + "learning_rate": 0.00014414565826330534, + "loss": 0.3601, + "step": 30668 + }, + { + "epoch": 17.133519553072627, + "grad_norm": 0.3958875834941864, + "learning_rate": 0.00014411764705882352, + "loss": 0.417, + "step": 30669 + }, + { + "epoch": 17.134078212290504, + "grad_norm": 0.4477807283401489, + "learning_rate": 0.00014408963585434172, + "loss": 0.3522, + "step": 30670 + }, + { + "epoch": 17.13463687150838, + "grad_norm": 1.1301321983337402, + "learning_rate": 0.00014406162464985996, + "loss": 0.4437, + "step": 30671 + }, + { + "epoch": 17.135195530726257, + "grad_norm": 0.3881736397743225, + "learning_rate": 0.00014403361344537816, + "loss": 0.3914, + "step": 30672 + }, + { + "epoch": 17.135754189944134, + "grad_norm": 0.4912889301776886, + "learning_rate": 0.00014400560224089637, + "loss": 0.419, + "step": 30673 + }, + { + "epoch": 17.13631284916201, + "grad_norm": 0.4167359173297882, + "learning_rate": 0.00014397759103641455, + "loss": 0.4036, + "step": 30674 + }, + { + "epoch": 17.136871508379887, + "grad_norm": 0.762546956539154, + "learning_rate": 0.00014394957983193278, + "loss": 0.561, + "step": 30675 + }, + { + "epoch": 17.137430167597767, + "grad_norm": 0.7804329991340637, + "learning_rate": 0.00014392156862745099, + "loss": 0.3645, + "step": 30676 + }, + { + "epoch": 17.137988826815644, + "grad_norm": 0.33971935510635376, + "learning_rate": 0.0001438935574229692, + "loss": 0.3488, + "step": 30677 + }, + { + "epoch": 17.13854748603352, + "grad_norm": 0.8154634833335876, + "learning_rate": 0.0001438655462184874, + "loss": 0.5119, + "step": 30678 + }, + { + "epoch": 17.139106145251397, + "grad_norm": 0.4247089624404907, + "learning_rate": 0.0001438375350140056, + "loss": 0.5857, + "step": 30679 + }, + { + "epoch": 17.139664804469273, + "grad_norm": 0.3276110887527466, + "learning_rate": 0.0001438095238095238, + "loss": 0.3701, + "step": 30680 + }, + { + "epoch": 17.14022346368715, + "grad_norm": 0.4172048568725586, + "learning_rate": 0.00014378151260504202, + "loss": 0.3429, + "step": 30681 + }, + { + "epoch": 17.140782122905026, + "grad_norm": 0.3361363708972931, + "learning_rate": 0.00014375350140056022, + "loss": 0.4208, + "step": 30682 + }, + { + "epoch": 17.141340782122906, + "grad_norm": 9.697089195251465, + "learning_rate": 0.00014372549019607845, + "loss": 0.4094, + "step": 30683 + }, + { + "epoch": 17.141899441340783, + "grad_norm": 0.49515315890312195, + "learning_rate": 0.00014369747899159663, + "loss": 0.3923, + "step": 30684 + }, + { + "epoch": 17.14245810055866, + "grad_norm": 0.46800726652145386, + "learning_rate": 0.00014366946778711484, + "loss": 0.3401, + "step": 30685 + }, + { + "epoch": 17.143016759776536, + "grad_norm": 1.4067541360855103, + "learning_rate": 0.00014364145658263305, + "loss": 0.3679, + "step": 30686 + }, + { + "epoch": 17.143575418994413, + "grad_norm": 0.3973587453365326, + "learning_rate": 0.00014361344537815128, + "loss": 0.374, + "step": 30687 + }, + { + "epoch": 17.14413407821229, + "grad_norm": 0.3729104995727539, + "learning_rate": 0.00014358543417366948, + "loss": 0.393, + "step": 30688 + }, + { + "epoch": 17.144692737430166, + "grad_norm": 0.5097337365150452, + "learning_rate": 0.00014355742296918766, + "loss": 0.4501, + "step": 30689 + }, + { + "epoch": 17.145251396648046, + "grad_norm": 2.895922899246216, + "learning_rate": 0.00014352941176470587, + "loss": 0.6325, + "step": 30690 + }, + { + "epoch": 17.145810055865923, + "grad_norm": 0.7796987891197205, + "learning_rate": 0.0001435014005602241, + "loss": 0.3506, + "step": 30691 + }, + { + "epoch": 17.1463687150838, + "grad_norm": 0.5188897252082825, + "learning_rate": 0.0001434733893557423, + "loss": 0.3887, + "step": 30692 + }, + { + "epoch": 17.146927374301676, + "grad_norm": 2.1030290126800537, + "learning_rate": 0.00014344537815126051, + "loss": 0.469, + "step": 30693 + }, + { + "epoch": 17.147486033519552, + "grad_norm": 0.45743533968925476, + "learning_rate": 0.0001434173669467787, + "loss": 0.3988, + "step": 30694 + }, + { + "epoch": 17.14804469273743, + "grad_norm": 0.6907396912574768, + "learning_rate": 0.00014338935574229693, + "loss": 0.4277, + "step": 30695 + }, + { + "epoch": 17.14860335195531, + "grad_norm": 1.2991725206375122, + "learning_rate": 0.00014336134453781513, + "loss": 0.3601, + "step": 30696 + }, + { + "epoch": 17.149162011173186, + "grad_norm": 0.703126847743988, + "learning_rate": 0.00014333333333333334, + "loss": 0.5457, + "step": 30697 + }, + { + "epoch": 17.149720670391062, + "grad_norm": 0.4269443452358246, + "learning_rate": 0.00014330532212885154, + "loss": 0.4236, + "step": 30698 + }, + { + "epoch": 17.15027932960894, + "grad_norm": 0.5367129445075989, + "learning_rate": 0.00014327731092436975, + "loss": 0.3983, + "step": 30699 + }, + { + "epoch": 17.150837988826815, + "grad_norm": 0.37813735008239746, + "learning_rate": 0.00014324929971988796, + "loss": 0.3945, + "step": 30700 + }, + { + "epoch": 17.15139664804469, + "grad_norm": 0.8809571266174316, + "learning_rate": 0.00014322128851540616, + "loss": 0.4923, + "step": 30701 + }, + { + "epoch": 17.15195530726257, + "grad_norm": 1.254481315612793, + "learning_rate": 0.00014319327731092437, + "loss": 0.4305, + "step": 30702 + }, + { + "epoch": 17.15251396648045, + "grad_norm": 0.3826538622379303, + "learning_rate": 0.0001431652661064426, + "loss": 0.3819, + "step": 30703 + }, + { + "epoch": 17.153072625698325, + "grad_norm": 0.4359777271747589, + "learning_rate": 0.00014313725490196078, + "loss": 0.4034, + "step": 30704 + }, + { + "epoch": 17.1536312849162, + "grad_norm": 0.7031931281089783, + "learning_rate": 0.000143109243697479, + "loss": 0.3267, + "step": 30705 + }, + { + "epoch": 17.154189944134078, + "grad_norm": 0.43798449635505676, + "learning_rate": 0.0001430812324929972, + "loss": 0.4039, + "step": 30706 + }, + { + "epoch": 17.154748603351955, + "grad_norm": 0.4789959788322449, + "learning_rate": 0.00014305322128851543, + "loss": 0.4182, + "step": 30707 + }, + { + "epoch": 17.15530726256983, + "grad_norm": 0.4141457676887512, + "learning_rate": 0.00014302521008403363, + "loss": 0.357, + "step": 30708 + }, + { + "epoch": 17.155865921787708, + "grad_norm": 0.3567759394645691, + "learning_rate": 0.0001429971988795518, + "loss": 0.3395, + "step": 30709 + }, + { + "epoch": 17.156424581005588, + "grad_norm": 0.761366605758667, + "learning_rate": 0.00014296918767507002, + "loss": 0.4784, + "step": 30710 + }, + { + "epoch": 17.156983240223465, + "grad_norm": 0.4974413812160492, + "learning_rate": 0.00014294117647058825, + "loss": 0.3842, + "step": 30711 + }, + { + "epoch": 17.15754189944134, + "grad_norm": 1.084973692893982, + "learning_rate": 0.00014291316526610646, + "loss": 0.5343, + "step": 30712 + }, + { + "epoch": 17.158100558659218, + "grad_norm": 0.4046635925769806, + "learning_rate": 0.00014288515406162463, + "loss": 0.3207, + "step": 30713 + }, + { + "epoch": 17.158659217877094, + "grad_norm": 0.3601182699203491, + "learning_rate": 0.00014285714285714284, + "loss": 0.4473, + "step": 30714 + }, + { + "epoch": 17.15921787709497, + "grad_norm": 0.4990438222885132, + "learning_rate": 0.00014282913165266107, + "loss": 0.4056, + "step": 30715 + }, + { + "epoch": 17.159776536312847, + "grad_norm": 0.5641283988952637, + "learning_rate": 0.00014280112044817928, + "loss": 0.3609, + "step": 30716 + }, + { + "epoch": 17.160335195530728, + "grad_norm": 0.6007311344146729, + "learning_rate": 0.00014277310924369749, + "loss": 0.4625, + "step": 30717 + }, + { + "epoch": 17.160893854748604, + "grad_norm": 0.7267006039619446, + "learning_rate": 0.00014274509803921566, + "loss": 0.401, + "step": 30718 + }, + { + "epoch": 17.16145251396648, + "grad_norm": 0.7893133163452148, + "learning_rate": 0.0001427170868347339, + "loss": 0.3988, + "step": 30719 + }, + { + "epoch": 17.162011173184357, + "grad_norm": 0.4442957639694214, + "learning_rate": 0.0001426890756302521, + "loss": 0.3621, + "step": 30720 + }, + { + "epoch": 17.162569832402234, + "grad_norm": 0.7172282934188843, + "learning_rate": 0.0001426610644257703, + "loss": 0.5008, + "step": 30721 + }, + { + "epoch": 17.16312849162011, + "grad_norm": 0.41543108224868774, + "learning_rate": 0.00014263305322128852, + "loss": 0.3916, + "step": 30722 + }, + { + "epoch": 17.16368715083799, + "grad_norm": 0.46276652812957764, + "learning_rate": 0.00014260504201680672, + "loss": 0.331, + "step": 30723 + }, + { + "epoch": 17.164245810055867, + "grad_norm": 0.7716822028160095, + "learning_rate": 0.00014257703081232493, + "loss": 0.3414, + "step": 30724 + }, + { + "epoch": 17.164804469273744, + "grad_norm": 0.37379390001296997, + "learning_rate": 0.00014254901960784313, + "loss": 0.392, + "step": 30725 + }, + { + "epoch": 17.16536312849162, + "grad_norm": 0.5457429885864258, + "learning_rate": 0.00014252100840336134, + "loss": 0.4658, + "step": 30726 + }, + { + "epoch": 17.165921787709497, + "grad_norm": 0.4888390004634857, + "learning_rate": 0.00014249299719887957, + "loss": 0.616, + "step": 30727 + }, + { + "epoch": 17.166480446927373, + "grad_norm": 0.3112061619758606, + "learning_rate": 0.00014246498599439775, + "loss": 0.3787, + "step": 30728 + }, + { + "epoch": 17.16703910614525, + "grad_norm": 3.6695821285247803, + "learning_rate": 0.00014243697478991596, + "loss": 0.4124, + "step": 30729 + }, + { + "epoch": 17.16759776536313, + "grad_norm": 1.4364280700683594, + "learning_rate": 0.00014240896358543416, + "loss": 0.3716, + "step": 30730 + }, + { + "epoch": 17.168156424581007, + "grad_norm": 0.42415791749954224, + "learning_rate": 0.0001423809523809524, + "loss": 0.3617, + "step": 30731 + }, + { + "epoch": 17.168715083798883, + "grad_norm": 0.5223853588104248, + "learning_rate": 0.0001423529411764706, + "loss": 0.409, + "step": 30732 + }, + { + "epoch": 17.16927374301676, + "grad_norm": 0.43599650263786316, + "learning_rate": 0.00014232492997198878, + "loss": 0.4166, + "step": 30733 + }, + { + "epoch": 17.169832402234636, + "grad_norm": 3.177008867263794, + "learning_rate": 0.000142296918767507, + "loss": 0.4631, + "step": 30734 + }, + { + "epoch": 17.170391061452513, + "grad_norm": 0.7092443704605103, + "learning_rate": 0.00014226890756302522, + "loss": 0.3763, + "step": 30735 + }, + { + "epoch": 17.17094972067039, + "grad_norm": 0.503337562084198, + "learning_rate": 0.00014224089635854343, + "loss": 0.3923, + "step": 30736 + }, + { + "epoch": 17.17150837988827, + "grad_norm": 0.5022904276847839, + "learning_rate": 0.00014221288515406163, + "loss": 0.3756, + "step": 30737 + }, + { + "epoch": 17.172067039106146, + "grad_norm": 0.3345615565776825, + "learning_rate": 0.0001421848739495798, + "loss": 0.3168, + "step": 30738 + }, + { + "epoch": 17.172625698324023, + "grad_norm": 2.0736594200134277, + "learning_rate": 0.00014215686274509804, + "loss": 0.339, + "step": 30739 + }, + { + "epoch": 17.1731843575419, + "grad_norm": 1.0013874769210815, + "learning_rate": 0.00014212885154061625, + "loss": 0.3853, + "step": 30740 + }, + { + "epoch": 17.173743016759776, + "grad_norm": 0.44100654125213623, + "learning_rate": 0.00014210084033613446, + "loss": 0.2739, + "step": 30741 + }, + { + "epoch": 17.174301675977652, + "grad_norm": 0.4583946466445923, + "learning_rate": 0.00014207282913165266, + "loss": 0.3344, + "step": 30742 + }, + { + "epoch": 17.174860335195532, + "grad_norm": 0.49546849727630615, + "learning_rate": 0.00014204481792717087, + "loss": 0.3992, + "step": 30743 + }, + { + "epoch": 17.17541899441341, + "grad_norm": 0.44333434104919434, + "learning_rate": 0.00014201680672268907, + "loss": 0.4895, + "step": 30744 + }, + { + "epoch": 17.175977653631286, + "grad_norm": 0.8308997750282288, + "learning_rate": 0.00014198879551820728, + "loss": 0.371, + "step": 30745 + }, + { + "epoch": 17.176536312849162, + "grad_norm": 0.4740828573703766, + "learning_rate": 0.0001419607843137255, + "loss": 0.4085, + "step": 30746 + }, + { + "epoch": 17.17709497206704, + "grad_norm": 0.44030627608299255, + "learning_rate": 0.00014193277310924372, + "loss": 0.3475, + "step": 30747 + }, + { + "epoch": 17.177653631284915, + "grad_norm": 0.7526219487190247, + "learning_rate": 0.0001419047619047619, + "loss": 0.3975, + "step": 30748 + }, + { + "epoch": 17.178212290502792, + "grad_norm": 0.5852035284042358, + "learning_rate": 0.0001418767507002801, + "loss": 0.5215, + "step": 30749 + }, + { + "epoch": 17.178770949720672, + "grad_norm": 0.5003010630607605, + "learning_rate": 0.0001418487394957983, + "loss": 0.5067, + "step": 30750 + }, + { + "epoch": 17.17932960893855, + "grad_norm": 0.49839702248573303, + "learning_rate": 0.00014182072829131654, + "loss": 0.3739, + "step": 30751 + }, + { + "epoch": 17.179888268156425, + "grad_norm": 3.989161491394043, + "learning_rate": 0.00014179271708683475, + "loss": 0.4152, + "step": 30752 + }, + { + "epoch": 17.1804469273743, + "grad_norm": 0.6722609996795654, + "learning_rate": 0.00014176470588235293, + "loss": 0.4152, + "step": 30753 + }, + { + "epoch": 17.18100558659218, + "grad_norm": 0.9006577730178833, + "learning_rate": 0.00014173669467787113, + "loss": 0.4545, + "step": 30754 + }, + { + "epoch": 17.181564245810055, + "grad_norm": 0.9005676507949829, + "learning_rate": 0.00014170868347338937, + "loss": 0.3867, + "step": 30755 + }, + { + "epoch": 17.18212290502793, + "grad_norm": 0.3634353578090668, + "learning_rate": 0.00014168067226890757, + "loss": 0.3688, + "step": 30756 + }, + { + "epoch": 17.18268156424581, + "grad_norm": 0.5950771570205688, + "learning_rate": 0.00014165266106442578, + "loss": 0.381, + "step": 30757 + }, + { + "epoch": 17.183240223463688, + "grad_norm": 0.547524094581604, + "learning_rate": 0.00014162464985994396, + "loss": 0.5534, + "step": 30758 + }, + { + "epoch": 17.183798882681565, + "grad_norm": 0.45856061577796936, + "learning_rate": 0.0001415966386554622, + "loss": 0.4081, + "step": 30759 + }, + { + "epoch": 17.18435754189944, + "grad_norm": 0.37353959679603577, + "learning_rate": 0.0001415686274509804, + "loss": 0.3982, + "step": 30760 + }, + { + "epoch": 17.184916201117318, + "grad_norm": 0.5519049763679504, + "learning_rate": 0.0001415406162464986, + "loss": 0.6021, + "step": 30761 + }, + { + "epoch": 17.185474860335194, + "grad_norm": 0.3649415671825409, + "learning_rate": 0.0001415126050420168, + "loss": 0.4029, + "step": 30762 + }, + { + "epoch": 17.18603351955307, + "grad_norm": 0.43485450744628906, + "learning_rate": 0.00014148459383753502, + "loss": 0.4843, + "step": 30763 + }, + { + "epoch": 17.18659217877095, + "grad_norm": 0.6240429282188416, + "learning_rate": 0.00014145658263305322, + "loss": 0.3766, + "step": 30764 + }, + { + "epoch": 17.187150837988828, + "grad_norm": 0.46352556347846985, + "learning_rate": 0.00014142857142857143, + "loss": 0.3892, + "step": 30765 + }, + { + "epoch": 17.187709497206704, + "grad_norm": 0.3896719813346863, + "learning_rate": 0.00014140056022408963, + "loss": 0.3854, + "step": 30766 + }, + { + "epoch": 17.18826815642458, + "grad_norm": 0.49799567461013794, + "learning_rate": 0.00014137254901960784, + "loss": 0.3704, + "step": 30767 + }, + { + "epoch": 17.188826815642457, + "grad_norm": 0.3745008111000061, + "learning_rate": 0.00014134453781512605, + "loss": 0.3372, + "step": 30768 + }, + { + "epoch": 17.189385474860334, + "grad_norm": 0.47652313113212585, + "learning_rate": 0.00014131652661064425, + "loss": 0.377, + "step": 30769 + }, + { + "epoch": 17.189944134078214, + "grad_norm": 0.3345956802368164, + "learning_rate": 0.00014128851540616246, + "loss": 0.3459, + "step": 30770 + }, + { + "epoch": 17.19050279329609, + "grad_norm": 0.3444382846355438, + "learning_rate": 0.0001412605042016807, + "loss": 0.3621, + "step": 30771 + }, + { + "epoch": 17.191061452513967, + "grad_norm": 0.37247714400291443, + "learning_rate": 0.00014123249299719887, + "loss": 0.3253, + "step": 30772 + }, + { + "epoch": 17.191620111731844, + "grad_norm": 0.5487529039382935, + "learning_rate": 0.00014120448179271708, + "loss": 0.4187, + "step": 30773 + }, + { + "epoch": 17.19217877094972, + "grad_norm": 0.3745158314704895, + "learning_rate": 0.00014117647058823528, + "loss": 0.4216, + "step": 30774 + }, + { + "epoch": 17.192737430167597, + "grad_norm": 0.4563126564025879, + "learning_rate": 0.00014114845938375351, + "loss": 0.4264, + "step": 30775 + }, + { + "epoch": 17.193296089385473, + "grad_norm": 0.6057673096656799, + "learning_rate": 0.00014112044817927172, + "loss": 0.5994, + "step": 30776 + }, + { + "epoch": 17.193854748603353, + "grad_norm": 0.5246877074241638, + "learning_rate": 0.0001410924369747899, + "loss": 0.4866, + "step": 30777 + }, + { + "epoch": 17.19441340782123, + "grad_norm": 1.5769007205963135, + "learning_rate": 0.0001410644257703081, + "loss": 0.4173, + "step": 30778 + }, + { + "epoch": 17.194972067039107, + "grad_norm": 0.5270800590515137, + "learning_rate": 0.00014103641456582634, + "loss": 0.3994, + "step": 30779 + }, + { + "epoch": 17.195530726256983, + "grad_norm": 0.4798087179660797, + "learning_rate": 0.00014100840336134454, + "loss": 0.5318, + "step": 30780 + }, + { + "epoch": 17.19608938547486, + "grad_norm": 0.377145379781723, + "learning_rate": 0.00014098039215686275, + "loss": 0.4044, + "step": 30781 + }, + { + "epoch": 17.196648044692736, + "grad_norm": 5.0258941650390625, + "learning_rate": 0.00014095238095238093, + "loss": 0.3911, + "step": 30782 + }, + { + "epoch": 17.197206703910613, + "grad_norm": 0.4774393141269684, + "learning_rate": 0.00014092436974789916, + "loss": 0.3697, + "step": 30783 + }, + { + "epoch": 17.197765363128493, + "grad_norm": 0.39969557523727417, + "learning_rate": 0.00014089635854341737, + "loss": 0.3502, + "step": 30784 + }, + { + "epoch": 17.19832402234637, + "grad_norm": 0.5120758414268494, + "learning_rate": 0.00014086834733893557, + "loss": 0.3672, + "step": 30785 + }, + { + "epoch": 17.198882681564246, + "grad_norm": 0.6791298389434814, + "learning_rate": 0.0001408403361344538, + "loss": 0.3598, + "step": 30786 + }, + { + "epoch": 17.199441340782123, + "grad_norm": 0.44210800528526306, + "learning_rate": 0.000140812324929972, + "loss": 0.4542, + "step": 30787 + }, + { + "epoch": 17.2, + "grad_norm": 0.6122923493385315, + "learning_rate": 0.0001407843137254902, + "loss": 0.3872, + "step": 30788 + }, + { + "epoch": 17.200558659217876, + "grad_norm": 0.5188512802124023, + "learning_rate": 0.0001407563025210084, + "loss": 0.4251, + "step": 30789 + }, + { + "epoch": 17.201117318435756, + "grad_norm": 0.5056479573249817, + "learning_rate": 0.00014072829131652663, + "loss": 0.3245, + "step": 30790 + }, + { + "epoch": 17.201675977653633, + "grad_norm": 0.5134987235069275, + "learning_rate": 0.00014070028011204484, + "loss": 0.3031, + "step": 30791 + }, + { + "epoch": 17.20223463687151, + "grad_norm": 0.766931414604187, + "learning_rate": 0.00014067226890756302, + "loss": 0.3508, + "step": 30792 + }, + { + "epoch": 17.202793296089386, + "grad_norm": 0.33603018522262573, + "learning_rate": 0.00014064425770308122, + "loss": 0.3574, + "step": 30793 + }, + { + "epoch": 17.203351955307262, + "grad_norm": 0.6310589909553528, + "learning_rate": 0.00014061624649859946, + "loss": 0.4536, + "step": 30794 + }, + { + "epoch": 17.20391061452514, + "grad_norm": 0.8279927968978882, + "learning_rate": 0.00014058823529411766, + "loss": 0.3854, + "step": 30795 + }, + { + "epoch": 17.204469273743015, + "grad_norm": 0.9566571712493896, + "learning_rate": 0.00014056022408963587, + "loss": 0.4779, + "step": 30796 + }, + { + "epoch": 17.205027932960895, + "grad_norm": 0.4382748305797577, + "learning_rate": 0.00014053221288515405, + "loss": 0.3631, + "step": 30797 + }, + { + "epoch": 17.205586592178772, + "grad_norm": 0.33860304951667786, + "learning_rate": 0.00014050420168067228, + "loss": 0.3777, + "step": 30798 + }, + { + "epoch": 17.20614525139665, + "grad_norm": 0.3949485123157501, + "learning_rate": 0.00014047619047619049, + "loss": 0.3493, + "step": 30799 + }, + { + "epoch": 17.206703910614525, + "grad_norm": 0.5826476216316223, + "learning_rate": 0.0001404481792717087, + "loss": 0.4618, + "step": 30800 + }, + { + "epoch": 17.2072625698324, + "grad_norm": 0.4941520094871521, + "learning_rate": 0.0001404201680672269, + "loss": 0.3332, + "step": 30801 + }, + { + "epoch": 17.20782122905028, + "grad_norm": 0.42459985613822937, + "learning_rate": 0.0001403921568627451, + "loss": 0.342, + "step": 30802 + }, + { + "epoch": 17.208379888268155, + "grad_norm": 0.4781554937362671, + "learning_rate": 0.0001403641456582633, + "loss": 0.3199, + "step": 30803 + }, + { + "epoch": 17.208938547486035, + "grad_norm": 0.39960116147994995, + "learning_rate": 0.00014033613445378152, + "loss": 0.4332, + "step": 30804 + }, + { + "epoch": 17.20949720670391, + "grad_norm": 4.267194747924805, + "learning_rate": 0.00014030812324929972, + "loss": 0.3692, + "step": 30805 + }, + { + "epoch": 17.210055865921788, + "grad_norm": 0.7935893535614014, + "learning_rate": 0.00014028011204481795, + "loss": 0.3996, + "step": 30806 + }, + { + "epoch": 17.210614525139665, + "grad_norm": 0.5719877481460571, + "learning_rate": 0.00014025210084033613, + "loss": 0.5224, + "step": 30807 + }, + { + "epoch": 17.21117318435754, + "grad_norm": 0.325985848903656, + "learning_rate": 0.00014022408963585434, + "loss": 0.3939, + "step": 30808 + }, + { + "epoch": 17.211731843575418, + "grad_norm": 0.46041709184646606, + "learning_rate": 0.00014019607843137255, + "loss": 0.4268, + "step": 30809 + }, + { + "epoch": 17.212290502793294, + "grad_norm": 0.8821285367012024, + "learning_rate": 0.00014016806722689078, + "loss": 0.4616, + "step": 30810 + }, + { + "epoch": 17.212849162011175, + "grad_norm": 0.563669741153717, + "learning_rate": 0.00014014005602240898, + "loss": 0.4486, + "step": 30811 + }, + { + "epoch": 17.21340782122905, + "grad_norm": 0.47116991877555847, + "learning_rate": 0.00014011204481792716, + "loss": 0.3261, + "step": 30812 + }, + { + "epoch": 17.213966480446928, + "grad_norm": 0.7829761505126953, + "learning_rate": 0.00014008403361344537, + "loss": 0.3214, + "step": 30813 + }, + { + "epoch": 17.214525139664804, + "grad_norm": 0.4250996708869934, + "learning_rate": 0.0001400560224089636, + "loss": 0.4002, + "step": 30814 + }, + { + "epoch": 17.21508379888268, + "grad_norm": 0.4736463129520416, + "learning_rate": 0.0001400280112044818, + "loss": 0.4208, + "step": 30815 + }, + { + "epoch": 17.215642458100557, + "grad_norm": 0.3516188859939575, + "learning_rate": 0.00014000000000000001, + "loss": 0.4041, + "step": 30816 + }, + { + "epoch": 17.216201117318437, + "grad_norm": 0.3567846715450287, + "learning_rate": 0.0001399719887955182, + "loss": 0.3776, + "step": 30817 + }, + { + "epoch": 17.216759776536314, + "grad_norm": 0.42406564950942993, + "learning_rate": 0.00013994397759103643, + "loss": 0.4301, + "step": 30818 + }, + { + "epoch": 17.21731843575419, + "grad_norm": 0.7535119652748108, + "learning_rate": 0.00013991596638655463, + "loss": 0.4104, + "step": 30819 + }, + { + "epoch": 17.217877094972067, + "grad_norm": 0.5563556551933289, + "learning_rate": 0.00013988795518207284, + "loss": 0.4862, + "step": 30820 + }, + { + "epoch": 17.218435754189944, + "grad_norm": 0.5752178430557251, + "learning_rate": 0.00013985994397759104, + "loss": 0.4008, + "step": 30821 + }, + { + "epoch": 17.21899441340782, + "grad_norm": 0.3778010606765747, + "learning_rate": 0.00013983193277310925, + "loss": 0.2946, + "step": 30822 + }, + { + "epoch": 17.219553072625697, + "grad_norm": 0.5849639177322388, + "learning_rate": 0.00013980392156862746, + "loss": 0.3877, + "step": 30823 + }, + { + "epoch": 17.220111731843577, + "grad_norm": 0.4828316569328308, + "learning_rate": 0.00013977591036414566, + "loss": 0.3599, + "step": 30824 + }, + { + "epoch": 17.220670391061454, + "grad_norm": 0.3942548632621765, + "learning_rate": 0.00013974789915966387, + "loss": 0.4413, + "step": 30825 + }, + { + "epoch": 17.22122905027933, + "grad_norm": 23.958749771118164, + "learning_rate": 0.00013971988795518207, + "loss": 0.3402, + "step": 30826 + }, + { + "epoch": 17.221787709497207, + "grad_norm": 1.4602874517440796, + "learning_rate": 0.00013969187675070028, + "loss": 0.4295, + "step": 30827 + }, + { + "epoch": 17.222346368715083, + "grad_norm": 0.4744478464126587, + "learning_rate": 0.0001396638655462185, + "loss": 0.3776, + "step": 30828 + }, + { + "epoch": 17.22290502793296, + "grad_norm": 0.4322073459625244, + "learning_rate": 0.0001396358543417367, + "loss": 0.4569, + "step": 30829 + }, + { + "epoch": 17.223463687150836, + "grad_norm": 0.5271614789962769, + "learning_rate": 0.00013960784313725493, + "loss": 0.3849, + "step": 30830 + }, + { + "epoch": 17.224022346368717, + "grad_norm": 0.9581164717674255, + "learning_rate": 0.0001395798319327731, + "loss": 0.3588, + "step": 30831 + }, + { + "epoch": 17.224581005586593, + "grad_norm": 0.5064389705657959, + "learning_rate": 0.0001395518207282913, + "loss": 0.3979, + "step": 30832 + }, + { + "epoch": 17.22513966480447, + "grad_norm": 0.4320276081562042, + "learning_rate": 0.00013952380952380952, + "loss": 0.3942, + "step": 30833 + }, + { + "epoch": 17.225698324022346, + "grad_norm": 0.4634736180305481, + "learning_rate": 0.00013949579831932775, + "loss": 0.3647, + "step": 30834 + }, + { + "epoch": 17.226256983240223, + "grad_norm": 0.715804398059845, + "learning_rate": 0.00013946778711484596, + "loss": 0.536, + "step": 30835 + }, + { + "epoch": 17.2268156424581, + "grad_norm": 0.8666571974754333, + "learning_rate": 0.00013943977591036413, + "loss": 0.3377, + "step": 30836 + }, + { + "epoch": 17.227374301675976, + "grad_norm": 0.36556607484817505, + "learning_rate": 0.00013941176470588234, + "loss": 0.364, + "step": 30837 + }, + { + "epoch": 17.227932960893856, + "grad_norm": 0.4033718407154083, + "learning_rate": 0.00013938375350140057, + "loss": 0.3177, + "step": 30838 + }, + { + "epoch": 17.228491620111733, + "grad_norm": 0.43227052688598633, + "learning_rate": 0.00013935574229691878, + "loss": 0.3495, + "step": 30839 + }, + { + "epoch": 17.22905027932961, + "grad_norm": 0.6135537624359131, + "learning_rate": 0.00013932773109243699, + "loss": 0.5143, + "step": 30840 + }, + { + "epoch": 17.229608938547486, + "grad_norm": 0.4333163797855377, + "learning_rate": 0.00013929971988795516, + "loss": 0.4698, + "step": 30841 + }, + { + "epoch": 17.230167597765362, + "grad_norm": 0.5569503903388977, + "learning_rate": 0.0001392717086834734, + "loss": 0.3978, + "step": 30842 + }, + { + "epoch": 17.23072625698324, + "grad_norm": 0.6848451495170593, + "learning_rate": 0.0001392436974789916, + "loss": 0.481, + "step": 30843 + }, + { + "epoch": 17.23128491620112, + "grad_norm": 0.8422544002532959, + "learning_rate": 0.0001392156862745098, + "loss": 0.443, + "step": 30844 + }, + { + "epoch": 17.231843575418996, + "grad_norm": 0.4208816587924957, + "learning_rate": 0.00013918767507002802, + "loss": 0.3659, + "step": 30845 + }, + { + "epoch": 17.232402234636872, + "grad_norm": 0.46880191564559937, + "learning_rate": 0.00013915966386554622, + "loss": 0.4404, + "step": 30846 + }, + { + "epoch": 17.23296089385475, + "grad_norm": 0.37111273407936096, + "learning_rate": 0.00013913165266106443, + "loss": 0.3204, + "step": 30847 + }, + { + "epoch": 17.233519553072625, + "grad_norm": 0.34396710991859436, + "learning_rate": 0.00013910364145658263, + "loss": 0.3837, + "step": 30848 + }, + { + "epoch": 17.234078212290502, + "grad_norm": 0.4517327845096588, + "learning_rate": 0.00013907563025210084, + "loss": 0.4, + "step": 30849 + }, + { + "epoch": 17.23463687150838, + "grad_norm": 0.4544031023979187, + "learning_rate": 0.00013904761904761907, + "loss": 0.3801, + "step": 30850 + }, + { + "epoch": 17.23519553072626, + "grad_norm": 1.6825200319290161, + "learning_rate": 0.00013901960784313725, + "loss": 0.3969, + "step": 30851 + }, + { + "epoch": 17.235754189944135, + "grad_norm": 0.6552950739860535, + "learning_rate": 0.00013899159663865546, + "loss": 0.4193, + "step": 30852 + }, + { + "epoch": 17.23631284916201, + "grad_norm": 0.6134896874427795, + "learning_rate": 0.00013896358543417366, + "loss": 0.4103, + "step": 30853 + }, + { + "epoch": 17.23687150837989, + "grad_norm": 0.36394381523132324, + "learning_rate": 0.0001389355742296919, + "loss": 0.3945, + "step": 30854 + }, + { + "epoch": 17.237430167597765, + "grad_norm": 4.964192867279053, + "learning_rate": 0.0001389075630252101, + "loss": 0.3259, + "step": 30855 + }, + { + "epoch": 17.23798882681564, + "grad_norm": 0.4651060700416565, + "learning_rate": 0.00013887955182072828, + "loss": 0.3964, + "step": 30856 + }, + { + "epoch": 17.238547486033518, + "grad_norm": 0.5870866179466248, + "learning_rate": 0.0001388515406162465, + "loss": 0.409, + "step": 30857 + }, + { + "epoch": 17.239106145251398, + "grad_norm": 0.49224334955215454, + "learning_rate": 0.00013882352941176472, + "loss": 0.4666, + "step": 30858 + }, + { + "epoch": 17.239664804469275, + "grad_norm": 1.063842535018921, + "learning_rate": 0.00013879551820728293, + "loss": 0.3182, + "step": 30859 + }, + { + "epoch": 17.24022346368715, + "grad_norm": 6.323765754699707, + "learning_rate": 0.00013876750700280113, + "loss": 0.4017, + "step": 30860 + }, + { + "epoch": 17.240782122905028, + "grad_norm": 0.4009861946105957, + "learning_rate": 0.0001387394957983193, + "loss": 0.3945, + "step": 30861 + }, + { + "epoch": 17.241340782122904, + "grad_norm": 0.36764004826545715, + "learning_rate": 0.00013871148459383754, + "loss": 0.334, + "step": 30862 + }, + { + "epoch": 17.24189944134078, + "grad_norm": 0.5300739407539368, + "learning_rate": 0.00013868347338935575, + "loss": 0.4345, + "step": 30863 + }, + { + "epoch": 17.242458100558657, + "grad_norm": 0.6319892406463623, + "learning_rate": 0.00013865546218487396, + "loss": 0.3874, + "step": 30864 + }, + { + "epoch": 17.243016759776538, + "grad_norm": 0.3876967132091522, + "learning_rate": 0.00013862745098039216, + "loss": 0.3318, + "step": 30865 + }, + { + "epoch": 17.243575418994414, + "grad_norm": 0.6696122288703918, + "learning_rate": 0.00013859943977591037, + "loss": 0.429, + "step": 30866 + }, + { + "epoch": 17.24413407821229, + "grad_norm": 0.6907252669334412, + "learning_rate": 0.00013857142857142857, + "loss": 0.4089, + "step": 30867 + }, + { + "epoch": 17.244692737430167, + "grad_norm": 0.5244340300559998, + "learning_rate": 0.00013854341736694678, + "loss": 0.459, + "step": 30868 + }, + { + "epoch": 17.245251396648044, + "grad_norm": 1.119207501411438, + "learning_rate": 0.000138515406162465, + "loss": 0.3509, + "step": 30869 + }, + { + "epoch": 17.24581005586592, + "grad_norm": 0.41680291295051575, + "learning_rate": 0.00013848739495798322, + "loss": 0.4479, + "step": 30870 + }, + { + "epoch": 17.2463687150838, + "grad_norm": 0.6145120859146118, + "learning_rate": 0.0001384593837535014, + "loss": 0.429, + "step": 30871 + }, + { + "epoch": 17.246927374301677, + "grad_norm": 2.2590606212615967, + "learning_rate": 0.0001384313725490196, + "loss": 0.415, + "step": 30872 + }, + { + "epoch": 17.247486033519554, + "grad_norm": 0.4758312404155731, + "learning_rate": 0.0001384033613445378, + "loss": 0.3951, + "step": 30873 + }, + { + "epoch": 17.24804469273743, + "grad_norm": 1.1972399950027466, + "learning_rate": 0.00013837535014005604, + "loss": 0.4254, + "step": 30874 + }, + { + "epoch": 17.248603351955307, + "grad_norm": 0.3477563261985779, + "learning_rate": 0.00013834733893557425, + "loss": 0.4614, + "step": 30875 + }, + { + "epoch": 17.249162011173183, + "grad_norm": 0.34456226229667664, + "learning_rate": 0.00013831932773109243, + "loss": 0.4409, + "step": 30876 + }, + { + "epoch": 17.24972067039106, + "grad_norm": 2.0233559608459473, + "learning_rate": 0.00013829131652661063, + "loss": 0.5128, + "step": 30877 + }, + { + "epoch": 17.25027932960894, + "grad_norm": 0.44099491834640503, + "learning_rate": 0.00013826330532212887, + "loss": 0.3918, + "step": 30878 + }, + { + "epoch": 17.250837988826817, + "grad_norm": 0.3825315237045288, + "learning_rate": 0.00013823529411764707, + "loss": 0.3995, + "step": 30879 + }, + { + "epoch": 17.251396648044693, + "grad_norm": 0.33068642020225525, + "learning_rate": 0.00013820728291316525, + "loss": 0.3558, + "step": 30880 + }, + { + "epoch": 17.25195530726257, + "grad_norm": 0.47674262523651123, + "learning_rate": 0.00013817927170868346, + "loss": 0.3743, + "step": 30881 + }, + { + "epoch": 17.252513966480446, + "grad_norm": 0.391883909702301, + "learning_rate": 0.0001381512605042017, + "loss": 0.35, + "step": 30882 + }, + { + "epoch": 17.253072625698323, + "grad_norm": 0.6043128967285156, + "learning_rate": 0.0001381232492997199, + "loss": 0.4051, + "step": 30883 + }, + { + "epoch": 17.2536312849162, + "grad_norm": 0.476500540971756, + "learning_rate": 0.0001380952380952381, + "loss": 0.4562, + "step": 30884 + }, + { + "epoch": 17.25418994413408, + "grad_norm": 0.5104117393493652, + "learning_rate": 0.00013806722689075628, + "loss": 0.2972, + "step": 30885 + }, + { + "epoch": 17.254748603351956, + "grad_norm": 0.4299020767211914, + "learning_rate": 0.00013803921568627452, + "loss": 0.3577, + "step": 30886 + }, + { + "epoch": 17.255307262569833, + "grad_norm": 0.5536918044090271, + "learning_rate": 0.00013801120448179272, + "loss": 0.5185, + "step": 30887 + }, + { + "epoch": 17.25586592178771, + "grad_norm": 0.4102650582790375, + "learning_rate": 0.00013798319327731093, + "loss": 0.4376, + "step": 30888 + }, + { + "epoch": 17.256424581005586, + "grad_norm": 0.446763813495636, + "learning_rate": 0.00013795518207282913, + "loss": 0.4226, + "step": 30889 + }, + { + "epoch": 17.256983240223462, + "grad_norm": 0.32200101017951965, + "learning_rate": 0.00013792717086834734, + "loss": 0.3654, + "step": 30890 + }, + { + "epoch": 17.257541899441343, + "grad_norm": 0.8275089263916016, + "learning_rate": 0.00013789915966386555, + "loss": 0.533, + "step": 30891 + }, + { + "epoch": 17.25810055865922, + "grad_norm": 0.6773897409439087, + "learning_rate": 0.00013787114845938375, + "loss": 0.4648, + "step": 30892 + }, + { + "epoch": 17.258659217877096, + "grad_norm": 0.3769640028476715, + "learning_rate": 0.00013784313725490196, + "loss": 0.4887, + "step": 30893 + }, + { + "epoch": 17.259217877094972, + "grad_norm": 0.5789656043052673, + "learning_rate": 0.0001378151260504202, + "loss": 0.5579, + "step": 30894 + }, + { + "epoch": 17.25977653631285, + "grad_norm": 0.44554921984672546, + "learning_rate": 0.00013778711484593837, + "loss": 0.3955, + "step": 30895 + }, + { + "epoch": 17.260335195530725, + "grad_norm": 0.49596601724624634, + "learning_rate": 0.00013775910364145658, + "loss": 0.4066, + "step": 30896 + }, + { + "epoch": 17.260893854748602, + "grad_norm": 0.6141653060913086, + "learning_rate": 0.00013773109243697478, + "loss": 0.4103, + "step": 30897 + }, + { + "epoch": 17.261452513966482, + "grad_norm": 0.4723193645477295, + "learning_rate": 0.00013770308123249301, + "loss": 0.3924, + "step": 30898 + }, + { + "epoch": 17.26201117318436, + "grad_norm": 0.37482163310050964, + "learning_rate": 0.00013767507002801122, + "loss": 0.3048, + "step": 30899 + }, + { + "epoch": 17.262569832402235, + "grad_norm": 0.3675345182418823, + "learning_rate": 0.0001376470588235294, + "loss": 0.353, + "step": 30900 + }, + { + "epoch": 17.26312849162011, + "grad_norm": 0.4940016269683838, + "learning_rate": 0.0001376190476190476, + "loss": 0.4825, + "step": 30901 + }, + { + "epoch": 17.26368715083799, + "grad_norm": 0.5392482280731201, + "learning_rate": 0.00013759103641456584, + "loss": 0.4118, + "step": 30902 + }, + { + "epoch": 17.264245810055865, + "grad_norm": 1.110404372215271, + "learning_rate": 0.00013756302521008404, + "loss": 0.3909, + "step": 30903 + }, + { + "epoch": 17.26480446927374, + "grad_norm": 0.453970730304718, + "learning_rate": 0.00013753501400560225, + "loss": 0.3649, + "step": 30904 + }, + { + "epoch": 17.26536312849162, + "grad_norm": 0.39727967977523804, + "learning_rate": 0.00013750700280112043, + "loss": 0.4499, + "step": 30905 + }, + { + "epoch": 17.265921787709498, + "grad_norm": 0.4828219413757324, + "learning_rate": 0.00013747899159663866, + "loss": 0.3408, + "step": 30906 + }, + { + "epoch": 17.266480446927375, + "grad_norm": 7.320755958557129, + "learning_rate": 0.00013745098039215687, + "loss": 0.4585, + "step": 30907 + }, + { + "epoch": 17.26703910614525, + "grad_norm": 0.5370703339576721, + "learning_rate": 0.00013742296918767507, + "loss": 0.3453, + "step": 30908 + }, + { + "epoch": 17.267597765363128, + "grad_norm": 0.42002859711647034, + "learning_rate": 0.00013739495798319328, + "loss": 0.4502, + "step": 30909 + }, + { + "epoch": 17.268156424581004, + "grad_norm": 0.34289199113845825, + "learning_rate": 0.0001373669467787115, + "loss": 0.3302, + "step": 30910 + }, + { + "epoch": 17.26871508379888, + "grad_norm": 0.5936195254325867, + "learning_rate": 0.0001373389355742297, + "loss": 0.3739, + "step": 30911 + }, + { + "epoch": 17.26927374301676, + "grad_norm": 0.39910629391670227, + "learning_rate": 0.0001373109243697479, + "loss": 0.417, + "step": 30912 + }, + { + "epoch": 17.269832402234638, + "grad_norm": 0.47082874178886414, + "learning_rate": 0.0001372829131652661, + "loss": 0.4941, + "step": 30913 + }, + { + "epoch": 17.270391061452514, + "grad_norm": 0.9759960770606995, + "learning_rate": 0.00013725490196078434, + "loss": 0.3609, + "step": 30914 + }, + { + "epoch": 17.27094972067039, + "grad_norm": 2.263122081756592, + "learning_rate": 0.00013722689075630252, + "loss": 0.4058, + "step": 30915 + }, + { + "epoch": 17.271508379888267, + "grad_norm": 0.419633150100708, + "learning_rate": 0.00013719887955182072, + "loss": 0.4724, + "step": 30916 + }, + { + "epoch": 17.272067039106144, + "grad_norm": 0.37599149346351624, + "learning_rate": 0.00013717086834733893, + "loss": 0.4537, + "step": 30917 + }, + { + "epoch": 17.272625698324024, + "grad_norm": 0.39581042528152466, + "learning_rate": 0.00013714285714285716, + "loss": 0.3674, + "step": 30918 + }, + { + "epoch": 17.2731843575419, + "grad_norm": 0.5695962905883789, + "learning_rate": 0.00013711484593837537, + "loss": 0.3849, + "step": 30919 + }, + { + "epoch": 17.273743016759777, + "grad_norm": 0.857007622718811, + "learning_rate": 0.00013708683473389355, + "loss": 0.3899, + "step": 30920 + }, + { + "epoch": 17.274301675977654, + "grad_norm": 7.770477771759033, + "learning_rate": 0.00013705882352941175, + "loss": 0.3871, + "step": 30921 + }, + { + "epoch": 17.27486033519553, + "grad_norm": 0.3573330342769623, + "learning_rate": 0.00013703081232492999, + "loss": 0.4289, + "step": 30922 + }, + { + "epoch": 17.275418994413407, + "grad_norm": 0.4509274661540985, + "learning_rate": 0.0001370028011204482, + "loss": 0.4, + "step": 30923 + }, + { + "epoch": 17.275977653631283, + "grad_norm": 0.34237974882125854, + "learning_rate": 0.0001369747899159664, + "loss": 0.4171, + "step": 30924 + }, + { + "epoch": 17.276536312849164, + "grad_norm": 0.8740954995155334, + "learning_rate": 0.00013694677871148458, + "loss": 0.5036, + "step": 30925 + }, + { + "epoch": 17.27709497206704, + "grad_norm": 0.7318565249443054, + "learning_rate": 0.0001369187675070028, + "loss": 0.5101, + "step": 30926 + }, + { + "epoch": 17.277653631284917, + "grad_norm": 1.6115286350250244, + "learning_rate": 0.00013689075630252102, + "loss": 0.29, + "step": 30927 + }, + { + "epoch": 17.278212290502793, + "grad_norm": 0.5907230377197266, + "learning_rate": 0.00013686274509803922, + "loss": 0.4436, + "step": 30928 + }, + { + "epoch": 17.27877094972067, + "grad_norm": 0.3992558419704437, + "learning_rate": 0.00013683473389355743, + "loss": 0.3901, + "step": 30929 + }, + { + "epoch": 17.279329608938546, + "grad_norm": 0.5210452675819397, + "learning_rate": 0.00013680672268907563, + "loss": 0.3434, + "step": 30930 + }, + { + "epoch": 17.279888268156423, + "grad_norm": 0.40953654050827026, + "learning_rate": 0.00013677871148459384, + "loss": 0.4168, + "step": 30931 + }, + { + "epoch": 17.280446927374303, + "grad_norm": 0.41116511821746826, + "learning_rate": 0.00013675070028011205, + "loss": 0.3767, + "step": 30932 + }, + { + "epoch": 17.28100558659218, + "grad_norm": 0.4857286214828491, + "learning_rate": 0.00013672268907563025, + "loss": 0.4451, + "step": 30933 + }, + { + "epoch": 17.281564245810056, + "grad_norm": 0.3809511959552765, + "learning_rate": 0.00013669467787114846, + "loss": 0.3726, + "step": 30934 + }, + { + "epoch": 17.282122905027933, + "grad_norm": 3.765681505203247, + "learning_rate": 0.00013666666666666666, + "loss": 0.4611, + "step": 30935 + }, + { + "epoch": 17.28268156424581, + "grad_norm": 0.4003088176250458, + "learning_rate": 0.00013663865546218487, + "loss": 0.4332, + "step": 30936 + }, + { + "epoch": 17.283240223463686, + "grad_norm": 0.42851537466049194, + "learning_rate": 0.00013661064425770308, + "loss": 0.397, + "step": 30937 + }, + { + "epoch": 17.283798882681563, + "grad_norm": 0.5623506307601929, + "learning_rate": 0.0001365826330532213, + "loss": 0.3678, + "step": 30938 + }, + { + "epoch": 17.284357541899443, + "grad_norm": 0.6187272667884827, + "learning_rate": 0.0001365546218487395, + "loss": 0.3686, + "step": 30939 + }, + { + "epoch": 17.28491620111732, + "grad_norm": 1.0401549339294434, + "learning_rate": 0.0001365266106442577, + "loss": 0.3553, + "step": 30940 + }, + { + "epoch": 17.285474860335196, + "grad_norm": 0.572690486907959, + "learning_rate": 0.0001364985994397759, + "loss": 0.4421, + "step": 30941 + }, + { + "epoch": 17.286033519553072, + "grad_norm": 0.5744448900222778, + "learning_rate": 0.00013647058823529413, + "loss": 0.4744, + "step": 30942 + }, + { + "epoch": 17.28659217877095, + "grad_norm": 0.6637737154960632, + "learning_rate": 0.00013644257703081234, + "loss": 0.4896, + "step": 30943 + }, + { + "epoch": 17.287150837988825, + "grad_norm": 12.140373229980469, + "learning_rate": 0.00013641456582633052, + "loss": 0.4325, + "step": 30944 + }, + { + "epoch": 17.287709497206706, + "grad_norm": 0.5495653748512268, + "learning_rate": 0.00013638655462184872, + "loss": 0.5844, + "step": 30945 + }, + { + "epoch": 17.288268156424582, + "grad_norm": 0.5662904977798462, + "learning_rate": 0.00013635854341736696, + "loss": 0.383, + "step": 30946 + }, + { + "epoch": 17.28882681564246, + "grad_norm": 0.530608057975769, + "learning_rate": 0.00013633053221288516, + "loss": 0.4547, + "step": 30947 + }, + { + "epoch": 17.289385474860335, + "grad_norm": 0.43215397000312805, + "learning_rate": 0.00013630252100840337, + "loss": 0.4145, + "step": 30948 + }, + { + "epoch": 17.289944134078212, + "grad_norm": 0.6672047972679138, + "learning_rate": 0.00013627450980392155, + "loss": 0.3413, + "step": 30949 + }, + { + "epoch": 17.29050279329609, + "grad_norm": 0.9095503091812134, + "learning_rate": 0.00013624649859943978, + "loss": 0.497, + "step": 30950 + }, + { + "epoch": 17.291061452513965, + "grad_norm": 1.1142430305480957, + "learning_rate": 0.000136218487394958, + "loss": 0.4701, + "step": 30951 + }, + { + "epoch": 17.291620111731845, + "grad_norm": 0.4204825460910797, + "learning_rate": 0.0001361904761904762, + "loss": 0.3896, + "step": 30952 + }, + { + "epoch": 17.29217877094972, + "grad_norm": 0.8348186612129211, + "learning_rate": 0.0001361624649859944, + "loss": 0.3278, + "step": 30953 + }, + { + "epoch": 17.2927374301676, + "grad_norm": 0.3412925601005554, + "learning_rate": 0.0001361344537815126, + "loss": 0.3412, + "step": 30954 + }, + { + "epoch": 17.293296089385475, + "grad_norm": 0.4504820704460144, + "learning_rate": 0.0001361064425770308, + "loss": 0.4762, + "step": 30955 + }, + { + "epoch": 17.29385474860335, + "grad_norm": 0.8060104250907898, + "learning_rate": 0.00013607843137254902, + "loss": 0.5149, + "step": 30956 + }, + { + "epoch": 17.294413407821228, + "grad_norm": 0.40251079201698303, + "learning_rate": 0.00013605042016806722, + "loss": 0.3836, + "step": 30957 + }, + { + "epoch": 17.294972067039105, + "grad_norm": 0.3634423017501831, + "learning_rate": 0.00013602240896358546, + "loss": 0.3616, + "step": 30958 + }, + { + "epoch": 17.295530726256985, + "grad_norm": 0.5007919073104858, + "learning_rate": 0.00013599439775910363, + "loss": 0.4911, + "step": 30959 + }, + { + "epoch": 17.29608938547486, + "grad_norm": 0.48630887269973755, + "learning_rate": 0.00013596638655462184, + "loss": 0.5072, + "step": 30960 + }, + { + "epoch": 17.296648044692738, + "grad_norm": 0.5721303224563599, + "learning_rate": 0.00013593837535014005, + "loss": 0.4613, + "step": 30961 + }, + { + "epoch": 17.297206703910614, + "grad_norm": 0.3682270050048828, + "learning_rate": 0.00013591036414565828, + "loss": 0.3556, + "step": 30962 + }, + { + "epoch": 17.29776536312849, + "grad_norm": 0.44523659348487854, + "learning_rate": 0.00013588235294117649, + "loss": 0.5177, + "step": 30963 + }, + { + "epoch": 17.298324022346367, + "grad_norm": 0.5685264468193054, + "learning_rate": 0.00013585434173669466, + "loss": 0.4693, + "step": 30964 + }, + { + "epoch": 17.298882681564244, + "grad_norm": 0.6503521800041199, + "learning_rate": 0.00013582633053221287, + "loss": 0.574, + "step": 30965 + }, + { + "epoch": 17.299441340782124, + "grad_norm": 0.5219910144805908, + "learning_rate": 0.0001357983193277311, + "loss": 0.3701, + "step": 30966 + }, + { + "epoch": 17.3, + "grad_norm": 0.6246150135993958, + "learning_rate": 0.0001357703081232493, + "loss": 0.4525, + "step": 30967 + }, + { + "epoch": 17.300558659217877, + "grad_norm": 1.1186622381210327, + "learning_rate": 0.00013574229691876752, + "loss": 0.4297, + "step": 30968 + }, + { + "epoch": 17.301117318435754, + "grad_norm": 0.41805148124694824, + "learning_rate": 0.0001357142857142857, + "loss": 0.4654, + "step": 30969 + }, + { + "epoch": 17.30167597765363, + "grad_norm": 0.6026161313056946, + "learning_rate": 0.00013568627450980393, + "loss": 0.3995, + "step": 30970 + }, + { + "epoch": 17.302234636871507, + "grad_norm": 0.3659982979297638, + "learning_rate": 0.00013565826330532213, + "loss": 0.3874, + "step": 30971 + }, + { + "epoch": 17.302793296089387, + "grad_norm": 2.2221803665161133, + "learning_rate": 0.00013563025210084034, + "loss": 0.4509, + "step": 30972 + }, + { + "epoch": 17.303351955307264, + "grad_norm": 0.4099322557449341, + "learning_rate": 0.00013560224089635855, + "loss": 0.5089, + "step": 30973 + }, + { + "epoch": 17.30391061452514, + "grad_norm": 0.8470801115036011, + "learning_rate": 0.00013557422969187675, + "loss": 0.3613, + "step": 30974 + }, + { + "epoch": 17.304469273743017, + "grad_norm": 0.36109501123428345, + "learning_rate": 0.00013554621848739496, + "loss": 0.3092, + "step": 30975 + }, + { + "epoch": 17.305027932960893, + "grad_norm": 1.5990710258483887, + "learning_rate": 0.00013551820728291316, + "loss": 0.5475, + "step": 30976 + }, + { + "epoch": 17.30558659217877, + "grad_norm": 0.8227080702781677, + "learning_rate": 0.00013549019607843137, + "loss": 0.3916, + "step": 30977 + }, + { + "epoch": 17.306145251396647, + "grad_norm": 0.45544853806495667, + "learning_rate": 0.0001354621848739496, + "loss": 0.3922, + "step": 30978 + }, + { + "epoch": 17.306703910614527, + "grad_norm": 0.5445821285247803, + "learning_rate": 0.00013543417366946778, + "loss": 0.3523, + "step": 30979 + }, + { + "epoch": 17.307262569832403, + "grad_norm": 0.4503592848777771, + "learning_rate": 0.000135406162464986, + "loss": 0.4257, + "step": 30980 + }, + { + "epoch": 17.30782122905028, + "grad_norm": 0.4236200153827667, + "learning_rate": 0.0001353781512605042, + "loss": 0.3291, + "step": 30981 + }, + { + "epoch": 17.308379888268156, + "grad_norm": 0.42220044136047363, + "learning_rate": 0.00013535014005602243, + "loss": 0.3796, + "step": 30982 + }, + { + "epoch": 17.308938547486033, + "grad_norm": 0.9984843134880066, + "learning_rate": 0.00013532212885154063, + "loss": 0.4022, + "step": 30983 + }, + { + "epoch": 17.30949720670391, + "grad_norm": 0.5241859555244446, + "learning_rate": 0.0001352941176470588, + "loss": 0.3775, + "step": 30984 + }, + { + "epoch": 17.310055865921786, + "grad_norm": 0.3857882618904114, + "learning_rate": 0.00013526610644257702, + "loss": 0.3796, + "step": 30985 + }, + { + "epoch": 17.310614525139666, + "grad_norm": 1.3354923725128174, + "learning_rate": 0.00013523809523809525, + "loss": 0.3523, + "step": 30986 + }, + { + "epoch": 17.311173184357543, + "grad_norm": 0.3864104151725769, + "learning_rate": 0.00013521008403361346, + "loss": 0.3942, + "step": 30987 + }, + { + "epoch": 17.31173184357542, + "grad_norm": 1.2963893413543701, + "learning_rate": 0.00013518207282913166, + "loss": 0.3878, + "step": 30988 + }, + { + "epoch": 17.312290502793296, + "grad_norm": 0.4099782705307007, + "learning_rate": 0.00013515406162464984, + "loss": 0.4229, + "step": 30989 + }, + { + "epoch": 17.312849162011172, + "grad_norm": 0.703102171421051, + "learning_rate": 0.00013512605042016807, + "loss": 0.4509, + "step": 30990 + }, + { + "epoch": 17.31340782122905, + "grad_norm": 0.6414778232574463, + "learning_rate": 0.00013509803921568628, + "loss": 0.427, + "step": 30991 + }, + { + "epoch": 17.31396648044693, + "grad_norm": 0.34215009212493896, + "learning_rate": 0.0001350700280112045, + "loss": 0.3436, + "step": 30992 + }, + { + "epoch": 17.314525139664806, + "grad_norm": 2.1678569316864014, + "learning_rate": 0.00013504201680672267, + "loss": 0.3776, + "step": 30993 + }, + { + "epoch": 17.315083798882682, + "grad_norm": 0.3732213079929352, + "learning_rate": 0.0001350140056022409, + "loss": 0.4372, + "step": 30994 + }, + { + "epoch": 17.31564245810056, + "grad_norm": 0.618624210357666, + "learning_rate": 0.0001349859943977591, + "loss": 0.4475, + "step": 30995 + }, + { + "epoch": 17.316201117318435, + "grad_norm": 0.36261868476867676, + "learning_rate": 0.0001349579831932773, + "loss": 0.3479, + "step": 30996 + }, + { + "epoch": 17.316759776536312, + "grad_norm": 0.5007100701332092, + "learning_rate": 0.00013492997198879552, + "loss": 0.4405, + "step": 30997 + }, + { + "epoch": 17.31731843575419, + "grad_norm": 0.4221706986427307, + "learning_rate": 0.00013490196078431372, + "loss": 0.4509, + "step": 30998 + }, + { + "epoch": 17.31787709497207, + "grad_norm": 2.1026504039764404, + "learning_rate": 0.00013487394957983193, + "loss": 0.3968, + "step": 30999 + }, + { + "epoch": 17.318435754189945, + "grad_norm": 0.5361040234565735, + "learning_rate": 0.00013484593837535013, + "loss": 0.4392, + "step": 31000 + }, + { + "epoch": 17.318435754189945, + "eval_cer": 0.08447623100171298, + "eval_loss": 0.32168301939964294, + "eval_runtime": 55.8575, + "eval_samples_per_second": 81.242, + "eval_steps_per_second": 5.084, + "eval_wer": 0.333035390072903, + "step": 31000 + }, + { + "epoch": 17.31899441340782, + "grad_norm": 0.4020605683326721, + "learning_rate": 0.00013481792717086834, + "loss": 0.4416, + "step": 31001 + }, + { + "epoch": 17.3195530726257, + "grad_norm": 0.46303820610046387, + "learning_rate": 0.00013478991596638657, + "loss": 0.345, + "step": 31002 + }, + { + "epoch": 17.320111731843575, + "grad_norm": 0.4367726445198059, + "learning_rate": 0.00013476190476190475, + "loss": 0.4028, + "step": 31003 + }, + { + "epoch": 17.32067039106145, + "grad_norm": 0.5548036098480225, + "learning_rate": 0.00013473389355742296, + "loss": 0.3605, + "step": 31004 + }, + { + "epoch": 17.321229050279328, + "grad_norm": 0.3474282920360565, + "learning_rate": 0.00013470588235294116, + "loss": 0.3881, + "step": 31005 + }, + { + "epoch": 17.321787709497208, + "grad_norm": 0.46288466453552246, + "learning_rate": 0.0001346778711484594, + "loss": 0.3729, + "step": 31006 + }, + { + "epoch": 17.322346368715085, + "grad_norm": 0.7189444303512573, + "learning_rate": 0.0001346498599439776, + "loss": 0.468, + "step": 31007 + }, + { + "epoch": 17.32290502793296, + "grad_norm": 1.3265817165374756, + "learning_rate": 0.00013462184873949578, + "loss": 0.4116, + "step": 31008 + }, + { + "epoch": 17.323463687150838, + "grad_norm": 0.37299486994743347, + "learning_rate": 0.000134593837535014, + "loss": 0.3733, + "step": 31009 + }, + { + "epoch": 17.324022346368714, + "grad_norm": 0.4688703715801239, + "learning_rate": 0.00013456582633053222, + "loss": 0.489, + "step": 31010 + }, + { + "epoch": 17.32458100558659, + "grad_norm": 0.30199870467185974, + "learning_rate": 0.00013453781512605043, + "loss": 0.3427, + "step": 31011 + }, + { + "epoch": 17.325139664804468, + "grad_norm": 0.8538857102394104, + "learning_rate": 0.00013450980392156863, + "loss": 0.4749, + "step": 31012 + }, + { + "epoch": 17.325698324022348, + "grad_norm": 0.4387122094631195, + "learning_rate": 0.0001344817927170868, + "loss": 0.4268, + "step": 31013 + }, + { + "epoch": 17.326256983240224, + "grad_norm": 0.8337236046791077, + "learning_rate": 0.00013445378151260505, + "loss": 0.7094, + "step": 31014 + }, + { + "epoch": 17.3268156424581, + "grad_norm": 0.6055076122283936, + "learning_rate": 0.00013442577030812325, + "loss": 0.4725, + "step": 31015 + }, + { + "epoch": 17.327374301675977, + "grad_norm": 0.46573007106781006, + "learning_rate": 0.00013439775910364146, + "loss": 0.4359, + "step": 31016 + }, + { + "epoch": 17.327932960893854, + "grad_norm": 0.8039844632148743, + "learning_rate": 0.0001343697478991597, + "loss": 0.5274, + "step": 31017 + }, + { + "epoch": 17.32849162011173, + "grad_norm": 0.5466184616088867, + "learning_rate": 0.00013434173669467787, + "loss": 0.4516, + "step": 31018 + }, + { + "epoch": 17.32905027932961, + "grad_norm": 0.8507922291755676, + "learning_rate": 0.00013431372549019608, + "loss": 0.3562, + "step": 31019 + }, + { + "epoch": 17.329608938547487, + "grad_norm": 0.6679883003234863, + "learning_rate": 0.00013428571428571428, + "loss": 0.4599, + "step": 31020 + }, + { + "epoch": 17.330167597765364, + "grad_norm": 2.3208248615264893, + "learning_rate": 0.00013425770308123251, + "loss": 0.3567, + "step": 31021 + }, + { + "epoch": 17.33072625698324, + "grad_norm": 9.774212837219238, + "learning_rate": 0.00013422969187675072, + "loss": 0.3518, + "step": 31022 + }, + { + "epoch": 17.331284916201117, + "grad_norm": 0.4367597699165344, + "learning_rate": 0.0001342016806722689, + "loss": 0.3901, + "step": 31023 + }, + { + "epoch": 17.331843575418993, + "grad_norm": 0.4077896773815155, + "learning_rate": 0.0001341736694677871, + "loss": 0.4077, + "step": 31024 + }, + { + "epoch": 17.33240223463687, + "grad_norm": 2.621882200241089, + "learning_rate": 0.00013414565826330534, + "loss": 0.5258, + "step": 31025 + }, + { + "epoch": 17.33296089385475, + "grad_norm": 0.374945729970932, + "learning_rate": 0.00013411764705882354, + "loss": 0.3027, + "step": 31026 + }, + { + "epoch": 17.333519553072627, + "grad_norm": 0.5708574652671814, + "learning_rate": 0.00013408963585434175, + "loss": 0.3692, + "step": 31027 + }, + { + "epoch": 17.334078212290503, + "grad_norm": 0.7426781058311462, + "learning_rate": 0.00013406162464985993, + "loss": 0.3747, + "step": 31028 + }, + { + "epoch": 17.33463687150838, + "grad_norm": 0.7198552489280701, + "learning_rate": 0.00013403361344537816, + "loss": 0.2718, + "step": 31029 + }, + { + "epoch": 17.335195530726256, + "grad_norm": 0.5650630593299866, + "learning_rate": 0.00013400560224089637, + "loss": 0.3947, + "step": 31030 + }, + { + "epoch": 17.335754189944133, + "grad_norm": 0.4685058891773224, + "learning_rate": 0.00013397759103641457, + "loss": 0.4047, + "step": 31031 + }, + { + "epoch": 17.33631284916201, + "grad_norm": 0.4511740803718567, + "learning_rate": 0.00013394957983193278, + "loss": 0.4206, + "step": 31032 + }, + { + "epoch": 17.33687150837989, + "grad_norm": 0.893106997013092, + "learning_rate": 0.000133921568627451, + "loss": 0.3377, + "step": 31033 + }, + { + "epoch": 17.337430167597766, + "grad_norm": 0.4217708706855774, + "learning_rate": 0.0001338935574229692, + "loss": 0.4595, + "step": 31034 + }, + { + "epoch": 17.337988826815643, + "grad_norm": 0.47246602177619934, + "learning_rate": 0.0001338655462184874, + "loss": 0.4727, + "step": 31035 + }, + { + "epoch": 17.33854748603352, + "grad_norm": 0.43586745858192444, + "learning_rate": 0.0001338375350140056, + "loss": 0.4012, + "step": 31036 + }, + { + "epoch": 17.339106145251396, + "grad_norm": 1.1428580284118652, + "learning_rate": 0.00013380952380952384, + "loss": 0.4178, + "step": 31037 + }, + { + "epoch": 17.339664804469272, + "grad_norm": 0.6881787776947021, + "learning_rate": 0.00013378151260504202, + "loss": 0.4648, + "step": 31038 + }, + { + "epoch": 17.340223463687153, + "grad_norm": 0.9605644941329956, + "learning_rate": 0.00013375350140056022, + "loss": 0.4384, + "step": 31039 + }, + { + "epoch": 17.34078212290503, + "grad_norm": 0.4451036751270294, + "learning_rate": 0.00013372549019607843, + "loss": 0.3957, + "step": 31040 + }, + { + "epoch": 17.341340782122906, + "grad_norm": 0.4208865165710449, + "learning_rate": 0.00013369747899159666, + "loss": 0.4122, + "step": 31041 + }, + { + "epoch": 17.341899441340782, + "grad_norm": 0.5682745575904846, + "learning_rate": 0.00013366946778711487, + "loss": 0.3828, + "step": 31042 + }, + { + "epoch": 17.34245810055866, + "grad_norm": 0.4290216565132141, + "learning_rate": 0.00013364145658263305, + "loss": 0.3744, + "step": 31043 + }, + { + "epoch": 17.343016759776535, + "grad_norm": 0.9240705370903015, + "learning_rate": 0.00013361344537815125, + "loss": 0.3837, + "step": 31044 + }, + { + "epoch": 17.343575418994412, + "grad_norm": 0.4382179081439972, + "learning_rate": 0.00013358543417366949, + "loss": 0.5254, + "step": 31045 + }, + { + "epoch": 17.344134078212292, + "grad_norm": 0.7130903005599976, + "learning_rate": 0.0001335574229691877, + "loss": 0.4986, + "step": 31046 + }, + { + "epoch": 17.34469273743017, + "grad_norm": 0.608599841594696, + "learning_rate": 0.00013352941176470587, + "loss": 0.3217, + "step": 31047 + }, + { + "epoch": 17.345251396648045, + "grad_norm": 1.5888854265213013, + "learning_rate": 0.00013350140056022408, + "loss": 0.5133, + "step": 31048 + }, + { + "epoch": 17.345810055865922, + "grad_norm": 0.5445583462715149, + "learning_rate": 0.0001334733893557423, + "loss": 0.3349, + "step": 31049 + }, + { + "epoch": 17.3463687150838, + "grad_norm": 1.5335725545883179, + "learning_rate": 0.00013344537815126052, + "loss": 0.3488, + "step": 31050 + }, + { + "epoch": 17.346927374301675, + "grad_norm": 0.35896986722946167, + "learning_rate": 0.00013341736694677872, + "loss": 0.354, + "step": 31051 + }, + { + "epoch": 17.34748603351955, + "grad_norm": 0.46245482563972473, + "learning_rate": 0.0001333893557422969, + "loss": 0.432, + "step": 31052 + }, + { + "epoch": 17.34804469273743, + "grad_norm": 1.0253490209579468, + "learning_rate": 0.00013336134453781513, + "loss": 0.4725, + "step": 31053 + }, + { + "epoch": 17.34860335195531, + "grad_norm": 0.40848150849342346, + "learning_rate": 0.00013333333333333334, + "loss": 0.3143, + "step": 31054 + }, + { + "epoch": 17.349162011173185, + "grad_norm": 0.8147862553596497, + "learning_rate": 0.00013330532212885155, + "loss": 0.4263, + "step": 31055 + }, + { + "epoch": 17.34972067039106, + "grad_norm": 0.40357330441474915, + "learning_rate": 0.00013327731092436975, + "loss": 0.3502, + "step": 31056 + }, + { + "epoch": 17.350279329608938, + "grad_norm": 0.8164263367652893, + "learning_rate": 0.00013324929971988796, + "loss": 0.5221, + "step": 31057 + }, + { + "epoch": 17.350837988826814, + "grad_norm": 0.6612587571144104, + "learning_rate": 0.00013322128851540616, + "loss": 0.4333, + "step": 31058 + }, + { + "epoch": 17.35139664804469, + "grad_norm": 0.4285007417201996, + "learning_rate": 0.00013319327731092437, + "loss": 0.3303, + "step": 31059 + }, + { + "epoch": 17.35195530726257, + "grad_norm": 0.7287948727607727, + "learning_rate": 0.00013316526610644258, + "loss": 0.3868, + "step": 31060 + }, + { + "epoch": 17.352513966480448, + "grad_norm": 0.29196247458457947, + "learning_rate": 0.0001331372549019608, + "loss": 0.3313, + "step": 31061 + }, + { + "epoch": 17.353072625698324, + "grad_norm": 0.3559337854385376, + "learning_rate": 0.000133109243697479, + "loss": 0.3722, + "step": 31062 + }, + { + "epoch": 17.3536312849162, + "grad_norm": 0.5292189121246338, + "learning_rate": 0.0001330812324929972, + "loss": 0.4242, + "step": 31063 + }, + { + "epoch": 17.354189944134077, + "grad_norm": 0.37551379203796387, + "learning_rate": 0.0001330532212885154, + "loss": 0.3307, + "step": 31064 + }, + { + "epoch": 17.354748603351954, + "grad_norm": 0.5712440609931946, + "learning_rate": 0.00013302521008403363, + "loss": 0.417, + "step": 31065 + }, + { + "epoch": 17.355307262569834, + "grad_norm": 0.52461177110672, + "learning_rate": 0.00013299719887955184, + "loss": 0.4474, + "step": 31066 + }, + { + "epoch": 17.35586592178771, + "grad_norm": 0.5089712142944336, + "learning_rate": 0.00013296918767507002, + "loss": 0.5588, + "step": 31067 + }, + { + "epoch": 17.356424581005587, + "grad_norm": 0.6703516840934753, + "learning_rate": 0.00013294117647058822, + "loss": 0.4169, + "step": 31068 + }, + { + "epoch": 17.356983240223464, + "grad_norm": 0.4563344717025757, + "learning_rate": 0.00013291316526610646, + "loss": 0.5132, + "step": 31069 + }, + { + "epoch": 17.35754189944134, + "grad_norm": 0.9744804501533508, + "learning_rate": 0.00013288515406162466, + "loss": 0.4448, + "step": 31070 + }, + { + "epoch": 17.358100558659217, + "grad_norm": 0.4727672338485718, + "learning_rate": 0.00013285714285714287, + "loss": 0.4282, + "step": 31071 + }, + { + "epoch": 17.358659217877094, + "grad_norm": 0.38442444801330566, + "learning_rate": 0.00013282913165266105, + "loss": 0.3821, + "step": 31072 + }, + { + "epoch": 17.359217877094974, + "grad_norm": 0.5265441536903381, + "learning_rate": 0.00013280112044817928, + "loss": 0.5058, + "step": 31073 + }, + { + "epoch": 17.35977653631285, + "grad_norm": 0.4554186761379242, + "learning_rate": 0.0001327731092436975, + "loss": 0.3197, + "step": 31074 + }, + { + "epoch": 17.360335195530727, + "grad_norm": 2.4853837490081787, + "learning_rate": 0.0001327450980392157, + "loss": 0.4539, + "step": 31075 + }, + { + "epoch": 17.360893854748603, + "grad_norm": 0.4225909411907196, + "learning_rate": 0.0001327170868347339, + "loss": 0.3803, + "step": 31076 + }, + { + "epoch": 17.36145251396648, + "grad_norm": 0.44362735748291016, + "learning_rate": 0.0001326890756302521, + "loss": 0.4426, + "step": 31077 + }, + { + "epoch": 17.362011173184356, + "grad_norm": 0.5334896445274353, + "learning_rate": 0.0001326610644257703, + "loss": 0.382, + "step": 31078 + }, + { + "epoch": 17.362569832402233, + "grad_norm": 0.6462735533714294, + "learning_rate": 0.00013263305322128852, + "loss": 0.3612, + "step": 31079 + }, + { + "epoch": 17.363128491620113, + "grad_norm": 0.38574841618537903, + "learning_rate": 0.00013260504201680672, + "loss": 0.5019, + "step": 31080 + }, + { + "epoch": 17.36368715083799, + "grad_norm": 11.807450294494629, + "learning_rate": 0.00013257703081232496, + "loss": 0.4538, + "step": 31081 + }, + { + "epoch": 17.364245810055866, + "grad_norm": 0.3913212716579437, + "learning_rate": 0.00013254901960784313, + "loss": 0.3783, + "step": 31082 + }, + { + "epoch": 17.364804469273743, + "grad_norm": 0.3708021640777588, + "learning_rate": 0.00013252100840336134, + "loss": 0.4296, + "step": 31083 + }, + { + "epoch": 17.36536312849162, + "grad_norm": 0.29742497205734253, + "learning_rate": 0.00013249299719887955, + "loss": 0.3355, + "step": 31084 + }, + { + "epoch": 17.365921787709496, + "grad_norm": 0.3839186131954193, + "learning_rate": 0.00013246498599439778, + "loss": 0.2789, + "step": 31085 + }, + { + "epoch": 17.366480446927373, + "grad_norm": 1.1304399967193604, + "learning_rate": 0.00013243697478991599, + "loss": 0.3587, + "step": 31086 + }, + { + "epoch": 17.367039106145253, + "grad_norm": 1.2746134996414185, + "learning_rate": 0.00013240896358543416, + "loss": 0.5103, + "step": 31087 + }, + { + "epoch": 17.36759776536313, + "grad_norm": 0.4955059587955475, + "learning_rate": 0.00013238095238095237, + "loss": 0.4981, + "step": 31088 + }, + { + "epoch": 17.368156424581006, + "grad_norm": 0.48075589537620544, + "learning_rate": 0.0001323529411764706, + "loss": 0.536, + "step": 31089 + }, + { + "epoch": 17.368715083798882, + "grad_norm": 0.44370028376579285, + "learning_rate": 0.0001323249299719888, + "loss": 0.3746, + "step": 31090 + }, + { + "epoch": 17.36927374301676, + "grad_norm": 0.38735711574554443, + "learning_rate": 0.00013229691876750702, + "loss": 0.381, + "step": 31091 + }, + { + "epoch": 17.369832402234636, + "grad_norm": 0.6469260454177856, + "learning_rate": 0.0001322689075630252, + "loss": 0.4111, + "step": 31092 + }, + { + "epoch": 17.370391061452516, + "grad_norm": 0.5939928293228149, + "learning_rate": 0.00013224089635854343, + "loss": 0.4654, + "step": 31093 + }, + { + "epoch": 17.370949720670392, + "grad_norm": 0.44604727625846863, + "learning_rate": 0.00013221288515406163, + "loss": 0.3473, + "step": 31094 + }, + { + "epoch": 17.37150837988827, + "grad_norm": 0.5927965044975281, + "learning_rate": 0.00013218487394957984, + "loss": 0.3626, + "step": 31095 + }, + { + "epoch": 17.372067039106145, + "grad_norm": 0.4443076550960541, + "learning_rate": 0.00013215686274509805, + "loss": 0.4574, + "step": 31096 + }, + { + "epoch": 17.372625698324022, + "grad_norm": 0.6326189041137695, + "learning_rate": 0.00013212885154061625, + "loss": 0.4052, + "step": 31097 + }, + { + "epoch": 17.3731843575419, + "grad_norm": 0.41950735449790955, + "learning_rate": 0.00013210084033613446, + "loss": 0.3325, + "step": 31098 + }, + { + "epoch": 17.373743016759775, + "grad_norm": 0.842399001121521, + "learning_rate": 0.00013207282913165266, + "loss": 0.3975, + "step": 31099 + }, + { + "epoch": 17.374301675977655, + "grad_norm": 0.4795132875442505, + "learning_rate": 0.00013204481792717087, + "loss": 0.3684, + "step": 31100 + }, + { + "epoch": 17.37486033519553, + "grad_norm": 0.5365030765533447, + "learning_rate": 0.00013201680672268908, + "loss": 0.3895, + "step": 31101 + }, + { + "epoch": 17.37541899441341, + "grad_norm": 0.44384628534317017, + "learning_rate": 0.00013198879551820728, + "loss": 0.4035, + "step": 31102 + }, + { + "epoch": 17.375977653631285, + "grad_norm": 0.5448676347732544, + "learning_rate": 0.0001319607843137255, + "loss": 0.4162, + "step": 31103 + }, + { + "epoch": 17.37653631284916, + "grad_norm": 0.31774941086769104, + "learning_rate": 0.0001319327731092437, + "loss": 0.3768, + "step": 31104 + }, + { + "epoch": 17.377094972067038, + "grad_norm": 0.8972043395042419, + "learning_rate": 0.00013190476190476193, + "loss": 0.3863, + "step": 31105 + }, + { + "epoch": 17.377653631284915, + "grad_norm": 1.0762656927108765, + "learning_rate": 0.0001318767507002801, + "loss": 0.389, + "step": 31106 + }, + { + "epoch": 17.378212290502795, + "grad_norm": 0.3484049439430237, + "learning_rate": 0.0001318487394957983, + "loss": 0.3416, + "step": 31107 + }, + { + "epoch": 17.37877094972067, + "grad_norm": 5.075378894805908, + "learning_rate": 0.00013182072829131652, + "loss": 0.4321, + "step": 31108 + }, + { + "epoch": 17.379329608938548, + "grad_norm": 1.223828673362732, + "learning_rate": 0.00013179271708683475, + "loss": 0.4254, + "step": 31109 + }, + { + "epoch": 17.379888268156424, + "grad_norm": 0.32664188742637634, + "learning_rate": 0.00013176470588235296, + "loss": 0.3373, + "step": 31110 + }, + { + "epoch": 17.3804469273743, + "grad_norm": 0.35772499442100525, + "learning_rate": 0.00013173669467787114, + "loss": 0.3857, + "step": 31111 + }, + { + "epoch": 17.381005586592178, + "grad_norm": 0.4438253939151764, + "learning_rate": 0.00013170868347338934, + "loss": 0.403, + "step": 31112 + }, + { + "epoch": 17.381564245810054, + "grad_norm": 0.5758340954780579, + "learning_rate": 0.00013168067226890757, + "loss": 0.4319, + "step": 31113 + }, + { + "epoch": 17.382122905027934, + "grad_norm": 0.4089943468570709, + "learning_rate": 0.00013165266106442578, + "loss": 0.2961, + "step": 31114 + }, + { + "epoch": 17.38268156424581, + "grad_norm": 0.5886080861091614, + "learning_rate": 0.000131624649859944, + "loss": 0.4895, + "step": 31115 + }, + { + "epoch": 17.383240223463687, + "grad_norm": 1.3975069522857666, + "learning_rate": 0.00013159663865546217, + "loss": 0.3515, + "step": 31116 + }, + { + "epoch": 17.383798882681564, + "grad_norm": 0.6111028790473938, + "learning_rate": 0.0001315686274509804, + "loss": 0.4484, + "step": 31117 + }, + { + "epoch": 17.38435754189944, + "grad_norm": 0.44044652581214905, + "learning_rate": 0.0001315406162464986, + "loss": 0.3735, + "step": 31118 + }, + { + "epoch": 17.384916201117317, + "grad_norm": 1.429068922996521, + "learning_rate": 0.0001315126050420168, + "loss": 0.4147, + "step": 31119 + }, + { + "epoch": 17.385474860335197, + "grad_norm": 0.37906938791275024, + "learning_rate": 0.00013148459383753502, + "loss": 0.3015, + "step": 31120 + }, + { + "epoch": 17.386033519553074, + "grad_norm": 0.38827309012413025, + "learning_rate": 0.00013145658263305322, + "loss": 0.381, + "step": 31121 + }, + { + "epoch": 17.38659217877095, + "grad_norm": 0.7701229453086853, + "learning_rate": 0.00013142857142857143, + "loss": 0.4689, + "step": 31122 + }, + { + "epoch": 17.387150837988827, + "grad_norm": 0.346907377243042, + "learning_rate": 0.00013140056022408963, + "loss": 0.4313, + "step": 31123 + }, + { + "epoch": 17.387709497206703, + "grad_norm": 0.42327284812927246, + "learning_rate": 0.00013137254901960784, + "loss": 0.4318, + "step": 31124 + }, + { + "epoch": 17.38826815642458, + "grad_norm": 0.6674845814704895, + "learning_rate": 0.00013134453781512607, + "loss": 0.3852, + "step": 31125 + }, + { + "epoch": 17.388826815642457, + "grad_norm": 0.5044666528701782, + "learning_rate": 0.00013131652661064425, + "loss": 0.335, + "step": 31126 + }, + { + "epoch": 17.389385474860337, + "grad_norm": 0.7203618884086609, + "learning_rate": 0.00013128851540616246, + "loss": 0.388, + "step": 31127 + }, + { + "epoch": 17.389944134078213, + "grad_norm": 1.1940944194793701, + "learning_rate": 0.00013126050420168066, + "loss": 0.4515, + "step": 31128 + }, + { + "epoch": 17.39050279329609, + "grad_norm": 0.5435183644294739, + "learning_rate": 0.0001312324929971989, + "loss": 0.3519, + "step": 31129 + }, + { + "epoch": 17.391061452513966, + "grad_norm": 0.4717260003089905, + "learning_rate": 0.0001312044817927171, + "loss": 0.4059, + "step": 31130 + }, + { + "epoch": 17.391620111731843, + "grad_norm": 0.6585044264793396, + "learning_rate": 0.00013117647058823528, + "loss": 0.631, + "step": 31131 + }, + { + "epoch": 17.39217877094972, + "grad_norm": 0.45104146003723145, + "learning_rate": 0.0001311484593837535, + "loss": 0.3558, + "step": 31132 + }, + { + "epoch": 17.392737430167596, + "grad_norm": 0.5185086131095886, + "learning_rate": 0.00013112044817927172, + "loss": 0.5385, + "step": 31133 + }, + { + "epoch": 17.393296089385476, + "grad_norm": 0.7516040205955505, + "learning_rate": 0.00013109243697478993, + "loss": 0.4288, + "step": 31134 + }, + { + "epoch": 17.393854748603353, + "grad_norm": 0.5420228242874146, + "learning_rate": 0.00013106442577030813, + "loss": 0.744, + "step": 31135 + }, + { + "epoch": 17.39441340782123, + "grad_norm": 1.1047993898391724, + "learning_rate": 0.0001310364145658263, + "loss": 0.4011, + "step": 31136 + }, + { + "epoch": 17.394972067039106, + "grad_norm": 4.153549671173096, + "learning_rate": 0.00013100840336134455, + "loss": 0.3628, + "step": 31137 + }, + { + "epoch": 17.395530726256982, + "grad_norm": 0.6312436461448669, + "learning_rate": 0.00013098039215686275, + "loss": 0.4365, + "step": 31138 + }, + { + "epoch": 17.39608938547486, + "grad_norm": 0.5247658491134644, + "learning_rate": 0.00013095238095238096, + "loss": 0.371, + "step": 31139 + }, + { + "epoch": 17.39664804469274, + "grad_norm": 0.3999980092048645, + "learning_rate": 0.00013092436974789916, + "loss": 0.3387, + "step": 31140 + }, + { + "epoch": 17.397206703910616, + "grad_norm": 0.5040502548217773, + "learning_rate": 0.00013089635854341737, + "loss": 0.4086, + "step": 31141 + }, + { + "epoch": 17.397765363128492, + "grad_norm": 1.0589455366134644, + "learning_rate": 0.00013086834733893558, + "loss": 0.2532, + "step": 31142 + }, + { + "epoch": 17.39832402234637, + "grad_norm": 0.3961639106273651, + "learning_rate": 0.00013084033613445378, + "loss": 0.355, + "step": 31143 + }, + { + "epoch": 17.398882681564245, + "grad_norm": 0.6299583911895752, + "learning_rate": 0.000130812324929972, + "loss": 0.4084, + "step": 31144 + }, + { + "epoch": 17.399441340782122, + "grad_norm": 0.3586007058620453, + "learning_rate": 0.00013078431372549022, + "loss": 0.3633, + "step": 31145 + }, + { + "epoch": 17.4, + "grad_norm": 0.3738737404346466, + "learning_rate": 0.0001307563025210084, + "loss": 0.3777, + "step": 31146 + }, + { + "epoch": 17.40055865921788, + "grad_norm": 1.038413643836975, + "learning_rate": 0.0001307282913165266, + "loss": 0.5113, + "step": 31147 + }, + { + "epoch": 17.401117318435755, + "grad_norm": 0.5648976564407349, + "learning_rate": 0.0001307002801120448, + "loss": 0.57, + "step": 31148 + }, + { + "epoch": 17.401675977653632, + "grad_norm": 0.465968519449234, + "learning_rate": 0.00013067226890756304, + "loss": 0.3386, + "step": 31149 + }, + { + "epoch": 17.40223463687151, + "grad_norm": 0.5013076663017273, + "learning_rate": 0.00013064425770308125, + "loss": 0.5146, + "step": 31150 + }, + { + "epoch": 17.402793296089385, + "grad_norm": 0.869238555431366, + "learning_rate": 0.00013061624649859943, + "loss": 0.4005, + "step": 31151 + }, + { + "epoch": 17.40335195530726, + "grad_norm": 0.41563844680786133, + "learning_rate": 0.00013058823529411764, + "loss": 0.4106, + "step": 31152 + }, + { + "epoch": 17.403910614525138, + "grad_norm": 0.4195695221424103, + "learning_rate": 0.00013056022408963587, + "loss": 0.3817, + "step": 31153 + }, + { + "epoch": 17.404469273743018, + "grad_norm": 0.37012434005737305, + "learning_rate": 0.00013053221288515407, + "loss": 0.3547, + "step": 31154 + }, + { + "epoch": 17.405027932960895, + "grad_norm": 0.34807100892066956, + "learning_rate": 0.00013050420168067228, + "loss": 0.4019, + "step": 31155 + }, + { + "epoch": 17.40558659217877, + "grad_norm": 0.7357035279273987, + "learning_rate": 0.00013047619047619046, + "loss": 0.444, + "step": 31156 + }, + { + "epoch": 17.406145251396648, + "grad_norm": 0.6481372117996216, + "learning_rate": 0.0001304481792717087, + "loss": 0.4614, + "step": 31157 + }, + { + "epoch": 17.406703910614524, + "grad_norm": 0.9928131699562073, + "learning_rate": 0.0001304201680672269, + "loss": 0.3733, + "step": 31158 + }, + { + "epoch": 17.4072625698324, + "grad_norm": 0.37836578488349915, + "learning_rate": 0.0001303921568627451, + "loss": 0.3426, + "step": 31159 + }, + { + "epoch": 17.407821229050278, + "grad_norm": 0.33758580684661865, + "learning_rate": 0.00013036414565826328, + "loss": 0.3914, + "step": 31160 + }, + { + "epoch": 17.408379888268158, + "grad_norm": 0.39689308404922485, + "learning_rate": 0.00013033613445378152, + "loss": 0.3583, + "step": 31161 + }, + { + "epoch": 17.408938547486034, + "grad_norm": 0.5006991028785706, + "learning_rate": 0.00013030812324929972, + "loss": 0.4154, + "step": 31162 + }, + { + "epoch": 17.40949720670391, + "grad_norm": 0.5362796783447266, + "learning_rate": 0.00013028011204481793, + "loss": 0.409, + "step": 31163 + }, + { + "epoch": 17.410055865921787, + "grad_norm": 0.39778801798820496, + "learning_rate": 0.00013025210084033613, + "loss": 0.4054, + "step": 31164 + }, + { + "epoch": 17.410614525139664, + "grad_norm": 0.3496702015399933, + "learning_rate": 0.00013022408963585434, + "loss": 0.3646, + "step": 31165 + }, + { + "epoch": 17.41117318435754, + "grad_norm": 0.3237707316875458, + "learning_rate": 0.00013019607843137255, + "loss": 0.3432, + "step": 31166 + }, + { + "epoch": 17.41173184357542, + "grad_norm": 0.39388328790664673, + "learning_rate": 0.00013016806722689075, + "loss": 0.4349, + "step": 31167 + }, + { + "epoch": 17.412290502793297, + "grad_norm": 0.5242244601249695, + "learning_rate": 0.00013014005602240896, + "loss": 0.369, + "step": 31168 + }, + { + "epoch": 17.412849162011174, + "grad_norm": 0.42856332659721375, + "learning_rate": 0.0001301120448179272, + "loss": 0.4278, + "step": 31169 + }, + { + "epoch": 17.41340782122905, + "grad_norm": 2.6800715923309326, + "learning_rate": 0.00013008403361344537, + "loss": 0.4979, + "step": 31170 + }, + { + "epoch": 17.413966480446927, + "grad_norm": 0.3910575807094574, + "learning_rate": 0.00013005602240896358, + "loss": 0.4928, + "step": 31171 + }, + { + "epoch": 17.414525139664804, + "grad_norm": 0.31625622510910034, + "learning_rate": 0.00013002801120448178, + "loss": 0.2782, + "step": 31172 + }, + { + "epoch": 17.41508379888268, + "grad_norm": 0.38408100605010986, + "learning_rate": 0.00013000000000000002, + "loss": 0.4502, + "step": 31173 + }, + { + "epoch": 17.41564245810056, + "grad_norm": 0.38111215829849243, + "learning_rate": 0.00012997198879551822, + "loss": 0.3911, + "step": 31174 + }, + { + "epoch": 17.416201117318437, + "grad_norm": 0.9490625262260437, + "learning_rate": 0.0001299439775910364, + "loss": 0.4187, + "step": 31175 + }, + { + "epoch": 17.416759776536313, + "grad_norm": 0.3859139680862427, + "learning_rate": 0.0001299159663865546, + "loss": 0.4071, + "step": 31176 + }, + { + "epoch": 17.41731843575419, + "grad_norm": 0.5537550449371338, + "learning_rate": 0.00012988795518207284, + "loss": 0.5584, + "step": 31177 + }, + { + "epoch": 17.417877094972066, + "grad_norm": 0.4984540045261383, + "learning_rate": 0.00012985994397759105, + "loss": 0.5001, + "step": 31178 + }, + { + "epoch": 17.418435754189943, + "grad_norm": 0.445202499628067, + "learning_rate": 0.00012983193277310925, + "loss": 0.3912, + "step": 31179 + }, + { + "epoch": 17.41899441340782, + "grad_norm": 0.42921656370162964, + "learning_rate": 0.00012980392156862743, + "loss": 0.5417, + "step": 31180 + }, + { + "epoch": 17.4195530726257, + "grad_norm": 0.47433730959892273, + "learning_rate": 0.00012977591036414566, + "loss": 0.4553, + "step": 31181 + }, + { + "epoch": 17.420111731843576, + "grad_norm": 0.3896671533584595, + "learning_rate": 0.00012974789915966387, + "loss": 0.3289, + "step": 31182 + }, + { + "epoch": 17.420670391061453, + "grad_norm": 1.5151444673538208, + "learning_rate": 0.00012971988795518208, + "loss": 0.4431, + "step": 31183 + }, + { + "epoch": 17.42122905027933, + "grad_norm": 0.3981206715106964, + "learning_rate": 0.00012969187675070028, + "loss": 0.3853, + "step": 31184 + }, + { + "epoch": 17.421787709497206, + "grad_norm": 0.5477587580680847, + "learning_rate": 0.0001296638655462185, + "loss": 0.4347, + "step": 31185 + }, + { + "epoch": 17.422346368715083, + "grad_norm": 0.6910151243209839, + "learning_rate": 0.0001296358543417367, + "loss": 0.3903, + "step": 31186 + }, + { + "epoch": 17.422905027932963, + "grad_norm": 2.355640172958374, + "learning_rate": 0.0001296078431372549, + "loss": 0.3713, + "step": 31187 + }, + { + "epoch": 17.42346368715084, + "grad_norm": 0.42415162920951843, + "learning_rate": 0.0001295798319327731, + "loss": 0.3866, + "step": 31188 + }, + { + "epoch": 17.424022346368716, + "grad_norm": 0.4407443404197693, + "learning_rate": 0.00012955182072829134, + "loss": 0.2818, + "step": 31189 + }, + { + "epoch": 17.424581005586592, + "grad_norm": 0.49550893902778625, + "learning_rate": 0.00012952380952380952, + "loss": 0.4505, + "step": 31190 + }, + { + "epoch": 17.42513966480447, + "grad_norm": 0.5797774791717529, + "learning_rate": 0.00012949579831932772, + "loss": 0.3807, + "step": 31191 + }, + { + "epoch": 17.425698324022346, + "grad_norm": 0.4401281774044037, + "learning_rate": 0.00012946778711484593, + "loss": 0.3979, + "step": 31192 + }, + { + "epoch": 17.426256983240222, + "grad_norm": 0.6628199219703674, + "learning_rate": 0.00012943977591036416, + "loss": 0.4008, + "step": 31193 + }, + { + "epoch": 17.426815642458102, + "grad_norm": 1.6024513244628906, + "learning_rate": 0.00012941176470588237, + "loss": 0.3079, + "step": 31194 + }, + { + "epoch": 17.42737430167598, + "grad_norm": 0.34817370772361755, + "learning_rate": 0.00012938375350140055, + "loss": 0.3773, + "step": 31195 + }, + { + "epoch": 17.427932960893855, + "grad_norm": 0.4814103841781616, + "learning_rate": 0.00012935574229691875, + "loss": 0.4857, + "step": 31196 + }, + { + "epoch": 17.428491620111732, + "grad_norm": 0.3819580376148224, + "learning_rate": 0.000129327731092437, + "loss": 0.3314, + "step": 31197 + }, + { + "epoch": 17.42905027932961, + "grad_norm": 0.3839380741119385, + "learning_rate": 0.0001292997198879552, + "loss": 0.3444, + "step": 31198 + }, + { + "epoch": 17.429608938547485, + "grad_norm": 0.4543190598487854, + "learning_rate": 0.0001292717086834734, + "loss": 0.3227, + "step": 31199 + }, + { + "epoch": 17.43016759776536, + "grad_norm": 0.6105818152427673, + "learning_rate": 0.00012924369747899158, + "loss": 0.3553, + "step": 31200 + }, + { + "epoch": 17.43072625698324, + "grad_norm": 0.4673517048358917, + "learning_rate": 0.0001292156862745098, + "loss": 0.3473, + "step": 31201 + }, + { + "epoch": 17.43128491620112, + "grad_norm": 0.3725651204586029, + "learning_rate": 0.00012918767507002802, + "loss": 0.4416, + "step": 31202 + }, + { + "epoch": 17.431843575418995, + "grad_norm": 0.4732637405395508, + "learning_rate": 0.00012915966386554622, + "loss": 0.4132, + "step": 31203 + }, + { + "epoch": 17.43240223463687, + "grad_norm": 1.4968427419662476, + "learning_rate": 0.00012913165266106443, + "loss": 0.3913, + "step": 31204 + }, + { + "epoch": 17.432960893854748, + "grad_norm": 2.410842180252075, + "learning_rate": 0.00012910364145658263, + "loss": 0.4388, + "step": 31205 + }, + { + "epoch": 17.433519553072625, + "grad_norm": 0.38205739855766296, + "learning_rate": 0.00012907563025210084, + "loss": 0.3819, + "step": 31206 + }, + { + "epoch": 17.4340782122905, + "grad_norm": 0.44525229930877686, + "learning_rate": 0.00012904761904761905, + "loss": 0.355, + "step": 31207 + }, + { + "epoch": 17.43463687150838, + "grad_norm": 0.33054521679878235, + "learning_rate": 0.00012901960784313725, + "loss": 0.3039, + "step": 31208 + }, + { + "epoch": 17.435195530726258, + "grad_norm": 0.3912530541419983, + "learning_rate": 0.00012899159663865549, + "loss": 0.4294, + "step": 31209 + }, + { + "epoch": 17.435754189944134, + "grad_norm": 1.9333300590515137, + "learning_rate": 0.00012896358543417366, + "loss": 0.3906, + "step": 31210 + }, + { + "epoch": 17.43631284916201, + "grad_norm": 0.3383536636829376, + "learning_rate": 0.00012893557422969187, + "loss": 0.3803, + "step": 31211 + }, + { + "epoch": 17.436871508379888, + "grad_norm": 0.33888325095176697, + "learning_rate": 0.00012890756302521008, + "loss": 0.3367, + "step": 31212 + }, + { + "epoch": 17.437430167597764, + "grad_norm": 0.44426989555358887, + "learning_rate": 0.0001288795518207283, + "loss": 0.3968, + "step": 31213 + }, + { + "epoch": 17.43798882681564, + "grad_norm": 0.4917047917842865, + "learning_rate": 0.0001288515406162465, + "loss": 0.3472, + "step": 31214 + }, + { + "epoch": 17.43854748603352, + "grad_norm": 0.42818641662597656, + "learning_rate": 0.0001288235294117647, + "loss": 0.442, + "step": 31215 + }, + { + "epoch": 17.439106145251397, + "grad_norm": 0.48286232352256775, + "learning_rate": 0.0001287955182072829, + "loss": 0.4061, + "step": 31216 + }, + { + "epoch": 17.439664804469274, + "grad_norm": 0.3871276080608368, + "learning_rate": 0.00012876750700280113, + "loss": 0.4856, + "step": 31217 + }, + { + "epoch": 17.44022346368715, + "grad_norm": 0.5358783602714539, + "learning_rate": 0.00012873949579831934, + "loss": 0.3334, + "step": 31218 + }, + { + "epoch": 17.440782122905027, + "grad_norm": 0.31744393706321716, + "learning_rate": 0.00012871148459383752, + "loss": 0.3663, + "step": 31219 + }, + { + "epoch": 17.441340782122904, + "grad_norm": 0.3956749737262726, + "learning_rate": 0.00012868347338935572, + "loss": 0.369, + "step": 31220 + }, + { + "epoch": 17.441899441340784, + "grad_norm": 0.44227921962738037, + "learning_rate": 0.00012865546218487396, + "loss": 0.5505, + "step": 31221 + }, + { + "epoch": 17.44245810055866, + "grad_norm": 0.5183186531066895, + "learning_rate": 0.00012862745098039216, + "loss": 0.5041, + "step": 31222 + }, + { + "epoch": 17.443016759776537, + "grad_norm": 0.7157148718833923, + "learning_rate": 0.00012859943977591037, + "loss": 0.391, + "step": 31223 + }, + { + "epoch": 17.443575418994413, + "grad_norm": 1.0879318714141846, + "learning_rate": 0.00012857142857142855, + "loss": 0.3555, + "step": 31224 + }, + { + "epoch": 17.44413407821229, + "grad_norm": 2.4511914253234863, + "learning_rate": 0.00012854341736694678, + "loss": 0.367, + "step": 31225 + }, + { + "epoch": 17.444692737430167, + "grad_norm": 0.3168339729309082, + "learning_rate": 0.000128515406162465, + "loss": 0.3696, + "step": 31226 + }, + { + "epoch": 17.445251396648043, + "grad_norm": 0.40689313411712646, + "learning_rate": 0.0001284873949579832, + "loss": 0.3725, + "step": 31227 + }, + { + "epoch": 17.445810055865923, + "grad_norm": 0.33695682883262634, + "learning_rate": 0.0001284593837535014, + "loss": 0.3602, + "step": 31228 + }, + { + "epoch": 17.4463687150838, + "grad_norm": 0.4141903817653656, + "learning_rate": 0.0001284313725490196, + "loss": 0.3079, + "step": 31229 + }, + { + "epoch": 17.446927374301676, + "grad_norm": 0.6350756287574768, + "learning_rate": 0.0001284033613445378, + "loss": 0.4451, + "step": 31230 + }, + { + "epoch": 17.447486033519553, + "grad_norm": 0.3371887505054474, + "learning_rate": 0.00012837535014005602, + "loss": 0.3402, + "step": 31231 + }, + { + "epoch": 17.44804469273743, + "grad_norm": 0.5235417485237122, + "learning_rate": 0.00012834733893557422, + "loss": 0.4089, + "step": 31232 + }, + { + "epoch": 17.448603351955306, + "grad_norm": 0.45878708362579346, + "learning_rate": 0.00012831932773109246, + "loss": 0.4281, + "step": 31233 + }, + { + "epoch": 17.449162011173183, + "grad_norm": 0.42058709263801575, + "learning_rate": 0.00012829131652661064, + "loss": 0.3968, + "step": 31234 + }, + { + "epoch": 17.449720670391063, + "grad_norm": 0.5157774686813354, + "learning_rate": 0.00012826330532212884, + "loss": 0.3863, + "step": 31235 + }, + { + "epoch": 17.45027932960894, + "grad_norm": 1.095288634300232, + "learning_rate": 0.00012823529411764705, + "loss": 0.4447, + "step": 31236 + }, + { + "epoch": 17.450837988826816, + "grad_norm": 0.5966135263442993, + "learning_rate": 0.00012820728291316528, + "loss": 0.5351, + "step": 31237 + }, + { + "epoch": 17.451396648044692, + "grad_norm": 2.243619441986084, + "learning_rate": 0.0001281792717086835, + "loss": 0.4642, + "step": 31238 + }, + { + "epoch": 17.45195530726257, + "grad_norm": 0.41884922981262207, + "learning_rate": 0.00012815126050420167, + "loss": 0.4461, + "step": 31239 + }, + { + "epoch": 17.452513966480446, + "grad_norm": 0.6158894300460815, + "learning_rate": 0.00012812324929971987, + "loss": 0.4425, + "step": 31240 + }, + { + "epoch": 17.453072625698326, + "grad_norm": 0.564399242401123, + "learning_rate": 0.0001280952380952381, + "loss": 0.45, + "step": 31241 + }, + { + "epoch": 17.453631284916202, + "grad_norm": 0.740170419216156, + "learning_rate": 0.0001280672268907563, + "loss": 0.395, + "step": 31242 + }, + { + "epoch": 17.45418994413408, + "grad_norm": 0.41129449009895325, + "learning_rate": 0.00012803921568627452, + "loss": 0.2984, + "step": 31243 + }, + { + "epoch": 17.454748603351955, + "grad_norm": 0.3733203411102295, + "learning_rate": 0.0001280112044817927, + "loss": 0.337, + "step": 31244 + }, + { + "epoch": 17.455307262569832, + "grad_norm": 0.5790176391601562, + "learning_rate": 0.00012798319327731093, + "loss": 0.3524, + "step": 31245 + }, + { + "epoch": 17.45586592178771, + "grad_norm": 0.546937882900238, + "learning_rate": 0.00012795518207282913, + "loss": 0.4131, + "step": 31246 + }, + { + "epoch": 17.456424581005585, + "grad_norm": 0.3751690685749054, + "learning_rate": 0.00012792717086834734, + "loss": 0.3928, + "step": 31247 + }, + { + "epoch": 17.456983240223465, + "grad_norm": 0.7142043709754944, + "learning_rate": 0.00012789915966386557, + "loss": 0.4499, + "step": 31248 + }, + { + "epoch": 17.457541899441342, + "grad_norm": 0.36646780371665955, + "learning_rate": 0.00012787114845938375, + "loss": 0.3072, + "step": 31249 + }, + { + "epoch": 17.45810055865922, + "grad_norm": 0.8351800441741943, + "learning_rate": 0.00012784313725490196, + "loss": 0.4419, + "step": 31250 + }, + { + "epoch": 17.458659217877095, + "grad_norm": 0.4552780091762543, + "learning_rate": 0.00012781512605042016, + "loss": 0.5186, + "step": 31251 + }, + { + "epoch": 17.45921787709497, + "grad_norm": 0.4794028699398041, + "learning_rate": 0.0001277871148459384, + "loss": 0.367, + "step": 31252 + }, + { + "epoch": 17.459776536312848, + "grad_norm": 1.0033352375030518, + "learning_rate": 0.0001277591036414566, + "loss": 0.3567, + "step": 31253 + }, + { + "epoch": 17.460335195530725, + "grad_norm": 0.9024407863616943, + "learning_rate": 0.00012773109243697478, + "loss": 0.5256, + "step": 31254 + }, + { + "epoch": 17.460893854748605, + "grad_norm": 0.5599893927574158, + "learning_rate": 0.000127703081232493, + "loss": 0.4971, + "step": 31255 + }, + { + "epoch": 17.46145251396648, + "grad_norm": 0.37672266364097595, + "learning_rate": 0.00012767507002801122, + "loss": 0.394, + "step": 31256 + }, + { + "epoch": 17.462011173184358, + "grad_norm": 0.5927656888961792, + "learning_rate": 0.00012764705882352943, + "loss": 0.5308, + "step": 31257 + }, + { + "epoch": 17.462569832402234, + "grad_norm": 0.8347872495651245, + "learning_rate": 0.00012761904761904763, + "loss": 0.3615, + "step": 31258 + }, + { + "epoch": 17.46312849162011, + "grad_norm": 0.4515547454357147, + "learning_rate": 0.0001275910364145658, + "loss": 0.4665, + "step": 31259 + }, + { + "epoch": 17.463687150837988, + "grad_norm": 1.0949366092681885, + "learning_rate": 0.00012756302521008405, + "loss": 0.457, + "step": 31260 + }, + { + "epoch": 17.464245810055864, + "grad_norm": 0.5063674449920654, + "learning_rate": 0.00012753501400560225, + "loss": 0.5971, + "step": 31261 + }, + { + "epoch": 17.464804469273744, + "grad_norm": 0.3996122181415558, + "learning_rate": 0.00012750700280112046, + "loss": 0.3813, + "step": 31262 + }, + { + "epoch": 17.46536312849162, + "grad_norm": 0.4895181655883789, + "learning_rate": 0.00012747899159663866, + "loss": 0.4412, + "step": 31263 + }, + { + "epoch": 17.465921787709497, + "grad_norm": 0.679352343082428, + "learning_rate": 0.00012745098039215687, + "loss": 0.456, + "step": 31264 + }, + { + "epoch": 17.466480446927374, + "grad_norm": 0.5667781233787537, + "learning_rate": 0.00012742296918767508, + "loss": 0.4587, + "step": 31265 + }, + { + "epoch": 17.46703910614525, + "grad_norm": 0.3791618347167969, + "learning_rate": 0.00012739495798319328, + "loss": 0.3619, + "step": 31266 + }, + { + "epoch": 17.467597765363127, + "grad_norm": 0.4643804132938385, + "learning_rate": 0.0001273669467787115, + "loss": 0.3956, + "step": 31267 + }, + { + "epoch": 17.468156424581007, + "grad_norm": 0.6760391592979431, + "learning_rate": 0.0001273389355742297, + "loss": 0.3493, + "step": 31268 + }, + { + "epoch": 17.468715083798884, + "grad_norm": 0.4165029525756836, + "learning_rate": 0.0001273109243697479, + "loss": 0.4396, + "step": 31269 + }, + { + "epoch": 17.46927374301676, + "grad_norm": 0.4410113990306854, + "learning_rate": 0.0001272829131652661, + "loss": 0.4114, + "step": 31270 + }, + { + "epoch": 17.469832402234637, + "grad_norm": 0.3582690954208374, + "learning_rate": 0.0001272549019607843, + "loss": 0.4026, + "step": 31271 + }, + { + "epoch": 17.470391061452514, + "grad_norm": 0.5017548203468323, + "learning_rate": 0.00012722689075630254, + "loss": 0.4254, + "step": 31272 + }, + { + "epoch": 17.47094972067039, + "grad_norm": 0.3959094285964966, + "learning_rate": 0.00012719887955182072, + "loss": 0.3526, + "step": 31273 + }, + { + "epoch": 17.471508379888267, + "grad_norm": 0.41697320342063904, + "learning_rate": 0.00012717086834733893, + "loss": 0.4482, + "step": 31274 + }, + { + "epoch": 17.472067039106147, + "grad_norm": 0.6840927004814148, + "learning_rate": 0.00012714285714285714, + "loss": 0.4384, + "step": 31275 + }, + { + "epoch": 17.472625698324023, + "grad_norm": 0.33184123039245605, + "learning_rate": 0.00012711484593837537, + "loss": 0.2965, + "step": 31276 + }, + { + "epoch": 17.4731843575419, + "grad_norm": 2.997969150543213, + "learning_rate": 0.00012708683473389357, + "loss": 0.3998, + "step": 31277 + }, + { + "epoch": 17.473743016759776, + "grad_norm": 0.5682587027549744, + "learning_rate": 0.00012705882352941175, + "loss": 0.4387, + "step": 31278 + }, + { + "epoch": 17.474301675977653, + "grad_norm": 0.47491955757141113, + "learning_rate": 0.00012703081232492996, + "loss": 0.5778, + "step": 31279 + }, + { + "epoch": 17.47486033519553, + "grad_norm": 0.4297771751880646, + "learning_rate": 0.0001270028011204482, + "loss": 0.3574, + "step": 31280 + }, + { + "epoch": 17.475418994413406, + "grad_norm": 2.0499913692474365, + "learning_rate": 0.0001269747899159664, + "loss": 0.3042, + "step": 31281 + }, + { + "epoch": 17.475977653631286, + "grad_norm": 0.48910534381866455, + "learning_rate": 0.0001269467787114846, + "loss": 0.4409, + "step": 31282 + }, + { + "epoch": 17.476536312849163, + "grad_norm": 0.5102927684783936, + "learning_rate": 0.00012691876750700278, + "loss": 0.4225, + "step": 31283 + }, + { + "epoch": 17.47709497206704, + "grad_norm": 0.3281363546848297, + "learning_rate": 0.00012689075630252102, + "loss": 0.3329, + "step": 31284 + }, + { + "epoch": 17.477653631284916, + "grad_norm": 0.5468024015426636, + "learning_rate": 0.00012686274509803922, + "loss": 0.4837, + "step": 31285 + }, + { + "epoch": 17.478212290502793, + "grad_norm": 0.7422975301742554, + "learning_rate": 0.00012683473389355743, + "loss": 0.4877, + "step": 31286 + }, + { + "epoch": 17.47877094972067, + "grad_norm": 0.3469271659851074, + "learning_rate": 0.00012680672268907563, + "loss": 0.3656, + "step": 31287 + }, + { + "epoch": 17.47932960893855, + "grad_norm": 0.6495400667190552, + "learning_rate": 0.00012677871148459384, + "loss": 0.4163, + "step": 31288 + }, + { + "epoch": 17.479888268156426, + "grad_norm": 0.5434854626655579, + "learning_rate": 0.00012675070028011205, + "loss": 0.3445, + "step": 31289 + }, + { + "epoch": 17.480446927374302, + "grad_norm": 3.2187349796295166, + "learning_rate": 0.00012672268907563025, + "loss": 0.354, + "step": 31290 + }, + { + "epoch": 17.48100558659218, + "grad_norm": 0.8823065757751465, + "learning_rate": 0.00012669467787114846, + "loss": 0.4252, + "step": 31291 + }, + { + "epoch": 17.481564245810056, + "grad_norm": 0.49800223112106323, + "learning_rate": 0.0001266666666666667, + "loss": 0.2975, + "step": 31292 + }, + { + "epoch": 17.482122905027932, + "grad_norm": 0.4639713168144226, + "learning_rate": 0.00012663865546218487, + "loss": 0.4643, + "step": 31293 + }, + { + "epoch": 17.48268156424581, + "grad_norm": 0.4206647574901581, + "learning_rate": 0.00012661064425770308, + "loss": 0.3625, + "step": 31294 + }, + { + "epoch": 17.48324022346369, + "grad_norm": 0.448525995016098, + "learning_rate": 0.00012658263305322128, + "loss": 0.2995, + "step": 31295 + }, + { + "epoch": 17.483798882681565, + "grad_norm": 0.391122967004776, + "learning_rate": 0.00012655462184873952, + "loss": 0.4144, + "step": 31296 + }, + { + "epoch": 17.484357541899442, + "grad_norm": 1.5043803453445435, + "learning_rate": 0.00012652661064425772, + "loss": 0.4243, + "step": 31297 + }, + { + "epoch": 17.48491620111732, + "grad_norm": 0.3991751968860626, + "learning_rate": 0.0001264985994397759, + "loss": 0.4054, + "step": 31298 + }, + { + "epoch": 17.485474860335195, + "grad_norm": 0.5427753925323486, + "learning_rate": 0.0001264705882352941, + "loss": 0.5228, + "step": 31299 + }, + { + "epoch": 17.48603351955307, + "grad_norm": 0.3899679481983185, + "learning_rate": 0.00012644257703081234, + "loss": 0.3706, + "step": 31300 + }, + { + "epoch": 17.486592178770948, + "grad_norm": 3.9391305446624756, + "learning_rate": 0.00012641456582633055, + "loss": 0.3294, + "step": 31301 + }, + { + "epoch": 17.48715083798883, + "grad_norm": 0.4592090845108032, + "learning_rate": 0.00012638655462184875, + "loss": 0.3922, + "step": 31302 + }, + { + "epoch": 17.487709497206705, + "grad_norm": 0.5964315533638, + "learning_rate": 0.00012635854341736693, + "loss": 0.4505, + "step": 31303 + }, + { + "epoch": 17.48826815642458, + "grad_norm": 0.5506134629249573, + "learning_rate": 0.00012633053221288516, + "loss": 0.448, + "step": 31304 + }, + { + "epoch": 17.488826815642458, + "grad_norm": 0.6865684390068054, + "learning_rate": 0.00012630252100840337, + "loss": 0.5109, + "step": 31305 + }, + { + "epoch": 17.489385474860335, + "grad_norm": 0.4935726523399353, + "learning_rate": 0.00012627450980392158, + "loss": 0.4008, + "step": 31306 + }, + { + "epoch": 17.48994413407821, + "grad_norm": 6.239386558532715, + "learning_rate": 0.00012624649859943978, + "loss": 0.4636, + "step": 31307 + }, + { + "epoch": 17.490502793296088, + "grad_norm": 1.9036096334457397, + "learning_rate": 0.000126218487394958, + "loss": 0.3467, + "step": 31308 + }, + { + "epoch": 17.491061452513968, + "grad_norm": 1.24658203125, + "learning_rate": 0.0001261904761904762, + "loss": 0.4893, + "step": 31309 + }, + { + "epoch": 17.491620111731844, + "grad_norm": 0.615731954574585, + "learning_rate": 0.0001261624649859944, + "loss": 0.4222, + "step": 31310 + }, + { + "epoch": 17.49217877094972, + "grad_norm": 0.42554715275764465, + "learning_rate": 0.0001261344537815126, + "loss": 0.6147, + "step": 31311 + }, + { + "epoch": 17.492737430167598, + "grad_norm": 0.6205093264579773, + "learning_rate": 0.00012610644257703084, + "loss": 0.3121, + "step": 31312 + }, + { + "epoch": 17.493296089385474, + "grad_norm": 0.6284263730049133, + "learning_rate": 0.00012607843137254902, + "loss": 0.4786, + "step": 31313 + }, + { + "epoch": 17.49385474860335, + "grad_norm": 1.0057072639465332, + "learning_rate": 0.00012605042016806722, + "loss": 0.5219, + "step": 31314 + }, + { + "epoch": 17.49441340782123, + "grad_norm": 0.4818427860736847, + "learning_rate": 0.00012602240896358543, + "loss": 0.5274, + "step": 31315 + }, + { + "epoch": 17.494972067039107, + "grad_norm": 0.7015307545661926, + "learning_rate": 0.00012599439775910366, + "loss": 0.4136, + "step": 31316 + }, + { + "epoch": 17.495530726256984, + "grad_norm": 1.4937360286712646, + "learning_rate": 0.00012596638655462187, + "loss": 0.4036, + "step": 31317 + }, + { + "epoch": 17.49608938547486, + "grad_norm": 0.9747006297111511, + "learning_rate": 0.00012593837535014005, + "loss": 0.3422, + "step": 31318 + }, + { + "epoch": 17.496648044692737, + "grad_norm": 0.3876058757305145, + "learning_rate": 0.00012591036414565825, + "loss": 0.3966, + "step": 31319 + }, + { + "epoch": 17.497206703910614, + "grad_norm": 0.48371076583862305, + "learning_rate": 0.0001258823529411765, + "loss": 0.3915, + "step": 31320 + }, + { + "epoch": 17.49776536312849, + "grad_norm": 0.3128376007080078, + "learning_rate": 0.0001258543417366947, + "loss": 0.3079, + "step": 31321 + }, + { + "epoch": 17.49832402234637, + "grad_norm": 0.4322524964809418, + "learning_rate": 0.0001258263305322129, + "loss": 0.3691, + "step": 31322 + }, + { + "epoch": 17.498882681564247, + "grad_norm": 0.45457175374031067, + "learning_rate": 0.00012579831932773108, + "loss": 0.4365, + "step": 31323 + }, + { + "epoch": 17.499441340782123, + "grad_norm": 0.5468364953994751, + "learning_rate": 0.0001257703081232493, + "loss": 0.3698, + "step": 31324 + }, + { + "epoch": 17.5, + "grad_norm": 0.9551847577095032, + "learning_rate": 0.00012574229691876752, + "loss": 0.3923, + "step": 31325 + }, + { + "epoch": 17.500558659217877, + "grad_norm": 0.3207535445690155, + "learning_rate": 0.00012571428571428572, + "loss": 0.318, + "step": 31326 + }, + { + "epoch": 17.501117318435753, + "grad_norm": 0.8012207746505737, + "learning_rate": 0.0001256862745098039, + "loss": 0.466, + "step": 31327 + }, + { + "epoch": 17.50167597765363, + "grad_norm": 0.42065680027008057, + "learning_rate": 0.00012565826330532213, + "loss": 0.3795, + "step": 31328 + }, + { + "epoch": 17.50223463687151, + "grad_norm": 0.45595505833625793, + "learning_rate": 0.00012563025210084034, + "loss": 0.465, + "step": 31329 + }, + { + "epoch": 17.502793296089386, + "grad_norm": 0.4388836622238159, + "learning_rate": 0.00012560224089635855, + "loss": 0.4349, + "step": 31330 + }, + { + "epoch": 17.503351955307263, + "grad_norm": 0.3341262936592102, + "learning_rate": 0.00012557422969187675, + "loss": 0.4089, + "step": 31331 + }, + { + "epoch": 17.50391061452514, + "grad_norm": 0.48142194747924805, + "learning_rate": 0.00012554621848739496, + "loss": 0.4203, + "step": 31332 + }, + { + "epoch": 17.504469273743016, + "grad_norm": 0.4287753999233246, + "learning_rate": 0.00012551820728291316, + "loss": 0.406, + "step": 31333 + }, + { + "epoch": 17.505027932960893, + "grad_norm": 0.382752925157547, + "learning_rate": 0.00012549019607843137, + "loss": 0.3868, + "step": 31334 + }, + { + "epoch": 17.505586592178773, + "grad_norm": 0.4057750403881073, + "learning_rate": 0.00012546218487394958, + "loss": 0.4076, + "step": 31335 + }, + { + "epoch": 17.50614525139665, + "grad_norm": 0.4480024576187134, + "learning_rate": 0.0001254341736694678, + "loss": 0.4189, + "step": 31336 + }, + { + "epoch": 17.506703910614526, + "grad_norm": 0.42614173889160156, + "learning_rate": 0.000125406162464986, + "loss": 0.399, + "step": 31337 + }, + { + "epoch": 17.507262569832402, + "grad_norm": 0.8793990612030029, + "learning_rate": 0.0001253781512605042, + "loss": 0.4118, + "step": 31338 + }, + { + "epoch": 17.50782122905028, + "grad_norm": 0.7743203043937683, + "learning_rate": 0.0001253501400560224, + "loss": 0.3745, + "step": 31339 + }, + { + "epoch": 17.508379888268156, + "grad_norm": 0.3688618838787079, + "learning_rate": 0.00012532212885154063, + "loss": 0.4876, + "step": 31340 + }, + { + "epoch": 17.508938547486032, + "grad_norm": 0.5979149341583252, + "learning_rate": 0.00012529411764705884, + "loss": 0.4168, + "step": 31341 + }, + { + "epoch": 17.509497206703912, + "grad_norm": 0.5943663120269775, + "learning_rate": 0.00012526610644257702, + "loss": 0.3421, + "step": 31342 + }, + { + "epoch": 17.51005586592179, + "grad_norm": 4.5099992752075195, + "learning_rate": 0.00012523809523809522, + "loss": 0.4085, + "step": 31343 + }, + { + "epoch": 17.510614525139665, + "grad_norm": 0.9150093793869019, + "learning_rate": 0.00012521008403361346, + "loss": 0.4656, + "step": 31344 + }, + { + "epoch": 17.511173184357542, + "grad_norm": 0.43181443214416504, + "learning_rate": 0.00012518207282913166, + "loss": 0.5019, + "step": 31345 + }, + { + "epoch": 17.51173184357542, + "grad_norm": 0.6618065237998962, + "learning_rate": 0.00012515406162464987, + "loss": 0.6839, + "step": 31346 + }, + { + "epoch": 17.512290502793295, + "grad_norm": 0.9445618391036987, + "learning_rate": 0.00012512605042016805, + "loss": 0.5031, + "step": 31347 + }, + { + "epoch": 17.51284916201117, + "grad_norm": 0.5031107068061829, + "learning_rate": 0.00012509803921568628, + "loss": 0.5551, + "step": 31348 + }, + { + "epoch": 17.513407821229052, + "grad_norm": 0.391179621219635, + "learning_rate": 0.0001250700280112045, + "loss": 0.4311, + "step": 31349 + }, + { + "epoch": 17.51396648044693, + "grad_norm": 1.2921807765960693, + "learning_rate": 0.0001250420168067227, + "loss": 0.4065, + "step": 31350 + }, + { + "epoch": 17.514525139664805, + "grad_norm": 1.0348345041275024, + "learning_rate": 0.0001250140056022409, + "loss": 0.3986, + "step": 31351 + }, + { + "epoch": 17.51508379888268, + "grad_norm": 0.5795475244522095, + "learning_rate": 0.0001249859943977591, + "loss": 0.4414, + "step": 31352 + }, + { + "epoch": 17.515642458100558, + "grad_norm": 0.4601471424102783, + "learning_rate": 0.0001249579831932773, + "loss": 0.4582, + "step": 31353 + }, + { + "epoch": 17.516201117318435, + "grad_norm": 2.1328625679016113, + "learning_rate": 0.00012492997198879552, + "loss": 0.3725, + "step": 31354 + }, + { + "epoch": 17.51675977653631, + "grad_norm": 0.5001599192619324, + "learning_rate": 0.00012490196078431372, + "loss": 0.438, + "step": 31355 + }, + { + "epoch": 17.51731843575419, + "grad_norm": 0.5486342310905457, + "learning_rate": 0.00012487394957983193, + "loss": 0.3573, + "step": 31356 + }, + { + "epoch": 17.517877094972068, + "grad_norm": 0.462859570980072, + "learning_rate": 0.00012484593837535014, + "loss": 0.4112, + "step": 31357 + }, + { + "epoch": 17.518435754189944, + "grad_norm": 0.6502798795700073, + "learning_rate": 0.00012481792717086834, + "loss": 0.4724, + "step": 31358 + }, + { + "epoch": 17.51899441340782, + "grad_norm": 0.5650374293327332, + "learning_rate": 0.00012478991596638655, + "loss": 0.4736, + "step": 31359 + }, + { + "epoch": 17.519553072625698, + "grad_norm": 0.5809507369995117, + "learning_rate": 0.00012476190476190478, + "loss": 0.5454, + "step": 31360 + }, + { + "epoch": 17.520111731843574, + "grad_norm": 0.5060024857521057, + "learning_rate": 0.00012473389355742296, + "loss": 0.4056, + "step": 31361 + }, + { + "epoch": 17.52067039106145, + "grad_norm": 0.40012040734291077, + "learning_rate": 0.0001247058823529412, + "loss": 0.4042, + "step": 31362 + }, + { + "epoch": 17.52122905027933, + "grad_norm": 0.4475395083427429, + "learning_rate": 0.00012467787114845937, + "loss": 0.324, + "step": 31363 + }, + { + "epoch": 17.521787709497207, + "grad_norm": 0.6050678491592407, + "learning_rate": 0.0001246498599439776, + "loss": 0.4074, + "step": 31364 + }, + { + "epoch": 17.522346368715084, + "grad_norm": 0.8675230145454407, + "learning_rate": 0.0001246218487394958, + "loss": 0.4499, + "step": 31365 + }, + { + "epoch": 17.52290502793296, + "grad_norm": 0.39340218901634216, + "learning_rate": 0.00012459383753501402, + "loss": 0.4347, + "step": 31366 + }, + { + "epoch": 17.523463687150837, + "grad_norm": 0.5411680340766907, + "learning_rate": 0.00012456582633053222, + "loss": 0.3305, + "step": 31367 + }, + { + "epoch": 17.524022346368714, + "grad_norm": 1.6587975025177002, + "learning_rate": 0.00012453781512605043, + "loss": 0.3332, + "step": 31368 + }, + { + "epoch": 17.524581005586594, + "grad_norm": 2.565991163253784, + "learning_rate": 0.00012450980392156863, + "loss": 0.4447, + "step": 31369 + }, + { + "epoch": 17.52513966480447, + "grad_norm": 0.4227873980998993, + "learning_rate": 0.00012448179271708684, + "loss": 0.4126, + "step": 31370 + }, + { + "epoch": 17.525698324022347, + "grad_norm": 0.45929771661758423, + "learning_rate": 0.00012445378151260505, + "loss": 0.3582, + "step": 31371 + }, + { + "epoch": 17.526256983240224, + "grad_norm": 0.5435831546783447, + "learning_rate": 0.00012442577030812325, + "loss": 0.4984, + "step": 31372 + }, + { + "epoch": 17.5268156424581, + "grad_norm": 0.41643592715263367, + "learning_rate": 0.00012439775910364146, + "loss": 0.4074, + "step": 31373 + }, + { + "epoch": 17.527374301675977, + "grad_norm": 0.43090009689331055, + "learning_rate": 0.00012436974789915966, + "loss": 0.3958, + "step": 31374 + }, + { + "epoch": 17.527932960893853, + "grad_norm": 0.9563839435577393, + "learning_rate": 0.00012434173669467787, + "loss": 0.3905, + "step": 31375 + }, + { + "epoch": 17.528491620111733, + "grad_norm": 0.43215587735176086, + "learning_rate": 0.00012431372549019608, + "loss": 0.425, + "step": 31376 + }, + { + "epoch": 17.52905027932961, + "grad_norm": 0.354337602853775, + "learning_rate": 0.00012428571428571428, + "loss": 0.4403, + "step": 31377 + }, + { + "epoch": 17.529608938547486, + "grad_norm": 4.78610897064209, + "learning_rate": 0.0001242577030812325, + "loss": 0.3937, + "step": 31378 + }, + { + "epoch": 17.530167597765363, + "grad_norm": 1.036870002746582, + "learning_rate": 0.0001242296918767507, + "loss": 0.3553, + "step": 31379 + }, + { + "epoch": 17.53072625698324, + "grad_norm": 0.7375327348709106, + "learning_rate": 0.0001242016806722689, + "loss": 0.3827, + "step": 31380 + }, + { + "epoch": 17.531284916201116, + "grad_norm": 0.33264487981796265, + "learning_rate": 0.0001241736694677871, + "loss": 0.3312, + "step": 31381 + }, + { + "epoch": 17.531843575418993, + "grad_norm": 0.41684606671333313, + "learning_rate": 0.00012414565826330534, + "loss": 0.3981, + "step": 31382 + }, + { + "epoch": 17.532402234636873, + "grad_norm": 0.3767816126346588, + "learning_rate": 0.00012411764705882352, + "loss": 0.412, + "step": 31383 + }, + { + "epoch": 17.53296089385475, + "grad_norm": 0.40415525436401367, + "learning_rate": 0.00012408963585434175, + "loss": 0.3904, + "step": 31384 + }, + { + "epoch": 17.533519553072626, + "grad_norm": 0.4591088593006134, + "learning_rate": 0.00012406162464985993, + "loss": 0.3354, + "step": 31385 + }, + { + "epoch": 17.534078212290503, + "grad_norm": 0.41783255338668823, + "learning_rate": 0.00012403361344537816, + "loss": 0.5111, + "step": 31386 + }, + { + "epoch": 17.53463687150838, + "grad_norm": 0.38515251874923706, + "learning_rate": 0.00012400560224089637, + "loss": 0.3786, + "step": 31387 + }, + { + "epoch": 17.535195530726256, + "grad_norm": 0.3406234681606293, + "learning_rate": 0.00012397759103641458, + "loss": 0.3904, + "step": 31388 + }, + { + "epoch": 17.535754189944136, + "grad_norm": 0.3470050096511841, + "learning_rate": 0.00012394957983193278, + "loss": 0.4298, + "step": 31389 + }, + { + "epoch": 17.536312849162012, + "grad_norm": 0.4810034930706024, + "learning_rate": 0.000123921568627451, + "loss": 0.3771, + "step": 31390 + }, + { + "epoch": 17.53687150837989, + "grad_norm": 0.5584728717803955, + "learning_rate": 0.0001238935574229692, + "loss": 0.3874, + "step": 31391 + }, + { + "epoch": 17.537430167597766, + "grad_norm": 0.5674973726272583, + "learning_rate": 0.0001238655462184874, + "loss": 0.3095, + "step": 31392 + }, + { + "epoch": 17.537988826815642, + "grad_norm": 0.4328550696372986, + "learning_rate": 0.0001238375350140056, + "loss": 0.3253, + "step": 31393 + }, + { + "epoch": 17.53854748603352, + "grad_norm": 0.4130370318889618, + "learning_rate": 0.0001238095238095238, + "loss": 0.3788, + "step": 31394 + }, + { + "epoch": 17.539106145251395, + "grad_norm": 0.4716345965862274, + "learning_rate": 0.00012378151260504202, + "loss": 0.4596, + "step": 31395 + }, + { + "epoch": 17.539664804469275, + "grad_norm": 1.815059781074524, + "learning_rate": 0.00012375350140056022, + "loss": 0.4453, + "step": 31396 + }, + { + "epoch": 17.540223463687152, + "grad_norm": 0.28682801127433777, + "learning_rate": 0.00012372549019607843, + "loss": 0.3091, + "step": 31397 + }, + { + "epoch": 17.54078212290503, + "grad_norm": 0.4455854892730713, + "learning_rate": 0.00012369747899159664, + "loss": 0.3273, + "step": 31398 + }, + { + "epoch": 17.541340782122905, + "grad_norm": 0.7888973355293274, + "learning_rate": 0.00012366946778711484, + "loss": 0.4247, + "step": 31399 + }, + { + "epoch": 17.54189944134078, + "grad_norm": 0.543374240398407, + "learning_rate": 0.00012364145658263305, + "loss": 0.4549, + "step": 31400 + }, + { + "epoch": 17.542458100558658, + "grad_norm": 0.7094858288764954, + "learning_rate": 0.00012361344537815125, + "loss": 0.388, + "step": 31401 + }, + { + "epoch": 17.543016759776535, + "grad_norm": 0.4381362199783325, + "learning_rate": 0.00012358543417366946, + "loss": 0.3872, + "step": 31402 + }, + { + "epoch": 17.543575418994415, + "grad_norm": 0.4672721028327942, + "learning_rate": 0.00012355742296918767, + "loss": 0.5157, + "step": 31403 + }, + { + "epoch": 17.54413407821229, + "grad_norm": 0.6241531372070312, + "learning_rate": 0.0001235294117647059, + "loss": 0.4097, + "step": 31404 + }, + { + "epoch": 17.544692737430168, + "grad_norm": 1.5492377281188965, + "learning_rate": 0.00012350140056022408, + "loss": 0.4101, + "step": 31405 + }, + { + "epoch": 17.545251396648045, + "grad_norm": 0.3204233646392822, + "learning_rate": 0.0001234733893557423, + "loss": 0.3426, + "step": 31406 + }, + { + "epoch": 17.54581005586592, + "grad_norm": 0.7466712594032288, + "learning_rate": 0.0001234453781512605, + "loss": 0.5604, + "step": 31407 + }, + { + "epoch": 17.546368715083798, + "grad_norm": 0.673948347568512, + "learning_rate": 0.00012341736694677872, + "loss": 0.3366, + "step": 31408 + }, + { + "epoch": 17.546927374301674, + "grad_norm": 0.3771499693393707, + "learning_rate": 0.00012338935574229693, + "loss": 0.3738, + "step": 31409 + }, + { + "epoch": 17.547486033519554, + "grad_norm": 5.109767436981201, + "learning_rate": 0.00012336134453781513, + "loss": 0.415, + "step": 31410 + }, + { + "epoch": 17.54804469273743, + "grad_norm": 0.4377894103527069, + "learning_rate": 0.00012333333333333334, + "loss": 0.4644, + "step": 31411 + }, + { + "epoch": 17.548603351955308, + "grad_norm": 0.4229608178138733, + "learning_rate": 0.00012330532212885155, + "loss": 0.3884, + "step": 31412 + }, + { + "epoch": 17.549162011173184, + "grad_norm": 0.43517762422561646, + "learning_rate": 0.00012327731092436975, + "loss": 0.384, + "step": 31413 + }, + { + "epoch": 17.54972067039106, + "grad_norm": 0.4626273810863495, + "learning_rate": 0.00012324929971988796, + "loss": 0.3851, + "step": 31414 + }, + { + "epoch": 17.550279329608937, + "grad_norm": 0.3298066258430481, + "learning_rate": 0.00012322128851540616, + "loss": 0.363, + "step": 31415 + }, + { + "epoch": 17.550837988826817, + "grad_norm": 4.576408863067627, + "learning_rate": 0.00012319327731092437, + "loss": 0.4781, + "step": 31416 + }, + { + "epoch": 17.551396648044694, + "grad_norm": 0.9707555174827576, + "learning_rate": 0.00012316526610644258, + "loss": 0.4127, + "step": 31417 + }, + { + "epoch": 17.55195530726257, + "grad_norm": 0.34349972009658813, + "learning_rate": 0.00012313725490196078, + "loss": 0.3346, + "step": 31418 + }, + { + "epoch": 17.552513966480447, + "grad_norm": 4.508553981781006, + "learning_rate": 0.000123109243697479, + "loss": 0.4511, + "step": 31419 + }, + { + "epoch": 17.553072625698324, + "grad_norm": 0.66888028383255, + "learning_rate": 0.0001230812324929972, + "loss": 0.3887, + "step": 31420 + }, + { + "epoch": 17.5536312849162, + "grad_norm": 0.3732622265815735, + "learning_rate": 0.0001230532212885154, + "loss": 0.3929, + "step": 31421 + }, + { + "epoch": 17.554189944134077, + "grad_norm": 0.5522032976150513, + "learning_rate": 0.0001230252100840336, + "loss": 0.4275, + "step": 31422 + }, + { + "epoch": 17.554748603351957, + "grad_norm": 0.43784770369529724, + "learning_rate": 0.0001229971988795518, + "loss": 0.4578, + "step": 31423 + }, + { + "epoch": 17.555307262569833, + "grad_norm": 0.8571161031723022, + "learning_rate": 0.00012296918767507002, + "loss": 0.4625, + "step": 31424 + }, + { + "epoch": 17.55586592178771, + "grad_norm": 0.3722347617149353, + "learning_rate": 0.00012294117647058822, + "loss": 0.3602, + "step": 31425 + }, + { + "epoch": 17.556424581005587, + "grad_norm": 0.6145003437995911, + "learning_rate": 0.00012291316526610646, + "loss": 0.4427, + "step": 31426 + }, + { + "epoch": 17.556983240223463, + "grad_norm": 2.3805315494537354, + "learning_rate": 0.00012288515406162464, + "loss": 0.446, + "step": 31427 + }, + { + "epoch": 17.55754189944134, + "grad_norm": 0.40157830715179443, + "learning_rate": 0.00012285714285714287, + "loss": 0.3836, + "step": 31428 + }, + { + "epoch": 17.558100558659216, + "grad_norm": 0.45903003215789795, + "learning_rate": 0.00012282913165266105, + "loss": 0.3689, + "step": 31429 + }, + { + "epoch": 17.558659217877096, + "grad_norm": 0.7679571509361267, + "learning_rate": 0.00012280112044817928, + "loss": 0.2849, + "step": 31430 + }, + { + "epoch": 17.559217877094973, + "grad_norm": 0.508935809135437, + "learning_rate": 0.0001227731092436975, + "loss": 0.3485, + "step": 31431 + }, + { + "epoch": 17.55977653631285, + "grad_norm": 0.3794126808643341, + "learning_rate": 0.0001227450980392157, + "loss": 0.308, + "step": 31432 + }, + { + "epoch": 17.560335195530726, + "grad_norm": 0.5495390892028809, + "learning_rate": 0.0001227170868347339, + "loss": 0.605, + "step": 31433 + }, + { + "epoch": 17.560893854748603, + "grad_norm": 0.665381908416748, + "learning_rate": 0.0001226890756302521, + "loss": 0.5278, + "step": 31434 + }, + { + "epoch": 17.56145251396648, + "grad_norm": 0.6173123121261597, + "learning_rate": 0.0001226610644257703, + "loss": 0.3187, + "step": 31435 + }, + { + "epoch": 17.56201117318436, + "grad_norm": 3.169093608856201, + "learning_rate": 0.00012263305322128852, + "loss": 0.4349, + "step": 31436 + }, + { + "epoch": 17.562569832402236, + "grad_norm": 0.4877188205718994, + "learning_rate": 0.00012260504201680672, + "loss": 0.4138, + "step": 31437 + }, + { + "epoch": 17.563128491620112, + "grad_norm": 0.3999217450618744, + "learning_rate": 0.00012257703081232493, + "loss": 0.3624, + "step": 31438 + }, + { + "epoch": 17.56368715083799, + "grad_norm": 0.4917362928390503, + "learning_rate": 0.00012254901960784314, + "loss": 0.4646, + "step": 31439 + }, + { + "epoch": 17.564245810055866, + "grad_norm": 0.39735132455825806, + "learning_rate": 0.00012252100840336134, + "loss": 0.3664, + "step": 31440 + }, + { + "epoch": 17.564804469273742, + "grad_norm": 0.4285474419593811, + "learning_rate": 0.00012249299719887955, + "loss": 0.44, + "step": 31441 + }, + { + "epoch": 17.56536312849162, + "grad_norm": 0.45227164030075073, + "learning_rate": 0.00012246498599439775, + "loss": 0.3774, + "step": 31442 + }, + { + "epoch": 17.5659217877095, + "grad_norm": 0.3510522246360779, + "learning_rate": 0.00012243697478991596, + "loss": 0.2889, + "step": 31443 + }, + { + "epoch": 17.566480446927375, + "grad_norm": 0.41908979415893555, + "learning_rate": 0.00012240896358543417, + "loss": 0.4113, + "step": 31444 + }, + { + "epoch": 17.567039106145252, + "grad_norm": 0.5059147477149963, + "learning_rate": 0.00012238095238095237, + "loss": 0.4623, + "step": 31445 + }, + { + "epoch": 17.56759776536313, + "grad_norm": 1.465397834777832, + "learning_rate": 0.0001223529411764706, + "loss": 0.3556, + "step": 31446 + }, + { + "epoch": 17.568156424581005, + "grad_norm": 0.4680210053920746, + "learning_rate": 0.00012232492997198878, + "loss": 0.4147, + "step": 31447 + }, + { + "epoch": 17.56871508379888, + "grad_norm": 0.38897278904914856, + "learning_rate": 0.00012229691876750702, + "loss": 0.404, + "step": 31448 + }, + { + "epoch": 17.56927374301676, + "grad_norm": 0.4207116961479187, + "learning_rate": 0.0001222689075630252, + "loss": 0.5453, + "step": 31449 + }, + { + "epoch": 17.56983240223464, + "grad_norm": 1.5752358436584473, + "learning_rate": 0.00012224089635854343, + "loss": 0.3971, + "step": 31450 + }, + { + "epoch": 17.570391061452515, + "grad_norm": 0.41147202253341675, + "learning_rate": 0.0001222128851540616, + "loss": 0.2843, + "step": 31451 + }, + { + "epoch": 17.57094972067039, + "grad_norm": 0.6159135699272156, + "learning_rate": 0.00012218487394957984, + "loss": 0.4589, + "step": 31452 + }, + { + "epoch": 17.571508379888268, + "grad_norm": 0.5537623167037964, + "learning_rate": 0.00012215686274509805, + "loss": 0.454, + "step": 31453 + }, + { + "epoch": 17.572067039106145, + "grad_norm": 0.4944468140602112, + "learning_rate": 0.00012212885154061625, + "loss": 0.4485, + "step": 31454 + }, + { + "epoch": 17.57262569832402, + "grad_norm": 0.40874210000038147, + "learning_rate": 0.00012210084033613446, + "loss": 0.3675, + "step": 31455 + }, + { + "epoch": 17.573184357541898, + "grad_norm": 0.4066227078437805, + "learning_rate": 0.00012207282913165266, + "loss": 0.3506, + "step": 31456 + }, + { + "epoch": 17.573743016759778, + "grad_norm": 1.199589490890503, + "learning_rate": 0.00012204481792717087, + "loss": 0.3576, + "step": 31457 + }, + { + "epoch": 17.574301675977654, + "grad_norm": 0.38021135330200195, + "learning_rate": 0.00012201680672268908, + "loss": 0.3642, + "step": 31458 + }, + { + "epoch": 17.57486033519553, + "grad_norm": 1.409810185432434, + "learning_rate": 0.00012198879551820728, + "loss": 0.3535, + "step": 31459 + }, + { + "epoch": 17.575418994413408, + "grad_norm": 0.5215467810630798, + "learning_rate": 0.00012196078431372549, + "loss": 0.4374, + "step": 31460 + }, + { + "epoch": 17.575977653631284, + "grad_norm": 0.5941303968429565, + "learning_rate": 0.0001219327731092437, + "loss": 0.4692, + "step": 31461 + }, + { + "epoch": 17.57653631284916, + "grad_norm": 0.3887898921966553, + "learning_rate": 0.0001219047619047619, + "loss": 0.347, + "step": 31462 + }, + { + "epoch": 17.577094972067037, + "grad_norm": 0.3999468684196472, + "learning_rate": 0.00012187675070028012, + "loss": 0.3779, + "step": 31463 + }, + { + "epoch": 17.577653631284917, + "grad_norm": 0.4554681181907654, + "learning_rate": 0.00012184873949579831, + "loss": 0.5051, + "step": 31464 + }, + { + "epoch": 17.578212290502794, + "grad_norm": 0.33645960688591003, + "learning_rate": 0.00012182072829131653, + "loss": 0.3762, + "step": 31465 + }, + { + "epoch": 17.57877094972067, + "grad_norm": 0.444515198469162, + "learning_rate": 0.00012179271708683472, + "loss": 0.3362, + "step": 31466 + }, + { + "epoch": 17.579329608938547, + "grad_norm": 0.3829556703567505, + "learning_rate": 0.00012176470588235294, + "loss": 0.427, + "step": 31467 + }, + { + "epoch": 17.579888268156424, + "grad_norm": 0.4609065055847168, + "learning_rate": 0.00012173669467787116, + "loss": 0.3865, + "step": 31468 + }, + { + "epoch": 17.5804469273743, + "grad_norm": 0.4460180699825287, + "learning_rate": 0.00012170868347338936, + "loss": 0.3777, + "step": 31469 + }, + { + "epoch": 17.58100558659218, + "grad_norm": 0.5300408601760864, + "learning_rate": 0.00012168067226890758, + "loss": 0.4202, + "step": 31470 + }, + { + "epoch": 17.581564245810057, + "grad_norm": 0.37787437438964844, + "learning_rate": 0.00012165266106442577, + "loss": 0.4468, + "step": 31471 + }, + { + "epoch": 17.582122905027934, + "grad_norm": 0.3655270040035248, + "learning_rate": 0.00012162464985994399, + "loss": 0.3407, + "step": 31472 + }, + { + "epoch": 17.58268156424581, + "grad_norm": 0.5844898223876953, + "learning_rate": 0.0001215966386554622, + "loss": 0.6882, + "step": 31473 + }, + { + "epoch": 17.583240223463687, + "grad_norm": 0.879111647605896, + "learning_rate": 0.0001215686274509804, + "loss": 0.3737, + "step": 31474 + }, + { + "epoch": 17.583798882681563, + "grad_norm": 1.4070534706115723, + "learning_rate": 0.0001215406162464986, + "loss": 0.394, + "step": 31475 + }, + { + "epoch": 17.58435754189944, + "grad_norm": 0.5160382986068726, + "learning_rate": 0.00012151260504201681, + "loss": 0.5746, + "step": 31476 + }, + { + "epoch": 17.58491620111732, + "grad_norm": 0.5294186472892761, + "learning_rate": 0.00012148459383753502, + "loss": 0.5147, + "step": 31477 + }, + { + "epoch": 17.585474860335196, + "grad_norm": 0.37465882301330566, + "learning_rate": 0.00012145658263305322, + "loss": 0.2895, + "step": 31478 + }, + { + "epoch": 17.586033519553073, + "grad_norm": 3.461212158203125, + "learning_rate": 0.00012142857142857143, + "loss": 0.4826, + "step": 31479 + }, + { + "epoch": 17.58659217877095, + "grad_norm": 0.5687753558158875, + "learning_rate": 0.00012140056022408965, + "loss": 0.5353, + "step": 31480 + }, + { + "epoch": 17.587150837988826, + "grad_norm": 0.6214829683303833, + "learning_rate": 0.00012137254901960784, + "loss": 0.384, + "step": 31481 + }, + { + "epoch": 17.587709497206703, + "grad_norm": 0.4134661555290222, + "learning_rate": 0.00012134453781512606, + "loss": 0.3514, + "step": 31482 + }, + { + "epoch": 17.58826815642458, + "grad_norm": 4.23690938949585, + "learning_rate": 0.00012131652661064425, + "loss": 0.3824, + "step": 31483 + }, + { + "epoch": 17.58882681564246, + "grad_norm": 0.4844209551811218, + "learning_rate": 0.00012128851540616247, + "loss": 0.416, + "step": 31484 + }, + { + "epoch": 17.589385474860336, + "grad_norm": 0.4660329520702362, + "learning_rate": 0.00012126050420168068, + "loss": 0.5046, + "step": 31485 + }, + { + "epoch": 17.589944134078213, + "grad_norm": 0.4786379039287567, + "learning_rate": 0.00012123249299719889, + "loss": 0.4764, + "step": 31486 + }, + { + "epoch": 17.59050279329609, + "grad_norm": 1.036396861076355, + "learning_rate": 0.00012120448179271709, + "loss": 0.8822, + "step": 31487 + }, + { + "epoch": 17.591061452513966, + "grad_norm": 0.8079026937484741, + "learning_rate": 0.0001211764705882353, + "loss": 0.3213, + "step": 31488 + }, + { + "epoch": 17.591620111731842, + "grad_norm": 0.44388240575790405, + "learning_rate": 0.0001211484593837535, + "loss": 0.4536, + "step": 31489 + }, + { + "epoch": 17.592178770949722, + "grad_norm": 0.45528748631477356, + "learning_rate": 0.00012112044817927172, + "loss": 0.5471, + "step": 31490 + }, + { + "epoch": 17.5927374301676, + "grad_norm": 0.459954172372818, + "learning_rate": 0.00012109243697478992, + "loss": 0.3513, + "step": 31491 + }, + { + "epoch": 17.593296089385476, + "grad_norm": 0.5678022503852844, + "learning_rate": 0.00012106442577030813, + "loss": 0.3945, + "step": 31492 + }, + { + "epoch": 17.593854748603352, + "grad_norm": 1.2420063018798828, + "learning_rate": 0.00012103641456582633, + "loss": 0.5049, + "step": 31493 + }, + { + "epoch": 17.59441340782123, + "grad_norm": 0.4424779415130615, + "learning_rate": 0.00012100840336134455, + "loss": 0.31, + "step": 31494 + }, + { + "epoch": 17.594972067039105, + "grad_norm": 0.582234263420105, + "learning_rate": 0.00012098039215686275, + "loss": 0.4144, + "step": 31495 + }, + { + "epoch": 17.595530726256982, + "grad_norm": 0.6069568991661072, + "learning_rate": 0.00012095238095238096, + "loss": 0.3325, + "step": 31496 + }, + { + "epoch": 17.596089385474862, + "grad_norm": 0.3336666226387024, + "learning_rate": 0.00012092436974789916, + "loss": 0.4106, + "step": 31497 + }, + { + "epoch": 17.59664804469274, + "grad_norm": 0.5108823776245117, + "learning_rate": 0.00012089635854341737, + "loss": 0.3966, + "step": 31498 + }, + { + "epoch": 17.597206703910615, + "grad_norm": 0.5811411738395691, + "learning_rate": 0.00012086834733893558, + "loss": 0.4283, + "step": 31499 + }, + { + "epoch": 17.59776536312849, + "grad_norm": 2.5782835483551025, + "learning_rate": 0.0001208403361344538, + "loss": 0.4039, + "step": 31500 + }, + { + "epoch": 17.59776536312849, + "eval_cer": 0.08528907946275627, + "eval_loss": 0.31996604800224304, + "eval_runtime": 55.7055, + "eval_samples_per_second": 81.464, + "eval_steps_per_second": 5.098, + "eval_wer": 0.33864975838663725, + "step": 31500 + }, + { + "epoch": 17.598324022346368, + "grad_norm": 0.524627685546875, + "learning_rate": 0.00012081232492997199, + "loss": 0.3661, + "step": 31501 + }, + { + "epoch": 17.598882681564245, + "grad_norm": 0.401043564081192, + "learning_rate": 0.00012078431372549021, + "loss": 0.4058, + "step": 31502 + }, + { + "epoch": 17.59944134078212, + "grad_norm": 0.9795319437980652, + "learning_rate": 0.0001207563025210084, + "loss": 0.4405, + "step": 31503 + }, + { + "epoch": 17.6, + "grad_norm": 0.3942526876926422, + "learning_rate": 0.00012072829131652662, + "loss": 0.3337, + "step": 31504 + }, + { + "epoch": 17.600558659217878, + "grad_norm": 0.548869252204895, + "learning_rate": 0.00012070028011204481, + "loss": 0.4324, + "step": 31505 + }, + { + "epoch": 17.601117318435755, + "grad_norm": 0.5122600197792053, + "learning_rate": 0.00012067226890756303, + "loss": 0.5191, + "step": 31506 + }, + { + "epoch": 17.60167597765363, + "grad_norm": 0.3822869658470154, + "learning_rate": 0.00012064425770308124, + "loss": 0.3683, + "step": 31507 + }, + { + "epoch": 17.602234636871508, + "grad_norm": 0.3101330101490021, + "learning_rate": 0.00012061624649859944, + "loss": 0.3689, + "step": 31508 + }, + { + "epoch": 17.602793296089384, + "grad_norm": 0.4185240864753723, + "learning_rate": 0.00012058823529411765, + "loss": 0.4623, + "step": 31509 + }, + { + "epoch": 17.60335195530726, + "grad_norm": 0.42487871646881104, + "learning_rate": 0.00012056022408963586, + "loss": 0.4076, + "step": 31510 + }, + { + "epoch": 17.60391061452514, + "grad_norm": 0.5407841801643372, + "learning_rate": 0.00012053221288515406, + "loss": 0.3103, + "step": 31511 + }, + { + "epoch": 17.604469273743018, + "grad_norm": 0.6585702300071716, + "learning_rate": 0.00012050420168067228, + "loss": 0.3808, + "step": 31512 + }, + { + "epoch": 17.605027932960894, + "grad_norm": 0.3549375832080841, + "learning_rate": 0.00012047619047619047, + "loss": 0.3329, + "step": 31513 + }, + { + "epoch": 17.60558659217877, + "grad_norm": 0.42086732387542725, + "learning_rate": 0.0001204481792717087, + "loss": 0.3541, + "step": 31514 + }, + { + "epoch": 17.606145251396647, + "grad_norm": 0.6354426741600037, + "learning_rate": 0.00012042016806722689, + "loss": 0.4, + "step": 31515 + }, + { + "epoch": 17.606703910614524, + "grad_norm": 6.595485210418701, + "learning_rate": 0.0001203921568627451, + "loss": 0.5049, + "step": 31516 + }, + { + "epoch": 17.607262569832404, + "grad_norm": 0.42092061042785645, + "learning_rate": 0.00012036414565826331, + "loss": 0.4261, + "step": 31517 + }, + { + "epoch": 17.60782122905028, + "grad_norm": 0.46468386054039, + "learning_rate": 0.00012033613445378152, + "loss": 0.2787, + "step": 31518 + }, + { + "epoch": 17.608379888268157, + "grad_norm": 0.5346707105636597, + "learning_rate": 0.00012030812324929972, + "loss": 0.2795, + "step": 31519 + }, + { + "epoch": 17.608938547486034, + "grad_norm": 6.596825122833252, + "learning_rate": 0.00012028011204481793, + "loss": 0.4227, + "step": 31520 + }, + { + "epoch": 17.60949720670391, + "grad_norm": 0.412166029214859, + "learning_rate": 0.00012025210084033614, + "loss": 0.4157, + "step": 31521 + }, + { + "epoch": 17.610055865921787, + "grad_norm": 0.6900970935821533, + "learning_rate": 0.00012022408963585436, + "loss": 0.4507, + "step": 31522 + }, + { + "epoch": 17.610614525139663, + "grad_norm": 1.7597662210464478, + "learning_rate": 0.00012019607843137255, + "loss": 0.3885, + "step": 31523 + }, + { + "epoch": 17.611173184357543, + "grad_norm": 0.80777907371521, + "learning_rate": 0.00012016806722689077, + "loss": 0.4676, + "step": 31524 + }, + { + "epoch": 17.61173184357542, + "grad_norm": 0.42634645104408264, + "learning_rate": 0.00012014005602240896, + "loss": 0.4036, + "step": 31525 + }, + { + "epoch": 17.612290502793297, + "grad_norm": 2.0557897090911865, + "learning_rate": 0.00012011204481792718, + "loss": 0.4193, + "step": 31526 + }, + { + "epoch": 17.612849162011173, + "grad_norm": 0.5671976804733276, + "learning_rate": 0.00012008403361344539, + "loss": 0.4688, + "step": 31527 + }, + { + "epoch": 17.61340782122905, + "grad_norm": 0.6261650919914246, + "learning_rate": 0.00012005602240896359, + "loss": 0.3173, + "step": 31528 + }, + { + "epoch": 17.613966480446926, + "grad_norm": 0.4165620803833008, + "learning_rate": 0.0001200280112044818, + "loss": 0.4342, + "step": 31529 + }, + { + "epoch": 17.614525139664803, + "grad_norm": 0.460334450006485, + "learning_rate": 0.00012, + "loss": 0.3746, + "step": 31530 + }, + { + "epoch": 17.615083798882683, + "grad_norm": 1.6202187538146973, + "learning_rate": 0.00011997198879551821, + "loss": 0.6428, + "step": 31531 + }, + { + "epoch": 17.61564245810056, + "grad_norm": 0.5665677785873413, + "learning_rate": 0.00011994397759103643, + "loss": 0.2907, + "step": 31532 + }, + { + "epoch": 17.616201117318436, + "grad_norm": 0.39944425225257874, + "learning_rate": 0.00011991596638655462, + "loss": 0.3787, + "step": 31533 + }, + { + "epoch": 17.616759776536313, + "grad_norm": 0.562082827091217, + "learning_rate": 0.00011988795518207284, + "loss": 0.3582, + "step": 31534 + }, + { + "epoch": 17.61731843575419, + "grad_norm": 0.4276149868965149, + "learning_rate": 0.00011985994397759103, + "loss": 0.4732, + "step": 31535 + }, + { + "epoch": 17.617877094972066, + "grad_norm": 0.6590036153793335, + "learning_rate": 0.00011983193277310925, + "loss": 0.3843, + "step": 31536 + }, + { + "epoch": 17.618435754189946, + "grad_norm": 0.5104238390922546, + "learning_rate": 0.00011980392156862745, + "loss": 0.5287, + "step": 31537 + }, + { + "epoch": 17.618994413407822, + "grad_norm": 0.33312758803367615, + "learning_rate": 0.00011977591036414566, + "loss": 0.3356, + "step": 31538 + }, + { + "epoch": 17.6195530726257, + "grad_norm": 0.6793311238288879, + "learning_rate": 0.00011974789915966387, + "loss": 0.4423, + "step": 31539 + }, + { + "epoch": 17.620111731843576, + "grad_norm": 0.3907807171344757, + "learning_rate": 0.00011971988795518208, + "loss": 0.4378, + "step": 31540 + }, + { + "epoch": 17.620670391061452, + "grad_norm": 0.5622068643569946, + "learning_rate": 0.00011969187675070028, + "loss": 0.4348, + "step": 31541 + }, + { + "epoch": 17.62122905027933, + "grad_norm": 0.5129549503326416, + "learning_rate": 0.00011966386554621849, + "loss": 0.4897, + "step": 31542 + }, + { + "epoch": 17.621787709497205, + "grad_norm": 0.35348406434059143, + "learning_rate": 0.0001196358543417367, + "loss": 0.4355, + "step": 31543 + }, + { + "epoch": 17.622346368715085, + "grad_norm": 0.4193996489048004, + "learning_rate": 0.00011960784313725491, + "loss": 0.3968, + "step": 31544 + }, + { + "epoch": 17.622905027932962, + "grad_norm": 0.5911250114440918, + "learning_rate": 0.0001195798319327731, + "loss": 0.4094, + "step": 31545 + }, + { + "epoch": 17.62346368715084, + "grad_norm": 0.8757883906364441, + "learning_rate": 0.00011955182072829133, + "loss": 0.3381, + "step": 31546 + }, + { + "epoch": 17.624022346368715, + "grad_norm": 0.6150679588317871, + "learning_rate": 0.00011952380952380952, + "loss": 0.4874, + "step": 31547 + }, + { + "epoch": 17.62458100558659, + "grad_norm": 0.4737125635147095, + "learning_rate": 0.00011949579831932774, + "loss": 0.4128, + "step": 31548 + }, + { + "epoch": 17.62513966480447, + "grad_norm": 1.7527192831039429, + "learning_rate": 0.00011946778711484594, + "loss": 0.3142, + "step": 31549 + }, + { + "epoch": 17.625698324022345, + "grad_norm": 0.4082525074481964, + "learning_rate": 0.00011943977591036415, + "loss": 0.4127, + "step": 31550 + }, + { + "epoch": 17.626256983240225, + "grad_norm": 0.46964824199676514, + "learning_rate": 0.00011941176470588236, + "loss": 0.4541, + "step": 31551 + }, + { + "epoch": 17.6268156424581, + "grad_norm": 0.40015947818756104, + "learning_rate": 0.00011938375350140056, + "loss": 0.3936, + "step": 31552 + }, + { + "epoch": 17.627374301675978, + "grad_norm": 0.6811219453811646, + "learning_rate": 0.00011935574229691877, + "loss": 0.4426, + "step": 31553 + }, + { + "epoch": 17.627932960893855, + "grad_norm": 0.42463311553001404, + "learning_rate": 0.00011932773109243699, + "loss": 0.3125, + "step": 31554 + }, + { + "epoch": 17.62849162011173, + "grad_norm": 0.44518330693244934, + "learning_rate": 0.00011929971988795518, + "loss": 0.2777, + "step": 31555 + }, + { + "epoch": 17.629050279329608, + "grad_norm": 0.3771800100803375, + "learning_rate": 0.0001192717086834734, + "loss": 0.377, + "step": 31556 + }, + { + "epoch": 17.629608938547484, + "grad_norm": 0.4709591567516327, + "learning_rate": 0.00011924369747899159, + "loss": 0.4397, + "step": 31557 + }, + { + "epoch": 17.630167597765364, + "grad_norm": 0.4042418897151947, + "learning_rate": 0.00011921568627450981, + "loss": 0.3838, + "step": 31558 + }, + { + "epoch": 17.63072625698324, + "grad_norm": 0.4670283794403076, + "learning_rate": 0.00011918767507002802, + "loss": 0.3882, + "step": 31559 + }, + { + "epoch": 17.631284916201118, + "grad_norm": 0.6441609859466553, + "learning_rate": 0.00011915966386554622, + "loss": 0.504, + "step": 31560 + }, + { + "epoch": 17.631843575418994, + "grad_norm": 0.8895352482795715, + "learning_rate": 0.00011913165266106443, + "loss": 0.451, + "step": 31561 + }, + { + "epoch": 17.63240223463687, + "grad_norm": 0.35737344622612, + "learning_rate": 0.00011910364145658264, + "loss": 0.4517, + "step": 31562 + }, + { + "epoch": 17.632960893854747, + "grad_norm": 0.36211442947387695, + "learning_rate": 0.00011907563025210084, + "loss": 0.3633, + "step": 31563 + }, + { + "epoch": 17.633519553072627, + "grad_norm": 0.4149252474308014, + "learning_rate": 0.00011904761904761905, + "loss": 0.3364, + "step": 31564 + }, + { + "epoch": 17.634078212290504, + "grad_norm": 0.29160770773887634, + "learning_rate": 0.00011901960784313725, + "loss": 0.3008, + "step": 31565 + }, + { + "epoch": 17.63463687150838, + "grad_norm": 0.35701847076416016, + "learning_rate": 0.00011899159663865547, + "loss": 0.3682, + "step": 31566 + }, + { + "epoch": 17.635195530726257, + "grad_norm": 0.4197212755680084, + "learning_rate": 0.00011896358543417367, + "loss": 0.3595, + "step": 31567 + }, + { + "epoch": 17.635754189944134, + "grad_norm": 0.37884974479675293, + "learning_rate": 0.00011893557422969189, + "loss": 0.4083, + "step": 31568 + }, + { + "epoch": 17.63631284916201, + "grad_norm": 0.5068607330322266, + "learning_rate": 0.00011890756302521008, + "loss": 0.4852, + "step": 31569 + }, + { + "epoch": 17.636871508379887, + "grad_norm": 0.49007448554039, + "learning_rate": 0.0001188795518207283, + "loss": 0.3863, + "step": 31570 + }, + { + "epoch": 17.637430167597767, + "grad_norm": 0.4585886597633362, + "learning_rate": 0.0001188515406162465, + "loss": 0.4439, + "step": 31571 + }, + { + "epoch": 17.637988826815644, + "grad_norm": 0.3608410358428955, + "learning_rate": 0.00011882352941176471, + "loss": 0.3463, + "step": 31572 + }, + { + "epoch": 17.63854748603352, + "grad_norm": 0.3823234438896179, + "learning_rate": 0.00011879551820728292, + "loss": 0.3546, + "step": 31573 + }, + { + "epoch": 17.639106145251397, + "grad_norm": 1.4093756675720215, + "learning_rate": 0.00011876750700280112, + "loss": 0.3833, + "step": 31574 + }, + { + "epoch": 17.639664804469273, + "grad_norm": 0.42624631524086, + "learning_rate": 0.00011873949579831933, + "loss": 0.3542, + "step": 31575 + }, + { + "epoch": 17.64022346368715, + "grad_norm": 0.4674459993839264, + "learning_rate": 0.00011871148459383755, + "loss": 0.3508, + "step": 31576 + }, + { + "epoch": 17.640782122905026, + "grad_norm": 0.47147318720817566, + "learning_rate": 0.00011868347338935574, + "loss": 0.367, + "step": 31577 + }, + { + "epoch": 17.641340782122906, + "grad_norm": 0.45162442326545715, + "learning_rate": 0.00011865546218487396, + "loss": 0.4238, + "step": 31578 + }, + { + "epoch": 17.641899441340783, + "grad_norm": 0.6584606170654297, + "learning_rate": 0.00011862745098039215, + "loss": 0.4001, + "step": 31579 + }, + { + "epoch": 17.64245810055866, + "grad_norm": 1.55841863155365, + "learning_rate": 0.00011859943977591037, + "loss": 0.3405, + "step": 31580 + }, + { + "epoch": 17.643016759776536, + "grad_norm": 0.41311919689178467, + "learning_rate": 0.00011857142857142858, + "loss": 0.4435, + "step": 31581 + }, + { + "epoch": 17.643575418994413, + "grad_norm": 0.40032321214675903, + "learning_rate": 0.00011854341736694678, + "loss": 0.3566, + "step": 31582 + }, + { + "epoch": 17.64413407821229, + "grad_norm": 0.7496161460876465, + "learning_rate": 0.00011851540616246499, + "loss": 0.4416, + "step": 31583 + }, + { + "epoch": 17.64469273743017, + "grad_norm": 0.3911203444004059, + "learning_rate": 0.0001184873949579832, + "loss": 0.4098, + "step": 31584 + }, + { + "epoch": 17.645251396648046, + "grad_norm": 0.9852616190910339, + "learning_rate": 0.0001184593837535014, + "loss": 0.3722, + "step": 31585 + }, + { + "epoch": 17.645810055865923, + "grad_norm": 0.7721287608146667, + "learning_rate": 0.00011843137254901962, + "loss": 0.3411, + "step": 31586 + }, + { + "epoch": 17.6463687150838, + "grad_norm": 1.2441833019256592, + "learning_rate": 0.00011840336134453781, + "loss": 0.3388, + "step": 31587 + }, + { + "epoch": 17.646927374301676, + "grad_norm": 0.5143932700157166, + "learning_rate": 0.00011837535014005603, + "loss": 0.4098, + "step": 31588 + }, + { + "epoch": 17.647486033519552, + "grad_norm": 0.348914235830307, + "learning_rate": 0.00011834733893557422, + "loss": 0.3231, + "step": 31589 + }, + { + "epoch": 17.64804469273743, + "grad_norm": 0.36880359053611755, + "learning_rate": 0.00011831932773109244, + "loss": 0.3407, + "step": 31590 + }, + { + "epoch": 17.64860335195531, + "grad_norm": 0.6653019785881042, + "learning_rate": 0.00011829131652661064, + "loss": 0.5821, + "step": 31591 + }, + { + "epoch": 17.649162011173186, + "grad_norm": 0.6810843348503113, + "learning_rate": 0.00011826330532212886, + "loss": 0.3613, + "step": 31592 + }, + { + "epoch": 17.649720670391062, + "grad_norm": 0.7818769812583923, + "learning_rate": 0.00011823529411764706, + "loss": 0.4533, + "step": 31593 + }, + { + "epoch": 17.65027932960894, + "grad_norm": 0.4716668725013733, + "learning_rate": 0.00011820728291316527, + "loss": 0.4091, + "step": 31594 + }, + { + "epoch": 17.650837988826815, + "grad_norm": 11.892498016357422, + "learning_rate": 0.00011817927170868347, + "loss": 0.3633, + "step": 31595 + }, + { + "epoch": 17.65139664804469, + "grad_norm": 0.4523873031139374, + "learning_rate": 0.00011815126050420168, + "loss": 0.4002, + "step": 31596 + }, + { + "epoch": 17.65195530726257, + "grad_norm": 0.401519238948822, + "learning_rate": 0.00011812324929971989, + "loss": 0.3617, + "step": 31597 + }, + { + "epoch": 17.65251396648045, + "grad_norm": 0.6163433194160461, + "learning_rate": 0.0001180952380952381, + "loss": 0.4538, + "step": 31598 + }, + { + "epoch": 17.653072625698325, + "grad_norm": 0.4203943610191345, + "learning_rate": 0.0001180672268907563, + "loss": 0.3265, + "step": 31599 + }, + { + "epoch": 17.6536312849162, + "grad_norm": 0.6386510729789734, + "learning_rate": 0.00011803921568627452, + "loss": 0.4398, + "step": 31600 + }, + { + "epoch": 17.654189944134078, + "grad_norm": 1.06069815158844, + "learning_rate": 0.00011801120448179271, + "loss": 0.4286, + "step": 31601 + }, + { + "epoch": 17.654748603351955, + "grad_norm": 0.3916155695915222, + "learning_rate": 0.00011798319327731093, + "loss": 0.4547, + "step": 31602 + }, + { + "epoch": 17.65530726256983, + "grad_norm": 0.4664710462093353, + "learning_rate": 0.00011795518207282914, + "loss": 0.4157, + "step": 31603 + }, + { + "epoch": 17.655865921787708, + "grad_norm": 0.6798214316368103, + "learning_rate": 0.00011792717086834734, + "loss": 0.3903, + "step": 31604 + }, + { + "epoch": 17.656424581005588, + "grad_norm": 0.7662414312362671, + "learning_rate": 0.00011789915966386555, + "loss": 0.5906, + "step": 31605 + }, + { + "epoch": 17.656983240223465, + "grad_norm": 0.381435364484787, + "learning_rate": 0.00011787114845938375, + "loss": 0.508, + "step": 31606 + }, + { + "epoch": 17.65754189944134, + "grad_norm": 0.36230552196502686, + "learning_rate": 0.00011784313725490196, + "loss": 0.4304, + "step": 31607 + }, + { + "epoch": 17.658100558659218, + "grad_norm": 0.5704683065414429, + "learning_rate": 0.00011781512605042018, + "loss": 0.4462, + "step": 31608 + }, + { + "epoch": 17.658659217877094, + "grad_norm": 0.4187014400959015, + "learning_rate": 0.00011778711484593837, + "loss": 0.3724, + "step": 31609 + }, + { + "epoch": 17.65921787709497, + "grad_norm": 0.5483432412147522, + "learning_rate": 0.00011775910364145659, + "loss": 0.5125, + "step": 31610 + }, + { + "epoch": 17.659776536312847, + "grad_norm": 0.5280621647834778, + "learning_rate": 0.00011773109243697478, + "loss": 0.5092, + "step": 31611 + }, + { + "epoch": 17.660335195530728, + "grad_norm": 0.4235619902610779, + "learning_rate": 0.000117703081232493, + "loss": 0.4665, + "step": 31612 + }, + { + "epoch": 17.660893854748604, + "grad_norm": 0.5079139471054077, + "learning_rate": 0.00011767507002801121, + "loss": 0.4052, + "step": 31613 + }, + { + "epoch": 17.66145251396648, + "grad_norm": 1.5212641954421997, + "learning_rate": 0.00011764705882352942, + "loss": 0.4093, + "step": 31614 + }, + { + "epoch": 17.662011173184357, + "grad_norm": 0.5192087888717651, + "learning_rate": 0.00011761904761904762, + "loss": 0.4886, + "step": 31615 + }, + { + "epoch": 17.662569832402234, + "grad_norm": 0.5584065318107605, + "learning_rate": 0.00011759103641456583, + "loss": 0.4583, + "step": 31616 + }, + { + "epoch": 17.66312849162011, + "grad_norm": 0.5097191333770752, + "learning_rate": 0.00011756302521008403, + "loss": 0.5373, + "step": 31617 + }, + { + "epoch": 17.66368715083799, + "grad_norm": 0.47235700488090515, + "learning_rate": 0.00011753501400560224, + "loss": 0.446, + "step": 31618 + }, + { + "epoch": 17.664245810055867, + "grad_norm": 0.42533260583877563, + "learning_rate": 0.00011750700280112044, + "loss": 0.4097, + "step": 31619 + }, + { + "epoch": 17.664804469273744, + "grad_norm": 0.35144490003585815, + "learning_rate": 0.00011747899159663866, + "loss": 0.4372, + "step": 31620 + }, + { + "epoch": 17.66536312849162, + "grad_norm": 0.3451632857322693, + "learning_rate": 0.00011745098039215686, + "loss": 0.3348, + "step": 31621 + }, + { + "epoch": 17.665921787709497, + "grad_norm": 0.474505215883255, + "learning_rate": 0.00011742296918767508, + "loss": 0.6443, + "step": 31622 + }, + { + "epoch": 17.666480446927373, + "grad_norm": 0.5839816331863403, + "learning_rate": 0.00011739495798319327, + "loss": 0.4087, + "step": 31623 + }, + { + "epoch": 17.66703910614525, + "grad_norm": 0.384731650352478, + "learning_rate": 0.00011736694677871149, + "loss": 0.4137, + "step": 31624 + }, + { + "epoch": 17.66759776536313, + "grad_norm": 0.479227215051651, + "learning_rate": 0.0001173389355742297, + "loss": 0.3764, + "step": 31625 + }, + { + "epoch": 17.668156424581007, + "grad_norm": 0.6528783440589905, + "learning_rate": 0.0001173109243697479, + "loss": 0.4521, + "step": 31626 + }, + { + "epoch": 17.668715083798883, + "grad_norm": 0.9608190655708313, + "learning_rate": 0.0001172829131652661, + "loss": 0.6482, + "step": 31627 + }, + { + "epoch": 17.66927374301676, + "grad_norm": 0.43148183822631836, + "learning_rate": 0.00011725490196078431, + "loss": 0.3637, + "step": 31628 + }, + { + "epoch": 17.669832402234636, + "grad_norm": 0.4659328758716583, + "learning_rate": 0.00011722689075630252, + "loss": 0.4142, + "step": 31629 + }, + { + "epoch": 17.670391061452513, + "grad_norm": 1.5025144815444946, + "learning_rate": 0.00011719887955182074, + "loss": 0.4255, + "step": 31630 + }, + { + "epoch": 17.67094972067039, + "grad_norm": 0.5012494325637817, + "learning_rate": 0.00011717086834733893, + "loss": 0.3259, + "step": 31631 + }, + { + "epoch": 17.67150837988827, + "grad_norm": 0.4289581775665283, + "learning_rate": 0.00011714285714285715, + "loss": 0.4878, + "step": 31632 + }, + { + "epoch": 17.672067039106146, + "grad_norm": 0.3920581042766571, + "learning_rate": 0.00011711484593837534, + "loss": 0.3948, + "step": 31633 + }, + { + "epoch": 17.672625698324023, + "grad_norm": 2.1535732746124268, + "learning_rate": 0.00011708683473389356, + "loss": 0.3313, + "step": 31634 + }, + { + "epoch": 17.6731843575419, + "grad_norm": 0.7451744079589844, + "learning_rate": 0.00011705882352941177, + "loss": 0.4454, + "step": 31635 + }, + { + "epoch": 17.673743016759776, + "grad_norm": 0.4232161045074463, + "learning_rate": 0.00011703081232492997, + "loss": 0.5101, + "step": 31636 + }, + { + "epoch": 17.674301675977652, + "grad_norm": 1.4028230905532837, + "learning_rate": 0.00011700280112044818, + "loss": 0.4238, + "step": 31637 + }, + { + "epoch": 17.674860335195532, + "grad_norm": 0.4034656882286072, + "learning_rate": 0.00011697478991596639, + "loss": 0.4115, + "step": 31638 + }, + { + "epoch": 17.67541899441341, + "grad_norm": 0.48111510276794434, + "learning_rate": 0.00011694677871148459, + "loss": 0.3278, + "step": 31639 + }, + { + "epoch": 17.675977653631286, + "grad_norm": 0.7918464541435242, + "learning_rate": 0.00011691876750700281, + "loss": 0.3576, + "step": 31640 + }, + { + "epoch": 17.676536312849162, + "grad_norm": 0.6013913750648499, + "learning_rate": 0.000116890756302521, + "loss": 0.4078, + "step": 31641 + }, + { + "epoch": 17.67709497206704, + "grad_norm": 0.7468206286430359, + "learning_rate": 0.00011686274509803922, + "loss": 0.5465, + "step": 31642 + }, + { + "epoch": 17.677653631284915, + "grad_norm": 0.6063535213470459, + "learning_rate": 0.00011683473389355742, + "loss": 0.5013, + "step": 31643 + }, + { + "epoch": 17.678212290502792, + "grad_norm": 0.3940153121948242, + "learning_rate": 0.00011680672268907564, + "loss": 0.4193, + "step": 31644 + }, + { + "epoch": 17.678770949720672, + "grad_norm": 0.4078555405139923, + "learning_rate": 0.00011677871148459383, + "loss": 0.3593, + "step": 31645 + }, + { + "epoch": 17.67932960893855, + "grad_norm": 0.4437078833580017, + "learning_rate": 0.00011675070028011205, + "loss": 0.4136, + "step": 31646 + }, + { + "epoch": 17.679888268156425, + "grad_norm": 0.4826681911945343, + "learning_rate": 0.00011672268907563025, + "loss": 0.4074, + "step": 31647 + }, + { + "epoch": 17.6804469273743, + "grad_norm": 0.4207814633846283, + "learning_rate": 0.00011669467787114846, + "loss": 0.4013, + "step": 31648 + }, + { + "epoch": 17.68100558659218, + "grad_norm": 0.3458710014820099, + "learning_rate": 0.00011666666666666667, + "loss": 0.3464, + "step": 31649 + }, + { + "epoch": 17.681564245810055, + "grad_norm": 0.5296543836593628, + "learning_rate": 0.00011663865546218487, + "loss": 0.424, + "step": 31650 + }, + { + "epoch": 17.68212290502793, + "grad_norm": 0.555448055267334, + "learning_rate": 0.00011661064425770308, + "loss": 0.4235, + "step": 31651 + }, + { + "epoch": 17.68268156424581, + "grad_norm": 0.628456175327301, + "learning_rate": 0.0001165826330532213, + "loss": 0.6004, + "step": 31652 + }, + { + "epoch": 17.683240223463688, + "grad_norm": 0.37482598423957825, + "learning_rate": 0.00011655462184873949, + "loss": 0.3871, + "step": 31653 + }, + { + "epoch": 17.683798882681565, + "grad_norm": 15.061306953430176, + "learning_rate": 0.00011652661064425771, + "loss": 0.3937, + "step": 31654 + }, + { + "epoch": 17.68435754189944, + "grad_norm": 0.43291914463043213, + "learning_rate": 0.0001164985994397759, + "loss": 0.4344, + "step": 31655 + }, + { + "epoch": 17.684916201117318, + "grad_norm": 0.4832512140274048, + "learning_rate": 0.00011647058823529412, + "loss": 0.4133, + "step": 31656 + }, + { + "epoch": 17.685474860335194, + "grad_norm": 0.918084442615509, + "learning_rate": 0.00011644257703081233, + "loss": 0.3989, + "step": 31657 + }, + { + "epoch": 17.68603351955307, + "grad_norm": 0.48510441184043884, + "learning_rate": 0.00011641456582633053, + "loss": 0.3668, + "step": 31658 + }, + { + "epoch": 17.68659217877095, + "grad_norm": 0.37789955735206604, + "learning_rate": 0.00011638655462184874, + "loss": 0.3973, + "step": 31659 + }, + { + "epoch": 17.687150837988828, + "grad_norm": 1.3163118362426758, + "learning_rate": 0.00011635854341736694, + "loss": 0.3282, + "step": 31660 + }, + { + "epoch": 17.687709497206704, + "grad_norm": 0.4649355709552765, + "learning_rate": 0.00011633053221288515, + "loss": 0.3604, + "step": 31661 + }, + { + "epoch": 17.68826815642458, + "grad_norm": 0.27869874238967896, + "learning_rate": 0.00011630252100840337, + "loss": 0.302, + "step": 31662 + }, + { + "epoch": 17.688826815642457, + "grad_norm": 0.5352100133895874, + "learning_rate": 0.00011627450980392156, + "loss": 0.566, + "step": 31663 + }, + { + "epoch": 17.689385474860334, + "grad_norm": 0.3969566226005554, + "learning_rate": 0.00011624649859943978, + "loss": 0.2895, + "step": 31664 + }, + { + "epoch": 17.689944134078214, + "grad_norm": 0.36757737398147583, + "learning_rate": 0.00011621848739495797, + "loss": 0.3359, + "step": 31665 + }, + { + "epoch": 17.69050279329609, + "grad_norm": 0.502049446105957, + "learning_rate": 0.0001161904761904762, + "loss": 0.2775, + "step": 31666 + }, + { + "epoch": 17.691061452513967, + "grad_norm": 0.44251585006713867, + "learning_rate": 0.0001161624649859944, + "loss": 0.4396, + "step": 31667 + }, + { + "epoch": 17.691620111731844, + "grad_norm": 0.5007766485214233, + "learning_rate": 0.0001161344537815126, + "loss": 0.4285, + "step": 31668 + }, + { + "epoch": 17.69217877094972, + "grad_norm": 0.3388381600379944, + "learning_rate": 0.00011610644257703081, + "loss": 0.3408, + "step": 31669 + }, + { + "epoch": 17.692737430167597, + "grad_norm": 0.6114059090614319, + "learning_rate": 0.00011607843137254902, + "loss": 0.3896, + "step": 31670 + }, + { + "epoch": 17.693296089385473, + "grad_norm": 0.42510855197906494, + "learning_rate": 0.00011605042016806722, + "loss": 0.459, + "step": 31671 + }, + { + "epoch": 17.693854748603353, + "grad_norm": 0.5139893293380737, + "learning_rate": 0.00011602240896358544, + "loss": 0.4303, + "step": 31672 + }, + { + "epoch": 17.69441340782123, + "grad_norm": 1.0428709983825684, + "learning_rate": 0.00011599439775910364, + "loss": 0.3736, + "step": 31673 + }, + { + "epoch": 17.694972067039107, + "grad_norm": 2.1388893127441406, + "learning_rate": 0.00011596638655462186, + "loss": 0.6118, + "step": 31674 + }, + { + "epoch": 17.695530726256983, + "grad_norm": 0.5511255860328674, + "learning_rate": 0.00011593837535014005, + "loss": 0.564, + "step": 31675 + }, + { + "epoch": 17.69608938547486, + "grad_norm": 0.6469642519950867, + "learning_rate": 0.00011591036414565827, + "loss": 0.3678, + "step": 31676 + }, + { + "epoch": 17.696648044692736, + "grad_norm": 0.42850831151008606, + "learning_rate": 0.00011588235294117646, + "loss": 0.3473, + "step": 31677 + }, + { + "epoch": 17.697206703910613, + "grad_norm": 0.6640798449516296, + "learning_rate": 0.00011585434173669468, + "loss": 0.3513, + "step": 31678 + }, + { + "epoch": 17.697765363128493, + "grad_norm": 1.0331090688705444, + "learning_rate": 0.00011582633053221289, + "loss": 0.5791, + "step": 31679 + }, + { + "epoch": 17.69832402234637, + "grad_norm": 0.37808313965797424, + "learning_rate": 0.00011579831932773109, + "loss": 0.4451, + "step": 31680 + }, + { + "epoch": 17.698882681564246, + "grad_norm": 0.42985180020332336, + "learning_rate": 0.0001157703081232493, + "loss": 0.4207, + "step": 31681 + }, + { + "epoch": 17.699441340782123, + "grad_norm": 0.7293187975883484, + "learning_rate": 0.0001157422969187675, + "loss": 0.3876, + "step": 31682 + }, + { + "epoch": 17.7, + "grad_norm": 3.816993236541748, + "learning_rate": 0.00011571428571428571, + "loss": 0.4487, + "step": 31683 + }, + { + "epoch": 17.700558659217876, + "grad_norm": 0.33883845806121826, + "learning_rate": 0.00011568627450980393, + "loss": 0.3798, + "step": 31684 + }, + { + "epoch": 17.701117318435756, + "grad_norm": 0.4608825445175171, + "learning_rate": 0.00011565826330532212, + "loss": 0.4232, + "step": 31685 + }, + { + "epoch": 17.701675977653633, + "grad_norm": 0.9737520813941956, + "learning_rate": 0.00011563025210084034, + "loss": 0.3791, + "step": 31686 + }, + { + "epoch": 17.70223463687151, + "grad_norm": 0.44187629222869873, + "learning_rate": 0.00011560224089635853, + "loss": 0.3185, + "step": 31687 + }, + { + "epoch": 17.702793296089386, + "grad_norm": 0.872516393661499, + "learning_rate": 0.00011557422969187675, + "loss": 0.4443, + "step": 31688 + }, + { + "epoch": 17.703351955307262, + "grad_norm": 0.548848569393158, + "learning_rate": 0.00011554621848739496, + "loss": 0.4519, + "step": 31689 + }, + { + "epoch": 17.70391061452514, + "grad_norm": 0.47159314155578613, + "learning_rate": 0.00011551820728291317, + "loss": 0.535, + "step": 31690 + }, + { + "epoch": 17.704469273743015, + "grad_norm": 0.6160970330238342, + "learning_rate": 0.00011549019607843137, + "loss": 0.3898, + "step": 31691 + }, + { + "epoch": 17.705027932960895, + "grad_norm": 1.0960477590560913, + "learning_rate": 0.00011546218487394958, + "loss": 0.4759, + "step": 31692 + }, + { + "epoch": 17.705586592178772, + "grad_norm": 0.5396501421928406, + "learning_rate": 0.00011543417366946778, + "loss": 0.3898, + "step": 31693 + }, + { + "epoch": 17.70614525139665, + "grad_norm": 0.6107489466667175, + "learning_rate": 0.000115406162464986, + "loss": 0.4173, + "step": 31694 + }, + { + "epoch": 17.706703910614525, + "grad_norm": 0.5246310830116272, + "learning_rate": 0.0001153781512605042, + "loss": 0.3455, + "step": 31695 + }, + { + "epoch": 17.7072625698324, + "grad_norm": 0.48305490612983704, + "learning_rate": 0.00011535014005602241, + "loss": 0.3954, + "step": 31696 + }, + { + "epoch": 17.70782122905028, + "grad_norm": 0.5337732434272766, + "learning_rate": 0.00011532212885154061, + "loss": 0.3865, + "step": 31697 + }, + { + "epoch": 17.708379888268155, + "grad_norm": 0.7672650218009949, + "learning_rate": 0.00011529411764705883, + "loss": 0.3603, + "step": 31698 + }, + { + "epoch": 17.708938547486035, + "grad_norm": 0.687788188457489, + "learning_rate": 0.00011526610644257705, + "loss": 0.3206, + "step": 31699 + }, + { + "epoch": 17.70949720670391, + "grad_norm": 1.1132733821868896, + "learning_rate": 0.00011523809523809524, + "loss": 0.5971, + "step": 31700 + }, + { + "epoch": 17.710055865921788, + "grad_norm": 0.3982209861278534, + "learning_rate": 0.00011521008403361346, + "loss": 0.3968, + "step": 31701 + }, + { + "epoch": 17.710614525139665, + "grad_norm": 0.40567949414253235, + "learning_rate": 0.00011518207282913165, + "loss": 0.4902, + "step": 31702 + }, + { + "epoch": 17.71117318435754, + "grad_norm": 0.9496821165084839, + "learning_rate": 0.00011515406162464987, + "loss": 0.4258, + "step": 31703 + }, + { + "epoch": 17.711731843575418, + "grad_norm": 0.3718690276145935, + "learning_rate": 0.00011512605042016806, + "loss": 0.371, + "step": 31704 + }, + { + "epoch": 17.712290502793294, + "grad_norm": 0.4091494381427765, + "learning_rate": 0.00011509803921568628, + "loss": 0.3981, + "step": 31705 + }, + { + "epoch": 17.712849162011175, + "grad_norm": 0.34816282987594604, + "learning_rate": 0.00011507002801120449, + "loss": 0.3845, + "step": 31706 + }, + { + "epoch": 17.71340782122905, + "grad_norm": 0.8170983791351318, + "learning_rate": 0.0001150420168067227, + "loss": 0.298, + "step": 31707 + }, + { + "epoch": 17.713966480446928, + "grad_norm": 0.567524790763855, + "learning_rate": 0.0001150140056022409, + "loss": 0.4071, + "step": 31708 + }, + { + "epoch": 17.714525139664804, + "grad_norm": 0.4217959940433502, + "learning_rate": 0.0001149859943977591, + "loss": 0.3282, + "step": 31709 + }, + { + "epoch": 17.71508379888268, + "grad_norm": 0.3706951141357422, + "learning_rate": 0.00011495798319327731, + "loss": 0.3237, + "step": 31710 + }, + { + "epoch": 17.715642458100557, + "grad_norm": 0.4685196578502655, + "learning_rate": 0.00011492997198879553, + "loss": 0.6083, + "step": 31711 + }, + { + "epoch": 17.716201117318434, + "grad_norm": 0.5812598466873169, + "learning_rate": 0.00011490196078431372, + "loss": 0.3995, + "step": 31712 + }, + { + "epoch": 17.716759776536314, + "grad_norm": 0.36034324765205383, + "learning_rate": 0.00011487394957983194, + "loss": 0.327, + "step": 31713 + }, + { + "epoch": 17.71731843575419, + "grad_norm": 0.6904635429382324, + "learning_rate": 0.00011484593837535014, + "loss": 0.5734, + "step": 31714 + }, + { + "epoch": 17.717877094972067, + "grad_norm": 0.40325453877449036, + "learning_rate": 0.00011481792717086836, + "loss": 0.3822, + "step": 31715 + }, + { + "epoch": 17.718435754189944, + "grad_norm": 0.36447393894195557, + "learning_rate": 0.00011478991596638656, + "loss": 0.3569, + "step": 31716 + }, + { + "epoch": 17.71899441340782, + "grad_norm": 0.6370408535003662, + "learning_rate": 0.00011476190476190477, + "loss": 0.4883, + "step": 31717 + }, + { + "epoch": 17.719553072625697, + "grad_norm": 0.37020808458328247, + "learning_rate": 0.00011473389355742297, + "loss": 0.3948, + "step": 31718 + }, + { + "epoch": 17.720111731843577, + "grad_norm": 0.34848758578300476, + "learning_rate": 0.00011470588235294118, + "loss": 0.37, + "step": 31719 + }, + { + "epoch": 17.720670391061454, + "grad_norm": 0.5552564859390259, + "learning_rate": 0.00011467787114845939, + "loss": 0.6072, + "step": 31720 + }, + { + "epoch": 17.72122905027933, + "grad_norm": 0.4255106747150421, + "learning_rate": 0.0001146498599439776, + "loss": 0.4111, + "step": 31721 + }, + { + "epoch": 17.721787709497207, + "grad_norm": 0.5258291959762573, + "learning_rate": 0.0001146218487394958, + "loss": 0.3518, + "step": 31722 + }, + { + "epoch": 17.722346368715083, + "grad_norm": 0.444899320602417, + "learning_rate": 0.00011459383753501402, + "loss": 0.4101, + "step": 31723 + }, + { + "epoch": 17.72290502793296, + "grad_norm": 0.5501758456230164, + "learning_rate": 0.00011456582633053221, + "loss": 0.3817, + "step": 31724 + }, + { + "epoch": 17.723463687150836, + "grad_norm": 1.4332919120788574, + "learning_rate": 0.00011453781512605043, + "loss": 0.3544, + "step": 31725 + }, + { + "epoch": 17.724022346368717, + "grad_norm": 0.3350711464881897, + "learning_rate": 0.00011450980392156864, + "loss": 0.3635, + "step": 31726 + }, + { + "epoch": 17.724581005586593, + "grad_norm": 1.3741832971572876, + "learning_rate": 0.00011448179271708684, + "loss": 0.4342, + "step": 31727 + }, + { + "epoch": 17.72513966480447, + "grad_norm": 2.9018564224243164, + "learning_rate": 0.00011445378151260505, + "loss": 0.4103, + "step": 31728 + }, + { + "epoch": 17.725698324022346, + "grad_norm": 0.33238524198532104, + "learning_rate": 0.00011442577030812325, + "loss": 0.332, + "step": 31729 + }, + { + "epoch": 17.726256983240223, + "grad_norm": 0.39465397596359253, + "learning_rate": 0.00011439775910364146, + "loss": 0.3943, + "step": 31730 + }, + { + "epoch": 17.7268156424581, + "grad_norm": 0.6010270118713379, + "learning_rate": 0.00011436974789915967, + "loss": 0.4098, + "step": 31731 + }, + { + "epoch": 17.727374301675976, + "grad_norm": 0.6202162504196167, + "learning_rate": 0.00011434173669467787, + "loss": 0.6358, + "step": 31732 + }, + { + "epoch": 17.727932960893856, + "grad_norm": 0.4591814875602722, + "learning_rate": 0.00011431372549019609, + "loss": 0.442, + "step": 31733 + }, + { + "epoch": 17.728491620111733, + "grad_norm": 0.5069906115531921, + "learning_rate": 0.00011428571428571428, + "loss": 0.3449, + "step": 31734 + }, + { + "epoch": 17.72905027932961, + "grad_norm": 0.3964703679084778, + "learning_rate": 0.0001142577030812325, + "loss": 0.3507, + "step": 31735 + }, + { + "epoch": 17.729608938547486, + "grad_norm": 0.7453311085700989, + "learning_rate": 0.0001142296918767507, + "loss": 0.4001, + "step": 31736 + }, + { + "epoch": 17.730167597765362, + "grad_norm": 0.42806634306907654, + "learning_rate": 0.00011420168067226891, + "loss": 0.4508, + "step": 31737 + }, + { + "epoch": 17.73072625698324, + "grad_norm": 0.6020180583000183, + "learning_rate": 0.00011417366946778712, + "loss": 0.4229, + "step": 31738 + }, + { + "epoch": 17.73128491620112, + "grad_norm": 0.5702681541442871, + "learning_rate": 0.00011414565826330533, + "loss": 0.4787, + "step": 31739 + }, + { + "epoch": 17.731843575418996, + "grad_norm": 1.486236572265625, + "learning_rate": 0.00011411764705882353, + "loss": 0.4373, + "step": 31740 + }, + { + "epoch": 17.732402234636872, + "grad_norm": 2.740108013153076, + "learning_rate": 0.00011408963585434174, + "loss": 0.4312, + "step": 31741 + }, + { + "epoch": 17.73296089385475, + "grad_norm": 0.6044945120811462, + "learning_rate": 0.00011406162464985994, + "loss": 0.3758, + "step": 31742 + }, + { + "epoch": 17.733519553072625, + "grad_norm": 5.576402187347412, + "learning_rate": 0.00011403361344537816, + "loss": 0.3877, + "step": 31743 + }, + { + "epoch": 17.734078212290502, + "grad_norm": 0.49012938141822815, + "learning_rate": 0.00011400560224089636, + "loss": 0.4137, + "step": 31744 + }, + { + "epoch": 17.73463687150838, + "grad_norm": 0.3953683376312256, + "learning_rate": 0.00011397759103641458, + "loss": 0.4363, + "step": 31745 + }, + { + "epoch": 17.73519553072626, + "grad_norm": 0.9253637790679932, + "learning_rate": 0.00011394957983193277, + "loss": 0.3309, + "step": 31746 + }, + { + "epoch": 17.735754189944135, + "grad_norm": 0.3485408425331116, + "learning_rate": 0.00011392156862745099, + "loss": 0.4461, + "step": 31747 + }, + { + "epoch": 17.73631284916201, + "grad_norm": 5.815307140350342, + "learning_rate": 0.0001138935574229692, + "loss": 0.5169, + "step": 31748 + }, + { + "epoch": 17.73687150837989, + "grad_norm": 0.5165740847587585, + "learning_rate": 0.0001138655462184874, + "loss": 0.3959, + "step": 31749 + }, + { + "epoch": 17.737430167597765, + "grad_norm": 1.4824273586273193, + "learning_rate": 0.0001138375350140056, + "loss": 0.466, + "step": 31750 + }, + { + "epoch": 17.73798882681564, + "grad_norm": 0.4407392144203186, + "learning_rate": 0.00011380952380952381, + "loss": 0.5506, + "step": 31751 + }, + { + "epoch": 17.738547486033518, + "grad_norm": 0.42283546924591064, + "learning_rate": 0.00011378151260504202, + "loss": 0.3625, + "step": 31752 + }, + { + "epoch": 17.739106145251398, + "grad_norm": 1.8034919500350952, + "learning_rate": 0.00011375350140056024, + "loss": 0.4253, + "step": 31753 + }, + { + "epoch": 17.739664804469275, + "grad_norm": 0.8658513426780701, + "learning_rate": 0.00011372549019607843, + "loss": 0.3778, + "step": 31754 + }, + { + "epoch": 17.74022346368715, + "grad_norm": 0.4768412113189697, + "learning_rate": 0.00011369747899159665, + "loss": 0.518, + "step": 31755 + }, + { + "epoch": 17.740782122905028, + "grad_norm": 0.480977863073349, + "learning_rate": 0.00011366946778711484, + "loss": 0.4105, + "step": 31756 + }, + { + "epoch": 17.741340782122904, + "grad_norm": 0.4641282558441162, + "learning_rate": 0.00011364145658263306, + "loss": 0.4152, + "step": 31757 + }, + { + "epoch": 17.74189944134078, + "grad_norm": 0.5431604385375977, + "learning_rate": 0.00011361344537815125, + "loss": 0.5353, + "step": 31758 + }, + { + "epoch": 17.742458100558657, + "grad_norm": 0.41398873925209045, + "learning_rate": 0.00011358543417366947, + "loss": 0.4273, + "step": 31759 + }, + { + "epoch": 17.743016759776538, + "grad_norm": 0.452446848154068, + "learning_rate": 0.00011355742296918768, + "loss": 0.335, + "step": 31760 + }, + { + "epoch": 17.743575418994414, + "grad_norm": 0.6121146082878113, + "learning_rate": 0.00011352941176470589, + "loss": 0.4743, + "step": 31761 + }, + { + "epoch": 17.74413407821229, + "grad_norm": 1.3366907835006714, + "learning_rate": 0.00011350140056022409, + "loss": 0.4244, + "step": 31762 + }, + { + "epoch": 17.744692737430167, + "grad_norm": 0.50154709815979, + "learning_rate": 0.0001134733893557423, + "loss": 0.3787, + "step": 31763 + }, + { + "epoch": 17.745251396648044, + "grad_norm": 0.45586681365966797, + "learning_rate": 0.0001134453781512605, + "loss": 0.3345, + "step": 31764 + }, + { + "epoch": 17.74581005586592, + "grad_norm": 0.36819028854370117, + "learning_rate": 0.00011341736694677872, + "loss": 0.3532, + "step": 31765 + }, + { + "epoch": 17.7463687150838, + "grad_norm": 0.3592306077480316, + "learning_rate": 0.00011338935574229692, + "loss": 0.4122, + "step": 31766 + }, + { + "epoch": 17.746927374301677, + "grad_norm": 0.36595141887664795, + "learning_rate": 0.00011336134453781514, + "loss": 0.4265, + "step": 31767 + }, + { + "epoch": 17.747486033519554, + "grad_norm": 0.40586674213409424, + "learning_rate": 0.00011333333333333333, + "loss": 0.4106, + "step": 31768 + }, + { + "epoch": 17.74804469273743, + "grad_norm": 1.2317415475845337, + "learning_rate": 0.00011330532212885155, + "loss": 0.3875, + "step": 31769 + }, + { + "epoch": 17.748603351955307, + "grad_norm": 0.39180266857147217, + "learning_rate": 0.00011327731092436975, + "loss": 0.2927, + "step": 31770 + }, + { + "epoch": 17.749162011173183, + "grad_norm": 0.447621613740921, + "learning_rate": 0.00011324929971988796, + "loss": 0.3638, + "step": 31771 + }, + { + "epoch": 17.74972067039106, + "grad_norm": 0.3200156092643738, + "learning_rate": 0.00011322128851540617, + "loss": 0.3719, + "step": 31772 + }, + { + "epoch": 17.75027932960894, + "grad_norm": 0.3594907522201538, + "learning_rate": 0.00011319327731092437, + "loss": 0.3404, + "step": 31773 + }, + { + "epoch": 17.750837988826817, + "grad_norm": 0.3619016408920288, + "learning_rate": 0.00011316526610644258, + "loss": 0.2689, + "step": 31774 + }, + { + "epoch": 17.751396648044693, + "grad_norm": 0.4687427580356598, + "learning_rate": 0.0001131372549019608, + "loss": 0.3966, + "step": 31775 + }, + { + "epoch": 17.75195530726257, + "grad_norm": 2.486599922180176, + "learning_rate": 0.00011310924369747899, + "loss": 0.4061, + "step": 31776 + }, + { + "epoch": 17.752513966480446, + "grad_norm": 0.5070294141769409, + "learning_rate": 0.00011308123249299721, + "loss": 0.4258, + "step": 31777 + }, + { + "epoch": 17.753072625698323, + "grad_norm": 0.8229541182518005, + "learning_rate": 0.0001130532212885154, + "loss": 0.4483, + "step": 31778 + }, + { + "epoch": 17.7536312849162, + "grad_norm": 0.4754789173603058, + "learning_rate": 0.00011302521008403362, + "loss": 0.5531, + "step": 31779 + }, + { + "epoch": 17.75418994413408, + "grad_norm": 0.8967899680137634, + "learning_rate": 0.00011299719887955183, + "loss": 0.3463, + "step": 31780 + }, + { + "epoch": 17.754748603351956, + "grad_norm": 0.4442479908466339, + "learning_rate": 0.00011296918767507003, + "loss": 0.4539, + "step": 31781 + }, + { + "epoch": 17.755307262569833, + "grad_norm": 0.48731866478919983, + "learning_rate": 0.00011294117647058824, + "loss": 0.4408, + "step": 31782 + }, + { + "epoch": 17.75586592178771, + "grad_norm": 0.3123086094856262, + "learning_rate": 0.00011291316526610644, + "loss": 0.294, + "step": 31783 + }, + { + "epoch": 17.756424581005586, + "grad_norm": 0.45249998569488525, + "learning_rate": 0.00011288515406162465, + "loss": 0.3584, + "step": 31784 + }, + { + "epoch": 17.756983240223462, + "grad_norm": 0.36949098110198975, + "learning_rate": 0.00011285714285714286, + "loss": 0.4162, + "step": 31785 + }, + { + "epoch": 17.757541899441343, + "grad_norm": 0.391250342130661, + "learning_rate": 0.00011282913165266106, + "loss": 0.3537, + "step": 31786 + }, + { + "epoch": 17.75810055865922, + "grad_norm": 0.4333246350288391, + "learning_rate": 0.00011280112044817928, + "loss": 0.3671, + "step": 31787 + }, + { + "epoch": 17.758659217877096, + "grad_norm": 0.4002021253108978, + "learning_rate": 0.00011277310924369747, + "loss": 0.4698, + "step": 31788 + }, + { + "epoch": 17.759217877094972, + "grad_norm": 0.46529802680015564, + "learning_rate": 0.0001127450980392157, + "loss": 0.4365, + "step": 31789 + }, + { + "epoch": 17.75977653631285, + "grad_norm": 0.4416027069091797, + "learning_rate": 0.00011271708683473389, + "loss": 0.467, + "step": 31790 + }, + { + "epoch": 17.760335195530725, + "grad_norm": 0.38470885157585144, + "learning_rate": 0.0001126890756302521, + "loss": 0.4718, + "step": 31791 + }, + { + "epoch": 17.760893854748602, + "grad_norm": 2.6901586055755615, + "learning_rate": 0.00011266106442577031, + "loss": 0.3468, + "step": 31792 + }, + { + "epoch": 17.761452513966482, + "grad_norm": 0.5298575758934021, + "learning_rate": 0.00011263305322128852, + "loss": 0.4428, + "step": 31793 + }, + { + "epoch": 17.76201117318436, + "grad_norm": 0.36829259991645813, + "learning_rate": 0.00011260504201680672, + "loss": 0.3987, + "step": 31794 + }, + { + "epoch": 17.762569832402235, + "grad_norm": 0.6363039016723633, + "learning_rate": 0.00011257703081232493, + "loss": 0.496, + "step": 31795 + }, + { + "epoch": 17.76312849162011, + "grad_norm": 0.37412530183792114, + "learning_rate": 0.00011254901960784314, + "loss": 0.4066, + "step": 31796 + }, + { + "epoch": 17.76368715083799, + "grad_norm": 3.429553747177124, + "learning_rate": 0.00011252100840336136, + "loss": 0.4323, + "step": 31797 + }, + { + "epoch": 17.764245810055865, + "grad_norm": 1.2768347263336182, + "learning_rate": 0.00011249299719887955, + "loss": 0.5037, + "step": 31798 + }, + { + "epoch": 17.76480446927374, + "grad_norm": 0.31131407618522644, + "learning_rate": 0.00011246498599439777, + "loss": 0.3964, + "step": 31799 + }, + { + "epoch": 17.76536312849162, + "grad_norm": 0.39107710123062134, + "learning_rate": 0.00011243697478991596, + "loss": 0.3687, + "step": 31800 + }, + { + "epoch": 17.765921787709498, + "grad_norm": 0.3947944641113281, + "learning_rate": 0.00011240896358543418, + "loss": 0.4411, + "step": 31801 + }, + { + "epoch": 17.766480446927375, + "grad_norm": 0.32954323291778564, + "learning_rate": 0.00011238095238095239, + "loss": 0.3008, + "step": 31802 + }, + { + "epoch": 17.76703910614525, + "grad_norm": 0.45163053274154663, + "learning_rate": 0.00011235294117647059, + "loss": 0.5035, + "step": 31803 + }, + { + "epoch": 17.767597765363128, + "grad_norm": 0.3795064091682434, + "learning_rate": 0.0001123249299719888, + "loss": 0.3447, + "step": 31804 + }, + { + "epoch": 17.768156424581004, + "grad_norm": 1.524694800376892, + "learning_rate": 0.000112296918767507, + "loss": 0.4474, + "step": 31805 + }, + { + "epoch": 17.76871508379888, + "grad_norm": 0.34645721316337585, + "learning_rate": 0.00011226890756302521, + "loss": 0.3678, + "step": 31806 + }, + { + "epoch": 17.76927374301676, + "grad_norm": 0.4489235281944275, + "learning_rate": 0.00011224089635854343, + "loss": 0.4142, + "step": 31807 + }, + { + "epoch": 17.769832402234638, + "grad_norm": 0.3958072364330292, + "learning_rate": 0.00011221288515406162, + "loss": 0.458, + "step": 31808 + }, + { + "epoch": 17.770391061452514, + "grad_norm": 1.2771408557891846, + "learning_rate": 0.00011218487394957984, + "loss": 0.3309, + "step": 31809 + }, + { + "epoch": 17.77094972067039, + "grad_norm": 0.3904511332511902, + "learning_rate": 0.00011215686274509803, + "loss": 0.3707, + "step": 31810 + }, + { + "epoch": 17.771508379888267, + "grad_norm": 0.3633131682872772, + "learning_rate": 0.00011212885154061625, + "loss": 0.3726, + "step": 31811 + }, + { + "epoch": 17.772067039106144, + "grad_norm": 0.5046189427375793, + "learning_rate": 0.00011210084033613445, + "loss": 0.3883, + "step": 31812 + }, + { + "epoch": 17.772625698324024, + "grad_norm": 0.42271968722343445, + "learning_rate": 0.00011207282913165267, + "loss": 0.3196, + "step": 31813 + }, + { + "epoch": 17.7731843575419, + "grad_norm": 0.3760518729686737, + "learning_rate": 0.00011204481792717087, + "loss": 0.4372, + "step": 31814 + }, + { + "epoch": 17.773743016759777, + "grad_norm": 0.9491005539894104, + "learning_rate": 0.00011201680672268908, + "loss": 0.3523, + "step": 31815 + }, + { + "epoch": 17.774301675977654, + "grad_norm": 0.4184345304965973, + "learning_rate": 0.00011198879551820728, + "loss": 0.3349, + "step": 31816 + }, + { + "epoch": 17.77486033519553, + "grad_norm": 1.4697279930114746, + "learning_rate": 0.00011196078431372549, + "loss": 0.486, + "step": 31817 + }, + { + "epoch": 17.775418994413407, + "grad_norm": 0.6576196551322937, + "learning_rate": 0.0001119327731092437, + "loss": 0.3433, + "step": 31818 + }, + { + "epoch": 17.775977653631283, + "grad_norm": 1.0015850067138672, + "learning_rate": 0.00011190476190476191, + "loss": 0.3722, + "step": 31819 + }, + { + "epoch": 17.776536312849164, + "grad_norm": 0.5193490982055664, + "learning_rate": 0.00011187675070028011, + "loss": 0.4575, + "step": 31820 + }, + { + "epoch": 17.77709497206704, + "grad_norm": 0.3868086040019989, + "learning_rate": 0.00011184873949579833, + "loss": 0.3511, + "step": 31821 + }, + { + "epoch": 17.777653631284917, + "grad_norm": 0.6862823963165283, + "learning_rate": 0.00011182072829131652, + "loss": 0.4379, + "step": 31822 + }, + { + "epoch": 17.778212290502793, + "grad_norm": 0.5161616802215576, + "learning_rate": 0.00011179271708683474, + "loss": 0.4775, + "step": 31823 + }, + { + "epoch": 17.77877094972067, + "grad_norm": 0.5153317451477051, + "learning_rate": 0.00011176470588235294, + "loss": 0.4161, + "step": 31824 + }, + { + "epoch": 17.779329608938546, + "grad_norm": 0.4883970618247986, + "learning_rate": 0.00011173669467787115, + "loss": 0.3423, + "step": 31825 + }, + { + "epoch": 17.779888268156423, + "grad_norm": 0.5907506346702576, + "learning_rate": 0.00011170868347338936, + "loss": 0.3985, + "step": 31826 + }, + { + "epoch": 17.780446927374303, + "grad_norm": 0.7341945171356201, + "learning_rate": 0.00011168067226890756, + "loss": 0.3556, + "step": 31827 + }, + { + "epoch": 17.78100558659218, + "grad_norm": 0.2722187638282776, + "learning_rate": 0.00011165266106442577, + "loss": 0.2608, + "step": 31828 + }, + { + "epoch": 17.781564245810056, + "grad_norm": 1.0369043350219727, + "learning_rate": 0.00011162464985994399, + "loss": 0.4905, + "step": 31829 + }, + { + "epoch": 17.782122905027933, + "grad_norm": 0.3412696123123169, + "learning_rate": 0.00011159663865546218, + "loss": 0.3563, + "step": 31830 + }, + { + "epoch": 17.78268156424581, + "grad_norm": 0.3238923251628876, + "learning_rate": 0.0001115686274509804, + "loss": 0.3186, + "step": 31831 + }, + { + "epoch": 17.783240223463686, + "grad_norm": 0.872382640838623, + "learning_rate": 0.00011154061624649859, + "loss": 0.4017, + "step": 31832 + }, + { + "epoch": 17.783798882681566, + "grad_norm": 0.3689119815826416, + "learning_rate": 0.00011151260504201681, + "loss": 0.3306, + "step": 31833 + }, + { + "epoch": 17.784357541899443, + "grad_norm": 0.4544695317745209, + "learning_rate": 0.00011148459383753502, + "loss": 0.4932, + "step": 31834 + }, + { + "epoch": 17.78491620111732, + "grad_norm": 0.451572448015213, + "learning_rate": 0.00011145658263305322, + "loss": 0.3543, + "step": 31835 + }, + { + "epoch": 17.785474860335196, + "grad_norm": 0.40825968980789185, + "learning_rate": 0.00011142857142857143, + "loss": 0.4308, + "step": 31836 + }, + { + "epoch": 17.786033519553072, + "grad_norm": 2.6506662368774414, + "learning_rate": 0.00011140056022408964, + "loss": 0.3571, + "step": 31837 + }, + { + "epoch": 17.78659217877095, + "grad_norm": 0.3947301208972931, + "learning_rate": 0.00011137254901960784, + "loss": 0.3348, + "step": 31838 + }, + { + "epoch": 17.787150837988825, + "grad_norm": 2.443533420562744, + "learning_rate": 0.00011134453781512606, + "loss": 0.3631, + "step": 31839 + }, + { + "epoch": 17.787709497206706, + "grad_norm": 0.3971710205078125, + "learning_rate": 0.00011131652661064425, + "loss": 0.3422, + "step": 31840 + }, + { + "epoch": 17.788268156424582, + "grad_norm": 0.6614204049110413, + "learning_rate": 0.00011128851540616247, + "loss": 0.3085, + "step": 31841 + }, + { + "epoch": 17.78882681564246, + "grad_norm": 0.3786182403564453, + "learning_rate": 0.00011126050420168067, + "loss": 0.3332, + "step": 31842 + }, + { + "epoch": 17.789385474860335, + "grad_norm": 1.9345414638519287, + "learning_rate": 0.00011123249299719889, + "loss": 0.4199, + "step": 31843 + }, + { + "epoch": 17.789944134078212, + "grad_norm": 0.49531999230384827, + "learning_rate": 0.00011120448179271708, + "loss": 0.5077, + "step": 31844 + }, + { + "epoch": 17.79050279329609, + "grad_norm": 0.49440038204193115, + "learning_rate": 0.0001111764705882353, + "loss": 0.4385, + "step": 31845 + }, + { + "epoch": 17.791061452513965, + "grad_norm": 2.345907688140869, + "learning_rate": 0.0001111484593837535, + "loss": 0.3708, + "step": 31846 + }, + { + "epoch": 17.791620111731845, + "grad_norm": 0.780145525932312, + "learning_rate": 0.00011112044817927171, + "loss": 0.3446, + "step": 31847 + }, + { + "epoch": 17.79217877094972, + "grad_norm": 0.4204564690589905, + "learning_rate": 0.00011109243697478992, + "loss": 0.3942, + "step": 31848 + }, + { + "epoch": 17.7927374301676, + "grad_norm": 0.4733802080154419, + "learning_rate": 0.00011106442577030812, + "loss": 0.4914, + "step": 31849 + }, + { + "epoch": 17.793296089385475, + "grad_norm": 0.49106743931770325, + "learning_rate": 0.00011103641456582633, + "loss": 0.4141, + "step": 31850 + }, + { + "epoch": 17.79385474860335, + "grad_norm": 2.615499973297119, + "learning_rate": 0.00011100840336134455, + "loss": 0.4028, + "step": 31851 + }, + { + "epoch": 17.794413407821228, + "grad_norm": 0.5444148182868958, + "learning_rate": 0.00011098039215686274, + "loss": 0.4486, + "step": 31852 + }, + { + "epoch": 17.794972067039105, + "grad_norm": 0.3481050431728363, + "learning_rate": 0.00011095238095238096, + "loss": 0.3265, + "step": 31853 + }, + { + "epoch": 17.795530726256985, + "grad_norm": 0.4157502353191376, + "learning_rate": 0.00011092436974789915, + "loss": 0.338, + "step": 31854 + }, + { + "epoch": 17.79608938547486, + "grad_norm": 0.4834425449371338, + "learning_rate": 0.00011089635854341737, + "loss": 0.4843, + "step": 31855 + }, + { + "epoch": 17.796648044692738, + "grad_norm": 0.4319273829460144, + "learning_rate": 0.00011086834733893558, + "loss": 0.3502, + "step": 31856 + }, + { + "epoch": 17.797206703910614, + "grad_norm": 0.5160530805587769, + "learning_rate": 0.00011084033613445378, + "loss": 0.4258, + "step": 31857 + }, + { + "epoch": 17.79776536312849, + "grad_norm": 0.37492382526397705, + "learning_rate": 0.00011081232492997199, + "loss": 0.3746, + "step": 31858 + }, + { + "epoch": 17.798324022346367, + "grad_norm": 5.192200660705566, + "learning_rate": 0.0001107843137254902, + "loss": 0.4337, + "step": 31859 + }, + { + "epoch": 17.798882681564244, + "grad_norm": 0.6172341108322144, + "learning_rate": 0.0001107563025210084, + "loss": 0.45, + "step": 31860 + }, + { + "epoch": 17.799441340782124, + "grad_norm": 0.5336006879806519, + "learning_rate": 0.00011072829131652662, + "loss": 0.431, + "step": 31861 + }, + { + "epoch": 17.8, + "grad_norm": 0.5429198741912842, + "learning_rate": 0.00011070028011204481, + "loss": 0.5343, + "step": 31862 + }, + { + "epoch": 17.800558659217877, + "grad_norm": 0.6353127956390381, + "learning_rate": 0.00011067226890756303, + "loss": 0.5253, + "step": 31863 + }, + { + "epoch": 17.801117318435754, + "grad_norm": 1.7069145441055298, + "learning_rate": 0.00011064425770308123, + "loss": 0.4263, + "step": 31864 + }, + { + "epoch": 17.80167597765363, + "grad_norm": 0.5213304162025452, + "learning_rate": 0.00011061624649859944, + "loss": 0.5414, + "step": 31865 + }, + { + "epoch": 17.802234636871507, + "grad_norm": 6.125345230102539, + "learning_rate": 0.00011058823529411765, + "loss": 0.3751, + "step": 31866 + }, + { + "epoch": 17.802793296089387, + "grad_norm": 0.36859026551246643, + "learning_rate": 0.00011056022408963586, + "loss": 0.4566, + "step": 31867 + }, + { + "epoch": 17.803351955307264, + "grad_norm": 0.4032394587993622, + "learning_rate": 0.00011053221288515406, + "loss": 0.489, + "step": 31868 + }, + { + "epoch": 17.80391061452514, + "grad_norm": 0.45502305030822754, + "learning_rate": 0.00011050420168067227, + "loss": 0.4236, + "step": 31869 + }, + { + "epoch": 17.804469273743017, + "grad_norm": 0.4770444631576538, + "learning_rate": 0.00011047619047619047, + "loss": 0.4102, + "step": 31870 + }, + { + "epoch": 17.805027932960893, + "grad_norm": 0.3851770758628845, + "learning_rate": 0.00011044817927170868, + "loss": 0.333, + "step": 31871 + }, + { + "epoch": 17.80558659217877, + "grad_norm": 0.4486406743526459, + "learning_rate": 0.00011042016806722689, + "loss": 0.3953, + "step": 31872 + }, + { + "epoch": 17.806145251396647, + "grad_norm": 0.4979979395866394, + "learning_rate": 0.0001103921568627451, + "loss": 0.3086, + "step": 31873 + }, + { + "epoch": 17.806703910614527, + "grad_norm": 0.43344834446907043, + "learning_rate": 0.0001103641456582633, + "loss": 0.2791, + "step": 31874 + }, + { + "epoch": 17.807262569832403, + "grad_norm": 0.3375483751296997, + "learning_rate": 0.00011033613445378152, + "loss": 0.3637, + "step": 31875 + }, + { + "epoch": 17.80782122905028, + "grad_norm": 0.5579739809036255, + "learning_rate": 0.00011030812324929971, + "loss": 0.3386, + "step": 31876 + }, + { + "epoch": 17.808379888268156, + "grad_norm": 0.6381669640541077, + "learning_rate": 0.00011028011204481793, + "loss": 0.4795, + "step": 31877 + }, + { + "epoch": 17.808938547486033, + "grad_norm": 0.5497968196868896, + "learning_rate": 0.00011025210084033614, + "loss": 0.5316, + "step": 31878 + }, + { + "epoch": 17.80949720670391, + "grad_norm": 0.6413907408714294, + "learning_rate": 0.00011022408963585434, + "loss": 0.2878, + "step": 31879 + }, + { + "epoch": 17.810055865921786, + "grad_norm": 0.3716050386428833, + "learning_rate": 0.00011019607843137255, + "loss": 0.3482, + "step": 31880 + }, + { + "epoch": 17.810614525139666, + "grad_norm": 0.31638702750205994, + "learning_rate": 0.00011016806722689075, + "loss": 0.3785, + "step": 31881 + }, + { + "epoch": 17.811173184357543, + "grad_norm": 1.0723401308059692, + "learning_rate": 0.00011014005602240896, + "loss": 0.3525, + "step": 31882 + }, + { + "epoch": 17.81173184357542, + "grad_norm": 0.5220655202865601, + "learning_rate": 0.00011011204481792718, + "loss": 0.4249, + "step": 31883 + }, + { + "epoch": 17.812290502793296, + "grad_norm": 0.6201823949813843, + "learning_rate": 0.00011008403361344537, + "loss": 0.3958, + "step": 31884 + }, + { + "epoch": 17.812849162011172, + "grad_norm": 0.3612859845161438, + "learning_rate": 0.00011005602240896359, + "loss": 0.3515, + "step": 31885 + }, + { + "epoch": 17.81340782122905, + "grad_norm": 0.6465184092521667, + "learning_rate": 0.00011002801120448178, + "loss": 0.4484, + "step": 31886 + }, + { + "epoch": 17.81396648044693, + "grad_norm": 0.410290390253067, + "learning_rate": 0.00011, + "loss": 0.3302, + "step": 31887 + }, + { + "epoch": 17.814525139664806, + "grad_norm": 0.7216017246246338, + "learning_rate": 0.00010997198879551821, + "loss": 0.6642, + "step": 31888 + }, + { + "epoch": 17.815083798882682, + "grad_norm": 0.36545509099960327, + "learning_rate": 0.00010994397759103642, + "loss": 0.4407, + "step": 31889 + }, + { + "epoch": 17.81564245810056, + "grad_norm": 0.5036261677742004, + "learning_rate": 0.00010991596638655462, + "loss": 0.4231, + "step": 31890 + }, + { + "epoch": 17.816201117318435, + "grad_norm": 0.6891294717788696, + "learning_rate": 0.00010988795518207283, + "loss": 0.5012, + "step": 31891 + }, + { + "epoch": 17.816759776536312, + "grad_norm": 0.5240914821624756, + "learning_rate": 0.00010985994397759103, + "loss": 0.3678, + "step": 31892 + }, + { + "epoch": 17.81731843575419, + "grad_norm": 0.5042918920516968, + "learning_rate": 0.00010983193277310925, + "loss": 0.4614, + "step": 31893 + }, + { + "epoch": 17.81787709497207, + "grad_norm": 0.6323612332344055, + "learning_rate": 0.00010980392156862745, + "loss": 0.3963, + "step": 31894 + }, + { + "epoch": 17.818435754189945, + "grad_norm": 0.36174264550209045, + "learning_rate": 0.00010977591036414567, + "loss": 0.4666, + "step": 31895 + }, + { + "epoch": 17.81899441340782, + "grad_norm": 0.42684587836265564, + "learning_rate": 0.00010974789915966386, + "loss": 0.4552, + "step": 31896 + }, + { + "epoch": 17.8195530726257, + "grad_norm": 0.5217669010162354, + "learning_rate": 0.00010971988795518208, + "loss": 0.5117, + "step": 31897 + }, + { + "epoch": 17.820111731843575, + "grad_norm": 0.5353214144706726, + "learning_rate": 0.00010969187675070027, + "loss": 0.4298, + "step": 31898 + }, + { + "epoch": 17.82067039106145, + "grad_norm": 0.41908028721809387, + "learning_rate": 0.00010966386554621849, + "loss": 0.3252, + "step": 31899 + }, + { + "epoch": 17.821229050279328, + "grad_norm": 2.6000418663024902, + "learning_rate": 0.0001096358543417367, + "loss": 0.4494, + "step": 31900 + }, + { + "epoch": 17.821787709497208, + "grad_norm": 0.9905964732170105, + "learning_rate": 0.0001096078431372549, + "loss": 0.2856, + "step": 31901 + }, + { + "epoch": 17.822346368715085, + "grad_norm": 0.5244598388671875, + "learning_rate": 0.00010957983193277311, + "loss": 0.4672, + "step": 31902 + }, + { + "epoch": 17.82290502793296, + "grad_norm": 0.34664949774742126, + "learning_rate": 0.00010955182072829131, + "loss": 0.3729, + "step": 31903 + }, + { + "epoch": 17.823463687150838, + "grad_norm": 0.3882288634777069, + "learning_rate": 0.00010952380952380952, + "loss": 0.3285, + "step": 31904 + }, + { + "epoch": 17.824022346368714, + "grad_norm": 0.5065073370933533, + "learning_rate": 0.00010949579831932774, + "loss": 0.4638, + "step": 31905 + }, + { + "epoch": 17.82458100558659, + "grad_norm": 0.41068363189697266, + "learning_rate": 0.00010946778711484593, + "loss": 0.3416, + "step": 31906 + }, + { + "epoch": 17.825139664804468, + "grad_norm": 0.31750819087028503, + "learning_rate": 0.00010943977591036415, + "loss": 0.3399, + "step": 31907 + }, + { + "epoch": 17.825698324022348, + "grad_norm": 0.8183224201202393, + "learning_rate": 0.00010941176470588234, + "loss": 0.3684, + "step": 31908 + }, + { + "epoch": 17.826256983240224, + "grad_norm": 0.5211698412895203, + "learning_rate": 0.00010938375350140056, + "loss": 0.4143, + "step": 31909 + }, + { + "epoch": 17.8268156424581, + "grad_norm": 2.1088993549346924, + "learning_rate": 0.00010935574229691877, + "loss": 0.4852, + "step": 31910 + }, + { + "epoch": 17.827374301675977, + "grad_norm": 0.5901950001716614, + "learning_rate": 0.00010932773109243697, + "loss": 0.4276, + "step": 31911 + }, + { + "epoch": 17.827932960893854, + "grad_norm": 0.5979985594749451, + "learning_rate": 0.00010929971988795518, + "loss": 0.4371, + "step": 31912 + }, + { + "epoch": 17.82849162011173, + "grad_norm": 0.39830952882766724, + "learning_rate": 0.00010927170868347339, + "loss": 0.3631, + "step": 31913 + }, + { + "epoch": 17.82905027932961, + "grad_norm": 0.43950772285461426, + "learning_rate": 0.00010924369747899159, + "loss": 0.374, + "step": 31914 + }, + { + "epoch": 17.829608938547487, + "grad_norm": 0.4283125102519989, + "learning_rate": 0.00010921568627450981, + "loss": 0.3383, + "step": 31915 + }, + { + "epoch": 17.830167597765364, + "grad_norm": 0.47540703415870667, + "learning_rate": 0.000109187675070028, + "loss": 0.4599, + "step": 31916 + }, + { + "epoch": 17.83072625698324, + "grad_norm": 0.3979719579219818, + "learning_rate": 0.00010915966386554622, + "loss": 0.2708, + "step": 31917 + }, + { + "epoch": 17.831284916201117, + "grad_norm": 0.43210309743881226, + "learning_rate": 0.00010913165266106442, + "loss": 0.4322, + "step": 31918 + }, + { + "epoch": 17.831843575418993, + "grad_norm": 2.098341226577759, + "learning_rate": 0.00010910364145658264, + "loss": 0.4979, + "step": 31919 + }, + { + "epoch": 17.83240223463687, + "grad_norm": 1.82694673538208, + "learning_rate": 0.00010907563025210086, + "loss": 0.3639, + "step": 31920 + }, + { + "epoch": 17.83296089385475, + "grad_norm": 1.404898762702942, + "learning_rate": 0.00010904761904761905, + "loss": 0.3612, + "step": 31921 + }, + { + "epoch": 17.833519553072627, + "grad_norm": 0.3981015980243683, + "learning_rate": 0.00010901960784313727, + "loss": 0.4262, + "step": 31922 + }, + { + "epoch": 17.834078212290503, + "grad_norm": 10.760833740234375, + "learning_rate": 0.00010899159663865546, + "loss": 0.4608, + "step": 31923 + }, + { + "epoch": 17.83463687150838, + "grad_norm": 1.3174183368682861, + "learning_rate": 0.00010896358543417368, + "loss": 0.425, + "step": 31924 + }, + { + "epoch": 17.835195530726256, + "grad_norm": 0.4950545132160187, + "learning_rate": 0.00010893557422969187, + "loss": 0.3357, + "step": 31925 + }, + { + "epoch": 17.835754189944133, + "grad_norm": 0.46289488673210144, + "learning_rate": 0.00010890756302521009, + "loss": 0.5155, + "step": 31926 + }, + { + "epoch": 17.83631284916201, + "grad_norm": 0.4915810823440552, + "learning_rate": 0.0001088795518207283, + "loss": 0.3832, + "step": 31927 + }, + { + "epoch": 17.83687150837989, + "grad_norm": 0.4239140748977661, + "learning_rate": 0.0001088515406162465, + "loss": 0.3538, + "step": 31928 + }, + { + "epoch": 17.837430167597766, + "grad_norm": 0.4773687720298767, + "learning_rate": 0.00010882352941176471, + "loss": 0.4977, + "step": 31929 + }, + { + "epoch": 17.837988826815643, + "grad_norm": 0.4510042667388916, + "learning_rate": 0.00010879551820728292, + "loss": 0.494, + "step": 31930 + }, + { + "epoch": 17.83854748603352, + "grad_norm": 0.363717645406723, + "learning_rate": 0.00010876750700280112, + "loss": 0.4335, + "step": 31931 + }, + { + "epoch": 17.839106145251396, + "grad_norm": 0.36441436409950256, + "learning_rate": 0.00010873949579831934, + "loss": 0.3809, + "step": 31932 + }, + { + "epoch": 17.839664804469272, + "grad_norm": 0.4575157165527344, + "learning_rate": 0.00010871148459383753, + "loss": 0.4163, + "step": 31933 + }, + { + "epoch": 17.840223463687153, + "grad_norm": 0.3995513617992401, + "learning_rate": 0.00010868347338935575, + "loss": 0.3774, + "step": 31934 + }, + { + "epoch": 17.84078212290503, + "grad_norm": 0.46553125977516174, + "learning_rate": 0.00010865546218487395, + "loss": 0.4469, + "step": 31935 + }, + { + "epoch": 17.841340782122906, + "grad_norm": 0.5818611979484558, + "learning_rate": 0.00010862745098039217, + "loss": 0.5001, + "step": 31936 + }, + { + "epoch": 17.841899441340782, + "grad_norm": 0.5967891216278076, + "learning_rate": 0.00010859943977591037, + "loss": 0.4058, + "step": 31937 + }, + { + "epoch": 17.84245810055866, + "grad_norm": 0.3642972707748413, + "learning_rate": 0.00010857142857142858, + "loss": 0.4445, + "step": 31938 + }, + { + "epoch": 17.843016759776535, + "grad_norm": 0.7218237519264221, + "learning_rate": 0.00010854341736694678, + "loss": 0.3182, + "step": 31939 + }, + { + "epoch": 17.843575418994412, + "grad_norm": 0.49845090508461, + "learning_rate": 0.00010851540616246499, + "loss": 0.3442, + "step": 31940 + }, + { + "epoch": 17.844134078212292, + "grad_norm": 0.804343044757843, + "learning_rate": 0.0001084873949579832, + "loss": 0.3806, + "step": 31941 + }, + { + "epoch": 17.84469273743017, + "grad_norm": 0.545930802822113, + "learning_rate": 0.00010845938375350141, + "loss": 0.5129, + "step": 31942 + }, + { + "epoch": 17.845251396648045, + "grad_norm": 1.05837082862854, + "learning_rate": 0.00010843137254901961, + "loss": 0.4157, + "step": 31943 + }, + { + "epoch": 17.845810055865922, + "grad_norm": 0.45503243803977966, + "learning_rate": 0.00010840336134453783, + "loss": 0.3442, + "step": 31944 + }, + { + "epoch": 17.8463687150838, + "grad_norm": 0.5501124858856201, + "learning_rate": 0.00010837535014005602, + "loss": 0.3084, + "step": 31945 + }, + { + "epoch": 17.846927374301675, + "grad_norm": 0.3977105915546417, + "learning_rate": 0.00010834733893557424, + "loss": 0.462, + "step": 31946 + }, + { + "epoch": 17.84748603351955, + "grad_norm": 0.5938095450401306, + "learning_rate": 0.00010831932773109244, + "loss": 0.5859, + "step": 31947 + }, + { + "epoch": 17.84804469273743, + "grad_norm": 0.4917389452457428, + "learning_rate": 0.00010829131652661065, + "loss": 0.3658, + "step": 31948 + }, + { + "epoch": 17.84860335195531, + "grad_norm": 11.339869499206543, + "learning_rate": 0.00010826330532212886, + "loss": 0.4489, + "step": 31949 + }, + { + "epoch": 17.849162011173185, + "grad_norm": 0.9019293785095215, + "learning_rate": 0.00010823529411764706, + "loss": 0.5327, + "step": 31950 + }, + { + "epoch": 17.84972067039106, + "grad_norm": 0.5398561358451843, + "learning_rate": 0.00010820728291316527, + "loss": 0.488, + "step": 31951 + }, + { + "epoch": 17.850279329608938, + "grad_norm": 2.950305938720703, + "learning_rate": 0.00010817927170868347, + "loss": 0.3323, + "step": 31952 + }, + { + "epoch": 17.850837988826814, + "grad_norm": 0.4664683938026428, + "learning_rate": 0.00010815126050420168, + "loss": 0.4722, + "step": 31953 + }, + { + "epoch": 17.85139664804469, + "grad_norm": 0.8299731612205505, + "learning_rate": 0.0001081232492997199, + "loss": 0.4752, + "step": 31954 + }, + { + "epoch": 17.85195530726257, + "grad_norm": 0.39772647619247437, + "learning_rate": 0.00010809523809523809, + "loss": 0.4135, + "step": 31955 + }, + { + "epoch": 17.852513966480448, + "grad_norm": 0.9067767262458801, + "learning_rate": 0.00010806722689075631, + "loss": 0.4144, + "step": 31956 + }, + { + "epoch": 17.853072625698324, + "grad_norm": 0.4373841881752014, + "learning_rate": 0.0001080392156862745, + "loss": 0.4244, + "step": 31957 + }, + { + "epoch": 17.8536312849162, + "grad_norm": 0.6203702092170715, + "learning_rate": 0.00010801120448179272, + "loss": 0.5448, + "step": 31958 + }, + { + "epoch": 17.854189944134077, + "grad_norm": 0.40931329131126404, + "learning_rate": 0.00010798319327731093, + "loss": 0.34, + "step": 31959 + }, + { + "epoch": 17.854748603351954, + "grad_norm": 0.42782947421073914, + "learning_rate": 0.00010795518207282914, + "loss": 0.4218, + "step": 31960 + }, + { + "epoch": 17.85530726256983, + "grad_norm": 0.6662193536758423, + "learning_rate": 0.00010792717086834734, + "loss": 0.5633, + "step": 31961 + }, + { + "epoch": 17.85586592178771, + "grad_norm": 0.5704799890518188, + "learning_rate": 0.00010789915966386555, + "loss": 0.3347, + "step": 31962 + }, + { + "epoch": 17.856424581005587, + "grad_norm": 1.324694275856018, + "learning_rate": 0.00010787114845938375, + "loss": 0.4297, + "step": 31963 + }, + { + "epoch": 17.856983240223464, + "grad_norm": 0.4512801468372345, + "learning_rate": 0.00010784313725490197, + "loss": 0.2948, + "step": 31964 + }, + { + "epoch": 17.85754189944134, + "grad_norm": 0.9972048401832581, + "learning_rate": 0.00010781512605042017, + "loss": 0.4534, + "step": 31965 + }, + { + "epoch": 17.858100558659217, + "grad_norm": 0.4238443076610565, + "learning_rate": 0.00010778711484593839, + "loss": 0.3982, + "step": 31966 + }, + { + "epoch": 17.858659217877094, + "grad_norm": 0.5718919038772583, + "learning_rate": 0.00010775910364145658, + "loss": 0.4961, + "step": 31967 + }, + { + "epoch": 17.859217877094974, + "grad_norm": 1.6888715028762817, + "learning_rate": 0.0001077310924369748, + "loss": 0.4457, + "step": 31968 + }, + { + "epoch": 17.85977653631285, + "grad_norm": 0.49498414993286133, + "learning_rate": 0.000107703081232493, + "loss": 0.542, + "step": 31969 + }, + { + "epoch": 17.860335195530727, + "grad_norm": 3.436616897583008, + "learning_rate": 0.00010767507002801121, + "loss": 0.4906, + "step": 31970 + }, + { + "epoch": 17.860893854748603, + "grad_norm": 0.4372938275337219, + "learning_rate": 0.00010764705882352942, + "loss": 0.4205, + "step": 31971 + }, + { + "epoch": 17.86145251396648, + "grad_norm": 0.4418832063674927, + "learning_rate": 0.00010761904761904762, + "loss": 0.3754, + "step": 31972 + }, + { + "epoch": 17.862011173184356, + "grad_norm": 0.9103538393974304, + "learning_rate": 0.00010759103641456583, + "loss": 0.5945, + "step": 31973 + }, + { + "epoch": 17.862569832402233, + "grad_norm": 0.9872821569442749, + "learning_rate": 0.00010756302521008405, + "loss": 0.4801, + "step": 31974 + }, + { + "epoch": 17.863128491620113, + "grad_norm": 0.45777133107185364, + "learning_rate": 0.00010753501400560224, + "loss": 0.381, + "step": 31975 + }, + { + "epoch": 17.86368715083799, + "grad_norm": 0.36995068192481995, + "learning_rate": 0.00010750700280112046, + "loss": 0.3555, + "step": 31976 + }, + { + "epoch": 17.864245810055866, + "grad_norm": 0.6736024022102356, + "learning_rate": 0.00010747899159663865, + "loss": 0.4196, + "step": 31977 + }, + { + "epoch": 17.864804469273743, + "grad_norm": 0.7733534574508667, + "learning_rate": 0.00010745098039215687, + "loss": 0.5822, + "step": 31978 + }, + { + "epoch": 17.86536312849162, + "grad_norm": 0.47300904989242554, + "learning_rate": 0.00010742296918767508, + "loss": 0.463, + "step": 31979 + }, + { + "epoch": 17.865921787709496, + "grad_norm": 0.4311671853065491, + "learning_rate": 0.00010739495798319328, + "loss": 0.3113, + "step": 31980 + }, + { + "epoch": 17.866480446927373, + "grad_norm": 0.5093075633049011, + "learning_rate": 0.00010736694677871149, + "loss": 0.3407, + "step": 31981 + }, + { + "epoch": 17.867039106145253, + "grad_norm": 0.4941078722476959, + "learning_rate": 0.0001073389355742297, + "loss": 0.4441, + "step": 31982 + }, + { + "epoch": 17.86759776536313, + "grad_norm": 0.7564123272895813, + "learning_rate": 0.0001073109243697479, + "loss": 0.3644, + "step": 31983 + }, + { + "epoch": 17.868156424581006, + "grad_norm": 0.4897879660129547, + "learning_rate": 0.00010728291316526611, + "loss": 0.5605, + "step": 31984 + }, + { + "epoch": 17.868715083798882, + "grad_norm": 0.36037081480026245, + "learning_rate": 0.00010725490196078431, + "loss": 0.4279, + "step": 31985 + }, + { + "epoch": 17.86927374301676, + "grad_norm": 1.0760464668273926, + "learning_rate": 0.00010722689075630253, + "loss": 0.4798, + "step": 31986 + }, + { + "epoch": 17.869832402234636, + "grad_norm": 0.39917442202568054, + "learning_rate": 0.00010719887955182073, + "loss": 0.3588, + "step": 31987 + }, + { + "epoch": 17.870391061452516, + "grad_norm": 0.33753713965415955, + "learning_rate": 0.00010717086834733894, + "loss": 0.3469, + "step": 31988 + }, + { + "epoch": 17.870949720670392, + "grad_norm": 0.5130229592323303, + "learning_rate": 0.00010714285714285714, + "loss": 0.431, + "step": 31989 + }, + { + "epoch": 17.87150837988827, + "grad_norm": 0.41078999638557434, + "learning_rate": 0.00010711484593837536, + "loss": 0.44, + "step": 31990 + }, + { + "epoch": 17.872067039106145, + "grad_norm": 1.2140178680419922, + "learning_rate": 0.00010708683473389356, + "loss": 0.3265, + "step": 31991 + }, + { + "epoch": 17.872625698324022, + "grad_norm": 1.0050920248031616, + "learning_rate": 0.00010705882352941177, + "loss": 0.3698, + "step": 31992 + }, + { + "epoch": 17.8731843575419, + "grad_norm": 0.5039964318275452, + "learning_rate": 0.00010703081232492997, + "loss": 0.4237, + "step": 31993 + }, + { + "epoch": 17.873743016759775, + "grad_norm": 0.393161416053772, + "learning_rate": 0.00010700280112044818, + "loss": 0.3062, + "step": 31994 + }, + { + "epoch": 17.874301675977655, + "grad_norm": 0.7035702466964722, + "learning_rate": 0.00010697478991596639, + "loss": 0.4219, + "step": 31995 + }, + { + "epoch": 17.87486033519553, + "grad_norm": 0.40759238600730896, + "learning_rate": 0.0001069467787114846, + "loss": 0.3632, + "step": 31996 + }, + { + "epoch": 17.87541899441341, + "grad_norm": 0.41441529989242554, + "learning_rate": 0.0001069187675070028, + "loss": 0.3941, + "step": 31997 + }, + { + "epoch": 17.875977653631285, + "grad_norm": 0.36333101987838745, + "learning_rate": 0.00010689075630252102, + "loss": 0.4258, + "step": 31998 + }, + { + "epoch": 17.87653631284916, + "grad_norm": 0.34713229537010193, + "learning_rate": 0.00010686274509803921, + "loss": 0.3317, + "step": 31999 + }, + { + "epoch": 17.877094972067038, + "grad_norm": 0.955400824546814, + "learning_rate": 0.00010683473389355743, + "loss": 0.4111, + "step": 32000 + }, + { + "epoch": 17.877094972067038, + "eval_cer": 0.08432893631414139, + "eval_loss": 0.3200739026069641, + "eval_runtime": 55.6201, + "eval_samples_per_second": 81.589, + "eval_steps_per_second": 5.106, + "eval_wer": 0.33325884751822576, + "step": 32000 + }, + { + "epoch": 17.877653631284915, + "grad_norm": 0.4626900851726532, + "learning_rate": 0.00010680672268907564, + "loss": 0.4202, + "step": 32001 + }, + { + "epoch": 17.878212290502795, + "grad_norm": 0.44135206937789917, + "learning_rate": 0.00010677871148459384, + "loss": 0.344, + "step": 32002 + }, + { + "epoch": 17.87877094972067, + "grad_norm": 0.4019858241081238, + "learning_rate": 0.00010675070028011205, + "loss": 0.288, + "step": 32003 + }, + { + "epoch": 17.879329608938548, + "grad_norm": 0.4524330496788025, + "learning_rate": 0.00010672268907563025, + "loss": 0.3672, + "step": 32004 + }, + { + "epoch": 17.879888268156424, + "grad_norm": 0.6566335558891296, + "learning_rate": 0.00010669467787114846, + "loss": 0.4763, + "step": 32005 + }, + { + "epoch": 17.8804469273743, + "grad_norm": 1.3326184749603271, + "learning_rate": 0.00010666666666666668, + "loss": 0.5798, + "step": 32006 + }, + { + "epoch": 17.881005586592178, + "grad_norm": 25.050582885742188, + "learning_rate": 0.00010663865546218487, + "loss": 0.3253, + "step": 32007 + }, + { + "epoch": 17.881564245810054, + "grad_norm": 0.5367122888565063, + "learning_rate": 0.00010661064425770309, + "loss": 0.5938, + "step": 32008 + }, + { + "epoch": 17.882122905027934, + "grad_norm": 0.39557814598083496, + "learning_rate": 0.00010658263305322128, + "loss": 0.4841, + "step": 32009 + }, + { + "epoch": 17.88268156424581, + "grad_norm": 9.567939758300781, + "learning_rate": 0.0001065546218487395, + "loss": 0.4048, + "step": 32010 + }, + { + "epoch": 17.883240223463687, + "grad_norm": 0.53277587890625, + "learning_rate": 0.0001065266106442577, + "loss": 0.6215, + "step": 32011 + }, + { + "epoch": 17.883798882681564, + "grad_norm": 0.419577032327652, + "learning_rate": 0.00010649859943977592, + "loss": 0.4554, + "step": 32012 + }, + { + "epoch": 17.88435754189944, + "grad_norm": 0.4224916994571686, + "learning_rate": 0.00010647058823529412, + "loss": 0.3955, + "step": 32013 + }, + { + "epoch": 17.884916201117317, + "grad_norm": 0.8175129294395447, + "learning_rate": 0.00010644257703081233, + "loss": 0.5371, + "step": 32014 + }, + { + "epoch": 17.885474860335197, + "grad_norm": 0.4805639684200287, + "learning_rate": 0.00010641456582633053, + "loss": 0.3878, + "step": 32015 + }, + { + "epoch": 17.886033519553074, + "grad_norm": 0.450518935918808, + "learning_rate": 0.00010638655462184874, + "loss": 0.3436, + "step": 32016 + }, + { + "epoch": 17.88659217877095, + "grad_norm": 0.43353959918022156, + "learning_rate": 0.00010635854341736695, + "loss": 0.3683, + "step": 32017 + }, + { + "epoch": 17.887150837988827, + "grad_norm": 0.5608108043670654, + "learning_rate": 0.00010633053221288517, + "loss": 0.5416, + "step": 32018 + }, + { + "epoch": 17.887709497206703, + "grad_norm": 0.538255512714386, + "learning_rate": 0.00010630252100840336, + "loss": 0.3851, + "step": 32019 + }, + { + "epoch": 17.88826815642458, + "grad_norm": 0.4306880533695221, + "learning_rate": 0.00010627450980392158, + "loss": 0.3317, + "step": 32020 + }, + { + "epoch": 17.888826815642457, + "grad_norm": 0.6945005655288696, + "learning_rate": 0.00010624649859943977, + "loss": 0.4168, + "step": 32021 + }, + { + "epoch": 17.889385474860337, + "grad_norm": 0.45939913392066956, + "learning_rate": 0.00010621848739495799, + "loss": 0.5134, + "step": 32022 + }, + { + "epoch": 17.889944134078213, + "grad_norm": 0.5442972183227539, + "learning_rate": 0.0001061904761904762, + "loss": 0.453, + "step": 32023 + }, + { + "epoch": 17.89050279329609, + "grad_norm": 0.5479558110237122, + "learning_rate": 0.0001061624649859944, + "loss": 0.4877, + "step": 32024 + }, + { + "epoch": 17.891061452513966, + "grad_norm": 2.9534685611724854, + "learning_rate": 0.00010613445378151261, + "loss": 0.4438, + "step": 32025 + }, + { + "epoch": 17.891620111731843, + "grad_norm": 0.38486379384994507, + "learning_rate": 0.00010610644257703081, + "loss": 0.3996, + "step": 32026 + }, + { + "epoch": 17.89217877094972, + "grad_norm": 0.4887557327747345, + "learning_rate": 0.00010607843137254902, + "loss": 0.3762, + "step": 32027 + }, + { + "epoch": 17.892737430167596, + "grad_norm": 0.40388065576553345, + "learning_rate": 0.00010605042016806724, + "loss": 0.4619, + "step": 32028 + }, + { + "epoch": 17.893296089385476, + "grad_norm": 0.5143378376960754, + "learning_rate": 0.00010602240896358543, + "loss": 0.3742, + "step": 32029 + }, + { + "epoch": 17.893854748603353, + "grad_norm": 0.3665013611316681, + "learning_rate": 0.00010599439775910365, + "loss": 0.4007, + "step": 32030 + }, + { + "epoch": 17.89441340782123, + "grad_norm": 0.3822100758552551, + "learning_rate": 0.00010596638655462184, + "loss": 0.3761, + "step": 32031 + }, + { + "epoch": 17.894972067039106, + "grad_norm": 0.5564274191856384, + "learning_rate": 0.00010593837535014006, + "loss": 0.3406, + "step": 32032 + }, + { + "epoch": 17.895530726256982, + "grad_norm": 0.7217164635658264, + "learning_rate": 0.00010591036414565827, + "loss": 0.3972, + "step": 32033 + }, + { + "epoch": 17.89608938547486, + "grad_norm": 3.5297625064849854, + "learning_rate": 0.00010588235294117647, + "loss": 0.4178, + "step": 32034 + }, + { + "epoch": 17.89664804469274, + "grad_norm": 0.5031899809837341, + "learning_rate": 0.00010585434173669468, + "loss": 0.4507, + "step": 32035 + }, + { + "epoch": 17.897206703910616, + "grad_norm": 1.3595974445343018, + "learning_rate": 0.00010582633053221289, + "loss": 0.3222, + "step": 32036 + }, + { + "epoch": 17.897765363128492, + "grad_norm": 1.0752272605895996, + "learning_rate": 0.00010579831932773109, + "loss": 0.3487, + "step": 32037 + }, + { + "epoch": 17.89832402234637, + "grad_norm": 0.34939366579055786, + "learning_rate": 0.0001057703081232493, + "loss": 0.3818, + "step": 32038 + }, + { + "epoch": 17.898882681564245, + "grad_norm": 0.8856455683708191, + "learning_rate": 0.0001057422969187675, + "loss": 0.3968, + "step": 32039 + }, + { + "epoch": 17.899441340782122, + "grad_norm": 0.5000383257865906, + "learning_rate": 0.00010571428571428572, + "loss": 0.3757, + "step": 32040 + }, + { + "epoch": 17.9, + "grad_norm": 0.5492093563079834, + "learning_rate": 0.00010568627450980392, + "loss": 0.3876, + "step": 32041 + }, + { + "epoch": 17.90055865921788, + "grad_norm": 1.3750840425491333, + "learning_rate": 0.00010565826330532214, + "loss": 0.3885, + "step": 32042 + }, + { + "epoch": 17.901117318435755, + "grad_norm": 0.40190553665161133, + "learning_rate": 0.00010563025210084033, + "loss": 0.3729, + "step": 32043 + }, + { + "epoch": 17.901675977653632, + "grad_norm": 0.40007996559143066, + "learning_rate": 0.00010560224089635855, + "loss": 0.4331, + "step": 32044 + }, + { + "epoch": 17.90223463687151, + "grad_norm": 0.4040498435497284, + "learning_rate": 0.00010557422969187675, + "loss": 0.3885, + "step": 32045 + }, + { + "epoch": 17.902793296089385, + "grad_norm": 0.5825757384300232, + "learning_rate": 0.00010554621848739496, + "loss": 0.3931, + "step": 32046 + }, + { + "epoch": 17.90335195530726, + "grad_norm": 0.7963008880615234, + "learning_rate": 0.00010551820728291317, + "loss": 0.2815, + "step": 32047 + }, + { + "epoch": 17.903910614525138, + "grad_norm": 0.3657061755657196, + "learning_rate": 0.00010549019607843137, + "loss": 0.2593, + "step": 32048 + }, + { + "epoch": 17.904469273743018, + "grad_norm": 0.437903493642807, + "learning_rate": 0.00010546218487394958, + "loss": 0.4883, + "step": 32049 + }, + { + "epoch": 17.905027932960895, + "grad_norm": 0.3962153494358063, + "learning_rate": 0.0001054341736694678, + "loss": 0.3808, + "step": 32050 + }, + { + "epoch": 17.90558659217877, + "grad_norm": 0.554152250289917, + "learning_rate": 0.00010540616246498599, + "loss": 0.4805, + "step": 32051 + }, + { + "epoch": 17.906145251396648, + "grad_norm": 3.1426029205322266, + "learning_rate": 0.00010537815126050421, + "loss": 0.4685, + "step": 32052 + }, + { + "epoch": 17.906703910614524, + "grad_norm": 0.34781739115715027, + "learning_rate": 0.0001053501400560224, + "loss": 0.4164, + "step": 32053 + }, + { + "epoch": 17.9072625698324, + "grad_norm": 0.45156776905059814, + "learning_rate": 0.00010532212885154062, + "loss": 0.3965, + "step": 32054 + }, + { + "epoch": 17.907821229050278, + "grad_norm": 0.37126877903938293, + "learning_rate": 0.00010529411764705883, + "loss": 0.4043, + "step": 32055 + }, + { + "epoch": 17.908379888268158, + "grad_norm": 0.5230785012245178, + "learning_rate": 0.00010526610644257703, + "loss": 0.3767, + "step": 32056 + }, + { + "epoch": 17.908938547486034, + "grad_norm": 0.3856871426105499, + "learning_rate": 0.00010523809523809524, + "loss": 0.3922, + "step": 32057 + }, + { + "epoch": 17.90949720670391, + "grad_norm": 0.46022406220436096, + "learning_rate": 0.00010521008403361345, + "loss": 0.4806, + "step": 32058 + }, + { + "epoch": 17.910055865921787, + "grad_norm": 2.3201136589050293, + "learning_rate": 0.00010518207282913165, + "loss": 0.3462, + "step": 32059 + }, + { + "epoch": 17.910614525139664, + "grad_norm": 0.429340124130249, + "learning_rate": 0.00010515406162464987, + "loss": 0.3291, + "step": 32060 + }, + { + "epoch": 17.91117318435754, + "grad_norm": 0.5402361750602722, + "learning_rate": 0.00010512605042016806, + "loss": 0.4225, + "step": 32061 + }, + { + "epoch": 17.91173184357542, + "grad_norm": 0.5591565370559692, + "learning_rate": 0.00010509803921568628, + "loss": 0.4203, + "step": 32062 + }, + { + "epoch": 17.912290502793297, + "grad_norm": 3.564390182495117, + "learning_rate": 0.00010507002801120448, + "loss": 0.4272, + "step": 32063 + }, + { + "epoch": 17.912849162011174, + "grad_norm": 1.6119402647018433, + "learning_rate": 0.0001050420168067227, + "loss": 0.4188, + "step": 32064 + }, + { + "epoch": 17.91340782122905, + "grad_norm": 0.3713831603527069, + "learning_rate": 0.00010501400560224089, + "loss": 0.387, + "step": 32065 + }, + { + "epoch": 17.913966480446927, + "grad_norm": 0.5590749382972717, + "learning_rate": 0.00010498599439775911, + "loss": 0.3493, + "step": 32066 + }, + { + "epoch": 17.914525139664804, + "grad_norm": 0.6645670533180237, + "learning_rate": 0.00010495798319327731, + "loss": 0.4138, + "step": 32067 + }, + { + "epoch": 17.91508379888268, + "grad_norm": 0.7421476244926453, + "learning_rate": 0.00010492997198879552, + "loss": 0.4337, + "step": 32068 + }, + { + "epoch": 17.91564245810056, + "grad_norm": 0.4942847490310669, + "learning_rate": 0.00010490196078431373, + "loss": 0.441, + "step": 32069 + }, + { + "epoch": 17.916201117318437, + "grad_norm": 0.3272298574447632, + "learning_rate": 0.00010487394957983193, + "loss": 0.3237, + "step": 32070 + }, + { + "epoch": 17.916759776536313, + "grad_norm": 0.4406721293926239, + "learning_rate": 0.00010484593837535014, + "loss": 0.4312, + "step": 32071 + }, + { + "epoch": 17.91731843575419, + "grad_norm": 0.35039588809013367, + "learning_rate": 0.00010481792717086836, + "loss": 0.4062, + "step": 32072 + }, + { + "epoch": 17.917877094972066, + "grad_norm": 0.4460091292858124, + "learning_rate": 0.00010478991596638655, + "loss": 0.421, + "step": 32073 + }, + { + "epoch": 17.918435754189943, + "grad_norm": 0.35254159569740295, + "learning_rate": 0.00010476190476190477, + "loss": 0.3262, + "step": 32074 + }, + { + "epoch": 17.91899441340782, + "grad_norm": 0.5050002932548523, + "learning_rate": 0.00010473389355742296, + "loss": 0.3445, + "step": 32075 + }, + { + "epoch": 17.9195530726257, + "grad_norm": 0.5239331126213074, + "learning_rate": 0.00010470588235294118, + "loss": 0.403, + "step": 32076 + }, + { + "epoch": 17.920111731843576, + "grad_norm": 0.3262229263782501, + "learning_rate": 0.00010467787114845939, + "loss": 0.3974, + "step": 32077 + }, + { + "epoch": 17.920670391061453, + "grad_norm": 6.374235153198242, + "learning_rate": 0.00010464985994397759, + "loss": 0.397, + "step": 32078 + }, + { + "epoch": 17.92122905027933, + "grad_norm": 0.44188499450683594, + "learning_rate": 0.0001046218487394958, + "loss": 0.424, + "step": 32079 + }, + { + "epoch": 17.921787709497206, + "grad_norm": 0.4864714443683624, + "learning_rate": 0.000104593837535014, + "loss": 0.3611, + "step": 32080 + }, + { + "epoch": 17.922346368715083, + "grad_norm": 0.9476110339164734, + "learning_rate": 0.00010456582633053221, + "loss": 0.3961, + "step": 32081 + }, + { + "epoch": 17.922905027932963, + "grad_norm": 0.4864995777606964, + "learning_rate": 0.00010453781512605043, + "loss": 0.3896, + "step": 32082 + }, + { + "epoch": 17.92346368715084, + "grad_norm": 0.5234395265579224, + "learning_rate": 0.00010450980392156862, + "loss": 0.3804, + "step": 32083 + }, + { + "epoch": 17.924022346368716, + "grad_norm": 0.3916802704334259, + "learning_rate": 0.00010448179271708684, + "loss": 0.323, + "step": 32084 + }, + { + "epoch": 17.924581005586592, + "grad_norm": 0.5071191787719727, + "learning_rate": 0.00010445378151260503, + "loss": 0.554, + "step": 32085 + }, + { + "epoch": 17.92513966480447, + "grad_norm": 0.9988527894020081, + "learning_rate": 0.00010442577030812325, + "loss": 0.4345, + "step": 32086 + }, + { + "epoch": 17.925698324022346, + "grad_norm": 0.37349894642829895, + "learning_rate": 0.00010439775910364146, + "loss": 0.3474, + "step": 32087 + }, + { + "epoch": 17.926256983240222, + "grad_norm": 0.5939425826072693, + "learning_rate": 0.00010436974789915967, + "loss": 0.435, + "step": 32088 + }, + { + "epoch": 17.926815642458102, + "grad_norm": 0.46381595730781555, + "learning_rate": 0.00010434173669467787, + "loss": 0.5452, + "step": 32089 + }, + { + "epoch": 17.92737430167598, + "grad_norm": 0.9660611152648926, + "learning_rate": 0.00010431372549019608, + "loss": 0.3852, + "step": 32090 + }, + { + "epoch": 17.927932960893855, + "grad_norm": 0.40508294105529785, + "learning_rate": 0.00010428571428571428, + "loss": 0.4066, + "step": 32091 + }, + { + "epoch": 17.928491620111732, + "grad_norm": 1.4999148845672607, + "learning_rate": 0.00010425770308123249, + "loss": 0.383, + "step": 32092 + }, + { + "epoch": 17.92905027932961, + "grad_norm": 0.41329383850097656, + "learning_rate": 0.0001042296918767507, + "loss": 0.4594, + "step": 32093 + }, + { + "epoch": 17.929608938547485, + "grad_norm": 0.8257381916046143, + "learning_rate": 0.00010420168067226892, + "loss": 0.57, + "step": 32094 + }, + { + "epoch": 17.93016759776536, + "grad_norm": 0.5611152052879333, + "learning_rate": 0.00010417366946778711, + "loss": 0.3729, + "step": 32095 + }, + { + "epoch": 17.93072625698324, + "grad_norm": 0.5553175806999207, + "learning_rate": 0.00010414565826330533, + "loss": 0.491, + "step": 32096 + }, + { + "epoch": 17.93128491620112, + "grad_norm": 0.5101818442344666, + "learning_rate": 0.00010411764705882352, + "loss": 0.4563, + "step": 32097 + }, + { + "epoch": 17.931843575418995, + "grad_norm": 0.42898306250572205, + "learning_rate": 0.00010408963585434174, + "loss": 0.3305, + "step": 32098 + }, + { + "epoch": 17.93240223463687, + "grad_norm": 0.5169159770011902, + "learning_rate": 0.00010406162464985995, + "loss": 0.4026, + "step": 32099 + }, + { + "epoch": 17.932960893854748, + "grad_norm": 0.40212827920913696, + "learning_rate": 0.00010403361344537815, + "loss": 0.4071, + "step": 32100 + }, + { + "epoch": 17.933519553072625, + "grad_norm": 0.5036874413490295, + "learning_rate": 0.00010400560224089636, + "loss": 0.3974, + "step": 32101 + }, + { + "epoch": 17.9340782122905, + "grad_norm": 0.48071178793907166, + "learning_rate": 0.00010397759103641456, + "loss": 0.4066, + "step": 32102 + }, + { + "epoch": 17.93463687150838, + "grad_norm": 4.051095008850098, + "learning_rate": 0.00010394957983193277, + "loss": 0.4658, + "step": 32103 + }, + { + "epoch": 17.935195530726258, + "grad_norm": 0.5427535176277161, + "learning_rate": 0.00010392156862745099, + "loss": 0.48, + "step": 32104 + }, + { + "epoch": 17.935754189944134, + "grad_norm": 0.41416987776756287, + "learning_rate": 0.00010389355742296918, + "loss": 0.3215, + "step": 32105 + }, + { + "epoch": 17.93631284916201, + "grad_norm": 10.936762809753418, + "learning_rate": 0.0001038655462184874, + "loss": 0.3667, + "step": 32106 + }, + { + "epoch": 17.936871508379888, + "grad_norm": 0.6484376192092896, + "learning_rate": 0.0001038375350140056, + "loss": 0.4732, + "step": 32107 + }, + { + "epoch": 17.937430167597764, + "grad_norm": 1.140391230583191, + "learning_rate": 0.00010380952380952381, + "loss": 0.4438, + "step": 32108 + }, + { + "epoch": 17.93798882681564, + "grad_norm": 0.38668641448020935, + "learning_rate": 0.00010378151260504202, + "loss": 0.3633, + "step": 32109 + }, + { + "epoch": 17.93854748603352, + "grad_norm": 0.5950167775154114, + "learning_rate": 0.00010375350140056023, + "loss": 0.411, + "step": 32110 + }, + { + "epoch": 17.939106145251397, + "grad_norm": 0.47897446155548096, + "learning_rate": 0.00010372549019607843, + "loss": 0.3484, + "step": 32111 + }, + { + "epoch": 17.939664804469274, + "grad_norm": 1.1663424968719482, + "learning_rate": 0.00010369747899159664, + "loss": 0.3879, + "step": 32112 + }, + { + "epoch": 17.94022346368715, + "grad_norm": 0.37962576746940613, + "learning_rate": 0.00010366946778711484, + "loss": 0.3883, + "step": 32113 + }, + { + "epoch": 17.940782122905027, + "grad_norm": 0.35679253935813904, + "learning_rate": 0.00010364145658263306, + "loss": 0.4137, + "step": 32114 + }, + { + "epoch": 17.941340782122904, + "grad_norm": 0.5075446963310242, + "learning_rate": 0.00010361344537815126, + "loss": 0.3422, + "step": 32115 + }, + { + "epoch": 17.941899441340784, + "grad_norm": 0.5077576041221619, + "learning_rate": 0.00010358543417366947, + "loss": 0.345, + "step": 32116 + }, + { + "epoch": 17.94245810055866, + "grad_norm": 0.530564546585083, + "learning_rate": 0.00010355742296918767, + "loss": 0.3702, + "step": 32117 + }, + { + "epoch": 17.943016759776537, + "grad_norm": 0.5628470778465271, + "learning_rate": 0.00010352941176470589, + "loss": 0.4678, + "step": 32118 + }, + { + "epoch": 17.943575418994413, + "grad_norm": 0.5565418601036072, + "learning_rate": 0.00010350140056022408, + "loss": 0.431, + "step": 32119 + }, + { + "epoch": 17.94413407821229, + "grad_norm": 0.5384323000907898, + "learning_rate": 0.0001034733893557423, + "loss": 0.4193, + "step": 32120 + }, + { + "epoch": 17.944692737430167, + "grad_norm": 0.3861985504627228, + "learning_rate": 0.0001034453781512605, + "loss": 0.4186, + "step": 32121 + }, + { + "epoch": 17.945251396648043, + "grad_norm": 0.4633825421333313, + "learning_rate": 0.00010341736694677871, + "loss": 0.4426, + "step": 32122 + }, + { + "epoch": 17.945810055865923, + "grad_norm": 0.44751685857772827, + "learning_rate": 0.00010338935574229692, + "loss": 0.4302, + "step": 32123 + }, + { + "epoch": 17.9463687150838, + "grad_norm": 0.5824738144874573, + "learning_rate": 0.00010336134453781512, + "loss": 0.5436, + "step": 32124 + }, + { + "epoch": 17.946927374301676, + "grad_norm": 0.5901375412940979, + "learning_rate": 0.00010333333333333333, + "loss": 0.5259, + "step": 32125 + }, + { + "epoch": 17.947486033519553, + "grad_norm": 0.4049050807952881, + "learning_rate": 0.00010330532212885155, + "loss": 0.379, + "step": 32126 + }, + { + "epoch": 17.94804469273743, + "grad_norm": 0.9458888173103333, + "learning_rate": 0.00010327731092436974, + "loss": 0.6673, + "step": 32127 + }, + { + "epoch": 17.948603351955306, + "grad_norm": 0.46767690777778625, + "learning_rate": 0.00010324929971988796, + "loss": 0.4629, + "step": 32128 + }, + { + "epoch": 17.949162011173183, + "grad_norm": 2.2768640518188477, + "learning_rate": 0.00010322128851540615, + "loss": 0.389, + "step": 32129 + }, + { + "epoch": 17.949720670391063, + "grad_norm": 0.41384831070899963, + "learning_rate": 0.00010319327731092437, + "loss": 0.4262, + "step": 32130 + }, + { + "epoch": 17.95027932960894, + "grad_norm": 1.425829529762268, + "learning_rate": 0.00010316526610644258, + "loss": 0.2834, + "step": 32131 + }, + { + "epoch": 17.950837988826816, + "grad_norm": 0.3054933249950409, + "learning_rate": 0.00010313725490196078, + "loss": 0.2559, + "step": 32132 + }, + { + "epoch": 17.951396648044692, + "grad_norm": 0.4020155966281891, + "learning_rate": 0.00010310924369747899, + "loss": 0.5037, + "step": 32133 + }, + { + "epoch": 17.95195530726257, + "grad_norm": 0.5484250783920288, + "learning_rate": 0.0001030812324929972, + "loss": 0.3176, + "step": 32134 + }, + { + "epoch": 17.952513966480446, + "grad_norm": 0.7302910685539246, + "learning_rate": 0.0001030532212885154, + "loss": 0.5089, + "step": 32135 + }, + { + "epoch": 17.953072625698326, + "grad_norm": 0.46535348892211914, + "learning_rate": 0.00010302521008403362, + "loss": 0.3451, + "step": 32136 + }, + { + "epoch": 17.953631284916202, + "grad_norm": 0.39149177074432373, + "learning_rate": 0.00010299719887955181, + "loss": 0.3416, + "step": 32137 + }, + { + "epoch": 17.95418994413408, + "grad_norm": 0.37270358204841614, + "learning_rate": 0.00010296918767507003, + "loss": 0.3294, + "step": 32138 + }, + { + "epoch": 17.954748603351955, + "grad_norm": 0.4724014401435852, + "learning_rate": 0.00010294117647058823, + "loss": 0.4524, + "step": 32139 + }, + { + "epoch": 17.955307262569832, + "grad_norm": 0.3210068941116333, + "learning_rate": 0.00010291316526610645, + "loss": 0.3241, + "step": 32140 + }, + { + "epoch": 17.95586592178771, + "grad_norm": 7.787713050842285, + "learning_rate": 0.00010288515406162465, + "loss": 0.4483, + "step": 32141 + }, + { + "epoch": 17.956424581005585, + "grad_norm": 0.4520329236984253, + "learning_rate": 0.00010285714285714286, + "loss": 0.3667, + "step": 32142 + }, + { + "epoch": 17.956983240223465, + "grad_norm": 0.6888320446014404, + "learning_rate": 0.00010282913165266106, + "loss": 0.4194, + "step": 32143 + }, + { + "epoch": 17.957541899441342, + "grad_norm": 0.8103201985359192, + "learning_rate": 0.00010280112044817927, + "loss": 0.3727, + "step": 32144 + }, + { + "epoch": 17.95810055865922, + "grad_norm": 0.5739715695381165, + "learning_rate": 0.00010277310924369748, + "loss": 0.4644, + "step": 32145 + }, + { + "epoch": 17.958659217877095, + "grad_norm": 0.44778817892074585, + "learning_rate": 0.0001027450980392157, + "loss": 0.4121, + "step": 32146 + }, + { + "epoch": 17.95921787709497, + "grad_norm": 0.6674813032150269, + "learning_rate": 0.00010271708683473389, + "loss": 0.3549, + "step": 32147 + }, + { + "epoch": 17.959776536312848, + "grad_norm": 0.4502522349357605, + "learning_rate": 0.00010268907563025211, + "loss": 0.4164, + "step": 32148 + }, + { + "epoch": 17.960335195530725, + "grad_norm": 0.41869762539863586, + "learning_rate": 0.0001026610644257703, + "loss": 0.4212, + "step": 32149 + }, + { + "epoch": 17.960893854748605, + "grad_norm": 0.3820033073425293, + "learning_rate": 0.00010263305322128852, + "loss": 0.5288, + "step": 32150 + }, + { + "epoch": 17.96145251396648, + "grad_norm": 0.40751445293426514, + "learning_rate": 0.00010260504201680671, + "loss": 0.3637, + "step": 32151 + }, + { + "epoch": 17.962011173184358, + "grad_norm": 0.5653027296066284, + "learning_rate": 0.00010257703081232493, + "loss": 0.4395, + "step": 32152 + }, + { + "epoch": 17.962569832402234, + "grad_norm": 1.0940889120101929, + "learning_rate": 0.00010254901960784315, + "loss": 0.3925, + "step": 32153 + }, + { + "epoch": 17.96312849162011, + "grad_norm": 0.32194969058036804, + "learning_rate": 0.00010252100840336134, + "loss": 0.3994, + "step": 32154 + }, + { + "epoch": 17.963687150837988, + "grad_norm": 0.5637681484222412, + "learning_rate": 0.00010249299719887956, + "loss": 0.4458, + "step": 32155 + }, + { + "epoch": 17.964245810055864, + "grad_norm": 0.47407469153404236, + "learning_rate": 0.00010246498599439776, + "loss": 0.3285, + "step": 32156 + }, + { + "epoch": 17.964804469273744, + "grad_norm": 0.486642450094223, + "learning_rate": 0.00010243697478991597, + "loss": 0.3451, + "step": 32157 + }, + { + "epoch": 17.96536312849162, + "grad_norm": 0.5967510342597961, + "learning_rate": 0.00010240896358543418, + "loss": 0.3988, + "step": 32158 + }, + { + "epoch": 17.965921787709497, + "grad_norm": 0.3528304994106293, + "learning_rate": 0.00010238095238095239, + "loss": 0.3753, + "step": 32159 + }, + { + "epoch": 17.966480446927374, + "grad_norm": 0.9806635975837708, + "learning_rate": 0.00010235294117647059, + "loss": 0.4167, + "step": 32160 + }, + { + "epoch": 17.96703910614525, + "grad_norm": 0.49239224195480347, + "learning_rate": 0.0001023249299719888, + "loss": 0.4962, + "step": 32161 + }, + { + "epoch": 17.967597765363127, + "grad_norm": 0.47801443934440613, + "learning_rate": 0.000102296918767507, + "loss": 0.3625, + "step": 32162 + }, + { + "epoch": 17.968156424581007, + "grad_norm": 0.5573924779891968, + "learning_rate": 0.00010226890756302522, + "loss": 0.3267, + "step": 32163 + }, + { + "epoch": 17.968715083798884, + "grad_norm": 0.5888733267784119, + "learning_rate": 0.00010224089635854342, + "loss": 0.4356, + "step": 32164 + }, + { + "epoch": 17.96927374301676, + "grad_norm": 0.4991845488548279, + "learning_rate": 0.00010221288515406164, + "loss": 0.3967, + "step": 32165 + }, + { + "epoch": 17.969832402234637, + "grad_norm": 0.6667455434799194, + "learning_rate": 0.00010218487394957983, + "loss": 0.3928, + "step": 32166 + }, + { + "epoch": 17.970391061452514, + "grad_norm": 0.4273636043071747, + "learning_rate": 0.00010215686274509805, + "loss": 0.441, + "step": 32167 + }, + { + "epoch": 17.97094972067039, + "grad_norm": 0.4200524389743805, + "learning_rate": 0.00010212885154061625, + "loss": 0.4129, + "step": 32168 + }, + { + "epoch": 17.971508379888267, + "grad_norm": 0.4013611972332001, + "learning_rate": 0.00010210084033613446, + "loss": 0.3675, + "step": 32169 + }, + { + "epoch": 17.972067039106147, + "grad_norm": 20.487226486206055, + "learning_rate": 0.00010207282913165267, + "loss": 0.3944, + "step": 32170 + }, + { + "epoch": 17.972625698324023, + "grad_norm": 0.5312010049819946, + "learning_rate": 0.00010204481792717087, + "loss": 0.464, + "step": 32171 + }, + { + "epoch": 17.9731843575419, + "grad_norm": 2.0187249183654785, + "learning_rate": 0.00010201680672268908, + "loss": 0.5059, + "step": 32172 + }, + { + "epoch": 17.973743016759776, + "grad_norm": 17.045413970947266, + "learning_rate": 0.0001019887955182073, + "loss": 0.4262, + "step": 32173 + }, + { + "epoch": 17.974301675977653, + "grad_norm": 0.9672741293907166, + "learning_rate": 0.00010196078431372549, + "loss": 0.4437, + "step": 32174 + }, + { + "epoch": 17.97486033519553, + "grad_norm": 0.5415329933166504, + "learning_rate": 0.00010193277310924371, + "loss": 0.3903, + "step": 32175 + }, + { + "epoch": 17.975418994413406, + "grad_norm": 0.6438754796981812, + "learning_rate": 0.0001019047619047619, + "loss": 0.3886, + "step": 32176 + }, + { + "epoch": 17.975977653631286, + "grad_norm": 0.5428248047828674, + "learning_rate": 0.00010187675070028012, + "loss": 0.4948, + "step": 32177 + }, + { + "epoch": 17.976536312849163, + "grad_norm": 0.4975340962409973, + "learning_rate": 0.00010184873949579831, + "loss": 0.3714, + "step": 32178 + }, + { + "epoch": 17.97709497206704, + "grad_norm": 0.35350000858306885, + "learning_rate": 0.00010182072829131653, + "loss": 0.419, + "step": 32179 + }, + { + "epoch": 17.977653631284916, + "grad_norm": 0.4170933663845062, + "learning_rate": 0.00010179271708683474, + "loss": 0.3274, + "step": 32180 + }, + { + "epoch": 17.978212290502793, + "grad_norm": 0.6111339926719666, + "learning_rate": 0.00010176470588235295, + "loss": 0.5101, + "step": 32181 + }, + { + "epoch": 17.97877094972067, + "grad_norm": 0.4280765950679779, + "learning_rate": 0.00010173669467787115, + "loss": 0.3706, + "step": 32182 + }, + { + "epoch": 17.97932960893855, + "grad_norm": 0.4439769983291626, + "learning_rate": 0.00010170868347338936, + "loss": 0.3239, + "step": 32183 + }, + { + "epoch": 17.979888268156426, + "grad_norm": 0.4513072371482849, + "learning_rate": 0.00010168067226890756, + "loss": 0.3322, + "step": 32184 + }, + { + "epoch": 17.980446927374302, + "grad_norm": 0.336851567029953, + "learning_rate": 0.00010165266106442578, + "loss": 0.3525, + "step": 32185 + }, + { + "epoch": 17.98100558659218, + "grad_norm": 0.3594527244567871, + "learning_rate": 0.00010162464985994398, + "loss": 0.2935, + "step": 32186 + }, + { + "epoch": 17.981564245810056, + "grad_norm": 0.4038698077201843, + "learning_rate": 0.0001015966386554622, + "loss": 0.4512, + "step": 32187 + }, + { + "epoch": 17.982122905027932, + "grad_norm": 0.35323983430862427, + "learning_rate": 0.00010156862745098039, + "loss": 0.445, + "step": 32188 + }, + { + "epoch": 17.98268156424581, + "grad_norm": 0.5118697285652161, + "learning_rate": 0.00010154061624649861, + "loss": 0.3577, + "step": 32189 + }, + { + "epoch": 17.98324022346369, + "grad_norm": 0.3399391174316406, + "learning_rate": 0.00010151260504201681, + "loss": 0.381, + "step": 32190 + }, + { + "epoch": 17.983798882681565, + "grad_norm": 0.5384077429771423, + "learning_rate": 0.00010148459383753502, + "loss": 0.3696, + "step": 32191 + }, + { + "epoch": 17.984357541899442, + "grad_norm": 0.5064133405685425, + "learning_rate": 0.00010145658263305323, + "loss": 0.3448, + "step": 32192 + }, + { + "epoch": 17.98491620111732, + "grad_norm": 1.204272985458374, + "learning_rate": 0.00010142857142857143, + "loss": 0.3644, + "step": 32193 + }, + { + "epoch": 17.985474860335195, + "grad_norm": 0.41834303736686707, + "learning_rate": 0.00010140056022408964, + "loss": 0.3743, + "step": 32194 + }, + { + "epoch": 17.98603351955307, + "grad_norm": 0.44735148549079895, + "learning_rate": 0.00010137254901960786, + "loss": 0.5135, + "step": 32195 + }, + { + "epoch": 17.986592178770948, + "grad_norm": 0.3835451602935791, + "learning_rate": 0.00010134453781512605, + "loss": 0.3585, + "step": 32196 + }, + { + "epoch": 17.98715083798883, + "grad_norm": 0.4819982647895813, + "learning_rate": 0.00010131652661064427, + "loss": 0.4432, + "step": 32197 + }, + { + "epoch": 17.987709497206705, + "grad_norm": 0.49629509449005127, + "learning_rate": 0.00010128851540616246, + "loss": 0.5428, + "step": 32198 + }, + { + "epoch": 17.98826815642458, + "grad_norm": 0.508905291557312, + "learning_rate": 0.00010126050420168068, + "loss": 0.3444, + "step": 32199 + }, + { + "epoch": 17.988826815642458, + "grad_norm": 0.64361572265625, + "learning_rate": 0.00010123249299719889, + "loss": 0.4514, + "step": 32200 + }, + { + "epoch": 17.989385474860335, + "grad_norm": 1.2176861763000488, + "learning_rate": 0.00010120448179271709, + "loss": 0.4176, + "step": 32201 + }, + { + "epoch": 17.98994413407821, + "grad_norm": 0.4100797474384308, + "learning_rate": 0.0001011764705882353, + "loss": 0.4212, + "step": 32202 + }, + { + "epoch": 17.990502793296088, + "grad_norm": 0.935918927192688, + "learning_rate": 0.0001011484593837535, + "loss": 0.5361, + "step": 32203 + }, + { + "epoch": 17.991061452513968, + "grad_norm": 1.6712472438812256, + "learning_rate": 0.00010112044817927171, + "loss": 0.4151, + "step": 32204 + }, + { + "epoch": 17.991620111731844, + "grad_norm": 0.5248129367828369, + "learning_rate": 0.00010109243697478992, + "loss": 0.4453, + "step": 32205 + }, + { + "epoch": 17.99217877094972, + "grad_norm": 0.6204363107681274, + "learning_rate": 0.00010106442577030812, + "loss": 0.3319, + "step": 32206 + }, + { + "epoch": 17.992737430167598, + "grad_norm": 0.3578352928161621, + "learning_rate": 0.00010103641456582634, + "loss": 0.3813, + "step": 32207 + }, + { + "epoch": 17.993296089385474, + "grad_norm": 0.4895043671131134, + "learning_rate": 0.00010100840336134453, + "loss": 0.422, + "step": 32208 + }, + { + "epoch": 17.99385474860335, + "grad_norm": 0.48461681604385376, + "learning_rate": 0.00010098039215686275, + "loss": 0.3886, + "step": 32209 + }, + { + "epoch": 17.994413407821227, + "grad_norm": 2.6344857215881348, + "learning_rate": 0.00010095238095238095, + "loss": 0.4594, + "step": 32210 + }, + { + "epoch": 17.994972067039107, + "grad_norm": 0.4512898325920105, + "learning_rate": 0.00010092436974789917, + "loss": 0.4307, + "step": 32211 + }, + { + "epoch": 17.995530726256984, + "grad_norm": 0.5038577318191528, + "learning_rate": 0.00010089635854341737, + "loss": 0.4041, + "step": 32212 + }, + { + "epoch": 17.99608938547486, + "grad_norm": 1.025885820388794, + "learning_rate": 0.00010086834733893558, + "loss": 0.3315, + "step": 32213 + }, + { + "epoch": 17.996648044692737, + "grad_norm": 0.4434353709220886, + "learning_rate": 0.00010084033613445378, + "loss": 0.34, + "step": 32214 + }, + { + "epoch": 17.997206703910614, + "grad_norm": 0.4924229681491852, + "learning_rate": 0.00010081232492997199, + "loss": 0.369, + "step": 32215 + }, + { + "epoch": 17.99776536312849, + "grad_norm": 0.44298720359802246, + "learning_rate": 0.0001007843137254902, + "loss": 0.507, + "step": 32216 + }, + { + "epoch": 17.99832402234637, + "grad_norm": 0.5157743692398071, + "learning_rate": 0.00010075630252100842, + "loss": 0.3637, + "step": 32217 + }, + { + "epoch": 17.998882681564247, + "grad_norm": 0.483979731798172, + "learning_rate": 0.00010072829131652661, + "loss": 0.44, + "step": 32218 + }, + { + "epoch": 17.999441340782123, + "grad_norm": 0.5765401721000671, + "learning_rate": 0.00010070028011204483, + "loss": 0.3873, + "step": 32219 + }, + { + "epoch": 18.0, + "grad_norm": 0.40338215231895447, + "learning_rate": 0.00010067226890756302, + "loss": 0.3496, + "step": 32220 + }, + { + "epoch": 18.000558659217877, + "grad_norm": 0.5964576601982117, + "learning_rate": 0.00010064425770308124, + "loss": 0.3746, + "step": 32221 + }, + { + "epoch": 18.001117318435753, + "grad_norm": 0.6850411295890808, + "learning_rate": 0.00010061624649859945, + "loss": 0.5033, + "step": 32222 + }, + { + "epoch": 18.00167597765363, + "grad_norm": 0.40267351269721985, + "learning_rate": 0.00010058823529411765, + "loss": 0.3866, + "step": 32223 + }, + { + "epoch": 18.00223463687151, + "grad_norm": 0.37789425253868103, + "learning_rate": 0.00010056022408963586, + "loss": 0.4156, + "step": 32224 + }, + { + "epoch": 18.002793296089386, + "grad_norm": 0.3623667061328888, + "learning_rate": 0.00010053221288515406, + "loss": 0.3125, + "step": 32225 + }, + { + "epoch": 18.003351955307263, + "grad_norm": 0.3939913511276245, + "learning_rate": 0.00010050420168067227, + "loss": 0.3503, + "step": 32226 + }, + { + "epoch": 18.00391061452514, + "grad_norm": 0.42091861367225647, + "learning_rate": 0.00010047619047619049, + "loss": 0.346, + "step": 32227 + }, + { + "epoch": 18.004469273743016, + "grad_norm": 0.5072236061096191, + "learning_rate": 0.00010044817927170868, + "loss": 0.4363, + "step": 32228 + }, + { + "epoch": 18.005027932960893, + "grad_norm": 0.4480527341365814, + "learning_rate": 0.0001004201680672269, + "loss": 0.4823, + "step": 32229 + }, + { + "epoch": 18.00558659217877, + "grad_norm": 0.5972398519515991, + "learning_rate": 0.0001003921568627451, + "loss": 0.3489, + "step": 32230 + }, + { + "epoch": 18.00614525139665, + "grad_norm": 0.40248218178749084, + "learning_rate": 0.00010036414565826331, + "loss": 0.3482, + "step": 32231 + }, + { + "epoch": 18.006703910614526, + "grad_norm": 0.4852919578552246, + "learning_rate": 0.0001003361344537815, + "loss": 0.4764, + "step": 32232 + }, + { + "epoch": 18.007262569832402, + "grad_norm": 0.4575750231742859, + "learning_rate": 0.00010030812324929973, + "loss": 0.4602, + "step": 32233 + }, + { + "epoch": 18.00782122905028, + "grad_norm": 0.3594995141029358, + "learning_rate": 0.00010028011204481793, + "loss": 0.3459, + "step": 32234 + }, + { + "epoch": 18.008379888268156, + "grad_norm": 0.4226916432380676, + "learning_rate": 0.00010025210084033614, + "loss": 0.2825, + "step": 32235 + }, + { + "epoch": 18.008938547486032, + "grad_norm": 0.4638039171695709, + "learning_rate": 0.00010022408963585434, + "loss": 0.4521, + "step": 32236 + }, + { + "epoch": 18.009497206703912, + "grad_norm": 1.3567938804626465, + "learning_rate": 0.00010019607843137255, + "loss": 0.4194, + "step": 32237 + }, + { + "epoch": 18.01005586592179, + "grad_norm": 0.49408772587776184, + "learning_rate": 0.00010016806722689076, + "loss": 0.3871, + "step": 32238 + }, + { + "epoch": 18.010614525139665, + "grad_norm": 0.48068350553512573, + "learning_rate": 0.00010014005602240897, + "loss": 0.4438, + "step": 32239 + }, + { + "epoch": 18.011173184357542, + "grad_norm": 0.3814902603626251, + "learning_rate": 0.00010011204481792717, + "loss": 0.3139, + "step": 32240 + }, + { + "epoch": 18.01173184357542, + "grad_norm": 0.5485551357269287, + "learning_rate": 0.00010008403361344539, + "loss": 0.5443, + "step": 32241 + }, + { + "epoch": 18.012290502793295, + "grad_norm": 0.3657400906085968, + "learning_rate": 0.00010005602240896358, + "loss": 0.3701, + "step": 32242 + }, + { + "epoch": 18.01284916201117, + "grad_norm": 0.3984507620334625, + "learning_rate": 0.0001000280112044818, + "loss": 0.376, + "step": 32243 + }, + { + "epoch": 18.013407821229052, + "grad_norm": 0.3243999779224396, + "learning_rate": 0.0001, + "loss": 0.3586, + "step": 32244 + }, + { + "epoch": 18.01396648044693, + "grad_norm": 0.37670835852622986, + "learning_rate": 9.997198879551821e-05, + "loss": 0.5086, + "step": 32245 + }, + { + "epoch": 18.014525139664805, + "grad_norm": 1.4647576808929443, + "learning_rate": 9.994397759103642e-05, + "loss": 0.4316, + "step": 32246 + }, + { + "epoch": 18.01508379888268, + "grad_norm": 0.42384421825408936, + "learning_rate": 9.991596638655462e-05, + "loss": 0.3776, + "step": 32247 + }, + { + "epoch": 18.015642458100558, + "grad_norm": 0.5377728343009949, + "learning_rate": 9.988795518207283e-05, + "loss": 0.3353, + "step": 32248 + }, + { + "epoch": 18.016201117318435, + "grad_norm": 0.4751221239566803, + "learning_rate": 9.985994397759105e-05, + "loss": 0.427, + "step": 32249 + }, + { + "epoch": 18.01675977653631, + "grad_norm": 0.43127840757369995, + "learning_rate": 9.983193277310924e-05, + "loss": 0.3993, + "step": 32250 + }, + { + "epoch": 18.01731843575419, + "grad_norm": 0.6855712532997131, + "learning_rate": 9.980392156862746e-05, + "loss": 0.4527, + "step": 32251 + }, + { + "epoch": 18.017877094972068, + "grad_norm": 0.5313059687614441, + "learning_rate": 9.977591036414565e-05, + "loss": 0.3849, + "step": 32252 + }, + { + "epoch": 18.018435754189944, + "grad_norm": 0.6236023306846619, + "learning_rate": 9.974789915966387e-05, + "loss": 0.4618, + "step": 32253 + }, + { + "epoch": 18.01899441340782, + "grad_norm": 0.7799429297447205, + "learning_rate": 9.971988795518208e-05, + "loss": 0.3672, + "step": 32254 + }, + { + "epoch": 18.019553072625698, + "grad_norm": 0.5411490797996521, + "learning_rate": 9.969187675070028e-05, + "loss": 0.4196, + "step": 32255 + }, + { + "epoch": 18.020111731843574, + "grad_norm": 0.3924950659275055, + "learning_rate": 9.966386554621849e-05, + "loss": 0.3716, + "step": 32256 + }, + { + "epoch": 18.02067039106145, + "grad_norm": 0.579119861125946, + "learning_rate": 9.96358543417367e-05, + "loss": 0.3804, + "step": 32257 + }, + { + "epoch": 18.02122905027933, + "grad_norm": 0.3702141046524048, + "learning_rate": 9.96078431372549e-05, + "loss": 0.3669, + "step": 32258 + }, + { + "epoch": 18.021787709497207, + "grad_norm": 1.1192439794540405, + "learning_rate": 9.957983193277311e-05, + "loss": 0.352, + "step": 32259 + }, + { + "epoch": 18.022346368715084, + "grad_norm": 10.042969703674316, + "learning_rate": 9.955182072829131e-05, + "loss": 0.3675, + "step": 32260 + }, + { + "epoch": 18.02290502793296, + "grad_norm": 0.515103816986084, + "learning_rate": 9.952380952380953e-05, + "loss": 0.4227, + "step": 32261 + }, + { + "epoch": 18.023463687150837, + "grad_norm": 0.5099522471427917, + "learning_rate": 9.949579831932773e-05, + "loss": 0.4808, + "step": 32262 + }, + { + "epoch": 18.024022346368714, + "grad_norm": 0.40187525749206543, + "learning_rate": 9.946778711484595e-05, + "loss": 0.4333, + "step": 32263 + }, + { + "epoch": 18.024581005586594, + "grad_norm": 0.45274391770362854, + "learning_rate": 9.943977591036414e-05, + "loss": 0.3857, + "step": 32264 + }, + { + "epoch": 18.02513966480447, + "grad_norm": 0.524215042591095, + "learning_rate": 9.941176470588236e-05, + "loss": 0.3685, + "step": 32265 + }, + { + "epoch": 18.025698324022347, + "grad_norm": 0.46615076065063477, + "learning_rate": 9.938375350140056e-05, + "loss": 0.2647, + "step": 32266 + }, + { + "epoch": 18.026256983240224, + "grad_norm": 0.54038405418396, + "learning_rate": 9.935574229691877e-05, + "loss": 0.3059, + "step": 32267 + }, + { + "epoch": 18.0268156424581, + "grad_norm": 0.4917943477630615, + "learning_rate": 9.932773109243698e-05, + "loss": 0.3303, + "step": 32268 + }, + { + "epoch": 18.027374301675977, + "grad_norm": 0.4269266426563263, + "learning_rate": 9.929971988795518e-05, + "loss": 0.4245, + "step": 32269 + }, + { + "epoch": 18.027932960893853, + "grad_norm": 0.5459792017936707, + "learning_rate": 9.927170868347339e-05, + "loss": 0.4266, + "step": 32270 + }, + { + "epoch": 18.028491620111733, + "grad_norm": 0.3995327353477478, + "learning_rate": 9.924369747899161e-05, + "loss": 0.4168, + "step": 32271 + }, + { + "epoch": 18.02905027932961, + "grad_norm": 0.527877688407898, + "learning_rate": 9.92156862745098e-05, + "loss": 0.4083, + "step": 32272 + }, + { + "epoch": 18.029608938547486, + "grad_norm": 0.3832007348537445, + "learning_rate": 9.918767507002802e-05, + "loss": 0.2715, + "step": 32273 + }, + { + "epoch": 18.030167597765363, + "grad_norm": 0.3343893587589264, + "learning_rate": 9.915966386554621e-05, + "loss": 0.3998, + "step": 32274 + }, + { + "epoch": 18.03072625698324, + "grad_norm": 0.43357473611831665, + "learning_rate": 9.913165266106443e-05, + "loss": 0.4659, + "step": 32275 + }, + { + "epoch": 18.031284916201116, + "grad_norm": 1.858503818511963, + "learning_rate": 9.910364145658264e-05, + "loss": 0.3403, + "step": 32276 + }, + { + "epoch": 18.031843575418993, + "grad_norm": 1.3414182662963867, + "learning_rate": 9.907563025210084e-05, + "loss": 0.4059, + "step": 32277 + }, + { + "epoch": 18.032402234636873, + "grad_norm": 0.45445460081100464, + "learning_rate": 9.904761904761905e-05, + "loss": 0.4071, + "step": 32278 + }, + { + "epoch": 18.03296089385475, + "grad_norm": 0.5012571811676025, + "learning_rate": 9.901960784313726e-05, + "loss": 0.5003, + "step": 32279 + }, + { + "epoch": 18.033519553072626, + "grad_norm": 0.6706053018569946, + "learning_rate": 9.899159663865546e-05, + "loss": 0.4059, + "step": 32280 + }, + { + "epoch": 18.034078212290503, + "grad_norm": 0.9332179427146912, + "learning_rate": 9.896358543417368e-05, + "loss": 0.4242, + "step": 32281 + }, + { + "epoch": 18.03463687150838, + "grad_norm": 0.46652886271476746, + "learning_rate": 9.893557422969187e-05, + "loss": 0.3825, + "step": 32282 + }, + { + "epoch": 18.035195530726256, + "grad_norm": 0.32998818159103394, + "learning_rate": 9.890756302521009e-05, + "loss": 0.3812, + "step": 32283 + }, + { + "epoch": 18.035754189944136, + "grad_norm": 0.5732955932617188, + "learning_rate": 9.887955182072829e-05, + "loss": 0.3729, + "step": 32284 + }, + { + "epoch": 18.036312849162012, + "grad_norm": 0.4227480888366699, + "learning_rate": 9.88515406162465e-05, + "loss": 0.4074, + "step": 32285 + }, + { + "epoch": 18.03687150837989, + "grad_norm": 0.7531979084014893, + "learning_rate": 9.88235294117647e-05, + "loss": 0.4275, + "step": 32286 + }, + { + "epoch": 18.037430167597766, + "grad_norm": 0.7109859585762024, + "learning_rate": 9.879551820728292e-05, + "loss": 0.3545, + "step": 32287 + }, + { + "epoch": 18.037988826815642, + "grad_norm": 0.48333683609962463, + "learning_rate": 9.876750700280112e-05, + "loss": 0.3377, + "step": 32288 + }, + { + "epoch": 18.03854748603352, + "grad_norm": 0.3968415856361389, + "learning_rate": 9.873949579831933e-05, + "loss": 0.4417, + "step": 32289 + }, + { + "epoch": 18.039106145251395, + "grad_norm": 0.3953340947628021, + "learning_rate": 9.871148459383753e-05, + "loss": 0.2911, + "step": 32290 + }, + { + "epoch": 18.039664804469275, + "grad_norm": 0.5416538715362549, + "learning_rate": 9.868347338935574e-05, + "loss": 0.4176, + "step": 32291 + }, + { + "epoch": 18.040223463687152, + "grad_norm": 0.44783201813697815, + "learning_rate": 9.865546218487395e-05, + "loss": 0.4059, + "step": 32292 + }, + { + "epoch": 18.04078212290503, + "grad_norm": 3.4399209022521973, + "learning_rate": 9.862745098039217e-05, + "loss": 0.4041, + "step": 32293 + }, + { + "epoch": 18.041340782122905, + "grad_norm": 0.4822849929332733, + "learning_rate": 9.859943977591036e-05, + "loss": 0.4081, + "step": 32294 + }, + { + "epoch": 18.04189944134078, + "grad_norm": 2.9424662590026855, + "learning_rate": 9.857142857142858e-05, + "loss": 0.3402, + "step": 32295 + }, + { + "epoch": 18.042458100558658, + "grad_norm": 1.5867671966552734, + "learning_rate": 9.854341736694677e-05, + "loss": 0.5788, + "step": 32296 + }, + { + "epoch": 18.043016759776535, + "grad_norm": 0.5507910251617432, + "learning_rate": 9.851540616246499e-05, + "loss": 0.6523, + "step": 32297 + }, + { + "epoch": 18.043575418994415, + "grad_norm": 0.9780528545379639, + "learning_rate": 9.84873949579832e-05, + "loss": 0.397, + "step": 32298 + }, + { + "epoch": 18.04413407821229, + "grad_norm": 0.4034658372402191, + "learning_rate": 9.84593837535014e-05, + "loss": 0.3861, + "step": 32299 + }, + { + "epoch": 18.044692737430168, + "grad_norm": 0.3881433308124542, + "learning_rate": 9.843137254901961e-05, + "loss": 0.3836, + "step": 32300 + }, + { + "epoch": 18.045251396648045, + "grad_norm": 0.7576314806938171, + "learning_rate": 9.840336134453781e-05, + "loss": 0.3992, + "step": 32301 + }, + { + "epoch": 18.04581005586592, + "grad_norm": 0.43571993708610535, + "learning_rate": 9.837535014005602e-05, + "loss": 0.4538, + "step": 32302 + }, + { + "epoch": 18.046368715083798, + "grad_norm": 0.3823072016239166, + "learning_rate": 9.834733893557424e-05, + "loss": 0.4308, + "step": 32303 + }, + { + "epoch": 18.046927374301674, + "grad_norm": 0.5230814814567566, + "learning_rate": 9.831932773109243e-05, + "loss": 0.3813, + "step": 32304 + }, + { + "epoch": 18.047486033519554, + "grad_norm": 0.45276939868927, + "learning_rate": 9.829131652661065e-05, + "loss": 0.3408, + "step": 32305 + }, + { + "epoch": 18.04804469273743, + "grad_norm": 0.4107702076435089, + "learning_rate": 9.826330532212884e-05, + "loss": 0.4543, + "step": 32306 + }, + { + "epoch": 18.048603351955308, + "grad_norm": 0.5349020957946777, + "learning_rate": 9.823529411764706e-05, + "loss": 0.3946, + "step": 32307 + }, + { + "epoch": 18.049162011173184, + "grad_norm": 0.49025577306747437, + "learning_rate": 9.820728291316527e-05, + "loss": 0.4312, + "step": 32308 + }, + { + "epoch": 18.04972067039106, + "grad_norm": 0.5161467790603638, + "learning_rate": 9.817927170868348e-05, + "loss": 0.4998, + "step": 32309 + }, + { + "epoch": 18.050279329608937, + "grad_norm": 0.3578868508338928, + "learning_rate": 9.815126050420168e-05, + "loss": 0.3209, + "step": 32310 + }, + { + "epoch": 18.050837988826817, + "grad_norm": 0.5973392128944397, + "learning_rate": 9.812324929971989e-05, + "loss": 0.3495, + "step": 32311 + }, + { + "epoch": 18.051396648044694, + "grad_norm": 0.3100230097770691, + "learning_rate": 9.80952380952381e-05, + "loss": 0.3758, + "step": 32312 + }, + { + "epoch": 18.05195530726257, + "grad_norm": 0.39064323902130127, + "learning_rate": 9.806722689075631e-05, + "loss": 0.3605, + "step": 32313 + }, + { + "epoch": 18.052513966480447, + "grad_norm": 0.35504016280174255, + "learning_rate": 9.80392156862745e-05, + "loss": 0.323, + "step": 32314 + }, + { + "epoch": 18.053072625698324, + "grad_norm": 1.5023446083068848, + "learning_rate": 9.801120448179273e-05, + "loss": 0.3491, + "step": 32315 + }, + { + "epoch": 18.0536312849162, + "grad_norm": 0.4155273139476776, + "learning_rate": 9.798319327731092e-05, + "loss": 0.4041, + "step": 32316 + }, + { + "epoch": 18.054189944134077, + "grad_norm": 0.5000426173210144, + "learning_rate": 9.795518207282914e-05, + "loss": 0.3484, + "step": 32317 + }, + { + "epoch": 18.054748603351957, + "grad_norm": 0.49676889181137085, + "learning_rate": 9.792717086834733e-05, + "loss": 0.465, + "step": 32318 + }, + { + "epoch": 18.055307262569833, + "grad_norm": 0.3657989203929901, + "learning_rate": 9.789915966386555e-05, + "loss": 0.3029, + "step": 32319 + }, + { + "epoch": 18.05586592178771, + "grad_norm": 0.34014958143234253, + "learning_rate": 9.787114845938376e-05, + "loss": 0.3663, + "step": 32320 + }, + { + "epoch": 18.056424581005587, + "grad_norm": 0.4739871621131897, + "learning_rate": 9.784313725490196e-05, + "loss": 0.4468, + "step": 32321 + }, + { + "epoch": 18.056983240223463, + "grad_norm": 0.5413164496421814, + "learning_rate": 9.781512605042017e-05, + "loss": 0.4806, + "step": 32322 + }, + { + "epoch": 18.05754189944134, + "grad_norm": 1.307497262954712, + "learning_rate": 9.778711484593837e-05, + "loss": 0.36, + "step": 32323 + }, + { + "epoch": 18.058100558659216, + "grad_norm": 3.2699074745178223, + "learning_rate": 9.775910364145658e-05, + "loss": 0.4242, + "step": 32324 + }, + { + "epoch": 18.058659217877096, + "grad_norm": 0.7847439050674438, + "learning_rate": 9.77310924369748e-05, + "loss": 0.3199, + "step": 32325 + }, + { + "epoch": 18.059217877094973, + "grad_norm": 0.5128061771392822, + "learning_rate": 9.770308123249299e-05, + "loss": 0.3954, + "step": 32326 + }, + { + "epoch": 18.05977653631285, + "grad_norm": 0.7440681457519531, + "learning_rate": 9.767507002801121e-05, + "loss": 0.548, + "step": 32327 + }, + { + "epoch": 18.060335195530726, + "grad_norm": 0.3531847298145294, + "learning_rate": 9.76470588235294e-05, + "loss": 0.3961, + "step": 32328 + }, + { + "epoch": 18.060893854748603, + "grad_norm": 0.5368345975875854, + "learning_rate": 9.761904761904762e-05, + "loss": 0.5148, + "step": 32329 + }, + { + "epoch": 18.06145251396648, + "grad_norm": 0.3315516412258148, + "learning_rate": 9.759103641456583e-05, + "loss": 0.287, + "step": 32330 + }, + { + "epoch": 18.062011173184356, + "grad_norm": 0.35034263134002686, + "learning_rate": 9.756302521008403e-05, + "loss": 0.3237, + "step": 32331 + }, + { + "epoch": 18.062569832402236, + "grad_norm": 0.7380197644233704, + "learning_rate": 9.753501400560224e-05, + "loss": 0.5109, + "step": 32332 + }, + { + "epoch": 18.063128491620112, + "grad_norm": 1.6325008869171143, + "learning_rate": 9.750700280112045e-05, + "loss": 0.4722, + "step": 32333 + }, + { + "epoch": 18.06368715083799, + "grad_norm": 0.6074326634407043, + "learning_rate": 9.747899159663865e-05, + "loss": 0.3467, + "step": 32334 + }, + { + "epoch": 18.064245810055866, + "grad_norm": 2.7419750690460205, + "learning_rate": 9.745098039215687e-05, + "loss": 0.4185, + "step": 32335 + }, + { + "epoch": 18.064804469273742, + "grad_norm": 0.4355568587779999, + "learning_rate": 9.742296918767506e-05, + "loss": 0.4237, + "step": 32336 + }, + { + "epoch": 18.06536312849162, + "grad_norm": 0.4071156084537506, + "learning_rate": 9.739495798319328e-05, + "loss": 0.3871, + "step": 32337 + }, + { + "epoch": 18.0659217877095, + "grad_norm": 0.4159044921398163, + "learning_rate": 9.736694677871148e-05, + "loss": 0.428, + "step": 32338 + }, + { + "epoch": 18.066480446927375, + "grad_norm": 0.4854905903339386, + "learning_rate": 9.73389355742297e-05, + "loss": 0.3825, + "step": 32339 + }, + { + "epoch": 18.067039106145252, + "grad_norm": 0.7821348309516907, + "learning_rate": 9.73109243697479e-05, + "loss": 0.437, + "step": 32340 + }, + { + "epoch": 18.06759776536313, + "grad_norm": 0.320843368768692, + "learning_rate": 9.728291316526611e-05, + "loss": 0.3524, + "step": 32341 + }, + { + "epoch": 18.068156424581005, + "grad_norm": 0.4091688096523285, + "learning_rate": 9.725490196078431e-05, + "loss": 0.4402, + "step": 32342 + }, + { + "epoch": 18.06871508379888, + "grad_norm": 0.4603174030780792, + "learning_rate": 9.722689075630252e-05, + "loss": 0.3619, + "step": 32343 + }, + { + "epoch": 18.06927374301676, + "grad_norm": 0.538090705871582, + "learning_rate": 9.719887955182073e-05, + "loss": 0.458, + "step": 32344 + }, + { + "epoch": 18.06983240223464, + "grad_norm": 0.5462102293968201, + "learning_rate": 9.717086834733893e-05, + "loss": 0.3489, + "step": 32345 + }, + { + "epoch": 18.070391061452515, + "grad_norm": 0.3876274824142456, + "learning_rate": 9.714285714285714e-05, + "loss": 0.4015, + "step": 32346 + }, + { + "epoch": 18.07094972067039, + "grad_norm": 0.5038914084434509, + "learning_rate": 9.711484593837536e-05, + "loss": 0.3956, + "step": 32347 + }, + { + "epoch": 18.071508379888268, + "grad_norm": 0.343403697013855, + "learning_rate": 9.708683473389355e-05, + "loss": 0.3298, + "step": 32348 + }, + { + "epoch": 18.072067039106145, + "grad_norm": 0.39973217248916626, + "learning_rate": 9.705882352941177e-05, + "loss": 0.3596, + "step": 32349 + }, + { + "epoch": 18.07262569832402, + "grad_norm": 0.7613192796707153, + "learning_rate": 9.703081232492996e-05, + "loss": 0.4188, + "step": 32350 + }, + { + "epoch": 18.073184357541898, + "grad_norm": 5.275942325592041, + "learning_rate": 9.700280112044818e-05, + "loss": 0.3812, + "step": 32351 + }, + { + "epoch": 18.073743016759778, + "grad_norm": 0.4313063323497772, + "learning_rate": 9.697478991596639e-05, + "loss": 0.3734, + "step": 32352 + }, + { + "epoch": 18.074301675977654, + "grad_norm": 1.448196530342102, + "learning_rate": 9.69467787114846e-05, + "loss": 0.4771, + "step": 32353 + }, + { + "epoch": 18.07486033519553, + "grad_norm": 3.522810459136963, + "learning_rate": 9.69187675070028e-05, + "loss": 0.4753, + "step": 32354 + }, + { + "epoch": 18.075418994413408, + "grad_norm": 1.010437250137329, + "learning_rate": 9.6890756302521e-05, + "loss": 0.352, + "step": 32355 + }, + { + "epoch": 18.075977653631284, + "grad_norm": 0.553568959236145, + "learning_rate": 9.686274509803921e-05, + "loss": 0.4939, + "step": 32356 + }, + { + "epoch": 18.07653631284916, + "grad_norm": 0.5555480718612671, + "learning_rate": 9.683473389355743e-05, + "loss": 0.4298, + "step": 32357 + }, + { + "epoch": 18.07709497206704, + "grad_norm": 0.43442219495773315, + "learning_rate": 9.680672268907562e-05, + "loss": 0.4521, + "step": 32358 + }, + { + "epoch": 18.077653631284917, + "grad_norm": 0.6569821238517761, + "learning_rate": 9.677871148459384e-05, + "loss": 0.4076, + "step": 32359 + }, + { + "epoch": 18.078212290502794, + "grad_norm": 0.6165484189987183, + "learning_rate": 9.675070028011204e-05, + "loss": 0.3306, + "step": 32360 + }, + { + "epoch": 18.07877094972067, + "grad_norm": 0.44647157192230225, + "learning_rate": 9.672268907563026e-05, + "loss": 0.342, + "step": 32361 + }, + { + "epoch": 18.079329608938547, + "grad_norm": 0.7071517705917358, + "learning_rate": 9.669467787114846e-05, + "loss": 0.3781, + "step": 32362 + }, + { + "epoch": 18.079888268156424, + "grad_norm": 0.47947704792022705, + "learning_rate": 9.666666666666667e-05, + "loss": 0.3816, + "step": 32363 + }, + { + "epoch": 18.0804469273743, + "grad_norm": 0.3952135145664215, + "learning_rate": 9.663865546218487e-05, + "loss": 0.3865, + "step": 32364 + }, + { + "epoch": 18.08100558659218, + "grad_norm": 0.7275920510292053, + "learning_rate": 9.661064425770308e-05, + "loss": 0.3736, + "step": 32365 + }, + { + "epoch": 18.081564245810057, + "grad_norm": 0.3496969938278198, + "learning_rate": 9.658263305322129e-05, + "loss": 0.4123, + "step": 32366 + }, + { + "epoch": 18.082122905027934, + "grad_norm": 0.31497928500175476, + "learning_rate": 9.65546218487395e-05, + "loss": 0.3428, + "step": 32367 + }, + { + "epoch": 18.08268156424581, + "grad_norm": 0.3751422166824341, + "learning_rate": 9.65266106442577e-05, + "loss": 0.4505, + "step": 32368 + }, + { + "epoch": 18.083240223463687, + "grad_norm": 0.6251440644264221, + "learning_rate": 9.649859943977592e-05, + "loss": 0.3481, + "step": 32369 + }, + { + "epoch": 18.083798882681563, + "grad_norm": 0.3780922293663025, + "learning_rate": 9.647058823529411e-05, + "loss": 0.4113, + "step": 32370 + }, + { + "epoch": 18.08435754189944, + "grad_norm": 0.5271783471107483, + "learning_rate": 9.644257703081233e-05, + "loss": 0.4537, + "step": 32371 + }, + { + "epoch": 18.08491620111732, + "grad_norm": 4.852962970733643, + "learning_rate": 9.641456582633052e-05, + "loss": 0.4273, + "step": 32372 + }, + { + "epoch": 18.085474860335196, + "grad_norm": 0.39356476068496704, + "learning_rate": 9.638655462184874e-05, + "loss": 0.37, + "step": 32373 + }, + { + "epoch": 18.086033519553073, + "grad_norm": 0.44131913781166077, + "learning_rate": 9.635854341736695e-05, + "loss": 0.3359, + "step": 32374 + }, + { + "epoch": 18.08659217877095, + "grad_norm": 2.905324697494507, + "learning_rate": 9.633053221288515e-05, + "loss": 0.3475, + "step": 32375 + }, + { + "epoch": 18.087150837988826, + "grad_norm": 0.49924376606941223, + "learning_rate": 9.630252100840336e-05, + "loss": 0.3393, + "step": 32376 + }, + { + "epoch": 18.087709497206703, + "grad_norm": 0.650465190410614, + "learning_rate": 9.627450980392156e-05, + "loss": 0.4384, + "step": 32377 + }, + { + "epoch": 18.08826815642458, + "grad_norm": 0.35724392533302307, + "learning_rate": 9.624649859943977e-05, + "loss": 0.4068, + "step": 32378 + }, + { + "epoch": 18.08882681564246, + "grad_norm": 0.5715853571891785, + "learning_rate": 9.621848739495799e-05, + "loss": 0.4898, + "step": 32379 + }, + { + "epoch": 18.089385474860336, + "grad_norm": 0.4248448312282562, + "learning_rate": 9.619047619047618e-05, + "loss": 0.4238, + "step": 32380 + }, + { + "epoch": 18.089944134078213, + "grad_norm": 0.7967003583908081, + "learning_rate": 9.61624649859944e-05, + "loss": 0.6419, + "step": 32381 + }, + { + "epoch": 18.09050279329609, + "grad_norm": 0.6587761640548706, + "learning_rate": 9.61344537815126e-05, + "loss": 0.3594, + "step": 32382 + }, + { + "epoch": 18.091061452513966, + "grad_norm": 0.5312923192977905, + "learning_rate": 9.610644257703081e-05, + "loss": 0.4188, + "step": 32383 + }, + { + "epoch": 18.091620111731842, + "grad_norm": 0.585952639579773, + "learning_rate": 9.607843137254903e-05, + "loss": 0.3851, + "step": 32384 + }, + { + "epoch": 18.092178770949722, + "grad_norm": 0.33357542753219604, + "learning_rate": 9.605042016806723e-05, + "loss": 0.3851, + "step": 32385 + }, + { + "epoch": 18.0927374301676, + "grad_norm": 0.39248207211494446, + "learning_rate": 9.602240896358545e-05, + "loss": 0.3166, + "step": 32386 + }, + { + "epoch": 18.093296089385476, + "grad_norm": 0.5831889510154724, + "learning_rate": 9.599439775910364e-05, + "loss": 0.4307, + "step": 32387 + }, + { + "epoch": 18.093854748603352, + "grad_norm": 0.4109219014644623, + "learning_rate": 9.596638655462186e-05, + "loss": 0.4067, + "step": 32388 + }, + { + "epoch": 18.09441340782123, + "grad_norm": 0.4178137183189392, + "learning_rate": 9.593837535014006e-05, + "loss": 0.3455, + "step": 32389 + }, + { + "epoch": 18.094972067039105, + "grad_norm": 0.620341420173645, + "learning_rate": 9.591036414565827e-05, + "loss": 0.5119, + "step": 32390 + }, + { + "epoch": 18.095530726256982, + "grad_norm": 0.467006117105484, + "learning_rate": 9.588235294117648e-05, + "loss": 0.3469, + "step": 32391 + }, + { + "epoch": 18.096089385474862, + "grad_norm": 0.6388922929763794, + "learning_rate": 9.585434173669468e-05, + "loss": 0.4681, + "step": 32392 + }, + { + "epoch": 18.09664804469274, + "grad_norm": 0.9081668853759766, + "learning_rate": 9.582633053221289e-05, + "loss": 0.4724, + "step": 32393 + }, + { + "epoch": 18.097206703910615, + "grad_norm": 0.4373238682746887, + "learning_rate": 9.579831932773111e-05, + "loss": 0.3631, + "step": 32394 + }, + { + "epoch": 18.09776536312849, + "grad_norm": 0.4496663510799408, + "learning_rate": 9.57703081232493e-05, + "loss": 0.3552, + "step": 32395 + }, + { + "epoch": 18.098324022346368, + "grad_norm": 0.47216156125068665, + "learning_rate": 9.574229691876752e-05, + "loss": 0.3933, + "step": 32396 + }, + { + "epoch": 18.098882681564245, + "grad_norm": 0.34115511178970337, + "learning_rate": 9.571428571428571e-05, + "loss": 0.443, + "step": 32397 + }, + { + "epoch": 18.09944134078212, + "grad_norm": 0.6326836347579956, + "learning_rate": 9.568627450980393e-05, + "loss": 0.4772, + "step": 32398 + }, + { + "epoch": 18.1, + "grad_norm": 0.45279860496520996, + "learning_rate": 9.565826330532212e-05, + "loss": 0.4123, + "step": 32399 + }, + { + "epoch": 18.100558659217878, + "grad_norm": 0.5163799524307251, + "learning_rate": 9.563025210084034e-05, + "loss": 0.3916, + "step": 32400 + }, + { + "epoch": 18.101117318435755, + "grad_norm": 0.43643999099731445, + "learning_rate": 9.560224089635855e-05, + "loss": 0.4468, + "step": 32401 + }, + { + "epoch": 18.10167597765363, + "grad_norm": 0.4300617277622223, + "learning_rate": 9.557422969187676e-05, + "loss": 0.3304, + "step": 32402 + }, + { + "epoch": 18.102234636871508, + "grad_norm": 0.35711267590522766, + "learning_rate": 9.554621848739496e-05, + "loss": 0.3385, + "step": 32403 + }, + { + "epoch": 18.102793296089384, + "grad_norm": 0.6354407072067261, + "learning_rate": 9.551820728291317e-05, + "loss": 0.4069, + "step": 32404 + }, + { + "epoch": 18.10335195530726, + "grad_norm": 0.4284682273864746, + "learning_rate": 9.549019607843137e-05, + "loss": 0.5048, + "step": 32405 + }, + { + "epoch": 18.10391061452514, + "grad_norm": 0.3715086877346039, + "learning_rate": 9.546218487394959e-05, + "loss": 0.3421, + "step": 32406 + }, + { + "epoch": 18.104469273743018, + "grad_norm": 0.425657719373703, + "learning_rate": 9.543417366946779e-05, + "loss": 0.4341, + "step": 32407 + }, + { + "epoch": 18.105027932960894, + "grad_norm": 0.46359753608703613, + "learning_rate": 9.5406162464986e-05, + "loss": 0.4554, + "step": 32408 + }, + { + "epoch": 18.10558659217877, + "grad_norm": 0.3752211928367615, + "learning_rate": 9.53781512605042e-05, + "loss": 0.3942, + "step": 32409 + }, + { + "epoch": 18.106145251396647, + "grad_norm": 1.7721015214920044, + "learning_rate": 9.535014005602242e-05, + "loss": 0.4794, + "step": 32410 + }, + { + "epoch": 18.106703910614524, + "grad_norm": 0.801632285118103, + "learning_rate": 9.532212885154062e-05, + "loss": 0.6042, + "step": 32411 + }, + { + "epoch": 18.107262569832404, + "grad_norm": 0.4083986282348633, + "learning_rate": 9.529411764705883e-05, + "loss": 0.4206, + "step": 32412 + }, + { + "epoch": 18.10782122905028, + "grad_norm": 0.6771788597106934, + "learning_rate": 9.526610644257703e-05, + "loss": 0.6182, + "step": 32413 + }, + { + "epoch": 18.108379888268157, + "grad_norm": 0.4505431652069092, + "learning_rate": 9.523809523809524e-05, + "loss": 0.4314, + "step": 32414 + }, + { + "epoch": 18.108938547486034, + "grad_norm": 0.6466163992881775, + "learning_rate": 9.521008403361345e-05, + "loss": 0.5239, + "step": 32415 + }, + { + "epoch": 18.10949720670391, + "grad_norm": 0.4537881910800934, + "learning_rate": 9.518207282913167e-05, + "loss": 0.3854, + "step": 32416 + }, + { + "epoch": 18.110055865921787, + "grad_norm": 0.5449619889259338, + "learning_rate": 9.515406162464986e-05, + "loss": 0.4495, + "step": 32417 + }, + { + "epoch": 18.110614525139663, + "grad_norm": 0.5234614014625549, + "learning_rate": 9.512605042016808e-05, + "loss": 0.4212, + "step": 32418 + }, + { + "epoch": 18.111173184357543, + "grad_norm": 0.3781912326812744, + "learning_rate": 9.509803921568627e-05, + "loss": 0.3492, + "step": 32419 + }, + { + "epoch": 18.11173184357542, + "grad_norm": 0.6956316232681274, + "learning_rate": 9.507002801120449e-05, + "loss": 0.4208, + "step": 32420 + }, + { + "epoch": 18.112290502793297, + "grad_norm": 0.3979493975639343, + "learning_rate": 9.50420168067227e-05, + "loss": 0.4432, + "step": 32421 + }, + { + "epoch": 18.112849162011173, + "grad_norm": 0.41402044892311096, + "learning_rate": 9.50140056022409e-05, + "loss": 0.4491, + "step": 32422 + }, + { + "epoch": 18.11340782122905, + "grad_norm": 0.49965232610702515, + "learning_rate": 9.498599439775911e-05, + "loss": 0.4174, + "step": 32423 + }, + { + "epoch": 18.113966480446926, + "grad_norm": 0.6669294834136963, + "learning_rate": 9.495798319327731e-05, + "loss": 0.4533, + "step": 32424 + }, + { + "epoch": 18.114525139664803, + "grad_norm": 0.41609206795692444, + "learning_rate": 9.492997198879552e-05, + "loss": 0.3756, + "step": 32425 + }, + { + "epoch": 18.115083798882683, + "grad_norm": 0.4448949098587036, + "learning_rate": 9.490196078431373e-05, + "loss": 0.3953, + "step": 32426 + }, + { + "epoch": 18.11564245810056, + "grad_norm": 0.7255361080169678, + "learning_rate": 9.487394957983193e-05, + "loss": 0.4487, + "step": 32427 + }, + { + "epoch": 18.116201117318436, + "grad_norm": 0.413948655128479, + "learning_rate": 9.484593837535015e-05, + "loss": 0.3719, + "step": 32428 + }, + { + "epoch": 18.116759776536313, + "grad_norm": 0.323770135641098, + "learning_rate": 9.481792717086834e-05, + "loss": 0.4094, + "step": 32429 + }, + { + "epoch": 18.11731843575419, + "grad_norm": 0.3582766354084015, + "learning_rate": 9.478991596638656e-05, + "loss": 0.3807, + "step": 32430 + }, + { + "epoch": 18.117877094972066, + "grad_norm": 0.3958209156990051, + "learning_rate": 9.476190476190476e-05, + "loss": 0.5188, + "step": 32431 + }, + { + "epoch": 18.118435754189946, + "grad_norm": 0.7040544748306274, + "learning_rate": 9.473389355742298e-05, + "loss": 0.406, + "step": 32432 + }, + { + "epoch": 18.118994413407822, + "grad_norm": 0.40433672070503235, + "learning_rate": 9.470588235294118e-05, + "loss": 0.3748, + "step": 32433 + }, + { + "epoch": 18.1195530726257, + "grad_norm": 0.5301108360290527, + "learning_rate": 9.467787114845939e-05, + "loss": 0.3411, + "step": 32434 + }, + { + "epoch": 18.120111731843576, + "grad_norm": 0.6196574568748474, + "learning_rate": 9.46498599439776e-05, + "loss": 0.4587, + "step": 32435 + }, + { + "epoch": 18.120670391061452, + "grad_norm": 1.0553373098373413, + "learning_rate": 9.46218487394958e-05, + "loss": 0.5373, + "step": 32436 + }, + { + "epoch": 18.12122905027933, + "grad_norm": 0.4461023509502411, + "learning_rate": 9.4593837535014e-05, + "loss": 0.3963, + "step": 32437 + }, + { + "epoch": 18.121787709497205, + "grad_norm": 0.45119306445121765, + "learning_rate": 9.456582633053223e-05, + "loss": 0.4016, + "step": 32438 + }, + { + "epoch": 18.122346368715085, + "grad_norm": 0.3909936547279358, + "learning_rate": 9.453781512605042e-05, + "loss": 0.3992, + "step": 32439 + }, + { + "epoch": 18.122905027932962, + "grad_norm": 0.4895066022872925, + "learning_rate": 9.450980392156864e-05, + "loss": 0.4642, + "step": 32440 + }, + { + "epoch": 18.12346368715084, + "grad_norm": 0.31271567940711975, + "learning_rate": 9.448179271708683e-05, + "loss": 0.3423, + "step": 32441 + }, + { + "epoch": 18.124022346368715, + "grad_norm": 0.6508842706680298, + "learning_rate": 9.445378151260505e-05, + "loss": 0.6018, + "step": 32442 + }, + { + "epoch": 18.12458100558659, + "grad_norm": 0.49566635489463806, + "learning_rate": 9.442577030812326e-05, + "loss": 0.3558, + "step": 32443 + }, + { + "epoch": 18.12513966480447, + "grad_norm": 0.4299832582473755, + "learning_rate": 9.439775910364146e-05, + "loss": 0.4237, + "step": 32444 + }, + { + "epoch": 18.125698324022345, + "grad_norm": 0.45366498827934265, + "learning_rate": 9.436974789915967e-05, + "loss": 0.4452, + "step": 32445 + }, + { + "epoch": 18.126256983240225, + "grad_norm": 0.9918420910835266, + "learning_rate": 9.434173669467787e-05, + "loss": 0.3383, + "step": 32446 + }, + { + "epoch": 18.1268156424581, + "grad_norm": 0.41790103912353516, + "learning_rate": 9.431372549019608e-05, + "loss": 0.3641, + "step": 32447 + }, + { + "epoch": 18.127374301675978, + "grad_norm": 0.5692659616470337, + "learning_rate": 9.42857142857143e-05, + "loss": 0.3654, + "step": 32448 + }, + { + "epoch": 18.127932960893855, + "grad_norm": 0.38307905197143555, + "learning_rate": 9.425770308123249e-05, + "loss": 0.4579, + "step": 32449 + }, + { + "epoch": 18.12849162011173, + "grad_norm": 0.7055395245552063, + "learning_rate": 9.422969187675071e-05, + "loss": 0.4169, + "step": 32450 + }, + { + "epoch": 18.129050279329608, + "grad_norm": 0.42380666732788086, + "learning_rate": 9.42016806722689e-05, + "loss": 0.3665, + "step": 32451 + }, + { + "epoch": 18.129608938547484, + "grad_norm": 0.4086686968803406, + "learning_rate": 9.417366946778712e-05, + "loss": 0.492, + "step": 32452 + }, + { + "epoch": 18.130167597765364, + "grad_norm": 0.47182878851890564, + "learning_rate": 9.414565826330533e-05, + "loss": 0.4582, + "step": 32453 + }, + { + "epoch": 18.13072625698324, + "grad_norm": 0.8682714700698853, + "learning_rate": 9.411764705882353e-05, + "loss": 0.4316, + "step": 32454 + }, + { + "epoch": 18.131284916201118, + "grad_norm": 0.603193461894989, + "learning_rate": 9.408963585434174e-05, + "loss": 0.3247, + "step": 32455 + }, + { + "epoch": 18.131843575418994, + "grad_norm": 0.38381659984588623, + "learning_rate": 9.406162464985995e-05, + "loss": 0.3178, + "step": 32456 + }, + { + "epoch": 18.13240223463687, + "grad_norm": 0.46145394444465637, + "learning_rate": 9.403361344537815e-05, + "loss": 0.3245, + "step": 32457 + }, + { + "epoch": 18.132960893854747, + "grad_norm": 0.5114855170249939, + "learning_rate": 9.400560224089636e-05, + "loss": 0.4095, + "step": 32458 + }, + { + "epoch": 18.133519553072627, + "grad_norm": 1.6515815258026123, + "learning_rate": 9.397759103641456e-05, + "loss": 0.3389, + "step": 32459 + }, + { + "epoch": 18.134078212290504, + "grad_norm": 1.1794813871383667, + "learning_rate": 9.394957983193278e-05, + "loss": 0.5584, + "step": 32460 + }, + { + "epoch": 18.13463687150838, + "grad_norm": 0.583577573299408, + "learning_rate": 9.392156862745098e-05, + "loss": 0.4741, + "step": 32461 + }, + { + "epoch": 18.135195530726257, + "grad_norm": 0.40958139300346375, + "learning_rate": 9.38935574229692e-05, + "loss": 0.4346, + "step": 32462 + }, + { + "epoch": 18.135754189944134, + "grad_norm": 0.36646702885627747, + "learning_rate": 9.386554621848739e-05, + "loss": 0.38, + "step": 32463 + }, + { + "epoch": 18.13631284916201, + "grad_norm": 0.7411330938339233, + "learning_rate": 9.383753501400561e-05, + "loss": 0.4981, + "step": 32464 + }, + { + "epoch": 18.136871508379887, + "grad_norm": 0.3926258981227875, + "learning_rate": 9.380952380952381e-05, + "loss": 0.3915, + "step": 32465 + }, + { + "epoch": 18.137430167597767, + "grad_norm": 0.37541258335113525, + "learning_rate": 9.378151260504202e-05, + "loss": 0.2965, + "step": 32466 + }, + { + "epoch": 18.137988826815644, + "grad_norm": 0.4508364200592041, + "learning_rate": 9.375350140056023e-05, + "loss": 0.3885, + "step": 32467 + }, + { + "epoch": 18.13854748603352, + "grad_norm": 1.0271037817001343, + "learning_rate": 9.372549019607843e-05, + "loss": 0.3198, + "step": 32468 + }, + { + "epoch": 18.139106145251397, + "grad_norm": 1.0580130815505981, + "learning_rate": 9.369747899159664e-05, + "loss": 0.3248, + "step": 32469 + }, + { + "epoch": 18.139664804469273, + "grad_norm": 0.5313414335250854, + "learning_rate": 9.366946778711486e-05, + "loss": 0.3455, + "step": 32470 + }, + { + "epoch": 18.14022346368715, + "grad_norm": 0.4863787293434143, + "learning_rate": 9.364145658263305e-05, + "loss": 0.3814, + "step": 32471 + }, + { + "epoch": 18.140782122905026, + "grad_norm": 0.48073557019233704, + "learning_rate": 9.361344537815127e-05, + "loss": 0.3625, + "step": 32472 + }, + { + "epoch": 18.141340782122906, + "grad_norm": 0.37851858139038086, + "learning_rate": 9.358543417366946e-05, + "loss": 0.3364, + "step": 32473 + }, + { + "epoch": 18.141899441340783, + "grad_norm": 0.32191070914268494, + "learning_rate": 9.355742296918768e-05, + "loss": 0.3259, + "step": 32474 + }, + { + "epoch": 18.14245810055866, + "grad_norm": 0.40157049894332886, + "learning_rate": 9.352941176470589e-05, + "loss": 0.4028, + "step": 32475 + }, + { + "epoch": 18.143016759776536, + "grad_norm": 0.3746364712715149, + "learning_rate": 9.35014005602241e-05, + "loss": 0.3802, + "step": 32476 + }, + { + "epoch": 18.143575418994413, + "grad_norm": 3.2074031829833984, + "learning_rate": 9.34733893557423e-05, + "loss": 0.3821, + "step": 32477 + }, + { + "epoch": 18.14413407821229, + "grad_norm": 0.8361067175865173, + "learning_rate": 9.34453781512605e-05, + "loss": 0.3681, + "step": 32478 + }, + { + "epoch": 18.144692737430166, + "grad_norm": 0.4876335561275482, + "learning_rate": 9.341736694677871e-05, + "loss": 0.5789, + "step": 32479 + }, + { + "epoch": 18.145251396648046, + "grad_norm": 0.46530160307884216, + "learning_rate": 9.338935574229693e-05, + "loss": 0.4168, + "step": 32480 + }, + { + "epoch": 18.145810055865923, + "grad_norm": 0.40378502011299133, + "learning_rate": 9.336134453781512e-05, + "loss": 0.4796, + "step": 32481 + }, + { + "epoch": 18.1463687150838, + "grad_norm": 0.9662461876869202, + "learning_rate": 9.333333333333334e-05, + "loss": 0.3917, + "step": 32482 + }, + { + "epoch": 18.146927374301676, + "grad_norm": 0.33864277601242065, + "learning_rate": 9.330532212885154e-05, + "loss": 0.3652, + "step": 32483 + }, + { + "epoch": 18.147486033519552, + "grad_norm": 0.603958010673523, + "learning_rate": 9.327731092436976e-05, + "loss": 0.5275, + "step": 32484 + }, + { + "epoch": 18.14804469273743, + "grad_norm": 0.36513668298721313, + "learning_rate": 9.324929971988795e-05, + "loss": 0.4782, + "step": 32485 + }, + { + "epoch": 18.14860335195531, + "grad_norm": 1.4098597764968872, + "learning_rate": 9.322128851540617e-05, + "loss": 0.4092, + "step": 32486 + }, + { + "epoch": 18.149162011173186, + "grad_norm": 0.35510867834091187, + "learning_rate": 9.319327731092437e-05, + "loss": 0.3667, + "step": 32487 + }, + { + "epoch": 18.149720670391062, + "grad_norm": 0.9086071848869324, + "learning_rate": 9.316526610644258e-05, + "loss": 0.467, + "step": 32488 + }, + { + "epoch": 18.15027932960894, + "grad_norm": 0.4618314504623413, + "learning_rate": 9.313725490196079e-05, + "loss": 0.375, + "step": 32489 + }, + { + "epoch": 18.150837988826815, + "grad_norm": 0.649321973323822, + "learning_rate": 9.310924369747899e-05, + "loss": 0.3544, + "step": 32490 + }, + { + "epoch": 18.15139664804469, + "grad_norm": 2.0096356868743896, + "learning_rate": 9.30812324929972e-05, + "loss": 0.499, + "step": 32491 + }, + { + "epoch": 18.15195530726257, + "grad_norm": 0.5533978343009949, + "learning_rate": 9.305322128851542e-05, + "loss": 0.4398, + "step": 32492 + }, + { + "epoch": 18.15251396648045, + "grad_norm": 0.6605199575424194, + "learning_rate": 9.302521008403361e-05, + "loss": 0.3721, + "step": 32493 + }, + { + "epoch": 18.153072625698325, + "grad_norm": 0.3735016882419586, + "learning_rate": 9.299719887955183e-05, + "loss": 0.4213, + "step": 32494 + }, + { + "epoch": 18.1536312849162, + "grad_norm": 0.42000138759613037, + "learning_rate": 9.296918767507002e-05, + "loss": 0.4794, + "step": 32495 + }, + { + "epoch": 18.154189944134078, + "grad_norm": 0.48943063616752625, + "learning_rate": 9.294117647058824e-05, + "loss": 0.3607, + "step": 32496 + }, + { + "epoch": 18.154748603351955, + "grad_norm": 1.0324535369873047, + "learning_rate": 9.291316526610645e-05, + "loss": 0.4688, + "step": 32497 + }, + { + "epoch": 18.15530726256983, + "grad_norm": 0.5306296348571777, + "learning_rate": 9.288515406162465e-05, + "loss": 0.3205, + "step": 32498 + }, + { + "epoch": 18.155865921787708, + "grad_norm": 0.38911643624305725, + "learning_rate": 9.285714285714286e-05, + "loss": 0.3856, + "step": 32499 + }, + { + "epoch": 18.156424581005588, + "grad_norm": 0.5162615776062012, + "learning_rate": 9.282913165266106e-05, + "loss": 0.3852, + "step": 32500 + }, + { + "epoch": 18.156424581005588, + "eval_cer": 0.08433439167294032, + "eval_loss": 0.32063376903533936, + "eval_runtime": 55.2078, + "eval_samples_per_second": 82.199, + "eval_steps_per_second": 5.144, + "eval_wer": 0.3336498980475406, + "step": 32500 + }, + { + "epoch": 18.156983240223465, + "grad_norm": 0.5900794863700867, + "learning_rate": 9.280112044817927e-05, + "loss": 0.3841, + "step": 32501 + }, + { + "epoch": 18.15754189944134, + "grad_norm": 0.37070396542549133, + "learning_rate": 9.277310924369749e-05, + "loss": 0.3839, + "step": 32502 + }, + { + "epoch": 18.158100558659218, + "grad_norm": 0.4954962134361267, + "learning_rate": 9.274509803921568e-05, + "loss": 0.545, + "step": 32503 + }, + { + "epoch": 18.158659217877094, + "grad_norm": 0.6245876550674438, + "learning_rate": 9.27170868347339e-05, + "loss": 0.3945, + "step": 32504 + }, + { + "epoch": 18.15921787709497, + "grad_norm": 0.6478275656700134, + "learning_rate": 9.26890756302521e-05, + "loss": 0.4555, + "step": 32505 + }, + { + "epoch": 18.159776536312847, + "grad_norm": 0.36954963207244873, + "learning_rate": 9.266106442577031e-05, + "loss": 0.4065, + "step": 32506 + }, + { + "epoch": 18.160335195530728, + "grad_norm": 0.4671505093574524, + "learning_rate": 9.263305322128852e-05, + "loss": 0.4342, + "step": 32507 + }, + { + "epoch": 18.160893854748604, + "grad_norm": 0.641503095626831, + "learning_rate": 9.260504201680673e-05, + "loss": 0.461, + "step": 32508 + }, + { + "epoch": 18.16145251396648, + "grad_norm": 0.47220394015312195, + "learning_rate": 9.257703081232493e-05, + "loss": 0.3333, + "step": 32509 + }, + { + "epoch": 18.162011173184357, + "grad_norm": 0.38197147846221924, + "learning_rate": 9.254901960784314e-05, + "loss": 0.3194, + "step": 32510 + }, + { + "epoch": 18.162569832402234, + "grad_norm": 1.2994736433029175, + "learning_rate": 9.252100840336134e-05, + "loss": 0.4188, + "step": 32511 + }, + { + "epoch": 18.16312849162011, + "grad_norm": 0.3727455735206604, + "learning_rate": 9.249299719887955e-05, + "loss": 0.4084, + "step": 32512 + }, + { + "epoch": 18.16368715083799, + "grad_norm": 0.4289976954460144, + "learning_rate": 9.246498599439776e-05, + "loss": 0.4244, + "step": 32513 + }, + { + "epoch": 18.164245810055867, + "grad_norm": 1.45867121219635, + "learning_rate": 9.243697478991598e-05, + "loss": 0.3903, + "step": 32514 + }, + { + "epoch": 18.164804469273744, + "grad_norm": 1.9103114604949951, + "learning_rate": 9.240896358543417e-05, + "loss": 0.3926, + "step": 32515 + }, + { + "epoch": 18.16536312849162, + "grad_norm": 0.8467742800712585, + "learning_rate": 9.238095238095239e-05, + "loss": 0.4844, + "step": 32516 + }, + { + "epoch": 18.165921787709497, + "grad_norm": 0.5672905445098877, + "learning_rate": 9.235294117647058e-05, + "loss": 0.4825, + "step": 32517 + }, + { + "epoch": 18.166480446927373, + "grad_norm": 0.35855451226234436, + "learning_rate": 9.23249299719888e-05, + "loss": 0.3361, + "step": 32518 + }, + { + "epoch": 18.16703910614525, + "grad_norm": 0.3326796889305115, + "learning_rate": 9.2296918767507e-05, + "loss": 0.3375, + "step": 32519 + }, + { + "epoch": 18.16759776536313, + "grad_norm": 0.525824785232544, + "learning_rate": 9.226890756302521e-05, + "loss": 0.3427, + "step": 32520 + }, + { + "epoch": 18.168156424581007, + "grad_norm": 0.40844160318374634, + "learning_rate": 9.224089635854342e-05, + "loss": 0.3608, + "step": 32521 + }, + { + "epoch": 18.168715083798883, + "grad_norm": 0.5018882751464844, + "learning_rate": 9.221288515406162e-05, + "loss": 0.4218, + "step": 32522 + }, + { + "epoch": 18.16927374301676, + "grad_norm": 0.38595661520957947, + "learning_rate": 9.218487394957983e-05, + "loss": 0.342, + "step": 32523 + }, + { + "epoch": 18.169832402234636, + "grad_norm": 0.49344679713249207, + "learning_rate": 9.215686274509805e-05, + "loss": 0.43, + "step": 32524 + }, + { + "epoch": 18.170391061452513, + "grad_norm": 0.3859495520591736, + "learning_rate": 9.212885154061624e-05, + "loss": 0.3672, + "step": 32525 + }, + { + "epoch": 18.17094972067039, + "grad_norm": 0.5541929006576538, + "learning_rate": 9.210084033613446e-05, + "loss": 0.6609, + "step": 32526 + }, + { + "epoch": 18.17150837988827, + "grad_norm": 0.422832190990448, + "learning_rate": 9.207282913165265e-05, + "loss": 0.4374, + "step": 32527 + }, + { + "epoch": 18.172067039106146, + "grad_norm": 0.3607363998889923, + "learning_rate": 9.204481792717087e-05, + "loss": 0.4006, + "step": 32528 + }, + { + "epoch": 18.172625698324023, + "grad_norm": 0.4760386347770691, + "learning_rate": 9.201680672268908e-05, + "loss": 0.4676, + "step": 32529 + }, + { + "epoch": 18.1731843575419, + "grad_norm": 0.5883450508117676, + "learning_rate": 9.198879551820729e-05, + "loss": 0.3944, + "step": 32530 + }, + { + "epoch": 18.173743016759776, + "grad_norm": 0.4508742690086365, + "learning_rate": 9.196078431372549e-05, + "loss": 0.4697, + "step": 32531 + }, + { + "epoch": 18.174301675977652, + "grad_norm": 1.72762131690979, + "learning_rate": 9.19327731092437e-05, + "loss": 0.3853, + "step": 32532 + }, + { + "epoch": 18.174860335195532, + "grad_norm": 0.3518361449241638, + "learning_rate": 9.19047619047619e-05, + "loss": 0.3747, + "step": 32533 + }, + { + "epoch": 18.17541899441341, + "grad_norm": 3.5360472202301025, + "learning_rate": 9.187675070028012e-05, + "loss": 0.5803, + "step": 32534 + }, + { + "epoch": 18.175977653631286, + "grad_norm": 1.5024994611740112, + "learning_rate": 9.184873949579832e-05, + "loss": 0.3919, + "step": 32535 + }, + { + "epoch": 18.176536312849162, + "grad_norm": 0.675117015838623, + "learning_rate": 9.182072829131653e-05, + "loss": 0.3196, + "step": 32536 + }, + { + "epoch": 18.17709497206704, + "grad_norm": 0.470883846282959, + "learning_rate": 9.179271708683473e-05, + "loss": 0.3848, + "step": 32537 + }, + { + "epoch": 18.177653631284915, + "grad_norm": 0.5163912177085876, + "learning_rate": 9.176470588235295e-05, + "loss": 0.471, + "step": 32538 + }, + { + "epoch": 18.178212290502792, + "grad_norm": 1.051300287246704, + "learning_rate": 9.173669467787114e-05, + "loss": 0.4844, + "step": 32539 + }, + { + "epoch": 18.178770949720672, + "grad_norm": 0.41967248916625977, + "learning_rate": 9.170868347338936e-05, + "loss": 0.4125, + "step": 32540 + }, + { + "epoch": 18.17932960893855, + "grad_norm": 6.672290802001953, + "learning_rate": 9.168067226890756e-05, + "loss": 0.3738, + "step": 32541 + }, + { + "epoch": 18.179888268156425, + "grad_norm": 0.3774382770061493, + "learning_rate": 9.165266106442577e-05, + "loss": 0.424, + "step": 32542 + }, + { + "epoch": 18.1804469273743, + "grad_norm": 0.4206157624721527, + "learning_rate": 9.162464985994398e-05, + "loss": 0.3353, + "step": 32543 + }, + { + "epoch": 18.18100558659218, + "grad_norm": 0.4097682535648346, + "learning_rate": 9.159663865546218e-05, + "loss": 0.3576, + "step": 32544 + }, + { + "epoch": 18.181564245810055, + "grad_norm": 0.9245181679725647, + "learning_rate": 9.156862745098039e-05, + "loss": 0.4094, + "step": 32545 + }, + { + "epoch": 18.18212290502793, + "grad_norm": 0.3932160437107086, + "learning_rate": 9.154061624649861e-05, + "loss": 0.3837, + "step": 32546 + }, + { + "epoch": 18.18268156424581, + "grad_norm": 0.4510837197303772, + "learning_rate": 9.15126050420168e-05, + "loss": 0.4662, + "step": 32547 + }, + { + "epoch": 18.183240223463688, + "grad_norm": 0.7100754976272583, + "learning_rate": 9.148459383753502e-05, + "loss": 0.4582, + "step": 32548 + }, + { + "epoch": 18.183798882681565, + "grad_norm": 0.5350834131240845, + "learning_rate": 9.145658263305321e-05, + "loss": 0.3508, + "step": 32549 + }, + { + "epoch": 18.18435754189944, + "grad_norm": 1.6078065633773804, + "learning_rate": 9.142857142857143e-05, + "loss": 0.3739, + "step": 32550 + }, + { + "epoch": 18.184916201117318, + "grad_norm": 0.5379326343536377, + "learning_rate": 9.140056022408964e-05, + "loss": 0.394, + "step": 32551 + }, + { + "epoch": 18.185474860335194, + "grad_norm": 0.44442981481552124, + "learning_rate": 9.137254901960784e-05, + "loss": 0.489, + "step": 32552 + }, + { + "epoch": 18.18603351955307, + "grad_norm": 0.38975608348846436, + "learning_rate": 9.134453781512605e-05, + "loss": 0.4322, + "step": 32553 + }, + { + "epoch": 18.18659217877095, + "grad_norm": 0.5542870163917542, + "learning_rate": 9.131652661064426e-05, + "loss": 0.419, + "step": 32554 + }, + { + "epoch": 18.187150837988828, + "grad_norm": 0.5849865078926086, + "learning_rate": 9.128851540616246e-05, + "loss": 0.5205, + "step": 32555 + }, + { + "epoch": 18.187709497206704, + "grad_norm": 0.37388792634010315, + "learning_rate": 9.126050420168068e-05, + "loss": 0.3099, + "step": 32556 + }, + { + "epoch": 18.18826815642458, + "grad_norm": 0.6174965500831604, + "learning_rate": 9.123249299719887e-05, + "loss": 0.2984, + "step": 32557 + }, + { + "epoch": 18.188826815642457, + "grad_norm": 0.30588021874427795, + "learning_rate": 9.12044817927171e-05, + "loss": 0.2786, + "step": 32558 + }, + { + "epoch": 18.189385474860334, + "grad_norm": 0.23707345128059387, + "learning_rate": 9.117647058823529e-05, + "loss": 0.2342, + "step": 32559 + }, + { + "epoch": 18.189944134078214, + "grad_norm": 0.541202962398529, + "learning_rate": 9.11484593837535e-05, + "loss": 0.3985, + "step": 32560 + }, + { + "epoch": 18.19050279329609, + "grad_norm": 0.3407653272151947, + "learning_rate": 9.112044817927171e-05, + "loss": 0.4086, + "step": 32561 + }, + { + "epoch": 18.191061452513967, + "grad_norm": 0.6027182936668396, + "learning_rate": 9.109243697478992e-05, + "loss": 0.5373, + "step": 32562 + }, + { + "epoch": 18.191620111731844, + "grad_norm": 0.3618367910385132, + "learning_rate": 9.106442577030812e-05, + "loss": 0.3936, + "step": 32563 + }, + { + "epoch": 18.19217877094972, + "grad_norm": 1.2943954467773438, + "learning_rate": 9.103641456582633e-05, + "loss": 0.442, + "step": 32564 + }, + { + "epoch": 18.192737430167597, + "grad_norm": 0.49033835530281067, + "learning_rate": 9.100840336134454e-05, + "loss": 0.3982, + "step": 32565 + }, + { + "epoch": 18.193296089385473, + "grad_norm": 0.45318087935447693, + "learning_rate": 9.098039215686274e-05, + "loss": 0.431, + "step": 32566 + }, + { + "epoch": 18.193854748603353, + "grad_norm": 0.46391695737838745, + "learning_rate": 9.095238095238095e-05, + "loss": 0.4346, + "step": 32567 + }, + { + "epoch": 18.19441340782123, + "grad_norm": 0.3761165738105774, + "learning_rate": 9.092436974789917e-05, + "loss": 0.3828, + "step": 32568 + }, + { + "epoch": 18.194972067039107, + "grad_norm": 0.39247897267341614, + "learning_rate": 9.089635854341736e-05, + "loss": 0.4222, + "step": 32569 + }, + { + "epoch": 18.195530726256983, + "grad_norm": 0.5490865111351013, + "learning_rate": 9.086834733893558e-05, + "loss": 0.4431, + "step": 32570 + }, + { + "epoch": 18.19608938547486, + "grad_norm": 0.43112263083457947, + "learning_rate": 9.084033613445377e-05, + "loss": 0.377, + "step": 32571 + }, + { + "epoch": 18.196648044692736, + "grad_norm": 0.594628632068634, + "learning_rate": 9.081232492997199e-05, + "loss": 0.3921, + "step": 32572 + }, + { + "epoch": 18.197206703910613, + "grad_norm": 0.858647882938385, + "learning_rate": 9.07843137254902e-05, + "loss": 0.4228, + "step": 32573 + }, + { + "epoch": 18.197765363128493, + "grad_norm": 0.3972368836402893, + "learning_rate": 9.07563025210084e-05, + "loss": 0.4821, + "step": 32574 + }, + { + "epoch": 18.19832402234637, + "grad_norm": 0.6365428566932678, + "learning_rate": 9.072829131652661e-05, + "loss": 0.5124, + "step": 32575 + }, + { + "epoch": 18.198882681564246, + "grad_norm": 0.3716276288032532, + "learning_rate": 9.070028011204482e-05, + "loss": 0.3816, + "step": 32576 + }, + { + "epoch": 18.199441340782123, + "grad_norm": 0.40125203132629395, + "learning_rate": 9.067226890756302e-05, + "loss": 0.3364, + "step": 32577 + }, + { + "epoch": 18.2, + "grad_norm": 1.586991548538208, + "learning_rate": 9.064425770308124e-05, + "loss": 0.3861, + "step": 32578 + }, + { + "epoch": 18.200558659217876, + "grad_norm": 0.4253595769405365, + "learning_rate": 9.061624649859943e-05, + "loss": 0.407, + "step": 32579 + }, + { + "epoch": 18.201117318435756, + "grad_norm": 0.5789925456047058, + "learning_rate": 9.058823529411765e-05, + "loss": 0.387, + "step": 32580 + }, + { + "epoch": 18.201675977653633, + "grad_norm": 0.5610730051994324, + "learning_rate": 9.056022408963585e-05, + "loss": 0.3732, + "step": 32581 + }, + { + "epoch": 18.20223463687151, + "grad_norm": 0.5564419031143188, + "learning_rate": 9.053221288515406e-05, + "loss": 0.4879, + "step": 32582 + }, + { + "epoch": 18.202793296089386, + "grad_norm": 0.3981051743030548, + "learning_rate": 9.050420168067227e-05, + "loss": 0.3245, + "step": 32583 + }, + { + "epoch": 18.203351955307262, + "grad_norm": 0.47177278995513916, + "learning_rate": 9.047619047619048e-05, + "loss": 0.3961, + "step": 32584 + }, + { + "epoch": 18.20391061452514, + "grad_norm": 0.3759315311908722, + "learning_rate": 9.044817927170868e-05, + "loss": 0.3418, + "step": 32585 + }, + { + "epoch": 18.204469273743015, + "grad_norm": 0.9305090308189392, + "learning_rate": 9.042016806722689e-05, + "loss": 0.4495, + "step": 32586 + }, + { + "epoch": 18.205027932960895, + "grad_norm": 1.0677870512008667, + "learning_rate": 9.03921568627451e-05, + "loss": 0.4175, + "step": 32587 + }, + { + "epoch": 18.205586592178772, + "grad_norm": 0.3763781785964966, + "learning_rate": 9.036414565826331e-05, + "loss": 0.3729, + "step": 32588 + }, + { + "epoch": 18.20614525139665, + "grad_norm": 0.5919235944747925, + "learning_rate": 9.03361344537815e-05, + "loss": 0.446, + "step": 32589 + }, + { + "epoch": 18.206703910614525, + "grad_norm": 0.4296429455280304, + "learning_rate": 9.030812324929973e-05, + "loss": 0.3921, + "step": 32590 + }, + { + "epoch": 18.2072625698324, + "grad_norm": 0.5748556852340698, + "learning_rate": 9.028011204481792e-05, + "loss": 0.5197, + "step": 32591 + }, + { + "epoch": 18.20782122905028, + "grad_norm": 0.48893022537231445, + "learning_rate": 9.025210084033614e-05, + "loss": 0.5051, + "step": 32592 + }, + { + "epoch": 18.208379888268155, + "grad_norm": 0.3871202766895294, + "learning_rate": 9.022408963585433e-05, + "loss": 0.3564, + "step": 32593 + }, + { + "epoch": 18.208938547486035, + "grad_norm": 0.6607181429862976, + "learning_rate": 9.019607843137255e-05, + "loss": 0.398, + "step": 32594 + }, + { + "epoch": 18.20949720670391, + "grad_norm": 0.49910831451416016, + "learning_rate": 9.016806722689076e-05, + "loss": 0.3749, + "step": 32595 + }, + { + "epoch": 18.210055865921788, + "grad_norm": 1.2300628423690796, + "learning_rate": 9.014005602240896e-05, + "loss": 0.543, + "step": 32596 + }, + { + "epoch": 18.210614525139665, + "grad_norm": 1.643019676208496, + "learning_rate": 9.011204481792717e-05, + "loss": 0.3282, + "step": 32597 + }, + { + "epoch": 18.21117318435754, + "grad_norm": 1.0076401233673096, + "learning_rate": 9.008403361344537e-05, + "loss": 0.5498, + "step": 32598 + }, + { + "epoch": 18.211731843575418, + "grad_norm": 1.1145869493484497, + "learning_rate": 9.005602240896358e-05, + "loss": 0.6046, + "step": 32599 + }, + { + "epoch": 18.212290502793294, + "grad_norm": 0.4166979491710663, + "learning_rate": 9.00280112044818e-05, + "loss": 0.3985, + "step": 32600 + }, + { + "epoch": 18.212849162011175, + "grad_norm": 0.5675028562545776, + "learning_rate": 8.999999999999999e-05, + "loss": 0.5742, + "step": 32601 + }, + { + "epoch": 18.21340782122905, + "grad_norm": 0.4414902627468109, + "learning_rate": 8.997198879551821e-05, + "loss": 0.3072, + "step": 32602 + }, + { + "epoch": 18.213966480446928, + "grad_norm": 0.5873119235038757, + "learning_rate": 8.99439775910364e-05, + "loss": 0.3824, + "step": 32603 + }, + { + "epoch": 18.214525139664804, + "grad_norm": 0.4763086438179016, + "learning_rate": 8.991596638655462e-05, + "loss": 0.3903, + "step": 32604 + }, + { + "epoch": 18.21508379888268, + "grad_norm": 1.431850552558899, + "learning_rate": 8.988795518207283e-05, + "loss": 0.4268, + "step": 32605 + }, + { + "epoch": 18.215642458100557, + "grad_norm": 0.47908493876457214, + "learning_rate": 8.985994397759104e-05, + "loss": 0.3364, + "step": 32606 + }, + { + "epoch": 18.216201117318437, + "grad_norm": 0.4700092673301697, + "learning_rate": 8.983193277310924e-05, + "loss": 0.4631, + "step": 32607 + }, + { + "epoch": 18.216759776536314, + "grad_norm": 1.0191336870193481, + "learning_rate": 8.980392156862745e-05, + "loss": 0.4135, + "step": 32608 + }, + { + "epoch": 18.21731843575419, + "grad_norm": 1.3718301057815552, + "learning_rate": 8.977591036414565e-05, + "loss": 0.4806, + "step": 32609 + }, + { + "epoch": 18.217877094972067, + "grad_norm": 6.846329689025879, + "learning_rate": 8.974789915966387e-05, + "loss": 0.3451, + "step": 32610 + }, + { + "epoch": 18.218435754189944, + "grad_norm": 0.4599190056324005, + "learning_rate": 8.971988795518207e-05, + "loss": 0.3301, + "step": 32611 + }, + { + "epoch": 18.21899441340782, + "grad_norm": 1.8401906490325928, + "learning_rate": 8.969187675070029e-05, + "loss": 0.3158, + "step": 32612 + }, + { + "epoch": 18.219553072625697, + "grad_norm": 0.36960023641586304, + "learning_rate": 8.966386554621848e-05, + "loss": 0.2652, + "step": 32613 + }, + { + "epoch": 18.220111731843577, + "grad_norm": 0.4305665194988251, + "learning_rate": 8.96358543417367e-05, + "loss": 0.4355, + "step": 32614 + }, + { + "epoch": 18.220670391061454, + "grad_norm": 0.6946961283683777, + "learning_rate": 8.960784313725492e-05, + "loss": 0.4415, + "step": 32615 + }, + { + "epoch": 18.22122905027933, + "grad_norm": 0.4750472605228424, + "learning_rate": 8.957983193277311e-05, + "loss": 0.3805, + "step": 32616 + }, + { + "epoch": 18.221787709497207, + "grad_norm": 0.4733906686306, + "learning_rate": 8.955182072829133e-05, + "loss": 0.3545, + "step": 32617 + }, + { + "epoch": 18.222346368715083, + "grad_norm": 0.5815635919570923, + "learning_rate": 8.952380952380952e-05, + "loss": 0.3927, + "step": 32618 + }, + { + "epoch": 18.22290502793296, + "grad_norm": 6.846716403961182, + "learning_rate": 8.949579831932774e-05, + "loss": 0.3762, + "step": 32619 + }, + { + "epoch": 18.223463687150836, + "grad_norm": 0.4133613109588623, + "learning_rate": 8.946778711484595e-05, + "loss": 0.3729, + "step": 32620 + }, + { + "epoch": 18.224022346368717, + "grad_norm": 0.5400164723396301, + "learning_rate": 8.943977591036415e-05, + "loss": 0.3753, + "step": 32621 + }, + { + "epoch": 18.224581005586593, + "grad_norm": 1.4985110759735107, + "learning_rate": 8.941176470588236e-05, + "loss": 0.4775, + "step": 32622 + }, + { + "epoch": 18.22513966480447, + "grad_norm": 0.592915952205658, + "learning_rate": 8.938375350140056e-05, + "loss": 0.3905, + "step": 32623 + }, + { + "epoch": 18.225698324022346, + "grad_norm": 0.5983620882034302, + "learning_rate": 8.935574229691877e-05, + "loss": 0.4551, + "step": 32624 + }, + { + "epoch": 18.226256983240223, + "grad_norm": 0.4434497654438019, + "learning_rate": 8.932773109243698e-05, + "loss": 0.4656, + "step": 32625 + }, + { + "epoch": 18.2268156424581, + "grad_norm": 0.5307989120483398, + "learning_rate": 8.929971988795518e-05, + "loss": 0.4007, + "step": 32626 + }, + { + "epoch": 18.227374301675976, + "grad_norm": 0.721480667591095, + "learning_rate": 8.92717086834734e-05, + "loss": 0.5197, + "step": 32627 + }, + { + "epoch": 18.227932960893856, + "grad_norm": 0.45105406641960144, + "learning_rate": 8.92436974789916e-05, + "loss": 0.418, + "step": 32628 + }, + { + "epoch": 18.228491620111733, + "grad_norm": 0.48472219705581665, + "learning_rate": 8.921568627450981e-05, + "loss": 0.4069, + "step": 32629 + }, + { + "epoch": 18.22905027932961, + "grad_norm": 0.5231115221977234, + "learning_rate": 8.9187675070028e-05, + "loss": 0.4382, + "step": 32630 + }, + { + "epoch": 18.229608938547486, + "grad_norm": 0.33041539788246155, + "learning_rate": 8.915966386554623e-05, + "loss": 0.4929, + "step": 32631 + }, + { + "epoch": 18.230167597765362, + "grad_norm": 1.308843970298767, + "learning_rate": 8.913165266106443e-05, + "loss": 0.3491, + "step": 32632 + }, + { + "epoch": 18.23072625698324, + "grad_norm": 0.5758770704269409, + "learning_rate": 8.910364145658264e-05, + "loss": 0.5197, + "step": 32633 + }, + { + "epoch": 18.23128491620112, + "grad_norm": 0.3611909747123718, + "learning_rate": 8.907563025210084e-05, + "loss": 0.3711, + "step": 32634 + }, + { + "epoch": 18.231843575418996, + "grad_norm": 1.6035256385803223, + "learning_rate": 8.904761904761905e-05, + "loss": 0.4406, + "step": 32635 + }, + { + "epoch": 18.232402234636872, + "grad_norm": 0.6332162618637085, + "learning_rate": 8.901960784313726e-05, + "loss": 0.5166, + "step": 32636 + }, + { + "epoch": 18.23296089385475, + "grad_norm": 0.651264488697052, + "learning_rate": 8.899159663865548e-05, + "loss": 0.378, + "step": 32637 + }, + { + "epoch": 18.233519553072625, + "grad_norm": 0.3985283076763153, + "learning_rate": 8.896358543417367e-05, + "loss": 0.3369, + "step": 32638 + }, + { + "epoch": 18.234078212290502, + "grad_norm": 0.3729357123374939, + "learning_rate": 8.893557422969189e-05, + "loss": 0.3628, + "step": 32639 + }, + { + "epoch": 18.23463687150838, + "grad_norm": 0.40608903765678406, + "learning_rate": 8.890756302521008e-05, + "loss": 0.4224, + "step": 32640 + }, + { + "epoch": 18.23519553072626, + "grad_norm": 0.39620083570480347, + "learning_rate": 8.88795518207283e-05, + "loss": 0.3865, + "step": 32641 + }, + { + "epoch": 18.235754189944135, + "grad_norm": 0.35489603877067566, + "learning_rate": 8.88515406162465e-05, + "loss": 0.325, + "step": 32642 + }, + { + "epoch": 18.23631284916201, + "grad_norm": 0.6893346309661865, + "learning_rate": 8.882352941176471e-05, + "loss": 0.4692, + "step": 32643 + }, + { + "epoch": 18.23687150837989, + "grad_norm": 0.4614734351634979, + "learning_rate": 8.879551820728292e-05, + "loss": 0.5273, + "step": 32644 + }, + { + "epoch": 18.237430167597765, + "grad_norm": 0.7457419037818909, + "learning_rate": 8.876750700280112e-05, + "loss": 0.4335, + "step": 32645 + }, + { + "epoch": 18.23798882681564, + "grad_norm": 0.5542821288108826, + "learning_rate": 8.873949579831933e-05, + "loss": 0.462, + "step": 32646 + }, + { + "epoch": 18.238547486033518, + "grad_norm": 0.3706904649734497, + "learning_rate": 8.871148459383755e-05, + "loss": 0.4107, + "step": 32647 + }, + { + "epoch": 18.239106145251398, + "grad_norm": 0.4472128450870514, + "learning_rate": 8.868347338935574e-05, + "loss": 0.4742, + "step": 32648 + }, + { + "epoch": 18.239664804469275, + "grad_norm": 0.6131716966629028, + "learning_rate": 8.865546218487396e-05, + "loss": 0.3673, + "step": 32649 + }, + { + "epoch": 18.24022346368715, + "grad_norm": 0.32064637541770935, + "learning_rate": 8.862745098039215e-05, + "loss": 0.3779, + "step": 32650 + }, + { + "epoch": 18.240782122905028, + "grad_norm": 0.4744412899017334, + "learning_rate": 8.859943977591037e-05, + "loss": 0.3862, + "step": 32651 + }, + { + "epoch": 18.241340782122904, + "grad_norm": 0.5209770202636719, + "learning_rate": 8.857142857142857e-05, + "loss": 0.4758, + "step": 32652 + }, + { + "epoch": 18.24189944134078, + "grad_norm": 0.5869362354278564, + "learning_rate": 8.854341736694679e-05, + "loss": 0.3101, + "step": 32653 + }, + { + "epoch": 18.242458100558657, + "grad_norm": 0.44523465633392334, + "learning_rate": 8.851540616246499e-05, + "loss": 0.3581, + "step": 32654 + }, + { + "epoch": 18.243016759776538, + "grad_norm": 0.3737280070781708, + "learning_rate": 8.84873949579832e-05, + "loss": 0.4315, + "step": 32655 + }, + { + "epoch": 18.243575418994414, + "grad_norm": 0.5529664754867554, + "learning_rate": 8.84593837535014e-05, + "loss": 0.4035, + "step": 32656 + }, + { + "epoch": 18.24413407821229, + "grad_norm": 0.34042322635650635, + "learning_rate": 8.843137254901961e-05, + "loss": 0.3044, + "step": 32657 + }, + { + "epoch": 18.244692737430167, + "grad_norm": 0.4932880997657776, + "learning_rate": 8.840336134453782e-05, + "loss": 0.3448, + "step": 32658 + }, + { + "epoch": 18.245251396648044, + "grad_norm": 0.4720737934112549, + "learning_rate": 8.837535014005603e-05, + "loss": 0.4894, + "step": 32659 + }, + { + "epoch": 18.24581005586592, + "grad_norm": 0.5280595421791077, + "learning_rate": 8.834733893557423e-05, + "loss": 0.5299, + "step": 32660 + }, + { + "epoch": 18.2463687150838, + "grad_norm": 0.4164494574069977, + "learning_rate": 8.831932773109245e-05, + "loss": 0.5149, + "step": 32661 + }, + { + "epoch": 18.246927374301677, + "grad_norm": 1.0335514545440674, + "learning_rate": 8.829131652661064e-05, + "loss": 0.3933, + "step": 32662 + }, + { + "epoch": 18.247486033519554, + "grad_norm": 0.3542427718639374, + "learning_rate": 8.826330532212886e-05, + "loss": 0.3496, + "step": 32663 + }, + { + "epoch": 18.24804469273743, + "grad_norm": 0.35461345314979553, + "learning_rate": 8.823529411764706e-05, + "loss": 0.3788, + "step": 32664 + }, + { + "epoch": 18.248603351955307, + "grad_norm": 0.42494508624076843, + "learning_rate": 8.820728291316527e-05, + "loss": 0.3909, + "step": 32665 + }, + { + "epoch": 18.249162011173183, + "grad_norm": 0.49568766355514526, + "learning_rate": 8.817927170868348e-05, + "loss": 0.5342, + "step": 32666 + }, + { + "epoch": 18.24972067039106, + "grad_norm": 0.38212326169013977, + "learning_rate": 8.815126050420168e-05, + "loss": 0.3606, + "step": 32667 + }, + { + "epoch": 18.25027932960894, + "grad_norm": 0.43844321370124817, + "learning_rate": 8.812324929971989e-05, + "loss": 0.3283, + "step": 32668 + }, + { + "epoch": 18.250837988826817, + "grad_norm": 0.5792275667190552, + "learning_rate": 8.809523809523811e-05, + "loss": 0.5553, + "step": 32669 + }, + { + "epoch": 18.251396648044693, + "grad_norm": 0.45069336891174316, + "learning_rate": 8.80672268907563e-05, + "loss": 0.3663, + "step": 32670 + }, + { + "epoch": 18.25195530726257, + "grad_norm": 0.3351284861564636, + "learning_rate": 8.803921568627452e-05, + "loss": 0.3379, + "step": 32671 + }, + { + "epoch": 18.252513966480446, + "grad_norm": 0.3912334144115448, + "learning_rate": 8.801120448179271e-05, + "loss": 0.4557, + "step": 32672 + }, + { + "epoch": 18.253072625698323, + "grad_norm": 0.3823745846748352, + "learning_rate": 8.798319327731093e-05, + "loss": 0.45, + "step": 32673 + }, + { + "epoch": 18.2536312849162, + "grad_norm": 0.5660895109176636, + "learning_rate": 8.795518207282914e-05, + "loss": 0.4257, + "step": 32674 + }, + { + "epoch": 18.25418994413408, + "grad_norm": 0.5477224588394165, + "learning_rate": 8.792717086834734e-05, + "loss": 0.4939, + "step": 32675 + }, + { + "epoch": 18.254748603351956, + "grad_norm": 0.40360334515571594, + "learning_rate": 8.789915966386555e-05, + "loss": 0.3895, + "step": 32676 + }, + { + "epoch": 18.255307262569833, + "grad_norm": 0.5136256814002991, + "learning_rate": 8.787114845938376e-05, + "loss": 0.3832, + "step": 32677 + }, + { + "epoch": 18.25586592178771, + "grad_norm": 1.602191686630249, + "learning_rate": 8.784313725490196e-05, + "loss": 0.4502, + "step": 32678 + }, + { + "epoch": 18.256424581005586, + "grad_norm": 0.7725604176521301, + "learning_rate": 8.781512605042017e-05, + "loss": 0.496, + "step": 32679 + }, + { + "epoch": 18.256983240223462, + "grad_norm": 0.5572128891944885, + "learning_rate": 8.778711484593837e-05, + "loss": 0.4377, + "step": 32680 + }, + { + "epoch": 18.257541899441343, + "grad_norm": 0.5574092268943787, + "learning_rate": 8.77591036414566e-05, + "loss": 0.4732, + "step": 32681 + }, + { + "epoch": 18.25810055865922, + "grad_norm": 0.35897552967071533, + "learning_rate": 8.773109243697479e-05, + "loss": 0.4118, + "step": 32682 + }, + { + "epoch": 18.258659217877096, + "grad_norm": 1.541033148765564, + "learning_rate": 8.7703081232493e-05, + "loss": 0.3662, + "step": 32683 + }, + { + "epoch": 18.259217877094972, + "grad_norm": 0.3584447503089905, + "learning_rate": 8.76750700280112e-05, + "loss": 0.3905, + "step": 32684 + }, + { + "epoch": 18.25977653631285, + "grad_norm": 0.43582451343536377, + "learning_rate": 8.764705882352942e-05, + "loss": 0.33, + "step": 32685 + }, + { + "epoch": 18.260335195530725, + "grad_norm": 1.2134478092193604, + "learning_rate": 8.761904761904762e-05, + "loss": 0.332, + "step": 32686 + }, + { + "epoch": 18.260893854748602, + "grad_norm": 0.9116256237030029, + "learning_rate": 8.759103641456583e-05, + "loss": 0.3652, + "step": 32687 + }, + { + "epoch": 18.261452513966482, + "grad_norm": 0.44309672713279724, + "learning_rate": 8.756302521008404e-05, + "loss": 0.4279, + "step": 32688 + }, + { + "epoch": 18.26201117318436, + "grad_norm": 0.44933533668518066, + "learning_rate": 8.753501400560224e-05, + "loss": 0.3359, + "step": 32689 + }, + { + "epoch": 18.262569832402235, + "grad_norm": 0.37190139293670654, + "learning_rate": 8.750700280112045e-05, + "loss": 0.4022, + "step": 32690 + }, + { + "epoch": 18.26312849162011, + "grad_norm": 0.5863502025604248, + "learning_rate": 8.747899159663867e-05, + "loss": 0.4016, + "step": 32691 + }, + { + "epoch": 18.26368715083799, + "grad_norm": 8.179637908935547, + "learning_rate": 8.745098039215686e-05, + "loss": 0.4442, + "step": 32692 + }, + { + "epoch": 18.264245810055865, + "grad_norm": 0.4853821098804474, + "learning_rate": 8.742296918767508e-05, + "loss": 0.4429, + "step": 32693 + }, + { + "epoch": 18.26480446927374, + "grad_norm": 0.2989823520183563, + "learning_rate": 8.739495798319327e-05, + "loss": 0.293, + "step": 32694 + }, + { + "epoch": 18.26536312849162, + "grad_norm": 0.46924665570259094, + "learning_rate": 8.736694677871149e-05, + "loss": 0.4807, + "step": 32695 + }, + { + "epoch": 18.265921787709498, + "grad_norm": 0.4112356901168823, + "learning_rate": 8.73389355742297e-05, + "loss": 0.3483, + "step": 32696 + }, + { + "epoch": 18.266480446927375, + "grad_norm": 0.46377190947532654, + "learning_rate": 8.73109243697479e-05, + "loss": 0.3399, + "step": 32697 + }, + { + "epoch": 18.26703910614525, + "grad_norm": 0.9841479659080505, + "learning_rate": 8.728291316526611e-05, + "loss": 0.3423, + "step": 32698 + }, + { + "epoch": 18.267597765363128, + "grad_norm": 0.33804404735565186, + "learning_rate": 8.725490196078432e-05, + "loss": 0.3303, + "step": 32699 + }, + { + "epoch": 18.268156424581004, + "grad_norm": 0.48222580552101135, + "learning_rate": 8.722689075630252e-05, + "loss": 0.4222, + "step": 32700 + }, + { + "epoch": 18.26871508379888, + "grad_norm": 0.47329995036125183, + "learning_rate": 8.719887955182074e-05, + "loss": 0.439, + "step": 32701 + }, + { + "epoch": 18.26927374301676, + "grad_norm": 0.4900064468383789, + "learning_rate": 8.717086834733893e-05, + "loss": 0.3662, + "step": 32702 + }, + { + "epoch": 18.269832402234638, + "grad_norm": 0.42072832584381104, + "learning_rate": 8.714285714285715e-05, + "loss": 0.3832, + "step": 32703 + }, + { + "epoch": 18.270391061452514, + "grad_norm": 0.599050760269165, + "learning_rate": 8.711484593837535e-05, + "loss": 0.3996, + "step": 32704 + }, + { + "epoch": 18.27094972067039, + "grad_norm": 0.31566593050956726, + "learning_rate": 8.708683473389356e-05, + "loss": 0.3375, + "step": 32705 + }, + { + "epoch": 18.271508379888267, + "grad_norm": 0.4924340844154358, + "learning_rate": 8.705882352941176e-05, + "loss": 0.4397, + "step": 32706 + }, + { + "epoch": 18.272067039106144, + "grad_norm": 0.36471566557884216, + "learning_rate": 8.703081232492998e-05, + "loss": 0.266, + "step": 32707 + }, + { + "epoch": 18.272625698324024, + "grad_norm": 0.45468661189079285, + "learning_rate": 8.700280112044818e-05, + "loss": 0.2787, + "step": 32708 + }, + { + "epoch": 18.2731843575419, + "grad_norm": 0.3292801082134247, + "learning_rate": 8.697478991596639e-05, + "loss": 0.4341, + "step": 32709 + }, + { + "epoch": 18.273743016759777, + "grad_norm": 0.705889880657196, + "learning_rate": 8.69467787114846e-05, + "loss": 0.4941, + "step": 32710 + }, + { + "epoch": 18.274301675977654, + "grad_norm": 0.32996779680252075, + "learning_rate": 8.69187675070028e-05, + "loss": 0.3509, + "step": 32711 + }, + { + "epoch": 18.27486033519553, + "grad_norm": 0.5066474080085754, + "learning_rate": 8.6890756302521e-05, + "loss": 0.4193, + "step": 32712 + }, + { + "epoch": 18.275418994413407, + "grad_norm": 0.7488020062446594, + "learning_rate": 8.686274509803923e-05, + "loss": 0.6275, + "step": 32713 + }, + { + "epoch": 18.275977653631283, + "grad_norm": 0.3979850709438324, + "learning_rate": 8.683473389355742e-05, + "loss": 0.3467, + "step": 32714 + }, + { + "epoch": 18.276536312849164, + "grad_norm": 0.5178669095039368, + "learning_rate": 8.680672268907564e-05, + "loss": 0.5044, + "step": 32715 + }, + { + "epoch": 18.27709497206704, + "grad_norm": 0.5792111158370972, + "learning_rate": 8.677871148459383e-05, + "loss": 0.5073, + "step": 32716 + }, + { + "epoch": 18.277653631284917, + "grad_norm": 2.007347583770752, + "learning_rate": 8.675070028011205e-05, + "loss": 0.4509, + "step": 32717 + }, + { + "epoch": 18.278212290502793, + "grad_norm": 0.4164412319660187, + "learning_rate": 8.672268907563026e-05, + "loss": 0.3502, + "step": 32718 + }, + { + "epoch": 18.27877094972067, + "grad_norm": 0.6618167757987976, + "learning_rate": 8.669467787114846e-05, + "loss": 0.4215, + "step": 32719 + }, + { + "epoch": 18.279329608938546, + "grad_norm": 0.36278441548347473, + "learning_rate": 8.666666666666667e-05, + "loss": 0.4033, + "step": 32720 + }, + { + "epoch": 18.279888268156423, + "grad_norm": 0.5017735958099365, + "learning_rate": 8.663865546218487e-05, + "loss": 0.4402, + "step": 32721 + }, + { + "epoch": 18.280446927374303, + "grad_norm": 0.5115286111831665, + "learning_rate": 8.661064425770308e-05, + "loss": 0.3058, + "step": 32722 + }, + { + "epoch": 18.28100558659218, + "grad_norm": 0.5701693296432495, + "learning_rate": 8.65826330532213e-05, + "loss": 0.4815, + "step": 32723 + }, + { + "epoch": 18.281564245810056, + "grad_norm": 0.5837891697883606, + "learning_rate": 8.655462184873949e-05, + "loss": 0.4681, + "step": 32724 + }, + { + "epoch": 18.282122905027933, + "grad_norm": 0.6313376426696777, + "learning_rate": 8.652661064425771e-05, + "loss": 0.4246, + "step": 32725 + }, + { + "epoch": 18.28268156424581, + "grad_norm": 0.4196922481060028, + "learning_rate": 8.64985994397759e-05, + "loss": 0.3821, + "step": 32726 + }, + { + "epoch": 18.283240223463686, + "grad_norm": 0.43936988711357117, + "learning_rate": 8.647058823529412e-05, + "loss": 0.358, + "step": 32727 + }, + { + "epoch": 18.283798882681563, + "grad_norm": 0.5881524085998535, + "learning_rate": 8.644257703081233e-05, + "loss": 0.3941, + "step": 32728 + }, + { + "epoch": 18.284357541899443, + "grad_norm": 0.3587978184223175, + "learning_rate": 8.641456582633054e-05, + "loss": 0.3715, + "step": 32729 + }, + { + "epoch": 18.28491620111732, + "grad_norm": 0.38702911138534546, + "learning_rate": 8.638655462184874e-05, + "loss": 0.3943, + "step": 32730 + }, + { + "epoch": 18.285474860335196, + "grad_norm": 0.47795331478118896, + "learning_rate": 8.635854341736695e-05, + "loss": 0.3728, + "step": 32731 + }, + { + "epoch": 18.286033519553072, + "grad_norm": 0.6315271258354187, + "learning_rate": 8.633053221288515e-05, + "loss": 0.3726, + "step": 32732 + }, + { + "epoch": 18.28659217877095, + "grad_norm": 0.4539512097835541, + "learning_rate": 8.630252100840336e-05, + "loss": 0.4537, + "step": 32733 + }, + { + "epoch": 18.287150837988825, + "grad_norm": 0.74102383852005, + "learning_rate": 8.627450980392157e-05, + "loss": 0.435, + "step": 32734 + }, + { + "epoch": 18.287709497206706, + "grad_norm": 1.3684748411178589, + "learning_rate": 8.624649859943979e-05, + "loss": 0.3699, + "step": 32735 + }, + { + "epoch": 18.288268156424582, + "grad_norm": 0.49205633997917175, + "learning_rate": 8.621848739495798e-05, + "loss": 0.3227, + "step": 32736 + }, + { + "epoch": 18.28882681564246, + "grad_norm": 0.4957444965839386, + "learning_rate": 8.61904761904762e-05, + "loss": 0.4, + "step": 32737 + }, + { + "epoch": 18.289385474860335, + "grad_norm": 0.5035558342933655, + "learning_rate": 8.616246498599439e-05, + "loss": 0.3783, + "step": 32738 + }, + { + "epoch": 18.289944134078212, + "grad_norm": 0.44133779406547546, + "learning_rate": 8.613445378151261e-05, + "loss": 0.2953, + "step": 32739 + }, + { + "epoch": 18.29050279329609, + "grad_norm": 0.5431911945343018, + "learning_rate": 8.610644257703082e-05, + "loss": 0.3909, + "step": 32740 + }, + { + "epoch": 18.291061452513965, + "grad_norm": 0.4621146321296692, + "learning_rate": 8.607843137254902e-05, + "loss": 0.3649, + "step": 32741 + }, + { + "epoch": 18.291620111731845, + "grad_norm": 0.35900264978408813, + "learning_rate": 8.605042016806723e-05, + "loss": 0.3256, + "step": 32742 + }, + { + "epoch": 18.29217877094972, + "grad_norm": 1.841981053352356, + "learning_rate": 8.602240896358543e-05, + "loss": 0.3582, + "step": 32743 + }, + { + "epoch": 18.2927374301676, + "grad_norm": 1.3663215637207031, + "learning_rate": 8.599439775910364e-05, + "loss": 0.4403, + "step": 32744 + }, + { + "epoch": 18.293296089385475, + "grad_norm": 0.4664662778377533, + "learning_rate": 8.596638655462186e-05, + "loss": 0.4713, + "step": 32745 + }, + { + "epoch": 18.29385474860335, + "grad_norm": 0.36488622426986694, + "learning_rate": 8.593837535014005e-05, + "loss": 0.3527, + "step": 32746 + }, + { + "epoch": 18.294413407821228, + "grad_norm": 0.719631016254425, + "learning_rate": 8.591036414565827e-05, + "loss": 0.565, + "step": 32747 + }, + { + "epoch": 18.294972067039105, + "grad_norm": 1.0860995054244995, + "learning_rate": 8.588235294117646e-05, + "loss": 0.6013, + "step": 32748 + }, + { + "epoch": 18.295530726256985, + "grad_norm": 0.3828640282154083, + "learning_rate": 8.585434173669468e-05, + "loss": 0.4284, + "step": 32749 + }, + { + "epoch": 18.29608938547486, + "grad_norm": 0.7656653523445129, + "learning_rate": 8.582633053221289e-05, + "loss": 0.3263, + "step": 32750 + }, + { + "epoch": 18.296648044692738, + "grad_norm": 0.4117003381252289, + "learning_rate": 8.57983193277311e-05, + "loss": 0.4296, + "step": 32751 + }, + { + "epoch": 18.297206703910614, + "grad_norm": 0.5932239294052124, + "learning_rate": 8.57703081232493e-05, + "loss": 0.4228, + "step": 32752 + }, + { + "epoch": 18.29776536312849, + "grad_norm": 0.45200273394584656, + "learning_rate": 8.57422969187675e-05, + "loss": 0.4143, + "step": 32753 + }, + { + "epoch": 18.298324022346367, + "grad_norm": 0.5346892476081848, + "learning_rate": 8.571428571428571e-05, + "loss": 0.4719, + "step": 32754 + }, + { + "epoch": 18.298882681564244, + "grad_norm": 0.7034209966659546, + "learning_rate": 8.568627450980393e-05, + "loss": 0.4499, + "step": 32755 + }, + { + "epoch": 18.299441340782124, + "grad_norm": 1.0168575048446655, + "learning_rate": 8.565826330532212e-05, + "loss": 0.4135, + "step": 32756 + }, + { + "epoch": 18.3, + "grad_norm": 0.344670832157135, + "learning_rate": 8.563025210084034e-05, + "loss": 0.4056, + "step": 32757 + }, + { + "epoch": 18.300558659217877, + "grad_norm": 0.5479024648666382, + "learning_rate": 8.560224089635854e-05, + "loss": 0.3813, + "step": 32758 + }, + { + "epoch": 18.301117318435754, + "grad_norm": 0.6125989556312561, + "learning_rate": 8.557422969187676e-05, + "loss": 0.3271, + "step": 32759 + }, + { + "epoch": 18.30167597765363, + "grad_norm": 0.4583778977394104, + "learning_rate": 8.554621848739495e-05, + "loss": 0.2994, + "step": 32760 + }, + { + "epoch": 18.302234636871507, + "grad_norm": 0.3797827959060669, + "learning_rate": 8.551820728291317e-05, + "loss": 0.3217, + "step": 32761 + }, + { + "epoch": 18.302793296089387, + "grad_norm": 0.48128241300582886, + "learning_rate": 8.549019607843137e-05, + "loss": 0.546, + "step": 32762 + }, + { + "epoch": 18.303351955307264, + "grad_norm": 0.5493959784507751, + "learning_rate": 8.546218487394958e-05, + "loss": 0.4899, + "step": 32763 + }, + { + "epoch": 18.30391061452514, + "grad_norm": 0.5889927744865417, + "learning_rate": 8.543417366946779e-05, + "loss": 0.5364, + "step": 32764 + }, + { + "epoch": 18.304469273743017, + "grad_norm": 0.42793142795562744, + "learning_rate": 8.540616246498599e-05, + "loss": 0.4067, + "step": 32765 + }, + { + "epoch": 18.305027932960893, + "grad_norm": 0.40835610032081604, + "learning_rate": 8.53781512605042e-05, + "loss": 0.4564, + "step": 32766 + }, + { + "epoch": 18.30558659217877, + "grad_norm": 0.3972095847129822, + "learning_rate": 8.535014005602242e-05, + "loss": 0.2937, + "step": 32767 + }, + { + "epoch": 18.306145251396647, + "grad_norm": 0.6681272983551025, + "learning_rate": 8.532212885154061e-05, + "loss": 0.4869, + "step": 32768 + }, + { + "epoch": 18.306703910614527, + "grad_norm": 0.32472676038742065, + "learning_rate": 8.529411764705883e-05, + "loss": 0.386, + "step": 32769 + }, + { + "epoch": 18.307262569832403, + "grad_norm": 0.6974824070930481, + "learning_rate": 8.526610644257702e-05, + "loss": 0.3816, + "step": 32770 + }, + { + "epoch": 18.30782122905028, + "grad_norm": 5.003784656524658, + "learning_rate": 8.523809523809524e-05, + "loss": 0.3498, + "step": 32771 + }, + { + "epoch": 18.308379888268156, + "grad_norm": 0.5939489006996155, + "learning_rate": 8.521008403361345e-05, + "loss": 0.5552, + "step": 32772 + }, + { + "epoch": 18.308938547486033, + "grad_norm": 0.39250120520591736, + "learning_rate": 8.518207282913165e-05, + "loss": 0.4502, + "step": 32773 + }, + { + "epoch": 18.30949720670391, + "grad_norm": 0.55890291929245, + "learning_rate": 8.515406162464986e-05, + "loss": 0.432, + "step": 32774 + }, + { + "epoch": 18.310055865921786, + "grad_norm": 0.4307514429092407, + "learning_rate": 8.512605042016807e-05, + "loss": 0.4114, + "step": 32775 + }, + { + "epoch": 18.310614525139666, + "grad_norm": 0.40841779112815857, + "learning_rate": 8.509803921568627e-05, + "loss": 0.3903, + "step": 32776 + }, + { + "epoch": 18.311173184357543, + "grad_norm": 0.576561450958252, + "learning_rate": 8.507002801120449e-05, + "loss": 0.4155, + "step": 32777 + }, + { + "epoch": 18.31173184357542, + "grad_norm": 0.6510694026947021, + "learning_rate": 8.504201680672268e-05, + "loss": 0.3695, + "step": 32778 + }, + { + "epoch": 18.312290502793296, + "grad_norm": 0.3977453410625458, + "learning_rate": 8.50140056022409e-05, + "loss": 0.365, + "step": 32779 + }, + { + "epoch": 18.312849162011172, + "grad_norm": 0.45094066858291626, + "learning_rate": 8.49859943977591e-05, + "loss": 0.3377, + "step": 32780 + }, + { + "epoch": 18.31340782122905, + "grad_norm": 0.37779805064201355, + "learning_rate": 8.495798319327732e-05, + "loss": 0.4165, + "step": 32781 + }, + { + "epoch": 18.31396648044693, + "grad_norm": 1.4240490198135376, + "learning_rate": 8.492997198879552e-05, + "loss": 0.3597, + "step": 32782 + }, + { + "epoch": 18.314525139664806, + "grad_norm": 0.633579432964325, + "learning_rate": 8.490196078431373e-05, + "loss": 0.3714, + "step": 32783 + }, + { + "epoch": 18.315083798882682, + "grad_norm": 0.4324328303337097, + "learning_rate": 8.487394957983193e-05, + "loss": 0.3943, + "step": 32784 + }, + { + "epoch": 18.31564245810056, + "grad_norm": 0.49630314111709595, + "learning_rate": 8.484593837535014e-05, + "loss": 0.3625, + "step": 32785 + }, + { + "epoch": 18.316201117318435, + "grad_norm": 0.3495021462440491, + "learning_rate": 8.481792717086835e-05, + "loss": 0.391, + "step": 32786 + }, + { + "epoch": 18.316759776536312, + "grad_norm": 2.732417106628418, + "learning_rate": 8.478991596638656e-05, + "loss": 0.3685, + "step": 32787 + }, + { + "epoch": 18.31731843575419, + "grad_norm": 0.3252967894077301, + "learning_rate": 8.476190476190476e-05, + "loss": 0.3593, + "step": 32788 + }, + { + "epoch": 18.31787709497207, + "grad_norm": 0.5639015436172485, + "learning_rate": 8.473389355742298e-05, + "loss": 0.4012, + "step": 32789 + }, + { + "epoch": 18.318435754189945, + "grad_norm": 0.8142260313034058, + "learning_rate": 8.470588235294117e-05, + "loss": 0.4648, + "step": 32790 + }, + { + "epoch": 18.31899441340782, + "grad_norm": 0.3550356924533844, + "learning_rate": 8.467787114845939e-05, + "loss": 0.3674, + "step": 32791 + }, + { + "epoch": 18.3195530726257, + "grad_norm": 0.4787386655807495, + "learning_rate": 8.464985994397758e-05, + "loss": 0.46, + "step": 32792 + }, + { + "epoch": 18.320111731843575, + "grad_norm": 0.39011022448539734, + "learning_rate": 8.46218487394958e-05, + "loss": 0.3356, + "step": 32793 + }, + { + "epoch": 18.32067039106145, + "grad_norm": 0.40004441142082214, + "learning_rate": 8.4593837535014e-05, + "loss": 0.3849, + "step": 32794 + }, + { + "epoch": 18.321229050279328, + "grad_norm": 0.36449384689331055, + "learning_rate": 8.456582633053221e-05, + "loss": 0.4433, + "step": 32795 + }, + { + "epoch": 18.321787709497208, + "grad_norm": 0.37574502825737, + "learning_rate": 8.453781512605042e-05, + "loss": 0.4069, + "step": 32796 + }, + { + "epoch": 18.322346368715085, + "grad_norm": 0.5998339653015137, + "learning_rate": 8.450980392156862e-05, + "loss": 0.6158, + "step": 32797 + }, + { + "epoch": 18.32290502793296, + "grad_norm": 0.43560925126075745, + "learning_rate": 8.448179271708683e-05, + "loss": 0.4036, + "step": 32798 + }, + { + "epoch": 18.323463687150838, + "grad_norm": 0.4975191652774811, + "learning_rate": 8.445378151260505e-05, + "loss": 0.4369, + "step": 32799 + }, + { + "epoch": 18.324022346368714, + "grad_norm": 0.6788953542709351, + "learning_rate": 8.442577030812324e-05, + "loss": 0.3568, + "step": 32800 + }, + { + "epoch": 18.32458100558659, + "grad_norm": 0.7546923160552979, + "learning_rate": 8.439775910364146e-05, + "loss": 0.5594, + "step": 32801 + }, + { + "epoch": 18.325139664804468, + "grad_norm": 0.4436323046684265, + "learning_rate": 8.436974789915965e-05, + "loss": 0.4003, + "step": 32802 + }, + { + "epoch": 18.325698324022348, + "grad_norm": 1.403450846672058, + "learning_rate": 8.434173669467787e-05, + "loss": 0.4736, + "step": 32803 + }, + { + "epoch": 18.326256983240224, + "grad_norm": 0.4323928654193878, + "learning_rate": 8.431372549019608e-05, + "loss": 0.3664, + "step": 32804 + }, + { + "epoch": 18.3268156424581, + "grad_norm": 0.4957042634487152, + "learning_rate": 8.428571428571429e-05, + "loss": 0.3452, + "step": 32805 + }, + { + "epoch": 18.327374301675977, + "grad_norm": 0.5773230791091919, + "learning_rate": 8.425770308123249e-05, + "loss": 0.5432, + "step": 32806 + }, + { + "epoch": 18.327932960893854, + "grad_norm": 0.43703946471214294, + "learning_rate": 8.42296918767507e-05, + "loss": 0.3724, + "step": 32807 + }, + { + "epoch": 18.32849162011173, + "grad_norm": 0.6803310513496399, + "learning_rate": 8.42016806722689e-05, + "loss": 0.415, + "step": 32808 + }, + { + "epoch": 18.32905027932961, + "grad_norm": 0.4226667284965515, + "learning_rate": 8.417366946778712e-05, + "loss": 0.3944, + "step": 32809 + }, + { + "epoch": 18.329608938547487, + "grad_norm": 0.352253258228302, + "learning_rate": 8.414565826330532e-05, + "loss": 0.3608, + "step": 32810 + }, + { + "epoch": 18.330167597765364, + "grad_norm": 0.5811667442321777, + "learning_rate": 8.411764705882354e-05, + "loss": 0.3853, + "step": 32811 + }, + { + "epoch": 18.33072625698324, + "grad_norm": 1.4271761178970337, + "learning_rate": 8.408963585434173e-05, + "loss": 0.5111, + "step": 32812 + }, + { + "epoch": 18.331284916201117, + "grad_norm": 0.5063607096672058, + "learning_rate": 8.406162464985995e-05, + "loss": 0.5105, + "step": 32813 + }, + { + "epoch": 18.331843575418993, + "grad_norm": 0.4931904375553131, + "learning_rate": 8.403361344537815e-05, + "loss": 0.3597, + "step": 32814 + }, + { + "epoch": 18.33240223463687, + "grad_norm": 1.5323851108551025, + "learning_rate": 8.400560224089636e-05, + "loss": 0.4469, + "step": 32815 + }, + { + "epoch": 18.33296089385475, + "grad_norm": 0.38962700963020325, + "learning_rate": 8.397759103641457e-05, + "loss": 0.4647, + "step": 32816 + }, + { + "epoch": 18.333519553072627, + "grad_norm": 0.6305058002471924, + "learning_rate": 8.394957983193277e-05, + "loss": 0.5606, + "step": 32817 + }, + { + "epoch": 18.334078212290503, + "grad_norm": 0.5131636261940002, + "learning_rate": 8.392156862745098e-05, + "loss": 0.4307, + "step": 32818 + }, + { + "epoch": 18.33463687150838, + "grad_norm": 0.3672429025173187, + "learning_rate": 8.389355742296918e-05, + "loss": 0.3651, + "step": 32819 + }, + { + "epoch": 18.335195530726256, + "grad_norm": 0.4035200774669647, + "learning_rate": 8.386554621848739e-05, + "loss": 0.417, + "step": 32820 + }, + { + "epoch": 18.335754189944133, + "grad_norm": 0.479237824678421, + "learning_rate": 8.383753501400561e-05, + "loss": 0.4214, + "step": 32821 + }, + { + "epoch": 18.33631284916201, + "grad_norm": 0.3408047556877136, + "learning_rate": 8.38095238095238e-05, + "loss": 0.3622, + "step": 32822 + }, + { + "epoch": 18.33687150837989, + "grad_norm": 0.42649906873703003, + "learning_rate": 8.378151260504202e-05, + "loss": 0.4341, + "step": 32823 + }, + { + "epoch": 18.337430167597766, + "grad_norm": 3.1019928455352783, + "learning_rate": 8.375350140056021e-05, + "loss": 0.5003, + "step": 32824 + }, + { + "epoch": 18.337988826815643, + "grad_norm": 0.49332132935523987, + "learning_rate": 8.372549019607843e-05, + "loss": 0.3372, + "step": 32825 + }, + { + "epoch": 18.33854748603352, + "grad_norm": 0.3751504123210907, + "learning_rate": 8.369747899159664e-05, + "loss": 0.3448, + "step": 32826 + }, + { + "epoch": 18.339106145251396, + "grad_norm": 0.4341195225715637, + "learning_rate": 8.366946778711485e-05, + "loss": 0.3913, + "step": 32827 + }, + { + "epoch": 18.339664804469272, + "grad_norm": 0.4469486474990845, + "learning_rate": 8.364145658263305e-05, + "loss": 0.4465, + "step": 32828 + }, + { + "epoch": 18.340223463687153, + "grad_norm": 1.0455799102783203, + "learning_rate": 8.361344537815126e-05, + "loss": 0.4696, + "step": 32829 + }, + { + "epoch": 18.34078212290503, + "grad_norm": 0.4647930860519409, + "learning_rate": 8.358543417366946e-05, + "loss": 0.5153, + "step": 32830 + }, + { + "epoch": 18.341340782122906, + "grad_norm": 0.39613330364227295, + "learning_rate": 8.355742296918768e-05, + "loss": 0.4409, + "step": 32831 + }, + { + "epoch": 18.341899441340782, + "grad_norm": 1.484390377998352, + "learning_rate": 8.352941176470588e-05, + "loss": 0.3064, + "step": 32832 + }, + { + "epoch": 18.34245810055866, + "grad_norm": 0.6235575675964355, + "learning_rate": 8.35014005602241e-05, + "loss": 0.467, + "step": 32833 + }, + { + "epoch": 18.343016759776535, + "grad_norm": 0.37441450357437134, + "learning_rate": 8.347338935574229e-05, + "loss": 0.3083, + "step": 32834 + }, + { + "epoch": 18.343575418994412, + "grad_norm": 0.4194738566875458, + "learning_rate": 8.34453781512605e-05, + "loss": 0.3944, + "step": 32835 + }, + { + "epoch": 18.344134078212292, + "grad_norm": 0.3580695688724518, + "learning_rate": 8.341736694677873e-05, + "loss": 0.349, + "step": 32836 + }, + { + "epoch": 18.34469273743017, + "grad_norm": 0.8038898706436157, + "learning_rate": 8.338935574229692e-05, + "loss": 0.4701, + "step": 32837 + }, + { + "epoch": 18.345251396648045, + "grad_norm": 1.6893147230148315, + "learning_rate": 8.336134453781514e-05, + "loss": 0.3396, + "step": 32838 + }, + { + "epoch": 18.345810055865922, + "grad_norm": 0.5352857708930969, + "learning_rate": 8.333333333333333e-05, + "loss": 0.4297, + "step": 32839 + }, + { + "epoch": 18.3463687150838, + "grad_norm": 0.35862624645233154, + "learning_rate": 8.330532212885155e-05, + "loss": 0.4418, + "step": 32840 + }, + { + "epoch": 18.346927374301675, + "grad_norm": 0.4197530746459961, + "learning_rate": 8.327731092436976e-05, + "loss": 0.4265, + "step": 32841 + }, + { + "epoch": 18.34748603351955, + "grad_norm": 0.3959534466266632, + "learning_rate": 8.324929971988796e-05, + "loss": 0.4129, + "step": 32842 + }, + { + "epoch": 18.34804469273743, + "grad_norm": 1.516517996788025, + "learning_rate": 8.322128851540617e-05, + "loss": 0.438, + "step": 32843 + }, + { + "epoch": 18.34860335195531, + "grad_norm": 0.3945249021053314, + "learning_rate": 8.319327731092437e-05, + "loss": 0.2983, + "step": 32844 + }, + { + "epoch": 18.349162011173185, + "grad_norm": 1.3578836917877197, + "learning_rate": 8.316526610644258e-05, + "loss": 0.3832, + "step": 32845 + }, + { + "epoch": 18.34972067039106, + "grad_norm": 0.5925694704055786, + "learning_rate": 8.313725490196079e-05, + "loss": 0.4247, + "step": 32846 + }, + { + "epoch": 18.350279329608938, + "grad_norm": 0.40185171365737915, + "learning_rate": 8.310924369747899e-05, + "loss": 0.3487, + "step": 32847 + }, + { + "epoch": 18.350837988826814, + "grad_norm": 0.34567567706108093, + "learning_rate": 8.308123249299721e-05, + "loss": 0.4123, + "step": 32848 + }, + { + "epoch": 18.35139664804469, + "grad_norm": 0.31427231431007385, + "learning_rate": 8.30532212885154e-05, + "loss": 0.3928, + "step": 32849 + }, + { + "epoch": 18.35195530726257, + "grad_norm": 0.659511148929596, + "learning_rate": 8.302521008403362e-05, + "loss": 0.4432, + "step": 32850 + }, + { + "epoch": 18.352513966480448, + "grad_norm": 6.609692573547363, + "learning_rate": 8.299719887955182e-05, + "loss": 0.371, + "step": 32851 + }, + { + "epoch": 18.353072625698324, + "grad_norm": 0.6741576194763184, + "learning_rate": 8.296918767507004e-05, + "loss": 0.354, + "step": 32852 + }, + { + "epoch": 18.3536312849162, + "grad_norm": 0.5141425132751465, + "learning_rate": 8.294117647058824e-05, + "loss": 0.3765, + "step": 32853 + }, + { + "epoch": 18.354189944134077, + "grad_norm": 0.457843154668808, + "learning_rate": 8.291316526610645e-05, + "loss": 0.3968, + "step": 32854 + }, + { + "epoch": 18.354748603351954, + "grad_norm": 1.0220999717712402, + "learning_rate": 8.288515406162465e-05, + "loss": 0.4273, + "step": 32855 + }, + { + "epoch": 18.355307262569834, + "grad_norm": 1.3391636610031128, + "learning_rate": 8.285714285714286e-05, + "loss": 0.3777, + "step": 32856 + }, + { + "epoch": 18.35586592178771, + "grad_norm": 0.43547654151916504, + "learning_rate": 8.282913165266107e-05, + "loss": 0.4361, + "step": 32857 + }, + { + "epoch": 18.356424581005587, + "grad_norm": 0.35032373666763306, + "learning_rate": 8.280112044817929e-05, + "loss": 0.3932, + "step": 32858 + }, + { + "epoch": 18.356983240223464, + "grad_norm": 1.0634431838989258, + "learning_rate": 8.277310924369748e-05, + "loss": 0.4109, + "step": 32859 + }, + { + "epoch": 18.35754189944134, + "grad_norm": 0.7619585990905762, + "learning_rate": 8.27450980392157e-05, + "loss": 0.3677, + "step": 32860 + }, + { + "epoch": 18.358100558659217, + "grad_norm": 0.47028061747550964, + "learning_rate": 8.271708683473389e-05, + "loss": 0.4321, + "step": 32861 + }, + { + "epoch": 18.358659217877094, + "grad_norm": 0.36756548285484314, + "learning_rate": 8.268907563025211e-05, + "loss": 0.3337, + "step": 32862 + }, + { + "epoch": 18.359217877094974, + "grad_norm": 0.5534183979034424, + "learning_rate": 8.266106442577032e-05, + "loss": 0.4591, + "step": 32863 + }, + { + "epoch": 18.35977653631285, + "grad_norm": 0.39359161257743835, + "learning_rate": 8.263305322128852e-05, + "loss": 0.3896, + "step": 32864 + }, + { + "epoch": 18.360335195530727, + "grad_norm": 0.35327041149139404, + "learning_rate": 8.260504201680673e-05, + "loss": 0.3101, + "step": 32865 + }, + { + "epoch": 18.360893854748603, + "grad_norm": 0.4520114064216614, + "learning_rate": 8.257703081232493e-05, + "loss": 0.4383, + "step": 32866 + }, + { + "epoch": 18.36145251396648, + "grad_norm": 3.7016406059265137, + "learning_rate": 8.254901960784314e-05, + "loss": 0.3834, + "step": 32867 + }, + { + "epoch": 18.362011173184356, + "grad_norm": 0.36280062794685364, + "learning_rate": 8.252100840336136e-05, + "loss": 0.3951, + "step": 32868 + }, + { + "epoch": 18.362569832402233, + "grad_norm": 0.36964765191078186, + "learning_rate": 8.249299719887955e-05, + "loss": 0.2614, + "step": 32869 + }, + { + "epoch": 18.363128491620113, + "grad_norm": 0.38076820969581604, + "learning_rate": 8.246498599439777e-05, + "loss": 0.2917, + "step": 32870 + }, + { + "epoch": 18.36368715083799, + "grad_norm": 0.46522969007492065, + "learning_rate": 8.243697478991596e-05, + "loss": 0.4573, + "step": 32871 + }, + { + "epoch": 18.364245810055866, + "grad_norm": Infinity, + "learning_rate": 8.243697478991596e-05, + "loss": 0.3979, + "step": 32872 + }, + { + "epoch": 18.364804469273743, + "grad_norm": 0.45378348231315613, + "learning_rate": 8.240896358543418e-05, + "loss": 0.3797, + "step": 32873 + }, + { + "epoch": 18.36536312849162, + "grad_norm": 0.39367756247520447, + "learning_rate": 8.238095238095238e-05, + "loss": 0.3964, + "step": 32874 + }, + { + "epoch": 18.365921787709496, + "grad_norm": 0.3585837781429291, + "learning_rate": 8.23529411764706e-05, + "loss": 0.373, + "step": 32875 + }, + { + "epoch": 18.366480446927373, + "grad_norm": 0.454330176115036, + "learning_rate": 8.23249299719888e-05, + "loss": 0.3201, + "step": 32876 + }, + { + "epoch": 18.367039106145253, + "grad_norm": 0.5186188220977783, + "learning_rate": 8.2296918767507e-05, + "loss": 0.3346, + "step": 32877 + }, + { + "epoch": 18.36759776536313, + "grad_norm": 0.36090168356895447, + "learning_rate": 8.226890756302521e-05, + "loss": 0.3337, + "step": 32878 + }, + { + "epoch": 18.368156424581006, + "grad_norm": 0.9182324409484863, + "learning_rate": 8.224089635854342e-05, + "loss": 0.2814, + "step": 32879 + }, + { + "epoch": 18.368715083798882, + "grad_norm": 0.3522058427333832, + "learning_rate": 8.221288515406162e-05, + "loss": 0.3593, + "step": 32880 + }, + { + "epoch": 18.36927374301676, + "grad_norm": 0.373034805059433, + "learning_rate": 8.218487394957984e-05, + "loss": 0.4735, + "step": 32881 + }, + { + "epoch": 18.369832402234636, + "grad_norm": 19.711328506469727, + "learning_rate": 8.215686274509804e-05, + "loss": 0.3152, + "step": 32882 + }, + { + "epoch": 18.370391061452516, + "grad_norm": 0.3268198072910309, + "learning_rate": 8.212885154061626e-05, + "loss": 0.3646, + "step": 32883 + }, + { + "epoch": 18.370949720670392, + "grad_norm": 0.5742087960243225, + "learning_rate": 8.210084033613445e-05, + "loss": 0.5912, + "step": 32884 + }, + { + "epoch": 18.37150837988827, + "grad_norm": 0.4913671910762787, + "learning_rate": 8.207282913165267e-05, + "loss": 0.6029, + "step": 32885 + }, + { + "epoch": 18.372067039106145, + "grad_norm": 1.0694589614868164, + "learning_rate": 8.204481792717087e-05, + "loss": 0.395, + "step": 32886 + }, + { + "epoch": 18.372625698324022, + "grad_norm": 0.4125858247280121, + "learning_rate": 8.201680672268908e-05, + "loss": 0.3673, + "step": 32887 + }, + { + "epoch": 18.3731843575419, + "grad_norm": 0.4719606637954712, + "learning_rate": 8.198879551820729e-05, + "loss": 0.3139, + "step": 32888 + }, + { + "epoch": 18.373743016759775, + "grad_norm": 0.37062758207321167, + "learning_rate": 8.196078431372549e-05, + "loss": 0.3392, + "step": 32889 + }, + { + "epoch": 18.374301675977655, + "grad_norm": 0.4029068052768707, + "learning_rate": 8.19327731092437e-05, + "loss": 0.3678, + "step": 32890 + }, + { + "epoch": 18.37486033519553, + "grad_norm": 0.39382800459861755, + "learning_rate": 8.190476190476192e-05, + "loss": 0.3985, + "step": 32891 + }, + { + "epoch": 18.37541899441341, + "grad_norm": 0.7422434091567993, + "learning_rate": 8.187675070028011e-05, + "loss": 0.3468, + "step": 32892 + }, + { + "epoch": 18.375977653631285, + "grad_norm": 0.33596086502075195, + "learning_rate": 8.184873949579833e-05, + "loss": 0.3888, + "step": 32893 + }, + { + "epoch": 18.37653631284916, + "grad_norm": 0.5770246386528015, + "learning_rate": 8.182072829131652e-05, + "loss": 0.3454, + "step": 32894 + }, + { + "epoch": 18.377094972067038, + "grad_norm": 0.5962408781051636, + "learning_rate": 8.179271708683474e-05, + "loss": 0.3737, + "step": 32895 + }, + { + "epoch": 18.377653631284915, + "grad_norm": 1.5015170574188232, + "learning_rate": 8.176470588235295e-05, + "loss": 0.4487, + "step": 32896 + }, + { + "epoch": 18.378212290502795, + "grad_norm": 0.5289201736450195, + "learning_rate": 8.173669467787115e-05, + "loss": 0.4285, + "step": 32897 + }, + { + "epoch": 18.37877094972067, + "grad_norm": 0.40539059042930603, + "learning_rate": 8.170868347338936e-05, + "loss": 0.4056, + "step": 32898 + }, + { + "epoch": 18.379329608938548, + "grad_norm": 0.44266578555107117, + "learning_rate": 8.168067226890757e-05, + "loss": 0.3546, + "step": 32899 + }, + { + "epoch": 18.379888268156424, + "grad_norm": 0.31555184721946716, + "learning_rate": 8.165266106442577e-05, + "loss": 0.3419, + "step": 32900 + }, + { + "epoch": 18.3804469273743, + "grad_norm": 0.5579817891120911, + "learning_rate": 8.162464985994398e-05, + "loss": 0.5055, + "step": 32901 + }, + { + "epoch": 18.381005586592178, + "grad_norm": 0.4791562855243683, + "learning_rate": 8.159663865546218e-05, + "loss": 0.4083, + "step": 32902 + }, + { + "epoch": 18.381564245810054, + "grad_norm": 0.4405474364757538, + "learning_rate": 8.15686274509804e-05, + "loss": 0.3747, + "step": 32903 + }, + { + "epoch": 18.382122905027934, + "grad_norm": 2.780805826187134, + "learning_rate": 8.15406162464986e-05, + "loss": 0.3304, + "step": 32904 + }, + { + "epoch": 18.38268156424581, + "grad_norm": 1.5652447938919067, + "learning_rate": 8.151260504201682e-05, + "loss": 0.3571, + "step": 32905 + }, + { + "epoch": 18.383240223463687, + "grad_norm": 1.2004978656768799, + "learning_rate": 8.148459383753501e-05, + "loss": 0.3378, + "step": 32906 + }, + { + "epoch": 18.383798882681564, + "grad_norm": 0.47375693917274475, + "learning_rate": 8.145658263305323e-05, + "loss": 0.394, + "step": 32907 + }, + { + "epoch": 18.38435754189944, + "grad_norm": 0.6289718151092529, + "learning_rate": 8.142857142857143e-05, + "loss": 0.5781, + "step": 32908 + }, + { + "epoch": 18.384916201117317, + "grad_norm": 0.6018370389938354, + "learning_rate": 8.140056022408964e-05, + "loss": 0.4072, + "step": 32909 + }, + { + "epoch": 18.385474860335197, + "grad_norm": 0.48269882798194885, + "learning_rate": 8.137254901960785e-05, + "loss": 0.412, + "step": 32910 + }, + { + "epoch": 18.386033519553074, + "grad_norm": 0.39507797360420227, + "learning_rate": 8.134453781512605e-05, + "loss": 0.338, + "step": 32911 + }, + { + "epoch": 18.38659217877095, + "grad_norm": 0.4784800112247467, + "learning_rate": 8.131652661064426e-05, + "loss": 0.4702, + "step": 32912 + }, + { + "epoch": 18.387150837988827, + "grad_norm": 0.3584432601928711, + "learning_rate": 8.128851540616248e-05, + "loss": 0.3456, + "step": 32913 + }, + { + "epoch": 18.387709497206703, + "grad_norm": 0.5144425630569458, + "learning_rate": 8.126050420168067e-05, + "loss": 0.487, + "step": 32914 + }, + { + "epoch": 18.38826815642458, + "grad_norm": 0.35907530784606934, + "learning_rate": 8.123249299719889e-05, + "loss": 0.3876, + "step": 32915 + }, + { + "epoch": 18.388826815642457, + "grad_norm": 0.38987234234809875, + "learning_rate": 8.120448179271708e-05, + "loss": 0.3807, + "step": 32916 + }, + { + "epoch": 18.389385474860337, + "grad_norm": 0.44976285099983215, + "learning_rate": 8.11764705882353e-05, + "loss": 0.2841, + "step": 32917 + }, + { + "epoch": 18.389944134078213, + "grad_norm": 1.293975830078125, + "learning_rate": 8.11484593837535e-05, + "loss": 0.4223, + "step": 32918 + }, + { + "epoch": 18.39050279329609, + "grad_norm": 0.40058234333992004, + "learning_rate": 8.112044817927171e-05, + "loss": 0.349, + "step": 32919 + }, + { + "epoch": 18.391061452513966, + "grad_norm": 0.4394546449184418, + "learning_rate": 8.109243697478992e-05, + "loss": 0.3986, + "step": 32920 + }, + { + "epoch": 18.391620111731843, + "grad_norm": 0.4801974594593048, + "learning_rate": 8.106442577030812e-05, + "loss": 0.4179, + "step": 32921 + }, + { + "epoch": 18.39217877094972, + "grad_norm": 0.41869810223579407, + "learning_rate": 8.103641456582633e-05, + "loss": 0.4264, + "step": 32922 + }, + { + "epoch": 18.392737430167596, + "grad_norm": 0.5575540661811829, + "learning_rate": 8.100840336134455e-05, + "loss": 0.3996, + "step": 32923 + }, + { + "epoch": 18.393296089385476, + "grad_norm": 0.37092867493629456, + "learning_rate": 8.098039215686274e-05, + "loss": 0.3878, + "step": 32924 + }, + { + "epoch": 18.393854748603353, + "grad_norm": 1.365338683128357, + "learning_rate": 8.095238095238096e-05, + "loss": 0.5178, + "step": 32925 + }, + { + "epoch": 18.39441340782123, + "grad_norm": 0.49847087264060974, + "learning_rate": 8.092436974789915e-05, + "loss": 0.3857, + "step": 32926 + }, + { + "epoch": 18.394972067039106, + "grad_norm": 0.36416247487068176, + "learning_rate": 8.089635854341737e-05, + "loss": 0.4178, + "step": 32927 + }, + { + "epoch": 18.395530726256982, + "grad_norm": 3.3862669467926025, + "learning_rate": 8.086834733893558e-05, + "loss": 0.3456, + "step": 32928 + }, + { + "epoch": 18.39608938547486, + "grad_norm": 0.3563072383403778, + "learning_rate": 8.084033613445379e-05, + "loss": 0.3933, + "step": 32929 + }, + { + "epoch": 18.39664804469274, + "grad_norm": 0.8796271681785583, + "learning_rate": 8.081232492997199e-05, + "loss": 0.4575, + "step": 32930 + }, + { + "epoch": 18.397206703910616, + "grad_norm": 0.5124216675758362, + "learning_rate": 8.07843137254902e-05, + "loss": 0.4773, + "step": 32931 + }, + { + "epoch": 18.397765363128492, + "grad_norm": 0.3278964161872864, + "learning_rate": 8.07563025210084e-05, + "loss": 0.2702, + "step": 32932 + }, + { + "epoch": 18.39832402234637, + "grad_norm": 0.41057077050209045, + "learning_rate": 8.072829131652661e-05, + "loss": 0.4701, + "step": 32933 + }, + { + "epoch": 18.398882681564245, + "grad_norm": 0.5461214780807495, + "learning_rate": 8.070028011204482e-05, + "loss": 0.4735, + "step": 32934 + }, + { + "epoch": 18.399441340782122, + "grad_norm": 0.4644501209259033, + "learning_rate": 8.067226890756304e-05, + "loss": 0.3917, + "step": 32935 + }, + { + "epoch": 18.4, + "grad_norm": 0.671927809715271, + "learning_rate": 8.064425770308123e-05, + "loss": 0.3263, + "step": 32936 + }, + { + "epoch": 18.40055865921788, + "grad_norm": 0.7671076655387878, + "learning_rate": 8.061624649859945e-05, + "loss": 0.4463, + "step": 32937 + }, + { + "epoch": 18.401117318435755, + "grad_norm": 0.3986946642398834, + "learning_rate": 8.058823529411764e-05, + "loss": 0.3639, + "step": 32938 + }, + { + "epoch": 18.401675977653632, + "grad_norm": 0.38581475615501404, + "learning_rate": 8.056022408963586e-05, + "loss": 0.3927, + "step": 32939 + }, + { + "epoch": 18.40223463687151, + "grad_norm": 1.1379859447479248, + "learning_rate": 8.053221288515407e-05, + "loss": 0.417, + "step": 32940 + }, + { + "epoch": 18.402793296089385, + "grad_norm": 1.1928983926773071, + "learning_rate": 8.050420168067227e-05, + "loss": 0.4742, + "step": 32941 + }, + { + "epoch": 18.40335195530726, + "grad_norm": 0.6107305884361267, + "learning_rate": 8.047619047619048e-05, + "loss": 0.4547, + "step": 32942 + }, + { + "epoch": 18.403910614525138, + "grad_norm": 0.6340111494064331, + "learning_rate": 8.044817927170868e-05, + "loss": 0.4346, + "step": 32943 + }, + { + "epoch": 18.404469273743018, + "grad_norm": 0.8544795513153076, + "learning_rate": 8.042016806722689e-05, + "loss": 0.4457, + "step": 32944 + }, + { + "epoch": 18.405027932960895, + "grad_norm": 0.5879116654396057, + "learning_rate": 8.039215686274511e-05, + "loss": 0.3773, + "step": 32945 + }, + { + "epoch": 18.40558659217877, + "grad_norm": 0.30842816829681396, + "learning_rate": 8.03641456582633e-05, + "loss": 0.3092, + "step": 32946 + }, + { + "epoch": 18.406145251396648, + "grad_norm": 0.4883110225200653, + "learning_rate": 8.033613445378152e-05, + "loss": 0.4249, + "step": 32947 + }, + { + "epoch": 18.406703910614524, + "grad_norm": 0.39143693447113037, + "learning_rate": 8.030812324929971e-05, + "loss": 0.3966, + "step": 32948 + }, + { + "epoch": 18.4072625698324, + "grad_norm": 0.7340346574783325, + "learning_rate": 8.028011204481793e-05, + "loss": 0.4919, + "step": 32949 + }, + { + "epoch": 18.407821229050278, + "grad_norm": 0.5486397743225098, + "learning_rate": 8.025210084033614e-05, + "loss": 0.4141, + "step": 32950 + }, + { + "epoch": 18.408379888268158, + "grad_norm": 0.6926745772361755, + "learning_rate": 8.022408963585435e-05, + "loss": 0.3798, + "step": 32951 + }, + { + "epoch": 18.408938547486034, + "grad_norm": 0.4745299816131592, + "learning_rate": 8.019607843137255e-05, + "loss": 0.4238, + "step": 32952 + }, + { + "epoch": 18.40949720670391, + "grad_norm": 0.6261916160583496, + "learning_rate": 8.016806722689076e-05, + "loss": 0.3885, + "step": 32953 + }, + { + "epoch": 18.410055865921787, + "grad_norm": 0.42988282442092896, + "learning_rate": 8.014005602240896e-05, + "loss": 0.4396, + "step": 32954 + }, + { + "epoch": 18.410614525139664, + "grad_norm": 0.3979518711566925, + "learning_rate": 8.011204481792718e-05, + "loss": 0.3088, + "step": 32955 + }, + { + "epoch": 18.41117318435754, + "grad_norm": 6.194914817810059, + "learning_rate": 8.008403361344538e-05, + "loss": 0.5407, + "step": 32956 + }, + { + "epoch": 18.41173184357542, + "grad_norm": 0.4199903607368469, + "learning_rate": 8.00560224089636e-05, + "loss": 0.3436, + "step": 32957 + }, + { + "epoch": 18.412290502793297, + "grad_norm": 1.6032865047454834, + "learning_rate": 8.002801120448179e-05, + "loss": 0.4505, + "step": 32958 + }, + { + "epoch": 18.412849162011174, + "grad_norm": 0.33165737986564636, + "learning_rate": 8e-05, + "loss": 0.3495, + "step": 32959 + }, + { + "epoch": 18.41340782122905, + "grad_norm": 0.3565899729728699, + "learning_rate": 7.99719887955182e-05, + "loss": 0.3981, + "step": 32960 + }, + { + "epoch": 18.413966480446927, + "grad_norm": 0.47874388098716736, + "learning_rate": 7.994397759103642e-05, + "loss": 0.4094, + "step": 32961 + }, + { + "epoch": 18.414525139664804, + "grad_norm": 0.4249212145805359, + "learning_rate": 7.991596638655462e-05, + "loss": 0.3633, + "step": 32962 + }, + { + "epoch": 18.41508379888268, + "grad_norm": 0.4846232831478119, + "learning_rate": 7.988795518207283e-05, + "loss": 0.4138, + "step": 32963 + }, + { + "epoch": 18.41564245810056, + "grad_norm": 0.5512405037879944, + "learning_rate": 7.985994397759104e-05, + "loss": 0.4146, + "step": 32964 + }, + { + "epoch": 18.416201117318437, + "grad_norm": 0.370501309633255, + "learning_rate": 7.983193277310924e-05, + "loss": 0.3038, + "step": 32965 + }, + { + "epoch": 18.416759776536313, + "grad_norm": 0.5199580788612366, + "learning_rate": 7.980392156862745e-05, + "loss": 0.3512, + "step": 32966 + }, + { + "epoch": 18.41731843575419, + "grad_norm": 1.395495057106018, + "learning_rate": 7.977591036414567e-05, + "loss": 0.4137, + "step": 32967 + }, + { + "epoch": 18.417877094972066, + "grad_norm": 0.7687472105026245, + "learning_rate": 7.974789915966386e-05, + "loss": 0.4476, + "step": 32968 + }, + { + "epoch": 18.418435754189943, + "grad_norm": 0.5083678364753723, + "learning_rate": 7.971988795518208e-05, + "loss": 0.3393, + "step": 32969 + }, + { + "epoch": 18.41899441340782, + "grad_norm": 0.3709186613559723, + "learning_rate": 7.969187675070027e-05, + "loss": 0.4297, + "step": 32970 + }, + { + "epoch": 18.4195530726257, + "grad_norm": 0.702735185623169, + "learning_rate": 7.966386554621849e-05, + "loss": 0.3291, + "step": 32971 + }, + { + "epoch": 18.420111731843576, + "grad_norm": 0.6108242869377136, + "learning_rate": 7.96358543417367e-05, + "loss": 0.582, + "step": 32972 + }, + { + "epoch": 18.420670391061453, + "grad_norm": 0.5116142630577087, + "learning_rate": 7.96078431372549e-05, + "loss": 0.4132, + "step": 32973 + }, + { + "epoch": 18.42122905027933, + "grad_norm": 0.428847998380661, + "learning_rate": 7.957983193277311e-05, + "loss": 0.3916, + "step": 32974 + }, + { + "epoch": 18.421787709497206, + "grad_norm": 0.6311320662498474, + "learning_rate": 7.955182072829132e-05, + "loss": 0.4354, + "step": 32975 + }, + { + "epoch": 18.422346368715083, + "grad_norm": 0.34464508295059204, + "learning_rate": 7.952380952380952e-05, + "loss": 0.459, + "step": 32976 + }, + { + "epoch": 18.422905027932963, + "grad_norm": 0.35673147439956665, + "learning_rate": 7.949579831932774e-05, + "loss": 0.3044, + "step": 32977 + }, + { + "epoch": 18.42346368715084, + "grad_norm": 0.6588972210884094, + "learning_rate": 7.946778711484593e-05, + "loss": 0.4345, + "step": 32978 + }, + { + "epoch": 18.424022346368716, + "grad_norm": 1.5466976165771484, + "learning_rate": 7.943977591036415e-05, + "loss": 0.4463, + "step": 32979 + }, + { + "epoch": 18.424581005586592, + "grad_norm": 0.4252294600009918, + "learning_rate": 7.941176470588235e-05, + "loss": 0.4046, + "step": 32980 + }, + { + "epoch": 18.42513966480447, + "grad_norm": 0.6188439130783081, + "learning_rate": 7.938375350140057e-05, + "loss": 0.6927, + "step": 32981 + }, + { + "epoch": 18.425698324022346, + "grad_norm": 0.43326622247695923, + "learning_rate": 7.935574229691877e-05, + "loss": 0.3682, + "step": 32982 + }, + { + "epoch": 18.426256983240222, + "grad_norm": 0.5017699599266052, + "learning_rate": 7.932773109243698e-05, + "loss": 0.3753, + "step": 32983 + }, + { + "epoch": 18.426815642458102, + "grad_norm": 1.5855077505111694, + "learning_rate": 7.929971988795518e-05, + "loss": 0.3638, + "step": 32984 + }, + { + "epoch": 18.42737430167598, + "grad_norm": 0.6004158854484558, + "learning_rate": 7.927170868347339e-05, + "loss": 0.2602, + "step": 32985 + }, + { + "epoch": 18.427932960893855, + "grad_norm": 1.3407344818115234, + "learning_rate": 7.92436974789916e-05, + "loss": 0.3408, + "step": 32986 + }, + { + "epoch": 18.428491620111732, + "grad_norm": 1.0956668853759766, + "learning_rate": 7.92156862745098e-05, + "loss": 0.5017, + "step": 32987 + }, + { + "epoch": 18.42905027932961, + "grad_norm": 3.4091012477874756, + "learning_rate": 7.918767507002801e-05, + "loss": 0.5011, + "step": 32988 + }, + { + "epoch": 18.429608938547485, + "grad_norm": 0.6315897703170776, + "learning_rate": 7.915966386554623e-05, + "loss": 0.4583, + "step": 32989 + }, + { + "epoch": 18.43016759776536, + "grad_norm": 0.6949681043624878, + "learning_rate": 7.913165266106442e-05, + "loss": 0.3621, + "step": 32990 + }, + { + "epoch": 18.43072625698324, + "grad_norm": 3.8538801670074463, + "learning_rate": 7.910364145658264e-05, + "loss": 0.4234, + "step": 32991 + }, + { + "epoch": 18.43128491620112, + "grad_norm": 1.2446885108947754, + "learning_rate": 7.907563025210083e-05, + "loss": 0.4556, + "step": 32992 + }, + { + "epoch": 18.431843575418995, + "grad_norm": 0.33871883153915405, + "learning_rate": 7.904761904761905e-05, + "loss": 0.3682, + "step": 32993 + }, + { + "epoch": 18.43240223463687, + "grad_norm": 0.8030214309692383, + "learning_rate": 7.901960784313726e-05, + "loss": 0.3618, + "step": 32994 + }, + { + "epoch": 18.432960893854748, + "grad_norm": 0.34368887543678284, + "learning_rate": 7.899159663865546e-05, + "loss": 0.3262, + "step": 32995 + }, + { + "epoch": 18.433519553072625, + "grad_norm": 1.7237881422042847, + "learning_rate": 7.896358543417367e-05, + "loss": 0.7043, + "step": 32996 + }, + { + "epoch": 18.4340782122905, + "grad_norm": 1.5003234148025513, + "learning_rate": 7.893557422969187e-05, + "loss": 0.33, + "step": 32997 + }, + { + "epoch": 18.43463687150838, + "grad_norm": 1.2776204347610474, + "learning_rate": 7.890756302521008e-05, + "loss": 0.4121, + "step": 32998 + }, + { + "epoch": 18.435195530726258, + "grad_norm": 0.5692400932312012, + "learning_rate": 7.88795518207283e-05, + "loss": 0.4963, + "step": 32999 + }, + { + "epoch": 18.435754189944134, + "grad_norm": 0.47316592931747437, + "learning_rate": 7.885154061624649e-05, + "loss": 0.6218, + "step": 33000 + }, + { + "epoch": 18.435754189944134, + "eval_cer": 0.08477627573565513, + "eval_loss": 0.3203820586204529, + "eval_runtime": 55.6542, + "eval_samples_per_second": 81.539, + "eval_steps_per_second": 5.103, + "eval_wer": 0.3347113209128237, + "step": 33000 + }, + { + "epoch": 18.43631284916201, + "grad_norm": 0.44589757919311523, + "learning_rate": 7.882352941176471e-05, + "loss": 0.3691, + "step": 33001 + }, + { + "epoch": 18.436871508379888, + "grad_norm": 0.6666038632392883, + "learning_rate": 7.87955182072829e-05, + "loss": 0.4973, + "step": 33002 + }, + { + "epoch": 18.437430167597764, + "grad_norm": 0.38396579027175903, + "learning_rate": 7.876750700280112e-05, + "loss": 0.3244, + "step": 33003 + }, + { + "epoch": 18.43798882681564, + "grad_norm": 0.39291146397590637, + "learning_rate": 7.873949579831933e-05, + "loss": 0.3583, + "step": 33004 + }, + { + "epoch": 18.43854748603352, + "grad_norm": 0.40219053626060486, + "learning_rate": 7.871148459383754e-05, + "loss": 0.353, + "step": 33005 + }, + { + "epoch": 18.439106145251397, + "grad_norm": 0.5306525230407715, + "learning_rate": 7.868347338935574e-05, + "loss": 0.5015, + "step": 33006 + }, + { + "epoch": 18.439664804469274, + "grad_norm": 0.5150411128997803, + "learning_rate": 7.865546218487395e-05, + "loss": 0.5127, + "step": 33007 + }, + { + "epoch": 18.44022346368715, + "grad_norm": 0.3518139719963074, + "learning_rate": 7.862745098039215e-05, + "loss": 0.4225, + "step": 33008 + }, + { + "epoch": 18.440782122905027, + "grad_norm": 0.49749556183815, + "learning_rate": 7.859943977591037e-05, + "loss": 0.3997, + "step": 33009 + }, + { + "epoch": 18.441340782122904, + "grad_norm": 1.749674677848816, + "learning_rate": 7.857142857142857e-05, + "loss": 0.3951, + "step": 33010 + }, + { + "epoch": 18.441899441340784, + "grad_norm": 0.5609121918678284, + "learning_rate": 7.854341736694679e-05, + "loss": 0.4742, + "step": 33011 + }, + { + "epoch": 18.44245810055866, + "grad_norm": 0.37346526980400085, + "learning_rate": 7.851540616246498e-05, + "loss": 0.3268, + "step": 33012 + }, + { + "epoch": 18.443016759776537, + "grad_norm": 0.6323011517524719, + "learning_rate": 7.84873949579832e-05, + "loss": 0.5413, + "step": 33013 + }, + { + "epoch": 18.443575418994413, + "grad_norm": 0.7415624856948853, + "learning_rate": 7.845938375350139e-05, + "loss": 0.5697, + "step": 33014 + }, + { + "epoch": 18.44413407821229, + "grad_norm": 0.4007977247238159, + "learning_rate": 7.843137254901961e-05, + "loss": 0.4791, + "step": 33015 + }, + { + "epoch": 18.444692737430167, + "grad_norm": 0.5531507134437561, + "learning_rate": 7.840336134453782e-05, + "loss": 0.4185, + "step": 33016 + }, + { + "epoch": 18.445251396648043, + "grad_norm": 0.4146111011505127, + "learning_rate": 7.837535014005602e-05, + "loss": 0.3786, + "step": 33017 + }, + { + "epoch": 18.445810055865923, + "grad_norm": 2.5037317276000977, + "learning_rate": 7.834733893557423e-05, + "loss": 0.4907, + "step": 33018 + }, + { + "epoch": 18.4463687150838, + "grad_norm": 0.3815475106239319, + "learning_rate": 7.831932773109243e-05, + "loss": 0.4057, + "step": 33019 + }, + { + "epoch": 18.446927374301676, + "grad_norm": 0.34028899669647217, + "learning_rate": 7.829131652661064e-05, + "loss": 0.3928, + "step": 33020 + }, + { + "epoch": 18.447486033519553, + "grad_norm": 0.3826596736907959, + "learning_rate": 7.826330532212886e-05, + "loss": 0.3682, + "step": 33021 + }, + { + "epoch": 18.44804469273743, + "grad_norm": 0.44373390078544617, + "learning_rate": 7.823529411764705e-05, + "loss": 0.544, + "step": 33022 + }, + { + "epoch": 18.448603351955306, + "grad_norm": 0.3561420440673828, + "learning_rate": 7.820728291316527e-05, + "loss": 0.3207, + "step": 33023 + }, + { + "epoch": 18.449162011173183, + "grad_norm": 0.5077047944068909, + "learning_rate": 7.817927170868346e-05, + "loss": 0.399, + "step": 33024 + }, + { + "epoch": 18.449720670391063, + "grad_norm": 0.49721693992614746, + "learning_rate": 7.815126050420168e-05, + "loss": 0.4016, + "step": 33025 + }, + { + "epoch": 18.45027932960894, + "grad_norm": 1.33882474899292, + "learning_rate": 7.812324929971989e-05, + "loss": 0.3393, + "step": 33026 + }, + { + "epoch": 18.450837988826816, + "grad_norm": 0.4576573073863983, + "learning_rate": 7.80952380952381e-05, + "loss": 0.4236, + "step": 33027 + }, + { + "epoch": 18.451396648044692, + "grad_norm": 1.5567392110824585, + "learning_rate": 7.80672268907563e-05, + "loss": 0.4699, + "step": 33028 + }, + { + "epoch": 18.45195530726257, + "grad_norm": 0.5081969499588013, + "learning_rate": 7.803921568627451e-05, + "loss": 0.3925, + "step": 33029 + }, + { + "epoch": 18.452513966480446, + "grad_norm": 0.47761791944503784, + "learning_rate": 7.801120448179271e-05, + "loss": 0.3574, + "step": 33030 + }, + { + "epoch": 18.453072625698326, + "grad_norm": 0.6030970215797424, + "learning_rate": 7.798319327731093e-05, + "loss": 0.3416, + "step": 33031 + }, + { + "epoch": 18.453631284916202, + "grad_norm": 0.44586479663848877, + "learning_rate": 7.795518207282913e-05, + "loss": 0.3261, + "step": 33032 + }, + { + "epoch": 18.45418994413408, + "grad_norm": 0.4216799736022949, + "learning_rate": 7.792717086834735e-05, + "loss": 0.4234, + "step": 33033 + }, + { + "epoch": 18.454748603351955, + "grad_norm": 0.30692920088768005, + "learning_rate": 7.789915966386554e-05, + "loss": 0.3775, + "step": 33034 + }, + { + "epoch": 18.455307262569832, + "grad_norm": 0.45264431834220886, + "learning_rate": 7.787114845938376e-05, + "loss": 0.4657, + "step": 33035 + }, + { + "epoch": 18.45586592178771, + "grad_norm": 0.4765605330467224, + "learning_rate": 7.784313725490196e-05, + "loss": 0.5099, + "step": 33036 + }, + { + "epoch": 18.456424581005585, + "grad_norm": 0.540405809879303, + "learning_rate": 7.781512605042017e-05, + "loss": 0.4369, + "step": 33037 + }, + { + "epoch": 18.456983240223465, + "grad_norm": 0.5709408521652222, + "learning_rate": 7.778711484593837e-05, + "loss": 0.4126, + "step": 33038 + }, + { + "epoch": 18.457541899441342, + "grad_norm": 0.48756682872772217, + "learning_rate": 7.775910364145658e-05, + "loss": 0.4706, + "step": 33039 + }, + { + "epoch": 18.45810055865922, + "grad_norm": 0.37175390124320984, + "learning_rate": 7.773109243697479e-05, + "loss": 0.3476, + "step": 33040 + }, + { + "epoch": 18.458659217877095, + "grad_norm": 0.7057384848594666, + "learning_rate": 7.770308123249299e-05, + "loss": 0.4767, + "step": 33041 + }, + { + "epoch": 18.45921787709497, + "grad_norm": 1.2787559032440186, + "learning_rate": 7.76750700280112e-05, + "loss": 0.5325, + "step": 33042 + }, + { + "epoch": 18.459776536312848, + "grad_norm": 0.47130322456359863, + "learning_rate": 7.764705882352942e-05, + "loss": 0.3885, + "step": 33043 + }, + { + "epoch": 18.460335195530725, + "grad_norm": 0.4600951671600342, + "learning_rate": 7.761904761904761e-05, + "loss": 0.3801, + "step": 33044 + }, + { + "epoch": 18.460893854748605, + "grad_norm": 0.5492141842842102, + "learning_rate": 7.759103641456583e-05, + "loss": 0.4805, + "step": 33045 + }, + { + "epoch": 18.46145251396648, + "grad_norm": 0.8448355197906494, + "learning_rate": 7.756302521008402e-05, + "loss": 0.3472, + "step": 33046 + }, + { + "epoch": 18.462011173184358, + "grad_norm": 0.32858195900917053, + "learning_rate": 7.753501400560224e-05, + "loss": 0.3936, + "step": 33047 + }, + { + "epoch": 18.462569832402234, + "grad_norm": 1.110443353652954, + "learning_rate": 7.750700280112045e-05, + "loss": 0.4981, + "step": 33048 + }, + { + "epoch": 18.46312849162011, + "grad_norm": 2.133528709411621, + "learning_rate": 7.747899159663865e-05, + "loss": 0.4248, + "step": 33049 + }, + { + "epoch": 18.463687150837988, + "grad_norm": 0.81072998046875, + "learning_rate": 7.745098039215686e-05, + "loss": 0.4564, + "step": 33050 + }, + { + "epoch": 18.464245810055864, + "grad_norm": 1.1348986625671387, + "learning_rate": 7.742296918767507e-05, + "loss": 0.3962, + "step": 33051 + }, + { + "epoch": 18.464804469273744, + "grad_norm": 0.8548154830932617, + "learning_rate": 7.739495798319327e-05, + "loss": 0.3693, + "step": 33052 + }, + { + "epoch": 18.46536312849162, + "grad_norm": 0.3434957265853882, + "learning_rate": 7.736694677871149e-05, + "loss": 0.3647, + "step": 33053 + }, + { + "epoch": 18.465921787709497, + "grad_norm": 0.3319074809551239, + "learning_rate": 7.733893557422968e-05, + "loss": 0.3884, + "step": 33054 + }, + { + "epoch": 18.466480446927374, + "grad_norm": 0.5831570625305176, + "learning_rate": 7.73109243697479e-05, + "loss": 0.5707, + "step": 33055 + }, + { + "epoch": 18.46703910614525, + "grad_norm": 0.5149168372154236, + "learning_rate": 7.72829131652661e-05, + "loss": 0.3985, + "step": 33056 + }, + { + "epoch": 18.467597765363127, + "grad_norm": 0.5399753451347351, + "learning_rate": 7.725490196078432e-05, + "loss": 0.3828, + "step": 33057 + }, + { + "epoch": 18.468156424581007, + "grad_norm": 0.42691951990127563, + "learning_rate": 7.722689075630252e-05, + "loss": 0.3773, + "step": 33058 + }, + { + "epoch": 18.468715083798884, + "grad_norm": 1.6134134531021118, + "learning_rate": 7.719887955182073e-05, + "loss": 0.3723, + "step": 33059 + }, + { + "epoch": 18.46927374301676, + "grad_norm": 0.5129427313804626, + "learning_rate": 7.717086834733893e-05, + "loss": 0.4191, + "step": 33060 + }, + { + "epoch": 18.469832402234637, + "grad_norm": 0.4188036620616913, + "learning_rate": 7.714285714285714e-05, + "loss": 0.3696, + "step": 33061 + }, + { + "epoch": 18.470391061452514, + "grad_norm": 0.3631969392299652, + "learning_rate": 7.711484593837535e-05, + "loss": 0.3723, + "step": 33062 + }, + { + "epoch": 18.47094972067039, + "grad_norm": 0.4793473184108734, + "learning_rate": 7.708683473389357e-05, + "loss": 0.5606, + "step": 33063 + }, + { + "epoch": 18.471508379888267, + "grad_norm": 0.6509886980056763, + "learning_rate": 7.705882352941176e-05, + "loss": 0.312, + "step": 33064 + }, + { + "epoch": 18.472067039106147, + "grad_norm": 0.758695662021637, + "learning_rate": 7.703081232492998e-05, + "loss": 0.721, + "step": 33065 + }, + { + "epoch": 18.472625698324023, + "grad_norm": 0.37205594778060913, + "learning_rate": 7.700280112044817e-05, + "loss": 0.3416, + "step": 33066 + }, + { + "epoch": 18.4731843575419, + "grad_norm": 0.3956359326839447, + "learning_rate": 7.697478991596639e-05, + "loss": 0.4396, + "step": 33067 + }, + { + "epoch": 18.473743016759776, + "grad_norm": 0.5734394788742065, + "learning_rate": 7.694677871148458e-05, + "loss": 0.4488, + "step": 33068 + }, + { + "epoch": 18.474301675977653, + "grad_norm": 0.4282195270061493, + "learning_rate": 7.69187675070028e-05, + "loss": 0.3303, + "step": 33069 + }, + { + "epoch": 18.47486033519553, + "grad_norm": 0.39459228515625, + "learning_rate": 7.689075630252102e-05, + "loss": 0.3917, + "step": 33070 + }, + { + "epoch": 18.475418994413406, + "grad_norm": 0.4824240505695343, + "learning_rate": 7.686274509803921e-05, + "loss": 0.5014, + "step": 33071 + }, + { + "epoch": 18.475977653631286, + "grad_norm": 0.3895677328109741, + "learning_rate": 7.683473389355743e-05, + "loss": 0.3382, + "step": 33072 + }, + { + "epoch": 18.476536312849163, + "grad_norm": 0.42169928550720215, + "learning_rate": 7.680672268907563e-05, + "loss": 0.436, + "step": 33073 + }, + { + "epoch": 18.47709497206704, + "grad_norm": 0.37625226378440857, + "learning_rate": 7.677871148459385e-05, + "loss": 0.3906, + "step": 33074 + }, + { + "epoch": 18.477653631284916, + "grad_norm": 0.4884724020957947, + "learning_rate": 7.675070028011205e-05, + "loss": 0.398, + "step": 33075 + }, + { + "epoch": 18.478212290502793, + "grad_norm": 0.9212857484817505, + "learning_rate": 7.672268907563026e-05, + "loss": 0.3593, + "step": 33076 + }, + { + "epoch": 18.47877094972067, + "grad_norm": 0.732638955116272, + "learning_rate": 7.669467787114846e-05, + "loss": 0.3779, + "step": 33077 + }, + { + "epoch": 18.47932960893855, + "grad_norm": 0.3234267830848694, + "learning_rate": 7.666666666666667e-05, + "loss": 0.44, + "step": 33078 + }, + { + "epoch": 18.479888268156426, + "grad_norm": 0.9365727305412292, + "learning_rate": 7.663865546218487e-05, + "loss": 0.3489, + "step": 33079 + }, + { + "epoch": 18.480446927374302, + "grad_norm": 0.3906329870223999, + "learning_rate": 7.66106442577031e-05, + "loss": 0.3596, + "step": 33080 + }, + { + "epoch": 18.48100558659218, + "grad_norm": 0.37238189578056335, + "learning_rate": 7.658263305322129e-05, + "loss": 0.3963, + "step": 33081 + }, + { + "epoch": 18.481564245810056, + "grad_norm": 0.3529389798641205, + "learning_rate": 7.65546218487395e-05, + "loss": 0.3651, + "step": 33082 + }, + { + "epoch": 18.482122905027932, + "grad_norm": 0.5095834732055664, + "learning_rate": 7.65266106442577e-05, + "loss": 0.411, + "step": 33083 + }, + { + "epoch": 18.48268156424581, + "grad_norm": 0.46512576937675476, + "learning_rate": 7.649859943977592e-05, + "loss": 0.5025, + "step": 33084 + }, + { + "epoch": 18.48324022346369, + "grad_norm": 0.5901860594749451, + "learning_rate": 7.647058823529412e-05, + "loss": 0.3328, + "step": 33085 + }, + { + "epoch": 18.483798882681565, + "grad_norm": 0.8037012815475464, + "learning_rate": 7.644257703081233e-05, + "loss": 0.5664, + "step": 33086 + }, + { + "epoch": 18.484357541899442, + "grad_norm": 0.4427291750907898, + "learning_rate": 7.641456582633054e-05, + "loss": 0.3422, + "step": 33087 + }, + { + "epoch": 18.48491620111732, + "grad_norm": 0.8680294156074524, + "learning_rate": 7.638655462184874e-05, + "loss": 0.3653, + "step": 33088 + }, + { + "epoch": 18.485474860335195, + "grad_norm": 1.4251518249511719, + "learning_rate": 7.635854341736695e-05, + "loss": 0.3637, + "step": 33089 + }, + { + "epoch": 18.48603351955307, + "grad_norm": 0.5643889904022217, + "learning_rate": 7.633053221288517e-05, + "loss": 0.4147, + "step": 33090 + }, + { + "epoch": 18.486592178770948, + "grad_norm": 1.5160716772079468, + "learning_rate": 7.630252100840336e-05, + "loss": 0.3667, + "step": 33091 + }, + { + "epoch": 18.48715083798883, + "grad_norm": 0.5278897881507874, + "learning_rate": 7.627450980392158e-05, + "loss": 0.4094, + "step": 33092 + }, + { + "epoch": 18.487709497206705, + "grad_norm": 0.5631977915763855, + "learning_rate": 7.624649859943977e-05, + "loss": 0.3746, + "step": 33093 + }, + { + "epoch": 18.48826815642458, + "grad_norm": 0.4107406735420227, + "learning_rate": 7.621848739495799e-05, + "loss": 0.3953, + "step": 33094 + }, + { + "epoch": 18.488826815642458, + "grad_norm": 0.5018259286880493, + "learning_rate": 7.61904761904762e-05, + "loss": 0.4387, + "step": 33095 + }, + { + "epoch": 18.489385474860335, + "grad_norm": 0.37330323457717896, + "learning_rate": 7.61624649859944e-05, + "loss": 0.3784, + "step": 33096 + }, + { + "epoch": 18.48994413407821, + "grad_norm": 0.3837027847766876, + "learning_rate": 7.613445378151261e-05, + "loss": 0.3666, + "step": 33097 + }, + { + "epoch": 18.490502793296088, + "grad_norm": 1.465105414390564, + "learning_rate": 7.610644257703082e-05, + "loss": 0.3488, + "step": 33098 + }, + { + "epoch": 18.491061452513968, + "grad_norm": 0.44824424386024475, + "learning_rate": 7.607843137254902e-05, + "loss": 0.4173, + "step": 33099 + }, + { + "epoch": 18.491620111731844, + "grad_norm": 0.3605848550796509, + "learning_rate": 7.605042016806723e-05, + "loss": 0.3523, + "step": 33100 + }, + { + "epoch": 18.49217877094972, + "grad_norm": 0.7937778830528259, + "learning_rate": 7.602240896358543e-05, + "loss": 0.3825, + "step": 33101 + }, + { + "epoch": 18.492737430167598, + "grad_norm": 0.6326472759246826, + "learning_rate": 7.599439775910365e-05, + "loss": 0.3843, + "step": 33102 + }, + { + "epoch": 18.493296089385474, + "grad_norm": 0.6514365077018738, + "learning_rate": 7.596638655462185e-05, + "loss": 0.3958, + "step": 33103 + }, + { + "epoch": 18.49385474860335, + "grad_norm": 0.9572352766990662, + "learning_rate": 7.593837535014007e-05, + "loss": 0.3022, + "step": 33104 + }, + { + "epoch": 18.49441340782123, + "grad_norm": 0.325369656085968, + "learning_rate": 7.591036414565826e-05, + "loss": 0.3368, + "step": 33105 + }, + { + "epoch": 18.494972067039107, + "grad_norm": 0.5275483727455139, + "learning_rate": 7.588235294117648e-05, + "loss": 0.3703, + "step": 33106 + }, + { + "epoch": 18.495530726256984, + "grad_norm": 0.4663551151752472, + "learning_rate": 7.585434173669468e-05, + "loss": 0.3267, + "step": 33107 + }, + { + "epoch": 18.49608938547486, + "grad_norm": 1.1299080848693848, + "learning_rate": 7.582633053221289e-05, + "loss": 0.4904, + "step": 33108 + }, + { + "epoch": 18.496648044692737, + "grad_norm": 0.4735184609889984, + "learning_rate": 7.57983193277311e-05, + "loss": 0.4935, + "step": 33109 + }, + { + "epoch": 18.497206703910614, + "grad_norm": 0.40381255745887756, + "learning_rate": 7.57703081232493e-05, + "loss": 0.45, + "step": 33110 + }, + { + "epoch": 18.49776536312849, + "grad_norm": 0.49893704056739807, + "learning_rate": 7.574229691876751e-05, + "loss": 0.4343, + "step": 33111 + }, + { + "epoch": 18.49832402234637, + "grad_norm": 0.3621094822883606, + "learning_rate": 7.571428571428573e-05, + "loss": 0.3422, + "step": 33112 + }, + { + "epoch": 18.498882681564247, + "grad_norm": 0.5491006970405579, + "learning_rate": 7.568627450980392e-05, + "loss": 0.3803, + "step": 33113 + }, + { + "epoch": 18.499441340782123, + "grad_norm": 0.4539957642555237, + "learning_rate": 7.565826330532214e-05, + "loss": 0.421, + "step": 33114 + }, + { + "epoch": 18.5, + "grad_norm": 0.47944334149360657, + "learning_rate": 7.563025210084033e-05, + "loss": 0.4285, + "step": 33115 + }, + { + "epoch": 18.500558659217877, + "grad_norm": 0.5856572389602661, + "learning_rate": 7.560224089635855e-05, + "loss": 0.3584, + "step": 33116 + }, + { + "epoch": 18.501117318435753, + "grad_norm": 0.5330870747566223, + "learning_rate": 7.557422969187676e-05, + "loss": 0.529, + "step": 33117 + }, + { + "epoch": 18.50167597765363, + "grad_norm": 0.4092240631580353, + "learning_rate": 7.554621848739496e-05, + "loss": 0.3492, + "step": 33118 + }, + { + "epoch": 18.50223463687151, + "grad_norm": 0.7180306315422058, + "learning_rate": 7.551820728291317e-05, + "loss": 0.3424, + "step": 33119 + }, + { + "epoch": 18.502793296089386, + "grad_norm": 0.38659730553627014, + "learning_rate": 7.549019607843137e-05, + "loss": 0.3772, + "step": 33120 + }, + { + "epoch": 18.503351955307263, + "grad_norm": 0.36830198764801025, + "learning_rate": 7.546218487394958e-05, + "loss": 0.396, + "step": 33121 + }, + { + "epoch": 18.50391061452514, + "grad_norm": 0.7057275772094727, + "learning_rate": 7.54341736694678e-05, + "loss": 0.3826, + "step": 33122 + }, + { + "epoch": 18.504469273743016, + "grad_norm": 0.8301900625228882, + "learning_rate": 7.540616246498599e-05, + "loss": 0.3786, + "step": 33123 + }, + { + "epoch": 18.505027932960893, + "grad_norm": 0.4118098020553589, + "learning_rate": 7.537815126050421e-05, + "loss": 0.361, + "step": 33124 + }, + { + "epoch": 18.505586592178773, + "grad_norm": 1.040213704109192, + "learning_rate": 7.53501400560224e-05, + "loss": 0.3848, + "step": 33125 + }, + { + "epoch": 18.50614525139665, + "grad_norm": 0.8100784420967102, + "learning_rate": 7.532212885154062e-05, + "loss": 0.3111, + "step": 33126 + }, + { + "epoch": 18.506703910614526, + "grad_norm": 0.8182530403137207, + "learning_rate": 7.529411764705882e-05, + "loss": 0.3934, + "step": 33127 + }, + { + "epoch": 18.507262569832402, + "grad_norm": 0.3644542098045349, + "learning_rate": 7.526610644257704e-05, + "loss": 0.3501, + "step": 33128 + }, + { + "epoch": 18.50782122905028, + "grad_norm": 0.517069935798645, + "learning_rate": 7.523809523809524e-05, + "loss": 0.41, + "step": 33129 + }, + { + "epoch": 18.508379888268156, + "grad_norm": 0.46252840757369995, + "learning_rate": 7.521008403361345e-05, + "loss": 0.4326, + "step": 33130 + }, + { + "epoch": 18.508938547486032, + "grad_norm": 0.36550891399383545, + "learning_rate": 7.518207282913165e-05, + "loss": 0.4662, + "step": 33131 + }, + { + "epoch": 18.509497206703912, + "grad_norm": 0.6734106540679932, + "learning_rate": 7.515406162464986e-05, + "loss": 0.3488, + "step": 33132 + }, + { + "epoch": 18.51005586592179, + "grad_norm": 0.5024005174636841, + "learning_rate": 7.512605042016807e-05, + "loss": 0.4461, + "step": 33133 + }, + { + "epoch": 18.510614525139665, + "grad_norm": 0.45493632555007935, + "learning_rate": 7.509803921568629e-05, + "loss": 0.4329, + "step": 33134 + }, + { + "epoch": 18.511173184357542, + "grad_norm": 0.3843703866004944, + "learning_rate": 7.507002801120448e-05, + "loss": 0.3793, + "step": 33135 + }, + { + "epoch": 18.51173184357542, + "grad_norm": 0.3260156810283661, + "learning_rate": 7.50420168067227e-05, + "loss": 0.3111, + "step": 33136 + }, + { + "epoch": 18.512290502793295, + "grad_norm": 0.4309711158275604, + "learning_rate": 7.501400560224089e-05, + "loss": 0.4148, + "step": 33137 + }, + { + "epoch": 18.51284916201117, + "grad_norm": 0.42396098375320435, + "learning_rate": 7.498599439775911e-05, + "loss": 0.4683, + "step": 33138 + }, + { + "epoch": 18.513407821229052, + "grad_norm": 0.5131839513778687, + "learning_rate": 7.495798319327732e-05, + "loss": 0.5678, + "step": 33139 + }, + { + "epoch": 18.51396648044693, + "grad_norm": 0.7215701341629028, + "learning_rate": 7.492997198879552e-05, + "loss": 0.4591, + "step": 33140 + }, + { + "epoch": 18.514525139664805, + "grad_norm": 0.36599570512771606, + "learning_rate": 7.490196078431373e-05, + "loss": 0.3928, + "step": 33141 + }, + { + "epoch": 18.51508379888268, + "grad_norm": 0.5501282811164856, + "learning_rate": 7.487394957983193e-05, + "loss": 0.4272, + "step": 33142 + }, + { + "epoch": 18.515642458100558, + "grad_norm": 0.44552475214004517, + "learning_rate": 7.484593837535014e-05, + "loss": 0.3913, + "step": 33143 + }, + { + "epoch": 18.516201117318435, + "grad_norm": 0.7973290681838989, + "learning_rate": 7.481792717086836e-05, + "loss": 0.4686, + "step": 33144 + }, + { + "epoch": 18.51675977653631, + "grad_norm": 0.5320394039154053, + "learning_rate": 7.478991596638655e-05, + "loss": 0.382, + "step": 33145 + }, + { + "epoch": 18.51731843575419, + "grad_norm": 0.8162189722061157, + "learning_rate": 7.476190476190477e-05, + "loss": 0.4688, + "step": 33146 + }, + { + "epoch": 18.517877094972068, + "grad_norm": 0.341412216424942, + "learning_rate": 7.473389355742296e-05, + "loss": 0.377, + "step": 33147 + }, + { + "epoch": 18.518435754189944, + "grad_norm": 1.8213880062103271, + "learning_rate": 7.470588235294118e-05, + "loss": 0.4102, + "step": 33148 + }, + { + "epoch": 18.51899441340782, + "grad_norm": 0.46000349521636963, + "learning_rate": 7.467787114845939e-05, + "loss": 0.4041, + "step": 33149 + }, + { + "epoch": 18.519553072625698, + "grad_norm": 0.4847964644432068, + "learning_rate": 7.46498599439776e-05, + "loss": 0.2782, + "step": 33150 + }, + { + "epoch": 18.520111731843574, + "grad_norm": 1.4006507396697998, + "learning_rate": 7.46218487394958e-05, + "loss": 0.2997, + "step": 33151 + }, + { + "epoch": 18.52067039106145, + "grad_norm": 0.3633391261100769, + "learning_rate": 7.459383753501401e-05, + "loss": 0.3576, + "step": 33152 + }, + { + "epoch": 18.52122905027933, + "grad_norm": 0.41202208399772644, + "learning_rate": 7.456582633053221e-05, + "loss": 0.3238, + "step": 33153 + }, + { + "epoch": 18.521787709497207, + "grad_norm": 2.5322020053863525, + "learning_rate": 7.453781512605042e-05, + "loss": 0.4036, + "step": 33154 + }, + { + "epoch": 18.522346368715084, + "grad_norm": 0.45762449502944946, + "learning_rate": 7.450980392156863e-05, + "loss": 0.3718, + "step": 33155 + }, + { + "epoch": 18.52290502793296, + "grad_norm": 0.6414396166801453, + "learning_rate": 7.448179271708684e-05, + "loss": 0.3802, + "step": 33156 + }, + { + "epoch": 18.523463687150837, + "grad_norm": 0.5207961797714233, + "learning_rate": 7.445378151260504e-05, + "loss": 0.3864, + "step": 33157 + }, + { + "epoch": 18.524022346368714, + "grad_norm": 0.6475685238838196, + "learning_rate": 7.442577030812326e-05, + "loss": 0.429, + "step": 33158 + }, + { + "epoch": 18.524581005586594, + "grad_norm": 2.0297670364379883, + "learning_rate": 7.439775910364145e-05, + "loss": 0.4424, + "step": 33159 + }, + { + "epoch": 18.52513966480447, + "grad_norm": 0.3975844979286194, + "learning_rate": 7.436974789915967e-05, + "loss": 0.3114, + "step": 33160 + }, + { + "epoch": 18.525698324022347, + "grad_norm": 0.8499632477760315, + "learning_rate": 7.434173669467787e-05, + "loss": 0.4036, + "step": 33161 + }, + { + "epoch": 18.526256983240224, + "grad_norm": 2.8143064975738525, + "learning_rate": 7.431372549019608e-05, + "loss": 0.3653, + "step": 33162 + }, + { + "epoch": 18.5268156424581, + "grad_norm": 0.42966195940971375, + "learning_rate": 7.428571428571429e-05, + "loss": 0.3744, + "step": 33163 + }, + { + "epoch": 18.527374301675977, + "grad_norm": 2.403418779373169, + "learning_rate": 7.425770308123249e-05, + "loss": 0.4275, + "step": 33164 + }, + { + "epoch": 18.527932960893853, + "grad_norm": 0.4748799204826355, + "learning_rate": 7.42296918767507e-05, + "loss": 0.473, + "step": 33165 + }, + { + "epoch": 18.528491620111733, + "grad_norm": 0.42772579193115234, + "learning_rate": 7.420168067226892e-05, + "loss": 0.2686, + "step": 33166 + }, + { + "epoch": 18.52905027932961, + "grad_norm": 0.32804208993911743, + "learning_rate": 7.417366946778711e-05, + "loss": 0.389, + "step": 33167 + }, + { + "epoch": 18.529608938547486, + "grad_norm": 0.5379674434661865, + "learning_rate": 7.414565826330533e-05, + "loss": 0.5156, + "step": 33168 + }, + { + "epoch": 18.530167597765363, + "grad_norm": 0.4180110692977905, + "learning_rate": 7.411764705882352e-05, + "loss": 0.4637, + "step": 33169 + }, + { + "epoch": 18.53072625698324, + "grad_norm": 0.5903040170669556, + "learning_rate": 7.408963585434174e-05, + "loss": 0.4076, + "step": 33170 + }, + { + "epoch": 18.531284916201116, + "grad_norm": 0.5031701326370239, + "learning_rate": 7.406162464985995e-05, + "loss": 0.4007, + "step": 33171 + }, + { + "epoch": 18.531843575418993, + "grad_norm": 0.4165465235710144, + "learning_rate": 7.403361344537815e-05, + "loss": 0.3814, + "step": 33172 + }, + { + "epoch": 18.532402234636873, + "grad_norm": 0.4524424970149994, + "learning_rate": 7.400560224089636e-05, + "loss": 0.4105, + "step": 33173 + }, + { + "epoch": 18.53296089385475, + "grad_norm": 1.2550581693649292, + "learning_rate": 7.397759103641457e-05, + "loss": 0.4557, + "step": 33174 + }, + { + "epoch": 18.533519553072626, + "grad_norm": 2.22735333442688, + "learning_rate": 7.394957983193277e-05, + "loss": 0.3991, + "step": 33175 + }, + { + "epoch": 18.534078212290503, + "grad_norm": 0.5408685803413391, + "learning_rate": 7.392156862745099e-05, + "loss": 0.4748, + "step": 33176 + }, + { + "epoch": 18.53463687150838, + "grad_norm": 0.4846052825450897, + "learning_rate": 7.389355742296918e-05, + "loss": 0.232, + "step": 33177 + }, + { + "epoch": 18.535195530726256, + "grad_norm": 3.0756566524505615, + "learning_rate": 7.38655462184874e-05, + "loss": 0.4665, + "step": 33178 + }, + { + "epoch": 18.535754189944136, + "grad_norm": 0.3997471034526825, + "learning_rate": 7.38375350140056e-05, + "loss": 0.338, + "step": 33179 + }, + { + "epoch": 18.536312849162012, + "grad_norm": 0.3297956585884094, + "learning_rate": 7.380952380952382e-05, + "loss": 0.4415, + "step": 33180 + }, + { + "epoch": 18.53687150837989, + "grad_norm": 0.8906546235084534, + "learning_rate": 7.378151260504201e-05, + "loss": 0.4183, + "step": 33181 + }, + { + "epoch": 18.537430167597766, + "grad_norm": 0.46344131231307983, + "learning_rate": 7.375350140056023e-05, + "loss": 0.3753, + "step": 33182 + }, + { + "epoch": 18.537988826815642, + "grad_norm": 0.41927555203437805, + "learning_rate": 7.372549019607843e-05, + "loss": 0.4176, + "step": 33183 + }, + { + "epoch": 18.53854748603352, + "grad_norm": 0.5486651062965393, + "learning_rate": 7.369747899159664e-05, + "loss": 0.4219, + "step": 33184 + }, + { + "epoch": 18.539106145251395, + "grad_norm": 0.5035902857780457, + "learning_rate": 7.366946778711485e-05, + "loss": 0.4534, + "step": 33185 + }, + { + "epoch": 18.539664804469275, + "grad_norm": 1.6768066883087158, + "learning_rate": 7.364145658263305e-05, + "loss": 0.629, + "step": 33186 + }, + { + "epoch": 18.540223463687152, + "grad_norm": 2.1856179237365723, + "learning_rate": 7.361344537815126e-05, + "loss": 0.3252, + "step": 33187 + }, + { + "epoch": 18.54078212290503, + "grad_norm": 0.4666445255279541, + "learning_rate": 7.358543417366948e-05, + "loss": 0.3478, + "step": 33188 + }, + { + "epoch": 18.541340782122905, + "grad_norm": 0.543511152267456, + "learning_rate": 7.355742296918767e-05, + "loss": 0.538, + "step": 33189 + }, + { + "epoch": 18.54189944134078, + "grad_norm": 0.6718709468841553, + "learning_rate": 7.352941176470589e-05, + "loss": 0.4193, + "step": 33190 + }, + { + "epoch": 18.542458100558658, + "grad_norm": 0.2732716500759125, + "learning_rate": 7.350140056022408e-05, + "loss": 0.3295, + "step": 33191 + }, + { + "epoch": 18.543016759776535, + "grad_norm": 0.35757744312286377, + "learning_rate": 7.34733893557423e-05, + "loss": 0.3599, + "step": 33192 + }, + { + "epoch": 18.543575418994415, + "grad_norm": 0.4452117085456848, + "learning_rate": 7.344537815126051e-05, + "loss": 0.3975, + "step": 33193 + }, + { + "epoch": 18.54413407821229, + "grad_norm": 0.4574030339717865, + "learning_rate": 7.341736694677871e-05, + "loss": 0.5115, + "step": 33194 + }, + { + "epoch": 18.544692737430168, + "grad_norm": 0.588256299495697, + "learning_rate": 7.338935574229692e-05, + "loss": 0.5974, + "step": 33195 + }, + { + "epoch": 18.545251396648045, + "grad_norm": 0.56131911277771, + "learning_rate": 7.336134453781513e-05, + "loss": 0.3411, + "step": 33196 + }, + { + "epoch": 18.54581005586592, + "grad_norm": 1.006944179534912, + "learning_rate": 7.333333333333333e-05, + "loss": 0.4188, + "step": 33197 + }, + { + "epoch": 18.546368715083798, + "grad_norm": 0.8523073196411133, + "learning_rate": 7.330532212885155e-05, + "loss": 0.4112, + "step": 33198 + }, + { + "epoch": 18.546927374301674, + "grad_norm": 1.447073221206665, + "learning_rate": 7.327731092436974e-05, + "loss": 0.4438, + "step": 33199 + }, + { + "epoch": 18.547486033519554, + "grad_norm": 0.3902038335800171, + "learning_rate": 7.324929971988796e-05, + "loss": 0.402, + "step": 33200 + }, + { + "epoch": 18.54804469273743, + "grad_norm": 0.5681053400039673, + "learning_rate": 7.322128851540616e-05, + "loss": 0.3704, + "step": 33201 + }, + { + "epoch": 18.548603351955308, + "grad_norm": 0.5559684634208679, + "learning_rate": 7.319327731092437e-05, + "loss": 0.4918, + "step": 33202 + }, + { + "epoch": 18.549162011173184, + "grad_norm": 1.091188907623291, + "learning_rate": 7.316526610644258e-05, + "loss": 0.5067, + "step": 33203 + }, + { + "epoch": 18.54972067039106, + "grad_norm": 0.42429253458976746, + "learning_rate": 7.313725490196079e-05, + "loss": 0.3281, + "step": 33204 + }, + { + "epoch": 18.550279329608937, + "grad_norm": 0.5959124565124512, + "learning_rate": 7.310924369747899e-05, + "loss": 0.378, + "step": 33205 + }, + { + "epoch": 18.550837988826817, + "grad_norm": 0.6779744625091553, + "learning_rate": 7.30812324929972e-05, + "loss": 0.3785, + "step": 33206 + }, + { + "epoch": 18.551396648044694, + "grad_norm": 0.4374370872974396, + "learning_rate": 7.30532212885154e-05, + "loss": 0.3957, + "step": 33207 + }, + { + "epoch": 18.55195530726257, + "grad_norm": 0.7064943909645081, + "learning_rate": 7.302521008403361e-05, + "loss": 0.4395, + "step": 33208 + }, + { + "epoch": 18.552513966480447, + "grad_norm": 0.28746578097343445, + "learning_rate": 7.299719887955182e-05, + "loss": 0.2544, + "step": 33209 + }, + { + "epoch": 18.553072625698324, + "grad_norm": 0.4191751778125763, + "learning_rate": 7.296918767507004e-05, + "loss": 0.4033, + "step": 33210 + }, + { + "epoch": 18.5536312849162, + "grad_norm": 0.49161866307258606, + "learning_rate": 7.294117647058823e-05, + "loss": 0.4372, + "step": 33211 + }, + { + "epoch": 18.554189944134077, + "grad_norm": 0.3971637189388275, + "learning_rate": 7.291316526610645e-05, + "loss": 0.4318, + "step": 33212 + }, + { + "epoch": 18.554748603351957, + "grad_norm": 1.196581244468689, + "learning_rate": 7.288515406162464e-05, + "loss": 0.4166, + "step": 33213 + }, + { + "epoch": 18.555307262569833, + "grad_norm": 0.4599059224128723, + "learning_rate": 7.285714285714286e-05, + "loss": 0.3712, + "step": 33214 + }, + { + "epoch": 18.55586592178771, + "grad_norm": 0.4462869465351105, + "learning_rate": 7.282913165266107e-05, + "loss": 0.4345, + "step": 33215 + }, + { + "epoch": 18.556424581005587, + "grad_norm": 0.831937313079834, + "learning_rate": 7.280112044817927e-05, + "loss": 0.3613, + "step": 33216 + }, + { + "epoch": 18.556983240223463, + "grad_norm": 1.3959224224090576, + "learning_rate": 7.277310924369748e-05, + "loss": 0.5899, + "step": 33217 + }, + { + "epoch": 18.55754189944134, + "grad_norm": 0.3580646216869354, + "learning_rate": 7.274509803921568e-05, + "loss": 0.3501, + "step": 33218 + }, + { + "epoch": 18.558100558659216, + "grad_norm": 0.5040664672851562, + "learning_rate": 7.271708683473389e-05, + "loss": 0.332, + "step": 33219 + }, + { + "epoch": 18.558659217877096, + "grad_norm": 0.937634289264679, + "learning_rate": 7.268907563025211e-05, + "loss": 0.4182, + "step": 33220 + }, + { + "epoch": 18.559217877094973, + "grad_norm": 2.6909663677215576, + "learning_rate": 7.26610644257703e-05, + "loss": 0.4565, + "step": 33221 + }, + { + "epoch": 18.55977653631285, + "grad_norm": 0.4037894010543823, + "learning_rate": 7.263305322128852e-05, + "loss": 0.3517, + "step": 33222 + }, + { + "epoch": 18.560335195530726, + "grad_norm": 0.42995497584342957, + "learning_rate": 7.260504201680671e-05, + "loss": 0.3998, + "step": 33223 + }, + { + "epoch": 18.560893854748603, + "grad_norm": 0.37578660249710083, + "learning_rate": 7.257703081232493e-05, + "loss": 0.4749, + "step": 33224 + }, + { + "epoch": 18.56145251396648, + "grad_norm": 2.0960230827331543, + "learning_rate": 7.254901960784314e-05, + "loss": 0.4002, + "step": 33225 + }, + { + "epoch": 18.56201117318436, + "grad_norm": 0.5072458982467651, + "learning_rate": 7.252100840336135e-05, + "loss": 0.3718, + "step": 33226 + }, + { + "epoch": 18.562569832402236, + "grad_norm": 0.7249230742454529, + "learning_rate": 7.249299719887955e-05, + "loss": 0.3791, + "step": 33227 + }, + { + "epoch": 18.563128491620112, + "grad_norm": 0.4565328359603882, + "learning_rate": 7.246498599439776e-05, + "loss": 0.3983, + "step": 33228 + }, + { + "epoch": 18.56368715083799, + "grad_norm": 0.4697781801223755, + "learning_rate": 7.243697478991596e-05, + "loss": 0.3648, + "step": 33229 + }, + { + "epoch": 18.564245810055866, + "grad_norm": 0.5477104187011719, + "learning_rate": 7.240896358543418e-05, + "loss": 0.5351, + "step": 33230 + }, + { + "epoch": 18.564804469273742, + "grad_norm": 0.3905161917209625, + "learning_rate": 7.238095238095238e-05, + "loss": 0.3054, + "step": 33231 + }, + { + "epoch": 18.56536312849162, + "grad_norm": 2.3207709789276123, + "learning_rate": 7.23529411764706e-05, + "loss": 0.3868, + "step": 33232 + }, + { + "epoch": 18.5659217877095, + "grad_norm": 0.44025224447250366, + "learning_rate": 7.232492997198879e-05, + "loss": 0.384, + "step": 33233 + }, + { + "epoch": 18.566480446927375, + "grad_norm": 0.4687928259372711, + "learning_rate": 7.229691876750701e-05, + "loss": 0.332, + "step": 33234 + }, + { + "epoch": 18.567039106145252, + "grad_norm": 0.40319857001304626, + "learning_rate": 7.226890756302521e-05, + "loss": 0.499, + "step": 33235 + }, + { + "epoch": 18.56759776536313, + "grad_norm": 0.4615161418914795, + "learning_rate": 7.224089635854342e-05, + "loss": 0.406, + "step": 33236 + }, + { + "epoch": 18.568156424581005, + "grad_norm": 0.7605828046798706, + "learning_rate": 7.221288515406163e-05, + "loss": 0.511, + "step": 33237 + }, + { + "epoch": 18.56871508379888, + "grad_norm": 2.966731071472168, + "learning_rate": 7.218487394957983e-05, + "loss": 0.4446, + "step": 33238 + }, + { + "epoch": 18.56927374301676, + "grad_norm": 0.3684213161468506, + "learning_rate": 7.215686274509804e-05, + "loss": 0.394, + "step": 33239 + }, + { + "epoch": 18.56983240223464, + "grad_norm": 0.5597073435783386, + "learning_rate": 7.212885154061624e-05, + "loss": 0.546, + "step": 33240 + }, + { + "epoch": 18.570391061452515, + "grad_norm": 0.8951592445373535, + "learning_rate": 7.210084033613445e-05, + "loss": 0.4788, + "step": 33241 + }, + { + "epoch": 18.57094972067039, + "grad_norm": 0.34841498732566833, + "learning_rate": 7.207282913165267e-05, + "loss": 0.4622, + "step": 33242 + }, + { + "epoch": 18.571508379888268, + "grad_norm": 0.9607884883880615, + "learning_rate": 7.204481792717086e-05, + "loss": 0.4271, + "step": 33243 + }, + { + "epoch": 18.572067039106145, + "grad_norm": 0.46509015560150146, + "learning_rate": 7.201680672268908e-05, + "loss": 0.3516, + "step": 33244 + }, + { + "epoch": 18.57262569832402, + "grad_norm": 0.4555004835128784, + "learning_rate": 7.198879551820727e-05, + "loss": 0.5174, + "step": 33245 + }, + { + "epoch": 18.573184357541898, + "grad_norm": 5.340290069580078, + "learning_rate": 7.196078431372549e-05, + "loss": 0.4249, + "step": 33246 + }, + { + "epoch": 18.573743016759778, + "grad_norm": 0.6346580982208252, + "learning_rate": 7.19327731092437e-05, + "loss": 0.4467, + "step": 33247 + }, + { + "epoch": 18.574301675977654, + "grad_norm": 0.5110494494438171, + "learning_rate": 7.19047619047619e-05, + "loss": 0.3731, + "step": 33248 + }, + { + "epoch": 18.57486033519553, + "grad_norm": 0.36393192410469055, + "learning_rate": 7.187675070028011e-05, + "loss": 0.3197, + "step": 33249 + }, + { + "epoch": 18.575418994413408, + "grad_norm": 0.9145272374153137, + "learning_rate": 7.184873949579832e-05, + "loss": 0.5364, + "step": 33250 + }, + { + "epoch": 18.575977653631284, + "grad_norm": 0.6487234830856323, + "learning_rate": 7.182072829131652e-05, + "loss": 0.4829, + "step": 33251 + }, + { + "epoch": 18.57653631284916, + "grad_norm": 0.384820818901062, + "learning_rate": 7.179271708683474e-05, + "loss": 0.4202, + "step": 33252 + }, + { + "epoch": 18.577094972067037, + "grad_norm": 0.793975293636322, + "learning_rate": 7.176470588235293e-05, + "loss": 0.3957, + "step": 33253 + }, + { + "epoch": 18.577653631284917, + "grad_norm": 0.6721038222312927, + "learning_rate": 7.173669467787115e-05, + "loss": 0.3662, + "step": 33254 + }, + { + "epoch": 18.578212290502794, + "grad_norm": 1.0753592252731323, + "learning_rate": 7.170868347338935e-05, + "loss": 0.3263, + "step": 33255 + }, + { + "epoch": 18.57877094972067, + "grad_norm": 0.4046759307384491, + "learning_rate": 7.168067226890757e-05, + "loss": 0.4993, + "step": 33256 + }, + { + "epoch": 18.579329608938547, + "grad_norm": 1.7801207304000854, + "learning_rate": 7.165266106442577e-05, + "loss": 0.41, + "step": 33257 + }, + { + "epoch": 18.579888268156424, + "grad_norm": 1.1364519596099854, + "learning_rate": 7.162464985994398e-05, + "loss": 0.4858, + "step": 33258 + }, + { + "epoch": 18.5804469273743, + "grad_norm": 0.5880615711212158, + "learning_rate": 7.159663865546218e-05, + "loss": 0.4152, + "step": 33259 + }, + { + "epoch": 18.58100558659218, + "grad_norm": 1.4969029426574707, + "learning_rate": 7.156862745098039e-05, + "loss": 0.3912, + "step": 33260 + }, + { + "epoch": 18.581564245810057, + "grad_norm": 0.3177163004875183, + "learning_rate": 7.15406162464986e-05, + "loss": 0.3105, + "step": 33261 + }, + { + "epoch": 18.582122905027934, + "grad_norm": 0.39522597193717957, + "learning_rate": 7.151260504201682e-05, + "loss": 0.4095, + "step": 33262 + }, + { + "epoch": 18.58268156424581, + "grad_norm": 0.6207083463668823, + "learning_rate": 7.148459383753501e-05, + "loss": 0.3569, + "step": 33263 + }, + { + "epoch": 18.583240223463687, + "grad_norm": 0.46913254261016846, + "learning_rate": 7.145658263305323e-05, + "loss": 0.3818, + "step": 33264 + }, + { + "epoch": 18.583798882681563, + "grad_norm": 1.3477263450622559, + "learning_rate": 7.142857142857142e-05, + "loss": 0.4285, + "step": 33265 + }, + { + "epoch": 18.58435754189944, + "grad_norm": 1.10686457157135, + "learning_rate": 7.140056022408964e-05, + "loss": 0.351, + "step": 33266 + }, + { + "epoch": 18.58491620111732, + "grad_norm": 0.3655551075935364, + "learning_rate": 7.137254901960783e-05, + "loss": 0.3682, + "step": 33267 + }, + { + "epoch": 18.585474860335196, + "grad_norm": 0.35059067606925964, + "learning_rate": 7.134453781512605e-05, + "loss": 0.4181, + "step": 33268 + }, + { + "epoch": 18.586033519553073, + "grad_norm": 0.7126660943031311, + "learning_rate": 7.131652661064426e-05, + "loss": 0.3384, + "step": 33269 + }, + { + "epoch": 18.58659217877095, + "grad_norm": 0.39949169754981995, + "learning_rate": 7.128851540616246e-05, + "loss": 0.3947, + "step": 33270 + }, + { + "epoch": 18.587150837988826, + "grad_norm": 0.5206325054168701, + "learning_rate": 7.126050420168067e-05, + "loss": 0.462, + "step": 33271 + }, + { + "epoch": 18.587709497206703, + "grad_norm": 1.393926739692688, + "learning_rate": 7.123249299719888e-05, + "loss": 0.4513, + "step": 33272 + }, + { + "epoch": 18.58826815642458, + "grad_norm": 0.3261336088180542, + "learning_rate": 7.120448179271708e-05, + "loss": 0.3595, + "step": 33273 + }, + { + "epoch": 18.58882681564246, + "grad_norm": 0.47407636046409607, + "learning_rate": 7.11764705882353e-05, + "loss": 0.3385, + "step": 33274 + }, + { + "epoch": 18.589385474860336, + "grad_norm": 0.48043501377105713, + "learning_rate": 7.11484593837535e-05, + "loss": 0.5154, + "step": 33275 + }, + { + "epoch": 18.589944134078213, + "grad_norm": 0.6077399849891663, + "learning_rate": 7.112044817927171e-05, + "loss": 0.3616, + "step": 33276 + }, + { + "epoch": 18.59050279329609, + "grad_norm": 0.44984865188598633, + "learning_rate": 7.10924369747899e-05, + "loss": 0.3844, + "step": 33277 + }, + { + "epoch": 18.591061452513966, + "grad_norm": 1.0627429485321045, + "learning_rate": 7.106442577030813e-05, + "loss": 0.4101, + "step": 33278 + }, + { + "epoch": 18.591620111731842, + "grad_norm": 0.4674539268016815, + "learning_rate": 7.103641456582633e-05, + "loss": 0.3961, + "step": 33279 + }, + { + "epoch": 18.592178770949722, + "grad_norm": 0.4832356870174408, + "learning_rate": 7.100840336134454e-05, + "loss": 0.3921, + "step": 33280 + }, + { + "epoch": 18.5927374301676, + "grad_norm": 0.45875832438468933, + "learning_rate": 7.098039215686274e-05, + "loss": 0.5165, + "step": 33281 + }, + { + "epoch": 18.593296089385476, + "grad_norm": 0.5701793432235718, + "learning_rate": 7.095238095238095e-05, + "loss": 0.5011, + "step": 33282 + }, + { + "epoch": 18.593854748603352, + "grad_norm": 0.4326432943344116, + "learning_rate": 7.092436974789916e-05, + "loss": 0.3937, + "step": 33283 + }, + { + "epoch": 18.59441340782123, + "grad_norm": 0.5247666835784912, + "learning_rate": 7.089635854341737e-05, + "loss": 0.477, + "step": 33284 + }, + { + "epoch": 18.594972067039105, + "grad_norm": 0.4139988422393799, + "learning_rate": 7.086834733893557e-05, + "loss": 0.4307, + "step": 33285 + }, + { + "epoch": 18.595530726256982, + "grad_norm": 0.3451310396194458, + "learning_rate": 7.084033613445379e-05, + "loss": 0.3372, + "step": 33286 + }, + { + "epoch": 18.596089385474862, + "grad_norm": 1.644040822982788, + "learning_rate": 7.081232492997198e-05, + "loss": 0.3664, + "step": 33287 + }, + { + "epoch": 18.59664804469274, + "grad_norm": 0.6035840511322021, + "learning_rate": 7.07843137254902e-05, + "loss": 0.4486, + "step": 33288 + }, + { + "epoch": 18.597206703910615, + "grad_norm": 0.9313774108886719, + "learning_rate": 7.07563025210084e-05, + "loss": 0.3313, + "step": 33289 + }, + { + "epoch": 18.59776536312849, + "grad_norm": 2.306288480758667, + "learning_rate": 7.072829131652661e-05, + "loss": 0.3907, + "step": 33290 + }, + { + "epoch": 18.598324022346368, + "grad_norm": 0.4268757998943329, + "learning_rate": 7.070028011204482e-05, + "loss": 0.3032, + "step": 33291 + }, + { + "epoch": 18.598882681564245, + "grad_norm": 0.42450085282325745, + "learning_rate": 7.067226890756302e-05, + "loss": 0.3449, + "step": 33292 + }, + { + "epoch": 18.59944134078212, + "grad_norm": 0.4467846155166626, + "learning_rate": 7.064425770308123e-05, + "loss": 0.4707, + "step": 33293 + }, + { + "epoch": 18.6, + "grad_norm": 0.48317572474479675, + "learning_rate": 7.061624649859943e-05, + "loss": 0.4223, + "step": 33294 + }, + { + "epoch": 18.600558659217878, + "grad_norm": 0.4549301862716675, + "learning_rate": 7.058823529411764e-05, + "loss": 0.4177, + "step": 33295 + }, + { + "epoch": 18.601117318435755, + "grad_norm": 0.48605504631996155, + "learning_rate": 7.056022408963586e-05, + "loss": 0.3695, + "step": 33296 + }, + { + "epoch": 18.60167597765363, + "grad_norm": 0.4467478394508362, + "learning_rate": 7.053221288515405e-05, + "loss": 0.404, + "step": 33297 + }, + { + "epoch": 18.602234636871508, + "grad_norm": 0.44747933745384216, + "learning_rate": 7.050420168067227e-05, + "loss": 0.4255, + "step": 33298 + }, + { + "epoch": 18.602793296089384, + "grad_norm": 0.4978366792201996, + "learning_rate": 7.047619047619046e-05, + "loss": 0.4848, + "step": 33299 + }, + { + "epoch": 18.60335195530726, + "grad_norm": 0.39808717370033264, + "learning_rate": 7.044817927170868e-05, + "loss": 0.3402, + "step": 33300 + }, + { + "epoch": 18.60391061452514, + "grad_norm": 0.8843775987625122, + "learning_rate": 7.04201680672269e-05, + "loss": 0.3535, + "step": 33301 + }, + { + "epoch": 18.604469273743018, + "grad_norm": 1.2137494087219238, + "learning_rate": 7.03921568627451e-05, + "loss": 0.56, + "step": 33302 + }, + { + "epoch": 18.605027932960894, + "grad_norm": 0.44732093811035156, + "learning_rate": 7.036414565826332e-05, + "loss": 0.4376, + "step": 33303 + }, + { + "epoch": 18.60558659217877, + "grad_norm": 1.8453917503356934, + "learning_rate": 7.033613445378151e-05, + "loss": 0.4299, + "step": 33304 + }, + { + "epoch": 18.606145251396647, + "grad_norm": 0.4487641453742981, + "learning_rate": 7.030812324929973e-05, + "loss": 0.4115, + "step": 33305 + }, + { + "epoch": 18.606703910614524, + "grad_norm": 0.806923508644104, + "learning_rate": 7.028011204481793e-05, + "loss": 0.2797, + "step": 33306 + }, + { + "epoch": 18.607262569832404, + "grad_norm": 0.5042039155960083, + "learning_rate": 7.025210084033614e-05, + "loss": 0.353, + "step": 33307 + }, + { + "epoch": 18.60782122905028, + "grad_norm": 0.40456557273864746, + "learning_rate": 7.022408963585435e-05, + "loss": 0.378, + "step": 33308 + }, + { + "epoch": 18.608379888268157, + "grad_norm": 0.6416122913360596, + "learning_rate": 7.019607843137255e-05, + "loss": 0.4145, + "step": 33309 + }, + { + "epoch": 18.608938547486034, + "grad_norm": 0.3558356761932373, + "learning_rate": 7.016806722689076e-05, + "loss": 0.3978, + "step": 33310 + }, + { + "epoch": 18.60949720670391, + "grad_norm": 1.0025913715362549, + "learning_rate": 7.014005602240898e-05, + "loss": 0.4903, + "step": 33311 + }, + { + "epoch": 18.610055865921787, + "grad_norm": 0.5669221878051758, + "learning_rate": 7.011204481792717e-05, + "loss": 0.6458, + "step": 33312 + }, + { + "epoch": 18.610614525139663, + "grad_norm": 0.44173604249954224, + "learning_rate": 7.008403361344539e-05, + "loss": 0.4132, + "step": 33313 + }, + { + "epoch": 18.611173184357543, + "grad_norm": 0.4549146294593811, + "learning_rate": 7.005602240896358e-05, + "loss": 0.4963, + "step": 33314 + }, + { + "epoch": 18.61173184357542, + "grad_norm": 0.3699948489665985, + "learning_rate": 7.00280112044818e-05, + "loss": 0.4142, + "step": 33315 + }, + { + "epoch": 18.612290502793297, + "grad_norm": 0.30903923511505127, + "learning_rate": 7.000000000000001e-05, + "loss": 0.3222, + "step": 33316 + }, + { + "epoch": 18.612849162011173, + "grad_norm": 0.426784873008728, + "learning_rate": 6.997198879551821e-05, + "loss": 0.4562, + "step": 33317 + }, + { + "epoch": 18.61340782122905, + "grad_norm": 0.46098801493644714, + "learning_rate": 6.994397759103642e-05, + "loss": 0.436, + "step": 33318 + }, + { + "epoch": 18.613966480446926, + "grad_norm": 0.4943619966506958, + "learning_rate": 6.991596638655463e-05, + "loss": 0.4207, + "step": 33319 + }, + { + "epoch": 18.614525139664803, + "grad_norm": 2.67362380027771, + "learning_rate": 6.988795518207283e-05, + "loss": 0.4138, + "step": 33320 + }, + { + "epoch": 18.615083798882683, + "grad_norm": 1.2203537225723267, + "learning_rate": 6.985994397759104e-05, + "loss": 0.2797, + "step": 33321 + }, + { + "epoch": 18.61564245810056, + "grad_norm": 0.6407197117805481, + "learning_rate": 6.983193277310924e-05, + "loss": 0.4509, + "step": 33322 + }, + { + "epoch": 18.616201117318436, + "grad_norm": 0.44569990038871765, + "learning_rate": 6.980392156862746e-05, + "loss": 0.3285, + "step": 33323 + }, + { + "epoch": 18.616759776536313, + "grad_norm": 0.9600493311882019, + "learning_rate": 6.977591036414566e-05, + "loss": 0.337, + "step": 33324 + }, + { + "epoch": 18.61731843575419, + "grad_norm": 0.38452187180519104, + "learning_rate": 6.974789915966387e-05, + "loss": 0.3733, + "step": 33325 + }, + { + "epoch": 18.617877094972066, + "grad_norm": 0.5779157280921936, + "learning_rate": 6.971988795518207e-05, + "loss": 0.3544, + "step": 33326 + }, + { + "epoch": 18.618435754189946, + "grad_norm": 1.3285716772079468, + "learning_rate": 6.969187675070029e-05, + "loss": 0.435, + "step": 33327 + }, + { + "epoch": 18.618994413407822, + "grad_norm": 0.6382309198379517, + "learning_rate": 6.966386554621849e-05, + "loss": 0.4314, + "step": 33328 + }, + { + "epoch": 18.6195530726257, + "grad_norm": 0.4070654511451721, + "learning_rate": 6.96358543417367e-05, + "loss": 0.4191, + "step": 33329 + }, + { + "epoch": 18.620111731843576, + "grad_norm": 0.3165675699710846, + "learning_rate": 6.96078431372549e-05, + "loss": 0.2905, + "step": 33330 + }, + { + "epoch": 18.620670391061452, + "grad_norm": 0.6993505954742432, + "learning_rate": 6.957983193277311e-05, + "loss": 0.3288, + "step": 33331 + }, + { + "epoch": 18.62122905027933, + "grad_norm": 0.3519015312194824, + "learning_rate": 6.955182072829132e-05, + "loss": 0.3951, + "step": 33332 + }, + { + "epoch": 18.621787709497205, + "grad_norm": 0.4634532630443573, + "learning_rate": 6.952380952380954e-05, + "loss": 0.4133, + "step": 33333 + }, + { + "epoch": 18.622346368715085, + "grad_norm": 0.3909326493740082, + "learning_rate": 6.949579831932773e-05, + "loss": 0.3508, + "step": 33334 + }, + { + "epoch": 18.622905027932962, + "grad_norm": 0.3784567415714264, + "learning_rate": 6.946778711484595e-05, + "loss": 0.3786, + "step": 33335 + }, + { + "epoch": 18.62346368715084, + "grad_norm": 0.43578144907951355, + "learning_rate": 6.943977591036414e-05, + "loss": 0.4814, + "step": 33336 + }, + { + "epoch": 18.624022346368715, + "grad_norm": 0.3682384490966797, + "learning_rate": 6.941176470588236e-05, + "loss": 0.357, + "step": 33337 + }, + { + "epoch": 18.62458100558659, + "grad_norm": 0.3547244966030121, + "learning_rate": 6.938375350140057e-05, + "loss": 0.3404, + "step": 33338 + }, + { + "epoch": 18.62513966480447, + "grad_norm": 0.42116090655326843, + "learning_rate": 6.935574229691877e-05, + "loss": 0.4496, + "step": 33339 + }, + { + "epoch": 18.625698324022345, + "grad_norm": 0.38738059997558594, + "learning_rate": 6.932773109243698e-05, + "loss": 0.4142, + "step": 33340 + }, + { + "epoch": 18.626256983240225, + "grad_norm": 0.5341260433197021, + "learning_rate": 6.929971988795518e-05, + "loss": 0.4304, + "step": 33341 + }, + { + "epoch": 18.6268156424581, + "grad_norm": 0.5779695510864258, + "learning_rate": 6.927170868347339e-05, + "loss": 0.6634, + "step": 33342 + }, + { + "epoch": 18.627374301675978, + "grad_norm": 0.4567739963531494, + "learning_rate": 6.924369747899161e-05, + "loss": 0.4972, + "step": 33343 + }, + { + "epoch": 18.627932960893855, + "grad_norm": 0.5166769623756409, + "learning_rate": 6.92156862745098e-05, + "loss": 0.4008, + "step": 33344 + }, + { + "epoch": 18.62849162011173, + "grad_norm": 0.44283583760261536, + "learning_rate": 6.918767507002802e-05, + "loss": 0.5069, + "step": 33345 + }, + { + "epoch": 18.629050279329608, + "grad_norm": 0.4220067858695984, + "learning_rate": 6.915966386554621e-05, + "loss": 0.4044, + "step": 33346 + }, + { + "epoch": 18.629608938547484, + "grad_norm": 0.5188944339752197, + "learning_rate": 6.913165266106443e-05, + "loss": 0.3974, + "step": 33347 + }, + { + "epoch": 18.630167597765364, + "grad_norm": 0.4026779532432556, + "learning_rate": 6.910364145658263e-05, + "loss": 0.3993, + "step": 33348 + }, + { + "epoch": 18.63072625698324, + "grad_norm": 0.45261114835739136, + "learning_rate": 6.907563025210085e-05, + "loss": 0.3556, + "step": 33349 + }, + { + "epoch": 18.631284916201118, + "grad_norm": 0.4704189896583557, + "learning_rate": 6.904761904761905e-05, + "loss": 0.372, + "step": 33350 + }, + { + "epoch": 18.631843575418994, + "grad_norm": 0.43418875336647034, + "learning_rate": 6.901960784313726e-05, + "loss": 0.5005, + "step": 33351 + }, + { + "epoch": 18.63240223463687, + "grad_norm": 0.6777607202529907, + "learning_rate": 6.899159663865546e-05, + "loss": 0.4431, + "step": 33352 + }, + { + "epoch": 18.632960893854747, + "grad_norm": 0.5315324664115906, + "learning_rate": 6.896358543417367e-05, + "loss": 0.5795, + "step": 33353 + }, + { + "epoch": 18.633519553072627, + "grad_norm": 0.4635539948940277, + "learning_rate": 6.893557422969188e-05, + "loss": 0.3553, + "step": 33354 + }, + { + "epoch": 18.634078212290504, + "grad_norm": 0.405765175819397, + "learning_rate": 6.89075630252101e-05, + "loss": 0.3628, + "step": 33355 + }, + { + "epoch": 18.63463687150838, + "grad_norm": 0.386717826128006, + "learning_rate": 6.887955182072829e-05, + "loss": 0.4548, + "step": 33356 + }, + { + "epoch": 18.635195530726257, + "grad_norm": 0.46891769766807556, + "learning_rate": 6.885154061624651e-05, + "loss": 0.3966, + "step": 33357 + }, + { + "epoch": 18.635754189944134, + "grad_norm": 0.5598861575126648, + "learning_rate": 6.88235294117647e-05, + "loss": 0.3838, + "step": 33358 + }, + { + "epoch": 18.63631284916201, + "grad_norm": 3.9775185585021973, + "learning_rate": 6.879551820728292e-05, + "loss": 0.3262, + "step": 33359 + }, + { + "epoch": 18.636871508379887, + "grad_norm": 0.4295130968093872, + "learning_rate": 6.876750700280113e-05, + "loss": 0.3648, + "step": 33360 + }, + { + "epoch": 18.637430167597767, + "grad_norm": 0.38078030943870544, + "learning_rate": 6.873949579831933e-05, + "loss": 0.4605, + "step": 33361 + }, + { + "epoch": 18.637988826815644, + "grad_norm": 0.7125481367111206, + "learning_rate": 6.871148459383754e-05, + "loss": 0.4915, + "step": 33362 + }, + { + "epoch": 18.63854748603352, + "grad_norm": 0.5079014301300049, + "learning_rate": 6.868347338935574e-05, + "loss": 0.4306, + "step": 33363 + }, + { + "epoch": 18.639106145251397, + "grad_norm": 0.37734177708625793, + "learning_rate": 6.865546218487395e-05, + "loss": 0.2915, + "step": 33364 + }, + { + "epoch": 18.639664804469273, + "grad_norm": 0.36632850766181946, + "learning_rate": 6.862745098039217e-05, + "loss": 0.3349, + "step": 33365 + }, + { + "epoch": 18.64022346368715, + "grad_norm": 0.3756070137023926, + "learning_rate": 6.859943977591036e-05, + "loss": 0.3728, + "step": 33366 + }, + { + "epoch": 18.640782122905026, + "grad_norm": 0.4458467960357666, + "learning_rate": 6.857142857142858e-05, + "loss": 0.4098, + "step": 33367 + }, + { + "epoch": 18.641340782122906, + "grad_norm": 0.37712183594703674, + "learning_rate": 6.854341736694677e-05, + "loss": 0.4033, + "step": 33368 + }, + { + "epoch": 18.641899441340783, + "grad_norm": 0.42903950810432434, + "learning_rate": 6.851540616246499e-05, + "loss": 0.4192, + "step": 33369 + }, + { + "epoch": 18.64245810055866, + "grad_norm": 0.5669801831245422, + "learning_rate": 6.84873949579832e-05, + "loss": 0.3165, + "step": 33370 + }, + { + "epoch": 18.643016759776536, + "grad_norm": 0.5043022036552429, + "learning_rate": 6.84593837535014e-05, + "loss": 0.3816, + "step": 33371 + }, + { + "epoch": 18.643575418994413, + "grad_norm": 0.5248939394950867, + "learning_rate": 6.843137254901961e-05, + "loss": 0.3789, + "step": 33372 + }, + { + "epoch": 18.64413407821229, + "grad_norm": 1.111251950263977, + "learning_rate": 6.840336134453782e-05, + "loss": 0.4261, + "step": 33373 + }, + { + "epoch": 18.64469273743017, + "grad_norm": 0.5118752717971802, + "learning_rate": 6.837535014005602e-05, + "loss": 0.5202, + "step": 33374 + }, + { + "epoch": 18.645251396648046, + "grad_norm": 0.5092790126800537, + "learning_rate": 6.834733893557423e-05, + "loss": 0.4774, + "step": 33375 + }, + { + "epoch": 18.645810055865923, + "grad_norm": 0.4587847590446472, + "learning_rate": 6.831932773109243e-05, + "loss": 0.3485, + "step": 33376 + }, + { + "epoch": 18.6463687150838, + "grad_norm": 0.8198220729827881, + "learning_rate": 6.829131652661065e-05, + "loss": 0.3504, + "step": 33377 + }, + { + "epoch": 18.646927374301676, + "grad_norm": 0.5205161571502686, + "learning_rate": 6.826330532212885e-05, + "loss": 0.28, + "step": 33378 + }, + { + "epoch": 18.647486033519552, + "grad_norm": 0.6236117482185364, + "learning_rate": 6.823529411764707e-05, + "loss": 0.3962, + "step": 33379 + }, + { + "epoch": 18.64804469273743, + "grad_norm": 0.5078142881393433, + "learning_rate": 6.820728291316526e-05, + "loss": 0.4444, + "step": 33380 + }, + { + "epoch": 18.64860335195531, + "grad_norm": 0.5336299538612366, + "learning_rate": 6.817927170868348e-05, + "loss": 0.4304, + "step": 33381 + }, + { + "epoch": 18.649162011173186, + "grad_norm": 1.2762658596038818, + "learning_rate": 6.815126050420168e-05, + "loss": 0.3632, + "step": 33382 + }, + { + "epoch": 18.649720670391062, + "grad_norm": 0.3571813106536865, + "learning_rate": 6.812324929971989e-05, + "loss": 0.409, + "step": 33383 + }, + { + "epoch": 18.65027932960894, + "grad_norm": 0.9720109105110168, + "learning_rate": 6.80952380952381e-05, + "loss": 0.4535, + "step": 33384 + }, + { + "epoch": 18.650837988826815, + "grad_norm": 0.5444110631942749, + "learning_rate": 6.80672268907563e-05, + "loss": 0.3612, + "step": 33385 + }, + { + "epoch": 18.65139664804469, + "grad_norm": 0.40758684277534485, + "learning_rate": 6.803921568627451e-05, + "loss": 0.4329, + "step": 33386 + }, + { + "epoch": 18.65195530726257, + "grad_norm": 0.6672812104225159, + "learning_rate": 6.801120448179273e-05, + "loss": 0.3773, + "step": 33387 + }, + { + "epoch": 18.65251396648045, + "grad_norm": 0.7788664698600769, + "learning_rate": 6.798319327731092e-05, + "loss": 0.3264, + "step": 33388 + }, + { + "epoch": 18.653072625698325, + "grad_norm": 0.6699696779251099, + "learning_rate": 6.795518207282914e-05, + "loss": 0.5074, + "step": 33389 + }, + { + "epoch": 18.6536312849162, + "grad_norm": 0.4077255129814148, + "learning_rate": 6.792717086834733e-05, + "loss": 0.3973, + "step": 33390 + }, + { + "epoch": 18.654189944134078, + "grad_norm": 0.402718722820282, + "learning_rate": 6.789915966386555e-05, + "loss": 0.4072, + "step": 33391 + }, + { + "epoch": 18.654748603351955, + "grad_norm": 0.5116134285926819, + "learning_rate": 6.787114845938376e-05, + "loss": 0.3528, + "step": 33392 + }, + { + "epoch": 18.65530726256983, + "grad_norm": 0.4758792519569397, + "learning_rate": 6.784313725490196e-05, + "loss": 0.3629, + "step": 33393 + }, + { + "epoch": 18.655865921787708, + "grad_norm": 0.36802932620048523, + "learning_rate": 6.781512605042017e-05, + "loss": 0.3526, + "step": 33394 + }, + { + "epoch": 18.656424581005588, + "grad_norm": 0.4252503514289856, + "learning_rate": 6.778711484593838e-05, + "loss": 0.3415, + "step": 33395 + }, + { + "epoch": 18.656983240223465, + "grad_norm": 1.050351858139038, + "learning_rate": 6.775910364145658e-05, + "loss": 0.4402, + "step": 33396 + }, + { + "epoch": 18.65754189944134, + "grad_norm": 0.43924498558044434, + "learning_rate": 6.77310924369748e-05, + "loss": 0.3683, + "step": 33397 + }, + { + "epoch": 18.658100558659218, + "grad_norm": 0.5609034895896912, + "learning_rate": 6.7703081232493e-05, + "loss": 0.4917, + "step": 33398 + }, + { + "epoch": 18.658659217877094, + "grad_norm": 0.7274319529533386, + "learning_rate": 6.767507002801121e-05, + "loss": 0.3173, + "step": 33399 + }, + { + "epoch": 18.65921787709497, + "grad_norm": 0.9132292866706848, + "learning_rate": 6.76470588235294e-05, + "loss": 0.5188, + "step": 33400 + }, + { + "epoch": 18.659776536312847, + "grad_norm": 0.7994263172149658, + "learning_rate": 6.761904761904763e-05, + "loss": 0.4378, + "step": 33401 + }, + { + "epoch": 18.660335195530728, + "grad_norm": 0.368822306394577, + "learning_rate": 6.759103641456583e-05, + "loss": 0.3658, + "step": 33402 + }, + { + "epoch": 18.660893854748604, + "grad_norm": 0.7670091986656189, + "learning_rate": 6.756302521008404e-05, + "loss": 0.4421, + "step": 33403 + }, + { + "epoch": 18.66145251396648, + "grad_norm": 1.2274000644683838, + "learning_rate": 6.753501400560224e-05, + "loss": 0.4916, + "step": 33404 + }, + { + "epoch": 18.662011173184357, + "grad_norm": 1.5293766260147095, + "learning_rate": 6.750700280112045e-05, + "loss": 0.4057, + "step": 33405 + }, + { + "epoch": 18.662569832402234, + "grad_norm": 0.5386022329330444, + "learning_rate": 6.747899159663866e-05, + "loss": 0.479, + "step": 33406 + }, + { + "epoch": 18.66312849162011, + "grad_norm": 0.4993986189365387, + "learning_rate": 6.745098039215686e-05, + "loss": 0.4085, + "step": 33407 + }, + { + "epoch": 18.66368715083799, + "grad_norm": 0.7581431865692139, + "learning_rate": 6.742296918767507e-05, + "loss": 0.4864, + "step": 33408 + }, + { + "epoch": 18.664245810055867, + "grad_norm": 0.39697983860969543, + "learning_rate": 6.739495798319329e-05, + "loss": 0.5637, + "step": 33409 + }, + { + "epoch": 18.664804469273744, + "grad_norm": 0.515539288520813, + "learning_rate": 6.736694677871148e-05, + "loss": 0.4363, + "step": 33410 + }, + { + "epoch": 18.66536312849162, + "grad_norm": 0.37271395325660706, + "learning_rate": 6.73389355742297e-05, + "loss": 0.3839, + "step": 33411 + }, + { + "epoch": 18.665921787709497, + "grad_norm": 0.5110172629356384, + "learning_rate": 6.731092436974789e-05, + "loss": 0.4851, + "step": 33412 + }, + { + "epoch": 18.666480446927373, + "grad_norm": 0.43135422468185425, + "learning_rate": 6.728291316526611e-05, + "loss": 0.4247, + "step": 33413 + }, + { + "epoch": 18.66703910614525, + "grad_norm": 0.37992382049560547, + "learning_rate": 6.725490196078432e-05, + "loss": 0.4249, + "step": 33414 + }, + { + "epoch": 18.66759776536313, + "grad_norm": 0.33368533849716187, + "learning_rate": 6.722689075630252e-05, + "loss": 0.4076, + "step": 33415 + }, + { + "epoch": 18.668156424581007, + "grad_norm": 0.41844138503074646, + "learning_rate": 6.719887955182073e-05, + "loss": 0.5004, + "step": 33416 + }, + { + "epoch": 18.668715083798883, + "grad_norm": 0.6788742542266846, + "learning_rate": 6.717086834733893e-05, + "loss": 0.5111, + "step": 33417 + }, + { + "epoch": 18.66927374301676, + "grad_norm": 0.467020183801651, + "learning_rate": 6.714285714285714e-05, + "loss": 0.3777, + "step": 33418 + }, + { + "epoch": 18.669832402234636, + "grad_norm": 0.3772476315498352, + "learning_rate": 6.711484593837536e-05, + "loss": 0.4085, + "step": 33419 + }, + { + "epoch": 18.670391061452513, + "grad_norm": 0.38728058338165283, + "learning_rate": 6.708683473389355e-05, + "loss": 0.3391, + "step": 33420 + }, + { + "epoch": 18.67094972067039, + "grad_norm": 0.42252540588378906, + "learning_rate": 6.705882352941177e-05, + "loss": 0.4304, + "step": 33421 + }, + { + "epoch": 18.67150837988827, + "grad_norm": 0.42641809582710266, + "learning_rate": 6.703081232492996e-05, + "loss": 0.4231, + "step": 33422 + }, + { + "epoch": 18.672067039106146, + "grad_norm": 6.117280960083008, + "learning_rate": 6.700280112044818e-05, + "loss": 0.3582, + "step": 33423 + }, + { + "epoch": 18.672625698324023, + "grad_norm": 0.34205129742622375, + "learning_rate": 6.697478991596639e-05, + "loss": 0.3737, + "step": 33424 + }, + { + "epoch": 18.6731843575419, + "grad_norm": 0.38349395990371704, + "learning_rate": 6.69467787114846e-05, + "loss": 0.384, + "step": 33425 + }, + { + "epoch": 18.673743016759776, + "grad_norm": 5.445019721984863, + "learning_rate": 6.69187675070028e-05, + "loss": 0.549, + "step": 33426 + }, + { + "epoch": 18.674301675977652, + "grad_norm": 0.3726084232330322, + "learning_rate": 6.689075630252101e-05, + "loss": 0.3219, + "step": 33427 + }, + { + "epoch": 18.674860335195532, + "grad_norm": 0.4386651813983917, + "learning_rate": 6.686274509803921e-05, + "loss": 0.2697, + "step": 33428 + }, + { + "epoch": 18.67541899441341, + "grad_norm": 0.35443681478500366, + "learning_rate": 6.683473389355743e-05, + "loss": 0.388, + "step": 33429 + }, + { + "epoch": 18.675977653631286, + "grad_norm": 0.5843977928161621, + "learning_rate": 6.680672268907563e-05, + "loss": 0.492, + "step": 33430 + }, + { + "epoch": 18.676536312849162, + "grad_norm": 0.37908267974853516, + "learning_rate": 6.677871148459385e-05, + "loss": 0.3413, + "step": 33431 + }, + { + "epoch": 18.67709497206704, + "grad_norm": 0.5288969278335571, + "learning_rate": 6.675070028011204e-05, + "loss": 0.4877, + "step": 33432 + }, + { + "epoch": 18.677653631284915, + "grad_norm": 0.37134626507759094, + "learning_rate": 6.672268907563026e-05, + "loss": 0.3633, + "step": 33433 + }, + { + "epoch": 18.678212290502792, + "grad_norm": 0.38918158411979675, + "learning_rate": 6.669467787114845e-05, + "loss": 0.3626, + "step": 33434 + }, + { + "epoch": 18.678770949720672, + "grad_norm": 0.3228667378425598, + "learning_rate": 6.666666666666667e-05, + "loss": 0.4075, + "step": 33435 + }, + { + "epoch": 18.67932960893855, + "grad_norm": 0.6147653460502625, + "learning_rate": 6.663865546218488e-05, + "loss": 0.4991, + "step": 33436 + }, + { + "epoch": 18.679888268156425, + "grad_norm": 1.1263574361801147, + "learning_rate": 6.661064425770308e-05, + "loss": 0.513, + "step": 33437 + }, + { + "epoch": 18.6804469273743, + "grad_norm": 0.332773894071579, + "learning_rate": 6.658263305322129e-05, + "loss": 0.3661, + "step": 33438 + }, + { + "epoch": 18.68100558659218, + "grad_norm": 0.4812290668487549, + "learning_rate": 6.65546218487395e-05, + "loss": 0.4563, + "step": 33439 + }, + { + "epoch": 18.681564245810055, + "grad_norm": 0.35524433851242065, + "learning_rate": 6.65266106442577e-05, + "loss": 0.3677, + "step": 33440 + }, + { + "epoch": 18.68212290502793, + "grad_norm": 0.380925714969635, + "learning_rate": 6.649859943977592e-05, + "loss": 0.5541, + "step": 33441 + }, + { + "epoch": 18.68268156424581, + "grad_norm": 0.47978392243385315, + "learning_rate": 6.647058823529411e-05, + "loss": 0.527, + "step": 33442 + }, + { + "epoch": 18.683240223463688, + "grad_norm": 0.3735135495662689, + "learning_rate": 6.644257703081233e-05, + "loss": 0.4005, + "step": 33443 + }, + { + "epoch": 18.683798882681565, + "grad_norm": 0.6373332142829895, + "learning_rate": 6.641456582633052e-05, + "loss": 0.4329, + "step": 33444 + }, + { + "epoch": 18.68435754189944, + "grad_norm": 0.3036915063858032, + "learning_rate": 6.638655462184874e-05, + "loss": 0.2913, + "step": 33445 + }, + { + "epoch": 18.684916201117318, + "grad_norm": 0.5199482440948486, + "learning_rate": 6.635854341736695e-05, + "loss": 0.5777, + "step": 33446 + }, + { + "epoch": 18.685474860335194, + "grad_norm": 0.36825549602508545, + "learning_rate": 6.633053221288516e-05, + "loss": 0.3525, + "step": 33447 + }, + { + "epoch": 18.68603351955307, + "grad_norm": 0.29921406507492065, + "learning_rate": 6.630252100840336e-05, + "loss": 0.3117, + "step": 33448 + }, + { + "epoch": 18.68659217877095, + "grad_norm": 0.429211288690567, + "learning_rate": 6.627450980392157e-05, + "loss": 0.3609, + "step": 33449 + }, + { + "epoch": 18.687150837988828, + "grad_norm": 0.35947978496551514, + "learning_rate": 6.624649859943977e-05, + "loss": 0.3826, + "step": 33450 + }, + { + "epoch": 18.687709497206704, + "grad_norm": 0.4741494059562683, + "learning_rate": 6.621848739495799e-05, + "loss": 0.3908, + "step": 33451 + }, + { + "epoch": 18.68826815642458, + "grad_norm": 0.9542248845100403, + "learning_rate": 6.619047619047619e-05, + "loss": 0.3512, + "step": 33452 + }, + { + "epoch": 18.688826815642457, + "grad_norm": 0.469257116317749, + "learning_rate": 6.61624649859944e-05, + "loss": 0.4691, + "step": 33453 + }, + { + "epoch": 18.689385474860334, + "grad_norm": 0.37024492025375366, + "learning_rate": 6.61344537815126e-05, + "loss": 0.3029, + "step": 33454 + }, + { + "epoch": 18.689944134078214, + "grad_norm": 0.38087335228919983, + "learning_rate": 6.610644257703082e-05, + "loss": 0.4421, + "step": 33455 + }, + { + "epoch": 18.69050279329609, + "grad_norm": 0.5070014595985413, + "learning_rate": 6.607843137254902e-05, + "loss": 0.4671, + "step": 33456 + }, + { + "epoch": 18.691061452513967, + "grad_norm": 0.47173941135406494, + "learning_rate": 6.605042016806723e-05, + "loss": 0.4226, + "step": 33457 + }, + { + "epoch": 18.691620111731844, + "grad_norm": 1.1254301071166992, + "learning_rate": 6.602240896358543e-05, + "loss": 0.5326, + "step": 33458 + }, + { + "epoch": 18.69217877094972, + "grad_norm": 0.3740171492099762, + "learning_rate": 6.599439775910364e-05, + "loss": 0.3618, + "step": 33459 + }, + { + "epoch": 18.692737430167597, + "grad_norm": 1.5500136613845825, + "learning_rate": 6.596638655462185e-05, + "loss": 0.4863, + "step": 33460 + }, + { + "epoch": 18.693296089385473, + "grad_norm": 0.40329331159591675, + "learning_rate": 6.593837535014005e-05, + "loss": 0.36, + "step": 33461 + }, + { + "epoch": 18.693854748603353, + "grad_norm": 1.1434741020202637, + "learning_rate": 6.591036414565826e-05, + "loss": 0.5125, + "step": 33462 + }, + { + "epoch": 18.69441340782123, + "grad_norm": 0.4510940611362457, + "learning_rate": 6.588235294117648e-05, + "loss": 0.4034, + "step": 33463 + }, + { + "epoch": 18.694972067039107, + "grad_norm": 0.37764859199523926, + "learning_rate": 6.585434173669467e-05, + "loss": 0.42, + "step": 33464 + }, + { + "epoch": 18.695530726256983, + "grad_norm": 0.4557996392250061, + "learning_rate": 6.582633053221289e-05, + "loss": 0.3941, + "step": 33465 + }, + { + "epoch": 18.69608938547486, + "grad_norm": 0.42460697889328003, + "learning_rate": 6.579831932773108e-05, + "loss": 0.5063, + "step": 33466 + }, + { + "epoch": 18.696648044692736, + "grad_norm": 1.5521520376205444, + "learning_rate": 6.57703081232493e-05, + "loss": 0.4302, + "step": 33467 + }, + { + "epoch": 18.697206703910613, + "grad_norm": 0.6736728549003601, + "learning_rate": 6.574229691876751e-05, + "loss": 0.376, + "step": 33468 + }, + { + "epoch": 18.697765363128493, + "grad_norm": 0.5069401860237122, + "learning_rate": 6.571428571428571e-05, + "loss": 0.6019, + "step": 33469 + }, + { + "epoch": 18.69832402234637, + "grad_norm": 0.557361900806427, + "learning_rate": 6.568627450980392e-05, + "loss": 0.4209, + "step": 33470 + }, + { + "epoch": 18.698882681564246, + "grad_norm": 0.3788871169090271, + "learning_rate": 6.565826330532213e-05, + "loss": 0.4169, + "step": 33471 + }, + { + "epoch": 18.699441340782123, + "grad_norm": 0.5408769845962524, + "learning_rate": 6.563025210084033e-05, + "loss": 0.4089, + "step": 33472 + }, + { + "epoch": 18.7, + "grad_norm": 0.7183144688606262, + "learning_rate": 6.560224089635855e-05, + "loss": 0.3244, + "step": 33473 + }, + { + "epoch": 18.700558659217876, + "grad_norm": 0.38459843397140503, + "learning_rate": 6.557422969187674e-05, + "loss": 0.4013, + "step": 33474 + }, + { + "epoch": 18.701117318435756, + "grad_norm": 1.41618812084198, + "learning_rate": 6.554621848739496e-05, + "loss": 0.5249, + "step": 33475 + }, + { + "epoch": 18.701675977653633, + "grad_norm": 0.6083825826644897, + "learning_rate": 6.551820728291316e-05, + "loss": 0.4764, + "step": 33476 + }, + { + "epoch": 18.70223463687151, + "grad_norm": 0.49202781915664673, + "learning_rate": 6.549019607843138e-05, + "loss": 0.4582, + "step": 33477 + }, + { + "epoch": 18.702793296089386, + "grad_norm": 0.3436841666698456, + "learning_rate": 6.546218487394958e-05, + "loss": 0.2664, + "step": 33478 + }, + { + "epoch": 18.703351955307262, + "grad_norm": 0.4124920666217804, + "learning_rate": 6.543417366946779e-05, + "loss": 0.3201, + "step": 33479 + }, + { + "epoch": 18.70391061452514, + "grad_norm": 0.45277103781700134, + "learning_rate": 6.5406162464986e-05, + "loss": 0.423, + "step": 33480 + }, + { + "epoch": 18.704469273743015, + "grad_norm": 0.38388049602508545, + "learning_rate": 6.53781512605042e-05, + "loss": 0.3584, + "step": 33481 + }, + { + "epoch": 18.705027932960895, + "grad_norm": 0.42084765434265137, + "learning_rate": 6.53501400560224e-05, + "loss": 0.3358, + "step": 33482 + }, + { + "epoch": 18.705586592178772, + "grad_norm": 0.7023029327392578, + "learning_rate": 6.532212885154063e-05, + "loss": 0.4595, + "step": 33483 + }, + { + "epoch": 18.70614525139665, + "grad_norm": 0.7741283178329468, + "learning_rate": 6.529411764705882e-05, + "loss": 0.4092, + "step": 33484 + }, + { + "epoch": 18.706703910614525, + "grad_norm": 0.3026845455169678, + "learning_rate": 6.526610644257704e-05, + "loss": 0.3626, + "step": 33485 + }, + { + "epoch": 18.7072625698324, + "grad_norm": 0.345252126455307, + "learning_rate": 6.523809523809523e-05, + "loss": 0.4136, + "step": 33486 + }, + { + "epoch": 18.70782122905028, + "grad_norm": 0.5867792963981628, + "learning_rate": 6.521008403361345e-05, + "loss": 0.4503, + "step": 33487 + }, + { + "epoch": 18.708379888268155, + "grad_norm": 0.358710914850235, + "learning_rate": 6.518207282913164e-05, + "loss": 0.4142, + "step": 33488 + }, + { + "epoch": 18.708938547486035, + "grad_norm": 0.6675719618797302, + "learning_rate": 6.515406162464986e-05, + "loss": 0.2848, + "step": 33489 + }, + { + "epoch": 18.70949720670391, + "grad_norm": 0.5228134989738464, + "learning_rate": 6.512605042016807e-05, + "loss": 0.4357, + "step": 33490 + }, + { + "epoch": 18.710055865921788, + "grad_norm": 0.8309877514839172, + "learning_rate": 6.509803921568627e-05, + "loss": 0.3157, + "step": 33491 + }, + { + "epoch": 18.710614525139665, + "grad_norm": 1.5525248050689697, + "learning_rate": 6.507002801120448e-05, + "loss": 0.4008, + "step": 33492 + }, + { + "epoch": 18.71117318435754, + "grad_norm": 0.4749762713909149, + "learning_rate": 6.504201680672269e-05, + "loss": 0.4309, + "step": 33493 + }, + { + "epoch": 18.711731843575418, + "grad_norm": 0.44904613494873047, + "learning_rate": 6.501400560224089e-05, + "loss": 0.4429, + "step": 33494 + }, + { + "epoch": 18.712290502793294, + "grad_norm": 0.46362900733947754, + "learning_rate": 6.498599439775911e-05, + "loss": 0.4831, + "step": 33495 + }, + { + "epoch": 18.712849162011175, + "grad_norm": 0.4536849558353424, + "learning_rate": 6.49579831932773e-05, + "loss": 0.4639, + "step": 33496 + }, + { + "epoch": 18.71340782122905, + "grad_norm": 0.6675393581390381, + "learning_rate": 6.492997198879552e-05, + "loss": 0.3892, + "step": 33497 + }, + { + "epoch": 18.713966480446928, + "grad_norm": 0.42143043875694275, + "learning_rate": 6.490196078431372e-05, + "loss": 0.4607, + "step": 33498 + }, + { + "epoch": 18.714525139664804, + "grad_norm": 0.43682605028152466, + "learning_rate": 6.487394957983193e-05, + "loss": 0.4065, + "step": 33499 + }, + { + "epoch": 18.71508379888268, + "grad_norm": 0.4687761664390564, + "learning_rate": 6.484593837535014e-05, + "loss": 0.4763, + "step": 33500 + }, + { + "epoch": 18.71508379888268, + "eval_cer": 0.08476536501805723, + "eval_loss": 0.3193410336971283, + "eval_runtime": 55.6868, + "eval_samples_per_second": 81.491, + "eval_steps_per_second": 5.1, + "eval_wer": 0.3354375576101226, + "step": 33500 + }, + { + "epoch": 18.715642458100557, + "grad_norm": 0.6815500855445862, + "learning_rate": 6.481792717086835e-05, + "loss": 0.3896, + "step": 33501 + }, + { + "epoch": 18.716201117318434, + "grad_norm": 1.6572506427764893, + "learning_rate": 6.478991596638655e-05, + "loss": 0.4983, + "step": 33502 + }, + { + "epoch": 18.716759776536314, + "grad_norm": 0.5081488490104675, + "learning_rate": 6.476190476190476e-05, + "loss": 0.4617, + "step": 33503 + }, + { + "epoch": 18.71731843575419, + "grad_norm": 0.3655071258544922, + "learning_rate": 6.473389355742296e-05, + "loss": 0.3722, + "step": 33504 + }, + { + "epoch": 18.717877094972067, + "grad_norm": 1.954050898551941, + "learning_rate": 6.470588235294118e-05, + "loss": 0.3733, + "step": 33505 + }, + { + "epoch": 18.718435754189944, + "grad_norm": 0.47904837131500244, + "learning_rate": 6.467787114845938e-05, + "loss": 0.3619, + "step": 33506 + }, + { + "epoch": 18.71899441340782, + "grad_norm": 0.4262252748012543, + "learning_rate": 6.46498599439776e-05, + "loss": 0.4215, + "step": 33507 + }, + { + "epoch": 18.719553072625697, + "grad_norm": 0.6213170289993286, + "learning_rate": 6.462184873949579e-05, + "loss": 0.4556, + "step": 33508 + }, + { + "epoch": 18.720111731843577, + "grad_norm": 0.5783782005310059, + "learning_rate": 6.459383753501401e-05, + "loss": 0.4961, + "step": 33509 + }, + { + "epoch": 18.720670391061454, + "grad_norm": 0.3850608468055725, + "learning_rate": 6.456582633053221e-05, + "loss": 0.362, + "step": 33510 + }, + { + "epoch": 18.72122905027933, + "grad_norm": 0.5463806390762329, + "learning_rate": 6.453781512605042e-05, + "loss": 0.3582, + "step": 33511 + }, + { + "epoch": 18.721787709497207, + "grad_norm": 0.45780298113822937, + "learning_rate": 6.450980392156863e-05, + "loss": 0.3662, + "step": 33512 + }, + { + "epoch": 18.722346368715083, + "grad_norm": 1.0495574474334717, + "learning_rate": 6.448179271708683e-05, + "loss": 0.375, + "step": 33513 + }, + { + "epoch": 18.72290502793296, + "grad_norm": 0.4512307643890381, + "learning_rate": 6.445378151260504e-05, + "loss": 0.3662, + "step": 33514 + }, + { + "epoch": 18.723463687150836, + "grad_norm": 0.5555634498596191, + "learning_rate": 6.442577030812324e-05, + "loss": 0.5181, + "step": 33515 + }, + { + "epoch": 18.724022346368717, + "grad_norm": 0.5169801712036133, + "learning_rate": 6.439775910364145e-05, + "loss": 0.3613, + "step": 33516 + }, + { + "epoch": 18.724581005586593, + "grad_norm": 0.34606289863586426, + "learning_rate": 6.436974789915967e-05, + "loss": 0.3679, + "step": 33517 + }, + { + "epoch": 18.72513966480447, + "grad_norm": 0.4272880554199219, + "learning_rate": 6.434173669467786e-05, + "loss": 0.3703, + "step": 33518 + }, + { + "epoch": 18.725698324022346, + "grad_norm": 0.31165993213653564, + "learning_rate": 6.431372549019608e-05, + "loss": 0.3263, + "step": 33519 + }, + { + "epoch": 18.726256983240223, + "grad_norm": 1.1470900774002075, + "learning_rate": 6.428571428571427e-05, + "loss": 0.4411, + "step": 33520 + }, + { + "epoch": 18.7268156424581, + "grad_norm": 0.40630728006362915, + "learning_rate": 6.42577030812325e-05, + "loss": 0.3582, + "step": 33521 + }, + { + "epoch": 18.727374301675976, + "grad_norm": 3.5836431980133057, + "learning_rate": 6.42296918767507e-05, + "loss": 0.4071, + "step": 33522 + }, + { + "epoch": 18.727932960893856, + "grad_norm": 1.5012294054031372, + "learning_rate": 6.42016806722689e-05, + "loss": 0.4231, + "step": 33523 + }, + { + "epoch": 18.728491620111733, + "grad_norm": 0.46891170740127563, + "learning_rate": 6.417366946778711e-05, + "loss": 0.5469, + "step": 33524 + }, + { + "epoch": 18.72905027932961, + "grad_norm": 0.3708672523498535, + "learning_rate": 6.414565826330532e-05, + "loss": 0.3395, + "step": 33525 + }, + { + "epoch": 18.729608938547486, + "grad_norm": 0.8656470775604248, + "learning_rate": 6.411764705882352e-05, + "loss": 0.4004, + "step": 33526 + }, + { + "epoch": 18.730167597765362, + "grad_norm": 1.1113722324371338, + "learning_rate": 6.408963585434174e-05, + "loss": 0.2736, + "step": 33527 + }, + { + "epoch": 18.73072625698324, + "grad_norm": 0.43879860639572144, + "learning_rate": 6.406162464985994e-05, + "loss": 0.3823, + "step": 33528 + }, + { + "epoch": 18.73128491620112, + "grad_norm": 0.40990403294563293, + "learning_rate": 6.403361344537816e-05, + "loss": 0.3664, + "step": 33529 + }, + { + "epoch": 18.731843575418996, + "grad_norm": 0.3509818911552429, + "learning_rate": 6.400560224089635e-05, + "loss": 0.3173, + "step": 33530 + }, + { + "epoch": 18.732402234636872, + "grad_norm": 0.36291226744651794, + "learning_rate": 6.397759103641457e-05, + "loss": 0.3569, + "step": 33531 + }, + { + "epoch": 18.73296089385475, + "grad_norm": 0.5059667825698853, + "learning_rate": 6.394957983193279e-05, + "loss": 0.3845, + "step": 33532 + }, + { + "epoch": 18.733519553072625, + "grad_norm": 1.602874755859375, + "learning_rate": 6.392156862745098e-05, + "loss": 0.387, + "step": 33533 + }, + { + "epoch": 18.734078212290502, + "grad_norm": 0.50505530834198, + "learning_rate": 6.38935574229692e-05, + "loss": 0.4263, + "step": 33534 + }, + { + "epoch": 18.73463687150838, + "grad_norm": 0.554422914981842, + "learning_rate": 6.386554621848739e-05, + "loss": 0.3165, + "step": 33535 + }, + { + "epoch": 18.73519553072626, + "grad_norm": 0.410466730594635, + "learning_rate": 6.383753501400561e-05, + "loss": 0.4478, + "step": 33536 + }, + { + "epoch": 18.735754189944135, + "grad_norm": 0.4008373022079468, + "learning_rate": 6.380952380952382e-05, + "loss": 0.4049, + "step": 33537 + }, + { + "epoch": 18.73631284916201, + "grad_norm": 0.5057961344718933, + "learning_rate": 6.378151260504202e-05, + "loss": 0.5317, + "step": 33538 + }, + { + "epoch": 18.73687150837989, + "grad_norm": 0.8561438918113708, + "learning_rate": 6.375350140056023e-05, + "loss": 0.4027, + "step": 33539 + }, + { + "epoch": 18.737430167597765, + "grad_norm": 0.4856656491756439, + "learning_rate": 6.372549019607843e-05, + "loss": 0.3458, + "step": 33540 + }, + { + "epoch": 18.73798882681564, + "grad_norm": 0.31427201628685, + "learning_rate": 6.369747899159664e-05, + "loss": 0.4045, + "step": 33541 + }, + { + "epoch": 18.738547486033518, + "grad_norm": 0.6725631952285767, + "learning_rate": 6.366946778711485e-05, + "loss": 0.4022, + "step": 33542 + }, + { + "epoch": 18.739106145251398, + "grad_norm": 0.5081383585929871, + "learning_rate": 6.364145658263305e-05, + "loss": 0.3974, + "step": 33543 + }, + { + "epoch": 18.739664804469275, + "grad_norm": 0.3647308051586151, + "learning_rate": 6.361344537815127e-05, + "loss": 0.3628, + "step": 33544 + }, + { + "epoch": 18.74022346368715, + "grad_norm": 0.8962703347206116, + "learning_rate": 6.358543417366946e-05, + "loss": 0.5085, + "step": 33545 + }, + { + "epoch": 18.740782122905028, + "grad_norm": 0.41111117601394653, + "learning_rate": 6.355742296918768e-05, + "loss": 0.4359, + "step": 33546 + }, + { + "epoch": 18.741340782122904, + "grad_norm": 0.3825221061706543, + "learning_rate": 6.352941176470588e-05, + "loss": 0.4237, + "step": 33547 + }, + { + "epoch": 18.74189944134078, + "grad_norm": 2.754225969314575, + "learning_rate": 6.35014005602241e-05, + "loss": 0.4702, + "step": 33548 + }, + { + "epoch": 18.742458100558657, + "grad_norm": 0.746883749961853, + "learning_rate": 6.34733893557423e-05, + "loss": 0.5241, + "step": 33549 + }, + { + "epoch": 18.743016759776538, + "grad_norm": 0.46698498725891113, + "learning_rate": 6.344537815126051e-05, + "loss": 0.4536, + "step": 33550 + }, + { + "epoch": 18.743575418994414, + "grad_norm": 0.32715073227882385, + "learning_rate": 6.341736694677871e-05, + "loss": 0.3052, + "step": 33551 + }, + { + "epoch": 18.74413407821229, + "grad_norm": 0.3255140781402588, + "learning_rate": 6.338935574229692e-05, + "loss": 0.4001, + "step": 33552 + }, + { + "epoch": 18.744692737430167, + "grad_norm": 1.024924397468567, + "learning_rate": 6.336134453781513e-05, + "loss": 0.4406, + "step": 33553 + }, + { + "epoch": 18.745251396648044, + "grad_norm": 0.4181256592273712, + "learning_rate": 6.333333333333335e-05, + "loss": 0.403, + "step": 33554 + }, + { + "epoch": 18.74581005586592, + "grad_norm": 0.46969282627105713, + "learning_rate": 6.330532212885154e-05, + "loss": 0.4011, + "step": 33555 + }, + { + "epoch": 18.7463687150838, + "grad_norm": 1.2439371347427368, + "learning_rate": 6.327731092436976e-05, + "loss": 0.2997, + "step": 33556 + }, + { + "epoch": 18.746927374301677, + "grad_norm": 0.40725943446159363, + "learning_rate": 6.324929971988795e-05, + "loss": 0.2854, + "step": 33557 + }, + { + "epoch": 18.747486033519554, + "grad_norm": 0.5683439373970032, + "learning_rate": 6.322128851540617e-05, + "loss": 0.3898, + "step": 33558 + }, + { + "epoch": 18.74804469273743, + "grad_norm": 0.45918145775794983, + "learning_rate": 6.319327731092438e-05, + "loss": 0.3301, + "step": 33559 + }, + { + "epoch": 18.748603351955307, + "grad_norm": 0.5282741785049438, + "learning_rate": 6.316526610644258e-05, + "loss": 0.4106, + "step": 33560 + }, + { + "epoch": 18.749162011173183, + "grad_norm": 0.3408030569553375, + "learning_rate": 6.313725490196079e-05, + "loss": 0.3614, + "step": 33561 + }, + { + "epoch": 18.74972067039106, + "grad_norm": 0.2686549723148346, + "learning_rate": 6.3109243697479e-05, + "loss": 0.367, + "step": 33562 + }, + { + "epoch": 18.75027932960894, + "grad_norm": 0.4324513077735901, + "learning_rate": 6.30812324929972e-05, + "loss": 0.4271, + "step": 33563 + }, + { + "epoch": 18.750837988826817, + "grad_norm": 0.44949960708618164, + "learning_rate": 6.305322128851542e-05, + "loss": 0.448, + "step": 33564 + }, + { + "epoch": 18.751396648044693, + "grad_norm": 0.994263231754303, + "learning_rate": 6.302521008403361e-05, + "loss": 0.3576, + "step": 33565 + }, + { + "epoch": 18.75195530726257, + "grad_norm": 1.548141598701477, + "learning_rate": 6.299719887955183e-05, + "loss": 0.491, + "step": 33566 + }, + { + "epoch": 18.752513966480446, + "grad_norm": 1.3650633096694946, + "learning_rate": 6.296918767507002e-05, + "loss": 0.4545, + "step": 33567 + }, + { + "epoch": 18.753072625698323, + "grad_norm": 0.41302624344825745, + "learning_rate": 6.294117647058824e-05, + "loss": 0.4181, + "step": 33568 + }, + { + "epoch": 18.7536312849162, + "grad_norm": 0.661688506603241, + "learning_rate": 6.291316526610645e-05, + "loss": 0.4612, + "step": 33569 + }, + { + "epoch": 18.75418994413408, + "grad_norm": 0.4149094820022583, + "learning_rate": 6.288515406162466e-05, + "loss": 0.3798, + "step": 33570 + }, + { + "epoch": 18.754748603351956, + "grad_norm": 1.880969762802124, + "learning_rate": 6.285714285714286e-05, + "loss": 0.3609, + "step": 33571 + }, + { + "epoch": 18.755307262569833, + "grad_norm": 0.4125847816467285, + "learning_rate": 6.282913165266107e-05, + "loss": 0.4451, + "step": 33572 + }, + { + "epoch": 18.75586592178771, + "grad_norm": 0.4462127685546875, + "learning_rate": 6.280112044817927e-05, + "loss": 0.3838, + "step": 33573 + }, + { + "epoch": 18.756424581005586, + "grad_norm": 0.4350358545780182, + "learning_rate": 6.277310924369748e-05, + "loss": 0.3056, + "step": 33574 + }, + { + "epoch": 18.756983240223462, + "grad_norm": 0.36763831973075867, + "learning_rate": 6.274509803921569e-05, + "loss": 0.3435, + "step": 33575 + }, + { + "epoch": 18.757541899441343, + "grad_norm": 0.5017610192298889, + "learning_rate": 6.27170868347339e-05, + "loss": 0.3366, + "step": 33576 + }, + { + "epoch": 18.75810055865922, + "grad_norm": 1.6766163110733032, + "learning_rate": 6.26890756302521e-05, + "loss": 0.398, + "step": 33577 + }, + { + "epoch": 18.758659217877096, + "grad_norm": 0.9491270780563354, + "learning_rate": 6.266106442577032e-05, + "loss": 0.333, + "step": 33578 + }, + { + "epoch": 18.759217877094972, + "grad_norm": 0.3896239101886749, + "learning_rate": 6.263305322128851e-05, + "loss": 0.326, + "step": 33579 + }, + { + "epoch": 18.75977653631285, + "grad_norm": 0.8490318655967712, + "learning_rate": 6.260504201680673e-05, + "loss": 0.3519, + "step": 33580 + }, + { + "epoch": 18.760335195530725, + "grad_norm": 0.5182773470878601, + "learning_rate": 6.257703081232493e-05, + "loss": 0.4598, + "step": 33581 + }, + { + "epoch": 18.760893854748602, + "grad_norm": 0.3434610366821289, + "learning_rate": 6.254901960784314e-05, + "loss": 0.3383, + "step": 33582 + }, + { + "epoch": 18.761452513966482, + "grad_norm": 1.0617681741714478, + "learning_rate": 6.252100840336135e-05, + "loss": 0.4286, + "step": 33583 + }, + { + "epoch": 18.76201117318436, + "grad_norm": 0.8387086391448975, + "learning_rate": 6.249299719887955e-05, + "loss": 0.4394, + "step": 33584 + }, + { + "epoch": 18.762569832402235, + "grad_norm": 0.40427055954933167, + "learning_rate": 6.246498599439776e-05, + "loss": 0.3566, + "step": 33585 + }, + { + "epoch": 18.76312849162011, + "grad_norm": 0.3702031970024109, + "learning_rate": 6.243697478991596e-05, + "loss": 0.3836, + "step": 33586 + }, + { + "epoch": 18.76368715083799, + "grad_norm": 0.29070088267326355, + "learning_rate": 6.240896358543417e-05, + "loss": 0.3017, + "step": 33587 + }, + { + "epoch": 18.764245810055865, + "grad_norm": 0.38933902978897095, + "learning_rate": 6.238095238095239e-05, + "loss": 0.3978, + "step": 33588 + }, + { + "epoch": 18.76480446927374, + "grad_norm": 0.470816045999527, + "learning_rate": 6.23529411764706e-05, + "loss": 0.3778, + "step": 33589 + }, + { + "epoch": 18.76536312849162, + "grad_norm": 0.4901551604270935, + "learning_rate": 6.23249299719888e-05, + "loss": 0.3911, + "step": 33590 + }, + { + "epoch": 18.765921787709498, + "grad_norm": 0.3814218044281006, + "learning_rate": 6.229691876750701e-05, + "loss": 0.3432, + "step": 33591 + }, + { + "epoch": 18.766480446927375, + "grad_norm": 0.6107118129730225, + "learning_rate": 6.226890756302521e-05, + "loss": 0.6505, + "step": 33592 + }, + { + "epoch": 18.76703910614525, + "grad_norm": 0.6372637748718262, + "learning_rate": 6.224089635854342e-05, + "loss": 0.4389, + "step": 33593 + }, + { + "epoch": 18.767597765363128, + "grad_norm": 0.5028481483459473, + "learning_rate": 6.221288515406163e-05, + "loss": 0.4854, + "step": 33594 + }, + { + "epoch": 18.768156424581004, + "grad_norm": 0.3836221694946289, + "learning_rate": 6.218487394957983e-05, + "loss": 0.4245, + "step": 33595 + }, + { + "epoch": 18.76871508379888, + "grad_norm": 0.503173291683197, + "learning_rate": 6.215686274509804e-05, + "loss": 0.3976, + "step": 33596 + }, + { + "epoch": 18.76927374301676, + "grad_norm": 0.6945926547050476, + "learning_rate": 6.212885154061624e-05, + "loss": 0.5094, + "step": 33597 + }, + { + "epoch": 18.769832402234638, + "grad_norm": 0.4380642771720886, + "learning_rate": 6.210084033613445e-05, + "loss": 0.4273, + "step": 33598 + }, + { + "epoch": 18.770391061452514, + "grad_norm": 1.0532293319702148, + "learning_rate": 6.207282913165267e-05, + "loss": 0.3966, + "step": 33599 + }, + { + "epoch": 18.77094972067039, + "grad_norm": 0.303750216960907, + "learning_rate": 6.204481792717088e-05, + "loss": 0.3466, + "step": 33600 + }, + { + "epoch": 18.771508379888267, + "grad_norm": 0.38815316557884216, + "learning_rate": 6.201680672268908e-05, + "loss": 0.4295, + "step": 33601 + }, + { + "epoch": 18.772067039106144, + "grad_norm": 0.48118990659713745, + "learning_rate": 6.198879551820729e-05, + "loss": 0.5432, + "step": 33602 + }, + { + "epoch": 18.772625698324024, + "grad_norm": 0.7466734647750854, + "learning_rate": 6.19607843137255e-05, + "loss": 0.3938, + "step": 33603 + }, + { + "epoch": 18.7731843575419, + "grad_norm": 0.7467971444129944, + "learning_rate": 6.19327731092437e-05, + "loss": 0.4443, + "step": 33604 + }, + { + "epoch": 18.773743016759777, + "grad_norm": 0.6385853290557861, + "learning_rate": 6.19047619047619e-05, + "loss": 0.3208, + "step": 33605 + }, + { + "epoch": 18.774301675977654, + "grad_norm": 0.5847535729408264, + "learning_rate": 6.187675070028011e-05, + "loss": 0.4505, + "step": 33606 + }, + { + "epoch": 18.77486033519553, + "grad_norm": 0.6122723817825317, + "learning_rate": 6.184873949579832e-05, + "loss": 0.4491, + "step": 33607 + }, + { + "epoch": 18.775418994413407, + "grad_norm": 0.37647995352745056, + "learning_rate": 6.182072829131652e-05, + "loss": 0.4095, + "step": 33608 + }, + { + "epoch": 18.775977653631283, + "grad_norm": 0.5081083178520203, + "learning_rate": 6.179271708683473e-05, + "loss": 0.3845, + "step": 33609 + }, + { + "epoch": 18.776536312849164, + "grad_norm": 0.8722805380821228, + "learning_rate": 6.176470588235295e-05, + "loss": 0.2995, + "step": 33610 + }, + { + "epoch": 18.77709497206704, + "grad_norm": 0.4157169759273529, + "learning_rate": 6.173669467787116e-05, + "loss": 0.4648, + "step": 33611 + }, + { + "epoch": 18.777653631284917, + "grad_norm": 0.49111250042915344, + "learning_rate": 6.170868347338936e-05, + "loss": 0.3866, + "step": 33612 + }, + { + "epoch": 18.778212290502793, + "grad_norm": 0.4404752552509308, + "learning_rate": 6.168067226890757e-05, + "loss": 0.4771, + "step": 33613 + }, + { + "epoch": 18.77877094972067, + "grad_norm": 0.8342339396476746, + "learning_rate": 6.165266106442577e-05, + "loss": 0.4572, + "step": 33614 + }, + { + "epoch": 18.779329608938546, + "grad_norm": 0.4370078146457672, + "learning_rate": 6.162464985994398e-05, + "loss": 0.4462, + "step": 33615 + }, + { + "epoch": 18.779888268156423, + "grad_norm": 0.8758747577667236, + "learning_rate": 6.159663865546219e-05, + "loss": 0.4322, + "step": 33616 + }, + { + "epoch": 18.780446927374303, + "grad_norm": 0.37110209465026855, + "learning_rate": 6.156862745098039e-05, + "loss": 0.3856, + "step": 33617 + }, + { + "epoch": 18.78100558659218, + "grad_norm": 0.3819989562034607, + "learning_rate": 6.15406162464986e-05, + "loss": 0.3869, + "step": 33618 + }, + { + "epoch": 18.781564245810056, + "grad_norm": 0.41110458970069885, + "learning_rate": 6.15126050420168e-05, + "loss": 0.3263, + "step": 33619 + }, + { + "epoch": 18.782122905027933, + "grad_norm": 0.37413454055786133, + "learning_rate": 6.148459383753501e-05, + "loss": 0.4122, + "step": 33620 + }, + { + "epoch": 18.78268156424581, + "grad_norm": 0.4960332214832306, + "learning_rate": 6.145658263305323e-05, + "loss": 0.383, + "step": 33621 + }, + { + "epoch": 18.783240223463686, + "grad_norm": 0.30806565284729004, + "learning_rate": 6.142857142857143e-05, + "loss": 0.3718, + "step": 33622 + }, + { + "epoch": 18.783798882681566, + "grad_norm": 0.3246197998523712, + "learning_rate": 6.140056022408964e-05, + "loss": 0.3301, + "step": 33623 + }, + { + "epoch": 18.784357541899443, + "grad_norm": 0.35392487049102783, + "learning_rate": 6.137254901960785e-05, + "loss": 0.2859, + "step": 33624 + }, + { + "epoch": 18.78491620111732, + "grad_norm": 0.44571053981781006, + "learning_rate": 6.134453781512605e-05, + "loss": 0.3502, + "step": 33625 + }, + { + "epoch": 18.785474860335196, + "grad_norm": 0.4495675265789032, + "learning_rate": 6.131652661064426e-05, + "loss": 0.4848, + "step": 33626 + }, + { + "epoch": 18.786033519553072, + "grad_norm": 1.0844656229019165, + "learning_rate": 6.128851540616246e-05, + "loss": 0.3464, + "step": 33627 + }, + { + "epoch": 18.78659217877095, + "grad_norm": 0.5044671297073364, + "learning_rate": 6.126050420168067e-05, + "loss": 0.4682, + "step": 33628 + }, + { + "epoch": 18.787150837988825, + "grad_norm": 0.8576611280441284, + "learning_rate": 6.123249299719888e-05, + "loss": 0.3665, + "step": 33629 + }, + { + "epoch": 18.787709497206706, + "grad_norm": 0.518936038017273, + "learning_rate": 6.120448179271708e-05, + "loss": 0.4651, + "step": 33630 + }, + { + "epoch": 18.788268156424582, + "grad_norm": 0.3987240493297577, + "learning_rate": 6.11764705882353e-05, + "loss": 0.3649, + "step": 33631 + }, + { + "epoch": 18.78882681564246, + "grad_norm": 4.616729736328125, + "learning_rate": 6.114845938375351e-05, + "loss": 0.398, + "step": 33632 + }, + { + "epoch": 18.789385474860335, + "grad_norm": 0.5554696321487427, + "learning_rate": 6.112044817927171e-05, + "loss": 0.4504, + "step": 33633 + }, + { + "epoch": 18.789944134078212, + "grad_norm": 0.47344985604286194, + "learning_rate": 6.109243697478992e-05, + "loss": 0.3926, + "step": 33634 + }, + { + "epoch": 18.79050279329609, + "grad_norm": 0.5345861315727234, + "learning_rate": 6.106442577030813e-05, + "loss": 0.4997, + "step": 33635 + }, + { + "epoch": 18.791061452513965, + "grad_norm": 0.4340493381023407, + "learning_rate": 6.103641456582633e-05, + "loss": 0.3127, + "step": 33636 + }, + { + "epoch": 18.791620111731845, + "grad_norm": 0.4726484417915344, + "learning_rate": 6.100840336134454e-05, + "loss": 0.3954, + "step": 33637 + }, + { + "epoch": 18.79217877094972, + "grad_norm": 0.4120994806289673, + "learning_rate": 6.0980392156862744e-05, + "loss": 0.4294, + "step": 33638 + }, + { + "epoch": 18.7927374301676, + "grad_norm": 0.4894300699234009, + "learning_rate": 6.095238095238095e-05, + "loss": 0.4215, + "step": 33639 + }, + { + "epoch": 18.793296089385475, + "grad_norm": 0.5503779053688049, + "learning_rate": 6.0924369747899156e-05, + "loss": 0.4988, + "step": 33640 + }, + { + "epoch": 18.79385474860335, + "grad_norm": 6.059313774108887, + "learning_rate": 6.089635854341736e-05, + "loss": 0.478, + "step": 33641 + }, + { + "epoch": 18.794413407821228, + "grad_norm": 1.0625801086425781, + "learning_rate": 6.086834733893558e-05, + "loss": 0.3867, + "step": 33642 + }, + { + "epoch": 18.794972067039105, + "grad_norm": 0.46304410696029663, + "learning_rate": 6.084033613445379e-05, + "loss": 0.4384, + "step": 33643 + }, + { + "epoch": 18.795530726256985, + "grad_norm": 0.8256794214248657, + "learning_rate": 6.0812324929971994e-05, + "loss": 0.6431, + "step": 33644 + }, + { + "epoch": 18.79608938547486, + "grad_norm": 0.39776909351348877, + "learning_rate": 6.07843137254902e-05, + "loss": 0.3513, + "step": 33645 + }, + { + "epoch": 18.796648044692738, + "grad_norm": 0.40414267778396606, + "learning_rate": 6.0756302521008406e-05, + "loss": 0.3771, + "step": 33646 + }, + { + "epoch": 18.797206703910614, + "grad_norm": 0.5518737435340881, + "learning_rate": 6.072829131652661e-05, + "loss": 0.3924, + "step": 33647 + }, + { + "epoch": 18.79776536312849, + "grad_norm": 0.3767162561416626, + "learning_rate": 6.0700280112044825e-05, + "loss": 0.4086, + "step": 33648 + }, + { + "epoch": 18.798324022346367, + "grad_norm": 0.4354788064956665, + "learning_rate": 6.067226890756303e-05, + "loss": 0.416, + "step": 33649 + }, + { + "epoch": 18.798882681564244, + "grad_norm": 0.4227149188518524, + "learning_rate": 6.0644257703081237e-05, + "loss": 0.3765, + "step": 33650 + }, + { + "epoch": 18.799441340782124, + "grad_norm": 0.48564037680625916, + "learning_rate": 6.061624649859944e-05, + "loss": 0.3945, + "step": 33651 + }, + { + "epoch": 18.8, + "grad_norm": 0.44414955377578735, + "learning_rate": 6.058823529411765e-05, + "loss": 0.3253, + "step": 33652 + }, + { + "epoch": 18.800558659217877, + "grad_norm": 0.5386155843734741, + "learning_rate": 6.056022408963586e-05, + "loss": 0.4393, + "step": 33653 + }, + { + "epoch": 18.801117318435754, + "grad_norm": 0.3885471224784851, + "learning_rate": 6.053221288515407e-05, + "loss": 0.4009, + "step": 33654 + }, + { + "epoch": 18.80167597765363, + "grad_norm": 0.642909049987793, + "learning_rate": 6.050420168067227e-05, + "loss": 0.4884, + "step": 33655 + }, + { + "epoch": 18.802234636871507, + "grad_norm": 0.5241016745567322, + "learning_rate": 6.047619047619048e-05, + "loss": 0.41, + "step": 33656 + }, + { + "epoch": 18.802793296089387, + "grad_norm": 0.43042615056037903, + "learning_rate": 6.0448179271708685e-05, + "loss": 0.4248, + "step": 33657 + }, + { + "epoch": 18.803351955307264, + "grad_norm": 0.3907735347747803, + "learning_rate": 6.04201680672269e-05, + "loss": 0.3917, + "step": 33658 + }, + { + "epoch": 18.80391061452514, + "grad_norm": 0.3452138602733612, + "learning_rate": 6.0392156862745104e-05, + "loss": 0.3919, + "step": 33659 + }, + { + "epoch": 18.804469273743017, + "grad_norm": 0.3378983736038208, + "learning_rate": 6.036414565826331e-05, + "loss": 0.3564, + "step": 33660 + }, + { + "epoch": 18.805027932960893, + "grad_norm": 0.3364536762237549, + "learning_rate": 6.0336134453781516e-05, + "loss": 0.352, + "step": 33661 + }, + { + "epoch": 18.80558659217877, + "grad_norm": 0.41994136571884155, + "learning_rate": 6.030812324929972e-05, + "loss": 0.4406, + "step": 33662 + }, + { + "epoch": 18.806145251396647, + "grad_norm": 1.952738642692566, + "learning_rate": 6.028011204481793e-05, + "loss": 0.4115, + "step": 33663 + }, + { + "epoch": 18.806703910614527, + "grad_norm": 0.33895957469940186, + "learning_rate": 6.025210084033614e-05, + "loss": 0.341, + "step": 33664 + }, + { + "epoch": 18.807262569832403, + "grad_norm": 3.4712064266204834, + "learning_rate": 6.022408963585435e-05, + "loss": 0.4661, + "step": 33665 + }, + { + "epoch": 18.80782122905028, + "grad_norm": 0.48126447200775146, + "learning_rate": 6.019607843137255e-05, + "loss": 0.3822, + "step": 33666 + }, + { + "epoch": 18.808379888268156, + "grad_norm": 0.4363022446632385, + "learning_rate": 6.016806722689076e-05, + "loss": 0.355, + "step": 33667 + }, + { + "epoch": 18.808938547486033, + "grad_norm": 0.42777401208877563, + "learning_rate": 6.0140056022408965e-05, + "loss": 0.464, + "step": 33668 + }, + { + "epoch": 18.80949720670391, + "grad_norm": 0.6931414008140564, + "learning_rate": 6.011204481792718e-05, + "loss": 0.3896, + "step": 33669 + }, + { + "epoch": 18.810055865921786, + "grad_norm": 0.4182848334312439, + "learning_rate": 6.0084033613445384e-05, + "loss": 0.4577, + "step": 33670 + }, + { + "epoch": 18.810614525139666, + "grad_norm": 0.43032410740852356, + "learning_rate": 6.005602240896359e-05, + "loss": 0.4036, + "step": 33671 + }, + { + "epoch": 18.811173184357543, + "grad_norm": 0.43364471197128296, + "learning_rate": 6.0028011204481796e-05, + "loss": 0.382, + "step": 33672 + }, + { + "epoch": 18.81173184357542, + "grad_norm": 0.5476785898208618, + "learning_rate": 6e-05, + "loss": 0.4643, + "step": 33673 + }, + { + "epoch": 18.812290502793296, + "grad_norm": 0.3527085781097412, + "learning_rate": 5.9971988795518214e-05, + "loss": 0.3618, + "step": 33674 + }, + { + "epoch": 18.812849162011172, + "grad_norm": 0.3852047622203827, + "learning_rate": 5.994397759103642e-05, + "loss": 0.3997, + "step": 33675 + }, + { + "epoch": 18.81340782122905, + "grad_norm": 0.4716630280017853, + "learning_rate": 5.9915966386554626e-05, + "loss": 0.347, + "step": 33676 + }, + { + "epoch": 18.81396648044693, + "grad_norm": 0.5135959386825562, + "learning_rate": 5.988795518207283e-05, + "loss": 0.4655, + "step": 33677 + }, + { + "epoch": 18.814525139664806, + "grad_norm": 0.40920212864875793, + "learning_rate": 5.985994397759104e-05, + "loss": 0.4981, + "step": 33678 + }, + { + "epoch": 18.815083798882682, + "grad_norm": 0.38840556144714355, + "learning_rate": 5.9831932773109244e-05, + "loss": 0.3182, + "step": 33679 + }, + { + "epoch": 18.81564245810056, + "grad_norm": 0.4323107898235321, + "learning_rate": 5.980392156862746e-05, + "loss": 0.4632, + "step": 33680 + }, + { + "epoch": 18.816201117318435, + "grad_norm": 0.4154958426952362, + "learning_rate": 5.977591036414566e-05, + "loss": 0.4092, + "step": 33681 + }, + { + "epoch": 18.816759776536312, + "grad_norm": 0.5845361351966858, + "learning_rate": 5.974789915966387e-05, + "loss": 0.4858, + "step": 33682 + }, + { + "epoch": 18.81731843575419, + "grad_norm": 0.5577226281166077, + "learning_rate": 5.9719887955182075e-05, + "loss": 0.4394, + "step": 33683 + }, + { + "epoch": 18.81787709497207, + "grad_norm": 0.7876600623130798, + "learning_rate": 5.969187675070028e-05, + "loss": 0.4335, + "step": 33684 + }, + { + "epoch": 18.818435754189945, + "grad_norm": 1.0263603925704956, + "learning_rate": 5.9663865546218494e-05, + "loss": 0.375, + "step": 33685 + }, + { + "epoch": 18.81899441340782, + "grad_norm": 0.9165291786193848, + "learning_rate": 5.96358543417367e-05, + "loss": 0.3378, + "step": 33686 + }, + { + "epoch": 18.8195530726257, + "grad_norm": 0.9924935698509216, + "learning_rate": 5.9607843137254906e-05, + "loss": 0.3996, + "step": 33687 + }, + { + "epoch": 18.820111731843575, + "grad_norm": 0.48136764764785767, + "learning_rate": 5.957983193277311e-05, + "loss": 0.5362, + "step": 33688 + }, + { + "epoch": 18.82067039106145, + "grad_norm": 0.39025384187698364, + "learning_rate": 5.955182072829132e-05, + "loss": 0.3445, + "step": 33689 + }, + { + "epoch": 18.821229050279328, + "grad_norm": 0.4123421609401703, + "learning_rate": 5.9523809523809524e-05, + "loss": 0.4086, + "step": 33690 + }, + { + "epoch": 18.821787709497208, + "grad_norm": 0.36732372641563416, + "learning_rate": 5.9495798319327737e-05, + "loss": 0.3554, + "step": 33691 + }, + { + "epoch": 18.822346368715085, + "grad_norm": 0.4214653968811035, + "learning_rate": 5.946778711484594e-05, + "loss": 0.4356, + "step": 33692 + }, + { + "epoch": 18.82290502793296, + "grad_norm": 0.6310732960700989, + "learning_rate": 5.943977591036415e-05, + "loss": 0.5798, + "step": 33693 + }, + { + "epoch": 18.823463687150838, + "grad_norm": 0.4226304888725281, + "learning_rate": 5.9411764705882355e-05, + "loss": 0.4193, + "step": 33694 + }, + { + "epoch": 18.824022346368714, + "grad_norm": 0.4110008776187897, + "learning_rate": 5.938375350140056e-05, + "loss": 0.5657, + "step": 33695 + }, + { + "epoch": 18.82458100558659, + "grad_norm": 0.33921244740486145, + "learning_rate": 5.935574229691877e-05, + "loss": 0.324, + "step": 33696 + }, + { + "epoch": 18.825139664804468, + "grad_norm": 0.7263866662979126, + "learning_rate": 5.932773109243698e-05, + "loss": 0.5492, + "step": 33697 + }, + { + "epoch": 18.825698324022348, + "grad_norm": 0.4138749837875366, + "learning_rate": 5.9299719887955185e-05, + "loss": 0.3825, + "step": 33698 + }, + { + "epoch": 18.826256983240224, + "grad_norm": 1.0773077011108398, + "learning_rate": 5.927170868347339e-05, + "loss": 0.4271, + "step": 33699 + }, + { + "epoch": 18.8268156424581, + "grad_norm": 1.0225144624710083, + "learning_rate": 5.92436974789916e-05, + "loss": 0.469, + "step": 33700 + }, + { + "epoch": 18.827374301675977, + "grad_norm": 0.5519649982452393, + "learning_rate": 5.921568627450981e-05, + "loss": 0.5048, + "step": 33701 + }, + { + "epoch": 18.827932960893854, + "grad_norm": 0.4285273849964142, + "learning_rate": 5.9187675070028016e-05, + "loss": 0.3719, + "step": 33702 + }, + { + "epoch": 18.82849162011173, + "grad_norm": 0.3183865547180176, + "learning_rate": 5.915966386554622e-05, + "loss": 0.3398, + "step": 33703 + }, + { + "epoch": 18.82905027932961, + "grad_norm": 0.35934627056121826, + "learning_rate": 5.913165266106443e-05, + "loss": 0.2945, + "step": 33704 + }, + { + "epoch": 18.829608938547487, + "grad_norm": 0.3428328037261963, + "learning_rate": 5.9103641456582634e-05, + "loss": 0.4252, + "step": 33705 + }, + { + "epoch": 18.830167597765364, + "grad_norm": 0.399688720703125, + "learning_rate": 5.907563025210084e-05, + "loss": 0.3838, + "step": 33706 + }, + { + "epoch": 18.83072625698324, + "grad_norm": 0.43299439549446106, + "learning_rate": 5.904761904761905e-05, + "loss": 0.5022, + "step": 33707 + }, + { + "epoch": 18.831284916201117, + "grad_norm": 0.5337679982185364, + "learning_rate": 5.901960784313726e-05, + "loss": 0.3524, + "step": 33708 + }, + { + "epoch": 18.831843575418993, + "grad_norm": 0.5172250866889954, + "learning_rate": 5.8991596638655465e-05, + "loss": 0.3645, + "step": 33709 + }, + { + "epoch": 18.83240223463687, + "grad_norm": 0.4401620924472809, + "learning_rate": 5.896358543417367e-05, + "loss": 0.3656, + "step": 33710 + }, + { + "epoch": 18.83296089385475, + "grad_norm": 0.4201304018497467, + "learning_rate": 5.893557422969188e-05, + "loss": 0.4456, + "step": 33711 + }, + { + "epoch": 18.833519553072627, + "grad_norm": 0.48590970039367676, + "learning_rate": 5.890756302521009e-05, + "loss": 0.4417, + "step": 33712 + }, + { + "epoch": 18.834078212290503, + "grad_norm": 0.40995311737060547, + "learning_rate": 5.8879551820728296e-05, + "loss": 0.3401, + "step": 33713 + }, + { + "epoch": 18.83463687150838, + "grad_norm": 1.6525832414627075, + "learning_rate": 5.88515406162465e-05, + "loss": 0.3963, + "step": 33714 + }, + { + "epoch": 18.835195530726256, + "grad_norm": 0.6423236727714539, + "learning_rate": 5.882352941176471e-05, + "loss": 0.5462, + "step": 33715 + }, + { + "epoch": 18.835754189944133, + "grad_norm": 0.39395588636398315, + "learning_rate": 5.8795518207282914e-05, + "loss": 0.4129, + "step": 33716 + }, + { + "epoch": 18.83631284916201, + "grad_norm": 0.3613419234752655, + "learning_rate": 5.876750700280112e-05, + "loss": 0.2915, + "step": 33717 + }, + { + "epoch": 18.83687150837989, + "grad_norm": 0.5271963477134705, + "learning_rate": 5.873949579831933e-05, + "loss": 0.4528, + "step": 33718 + }, + { + "epoch": 18.837430167597766, + "grad_norm": 0.37885165214538574, + "learning_rate": 5.871148459383754e-05, + "loss": 0.3127, + "step": 33719 + }, + { + "epoch": 18.837988826815643, + "grad_norm": 0.6319796442985535, + "learning_rate": 5.8683473389355744e-05, + "loss": 0.4789, + "step": 33720 + }, + { + "epoch": 18.83854748603352, + "grad_norm": 0.725278913974762, + "learning_rate": 5.865546218487395e-05, + "loss": 0.4643, + "step": 33721 + }, + { + "epoch": 18.839106145251396, + "grad_norm": 0.3575795292854309, + "learning_rate": 5.8627450980392156e-05, + "loss": 0.3441, + "step": 33722 + }, + { + "epoch": 18.839664804469272, + "grad_norm": 0.4634858965873718, + "learning_rate": 5.859943977591037e-05, + "loss": 0.3861, + "step": 33723 + }, + { + "epoch": 18.840223463687153, + "grad_norm": 0.3979707956314087, + "learning_rate": 5.8571428571428575e-05, + "loss": 0.3607, + "step": 33724 + }, + { + "epoch": 18.84078212290503, + "grad_norm": 0.44715383648872375, + "learning_rate": 5.854341736694678e-05, + "loss": 0.5378, + "step": 33725 + }, + { + "epoch": 18.841340782122906, + "grad_norm": 0.5906356573104858, + "learning_rate": 5.851540616246499e-05, + "loss": 0.3927, + "step": 33726 + }, + { + "epoch": 18.841899441340782, + "grad_norm": 1.4931284189224243, + "learning_rate": 5.848739495798319e-05, + "loss": 0.4052, + "step": 33727 + }, + { + "epoch": 18.84245810055866, + "grad_norm": 0.6978968977928162, + "learning_rate": 5.8459383753501406e-05, + "loss": 0.3972, + "step": 33728 + }, + { + "epoch": 18.843016759776535, + "grad_norm": 0.4317839741706848, + "learning_rate": 5.843137254901961e-05, + "loss": 0.3438, + "step": 33729 + }, + { + "epoch": 18.843575418994412, + "grad_norm": 0.481693834066391, + "learning_rate": 5.840336134453782e-05, + "loss": 0.4343, + "step": 33730 + }, + { + "epoch": 18.844134078212292, + "grad_norm": 0.4739903211593628, + "learning_rate": 5.8375350140056024e-05, + "loss": 0.4162, + "step": 33731 + }, + { + "epoch": 18.84469273743017, + "grad_norm": 0.37906476855278015, + "learning_rate": 5.834733893557423e-05, + "loss": 0.4502, + "step": 33732 + }, + { + "epoch": 18.845251396648045, + "grad_norm": 0.48904070258140564, + "learning_rate": 5.8319327731092436e-05, + "loss": 0.4318, + "step": 33733 + }, + { + "epoch": 18.845810055865922, + "grad_norm": 0.4569445848464966, + "learning_rate": 5.829131652661065e-05, + "loss": 0.4916, + "step": 33734 + }, + { + "epoch": 18.8463687150838, + "grad_norm": 0.3764616847038269, + "learning_rate": 5.8263305322128855e-05, + "loss": 0.4485, + "step": 33735 + }, + { + "epoch": 18.846927374301675, + "grad_norm": 0.5238183736801147, + "learning_rate": 5.823529411764706e-05, + "loss": 0.4701, + "step": 33736 + }, + { + "epoch": 18.84748603351955, + "grad_norm": 0.46046218276023865, + "learning_rate": 5.8207282913165266e-05, + "loss": 0.5287, + "step": 33737 + }, + { + "epoch": 18.84804469273743, + "grad_norm": 0.4134523868560791, + "learning_rate": 5.817927170868347e-05, + "loss": 0.3618, + "step": 33738 + }, + { + "epoch": 18.84860335195531, + "grad_norm": 0.3827623128890991, + "learning_rate": 5.8151260504201685e-05, + "loss": 0.3278, + "step": 33739 + }, + { + "epoch": 18.849162011173185, + "grad_norm": 0.5377877354621887, + "learning_rate": 5.812324929971989e-05, + "loss": 0.53, + "step": 33740 + }, + { + "epoch": 18.84972067039106, + "grad_norm": 1.424222707748413, + "learning_rate": 5.80952380952381e-05, + "loss": 0.4046, + "step": 33741 + }, + { + "epoch": 18.850279329608938, + "grad_norm": 3.638166666030884, + "learning_rate": 5.80672268907563e-05, + "loss": 0.4394, + "step": 33742 + }, + { + "epoch": 18.850837988826814, + "grad_norm": 0.4418627619743347, + "learning_rate": 5.803921568627451e-05, + "loss": 0.4245, + "step": 33743 + }, + { + "epoch": 18.85139664804469, + "grad_norm": 0.34589412808418274, + "learning_rate": 5.801120448179272e-05, + "loss": 0.3761, + "step": 33744 + }, + { + "epoch": 18.85195530726257, + "grad_norm": 0.4258033037185669, + "learning_rate": 5.798319327731093e-05, + "loss": 0.4272, + "step": 33745 + }, + { + "epoch": 18.852513966480448, + "grad_norm": 0.4996253550052643, + "learning_rate": 5.7955182072829134e-05, + "loss": 0.4417, + "step": 33746 + }, + { + "epoch": 18.853072625698324, + "grad_norm": 0.3439030051231384, + "learning_rate": 5.792717086834734e-05, + "loss": 0.3546, + "step": 33747 + }, + { + "epoch": 18.8536312849162, + "grad_norm": 0.4623798429965973, + "learning_rate": 5.7899159663865546e-05, + "loss": 0.4037, + "step": 33748 + }, + { + "epoch": 18.854189944134077, + "grad_norm": 0.6681358814239502, + "learning_rate": 5.787114845938375e-05, + "loss": 0.4476, + "step": 33749 + }, + { + "epoch": 18.854748603351954, + "grad_norm": 0.5658302903175354, + "learning_rate": 5.7843137254901965e-05, + "loss": 0.364, + "step": 33750 + }, + { + "epoch": 18.85530726256983, + "grad_norm": 0.8510394096374512, + "learning_rate": 5.781512605042017e-05, + "loss": 0.3053, + "step": 33751 + }, + { + "epoch": 18.85586592178771, + "grad_norm": 0.8178472518920898, + "learning_rate": 5.778711484593838e-05, + "loss": 0.4575, + "step": 33752 + }, + { + "epoch": 18.856424581005587, + "grad_norm": 0.4230727255344391, + "learning_rate": 5.775910364145658e-05, + "loss": 0.5357, + "step": 33753 + }, + { + "epoch": 18.856983240223464, + "grad_norm": 1.4039816856384277, + "learning_rate": 5.773109243697479e-05, + "loss": 0.3991, + "step": 33754 + }, + { + "epoch": 18.85754189944134, + "grad_norm": 0.48048731684684753, + "learning_rate": 5.7703081232493e-05, + "loss": 0.4447, + "step": 33755 + }, + { + "epoch": 18.858100558659217, + "grad_norm": 0.5927854180335999, + "learning_rate": 5.767507002801121e-05, + "loss": 0.328, + "step": 33756 + }, + { + "epoch": 18.858659217877094, + "grad_norm": 0.4228411316871643, + "learning_rate": 5.7647058823529413e-05, + "loss": 0.3425, + "step": 33757 + }, + { + "epoch": 18.859217877094974, + "grad_norm": 0.622751772403717, + "learning_rate": 5.761904761904762e-05, + "loss": 0.3729, + "step": 33758 + }, + { + "epoch": 18.85977653631285, + "grad_norm": 1.3569228649139404, + "learning_rate": 5.7591036414565825e-05, + "loss": 0.4215, + "step": 33759 + }, + { + "epoch": 18.860335195530727, + "grad_norm": 1.6811857223510742, + "learning_rate": 5.756302521008403e-05, + "loss": 0.4857, + "step": 33760 + }, + { + "epoch": 18.860893854748603, + "grad_norm": 0.5531189441680908, + "learning_rate": 5.7535014005602244e-05, + "loss": 0.3257, + "step": 33761 + }, + { + "epoch": 18.86145251396648, + "grad_norm": 0.4516947865486145, + "learning_rate": 5.750700280112045e-05, + "loss": 0.2824, + "step": 33762 + }, + { + "epoch": 18.862011173184356, + "grad_norm": 0.5208219289779663, + "learning_rate": 5.7478991596638656e-05, + "loss": 0.4062, + "step": 33763 + }, + { + "epoch": 18.862569832402233, + "grad_norm": 0.5304266214370728, + "learning_rate": 5.745098039215686e-05, + "loss": 0.4474, + "step": 33764 + }, + { + "epoch": 18.863128491620113, + "grad_norm": 0.3732846975326538, + "learning_rate": 5.742296918767507e-05, + "loss": 0.441, + "step": 33765 + }, + { + "epoch": 18.86368715083799, + "grad_norm": 0.6364693641662598, + "learning_rate": 5.739495798319328e-05, + "loss": 0.401, + "step": 33766 + }, + { + "epoch": 18.864245810055866, + "grad_norm": 0.36977869272232056, + "learning_rate": 5.736694677871149e-05, + "loss": 0.3707, + "step": 33767 + }, + { + "epoch": 18.864804469273743, + "grad_norm": 0.3811185657978058, + "learning_rate": 5.733893557422969e-05, + "loss": 0.3291, + "step": 33768 + }, + { + "epoch": 18.86536312849162, + "grad_norm": 0.40819332003593445, + "learning_rate": 5.73109243697479e-05, + "loss": 0.3453, + "step": 33769 + }, + { + "epoch": 18.865921787709496, + "grad_norm": 0.6222102642059326, + "learning_rate": 5.7282913165266105e-05, + "loss": 0.5234, + "step": 33770 + }, + { + "epoch": 18.866480446927373, + "grad_norm": Infinity, + "learning_rate": 5.7282913165266105e-05, + "loss": 0.4241, + "step": 33771 + }, + { + "epoch": 18.867039106145253, + "grad_norm": 0.46164900064468384, + "learning_rate": 5.725490196078432e-05, + "loss": 0.4707, + "step": 33772 + }, + { + "epoch": 18.86759776536313, + "grad_norm": 1.1930509805679321, + "learning_rate": 5.7226890756302524e-05, + "loss": 0.4401, + "step": 33773 + }, + { + "epoch": 18.868156424581006, + "grad_norm": 0.49368709325790405, + "learning_rate": 5.719887955182073e-05, + "loss": 0.4313, + "step": 33774 + }, + { + "epoch": 18.868715083798882, + "grad_norm": 0.4615408480167389, + "learning_rate": 5.7170868347338936e-05, + "loss": 0.4023, + "step": 33775 + }, + { + "epoch": 18.86927374301676, + "grad_norm": 0.6049171686172485, + "learning_rate": 5.714285714285714e-05, + "loss": 0.3122, + "step": 33776 + }, + { + "epoch": 18.869832402234636, + "grad_norm": 0.4467255175113678, + "learning_rate": 5.711484593837535e-05, + "loss": 0.413, + "step": 33777 + }, + { + "epoch": 18.870391061452516, + "grad_norm": 1.2062095403671265, + "learning_rate": 5.708683473389356e-05, + "loss": 0.2946, + "step": 33778 + }, + { + "epoch": 18.870949720670392, + "grad_norm": 0.5447296500205994, + "learning_rate": 5.7058823529411766e-05, + "loss": 0.377, + "step": 33779 + }, + { + "epoch": 18.87150837988827, + "grad_norm": 0.6711100339889526, + "learning_rate": 5.703081232492997e-05, + "loss": 0.3874, + "step": 33780 + }, + { + "epoch": 18.872067039106145, + "grad_norm": 0.5505905151367188, + "learning_rate": 5.700280112044818e-05, + "loss": 0.4287, + "step": 33781 + }, + { + "epoch": 18.872625698324022, + "grad_norm": 0.43074774742126465, + "learning_rate": 5.6974789915966384e-05, + "loss": 0.3675, + "step": 33782 + }, + { + "epoch": 18.8731843575419, + "grad_norm": 0.9471970200538635, + "learning_rate": 5.69467787114846e-05, + "loss": 0.6955, + "step": 33783 + }, + { + "epoch": 18.873743016759775, + "grad_norm": 4.350872993469238, + "learning_rate": 5.69187675070028e-05, + "loss": 0.3992, + "step": 33784 + }, + { + "epoch": 18.874301675977655, + "grad_norm": 0.943796694278717, + "learning_rate": 5.689075630252101e-05, + "loss": 0.3821, + "step": 33785 + }, + { + "epoch": 18.87486033519553, + "grad_norm": 0.4437604248523712, + "learning_rate": 5.6862745098039215e-05, + "loss": 0.4595, + "step": 33786 + }, + { + "epoch": 18.87541899441341, + "grad_norm": 0.34584903717041016, + "learning_rate": 5.683473389355742e-05, + "loss": 0.3689, + "step": 33787 + }, + { + "epoch": 18.875977653631285, + "grad_norm": 0.5171068906784058, + "learning_rate": 5.680672268907563e-05, + "loss": 0.2757, + "step": 33788 + }, + { + "epoch": 18.87653631284916, + "grad_norm": 0.4553881585597992, + "learning_rate": 5.677871148459384e-05, + "loss": 0.3173, + "step": 33789 + }, + { + "epoch": 18.877094972067038, + "grad_norm": 0.5202228426933289, + "learning_rate": 5.6750700280112046e-05, + "loss": 0.4997, + "step": 33790 + }, + { + "epoch": 18.877653631284915, + "grad_norm": 31.49077033996582, + "learning_rate": 5.672268907563025e-05, + "loss": 0.4835, + "step": 33791 + }, + { + "epoch": 18.878212290502795, + "grad_norm": 0.6451875567436218, + "learning_rate": 5.669467787114846e-05, + "loss": 0.3518, + "step": 33792 + }, + { + "epoch": 18.87877094972067, + "grad_norm": 1.2403446435928345, + "learning_rate": 5.6666666666666664e-05, + "loss": 0.4217, + "step": 33793 + }, + { + "epoch": 18.879329608938548, + "grad_norm": 0.5851247310638428, + "learning_rate": 5.663865546218488e-05, + "loss": 0.3996, + "step": 33794 + }, + { + "epoch": 18.879888268156424, + "grad_norm": 1.5194450616836548, + "learning_rate": 5.661064425770308e-05, + "loss": 0.3895, + "step": 33795 + }, + { + "epoch": 18.8804469273743, + "grad_norm": 1.0575073957443237, + "learning_rate": 5.658263305322129e-05, + "loss": 0.3865, + "step": 33796 + }, + { + "epoch": 18.881005586592178, + "grad_norm": 0.6892126798629761, + "learning_rate": 5.6554621848739495e-05, + "loss": 0.3493, + "step": 33797 + }, + { + "epoch": 18.881564245810054, + "grad_norm": 0.5927559733390808, + "learning_rate": 5.65266106442577e-05, + "loss": 0.3257, + "step": 33798 + }, + { + "epoch": 18.882122905027934, + "grad_norm": 0.40630051493644714, + "learning_rate": 5.6498599439775913e-05, + "loss": 0.4255, + "step": 33799 + }, + { + "epoch": 18.88268156424581, + "grad_norm": 0.9027992486953735, + "learning_rate": 5.647058823529412e-05, + "loss": 0.3708, + "step": 33800 + }, + { + "epoch": 18.883240223463687, + "grad_norm": 0.46567243337631226, + "learning_rate": 5.6442577030812325e-05, + "loss": 0.399, + "step": 33801 + }, + { + "epoch": 18.883798882681564, + "grad_norm": 0.4221014082431793, + "learning_rate": 5.641456582633053e-05, + "loss": 0.3562, + "step": 33802 + }, + { + "epoch": 18.88435754189944, + "grad_norm": 0.7294585108757019, + "learning_rate": 5.638655462184874e-05, + "loss": 0.5107, + "step": 33803 + }, + { + "epoch": 18.884916201117317, + "grad_norm": 0.5214829444885254, + "learning_rate": 5.6358543417366943e-05, + "loss": 0.3887, + "step": 33804 + }, + { + "epoch": 18.885474860335197, + "grad_norm": 0.32403364777565, + "learning_rate": 5.6330532212885156e-05, + "loss": 0.4068, + "step": 33805 + }, + { + "epoch": 18.886033519553074, + "grad_norm": 0.37694549560546875, + "learning_rate": 5.630252100840336e-05, + "loss": 0.4088, + "step": 33806 + }, + { + "epoch": 18.88659217877095, + "grad_norm": 0.47963759303092957, + "learning_rate": 5.627450980392157e-05, + "loss": 0.4369, + "step": 33807 + }, + { + "epoch": 18.887150837988827, + "grad_norm": 1.2743427753448486, + "learning_rate": 5.6246498599439774e-05, + "loss": 0.4282, + "step": 33808 + }, + { + "epoch": 18.887709497206703, + "grad_norm": 0.5330156683921814, + "learning_rate": 5.621848739495798e-05, + "loss": 0.4474, + "step": 33809 + }, + { + "epoch": 18.88826815642458, + "grad_norm": 0.4144650399684906, + "learning_rate": 5.619047619047619e-05, + "loss": 0.3894, + "step": 33810 + }, + { + "epoch": 18.888826815642457, + "grad_norm": 0.4847582280635834, + "learning_rate": 5.61624649859944e-05, + "loss": 0.5132, + "step": 33811 + }, + { + "epoch": 18.889385474860337, + "grad_norm": 4.575112342834473, + "learning_rate": 5.6134453781512605e-05, + "loss": 0.6483, + "step": 33812 + }, + { + "epoch": 18.889944134078213, + "grad_norm": 1.3570356369018555, + "learning_rate": 5.610644257703081e-05, + "loss": 0.5871, + "step": 33813 + }, + { + "epoch": 18.89050279329609, + "grad_norm": 0.3947475850582123, + "learning_rate": 5.607843137254902e-05, + "loss": 0.4254, + "step": 33814 + }, + { + "epoch": 18.891061452513966, + "grad_norm": 0.4775826036930084, + "learning_rate": 5.605042016806722e-05, + "loss": 0.4923, + "step": 33815 + }, + { + "epoch": 18.891620111731843, + "grad_norm": 0.5627162456512451, + "learning_rate": 5.6022408963585436e-05, + "loss": 0.4182, + "step": 33816 + }, + { + "epoch": 18.89217877094972, + "grad_norm": 0.9713678956031799, + "learning_rate": 5.599439775910364e-05, + "loss": 0.5381, + "step": 33817 + }, + { + "epoch": 18.892737430167596, + "grad_norm": 0.3798504173755646, + "learning_rate": 5.596638655462185e-05, + "loss": 0.3773, + "step": 33818 + }, + { + "epoch": 18.893296089385476, + "grad_norm": 1.123907208442688, + "learning_rate": 5.5938375350140054e-05, + "loss": 0.6038, + "step": 33819 + }, + { + "epoch": 18.893854748603353, + "grad_norm": 0.29842960834503174, + "learning_rate": 5.591036414565826e-05, + "loss": 0.2406, + "step": 33820 + }, + { + "epoch": 18.89441340782123, + "grad_norm": 2.686370611190796, + "learning_rate": 5.588235294117647e-05, + "loss": 0.3811, + "step": 33821 + }, + { + "epoch": 18.894972067039106, + "grad_norm": 0.34505489468574524, + "learning_rate": 5.585434173669468e-05, + "loss": 0.3327, + "step": 33822 + }, + { + "epoch": 18.895530726256982, + "grad_norm": 0.3997093439102173, + "learning_rate": 5.5826330532212884e-05, + "loss": 0.4684, + "step": 33823 + }, + { + "epoch": 18.89608938547486, + "grad_norm": 1.4778010845184326, + "learning_rate": 5.579831932773109e-05, + "loss": 0.3432, + "step": 33824 + }, + { + "epoch": 18.89664804469274, + "grad_norm": 0.4771747291088104, + "learning_rate": 5.5770308123249296e-05, + "loss": 0.3939, + "step": 33825 + }, + { + "epoch": 18.897206703910616, + "grad_norm": 0.5368199944496155, + "learning_rate": 5.574229691876751e-05, + "loss": 0.3633, + "step": 33826 + }, + { + "epoch": 18.897765363128492, + "grad_norm": 0.8267648816108704, + "learning_rate": 5.5714285714285715e-05, + "loss": 0.5352, + "step": 33827 + }, + { + "epoch": 18.89832402234637, + "grad_norm": 0.4943237006664276, + "learning_rate": 5.568627450980392e-05, + "loss": 0.5406, + "step": 33828 + }, + { + "epoch": 18.898882681564245, + "grad_norm": 0.882495641708374, + "learning_rate": 5.565826330532213e-05, + "loss": 0.4723, + "step": 33829 + }, + { + "epoch": 18.899441340782122, + "grad_norm": 0.39665326476097107, + "learning_rate": 5.563025210084033e-05, + "loss": 0.4804, + "step": 33830 + }, + { + "epoch": 18.9, + "grad_norm": 0.8868411183357239, + "learning_rate": 5.560224089635854e-05, + "loss": 0.461, + "step": 33831 + }, + { + "epoch": 18.90055865921788, + "grad_norm": 0.4521852433681488, + "learning_rate": 5.557422969187675e-05, + "loss": 0.4124, + "step": 33832 + }, + { + "epoch": 18.901117318435755, + "grad_norm": 0.7548196315765381, + "learning_rate": 5.554621848739496e-05, + "loss": 0.3007, + "step": 33833 + }, + { + "epoch": 18.901675977653632, + "grad_norm": 0.5111404657363892, + "learning_rate": 5.5518207282913164e-05, + "loss": 0.5198, + "step": 33834 + }, + { + "epoch": 18.90223463687151, + "grad_norm": 0.49965816736221313, + "learning_rate": 5.549019607843137e-05, + "loss": 0.4763, + "step": 33835 + }, + { + "epoch": 18.902793296089385, + "grad_norm": 0.6660752892494202, + "learning_rate": 5.5462184873949576e-05, + "loss": 0.4514, + "step": 33836 + }, + { + "epoch": 18.90335195530726, + "grad_norm": 1.3603583574295044, + "learning_rate": 5.543417366946779e-05, + "loss": 0.435, + "step": 33837 + }, + { + "epoch": 18.903910614525138, + "grad_norm": 1.1103458404541016, + "learning_rate": 5.5406162464985995e-05, + "loss": 0.5685, + "step": 33838 + }, + { + "epoch": 18.904469273743018, + "grad_norm": 0.4276740849018097, + "learning_rate": 5.53781512605042e-05, + "loss": 0.4152, + "step": 33839 + }, + { + "epoch": 18.905027932960895, + "grad_norm": 0.3635987937450409, + "learning_rate": 5.535014005602241e-05, + "loss": 0.3539, + "step": 33840 + }, + { + "epoch": 18.90558659217877, + "grad_norm": 0.45916834473609924, + "learning_rate": 5.532212885154061e-05, + "loss": 0.403, + "step": 33841 + }, + { + "epoch": 18.906145251396648, + "grad_norm": 0.6639769673347473, + "learning_rate": 5.5294117647058825e-05, + "loss": 0.4668, + "step": 33842 + }, + { + "epoch": 18.906703910614524, + "grad_norm": 0.45911604166030884, + "learning_rate": 5.526610644257703e-05, + "loss": 0.4328, + "step": 33843 + }, + { + "epoch": 18.9072625698324, + "grad_norm": 0.7857806086540222, + "learning_rate": 5.523809523809524e-05, + "loss": 0.417, + "step": 33844 + }, + { + "epoch": 18.907821229050278, + "grad_norm": 0.7776517271995544, + "learning_rate": 5.5210084033613443e-05, + "loss": 0.3677, + "step": 33845 + }, + { + "epoch": 18.908379888268158, + "grad_norm": 0.4270723760128021, + "learning_rate": 5.518207282913165e-05, + "loss": 0.3843, + "step": 33846 + }, + { + "epoch": 18.908938547486034, + "grad_norm": 1.4539105892181396, + "learning_rate": 5.5154061624649855e-05, + "loss": 0.3672, + "step": 33847 + }, + { + "epoch": 18.90949720670391, + "grad_norm": 0.41392868757247925, + "learning_rate": 5.512605042016807e-05, + "loss": 0.3303, + "step": 33848 + }, + { + "epoch": 18.910055865921787, + "grad_norm": 0.5647719502449036, + "learning_rate": 5.5098039215686274e-05, + "loss": 0.553, + "step": 33849 + }, + { + "epoch": 18.910614525139664, + "grad_norm": 0.3424026370048523, + "learning_rate": 5.507002801120448e-05, + "loss": 0.3443, + "step": 33850 + }, + { + "epoch": 18.91117318435754, + "grad_norm": 0.5082478523254395, + "learning_rate": 5.5042016806722686e-05, + "loss": 0.5536, + "step": 33851 + }, + { + "epoch": 18.91173184357542, + "grad_norm": 0.4064604938030243, + "learning_rate": 5.501400560224089e-05, + "loss": 0.3801, + "step": 33852 + }, + { + "epoch": 18.912290502793297, + "grad_norm": 1.052262544631958, + "learning_rate": 5.4985994397759105e-05, + "loss": 0.4567, + "step": 33853 + }, + { + "epoch": 18.912849162011174, + "grad_norm": 0.5106754302978516, + "learning_rate": 5.495798319327731e-05, + "loss": 0.4851, + "step": 33854 + }, + { + "epoch": 18.91340782122905, + "grad_norm": 0.436939537525177, + "learning_rate": 5.492997198879552e-05, + "loss": 0.4729, + "step": 33855 + }, + { + "epoch": 18.913966480446927, + "grad_norm": 0.5108621716499329, + "learning_rate": 5.490196078431372e-05, + "loss": 0.3625, + "step": 33856 + }, + { + "epoch": 18.914525139664804, + "grad_norm": 0.8846465945243835, + "learning_rate": 5.487394957983193e-05, + "loss": 0.3987, + "step": 33857 + }, + { + "epoch": 18.91508379888268, + "grad_norm": 3.5128445625305176, + "learning_rate": 5.4845938375350135e-05, + "loss": 0.5637, + "step": 33858 + }, + { + "epoch": 18.91564245810056, + "grad_norm": 3.4166648387908936, + "learning_rate": 5.481792717086835e-05, + "loss": 0.5048, + "step": 33859 + }, + { + "epoch": 18.916201117318437, + "grad_norm": 0.45042723417282104, + "learning_rate": 5.4789915966386554e-05, + "loss": 0.4685, + "step": 33860 + }, + { + "epoch": 18.916759776536313, + "grad_norm": 4.460443496704102, + "learning_rate": 5.476190476190476e-05, + "loss": 0.3877, + "step": 33861 + }, + { + "epoch": 18.91731843575419, + "grad_norm": 0.4218735098838806, + "learning_rate": 5.4733893557422966e-05, + "loss": 0.4458, + "step": 33862 + }, + { + "epoch": 18.917877094972066, + "grad_norm": 0.3589506447315216, + "learning_rate": 5.470588235294117e-05, + "loss": 0.3439, + "step": 33863 + }, + { + "epoch": 18.918435754189943, + "grad_norm": 0.7866973280906677, + "learning_rate": 5.4677871148459384e-05, + "loss": 0.4355, + "step": 33864 + }, + { + "epoch": 18.91899441340782, + "grad_norm": 1.1428085565567017, + "learning_rate": 5.464985994397759e-05, + "loss": 0.3687, + "step": 33865 + }, + { + "epoch": 18.9195530726257, + "grad_norm": 11.617588996887207, + "learning_rate": 5.4621848739495796e-05, + "loss": 0.3951, + "step": 33866 + }, + { + "epoch": 18.920111731843576, + "grad_norm": 0.8480934500694275, + "learning_rate": 5.4593837535014e-05, + "loss": 0.4449, + "step": 33867 + }, + { + "epoch": 18.920670391061453, + "grad_norm": 0.7735621929168701, + "learning_rate": 5.456582633053221e-05, + "loss": 0.6169, + "step": 33868 + }, + { + "epoch": 18.92122905027933, + "grad_norm": 0.7786951661109924, + "learning_rate": 5.453781512605043e-05, + "loss": 0.4327, + "step": 33869 + }, + { + "epoch": 18.921787709497206, + "grad_norm": 0.3648815453052521, + "learning_rate": 5.4509803921568634e-05, + "loss": 0.2842, + "step": 33870 + }, + { + "epoch": 18.922346368715083, + "grad_norm": 0.6461501717567444, + "learning_rate": 5.448179271708684e-05, + "loss": 0.4989, + "step": 33871 + }, + { + "epoch": 18.922905027932963, + "grad_norm": 0.37036168575286865, + "learning_rate": 5.4453781512605046e-05, + "loss": 0.3437, + "step": 33872 + }, + { + "epoch": 18.92346368715084, + "grad_norm": 0.6820854544639587, + "learning_rate": 5.442577030812325e-05, + "loss": 0.3563, + "step": 33873 + }, + { + "epoch": 18.924022346368716, + "grad_norm": 2.24249267578125, + "learning_rate": 5.439775910364146e-05, + "loss": 0.3787, + "step": 33874 + }, + { + "epoch": 18.924581005586592, + "grad_norm": 0.39428409934043884, + "learning_rate": 5.436974789915967e-05, + "loss": 0.346, + "step": 33875 + }, + { + "epoch": 18.92513966480447, + "grad_norm": 0.47936001420021057, + "learning_rate": 5.434173669467788e-05, + "loss": 0.4097, + "step": 33876 + }, + { + "epoch": 18.925698324022346, + "grad_norm": 0.49858567118644714, + "learning_rate": 5.431372549019608e-05, + "loss": 0.3706, + "step": 33877 + }, + { + "epoch": 18.926256983240222, + "grad_norm": 0.5014204382896423, + "learning_rate": 5.428571428571429e-05, + "loss": 0.3831, + "step": 33878 + }, + { + "epoch": 18.926815642458102, + "grad_norm": 0.5206138491630554, + "learning_rate": 5.4257703081232495e-05, + "loss": 0.4625, + "step": 33879 + }, + { + "epoch": 18.92737430167598, + "grad_norm": 1.844326138496399, + "learning_rate": 5.422969187675071e-05, + "loss": 0.4055, + "step": 33880 + }, + { + "epoch": 18.927932960893855, + "grad_norm": 0.5907692313194275, + "learning_rate": 5.4201680672268913e-05, + "loss": 0.4577, + "step": 33881 + }, + { + "epoch": 18.928491620111732, + "grad_norm": 0.4953712224960327, + "learning_rate": 5.417366946778712e-05, + "loss": 0.3581, + "step": 33882 + }, + { + "epoch": 18.92905027932961, + "grad_norm": 0.3316100537776947, + "learning_rate": 5.4145658263305325e-05, + "loss": 0.3284, + "step": 33883 + }, + { + "epoch": 18.929608938547485, + "grad_norm": 0.48553207516670227, + "learning_rate": 5.411764705882353e-05, + "loss": 0.3742, + "step": 33884 + }, + { + "epoch": 18.93016759776536, + "grad_norm": 0.34783920645713806, + "learning_rate": 5.408963585434174e-05, + "loss": 0.2538, + "step": 33885 + }, + { + "epoch": 18.93072625698324, + "grad_norm": 0.41213372349739075, + "learning_rate": 5.406162464985995e-05, + "loss": 0.3792, + "step": 33886 + }, + { + "epoch": 18.93128491620112, + "grad_norm": 0.5908197164535522, + "learning_rate": 5.4033613445378156e-05, + "loss": 0.3949, + "step": 33887 + }, + { + "epoch": 18.931843575418995, + "grad_norm": 0.4517349302768707, + "learning_rate": 5.400560224089636e-05, + "loss": 0.4406, + "step": 33888 + }, + { + "epoch": 18.93240223463687, + "grad_norm": 0.930027186870575, + "learning_rate": 5.397759103641457e-05, + "loss": 0.3281, + "step": 33889 + }, + { + "epoch": 18.932960893854748, + "grad_norm": 0.551719069480896, + "learning_rate": 5.3949579831932774e-05, + "loss": 0.3581, + "step": 33890 + }, + { + "epoch": 18.933519553072625, + "grad_norm": 0.39376363158226013, + "learning_rate": 5.392156862745099e-05, + "loss": 0.3685, + "step": 33891 + }, + { + "epoch": 18.9340782122905, + "grad_norm": 1.0118646621704102, + "learning_rate": 5.389355742296919e-05, + "loss": 0.4237, + "step": 33892 + }, + { + "epoch": 18.93463687150838, + "grad_norm": 0.33368727564811707, + "learning_rate": 5.38655462184874e-05, + "loss": 0.2681, + "step": 33893 + }, + { + "epoch": 18.935195530726258, + "grad_norm": 0.3958822786808014, + "learning_rate": 5.3837535014005605e-05, + "loss": 0.422, + "step": 33894 + }, + { + "epoch": 18.935754189944134, + "grad_norm": 0.4539048373699188, + "learning_rate": 5.380952380952381e-05, + "loss": 0.4047, + "step": 33895 + }, + { + "epoch": 18.93631284916201, + "grad_norm": 0.5042299032211304, + "learning_rate": 5.3781512605042024e-05, + "loss": 0.3592, + "step": 33896 + }, + { + "epoch": 18.936871508379888, + "grad_norm": 0.39602118730545044, + "learning_rate": 5.375350140056023e-05, + "loss": 0.3712, + "step": 33897 + }, + { + "epoch": 18.937430167597764, + "grad_norm": 0.36842110753059387, + "learning_rate": 5.3725490196078436e-05, + "loss": 0.3887, + "step": 33898 + }, + { + "epoch": 18.93798882681564, + "grad_norm": 0.4202346205711365, + "learning_rate": 5.369747899159664e-05, + "loss": 0.455, + "step": 33899 + }, + { + "epoch": 18.93854748603352, + "grad_norm": 0.5464991927146912, + "learning_rate": 5.366946778711485e-05, + "loss": 0.5855, + "step": 33900 + }, + { + "epoch": 18.939106145251397, + "grad_norm": 0.89593905210495, + "learning_rate": 5.3641456582633054e-05, + "loss": 0.433, + "step": 33901 + }, + { + "epoch": 18.939664804469274, + "grad_norm": 0.3702648878097534, + "learning_rate": 5.3613445378151266e-05, + "loss": 0.4813, + "step": 33902 + }, + { + "epoch": 18.94022346368715, + "grad_norm": 0.5631015300750732, + "learning_rate": 5.358543417366947e-05, + "loss": 0.2789, + "step": 33903 + }, + { + "epoch": 18.940782122905027, + "grad_norm": 0.41592875123023987, + "learning_rate": 5.355742296918768e-05, + "loss": 0.4394, + "step": 33904 + }, + { + "epoch": 18.941340782122904, + "grad_norm": 0.35282185673713684, + "learning_rate": 5.3529411764705884e-05, + "loss": 0.3806, + "step": 33905 + }, + { + "epoch": 18.941899441340784, + "grad_norm": 0.7095988392829895, + "learning_rate": 5.350140056022409e-05, + "loss": 0.5656, + "step": 33906 + }, + { + "epoch": 18.94245810055866, + "grad_norm": 0.34797996282577515, + "learning_rate": 5.34733893557423e-05, + "loss": 0.336, + "step": 33907 + }, + { + "epoch": 18.943016759776537, + "grad_norm": 0.41780421137809753, + "learning_rate": 5.344537815126051e-05, + "loss": 0.4263, + "step": 33908 + }, + { + "epoch": 18.943575418994413, + "grad_norm": 0.3015372157096863, + "learning_rate": 5.3417366946778715e-05, + "loss": 0.355, + "step": 33909 + }, + { + "epoch": 18.94413407821229, + "grad_norm": 0.4017311930656433, + "learning_rate": 5.338935574229692e-05, + "loss": 0.3908, + "step": 33910 + }, + { + "epoch": 18.944692737430167, + "grad_norm": 0.4889945983886719, + "learning_rate": 5.336134453781513e-05, + "loss": 0.5316, + "step": 33911 + }, + { + "epoch": 18.945251396648043, + "grad_norm": 0.4016016721725464, + "learning_rate": 5.333333333333334e-05, + "loss": 0.4654, + "step": 33912 + }, + { + "epoch": 18.945810055865923, + "grad_norm": 0.5134336352348328, + "learning_rate": 5.3305322128851546e-05, + "loss": 0.347, + "step": 33913 + }, + { + "epoch": 18.9463687150838, + "grad_norm": 0.8897731900215149, + "learning_rate": 5.327731092436975e-05, + "loss": 0.4025, + "step": 33914 + }, + { + "epoch": 18.946927374301676, + "grad_norm": 0.7334646582603455, + "learning_rate": 5.324929971988796e-05, + "loss": 0.3937, + "step": 33915 + }, + { + "epoch": 18.947486033519553, + "grad_norm": 0.48601144552230835, + "learning_rate": 5.3221288515406164e-05, + "loss": 0.47, + "step": 33916 + }, + { + "epoch": 18.94804469273743, + "grad_norm": 0.48712092638015747, + "learning_rate": 5.319327731092437e-05, + "loss": 0.3804, + "step": 33917 + }, + { + "epoch": 18.948603351955306, + "grad_norm": 0.6057708263397217, + "learning_rate": 5.316526610644258e-05, + "loss": 0.3118, + "step": 33918 + }, + { + "epoch": 18.949162011173183, + "grad_norm": 1.0692092180252075, + "learning_rate": 5.313725490196079e-05, + "loss": 0.3982, + "step": 33919 + }, + { + "epoch": 18.949720670391063, + "grad_norm": 2.1109750270843506, + "learning_rate": 5.3109243697478995e-05, + "loss": 0.4418, + "step": 33920 + }, + { + "epoch": 18.95027932960894, + "grad_norm": 0.3324871063232422, + "learning_rate": 5.30812324929972e-05, + "loss": 0.3801, + "step": 33921 + }, + { + "epoch": 18.950837988826816, + "grad_norm": 0.4373445212841034, + "learning_rate": 5.305322128851541e-05, + "loss": 0.3976, + "step": 33922 + }, + { + "epoch": 18.951396648044692, + "grad_norm": 0.7524389028549194, + "learning_rate": 5.302521008403362e-05, + "loss": 0.2836, + "step": 33923 + }, + { + "epoch": 18.95195530726257, + "grad_norm": 1.6838334798812866, + "learning_rate": 5.2997198879551825e-05, + "loss": 0.3771, + "step": 33924 + }, + { + "epoch": 18.952513966480446, + "grad_norm": 0.896115779876709, + "learning_rate": 5.296918767507003e-05, + "loss": 0.292, + "step": 33925 + }, + { + "epoch": 18.953072625698326, + "grad_norm": 0.43289652466773987, + "learning_rate": 5.294117647058824e-05, + "loss": 0.4667, + "step": 33926 + }, + { + "epoch": 18.953631284916202, + "grad_norm": 0.5212640762329102, + "learning_rate": 5.2913165266106443e-05, + "loss": 0.3845, + "step": 33927 + }, + { + "epoch": 18.95418994413408, + "grad_norm": 0.6807703375816345, + "learning_rate": 5.288515406162465e-05, + "loss": 0.4292, + "step": 33928 + }, + { + "epoch": 18.954748603351955, + "grad_norm": 0.4425789713859558, + "learning_rate": 5.285714285714286e-05, + "loss": 0.4301, + "step": 33929 + }, + { + "epoch": 18.955307262569832, + "grad_norm": 0.41546064615249634, + "learning_rate": 5.282913165266107e-05, + "loss": 0.4305, + "step": 33930 + }, + { + "epoch": 18.95586592178771, + "grad_norm": 0.43749648332595825, + "learning_rate": 5.2801120448179274e-05, + "loss": 0.4138, + "step": 33931 + }, + { + "epoch": 18.956424581005585, + "grad_norm": 0.6813123822212219, + "learning_rate": 5.277310924369748e-05, + "loss": 0.3651, + "step": 33932 + }, + { + "epoch": 18.956983240223465, + "grad_norm": 0.35313770174980164, + "learning_rate": 5.2745098039215686e-05, + "loss": 0.4507, + "step": 33933 + }, + { + "epoch": 18.957541899441342, + "grad_norm": 0.3603627383708954, + "learning_rate": 5.27170868347339e-05, + "loss": 0.3907, + "step": 33934 + }, + { + "epoch": 18.95810055865922, + "grad_norm": 0.4522898197174072, + "learning_rate": 5.2689075630252105e-05, + "loss": 0.4027, + "step": 33935 + }, + { + "epoch": 18.958659217877095, + "grad_norm": 0.3996206820011139, + "learning_rate": 5.266106442577031e-05, + "loss": 0.389, + "step": 33936 + }, + { + "epoch": 18.95921787709497, + "grad_norm": 0.46490445733070374, + "learning_rate": 5.263305322128852e-05, + "loss": 0.3402, + "step": 33937 + }, + { + "epoch": 18.959776536312848, + "grad_norm": 0.4886170029640198, + "learning_rate": 5.260504201680672e-05, + "loss": 0.4489, + "step": 33938 + }, + { + "epoch": 18.960335195530725, + "grad_norm": 2.693960189819336, + "learning_rate": 5.2577030812324936e-05, + "loss": 0.3774, + "step": 33939 + }, + { + "epoch": 18.960893854748605, + "grad_norm": 0.5858724117279053, + "learning_rate": 5.254901960784314e-05, + "loss": 0.4251, + "step": 33940 + }, + { + "epoch": 18.96145251396648, + "grad_norm": 0.3281695246696472, + "learning_rate": 5.252100840336135e-05, + "loss": 0.362, + "step": 33941 + }, + { + "epoch": 18.962011173184358, + "grad_norm": 1.1966187953948975, + "learning_rate": 5.2492997198879554e-05, + "loss": 0.5748, + "step": 33942 + }, + { + "epoch": 18.962569832402234, + "grad_norm": 0.36663034558296204, + "learning_rate": 5.246498599439776e-05, + "loss": 0.3643, + "step": 33943 + }, + { + "epoch": 18.96312849162011, + "grad_norm": 1.103279709815979, + "learning_rate": 5.2436974789915966e-05, + "loss": 0.3651, + "step": 33944 + }, + { + "epoch": 18.963687150837988, + "grad_norm": 4.124715328216553, + "learning_rate": 5.240896358543418e-05, + "loss": 0.4382, + "step": 33945 + }, + { + "epoch": 18.964245810055864, + "grad_norm": 0.4870852828025818, + "learning_rate": 5.2380952380952384e-05, + "loss": 0.4002, + "step": 33946 + }, + { + "epoch": 18.964804469273744, + "grad_norm": 0.34290871024131775, + "learning_rate": 5.235294117647059e-05, + "loss": 0.4116, + "step": 33947 + }, + { + "epoch": 18.96536312849162, + "grad_norm": 0.3345028758049011, + "learning_rate": 5.2324929971988796e-05, + "loss": 0.3448, + "step": 33948 + }, + { + "epoch": 18.965921787709497, + "grad_norm": 2.3267154693603516, + "learning_rate": 5.2296918767507e-05, + "loss": 0.4233, + "step": 33949 + }, + { + "epoch": 18.966480446927374, + "grad_norm": 0.7566601634025574, + "learning_rate": 5.2268907563025215e-05, + "loss": 0.5048, + "step": 33950 + }, + { + "epoch": 18.96703910614525, + "grad_norm": 0.37007296085357666, + "learning_rate": 5.224089635854342e-05, + "loss": 0.3912, + "step": 33951 + }, + { + "epoch": 18.967597765363127, + "grad_norm": 0.4309180974960327, + "learning_rate": 5.221288515406163e-05, + "loss": 0.3832, + "step": 33952 + }, + { + "epoch": 18.968156424581007, + "grad_norm": 3.102376699447632, + "learning_rate": 5.218487394957983e-05, + "loss": 0.444, + "step": 33953 + }, + { + "epoch": 18.968715083798884, + "grad_norm": 0.588596522808075, + "learning_rate": 5.215686274509804e-05, + "loss": 0.3839, + "step": 33954 + }, + { + "epoch": 18.96927374301676, + "grad_norm": 0.8135894536972046, + "learning_rate": 5.2128851540616245e-05, + "loss": 0.4149, + "step": 33955 + }, + { + "epoch": 18.969832402234637, + "grad_norm": 0.4203833341598511, + "learning_rate": 5.210084033613446e-05, + "loss": 0.3848, + "step": 33956 + }, + { + "epoch": 18.970391061452514, + "grad_norm": 0.7620267868041992, + "learning_rate": 5.2072829131652664e-05, + "loss": 0.4737, + "step": 33957 + }, + { + "epoch": 18.97094972067039, + "grad_norm": 0.5293670892715454, + "learning_rate": 5.204481792717087e-05, + "loss": 0.4346, + "step": 33958 + }, + { + "epoch": 18.971508379888267, + "grad_norm": 0.7390501499176025, + "learning_rate": 5.2016806722689076e-05, + "loss": 0.3172, + "step": 33959 + }, + { + "epoch": 18.972067039106147, + "grad_norm": 0.9070444703102112, + "learning_rate": 5.198879551820728e-05, + "loss": 0.3466, + "step": 33960 + }, + { + "epoch": 18.972625698324023, + "grad_norm": 0.5488793849945068, + "learning_rate": 5.1960784313725495e-05, + "loss": 0.4414, + "step": 33961 + }, + { + "epoch": 18.9731843575419, + "grad_norm": 8.175317764282227, + "learning_rate": 5.19327731092437e-05, + "loss": 0.405, + "step": 33962 + }, + { + "epoch": 18.973743016759776, + "grad_norm": 0.8509916663169861, + "learning_rate": 5.190476190476191e-05, + "loss": 0.3367, + "step": 33963 + }, + { + "epoch": 18.974301675977653, + "grad_norm": 1.420340895652771, + "learning_rate": 5.187675070028011e-05, + "loss": 0.4212, + "step": 33964 + }, + { + "epoch": 18.97486033519553, + "grad_norm": 0.37568145990371704, + "learning_rate": 5.184873949579832e-05, + "loss": 0.4308, + "step": 33965 + }, + { + "epoch": 18.975418994413406, + "grad_norm": 0.45119836926460266, + "learning_rate": 5.182072829131653e-05, + "loss": 0.4295, + "step": 33966 + }, + { + "epoch": 18.975977653631286, + "grad_norm": 0.4286446273326874, + "learning_rate": 5.179271708683474e-05, + "loss": 0.3305, + "step": 33967 + }, + { + "epoch": 18.976536312849163, + "grad_norm": 0.5809363126754761, + "learning_rate": 5.1764705882352943e-05, + "loss": 0.5267, + "step": 33968 + }, + { + "epoch": 18.97709497206704, + "grad_norm": 0.4270780682563782, + "learning_rate": 5.173669467787115e-05, + "loss": 0.3908, + "step": 33969 + }, + { + "epoch": 18.977653631284916, + "grad_norm": 0.6030421853065491, + "learning_rate": 5.1708683473389355e-05, + "loss": 0.4067, + "step": 33970 + }, + { + "epoch": 18.978212290502793, + "grad_norm": 9.251143455505371, + "learning_rate": 5.168067226890756e-05, + "loss": 0.4219, + "step": 33971 + }, + { + "epoch": 18.97877094972067, + "grad_norm": 0.6906352043151855, + "learning_rate": 5.1652661064425774e-05, + "loss": 0.7465, + "step": 33972 + }, + { + "epoch": 18.97932960893855, + "grad_norm": 0.5502415299415588, + "learning_rate": 5.162464985994398e-05, + "loss": 0.4004, + "step": 33973 + }, + { + "epoch": 18.979888268156426, + "grad_norm": 0.47371426224708557, + "learning_rate": 5.1596638655462186e-05, + "loss": 0.4125, + "step": 33974 + }, + { + "epoch": 18.980446927374302, + "grad_norm": 1.191032886505127, + "learning_rate": 5.156862745098039e-05, + "loss": 0.3484, + "step": 33975 + }, + { + "epoch": 18.98100558659218, + "grad_norm": 3.6259195804595947, + "learning_rate": 5.15406162464986e-05, + "loss": 0.3549, + "step": 33976 + }, + { + "epoch": 18.981564245810056, + "grad_norm": 0.401210755109787, + "learning_rate": 5.151260504201681e-05, + "loss": 0.3255, + "step": 33977 + }, + { + "epoch": 18.982122905027932, + "grad_norm": 0.5023775100708008, + "learning_rate": 5.148459383753502e-05, + "loss": 0.5245, + "step": 33978 + }, + { + "epoch": 18.98268156424581, + "grad_norm": 0.3667581081390381, + "learning_rate": 5.145658263305322e-05, + "loss": 0.3686, + "step": 33979 + }, + { + "epoch": 18.98324022346369, + "grad_norm": 0.44442471861839294, + "learning_rate": 5.142857142857143e-05, + "loss": 0.3932, + "step": 33980 + }, + { + "epoch": 18.983798882681565, + "grad_norm": 0.45789602398872375, + "learning_rate": 5.1400560224089635e-05, + "loss": 0.613, + "step": 33981 + }, + { + "epoch": 18.984357541899442, + "grad_norm": 0.5256237387657166, + "learning_rate": 5.137254901960785e-05, + "loss": 0.4683, + "step": 33982 + }, + { + "epoch": 18.98491620111732, + "grad_norm": 1.2099040746688843, + "learning_rate": 5.1344537815126054e-05, + "loss": 0.4535, + "step": 33983 + }, + { + "epoch": 18.985474860335195, + "grad_norm": 0.3841578960418701, + "learning_rate": 5.131652661064426e-05, + "loss": 0.3364, + "step": 33984 + }, + { + "epoch": 18.98603351955307, + "grad_norm": 0.8091573119163513, + "learning_rate": 5.1288515406162466e-05, + "loss": 0.5057, + "step": 33985 + }, + { + "epoch": 18.986592178770948, + "grad_norm": 0.5129916071891785, + "learning_rate": 5.126050420168067e-05, + "loss": 0.4451, + "step": 33986 + }, + { + "epoch": 18.98715083798883, + "grad_norm": 0.39209139347076416, + "learning_rate": 5.123249299719888e-05, + "loss": 0.3354, + "step": 33987 + }, + { + "epoch": 18.987709497206705, + "grad_norm": 0.33828359842300415, + "learning_rate": 5.120448179271709e-05, + "loss": 0.3366, + "step": 33988 + }, + { + "epoch": 18.98826815642458, + "grad_norm": 0.6842661499977112, + "learning_rate": 5.1176470588235296e-05, + "loss": 0.4673, + "step": 33989 + }, + { + "epoch": 18.988826815642458, + "grad_norm": 0.437063604593277, + "learning_rate": 5.11484593837535e-05, + "loss": 0.4506, + "step": 33990 + }, + { + "epoch": 18.989385474860335, + "grad_norm": 0.7568961977958679, + "learning_rate": 5.112044817927171e-05, + "loss": 0.4899, + "step": 33991 + }, + { + "epoch": 18.98994413407821, + "grad_norm": 0.4318029582500458, + "learning_rate": 5.1092436974789914e-05, + "loss": 0.4284, + "step": 33992 + }, + { + "epoch": 18.990502793296088, + "grad_norm": 0.4273245632648468, + "learning_rate": 5.106442577030813e-05, + "loss": 0.4964, + "step": 33993 + }, + { + "epoch": 18.991061452513968, + "grad_norm": 0.42612603306770325, + "learning_rate": 5.103641456582633e-05, + "loss": 0.3822, + "step": 33994 + }, + { + "epoch": 18.991620111731844, + "grad_norm": 0.6867136359214783, + "learning_rate": 5.100840336134454e-05, + "loss": 0.3913, + "step": 33995 + }, + { + "epoch": 18.99217877094972, + "grad_norm": 0.4082396924495697, + "learning_rate": 5.0980392156862745e-05, + "loss": 0.4082, + "step": 33996 + }, + { + "epoch": 18.992737430167598, + "grad_norm": 0.5038939714431763, + "learning_rate": 5.095238095238095e-05, + "loss": 0.3688, + "step": 33997 + }, + { + "epoch": 18.993296089385474, + "grad_norm": 6.329892635345459, + "learning_rate": 5.092436974789916e-05, + "loss": 0.3698, + "step": 33998 + }, + { + "epoch": 18.99385474860335, + "grad_norm": 0.4058187007904053, + "learning_rate": 5.089635854341737e-05, + "loss": 0.3684, + "step": 33999 + }, + { + "epoch": 18.994413407821227, + "grad_norm": 0.4301515221595764, + "learning_rate": 5.0868347338935576e-05, + "loss": 0.3489, + "step": 34000 + }, + { + "epoch": 18.994413407821227, + "eval_cer": 0.08439440061972876, + "eval_loss": 0.31858545541763306, + "eval_runtime": 55.4392, + "eval_samples_per_second": 81.855, + "eval_steps_per_second": 5.123, + "eval_wer": 0.334655456551493, + "step": 34000 + }, + { + "epoch": 18.994972067039107, + "grad_norm": 0.696684718132019, + "learning_rate": 5.084033613445378e-05, + "loss": 0.3138, + "step": 34001 + }, + { + "epoch": 18.995530726256984, + "grad_norm": 2.6737091541290283, + "learning_rate": 5.081232492997199e-05, + "loss": 0.2794, + "step": 34002 + }, + { + "epoch": 18.99608938547486, + "grad_norm": 0.4672900438308716, + "learning_rate": 5.0784313725490194e-05, + "loss": 0.4445, + "step": 34003 + }, + { + "epoch": 18.996648044692737, + "grad_norm": 0.38065600395202637, + "learning_rate": 5.075630252100841e-05, + "loss": 0.2964, + "step": 34004 + }, + { + "epoch": 18.997206703910614, + "grad_norm": 0.4367673993110657, + "learning_rate": 5.072829131652661e-05, + "loss": 0.5116, + "step": 34005 + }, + { + "epoch": 18.99776536312849, + "grad_norm": 1.6056444644927979, + "learning_rate": 5.070028011204482e-05, + "loss": 0.4629, + "step": 34006 + }, + { + "epoch": 18.99832402234637, + "grad_norm": 0.3913673162460327, + "learning_rate": 5.0672268907563025e-05, + "loss": 0.346, + "step": 34007 + }, + { + "epoch": 18.998882681564247, + "grad_norm": 0.6824917197227478, + "learning_rate": 5.064425770308123e-05, + "loss": 0.5012, + "step": 34008 + }, + { + "epoch": 18.999441340782123, + "grad_norm": 1.9792801141738892, + "learning_rate": 5.0616246498599443e-05, + "loss": 0.4343, + "step": 34009 + }, + { + "epoch": 19.0, + "grad_norm": 0.838819682598114, + "learning_rate": 5.058823529411765e-05, + "loss": 0.381, + "step": 34010 + }, + { + "epoch": 19.000558659217877, + "grad_norm": 0.3558765947818756, + "learning_rate": 5.0560224089635855e-05, + "loss": 0.3837, + "step": 34011 + }, + { + "epoch": 19.001117318435753, + "grad_norm": 0.8690874576568604, + "learning_rate": 5.053221288515406e-05, + "loss": 0.4975, + "step": 34012 + }, + { + "epoch": 19.00167597765363, + "grad_norm": 0.34056082367897034, + "learning_rate": 5.050420168067227e-05, + "loss": 0.3591, + "step": 34013 + }, + { + "epoch": 19.00223463687151, + "grad_norm": 0.4120817184448242, + "learning_rate": 5.047619047619047e-05, + "loss": 0.4373, + "step": 34014 + }, + { + "epoch": 19.002793296089386, + "grad_norm": 0.612208902835846, + "learning_rate": 5.0448179271708686e-05, + "loss": 0.3276, + "step": 34015 + }, + { + "epoch": 19.003351955307263, + "grad_norm": 0.33575963973999023, + "learning_rate": 5.042016806722689e-05, + "loss": 0.3332, + "step": 34016 + }, + { + "epoch": 19.00391061452514, + "grad_norm": 0.5213196277618408, + "learning_rate": 5.03921568627451e-05, + "loss": 0.4321, + "step": 34017 + }, + { + "epoch": 19.004469273743016, + "grad_norm": 0.4308393597602844, + "learning_rate": 5.0364145658263304e-05, + "loss": 0.3894, + "step": 34018 + }, + { + "epoch": 19.005027932960893, + "grad_norm": 1.910498023033142, + "learning_rate": 5.033613445378151e-05, + "loss": 0.4154, + "step": 34019 + }, + { + "epoch": 19.00558659217877, + "grad_norm": 0.6421675682067871, + "learning_rate": 5.030812324929972e-05, + "loss": 0.4603, + "step": 34020 + }, + { + "epoch": 19.00614525139665, + "grad_norm": 0.37055546045303345, + "learning_rate": 5.028011204481793e-05, + "loss": 0.332, + "step": 34021 + }, + { + "epoch": 19.006703910614526, + "grad_norm": 0.427277147769928, + "learning_rate": 5.0252100840336135e-05, + "loss": 0.4285, + "step": 34022 + }, + { + "epoch": 19.007262569832402, + "grad_norm": 1.2247799634933472, + "learning_rate": 5.022408963585434e-05, + "loss": 0.4186, + "step": 34023 + }, + { + "epoch": 19.00782122905028, + "grad_norm": 0.3420492708683014, + "learning_rate": 5.019607843137255e-05, + "loss": 0.3582, + "step": 34024 + }, + { + "epoch": 19.008379888268156, + "grad_norm": 0.4589462876319885, + "learning_rate": 5.016806722689075e-05, + "loss": 0.353, + "step": 34025 + }, + { + "epoch": 19.008938547486032, + "grad_norm": 0.3948412537574768, + "learning_rate": 5.0140056022408966e-05, + "loss": 0.4366, + "step": 34026 + }, + { + "epoch": 19.009497206703912, + "grad_norm": 0.3301945626735687, + "learning_rate": 5.011204481792717e-05, + "loss": 0.3213, + "step": 34027 + }, + { + "epoch": 19.01005586592179, + "grad_norm": 0.478463351726532, + "learning_rate": 5.008403361344538e-05, + "loss": 0.5252, + "step": 34028 + }, + { + "epoch": 19.010614525139665, + "grad_norm": 0.4208948016166687, + "learning_rate": 5.0056022408963584e-05, + "loss": 0.4559, + "step": 34029 + }, + { + "epoch": 19.011173184357542, + "grad_norm": 0.7570073008537292, + "learning_rate": 5.002801120448179e-05, + "loss": 0.4632, + "step": 34030 + }, + { + "epoch": 19.01173184357542, + "grad_norm": 0.4433969557285309, + "learning_rate": 5e-05, + "loss": 0.3333, + "step": 34031 + }, + { + "epoch": 19.012290502793295, + "grad_norm": 0.7575501203536987, + "learning_rate": 4.997198879551821e-05, + "loss": 0.4447, + "step": 34032 + }, + { + "epoch": 19.01284916201117, + "grad_norm": 0.6942957043647766, + "learning_rate": 4.9943977591036414e-05, + "loss": 0.5404, + "step": 34033 + }, + { + "epoch": 19.013407821229052, + "grad_norm": 0.41899439692497253, + "learning_rate": 4.991596638655462e-05, + "loss": 0.3727, + "step": 34034 + }, + { + "epoch": 19.01396648044693, + "grad_norm": 0.43453991413116455, + "learning_rate": 4.9887955182072826e-05, + "loss": 0.3685, + "step": 34035 + }, + { + "epoch": 19.014525139664805, + "grad_norm": 0.3726389706134796, + "learning_rate": 4.985994397759104e-05, + "loss": 0.5042, + "step": 34036 + }, + { + "epoch": 19.01508379888268, + "grad_norm": 1.0077435970306396, + "learning_rate": 4.9831932773109245e-05, + "loss": 0.4406, + "step": 34037 + }, + { + "epoch": 19.015642458100558, + "grad_norm": 0.3891523778438568, + "learning_rate": 4.980392156862745e-05, + "loss": 0.3346, + "step": 34038 + }, + { + "epoch": 19.016201117318435, + "grad_norm": 0.4843681752681732, + "learning_rate": 4.977591036414566e-05, + "loss": 0.3849, + "step": 34039 + }, + { + "epoch": 19.01675977653631, + "grad_norm": 0.7743499279022217, + "learning_rate": 4.974789915966386e-05, + "loss": 0.7428, + "step": 34040 + }, + { + "epoch": 19.01731843575419, + "grad_norm": 0.3791869282722473, + "learning_rate": 4.971988795518207e-05, + "loss": 0.3572, + "step": 34041 + }, + { + "epoch": 19.017877094972068, + "grad_norm": 0.4791411757469177, + "learning_rate": 4.969187675070028e-05, + "loss": 0.4102, + "step": 34042 + }, + { + "epoch": 19.018435754189944, + "grad_norm": 0.4512704610824585, + "learning_rate": 4.966386554621849e-05, + "loss": 0.4702, + "step": 34043 + }, + { + "epoch": 19.01899441340782, + "grad_norm": 0.321982204914093, + "learning_rate": 4.9635854341736694e-05, + "loss": 0.3361, + "step": 34044 + }, + { + "epoch": 19.019553072625698, + "grad_norm": 0.40492740273475647, + "learning_rate": 4.96078431372549e-05, + "loss": 0.4158, + "step": 34045 + }, + { + "epoch": 19.020111731843574, + "grad_norm": 0.4612349569797516, + "learning_rate": 4.9579831932773106e-05, + "loss": 0.4831, + "step": 34046 + }, + { + "epoch": 19.02067039106145, + "grad_norm": 0.38113701343536377, + "learning_rate": 4.955182072829132e-05, + "loss": 0.4136, + "step": 34047 + }, + { + "epoch": 19.02122905027933, + "grad_norm": 0.4612719714641571, + "learning_rate": 4.9523809523809525e-05, + "loss": 0.4485, + "step": 34048 + }, + { + "epoch": 19.021787709497207, + "grad_norm": 0.5298845767974854, + "learning_rate": 4.949579831932773e-05, + "loss": 0.4222, + "step": 34049 + }, + { + "epoch": 19.022346368715084, + "grad_norm": 3.101620674133301, + "learning_rate": 4.946778711484594e-05, + "loss": 0.5287, + "step": 34050 + }, + { + "epoch": 19.02290502793296, + "grad_norm": 0.33950090408325195, + "learning_rate": 4.943977591036414e-05, + "loss": 0.3762, + "step": 34051 + }, + { + "epoch": 19.023463687150837, + "grad_norm": 0.37245798110961914, + "learning_rate": 4.941176470588235e-05, + "loss": 0.3819, + "step": 34052 + }, + { + "epoch": 19.024022346368714, + "grad_norm": 0.4824327826499939, + "learning_rate": 4.938375350140056e-05, + "loss": 0.5237, + "step": 34053 + }, + { + "epoch": 19.024581005586594, + "grad_norm": 0.8317398428916931, + "learning_rate": 4.935574229691877e-05, + "loss": 0.3923, + "step": 34054 + }, + { + "epoch": 19.02513966480447, + "grad_norm": 0.8856366872787476, + "learning_rate": 4.932773109243697e-05, + "loss": 0.3664, + "step": 34055 + }, + { + "epoch": 19.025698324022347, + "grad_norm": 2.81778883934021, + "learning_rate": 4.929971988795518e-05, + "loss": 0.4356, + "step": 34056 + }, + { + "epoch": 19.026256983240224, + "grad_norm": 0.667323648929596, + "learning_rate": 4.9271708683473385e-05, + "loss": 0.4066, + "step": 34057 + }, + { + "epoch": 19.0268156424581, + "grad_norm": 0.4666479527950287, + "learning_rate": 4.92436974789916e-05, + "loss": 0.3425, + "step": 34058 + }, + { + "epoch": 19.027374301675977, + "grad_norm": 0.41187188029289246, + "learning_rate": 4.9215686274509804e-05, + "loss": 0.4823, + "step": 34059 + }, + { + "epoch": 19.027932960893853, + "grad_norm": 0.5146870017051697, + "learning_rate": 4.918767507002801e-05, + "loss": 0.4044, + "step": 34060 + }, + { + "epoch": 19.028491620111733, + "grad_norm": 0.42817193269729614, + "learning_rate": 4.9159663865546216e-05, + "loss": 0.3684, + "step": 34061 + }, + { + "epoch": 19.02905027932961, + "grad_norm": 8.784381866455078, + "learning_rate": 4.913165266106442e-05, + "loss": 0.4205, + "step": 34062 + }, + { + "epoch": 19.029608938547486, + "grad_norm": 0.7608481049537659, + "learning_rate": 4.9103641456582635e-05, + "loss": 0.3371, + "step": 34063 + }, + { + "epoch": 19.030167597765363, + "grad_norm": 0.453857421875, + "learning_rate": 4.907563025210084e-05, + "loss": 0.4081, + "step": 34064 + }, + { + "epoch": 19.03072625698324, + "grad_norm": 0.5360804200172424, + "learning_rate": 4.904761904761905e-05, + "loss": 0.2983, + "step": 34065 + }, + { + "epoch": 19.031284916201116, + "grad_norm": 0.8454349040985107, + "learning_rate": 4.901960784313725e-05, + "loss": 0.4219, + "step": 34066 + }, + { + "epoch": 19.031843575418993, + "grad_norm": 0.4815874695777893, + "learning_rate": 4.899159663865546e-05, + "loss": 0.4718, + "step": 34067 + }, + { + "epoch": 19.032402234636873, + "grad_norm": 0.6291993260383606, + "learning_rate": 4.8963585434173665e-05, + "loss": 0.3947, + "step": 34068 + }, + { + "epoch": 19.03296089385475, + "grad_norm": 0.4006820321083069, + "learning_rate": 4.893557422969188e-05, + "loss": 0.3354, + "step": 34069 + }, + { + "epoch": 19.033519553072626, + "grad_norm": 1.2439199686050415, + "learning_rate": 4.8907563025210084e-05, + "loss": 0.3404, + "step": 34070 + }, + { + "epoch": 19.034078212290503, + "grad_norm": 0.552683413028717, + "learning_rate": 4.887955182072829e-05, + "loss": 0.7117, + "step": 34071 + }, + { + "epoch": 19.03463687150838, + "grad_norm": 0.386258602142334, + "learning_rate": 4.8851540616246496e-05, + "loss": 0.3918, + "step": 34072 + }, + { + "epoch": 19.035195530726256, + "grad_norm": 0.4527541995048523, + "learning_rate": 4.88235294117647e-05, + "loss": 0.3915, + "step": 34073 + }, + { + "epoch": 19.035754189944136, + "grad_norm": 0.43757620453834534, + "learning_rate": 4.8795518207282914e-05, + "loss": 0.4162, + "step": 34074 + }, + { + "epoch": 19.036312849162012, + "grad_norm": 0.32871389389038086, + "learning_rate": 4.876750700280112e-05, + "loss": 0.3347, + "step": 34075 + }, + { + "epoch": 19.03687150837989, + "grad_norm": 0.42162078619003296, + "learning_rate": 4.8739495798319326e-05, + "loss": 0.4381, + "step": 34076 + }, + { + "epoch": 19.037430167597766, + "grad_norm": 2.049283266067505, + "learning_rate": 4.871148459383753e-05, + "loss": 0.333, + "step": 34077 + }, + { + "epoch": 19.037988826815642, + "grad_norm": 0.9449719786643982, + "learning_rate": 4.868347338935574e-05, + "loss": 0.4235, + "step": 34078 + }, + { + "epoch": 19.03854748603352, + "grad_norm": 0.39846327900886536, + "learning_rate": 4.865546218487395e-05, + "loss": 0.4082, + "step": 34079 + }, + { + "epoch": 19.039106145251395, + "grad_norm": 0.9023163318634033, + "learning_rate": 4.862745098039216e-05, + "loss": 0.4537, + "step": 34080 + }, + { + "epoch": 19.039664804469275, + "grad_norm": 0.6002269387245178, + "learning_rate": 4.859943977591036e-05, + "loss": 0.4, + "step": 34081 + }, + { + "epoch": 19.040223463687152, + "grad_norm": 0.4130149483680725, + "learning_rate": 4.857142857142857e-05, + "loss": 0.4301, + "step": 34082 + }, + { + "epoch": 19.04078212290503, + "grad_norm": 0.6279932856559753, + "learning_rate": 4.8543417366946775e-05, + "loss": 0.4048, + "step": 34083 + }, + { + "epoch": 19.041340782122905, + "grad_norm": 0.862360954284668, + "learning_rate": 4.851540616246498e-05, + "loss": 0.6508, + "step": 34084 + }, + { + "epoch": 19.04189944134078, + "grad_norm": 0.4316216707229614, + "learning_rate": 4.8487394957983194e-05, + "loss": 0.4083, + "step": 34085 + }, + { + "epoch": 19.042458100558658, + "grad_norm": 0.39052802324295044, + "learning_rate": 4.84593837535014e-05, + "loss": 0.4149, + "step": 34086 + }, + { + "epoch": 19.043016759776535, + "grad_norm": 0.3880631625652313, + "learning_rate": 4.8431372549019606e-05, + "loss": 0.4942, + "step": 34087 + }, + { + "epoch": 19.043575418994415, + "grad_norm": 0.3922453224658966, + "learning_rate": 4.840336134453781e-05, + "loss": 0.3689, + "step": 34088 + }, + { + "epoch": 19.04413407821229, + "grad_norm": 0.5278199911117554, + "learning_rate": 4.837535014005602e-05, + "loss": 0.4032, + "step": 34089 + }, + { + "epoch": 19.044692737430168, + "grad_norm": 0.37316828966140747, + "learning_rate": 4.834733893557423e-05, + "loss": 0.435, + "step": 34090 + }, + { + "epoch": 19.045251396648045, + "grad_norm": 0.3822898864746094, + "learning_rate": 4.831932773109244e-05, + "loss": 0.3519, + "step": 34091 + }, + { + "epoch": 19.04581005586592, + "grad_norm": 0.5060520172119141, + "learning_rate": 4.829131652661064e-05, + "loss": 0.3543, + "step": 34092 + }, + { + "epoch": 19.046368715083798, + "grad_norm": 0.7076753377914429, + "learning_rate": 4.826330532212885e-05, + "loss": 0.3482, + "step": 34093 + }, + { + "epoch": 19.046927374301674, + "grad_norm": 0.38288766145706177, + "learning_rate": 4.8235294117647055e-05, + "loss": 0.4134, + "step": 34094 + }, + { + "epoch": 19.047486033519554, + "grad_norm": 0.5623285174369812, + "learning_rate": 4.820728291316526e-05, + "loss": 0.4837, + "step": 34095 + }, + { + "epoch": 19.04804469273743, + "grad_norm": 0.4888148009777069, + "learning_rate": 4.817927170868347e-05, + "loss": 0.3216, + "step": 34096 + }, + { + "epoch": 19.048603351955308, + "grad_norm": 0.6230806708335876, + "learning_rate": 4.815126050420168e-05, + "loss": 0.5085, + "step": 34097 + }, + { + "epoch": 19.049162011173184, + "grad_norm": 0.7253946661949158, + "learning_rate": 4.8123249299719885e-05, + "loss": 0.6701, + "step": 34098 + }, + { + "epoch": 19.04972067039106, + "grad_norm": 0.6751520037651062, + "learning_rate": 4.809523809523809e-05, + "loss": 0.4458, + "step": 34099 + }, + { + "epoch": 19.050279329608937, + "grad_norm": 0.4840782582759857, + "learning_rate": 4.80672268907563e-05, + "loss": 0.4558, + "step": 34100 + }, + { + "epoch": 19.050837988826817, + "grad_norm": 0.5162602663040161, + "learning_rate": 4.803921568627452e-05, + "loss": 0.3498, + "step": 34101 + }, + { + "epoch": 19.051396648044694, + "grad_norm": 0.37954264879226685, + "learning_rate": 4.801120448179272e-05, + "loss": 0.5211, + "step": 34102 + }, + { + "epoch": 19.05195530726257, + "grad_norm": 0.4165072441101074, + "learning_rate": 4.798319327731093e-05, + "loss": 0.4125, + "step": 34103 + }, + { + "epoch": 19.052513966480447, + "grad_norm": 0.5712683796882629, + "learning_rate": 4.7955182072829135e-05, + "loss": 0.3782, + "step": 34104 + }, + { + "epoch": 19.053072625698324, + "grad_norm": 0.29432412981987, + "learning_rate": 4.792717086834734e-05, + "loss": 0.3782, + "step": 34105 + }, + { + "epoch": 19.0536312849162, + "grad_norm": 0.3098897635936737, + "learning_rate": 4.7899159663865554e-05, + "loss": 0.3918, + "step": 34106 + }, + { + "epoch": 19.054189944134077, + "grad_norm": 0.5611530542373657, + "learning_rate": 4.787114845938376e-05, + "loss": 0.4941, + "step": 34107 + }, + { + "epoch": 19.054748603351957, + "grad_norm": 1.9435120820999146, + "learning_rate": 4.7843137254901966e-05, + "loss": 0.4363, + "step": 34108 + }, + { + "epoch": 19.055307262569833, + "grad_norm": 0.46485641598701477, + "learning_rate": 4.781512605042017e-05, + "loss": 0.438, + "step": 34109 + }, + { + "epoch": 19.05586592178771, + "grad_norm": 0.47409337759017944, + "learning_rate": 4.778711484593838e-05, + "loss": 0.579, + "step": 34110 + }, + { + "epoch": 19.056424581005587, + "grad_norm": 0.6881526708602905, + "learning_rate": 4.7759103641456584e-05, + "loss": 0.3658, + "step": 34111 + }, + { + "epoch": 19.056983240223463, + "grad_norm": 0.468822181224823, + "learning_rate": 4.7731092436974796e-05, + "loss": 0.4229, + "step": 34112 + }, + { + "epoch": 19.05754189944134, + "grad_norm": 0.4482509195804596, + "learning_rate": 4.7703081232493e-05, + "loss": 0.3863, + "step": 34113 + }, + { + "epoch": 19.058100558659216, + "grad_norm": 0.40117621421813965, + "learning_rate": 4.767507002801121e-05, + "loss": 0.4044, + "step": 34114 + }, + { + "epoch": 19.058659217877096, + "grad_norm": 0.6154475212097168, + "learning_rate": 4.7647058823529414e-05, + "loss": 0.4546, + "step": 34115 + }, + { + "epoch": 19.059217877094973, + "grad_norm": 0.4011020064353943, + "learning_rate": 4.761904761904762e-05, + "loss": 0.3227, + "step": 34116 + }, + { + "epoch": 19.05977653631285, + "grad_norm": 0.8815206289291382, + "learning_rate": 4.759103641456583e-05, + "loss": 0.3934, + "step": 34117 + }, + { + "epoch": 19.060335195530726, + "grad_norm": 1.187586784362793, + "learning_rate": 4.756302521008404e-05, + "loss": 0.4635, + "step": 34118 + }, + { + "epoch": 19.060893854748603, + "grad_norm": 0.7696611881256104, + "learning_rate": 4.7535014005602245e-05, + "loss": 0.5406, + "step": 34119 + }, + { + "epoch": 19.06145251396648, + "grad_norm": 0.35524311661720276, + "learning_rate": 4.750700280112045e-05, + "loss": 0.3825, + "step": 34120 + }, + { + "epoch": 19.062011173184356, + "grad_norm": 0.42355555295944214, + "learning_rate": 4.747899159663866e-05, + "loss": 0.3914, + "step": 34121 + }, + { + "epoch": 19.062569832402236, + "grad_norm": 1.7967973947525024, + "learning_rate": 4.745098039215686e-05, + "loss": 0.3586, + "step": 34122 + }, + { + "epoch": 19.063128491620112, + "grad_norm": 0.5909311771392822, + "learning_rate": 4.7422969187675076e-05, + "loss": 0.5545, + "step": 34123 + }, + { + "epoch": 19.06368715083799, + "grad_norm": 0.5110454559326172, + "learning_rate": 4.739495798319328e-05, + "loss": 0.3988, + "step": 34124 + }, + { + "epoch": 19.064245810055866, + "grad_norm": 0.35232388973236084, + "learning_rate": 4.736694677871149e-05, + "loss": 0.3914, + "step": 34125 + }, + { + "epoch": 19.064804469273742, + "grad_norm": 0.3780320882797241, + "learning_rate": 4.7338935574229694e-05, + "loss": 0.4401, + "step": 34126 + }, + { + "epoch": 19.06536312849162, + "grad_norm": 0.31993746757507324, + "learning_rate": 4.73109243697479e-05, + "loss": 0.3499, + "step": 34127 + }, + { + "epoch": 19.0659217877095, + "grad_norm": 0.7794672846794128, + "learning_rate": 4.728291316526611e-05, + "loss": 0.3744, + "step": 34128 + }, + { + "epoch": 19.066480446927375, + "grad_norm": 0.585939884185791, + "learning_rate": 4.725490196078432e-05, + "loss": 0.4236, + "step": 34129 + }, + { + "epoch": 19.067039106145252, + "grad_norm": 0.3240194022655487, + "learning_rate": 4.7226890756302525e-05, + "loss": 0.3468, + "step": 34130 + }, + { + "epoch": 19.06759776536313, + "grad_norm": 0.5565316677093506, + "learning_rate": 4.719887955182073e-05, + "loss": 0.47, + "step": 34131 + }, + { + "epoch": 19.068156424581005, + "grad_norm": 0.43060302734375, + "learning_rate": 4.7170868347338937e-05, + "loss": 0.378, + "step": 34132 + }, + { + "epoch": 19.06871508379888, + "grad_norm": 0.4093746840953827, + "learning_rate": 4.714285714285715e-05, + "loss": 0.4313, + "step": 34133 + }, + { + "epoch": 19.06927374301676, + "grad_norm": 0.714434802532196, + "learning_rate": 4.7114845938375355e-05, + "loss": 0.4189, + "step": 34134 + }, + { + "epoch": 19.06983240223464, + "grad_norm": 0.484144926071167, + "learning_rate": 4.708683473389356e-05, + "loss": 0.4037, + "step": 34135 + }, + { + "epoch": 19.070391061452515, + "grad_norm": 0.388107568025589, + "learning_rate": 4.705882352941177e-05, + "loss": 0.3177, + "step": 34136 + }, + { + "epoch": 19.07094972067039, + "grad_norm": 0.4172494411468506, + "learning_rate": 4.703081232492997e-05, + "loss": 0.3919, + "step": 34137 + }, + { + "epoch": 19.071508379888268, + "grad_norm": 0.41301459074020386, + "learning_rate": 4.700280112044818e-05, + "loss": 0.3968, + "step": 34138 + }, + { + "epoch": 19.072067039106145, + "grad_norm": 0.36860454082489014, + "learning_rate": 4.697478991596639e-05, + "loss": 0.3644, + "step": 34139 + }, + { + "epoch": 19.07262569832402, + "grad_norm": 0.36383143067359924, + "learning_rate": 4.69467787114846e-05, + "loss": 0.3251, + "step": 34140 + }, + { + "epoch": 19.073184357541898, + "grad_norm": 0.4134189486503601, + "learning_rate": 4.6918767507002804e-05, + "loss": 0.3419, + "step": 34141 + }, + { + "epoch": 19.073743016759778, + "grad_norm": 0.6444626450538635, + "learning_rate": 4.689075630252101e-05, + "loss": 0.3581, + "step": 34142 + }, + { + "epoch": 19.074301675977654, + "grad_norm": 0.9333446025848389, + "learning_rate": 4.6862745098039216e-05, + "loss": 0.3873, + "step": 34143 + }, + { + "epoch": 19.07486033519553, + "grad_norm": 0.5211381912231445, + "learning_rate": 4.683473389355743e-05, + "loss": 0.4522, + "step": 34144 + }, + { + "epoch": 19.075418994413408, + "grad_norm": 0.6196352243423462, + "learning_rate": 4.6806722689075635e-05, + "loss": 0.3596, + "step": 34145 + }, + { + "epoch": 19.075977653631284, + "grad_norm": 0.35157525539398193, + "learning_rate": 4.677871148459384e-05, + "loss": 0.3365, + "step": 34146 + }, + { + "epoch": 19.07653631284916, + "grad_norm": 0.4391478896141052, + "learning_rate": 4.675070028011205e-05, + "loss": 0.3466, + "step": 34147 + }, + { + "epoch": 19.07709497206704, + "grad_norm": 0.3524358570575714, + "learning_rate": 4.672268907563025e-05, + "loss": 0.3226, + "step": 34148 + }, + { + "epoch": 19.077653631284917, + "grad_norm": 0.4521028995513916, + "learning_rate": 4.6694677871148466e-05, + "loss": 0.3493, + "step": 34149 + }, + { + "epoch": 19.078212290502794, + "grad_norm": 1.3534393310546875, + "learning_rate": 4.666666666666667e-05, + "loss": 0.3782, + "step": 34150 + }, + { + "epoch": 19.07877094972067, + "grad_norm": 1.945213794708252, + "learning_rate": 4.663865546218488e-05, + "loss": 0.4144, + "step": 34151 + }, + { + "epoch": 19.079329608938547, + "grad_norm": 1.0611389875411987, + "learning_rate": 4.6610644257703084e-05, + "loss": 0.4153, + "step": 34152 + }, + { + "epoch": 19.079888268156424, + "grad_norm": 0.8366313576698303, + "learning_rate": 4.658263305322129e-05, + "loss": 0.3939, + "step": 34153 + }, + { + "epoch": 19.0804469273743, + "grad_norm": 0.7407335042953491, + "learning_rate": 4.6554621848739496e-05, + "loss": 0.4789, + "step": 34154 + }, + { + "epoch": 19.08100558659218, + "grad_norm": 0.7199921607971191, + "learning_rate": 4.652661064425771e-05, + "loss": 0.5183, + "step": 34155 + }, + { + "epoch": 19.081564245810057, + "grad_norm": 1.6046547889709473, + "learning_rate": 4.6498599439775914e-05, + "loss": 0.4423, + "step": 34156 + }, + { + "epoch": 19.082122905027934, + "grad_norm": 0.48168522119522095, + "learning_rate": 4.647058823529412e-05, + "loss": 0.3974, + "step": 34157 + }, + { + "epoch": 19.08268156424581, + "grad_norm": 0.4296114444732666, + "learning_rate": 4.6442577030812326e-05, + "loss": 0.3463, + "step": 34158 + }, + { + "epoch": 19.083240223463687, + "grad_norm": 2.2824909687042236, + "learning_rate": 4.641456582633053e-05, + "loss": 0.3455, + "step": 34159 + }, + { + "epoch": 19.083798882681563, + "grad_norm": 0.4152123034000397, + "learning_rate": 4.6386554621848745e-05, + "loss": 0.4699, + "step": 34160 + }, + { + "epoch": 19.08435754189944, + "grad_norm": 0.7814894318580627, + "learning_rate": 4.635854341736695e-05, + "loss": 0.458, + "step": 34161 + }, + { + "epoch": 19.08491620111732, + "grad_norm": 1.4727343320846558, + "learning_rate": 4.633053221288516e-05, + "loss": 0.4037, + "step": 34162 + }, + { + "epoch": 19.085474860335196, + "grad_norm": 0.5679681897163391, + "learning_rate": 4.630252100840336e-05, + "loss": 0.4008, + "step": 34163 + }, + { + "epoch": 19.086033519553073, + "grad_norm": 0.5303749442100525, + "learning_rate": 4.627450980392157e-05, + "loss": 0.3487, + "step": 34164 + }, + { + "epoch": 19.08659217877095, + "grad_norm": 0.5636522173881531, + "learning_rate": 4.6246498599439775e-05, + "loss": 0.4373, + "step": 34165 + }, + { + "epoch": 19.087150837988826, + "grad_norm": 5.520827293395996, + "learning_rate": 4.621848739495799e-05, + "loss": 0.4092, + "step": 34166 + }, + { + "epoch": 19.087709497206703, + "grad_norm": 0.6318961381912231, + "learning_rate": 4.6190476190476194e-05, + "loss": 0.3045, + "step": 34167 + }, + { + "epoch": 19.08826815642458, + "grad_norm": 0.39643406867980957, + "learning_rate": 4.61624649859944e-05, + "loss": 0.4362, + "step": 34168 + }, + { + "epoch": 19.08882681564246, + "grad_norm": 0.37128540873527527, + "learning_rate": 4.6134453781512606e-05, + "loss": 0.345, + "step": 34169 + }, + { + "epoch": 19.089385474860336, + "grad_norm": 0.5596094727516174, + "learning_rate": 4.610644257703081e-05, + "loss": 0.3603, + "step": 34170 + }, + { + "epoch": 19.089944134078213, + "grad_norm": 0.5508023500442505, + "learning_rate": 4.6078431372549025e-05, + "loss": 0.5016, + "step": 34171 + }, + { + "epoch": 19.09050279329609, + "grad_norm": 0.6179094314575195, + "learning_rate": 4.605042016806723e-05, + "loss": 0.346, + "step": 34172 + }, + { + "epoch": 19.091061452513966, + "grad_norm": 1.7648438215255737, + "learning_rate": 4.6022408963585437e-05, + "loss": 0.4677, + "step": 34173 + }, + { + "epoch": 19.091620111731842, + "grad_norm": 0.401187539100647, + "learning_rate": 4.599439775910364e-05, + "loss": 0.432, + "step": 34174 + }, + { + "epoch": 19.092178770949722, + "grad_norm": 0.5296809077262878, + "learning_rate": 4.596638655462185e-05, + "loss": 0.407, + "step": 34175 + }, + { + "epoch": 19.0927374301676, + "grad_norm": 0.40033671259880066, + "learning_rate": 4.593837535014006e-05, + "loss": 0.4148, + "step": 34176 + }, + { + "epoch": 19.093296089385476, + "grad_norm": 0.76639723777771, + "learning_rate": 4.591036414565827e-05, + "loss": 0.4094, + "step": 34177 + }, + { + "epoch": 19.093854748603352, + "grad_norm": 1.1314560174942017, + "learning_rate": 4.588235294117647e-05, + "loss": 0.3606, + "step": 34178 + }, + { + "epoch": 19.09441340782123, + "grad_norm": 0.4724883437156677, + "learning_rate": 4.585434173669468e-05, + "loss": 0.6665, + "step": 34179 + }, + { + "epoch": 19.094972067039105, + "grad_norm": 0.3455270230770111, + "learning_rate": 4.5826330532212885e-05, + "loss": 0.4485, + "step": 34180 + }, + { + "epoch": 19.095530726256982, + "grad_norm": 0.5339173674583435, + "learning_rate": 4.579831932773109e-05, + "loss": 0.4023, + "step": 34181 + }, + { + "epoch": 19.096089385474862, + "grad_norm": 0.6792379021644592, + "learning_rate": 4.5770308123249304e-05, + "loss": 0.3938, + "step": 34182 + }, + { + "epoch": 19.09664804469274, + "grad_norm": 0.3906536400318146, + "learning_rate": 4.574229691876751e-05, + "loss": 0.4277, + "step": 34183 + }, + { + "epoch": 19.097206703910615, + "grad_norm": 0.5333456993103027, + "learning_rate": 4.5714285714285716e-05, + "loss": 0.394, + "step": 34184 + }, + { + "epoch": 19.09776536312849, + "grad_norm": 0.6192470192909241, + "learning_rate": 4.568627450980392e-05, + "loss": 0.391, + "step": 34185 + }, + { + "epoch": 19.098324022346368, + "grad_norm": 0.2980307340621948, + "learning_rate": 4.565826330532213e-05, + "loss": 0.3264, + "step": 34186 + }, + { + "epoch": 19.098882681564245, + "grad_norm": 0.7014803886413574, + "learning_rate": 4.563025210084034e-05, + "loss": 0.3392, + "step": 34187 + }, + { + "epoch": 19.09944134078212, + "grad_norm": 0.5543592572212219, + "learning_rate": 4.560224089635855e-05, + "loss": 0.3789, + "step": 34188 + }, + { + "epoch": 19.1, + "grad_norm": 0.4921249449253082, + "learning_rate": 4.557422969187675e-05, + "loss": 0.5058, + "step": 34189 + }, + { + "epoch": 19.100558659217878, + "grad_norm": 0.4610847532749176, + "learning_rate": 4.554621848739496e-05, + "loss": 0.4497, + "step": 34190 + }, + { + "epoch": 19.101117318435755, + "grad_norm": 1.6444391012191772, + "learning_rate": 4.5518207282913165e-05, + "loss": 0.3999, + "step": 34191 + }, + { + "epoch": 19.10167597765363, + "grad_norm": 0.7008644342422485, + "learning_rate": 4.549019607843137e-05, + "loss": 0.3456, + "step": 34192 + }, + { + "epoch": 19.102234636871508, + "grad_norm": 0.4882813096046448, + "learning_rate": 4.5462184873949584e-05, + "loss": 0.4038, + "step": 34193 + }, + { + "epoch": 19.102793296089384, + "grad_norm": 0.5796037912368774, + "learning_rate": 4.543417366946779e-05, + "loss": 0.4282, + "step": 34194 + }, + { + "epoch": 19.10335195530726, + "grad_norm": 0.5729399919509888, + "learning_rate": 4.5406162464985996e-05, + "loss": 0.5945, + "step": 34195 + }, + { + "epoch": 19.10391061452514, + "grad_norm": 0.3941247761249542, + "learning_rate": 4.53781512605042e-05, + "loss": 0.459, + "step": 34196 + }, + { + "epoch": 19.104469273743018, + "grad_norm": 1.8043785095214844, + "learning_rate": 4.535014005602241e-05, + "loss": 0.4556, + "step": 34197 + }, + { + "epoch": 19.105027932960894, + "grad_norm": 0.4125872850418091, + "learning_rate": 4.532212885154062e-05, + "loss": 0.398, + "step": 34198 + }, + { + "epoch": 19.10558659217877, + "grad_norm": 0.3589528203010559, + "learning_rate": 4.5294117647058826e-05, + "loss": 0.3206, + "step": 34199 + }, + { + "epoch": 19.106145251396647, + "grad_norm": 0.5594779253005981, + "learning_rate": 4.526610644257703e-05, + "loss": 0.4212, + "step": 34200 + }, + { + "epoch": 19.106703910614524, + "grad_norm": 0.5324401259422302, + "learning_rate": 4.523809523809524e-05, + "loss": 0.3484, + "step": 34201 + }, + { + "epoch": 19.107262569832404, + "grad_norm": 0.8759794235229492, + "learning_rate": 4.5210084033613444e-05, + "loss": 0.3369, + "step": 34202 + }, + { + "epoch": 19.10782122905028, + "grad_norm": 0.33922043442726135, + "learning_rate": 4.518207282913166e-05, + "loss": 0.2869, + "step": 34203 + }, + { + "epoch": 19.108379888268157, + "grad_norm": 0.45556265115737915, + "learning_rate": 4.515406162464986e-05, + "loss": 0.3603, + "step": 34204 + }, + { + "epoch": 19.108938547486034, + "grad_norm": 0.773074209690094, + "learning_rate": 4.512605042016807e-05, + "loss": 0.4423, + "step": 34205 + }, + { + "epoch": 19.10949720670391, + "grad_norm": 0.40810850262641907, + "learning_rate": 4.5098039215686275e-05, + "loss": 0.3475, + "step": 34206 + }, + { + "epoch": 19.110055865921787, + "grad_norm": 0.33923882246017456, + "learning_rate": 4.507002801120448e-05, + "loss": 0.3494, + "step": 34207 + }, + { + "epoch": 19.110614525139663, + "grad_norm": 0.47900938987731934, + "learning_rate": 4.504201680672269e-05, + "loss": 0.6554, + "step": 34208 + }, + { + "epoch": 19.111173184357543, + "grad_norm": 0.6720021963119507, + "learning_rate": 4.50140056022409e-05, + "loss": 0.3882, + "step": 34209 + }, + { + "epoch": 19.11173184357542, + "grad_norm": 2.907823085784912, + "learning_rate": 4.4985994397759106e-05, + "loss": 0.3919, + "step": 34210 + }, + { + "epoch": 19.112290502793297, + "grad_norm": 0.43489328026771545, + "learning_rate": 4.495798319327731e-05, + "loss": 0.3344, + "step": 34211 + }, + { + "epoch": 19.112849162011173, + "grad_norm": 0.46838128566741943, + "learning_rate": 4.492997198879552e-05, + "loss": 0.4066, + "step": 34212 + }, + { + "epoch": 19.11340782122905, + "grad_norm": 0.3710154891014099, + "learning_rate": 4.4901960784313724e-05, + "loss": 0.3643, + "step": 34213 + }, + { + "epoch": 19.113966480446926, + "grad_norm": 0.5053613781929016, + "learning_rate": 4.4873949579831937e-05, + "loss": 0.3608, + "step": 34214 + }, + { + "epoch": 19.114525139664803, + "grad_norm": 0.5019699931144714, + "learning_rate": 4.484593837535014e-05, + "loss": 0.3882, + "step": 34215 + }, + { + "epoch": 19.115083798882683, + "grad_norm": 3.535247802734375, + "learning_rate": 4.481792717086835e-05, + "loss": 0.4134, + "step": 34216 + }, + { + "epoch": 19.11564245810056, + "grad_norm": 0.5509768724441528, + "learning_rate": 4.4789915966386555e-05, + "loss": 0.442, + "step": 34217 + }, + { + "epoch": 19.116201117318436, + "grad_norm": 0.34652987122535706, + "learning_rate": 4.476190476190476e-05, + "loss": 0.3416, + "step": 34218 + }, + { + "epoch": 19.116759776536313, + "grad_norm": 0.3768334984779358, + "learning_rate": 4.473389355742297e-05, + "loss": 0.3529, + "step": 34219 + }, + { + "epoch": 19.11731843575419, + "grad_norm": 0.48385241627693176, + "learning_rate": 4.470588235294118e-05, + "loss": 0.3047, + "step": 34220 + }, + { + "epoch": 19.117877094972066, + "grad_norm": 0.6893239617347717, + "learning_rate": 4.4677871148459385e-05, + "loss": 0.383, + "step": 34221 + }, + { + "epoch": 19.118435754189946, + "grad_norm": 0.6909326314926147, + "learning_rate": 4.464985994397759e-05, + "loss": 0.5975, + "step": 34222 + }, + { + "epoch": 19.118994413407822, + "grad_norm": 0.440336138010025, + "learning_rate": 4.46218487394958e-05, + "loss": 0.3798, + "step": 34223 + }, + { + "epoch": 19.1195530726257, + "grad_norm": 0.34653693437576294, + "learning_rate": 4.4593837535014e-05, + "loss": 0.3647, + "step": 34224 + }, + { + "epoch": 19.120111731843576, + "grad_norm": 1.9989093542099, + "learning_rate": 4.4565826330532216e-05, + "loss": 0.4407, + "step": 34225 + }, + { + "epoch": 19.120670391061452, + "grad_norm": 0.4312792122364044, + "learning_rate": 4.453781512605042e-05, + "loss": 0.516, + "step": 34226 + }, + { + "epoch": 19.12122905027933, + "grad_norm": 0.4135589897632599, + "learning_rate": 4.450980392156863e-05, + "loss": 0.3385, + "step": 34227 + }, + { + "epoch": 19.121787709497205, + "grad_norm": 0.4922415316104889, + "learning_rate": 4.4481792717086834e-05, + "loss": 0.4132, + "step": 34228 + }, + { + "epoch": 19.122346368715085, + "grad_norm": 0.6351668238639832, + "learning_rate": 4.445378151260504e-05, + "loss": 0.5048, + "step": 34229 + }, + { + "epoch": 19.122905027932962, + "grad_norm": 0.676240086555481, + "learning_rate": 4.442577030812325e-05, + "loss": 0.4116, + "step": 34230 + }, + { + "epoch": 19.12346368715084, + "grad_norm": 0.7866749167442322, + "learning_rate": 4.439775910364146e-05, + "loss": 0.2922, + "step": 34231 + }, + { + "epoch": 19.124022346368715, + "grad_norm": 1.8758156299591064, + "learning_rate": 4.4369747899159665e-05, + "loss": 0.4118, + "step": 34232 + }, + { + "epoch": 19.12458100558659, + "grad_norm": 0.7438153028488159, + "learning_rate": 4.434173669467787e-05, + "loss": 0.3668, + "step": 34233 + }, + { + "epoch": 19.12513966480447, + "grad_norm": 0.36478978395462036, + "learning_rate": 4.431372549019608e-05, + "loss": 0.4436, + "step": 34234 + }, + { + "epoch": 19.125698324022345, + "grad_norm": 0.37556782364845276, + "learning_rate": 4.428571428571428e-05, + "loss": 0.4635, + "step": 34235 + }, + { + "epoch": 19.126256983240225, + "grad_norm": 0.38899460434913635, + "learning_rate": 4.4257703081232496e-05, + "loss": 0.414, + "step": 34236 + }, + { + "epoch": 19.1268156424581, + "grad_norm": 0.4715697169303894, + "learning_rate": 4.42296918767507e-05, + "loss": 0.3814, + "step": 34237 + }, + { + "epoch": 19.127374301675978, + "grad_norm": 0.5459057688713074, + "learning_rate": 4.420168067226891e-05, + "loss": 0.4786, + "step": 34238 + }, + { + "epoch": 19.127932960893855, + "grad_norm": 0.49794602394104004, + "learning_rate": 4.4173669467787114e-05, + "loss": 0.349, + "step": 34239 + }, + { + "epoch": 19.12849162011173, + "grad_norm": 0.3777656853199005, + "learning_rate": 4.414565826330532e-05, + "loss": 0.331, + "step": 34240 + }, + { + "epoch": 19.129050279329608, + "grad_norm": 0.46212896704673767, + "learning_rate": 4.411764705882353e-05, + "loss": 0.5189, + "step": 34241 + }, + { + "epoch": 19.129608938547484, + "grad_norm": 0.4109019935131073, + "learning_rate": 4.408963585434174e-05, + "loss": 0.4203, + "step": 34242 + }, + { + "epoch": 19.130167597765364, + "grad_norm": 0.37591230869293213, + "learning_rate": 4.4061624649859944e-05, + "loss": 0.3335, + "step": 34243 + }, + { + "epoch": 19.13072625698324, + "grad_norm": 0.5432429909706116, + "learning_rate": 4.403361344537815e-05, + "loss": 0.3465, + "step": 34244 + }, + { + "epoch": 19.131284916201118, + "grad_norm": 0.46247363090515137, + "learning_rate": 4.4005602240896356e-05, + "loss": 0.4028, + "step": 34245 + }, + { + "epoch": 19.131843575418994, + "grad_norm": 0.39337995648384094, + "learning_rate": 4.397759103641457e-05, + "loss": 0.3571, + "step": 34246 + }, + { + "epoch": 19.13240223463687, + "grad_norm": 0.36914628744125366, + "learning_rate": 4.3949579831932775e-05, + "loss": 0.4886, + "step": 34247 + }, + { + "epoch": 19.132960893854747, + "grad_norm": 0.49211585521698, + "learning_rate": 4.392156862745098e-05, + "loss": 0.4052, + "step": 34248 + }, + { + "epoch": 19.133519553072627, + "grad_norm": 0.4447023868560791, + "learning_rate": 4.389355742296919e-05, + "loss": 0.3776, + "step": 34249 + }, + { + "epoch": 19.134078212290504, + "grad_norm": 1.1806244850158691, + "learning_rate": 4.386554621848739e-05, + "loss": 0.4135, + "step": 34250 + }, + { + "epoch": 19.13463687150838, + "grad_norm": 0.9135016798973083, + "learning_rate": 4.38375350140056e-05, + "loss": 0.4195, + "step": 34251 + }, + { + "epoch": 19.135195530726257, + "grad_norm": 0.35323119163513184, + "learning_rate": 4.380952380952381e-05, + "loss": 0.3915, + "step": 34252 + }, + { + "epoch": 19.135754189944134, + "grad_norm": 0.5484306812286377, + "learning_rate": 4.378151260504202e-05, + "loss": 0.4333, + "step": 34253 + }, + { + "epoch": 19.13631284916201, + "grad_norm": 0.921738862991333, + "learning_rate": 4.3753501400560224e-05, + "loss": 0.4144, + "step": 34254 + }, + { + "epoch": 19.136871508379887, + "grad_norm": 0.48659461736679077, + "learning_rate": 4.372549019607843e-05, + "loss": 0.4253, + "step": 34255 + }, + { + "epoch": 19.137430167597767, + "grad_norm": 0.7575713992118835, + "learning_rate": 4.3697478991596636e-05, + "loss": 0.3186, + "step": 34256 + }, + { + "epoch": 19.137988826815644, + "grad_norm": 0.9563173055648804, + "learning_rate": 4.366946778711485e-05, + "loss": 0.4094, + "step": 34257 + }, + { + "epoch": 19.13854748603352, + "grad_norm": 0.5873318910598755, + "learning_rate": 4.3641456582633055e-05, + "loss": 0.4059, + "step": 34258 + }, + { + "epoch": 19.139106145251397, + "grad_norm": 0.3507838845252991, + "learning_rate": 4.361344537815126e-05, + "loss": 0.4241, + "step": 34259 + }, + { + "epoch": 19.139664804469273, + "grad_norm": 1.0424703359603882, + "learning_rate": 4.3585434173669467e-05, + "loss": 0.4388, + "step": 34260 + }, + { + "epoch": 19.14022346368715, + "grad_norm": 0.4615655839443207, + "learning_rate": 4.355742296918767e-05, + "loss": 0.438, + "step": 34261 + }, + { + "epoch": 19.140782122905026, + "grad_norm": 0.49827060103416443, + "learning_rate": 4.352941176470588e-05, + "loss": 0.4324, + "step": 34262 + }, + { + "epoch": 19.141340782122906, + "grad_norm": 0.4816020131111145, + "learning_rate": 4.350140056022409e-05, + "loss": 0.4084, + "step": 34263 + }, + { + "epoch": 19.141899441340783, + "grad_norm": 0.44822368025779724, + "learning_rate": 4.34733893557423e-05, + "loss": 0.3578, + "step": 34264 + }, + { + "epoch": 19.14245810055866, + "grad_norm": 0.39726537466049194, + "learning_rate": 4.34453781512605e-05, + "loss": 0.3666, + "step": 34265 + }, + { + "epoch": 19.143016759776536, + "grad_norm": 0.47128039598464966, + "learning_rate": 4.341736694677871e-05, + "loss": 0.3857, + "step": 34266 + }, + { + "epoch": 19.143575418994413, + "grad_norm": 1.2847801446914673, + "learning_rate": 4.3389355742296915e-05, + "loss": 0.4591, + "step": 34267 + }, + { + "epoch": 19.14413407821229, + "grad_norm": 0.6383705735206604, + "learning_rate": 4.336134453781513e-05, + "loss": 0.4182, + "step": 34268 + }, + { + "epoch": 19.144692737430166, + "grad_norm": 1.0606571435928345, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.4121, + "step": 34269 + }, + { + "epoch": 19.145251396648046, + "grad_norm": 0.6661317944526672, + "learning_rate": 4.330532212885154e-05, + "loss": 0.4344, + "step": 34270 + }, + { + "epoch": 19.145810055865923, + "grad_norm": 0.593712329864502, + "learning_rate": 4.3277310924369746e-05, + "loss": 0.3228, + "step": 34271 + }, + { + "epoch": 19.1463687150838, + "grad_norm": 0.4032175540924072, + "learning_rate": 4.324929971988795e-05, + "loss": 0.3686, + "step": 34272 + }, + { + "epoch": 19.146927374301676, + "grad_norm": 0.49472740292549133, + "learning_rate": 4.3221288515406165e-05, + "loss": 0.4627, + "step": 34273 + }, + { + "epoch": 19.147486033519552, + "grad_norm": 0.87626713514328, + "learning_rate": 4.319327731092437e-05, + "loss": 0.3669, + "step": 34274 + }, + { + "epoch": 19.14804469273743, + "grad_norm": 0.477796345949173, + "learning_rate": 4.316526610644258e-05, + "loss": 0.4801, + "step": 34275 + }, + { + "epoch": 19.14860335195531, + "grad_norm": 0.461201936006546, + "learning_rate": 4.313725490196078e-05, + "loss": 0.3879, + "step": 34276 + }, + { + "epoch": 19.149162011173186, + "grad_norm": 0.6023568511009216, + "learning_rate": 4.310924369747899e-05, + "loss": 0.5359, + "step": 34277 + }, + { + "epoch": 19.149720670391062, + "grad_norm": 0.4684387445449829, + "learning_rate": 4.3081232492997195e-05, + "loss": 0.3828, + "step": 34278 + }, + { + "epoch": 19.15027932960894, + "grad_norm": 0.3617677688598633, + "learning_rate": 4.305322128851541e-05, + "loss": 0.4087, + "step": 34279 + }, + { + "epoch": 19.150837988826815, + "grad_norm": 0.8472923040390015, + "learning_rate": 4.3025210084033614e-05, + "loss": 0.3591, + "step": 34280 + }, + { + "epoch": 19.15139664804469, + "grad_norm": 0.6464107632637024, + "learning_rate": 4.299719887955182e-05, + "loss": 0.3883, + "step": 34281 + }, + { + "epoch": 19.15195530726257, + "grad_norm": 0.4648975431919098, + "learning_rate": 4.2969187675070026e-05, + "loss": 0.3634, + "step": 34282 + }, + { + "epoch": 19.15251396648045, + "grad_norm": 0.41971632838249207, + "learning_rate": 4.294117647058823e-05, + "loss": 0.4138, + "step": 34283 + }, + { + "epoch": 19.153072625698325, + "grad_norm": 0.8614796996116638, + "learning_rate": 4.2913165266106444e-05, + "loss": 0.6349, + "step": 34284 + }, + { + "epoch": 19.1536312849162, + "grad_norm": 0.4292737543582916, + "learning_rate": 4.288515406162465e-05, + "loss": 0.4658, + "step": 34285 + }, + { + "epoch": 19.154189944134078, + "grad_norm": 0.726313054561615, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.5186, + "step": 34286 + }, + { + "epoch": 19.154748603351955, + "grad_norm": 0.5681427121162415, + "learning_rate": 4.282913165266106e-05, + "loss": 0.5047, + "step": 34287 + }, + { + "epoch": 19.15530726256983, + "grad_norm": 0.39771389961242676, + "learning_rate": 4.280112044817927e-05, + "loss": 0.3963, + "step": 34288 + }, + { + "epoch": 19.155865921787708, + "grad_norm": 0.3797587752342224, + "learning_rate": 4.2773109243697474e-05, + "loss": 0.4568, + "step": 34289 + }, + { + "epoch": 19.156424581005588, + "grad_norm": 0.61158686876297, + "learning_rate": 4.274509803921569e-05, + "loss": 0.4238, + "step": 34290 + }, + { + "epoch": 19.156983240223465, + "grad_norm": 2.9981637001037598, + "learning_rate": 4.271708683473389e-05, + "loss": 0.5333, + "step": 34291 + }, + { + "epoch": 19.15754189944134, + "grad_norm": 0.923304557800293, + "learning_rate": 4.26890756302521e-05, + "loss": 0.5107, + "step": 34292 + }, + { + "epoch": 19.158100558659218, + "grad_norm": 2.4630281925201416, + "learning_rate": 4.2661064425770305e-05, + "loss": 0.4966, + "step": 34293 + }, + { + "epoch": 19.158659217877094, + "grad_norm": 0.43496495485305786, + "learning_rate": 4.263305322128851e-05, + "loss": 0.4827, + "step": 34294 + }, + { + "epoch": 19.15921787709497, + "grad_norm": 1.0934908390045166, + "learning_rate": 4.2605042016806724e-05, + "loss": 0.3397, + "step": 34295 + }, + { + "epoch": 19.159776536312847, + "grad_norm": 0.4419431984424591, + "learning_rate": 4.257703081232493e-05, + "loss": 0.5075, + "step": 34296 + }, + { + "epoch": 19.160335195530728, + "grad_norm": 0.6630734801292419, + "learning_rate": 4.2549019607843136e-05, + "loss": 0.4103, + "step": 34297 + }, + { + "epoch": 19.160893854748604, + "grad_norm": 0.7188488245010376, + "learning_rate": 4.252100840336134e-05, + "loss": 0.4085, + "step": 34298 + }, + { + "epoch": 19.16145251396648, + "grad_norm": 1.0252022743225098, + "learning_rate": 4.249299719887955e-05, + "loss": 0.5157, + "step": 34299 + }, + { + "epoch": 19.162011173184357, + "grad_norm": 1.0057063102722168, + "learning_rate": 4.246498599439776e-05, + "loss": 0.4006, + "step": 34300 + }, + { + "epoch": 19.162569832402234, + "grad_norm": 1.642436146736145, + "learning_rate": 4.2436974789915967e-05, + "loss": 0.345, + "step": 34301 + }, + { + "epoch": 19.16312849162011, + "grad_norm": 0.3450249135494232, + "learning_rate": 4.240896358543417e-05, + "loss": 0.3652, + "step": 34302 + }, + { + "epoch": 19.16368715083799, + "grad_norm": 0.3725767135620117, + "learning_rate": 4.238095238095238e-05, + "loss": 0.4096, + "step": 34303 + }, + { + "epoch": 19.164245810055867, + "grad_norm": 1.5294750928878784, + "learning_rate": 4.2352941176470585e-05, + "loss": 0.3936, + "step": 34304 + }, + { + "epoch": 19.164804469273744, + "grad_norm": 0.3487869203090668, + "learning_rate": 4.232492997198879e-05, + "loss": 0.3021, + "step": 34305 + }, + { + "epoch": 19.16536312849162, + "grad_norm": 1.7775439023971558, + "learning_rate": 4.2296918767507e-05, + "loss": 0.5655, + "step": 34306 + }, + { + "epoch": 19.165921787709497, + "grad_norm": 0.40884989500045776, + "learning_rate": 4.226890756302521e-05, + "loss": 0.3999, + "step": 34307 + }, + { + "epoch": 19.166480446927373, + "grad_norm": 0.6474719047546387, + "learning_rate": 4.2240896358543415e-05, + "loss": 0.5398, + "step": 34308 + }, + { + "epoch": 19.16703910614525, + "grad_norm": 0.5420839786529541, + "learning_rate": 4.221288515406162e-05, + "loss": 0.4273, + "step": 34309 + }, + { + "epoch": 19.16759776536313, + "grad_norm": 0.626081109046936, + "learning_rate": 4.218487394957983e-05, + "loss": 0.4102, + "step": 34310 + }, + { + "epoch": 19.168156424581007, + "grad_norm": 0.5016055703163147, + "learning_rate": 4.215686274509804e-05, + "loss": 0.3793, + "step": 34311 + }, + { + "epoch": 19.168715083798883, + "grad_norm": 0.5190970301628113, + "learning_rate": 4.2128851540616246e-05, + "loss": 0.4073, + "step": 34312 + }, + { + "epoch": 19.16927374301676, + "grad_norm": 0.5294607281684875, + "learning_rate": 4.210084033613445e-05, + "loss": 0.3961, + "step": 34313 + }, + { + "epoch": 19.169832402234636, + "grad_norm": 0.5505123138427734, + "learning_rate": 4.207282913165266e-05, + "loss": 0.3457, + "step": 34314 + }, + { + "epoch": 19.170391061452513, + "grad_norm": 0.4130032956600189, + "learning_rate": 4.2044817927170864e-05, + "loss": 0.4335, + "step": 34315 + }, + { + "epoch": 19.17094972067039, + "grad_norm": 0.44169241189956665, + "learning_rate": 4.201680672268908e-05, + "loss": 0.4047, + "step": 34316 + }, + { + "epoch": 19.17150837988827, + "grad_norm": 0.38089585304260254, + "learning_rate": 4.198879551820728e-05, + "loss": 0.4338, + "step": 34317 + }, + { + "epoch": 19.172067039106146, + "grad_norm": 5.8323655128479, + "learning_rate": 4.196078431372549e-05, + "loss": 0.4143, + "step": 34318 + }, + { + "epoch": 19.172625698324023, + "grad_norm": 1.6275320053100586, + "learning_rate": 4.1932773109243695e-05, + "loss": 0.3966, + "step": 34319 + }, + { + "epoch": 19.1731843575419, + "grad_norm": 0.5544790625572205, + "learning_rate": 4.19047619047619e-05, + "loss": 0.3933, + "step": 34320 + }, + { + "epoch": 19.173743016759776, + "grad_norm": 0.3950328826904297, + "learning_rate": 4.187675070028011e-05, + "loss": 0.4861, + "step": 34321 + }, + { + "epoch": 19.174301675977652, + "grad_norm": 0.4213440716266632, + "learning_rate": 4.184873949579832e-05, + "loss": 0.4476, + "step": 34322 + }, + { + "epoch": 19.174860335195532, + "grad_norm": 0.5959659218788147, + "learning_rate": 4.1820728291316526e-05, + "loss": 0.3459, + "step": 34323 + }, + { + "epoch": 19.17541899441341, + "grad_norm": 0.9817525148391724, + "learning_rate": 4.179271708683473e-05, + "loss": 0.442, + "step": 34324 + }, + { + "epoch": 19.175977653631286, + "grad_norm": 0.4581740200519562, + "learning_rate": 4.176470588235294e-05, + "loss": 0.4052, + "step": 34325 + }, + { + "epoch": 19.176536312849162, + "grad_norm": 0.36220991611480713, + "learning_rate": 4.1736694677871144e-05, + "loss": 0.3479, + "step": 34326 + }, + { + "epoch": 19.17709497206704, + "grad_norm": 0.5001604557037354, + "learning_rate": 4.170868347338936e-05, + "loss": 0.4275, + "step": 34327 + }, + { + "epoch": 19.177653631284915, + "grad_norm": 0.5874848365783691, + "learning_rate": 4.168067226890757e-05, + "loss": 0.4385, + "step": 34328 + }, + { + "epoch": 19.178212290502792, + "grad_norm": 0.46055924892425537, + "learning_rate": 4.1652661064425775e-05, + "loss": 0.3793, + "step": 34329 + }, + { + "epoch": 19.178770949720672, + "grad_norm": 0.4592307209968567, + "learning_rate": 4.162464985994398e-05, + "loss": 0.4431, + "step": 34330 + }, + { + "epoch": 19.17932960893855, + "grad_norm": 0.49831610918045044, + "learning_rate": 4.159663865546219e-05, + "loss": 0.4617, + "step": 34331 + }, + { + "epoch": 19.179888268156425, + "grad_norm": 0.4550251066684723, + "learning_rate": 4.156862745098039e-05, + "loss": 0.4514, + "step": 34332 + }, + { + "epoch": 19.1804469273743, + "grad_norm": 1.1201833486557007, + "learning_rate": 4.1540616246498606e-05, + "loss": 0.423, + "step": 34333 + }, + { + "epoch": 19.18100558659218, + "grad_norm": 0.752461850643158, + "learning_rate": 4.151260504201681e-05, + "loss": 0.4612, + "step": 34334 + }, + { + "epoch": 19.181564245810055, + "grad_norm": 0.9002892374992371, + "learning_rate": 4.148459383753502e-05, + "loss": 0.3933, + "step": 34335 + }, + { + "epoch": 19.18212290502793, + "grad_norm": 0.5792464017868042, + "learning_rate": 4.1456582633053224e-05, + "loss": 0.5658, + "step": 34336 + }, + { + "epoch": 19.18268156424581, + "grad_norm": 0.38768136501312256, + "learning_rate": 4.142857142857143e-05, + "loss": 0.3858, + "step": 34337 + }, + { + "epoch": 19.183240223463688, + "grad_norm": 0.36140042543411255, + "learning_rate": 4.140056022408964e-05, + "loss": 0.3579, + "step": 34338 + }, + { + "epoch": 19.183798882681565, + "grad_norm": 1.685960292816162, + "learning_rate": 4.137254901960785e-05, + "loss": 0.512, + "step": 34339 + }, + { + "epoch": 19.18435754189944, + "grad_norm": 0.35322847962379456, + "learning_rate": 4.1344537815126055e-05, + "loss": 0.4037, + "step": 34340 + }, + { + "epoch": 19.184916201117318, + "grad_norm": 0.672893762588501, + "learning_rate": 4.131652661064426e-05, + "loss": 0.4224, + "step": 34341 + }, + { + "epoch": 19.185474860335194, + "grad_norm": 0.5783244967460632, + "learning_rate": 4.1288515406162467e-05, + "loss": 0.4676, + "step": 34342 + }, + { + "epoch": 19.18603351955307, + "grad_norm": 6.282768249511719, + "learning_rate": 4.126050420168068e-05, + "loss": 0.3382, + "step": 34343 + }, + { + "epoch": 19.18659217877095, + "grad_norm": 0.3705224096775055, + "learning_rate": 4.1232492997198885e-05, + "loss": 0.3064, + "step": 34344 + }, + { + "epoch": 19.187150837988828, + "grad_norm": 0.4761348366737366, + "learning_rate": 4.120448179271709e-05, + "loss": 0.4091, + "step": 34345 + }, + { + "epoch": 19.187709497206704, + "grad_norm": 1.8969731330871582, + "learning_rate": 4.11764705882353e-05, + "loss": 0.3728, + "step": 34346 + }, + { + "epoch": 19.18826815642458, + "grad_norm": 0.5123389959335327, + "learning_rate": 4.11484593837535e-05, + "loss": 0.4134, + "step": 34347 + }, + { + "epoch": 19.188826815642457, + "grad_norm": 0.8245759606361389, + "learning_rate": 4.112044817927171e-05, + "loss": 0.5437, + "step": 34348 + }, + { + "epoch": 19.189385474860334, + "grad_norm": 0.41834890842437744, + "learning_rate": 4.109243697478992e-05, + "loss": 0.3773, + "step": 34349 + }, + { + "epoch": 19.189944134078214, + "grad_norm": 0.5005658268928528, + "learning_rate": 4.106442577030813e-05, + "loss": 0.338, + "step": 34350 + }, + { + "epoch": 19.19050279329609, + "grad_norm": 1.4807153940200806, + "learning_rate": 4.1036414565826334e-05, + "loss": 0.5031, + "step": 34351 + }, + { + "epoch": 19.191061452513967, + "grad_norm": 0.5110713839530945, + "learning_rate": 4.100840336134454e-05, + "loss": 0.3326, + "step": 34352 + }, + { + "epoch": 19.191620111731844, + "grad_norm": 2.1174850463867188, + "learning_rate": 4.0980392156862746e-05, + "loss": 0.3338, + "step": 34353 + }, + { + "epoch": 19.19217877094972, + "grad_norm": 0.8628860712051392, + "learning_rate": 4.095238095238096e-05, + "loss": 0.3338, + "step": 34354 + }, + { + "epoch": 19.192737430167597, + "grad_norm": 0.3803478479385376, + "learning_rate": 4.0924369747899165e-05, + "loss": 0.3605, + "step": 34355 + }, + { + "epoch": 19.193296089385473, + "grad_norm": 0.4083240330219269, + "learning_rate": 4.089635854341737e-05, + "loss": 0.3814, + "step": 34356 + }, + { + "epoch": 19.193854748603353, + "grad_norm": 1.269036054611206, + "learning_rate": 4.086834733893558e-05, + "loss": 0.4311, + "step": 34357 + }, + { + "epoch": 19.19441340782123, + "grad_norm": 0.5924358367919922, + "learning_rate": 4.084033613445378e-05, + "loss": 0.3645, + "step": 34358 + }, + { + "epoch": 19.194972067039107, + "grad_norm": 0.6092901229858398, + "learning_rate": 4.081232492997199e-05, + "loss": 0.4134, + "step": 34359 + }, + { + "epoch": 19.195530726256983, + "grad_norm": 0.5444338917732239, + "learning_rate": 4.07843137254902e-05, + "loss": 0.4707, + "step": 34360 + }, + { + "epoch": 19.19608938547486, + "grad_norm": 0.5880433320999146, + "learning_rate": 4.075630252100841e-05, + "loss": 0.3255, + "step": 34361 + }, + { + "epoch": 19.196648044692736, + "grad_norm": 0.3997005820274353, + "learning_rate": 4.0728291316526614e-05, + "loss": 0.443, + "step": 34362 + }, + { + "epoch": 19.197206703910613, + "grad_norm": 0.4177122712135315, + "learning_rate": 4.070028011204482e-05, + "loss": 0.4718, + "step": 34363 + }, + { + "epoch": 19.197765363128493, + "grad_norm": 0.25567135214805603, + "learning_rate": 4.0672268907563026e-05, + "loss": 0.3029, + "step": 34364 + }, + { + "epoch": 19.19832402234637, + "grad_norm": 0.7818362712860107, + "learning_rate": 4.064425770308124e-05, + "loss": 0.4759, + "step": 34365 + }, + { + "epoch": 19.198882681564246, + "grad_norm": 1.0677684545516968, + "learning_rate": 4.0616246498599444e-05, + "loss": 0.3505, + "step": 34366 + }, + { + "epoch": 19.199441340782123, + "grad_norm": 0.4286250174045563, + "learning_rate": 4.058823529411765e-05, + "loss": 0.4432, + "step": 34367 + }, + { + "epoch": 19.2, + "grad_norm": 0.4016687273979187, + "learning_rate": 4.0560224089635856e-05, + "loss": 0.3748, + "step": 34368 + }, + { + "epoch": 19.200558659217876, + "grad_norm": 0.8131793141365051, + "learning_rate": 4.053221288515406e-05, + "loss": 0.3686, + "step": 34369 + }, + { + "epoch": 19.201117318435756, + "grad_norm": 0.3354417681694031, + "learning_rate": 4.0504201680672275e-05, + "loss": 0.3442, + "step": 34370 + }, + { + "epoch": 19.201675977653633, + "grad_norm": 3.6068902015686035, + "learning_rate": 4.047619047619048e-05, + "loss": 0.3728, + "step": 34371 + }, + { + "epoch": 19.20223463687151, + "grad_norm": 0.5244073867797852, + "learning_rate": 4.044817927170869e-05, + "loss": 0.4244, + "step": 34372 + }, + { + "epoch": 19.202793296089386, + "grad_norm": 0.38286206126213074, + "learning_rate": 4.042016806722689e-05, + "loss": 0.4356, + "step": 34373 + }, + { + "epoch": 19.203351955307262, + "grad_norm": 1.1345473527908325, + "learning_rate": 4.03921568627451e-05, + "loss": 0.3763, + "step": 34374 + }, + { + "epoch": 19.20391061452514, + "grad_norm": 0.6460477709770203, + "learning_rate": 4.0364145658263305e-05, + "loss": 0.4427, + "step": 34375 + }, + { + "epoch": 19.204469273743015, + "grad_norm": 0.5855938792228699, + "learning_rate": 4.033613445378152e-05, + "loss": 0.3745, + "step": 34376 + }, + { + "epoch": 19.205027932960895, + "grad_norm": 0.47824084758758545, + "learning_rate": 4.0308123249299724e-05, + "loss": 0.3633, + "step": 34377 + }, + { + "epoch": 19.205586592178772, + "grad_norm": 0.4581983983516693, + "learning_rate": 4.028011204481793e-05, + "loss": 0.4907, + "step": 34378 + }, + { + "epoch": 19.20614525139665, + "grad_norm": 0.9827057719230652, + "learning_rate": 4.0252100840336136e-05, + "loss": 0.5121, + "step": 34379 + }, + { + "epoch": 19.206703910614525, + "grad_norm": 0.5318775177001953, + "learning_rate": 4.022408963585434e-05, + "loss": 0.3904, + "step": 34380 + }, + { + "epoch": 19.2072625698324, + "grad_norm": 0.31430310010910034, + "learning_rate": 4.0196078431372555e-05, + "loss": 0.3172, + "step": 34381 + }, + { + "epoch": 19.20782122905028, + "grad_norm": 0.44189882278442383, + "learning_rate": 4.016806722689076e-05, + "loss": 0.4017, + "step": 34382 + }, + { + "epoch": 19.208379888268155, + "grad_norm": 0.6885204315185547, + "learning_rate": 4.0140056022408967e-05, + "loss": 0.4175, + "step": 34383 + }, + { + "epoch": 19.208938547486035, + "grad_norm": 0.4150227904319763, + "learning_rate": 4.011204481792717e-05, + "loss": 0.471, + "step": 34384 + }, + { + "epoch": 19.20949720670391, + "grad_norm": 4.07744026184082, + "learning_rate": 4.008403361344538e-05, + "loss": 0.4057, + "step": 34385 + }, + { + "epoch": 19.210055865921788, + "grad_norm": 0.3927232027053833, + "learning_rate": 4.005602240896359e-05, + "loss": 0.3981, + "step": 34386 + }, + { + "epoch": 19.210614525139665, + "grad_norm": 0.4456595778465271, + "learning_rate": 4.00280112044818e-05, + "loss": 0.4037, + "step": 34387 + }, + { + "epoch": 19.21117318435754, + "grad_norm": 0.44463491439819336, + "learning_rate": 4e-05, + "loss": 0.4162, + "step": 34388 + }, + { + "epoch": 19.211731843575418, + "grad_norm": 0.5158103108406067, + "learning_rate": 3.997198879551821e-05, + "loss": 0.4227, + "step": 34389 + }, + { + "epoch": 19.212290502793294, + "grad_norm": 0.4529596269130707, + "learning_rate": 3.9943977591036415e-05, + "loss": 0.5055, + "step": 34390 + }, + { + "epoch": 19.212849162011175, + "grad_norm": 0.4801010191440582, + "learning_rate": 3.991596638655462e-05, + "loss": 0.2862, + "step": 34391 + }, + { + "epoch": 19.21340782122905, + "grad_norm": 0.37588655948638916, + "learning_rate": 3.9887955182072834e-05, + "loss": 0.3313, + "step": 34392 + }, + { + "epoch": 19.213966480446928, + "grad_norm": 0.5175050497055054, + "learning_rate": 3.985994397759104e-05, + "loss": 0.3854, + "step": 34393 + }, + { + "epoch": 19.214525139664804, + "grad_norm": 0.39390328526496887, + "learning_rate": 3.9831932773109246e-05, + "loss": 0.3752, + "step": 34394 + }, + { + "epoch": 19.21508379888268, + "grad_norm": 0.34324905276298523, + "learning_rate": 3.980392156862745e-05, + "loss": 0.3163, + "step": 34395 + }, + { + "epoch": 19.215642458100557, + "grad_norm": 0.3557980954647064, + "learning_rate": 3.977591036414566e-05, + "loss": 0.4621, + "step": 34396 + }, + { + "epoch": 19.216201117318437, + "grad_norm": 0.3141343593597412, + "learning_rate": 3.974789915966387e-05, + "loss": 0.3152, + "step": 34397 + }, + { + "epoch": 19.216759776536314, + "grad_norm": 0.36761367321014404, + "learning_rate": 3.971988795518208e-05, + "loss": 0.3269, + "step": 34398 + }, + { + "epoch": 19.21731843575419, + "grad_norm": 0.9165186285972595, + "learning_rate": 3.969187675070028e-05, + "loss": 0.3777, + "step": 34399 + }, + { + "epoch": 19.217877094972067, + "grad_norm": 9.963189125061035, + "learning_rate": 3.966386554621849e-05, + "loss": 0.4152, + "step": 34400 + }, + { + "epoch": 19.218435754189944, + "grad_norm": 4.866195201873779, + "learning_rate": 3.9635854341736695e-05, + "loss": 0.6291, + "step": 34401 + }, + { + "epoch": 19.21899441340782, + "grad_norm": 0.48058149218559265, + "learning_rate": 3.96078431372549e-05, + "loss": 0.339, + "step": 34402 + }, + { + "epoch": 19.219553072625697, + "grad_norm": 0.41713157296180725, + "learning_rate": 3.9579831932773114e-05, + "loss": 0.4617, + "step": 34403 + }, + { + "epoch": 19.220111731843577, + "grad_norm": 0.3184274733066559, + "learning_rate": 3.955182072829132e-05, + "loss": 0.3874, + "step": 34404 + }, + { + "epoch": 19.220670391061454, + "grad_norm": 0.4267182946205139, + "learning_rate": 3.9523809523809526e-05, + "loss": 0.3292, + "step": 34405 + }, + { + "epoch": 19.22122905027933, + "grad_norm": 0.3267779052257538, + "learning_rate": 3.949579831932773e-05, + "loss": 0.2746, + "step": 34406 + }, + { + "epoch": 19.221787709497207, + "grad_norm": 0.8905510306358337, + "learning_rate": 3.946778711484594e-05, + "loss": 0.4621, + "step": 34407 + }, + { + "epoch": 19.222346368715083, + "grad_norm": 3.1319580078125, + "learning_rate": 3.943977591036415e-05, + "loss": 0.3476, + "step": 34408 + }, + { + "epoch": 19.22290502793296, + "grad_norm": 0.4703458547592163, + "learning_rate": 3.9411764705882356e-05, + "loss": 0.3801, + "step": 34409 + }, + { + "epoch": 19.223463687150836, + "grad_norm": 0.3702363073825836, + "learning_rate": 3.938375350140056e-05, + "loss": 0.3417, + "step": 34410 + }, + { + "epoch": 19.224022346368717, + "grad_norm": 0.5029980540275574, + "learning_rate": 3.935574229691877e-05, + "loss": 0.4235, + "step": 34411 + }, + { + "epoch": 19.224581005586593, + "grad_norm": 0.42715221643447876, + "learning_rate": 3.9327731092436974e-05, + "loss": 0.4443, + "step": 34412 + }, + { + "epoch": 19.22513966480447, + "grad_norm": 0.8617134094238281, + "learning_rate": 3.929971988795519e-05, + "loss": 0.4453, + "step": 34413 + }, + { + "epoch": 19.225698324022346, + "grad_norm": 0.338504433631897, + "learning_rate": 3.927170868347339e-05, + "loss": 0.3728, + "step": 34414 + }, + { + "epoch": 19.226256983240223, + "grad_norm": 0.643187403678894, + "learning_rate": 3.92436974789916e-05, + "loss": 0.6016, + "step": 34415 + }, + { + "epoch": 19.2268156424581, + "grad_norm": 0.35219722986221313, + "learning_rate": 3.9215686274509805e-05, + "loss": 0.2796, + "step": 34416 + }, + { + "epoch": 19.227374301675976, + "grad_norm": 0.8584226369857788, + "learning_rate": 3.918767507002801e-05, + "loss": 0.3179, + "step": 34417 + }, + { + "epoch": 19.227932960893856, + "grad_norm": 0.4579952359199524, + "learning_rate": 3.915966386554622e-05, + "loss": 0.4259, + "step": 34418 + }, + { + "epoch": 19.228491620111733, + "grad_norm": 0.4399353861808777, + "learning_rate": 3.913165266106443e-05, + "loss": 0.3223, + "step": 34419 + }, + { + "epoch": 19.22905027932961, + "grad_norm": 0.5349218845367432, + "learning_rate": 3.9103641456582636e-05, + "loss": 0.4382, + "step": 34420 + }, + { + "epoch": 19.229608938547486, + "grad_norm": 0.4383047819137573, + "learning_rate": 3.907563025210084e-05, + "loss": 0.3807, + "step": 34421 + }, + { + "epoch": 19.230167597765362, + "grad_norm": 0.3751812279224396, + "learning_rate": 3.904761904761905e-05, + "loss": 0.3856, + "step": 34422 + }, + { + "epoch": 19.23072625698324, + "grad_norm": 0.6546064019203186, + "learning_rate": 3.9019607843137254e-05, + "loss": 0.4378, + "step": 34423 + }, + { + "epoch": 19.23128491620112, + "grad_norm": 0.4113582372665405, + "learning_rate": 3.8991596638655467e-05, + "loss": 0.3533, + "step": 34424 + }, + { + "epoch": 19.231843575418996, + "grad_norm": 0.31873294711112976, + "learning_rate": 3.896358543417367e-05, + "loss": 0.3057, + "step": 34425 + }, + { + "epoch": 19.232402234636872, + "grad_norm": 2.620142936706543, + "learning_rate": 3.893557422969188e-05, + "loss": 0.3723, + "step": 34426 + }, + { + "epoch": 19.23296089385475, + "grad_norm": 0.493437796831131, + "learning_rate": 3.8907563025210084e-05, + "loss": 0.3218, + "step": 34427 + }, + { + "epoch": 19.233519553072625, + "grad_norm": 0.4244549870491028, + "learning_rate": 3.887955182072829e-05, + "loss": 0.4358, + "step": 34428 + }, + { + "epoch": 19.234078212290502, + "grad_norm": 0.4533943831920624, + "learning_rate": 3.8851540616246496e-05, + "loss": 0.4536, + "step": 34429 + }, + { + "epoch": 19.23463687150838, + "grad_norm": 0.4940558969974518, + "learning_rate": 3.882352941176471e-05, + "loss": 0.418, + "step": 34430 + }, + { + "epoch": 19.23519553072626, + "grad_norm": 1.029957890510559, + "learning_rate": 3.8795518207282915e-05, + "loss": 0.3927, + "step": 34431 + }, + { + "epoch": 19.235754189944135, + "grad_norm": 0.5873790979385376, + "learning_rate": 3.876750700280112e-05, + "loss": 0.4747, + "step": 34432 + }, + { + "epoch": 19.23631284916201, + "grad_norm": 2.361293077468872, + "learning_rate": 3.873949579831933e-05, + "loss": 0.3537, + "step": 34433 + }, + { + "epoch": 19.23687150837989, + "grad_norm": 2.3189992904663086, + "learning_rate": 3.871148459383753e-05, + "loss": 0.3931, + "step": 34434 + }, + { + "epoch": 19.237430167597765, + "grad_norm": 0.4077621400356293, + "learning_rate": 3.8683473389355746e-05, + "loss": 0.3609, + "step": 34435 + }, + { + "epoch": 19.23798882681564, + "grad_norm": 1.0960241556167603, + "learning_rate": 3.865546218487395e-05, + "loss": 0.4345, + "step": 34436 + }, + { + "epoch": 19.238547486033518, + "grad_norm": 0.38314005732536316, + "learning_rate": 3.862745098039216e-05, + "loss": 0.4599, + "step": 34437 + }, + { + "epoch": 19.239106145251398, + "grad_norm": 0.7055360674858093, + "learning_rate": 3.8599439775910364e-05, + "loss": 0.4675, + "step": 34438 + }, + { + "epoch": 19.239664804469275, + "grad_norm": 0.6184757947921753, + "learning_rate": 3.857142857142857e-05, + "loss": 0.3714, + "step": 34439 + }, + { + "epoch": 19.24022346368715, + "grad_norm": 0.611702024936676, + "learning_rate": 3.854341736694678e-05, + "loss": 0.3398, + "step": 34440 + }, + { + "epoch": 19.240782122905028, + "grad_norm": 0.37282654643058777, + "learning_rate": 3.851540616246499e-05, + "loss": 0.412, + "step": 34441 + }, + { + "epoch": 19.241340782122904, + "grad_norm": 0.5059308409690857, + "learning_rate": 3.8487394957983195e-05, + "loss": 0.4693, + "step": 34442 + }, + { + "epoch": 19.24189944134078, + "grad_norm": 0.5726853609085083, + "learning_rate": 3.84593837535014e-05, + "loss": 0.393, + "step": 34443 + }, + { + "epoch": 19.242458100558657, + "grad_norm": 0.6580373644828796, + "learning_rate": 3.843137254901961e-05, + "loss": 0.4918, + "step": 34444 + }, + { + "epoch": 19.243016759776538, + "grad_norm": 0.3653848171234131, + "learning_rate": 3.840336134453781e-05, + "loss": 0.3033, + "step": 34445 + }, + { + "epoch": 19.243575418994414, + "grad_norm": 0.5073138475418091, + "learning_rate": 3.8375350140056026e-05, + "loss": 0.4398, + "step": 34446 + }, + { + "epoch": 19.24413407821229, + "grad_norm": 0.4576285183429718, + "learning_rate": 3.834733893557423e-05, + "loss": 0.4018, + "step": 34447 + }, + { + "epoch": 19.244692737430167, + "grad_norm": 0.4962644875049591, + "learning_rate": 3.831932773109244e-05, + "loss": 0.3701, + "step": 34448 + }, + { + "epoch": 19.245251396648044, + "grad_norm": 0.3840586841106415, + "learning_rate": 3.8291316526610643e-05, + "loss": 0.3993, + "step": 34449 + }, + { + "epoch": 19.24581005586592, + "grad_norm": 0.3287234306335449, + "learning_rate": 3.826330532212885e-05, + "loss": 0.3254, + "step": 34450 + }, + { + "epoch": 19.2463687150838, + "grad_norm": 0.37090054154396057, + "learning_rate": 3.823529411764706e-05, + "loss": 0.2896, + "step": 34451 + }, + { + "epoch": 19.246927374301677, + "grad_norm": 0.7580893635749817, + "learning_rate": 3.820728291316527e-05, + "loss": 0.3594, + "step": 34452 + }, + { + "epoch": 19.247486033519554, + "grad_norm": 0.679252028465271, + "learning_rate": 3.8179271708683474e-05, + "loss": 0.3687, + "step": 34453 + }, + { + "epoch": 19.24804469273743, + "grad_norm": 0.6639065146446228, + "learning_rate": 3.815126050420168e-05, + "loss": 0.2488, + "step": 34454 + }, + { + "epoch": 19.248603351955307, + "grad_norm": 1.4326963424682617, + "learning_rate": 3.8123249299719886e-05, + "loss": 0.3029, + "step": 34455 + }, + { + "epoch": 19.249162011173183, + "grad_norm": 0.39584365487098694, + "learning_rate": 3.80952380952381e-05, + "loss": 0.4003, + "step": 34456 + }, + { + "epoch": 19.24972067039106, + "grad_norm": 0.36156165599823, + "learning_rate": 3.8067226890756305e-05, + "loss": 0.248, + "step": 34457 + }, + { + "epoch": 19.25027932960894, + "grad_norm": 3.6178572177886963, + "learning_rate": 3.803921568627451e-05, + "loss": 0.3861, + "step": 34458 + }, + { + "epoch": 19.250837988826817, + "grad_norm": 0.5734772086143494, + "learning_rate": 3.801120448179272e-05, + "loss": 0.3264, + "step": 34459 + }, + { + "epoch": 19.251396648044693, + "grad_norm": 0.5030843019485474, + "learning_rate": 3.798319327731092e-05, + "loss": 0.485, + "step": 34460 + }, + { + "epoch": 19.25195530726257, + "grad_norm": 0.5218889713287354, + "learning_rate": 3.795518207282913e-05, + "loss": 0.3418, + "step": 34461 + }, + { + "epoch": 19.252513966480446, + "grad_norm": 0.3216029107570648, + "learning_rate": 3.792717086834734e-05, + "loss": 0.2891, + "step": 34462 + }, + { + "epoch": 19.253072625698323, + "grad_norm": 0.6246047019958496, + "learning_rate": 3.789915966386555e-05, + "loss": 0.436, + "step": 34463 + }, + { + "epoch": 19.2536312849162, + "grad_norm": 0.6183018684387207, + "learning_rate": 3.7871148459383754e-05, + "loss": 0.4096, + "step": 34464 + }, + { + "epoch": 19.25418994413408, + "grad_norm": 0.406149685382843, + "learning_rate": 3.784313725490196e-05, + "loss": 0.4364, + "step": 34465 + }, + { + "epoch": 19.254748603351956, + "grad_norm": 0.5009737014770508, + "learning_rate": 3.7815126050420166e-05, + "loss": 0.4631, + "step": 34466 + }, + { + "epoch": 19.255307262569833, + "grad_norm": 0.803261399269104, + "learning_rate": 3.778711484593838e-05, + "loss": 0.4859, + "step": 34467 + }, + { + "epoch": 19.25586592178771, + "grad_norm": 0.40794411301612854, + "learning_rate": 3.7759103641456584e-05, + "loss": 0.3855, + "step": 34468 + }, + { + "epoch": 19.256424581005586, + "grad_norm": 0.42702820897102356, + "learning_rate": 3.773109243697479e-05, + "loss": 0.3593, + "step": 34469 + }, + { + "epoch": 19.256983240223462, + "grad_norm": 0.43986064195632935, + "learning_rate": 3.7703081232492996e-05, + "loss": 0.4934, + "step": 34470 + }, + { + "epoch": 19.257541899441343, + "grad_norm": 0.46019378304481506, + "learning_rate": 3.76750700280112e-05, + "loss": 0.4586, + "step": 34471 + }, + { + "epoch": 19.25810055865922, + "grad_norm": 0.5878046751022339, + "learning_rate": 3.764705882352941e-05, + "loss": 0.5377, + "step": 34472 + }, + { + "epoch": 19.258659217877096, + "grad_norm": 0.4518672525882721, + "learning_rate": 3.761904761904762e-05, + "loss": 0.4622, + "step": 34473 + }, + { + "epoch": 19.259217877094972, + "grad_norm": 0.3877251446247101, + "learning_rate": 3.759103641456583e-05, + "loss": 0.4393, + "step": 34474 + }, + { + "epoch": 19.25977653631285, + "grad_norm": 0.33077797293663025, + "learning_rate": 3.756302521008403e-05, + "loss": 0.4212, + "step": 34475 + }, + { + "epoch": 19.260335195530725, + "grad_norm": 3.9410672187805176, + "learning_rate": 3.753501400560224e-05, + "loss": 0.3889, + "step": 34476 + }, + { + "epoch": 19.260893854748602, + "grad_norm": 0.3530593514442444, + "learning_rate": 3.7507002801120445e-05, + "loss": 0.4301, + "step": 34477 + }, + { + "epoch": 19.261452513966482, + "grad_norm": 0.29086387157440186, + "learning_rate": 3.747899159663866e-05, + "loss": 0.3044, + "step": 34478 + }, + { + "epoch": 19.26201117318436, + "grad_norm": 0.5265844464302063, + "learning_rate": 3.7450980392156864e-05, + "loss": 0.3984, + "step": 34479 + }, + { + "epoch": 19.262569832402235, + "grad_norm": 1.8132120370864868, + "learning_rate": 3.742296918767507e-05, + "loss": 0.5091, + "step": 34480 + }, + { + "epoch": 19.26312849162011, + "grad_norm": 0.6871200799942017, + "learning_rate": 3.7394957983193276e-05, + "loss": 0.4322, + "step": 34481 + }, + { + "epoch": 19.26368715083799, + "grad_norm": 4.2410712242126465, + "learning_rate": 3.736694677871148e-05, + "loss": 0.3524, + "step": 34482 + }, + { + "epoch": 19.264245810055865, + "grad_norm": 0.3089617192745209, + "learning_rate": 3.7338935574229695e-05, + "loss": 0.3252, + "step": 34483 + }, + { + "epoch": 19.26480446927374, + "grad_norm": 0.5155884027481079, + "learning_rate": 3.73109243697479e-05, + "loss": 0.5773, + "step": 34484 + }, + { + "epoch": 19.26536312849162, + "grad_norm": 0.459256112575531, + "learning_rate": 3.728291316526611e-05, + "loss": 0.4601, + "step": 34485 + }, + { + "epoch": 19.265921787709498, + "grad_norm": 0.5264071226119995, + "learning_rate": 3.725490196078431e-05, + "loss": 0.4621, + "step": 34486 + }, + { + "epoch": 19.266480446927375, + "grad_norm": 0.2996077239513397, + "learning_rate": 3.722689075630252e-05, + "loss": 0.2895, + "step": 34487 + }, + { + "epoch": 19.26703910614525, + "grad_norm": 0.5657638907432556, + "learning_rate": 3.7198879551820725e-05, + "loss": 0.3957, + "step": 34488 + }, + { + "epoch": 19.267597765363128, + "grad_norm": 0.4529856741428375, + "learning_rate": 3.717086834733894e-05, + "loss": 0.3525, + "step": 34489 + }, + { + "epoch": 19.268156424581004, + "grad_norm": 0.43867769837379456, + "learning_rate": 3.7142857142857143e-05, + "loss": 0.4109, + "step": 34490 + }, + { + "epoch": 19.26871508379888, + "grad_norm": 2.437505006790161, + "learning_rate": 3.711484593837535e-05, + "loss": 0.3627, + "step": 34491 + }, + { + "epoch": 19.26927374301676, + "grad_norm": 0.4841361343860626, + "learning_rate": 3.7086834733893555e-05, + "loss": 0.3623, + "step": 34492 + }, + { + "epoch": 19.269832402234638, + "grad_norm": 0.5400592088699341, + "learning_rate": 3.705882352941176e-05, + "loss": 0.3654, + "step": 34493 + }, + { + "epoch": 19.270391061452514, + "grad_norm": 1.1143990755081177, + "learning_rate": 3.7030812324929974e-05, + "loss": 0.4004, + "step": 34494 + }, + { + "epoch": 19.27094972067039, + "grad_norm": 0.37151867151260376, + "learning_rate": 3.700280112044818e-05, + "loss": 0.4146, + "step": 34495 + }, + { + "epoch": 19.271508379888267, + "grad_norm": 0.320733904838562, + "learning_rate": 3.6974789915966386e-05, + "loss": 0.2426, + "step": 34496 + }, + { + "epoch": 19.272067039106144, + "grad_norm": 0.4429011344909668, + "learning_rate": 3.694677871148459e-05, + "loss": 0.3922, + "step": 34497 + }, + { + "epoch": 19.272625698324024, + "grad_norm": 3.2030532360076904, + "learning_rate": 3.69187675070028e-05, + "loss": 0.446, + "step": 34498 + }, + { + "epoch": 19.2731843575419, + "grad_norm": 0.5493384599685669, + "learning_rate": 3.6890756302521004e-05, + "loss": 0.3944, + "step": 34499 + }, + { + "epoch": 19.273743016759777, + "grad_norm": 0.5305752158164978, + "learning_rate": 3.686274509803922e-05, + "loss": 0.3661, + "step": 34500 + }, + { + "epoch": 19.273743016759777, + "eval_cer": 0.08440531133732665, + "eval_loss": 0.31948575377464294, + "eval_runtime": 58.7391, + "eval_samples_per_second": 77.257, + "eval_steps_per_second": 4.835, + "eval_wer": 0.3343482025641742, + "step": 34500 + }, + { + "epoch": 19.274301675977654, + "grad_norm": 0.43951746821403503, + "learning_rate": 3.683473389355742e-05, + "loss": 0.4555, + "step": 34501 + }, + { + "epoch": 19.27486033519553, + "grad_norm": 0.5666416883468628, + "learning_rate": 3.680672268907563e-05, + "loss": 0.4083, + "step": 34502 + }, + { + "epoch": 19.275418994413407, + "grad_norm": 0.3738899827003479, + "learning_rate": 3.6778711484593835e-05, + "loss": 0.3931, + "step": 34503 + }, + { + "epoch": 19.275977653631283, + "grad_norm": 0.8670925498008728, + "learning_rate": 3.675070028011204e-05, + "loss": 0.3392, + "step": 34504 + }, + { + "epoch": 19.276536312849164, + "grad_norm": 0.3728310465812683, + "learning_rate": 3.6722689075630254e-05, + "loss": 0.4187, + "step": 34505 + }, + { + "epoch": 19.27709497206704, + "grad_norm": 0.6954748034477234, + "learning_rate": 3.669467787114846e-05, + "loss": 0.4187, + "step": 34506 + }, + { + "epoch": 19.277653631284917, + "grad_norm": 0.4802590012550354, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.3605, + "step": 34507 + }, + { + "epoch": 19.278212290502793, + "grad_norm": 0.5010871887207031, + "learning_rate": 3.663865546218487e-05, + "loss": 0.449, + "step": 34508 + }, + { + "epoch": 19.27877094972067, + "grad_norm": 0.9805195331573486, + "learning_rate": 3.661064425770308e-05, + "loss": 0.3888, + "step": 34509 + }, + { + "epoch": 19.279329608938546, + "grad_norm": 0.35273832082748413, + "learning_rate": 3.658263305322129e-05, + "loss": 0.322, + "step": 34510 + }, + { + "epoch": 19.279888268156423, + "grad_norm": 0.7681180834770203, + "learning_rate": 3.6554621848739496e-05, + "loss": 0.4689, + "step": 34511 + }, + { + "epoch": 19.280446927374303, + "grad_norm": 2.174262046813965, + "learning_rate": 3.65266106442577e-05, + "loss": 0.4476, + "step": 34512 + }, + { + "epoch": 19.28100558659218, + "grad_norm": 0.5720050930976868, + "learning_rate": 3.649859943977591e-05, + "loss": 0.3996, + "step": 34513 + }, + { + "epoch": 19.281564245810056, + "grad_norm": 0.4021025598049164, + "learning_rate": 3.6470588235294114e-05, + "loss": 0.4229, + "step": 34514 + }, + { + "epoch": 19.282122905027933, + "grad_norm": 0.37996503710746765, + "learning_rate": 3.644257703081232e-05, + "loss": 0.3358, + "step": 34515 + }, + { + "epoch": 19.28268156424581, + "grad_norm": 0.36724987626075745, + "learning_rate": 3.641456582633053e-05, + "loss": 0.3909, + "step": 34516 + }, + { + "epoch": 19.283240223463686, + "grad_norm": 0.3688671886920929, + "learning_rate": 3.638655462184874e-05, + "loss": 0.3492, + "step": 34517 + }, + { + "epoch": 19.283798882681563, + "grad_norm": 0.7051141262054443, + "learning_rate": 3.6358543417366945e-05, + "loss": 0.7063, + "step": 34518 + }, + { + "epoch": 19.284357541899443, + "grad_norm": 0.47929301857948303, + "learning_rate": 3.633053221288515e-05, + "loss": 0.3617, + "step": 34519 + }, + { + "epoch": 19.28491620111732, + "grad_norm": 0.49092692136764526, + "learning_rate": 3.630252100840336e-05, + "loss": 0.3148, + "step": 34520 + }, + { + "epoch": 19.285474860335196, + "grad_norm": 0.6783444285392761, + "learning_rate": 3.627450980392157e-05, + "loss": 0.583, + "step": 34521 + }, + { + "epoch": 19.286033519553072, + "grad_norm": 0.447298139333725, + "learning_rate": 3.6246498599439776e-05, + "loss": 0.4116, + "step": 34522 + }, + { + "epoch": 19.28659217877095, + "grad_norm": 2.8305201530456543, + "learning_rate": 3.621848739495798e-05, + "loss": 0.3925, + "step": 34523 + }, + { + "epoch": 19.287150837988825, + "grad_norm": 0.45870012044906616, + "learning_rate": 3.619047619047619e-05, + "loss": 0.3785, + "step": 34524 + }, + { + "epoch": 19.287709497206706, + "grad_norm": 0.41788142919540405, + "learning_rate": 3.6162464985994394e-05, + "loss": 0.3599, + "step": 34525 + }, + { + "epoch": 19.288268156424582, + "grad_norm": 0.4122820794582367, + "learning_rate": 3.613445378151261e-05, + "loss": 0.4336, + "step": 34526 + }, + { + "epoch": 19.28882681564246, + "grad_norm": 0.709364116191864, + "learning_rate": 3.610644257703081e-05, + "loss": 0.3704, + "step": 34527 + }, + { + "epoch": 19.289385474860335, + "grad_norm": 0.37942859530448914, + "learning_rate": 3.607843137254902e-05, + "loss": 0.4096, + "step": 34528 + }, + { + "epoch": 19.289944134078212, + "grad_norm": 0.6213984489440918, + "learning_rate": 3.6050420168067225e-05, + "loss": 0.4096, + "step": 34529 + }, + { + "epoch": 19.29050279329609, + "grad_norm": 0.4581393599510193, + "learning_rate": 3.602240896358543e-05, + "loss": 0.4387, + "step": 34530 + }, + { + "epoch": 19.291061452513965, + "grad_norm": 0.41568735241889954, + "learning_rate": 3.599439775910364e-05, + "loss": 0.4137, + "step": 34531 + }, + { + "epoch": 19.291620111731845, + "grad_norm": 0.3643159866333008, + "learning_rate": 3.596638655462185e-05, + "loss": 0.4055, + "step": 34532 + }, + { + "epoch": 19.29217877094972, + "grad_norm": 1.5516700744628906, + "learning_rate": 3.5938375350140055e-05, + "loss": 0.5469, + "step": 34533 + }, + { + "epoch": 19.2927374301676, + "grad_norm": 0.3524828255176544, + "learning_rate": 3.591036414565826e-05, + "loss": 0.3719, + "step": 34534 + }, + { + "epoch": 19.293296089385475, + "grad_norm": 0.5035155415534973, + "learning_rate": 3.588235294117647e-05, + "loss": 0.4485, + "step": 34535 + }, + { + "epoch": 19.29385474860335, + "grad_norm": 0.6188202500343323, + "learning_rate": 3.5854341736694673e-05, + "loss": 0.4273, + "step": 34536 + }, + { + "epoch": 19.294413407821228, + "grad_norm": 0.4884275496006012, + "learning_rate": 3.5826330532212886e-05, + "loss": 0.3879, + "step": 34537 + }, + { + "epoch": 19.294972067039105, + "grad_norm": 0.5118287205696106, + "learning_rate": 3.579831932773109e-05, + "loss": 0.3497, + "step": 34538 + }, + { + "epoch": 19.295530726256985, + "grad_norm": 1.9995648860931396, + "learning_rate": 3.57703081232493e-05, + "loss": 0.2845, + "step": 34539 + }, + { + "epoch": 19.29608938547486, + "grad_norm": 0.8530018925666809, + "learning_rate": 3.5742296918767504e-05, + "loss": 0.4404, + "step": 34540 + }, + { + "epoch": 19.296648044692738, + "grad_norm": 0.7184450626373291, + "learning_rate": 3.571428571428571e-05, + "loss": 0.3281, + "step": 34541 + }, + { + "epoch": 19.297206703910614, + "grad_norm": 0.36259332299232483, + "learning_rate": 3.5686274509803916e-05, + "loss": 0.4072, + "step": 34542 + }, + { + "epoch": 19.29776536312849, + "grad_norm": 1.0099579095840454, + "learning_rate": 3.565826330532213e-05, + "loss": 0.5971, + "step": 34543 + }, + { + "epoch": 19.298324022346367, + "grad_norm": 0.3382643163204193, + "learning_rate": 3.5630252100840335e-05, + "loss": 0.3694, + "step": 34544 + }, + { + "epoch": 19.298882681564244, + "grad_norm": 0.5092337727546692, + "learning_rate": 3.560224089635854e-05, + "loss": 0.4617, + "step": 34545 + }, + { + "epoch": 19.299441340782124, + "grad_norm": 0.46436750888824463, + "learning_rate": 3.557422969187675e-05, + "loss": 0.4547, + "step": 34546 + }, + { + "epoch": 19.3, + "grad_norm": 1.312204360961914, + "learning_rate": 3.554621848739495e-05, + "loss": 0.3458, + "step": 34547 + }, + { + "epoch": 19.300558659217877, + "grad_norm": 2.6606457233428955, + "learning_rate": 3.5518207282913166e-05, + "loss": 0.5411, + "step": 34548 + }, + { + "epoch": 19.301117318435754, + "grad_norm": 0.3522096276283264, + "learning_rate": 3.549019607843137e-05, + "loss": 0.3317, + "step": 34549 + }, + { + "epoch": 19.30167597765363, + "grad_norm": 1.109816074371338, + "learning_rate": 3.546218487394958e-05, + "loss": 0.4761, + "step": 34550 + }, + { + "epoch": 19.302234636871507, + "grad_norm": 0.5125716924667358, + "learning_rate": 3.5434173669467784e-05, + "loss": 0.4579, + "step": 34551 + }, + { + "epoch": 19.302793296089387, + "grad_norm": 0.4348616302013397, + "learning_rate": 3.540616246498599e-05, + "loss": 0.3682, + "step": 34552 + }, + { + "epoch": 19.303351955307264, + "grad_norm": 0.5202425122261047, + "learning_rate": 3.53781512605042e-05, + "loss": 0.3406, + "step": 34553 + }, + { + "epoch": 19.30391061452514, + "grad_norm": 0.32495445013046265, + "learning_rate": 3.535014005602241e-05, + "loss": 0.3373, + "step": 34554 + }, + { + "epoch": 19.304469273743017, + "grad_norm": 0.4260399341583252, + "learning_rate": 3.5322128851540614e-05, + "loss": 0.397, + "step": 34555 + }, + { + "epoch": 19.305027932960893, + "grad_norm": 0.6243739128112793, + "learning_rate": 3.529411764705882e-05, + "loss": 0.3791, + "step": 34556 + }, + { + "epoch": 19.30558659217877, + "grad_norm": 0.881285548210144, + "learning_rate": 3.5266106442577026e-05, + "loss": 0.4649, + "step": 34557 + }, + { + "epoch": 19.306145251396647, + "grad_norm": 0.35440558195114136, + "learning_rate": 3.523809523809523e-05, + "loss": 0.331, + "step": 34558 + }, + { + "epoch": 19.306703910614527, + "grad_norm": 0.31542378664016724, + "learning_rate": 3.521008403361345e-05, + "loss": 0.3808, + "step": 34559 + }, + { + "epoch": 19.307262569832403, + "grad_norm": 0.5489582419395447, + "learning_rate": 3.518207282913166e-05, + "loss": 0.4356, + "step": 34560 + }, + { + "epoch": 19.30782122905028, + "grad_norm": 0.41696399450302124, + "learning_rate": 3.5154061624649864e-05, + "loss": 0.3757, + "step": 34561 + }, + { + "epoch": 19.308379888268156, + "grad_norm": 0.39428970217704773, + "learning_rate": 3.512605042016807e-05, + "loss": 0.3902, + "step": 34562 + }, + { + "epoch": 19.308938547486033, + "grad_norm": 1.334436058998108, + "learning_rate": 3.5098039215686276e-05, + "loss": 0.4034, + "step": 34563 + }, + { + "epoch": 19.30949720670391, + "grad_norm": 0.6640145182609558, + "learning_rate": 3.507002801120449e-05, + "loss": 0.2782, + "step": 34564 + }, + { + "epoch": 19.310055865921786, + "grad_norm": 0.45179006457328796, + "learning_rate": 3.5042016806722695e-05, + "loss": 0.423, + "step": 34565 + }, + { + "epoch": 19.310614525139666, + "grad_norm": 0.6381816267967224, + "learning_rate": 3.50140056022409e-05, + "loss": 0.3782, + "step": 34566 + }, + { + "epoch": 19.311173184357543, + "grad_norm": 0.40780389308929443, + "learning_rate": 3.498599439775911e-05, + "loss": 0.4524, + "step": 34567 + }, + { + "epoch": 19.31173184357542, + "grad_norm": 0.4660477340221405, + "learning_rate": 3.495798319327731e-05, + "loss": 0.4591, + "step": 34568 + }, + { + "epoch": 19.312290502793296, + "grad_norm": 0.49867674708366394, + "learning_rate": 3.492997198879552e-05, + "loss": 0.4471, + "step": 34569 + }, + { + "epoch": 19.312849162011172, + "grad_norm": 0.46156176924705505, + "learning_rate": 3.490196078431373e-05, + "loss": 0.3721, + "step": 34570 + }, + { + "epoch": 19.31340782122905, + "grad_norm": 0.46882903575897217, + "learning_rate": 3.487394957983194e-05, + "loss": 0.4086, + "step": 34571 + }, + { + "epoch": 19.31396648044693, + "grad_norm": 0.4375666379928589, + "learning_rate": 3.4845938375350143e-05, + "loss": 0.3814, + "step": 34572 + }, + { + "epoch": 19.314525139664806, + "grad_norm": 1.0045313835144043, + "learning_rate": 3.481792717086835e-05, + "loss": 0.35, + "step": 34573 + }, + { + "epoch": 19.315083798882682, + "grad_norm": 0.5981054306030273, + "learning_rate": 3.4789915966386555e-05, + "loss": 0.4691, + "step": 34574 + }, + { + "epoch": 19.31564245810056, + "grad_norm": 0.5679623484611511, + "learning_rate": 3.476190476190477e-05, + "loss": 0.3854, + "step": 34575 + }, + { + "epoch": 19.316201117318435, + "grad_norm": 0.5705524682998657, + "learning_rate": 3.4733893557422974e-05, + "loss": 0.3607, + "step": 34576 + }, + { + "epoch": 19.316759776536312, + "grad_norm": 0.32652977108955383, + "learning_rate": 3.470588235294118e-05, + "loss": 0.394, + "step": 34577 + }, + { + "epoch": 19.31731843575419, + "grad_norm": 2.34604549407959, + "learning_rate": 3.4677871148459386e-05, + "loss": 0.4035, + "step": 34578 + }, + { + "epoch": 19.31787709497207, + "grad_norm": 0.5323050022125244, + "learning_rate": 3.464985994397759e-05, + "loss": 0.4212, + "step": 34579 + }, + { + "epoch": 19.318435754189945, + "grad_norm": 0.4590151309967041, + "learning_rate": 3.4621848739495805e-05, + "loss": 0.4329, + "step": 34580 + }, + { + "epoch": 19.31899441340782, + "grad_norm": 0.5562870502471924, + "learning_rate": 3.459383753501401e-05, + "loss": 0.4241, + "step": 34581 + }, + { + "epoch": 19.3195530726257, + "grad_norm": 0.4686630368232727, + "learning_rate": 3.456582633053222e-05, + "loss": 0.4644, + "step": 34582 + }, + { + "epoch": 19.320111731843575, + "grad_norm": 0.3051053583621979, + "learning_rate": 3.453781512605042e-05, + "loss": 0.3301, + "step": 34583 + }, + { + "epoch": 19.32067039106145, + "grad_norm": 0.31965896487236023, + "learning_rate": 3.450980392156863e-05, + "loss": 0.4152, + "step": 34584 + }, + { + "epoch": 19.321229050279328, + "grad_norm": 0.3505248725414276, + "learning_rate": 3.4481792717086835e-05, + "loss": 0.4031, + "step": 34585 + }, + { + "epoch": 19.321787709497208, + "grad_norm": 0.3530297577381134, + "learning_rate": 3.445378151260505e-05, + "loss": 0.3574, + "step": 34586 + }, + { + "epoch": 19.322346368715085, + "grad_norm": 0.36637377738952637, + "learning_rate": 3.4425770308123254e-05, + "loss": 0.4012, + "step": 34587 + }, + { + "epoch": 19.32290502793296, + "grad_norm": 0.43705517053604126, + "learning_rate": 3.439775910364146e-05, + "loss": 0.3851, + "step": 34588 + }, + { + "epoch": 19.323463687150838, + "grad_norm": 2.149904727935791, + "learning_rate": 3.4369747899159666e-05, + "loss": 0.4042, + "step": 34589 + }, + { + "epoch": 19.324022346368714, + "grad_norm": 0.5753365159034729, + "learning_rate": 3.434173669467787e-05, + "loss": 0.3401, + "step": 34590 + }, + { + "epoch": 19.32458100558659, + "grad_norm": 0.3856475055217743, + "learning_rate": 3.4313725490196084e-05, + "loss": 0.3026, + "step": 34591 + }, + { + "epoch": 19.325139664804468, + "grad_norm": 0.3130301237106323, + "learning_rate": 3.428571428571429e-05, + "loss": 0.3567, + "step": 34592 + }, + { + "epoch": 19.325698324022348, + "grad_norm": 0.3634701073169708, + "learning_rate": 3.4257703081232496e-05, + "loss": 0.3695, + "step": 34593 + }, + { + "epoch": 19.326256983240224, + "grad_norm": 0.4071289300918579, + "learning_rate": 3.42296918767507e-05, + "loss": 0.4572, + "step": 34594 + }, + { + "epoch": 19.3268156424581, + "grad_norm": 0.4620494246482849, + "learning_rate": 3.420168067226891e-05, + "loss": 0.4014, + "step": 34595 + }, + { + "epoch": 19.327374301675977, + "grad_norm": 0.4361553192138672, + "learning_rate": 3.4173669467787114e-05, + "loss": 0.35, + "step": 34596 + }, + { + "epoch": 19.327932960893854, + "grad_norm": 0.40845033526420593, + "learning_rate": 3.414565826330533e-05, + "loss": 0.4101, + "step": 34597 + }, + { + "epoch": 19.32849162011173, + "grad_norm": 7.659552574157715, + "learning_rate": 3.411764705882353e-05, + "loss": 0.4547, + "step": 34598 + }, + { + "epoch": 19.32905027932961, + "grad_norm": 0.5099524259567261, + "learning_rate": 3.408963585434174e-05, + "loss": 0.4258, + "step": 34599 + }, + { + "epoch": 19.329608938547487, + "grad_norm": 0.40536609292030334, + "learning_rate": 3.4061624649859945e-05, + "loss": 0.3796, + "step": 34600 + }, + { + "epoch": 19.330167597765364, + "grad_norm": 0.34829026460647583, + "learning_rate": 3.403361344537815e-05, + "loss": 0.3913, + "step": 34601 + }, + { + "epoch": 19.33072625698324, + "grad_norm": 0.7247732877731323, + "learning_rate": 3.4005602240896364e-05, + "loss": 0.4874, + "step": 34602 + }, + { + "epoch": 19.331284916201117, + "grad_norm": 1.0088967084884644, + "learning_rate": 3.397759103641457e-05, + "loss": 0.3509, + "step": 34603 + }, + { + "epoch": 19.331843575418993, + "grad_norm": 0.8679409027099609, + "learning_rate": 3.3949579831932776e-05, + "loss": 0.4532, + "step": 34604 + }, + { + "epoch": 19.33240223463687, + "grad_norm": 0.49338603019714355, + "learning_rate": 3.392156862745098e-05, + "loss": 0.3935, + "step": 34605 + }, + { + "epoch": 19.33296089385475, + "grad_norm": 0.5714191794395447, + "learning_rate": 3.389355742296919e-05, + "loss": 0.43, + "step": 34606 + }, + { + "epoch": 19.333519553072627, + "grad_norm": 0.4332340955734253, + "learning_rate": 3.38655462184874e-05, + "loss": 0.3866, + "step": 34607 + }, + { + "epoch": 19.334078212290503, + "grad_norm": 0.5391001105308533, + "learning_rate": 3.383753501400561e-05, + "loss": 0.5297, + "step": 34608 + }, + { + "epoch": 19.33463687150838, + "grad_norm": 0.37089234590530396, + "learning_rate": 3.380952380952381e-05, + "loss": 0.4008, + "step": 34609 + }, + { + "epoch": 19.335195530726256, + "grad_norm": 2.5591518878936768, + "learning_rate": 3.378151260504202e-05, + "loss": 0.5271, + "step": 34610 + }, + { + "epoch": 19.335754189944133, + "grad_norm": 0.5355129241943359, + "learning_rate": 3.3753501400560225e-05, + "loss": 0.3615, + "step": 34611 + }, + { + "epoch": 19.33631284916201, + "grad_norm": 0.5871062874794006, + "learning_rate": 3.372549019607843e-05, + "loss": 0.3361, + "step": 34612 + }, + { + "epoch": 19.33687150837989, + "grad_norm": 0.5193787813186646, + "learning_rate": 3.3697478991596643e-05, + "loss": 0.4015, + "step": 34613 + }, + { + "epoch": 19.337430167597766, + "grad_norm": 0.451000452041626, + "learning_rate": 3.366946778711485e-05, + "loss": 0.5063, + "step": 34614 + }, + { + "epoch": 19.337988826815643, + "grad_norm": 1.3041807413101196, + "learning_rate": 3.3641456582633055e-05, + "loss": 0.3354, + "step": 34615 + }, + { + "epoch": 19.33854748603352, + "grad_norm": 0.5264590382575989, + "learning_rate": 3.361344537815126e-05, + "loss": 0.402, + "step": 34616 + }, + { + "epoch": 19.339106145251396, + "grad_norm": 3.2200613021850586, + "learning_rate": 3.358543417366947e-05, + "loss": 0.4691, + "step": 34617 + }, + { + "epoch": 19.339664804469272, + "grad_norm": 0.6092663407325745, + "learning_rate": 3.355742296918768e-05, + "loss": 0.4262, + "step": 34618 + }, + { + "epoch": 19.340223463687153, + "grad_norm": 0.9767898321151733, + "learning_rate": 3.3529411764705886e-05, + "loss": 0.3824, + "step": 34619 + }, + { + "epoch": 19.34078212290503, + "grad_norm": 0.3635249733924866, + "learning_rate": 3.350140056022409e-05, + "loss": 0.3741, + "step": 34620 + }, + { + "epoch": 19.341340782122906, + "grad_norm": 0.5556697249412537, + "learning_rate": 3.34733893557423e-05, + "loss": 0.4421, + "step": 34621 + }, + { + "epoch": 19.341899441340782, + "grad_norm": 0.38647469878196716, + "learning_rate": 3.3445378151260504e-05, + "loss": 0.4257, + "step": 34622 + }, + { + "epoch": 19.34245810055866, + "grad_norm": 0.9777635931968689, + "learning_rate": 3.341736694677872e-05, + "loss": 0.4714, + "step": 34623 + }, + { + "epoch": 19.343016759776535, + "grad_norm": 0.5502926707267761, + "learning_rate": 3.338935574229692e-05, + "loss": 0.3947, + "step": 34624 + }, + { + "epoch": 19.343575418994412, + "grad_norm": 0.49105897545814514, + "learning_rate": 3.336134453781513e-05, + "loss": 0.3577, + "step": 34625 + }, + { + "epoch": 19.344134078212292, + "grad_norm": 0.5177717804908752, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.5035, + "step": 34626 + }, + { + "epoch": 19.34469273743017, + "grad_norm": 1.2236799001693726, + "learning_rate": 3.330532212885154e-05, + "loss": 0.3496, + "step": 34627 + }, + { + "epoch": 19.345251396648045, + "grad_norm": 0.35906124114990234, + "learning_rate": 3.327731092436975e-05, + "loss": 0.3504, + "step": 34628 + }, + { + "epoch": 19.345810055865922, + "grad_norm": 0.3925894796848297, + "learning_rate": 3.324929971988796e-05, + "loss": 0.4446, + "step": 34629 + }, + { + "epoch": 19.3463687150838, + "grad_norm": 0.4980289936065674, + "learning_rate": 3.3221288515406166e-05, + "loss": 0.5324, + "step": 34630 + }, + { + "epoch": 19.346927374301675, + "grad_norm": 0.40415525436401367, + "learning_rate": 3.319327731092437e-05, + "loss": 0.3328, + "step": 34631 + }, + { + "epoch": 19.34748603351955, + "grad_norm": 0.37800678610801697, + "learning_rate": 3.316526610644258e-05, + "loss": 0.4266, + "step": 34632 + }, + { + "epoch": 19.34804469273743, + "grad_norm": 0.33418819308280945, + "learning_rate": 3.3137254901960784e-05, + "loss": 0.3856, + "step": 34633 + }, + { + "epoch": 19.34860335195531, + "grad_norm": 1.0299174785614014, + "learning_rate": 3.3109243697478996e-05, + "loss": 0.5321, + "step": 34634 + }, + { + "epoch": 19.349162011173185, + "grad_norm": 0.38019147515296936, + "learning_rate": 3.30812324929972e-05, + "loss": 0.4656, + "step": 34635 + }, + { + "epoch": 19.34972067039106, + "grad_norm": 0.5020016431808472, + "learning_rate": 3.305322128851541e-05, + "loss": 0.3786, + "step": 34636 + }, + { + "epoch": 19.350279329608938, + "grad_norm": 0.7795881032943726, + "learning_rate": 3.3025210084033614e-05, + "loss": 0.6756, + "step": 34637 + }, + { + "epoch": 19.350837988826814, + "grad_norm": 1.6135830879211426, + "learning_rate": 3.299719887955182e-05, + "loss": 0.4821, + "step": 34638 + }, + { + "epoch": 19.35139664804469, + "grad_norm": 0.3907015025615692, + "learning_rate": 3.2969187675070026e-05, + "loss": 0.3012, + "step": 34639 + }, + { + "epoch": 19.35195530726257, + "grad_norm": 0.39014896750450134, + "learning_rate": 3.294117647058824e-05, + "loss": 0.3422, + "step": 34640 + }, + { + "epoch": 19.352513966480448, + "grad_norm": 0.346729040145874, + "learning_rate": 3.2913165266106445e-05, + "loss": 0.4306, + "step": 34641 + }, + { + "epoch": 19.353072625698324, + "grad_norm": 0.4760814309120178, + "learning_rate": 3.288515406162465e-05, + "loss": 0.4452, + "step": 34642 + }, + { + "epoch": 19.3536312849162, + "grad_norm": 0.42498233914375305, + "learning_rate": 3.285714285714286e-05, + "loss": 0.4091, + "step": 34643 + }, + { + "epoch": 19.354189944134077, + "grad_norm": 1.282052755355835, + "learning_rate": 3.282913165266106e-05, + "loss": 0.3175, + "step": 34644 + }, + { + "epoch": 19.354748603351954, + "grad_norm": 0.5970929265022278, + "learning_rate": 3.2801120448179276e-05, + "loss": 0.5549, + "step": 34645 + }, + { + "epoch": 19.355307262569834, + "grad_norm": 1.0843144655227661, + "learning_rate": 3.277310924369748e-05, + "loss": 0.3928, + "step": 34646 + }, + { + "epoch": 19.35586592178771, + "grad_norm": 0.34287598729133606, + "learning_rate": 3.274509803921569e-05, + "loss": 0.3053, + "step": 34647 + }, + { + "epoch": 19.356424581005587, + "grad_norm": 0.36522290110588074, + "learning_rate": 3.2717086834733894e-05, + "loss": 0.4558, + "step": 34648 + }, + { + "epoch": 19.356983240223464, + "grad_norm": 0.3644511103630066, + "learning_rate": 3.26890756302521e-05, + "loss": 0.3109, + "step": 34649 + }, + { + "epoch": 19.35754189944134, + "grad_norm": 0.4517008662223816, + "learning_rate": 3.266106442577031e-05, + "loss": 0.4343, + "step": 34650 + }, + { + "epoch": 19.358100558659217, + "grad_norm": 0.5071067810058594, + "learning_rate": 3.263305322128852e-05, + "loss": 0.3534, + "step": 34651 + }, + { + "epoch": 19.358659217877094, + "grad_norm": 0.4516407549381256, + "learning_rate": 3.2605042016806725e-05, + "loss": 0.2933, + "step": 34652 + }, + { + "epoch": 19.359217877094974, + "grad_norm": 0.37370795011520386, + "learning_rate": 3.257703081232493e-05, + "loss": 0.3682, + "step": 34653 + }, + { + "epoch": 19.35977653631285, + "grad_norm": 0.38454169034957886, + "learning_rate": 3.254901960784314e-05, + "loss": 0.3677, + "step": 34654 + }, + { + "epoch": 19.360335195530727, + "grad_norm": 0.6930497884750366, + "learning_rate": 3.252100840336134e-05, + "loss": 0.4225, + "step": 34655 + }, + { + "epoch": 19.360893854748603, + "grad_norm": 0.4099404513835907, + "learning_rate": 3.2492997198879555e-05, + "loss": 0.3765, + "step": 34656 + }, + { + "epoch": 19.36145251396648, + "grad_norm": 0.4370623826980591, + "learning_rate": 3.246498599439776e-05, + "loss": 0.343, + "step": 34657 + }, + { + "epoch": 19.362011173184356, + "grad_norm": 0.3761231601238251, + "learning_rate": 3.243697478991597e-05, + "loss": 0.4067, + "step": 34658 + }, + { + "epoch": 19.362569832402233, + "grad_norm": 0.46025410294532776, + "learning_rate": 3.2408963585434173e-05, + "loss": 0.3499, + "step": 34659 + }, + { + "epoch": 19.363128491620113, + "grad_norm": 1.623153805732727, + "learning_rate": 3.238095238095238e-05, + "loss": 0.4312, + "step": 34660 + }, + { + "epoch": 19.36368715083799, + "grad_norm": 0.3973400592803955, + "learning_rate": 3.235294117647059e-05, + "loss": 0.3336, + "step": 34661 + }, + { + "epoch": 19.364245810055866, + "grad_norm": 3.6344473361968994, + "learning_rate": 3.23249299719888e-05, + "loss": 0.4525, + "step": 34662 + }, + { + "epoch": 19.364804469273743, + "grad_norm": 0.7064842581748962, + "learning_rate": 3.2296918767507004e-05, + "loss": 0.3495, + "step": 34663 + }, + { + "epoch": 19.36536312849162, + "grad_norm": 0.36064934730529785, + "learning_rate": 3.226890756302521e-05, + "loss": 0.4462, + "step": 34664 + }, + { + "epoch": 19.365921787709496, + "grad_norm": 0.8353272080421448, + "learning_rate": 3.2240896358543416e-05, + "loss": 0.471, + "step": 34665 + }, + { + "epoch": 19.366480446927373, + "grad_norm": 0.3255285620689392, + "learning_rate": 3.221288515406162e-05, + "loss": 0.2968, + "step": 34666 + }, + { + "epoch": 19.367039106145253, + "grad_norm": 0.9225115180015564, + "learning_rate": 3.2184873949579835e-05, + "loss": 0.3536, + "step": 34667 + }, + { + "epoch": 19.36759776536313, + "grad_norm": 0.40227606892585754, + "learning_rate": 3.215686274509804e-05, + "loss": 0.3704, + "step": 34668 + }, + { + "epoch": 19.368156424581006, + "grad_norm": 0.35369041562080383, + "learning_rate": 3.212885154061625e-05, + "loss": 0.3284, + "step": 34669 + }, + { + "epoch": 19.368715083798882, + "grad_norm": 0.4591996371746063, + "learning_rate": 3.210084033613445e-05, + "loss": 0.4583, + "step": 34670 + }, + { + "epoch": 19.36927374301676, + "grad_norm": 0.5736231207847595, + "learning_rate": 3.207282913165266e-05, + "loss": 0.5323, + "step": 34671 + }, + { + "epoch": 19.369832402234636, + "grad_norm": 0.5638227462768555, + "learning_rate": 3.204481792717087e-05, + "loss": 0.4511, + "step": 34672 + }, + { + "epoch": 19.370391061452516, + "grad_norm": 0.4991433620452881, + "learning_rate": 3.201680672268908e-05, + "loss": 0.5381, + "step": 34673 + }, + { + "epoch": 19.370949720670392, + "grad_norm": 0.5432763695716858, + "learning_rate": 3.1988795518207284e-05, + "loss": 0.3771, + "step": 34674 + }, + { + "epoch": 19.37150837988827, + "grad_norm": 0.42705973982810974, + "learning_rate": 3.196078431372549e-05, + "loss": 0.4681, + "step": 34675 + }, + { + "epoch": 19.372067039106145, + "grad_norm": 0.3913818299770355, + "learning_rate": 3.1932773109243696e-05, + "loss": 0.3428, + "step": 34676 + }, + { + "epoch": 19.372625698324022, + "grad_norm": 0.385786771774292, + "learning_rate": 3.190476190476191e-05, + "loss": 0.3126, + "step": 34677 + }, + { + "epoch": 19.3731843575419, + "grad_norm": 0.4680740535259247, + "learning_rate": 3.1876750700280114e-05, + "loss": 0.5126, + "step": 34678 + }, + { + "epoch": 19.373743016759775, + "grad_norm": 0.6564361453056335, + "learning_rate": 3.184873949579832e-05, + "loss": 0.4606, + "step": 34679 + }, + { + "epoch": 19.374301675977655, + "grad_norm": 5.351374626159668, + "learning_rate": 3.1820728291316526e-05, + "loss": 0.3892, + "step": 34680 + }, + { + "epoch": 19.37486033519553, + "grad_norm": 0.3373904824256897, + "learning_rate": 3.179271708683473e-05, + "loss": 0.4192, + "step": 34681 + }, + { + "epoch": 19.37541899441341, + "grad_norm": 2.507180690765381, + "learning_rate": 3.176470588235294e-05, + "loss": 0.4172, + "step": 34682 + }, + { + "epoch": 19.375977653631285, + "grad_norm": 0.370297908782959, + "learning_rate": 3.173669467787115e-05, + "loss": 0.3853, + "step": 34683 + }, + { + "epoch": 19.37653631284916, + "grad_norm": 0.47303617000579834, + "learning_rate": 3.170868347338936e-05, + "loss": 0.4624, + "step": 34684 + }, + { + "epoch": 19.377094972067038, + "grad_norm": 0.36894431710243225, + "learning_rate": 3.168067226890756e-05, + "loss": 0.361, + "step": 34685 + }, + { + "epoch": 19.377653631284915, + "grad_norm": 0.9282207489013672, + "learning_rate": 3.165266106442577e-05, + "loss": 0.3989, + "step": 34686 + }, + { + "epoch": 19.378212290502795, + "grad_norm": 0.4179244935512543, + "learning_rate": 3.1624649859943975e-05, + "loss": 0.4752, + "step": 34687 + }, + { + "epoch": 19.37877094972067, + "grad_norm": 0.9252038598060608, + "learning_rate": 3.159663865546219e-05, + "loss": 0.4406, + "step": 34688 + }, + { + "epoch": 19.379329608938548, + "grad_norm": 0.5302088856697083, + "learning_rate": 3.1568627450980394e-05, + "loss": 0.5164, + "step": 34689 + }, + { + "epoch": 19.379888268156424, + "grad_norm": 1.5509836673736572, + "learning_rate": 3.15406162464986e-05, + "loss": 0.3704, + "step": 34690 + }, + { + "epoch": 19.3804469273743, + "grad_norm": 0.4000025987625122, + "learning_rate": 3.1512605042016806e-05, + "loss": 0.3496, + "step": 34691 + }, + { + "epoch": 19.381005586592178, + "grad_norm": 0.5078865885734558, + "learning_rate": 3.148459383753501e-05, + "loss": 0.3737, + "step": 34692 + }, + { + "epoch": 19.381564245810054, + "grad_norm": 0.7631915807723999, + "learning_rate": 3.1456582633053225e-05, + "loss": 0.3133, + "step": 34693 + }, + { + "epoch": 19.382122905027934, + "grad_norm": 0.3892311453819275, + "learning_rate": 3.142857142857143e-05, + "loss": 0.4643, + "step": 34694 + }, + { + "epoch": 19.38268156424581, + "grad_norm": 1.2695770263671875, + "learning_rate": 3.140056022408964e-05, + "loss": 0.4245, + "step": 34695 + }, + { + "epoch": 19.383240223463687, + "grad_norm": 0.5960999727249146, + "learning_rate": 3.137254901960784e-05, + "loss": 0.4573, + "step": 34696 + }, + { + "epoch": 19.383798882681564, + "grad_norm": 0.5446678996086121, + "learning_rate": 3.134453781512605e-05, + "loss": 0.3849, + "step": 34697 + }, + { + "epoch": 19.38435754189944, + "grad_norm": 0.7207553386688232, + "learning_rate": 3.1316526610644255e-05, + "loss": 0.4716, + "step": 34698 + }, + { + "epoch": 19.384916201117317, + "grad_norm": 0.44883689284324646, + "learning_rate": 3.128851540616247e-05, + "loss": 0.5281, + "step": 34699 + }, + { + "epoch": 19.385474860335197, + "grad_norm": 0.5003238916397095, + "learning_rate": 3.1260504201680673e-05, + "loss": 0.4242, + "step": 34700 + }, + { + "epoch": 19.386033519553074, + "grad_norm": 0.3824901878833771, + "learning_rate": 3.123249299719888e-05, + "loss": 0.3272, + "step": 34701 + }, + { + "epoch": 19.38659217877095, + "grad_norm": 0.37728121876716614, + "learning_rate": 3.1204481792717085e-05, + "loss": 0.3884, + "step": 34702 + }, + { + "epoch": 19.387150837988827, + "grad_norm": 0.5183541178703308, + "learning_rate": 3.11764705882353e-05, + "loss": 0.4014, + "step": 34703 + }, + { + "epoch": 19.387709497206703, + "grad_norm": 0.3923890292644501, + "learning_rate": 3.1148459383753504e-05, + "loss": 0.3382, + "step": 34704 + }, + { + "epoch": 19.38826815642458, + "grad_norm": 0.5274564623832703, + "learning_rate": 3.112044817927171e-05, + "loss": 0.2708, + "step": 34705 + }, + { + "epoch": 19.388826815642457, + "grad_norm": 0.4084869623184204, + "learning_rate": 3.1092436974789916e-05, + "loss": 0.4536, + "step": 34706 + }, + { + "epoch": 19.389385474860337, + "grad_norm": 2.950998067855835, + "learning_rate": 3.106442577030812e-05, + "loss": 0.5644, + "step": 34707 + }, + { + "epoch": 19.389944134078213, + "grad_norm": 0.45802491903305054, + "learning_rate": 3.1036414565826335e-05, + "loss": 0.3959, + "step": 34708 + }, + { + "epoch": 19.39050279329609, + "grad_norm": 0.6680406928062439, + "learning_rate": 3.100840336134454e-05, + "loss": 0.3962, + "step": 34709 + }, + { + "epoch": 19.391061452513966, + "grad_norm": 0.6626935601234436, + "learning_rate": 3.098039215686275e-05, + "loss": 0.3003, + "step": 34710 + }, + { + "epoch": 19.391620111731843, + "grad_norm": 0.4294644892215729, + "learning_rate": 3.095238095238095e-05, + "loss": 0.3527, + "step": 34711 + }, + { + "epoch": 19.39217877094972, + "grad_norm": 3.947005033493042, + "learning_rate": 3.092436974789916e-05, + "loss": 0.4912, + "step": 34712 + }, + { + "epoch": 19.392737430167596, + "grad_norm": 0.5176776051521301, + "learning_rate": 3.0896358543417365e-05, + "loss": 0.3834, + "step": 34713 + }, + { + "epoch": 19.393296089385476, + "grad_norm": 0.5808885097503662, + "learning_rate": 3.086834733893558e-05, + "loss": 0.3733, + "step": 34714 + }, + { + "epoch": 19.393854748603353, + "grad_norm": 0.49917304515838623, + "learning_rate": 3.0840336134453784e-05, + "loss": 0.3681, + "step": 34715 + }, + { + "epoch": 19.39441340782123, + "grad_norm": 0.7487123608589172, + "learning_rate": 3.081232492997199e-05, + "loss": 0.3508, + "step": 34716 + }, + { + "epoch": 19.394972067039106, + "grad_norm": 0.44437068700790405, + "learning_rate": 3.0784313725490196e-05, + "loss": 0.3922, + "step": 34717 + }, + { + "epoch": 19.395530726256982, + "grad_norm": 0.44631487131118774, + "learning_rate": 3.07563025210084e-05, + "loss": 0.4115, + "step": 34718 + }, + { + "epoch": 19.39608938547486, + "grad_norm": 0.6000242233276367, + "learning_rate": 3.0728291316526614e-05, + "loss": 0.3919, + "step": 34719 + }, + { + "epoch": 19.39664804469274, + "grad_norm": 0.8256241679191589, + "learning_rate": 3.070028011204482e-05, + "loss": 0.4147, + "step": 34720 + }, + { + "epoch": 19.397206703910616, + "grad_norm": 0.6992743015289307, + "learning_rate": 3.0672268907563026e-05, + "loss": 0.4548, + "step": 34721 + }, + { + "epoch": 19.397765363128492, + "grad_norm": 0.5466495752334595, + "learning_rate": 3.064425770308123e-05, + "loss": 0.3992, + "step": 34722 + }, + { + "epoch": 19.39832402234637, + "grad_norm": 0.392668217420578, + "learning_rate": 3.061624649859944e-05, + "loss": 0.4177, + "step": 34723 + }, + { + "epoch": 19.398882681564245, + "grad_norm": 0.7657110095024109, + "learning_rate": 3.058823529411765e-05, + "loss": 0.319, + "step": 34724 + }, + { + "epoch": 19.399441340782122, + "grad_norm": 0.43436890840530396, + "learning_rate": 3.056022408963586e-05, + "loss": 0.382, + "step": 34725 + }, + { + "epoch": 19.4, + "grad_norm": 0.47670605778694153, + "learning_rate": 3.053221288515406e-05, + "loss": 0.3444, + "step": 34726 + }, + { + "epoch": 19.40055865921788, + "grad_norm": 0.4780365526676178, + "learning_rate": 3.050420168067227e-05, + "loss": 0.3661, + "step": 34727 + }, + { + "epoch": 19.401117318435755, + "grad_norm": 0.35390588641166687, + "learning_rate": 3.0476190476190475e-05, + "loss": 0.3873, + "step": 34728 + }, + { + "epoch": 19.401675977653632, + "grad_norm": 0.35138699412345886, + "learning_rate": 3.044817927170868e-05, + "loss": 0.3392, + "step": 34729 + }, + { + "epoch": 19.40223463687151, + "grad_norm": 1.0493677854537964, + "learning_rate": 3.0420168067226894e-05, + "loss": 0.6186, + "step": 34730 + }, + { + "epoch": 19.402793296089385, + "grad_norm": 0.4658297002315521, + "learning_rate": 3.03921568627451e-05, + "loss": 0.3375, + "step": 34731 + }, + { + "epoch": 19.40335195530726, + "grad_norm": 1.4820036888122559, + "learning_rate": 3.0364145658263306e-05, + "loss": 0.4263, + "step": 34732 + }, + { + "epoch": 19.403910614525138, + "grad_norm": 3.14245867729187, + "learning_rate": 3.0336134453781515e-05, + "loss": 0.4717, + "step": 34733 + }, + { + "epoch": 19.404469273743018, + "grad_norm": 0.322102814912796, + "learning_rate": 3.030812324929972e-05, + "loss": 0.3872, + "step": 34734 + }, + { + "epoch": 19.405027932960895, + "grad_norm": 0.3833746910095215, + "learning_rate": 3.028011204481793e-05, + "loss": 0.3989, + "step": 34735 + }, + { + "epoch": 19.40558659217877, + "grad_norm": 2.2707359790802, + "learning_rate": 3.0252100840336137e-05, + "loss": 0.3755, + "step": 34736 + }, + { + "epoch": 19.406145251396648, + "grad_norm": 0.4668048322200775, + "learning_rate": 3.0224089635854343e-05, + "loss": 0.4137, + "step": 34737 + }, + { + "epoch": 19.406703910614524, + "grad_norm": 0.4917636811733246, + "learning_rate": 3.0196078431372552e-05, + "loss": 0.3698, + "step": 34738 + }, + { + "epoch": 19.4072625698324, + "grad_norm": 0.49985042214393616, + "learning_rate": 3.0168067226890758e-05, + "loss": 0.3804, + "step": 34739 + }, + { + "epoch": 19.407821229050278, + "grad_norm": 0.43190765380859375, + "learning_rate": 3.0140056022408964e-05, + "loss": 0.4342, + "step": 34740 + }, + { + "epoch": 19.408379888268158, + "grad_norm": 0.5139693021774292, + "learning_rate": 3.0112044817927173e-05, + "loss": 0.4178, + "step": 34741 + }, + { + "epoch": 19.408938547486034, + "grad_norm": 0.4200821816921234, + "learning_rate": 3.008403361344538e-05, + "loss": 0.3118, + "step": 34742 + }, + { + "epoch": 19.40949720670391, + "grad_norm": 0.40365833044052124, + "learning_rate": 3.005602240896359e-05, + "loss": 0.4482, + "step": 34743 + }, + { + "epoch": 19.410055865921787, + "grad_norm": 0.5570021867752075, + "learning_rate": 3.0028011204481795e-05, + "loss": 0.434, + "step": 34744 + }, + { + "epoch": 19.410614525139664, + "grad_norm": 0.5476948022842407, + "learning_rate": 3e-05, + "loss": 0.3609, + "step": 34745 + }, + { + "epoch": 19.41117318435754, + "grad_norm": 0.5441157221794128, + "learning_rate": 2.997198879551821e-05, + "loss": 0.3706, + "step": 34746 + }, + { + "epoch": 19.41173184357542, + "grad_norm": 0.31823068857192993, + "learning_rate": 2.9943977591036416e-05, + "loss": 0.283, + "step": 34747 + }, + { + "epoch": 19.412290502793297, + "grad_norm": 0.36117902398109436, + "learning_rate": 2.9915966386554622e-05, + "loss": 0.2859, + "step": 34748 + }, + { + "epoch": 19.412849162011174, + "grad_norm": 0.4033918082714081, + "learning_rate": 2.988795518207283e-05, + "loss": 0.343, + "step": 34749 + }, + { + "epoch": 19.41340782122905, + "grad_norm": 0.4970065653324127, + "learning_rate": 2.9859943977591038e-05, + "loss": 0.3617, + "step": 34750 + }, + { + "epoch": 19.413966480446927, + "grad_norm": 1.593176007270813, + "learning_rate": 2.9831932773109247e-05, + "loss": 0.4703, + "step": 34751 + }, + { + "epoch": 19.414525139664804, + "grad_norm": 0.5126480460166931, + "learning_rate": 2.9803921568627453e-05, + "loss": 0.5085, + "step": 34752 + }, + { + "epoch": 19.41508379888268, + "grad_norm": 0.30920934677124023, + "learning_rate": 2.977591036414566e-05, + "loss": 0.2957, + "step": 34753 + }, + { + "epoch": 19.41564245810056, + "grad_norm": 0.49395856261253357, + "learning_rate": 2.9747899159663868e-05, + "loss": 0.4249, + "step": 34754 + }, + { + "epoch": 19.416201117318437, + "grad_norm": 1.4000450372695923, + "learning_rate": 2.9719887955182074e-05, + "loss": 0.4614, + "step": 34755 + }, + { + "epoch": 19.416759776536313, + "grad_norm": 0.4395942986011505, + "learning_rate": 2.969187675070028e-05, + "loss": 0.4361, + "step": 34756 + }, + { + "epoch": 19.41731843575419, + "grad_norm": 0.4884836971759796, + "learning_rate": 2.966386554621849e-05, + "loss": 0.3278, + "step": 34757 + }, + { + "epoch": 19.417877094972066, + "grad_norm": 0.3011881411075592, + "learning_rate": 2.9635854341736696e-05, + "loss": 0.3595, + "step": 34758 + }, + { + "epoch": 19.418435754189943, + "grad_norm": 0.4406471252441406, + "learning_rate": 2.9607843137254905e-05, + "loss": 0.4385, + "step": 34759 + }, + { + "epoch": 19.41899441340782, + "grad_norm": 0.625727117061615, + "learning_rate": 2.957983193277311e-05, + "loss": 0.4696, + "step": 34760 + }, + { + "epoch": 19.4195530726257, + "grad_norm": 0.5656685829162598, + "learning_rate": 2.9551820728291317e-05, + "loss": 0.4403, + "step": 34761 + }, + { + "epoch": 19.420111731843576, + "grad_norm": 0.40907514095306396, + "learning_rate": 2.9523809523809526e-05, + "loss": 0.3767, + "step": 34762 + }, + { + "epoch": 19.420670391061453, + "grad_norm": 0.4315016269683838, + "learning_rate": 2.9495798319327732e-05, + "loss": 0.3623, + "step": 34763 + }, + { + "epoch": 19.42122905027933, + "grad_norm": 0.5315988659858704, + "learning_rate": 2.946778711484594e-05, + "loss": 0.3874, + "step": 34764 + }, + { + "epoch": 19.421787709497206, + "grad_norm": 0.41649487614631653, + "learning_rate": 2.9439775910364148e-05, + "loss": 0.3566, + "step": 34765 + }, + { + "epoch": 19.422346368715083, + "grad_norm": 0.45690855383872986, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.4189, + "step": 34766 + }, + { + "epoch": 19.422905027932963, + "grad_norm": 0.5419706702232361, + "learning_rate": 2.938375350140056e-05, + "loss": 0.392, + "step": 34767 + }, + { + "epoch": 19.42346368715084, + "grad_norm": 0.39451321959495544, + "learning_rate": 2.935574229691877e-05, + "loss": 0.3383, + "step": 34768 + }, + { + "epoch": 19.424022346368716, + "grad_norm": 0.34369826316833496, + "learning_rate": 2.9327731092436975e-05, + "loss": 0.3421, + "step": 34769 + }, + { + "epoch": 19.424581005586592, + "grad_norm": 1.778489112854004, + "learning_rate": 2.9299719887955185e-05, + "loss": 0.4537, + "step": 34770 + }, + { + "epoch": 19.42513966480447, + "grad_norm": 0.48961853981018066, + "learning_rate": 2.927170868347339e-05, + "loss": 0.4071, + "step": 34771 + }, + { + "epoch": 19.425698324022346, + "grad_norm": 0.40935471653938293, + "learning_rate": 2.9243697478991596e-05, + "loss": 0.3474, + "step": 34772 + }, + { + "epoch": 19.426256983240222, + "grad_norm": 0.37686896324157715, + "learning_rate": 2.9215686274509806e-05, + "loss": 0.3683, + "step": 34773 + }, + { + "epoch": 19.426815642458102, + "grad_norm": 0.47350791096687317, + "learning_rate": 2.9187675070028012e-05, + "loss": 0.4736, + "step": 34774 + }, + { + "epoch": 19.42737430167598, + "grad_norm": 0.4565175771713257, + "learning_rate": 2.9159663865546218e-05, + "loss": 0.447, + "step": 34775 + }, + { + "epoch": 19.427932960893855, + "grad_norm": 0.590487003326416, + "learning_rate": 2.9131652661064427e-05, + "loss": 0.4295, + "step": 34776 + }, + { + "epoch": 19.428491620111732, + "grad_norm": 0.6740888357162476, + "learning_rate": 2.9103641456582633e-05, + "loss": 0.5175, + "step": 34777 + }, + { + "epoch": 19.42905027932961, + "grad_norm": 0.6588280200958252, + "learning_rate": 2.9075630252100843e-05, + "loss": 0.4447, + "step": 34778 + }, + { + "epoch": 19.429608938547485, + "grad_norm": 13.812747955322266, + "learning_rate": 2.904761904761905e-05, + "loss": 0.4068, + "step": 34779 + }, + { + "epoch": 19.43016759776536, + "grad_norm": 0.5586323738098145, + "learning_rate": 2.9019607843137255e-05, + "loss": 0.3489, + "step": 34780 + }, + { + "epoch": 19.43072625698324, + "grad_norm": 0.5538607239723206, + "learning_rate": 2.8991596638655464e-05, + "loss": 0.4431, + "step": 34781 + }, + { + "epoch": 19.43128491620112, + "grad_norm": 0.7430354356765747, + "learning_rate": 2.896358543417367e-05, + "loss": 0.3899, + "step": 34782 + }, + { + "epoch": 19.431843575418995, + "grad_norm": 0.8361756801605225, + "learning_rate": 2.8935574229691876e-05, + "loss": 0.4509, + "step": 34783 + }, + { + "epoch": 19.43240223463687, + "grad_norm": 0.5199877023696899, + "learning_rate": 2.8907563025210085e-05, + "loss": 0.4836, + "step": 34784 + }, + { + "epoch": 19.432960893854748, + "grad_norm": 0.3791772425174713, + "learning_rate": 2.887955182072829e-05, + "loss": 0.2965, + "step": 34785 + }, + { + "epoch": 19.433519553072625, + "grad_norm": 0.4693761467933655, + "learning_rate": 2.88515406162465e-05, + "loss": 0.435, + "step": 34786 + }, + { + "epoch": 19.4340782122905, + "grad_norm": 0.5811636447906494, + "learning_rate": 2.8823529411764707e-05, + "loss": 0.4304, + "step": 34787 + }, + { + "epoch": 19.43463687150838, + "grad_norm": 0.5526044964790344, + "learning_rate": 2.8795518207282913e-05, + "loss": 0.487, + "step": 34788 + }, + { + "epoch": 19.435195530726258, + "grad_norm": 0.2887870669364929, + "learning_rate": 2.8767507002801122e-05, + "loss": 0.2608, + "step": 34789 + }, + { + "epoch": 19.435754189944134, + "grad_norm": 0.39098960161209106, + "learning_rate": 2.8739495798319328e-05, + "loss": 0.3385, + "step": 34790 + }, + { + "epoch": 19.43631284916201, + "grad_norm": 0.4343716502189636, + "learning_rate": 2.8711484593837534e-05, + "loss": 0.4337, + "step": 34791 + }, + { + "epoch": 19.436871508379888, + "grad_norm": 0.34231317043304443, + "learning_rate": 2.8683473389355743e-05, + "loss": 0.3784, + "step": 34792 + }, + { + "epoch": 19.437430167597764, + "grad_norm": 0.4189598858356476, + "learning_rate": 2.865546218487395e-05, + "loss": 0.4661, + "step": 34793 + }, + { + "epoch": 19.43798882681564, + "grad_norm": 0.35148534178733826, + "learning_rate": 2.862745098039216e-05, + "loss": 0.3215, + "step": 34794 + }, + { + "epoch": 19.43854748603352, + "grad_norm": 0.3340983986854553, + "learning_rate": 2.8599439775910365e-05, + "loss": 0.3384, + "step": 34795 + }, + { + "epoch": 19.439106145251397, + "grad_norm": 0.5679505467414856, + "learning_rate": 2.857142857142857e-05, + "loss": 0.3756, + "step": 34796 + }, + { + "epoch": 19.439664804469274, + "grad_norm": 0.6255388259887695, + "learning_rate": 2.854341736694678e-05, + "loss": 0.3552, + "step": 34797 + }, + { + "epoch": 19.44022346368715, + "grad_norm": 0.34175461530685425, + "learning_rate": 2.8515406162464986e-05, + "loss": 0.3551, + "step": 34798 + }, + { + "epoch": 19.440782122905027, + "grad_norm": 0.40118804574012756, + "learning_rate": 2.8487394957983192e-05, + "loss": 0.3892, + "step": 34799 + }, + { + "epoch": 19.441340782122904, + "grad_norm": 0.46519577503204346, + "learning_rate": 2.84593837535014e-05, + "loss": 0.423, + "step": 34800 + }, + { + "epoch": 19.441899441340784, + "grad_norm": 0.3849142789840698, + "learning_rate": 2.8431372549019608e-05, + "loss": 0.4067, + "step": 34801 + }, + { + "epoch": 19.44245810055866, + "grad_norm": 0.36764630675315857, + "learning_rate": 2.8403361344537814e-05, + "loss": 0.3341, + "step": 34802 + }, + { + "epoch": 19.443016759776537, + "grad_norm": 0.3311637043952942, + "learning_rate": 2.8375350140056023e-05, + "loss": 0.3163, + "step": 34803 + }, + { + "epoch": 19.443575418994413, + "grad_norm": 0.4395899176597595, + "learning_rate": 2.834733893557423e-05, + "loss": 0.4324, + "step": 34804 + }, + { + "epoch": 19.44413407821229, + "grad_norm": 0.417794793844223, + "learning_rate": 2.831932773109244e-05, + "loss": 0.3394, + "step": 34805 + }, + { + "epoch": 19.444692737430167, + "grad_norm": 0.5990760326385498, + "learning_rate": 2.8291316526610644e-05, + "loss": 0.3996, + "step": 34806 + }, + { + "epoch": 19.445251396648043, + "grad_norm": 0.4159018099308014, + "learning_rate": 2.826330532212885e-05, + "loss": 0.4901, + "step": 34807 + }, + { + "epoch": 19.445810055865923, + "grad_norm": 0.6076653599739075, + "learning_rate": 2.823529411764706e-05, + "loss": 0.4628, + "step": 34808 + }, + { + "epoch": 19.4463687150838, + "grad_norm": 2.1380856037139893, + "learning_rate": 2.8207282913165266e-05, + "loss": 0.3825, + "step": 34809 + }, + { + "epoch": 19.446927374301676, + "grad_norm": 1.1035913228988647, + "learning_rate": 2.8179271708683472e-05, + "loss": 0.4055, + "step": 34810 + }, + { + "epoch": 19.447486033519553, + "grad_norm": 0.5296691060066223, + "learning_rate": 2.815126050420168e-05, + "loss": 0.5932, + "step": 34811 + }, + { + "epoch": 19.44804469273743, + "grad_norm": 0.35844045877456665, + "learning_rate": 2.8123249299719887e-05, + "loss": 0.3516, + "step": 34812 + }, + { + "epoch": 19.448603351955306, + "grad_norm": 0.8088261485099792, + "learning_rate": 2.8095238095238096e-05, + "loss": 0.5878, + "step": 34813 + }, + { + "epoch": 19.449162011173183, + "grad_norm": 0.5955832004547119, + "learning_rate": 2.8067226890756302e-05, + "loss": 0.4585, + "step": 34814 + }, + { + "epoch": 19.449720670391063, + "grad_norm": 0.47626441717147827, + "learning_rate": 2.803921568627451e-05, + "loss": 0.378, + "step": 34815 + }, + { + "epoch": 19.45027932960894, + "grad_norm": 0.8115229606628418, + "learning_rate": 2.8011204481792718e-05, + "loss": 0.4401, + "step": 34816 + }, + { + "epoch": 19.450837988826816, + "grad_norm": 0.4454791843891144, + "learning_rate": 2.7983193277310924e-05, + "loss": 0.3236, + "step": 34817 + }, + { + "epoch": 19.451396648044692, + "grad_norm": 0.5104039311408997, + "learning_rate": 2.795518207282913e-05, + "loss": 0.4567, + "step": 34818 + }, + { + "epoch": 19.45195530726257, + "grad_norm": 0.6141294240951538, + "learning_rate": 2.792717086834734e-05, + "loss": 0.5223, + "step": 34819 + }, + { + "epoch": 19.452513966480446, + "grad_norm": 0.3874828517436981, + "learning_rate": 2.7899159663865545e-05, + "loss": 0.4499, + "step": 34820 + }, + { + "epoch": 19.453072625698326, + "grad_norm": 3.971581220626831, + "learning_rate": 2.7871148459383755e-05, + "loss": 0.3135, + "step": 34821 + }, + { + "epoch": 19.453631284916202, + "grad_norm": 0.400783509016037, + "learning_rate": 2.784313725490196e-05, + "loss": 0.3877, + "step": 34822 + }, + { + "epoch": 19.45418994413408, + "grad_norm": 1.1234560012817383, + "learning_rate": 2.7815126050420167e-05, + "loss": 0.3208, + "step": 34823 + }, + { + "epoch": 19.454748603351955, + "grad_norm": 0.7502797842025757, + "learning_rate": 2.7787114845938376e-05, + "loss": 0.3751, + "step": 34824 + }, + { + "epoch": 19.455307262569832, + "grad_norm": 0.6240797638893127, + "learning_rate": 2.7759103641456582e-05, + "loss": 0.5704, + "step": 34825 + }, + { + "epoch": 19.45586592178771, + "grad_norm": 0.4749441146850586, + "learning_rate": 2.7731092436974788e-05, + "loss": 0.4114, + "step": 34826 + }, + { + "epoch": 19.456424581005585, + "grad_norm": 0.7239744067192078, + "learning_rate": 2.7703081232492997e-05, + "loss": 0.4083, + "step": 34827 + }, + { + "epoch": 19.456983240223465, + "grad_norm": 0.7120631337165833, + "learning_rate": 2.7675070028011203e-05, + "loss": 0.4615, + "step": 34828 + }, + { + "epoch": 19.457541899441342, + "grad_norm": 0.469102144241333, + "learning_rate": 2.7647058823529413e-05, + "loss": 0.3855, + "step": 34829 + }, + { + "epoch": 19.45810055865922, + "grad_norm": 5.470970153808594, + "learning_rate": 2.761904761904762e-05, + "loss": 0.4748, + "step": 34830 + }, + { + "epoch": 19.458659217877095, + "grad_norm": 0.3650527894496918, + "learning_rate": 2.7591036414565825e-05, + "loss": 0.399, + "step": 34831 + }, + { + "epoch": 19.45921787709497, + "grad_norm": 0.6419007182121277, + "learning_rate": 2.7563025210084034e-05, + "loss": 0.4371, + "step": 34832 + }, + { + "epoch": 19.459776536312848, + "grad_norm": 0.3608035147190094, + "learning_rate": 2.753501400560224e-05, + "loss": 0.4125, + "step": 34833 + }, + { + "epoch": 19.460335195530725, + "grad_norm": 0.5687878131866455, + "learning_rate": 2.7507002801120446e-05, + "loss": 0.5592, + "step": 34834 + }, + { + "epoch": 19.460893854748605, + "grad_norm": 0.6292167901992798, + "learning_rate": 2.7478991596638655e-05, + "loss": 0.3765, + "step": 34835 + }, + { + "epoch": 19.46145251396648, + "grad_norm": 0.36823347210884094, + "learning_rate": 2.745098039215686e-05, + "loss": 0.3864, + "step": 34836 + }, + { + "epoch": 19.462011173184358, + "grad_norm": 0.980316698551178, + "learning_rate": 2.7422969187675067e-05, + "loss": 0.4543, + "step": 34837 + }, + { + "epoch": 19.462569832402234, + "grad_norm": 0.7499711513519287, + "learning_rate": 2.7394957983193277e-05, + "loss": 0.4236, + "step": 34838 + }, + { + "epoch": 19.46312849162011, + "grad_norm": 0.5976037383079529, + "learning_rate": 2.7366946778711483e-05, + "loss": 0.413, + "step": 34839 + }, + { + "epoch": 19.463687150837988, + "grad_norm": 0.361826628446579, + "learning_rate": 2.7338935574229692e-05, + "loss": 0.4383, + "step": 34840 + }, + { + "epoch": 19.464245810055864, + "grad_norm": 0.5410655736923218, + "learning_rate": 2.7310924369747898e-05, + "loss": 0.3938, + "step": 34841 + }, + { + "epoch": 19.464804469273744, + "grad_norm": 0.6119258403778076, + "learning_rate": 2.7282913165266104e-05, + "loss": 0.5936, + "step": 34842 + }, + { + "epoch": 19.46536312849162, + "grad_norm": 2.1629810333251953, + "learning_rate": 2.7254901960784317e-05, + "loss": 0.4456, + "step": 34843 + }, + { + "epoch": 19.465921787709497, + "grad_norm": 0.49945780634880066, + "learning_rate": 2.7226890756302523e-05, + "loss": 0.4008, + "step": 34844 + }, + { + "epoch": 19.466480446927374, + "grad_norm": 0.36867979168891907, + "learning_rate": 2.719887955182073e-05, + "loss": 0.3071, + "step": 34845 + }, + { + "epoch": 19.46703910614525, + "grad_norm": 2.1073596477508545, + "learning_rate": 2.717086834733894e-05, + "loss": 0.4274, + "step": 34846 + }, + { + "epoch": 19.467597765363127, + "grad_norm": 0.49559345841407776, + "learning_rate": 2.7142857142857144e-05, + "loss": 0.4909, + "step": 34847 + }, + { + "epoch": 19.468156424581007, + "grad_norm": 0.4453836977481842, + "learning_rate": 2.7114845938375354e-05, + "loss": 0.3993, + "step": 34848 + }, + { + "epoch": 19.468715083798884, + "grad_norm": 0.5157610774040222, + "learning_rate": 2.708683473389356e-05, + "loss": 0.5475, + "step": 34849 + }, + { + "epoch": 19.46927374301676, + "grad_norm": 0.5772706866264343, + "learning_rate": 2.7058823529411766e-05, + "loss": 0.3872, + "step": 34850 + }, + { + "epoch": 19.469832402234637, + "grad_norm": 2.2015509605407715, + "learning_rate": 2.7030812324929975e-05, + "loss": 0.4799, + "step": 34851 + }, + { + "epoch": 19.470391061452514, + "grad_norm": 1.1958978176116943, + "learning_rate": 2.700280112044818e-05, + "loss": 0.3247, + "step": 34852 + }, + { + "epoch": 19.47094972067039, + "grad_norm": 0.4813414216041565, + "learning_rate": 2.6974789915966387e-05, + "loss": 0.4392, + "step": 34853 + }, + { + "epoch": 19.471508379888267, + "grad_norm": 0.7284502387046814, + "learning_rate": 2.6946778711484596e-05, + "loss": 0.6559, + "step": 34854 + }, + { + "epoch": 19.472067039106147, + "grad_norm": 4.674539089202881, + "learning_rate": 2.6918767507002802e-05, + "loss": 0.4551, + "step": 34855 + }, + { + "epoch": 19.472625698324023, + "grad_norm": 0.390680193901062, + "learning_rate": 2.6890756302521012e-05, + "loss": 0.3195, + "step": 34856 + }, + { + "epoch": 19.4731843575419, + "grad_norm": 1.7368971109390259, + "learning_rate": 2.6862745098039218e-05, + "loss": 0.485, + "step": 34857 + }, + { + "epoch": 19.473743016759776, + "grad_norm": 0.6366631984710693, + "learning_rate": 2.6834733893557424e-05, + "loss": 0.4, + "step": 34858 + }, + { + "epoch": 19.474301675977653, + "grad_norm": 0.4273139536380768, + "learning_rate": 2.6806722689075633e-05, + "loss": 0.446, + "step": 34859 + }, + { + "epoch": 19.47486033519553, + "grad_norm": 0.4665451943874359, + "learning_rate": 2.677871148459384e-05, + "loss": 0.4566, + "step": 34860 + }, + { + "epoch": 19.475418994413406, + "grad_norm": 0.869002640247345, + "learning_rate": 2.6750700280112045e-05, + "loss": 0.5105, + "step": 34861 + }, + { + "epoch": 19.475977653631286, + "grad_norm": 0.43392908573150635, + "learning_rate": 2.6722689075630255e-05, + "loss": 0.3639, + "step": 34862 + }, + { + "epoch": 19.476536312849163, + "grad_norm": 1.1479334831237793, + "learning_rate": 2.669467787114846e-05, + "loss": 0.4706, + "step": 34863 + }, + { + "epoch": 19.47709497206704, + "grad_norm": 0.5081384778022766, + "learning_rate": 2.666666666666667e-05, + "loss": 0.3925, + "step": 34864 + }, + { + "epoch": 19.477653631284916, + "grad_norm": 0.40185925364494324, + "learning_rate": 2.6638655462184876e-05, + "loss": 0.4877, + "step": 34865 + }, + { + "epoch": 19.478212290502793, + "grad_norm": 0.3650851845741272, + "learning_rate": 2.6610644257703082e-05, + "loss": 0.3393, + "step": 34866 + }, + { + "epoch": 19.47877094972067, + "grad_norm": 0.48613956570625305, + "learning_rate": 2.658263305322129e-05, + "loss": 0.4484, + "step": 34867 + }, + { + "epoch": 19.47932960893855, + "grad_norm": 0.4635118544101715, + "learning_rate": 2.6554621848739497e-05, + "loss": 0.3699, + "step": 34868 + }, + { + "epoch": 19.479888268156426, + "grad_norm": 0.4014201760292053, + "learning_rate": 2.6526610644257703e-05, + "loss": 0.249, + "step": 34869 + }, + { + "epoch": 19.480446927374302, + "grad_norm": 1.2354720830917358, + "learning_rate": 2.6498599439775913e-05, + "loss": 0.5171, + "step": 34870 + }, + { + "epoch": 19.48100558659218, + "grad_norm": 0.33539703488349915, + "learning_rate": 2.647058823529412e-05, + "loss": 0.3626, + "step": 34871 + }, + { + "epoch": 19.481564245810056, + "grad_norm": 0.6215862035751343, + "learning_rate": 2.6442577030812325e-05, + "loss": 0.3721, + "step": 34872 + }, + { + "epoch": 19.482122905027932, + "grad_norm": 0.38046738505363464, + "learning_rate": 2.6414565826330534e-05, + "loss": 0.4454, + "step": 34873 + }, + { + "epoch": 19.48268156424581, + "grad_norm": 0.3673492670059204, + "learning_rate": 2.638655462184874e-05, + "loss": 0.4318, + "step": 34874 + }, + { + "epoch": 19.48324022346369, + "grad_norm": 0.4376643896102905, + "learning_rate": 2.635854341736695e-05, + "loss": 0.4907, + "step": 34875 + }, + { + "epoch": 19.483798882681565, + "grad_norm": 0.3730255961418152, + "learning_rate": 2.6330532212885155e-05, + "loss": 0.3338, + "step": 34876 + }, + { + "epoch": 19.484357541899442, + "grad_norm": 0.5789554119110107, + "learning_rate": 2.630252100840336e-05, + "loss": 0.3071, + "step": 34877 + }, + { + "epoch": 19.48491620111732, + "grad_norm": 0.4080667495727539, + "learning_rate": 2.627450980392157e-05, + "loss": 0.3741, + "step": 34878 + }, + { + "epoch": 19.485474860335195, + "grad_norm": 0.3730865716934204, + "learning_rate": 2.6246498599439777e-05, + "loss": 0.3944, + "step": 34879 + }, + { + "epoch": 19.48603351955307, + "grad_norm": 0.369336873292923, + "learning_rate": 2.6218487394957983e-05, + "loss": 0.4123, + "step": 34880 + }, + { + "epoch": 19.486592178770948, + "grad_norm": 0.5278117656707764, + "learning_rate": 2.6190476190476192e-05, + "loss": 0.4091, + "step": 34881 + }, + { + "epoch": 19.48715083798883, + "grad_norm": 1.122765064239502, + "learning_rate": 2.6162464985994398e-05, + "loss": 0.3808, + "step": 34882 + }, + { + "epoch": 19.487709497206705, + "grad_norm": 0.7268039584159851, + "learning_rate": 2.6134453781512608e-05, + "loss": 0.3949, + "step": 34883 + }, + { + "epoch": 19.48826815642458, + "grad_norm": 3.184790849685669, + "learning_rate": 2.6106442577030814e-05, + "loss": 0.3785, + "step": 34884 + }, + { + "epoch": 19.488826815642458, + "grad_norm": 0.3668678104877472, + "learning_rate": 2.607843137254902e-05, + "loss": 0.3556, + "step": 34885 + }, + { + "epoch": 19.489385474860335, + "grad_norm": 0.40232452750205994, + "learning_rate": 2.605042016806723e-05, + "loss": 0.4073, + "step": 34886 + }, + { + "epoch": 19.48994413407821, + "grad_norm": 0.3640769422054291, + "learning_rate": 2.6022408963585435e-05, + "loss": 0.4083, + "step": 34887 + }, + { + "epoch": 19.490502793296088, + "grad_norm": 0.42225614190101624, + "learning_rate": 2.599439775910364e-05, + "loss": 0.51, + "step": 34888 + }, + { + "epoch": 19.491061452513968, + "grad_norm": 3.6921956539154053, + "learning_rate": 2.596638655462185e-05, + "loss": 0.5421, + "step": 34889 + }, + { + "epoch": 19.491620111731844, + "grad_norm": 0.6919388771057129, + "learning_rate": 2.5938375350140056e-05, + "loss": 0.6297, + "step": 34890 + }, + { + "epoch": 19.49217877094972, + "grad_norm": 0.6777781844139099, + "learning_rate": 2.5910364145658266e-05, + "loss": 0.392, + "step": 34891 + }, + { + "epoch": 19.492737430167598, + "grad_norm": 0.39118492603302, + "learning_rate": 2.5882352941176472e-05, + "loss": 0.3772, + "step": 34892 + }, + { + "epoch": 19.493296089385474, + "grad_norm": 0.3688738942146301, + "learning_rate": 2.5854341736694678e-05, + "loss": 0.3981, + "step": 34893 + }, + { + "epoch": 19.49385474860335, + "grad_norm": 0.33815377950668335, + "learning_rate": 2.5826330532212887e-05, + "loss": 0.3424, + "step": 34894 + }, + { + "epoch": 19.49441340782123, + "grad_norm": 0.34132981300354004, + "learning_rate": 2.5798319327731093e-05, + "loss": 0.3532, + "step": 34895 + }, + { + "epoch": 19.494972067039107, + "grad_norm": 2.5869383811950684, + "learning_rate": 2.57703081232493e-05, + "loss": 0.3117, + "step": 34896 + }, + { + "epoch": 19.495530726256984, + "grad_norm": 0.6172873973846436, + "learning_rate": 2.574229691876751e-05, + "loss": 0.3574, + "step": 34897 + }, + { + "epoch": 19.49608938547486, + "grad_norm": 0.44866862893104553, + "learning_rate": 2.5714285714285714e-05, + "loss": 0.2921, + "step": 34898 + }, + { + "epoch": 19.496648044692737, + "grad_norm": 0.8284403085708618, + "learning_rate": 2.5686274509803924e-05, + "loss": 0.4283, + "step": 34899 + }, + { + "epoch": 19.497206703910614, + "grad_norm": 1.1001683473587036, + "learning_rate": 2.565826330532213e-05, + "loss": 0.4978, + "step": 34900 + }, + { + "epoch": 19.49776536312849, + "grad_norm": 0.5014494061470032, + "learning_rate": 2.5630252100840336e-05, + "loss": 0.3411, + "step": 34901 + }, + { + "epoch": 19.49832402234637, + "grad_norm": 0.5027364492416382, + "learning_rate": 2.5602240896358545e-05, + "loss": 0.4445, + "step": 34902 + }, + { + "epoch": 19.498882681564247, + "grad_norm": 0.4541790783405304, + "learning_rate": 2.557422969187675e-05, + "loss": 0.4546, + "step": 34903 + }, + { + "epoch": 19.499441340782123, + "grad_norm": 0.44293051958084106, + "learning_rate": 2.5546218487394957e-05, + "loss": 0.3856, + "step": 34904 + }, + { + "epoch": 19.5, + "grad_norm": 0.42757755517959595, + "learning_rate": 2.5518207282913167e-05, + "loss": 0.3473, + "step": 34905 + }, + { + "epoch": 19.500558659217877, + "grad_norm": 0.8330206274986267, + "learning_rate": 2.5490196078431373e-05, + "loss": 0.3887, + "step": 34906 + }, + { + "epoch": 19.501117318435753, + "grad_norm": 0.40243637561798096, + "learning_rate": 2.546218487394958e-05, + "loss": 0.2796, + "step": 34907 + }, + { + "epoch": 19.50167597765363, + "grad_norm": 0.39788687229156494, + "learning_rate": 2.5434173669467788e-05, + "loss": 0.388, + "step": 34908 + }, + { + "epoch": 19.50223463687151, + "grad_norm": 1.57988440990448, + "learning_rate": 2.5406162464985994e-05, + "loss": 0.3657, + "step": 34909 + }, + { + "epoch": 19.502793296089386, + "grad_norm": 0.3384858965873718, + "learning_rate": 2.5378151260504203e-05, + "loss": 0.3439, + "step": 34910 + }, + { + "epoch": 19.503351955307263, + "grad_norm": 0.9958170652389526, + "learning_rate": 2.535014005602241e-05, + "loss": 0.3799, + "step": 34911 + }, + { + "epoch": 19.50391061452514, + "grad_norm": 0.642697274684906, + "learning_rate": 2.5322128851540615e-05, + "loss": 0.4593, + "step": 34912 + }, + { + "epoch": 19.504469273743016, + "grad_norm": 0.3647029995918274, + "learning_rate": 2.5294117647058825e-05, + "loss": 0.4773, + "step": 34913 + }, + { + "epoch": 19.505027932960893, + "grad_norm": 0.6092699766159058, + "learning_rate": 2.526610644257703e-05, + "loss": 0.4089, + "step": 34914 + }, + { + "epoch": 19.505586592178773, + "grad_norm": 1.4237099885940552, + "learning_rate": 2.5238095238095237e-05, + "loss": 0.4038, + "step": 34915 + }, + { + "epoch": 19.50614525139665, + "grad_norm": 0.4050527811050415, + "learning_rate": 2.5210084033613446e-05, + "loss": 0.3612, + "step": 34916 + }, + { + "epoch": 19.506703910614526, + "grad_norm": 0.3727430999279022, + "learning_rate": 2.5182072829131652e-05, + "loss": 0.3419, + "step": 34917 + }, + { + "epoch": 19.507262569832402, + "grad_norm": 0.6199318766593933, + "learning_rate": 2.515406162464986e-05, + "loss": 0.3683, + "step": 34918 + }, + { + "epoch": 19.50782122905028, + "grad_norm": 0.5812656283378601, + "learning_rate": 2.5126050420168067e-05, + "loss": 0.2762, + "step": 34919 + }, + { + "epoch": 19.508379888268156, + "grad_norm": 2.1323699951171875, + "learning_rate": 2.5098039215686273e-05, + "loss": 0.4134, + "step": 34920 + }, + { + "epoch": 19.508938547486032, + "grad_norm": 0.6260694861412048, + "learning_rate": 2.5070028011204483e-05, + "loss": 0.4046, + "step": 34921 + }, + { + "epoch": 19.509497206703912, + "grad_norm": 0.36617955565452576, + "learning_rate": 2.504201680672269e-05, + "loss": 0.4208, + "step": 34922 + }, + { + "epoch": 19.51005586592179, + "grad_norm": 0.37258216738700867, + "learning_rate": 2.5014005602240895e-05, + "loss": 0.3871, + "step": 34923 + }, + { + "epoch": 19.510614525139665, + "grad_norm": 0.35397523641586304, + "learning_rate": 2.4985994397759104e-05, + "loss": 0.4331, + "step": 34924 + }, + { + "epoch": 19.511173184357542, + "grad_norm": 2.2395682334899902, + "learning_rate": 2.495798319327731e-05, + "loss": 0.5199, + "step": 34925 + }, + { + "epoch": 19.51173184357542, + "grad_norm": 0.42774754762649536, + "learning_rate": 2.492997198879552e-05, + "loss": 0.4441, + "step": 34926 + }, + { + "epoch": 19.512290502793295, + "grad_norm": 0.5070825815200806, + "learning_rate": 2.4901960784313726e-05, + "loss": 0.4599, + "step": 34927 + }, + { + "epoch": 19.51284916201117, + "grad_norm": 1.0811023712158203, + "learning_rate": 2.487394957983193e-05, + "loss": 0.408, + "step": 34928 + }, + { + "epoch": 19.513407821229052, + "grad_norm": 0.42528045177459717, + "learning_rate": 2.484593837535014e-05, + "loss": 0.383, + "step": 34929 + }, + { + "epoch": 19.51396648044693, + "grad_norm": 0.4363587200641632, + "learning_rate": 2.4817927170868347e-05, + "loss": 0.5726, + "step": 34930 + }, + { + "epoch": 19.514525139664805, + "grad_norm": 0.42945781350135803, + "learning_rate": 2.4789915966386553e-05, + "loss": 0.4024, + "step": 34931 + }, + { + "epoch": 19.51508379888268, + "grad_norm": 0.47450146079063416, + "learning_rate": 2.4761904761904762e-05, + "loss": 0.4144, + "step": 34932 + }, + { + "epoch": 19.515642458100558, + "grad_norm": 0.4275537133216858, + "learning_rate": 2.473389355742297e-05, + "loss": 0.4054, + "step": 34933 + }, + { + "epoch": 19.516201117318435, + "grad_norm": 0.3809202015399933, + "learning_rate": 2.4705882352941174e-05, + "loss": 0.4281, + "step": 34934 + }, + { + "epoch": 19.51675977653631, + "grad_norm": 0.38599687814712524, + "learning_rate": 2.4677871148459384e-05, + "loss": 0.3593, + "step": 34935 + }, + { + "epoch": 19.51731843575419, + "grad_norm": 0.38828933238983154, + "learning_rate": 2.464985994397759e-05, + "loss": 0.3805, + "step": 34936 + }, + { + "epoch": 19.517877094972068, + "grad_norm": 2.617799997329712, + "learning_rate": 2.46218487394958e-05, + "loss": 0.3243, + "step": 34937 + }, + { + "epoch": 19.518435754189944, + "grad_norm": 0.44306081533432007, + "learning_rate": 2.4593837535014005e-05, + "loss": 0.3522, + "step": 34938 + }, + { + "epoch": 19.51899441340782, + "grad_norm": 0.41663122177124023, + "learning_rate": 2.456582633053221e-05, + "loss": 0.3711, + "step": 34939 + }, + { + "epoch": 19.519553072625698, + "grad_norm": 0.40998974442481995, + "learning_rate": 2.453781512605042e-05, + "loss": 0.3916, + "step": 34940 + }, + { + "epoch": 19.520111731843574, + "grad_norm": 2.1621267795562744, + "learning_rate": 2.4509803921568626e-05, + "loss": 0.4845, + "step": 34941 + }, + { + "epoch": 19.52067039106145, + "grad_norm": 0.4144487977027893, + "learning_rate": 2.4481792717086832e-05, + "loss": 0.4182, + "step": 34942 + }, + { + "epoch": 19.52122905027933, + "grad_norm": 0.8371133208274841, + "learning_rate": 2.4453781512605042e-05, + "loss": 0.4985, + "step": 34943 + }, + { + "epoch": 19.521787709497207, + "grad_norm": 0.3932962119579315, + "learning_rate": 2.4425770308123248e-05, + "loss": 0.4071, + "step": 34944 + }, + { + "epoch": 19.522346368715084, + "grad_norm": 0.3749922513961792, + "learning_rate": 2.4397759103641457e-05, + "loss": 0.4593, + "step": 34945 + }, + { + "epoch": 19.52290502793296, + "grad_norm": 0.7596540451049805, + "learning_rate": 2.4369747899159663e-05, + "loss": 0.4766, + "step": 34946 + }, + { + "epoch": 19.523463687150837, + "grad_norm": 0.38084205985069275, + "learning_rate": 2.434173669467787e-05, + "loss": 0.3785, + "step": 34947 + }, + { + "epoch": 19.524022346368714, + "grad_norm": 0.34402021765708923, + "learning_rate": 2.431372549019608e-05, + "loss": 0.4099, + "step": 34948 + }, + { + "epoch": 19.524581005586594, + "grad_norm": 2.6963329315185547, + "learning_rate": 2.4285714285714285e-05, + "loss": 0.3745, + "step": 34949 + }, + { + "epoch": 19.52513966480447, + "grad_norm": 0.5655094981193542, + "learning_rate": 2.425770308123249e-05, + "loss": 0.4122, + "step": 34950 + }, + { + "epoch": 19.525698324022347, + "grad_norm": 0.4889974892139435, + "learning_rate": 2.42296918767507e-05, + "loss": 0.3939, + "step": 34951 + }, + { + "epoch": 19.526256983240224, + "grad_norm": 0.47463908791542053, + "learning_rate": 2.4201680672268906e-05, + "loss": 0.359, + "step": 34952 + }, + { + "epoch": 19.5268156424581, + "grad_norm": 0.39065998792648315, + "learning_rate": 2.4173669467787115e-05, + "loss": 0.3533, + "step": 34953 + }, + { + "epoch": 19.527374301675977, + "grad_norm": 0.6336252093315125, + "learning_rate": 2.414565826330532e-05, + "loss": 0.3926, + "step": 34954 + }, + { + "epoch": 19.527932960893853, + "grad_norm": 0.6006150245666504, + "learning_rate": 2.4117647058823527e-05, + "loss": 0.5883, + "step": 34955 + }, + { + "epoch": 19.528491620111733, + "grad_norm": 0.9705725312232971, + "learning_rate": 2.4089635854341737e-05, + "loss": 0.3489, + "step": 34956 + }, + { + "epoch": 19.52905027932961, + "grad_norm": 0.42252224683761597, + "learning_rate": 2.4061624649859943e-05, + "loss": 0.4638, + "step": 34957 + }, + { + "epoch": 19.529608938547486, + "grad_norm": 0.30472639203071594, + "learning_rate": 2.403361344537815e-05, + "loss": 0.2838, + "step": 34958 + }, + { + "epoch": 19.530167597765363, + "grad_norm": 0.4616810381412506, + "learning_rate": 2.400560224089636e-05, + "loss": 0.3634, + "step": 34959 + }, + { + "epoch": 19.53072625698324, + "grad_norm": 0.2783021926879883, + "learning_rate": 2.3977591036414567e-05, + "loss": 0.2769, + "step": 34960 + }, + { + "epoch": 19.531284916201116, + "grad_norm": 0.4363904297351837, + "learning_rate": 2.3949579831932777e-05, + "loss": 0.3294, + "step": 34961 + }, + { + "epoch": 19.531843575418993, + "grad_norm": 0.7102211713790894, + "learning_rate": 2.3921568627450983e-05, + "loss": 0.4197, + "step": 34962 + }, + { + "epoch": 19.532402234636873, + "grad_norm": 0.6642683744430542, + "learning_rate": 2.389355742296919e-05, + "loss": 0.4695, + "step": 34963 + }, + { + "epoch": 19.53296089385475, + "grad_norm": 0.31163617968559265, + "learning_rate": 2.3865546218487398e-05, + "loss": 0.3135, + "step": 34964 + }, + { + "epoch": 19.533519553072626, + "grad_norm": 0.37404903769493103, + "learning_rate": 2.3837535014005604e-05, + "loss": 0.4106, + "step": 34965 + }, + { + "epoch": 19.534078212290503, + "grad_norm": 0.4966489374637604, + "learning_rate": 2.380952380952381e-05, + "loss": 0.4069, + "step": 34966 + }, + { + "epoch": 19.53463687150838, + "grad_norm": 0.5149335861206055, + "learning_rate": 2.378151260504202e-05, + "loss": 0.3693, + "step": 34967 + }, + { + "epoch": 19.535195530726256, + "grad_norm": 0.47929149866104126, + "learning_rate": 2.3753501400560226e-05, + "loss": 0.4462, + "step": 34968 + }, + { + "epoch": 19.535754189944136, + "grad_norm": 0.4186246395111084, + "learning_rate": 2.372549019607843e-05, + "loss": 0.3949, + "step": 34969 + }, + { + "epoch": 19.536312849162012, + "grad_norm": 0.5047544836997986, + "learning_rate": 2.369747899159664e-05, + "loss": 0.3725, + "step": 34970 + }, + { + "epoch": 19.53687150837989, + "grad_norm": 0.8106202483177185, + "learning_rate": 2.3669467787114847e-05, + "loss": 0.3615, + "step": 34971 + }, + { + "epoch": 19.537430167597766, + "grad_norm": 0.38692641258239746, + "learning_rate": 2.3641456582633056e-05, + "loss": 0.3533, + "step": 34972 + }, + { + "epoch": 19.537988826815642, + "grad_norm": 0.37314605712890625, + "learning_rate": 2.3613445378151262e-05, + "loss": 0.3369, + "step": 34973 + }, + { + "epoch": 19.53854748603352, + "grad_norm": 0.41331860423088074, + "learning_rate": 2.3585434173669468e-05, + "loss": 0.3654, + "step": 34974 + }, + { + "epoch": 19.539106145251395, + "grad_norm": 0.4739454984664917, + "learning_rate": 2.3557422969187678e-05, + "loss": 0.4056, + "step": 34975 + }, + { + "epoch": 19.539664804469275, + "grad_norm": 0.487355500459671, + "learning_rate": 2.3529411764705884e-05, + "loss": 0.474, + "step": 34976 + }, + { + "epoch": 19.540223463687152, + "grad_norm": 0.4497007131576538, + "learning_rate": 2.350140056022409e-05, + "loss": 0.3209, + "step": 34977 + }, + { + "epoch": 19.54078212290503, + "grad_norm": 0.5294460654258728, + "learning_rate": 2.34733893557423e-05, + "loss": 0.3927, + "step": 34978 + }, + { + "epoch": 19.541340782122905, + "grad_norm": 0.776323139667511, + "learning_rate": 2.3445378151260505e-05, + "loss": 0.3937, + "step": 34979 + }, + { + "epoch": 19.54189944134078, + "grad_norm": 1.2215746641159058, + "learning_rate": 2.3417366946778714e-05, + "loss": 0.3795, + "step": 34980 + }, + { + "epoch": 19.542458100558658, + "grad_norm": 0.6829772591590881, + "learning_rate": 2.338935574229692e-05, + "loss": 0.4107, + "step": 34981 + }, + { + "epoch": 19.543016759776535, + "grad_norm": 0.42722100019454956, + "learning_rate": 2.3361344537815126e-05, + "loss": 0.4278, + "step": 34982 + }, + { + "epoch": 19.543575418994415, + "grad_norm": 1.1316921710968018, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.3934, + "step": 34983 + }, + { + "epoch": 19.54413407821229, + "grad_norm": 0.4051525890827179, + "learning_rate": 2.3305322128851542e-05, + "loss": 0.2968, + "step": 34984 + }, + { + "epoch": 19.544692737430168, + "grad_norm": 0.47396060824394226, + "learning_rate": 2.3277310924369748e-05, + "loss": 0.4394, + "step": 34985 + }, + { + "epoch": 19.545251396648045, + "grad_norm": 0.3654646873474121, + "learning_rate": 2.3249299719887957e-05, + "loss": 0.3884, + "step": 34986 + }, + { + "epoch": 19.54581005586592, + "grad_norm": 0.6824849843978882, + "learning_rate": 2.3221288515406163e-05, + "loss": 0.3576, + "step": 34987 + }, + { + "epoch": 19.546368715083798, + "grad_norm": 1.8359508514404297, + "learning_rate": 2.3193277310924373e-05, + "loss": 0.4149, + "step": 34988 + }, + { + "epoch": 19.546927374301674, + "grad_norm": 0.470069944858551, + "learning_rate": 2.316526610644258e-05, + "loss": 0.317, + "step": 34989 + }, + { + "epoch": 19.547486033519554, + "grad_norm": 0.3980754315853119, + "learning_rate": 2.3137254901960785e-05, + "loss": 0.2888, + "step": 34990 + }, + { + "epoch": 19.54804469273743, + "grad_norm": 0.7938000559806824, + "learning_rate": 2.3109243697478994e-05, + "loss": 0.4342, + "step": 34991 + }, + { + "epoch": 19.548603351955308, + "grad_norm": 0.6193335652351379, + "learning_rate": 2.30812324929972e-05, + "loss": 0.4072, + "step": 34992 + }, + { + "epoch": 19.549162011173184, + "grad_norm": 0.3488420844078064, + "learning_rate": 2.3053221288515406e-05, + "loss": 0.3046, + "step": 34993 + }, + { + "epoch": 19.54972067039106, + "grad_norm": 0.5678327679634094, + "learning_rate": 2.3025210084033615e-05, + "loss": 0.4197, + "step": 34994 + }, + { + "epoch": 19.550279329608937, + "grad_norm": 0.4888420104980469, + "learning_rate": 2.299719887955182e-05, + "loss": 0.4346, + "step": 34995 + }, + { + "epoch": 19.550837988826817, + "grad_norm": 0.3524285852909088, + "learning_rate": 2.296918767507003e-05, + "loss": 0.3785, + "step": 34996 + }, + { + "epoch": 19.551396648044694, + "grad_norm": 0.42418041825294495, + "learning_rate": 2.2941176470588237e-05, + "loss": 0.3854, + "step": 34997 + }, + { + "epoch": 19.55195530726257, + "grad_norm": 0.6350045800209045, + "learning_rate": 2.2913165266106443e-05, + "loss": 0.4108, + "step": 34998 + }, + { + "epoch": 19.552513966480447, + "grad_norm": 0.48599985241889954, + "learning_rate": 2.2885154061624652e-05, + "loss": 0.4059, + "step": 34999 + }, + { + "epoch": 19.553072625698324, + "grad_norm": 4.1164679527282715, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.3567, + "step": 35000 + }, + { + "epoch": 19.553072625698324, + "eval_cer": 0.08433439167294032, + "eval_loss": 0.3190707564353943, + "eval_runtime": 55.5481, + "eval_samples_per_second": 81.695, + "eval_steps_per_second": 5.113, + "eval_wer": 0.33387335549286334, + "step": 35000 + }, + { + "epoch": 19.5536312849162, + "grad_norm": 0.6705339550971985, + "learning_rate": 2.2829131652661064e-05, + "loss": 0.3228, + "step": 35001 + }, + { + "epoch": 19.554189944134077, + "grad_norm": 0.8339620232582092, + "learning_rate": 2.2801120448179273e-05, + "loss": 0.3974, + "step": 35002 + }, + { + "epoch": 19.554748603351957, + "grad_norm": 0.3480084538459778, + "learning_rate": 2.277310924369748e-05, + "loss": 0.3924, + "step": 35003 + }, + { + "epoch": 19.555307262569833, + "grad_norm": 0.4313034117221832, + "learning_rate": 2.2745098039215685e-05, + "loss": 0.4389, + "step": 35004 + }, + { + "epoch": 19.55586592178771, + "grad_norm": 0.49800875782966614, + "learning_rate": 2.2717086834733895e-05, + "loss": 0.4895, + "step": 35005 + }, + { + "epoch": 19.556424581005587, + "grad_norm": 0.35939350724220276, + "learning_rate": 2.26890756302521e-05, + "loss": 0.4327, + "step": 35006 + }, + { + "epoch": 19.556983240223463, + "grad_norm": 0.46768736839294434, + "learning_rate": 2.266106442577031e-05, + "loss": 0.3753, + "step": 35007 + }, + { + "epoch": 19.55754189944134, + "grad_norm": 1.2855732440948486, + "learning_rate": 2.2633053221288516e-05, + "loss": 0.3511, + "step": 35008 + }, + { + "epoch": 19.558100558659216, + "grad_norm": 0.309354305267334, + "learning_rate": 2.2605042016806722e-05, + "loss": 0.3547, + "step": 35009 + }, + { + "epoch": 19.558659217877096, + "grad_norm": 0.5491271615028381, + "learning_rate": 2.257703081232493e-05, + "loss": 0.5688, + "step": 35010 + }, + { + "epoch": 19.559217877094973, + "grad_norm": 0.4882706105709076, + "learning_rate": 2.2549019607843138e-05, + "loss": 0.4048, + "step": 35011 + }, + { + "epoch": 19.55977653631285, + "grad_norm": 2.1504855155944824, + "learning_rate": 2.2521008403361344e-05, + "loss": 0.3801, + "step": 35012 + }, + { + "epoch": 19.560335195530726, + "grad_norm": 0.33867359161376953, + "learning_rate": 2.2492997198879553e-05, + "loss": 0.3937, + "step": 35013 + }, + { + "epoch": 19.560893854748603, + "grad_norm": 0.44972822070121765, + "learning_rate": 2.246498599439776e-05, + "loss": 0.3847, + "step": 35014 + }, + { + "epoch": 19.56145251396648, + "grad_norm": 0.36333194375038147, + "learning_rate": 2.2436974789915968e-05, + "loss": 0.346, + "step": 35015 + }, + { + "epoch": 19.56201117318436, + "grad_norm": 0.7097443342208862, + "learning_rate": 2.2408963585434174e-05, + "loss": 0.401, + "step": 35016 + }, + { + "epoch": 19.562569832402236, + "grad_norm": 0.49143320322036743, + "learning_rate": 2.238095238095238e-05, + "loss": 0.3742, + "step": 35017 + }, + { + "epoch": 19.563128491620112, + "grad_norm": 0.39414462447166443, + "learning_rate": 2.235294117647059e-05, + "loss": 0.468, + "step": 35018 + }, + { + "epoch": 19.56368715083799, + "grad_norm": 0.5225555896759033, + "learning_rate": 2.2324929971988796e-05, + "loss": 0.422, + "step": 35019 + }, + { + "epoch": 19.564245810055866, + "grad_norm": 0.5444976091384888, + "learning_rate": 2.2296918767507e-05, + "loss": 0.4103, + "step": 35020 + }, + { + "epoch": 19.564804469273742, + "grad_norm": 1.5434752702713013, + "learning_rate": 2.226890756302521e-05, + "loss": 0.4295, + "step": 35021 + }, + { + "epoch": 19.56536312849162, + "grad_norm": 0.43311455845832825, + "learning_rate": 2.2240896358543417e-05, + "loss": 0.4176, + "step": 35022 + }, + { + "epoch": 19.5659217877095, + "grad_norm": 3.5761990547180176, + "learning_rate": 2.2212885154061626e-05, + "loss": 0.526, + "step": 35023 + }, + { + "epoch": 19.566480446927375, + "grad_norm": 0.35101911425590515, + "learning_rate": 2.2184873949579832e-05, + "loss": 0.4223, + "step": 35024 + }, + { + "epoch": 19.567039106145252, + "grad_norm": 0.407419890165329, + "learning_rate": 2.215686274509804e-05, + "loss": 0.3558, + "step": 35025 + }, + { + "epoch": 19.56759776536313, + "grad_norm": 0.6738312840461731, + "learning_rate": 2.2128851540616248e-05, + "loss": 0.4056, + "step": 35026 + }, + { + "epoch": 19.568156424581005, + "grad_norm": 0.3665103614330292, + "learning_rate": 2.2100840336134454e-05, + "loss": 0.365, + "step": 35027 + }, + { + "epoch": 19.56871508379888, + "grad_norm": 0.4983854293823242, + "learning_rate": 2.207282913165266e-05, + "loss": 0.3619, + "step": 35028 + }, + { + "epoch": 19.56927374301676, + "grad_norm": 0.7285831570625305, + "learning_rate": 2.204481792717087e-05, + "loss": 0.5124, + "step": 35029 + }, + { + "epoch": 19.56983240223464, + "grad_norm": 0.5841778516769409, + "learning_rate": 2.2016806722689075e-05, + "loss": 0.4676, + "step": 35030 + }, + { + "epoch": 19.570391061452515, + "grad_norm": 0.4914669692516327, + "learning_rate": 2.1988795518207285e-05, + "loss": 0.4527, + "step": 35031 + }, + { + "epoch": 19.57094972067039, + "grad_norm": 0.5048649311065674, + "learning_rate": 2.196078431372549e-05, + "loss": 0.3912, + "step": 35032 + }, + { + "epoch": 19.571508379888268, + "grad_norm": 0.3908544182777405, + "learning_rate": 2.1932773109243697e-05, + "loss": 0.4351, + "step": 35033 + }, + { + "epoch": 19.572067039106145, + "grad_norm": 0.4516026973724365, + "learning_rate": 2.1904761904761906e-05, + "loss": 0.4406, + "step": 35034 + }, + { + "epoch": 19.57262569832402, + "grad_norm": 0.3828793168067932, + "learning_rate": 2.1876750700280112e-05, + "loss": 0.3799, + "step": 35035 + }, + { + "epoch": 19.573184357541898, + "grad_norm": 0.4044423997402191, + "learning_rate": 2.1848739495798318e-05, + "loss": 0.3958, + "step": 35036 + }, + { + "epoch": 19.573743016759778, + "grad_norm": 0.32164379954338074, + "learning_rate": 2.1820728291316527e-05, + "loss": 0.2731, + "step": 35037 + }, + { + "epoch": 19.574301675977654, + "grad_norm": 0.683765709400177, + "learning_rate": 2.1792717086834733e-05, + "loss": 0.4894, + "step": 35038 + }, + { + "epoch": 19.57486033519553, + "grad_norm": 0.4297809898853302, + "learning_rate": 2.176470588235294e-05, + "loss": 0.3929, + "step": 35039 + }, + { + "epoch": 19.575418994413408, + "grad_norm": 0.4439571499824524, + "learning_rate": 2.173669467787115e-05, + "loss": 0.4521, + "step": 35040 + }, + { + "epoch": 19.575977653631284, + "grad_norm": 0.4166252911090851, + "learning_rate": 2.1708683473389355e-05, + "loss": 0.3794, + "step": 35041 + }, + { + "epoch": 19.57653631284916, + "grad_norm": 0.38812676072120667, + "learning_rate": 2.1680672268907564e-05, + "loss": 0.3045, + "step": 35042 + }, + { + "epoch": 19.577094972067037, + "grad_norm": 0.8387113213539124, + "learning_rate": 2.165266106442577e-05, + "loss": 0.4833, + "step": 35043 + }, + { + "epoch": 19.577653631284917, + "grad_norm": 0.538284420967102, + "learning_rate": 2.1624649859943976e-05, + "loss": 0.4566, + "step": 35044 + }, + { + "epoch": 19.578212290502794, + "grad_norm": 0.407591849565506, + "learning_rate": 2.1596638655462185e-05, + "loss": 0.3746, + "step": 35045 + }, + { + "epoch": 19.57877094972067, + "grad_norm": 0.4515630006790161, + "learning_rate": 2.156862745098039e-05, + "loss": 0.3487, + "step": 35046 + }, + { + "epoch": 19.579329608938547, + "grad_norm": 0.7752786874771118, + "learning_rate": 2.1540616246498597e-05, + "loss": 0.6461, + "step": 35047 + }, + { + "epoch": 19.579888268156424, + "grad_norm": 0.39946651458740234, + "learning_rate": 2.1512605042016807e-05, + "loss": 0.3889, + "step": 35048 + }, + { + "epoch": 19.5804469273743, + "grad_norm": 0.6564873456954956, + "learning_rate": 2.1484593837535013e-05, + "loss": 0.3389, + "step": 35049 + }, + { + "epoch": 19.58100558659218, + "grad_norm": 0.4666491746902466, + "learning_rate": 2.1456582633053222e-05, + "loss": 0.3886, + "step": 35050 + }, + { + "epoch": 19.581564245810057, + "grad_norm": 0.4656910002231598, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.3586, + "step": 35051 + }, + { + "epoch": 19.582122905027934, + "grad_norm": 0.5210119485855103, + "learning_rate": 2.1400560224089634e-05, + "loss": 0.3565, + "step": 35052 + }, + { + "epoch": 19.58268156424581, + "grad_norm": 0.7861770987510681, + "learning_rate": 2.1372549019607844e-05, + "loss": 0.4018, + "step": 35053 + }, + { + "epoch": 19.583240223463687, + "grad_norm": 0.45525625348091125, + "learning_rate": 2.134453781512605e-05, + "loss": 0.4268, + "step": 35054 + }, + { + "epoch": 19.583798882681563, + "grad_norm": 0.3897051215171814, + "learning_rate": 2.1316526610644256e-05, + "loss": 0.4099, + "step": 35055 + }, + { + "epoch": 19.58435754189944, + "grad_norm": 0.5481089353561401, + "learning_rate": 2.1288515406162465e-05, + "loss": 0.4637, + "step": 35056 + }, + { + "epoch": 19.58491620111732, + "grad_norm": 0.42182138562202454, + "learning_rate": 2.126050420168067e-05, + "loss": 0.4193, + "step": 35057 + }, + { + "epoch": 19.585474860335196, + "grad_norm": 1.3122252225875854, + "learning_rate": 2.123249299719888e-05, + "loss": 0.3148, + "step": 35058 + }, + { + "epoch": 19.586033519553073, + "grad_norm": 0.2764971852302551, + "learning_rate": 2.1204481792717086e-05, + "loss": 0.2922, + "step": 35059 + }, + { + "epoch": 19.58659217877095, + "grad_norm": 0.3993698060512543, + "learning_rate": 2.1176470588235292e-05, + "loss": 0.394, + "step": 35060 + }, + { + "epoch": 19.587150837988826, + "grad_norm": 3.5782339572906494, + "learning_rate": 2.11484593837535e-05, + "loss": 0.3107, + "step": 35061 + }, + { + "epoch": 19.587709497206703, + "grad_norm": 0.6111013293266296, + "learning_rate": 2.1120448179271708e-05, + "loss": 0.4053, + "step": 35062 + }, + { + "epoch": 19.58826815642458, + "grad_norm": 0.5928622484207153, + "learning_rate": 2.1092436974789914e-05, + "loss": 0.4382, + "step": 35063 + }, + { + "epoch": 19.58882681564246, + "grad_norm": 0.5645527243614197, + "learning_rate": 2.1064425770308123e-05, + "loss": 0.3882, + "step": 35064 + }, + { + "epoch": 19.589385474860336, + "grad_norm": 0.44165825843811035, + "learning_rate": 2.103641456582633e-05, + "loss": 0.4407, + "step": 35065 + }, + { + "epoch": 19.589944134078213, + "grad_norm": 0.6013810038566589, + "learning_rate": 2.100840336134454e-05, + "loss": 0.3858, + "step": 35066 + }, + { + "epoch": 19.59050279329609, + "grad_norm": 0.5412119030952454, + "learning_rate": 2.0980392156862744e-05, + "loss": 0.4121, + "step": 35067 + }, + { + "epoch": 19.591061452513966, + "grad_norm": 0.5015369057655334, + "learning_rate": 2.095238095238095e-05, + "loss": 0.3899, + "step": 35068 + }, + { + "epoch": 19.591620111731842, + "grad_norm": 0.39011484384536743, + "learning_rate": 2.092436974789916e-05, + "loss": 0.4125, + "step": 35069 + }, + { + "epoch": 19.592178770949722, + "grad_norm": 1.8045885562896729, + "learning_rate": 2.0896358543417366e-05, + "loss": 0.4131, + "step": 35070 + }, + { + "epoch": 19.5927374301676, + "grad_norm": 3.1306211948394775, + "learning_rate": 2.0868347338935572e-05, + "loss": 0.3615, + "step": 35071 + }, + { + "epoch": 19.593296089385476, + "grad_norm": 0.4504767060279846, + "learning_rate": 2.0840336134453785e-05, + "loss": 0.333, + "step": 35072 + }, + { + "epoch": 19.593854748603352, + "grad_norm": 1.0253909826278687, + "learning_rate": 2.081232492997199e-05, + "loss": 0.4006, + "step": 35073 + }, + { + "epoch": 19.59441340782123, + "grad_norm": 0.38497379422187805, + "learning_rate": 2.0784313725490197e-05, + "loss": 0.36, + "step": 35074 + }, + { + "epoch": 19.594972067039105, + "grad_norm": 0.414264976978302, + "learning_rate": 2.0756302521008406e-05, + "loss": 0.3276, + "step": 35075 + }, + { + "epoch": 19.595530726256982, + "grad_norm": 0.5852633118629456, + "learning_rate": 2.0728291316526612e-05, + "loss": 0.4064, + "step": 35076 + }, + { + "epoch": 19.596089385474862, + "grad_norm": 3.511029005050659, + "learning_rate": 2.070028011204482e-05, + "loss": 0.3782, + "step": 35077 + }, + { + "epoch": 19.59664804469274, + "grad_norm": 0.32487791776657104, + "learning_rate": 2.0672268907563027e-05, + "loss": 0.3451, + "step": 35078 + }, + { + "epoch": 19.597206703910615, + "grad_norm": 0.46119052171707153, + "learning_rate": 2.0644257703081233e-05, + "loss": 0.3973, + "step": 35079 + }, + { + "epoch": 19.59776536312849, + "grad_norm": 0.3387055993080139, + "learning_rate": 2.0616246498599443e-05, + "loss": 0.3683, + "step": 35080 + }, + { + "epoch": 19.598324022346368, + "grad_norm": 2.1004459857940674, + "learning_rate": 2.058823529411765e-05, + "loss": 0.4996, + "step": 35081 + }, + { + "epoch": 19.598882681564245, + "grad_norm": 0.5597602725028992, + "learning_rate": 2.0560224089635855e-05, + "loss": 0.3378, + "step": 35082 + }, + { + "epoch": 19.59944134078212, + "grad_norm": 0.4323146641254425, + "learning_rate": 2.0532212885154064e-05, + "loss": 0.3227, + "step": 35083 + }, + { + "epoch": 19.6, + "grad_norm": 0.4124544560909271, + "learning_rate": 2.050420168067227e-05, + "loss": 0.3843, + "step": 35084 + }, + { + "epoch": 19.600558659217878, + "grad_norm": 0.5325616598129272, + "learning_rate": 2.047619047619048e-05, + "loss": 0.4052, + "step": 35085 + }, + { + "epoch": 19.601117318435755, + "grad_norm": 0.3568335473537445, + "learning_rate": 2.0448179271708685e-05, + "loss": 0.325, + "step": 35086 + }, + { + "epoch": 19.60167597765363, + "grad_norm": 0.510695219039917, + "learning_rate": 2.042016806722689e-05, + "loss": 0.5959, + "step": 35087 + }, + { + "epoch": 19.602234636871508, + "grad_norm": 1.3934470415115356, + "learning_rate": 2.03921568627451e-05, + "loss": 0.3895, + "step": 35088 + }, + { + "epoch": 19.602793296089384, + "grad_norm": 0.3698326349258423, + "learning_rate": 2.0364145658263307e-05, + "loss": 0.4022, + "step": 35089 + }, + { + "epoch": 19.60335195530726, + "grad_norm": 0.6723500490188599, + "learning_rate": 2.0336134453781513e-05, + "loss": 0.4444, + "step": 35090 + }, + { + "epoch": 19.60391061452514, + "grad_norm": 0.4389816224575043, + "learning_rate": 2.0308123249299722e-05, + "loss": 0.3771, + "step": 35091 + }, + { + "epoch": 19.604469273743018, + "grad_norm": 0.5208408832550049, + "learning_rate": 2.0280112044817928e-05, + "loss": 0.4235, + "step": 35092 + }, + { + "epoch": 19.605027932960894, + "grad_norm": 0.4646131098270416, + "learning_rate": 2.0252100840336138e-05, + "loss": 0.3677, + "step": 35093 + }, + { + "epoch": 19.60558659217877, + "grad_norm": 0.557743489742279, + "learning_rate": 2.0224089635854344e-05, + "loss": 0.5623, + "step": 35094 + }, + { + "epoch": 19.606145251396647, + "grad_norm": 0.5055925250053406, + "learning_rate": 2.019607843137255e-05, + "loss": 0.4085, + "step": 35095 + }, + { + "epoch": 19.606703910614524, + "grad_norm": 0.5961707830429077, + "learning_rate": 2.016806722689076e-05, + "loss": 0.5964, + "step": 35096 + }, + { + "epoch": 19.607262569832404, + "grad_norm": 0.35368698835372925, + "learning_rate": 2.0140056022408965e-05, + "loss": 0.3608, + "step": 35097 + }, + { + "epoch": 19.60782122905028, + "grad_norm": 2.274745464324951, + "learning_rate": 2.011204481792717e-05, + "loss": 0.4127, + "step": 35098 + }, + { + "epoch": 19.608379888268157, + "grad_norm": 0.353723406791687, + "learning_rate": 2.008403361344538e-05, + "loss": 0.4429, + "step": 35099 + }, + { + "epoch": 19.608938547486034, + "grad_norm": 0.337365984916687, + "learning_rate": 2.0056022408963586e-05, + "loss": 0.3128, + "step": 35100 + }, + { + "epoch": 19.60949720670391, + "grad_norm": 0.44131410121917725, + "learning_rate": 2.0028011204481796e-05, + "loss": 0.4234, + "step": 35101 + }, + { + "epoch": 19.610055865921787, + "grad_norm": 0.4321846663951874, + "learning_rate": 2e-05, + "loss": 0.452, + "step": 35102 + }, + { + "epoch": 19.610614525139663, + "grad_norm": 0.41311925649642944, + "learning_rate": 1.9971988795518208e-05, + "loss": 0.3764, + "step": 35103 + }, + { + "epoch": 19.611173184357543, + "grad_norm": 0.4405032694339752, + "learning_rate": 1.9943977591036417e-05, + "loss": 0.3733, + "step": 35104 + }, + { + "epoch": 19.61173184357542, + "grad_norm": 0.506595253944397, + "learning_rate": 1.9915966386554623e-05, + "loss": 0.4627, + "step": 35105 + }, + { + "epoch": 19.612290502793297, + "grad_norm": 1.0056757926940918, + "learning_rate": 1.988795518207283e-05, + "loss": 0.3021, + "step": 35106 + }, + { + "epoch": 19.612849162011173, + "grad_norm": 0.4197084605693817, + "learning_rate": 1.985994397759104e-05, + "loss": 0.4676, + "step": 35107 + }, + { + "epoch": 19.61340782122905, + "grad_norm": 0.44864171743392944, + "learning_rate": 1.9831932773109244e-05, + "loss": 0.3103, + "step": 35108 + }, + { + "epoch": 19.613966480446926, + "grad_norm": 1.8107091188430786, + "learning_rate": 1.980392156862745e-05, + "loss": 0.4444, + "step": 35109 + }, + { + "epoch": 19.614525139664803, + "grad_norm": 0.5745518803596497, + "learning_rate": 1.977591036414566e-05, + "loss": 0.4349, + "step": 35110 + }, + { + "epoch": 19.615083798882683, + "grad_norm": 0.43389493227005005, + "learning_rate": 1.9747899159663866e-05, + "loss": 0.4509, + "step": 35111 + }, + { + "epoch": 19.61564245810056, + "grad_norm": 0.516075611114502, + "learning_rate": 1.9719887955182075e-05, + "loss": 0.4, + "step": 35112 + }, + { + "epoch": 19.616201117318436, + "grad_norm": 0.42388996481895447, + "learning_rate": 1.969187675070028e-05, + "loss": 0.2859, + "step": 35113 + }, + { + "epoch": 19.616759776536313, + "grad_norm": 0.4811156094074249, + "learning_rate": 1.9663865546218487e-05, + "loss": 0.5608, + "step": 35114 + }, + { + "epoch": 19.61731843575419, + "grad_norm": 2.0465471744537354, + "learning_rate": 1.9635854341736697e-05, + "loss": 0.6116, + "step": 35115 + }, + { + "epoch": 19.617877094972066, + "grad_norm": 0.6585325598716736, + "learning_rate": 1.9607843137254903e-05, + "loss": 0.5078, + "step": 35116 + }, + { + "epoch": 19.618435754189946, + "grad_norm": 0.4917345643043518, + "learning_rate": 1.957983193277311e-05, + "loss": 0.289, + "step": 35117 + }, + { + "epoch": 19.618994413407822, + "grad_norm": 1.1267015933990479, + "learning_rate": 1.9551820728291318e-05, + "loss": 0.573, + "step": 35118 + }, + { + "epoch": 19.6195530726257, + "grad_norm": 0.3880383372306824, + "learning_rate": 1.9523809523809524e-05, + "loss": 0.3758, + "step": 35119 + }, + { + "epoch": 19.620111731843576, + "grad_norm": 2.414391040802002, + "learning_rate": 1.9495798319327733e-05, + "loss": 0.3644, + "step": 35120 + }, + { + "epoch": 19.620670391061452, + "grad_norm": 0.5096579790115356, + "learning_rate": 1.946778711484594e-05, + "loss": 0.4748, + "step": 35121 + }, + { + "epoch": 19.62122905027933, + "grad_norm": 0.693800687789917, + "learning_rate": 1.9439775910364145e-05, + "loss": 0.5959, + "step": 35122 + }, + { + "epoch": 19.621787709497205, + "grad_norm": 0.4663922190666199, + "learning_rate": 1.9411764705882355e-05, + "loss": 0.3605, + "step": 35123 + }, + { + "epoch": 19.622346368715085, + "grad_norm": 0.4367486536502838, + "learning_rate": 1.938375350140056e-05, + "loss": 0.4726, + "step": 35124 + }, + { + "epoch": 19.622905027932962, + "grad_norm": 0.6484634280204773, + "learning_rate": 1.9355742296918767e-05, + "loss": 0.4415, + "step": 35125 + }, + { + "epoch": 19.62346368715084, + "grad_norm": 0.3513046205043793, + "learning_rate": 1.9327731092436976e-05, + "loss": 0.269, + "step": 35126 + }, + { + "epoch": 19.624022346368715, + "grad_norm": 0.4450730085372925, + "learning_rate": 1.9299719887955182e-05, + "loss": 0.4782, + "step": 35127 + }, + { + "epoch": 19.62458100558659, + "grad_norm": 0.45933833718299866, + "learning_rate": 1.927170868347339e-05, + "loss": 0.4778, + "step": 35128 + }, + { + "epoch": 19.62513966480447, + "grad_norm": 3.330951690673828, + "learning_rate": 1.9243697478991597e-05, + "loss": 0.5655, + "step": 35129 + }, + { + "epoch": 19.625698324022345, + "grad_norm": 1.941396713256836, + "learning_rate": 1.9215686274509803e-05, + "loss": 0.553, + "step": 35130 + }, + { + "epoch": 19.626256983240225, + "grad_norm": 0.43774154782295227, + "learning_rate": 1.9187675070028013e-05, + "loss": 0.4074, + "step": 35131 + }, + { + "epoch": 19.6268156424581, + "grad_norm": 0.43905583024024963, + "learning_rate": 1.915966386554622e-05, + "loss": 0.457, + "step": 35132 + }, + { + "epoch": 19.627374301675978, + "grad_norm": 0.45976385474205017, + "learning_rate": 1.9131652661064425e-05, + "loss": 0.3267, + "step": 35133 + }, + { + "epoch": 19.627932960893855, + "grad_norm": 0.6096678972244263, + "learning_rate": 1.9103641456582634e-05, + "loss": 0.4661, + "step": 35134 + }, + { + "epoch": 19.62849162011173, + "grad_norm": 0.35695183277130127, + "learning_rate": 1.907563025210084e-05, + "loss": 0.4388, + "step": 35135 + }, + { + "epoch": 19.629050279329608, + "grad_norm": 0.30571630597114563, + "learning_rate": 1.904761904761905e-05, + "loss": 0.3864, + "step": 35136 + }, + { + "epoch": 19.629608938547484, + "grad_norm": 1.6890528202056885, + "learning_rate": 1.9019607843137255e-05, + "loss": 0.4257, + "step": 35137 + }, + { + "epoch": 19.630167597765364, + "grad_norm": 0.3722410500049591, + "learning_rate": 1.899159663865546e-05, + "loss": 0.4071, + "step": 35138 + }, + { + "epoch": 19.63072625698324, + "grad_norm": 0.4365113079547882, + "learning_rate": 1.896358543417367e-05, + "loss": 0.5847, + "step": 35139 + }, + { + "epoch": 19.631284916201118, + "grad_norm": 0.3387068212032318, + "learning_rate": 1.8935574229691877e-05, + "loss": 0.3363, + "step": 35140 + }, + { + "epoch": 19.631843575418994, + "grad_norm": 0.49587541818618774, + "learning_rate": 1.8907563025210083e-05, + "loss": 0.3513, + "step": 35141 + }, + { + "epoch": 19.63240223463687, + "grad_norm": 0.469573050737381, + "learning_rate": 1.8879551820728292e-05, + "loss": 0.508, + "step": 35142 + }, + { + "epoch": 19.632960893854747, + "grad_norm": 0.4225670099258423, + "learning_rate": 1.8851540616246498e-05, + "loss": 0.4525, + "step": 35143 + }, + { + "epoch": 19.633519553072627, + "grad_norm": 0.49111419916152954, + "learning_rate": 1.8823529411764704e-05, + "loss": 0.4103, + "step": 35144 + }, + { + "epoch": 19.634078212290504, + "grad_norm": 0.416355699300766, + "learning_rate": 1.8795518207282914e-05, + "loss": 0.4392, + "step": 35145 + }, + { + "epoch": 19.63463687150838, + "grad_norm": 0.49225419759750366, + "learning_rate": 1.876750700280112e-05, + "loss": 0.4132, + "step": 35146 + }, + { + "epoch": 19.635195530726257, + "grad_norm": 1.0314242839813232, + "learning_rate": 1.873949579831933e-05, + "loss": 0.3441, + "step": 35147 + }, + { + "epoch": 19.635754189944134, + "grad_norm": 0.39084526896476746, + "learning_rate": 1.8711484593837535e-05, + "loss": 0.3476, + "step": 35148 + }, + { + "epoch": 19.63631284916201, + "grad_norm": 1.059799313545227, + "learning_rate": 1.868347338935574e-05, + "loss": 0.3516, + "step": 35149 + }, + { + "epoch": 19.636871508379887, + "grad_norm": 0.4000471234321594, + "learning_rate": 1.865546218487395e-05, + "loss": 0.3686, + "step": 35150 + }, + { + "epoch": 19.637430167597767, + "grad_norm": 0.45973414182662964, + "learning_rate": 1.8627450980392156e-05, + "loss": 0.4799, + "step": 35151 + }, + { + "epoch": 19.637988826815644, + "grad_norm": 0.875268816947937, + "learning_rate": 1.8599439775910362e-05, + "loss": 0.3757, + "step": 35152 + }, + { + "epoch": 19.63854748603352, + "grad_norm": 0.589676558971405, + "learning_rate": 1.8571428571428572e-05, + "loss": 0.3621, + "step": 35153 + }, + { + "epoch": 19.639106145251397, + "grad_norm": 0.3321785628795624, + "learning_rate": 1.8543417366946778e-05, + "loss": 0.2663, + "step": 35154 + }, + { + "epoch": 19.639664804469273, + "grad_norm": 0.40286576747894287, + "learning_rate": 1.8515406162464987e-05, + "loss": 0.348, + "step": 35155 + }, + { + "epoch": 19.64022346368715, + "grad_norm": 0.5802273154258728, + "learning_rate": 1.8487394957983193e-05, + "loss": 0.3961, + "step": 35156 + }, + { + "epoch": 19.640782122905026, + "grad_norm": 17.08884620666504, + "learning_rate": 1.84593837535014e-05, + "loss": 0.3829, + "step": 35157 + }, + { + "epoch": 19.641340782122906, + "grad_norm": 3.4307515621185303, + "learning_rate": 1.843137254901961e-05, + "loss": 0.3988, + "step": 35158 + }, + { + "epoch": 19.641899441340783, + "grad_norm": 0.45335379242897034, + "learning_rate": 1.8403361344537814e-05, + "loss": 0.3577, + "step": 35159 + }, + { + "epoch": 19.64245810055866, + "grad_norm": 0.44909825921058655, + "learning_rate": 1.837535014005602e-05, + "loss": 0.4292, + "step": 35160 + }, + { + "epoch": 19.643016759776536, + "grad_norm": 0.38877925276756287, + "learning_rate": 1.834733893557423e-05, + "loss": 0.4574, + "step": 35161 + }, + { + "epoch": 19.643575418994413, + "grad_norm": 0.5969152450561523, + "learning_rate": 1.8319327731092436e-05, + "loss": 0.5093, + "step": 35162 + }, + { + "epoch": 19.64413407821229, + "grad_norm": 0.6126071810722351, + "learning_rate": 1.8291316526610645e-05, + "loss": 0.3742, + "step": 35163 + }, + { + "epoch": 19.64469273743017, + "grad_norm": 0.7234187722206116, + "learning_rate": 1.826330532212885e-05, + "loss": 0.5294, + "step": 35164 + }, + { + "epoch": 19.645251396648046, + "grad_norm": 0.3739009201526642, + "learning_rate": 1.8235294117647057e-05, + "loss": 0.3257, + "step": 35165 + }, + { + "epoch": 19.645810055865923, + "grad_norm": 0.35236263275146484, + "learning_rate": 1.8207282913165267e-05, + "loss": 0.3686, + "step": 35166 + }, + { + "epoch": 19.6463687150838, + "grad_norm": 0.614061176776886, + "learning_rate": 1.8179271708683473e-05, + "loss": 0.4206, + "step": 35167 + }, + { + "epoch": 19.646927374301676, + "grad_norm": 0.37418922781944275, + "learning_rate": 1.815126050420168e-05, + "loss": 0.448, + "step": 35168 + }, + { + "epoch": 19.647486033519552, + "grad_norm": 0.456551730632782, + "learning_rate": 1.8123249299719888e-05, + "loss": 0.3942, + "step": 35169 + }, + { + "epoch": 19.64804469273743, + "grad_norm": 6.372609615325928, + "learning_rate": 1.8095238095238094e-05, + "loss": 0.3743, + "step": 35170 + }, + { + "epoch": 19.64860335195531, + "grad_norm": 0.6308823823928833, + "learning_rate": 1.8067226890756303e-05, + "loss": 0.5636, + "step": 35171 + }, + { + "epoch": 19.649162011173186, + "grad_norm": 0.3686907887458801, + "learning_rate": 1.803921568627451e-05, + "loss": 0.4425, + "step": 35172 + }, + { + "epoch": 19.649720670391062, + "grad_norm": 5.129613876342773, + "learning_rate": 1.8011204481792715e-05, + "loss": 0.3914, + "step": 35173 + }, + { + "epoch": 19.65027932960894, + "grad_norm": 1.5420482158660889, + "learning_rate": 1.7983193277310925e-05, + "loss": 0.4004, + "step": 35174 + }, + { + "epoch": 19.650837988826815, + "grad_norm": 0.517649233341217, + "learning_rate": 1.795518207282913e-05, + "loss": 0.5575, + "step": 35175 + }, + { + "epoch": 19.65139664804469, + "grad_norm": 1.0616827011108398, + "learning_rate": 1.7927170868347337e-05, + "loss": 0.3526, + "step": 35176 + }, + { + "epoch": 19.65195530726257, + "grad_norm": 0.5183215141296387, + "learning_rate": 1.7899159663865546e-05, + "loss": 0.4371, + "step": 35177 + }, + { + "epoch": 19.65251396648045, + "grad_norm": 0.34969913959503174, + "learning_rate": 1.7871148459383752e-05, + "loss": 0.3941, + "step": 35178 + }, + { + "epoch": 19.653072625698325, + "grad_norm": 0.3530867099761963, + "learning_rate": 1.7843137254901958e-05, + "loss": 0.3556, + "step": 35179 + }, + { + "epoch": 19.6536312849162, + "grad_norm": 0.38902702927589417, + "learning_rate": 1.7815126050420167e-05, + "loss": 0.4256, + "step": 35180 + }, + { + "epoch": 19.654189944134078, + "grad_norm": 0.9034125804901123, + "learning_rate": 1.7787114845938373e-05, + "loss": 0.3844, + "step": 35181 + }, + { + "epoch": 19.654748603351955, + "grad_norm": 0.7404069900512695, + "learning_rate": 1.7759103641456583e-05, + "loss": 0.4225, + "step": 35182 + }, + { + "epoch": 19.65530726256983, + "grad_norm": 0.41993728280067444, + "learning_rate": 1.773109243697479e-05, + "loss": 0.4018, + "step": 35183 + }, + { + "epoch": 19.655865921787708, + "grad_norm": 0.6756036877632141, + "learning_rate": 1.7703081232492995e-05, + "loss": 0.476, + "step": 35184 + }, + { + "epoch": 19.656424581005588, + "grad_norm": 0.4275253415107727, + "learning_rate": 1.7675070028011204e-05, + "loss": 0.3528, + "step": 35185 + }, + { + "epoch": 19.656983240223465, + "grad_norm": 5.222357749938965, + "learning_rate": 1.764705882352941e-05, + "loss": 0.7583, + "step": 35186 + }, + { + "epoch": 19.65754189944134, + "grad_norm": 0.6256312131881714, + "learning_rate": 1.7619047619047616e-05, + "loss": 0.4369, + "step": 35187 + }, + { + "epoch": 19.658100558659218, + "grad_norm": 0.5469828248023987, + "learning_rate": 1.759103641456583e-05, + "loss": 0.39, + "step": 35188 + }, + { + "epoch": 19.658659217877094, + "grad_norm": 0.7170218825340271, + "learning_rate": 1.7563025210084035e-05, + "loss": 0.4195, + "step": 35189 + }, + { + "epoch": 19.65921787709497, + "grad_norm": 0.5640260577201843, + "learning_rate": 1.7535014005602244e-05, + "loss": 0.3716, + "step": 35190 + }, + { + "epoch": 19.659776536312847, + "grad_norm": 0.7066543102264404, + "learning_rate": 1.750700280112045e-05, + "loss": 0.5033, + "step": 35191 + }, + { + "epoch": 19.660335195530728, + "grad_norm": 1.5083872079849243, + "learning_rate": 1.7478991596638656e-05, + "loss": 0.4156, + "step": 35192 + }, + { + "epoch": 19.660893854748604, + "grad_norm": 0.46740296483039856, + "learning_rate": 1.7450980392156866e-05, + "loss": 0.4254, + "step": 35193 + }, + { + "epoch": 19.66145251396648, + "grad_norm": 0.526253342628479, + "learning_rate": 1.7422969187675072e-05, + "loss": 0.4311, + "step": 35194 + }, + { + "epoch": 19.662011173184357, + "grad_norm": 0.46243155002593994, + "learning_rate": 1.7394957983193278e-05, + "loss": 0.3734, + "step": 35195 + }, + { + "epoch": 19.662569832402234, + "grad_norm": 0.5958356261253357, + "learning_rate": 1.7366946778711487e-05, + "loss": 0.5343, + "step": 35196 + }, + { + "epoch": 19.66312849162011, + "grad_norm": 0.6704958081245422, + "learning_rate": 1.7338935574229693e-05, + "loss": 0.3482, + "step": 35197 + }, + { + "epoch": 19.66368715083799, + "grad_norm": 1.38182532787323, + "learning_rate": 1.7310924369747902e-05, + "loss": 0.3423, + "step": 35198 + }, + { + "epoch": 19.664245810055867, + "grad_norm": 0.4332784116268158, + "learning_rate": 1.728291316526611e-05, + "loss": 0.3564, + "step": 35199 + }, + { + "epoch": 19.664804469273744, + "grad_norm": 0.41296276450157166, + "learning_rate": 1.7254901960784314e-05, + "loss": 0.4204, + "step": 35200 + }, + { + "epoch": 19.66536312849162, + "grad_norm": 0.5465002059936523, + "learning_rate": 1.7226890756302524e-05, + "loss": 0.4535, + "step": 35201 + }, + { + "epoch": 19.665921787709497, + "grad_norm": 3.1349384784698486, + "learning_rate": 1.719887955182073e-05, + "loss": 0.4663, + "step": 35202 + }, + { + "epoch": 19.666480446927373, + "grad_norm": 0.5418531894683838, + "learning_rate": 1.7170868347338936e-05, + "loss": 0.3629, + "step": 35203 + }, + { + "epoch": 19.66703910614525, + "grad_norm": 0.41018274426460266, + "learning_rate": 1.7142857142857145e-05, + "loss": 0.3583, + "step": 35204 + }, + { + "epoch": 19.66759776536313, + "grad_norm": 1.4892946481704712, + "learning_rate": 1.711484593837535e-05, + "loss": 0.4989, + "step": 35205 + }, + { + "epoch": 19.668156424581007, + "grad_norm": 0.3679587244987488, + "learning_rate": 1.7086834733893557e-05, + "loss": 0.2931, + "step": 35206 + }, + { + "epoch": 19.668715083798883, + "grad_norm": 0.3493441641330719, + "learning_rate": 1.7058823529411767e-05, + "loss": 0.3441, + "step": 35207 + }, + { + "epoch": 19.66927374301676, + "grad_norm": 0.3642297387123108, + "learning_rate": 1.7030812324929973e-05, + "loss": 0.348, + "step": 35208 + }, + { + "epoch": 19.669832402234636, + "grad_norm": 0.48526158928871155, + "learning_rate": 1.7002801120448182e-05, + "loss": 0.4682, + "step": 35209 + }, + { + "epoch": 19.670391061452513, + "grad_norm": 0.5626773834228516, + "learning_rate": 1.6974789915966388e-05, + "loss": 0.3355, + "step": 35210 + }, + { + "epoch": 19.67094972067039, + "grad_norm": 0.3787447214126587, + "learning_rate": 1.6946778711484594e-05, + "loss": 0.3065, + "step": 35211 + }, + { + "epoch": 19.67150837988827, + "grad_norm": 0.5290059447288513, + "learning_rate": 1.6918767507002803e-05, + "loss": 0.3894, + "step": 35212 + }, + { + "epoch": 19.672067039106146, + "grad_norm": 0.5625268816947937, + "learning_rate": 1.689075630252101e-05, + "loss": 0.369, + "step": 35213 + }, + { + "epoch": 19.672625698324023, + "grad_norm": 0.5560200810432434, + "learning_rate": 1.6862745098039215e-05, + "loss": 0.3481, + "step": 35214 + }, + { + "epoch": 19.6731843575419, + "grad_norm": 0.4861189126968384, + "learning_rate": 1.6834733893557425e-05, + "loss": 0.354, + "step": 35215 + }, + { + "epoch": 19.673743016759776, + "grad_norm": 0.3645785450935364, + "learning_rate": 1.680672268907563e-05, + "loss": 0.3124, + "step": 35216 + }, + { + "epoch": 19.674301675977652, + "grad_norm": 0.4363611340522766, + "learning_rate": 1.677871148459384e-05, + "loss": 0.3786, + "step": 35217 + }, + { + "epoch": 19.674860335195532, + "grad_norm": 0.4242125451564789, + "learning_rate": 1.6750700280112046e-05, + "loss": 0.3411, + "step": 35218 + }, + { + "epoch": 19.67541899441341, + "grad_norm": 0.6243457198143005, + "learning_rate": 1.6722689075630252e-05, + "loss": 0.4185, + "step": 35219 + }, + { + "epoch": 19.675977653631286, + "grad_norm": 0.38432514667510986, + "learning_rate": 1.669467787114846e-05, + "loss": 0.3351, + "step": 35220 + }, + { + "epoch": 19.676536312849162, + "grad_norm": 0.47736856341362, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.3836, + "step": 35221 + }, + { + "epoch": 19.67709497206704, + "grad_norm": 0.3078649342060089, + "learning_rate": 1.6638655462184873e-05, + "loss": 0.302, + "step": 35222 + }, + { + "epoch": 19.677653631284915, + "grad_norm": 0.44212305545806885, + "learning_rate": 1.6610644257703083e-05, + "loss": 0.3989, + "step": 35223 + }, + { + "epoch": 19.678212290502792, + "grad_norm": 0.3291095197200775, + "learning_rate": 1.658263305322129e-05, + "loss": 0.3245, + "step": 35224 + }, + { + "epoch": 19.678770949720672, + "grad_norm": 0.5435345768928528, + "learning_rate": 1.6554621848739498e-05, + "loss": 0.384, + "step": 35225 + }, + { + "epoch": 19.67932960893855, + "grad_norm": 0.5516030192375183, + "learning_rate": 1.6526610644257704e-05, + "loss": 0.5009, + "step": 35226 + }, + { + "epoch": 19.679888268156425, + "grad_norm": 0.29875752329826355, + "learning_rate": 1.649859943977591e-05, + "loss": 0.3368, + "step": 35227 + }, + { + "epoch": 19.6804469273743, + "grad_norm": 0.6018951535224915, + "learning_rate": 1.647058823529412e-05, + "loss": 0.3497, + "step": 35228 + }, + { + "epoch": 19.68100558659218, + "grad_norm": 0.5232781767845154, + "learning_rate": 1.6442577030812326e-05, + "loss": 0.4386, + "step": 35229 + }, + { + "epoch": 19.681564245810055, + "grad_norm": 0.40044742822647095, + "learning_rate": 1.641456582633053e-05, + "loss": 0.3798, + "step": 35230 + }, + { + "epoch": 19.68212290502793, + "grad_norm": 0.514755368232727, + "learning_rate": 1.638655462184874e-05, + "loss": 0.5279, + "step": 35231 + }, + { + "epoch": 19.68268156424581, + "grad_norm": 0.37741488218307495, + "learning_rate": 1.6358543417366947e-05, + "loss": 0.3553, + "step": 35232 + }, + { + "epoch": 19.683240223463688, + "grad_norm": 2.0443544387817383, + "learning_rate": 1.6330532212885156e-05, + "loss": 0.4384, + "step": 35233 + }, + { + "epoch": 19.683798882681565, + "grad_norm": 0.9448440670967102, + "learning_rate": 1.6302521008403362e-05, + "loss": 0.3242, + "step": 35234 + }, + { + "epoch": 19.68435754189944, + "grad_norm": 0.43367841839790344, + "learning_rate": 1.627450980392157e-05, + "loss": 0.3235, + "step": 35235 + }, + { + "epoch": 19.684916201117318, + "grad_norm": 0.4780968129634857, + "learning_rate": 1.6246498599439778e-05, + "loss": 0.4009, + "step": 35236 + }, + { + "epoch": 19.685474860335194, + "grad_norm": 0.42769503593444824, + "learning_rate": 1.6218487394957984e-05, + "loss": 0.5156, + "step": 35237 + }, + { + "epoch": 19.68603351955307, + "grad_norm": 0.4197807312011719, + "learning_rate": 1.619047619047619e-05, + "loss": 0.3193, + "step": 35238 + }, + { + "epoch": 19.68659217877095, + "grad_norm": 1.7733579874038696, + "learning_rate": 1.61624649859944e-05, + "loss": 0.3267, + "step": 35239 + }, + { + "epoch": 19.687150837988828, + "grad_norm": 0.5177894830703735, + "learning_rate": 1.6134453781512605e-05, + "loss": 0.3525, + "step": 35240 + }, + { + "epoch": 19.687709497206704, + "grad_norm": 0.5187222361564636, + "learning_rate": 1.610644257703081e-05, + "loss": 0.4945, + "step": 35241 + }, + { + "epoch": 19.68826815642458, + "grad_norm": 0.527868390083313, + "learning_rate": 1.607843137254902e-05, + "loss": 0.4959, + "step": 35242 + }, + { + "epoch": 19.688826815642457, + "grad_norm": 1.0807161331176758, + "learning_rate": 1.6050420168067226e-05, + "loss": 0.3997, + "step": 35243 + }, + { + "epoch": 19.689385474860334, + "grad_norm": 1.081878662109375, + "learning_rate": 1.6022408963585436e-05, + "loss": 0.4295, + "step": 35244 + }, + { + "epoch": 19.689944134078214, + "grad_norm": 0.5213796496391296, + "learning_rate": 1.5994397759103642e-05, + "loss": 0.4486, + "step": 35245 + }, + { + "epoch": 19.69050279329609, + "grad_norm": 0.9523847103118896, + "learning_rate": 1.5966386554621848e-05, + "loss": 0.3599, + "step": 35246 + }, + { + "epoch": 19.691061452513967, + "grad_norm": 0.5482472777366638, + "learning_rate": 1.5938375350140057e-05, + "loss": 0.3607, + "step": 35247 + }, + { + "epoch": 19.691620111731844, + "grad_norm": 0.6379703283309937, + "learning_rate": 1.5910364145658263e-05, + "loss": 0.3173, + "step": 35248 + }, + { + "epoch": 19.69217877094972, + "grad_norm": 0.4073861539363861, + "learning_rate": 1.588235294117647e-05, + "loss": 0.3485, + "step": 35249 + }, + { + "epoch": 19.692737430167597, + "grad_norm": 0.4410066604614258, + "learning_rate": 1.585434173669468e-05, + "loss": 0.3252, + "step": 35250 + }, + { + "epoch": 19.693296089385473, + "grad_norm": 0.42806291580200195, + "learning_rate": 1.5826330532212885e-05, + "loss": 0.4112, + "step": 35251 + }, + { + "epoch": 19.693854748603353, + "grad_norm": 0.39493927359580994, + "learning_rate": 1.5798319327731094e-05, + "loss": 0.4115, + "step": 35252 + }, + { + "epoch": 19.69441340782123, + "grad_norm": 0.30333179235458374, + "learning_rate": 1.57703081232493e-05, + "loss": 0.3883, + "step": 35253 + }, + { + "epoch": 19.694972067039107, + "grad_norm": 0.4833950400352478, + "learning_rate": 1.5742296918767506e-05, + "loss": 0.4372, + "step": 35254 + }, + { + "epoch": 19.695530726256983, + "grad_norm": 0.4764694571495056, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.5119, + "step": 35255 + }, + { + "epoch": 19.69608938547486, + "grad_norm": 0.431545615196228, + "learning_rate": 1.568627450980392e-05, + "loss": 0.3786, + "step": 35256 + }, + { + "epoch": 19.696648044692736, + "grad_norm": 0.3747271001338959, + "learning_rate": 1.5658263305322127e-05, + "loss": 0.3579, + "step": 35257 + }, + { + "epoch": 19.697206703910613, + "grad_norm": 0.34276649355888367, + "learning_rate": 1.5630252100840337e-05, + "loss": 0.3806, + "step": 35258 + }, + { + "epoch": 19.697765363128493, + "grad_norm": 0.47646328806877136, + "learning_rate": 1.5602240896358543e-05, + "loss": 0.3585, + "step": 35259 + }, + { + "epoch": 19.69832402234637, + "grad_norm": 0.34929677844047546, + "learning_rate": 1.5574229691876752e-05, + "loss": 0.3619, + "step": 35260 + }, + { + "epoch": 19.698882681564246, + "grad_norm": 0.8763848543167114, + "learning_rate": 1.5546218487394958e-05, + "loss": 0.4121, + "step": 35261 + }, + { + "epoch": 19.699441340782123, + "grad_norm": 0.3957643508911133, + "learning_rate": 1.5518207282913167e-05, + "loss": 0.3951, + "step": 35262 + }, + { + "epoch": 19.7, + "grad_norm": 0.48728543519973755, + "learning_rate": 1.5490196078431373e-05, + "loss": 0.4378, + "step": 35263 + }, + { + "epoch": 19.700558659217876, + "grad_norm": 0.4075457751750946, + "learning_rate": 1.546218487394958e-05, + "loss": 0.3792, + "step": 35264 + }, + { + "epoch": 19.701117318435756, + "grad_norm": 0.7470335364341736, + "learning_rate": 1.543417366946779e-05, + "loss": 0.4578, + "step": 35265 + }, + { + "epoch": 19.701675977653633, + "grad_norm": 0.5533445477485657, + "learning_rate": 1.5406162464985995e-05, + "loss": 0.3696, + "step": 35266 + }, + { + "epoch": 19.70223463687151, + "grad_norm": 0.3893565535545349, + "learning_rate": 1.53781512605042e-05, + "loss": 0.4154, + "step": 35267 + }, + { + "epoch": 19.702793296089386, + "grad_norm": 0.5706729292869568, + "learning_rate": 1.535014005602241e-05, + "loss": 0.3055, + "step": 35268 + }, + { + "epoch": 19.703351955307262, + "grad_norm": 0.4026053845882416, + "learning_rate": 1.5322128851540616e-05, + "loss": 0.3227, + "step": 35269 + }, + { + "epoch": 19.70391061452514, + "grad_norm": 0.3754120469093323, + "learning_rate": 1.5294117647058826e-05, + "loss": 0.3245, + "step": 35270 + }, + { + "epoch": 19.704469273743015, + "grad_norm": 0.5366465449333191, + "learning_rate": 1.526610644257703e-05, + "loss": 0.3944, + "step": 35271 + }, + { + "epoch": 19.705027932960895, + "grad_norm": 0.38823482394218445, + "learning_rate": 1.5238095238095238e-05, + "loss": 0.41, + "step": 35272 + }, + { + "epoch": 19.705586592178772, + "grad_norm": 0.6000133752822876, + "learning_rate": 1.5210084033613447e-05, + "loss": 0.3764, + "step": 35273 + }, + { + "epoch": 19.70614525139665, + "grad_norm": 0.4234936237335205, + "learning_rate": 1.5182072829131653e-05, + "loss": 0.3829, + "step": 35274 + }, + { + "epoch": 19.706703910614525, + "grad_norm": 0.38632890582084656, + "learning_rate": 1.515406162464986e-05, + "loss": 0.3921, + "step": 35275 + }, + { + "epoch": 19.7072625698324, + "grad_norm": 0.5408406853675842, + "learning_rate": 1.5126050420168068e-05, + "loss": 0.4565, + "step": 35276 + }, + { + "epoch": 19.70782122905028, + "grad_norm": 0.5979124307632446, + "learning_rate": 1.5098039215686276e-05, + "loss": 0.5407, + "step": 35277 + }, + { + "epoch": 19.708379888268155, + "grad_norm": 1.4332454204559326, + "learning_rate": 1.5070028011204482e-05, + "loss": 0.399, + "step": 35278 + }, + { + "epoch": 19.708938547486035, + "grad_norm": 0.5275501012802124, + "learning_rate": 1.504201680672269e-05, + "loss": 0.3606, + "step": 35279 + }, + { + "epoch": 19.70949720670391, + "grad_norm": 0.6827466487884521, + "learning_rate": 1.5014005602240897e-05, + "loss": 0.4747, + "step": 35280 + }, + { + "epoch": 19.710055865921788, + "grad_norm": 1.2533493041992188, + "learning_rate": 1.4985994397759105e-05, + "loss": 0.393, + "step": 35281 + }, + { + "epoch": 19.710614525139665, + "grad_norm": 0.4410189688205719, + "learning_rate": 1.4957983193277311e-05, + "loss": 0.3918, + "step": 35282 + }, + { + "epoch": 19.71117318435754, + "grad_norm": 0.357056587934494, + "learning_rate": 1.4929971988795519e-05, + "loss": 0.389, + "step": 35283 + }, + { + "epoch": 19.711731843575418, + "grad_norm": 0.7442764639854431, + "learning_rate": 1.4901960784313726e-05, + "loss": 0.3385, + "step": 35284 + }, + { + "epoch": 19.712290502793294, + "grad_norm": 1.0437657833099365, + "learning_rate": 1.4873949579831934e-05, + "loss": 0.4917, + "step": 35285 + }, + { + "epoch": 19.712849162011175, + "grad_norm": 0.37906742095947266, + "learning_rate": 1.484593837535014e-05, + "loss": 0.3227, + "step": 35286 + }, + { + "epoch": 19.71340782122905, + "grad_norm": 0.5135797262191772, + "learning_rate": 1.4817927170868348e-05, + "loss": 0.5272, + "step": 35287 + }, + { + "epoch": 19.713966480446928, + "grad_norm": 0.343237042427063, + "learning_rate": 1.4789915966386556e-05, + "loss": 0.3696, + "step": 35288 + }, + { + "epoch": 19.714525139664804, + "grad_norm": 0.5119778513908386, + "learning_rate": 1.4761904761904763e-05, + "loss": 0.4165, + "step": 35289 + }, + { + "epoch": 19.71508379888268, + "grad_norm": 1.148737907409668, + "learning_rate": 1.473389355742297e-05, + "loss": 0.3871, + "step": 35290 + }, + { + "epoch": 19.715642458100557, + "grad_norm": 0.5131190419197083, + "learning_rate": 1.4705882352941177e-05, + "loss": 0.4368, + "step": 35291 + }, + { + "epoch": 19.716201117318434, + "grad_norm": 0.36445698142051697, + "learning_rate": 1.4677871148459385e-05, + "loss": 0.4555, + "step": 35292 + }, + { + "epoch": 19.716759776536314, + "grad_norm": 0.34603628516197205, + "learning_rate": 1.4649859943977592e-05, + "loss": 0.3412, + "step": 35293 + }, + { + "epoch": 19.71731843575419, + "grad_norm": 0.5670938491821289, + "learning_rate": 1.4621848739495798e-05, + "loss": 0.3815, + "step": 35294 + }, + { + "epoch": 19.717877094972067, + "grad_norm": 0.5026167035102844, + "learning_rate": 1.4593837535014006e-05, + "loss": 0.3656, + "step": 35295 + }, + { + "epoch": 19.718435754189944, + "grad_norm": 1.0616168975830078, + "learning_rate": 1.4565826330532214e-05, + "loss": 0.3349, + "step": 35296 + }, + { + "epoch": 19.71899441340782, + "grad_norm": 0.8057705163955688, + "learning_rate": 1.4537815126050421e-05, + "loss": 0.3383, + "step": 35297 + }, + { + "epoch": 19.719553072625697, + "grad_norm": 0.35158783197402954, + "learning_rate": 1.4509803921568627e-05, + "loss": 0.3749, + "step": 35298 + }, + { + "epoch": 19.720111731843577, + "grad_norm": 0.3770917057991028, + "learning_rate": 1.4481792717086835e-05, + "loss": 0.4597, + "step": 35299 + }, + { + "epoch": 19.720670391061454, + "grad_norm": 0.3776755630970001, + "learning_rate": 1.4453781512605043e-05, + "loss": 0.5356, + "step": 35300 + }, + { + "epoch": 19.72122905027933, + "grad_norm": 1.9294934272766113, + "learning_rate": 1.442577030812325e-05, + "loss": 0.524, + "step": 35301 + }, + { + "epoch": 19.721787709497207, + "grad_norm": 0.34345540404319763, + "learning_rate": 1.4397759103641456e-05, + "loss": 0.3501, + "step": 35302 + }, + { + "epoch": 19.722346368715083, + "grad_norm": 0.5337271690368652, + "learning_rate": 1.4369747899159664e-05, + "loss": 0.3747, + "step": 35303 + }, + { + "epoch": 19.72290502793296, + "grad_norm": 0.6082429885864258, + "learning_rate": 1.4341736694677872e-05, + "loss": 0.4915, + "step": 35304 + }, + { + "epoch": 19.723463687150836, + "grad_norm": 0.793801486492157, + "learning_rate": 1.431372549019608e-05, + "loss": 0.4021, + "step": 35305 + }, + { + "epoch": 19.724022346368717, + "grad_norm": 14.263829231262207, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.7126, + "step": 35306 + }, + { + "epoch": 19.724581005586593, + "grad_norm": 0.35827380418777466, + "learning_rate": 1.4257703081232493e-05, + "loss": 0.3766, + "step": 35307 + }, + { + "epoch": 19.72513966480447, + "grad_norm": 1.4470044374465942, + "learning_rate": 1.42296918767507e-05, + "loss": 0.3858, + "step": 35308 + }, + { + "epoch": 19.725698324022346, + "grad_norm": 0.34180471301078796, + "learning_rate": 1.4201680672268907e-05, + "loss": 0.3456, + "step": 35309 + }, + { + "epoch": 19.726256983240223, + "grad_norm": 0.6229627728462219, + "learning_rate": 1.4173669467787114e-05, + "loss": 0.4423, + "step": 35310 + }, + { + "epoch": 19.7268156424581, + "grad_norm": 0.5809889435768127, + "learning_rate": 1.4145658263305322e-05, + "loss": 0.4294, + "step": 35311 + }, + { + "epoch": 19.727374301675976, + "grad_norm": 0.566570520401001, + "learning_rate": 1.411764705882353e-05, + "loss": 0.295, + "step": 35312 + }, + { + "epoch": 19.727932960893856, + "grad_norm": 4.236278057098389, + "learning_rate": 1.4089635854341736e-05, + "loss": 0.3396, + "step": 35313 + }, + { + "epoch": 19.728491620111733, + "grad_norm": 0.4100249707698822, + "learning_rate": 1.4061624649859944e-05, + "loss": 0.3831, + "step": 35314 + }, + { + "epoch": 19.72905027932961, + "grad_norm": 0.38920170068740845, + "learning_rate": 1.4033613445378151e-05, + "loss": 0.3927, + "step": 35315 + }, + { + "epoch": 19.729608938547486, + "grad_norm": 0.3384273648262024, + "learning_rate": 1.4005602240896359e-05, + "loss": 0.2926, + "step": 35316 + }, + { + "epoch": 19.730167597765362, + "grad_norm": 0.41815540194511414, + "learning_rate": 1.3977591036414565e-05, + "loss": 0.4532, + "step": 35317 + }, + { + "epoch": 19.73072625698324, + "grad_norm": 0.3968935012817383, + "learning_rate": 1.3949579831932773e-05, + "loss": 0.4812, + "step": 35318 + }, + { + "epoch": 19.73128491620112, + "grad_norm": 1.058937668800354, + "learning_rate": 1.392156862745098e-05, + "loss": 0.3715, + "step": 35319 + }, + { + "epoch": 19.731843575418996, + "grad_norm": 0.43217697739601135, + "learning_rate": 1.3893557422969188e-05, + "loss": 0.3202, + "step": 35320 + }, + { + "epoch": 19.732402234636872, + "grad_norm": 0.8799136281013489, + "learning_rate": 1.3865546218487394e-05, + "loss": 0.4151, + "step": 35321 + }, + { + "epoch": 19.73296089385475, + "grad_norm": 0.8152925372123718, + "learning_rate": 1.3837535014005602e-05, + "loss": 0.3754, + "step": 35322 + }, + { + "epoch": 19.733519553072625, + "grad_norm": 5.013149261474609, + "learning_rate": 1.380952380952381e-05, + "loss": 0.4126, + "step": 35323 + }, + { + "epoch": 19.734078212290502, + "grad_norm": 0.43170085549354553, + "learning_rate": 1.3781512605042017e-05, + "loss": 0.4514, + "step": 35324 + }, + { + "epoch": 19.73463687150838, + "grad_norm": 0.44063419103622437, + "learning_rate": 1.3753501400560223e-05, + "loss": 0.4131, + "step": 35325 + }, + { + "epoch": 19.73519553072626, + "grad_norm": 0.5054619312286377, + "learning_rate": 1.372549019607843e-05, + "loss": 0.4444, + "step": 35326 + }, + { + "epoch": 19.735754189944135, + "grad_norm": 0.4828226864337921, + "learning_rate": 1.3697478991596638e-05, + "loss": 0.4329, + "step": 35327 + }, + { + "epoch": 19.73631284916201, + "grad_norm": 0.46969398856163025, + "learning_rate": 1.3669467787114846e-05, + "loss": 0.4188, + "step": 35328 + }, + { + "epoch": 19.73687150837989, + "grad_norm": 0.4652106761932373, + "learning_rate": 1.3641456582633052e-05, + "loss": 0.4382, + "step": 35329 + }, + { + "epoch": 19.737430167597765, + "grad_norm": 0.3739686906337738, + "learning_rate": 1.3613445378151261e-05, + "loss": 0.4117, + "step": 35330 + }, + { + "epoch": 19.73798882681564, + "grad_norm": 0.44180986285209656, + "learning_rate": 1.358543417366947e-05, + "loss": 0.368, + "step": 35331 + }, + { + "epoch": 19.738547486033518, + "grad_norm": 0.6450395584106445, + "learning_rate": 1.3557422969187677e-05, + "loss": 0.3496, + "step": 35332 + }, + { + "epoch": 19.739106145251398, + "grad_norm": 0.6799238324165344, + "learning_rate": 1.3529411764705883e-05, + "loss": 0.4199, + "step": 35333 + }, + { + "epoch": 19.739664804469275, + "grad_norm": 0.35876762866973877, + "learning_rate": 1.350140056022409e-05, + "loss": 0.3994, + "step": 35334 + }, + { + "epoch": 19.74022346368715, + "grad_norm": 2.3797507286071777, + "learning_rate": 1.3473389355742298e-05, + "loss": 0.3226, + "step": 35335 + }, + { + "epoch": 19.740782122905028, + "grad_norm": 0.4078042805194855, + "learning_rate": 1.3445378151260506e-05, + "loss": 0.4355, + "step": 35336 + }, + { + "epoch": 19.741340782122904, + "grad_norm": 1.2120305299758911, + "learning_rate": 1.3417366946778712e-05, + "loss": 0.4802, + "step": 35337 + }, + { + "epoch": 19.74189944134078, + "grad_norm": 2.60994029045105, + "learning_rate": 1.338935574229692e-05, + "loss": 0.3832, + "step": 35338 + }, + { + "epoch": 19.742458100558657, + "grad_norm": 0.7532150149345398, + "learning_rate": 1.3361344537815127e-05, + "loss": 0.5767, + "step": 35339 + }, + { + "epoch": 19.743016759776538, + "grad_norm": 0.33394649624824524, + "learning_rate": 1.3333333333333335e-05, + "loss": 0.3255, + "step": 35340 + }, + { + "epoch": 19.743575418994414, + "grad_norm": 0.3346899151802063, + "learning_rate": 1.3305322128851541e-05, + "loss": 0.3919, + "step": 35341 + }, + { + "epoch": 19.74413407821229, + "grad_norm": 1.0925260782241821, + "learning_rate": 1.3277310924369749e-05, + "loss": 0.3314, + "step": 35342 + }, + { + "epoch": 19.744692737430167, + "grad_norm": 0.384634405374527, + "learning_rate": 1.3249299719887956e-05, + "loss": 0.341, + "step": 35343 + }, + { + "epoch": 19.745251396648044, + "grad_norm": 0.8957847356796265, + "learning_rate": 1.3221288515406162e-05, + "loss": 0.4007, + "step": 35344 + }, + { + "epoch": 19.74581005586592, + "grad_norm": 0.4554111659526825, + "learning_rate": 1.319327731092437e-05, + "loss": 0.4457, + "step": 35345 + }, + { + "epoch": 19.7463687150838, + "grad_norm": 0.8450426459312439, + "learning_rate": 1.3165266106442578e-05, + "loss": 0.3426, + "step": 35346 + }, + { + "epoch": 19.746927374301677, + "grad_norm": 0.4269426763057709, + "learning_rate": 1.3137254901960785e-05, + "loss": 0.4541, + "step": 35347 + }, + { + "epoch": 19.747486033519554, + "grad_norm": 0.4236166179180145, + "learning_rate": 1.3109243697478991e-05, + "loss": 0.4459, + "step": 35348 + }, + { + "epoch": 19.74804469273743, + "grad_norm": 0.45875146985054016, + "learning_rate": 1.3081232492997199e-05, + "loss": 0.557, + "step": 35349 + }, + { + "epoch": 19.748603351955307, + "grad_norm": 0.4377026855945587, + "learning_rate": 1.3053221288515407e-05, + "loss": 0.4098, + "step": 35350 + }, + { + "epoch": 19.749162011173183, + "grad_norm": 0.6969612240791321, + "learning_rate": 1.3025210084033614e-05, + "loss": 0.2941, + "step": 35351 + }, + { + "epoch": 19.74972067039106, + "grad_norm": 0.3948517441749573, + "learning_rate": 1.299719887955182e-05, + "loss": 0.4382, + "step": 35352 + }, + { + "epoch": 19.75027932960894, + "grad_norm": 0.7713333964347839, + "learning_rate": 1.2969187675070028e-05, + "loss": 0.4586, + "step": 35353 + }, + { + "epoch": 19.750837988826817, + "grad_norm": 0.6138208508491516, + "learning_rate": 1.2941176470588236e-05, + "loss": 0.6719, + "step": 35354 + }, + { + "epoch": 19.751396648044693, + "grad_norm": 0.520180344581604, + "learning_rate": 1.2913165266106444e-05, + "loss": 0.3966, + "step": 35355 + }, + { + "epoch": 19.75195530726257, + "grad_norm": 0.39862850308418274, + "learning_rate": 1.288515406162465e-05, + "loss": 0.3426, + "step": 35356 + }, + { + "epoch": 19.752513966480446, + "grad_norm": 0.4524399936199188, + "learning_rate": 1.2857142857142857e-05, + "loss": 0.3527, + "step": 35357 + }, + { + "epoch": 19.753072625698323, + "grad_norm": 1.5671555995941162, + "learning_rate": 1.2829131652661065e-05, + "loss": 0.3862, + "step": 35358 + }, + { + "epoch": 19.7536312849162, + "grad_norm": 0.38465172052383423, + "learning_rate": 1.2801120448179273e-05, + "loss": 0.3717, + "step": 35359 + }, + { + "epoch": 19.75418994413408, + "grad_norm": 0.7431902885437012, + "learning_rate": 1.2773109243697479e-05, + "loss": 0.3524, + "step": 35360 + }, + { + "epoch": 19.754748603351956, + "grad_norm": 0.8761482834815979, + "learning_rate": 1.2745098039215686e-05, + "loss": 0.4172, + "step": 35361 + }, + { + "epoch": 19.755307262569833, + "grad_norm": 0.42005470395088196, + "learning_rate": 1.2717086834733894e-05, + "loss": 0.3334, + "step": 35362 + }, + { + "epoch": 19.75586592178771, + "grad_norm": 0.4235022962093353, + "learning_rate": 1.2689075630252102e-05, + "loss": 0.3524, + "step": 35363 + }, + { + "epoch": 19.756424581005586, + "grad_norm": 0.3634481430053711, + "learning_rate": 1.2661064425770308e-05, + "loss": 0.398, + "step": 35364 + }, + { + "epoch": 19.756983240223462, + "grad_norm": 0.43742242455482483, + "learning_rate": 1.2633053221288515e-05, + "loss": 0.3289, + "step": 35365 + }, + { + "epoch": 19.757541899441343, + "grad_norm": 0.3694218695163727, + "learning_rate": 1.2605042016806723e-05, + "loss": 0.4362, + "step": 35366 + }, + { + "epoch": 19.75810055865922, + "grad_norm": 0.4466545581817627, + "learning_rate": 1.257703081232493e-05, + "loss": 0.463, + "step": 35367 + }, + { + "epoch": 19.758659217877096, + "grad_norm": 0.5588370561599731, + "learning_rate": 1.2549019607843137e-05, + "loss": 0.4343, + "step": 35368 + }, + { + "epoch": 19.759217877094972, + "grad_norm": 0.41058963537216187, + "learning_rate": 1.2521008403361344e-05, + "loss": 0.3628, + "step": 35369 + }, + { + "epoch": 19.75977653631285, + "grad_norm": 0.3880702257156372, + "learning_rate": 1.2492997198879552e-05, + "loss": 0.3759, + "step": 35370 + }, + { + "epoch": 19.760335195530725, + "grad_norm": 0.6218333840370178, + "learning_rate": 1.246498599439776e-05, + "loss": 0.3889, + "step": 35371 + }, + { + "epoch": 19.760893854748602, + "grad_norm": 0.42052242159843445, + "learning_rate": 1.2436974789915966e-05, + "loss": 0.3498, + "step": 35372 + }, + { + "epoch": 19.761452513966482, + "grad_norm": 0.7047318816184998, + "learning_rate": 1.2408963585434173e-05, + "loss": 0.4009, + "step": 35373 + }, + { + "epoch": 19.76201117318436, + "grad_norm": 0.8518757820129395, + "learning_rate": 1.2380952380952381e-05, + "loss": 0.3714, + "step": 35374 + }, + { + "epoch": 19.762569832402235, + "grad_norm": 1.8158037662506104, + "learning_rate": 1.2352941176470587e-05, + "loss": 0.4962, + "step": 35375 + }, + { + "epoch": 19.76312849162011, + "grad_norm": 0.3734445869922638, + "learning_rate": 1.2324929971988795e-05, + "loss": 0.3927, + "step": 35376 + }, + { + "epoch": 19.76368715083799, + "grad_norm": 0.6635797023773193, + "learning_rate": 1.2296918767507003e-05, + "loss": 0.4089, + "step": 35377 + }, + { + "epoch": 19.764245810055865, + "grad_norm": 0.3037201762199402, + "learning_rate": 1.226890756302521e-05, + "loss": 0.2901, + "step": 35378 + }, + { + "epoch": 19.76480446927374, + "grad_norm": 0.34549978375434875, + "learning_rate": 1.2240896358543416e-05, + "loss": 0.3791, + "step": 35379 + }, + { + "epoch": 19.76536312849162, + "grad_norm": 0.3665732741355896, + "learning_rate": 1.2212885154061624e-05, + "loss": 0.3675, + "step": 35380 + }, + { + "epoch": 19.765921787709498, + "grad_norm": 0.4470747709274292, + "learning_rate": 1.2184873949579832e-05, + "loss": 0.4414, + "step": 35381 + }, + { + "epoch": 19.766480446927375, + "grad_norm": 0.42910757660865784, + "learning_rate": 1.215686274509804e-05, + "loss": 0.4816, + "step": 35382 + }, + { + "epoch": 19.76703910614525, + "grad_norm": 5.966570854187012, + "learning_rate": 1.2128851540616245e-05, + "loss": 0.4625, + "step": 35383 + }, + { + "epoch": 19.767597765363128, + "grad_norm": 0.43506544828414917, + "learning_rate": 1.2100840336134453e-05, + "loss": 0.4251, + "step": 35384 + }, + { + "epoch": 19.768156424581004, + "grad_norm": 2.226890802383423, + "learning_rate": 1.207282913165266e-05, + "loss": 0.3989, + "step": 35385 + }, + { + "epoch": 19.76871508379888, + "grad_norm": 0.3308287262916565, + "learning_rate": 1.2044817927170868e-05, + "loss": 0.3876, + "step": 35386 + }, + { + "epoch": 19.76927374301676, + "grad_norm": 0.42185738682746887, + "learning_rate": 1.2016806722689074e-05, + "loss": 0.36, + "step": 35387 + }, + { + "epoch": 19.769832402234638, + "grad_norm": 0.45810985565185547, + "learning_rate": 1.1988795518207284e-05, + "loss": 0.4387, + "step": 35388 + }, + { + "epoch": 19.770391061452514, + "grad_norm": 0.4088507294654846, + "learning_rate": 1.1960784313725491e-05, + "loss": 0.3708, + "step": 35389 + }, + { + "epoch": 19.77094972067039, + "grad_norm": 0.5535271763801575, + "learning_rate": 1.1932773109243699e-05, + "loss": 0.639, + "step": 35390 + }, + { + "epoch": 19.771508379888267, + "grad_norm": 0.31074050068855286, + "learning_rate": 1.1904761904761905e-05, + "loss": 0.2974, + "step": 35391 + }, + { + "epoch": 19.772067039106144, + "grad_norm": 2.0858309268951416, + "learning_rate": 1.1876750700280113e-05, + "loss": 0.4046, + "step": 35392 + }, + { + "epoch": 19.772625698324024, + "grad_norm": 1.9705958366394043, + "learning_rate": 1.184873949579832e-05, + "loss": 0.4135, + "step": 35393 + }, + { + "epoch": 19.7731843575419, + "grad_norm": 0.4865002930164337, + "learning_rate": 1.1820728291316528e-05, + "loss": 0.4873, + "step": 35394 + }, + { + "epoch": 19.773743016759777, + "grad_norm": 0.6817571520805359, + "learning_rate": 1.1792717086834734e-05, + "loss": 0.4266, + "step": 35395 + }, + { + "epoch": 19.774301675977654, + "grad_norm": 0.8982945680618286, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.3915, + "step": 35396 + }, + { + "epoch": 19.77486033519553, + "grad_norm": 0.3575826585292816, + "learning_rate": 1.173669467787115e-05, + "loss": 0.3997, + "step": 35397 + }, + { + "epoch": 19.775418994413407, + "grad_norm": 0.5289197564125061, + "learning_rate": 1.1708683473389357e-05, + "loss": 0.4958, + "step": 35398 + }, + { + "epoch": 19.775977653631283, + "grad_norm": 0.6567515730857849, + "learning_rate": 1.1680672268907563e-05, + "loss": 0.3063, + "step": 35399 + }, + { + "epoch": 19.776536312849164, + "grad_norm": 2.3138108253479004, + "learning_rate": 1.1652661064425771e-05, + "loss": 0.3552, + "step": 35400 + }, + { + "epoch": 19.77709497206704, + "grad_norm": 1.0725176334381104, + "learning_rate": 1.1624649859943979e-05, + "loss": 0.4822, + "step": 35401 + }, + { + "epoch": 19.777653631284917, + "grad_norm": 0.38680484890937805, + "learning_rate": 1.1596638655462186e-05, + "loss": 0.3236, + "step": 35402 + }, + { + "epoch": 19.778212290502793, + "grad_norm": 0.5377716422080994, + "learning_rate": 1.1568627450980392e-05, + "loss": 0.3898, + "step": 35403 + }, + { + "epoch": 19.77877094972067, + "grad_norm": 5.421585559844971, + "learning_rate": 1.15406162464986e-05, + "loss": 0.3205, + "step": 35404 + }, + { + "epoch": 19.779329608938546, + "grad_norm": 0.3550727367401123, + "learning_rate": 1.1512605042016808e-05, + "loss": 0.346, + "step": 35405 + }, + { + "epoch": 19.779888268156423, + "grad_norm": 0.43515947461128235, + "learning_rate": 1.1484593837535015e-05, + "loss": 0.4045, + "step": 35406 + }, + { + "epoch": 19.780446927374303, + "grad_norm": 0.4157072901725769, + "learning_rate": 1.1456582633053221e-05, + "loss": 0.3642, + "step": 35407 + }, + { + "epoch": 19.78100558659218, + "grad_norm": 0.43124279379844666, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.4, + "step": 35408 + }, + { + "epoch": 19.781564245810056, + "grad_norm": 0.5944299697875977, + "learning_rate": 1.1400560224089637e-05, + "loss": 0.4026, + "step": 35409 + }, + { + "epoch": 19.782122905027933, + "grad_norm": 0.3536844253540039, + "learning_rate": 1.1372549019607843e-05, + "loss": 0.3354, + "step": 35410 + }, + { + "epoch": 19.78268156424581, + "grad_norm": 0.42086103558540344, + "learning_rate": 1.134453781512605e-05, + "loss": 0.3785, + "step": 35411 + }, + { + "epoch": 19.783240223463686, + "grad_norm": 1.5688191652297974, + "learning_rate": 1.1316526610644258e-05, + "loss": 0.4498, + "step": 35412 + }, + { + "epoch": 19.783798882681566, + "grad_norm": 0.3635714054107666, + "learning_rate": 1.1288515406162466e-05, + "loss": 0.3804, + "step": 35413 + }, + { + "epoch": 19.784357541899443, + "grad_norm": 0.6300631761550903, + "learning_rate": 1.1260504201680672e-05, + "loss": 0.4114, + "step": 35414 + }, + { + "epoch": 19.78491620111732, + "grad_norm": 8.403244972229004, + "learning_rate": 1.123249299719888e-05, + "loss": 0.3891, + "step": 35415 + }, + { + "epoch": 19.785474860335196, + "grad_norm": 0.7297216653823853, + "learning_rate": 1.1204481792717087e-05, + "loss": 0.538, + "step": 35416 + }, + { + "epoch": 19.786033519553072, + "grad_norm": 0.7535122036933899, + "learning_rate": 1.1176470588235295e-05, + "loss": 0.3785, + "step": 35417 + }, + { + "epoch": 19.78659217877095, + "grad_norm": 0.5579131841659546, + "learning_rate": 1.11484593837535e-05, + "loss": 0.2886, + "step": 35418 + }, + { + "epoch": 19.787150837988825, + "grad_norm": 0.36793315410614014, + "learning_rate": 1.1120448179271709e-05, + "loss": 0.3938, + "step": 35419 + }, + { + "epoch": 19.787709497206706, + "grad_norm": 0.4455862045288086, + "learning_rate": 1.1092436974789916e-05, + "loss": 0.3504, + "step": 35420 + }, + { + "epoch": 19.788268156424582, + "grad_norm": 0.33554697036743164, + "learning_rate": 1.1064425770308124e-05, + "loss": 0.3082, + "step": 35421 + }, + { + "epoch": 19.78882681564246, + "grad_norm": 0.42358070611953735, + "learning_rate": 1.103641456582633e-05, + "loss": 0.4217, + "step": 35422 + }, + { + "epoch": 19.789385474860335, + "grad_norm": 1.4139267206192017, + "learning_rate": 1.1008403361344538e-05, + "loss": 0.3496, + "step": 35423 + }, + { + "epoch": 19.789944134078212, + "grad_norm": 0.5414817929267883, + "learning_rate": 1.0980392156862745e-05, + "loss": 0.4734, + "step": 35424 + }, + { + "epoch": 19.79050279329609, + "grad_norm": 0.47942787408828735, + "learning_rate": 1.0952380952380953e-05, + "loss": 0.4385, + "step": 35425 + }, + { + "epoch": 19.791061452513965, + "grad_norm": 0.7389019727706909, + "learning_rate": 1.0924369747899159e-05, + "loss": 0.3776, + "step": 35426 + }, + { + "epoch": 19.791620111731845, + "grad_norm": 0.3489637076854706, + "learning_rate": 1.0896358543417367e-05, + "loss": 0.3097, + "step": 35427 + }, + { + "epoch": 19.79217877094972, + "grad_norm": 0.6101607084274292, + "learning_rate": 1.0868347338935574e-05, + "loss": 0.4299, + "step": 35428 + }, + { + "epoch": 19.7927374301676, + "grad_norm": 0.4865139424800873, + "learning_rate": 1.0840336134453782e-05, + "loss": 0.3274, + "step": 35429 + }, + { + "epoch": 19.793296089385475, + "grad_norm": 0.38860467076301575, + "learning_rate": 1.0812324929971988e-05, + "loss": 0.4513, + "step": 35430 + }, + { + "epoch": 19.79385474860335, + "grad_norm": 7.287206649780273, + "learning_rate": 1.0784313725490196e-05, + "loss": 0.3786, + "step": 35431 + }, + { + "epoch": 19.794413407821228, + "grad_norm": 0.36772340536117554, + "learning_rate": 1.0756302521008403e-05, + "loss": 0.317, + "step": 35432 + }, + { + "epoch": 19.794972067039105, + "grad_norm": 1.7814099788665771, + "learning_rate": 1.0728291316526611e-05, + "loss": 0.4732, + "step": 35433 + }, + { + "epoch": 19.795530726256985, + "grad_norm": 1.896518349647522, + "learning_rate": 1.0700280112044817e-05, + "loss": 0.4465, + "step": 35434 + }, + { + "epoch": 19.79608938547486, + "grad_norm": 0.5008261799812317, + "learning_rate": 1.0672268907563025e-05, + "loss": 0.4829, + "step": 35435 + }, + { + "epoch": 19.796648044692738, + "grad_norm": 0.39349105954170227, + "learning_rate": 1.0644257703081232e-05, + "loss": 0.4175, + "step": 35436 + }, + { + "epoch": 19.797206703910614, + "grad_norm": 0.483936071395874, + "learning_rate": 1.061624649859944e-05, + "loss": 0.4006, + "step": 35437 + }, + { + "epoch": 19.79776536312849, + "grad_norm": 0.7365521788597107, + "learning_rate": 1.0588235294117646e-05, + "loss": 0.4336, + "step": 35438 + }, + { + "epoch": 19.798324022346367, + "grad_norm": 0.4665125012397766, + "learning_rate": 1.0560224089635854e-05, + "loss": 0.3615, + "step": 35439 + }, + { + "epoch": 19.798882681564244, + "grad_norm": 0.41941338777542114, + "learning_rate": 1.0532212885154062e-05, + "loss": 0.3516, + "step": 35440 + }, + { + "epoch": 19.799441340782124, + "grad_norm": 0.3809608519077301, + "learning_rate": 1.050420168067227e-05, + "loss": 0.3293, + "step": 35441 + }, + { + "epoch": 19.8, + "grad_norm": 0.7504149079322815, + "learning_rate": 1.0476190476190475e-05, + "loss": 0.4951, + "step": 35442 + }, + { + "epoch": 19.800558659217877, + "grad_norm": 0.5064688324928284, + "learning_rate": 1.0448179271708683e-05, + "loss": 0.4691, + "step": 35443 + }, + { + "epoch": 19.801117318435754, + "grad_norm": 0.3260011076927185, + "learning_rate": 1.0420168067226892e-05, + "loss": 0.3691, + "step": 35444 + }, + { + "epoch": 19.80167597765363, + "grad_norm": 0.4795764088630676, + "learning_rate": 1.0392156862745098e-05, + "loss": 0.5028, + "step": 35445 + }, + { + "epoch": 19.802234636871507, + "grad_norm": 0.30835017561912537, + "learning_rate": 1.0364145658263306e-05, + "loss": 0.3702, + "step": 35446 + }, + { + "epoch": 19.802793296089387, + "grad_norm": 1.7510360479354858, + "learning_rate": 1.0336134453781514e-05, + "loss": 0.4261, + "step": 35447 + }, + { + "epoch": 19.803351955307264, + "grad_norm": 0.5377888083457947, + "learning_rate": 1.0308123249299721e-05, + "loss": 0.4054, + "step": 35448 + }, + { + "epoch": 19.80391061452514, + "grad_norm": 0.5830637216567993, + "learning_rate": 1.0280112044817927e-05, + "loss": 0.4766, + "step": 35449 + }, + { + "epoch": 19.804469273743017, + "grad_norm": 1.8150489330291748, + "learning_rate": 1.0252100840336135e-05, + "loss": 0.4179, + "step": 35450 + }, + { + "epoch": 19.805027932960893, + "grad_norm": 0.5734506249427795, + "learning_rate": 1.0224089635854343e-05, + "loss": 0.4011, + "step": 35451 + }, + { + "epoch": 19.80558659217877, + "grad_norm": 0.513920247554779, + "learning_rate": 1.019607843137255e-05, + "loss": 0.4805, + "step": 35452 + }, + { + "epoch": 19.806145251396647, + "grad_norm": 0.36466121673583984, + "learning_rate": 1.0168067226890756e-05, + "loss": 0.3617, + "step": 35453 + }, + { + "epoch": 19.806703910614527, + "grad_norm": 0.44612565636634827, + "learning_rate": 1.0140056022408964e-05, + "loss": 0.4559, + "step": 35454 + }, + { + "epoch": 19.807262569832403, + "grad_norm": 1.0071829557418823, + "learning_rate": 1.0112044817927172e-05, + "loss": 0.3683, + "step": 35455 + }, + { + "epoch": 19.80782122905028, + "grad_norm": 0.8905125260353088, + "learning_rate": 1.008403361344538e-05, + "loss": 0.2977, + "step": 35456 + }, + { + "epoch": 19.808379888268156, + "grad_norm": 1.2234464883804321, + "learning_rate": 1.0056022408963585e-05, + "loss": 0.4066, + "step": 35457 + }, + { + "epoch": 19.808938547486033, + "grad_norm": 1.0835731029510498, + "learning_rate": 1.0028011204481793e-05, + "loss": 0.295, + "step": 35458 + }, + { + "epoch": 19.80949720670391, + "grad_norm": 0.6086183190345764, + "learning_rate": 1e-05, + "loss": 0.5409, + "step": 35459 + }, + { + "epoch": 19.810055865921786, + "grad_norm": 0.41774123907089233, + "learning_rate": 9.971988795518209e-06, + "loss": 0.4359, + "step": 35460 + }, + { + "epoch": 19.810614525139666, + "grad_norm": 1.0672895908355713, + "learning_rate": 9.943977591036415e-06, + "loss": 0.4561, + "step": 35461 + }, + { + "epoch": 19.811173184357543, + "grad_norm": 0.3792206645011902, + "learning_rate": 9.915966386554622e-06, + "loss": 0.3996, + "step": 35462 + }, + { + "epoch": 19.81173184357542, + "grad_norm": 0.5730742812156677, + "learning_rate": 9.88795518207283e-06, + "loss": 0.3345, + "step": 35463 + }, + { + "epoch": 19.812290502793296, + "grad_norm": 0.49028459191322327, + "learning_rate": 9.859943977591038e-06, + "loss": 0.4197, + "step": 35464 + }, + { + "epoch": 19.812849162011172, + "grad_norm": 1.1842142343521118, + "learning_rate": 9.831932773109244e-06, + "loss": 0.5358, + "step": 35465 + }, + { + "epoch": 19.81340782122905, + "grad_norm": 0.38475149869918823, + "learning_rate": 9.803921568627451e-06, + "loss": 0.4663, + "step": 35466 + }, + { + "epoch": 19.81396648044693, + "grad_norm": 0.5872317552566528, + "learning_rate": 9.775910364145659e-06, + "loss": 0.3847, + "step": 35467 + }, + { + "epoch": 19.814525139664806, + "grad_norm": 0.7070002555847168, + "learning_rate": 9.747899159663867e-06, + "loss": 0.4182, + "step": 35468 + }, + { + "epoch": 19.815083798882682, + "grad_norm": 0.5117806196212769, + "learning_rate": 9.719887955182073e-06, + "loss": 0.3946, + "step": 35469 + }, + { + "epoch": 19.81564245810056, + "grad_norm": 1.5071008205413818, + "learning_rate": 9.69187675070028e-06, + "loss": 0.3947, + "step": 35470 + }, + { + "epoch": 19.816201117318435, + "grad_norm": 0.36761990189552307, + "learning_rate": 9.663865546218488e-06, + "loss": 0.3493, + "step": 35471 + }, + { + "epoch": 19.816759776536312, + "grad_norm": 0.4044910967350006, + "learning_rate": 9.635854341736696e-06, + "loss": 0.4225, + "step": 35472 + }, + { + "epoch": 19.81731843575419, + "grad_norm": 0.32278409600257874, + "learning_rate": 9.607843137254902e-06, + "loss": 0.3502, + "step": 35473 + }, + { + "epoch": 19.81787709497207, + "grad_norm": 0.4044205844402313, + "learning_rate": 9.57983193277311e-06, + "loss": 0.4839, + "step": 35474 + }, + { + "epoch": 19.818435754189945, + "grad_norm": 0.6844619512557983, + "learning_rate": 9.551820728291317e-06, + "loss": 0.4976, + "step": 35475 + }, + { + "epoch": 19.81899441340782, + "grad_norm": 1.0425516366958618, + "learning_rate": 9.523809523809525e-06, + "loss": 0.4034, + "step": 35476 + }, + { + "epoch": 19.8195530726257, + "grad_norm": 0.4030227065086365, + "learning_rate": 9.49579831932773e-06, + "loss": 0.4116, + "step": 35477 + }, + { + "epoch": 19.820111731843575, + "grad_norm": 0.5286258459091187, + "learning_rate": 9.467787114845938e-06, + "loss": 0.5643, + "step": 35478 + }, + { + "epoch": 19.82067039106145, + "grad_norm": 0.3709220588207245, + "learning_rate": 9.439775910364146e-06, + "loss": 0.3372, + "step": 35479 + }, + { + "epoch": 19.821229050279328, + "grad_norm": 0.4690113067626953, + "learning_rate": 9.411764705882352e-06, + "loss": 0.3767, + "step": 35480 + }, + { + "epoch": 19.821787709497208, + "grad_norm": 0.3882239758968353, + "learning_rate": 9.38375350140056e-06, + "loss": 0.3847, + "step": 35481 + }, + { + "epoch": 19.822346368715085, + "grad_norm": 0.8706823587417603, + "learning_rate": 9.355742296918767e-06, + "loss": 0.6712, + "step": 35482 + }, + { + "epoch": 19.82290502793296, + "grad_norm": 0.43887171149253845, + "learning_rate": 9.327731092436975e-06, + "loss": 0.3566, + "step": 35483 + }, + { + "epoch": 19.823463687150838, + "grad_norm": 0.5384312868118286, + "learning_rate": 9.299719887955181e-06, + "loss": 0.6602, + "step": 35484 + }, + { + "epoch": 19.824022346368714, + "grad_norm": 0.42510315775871277, + "learning_rate": 9.271708683473389e-06, + "loss": 0.3938, + "step": 35485 + }, + { + "epoch": 19.82458100558659, + "grad_norm": 0.43815770745277405, + "learning_rate": 9.243697478991597e-06, + "loss": 0.3332, + "step": 35486 + }, + { + "epoch": 19.825139664804468, + "grad_norm": 0.742708146572113, + "learning_rate": 9.215686274509804e-06, + "loss": 0.4613, + "step": 35487 + }, + { + "epoch": 19.825698324022348, + "grad_norm": 0.4642021358013153, + "learning_rate": 9.18767507002801e-06, + "loss": 0.3769, + "step": 35488 + }, + { + "epoch": 19.826256983240224, + "grad_norm": 1.5623059272766113, + "learning_rate": 9.159663865546218e-06, + "loss": 0.4048, + "step": 35489 + }, + { + "epoch": 19.8268156424581, + "grad_norm": 0.3773129880428314, + "learning_rate": 9.131652661064426e-06, + "loss": 0.4172, + "step": 35490 + }, + { + "epoch": 19.827374301675977, + "grad_norm": 0.5068979859352112, + "learning_rate": 9.103641456582633e-06, + "loss": 0.4614, + "step": 35491 + }, + { + "epoch": 19.827932960893854, + "grad_norm": 0.5426199436187744, + "learning_rate": 9.07563025210084e-06, + "loss": 0.4455, + "step": 35492 + }, + { + "epoch": 19.82849162011173, + "grad_norm": 0.6335077285766602, + "learning_rate": 9.047619047619047e-06, + "loss": 0.3756, + "step": 35493 + }, + { + "epoch": 19.82905027932961, + "grad_norm": 0.48047536611557007, + "learning_rate": 9.019607843137255e-06, + "loss": 0.4421, + "step": 35494 + }, + { + "epoch": 19.829608938547487, + "grad_norm": 0.5671623945236206, + "learning_rate": 8.991596638655462e-06, + "loss": 0.3646, + "step": 35495 + }, + { + "epoch": 19.830167597765364, + "grad_norm": 0.47398099303245544, + "learning_rate": 8.963585434173668e-06, + "loss": 0.4053, + "step": 35496 + }, + { + "epoch": 19.83072625698324, + "grad_norm": 0.7299337387084961, + "learning_rate": 8.935574229691876e-06, + "loss": 0.4374, + "step": 35497 + }, + { + "epoch": 19.831284916201117, + "grad_norm": 0.38078492879867554, + "learning_rate": 8.907563025210084e-06, + "loss": 0.3514, + "step": 35498 + }, + { + "epoch": 19.831843575418993, + "grad_norm": 0.4372172951698303, + "learning_rate": 8.879551820728291e-06, + "loss": 0.3925, + "step": 35499 + }, + { + "epoch": 19.83240223463687, + "grad_norm": 0.4379926323890686, + "learning_rate": 8.851540616246497e-06, + "loss": 0.3628, + "step": 35500 + }, + { + "epoch": 19.83240223463687, + "eval_cer": 0.08419800770296662, + "eval_loss": 0.3183664083480835, + "eval_runtime": 55.5821, + "eval_samples_per_second": 81.645, + "eval_steps_per_second": 5.11, + "eval_wer": 0.33250467864026145, + "step": 35500 + }, + { + "epoch": 19.83296089385475, + "grad_norm": 0.9604039788246155, + "learning_rate": 8.823529411764705e-06, + "loss": 0.4113, + "step": 35501 + }, + { + "epoch": 19.833519553072627, + "grad_norm": 0.6833616495132446, + "learning_rate": 8.795518207282914e-06, + "loss": 0.3779, + "step": 35502 + }, + { + "epoch": 19.834078212290503, + "grad_norm": 0.4546038806438446, + "learning_rate": 8.767507002801122e-06, + "loss": 0.4312, + "step": 35503 + }, + { + "epoch": 19.83463687150838, + "grad_norm": 0.37255576252937317, + "learning_rate": 8.739495798319328e-06, + "loss": 0.3509, + "step": 35504 + }, + { + "epoch": 19.835195530726256, + "grad_norm": 0.374406635761261, + "learning_rate": 8.711484593837536e-06, + "loss": 0.3985, + "step": 35505 + }, + { + "epoch": 19.835754189944133, + "grad_norm": 0.7711275219917297, + "learning_rate": 8.683473389355744e-06, + "loss": 0.4128, + "step": 35506 + }, + { + "epoch": 19.83631284916201, + "grad_norm": 0.5150610208511353, + "learning_rate": 8.655462184873951e-06, + "loss": 0.5035, + "step": 35507 + }, + { + "epoch": 19.83687150837989, + "grad_norm": 0.436656653881073, + "learning_rate": 8.627450980392157e-06, + "loss": 0.3825, + "step": 35508 + }, + { + "epoch": 19.837430167597766, + "grad_norm": 0.5069315433502197, + "learning_rate": 8.599439775910365e-06, + "loss": 0.3684, + "step": 35509 + }, + { + "epoch": 19.837988826815643, + "grad_norm": 0.39290910959243774, + "learning_rate": 8.571428571428573e-06, + "loss": 0.4058, + "step": 35510 + }, + { + "epoch": 19.83854748603352, + "grad_norm": 0.9138739705085754, + "learning_rate": 8.543417366946779e-06, + "loss": 0.3596, + "step": 35511 + }, + { + "epoch": 19.839106145251396, + "grad_norm": 1.8350526094436646, + "learning_rate": 8.515406162464986e-06, + "loss": 0.3519, + "step": 35512 + }, + { + "epoch": 19.839664804469272, + "grad_norm": 0.9484584927558899, + "learning_rate": 8.487394957983194e-06, + "loss": 0.3813, + "step": 35513 + }, + { + "epoch": 19.840223463687153, + "grad_norm": 0.4576646089553833, + "learning_rate": 8.459383753501402e-06, + "loss": 0.46, + "step": 35514 + }, + { + "epoch": 19.84078212290503, + "grad_norm": 0.7134267091751099, + "learning_rate": 8.431372549019608e-06, + "loss": 0.4448, + "step": 35515 + }, + { + "epoch": 19.841340782122906, + "grad_norm": 0.4767299294471741, + "learning_rate": 8.403361344537815e-06, + "loss": 0.4664, + "step": 35516 + }, + { + "epoch": 19.841899441340782, + "grad_norm": 0.6320598125457764, + "learning_rate": 8.375350140056023e-06, + "loss": 0.3816, + "step": 35517 + }, + { + "epoch": 19.84245810055866, + "grad_norm": 0.520431637763977, + "learning_rate": 8.34733893557423e-06, + "loss": 0.4582, + "step": 35518 + }, + { + "epoch": 19.843016759776535, + "grad_norm": 0.35879379510879517, + "learning_rate": 8.319327731092437e-06, + "loss": 0.3502, + "step": 35519 + }, + { + "epoch": 19.843575418994412, + "grad_norm": 0.3486548960208893, + "learning_rate": 8.291316526610644e-06, + "loss": 0.3546, + "step": 35520 + }, + { + "epoch": 19.844134078212292, + "grad_norm": 0.37583065032958984, + "learning_rate": 8.263305322128852e-06, + "loss": 0.4123, + "step": 35521 + }, + { + "epoch": 19.84469273743017, + "grad_norm": 2.0360541343688965, + "learning_rate": 8.23529411764706e-06, + "loss": 0.5527, + "step": 35522 + }, + { + "epoch": 19.845251396648045, + "grad_norm": 0.5614008903503418, + "learning_rate": 8.207282913165266e-06, + "loss": 0.3478, + "step": 35523 + }, + { + "epoch": 19.845810055865922, + "grad_norm": 0.404690146446228, + "learning_rate": 8.179271708683473e-06, + "loss": 0.3642, + "step": 35524 + }, + { + "epoch": 19.8463687150838, + "grad_norm": 1.168663501739502, + "learning_rate": 8.151260504201681e-06, + "loss": 0.4079, + "step": 35525 + }, + { + "epoch": 19.846927374301675, + "grad_norm": 0.3829575181007385, + "learning_rate": 8.123249299719889e-06, + "loss": 0.3279, + "step": 35526 + }, + { + "epoch": 19.84748603351955, + "grad_norm": 0.48685383796691895, + "learning_rate": 8.095238095238095e-06, + "loss": 0.4543, + "step": 35527 + }, + { + "epoch": 19.84804469273743, + "grad_norm": 1.7935909032821655, + "learning_rate": 8.067226890756303e-06, + "loss": 0.3568, + "step": 35528 + }, + { + "epoch": 19.84860335195531, + "grad_norm": 0.43810322880744934, + "learning_rate": 8.03921568627451e-06, + "loss": 0.4088, + "step": 35529 + }, + { + "epoch": 19.849162011173185, + "grad_norm": 0.5050700306892395, + "learning_rate": 8.011204481792718e-06, + "loss": 0.3397, + "step": 35530 + }, + { + "epoch": 19.84972067039106, + "grad_norm": 0.5490198135375977, + "learning_rate": 7.983193277310924e-06, + "loss": 0.4668, + "step": 35531 + }, + { + "epoch": 19.850279329608938, + "grad_norm": 6.344862461090088, + "learning_rate": 7.955182072829132e-06, + "loss": 0.4066, + "step": 35532 + }, + { + "epoch": 19.850837988826814, + "grad_norm": 0.5124679207801819, + "learning_rate": 7.92717086834734e-06, + "loss": 0.4066, + "step": 35533 + }, + { + "epoch": 19.85139664804469, + "grad_norm": 0.6247010231018066, + "learning_rate": 7.899159663865547e-06, + "loss": 0.4567, + "step": 35534 + }, + { + "epoch": 19.85195530726257, + "grad_norm": 0.3546310365200043, + "learning_rate": 7.871148459383753e-06, + "loss": 0.3596, + "step": 35535 + }, + { + "epoch": 19.852513966480448, + "grad_norm": 0.42871859669685364, + "learning_rate": 7.84313725490196e-06, + "loss": 0.2945, + "step": 35536 + }, + { + "epoch": 19.853072625698324, + "grad_norm": 1.153154969215393, + "learning_rate": 7.815126050420168e-06, + "loss": 0.396, + "step": 35537 + }, + { + "epoch": 19.8536312849162, + "grad_norm": 0.9508769512176514, + "learning_rate": 7.787114845938376e-06, + "loss": 0.3773, + "step": 35538 + }, + { + "epoch": 19.854189944134077, + "grad_norm": 0.35451996326446533, + "learning_rate": 7.759103641456584e-06, + "loss": 0.3973, + "step": 35539 + }, + { + "epoch": 19.854748603351954, + "grad_norm": 0.40359291434288025, + "learning_rate": 7.73109243697479e-06, + "loss": 0.4204, + "step": 35540 + }, + { + "epoch": 19.85530726256983, + "grad_norm": 0.4803354740142822, + "learning_rate": 7.703081232492997e-06, + "loss": 0.4347, + "step": 35541 + }, + { + "epoch": 19.85586592178771, + "grad_norm": 0.4973466396331787, + "learning_rate": 7.675070028011205e-06, + "loss": 0.5028, + "step": 35542 + }, + { + "epoch": 19.856424581005587, + "grad_norm": 0.7691525816917419, + "learning_rate": 7.647058823529413e-06, + "loss": 0.3523, + "step": 35543 + }, + { + "epoch": 19.856983240223464, + "grad_norm": 0.5215095281600952, + "learning_rate": 7.619047619047619e-06, + "loss": 0.3446, + "step": 35544 + }, + { + "epoch": 19.85754189944134, + "grad_norm": 1.2336946725845337, + "learning_rate": 7.5910364145658265e-06, + "loss": 0.3818, + "step": 35545 + }, + { + "epoch": 19.858100558659217, + "grad_norm": 2.777637243270874, + "learning_rate": 7.563025210084034e-06, + "loss": 0.3379, + "step": 35546 + }, + { + "epoch": 19.858659217877094, + "grad_norm": 0.41862061619758606, + "learning_rate": 7.535014005602241e-06, + "loss": 0.3558, + "step": 35547 + }, + { + "epoch": 19.859217877094974, + "grad_norm": 0.5710946917533875, + "learning_rate": 7.507002801120449e-06, + "loss": 0.4097, + "step": 35548 + }, + { + "epoch": 19.85977653631285, + "grad_norm": 0.46634867787361145, + "learning_rate": 7.4789915966386555e-06, + "loss": 0.3715, + "step": 35549 + }, + { + "epoch": 19.860335195530727, + "grad_norm": 0.32197415828704834, + "learning_rate": 7.450980392156863e-06, + "loss": 0.3542, + "step": 35550 + }, + { + "epoch": 19.860893854748603, + "grad_norm": 0.6783162355422974, + "learning_rate": 7.42296918767507e-06, + "loss": 0.6101, + "step": 35551 + }, + { + "epoch": 19.86145251396648, + "grad_norm": 0.8543192148208618, + "learning_rate": 7.394957983193278e-06, + "loss": 0.4123, + "step": 35552 + }, + { + "epoch": 19.862011173184356, + "grad_norm": 0.3979789614677429, + "learning_rate": 7.366946778711485e-06, + "loss": 0.3475, + "step": 35553 + }, + { + "epoch": 19.862569832402233, + "grad_norm": 0.4430939257144928, + "learning_rate": 7.338935574229692e-06, + "loss": 0.4203, + "step": 35554 + }, + { + "epoch": 19.863128491620113, + "grad_norm": 2.5592780113220215, + "learning_rate": 7.310924369747899e-06, + "loss": 0.4729, + "step": 35555 + }, + { + "epoch": 19.86368715083799, + "grad_norm": 0.40907856822013855, + "learning_rate": 7.282913165266107e-06, + "loss": 0.4438, + "step": 35556 + }, + { + "epoch": 19.864245810055866, + "grad_norm": 2.6902639865875244, + "learning_rate": 7.254901960784314e-06, + "loss": 0.3976, + "step": 35557 + }, + { + "epoch": 19.864804469273743, + "grad_norm": 0.491024374961853, + "learning_rate": 7.226890756302521e-06, + "loss": 0.4618, + "step": 35558 + }, + { + "epoch": 19.86536312849162, + "grad_norm": 0.46603935956954956, + "learning_rate": 7.198879551820728e-06, + "loss": 0.3655, + "step": 35559 + }, + { + "epoch": 19.865921787709496, + "grad_norm": 0.6266034245491028, + "learning_rate": 7.170868347338936e-06, + "loss": 0.6292, + "step": 35560 + }, + { + "epoch": 19.866480446927373, + "grad_norm": 0.37468862533569336, + "learning_rate": 7.142857142857143e-06, + "loss": 0.4192, + "step": 35561 + }, + { + "epoch": 19.867039106145253, + "grad_norm": 0.5093718767166138, + "learning_rate": 7.11484593837535e-06, + "loss": 0.3296, + "step": 35562 + }, + { + "epoch": 19.86759776536313, + "grad_norm": 0.6867132186889648, + "learning_rate": 7.086834733893557e-06, + "loss": 0.4028, + "step": 35563 + }, + { + "epoch": 19.868156424581006, + "grad_norm": 0.607503354549408, + "learning_rate": 7.058823529411765e-06, + "loss": 0.4589, + "step": 35564 + }, + { + "epoch": 19.868715083798882, + "grad_norm": 0.6798485517501831, + "learning_rate": 7.030812324929972e-06, + "loss": 0.3539, + "step": 35565 + }, + { + "epoch": 19.86927374301676, + "grad_norm": 0.5485566854476929, + "learning_rate": 7.0028011204481795e-06, + "loss": 0.4842, + "step": 35566 + }, + { + "epoch": 19.869832402234636, + "grad_norm": 0.7571957111358643, + "learning_rate": 6.974789915966386e-06, + "loss": 0.4923, + "step": 35567 + }, + { + "epoch": 19.870391061452516, + "grad_norm": 0.6238499283790588, + "learning_rate": 6.946778711484594e-06, + "loss": 0.5692, + "step": 35568 + }, + { + "epoch": 19.870949720670392, + "grad_norm": 0.730003833770752, + "learning_rate": 6.918767507002801e-06, + "loss": 0.4373, + "step": 35569 + }, + { + "epoch": 19.87150837988827, + "grad_norm": 0.5628697276115417, + "learning_rate": 6.8907563025210085e-06, + "loss": 0.4717, + "step": 35570 + }, + { + "epoch": 19.872067039106145, + "grad_norm": 0.47615954279899597, + "learning_rate": 6.862745098039215e-06, + "loss": 0.367, + "step": 35571 + }, + { + "epoch": 19.872625698324022, + "grad_norm": 0.5309939384460449, + "learning_rate": 6.834733893557423e-06, + "loss": 0.2809, + "step": 35572 + }, + { + "epoch": 19.8731843575419, + "grad_norm": 0.37138187885284424, + "learning_rate": 6.806722689075631e-06, + "loss": 0.4018, + "step": 35573 + }, + { + "epoch": 19.873743016759775, + "grad_norm": 1.0609674453735352, + "learning_rate": 6.7787114845938384e-06, + "loss": 0.3847, + "step": 35574 + }, + { + "epoch": 19.874301675977655, + "grad_norm": 0.4392301142215729, + "learning_rate": 6.750700280112045e-06, + "loss": 0.4211, + "step": 35575 + }, + { + "epoch": 19.87486033519553, + "grad_norm": 0.3797439932823181, + "learning_rate": 6.722689075630253e-06, + "loss": 0.3247, + "step": 35576 + }, + { + "epoch": 19.87541899441341, + "grad_norm": 0.40507978200912476, + "learning_rate": 6.69467787114846e-06, + "loss": 0.3803, + "step": 35577 + }, + { + "epoch": 19.875977653631285, + "grad_norm": 1.6920356750488281, + "learning_rate": 6.6666666666666675e-06, + "loss": 0.3982, + "step": 35578 + }, + { + "epoch": 19.87653631284916, + "grad_norm": 0.42925766110420227, + "learning_rate": 6.638655462184874e-06, + "loss": 0.3347, + "step": 35579 + }, + { + "epoch": 19.877094972067038, + "grad_norm": 0.3393830955028534, + "learning_rate": 6.610644257703081e-06, + "loss": 0.3983, + "step": 35580 + }, + { + "epoch": 19.877653631284915, + "grad_norm": 0.8725360035896301, + "learning_rate": 6.582633053221289e-06, + "loss": 0.3126, + "step": 35581 + }, + { + "epoch": 19.878212290502795, + "grad_norm": 0.47602248191833496, + "learning_rate": 6.554621848739496e-06, + "loss": 0.5332, + "step": 35582 + }, + { + "epoch": 19.87877094972067, + "grad_norm": 0.5204659104347229, + "learning_rate": 6.526610644257703e-06, + "loss": 0.3323, + "step": 35583 + }, + { + "epoch": 19.879329608938548, + "grad_norm": 0.39426782727241516, + "learning_rate": 6.49859943977591e-06, + "loss": 0.3341, + "step": 35584 + }, + { + "epoch": 19.879888268156424, + "grad_norm": 0.422897070646286, + "learning_rate": 6.470588235294118e-06, + "loss": 0.4999, + "step": 35585 + }, + { + "epoch": 19.8804469273743, + "grad_norm": 0.4663355350494385, + "learning_rate": 6.442577030812325e-06, + "loss": 0.3848, + "step": 35586 + }, + { + "epoch": 19.881005586592178, + "grad_norm": 0.6233259439468384, + "learning_rate": 6.4145658263305325e-06, + "loss": 0.4194, + "step": 35587 + }, + { + "epoch": 19.881564245810054, + "grad_norm": 0.4507158696651459, + "learning_rate": 6.386554621848739e-06, + "loss": 0.4627, + "step": 35588 + }, + { + "epoch": 19.882122905027934, + "grad_norm": 0.3214305341243744, + "learning_rate": 6.358543417366947e-06, + "loss": 0.306, + "step": 35589 + }, + { + "epoch": 19.88268156424581, + "grad_norm": 0.8551495671272278, + "learning_rate": 6.330532212885154e-06, + "loss": 0.3744, + "step": 35590 + }, + { + "epoch": 19.883240223463687, + "grad_norm": 0.44746673107147217, + "learning_rate": 6.3025210084033615e-06, + "loss": 0.347, + "step": 35591 + }, + { + "epoch": 19.883798882681564, + "grad_norm": 0.2838791012763977, + "learning_rate": 6.274509803921568e-06, + "loss": 0.3111, + "step": 35592 + }, + { + "epoch": 19.88435754189944, + "grad_norm": 0.3916764557361603, + "learning_rate": 6.246498599439776e-06, + "loss": 0.3976, + "step": 35593 + }, + { + "epoch": 19.884916201117317, + "grad_norm": 0.8163996338844299, + "learning_rate": 6.218487394957983e-06, + "loss": 0.4459, + "step": 35594 + }, + { + "epoch": 19.885474860335197, + "grad_norm": 0.42282184958457947, + "learning_rate": 6.190476190476191e-06, + "loss": 0.4298, + "step": 35595 + }, + { + "epoch": 19.886033519553074, + "grad_norm": 0.4737150967121124, + "learning_rate": 6.162464985994397e-06, + "loss": 0.3543, + "step": 35596 + }, + { + "epoch": 19.88659217877095, + "grad_norm": 0.8519539833068848, + "learning_rate": 6.134453781512605e-06, + "loss": 0.282, + "step": 35597 + }, + { + "epoch": 19.887150837988827, + "grad_norm": 1.1746183633804321, + "learning_rate": 6.106442577030812e-06, + "loss": 0.3679, + "step": 35598 + }, + { + "epoch": 19.887709497206703, + "grad_norm": 0.7324888110160828, + "learning_rate": 6.07843137254902e-06, + "loss": 0.4486, + "step": 35599 + }, + { + "epoch": 19.88826815642458, + "grad_norm": 0.360386461019516, + "learning_rate": 6.0504201680672265e-06, + "loss": 0.4148, + "step": 35600 + }, + { + "epoch": 19.888826815642457, + "grad_norm": 0.357022225856781, + "learning_rate": 6.022408963585434e-06, + "loss": 0.3358, + "step": 35601 + }, + { + "epoch": 19.889385474860337, + "grad_norm": 0.5042193531990051, + "learning_rate": 5.994397759103642e-06, + "loss": 0.4167, + "step": 35602 + }, + { + "epoch": 19.889944134078213, + "grad_norm": 0.5186048746109009, + "learning_rate": 5.9663865546218495e-06, + "loss": 0.298, + "step": 35603 + }, + { + "epoch": 19.89050279329609, + "grad_norm": 0.6707499027252197, + "learning_rate": 5.938375350140056e-06, + "loss": 0.4543, + "step": 35604 + }, + { + "epoch": 19.891061452513966, + "grad_norm": 0.4759967625141144, + "learning_rate": 5.910364145658264e-06, + "loss": 0.3585, + "step": 35605 + }, + { + "epoch": 19.891620111731843, + "grad_norm": 0.4389513432979584, + "learning_rate": 5.882352941176471e-06, + "loss": 0.3579, + "step": 35606 + }, + { + "epoch": 19.89217877094972, + "grad_norm": 1.225927472114563, + "learning_rate": 5.854341736694679e-06, + "loss": 0.3545, + "step": 35607 + }, + { + "epoch": 19.892737430167596, + "grad_norm": 0.39883729815483093, + "learning_rate": 5.8263305322128855e-06, + "loss": 0.3716, + "step": 35608 + }, + { + "epoch": 19.893296089385476, + "grad_norm": 0.33032065629959106, + "learning_rate": 5.798319327731093e-06, + "loss": 0.3658, + "step": 35609 + }, + { + "epoch": 19.893854748603353, + "grad_norm": 0.5306668877601624, + "learning_rate": 5.7703081232493e-06, + "loss": 0.355, + "step": 35610 + }, + { + "epoch": 19.89441340782123, + "grad_norm": 0.5451876521110535, + "learning_rate": 5.742296918767508e-06, + "loss": 0.4293, + "step": 35611 + }, + { + "epoch": 19.894972067039106, + "grad_norm": 0.3425430953502655, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.3981, + "step": 35612 + }, + { + "epoch": 19.895530726256982, + "grad_norm": 0.5449585318565369, + "learning_rate": 5.686274509803921e-06, + "loss": 0.3558, + "step": 35613 + }, + { + "epoch": 19.89608938547486, + "grad_norm": 0.8266059160232544, + "learning_rate": 5.658263305322129e-06, + "loss": 0.5354, + "step": 35614 + }, + { + "epoch": 19.89664804469274, + "grad_norm": 0.4417438805103302, + "learning_rate": 5.630252100840336e-06, + "loss": 0.4157, + "step": 35615 + }, + { + "epoch": 19.897206703910616, + "grad_norm": 0.559985339641571, + "learning_rate": 5.6022408963585436e-06, + "loss": 0.4945, + "step": 35616 + }, + { + "epoch": 19.897765363128492, + "grad_norm": 5.538567066192627, + "learning_rate": 5.57422969187675e-06, + "loss": 0.3246, + "step": 35617 + }, + { + "epoch": 19.89832402234637, + "grad_norm": 0.43555939197540283, + "learning_rate": 5.546218487394958e-06, + "loss": 0.5071, + "step": 35618 + }, + { + "epoch": 19.898882681564245, + "grad_norm": 0.57423335313797, + "learning_rate": 5.518207282913165e-06, + "loss": 0.4386, + "step": 35619 + }, + { + "epoch": 19.899441340782122, + "grad_norm": 0.8419376015663147, + "learning_rate": 5.490196078431373e-06, + "loss": 0.3857, + "step": 35620 + }, + { + "epoch": 19.9, + "grad_norm": 0.4784092605113983, + "learning_rate": 5.4621848739495795e-06, + "loss": 0.3082, + "step": 35621 + }, + { + "epoch": 19.90055865921788, + "grad_norm": 0.8151900172233582, + "learning_rate": 5.434173669467787e-06, + "loss": 0.3871, + "step": 35622 + }, + { + "epoch": 19.901117318435755, + "grad_norm": 1.1739178895950317, + "learning_rate": 5.406162464985994e-06, + "loss": 0.3768, + "step": 35623 + }, + { + "epoch": 19.901675977653632, + "grad_norm": 0.42103952169418335, + "learning_rate": 5.378151260504202e-06, + "loss": 0.3949, + "step": 35624 + }, + { + "epoch": 19.90223463687151, + "grad_norm": 0.4123176336288452, + "learning_rate": 5.3501400560224085e-06, + "loss": 0.3185, + "step": 35625 + }, + { + "epoch": 19.902793296089385, + "grad_norm": 0.5617626905441284, + "learning_rate": 5.322128851540616e-06, + "loss": 0.4178, + "step": 35626 + }, + { + "epoch": 19.90335195530726, + "grad_norm": 0.4380773901939392, + "learning_rate": 5.294117647058823e-06, + "loss": 0.4507, + "step": 35627 + }, + { + "epoch": 19.903910614525138, + "grad_norm": 0.5657081604003906, + "learning_rate": 5.266106442577031e-06, + "loss": 0.4342, + "step": 35628 + }, + { + "epoch": 19.904469273743018, + "grad_norm": 0.49421849846839905, + "learning_rate": 5.238095238095238e-06, + "loss": 0.3753, + "step": 35629 + }, + { + "epoch": 19.905027932960895, + "grad_norm": 0.33730989694595337, + "learning_rate": 5.210084033613446e-06, + "loss": 0.3897, + "step": 35630 + }, + { + "epoch": 19.90558659217877, + "grad_norm": 0.4522286355495453, + "learning_rate": 5.182072829131653e-06, + "loss": 0.3804, + "step": 35631 + }, + { + "epoch": 19.906145251396648, + "grad_norm": 0.761443555355072, + "learning_rate": 5.154061624649861e-06, + "loss": 0.3818, + "step": 35632 + }, + { + "epoch": 19.906703910614524, + "grad_norm": 0.6169270873069763, + "learning_rate": 5.1260504201680675e-06, + "loss": 0.3748, + "step": 35633 + }, + { + "epoch": 19.9072625698324, + "grad_norm": 0.4670906066894531, + "learning_rate": 5.098039215686275e-06, + "loss": 0.4335, + "step": 35634 + }, + { + "epoch": 19.907821229050278, + "grad_norm": 0.5424671173095703, + "learning_rate": 5.070028011204482e-06, + "loss": 0.4197, + "step": 35635 + }, + { + "epoch": 19.908379888268158, + "grad_norm": 0.48261988162994385, + "learning_rate": 5.04201680672269e-06, + "loss": 0.3623, + "step": 35636 + }, + { + "epoch": 19.908938547486034, + "grad_norm": 0.4566068947315216, + "learning_rate": 5.0140056022408966e-06, + "loss": 0.4932, + "step": 35637 + }, + { + "epoch": 19.90949720670391, + "grad_norm": 0.4355485141277313, + "learning_rate": 4.985994397759104e-06, + "loss": 0.4828, + "step": 35638 + }, + { + "epoch": 19.910055865921787, + "grad_norm": 1.133594274520874, + "learning_rate": 4.957983193277311e-06, + "loss": 0.3703, + "step": 35639 + }, + { + "epoch": 19.910614525139664, + "grad_norm": 0.6314957737922668, + "learning_rate": 4.929971988795519e-06, + "loss": 0.4299, + "step": 35640 + }, + { + "epoch": 19.91117318435754, + "grad_norm": 0.3796870708465576, + "learning_rate": 4.901960784313726e-06, + "loss": 0.3684, + "step": 35641 + }, + { + "epoch": 19.91173184357542, + "grad_norm": 0.42551523447036743, + "learning_rate": 4.873949579831933e-06, + "loss": 0.3439, + "step": 35642 + }, + { + "epoch": 19.912290502793297, + "grad_norm": 0.7229700684547424, + "learning_rate": 4.84593837535014e-06, + "loss": 0.4305, + "step": 35643 + }, + { + "epoch": 19.912849162011174, + "grad_norm": 0.537805438041687, + "learning_rate": 4.817927170868348e-06, + "loss": 0.3773, + "step": 35644 + }, + { + "epoch": 19.91340782122905, + "grad_norm": 0.5850977301597595, + "learning_rate": 4.789915966386555e-06, + "loss": 0.426, + "step": 35645 + }, + { + "epoch": 19.913966480446927, + "grad_norm": 0.414144366979599, + "learning_rate": 4.761904761904762e-06, + "loss": 0.3362, + "step": 35646 + }, + { + "epoch": 19.914525139664804, + "grad_norm": 0.5388544201850891, + "learning_rate": 4.733893557422969e-06, + "loss": 0.43, + "step": 35647 + }, + { + "epoch": 19.91508379888268, + "grad_norm": 0.4163450300693512, + "learning_rate": 4.705882352941176e-06, + "loss": 0.4243, + "step": 35648 + }, + { + "epoch": 19.91564245810056, + "grad_norm": 0.9192421436309814, + "learning_rate": 4.677871148459384e-06, + "loss": 0.4005, + "step": 35649 + }, + { + "epoch": 19.916201117318437, + "grad_norm": 0.4805905222892761, + "learning_rate": 4.649859943977591e-06, + "loss": 0.3543, + "step": 35650 + }, + { + "epoch": 19.916759776536313, + "grad_norm": 0.548423171043396, + "learning_rate": 4.621848739495798e-06, + "loss": 0.3764, + "step": 35651 + }, + { + "epoch": 19.91731843575419, + "grad_norm": 0.5281224250793457, + "learning_rate": 4.593837535014005e-06, + "loss": 0.4331, + "step": 35652 + }, + { + "epoch": 19.917877094972066, + "grad_norm": 0.49651938676834106, + "learning_rate": 4.565826330532213e-06, + "loss": 0.4811, + "step": 35653 + }, + { + "epoch": 19.918435754189943, + "grad_norm": 0.43199896812438965, + "learning_rate": 4.53781512605042e-06, + "loss": 0.5455, + "step": 35654 + }, + { + "epoch": 19.91899441340782, + "grad_norm": 0.45970430970191956, + "learning_rate": 4.509803921568627e-06, + "loss": 0.3835, + "step": 35655 + }, + { + "epoch": 19.9195530726257, + "grad_norm": 0.5654606819152832, + "learning_rate": 4.481792717086834e-06, + "loss": 0.3718, + "step": 35656 + }, + { + "epoch": 19.920111731843576, + "grad_norm": 1.0351994037628174, + "learning_rate": 4.453781512605042e-06, + "loss": 0.4673, + "step": 35657 + }, + { + "epoch": 19.920670391061453, + "grad_norm": 0.474049836397171, + "learning_rate": 4.425770308123249e-06, + "loss": 0.4639, + "step": 35658 + }, + { + "epoch": 19.92122905027933, + "grad_norm": 0.4930829107761383, + "learning_rate": 4.397759103641457e-06, + "loss": 0.4436, + "step": 35659 + }, + { + "epoch": 19.921787709497206, + "grad_norm": 0.3586808145046234, + "learning_rate": 4.369747899159664e-06, + "loss": 0.3681, + "step": 35660 + }, + { + "epoch": 19.922346368715083, + "grad_norm": 1.6363966464996338, + "learning_rate": 4.341736694677872e-06, + "loss": 0.3905, + "step": 35661 + }, + { + "epoch": 19.922905027932963, + "grad_norm": 0.35228431224823, + "learning_rate": 4.313725490196079e-06, + "loss": 0.3522, + "step": 35662 + }, + { + "epoch": 19.92346368715084, + "grad_norm": 1.6791367530822754, + "learning_rate": 4.285714285714286e-06, + "loss": 0.3659, + "step": 35663 + }, + { + "epoch": 19.924022346368716, + "grad_norm": 0.40718963742256165, + "learning_rate": 4.257703081232493e-06, + "loss": 0.4443, + "step": 35664 + }, + { + "epoch": 19.924581005586592, + "grad_norm": 0.5448503494262695, + "learning_rate": 4.229691876750701e-06, + "loss": 0.5231, + "step": 35665 + }, + { + "epoch": 19.92513966480447, + "grad_norm": 1.078081727027893, + "learning_rate": 4.201680672268908e-06, + "loss": 0.3552, + "step": 35666 + }, + { + "epoch": 19.925698324022346, + "grad_norm": 0.8464499711990356, + "learning_rate": 4.173669467787115e-06, + "loss": 0.4155, + "step": 35667 + }, + { + "epoch": 19.926256983240222, + "grad_norm": 0.4724714756011963, + "learning_rate": 4.145658263305322e-06, + "loss": 0.5077, + "step": 35668 + }, + { + "epoch": 19.926815642458102, + "grad_norm": 0.3425491750240326, + "learning_rate": 4.11764705882353e-06, + "loss": 0.326, + "step": 35669 + }, + { + "epoch": 19.92737430167598, + "grad_norm": 0.49022987484931946, + "learning_rate": 4.089635854341737e-06, + "loss": 0.4266, + "step": 35670 + }, + { + "epoch": 19.927932960893855, + "grad_norm": 0.5103769302368164, + "learning_rate": 4.0616246498599444e-06, + "loss": 0.5553, + "step": 35671 + }, + { + "epoch": 19.928491620111732, + "grad_norm": 0.42951250076293945, + "learning_rate": 4.033613445378151e-06, + "loss": 0.3576, + "step": 35672 + }, + { + "epoch": 19.92905027932961, + "grad_norm": 0.37149930000305176, + "learning_rate": 4.005602240896359e-06, + "loss": 0.3762, + "step": 35673 + }, + { + "epoch": 19.929608938547485, + "grad_norm": 0.9715537428855896, + "learning_rate": 3.977591036414566e-06, + "loss": 0.4102, + "step": 35674 + }, + { + "epoch": 19.93016759776536, + "grad_norm": 0.4984362721443176, + "learning_rate": 3.9495798319327735e-06, + "loss": 0.4779, + "step": 35675 + }, + { + "epoch": 19.93072625698324, + "grad_norm": 0.3813376724720001, + "learning_rate": 3.92156862745098e-06, + "loss": 0.374, + "step": 35676 + }, + { + "epoch": 19.93128491620112, + "grad_norm": 0.323688268661499, + "learning_rate": 3.893557422969188e-06, + "loss": 0.3673, + "step": 35677 + }, + { + "epoch": 19.931843575418995, + "grad_norm": 0.553429901599884, + "learning_rate": 3.865546218487395e-06, + "loss": 0.3484, + "step": 35678 + }, + { + "epoch": 19.93240223463687, + "grad_norm": 0.5127355456352234, + "learning_rate": 3.8375350140056026e-06, + "loss": 0.3694, + "step": 35679 + }, + { + "epoch": 19.932960893854748, + "grad_norm": 0.3489966094493866, + "learning_rate": 3.8095238095238094e-06, + "loss": 0.4427, + "step": 35680 + }, + { + "epoch": 19.933519553072625, + "grad_norm": 0.39821743965148926, + "learning_rate": 3.781512605042017e-06, + "loss": 0.4465, + "step": 35681 + }, + { + "epoch": 19.9340782122905, + "grad_norm": 0.9026612639427185, + "learning_rate": 3.7535014005602243e-06, + "loss": 0.4096, + "step": 35682 + }, + { + "epoch": 19.93463687150838, + "grad_norm": 0.407042533159256, + "learning_rate": 3.7254901960784316e-06, + "loss": 0.3671, + "step": 35683 + }, + { + "epoch": 19.935195530726258, + "grad_norm": 0.5701173543930054, + "learning_rate": 3.697478991596639e-06, + "loss": 0.6763, + "step": 35684 + }, + { + "epoch": 19.935754189944134, + "grad_norm": 0.538378119468689, + "learning_rate": 3.669467787114846e-06, + "loss": 0.36, + "step": 35685 + }, + { + "epoch": 19.93631284916201, + "grad_norm": 0.5826660394668579, + "learning_rate": 3.6414565826330534e-06, + "loss": 0.4464, + "step": 35686 + }, + { + "epoch": 19.936871508379888, + "grad_norm": 0.3692992329597473, + "learning_rate": 3.6134453781512607e-06, + "loss": 0.3863, + "step": 35687 + }, + { + "epoch": 19.937430167597764, + "grad_norm": 0.5763682126998901, + "learning_rate": 3.585434173669468e-06, + "loss": 0.5184, + "step": 35688 + }, + { + "epoch": 19.93798882681564, + "grad_norm": 0.5126556158065796, + "learning_rate": 3.557422969187675e-06, + "loss": 0.4881, + "step": 35689 + }, + { + "epoch": 19.93854748603352, + "grad_norm": 0.31983473896980286, + "learning_rate": 3.5294117647058825e-06, + "loss": 0.3163, + "step": 35690 + }, + { + "epoch": 19.939106145251397, + "grad_norm": 0.6009042263031006, + "learning_rate": 3.5014005602240897e-06, + "loss": 0.3812, + "step": 35691 + }, + { + "epoch": 19.939664804469274, + "grad_norm": 0.4028477072715759, + "learning_rate": 3.473389355742297e-06, + "loss": 0.3111, + "step": 35692 + }, + { + "epoch": 19.94022346368715, + "grad_norm": 1.3376309871673584, + "learning_rate": 3.4453781512605043e-06, + "loss": 0.3678, + "step": 35693 + }, + { + "epoch": 19.940782122905027, + "grad_norm": 13.20019817352295, + "learning_rate": 3.4173669467787115e-06, + "loss": 0.4757, + "step": 35694 + }, + { + "epoch": 19.941340782122904, + "grad_norm": 0.5198339819908142, + "learning_rate": 3.3893557422969192e-06, + "loss": 0.3489, + "step": 35695 + }, + { + "epoch": 19.941899441340784, + "grad_norm": 0.6982374787330627, + "learning_rate": 3.3613445378151265e-06, + "loss": 0.4379, + "step": 35696 + }, + { + "epoch": 19.94245810055866, + "grad_norm": 0.45871031284332275, + "learning_rate": 3.3333333333333337e-06, + "loss": 0.3599, + "step": 35697 + }, + { + "epoch": 19.943016759776537, + "grad_norm": 0.8334909677505493, + "learning_rate": 3.3053221288515406e-06, + "loss": 0.5008, + "step": 35698 + }, + { + "epoch": 19.943575418994413, + "grad_norm": 0.8086140751838684, + "learning_rate": 3.277310924369748e-06, + "loss": 0.413, + "step": 35699 + }, + { + "epoch": 19.94413407821229, + "grad_norm": 1.4512261152267456, + "learning_rate": 3.249299719887955e-06, + "loss": 0.3854, + "step": 35700 + }, + { + "epoch": 19.944692737430167, + "grad_norm": 0.4095904529094696, + "learning_rate": 3.2212885154061624e-06, + "loss": 0.2515, + "step": 35701 + }, + { + "epoch": 19.945251396648043, + "grad_norm": 0.48182395100593567, + "learning_rate": 3.1932773109243696e-06, + "loss": 0.4824, + "step": 35702 + }, + { + "epoch": 19.945810055865923, + "grad_norm": 0.9869712591171265, + "learning_rate": 3.165266106442577e-06, + "loss": 0.3804, + "step": 35703 + }, + { + "epoch": 19.9463687150838, + "grad_norm": 0.3817277252674103, + "learning_rate": 3.137254901960784e-06, + "loss": 0.4203, + "step": 35704 + }, + { + "epoch": 19.946927374301676, + "grad_norm": 0.40958330035209656, + "learning_rate": 3.1092436974789914e-06, + "loss": 0.3923, + "step": 35705 + }, + { + "epoch": 19.947486033519553, + "grad_norm": 0.7038190364837646, + "learning_rate": 3.0812324929971987e-06, + "loss": 0.493, + "step": 35706 + }, + { + "epoch": 19.94804469273743, + "grad_norm": 0.5374528169631958, + "learning_rate": 3.053221288515406e-06, + "loss": 0.4533, + "step": 35707 + }, + { + "epoch": 19.948603351955306, + "grad_norm": 0.3756989538669586, + "learning_rate": 3.0252100840336132e-06, + "loss": 0.3664, + "step": 35708 + }, + { + "epoch": 19.949162011173183, + "grad_norm": 0.4476151466369629, + "learning_rate": 2.997198879551821e-06, + "loss": 0.4512, + "step": 35709 + }, + { + "epoch": 19.949720670391063, + "grad_norm": 0.49882346391677856, + "learning_rate": 2.969187675070028e-06, + "loss": 0.4236, + "step": 35710 + }, + { + "epoch": 19.95027932960894, + "grad_norm": 0.5060874819755554, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.3233, + "step": 35711 + }, + { + "epoch": 19.950837988826816, + "grad_norm": 0.39693549275398254, + "learning_rate": 2.9131652661064427e-06, + "loss": 0.388, + "step": 35712 + }, + { + "epoch": 19.951396648044692, + "grad_norm": 0.4244128167629242, + "learning_rate": 2.88515406162465e-06, + "loss": 0.408, + "step": 35713 + }, + { + "epoch": 19.95195530726257, + "grad_norm": 0.39799681305885315, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.3463, + "step": 35714 + }, + { + "epoch": 19.952513966480446, + "grad_norm": 0.8204896450042725, + "learning_rate": 2.8291316526610645e-06, + "loss": 0.4184, + "step": 35715 + }, + { + "epoch": 19.953072625698326, + "grad_norm": 0.3208429217338562, + "learning_rate": 2.8011204481792718e-06, + "loss": 0.3386, + "step": 35716 + }, + { + "epoch": 19.953631284916202, + "grad_norm": 0.44770392775535583, + "learning_rate": 2.773109243697479e-06, + "loss": 0.5423, + "step": 35717 + }, + { + "epoch": 19.95418994413408, + "grad_norm": 0.5298616886138916, + "learning_rate": 2.7450980392156863e-06, + "loss": 0.3982, + "step": 35718 + }, + { + "epoch": 19.954748603351955, + "grad_norm": 0.5084118843078613, + "learning_rate": 2.7170868347338936e-06, + "loss": 0.3581, + "step": 35719 + }, + { + "epoch": 19.955307262569832, + "grad_norm": 1.0268547534942627, + "learning_rate": 2.689075630252101e-06, + "loss": 0.484, + "step": 35720 + }, + { + "epoch": 19.95586592178771, + "grad_norm": 1.0976309776306152, + "learning_rate": 2.661064425770308e-06, + "loss": 0.3514, + "step": 35721 + }, + { + "epoch": 19.956424581005585, + "grad_norm": 0.6385658383369446, + "learning_rate": 2.6330532212885154e-06, + "loss": 0.2825, + "step": 35722 + }, + { + "epoch": 19.956983240223465, + "grad_norm": 0.8541482090950012, + "learning_rate": 2.605042016806723e-06, + "loss": 0.454, + "step": 35723 + }, + { + "epoch": 19.957541899441342, + "grad_norm": 0.6795336604118347, + "learning_rate": 2.5770308123249303e-06, + "loss": 0.4391, + "step": 35724 + }, + { + "epoch": 19.95810055865922, + "grad_norm": 0.4772208631038666, + "learning_rate": 2.5490196078431376e-06, + "loss": 0.4127, + "step": 35725 + }, + { + "epoch": 19.958659217877095, + "grad_norm": 0.3698780834674835, + "learning_rate": 2.521008403361345e-06, + "loss": 0.4208, + "step": 35726 + }, + { + "epoch": 19.95921787709497, + "grad_norm": 0.5685043931007385, + "learning_rate": 2.492997198879552e-06, + "loss": 0.4064, + "step": 35727 + }, + { + "epoch": 19.959776536312848, + "grad_norm": 0.3657155930995941, + "learning_rate": 2.4649859943977594e-06, + "loss": 0.4184, + "step": 35728 + }, + { + "epoch": 19.960335195530725, + "grad_norm": 1.0049821138381958, + "learning_rate": 2.4369747899159667e-06, + "loss": 0.5792, + "step": 35729 + }, + { + "epoch": 19.960893854748605, + "grad_norm": 15.256537437438965, + "learning_rate": 2.408963585434174e-06, + "loss": 0.3905, + "step": 35730 + }, + { + "epoch": 19.96145251396648, + "grad_norm": 0.45929771661758423, + "learning_rate": 2.380952380952381e-06, + "loss": 0.4531, + "step": 35731 + }, + { + "epoch": 19.962011173184358, + "grad_norm": 0.49803638458251953, + "learning_rate": 2.352941176470588e-06, + "loss": 0.4904, + "step": 35732 + }, + { + "epoch": 19.962569832402234, + "grad_norm": 0.3768760859966278, + "learning_rate": 2.3249299719887953e-06, + "loss": 0.3589, + "step": 35733 + }, + { + "epoch": 19.96312849162011, + "grad_norm": 0.9365134835243225, + "learning_rate": 2.2969187675070026e-06, + "loss": 0.3522, + "step": 35734 + }, + { + "epoch": 19.963687150837988, + "grad_norm": 0.41926106810569763, + "learning_rate": 2.26890756302521e-06, + "loss": 0.4289, + "step": 35735 + }, + { + "epoch": 19.964245810055864, + "grad_norm": 0.5147438645362854, + "learning_rate": 2.240896358543417e-06, + "loss": 0.5659, + "step": 35736 + }, + { + "epoch": 19.964804469273744, + "grad_norm": 0.6456831097602844, + "learning_rate": 2.2128851540616244e-06, + "loss": 0.3807, + "step": 35737 + }, + { + "epoch": 19.96536312849162, + "grad_norm": 0.39884018898010254, + "learning_rate": 2.184873949579832e-06, + "loss": 0.4295, + "step": 35738 + }, + { + "epoch": 19.965921787709497, + "grad_norm": 0.7749279737472534, + "learning_rate": 2.1568627450980393e-06, + "loss": 0.3755, + "step": 35739 + }, + { + "epoch": 19.966480446927374, + "grad_norm": 0.5296819806098938, + "learning_rate": 2.1288515406162466e-06, + "loss": 0.5193, + "step": 35740 + }, + { + "epoch": 19.96703910614525, + "grad_norm": 4.553891658782959, + "learning_rate": 2.100840336134454e-06, + "loss": 0.4418, + "step": 35741 + }, + { + "epoch": 19.967597765363127, + "grad_norm": 0.3707868158817291, + "learning_rate": 2.072829131652661e-06, + "loss": 0.3272, + "step": 35742 + }, + { + "epoch": 19.968156424581007, + "grad_norm": 0.3603146970272064, + "learning_rate": 2.0448179271708684e-06, + "loss": 0.3986, + "step": 35743 + }, + { + "epoch": 19.968715083798884, + "grad_norm": 0.6828670501708984, + "learning_rate": 2.0168067226890756e-06, + "loss": 0.5284, + "step": 35744 + }, + { + "epoch": 19.96927374301676, + "grad_norm": 0.352805495262146, + "learning_rate": 1.988795518207283e-06, + "loss": 0.4627, + "step": 35745 + }, + { + "epoch": 19.969832402234637, + "grad_norm": 0.5262941718101501, + "learning_rate": 1.96078431372549e-06, + "loss": 0.5057, + "step": 35746 + }, + { + "epoch": 19.970391061452514, + "grad_norm": 0.37788015604019165, + "learning_rate": 1.9327731092436974e-06, + "loss": 0.3568, + "step": 35747 + }, + { + "epoch": 19.97094972067039, + "grad_norm": 2.0190670490264893, + "learning_rate": 1.9047619047619047e-06, + "loss": 0.5348, + "step": 35748 + }, + { + "epoch": 19.971508379888267, + "grad_norm": 0.5013555288314819, + "learning_rate": 1.8767507002801122e-06, + "loss": 0.419, + "step": 35749 + }, + { + "epoch": 19.972067039106147, + "grad_norm": 0.38966071605682373, + "learning_rate": 1.8487394957983194e-06, + "loss": 0.3391, + "step": 35750 + }, + { + "epoch": 19.972625698324023, + "grad_norm": 0.4424666166305542, + "learning_rate": 1.8207282913165267e-06, + "loss": 0.5569, + "step": 35751 + }, + { + "epoch": 19.9731843575419, + "grad_norm": 0.4121440052986145, + "learning_rate": 1.792717086834734e-06, + "loss": 0.3891, + "step": 35752 + }, + { + "epoch": 19.973743016759776, + "grad_norm": 0.4010823369026184, + "learning_rate": 1.7647058823529412e-06, + "loss": 0.4402, + "step": 35753 + }, + { + "epoch": 19.974301675977653, + "grad_norm": 0.448160856962204, + "learning_rate": 1.7366946778711485e-06, + "loss": 0.4184, + "step": 35754 + }, + { + "epoch": 19.97486033519553, + "grad_norm": 0.4456930458545685, + "learning_rate": 1.7086834733893558e-06, + "loss": 0.4246, + "step": 35755 + }, + { + "epoch": 19.975418994413406, + "grad_norm": 0.49775293469429016, + "learning_rate": 1.6806722689075632e-06, + "loss": 0.3936, + "step": 35756 + }, + { + "epoch": 19.975977653631286, + "grad_norm": 0.7370997071266174, + "learning_rate": 1.6526610644257703e-06, + "loss": 0.4992, + "step": 35757 + }, + { + "epoch": 19.976536312849163, + "grad_norm": 1.8036106824874878, + "learning_rate": 1.6246498599439776e-06, + "loss": 0.373, + "step": 35758 + }, + { + "epoch": 19.97709497206704, + "grad_norm": 0.3875807821750641, + "learning_rate": 1.5966386554621848e-06, + "loss": 0.3729, + "step": 35759 + }, + { + "epoch": 19.977653631284916, + "grad_norm": 0.4445265233516693, + "learning_rate": 1.568627450980392e-06, + "loss": 0.3927, + "step": 35760 + }, + { + "epoch": 19.978212290502793, + "grad_norm": 0.3668687045574188, + "learning_rate": 1.5406162464985994e-06, + "loss": 0.3326, + "step": 35761 + }, + { + "epoch": 19.97877094972067, + "grad_norm": 0.4288323223590851, + "learning_rate": 1.5126050420168066e-06, + "loss": 0.3051, + "step": 35762 + }, + { + "epoch": 19.97932960893855, + "grad_norm": 0.33855193853378296, + "learning_rate": 1.484593837535014e-06, + "loss": 0.3957, + "step": 35763 + }, + { + "epoch": 19.979888268156426, + "grad_norm": 0.4493440091609955, + "learning_rate": 1.4565826330532214e-06, + "loss": 0.327, + "step": 35764 + }, + { + "epoch": 19.980446927374302, + "grad_norm": 0.3707232177257538, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.4448, + "step": 35765 + }, + { + "epoch": 19.98100558659218, + "grad_norm": 0.3734290599822998, + "learning_rate": 1.4005602240896359e-06, + "loss": 0.3223, + "step": 35766 + }, + { + "epoch": 19.981564245810056, + "grad_norm": 6.52301025390625, + "learning_rate": 1.3725490196078432e-06, + "loss": 0.3994, + "step": 35767 + }, + { + "epoch": 19.982122905027932, + "grad_norm": 0.41697150468826294, + "learning_rate": 1.3445378151260504e-06, + "loss": 0.4148, + "step": 35768 + }, + { + "epoch": 19.98268156424581, + "grad_norm": 0.5138062834739685, + "learning_rate": 1.3165266106442577e-06, + "loss": 0.4871, + "step": 35769 + }, + { + "epoch": 19.98324022346369, + "grad_norm": 0.4564405083656311, + "learning_rate": 1.2885154061624652e-06, + "loss": 0.3831, + "step": 35770 + }, + { + "epoch": 19.983798882681565, + "grad_norm": 0.30999866127967834, + "learning_rate": 1.2605042016806724e-06, + "loss": 0.301, + "step": 35771 + }, + { + "epoch": 19.984357541899442, + "grad_norm": 0.4386674463748932, + "learning_rate": 1.2324929971988797e-06, + "loss": 0.4778, + "step": 35772 + }, + { + "epoch": 19.98491620111732, + "grad_norm": 0.8380294442176819, + "learning_rate": 1.204481792717087e-06, + "loss": 0.455, + "step": 35773 + }, + { + "epoch": 19.985474860335195, + "grad_norm": 0.5301977396011353, + "learning_rate": 1.176470588235294e-06, + "loss": 0.4115, + "step": 35774 + }, + { + "epoch": 19.98603351955307, + "grad_norm": 0.4730093479156494, + "learning_rate": 1.1484593837535013e-06, + "loss": 0.2906, + "step": 35775 + }, + { + "epoch": 19.986592178770948, + "grad_norm": 0.5763904452323914, + "learning_rate": 1.1204481792717085e-06, + "loss": 0.4095, + "step": 35776 + }, + { + "epoch": 19.98715083798883, + "grad_norm": 0.798834502696991, + "learning_rate": 1.092436974789916e-06, + "loss": 0.3321, + "step": 35777 + }, + { + "epoch": 19.987709497206705, + "grad_norm": 0.42858266830444336, + "learning_rate": 1.0644257703081233e-06, + "loss": 0.4252, + "step": 35778 + }, + { + "epoch": 19.98826815642458, + "grad_norm": 0.4689836800098419, + "learning_rate": 1.0364145658263306e-06, + "loss": 0.5972, + "step": 35779 + }, + { + "epoch": 19.988826815642458, + "grad_norm": 0.4905235469341278, + "learning_rate": 1.0084033613445378e-06, + "loss": 0.4909, + "step": 35780 + }, + { + "epoch": 19.989385474860335, + "grad_norm": 0.34495341777801514, + "learning_rate": 9.80392156862745e-07, + "loss": 0.3659, + "step": 35781 + }, + { + "epoch": 19.98994413407821, + "grad_norm": 0.4195338487625122, + "learning_rate": 9.523809523809523e-07, + "loss": 0.3451, + "step": 35782 + }, + { + "epoch": 19.990502793296088, + "grad_norm": 0.40323707461357117, + "learning_rate": 9.243697478991597e-07, + "loss": 0.4844, + "step": 35783 + }, + { + "epoch": 19.991061452513968, + "grad_norm": 0.3888605237007141, + "learning_rate": 8.96358543417367e-07, + "loss": 0.3603, + "step": 35784 + }, + { + "epoch": 19.991620111731844, + "grad_norm": 0.6263288259506226, + "learning_rate": 8.683473389355742e-07, + "loss": 0.4473, + "step": 35785 + }, + { + "epoch": 19.99217877094972, + "grad_norm": 1.1992926597595215, + "learning_rate": 8.403361344537816e-07, + "loss": 0.4191, + "step": 35786 + }, + { + "epoch": 19.992737430167598, + "grad_norm": 0.5969231128692627, + "learning_rate": 8.123249299719888e-07, + "loss": 0.3807, + "step": 35787 + }, + { + "epoch": 19.993296089385474, + "grad_norm": 0.4747834801673889, + "learning_rate": 7.84313725490196e-07, + "loss": 0.3464, + "step": 35788 + }, + { + "epoch": 19.99385474860335, + "grad_norm": 0.4346066117286682, + "learning_rate": 7.563025210084033e-07, + "loss": 0.4275, + "step": 35789 + }, + { + "epoch": 19.994413407821227, + "grad_norm": 2.834784507751465, + "learning_rate": 7.282913165266107e-07, + "loss": 0.557, + "step": 35790 + }, + { + "epoch": 19.994972067039107, + "grad_norm": 0.6398465633392334, + "learning_rate": 7.002801120448179e-07, + "loss": 0.4374, + "step": 35791 + }, + { + "epoch": 19.995530726256984, + "grad_norm": 0.8262577056884766, + "learning_rate": 6.722689075630252e-07, + "loss": 0.3263, + "step": 35792 + }, + { + "epoch": 19.99608938547486, + "grad_norm": 0.6604378819465637, + "learning_rate": 6.442577030812326e-07, + "loss": 0.3759, + "step": 35793 + }, + { + "epoch": 19.996648044692737, + "grad_norm": 0.6472581028938293, + "learning_rate": 6.162464985994398e-07, + "loss": 0.4731, + "step": 35794 + }, + { + "epoch": 19.997206703910614, + "grad_norm": 0.4706597328186035, + "learning_rate": 5.88235294117647e-07, + "loss": 0.3876, + "step": 35795 + }, + { + "epoch": 19.99776536312849, + "grad_norm": 0.5563760995864868, + "learning_rate": 5.602240896358543e-07, + "loss": 0.3993, + "step": 35796 + }, + { + "epoch": 19.99832402234637, + "grad_norm": 2.6378302574157715, + "learning_rate": 5.322128851540616e-07, + "loss": 0.4534, + "step": 35797 + }, + { + "epoch": 19.998882681564247, + "grad_norm": 0.44761624932289124, + "learning_rate": 5.042016806722689e-07, + "loss": 0.4278, + "step": 35798 + }, + { + "epoch": 19.999441340782123, + "grad_norm": 0.47075140476226807, + "learning_rate": 4.761904761904762e-07, + "loss": 0.5231, + "step": 35799 + }, + { + "epoch": 20.0, + "grad_norm": 5.764364242553711, + "learning_rate": 4.481792717086835e-07, + "loss": 0.3833, + "step": 35800 + } + ], + "logging_steps": 1.0, + "max_steps": 35800, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.794046604000698e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}