diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 5000.0, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004, + "grad_norm": 0.7262070178985596, + "learning_rate": 9.333333333333334e-08, + "loss": 0.8408, + "step": 1 + }, + { + "epoch": 0.0008, + "grad_norm": 0.8701621890068054, + "learning_rate": 1.8666666666666667e-07, + "loss": 0.974, + "step": 2 + }, + { + "epoch": 0.0012, + "grad_norm": 0.7751222848892212, + "learning_rate": 2.8e-07, + "loss": 0.9699, + "step": 3 + }, + { + "epoch": 0.0016, + "grad_norm": 0.9030049443244934, + "learning_rate": 3.7333333333333334e-07, + "loss": 0.9404, + "step": 4 + }, + { + "epoch": 0.002, + "grad_norm": 0.9666240215301514, + "learning_rate": 4.6666666666666666e-07, + "loss": 0.8808, + "step": 5 + }, + { + "epoch": 0.0024, + "grad_norm": 1.1372973918914795, + "learning_rate": 5.6e-07, + "loss": 1.0387, + "step": 6 + }, + { + "epoch": 0.0028, + "grad_norm": 0.8488882780075073, + "learning_rate": 6.533333333333334e-07, + "loss": 1.0507, + "step": 7 + }, + { + "epoch": 0.0032, + "grad_norm": 0.6407755017280579, + "learning_rate": 7.466666666666667e-07, + "loss": 0.9219, + "step": 8 + }, + { + "epoch": 0.0036, + "grad_norm": 0.824164628982544, + "learning_rate": 8.4e-07, + "loss": 0.9653, + "step": 9 + }, + { + "epoch": 0.004, + "grad_norm": 0.9674360156059265, + "learning_rate": 9.333333333333333e-07, + "loss": 1.0234, + "step": 10 + }, + { + "epoch": 0.0044, + "grad_norm": 0.9015031456947327, + "learning_rate": 1.0266666666666666e-06, + "loss": 0.9666, + "step": 11 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8106409311294556, + "learning_rate": 1.12e-06, + "loss": 0.9502, + "step": 12 + }, + { + "epoch": 0.0052, + "grad_norm": 0.9036867022514343, + "learning_rate": 1.2133333333333333e-06, + "loss": 0.8926, + "step": 13 + }, + { + "epoch": 0.0056, + "grad_norm": 0.9781032204627991, + "learning_rate": 1.3066666666666667e-06, + "loss": 0.9054, + "step": 14 + }, + { + "epoch": 0.006, + "grad_norm": 0.6719639897346497, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.8787, + "step": 15 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8836888670921326, + "learning_rate": 1.4933333333333334e-06, + "loss": 0.9841, + "step": 16 + }, + { + "epoch": 0.0068, + "grad_norm": 0.8169286251068115, + "learning_rate": 1.5866666666666666e-06, + "loss": 0.9571, + "step": 17 + }, + { + "epoch": 0.0072, + "grad_norm": 0.8721626400947571, + "learning_rate": 1.68e-06, + "loss": 0.959, + "step": 18 + }, + { + "epoch": 0.0076, + "grad_norm": 0.8413893580436707, + "learning_rate": 1.7733333333333334e-06, + "loss": 0.8991, + "step": 19 + }, + { + "epoch": 0.008, + "grad_norm": 0.7627416253089905, + "learning_rate": 1.8666666666666667e-06, + "loss": 0.9145, + "step": 20 + }, + { + "epoch": 0.0084, + "grad_norm": 0.796474814414978, + "learning_rate": 1.9600000000000003e-06, + "loss": 1.0169, + "step": 21 + }, + { + "epoch": 0.0088, + "grad_norm": 0.7570239901542664, + "learning_rate": 2.0533333333333333e-06, + "loss": 0.8138, + "step": 22 + }, + { + "epoch": 0.0092, + "grad_norm": 0.6165841817855835, + "learning_rate": 2.1466666666666663e-06, + "loss": 0.8433, + "step": 23 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7676166296005249, + "learning_rate": 2.24e-06, + "loss": 0.9402, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 0.9886164665222168, + "learning_rate": 2.333333333333333e-06, + "loss": 0.9609, + "step": 25 + }, + { + "epoch": 0.0104, + "grad_norm": 0.6577991843223572, + "learning_rate": 2.4266666666666666e-06, + "loss": 0.911, + "step": 26 + }, + { + "epoch": 0.0108, + "grad_norm": 0.8582744598388672, + "learning_rate": 2.52e-06, + "loss": 0.868, + "step": 27 + }, + { + "epoch": 0.0112, + "grad_norm": 0.7015748023986816, + "learning_rate": 2.6133333333333334e-06, + "loss": 0.9196, + "step": 28 + }, + { + "epoch": 0.0116, + "grad_norm": 0.8589622974395752, + "learning_rate": 2.7066666666666664e-06, + "loss": 0.8029, + "step": 29 + }, + { + "epoch": 0.012, + "grad_norm": 0.708187997341156, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.8276, + "step": 30 + }, + { + "epoch": 0.0124, + "grad_norm": 0.5567463040351868, + "learning_rate": 2.8933333333333333e-06, + "loss": 0.7957, + "step": 31 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6444608569145203, + "learning_rate": 2.9866666666666667e-06, + "loss": 0.8813, + "step": 32 + }, + { + "epoch": 0.0132, + "grad_norm": 0.8051215410232544, + "learning_rate": 3.08e-06, + "loss": 0.8985, + "step": 33 + }, + { + "epoch": 0.0136, + "grad_norm": 0.5687784552574158, + "learning_rate": 3.173333333333333e-06, + "loss": 0.7884, + "step": 34 + }, + { + "epoch": 0.014, + "grad_norm": 0.5885452628135681, + "learning_rate": 3.2666666666666666e-06, + "loss": 0.8811, + "step": 35 + }, + { + "epoch": 0.0144, + "grad_norm": 0.7129512429237366, + "learning_rate": 3.36e-06, + "loss": 0.8201, + "step": 36 + }, + { + "epoch": 0.0148, + "grad_norm": 0.5670292377471924, + "learning_rate": 3.4533333333333334e-06, + "loss": 0.7139, + "step": 37 + }, + { + "epoch": 0.0152, + "grad_norm": 0.5718544125556946, + "learning_rate": 3.546666666666667e-06, + "loss": 0.8111, + "step": 38 + }, + { + "epoch": 0.0156, + "grad_norm": 0.5730088949203491, + "learning_rate": 3.64e-06, + "loss": 0.8961, + "step": 39 + }, + { + "epoch": 0.016, + "grad_norm": 0.6300646662712097, + "learning_rate": 3.7333333333333333e-06, + "loss": 0.7828, + "step": 40 + }, + { + "epoch": 0.0164, + "grad_norm": 0.44455617666244507, + "learning_rate": 3.826666666666667e-06, + "loss": 0.7466, + "step": 41 + }, + { + "epoch": 0.0168, + "grad_norm": 0.5530583262443542, + "learning_rate": 3.920000000000001e-06, + "loss": 0.7742, + "step": 42 + }, + { + "epoch": 0.0172, + "grad_norm": 0.49698424339294434, + "learning_rate": 4.013333333333334e-06, + "loss": 0.8639, + "step": 43 + }, + { + "epoch": 0.0176, + "grad_norm": 0.4044400453567505, + "learning_rate": 4.106666666666667e-06, + "loss": 0.7938, + "step": 44 + }, + { + "epoch": 0.018, + "grad_norm": 0.4309520721435547, + "learning_rate": 4.2e-06, + "loss": 0.7424, + "step": 45 + }, + { + "epoch": 0.0184, + "grad_norm": 0.42502909898757935, + "learning_rate": 4.293333333333333e-06, + "loss": 0.7268, + "step": 46 + }, + { + "epoch": 0.0188, + "grad_norm": 0.4377117455005646, + "learning_rate": 4.3866666666666665e-06, + "loss": 0.7784, + "step": 47 + }, + { + "epoch": 0.0192, + "grad_norm": 0.4521588683128357, + "learning_rate": 4.48e-06, + "loss": 0.7764, + "step": 48 + }, + { + "epoch": 0.0196, + "grad_norm": 0.3998861610889435, + "learning_rate": 4.573333333333333e-06, + "loss": 0.7351, + "step": 49 + }, + { + "epoch": 0.02, + "grad_norm": 0.3292255103588104, + "learning_rate": 4.666666666666666e-06, + "loss": 0.7507, + "step": 50 + }, + { + "epoch": 0.0204, + "grad_norm": 0.3537384867668152, + "learning_rate": 4.76e-06, + "loss": 0.692, + "step": 51 + }, + { + "epoch": 0.0208, + "grad_norm": 0.3487699329853058, + "learning_rate": 4.853333333333333e-06, + "loss": 0.6861, + "step": 52 + }, + { + "epoch": 0.0212, + "grad_norm": 0.4202251732349396, + "learning_rate": 4.946666666666666e-06, + "loss": 0.8594, + "step": 53 + }, + { + "epoch": 0.0216, + "grad_norm": 0.318401962518692, + "learning_rate": 5.04e-06, + "loss": 0.677, + "step": 54 + }, + { + "epoch": 0.022, + "grad_norm": 0.3611930310726166, + "learning_rate": 5.133333333333333e-06, + "loss": 0.8567, + "step": 55 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4041845500469208, + "learning_rate": 5.226666666666667e-06, + "loss": 0.8468, + "step": 56 + }, + { + "epoch": 0.0228, + "grad_norm": 0.3687663674354553, + "learning_rate": 5.32e-06, + "loss": 0.7676, + "step": 57 + }, + { + "epoch": 0.0232, + "grad_norm": 0.3392384946346283, + "learning_rate": 5.413333333333333e-06, + "loss": 0.7888, + "step": 58 + }, + { + "epoch": 0.0236, + "grad_norm": 0.3390963077545166, + "learning_rate": 5.506666666666666e-06, + "loss": 0.7495, + "step": 59 + }, + { + "epoch": 0.024, + "grad_norm": 0.34994274377822876, + "learning_rate": 5.600000000000001e-06, + "loss": 0.7773, + "step": 60 + }, + { + "epoch": 0.0244, + "grad_norm": 0.3523801863193512, + "learning_rate": 5.693333333333334e-06, + "loss": 0.729, + "step": 61 + }, + { + "epoch": 0.0248, + "grad_norm": 0.3572114109992981, + "learning_rate": 5.786666666666667e-06, + "loss": 0.7668, + "step": 62 + }, + { + "epoch": 0.0252, + "grad_norm": 0.3471089005470276, + "learning_rate": 5.88e-06, + "loss": 0.7862, + "step": 63 + }, + { + "epoch": 0.0256, + "grad_norm": 0.35437729954719543, + "learning_rate": 5.9733333333333335e-06, + "loss": 0.7558, + "step": 64 + }, + { + "epoch": 0.026, + "grad_norm": 0.33817169070243835, + "learning_rate": 6.0666666666666665e-06, + "loss": 0.6867, + "step": 65 + }, + { + "epoch": 0.0264, + "grad_norm": 0.34083473682403564, + "learning_rate": 6.16e-06, + "loss": 0.7223, + "step": 66 + }, + { + "epoch": 0.0268, + "grad_norm": 0.35583579540252686, + "learning_rate": 6.253333333333333e-06, + "loss": 0.7972, + "step": 67 + }, + { + "epoch": 0.0272, + "grad_norm": 0.3342086374759674, + "learning_rate": 6.346666666666666e-06, + "loss": 0.6687, + "step": 68 + }, + { + "epoch": 0.0276, + "grad_norm": 0.32780104875564575, + "learning_rate": 6.44e-06, + "loss": 0.7269, + "step": 69 + }, + { + "epoch": 0.028, + "grad_norm": 0.3273721933364868, + "learning_rate": 6.533333333333333e-06, + "loss": 0.7169, + "step": 70 + }, + { + "epoch": 0.0284, + "grad_norm": 0.3392617404460907, + "learning_rate": 6.626666666666666e-06, + "loss": 0.7153, + "step": 71 + }, + { + "epoch": 0.0288, + "grad_norm": 0.3380572199821472, + "learning_rate": 6.72e-06, + "loss": 0.7002, + "step": 72 + }, + { + "epoch": 0.0292, + "grad_norm": 0.30406832695007324, + "learning_rate": 6.813333333333334e-06, + "loss": 0.6909, + "step": 73 + }, + { + "epoch": 0.0296, + "grad_norm": 0.31057706475257874, + "learning_rate": 6.906666666666667e-06, + "loss": 0.6694, + "step": 74 + }, + { + "epoch": 0.03, + "grad_norm": 0.370216429233551, + "learning_rate": 7e-06, + "loss": 0.6705, + "step": 75 + }, + { + "epoch": 0.0304, + "grad_norm": 0.34453248977661133, + "learning_rate": 6.9999970629303366e-06, + "loss": 0.7221, + "step": 76 + }, + { + "epoch": 0.0308, + "grad_norm": 0.37071311473846436, + "learning_rate": 6.999988251726278e-06, + "loss": 0.7792, + "step": 77 + }, + { + "epoch": 0.0312, + "grad_norm": 0.32805272936820984, + "learning_rate": 6.99997356640261e-06, + "loss": 0.6478, + "step": 78 + }, + { + "epoch": 0.0316, + "grad_norm": 0.34910848736763, + "learning_rate": 6.999953006983981e-06, + "loss": 0.8052, + "step": 79 + }, + { + "epoch": 0.032, + "grad_norm": 0.35916391015052795, + "learning_rate": 6.999926573504895e-06, + "loss": 0.7894, + "step": 80 + }, + { + "epoch": 0.0324, + "grad_norm": 0.335122287273407, + "learning_rate": 6.999894266009718e-06, + "loss": 0.7175, + "step": 81 + }, + { + "epoch": 0.0328, + "grad_norm": 0.3043181598186493, + "learning_rate": 6.999856084552671e-06, + "loss": 0.683, + "step": 82 + }, + { + "epoch": 0.0332, + "grad_norm": 0.3139221966266632, + "learning_rate": 6.9998120291978345e-06, + "loss": 0.5613, + "step": 83 + }, + { + "epoch": 0.0336, + "grad_norm": 0.3567010164260864, + "learning_rate": 6.999762100019149e-06, + "loss": 0.7053, + "step": 84 + }, + { + "epoch": 0.034, + "grad_norm": 0.32007086277008057, + "learning_rate": 6.999706297100412e-06, + "loss": 0.7367, + "step": 85 + }, + { + "epoch": 0.0344, + "grad_norm": 0.34357309341430664, + "learning_rate": 6.999644620535277e-06, + "loss": 0.713, + "step": 86 + }, + { + "epoch": 0.0348, + "grad_norm": 0.3997757136821747, + "learning_rate": 6.999577070427259e-06, + "loss": 0.7182, + "step": 87 + }, + { + "epoch": 0.0352, + "grad_norm": 0.36973533034324646, + "learning_rate": 6.999503646889729e-06, + "loss": 0.725, + "step": 88 + }, + { + "epoch": 0.0356, + "grad_norm": 0.3873033821582794, + "learning_rate": 6.999424350045915e-06, + "loss": 0.6845, + "step": 89 + }, + { + "epoch": 0.036, + "grad_norm": 0.3782697319984436, + "learning_rate": 6.9993391800289045e-06, + "loss": 0.6828, + "step": 90 + }, + { + "epoch": 0.0364, + "grad_norm": 0.3623950481414795, + "learning_rate": 6.9992481369816375e-06, + "loss": 0.6517, + "step": 91 + }, + { + "epoch": 0.0368, + "grad_norm": 0.3495238423347473, + "learning_rate": 6.999151221056916e-06, + "loss": 0.6296, + "step": 92 + }, + { + "epoch": 0.0372, + "grad_norm": 0.35579007863998413, + "learning_rate": 6.999048432417397e-06, + "loss": 0.7149, + "step": 93 + }, + { + "epoch": 0.0376, + "grad_norm": 0.35396984219551086, + "learning_rate": 6.9989397712355925e-06, + "loss": 0.6782, + "step": 94 + }, + { + "epoch": 0.038, + "grad_norm": 0.31575071811676025, + "learning_rate": 6.9988252376938705e-06, + "loss": 0.5716, + "step": 95 + }, + { + "epoch": 0.0384, + "grad_norm": 0.327668696641922, + "learning_rate": 6.998704831984456e-06, + "loss": 0.6009, + "step": 96 + }, + { + "epoch": 0.0388, + "grad_norm": 0.32920071482658386, + "learning_rate": 6.998578554309431e-06, + "loss": 0.6265, + "step": 97 + }, + { + "epoch": 0.0392, + "grad_norm": 0.34935787320137024, + "learning_rate": 6.998446404880727e-06, + "loss": 0.5991, + "step": 98 + }, + { + "epoch": 0.0396, + "grad_norm": 0.3906346261501312, + "learning_rate": 6.998308383920136e-06, + "loss": 0.6032, + "step": 99 + }, + { + "epoch": 0.04, + "grad_norm": 0.3648155927658081, + "learning_rate": 6.998164491659303e-06, + "loss": 0.6669, + "step": 100 + }, + { + "epoch": 0.0404, + "grad_norm": 0.35841482877731323, + "learning_rate": 6.998014728339723e-06, + "loss": 0.6297, + "step": 101 + }, + { + "epoch": 0.0408, + "grad_norm": 0.3739035129547119, + "learning_rate": 6.997859094212749e-06, + "loss": 0.5969, + "step": 102 + }, + { + "epoch": 0.0412, + "grad_norm": 0.4193374216556549, + "learning_rate": 6.9976975895395875e-06, + "loss": 0.6492, + "step": 103 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4354825019836426, + "learning_rate": 6.997530214591294e-06, + "loss": 0.6634, + "step": 104 + }, + { + "epoch": 0.042, + "grad_norm": 0.39221763610839844, + "learning_rate": 6.997356969648778e-06, + "loss": 0.6369, + "step": 105 + }, + { + "epoch": 0.0424, + "grad_norm": 0.41635167598724365, + "learning_rate": 6.997177855002802e-06, + "loss": 0.673, + "step": 106 + }, + { + "epoch": 0.0428, + "grad_norm": 0.34026390314102173, + "learning_rate": 6.996992870953979e-06, + "loss": 0.6071, + "step": 107 + }, + { + "epoch": 0.0432, + "grad_norm": 0.30913060903549194, + "learning_rate": 6.996802017812771e-06, + "loss": 0.539, + "step": 108 + }, + { + "epoch": 0.0436, + "grad_norm": 0.40669533610343933, + "learning_rate": 6.9966052958994926e-06, + "loss": 0.7329, + "step": 109 + }, + { + "epoch": 0.044, + "grad_norm": 0.32939252257347107, + "learning_rate": 6.996402705544307e-06, + "loss": 0.6284, + "step": 110 + }, + { + "epoch": 0.0444, + "grad_norm": 0.33887726068496704, + "learning_rate": 6.996194247087227e-06, + "loss": 0.7118, + "step": 111 + }, + { + "epoch": 0.0448, + "grad_norm": 0.40371280908584595, + "learning_rate": 6.995979920878112e-06, + "loss": 0.7295, + "step": 112 + }, + { + "epoch": 0.0452, + "grad_norm": 0.36427947878837585, + "learning_rate": 6.995759727276674e-06, + "loss": 0.6565, + "step": 113 + }, + { + "epoch": 0.0456, + "grad_norm": 0.34461307525634766, + "learning_rate": 6.995533666652468e-06, + "loss": 0.6354, + "step": 114 + }, + { + "epoch": 0.046, + "grad_norm": 0.3352108597755432, + "learning_rate": 6.995301739384897e-06, + "loss": 0.718, + "step": 115 + }, + { + "epoch": 0.0464, + "grad_norm": 0.34925392270088196, + "learning_rate": 6.995063945863211e-06, + "loss": 0.5494, + "step": 116 + }, + { + "epoch": 0.0468, + "grad_norm": 0.3926670253276825, + "learning_rate": 6.994820286486506e-06, + "loss": 0.7144, + "step": 117 + }, + { + "epoch": 0.0472, + "grad_norm": 0.3799579441547394, + "learning_rate": 6.99457076166372e-06, + "loss": 0.6634, + "step": 118 + }, + { + "epoch": 0.0476, + "grad_norm": 0.3147001564502716, + "learning_rate": 6.994315371813637e-06, + "loss": 0.54, + "step": 119 + }, + { + "epoch": 0.048, + "grad_norm": 0.3471606969833374, + "learning_rate": 6.994054117364885e-06, + "loss": 0.6155, + "step": 120 + }, + { + "epoch": 0.0484, + "grad_norm": 0.3033629357814789, + "learning_rate": 6.993786998755934e-06, + "loss": 0.6386, + "step": 121 + }, + { + "epoch": 0.0488, + "grad_norm": 0.3264774680137634, + "learning_rate": 6.9935140164350955e-06, + "loss": 0.5989, + "step": 122 + }, + { + "epoch": 0.0492, + "grad_norm": 0.3133617341518402, + "learning_rate": 6.993235170860523e-06, + "loss": 0.5887, + "step": 123 + }, + { + "epoch": 0.0496, + "grad_norm": 0.30928051471710205, + "learning_rate": 6.99295046250021e-06, + "loss": 0.6331, + "step": 124 + }, + { + "epoch": 0.05, + "grad_norm": 0.35159608721733093, + "learning_rate": 6.992659891831991e-06, + "loss": 0.6124, + "step": 125 + }, + { + "epoch": 0.0504, + "grad_norm": 0.31284284591674805, + "learning_rate": 6.992363459343536e-06, + "loss": 0.5279, + "step": 126 + }, + { + "epoch": 0.0508, + "grad_norm": 0.3226817548274994, + "learning_rate": 6.992061165532358e-06, + "loss": 0.5916, + "step": 127 + }, + { + "epoch": 0.0512, + "grad_norm": 0.35078591108322144, + "learning_rate": 6.9917530109058e-06, + "loss": 0.5826, + "step": 128 + }, + { + "epoch": 0.0516, + "grad_norm": 0.33087560534477234, + "learning_rate": 6.99143899598105e-06, + "loss": 0.508, + "step": 129 + }, + { + "epoch": 0.052, + "grad_norm": 0.33635255694389343, + "learning_rate": 6.991119121285126e-06, + "loss": 0.6149, + "step": 130 + }, + { + "epoch": 0.0524, + "grad_norm": 0.3316454589366913, + "learning_rate": 6.990793387354881e-06, + "loss": 0.5906, + "step": 131 + }, + { + "epoch": 0.0528, + "grad_norm": 0.3063321113586426, + "learning_rate": 6.990461794737004e-06, + "loss": 0.5499, + "step": 132 + }, + { + "epoch": 0.0532, + "grad_norm": 0.3659082055091858, + "learning_rate": 6.990124343988014e-06, + "loss": 0.5934, + "step": 133 + }, + { + "epoch": 0.0536, + "grad_norm": 0.35312604904174805, + "learning_rate": 6.989781035674264e-06, + "loss": 0.6557, + "step": 134 + }, + { + "epoch": 0.054, + "grad_norm": 0.30387458205223083, + "learning_rate": 6.9894318703719365e-06, + "loss": 0.57, + "step": 135 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3087642192840576, + "learning_rate": 6.989076848667046e-06, + "loss": 0.6317, + "step": 136 + }, + { + "epoch": 0.0548, + "grad_norm": 0.3203091621398926, + "learning_rate": 6.988715971155433e-06, + "loss": 0.5949, + "step": 137 + }, + { + "epoch": 0.0552, + "grad_norm": 0.37302759289741516, + "learning_rate": 6.988349238442766e-06, + "loss": 0.7094, + "step": 138 + }, + { + "epoch": 0.0556, + "grad_norm": 0.3452549874782562, + "learning_rate": 6.987976651144545e-06, + "loss": 0.7109, + "step": 139 + }, + { + "epoch": 0.056, + "grad_norm": 0.3411109745502472, + "learning_rate": 6.987598209886092e-06, + "loss": 0.5774, + "step": 140 + }, + { + "epoch": 0.0564, + "grad_norm": 0.3314734399318695, + "learning_rate": 6.987213915302555e-06, + "loss": 0.5756, + "step": 141 + }, + { + "epoch": 0.0568, + "grad_norm": 0.34023550152778625, + "learning_rate": 6.986823768038903e-06, + "loss": 0.7022, + "step": 142 + }, + { + "epoch": 0.0572, + "grad_norm": 0.3167857825756073, + "learning_rate": 6.986427768749931e-06, + "loss": 0.5602, + "step": 143 + }, + { + "epoch": 0.0576, + "grad_norm": 0.3189705014228821, + "learning_rate": 6.986025918100257e-06, + "loss": 0.6159, + "step": 144 + }, + { + "epoch": 0.058, + "grad_norm": 0.33322688937187195, + "learning_rate": 6.985618216764315e-06, + "loss": 0.6296, + "step": 145 + }, + { + "epoch": 0.0584, + "grad_norm": 0.3075767457485199, + "learning_rate": 6.98520466542636e-06, + "loss": 0.5316, + "step": 146 + }, + { + "epoch": 0.0588, + "grad_norm": 0.3329537808895111, + "learning_rate": 6.984785264780469e-06, + "loss": 0.6537, + "step": 147 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4022378623485565, + "learning_rate": 6.984360015530528e-06, + "loss": 0.5802, + "step": 148 + }, + { + "epoch": 0.0596, + "grad_norm": 0.37128475308418274, + "learning_rate": 6.983928918390249e-06, + "loss": 0.6398, + "step": 149 + }, + { + "epoch": 0.06, + "grad_norm": 0.3873206079006195, + "learning_rate": 6.983491974083149e-06, + "loss": 0.7127, + "step": 150 + }, + { + "epoch": 0.0604, + "grad_norm": 0.39454513788223267, + "learning_rate": 6.983049183342565e-06, + "loss": 0.7034, + "step": 151 + }, + { + "epoch": 0.0608, + "grad_norm": 0.35343465209007263, + "learning_rate": 6.982600546911644e-06, + "loss": 0.56, + "step": 152 + }, + { + "epoch": 0.0612, + "grad_norm": 0.35445618629455566, + "learning_rate": 6.982146065543342e-06, + "loss": 0.5552, + "step": 153 + }, + { + "epoch": 0.0616, + "grad_norm": 0.3582383990287781, + "learning_rate": 6.981685740000428e-06, + "loss": 0.6675, + "step": 154 + }, + { + "epoch": 0.062, + "grad_norm": 0.3691483736038208, + "learning_rate": 6.98121957105548e-06, + "loss": 0.64, + "step": 155 + }, + { + "epoch": 0.0624, + "grad_norm": 0.34397178888320923, + "learning_rate": 6.980747559490879e-06, + "loss": 0.4971, + "step": 156 + }, + { + "epoch": 0.0628, + "grad_norm": 0.3170539438724518, + "learning_rate": 6.980269706098813e-06, + "loss": 0.5984, + "step": 157 + }, + { + "epoch": 0.0632, + "grad_norm": 0.3346475064754486, + "learning_rate": 6.979786011681279e-06, + "loss": 0.6547, + "step": 158 + }, + { + "epoch": 0.0636, + "grad_norm": 0.3395736515522003, + "learning_rate": 6.97929647705007e-06, + "loss": 0.5189, + "step": 159 + }, + { + "epoch": 0.064, + "grad_norm": 0.3688207268714905, + "learning_rate": 6.978801103026787e-06, + "loss": 0.6356, + "step": 160 + }, + { + "epoch": 0.0644, + "grad_norm": 0.3267056345939636, + "learning_rate": 6.978299890442828e-06, + "loss": 0.6025, + "step": 161 + }, + { + "epoch": 0.0648, + "grad_norm": 0.33124905824661255, + "learning_rate": 6.977792840139391e-06, + "loss": 0.5353, + "step": 162 + }, + { + "epoch": 0.0652, + "grad_norm": 0.33149927854537964, + "learning_rate": 6.977279952967471e-06, + "loss": 0.5713, + "step": 163 + }, + { + "epoch": 0.0656, + "grad_norm": 0.3635561466217041, + "learning_rate": 6.976761229787861e-06, + "loss": 0.6106, + "step": 164 + }, + { + "epoch": 0.066, + "grad_norm": 0.3570541739463806, + "learning_rate": 6.976236671471146e-06, + "loss": 0.6686, + "step": 165 + }, + { + "epoch": 0.0664, + "grad_norm": 0.35300779342651367, + "learning_rate": 6.975706278897706e-06, + "loss": 0.6507, + "step": 166 + }, + { + "epoch": 0.0668, + "grad_norm": 0.35414016246795654, + "learning_rate": 6.975170052957712e-06, + "loss": 0.625, + "step": 167 + }, + { + "epoch": 0.0672, + "grad_norm": 0.36714860796928406, + "learning_rate": 6.974627994551126e-06, + "loss": 0.622, + "step": 168 + }, + { + "epoch": 0.0676, + "grad_norm": 0.3507433235645294, + "learning_rate": 6.974080104587699e-06, + "loss": 0.6359, + "step": 169 + }, + { + "epoch": 0.068, + "grad_norm": 0.31815552711486816, + "learning_rate": 6.973526383986968e-06, + "loss": 0.5666, + "step": 170 + }, + { + "epoch": 0.0684, + "grad_norm": 0.3590649962425232, + "learning_rate": 6.972966833678257e-06, + "loss": 0.6185, + "step": 171 + }, + { + "epoch": 0.0688, + "grad_norm": 0.3098425269126892, + "learning_rate": 6.9724014546006725e-06, + "loss": 0.589, + "step": 172 + }, + { + "epoch": 0.0692, + "grad_norm": 0.3326118588447571, + "learning_rate": 6.971830247703105e-06, + "loss": 0.5494, + "step": 173 + }, + { + "epoch": 0.0696, + "grad_norm": 0.3343932330608368, + "learning_rate": 6.9712532139442275e-06, + "loss": 0.5835, + "step": 174 + }, + { + "epoch": 0.07, + "grad_norm": 0.3673574924468994, + "learning_rate": 6.970670354292488e-06, + "loss": 0.6293, + "step": 175 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3204609155654907, + "learning_rate": 6.9700816697261155e-06, + "loss": 0.614, + "step": 176 + }, + { + "epoch": 0.0708, + "grad_norm": 0.3640765845775604, + "learning_rate": 6.969487161233115e-06, + "loss": 0.6177, + "step": 177 + }, + { + "epoch": 0.0712, + "grad_norm": 0.3944571912288666, + "learning_rate": 6.968886829811264e-06, + "loss": 0.6153, + "step": 178 + }, + { + "epoch": 0.0716, + "grad_norm": 0.3283381760120392, + "learning_rate": 6.968280676468116e-06, + "loss": 0.6266, + "step": 179 + }, + { + "epoch": 0.072, + "grad_norm": 0.33310529589653015, + "learning_rate": 6.967668702220992e-06, + "loss": 0.6409, + "step": 180 + }, + { + "epoch": 0.0724, + "grad_norm": 0.369968980550766, + "learning_rate": 6.967050908096984e-06, + "loss": 0.6872, + "step": 181 + }, + { + "epoch": 0.0728, + "grad_norm": 0.36735835671424866, + "learning_rate": 6.966427295132952e-06, + "loss": 0.6841, + "step": 182 + }, + { + "epoch": 0.0732, + "grad_norm": 0.3196526765823364, + "learning_rate": 6.965797864375522e-06, + "loss": 0.5331, + "step": 183 + }, + { + "epoch": 0.0736, + "grad_norm": 0.35567739605903625, + "learning_rate": 6.965162616881084e-06, + "loss": 0.6226, + "step": 184 + }, + { + "epoch": 0.074, + "grad_norm": 0.38448742032051086, + "learning_rate": 6.964521553715789e-06, + "loss": 0.6539, + "step": 185 + }, + { + "epoch": 0.0744, + "grad_norm": 0.3042820990085602, + "learning_rate": 6.96387467595555e-06, + "loss": 0.5079, + "step": 186 + }, + { + "epoch": 0.0748, + "grad_norm": 0.37548917531967163, + "learning_rate": 6.963221984686039e-06, + "loss": 0.5517, + "step": 187 + }, + { + "epoch": 0.0752, + "grad_norm": 0.30672183632850647, + "learning_rate": 6.962563481002683e-06, + "loss": 0.533, + "step": 188 + }, + { + "epoch": 0.0756, + "grad_norm": 0.3858972489833832, + "learning_rate": 6.9618991660106675e-06, + "loss": 0.5961, + "step": 189 + }, + { + "epoch": 0.076, + "grad_norm": 0.3610040843486786, + "learning_rate": 6.961229040824927e-06, + "loss": 0.6161, + "step": 190 + }, + { + "epoch": 0.0764, + "grad_norm": 0.33447155356407166, + "learning_rate": 6.960553106570152e-06, + "loss": 0.5496, + "step": 191 + }, + { + "epoch": 0.0768, + "grad_norm": 0.3702964782714844, + "learning_rate": 6.959871364380779e-06, + "loss": 0.5928, + "step": 192 + }, + { + "epoch": 0.0772, + "grad_norm": 0.3681807816028595, + "learning_rate": 6.959183815400994e-06, + "loss": 0.5623, + "step": 193 + }, + { + "epoch": 0.0776, + "grad_norm": 0.3803996741771698, + "learning_rate": 6.958490460784728e-06, + "loss": 0.7486, + "step": 194 + }, + { + "epoch": 0.078, + "grad_norm": 0.37968674302101135, + "learning_rate": 6.957791301695654e-06, + "loss": 0.6576, + "step": 195 + }, + { + "epoch": 0.0784, + "grad_norm": 0.3494243919849396, + "learning_rate": 6.95708633930719e-06, + "loss": 0.641, + "step": 196 + }, + { + "epoch": 0.0788, + "grad_norm": 0.3401181399822235, + "learning_rate": 6.956375574802492e-06, + "loss": 0.5693, + "step": 197 + }, + { + "epoch": 0.0792, + "grad_norm": 0.361893892288208, + "learning_rate": 6.955659009374456e-06, + "loss": 0.5069, + "step": 198 + }, + { + "epoch": 0.0796, + "grad_norm": 0.3742466866970062, + "learning_rate": 6.954936644225709e-06, + "loss": 0.5492, + "step": 199 + }, + { + "epoch": 0.08, + "grad_norm": 0.4014841616153717, + "learning_rate": 6.954208480568617e-06, + "loss": 0.6495, + "step": 200 + }, + { + "epoch": 0.0804, + "grad_norm": 0.36528071761131287, + "learning_rate": 6.953474519625275e-06, + "loss": 0.7123, + "step": 201 + }, + { + "epoch": 0.0808, + "grad_norm": 0.36030465364456177, + "learning_rate": 6.952734762627509e-06, + "loss": 0.5403, + "step": 202 + }, + { + "epoch": 0.0812, + "grad_norm": 0.36890923976898193, + "learning_rate": 6.95198921081687e-06, + "loss": 0.5394, + "step": 203 + }, + { + "epoch": 0.0816, + "grad_norm": 0.3089402914047241, + "learning_rate": 6.951237865444638e-06, + "loss": 0.5311, + "step": 204 + }, + { + "epoch": 0.082, + "grad_norm": 0.3627172112464905, + "learning_rate": 6.950480727771816e-06, + "loss": 0.6114, + "step": 205 + }, + { + "epoch": 0.0824, + "grad_norm": 0.35881373286247253, + "learning_rate": 6.949717799069127e-06, + "loss": 0.5708, + "step": 206 + }, + { + "epoch": 0.0828, + "grad_norm": 0.3384152054786682, + "learning_rate": 6.9489490806170125e-06, + "loss": 0.556, + "step": 207 + }, + { + "epoch": 0.0832, + "grad_norm": 0.37565433979034424, + "learning_rate": 6.948174573705633e-06, + "loss": 0.5681, + "step": 208 + }, + { + "epoch": 0.0836, + "grad_norm": 0.3774128556251526, + "learning_rate": 6.947394279634864e-06, + "loss": 0.5823, + "step": 209 + }, + { + "epoch": 0.084, + "grad_norm": 0.35291945934295654, + "learning_rate": 6.946608199714292e-06, + "loss": 0.6107, + "step": 210 + }, + { + "epoch": 0.0844, + "grad_norm": 0.3333498537540436, + "learning_rate": 6.945816335263216e-06, + "loss": 0.5596, + "step": 211 + }, + { + "epoch": 0.0848, + "grad_norm": 0.3444664180278778, + "learning_rate": 6.94501868761064e-06, + "loss": 0.5874, + "step": 212 + }, + { + "epoch": 0.0852, + "grad_norm": 0.3725375235080719, + "learning_rate": 6.94421525809528e-06, + "loss": 0.6062, + "step": 213 + }, + { + "epoch": 0.0856, + "grad_norm": 0.321502149105072, + "learning_rate": 6.943406048065549e-06, + "loss": 0.5407, + "step": 214 + }, + { + "epoch": 0.086, + "grad_norm": 0.37382030487060547, + "learning_rate": 6.942591058879567e-06, + "loss": 0.6295, + "step": 215 + }, + { + "epoch": 0.0864, + "grad_norm": 0.3625917434692383, + "learning_rate": 6.941770291905149e-06, + "loss": 0.5852, + "step": 216 + }, + { + "epoch": 0.0868, + "grad_norm": 0.3712015748023987, + "learning_rate": 6.940943748519811e-06, + "loss": 0.6046, + "step": 217 + }, + { + "epoch": 0.0872, + "grad_norm": 0.3427964448928833, + "learning_rate": 6.940111430110762e-06, + "loss": 0.5867, + "step": 218 + }, + { + "epoch": 0.0876, + "grad_norm": 0.3536797761917114, + "learning_rate": 6.939273338074902e-06, + "loss": 0.5192, + "step": 219 + }, + { + "epoch": 0.088, + "grad_norm": 0.36839908361434937, + "learning_rate": 6.938429473818823e-06, + "loss": 0.7215, + "step": 220 + }, + { + "epoch": 0.0884, + "grad_norm": 0.3873510956764221, + "learning_rate": 6.937579838758804e-06, + "loss": 0.5555, + "step": 221 + }, + { + "epoch": 0.0888, + "grad_norm": 0.371750146150589, + "learning_rate": 6.936724434320808e-06, + "loss": 0.6247, + "step": 222 + }, + { + "epoch": 0.0892, + "grad_norm": 0.35638293623924255, + "learning_rate": 6.935863261940484e-06, + "loss": 0.6369, + "step": 223 + }, + { + "epoch": 0.0896, + "grad_norm": 0.3919801712036133, + "learning_rate": 6.934996323063157e-06, + "loss": 0.6772, + "step": 224 + }, + { + "epoch": 0.09, + "grad_norm": 0.4034706652164459, + "learning_rate": 6.934123619143837e-06, + "loss": 0.655, + "step": 225 + }, + { + "epoch": 0.0904, + "grad_norm": 0.36784958839416504, + "learning_rate": 6.933245151647201e-06, + "loss": 0.6007, + "step": 226 + }, + { + "epoch": 0.0908, + "grad_norm": 0.36665797233581543, + "learning_rate": 6.932360922047605e-06, + "loss": 0.6603, + "step": 227 + }, + { + "epoch": 0.0912, + "grad_norm": 0.334515243768692, + "learning_rate": 6.931470931829073e-06, + "loss": 0.5401, + "step": 228 + }, + { + "epoch": 0.0916, + "grad_norm": 0.3468990921974182, + "learning_rate": 6.9305751824853e-06, + "loss": 0.5969, + "step": 229 + }, + { + "epoch": 0.092, + "grad_norm": 0.38072285056114197, + "learning_rate": 6.929673675519645e-06, + "loss": 0.6163, + "step": 230 + }, + { + "epoch": 0.0924, + "grad_norm": 0.33580705523490906, + "learning_rate": 6.928766412445129e-06, + "loss": 0.5892, + "step": 231 + }, + { + "epoch": 0.0928, + "grad_norm": 0.3787344992160797, + "learning_rate": 6.927853394784435e-06, + "loss": 0.5176, + "step": 232 + }, + { + "epoch": 0.0932, + "grad_norm": 0.3536042273044586, + "learning_rate": 6.926934624069905e-06, + "loss": 0.5962, + "step": 233 + }, + { + "epoch": 0.0936, + "grad_norm": 0.3443499207496643, + "learning_rate": 6.926010101843533e-06, + "loss": 0.5676, + "step": 234 + }, + { + "epoch": 0.094, + "grad_norm": 0.3868633210659027, + "learning_rate": 6.925079829656971e-06, + "loss": 0.6262, + "step": 235 + }, + { + "epoch": 0.0944, + "grad_norm": 0.36493581533432007, + "learning_rate": 6.924143809071516e-06, + "loss": 0.6733, + "step": 236 + }, + { + "epoch": 0.0948, + "grad_norm": 0.3501265347003937, + "learning_rate": 6.923202041658117e-06, + "loss": 0.5836, + "step": 237 + }, + { + "epoch": 0.0952, + "grad_norm": 0.319475919008255, + "learning_rate": 6.922254528997366e-06, + "loss": 0.5541, + "step": 238 + }, + { + "epoch": 0.0956, + "grad_norm": 0.3460980951786041, + "learning_rate": 6.921301272679497e-06, + "loss": 0.5136, + "step": 239 + }, + { + "epoch": 0.096, + "grad_norm": 0.3520514965057373, + "learning_rate": 6.920342274304385e-06, + "loss": 0.525, + "step": 240 + }, + { + "epoch": 0.0964, + "grad_norm": 0.36067137122154236, + "learning_rate": 6.919377535481542e-06, + "loss": 0.5507, + "step": 241 + }, + { + "epoch": 0.0968, + "grad_norm": 0.38827893137931824, + "learning_rate": 6.918407057830112e-06, + "loss": 0.5688, + "step": 242 + }, + { + "epoch": 0.0972, + "grad_norm": 0.3810005784034729, + "learning_rate": 6.917430842978874e-06, + "loss": 0.61, + "step": 243 + }, + { + "epoch": 0.0976, + "grad_norm": 0.40279942750930786, + "learning_rate": 6.916448892566233e-06, + "loss": 0.6187, + "step": 244 + }, + { + "epoch": 0.098, + "grad_norm": 0.3923201858997345, + "learning_rate": 6.915461208240224e-06, + "loss": 0.6091, + "step": 245 + }, + { + "epoch": 0.0984, + "grad_norm": 0.36452221870422363, + "learning_rate": 6.9144677916585e-06, + "loss": 0.5192, + "step": 246 + }, + { + "epoch": 0.0988, + "grad_norm": 0.3636743724346161, + "learning_rate": 6.9134686444883375e-06, + "loss": 0.638, + "step": 247 + }, + { + "epoch": 0.0992, + "grad_norm": 0.35182589292526245, + "learning_rate": 6.912463768406632e-06, + "loss": 0.5919, + "step": 248 + }, + { + "epoch": 0.0996, + "grad_norm": 0.4089539647102356, + "learning_rate": 6.911453165099894e-06, + "loss": 0.6028, + "step": 249 + }, + { + "epoch": 0.1, + "grad_norm": 0.3915441036224365, + "learning_rate": 6.910436836264241e-06, + "loss": 0.5141, + "step": 250 + }, + { + "epoch": 0.1004, + "grad_norm": 0.36682936549186707, + "learning_rate": 6.909414783605407e-06, + "loss": 0.5331, + "step": 251 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4124221205711365, + "learning_rate": 6.908387008838727e-06, + "loss": 0.6104, + "step": 252 + }, + { + "epoch": 0.1012, + "grad_norm": 0.3299160897731781, + "learning_rate": 6.907353513689142e-06, + "loss": 0.5446, + "step": 253 + }, + { + "epoch": 0.1016, + "grad_norm": 0.3507899045944214, + "learning_rate": 6.906314299891195e-06, + "loss": 0.5387, + "step": 254 + }, + { + "epoch": 0.102, + "grad_norm": 0.3491806983947754, + "learning_rate": 6.905269369189024e-06, + "loss": 0.5394, + "step": 255 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3935128450393677, + "learning_rate": 6.904218723336361e-06, + "loss": 0.5568, + "step": 256 + }, + { + "epoch": 0.1028, + "grad_norm": 0.4095175266265869, + "learning_rate": 6.903162364096535e-06, + "loss": 0.6018, + "step": 257 + }, + { + "epoch": 0.1032, + "grad_norm": 0.42139819264411926, + "learning_rate": 6.902100293242458e-06, + "loss": 0.7185, + "step": 258 + }, + { + "epoch": 0.1036, + "grad_norm": 0.388468474149704, + "learning_rate": 6.901032512556632e-06, + "loss": 0.5549, + "step": 259 + }, + { + "epoch": 0.104, + "grad_norm": 0.35161855816841125, + "learning_rate": 6.899959023831139e-06, + "loss": 0.5437, + "step": 260 + }, + { + "epoch": 0.1044, + "grad_norm": 0.4257499575614929, + "learning_rate": 6.898879828867646e-06, + "loss": 0.6383, + "step": 261 + }, + { + "epoch": 0.1048, + "grad_norm": 0.48008543252944946, + "learning_rate": 6.89779492947739e-06, + "loss": 0.6421, + "step": 262 + }, + { + "epoch": 0.1052, + "grad_norm": 0.3906846046447754, + "learning_rate": 6.896704327481186e-06, + "loss": 0.6624, + "step": 263 + }, + { + "epoch": 0.1056, + "grad_norm": 0.3648388385772705, + "learning_rate": 6.895608024709421e-06, + "loss": 0.6273, + "step": 264 + }, + { + "epoch": 0.106, + "grad_norm": 0.3587093949317932, + "learning_rate": 6.894506023002046e-06, + "loss": 0.6208, + "step": 265 + }, + { + "epoch": 0.1064, + "grad_norm": 0.3674977123737335, + "learning_rate": 6.8933983242085795e-06, + "loss": 0.5363, + "step": 266 + }, + { + "epoch": 0.1068, + "grad_norm": 0.3786182403564453, + "learning_rate": 6.8922849301881e-06, + "loss": 0.5776, + "step": 267 + }, + { + "epoch": 0.1072, + "grad_norm": 0.3598049283027649, + "learning_rate": 6.891165842809245e-06, + "loss": 0.5218, + "step": 268 + }, + { + "epoch": 0.1076, + "grad_norm": 0.352017879486084, + "learning_rate": 6.890041063950209e-06, + "loss": 0.5722, + "step": 269 + }, + { + "epoch": 0.108, + "grad_norm": 0.3780212700366974, + "learning_rate": 6.888910595498735e-06, + "loss": 0.5742, + "step": 270 + }, + { + "epoch": 0.1084, + "grad_norm": 0.37677013874053955, + "learning_rate": 6.887774439352119e-06, + "loss": 0.6385, + "step": 271 + }, + { + "epoch": 0.1088, + "grad_norm": 0.34929612278938293, + "learning_rate": 6.886632597417199e-06, + "loss": 0.5202, + "step": 272 + }, + { + "epoch": 0.1092, + "grad_norm": 0.3872514069080353, + "learning_rate": 6.885485071610358e-06, + "loss": 0.6136, + "step": 273 + }, + { + "epoch": 0.1096, + "grad_norm": 0.3379985988140106, + "learning_rate": 6.88433186385752e-06, + "loss": 0.4804, + "step": 274 + }, + { + "epoch": 0.11, + "grad_norm": 0.38230156898498535, + "learning_rate": 6.883172976094139e-06, + "loss": 0.6076, + "step": 275 + }, + { + "epoch": 0.1104, + "grad_norm": 0.3500521183013916, + "learning_rate": 6.8820084102652096e-06, + "loss": 0.5821, + "step": 276 + }, + { + "epoch": 0.1108, + "grad_norm": 0.38483259081840515, + "learning_rate": 6.8808381683252494e-06, + "loss": 0.5886, + "step": 277 + }, + { + "epoch": 0.1112, + "grad_norm": 0.36327463388442993, + "learning_rate": 6.879662252238309e-06, + "loss": 0.5153, + "step": 278 + }, + { + "epoch": 0.1116, + "grad_norm": 0.361081600189209, + "learning_rate": 6.878480663977954e-06, + "loss": 0.5925, + "step": 279 + }, + { + "epoch": 0.112, + "grad_norm": 0.38348904252052307, + "learning_rate": 6.877293405527277e-06, + "loss": 0.5876, + "step": 280 + }, + { + "epoch": 0.1124, + "grad_norm": 0.3530191481113434, + "learning_rate": 6.8761004788788834e-06, + "loss": 0.5548, + "step": 281 + }, + { + "epoch": 0.1128, + "grad_norm": 0.40636175870895386, + "learning_rate": 6.874901886034892e-06, + "loss": 0.6094, + "step": 282 + }, + { + "epoch": 0.1132, + "grad_norm": 0.3557603359222412, + "learning_rate": 6.873697629006933e-06, + "loss": 0.5073, + "step": 283 + }, + { + "epoch": 0.1136, + "grad_norm": 0.3685925304889679, + "learning_rate": 6.87248770981614e-06, + "loss": 0.5382, + "step": 284 + }, + { + "epoch": 0.114, + "grad_norm": 0.3815518319606781, + "learning_rate": 6.871272130493154e-06, + "loss": 0.5528, + "step": 285 + }, + { + "epoch": 0.1144, + "grad_norm": 0.4013282060623169, + "learning_rate": 6.870050893078109e-06, + "loss": 0.6034, + "step": 286 + }, + { + "epoch": 0.1148, + "grad_norm": 0.36867278814315796, + "learning_rate": 6.868823999620642e-06, + "loss": 0.5778, + "step": 287 + }, + { + "epoch": 0.1152, + "grad_norm": 0.37960368394851685, + "learning_rate": 6.8675914521798785e-06, + "loss": 0.6205, + "step": 288 + }, + { + "epoch": 0.1156, + "grad_norm": 0.3762238025665283, + "learning_rate": 6.8663532528244335e-06, + "loss": 0.6443, + "step": 289 + }, + { + "epoch": 0.116, + "grad_norm": 0.3985435366630554, + "learning_rate": 6.86510940363241e-06, + "loss": 0.5664, + "step": 290 + }, + { + "epoch": 0.1164, + "grad_norm": 0.37209147214889526, + "learning_rate": 6.86385990669139e-06, + "loss": 0.5963, + "step": 291 + }, + { + "epoch": 0.1168, + "grad_norm": 0.37561458349227905, + "learning_rate": 6.86260476409844e-06, + "loss": 0.5929, + "step": 292 + }, + { + "epoch": 0.1172, + "grad_norm": 0.3788372576236725, + "learning_rate": 6.861343977960093e-06, + "loss": 0.5671, + "step": 293 + }, + { + "epoch": 0.1176, + "grad_norm": 0.383171021938324, + "learning_rate": 6.860077550392362e-06, + "loss": 0.5947, + "step": 294 + }, + { + "epoch": 0.118, + "grad_norm": 0.33166104555130005, + "learning_rate": 6.858805483520723e-06, + "loss": 0.5039, + "step": 295 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3684594929218292, + "learning_rate": 6.8575277794801195e-06, + "loss": 0.6029, + "step": 296 + }, + { + "epoch": 0.1188, + "grad_norm": 0.40030691027641296, + "learning_rate": 6.856244440414953e-06, + "loss": 0.6785, + "step": 297 + }, + { + "epoch": 0.1192, + "grad_norm": 0.34929394721984863, + "learning_rate": 6.8549554684790855e-06, + "loss": 0.5659, + "step": 298 + }, + { + "epoch": 0.1196, + "grad_norm": 0.3945501446723938, + "learning_rate": 6.853660865835831e-06, + "loss": 0.6544, + "step": 299 + }, + { + "epoch": 0.12, + "grad_norm": 0.39090695977211, + "learning_rate": 6.852360634657955e-06, + "loss": 0.5841, + "step": 300 + }, + { + "epoch": 0.1204, + "grad_norm": 0.3813556134700775, + "learning_rate": 6.8510547771276675e-06, + "loss": 0.6404, + "step": 301 + }, + { + "epoch": 0.1208, + "grad_norm": 0.34115681052207947, + "learning_rate": 6.849743295436623e-06, + "loss": 0.4899, + "step": 302 + }, + { + "epoch": 0.1212, + "grad_norm": 0.4142250716686249, + "learning_rate": 6.8484261917859155e-06, + "loss": 0.6104, + "step": 303 + }, + { + "epoch": 0.1216, + "grad_norm": 0.35604000091552734, + "learning_rate": 6.847103468386074e-06, + "loss": 0.4774, + "step": 304 + }, + { + "epoch": 0.122, + "grad_norm": 0.3621678352355957, + "learning_rate": 6.845775127457056e-06, + "loss": 0.5684, + "step": 305 + }, + { + "epoch": 0.1224, + "grad_norm": 0.3987256586551666, + "learning_rate": 6.844441171228253e-06, + "loss": 0.5834, + "step": 306 + }, + { + "epoch": 0.1228, + "grad_norm": 0.4088221490383148, + "learning_rate": 6.843101601938477e-06, + "loss": 0.672, + "step": 307 + }, + { + "epoch": 0.1232, + "grad_norm": 0.3717223107814789, + "learning_rate": 6.8417564218359585e-06, + "loss": 0.5031, + "step": 308 + }, + { + "epoch": 0.1236, + "grad_norm": 0.3734017610549927, + "learning_rate": 6.840405633178352e-06, + "loss": 0.6068, + "step": 309 + }, + { + "epoch": 0.124, + "grad_norm": 0.39380621910095215, + "learning_rate": 6.839049238232719e-06, + "loss": 0.5903, + "step": 310 + }, + { + "epoch": 0.1244, + "grad_norm": 0.38201144337654114, + "learning_rate": 6.837687239275532e-06, + "loss": 0.5724, + "step": 311 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3409825563430786, + "learning_rate": 6.836319638592667e-06, + "loss": 0.5576, + "step": 312 + }, + { + "epoch": 0.1252, + "grad_norm": 0.31887713074684143, + "learning_rate": 6.834946438479404e-06, + "loss": 0.4632, + "step": 313 + }, + { + "epoch": 0.1256, + "grad_norm": 0.40399324893951416, + "learning_rate": 6.83356764124042e-06, + "loss": 0.5815, + "step": 314 + }, + { + "epoch": 0.126, + "grad_norm": 0.4099929630756378, + "learning_rate": 6.832183249189787e-06, + "loss": 0.5968, + "step": 315 + }, + { + "epoch": 0.1264, + "grad_norm": 0.4446236789226532, + "learning_rate": 6.830793264650963e-06, + "loss": 0.6245, + "step": 316 + }, + { + "epoch": 0.1268, + "grad_norm": 0.3838817775249481, + "learning_rate": 6.8293976899567955e-06, + "loss": 0.5863, + "step": 317 + }, + { + "epoch": 0.1272, + "grad_norm": 0.4060188829898834, + "learning_rate": 6.827996527449513e-06, + "loss": 0.683, + "step": 318 + }, + { + "epoch": 0.1276, + "grad_norm": 0.36124998331069946, + "learning_rate": 6.826589779480722e-06, + "loss": 0.5444, + "step": 319 + }, + { + "epoch": 0.128, + "grad_norm": 0.35842403769493103, + "learning_rate": 6.825177448411405e-06, + "loss": 0.5025, + "step": 320 + }, + { + "epoch": 0.1284, + "grad_norm": 0.40654295682907104, + "learning_rate": 6.823759536611913e-06, + "loss": 0.5622, + "step": 321 + }, + { + "epoch": 0.1288, + "grad_norm": 0.42024552822113037, + "learning_rate": 6.822336046461962e-06, + "loss": 0.5747, + "step": 322 + }, + { + "epoch": 0.1292, + "grad_norm": 0.4173870086669922, + "learning_rate": 6.820906980350632e-06, + "loss": 0.6264, + "step": 323 + }, + { + "epoch": 0.1296, + "grad_norm": 0.37095096707344055, + "learning_rate": 6.819472340676363e-06, + "loss": 0.5924, + "step": 324 + }, + { + "epoch": 0.13, + "grad_norm": 0.3586147725582123, + "learning_rate": 6.818032129846946e-06, + "loss": 0.5423, + "step": 325 + }, + { + "epoch": 0.1304, + "grad_norm": 0.391284316778183, + "learning_rate": 6.816586350279525e-06, + "loss": 0.4857, + "step": 326 + }, + { + "epoch": 0.1308, + "grad_norm": 0.41726839542388916, + "learning_rate": 6.815135004400586e-06, + "loss": 0.6486, + "step": 327 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3971002697944641, + "learning_rate": 6.813678094645962e-06, + "loss": 0.5285, + "step": 328 + }, + { + "epoch": 0.1316, + "grad_norm": 0.35923510789871216, + "learning_rate": 6.812215623460821e-06, + "loss": 0.5287, + "step": 329 + }, + { + "epoch": 0.132, + "grad_norm": 0.35825857520103455, + "learning_rate": 6.810747593299666e-06, + "loss": 0.5429, + "step": 330 + }, + { + "epoch": 0.1324, + "grad_norm": 0.3599400222301483, + "learning_rate": 6.809274006626329e-06, + "loss": 0.51, + "step": 331 + }, + { + "epoch": 0.1328, + "grad_norm": 0.37383949756622314, + "learning_rate": 6.8077948659139695e-06, + "loss": 0.5065, + "step": 332 + }, + { + "epoch": 0.1332, + "grad_norm": 0.3496291935443878, + "learning_rate": 6.8063101736450655e-06, + "loss": 0.5497, + "step": 333 + }, + { + "epoch": 0.1336, + "grad_norm": 0.33252400159835815, + "learning_rate": 6.804819932311415e-06, + "loss": 0.5179, + "step": 334 + }, + { + "epoch": 0.134, + "grad_norm": 0.3825218975543976, + "learning_rate": 6.803324144414127e-06, + "loss": 0.5685, + "step": 335 + }, + { + "epoch": 0.1344, + "grad_norm": 0.40384697914123535, + "learning_rate": 6.801822812463622e-06, + "loss": 0.6212, + "step": 336 + }, + { + "epoch": 0.1348, + "grad_norm": 0.38337215781211853, + "learning_rate": 6.800315938979622e-06, + "loss": 0.5862, + "step": 337 + }, + { + "epoch": 0.1352, + "grad_norm": 0.37698617577552795, + "learning_rate": 6.798803526491154e-06, + "loss": 0.6163, + "step": 338 + }, + { + "epoch": 0.1356, + "grad_norm": 0.3776755928993225, + "learning_rate": 6.797285577536535e-06, + "loss": 0.5958, + "step": 339 + }, + { + "epoch": 0.136, + "grad_norm": 0.4053572118282318, + "learning_rate": 6.795762094663379e-06, + "loss": 0.5847, + "step": 340 + }, + { + "epoch": 0.1364, + "grad_norm": 0.386665403842926, + "learning_rate": 6.794233080428587e-06, + "loss": 0.5703, + "step": 341 + }, + { + "epoch": 0.1368, + "grad_norm": 0.39310121536254883, + "learning_rate": 6.792698537398341e-06, + "loss": 0.5127, + "step": 342 + }, + { + "epoch": 0.1372, + "grad_norm": 0.3588424623012543, + "learning_rate": 6.791158468148105e-06, + "loss": 0.5456, + "step": 343 + }, + { + "epoch": 0.1376, + "grad_norm": 0.45625194907188416, + "learning_rate": 6.789612875262616e-06, + "loss": 0.5879, + "step": 344 + }, + { + "epoch": 0.138, + "grad_norm": 0.40522149205207825, + "learning_rate": 6.788061761335882e-06, + "loss": 0.6526, + "step": 345 + }, + { + "epoch": 0.1384, + "grad_norm": 0.3919071853160858, + "learning_rate": 6.7865051289711785e-06, + "loss": 0.4821, + "step": 346 + }, + { + "epoch": 0.1388, + "grad_norm": 0.4460979998111725, + "learning_rate": 6.784942980781039e-06, + "loss": 0.6008, + "step": 347 + }, + { + "epoch": 0.1392, + "grad_norm": 0.38495203852653503, + "learning_rate": 6.783375319387258e-06, + "loss": 0.6104, + "step": 348 + }, + { + "epoch": 0.1396, + "grad_norm": 0.3975396752357483, + "learning_rate": 6.781802147420881e-06, + "loss": 0.5317, + "step": 349 + }, + { + "epoch": 0.14, + "grad_norm": 0.37524691224098206, + "learning_rate": 6.780223467522203e-06, + "loss": 0.5957, + "step": 350 + }, + { + "epoch": 0.1404, + "grad_norm": 0.3901492953300476, + "learning_rate": 6.778639282340763e-06, + "loss": 0.5707, + "step": 351 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3822447657585144, + "learning_rate": 6.777049594535339e-06, + "loss": 0.5925, + "step": 352 + }, + { + "epoch": 0.1412, + "grad_norm": 0.333995521068573, + "learning_rate": 6.775454406773944e-06, + "loss": 0.5544, + "step": 353 + }, + { + "epoch": 0.1416, + "grad_norm": 0.3573135733604431, + "learning_rate": 6.773853721733824e-06, + "loss": 0.5393, + "step": 354 + }, + { + "epoch": 0.142, + "grad_norm": 0.39041879773139954, + "learning_rate": 6.772247542101449e-06, + "loss": 0.5191, + "step": 355 + }, + { + "epoch": 0.1424, + "grad_norm": 0.3725902736186981, + "learning_rate": 6.770635870572511e-06, + "loss": 0.5571, + "step": 356 + }, + { + "epoch": 0.1428, + "grad_norm": 0.4029092490673065, + "learning_rate": 6.7690187098519185e-06, + "loss": 0.6127, + "step": 357 + }, + { + "epoch": 0.1432, + "grad_norm": 0.37953653931617737, + "learning_rate": 6.767396062653795e-06, + "loss": 0.5833, + "step": 358 + }, + { + "epoch": 0.1436, + "grad_norm": 0.4030202627182007, + "learning_rate": 6.76576793170147e-06, + "loss": 0.6068, + "step": 359 + }, + { + "epoch": 0.144, + "grad_norm": 0.4032868444919586, + "learning_rate": 6.7641343197274775e-06, + "loss": 0.6152, + "step": 360 + }, + { + "epoch": 0.1444, + "grad_norm": 0.4173031151294708, + "learning_rate": 6.762495229473551e-06, + "loss": 0.6258, + "step": 361 + }, + { + "epoch": 0.1448, + "grad_norm": 0.46217605471611023, + "learning_rate": 6.760850663690616e-06, + "loss": 0.5772, + "step": 362 + }, + { + "epoch": 0.1452, + "grad_norm": 0.44322094321250916, + "learning_rate": 6.75920062513879e-06, + "loss": 0.646, + "step": 363 + }, + { + "epoch": 0.1456, + "grad_norm": 0.39636361598968506, + "learning_rate": 6.757545116587375e-06, + "loss": 0.5529, + "step": 364 + }, + { + "epoch": 0.146, + "grad_norm": 0.40710270404815674, + "learning_rate": 6.755884140814852e-06, + "loss": 0.6251, + "step": 365 + }, + { + "epoch": 0.1464, + "grad_norm": 0.4058838188648224, + "learning_rate": 6.754217700608879e-06, + "loss": 0.6392, + "step": 366 + }, + { + "epoch": 0.1468, + "grad_norm": 0.3798361122608185, + "learning_rate": 6.752545798766288e-06, + "loss": 0.5321, + "step": 367 + }, + { + "epoch": 0.1472, + "grad_norm": 0.4159586727619171, + "learning_rate": 6.750868438093072e-06, + "loss": 0.632, + "step": 368 + }, + { + "epoch": 0.1476, + "grad_norm": 0.4010884463787079, + "learning_rate": 6.7491856214043896e-06, + "loss": 0.5941, + "step": 369 + }, + { + "epoch": 0.148, + "grad_norm": 0.3940584063529968, + "learning_rate": 6.747497351524553e-06, + "loss": 0.544, + "step": 370 + }, + { + "epoch": 0.1484, + "grad_norm": 0.3773007094860077, + "learning_rate": 6.74580363128703e-06, + "loss": 0.5979, + "step": 371 + }, + { + "epoch": 0.1488, + "grad_norm": 0.43958812952041626, + "learning_rate": 6.744104463534436e-06, + "loss": 0.4857, + "step": 372 + }, + { + "epoch": 0.1492, + "grad_norm": 0.42353639006614685, + "learning_rate": 6.742399851118524e-06, + "loss": 0.6292, + "step": 373 + }, + { + "epoch": 0.1496, + "grad_norm": 0.3908763825893402, + "learning_rate": 6.740689796900191e-06, + "loss": 0.5618, + "step": 374 + }, + { + "epoch": 0.15, + "grad_norm": 0.3558228313922882, + "learning_rate": 6.738974303749465e-06, + "loss": 0.4899, + "step": 375 + }, + { + "epoch": 0.1504, + "grad_norm": 0.42558762431144714, + "learning_rate": 6.7372533745455e-06, + "loss": 0.6123, + "step": 376 + }, + { + "epoch": 0.1508, + "grad_norm": 0.41860321164131165, + "learning_rate": 6.735527012176576e-06, + "loss": 0.61, + "step": 377 + }, + { + "epoch": 0.1512, + "grad_norm": 0.42713692784309387, + "learning_rate": 6.733795219540093e-06, + "loss": 0.532, + "step": 378 + }, + { + "epoch": 0.1516, + "grad_norm": 0.40444377064704895, + "learning_rate": 6.73205799954256e-06, + "loss": 0.6371, + "step": 379 + }, + { + "epoch": 0.152, + "grad_norm": 0.39443764090538025, + "learning_rate": 6.730315355099601e-06, + "loss": 0.5507, + "step": 380 + }, + { + "epoch": 0.1524, + "grad_norm": 0.4197935461997986, + "learning_rate": 6.728567289135937e-06, + "loss": 0.6081, + "step": 381 + }, + { + "epoch": 0.1528, + "grad_norm": 0.388790100812912, + "learning_rate": 6.726813804585392e-06, + "loss": 0.5826, + "step": 382 + }, + { + "epoch": 0.1532, + "grad_norm": 0.4298461675643921, + "learning_rate": 6.725054904390888e-06, + "loss": 0.5901, + "step": 383 + }, + { + "epoch": 0.1536, + "grad_norm": 0.40709060430526733, + "learning_rate": 6.7232905915044275e-06, + "loss": 0.6201, + "step": 384 + }, + { + "epoch": 0.154, + "grad_norm": 0.4251411557197571, + "learning_rate": 6.721520868887104e-06, + "loss": 0.6158, + "step": 385 + }, + { + "epoch": 0.1544, + "grad_norm": 0.3800036311149597, + "learning_rate": 6.719745739509088e-06, + "loss": 0.4875, + "step": 386 + }, + { + "epoch": 0.1548, + "grad_norm": 0.4196547567844391, + "learning_rate": 6.717965206349623e-06, + "loss": 0.5107, + "step": 387 + }, + { + "epoch": 0.1552, + "grad_norm": 0.4319261610507965, + "learning_rate": 6.716179272397026e-06, + "loss": 0.6308, + "step": 388 + }, + { + "epoch": 0.1556, + "grad_norm": 0.38977041840553284, + "learning_rate": 6.714387940648673e-06, + "loss": 0.514, + "step": 389 + }, + { + "epoch": 0.156, + "grad_norm": 0.43319880962371826, + "learning_rate": 6.712591214111003e-06, + "loss": 0.7118, + "step": 390 + }, + { + "epoch": 0.1564, + "grad_norm": 0.4345259964466095, + "learning_rate": 6.710789095799508e-06, + "loss": 0.6231, + "step": 391 + }, + { + "epoch": 0.1568, + "grad_norm": 0.41207200288772583, + "learning_rate": 6.708981588738728e-06, + "loss": 0.5641, + "step": 392 + }, + { + "epoch": 0.1572, + "grad_norm": 0.41626623272895813, + "learning_rate": 6.70716869596225e-06, + "loss": 0.6129, + "step": 393 + }, + { + "epoch": 0.1576, + "grad_norm": 0.3993598222732544, + "learning_rate": 6.7053504205126975e-06, + "loss": 0.6225, + "step": 394 + }, + { + "epoch": 0.158, + "grad_norm": 0.3917825520038605, + "learning_rate": 6.703526765441728e-06, + "loss": 0.589, + "step": 395 + }, + { + "epoch": 0.1584, + "grad_norm": 0.42511889338493347, + "learning_rate": 6.7016977338100285e-06, + "loss": 0.5113, + "step": 396 + }, + { + "epoch": 0.1588, + "grad_norm": 0.4189378321170807, + "learning_rate": 6.6998633286873104e-06, + "loss": 0.6223, + "step": 397 + }, + { + "epoch": 0.1592, + "grad_norm": 0.4240991771221161, + "learning_rate": 6.698023553152302e-06, + "loss": 0.628, + "step": 398 + }, + { + "epoch": 0.1596, + "grad_norm": 0.3946475386619568, + "learning_rate": 6.696178410292745e-06, + "loss": 0.4924, + "step": 399 + }, + { + "epoch": 0.16, + "grad_norm": 0.4627389907836914, + "learning_rate": 6.694327903205391e-06, + "loss": 0.5833, + "step": 400 + }, + { + "epoch": 0.1604, + "grad_norm": 0.4218280017375946, + "learning_rate": 6.692472034995991e-06, + "loss": 0.5969, + "step": 401 + }, + { + "epoch": 0.1608, + "grad_norm": 0.4313870072364807, + "learning_rate": 6.690610808779299e-06, + "loss": 0.6232, + "step": 402 + }, + { + "epoch": 0.1612, + "grad_norm": 0.4355848729610443, + "learning_rate": 6.688744227679056e-06, + "loss": 0.574, + "step": 403 + }, + { + "epoch": 0.1616, + "grad_norm": 0.4002700746059418, + "learning_rate": 6.686872294827994e-06, + "loss": 0.5956, + "step": 404 + }, + { + "epoch": 0.162, + "grad_norm": 0.41936513781547546, + "learning_rate": 6.6849950133678266e-06, + "loss": 0.5749, + "step": 405 + }, + { + "epoch": 0.1624, + "grad_norm": 0.4025267958641052, + "learning_rate": 6.683112386449241e-06, + "loss": 0.6316, + "step": 406 + }, + { + "epoch": 0.1628, + "grad_norm": 0.3825077414512634, + "learning_rate": 6.681224417231899e-06, + "loss": 0.5343, + "step": 407 + }, + { + "epoch": 0.1632, + "grad_norm": 0.40748634934425354, + "learning_rate": 6.679331108884428e-06, + "loss": 0.6376, + "step": 408 + }, + { + "epoch": 0.1636, + "grad_norm": 0.43103963136672974, + "learning_rate": 6.677432464584414e-06, + "loss": 0.6273, + "step": 409 + }, + { + "epoch": 0.164, + "grad_norm": 0.37271183729171753, + "learning_rate": 6.675528487518402e-06, + "loss": 0.5296, + "step": 410 + }, + { + "epoch": 0.1644, + "grad_norm": 0.4089418351650238, + "learning_rate": 6.673619180881885e-06, + "loss": 0.5102, + "step": 411 + }, + { + "epoch": 0.1648, + "grad_norm": 0.3724643588066101, + "learning_rate": 6.6717045478793e-06, + "loss": 0.5508, + "step": 412 + }, + { + "epoch": 0.1652, + "grad_norm": 0.4249953627586365, + "learning_rate": 6.669784591724026e-06, + "loss": 0.6005, + "step": 413 + }, + { + "epoch": 0.1656, + "grad_norm": 0.40450453758239746, + "learning_rate": 6.667859315638372e-06, + "loss": 0.5968, + "step": 414 + }, + { + "epoch": 0.166, + "grad_norm": 0.4060114026069641, + "learning_rate": 6.665928722853581e-06, + "loss": 0.6613, + "step": 415 + }, + { + "epoch": 0.1664, + "grad_norm": 0.38569897413253784, + "learning_rate": 6.663992816609815e-06, + "loss": 0.5891, + "step": 416 + }, + { + "epoch": 0.1668, + "grad_norm": 0.41489410400390625, + "learning_rate": 6.662051600156154e-06, + "loss": 0.5452, + "step": 417 + }, + { + "epoch": 0.1672, + "grad_norm": 0.4076942205429077, + "learning_rate": 6.660105076750592e-06, + "loss": 0.513, + "step": 418 + }, + { + "epoch": 0.1676, + "grad_norm": 0.40700528025627136, + "learning_rate": 6.658153249660029e-06, + "loss": 0.6024, + "step": 419 + }, + { + "epoch": 0.168, + "grad_norm": 0.3968103528022766, + "learning_rate": 6.656196122160265e-06, + "loss": 0.602, + "step": 420 + }, + { + "epoch": 0.1684, + "grad_norm": 0.4172203242778778, + "learning_rate": 6.654233697535998e-06, + "loss": 0.6217, + "step": 421 + }, + { + "epoch": 0.1688, + "grad_norm": 0.3906404972076416, + "learning_rate": 6.652265979080816e-06, + "loss": 0.5455, + "step": 422 + }, + { + "epoch": 0.1692, + "grad_norm": 0.4094797372817993, + "learning_rate": 6.650292970097189e-06, + "loss": 0.5759, + "step": 423 + }, + { + "epoch": 0.1696, + "grad_norm": 0.4227938652038574, + "learning_rate": 6.64831467389647e-06, + "loss": 0.6473, + "step": 424 + }, + { + "epoch": 0.17, + "grad_norm": 0.3829863965511322, + "learning_rate": 6.646331093798884e-06, + "loss": 0.5197, + "step": 425 + }, + { + "epoch": 0.1704, + "grad_norm": 0.6422174572944641, + "learning_rate": 6.644342233133524e-06, + "loss": 0.6327, + "step": 426 + }, + { + "epoch": 0.1708, + "grad_norm": 0.36776864528656006, + "learning_rate": 6.6423480952383445e-06, + "loss": 0.597, + "step": 427 + }, + { + "epoch": 0.1712, + "grad_norm": 0.4166823625564575, + "learning_rate": 6.6403486834601586e-06, + "loss": 0.5709, + "step": 428 + }, + { + "epoch": 0.1716, + "grad_norm": 0.4036274254322052, + "learning_rate": 6.638344001154631e-06, + "loss": 0.5106, + "step": 429 + }, + { + "epoch": 0.172, + "grad_norm": 0.4060764014720917, + "learning_rate": 6.636334051686271e-06, + "loss": 0.5733, + "step": 430 + }, + { + "epoch": 0.1724, + "grad_norm": 0.4096258282661438, + "learning_rate": 6.634318838428427e-06, + "loss": 0.5882, + "step": 431 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3832622468471527, + "learning_rate": 6.632298364763285e-06, + "loss": 0.5158, + "step": 432 + }, + { + "epoch": 0.1732, + "grad_norm": 0.3626929521560669, + "learning_rate": 6.630272634081854e-06, + "loss": 0.5444, + "step": 433 + }, + { + "epoch": 0.1736, + "grad_norm": 0.41777303814888, + "learning_rate": 6.628241649783973e-06, + "loss": 0.6262, + "step": 434 + }, + { + "epoch": 0.174, + "grad_norm": 0.3933648467063904, + "learning_rate": 6.626205415278292e-06, + "loss": 0.6132, + "step": 435 + }, + { + "epoch": 0.1744, + "grad_norm": 0.35853779315948486, + "learning_rate": 6.624163933982277e-06, + "loss": 0.535, + "step": 436 + }, + { + "epoch": 0.1748, + "grad_norm": 0.428905725479126, + "learning_rate": 6.6221172093221975e-06, + "loss": 0.6116, + "step": 437 + }, + { + "epoch": 0.1752, + "grad_norm": 0.3868679106235504, + "learning_rate": 6.620065244733125e-06, + "loss": 0.5847, + "step": 438 + }, + { + "epoch": 0.1756, + "grad_norm": 0.3978234529495239, + "learning_rate": 6.6180080436589235e-06, + "loss": 0.6209, + "step": 439 + }, + { + "epoch": 0.176, + "grad_norm": 0.37370485067367554, + "learning_rate": 6.6159456095522445e-06, + "loss": 0.5518, + "step": 440 + }, + { + "epoch": 0.1764, + "grad_norm": 0.35376763343811035, + "learning_rate": 6.613877945874525e-06, + "loss": 0.5174, + "step": 441 + }, + { + "epoch": 0.1768, + "grad_norm": 0.47480249404907227, + "learning_rate": 6.611805056095978e-06, + "loss": 0.6535, + "step": 442 + }, + { + "epoch": 0.1772, + "grad_norm": 0.3956994116306305, + "learning_rate": 6.6097269436955864e-06, + "loss": 0.5688, + "step": 443 + }, + { + "epoch": 0.1776, + "grad_norm": 0.40941232442855835, + "learning_rate": 6.6076436121611e-06, + "loss": 0.5516, + "step": 444 + }, + { + "epoch": 0.178, + "grad_norm": 0.3964068591594696, + "learning_rate": 6.605555064989027e-06, + "loss": 0.5267, + "step": 445 + }, + { + "epoch": 0.1784, + "grad_norm": 0.4019598364830017, + "learning_rate": 6.6034613056846306e-06, + "loss": 0.5498, + "step": 446 + }, + { + "epoch": 0.1788, + "grad_norm": 0.3716028928756714, + "learning_rate": 6.60136233776192e-06, + "loss": 0.5977, + "step": 447 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4159116744995117, + "learning_rate": 6.599258164743644e-06, + "loss": 0.589, + "step": 448 + }, + { + "epoch": 0.1796, + "grad_norm": 0.3982093334197998, + "learning_rate": 6.597148790161293e-06, + "loss": 0.5503, + "step": 449 + }, + { + "epoch": 0.18, + "grad_norm": 0.44451043009757996, + "learning_rate": 6.595034217555082e-06, + "loss": 0.5861, + "step": 450 + }, + { + "epoch": 0.1804, + "grad_norm": 0.4734339714050293, + "learning_rate": 6.5929144504739544e-06, + "loss": 0.6438, + "step": 451 + }, + { + "epoch": 0.1808, + "grad_norm": 0.45358017086982727, + "learning_rate": 6.590789492475569e-06, + "loss": 0.6934, + "step": 452 + }, + { + "epoch": 0.1812, + "grad_norm": 0.4209020733833313, + "learning_rate": 6.588659347126295e-06, + "loss": 0.5052, + "step": 453 + }, + { + "epoch": 0.1816, + "grad_norm": 0.4025874435901642, + "learning_rate": 6.586524018001212e-06, + "loss": 0.6219, + "step": 454 + }, + { + "epoch": 0.182, + "grad_norm": 0.44042277336120605, + "learning_rate": 6.584383508684096e-06, + "loss": 0.5807, + "step": 455 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3821624517440796, + "learning_rate": 6.582237822767418e-06, + "loss": 0.6256, + "step": 456 + }, + { + "epoch": 0.1828, + "grad_norm": 0.42015254497528076, + "learning_rate": 6.58008696385234e-06, + "loss": 0.6157, + "step": 457 + }, + { + "epoch": 0.1832, + "grad_norm": 0.43211668729782104, + "learning_rate": 6.5779309355487e-06, + "loss": 0.5517, + "step": 458 + }, + { + "epoch": 0.1836, + "grad_norm": 0.40651610493659973, + "learning_rate": 6.575769741475019e-06, + "loss": 0.5512, + "step": 459 + }, + { + "epoch": 0.184, + "grad_norm": 0.42855092883110046, + "learning_rate": 6.57360338525848e-06, + "loss": 0.5923, + "step": 460 + }, + { + "epoch": 0.1844, + "grad_norm": 0.43255260586738586, + "learning_rate": 6.571431870534937e-06, + "loss": 0.5141, + "step": 461 + }, + { + "epoch": 0.1848, + "grad_norm": 0.4311109781265259, + "learning_rate": 6.569255200948898e-06, + "loss": 0.6245, + "step": 462 + }, + { + "epoch": 0.1852, + "grad_norm": 0.3710770606994629, + "learning_rate": 6.567073380153522e-06, + "loss": 0.5078, + "step": 463 + }, + { + "epoch": 0.1856, + "grad_norm": 0.400833398103714, + "learning_rate": 6.564886411810617e-06, + "loss": 0.4928, + "step": 464 + }, + { + "epoch": 0.186, + "grad_norm": 0.4306429624557495, + "learning_rate": 6.562694299590625e-06, + "loss": 0.5688, + "step": 465 + }, + { + "epoch": 0.1864, + "grad_norm": 0.4112854301929474, + "learning_rate": 6.5604970471726246e-06, + "loss": 0.5235, + "step": 466 + }, + { + "epoch": 0.1868, + "grad_norm": 0.49645546078681946, + "learning_rate": 6.558294658244321e-06, + "loss": 0.6761, + "step": 467 + }, + { + "epoch": 0.1872, + "grad_norm": 0.38806602358818054, + "learning_rate": 6.55608713650204e-06, + "loss": 0.4431, + "step": 468 + }, + { + "epoch": 0.1876, + "grad_norm": 0.43465444445610046, + "learning_rate": 6.553874485650722e-06, + "loss": 0.5803, + "step": 469 + }, + { + "epoch": 0.188, + "grad_norm": 0.4388769268989563, + "learning_rate": 6.551656709403914e-06, + "loss": 0.5055, + "step": 470 + }, + { + "epoch": 0.1884, + "grad_norm": 0.47469204664230347, + "learning_rate": 6.549433811483768e-06, + "loss": 0.6419, + "step": 471 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4161481261253357, + "learning_rate": 6.54720579562103e-06, + "loss": 0.6525, + "step": 472 + }, + { + "epoch": 0.1892, + "grad_norm": 0.42730727791786194, + "learning_rate": 6.544972665555034e-06, + "loss": 0.5436, + "step": 473 + }, + { + "epoch": 0.1896, + "grad_norm": 0.41790273785591125, + "learning_rate": 6.5427344250337025e-06, + "loss": 0.6091, + "step": 474 + }, + { + "epoch": 0.19, + "grad_norm": 0.4080909192562103, + "learning_rate": 6.540491077813529e-06, + "loss": 0.5359, + "step": 475 + }, + { + "epoch": 0.1904, + "grad_norm": 0.3704690933227539, + "learning_rate": 6.53824262765958e-06, + "loss": 0.4738, + "step": 476 + }, + { + "epoch": 0.1908, + "grad_norm": 0.41908273100852966, + "learning_rate": 6.53598907834549e-06, + "loss": 0.6515, + "step": 477 + }, + { + "epoch": 0.1912, + "grad_norm": 0.40230220556259155, + "learning_rate": 6.533730433653446e-06, + "loss": 0.6099, + "step": 478 + }, + { + "epoch": 0.1916, + "grad_norm": 0.4547681212425232, + "learning_rate": 6.5314666973741885e-06, + "loss": 0.6308, + "step": 479 + }, + { + "epoch": 0.192, + "grad_norm": 0.4127230644226074, + "learning_rate": 6.5291978733070066e-06, + "loss": 0.605, + "step": 480 + }, + { + "epoch": 0.1924, + "grad_norm": 0.4303308427333832, + "learning_rate": 6.526923965259722e-06, + "loss": 0.5355, + "step": 481 + }, + { + "epoch": 0.1928, + "grad_norm": 0.42116066813468933, + "learning_rate": 6.524644977048695e-06, + "loss": 0.487, + "step": 482 + }, + { + "epoch": 0.1932, + "grad_norm": 0.43687164783477783, + "learning_rate": 6.5223609124988085e-06, + "loss": 0.5955, + "step": 483 + }, + { + "epoch": 0.1936, + "grad_norm": 0.39758434891700745, + "learning_rate": 6.520071775443468e-06, + "loss": 0.5845, + "step": 484 + }, + { + "epoch": 0.194, + "grad_norm": 0.4243478775024414, + "learning_rate": 6.517777569724588e-06, + "loss": 0.5288, + "step": 485 + }, + { + "epoch": 0.1944, + "grad_norm": 0.43449336290359497, + "learning_rate": 6.5154782991925965e-06, + "loss": 0.6107, + "step": 486 + }, + { + "epoch": 0.1948, + "grad_norm": 0.47633621096611023, + "learning_rate": 6.5131739677064135e-06, + "loss": 0.5935, + "step": 487 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4590698182582855, + "learning_rate": 6.51086457913346e-06, + "loss": 0.5879, + "step": 488 + }, + { + "epoch": 0.1956, + "grad_norm": 0.43686172366142273, + "learning_rate": 6.508550137349641e-06, + "loss": 0.6266, + "step": 489 + }, + { + "epoch": 0.196, + "grad_norm": 0.4315257966518402, + "learning_rate": 6.506230646239344e-06, + "loss": 0.5888, + "step": 490 + }, + { + "epoch": 0.1964, + "grad_norm": 0.3936934769153595, + "learning_rate": 6.50390610969543e-06, + "loss": 0.5724, + "step": 491 + }, + { + "epoch": 0.1968, + "grad_norm": 0.3999297022819519, + "learning_rate": 6.501576531619225e-06, + "loss": 0.529, + "step": 492 + }, + { + "epoch": 0.1972, + "grad_norm": 0.4457099139690399, + "learning_rate": 6.499241915920524e-06, + "loss": 0.6437, + "step": 493 + }, + { + "epoch": 0.1976, + "grad_norm": 0.38449960947036743, + "learning_rate": 6.4969022665175684e-06, + "loss": 0.6073, + "step": 494 + }, + { + "epoch": 0.198, + "grad_norm": 0.47077676653862, + "learning_rate": 6.494557587337054e-06, + "loss": 0.6728, + "step": 495 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4107373356819153, + "learning_rate": 6.492207882314114e-06, + "loss": 0.6166, + "step": 496 + }, + { + "epoch": 0.1988, + "grad_norm": 0.4460639953613281, + "learning_rate": 6.48985315539232e-06, + "loss": 0.5883, + "step": 497 + }, + { + "epoch": 0.1992, + "grad_norm": 0.42625370621681213, + "learning_rate": 6.4874934105236685e-06, + "loss": 0.6357, + "step": 498 + }, + { + "epoch": 0.1996, + "grad_norm": 0.3935810327529907, + "learning_rate": 6.4851286516685816e-06, + "loss": 0.531, + "step": 499 + }, + { + "epoch": 0.2, + "grad_norm": 0.39704349637031555, + "learning_rate": 6.482758882795892e-06, + "loss": 0.5663, + "step": 500 + }, + { + "epoch": 0.2004, + "grad_norm": 0.3972982168197632, + "learning_rate": 6.480384107882846e-06, + "loss": 0.5619, + "step": 501 + }, + { + "epoch": 0.2008, + "grad_norm": 0.4732000529766083, + "learning_rate": 6.478004330915086e-06, + "loss": 0.6, + "step": 502 + }, + { + "epoch": 0.2012, + "grad_norm": 0.450668066740036, + "learning_rate": 6.475619555886654e-06, + "loss": 0.6062, + "step": 503 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4316413998603821, + "learning_rate": 6.473229786799979e-06, + "loss": 0.5445, + "step": 504 + }, + { + "epoch": 0.202, + "grad_norm": 0.415632963180542, + "learning_rate": 6.4708350276658704e-06, + "loss": 0.6405, + "step": 505 + }, + { + "epoch": 0.2024, + "grad_norm": 0.42289984226226807, + "learning_rate": 6.468435282503516e-06, + "loss": 0.557, + "step": 506 + }, + { + "epoch": 0.2028, + "grad_norm": 0.3919405937194824, + "learning_rate": 6.466030555340465e-06, + "loss": 0.4969, + "step": 507 + }, + { + "epoch": 0.2032, + "grad_norm": 0.45820194482803345, + "learning_rate": 6.463620850212638e-06, + "loss": 0.6139, + "step": 508 + }, + { + "epoch": 0.2036, + "grad_norm": 0.4082326591014862, + "learning_rate": 6.461206171164299e-06, + "loss": 0.6121, + "step": 509 + }, + { + "epoch": 0.204, + "grad_norm": 0.42317843437194824, + "learning_rate": 6.458786522248069e-06, + "loss": 0.5903, + "step": 510 + }, + { + "epoch": 0.2044, + "grad_norm": 0.4115223288536072, + "learning_rate": 6.456361907524906e-06, + "loss": 0.5829, + "step": 511 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4232071340084076, + "learning_rate": 6.453932331064102e-06, + "loss": 0.5426, + "step": 512 + }, + { + "epoch": 0.2052, + "grad_norm": 0.3934287428855896, + "learning_rate": 6.451497796943277e-06, + "loss": 0.5725, + "step": 513 + }, + { + "epoch": 0.2056, + "grad_norm": 0.4107876121997833, + "learning_rate": 6.449058309248373e-06, + "loss": 0.5557, + "step": 514 + }, + { + "epoch": 0.206, + "grad_norm": 0.4142535626888275, + "learning_rate": 6.446613872073645e-06, + "loss": 0.5678, + "step": 515 + }, + { + "epoch": 0.2064, + "grad_norm": 0.392920583486557, + "learning_rate": 6.4441644895216515e-06, + "loss": 0.5124, + "step": 516 + }, + { + "epoch": 0.2068, + "grad_norm": 0.4002549946308136, + "learning_rate": 6.441710165703256e-06, + "loss": 0.4938, + "step": 517 + }, + { + "epoch": 0.2072, + "grad_norm": 0.4306708872318268, + "learning_rate": 6.439250904737611e-06, + "loss": 0.5763, + "step": 518 + }, + { + "epoch": 0.2076, + "grad_norm": 0.46144577860832214, + "learning_rate": 6.436786710752159e-06, + "loss": 0.5427, + "step": 519 + }, + { + "epoch": 0.208, + "grad_norm": 0.4587547481060028, + "learning_rate": 6.434317587882619e-06, + "loss": 0.6381, + "step": 520 + }, + { + "epoch": 0.2084, + "grad_norm": 0.42765599489212036, + "learning_rate": 6.431843540272982e-06, + "loss": 0.6067, + "step": 521 + }, + { + "epoch": 0.2088, + "grad_norm": 0.40350085496902466, + "learning_rate": 6.429364572075506e-06, + "loss": 0.5612, + "step": 522 + }, + { + "epoch": 0.2092, + "grad_norm": 0.4018450379371643, + "learning_rate": 6.426880687450706e-06, + "loss": 0.5882, + "step": 523 + }, + { + "epoch": 0.2096, + "grad_norm": 0.4084893465042114, + "learning_rate": 6.42439189056735e-06, + "loss": 0.5958, + "step": 524 + }, + { + "epoch": 0.21, + "grad_norm": 0.4008101224899292, + "learning_rate": 6.421898185602449e-06, + "loss": 0.5665, + "step": 525 + }, + { + "epoch": 0.2104, + "grad_norm": 0.48329246044158936, + "learning_rate": 6.419399576741251e-06, + "loss": 0.6898, + "step": 526 + }, + { + "epoch": 0.2108, + "grad_norm": 0.44100847840309143, + "learning_rate": 6.416896068177236e-06, + "loss": 0.6257, + "step": 527 + }, + { + "epoch": 0.2112, + "grad_norm": 0.388071209192276, + "learning_rate": 6.414387664112106e-06, + "loss": 0.5492, + "step": 528 + }, + { + "epoch": 0.2116, + "grad_norm": 0.39122116565704346, + "learning_rate": 6.4118743687557784e-06, + "loss": 0.5135, + "step": 529 + }, + { + "epoch": 0.212, + "grad_norm": 0.3872377276420593, + "learning_rate": 6.409356186326382e-06, + "loss": 0.5829, + "step": 530 + }, + { + "epoch": 0.2124, + "grad_norm": 0.43584463000297546, + "learning_rate": 6.406833121050248e-06, + "loss": 0.5483, + "step": 531 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4248460829257965, + "learning_rate": 6.404305177161898e-06, + "loss": 0.5424, + "step": 532 + }, + { + "epoch": 0.2132, + "grad_norm": 0.4430016875267029, + "learning_rate": 6.401772358904047e-06, + "loss": 0.6306, + "step": 533 + }, + { + "epoch": 0.2136, + "grad_norm": 0.4296245276927948, + "learning_rate": 6.399234670527588e-06, + "loss": 0.5466, + "step": 534 + }, + { + "epoch": 0.214, + "grad_norm": 0.48698973655700684, + "learning_rate": 6.396692116291589e-06, + "loss": 0.593, + "step": 535 + }, + { + "epoch": 0.2144, + "grad_norm": 0.47233495116233826, + "learning_rate": 6.394144700463282e-06, + "loss": 0.6653, + "step": 536 + }, + { + "epoch": 0.2148, + "grad_norm": 0.45071518421173096, + "learning_rate": 6.391592427318061e-06, + "loss": 0.5932, + "step": 537 + }, + { + "epoch": 0.2152, + "grad_norm": 0.4426057040691376, + "learning_rate": 6.389035301139472e-06, + "loss": 0.5882, + "step": 538 + }, + { + "epoch": 0.2156, + "grad_norm": 0.4603918492794037, + "learning_rate": 6.386473326219203e-06, + "loss": 0.6082, + "step": 539 + }, + { + "epoch": 0.216, + "grad_norm": 0.4099114239215851, + "learning_rate": 6.3839065068570835e-06, + "loss": 0.588, + "step": 540 + }, + { + "epoch": 0.2164, + "grad_norm": 0.46051111817359924, + "learning_rate": 6.381334847361072e-06, + "loss": 0.6374, + "step": 541 + }, + { + "epoch": 0.2168, + "grad_norm": 0.4387073516845703, + "learning_rate": 6.378758352047249e-06, + "loss": 0.5764, + "step": 542 + }, + { + "epoch": 0.2172, + "grad_norm": 0.43461674451828003, + "learning_rate": 6.376177025239813e-06, + "loss": 0.5032, + "step": 543 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4845770597457886, + "learning_rate": 6.373590871271072e-06, + "loss": 0.621, + "step": 544 + }, + { + "epoch": 0.218, + "grad_norm": 0.39095914363861084, + "learning_rate": 6.370999894481431e-06, + "loss": 0.4885, + "step": 545 + }, + { + "epoch": 0.2184, + "grad_norm": 0.39494702219963074, + "learning_rate": 6.3684040992193955e-06, + "loss": 0.5262, + "step": 546 + }, + { + "epoch": 0.2188, + "grad_norm": 0.4588661789894104, + "learning_rate": 6.365803489841554e-06, + "loss": 0.6244, + "step": 547 + }, + { + "epoch": 0.2192, + "grad_norm": 0.43723824620246887, + "learning_rate": 6.363198070712575e-06, + "loss": 0.633, + "step": 548 + }, + { + "epoch": 0.2196, + "grad_norm": 0.3847430944442749, + "learning_rate": 6.360587846205201e-06, + "loss": 0.4945, + "step": 549 + }, + { + "epoch": 0.22, + "grad_norm": 0.41110801696777344, + "learning_rate": 6.357972820700238e-06, + "loss": 0.5271, + "step": 550 + }, + { + "epoch": 0.2204, + "grad_norm": 0.35954952239990234, + "learning_rate": 6.35535299858655e-06, + "loss": 0.525, + "step": 551 + }, + { + "epoch": 0.2208, + "grad_norm": 0.43042629957199097, + "learning_rate": 6.35272838426105e-06, + "loss": 0.5427, + "step": 552 + }, + { + "epoch": 0.2212, + "grad_norm": 0.3824063837528229, + "learning_rate": 6.350098982128698e-06, + "loss": 0.5121, + "step": 553 + }, + { + "epoch": 0.2216, + "grad_norm": 0.4746267795562744, + "learning_rate": 6.347464796602483e-06, + "loss": 0.6304, + "step": 554 + }, + { + "epoch": 0.222, + "grad_norm": 0.4227291941642761, + "learning_rate": 6.34482583210343e-06, + "loss": 0.5746, + "step": 555 + }, + { + "epoch": 0.2224, + "grad_norm": 0.4328877031803131, + "learning_rate": 6.3421820930605774e-06, + "loss": 0.5557, + "step": 556 + }, + { + "epoch": 0.2228, + "grad_norm": 0.37801414728164673, + "learning_rate": 6.33953358391098e-06, + "loss": 0.6082, + "step": 557 + }, + { + "epoch": 0.2232, + "grad_norm": 0.4196787178516388, + "learning_rate": 6.336880309099701e-06, + "loss": 0.5608, + "step": 558 + }, + { + "epoch": 0.2236, + "grad_norm": 0.4383528232574463, + "learning_rate": 6.334222273079796e-06, + "loss": 0.583, + "step": 559 + }, + { + "epoch": 0.224, + "grad_norm": 0.44536638259887695, + "learning_rate": 6.331559480312316e-06, + "loss": 0.6434, + "step": 560 + }, + { + "epoch": 0.2244, + "grad_norm": 0.40932145714759827, + "learning_rate": 6.328891935266295e-06, + "loss": 0.5477, + "step": 561 + }, + { + "epoch": 0.2248, + "grad_norm": 0.4584387242794037, + "learning_rate": 6.3262196424187405e-06, + "loss": 0.5553, + "step": 562 + }, + { + "epoch": 0.2252, + "grad_norm": 0.45622336864471436, + "learning_rate": 6.32354260625463e-06, + "loss": 0.6559, + "step": 563 + }, + { + "epoch": 0.2256, + "grad_norm": 0.4516793191432953, + "learning_rate": 6.3208608312669024e-06, + "loss": 0.5715, + "step": 564 + }, + { + "epoch": 0.226, + "grad_norm": 0.4983437955379486, + "learning_rate": 6.318174321956449e-06, + "loss": 0.5955, + "step": 565 + }, + { + "epoch": 0.2264, + "grad_norm": 0.42384248971939087, + "learning_rate": 6.315483082832107e-06, + "loss": 0.5761, + "step": 566 + }, + { + "epoch": 0.2268, + "grad_norm": 0.4657883942127228, + "learning_rate": 6.312787118410649e-06, + "loss": 0.5292, + "step": 567 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3928786814212799, + "learning_rate": 6.3100864332167856e-06, + "loss": 0.5788, + "step": 568 + }, + { + "epoch": 0.2276, + "grad_norm": 0.44233494997024536, + "learning_rate": 6.307381031783142e-06, + "loss": 0.5317, + "step": 569 + }, + { + "epoch": 0.228, + "grad_norm": 0.43338441848754883, + "learning_rate": 6.304670918650265e-06, + "loss": 0.5289, + "step": 570 + }, + { + "epoch": 0.2284, + "grad_norm": 0.421744704246521, + "learning_rate": 6.301956098366605e-06, + "loss": 0.5815, + "step": 571 + }, + { + "epoch": 0.2288, + "grad_norm": 0.43918102979660034, + "learning_rate": 6.299236575488515e-06, + "loss": 0.6108, + "step": 572 + }, + { + "epoch": 0.2292, + "grad_norm": 0.4225776791572571, + "learning_rate": 6.296512354580237e-06, + "loss": 0.5535, + "step": 573 + }, + { + "epoch": 0.2296, + "grad_norm": 0.51739102602005, + "learning_rate": 6.293783440213905e-06, + "loss": 0.5432, + "step": 574 + }, + { + "epoch": 0.23, + "grad_norm": 0.44613152742385864, + "learning_rate": 6.291049836969523e-06, + "loss": 0.5376, + "step": 575 + }, + { + "epoch": 0.2304, + "grad_norm": 0.45409390330314636, + "learning_rate": 6.2883115494349665e-06, + "loss": 0.6289, + "step": 576 + }, + { + "epoch": 0.2308, + "grad_norm": 0.4217747151851654, + "learning_rate": 6.285568582205975e-06, + "loss": 0.5261, + "step": 577 + }, + { + "epoch": 0.2312, + "grad_norm": 0.41630804538726807, + "learning_rate": 6.28282093988614e-06, + "loss": 0.5454, + "step": 578 + }, + { + "epoch": 0.2316, + "grad_norm": 0.38962697982788086, + "learning_rate": 6.280068627086899e-06, + "loss": 0.5176, + "step": 579 + }, + { + "epoch": 0.232, + "grad_norm": 0.4049234092235565, + "learning_rate": 6.277311648427529e-06, + "loss": 0.5302, + "step": 580 + }, + { + "epoch": 0.2324, + "grad_norm": 0.4065638482570648, + "learning_rate": 6.2745500085351384e-06, + "loss": 0.5437, + "step": 581 + }, + { + "epoch": 0.2328, + "grad_norm": 0.4093839228153229, + "learning_rate": 6.2717837120446565e-06, + "loss": 0.5592, + "step": 582 + }, + { + "epoch": 0.2332, + "grad_norm": 0.44915738701820374, + "learning_rate": 6.269012763598831e-06, + "loss": 0.6082, + "step": 583 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3967624306678772, + "learning_rate": 6.266237167848215e-06, + "loss": 0.5106, + "step": 584 + }, + { + "epoch": 0.234, + "grad_norm": 0.4397246539592743, + "learning_rate": 6.26345692945116e-06, + "loss": 0.5652, + "step": 585 + }, + { + "epoch": 0.2344, + "grad_norm": 0.43744099140167236, + "learning_rate": 6.260672053073813e-06, + "loss": 0.5719, + "step": 586 + }, + { + "epoch": 0.2348, + "grad_norm": 0.4588030278682709, + "learning_rate": 6.2578825433901024e-06, + "loss": 0.5124, + "step": 587 + }, + { + "epoch": 0.2352, + "grad_norm": 0.3984925150871277, + "learning_rate": 6.255088405081733e-06, + "loss": 0.5531, + "step": 588 + }, + { + "epoch": 0.2356, + "grad_norm": 0.42685481905937195, + "learning_rate": 6.252289642838181e-06, + "loss": 0.5728, + "step": 589 + }, + { + "epoch": 0.236, + "grad_norm": 0.4059397280216217, + "learning_rate": 6.249486261356676e-06, + "loss": 0.513, + "step": 590 + }, + { + "epoch": 0.2364, + "grad_norm": 0.45304572582244873, + "learning_rate": 6.246678265342208e-06, + "loss": 0.5868, + "step": 591 + }, + { + "epoch": 0.2368, + "grad_norm": 0.47392749786376953, + "learning_rate": 6.243865659507508e-06, + "loss": 0.636, + "step": 592 + }, + { + "epoch": 0.2372, + "grad_norm": 0.42907243967056274, + "learning_rate": 6.241048448573044e-06, + "loss": 0.6657, + "step": 593 + }, + { + "epoch": 0.2376, + "grad_norm": 0.4455984830856323, + "learning_rate": 6.238226637267012e-06, + "loss": 0.5975, + "step": 594 + }, + { + "epoch": 0.238, + "grad_norm": 0.4236272871494293, + "learning_rate": 6.235400230325331e-06, + "loss": 0.591, + "step": 595 + }, + { + "epoch": 0.2384, + "grad_norm": 0.44643765687942505, + "learning_rate": 6.23256923249163e-06, + "loss": 0.5875, + "step": 596 + }, + { + "epoch": 0.2388, + "grad_norm": 0.3928106725215912, + "learning_rate": 6.229733648517248e-06, + "loss": 0.6052, + "step": 597 + }, + { + "epoch": 0.2392, + "grad_norm": 0.4079158902168274, + "learning_rate": 6.2268934831612164e-06, + "loss": 0.5558, + "step": 598 + }, + { + "epoch": 0.2396, + "grad_norm": 0.422274649143219, + "learning_rate": 6.224048741190257e-06, + "loss": 0.5032, + "step": 599 + }, + { + "epoch": 0.24, + "grad_norm": 0.4376719892024994, + "learning_rate": 6.221199427378773e-06, + "loss": 0.5662, + "step": 600 + }, + { + "epoch": 0.2404, + "grad_norm": 0.47152188420295715, + "learning_rate": 6.218345546508841e-06, + "loss": 0.6223, + "step": 601 + }, + { + "epoch": 0.2408, + "grad_norm": 0.4223569631576538, + "learning_rate": 6.2154871033702e-06, + "loss": 0.5705, + "step": 602 + }, + { + "epoch": 0.2412, + "grad_norm": 0.4249934256076813, + "learning_rate": 6.21262410276025e-06, + "loss": 0.6251, + "step": 603 + }, + { + "epoch": 0.2416, + "grad_norm": 0.4933459460735321, + "learning_rate": 6.209756549484039e-06, + "loss": 0.5981, + "step": 604 + }, + { + "epoch": 0.242, + "grad_norm": 0.4324929416179657, + "learning_rate": 6.206884448354253e-06, + "loss": 0.6305, + "step": 605 + }, + { + "epoch": 0.2424, + "grad_norm": 0.40548714995384216, + "learning_rate": 6.204007804191214e-06, + "loss": 0.5925, + "step": 606 + }, + { + "epoch": 0.2428, + "grad_norm": 0.4025042951107025, + "learning_rate": 6.201126621822866e-06, + "loss": 0.6275, + "step": 607 + }, + { + "epoch": 0.2432, + "grad_norm": 0.4443594217300415, + "learning_rate": 6.19824090608477e-06, + "loss": 0.5989, + "step": 608 + }, + { + "epoch": 0.2436, + "grad_norm": 0.4185742437839508, + "learning_rate": 6.1953506618201e-06, + "loss": 0.5703, + "step": 609 + }, + { + "epoch": 0.244, + "grad_norm": 0.4236041009426117, + "learning_rate": 6.192455893879624e-06, + "loss": 0.509, + "step": 610 + }, + { + "epoch": 0.2444, + "grad_norm": 0.4389362037181854, + "learning_rate": 6.189556607121704e-06, + "loss": 0.5048, + "step": 611 + }, + { + "epoch": 0.2448, + "grad_norm": 0.4090619683265686, + "learning_rate": 6.18665280641229e-06, + "loss": 0.5497, + "step": 612 + }, + { + "epoch": 0.2452, + "grad_norm": 0.4614490568637848, + "learning_rate": 6.183744496624901e-06, + "loss": 0.6688, + "step": 613 + }, + { + "epoch": 0.2456, + "grad_norm": 0.4077858030796051, + "learning_rate": 6.180831682640632e-06, + "loss": 0.4744, + "step": 614 + }, + { + "epoch": 0.246, + "grad_norm": 0.42312952876091003, + "learning_rate": 6.177914369348129e-06, + "loss": 0.5286, + "step": 615 + }, + { + "epoch": 0.2464, + "grad_norm": 0.4584427773952484, + "learning_rate": 6.174992561643597e-06, + "loss": 0.576, + "step": 616 + }, + { + "epoch": 0.2468, + "grad_norm": 0.4387916624546051, + "learning_rate": 6.172066264430778e-06, + "loss": 0.6186, + "step": 617 + }, + { + "epoch": 0.2472, + "grad_norm": 0.44402211904525757, + "learning_rate": 6.169135482620951e-06, + "loss": 0.4917, + "step": 618 + }, + { + "epoch": 0.2476, + "grad_norm": 0.4333488941192627, + "learning_rate": 6.166200221132923e-06, + "loss": 0.5895, + "step": 619 + }, + { + "epoch": 0.248, + "grad_norm": 0.4146369695663452, + "learning_rate": 6.1632604848930185e-06, + "loss": 0.5022, + "step": 620 + }, + { + "epoch": 0.2484, + "grad_norm": 0.3817729353904724, + "learning_rate": 6.160316278835071e-06, + "loss": 0.4959, + "step": 621 + }, + { + "epoch": 0.2488, + "grad_norm": 0.3966714143753052, + "learning_rate": 6.1573676079004185e-06, + "loss": 0.5352, + "step": 622 + }, + { + "epoch": 0.2492, + "grad_norm": 0.45970264077186584, + "learning_rate": 6.154414477037888e-06, + "loss": 0.5645, + "step": 623 + }, + { + "epoch": 0.2496, + "grad_norm": 0.48507970571517944, + "learning_rate": 6.151456891203796e-06, + "loss": 0.6115, + "step": 624 + }, + { + "epoch": 0.25, + "grad_norm": 0.41141727566719055, + "learning_rate": 6.148494855361933e-06, + "loss": 0.5251, + "step": 625 + }, + { + "epoch": 0.2504, + "grad_norm": 0.41636160016059875, + "learning_rate": 6.1455283744835615e-06, + "loss": 0.6228, + "step": 626 + }, + { + "epoch": 0.2508, + "grad_norm": 0.417447030544281, + "learning_rate": 6.1425574535474e-06, + "loss": 0.5013, + "step": 627 + }, + { + "epoch": 0.2512, + "grad_norm": 0.45456668734550476, + "learning_rate": 6.139582097539622e-06, + "loss": 0.5358, + "step": 628 + }, + { + "epoch": 0.2516, + "grad_norm": 0.44917646050453186, + "learning_rate": 6.136602311453844e-06, + "loss": 0.5993, + "step": 629 + }, + { + "epoch": 0.252, + "grad_norm": 0.4428352415561676, + "learning_rate": 6.133618100291116e-06, + "loss": 0.6076, + "step": 630 + }, + { + "epoch": 0.2524, + "grad_norm": 0.4192942678928375, + "learning_rate": 6.130629469059915e-06, + "loss": 0.5457, + "step": 631 + }, + { + "epoch": 0.2528, + "grad_norm": 0.43102753162384033, + "learning_rate": 6.1276364227761394e-06, + "loss": 0.59, + "step": 632 + }, + { + "epoch": 0.2532, + "grad_norm": 0.44437599182128906, + "learning_rate": 6.124638966463093e-06, + "loss": 0.5485, + "step": 633 + }, + { + "epoch": 0.2536, + "grad_norm": 0.44910112023353577, + "learning_rate": 6.1216371051514844e-06, + "loss": 0.5859, + "step": 634 + }, + { + "epoch": 0.254, + "grad_norm": 0.4694381058216095, + "learning_rate": 6.118630843879414e-06, + "loss": 0.5947, + "step": 635 + }, + { + "epoch": 0.2544, + "grad_norm": 0.45699435472488403, + "learning_rate": 6.1156201876923664e-06, + "loss": 0.568, + "step": 636 + }, + { + "epoch": 0.2548, + "grad_norm": 0.48551276326179504, + "learning_rate": 6.1126051416432026e-06, + "loss": 0.6698, + "step": 637 + }, + { + "epoch": 0.2552, + "grad_norm": 0.4661720395088196, + "learning_rate": 6.109585710792152e-06, + "loss": 0.6635, + "step": 638 + }, + { + "epoch": 0.2556, + "grad_norm": 0.42143744230270386, + "learning_rate": 6.106561900206802e-06, + "loss": 0.6606, + "step": 639 + }, + { + "epoch": 0.256, + "grad_norm": 0.38233682513237, + "learning_rate": 6.103533714962091e-06, + "loss": 0.5294, + "step": 640 + }, + { + "epoch": 0.2564, + "grad_norm": 0.40119943022727966, + "learning_rate": 6.1005011601402986e-06, + "loss": 0.5762, + "step": 641 + }, + { + "epoch": 0.2568, + "grad_norm": 0.45029154419898987, + "learning_rate": 6.097464240831041e-06, + "loss": 0.579, + "step": 642 + }, + { + "epoch": 0.2572, + "grad_norm": 0.4179871678352356, + "learning_rate": 6.094422962131257e-06, + "loss": 0.4944, + "step": 643 + }, + { + "epoch": 0.2576, + "grad_norm": 0.4345018267631531, + "learning_rate": 6.091377329145202e-06, + "loss": 0.6728, + "step": 644 + }, + { + "epoch": 0.258, + "grad_norm": 0.4406702518463135, + "learning_rate": 6.0883273469844375e-06, + "loss": 0.6143, + "step": 645 + }, + { + "epoch": 0.2584, + "grad_norm": 0.3850463330745697, + "learning_rate": 6.085273020767829e-06, + "loss": 0.5452, + "step": 646 + }, + { + "epoch": 0.2588, + "grad_norm": 0.4438881278038025, + "learning_rate": 6.08221435562153e-06, + "loss": 0.5663, + "step": 647 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4332631230354309, + "learning_rate": 6.079151356678974e-06, + "loss": 0.595, + "step": 648 + }, + { + "epoch": 0.2596, + "grad_norm": 0.4043017327785492, + "learning_rate": 6.076084029080874e-06, + "loss": 0.5761, + "step": 649 + }, + { + "epoch": 0.26, + "grad_norm": 0.4187481701374054, + "learning_rate": 6.073012377975201e-06, + "loss": 0.6399, + "step": 650 + }, + { + "epoch": 0.2604, + "grad_norm": 0.44606634974479675, + "learning_rate": 6.069936408517187e-06, + "loss": 0.5939, + "step": 651 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4353955388069153, + "learning_rate": 6.066856125869309e-06, + "loss": 0.5386, + "step": 652 + }, + { + "epoch": 0.2612, + "grad_norm": 0.44587963819503784, + "learning_rate": 6.063771535201285e-06, + "loss": 0.5598, + "step": 653 + }, + { + "epoch": 0.2616, + "grad_norm": 0.3983965814113617, + "learning_rate": 6.060682641690062e-06, + "loss": 0.5234, + "step": 654 + }, + { + "epoch": 0.262, + "grad_norm": 0.3644501268863678, + "learning_rate": 6.057589450519807e-06, + "loss": 0.4878, + "step": 655 + }, + { + "epoch": 0.2624, + "grad_norm": 0.489908903837204, + "learning_rate": 6.054491966881905e-06, + "loss": 0.5665, + "step": 656 + }, + { + "epoch": 0.2628, + "grad_norm": 0.4127205014228821, + "learning_rate": 6.05139019597494e-06, + "loss": 0.5241, + "step": 657 + }, + { + "epoch": 0.2632, + "grad_norm": 0.4351557195186615, + "learning_rate": 6.048284143004693e-06, + "loss": 0.6357, + "step": 658 + }, + { + "epoch": 0.2636, + "grad_norm": 0.4123672544956207, + "learning_rate": 6.045173813184132e-06, + "loss": 0.5769, + "step": 659 + }, + { + "epoch": 0.264, + "grad_norm": 0.4411662220954895, + "learning_rate": 6.042059211733405e-06, + "loss": 0.5479, + "step": 660 + }, + { + "epoch": 0.2644, + "grad_norm": 0.4334865212440491, + "learning_rate": 6.038940343879824e-06, + "loss": 0.5502, + "step": 661 + }, + { + "epoch": 0.2648, + "grad_norm": 0.46878618001937866, + "learning_rate": 6.035817214857866e-06, + "loss": 0.6801, + "step": 662 + }, + { + "epoch": 0.2652, + "grad_norm": 0.4362941086292267, + "learning_rate": 6.032689829909158e-06, + "loss": 0.5267, + "step": 663 + }, + { + "epoch": 0.2656, + "grad_norm": 0.40274283289909363, + "learning_rate": 6.0295581942824715e-06, + "loss": 0.4743, + "step": 664 + }, + { + "epoch": 0.266, + "grad_norm": 0.42851364612579346, + "learning_rate": 6.026422313233708e-06, + "loss": 0.4994, + "step": 665 + }, + { + "epoch": 0.2664, + "grad_norm": 0.4585568904876709, + "learning_rate": 6.023282192025897e-06, + "loss": 0.5265, + "step": 666 + }, + { + "epoch": 0.2668, + "grad_norm": 0.41542091965675354, + "learning_rate": 6.020137835929185e-06, + "loss": 0.4668, + "step": 667 + }, + { + "epoch": 0.2672, + "grad_norm": 0.501136839389801, + "learning_rate": 6.016989250220826e-06, + "loss": 0.6468, + "step": 668 + }, + { + "epoch": 0.2676, + "grad_norm": 0.5080068707466125, + "learning_rate": 6.013836440185169e-06, + "loss": 0.5774, + "step": 669 + }, + { + "epoch": 0.268, + "grad_norm": 0.3732645511627197, + "learning_rate": 6.010679411113658e-06, + "loss": 0.4073, + "step": 670 + }, + { + "epoch": 0.2684, + "grad_norm": 0.4972980320453644, + "learning_rate": 6.007518168304815e-06, + "loss": 0.6443, + "step": 671 + }, + { + "epoch": 0.2688, + "grad_norm": 0.43165460228919983, + "learning_rate": 6.004352717064234e-06, + "loss": 0.5778, + "step": 672 + }, + { + "epoch": 0.2692, + "grad_norm": 0.4288862943649292, + "learning_rate": 6.001183062704573e-06, + "loss": 0.58, + "step": 673 + }, + { + "epoch": 0.2696, + "grad_norm": 0.4247485101222992, + "learning_rate": 5.998009210545544e-06, + "loss": 0.4869, + "step": 674 + }, + { + "epoch": 0.27, + "grad_norm": 0.49416670203208923, + "learning_rate": 5.994831165913903e-06, + "loss": 0.6321, + "step": 675 + }, + { + "epoch": 0.2704, + "grad_norm": 0.4453592896461487, + "learning_rate": 5.991648934143443e-06, + "loss": 0.5078, + "step": 676 + }, + { + "epoch": 0.2708, + "grad_norm": 0.424652099609375, + "learning_rate": 5.988462520574987e-06, + "loss": 0.4775, + "step": 677 + }, + { + "epoch": 0.2712, + "grad_norm": 0.4466131925582886, + "learning_rate": 5.985271930556373e-06, + "loss": 0.5983, + "step": 678 + }, + { + "epoch": 0.2716, + "grad_norm": 0.4205477237701416, + "learning_rate": 5.98207716944245e-06, + "loss": 0.5108, + "step": 679 + }, + { + "epoch": 0.272, + "grad_norm": 0.40754249691963196, + "learning_rate": 5.978878242595065e-06, + "loss": 0.5275, + "step": 680 + }, + { + "epoch": 0.2724, + "grad_norm": 0.4342345893383026, + "learning_rate": 5.975675155383063e-06, + "loss": 0.5025, + "step": 681 + }, + { + "epoch": 0.2728, + "grad_norm": 0.4328685402870178, + "learning_rate": 5.972467913182263e-06, + "loss": 0.5089, + "step": 682 + }, + { + "epoch": 0.2732, + "grad_norm": 0.3855776786804199, + "learning_rate": 5.969256521375463e-06, + "loss": 0.4926, + "step": 683 + }, + { + "epoch": 0.2736, + "grad_norm": 0.4109788239002228, + "learning_rate": 5.966040985352424e-06, + "loss": 0.5406, + "step": 684 + }, + { + "epoch": 0.274, + "grad_norm": 0.3982260823249817, + "learning_rate": 5.962821310509862e-06, + "loss": 0.438, + "step": 685 + }, + { + "epoch": 0.2744, + "grad_norm": 0.3830687701702118, + "learning_rate": 5.9595975022514385e-06, + "loss": 0.4475, + "step": 686 + }, + { + "epoch": 0.2748, + "grad_norm": 0.40385526418685913, + "learning_rate": 5.9563695659877555e-06, + "loss": 0.4952, + "step": 687 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4147714674472809, + "learning_rate": 5.953137507136338e-06, + "loss": 0.513, + "step": 688 + }, + { + "epoch": 0.2756, + "grad_norm": 0.4347432553768158, + "learning_rate": 5.949901331121633e-06, + "loss": 0.5845, + "step": 689 + }, + { + "epoch": 0.276, + "grad_norm": 0.45100805163383484, + "learning_rate": 5.946661043375001e-06, + "loss": 0.5552, + "step": 690 + }, + { + "epoch": 0.2764, + "grad_norm": 0.4410780668258667, + "learning_rate": 5.943416649334695e-06, + "loss": 0.6428, + "step": 691 + }, + { + "epoch": 0.2768, + "grad_norm": 0.43005675077438354, + "learning_rate": 5.940168154445869e-06, + "loss": 0.5303, + "step": 692 + }, + { + "epoch": 0.2772, + "grad_norm": 0.48011666536331177, + "learning_rate": 5.93691556416055e-06, + "loss": 0.6382, + "step": 693 + }, + { + "epoch": 0.2776, + "grad_norm": 0.40826767683029175, + "learning_rate": 5.9336588839376465e-06, + "loss": 0.5135, + "step": 694 + }, + { + "epoch": 0.278, + "grad_norm": 0.44731247425079346, + "learning_rate": 5.930398119242927e-06, + "loss": 0.7138, + "step": 695 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4752798080444336, + "learning_rate": 5.9271332755490155e-06, + "loss": 0.6109, + "step": 696 + }, + { + "epoch": 0.2788, + "grad_norm": 0.5010511875152588, + "learning_rate": 5.923864358335384e-06, + "loss": 0.5772, + "step": 697 + }, + { + "epoch": 0.2792, + "grad_norm": 0.5007715225219727, + "learning_rate": 5.920591373088338e-06, + "loss": 0.6622, + "step": 698 + }, + { + "epoch": 0.2796, + "grad_norm": 0.39461269974708557, + "learning_rate": 5.917314325301015e-06, + "loss": 0.5549, + "step": 699 + }, + { + "epoch": 0.28, + "grad_norm": 0.4943528473377228, + "learning_rate": 5.9140332204733655e-06, + "loss": 0.6357, + "step": 700 + }, + { + "epoch": 0.2804, + "grad_norm": 0.45546188950538635, + "learning_rate": 5.910748064112151e-06, + "loss": 0.5509, + "step": 701 + }, + { + "epoch": 0.2808, + "grad_norm": 0.4762568771839142, + "learning_rate": 5.907458861730934e-06, + "loss": 0.5395, + "step": 702 + }, + { + "epoch": 0.2812, + "grad_norm": 0.4343515932559967, + "learning_rate": 5.904165618850068e-06, + "loss": 0.5163, + "step": 703 + }, + { + "epoch": 0.2816, + "grad_norm": 0.4203025996685028, + "learning_rate": 5.900868340996685e-06, + "loss": 0.4834, + "step": 704 + }, + { + "epoch": 0.282, + "grad_norm": 0.4241425096988678, + "learning_rate": 5.897567033704691e-06, + "loss": 0.5301, + "step": 705 + }, + { + "epoch": 0.2824, + "grad_norm": 0.38161882758140564, + "learning_rate": 5.894261702514755e-06, + "loss": 0.5134, + "step": 706 + }, + { + "epoch": 0.2828, + "grad_norm": 0.45214715600013733, + "learning_rate": 5.890952352974299e-06, + "loss": 0.5846, + "step": 707 + }, + { + "epoch": 0.2832, + "grad_norm": 0.42457085847854614, + "learning_rate": 5.887638990637486e-06, + "loss": 0.5388, + "step": 708 + }, + { + "epoch": 0.2836, + "grad_norm": 0.41017526388168335, + "learning_rate": 5.884321621065221e-06, + "loss": 0.4365, + "step": 709 + }, + { + "epoch": 0.284, + "grad_norm": 0.4165228009223938, + "learning_rate": 5.881000249825125e-06, + "loss": 0.5469, + "step": 710 + }, + { + "epoch": 0.2844, + "grad_norm": 0.4457581341266632, + "learning_rate": 5.877674882491543e-06, + "loss": 0.5077, + "step": 711 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4252013564109802, + "learning_rate": 5.874345524645525e-06, + "loss": 0.5792, + "step": 712 + }, + { + "epoch": 0.2852, + "grad_norm": 0.46879255771636963, + "learning_rate": 5.871012181874815e-06, + "loss": 0.6151, + "step": 713 + }, + { + "epoch": 0.2856, + "grad_norm": 0.45289963483810425, + "learning_rate": 5.867674859773849e-06, + "loss": 0.5513, + "step": 714 + }, + { + "epoch": 0.286, + "grad_norm": 0.4870239198207855, + "learning_rate": 5.864333563943737e-06, + "loss": 0.6212, + "step": 715 + }, + { + "epoch": 0.2864, + "grad_norm": 0.4319377839565277, + "learning_rate": 5.860988299992264e-06, + "loss": 0.5148, + "step": 716 + }, + { + "epoch": 0.2868, + "grad_norm": 0.46285220980644226, + "learning_rate": 5.857639073533871e-06, + "loss": 0.6536, + "step": 717 + }, + { + "epoch": 0.2872, + "grad_norm": 0.43863755464553833, + "learning_rate": 5.85428589018965e-06, + "loss": 0.6057, + "step": 718 + }, + { + "epoch": 0.2876, + "grad_norm": 0.39260825514793396, + "learning_rate": 5.850928755587334e-06, + "loss": 0.5483, + "step": 719 + }, + { + "epoch": 0.288, + "grad_norm": 0.43337124586105347, + "learning_rate": 5.847567675361289e-06, + "loss": 0.5708, + "step": 720 + }, + { + "epoch": 0.2884, + "grad_norm": 0.4293830096721649, + "learning_rate": 5.8442026551525e-06, + "loss": 0.5817, + "step": 721 + }, + { + "epoch": 0.2888, + "grad_norm": 0.4857645034790039, + "learning_rate": 5.840833700608566e-06, + "loss": 0.6579, + "step": 722 + }, + { + "epoch": 0.2892, + "grad_norm": 0.43444594740867615, + "learning_rate": 5.837460817383691e-06, + "loss": 0.5676, + "step": 723 + }, + { + "epoch": 0.2896, + "grad_norm": 0.4489664137363434, + "learning_rate": 5.83408401113867e-06, + "loss": 0.5545, + "step": 724 + }, + { + "epoch": 0.29, + "grad_norm": 0.4444904029369354, + "learning_rate": 5.8307032875408836e-06, + "loss": 0.571, + "step": 725 + }, + { + "epoch": 0.2904, + "grad_norm": 0.393856942653656, + "learning_rate": 5.8273186522642866e-06, + "loss": 0.4803, + "step": 726 + }, + { + "epoch": 0.2908, + "grad_norm": 0.4597555994987488, + "learning_rate": 5.8239301109893984e-06, + "loss": 0.6037, + "step": 727 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4321127235889435, + "learning_rate": 5.820537669403295e-06, + "loss": 0.5774, + "step": 728 + }, + { + "epoch": 0.2916, + "grad_norm": 0.4166051149368286, + "learning_rate": 5.817141333199596e-06, + "loss": 0.5717, + "step": 729 + }, + { + "epoch": 0.292, + "grad_norm": 0.436637818813324, + "learning_rate": 5.813741108078462e-06, + "loss": 0.5879, + "step": 730 + }, + { + "epoch": 0.2924, + "grad_norm": 0.43837761878967285, + "learning_rate": 5.810336999746573e-06, + "loss": 0.5178, + "step": 731 + }, + { + "epoch": 0.2928, + "grad_norm": 0.47148534655570984, + "learning_rate": 5.806929013917136e-06, + "loss": 0.5488, + "step": 732 + }, + { + "epoch": 0.2932, + "grad_norm": 0.39476650953292847, + "learning_rate": 5.803517156309857e-06, + "loss": 0.431, + "step": 733 + }, + { + "epoch": 0.2936, + "grad_norm": 0.4546128511428833, + "learning_rate": 5.800101432650946e-06, + "loss": 0.5669, + "step": 734 + }, + { + "epoch": 0.294, + "grad_norm": 0.42975708842277527, + "learning_rate": 5.796681848673098e-06, + "loss": 0.5621, + "step": 735 + }, + { + "epoch": 0.2944, + "grad_norm": 0.46449723839759827, + "learning_rate": 5.7932584101154885e-06, + "loss": 0.6004, + "step": 736 + }, + { + "epoch": 0.2948, + "grad_norm": 0.42449694871902466, + "learning_rate": 5.789831122723761e-06, + "loss": 0.5197, + "step": 737 + }, + { + "epoch": 0.2952, + "grad_norm": 0.39055266976356506, + "learning_rate": 5.78639999225002e-06, + "loss": 0.5367, + "step": 738 + }, + { + "epoch": 0.2956, + "grad_norm": 0.4630034565925598, + "learning_rate": 5.78296502445282e-06, + "loss": 0.5253, + "step": 739 + }, + { + "epoch": 0.296, + "grad_norm": 0.4190136790275574, + "learning_rate": 5.779526225097153e-06, + "loss": 0.4948, + "step": 740 + }, + { + "epoch": 0.2964, + "grad_norm": 0.4667150676250458, + "learning_rate": 5.776083599954447e-06, + "loss": 0.5638, + "step": 741 + }, + { + "epoch": 0.2968, + "grad_norm": 0.45995426177978516, + "learning_rate": 5.7726371548025446e-06, + "loss": 0.5279, + "step": 742 + }, + { + "epoch": 0.2972, + "grad_norm": 0.401014119386673, + "learning_rate": 5.769186895425704e-06, + "loss": 0.5173, + "step": 743 + }, + { + "epoch": 0.2976, + "grad_norm": 0.45050883293151855, + "learning_rate": 5.7657328276145845e-06, + "loss": 0.576, + "step": 744 + }, + { + "epoch": 0.298, + "grad_norm": 0.48369142413139343, + "learning_rate": 5.762274957166234e-06, + "loss": 0.5987, + "step": 745 + }, + { + "epoch": 0.2984, + "grad_norm": 0.5284405946731567, + "learning_rate": 5.758813289884086e-06, + "loss": 0.5244, + "step": 746 + }, + { + "epoch": 0.2988, + "grad_norm": 0.4794248342514038, + "learning_rate": 5.755347831577945e-06, + "loss": 0.5985, + "step": 747 + }, + { + "epoch": 0.2992, + "grad_norm": 0.47695332765579224, + "learning_rate": 5.751878588063979e-06, + "loss": 0.5536, + "step": 748 + }, + { + "epoch": 0.2996, + "grad_norm": 0.435024619102478, + "learning_rate": 5.748405565164705e-06, + "loss": 0.5141, + "step": 749 + }, + { + "epoch": 0.3, + "grad_norm": 0.4625321924686432, + "learning_rate": 5.7449287687089895e-06, + "loss": 0.4985, + "step": 750 + }, + { + "epoch": 0.3004, + "grad_norm": 0.4212191700935364, + "learning_rate": 5.741448204532028e-06, + "loss": 0.4764, + "step": 751 + }, + { + "epoch": 0.3008, + "grad_norm": 0.48766613006591797, + "learning_rate": 5.7379638784753375e-06, + "loss": 0.5614, + "step": 752 + }, + { + "epoch": 0.3012, + "grad_norm": 0.5481346249580383, + "learning_rate": 5.734475796386754e-06, + "loss": 0.6491, + "step": 753 + }, + { + "epoch": 0.3016, + "grad_norm": 0.40259304642677307, + "learning_rate": 5.730983964120414e-06, + "loss": 0.5542, + "step": 754 + }, + { + "epoch": 0.302, + "grad_norm": 0.45115047693252563, + "learning_rate": 5.727488387536749e-06, + "loss": 0.5763, + "step": 755 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4455792009830475, + "learning_rate": 5.723989072502473e-06, + "loss": 0.5814, + "step": 756 + }, + { + "epoch": 0.3028, + "grad_norm": 0.422356516122818, + "learning_rate": 5.720486024890578e-06, + "loss": 0.5086, + "step": 757 + }, + { + "epoch": 0.3032, + "grad_norm": 0.4979648292064667, + "learning_rate": 5.716979250580316e-06, + "loss": 0.5321, + "step": 758 + }, + { + "epoch": 0.3036, + "grad_norm": 0.46708184480667114, + "learning_rate": 5.713468755457198e-06, + "loss": 0.5119, + "step": 759 + }, + { + "epoch": 0.304, + "grad_norm": 0.41758787631988525, + "learning_rate": 5.709954545412975e-06, + "loss": 0.6061, + "step": 760 + }, + { + "epoch": 0.3044, + "grad_norm": 0.44567832350730896, + "learning_rate": 5.706436626345636e-06, + "loss": 0.48, + "step": 761 + }, + { + "epoch": 0.3048, + "grad_norm": 0.4240472912788391, + "learning_rate": 5.702915004159397e-06, + "loss": 0.6254, + "step": 762 + }, + { + "epoch": 0.3052, + "grad_norm": 0.4301627278327942, + "learning_rate": 5.699389684764685e-06, + "loss": 0.5424, + "step": 763 + }, + { + "epoch": 0.3056, + "grad_norm": 0.47439152002334595, + "learning_rate": 5.6958606740781306e-06, + "loss": 0.5885, + "step": 764 + }, + { + "epoch": 0.306, + "grad_norm": 0.4316118359565735, + "learning_rate": 5.692327978022567e-06, + "loss": 0.4796, + "step": 765 + }, + { + "epoch": 0.3064, + "grad_norm": 0.4615771472454071, + "learning_rate": 5.688791602527005e-06, + "loss": 0.6382, + "step": 766 + }, + { + "epoch": 0.3068, + "grad_norm": 0.44186869263648987, + "learning_rate": 5.6852515535266355e-06, + "loss": 0.5634, + "step": 767 + }, + { + "epoch": 0.3072, + "grad_norm": 0.46103107929229736, + "learning_rate": 5.681707836962812e-06, + "loss": 0.576, + "step": 768 + }, + { + "epoch": 0.3076, + "grad_norm": 0.47277137637138367, + "learning_rate": 5.678160458783045e-06, + "loss": 0.5493, + "step": 769 + }, + { + "epoch": 0.308, + "grad_norm": 0.49479013681411743, + "learning_rate": 5.67460942494099e-06, + "loss": 0.5541, + "step": 770 + }, + { + "epoch": 0.3084, + "grad_norm": 0.46736854314804077, + "learning_rate": 5.6710547413964375e-06, + "loss": 0.615, + "step": 771 + }, + { + "epoch": 0.3088, + "grad_norm": 0.45999255776405334, + "learning_rate": 5.667496414115304e-06, + "loss": 0.5378, + "step": 772 + }, + { + "epoch": 0.3092, + "grad_norm": 0.4547343850135803, + "learning_rate": 5.663934449069619e-06, + "loss": 0.5264, + "step": 773 + }, + { + "epoch": 0.3096, + "grad_norm": 0.45823800563812256, + "learning_rate": 5.6603688522375234e-06, + "loss": 0.5644, + "step": 774 + }, + { + "epoch": 0.31, + "grad_norm": 0.4509381949901581, + "learning_rate": 5.656799629603246e-06, + "loss": 0.5604, + "step": 775 + }, + { + "epoch": 0.3104, + "grad_norm": 0.44573745131492615, + "learning_rate": 5.653226787157104e-06, + "loss": 0.4764, + "step": 776 + }, + { + "epoch": 0.3108, + "grad_norm": 0.4186471402645111, + "learning_rate": 5.649650330895492e-06, + "loss": 0.5625, + "step": 777 + }, + { + "epoch": 0.3112, + "grad_norm": 0.39781829714775085, + "learning_rate": 5.646070266820868e-06, + "loss": 0.4717, + "step": 778 + }, + { + "epoch": 0.3116, + "grad_norm": 0.4090104103088379, + "learning_rate": 5.6424866009417425e-06, + "loss": 0.547, + "step": 779 + }, + { + "epoch": 0.312, + "grad_norm": 0.49576470255851746, + "learning_rate": 5.638899339272676e-06, + "loss": 0.6078, + "step": 780 + }, + { + "epoch": 0.3124, + "grad_norm": 0.4280230402946472, + "learning_rate": 5.63530848783426e-06, + "loss": 0.521, + "step": 781 + }, + { + "epoch": 0.3128, + "grad_norm": 0.4742203950881958, + "learning_rate": 5.631714052653111e-06, + "loss": 0.5345, + "step": 782 + }, + { + "epoch": 0.3132, + "grad_norm": 0.49490198493003845, + "learning_rate": 5.628116039761864e-06, + "loss": 0.6582, + "step": 783 + }, + { + "epoch": 0.3136, + "grad_norm": 0.45912107825279236, + "learning_rate": 5.624514455199154e-06, + "loss": 0.582, + "step": 784 + }, + { + "epoch": 0.314, + "grad_norm": 0.45559161901474, + "learning_rate": 5.620909305009612e-06, + "loss": 0.5585, + "step": 785 + }, + { + "epoch": 0.3144, + "grad_norm": 0.46950197219848633, + "learning_rate": 5.617300595243855e-06, + "loss": 0.5311, + "step": 786 + }, + { + "epoch": 0.3148, + "grad_norm": 0.45106202363967896, + "learning_rate": 5.613688331958472e-06, + "loss": 0.5609, + "step": 787 + }, + { + "epoch": 0.3152, + "grad_norm": 0.4136814773082733, + "learning_rate": 5.610072521216017e-06, + "loss": 0.5501, + "step": 788 + }, + { + "epoch": 0.3156, + "grad_norm": 0.4540536403656006, + "learning_rate": 5.606453169084997e-06, + "loss": 0.5868, + "step": 789 + }, + { + "epoch": 0.316, + "grad_norm": 0.42658835649490356, + "learning_rate": 5.602830281639862e-06, + "loss": 0.5929, + "step": 790 + }, + { + "epoch": 0.3164, + "grad_norm": 0.445243775844574, + "learning_rate": 5.599203864961e-06, + "loss": 0.6617, + "step": 791 + }, + { + "epoch": 0.3168, + "grad_norm": 0.5022698044776917, + "learning_rate": 5.595573925134714e-06, + "loss": 0.5943, + "step": 792 + }, + { + "epoch": 0.3172, + "grad_norm": 0.47502660751342773, + "learning_rate": 5.591940468253228e-06, + "loss": 0.54, + "step": 793 + }, + { + "epoch": 0.3176, + "grad_norm": 0.41046321392059326, + "learning_rate": 5.588303500414663e-06, + "loss": 0.5199, + "step": 794 + }, + { + "epoch": 0.318, + "grad_norm": 0.4040413796901703, + "learning_rate": 5.584663027723038e-06, + "loss": 0.5715, + "step": 795 + }, + { + "epoch": 0.3184, + "grad_norm": 0.48011571168899536, + "learning_rate": 5.5810190562882505e-06, + "loss": 0.5669, + "step": 796 + }, + { + "epoch": 0.3188, + "grad_norm": 0.448673278093338, + "learning_rate": 5.577371592226067e-06, + "loss": 0.5856, + "step": 797 + }, + { + "epoch": 0.3192, + "grad_norm": 0.41867223381996155, + "learning_rate": 5.573720641658124e-06, + "loss": 0.523, + "step": 798 + }, + { + "epoch": 0.3196, + "grad_norm": 0.44206374883651733, + "learning_rate": 5.5700662107119035e-06, + "loss": 0.5252, + "step": 799 + }, + { + "epoch": 0.32, + "grad_norm": 0.4805357754230499, + "learning_rate": 5.566408305520729e-06, + "loss": 0.5956, + "step": 800 + }, + { + "epoch": 0.3204, + "grad_norm": 0.42322584986686707, + "learning_rate": 5.562746932223757e-06, + "loss": 0.5521, + "step": 801 + }, + { + "epoch": 0.3208, + "grad_norm": 0.4758487641811371, + "learning_rate": 5.559082096965966e-06, + "loss": 0.5746, + "step": 802 + }, + { + "epoch": 0.3212, + "grad_norm": 0.4096662700176239, + "learning_rate": 5.555413805898139e-06, + "loss": 0.4617, + "step": 803 + }, + { + "epoch": 0.3216, + "grad_norm": 0.40047913789749146, + "learning_rate": 5.551742065176864e-06, + "loss": 0.4953, + "step": 804 + }, + { + "epoch": 0.322, + "grad_norm": 0.4644544720649719, + "learning_rate": 5.5480668809645185e-06, + "loss": 0.6105, + "step": 805 + }, + { + "epoch": 0.3224, + "grad_norm": 0.41639772057533264, + "learning_rate": 5.544388259429255e-06, + "loss": 0.5031, + "step": 806 + }, + { + "epoch": 0.3228, + "grad_norm": 0.48089903593063354, + "learning_rate": 5.540706206745e-06, + "loss": 0.6029, + "step": 807 + }, + { + "epoch": 0.3232, + "grad_norm": 0.4112328588962555, + "learning_rate": 5.537020729091436e-06, + "loss": 0.5244, + "step": 808 + }, + { + "epoch": 0.3236, + "grad_norm": 0.4498112201690674, + "learning_rate": 5.533331832653995e-06, + "loss": 0.4764, + "step": 809 + }, + { + "epoch": 0.324, + "grad_norm": 0.44204649329185486, + "learning_rate": 5.529639523623845e-06, + "loss": 0.654, + "step": 810 + }, + { + "epoch": 0.3244, + "grad_norm": 0.45008304715156555, + "learning_rate": 5.5259438081978826e-06, + "loss": 0.5443, + "step": 811 + }, + { + "epoch": 0.3248, + "grad_norm": 0.4112232029438019, + "learning_rate": 5.522244692578722e-06, + "loss": 0.5507, + "step": 812 + }, + { + "epoch": 0.3252, + "grad_norm": 0.45125049352645874, + "learning_rate": 5.518542182974681e-06, + "loss": 0.4941, + "step": 813 + }, + { + "epoch": 0.3256, + "grad_norm": 0.48122072219848633, + "learning_rate": 5.514836285599779e-06, + "loss": 0.5857, + "step": 814 + }, + { + "epoch": 0.326, + "grad_norm": 0.43177640438079834, + "learning_rate": 5.511127006673717e-06, + "loss": 0.5111, + "step": 815 + }, + { + "epoch": 0.3264, + "grad_norm": 0.37912872433662415, + "learning_rate": 5.507414352421872e-06, + "loss": 0.5635, + "step": 816 + }, + { + "epoch": 0.3268, + "grad_norm": 0.37735727429389954, + "learning_rate": 5.503698329075288e-06, + "loss": 0.4695, + "step": 817 + }, + { + "epoch": 0.3272, + "grad_norm": 0.43305477499961853, + "learning_rate": 5.499978942870659e-06, + "loss": 0.6414, + "step": 818 + }, + { + "epoch": 0.3276, + "grad_norm": 0.4628191888332367, + "learning_rate": 5.496256200050329e-06, + "loss": 0.5046, + "step": 819 + }, + { + "epoch": 0.328, + "grad_norm": 0.4635592997074127, + "learning_rate": 5.4925301068622695e-06, + "loss": 0.5762, + "step": 820 + }, + { + "epoch": 0.3284, + "grad_norm": 0.40314820408821106, + "learning_rate": 5.488800669560079e-06, + "loss": 0.4439, + "step": 821 + }, + { + "epoch": 0.3288, + "grad_norm": 0.508816659450531, + "learning_rate": 5.485067894402968e-06, + "loss": 0.5022, + "step": 822 + }, + { + "epoch": 0.3292, + "grad_norm": 0.4493391811847687, + "learning_rate": 5.481331787655747e-06, + "loss": 0.5457, + "step": 823 + }, + { + "epoch": 0.3296, + "grad_norm": 0.45542025566101074, + "learning_rate": 5.477592355588822e-06, + "loss": 0.5874, + "step": 824 + }, + { + "epoch": 0.33, + "grad_norm": 0.4431161880493164, + "learning_rate": 5.473849604478173e-06, + "loss": 0.5893, + "step": 825 + }, + { + "epoch": 0.3304, + "grad_norm": 0.4549485146999359, + "learning_rate": 5.470103540605358e-06, + "loss": 0.5934, + "step": 826 + }, + { + "epoch": 0.3308, + "grad_norm": 0.4524597227573395, + "learning_rate": 5.466354170257489e-06, + "loss": 0.6158, + "step": 827 + }, + { + "epoch": 0.3312, + "grad_norm": 0.46560999751091003, + "learning_rate": 5.462601499727233e-06, + "loss": 0.5772, + "step": 828 + }, + { + "epoch": 0.3316, + "grad_norm": 0.4526055157184601, + "learning_rate": 5.4588455353127905e-06, + "loss": 0.54, + "step": 829 + }, + { + "epoch": 0.332, + "grad_norm": 0.4451562762260437, + "learning_rate": 5.455086283317893e-06, + "loss": 0.5364, + "step": 830 + }, + { + "epoch": 0.3324, + "grad_norm": 0.4619428217411041, + "learning_rate": 5.45132375005179e-06, + "loss": 0.5536, + "step": 831 + }, + { + "epoch": 0.3328, + "grad_norm": 0.45558473467826843, + "learning_rate": 5.447557941829236e-06, + "loss": 0.5492, + "step": 832 + }, + { + "epoch": 0.3332, + "grad_norm": 0.4863041341304779, + "learning_rate": 5.443788864970483e-06, + "loss": 0.5926, + "step": 833 + }, + { + "epoch": 0.3336, + "grad_norm": 0.4615713953971863, + "learning_rate": 5.440016525801269e-06, + "loss": 0.6397, + "step": 834 + }, + { + "epoch": 0.334, + "grad_norm": 0.47101691365242004, + "learning_rate": 5.436240930652807e-06, + "loss": 0.6057, + "step": 835 + }, + { + "epoch": 0.3344, + "grad_norm": 0.43971094489097595, + "learning_rate": 5.432462085861777e-06, + "loss": 0.4855, + "step": 836 + }, + { + "epoch": 0.3348, + "grad_norm": 0.45537781715393066, + "learning_rate": 5.428679997770307e-06, + "loss": 0.4801, + "step": 837 + }, + { + "epoch": 0.3352, + "grad_norm": 0.4680633544921875, + "learning_rate": 5.424894672725974e-06, + "loss": 0.5117, + "step": 838 + }, + { + "epoch": 0.3356, + "grad_norm": 0.4822414219379425, + "learning_rate": 5.421106117081785e-06, + "loss": 0.6574, + "step": 839 + }, + { + "epoch": 0.336, + "grad_norm": 0.464046448469162, + "learning_rate": 5.417314337196171e-06, + "loss": 0.6024, + "step": 840 + }, + { + "epoch": 0.3364, + "grad_norm": 0.47728002071380615, + "learning_rate": 5.41351933943297e-06, + "loss": 0.5942, + "step": 841 + }, + { + "epoch": 0.3368, + "grad_norm": 0.5082221627235413, + "learning_rate": 5.409721130161428e-06, + "loss": 0.5226, + "step": 842 + }, + { + "epoch": 0.3372, + "grad_norm": 0.5022159218788147, + "learning_rate": 5.4059197157561715e-06, + "loss": 0.5086, + "step": 843 + }, + { + "epoch": 0.3376, + "grad_norm": 0.4637428820133209, + "learning_rate": 5.402115102597215e-06, + "loss": 0.4954, + "step": 844 + }, + { + "epoch": 0.338, + "grad_norm": 0.4835737943649292, + "learning_rate": 5.398307297069937e-06, + "loss": 0.5729, + "step": 845 + }, + { + "epoch": 0.3384, + "grad_norm": 0.4542867839336395, + "learning_rate": 5.394496305565074e-06, + "loss": 0.5587, + "step": 846 + }, + { + "epoch": 0.3388, + "grad_norm": 0.43878334760665894, + "learning_rate": 5.3906821344787105e-06, + "loss": 0.5253, + "step": 847 + }, + { + "epoch": 0.3392, + "grad_norm": 0.48306146264076233, + "learning_rate": 5.386864790212268e-06, + "loss": 0.5476, + "step": 848 + }, + { + "epoch": 0.3396, + "grad_norm": 0.455120325088501, + "learning_rate": 5.383044279172491e-06, + "loss": 0.5114, + "step": 849 + }, + { + "epoch": 0.34, + "grad_norm": 0.448162317276001, + "learning_rate": 5.379220607771444e-06, + "loss": 0.5854, + "step": 850 + }, + { + "epoch": 0.3404, + "grad_norm": 0.4229871928691864, + "learning_rate": 5.375393782426488e-06, + "loss": 0.4675, + "step": 851 + }, + { + "epoch": 0.3408, + "grad_norm": 0.4494837522506714, + "learning_rate": 5.371563809560285e-06, + "loss": 0.5747, + "step": 852 + }, + { + "epoch": 0.3412, + "grad_norm": 0.4818879961967468, + "learning_rate": 5.367730695600774e-06, + "loss": 0.5542, + "step": 853 + }, + { + "epoch": 0.3416, + "grad_norm": 0.4855602979660034, + "learning_rate": 5.363894446981171e-06, + "loss": 0.6312, + "step": 854 + }, + { + "epoch": 0.342, + "grad_norm": 0.3857225477695465, + "learning_rate": 5.360055070139946e-06, + "loss": 0.4853, + "step": 855 + }, + { + "epoch": 0.3424, + "grad_norm": 0.4498901963233948, + "learning_rate": 5.356212571520827e-06, + "loss": 0.5814, + "step": 856 + }, + { + "epoch": 0.3428, + "grad_norm": 0.3908587098121643, + "learning_rate": 5.352366957572774e-06, + "loss": 0.4917, + "step": 857 + }, + { + "epoch": 0.3432, + "grad_norm": 0.433999627828598, + "learning_rate": 5.34851823474998e-06, + "loss": 0.6064, + "step": 858 + }, + { + "epoch": 0.3436, + "grad_norm": 0.44410914182662964, + "learning_rate": 5.344666409511857e-06, + "loss": 0.5617, + "step": 859 + }, + { + "epoch": 0.344, + "grad_norm": 0.49029380083084106, + "learning_rate": 5.340811488323019e-06, + "loss": 0.5801, + "step": 860 + }, + { + "epoch": 0.3444, + "grad_norm": 0.4502889811992645, + "learning_rate": 5.336953477653281e-06, + "loss": 0.5024, + "step": 861 + }, + { + "epoch": 0.3448, + "grad_norm": 0.4806410074234009, + "learning_rate": 5.333092383977638e-06, + "loss": 0.6077, + "step": 862 + }, + { + "epoch": 0.3452, + "grad_norm": 0.4842955470085144, + "learning_rate": 5.329228213776264e-06, + "loss": 0.537, + "step": 863 + }, + { + "epoch": 0.3456, + "grad_norm": 0.47725391387939453, + "learning_rate": 5.325360973534495e-06, + "loss": 0.6212, + "step": 864 + }, + { + "epoch": 0.346, + "grad_norm": 0.40789616107940674, + "learning_rate": 5.321490669742815e-06, + "loss": 0.5103, + "step": 865 + }, + { + "epoch": 0.3464, + "grad_norm": 0.41891586780548096, + "learning_rate": 5.317617308896859e-06, + "loss": 0.5349, + "step": 866 + }, + { + "epoch": 0.3468, + "grad_norm": 0.4041453003883362, + "learning_rate": 5.313740897497384e-06, + "loss": 0.4992, + "step": 867 + }, + { + "epoch": 0.3472, + "grad_norm": 0.4476974308490753, + "learning_rate": 5.309861442050272e-06, + "loss": 0.5829, + "step": 868 + }, + { + "epoch": 0.3476, + "grad_norm": 0.45309120416641235, + "learning_rate": 5.305978949066509e-06, + "loss": 0.5601, + "step": 869 + }, + { + "epoch": 0.348, + "grad_norm": 0.46513673663139343, + "learning_rate": 5.302093425062187e-06, + "loss": 0.613, + "step": 870 + }, + { + "epoch": 0.3484, + "grad_norm": 0.4027946889400482, + "learning_rate": 5.298204876558476e-06, + "loss": 0.554, + "step": 871 + }, + { + "epoch": 0.3488, + "grad_norm": 0.45852765440940857, + "learning_rate": 5.294313310081627e-06, + "loss": 0.6269, + "step": 872 + }, + { + "epoch": 0.3492, + "grad_norm": 0.4091279208660126, + "learning_rate": 5.290418732162957e-06, + "loss": 0.565, + "step": 873 + }, + { + "epoch": 0.3496, + "grad_norm": 0.4555352032184601, + "learning_rate": 5.286521149338833e-06, + "loss": 0.5722, + "step": 874 + }, + { + "epoch": 0.35, + "grad_norm": 0.4212196469306946, + "learning_rate": 5.28262056815067e-06, + "loss": 0.4969, + "step": 875 + }, + { + "epoch": 0.3504, + "grad_norm": 0.4575810730457306, + "learning_rate": 5.278716995144912e-06, + "loss": 0.5113, + "step": 876 + }, + { + "epoch": 0.3508, + "grad_norm": 0.4176642894744873, + "learning_rate": 5.274810436873025e-06, + "loss": 0.5705, + "step": 877 + }, + { + "epoch": 0.3512, + "grad_norm": 0.5067360401153564, + "learning_rate": 5.270900899891487e-06, + "loss": 0.6339, + "step": 878 + }, + { + "epoch": 0.3516, + "grad_norm": 0.46837419271469116, + "learning_rate": 5.266988390761771e-06, + "loss": 0.4302, + "step": 879 + }, + { + "epoch": 0.352, + "grad_norm": 0.4802364408969879, + "learning_rate": 5.263072916050343e-06, + "loss": 0.5163, + "step": 880 + }, + { + "epoch": 0.3524, + "grad_norm": 0.45025691390037537, + "learning_rate": 5.259154482328642e-06, + "loss": 0.5707, + "step": 881 + }, + { + "epoch": 0.3528, + "grad_norm": 0.43300339579582214, + "learning_rate": 5.255233096173078e-06, + "loss": 0.6326, + "step": 882 + }, + { + "epoch": 0.3532, + "grad_norm": 0.47875091433525085, + "learning_rate": 5.251308764165012e-06, + "loss": 0.5576, + "step": 883 + }, + { + "epoch": 0.3536, + "grad_norm": 0.42456284165382385, + "learning_rate": 5.247381492890752e-06, + "loss": 0.4854, + "step": 884 + }, + { + "epoch": 0.354, + "grad_norm": 0.39123857021331787, + "learning_rate": 5.243451288941536e-06, + "loss": 0.542, + "step": 885 + }, + { + "epoch": 0.3544, + "grad_norm": 0.4416203200817108, + "learning_rate": 5.239518158913525e-06, + "loss": 0.5864, + "step": 886 + }, + { + "epoch": 0.3548, + "grad_norm": 0.4715004861354828, + "learning_rate": 5.2355821094077935e-06, + "loss": 0.6147, + "step": 887 + }, + { + "epoch": 0.3552, + "grad_norm": 0.4258134663105011, + "learning_rate": 5.231643147030312e-06, + "loss": 0.6116, + "step": 888 + }, + { + "epoch": 0.3556, + "grad_norm": 0.4103778004646301, + "learning_rate": 5.227701278391942e-06, + "loss": 0.6069, + "step": 889 + }, + { + "epoch": 0.356, + "grad_norm": 0.4473305940628052, + "learning_rate": 5.223756510108424e-06, + "loss": 0.6424, + "step": 890 + }, + { + "epoch": 0.3564, + "grad_norm": 0.4191577136516571, + "learning_rate": 5.219808848800361e-06, + "loss": 0.559, + "step": 891 + }, + { + "epoch": 0.3568, + "grad_norm": 0.46516153216362, + "learning_rate": 5.215858301093216e-06, + "loss": 0.5549, + "step": 892 + }, + { + "epoch": 0.3572, + "grad_norm": 0.4698564410209656, + "learning_rate": 5.211904873617292e-06, + "loss": 0.5695, + "step": 893 + }, + { + "epoch": 0.3576, + "grad_norm": 0.4017867147922516, + "learning_rate": 5.207948573007728e-06, + "loss": 0.5333, + "step": 894 + }, + { + "epoch": 0.358, + "grad_norm": 0.4583630859851837, + "learning_rate": 5.203989405904483e-06, + "loss": 0.6255, + "step": 895 + }, + { + "epoch": 0.3584, + "grad_norm": 0.44605836272239685, + "learning_rate": 5.200027378952332e-06, + "loss": 0.4985, + "step": 896 + }, + { + "epoch": 0.3588, + "grad_norm": 0.4361209571361542, + "learning_rate": 5.196062498800842e-06, + "loss": 0.6041, + "step": 897 + }, + { + "epoch": 0.3592, + "grad_norm": 0.4455268681049347, + "learning_rate": 5.192094772104373e-06, + "loss": 0.5389, + "step": 898 + }, + { + "epoch": 0.3596, + "grad_norm": 0.4654677212238312, + "learning_rate": 5.188124205522065e-06, + "loss": 0.5827, + "step": 899 + }, + { + "epoch": 0.36, + "grad_norm": 0.4446803629398346, + "learning_rate": 5.184150805717818e-06, + "loss": 0.5612, + "step": 900 + }, + { + "epoch": 0.3604, + "grad_norm": 0.4743603467941284, + "learning_rate": 5.180174579360292e-06, + "loss": 0.5681, + "step": 901 + }, + { + "epoch": 0.3608, + "grad_norm": 0.45481276512145996, + "learning_rate": 5.176195533122887e-06, + "loss": 0.5277, + "step": 902 + }, + { + "epoch": 0.3612, + "grad_norm": 0.4798752963542938, + "learning_rate": 5.172213673683741e-06, + "loss": 0.5995, + "step": 903 + }, + { + "epoch": 0.3616, + "grad_norm": 0.44253095984458923, + "learning_rate": 5.168229007725709e-06, + "loss": 0.5572, + "step": 904 + }, + { + "epoch": 0.362, + "grad_norm": 0.5103812217712402, + "learning_rate": 5.164241541936356e-06, + "loss": 0.5673, + "step": 905 + }, + { + "epoch": 0.3624, + "grad_norm": 0.45087578892707825, + "learning_rate": 5.16025128300795e-06, + "loss": 0.4734, + "step": 906 + }, + { + "epoch": 0.3628, + "grad_norm": 0.534394383430481, + "learning_rate": 5.156258237637444e-06, + "loss": 0.5476, + "step": 907 + }, + { + "epoch": 0.3632, + "grad_norm": 0.5126708745956421, + "learning_rate": 5.152262412526467e-06, + "loss": 0.6094, + "step": 908 + }, + { + "epoch": 0.3636, + "grad_norm": 0.41151338815689087, + "learning_rate": 5.148263814381314e-06, + "loss": 0.5243, + "step": 909 + }, + { + "epoch": 0.364, + "grad_norm": 0.5132277011871338, + "learning_rate": 5.144262449912935e-06, + "loss": 0.6284, + "step": 910 + }, + { + "epoch": 0.3644, + "grad_norm": 0.4442318379878998, + "learning_rate": 5.140258325836923e-06, + "loss": 0.4923, + "step": 911 + }, + { + "epoch": 0.3648, + "grad_norm": 0.4278882145881653, + "learning_rate": 5.1362514488735e-06, + "loss": 0.5837, + "step": 912 + }, + { + "epoch": 0.3652, + "grad_norm": 0.4440222382545471, + "learning_rate": 5.13224182574751e-06, + "loss": 0.5436, + "step": 913 + }, + { + "epoch": 0.3656, + "grad_norm": 0.4995422661304474, + "learning_rate": 5.128229463188406e-06, + "loss": 0.5801, + "step": 914 + }, + { + "epoch": 0.366, + "grad_norm": 0.4683188796043396, + "learning_rate": 5.12421436793024e-06, + "loss": 0.6342, + "step": 915 + }, + { + "epoch": 0.3664, + "grad_norm": 0.4733324348926544, + "learning_rate": 5.120196546711647e-06, + "loss": 0.5931, + "step": 916 + }, + { + "epoch": 0.3668, + "grad_norm": 0.4991692304611206, + "learning_rate": 5.1161760062758395e-06, + "loss": 0.6111, + "step": 917 + }, + { + "epoch": 0.3672, + "grad_norm": 0.4712377190589905, + "learning_rate": 5.112152753370594e-06, + "loss": 0.5567, + "step": 918 + }, + { + "epoch": 0.3676, + "grad_norm": 0.47813770174980164, + "learning_rate": 5.108126794748237e-06, + "loss": 0.5108, + "step": 919 + }, + { + "epoch": 0.368, + "grad_norm": 0.4006287157535553, + "learning_rate": 5.1040981371656384e-06, + "loss": 0.4912, + "step": 920 + }, + { + "epoch": 0.3684, + "grad_norm": 0.4339583218097687, + "learning_rate": 5.100066787384198e-06, + "loss": 0.46, + "step": 921 + }, + { + "epoch": 0.3688, + "grad_norm": 0.4966609477996826, + "learning_rate": 5.096032752169831e-06, + "loss": 0.5807, + "step": 922 + }, + { + "epoch": 0.3692, + "grad_norm": 0.43865084648132324, + "learning_rate": 5.091996038292962e-06, + "loss": 0.5623, + "step": 923 + }, + { + "epoch": 0.3696, + "grad_norm": 0.4705183506011963, + "learning_rate": 5.087956652528513e-06, + "loss": 0.5841, + "step": 924 + }, + { + "epoch": 0.37, + "grad_norm": 0.4557614326477051, + "learning_rate": 5.083914601655887e-06, + "loss": 0.5623, + "step": 925 + }, + { + "epoch": 0.3704, + "grad_norm": 0.45096200704574585, + "learning_rate": 5.079869892458959e-06, + "loss": 0.629, + "step": 926 + }, + { + "epoch": 0.3708, + "grad_norm": 0.48638662695884705, + "learning_rate": 5.075822531726071e-06, + "loss": 0.5955, + "step": 927 + }, + { + "epoch": 0.3712, + "grad_norm": 0.5118933320045471, + "learning_rate": 5.071772526250009e-06, + "loss": 0.593, + "step": 928 + }, + { + "epoch": 0.3716, + "grad_norm": 0.498941570520401, + "learning_rate": 5.067719882828004e-06, + "loss": 0.4636, + "step": 929 + }, + { + "epoch": 0.372, + "grad_norm": 0.4572960138320923, + "learning_rate": 5.0636646082617085e-06, + "loss": 0.5629, + "step": 930 + }, + { + "epoch": 0.3724, + "grad_norm": 0.5548087358474731, + "learning_rate": 5.059606709357192e-06, + "loss": 0.5777, + "step": 931 + }, + { + "epoch": 0.3728, + "grad_norm": 0.46791890263557434, + "learning_rate": 5.0555461929249345e-06, + "loss": 0.56, + "step": 932 + }, + { + "epoch": 0.3732, + "grad_norm": 0.5351516008377075, + "learning_rate": 5.0514830657797986e-06, + "loss": 0.6033, + "step": 933 + }, + { + "epoch": 0.3736, + "grad_norm": 0.46923014521598816, + "learning_rate": 5.047417334741038e-06, + "loss": 0.4895, + "step": 934 + }, + { + "epoch": 0.374, + "grad_norm": 0.4047207534313202, + "learning_rate": 5.043349006632271e-06, + "loss": 0.5233, + "step": 935 + }, + { + "epoch": 0.3744, + "grad_norm": 0.428609699010849, + "learning_rate": 5.039278088281479e-06, + "loss": 0.509, + "step": 936 + }, + { + "epoch": 0.3748, + "grad_norm": 0.4471479654312134, + "learning_rate": 5.035204586520985e-06, + "loss": 0.4526, + "step": 937 + }, + { + "epoch": 0.3752, + "grad_norm": 0.4439519941806793, + "learning_rate": 5.031128508187452e-06, + "loss": 0.5774, + "step": 938 + }, + { + "epoch": 0.3756, + "grad_norm": 0.47322627902030945, + "learning_rate": 5.027049860121869e-06, + "loss": 0.574, + "step": 939 + }, + { + "epoch": 0.376, + "grad_norm": 0.4471387267112732, + "learning_rate": 5.022968649169531e-06, + "loss": 0.5038, + "step": 940 + }, + { + "epoch": 0.3764, + "grad_norm": 0.4897695481777191, + "learning_rate": 5.018884882180042e-06, + "loss": 0.6207, + "step": 941 + }, + { + "epoch": 0.3768, + "grad_norm": 0.40873169898986816, + "learning_rate": 5.0147985660072895e-06, + "loss": 0.4889, + "step": 942 + }, + { + "epoch": 0.3772, + "grad_norm": 0.3923221826553345, + "learning_rate": 5.0107097075094445e-06, + "loss": 0.5532, + "step": 943 + }, + { + "epoch": 0.3776, + "grad_norm": 0.4386279881000519, + "learning_rate": 5.00661831354894e-06, + "loss": 0.5483, + "step": 944 + }, + { + "epoch": 0.378, + "grad_norm": 0.4298570156097412, + "learning_rate": 5.002524390992471e-06, + "loss": 0.4718, + "step": 945 + }, + { + "epoch": 0.3784, + "grad_norm": 0.4775792360305786, + "learning_rate": 4.998427946710967e-06, + "loss": 0.6635, + "step": 946 + }, + { + "epoch": 0.3788, + "grad_norm": 0.43783479928970337, + "learning_rate": 4.9943289875796e-06, + "loss": 0.5541, + "step": 947 + }, + { + "epoch": 0.3792, + "grad_norm": 0.4827110171318054, + "learning_rate": 4.9902275204777545e-06, + "loss": 0.6063, + "step": 948 + }, + { + "epoch": 0.3796, + "grad_norm": 0.47438400983810425, + "learning_rate": 4.9861235522890274e-06, + "loss": 0.514, + "step": 949 + }, + { + "epoch": 0.38, + "grad_norm": 0.485522598028183, + "learning_rate": 4.982017089901216e-06, + "loss": 0.586, + "step": 950 + }, + { + "epoch": 0.3804, + "grad_norm": 0.49923890829086304, + "learning_rate": 4.977908140206297e-06, + "loss": 0.6124, + "step": 951 + }, + { + "epoch": 0.3808, + "grad_norm": 0.46352657675743103, + "learning_rate": 4.97379671010043e-06, + "loss": 0.6389, + "step": 952 + }, + { + "epoch": 0.3812, + "grad_norm": 0.467952162027359, + "learning_rate": 4.969682806483929e-06, + "loss": 0.581, + "step": 953 + }, + { + "epoch": 0.3816, + "grad_norm": 0.5383191108703613, + "learning_rate": 4.965566436261266e-06, + "loss": 0.6133, + "step": 954 + }, + { + "epoch": 0.382, + "grad_norm": 0.4607367217540741, + "learning_rate": 4.961447606341048e-06, + "loss": 0.5202, + "step": 955 + }, + { + "epoch": 0.3824, + "grad_norm": 0.48336759209632874, + "learning_rate": 4.957326323636015e-06, + "loss": 0.5309, + "step": 956 + }, + { + "epoch": 0.3828, + "grad_norm": 0.44407516717910767, + "learning_rate": 4.95320259506302e-06, + "loss": 0.5205, + "step": 957 + }, + { + "epoch": 0.3832, + "grad_norm": 0.45355352759361267, + "learning_rate": 4.949076427543021e-06, + "loss": 0.616, + "step": 958 + }, + { + "epoch": 0.3836, + "grad_norm": 0.42455554008483887, + "learning_rate": 4.944947828001071e-06, + "loss": 0.5373, + "step": 959 + }, + { + "epoch": 0.384, + "grad_norm": 0.3933660089969635, + "learning_rate": 4.940816803366304e-06, + "loss": 0.4608, + "step": 960 + }, + { + "epoch": 0.3844, + "grad_norm": 0.47803884744644165, + "learning_rate": 4.936683360571924e-06, + "loss": 0.4946, + "step": 961 + }, + { + "epoch": 0.3848, + "grad_norm": 0.4601835310459137, + "learning_rate": 4.932547506555193e-06, + "loss": 0.5556, + "step": 962 + }, + { + "epoch": 0.3852, + "grad_norm": 0.5340824127197266, + "learning_rate": 4.92840924825742e-06, + "loss": 0.5202, + "step": 963 + }, + { + "epoch": 0.3856, + "grad_norm": 0.4656091630458832, + "learning_rate": 4.924268592623952e-06, + "loss": 0.509, + "step": 964 + }, + { + "epoch": 0.386, + "grad_norm": 0.4116588234901428, + "learning_rate": 4.9201255466041545e-06, + "loss": 0.4783, + "step": 965 + }, + { + "epoch": 0.3864, + "grad_norm": 0.5430503487586975, + "learning_rate": 4.915980117151407e-06, + "loss": 0.5982, + "step": 966 + }, + { + "epoch": 0.3868, + "grad_norm": 0.43445679545402527, + "learning_rate": 4.9118323112230925e-06, + "loss": 0.4827, + "step": 967 + }, + { + "epoch": 0.3872, + "grad_norm": 0.4689733386039734, + "learning_rate": 4.907682135780576e-06, + "loss": 0.6506, + "step": 968 + }, + { + "epoch": 0.3876, + "grad_norm": 0.4890734553337097, + "learning_rate": 4.903529597789206e-06, + "loss": 0.5741, + "step": 969 + }, + { + "epoch": 0.388, + "grad_norm": 0.4545091688632965, + "learning_rate": 4.899374704218291e-06, + "loss": 0.5148, + "step": 970 + }, + { + "epoch": 0.3884, + "grad_norm": 0.5078072547912598, + "learning_rate": 4.8952174620410965e-06, + "loss": 0.6291, + "step": 971 + }, + { + "epoch": 0.3888, + "grad_norm": 0.4194016754627228, + "learning_rate": 4.891057878234827e-06, + "loss": 0.504, + "step": 972 + }, + { + "epoch": 0.3892, + "grad_norm": 0.4390861988067627, + "learning_rate": 4.886895959780618e-06, + "loss": 0.5773, + "step": 973 + }, + { + "epoch": 0.3896, + "grad_norm": 0.4521550238132477, + "learning_rate": 4.882731713663524e-06, + "loss": 0.5093, + "step": 974 + }, + { + "epoch": 0.39, + "grad_norm": 0.4769055247306824, + "learning_rate": 4.878565146872505e-06, + "loss": 0.5763, + "step": 975 + }, + { + "epoch": 0.3904, + "grad_norm": 0.4254240393638611, + "learning_rate": 4.874396266400418e-06, + "loss": 0.5504, + "step": 976 + }, + { + "epoch": 0.3908, + "grad_norm": 0.4530830383300781, + "learning_rate": 4.870225079243998e-06, + "loss": 0.503, + "step": 977 + }, + { + "epoch": 0.3912, + "grad_norm": 0.4451112747192383, + "learning_rate": 4.866051592403859e-06, + "loss": 0.4848, + "step": 978 + }, + { + "epoch": 0.3916, + "grad_norm": 0.5103582739830017, + "learning_rate": 4.8618758128844675e-06, + "loss": 0.5432, + "step": 979 + }, + { + "epoch": 0.392, + "grad_norm": 0.4586694538593292, + "learning_rate": 4.857697747694142e-06, + "loss": 0.5402, + "step": 980 + }, + { + "epoch": 0.3924, + "grad_norm": 0.3923027813434601, + "learning_rate": 4.853517403845037e-06, + "loss": 0.4839, + "step": 981 + }, + { + "epoch": 0.3928, + "grad_norm": 0.43034645915031433, + "learning_rate": 4.849334788353128e-06, + "loss": 0.5624, + "step": 982 + }, + { + "epoch": 0.3932, + "grad_norm": 0.4340299665927887, + "learning_rate": 4.8451499082382075e-06, + "loss": 0.5928, + "step": 983 + }, + { + "epoch": 0.3936, + "grad_norm": 0.5210539698600769, + "learning_rate": 4.840962770523865e-06, + "loss": 0.5105, + "step": 984 + }, + { + "epoch": 0.394, + "grad_norm": 0.4606340527534485, + "learning_rate": 4.836773382237481e-06, + "loss": 0.5041, + "step": 985 + }, + { + "epoch": 0.3944, + "grad_norm": 0.4190974235534668, + "learning_rate": 4.832581750410213e-06, + "loss": 0.4632, + "step": 986 + }, + { + "epoch": 0.3948, + "grad_norm": 0.4932776987552643, + "learning_rate": 4.8283878820769835e-06, + "loss": 0.5377, + "step": 987 + }, + { + "epoch": 0.3952, + "grad_norm": 0.4378507733345032, + "learning_rate": 4.824191784276469e-06, + "loss": 0.4748, + "step": 988 + }, + { + "epoch": 0.3956, + "grad_norm": 0.4637993276119232, + "learning_rate": 4.8199934640510875e-06, + "loss": 0.6412, + "step": 989 + }, + { + "epoch": 0.396, + "grad_norm": 0.5731667280197144, + "learning_rate": 4.815792928446986e-06, + "loss": 0.6257, + "step": 990 + }, + { + "epoch": 0.3964, + "grad_norm": 0.46051129698753357, + "learning_rate": 4.811590184514033e-06, + "loss": 0.4641, + "step": 991 + }, + { + "epoch": 0.3968, + "grad_norm": 0.44054555892944336, + "learning_rate": 4.8073852393057995e-06, + "loss": 0.4956, + "step": 992 + }, + { + "epoch": 0.3972, + "grad_norm": 0.44733765721321106, + "learning_rate": 4.803178099879551e-06, + "loss": 0.5533, + "step": 993 + }, + { + "epoch": 0.3976, + "grad_norm": 0.4734288454055786, + "learning_rate": 4.798968773296238e-06, + "loss": 0.5923, + "step": 994 + }, + { + "epoch": 0.398, + "grad_norm": 0.44051530957221985, + "learning_rate": 4.794757266620482e-06, + "loss": 0.4936, + "step": 995 + }, + { + "epoch": 0.3984, + "grad_norm": 0.48114386200904846, + "learning_rate": 4.790543586920561e-06, + "loss": 0.6136, + "step": 996 + }, + { + "epoch": 0.3988, + "grad_norm": 0.4775163233280182, + "learning_rate": 4.7863277412684e-06, + "loss": 0.5113, + "step": 997 + }, + { + "epoch": 0.3992, + "grad_norm": 0.4406886100769043, + "learning_rate": 4.782109736739562e-06, + "loss": 0.5045, + "step": 998 + }, + { + "epoch": 0.3996, + "grad_norm": 0.5010861754417419, + "learning_rate": 4.777889580413232e-06, + "loss": 0.5119, + "step": 999 + }, + { + "epoch": 0.4, + "grad_norm": 0.47519031167030334, + "learning_rate": 4.773667279372207e-06, + "loss": 0.6024, + "step": 1000 + }, + { + "epoch": 0.4004, + "grad_norm": 0.46936264634132385, + "learning_rate": 4.769442840702879e-06, + "loss": 0.5832, + "step": 1001 + }, + { + "epoch": 0.4008, + "grad_norm": 0.4049946665763855, + "learning_rate": 4.765216271495233e-06, + "loss": 0.5696, + "step": 1002 + }, + { + "epoch": 0.4012, + "grad_norm": 0.4819742739200592, + "learning_rate": 4.760987578842828e-06, + "loss": 0.5264, + "step": 1003 + }, + { + "epoch": 0.4016, + "grad_norm": 0.5121580362319946, + "learning_rate": 4.756756769842788e-06, + "loss": 0.5501, + "step": 1004 + }, + { + "epoch": 0.402, + "grad_norm": 0.4060859978199005, + "learning_rate": 4.752523851595785e-06, + "loss": 0.456, + "step": 1005 + }, + { + "epoch": 0.4024, + "grad_norm": 0.5201736688613892, + "learning_rate": 4.748288831206037e-06, + "loss": 0.6065, + "step": 1006 + }, + { + "epoch": 0.4028, + "grad_norm": 0.4155638515949249, + "learning_rate": 4.744051715781286e-06, + "loss": 0.4812, + "step": 1007 + }, + { + "epoch": 0.4032, + "grad_norm": 0.4884546101093292, + "learning_rate": 4.739812512432788e-06, + "loss": 0.5775, + "step": 1008 + }, + { + "epoch": 0.4036, + "grad_norm": 0.4956738352775574, + "learning_rate": 4.73557122827531e-06, + "loss": 0.6205, + "step": 1009 + }, + { + "epoch": 0.404, + "grad_norm": 0.5215380787849426, + "learning_rate": 4.731327870427103e-06, + "loss": 0.6252, + "step": 1010 + }, + { + "epoch": 0.4044, + "grad_norm": 0.47527143359184265, + "learning_rate": 4.727082446009909e-06, + "loss": 0.56, + "step": 1011 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4762853980064392, + "learning_rate": 4.722834962148927e-06, + "loss": 0.5282, + "step": 1012 + }, + { + "epoch": 0.4052, + "grad_norm": 0.42091140151023865, + "learning_rate": 4.718585425972819e-06, + "loss": 0.495, + "step": 1013 + }, + { + "epoch": 0.4056, + "grad_norm": 0.4845111072063446, + "learning_rate": 4.714333844613692e-06, + "loss": 0.5523, + "step": 1014 + }, + { + "epoch": 0.406, + "grad_norm": 0.4628712832927704, + "learning_rate": 4.71008022520708e-06, + "loss": 0.5445, + "step": 1015 + }, + { + "epoch": 0.4064, + "grad_norm": 0.4790785014629364, + "learning_rate": 4.705824574891944e-06, + "loss": 0.5288, + "step": 1016 + }, + { + "epoch": 0.4068, + "grad_norm": 0.5232818126678467, + "learning_rate": 4.701566900810648e-06, + "loss": 0.6096, + "step": 1017 + }, + { + "epoch": 0.4072, + "grad_norm": 0.46365299820899963, + "learning_rate": 4.697307210108957e-06, + "loss": 0.5674, + "step": 1018 + }, + { + "epoch": 0.4076, + "grad_norm": 0.46038663387298584, + "learning_rate": 4.693045509936018e-06, + "loss": 0.4781, + "step": 1019 + }, + { + "epoch": 0.408, + "grad_norm": 0.4882473051548004, + "learning_rate": 4.6887818074443514e-06, + "loss": 0.5625, + "step": 1020 + }, + { + "epoch": 0.4084, + "grad_norm": 0.4895521402359009, + "learning_rate": 4.6845161097898376e-06, + "loss": 0.5711, + "step": 1021 + }, + { + "epoch": 0.4088, + "grad_norm": 0.4484206438064575, + "learning_rate": 4.680248424131706e-06, + "loss": 0.5218, + "step": 1022 + }, + { + "epoch": 0.4092, + "grad_norm": 0.5106765627861023, + "learning_rate": 4.675978757632522e-06, + "loss": 0.6769, + "step": 1023 + }, + { + "epoch": 0.4096, + "grad_norm": 0.45548179745674133, + "learning_rate": 4.671707117458176e-06, + "loss": 0.4893, + "step": 1024 + }, + { + "epoch": 0.41, + "grad_norm": 0.4506770074367523, + "learning_rate": 4.667433510777872e-06, + "loss": 0.5332, + "step": 1025 + }, + { + "epoch": 0.4104, + "grad_norm": 0.4675017297267914, + "learning_rate": 4.663157944764111e-06, + "loss": 0.5545, + "step": 1026 + }, + { + "epoch": 0.4108, + "grad_norm": 0.47717100381851196, + "learning_rate": 4.658880426592686e-06, + "loss": 0.5616, + "step": 1027 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4532095789909363, + "learning_rate": 4.654600963442665e-06, + "loss": 0.5336, + "step": 1028 + }, + { + "epoch": 0.4116, + "grad_norm": 0.509066641330719, + "learning_rate": 4.65031956249638e-06, + "loss": 0.5933, + "step": 1029 + }, + { + "epoch": 0.412, + "grad_norm": 0.43408674001693726, + "learning_rate": 4.646036230939413e-06, + "loss": 0.4959, + "step": 1030 + }, + { + "epoch": 0.4124, + "grad_norm": 0.5410246849060059, + "learning_rate": 4.641750975960592e-06, + "loss": 0.5755, + "step": 1031 + }, + { + "epoch": 0.4128, + "grad_norm": 0.47988831996917725, + "learning_rate": 4.637463804751969e-06, + "loss": 0.4984, + "step": 1032 + }, + { + "epoch": 0.4132, + "grad_norm": 0.5014576315879822, + "learning_rate": 4.633174724508814e-06, + "loss": 0.5499, + "step": 1033 + }, + { + "epoch": 0.4136, + "grad_norm": 0.5752670764923096, + "learning_rate": 4.628883742429596e-06, + "loss": 0.6064, + "step": 1034 + }, + { + "epoch": 0.414, + "grad_norm": 0.507512092590332, + "learning_rate": 4.624590865715983e-06, + "loss": 0.5523, + "step": 1035 + }, + { + "epoch": 0.4144, + "grad_norm": 0.5002173781394958, + "learning_rate": 4.620296101572819e-06, + "loss": 0.6558, + "step": 1036 + }, + { + "epoch": 0.4148, + "grad_norm": 0.45491892099380493, + "learning_rate": 4.6159994572081155e-06, + "loss": 0.5552, + "step": 1037 + }, + { + "epoch": 0.4152, + "grad_norm": 0.45507368445396423, + "learning_rate": 4.61170093983304e-06, + "loss": 0.4994, + "step": 1038 + }, + { + "epoch": 0.4156, + "grad_norm": 0.5375739932060242, + "learning_rate": 4.607400556661906e-06, + "loss": 0.5885, + "step": 1039 + }, + { + "epoch": 0.416, + "grad_norm": 0.45972853899002075, + "learning_rate": 4.603098314912156e-06, + "loss": 0.626, + "step": 1040 + }, + { + "epoch": 0.4164, + "grad_norm": 0.446978896856308, + "learning_rate": 4.5987942218043484e-06, + "loss": 0.4966, + "step": 1041 + }, + { + "epoch": 0.4168, + "grad_norm": 0.44826751947402954, + "learning_rate": 4.594488284562158e-06, + "loss": 0.5734, + "step": 1042 + }, + { + "epoch": 0.4172, + "grad_norm": 0.4395397901535034, + "learning_rate": 4.590180510412345e-06, + "loss": 0.5354, + "step": 1043 + }, + { + "epoch": 0.4176, + "grad_norm": 0.4548799395561218, + "learning_rate": 4.585870906584758e-06, + "loss": 0.6016, + "step": 1044 + }, + { + "epoch": 0.418, + "grad_norm": 0.46065983176231384, + "learning_rate": 4.581559480312316e-06, + "loss": 0.5366, + "step": 1045 + }, + { + "epoch": 0.4184, + "grad_norm": 0.45573100447654724, + "learning_rate": 4.577246238830995e-06, + "loss": 0.6095, + "step": 1046 + }, + { + "epoch": 0.4188, + "grad_norm": 0.5116010904312134, + "learning_rate": 4.572931189379818e-06, + "loss": 0.5555, + "step": 1047 + }, + { + "epoch": 0.4192, + "grad_norm": 0.472604900598526, + "learning_rate": 4.568614339200843e-06, + "loss": 0.5808, + "step": 1048 + }, + { + "epoch": 0.4196, + "grad_norm": 0.5294572114944458, + "learning_rate": 4.56429569553915e-06, + "loss": 0.574, + "step": 1049 + }, + { + "epoch": 0.42, + "grad_norm": 0.508413553237915, + "learning_rate": 4.559975265642828e-06, + "loss": 0.5414, + "step": 1050 + }, + { + "epoch": 0.4204, + "grad_norm": 0.4205757677555084, + "learning_rate": 4.555653056762965e-06, + "loss": 0.4573, + "step": 1051 + }, + { + "epoch": 0.4208, + "grad_norm": 0.5281642079353333, + "learning_rate": 4.551329076153636e-06, + "loss": 0.5868, + "step": 1052 + }, + { + "epoch": 0.4212, + "grad_norm": 0.46424421668052673, + "learning_rate": 4.547003331071886e-06, + "loss": 0.5842, + "step": 1053 + }, + { + "epoch": 0.4216, + "grad_norm": 0.4419081509113312, + "learning_rate": 4.542675828777725e-06, + "loss": 0.5302, + "step": 1054 + }, + { + "epoch": 0.422, + "grad_norm": 0.46675243973731995, + "learning_rate": 4.538346576534109e-06, + "loss": 0.5809, + "step": 1055 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5093527436256409, + "learning_rate": 4.534015581606934e-06, + "loss": 0.5996, + "step": 1056 + }, + { + "epoch": 0.4228, + "grad_norm": 0.45872142910957336, + "learning_rate": 4.529682851265018e-06, + "loss": 0.5206, + "step": 1057 + }, + { + "epoch": 0.4232, + "grad_norm": 0.4427538514137268, + "learning_rate": 4.525348392780094e-06, + "loss": 0.5077, + "step": 1058 + }, + { + "epoch": 0.4236, + "grad_norm": 0.506925642490387, + "learning_rate": 4.5210122134267925e-06, + "loss": 0.6141, + "step": 1059 + }, + { + "epoch": 0.424, + "grad_norm": 0.5242504477500916, + "learning_rate": 4.516674320482636e-06, + "loss": 0.5984, + "step": 1060 + }, + { + "epoch": 0.4244, + "grad_norm": 0.4801744818687439, + "learning_rate": 4.512334721228021e-06, + "loss": 0.5656, + "step": 1061 + }, + { + "epoch": 0.4248, + "grad_norm": 0.44936510920524597, + "learning_rate": 4.507993422946207e-06, + "loss": 0.5356, + "step": 1062 + }, + { + "epoch": 0.4252, + "grad_norm": 0.5157073140144348, + "learning_rate": 4.503650432923304e-06, + "loss": 0.6277, + "step": 1063 + }, + { + "epoch": 0.4256, + "grad_norm": 0.413225382566452, + "learning_rate": 4.499305758448266e-06, + "loss": 0.4468, + "step": 1064 + }, + { + "epoch": 0.426, + "grad_norm": 0.46837320923805237, + "learning_rate": 4.49495940681287e-06, + "loss": 0.5491, + "step": 1065 + }, + { + "epoch": 0.4264, + "grad_norm": 0.467693567276001, + "learning_rate": 4.490611385311707e-06, + "loss": 0.6004, + "step": 1066 + }, + { + "epoch": 0.4268, + "grad_norm": 0.44910159707069397, + "learning_rate": 4.4862617012421765e-06, + "loss": 0.4998, + "step": 1067 + }, + { + "epoch": 0.4272, + "grad_norm": 0.414615273475647, + "learning_rate": 4.481910361904459e-06, + "loss": 0.5411, + "step": 1068 + }, + { + "epoch": 0.4276, + "grad_norm": 0.48069313168525696, + "learning_rate": 4.477557374601523e-06, + "loss": 0.5788, + "step": 1069 + }, + { + "epoch": 0.428, + "grad_norm": 0.45842450857162476, + "learning_rate": 4.473202746639095e-06, + "loss": 0.5494, + "step": 1070 + }, + { + "epoch": 0.4284, + "grad_norm": 0.48641544580459595, + "learning_rate": 4.468846485325661e-06, + "loss": 0.5893, + "step": 1071 + }, + { + "epoch": 0.4288, + "grad_norm": 0.458332896232605, + "learning_rate": 4.464488597972443e-06, + "loss": 0.5662, + "step": 1072 + }, + { + "epoch": 0.4292, + "grad_norm": 0.5082148909568787, + "learning_rate": 4.460129091893396e-06, + "loss": 0.664, + "step": 1073 + }, + { + "epoch": 0.4296, + "grad_norm": 0.4697319269180298, + "learning_rate": 4.455767974405191e-06, + "loss": 0.6069, + "step": 1074 + }, + { + "epoch": 0.43, + "grad_norm": 0.45500311255455017, + "learning_rate": 4.4514052528272e-06, + "loss": 0.559, + "step": 1075 + }, + { + "epoch": 0.4304, + "grad_norm": 0.4315764904022217, + "learning_rate": 4.447040934481493e-06, + "loss": 0.4619, + "step": 1076 + }, + { + "epoch": 0.4308, + "grad_norm": 0.4703432619571686, + "learning_rate": 4.442675026692815e-06, + "loss": 0.5703, + "step": 1077 + }, + { + "epoch": 0.4312, + "grad_norm": 0.49389466643333435, + "learning_rate": 4.438307536788581e-06, + "loss": 0.5811, + "step": 1078 + }, + { + "epoch": 0.4316, + "grad_norm": 0.4419156610965729, + "learning_rate": 4.433938472098861e-06, + "loss": 0.5763, + "step": 1079 + }, + { + "epoch": 0.432, + "grad_norm": 0.5040791034698486, + "learning_rate": 4.429567839956368e-06, + "loss": 0.5424, + "step": 1080 + }, + { + "epoch": 0.4324, + "grad_norm": 0.5160146951675415, + "learning_rate": 4.4251956476964445e-06, + "loss": 0.584, + "step": 1081 + }, + { + "epoch": 0.4328, + "grad_norm": 0.4821451008319855, + "learning_rate": 4.420821902657055e-06, + "loss": 0.5395, + "step": 1082 + }, + { + "epoch": 0.4332, + "grad_norm": 0.5089398622512817, + "learning_rate": 4.416446612178762e-06, + "loss": 0.6124, + "step": 1083 + }, + { + "epoch": 0.4336, + "grad_norm": 0.4727106988430023, + "learning_rate": 4.412069783604733e-06, + "loss": 0.512, + "step": 1084 + }, + { + "epoch": 0.434, + "grad_norm": 0.5604661703109741, + "learning_rate": 4.407691424280708e-06, + "loss": 0.6138, + "step": 1085 + }, + { + "epoch": 0.4344, + "grad_norm": 0.47186407446861267, + "learning_rate": 4.403311541555e-06, + "loss": 0.5063, + "step": 1086 + }, + { + "epoch": 0.4348, + "grad_norm": 0.4935065805912018, + "learning_rate": 4.398930142778477e-06, + "loss": 0.5855, + "step": 1087 + }, + { + "epoch": 0.4352, + "grad_norm": 0.49573445320129395, + "learning_rate": 4.394547235304554e-06, + "loss": 0.5524, + "step": 1088 + }, + { + "epoch": 0.4356, + "grad_norm": 0.484480082988739, + "learning_rate": 4.390162826489176e-06, + "loss": 0.6309, + "step": 1089 + }, + { + "epoch": 0.436, + "grad_norm": 0.4660501182079315, + "learning_rate": 4.385776923690807e-06, + "loss": 0.5437, + "step": 1090 + }, + { + "epoch": 0.4364, + "grad_norm": 0.4966036081314087, + "learning_rate": 4.381389534270421e-06, + "loss": 0.5881, + "step": 1091 + }, + { + "epoch": 0.4368, + "grad_norm": 0.50444096326828, + "learning_rate": 4.377000665591484e-06, + "loss": 0.5477, + "step": 1092 + }, + { + "epoch": 0.4372, + "grad_norm": 0.4618821442127228, + "learning_rate": 4.37261032501995e-06, + "loss": 0.5197, + "step": 1093 + }, + { + "epoch": 0.4376, + "grad_norm": 0.5184645056724548, + "learning_rate": 4.368218519924235e-06, + "loss": 0.4874, + "step": 1094 + }, + { + "epoch": 0.438, + "grad_norm": 0.4814774990081787, + "learning_rate": 4.363825257675219e-06, + "loss": 0.4983, + "step": 1095 + }, + { + "epoch": 0.4384, + "grad_norm": 0.4355922341346741, + "learning_rate": 4.359430545646229e-06, + "loss": 0.5032, + "step": 1096 + }, + { + "epoch": 0.4388, + "grad_norm": 0.5241148471832275, + "learning_rate": 4.355034391213018e-06, + "loss": 0.5928, + "step": 1097 + }, + { + "epoch": 0.4392, + "grad_norm": 0.4663231372833252, + "learning_rate": 4.350636801753768e-06, + "loss": 0.5468, + "step": 1098 + }, + { + "epoch": 0.4396, + "grad_norm": 0.4439452886581421, + "learning_rate": 4.346237784649063e-06, + "loss": 0.5004, + "step": 1099 + }, + { + "epoch": 0.44, + "grad_norm": 0.5504818558692932, + "learning_rate": 4.341837347281888e-06, + "loss": 0.5775, + "step": 1100 + }, + { + "epoch": 0.4404, + "grad_norm": 0.45938974618911743, + "learning_rate": 4.337435497037606e-06, + "loss": 0.5596, + "step": 1101 + }, + { + "epoch": 0.4408, + "grad_norm": 0.46923092007637024, + "learning_rate": 4.333032241303958e-06, + "loss": 0.5178, + "step": 1102 + }, + { + "epoch": 0.4412, + "grad_norm": 0.48642316460609436, + "learning_rate": 4.328627587471039e-06, + "loss": 0.5191, + "step": 1103 + }, + { + "epoch": 0.4416, + "grad_norm": 0.5553264617919922, + "learning_rate": 4.324221542931292e-06, + "loss": 0.7238, + "step": 1104 + }, + { + "epoch": 0.442, + "grad_norm": 0.5312342047691345, + "learning_rate": 4.319814115079493e-06, + "loss": 0.5319, + "step": 1105 + }, + { + "epoch": 0.4424, + "grad_norm": 0.5320695638656616, + "learning_rate": 4.315405311312743e-06, + "loss": 0.6097, + "step": 1106 + }, + { + "epoch": 0.4428, + "grad_norm": 0.4854208827018738, + "learning_rate": 4.3109951390304484e-06, + "loss": 0.5997, + "step": 1107 + }, + { + "epoch": 0.4432, + "grad_norm": 0.45754969120025635, + "learning_rate": 4.306583605634313e-06, + "loss": 0.6103, + "step": 1108 + }, + { + "epoch": 0.4436, + "grad_norm": 0.47233521938323975, + "learning_rate": 4.3021707185283274e-06, + "loss": 0.5375, + "step": 1109 + }, + { + "epoch": 0.444, + "grad_norm": 0.5084165930747986, + "learning_rate": 4.2977564851187525e-06, + "loss": 0.5823, + "step": 1110 + }, + { + "epoch": 0.4444, + "grad_norm": 0.47002092003822327, + "learning_rate": 4.293340912814108e-06, + "loss": 0.5562, + "step": 1111 + }, + { + "epoch": 0.4448, + "grad_norm": 0.47349244356155396, + "learning_rate": 4.288924009025162e-06, + "loss": 0.5875, + "step": 1112 + }, + { + "epoch": 0.4452, + "grad_norm": 0.4687560796737671, + "learning_rate": 4.284505781164917e-06, + "loss": 0.4749, + "step": 1113 + }, + { + "epoch": 0.4456, + "grad_norm": 0.4924432635307312, + "learning_rate": 4.280086236648599e-06, + "loss": 0.5515, + "step": 1114 + }, + { + "epoch": 0.446, + "grad_norm": 0.4624880850315094, + "learning_rate": 4.2756653828936405e-06, + "loss": 0.563, + "step": 1115 + }, + { + "epoch": 0.4464, + "grad_norm": 0.4840562343597412, + "learning_rate": 4.271243227319673e-06, + "loss": 0.4772, + "step": 1116 + }, + { + "epoch": 0.4468, + "grad_norm": 0.46091228723526, + "learning_rate": 4.266819777348514e-06, + "loss": 0.559, + "step": 1117 + }, + { + "epoch": 0.4472, + "grad_norm": 0.4594923257827759, + "learning_rate": 4.262395040404152e-06, + "loss": 0.5677, + "step": 1118 + }, + { + "epoch": 0.4476, + "grad_norm": 0.47065865993499756, + "learning_rate": 4.257969023912735e-06, + "loss": 0.4923, + "step": 1119 + }, + { + "epoch": 0.448, + "grad_norm": 0.475721538066864, + "learning_rate": 4.253541735302562e-06, + "loss": 0.5119, + "step": 1120 + }, + { + "epoch": 0.4484, + "grad_norm": 0.5129873752593994, + "learning_rate": 4.249113182004063e-06, + "loss": 0.5995, + "step": 1121 + }, + { + "epoch": 0.4488, + "grad_norm": 0.48000508546829224, + "learning_rate": 4.244683371449792e-06, + "loss": 0.5229, + "step": 1122 + }, + { + "epoch": 0.4492, + "grad_norm": 0.46500471234321594, + "learning_rate": 4.240252311074412e-06, + "loss": 0.585, + "step": 1123 + }, + { + "epoch": 0.4496, + "grad_norm": 0.46694400906562805, + "learning_rate": 4.235820008314688e-06, + "loss": 0.5224, + "step": 1124 + }, + { + "epoch": 0.45, + "grad_norm": 0.4840046167373657, + "learning_rate": 4.231386470609463e-06, + "loss": 0.586, + "step": 1125 + }, + { + "epoch": 0.4504, + "grad_norm": 0.48576870560646057, + "learning_rate": 4.226951705399659e-06, + "loss": 0.5795, + "step": 1126 + }, + { + "epoch": 0.4508, + "grad_norm": 0.481308251619339, + "learning_rate": 4.222515720128254e-06, + "loss": 0.4913, + "step": 1127 + }, + { + "epoch": 0.4512, + "grad_norm": 0.46901413798332214, + "learning_rate": 4.218078522240276e-06, + "loss": 0.5488, + "step": 1128 + }, + { + "epoch": 0.4516, + "grad_norm": 0.51523357629776, + "learning_rate": 4.21364011918279e-06, + "loss": 0.5679, + "step": 1129 + }, + { + "epoch": 0.452, + "grad_norm": 0.5236387252807617, + "learning_rate": 4.209200518404876e-06, + "loss": 0.5059, + "step": 1130 + }, + { + "epoch": 0.4524, + "grad_norm": 0.4860883355140686, + "learning_rate": 4.2047597273576335e-06, + "loss": 0.6122, + "step": 1131 + }, + { + "epoch": 0.4528, + "grad_norm": 0.5277671217918396, + "learning_rate": 4.200317753494152e-06, + "loss": 0.5522, + "step": 1132 + }, + { + "epoch": 0.4532, + "grad_norm": 0.45829376578330994, + "learning_rate": 4.195874604269514e-06, + "loss": 0.5819, + "step": 1133 + }, + { + "epoch": 0.4536, + "grad_norm": 0.45748522877693176, + "learning_rate": 4.1914302871407655e-06, + "loss": 0.5398, + "step": 1134 + }, + { + "epoch": 0.454, + "grad_norm": 0.407242089509964, + "learning_rate": 4.186984809566921e-06, + "loss": 0.4597, + "step": 1135 + }, + { + "epoch": 0.4544, + "grad_norm": 0.46401548385620117, + "learning_rate": 4.182538179008938e-06, + "loss": 0.5768, + "step": 1136 + }, + { + "epoch": 0.4548, + "grad_norm": 0.44726067781448364, + "learning_rate": 4.178090402929709e-06, + "loss": 0.5159, + "step": 1137 + }, + { + "epoch": 0.4552, + "grad_norm": 0.4692462086677551, + "learning_rate": 4.173641488794052e-06, + "loss": 0.569, + "step": 1138 + }, + { + "epoch": 0.4556, + "grad_norm": 0.47518929839134216, + "learning_rate": 4.16919144406869e-06, + "loss": 0.6003, + "step": 1139 + }, + { + "epoch": 0.456, + "grad_norm": 0.4495368003845215, + "learning_rate": 4.164740276222249e-06, + "loss": 0.5276, + "step": 1140 + }, + { + "epoch": 0.4564, + "grad_norm": 0.4562149941921234, + "learning_rate": 4.160287992725237e-06, + "loss": 0.4661, + "step": 1141 + }, + { + "epoch": 0.4568, + "grad_norm": 0.469957560300827, + "learning_rate": 4.1558346010500365e-06, + "loss": 0.6068, + "step": 1142 + }, + { + "epoch": 0.4572, + "grad_norm": 0.47348180413246155, + "learning_rate": 4.1513801086708865e-06, + "loss": 0.5595, + "step": 1143 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5264531970024109, + "learning_rate": 4.146924523063875e-06, + "loss": 0.5726, + "step": 1144 + }, + { + "epoch": 0.458, + "grad_norm": 0.41196539998054504, + "learning_rate": 4.142467851706926e-06, + "loss": 0.4739, + "step": 1145 + }, + { + "epoch": 0.4584, + "grad_norm": 0.5362353920936584, + "learning_rate": 4.138010102079785e-06, + "loss": 0.5435, + "step": 1146 + }, + { + "epoch": 0.4588, + "grad_norm": 0.5249359011650085, + "learning_rate": 4.133551281664007e-06, + "loss": 0.5133, + "step": 1147 + }, + { + "epoch": 0.4592, + "grad_norm": 0.47211042046546936, + "learning_rate": 4.129091397942944e-06, + "loss": 0.4308, + "step": 1148 + }, + { + "epoch": 0.4596, + "grad_norm": 0.470157265663147, + "learning_rate": 4.124630458401732e-06, + "loss": 0.499, + "step": 1149 + }, + { + "epoch": 0.46, + "grad_norm": 0.4542332589626312, + "learning_rate": 4.12016847052728e-06, + "loss": 0.5114, + "step": 1150 + }, + { + "epoch": 0.4604, + "grad_norm": 0.5682739615440369, + "learning_rate": 4.115705441808256e-06, + "loss": 0.5619, + "step": 1151 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5314604043960571, + "learning_rate": 4.111241379735075e-06, + "loss": 0.5565, + "step": 1152 + }, + { + "epoch": 0.4612, + "grad_norm": 0.47330886125564575, + "learning_rate": 4.106776291799887e-06, + "loss": 0.5615, + "step": 1153 + }, + { + "epoch": 0.4616, + "grad_norm": 0.4496110677719116, + "learning_rate": 4.1023101854965626e-06, + "loss": 0.5194, + "step": 1154 + }, + { + "epoch": 0.462, + "grad_norm": 0.49995559453964233, + "learning_rate": 4.097843068320681e-06, + "loss": 0.4888, + "step": 1155 + }, + { + "epoch": 0.4624, + "grad_norm": 0.4480656087398529, + "learning_rate": 4.0933749477695205e-06, + "loss": 0.5536, + "step": 1156 + }, + { + "epoch": 0.4628, + "grad_norm": 0.4266318082809448, + "learning_rate": 4.088905831342042e-06, + "loss": 0.4873, + "step": 1157 + }, + { + "epoch": 0.4632, + "grad_norm": 0.48847833275794983, + "learning_rate": 4.084435726538876e-06, + "loss": 0.5844, + "step": 1158 + }, + { + "epoch": 0.4636, + "grad_norm": 0.541998028755188, + "learning_rate": 4.079964640862315e-06, + "loss": 0.528, + "step": 1159 + }, + { + "epoch": 0.464, + "grad_norm": 0.5314731001853943, + "learning_rate": 4.075492581816295e-06, + "loss": 0.6334, + "step": 1160 + }, + { + "epoch": 0.4644, + "grad_norm": 0.47492387890815735, + "learning_rate": 4.071019556906387e-06, + "loss": 0.5644, + "step": 1161 + }, + { + "epoch": 0.4648, + "grad_norm": 0.40192219614982605, + "learning_rate": 4.066545573639785e-06, + "loss": 0.4229, + "step": 1162 + }, + { + "epoch": 0.4652, + "grad_norm": 0.5023147463798523, + "learning_rate": 4.062070639525285e-06, + "loss": 0.5446, + "step": 1163 + }, + { + "epoch": 0.4656, + "grad_norm": 0.44182491302490234, + "learning_rate": 4.057594762073288e-06, + "loss": 0.5071, + "step": 1164 + }, + { + "epoch": 0.466, + "grad_norm": 0.45913881063461304, + "learning_rate": 4.053117948795769e-06, + "loss": 0.5553, + "step": 1165 + }, + { + "epoch": 0.4664, + "grad_norm": 0.4556095004081726, + "learning_rate": 4.048640207206283e-06, + "loss": 0.5374, + "step": 1166 + }, + { + "epoch": 0.4668, + "grad_norm": 0.517315149307251, + "learning_rate": 4.044161544819933e-06, + "loss": 0.6206, + "step": 1167 + }, + { + "epoch": 0.4672, + "grad_norm": 0.461018830537796, + "learning_rate": 4.0396819691533746e-06, + "loss": 0.5424, + "step": 1168 + }, + { + "epoch": 0.4676, + "grad_norm": 0.4887098968029022, + "learning_rate": 4.035201487724794e-06, + "loss": 0.5999, + "step": 1169 + }, + { + "epoch": 0.468, + "grad_norm": 0.4762400984764099, + "learning_rate": 4.0307201080538974e-06, + "loss": 0.5276, + "step": 1170 + }, + { + "epoch": 0.4684, + "grad_norm": 0.5469877123832703, + "learning_rate": 4.026237837661899e-06, + "loss": 0.6401, + "step": 1171 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4401659071445465, + "learning_rate": 4.021754684071506e-06, + "loss": 0.4994, + "step": 1172 + }, + { + "epoch": 0.4692, + "grad_norm": 0.49329283833503723, + "learning_rate": 4.0172706548069125e-06, + "loss": 0.5442, + "step": 1173 + }, + { + "epoch": 0.4696, + "grad_norm": 0.4742457866668701, + "learning_rate": 4.012785757393776e-06, + "loss": 0.6106, + "step": 1174 + }, + { + "epoch": 0.47, + "grad_norm": 0.4602060914039612, + "learning_rate": 4.008299999359216e-06, + "loss": 0.572, + "step": 1175 + }, + { + "epoch": 0.4704, + "grad_norm": 0.4688628315925598, + "learning_rate": 4.003813388231794e-06, + "loss": 0.546, + "step": 1176 + }, + { + "epoch": 0.4708, + "grad_norm": 0.5030138492584229, + "learning_rate": 3.999325931541505e-06, + "loss": 0.5704, + "step": 1177 + }, + { + "epoch": 0.4712, + "grad_norm": 0.4299745559692383, + "learning_rate": 3.994837636819762e-06, + "loss": 0.5594, + "step": 1178 + }, + { + "epoch": 0.4716, + "grad_norm": 0.5180333852767944, + "learning_rate": 3.9903485115993834e-06, + "loss": 0.5891, + "step": 1179 + }, + { + "epoch": 0.472, + "grad_norm": 0.48325803875923157, + "learning_rate": 3.9858585634145845e-06, + "loss": 0.5215, + "step": 1180 + }, + { + "epoch": 0.4724, + "grad_norm": 0.45841389894485474, + "learning_rate": 3.981367799800956e-06, + "loss": 0.6235, + "step": 1181 + }, + { + "epoch": 0.4728, + "grad_norm": 0.4524870216846466, + "learning_rate": 3.976876228295466e-06, + "loss": 0.5418, + "step": 1182 + }, + { + "epoch": 0.4732, + "grad_norm": 0.509226381778717, + "learning_rate": 3.9723838564364305e-06, + "loss": 0.5977, + "step": 1183 + }, + { + "epoch": 0.4736, + "grad_norm": 0.4991690516471863, + "learning_rate": 3.967890691763513e-06, + "loss": 0.5788, + "step": 1184 + }, + { + "epoch": 0.474, + "grad_norm": 0.5439135432243347, + "learning_rate": 3.963396741817706e-06, + "loss": 0.5784, + "step": 1185 + }, + { + "epoch": 0.4744, + "grad_norm": 0.4289707839488983, + "learning_rate": 3.958902014141321e-06, + "loss": 0.5169, + "step": 1186 + }, + { + "epoch": 0.4748, + "grad_norm": 0.5074586272239685, + "learning_rate": 3.954406516277973e-06, + "loss": 0.5715, + "step": 1187 + }, + { + "epoch": 0.4752, + "grad_norm": 0.4507099390029907, + "learning_rate": 3.949910255772572e-06, + "loss": 0.6324, + "step": 1188 + }, + { + "epoch": 0.4756, + "grad_norm": 0.42596912384033203, + "learning_rate": 3.945413240171307e-06, + "loss": 0.5326, + "step": 1189 + }, + { + "epoch": 0.476, + "grad_norm": 0.4830462336540222, + "learning_rate": 3.940915477021632e-06, + "loss": 0.6064, + "step": 1190 + }, + { + "epoch": 0.4764, + "grad_norm": 0.5114102363586426, + "learning_rate": 3.93641697387226e-06, + "loss": 0.5712, + "step": 1191 + }, + { + "epoch": 0.4768, + "grad_norm": 0.5248522162437439, + "learning_rate": 3.931917738273142e-06, + "loss": 0.6312, + "step": 1192 + }, + { + "epoch": 0.4772, + "grad_norm": 0.48439136147499084, + "learning_rate": 3.927417777775461e-06, + "loss": 0.5228, + "step": 1193 + }, + { + "epoch": 0.4776, + "grad_norm": 0.48873478174209595, + "learning_rate": 3.922917099931615e-06, + "loss": 0.5546, + "step": 1194 + }, + { + "epoch": 0.478, + "grad_norm": 0.48473060131073, + "learning_rate": 3.918415712295206e-06, + "loss": 0.5022, + "step": 1195 + }, + { + "epoch": 0.4784, + "grad_norm": 0.48407095670700073, + "learning_rate": 3.913913622421029e-06, + "loss": 0.547, + "step": 1196 + }, + { + "epoch": 0.4788, + "grad_norm": 0.4828053116798401, + "learning_rate": 3.9094108378650566e-06, + "loss": 0.5441, + "step": 1197 + }, + { + "epoch": 0.4792, + "grad_norm": 0.4596104919910431, + "learning_rate": 3.904907366184425e-06, + "loss": 0.5507, + "step": 1198 + }, + { + "epoch": 0.4796, + "grad_norm": 0.47510096430778503, + "learning_rate": 3.900403214937428e-06, + "loss": 0.5972, + "step": 1199 + }, + { + "epoch": 0.48, + "grad_norm": 0.5075947642326355, + "learning_rate": 3.8958983916834955e-06, + "loss": 0.5585, + "step": 1200 + }, + { + "epoch": 0.4804, + "grad_norm": 0.5023922920227051, + "learning_rate": 3.891392903983188e-06, + "loss": 0.5718, + "step": 1201 + }, + { + "epoch": 0.4808, + "grad_norm": 0.5040879249572754, + "learning_rate": 3.886886759398181e-06, + "loss": 0.4281, + "step": 1202 + }, + { + "epoch": 0.4812, + "grad_norm": 0.5060406923294067, + "learning_rate": 3.882379965491252e-06, + "loss": 0.5222, + "step": 1203 + }, + { + "epoch": 0.4816, + "grad_norm": 0.47041571140289307, + "learning_rate": 3.877872529826268e-06, + "loss": 0.5498, + "step": 1204 + }, + { + "epoch": 0.482, + "grad_norm": 0.45117494463920593, + "learning_rate": 3.873364459968172e-06, + "loss": 0.5662, + "step": 1205 + }, + { + "epoch": 0.4824, + "grad_norm": 0.4873691201210022, + "learning_rate": 3.8688557634829766e-06, + "loss": 0.5611, + "step": 1206 + }, + { + "epoch": 0.4828, + "grad_norm": 0.47428447008132935, + "learning_rate": 3.8643464479377375e-06, + "loss": 0.528, + "step": 1207 + }, + { + "epoch": 0.4832, + "grad_norm": 0.4737657606601715, + "learning_rate": 3.859836520900556e-06, + "loss": 0.4435, + "step": 1208 + }, + { + "epoch": 0.4836, + "grad_norm": 0.5013840794563293, + "learning_rate": 3.855325989940559e-06, + "loss": 0.5217, + "step": 1209 + }, + { + "epoch": 0.484, + "grad_norm": 0.46919554471969604, + "learning_rate": 3.850814862627884e-06, + "loss": 0.6178, + "step": 1210 + }, + { + "epoch": 0.4844, + "grad_norm": 0.49590057134628296, + "learning_rate": 3.8463031465336715e-06, + "loss": 0.5874, + "step": 1211 + }, + { + "epoch": 0.4848, + "grad_norm": 0.47950413823127747, + "learning_rate": 3.84179084923005e-06, + "loss": 0.5478, + "step": 1212 + }, + { + "epoch": 0.4852, + "grad_norm": 0.5121056437492371, + "learning_rate": 3.837277978290124e-06, + "loss": 0.5161, + "step": 1213 + }, + { + "epoch": 0.4856, + "grad_norm": 0.44547930359840393, + "learning_rate": 3.832764541287958e-06, + "loss": 0.5022, + "step": 1214 + }, + { + "epoch": 0.486, + "grad_norm": 0.506198525428772, + "learning_rate": 3.828250545798571e-06, + "loss": 0.5883, + "step": 1215 + }, + { + "epoch": 0.4864, + "grad_norm": 0.4669421911239624, + "learning_rate": 3.823735999397913e-06, + "loss": 0.492, + "step": 1216 + }, + { + "epoch": 0.4868, + "grad_norm": 0.46411246061325073, + "learning_rate": 3.819220909662867e-06, + "loss": 0.5047, + "step": 1217 + }, + { + "epoch": 0.4872, + "grad_norm": 0.5031952261924744, + "learning_rate": 3.814705284171221e-06, + "loss": 0.5952, + "step": 1218 + }, + { + "epoch": 0.4876, + "grad_norm": 0.5160549879074097, + "learning_rate": 3.8101891305016643e-06, + "loss": 0.5459, + "step": 1219 + }, + { + "epoch": 0.488, + "grad_norm": 0.5140947699546814, + "learning_rate": 3.8056724562337738e-06, + "loss": 0.6668, + "step": 1220 + }, + { + "epoch": 0.4884, + "grad_norm": 0.4860072433948517, + "learning_rate": 3.8011552689479985e-06, + "loss": 0.5133, + "step": 1221 + }, + { + "epoch": 0.4888, + "grad_norm": 0.4725138247013092, + "learning_rate": 3.79663757622565e-06, + "loss": 0.5731, + "step": 1222 + }, + { + "epoch": 0.4892, + "grad_norm": 0.5040252804756165, + "learning_rate": 3.7921193856488865e-06, + "loss": 0.585, + "step": 1223 + }, + { + "epoch": 0.4896, + "grad_norm": 0.4506980776786804, + "learning_rate": 3.787600704800702e-06, + "loss": 0.5543, + "step": 1224 + }, + { + "epoch": 0.49, + "grad_norm": 0.49135822057724, + "learning_rate": 3.783081541264915e-06, + "loss": 0.5531, + "step": 1225 + }, + { + "epoch": 0.4904, + "grad_norm": 0.48195192217826843, + "learning_rate": 3.778561902626152e-06, + "loss": 0.5561, + "step": 1226 + }, + { + "epoch": 0.4908, + "grad_norm": 0.4301593601703644, + "learning_rate": 3.7740417964698388e-06, + "loss": 0.4825, + "step": 1227 + }, + { + "epoch": 0.4912, + "grad_norm": 0.4790424704551697, + "learning_rate": 3.769521230382185e-06, + "loss": 0.5155, + "step": 1228 + }, + { + "epoch": 0.4916, + "grad_norm": 0.4837892949581146, + "learning_rate": 3.7650002119501715e-06, + "loss": 0.5496, + "step": 1229 + }, + { + "epoch": 0.492, + "grad_norm": 0.4896257817745209, + "learning_rate": 3.7604787487615388e-06, + "loss": 0.5036, + "step": 1230 + }, + { + "epoch": 0.4924, + "grad_norm": 0.4826231896877289, + "learning_rate": 3.755956848404774e-06, + "loss": 0.6042, + "step": 1231 + }, + { + "epoch": 0.4928, + "grad_norm": 0.49948158860206604, + "learning_rate": 3.751434518469099e-06, + "loss": 0.5179, + "step": 1232 + }, + { + "epoch": 0.4932, + "grad_norm": 0.5269796252250671, + "learning_rate": 3.746911766544454e-06, + "loss": 0.5896, + "step": 1233 + }, + { + "epoch": 0.4936, + "grad_norm": 0.5144227743148804, + "learning_rate": 3.7423886002214895e-06, + "loss": 0.5896, + "step": 1234 + }, + { + "epoch": 0.494, + "grad_norm": 0.47878497838974, + "learning_rate": 3.737865027091551e-06, + "loss": 0.5521, + "step": 1235 + }, + { + "epoch": 0.4944, + "grad_norm": 0.4803354740142822, + "learning_rate": 3.7333410547466667e-06, + "loss": 0.5986, + "step": 1236 + }, + { + "epoch": 0.4948, + "grad_norm": 0.48093023896217346, + "learning_rate": 3.7288166907795362e-06, + "loss": 0.566, + "step": 1237 + }, + { + "epoch": 0.4952, + "grad_norm": 0.43498966097831726, + "learning_rate": 3.7242919427835117e-06, + "loss": 0.5302, + "step": 1238 + }, + { + "epoch": 0.4956, + "grad_norm": 0.4801994264125824, + "learning_rate": 3.719766818352597e-06, + "loss": 0.5635, + "step": 1239 + }, + { + "epoch": 0.496, + "grad_norm": 0.4880640208721161, + "learning_rate": 3.715241325081422e-06, + "loss": 0.6138, + "step": 1240 + }, + { + "epoch": 0.4964, + "grad_norm": 0.5129235982894897, + "learning_rate": 3.7107154705652373e-06, + "loss": 0.5623, + "step": 1241 + }, + { + "epoch": 0.4968, + "grad_norm": 0.45849037170410156, + "learning_rate": 3.7061892623999008e-06, + "loss": 0.5122, + "step": 1242 + }, + { + "epoch": 0.4972, + "grad_norm": 0.48277243971824646, + "learning_rate": 3.701662708181863e-06, + "loss": 0.6143, + "step": 1243 + }, + { + "epoch": 0.4976, + "grad_norm": 0.45651403069496155, + "learning_rate": 3.6971358155081555e-06, + "loss": 0.6106, + "step": 1244 + }, + { + "epoch": 0.498, + "grad_norm": 0.46248659491539, + "learning_rate": 3.692608591976377e-06, + "loss": 0.5849, + "step": 1245 + }, + { + "epoch": 0.4984, + "grad_norm": 0.4687730371952057, + "learning_rate": 3.688081045184684e-06, + "loss": 0.544, + "step": 1246 + }, + { + "epoch": 0.4988, + "grad_norm": 0.48340800404548645, + "learning_rate": 3.683553182731771e-06, + "loss": 0.5622, + "step": 1247 + }, + { + "epoch": 0.4992, + "grad_norm": 0.48194611072540283, + "learning_rate": 3.6790250122168688e-06, + "loss": 0.5405, + "step": 1248 + }, + { + "epoch": 0.4996, + "grad_norm": 0.5296016335487366, + "learning_rate": 3.6744965412397173e-06, + "loss": 0.551, + "step": 1249 + }, + { + "epoch": 0.5, + "grad_norm": 0.5403719544410706, + "learning_rate": 3.6699677774005694e-06, + "loss": 0.5956, + "step": 1250 + }, + { + "epoch": 0.5004, + "grad_norm": 0.4566706120967865, + "learning_rate": 3.6654387283001618e-06, + "loss": 0.5344, + "step": 1251 + }, + { + "epoch": 0.5008, + "grad_norm": 0.5087108612060547, + "learning_rate": 3.660909401539714e-06, + "loss": 0.5832, + "step": 1252 + }, + { + "epoch": 0.5012, + "grad_norm": 0.49442917108535767, + "learning_rate": 3.656379804720912e-06, + "loss": 0.5847, + "step": 1253 + }, + { + "epoch": 0.5016, + "grad_norm": 0.5028738379478455, + "learning_rate": 3.6518499454458906e-06, + "loss": 0.5638, + "step": 1254 + }, + { + "epoch": 0.502, + "grad_norm": 0.5300218462944031, + "learning_rate": 3.6473198313172323e-06, + "loss": 0.5987, + "step": 1255 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5029107928276062, + "learning_rate": 3.64278946993794e-06, + "loss": 0.6024, + "step": 1256 + }, + { + "epoch": 0.5028, + "grad_norm": 0.44316819310188293, + "learning_rate": 3.6382588689114374e-06, + "loss": 0.5317, + "step": 1257 + }, + { + "epoch": 0.5032, + "grad_norm": 0.4869939088821411, + "learning_rate": 3.6337280358415456e-06, + "loss": 0.5794, + "step": 1258 + }, + { + "epoch": 0.5036, + "grad_norm": 0.48294582962989807, + "learning_rate": 3.6291969783324784e-06, + "loss": 0.557, + "step": 1259 + }, + { + "epoch": 0.504, + "grad_norm": 0.4743523597717285, + "learning_rate": 3.624665703988825e-06, + "loss": 0.6022, + "step": 1260 + }, + { + "epoch": 0.5044, + "grad_norm": 0.48669537901878357, + "learning_rate": 3.620134220415539e-06, + "loss": 0.5158, + "step": 1261 + }, + { + "epoch": 0.5048, + "grad_norm": 0.5077384114265442, + "learning_rate": 3.615602535217925e-06, + "loss": 0.5877, + "step": 1262 + }, + { + "epoch": 0.5052, + "grad_norm": 0.45598161220550537, + "learning_rate": 3.6110706560016253e-06, + "loss": 0.4984, + "step": 1263 + }, + { + "epoch": 0.5056, + "grad_norm": 0.46461182832717896, + "learning_rate": 3.6065385903726103e-06, + "loss": 0.5788, + "step": 1264 + }, + { + "epoch": 0.506, + "grad_norm": 0.4900059998035431, + "learning_rate": 3.6020063459371594e-06, + "loss": 0.5111, + "step": 1265 + }, + { + "epoch": 0.5064, + "grad_norm": 0.45117202401161194, + "learning_rate": 3.597473930301854e-06, + "loss": 0.4874, + "step": 1266 + }, + { + "epoch": 0.5068, + "grad_norm": 0.5490992665290833, + "learning_rate": 3.592941351073564e-06, + "loss": 0.5638, + "step": 1267 + }, + { + "epoch": 0.5072, + "grad_norm": 0.48065051436424255, + "learning_rate": 3.588408615859433e-06, + "loss": 0.6104, + "step": 1268 + }, + { + "epoch": 0.5076, + "grad_norm": 0.5219455361366272, + "learning_rate": 3.5838757322668647e-06, + "loss": 0.524, + "step": 1269 + }, + { + "epoch": 0.508, + "grad_norm": 0.5040216445922852, + "learning_rate": 3.5793427079035146e-06, + "loss": 0.5249, + "step": 1270 + }, + { + "epoch": 0.5084, + "grad_norm": 0.4984973669052124, + "learning_rate": 3.5748095503772737e-06, + "loss": 0.5192, + "step": 1271 + }, + { + "epoch": 0.5088, + "grad_norm": 0.5044708847999573, + "learning_rate": 3.5702762672962548e-06, + "loss": 0.5929, + "step": 1272 + }, + { + "epoch": 0.5092, + "grad_norm": 0.566099226474762, + "learning_rate": 3.5657428662687833e-06, + "loss": 0.5235, + "step": 1273 + }, + { + "epoch": 0.5096, + "grad_norm": 0.49442431330680847, + "learning_rate": 3.5612093549033803e-06, + "loss": 0.602, + "step": 1274 + }, + { + "epoch": 0.51, + "grad_norm": 0.4640299379825592, + "learning_rate": 3.5566757408087554e-06, + "loss": 0.5834, + "step": 1275 + }, + { + "epoch": 0.5104, + "grad_norm": 0.42226120829582214, + "learning_rate": 3.5521420315937883e-06, + "loss": 0.4978, + "step": 1276 + }, + { + "epoch": 0.5108, + "grad_norm": 0.4852173626422882, + "learning_rate": 3.5476082348675186e-06, + "loss": 0.5135, + "step": 1277 + }, + { + "epoch": 0.5112, + "grad_norm": 0.4628143310546875, + "learning_rate": 3.543074358239133e-06, + "loss": 0.4778, + "step": 1278 + }, + { + "epoch": 0.5116, + "grad_norm": 0.4509110152721405, + "learning_rate": 3.538540409317954e-06, + "loss": 0.5291, + "step": 1279 + }, + { + "epoch": 0.512, + "grad_norm": 0.4843462109565735, + "learning_rate": 3.5340063957134207e-06, + "loss": 0.6027, + "step": 1280 + }, + { + "epoch": 0.5124, + "grad_norm": 0.49010783433914185, + "learning_rate": 3.529472325035087e-06, + "loss": 0.549, + "step": 1281 + }, + { + "epoch": 0.5128, + "grad_norm": 0.5032756328582764, + "learning_rate": 3.524938204892598e-06, + "loss": 0.7043, + "step": 1282 + }, + { + "epoch": 0.5132, + "grad_norm": 0.4765693247318268, + "learning_rate": 3.5204040428956817e-06, + "loss": 0.5727, + "step": 1283 + }, + { + "epoch": 0.5136, + "grad_norm": 0.42829883098602295, + "learning_rate": 3.515869846654141e-06, + "loss": 0.5099, + "step": 1284 + }, + { + "epoch": 0.514, + "grad_norm": 0.4168989360332489, + "learning_rate": 3.5113356237778307e-06, + "loss": 0.4861, + "step": 1285 + }, + { + "epoch": 0.5144, + "grad_norm": 0.4635244309902191, + "learning_rate": 3.5068013818766544e-06, + "loss": 0.5594, + "step": 1286 + }, + { + "epoch": 0.5148, + "grad_norm": 0.4652838706970215, + "learning_rate": 3.5022671285605445e-06, + "loss": 0.552, + "step": 1287 + }, + { + "epoch": 0.5152, + "grad_norm": 0.47168973088264465, + "learning_rate": 3.497732871439455e-06, + "loss": 0.5156, + "step": 1288 + }, + { + "epoch": 0.5156, + "grad_norm": 0.4864683747291565, + "learning_rate": 3.4931986181233455e-06, + "loss": 0.5414, + "step": 1289 + }, + { + "epoch": 0.516, + "grad_norm": 0.4762532114982605, + "learning_rate": 3.4886643762221696e-06, + "loss": 0.5165, + "step": 1290 + }, + { + "epoch": 0.5164, + "grad_norm": 0.4175041913986206, + "learning_rate": 3.48413015334586e-06, + "loss": 0.5117, + "step": 1291 + }, + { + "epoch": 0.5168, + "grad_norm": 0.48602724075317383, + "learning_rate": 3.479595957104319e-06, + "loss": 0.5763, + "step": 1292 + }, + { + "epoch": 0.5172, + "grad_norm": 0.503019392490387, + "learning_rate": 3.475061795107403e-06, + "loss": 0.5537, + "step": 1293 + }, + { + "epoch": 0.5176, + "grad_norm": 0.45459282398223877, + "learning_rate": 3.470527674964913e-06, + "loss": 0.6082, + "step": 1294 + }, + { + "epoch": 0.518, + "grad_norm": 0.4991267919540405, + "learning_rate": 3.4659936042865796e-06, + "loss": 0.5081, + "step": 1295 + }, + { + "epoch": 0.5184, + "grad_norm": 0.5146558880805969, + "learning_rate": 3.461459590682047e-06, + "loss": 0.5846, + "step": 1296 + }, + { + "epoch": 0.5188, + "grad_norm": 0.4307507872581482, + "learning_rate": 3.4569256417608664e-06, + "loss": 0.4484, + "step": 1297 + }, + { + "epoch": 0.5192, + "grad_norm": 0.43799084424972534, + "learning_rate": 3.452391765132481e-06, + "loss": 0.5526, + "step": 1298 + }, + { + "epoch": 0.5196, + "grad_norm": 0.45588675141334534, + "learning_rate": 3.447857968406212e-06, + "loss": 0.5245, + "step": 1299 + }, + { + "epoch": 0.52, + "grad_norm": 0.4409703016281128, + "learning_rate": 3.443324259191245e-06, + "loss": 0.4505, + "step": 1300 + }, + { + "epoch": 0.5204, + "grad_norm": 0.48005324602127075, + "learning_rate": 3.4387906450966205e-06, + "loss": 0.5844, + "step": 1301 + }, + { + "epoch": 0.5208, + "grad_norm": 0.4434102773666382, + "learning_rate": 3.4342571337312174e-06, + "loss": 0.4502, + "step": 1302 + }, + { + "epoch": 0.5212, + "grad_norm": 0.554935872554779, + "learning_rate": 3.429723732703745e-06, + "loss": 0.5439, + "step": 1303 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5339680314064026, + "learning_rate": 3.4251904496227266e-06, + "loss": 0.5693, + "step": 1304 + }, + { + "epoch": 0.522, + "grad_norm": 0.4795043170452118, + "learning_rate": 3.4206572920964853e-06, + "loss": 0.4937, + "step": 1305 + }, + { + "epoch": 0.5224, + "grad_norm": 0.4355802536010742, + "learning_rate": 3.4161242677331352e-06, + "loss": 0.4897, + "step": 1306 + }, + { + "epoch": 0.5228, + "grad_norm": 0.5295188426971436, + "learning_rate": 3.4115913841405666e-06, + "loss": 0.5563, + "step": 1307 + }, + { + "epoch": 0.5232, + "grad_norm": 0.5363090634346008, + "learning_rate": 3.4070586489264362e-06, + "loss": 0.5842, + "step": 1308 + }, + { + "epoch": 0.5236, + "grad_norm": 0.4985086917877197, + "learning_rate": 3.4025260696981458e-06, + "loss": 0.5373, + "step": 1309 + }, + { + "epoch": 0.524, + "grad_norm": 0.461211621761322, + "learning_rate": 3.3979936540628418e-06, + "loss": 0.5216, + "step": 1310 + }, + { + "epoch": 0.5244, + "grad_norm": 0.5300297141075134, + "learning_rate": 3.3934614096273896e-06, + "loss": 0.6442, + "step": 1311 + }, + { + "epoch": 0.5248, + "grad_norm": 0.5206688642501831, + "learning_rate": 3.3889293439983737e-06, + "loss": 0.5457, + "step": 1312 + }, + { + "epoch": 0.5252, + "grad_norm": 0.4997401833534241, + "learning_rate": 3.384397464782075e-06, + "loss": 0.5301, + "step": 1313 + }, + { + "epoch": 0.5256, + "grad_norm": 0.527789294719696, + "learning_rate": 3.3798657795844613e-06, + "loss": 0.5327, + "step": 1314 + }, + { + "epoch": 0.526, + "grad_norm": 0.5220302939414978, + "learning_rate": 3.3753342960111753e-06, + "loss": 0.6045, + "step": 1315 + }, + { + "epoch": 0.5264, + "grad_norm": 0.4488482177257538, + "learning_rate": 3.370803021667521e-06, + "loss": 0.5192, + "step": 1316 + }, + { + "epoch": 0.5268, + "grad_norm": 0.486743688583374, + "learning_rate": 3.3662719641584547e-06, + "loss": 0.6105, + "step": 1317 + }, + { + "epoch": 0.5272, + "grad_norm": 0.45400115847587585, + "learning_rate": 3.361741131088563e-06, + "loss": 0.5667, + "step": 1318 + }, + { + "epoch": 0.5276, + "grad_norm": 0.5188816785812378, + "learning_rate": 3.35721053006206e-06, + "loss": 0.5929, + "step": 1319 + }, + { + "epoch": 0.528, + "grad_norm": 0.4486530125141144, + "learning_rate": 3.3526801686827684e-06, + "loss": 0.4676, + "step": 1320 + }, + { + "epoch": 0.5284, + "grad_norm": 0.46255767345428467, + "learning_rate": 3.348150054554109e-06, + "loss": 0.4653, + "step": 1321 + }, + { + "epoch": 0.5288, + "grad_norm": 0.4916232228279114, + "learning_rate": 3.3436201952790886e-06, + "loss": 0.4699, + "step": 1322 + }, + { + "epoch": 0.5292, + "grad_norm": 0.49927806854248047, + "learning_rate": 3.3390905984602865e-06, + "loss": 0.5563, + "step": 1323 + }, + { + "epoch": 0.5296, + "grad_norm": 0.48269158601760864, + "learning_rate": 3.334561271699839e-06, + "loss": 0.533, + "step": 1324 + }, + { + "epoch": 0.53, + "grad_norm": 0.4846993088722229, + "learning_rate": 3.330032222599432e-06, + "loss": 0.5768, + "step": 1325 + }, + { + "epoch": 0.5304, + "grad_norm": 0.47500863671302795, + "learning_rate": 3.325503458760282e-06, + "loss": 0.4884, + "step": 1326 + }, + { + "epoch": 0.5308, + "grad_norm": 0.5686325430870056, + "learning_rate": 3.320974987783132e-06, + "loss": 0.5854, + "step": 1327 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5267006158828735, + "learning_rate": 3.3164468172682295e-06, + "loss": 0.6222, + "step": 1328 + }, + { + "epoch": 0.5316, + "grad_norm": 0.5143467783927917, + "learning_rate": 3.3119189548153174e-06, + "loss": 0.5843, + "step": 1329 + }, + { + "epoch": 0.532, + "grad_norm": 0.47253331542015076, + "learning_rate": 3.3073914080236225e-06, + "loss": 0.5463, + "step": 1330 + }, + { + "epoch": 0.5324, + "grad_norm": 0.4366183280944824, + "learning_rate": 3.3028641844918448e-06, + "loss": 0.516, + "step": 1331 + }, + { + "epoch": 0.5328, + "grad_norm": 0.4651104509830475, + "learning_rate": 3.2983372918181374e-06, + "loss": 0.5399, + "step": 1332 + }, + { + "epoch": 0.5332, + "grad_norm": 0.5560972690582275, + "learning_rate": 3.2938107376000996e-06, + "loss": 0.4981, + "step": 1333 + }, + { + "epoch": 0.5336, + "grad_norm": 0.42216429114341736, + "learning_rate": 3.2892845294347635e-06, + "loss": 0.5063, + "step": 1334 + }, + { + "epoch": 0.534, + "grad_norm": 0.49016714096069336, + "learning_rate": 3.2847586749185785e-06, + "loss": 0.5798, + "step": 1335 + }, + { + "epoch": 0.5344, + "grad_norm": 0.5078908801078796, + "learning_rate": 3.2802331816474033e-06, + "loss": 0.5965, + "step": 1336 + }, + { + "epoch": 0.5348, + "grad_norm": 0.49926018714904785, + "learning_rate": 3.2757080572164886e-06, + "loss": 0.488, + "step": 1337 + }, + { + "epoch": 0.5352, + "grad_norm": 0.5147686004638672, + "learning_rate": 3.271183309220465e-06, + "loss": 0.553, + "step": 1338 + }, + { + "epoch": 0.5356, + "grad_norm": 0.4854147136211395, + "learning_rate": 3.2666589452533345e-06, + "loss": 0.5469, + "step": 1339 + }, + { + "epoch": 0.536, + "grad_norm": 0.4860020875930786, + "learning_rate": 3.2621349729084486e-06, + "loss": 0.579, + "step": 1340 + }, + { + "epoch": 0.5364, + "grad_norm": 0.4679085612297058, + "learning_rate": 3.2576113997785113e-06, + "loss": 0.5247, + "step": 1341 + }, + { + "epoch": 0.5368, + "grad_norm": 0.5016301274299622, + "learning_rate": 3.2530882334555464e-06, + "loss": 0.4817, + "step": 1342 + }, + { + "epoch": 0.5372, + "grad_norm": 0.48567822575569153, + "learning_rate": 3.2485654815309026e-06, + "loss": 0.444, + "step": 1343 + }, + { + "epoch": 0.5376, + "grad_norm": 0.4987831115722656, + "learning_rate": 3.244043151595226e-06, + "loss": 0.647, + "step": 1344 + }, + { + "epoch": 0.538, + "grad_norm": 0.5409462451934814, + "learning_rate": 3.2395212512384616e-06, + "loss": 0.595, + "step": 1345 + }, + { + "epoch": 0.5384, + "grad_norm": 0.4637865126132965, + "learning_rate": 3.2349997880498293e-06, + "loss": 0.5172, + "step": 1346 + }, + { + "epoch": 0.5388, + "grad_norm": 0.498721718788147, + "learning_rate": 3.2304787696178154e-06, + "loss": 0.5287, + "step": 1347 + }, + { + "epoch": 0.5392, + "grad_norm": 0.47601523995399475, + "learning_rate": 3.2259582035301615e-06, + "loss": 0.514, + "step": 1348 + }, + { + "epoch": 0.5396, + "grad_norm": 0.47651252150535583, + "learning_rate": 3.2214380973738475e-06, + "loss": 0.5567, + "step": 1349 + }, + { + "epoch": 0.54, + "grad_norm": 0.5265172123908997, + "learning_rate": 3.2169184587350856e-06, + "loss": 0.6422, + "step": 1350 + }, + { + "epoch": 0.5404, + "grad_norm": 0.4211336672306061, + "learning_rate": 3.2123992951992984e-06, + "loss": 0.4929, + "step": 1351 + }, + { + "epoch": 0.5408, + "grad_norm": 0.49535515904426575, + "learning_rate": 3.2078806143511146e-06, + "loss": 0.5527, + "step": 1352 + }, + { + "epoch": 0.5412, + "grad_norm": 0.4549042284488678, + "learning_rate": 3.2033624237743506e-06, + "loss": 0.5098, + "step": 1353 + }, + { + "epoch": 0.5416, + "grad_norm": 0.48842307925224304, + "learning_rate": 3.1988447310520005e-06, + "loss": 0.5428, + "step": 1354 + }, + { + "epoch": 0.542, + "grad_norm": 0.4740166962146759, + "learning_rate": 3.194327543766226e-06, + "loss": 0.4719, + "step": 1355 + }, + { + "epoch": 0.5424, + "grad_norm": 0.47193852066993713, + "learning_rate": 3.1898108694983356e-06, + "loss": 0.5602, + "step": 1356 + }, + { + "epoch": 0.5428, + "grad_norm": 0.5566841959953308, + "learning_rate": 3.185294715828779e-06, + "loss": 0.5657, + "step": 1357 + }, + { + "epoch": 0.5432, + "grad_norm": 0.5171043872833252, + "learning_rate": 3.180779090337132e-06, + "loss": 0.5435, + "step": 1358 + }, + { + "epoch": 0.5436, + "grad_norm": 0.552750825881958, + "learning_rate": 3.1762640006020865e-06, + "loss": 0.6116, + "step": 1359 + }, + { + "epoch": 0.544, + "grad_norm": 0.45572030544281006, + "learning_rate": 3.17174945420143e-06, + "loss": 0.4724, + "step": 1360 + }, + { + "epoch": 0.5444, + "grad_norm": 0.5083634853363037, + "learning_rate": 3.1672354587120426e-06, + "loss": 0.564, + "step": 1361 + }, + { + "epoch": 0.5448, + "grad_norm": 0.50925213098526, + "learning_rate": 3.1627220217098767e-06, + "loss": 0.5546, + "step": 1362 + }, + { + "epoch": 0.5452, + "grad_norm": 0.5205938816070557, + "learning_rate": 3.1582091507699496e-06, + "loss": 0.5864, + "step": 1363 + }, + { + "epoch": 0.5456, + "grad_norm": 0.5350721478462219, + "learning_rate": 3.153696853466329e-06, + "loss": 0.5921, + "step": 1364 + }, + { + "epoch": 0.546, + "grad_norm": 0.5171602368354797, + "learning_rate": 3.1491851373721164e-06, + "loss": 0.4859, + "step": 1365 + }, + { + "epoch": 0.5464, + "grad_norm": 0.5181358456611633, + "learning_rate": 3.1446740100594415e-06, + "loss": 0.5139, + "step": 1366 + }, + { + "epoch": 0.5468, + "grad_norm": 0.4513240158557892, + "learning_rate": 3.1401634790994444e-06, + "loss": 0.4164, + "step": 1367 + }, + { + "epoch": 0.5472, + "grad_norm": 0.47397565841674805, + "learning_rate": 3.135653552062263e-06, + "loss": 0.5654, + "step": 1368 + }, + { + "epoch": 0.5476, + "grad_norm": 0.47725215554237366, + "learning_rate": 3.1311442365170238e-06, + "loss": 0.5322, + "step": 1369 + }, + { + "epoch": 0.548, + "grad_norm": 0.4324577748775482, + "learning_rate": 3.1266355400318277e-06, + "loss": 0.4717, + "step": 1370 + }, + { + "epoch": 0.5484, + "grad_norm": 0.5813025236129761, + "learning_rate": 3.1221274701737324e-06, + "loss": 0.5871, + "step": 1371 + }, + { + "epoch": 0.5488, + "grad_norm": 0.4938691556453705, + "learning_rate": 3.117620034508748e-06, + "loss": 0.5284, + "step": 1372 + }, + { + "epoch": 0.5492, + "grad_norm": 0.4700092077255249, + "learning_rate": 3.1131132406018182e-06, + "loss": 0.5146, + "step": 1373 + }, + { + "epoch": 0.5496, + "grad_norm": 0.4936462938785553, + "learning_rate": 3.1086070960168124e-06, + "loss": 0.5504, + "step": 1374 + }, + { + "epoch": 0.55, + "grad_norm": 0.5479834675788879, + "learning_rate": 3.1041016083165053e-06, + "loss": 0.6068, + "step": 1375 + }, + { + "epoch": 0.5504, + "grad_norm": 0.4632750451564789, + "learning_rate": 3.0995967850625737e-06, + "loss": 0.5139, + "step": 1376 + }, + { + "epoch": 0.5508, + "grad_norm": 0.5398275256156921, + "learning_rate": 3.0950926338155747e-06, + "loss": 0.5622, + "step": 1377 + }, + { + "epoch": 0.5512, + "grad_norm": 0.5492286086082458, + "learning_rate": 3.0905891621349433e-06, + "loss": 0.6452, + "step": 1378 + }, + { + "epoch": 0.5516, + "grad_norm": 0.5100933909416199, + "learning_rate": 3.0860863775789707e-06, + "loss": 0.5928, + "step": 1379 + }, + { + "epoch": 0.552, + "grad_norm": 0.44502824544906616, + "learning_rate": 3.0815842877047942e-06, + "loss": 0.5365, + "step": 1380 + }, + { + "epoch": 0.5524, + "grad_norm": 0.46557819843292236, + "learning_rate": 3.0770829000683864e-06, + "loss": 0.5374, + "step": 1381 + }, + { + "epoch": 0.5528, + "grad_norm": 0.5025701522827148, + "learning_rate": 3.0725822222245384e-06, + "loss": 0.582, + "step": 1382 + }, + { + "epoch": 0.5532, + "grad_norm": 0.49238553643226624, + "learning_rate": 3.0680822617268583e-06, + "loss": 0.4892, + "step": 1383 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4772971272468567, + "learning_rate": 3.0635830261277404e-06, + "loss": 0.5475, + "step": 1384 + }, + { + "epoch": 0.554, + "grad_norm": 0.5160426497459412, + "learning_rate": 3.0590845229783684e-06, + "loss": 0.5744, + "step": 1385 + }, + { + "epoch": 0.5544, + "grad_norm": 0.4739657938480377, + "learning_rate": 3.054586759828694e-06, + "loss": 0.4817, + "step": 1386 + }, + { + "epoch": 0.5548, + "grad_norm": 0.4471666216850281, + "learning_rate": 3.0500897442274274e-06, + "loss": 0.5437, + "step": 1387 + }, + { + "epoch": 0.5552, + "grad_norm": 0.49380597472190857, + "learning_rate": 3.0455934837220273e-06, + "loss": 0.5458, + "step": 1388 + }, + { + "epoch": 0.5556, + "grad_norm": 0.4983111023902893, + "learning_rate": 3.04109798585868e-06, + "loss": 0.5455, + "step": 1389 + }, + { + "epoch": 0.556, + "grad_norm": 0.5484086871147156, + "learning_rate": 3.036603258182295e-06, + "loss": 0.5272, + "step": 1390 + }, + { + "epoch": 0.5564, + "grad_norm": 0.5066070556640625, + "learning_rate": 3.032109308236486e-06, + "loss": 0.56, + "step": 1391 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5160626769065857, + "learning_rate": 3.0276161435635694e-06, + "loss": 0.5745, + "step": 1392 + }, + { + "epoch": 0.5572, + "grad_norm": 0.5149394869804382, + "learning_rate": 3.0231237717045343e-06, + "loss": 0.5562, + "step": 1393 + }, + { + "epoch": 0.5576, + "grad_norm": 0.5401614308357239, + "learning_rate": 3.0186322001990444e-06, + "loss": 0.508, + "step": 1394 + }, + { + "epoch": 0.558, + "grad_norm": 0.43853285908699036, + "learning_rate": 3.014141436585417e-06, + "loss": 0.5448, + "step": 1395 + }, + { + "epoch": 0.5584, + "grad_norm": 0.58089679479599, + "learning_rate": 3.0096514884006165e-06, + "loss": 0.6501, + "step": 1396 + }, + { + "epoch": 0.5588, + "grad_norm": 0.49932563304901123, + "learning_rate": 3.0051623631802384e-06, + "loss": 0.5666, + "step": 1397 + }, + { + "epoch": 0.5592, + "grad_norm": 0.5262106657028198, + "learning_rate": 3.0006740684584946e-06, + "loss": 0.5124, + "step": 1398 + }, + { + "epoch": 0.5596, + "grad_norm": 0.5242915749549866, + "learning_rate": 2.9961866117682056e-06, + "loss": 0.6191, + "step": 1399 + }, + { + "epoch": 0.56, + "grad_norm": 0.44526514410972595, + "learning_rate": 2.9917000006407847e-06, + "loss": 0.607, + "step": 1400 + }, + { + "epoch": 0.5604, + "grad_norm": 0.45736733078956604, + "learning_rate": 2.987214242606224e-06, + "loss": 0.5174, + "step": 1401 + }, + { + "epoch": 0.5608, + "grad_norm": 0.5021191239356995, + "learning_rate": 2.9827293451930882e-06, + "loss": 0.5242, + "step": 1402 + }, + { + "epoch": 0.5612, + "grad_norm": 0.45563265681266785, + "learning_rate": 2.978245315928494e-06, + "loss": 0.5253, + "step": 1403 + }, + { + "epoch": 0.5616, + "grad_norm": 0.5284848809242249, + "learning_rate": 2.9737621623381016e-06, + "loss": 0.5263, + "step": 1404 + }, + { + "epoch": 0.562, + "grad_norm": 0.5633329749107361, + "learning_rate": 2.969279891946102e-06, + "loss": 0.6081, + "step": 1405 + }, + { + "epoch": 0.5624, + "grad_norm": 0.5600154399871826, + "learning_rate": 2.9647985122752057e-06, + "loss": 0.5416, + "step": 1406 + }, + { + "epoch": 0.5628, + "grad_norm": 0.47047778964042664, + "learning_rate": 2.960318030846625e-06, + "loss": 0.5438, + "step": 1407 + }, + { + "epoch": 0.5632, + "grad_norm": 0.4990597069263458, + "learning_rate": 2.955838455180067e-06, + "loss": 0.511, + "step": 1408 + }, + { + "epoch": 0.5636, + "grad_norm": 0.50710529088974, + "learning_rate": 2.9513597927937182e-06, + "loss": 0.5696, + "step": 1409 + }, + { + "epoch": 0.564, + "grad_norm": 0.5165752172470093, + "learning_rate": 2.9468820512042297e-06, + "loss": 0.548, + "step": 1410 + }, + { + "epoch": 0.5644, + "grad_norm": 0.44777095317840576, + "learning_rate": 2.9424052379267115e-06, + "loss": 0.4402, + "step": 1411 + }, + { + "epoch": 0.5648, + "grad_norm": 0.4853544533252716, + "learning_rate": 2.9379293604747146e-06, + "loss": 0.5268, + "step": 1412 + }, + { + "epoch": 0.5652, + "grad_norm": 0.4983876645565033, + "learning_rate": 2.933454426360216e-06, + "loss": 0.5174, + "step": 1413 + }, + { + "epoch": 0.5656, + "grad_norm": 0.4267212152481079, + "learning_rate": 2.928980443093614e-06, + "loss": 0.5551, + "step": 1414 + }, + { + "epoch": 0.566, + "grad_norm": 0.48267069458961487, + "learning_rate": 2.9245074181837055e-06, + "loss": 0.494, + "step": 1415 + }, + { + "epoch": 0.5664, + "grad_norm": 0.4844222962856293, + "learning_rate": 2.9200353591376857e-06, + "loss": 0.5433, + "step": 1416 + }, + { + "epoch": 0.5668, + "grad_norm": 0.6255803108215332, + "learning_rate": 2.9155642734611243e-06, + "loss": 0.5762, + "step": 1417 + }, + { + "epoch": 0.5672, + "grad_norm": 0.46967852115631104, + "learning_rate": 2.911094168657959e-06, + "loss": 0.558, + "step": 1418 + }, + { + "epoch": 0.5676, + "grad_norm": 0.4525752067565918, + "learning_rate": 2.906625052230479e-06, + "loss": 0.5339, + "step": 1419 + }, + { + "epoch": 0.568, + "grad_norm": 0.6286513805389404, + "learning_rate": 2.9021569316793184e-06, + "loss": 0.5667, + "step": 1420 + }, + { + "epoch": 0.5684, + "grad_norm": 0.47905561327934265, + "learning_rate": 2.8976898145034386e-06, + "loss": 0.5572, + "step": 1421 + }, + { + "epoch": 0.5688, + "grad_norm": 0.4969237446784973, + "learning_rate": 2.8932237082001134e-06, + "loss": 0.5007, + "step": 1422 + }, + { + "epoch": 0.5692, + "grad_norm": 0.568736732006073, + "learning_rate": 2.8887586202649257e-06, + "loss": 0.597, + "step": 1423 + }, + { + "epoch": 0.5696, + "grad_norm": 0.5033916234970093, + "learning_rate": 2.8842945581917437e-06, + "loss": 0.4943, + "step": 1424 + }, + { + "epoch": 0.57, + "grad_norm": 0.48971912264823914, + "learning_rate": 2.8798315294727205e-06, + "loss": 0.534, + "step": 1425 + }, + { + "epoch": 0.5704, + "grad_norm": 0.5021529793739319, + "learning_rate": 2.8753695415982682e-06, + "loss": 0.5684, + "step": 1426 + }, + { + "epoch": 0.5708, + "grad_norm": 0.4847045838832855, + "learning_rate": 2.8709086020570565e-06, + "loss": 0.4761, + "step": 1427 + }, + { + "epoch": 0.5712, + "grad_norm": 0.492500364780426, + "learning_rate": 2.8664487183359935e-06, + "loss": 0.514, + "step": 1428 + }, + { + "epoch": 0.5716, + "grad_norm": 0.49166664481163025, + "learning_rate": 2.861989897920214e-06, + "loss": 0.5007, + "step": 1429 + }, + { + "epoch": 0.572, + "grad_norm": 0.5053258538246155, + "learning_rate": 2.8575321482930737e-06, + "loss": 0.6097, + "step": 1430 + }, + { + "epoch": 0.5724, + "grad_norm": 0.5131270289421082, + "learning_rate": 2.853075476936125e-06, + "loss": 0.5356, + "step": 1431 + }, + { + "epoch": 0.5728, + "grad_norm": 0.460875004529953, + "learning_rate": 2.848619891329115e-06, + "loss": 0.4404, + "step": 1432 + }, + { + "epoch": 0.5732, + "grad_norm": 0.5094249248504639, + "learning_rate": 2.844165398949963e-06, + "loss": 0.6103, + "step": 1433 + }, + { + "epoch": 0.5736, + "grad_norm": 0.4838579595088959, + "learning_rate": 2.8397120072747624e-06, + "loss": 0.5164, + "step": 1434 + }, + { + "epoch": 0.574, + "grad_norm": 0.5164437294006348, + "learning_rate": 2.8352597237777506e-06, + "loss": 0.5857, + "step": 1435 + }, + { + "epoch": 0.5744, + "grad_norm": 0.5409811735153198, + "learning_rate": 2.830808555931311e-06, + "loss": 0.5189, + "step": 1436 + }, + { + "epoch": 0.5748, + "grad_norm": 0.5643367767333984, + "learning_rate": 2.82635851120595e-06, + "loss": 0.5882, + "step": 1437 + }, + { + "epoch": 0.5752, + "grad_norm": 0.4767090380191803, + "learning_rate": 2.821909597070291e-06, + "loss": 0.5102, + "step": 1438 + }, + { + "epoch": 0.5756, + "grad_norm": 0.4349215626716614, + "learning_rate": 2.8174618209910626e-06, + "loss": 0.4364, + "step": 1439 + }, + { + "epoch": 0.576, + "grad_norm": 0.46870535612106323, + "learning_rate": 2.813015190433079e-06, + "loss": 0.5712, + "step": 1440 + }, + { + "epoch": 0.5764, + "grad_norm": 0.48113444447517395, + "learning_rate": 2.808569712859235e-06, + "loss": 0.5533, + "step": 1441 + }, + { + "epoch": 0.5768, + "grad_norm": 0.4305316209793091, + "learning_rate": 2.8041253957304874e-06, + "loss": 0.4946, + "step": 1442 + }, + { + "epoch": 0.5772, + "grad_norm": 0.5638675093650818, + "learning_rate": 2.799682246505848e-06, + "loss": 0.5814, + "step": 1443 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4995023310184479, + "learning_rate": 2.7952402726423673e-06, + "loss": 0.5185, + "step": 1444 + }, + { + "epoch": 0.578, + "grad_norm": 0.48982349038124084, + "learning_rate": 2.790799481595125e-06, + "loss": 0.5735, + "step": 1445 + }, + { + "epoch": 0.5784, + "grad_norm": 0.43708935379981995, + "learning_rate": 2.7863598808172117e-06, + "loss": 0.3894, + "step": 1446 + }, + { + "epoch": 0.5788, + "grad_norm": 0.4841836094856262, + "learning_rate": 2.7819214777597228e-06, + "loss": 0.5465, + "step": 1447 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5274856090545654, + "learning_rate": 2.777484279871746e-06, + "loss": 0.5903, + "step": 1448 + }, + { + "epoch": 0.5796, + "grad_norm": 0.46429377794265747, + "learning_rate": 2.7730482946003414e-06, + "loss": 0.461, + "step": 1449 + }, + { + "epoch": 0.58, + "grad_norm": 0.46727266907691956, + "learning_rate": 2.768613529390537e-06, + "loss": 0.5232, + "step": 1450 + }, + { + "epoch": 0.5804, + "grad_norm": 0.513526439666748, + "learning_rate": 2.7641799916853137e-06, + "loss": 0.615, + "step": 1451 + }, + { + "epoch": 0.5808, + "grad_norm": 0.5125380754470825, + "learning_rate": 2.759747688925587e-06, + "loss": 0.593, + "step": 1452 + }, + { + "epoch": 0.5812, + "grad_norm": 0.5639067888259888, + "learning_rate": 2.7553166285502083e-06, + "loss": 0.6349, + "step": 1453 + }, + { + "epoch": 0.5816, + "grad_norm": 0.48888012766838074, + "learning_rate": 2.7508868179959372e-06, + "loss": 0.5049, + "step": 1454 + }, + { + "epoch": 0.582, + "grad_norm": 0.5085485577583313, + "learning_rate": 2.7464582646974377e-06, + "loss": 0.4854, + "step": 1455 + }, + { + "epoch": 0.5824, + "grad_norm": 0.4603438973426819, + "learning_rate": 2.7420309760872655e-06, + "loss": 0.6196, + "step": 1456 + }, + { + "epoch": 0.5828, + "grad_norm": 0.48601052165031433, + "learning_rate": 2.737604959595849e-06, + "loss": 0.4975, + "step": 1457 + }, + { + "epoch": 0.5832, + "grad_norm": 0.5217102766036987, + "learning_rate": 2.7331802226514865e-06, + "loss": 0.5848, + "step": 1458 + }, + { + "epoch": 0.5836, + "grad_norm": 0.545604944229126, + "learning_rate": 2.728756772680327e-06, + "loss": 0.6058, + "step": 1459 + }, + { + "epoch": 0.584, + "grad_norm": 0.5154001712799072, + "learning_rate": 2.7243346171063607e-06, + "loss": 0.5404, + "step": 1460 + }, + { + "epoch": 0.5844, + "grad_norm": 0.4819687306880951, + "learning_rate": 2.719913763351402e-06, + "loss": 0.5809, + "step": 1461 + }, + { + "epoch": 0.5848, + "grad_norm": 0.4893083870410919, + "learning_rate": 2.715494218835082e-06, + "loss": 0.6066, + "step": 1462 + }, + { + "epoch": 0.5852, + "grad_norm": 0.48114487528800964, + "learning_rate": 2.7110759909748376e-06, + "loss": 0.5637, + "step": 1463 + }, + { + "epoch": 0.5856, + "grad_norm": 0.4523283541202545, + "learning_rate": 2.7066590871858926e-06, + "loss": 0.5814, + "step": 1464 + }, + { + "epoch": 0.586, + "grad_norm": 0.49530142545700073, + "learning_rate": 2.702243514881249e-06, + "loss": 0.4947, + "step": 1465 + }, + { + "epoch": 0.5864, + "grad_norm": 0.5186442732810974, + "learning_rate": 2.697829281471673e-06, + "loss": 0.5865, + "step": 1466 + }, + { + "epoch": 0.5868, + "grad_norm": 0.47039487957954407, + "learning_rate": 2.693416394365687e-06, + "loss": 0.5396, + "step": 1467 + }, + { + "epoch": 0.5872, + "grad_norm": 0.45197176933288574, + "learning_rate": 2.6890048609695523e-06, + "loss": 0.4497, + "step": 1468 + }, + { + "epoch": 0.5876, + "grad_norm": 0.4874049127101898, + "learning_rate": 2.684594688687258e-06, + "loss": 0.5222, + "step": 1469 + }, + { + "epoch": 0.588, + "grad_norm": 0.51507568359375, + "learning_rate": 2.6801858849205074e-06, + "loss": 0.5094, + "step": 1470 + }, + { + "epoch": 0.5884, + "grad_norm": 0.5144968628883362, + "learning_rate": 2.6757784570687083e-06, + "loss": 0.5865, + "step": 1471 + }, + { + "epoch": 0.5888, + "grad_norm": 0.49269288778305054, + "learning_rate": 2.6713724125289613e-06, + "loss": 0.4984, + "step": 1472 + }, + { + "epoch": 0.5892, + "grad_norm": 0.4876870810985565, + "learning_rate": 2.666967758696042e-06, + "loss": 0.5238, + "step": 1473 + }, + { + "epoch": 0.5896, + "grad_norm": 0.4633502662181854, + "learning_rate": 2.662564502962394e-06, + "loss": 0.5271, + "step": 1474 + }, + { + "epoch": 0.59, + "grad_norm": 0.5110370516777039, + "learning_rate": 2.658162652718113e-06, + "loss": 0.537, + "step": 1475 + }, + { + "epoch": 0.5904, + "grad_norm": 0.4866006672382355, + "learning_rate": 2.653762215350936e-06, + "loss": 0.5239, + "step": 1476 + }, + { + "epoch": 0.5908, + "grad_norm": 0.544572114944458, + "learning_rate": 2.6493631982462313e-06, + "loss": 0.5784, + "step": 1477 + }, + { + "epoch": 0.5912, + "grad_norm": 0.5041329264640808, + "learning_rate": 2.6449656087869815e-06, + "loss": 0.5382, + "step": 1478 + }, + { + "epoch": 0.5916, + "grad_norm": 0.4942692220211029, + "learning_rate": 2.640569454353772e-06, + "loss": 0.5125, + "step": 1479 + }, + { + "epoch": 0.592, + "grad_norm": 0.5565232038497925, + "learning_rate": 2.63617474232478e-06, + "loss": 0.5759, + "step": 1480 + }, + { + "epoch": 0.5924, + "grad_norm": 0.48877766728401184, + "learning_rate": 2.6317814800757655e-06, + "loss": 0.5607, + "step": 1481 + }, + { + "epoch": 0.5928, + "grad_norm": 0.4964582622051239, + "learning_rate": 2.627389674980051e-06, + "loss": 0.5714, + "step": 1482 + }, + { + "epoch": 0.5932, + "grad_norm": 0.5051253437995911, + "learning_rate": 2.622999334408516e-06, + "loss": 0.5631, + "step": 1483 + }, + { + "epoch": 0.5936, + "grad_norm": 0.460700124502182, + "learning_rate": 2.61861046572958e-06, + "loss": 0.5801, + "step": 1484 + }, + { + "epoch": 0.594, + "grad_norm": 0.499276727437973, + "learning_rate": 2.614223076309193e-06, + "loss": 0.5632, + "step": 1485 + }, + { + "epoch": 0.5944, + "grad_norm": 0.4617083668708801, + "learning_rate": 2.609837173510824e-06, + "loss": 0.5333, + "step": 1486 + }, + { + "epoch": 0.5948, + "grad_norm": 0.5099096298217773, + "learning_rate": 2.6054527646954463e-06, + "loss": 0.5713, + "step": 1487 + }, + { + "epoch": 0.5952, + "grad_norm": 0.43509137630462646, + "learning_rate": 2.601069857221523e-06, + "loss": 0.4773, + "step": 1488 + }, + { + "epoch": 0.5956, + "grad_norm": 0.4796929359436035, + "learning_rate": 2.5966884584450014e-06, + "loss": 0.58, + "step": 1489 + }, + { + "epoch": 0.596, + "grad_norm": 0.5261090397834778, + "learning_rate": 2.5923085757192924e-06, + "loss": 0.5708, + "step": 1490 + }, + { + "epoch": 0.5964, + "grad_norm": 0.5301308631896973, + "learning_rate": 2.5879302163952672e-06, + "loss": 0.5811, + "step": 1491 + }, + { + "epoch": 0.5968, + "grad_norm": 0.5250400900840759, + "learning_rate": 2.583553387821238e-06, + "loss": 0.5675, + "step": 1492 + }, + { + "epoch": 0.5972, + "grad_norm": 0.5334447622299194, + "learning_rate": 2.5791780973429466e-06, + "loss": 0.5659, + "step": 1493 + }, + { + "epoch": 0.5976, + "grad_norm": 0.4800483286380768, + "learning_rate": 2.5748043523035546e-06, + "loss": 0.5534, + "step": 1494 + }, + { + "epoch": 0.598, + "grad_norm": 0.5284641981124878, + "learning_rate": 2.5704321600436317e-06, + "loss": 0.5998, + "step": 1495 + }, + { + "epoch": 0.5984, + "grad_norm": 0.5287473797798157, + "learning_rate": 2.5660615279011386e-06, + "loss": 0.4678, + "step": 1496 + }, + { + "epoch": 0.5988, + "grad_norm": 0.43308907747268677, + "learning_rate": 2.561692463211419e-06, + "loss": 0.4371, + "step": 1497 + }, + { + "epoch": 0.5992, + "grad_norm": 0.5262799859046936, + "learning_rate": 2.557324973307186e-06, + "loss": 0.4869, + "step": 1498 + }, + { + "epoch": 0.5996, + "grad_norm": 0.4628347158432007, + "learning_rate": 2.5529590655185074e-06, + "loss": 0.626, + "step": 1499 + }, + { + "epoch": 0.6, + "grad_norm": 0.5220506191253662, + "learning_rate": 2.5485947471727995e-06, + "loss": 0.5732, + "step": 1500 + }, + { + "epoch": 0.6004, + "grad_norm": 0.46114087104797363, + "learning_rate": 2.5442320255948096e-06, + "loss": 0.5597, + "step": 1501 + }, + { + "epoch": 0.6008, + "grad_norm": 0.5224429368972778, + "learning_rate": 2.5398709081066046e-06, + "loss": 0.5119, + "step": 1502 + }, + { + "epoch": 0.6012, + "grad_norm": 0.4979833960533142, + "learning_rate": 2.5355114020275576e-06, + "loss": 0.5346, + "step": 1503 + }, + { + "epoch": 0.6016, + "grad_norm": 0.4844669699668884, + "learning_rate": 2.5311535146743388e-06, + "loss": 0.4852, + "step": 1504 + }, + { + "epoch": 0.602, + "grad_norm": 0.4876634180545807, + "learning_rate": 2.526797253360905e-06, + "loss": 0.5701, + "step": 1505 + }, + { + "epoch": 0.6024, + "grad_norm": 0.47222307324409485, + "learning_rate": 2.5224426253984775e-06, + "loss": 0.5319, + "step": 1506 + }, + { + "epoch": 0.6028, + "grad_norm": 0.5080312490463257, + "learning_rate": 2.5180896380955415e-06, + "loss": 0.5992, + "step": 1507 + }, + { + "epoch": 0.6032, + "grad_norm": 0.49264171719551086, + "learning_rate": 2.5137382987578247e-06, + "loss": 0.5549, + "step": 1508 + }, + { + "epoch": 0.6036, + "grad_norm": 0.46697530150413513, + "learning_rate": 2.5093886146882923e-06, + "loss": 0.5624, + "step": 1509 + }, + { + "epoch": 0.604, + "grad_norm": 0.5428259968757629, + "learning_rate": 2.50504059318713e-06, + "loss": 0.5634, + "step": 1510 + }, + { + "epoch": 0.6044, + "grad_norm": 0.5281015038490295, + "learning_rate": 2.5006942415517338e-06, + "loss": 0.6014, + "step": 1511 + }, + { + "epoch": 0.6048, + "grad_norm": 0.5557918548583984, + "learning_rate": 2.496349567076696e-06, + "loss": 0.5203, + "step": 1512 + }, + { + "epoch": 0.6052, + "grad_norm": 0.4895382821559906, + "learning_rate": 2.492006577053793e-06, + "loss": 0.5726, + "step": 1513 + }, + { + "epoch": 0.6056, + "grad_norm": 0.563296377658844, + "learning_rate": 2.487665278771979e-06, + "loss": 0.5745, + "step": 1514 + }, + { + "epoch": 0.606, + "grad_norm": 0.4936528205871582, + "learning_rate": 2.483325679517363e-06, + "loss": 0.5615, + "step": 1515 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5365152955055237, + "learning_rate": 2.4789877865732082e-06, + "loss": 0.5461, + "step": 1516 + }, + { + "epoch": 0.6068, + "grad_norm": 0.4697369635105133, + "learning_rate": 2.4746516072199076e-06, + "loss": 0.5281, + "step": 1517 + }, + { + "epoch": 0.6072, + "grad_norm": 0.5144125819206238, + "learning_rate": 2.4703171487349826e-06, + "loss": 0.5162, + "step": 1518 + }, + { + "epoch": 0.6076, + "grad_norm": 0.5662809610366821, + "learning_rate": 2.4659844183930663e-06, + "loss": 0.6308, + "step": 1519 + }, + { + "epoch": 0.608, + "grad_norm": 0.5164036750793457, + "learning_rate": 2.4616534234658916e-06, + "loss": 0.5497, + "step": 1520 + }, + { + "epoch": 0.6084, + "grad_norm": 0.5437076091766357, + "learning_rate": 2.457324171222276e-06, + "loss": 0.5659, + "step": 1521 + }, + { + "epoch": 0.6088, + "grad_norm": 0.4282589852809906, + "learning_rate": 2.452996668928115e-06, + "loss": 0.4424, + "step": 1522 + }, + { + "epoch": 0.6092, + "grad_norm": 0.4498371183872223, + "learning_rate": 2.4486709238463642e-06, + "loss": 0.4756, + "step": 1523 + }, + { + "epoch": 0.6096, + "grad_norm": 0.5019795298576355, + "learning_rate": 2.4443469432370347e-06, + "loss": 0.5285, + "step": 1524 + }, + { + "epoch": 0.61, + "grad_norm": 0.47362902760505676, + "learning_rate": 2.440024734357173e-06, + "loss": 0.6729, + "step": 1525 + }, + { + "epoch": 0.6104, + "grad_norm": 0.5483762621879578, + "learning_rate": 2.4357043044608513e-06, + "loss": 0.5122, + "step": 1526 + }, + { + "epoch": 0.6108, + "grad_norm": 0.5297125577926636, + "learning_rate": 2.431385660799157e-06, + "loss": 0.5444, + "step": 1527 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5263299345970154, + "learning_rate": 2.4270688106201816e-06, + "loss": 0.5762, + "step": 1528 + }, + { + "epoch": 0.6116, + "grad_norm": 0.5046945214271545, + "learning_rate": 2.4227537611690053e-06, + "loss": 0.4573, + "step": 1529 + }, + { + "epoch": 0.612, + "grad_norm": 0.5073798894882202, + "learning_rate": 2.418440519687684e-06, + "loss": 0.5786, + "step": 1530 + }, + { + "epoch": 0.6124, + "grad_norm": 0.5219867825508118, + "learning_rate": 2.414129093415243e-06, + "loss": 0.581, + "step": 1531 + }, + { + "epoch": 0.6128, + "grad_norm": 0.46836137771606445, + "learning_rate": 2.4098194895876554e-06, + "loss": 0.4835, + "step": 1532 + }, + { + "epoch": 0.6132, + "grad_norm": 0.4778684079647064, + "learning_rate": 2.4055117154378427e-06, + "loss": 0.5348, + "step": 1533 + }, + { + "epoch": 0.6136, + "grad_norm": 0.4793270230293274, + "learning_rate": 2.4012057781956515e-06, + "loss": 0.5527, + "step": 1534 + }, + { + "epoch": 0.614, + "grad_norm": 0.49084943532943726, + "learning_rate": 2.3969016850878457e-06, + "loss": 0.5705, + "step": 1535 + }, + { + "epoch": 0.6144, + "grad_norm": 0.49898597598075867, + "learning_rate": 2.392599443338094e-06, + "loss": 0.5809, + "step": 1536 + }, + { + "epoch": 0.6148, + "grad_norm": 0.4914925992488861, + "learning_rate": 2.3882990601669587e-06, + "loss": 0.5114, + "step": 1537 + }, + { + "epoch": 0.6152, + "grad_norm": 0.50290846824646, + "learning_rate": 2.3840005427918848e-06, + "loss": 0.4965, + "step": 1538 + }, + { + "epoch": 0.6156, + "grad_norm": 0.4977038502693176, + "learning_rate": 2.3797038984271814e-06, + "loss": 0.5015, + "step": 1539 + }, + { + "epoch": 0.616, + "grad_norm": 0.5011242032051086, + "learning_rate": 2.3754091342840174e-06, + "loss": 0.5551, + "step": 1540 + }, + { + "epoch": 0.6164, + "grad_norm": 0.5203733444213867, + "learning_rate": 2.3711162575704035e-06, + "loss": 0.499, + "step": 1541 + }, + { + "epoch": 0.6168, + "grad_norm": 0.4532087445259094, + "learning_rate": 2.3668252754911866e-06, + "loss": 0.5067, + "step": 1542 + }, + { + "epoch": 0.6172, + "grad_norm": 0.539614200592041, + "learning_rate": 2.3625361952480307e-06, + "loss": 0.5632, + "step": 1543 + }, + { + "epoch": 0.6176, + "grad_norm": 0.5588467717170715, + "learning_rate": 2.3582490240394075e-06, + "loss": 0.5892, + "step": 1544 + }, + { + "epoch": 0.618, + "grad_norm": 0.5543122887611389, + "learning_rate": 2.353963769060587e-06, + "loss": 0.5045, + "step": 1545 + }, + { + "epoch": 0.6184, + "grad_norm": 0.51371830701828, + "learning_rate": 2.3496804375036206e-06, + "loss": 0.5724, + "step": 1546 + }, + { + "epoch": 0.6188, + "grad_norm": 0.6098910570144653, + "learning_rate": 2.3453990365573353e-06, + "loss": 0.5578, + "step": 1547 + }, + { + "epoch": 0.6192, + "grad_norm": 0.46126070618629456, + "learning_rate": 2.3411195734073137e-06, + "loss": 0.4742, + "step": 1548 + }, + { + "epoch": 0.6196, + "grad_norm": 0.5160725712776184, + "learning_rate": 2.336842055235889e-06, + "loss": 0.4787, + "step": 1549 + }, + { + "epoch": 0.62, + "grad_norm": 0.4890509247779846, + "learning_rate": 2.3325664892221287e-06, + "loss": 0.515, + "step": 1550 + }, + { + "epoch": 0.6204, + "grad_norm": 0.5041446089744568, + "learning_rate": 2.328292882541823e-06, + "loss": 0.5502, + "step": 1551 + }, + { + "epoch": 0.6208, + "grad_norm": 0.5130775570869446, + "learning_rate": 2.324021242367478e-06, + "loss": 0.5673, + "step": 1552 + }, + { + "epoch": 0.6212, + "grad_norm": 0.5083710551261902, + "learning_rate": 2.3197515758682943e-06, + "loss": 0.5668, + "step": 1553 + }, + { + "epoch": 0.6216, + "grad_norm": 0.49546748399734497, + "learning_rate": 2.3154838902101623e-06, + "loss": 0.4704, + "step": 1554 + }, + { + "epoch": 0.622, + "grad_norm": 0.47747087478637695, + "learning_rate": 2.311218192555648e-06, + "loss": 0.5373, + "step": 1555 + }, + { + "epoch": 0.6224, + "grad_norm": 0.5391461253166199, + "learning_rate": 2.306954490063982e-06, + "loss": 0.5444, + "step": 1556 + }, + { + "epoch": 0.6228, + "grad_norm": 0.5440353751182556, + "learning_rate": 2.3026927898910426e-06, + "loss": 0.5749, + "step": 1557 + }, + { + "epoch": 0.6232, + "grad_norm": 0.4923385977745056, + "learning_rate": 2.2984330991893527e-06, + "loss": 0.4856, + "step": 1558 + }, + { + "epoch": 0.6236, + "grad_norm": 0.47224387526512146, + "learning_rate": 2.2941754251080574e-06, + "loss": 0.5269, + "step": 1559 + }, + { + "epoch": 0.624, + "grad_norm": 0.48879680037498474, + "learning_rate": 2.28991977479292e-06, + "loss": 0.5758, + "step": 1560 + }, + { + "epoch": 0.6244, + "grad_norm": 0.4885936677455902, + "learning_rate": 2.285666155386308e-06, + "loss": 0.541, + "step": 1561 + }, + { + "epoch": 0.6248, + "grad_norm": 0.44462254643440247, + "learning_rate": 2.281414574027181e-06, + "loss": 0.4438, + "step": 1562 + }, + { + "epoch": 0.6252, + "grad_norm": 0.5792527198791504, + "learning_rate": 2.2771650378510734e-06, + "loss": 0.6358, + "step": 1563 + }, + { + "epoch": 0.6256, + "grad_norm": 0.47472530603408813, + "learning_rate": 2.2729175539900925e-06, + "loss": 0.4899, + "step": 1564 + }, + { + "epoch": 0.626, + "grad_norm": 0.46082860231399536, + "learning_rate": 2.268672129572896e-06, + "loss": 0.6056, + "step": 1565 + }, + { + "epoch": 0.6264, + "grad_norm": 0.5208197832107544, + "learning_rate": 2.2644287717246906e-06, + "loss": 0.5852, + "step": 1566 + }, + { + "epoch": 0.6268, + "grad_norm": 0.502346396446228, + "learning_rate": 2.2601874875672127e-06, + "loss": 0.5329, + "step": 1567 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5229620933532715, + "learning_rate": 2.2559482842187154e-06, + "loss": 0.5473, + "step": 1568 + }, + { + "epoch": 0.6276, + "grad_norm": 0.49244144558906555, + "learning_rate": 2.2517111687939623e-06, + "loss": 0.5066, + "step": 1569 + }, + { + "epoch": 0.628, + "grad_norm": 0.5355249047279358, + "learning_rate": 2.247476148404214e-06, + "loss": 0.5816, + "step": 1570 + }, + { + "epoch": 0.6284, + "grad_norm": 0.5006580352783203, + "learning_rate": 2.243243230157213e-06, + "loss": 0.6485, + "step": 1571 + }, + { + "epoch": 0.6288, + "grad_norm": 0.521933376789093, + "learning_rate": 2.2390124211571718e-06, + "loss": 0.6115, + "step": 1572 + }, + { + "epoch": 0.6292, + "grad_norm": 0.5163118243217468, + "learning_rate": 2.2347837285047677e-06, + "loss": 0.4758, + "step": 1573 + }, + { + "epoch": 0.6296, + "grad_norm": 0.5321094989776611, + "learning_rate": 2.2305571592971214e-06, + "loss": 0.584, + "step": 1574 + }, + { + "epoch": 0.63, + "grad_norm": 0.503724992275238, + "learning_rate": 2.226332720627794e-06, + "loss": 0.5437, + "step": 1575 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5667294263839722, + "learning_rate": 2.2221104195867673e-06, + "loss": 0.5661, + "step": 1576 + }, + { + "epoch": 0.6308, + "grad_norm": 0.48289167881011963, + "learning_rate": 2.217890263260437e-06, + "loss": 0.5256, + "step": 1577 + }, + { + "epoch": 0.6312, + "grad_norm": 0.4751034080982208, + "learning_rate": 2.2136722587316e-06, + "loss": 0.5426, + "step": 1578 + }, + { + "epoch": 0.6316, + "grad_norm": 0.5564718246459961, + "learning_rate": 2.209456413079439e-06, + "loss": 0.5565, + "step": 1579 + }, + { + "epoch": 0.632, + "grad_norm": 0.49462926387786865, + "learning_rate": 2.205242733379518e-06, + "loss": 0.4651, + "step": 1580 + }, + { + "epoch": 0.6324, + "grad_norm": 0.4669487476348877, + "learning_rate": 2.2010312267037617e-06, + "loss": 0.5236, + "step": 1581 + }, + { + "epoch": 0.6328, + "grad_norm": 0.48065274953842163, + "learning_rate": 2.1968219001204503e-06, + "loss": 0.5738, + "step": 1582 + }, + { + "epoch": 0.6332, + "grad_norm": 0.4668215811252594, + "learning_rate": 2.192614760694202e-06, + "loss": 0.5286, + "step": 1583 + }, + { + "epoch": 0.6336, + "grad_norm": 0.5288259983062744, + "learning_rate": 2.188409815485967e-06, + "loss": 0.5231, + "step": 1584 + }, + { + "epoch": 0.634, + "grad_norm": 0.48246291279792786, + "learning_rate": 2.1842070715530135e-06, + "loss": 0.5433, + "step": 1585 + }, + { + "epoch": 0.6344, + "grad_norm": 0.5169889330863953, + "learning_rate": 2.180006535948913e-06, + "loss": 0.5927, + "step": 1586 + }, + { + "epoch": 0.6348, + "grad_norm": 0.45649388432502747, + "learning_rate": 2.175808215723531e-06, + "loss": 0.4159, + "step": 1587 + }, + { + "epoch": 0.6352, + "grad_norm": 0.5261488556861877, + "learning_rate": 2.171612117923016e-06, + "loss": 0.5426, + "step": 1588 + }, + { + "epoch": 0.6356, + "grad_norm": 0.5158681869506836, + "learning_rate": 2.167418249589787e-06, + "loss": 0.5287, + "step": 1589 + }, + { + "epoch": 0.636, + "grad_norm": 0.5511676669120789, + "learning_rate": 2.163226617762519e-06, + "loss": 0.633, + "step": 1590 + }, + { + "epoch": 0.6364, + "grad_norm": 0.45589837431907654, + "learning_rate": 2.159037229476136e-06, + "loss": 0.5101, + "step": 1591 + }, + { + "epoch": 0.6368, + "grad_norm": 0.4977359473705292, + "learning_rate": 2.1548500917617933e-06, + "loss": 0.4789, + "step": 1592 + }, + { + "epoch": 0.6372, + "grad_norm": 0.45518171787261963, + "learning_rate": 2.150665211646871e-06, + "loss": 0.5169, + "step": 1593 + }, + { + "epoch": 0.6376, + "grad_norm": 0.5482791066169739, + "learning_rate": 2.1464825961549635e-06, + "loss": 0.7014, + "step": 1594 + }, + { + "epoch": 0.638, + "grad_norm": 0.4804784059524536, + "learning_rate": 2.142302252305857e-06, + "loss": 0.5088, + "step": 1595 + }, + { + "epoch": 0.6384, + "grad_norm": 0.4980086088180542, + "learning_rate": 2.138124187115532e-06, + "loss": 0.4547, + "step": 1596 + }, + { + "epoch": 0.6388, + "grad_norm": 0.48985469341278076, + "learning_rate": 2.133948407596142e-06, + "loss": 0.5564, + "step": 1597 + }, + { + "epoch": 0.6392, + "grad_norm": 0.5659129023551941, + "learning_rate": 2.1297749207560015e-06, + "loss": 0.5135, + "step": 1598 + }, + { + "epoch": 0.6396, + "grad_norm": 0.5160675048828125, + "learning_rate": 2.1256037335995828e-06, + "loss": 0.5263, + "step": 1599 + }, + { + "epoch": 0.64, + "grad_norm": 0.47263601422309875, + "learning_rate": 2.1214348531274957e-06, + "loss": 0.5682, + "step": 1600 + }, + { + "epoch": 0.6404, + "grad_norm": 0.4825802445411682, + "learning_rate": 2.1172682863364766e-06, + "loss": 0.5353, + "step": 1601 + }, + { + "epoch": 0.6408, + "grad_norm": 0.47716841101646423, + "learning_rate": 2.113104040219382e-06, + "loss": 0.5353, + "step": 1602 + }, + { + "epoch": 0.6412, + "grad_norm": 0.47250092029571533, + "learning_rate": 2.108942121765173e-06, + "loss": 0.5076, + "step": 1603 + }, + { + "epoch": 0.6416, + "grad_norm": 0.4839315116405487, + "learning_rate": 2.1047825379589038e-06, + "loss": 0.56, + "step": 1604 + }, + { + "epoch": 0.642, + "grad_norm": 0.5323452949523926, + "learning_rate": 2.1006252957817083e-06, + "loss": 0.5248, + "step": 1605 + }, + { + "epoch": 0.6424, + "grad_norm": 0.5054993033409119, + "learning_rate": 2.0964704022107945e-06, + "loss": 0.487, + "step": 1606 + }, + { + "epoch": 0.6428, + "grad_norm": 0.5133071541786194, + "learning_rate": 2.0923178642194233e-06, + "loss": 0.6727, + "step": 1607 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5333604216575623, + "learning_rate": 2.088167688776908e-06, + "loss": 0.6296, + "step": 1608 + }, + { + "epoch": 0.6436, + "grad_norm": 0.4804375171661377, + "learning_rate": 2.0840198828485934e-06, + "loss": 0.5005, + "step": 1609 + }, + { + "epoch": 0.644, + "grad_norm": 0.4947167634963989, + "learning_rate": 2.079874453395847e-06, + "loss": 0.5693, + "step": 1610 + }, + { + "epoch": 0.6444, + "grad_norm": 0.5010197162628174, + "learning_rate": 2.07573140737605e-06, + "loss": 0.5887, + "step": 1611 + }, + { + "epoch": 0.6448, + "grad_norm": 0.5427845120429993, + "learning_rate": 2.07159075174258e-06, + "loss": 0.5781, + "step": 1612 + }, + { + "epoch": 0.6452, + "grad_norm": 0.4725536108016968, + "learning_rate": 2.067452493444808e-06, + "loss": 0.533, + "step": 1613 + }, + { + "epoch": 0.6456, + "grad_norm": 0.529393196105957, + "learning_rate": 2.063316639428077e-06, + "loss": 0.4901, + "step": 1614 + }, + { + "epoch": 0.646, + "grad_norm": 0.493694007396698, + "learning_rate": 2.059183196633697e-06, + "loss": 0.5665, + "step": 1615 + }, + { + "epoch": 0.6464, + "grad_norm": 0.48902615904808044, + "learning_rate": 2.055052171998929e-06, + "loss": 0.5396, + "step": 1616 + }, + { + "epoch": 0.6468, + "grad_norm": 0.4694744348526001, + "learning_rate": 2.0509235724569786e-06, + "loss": 0.5601, + "step": 1617 + }, + { + "epoch": 0.6472, + "grad_norm": 0.5156325101852417, + "learning_rate": 2.04679740493698e-06, + "loss": 0.4973, + "step": 1618 + }, + { + "epoch": 0.6476, + "grad_norm": 0.44242939352989197, + "learning_rate": 2.0426736763639846e-06, + "loss": 0.4728, + "step": 1619 + }, + { + "epoch": 0.648, + "grad_norm": 0.47373783588409424, + "learning_rate": 2.0385523936589525e-06, + "loss": 0.5331, + "step": 1620 + }, + { + "epoch": 0.6484, + "grad_norm": 0.5373921394348145, + "learning_rate": 2.0344335637387345e-06, + "loss": 0.5832, + "step": 1621 + }, + { + "epoch": 0.6488, + "grad_norm": 0.5912891626358032, + "learning_rate": 2.0303171935160704e-06, + "loss": 0.5745, + "step": 1622 + }, + { + "epoch": 0.6492, + "grad_norm": 0.5122224688529968, + "learning_rate": 2.0262032898995707e-06, + "loss": 0.5184, + "step": 1623 + }, + { + "epoch": 0.6496, + "grad_norm": 0.49968165159225464, + "learning_rate": 2.022091859793703e-06, + "loss": 0.5457, + "step": 1624 + }, + { + "epoch": 0.65, + "grad_norm": 0.539306640625, + "learning_rate": 2.0179829100987857e-06, + "loss": 0.543, + "step": 1625 + }, + { + "epoch": 0.6504, + "grad_norm": 0.5262332558631897, + "learning_rate": 2.013876447710972e-06, + "loss": 0.6301, + "step": 1626 + }, + { + "epoch": 0.6508, + "grad_norm": 0.4939981997013092, + "learning_rate": 2.009772479522246e-06, + "loss": 0.4742, + "step": 1627 + }, + { + "epoch": 0.6512, + "grad_norm": 0.5203678011894226, + "learning_rate": 2.0056710124204002e-06, + "loss": 0.566, + "step": 1628 + }, + { + "epoch": 0.6516, + "grad_norm": 0.44473370909690857, + "learning_rate": 2.0015720532890332e-06, + "loss": 0.5685, + "step": 1629 + }, + { + "epoch": 0.652, + "grad_norm": 0.570111095905304, + "learning_rate": 1.9974756090075296e-06, + "loss": 0.6232, + "step": 1630 + }, + { + "epoch": 0.6524, + "grad_norm": 0.47535547614097595, + "learning_rate": 1.9933816864510587e-06, + "loss": 0.4962, + "step": 1631 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5097532868385315, + "learning_rate": 1.989290292490556e-06, + "loss": 0.4939, + "step": 1632 + }, + { + "epoch": 0.6532, + "grad_norm": 0.5101918578147888, + "learning_rate": 1.9852014339927113e-06, + "loss": 0.4766, + "step": 1633 + }, + { + "epoch": 0.6536, + "grad_norm": 0.5294265747070312, + "learning_rate": 1.9811151178199597e-06, + "loss": 0.5946, + "step": 1634 + }, + { + "epoch": 0.654, + "grad_norm": 0.5878704786300659, + "learning_rate": 1.9770313508304687e-06, + "loss": 0.5594, + "step": 1635 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5145576000213623, + "learning_rate": 1.9729501398781314e-06, + "loss": 0.5903, + "step": 1636 + }, + { + "epoch": 0.6548, + "grad_norm": 0.53581702709198, + "learning_rate": 1.968871491812547e-06, + "loss": 0.4923, + "step": 1637 + }, + { + "epoch": 0.6552, + "grad_norm": 0.4903261363506317, + "learning_rate": 1.964795413479016e-06, + "loss": 0.5131, + "step": 1638 + }, + { + "epoch": 0.6556, + "grad_norm": 0.47253820300102234, + "learning_rate": 1.960721911718522e-06, + "loss": 0.5141, + "step": 1639 + }, + { + "epoch": 0.656, + "grad_norm": 0.5105094909667969, + "learning_rate": 1.956650993367728e-06, + "loss": 0.6333, + "step": 1640 + }, + { + "epoch": 0.6564, + "grad_norm": 0.4645703136920929, + "learning_rate": 1.9525826652589624e-06, + "loss": 0.5166, + "step": 1641 + }, + { + "epoch": 0.6568, + "grad_norm": 0.4405498206615448, + "learning_rate": 1.9485169342202026e-06, + "loss": 0.4998, + "step": 1642 + }, + { + "epoch": 0.6572, + "grad_norm": 0.5502030849456787, + "learning_rate": 1.944453807075068e-06, + "loss": 0.5153, + "step": 1643 + }, + { + "epoch": 0.6576, + "grad_norm": 0.5228597521781921, + "learning_rate": 1.9403932906428075e-06, + "loss": 0.5555, + "step": 1644 + }, + { + "epoch": 0.658, + "grad_norm": 0.49661386013031006, + "learning_rate": 1.936335391738292e-06, + "loss": 0.4916, + "step": 1645 + }, + { + "epoch": 0.6584, + "grad_norm": 0.507220447063446, + "learning_rate": 1.9322801171719963e-06, + "loss": 0.4786, + "step": 1646 + }, + { + "epoch": 0.6588, + "grad_norm": 0.5505663752555847, + "learning_rate": 1.928227473749991e-06, + "loss": 0.5889, + "step": 1647 + }, + { + "epoch": 0.6592, + "grad_norm": 0.461161345243454, + "learning_rate": 1.924177468273929e-06, + "loss": 0.5665, + "step": 1648 + }, + { + "epoch": 0.6596, + "grad_norm": 0.47537487745285034, + "learning_rate": 1.92013010754104e-06, + "loss": 0.5456, + "step": 1649 + }, + { + "epoch": 0.66, + "grad_norm": 0.5142361521720886, + "learning_rate": 1.9160853983441143e-06, + "loss": 0.5599, + "step": 1650 + }, + { + "epoch": 0.6604, + "grad_norm": 0.498871773481369, + "learning_rate": 1.9120433474714876e-06, + "loss": 0.5814, + "step": 1651 + }, + { + "epoch": 0.6608, + "grad_norm": 0.5202580094337463, + "learning_rate": 1.9080039617070384e-06, + "loss": 0.6258, + "step": 1652 + }, + { + "epoch": 0.6612, + "grad_norm": 0.5052293539047241, + "learning_rate": 1.9039672478301698e-06, + "loss": 0.5705, + "step": 1653 + }, + { + "epoch": 0.6616, + "grad_norm": 0.4765607714653015, + "learning_rate": 1.8999332126158018e-06, + "loss": 0.5216, + "step": 1654 + }, + { + "epoch": 0.662, + "grad_norm": 0.519019365310669, + "learning_rate": 1.8959018628343612e-06, + "loss": 0.4994, + "step": 1655 + }, + { + "epoch": 0.6624, + "grad_norm": 0.5589582920074463, + "learning_rate": 1.8918732052517633e-06, + "loss": 0.5712, + "step": 1656 + }, + { + "epoch": 0.6628, + "grad_norm": 0.5202416777610779, + "learning_rate": 1.887847246629406e-06, + "loss": 0.5674, + "step": 1657 + }, + { + "epoch": 0.6632, + "grad_norm": 0.5023145079612732, + "learning_rate": 1.8838239937241604e-06, + "loss": 0.5578, + "step": 1658 + }, + { + "epoch": 0.6636, + "grad_norm": 0.4727594554424286, + "learning_rate": 1.8798034532883528e-06, + "loss": 0.5314, + "step": 1659 + }, + { + "epoch": 0.664, + "grad_norm": 0.5230659246444702, + "learning_rate": 1.8757856320697609e-06, + "loss": 0.6046, + "step": 1660 + }, + { + "epoch": 0.6644, + "grad_norm": 0.5028753280639648, + "learning_rate": 1.8717705368115946e-06, + "loss": 0.4558, + "step": 1661 + }, + { + "epoch": 0.6648, + "grad_norm": 0.5712804794311523, + "learning_rate": 1.8677581742524908e-06, + "loss": 0.5368, + "step": 1662 + }, + { + "epoch": 0.6652, + "grad_norm": 0.4760363698005676, + "learning_rate": 1.8637485511265004e-06, + "loss": 0.5177, + "step": 1663 + }, + { + "epoch": 0.6656, + "grad_norm": 0.5169165730476379, + "learning_rate": 1.8597416741630777e-06, + "loss": 0.6252, + "step": 1664 + }, + { + "epoch": 0.666, + "grad_norm": 0.48152515292167664, + "learning_rate": 1.8557375500870656e-06, + "loss": 0.5348, + "step": 1665 + }, + { + "epoch": 0.6664, + "grad_norm": 0.4943107068538666, + "learning_rate": 1.851736185618686e-06, + "loss": 0.5256, + "step": 1666 + }, + { + "epoch": 0.6668, + "grad_norm": 0.5224728584289551, + "learning_rate": 1.8477375874735342e-06, + "loss": 0.5988, + "step": 1667 + }, + { + "epoch": 0.6672, + "grad_norm": 0.499190092086792, + "learning_rate": 1.8437417623625558e-06, + "loss": 0.5087, + "step": 1668 + }, + { + "epoch": 0.6676, + "grad_norm": 0.5248624682426453, + "learning_rate": 1.8397487169920495e-06, + "loss": 0.4826, + "step": 1669 + }, + { + "epoch": 0.668, + "grad_norm": 0.5038529634475708, + "learning_rate": 1.835758458063644e-06, + "loss": 0.614, + "step": 1670 + }, + { + "epoch": 0.6684, + "grad_norm": 0.541998028755188, + "learning_rate": 1.8317709922742915e-06, + "loss": 0.5436, + "step": 1671 + }, + { + "epoch": 0.6688, + "grad_norm": 0.5454816222190857, + "learning_rate": 1.8277863263162597e-06, + "loss": 0.483, + "step": 1672 + }, + { + "epoch": 0.6692, + "grad_norm": 0.49650028347969055, + "learning_rate": 1.8238044668771123e-06, + "loss": 0.4889, + "step": 1673 + }, + { + "epoch": 0.6696, + "grad_norm": 0.5571967363357544, + "learning_rate": 1.8198254206397094e-06, + "loss": 0.6268, + "step": 1674 + }, + { + "epoch": 0.67, + "grad_norm": 0.5088862776756287, + "learning_rate": 1.8158491942821822e-06, + "loss": 0.4964, + "step": 1675 + }, + { + "epoch": 0.6704, + "grad_norm": 0.4773922264575958, + "learning_rate": 1.8118757944779358e-06, + "loss": 0.4747, + "step": 1676 + }, + { + "epoch": 0.6708, + "grad_norm": 0.5273101925849915, + "learning_rate": 1.807905227895626e-06, + "loss": 0.5286, + "step": 1677 + }, + { + "epoch": 0.6712, + "grad_norm": 0.4688742160797119, + "learning_rate": 1.8039375011991588e-06, + "loss": 0.4147, + "step": 1678 + }, + { + "epoch": 0.6716, + "grad_norm": 0.536142110824585, + "learning_rate": 1.7999726210476696e-06, + "loss": 0.5764, + "step": 1679 + }, + { + "epoch": 0.672, + "grad_norm": 0.5289327502250671, + "learning_rate": 1.7960105940955164e-06, + "loss": 0.523, + "step": 1680 + }, + { + "epoch": 0.6724, + "grad_norm": 0.46021583676338196, + "learning_rate": 1.792051426992273e-06, + "loss": 0.5167, + "step": 1681 + }, + { + "epoch": 0.6728, + "grad_norm": 0.5137984156608582, + "learning_rate": 1.7880951263827083e-06, + "loss": 0.4866, + "step": 1682 + }, + { + "epoch": 0.6732, + "grad_norm": 0.5088829398155212, + "learning_rate": 1.7841416989067848e-06, + "loss": 0.5143, + "step": 1683 + }, + { + "epoch": 0.6736, + "grad_norm": 0.5051218271255493, + "learning_rate": 1.7801911511996384e-06, + "loss": 0.4959, + "step": 1684 + }, + { + "epoch": 0.674, + "grad_norm": 0.5160439610481262, + "learning_rate": 1.7762434898915764e-06, + "loss": 0.5458, + "step": 1685 + }, + { + "epoch": 0.6744, + "grad_norm": 0.47013598680496216, + "learning_rate": 1.7722987216080584e-06, + "loss": 0.5298, + "step": 1686 + }, + { + "epoch": 0.6748, + "grad_norm": 0.507504403591156, + "learning_rate": 1.7683568529696885e-06, + "loss": 0.4713, + "step": 1687 + }, + { + "epoch": 0.6752, + "grad_norm": 0.5315131545066833, + "learning_rate": 1.764417890592208e-06, + "loss": 0.5132, + "step": 1688 + }, + { + "epoch": 0.6756, + "grad_norm": 0.5345341563224792, + "learning_rate": 1.7604818410864753e-06, + "loss": 0.6151, + "step": 1689 + }, + { + "epoch": 0.676, + "grad_norm": 0.5330063700675964, + "learning_rate": 1.7565487110584654e-06, + "loss": 0.6183, + "step": 1690 + }, + { + "epoch": 0.6764, + "grad_norm": 0.5632655620574951, + "learning_rate": 1.752618507109248e-06, + "loss": 0.5759, + "step": 1691 + }, + { + "epoch": 0.6768, + "grad_norm": 0.4822584092617035, + "learning_rate": 1.7486912358349875e-06, + "loss": 0.4738, + "step": 1692 + }, + { + "epoch": 0.6772, + "grad_norm": 0.4986719489097595, + "learning_rate": 1.744766903826921e-06, + "loss": 0.5118, + "step": 1693 + }, + { + "epoch": 0.6776, + "grad_norm": 0.5079983472824097, + "learning_rate": 1.7408455176713574e-06, + "loss": 0.5201, + "step": 1694 + }, + { + "epoch": 0.678, + "grad_norm": 0.5193080902099609, + "learning_rate": 1.7369270839496581e-06, + "loss": 0.5159, + "step": 1695 + }, + { + "epoch": 0.6784, + "grad_norm": 0.4460771679878235, + "learning_rate": 1.7330116092382291e-06, + "loss": 0.521, + "step": 1696 + }, + { + "epoch": 0.6788, + "grad_norm": 0.49275845289230347, + "learning_rate": 1.7290991001085142e-06, + "loss": 0.4818, + "step": 1697 + }, + { + "epoch": 0.6792, + "grad_norm": 0.4485078752040863, + "learning_rate": 1.7251895631269744e-06, + "loss": 0.5208, + "step": 1698 + }, + { + "epoch": 0.6796, + "grad_norm": 0.5298824310302734, + "learning_rate": 1.7212830048550881e-06, + "loss": 0.5929, + "step": 1699 + }, + { + "epoch": 0.68, + "grad_norm": 0.5249450206756592, + "learning_rate": 1.7173794318493306e-06, + "loss": 0.5053, + "step": 1700 + }, + { + "epoch": 0.6804, + "grad_norm": 0.5092018842697144, + "learning_rate": 1.713478850661167e-06, + "loss": 0.5591, + "step": 1701 + }, + { + "epoch": 0.6808, + "grad_norm": 0.522066056728363, + "learning_rate": 1.7095812678370429e-06, + "loss": 0.582, + "step": 1702 + }, + { + "epoch": 0.6812, + "grad_norm": 0.48720628023147583, + "learning_rate": 1.7056866899183727e-06, + "loss": 0.5319, + "step": 1703 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5656712055206299, + "learning_rate": 1.701795123441525e-06, + "loss": 0.5974, + "step": 1704 + }, + { + "epoch": 0.682, + "grad_norm": 0.4969402551651001, + "learning_rate": 1.697906574937813e-06, + "loss": 0.558, + "step": 1705 + }, + { + "epoch": 0.6824, + "grad_norm": 0.5818575620651245, + "learning_rate": 1.6940210509334889e-06, + "loss": 0.628, + "step": 1706 + }, + { + "epoch": 0.6828, + "grad_norm": 0.4737216830253601, + "learning_rate": 1.6901385579497282e-06, + "loss": 0.5157, + "step": 1707 + }, + { + "epoch": 0.6832, + "grad_norm": 0.49812376499176025, + "learning_rate": 1.6862591025026157e-06, + "loss": 0.5326, + "step": 1708 + }, + { + "epoch": 0.6836, + "grad_norm": 0.5161290168762207, + "learning_rate": 1.6823826911031417e-06, + "loss": 0.6267, + "step": 1709 + }, + { + "epoch": 0.684, + "grad_norm": 0.5259966850280762, + "learning_rate": 1.6785093302571843e-06, + "loss": 0.6185, + "step": 1710 + }, + { + "epoch": 0.6844, + "grad_norm": 0.4863537847995758, + "learning_rate": 1.6746390264655055e-06, + "loss": 0.5257, + "step": 1711 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5145780444145203, + "learning_rate": 1.6707717862237358e-06, + "loss": 0.519, + "step": 1712 + }, + { + "epoch": 0.6852, + "grad_norm": 0.48994773626327515, + "learning_rate": 1.6669076160223625e-06, + "loss": 0.441, + "step": 1713 + }, + { + "epoch": 0.6856, + "grad_norm": 0.5490663051605225, + "learning_rate": 1.6630465223467206e-06, + "loss": 0.5786, + "step": 1714 + }, + { + "epoch": 0.686, + "grad_norm": 0.4834598898887634, + "learning_rate": 1.6591885116769795e-06, + "loss": 0.5122, + "step": 1715 + }, + { + "epoch": 0.6864, + "grad_norm": 0.5213835835456848, + "learning_rate": 1.6553335904881427e-06, + "loss": 0.5692, + "step": 1716 + }, + { + "epoch": 0.6868, + "grad_norm": 0.5010420680046082, + "learning_rate": 1.6514817652500198e-06, + "loss": 0.5693, + "step": 1717 + }, + { + "epoch": 0.6872, + "grad_norm": 0.5002087354660034, + "learning_rate": 1.6476330424272277e-06, + "loss": 0.5556, + "step": 1718 + }, + { + "epoch": 0.6876, + "grad_norm": 0.4816271960735321, + "learning_rate": 1.6437874284791743e-06, + "loss": 0.5158, + "step": 1719 + }, + { + "epoch": 0.688, + "grad_norm": 0.4585452973842621, + "learning_rate": 1.6399449298600533e-06, + "loss": 0.523, + "step": 1720 + }, + { + "epoch": 0.6884, + "grad_norm": 0.47453686594963074, + "learning_rate": 1.6361055530188296e-06, + "loss": 0.5716, + "step": 1721 + }, + { + "epoch": 0.6888, + "grad_norm": 0.49845659732818604, + "learning_rate": 1.632269304399226e-06, + "loss": 0.4753, + "step": 1722 + }, + { + "epoch": 0.6892, + "grad_norm": 0.5592690706253052, + "learning_rate": 1.6284361904397163e-06, + "loss": 0.6543, + "step": 1723 + }, + { + "epoch": 0.6896, + "grad_norm": 0.5578603744506836, + "learning_rate": 1.6246062175735108e-06, + "loss": 0.5954, + "step": 1724 + }, + { + "epoch": 0.69, + "grad_norm": 0.5509665012359619, + "learning_rate": 1.6207793922285566e-06, + "loss": 0.5676, + "step": 1725 + }, + { + "epoch": 0.6904, + "grad_norm": 0.5276341438293457, + "learning_rate": 1.6169557208275087e-06, + "loss": 0.5759, + "step": 1726 + }, + { + "epoch": 0.6908, + "grad_norm": 0.5588827729225159, + "learning_rate": 1.6131352097877332e-06, + "loss": 0.5559, + "step": 1727 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5112521052360535, + "learning_rate": 1.6093178655212896e-06, + "loss": 0.5852, + "step": 1728 + }, + { + "epoch": 0.6916, + "grad_norm": 0.4992835819721222, + "learning_rate": 1.6055036944349258e-06, + "loss": 0.4917, + "step": 1729 + }, + { + "epoch": 0.692, + "grad_norm": 0.4967854917049408, + "learning_rate": 1.6016927029300634e-06, + "loss": 0.4255, + "step": 1730 + }, + { + "epoch": 0.6924, + "grad_norm": 0.5047206878662109, + "learning_rate": 1.5978848974027855e-06, + "loss": 0.5217, + "step": 1731 + }, + { + "epoch": 0.6928, + "grad_norm": 0.49704602360725403, + "learning_rate": 1.5940802842438283e-06, + "loss": 0.4804, + "step": 1732 + }, + { + "epoch": 0.6932, + "grad_norm": 0.509590208530426, + "learning_rate": 1.5902788698385736e-06, + "loss": 0.5788, + "step": 1733 + }, + { + "epoch": 0.6936, + "grad_norm": 0.5450599789619446, + "learning_rate": 1.5864806605670296e-06, + "loss": 0.5217, + "step": 1734 + }, + { + "epoch": 0.694, + "grad_norm": 0.49922382831573486, + "learning_rate": 1.58268566280383e-06, + "loss": 0.6029, + "step": 1735 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5624175667762756, + "learning_rate": 1.5788938829182158e-06, + "loss": 0.6244, + "step": 1736 + }, + { + "epoch": 0.6948, + "grad_norm": 0.46818313002586365, + "learning_rate": 1.5751053272740265e-06, + "loss": 0.4942, + "step": 1737 + }, + { + "epoch": 0.6952, + "grad_norm": 0.6104135513305664, + "learning_rate": 1.571320002229693e-06, + "loss": 0.5757, + "step": 1738 + }, + { + "epoch": 0.6956, + "grad_norm": 0.5003908276557922, + "learning_rate": 1.5675379141382236e-06, + "loss": 0.5161, + "step": 1739 + }, + { + "epoch": 0.696, + "grad_norm": 0.48008689284324646, + "learning_rate": 1.5637590693471931e-06, + "loss": 0.5857, + "step": 1740 + }, + { + "epoch": 0.6964, + "grad_norm": 0.49035823345184326, + "learning_rate": 1.559983474198731e-06, + "loss": 0.4777, + "step": 1741 + }, + { + "epoch": 0.6968, + "grad_norm": 0.5132231116294861, + "learning_rate": 1.556211135029518e-06, + "loss": 0.5248, + "step": 1742 + }, + { + "epoch": 0.6972, + "grad_norm": 0.4993639290332794, + "learning_rate": 1.5524420581707644e-06, + "loss": 0.5057, + "step": 1743 + }, + { + "epoch": 0.6976, + "grad_norm": 0.5058372020721436, + "learning_rate": 1.5486762499482106e-06, + "loss": 0.5187, + "step": 1744 + }, + { + "epoch": 0.698, + "grad_norm": 0.4696761667728424, + "learning_rate": 1.5449137166821078e-06, + "loss": 0.5289, + "step": 1745 + }, + { + "epoch": 0.6984, + "grad_norm": 0.5460999011993408, + "learning_rate": 1.5411544646872094e-06, + "loss": 0.5688, + "step": 1746 + }, + { + "epoch": 0.6988, + "grad_norm": 0.47618138790130615, + "learning_rate": 1.5373985002727679e-06, + "loss": 0.5779, + "step": 1747 + }, + { + "epoch": 0.6992, + "grad_norm": 0.520785391330719, + "learning_rate": 1.5336458297425105e-06, + "loss": 0.5344, + "step": 1748 + }, + { + "epoch": 0.6996, + "grad_norm": 0.49487361311912537, + "learning_rate": 1.5298964593946434e-06, + "loss": 0.5133, + "step": 1749 + }, + { + "epoch": 0.7, + "grad_norm": 0.6331418752670288, + "learning_rate": 1.526150395521827e-06, + "loss": 0.5969, + "step": 1750 + }, + { + "epoch": 0.7004, + "grad_norm": 0.513303816318512, + "learning_rate": 1.522407644411179e-06, + "loss": 0.5477, + "step": 1751 + }, + { + "epoch": 0.7008, + "grad_norm": 0.4873301386833191, + "learning_rate": 1.5186682123442518e-06, + "loss": 0.5146, + "step": 1752 + }, + { + "epoch": 0.7012, + "grad_norm": 0.5341429710388184, + "learning_rate": 1.5149321055970316e-06, + "loss": 0.5552, + "step": 1753 + }, + { + "epoch": 0.7016, + "grad_norm": 0.5356207489967346, + "learning_rate": 1.5111993304399213e-06, + "loss": 0.4732, + "step": 1754 + }, + { + "epoch": 0.702, + "grad_norm": 0.5175428986549377, + "learning_rate": 1.5074698931377304e-06, + "loss": 0.574, + "step": 1755 + }, + { + "epoch": 0.7024, + "grad_norm": 0.5094943642616272, + "learning_rate": 1.5037437999496718e-06, + "loss": 0.5922, + "step": 1756 + }, + { + "epoch": 0.7028, + "grad_norm": 0.4792037904262543, + "learning_rate": 1.5000210571293403e-06, + "loss": 0.4763, + "step": 1757 + }, + { + "epoch": 0.7032, + "grad_norm": 0.540791392326355, + "learning_rate": 1.4963016709247127e-06, + "loss": 0.6105, + "step": 1758 + }, + { + "epoch": 0.7036, + "grad_norm": 0.49768590927124023, + "learning_rate": 1.4925856475781271e-06, + "loss": 0.5767, + "step": 1759 + }, + { + "epoch": 0.704, + "grad_norm": 0.5273868441581726, + "learning_rate": 1.4888729933262833e-06, + "loss": 0.5907, + "step": 1760 + }, + { + "epoch": 0.7044, + "grad_norm": 0.5667710900306702, + "learning_rate": 1.4851637144002219e-06, + "loss": 0.5502, + "step": 1761 + }, + { + "epoch": 0.7048, + "grad_norm": 0.5382874608039856, + "learning_rate": 1.4814578170253192e-06, + "loss": 0.5999, + "step": 1762 + }, + { + "epoch": 0.7052, + "grad_norm": 0.4984513819217682, + "learning_rate": 1.47775530742128e-06, + "loss": 0.4988, + "step": 1763 + }, + { + "epoch": 0.7056, + "grad_norm": 0.4884515404701233, + "learning_rate": 1.4740561918021178e-06, + "loss": 0.474, + "step": 1764 + }, + { + "epoch": 0.706, + "grad_norm": 0.46463531255722046, + "learning_rate": 1.470360476376156e-06, + "loss": 0.4821, + "step": 1765 + }, + { + "epoch": 0.7064, + "grad_norm": 0.521081805229187, + "learning_rate": 1.466668167346005e-06, + "loss": 0.5404, + "step": 1766 + }, + { + "epoch": 0.7068, + "grad_norm": 0.5084238648414612, + "learning_rate": 1.462979270908564e-06, + "loss": 0.5357, + "step": 1767 + }, + { + "epoch": 0.7072, + "grad_norm": 0.5334984064102173, + "learning_rate": 1.4592937932549993e-06, + "loss": 0.5774, + "step": 1768 + }, + { + "epoch": 0.7076, + "grad_norm": 0.5306082963943481, + "learning_rate": 1.455611740570745e-06, + "loss": 0.5521, + "step": 1769 + }, + { + "epoch": 0.708, + "grad_norm": 0.5435188412666321, + "learning_rate": 1.4519331190354828e-06, + "loss": 0.6215, + "step": 1770 + }, + { + "epoch": 0.7084, + "grad_norm": 0.46188175678253174, + "learning_rate": 1.4482579348231357e-06, + "loss": 0.5283, + "step": 1771 + }, + { + "epoch": 0.7088, + "grad_norm": 0.47881054878234863, + "learning_rate": 1.4445861941018614e-06, + "loss": 0.4803, + "step": 1772 + }, + { + "epoch": 0.7092, + "grad_norm": 0.5058978796005249, + "learning_rate": 1.4409179030340343e-06, + "loss": 0.5497, + "step": 1773 + }, + { + "epoch": 0.7096, + "grad_norm": 0.4847659468650818, + "learning_rate": 1.4372530677762425e-06, + "loss": 0.5812, + "step": 1774 + }, + { + "epoch": 0.71, + "grad_norm": 0.4816116690635681, + "learning_rate": 1.4335916944792716e-06, + "loss": 0.5475, + "step": 1775 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5226293206214905, + "learning_rate": 1.429933789288097e-06, + "loss": 0.5325, + "step": 1776 + }, + { + "epoch": 0.7108, + "grad_norm": 0.5266181230545044, + "learning_rate": 1.4262793583418757e-06, + "loss": 0.5531, + "step": 1777 + }, + { + "epoch": 0.7112, + "grad_norm": 0.49322509765625, + "learning_rate": 1.422628407773933e-06, + "loss": 0.544, + "step": 1778 + }, + { + "epoch": 0.7116, + "grad_norm": 0.5125441551208496, + "learning_rate": 1.4189809437117513e-06, + "loss": 0.5705, + "step": 1779 + }, + { + "epoch": 0.712, + "grad_norm": 0.449192613363266, + "learning_rate": 1.4153369722769625e-06, + "loss": 0.5128, + "step": 1780 + }, + { + "epoch": 0.7124, + "grad_norm": 0.4418649673461914, + "learning_rate": 1.4116964995853365e-06, + "loss": 0.4764, + "step": 1781 + }, + { + "epoch": 0.7128, + "grad_norm": 0.4699050784111023, + "learning_rate": 1.4080595317467722e-06, + "loss": 0.4901, + "step": 1782 + }, + { + "epoch": 0.7132, + "grad_norm": 0.4823940098285675, + "learning_rate": 1.4044260748652863e-06, + "loss": 0.5816, + "step": 1783 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5200097560882568, + "learning_rate": 1.4007961350390021e-06, + "loss": 0.5738, + "step": 1784 + }, + { + "epoch": 0.714, + "grad_norm": 0.4924733638763428, + "learning_rate": 1.3971697183601376e-06, + "loss": 0.4668, + "step": 1785 + }, + { + "epoch": 0.7144, + "grad_norm": 0.47781214118003845, + "learning_rate": 1.3935468309150032e-06, + "loss": 0.5402, + "step": 1786 + }, + { + "epoch": 0.7148, + "grad_norm": 0.537818968296051, + "learning_rate": 1.3899274787839834e-06, + "loss": 0.5018, + "step": 1787 + }, + { + "epoch": 0.7152, + "grad_norm": 0.545964241027832, + "learning_rate": 1.3863116680415285e-06, + "loss": 0.5537, + "step": 1788 + }, + { + "epoch": 0.7156, + "grad_norm": 0.4749377369880676, + "learning_rate": 1.3826994047561462e-06, + "loss": 0.4985, + "step": 1789 + }, + { + "epoch": 0.716, + "grad_norm": 0.4744401276111603, + "learning_rate": 1.3790906949903884e-06, + "loss": 0.4716, + "step": 1790 + }, + { + "epoch": 0.7164, + "grad_norm": 0.5007680654525757, + "learning_rate": 1.3754855448008464e-06, + "loss": 0.5631, + "step": 1791 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5083097815513611, + "learning_rate": 1.3718839602381367e-06, + "loss": 0.5688, + "step": 1792 + }, + { + "epoch": 0.7172, + "grad_norm": 0.5184915065765381, + "learning_rate": 1.3682859473468897e-06, + "loss": 0.5933, + "step": 1793 + }, + { + "epoch": 0.7176, + "grad_norm": 0.4937378466129303, + "learning_rate": 1.364691512165741e-06, + "loss": 0.5165, + "step": 1794 + }, + { + "epoch": 0.718, + "grad_norm": 0.4511565566062927, + "learning_rate": 1.3611006607273242e-06, + "loss": 0.483, + "step": 1795 + }, + { + "epoch": 0.7184, + "grad_norm": 0.5037986040115356, + "learning_rate": 1.3575133990582574e-06, + "loss": 0.539, + "step": 1796 + }, + { + "epoch": 0.7188, + "grad_norm": 0.5806722044944763, + "learning_rate": 1.353929733179133e-06, + "loss": 0.5049, + "step": 1797 + }, + { + "epoch": 0.7192, + "grad_norm": 0.5695548057556152, + "learning_rate": 1.3503496691045087e-06, + "loss": 0.6149, + "step": 1798 + }, + { + "epoch": 0.7196, + "grad_norm": 0.5636582374572754, + "learning_rate": 1.346773212842896e-06, + "loss": 0.6077, + "step": 1799 + }, + { + "epoch": 0.72, + "grad_norm": 0.5878316164016724, + "learning_rate": 1.3432003703967542e-06, + "loss": 0.5881, + "step": 1800 + }, + { + "epoch": 0.7204, + "grad_norm": 0.44944626092910767, + "learning_rate": 1.339631147762477e-06, + "loss": 0.4826, + "step": 1801 + }, + { + "epoch": 0.7208, + "grad_norm": 0.551231861114502, + "learning_rate": 1.336065550930381e-06, + "loss": 0.5654, + "step": 1802 + }, + { + "epoch": 0.7212, + "grad_norm": 0.4855599105358124, + "learning_rate": 1.3325035858846964e-06, + "loss": 0.5741, + "step": 1803 + }, + { + "epoch": 0.7216, + "grad_norm": 0.47175052762031555, + "learning_rate": 1.328945258603562e-06, + "loss": 0.5032, + "step": 1804 + }, + { + "epoch": 0.722, + "grad_norm": 0.5383630394935608, + "learning_rate": 1.3253905750590099e-06, + "loss": 0.5234, + "step": 1805 + }, + { + "epoch": 0.7224, + "grad_norm": 0.4824926257133484, + "learning_rate": 1.3218395412169554e-06, + "loss": 0.5013, + "step": 1806 + }, + { + "epoch": 0.7228, + "grad_norm": 0.5814263224601746, + "learning_rate": 1.3182921630371892e-06, + "loss": 0.6488, + "step": 1807 + }, + { + "epoch": 0.7232, + "grad_norm": 0.480724036693573, + "learning_rate": 1.314748446473365e-06, + "loss": 0.4736, + "step": 1808 + }, + { + "epoch": 0.7236, + "grad_norm": 0.5075566172599792, + "learning_rate": 1.3112083974729947e-06, + "loss": 0.52, + "step": 1809 + }, + { + "epoch": 0.724, + "grad_norm": 0.523950457572937, + "learning_rate": 1.3076720219774333e-06, + "loss": 0.5094, + "step": 1810 + }, + { + "epoch": 0.7244, + "grad_norm": 0.45218196511268616, + "learning_rate": 1.3041393259218693e-06, + "loss": 0.4614, + "step": 1811 + }, + { + "epoch": 0.7248, + "grad_norm": 0.5422837138175964, + "learning_rate": 1.3006103152353158e-06, + "loss": 0.5605, + "step": 1812 + }, + { + "epoch": 0.7252, + "grad_norm": 0.46532970666885376, + "learning_rate": 1.297084995840602e-06, + "loss": 0.5027, + "step": 1813 + }, + { + "epoch": 0.7256, + "grad_norm": 0.5618458390235901, + "learning_rate": 1.2935633736543628e-06, + "loss": 0.514, + "step": 1814 + }, + { + "epoch": 0.726, + "grad_norm": 0.5310685634613037, + "learning_rate": 1.2900454545870257e-06, + "loss": 0.5001, + "step": 1815 + }, + { + "epoch": 0.7264, + "grad_norm": 0.5191671252250671, + "learning_rate": 1.2865312445428036e-06, + "loss": 0.5151, + "step": 1816 + }, + { + "epoch": 0.7268, + "grad_norm": 0.5187528133392334, + "learning_rate": 1.283020749419684e-06, + "loss": 0.4881, + "step": 1817 + }, + { + "epoch": 0.7272, + "grad_norm": 0.4712491035461426, + "learning_rate": 1.2795139751094222e-06, + "loss": 0.4651, + "step": 1818 + }, + { + "epoch": 0.7276, + "grad_norm": 0.4946507215499878, + "learning_rate": 1.276010927497527e-06, + "loss": 0.6037, + "step": 1819 + }, + { + "epoch": 0.728, + "grad_norm": 0.4676782786846161, + "learning_rate": 1.272511612463252e-06, + "loss": 0.5508, + "step": 1820 + }, + { + "epoch": 0.7284, + "grad_norm": 0.5122756361961365, + "learning_rate": 1.2690160358795858e-06, + "loss": 0.5076, + "step": 1821 + }, + { + "epoch": 0.7288, + "grad_norm": 0.5033440589904785, + "learning_rate": 1.2655242036132466e-06, + "loss": 0.5678, + "step": 1822 + }, + { + "epoch": 0.7292, + "grad_norm": 0.46391579508781433, + "learning_rate": 1.2620361215246624e-06, + "loss": 0.5384, + "step": 1823 + }, + { + "epoch": 0.7296, + "grad_norm": 0.5027047991752625, + "learning_rate": 1.258551795467973e-06, + "loss": 0.4817, + "step": 1824 + }, + { + "epoch": 0.73, + "grad_norm": 0.4803715646266937, + "learning_rate": 1.255071231291011e-06, + "loss": 0.5046, + "step": 1825 + }, + { + "epoch": 0.7304, + "grad_norm": 0.49049100279808044, + "learning_rate": 1.2515944348352947e-06, + "loss": 0.5937, + "step": 1826 + }, + { + "epoch": 0.7308, + "grad_norm": 0.5216216444969177, + "learning_rate": 1.2481214119360212e-06, + "loss": 0.5592, + "step": 1827 + }, + { + "epoch": 0.7312, + "grad_norm": 0.4808286130428314, + "learning_rate": 1.244652168422055e-06, + "loss": 0.5259, + "step": 1828 + }, + { + "epoch": 0.7316, + "grad_norm": 0.49704375863075256, + "learning_rate": 1.2411867101159147e-06, + "loss": 0.5167, + "step": 1829 + }, + { + "epoch": 0.732, + "grad_norm": 0.5660054087638855, + "learning_rate": 1.2377250428337665e-06, + "loss": 0.5109, + "step": 1830 + }, + { + "epoch": 0.7324, + "grad_norm": 0.5108336210250854, + "learning_rate": 1.2342671723854164e-06, + "loss": 0.5556, + "step": 1831 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4856356084346771, + "learning_rate": 1.2308131045742956e-06, + "loss": 0.528, + "step": 1832 + }, + { + "epoch": 0.7332, + "grad_norm": 0.44966772198677063, + "learning_rate": 1.2273628451974558e-06, + "loss": 0.5224, + "step": 1833 + }, + { + "epoch": 0.7336, + "grad_norm": 0.5239558815956116, + "learning_rate": 1.2239164000455528e-06, + "loss": 0.5129, + "step": 1834 + }, + { + "epoch": 0.734, + "grad_norm": 0.5169456005096436, + "learning_rate": 1.2204737749028466e-06, + "loss": 0.5931, + "step": 1835 + }, + { + "epoch": 0.7344, + "grad_norm": 0.441876620054245, + "learning_rate": 1.217034975547181e-06, + "loss": 0.4571, + "step": 1836 + }, + { + "epoch": 0.7348, + "grad_norm": 0.5893264412879944, + "learning_rate": 1.21360000774998e-06, + "loss": 0.5479, + "step": 1837 + }, + { + "epoch": 0.7352, + "grad_norm": 0.5525321960449219, + "learning_rate": 1.2101688772762398e-06, + "loss": 0.5201, + "step": 1838 + }, + { + "epoch": 0.7356, + "grad_norm": 0.5001885294914246, + "learning_rate": 1.2067415898845114e-06, + "loss": 0.5322, + "step": 1839 + }, + { + "epoch": 0.736, + "grad_norm": 0.5684386491775513, + "learning_rate": 1.2033181513269025e-06, + "loss": 0.5115, + "step": 1840 + }, + { + "epoch": 0.7364, + "grad_norm": 0.6078344583511353, + "learning_rate": 1.1998985673490533e-06, + "loss": 0.6907, + "step": 1841 + }, + { + "epoch": 0.7368, + "grad_norm": 0.5336594581604004, + "learning_rate": 1.196482843690143e-06, + "loss": 0.5895, + "step": 1842 + }, + { + "epoch": 0.7372, + "grad_norm": 0.48478782176971436, + "learning_rate": 1.1930709860828638e-06, + "loss": 0.4413, + "step": 1843 + }, + { + "epoch": 0.7376, + "grad_norm": 0.5144538879394531, + "learning_rate": 1.1896630002534264e-06, + "loss": 0.5178, + "step": 1844 + }, + { + "epoch": 0.738, + "grad_norm": 0.4920802712440491, + "learning_rate": 1.1862588919215395e-06, + "loss": 0.5442, + "step": 1845 + }, + { + "epoch": 0.7384, + "grad_norm": 0.46744486689567566, + "learning_rate": 1.1828586668004037e-06, + "loss": 0.4624, + "step": 1846 + }, + { + "epoch": 0.7388, + "grad_norm": 0.4999946355819702, + "learning_rate": 1.1794623305967057e-06, + "loss": 0.4912, + "step": 1847 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5071706175804138, + "learning_rate": 1.1760698890106015e-06, + "loss": 0.5316, + "step": 1848 + }, + { + "epoch": 0.7396, + "grad_norm": 0.5435724854469299, + "learning_rate": 1.1726813477357138e-06, + "loss": 0.5537, + "step": 1849 + }, + { + "epoch": 0.74, + "grad_norm": 0.5288483500480652, + "learning_rate": 1.169296712459117e-06, + "loss": 0.549, + "step": 1850 + }, + { + "epoch": 0.7404, + "grad_norm": 0.5118080377578735, + "learning_rate": 1.16591598886133e-06, + "loss": 0.51, + "step": 1851 + }, + { + "epoch": 0.7408, + "grad_norm": 0.49369898438453674, + "learning_rate": 1.162539182616309e-06, + "loss": 0.5163, + "step": 1852 + }, + { + "epoch": 0.7412, + "grad_norm": 0.4700876474380493, + "learning_rate": 1.1591662993914344e-06, + "loss": 0.4869, + "step": 1853 + }, + { + "epoch": 0.7416, + "grad_norm": 0.4725787937641144, + "learning_rate": 1.1557973448475015e-06, + "loss": 0.4726, + "step": 1854 + }, + { + "epoch": 0.742, + "grad_norm": 0.5252476334571838, + "learning_rate": 1.1524323246387127e-06, + "loss": 0.5113, + "step": 1855 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5266048908233643, + "learning_rate": 1.1490712444126662e-06, + "loss": 0.5853, + "step": 1856 + }, + { + "epoch": 0.7428, + "grad_norm": 0.49874410033226013, + "learning_rate": 1.1457141098103494e-06, + "loss": 0.4883, + "step": 1857 + }, + { + "epoch": 0.7432, + "grad_norm": 0.4535691738128662, + "learning_rate": 1.142360926466129e-06, + "loss": 0.426, + "step": 1858 + }, + { + "epoch": 0.7436, + "grad_norm": 0.49276694655418396, + "learning_rate": 1.139011700007736e-06, + "loss": 0.5052, + "step": 1859 + }, + { + "epoch": 0.744, + "grad_norm": 0.4988146126270294, + "learning_rate": 1.1356664360562627e-06, + "loss": 0.5846, + "step": 1860 + }, + { + "epoch": 0.7444, + "grad_norm": 0.574359118938446, + "learning_rate": 1.1323251402261509e-06, + "loss": 0.5119, + "step": 1861 + }, + { + "epoch": 0.7448, + "grad_norm": 0.5316257476806641, + "learning_rate": 1.1289878181251847e-06, + "loss": 0.5468, + "step": 1862 + }, + { + "epoch": 0.7452, + "grad_norm": 0.4588572680950165, + "learning_rate": 1.1256544753544755e-06, + "loss": 0.4288, + "step": 1863 + }, + { + "epoch": 0.7456, + "grad_norm": 0.5082074403762817, + "learning_rate": 1.1223251175084573e-06, + "loss": 0.5143, + "step": 1864 + }, + { + "epoch": 0.746, + "grad_norm": 0.43266239762306213, + "learning_rate": 1.1189997501748754e-06, + "loss": 0.4633, + "step": 1865 + }, + { + "epoch": 0.7464, + "grad_norm": 0.5357276201248169, + "learning_rate": 1.11567837893478e-06, + "loss": 0.5964, + "step": 1866 + }, + { + "epoch": 0.7468, + "grad_norm": 0.5177252292633057, + "learning_rate": 1.112361009362514e-06, + "loss": 0.5603, + "step": 1867 + }, + { + "epoch": 0.7472, + "grad_norm": 0.4995522201061249, + "learning_rate": 1.1090476470257024e-06, + "loss": 0.5252, + "step": 1868 + }, + { + "epoch": 0.7476, + "grad_norm": 0.5316324234008789, + "learning_rate": 1.1057382974852448e-06, + "loss": 0.4449, + "step": 1869 + }, + { + "epoch": 0.748, + "grad_norm": 0.5401836633682251, + "learning_rate": 1.1024329662953083e-06, + "loss": 0.5806, + "step": 1870 + }, + { + "epoch": 0.7484, + "grad_norm": 0.4464450180530548, + "learning_rate": 1.0991316590033152e-06, + "loss": 0.4032, + "step": 1871 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5609473586082458, + "learning_rate": 1.095834381149933e-06, + "loss": 0.5282, + "step": 1872 + }, + { + "epoch": 0.7492, + "grad_norm": 0.46891769766807556, + "learning_rate": 1.0925411382690669e-06, + "loss": 0.5425, + "step": 1873 + }, + { + "epoch": 0.7496, + "grad_norm": 0.5601791739463806, + "learning_rate": 1.0892519358878497e-06, + "loss": 0.6361, + "step": 1874 + }, + { + "epoch": 0.75, + "grad_norm": 0.5295112729072571, + "learning_rate": 1.0859667795266348e-06, + "loss": 0.5997, + "step": 1875 + }, + { + "epoch": 0.7504, + "grad_norm": 0.5210896730422974, + "learning_rate": 1.0826856746989853e-06, + "loss": 0.5908, + "step": 1876 + }, + { + "epoch": 0.7508, + "grad_norm": 0.5097439885139465, + "learning_rate": 1.0794086269116617e-06, + "loss": 0.5604, + "step": 1877 + }, + { + "epoch": 0.7512, + "grad_norm": 0.492924302816391, + "learning_rate": 1.076135641664616e-06, + "loss": 0.5139, + "step": 1878 + }, + { + "epoch": 0.7516, + "grad_norm": 0.5186163783073425, + "learning_rate": 1.0728667244509831e-06, + "loss": 0.5638, + "step": 1879 + }, + { + "epoch": 0.752, + "grad_norm": 0.5136158466339111, + "learning_rate": 1.069601880757073e-06, + "loss": 0.5374, + "step": 1880 + }, + { + "epoch": 0.7524, + "grad_norm": 0.5937939882278442, + "learning_rate": 1.066341116062354e-06, + "loss": 0.6895, + "step": 1881 + }, + { + "epoch": 0.7528, + "grad_norm": 0.4891611933708191, + "learning_rate": 1.0630844358394507e-06, + "loss": 0.5208, + "step": 1882 + }, + { + "epoch": 0.7532, + "grad_norm": 0.43400999903678894, + "learning_rate": 1.0598318455541317e-06, + "loss": 0.4802, + "step": 1883 + }, + { + "epoch": 0.7536, + "grad_norm": 0.5274035930633545, + "learning_rate": 1.0565833506653034e-06, + "loss": 0.5246, + "step": 1884 + }, + { + "epoch": 0.754, + "grad_norm": 0.538054883480072, + "learning_rate": 1.0533389566249991e-06, + "loss": 0.5674, + "step": 1885 + }, + { + "epoch": 0.7544, + "grad_norm": 0.44231492280960083, + "learning_rate": 1.0500986688783665e-06, + "loss": 0.4748, + "step": 1886 + }, + { + "epoch": 0.7548, + "grad_norm": 0.5154666900634766, + "learning_rate": 1.0468624928636623e-06, + "loss": 0.619, + "step": 1887 + }, + { + "epoch": 0.7552, + "grad_norm": 0.5303316712379456, + "learning_rate": 1.0436304340122446e-06, + "loss": 0.5738, + "step": 1888 + }, + { + "epoch": 0.7556, + "grad_norm": 0.5067679286003113, + "learning_rate": 1.0404024977485612e-06, + "loss": 0.498, + "step": 1889 + }, + { + "epoch": 0.756, + "grad_norm": 0.5611936450004578, + "learning_rate": 1.0371786894901385e-06, + "loss": 0.6098, + "step": 1890 + }, + { + "epoch": 0.7564, + "grad_norm": 0.5335197448730469, + "learning_rate": 1.0339590146475765e-06, + "loss": 0.593, + "step": 1891 + }, + { + "epoch": 0.7568, + "grad_norm": 0.48436692357063293, + "learning_rate": 1.030743478624537e-06, + "loss": 0.5192, + "step": 1892 + }, + { + "epoch": 0.7572, + "grad_norm": 0.4684750437736511, + "learning_rate": 1.0275320868177364e-06, + "loss": 0.5287, + "step": 1893 + }, + { + "epoch": 0.7576, + "grad_norm": 0.5043955445289612, + "learning_rate": 1.0243248446169373e-06, + "loss": 0.5264, + "step": 1894 + }, + { + "epoch": 0.758, + "grad_norm": 0.5075042247772217, + "learning_rate": 1.0211217574049342e-06, + "loss": 0.584, + "step": 1895 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5217066407203674, + "learning_rate": 1.0179228305575502e-06, + "loss": 0.5258, + "step": 1896 + }, + { + "epoch": 0.7588, + "grad_norm": 0.5070008635520935, + "learning_rate": 1.0147280694436272e-06, + "loss": 0.5319, + "step": 1897 + }, + { + "epoch": 0.7592, + "grad_norm": 0.49269282817840576, + "learning_rate": 1.0115374794250126e-06, + "loss": 0.4965, + "step": 1898 + }, + { + "epoch": 0.7596, + "grad_norm": 0.5396968722343445, + "learning_rate": 1.0083510658565568e-06, + "loss": 0.4938, + "step": 1899 + }, + { + "epoch": 0.76, + "grad_norm": 0.5112387537956238, + "learning_rate": 1.0051688340860985e-06, + "loss": 0.5559, + "step": 1900 + }, + { + "epoch": 0.7604, + "grad_norm": 0.5223495960235596, + "learning_rate": 1.0019907894544567e-06, + "loss": 0.5585, + "step": 1901 + }, + { + "epoch": 0.7608, + "grad_norm": 0.47718486189842224, + "learning_rate": 9.98816937295426e-07, + "loss": 0.5632, + "step": 1902 + }, + { + "epoch": 0.7612, + "grad_norm": 0.5641752481460571, + "learning_rate": 9.956472829357654e-07, + "loss": 0.541, + "step": 1903 + }, + { + "epoch": 0.7616, + "grad_norm": 0.5478566884994507, + "learning_rate": 9.924818316951848e-07, + "loss": 0.5653, + "step": 1904 + }, + { + "epoch": 0.762, + "grad_norm": 0.5488006472587585, + "learning_rate": 9.893205888863411e-07, + "loss": 0.5503, + "step": 1905 + }, + { + "epoch": 0.7624, + "grad_norm": 0.5716015696525574, + "learning_rate": 9.86163559814831e-07, + "loss": 0.5462, + "step": 1906 + }, + { + "epoch": 0.7628, + "grad_norm": 0.5241632461547852, + "learning_rate": 9.830107497791743e-07, + "loss": 0.504, + "step": 1907 + }, + { + "epoch": 0.7632, + "grad_norm": 0.5002646446228027, + "learning_rate": 9.798621640708148e-07, + "loss": 0.5395, + "step": 1908 + }, + { + "epoch": 0.7636, + "grad_norm": 0.5340526103973389, + "learning_rate": 9.76717807974104e-07, + "loss": 0.5614, + "step": 1909 + }, + { + "epoch": 0.764, + "grad_norm": 0.45730161666870117, + "learning_rate": 9.73577686766293e-07, + "loss": 0.4557, + "step": 1910 + }, + { + "epoch": 0.7644, + "grad_norm": 0.5114016532897949, + "learning_rate": 9.704418057175296e-07, + "loss": 0.5585, + "step": 1911 + }, + { + "epoch": 0.7648, + "grad_norm": 0.5574830770492554, + "learning_rate": 9.673101700908416e-07, + "loss": 0.6155, + "step": 1912 + }, + { + "epoch": 0.7652, + "grad_norm": 0.5143188238143921, + "learning_rate": 9.641827851421342e-07, + "loss": 0.559, + "step": 1913 + }, + { + "epoch": 0.7656, + "grad_norm": 0.5527118444442749, + "learning_rate": 9.610596561201755e-07, + "loss": 0.5481, + "step": 1914 + }, + { + "epoch": 0.766, + "grad_norm": 0.5203020572662354, + "learning_rate": 9.579407882665957e-07, + "loss": 0.4671, + "step": 1915 + }, + { + "epoch": 0.7664, + "grad_norm": 0.5298575758934021, + "learning_rate": 9.548261868158677e-07, + "loss": 0.4934, + "step": 1916 + }, + { + "epoch": 0.7668, + "grad_norm": 0.48027393221855164, + "learning_rate": 9.517158569953065e-07, + "loss": 0.5464, + "step": 1917 + }, + { + "epoch": 0.7672, + "grad_norm": 0.45088398456573486, + "learning_rate": 9.486098040250603e-07, + "loss": 0.525, + "step": 1918 + }, + { + "epoch": 0.7676, + "grad_norm": 0.488944411277771, + "learning_rate": 9.455080331180945e-07, + "loss": 0.4518, + "step": 1919 + }, + { + "epoch": 0.768, + "grad_norm": 0.4816155433654785, + "learning_rate": 9.424105494801924e-07, + "loss": 0.4794, + "step": 1920 + }, + { + "epoch": 0.7684, + "grad_norm": 0.5139371752738953, + "learning_rate": 9.393173583099383e-07, + "loss": 0.544, + "step": 1921 + }, + { + "epoch": 0.7688, + "grad_norm": 0.5136451125144958, + "learning_rate": 9.362284647987152e-07, + "loss": 0.5746, + "step": 1922 + }, + { + "epoch": 0.7692, + "grad_norm": 0.514636218547821, + "learning_rate": 9.331438741306904e-07, + "loss": 0.5222, + "step": 1923 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5739938020706177, + "learning_rate": 9.300635914828133e-07, + "loss": 0.5859, + "step": 1924 + }, + { + "epoch": 0.77, + "grad_norm": 0.47024837136268616, + "learning_rate": 9.269876220247995e-07, + "loss": 0.4757, + "step": 1925 + }, + { + "epoch": 0.7704, + "grad_norm": 0.4559095799922943, + "learning_rate": 9.239159709191258e-07, + "loss": 0.5346, + "step": 1926 + }, + { + "epoch": 0.7708, + "grad_norm": 0.5137647390365601, + "learning_rate": 9.208486433210255e-07, + "loss": 0.5204, + "step": 1927 + }, + { + "epoch": 0.7712, + "grad_norm": 0.47413358092308044, + "learning_rate": 9.177856443784699e-07, + "loss": 0.5305, + "step": 1928 + }, + { + "epoch": 0.7716, + "grad_norm": 0.5467235445976257, + "learning_rate": 9.147269792321709e-07, + "loss": 0.6016, + "step": 1929 + }, + { + "epoch": 0.772, + "grad_norm": 0.4952782094478607, + "learning_rate": 9.116726530155633e-07, + "loss": 0.5398, + "step": 1930 + }, + { + "epoch": 0.7724, + "grad_norm": 0.518825113773346, + "learning_rate": 9.086226708547993e-07, + "loss": 0.6043, + "step": 1931 + }, + { + "epoch": 0.7728, + "grad_norm": 0.5249910950660706, + "learning_rate": 9.055770378687427e-07, + "loss": 0.5465, + "step": 1932 + }, + { + "epoch": 0.7732, + "grad_norm": 0.5710473656654358, + "learning_rate": 9.02535759168959e-07, + "loss": 0.5921, + "step": 1933 + }, + { + "epoch": 0.7736, + "grad_norm": 0.4922305643558502, + "learning_rate": 8.994988398597018e-07, + "loss": 0.5021, + "step": 1934 + }, + { + "epoch": 0.774, + "grad_norm": 0.5559587478637695, + "learning_rate": 8.9646628503791e-07, + "loss": 0.5457, + "step": 1935 + }, + { + "epoch": 0.7744, + "grad_norm": 0.5352874994277954, + "learning_rate": 8.934380997931993e-07, + "loss": 0.5983, + "step": 1936 + }, + { + "epoch": 0.7748, + "grad_norm": 0.5431384444236755, + "learning_rate": 8.904142892078484e-07, + "loss": 0.5717, + "step": 1937 + }, + { + "epoch": 0.7752, + "grad_norm": 0.5083884596824646, + "learning_rate": 8.87394858356798e-07, + "loss": 0.5722, + "step": 1938 + }, + { + "epoch": 0.7756, + "grad_norm": 0.520429790019989, + "learning_rate": 8.843798123076346e-07, + "loss": 0.4715, + "step": 1939 + }, + { + "epoch": 0.776, + "grad_norm": 0.527215838432312, + "learning_rate": 8.813691561205864e-07, + "loss": 0.5459, + "step": 1940 + }, + { + "epoch": 0.7764, + "grad_norm": 0.5186598300933838, + "learning_rate": 8.783628948485153e-07, + "loss": 0.5213, + "step": 1941 + }, + { + "epoch": 0.7768, + "grad_norm": 0.5035976767539978, + "learning_rate": 8.75361033536907e-07, + "loss": 0.5397, + "step": 1942 + }, + { + "epoch": 0.7772, + "grad_norm": 0.5414839386940002, + "learning_rate": 8.723635772238613e-07, + "loss": 0.4567, + "step": 1943 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5333899855613708, + "learning_rate": 8.693705309400856e-07, + "loss": 0.5362, + "step": 1944 + }, + { + "epoch": 0.778, + "grad_norm": 0.5623505711555481, + "learning_rate": 8.663818997088838e-07, + "loss": 0.6472, + "step": 1945 + }, + { + "epoch": 0.7784, + "grad_norm": 0.5305650234222412, + "learning_rate": 8.633976885461559e-07, + "loss": 0.5869, + "step": 1946 + }, + { + "epoch": 0.7788, + "grad_norm": 0.5463561415672302, + "learning_rate": 8.604179024603777e-07, + "loss": 0.6323, + "step": 1947 + }, + { + "epoch": 0.7792, + "grad_norm": 0.5228492617607117, + "learning_rate": 8.574425464526005e-07, + "loss": 0.5033, + "step": 1948 + }, + { + "epoch": 0.7796, + "grad_norm": 0.4840620160102844, + "learning_rate": 8.544716255164388e-07, + "loss": 0.5229, + "step": 1949 + }, + { + "epoch": 0.78, + "grad_norm": 0.5401861667633057, + "learning_rate": 8.515051446380663e-07, + "loss": 0.597, + "step": 1950 + }, + { + "epoch": 0.7804, + "grad_norm": 0.5699217319488525, + "learning_rate": 8.485431087962045e-07, + "loss": 0.5568, + "step": 1951 + }, + { + "epoch": 0.7808, + "grad_norm": 0.46224555373191833, + "learning_rate": 8.455855229621131e-07, + "loss": 0.5103, + "step": 1952 + }, + { + "epoch": 0.7812, + "grad_norm": 0.5405699610710144, + "learning_rate": 8.426323920995829e-07, + "loss": 0.5714, + "step": 1953 + }, + { + "epoch": 0.7816, + "grad_norm": 0.5014129877090454, + "learning_rate": 8.396837211649276e-07, + "loss": 0.5612, + "step": 1954 + }, + { + "epoch": 0.782, + "grad_norm": 0.5532823801040649, + "learning_rate": 8.36739515106981e-07, + "loss": 0.5648, + "step": 1955 + }, + { + "epoch": 0.7824, + "grad_norm": 0.49977415800094604, + "learning_rate": 8.337997788670768e-07, + "loss": 0.486, + "step": 1956 + }, + { + "epoch": 0.7828, + "grad_norm": 0.5309237837791443, + "learning_rate": 8.308645173790498e-07, + "loss": 0.524, + "step": 1957 + }, + { + "epoch": 0.7832, + "grad_norm": 0.5683520436286926, + "learning_rate": 8.279337355692226e-07, + "loss": 0.5687, + "step": 1958 + }, + { + "epoch": 0.7836, + "grad_norm": 0.5215836763381958, + "learning_rate": 8.250074383564028e-07, + "loss": 0.5405, + "step": 1959 + }, + { + "epoch": 0.784, + "grad_norm": 0.5125592350959778, + "learning_rate": 8.220856306518701e-07, + "loss": 0.573, + "step": 1960 + }, + { + "epoch": 0.7844, + "grad_norm": 0.4813899099826813, + "learning_rate": 8.191683173593684e-07, + "loss": 0.4918, + "step": 1961 + }, + { + "epoch": 0.7848, + "grad_norm": 0.5178664326667786, + "learning_rate": 8.162555033750991e-07, + "loss": 0.4384, + "step": 1962 + }, + { + "epoch": 0.7852, + "grad_norm": 0.49676766991615295, + "learning_rate": 8.133471935877101e-07, + "loss": 0.5077, + "step": 1963 + }, + { + "epoch": 0.7856, + "grad_norm": 0.4734450876712799, + "learning_rate": 8.104433928782957e-07, + "loss": 0.5201, + "step": 1964 + }, + { + "epoch": 0.786, + "grad_norm": 0.5320947766304016, + "learning_rate": 8.07544106120377e-07, + "loss": 0.5475, + "step": 1965 + }, + { + "epoch": 0.7864, + "grad_norm": 0.5560709238052368, + "learning_rate": 8.046493381799007e-07, + "loss": 0.5751, + "step": 1966 + }, + { + "epoch": 0.7868, + "grad_norm": 0.5085044503211975, + "learning_rate": 8.017590939152293e-07, + "loss": 0.5635, + "step": 1967 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5624022483825684, + "learning_rate": 7.988733781771339e-07, + "loss": 0.4983, + "step": 1968 + }, + { + "epoch": 0.7876, + "grad_norm": 0.5629763603210449, + "learning_rate": 7.95992195808786e-07, + "loss": 0.6108, + "step": 1969 + }, + { + "epoch": 0.788, + "grad_norm": 0.502626895904541, + "learning_rate": 7.931155516457466e-07, + "loss": 0.5313, + "step": 1970 + }, + { + "epoch": 0.7884, + "grad_norm": 0.514829695224762, + "learning_rate": 7.902434505159604e-07, + "loss": 0.4693, + "step": 1971 + }, + { + "epoch": 0.7888, + "grad_norm": 0.5001904964447021, + "learning_rate": 7.873758972397493e-07, + "loss": 0.4843, + "step": 1972 + }, + { + "epoch": 0.7892, + "grad_norm": 0.5443049073219299, + "learning_rate": 7.845128966297995e-07, + "loss": 0.5105, + "step": 1973 + }, + { + "epoch": 0.7896, + "grad_norm": 0.4830927550792694, + "learning_rate": 7.816544534911601e-07, + "loss": 0.5576, + "step": 1974 + }, + { + "epoch": 0.79, + "grad_norm": 0.518970787525177, + "learning_rate": 7.788005726212282e-07, + "loss": 0.6164, + "step": 1975 + }, + { + "epoch": 0.7904, + "grad_norm": 0.6245747804641724, + "learning_rate": 7.759512588097436e-07, + "loss": 0.5214, + "step": 1976 + }, + { + "epoch": 0.7908, + "grad_norm": 0.4417863190174103, + "learning_rate": 7.731065168387841e-07, + "loss": 0.4494, + "step": 1977 + }, + { + "epoch": 0.7912, + "grad_norm": 0.5286259651184082, + "learning_rate": 7.702663514827517e-07, + "loss": 0.5324, + "step": 1978 + }, + { + "epoch": 0.7916, + "grad_norm": 0.5066895484924316, + "learning_rate": 7.674307675083699e-07, + "loss": 0.4509, + "step": 1979 + }, + { + "epoch": 0.792, + "grad_norm": 0.4970628321170807, + "learning_rate": 7.645997696746695e-07, + "loss": 0.6078, + "step": 1980 + }, + { + "epoch": 0.7924, + "grad_norm": 0.5808789730072021, + "learning_rate": 7.617733627329888e-07, + "loss": 0.5558, + "step": 1981 + }, + { + "epoch": 0.7928, + "grad_norm": 0.5080152153968811, + "learning_rate": 7.589515514269565e-07, + "loss": 0.5846, + "step": 1982 + }, + { + "epoch": 0.7932, + "grad_norm": 0.4858357608318329, + "learning_rate": 7.561343404924919e-07, + "loss": 0.5286, + "step": 1983 + }, + { + "epoch": 0.7936, + "grad_norm": 0.5149843096733093, + "learning_rate": 7.533217346577921e-07, + "loss": 0.5441, + "step": 1984 + }, + { + "epoch": 0.794, + "grad_norm": 0.5807613134384155, + "learning_rate": 7.505137386433237e-07, + "loss": 0.5804, + "step": 1985 + }, + { + "epoch": 0.7944, + "grad_norm": 0.48946282267570496, + "learning_rate": 7.477103571618203e-07, + "loss": 0.5502, + "step": 1986 + }, + { + "epoch": 0.7948, + "grad_norm": 0.5640960931777954, + "learning_rate": 7.449115949182662e-07, + "loss": 0.6595, + "step": 1987 + }, + { + "epoch": 0.7952, + "grad_norm": 0.5175038576126099, + "learning_rate": 7.421174566098976e-07, + "loss": 0.4677, + "step": 1988 + }, + { + "epoch": 0.7956, + "grad_norm": 0.5714621543884277, + "learning_rate": 7.393279469261867e-07, + "loss": 0.505, + "step": 1989 + }, + { + "epoch": 0.796, + "grad_norm": 0.49548372626304626, + "learning_rate": 7.365430705488399e-07, + "loss": 0.5464, + "step": 1990 + }, + { + "epoch": 0.7964, + "grad_norm": 0.6030895113945007, + "learning_rate": 7.337628321517861e-07, + "loss": 0.5169, + "step": 1991 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5378836989402771, + "learning_rate": 7.309872364011688e-07, + "loss": 0.5947, + "step": 1992 + }, + { + "epoch": 0.7972, + "grad_norm": 0.5616456270217896, + "learning_rate": 7.282162879553437e-07, + "loss": 0.6075, + "step": 1993 + }, + { + "epoch": 0.7976, + "grad_norm": 0.48758378624916077, + "learning_rate": 7.254499914648619e-07, + "loss": 0.5536, + "step": 1994 + }, + { + "epoch": 0.798, + "grad_norm": 0.5290488600730896, + "learning_rate": 7.226883515724715e-07, + "loss": 0.5948, + "step": 1995 + }, + { + "epoch": 0.7984, + "grad_norm": 0.5616166591644287, + "learning_rate": 7.19931372913101e-07, + "loss": 0.5515, + "step": 1996 + }, + { + "epoch": 0.7988, + "grad_norm": 0.4974389672279358, + "learning_rate": 7.171790601138605e-07, + "loss": 0.5033, + "step": 1997 + }, + { + "epoch": 0.7992, + "grad_norm": 0.5275691747665405, + "learning_rate": 7.144314177940246e-07, + "loss": 0.5464, + "step": 1998 + }, + { + "epoch": 0.7996, + "grad_norm": 0.5309053063392639, + "learning_rate": 7.116884505650332e-07, + "loss": 0.5704, + "step": 1999 + }, + { + "epoch": 0.8, + "grad_norm": 0.5537293553352356, + "learning_rate": 7.089501630304779e-07, + "loss": 0.4483, + "step": 2000 + }, + { + "epoch": 0.8004, + "grad_norm": 0.467911034822464, + "learning_rate": 7.062165597860947e-07, + "loss": 0.4794, + "step": 2001 + }, + { + "epoch": 0.8008, + "grad_norm": 0.5319057703018188, + "learning_rate": 7.034876454197625e-07, + "loss": 0.5058, + "step": 2002 + }, + { + "epoch": 0.8012, + "grad_norm": 0.5028004050254822, + "learning_rate": 7.007634245114858e-07, + "loss": 0.5143, + "step": 2003 + }, + { + "epoch": 0.8016, + "grad_norm": 0.47743862867355347, + "learning_rate": 6.980439016333954e-07, + "loss": 0.46, + "step": 2004 + }, + { + "epoch": 0.802, + "grad_norm": 0.5376464128494263, + "learning_rate": 6.953290813497354e-07, + "loss": 0.5481, + "step": 2005 + }, + { + "epoch": 0.8024, + "grad_norm": 0.48827072978019714, + "learning_rate": 6.926189682168575e-07, + "loss": 0.5273, + "step": 2006 + }, + { + "epoch": 0.8028, + "grad_norm": 0.5766599178314209, + "learning_rate": 6.899135667832138e-07, + "loss": 0.553, + "step": 2007 + }, + { + "epoch": 0.8032, + "grad_norm": 0.6004101037979126, + "learning_rate": 6.872128815893502e-07, + "loss": 0.5864, + "step": 2008 + }, + { + "epoch": 0.8036, + "grad_norm": 0.5494182705879211, + "learning_rate": 6.845169171678945e-07, + "loss": 0.6029, + "step": 2009 + }, + { + "epoch": 0.804, + "grad_norm": 0.5339188575744629, + "learning_rate": 6.81825678043551e-07, + "loss": 0.5386, + "step": 2010 + }, + { + "epoch": 0.8044, + "grad_norm": 0.5146757960319519, + "learning_rate": 6.79139168733098e-07, + "loss": 0.5751, + "step": 2011 + }, + { + "epoch": 0.8048, + "grad_norm": 0.6246601939201355, + "learning_rate": 6.764573937453698e-07, + "loss": 0.5433, + "step": 2012 + }, + { + "epoch": 0.8052, + "grad_norm": 0.5111002326011658, + "learning_rate": 6.737803575812601e-07, + "loss": 0.5189, + "step": 2013 + }, + { + "epoch": 0.8056, + "grad_norm": 0.45920640230178833, + "learning_rate": 6.71108064733706e-07, + "loss": 0.4705, + "step": 2014 + }, + { + "epoch": 0.806, + "grad_norm": 0.5450352430343628, + "learning_rate": 6.684405196876843e-07, + "loss": 0.5344, + "step": 2015 + }, + { + "epoch": 0.8064, + "grad_norm": 0.5059295892715454, + "learning_rate": 6.657777269202041e-07, + "loss": 0.466, + "step": 2016 + }, + { + "epoch": 0.8068, + "grad_norm": 0.48422038555145264, + "learning_rate": 6.631196909002998e-07, + "loss": 0.48, + "step": 2017 + }, + { + "epoch": 0.8072, + "grad_norm": 0.5188162326812744, + "learning_rate": 6.604664160890203e-07, + "loss": 0.5654, + "step": 2018 + }, + { + "epoch": 0.8076, + "grad_norm": 0.48736605048179626, + "learning_rate": 6.57817906939424e-07, + "loss": 0.5568, + "step": 2019 + }, + { + "epoch": 0.808, + "grad_norm": 0.5381132960319519, + "learning_rate": 6.551741678965707e-07, + "loss": 0.4822, + "step": 2020 + }, + { + "epoch": 0.8084, + "grad_norm": 0.4702557623386383, + "learning_rate": 6.525352033975163e-07, + "loss": 0.4102, + "step": 2021 + }, + { + "epoch": 0.8088, + "grad_norm": 0.5244594216346741, + "learning_rate": 6.49901017871303e-07, + "loss": 0.5595, + "step": 2022 + }, + { + "epoch": 0.8092, + "grad_norm": 0.5497479438781738, + "learning_rate": 6.4727161573895e-07, + "loss": 0.513, + "step": 2023 + }, + { + "epoch": 0.8096, + "grad_norm": 0.5612243413925171, + "learning_rate": 6.446470014134504e-07, + "loss": 0.5265, + "step": 2024 + }, + { + "epoch": 0.81, + "grad_norm": 0.576653003692627, + "learning_rate": 6.420271792997612e-07, + "loss": 0.6046, + "step": 2025 + }, + { + "epoch": 0.8104, + "grad_norm": 0.5197085738182068, + "learning_rate": 6.394121537947986e-07, + "loss": 0.5626, + "step": 2026 + }, + { + "epoch": 0.8108, + "grad_norm": 0.5471281409263611, + "learning_rate": 6.368019292874246e-07, + "loss": 0.5512, + "step": 2027 + }, + { + "epoch": 0.8112, + "grad_norm": 0.5379735827445984, + "learning_rate": 6.341965101584466e-07, + "loss": 0.5143, + "step": 2028 + }, + { + "epoch": 0.8116, + "grad_norm": 0.5038474798202515, + "learning_rate": 6.315959007806046e-07, + "loss": 0.4985, + "step": 2029 + }, + { + "epoch": 0.812, + "grad_norm": 0.6646623611450195, + "learning_rate": 6.290001055185684e-07, + "loss": 0.6237, + "step": 2030 + }, + { + "epoch": 0.8124, + "grad_norm": 0.5201054811477661, + "learning_rate": 6.264091287289292e-07, + "loss": 0.5631, + "step": 2031 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4982958137989044, + "learning_rate": 6.238229747601876e-07, + "loss": 0.5284, + "step": 2032 + }, + { + "epoch": 0.8132, + "grad_norm": 0.5728126764297485, + "learning_rate": 6.212416479527513e-07, + "loss": 0.5772, + "step": 2033 + }, + { + "epoch": 0.8136, + "grad_norm": 0.5694175362586975, + "learning_rate": 6.186651526389281e-07, + "loss": 0.5493, + "step": 2034 + }, + { + "epoch": 0.814, + "grad_norm": 0.4679771065711975, + "learning_rate": 6.160934931429164e-07, + "loss": 0.4774, + "step": 2035 + }, + { + "epoch": 0.8144, + "grad_norm": 0.5073397755622864, + "learning_rate": 6.135266737807975e-07, + "loss": 0.4938, + "step": 2036 + }, + { + "epoch": 0.8148, + "grad_norm": 0.5173336863517761, + "learning_rate": 6.10964698860529e-07, + "loss": 0.5557, + "step": 2037 + }, + { + "epoch": 0.8152, + "grad_norm": 0.5217483639717102, + "learning_rate": 6.084075726819389e-07, + "loss": 0.5521, + "step": 2038 + }, + { + "epoch": 0.8156, + "grad_norm": 0.5165453553199768, + "learning_rate": 6.058552995367178e-07, + "loss": 0.5561, + "step": 2039 + }, + { + "epoch": 0.816, + "grad_norm": 0.4597705006599426, + "learning_rate": 6.033078837084113e-07, + "loss": 0.4374, + "step": 2040 + }, + { + "epoch": 0.8164, + "grad_norm": 0.565405547618866, + "learning_rate": 6.007653294724123e-07, + "loss": 0.6586, + "step": 2041 + }, + { + "epoch": 0.8168, + "grad_norm": 0.4685356616973877, + "learning_rate": 5.98227641095953e-07, + "loss": 0.4905, + "step": 2042 + }, + { + "epoch": 0.8172, + "grad_norm": 0.573566734790802, + "learning_rate": 5.956948228381015e-07, + "loss": 0.6105, + "step": 2043 + }, + { + "epoch": 0.8176, + "grad_norm": 0.5715039968490601, + "learning_rate": 5.931668789497525e-07, + "loss": 0.5855, + "step": 2044 + }, + { + "epoch": 0.818, + "grad_norm": 0.48921602964401245, + "learning_rate": 5.906438136736182e-07, + "loss": 0.5488, + "step": 2045 + }, + { + "epoch": 0.8184, + "grad_norm": 0.5068395137786865, + "learning_rate": 5.881256312442223e-07, + "loss": 0.5313, + "step": 2046 + }, + { + "epoch": 0.8188, + "grad_norm": 0.5660580992698669, + "learning_rate": 5.856123358878947e-07, + "loss": 0.5712, + "step": 2047 + }, + { + "epoch": 0.8192, + "grad_norm": 0.48958441615104675, + "learning_rate": 5.831039318227639e-07, + "loss": 0.5172, + "step": 2048 + }, + { + "epoch": 0.8196, + "grad_norm": 0.5002844333648682, + "learning_rate": 5.806004232587487e-07, + "loss": 0.5371, + "step": 2049 + }, + { + "epoch": 0.82, + "grad_norm": 0.5133857727050781, + "learning_rate": 5.781018143975516e-07, + "loss": 0.5908, + "step": 2050 + }, + { + "epoch": 0.8204, + "grad_norm": 0.5378307700157166, + "learning_rate": 5.756081094326496e-07, + "loss": 0.5463, + "step": 2051 + }, + { + "epoch": 0.8208, + "grad_norm": 0.574783444404602, + "learning_rate": 5.73119312549294e-07, + "loss": 0.6224, + "step": 2052 + }, + { + "epoch": 0.8212, + "grad_norm": 0.497978538274765, + "learning_rate": 5.706354279244942e-07, + "loss": 0.5832, + "step": 2053 + }, + { + "epoch": 0.8216, + "grad_norm": 0.476604163646698, + "learning_rate": 5.681564597270188e-07, + "loss": 0.5334, + "step": 2054 + }, + { + "epoch": 0.822, + "grad_norm": 0.4663172662258148, + "learning_rate": 5.65682412117382e-07, + "loss": 0.4863, + "step": 2055 + }, + { + "epoch": 0.8224, + "grad_norm": 0.5250645875930786, + "learning_rate": 5.632132892478414e-07, + "loss": 0.5811, + "step": 2056 + }, + { + "epoch": 0.8228, + "grad_norm": 0.48894020915031433, + "learning_rate": 5.607490952623886e-07, + "loss": 0.5533, + "step": 2057 + }, + { + "epoch": 0.8232, + "grad_norm": 0.5382503867149353, + "learning_rate": 5.582898342967445e-07, + "loss": 0.5632, + "step": 2058 + }, + { + "epoch": 0.8236, + "grad_norm": 0.5530444979667664, + "learning_rate": 5.558355104783495e-07, + "loss": 0.5749, + "step": 2059 + }, + { + "epoch": 0.824, + "grad_norm": 0.5484821796417236, + "learning_rate": 5.533861279263556e-07, + "loss": 0.5641, + "step": 2060 + }, + { + "epoch": 0.8244, + "grad_norm": 0.6639271974563599, + "learning_rate": 5.509416907516267e-07, + "loss": 0.6381, + "step": 2061 + }, + { + "epoch": 0.8248, + "grad_norm": 0.49303728342056274, + "learning_rate": 5.485022030567224e-07, + "loss": 0.5542, + "step": 2062 + }, + { + "epoch": 0.8252, + "grad_norm": 0.5660285949707031, + "learning_rate": 5.460676689358982e-07, + "loss": 0.6456, + "step": 2063 + }, + { + "epoch": 0.8256, + "grad_norm": 0.4761320650577545, + "learning_rate": 5.436380924750948e-07, + "loss": 0.5237, + "step": 2064 + }, + { + "epoch": 0.826, + "grad_norm": 0.5708348155021667, + "learning_rate": 5.412134777519309e-07, + "loss": 0.5548, + "step": 2065 + }, + { + "epoch": 0.8264, + "grad_norm": 0.49620676040649414, + "learning_rate": 5.387938288357014e-07, + "loss": 0.5473, + "step": 2066 + }, + { + "epoch": 0.8268, + "grad_norm": 0.5709338784217834, + "learning_rate": 5.363791497873632e-07, + "loss": 0.6886, + "step": 2067 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4843921363353729, + "learning_rate": 5.339694446595349e-07, + "loss": 0.5347, + "step": 2068 + }, + { + "epoch": 0.8276, + "grad_norm": 0.5291857123374939, + "learning_rate": 5.315647174964848e-07, + "loss": 0.6049, + "step": 2069 + }, + { + "epoch": 0.828, + "grad_norm": 0.5444912910461426, + "learning_rate": 5.291649723341296e-07, + "loss": 0.5096, + "step": 2070 + }, + { + "epoch": 0.8284, + "grad_norm": 0.5704883933067322, + "learning_rate": 5.267702132000212e-07, + "loss": 0.5848, + "step": 2071 + }, + { + "epoch": 0.8288, + "grad_norm": 0.45734161138534546, + "learning_rate": 5.243804441133465e-07, + "loss": 0.6085, + "step": 2072 + }, + { + "epoch": 0.8292, + "grad_norm": 0.5407284498214722, + "learning_rate": 5.21995669084915e-07, + "loss": 0.5605, + "step": 2073 + }, + { + "epoch": 0.8296, + "grad_norm": 0.5885137319564819, + "learning_rate": 5.196158921171548e-07, + "loss": 0.5438, + "step": 2074 + }, + { + "epoch": 0.83, + "grad_norm": 0.5144458413124084, + "learning_rate": 5.172411172041079e-07, + "loss": 0.4796, + "step": 2075 + }, + { + "epoch": 0.8304, + "grad_norm": 0.4840572774410248, + "learning_rate": 5.148713483314182e-07, + "loss": 0.4583, + "step": 2076 + }, + { + "epoch": 0.8308, + "grad_norm": 0.4647265076637268, + "learning_rate": 5.12506589476331e-07, + "loss": 0.492, + "step": 2077 + }, + { + "epoch": 0.8312, + "grad_norm": 0.5066642761230469, + "learning_rate": 5.101468446076796e-07, + "loss": 0.473, + "step": 2078 + }, + { + "epoch": 0.8316, + "grad_norm": 0.5504907965660095, + "learning_rate": 5.077921176858854e-07, + "loss": 0.5896, + "step": 2079 + }, + { + "epoch": 0.832, + "grad_norm": 0.5531979203224182, + "learning_rate": 5.054424126629462e-07, + "loss": 0.4968, + "step": 2080 + }, + { + "epoch": 0.8324, + "grad_norm": 0.5404284000396729, + "learning_rate": 5.030977334824312e-07, + "loss": 0.5036, + "step": 2081 + }, + { + "epoch": 0.8328, + "grad_norm": 0.4818439185619354, + "learning_rate": 5.007580840794761e-07, + "loss": 0.4787, + "step": 2082 + }, + { + "epoch": 0.8332, + "grad_norm": 0.48733893036842346, + "learning_rate": 4.984234683807746e-07, + "loss": 0.4894, + "step": 2083 + }, + { + "epoch": 0.8336, + "grad_norm": 0.4569380581378937, + "learning_rate": 4.960938903045715e-07, + "loss": 0.4484, + "step": 2084 + }, + { + "epoch": 0.834, + "grad_norm": 0.5140909552574158, + "learning_rate": 4.937693537606559e-07, + "loss": 0.5013, + "step": 2085 + }, + { + "epoch": 0.8344, + "grad_norm": 0.5413423776626587, + "learning_rate": 4.914498626503589e-07, + "loss": 0.5123, + "step": 2086 + }, + { + "epoch": 0.8348, + "grad_norm": 0.4930032789707184, + "learning_rate": 4.891354208665396e-07, + "loss": 0.557, + "step": 2087 + }, + { + "epoch": 0.8352, + "grad_norm": 0.43929794430732727, + "learning_rate": 4.868260322935864e-07, + "loss": 0.4897, + "step": 2088 + }, + { + "epoch": 0.8356, + "grad_norm": 0.5489115118980408, + "learning_rate": 4.845217008074045e-07, + "loss": 0.5762, + "step": 2089 + }, + { + "epoch": 0.836, + "grad_norm": 0.4723738431930542, + "learning_rate": 4.82222430275411e-07, + "loss": 0.5563, + "step": 2090 + }, + { + "epoch": 0.8364, + "grad_norm": 0.5301296710968018, + "learning_rate": 4.79928224556532e-07, + "loss": 0.51, + "step": 2091 + }, + { + "epoch": 0.8368, + "grad_norm": 0.5511886477470398, + "learning_rate": 4.776390875011912e-07, + "loss": 0.4941, + "step": 2092 + }, + { + "epoch": 0.8372, + "grad_norm": 0.5579310059547424, + "learning_rate": 4.7535502295130544e-07, + "loss": 0.6519, + "step": 2093 + }, + { + "epoch": 0.8376, + "grad_norm": 0.48983925580978394, + "learning_rate": 4.730760347402788e-07, + "loss": 0.5707, + "step": 2094 + }, + { + "epoch": 0.838, + "grad_norm": 0.5601329207420349, + "learning_rate": 4.708021266929944e-07, + "loss": 0.5234, + "step": 2095 + }, + { + "epoch": 0.8384, + "grad_norm": 0.5632297396659851, + "learning_rate": 4.685333026258109e-07, + "loss": 0.6143, + "step": 2096 + }, + { + "epoch": 0.8388, + "grad_norm": 0.6144510507583618, + "learning_rate": 4.6626956634655434e-07, + "loss": 0.5627, + "step": 2097 + }, + { + "epoch": 0.8392, + "grad_norm": 0.4851270616054535, + "learning_rate": 4.6401092165451024e-07, + "loss": 0.5105, + "step": 2098 + }, + { + "epoch": 0.8396, + "grad_norm": 0.5638195872306824, + "learning_rate": 4.617573723404191e-07, + "loss": 0.5906, + "step": 2099 + }, + { + "epoch": 0.84, + "grad_norm": 0.516826331615448, + "learning_rate": 4.59508922186471e-07, + "loss": 0.5586, + "step": 2100 + }, + { + "epoch": 0.8404, + "grad_norm": 0.5381993055343628, + "learning_rate": 4.5726557496629773e-07, + "loss": 0.4996, + "step": 2101 + }, + { + "epoch": 0.8408, + "grad_norm": 0.5378861427307129, + "learning_rate": 4.5502733444496577e-07, + "loss": 0.5762, + "step": 2102 + }, + { + "epoch": 0.8412, + "grad_norm": 0.5263057351112366, + "learning_rate": 4.527942043789708e-07, + "loss": 0.6046, + "step": 2103 + }, + { + "epoch": 0.8416, + "grad_norm": 0.5146876573562622, + "learning_rate": 4.5056618851623173e-07, + "loss": 0.6175, + "step": 2104 + }, + { + "epoch": 0.842, + "grad_norm": 0.5282055139541626, + "learning_rate": 4.4834329059608533e-07, + "loss": 0.5349, + "step": 2105 + }, + { + "epoch": 0.8424, + "grad_norm": 0.5305773615837097, + "learning_rate": 4.461255143492781e-07, + "loss": 0.6091, + "step": 2106 + }, + { + "epoch": 0.8428, + "grad_norm": 0.5523550510406494, + "learning_rate": 4.4391286349796e-07, + "loss": 0.6574, + "step": 2107 + }, + { + "epoch": 0.8432, + "grad_norm": 0.5279895663261414, + "learning_rate": 4.4170534175567863e-07, + "loss": 0.4755, + "step": 2108 + }, + { + "epoch": 0.8436, + "grad_norm": 0.5235905647277832, + "learning_rate": 4.395029528273751e-07, + "loss": 0.5776, + "step": 2109 + }, + { + "epoch": 0.844, + "grad_norm": 0.5142126083374023, + "learning_rate": 4.373057004093756e-07, + "loss": 0.4684, + "step": 2110 + }, + { + "epoch": 0.8444, + "grad_norm": 0.48986274003982544, + "learning_rate": 4.3511358818938384e-07, + "loss": 0.5636, + "step": 2111 + }, + { + "epoch": 0.8448, + "grad_norm": 0.5454509258270264, + "learning_rate": 4.329266198464782e-07, + "loss": 0.5735, + "step": 2112 + }, + { + "epoch": 0.8452, + "grad_norm": 0.6086745262145996, + "learning_rate": 4.3074479905110223e-07, + "loss": 0.5865, + "step": 2113 + }, + { + "epoch": 0.8456, + "grad_norm": 0.5274353623390198, + "learning_rate": 4.285681294650624e-07, + "loss": 0.6151, + "step": 2114 + }, + { + "epoch": 0.846, + "grad_norm": 0.5232753157615662, + "learning_rate": 4.263966147415201e-07, + "loss": 0.526, + "step": 2115 + }, + { + "epoch": 0.8464, + "grad_norm": 0.5116792917251587, + "learning_rate": 4.242302585249821e-07, + "loss": 0.5868, + "step": 2116 + }, + { + "epoch": 0.8468, + "grad_norm": 0.5277671217918396, + "learning_rate": 4.220690644512997e-07, + "loss": 0.5121, + "step": 2117 + }, + { + "epoch": 0.8472, + "grad_norm": 0.5713874697685242, + "learning_rate": 4.199130361476599e-07, + "loss": 0.556, + "step": 2118 + }, + { + "epoch": 0.8476, + "grad_norm": 0.5311291813850403, + "learning_rate": 4.1776217723258134e-07, + "loss": 0.4875, + "step": 2119 + }, + { + "epoch": 0.848, + "grad_norm": 0.5216573476791382, + "learning_rate": 4.1561649131590474e-07, + "loss": 0.5525, + "step": 2120 + }, + { + "epoch": 0.8484, + "grad_norm": 0.5430652499198914, + "learning_rate": 4.1347598199878885e-07, + "loss": 0.5158, + "step": 2121 + }, + { + "epoch": 0.8488, + "grad_norm": 0.5124344229698181, + "learning_rate": 4.113406528737052e-07, + "loss": 0.5256, + "step": 2122 + }, + { + "epoch": 0.8492, + "grad_norm": 0.5016151070594788, + "learning_rate": 4.09210507524431e-07, + "loss": 0.5081, + "step": 2123 + }, + { + "epoch": 0.8496, + "grad_norm": 0.4942253530025482, + "learning_rate": 4.070855495260453e-07, + "loss": 0.5332, + "step": 2124 + }, + { + "epoch": 0.85, + "grad_norm": 0.4644415080547333, + "learning_rate": 4.0496578244491793e-07, + "loss": 0.5018, + "step": 2125 + }, + { + "epoch": 0.8504, + "grad_norm": 0.4998500347137451, + "learning_rate": 4.028512098387071e-07, + "loss": 0.5553, + "step": 2126 + }, + { + "epoch": 0.8508, + "grad_norm": 0.5142858624458313, + "learning_rate": 4.007418352563566e-07, + "loss": 0.5019, + "step": 2127 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5226315855979919, + "learning_rate": 3.9863766223808086e-07, + "loss": 0.5026, + "step": 2128 + }, + { + "epoch": 0.8516, + "grad_norm": 0.518667459487915, + "learning_rate": 3.9653869431536953e-07, + "loss": 0.5007, + "step": 2129 + }, + { + "epoch": 0.852, + "grad_norm": 0.5096085667610168, + "learning_rate": 3.944449350109729e-07, + "loss": 0.5872, + "step": 2130 + }, + { + "epoch": 0.8524, + "grad_norm": 0.5470719933509827, + "learning_rate": 3.9235638783889987e-07, + "loss": 0.5457, + "step": 2131 + }, + { + "epoch": 0.8528, + "grad_norm": 0.5269269943237305, + "learning_rate": 3.902730563044133e-07, + "loss": 0.4615, + "step": 2132 + }, + { + "epoch": 0.8532, + "grad_norm": 0.5528969764709473, + "learning_rate": 3.881949439040224e-07, + "loss": 0.6155, + "step": 2133 + }, + { + "epoch": 0.8536, + "grad_norm": 0.5143035650253296, + "learning_rate": 3.8612205412547534e-07, + "loss": 0.5154, + "step": 2134 + }, + { + "epoch": 0.854, + "grad_norm": 0.5350843667984009, + "learning_rate": 3.840543904477557e-07, + "loss": 0.4668, + "step": 2135 + }, + { + "epoch": 0.8544, + "grad_norm": 0.5625218152999878, + "learning_rate": 3.819919563410773e-07, + "loss": 0.5487, + "step": 2136 + }, + { + "epoch": 0.8548, + "grad_norm": 0.503579318523407, + "learning_rate": 3.799347552668745e-07, + "loss": 0.5635, + "step": 2137 + }, + { + "epoch": 0.8552, + "grad_norm": 0.5495058298110962, + "learning_rate": 3.77882790677802e-07, + "loss": 0.5593, + "step": 2138 + }, + { + "epoch": 0.8556, + "grad_norm": 0.5059952735900879, + "learning_rate": 3.7583606601772346e-07, + "loss": 0.5202, + "step": 2139 + }, + { + "epoch": 0.856, + "grad_norm": 0.5059313774108887, + "learning_rate": 3.7379458472170814e-07, + "loss": 0.5287, + "step": 2140 + }, + { + "epoch": 0.8564, + "grad_norm": 0.5819924473762512, + "learning_rate": 3.717583502160283e-07, + "loss": 0.5815, + "step": 2141 + }, + { + "epoch": 0.8568, + "grad_norm": 0.5166248679161072, + "learning_rate": 3.697273659181459e-07, + "loss": 0.5107, + "step": 2142 + }, + { + "epoch": 0.8572, + "grad_norm": 0.6137151718139648, + "learning_rate": 3.67701635236716e-07, + "loss": 0.5726, + "step": 2143 + }, + { + "epoch": 0.8576, + "grad_norm": 0.468230664730072, + "learning_rate": 3.656811615715727e-07, + "loss": 0.4374, + "step": 2144 + }, + { + "epoch": 0.858, + "grad_norm": 0.4957389831542969, + "learning_rate": 3.636659483137291e-07, + "loss": 0.5401, + "step": 2145 + }, + { + "epoch": 0.8584, + "grad_norm": 0.49795204401016235, + "learning_rate": 3.616559988453683e-07, + "loss": 0.5141, + "step": 2146 + }, + { + "epoch": 0.8588, + "grad_norm": 0.5580368041992188, + "learning_rate": 3.596513165398412e-07, + "loss": 0.5541, + "step": 2147 + }, + { + "epoch": 0.8592, + "grad_norm": 0.5130208134651184, + "learning_rate": 3.576519047616564e-07, + "loss": 0.4491, + "step": 2148 + }, + { + "epoch": 0.8596, + "grad_norm": 0.4993700683116913, + "learning_rate": 3.55657766866477e-07, + "loss": 0.5044, + "step": 2149 + }, + { + "epoch": 0.86, + "grad_norm": 0.5155137181282043, + "learning_rate": 3.536689062011167e-07, + "loss": 0.5642, + "step": 2150 + }, + { + "epoch": 0.8604, + "grad_norm": 0.4971082806587219, + "learning_rate": 3.516853261035304e-07, + "loss": 0.5726, + "step": 2151 + }, + { + "epoch": 0.8608, + "grad_norm": 0.515801727771759, + "learning_rate": 3.497070299028118e-07, + "loss": 0.47, + "step": 2152 + }, + { + "epoch": 0.8612, + "grad_norm": 0.5084040760993958, + "learning_rate": 3.47734020919185e-07, + "loss": 0.5551, + "step": 2153 + }, + { + "epoch": 0.8616, + "grad_norm": 0.45051297545433044, + "learning_rate": 3.4576630246400246e-07, + "loss": 0.4355, + "step": 2154 + }, + { + "epoch": 0.862, + "grad_norm": 0.5345770120620728, + "learning_rate": 3.43803877839736e-07, + "loss": 0.5597, + "step": 2155 + }, + { + "epoch": 0.8624, + "grad_norm": 0.5158503651618958, + "learning_rate": 3.4184675033997157e-07, + "loss": 0.5396, + "step": 2156 + }, + { + "epoch": 0.8628, + "grad_norm": 0.5118535757064819, + "learning_rate": 3.3989492324940856e-07, + "loss": 0.5089, + "step": 2157 + }, + { + "epoch": 0.8632, + "grad_norm": 0.521641194820404, + "learning_rate": 3.379483998438458e-07, + "loss": 0.4735, + "step": 2158 + }, + { + "epoch": 0.8636, + "grad_norm": 0.5525158047676086, + "learning_rate": 3.360071833901854e-07, + "loss": 0.5108, + "step": 2159 + }, + { + "epoch": 0.864, + "grad_norm": 0.48484671115875244, + "learning_rate": 3.340712771464185e-07, + "loss": 0.4106, + "step": 2160 + }, + { + "epoch": 0.8644, + "grad_norm": 0.5603184700012207, + "learning_rate": 3.3214068436162764e-07, + "loss": 0.58, + "step": 2161 + }, + { + "epoch": 0.8648, + "grad_norm": 0.5196020603179932, + "learning_rate": 3.302154082759741e-07, + "loss": 0.5204, + "step": 2162 + }, + { + "epoch": 0.8652, + "grad_norm": 0.5511024594306946, + "learning_rate": 3.2829545212070014e-07, + "loss": 0.5546, + "step": 2163 + }, + { + "epoch": 0.8656, + "grad_norm": 0.529928982257843, + "learning_rate": 3.263808191181157e-07, + "loss": 0.5546, + "step": 2164 + }, + { + "epoch": 0.866, + "grad_norm": 0.4980902671813965, + "learning_rate": 3.244715124815982e-07, + "loss": 0.554, + "step": 2165 + }, + { + "epoch": 0.8664, + "grad_norm": 0.5063270926475525, + "learning_rate": 3.2256753541558635e-07, + "loss": 0.5141, + "step": 2166 + }, + { + "epoch": 0.8668, + "grad_norm": 0.5925174355506897, + "learning_rate": 3.2066889111557254e-07, + "loss": 0.55, + "step": 2167 + }, + { + "epoch": 0.8672, + "grad_norm": 0.5630876421928406, + "learning_rate": 3.1877558276810124e-07, + "loss": 0.5533, + "step": 2168 + }, + { + "epoch": 0.8676, + "grad_norm": 0.4962352216243744, + "learning_rate": 3.1688761355075977e-07, + "loss": 0.5002, + "step": 2169 + }, + { + "epoch": 0.868, + "grad_norm": 0.5295508503913879, + "learning_rate": 3.1500498663217376e-07, + "loss": 0.5161, + "step": 2170 + }, + { + "epoch": 0.8684, + "grad_norm": 0.5267398357391357, + "learning_rate": 3.13127705172005e-07, + "loss": 0.5289, + "step": 2171 + }, + { + "epoch": 0.8688, + "grad_norm": 0.4687250852584839, + "learning_rate": 3.1125577232094355e-07, + "loss": 0.5046, + "step": 2172 + }, + { + "epoch": 0.8692, + "grad_norm": 0.5243983864784241, + "learning_rate": 3.0938919122070123e-07, + "loss": 0.5399, + "step": 2173 + }, + { + "epoch": 0.8696, + "grad_norm": 0.5355934500694275, + "learning_rate": 3.075279650040092e-07, + "loss": 0.5985, + "step": 2174 + }, + { + "epoch": 0.87, + "grad_norm": 0.5579320788383484, + "learning_rate": 3.0567209679460986e-07, + "loss": 0.6162, + "step": 2175 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5460540056228638, + "learning_rate": 3.0382158970725485e-07, + "loss": 0.596, + "step": 2176 + }, + { + "epoch": 0.8708, + "grad_norm": 0.4951015114784241, + "learning_rate": 3.019764468476986e-07, + "loss": 0.4641, + "step": 2177 + }, + { + "epoch": 0.8712, + "grad_norm": 0.5001224279403687, + "learning_rate": 3.001366713126902e-07, + "loss": 0.5414, + "step": 2178 + }, + { + "epoch": 0.8716, + "grad_norm": 0.5136198997497559, + "learning_rate": 2.9830226618997115e-07, + "loss": 0.5539, + "step": 2179 + }, + { + "epoch": 0.872, + "grad_norm": 0.47107475996017456, + "learning_rate": 2.9647323455827183e-07, + "loss": 0.494, + "step": 2180 + }, + { + "epoch": 0.8724, + "grad_norm": 0.5539179444313049, + "learning_rate": 2.946495794873026e-07, + "loss": 0.552, + "step": 2181 + }, + { + "epoch": 0.8728, + "grad_norm": 0.5295327305793762, + "learning_rate": 2.9283130403774985e-07, + "loss": 0.6103, + "step": 2182 + }, + { + "epoch": 0.8732, + "grad_norm": 0.5619650483131409, + "learning_rate": 2.910184112612721e-07, + "loss": 0.5496, + "step": 2183 + }, + { + "epoch": 0.8736, + "grad_norm": 0.45123714208602905, + "learning_rate": 2.8921090420049183e-07, + "loss": 0.509, + "step": 2184 + }, + { + "epoch": 0.874, + "grad_norm": 0.5038043856620789, + "learning_rate": 2.874087858889968e-07, + "loss": 0.4946, + "step": 2185 + }, + { + "epoch": 0.8744, + "grad_norm": 0.5105406641960144, + "learning_rate": 2.856120593513271e-07, + "loss": 0.6208, + "step": 2186 + }, + { + "epoch": 0.8748, + "grad_norm": 0.5043431520462036, + "learning_rate": 2.838207276029743e-07, + "loss": 0.5475, + "step": 2187 + }, + { + "epoch": 0.8752, + "grad_norm": 0.56167072057724, + "learning_rate": 2.8203479365037663e-07, + "loss": 0.6108, + "step": 2188 + }, + { + "epoch": 0.8756, + "grad_norm": 0.5685185790061951, + "learning_rate": 2.802542604909121e-07, + "loss": 0.6608, + "step": 2189 + }, + { + "epoch": 0.876, + "grad_norm": 0.4665569067001343, + "learning_rate": 2.78479131112896e-07, + "loss": 0.4223, + "step": 2190 + }, + { + "epoch": 0.8764, + "grad_norm": 0.5965784788131714, + "learning_rate": 2.7670940849557263e-07, + "loss": 0.6361, + "step": 2191 + }, + { + "epoch": 0.8768, + "grad_norm": 0.5202442407608032, + "learning_rate": 2.7494509560911294e-07, + "loss": 0.5137, + "step": 2192 + }, + { + "epoch": 0.8772, + "grad_norm": 0.5134866833686829, + "learning_rate": 2.731861954146065e-07, + "loss": 0.5788, + "step": 2193 + }, + { + "epoch": 0.8776, + "grad_norm": 0.5923562049865723, + "learning_rate": 2.714327108640634e-07, + "loss": 0.5261, + "step": 2194 + }, + { + "epoch": 0.878, + "grad_norm": 0.49445661902427673, + "learning_rate": 2.6968464490039965e-07, + "loss": 0.51, + "step": 2195 + }, + { + "epoch": 0.8784, + "grad_norm": 0.4957909882068634, + "learning_rate": 2.6794200045743957e-07, + "loss": 0.4834, + "step": 2196 + }, + { + "epoch": 0.8788, + "grad_norm": 0.5249044299125671, + "learning_rate": 2.662047804599067e-07, + "loss": 0.4612, + "step": 2197 + }, + { + "epoch": 0.8792, + "grad_norm": 0.49063214659690857, + "learning_rate": 2.6447298782342277e-07, + "loss": 0.5146, + "step": 2198 + }, + { + "epoch": 0.8796, + "grad_norm": 0.5278512239456177, + "learning_rate": 2.6274662545449997e-07, + "loss": 0.5328, + "step": 2199 + }, + { + "epoch": 0.88, + "grad_norm": 0.5886636972427368, + "learning_rate": 2.6102569625053557e-07, + "loss": 0.5251, + "step": 2200 + }, + { + "epoch": 0.8804, + "grad_norm": 0.5029667615890503, + "learning_rate": 2.5931020309980934e-07, + "loss": 0.5412, + "step": 2201 + }, + { + "epoch": 0.8808, + "grad_norm": 0.4710221588611603, + "learning_rate": 2.5760014888147604e-07, + "loss": 0.4977, + "step": 2202 + }, + { + "epoch": 0.8812, + "grad_norm": 0.5477889180183411, + "learning_rate": 2.5589553646556463e-07, + "loss": 0.5954, + "step": 2203 + }, + { + "epoch": 0.8816, + "grad_norm": 0.5483034253120422, + "learning_rate": 2.541963687129694e-07, + "loss": 0.4608, + "step": 2204 + }, + { + "epoch": 0.882, + "grad_norm": 0.46928951144218445, + "learning_rate": 2.525026484754473e-07, + "loss": 0.5349, + "step": 2205 + }, + { + "epoch": 0.8824, + "grad_norm": 0.498003751039505, + "learning_rate": 2.508143785956107e-07, + "loss": 0.4114, + "step": 2206 + }, + { + "epoch": 0.8828, + "grad_norm": 0.4926424026489258, + "learning_rate": 2.491315619069274e-07, + "loss": 0.5779, + "step": 2207 + }, + { + "epoch": 0.8832, + "grad_norm": 0.5372424125671387, + "learning_rate": 2.474542012337117e-07, + "loss": 0.592, + "step": 2208 + }, + { + "epoch": 0.8836, + "grad_norm": 0.5553046464920044, + "learning_rate": 2.4578229939112027e-07, + "loss": 0.5452, + "step": 2209 + }, + { + "epoch": 0.884, + "grad_norm": 0.5457509160041809, + "learning_rate": 2.441158591851484e-07, + "loss": 0.5042, + "step": 2210 + }, + { + "epoch": 0.8844, + "grad_norm": 0.5078551173210144, + "learning_rate": 2.424548834126262e-07, + "loss": 0.5067, + "step": 2211 + }, + { + "epoch": 0.8848, + "grad_norm": 0.5340900421142578, + "learning_rate": 2.4079937486121005e-07, + "loss": 0.4737, + "step": 2212 + }, + { + "epoch": 0.8852, + "grad_norm": 0.532534122467041, + "learning_rate": 2.391493363093842e-07, + "loss": 0.5541, + "step": 2213 + }, + { + "epoch": 0.8856, + "grad_norm": 0.5554724335670471, + "learning_rate": 2.3750477052644958e-07, + "loss": 0.5777, + "step": 2214 + }, + { + "epoch": 0.886, + "grad_norm": 0.5182691216468811, + "learning_rate": 2.3586568027252208e-07, + "loss": 0.6101, + "step": 2215 + }, + { + "epoch": 0.8864, + "grad_norm": 0.5171369314193726, + "learning_rate": 2.3423206829852994e-07, + "loss": 0.5527, + "step": 2216 + }, + { + "epoch": 0.8868, + "grad_norm": 0.5117000937461853, + "learning_rate": 2.3260393734620494e-07, + "loss": 0.4774, + "step": 2217 + }, + { + "epoch": 0.8872, + "grad_norm": 0.4515170753002167, + "learning_rate": 2.309812901480816e-07, + "loss": 0.4977, + "step": 2218 + }, + { + "epoch": 0.8876, + "grad_norm": 0.48999813199043274, + "learning_rate": 2.2936412942748946e-07, + "loss": 0.4982, + "step": 2219 + }, + { + "epoch": 0.888, + "grad_norm": 0.48553434014320374, + "learning_rate": 2.2775245789855103e-07, + "loss": 0.5583, + "step": 2220 + }, + { + "epoch": 0.8884, + "grad_norm": 0.7433586120605469, + "learning_rate": 2.2614627826617528e-07, + "loss": 0.5753, + "step": 2221 + }, + { + "epoch": 0.8888, + "grad_norm": 0.5436974167823792, + "learning_rate": 2.2454559322605566e-07, + "loss": 0.575, + "step": 2222 + }, + { + "epoch": 0.8892, + "grad_norm": 0.4806951880455017, + "learning_rate": 2.2295040546466156e-07, + "loss": 0.4775, + "step": 2223 + }, + { + "epoch": 0.8896, + "grad_norm": 0.4840577244758606, + "learning_rate": 2.2136071765923748e-07, + "loss": 0.5008, + "step": 2224 + }, + { + "epoch": 0.89, + "grad_norm": 0.5452485680580139, + "learning_rate": 2.197765324777977e-07, + "loss": 0.5678, + "step": 2225 + }, + { + "epoch": 0.8904, + "grad_norm": 0.5001971125602722, + "learning_rate": 2.1819785257911923e-07, + "loss": 0.5008, + "step": 2226 + }, + { + "epoch": 0.8908, + "grad_norm": 0.5222870707511902, + "learning_rate": 2.1662468061274247e-07, + "loss": 0.5653, + "step": 2227 + }, + { + "epoch": 0.8912, + "grad_norm": 0.5725530982017517, + "learning_rate": 2.1505701921896096e-07, + "loss": 0.5058, + "step": 2228 + }, + { + "epoch": 0.8916, + "grad_norm": 0.5029703378677368, + "learning_rate": 2.1349487102882197e-07, + "loss": 0.517, + "step": 2229 + }, + { + "epoch": 0.892, + "grad_norm": 0.5157403945922852, + "learning_rate": 2.119382386641176e-07, + "loss": 0.5673, + "step": 2230 + }, + { + "epoch": 0.8924, + "grad_norm": 0.5268275141716003, + "learning_rate": 2.1038712473738362e-07, + "loss": 0.489, + "step": 2231 + }, + { + "epoch": 0.8928, + "grad_norm": 0.5436747670173645, + "learning_rate": 2.0884153185189492e-07, + "loss": 0.5786, + "step": 2232 + }, + { + "epoch": 0.8932, + "grad_norm": 0.4886317551136017, + "learning_rate": 2.0730146260165904e-07, + "loss": 0.5851, + "step": 2233 + }, + { + "epoch": 0.8936, + "grad_norm": 0.5080478191375732, + "learning_rate": 2.0576691957141374e-07, + "loss": 0.5666, + "step": 2234 + }, + { + "epoch": 0.894, + "grad_norm": 0.4885810613632202, + "learning_rate": 2.0423790533662096e-07, + "loss": 0.4711, + "step": 2235 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5395861268043518, + "learning_rate": 2.0271442246346532e-07, + "loss": 0.4948, + "step": 2236 + }, + { + "epoch": 0.8948, + "grad_norm": 0.5157225131988525, + "learning_rate": 2.0119647350884646e-07, + "loss": 0.6207, + "step": 2237 + }, + { + "epoch": 0.8952, + "grad_norm": 0.5027074813842773, + "learning_rate": 1.9968406102037727e-07, + "loss": 0.5474, + "step": 2238 + }, + { + "epoch": 0.8956, + "grad_norm": 0.5285701155662537, + "learning_rate": 1.9817718753637793e-07, + "loss": 0.5634, + "step": 2239 + }, + { + "epoch": 0.896, + "grad_norm": 0.5563217997550964, + "learning_rate": 1.9667585558587253e-07, + "loss": 0.6273, + "step": 2240 + }, + { + "epoch": 0.8964, + "grad_norm": 0.5578852891921997, + "learning_rate": 1.9518006768858504e-07, + "loss": 0.5693, + "step": 2241 + }, + { + "epoch": 0.8968, + "grad_norm": 0.567421555519104, + "learning_rate": 1.9368982635493408e-07, + "loss": 0.609, + "step": 2242 + }, + { + "epoch": 0.8972, + "grad_norm": 0.5388922691345215, + "learning_rate": 1.9220513408603073e-07, + "loss": 0.5307, + "step": 2243 + }, + { + "epoch": 0.8976, + "grad_norm": 0.4816891551017761, + "learning_rate": 1.9072599337367103e-07, + "loss": 0.5399, + "step": 2244 + }, + { + "epoch": 0.898, + "grad_norm": 0.5367328524589539, + "learning_rate": 1.8925240670033404e-07, + "loss": 0.4935, + "step": 2245 + }, + { + "epoch": 0.8984, + "grad_norm": 0.4941747188568115, + "learning_rate": 1.877843765391789e-07, + "loss": 0.5399, + "step": 2246 + }, + { + "epoch": 0.8988, + "grad_norm": 0.539661169052124, + "learning_rate": 1.8632190535403835e-07, + "loss": 0.5073, + "step": 2247 + }, + { + "epoch": 0.8992, + "grad_norm": 0.5193576216697693, + "learning_rate": 1.8486499559941432e-07, + "loss": 0.5519, + "step": 2248 + }, + { + "epoch": 0.8996, + "grad_norm": 0.5331642031669617, + "learning_rate": 1.834136497204762e-07, + "loss": 0.6818, + "step": 2249 + }, + { + "epoch": 0.9, + "grad_norm": 0.5258761644363403, + "learning_rate": 1.819678701530535e-07, + "loss": 0.584, + "step": 2250 + }, + { + "epoch": 0.9004, + "grad_norm": 0.549641489982605, + "learning_rate": 1.805276593236363e-07, + "loss": 0.6069, + "step": 2251 + }, + { + "epoch": 0.9008, + "grad_norm": 0.5493542551994324, + "learning_rate": 1.7909301964936742e-07, + "loss": 0.5378, + "step": 2252 + }, + { + "epoch": 0.9012, + "grad_norm": 0.5483373403549194, + "learning_rate": 1.7766395353803854e-07, + "loss": 0.5867, + "step": 2253 + }, + { + "epoch": 0.9016, + "grad_norm": 0.5108447670936584, + "learning_rate": 1.7624046338808713e-07, + "loss": 0.5701, + "step": 2254 + }, + { + "epoch": 0.902, + "grad_norm": 0.5171193480491638, + "learning_rate": 1.7482255158859445e-07, + "loss": 0.5197, + "step": 2255 + }, + { + "epoch": 0.9024, + "grad_norm": 0.5908972024917603, + "learning_rate": 1.734102205192774e-07, + "loss": 0.6117, + "step": 2256 + }, + { + "epoch": 0.9028, + "grad_norm": 0.4955964684486389, + "learning_rate": 1.7200347255048743e-07, + "loss": 0.5254, + "step": 2257 + }, + { + "epoch": 0.9032, + "grad_norm": 0.5056068897247314, + "learning_rate": 1.7060231004320537e-07, + "loss": 0.6004, + "step": 2258 + }, + { + "epoch": 0.9036, + "grad_norm": 0.48350733518600464, + "learning_rate": 1.69206735349038e-07, + "loss": 0.5441, + "step": 2259 + }, + { + "epoch": 0.904, + "grad_norm": 0.5320057272911072, + "learning_rate": 1.6781675081021348e-07, + "loss": 0.513, + "step": 2260 + }, + { + "epoch": 0.9044, + "grad_norm": 0.5606966614723206, + "learning_rate": 1.6643235875957951e-07, + "loss": 0.5428, + "step": 2261 + }, + { + "epoch": 0.9048, + "grad_norm": 0.5294163227081299, + "learning_rate": 1.650535615205963e-07, + "loss": 0.5392, + "step": 2262 + }, + { + "epoch": 0.9052, + "grad_norm": 0.5900059938430786, + "learning_rate": 1.6368036140733323e-07, + "loss": 0.5982, + "step": 2263 + }, + { + "epoch": 0.9056, + "grad_norm": 0.5253335237503052, + "learning_rate": 1.6231276072446808e-07, + "loss": 0.5925, + "step": 2264 + }, + { + "epoch": 0.906, + "grad_norm": 0.4768005907535553, + "learning_rate": 1.6095076176728062e-07, + "loss": 0.5083, + "step": 2265 + }, + { + "epoch": 0.9064, + "grad_norm": 0.5210567116737366, + "learning_rate": 1.595943668216473e-07, + "loss": 0.5637, + "step": 2266 + }, + { + "epoch": 0.9068, + "grad_norm": 0.4880336821079254, + "learning_rate": 1.58243578164041e-07, + "loss": 0.5667, + "step": 2267 + }, + { + "epoch": 0.9072, + "grad_norm": 0.48938196897506714, + "learning_rate": 1.5689839806152405e-07, + "loss": 0.4602, + "step": 2268 + }, + { + "epoch": 0.9076, + "grad_norm": 0.5602405667304993, + "learning_rate": 1.5555882877174703e-07, + "loss": 0.6031, + "step": 2269 + }, + { + "epoch": 0.908, + "grad_norm": 0.5172474980354309, + "learning_rate": 1.542248725429441e-07, + "loss": 0.4896, + "step": 2270 + }, + { + "epoch": 0.9084, + "grad_norm": 0.5165852308273315, + "learning_rate": 1.5289653161392674e-07, + "loss": 0.49, + "step": 2271 + }, + { + "epoch": 0.9088, + "grad_norm": 0.5281601548194885, + "learning_rate": 1.515738082140839e-07, + "loss": 0.5399, + "step": 2272 + }, + { + "epoch": 0.9092, + "grad_norm": 0.4762950539588928, + "learning_rate": 1.5025670456337596e-07, + "loss": 0.5318, + "step": 2273 + }, + { + "epoch": 0.9096, + "grad_norm": 0.5265024304389954, + "learning_rate": 1.4894522287233226e-07, + "loss": 0.5934, + "step": 2274 + }, + { + "epoch": 0.91, + "grad_norm": 0.49703449010849, + "learning_rate": 1.4763936534204537e-07, + "loss": 0.5519, + "step": 2275 + }, + { + "epoch": 0.9104, + "grad_norm": 0.5251654982566833, + "learning_rate": 1.463391341641694e-07, + "loss": 0.4945, + "step": 2276 + }, + { + "epoch": 0.9108, + "grad_norm": 0.45492956042289734, + "learning_rate": 1.4504453152091479e-07, + "loss": 0.4593, + "step": 2277 + }, + { + "epoch": 0.9112, + "grad_norm": 0.5544923543930054, + "learning_rate": 1.4375555958504727e-07, + "loss": 0.5885, + "step": 2278 + }, + { + "epoch": 0.9116, + "grad_norm": 0.5534620881080627, + "learning_rate": 1.4247222051988152e-07, + "loss": 0.6237, + "step": 2279 + }, + { + "epoch": 0.912, + "grad_norm": 0.49389734864234924, + "learning_rate": 1.4119451647927732e-07, + "loss": 0.431, + "step": 2280 + }, + { + "epoch": 0.9124, + "grad_norm": 0.4699340760707855, + "learning_rate": 1.399224496076381e-07, + "loss": 0.538, + "step": 2281 + }, + { + "epoch": 0.9128, + "grad_norm": 0.5014921426773071, + "learning_rate": 1.3865602203990652e-07, + "loss": 0.5036, + "step": 2282 + }, + { + "epoch": 0.9132, + "grad_norm": 0.46775099635124207, + "learning_rate": 1.3739523590156022e-07, + "loss": 0.4506, + "step": 2283 + }, + { + "epoch": 0.9136, + "grad_norm": 0.5627043843269348, + "learning_rate": 1.3614009330860922e-07, + "loss": 0.5204, + "step": 2284 + }, + { + "epoch": 0.914, + "grad_norm": 0.631554901599884, + "learning_rate": 1.348905963675903e-07, + "loss": 0.5127, + "step": 2285 + }, + { + "epoch": 0.9144, + "grad_norm": 0.5087348818778992, + "learning_rate": 1.3364674717556647e-07, + "loss": 0.545, + "step": 2286 + }, + { + "epoch": 0.9148, + "grad_norm": 0.5642393231391907, + "learning_rate": 1.3240854782012195e-07, + "loss": 0.5813, + "step": 2287 + }, + { + "epoch": 0.9152, + "grad_norm": 0.46905118227005005, + "learning_rate": 1.3117600037935827e-07, + "loss": 0.4888, + "step": 2288 + }, + { + "epoch": 0.9156, + "grad_norm": 0.5647628903388977, + "learning_rate": 1.2994910692189132e-07, + "loss": 0.5614, + "step": 2289 + }, + { + "epoch": 0.916, + "grad_norm": 0.5584691166877747, + "learning_rate": 1.2872786950684672e-07, + "loss": 0.5206, + "step": 2290 + }, + { + "epoch": 0.9164, + "grad_norm": 0.5193395018577576, + "learning_rate": 1.2751229018385952e-07, + "loss": 0.4593, + "step": 2291 + }, + { + "epoch": 0.9168, + "grad_norm": 0.5479791164398193, + "learning_rate": 1.2630237099306707e-07, + "loss": 0.5706, + "step": 2292 + }, + { + "epoch": 0.9172, + "grad_norm": 0.5571714639663696, + "learning_rate": 1.25098113965108e-07, + "loss": 0.5249, + "step": 2293 + }, + { + "epoch": 0.9176, + "grad_norm": 0.5249247550964355, + "learning_rate": 1.2389952112111708e-07, + "loss": 0.573, + "step": 2294 + }, + { + "epoch": 0.918, + "grad_norm": 0.5296669006347656, + "learning_rate": 1.2270659447272326e-07, + "loss": 0.5324, + "step": 2295 + }, + { + "epoch": 0.9184, + "grad_norm": 0.5688275098800659, + "learning_rate": 1.2151933602204622e-07, + "loss": 0.5893, + "step": 2296 + }, + { + "epoch": 0.9188, + "grad_norm": 0.5549254417419434, + "learning_rate": 1.2033774776169214e-07, + "loss": 0.554, + "step": 2297 + }, + { + "epoch": 0.9192, + "grad_norm": 0.5588293671607971, + "learning_rate": 1.1916183167475047e-07, + "loss": 0.5978, + "step": 2298 + }, + { + "epoch": 0.9196, + "grad_norm": 0.515630841255188, + "learning_rate": 1.179915897347909e-07, + "loss": 0.4945, + "step": 2299 + }, + { + "epoch": 0.92, + "grad_norm": 0.5234367847442627, + "learning_rate": 1.1682702390586102e-07, + "loss": 0.5629, + "step": 2300 + }, + { + "epoch": 0.9204, + "grad_norm": 0.5144403576850891, + "learning_rate": 1.156681361424805e-07, + "loss": 0.6108, + "step": 2301 + }, + { + "epoch": 0.9208, + "grad_norm": 0.4699039161205292, + "learning_rate": 1.1451492838964145e-07, + "loss": 0.468, + "step": 2302 + }, + { + "epoch": 0.9212, + "grad_norm": 0.5121269822120667, + "learning_rate": 1.1336740258280142e-07, + "loss": 0.5558, + "step": 2303 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5284686088562012, + "learning_rate": 1.1222556064788153e-07, + "loss": 0.5622, + "step": 2304 + }, + { + "epoch": 0.922, + "grad_norm": 0.5822998285293579, + "learning_rate": 1.1108940450126481e-07, + "loss": 0.5311, + "step": 2305 + }, + { + "epoch": 0.9224, + "grad_norm": 0.5090004801750183, + "learning_rate": 1.0995893604979124e-07, + "loss": 0.5689, + "step": 2306 + }, + { + "epoch": 0.9228, + "grad_norm": 0.5638589859008789, + "learning_rate": 1.0883415719075457e-07, + "loss": 0.6537, + "step": 2307 + }, + { + "epoch": 0.9232, + "grad_norm": 0.49607881903648376, + "learning_rate": 1.0771506981190004e-07, + "loss": 0.5467, + "step": 2308 + }, + { + "epoch": 0.9236, + "grad_norm": 0.5245644450187683, + "learning_rate": 1.0660167579142088e-07, + "loss": 0.4748, + "step": 2309 + }, + { + "epoch": 0.924, + "grad_norm": 0.5582762956619263, + "learning_rate": 1.0549397699795438e-07, + "loss": 0.557, + "step": 2310 + }, + { + "epoch": 0.9244, + "grad_norm": 0.5717631578445435, + "learning_rate": 1.0439197529057881e-07, + "loss": 0.6091, + "step": 2311 + }, + { + "epoch": 0.9248, + "grad_norm": 0.4912102222442627, + "learning_rate": 1.0329567251881344e-07, + "loss": 0.5659, + "step": 2312 + }, + { + "epoch": 0.9252, + "grad_norm": 0.52315354347229, + "learning_rate": 1.0220507052260997e-07, + "loss": 0.6072, + "step": 2313 + }, + { + "epoch": 0.9256, + "grad_norm": 0.4884204566478729, + "learning_rate": 1.0112017113235444e-07, + "loss": 0.4823, + "step": 2314 + }, + { + "epoch": 0.926, + "grad_norm": 0.5286775231361389, + "learning_rate": 1.0004097616885993e-07, + "loss": 0.6211, + "step": 2315 + }, + { + "epoch": 0.9264, + "grad_norm": 0.5312598347663879, + "learning_rate": 9.896748744336842e-08, + "loss": 0.5514, + "step": 2316 + }, + { + "epoch": 0.9268, + "grad_norm": 0.4938899278640747, + "learning_rate": 9.78997067575419e-08, + "loss": 0.5404, + "step": 2317 + }, + { + "epoch": 0.9272, + "grad_norm": 0.5511752367019653, + "learning_rate": 9.683763590346544e-08, + "loss": 0.553, + "step": 2318 + }, + { + "epoch": 0.9276, + "grad_norm": 0.5150926113128662, + "learning_rate": 9.578127666363872e-08, + "loss": 0.4999, + "step": 2319 + }, + { + "epoch": 0.928, + "grad_norm": 0.5279099941253662, + "learning_rate": 9.473063081097633e-08, + "loss": 0.5558, + "step": 2320 + }, + { + "epoch": 0.9284, + "grad_norm": 0.5396522283554077, + "learning_rate": 9.368570010880467e-08, + "loss": 0.5214, + "step": 2321 + }, + { + "epoch": 0.9288, + "grad_norm": 0.5016295313835144, + "learning_rate": 9.264648631085697e-08, + "loss": 0.537, + "step": 2322 + }, + { + "epoch": 0.9292, + "grad_norm": 0.5354951024055481, + "learning_rate": 9.161299116127324e-08, + "loss": 0.5904, + "step": 2323 + }, + { + "epoch": 0.9296, + "grad_norm": 0.46822237968444824, + "learning_rate": 9.058521639459366e-08, + "loss": 0.5, + "step": 2324 + }, + { + "epoch": 0.93, + "grad_norm": 0.5241938829421997, + "learning_rate": 8.956316373575896e-08, + "loss": 0.6066, + "step": 2325 + }, + { + "epoch": 0.9304, + "grad_norm": 0.5749825835227966, + "learning_rate": 8.854683490010656e-08, + "loss": 0.5586, + "step": 2326 + }, + { + "epoch": 0.9308, + "grad_norm": 0.5310881733894348, + "learning_rate": 8.753623159336748e-08, + "loss": 0.5686, + "step": 2327 + }, + { + "epoch": 0.9312, + "grad_norm": 0.47297438979148865, + "learning_rate": 8.653135551166241e-08, + "loss": 0.5044, + "step": 2328 + }, + { + "epoch": 0.9316, + "grad_norm": 0.49978524446487427, + "learning_rate": 8.553220834150055e-08, + "loss": 0.4801, + "step": 2329 + }, + { + "epoch": 0.932, + "grad_norm": 0.4736655056476593, + "learning_rate": 8.453879175977614e-08, + "loss": 0.5754, + "step": 2330 + }, + { + "epoch": 0.9324, + "grad_norm": 0.5332768559455872, + "learning_rate": 8.355110743376648e-08, + "loss": 0.5547, + "step": 2331 + }, + { + "epoch": 0.9328, + "grad_norm": 0.4920896589756012, + "learning_rate": 8.256915702112616e-08, + "loss": 0.4997, + "step": 2332 + }, + { + "epoch": 0.9332, + "grad_norm": 0.5521984100341797, + "learning_rate": 8.159294216988816e-08, + "loss": 0.5899, + "step": 2333 + }, + { + "epoch": 0.9336, + "grad_norm": 0.5347005128860474, + "learning_rate": 8.062246451845845e-08, + "loss": 0.5885, + "step": 2334 + }, + { + "epoch": 0.934, + "grad_norm": 0.5441831946372986, + "learning_rate": 7.96577256956144e-08, + "loss": 0.5231, + "step": 2335 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5577284097671509, + "learning_rate": 7.869872732050253e-08, + "loss": 0.5694, + "step": 2336 + }, + { + "epoch": 0.9348, + "grad_norm": 0.533072829246521, + "learning_rate": 7.774547100263413e-08, + "loss": 0.5164, + "step": 2337 + }, + { + "epoch": 0.9352, + "grad_norm": 0.5311025381088257, + "learning_rate": 7.679795834188296e-08, + "loss": 0.562, + "step": 2338 + }, + { + "epoch": 0.9356, + "grad_norm": 0.5785207152366638, + "learning_rate": 7.585619092848339e-08, + "loss": 0.5512, + "step": 2339 + }, + { + "epoch": 0.936, + "grad_norm": 0.5167658925056458, + "learning_rate": 7.49201703430291e-08, + "loss": 0.5515, + "step": 2340 + }, + { + "epoch": 0.9364, + "grad_norm": 0.5739880204200745, + "learning_rate": 7.398989815646662e-08, + "loss": 0.5778, + "step": 2341 + }, + { + "epoch": 0.9368, + "grad_norm": 0.5385161638259888, + "learning_rate": 7.306537593009555e-08, + "loss": 0.5224, + "step": 2342 + }, + { + "epoch": 0.9372, + "grad_norm": 0.4797918498516083, + "learning_rate": 7.214660521556482e-08, + "loss": 0.5574, + "step": 2343 + }, + { + "epoch": 0.9376, + "grad_norm": 0.5047261714935303, + "learning_rate": 7.123358755487069e-08, + "loss": 0.5462, + "step": 2344 + }, + { + "epoch": 0.938, + "grad_norm": 0.5187482237815857, + "learning_rate": 7.032632448035481e-08, + "loss": 0.4779, + "step": 2345 + }, + { + "epoch": 0.9384, + "grad_norm": 0.4853595197200775, + "learning_rate": 6.942481751469953e-08, + "loss": 0.5042, + "step": 2346 + }, + { + "epoch": 0.9388, + "grad_norm": 0.5323401093482971, + "learning_rate": 6.85290681709268e-08, + "loss": 0.5266, + "step": 2347 + }, + { + "epoch": 0.9392, + "grad_norm": 0.5271187424659729, + "learning_rate": 6.763907795239538e-08, + "loss": 0.4735, + "step": 2348 + }, + { + "epoch": 0.9396, + "grad_norm": 0.5426302552223206, + "learning_rate": 6.675484835279932e-08, + "loss": 0.5779, + "step": 2349 + }, + { + "epoch": 0.94, + "grad_norm": 0.580814778804779, + "learning_rate": 6.587638085616332e-08, + "loss": 0.5807, + "step": 2350 + }, + { + "epoch": 0.9404, + "grad_norm": 0.5777420997619629, + "learning_rate": 6.500367693684189e-08, + "loss": 0.5932, + "step": 2351 + }, + { + "epoch": 0.9408, + "grad_norm": 0.5483967661857605, + "learning_rate": 6.413673805951591e-08, + "loss": 0.6241, + "step": 2352 + }, + { + "epoch": 0.9412, + "grad_norm": 0.49772343039512634, + "learning_rate": 6.327556567919146e-08, + "loss": 0.4763, + "step": 2353 + }, + { + "epoch": 0.9416, + "grad_norm": 0.5433846712112427, + "learning_rate": 6.242016124119626e-08, + "loss": 0.5387, + "step": 2354 + }, + { + "epoch": 0.942, + "grad_norm": 0.5307440161705017, + "learning_rate": 6.157052618117703e-08, + "loss": 0.5218, + "step": 2355 + }, + { + "epoch": 0.9424, + "grad_norm": 0.5752222537994385, + "learning_rate": 6.072666192509751e-08, + "loss": 0.5562, + "step": 2356 + }, + { + "epoch": 0.9428, + "grad_norm": 0.5130374431610107, + "learning_rate": 5.988856988923769e-08, + "loss": 0.4861, + "step": 2357 + }, + { + "epoch": 0.9432, + "grad_norm": 0.5313212871551514, + "learning_rate": 5.905625148018834e-08, + "loss": 0.5385, + "step": 2358 + }, + { + "epoch": 0.9436, + "grad_norm": 0.5523070693016052, + "learning_rate": 5.8229708094850676e-08, + "loss": 0.5731, + "step": 2359 + }, + { + "epoch": 0.944, + "grad_norm": 0.526229202747345, + "learning_rate": 5.74089411204336e-08, + "loss": 0.5016, + "step": 2360 + }, + { + "epoch": 0.9444, + "grad_norm": 0.5088282823562622, + "learning_rate": 5.6593951934450994e-08, + "loss": 0.5887, + "step": 2361 + }, + { + "epoch": 0.9448, + "grad_norm": 0.5603100061416626, + "learning_rate": 5.578474190472016e-08, + "loss": 0.5791, + "step": 2362 + }, + { + "epoch": 0.9452, + "grad_norm": 0.4806862771511078, + "learning_rate": 5.4981312389359494e-08, + "loss": 0.5215, + "step": 2363 + }, + { + "epoch": 0.9456, + "grad_norm": 0.5049216151237488, + "learning_rate": 5.41836647367846e-08, + "loss": 0.5432, + "step": 2364 + }, + { + "epoch": 0.946, + "grad_norm": 0.5518359541893005, + "learning_rate": 5.339180028570789e-08, + "loss": 0.4736, + "step": 2365 + }, + { + "epoch": 0.9464, + "grad_norm": 0.5348437428474426, + "learning_rate": 5.260572036513589e-08, + "loss": 0.4547, + "step": 2366 + }, + { + "epoch": 0.9468, + "grad_norm": 0.5421221852302551, + "learning_rate": 5.182542629436648e-08, + "loss": 0.5452, + "step": 2367 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4720580279827118, + "learning_rate": 5.105091938298739e-08, + "loss": 0.5686, + "step": 2368 + }, + { + "epoch": 0.9476, + "grad_norm": 0.5404186248779297, + "learning_rate": 5.028220093087343e-08, + "loss": 0.562, + "step": 2369 + }, + { + "epoch": 0.948, + "grad_norm": 0.5333725810050964, + "learning_rate": 4.951927222818381e-08, + "loss": 0.5939, + "step": 2370 + }, + { + "epoch": 0.9484, + "grad_norm": 0.5129185914993286, + "learning_rate": 4.876213455536171e-08, + "loss": 0.5754, + "step": 2371 + }, + { + "epoch": 0.9488, + "grad_norm": 0.5559066534042358, + "learning_rate": 4.8010789183130066e-08, + "loss": 0.5132, + "step": 2372 + }, + { + "epoch": 0.9492, + "grad_norm": 0.5458990335464478, + "learning_rate": 4.7265237372491883e-08, + "loss": 0.5678, + "step": 2373 + }, + { + "epoch": 0.9496, + "grad_norm": 0.5720266699790955, + "learning_rate": 4.652548037472487e-08, + "loss": 0.5237, + "step": 2374 + }, + { + "epoch": 0.95, + "grad_norm": 0.5585337281227112, + "learning_rate": 4.5791519431382544e-08, + "loss": 0.5886, + "step": 2375 + }, + { + "epoch": 0.9504, + "grad_norm": 0.49161794781684875, + "learning_rate": 4.506335577429038e-08, + "loss": 0.4594, + "step": 2376 + }, + { + "epoch": 0.9508, + "grad_norm": 0.4829513430595398, + "learning_rate": 4.434099062554386e-08, + "loss": 0.6157, + "step": 2377 + }, + { + "epoch": 0.9512, + "grad_norm": 0.4910193085670471, + "learning_rate": 4.362442519750692e-08, + "loss": 0.5071, + "step": 2378 + }, + { + "epoch": 0.9516, + "grad_norm": 0.5328912734985352, + "learning_rate": 4.291366069280961e-08, + "loss": 0.5709, + "step": 2379 + }, + { + "epoch": 0.952, + "grad_norm": 0.6091710329055786, + "learning_rate": 4.2208698304346145e-08, + "loss": 0.4926, + "step": 2380 + }, + { + "epoch": 0.9524, + "grad_norm": 0.5346353054046631, + "learning_rate": 4.150953921527262e-08, + "loss": 0.5452, + "step": 2381 + }, + { + "epoch": 0.9528, + "grad_norm": 0.45982980728149414, + "learning_rate": 4.0816184599005764e-08, + "loss": 0.5673, + "step": 2382 + }, + { + "epoch": 0.9532, + "grad_norm": 0.5378580093383789, + "learning_rate": 4.012863561922031e-08, + "loss": 0.5315, + "step": 2383 + }, + { + "epoch": 0.9536, + "grad_norm": 0.5490142703056335, + "learning_rate": 3.944689342984775e-08, + "loss": 0.5656, + "step": 2384 + }, + { + "epoch": 0.954, + "grad_norm": 0.5665016770362854, + "learning_rate": 3.8770959175072506e-08, + "loss": 0.5158, + "step": 2385 + }, + { + "epoch": 0.9544, + "grad_norm": 0.5378921031951904, + "learning_rate": 3.810083398933267e-08, + "loss": 0.5287, + "step": 2386 + }, + { + "epoch": 0.9548, + "grad_norm": 0.4553784132003784, + "learning_rate": 3.7436518997316926e-08, + "loss": 0.4685, + "step": 2387 + }, + { + "epoch": 0.9552, + "grad_norm": 0.5257003903388977, + "learning_rate": 3.6778015313961406e-08, + "loss": 0.5858, + "step": 2388 + }, + { + "epoch": 0.9556, + "grad_norm": 0.5115745067596436, + "learning_rate": 3.6125324044450123e-08, + "loss": 0.5712, + "step": 2389 + }, + { + "epoch": 0.956, + "grad_norm": 0.5314233899116516, + "learning_rate": 3.547844628421104e-08, + "loss": 0.5518, + "step": 2390 + }, + { + "epoch": 0.9564, + "grad_norm": 0.5423491597175598, + "learning_rate": 3.483738311891571e-08, + "loss": 0.6051, + "step": 2391 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5177218317985535, + "learning_rate": 3.420213562447733e-08, + "loss": 0.5818, + "step": 2392 + }, + { + "epoch": 0.9572, + "grad_norm": 0.5107486844062805, + "learning_rate": 3.357270486704722e-08, + "loss": 0.5967, + "step": 2393 + }, + { + "epoch": 0.9576, + "grad_norm": 0.4815487265586853, + "learning_rate": 3.2949091903016033e-08, + "loss": 0.5463, + "step": 2394 + }, + { + "epoch": 0.958, + "grad_norm": 0.5988988280296326, + "learning_rate": 3.2331297779008286e-08, + "loss": 0.5769, + "step": 2395 + }, + { + "epoch": 0.9584, + "grad_norm": 0.6041003465652466, + "learning_rate": 3.171932353188389e-08, + "loss": 0.5919, + "step": 2396 + }, + { + "epoch": 0.9588, + "grad_norm": 0.4653492569923401, + "learning_rate": 3.1113170188735494e-08, + "loss": 0.5024, + "step": 2397 + }, + { + "epoch": 0.9592, + "grad_norm": 0.4932403266429901, + "learning_rate": 3.051283876688493e-08, + "loss": 0.5802, + "step": 2398 + }, + { + "epoch": 0.9596, + "grad_norm": 0.4975288212299347, + "learning_rate": 2.991833027388441e-08, + "loss": 0.5104, + "step": 2399 + }, + { + "epoch": 0.96, + "grad_norm": 0.4812232255935669, + "learning_rate": 2.932964570751223e-08, + "loss": 0.4926, + "step": 2400 + }, + { + "epoch": 0.9604, + "grad_norm": 0.5227335691452026, + "learning_rate": 2.87467860557728e-08, + "loss": 0.4955, + "step": 2401 + }, + { + "epoch": 0.9608, + "grad_norm": 0.5533686876296997, + "learning_rate": 2.816975229689428e-08, + "loss": 0.4968, + "step": 2402 + }, + { + "epoch": 0.9612, + "grad_norm": 0.5150798559188843, + "learning_rate": 2.759854539932782e-08, + "loss": 0.6334, + "step": 2403 + }, + { + "epoch": 0.9616, + "grad_norm": 0.5787931680679321, + "learning_rate": 2.7033166321743296e-08, + "loss": 0.4845, + "step": 2404 + }, + { + "epoch": 0.962, + "grad_norm": 0.5208541750907898, + "learning_rate": 2.6473616013032385e-08, + "loss": 0.5725, + "step": 2405 + }, + { + "epoch": 0.9624, + "grad_norm": 0.49922534823417664, + "learning_rate": 2.5919895412301218e-08, + "loss": 0.6088, + "step": 2406 + }, + { + "epoch": 0.9628, + "grad_norm": 0.48126330971717834, + "learning_rate": 2.5372005448873856e-08, + "loss": 0.5589, + "step": 2407 + }, + { + "epoch": 0.9632, + "grad_norm": 0.4809359610080719, + "learning_rate": 2.4829947042288412e-08, + "loss": 0.5026, + "step": 2408 + }, + { + "epoch": 0.9636, + "grad_norm": 0.5534669756889343, + "learning_rate": 2.429372110229433e-08, + "loss": 0.5381, + "step": 2409 + }, + { + "epoch": 0.964, + "grad_norm": 0.5311241149902344, + "learning_rate": 2.3763328528853932e-08, + "loss": 0.562, + "step": 2410 + }, + { + "epoch": 0.9644, + "grad_norm": 0.5506380200386047, + "learning_rate": 2.323877021213894e-08, + "loss": 0.5741, + "step": 2411 + }, + { + "epoch": 0.9648, + "grad_norm": 0.5539811253547668, + "learning_rate": 2.27200470325285e-08, + "loss": 0.5573, + "step": 2412 + }, + { + "epoch": 0.9652, + "grad_norm": 0.5888808965682983, + "learning_rate": 2.2207159860608837e-08, + "loss": 0.6698, + "step": 2413 + }, + { + "epoch": 0.9656, + "grad_norm": 0.5768120884895325, + "learning_rate": 2.1700109557171666e-08, + "loss": 0.5487, + "step": 2414 + }, + { + "epoch": 0.966, + "grad_norm": 0.5086904168128967, + "learning_rate": 2.1198896973212643e-08, + "loss": 0.6062, + "step": 2415 + }, + { + "epoch": 0.9664, + "grad_norm": 0.5336338877677917, + "learning_rate": 2.0703522949929833e-08, + "loss": 0.5496, + "step": 2416 + }, + { + "epoch": 0.9668, + "grad_norm": 0.5526391863822937, + "learning_rate": 2.021398831872173e-08, + "loss": 0.5389, + "step": 2417 + }, + { + "epoch": 0.9672, + "grad_norm": 0.49860087037086487, + "learning_rate": 1.9730293901186522e-08, + "loss": 0.5651, + "step": 2418 + }, + { + "epoch": 0.9676, + "grad_norm": 0.509070873260498, + "learning_rate": 1.925244050912167e-08, + "loss": 0.5729, + "step": 2419 + }, + { + "epoch": 0.968, + "grad_norm": 0.5186116695404053, + "learning_rate": 1.8780428944520035e-08, + "loss": 0.5349, + "step": 2420 + }, + { + "epoch": 0.9684, + "grad_norm": 0.6276172995567322, + "learning_rate": 1.8314259999571426e-08, + "loss": 0.6, + "step": 2421 + }, + { + "epoch": 0.9688, + "grad_norm": 0.6774277091026306, + "learning_rate": 1.7853934456658336e-08, + "loss": 0.6064, + "step": 2422 + }, + { + "epoch": 0.9692, + "grad_norm": 0.4820622205734253, + "learning_rate": 1.739945308835672e-08, + "loss": 0.5151, + "step": 2423 + }, + { + "epoch": 0.9696, + "grad_norm": 0.5160215497016907, + "learning_rate": 1.6950816657435198e-08, + "loss": 0.5131, + "step": 2424 + }, + { + "epoch": 0.97, + "grad_norm": 0.5508131384849548, + "learning_rate": 1.650802591685119e-08, + "loss": 0.5214, + "step": 2425 + }, + { + "epoch": 0.9704, + "grad_norm": 0.5874997973442078, + "learning_rate": 1.607108160975168e-08, + "loss": 0.615, + "step": 2426 + }, + { + "epoch": 0.9708, + "grad_norm": 0.5136599540710449, + "learning_rate": 1.563998446947129e-08, + "loss": 0.5952, + "step": 2427 + }, + { + "epoch": 0.9712, + "grad_norm": 0.5393446087837219, + "learning_rate": 1.5214735219531484e-08, + "loss": 0.5409, + "step": 2428 + }, + { + "epoch": 0.9716, + "grad_norm": 0.5421205759048462, + "learning_rate": 1.4795334573639407e-08, + "loss": 0.5534, + "step": 2429 + }, + { + "epoch": 0.972, + "grad_norm": 0.5334486365318298, + "learning_rate": 1.4381783235685175e-08, + "loss": 0.5431, + "step": 2430 + }, + { + "epoch": 0.9724, + "grad_norm": 0.5072309970855713, + "learning_rate": 1.397408189974303e-08, + "loss": 0.5234, + "step": 2431 + }, + { + "epoch": 0.9728, + "grad_norm": 0.5345374941825867, + "learning_rate": 1.3572231250068233e-08, + "loss": 0.5247, + "step": 2432 + }, + { + "epoch": 0.9732, + "grad_norm": 0.4882015883922577, + "learning_rate": 1.3176231961097073e-08, + "loss": 0.5452, + "step": 2433 + }, + { + "epoch": 0.9736, + "grad_norm": 0.4947294592857361, + "learning_rate": 1.2786084697445688e-08, + "loss": 0.5102, + "step": 2434 + }, + { + "epoch": 0.974, + "grad_norm": 0.49401557445526123, + "learning_rate": 1.2401790113907351e-08, + "loss": 0.5259, + "step": 2435 + }, + { + "epoch": 0.9744, + "grad_norm": 0.4852409064769745, + "learning_rate": 1.202334885545403e-08, + "loss": 0.5365, + "step": 2436 + }, + { + "epoch": 0.9748, + "grad_norm": 0.49361684918403625, + "learning_rate": 1.1650761557233268e-08, + "loss": 0.5187, + "step": 2437 + }, + { + "epoch": 0.9752, + "grad_norm": 0.5144620537757874, + "learning_rate": 1.1284028844567417e-08, + "loss": 0.5799, + "step": 2438 + }, + { + "epoch": 0.9756, + "grad_norm": 0.5108654499053955, + "learning_rate": 1.0923151332954406e-08, + "loss": 0.5396, + "step": 2439 + }, + { + "epoch": 0.976, + "grad_norm": 0.49677518010139465, + "learning_rate": 1.0568129628063082e-08, + "loss": 0.5351, + "step": 2440 + }, + { + "epoch": 0.9764, + "grad_norm": 0.6048544645309448, + "learning_rate": 1.0218964325735935e-08, + "loss": 0.5381, + "step": 2441 + }, + { + "epoch": 0.9768, + "grad_norm": 0.5242604613304138, + "learning_rate": 9.875656011985589e-09, + "loss": 0.5123, + "step": 2442 + }, + { + "epoch": 0.9772, + "grad_norm": 0.538178563117981, + "learning_rate": 9.53820526299598e-09, + "loss": 0.5112, + "step": 2443 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4944842755794525, + "learning_rate": 9.206612645118462e-09, + "loss": 0.5608, + "step": 2444 + }, + { + "epoch": 0.978, + "grad_norm": 0.5592899918556213, + "learning_rate": 8.880878714873753e-09, + "loss": 0.5691, + "step": 2445 + }, + { + "epoch": 0.9784, + "grad_norm": 0.5722050070762634, + "learning_rate": 8.561004018949602e-09, + "loss": 0.5671, + "step": 2446 + }, + { + "epoch": 0.9788, + "grad_norm": 0.4920216500759125, + "learning_rate": 8.24698909419963e-09, + "loss": 0.5436, + "step": 2447 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5159383416175842, + "learning_rate": 7.93883446764332e-09, + "loss": 0.5927, + "step": 2448 + }, + { + "epoch": 0.9796, + "grad_norm": 0.5492022037506104, + "learning_rate": 7.636540656463686e-09, + "loss": 0.6018, + "step": 2449 + }, + { + "epoch": 0.98, + "grad_norm": 0.5431396961212158, + "learning_rate": 7.3401081680092295e-09, + "loss": 0.5015, + "step": 2450 + }, + { + "epoch": 0.9804, + "grad_norm": 0.5254392623901367, + "learning_rate": 7.04953749978926e-09, + "loss": 0.5593, + "step": 2451 + }, + { + "epoch": 0.9808, + "grad_norm": 0.46500301361083984, + "learning_rate": 6.764829139476625e-09, + "loss": 0.4852, + "step": 2452 + }, + { + "epoch": 0.9812, + "grad_norm": 0.6032522916793823, + "learning_rate": 6.485983564904596e-09, + "loss": 0.6172, + "step": 2453 + }, + { + "epoch": 0.9816, + "grad_norm": 0.5080193877220154, + "learning_rate": 6.213001244066096e-09, + "loss": 0.5481, + "step": 2454 + }, + { + "epoch": 0.982, + "grad_norm": 0.5079247951507568, + "learning_rate": 5.945882635115252e-09, + "loss": 0.5238, + "step": 2455 + }, + { + "epoch": 0.9824, + "grad_norm": 0.537906289100647, + "learning_rate": 5.684628186363116e-09, + "loss": 0.6, + "step": 2456 + }, + { + "epoch": 0.9828, + "grad_norm": 0.5208892226219177, + "learning_rate": 5.429238336280396e-09, + "loss": 0.525, + "step": 2457 + }, + { + "epoch": 0.9832, + "grad_norm": 0.4783654808998108, + "learning_rate": 5.179713513493944e-09, + "loss": 0.5296, + "step": 2458 + }, + { + "epoch": 0.9836, + "grad_norm": 0.6100749373435974, + "learning_rate": 4.936054136788326e-09, + "loss": 0.5711, + "step": 2459 + }, + { + "epoch": 0.984, + "grad_norm": 0.5242696404457092, + "learning_rate": 4.6982606151027005e-09, + "loss": 0.5879, + "step": 2460 + }, + { + "epoch": 0.9844, + "grad_norm": 0.463133305311203, + "learning_rate": 4.466333347531992e-09, + "loss": 0.4878, + "step": 2461 + }, + { + "epoch": 0.9848, + "grad_norm": 0.5177239775657654, + "learning_rate": 4.240272723325722e-09, + "loss": 0.4898, + "step": 2462 + }, + { + "epoch": 0.9852, + "grad_norm": 0.5330920815467834, + "learning_rate": 4.020079121887232e-09, + "loss": 0.4809, + "step": 2463 + }, + { + "epoch": 0.9856, + "grad_norm": 0.4839600622653961, + "learning_rate": 3.805752912773297e-09, + "loss": 0.5072, + "step": 2464 + }, + { + "epoch": 0.986, + "grad_norm": 0.5248364210128784, + "learning_rate": 3.5972944556929563e-09, + "loss": 0.5916, + "step": 2465 + }, + { + "epoch": 0.9864, + "grad_norm": 0.5435823798179626, + "learning_rate": 3.3947041005071287e-09, + "loss": 0.5015, + "step": 2466 + }, + { + "epoch": 0.9868, + "grad_norm": 0.5040256381034851, + "learning_rate": 3.1979821872286097e-09, + "loss": 0.5426, + "step": 2467 + }, + { + "epoch": 0.9872, + "grad_norm": 0.47600534558296204, + "learning_rate": 3.007129046020518e-09, + "loss": 0.5207, + "step": 2468 + }, + { + "epoch": 0.9876, + "grad_norm": 0.5221179127693176, + "learning_rate": 2.8221449971970736e-09, + "loss": 0.4919, + "step": 2469 + }, + { + "epoch": 0.988, + "grad_norm": 0.4820321500301361, + "learning_rate": 2.643030351221265e-09, + "loss": 0.5271, + "step": 2470 + }, + { + "epoch": 0.9884, + "grad_norm": 0.5176935195922852, + "learning_rate": 2.469785408705627e-09, + "loss": 0.5299, + "step": 2471 + }, + { + "epoch": 0.9888, + "grad_norm": 0.5503344535827637, + "learning_rate": 2.302410460412241e-09, + "loss": 0.5496, + "step": 2472 + }, + { + "epoch": 0.9892, + "grad_norm": 0.5495110154151917, + "learning_rate": 2.1409057872504023e-09, + "loss": 0.5612, + "step": 2473 + }, + { + "epoch": 0.9896, + "grad_norm": 0.5193551182746887, + "learning_rate": 1.985271660277399e-09, + "loss": 0.5379, + "step": 2474 + }, + { + "epoch": 0.99, + "grad_norm": 0.5184204578399658, + "learning_rate": 1.8355083406977335e-09, + "loss": 0.4782, + "step": 2475 + }, + { + "epoch": 0.9904, + "grad_norm": 0.5474755167961121, + "learning_rate": 1.6916160798635116e-09, + "loss": 0.5934, + "step": 2476 + }, + { + "epoch": 0.9908, + "grad_norm": 0.49980610609054565, + "learning_rate": 1.553595119272888e-09, + "loss": 0.4373, + "step": 2477 + }, + { + "epoch": 0.9912, + "grad_norm": 0.5719631314277649, + "learning_rate": 1.4214456905696782e-09, + "loss": 0.6477, + "step": 2478 + }, + { + "epoch": 0.9916, + "grad_norm": 0.49593043327331543, + "learning_rate": 1.2951680155433575e-09, + "loss": 0.4784, + "step": 2479 + }, + { + "epoch": 0.992, + "grad_norm": 0.5717119574546814, + "learning_rate": 1.174762306129451e-09, + "loss": 0.5891, + "step": 2480 + }, + { + "epoch": 0.9924, + "grad_norm": 0.4778268039226532, + "learning_rate": 1.0602287644075891e-09, + "loss": 0.5293, + "step": 2481 + }, + { + "epoch": 0.9928, + "grad_norm": 0.5617204308509827, + "learning_rate": 9.515675826026748e-10, + "loss": 0.6281, + "step": 2482 + }, + { + "epoch": 0.9932, + "grad_norm": 0.5070087909698486, + "learning_rate": 8.487789430829395e-10, + "loss": 0.5313, + "step": 2483 + }, + { + "epoch": 0.9936, + "grad_norm": 0.5938708186149597, + "learning_rate": 7.518630183618868e-10, + "loss": 0.5321, + "step": 2484 + }, + { + "epoch": 0.994, + "grad_norm": 0.5089312195777893, + "learning_rate": 6.608199710959606e-10, + "loss": 0.5176, + "step": 2485 + }, + { + "epoch": 0.9944, + "grad_norm": 0.5960400700569153, + "learning_rate": 5.75649954084545e-10, + "loss": 0.662, + "step": 2486 + }, + { + "epoch": 0.9948, + "grad_norm": 0.4653564393520355, + "learning_rate": 4.96353110270742e-10, + "loss": 0.4459, + "step": 2487 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5162554979324341, + "learning_rate": 4.229295727405935e-10, + "loss": 0.4724, + "step": 2488 + }, + { + "epoch": 0.9956, + "grad_norm": 0.5672651529312134, + "learning_rate": 3.5537946472308233e-10, + "loss": 0.5596, + "step": 2489 + }, + { + "epoch": 0.996, + "grad_norm": 0.5480309128761292, + "learning_rate": 2.937028995885771e-10, + "loss": 0.6553, + "step": 2490 + }, + { + "epoch": 0.9964, + "grad_norm": 0.47406649589538574, + "learning_rate": 2.3789998085077536e-10, + "loss": 0.5226, + "step": 2491 + }, + { + "epoch": 0.9968, + "grad_norm": 0.5353025794029236, + "learning_rate": 1.8797080216514938e-10, + "loss": 0.5022, + "step": 2492 + }, + { + "epoch": 0.9972, + "grad_norm": 0.5093353986740112, + "learning_rate": 1.4391544732894612e-10, + "loss": 0.4815, + "step": 2493 + }, + { + "epoch": 0.9976, + "grad_norm": 0.4614766538143158, + "learning_rate": 1.0573399028196428e-10, + "loss": 0.4883, + "step": 2494 + }, + { + "epoch": 0.998, + "grad_norm": 0.48147180676460266, + "learning_rate": 7.342649510422295e-11, + "loss": 0.565, + "step": 2495 + }, + { + "epoch": 0.9984, + "grad_norm": 0.5356802344322205, + "learning_rate": 4.699301601907013e-11, + "loss": 0.5475, + "step": 2496 + }, + { + "epoch": 0.9988, + "grad_norm": 0.5377554297447205, + "learning_rate": 2.6433597389685648e-11, + "loss": 0.5219, + "step": 2497 + }, + { + "epoch": 0.9992, + "grad_norm": 0.4769822061061859, + "learning_rate": 1.1748273722189673e-11, + "loss": 0.506, + "step": 2498 + }, + { + "epoch": 0.9996, + "grad_norm": 0.5197920799255371, + "learning_rate": 2.9370696629227312e-12, + "loss": 0.603, + "step": 2499 + }, + { + "epoch": 1.0, + "grad_norm": 0.5427217483520508, + "learning_rate": 0.0, + "loss": 0.5824, + "step": 2500 + }, + { + "epoch": 1.0, + "step": 2500, + "total_flos": 1.1471048046280704e+20, + "train_loss": 0.06534878476858139, + "train_runtime": 3332.6777, + "train_samples_per_second": 90.017, + "train_steps_per_second": 0.75 + } + ], + "logging_steps": 1.0, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1471048046280704e+20, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}