{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005263157894736842, "grad_norm": 12.642855011431948, "learning_rate": 5.2631578947368416e-08, "loss": 1.0803, "step": 1 }, { "epoch": 0.010526315789473684, "grad_norm": 12.34425414294974, "learning_rate": 1.0526315789473683e-07, "loss": 1.0462, "step": 2 }, { "epoch": 0.015789473684210527, "grad_norm": 11.451279866893907, "learning_rate": 1.5789473684210525e-07, "loss": 1.0479, "step": 3 }, { "epoch": 0.021052631578947368, "grad_norm": 12.195321064233118, "learning_rate": 2.1052631578947366e-07, "loss": 1.0667, "step": 4 }, { "epoch": 0.02631578947368421, "grad_norm": 11.732705675313184, "learning_rate": 2.631578947368421e-07, "loss": 1.0227, "step": 5 }, { "epoch": 0.031578947368421054, "grad_norm": 11.450018010840793, "learning_rate": 3.157894736842105e-07, "loss": 1.0279, "step": 6 }, { "epoch": 0.03684210526315789, "grad_norm": 11.469564671793638, "learning_rate": 3.684210526315789e-07, "loss": 1.0395, "step": 7 }, { "epoch": 0.042105263157894736, "grad_norm": 10.28912569661124, "learning_rate": 4.2105263157894733e-07, "loss": 1.0004, "step": 8 }, { "epoch": 0.04736842105263158, "grad_norm": 10.064429251308997, "learning_rate": 4.7368421052631574e-07, "loss": 1.0354, "step": 9 }, { "epoch": 0.05263157894736842, "grad_norm": 11.11360392847431, "learning_rate": 5.263157894736842e-07, "loss": 1.0282, "step": 10 }, { "epoch": 0.05789473684210526, "grad_norm": 8.842870004697499, "learning_rate": 5.789473684210526e-07, "loss": 0.9573, "step": 11 }, { "epoch": 0.06315789473684211, "grad_norm": 9.419526776304732, "learning_rate": 6.31578947368421e-07, "loss": 1.0123, "step": 12 }, { "epoch": 0.06842105263157895, "grad_norm": 8.531235500445332, "learning_rate": 6.842105263157895e-07, "loss": 0.9723, "step": 13 }, { "epoch": 0.07368421052631578, "grad_norm": 7.612142885708635, "learning_rate": 7.368421052631578e-07, "loss": 0.9488, "step": 14 }, { "epoch": 0.07894736842105263, "grad_norm": 7.243400548958967, "learning_rate": 7.894736842105263e-07, "loss": 0.9566, "step": 15 }, { "epoch": 0.08421052631578947, "grad_norm": 7.715704166499068, "learning_rate": 8.421052631578947e-07, "loss": 0.9586, "step": 16 }, { "epoch": 0.08947368421052632, "grad_norm": 5.681343828224437, "learning_rate": 8.947368421052631e-07, "loss": 0.8972, "step": 17 }, { "epoch": 0.09473684210526316, "grad_norm": 5.909330321398334, "learning_rate": 9.473684210526315e-07, "loss": 0.9462, "step": 18 }, { "epoch": 0.1, "grad_norm": 5.551955653512476, "learning_rate": 1e-06, "loss": 0.9716, "step": 19 }, { "epoch": 0.10526315789473684, "grad_norm": 3.9884175482678628, "learning_rate": 9.999810668616084e-07, "loss": 0.8733, "step": 20 }, { "epoch": 0.11052631578947368, "grad_norm": 4.3500944022312495, "learning_rate": 9.999242688802884e-07, "loss": 0.8819, "step": 21 }, { "epoch": 0.11578947368421053, "grad_norm": 4.612977019312943, "learning_rate": 9.998296103574966e-07, "loss": 0.8117, "step": 22 }, { "epoch": 0.12105263157894737, "grad_norm": 4.085876398056263, "learning_rate": 9.996970984619641e-07, "loss": 0.8479, "step": 23 }, { "epoch": 0.12631578947368421, "grad_norm": 3.8962745222123605, "learning_rate": 9.995267432291555e-07, "loss": 0.7967, "step": 24 }, { "epoch": 0.13157894736842105, "grad_norm": 4.906007211253714, "learning_rate": 9.993185575605073e-07, "loss": 0.846, "step": 25 }, { "epoch": 0.1368421052631579, "grad_norm": 4.076765235980574, "learning_rate": 9.99072557222452e-07, "loss": 0.8361, "step": 26 }, { "epoch": 0.14210526315789473, "grad_norm": 3.6469407214417875, "learning_rate": 9.987887608452234e-07, "loss": 0.8136, "step": 27 }, { "epoch": 0.14736842105263157, "grad_norm": 4.297874948498594, "learning_rate": 9.984671899214456e-07, "loss": 0.773, "step": 28 }, { "epoch": 0.15263157894736842, "grad_norm": 3.248821707316575, "learning_rate": 9.981078688045062e-07, "loss": 0.7295, "step": 29 }, { "epoch": 0.15789473684210525, "grad_norm": 3.2415876192930315, "learning_rate": 9.977108247067108e-07, "loss": 0.7677, "step": 30 }, { "epoch": 0.1631578947368421, "grad_norm": 3.63042098210306, "learning_rate": 9.972760876972224e-07, "loss": 0.7725, "step": 31 }, { "epoch": 0.16842105263157894, "grad_norm": 3.2114667781734085, "learning_rate": 9.968036906997853e-07, "loss": 0.738, "step": 32 }, { "epoch": 0.1736842105263158, "grad_norm": 3.022035623636562, "learning_rate": 9.962936694902306e-07, "loss": 0.7384, "step": 33 }, { "epoch": 0.17894736842105263, "grad_norm": 2.9523633250041854, "learning_rate": 9.957460626937662e-07, "loss": 0.7509, "step": 34 }, { "epoch": 0.18421052631578946, "grad_norm": 3.080112598494873, "learning_rate": 9.951609117820537e-07, "loss": 0.7605, "step": 35 }, { "epoch": 0.18947368421052632, "grad_norm": 4.039688332869512, "learning_rate": 9.945382610700657e-07, "loss": 0.7369, "step": 36 }, { "epoch": 0.19473684210526315, "grad_norm": 2.8312131444973034, "learning_rate": 9.938781577127306e-07, "loss": 0.7395, "step": 37 }, { "epoch": 0.2, "grad_norm": 3.114482461656541, "learning_rate": 9.931806517013612e-07, "loss": 0.7159, "step": 38 }, { "epoch": 0.20526315789473684, "grad_norm": 2.7086708072786942, "learning_rate": 9.92445795859869e-07, "loss": 0.7222, "step": 39 }, { "epoch": 0.21052631578947367, "grad_norm": 3.7485449995739937, "learning_rate": 9.91673645840763e-07, "loss": 0.6949, "step": 40 }, { "epoch": 0.21578947368421053, "grad_norm": 2.8503331290525566, "learning_rate": 9.908642601209365e-07, "loss": 0.7092, "step": 41 }, { "epoch": 0.22105263157894736, "grad_norm": 3.079001602780246, "learning_rate": 9.900176999972364e-07, "loss": 0.7088, "step": 42 }, { "epoch": 0.22631578947368422, "grad_norm": 2.564364372698868, "learning_rate": 9.89134029581823e-07, "loss": 0.7025, "step": 43 }, { "epoch": 0.23157894736842105, "grad_norm": 3.1544438320150854, "learning_rate": 9.88213315797313e-07, "loss": 0.6982, "step": 44 }, { "epoch": 0.23684210526315788, "grad_norm": 3.3994379889515405, "learning_rate": 9.872556283717124e-07, "loss": 0.7051, "step": 45 }, { "epoch": 0.24210526315789474, "grad_norm": 2.7042300223916067, "learning_rate": 9.862610398331359e-07, "loss": 0.7234, "step": 46 }, { "epoch": 0.24736842105263157, "grad_norm": 2.6577071952641584, "learning_rate": 9.85229625504313e-07, "loss": 0.7011, "step": 47 }, { "epoch": 0.25263157894736843, "grad_norm": 3.050166736127533, "learning_rate": 9.841614634968843e-07, "loss": 0.7075, "step": 48 }, { "epoch": 0.2578947368421053, "grad_norm": 6.357403841745397, "learning_rate": 9.830566347054867e-07, "loss": 0.7151, "step": 49 }, { "epoch": 0.2631578947368421, "grad_norm": 2.545588371195535, "learning_rate": 9.819152228016257e-07, "loss": 0.6481, "step": 50 }, { "epoch": 0.26842105263157895, "grad_norm": 2.5786439940280035, "learning_rate": 9.807373142273394e-07, "loss": 0.6587, "step": 51 }, { "epoch": 0.2736842105263158, "grad_norm": 3.13815883329848, "learning_rate": 9.795229981886521e-07, "loss": 0.646, "step": 52 }, { "epoch": 0.2789473684210526, "grad_norm": 2.773920821177136, "learning_rate": 9.782723666488179e-07, "loss": 0.6955, "step": 53 }, { "epoch": 0.28421052631578947, "grad_norm": 2.617357302802378, "learning_rate": 9.769855143213574e-07, "loss": 0.6312, "step": 54 }, { "epoch": 0.2894736842105263, "grad_norm": 2.8145131931082252, "learning_rate": 9.756625386628832e-07, "loss": 0.6193, "step": 55 }, { "epoch": 0.29473684210526313, "grad_norm": 2.4833405259734307, "learning_rate": 9.7430353986572e-07, "loss": 0.6439, "step": 56 }, { "epoch": 0.3, "grad_norm": 2.5453834053616355, "learning_rate": 9.729086208503173e-07, "loss": 0.6436, "step": 57 }, { "epoch": 0.30526315789473685, "grad_norm": 2.753486017153059, "learning_rate": 9.71477887257454e-07, "loss": 0.6903, "step": 58 }, { "epoch": 0.3105263157894737, "grad_norm": 3.0871711482073767, "learning_rate": 9.700114474402388e-07, "loss": 0.6612, "step": 59 }, { "epoch": 0.3157894736842105, "grad_norm": 2.903286008491912, "learning_rate": 9.685094124559032e-07, "loss": 0.6685, "step": 60 }, { "epoch": 0.32105263157894737, "grad_norm": 2.6328385667684246, "learning_rate": 9.669718960573925e-07, "loss": 0.6562, "step": 61 }, { "epoch": 0.3263157894736842, "grad_norm": 2.4572855900723307, "learning_rate": 9.653990146847498e-07, "loss": 0.6747, "step": 62 }, { "epoch": 0.33157894736842103, "grad_norm": 3.2309917558351215, "learning_rate": 9.637908874562976e-07, "loss": 0.6727, "step": 63 }, { "epoch": 0.3368421052631579, "grad_norm": 2.6952262267407208, "learning_rate": 9.621476361596177e-07, "loss": 0.6816, "step": 64 }, { "epoch": 0.34210526315789475, "grad_norm": 2.4940546746298593, "learning_rate": 9.604693852423268e-07, "loss": 0.6452, "step": 65 }, { "epoch": 0.3473684210526316, "grad_norm": 3.7336842467494984, "learning_rate": 9.587562618026521e-07, "loss": 0.6358, "step": 66 }, { "epoch": 0.3526315789473684, "grad_norm": 2.4509664199477093, "learning_rate": 9.570083955798065e-07, "loss": 0.6626, "step": 67 }, { "epoch": 0.35789473684210527, "grad_norm": 2.91731520990893, "learning_rate": 9.552259189441624e-07, "loss": 0.6769, "step": 68 }, { "epoch": 0.3631578947368421, "grad_norm": 3.0789821120953365, "learning_rate": 9.534089668872273e-07, "loss": 0.6847, "step": 69 }, { "epoch": 0.3684210526315789, "grad_norm": 2.6786175238418726, "learning_rate": 9.515576770114197e-07, "loss": 0.655, "step": 70 }, { "epoch": 0.3736842105263158, "grad_norm": 2.634227056004132, "learning_rate": 9.496721895196495e-07, "loss": 0.7022, "step": 71 }, { "epoch": 0.37894736842105264, "grad_norm": 2.664437320388568, "learning_rate": 9.477526472046994e-07, "loss": 0.6619, "step": 72 }, { "epoch": 0.38421052631578945, "grad_norm": 4.512374130100286, "learning_rate": 9.457991954384103e-07, "loss": 0.667, "step": 73 }, { "epoch": 0.3894736842105263, "grad_norm": 2.8480285570523036, "learning_rate": 9.438119821606727e-07, "loss": 0.638, "step": 74 }, { "epoch": 0.39473684210526316, "grad_norm": 2.2934398392098574, "learning_rate": 9.417911578682228e-07, "loss": 0.6324, "step": 75 }, { "epoch": 0.4, "grad_norm": 2.2872153423968293, "learning_rate": 9.397368756032444e-07, "loss": 0.6718, "step": 76 }, { "epoch": 0.4052631578947368, "grad_norm": 2.6631337006486757, "learning_rate": 9.376492909417795e-07, "loss": 0.6606, "step": 77 }, { "epoch": 0.4105263157894737, "grad_norm": 2.5268933861039717, "learning_rate": 9.355285619819449e-07, "loss": 0.6477, "step": 78 }, { "epoch": 0.41578947368421054, "grad_norm": 2.492744860536843, "learning_rate": 9.333748493319602e-07, "loss": 0.6159, "step": 79 }, { "epoch": 0.42105263157894735, "grad_norm": 6.620410049291206, "learning_rate": 9.311883160979843e-07, "loss": 0.6229, "step": 80 }, { "epoch": 0.4263157894736842, "grad_norm": 2.8596307639279375, "learning_rate": 9.289691278717622e-07, "loss": 0.6608, "step": 81 }, { "epoch": 0.43157894736842106, "grad_norm": 2.2817426902775564, "learning_rate": 9.267174527180854e-07, "loss": 0.6233, "step": 82 }, { "epoch": 0.4368421052631579, "grad_norm": 2.338517201348536, "learning_rate": 9.244334611620627e-07, "loss": 0.6434, "step": 83 }, { "epoch": 0.4421052631578947, "grad_norm": 2.229568223588675, "learning_rate": 9.221173261762073e-07, "loss": 0.645, "step": 84 }, { "epoch": 0.4473684210526316, "grad_norm": 3.2473078589534654, "learning_rate": 9.197692231673359e-07, "loss": 0.6591, "step": 85 }, { "epoch": 0.45263157894736844, "grad_norm": 2.4771191451211605, "learning_rate": 9.173893299632855e-07, "loss": 0.623, "step": 86 }, { "epoch": 0.45789473684210524, "grad_norm": 2.7158915691477565, "learning_rate": 9.149778267994456e-07, "loss": 0.6423, "step": 87 }, { "epoch": 0.4631578947368421, "grad_norm": 2.6330534202405875, "learning_rate": 9.125348963051089e-07, "loss": 0.6598, "step": 88 }, { "epoch": 0.46842105263157896, "grad_norm": 2.524476758159517, "learning_rate": 9.100607234896396e-07, "loss": 0.6308, "step": 89 }, { "epoch": 0.47368421052631576, "grad_norm": 2.501226879638191, "learning_rate": 9.075554957284633e-07, "loss": 0.6393, "step": 90 }, { "epoch": 0.4789473684210526, "grad_norm": 3.933826469332694, "learning_rate": 9.050194027488754e-07, "loss": 0.643, "step": 91 }, { "epoch": 0.4842105263157895, "grad_norm": 2.494431369881789, "learning_rate": 9.024526366156732e-07, "loss": 0.6324, "step": 92 }, { "epoch": 0.48947368421052634, "grad_norm": 2.5647183175292976, "learning_rate": 8.998553917166108e-07, "loss": 0.6678, "step": 93 }, { "epoch": 0.49473684210526314, "grad_norm": 2.7418107425427403, "learning_rate": 8.972278647476763e-07, "loss": 0.6401, "step": 94 }, { "epoch": 0.5, "grad_norm": 2.4908001630523504, "learning_rate": 8.945702546981968e-07, "loss": 0.6376, "step": 95 }, { "epoch": 0.5052631578947369, "grad_norm": 2.3820979238811097, "learning_rate": 8.918827628357677e-07, "loss": 0.6604, "step": 96 }, { "epoch": 0.5105263157894737, "grad_norm": 6.555925708049527, "learning_rate": 8.891655926910102e-07, "loss": 0.602, "step": 97 }, { "epoch": 0.5157894736842106, "grad_norm": 2.767476794957958, "learning_rate": 8.864189500421581e-07, "loss": 0.6351, "step": 98 }, { "epoch": 0.5210526315789473, "grad_norm": 2.5165154253276083, "learning_rate": 8.836430428994731e-07, "loss": 0.6385, "step": 99 }, { "epoch": 0.5263157894736842, "grad_norm": 2.3143831343939936, "learning_rate": 8.808380814894911e-07, "loss": 0.6369, "step": 100 }, { "epoch": 0.531578947368421, "grad_norm": 2.2866922921441972, "learning_rate": 8.780042782391027e-07, "loss": 0.6296, "step": 101 }, { "epoch": 0.5368421052631579, "grad_norm": 3.5036243819126316, "learning_rate": 8.751418477594643e-07, "loss": 0.6345, "step": 102 }, { "epoch": 0.5421052631578948, "grad_norm": 2.4647712993984623, "learning_rate": 8.722510068297453e-07, "loss": 0.6015, "step": 103 }, { "epoch": 0.5473684210526316, "grad_norm": 2.6893219919656612, "learning_rate": 8.693319743807115e-07, "loss": 0.6523, "step": 104 }, { "epoch": 0.5526315789473685, "grad_norm": 2.520295568093033, "learning_rate": 8.663849714781442e-07, "loss": 0.6275, "step": 105 }, { "epoch": 0.5578947368421052, "grad_norm": 2.5102646596872367, "learning_rate": 8.634102213060983e-07, "loss": 0.6235, "step": 106 }, { "epoch": 0.5631578947368421, "grad_norm": 2.420532153270969, "learning_rate": 8.604079491500009e-07, "loss": 0.6415, "step": 107 }, { "epoch": 0.5684210526315789, "grad_norm": 2.419586650925291, "learning_rate": 8.573783823795888e-07, "loss": 0.6083, "step": 108 }, { "epoch": 0.5736842105263158, "grad_norm": 2.4954004438078017, "learning_rate": 8.543217504316895e-07, "loss": 0.6043, "step": 109 }, { "epoch": 0.5789473684210527, "grad_norm": 2.631207821232483, "learning_rate": 8.512382847928461e-07, "loss": 0.5966, "step": 110 }, { "epoch": 0.5842105263157895, "grad_norm": 2.4487969119631607, "learning_rate": 8.48128218981785e-07, "loss": 0.6207, "step": 111 }, { "epoch": 0.5894736842105263, "grad_norm": 2.4819981729995795, "learning_rate": 8.449917885317319e-07, "loss": 0.6043, "step": 112 }, { "epoch": 0.5947368421052631, "grad_norm": 3.0806875014131254, "learning_rate": 8.418292309725738e-07, "loss": 0.6393, "step": 113 }, { "epoch": 0.6, "grad_norm": 2.8047374095987743, "learning_rate": 8.386407858128706e-07, "loss": 0.6162, "step": 114 }, { "epoch": 0.6052631578947368, "grad_norm": 2.2653217647871426, "learning_rate": 8.354266945217159e-07, "loss": 0.6422, "step": 115 }, { "epoch": 0.6105263157894737, "grad_norm": 2.387247632448718, "learning_rate": 8.321872005104508e-07, "loss": 0.6256, "step": 116 }, { "epoch": 0.6157894736842106, "grad_norm": 2.70280163207189, "learning_rate": 8.289225491142291e-07, "loss": 0.6206, "step": 117 }, { "epoch": 0.6210526315789474, "grad_norm": 7.377993411312978, "learning_rate": 8.256329875734374e-07, "loss": 0.6425, "step": 118 }, { "epoch": 0.6263157894736842, "grad_norm": 2.3380220919877988, "learning_rate": 8.223187650149711e-07, "loss": 0.6116, "step": 119 }, { "epoch": 0.631578947368421, "grad_norm": 3.323728626512576, "learning_rate": 8.18980132433368e-07, "loss": 0.608, "step": 120 }, { "epoch": 0.6368421052631579, "grad_norm": 3.1749966640550946, "learning_rate": 8.156173426717988e-07, "loss": 0.5815, "step": 121 }, { "epoch": 0.6421052631578947, "grad_norm": 7.107357659646092, "learning_rate": 8.122306504029193e-07, "loss": 0.597, "step": 122 }, { "epoch": 0.6473684210526316, "grad_norm": 4.479509224501224, "learning_rate": 8.088203121095829e-07, "loss": 0.5931, "step": 123 }, { "epoch": 0.6526315789473685, "grad_norm": 2.709896735413064, "learning_rate": 8.053865860654174e-07, "loss": 0.6132, "step": 124 }, { "epoch": 0.6578947368421053, "grad_norm": 3.1553051157828085, "learning_rate": 8.019297323152641e-07, "loss": 0.6462, "step": 125 }, { "epoch": 0.6631578947368421, "grad_norm": 2.6077332270949025, "learning_rate": 7.984500126554851e-07, "loss": 0.6412, "step": 126 }, { "epoch": 0.6684210526315789, "grad_norm": 2.687935639196366, "learning_rate": 7.949476906141359e-07, "loss": 0.6271, "step": 127 }, { "epoch": 0.6736842105263158, "grad_norm": 2.48251997551827, "learning_rate": 7.914230314310077e-07, "loss": 0.6027, "step": 128 }, { "epoch": 0.6789473684210526, "grad_norm": 2.9438977642779403, "learning_rate": 7.878763020375414e-07, "loss": 0.6412, "step": 129 }, { "epoch": 0.6842105263157895, "grad_norm": 4.032885299122092, "learning_rate": 7.843077710366104e-07, "loss": 0.6592, "step": 130 }, { "epoch": 0.6894736842105263, "grad_norm": 2.7903181652647406, "learning_rate": 7.807177086821801e-07, "loss": 0.6248, "step": 131 }, { "epoch": 0.6947368421052632, "grad_norm": 2.968921847649656, "learning_rate": 7.771063868588399e-07, "loss": 0.6231, "step": 132 }, { "epoch": 0.7, "grad_norm": 2.3977400854216486, "learning_rate": 7.734740790612136e-07, "loss": 0.6228, "step": 133 }, { "epoch": 0.7052631578947368, "grad_norm": 2.3140577613757425, "learning_rate": 7.698210603732454e-07, "loss": 0.5791, "step": 134 }, { "epoch": 0.7105263157894737, "grad_norm": 2.523785887846981, "learning_rate": 7.661476074473694e-07, "loss": 0.5997, "step": 135 }, { "epoch": 0.7157894736842105, "grad_norm": 2.7229089664651585, "learning_rate": 7.624539984835556e-07, "loss": 0.5994, "step": 136 }, { "epoch": 0.7210526315789474, "grad_norm": 2.4305539942752548, "learning_rate": 7.587405132082432e-07, "loss": 0.5941, "step": 137 }, { "epoch": 0.7263157894736842, "grad_norm": 2.253433002879369, "learning_rate": 7.550074328531544e-07, "loss": 0.6323, "step": 138 }, { "epoch": 0.7315789473684211, "grad_norm": 2.87605074258251, "learning_rate": 7.512550401339971e-07, "loss": 0.605, "step": 139 }, { "epoch": 0.7368421052631579, "grad_norm": 2.4820171983737964, "learning_rate": 7.47483619229054e-07, "loss": 0.6038, "step": 140 }, { "epoch": 0.7421052631578947, "grad_norm": 2.994909930351913, "learning_rate": 7.436934557576611e-07, "loss": 0.5898, "step": 141 }, { "epoch": 0.7473684210526316, "grad_norm": 2.4089107961391716, "learning_rate": 7.39884836758576e-07, "loss": 0.6191, "step": 142 }, { "epoch": 0.7526315789473684, "grad_norm": 2.484528784332179, "learning_rate": 7.360580506682413e-07, "loss": 0.593, "step": 143 }, { "epoch": 0.7578947368421053, "grad_norm": 2.526886015112745, "learning_rate": 7.322133872989398e-07, "loss": 0.6036, "step": 144 }, { "epoch": 0.7631578947368421, "grad_norm": 3.1068042639656226, "learning_rate": 7.283511378168457e-07, "loss": 0.5986, "step": 145 }, { "epoch": 0.7684210526315789, "grad_norm": 2.426127834823539, "learning_rate": 7.244715947199749e-07, "loss": 0.6097, "step": 146 }, { "epoch": 0.7736842105263158, "grad_norm": 2.3123421734560115, "learning_rate": 7.20575051816033e-07, "loss": 0.5765, "step": 147 }, { "epoch": 0.7789473684210526, "grad_norm": 3.0322275767396927, "learning_rate": 7.166618042001639e-07, "loss": 0.6209, "step": 148 }, { "epoch": 0.7842105263157895, "grad_norm": 2.0885878316769113, "learning_rate": 7.127321482326026e-07, "loss": 0.6038, "step": 149 }, { "epoch": 0.7894736842105263, "grad_norm": 2.6655995523304443, "learning_rate": 7.087863815162299e-07, "loss": 0.6039, "step": 150 }, { "epoch": 0.7947368421052632, "grad_norm": 3.5462821286242603, "learning_rate": 7.048248028740349e-07, "loss": 0.5951, "step": 151 }, { "epoch": 0.8, "grad_norm": 2.471112364580461, "learning_rate": 7.008477123264847e-07, "loss": 0.5965, "step": 152 }, { "epoch": 0.8052631578947368, "grad_norm": 2.5061966397777162, "learning_rate": 6.968554110688019e-07, "loss": 0.5871, "step": 153 }, { "epoch": 0.8105263157894737, "grad_norm": 2.406700451452237, "learning_rate": 6.928482014481558e-07, "loss": 0.6105, "step": 154 }, { "epoch": 0.8157894736842105, "grad_norm": 3.1910017405823705, "learning_rate": 6.888263869407632e-07, "loss": 0.587, "step": 155 }, { "epoch": 0.8210526315789474, "grad_norm": 2.467282283935378, "learning_rate": 6.847902721289067e-07, "loss": 0.6227, "step": 156 }, { "epoch": 0.8263157894736842, "grad_norm": 2.295127372658213, "learning_rate": 6.807401626778679e-07, "loss": 0.5899, "step": 157 }, { "epoch": 0.8315789473684211, "grad_norm": 2.6433009830471406, "learning_rate": 6.766763653127772e-07, "loss": 0.5892, "step": 158 }, { "epoch": 0.8368421052631579, "grad_norm": 3.0749394453309526, "learning_rate": 6.725991877953867e-07, "loss": 0.6041, "step": 159 }, { "epoch": 0.8421052631578947, "grad_norm": 2.345915225733439, "learning_rate": 6.68508938900761e-07, "loss": 0.586, "step": 160 }, { "epoch": 0.8473684210526315, "grad_norm": 3.0520257716599097, "learning_rate": 6.644059283938937e-07, "loss": 0.6134, "step": 161 }, { "epoch": 0.8526315789473684, "grad_norm": 2.747377304181717, "learning_rate": 6.602904670062475e-07, "loss": 0.5633, "step": 162 }, { "epoch": 0.8578947368421053, "grad_norm": 2.179146773849276, "learning_rate": 6.561628664122226e-07, "loss": 0.5706, "step": 163 }, { "epoch": 0.8631578947368421, "grad_norm": 2.3135324153171495, "learning_rate": 6.520234392055521e-07, "loss": 0.6167, "step": 164 }, { "epoch": 0.868421052631579, "grad_norm": 9.71533571058622, "learning_rate": 6.478724988756284e-07, "loss": 0.6069, "step": 165 }, { "epoch": 0.8736842105263158, "grad_norm": 2.4920221204476554, "learning_rate": 6.437103597837629e-07, "loss": 0.5963, "step": 166 }, { "epoch": 0.8789473684210526, "grad_norm": 3.0329738688798145, "learning_rate": 6.395373371393769e-07, "loss": 0.5822, "step": 167 }, { "epoch": 0.8842105263157894, "grad_norm": 2.6956141149357973, "learning_rate": 6.353537469761315e-07, "loss": 0.5937, "step": 168 }, { "epoch": 0.8894736842105263, "grad_norm": 2.5218470185601998, "learning_rate": 6.311599061279931e-07, "loss": 0.6025, "step": 169 }, { "epoch": 0.8947368421052632, "grad_norm": 2.384640301104755, "learning_rate": 6.269561322052377e-07, "loss": 0.6249, "step": 170 }, { "epoch": 0.9, "grad_norm": 2.5020602687904634, "learning_rate": 6.227427435703995e-07, "loss": 0.5954, "step": 171 }, { "epoch": 0.9052631578947369, "grad_norm": 2.5824028414049014, "learning_rate": 6.185200593141591e-07, "loss": 0.6262, "step": 172 }, { "epoch": 0.9105263157894737, "grad_norm": 3.3671575875344177, "learning_rate": 6.14288399231178e-07, "loss": 0.5806, "step": 173 }, { "epoch": 0.9157894736842105, "grad_norm": 2.8730810418787165, "learning_rate": 6.100480837958801e-07, "loss": 0.5756, "step": 174 }, { "epoch": 0.9210526315789473, "grad_norm": 2.4545013421309414, "learning_rate": 6.057994341381812e-07, "loss": 0.5878, "step": 175 }, { "epoch": 0.9263157894736842, "grad_norm": 2.3488962229180923, "learning_rate": 6.015427720191692e-07, "loss": 0.5887, "step": 176 }, { "epoch": 0.9315789473684211, "grad_norm": 2.4164866608841695, "learning_rate": 5.97278419806736e-07, "loss": 0.5847, "step": 177 }, { "epoch": 0.9368421052631579, "grad_norm": 2.536723291986412, "learning_rate": 5.93006700451164e-07, "loss": 0.6001, "step": 178 }, { "epoch": 0.9421052631578948, "grad_norm": 2.578895932915379, "learning_rate": 5.887279374606679e-07, "loss": 0.5695, "step": 179 }, { "epoch": 0.9473684210526315, "grad_norm": 2.2778696227839865, "learning_rate": 5.844424548768951e-07, "loss": 0.6, "step": 180 }, { "epoch": 0.9526315789473684, "grad_norm": 2.0983119878164005, "learning_rate": 5.801505772503853e-07, "loss": 0.574, "step": 181 }, { "epoch": 0.9578947368421052, "grad_norm": 2.276622095316042, "learning_rate": 5.758526296159905e-07, "loss": 0.5896, "step": 182 }, { "epoch": 0.9631578947368421, "grad_norm": 2.3702885416300936, "learning_rate": 5.7154893746826e-07, "loss": 0.6187, "step": 183 }, { "epoch": 0.968421052631579, "grad_norm": 2.6682239348053076, "learning_rate": 5.672398267367901e-07, "loss": 0.5975, "step": 184 }, { "epoch": 0.9736842105263158, "grad_norm": 2.4953199970370754, "learning_rate": 5.629256237615402e-07, "loss": 0.5833, "step": 185 }, { "epoch": 0.9789473684210527, "grad_norm": 2.3148878554267633, "learning_rate": 5.586066552681179e-07, "loss": 0.5656, "step": 186 }, { "epoch": 0.9842105263157894, "grad_norm": 2.5271611837279138, "learning_rate": 5.542832483430363e-07, "loss": 0.5645, "step": 187 }, { "epoch": 0.9894736842105263, "grad_norm": 2.494396905400437, "learning_rate": 5.499557304089418e-07, "loss": 0.6034, "step": 188 }, { "epoch": 0.9947368421052631, "grad_norm": 2.5191956286524677, "learning_rate": 5.456244291998182e-07, "loss": 0.6025, "step": 189 }, { "epoch": 1.0, "grad_norm": 2.3574219439115813, "learning_rate": 5.412896727361662e-07, "loss": 0.5931, "step": 190 }, { "epoch": 1.0052631578947369, "grad_norm": 2.3448910845211666, "learning_rate": 5.369517893001619e-07, "loss": 0.5247, "step": 191 }, { "epoch": 1.0105263157894737, "grad_norm": 3.1508229753956085, "learning_rate": 5.326111074107951e-07, "loss": 0.5441, "step": 192 }, { "epoch": 1.0157894736842106, "grad_norm": 2.4870945738772305, "learning_rate": 5.282679557989896e-07, "loss": 0.5467, "step": 193 }, { "epoch": 1.0210526315789474, "grad_norm": 2.462541528431831, "learning_rate": 5.239226633827073e-07, "loss": 0.5653, "step": 194 }, { "epoch": 1.0263157894736843, "grad_norm": 2.58644706797543, "learning_rate": 5.195755592420386e-07, "loss": 0.5446, "step": 195 }, { "epoch": 1.0315789473684212, "grad_norm": 2.335054761677809, "learning_rate": 5.152269725942813e-07, "loss": 0.5615, "step": 196 }, { "epoch": 1.0368421052631578, "grad_norm": 2.624795958426164, "learning_rate": 5.108772327690064e-07, "loss": 0.5462, "step": 197 }, { "epoch": 1.0421052631578946, "grad_norm": 2.260065923566593, "learning_rate": 5.06526669183118e-07, "loss": 0.532, "step": 198 }, { "epoch": 1.0473684210526315, "grad_norm": 2.2958442656392495, "learning_rate": 5.021756113159061e-07, "loss": 0.5585, "step": 199 }, { "epoch": 1.0526315789473684, "grad_norm": 2.426007034037249, "learning_rate": 4.978243886840939e-07, "loss": 0.5175, "step": 200 }, { "epoch": 1.0578947368421052, "grad_norm": 2.1330228014416734, "learning_rate": 4.93473330816882e-07, "loss": 0.5483, "step": 201 }, { "epoch": 1.063157894736842, "grad_norm": 2.6733289129426225, "learning_rate": 4.891227672309935e-07, "loss": 0.5351, "step": 202 }, { "epoch": 1.068421052631579, "grad_norm": 2.225783335907971, "learning_rate": 4.847730274057186e-07, "loss": 0.5231, "step": 203 }, { "epoch": 1.0736842105263158, "grad_norm": 2.5020405988561065, "learning_rate": 4.804244407579613e-07, "loss": 0.5527, "step": 204 }, { "epoch": 1.0789473684210527, "grad_norm": 2.2536599203492664, "learning_rate": 4.7607733661729287e-07, "loss": 0.5461, "step": 205 }, { "epoch": 1.0842105263157895, "grad_norm": 2.183669632744124, "learning_rate": 4.717320442010104e-07, "loss": 0.5506, "step": 206 }, { "epoch": 1.0894736842105264, "grad_norm": 2.3465016065867492, "learning_rate": 4.6738889258920476e-07, "loss": 0.5701, "step": 207 }, { "epoch": 1.0947368421052632, "grad_norm": 2.496841967109924, "learning_rate": 4.630482106998381e-07, "loss": 0.559, "step": 208 }, { "epoch": 1.1, "grad_norm": 2.238933045944526, "learning_rate": 4.5871032726383385e-07, "loss": 0.5678, "step": 209 }, { "epoch": 1.1052631578947367, "grad_norm": 3.3765686416844725, "learning_rate": 4.5437557080018175e-07, "loss": 0.575, "step": 210 }, { "epoch": 1.1105263157894736, "grad_norm": 2.6001233660513687, "learning_rate": 4.500442695910581e-07, "loss": 0.5444, "step": 211 }, { "epoch": 1.1157894736842104, "grad_norm": 2.4980230677779156, "learning_rate": 4.4571675165696364e-07, "loss": 0.532, "step": 212 }, { "epoch": 1.1210526315789473, "grad_norm": 2.38332156122142, "learning_rate": 4.4139334473188206e-07, "loss": 0.5275, "step": 213 }, { "epoch": 1.1263157894736842, "grad_norm": 2.7714314545935115, "learning_rate": 4.3707437623845987e-07, "loss": 0.5623, "step": 214 }, { "epoch": 1.131578947368421, "grad_norm": 2.314265141849706, "learning_rate": 4.327601732632098e-07, "loss": 0.5526, "step": 215 }, { "epoch": 1.1368421052631579, "grad_norm": 2.345637212860385, "learning_rate": 4.2845106253173996e-07, "loss": 0.5475, "step": 216 }, { "epoch": 1.1421052631578947, "grad_norm": 2.404757712189546, "learning_rate": 4.241473703840096e-07, "loss": 0.5478, "step": 217 }, { "epoch": 1.1473684210526316, "grad_norm": 2.7886042663656676, "learning_rate": 4.198494227496147e-07, "loss": 0.5449, "step": 218 }, { "epoch": 1.1526315789473685, "grad_norm": 2.240431787069853, "learning_rate": 4.1555754512310474e-07, "loss": 0.5305, "step": 219 }, { "epoch": 1.1578947368421053, "grad_norm": 2.275044580303918, "learning_rate": 4.112720625393321e-07, "loss": 0.555, "step": 220 }, { "epoch": 1.1631578947368422, "grad_norm": 2.176470249987148, "learning_rate": 4.069932995488361e-07, "loss": 0.5342, "step": 221 }, { "epoch": 1.168421052631579, "grad_norm": 2.3180546727082874, "learning_rate": 4.027215801932641e-07, "loss": 0.5545, "step": 222 }, { "epoch": 1.1736842105263159, "grad_norm": 2.500478770242888, "learning_rate": 3.984572279808306e-07, "loss": 0.541, "step": 223 }, { "epoch": 1.1789473684210527, "grad_norm": 3.2129334508357736, "learning_rate": 3.9420056586181876e-07, "loss": 0.531, "step": 224 }, { "epoch": 1.1842105263157894, "grad_norm": 3.190794082513424, "learning_rate": 3.8995191620411994e-07, "loss": 0.5538, "step": 225 }, { "epoch": 1.1894736842105262, "grad_norm": 2.2027187652727087, "learning_rate": 3.85711600768822e-07, "loss": 0.5553, "step": 226 }, { "epoch": 1.194736842105263, "grad_norm": 9.415768580485024, "learning_rate": 3.814799406858408e-07, "loss": 0.5352, "step": 227 }, { "epoch": 1.2, "grad_norm": 2.5331004049850994, "learning_rate": 3.772572564296004e-07, "loss": 0.552, "step": 228 }, { "epoch": 1.2052631578947368, "grad_norm": 2.5568166677231936, "learning_rate": 3.730438677947624e-07, "loss": 0.524, "step": 229 }, { "epoch": 1.2105263157894737, "grad_norm": 2.430967299620147, "learning_rate": 3.688400938720071e-07, "loss": 0.5596, "step": 230 }, { "epoch": 1.2157894736842105, "grad_norm": 2.2841927182707686, "learning_rate": 3.646462530238683e-07, "loss": 0.5504, "step": 231 }, { "epoch": 1.2210526315789474, "grad_norm": 2.709784732818941, "learning_rate": 3.60462662860623e-07, "loss": 0.5485, "step": 232 }, { "epoch": 1.2263157894736842, "grad_norm": 2.35859083822343, "learning_rate": 3.5628964021623696e-07, "loss": 0.556, "step": 233 }, { "epoch": 1.231578947368421, "grad_norm": 2.4468595923428214, "learning_rate": 3.521275011243715e-07, "loss": 0.5603, "step": 234 }, { "epoch": 1.236842105263158, "grad_norm": 2.4049891202411886, "learning_rate": 3.47976560794448e-07, "loss": 0.5585, "step": 235 }, { "epoch": 1.2421052631578948, "grad_norm": 2.292589627214253, "learning_rate": 3.4383713358777735e-07, "loss": 0.5383, "step": 236 }, { "epoch": 1.2473684210526317, "grad_norm": 2.3752679374582426, "learning_rate": 3.3970953299375257e-07, "loss": 0.5516, "step": 237 }, { "epoch": 1.2526315789473683, "grad_norm": 2.4876254806908116, "learning_rate": 3.3559407160610644e-07, "loss": 0.5484, "step": 238 }, { "epoch": 1.2578947368421054, "grad_norm": 2.762247825036067, "learning_rate": 3.3149106109923896e-07, "loss": 0.5537, "step": 239 }, { "epoch": 1.263157894736842, "grad_norm": 2.162751322141153, "learning_rate": 3.274008122046132e-07, "loss": 0.566, "step": 240 }, { "epoch": 1.268421052631579, "grad_norm": 3.581918002736885, "learning_rate": 3.2332363468722267e-07, "loss": 0.5474, "step": 241 }, { "epoch": 1.2736842105263158, "grad_norm": 2.284588189418711, "learning_rate": 3.192598373221322e-07, "loss": 0.5479, "step": 242 }, { "epoch": 1.2789473684210526, "grad_norm": 2.268653101760438, "learning_rate": 3.152097278710933e-07, "loss": 0.5047, "step": 243 }, { "epoch": 1.2842105263157895, "grad_norm": 2.2436341824829715, "learning_rate": 3.1117361305923686e-07, "loss": 0.5442, "step": 244 }, { "epoch": 1.2894736842105263, "grad_norm": 3.526093157139422, "learning_rate": 3.071517985518442e-07, "loss": 0.5301, "step": 245 }, { "epoch": 1.2947368421052632, "grad_norm": 2.4393150883883377, "learning_rate": 3.03144588931198e-07, "loss": 0.552, "step": 246 }, { "epoch": 1.3, "grad_norm": 2.25707824609034, "learning_rate": 2.9915228767351535e-07, "loss": 0.5216, "step": 247 }, { "epoch": 1.305263157894737, "grad_norm": 2.4659870700865554, "learning_rate": 2.9517519712596494e-07, "loss": 0.5531, "step": 248 }, { "epoch": 1.3105263157894738, "grad_norm": 2.211365210217085, "learning_rate": 2.9121361848377013e-07, "loss": 0.5464, "step": 249 }, { "epoch": 1.3157894736842106, "grad_norm": 2.4990811569238147, "learning_rate": 2.872678517673975e-07, "loss": 0.551, "step": 250 }, { "epoch": 1.3210526315789473, "grad_norm": 2.256640053743778, "learning_rate": 2.833381957998362e-07, "loss": 0.537, "step": 251 }, { "epoch": 1.3263157894736843, "grad_norm": 2.439046289136518, "learning_rate": 2.7942494818396687e-07, "loss": 0.5446, "step": 252 }, { "epoch": 1.331578947368421, "grad_norm": 2.4445909166764994, "learning_rate": 2.75528405280025e-07, "loss": 0.5522, "step": 253 }, { "epoch": 1.3368421052631578, "grad_norm": 2.503169992163442, "learning_rate": 2.7164886218315444e-07, "loss": 0.5315, "step": 254 }, { "epoch": 1.3421052631578947, "grad_norm": 3.1819620554512564, "learning_rate": 2.6778661270106025e-07, "loss": 0.5657, "step": 255 }, { "epoch": 1.3473684210526315, "grad_norm": 2.4239145590188675, "learning_rate": 2.639419493317587e-07, "loss": 0.5467, "step": 256 }, { "epoch": 1.3526315789473684, "grad_norm": 2.325463024514789, "learning_rate": 2.601151632414241e-07, "loss": 0.5445, "step": 257 }, { "epoch": 1.3578947368421053, "grad_norm": 2.6223884759828704, "learning_rate": 2.56306544242339e-07, "loss": 0.5484, "step": 258 }, { "epoch": 1.3631578947368421, "grad_norm": 2.8460302822874852, "learning_rate": 2.5251638077094603e-07, "loss": 0.5417, "step": 259 }, { "epoch": 1.368421052631579, "grad_norm": 2.24730489630462, "learning_rate": 2.487449598660029e-07, "loss": 0.5285, "step": 260 }, { "epoch": 1.3736842105263158, "grad_norm": 2.3421953136620406, "learning_rate": 2.449925671468456e-07, "loss": 0.5272, "step": 261 }, { "epoch": 1.3789473684210527, "grad_norm": 2.4559845843844523, "learning_rate": 2.412594867917568e-07, "loss": 0.5421, "step": 262 }, { "epoch": 1.3842105263157896, "grad_norm": 3.115271141122453, "learning_rate": 2.3754600151644444e-07, "loss": 0.5516, "step": 263 }, { "epoch": 1.3894736842105262, "grad_norm": 2.593266549185136, "learning_rate": 2.3385239255263073e-07, "loss": 0.5425, "step": 264 }, { "epoch": 1.3947368421052633, "grad_norm": 2.2082484863636194, "learning_rate": 2.3017893962675454e-07, "loss": 0.542, "step": 265 }, { "epoch": 1.4, "grad_norm": 2.283360151388717, "learning_rate": 2.2652592093878665e-07, "loss": 0.5554, "step": 266 }, { "epoch": 1.4052631578947368, "grad_norm": 2.3911086301133446, "learning_rate": 2.2289361314116006e-07, "loss": 0.5194, "step": 267 }, { "epoch": 1.4105263157894736, "grad_norm": 2.593370895355469, "learning_rate": 2.1928229131782006e-07, "loss": 0.5504, "step": 268 }, { "epoch": 1.4157894736842105, "grad_norm": 2.393818171006261, "learning_rate": 2.1569222896338963e-07, "loss": 0.5452, "step": 269 }, { "epoch": 1.4210526315789473, "grad_norm": 2.4163961537782717, "learning_rate": 2.121236979624586e-07, "loss": 0.5244, "step": 270 }, { "epoch": 1.4263157894736842, "grad_norm": 2.3320647289960883, "learning_rate": 2.085769685689923e-07, "loss": 0.5484, "step": 271 }, { "epoch": 1.431578947368421, "grad_norm": 2.343996890572737, "learning_rate": 2.0505230938586415e-07, "loss": 0.5619, "step": 272 }, { "epoch": 1.436842105263158, "grad_norm": 6.066669575842648, "learning_rate": 2.0154998734451472e-07, "loss": 0.5276, "step": 273 }, { "epoch": 1.4421052631578948, "grad_norm": 2.2887239747665045, "learning_rate": 1.9807026768473579e-07, "loss": 0.5576, "step": 274 }, { "epoch": 1.4473684210526316, "grad_norm": 2.1868450818312644, "learning_rate": 1.9461341393458252e-07, "loss": 0.5505, "step": 275 }, { "epoch": 1.4526315789473685, "grad_norm": 5.547035869334998, "learning_rate": 1.911796878904171e-07, "loss": 0.5304, "step": 276 }, { "epoch": 1.4578947368421051, "grad_norm": 2.356826285784176, "learning_rate": 1.877693495970809e-07, "loss": 0.5072, "step": 277 }, { "epoch": 1.4631578947368422, "grad_norm": 2.1453867969594818, "learning_rate": 1.8438265732820125e-07, "loss": 0.5542, "step": 278 }, { "epoch": 1.4684210526315788, "grad_norm": 2.487577099034787, "learning_rate": 1.8101986756663196e-07, "loss": 0.5136, "step": 279 }, { "epoch": 1.4736842105263157, "grad_norm": 2.1415604665665064, "learning_rate": 1.776812349850289e-07, "loss": 0.5188, "step": 280 }, { "epoch": 1.4789473684210526, "grad_norm": 2.2729801926850026, "learning_rate": 1.743670124265627e-07, "loss": 0.5354, "step": 281 }, { "epoch": 1.4842105263157894, "grad_norm": 2.962685274358996, "learning_rate": 1.7107745088577087e-07, "loss": 0.5444, "step": 282 }, { "epoch": 1.4894736842105263, "grad_norm": 2.493997771189138, "learning_rate": 1.6781279948954918e-07, "loss": 0.5124, "step": 283 }, { "epoch": 1.4947368421052631, "grad_norm": 2.174484329414595, "learning_rate": 1.6457330547828403e-07, "loss": 0.5186, "step": 284 }, { "epoch": 1.5, "grad_norm": 2.294851477833156, "learning_rate": 1.6135921418712955e-07, "loss": 0.5459, "step": 285 }, { "epoch": 1.5052631578947369, "grad_norm": 2.431178684998425, "learning_rate": 1.5817076902742622e-07, "loss": 0.5133, "step": 286 }, { "epoch": 1.5105263157894737, "grad_norm": 2.591260995377334, "learning_rate": 1.5500821146826804e-07, "loss": 0.5263, "step": 287 }, { "epoch": 1.5157894736842106, "grad_norm": 2.296988414628801, "learning_rate": 1.5187178101821502e-07, "loss": 0.5406, "step": 288 }, { "epoch": 1.5210526315789474, "grad_norm": 2.518137143341776, "learning_rate": 1.4876171520715397e-07, "loss": 0.5537, "step": 289 }, { "epoch": 1.526315789473684, "grad_norm": 4.157747210914889, "learning_rate": 1.4567824956831042e-07, "loss": 0.5321, "step": 290 }, { "epoch": 1.5315789473684212, "grad_norm": 2.304265529094299, "learning_rate": 1.426216176204112e-07, "loss": 0.5453, "step": 291 }, { "epoch": 1.5368421052631578, "grad_norm": 2.2909234105971894, "learning_rate": 1.395920508499991e-07, "loss": 0.5329, "step": 292 }, { "epoch": 1.5421052631578949, "grad_norm": 2.642177484706335, "learning_rate": 1.3658977869390164e-07, "loss": 0.5309, "step": 293 }, { "epoch": 1.5473684210526315, "grad_norm": 2.4224952394007966, "learning_rate": 1.336150285218558e-07, "loss": 0.5151, "step": 294 }, { "epoch": 1.5526315789473686, "grad_norm": 2.303479541727698, "learning_rate": 1.3066802561928853e-07, "loss": 0.5529, "step": 295 }, { "epoch": 1.5578947368421052, "grad_norm": 3.187263476988102, "learning_rate": 1.2774899317025467e-07, "loss": 0.5496, "step": 296 }, { "epoch": 1.563157894736842, "grad_norm": 2.1628631688530007, "learning_rate": 1.248581522405358e-07, "loss": 0.5525, "step": 297 }, { "epoch": 1.568421052631579, "grad_norm": 2.406176175843737, "learning_rate": 1.219957217608974e-07, "loss": 0.5429, "step": 298 }, { "epoch": 1.5736842105263158, "grad_norm": 2.262578641804591, "learning_rate": 1.1916191851050871e-07, "loss": 0.5066, "step": 299 }, { "epoch": 1.5789473684210527, "grad_norm": 2.4369446864539395, "learning_rate": 1.1635695710052689e-07, "loss": 0.5285, "step": 300 }, { "epoch": 1.5842105263157895, "grad_norm": 2.2109343115341993, "learning_rate": 1.1358104995784185e-07, "loss": 0.5569, "step": 301 }, { "epoch": 1.5894736842105264, "grad_norm": 2.330458766354278, "learning_rate": 1.1083440730898974e-07, "loss": 0.5665, "step": 302 }, { "epoch": 1.594736842105263, "grad_norm": 2.305815278527985, "learning_rate": 1.0811723716423232e-07, "loss": 0.516, "step": 303 }, { "epoch": 1.6, "grad_norm": 2.3079199650608766, "learning_rate": 1.0542974530180327e-07, "loss": 0.5285, "step": 304 }, { "epoch": 1.6052631578947367, "grad_norm": 2.513610078891517, "learning_rate": 1.027721352523237e-07, "loss": 0.544, "step": 305 }, { "epoch": 1.6105263157894738, "grad_norm": 2.5767331454181255, "learning_rate": 1.0014460828338928e-07, "loss": 0.5293, "step": 306 }, { "epoch": 1.6157894736842104, "grad_norm": 2.199657550666274, "learning_rate": 9.754736338432679e-08, "loss": 0.5391, "step": 307 }, { "epoch": 1.6210526315789475, "grad_norm": 2.754603403945606, "learning_rate": 9.498059725112468e-08, "loss": 0.536, "step": 308 }, { "epoch": 1.6263157894736842, "grad_norm": 2.536336143602128, "learning_rate": 9.244450427153683e-08, "loss": 0.5468, "step": 309 }, { "epoch": 1.631578947368421, "grad_norm": 2.3411920268532587, "learning_rate": 8.993927651036049e-08, "loss": 0.5151, "step": 310 }, { "epoch": 1.6368421052631579, "grad_norm": 2.1051916003559255, "learning_rate": 8.746510369489102e-08, "loss": 0.5238, "step": 311 }, { "epoch": 1.6421052631578947, "grad_norm": 3.544977524584269, "learning_rate": 8.502217320055426e-08, "loss": 0.559, "step": 312 }, { "epoch": 1.6473684210526316, "grad_norm": 2.058244791333844, "learning_rate": 8.261067003671446e-08, "loss": 0.5206, "step": 313 }, { "epoch": 1.6526315789473685, "grad_norm": 2.446605948134395, "learning_rate": 8.023077683266399e-08, "loss": 0.5369, "step": 314 }, { "epoch": 1.6578947368421053, "grad_norm": 2.4153323922926, "learning_rate": 7.78826738237926e-08, "loss": 0.5458, "step": 315 }, { "epoch": 1.663157894736842, "grad_norm": 2.6104861029979016, "learning_rate": 7.556653883793724e-08, "loss": 0.5451, "step": 316 }, { "epoch": 1.668421052631579, "grad_norm": 2.3100120056396296, "learning_rate": 7.328254728191463e-08, "loss": 0.5363, "step": 317 }, { "epoch": 1.6736842105263157, "grad_norm": 2.5621407650897936, "learning_rate": 7.103087212823778e-08, "loss": 0.5099, "step": 318 }, { "epoch": 1.6789473684210527, "grad_norm": 2.118171619643243, "learning_rate": 6.881168390201581e-08, "loss": 0.5322, "step": 319 }, { "epoch": 1.6842105263157894, "grad_norm": 2.907352165494029, "learning_rate": 6.66251506680397e-08, "loss": 0.555, "step": 320 }, { "epoch": 1.6894736842105265, "grad_norm": 2.2807676681250446, "learning_rate": 6.447143801805515e-08, "loss": 0.5408, "step": 321 }, { "epoch": 1.694736842105263, "grad_norm": 2.297176719362461, "learning_rate": 6.23507090582206e-08, "loss": 0.5226, "step": 322 }, { "epoch": 1.7, "grad_norm": 2.287950688640522, "learning_rate": 6.026312439675551e-08, "loss": 0.5096, "step": 323 }, { "epoch": 1.7052631578947368, "grad_norm": 3.6162969976516024, "learning_rate": 5.820884213177712e-08, "loss": 0.5365, "step": 324 }, { "epoch": 1.7105263157894737, "grad_norm": 2.2306965460746473, "learning_rate": 5.618801783932725e-08, "loss": 0.5287, "step": 325 }, { "epoch": 1.7157894736842105, "grad_norm": 2.2028587942070588, "learning_rate": 5.420080456158971e-08, "loss": 0.5566, "step": 326 }, { "epoch": 1.7210526315789474, "grad_norm": 2.18142780543031, "learning_rate": 5.2247352795300626e-08, "loss": 0.5375, "step": 327 }, { "epoch": 1.7263157894736842, "grad_norm": 2.197343563558157, "learning_rate": 5.0327810480350344e-08, "loss": 0.5487, "step": 328 }, { "epoch": 1.731578947368421, "grad_norm": 2.316942598992064, "learning_rate": 4.84423229885802e-08, "loss": 0.5432, "step": 329 }, { "epoch": 1.736842105263158, "grad_norm": 2.3842561102459356, "learning_rate": 4.659103311277274e-08, "loss": 0.5237, "step": 330 }, { "epoch": 1.7421052631578946, "grad_norm": 2.177074041941887, "learning_rate": 4.477408105583741e-08, "loss": 0.5203, "step": 331 }, { "epoch": 1.7473684210526317, "grad_norm": 2.312588944046282, "learning_rate": 4.2991604420193396e-08, "loss": 0.5548, "step": 332 }, { "epoch": 1.7526315789473683, "grad_norm": 3.476255191649119, "learning_rate": 4.124373819734794e-08, "loss": 0.5258, "step": 333 }, { "epoch": 1.7578947368421054, "grad_norm": 2.14006154277014, "learning_rate": 3.953061475767339e-08, "loss": 0.5094, "step": 334 }, { "epoch": 1.763157894736842, "grad_norm": 2.152901623992758, "learning_rate": 3.7852363840382316e-08, "loss": 0.5359, "step": 335 }, { "epoch": 1.768421052631579, "grad_norm": 2.2182151450123437, "learning_rate": 3.6209112543702236e-08, "loss": 0.5026, "step": 336 }, { "epoch": 1.7736842105263158, "grad_norm": 2.2236206549096713, "learning_rate": 3.460098531525018e-08, "loss": 0.5636, "step": 337 }, { "epoch": 1.7789473684210526, "grad_norm": 2.384680110369391, "learning_rate": 3.3028103942607353e-08, "loss": 0.5464, "step": 338 }, { "epoch": 1.7842105263157895, "grad_norm": 2.9760197665338937, "learning_rate": 3.149058754409678e-08, "loss": 0.5301, "step": 339 }, { "epoch": 1.7894736842105263, "grad_norm": 2.3297740433944254, "learning_rate": 2.9988552559761294e-08, "loss": 0.536, "step": 340 }, { "epoch": 1.7947368421052632, "grad_norm": 3.9190533049512073, "learning_rate": 2.85221127425459e-08, "loss": 0.5358, "step": 341 }, { "epoch": 1.8, "grad_norm": 2.4877390644915085, "learning_rate": 2.7091379149682682e-08, "loss": 0.5317, "step": 342 }, { "epoch": 1.805263157894737, "grad_norm": 2.250535562910496, "learning_rate": 2.5696460134279953e-08, "loss": 0.5289, "step": 343 }, { "epoch": 1.8105263157894735, "grad_norm": 2.2819477227854446, "learning_rate": 2.4337461337116892e-08, "loss": 0.5534, "step": 344 }, { "epoch": 1.8157894736842106, "grad_norm": 2.488467928210033, "learning_rate": 2.301448567864256e-08, "loss": 0.5169, "step": 345 }, { "epoch": 1.8210526315789473, "grad_norm": 2.5228811063557366, "learning_rate": 2.1727633351181994e-08, "loss": 0.5516, "step": 346 }, { "epoch": 1.8263157894736843, "grad_norm": 2.1521113439043438, "learning_rate": 2.0477001811347984e-08, "loss": 0.5327, "step": 347 }, { "epoch": 1.831578947368421, "grad_norm": 2.2553781295744786, "learning_rate": 1.9262685772660603e-08, "loss": 0.5468, "step": 348 }, { "epoch": 1.836842105263158, "grad_norm": 2.3149958407144946, "learning_rate": 1.8084777198374313e-08, "loss": 0.5553, "step": 349 }, { "epoch": 1.8421052631578947, "grad_norm": 2.2277223935869235, "learning_rate": 1.6943365294513235e-08, "loss": 0.5614, "step": 350 }, { "epoch": 1.8473684210526315, "grad_norm": 2.3882894364212, "learning_rate": 1.5838536503115675e-08, "loss": 0.5449, "step": 351 }, { "epoch": 1.8526315789473684, "grad_norm": 2.2020675552755, "learning_rate": 1.4770374495687133e-08, "loss": 0.5334, "step": 352 }, { "epoch": 1.8578947368421053, "grad_norm": 2.488077093971711, "learning_rate": 1.37389601668641e-08, "loss": 0.5346, "step": 353 }, { "epoch": 1.8631578947368421, "grad_norm": 2.2318977307050356, "learning_rate": 1.274437162828751e-08, "loss": 0.5377, "step": 354 }, { "epoch": 1.868421052631579, "grad_norm": 2.68631686113767, "learning_rate": 1.1786684202687025e-08, "loss": 0.5704, "step": 355 }, { "epoch": 1.8736842105263158, "grad_norm": 2.5296212288222724, "learning_rate": 1.086597041817705e-08, "loss": 0.5546, "step": 356 }, { "epoch": 1.8789473684210525, "grad_norm": 2.2211846561034654, "learning_rate": 9.98230000276351e-09, "loss": 0.5302, "step": 357 }, { "epoch": 1.8842105263157896, "grad_norm": 2.6225570074321554, "learning_rate": 9.135739879063464e-09, "loss": 0.5302, "step": 358 }, { "epoch": 1.8894736842105262, "grad_norm": 2.1746290538615027, "learning_rate": 8.326354159236882e-09, "loss": 0.5465, "step": 359 }, { "epoch": 1.8947368421052633, "grad_norm": 2.4217463634793037, "learning_rate": 7.554204140131137e-09, "loss": 0.5293, "step": 360 }, { "epoch": 1.9, "grad_norm": 2.608159649984883, "learning_rate": 6.819348298638839e-09, "loss": 0.5158, "step": 361 }, { "epoch": 1.905263157894737, "grad_norm": 2.286787198440065, "learning_rate": 6.1218422872694186e-09, "loss": 0.5566, "step": 362 }, { "epoch": 1.9105263157894736, "grad_norm": 2.3164313990764187, "learning_rate": 5.46173892993429e-09, "loss": 0.5344, "step": 363 }, { "epoch": 1.9157894736842105, "grad_norm": 2.271875111172559, "learning_rate": 4.839088217946208e-09, "loss": 0.5479, "step": 364 }, { "epoch": 1.9210526315789473, "grad_norm": 2.3900875539689257, "learning_rate": 4.2539373062336904e-09, "loss": 0.5368, "step": 365 }, { "epoch": 1.9263157894736842, "grad_norm": 2.280769261985202, "learning_rate": 3.7063305097694287e-09, "loss": 0.5503, "step": 366 }, { "epoch": 1.931578947368421, "grad_norm": 2.591641861271708, "learning_rate": 3.196309300214528e-09, "loss": 0.5516, "step": 367 }, { "epoch": 1.936842105263158, "grad_norm": 6.988790160030566, "learning_rate": 2.72391230277752e-09, "loss": 0.5146, "step": 368 }, { "epoch": 1.9421052631578948, "grad_norm": 2.3712111188938714, "learning_rate": 2.289175293289314e-09, "loss": 0.5327, "step": 369 }, { "epoch": 1.9473684210526314, "grad_norm": 2.2537886816054127, "learning_rate": 1.8921311954937514e-09, "loss": 0.539, "step": 370 }, { "epoch": 1.9526315789473685, "grad_norm": 2.5549021483750742, "learning_rate": 1.5328100785542696e-09, "loss": 0.5858, "step": 371 }, { "epoch": 1.9578947368421051, "grad_norm": 2.260161761471616, "learning_rate": 1.2112391547766109e-09, "loss": 0.5325, "step": 372 }, { "epoch": 1.9631578947368422, "grad_norm": 2.0728662282209296, "learning_rate": 9.27442777547971e-10, "loss": 0.5319, "step": 373 }, { "epoch": 1.9684210526315788, "grad_norm": 2.235237067721937, "learning_rate": 6.814424394926965e-10, "loss": 0.5291, "step": 374 }, { "epoch": 1.973684210526316, "grad_norm": 2.448448284365754, "learning_rate": 4.732567708445879e-10, "loss": 0.5517, "step": 375 }, { "epoch": 1.9789473684210526, "grad_norm": 2.2224536930373695, "learning_rate": 3.0290153803591567e-10, "loss": 0.5473, "step": 376 }, { "epoch": 1.9842105263157894, "grad_norm": 2.27924977744772, "learning_rate": 1.7038964250343236e-10, "loss": 0.541, "step": 377 }, { "epoch": 1.9894736842105263, "grad_norm": 2.329045504499136, "learning_rate": 7.573111971148627e-11, "loss": 0.5445, "step": 378 }, { "epoch": 1.9947368421052631, "grad_norm": 2.3718632081482105, "learning_rate": 1.893313839157473e-11, "loss": 0.5472, "step": 379 }, { "epoch": 2.0, "grad_norm": 2.301821768415375, "learning_rate": 0.0, "loss": 0.5179, "step": 380 }, { "epoch": 2.0, "step": 380, "total_flos": 1205553369972736.0, "train_loss": 0.6109166437073758, "train_runtime": 4792.8804, "train_samples_per_second": 5.067, "train_steps_per_second": 0.079 } ], "logging_steps": 1, "max_steps": 380, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1205553369972736.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }