|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.7267080745341614, |
|
"eval_steps": 500, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.012422360248447204, |
|
"grad_norm": 0.00018831038323696703, |
|
"learning_rate": 6.060606060606061e-06, |
|
"loss": 2.4276, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.024844720496894408, |
|
"grad_norm": 0.00010692311479942873, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 2.2565, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.037267080745341616, |
|
"grad_norm": 6.173110887175426e-05, |
|
"learning_rate": 3.0303030303030306e-05, |
|
"loss": 1.9191, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.049689440993788817, |
|
"grad_norm": 4.326488488004543e-05, |
|
"learning_rate": 4.242424242424243e-05, |
|
"loss": 1.7997, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.062111801242236024, |
|
"grad_norm": 3.188664049957879e-05, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 1.6296, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07453416149068323, |
|
"grad_norm": 2.6054423869936727e-05, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.5767, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.08695652173913043, |
|
"grad_norm": 2.7649846742860973e-05, |
|
"learning_rate": 7.878787878787879e-05, |
|
"loss": 1.5131, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.09937888198757763, |
|
"grad_norm": 1.6101763321785256e-05, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 1.4497, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.11180124223602485, |
|
"grad_norm": 2.3354801669484004e-05, |
|
"learning_rate": 0.00010303030303030303, |
|
"loss": 1.4353, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12422360248447205, |
|
"grad_norm": 1.531304769741837e-05, |
|
"learning_rate": 0.00011515151515151516, |
|
"loss": 1.3655, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13664596273291926, |
|
"grad_norm": 9.777257218956947e-06, |
|
"learning_rate": 0.00012727272727272728, |
|
"loss": 1.3691, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.14906832298136646, |
|
"grad_norm": 1.5149210412346292e-05, |
|
"learning_rate": 0.0001393939393939394, |
|
"loss": 1.3916, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16149068322981366, |
|
"grad_norm": 1.1748486940632574e-05, |
|
"learning_rate": 0.00015151515151515152, |
|
"loss": 1.341, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 3.532869595801458e-05, |
|
"learning_rate": 0.00016363636363636366, |
|
"loss": 1.3409, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.18633540372670807, |
|
"grad_norm": 7.877199095673859e-06, |
|
"learning_rate": 0.00017575757575757578, |
|
"loss": 1.3728, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.19875776397515527, |
|
"grad_norm": 1.1650959095277358e-05, |
|
"learning_rate": 0.0001878787878787879, |
|
"loss": 1.3579, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2111801242236025, |
|
"grad_norm": 3.298332740087062e-05, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3835, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2236024844720497, |
|
"grad_norm": 9.103293450607453e-06, |
|
"learning_rate": 0.00019999471258384815, |
|
"loss": 1.3517, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2360248447204969, |
|
"grad_norm": 1.055269422067795e-05, |
|
"learning_rate": 0.000199978850894528, |
|
"loss": 1.3748, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2484472049689441, |
|
"grad_norm": 1.8264010577695444e-05, |
|
"learning_rate": 0.0001999524166093866, |
|
"loss": 1.383, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2608695652173913, |
|
"grad_norm": 9.094917004404124e-06, |
|
"learning_rate": 0.00019991541252380526, |
|
"loss": 1.4102, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2732919254658385, |
|
"grad_norm": 1.286895803787047e-05, |
|
"learning_rate": 0.00019986784255090397, |
|
"loss": 1.3811, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 1.386065014230553e-05, |
|
"learning_rate": 0.0001998097117211276, |
|
"loss": 1.3794, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.2981366459627329, |
|
"grad_norm": 8.020670065889135e-06, |
|
"learning_rate": 0.00019974102618171394, |
|
"loss": 1.3584, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3105590062111801, |
|
"grad_norm": 8.217157301260158e-06, |
|
"learning_rate": 0.00019966179319604355, |
|
"loss": 1.3592, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.32298136645962733, |
|
"grad_norm": 1.1515216101543047e-05, |
|
"learning_rate": 0.00019957202114287187, |
|
"loss": 1.3679, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.33540372670807456, |
|
"grad_norm": 8.87443184183212e-06, |
|
"learning_rate": 0.0001994717195154429, |
|
"loss": 1.3779, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 7.773131073918194e-06, |
|
"learning_rate": 0.00019936089892048556, |
|
"loss": 1.3634, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.36024844720496896, |
|
"grad_norm": 8.845816410030238e-06, |
|
"learning_rate": 0.00019923957107709195, |
|
"loss": 1.3312, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.37267080745341613, |
|
"grad_norm": 8.347928087459877e-06, |
|
"learning_rate": 0.000199107748815478, |
|
"loss": 1.3626, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.38509316770186336, |
|
"grad_norm": 9.047604180523194e-06, |
|
"learning_rate": 0.0001989654460756269, |
|
"loss": 1.3653, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.39751552795031053, |
|
"grad_norm": 8.44144778966438e-06, |
|
"learning_rate": 0.00019881267790581466, |
|
"loss": 1.3712, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.40993788819875776, |
|
"grad_norm": 8.453951522824354e-06, |
|
"learning_rate": 0.0001986494604610191, |
|
"loss": 1.3027, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.422360248447205, |
|
"grad_norm": 7.831426046323031e-06, |
|
"learning_rate": 0.00019847581100121127, |
|
"loss": 1.3737, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 8.356658327102195e-06, |
|
"learning_rate": 0.00019829174788953038, |
|
"loss": 1.3522, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4472049689440994, |
|
"grad_norm": 8.671122486703098e-06, |
|
"learning_rate": 0.0001980972905903418, |
|
"loss": 1.3808, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.45962732919254656, |
|
"grad_norm": 9.771009899850469e-06, |
|
"learning_rate": 0.00019789245966717883, |
|
"loss": 1.3695, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4720496894409938, |
|
"grad_norm": 6.7302303250471596e-06, |
|
"learning_rate": 0.00019767727678056805, |
|
"loss": 1.3754, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.484472049689441, |
|
"grad_norm": 9.788966963242274e-06, |
|
"learning_rate": 0.00019745176468573893, |
|
"loss": 1.379, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.4968944099378882, |
|
"grad_norm": 9.375480658491142e-06, |
|
"learning_rate": 0.00019721594723021732, |
|
"loss": 1.3484, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5093167701863354, |
|
"grad_norm": 3.550610927050002e-05, |
|
"learning_rate": 0.00019696984935130364, |
|
"loss": 1.377, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 9.707620847621001e-06, |
|
"learning_rate": 0.00019671349707343593, |
|
"loss": 1.3587, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5341614906832298, |
|
"grad_norm": 1.0621732144500129e-05, |
|
"learning_rate": 0.00019644691750543767, |
|
"loss": 1.393, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.546583850931677, |
|
"grad_norm": 9.491611308476422e-06, |
|
"learning_rate": 0.0001961701388376511, |
|
"loss": 1.3715, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5590062111801242, |
|
"grad_norm": 8.982805411505979e-06, |
|
"learning_rate": 0.00019588319033895623, |
|
"loss": 1.3829, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 1.0132759598491248e-05, |
|
"learning_rate": 0.0001955861023536756, |
|
"loss": 1.3395, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5838509316770186, |
|
"grad_norm": 1.3477648280968424e-05, |
|
"learning_rate": 0.0001952789062983654, |
|
"loss": 1.3908, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.5962732919254659, |
|
"grad_norm": 1.1435187843744643e-05, |
|
"learning_rate": 0.0001949616346584934, |
|
"loss": 1.3799, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6086956521739131, |
|
"grad_norm": 1.273245652555488e-05, |
|
"learning_rate": 0.00019463432098500337, |
|
"loss": 1.3973, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6211180124223602, |
|
"grad_norm": 1.1162846021761652e-05, |
|
"learning_rate": 0.00019429699989076746, |
|
"loss": 1.3564, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6335403726708074, |
|
"grad_norm": 1.1645292943285313e-05, |
|
"learning_rate": 0.00019394970704692566, |
|
"loss": 1.3535, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6459627329192547, |
|
"grad_norm": 3.541897240211256e-05, |
|
"learning_rate": 0.00019359247917911384, |
|
"loss": 1.3813, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6583850931677019, |
|
"grad_norm": 9.2498794401763e-06, |
|
"learning_rate": 0.00019322535406358, |
|
"loss": 1.3617, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6708074534161491, |
|
"grad_norm": 8.978020559879951e-06, |
|
"learning_rate": 0.00019284837052318933, |
|
"loss": 1.3813, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.6832298136645962, |
|
"grad_norm": 1.3010416296310723e-05, |
|
"learning_rate": 0.00019246156842331918, |
|
"loss": 1.3726, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 1.2821891687053721e-05, |
|
"learning_rate": 0.00019206498866764288, |
|
"loss": 1.4042, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7080745341614907, |
|
"grad_norm": 9.70847668213537e-06, |
|
"learning_rate": 0.00019165867319380456, |
|
"loss": 1.4061, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7204968944099379, |
|
"grad_norm": 1.6388959920732304e-05, |
|
"learning_rate": 0.0001912426649689842, |
|
"loss": 1.3668, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7329192546583851, |
|
"grad_norm": 1.66231893672375e-05, |
|
"learning_rate": 0.00019081700798535397, |
|
"loss": 1.3712, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7453416149068323, |
|
"grad_norm": 1.864772457338404e-05, |
|
"learning_rate": 0.00019038174725542604, |
|
"loss": 1.3849, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7577639751552795, |
|
"grad_norm": 1.4855336303298827e-05, |
|
"learning_rate": 0.0001899369288072927, |
|
"loss": 1.4006, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7701863354037267, |
|
"grad_norm": 1.8388282114756294e-05, |
|
"learning_rate": 0.00018948259967975888, |
|
"loss": 1.4008, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.782608695652174, |
|
"grad_norm": 2.689829489099793e-05, |
|
"learning_rate": 0.00018901880791736793, |
|
"loss": 1.3808, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.7950310559006211, |
|
"grad_norm": 2.5062678105314262e-05, |
|
"learning_rate": 0.000188545602565321, |
|
"loss": 1.3948, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.8074534161490683, |
|
"grad_norm": 2.1250931240501814e-05, |
|
"learning_rate": 0.0001880630336642905, |
|
"loss": 1.402, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8198757763975155, |
|
"grad_norm": 2.403008875262458e-05, |
|
"learning_rate": 0.0001875711522451284, |
|
"loss": 1.384, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8322981366459627, |
|
"grad_norm": 2.597655839053914e-05, |
|
"learning_rate": 0.00018707001032347, |
|
"loss": 1.3541, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.84472049689441, |
|
"grad_norm": 2.3845692339818925e-05, |
|
"learning_rate": 0.0001865596608942331, |
|
"loss": 1.396, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 1.8721084416029043e-05, |
|
"learning_rate": 0.00018604015792601396, |
|
"loss": 1.3427, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 3.2439467759104446e-05, |
|
"learning_rate": 0.0001855115563553803, |
|
"loss": 1.376, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8819875776397516, |
|
"grad_norm": 2.3974960640771315e-05, |
|
"learning_rate": 0.0001849739120810618, |
|
"loss": 1.4019, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.8944099378881988, |
|
"grad_norm": 3.250224108342081e-05, |
|
"learning_rate": 0.00018442728195803881, |
|
"loss": 1.3915, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.906832298136646, |
|
"grad_norm": 2.820833833538927e-05, |
|
"learning_rate": 0.0001838717237915302, |
|
"loss": 1.3943, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9192546583850931, |
|
"grad_norm": 4.0885188354877755e-05, |
|
"learning_rate": 0.00018330729633088045, |
|
"loss": 1.3818, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9316770186335404, |
|
"grad_norm": 5.243903797236271e-05, |
|
"learning_rate": 0.00018273405926334696, |
|
"loss": 1.4112, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9440993788819876, |
|
"grad_norm": 4.928431008011103e-05, |
|
"learning_rate": 0.0001821520732077883, |
|
"loss": 4.4804, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9565217391304348, |
|
"grad_norm": 7.073425513226539e-05, |
|
"learning_rate": 0.00018156139970825391, |
|
"loss": 1.4202, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.968944099378882, |
|
"grad_norm": 4.9798843974713236e-05, |
|
"learning_rate": 0.00018096210122747584, |
|
"loss": 1.4218, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.9813664596273292, |
|
"grad_norm": 6.847319309599698e-05, |
|
"learning_rate": 0.0001803542411402634, |
|
"loss": 1.401, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.9937888198757764, |
|
"grad_norm": 5.58948922844138e-05, |
|
"learning_rate": 0.0001797378837268015, |
|
"loss": 1.3425, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0062111801242235, |
|
"grad_norm": 4.716762123280205e-05, |
|
"learning_rate": 0.00017911309416585296, |
|
"loss": 1.3779, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0186335403726707, |
|
"grad_norm": 5.352822699933313e-05, |
|
"learning_rate": 0.0001784799385278661, |
|
"loss": 1.341, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.031055900621118, |
|
"grad_norm": 5.088459511171095e-05, |
|
"learning_rate": 0.0001778384837679879, |
|
"loss": 1.4021, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.0434782608695652, |
|
"grad_norm": 5.318582770996727e-05, |
|
"learning_rate": 0.00017718879771898348, |
|
"loss": 1.3597, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0559006211180124, |
|
"grad_norm": 4.925034227198921e-05, |
|
"learning_rate": 0.00017653094908406301, |
|
"loss": 1.3425, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0683229813664596, |
|
"grad_norm": 5.734206206398085e-05, |
|
"learning_rate": 0.00017586500742961653, |
|
"loss": 1.389, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.0807453416149069, |
|
"grad_norm": 5.8051424275618047e-05, |
|
"learning_rate": 0.00017519104317785717, |
|
"loss": 1.3776, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.093167701863354, |
|
"grad_norm": 4.752572931465693e-05, |
|
"learning_rate": 0.00017450912759937434, |
|
"loss": 1.4045, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.1055900621118013, |
|
"grad_norm": 4.9184389354195446e-05, |
|
"learning_rate": 0.00017381933280559693, |
|
"loss": 2.4013, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.1180124223602483, |
|
"grad_norm": 6.995958392508328e-05, |
|
"learning_rate": 0.00017312173174116762, |
|
"loss": 1.6484, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1304347826086956, |
|
"grad_norm": 4.960764272254892e-05, |
|
"learning_rate": 0.0001724163981762291, |
|
"loss": 1.3301, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 4.448357503861189e-05, |
|
"learning_rate": 0.0001717034066986231, |
|
"loss": 1.4058, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.15527950310559, |
|
"grad_norm": 4.6940349420765415e-05, |
|
"learning_rate": 0.00017098283270600267, |
|
"loss": 1.3731, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.1677018633540373, |
|
"grad_norm": 4.82694485981483e-05, |
|
"learning_rate": 0.00017025475239785919, |
|
"loss": 1.3586, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.1801242236024845, |
|
"grad_norm": 4.615723810275085e-05, |
|
"learning_rate": 0.00016951924276746425, |
|
"loss": 1.3777, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1925465838509317, |
|
"grad_norm": 4.163803532719612e-05, |
|
"learning_rate": 0.00016877638159372782, |
|
"loss": 1.3974, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.204968944099379, |
|
"grad_norm": 4.0616945625515655e-05, |
|
"learning_rate": 0.00016802624743297333, |
|
"loss": 1.41, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.2173913043478262, |
|
"grad_norm": 4.708253254648298e-05, |
|
"learning_rate": 0.00016726891961063028, |
|
"loss": 1.3499, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.2298136645962732, |
|
"grad_norm": 4.392105984152295e-05, |
|
"learning_rate": 0.00016650447821284594, |
|
"loss": 1.3619, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.2422360248447206, |
|
"grad_norm": 4.838509266846813e-05, |
|
"learning_rate": 0.00016573300407801616, |
|
"loss": 1.3634, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2546583850931676, |
|
"grad_norm": 4.574718332150951e-05, |
|
"learning_rate": 0.0001649545787882369, |
|
"loss": 1.3715, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.2670807453416149, |
|
"grad_norm": 4.9299873353447765e-05, |
|
"learning_rate": 0.00016416928466067723, |
|
"loss": 1.3917, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.279503105590062, |
|
"grad_norm": 5.623263132292777e-05, |
|
"learning_rate": 0.0001633772047388742, |
|
"loss": 1.3913, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.2919254658385093, |
|
"grad_norm": 5.661992690875195e-05, |
|
"learning_rate": 0.00016257842278395127, |
|
"loss": 1.3962, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.3043478260869565, |
|
"grad_norm": 6.093809497542679e-05, |
|
"learning_rate": 0.0001617730232657606, |
|
"loss": 1.4153, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3167701863354038, |
|
"grad_norm": 6.311327160801739e-05, |
|
"learning_rate": 0.0001609610913539507, |
|
"loss": 1.4051, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.329192546583851, |
|
"grad_norm": 5.701288318959996e-05, |
|
"learning_rate": 0.00016014271290895965, |
|
"loss": 1.3797, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.341614906832298, |
|
"grad_norm": 6.887744530104101e-05, |
|
"learning_rate": 0.00015931797447293552, |
|
"loss": 1.4091, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.3540372670807455, |
|
"grad_norm": 6.53747993055731e-05, |
|
"learning_rate": 0.00015848696326058498, |
|
"loss": 1.3931, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.3664596273291925, |
|
"grad_norm": 7.101244409568608e-05, |
|
"learning_rate": 0.00015764976714995009, |
|
"loss": 1.4177, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3788819875776397, |
|
"grad_norm": 7.210439071059227e-05, |
|
"learning_rate": 0.00015680647467311557, |
|
"loss": 1.4148, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.391304347826087, |
|
"grad_norm": 6.667665729764849e-05, |
|
"learning_rate": 0.0001559571750068468, |
|
"loss": 1.382, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.4037267080745341, |
|
"grad_norm": 7.299148273887113e-05, |
|
"learning_rate": 0.00015510195796315915, |
|
"loss": 1.4309, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.4161490683229814, |
|
"grad_norm": 7.622403063578531e-05, |
|
"learning_rate": 0.00015424091397982082, |
|
"loss": 1.425, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 8.405456901527941e-05, |
|
"learning_rate": 0.00015337413411078914, |
|
"loss": 1.4566, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4409937888198758, |
|
"grad_norm": 7.596038631163538e-05, |
|
"learning_rate": 0.00015250171001658172, |
|
"loss": 1.4299, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.453416149068323, |
|
"grad_norm": 7.857976015657187e-05, |
|
"learning_rate": 0.00015162373395458345, |
|
"loss": 1.4614, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.4658385093167703, |
|
"grad_norm": 7.91482743807137e-05, |
|
"learning_rate": 0.00015074029876929057, |
|
"loss": 1.4776, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.4782608695652173, |
|
"grad_norm": 8.164716564351693e-05, |
|
"learning_rate": 0.00014985149788249248, |
|
"loss": 1.4841, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.4906832298136645, |
|
"grad_norm": 7.853667193558067e-05, |
|
"learning_rate": 0.0001489574252833924, |
|
"loss": 1.4848, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.5031055900621118, |
|
"grad_norm": 7.80523187131621e-05, |
|
"learning_rate": 0.00014805817551866838, |
|
"loss": 1.4909, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.515527950310559, |
|
"grad_norm": 7.651840132893994e-05, |
|
"learning_rate": 0.00014715384368247506, |
|
"loss": 1.518, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.5279503105590062, |
|
"grad_norm": 7.088932034093887e-05, |
|
"learning_rate": 0.0001462445254063876, |
|
"loss": 1.5447, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.5403726708074534, |
|
"grad_norm": 6.846669566584751e-05, |
|
"learning_rate": 0.00014533031684928874, |
|
"loss": 1.57, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.5527950310559007, |
|
"grad_norm": 6.54789837426506e-05, |
|
"learning_rate": 0.00014441131468720037, |
|
"loss": 1.5749, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5652173913043477, |
|
"grad_norm": 6.425156607292593e-05, |
|
"learning_rate": 0.00014348761610305993, |
|
"loss": 1.6024, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.5776397515527951, |
|
"grad_norm": 5.851482637808658e-05, |
|
"learning_rate": 0.00014255931877644372, |
|
"loss": 1.6127, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.5900621118012421, |
|
"grad_norm": 6.256994674913585e-05, |
|
"learning_rate": 0.00014162652087323734, |
|
"loss": 1.6352, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.6024844720496896, |
|
"grad_norm": 5.2070794481551275e-05, |
|
"learning_rate": 0.00014068932103525472, |
|
"loss": 1.6374, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.6149068322981366, |
|
"grad_norm": 4.993085531168617e-05, |
|
"learning_rate": 0.00013974781836980713, |
|
"loss": 1.6439, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6273291925465838, |
|
"grad_norm": 4.7208366595441476e-05, |
|
"learning_rate": 0.0001388021124392225, |
|
"loss": 1.6509, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.639751552795031, |
|
"grad_norm": 4.80003327538725e-05, |
|
"learning_rate": 0.00013785230325031719, |
|
"loss": 1.674, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.6521739130434783, |
|
"grad_norm": 4.6028744691284373e-05, |
|
"learning_rate": 0.00013689849124382005, |
|
"loss": 1.69, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.6645962732919255, |
|
"grad_norm": 4.340623490861617e-05, |
|
"learning_rate": 0.00013594077728375128, |
|
"loss": 1.6875, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.6770186335403725, |
|
"grad_norm": 4.233741856296547e-05, |
|
"learning_rate": 0.00013497926264675612, |
|
"loss": 1.7301, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.68944099378882, |
|
"grad_norm": 4.126655767322518e-05, |
|
"learning_rate": 0.000134014049011395, |
|
"loss": 1.7203, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.701863354037267, |
|
"grad_norm": 3.361159542691894e-05, |
|
"learning_rate": 0.00013304523844739124, |
|
"loss": 1.7313, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 4.023270957986824e-05, |
|
"learning_rate": 0.00013207293340483726, |
|
"loss": 1.7382, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.7267080745341614, |
|
"grad_norm": 3.606557584134862e-05, |
|
"learning_rate": 0.0001310972367033609, |
|
"loss": 1.7184, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 3.40029873768799e-05, |
|
"learning_rate": 0.00013011825152125204, |
|
"loss": 1.7307, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7515527950310559, |
|
"grad_norm": 3.170113996020518e-05, |
|
"learning_rate": 0.00012913608138455203, |
|
"loss": 1.7444, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.763975155279503, |
|
"grad_norm": 3.378765541128814e-05, |
|
"learning_rate": 0.00012815083015610572, |
|
"loss": 1.7305, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.7763975155279503, |
|
"grad_norm": 3.630015271482989e-05, |
|
"learning_rate": 0.00012716260202457827, |
|
"loss": 1.7104, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.7888198757763976, |
|
"grad_norm": 3.279558950453065e-05, |
|
"learning_rate": 0.00012617150149343744, |
|
"loss": 1.7303, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.8012422360248448, |
|
"grad_norm": 3.356958768563345e-05, |
|
"learning_rate": 0.0001251776333699023, |
|
"loss": 1.7274, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.8136645962732918, |
|
"grad_norm": 3.0309342037071474e-05, |
|
"learning_rate": 0.00012418110275386028, |
|
"loss": 1.7267, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.8260869565217392, |
|
"grad_norm": 3.178627594024874e-05, |
|
"learning_rate": 0.00012318201502675285, |
|
"loss": 1.7517, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.8385093167701863, |
|
"grad_norm": 3.2376421586377546e-05, |
|
"learning_rate": 0.0001221804758404317, |
|
"loss": 1.7193, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.8509316770186337, |
|
"grad_norm": 3.2105788704939187e-05, |
|
"learning_rate": 0.0001211765911059863, |
|
"loss": 1.7256, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.8633540372670807, |
|
"grad_norm": 3.046994243049994e-05, |
|
"learning_rate": 0.00012017046698254375, |
|
"loss": 1.756, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.875776397515528, |
|
"grad_norm": 3.129871765850112e-05, |
|
"learning_rate": 0.00011916220986604296, |
|
"loss": 1.7421, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.8881987577639752, |
|
"grad_norm": 2.9315173378563486e-05, |
|
"learning_rate": 0.00011815192637798314, |
|
"loss": 1.7563, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.9006211180124224, |
|
"grad_norm": 2.882684020732995e-05, |
|
"learning_rate": 0.00011713972335414895, |
|
"loss": 1.7561, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.9130434782608696, |
|
"grad_norm": 3.170946365571581e-05, |
|
"learning_rate": 0.00011612570783331279, |
|
"loss": 1.7426, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.9254658385093166, |
|
"grad_norm": 2.7420563128544018e-05, |
|
"learning_rate": 0.00011510998704591542, |
|
"loss": 1.7876, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.937888198757764, |
|
"grad_norm": 3.0591610993724316e-05, |
|
"learning_rate": 0.00011409266840272685, |
|
"loss": 1.7634, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.950310559006211, |
|
"grad_norm": 2.7189913453185e-05, |
|
"learning_rate": 0.00011307385948348762, |
|
"loss": 1.8024, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.9627329192546585, |
|
"grad_norm": 3.112399281235412e-05, |
|
"learning_rate": 0.0001120536680255323, |
|
"loss": 1.7523, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.9751552795031055, |
|
"grad_norm": 2.740700074355118e-05, |
|
"learning_rate": 0.0001110322019123969, |
|
"loss": 1.7581, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.9875776397515528, |
|
"grad_norm": 2.866005706891883e-05, |
|
"learning_rate": 0.00011000956916240985, |
|
"loss": 1.7639, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.0002034466597251594, |
|
"learning_rate": 0.00010898587791726955, |
|
"loss": 1.7987, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.012422360248447, |
|
"grad_norm": 3.0956751288613304e-05, |
|
"learning_rate": 0.00010796123643060847, |
|
"loss": 1.7031, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.0248447204968945, |
|
"grad_norm": 5.328710176399909e-05, |
|
"learning_rate": 0.00010693575305654558, |
|
"loss": 1.6955, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.0372670807453415, |
|
"grad_norm": 5.25999566889368e-05, |
|
"learning_rate": 0.00010590953623822794, |
|
"loss": 1.7009, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.049689440993789, |
|
"grad_norm": 3.9534101233584806e-05, |
|
"learning_rate": 0.00010488269449636316, |
|
"loss": 1.7035, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.062111801242236, |
|
"grad_norm": 2.8875314455945045e-05, |
|
"learning_rate": 0.00010385533641774354, |
|
"loss": 1.7026, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.0745341614906834, |
|
"grad_norm": 2.6082469048560597e-05, |
|
"learning_rate": 0.00010282757064376297, |
|
"loss": 1.6991, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.0869565217391304, |
|
"grad_norm": 2.0509953174041584e-05, |
|
"learning_rate": 0.00010179950585892855, |
|
"loss": 1.6917, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.099378881987578, |
|
"grad_norm": 2.403454709565267e-05, |
|
"learning_rate": 0.00010077125077936727, |
|
"loss": 1.7032, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.111801242236025, |
|
"grad_norm": 2.1703061065636575e-05, |
|
"learning_rate": 9.97429141413294e-05, |
|
"loss": 1.7203, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.124223602484472, |
|
"grad_norm": 1.91131402971223e-05, |
|
"learning_rate": 9.871460468969001e-05, |
|
"loss": 1.7406, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.1366459627329193, |
|
"grad_norm": 1.4955992810428143e-05, |
|
"learning_rate": 9.768643116644909e-05, |
|
"loss": 1.7497, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.1490683229813663, |
|
"grad_norm": 1.873956534836907e-05, |
|
"learning_rate": 9.665850229923258e-05, |
|
"loss": 1.7676, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.1614906832298137, |
|
"grad_norm": 2.206020690209698e-05, |
|
"learning_rate": 9.56309267897943e-05, |
|
"loss": 1.775, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.1739130434782608, |
|
"grad_norm": 2.225383468612563e-05, |
|
"learning_rate": 9.460381330252127e-05, |
|
"loss": 1.7803, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.186335403726708, |
|
"grad_norm": 2.4063519958872348e-05, |
|
"learning_rate": 9.357727045294228e-05, |
|
"loss": 1.762, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.198757763975155, |
|
"grad_norm": 2.5652494514361024e-05, |
|
"learning_rate": 9.255140679624219e-05, |
|
"loss": 1.7849, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.2111801242236027, |
|
"grad_norm": 2.7277781555312686e-05, |
|
"learning_rate": 9.152633081578243e-05, |
|
"loss": 1.7558, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.2236024844720497, |
|
"grad_norm": 2.8397616915754043e-05, |
|
"learning_rate": 9.050215091162884e-05, |
|
"loss": 1.7711, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.2360248447204967, |
|
"grad_norm": 2.8468950404203497e-05, |
|
"learning_rate": 8.947897538908882e-05, |
|
"loss": 1.7623, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.248447204968944, |
|
"grad_norm": 2.824833063641563e-05, |
|
"learning_rate": 8.845691244725802e-05, |
|
"loss": 1.7742, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.260869565217391, |
|
"grad_norm": 2.4112812752719037e-05, |
|
"learning_rate": 8.743607016757858e-05, |
|
"loss": 1.7931, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.2732919254658386, |
|
"grad_norm": 2.2189098672242835e-05, |
|
"learning_rate": 8.641655650240971e-05, |
|
"loss": 1.7806, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 2.327991751371883e-05, |
|
"learning_rate": 8.539847926361173e-05, |
|
"loss": 1.763, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.298136645962733, |
|
"grad_norm": 2.3889626390882768e-05, |
|
"learning_rate": 8.438194611114547e-05, |
|
"loss": 1.7523, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.31055900621118, |
|
"grad_norm": 2.383732862654142e-05, |
|
"learning_rate": 8.336706454168701e-05, |
|
"loss": 1.7727, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.3229813664596275, |
|
"grad_norm": 1.8651016944204457e-05, |
|
"learning_rate": 8.235394187726046e-05, |
|
"loss": 1.7714, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.3354037267080745, |
|
"grad_norm": 2.1619023755192757e-05, |
|
"learning_rate": 8.134268525388862e-05, |
|
"loss": 1.7656, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.3478260869565215, |
|
"grad_norm": 1.8270022337674163e-05, |
|
"learning_rate": 8.033340161026351e-05, |
|
"loss": 1.7535, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.360248447204969, |
|
"grad_norm": 1.9947459804825485e-05, |
|
"learning_rate": 7.932619767643801e-05, |
|
"loss": 1.7295, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.372670807453416, |
|
"grad_norm": 1.5592337149428204e-05, |
|
"learning_rate": 7.832117996253898e-05, |
|
"loss": 1.7559, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.3850931677018634, |
|
"grad_norm": 1.71353585756151e-05, |
|
"learning_rate": 7.731845474750438e-05, |
|
"loss": 1.7417, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.3975155279503104, |
|
"grad_norm": 1.2591329323186073e-05, |
|
"learning_rate": 7.631812806784406e-05, |
|
"loss": 1.7626, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.409937888198758, |
|
"grad_norm": 1.6798217984614894e-05, |
|
"learning_rate": 7.532030570642699e-05, |
|
"loss": 1.7447, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.422360248447205, |
|
"grad_norm": 1.387998418067582e-05, |
|
"learning_rate": 7.43250931812945e-05, |
|
"loss": 1.7625, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.4347826086956523, |
|
"grad_norm": 1.4774514966120478e-05, |
|
"learning_rate": 7.333259573450222e-05, |
|
"loss": 1.7383, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.4472049689440993, |
|
"grad_norm": 1.4047771401237696e-05, |
|
"learning_rate": 7.234291832099078e-05, |
|
"loss": 1.7617, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.4596273291925463, |
|
"grad_norm": 1.4316069609776605e-05, |
|
"learning_rate": 7.135616559748699e-05, |
|
"loss": 1.7396, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.472049689440994, |
|
"grad_norm": 1.4282129995990545e-05, |
|
"learning_rate": 7.037244191143661e-05, |
|
"loss": 1.7325, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.4844720496894412, |
|
"grad_norm": 1.467942274757661e-05, |
|
"learning_rate": 6.939185128996976e-05, |
|
"loss": 1.7467, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.4968944099378882, |
|
"grad_norm": 1.3236918675829656e-05, |
|
"learning_rate": 6.841449742890032e-05, |
|
"loss": 1.7485, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.5093167701863353, |
|
"grad_norm": 1.197248184325872e-05, |
|
"learning_rate": 6.744048368175999e-05, |
|
"loss": 1.7751, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.5217391304347827, |
|
"grad_norm": 1.2137723388150334e-05, |
|
"learning_rate": 6.646991304886922e-05, |
|
"loss": 1.7267, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.5341614906832297, |
|
"grad_norm": 1.3516695616999641e-05, |
|
"learning_rate": 6.550288816644483e-05, |
|
"loss": 1.7591, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.546583850931677, |
|
"grad_norm": 1.1421960152802058e-05, |
|
"learning_rate": 6.453951129574644e-05, |
|
"loss": 1.7509, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.559006211180124, |
|
"grad_norm": 1.3100820979161654e-05, |
|
"learning_rate": 6.357988431226261e-05, |
|
"loss": 1.7295, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 1.404106842528563e-05, |
|
"learning_rate": 6.262410869493757e-05, |
|
"loss": 1.7288, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.5838509316770186, |
|
"grad_norm": 1.2432222320057917e-05, |
|
"learning_rate": 6.167228551544007e-05, |
|
"loss": 1.7341, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.596273291925466, |
|
"grad_norm": 1.2816779417335056e-05, |
|
"learning_rate": 6.0724515427475105e-05, |
|
"loss": 1.7449, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 1.3977178241475485e-05, |
|
"learning_rate": 5.978089865614016e-05, |
|
"loss": 1.7453, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.62111801242236, |
|
"grad_norm": 1.2115887329855468e-05, |
|
"learning_rate": 5.884153498732642e-05, |
|
"loss": 1.7391, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.6335403726708075, |
|
"grad_norm": 1.1972248103120364e-05, |
|
"learning_rate": 5.790652375716652e-05, |
|
"loss": 1.7307, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.6459627329192545, |
|
"grad_norm": 1.3673886314791162e-05, |
|
"learning_rate": 5.697596384153009e-05, |
|
"loss": 1.7524, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.658385093167702, |
|
"grad_norm": 1.2259893992450088e-05, |
|
"learning_rate": 5.60499536455677e-05, |
|
"loss": 1.7492, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.670807453416149, |
|
"grad_norm": 1.3245457921584602e-05, |
|
"learning_rate": 5.5128591093304726e-05, |
|
"loss": 1.7268, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.683229813664596, |
|
"grad_norm": 1.2506958228186704e-05, |
|
"learning_rate": 5.4211973617285873e-05, |
|
"loss": 1.7168, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.6956521739130435, |
|
"grad_norm": 1.2610958037839737e-05, |
|
"learning_rate": 5.3300198148272185e-05, |
|
"loss": 1.7381, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.708074534161491, |
|
"grad_norm": 9.776917977433186e-06, |
|
"learning_rate": 5.239336110499053e-05, |
|
"loss": 1.747, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.720496894409938, |
|
"grad_norm": 1.3282600775710307e-05, |
|
"learning_rate": 5.149155838393737e-05, |
|
"loss": 1.725, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.732919254658385, |
|
"grad_norm": 1.2231404070917051e-05, |
|
"learning_rate": 5.059488534923831e-05, |
|
"loss": 1.7531, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.7453416149068324, |
|
"grad_norm": 0.0001226964232046157, |
|
"learning_rate": 4.9703436822562986e-05, |
|
"loss": 1.7278, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.7577639751552794, |
|
"grad_norm": 1.286713631998282e-05, |
|
"learning_rate": 4.881730707309821e-05, |
|
"loss": 1.7377, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.770186335403727, |
|
"grad_norm": 1.2882816008641385e-05, |
|
"learning_rate": 4.7936589807578946e-05, |
|
"loss": 1.7332, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.782608695652174, |
|
"grad_norm": 1.1893400369444862e-05, |
|
"learning_rate": 4.706137816037913e-05, |
|
"loss": 1.7399, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.795031055900621, |
|
"grad_norm": 1.2113765478716232e-05, |
|
"learning_rate": 4.6191764683662744e-05, |
|
"loss": 1.7265, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.8074534161490683, |
|
"grad_norm": 1.5664125385228544e-05, |
|
"learning_rate": 4.532784133759663e-05, |
|
"loss": 1.7195, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.8198757763975157, |
|
"grad_norm": 1.1506239388836548e-05, |
|
"learning_rate": 4.4469699480625884e-05, |
|
"loss": 1.7231, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.8322981366459627, |
|
"grad_norm": 1.1975148481724318e-05, |
|
"learning_rate": 4.361742985981278e-05, |
|
"loss": 1.7288, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.8447204968944098, |
|
"grad_norm": 9.909866093948949e-06, |
|
"learning_rate": 4.2771122601240576e-05, |
|
"loss": 1.728, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 1.0602449947327841e-05, |
|
"learning_rate": 4.193086720048249e-05, |
|
"loss": 1.7252, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.869565217391304, |
|
"grad_norm": 1.3800339729641564e-05, |
|
"learning_rate": 4.1096752513138216e-05, |
|
"loss": 1.7296, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.8819875776397517, |
|
"grad_norm": 1.312944368692115e-05, |
|
"learning_rate": 4.026886674543713e-05, |
|
"loss": 1.7188, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.8944099378881987, |
|
"grad_norm": 1.0245595149172004e-05, |
|
"learning_rate": 3.944729744491078e-05, |
|
"loss": 1.7235, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.906832298136646, |
|
"grad_norm": 1.5070048902998678e-05, |
|
"learning_rate": 3.863213149113498e-05, |
|
"loss": 1.7093, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 2.919254658385093, |
|
"grad_norm": 1.2049433280481026e-05, |
|
"learning_rate": 3.782345508654235e-05, |
|
"loss": 2.3222, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.9316770186335406, |
|
"grad_norm": 1.2924440852657426e-05, |
|
"learning_rate": 3.702135374730655e-05, |
|
"loss": 1.737, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 2.9440993788819876, |
|
"grad_norm": 1.540554512757808e-05, |
|
"learning_rate": 3.622591229429911e-05, |
|
"loss": 1.7166, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 2.9565217391304346, |
|
"grad_norm": 1.3177206710679457e-05, |
|
"learning_rate": 3.543721484411976e-05, |
|
"loss": 1.6965, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 2.968944099378882, |
|
"grad_norm": 1.6786321793915704e-05, |
|
"learning_rate": 3.465534480020124e-05, |
|
"loss": 1.6752, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 2.981366459627329, |
|
"grad_norm": 1.813886410673149e-05, |
|
"learning_rate": 3.3880384843989535e-05, |
|
"loss": 1.6826, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.9937888198757765, |
|
"grad_norm": 1.3371391105465591e-05, |
|
"learning_rate": 3.311241692620045e-05, |
|
"loss": 1.7133, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.0062111801242235, |
|
"grad_norm": 1.4623566130467225e-05, |
|
"learning_rate": 3.2351522258153345e-05, |
|
"loss": 1.6685, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.018633540372671, |
|
"grad_norm": 1.2453562703740317e-05, |
|
"learning_rate": 3.1597781303183516e-05, |
|
"loss": 1.6891, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.031055900621118, |
|
"grad_norm": 1.1916249604837503e-05, |
|
"learning_rate": 3.085127376813285e-05, |
|
"loss": 1.6965, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.0434782608695654, |
|
"grad_norm": 9.490851880400442e-06, |
|
"learning_rate": 3.011207859492131e-05, |
|
"loss": 1.7216, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.0559006211180124, |
|
"grad_norm": 1.1531004020071123e-05, |
|
"learning_rate": 2.9380273952198955e-05, |
|
"loss": 1.701, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.0683229813664594, |
|
"grad_norm": 1.3211531040724367e-05, |
|
"learning_rate": 2.8655937227079466e-05, |
|
"loss": 1.7105, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.080745341614907, |
|
"grad_norm": 1.4893200386723038e-05, |
|
"learning_rate": 2.7939145016956845e-05, |
|
"loss": 1.7059, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.093167701863354, |
|
"grad_norm": 1.3906304047850426e-05, |
|
"learning_rate": 2.7229973121405295e-05, |
|
"loss": 1.7289, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.1055900621118013, |
|
"grad_norm": 1.3707428479392547e-05, |
|
"learning_rate": 2.6528496534163538e-05, |
|
"loss": 1.7354, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.1180124223602483, |
|
"grad_norm": 1.7548509276821278e-05, |
|
"learning_rate": 2.5834789435204243e-05, |
|
"loss": 1.7368, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.130434782608696, |
|
"grad_norm": 1.4127151189313736e-05, |
|
"learning_rate": 2.514892518288988e-05, |
|
"loss": 1.7412, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.142857142857143, |
|
"grad_norm": 1.5105492821021471e-05, |
|
"learning_rate": 2.4470976306214978e-05, |
|
"loss": 1.7338, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 3.1552795031055902, |
|
"grad_norm": 1.5982424883986823e-05, |
|
"learning_rate": 2.3801014497136254e-05, |
|
"loss": 1.7362, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 3.1677018633540373, |
|
"grad_norm": 1.56604601215804e-05, |
|
"learning_rate": 2.3139110602991633e-05, |
|
"loss": 1.7507, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.1801242236024843, |
|
"grad_norm": 1.4908397133694962e-05, |
|
"learning_rate": 2.2485334619007835e-05, |
|
"loss": 1.73, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 3.1925465838509317, |
|
"grad_norm": 1.5296851415769197e-05, |
|
"learning_rate": 2.1839755680898853e-05, |
|
"loss": 1.7442, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 3.2049689440993787, |
|
"grad_norm": 1.5975452697603032e-05, |
|
"learning_rate": 2.1202442057554785e-05, |
|
"loss": 1.7497, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 3.217391304347826, |
|
"grad_norm": 1.6455916920676827e-05, |
|
"learning_rate": 2.0573461143822524e-05, |
|
"loss": 1.7388, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 3.229813664596273, |
|
"grad_norm": 1.3599975318356883e-05, |
|
"learning_rate": 1.9952879453378938e-05, |
|
"loss": 1.7463, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.2422360248447206, |
|
"grad_norm": 1.7985181330004707e-05, |
|
"learning_rate": 1.9340762611697093e-05, |
|
"loss": 1.7265, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.2546583850931676, |
|
"grad_norm": 1.5109700143511873e-05, |
|
"learning_rate": 1.873717534910653e-05, |
|
"loss": 1.7349, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 3.267080745341615, |
|
"grad_norm": 1.492269802838564e-05, |
|
"learning_rate": 1.8142181493947997e-05, |
|
"loss": 1.7392, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 3.279503105590062, |
|
"grad_norm": 1.1795749742304906e-05, |
|
"learning_rate": 1.7555843965823992e-05, |
|
"loss": 1.7598, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 3.291925465838509, |
|
"grad_norm": 1.5022421393950935e-05, |
|
"learning_rate": 1.697822476894477e-05, |
|
"loss": 1.7436, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.3043478260869565, |
|
"grad_norm": 1.4739445759914815e-05, |
|
"learning_rate": 1.640938498557175e-05, |
|
"loss": 1.751, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 3.3167701863354035, |
|
"grad_norm": 1.211661674460629e-05, |
|
"learning_rate": 1.5849384769558195e-05, |
|
"loss": 1.7634, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 3.329192546583851, |
|
"grad_norm": 1.4754497897229157e-05, |
|
"learning_rate": 1.5298283339987762e-05, |
|
"loss": 1.7481, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 3.341614906832298, |
|
"grad_norm": 1.5657935364288278e-05, |
|
"learning_rate": 1.475613897491248e-05, |
|
"loss": 1.7262, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 3.3540372670807455, |
|
"grad_norm": 1.1670357707771473e-05, |
|
"learning_rate": 1.4223009005189792e-05, |
|
"loss": 1.7713, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.3664596273291925, |
|
"grad_norm": 1.154390974988928e-05, |
|
"learning_rate": 1.3698949808419959e-05, |
|
"loss": 1.7564, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 3.37888198757764, |
|
"grad_norm": 1.1009120498783886e-05, |
|
"learning_rate": 1.3184016802984212e-05, |
|
"loss": 1.7553, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 3.391304347826087, |
|
"grad_norm": 1.2527529179351404e-05, |
|
"learning_rate": 1.2678264442184362e-05, |
|
"loss": 1.7462, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 3.403726708074534, |
|
"grad_norm": 1.1460819223430008e-05, |
|
"learning_rate": 1.2181746208484424e-05, |
|
"loss": 1.7748, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 3.4161490683229814, |
|
"grad_norm": 1.1391004591132514e-05, |
|
"learning_rate": 1.1694514607854968e-05, |
|
"loss": 1.7565, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.4285714285714284, |
|
"grad_norm": 1.1396230547688901e-05, |
|
"learning_rate": 1.121662116422072e-05, |
|
"loss": 1.7622, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 3.440993788819876, |
|
"grad_norm": 1.2809962754545268e-05, |
|
"learning_rate": 1.0748116414011888e-05, |
|
"loss": 1.7584, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 3.453416149068323, |
|
"grad_norm": 1.191952560475329e-05, |
|
"learning_rate": 1.0289049900820148e-05, |
|
"loss": 1.747, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 3.4658385093167703, |
|
"grad_norm": 1.0926189133897424e-05, |
|
"learning_rate": 9.839470170159492e-06, |
|
"loss": 1.7754, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 3.4782608695652173, |
|
"grad_norm": 1.4316407032310963e-05, |
|
"learning_rate": 9.399424764332432e-06, |
|
"loss": 1.7706, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.4906832298136647, |
|
"grad_norm": 1.1862053725053556e-05, |
|
"learning_rate": 8.968960217402745e-06, |
|
"loss": 1.7543, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 3.5031055900621118, |
|
"grad_norm": 1.197822257381631e-05, |
|
"learning_rate": 8.548122050274366e-06, |
|
"loss": 1.7556, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 3.5155279503105588, |
|
"grad_norm": 1.1042899132007733e-05, |
|
"learning_rate": 8.136954765877748e-06, |
|
"loss": 1.7768, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 3.527950310559006, |
|
"grad_norm": 1.2004837117274292e-05, |
|
"learning_rate": 7.73550184446371e-06, |
|
"loss": 1.7705, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 3.5403726708074537, |
|
"grad_norm": 1.1749471013899893e-05, |
|
"learning_rate": 7.3438057390054715e-06, |
|
"loss": 1.7738, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.5527950310559007, |
|
"grad_norm": 1.1303432984277606e-05, |
|
"learning_rate": 6.9619078707093166e-06, |
|
"loss": 1.7754, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 3.5652173913043477, |
|
"grad_norm": 1.2692734344454948e-05, |
|
"learning_rate": 6.589848624634398e-06, |
|
"loss": 1.7554, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 3.577639751552795, |
|
"grad_norm": 1.2839472219638992e-05, |
|
"learning_rate": 6.227667345422061e-06, |
|
"loss": 1.7717, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 3.590062111801242, |
|
"grad_norm": 1.2379147847241256e-05, |
|
"learning_rate": 5.875402333135183e-06, |
|
"loss": 1.768, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 3.6024844720496896, |
|
"grad_norm": 1.4026823919266462e-05, |
|
"learning_rate": 5.533090839208133e-06, |
|
"loss": 1.7482, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.6149068322981366, |
|
"grad_norm": 1.5235823411785532e-05, |
|
"learning_rate": 5.200769062507305e-06, |
|
"loss": 1.7578, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 3.6273291925465836, |
|
"grad_norm": 1.2792443158105016e-05, |
|
"learning_rate": 4.8784721455032875e-06, |
|
"loss": 1.7746, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 3.639751552795031, |
|
"grad_norm": 1.2493183930928353e-05, |
|
"learning_rate": 4.566234170554639e-06, |
|
"loss": 1.7623, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 3.6521739130434785, |
|
"grad_norm": 1.148242245108122e-05, |
|
"learning_rate": 4.264088156303536e-06, |
|
"loss": 1.7659, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 3.6645962732919255, |
|
"grad_norm": 1.2335087376413867e-05, |
|
"learning_rate": 3.97206605418432e-06, |
|
"loss": 1.7659, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.6770186335403725, |
|
"grad_norm": 1.3579897313320544e-05, |
|
"learning_rate": 3.6901987450445707e-06, |
|
"loss": 1.7637, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 3.68944099378882, |
|
"grad_norm": 1.1136858120153192e-05, |
|
"learning_rate": 3.418516035879571e-06, |
|
"loss": 1.8644, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 3.701863354037267, |
|
"grad_norm": 1.2910018995171413e-05, |
|
"learning_rate": 3.1570466566801737e-06, |
|
"loss": 1.7642, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 3.7142857142857144, |
|
"grad_norm": 1.555276867293287e-05, |
|
"learning_rate": 2.905818257394799e-06, |
|
"loss": 1.7472, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 3.7267080745341614, |
|
"grad_norm": 1.4716911209688988e-05, |
|
"learning_rate": 2.66485740500535e-06, |
|
"loss": 1.7653, |
|
"step": 600 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 644, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.698382123091886e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|