{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1670,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0005988023952095808, "grad_norm": 5.398237705230713, "learning_rate": 1.1976047904191619e-06, "loss": 5.2288, "step": 1},
    {"epoch": 0.0029940119760479044, "grad_norm": 4.768432140350342, "learning_rate": 5.9880239520958085e-06, "loss": 5.1682, "step": 5},
    {"epoch": 0.005988023952095809, "grad_norm": 4.144229412078857, "learning_rate": 1.1976047904191617e-05, "loss": 5.1486, "step": 10},
    {"epoch": 0.008982035928143712, "grad_norm": 3.3602042198181152, "learning_rate": 1.7964071856287426e-05, "loss": 5.0451, "step": 15},
    {"epoch": 0.011976047904191617, "grad_norm": 3.0493969917297363, "learning_rate": 2.3952095808383234e-05, "loss": 4.9569, "step": 20},
    {"epoch": 0.014970059880239521, "grad_norm": 2.564634323120117, "learning_rate": 2.994011976047904e-05, "loss": 4.8593, "step": 25},
    {"epoch": 0.017964071856287425, "grad_norm": 2.1028244495391846, "learning_rate": 3.592814371257485e-05, "loss": 4.7325, "step": 30},
    {"epoch": 0.020958083832335328, "grad_norm": 2.0943355560302734, "learning_rate": 4.191616766467066e-05, "loss": 4.6032, "step": 35},
    {"epoch": 0.023952095808383235, "grad_norm": 2.1831929683685303, "learning_rate": 4.790419161676647e-05, "loss": 4.5291, "step": 40},
    {"epoch": 0.02694610778443114, "grad_norm": 2.2486143112182617, "learning_rate": 5.389221556886228e-05, "loss": 4.4485, "step": 45},
    {"epoch": 0.029940119760479042, "grad_norm": 1.977081537246704, "learning_rate": 5.988023952095808e-05, "loss": 4.3569, "step": 50},
    {"epoch": 0.03293413173652695, "grad_norm": 2.1440491676330566, "learning_rate": 6.58682634730539e-05, "loss": 4.2673, "step": 55},
    {"epoch": 0.03592814371257485, "grad_norm": 1.845775842666626, "learning_rate": 7.18562874251497e-05, "loss": 4.1614, "step": 60},
    {"epoch": 0.038922155688622756, "grad_norm": 2.0755035877227783, "learning_rate": 7.784431137724552e-05, "loss": 4.1784, "step": 65},
    {"epoch": 0.041916167664670656, "grad_norm": 2.0788209438323975, "learning_rate": 8.383233532934131e-05, "loss": 4.0587, "step": 70},
    {"epoch": 0.04491017964071856, "grad_norm": 2.0177366733551025, "learning_rate": 8.982035928143712e-05, "loss": 4.0414, "step": 75},
    {"epoch": 0.04790419161676647, "grad_norm": 1.9424874782562256, "learning_rate": 9.580838323353294e-05, "loss": 3.9817, "step": 80},
    {"epoch": 0.05089820359281437, "grad_norm": 2.1245999336242676, "learning_rate": 0.00010179640718562875, "loss": 3.8902, "step": 85},
    {"epoch": 0.05389221556886228, "grad_norm": 1.7750043869018555, "learning_rate": 0.00010778443113772456, "loss": 3.8991, "step": 90},
    {"epoch": 0.05688622754491018, "grad_norm": 1.888543725013733, "learning_rate": 0.00011377245508982037, "loss": 3.8088, "step": 95},
    {"epoch": 0.059880239520958084, "grad_norm": 1.696926236152649, "learning_rate": 0.00011976047904191617, "loss": 3.814, "step": 100},
    {"epoch": 0.06287425149700598, "grad_norm": 1.7164498567581177, "learning_rate": 0.00012574850299401196, "loss": 3.7896, "step": 105},
    {"epoch": 0.0658682634730539, "grad_norm": 1.924892544746399, "learning_rate": 0.0001317365269461078, "loss": 3.733, "step": 110},
    {"epoch": 0.0688622754491018, "grad_norm": 2.0697178840637207, "learning_rate": 0.00013772455089820359, "loss": 3.7365, "step": 115},
    {"epoch": 0.0718562874251497, "grad_norm": 1.8498303890228271, "learning_rate": 0.0001437125748502994, "loss": 3.6889, "step": 120},
    {"epoch": 0.0748502994011976, "grad_norm": 1.799263596534729, "learning_rate": 0.0001497005988023952, "loss": 3.6785, "step": 125},
    {"epoch": 0.07784431137724551, "grad_norm": 1.8041021823883057, "learning_rate": 0.00015568862275449103, "loss": 3.6193, "step": 130},
    {"epoch": 0.08083832335329341, "grad_norm": 2.032904863357544, "learning_rate": 0.00016167664670658683, "loss": 3.5986, "step": 135},
    {"epoch": 0.08383233532934131, "grad_norm": 2.044524908065796, "learning_rate": 0.00016766467065868263, "loss": 3.549, "step": 140},
    {"epoch": 0.08682634730538923, "grad_norm": 2.168320655822754, "learning_rate": 0.00017365269461077845, "loss": 3.5249, "step": 145},
    {"epoch": 0.08982035928143713, "grad_norm": 1.8781085014343262, "learning_rate": 0.00017964071856287425, "loss": 3.5475, "step": 150},
    {"epoch": 0.09281437125748503, "grad_norm": 1.6960855722427368, "learning_rate": 0.00018562874251497007, "loss": 3.5204, "step": 155},
    {"epoch": 0.09580838323353294, "grad_norm": 1.9243298768997192, "learning_rate": 0.00019161676646706587, "loss": 3.5206, "step": 160},
    {"epoch": 0.09880239520958084, "grad_norm": 2.000701904296875, "learning_rate": 0.0001976047904191617, "loss": 3.5163, "step": 165},
    {"epoch": 0.10179640718562874, "grad_norm": 1.6321606636047363, "learning_rate": 0.00019999803395762152, "loss": 3.4382, "step": 170},
    {"epoch": 0.10479041916167664, "grad_norm": 1.5973315238952637, "learning_rate": 0.00019998601953415373, "loss": 3.4415, "step": 175},
    {"epoch": 0.10778443113772455, "grad_norm": 1.6465246677398682, "learning_rate": 0.00019996308424365594, "loss": 3.4091, "step": 180},
    {"epoch": 0.11077844311377245, "grad_norm": 1.6946362257003784, "learning_rate": 0.00019992923059121106, "loss": 3.3752, "step": 185},
    {"epoch": 0.11377245508982035, "grad_norm": 1.602065086364746, "learning_rate": 0.0001998844622744483, "loss": 3.415, "step": 190},
    {"epoch": 0.11676646706586827, "grad_norm": 1.8944158554077148, "learning_rate": 0.0001998287841831396, "loss": 3.3535, "step": 195},
    {"epoch": 0.11976047904191617, "grad_norm": 2.0829012393951416, "learning_rate": 0.00019976220239866562, "loss": 3.3339, "step": 200},
    {"epoch": 0.12275449101796407, "grad_norm": 1.8381649255752563, "learning_rate": 0.00019968472419335106, "loss": 3.3609, "step": 205},
    {"epoch": 0.12574850299401197, "grad_norm": 1.864918828010559, "learning_rate": 0.00019959635802967087, "loss": 3.3285, "step": 210},
    {"epoch": 0.12874251497005987, "grad_norm": 2.0552523136138916, "learning_rate": 0.00019949711355932566, "loss": 3.3141, "step": 215},
    {"epoch": 0.1317365269461078, "grad_norm": 1.4688948392868042, "learning_rate": 0.0001993870016221875, "loss": 3.3043, "step": 220},
    {"epoch": 0.1347305389221557, "grad_norm": 1.7277065515518188, "learning_rate": 0.000199266034245116, "loss": 3.3307, "step": 225},
    {"epoch": 0.1377245508982036, "grad_norm": 1.6926337480545044, "learning_rate": 0.0001991342246406448, "loss": 3.302, "step": 230},
    {"epoch": 0.1407185628742515, "grad_norm": 1.7383888959884644, "learning_rate": 0.00019899158720553824, "loss": 3.2949, "step": 235},
    {"epoch": 0.1437125748502994, "grad_norm": 1.4267903566360474, "learning_rate": 0.00019883813751921903, "loss": 3.2932, "step": 240},
    {"epoch": 0.1467065868263473, "grad_norm": 1.4597861766815186, "learning_rate": 0.00019867389234206654, "loss": 3.2629, "step": 245},
    {"epoch": 0.1497005988023952, "grad_norm": 1.6646181344985962, "learning_rate": 0.00019849886961358621, "loss": 3.2334, "step": 250},
    {"epoch": 0.15269461077844312, "grad_norm": 1.46786367893219, "learning_rate": 0.0001983130884504501, "loss": 3.2368, "step": 255},
    {"epoch": 0.15568862275449102, "grad_norm": 1.3595575094223022, "learning_rate": 0.00019811656914440885, "loss": 3.2124, "step": 260},
    {"epoch": 0.15868263473053892, "grad_norm": 1.7106595039367676, "learning_rate": 0.0001979093331600754, "loss": 3.2399, "step": 265},
    {"epoch": 0.16167664670658682, "grad_norm": 1.7180885076522827, "learning_rate": 0.0001976914031325806, "loss": 3.1848, "step": 270},
    {"epoch": 0.16467065868263472, "grad_norm": 1.5397311449050903, "learning_rate": 0.0001974628028651007, "loss": 3.1918, "step": 275},
    {"epoch": 0.16766467065868262, "grad_norm": 1.2814067602157593, "learning_rate": 0.00019722355732625774, "loss": 3.1661, "step": 280},
    {"epoch": 0.17065868263473055, "grad_norm": 1.3572536706924438, "learning_rate": 0.0001969736926473921, "loss": 3.2363, "step": 285},
    {"epoch": 0.17365269461077845, "grad_norm": 1.1298632621765137, "learning_rate": 0.0001967132361197086, "loss": 3.1441, "step": 290},
    {"epoch": 0.17664670658682635, "grad_norm": 1.3632957935333252, "learning_rate": 0.00019644221619129548, "loss": 3.1402, "step": 295},
    {"epoch": 0.17964071856287425, "grad_norm": 1.4946106672286987, "learning_rate": 0.00019616066246401717, "loss": 3.1352, "step": 300},
    {"epoch": 0.18263473053892215, "grad_norm": 1.3740743398666382, "learning_rate": 0.00019586860569028124, "loss": 3.1609, "step": 305},
    {"epoch": 0.18562874251497005, "grad_norm": 1.359784483909607, "learning_rate": 0.0001955660777696793, "loss": 3.1191, "step": 310},
    {"epoch": 0.18862275449101795, "grad_norm": 1.0663578510284424, "learning_rate": 0.00019525311174550285, "loss": 3.1508, "step": 315},
    {"epoch": 0.19161676646706588, "grad_norm": 1.2853511571884155, "learning_rate": 0.00019492974180113426, "loss": 3.1197, "step": 320},
    {"epoch": 0.19461077844311378, "grad_norm": 1.2358075380325317, "learning_rate": 0.00019459600325631303, "loss": 3.0737, "step": 325},
    {"epoch": 0.19760479041916168, "grad_norm": 1.2823344469070435, "learning_rate": 0.0001942519325632781, "loss": 3.1486, "step": 330},
    {"epoch": 0.20059880239520958, "grad_norm": 1.0905367136001587, "learning_rate": 0.00019389756730278627, "loss": 3.1254, "step": 335},
    {"epoch": 0.20359281437125748, "grad_norm": 1.1666399240493774, "learning_rate": 0.00019353294618000758, "loss": 3.0982, "step": 340},
    {"epoch": 0.20658682634730538, "grad_norm": 1.352597951889038, "learning_rate": 0.00019315810902029786, "loss": 3.1079, "step": 345},
    {"epoch": 0.20958083832335328, "grad_norm": 1.5166900157928467, "learning_rate": 0.00019277309676484858, "loss": 3.1142, "step": 350},
    {"epoch": 0.2125748502994012, "grad_norm": 1.1076050996780396, "learning_rate": 0.0001923779514662154, "loss": 3.0817, "step": 355},
    {"epoch": 0.2155688622754491, "grad_norm": 1.1077985763549805, "learning_rate": 0.00019197271628372482, "loss": 3.0804, "step": 360},
    {"epoch": 0.218562874251497, "grad_norm": 1.0608789920806885, "learning_rate": 0.00019155743547876023, "loss": 3.0823, "step": 365},
    {"epoch": 0.2215568862275449, "grad_norm": 1.2678635120391846, "learning_rate": 0.00019113215440992752, "loss": 3.0845, "step": 370},
    {"epoch": 0.2245508982035928, "grad_norm": 1.1210081577301025, "learning_rate": 0.0001906969195281007, "loss": 3.066, "step": 375},
    {"epoch": 0.2275449101796407, "grad_norm": 1.3343095779418945, "learning_rate": 0.00019025177837134858, "loss": 3.0512, "step": 380},
    {"epoch": 0.23053892215568864, "grad_norm": 1.1427243947982788, "learning_rate": 0.00018979677955974228, "loss": 3.073, "step": 385},
    {"epoch": 0.23353293413173654, "grad_norm": 1.2000013589859009, "learning_rate": 0.0001893319727900448, "loss": 3.0551, "step": 390},
    {"epoch": 0.23652694610778444, "grad_norm": 1.0942010879516602, "learning_rate": 0.0001888574088302831, "loss": 3.0414, "step": 395},
    {"epoch": 0.23952095808383234, "grad_norm": 0.9435099363327026, "learning_rate": 0.00018837313951420272, "loss": 3.0231, "step": 400},
    {"epoch": 0.24251497005988024, "grad_norm": 1.0394734144210815, "learning_rate": 0.00018787921773560657, "loss": 3.0175, "step": 405},
    {"epoch": 0.24550898203592814, "grad_norm": 0.9441319704055786, "learning_rate": 0.00018737569744257756, "loss": 3.0385, "step": 410},
    {"epoch": 0.24850299401197604, "grad_norm": 1.020687460899353, "learning_rate": 0.00018686263363158602, "loss": 3.0505, "step": 415},
    {"epoch": 0.25149700598802394, "grad_norm": 0.9998067617416382, "learning_rate": 0.0001863400823414831, "loss": 3.0358, "step": 420},
    {"epoch": 0.25449101796407186, "grad_norm": 1.0903741121292114, "learning_rate": 0.00018580810064737965, "loss": 3.0195, "step": 425},
    {"epoch": 0.25748502994011974, "grad_norm": 1.0015404224395752, "learning_rate": 0.00018526674665441257, "loss": 3.0362, "step": 430},
    {"epoch": 0.26047904191616766, "grad_norm": 0.9371241927146912, "learning_rate": 0.00018471607949139803, "loss": 3.0123, "step": 435},
    {"epoch": 0.2634730538922156, "grad_norm": 0.9184331893920898, "learning_rate": 0.00018415615930437337, "loss": 3.0075, "step": 440},
    {"epoch": 0.26646706586826346, "grad_norm": 1.0484634637832642, "learning_rate": 0.00018358704725002768, "loss": 3.0029, "step": 445},
    {"epoch": 0.2694610778443114, "grad_norm": 1.0423493385314941, "learning_rate": 0.000183008805489022, "loss": 2.9913, "step": 450},
    {"epoch": 0.27245508982035926, "grad_norm": 1.2088091373443604, "learning_rate": 0.00018242149717919993, "loss": 2.9834, "step": 455},
    {"epoch": 0.2754491017964072, "grad_norm": 0.8996514678001404, "learning_rate": 0.0001818251864686893, "loss": 2.972, "step": 460},
    {"epoch": 0.27844311377245506, "grad_norm": 1.014440894126892, "learning_rate": 0.00018121993848889552, "loss": 2.9687, "step": 465},
    {"epoch": 0.281437125748503, "grad_norm": 0.8995974063873291, "learning_rate": 0.00018060581934738784, "loss": 2.9317, "step": 470},
    {"epoch": 0.2844311377245509, "grad_norm": 1.0253899097442627, "learning_rate": 0.00017998289612067864, "loss": 2.9695, "step": 475},
    {"epoch": 0.2874251497005988, "grad_norm": 0.954319179058075, "learning_rate": 0.00017935123684689733, "loss": 2.9884, "step": 480},
    {"epoch": 0.2904191616766467, "grad_norm": 0.8713873028755188, "learning_rate": 0.00017871091051835874, "loss": 2.9689, "step": 485},
    {"epoch": 0.2934131736526946, "grad_norm": 0.9465665817260742, "learning_rate": 0.00017806198707402752, "loss": 2.9073, "step": 490},
    {"epoch": 0.2964071856287425, "grad_norm": 0.9693202972412109, "learning_rate": 0.00017740453739187922, "loss": 2.953, "step": 495},
    {"epoch": 0.2994011976047904, "grad_norm": 0.9308465719223022, "learning_rate": 0.0001767386332811587, "loss": 2.9439, "step": 500},
    {"epoch": 0.3023952095808383, "grad_norm": 0.8614184260368347, "learning_rate": 0.0001760643474745368, "loss": 2.9567, "step": 505},
    {"epoch": 0.30538922155688625, "grad_norm": 0.913870632648468, "learning_rate": 0.00017538175362016622, "loss": 2.9447, "step": 510},
    {"epoch": 0.3083832335329341, "grad_norm": 0.8058397769927979, "learning_rate": 0.00017469092627363738, "loss": 2.9677, "step": 515},
    {"epoch": 0.31137724550898205, "grad_norm": 0.7762190699577332, "learning_rate": 0.00017399194088983511, "loss": 2.9242, "step": 520},
    {"epoch": 0.3143712574850299, "grad_norm": 0.8444433808326721, "learning_rate": 0.0001732848738146973, "loss": 2.9612, "step": 525},
    {"epoch": 0.31736526946107785, "grad_norm": 0.7365703582763672, "learning_rate": 0.00017256980227687595, "loss": 2.9629, "step": 530},
    {"epoch": 0.3203592814371258, "grad_norm": 0.841096818447113, "learning_rate": 0.00017184680437930198, "loss": 2.9349, "step": 535},
    {"epoch": 0.32335329341317365, "grad_norm": 0.7778092622756958, "learning_rate": 0.00017111595909065466, "loss": 2.9147, "step": 540},
    {"epoch": 0.3263473053892216, "grad_norm": 0.8155597448348999, "learning_rate": 0.00017037734623673615, "loss": 2.9255, "step": 545},
    {"epoch": 0.32934131736526945, "grad_norm": 0.693092942237854, "learning_rate": 0.00016963104649175272, "loss": 2.902, "step": 550},
    {"epoch": 0.3323353293413174, "grad_norm": 0.8276283144950867, "learning_rate": 0.0001688771413695032, "loss": 2.9414, "step": 555},
    {"epoch": 0.33532934131736525, "grad_norm": 0.8656878471374512, "learning_rate": 0.00016811571321447566, "loss": 2.9105, "step": 560},
    {"epoch": 0.3383233532934132, "grad_norm": 0.713503897190094, "learning_rate": 0.00016734684519285344, "loss": 2.9138, "step": 565},
    {"epoch": 0.3413173652694611, "grad_norm": 0.7443877458572388, "learning_rate": 0.00016657062128343144, "loss": 2.8801, "step": 570},
    {"epoch": 0.344311377245509, "grad_norm": 0.8257678151130676, "learning_rate": 0.00016578712626844365, "loss": 2.8803, "step": 575},
    {"epoch": 0.3473053892215569, "grad_norm": 0.8291301131248474, "learning_rate": 0.00016499644572430278, "loss": 2.9076, "step": 580},
    {"epoch": 0.3502994011976048, "grad_norm": 0.738528311252594, "learning_rate": 0.0001641986660122534, "loss": 2.8978, "step": 585},
    {"epoch": 0.3532934131736527, "grad_norm": 0.7673987150192261, "learning_rate": 0.00016339387426893918, "loss": 2.9111, "step": 590},
    {"epoch": 0.3562874251497006, "grad_norm": 0.8196045756340027, "learning_rate": 0.0001625821583968855, "loss": 2.8467, "step": 595},
    {"epoch": 0.3592814371257485, "grad_norm": 0.7220514416694641, "learning_rate": 0.00016176360705489823, "loss": 2.8808, "step": 600},
    {"epoch": 0.36227544910179643, "grad_norm": 0.8424475193023682, "learning_rate": 0.00016093830964838035, "loss": 2.858, "step": 605},
    {"epoch": 0.3652694610778443, "grad_norm": 0.7498836517333984, "learning_rate": 0.00016010635631956652, "loss": 2.8987, "step": 610},
    {"epoch": 0.36826347305389223, "grad_norm": 0.7145126461982727, "learning_rate": 0.0001592678379376775, "loss": 2.8769, "step": 615},
    {"epoch": 0.3712574850299401, "grad_norm": 0.6803230047225952, "learning_rate": 0.0001584228460889949, "loss": 2.8812, "step": 620},
    {"epoch": 0.37425149700598803, "grad_norm": 0.6997132897377014, "learning_rate": 0.00015757147306685808, "loss": 2.8886, "step": 625},
    {"epoch": 0.3772455089820359, "grad_norm": 0.7119357585906982, "learning_rate": 0.00015671381186158312, "loss": 2.9011, "step": 630},
    {"epoch": 0.38023952095808383, "grad_norm": 0.8835166096687317, "learning_rate": 0.00015584995615030634, "loss": 2.8876, "step": 635},
    {"epoch": 0.38323353293413176, "grad_norm": 0.7940571904182434, "learning_rate": 0.0001549800002867524, "loss": 2.8739, "step": 640},
    {"epoch": 0.38622754491017963, "grad_norm": 0.7138169407844543, "learning_rate": 0.00015410403929092857, "loss": 2.869, "step": 645},
    {"epoch": 0.38922155688622756, "grad_norm": 0.711258053779602, "learning_rate": 0.00015322216883874643, "loss": 2.878, "step": 650},
    {"epoch": 0.39221556886227543, "grad_norm": 0.730831503868103, "learning_rate": 0.0001523344852515716, "loss": 2.8573, "step": 655},
    {"epoch": 0.39520958083832336, "grad_norm": 0.7164433598518372, "learning_rate": 0.00015144108548570322, "loss": 2.8375, "step": 660},
    {"epoch": 0.39820359281437123, "grad_norm": 0.7521623969078064, "learning_rate": 0.000150542067121784, "loss": 2.8714, "step": 665},
    {"epoch": 0.40119760479041916, "grad_norm": 0.6829126477241516, "learning_rate": 0.00014963752835414203, "loss": 2.8614, "step": 670},
    {"epoch": 0.4041916167664671, "grad_norm": 0.6588369011878967, "learning_rate": 0.00014872756798006576, "loss": 2.8498, "step": 675},
    {"epoch": 0.40718562874251496, "grad_norm": 0.670486569404602, "learning_rate": 0.00014781228538901267, "loss": 2.853, "step": 680},
    {"epoch": 0.4101796407185629, "grad_norm": 0.6251630783081055, "learning_rate": 0.00014689178055175394, "loss": 2.8709, "step": 685},
    {"epoch": 0.41317365269461076, "grad_norm": 0.6421228647232056, "learning_rate": 0.00014596615400945496, "loss": 2.8585, "step": 690},
    {"epoch": 0.4161676646706587, "grad_norm": 0.7128710746765137, "learning_rate": 0.0001450355068626939, "loss": 2.8333, "step": 695},
    {"epoch": 0.41916167664670656, "grad_norm": 0.7639265060424805, "learning_rate": 0.0001440999407604192, "loss": 2.8495, "step": 700},
    {"epoch": 0.4221556886227545, "grad_norm": 0.7937913537025452, "learning_rate": 0.00014315955788884698, "loss": 2.8667, "step": 705},
    {"epoch": 0.4251497005988024, "grad_norm": 0.7219135761260986, "learning_rate": 0.00014221446096029992, "loss": 2.8137, "step": 710},
    {"epoch": 0.4281437125748503, "grad_norm": 0.6893067359924316, "learning_rate": 0.00014126475320198843, "loss": 2.8147, "step": 715},
    {"epoch": 0.4311377245508982, "grad_norm": 0.7299759387969971, "learning_rate": 0.00014031053834473613, "loss": 2.8157, "step": 720},
    {"epoch": 0.4341317365269461, "grad_norm": 0.6203035116195679, "learning_rate": 0.00013935192061164956, "loss": 2.842, "step": 725},
    {"epoch": 0.437125748502994, "grad_norm": 0.67593914270401, "learning_rate": 0.0001383890047067348, "loss": 2.8281, "step": 730},
    {"epoch": 0.44011976047904194, "grad_norm": 0.629625678062439, "learning_rate": 0.0001374218958034612, "loss": 2.8321, "step": 735},
    {"epoch": 0.4431137724550898, "grad_norm": 0.7435698509216309, "learning_rate": 0.0001364506995332739, "loss": 2.8069, "step": 740},
    {"epoch": 0.44610778443113774, "grad_norm": 0.7014979124069214, "learning_rate": 0.00013547552197405632, "loss": 2.8271, "step": 745},
    {"epoch": 0.4491017964071856, "grad_norm": 0.6280860900878906, "learning_rate": 0.00013449646963854396, "loss": 2.7851, "step": 750},
    {"epoch": 0.45209580838323354, "grad_norm": 0.6429287195205688, "learning_rate": 0.00013351364946269072, "loss": 2.8342, "step": 755},
    {"epoch": 0.4550898203592814, "grad_norm": 0.6879150867462158, "learning_rate": 0.00013252716879398884, "loss": 2.8017, "step": 760},
    {"epoch": 0.45808383233532934, "grad_norm": 0.6519134044647217, "learning_rate": 0.00013153713537974394, "loss": 2.8272, "step": 765},
    {"epoch": 0.46107784431137727, "grad_norm": 0.7050421833992004, "learning_rate": 0.00013054365735530664, "loss": 2.8046, "step": 770},
    {"epoch": 0.46407185628742514, "grad_norm": 0.6754117012023926, "learning_rate": 0.00012954684323226136, "loss": 2.7996, "step": 775},
    {"epoch": 0.46706586826347307, "grad_norm": 0.6119791269302368, "learning_rate": 0.00012854680188657437, "loss": 2.825, "step": 780},
    {"epoch": 0.47005988023952094, "grad_norm": 0.7737504839897156, "learning_rate": 0.00012754364254670192, "loss": 2.8229, "step": 785},
    {"epoch": 0.47305389221556887, "grad_norm": 0.6356379985809326, "learning_rate": 0.00012653747478165987, "loss": 2.7901, "step": 790},
    {"epoch": 0.47604790419161674, "grad_norm": 0.679063081741333, "learning_rate": 0.0001255284084890562, "loss": 2.7847, "step": 795},
    {"epoch": 0.47904191616766467, "grad_norm": 0.6137480139732361, "learning_rate": 0.0001245165538830873, "loss": 2.7921, "step": 800},
    {"epoch": 0.4820359281437126, "grad_norm": 0.6218224167823792, "learning_rate": 0.00012350202148250037, "loss": 2.8368, "step": 805},
    {"epoch": 0.48502994011976047, "grad_norm": 0.6735543012619019, "learning_rate": 0.0001224849220985218, "loss": 2.7808, "step": 810},
    {"epoch": 0.4880239520958084, "grad_norm": 0.5928522944450378, "learning_rate": 0.00012146536682275387, "loss": 2.7922, "step": 815},
    {"epoch": 0.49101796407185627, "grad_norm": 0.640355110168457, "learning_rate": 0.00012044346701504128, "loss": 2.8388, "step": 820},
    {"epoch": 0.4940119760479042, "grad_norm": 0.5893163084983826, "learning_rate": 0.00011941933429130758, "loss": 2.8005, "step": 825},
    {"epoch": 0.49700598802395207, "grad_norm": 0.5506909489631653, "learning_rate": 0.0001183930805113643, "loss": 2.8018, "step": 830},
    {"epoch": 0.5, "grad_norm": 0.6417153477668762, "learning_rate": 0.00011736481776669306, "loss": 2.7829, "step": 835},
    {"epoch": 0.5029940119760479, "grad_norm": 0.6764044761657715, "learning_rate": 0.00011633465836820243, "loss": 2.7853, "step": 840},
    {"epoch": 0.5059880239520959, "grad_norm": 0.6103654503822327, "learning_rate": 0.00011530271483396115, "loss": 2.7922, "step": 845},
    {"epoch": 0.5089820359281437, "grad_norm": 0.6200462579727173, "learning_rate": 0.00011426909987690819, "loss": 2.798, "step": 850},
    {"epoch": 0.5119760479041916, "grad_norm": 0.556036114692688, "learning_rate": 0.00011323392639254193, "loss": 2.823, "step": 855},
    {"epoch": 0.5149700598802395, "grad_norm": 0.5561334490776062, "learning_rate": 0.00011219730744658921, "loss": 2.7946, "step": 860},
    {"epoch": 0.5179640718562875, "grad_norm": 0.6291623711585999, "learning_rate": 0.00011115935626265594, "loss": 2.769, "step": 865},
    {"epoch": 0.5209580838323353, "grad_norm": 0.622279703617096, "learning_rate": 0.00011012018620986028, "loss": 2.7905, "step": 870},
    {"epoch": 0.5239520958083832, "grad_norm": 0.5651618242263794, "learning_rate": 0.00010907991079045006, "loss": 2.7611, "step": 875},
    {"epoch": 0.5269461077844312, "grad_norm": 0.5883045792579651, "learning_rate": 0.00010803864362740562, "loss": 2.7853, "step": 880},
    {"epoch": 0.5299401197604791, "grad_norm": 0.5984832048416138, "learning_rate": 0.00010699649845202934, "loss": 2.7686, "step": 885},
    {"epoch": 0.5329341317365269, "grad_norm": 0.631034791469574, "learning_rate": 0.00010595358909152378, "loss": 2.7649, "step": 890},
    {"epoch": 0.5359281437125748, "grad_norm": 0.5855838656425476, "learning_rate": 0.00010491002945655861, "loss": 2.7972, "step": 895},
    {"epoch": 0.5389221556886228, "grad_norm": 0.644307017326355, "learning_rate": 0.00010386593352882909, "loss": 2.7894, "step": 900},
    {"epoch": 0.5419161676646707, "grad_norm": 0.6458478569984436, "learning_rate": 0.0001028214153486066, "loss": 2.7875, "step": 905},
    {"epoch": 0.5449101796407185, "grad_norm": 0.6464332342147827, "learning_rate": 0.00010177658900228249, "loss": 2.7967, "step": 910},
    {"epoch": 0.5479041916167665, "grad_norm": 0.564663827419281, "learning_rate": 0.0001007315686099072, "loss": 2.8149, "step": 915},
    {"epoch": 0.5508982035928144, "grad_norm": 0.669620931148529, "learning_rate": 9.96864683127257e-05, "loss": 2.773, "step": 920},
    {"epoch": 0.5538922155688623, "grad_norm": 0.6294126510620117, "learning_rate": 9.864140226071053e-05, "loss": 2.7909, "step": 925},
    {"epoch": 0.5568862275449101, "grad_norm": 0.5996381044387817, "learning_rate": 9.759648460009376e-05, "loss": 2.7736, "step": 930},
    {"epoch": 0.5598802395209581, "grad_norm": 0.6062589287757874, "learning_rate": 9.655182946089956e-05, "loss": 2.7693, "step": 935},
    {"epoch": 0.562874251497006, "grad_norm": 0.6130212545394897, "learning_rate": 9.550755094447848e-05, "loss": 2.7422, "step": 940},
    {"epoch": 0.5658682634730539, "grad_norm": 0.6118180155754089, "learning_rate": 9.446376311104494e-05, "loss": 2.7847, "step": 945},
    {"epoch": 0.5688622754491018, "grad_norm": 0.6395862698554993, "learning_rate": 9.342057996721894e-05, "loss": 2.7557, "step": 950},
    {"epoch": 0.5718562874251497, "grad_norm": 0.5906466841697693, "learning_rate": 9.237811545357392e-05, "loss": 2.7821, "step": 955},
    {"epoch": 0.5748502994011976, "grad_norm": 0.6405777335166931, "learning_rate": 9.133648343219168e-05, "loss": 2.7561, "step": 960},
    {"epoch": 0.5778443113772455, "grad_norm": 0.537714421749115, "learning_rate": 9.029579767422592e-05, "loss": 2.7624, "step": 965},
    {"epoch": 0.5808383233532934, "grad_norm": 0.5861092805862427, "learning_rate": 8.925617184747584e-05, "loss": 2.7885, "step": 970},
    {"epoch": 0.5838323353293413, "grad_norm": 0.5366087555885315, "learning_rate": 8.821771950397066e-05, "loss": 2.7717, "step": 975},
    {"epoch": 0.5868263473053892, "grad_norm": 0.5149749517440796, "learning_rate": 8.718055406756714e-05, "loss": 2.7722, "step": 980},
    {"epoch": 0.5898203592814372, "grad_norm": 0.6070750951766968, "learning_rate": 8.614478882156103e-05, "loss": 2.7747, "step": 985},
    {"epoch": 0.592814371257485, "grad_norm": 0.5566183924674988, "learning_rate": 8.51105368963137e-05, "loss": 2.7363, "step": 990},
    {"epoch": 0.5958083832335329, "grad_norm": 0.5821384191513062, "learning_rate": 8.407791125689578e-05, "loss": 2.7286, "step": 995},
    {"epoch": 0.5988023952095808, "grad_norm": 0.5972596406936646, "learning_rate": 8.30470246907484e-05, "loss": 2.8008, "step": 1000},
    {"epoch": 0.6017964071856288, "grad_norm": 0.5653948187828064, "learning_rate": 8.201798979536437e-05, "loss": 2.7381, "step": 1005},
    {"epoch": 0.6047904191616766, "grad_norm": 0.5987522602081299, "learning_rate": 8.099091896598964e-05, "loss": 2.7171, "step": 1010},
    {"epoch": 0.6077844311377245, "grad_norm": 0.6162002086639404, "learning_rate": 7.996592438334728e-05, "loss": 2.7595, "step": 1015},
    {"epoch": 0.6107784431137725, "grad_norm": 0.5301627516746521, "learning_rate": 7.894311800138432e-05, "loss": 2.7199, "step": 1020},
    {"epoch": 0.6137724550898204, "grad_norm": 0.5902780890464783, "learning_rate": 7.792261153504402e-05, "loss": 2.7465, "step": 1025},
    {"epoch": 0.6167664670658682, "grad_norm": 0.5662907958030701, "learning_rate": 7.690451644806372e-05, "loss": 2.7463, "step": 1030},
    {"epoch": 0.6197604790419161, "grad_norm": 0.5153141617774963, "learning_rate": 7.588894394080045e-05, "loss": 2.7497, "step": 1035},
    {"epoch": 0.6227544910179641, "grad_norm": 0.5608941316604614, "learning_rate": 7.487600493808513e-05, "loss": 2.7587, "step": 1040},
    {"epoch": 0.625748502994012, "grad_norm": 0.5900591015815735, "learning_rate": 7.386581007710693e-05, "loss": 2.721, "step": 1045},
    {"epoch": 0.6287425149700598, "grad_norm": 0.5923062562942505, "learning_rate": 7.285846969532907e-05, "loss": 2.7245, "step": 1050},
    {"epoch": 0.6317365269461078, "grad_norm": 0.5622250437736511, "learning_rate": 7.185409381843727e-05, "loss": 2.7514, "step": 1055},
    {"epoch": 0.6347305389221557, "grad_norm": 0.5878713130950928, "learning_rate": 7.085279214832233e-05, "loss": 2.7361, "step": 1060},
    {"epoch": 0.6377245508982036, "grad_norm": 0.5391808748245239, "learning_rate": 6.985467405109815e-05, "loss": 2.7708, "step": 1065},
    {"epoch": 0.6407185628742516, "grad_norm": 0.5266147255897522, "learning_rate": 6.885984854515623e-05, "loss": 2.739, "step": 1070},
    {"epoch": 0.6437125748502994, "grad_norm": 0.5293008089065552, "learning_rate": 6.786842428925821e-05, "loss": 2.7338, "step": 1075},
    {"epoch": 0.6467065868263473, "grad_norm": 0.5374533534049988, "learning_rate": 6.688050957066787e-05, "loss": 2.7521, "step": 1080},
    {"epoch": 0.6497005988023952, "grad_norm": 0.519822895526886, "learning_rate": 6.58962122933234e-05, "loss": 2.7509, "step": 1085},
    {"epoch": 0.6526946107784432, "grad_norm": 0.4745498597621918, "learning_rate": 6.491563996605198e-05, "loss": 2.6944, "step": 1090},
    {"epoch": 0.655688622754491, "grad_norm": 0.5428311228752136, "learning_rate": 6.393889969082691e-05, "loss": 2.746, "step": 1095},
    {"epoch": 0.6586826347305389, "grad_norm": 0.6130102276802063, "learning_rate": 6.29660981510697e-05, "loss": 2.7142, "step": 1100},
    {"epoch": 0.6616766467065869, "grad_norm": 0.565981388092041, "learning_rate": 6.199734159999769e-05, "loss": 2.7377, "step": 1105},
    {"epoch": 0.6646706586826348, "grad_norm": 0.5873706340789795, "learning_rate": 6.103273584901856e-05, "loss": 2.7707, "step": 1110},
    {"epoch": 0.6676646706586826, "grad_norm": 0.6199204325675964, "learning_rate": 6.007238625617333e-05, "loss": 2.7288, "step": 1115},
    {"epoch": 0.6706586826347305, "grad_norm": 0.5575565695762634, "learning_rate": 5.911639771462858e-05, "loss": 2.7454, "step": 1120},
    {"epoch": 0.6736526946107785, "grad_norm": 0.5518234372138977, "learning_rate": 5.8164874641219735e-05, "loss": 2.7345, "step": 1125},
    {"epoch": 0.6766467065868264, "grad_norm": 0.5617517828941345, "learning_rate": 5.721792096504611e-05, "loss": 2.756, "step": 1130},
    {"epoch": 0.6796407185628742, "grad_norm": 0.5457236170768738, "learning_rate": 5.627564011611961e-05, "loss": 2.7492, "step": 1135},
    {"epoch": 0.6826347305389222, "grad_norm": 0.5588122606277466, "learning_rate": 5.5338135014067395e-05, "loss": 2.7385, "step": 1140},
    {"epoch": 0.6856287425149701, "grad_norm": 0.5922512412071228, "learning_rate": 5.440550805689075e-05, "loss": 2.7128, "step": 1145},
    {"epoch": 0.688622754491018, "grad_norm": 0.5327529311180115, "learning_rate": 5.3477861109780835e-05, "loss": 2.7507, "step": 1150},
    {"epoch": 0.6916167664670658, "grad_norm": 0.5234887003898621, "learning_rate": 5.255529549399234e-05, "loss": 2.77, "step": 1155},
    {"epoch": 0.6946107784431138, "grad_norm": 0.5305109024047852, "learning_rate": 5.163791197577714e-05, "loss": 2.7765, "step": 1160},
    {"epoch": 0.6976047904191617, "grad_norm": 0.5282033085823059, "learning_rate": 5.0725810755377825e-05, "loss": 2.7518, "step": 1165},
    {"epoch": 0.7005988023952096, "grad_norm": 0.5210415124893188, "learning_rate": 4.9819091456083644e-05, "loss": 2.7493, "step": 1170},
    {"epoch": 0.7035928143712575, "grad_norm": 0.560530424118042, "learning_rate": 4.891785311334923e-05, "loss": 2.7429, "step": 1175},
    {"epoch": 0.7065868263473054, "grad_norm": 0.5484561324119568, "learning_rate": 4.8022194163977494e-05, "loss": 2.7131, "step": 1180},
    {"epoch": 0.7095808383233533, "grad_norm": 0.5590568780899048, "learning_rate": 4.713221243536816e-05, "loss": 2.715, "step": 1185},
    {"epoch": 0.7125748502994012, "grad_norm": 0.5286269783973694, "learning_rate": 4.6248005134832394e-05, "loss": 2.7006, "step": 1190},
    {"epoch": 0.7155688622754491, "grad_norm": 0.5053289532661438, "learning_rate": 4.5369668838975597e-05, "loss": 2.7375, "step": 1195},
    {"epoch": 0.718562874251497, "grad_norm": 0.5476385354995728, "learning_rate": 4.449729948314894e-05, "loss": 2.7402, "step": 1200},
    {"epoch": 0.7215568862275449, "grad_norm": 0.5178956985473633, "learning_rate": 4.363099235097087e-05, "loss": 2.7, "step": 1205},
    {"epoch": 0.7245508982035929, "grad_norm": 0.5388510227203369, "learning_rate": 4.277084206391989e-05, "loss": 2.7014, "step": 1210},
    {"epoch": 0.7275449101796407, "grad_norm": 0.510368824005127, "learning_rate": 4.191694257099962e-05, "loss": 2.7267, "step": 1215},
    {"epoch": 0.7305389221556886, "grad_norm": 0.563225269317627, "learning_rate": 4.10693871384773e-05, "loss": 2.7277, "step": 1220},
    {"epoch": 0.7335329341317365, "grad_norm": 0.5411152243614197, "learning_rate": 4.022826833969692e-05, "loss": 2.7442, "step": 1225},
    {"epoch": 0.7365269461077845, "grad_norm": 0.5123066902160645, "learning_rate": 3.93936780449679e-05, "loss": 2.7042, "step": 1230},
    {"epoch": 0.7395209580838323, "grad_norm": 0.5124099254608154, "learning_rate": 3.856570741153087e-05, "loss": 2.7288, "step": 1235},
    {"epoch": 0.7425149700598802, "grad_norm": 0.5201666355133057, "learning_rate": 3.774444687360082e-05, "loss": 2.7266, "step": 1240},
    {"epoch": 0.7455089820359282, "grad_norm": 0.5414671301841736, "learning_rate": 3.692998613248977e-05, "loss": 2.7237, "step": 1245},
    {"epoch": 0.7485029940119761, "grad_norm": 0.5279539823532104, "learning_rate": 3.6122414146809014e-05, "loss": 2.7114, "step": 1250},
    {"epoch": 0.7514970059880239, "grad_norm": 0.5688903331756592, "learning_rate": 3.532181912275301e-05, "loss": 2.7434, "step": 1255},
    {"epoch": 0.7544910179640718, "grad_norm": 0.5434580445289612, "learning_rate": 3.4528288504464844e-05, "loss": 2.7369, "step": 1260},
    {"epoch": 0.7574850299401198, "grad_norm": 0.5412757992744446, "learning_rate": 3.3741908964485414e-05, "loss": 2.7672, "step": 1265},
    {"epoch": 0.7604790419161677, "grad_norm": 0.5109795331954956, "learning_rate": 3.296276639428665e-05, "loss": 2.6753, "step": 1270},
    {"epoch": 0.7634730538922155, "grad_norm": 0.5179950594902039, "learning_rate": 3.21909458948901e-05, "loss": 2.7301, "step": 1275},
    {"epoch": 0.7664670658682635, "grad_norm": 0.5099706649780273, "learning_rate": 3.1426531767572e-05, "loss": 2.7018, "step": 1280},
    {"epoch": 0.7694610778443114, "grad_norm": 0.5369354486465454, "learning_rate": 3.0669607504655326e-05, "loss": 2.7473, "step": 1285},
    {"epoch": 0.7724550898203593, "grad_norm": 0.45971599221229553, "learning_rate": 2.9920255780390617e-05, "loss": 2.7035, "step": 1290},
    {"epoch": 0.7754491017964071, "grad_norm": 0.49725160002708435, "learning_rate": 2.917855844192584e-05, "loss": 2.7258, "step": 1295},
    {"epoch": 0.7784431137724551, "grad_norm": 0.48002126812934875, "learning_rate": 2.8444596500366825e-05, "loss": 2.6843, "step": 1300},
    {"epoch": 0.781437125748503, "grad_norm": 0.47459498047828674, "learning_rate": 2.7718450121928918e-05, "loss": 2.7133, "step": 1305},
    {"epoch": 0.7844311377245509, "grad_norm": 0.5074918270111084, "learning_rate": 2.7000198619180794e-05, "loss": 2.7191, "step": 1310},
    {"epoch": 0.7874251497005988, "grad_norm": 0.48583555221557617, "learning_rate": 2.6289920442381722e-05, "loss": 2.7248, "step": 1315},
    {"epoch": 0.7904191616766467, "grad_norm": 0.5015809535980225, "learning_rate": 2.5587693170912875e-05, "loss": 2.7373, "step": 1320},
    {"epoch": 0.7934131736526946, "grad_norm": 0.49757909774780273, "learning_rate": 2.4893593504803826e-05, "loss": 2.7156, "step": 1325},
    {"epoch": 0.7964071856287425, "grad_norm": 0.48436880111694336, "learning_rate": 2.4207697256355145e-05, "loss": 2.7378, "step": 1330},
    {"epoch": 0.7994011976047904, "grad_norm": 0.5281265377998352, "learning_rate": 2.353007934185768e-05, "loss": 2.7332, "step": 1335},
    {"epoch": 0.8023952095808383, "grad_norm": 0.5300716161727905, "learning_rate": 2.2860813773410106e-05, "loss": 2.663, "step": 1340},
    {"epoch": 0.8053892215568862, "grad_norm": 0.429849237203598, "learning_rate": 2.2199973650834906e-05, "loss": 2.6993, "step": 1345},
    {"epoch": 0.8083832335329342, "grad_norm": 0.5136377811431885, "learning_rate": 2.154763115369419e-05, "loss": 2.7297, "step": 1350},
    {"epoch": 0.811377245508982, "grad_norm": 0.523758053779602, "learning_rate": 2.0903857533405958e-05, "loss": 2.7227, "step": 1355},
    {"epoch": 0.8143712574850299, "grad_norm": 0.5091609954833984, "learning_rate": 2.026872310546165e-05, "loss": 2.7174, "step": 1360},
    {"epoch": 0.8173652694610778, "grad_norm": 0.5340681672096252, "learning_rate": 1.9642297241746142e-05, "loss": 2.706, "step": 1365},
    {"epoch": 0.8203592814371258, "grad_norm": 0.4736381471157074, "learning_rate": 1.902464836296054e-05, "loss": 2.7244, "step": 1370},
    {"epoch": 0.8233532934131736, "grad_norm": 0.5413190126419067, "learning_rate": 1.841584393114919e-05, "loss": 2.682, "step": 1375},
    {"epoch": 0.8263473053892215, "grad_norm": 0.45962581038475037, "learning_rate": 1.7815950442330963e-05, "loss": 2.6946, "step": 1380},
    {"epoch": 0.8293413173652695, "grad_norm": 0.5146113634109497, "learning_rate": 1.7225033419236503e-05, "loss": 2.6919, "step": 1385},
    {"epoch": 0.8323353293413174, "grad_norm": 0.48548585176467896, "learning_rate": 1.6643157404151467e-05, "loss": 2.7278, "step": 1390},
    {"epoch": 0.8353293413173652, "grad_norm": 0.46142202615737915, "learning_rate": 1.6070385951866952e-05, "loss": 2.7241, "step": 1395},
    {"epoch": 0.8383233532934131, "grad_norm": 0.5457370281219482, "learning_rate": 1.5506781622737942e-05, "loss": 2.7047, "step": 1400},
    {"epoch": 0.8413173652694611, "grad_norm": 0.48171794414520264, "learning_rate": 1.4952405975850026e-05, "loss": 2.6901, "step": 1405},
    {"epoch": 0.844311377245509, "grad_norm": 0.5125330090522766, "learning_rate": 1.4407319562295762e-05, "loss": 2.7052, "step": 1410},
    {"epoch": 0.8473053892215568, "grad_norm": 0.4971305727958679, "learning_rate": 1.387158191856105e-05, "loss": 2.7405, "step": 1415},
    {"epoch": 0.8502994011976048, "grad_norm": 0.539444625377655, "learning_rate": 1.3345251560022288e-05, "loss": 2.6869, "step": 1420},
    {"epoch": 0.8532934131736527, "grad_norm": 0.5079681873321533, "learning_rate": 1.2828385974555202e-05, "loss": 2.7132, "step": 1425},
    {"epoch": 0.8562874251497006, "grad_norm": 0.5335781574249268, "learning_rate": 1.2321041616255614e-05, "loss": 2.7378, "step": 1430},
    {"epoch": 0.8592814371257484, "grad_norm": 0.4930953085422516, "learning_rate": 1.1823273899273435e-05, "loss": 2.678, "step": 1435},
    {"epoch": 0.8622754491017964, "grad_norm": 0.4556470513343811, "learning_rate": 1.1335137191760093e-05, "loss": 2.6918, "step": 1440},
    {"epoch": 0.8652694610778443, "grad_norm": 0.49691587686538696, "learning_rate": 1.0856684809930151e-05, "loss": 2.6948, "step": 1445},
    {"epoch": 0.8682634730538922, "grad_norm": 0.48159509897232056, "learning_rate": 1.0387969012238064e-05, "loss": 2.694, "step": 1450},
    {"epoch": 0.8712574850299402, "grad_norm": 0.4907771050930023, "learning_rate": 9.929040993670114e-06, "loss": 2.724, "step": 1455},
    {"epoch": 0.874251497005988, "grad_norm": 0.4428238570690155, "learning_rate": 9.47995088015281e-06, "loss": 2.6853, "step": 1460},
    {"epoch": 0.8772455089820359, "grad_norm": 0.4720156192779541, "learning_rate": 9.040747723077902e-06, "loss": 2.7465, "step": 1465},
    {"epoch": 0.8802395209580839, "grad_norm": 0.4877772629261017, "learning_rate": 8.61147949394483e-06, "loss": 2.7284, "step": 1470},
    {"epoch": 0.8832335329341318, "grad_norm": 0.48317137360572815, "learning_rate": 8.192193079121002e-06, "loss": 2.7261, "step": 1475},
    {"epoch": 0.8862275449101796, "grad_norm": 0.48045286536216736, "learning_rate": 7.782934274720777e-06, "loss": 2.6818, "step": 1480},
    {"epoch": 0.8892215568862275, "grad_norm": 0.47768479585647583, "learning_rate": 7.3837477816033896e-06, "loss": 2.692, "step": 1485},
    {"epoch": 0.8922155688622755, "grad_norm": 0.5294510722160339, "learning_rate": 6.994677200490507e-06, "loss": 2.6831, "step": 1490},
    {"epoch": 0.8952095808383234, "grad_norm": 0.48493650555610657, "learning_rate": 6.615765027204102e-06, "loss": 2.6695, "step": 1495},
    {"epoch": 0.8982035928143712, "grad_norm": 0.5145286917686462, "learning_rate": 6.247052648024765e-06, "loss": 2.6966, "step": 1500},
    {"epoch": 0.9011976047904192, "grad_norm": 0.4654744565486908, "learning_rate": 5.888580335171368e-06, "loss": 2.7247, "step": 1505},
    {"epoch": 0.9041916167664671, "grad_norm": 0.4812209904193878, "learning_rate": 5.540387242402434e-06, "loss": 2.7261, "step": 1510},
    {"epoch": 0.907185628742515, "grad_norm": 0.4804396331310272, "learning_rate": 5.20251140073953e-06, "loss": 2.692, "step": 1515},
    {"epoch": 0.9101796407185628, "grad_norm": 0.4617965519428253, "learning_rate": 4.874989714313449e-06, "loss": 2.7015, "step": 1520},
    {"epoch": 0.9131736526946108, "grad_norm": 0.4632161259651184, "learning_rate": 4.5578579563333e-06, "loss": 2.6971, "step": 1525},
    {"epoch": 0.9161676646706587, "grad_norm": 0.530784010887146, "learning_rate": 4.251150765179291e-06, "loss": 2.7031, "step": 1530},
    {"epoch": 0.9191616766467066, "grad_norm": 0.49502164125442505, "learning_rate": 3.954901640619368e-06, "loss": 2.6704, "step": 1535},
    {"epoch": 0.9221556886227545, "grad_norm": 0.4807766079902649, "learning_rate": 3.6691429401502053e-06, "loss": 2.6889, "step": 1540},
    {"epoch": 0.9251497005988024, "grad_norm": 0.49187591671943665, "learning_rate": 3.3939058754630882e-06, "loss": 2.7056, "step": 1545},
    {"epoch": 0.9281437125748503, "grad_norm": 0.5204874277114868, "learning_rate": 3.1292205090347248e-06, "loss": 2.7579, "step": 1550},
    {"epoch": 0.9311377245508982, "grad_norm": 0.5218049883842468, "learning_rate": 2.875115750843771e-06, "loss": 2.722, "step": 1555},
    {"epoch": 0.9341317365269461, "grad_norm": 0.4557659327983856, "learning_rate": 2.6316193552131884e-06, "loss": 2.7076, "step": 1560},
    {"epoch": 0.937125748502994, "grad_norm": 0.4934738874435425, "learning_rate": 2.398757917778727e-06, "loss": 2.7231, "step": 1565},
    {"epoch": 0.9401197604790419, "grad_norm": 0.48129838705062866, "learning_rate": 2.176556872584168e-06, "loss": 2.7231, "step": 1570},
    {"epoch": 0.9431137724550899, "grad_norm": 0.4986419975757599, "learning_rate": 1.965040489303194e-06, "loss": 2.7142, "step": 1575},
    {"epoch": 0.9461077844311377, "grad_norm": 0.48590248823165894, "learning_rate": 1.7642318705886286e-06, "loss": 2.7517, "step": 1580},
    {"epoch": 0.9491017964071856, "grad_norm": 0.4665396809577942, "learning_rate": 1.574152949549057e-06, "loss": 2.7028, "step": 1585},
    {"epoch": 0.9520958083832335, "grad_norm": 0.4755544364452362, "learning_rate": 1.3948244873532078e-06, "loss": 2.6893, "step": 1590},
    {"epoch": 0.9550898203592815, "grad_norm": 0.44289329648017883, "learning_rate": 1.226266070962323e-06, "loss": 2.7409, "step": 1595},
    {"epoch": 0.9580838323353293, "grad_norm": 0.5039628744125366, "learning_rate": 1.0684961109908353e-06, "loss": 2.6975, "step": 1600},
    {"epoch": 0.9610778443113772, "grad_norm": 0.47264382243156433, "learning_rate": 9.21531839695411e-07, "loss": 2.6629, "step": 1605},
    {"epoch": 0.9640718562874252, "grad_norm": 0.4993440806865692, "learning_rate": 7.853893090928654e-07, "loss": 2.7854, "step": 1610},
    {"epoch": 0.9670658682634731, "grad_norm": 0.4866315722465515, "learning_rate": 6.600833892068336e-07, "loss": 2.7192, "step": 1615},
    {"epoch": 0.9700598802395209, "grad_norm": 0.42745715379714966, "learning_rate": 5.456277664436127e-07, "loss": 2.7148, "step": 1620},
    {"epoch": 0.9730538922155688, "grad_norm": 0.49950963258743286, "learning_rate": 4.4203494209733576e-07, "loss": 2.743, "step": 1625},
    {"epoch": 0.9760479041916168, "grad_norm": 0.45195409655570984, "learning_rate": 3.4931623098445334e-07, "loss": 2.7275, "step": 1630},
    {"epoch": 0.9790419161676647, "grad_norm": 0.47945547103881836, "learning_rate": 2.674817602079327e-07, "loss": 2.725, "step": 1635},
    {"epoch": 0.9820359281437125, "grad_norm": 0.48270925879478455, "learning_rate": 1.965404680511207e-07, "loss": 2.6793, "step": 1640},
    {"epoch": 0.9850299401197605, "grad_norm": 0.4643472731113434, "learning_rate": 1.3650010300150228e-07, "loss": 2.7078, "step": 1645},
    {"epoch": 0.9880239520958084, "grad_norm": 0.4867922365665436, "learning_rate": 8.736722290429988e-08, "loss": 2.7148, "step": 1650},
    {"epoch": 0.9910179640718563, "grad_norm": 0.5043019652366638, "learning_rate": 4.9147194246290664e-08, "loss": 2.7255, "step": 1655},
    {"epoch": 0.9940119760479041, "grad_norm": 0.4517767131328583, "learning_rate": 2.1844191569597716e-08, "loss": 2.6949, "step": 1660},
    {"epoch": 0.9970059880239521, "grad_norm": 0.4739466905593872, "learning_rate": 5.461197015765862e-09, "loss": 2.7152, "step": 1665},
    {"epoch": 1.0, "grad_norm": 0.46331822872161865, "learning_rate": 0.0, "loss": 2.7246, "step": 1670},
    {"epoch": 1.0, "step": 1670, "total_flos": 1.396344619008e+16, "train_loss": 2.9739012581145694, "train_runtime": 9187.7213, "train_samples_per_second": 11.629, "train_steps_per_second": 0.182}
  ],
  "logging_steps": 5,
  "max_steps": 1670,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.396344619008e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}