{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.944055944055943, "eval_steps": 500, "global_step": 2852, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06993006993006994, "grad_norm": 30.581405639648438, "learning_rate": 1.3986013986013988e-05, "loss": 1.1014, "step": 10 }, { "epoch": 0.13986013986013987, "grad_norm": 2.1117727756500244, "learning_rate": 2.7972027972027976e-05, "loss": 0.693, "step": 20 }, { "epoch": 0.2097902097902098, "grad_norm": 1.6114064455032349, "learning_rate": 4.195804195804196e-05, "loss": 0.3677, "step": 30 }, { "epoch": 0.27972027972027974, "grad_norm": 0.6671944260597229, "learning_rate": 5.594405594405595e-05, "loss": 0.2686, "step": 40 }, { "epoch": 0.34965034965034963, "grad_norm": 0.8836167454719543, "learning_rate": 6.993006993006993e-05, "loss": 0.2072, "step": 50 }, { "epoch": 0.4195804195804196, "grad_norm": 0.974694013595581, "learning_rate": 8.391608391608392e-05, "loss": 0.1783, "step": 60 }, { "epoch": 0.48951048951048953, "grad_norm": 0.8012842535972595, "learning_rate": 9.790209790209791e-05, "loss": 0.1542, "step": 70 }, { "epoch": 0.5594405594405595, "grad_norm": 0.5671571493148804, "learning_rate": 0.0001118881118881119, "loss": 0.1271, "step": 80 }, { "epoch": 0.6293706293706294, "grad_norm": 0.6660890579223633, "learning_rate": 0.00012587412587412587, "loss": 0.1197, "step": 90 }, { "epoch": 0.6993006993006993, "grad_norm": 0.4433947503566742, "learning_rate": 0.00013986013986013986, "loss": 0.1032, "step": 100 }, { "epoch": 0.7692307692307693, "grad_norm": 0.36532077193260193, "learning_rate": 0.00015384615384615385, "loss": 0.1052, "step": 110 }, { "epoch": 0.8391608391608392, "grad_norm": 0.48936668038368225, "learning_rate": 0.00016783216783216784, "loss": 0.0878, "step": 120 }, { "epoch": 0.9090909090909091, "grad_norm": 0.6362347602844238, "learning_rate": 0.00018181818181818183, "loss": 0.0865, "step": 130 }, { "epoch": 0.9790209790209791, "grad_norm": 0.4699188470840454, "learning_rate": 0.00019580419580419583, "loss": 0.0904, "step": 140 }, { "epoch": 1.048951048951049, "grad_norm": 0.42141956090927124, "learning_rate": 0.00019999670507574947, "loss": 0.0842, "step": 150 }, { "epoch": 1.118881118881119, "grad_norm": 0.3909319043159485, "learning_rate": 0.00019998056719395973, "loss": 0.0808, "step": 160 }, { "epoch": 1.1888111888111887, "grad_norm": 0.5473119616508484, "learning_rate": 0.00019995098333206742, "loss": 0.0713, "step": 170 }, { "epoch": 1.2587412587412588, "grad_norm": 0.37541836500167847, "learning_rate": 0.00019990795746868583, "loss": 0.0644, "step": 180 }, { "epoch": 1.3286713286713288, "grad_norm": 0.34557509422302246, "learning_rate": 0.00019985149539018855, "loss": 0.0707, "step": 190 }, { "epoch": 1.3986013986013985, "grad_norm": 0.29241743683815, "learning_rate": 0.00019978160468993094, "loss": 0.0612, "step": 200 }, { "epoch": 1.4685314685314685, "grad_norm": 0.481886625289917, "learning_rate": 0.00019969829476722923, "loss": 0.065, "step": 210 }, { "epoch": 1.5384615384615383, "grad_norm": 0.30471473932266235, "learning_rate": 0.00019960157682609632, "loss": 0.0596, "step": 220 }, { "epoch": 1.6083916083916083, "grad_norm": 0.30594855546951294, "learning_rate": 0.00019949146387373493, "loss": 0.06, "step": 230 }, { "epoch": 1.6783216783216783, "grad_norm": 0.31589317321777344, "learning_rate": 0.00019936797071878854, "loss": 0.0643, "step": 240 }, { "epoch": 1.7482517482517483, "grad_norm": 0.293947696685791, "learning_rate": 0.00019923111396934957, "loss": 0.0568, "step": 250 }, { "epoch": 1.8181818181818183, "grad_norm": 0.25768476724624634, "learning_rate": 0.00019908091203072598, "loss": 0.0556, "step": 260 }, { "epoch": 1.8881118881118881, "grad_norm": 0.258080393075943, "learning_rate": 0.00019891738510296602, "loss": 0.0543, "step": 270 }, { "epoch": 1.958041958041958, "grad_norm": 0.27586156129837036, "learning_rate": 0.0001987405551781415, "loss": 0.051, "step": 280 }, { "epoch": 2.027972027972028, "grad_norm": 0.26486584544181824, "learning_rate": 0.0001985504460373903, "loss": 0.0521, "step": 290 }, { "epoch": 2.097902097902098, "grad_norm": 0.2695034146308899, "learning_rate": 0.00019834708324771797, "loss": 0.0531, "step": 300 }, { "epoch": 2.167832167832168, "grad_norm": 0.30032461881637573, "learning_rate": 0.00019813049415855964, "loss": 0.0541, "step": 310 }, { "epoch": 2.237762237762238, "grad_norm": 0.3316197395324707, "learning_rate": 0.00019790070789810145, "loss": 0.0458, "step": 320 }, { "epoch": 2.3076923076923075, "grad_norm": 0.3239120543003082, "learning_rate": 0.00019765775536936367, "loss": 0.0533, "step": 330 }, { "epoch": 2.3776223776223775, "grad_norm": 0.20758603513240814, "learning_rate": 0.00019740166924604431, "loss": 0.0527, "step": 340 }, { "epoch": 2.4475524475524475, "grad_norm": 0.31676483154296875, "learning_rate": 0.00019713248396812524, "loss": 0.0521, "step": 350 }, { "epoch": 2.5174825174825175, "grad_norm": 0.25248146057128906, "learning_rate": 0.00019685023573724037, "loss": 0.0518, "step": 360 }, { "epoch": 2.5874125874125875, "grad_norm": 0.26571494340896606, "learning_rate": 0.0001965549625118071, "loss": 0.045, "step": 370 }, { "epoch": 2.6573426573426575, "grad_norm": 0.2656267285346985, "learning_rate": 0.00019624670400192126, "loss": 0.0434, "step": 380 }, { "epoch": 2.7272727272727275, "grad_norm": 0.30044880509376526, "learning_rate": 0.00019592550166401695, "loss": 0.0443, "step": 390 }, { "epoch": 2.797202797202797, "grad_norm": 0.27586933970451355, "learning_rate": 0.00019559139869529103, "loss": 0.044, "step": 400 }, { "epoch": 2.867132867132867, "grad_norm": 0.34137022495269775, "learning_rate": 0.00019524444002789383, "loss": 0.0473, "step": 410 }, { "epoch": 2.937062937062937, "grad_norm": 0.2440134882926941, "learning_rate": 0.0001948846723228862, "loss": 0.0438, "step": 420 }, { "epoch": 3.006993006993007, "grad_norm": 0.23091812431812286, "learning_rate": 0.00019451214396396454, "loss": 0.0463, "step": 430 }, { "epoch": 3.076923076923077, "grad_norm": 0.15170010924339294, "learning_rate": 0.00019412690505095365, "loss": 0.0416, "step": 440 }, { "epoch": 3.1468531468531467, "grad_norm": 0.25807687640190125, "learning_rate": 0.00019372900739306908, "loss": 0.0419, "step": 450 }, { "epoch": 3.2167832167832167, "grad_norm": 0.38740265369415283, "learning_rate": 0.00019331850450194957, "loss": 0.0379, "step": 460 }, { "epoch": 3.2867132867132867, "grad_norm": 0.24318642914295197, "learning_rate": 0.00019289545158446045, "loss": 0.0391, "step": 470 }, { "epoch": 3.3566433566433567, "grad_norm": 0.2211398035287857, "learning_rate": 0.00019245990553526905, "loss": 0.043, "step": 480 }, { "epoch": 3.4265734265734267, "grad_norm": 0.3056994378566742, "learning_rate": 0.00019201192492919317, "loss": 0.0376, "step": 490 }, { "epoch": 3.4965034965034967, "grad_norm": 0.18653175234794617, "learning_rate": 0.00019155157001332374, "loss": 0.0442, "step": 500 }, { "epoch": 3.5664335664335667, "grad_norm": 0.21762531995773315, "learning_rate": 0.00019107890269892214, "loss": 0.0392, "step": 510 }, { "epoch": 3.6363636363636362, "grad_norm": 0.24727760255336761, "learning_rate": 0.0001905939865530944, "loss": 0.0413, "step": 520 }, { "epoch": 3.7062937062937062, "grad_norm": 0.34239208698272705, "learning_rate": 0.0001900968867902419, "loss": 0.0417, "step": 530 }, { "epoch": 3.7762237762237763, "grad_norm": 0.23154211044311523, "learning_rate": 0.0001895876702632913, "loss": 0.0391, "step": 540 }, { "epoch": 3.8461538461538463, "grad_norm": 0.25248488783836365, "learning_rate": 0.00018906640545470355, "loss": 0.0367, "step": 550 }, { "epoch": 3.916083916083916, "grad_norm": 0.2887849807739258, "learning_rate": 0.000188533162467264, "loss": 0.0394, "step": 560 }, { "epoch": 3.986013986013986, "grad_norm": 0.30540716648101807, "learning_rate": 0.0001879880130146547, "loss": 0.0435, "step": 570 }, { "epoch": 4.055944055944056, "grad_norm": 0.2462405562400818, "learning_rate": 0.0001874310304118096, "loss": 0.0354, "step": 580 }, { "epoch": 4.125874125874126, "grad_norm": 0.2431563138961792, "learning_rate": 0.00018686228956505516, "loss": 0.0409, "step": 590 }, { "epoch": 4.195804195804196, "grad_norm": 0.2564241886138916, "learning_rate": 0.00018628186696203612, "loss": 0.0397, "step": 600 }, { "epoch": 4.265734265734266, "grad_norm": 0.24190612137317657, "learning_rate": 0.00018568984066142917, "loss": 0.0377, "step": 610 }, { "epoch": 4.335664335664336, "grad_norm": 0.23087257146835327, "learning_rate": 0.00018508629028244519, "loss": 0.0412, "step": 620 }, { "epoch": 4.405594405594406, "grad_norm": 0.4008491039276123, "learning_rate": 0.00018447129699412142, "loss": 0.039, "step": 630 }, { "epoch": 4.475524475524476, "grad_norm": 0.27394184470176697, "learning_rate": 0.00018384494350440553, "loss": 0.038, "step": 640 }, { "epoch": 4.545454545454545, "grad_norm": 0.20550896227359772, "learning_rate": 0.0001832073140490325, "loss": 0.0404, "step": 650 }, { "epoch": 4.615384615384615, "grad_norm": 0.2519960403442383, "learning_rate": 0.00018255849438019608, "loss": 0.0378, "step": 660 }, { "epoch": 4.685314685314685, "grad_norm": 0.3346613645553589, "learning_rate": 0.00018189857175501635, "loss": 0.035, "step": 670 }, { "epoch": 4.755244755244755, "grad_norm": 0.32892099022865295, "learning_rate": 0.00018122763492380486, "loss": 0.0325, "step": 680 }, { "epoch": 4.825174825174825, "grad_norm": 0.24375475943088531, "learning_rate": 0.00018054577411812895, "loss": 0.0318, "step": 690 }, { "epoch": 4.895104895104895, "grad_norm": 0.19709643721580505, "learning_rate": 0.00017985308103867688, "loss": 0.0368, "step": 700 }, { "epoch": 4.965034965034965, "grad_norm": 0.25721314549446106, "learning_rate": 0.00017914964884292544, "loss": 0.0377, "step": 710 }, { "epoch": 5.034965034965035, "grad_norm": 0.3517259955406189, "learning_rate": 0.00017843557213261142, "loss": 0.0359, "step": 720 }, { "epoch": 5.104895104895105, "grad_norm": 0.2421354055404663, "learning_rate": 0.00017771094694100925, "loss": 0.0375, "step": 730 }, { "epoch": 5.174825174825175, "grad_norm": 0.22226205468177795, "learning_rate": 0.00017697587072001557, "loss": 0.0376, "step": 740 }, { "epoch": 5.244755244755245, "grad_norm": 0.2880263924598694, "learning_rate": 0.0001762304423270436, "loss": 0.0377, "step": 750 }, { "epoch": 5.314685314685315, "grad_norm": 0.2628781795501709, "learning_rate": 0.00017547476201172808, "loss": 0.0366, "step": 760 }, { "epoch": 5.384615384615385, "grad_norm": 0.21351908147335052, "learning_rate": 0.00017470893140244303, "loss": 0.0346, "step": 770 }, { "epoch": 5.454545454545454, "grad_norm": 0.23355896770954132, "learning_rate": 0.00017393305349263434, "loss": 0.0367, "step": 780 }, { "epoch": 5.524475524475524, "grad_norm": 0.17863404750823975, "learning_rate": 0.00017314723262696848, "loss": 0.0319, "step": 790 }, { "epoch": 5.594405594405594, "grad_norm": 0.2492302805185318, "learning_rate": 0.00017235157448729967, "loss": 0.0345, "step": 800 }, { "epoch": 5.664335664335664, "grad_norm": 0.1684318482875824, "learning_rate": 0.00017154618607845702, "loss": 0.0325, "step": 810 }, { "epoch": 5.734265734265734, "grad_norm": 0.19033940136432648, "learning_rate": 0.00017073117571385414, "loss": 0.0325, "step": 820 }, { "epoch": 5.804195804195804, "grad_norm": 0.1792939007282257, "learning_rate": 0.00016990665300092224, "loss": 0.037, "step": 830 }, { "epoch": 5.874125874125874, "grad_norm": 0.17368149757385254, "learning_rate": 0.00016907272882636968, "loss": 0.0329, "step": 840 }, { "epoch": 5.944055944055944, "grad_norm": 0.20785829424858093, "learning_rate": 0.0001682295153412691, "loss": 0.0316, "step": 850 }, { "epoch": 6.013986013986014, "grad_norm": 0.19825145602226257, "learning_rate": 0.00016737712594597483, "loss": 0.0274, "step": 860 }, { "epoch": 6.083916083916084, "grad_norm": 0.15773798525333405, "learning_rate": 0.00016651567527487204, "loss": 0.0347, "step": 870 }, { "epoch": 6.153846153846154, "grad_norm": 0.26358160376548767, "learning_rate": 0.00016564527918096005, "loss": 0.0299, "step": 880 }, { "epoch": 6.223776223776224, "grad_norm": 0.3937130272388458, "learning_rate": 0.00016476605472027172, "loss": 0.0309, "step": 890 }, { "epoch": 6.293706293706293, "grad_norm": 0.13225628435611725, "learning_rate": 0.00016387812013613103, "loss": 0.035, "step": 900 }, { "epoch": 6.363636363636363, "grad_norm": 0.19665835797786713, "learning_rate": 0.00016298159484325118, "loss": 0.0316, "step": 910 }, { "epoch": 6.433566433566433, "grad_norm": 0.21388469636440277, "learning_rate": 0.00016207659941167485, "loss": 0.0272, "step": 920 }, { "epoch": 6.503496503496503, "grad_norm": 0.47336331009864807, "learning_rate": 0.00016116325555055915, "loss": 0.0293, "step": 930 }, { "epoch": 6.573426573426573, "grad_norm": 0.1514803022146225, "learning_rate": 0.00016024168609180757, "loss": 0.0316, "step": 940 }, { "epoch": 6.643356643356643, "grad_norm": 0.20940926671028137, "learning_rate": 0.00015931201497355088, "loss": 0.0282, "step": 950 }, { "epoch": 6.713286713286713, "grad_norm": 0.22871293127536774, "learning_rate": 0.000158374367223479, "loss": 0.0349, "step": 960 }, { "epoch": 6.783216783216783, "grad_norm": 0.18799573183059692, "learning_rate": 0.00015742886894202674, "loss": 0.0295, "step": 970 }, { "epoch": 6.853146853146853, "grad_norm": 0.17343267798423767, "learning_rate": 0.00015647564728541485, "loss": 0.0299, "step": 980 }, { "epoch": 6.923076923076923, "grad_norm": 0.21930481493473053, "learning_rate": 0.00015551483044854954, "loss": 0.0285, "step": 990 }, { "epoch": 6.993006993006993, "grad_norm": 0.15558023750782013, "learning_rate": 0.00015454654764778187, "loss": 0.03, "step": 1000 }, { "epoch": 7.062937062937063, "grad_norm": 0.39069393277168274, "learning_rate": 0.00015357092910353001, "loss": 0.0263, "step": 1010 }, { "epoch": 7.1328671328671325, "grad_norm": 0.22350826859474182, "learning_rate": 0.00015258810602276654, "loss": 0.0341, "step": 1020 }, { "epoch": 7.2027972027972025, "grad_norm": 0.16752642393112183, "learning_rate": 0.00015159821058137278, "loss": 0.0279, "step": 1030 }, { "epoch": 7.2727272727272725, "grad_norm": 0.17457683384418488, "learning_rate": 0.00015060137590636318, "loss": 0.0268, "step": 1040 }, { "epoch": 7.3426573426573425, "grad_norm": 0.1841355264186859, "learning_rate": 0.00014959773605798145, "loss": 0.0248, "step": 1050 }, { "epoch": 7.4125874125874125, "grad_norm": 0.18273039162158966, "learning_rate": 0.0001485874260116714, "loss": 0.0256, "step": 1060 }, { "epoch": 7.4825174825174825, "grad_norm": 0.21696248650550842, "learning_rate": 0.00014757058163992464, "loss": 0.026, "step": 1070 }, { "epoch": 7.5524475524475525, "grad_norm": 0.23004591464996338, "learning_rate": 0.0001465473396940078, "loss": 0.0305, "step": 1080 }, { "epoch": 7.6223776223776225, "grad_norm": 0.17306514084339142, "learning_rate": 0.000145517837785571, "loss": 0.0247, "step": 1090 }, { "epoch": 7.6923076923076925, "grad_norm": 0.2152336984872818, "learning_rate": 0.0001444822143681415, "loss": 0.0274, "step": 1100 }, { "epoch": 7.7622377622377625, "grad_norm": 0.1681831181049347, "learning_rate": 0.00014344060871850325, "loss": 0.0247, "step": 1110 }, { "epoch": 7.8321678321678325, "grad_norm": 0.1374141126871109, "learning_rate": 0.00014239316091796647, "loss": 0.0285, "step": 1120 }, { "epoch": 7.902097902097902, "grad_norm": 0.3081527352333069, "learning_rate": 0.00014134001183352832, "loss": 0.0246, "step": 1130 }, { "epoch": 7.972027972027972, "grad_norm": 0.4359898865222931, "learning_rate": 0.0001402813030989286, "loss": 0.0284, "step": 1140 }, { "epoch": 8.041958041958042, "grad_norm": 0.15682591497898102, "learning_rate": 0.00013921717709560182, "loss": 0.0261, "step": 1150 }, { "epoch": 8.111888111888112, "grad_norm": 0.23213213682174683, "learning_rate": 0.000138147776933529, "loss": 0.0265, "step": 1160 }, { "epoch": 8.181818181818182, "grad_norm": 0.1675710529088974, "learning_rate": 0.00013707324643199114, "loss": 0.0297, "step": 1170 }, { "epoch": 8.251748251748252, "grad_norm": 0.1905348300933838, "learning_rate": 0.00013599373010022794, "loss": 0.0282, "step": 1180 }, { "epoch": 8.321678321678322, "grad_norm": 0.22106771171092987, "learning_rate": 0.0001349093731180031, "loss": 0.0265, "step": 1190 }, { "epoch": 8.391608391608392, "grad_norm": 0.21079835295677185, "learning_rate": 0.00013382032131607966, "loss": 0.0288, "step": 1200 }, { "epoch": 8.461538461538462, "grad_norm": 0.3102055788040161, "learning_rate": 0.00013272672115660796, "loss": 0.0226, "step": 1210 }, { "epoch": 8.531468531468532, "grad_norm": 0.16320601105690002, "learning_rate": 0.00013162871971342837, "loss": 0.026, "step": 1220 }, { "epoch": 8.601398601398602, "grad_norm": 0.2458457201719284, "learning_rate": 0.00013052646465229207, "loss": 0.0257, "step": 1230 }, { "epoch": 8.671328671328672, "grad_norm": 0.21947446465492249, "learning_rate": 0.00012942010421100207, "loss": 0.0255, "step": 1240 }, { "epoch": 8.741258741258742, "grad_norm": 0.1916748285293579, "learning_rate": 0.00012830978717947718, "loss": 0.0239, "step": 1250 }, { "epoch": 8.811188811188812, "grad_norm": 0.14072898030281067, "learning_rate": 0.00012719566287974204, "loss": 0.0261, "step": 1260 }, { "epoch": 8.881118881118882, "grad_norm": 0.1773282140493393, "learning_rate": 0.00012607788114584522, "loss": 0.023, "step": 1270 }, { "epoch": 8.951048951048952, "grad_norm": 0.13442933559417725, "learning_rate": 0.0001249565923037088, "loss": 0.0251, "step": 1280 }, { "epoch": 9.020979020979022, "grad_norm": 0.17046289145946503, "learning_rate": 0.00012383194715091163, "loss": 0.0253, "step": 1290 }, { "epoch": 9.090909090909092, "grad_norm": 0.25058501958847046, "learning_rate": 0.00012270409693640905, "loss": 0.0262, "step": 1300 }, { "epoch": 9.16083916083916, "grad_norm": 0.22269104421138763, "learning_rate": 0.00012157319334019219, "loss": 0.0235, "step": 1310 }, { "epoch": 9.23076923076923, "grad_norm": 0.25462645292282104, "learning_rate": 0.00012043938845288904, "loss": 0.0256, "step": 1320 }, { "epoch": 9.3006993006993, "grad_norm": 0.18311487138271332, "learning_rate": 0.00011930283475531048, "loss": 0.0259, "step": 1330 }, { "epoch": 9.37062937062937, "grad_norm": 0.16915184259414673, "learning_rate": 0.00011816368509794364, "loss": 0.0259, "step": 1340 }, { "epoch": 9.44055944055944, "grad_norm": 0.16743628680706024, "learning_rate": 0.00011702209268039581, "loss": 0.0229, "step": 1350 }, { "epoch": 9.51048951048951, "grad_norm": 0.16608262062072754, "learning_rate": 0.00011587821103079111, "loss": 0.0238, "step": 1360 }, { "epoch": 9.58041958041958, "grad_norm": 0.14811694622039795, "learning_rate": 0.00011473219398512316, "loss": 0.0226, "step": 1370 }, { "epoch": 9.65034965034965, "grad_norm": 0.1586376130580902, "learning_rate": 0.00011358419566656642, "loss": 0.0196, "step": 1380 }, { "epoch": 9.72027972027972, "grad_norm": 0.13793979585170746, "learning_rate": 0.00011243437046474853, "loss": 0.0209, "step": 1390 }, { "epoch": 9.79020979020979, "grad_norm": 0.207151859998703, "learning_rate": 0.00011128287301498739, "loss": 0.0235, "step": 1400 }, { "epoch": 9.86013986013986, "grad_norm": 0.48499253392219543, "learning_rate": 0.00011012985817749463, "loss": 0.0217, "step": 1410 }, { "epoch": 9.93006993006993, "grad_norm": 0.11960924416780472, "learning_rate": 0.00010897548101654926, "loss": 0.0217, "step": 1420 }, { "epoch": 10.0, "grad_norm": 0.12925837934017181, "learning_rate": 0.00010781989677964355, "loss": 0.0207, "step": 1430 }, { "epoch": 10.06993006993007, "grad_norm": 0.1686684936285019, "learning_rate": 0.00010666326087660458, "loss": 0.0204, "step": 1440 }, { "epoch": 10.13986013986014, "grad_norm": 0.15361668169498444, "learning_rate": 0.00010550572885869367, "loss": 0.0197, "step": 1450 }, { "epoch": 10.20979020979021, "grad_norm": 0.2281733602285385, "learning_rate": 0.00010434745639768705, "loss": 0.0252, "step": 1460 }, { "epoch": 10.27972027972028, "grad_norm": 0.14611436426639557, "learning_rate": 0.00010318859926494014, "loss": 0.02, "step": 1470 }, { "epoch": 10.34965034965035, "grad_norm": 0.3116054832935333, "learning_rate": 0.00010202931331043839, "loss": 0.0217, "step": 1480 }, { "epoch": 10.41958041958042, "grad_norm": 0.19651482999324799, "learning_rate": 0.00010086975444183782, "loss": 0.0232, "step": 1490 }, { "epoch": 10.48951048951049, "grad_norm": 0.18464644253253937, "learning_rate": 9.971007860349756e-05, "loss": 0.02, "step": 1500 }, { "epoch": 10.55944055944056, "grad_norm": 0.18407182395458221, "learning_rate": 9.855044175550756e-05, "loss": 0.0232, "step": 1510 }, { "epoch": 10.62937062937063, "grad_norm": 0.12513527274131775, "learning_rate": 9.739099985271394e-05, "loss": 0.0219, "step": 1520 }, { "epoch": 10.6993006993007, "grad_norm": 0.13158084452152252, "learning_rate": 9.623190882374564e-05, "loss": 0.0224, "step": 1530 }, { "epoch": 10.76923076923077, "grad_norm": 0.17640575766563416, "learning_rate": 9.507332455004395e-05, "loss": 0.0226, "step": 1540 }, { "epoch": 10.83916083916084, "grad_norm": 0.30499282479286194, "learning_rate": 9.391540284489862e-05, "loss": 0.0216, "step": 1550 }, { "epoch": 10.909090909090908, "grad_norm": 0.1940823644399643, "learning_rate": 9.275829943249334e-05, "loss": 0.0241, "step": 1560 }, { "epoch": 10.979020979020978, "grad_norm": 0.159086212515831, "learning_rate": 9.160216992696286e-05, "loss": 0.022, "step": 1570 }, { "epoch": 11.048951048951048, "grad_norm": 0.19389483332633972, "learning_rate": 9.044716981146526e-05, "loss": 0.022, "step": 1580 }, { "epoch": 11.118881118881118, "grad_norm": 0.12246847152709961, "learning_rate": 8.929345441727142e-05, "loss": 0.0205, "step": 1590 }, { "epoch": 11.188811188811188, "grad_norm": 0.16194018721580505, "learning_rate": 8.814117890287538e-05, "loss": 0.0207, "step": 1600 }, { "epoch": 11.258741258741258, "grad_norm": 0.17201407253742218, "learning_rate": 8.699049823312748e-05, "loss": 0.0216, "step": 1610 }, { "epoch": 11.328671328671328, "grad_norm": 0.12679952383041382, "learning_rate": 8.584156715839401e-05, "loss": 0.0194, "step": 1620 }, { "epoch": 11.398601398601398, "grad_norm": 0.19168208539485931, "learning_rate": 8.469454019374531e-05, "loss": 0.016, "step": 1630 }, { "epoch": 11.468531468531468, "grad_norm": 0.20202843844890594, "learning_rate": 8.354957159817561e-05, "loss": 0.0227, "step": 1640 }, { "epoch": 11.538461538461538, "grad_norm": 0.14431844651699066, "learning_rate": 8.240681535385757e-05, "loss": 0.0177, "step": 1650 }, { "epoch": 11.608391608391608, "grad_norm": 0.16581477224826813, "learning_rate": 8.126642514543359e-05, "loss": 0.017, "step": 1660 }, { "epoch": 11.678321678321678, "grad_norm": 0.15920375287532806, "learning_rate": 8.012855433934765e-05, "loss": 0.02, "step": 1670 }, { "epoch": 11.748251748251748, "grad_norm": 0.15303653478622437, "learning_rate": 7.899335596321944e-05, "loss": 0.0207, "step": 1680 }, { "epoch": 11.818181818181818, "grad_norm": 0.15237000584602356, "learning_rate": 7.786098268526448e-05, "loss": 0.0225, "step": 1690 }, { "epoch": 11.888111888111888, "grad_norm": 0.13649679720401764, "learning_rate": 7.673158679376234e-05, "loss": 0.0204, "step": 1700 }, { "epoch": 11.958041958041958, "grad_norm": 0.13087141513824463, "learning_rate": 7.560532017657585e-05, "loss": 0.0211, "step": 1710 }, { "epoch": 12.027972027972028, "grad_norm": 0.13551278412342072, "learning_rate": 7.448233430072466e-05, "loss": 0.0192, "step": 1720 }, { "epoch": 12.097902097902098, "grad_norm": 0.19065268337726593, "learning_rate": 7.336278019201462e-05, "loss": 0.0174, "step": 1730 }, { "epoch": 12.167832167832168, "grad_norm": 0.16062477231025696, "learning_rate": 7.224680841472741e-05, "loss": 0.0209, "step": 1740 }, { "epoch": 12.237762237762238, "grad_norm": 0.3261907696723938, "learning_rate": 7.113456905137132e-05, "loss": 0.0204, "step": 1750 }, { "epoch": 12.307692307692308, "grad_norm": 0.36956819891929626, "learning_rate": 7.002621168249759e-05, "loss": 0.0235, "step": 1760 }, { "epoch": 12.377622377622378, "grad_norm": 0.16811484098434448, "learning_rate": 6.892188536658369e-05, "loss": 0.0201, "step": 1770 }, { "epoch": 12.447552447552448, "grad_norm": 0.14103484153747559, "learning_rate": 6.782173861998726e-05, "loss": 0.0226, "step": 1780 }, { "epoch": 12.517482517482517, "grad_norm": 0.17502908408641815, "learning_rate": 6.672591939697261e-05, "loss": 0.0204, "step": 1790 }, { "epoch": 12.587412587412587, "grad_norm": 0.13043367862701416, "learning_rate": 6.563457506981297e-05, "loss": 0.0201, "step": 1800 }, { "epoch": 12.657342657342657, "grad_norm": 0.13106204569339752, "learning_rate": 6.454785240897112e-05, "loss": 0.0199, "step": 1810 }, { "epoch": 12.727272727272727, "grad_norm": 0.30531424283981323, "learning_rate": 6.34658975633605e-05, "loss": 0.0184, "step": 1820 }, { "epoch": 12.797202797202797, "grad_norm": 0.16103509068489075, "learning_rate": 6.238885604069075e-05, "loss": 0.0212, "step": 1830 }, { "epoch": 12.867132867132867, "grad_norm": 0.12883076071739197, "learning_rate": 6.131687268789838e-05, "loss": 0.0166, "step": 1840 }, { "epoch": 12.937062937062937, "grad_norm": 0.1775168776512146, "learning_rate": 6.0250091671667484e-05, "loss": 0.022, "step": 1850 }, { "epoch": 13.006993006993007, "grad_norm": 0.14147868752479553, "learning_rate": 5.9188656459040837e-05, "loss": 0.024, "step": 1860 }, { "epoch": 13.076923076923077, "grad_norm": 0.10535666346549988, "learning_rate": 5.813270979812589e-05, "loss": 0.0187, "step": 1870 }, { "epoch": 13.146853146853147, "grad_norm": 0.18340010941028595, "learning_rate": 5.7082393698897166e-05, "loss": 0.0201, "step": 1880 }, { "epoch": 13.216783216783217, "grad_norm": 0.12412499636411667, "learning_rate": 5.60378494140976e-05, "loss": 0.0189, "step": 1890 }, { "epoch": 13.286713286713287, "grad_norm": 0.21642570197582245, "learning_rate": 5.4999217420242576e-05, "loss": 0.0178, "step": 1900 }, { "epoch": 13.356643356643357, "grad_norm": 0.17489974200725555, "learning_rate": 5.396663739872725e-05, "loss": 0.0173, "step": 1910 }, { "epoch": 13.426573426573427, "grad_norm": 0.12585236132144928, "learning_rate": 5.294024821704172e-05, "loss": 0.0206, "step": 1920 }, { "epoch": 13.496503496503497, "grad_norm": 0.12454655766487122, "learning_rate": 5.192018791009521e-05, "loss": 0.0172, "step": 1930 }, { "epoch": 13.566433566433567, "grad_norm": 0.1352427452802658, "learning_rate": 5.090659366165227e-05, "loss": 0.0185, "step": 1940 }, { "epoch": 13.636363636363637, "grad_norm": 0.15033438801765442, "learning_rate": 4.989960178588357e-05, "loss": 0.0177, "step": 1950 }, { "epoch": 13.706293706293707, "grad_norm": 0.2707054018974304, "learning_rate": 4.889934770903336e-05, "loss": 0.0165, "step": 1960 }, { "epoch": 13.776223776223777, "grad_norm": 0.19302873313426971, "learning_rate": 4.790596595120699e-05, "loss": 0.0158, "step": 1970 }, { "epoch": 13.846153846153847, "grad_norm": 0.14991451799869537, "learning_rate": 4.6919590108279254e-05, "loss": 0.0191, "step": 1980 }, { "epoch": 13.916083916083917, "grad_norm": 0.1627596765756607, "learning_rate": 4.594035283392815e-05, "loss": 0.0178, "step": 1990 }, { "epoch": 13.986013986013987, "grad_norm": 0.1320677548646927, "learning_rate": 4.49683858217944e-05, "loss": 0.0169, "step": 2000 }, { "epoch": 14.055944055944057, "grad_norm": 0.25264090299606323, "learning_rate": 4.4003819787770964e-05, "loss": 0.0179, "step": 2010 }, { "epoch": 14.125874125874127, "grad_norm": 0.11774495244026184, "learning_rate": 4.304678445242309e-05, "loss": 0.0152, "step": 2020 }, { "epoch": 14.195804195804195, "grad_norm": 0.17087212204933167, "learning_rate": 4.209740852354313e-05, "loss": 0.017, "step": 2030 }, { "epoch": 14.265734265734265, "grad_norm": 0.1586153358221054, "learning_rate": 4.115581967884094e-05, "loss": 0.0199, "step": 2040 }, { "epoch": 14.335664335664335, "grad_norm": 0.15853182971477509, "learning_rate": 4.022214454877305e-05, "loss": 0.0148, "step": 2050 }, { "epoch": 14.405594405594405, "grad_norm": 0.131947860121727, "learning_rate": 3.929650869951278e-05, "loss": 0.0178, "step": 2060 }, { "epoch": 14.475524475524475, "grad_norm": 0.26362699270248413, "learning_rate": 3.8379036616063066e-05, "loss": 0.0173, "step": 2070 }, { "epoch": 14.545454545454545, "grad_norm": 0.12325582653284073, "learning_rate": 3.746985168551532e-05, "loss": 0.0167, "step": 2080 }, { "epoch": 14.615384615384615, "grad_norm": 0.1346844732761383, "learning_rate": 3.65690761804554e-05, "loss": 0.0161, "step": 2090 }, { "epoch": 14.685314685314685, "grad_norm": 0.17594227194786072, "learning_rate": 3.567683124251972e-05, "loss": 0.0198, "step": 2100 }, { "epoch": 14.755244755244755, "grad_norm": 0.11365789920091629, "learning_rate": 3.4793236866103294e-05, "loss": 0.0138, "step": 2110 }, { "epoch": 14.825174825174825, "grad_norm": 0.11247463524341583, "learning_rate": 3.391841188222246e-05, "loss": 0.0137, "step": 2120 }, { "epoch": 14.895104895104895, "grad_norm": 0.17104235291481018, "learning_rate": 3.305247394253349e-05, "loss": 0.0171, "step": 2130 }, { "epoch": 14.965034965034965, "grad_norm": 0.11906550079584122, "learning_rate": 3.2195539503510164e-05, "loss": 0.0195, "step": 2140 }, { "epoch": 15.034965034965035, "grad_norm": 0.13180206716060638, "learning_rate": 3.1347723810782134e-05, "loss": 0.0147, "step": 2150 }, { "epoch": 15.104895104895105, "grad_norm": 0.27480393648147583, "learning_rate": 3.05091408836359e-05, "loss": 0.0142, "step": 2160 }, { "epoch": 15.174825174825175, "grad_norm": 0.19437533617019653, "learning_rate": 2.967990349968086e-05, "loss": 0.0159, "step": 2170 }, { "epoch": 15.244755244755245, "grad_norm": 0.10018286108970642, "learning_rate": 2.8860123179682242e-05, "loss": 0.0139, "step": 2180 }, { "epoch": 15.314685314685315, "grad_norm": 0.10660770535469055, "learning_rate": 2.8049910172563188e-05, "loss": 0.0138, "step": 2190 }, { "epoch": 15.384615384615385, "grad_norm": 0.19103875756263733, "learning_rate": 2.7249373440577963e-05, "loss": 0.0148, "step": 2200 }, { "epoch": 15.454545454545455, "grad_norm": 0.11443736404180527, "learning_rate": 2.6458620644657693e-05, "loss": 0.0193, "step": 2210 }, { "epoch": 15.524475524475525, "grad_norm": 0.1157824918627739, "learning_rate": 2.567775812993186e-05, "loss": 0.0179, "step": 2220 }, { "epoch": 15.594405594405595, "grad_norm": 0.09878715127706528, "learning_rate": 2.4906890911426208e-05, "loss": 0.0177, "step": 2230 }, { "epoch": 15.664335664335665, "grad_norm": 0.18489457666873932, "learning_rate": 2.4146122659939686e-05, "loss": 0.0127, "step": 2240 }, { "epoch": 15.734265734265735, "grad_norm": 0.15683647990226746, "learning_rate": 2.339555568810221e-05, "loss": 0.0152, "step": 2250 }, { "epoch": 15.804195804195803, "grad_norm": 0.13426269590854645, "learning_rate": 2.2655290936615093e-05, "loss": 0.0163, "step": 2260 }, { "epoch": 15.874125874125873, "grad_norm": 0.10421716421842575, "learning_rate": 2.1925427960675894e-05, "loss": 0.0148, "step": 2270 }, { "epoch": 15.944055944055943, "grad_norm": 0.20967915654182434, "learning_rate": 2.120606491658966e-05, "loss": 0.0128, "step": 2280 }, { "epoch": 16.013986013986013, "grad_norm": 0.11200631409883499, "learning_rate": 2.049729854856832e-05, "loss": 0.0133, "step": 2290 }, { "epoch": 16.083916083916083, "grad_norm": 0.10998646169900894, "learning_rate": 1.9799224175719767e-05, "loss": 0.015, "step": 2300 }, { "epoch": 16.153846153846153, "grad_norm": 0.1218709722161293, "learning_rate": 1.9111935679229142e-05, "loss": 0.0155, "step": 2310 }, { "epoch": 16.223776223776223, "grad_norm": 0.11821790039539337, "learning_rate": 1.843552548973272e-05, "loss": 0.0144, "step": 2320 }, { "epoch": 16.293706293706293, "grad_norm": 0.1463775783777237, "learning_rate": 1.7770084574887567e-05, "loss": 0.0117, "step": 2330 }, { "epoch": 16.363636363636363, "grad_norm": 0.15775668621063232, "learning_rate": 1.7115702427137616e-05, "loss": 0.0174, "step": 2340 }, { "epoch": 16.433566433566433, "grad_norm": 0.1515921652317047, "learning_rate": 1.647246705167812e-05, "loss": 0.0137, "step": 2350 }, { "epoch": 16.503496503496503, "grad_norm": 0.16316431760787964, "learning_rate": 1.5840464954620206e-05, "loss": 0.016, "step": 2360 }, { "epoch": 16.573426573426573, "grad_norm": 0.07553193718194962, "learning_rate": 1.5219781131357103e-05, "loss": 0.0162, "step": 2370 }, { "epoch": 16.643356643356643, "grad_norm": 0.1977609395980835, "learning_rate": 1.4610499055133375e-05, "loss": 0.014, "step": 2380 }, { "epoch": 16.713286713286713, "grad_norm": 0.2475104033946991, "learning_rate": 1.401270066581899e-05, "loss": 0.0157, "step": 2390 }, { "epoch": 16.783216783216783, "grad_norm": 0.20461061596870422, "learning_rate": 1.3426466358889545e-05, "loss": 0.0154, "step": 2400 }, { "epoch": 16.853146853146853, "grad_norm": 0.17063836753368378, "learning_rate": 1.2851874974614097e-05, "loss": 0.0132, "step": 2410 }, { "epoch": 16.923076923076923, "grad_norm": 0.16239117085933685, "learning_rate": 1.2289003787452557e-05, "loss": 0.0142, "step": 2420 }, { "epoch": 16.993006993006993, "grad_norm": 0.09241970628499985, "learning_rate": 1.1737928495662964e-05, "loss": 0.0179, "step": 2430 }, { "epoch": 17.062937062937063, "grad_norm": 0.104482501745224, "learning_rate": 1.1198723211121442e-05, "loss": 0.0142, "step": 2440 }, { "epoch": 17.132867132867133, "grad_norm": 0.10794669389724731, "learning_rate": 1.0671460449355075e-05, "loss": 0.014, "step": 2450 }, { "epoch": 17.202797202797203, "grad_norm": 0.11316874623298645, "learning_rate": 1.0156211119789583e-05, "loss": 0.0152, "step": 2460 }, { "epoch": 17.272727272727273, "grad_norm": 0.0924522653222084, "learning_rate": 9.65304451621304e-06, "loss": 0.0113, "step": 2470 }, { "epoch": 17.342657342657343, "grad_norm": 0.08180312067270279, "learning_rate": 9.162028307456771e-06, "loss": 0.0144, "step": 2480 }, { "epoch": 17.412587412587413, "grad_norm": 0.19703391194343567, "learning_rate": 8.683228528294929e-06, "loss": 0.0151, "step": 2490 }, { "epoch": 17.482517482517483, "grad_norm": 0.10454503446817398, "learning_rate": 8.216709570563685e-06, "loss": 0.0144, "step": 2500 }, { "epoch": 17.552447552447553, "grad_norm": 0.12390058487653732, "learning_rate": 7.76253417450149e-06, "loss": 0.0129, "step": 2510 }, { "epoch": 17.622377622377623, "grad_norm": 0.0972568616271019, "learning_rate": 7.320763420311261e-06, "loss": 0.0153, "step": 2520 }, { "epoch": 17.692307692307693, "grad_norm": 0.14459727704524994, "learning_rate": 6.891456719946188e-06, "loss": 0.0151, "step": 2530 }, { "epoch": 17.762237762237763, "grad_norm": 0.15339726209640503, "learning_rate": 6.4746718091194254e-06, "loss": 0.0125, "step": 2540 }, { "epoch": 17.832167832167833, "grad_norm": 0.2148740440607071, "learning_rate": 6.07046473953955e-06, "loss": 0.0126, "step": 2550 }, { "epoch": 17.902097902097903, "grad_norm": 0.08716779202222824, "learning_rate": 5.678889871372428e-06, "loss": 0.0143, "step": 2560 }, { "epoch": 17.972027972027973, "grad_norm": 0.2405886948108673, "learning_rate": 5.299999865930505e-06, "loss": 0.0159, "step": 2570 }, { "epoch": 18.041958041958043, "grad_norm": 0.10744497925043106, "learning_rate": 4.933845678590587e-06, "loss": 0.0121, "step": 2580 }, { "epoch": 18.111888111888113, "grad_norm": 0.24450422823429108, "learning_rate": 4.580476551941037e-06, "loss": 0.0119, "step": 2590 }, { "epoch": 18.181818181818183, "grad_norm": 0.13196663558483124, "learning_rate": 4.2399400091594154e-06, "loss": 0.0146, "step": 2600 }, { "epoch": 18.251748251748253, "grad_norm": 0.0889667496085167, "learning_rate": 3.912281847621213e-06, "loss": 0.0157, "step": 2610 }, { "epoch": 18.32167832167832, "grad_norm": 0.10608735680580139, "learning_rate": 3.597546132740792e-06, "loss": 0.0143, "step": 2620 }, { "epoch": 18.39160839160839, "grad_norm": 0.0715111494064331, "learning_rate": 3.295775192045181e-06, "loss": 0.012, "step": 2630 }, { "epoch": 18.46153846153846, "grad_norm": 0.11866133660078049, "learning_rate": 3.0070096094816037e-06, "loss": 0.0136, "step": 2640 }, { "epoch": 18.53146853146853, "grad_norm": 0.1221456378698349, "learning_rate": 2.7312882199595826e-06, "loss": 0.0143, "step": 2650 }, { "epoch": 18.6013986013986, "grad_norm": 0.11350057274103165, "learning_rate": 2.4686481041280574e-06, "loss": 0.0119, "step": 2660 }, { "epoch": 18.67132867132867, "grad_norm": 0.10500749945640564, "learning_rate": 2.2191245833886987e-06, "loss": 0.0131, "step": 2670 }, { "epoch": 18.74125874125874, "grad_norm": 0.208559051156044, "learning_rate": 1.9827512151456173e-06, "loss": 0.0137, "step": 2680 }, { "epoch": 18.81118881118881, "grad_norm": 0.10356497019529343, "learning_rate": 1.7595597882923309e-06, "loss": 0.0142, "step": 2690 }, { "epoch": 18.88111888111888, "grad_norm": 0.12341023981571198, "learning_rate": 1.549580318936672e-06, "loss": 0.0125, "step": 2700 }, { "epoch": 18.95104895104895, "grad_norm": 0.19302833080291748, "learning_rate": 1.3528410463639728e-06, "loss": 0.0135, "step": 2710 }, { "epoch": 19.02097902097902, "grad_norm": 0.18865394592285156, "learning_rate": 1.1693684292393704e-06, "loss": 0.0122, "step": 2720 }, { "epoch": 19.09090909090909, "grad_norm": 0.08378203958272934, "learning_rate": 9.991871420493736e-07, "loss": 0.0128, "step": 2730 }, { "epoch": 19.16083916083916, "grad_norm": 0.12866666913032532, "learning_rate": 8.423200717835977e-07, "loss": 0.014, "step": 2740 }, { "epoch": 19.23076923076923, "grad_norm": 0.13293299078941345, "learning_rate": 6.987883148567131e-07, "loss": 0.0112, "step": 2750 }, { "epoch": 19.3006993006993, "grad_norm": 0.12522312998771667, "learning_rate": 5.686111742713162e-07, "loss": 0.0119, "step": 2760 }, { "epoch": 19.37062937062937, "grad_norm": 0.10424693673849106, "learning_rate": 4.5180615702192783e-07, "loss": 0.0119, "step": 2770 }, { "epoch": 19.44055944055944, "grad_norm": 0.11369384080171585, "learning_rate": 3.4838897174055417e-07, "loss": 0.0144, "step": 2780 }, { "epoch": 19.51048951048951, "grad_norm": 0.279256671667099, "learning_rate": 2.583735265840992e-07, "loss": 0.0114, "step": 2790 }, { "epoch": 19.58041958041958, "grad_norm": 0.12756973505020142, "learning_rate": 1.8177192736390515e-07, "loss": 0.0124, "step": 2800 }, { "epoch": 19.65034965034965, "grad_norm": 0.13745011389255524, "learning_rate": 1.1859447591769934e-07, "loss": 0.0163, "step": 2810 }, { "epoch": 19.72027972027972, "grad_norm": 0.14598555862903595, "learning_rate": 6.884966872412468e-08, "loss": 0.0127, "step": 2820 }, { "epoch": 19.79020979020979, "grad_norm": 0.08950311690568924, "learning_rate": 3.254419576012024e-08, "loss": 0.0154, "step": 2830 }, { "epoch": 19.86013986013986, "grad_norm": 0.09730339050292969, "learning_rate": 9.682939601185492e-09, "loss": 0.0126, "step": 2840 }, { "epoch": 19.93006993006993, "grad_norm": 0.07458403706550598, "learning_rate": 2.689747647166563e-10, "loss": 0.0124, "step": 2850 }, { "epoch": 19.944055944055943, "step": 2852, "total_flos": 3.989050496377632e+17, "train_loss": 0.03802360156629376, "train_runtime": 3245.0123, "train_samples_per_second": 56.249, "train_steps_per_second": 0.879 } ], "logging_steps": 10, "max_steps": 2852, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.989050496377632e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }