{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2735, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018281535648994516, "grad_norm": 16.712358474731445, "learning_rate": 1.45985401459854e-05, "loss": 1.4822, "step": 10 }, { "epoch": 0.03656307129798903, "grad_norm": 7.676208019256592, "learning_rate": 2.91970802919708e-05, "loss": 0.7397, "step": 20 }, { "epoch": 0.054844606946983544, "grad_norm": 2.2206971645355225, "learning_rate": 4.379562043795621e-05, "loss": 0.4701, "step": 30 }, { "epoch": 0.07312614259597806, "grad_norm": 1.7638039588928223, "learning_rate": 5.83941605839416e-05, "loss": 0.2966, "step": 40 }, { "epoch": 0.09140767824497258, "grad_norm": 1.6052724123001099, "learning_rate": 7.299270072992701e-05, "loss": 0.2162, "step": 50 }, { "epoch": 0.10968921389396709, "grad_norm": 2.617760181427002, "learning_rate": 8.759124087591242e-05, "loss": 0.2019, "step": 60 }, { "epoch": 0.12797074954296161, "grad_norm": 1.7860541343688965, "learning_rate": 0.00010218978102189782, "loss": 0.1457, "step": 70 }, { "epoch": 0.14625228519195613, "grad_norm": 1.4474908113479614, "learning_rate": 0.0001167883211678832, "loss": 0.1387, "step": 80 }, { "epoch": 0.16453382084095064, "grad_norm": 1.5035394430160522, "learning_rate": 0.0001313868613138686, "loss": 0.1363, "step": 90 }, { "epoch": 0.18281535648994515, "grad_norm": 1.4598884582519531, "learning_rate": 0.00014598540145985403, "loss": 0.1124, "step": 100 }, { "epoch": 0.20109689213893966, "grad_norm": 1.7308577299118042, "learning_rate": 0.00016058394160583942, "loss": 0.1215, "step": 110 }, { "epoch": 0.21937842778793418, "grad_norm": 1.5704491138458252, "learning_rate": 0.00017518248175182484, "loss": 0.1332, "step": 120 }, { "epoch": 0.2376599634369287, "grad_norm": 0.7519080638885498, "learning_rate": 0.00018978102189781023, "loss": 0.1015, "step": 130 }, { "epoch": 0.25594149908592323, "grad_norm": 1.0302314758300781, "learning_rate": 0.00019999934198849153, "loss": 0.1043, "step": 140 }, { "epoch": 0.2742230347349177, "grad_norm": 1.1439878940582275, "learning_rate": 0.00019998764424701714, "loss": 0.1105, "step": 150 }, { "epoch": 0.29250457038391225, "grad_norm": 0.8649179935455322, "learning_rate": 0.00019996132599641746, "loss": 0.0969, "step": 160 }, { "epoch": 0.31078610603290674, "grad_norm": 0.9194239377975464, "learning_rate": 0.00019992039108503024, "loss": 0.097, "step": 170 }, { "epoch": 0.3290676416819013, "grad_norm": 0.6259992718696594, "learning_rate": 0.00019986484549848745, "loss": 0.0853, "step": 180 }, { "epoch": 0.3473491773308958, "grad_norm": 1.0033239126205444, "learning_rate": 0.00019979469735884026, "loss": 0.0944, "step": 190 }, { "epoch": 0.3656307129798903, "grad_norm": 1.261385440826416, "learning_rate": 0.00019970995692337114, "loss": 0.1078, "step": 200 }, { "epoch": 0.38391224862888484, "grad_norm": 0.9231658577919006, "learning_rate": 0.00019961063658309418, "loss": 0.0821, "step": 210 }, { "epoch": 0.40219378427787933, "grad_norm": 0.996103048324585, "learning_rate": 0.00019949675086094326, "loss": 0.0911, "step": 220 }, { "epoch": 0.42047531992687387, "grad_norm": 0.9832742810249329, "learning_rate": 0.0001993683164096483, "loss": 0.0692, "step": 230 }, { "epoch": 0.43875685557586835, "grad_norm": 0.6472922563552856, "learning_rate": 0.00019922535200930046, "loss": 0.0706, "step": 240 }, { "epoch": 0.4570383912248629, "grad_norm": 0.5999054312705994, "learning_rate": 0.00019906787856460581, "loss": 0.0731, "step": 250 }, { "epoch": 0.4753199268738574, "grad_norm": 0.667738139629364, "learning_rate": 0.00019889591910182876, "loss": 0.0708, "step": 260 }, { "epoch": 0.4936014625228519, "grad_norm": 0.554964542388916, "learning_rate": 0.0001987094987654251, "loss": 0.0591, "step": 270 }, { "epoch": 0.5118829981718465, "grad_norm": 1.1600011587142944, "learning_rate": 0.00019850864481436514, "loss": 0.0795, "step": 280 }, { "epoch": 0.5301645338208409, "grad_norm": 0.6419970393180847, "learning_rate": 0.00019829338661814797, "loss": 0.0659, "step": 290 }, { "epoch": 0.5484460694698354, "grad_norm": 0.735856831073761, "learning_rate": 0.00019806375565250685, "loss": 0.0724, "step": 300 }, { "epoch": 0.56672760511883, "grad_norm": 0.5395373106002808, "learning_rate": 0.00019781978549480682, "loss": 0.0626, "step": 310 }, { "epoch": 0.5850091407678245, "grad_norm": 0.8947715759277344, "learning_rate": 0.00019756151181913483, "loss": 0.0601, "step": 320 }, { "epoch": 0.603290676416819, "grad_norm": 0.5075414180755615, "learning_rate": 0.00019728897239108342, "loss": 0.0691, "step": 330 }, { "epoch": 0.6215722120658135, "grad_norm": 1.3236219882965088, "learning_rate": 0.00019700220706222858, "loss": 0.0488, "step": 340 }, { "epoch": 0.6398537477148081, "grad_norm": 0.9153704047203064, "learning_rate": 0.00019670125776430228, "loss": 0.0622, "step": 350 }, { "epoch": 0.6581352833638026, "grad_norm": 0.6496918797492981, "learning_rate": 0.00019638616850306133, "loss": 0.0572, "step": 360 }, { "epoch": 0.676416819012797, "grad_norm": 0.6905117034912109, "learning_rate": 0.00019605698535185266, "loss": 0.0506, "step": 370 }, { "epoch": 0.6946983546617916, "grad_norm": 0.6502402424812317, "learning_rate": 0.00019571375644487625, "loss": 0.0528, "step": 380 }, { "epoch": 0.7129798903107861, "grad_norm": 0.7400691509246826, "learning_rate": 0.0001953565319701469, "loss": 0.0674, "step": 390 }, { "epoch": 0.7312614259597806, "grad_norm": 0.5896055698394775, "learning_rate": 0.0001949853641621555, "loss": 0.0471, "step": 400 }, { "epoch": 0.7495429616087751, "grad_norm": 0.4026470482349396, "learning_rate": 0.00019460030729423114, "loss": 0.0512, "step": 410 }, { "epoch": 0.7678244972577697, "grad_norm": 0.47957828640937805, "learning_rate": 0.0001942014176706052, "loss": 0.0629, "step": 420 }, { "epoch": 0.7861060329067642, "grad_norm": 0.4520862400531769, "learning_rate": 0.00019378875361817817, "loss": 0.0533, "step": 430 }, { "epoch": 0.8043875685557587, "grad_norm": 0.4732885956764221, "learning_rate": 0.00019336237547799108, "loss": 0.058, "step": 440 }, { "epoch": 0.8226691042047533, "grad_norm": 0.7703008651733398, "learning_rate": 0.0001929223455964022, "loss": 0.0532, "step": 450 }, { "epoch": 0.8409506398537477, "grad_norm": 0.45097994804382324, "learning_rate": 0.00019246872831597055, "loss": 0.0465, "step": 460 }, { "epoch": 0.8592321755027422, "grad_norm": 0.5736098289489746, "learning_rate": 0.00019200158996604753, "loss": 0.0487, "step": 470 }, { "epoch": 0.8775137111517367, "grad_norm": 0.7237376570701599, "learning_rate": 0.0001915209988530779, "loss": 0.0551, "step": 480 }, { "epoch": 0.8957952468007313, "grad_norm": 0.4645770192146301, "learning_rate": 0.00019102702525061207, "loss": 0.0495, "step": 490 }, { "epoch": 0.9140767824497258, "grad_norm": 0.5169672966003418, "learning_rate": 0.00019051974138903027, "loss": 0.0433, "step": 500 }, { "epoch": 0.9323583180987203, "grad_norm": 0.7457365989685059, "learning_rate": 0.00018999922144498084, "loss": 0.0518, "step": 510 }, { "epoch": 0.9506398537477148, "grad_norm": 0.5059699416160583, "learning_rate": 0.00018946554153053395, "loss": 0.0474, "step": 520 }, { "epoch": 0.9689213893967094, "grad_norm": 0.8174113035202026, "learning_rate": 0.00018891877968205213, "loss": 0.0517, "step": 530 }, { "epoch": 0.9872029250457038, "grad_norm": 0.5508332252502441, "learning_rate": 0.00018835901584877973, "loss": 0.0709, "step": 540 }, { "epoch": 1.0054844606946984, "grad_norm": 0.5709052681922913, "learning_rate": 0.00018778633188115223, "loss": 0.0484, "step": 550 }, { "epoch": 1.023765996343693, "grad_norm": 0.4354308247566223, "learning_rate": 0.0001872008115188281, "loss": 0.0544, "step": 560 }, { "epoch": 1.0420475319926874, "grad_norm": 0.535977303981781, "learning_rate": 0.00018660254037844388, "loss": 0.0562, "step": 570 }, { "epoch": 1.0603290676416819, "grad_norm": 0.2939574420452118, "learning_rate": 0.00018599160594109522, "loss": 0.0489, "step": 580 }, { "epoch": 1.0786106032906764, "grad_norm": 0.3677907884120941, "learning_rate": 0.000185368097539545, "loss": 0.0358, "step": 590 }, { "epoch": 1.0968921389396709, "grad_norm": 0.5382636785507202, "learning_rate": 0.0001847321063451609, "loss": 0.0395, "step": 600 }, { "epoch": 1.1151736745886653, "grad_norm": 0.457963764667511, "learning_rate": 0.00018408372535458397, "loss": 0.0523, "step": 610 }, { "epoch": 1.13345521023766, "grad_norm": 0.5560534000396729, "learning_rate": 0.00018342304937613032, "loss": 0.0531, "step": 620 }, { "epoch": 1.1517367458866545, "grad_norm": 0.6328279376029968, "learning_rate": 0.00018275017501592818, "loss": 0.0452, "step": 630 }, { "epoch": 1.170018281535649, "grad_norm": 0.45685553550720215, "learning_rate": 0.0001820652006637915, "loss": 0.0402, "step": 640 }, { "epoch": 1.1882998171846435, "grad_norm": 0.21566231548786163, "learning_rate": 0.0001813682264788334, "loss": 0.0401, "step": 650 }, { "epoch": 1.206581352833638, "grad_norm": 0.36770665645599365, "learning_rate": 0.00018065935437482037, "loss": 0.04, "step": 660 }, { "epoch": 1.2248628884826325, "grad_norm": 0.4096185863018036, "learning_rate": 0.0001799386880052703, "loss": 0.0352, "step": 670 }, { "epoch": 1.2431444241316272, "grad_norm": 0.4246453642845154, "learning_rate": 0.00017920633274829575, "loss": 0.045, "step": 680 }, { "epoch": 1.2614259597806217, "grad_norm": 0.4160013496875763, "learning_rate": 0.00017846239569119528, "loss": 0.0357, "step": 690 }, { "epoch": 1.2797074954296161, "grad_norm": 0.5409733653068542, "learning_rate": 0.00017770698561479496, "loss": 0.0376, "step": 700 }, { "epoch": 1.2979890310786106, "grad_norm": 0.22224466502666473, "learning_rate": 0.00017694021297754188, "loss": 0.041, "step": 710 }, { "epoch": 1.3162705667276051, "grad_norm": 0.5606803894042969, "learning_rate": 0.00017616218989935272, "loss": 0.0367, "step": 720 }, { "epoch": 1.3345521023765996, "grad_norm": 0.3131175935268402, "learning_rate": 0.00017537303014521918, "loss": 0.0466, "step": 730 }, { "epoch": 1.352833638025594, "grad_norm": 0.37444230914115906, "learning_rate": 0.0001745728491085728, "loss": 0.0401, "step": 740 }, { "epoch": 1.3711151736745886, "grad_norm": 0.6337727308273315, "learning_rate": 0.0001737617637944119, "loss": 0.0505, "step": 750 }, { "epoch": 1.389396709323583, "grad_norm": 0.5669440627098083, "learning_rate": 0.00017293989280219274, "loss": 0.0372, "step": 760 }, { "epoch": 1.4076782449725778, "grad_norm": 0.388346791267395, "learning_rate": 0.00017210735630848745, "loss": 0.035, "step": 770 }, { "epoch": 1.4259597806215722, "grad_norm": 0.5280373096466064, "learning_rate": 0.00017126427604941148, "loss": 0.0466, "step": 780 }, { "epoch": 1.4442413162705667, "grad_norm": 0.565298855304718, "learning_rate": 0.00017041077530282294, "loss": 0.0365, "step": 790 }, { "epoch": 1.4625228519195612, "grad_norm": 0.35680803656578064, "learning_rate": 0.00016954697887029655, "loss": 0.0383, "step": 800 }, { "epoch": 1.4808043875685557, "grad_norm": 0.42788997292518616, "learning_rate": 0.00016867301305887474, "loss": 0.0337, "step": 810 }, { "epoch": 1.4990859232175504, "grad_norm": 0.43233945965766907, "learning_rate": 0.00016778900566259865, "loss": 0.0505, "step": 820 }, { "epoch": 1.517367458866545, "grad_norm": 0.4589940905570984, "learning_rate": 0.0001668950859438216, "loss": 0.0438, "step": 830 }, { "epoch": 1.5356489945155394, "grad_norm": 0.48594310879707336, "learning_rate": 0.00016599138461430814, "loss": 0.0323, "step": 840 }, { "epoch": 1.5539305301645339, "grad_norm": 0.31333279609680176, "learning_rate": 0.00016507803381612076, "loss": 0.0393, "step": 850 }, { "epoch": 1.5722120658135283, "grad_norm": 0.49847719073295593, "learning_rate": 0.00016415516710229766, "loss": 0.0453, "step": 860 }, { "epoch": 1.5904936014625228, "grad_norm": 0.4276566505432129, "learning_rate": 0.00016322291941732442, "loss": 0.0362, "step": 870 }, { "epoch": 1.6087751371115173, "grad_norm": 0.47734275460243225, "learning_rate": 0.0001622814270774018, "loss": 0.0349, "step": 880 }, { "epoch": 1.6270566727605118, "grad_norm": 0.24307364225387573, "learning_rate": 0.00016133082775051313, "loss": 0.0365, "step": 890 }, { "epoch": 1.6453382084095063, "grad_norm": 0.4327755272388458, "learning_rate": 0.00016037126043629422, "loss": 0.0318, "step": 900 }, { "epoch": 1.6636197440585008, "grad_norm": 0.2253831923007965, "learning_rate": 0.0001594028654457083, "loss": 0.0324, "step": 910 }, { "epoch": 1.6819012797074955, "grad_norm": 0.42007511854171753, "learning_rate": 0.0001584257843805293, "loss": 0.0387, "step": 920 }, { "epoch": 1.70018281535649, "grad_norm": 0.5654010772705078, "learning_rate": 0.00015744016011263638, "loss": 0.0461, "step": 930 }, { "epoch": 1.7184643510054844, "grad_norm": 0.5979740619659424, "learning_rate": 0.00015644613676312288, "loss": 0.0288, "step": 940 }, { "epoch": 1.736745886654479, "grad_norm": 0.6250779628753662, "learning_rate": 0.00015544385968122227, "loss": 0.0339, "step": 950 }, { "epoch": 1.7550274223034736, "grad_norm": 0.4420310854911804, "learning_rate": 0.00015443347542305484, "loss": 0.0446, "step": 960 }, { "epoch": 1.7733089579524681, "grad_norm": 0.4242953956127167, "learning_rate": 0.0001534151317301979, "loss": 0.0402, "step": 970 }, { "epoch": 1.7915904936014626, "grad_norm": 0.2853521406650543, "learning_rate": 0.00015238897750808242, "loss": 0.0367, "step": 980 }, { "epoch": 1.809872029250457, "grad_norm": 0.5415486693382263, "learning_rate": 0.00015135516280421945, "loss": 0.0312, "step": 990 }, { "epoch": 1.8281535648994516, "grad_norm": 0.3944428265094757, "learning_rate": 0.00015031383878626016, "loss": 0.0293, "step": 1000 }, { "epoch": 1.846435100548446, "grad_norm": 0.42964455485343933, "learning_rate": 0.00014926515771989104, "loss": 0.0462, "step": 1010 }, { "epoch": 1.8647166361974405, "grad_norm": 0.3574308454990387, "learning_rate": 0.00014820927294656973, "loss": 0.0358, "step": 1020 }, { "epoch": 1.882998171846435, "grad_norm": 0.38193315267562866, "learning_rate": 0.00014714633886110242, "loss": 0.0393, "step": 1030 }, { "epoch": 1.9012797074954295, "grad_norm": 0.4956030249595642, "learning_rate": 0.00014607651088906809, "loss": 0.0312, "step": 1040 }, { "epoch": 1.919561243144424, "grad_norm": 0.4244064688682556, "learning_rate": 0.00014499994546409152, "loss": 0.031, "step": 1050 }, { "epoch": 1.9378427787934185, "grad_norm": 0.46385011076927185, "learning_rate": 0.00014391680000496932, "loss": 0.0424, "step": 1060 }, { "epoch": 1.9561243144424132, "grad_norm": 0.5440361499786377, "learning_rate": 0.0001428272328926512, "loss": 0.0328, "step": 1070 }, { "epoch": 1.9744058500914077, "grad_norm": 0.3221015930175781, "learning_rate": 0.00014173140344708152, "loss": 0.0424, "step": 1080 }, { "epoch": 1.9926873857404022, "grad_norm": 0.520367443561554, "learning_rate": 0.00014062947190390262, "loss": 0.0396, "step": 1090 }, { "epoch": 2.010968921389397, "grad_norm": 0.29480573534965515, "learning_rate": 0.0001395215993910249, "loss": 0.0351, "step": 1100 }, { "epoch": 2.0292504570383914, "grad_norm": 0.35179761052131653, "learning_rate": 0.00013840794790506616, "loss": 0.0271, "step": 1110 }, { "epoch": 2.047531992687386, "grad_norm": 0.377270370721817, "learning_rate": 0.00013728868028766377, "loss": 0.0311, "step": 1120 }, { "epoch": 2.0658135283363803, "grad_norm": 0.4772701859474182, "learning_rate": 0.0001361639602016637, "loss": 0.0372, "step": 1130 }, { "epoch": 2.084095063985375, "grad_norm": 0.30298906564712524, "learning_rate": 0.000135033952107189, "loss": 0.0255, "step": 1140 }, { "epoch": 2.1023765996343693, "grad_norm": 0.39370113611221313, "learning_rate": 0.00013389882123759206, "loss": 0.0327, "step": 1150 }, { "epoch": 2.1206581352833638, "grad_norm": 0.2912181317806244, "learning_rate": 0.00013275873357529368, "loss": 0.0268, "step": 1160 }, { "epoch": 2.1389396709323583, "grad_norm": 0.29357820749282837, "learning_rate": 0.00013161385582751247, "loss": 0.0273, "step": 1170 }, { "epoch": 2.1572212065813527, "grad_norm": 0.3242945075035095, "learning_rate": 0.00013046435540188848, "loss": 0.0296, "step": 1180 }, { "epoch": 2.1755027422303472, "grad_norm": 1.168150544166565, "learning_rate": 0.00012931040038200435, "loss": 0.0416, "step": 1190 }, { "epoch": 2.1937842778793417, "grad_norm": 0.3501128852367401, "learning_rate": 0.00012815215950280753, "loss": 0.0379, "step": 1200 }, { "epoch": 2.212065813528336, "grad_norm": 0.46127256751060486, "learning_rate": 0.0001269898021259373, "loss": 0.0372, "step": 1210 }, { "epoch": 2.2303473491773307, "grad_norm": 0.4480052888393402, "learning_rate": 0.0001258234982149604, "loss": 0.0366, "step": 1220 }, { "epoch": 2.2486288848263256, "grad_norm": 0.38535383343696594, "learning_rate": 0.0001246534183105181, "loss": 0.0289, "step": 1230 }, { "epoch": 2.26691042047532, "grad_norm": 0.39918404817581177, "learning_rate": 0.00012347973350538936, "loss": 0.029, "step": 1240 }, { "epoch": 2.2851919561243146, "grad_norm": 0.27928563952445984, "learning_rate": 0.00012230261541947316, "loss": 0.0262, "step": 1250 }, { "epoch": 2.303473491773309, "grad_norm": 0.43867453932762146, "learning_rate": 0.00012112223617469372, "loss": 0.0227, "step": 1260 }, { "epoch": 2.3217550274223036, "grad_norm": 0.3848976194858551, "learning_rate": 0.00011993876836983198, "loss": 0.0251, "step": 1270 }, { "epoch": 2.340036563071298, "grad_norm": 0.3365519046783447, "learning_rate": 0.0001187523850552881, "loss": 0.0345, "step": 1280 }, { "epoch": 2.3583180987202925, "grad_norm": 0.3406737446784973, "learning_rate": 0.00011756325970777717, "loss": 0.0273, "step": 1290 }, { "epoch": 2.376599634369287, "grad_norm": 0.28142690658569336, "learning_rate": 0.00011637156620496308, "loss": 0.0275, "step": 1300 }, { "epoch": 2.3948811700182815, "grad_norm": 0.36976391077041626, "learning_rate": 0.00011517747880003335, "loss": 0.0243, "step": 1310 }, { "epoch": 2.413162705667276, "grad_norm": 0.22825664281845093, "learning_rate": 0.00011398117209621966, "loss": 0.0278, "step": 1320 }, { "epoch": 2.4314442413162705, "grad_norm": 0.3394540548324585, "learning_rate": 0.00011278282102126633, "loss": 0.0357, "step": 1330 }, { "epoch": 2.449725776965265, "grad_norm": 0.26682886481285095, "learning_rate": 0.00011158260080185226, "loss": 0.0407, "step": 1340 }, { "epoch": 2.4680073126142594, "grad_norm": 0.23459388315677643, "learning_rate": 0.00011038068693796846, "loss": 0.0263, "step": 1350 }, { "epoch": 2.4862888482632544, "grad_norm": 0.32797005772590637, "learning_rate": 0.00010917725517725608, "loss": 0.0354, "step": 1360 }, { "epoch": 2.504570383912249, "grad_norm": 0.2672847509384155, "learning_rate": 0.00010797248148930783, "loss": 0.0203, "step": 1370 }, { "epoch": 2.5228519195612433, "grad_norm": 0.34542036056518555, "learning_rate": 0.00010676654203993732, "loss": 0.0246, "step": 1380 }, { "epoch": 2.541133455210238, "grad_norm": 0.5064176321029663, "learning_rate": 0.00010555961316541946, "loss": 0.0276, "step": 1390 }, { "epoch": 2.5594149908592323, "grad_norm": 0.34542617201805115, "learning_rate": 0.00010435187134670607, "loss": 0.0238, "step": 1400 }, { "epoch": 2.577696526508227, "grad_norm": 0.3336438238620758, "learning_rate": 0.00010314349318362015, "loss": 0.0353, "step": 1410 }, { "epoch": 2.5959780621572213, "grad_norm": 0.22887752950191498, "learning_rate": 0.00010193465536903307, "loss": 0.028, "step": 1420 }, { "epoch": 2.6142595978062158, "grad_norm": 0.1399448662996292, "learning_rate": 0.00010072553466302784, "loss": 0.028, "step": 1430 }, { "epoch": 2.6325411334552102, "grad_norm": 0.36335644125938416, "learning_rate": 9.951630786705279e-05, "loss": 0.0196, "step": 1440 }, { "epoch": 2.6508226691042047, "grad_norm": 0.22947153449058533, "learning_rate": 9.830715179806905e-05, "loss": 0.0275, "step": 1450 }, { "epoch": 2.669104204753199, "grad_norm": 0.21563003957271576, "learning_rate": 9.709824326269576e-05, "loss": 0.0216, "step": 1460 }, { "epoch": 2.6873857404021937, "grad_norm": 0.3260309100151062, "learning_rate": 9.5889759031357e-05, "loss": 0.018, "step": 1470 }, { "epoch": 2.705667276051188, "grad_norm": 0.15418443083763123, "learning_rate": 9.468187581243378e-05, "loss": 0.0244, "step": 1480 }, { "epoch": 2.7239488117001827, "grad_norm": 0.2873231768608093, "learning_rate": 9.347477022642503e-05, "loss": 0.0186, "step": 1490 }, { "epoch": 2.742230347349177, "grad_norm": 0.2715139091014862, "learning_rate": 9.226861878012197e-05, "loss": 0.0273, "step": 1500 }, { "epoch": 2.7605118829981716, "grad_norm": 0.17074620723724365, "learning_rate": 9.106359784079832e-05, "loss": 0.0174, "step": 1510 }, { "epoch": 2.778793418647166, "grad_norm": 0.2897492051124573, "learning_rate": 8.985988361042153e-05, "loss": 0.0283, "step": 1520 }, { "epoch": 2.797074954296161, "grad_norm": 0.5155644416809082, "learning_rate": 8.8657652099888e-05, "loss": 0.0216, "step": 1530 }, { "epoch": 2.8153564899451555, "grad_norm": 0.33276352286338806, "learning_rate": 8.745707910328615e-05, "loss": 0.0245, "step": 1540 }, { "epoch": 2.83363802559415, "grad_norm": 0.4756206274032593, "learning_rate": 8.625834017219113e-05, "loss": 0.0303, "step": 1550 }, { "epoch": 2.8519195612431445, "grad_norm": 0.2755451202392578, "learning_rate": 8.506161058999541e-05, "loss": 0.0199, "step": 1560 }, { "epoch": 2.870201096892139, "grad_norm": 0.26369351148605347, "learning_rate": 8.386706534627805e-05, "loss": 0.0204, "step": 1570 }, { "epoch": 2.8884826325411335, "grad_norm": 0.2358650118112564, "learning_rate": 8.267487911121715e-05, "loss": 0.0211, "step": 1580 }, { "epoch": 2.906764168190128, "grad_norm": 0.22182169556617737, "learning_rate": 8.148522621004926e-05, "loss": 0.0233, "step": 1590 }, { "epoch": 2.9250457038391224, "grad_norm": 0.30960527062416077, "learning_rate": 8.029828059757875e-05, "loss": 0.0243, "step": 1600 }, { "epoch": 2.943327239488117, "grad_norm": 0.38207757472991943, "learning_rate": 7.91142158327417e-05, "loss": 0.0295, "step": 1610 }, { "epoch": 2.9616087751371114, "grad_norm": 0.24521781504154205, "learning_rate": 7.793320505322761e-05, "loss": 0.0206, "step": 1620 }, { "epoch": 2.979890310786106, "grad_norm": 0.3253994286060333, "learning_rate": 7.675542095016256e-05, "loss": 0.026, "step": 1630 }, { "epoch": 2.998171846435101, "grad_norm": 0.3253840208053589, "learning_rate": 7.558103574285779e-05, "loss": 0.0219, "step": 1640 }, { "epoch": 3.016453382084095, "grad_norm": 0.2342890352010727, "learning_rate": 7.441022115362729e-05, "loss": 0.0181, "step": 1650 }, { "epoch": 3.03473491773309, "grad_norm": 0.2249564677476883, "learning_rate": 7.324314838267796e-05, "loss": 0.0228, "step": 1660 }, { "epoch": 3.0530164533820843, "grad_norm": 0.24722999334335327, "learning_rate": 7.207998808307628e-05, "loss": 0.018, "step": 1670 }, { "epoch": 3.0712979890310788, "grad_norm": 0.22779327630996704, "learning_rate": 7.092091033579475e-05, "loss": 0.0193, "step": 1680 }, { "epoch": 3.0895795246800732, "grad_norm": 0.34452179074287415, "learning_rate": 6.976608462484226e-05, "loss": 0.0327, "step": 1690 }, { "epoch": 3.1078610603290677, "grad_norm": 0.30508124828338623, "learning_rate": 6.861567981248142e-05, "loss": 0.0261, "step": 1700 }, { "epoch": 3.126142595978062, "grad_norm": 0.319670706987381, "learning_rate": 6.746986411453717e-05, "loss": 0.0189, "step": 1710 }, { "epoch": 3.1444241316270567, "grad_norm": 0.35580283403396606, "learning_rate": 6.632880507579957e-05, "loss": 0.0242, "step": 1720 }, { "epoch": 3.162705667276051, "grad_norm": 0.3020285964012146, "learning_rate": 6.519266954552502e-05, "loss": 0.0176, "step": 1730 }, { "epoch": 3.1809872029250457, "grad_norm": 0.27105554938316345, "learning_rate": 6.406162365303882e-05, "loss": 0.0268, "step": 1740 }, { "epoch": 3.19926873857404, "grad_norm": 0.20928241312503815, "learning_rate": 6.293583278344361e-05, "loss": 0.0206, "step": 1750 }, { "epoch": 3.2175502742230346, "grad_norm": 0.2314785271883011, "learning_rate": 6.181546155343579e-05, "loss": 0.0198, "step": 1760 }, { "epoch": 3.235831809872029, "grad_norm": 0.2732461988925934, "learning_rate": 6.070067378723501e-05, "loss": 0.0177, "step": 1770 }, { "epoch": 3.2541133455210236, "grad_norm": 0.17697979509830475, "learning_rate": 5.959163249262913e-05, "loss": 0.0155, "step": 1780 }, { "epoch": 3.272394881170018, "grad_norm": 0.24429567158222198, "learning_rate": 5.848849983713894e-05, "loss": 0.0212, "step": 1790 }, { "epoch": 3.2906764168190126, "grad_norm": 0.36660096049308777, "learning_rate": 5.739143712430521e-05, "loss": 0.0281, "step": 1800 }, { "epoch": 3.3089579524680075, "grad_norm": 0.2895634174346924, "learning_rate": 5.630060477010253e-05, "loss": 0.018, "step": 1810 }, { "epoch": 3.327239488117002, "grad_norm": 0.3412606418132782, "learning_rate": 5.5216162279482964e-05, "loss": 0.0134, "step": 1820 }, { "epoch": 3.3455210237659965, "grad_norm": 0.22716091573238373, "learning_rate": 5.4138268223052326e-05, "loss": 0.016, "step": 1830 }, { "epoch": 3.363802559414991, "grad_norm": 0.24945920705795288, "learning_rate": 5.306708021388378e-05, "loss": 0.0208, "step": 1840 }, { "epoch": 3.3820840950639854, "grad_norm": 0.24487105011940002, "learning_rate": 5.200275488447104e-05, "loss": 0.018, "step": 1850 }, { "epoch": 3.40036563071298, "grad_norm": 0.24816852807998657, "learning_rate": 5.094544786382522e-05, "loss": 0.0159, "step": 1860 }, { "epoch": 3.4186471663619744, "grad_norm": 0.1848219782114029, "learning_rate": 4.989531375471805e-05, "loss": 0.0142, "step": 1870 }, { "epoch": 3.436928702010969, "grad_norm": 0.19923894107341766, "learning_rate": 4.885250611107558e-05, "loss": 0.0214, "step": 1880 }, { "epoch": 3.4552102376599634, "grad_norm": 0.1752861738204956, "learning_rate": 4.7817177415524796e-05, "loss": 0.0198, "step": 1890 }, { "epoch": 3.473491773308958, "grad_norm": 0.3053307831287384, "learning_rate": 4.678947905709744e-05, "loss": 0.0225, "step": 1900 }, { "epoch": 3.4917733089579523, "grad_norm": 0.19800381362438202, "learning_rate": 4.576956130909317e-05, "loss": 0.016, "step": 1910 }, { "epoch": 3.510054844606947, "grad_norm": 0.1873503029346466, "learning_rate": 4.475757330710621e-05, "loss": 0.0144, "step": 1920 }, { "epoch": 3.5283363802559418, "grad_norm": 0.23367895185947418, "learning_rate": 4.375366302721825e-05, "loss": 0.0161, "step": 1930 }, { "epoch": 3.5466179159049362, "grad_norm": 0.17103944718837738, "learning_rate": 4.2757977264361046e-05, "loss": 0.0146, "step": 1940 }, { "epoch": 3.5648994515539307, "grad_norm": 0.2473006546497345, "learning_rate": 4.177066161085148e-05, "loss": 0.0184, "step": 1950 }, { "epoch": 3.583180987202925, "grad_norm": 0.31398236751556396, "learning_rate": 4.0791860435102524e-05, "loss": 0.0146, "step": 1960 }, { "epoch": 3.6014625228519197, "grad_norm": 0.33835136890411377, "learning_rate": 3.982171686051334e-05, "loss": 0.021, "step": 1970 }, { "epoch": 3.619744058500914, "grad_norm": 0.16258537769317627, "learning_rate": 3.8860372744541407e-05, "loss": 0.0196, "step": 1980 }, { "epoch": 3.6380255941499087, "grad_norm": 0.3083174228668213, "learning_rate": 3.790796865795947e-05, "loss": 0.0152, "step": 1990 }, { "epoch": 3.656307129798903, "grad_norm": 0.21282333135604858, "learning_rate": 3.696464386430093e-05, "loss": 0.0215, "step": 2000 }, { "epoch": 3.6745886654478976, "grad_norm": 0.20800185203552246, "learning_rate": 3.6030536299496395e-05, "loss": 0.0155, "step": 2010 }, { "epoch": 3.692870201096892, "grad_norm": 0.251663476228714, "learning_rate": 3.5105782551704145e-05, "loss": 0.0222, "step": 2020 }, { "epoch": 3.7111517367458866, "grad_norm": 0.24097998440265656, "learning_rate": 3.419051784133773e-05, "loss": 0.0142, "step": 2030 }, { "epoch": 3.729433272394881, "grad_norm": 0.18417520821094513, "learning_rate": 3.328487600129371e-05, "loss": 0.0147, "step": 2040 }, { "epoch": 3.7477148080438756, "grad_norm": 0.18106205761432648, "learning_rate": 3.2388989457382126e-05, "loss": 0.0125, "step": 2050 }, { "epoch": 3.76599634369287, "grad_norm": 0.14622414112091064, "learning_rate": 3.1502989208962855e-05, "loss": 0.0151, "step": 2060 }, { "epoch": 3.7842778793418645, "grad_norm": 0.29628556966781616, "learning_rate": 3.062700480979046e-05, "loss": 0.0206, "step": 2070 }, { "epoch": 3.802559414990859, "grad_norm": 0.26881730556488037, "learning_rate": 2.9761164349070315e-05, "loss": 0.0176, "step": 2080 }, { "epoch": 3.8208409506398535, "grad_norm": 0.4180646240711212, "learning_rate": 2.8905594432729055e-05, "loss": 0.0179, "step": 2090 }, { "epoch": 3.839122486288848, "grad_norm": 0.25500163435935974, "learning_rate": 2.8060420164902012e-05, "loss": 0.0142, "step": 2100 }, { "epoch": 3.857404021937843, "grad_norm": 0.21968974173069, "learning_rate": 2.7225765129639836e-05, "loss": 0.0161, "step": 2110 }, { "epoch": 3.8756855575868374, "grad_norm": 0.24668078124523163, "learning_rate": 2.6401751372837813e-05, "loss": 0.0217, "step": 2120 }, { "epoch": 3.893967093235832, "grad_norm": 0.2258848249912262, "learning_rate": 2.5588499384389865e-05, "loss": 0.0178, "step": 2130 }, { "epoch": 3.9122486288848264, "grad_norm": 0.1784961074590683, "learning_rate": 2.478612808057018e-05, "loss": 0.0114, "step": 2140 }, { "epoch": 3.930530164533821, "grad_norm": 0.28832298517227173, "learning_rate": 2.3994754786644923e-05, "loss": 0.0109, "step": 2150 }, { "epoch": 3.9488117001828154, "grad_norm": 0.12029292434453964, "learning_rate": 2.3214495219716436e-05, "loss": 0.0211, "step": 2160 }, { "epoch": 3.96709323583181, "grad_norm": 0.19231897592544556, "learning_rate": 2.2445463471802785e-05, "loss": 0.0098, "step": 2170 }, { "epoch": 3.9853747714808043, "grad_norm": 0.08982887864112854, "learning_rate": 2.1687771993155004e-05, "loss": 0.0077, "step": 2180 }, { "epoch": 4.003656307129799, "grad_norm": 0.21466206014156342, "learning_rate": 2.0941531575813988e-05, "loss": 0.0159, "step": 2190 }, { "epoch": 4.021937842778794, "grad_norm": 0.17917244136333466, "learning_rate": 2.0206851337410415e-05, "loss": 0.0139, "step": 2200 }, { "epoch": 4.040219378427788, "grad_norm": 0.08883915841579437, "learning_rate": 1.9483838705209012e-05, "loss": 0.0152, "step": 2210 }, { "epoch": 4.058500914076783, "grad_norm": 0.16674405336380005, "learning_rate": 1.8772599400400258e-05, "loss": 0.0196, "step": 2220 }, { "epoch": 4.076782449725777, "grad_norm": 0.10342701524496078, "learning_rate": 1.807323742264162e-05, "loss": 0.0161, "step": 2230 }, { "epoch": 4.095063985374772, "grad_norm": 0.1896440088748932, "learning_rate": 1.7385855034850184e-05, "loss": 0.0122, "step": 2240 }, { "epoch": 4.113345521023766, "grad_norm": 0.16498374938964844, "learning_rate": 1.6710552748249598e-05, "loss": 0.0133, "step": 2250 }, { "epoch": 4.131627056672761, "grad_norm": 0.17953291535377502, "learning_rate": 1.604742930767298e-05, "loss": 0.0219, "step": 2260 }, { "epoch": 4.149908592321755, "grad_norm": 0.18694134056568146, "learning_rate": 1.5396581677124124e-05, "loss": 0.0169, "step": 2270 }, { "epoch": 4.16819012797075, "grad_norm": 0.17348328232765198, "learning_rate": 1.4758105025599068e-05, "loss": 0.0159, "step": 2280 }, { "epoch": 4.186471663619744, "grad_norm": 0.16517849266529083, "learning_rate": 1.4132092713170242e-05, "loss": 0.0137, "step": 2290 }, { "epoch": 4.204753199268739, "grad_norm": 0.13645470142364502, "learning_rate": 1.3518636277335084e-05, "loss": 0.0149, "step": 2300 }, { "epoch": 4.223034734917733, "grad_norm": 0.14027458429336548, "learning_rate": 1.291782541963107e-05, "loss": 0.0147, "step": 2310 }, { "epoch": 4.2413162705667276, "grad_norm": 0.11632464081048965, "learning_rate": 1.2329747992519269e-05, "loss": 0.0137, "step": 2320 }, { "epoch": 4.259597806215722, "grad_norm": 0.2426212579011917, "learning_rate": 1.1754489986538419e-05, "loss": 0.0117, "step": 2330 }, { "epoch": 4.2778793418647165, "grad_norm": 0.20155277848243713, "learning_rate": 1.1192135517730884e-05, "loss": 0.0147, "step": 2340 }, { "epoch": 4.296160877513711, "grad_norm": 0.14590322971343994, "learning_rate": 1.0642766815343196e-05, "loss": 0.0119, "step": 2350 }, { "epoch": 4.3144424131627055, "grad_norm": 0.17194287478923798, "learning_rate": 1.0106464209802013e-05, "loss": 0.0115, "step": 2360 }, { "epoch": 4.3327239488117, "grad_norm": 0.243038609623909, "learning_rate": 9.583306120968072e-06, "loss": 0.0153, "step": 2370 }, { "epoch": 4.3510054844606945, "grad_norm": 0.29729005694389343, "learning_rate": 9.0733690466694e-06, "loss": 0.0136, "step": 2380 }, { "epoch": 4.369287020109689, "grad_norm": 0.1595577597618103, "learning_rate": 8.576727551515474e-06, "loss": 0.0156, "step": 2390 }, { "epoch": 4.387568555758683, "grad_norm": 0.12783007323741913, "learning_rate": 8.093454255994248e-06, "loss": 0.0122, "step": 2400 }, { "epoch": 4.405850091407678, "grad_norm": 0.26608356833457947, "learning_rate": 7.6236198258532675e-06, "loss": 0.0136, "step": 2410 }, { "epoch": 4.424131627056672, "grad_norm": 0.1889527142047882, "learning_rate": 7.167292961766725e-06, "loss": 0.015, "step": 2420 }, { "epoch": 4.442413162705667, "grad_norm": 0.2580418884754181, "learning_rate": 6.724540389289913e-06, "loss": 0.0132, "step": 2430 }, { "epoch": 4.460694698354661, "grad_norm": 0.22082190215587616, "learning_rate": 6.295426849102271e-06, "loss": 0.0113, "step": 2440 }, { "epoch": 4.478976234003657, "grad_norm": 0.11176195740699768, "learning_rate": 5.8800150875408574e-06, "loss": 0.0141, "step": 2450 }, { "epoch": 4.497257769652651, "grad_norm": 0.1779015064239502, "learning_rate": 5.478365847425449e-06, "loss": 0.0113, "step": 2460 }, { "epoch": 4.515539305301646, "grad_norm": 0.15661382675170898, "learning_rate": 5.090537859176425e-06, "loss": 0.0102, "step": 2470 }, { "epoch": 4.53382084095064, "grad_norm": 0.21932142972946167, "learning_rate": 4.716587832227071e-06, "loss": 0.0147, "step": 2480 }, { "epoch": 4.552102376599635, "grad_norm": 0.30200353264808655, "learning_rate": 4.356570446731356e-06, "loss": 0.0152, "step": 2490 }, { "epoch": 4.570383912248629, "grad_norm": 0.11431296914815903, "learning_rate": 4.010538345568371e-06, "loss": 0.017, "step": 2500 }, { "epoch": 4.588665447897624, "grad_norm": 0.2187824845314026, "learning_rate": 3.678542126644813e-06, "loss": 0.0168, "step": 2510 }, { "epoch": 4.606946983546618, "grad_norm": 0.12425347417593002, "learning_rate": 3.360630335496362e-06, "loss": 0.0113, "step": 2520 }, { "epoch": 4.625228519195613, "grad_norm": 0.17450736463069916, "learning_rate": 3.056849458189115e-06, "loss": 0.015, "step": 2530 }, { "epoch": 4.643510054844607, "grad_norm": 0.2220509946346283, "learning_rate": 2.7672439145223773e-06, "loss": 0.0196, "step": 2540 }, { "epoch": 4.661791590493602, "grad_norm": 0.2917903959751129, "learning_rate": 2.491856051533392e-06, "loss": 0.0165, "step": 2550 }, { "epoch": 4.680073126142596, "grad_norm": 0.22880949079990387, "learning_rate": 2.230726137305206e-06, "loss": 0.0165, "step": 2560 }, { "epoch": 4.698354661791591, "grad_norm": 0.2307160645723343, "learning_rate": 1.983892355078587e-06, "loss": 0.0129, "step": 2570 }, { "epoch": 4.716636197440585, "grad_norm": 0.1975175142288208, "learning_rate": 1.7513907976687283e-06, "loss": 0.016, "step": 2580 }, { "epoch": 4.7349177330895795, "grad_norm": 0.23436793684959412, "learning_rate": 1.533255462187666e-06, "loss": 0.0108, "step": 2590 }, { "epoch": 4.753199268738574, "grad_norm": 0.14805355668067932, "learning_rate": 1.329518245073047e-06, "loss": 0.0182, "step": 2600 }, { "epoch": 4.7714808043875685, "grad_norm": 0.1988326609134674, "learning_rate": 1.1402089374242365e-06, "loss": 0.0119, "step": 2610 }, { "epoch": 4.789762340036563, "grad_norm": 0.12207505851984024, "learning_rate": 9.65355220646036e-07, "loss": 0.0128, "step": 2620 }, { "epoch": 4.8080438756855575, "grad_norm": 0.1775001883506775, "learning_rate": 8.049826624011881e-07, "loss": 0.0166, "step": 2630 }, { "epoch": 4.826325411334552, "grad_norm": 0.2577812075614929, "learning_rate": 6.591147128716224e-07, "loss": 0.0191, "step": 2640 }, { "epoch": 4.844606946983546, "grad_norm": 0.1870380938053131, "learning_rate": 5.277727013296097e-07, "loss": 0.0125, "step": 2650 }, { "epoch": 4.862888482632541, "grad_norm": 0.22090613842010498, "learning_rate": 4.1097583301888954e-07, "loss": 0.009, "step": 2660 }, { "epoch": 4.881170018281535, "grad_norm": 0.25381821393966675, "learning_rate": 3.0874118634640626e-07, "loss": 0.0158, "step": 2670 }, { "epoch": 4.89945155393053, "grad_norm": 0.25577688217163086, "learning_rate": 2.210837103850949e-07, "loss": 0.0074, "step": 2680 }, { "epoch": 4.917733089579524, "grad_norm": 0.1378592997789383, "learning_rate": 1.4801622268791892e-07, "loss": 0.0104, "step": 2690 }, { "epoch": 4.936014625228519, "grad_norm": 0.1672651469707489, "learning_rate": 8.954940741369155e-08, "loss": 0.0126, "step": 2700 }, { "epoch": 4.954296160877513, "grad_norm": 0.10131768137216568, "learning_rate": 4.5691813764803247e-08, "loss": 0.0093, "step": 2710 }, { "epoch": 4.972577696526509, "grad_norm": 0.19686748087406158, "learning_rate": 1.644985473709948e-08, "loss": 0.0132, "step": 2720 }, { "epoch": 4.990859232175502, "grad_norm": 0.16079658269882202, "learning_rate": 1.8278061821863646e-09, "loss": 0.0096, "step": 2730 }, { "epoch": 5.0, "step": 2735, "total_flos": 9.752547304210464e+16, "train_loss": 0.045771664261164136, "train_runtime": 1237.486, "train_samples_per_second": 35.362, "train_steps_per_second": 2.21 } ], "logging_steps": 10, "max_steps": 2735, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.752547304210464e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }