{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990182603573532, "eval_steps": 500, "global_step": 636, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007853917141174161, "grad_norm": 9.454776697912866, "learning_rate": 1.25e-06, "loss": 1.9975, "step": 5 }, { "epoch": 0.015707834282348322, "grad_norm": 7.36591132940049, "learning_rate": 2.8125e-06, "loss": 1.8813, "step": 10 }, { "epoch": 0.023561751423522483, "grad_norm": 7.154615745335707, "learning_rate": 4.3750000000000005e-06, "loss": 1.4274, "step": 15 }, { "epoch": 0.031415668564696644, "grad_norm": 8.822500376414714, "learning_rate": 5.9375e-06, "loss": 0.8007, "step": 20 }, { "epoch": 0.0392695857058708, "grad_norm": 1.5626818502455462, "learning_rate": 7.500000000000001e-06, "loss": 0.2708, "step": 25 }, { "epoch": 0.04712350284704497, "grad_norm": 0.95138015574769, "learning_rate": 9.0625e-06, "loss": 0.1895, "step": 30 }, { "epoch": 0.054977419988219124, "grad_norm": 0.632172201703669, "learning_rate": 1.0625e-05, "loss": 0.161, "step": 35 }, { "epoch": 0.06283133712939329, "grad_norm": 0.6087184175934243, "learning_rate": 1.2187500000000001e-05, "loss": 0.1514, "step": 40 }, { "epoch": 0.07068525427056745, "grad_norm": 0.43002905751035403, "learning_rate": 1.375e-05, "loss": 0.1405, "step": 45 }, { "epoch": 0.0785391714117416, "grad_norm": 0.4316303477126074, "learning_rate": 1.5312500000000003e-05, "loss": 0.1323, "step": 50 }, { "epoch": 0.08639308855291576, "grad_norm": 0.41295406365002396, "learning_rate": 1.6875e-05, "loss": 0.1232, "step": 55 }, { "epoch": 0.09424700569408993, "grad_norm": 0.3869106906435148, "learning_rate": 1.84375e-05, "loss": 0.1185, "step": 60 }, { "epoch": 0.10210092283526409, "grad_norm": 0.38396751864248035, "learning_rate": 2e-05, "loss": 0.1127, "step": 65 }, { "epoch": 0.10995483997643825, "grad_norm": 0.31600318922334614, "learning_rate": 1.9996229574120564e-05, "loss": 0.1141, "step": 70 }, { "epoch": 0.1178087571176124, "grad_norm": 0.2807626851547414, "learning_rate": 1.998492113970451e-05, "loss": 0.1091, "step": 75 }, { "epoch": 0.12566267425878658, "grad_norm": 0.2712214079154975, "learning_rate": 1.99660832242746e-05, "loss": 0.1062, "step": 80 }, { "epoch": 0.13351659139996072, "grad_norm": 0.3514322765970332, "learning_rate": 1.99397300332236e-05, "loss": 0.1043, "step": 85 }, { "epoch": 0.1413705085411349, "grad_norm": 0.2565181736697417, "learning_rate": 1.9905881439102222e-05, "loss": 0.1012, "step": 90 }, { "epoch": 0.14922442568230906, "grad_norm": 0.22648554493196996, "learning_rate": 1.9864562966633517e-05, "loss": 0.1091, "step": 95 }, { "epoch": 0.1570783428234832, "grad_norm": 0.3160797286348635, "learning_rate": 1.9815805773465064e-05, "loss": 0.1003, "step": 100 }, { "epoch": 0.16493225996465738, "grad_norm": 0.24530695236987562, "learning_rate": 1.9759646626673445e-05, "loss": 0.1054, "step": 105 }, { "epoch": 0.17278617710583152, "grad_norm": 0.3308392923974687, "learning_rate": 1.9696127875038753e-05, "loss": 0.1021, "step": 110 }, { "epoch": 0.1806400942470057, "grad_norm": 0.281121787116649, "learning_rate": 1.9625297417109982e-05, "loss": 0.0967, "step": 115 }, { "epoch": 0.18849401138817987, "grad_norm": 0.20020373984472567, "learning_rate": 1.954720866508546e-05, "loss": 0.1, "step": 120 }, { "epoch": 0.196347928529354, "grad_norm": 0.27280707245432356, "learning_rate": 1.946192050453549e-05, "loss": 0.0999, "step": 125 }, { "epoch": 0.20420184567052818, "grad_norm": 0.24726594932319515, "learning_rate": 1.936949724999762e-05, "loss": 0.0947, "step": 130 }, { "epoch": 0.21205576281170233, "grad_norm": 0.2383332440997248, "learning_rate": 1.9270008596478008e-05, "loss": 0.0982, "step": 135 }, { "epoch": 0.2199096799528765, "grad_norm": 0.2203997646125469, "learning_rate": 1.916352956689544e-05, "loss": 0.1017, "step": 140 }, { "epoch": 0.22776359709405067, "grad_norm": 0.2362083987436209, "learning_rate": 1.905014045550767e-05, "loss": 0.0987, "step": 145 }, { "epoch": 0.2356175142352248, "grad_norm": 0.23431067680898818, "learning_rate": 1.89299267673627e-05, "loss": 0.1055, "step": 150 }, { "epoch": 0.24347143137639898, "grad_norm": 0.21183859562424995, "learning_rate": 1.88029791538207e-05, "loss": 0.0972, "step": 155 }, { "epoch": 0.25132534851757315, "grad_norm": 0.22397328492306617, "learning_rate": 1.8669393344195154e-05, "loss": 0.1009, "step": 160 }, { "epoch": 0.2591792656587473, "grad_norm": 0.22197633586551052, "learning_rate": 1.852927007356481e-05, "loss": 0.0936, "step": 165 }, { "epoch": 0.26703318279992144, "grad_norm": 0.21347022910458285, "learning_rate": 1.8382715006810853e-05, "loss": 0.0957, "step": 170 }, { "epoch": 0.2748870999410956, "grad_norm": 0.19957938768669062, "learning_rate": 1.8229838658936566e-05, "loss": 0.0873, "step": 175 }, { "epoch": 0.2827410170822698, "grad_norm": 0.21372555100198343, "learning_rate": 1.807075631172963e-05, "loss": 0.0931, "step": 180 }, { "epoch": 0.29059493422344396, "grad_norm": 0.24190910676139596, "learning_rate": 1.7905587926829815e-05, "loss": 0.0954, "step": 185 }, { "epoch": 0.29844885136461813, "grad_norm": 0.2348573879122653, "learning_rate": 1.77344580552677e-05, "loss": 0.092, "step": 190 }, { "epoch": 0.30630276850579224, "grad_norm": 0.21204667498616597, "learning_rate": 1.7557495743542586e-05, "loss": 0.0865, "step": 195 }, { "epoch": 0.3141566856469664, "grad_norm": 0.23025633594185083, "learning_rate": 1.7374834436310427e-05, "loss": 0.0892, "step": 200 }, { "epoch": 0.3220106027881406, "grad_norm": 0.1958893222829977, "learning_rate": 1.7186611875755227e-05, "loss": 0.0896, "step": 205 }, { "epoch": 0.32986451992931476, "grad_norm": 0.22589475386438357, "learning_rate": 1.6992969997719658e-05, "loss": 0.0945, "step": 210 }, { "epoch": 0.33771843707048893, "grad_norm": 0.20789776750177996, "learning_rate": 1.679405482467338e-05, "loss": 0.0904, "step": 215 }, { "epoch": 0.34557235421166305, "grad_norm": 0.22053512123077823, "learning_rate": 1.6590016355599653e-05, "loss": 0.0882, "step": 220 }, { "epoch": 0.3534262713528372, "grad_norm": 0.20437871058955417, "learning_rate": 1.638100845288331e-05, "loss": 0.0943, "step": 225 }, { "epoch": 0.3612801884940114, "grad_norm": 0.2090640789388261, "learning_rate": 1.6167188726285433e-05, "loss": 0.0924, "step": 230 }, { "epoch": 0.36913410563518556, "grad_norm": 0.21993457381708645, "learning_rate": 1.5948718414092163e-05, "loss": 0.082, "step": 235 }, { "epoch": 0.37698802277635973, "grad_norm": 0.21951962440906275, "learning_rate": 1.5725762261527295e-05, "loss": 0.0918, "step": 240 }, { "epoch": 0.38484193991753385, "grad_norm": 0.19872240145820674, "learning_rate": 1.549848839652035e-05, "loss": 0.0955, "step": 245 }, { "epoch": 0.392695857058708, "grad_norm": 0.2184861301177773, "learning_rate": 1.5267068202923802e-05, "loss": 0.0899, "step": 250 }, { "epoch": 0.4005497741998822, "grad_norm": 0.21059689355945896, "learning_rate": 1.503167619127504e-05, "loss": 0.0919, "step": 255 }, { "epoch": 0.40840369134105636, "grad_norm": 0.21063064163571898, "learning_rate": 1.479248986720057e-05, "loss": 0.086, "step": 260 }, { "epoch": 0.41625760848223053, "grad_norm": 0.19969666720995202, "learning_rate": 1.4549689597561652e-05, "loss": 0.0949, "step": 265 }, { "epoch": 0.42411152562340465, "grad_norm": 0.21154526388191275, "learning_rate": 1.4303458474442325e-05, "loss": 0.0869, "step": 270 }, { "epoch": 0.4319654427645788, "grad_norm": 0.2305997701221425, "learning_rate": 1.4053982177082369e-05, "loss": 0.09, "step": 275 }, { "epoch": 0.439819359905753, "grad_norm": 0.18766215420046584, "learning_rate": 1.3801448831859363e-05, "loss": 0.0869, "step": 280 }, { "epoch": 0.44767327704692716, "grad_norm": 0.22942333663793876, "learning_rate": 1.3546048870425356e-05, "loss": 0.0929, "step": 285 }, { "epoch": 0.45552719418810134, "grad_norm": 0.24300158612250203, "learning_rate": 1.328797488610519e-05, "loss": 0.0865, "step": 290 }, { "epoch": 0.46338111132927545, "grad_norm": 0.20752136799187096, "learning_rate": 1.3027421488664723e-05, "loss": 0.0927, "step": 295 }, { "epoch": 0.4712350284704496, "grad_norm": 0.21217812864504576, "learning_rate": 1.2764585157558486e-05, "loss": 0.0879, "step": 300 }, { "epoch": 0.4790889456116238, "grad_norm": 0.22299949763483096, "learning_rate": 1.2499664093767458e-05, "loss": 0.0938, "step": 305 }, { "epoch": 0.48694286275279797, "grad_norm": 0.23957557508813998, "learning_rate": 1.2232858070338618e-05, "loss": 0.0904, "step": 310 }, { "epoch": 0.49479677989397214, "grad_norm": 0.2023486493539765, "learning_rate": 1.1964368281739078e-05, "loss": 0.0874, "step": 315 }, { "epoch": 0.5026506970351463, "grad_norm": 0.1861038104854032, "learning_rate": 1.1694397192138295e-05, "loss": 0.0816, "step": 320 }, { "epoch": 0.5105046141763204, "grad_norm": 0.2014117982786214, "learning_rate": 1.1423148382732854e-05, "loss": 0.0823, "step": 325 }, { "epoch": 0.5183585313174947, "grad_norm": 0.22960664067039807, "learning_rate": 1.1150826398228904e-05, "loss": 0.0882, "step": 330 }, { "epoch": 0.5262124484586688, "grad_norm": 0.222023007250355, "learning_rate": 1.087763659259803e-05, "loss": 0.0863, "step": 335 }, { "epoch": 0.5340663655998429, "grad_norm": 0.19468935638754573, "learning_rate": 1.0603784974222862e-05, "loss": 0.0884, "step": 340 }, { "epoch": 0.5419202827410171, "grad_norm": 0.2047369290635767, "learning_rate": 1.0329478050549208e-05, "loss": 0.0781, "step": 345 }, { "epoch": 0.5497741998821912, "grad_norm": 0.2057117950843339, "learning_rate": 1.0054922672361858e-05, "loss": 0.0813, "step": 350 }, { "epoch": 0.5576281170233655, "grad_norm": 0.23197127290798342, "learning_rate": 9.780325877801455e-06, "loss": 0.0989, "step": 355 }, { "epoch": 0.5654820341645396, "grad_norm": 0.3305907734524743, "learning_rate": 9.50589473624013e-06, "loss": 0.0816, "step": 360 }, { "epoch": 0.5733359513057137, "grad_norm": 0.215453345630245, "learning_rate": 9.231836192133532e-06, "loss": 0.0861, "step": 365 }, { "epoch": 0.5811898684468879, "grad_norm": 0.1908748794995548, "learning_rate": 8.958356908967104e-06, "loss": 0.0855, "step": 370 }, { "epoch": 0.589043785588062, "grad_norm": 0.28572080285977014, "learning_rate": 8.685663113414186e-06, "loss": 0.0841, "step": 375 }, { "epoch": 0.5968977027292363, "grad_norm": 0.29366381772227595, "learning_rate": 8.413960439823567e-06, "loss": 0.0824, "step": 380 }, { "epoch": 0.6047516198704104, "grad_norm": 0.21387609796609025, "learning_rate": 8.143453775153646e-06, "loss": 0.0846, "step": 385 }, { "epoch": 0.6126055370115845, "grad_norm": 0.18382031478587926, "learning_rate": 7.874347104470234e-06, "loss": 0.0886, "step": 390 }, { "epoch": 0.6204594541527587, "grad_norm": 0.20443500947333, "learning_rate": 7.606843357124426e-06, "loss": 0.0818, "step": 395 }, { "epoch": 0.6283133712939328, "grad_norm": 0.1868753348119755, "learning_rate": 7.341144253726583e-06, "loss": 0.0801, "step": 400 }, { "epoch": 0.6361672884351071, "grad_norm": 0.1919110684349949, "learning_rate": 7.0774501540318305e-06, "loss": 0.0837, "step": 405 }, { "epoch": 0.6440212055762812, "grad_norm": 0.19733646385316747, "learning_rate": 6.815959905851715e-06, "loss": 0.0907, "step": 410 }, { "epoch": 0.6518751227174553, "grad_norm": 0.20695118469945245, "learning_rate": 6.556870695106028e-06, "loss": 0.0839, "step": 415 }, { "epoch": 0.6597290398586295, "grad_norm": 0.206342467457721, "learning_rate": 6.300377897127825e-06, "loss": 0.0817, "step": 420 }, { "epoch": 0.6675829569998036, "grad_norm": 0.19498340314833404, "learning_rate": 6.046674929333787e-06, "loss": 0.0832, "step": 425 }, { "epoch": 0.6754368741409779, "grad_norm": 0.18423257358883185, "learning_rate": 5.795953105371e-06, "loss": 0.0894, "step": 430 }, { "epoch": 0.683290791282152, "grad_norm": 0.19500438735567713, "learning_rate": 5.548401490850193e-06, "loss": 0.0841, "step": 435 }, { "epoch": 0.6911447084233261, "grad_norm": 0.1943344540910271, "learning_rate": 5.304206760774139e-06, "loss": 0.0857, "step": 440 }, { "epoch": 0.6989986255645003, "grad_norm": 0.18694442634566882, "learning_rate": 5.063553058768814e-06, "loss": 0.0873, "step": 445 }, { "epoch": 0.7068525427056744, "grad_norm": 0.2528592095445329, "learning_rate": 4.826621858223431e-06, "loss": 0.0817, "step": 450 }, { "epoch": 0.7147064598468487, "grad_norm": 0.2090763236109062, "learning_rate": 4.593591825444028e-06, "loss": 0.0845, "step": 455 }, { "epoch": 0.7225603769880228, "grad_norm": 0.18944053970106092, "learning_rate": 4.364638684923848e-06, "loss": 0.0871, "step": 460 }, { "epoch": 0.7304142941291969, "grad_norm": 0.20174647482444266, "learning_rate": 4.13993508683214e-06, "loss": 0.084, "step": 465 }, { "epoch": 0.7382682112703711, "grad_norm": 0.19890794882086518, "learning_rate": 3.919650476821192e-06, "loss": 0.0848, "step": 470 }, { "epoch": 0.7461221284115452, "grad_norm": 0.17866134537019918, "learning_rate": 3.7039509682498887e-06, "loss": 0.0767, "step": 475 }, { "epoch": 0.7539760455527195, "grad_norm": 0.1758570225271436, "learning_rate": 3.4929992169200865e-06, "loss": 0.079, "step": 480 }, { "epoch": 0.7618299626938936, "grad_norm": 0.19532675849601197, "learning_rate": 3.2869542984202974e-06, "loss": 0.0872, "step": 485 }, { "epoch": 0.7696838798350677, "grad_norm": 0.18336677624616388, "learning_rate": 3.0859715881691267e-06, "loss": 0.078, "step": 490 }, { "epoch": 0.7775377969762419, "grad_norm": 0.19477530024566625, "learning_rate": 2.890202644248983e-06, "loss": 0.0888, "step": 495 }, { "epoch": 0.785391714117416, "grad_norm": 0.18612540878842743, "learning_rate": 2.6997950931183736e-06, "loss": 0.0806, "step": 500 }, { "epoch": 0.785391714117416, "eval_loss": 0.07095803320407867, "eval_runtime": 199.4165, "eval_samples_per_second": 9.046, "eval_steps_per_second": 2.262, "step": 500 }, { "epoch": 0.7932456312585903, "grad_norm": 0.20861974562159583, "learning_rate": 2.514892518288988e-06, "loss": 0.0816, "step": 505 }, { "epoch": 0.8010995483997644, "grad_norm": 0.19828891809733562, "learning_rate": 2.335634352051488e-06, "loss": 0.0927, "step": 510 }, { "epoch": 0.8089534655409385, "grad_norm": 0.17567422510168001, "learning_rate": 2.1621557703316876e-06, "loss": 0.077, "step": 515 }, { "epoch": 0.8168073826821127, "grad_norm": 0.43901734050274105, "learning_rate": 1.994587590756397e-06, "loss": 0.0879, "step": 520 }, { "epoch": 0.8246612998232868, "grad_norm": 0.19303609197478824, "learning_rate": 1.8330561740057839e-06, "loss": 0.0798, "step": 525 }, { "epoch": 0.8325152169644611, "grad_norm": 0.21014700571002942, "learning_rate": 1.6776833285266602e-06, "loss": 0.0846, "step": 530 }, { "epoch": 0.8403691341056352, "grad_norm": 0.19381351445718112, "learning_rate": 1.528586218678535e-06, "loss": 0.0828, "step": 535 }, { "epoch": 0.8482230512468093, "grad_norm": 0.20006096517086086, "learning_rate": 1.3858772763817174e-06, "loss": 0.0817, "step": 540 }, { "epoch": 0.8560769683879835, "grad_norm": 0.25891905601753346, "learning_rate": 1.2496641163340562e-06, "loss": 0.0815, "step": 545 }, { "epoch": 0.8639308855291576, "grad_norm": 0.20338109286639236, "learning_rate": 1.120049454860307e-06, "loss": 0.0827, "step": 550 }, { "epoch": 0.8717848026703319, "grad_norm": 0.1817547896016754, "learning_rate": 9.971310324552597e-07, "loss": 0.0845, "step": 555 }, { "epoch": 0.879638719811506, "grad_norm": 0.20333004229909032, "learning_rate": 8.810015400790994e-07, "loss": 0.0836, "step": 560 }, { "epoch": 0.8874926369526801, "grad_norm": 0.2005258856819357, "learning_rate": 7.71748549260507e-07, "loss": 0.0813, "step": 565 }, { "epoch": 0.8953465540938543, "grad_norm": 0.17505423789580266, "learning_rate": 6.694544460602825e-07, "loss": 0.0881, "step": 570 }, { "epoch": 0.9032004712350284, "grad_norm": 0.19509475957323322, "learning_rate": 5.741963689452268e-07, "loss": 0.082, "step": 575 }, { "epoch": 0.9110543883762027, "grad_norm": 0.19518986043069664, "learning_rate": 4.860461506191782e-07, "loss": 0.0842, "step": 580 }, { "epoch": 0.9189083055173768, "grad_norm": 0.1925064249529951, "learning_rate": 4.0507026385502747e-07, "loss": 0.0839, "step": 585 }, { "epoch": 0.9267622226585509, "grad_norm": 0.18327816158141888, "learning_rate": 3.313297713685859e-07, "loss": 0.087, "step": 590 }, { "epoch": 0.9346161397997251, "grad_norm": 0.20588287191995575, "learning_rate": 2.6488027977210175e-07, "loss": 0.0827, "step": 595 }, { "epoch": 0.9424700569408992, "grad_norm": 0.197357389547204, "learning_rate": 2.057718976421341e-07, "loss": 0.0815, "step": 600 }, { "epoch": 0.9503239740820735, "grad_norm": 0.18513291000922844, "learning_rate": 1.5404919773341576e-07, "loss": 0.082, "step": 605 }, { "epoch": 0.9581778912232476, "grad_norm": 0.185473668497204, "learning_rate": 1.0975118336720603e-07, "loss": 0.0846, "step": 610 }, { "epoch": 0.9660318083644217, "grad_norm": 0.1803986934893818, "learning_rate": 7.291125901946027e-08, "loss": 0.0777, "step": 615 }, { "epoch": 0.9738857255055959, "grad_norm": 0.18308958490608873, "learning_rate": 4.3557205131008475e-08, "loss": 0.0771, "step": 620 }, { "epoch": 0.98173964264677, "grad_norm": 0.2968041785813041, "learning_rate": 2.171115715874139e-08, "loss": 0.0872, "step": 625 }, { "epoch": 0.9895935597879443, "grad_norm": 0.19838566337698643, "learning_rate": 7.389588883585097e-09, "loss": 0.0789, "step": 630 }, { "epoch": 0.9974474769291184, "grad_norm": 0.17786747719054677, "learning_rate": 6.032999878735624e-10, "loss": 0.0821, "step": 635 }, { "epoch": 0.9990182603573532, "step": 636, "total_flos": 3801667199303680.0, "train_loss": 0.13866351419509207, "train_runtime": 19174.2073, "train_samples_per_second": 2.125, "train_steps_per_second": 0.033 } ], "logging_steps": 5, "max_steps": 636, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3801667199303680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }