{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.654545454545456, "eval_steps": 500, "global_step": 1081, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18181818181818182, "grad_norm": 3.9825470447540283, "learning_rate": 3.6363636363636364e-05, "loss": 1.293, "step": 10 }, { "epoch": 0.36363636363636365, "grad_norm": 1.8670121431350708, "learning_rate": 7.272727272727273e-05, "loss": 0.4981, "step": 20 }, { "epoch": 0.5454545454545454, "grad_norm": 1.4379267692565918, "learning_rate": 0.00010909090909090909, "loss": 0.2739, "step": 30 }, { "epoch": 0.7272727272727273, "grad_norm": 0.9473206400871277, "learning_rate": 0.00014545454545454546, "loss": 0.2078, "step": 40 }, { "epoch": 0.9090909090909091, "grad_norm": 0.6841241717338562, "learning_rate": 0.00018181818181818183, "loss": 0.1781, "step": 50 }, { "epoch": 1.0909090909090908, "grad_norm": 1.1733323335647583, "learning_rate": 0.000199988280568259, "loss": 0.1661, "step": 60 }, { "epoch": 1.2727272727272727, "grad_norm": 0.7865139245986938, "learning_rate": 0.0001998945415950969, "loss": 0.1498, "step": 70 }, { "epoch": 1.4545454545454546, "grad_norm": 0.9522141218185425, "learning_rate": 0.00019970715152902254, "loss": 0.1335, "step": 80 }, { "epoch": 1.6363636363636362, "grad_norm": 0.7283738851547241, "learning_rate": 0.00019942628604814825, "loss": 0.1292, "step": 90 }, { "epoch": 1.8181818181818183, "grad_norm": 0.6728459596633911, "learning_rate": 0.00019905220846375032, "loss": 0.1078, "step": 100 }, { "epoch": 2.0, "grad_norm": 1.5674101114273071, "learning_rate": 0.00019858526947341497, "loss": 0.1197, "step": 110 }, { "epoch": 2.1818181818181817, "grad_norm": 0.42472681403160095, "learning_rate": 0.00019802590683225946, "loss": 0.1154, "step": 120 }, { "epoch": 2.3636363636363638, "grad_norm": 0.3803574740886688, "learning_rate": 0.0001973746449425368, "loss": 0.1068, "step": 130 }, { "epoch": 2.5454545454545454, "grad_norm": 0.685002863407135, "learning_rate": 0.00019663209436200887, "loss": 0.1054, "step": 140 }, { "epoch": 2.7272727272727275, "grad_norm": 0.8086279034614563, "learning_rate": 0.0001957989512315489, "loss": 0.101, "step": 150 }, { "epoch": 2.909090909090909, "grad_norm": 0.3693373501300812, "learning_rate": 0.00019487599662250943, "loss": 0.0981, "step": 160 }, { "epoch": 3.090909090909091, "grad_norm": 0.5383622646331787, "learning_rate": 0.00019386409580446844, "loss": 0.0847, "step": 170 }, { "epoch": 3.2727272727272725, "grad_norm": 0.49988943338394165, "learning_rate": 0.00019276419743403933, "loss": 0.0979, "step": 180 }, { "epoch": 3.4545454545454546, "grad_norm": 0.4996686577796936, "learning_rate": 0.00019157733266550575, "loss": 0.0865, "step": 190 }, { "epoch": 3.6363636363636362, "grad_norm": 0.6447169780731201, "learning_rate": 0.00019030461418411497, "loss": 0.0822, "step": 200 }, { "epoch": 3.8181818181818183, "grad_norm": 0.4973439574241638, "learning_rate": 0.00018894723516293583, "loss": 0.0896, "step": 210 }, { "epoch": 4.0, "grad_norm": 1.349974513053894, "learning_rate": 0.00018750646814425938, "loss": 0.0844, "step": 220 }, { "epoch": 4.181818181818182, "grad_norm": 0.6695556640625, "learning_rate": 0.0001859836638465911, "loss": 0.0832, "step": 230 }, { "epoch": 4.363636363636363, "grad_norm": 0.5756475925445557, "learning_rate": 0.0001843802498983529, "loss": 0.0867, "step": 240 }, { "epoch": 4.545454545454545, "grad_norm": 0.334103524684906, "learning_rate": 0.00018269772949948182, "loss": 0.0794, "step": 250 }, { "epoch": 4.7272727272727275, "grad_norm": 0.7557447552680969, "learning_rate": 0.00018093768001218094, "loss": 0.085, "step": 260 }, { "epoch": 4.909090909090909, "grad_norm": 0.37030595541000366, "learning_rate": 0.00017910175148214274, "loss": 0.079, "step": 270 }, { "epoch": 5.090909090909091, "grad_norm": 0.7280672192573547, "learning_rate": 0.0001771916650916321, "loss": 0.0738, "step": 280 }, { "epoch": 5.2727272727272725, "grad_norm": 0.6517002582550049, "learning_rate": 0.00017520921154587843, "loss": 0.0718, "step": 290 }, { "epoch": 5.454545454545454, "grad_norm": 0.40652868151664734, "learning_rate": 0.00017315624939429037, "loss": 0.072, "step": 300 }, { "epoch": 5.636363636363637, "grad_norm": 0.5963271260261536, "learning_rate": 0.0001710347032880664, "loss": 0.0786, "step": 310 }, { "epoch": 5.818181818181818, "grad_norm": 0.356381356716156, "learning_rate": 0.00016884656217583518, "loss": 0.0578, "step": 320 }, { "epoch": 6.0, "grad_norm": 1.6636348962783813, "learning_rate": 0.00016659387743901685, "loss": 0.0893, "step": 330 }, { "epoch": 6.181818181818182, "grad_norm": 0.42870157957077026, "learning_rate": 0.00016427876096865394, "loss": 0.0714, "step": 340 }, { "epoch": 6.363636363636363, "grad_norm": 0.32923629879951477, "learning_rate": 0.00016190338318551427, "loss": 0.0695, "step": 350 }, { "epoch": 6.545454545454545, "grad_norm": 0.491409569978714, "learning_rate": 0.0001594699710053223, "loss": 0.0797, "step": 360 }, { "epoch": 6.7272727272727275, "grad_norm": 0.3972729742527008, "learning_rate": 0.00015698080575102661, "loss": 0.0697, "step": 370 }, { "epoch": 6.909090909090909, "grad_norm": 0.4414363205432892, "learning_rate": 0.00015443822101406064, "loss": 0.0591, "step": 380 }, { "epoch": 7.090909090909091, "grad_norm": 0.3927883207798004, "learning_rate": 0.00015184460046660137, "loss": 0.0665, "step": 390 }, { "epoch": 7.2727272727272725, "grad_norm": 0.39698049426078796, "learning_rate": 0.00014920237562687785, "loss": 0.0626, "step": 400 }, { "epoch": 7.454545454545454, "grad_norm": 0.7567592263221741, "learning_rate": 0.00014651402357962367, "loss": 0.0604, "step": 410 }, { "epoch": 7.636363636363637, "grad_norm": 0.5023426413536072, "learning_rate": 0.0001437820646538112, "loss": 0.0613, "step": 420 }, { "epoch": 7.818181818181818, "grad_norm": 0.38268569111824036, "learning_rate": 0.00014100906005984403, "loss": 0.0607, "step": 430 }, { "epoch": 8.0, "grad_norm": 0.9016662836074829, "learning_rate": 0.0001381976094884232, "loss": 0.0592, "step": 440 }, { "epoch": 8.181818181818182, "grad_norm": 0.5547723770141602, "learning_rate": 0.00013535034867333837, "loss": 0.0586, "step": 450 }, { "epoch": 8.363636363636363, "grad_norm": 0.44761940836906433, "learning_rate": 0.00013246994692046836, "loss": 0.0579, "step": 460 }, { "epoch": 8.545454545454545, "grad_norm": 0.24242262542247772, "learning_rate": 0.00012955910460530788, "loss": 0.0539, "step": 470 }, { "epoch": 8.727272727272727, "grad_norm": 0.5021650195121765, "learning_rate": 0.00012662055064136668, "loss": 0.0564, "step": 480 }, { "epoch": 8.909090909090908, "grad_norm": 0.37628263235092163, "learning_rate": 0.00012365703992181425, "loss": 0.0522, "step": 490 }, { "epoch": 9.090909090909092, "grad_norm": 0.3592880964279175, "learning_rate": 0.0001206713507367684, "loss": 0.0539, "step": 500 }, { "epoch": 9.272727272727273, "grad_norm": 0.3467582166194916, "learning_rate": 0.0001176662821686496, "loss": 0.0575, "step": 510 }, { "epoch": 9.454545454545455, "grad_norm": 0.3472346067428589, "learning_rate": 0.00011464465146804217, "loss": 0.0602, "step": 520 }, { "epoch": 9.636363636363637, "grad_norm": 0.2870403826236725, "learning_rate": 0.00011160929141252303, "loss": 0.0524, "step": 530 }, { "epoch": 9.818181818181818, "grad_norm": 0.2755221128463745, "learning_rate": 0.0001085630476509339, "loss": 0.0536, "step": 540 }, { "epoch": 10.0, "grad_norm": 1.0921339988708496, "learning_rate": 0.00010550877603558655, "loss": 0.05, "step": 550 }, { "epoch": 10.181818181818182, "grad_norm": 0.2905895411968231, "learning_rate": 0.00010244933994490249, "loss": 0.0469, "step": 560 }, { "epoch": 10.363636363636363, "grad_norm": 0.4015822112560272, "learning_rate": 9.938760759899674e-05, "loss": 0.0519, "step": 570 }, { "epoch": 10.545454545454545, "grad_norm": 0.26584434509277344, "learning_rate": 9.632644937072277e-05, "loss": 0.0527, "step": 580 }, { "epoch": 10.727272727272727, "grad_norm": 0.29837700724601746, "learning_rate": 9.326873509469887e-05, "loss": 0.0506, "step": 590 }, { "epoch": 10.909090909090908, "grad_norm": 0.3558896780014038, "learning_rate": 9.021733137683962e-05, "loss": 0.0441, "step": 600 }, { "epoch": 11.090909090909092, "grad_norm": 0.2896372377872467, "learning_rate": 8.717509890691368e-05, "loss": 0.0428, "step": 610 }, { "epoch": 11.272727272727273, "grad_norm": 0.5100614428520203, "learning_rate": 8.414488977664859e-05, "loss": 0.0421, "step": 620 }, { "epoch": 11.454545454545455, "grad_norm": 0.3731076121330261, "learning_rate": 8.112954480589558e-05, "loss": 0.0431, "step": 630 }, { "epoch": 11.636363636363637, "grad_norm": 0.2851674258708954, "learning_rate": 7.813189087936243e-05, "loss": 0.0431, "step": 640 }, { "epoch": 11.818181818181818, "grad_norm": 0.3725188672542572, "learning_rate": 7.515473829640987e-05, "loss": 0.0427, "step": 650 }, { "epoch": 12.0, "grad_norm": 1.0623624324798584, "learning_rate": 7.220087813639736e-05, "loss": 0.0485, "step": 660 }, { "epoch": 12.181818181818182, "grad_norm": 0.3354561924934387, "learning_rate": 6.927307964204694e-05, "loss": 0.042, "step": 670 }, { "epoch": 12.363636363636363, "grad_norm": 0.2896369695663452, "learning_rate": 6.637408762327972e-05, "loss": 0.0398, "step": 680 }, { "epoch": 12.545454545454545, "grad_norm": 0.24495282769203186, "learning_rate": 6.350661988395723e-05, "loss": 0.0366, "step": 690 }, { "epoch": 12.727272727272727, "grad_norm": 0.3516719937324524, "learning_rate": 6.067336467394169e-05, "loss": 0.0355, "step": 700 }, { "epoch": 12.909090909090908, "grad_norm": 0.22353091835975647, "learning_rate": 5.787697816886273e-05, "loss": 0.0365, "step": 710 }, { "epoch": 13.090909090909092, "grad_norm": 0.3955240249633789, "learning_rate": 5.5120081979953785e-05, "loss": 0.033, "step": 720 }, { "epoch": 13.272727272727273, "grad_norm": 0.35612088441848755, "learning_rate": 5.240526069629265e-05, "loss": 0.0448, "step": 730 }, { "epoch": 13.454545454545455, "grad_norm": 0.3462859094142914, "learning_rate": 4.97350594617502e-05, "loss": 0.0418, "step": 740 }, { "epoch": 13.636363636363637, "grad_norm": 0.29914817214012146, "learning_rate": 4.7111981588919084e-05, "loss": 0.0412, "step": 750 }, { "epoch": 13.818181818181818, "grad_norm": 0.3142814338207245, "learning_rate": 4.453848621225912e-05, "loss": 0.0368, "step": 760 }, { "epoch": 14.0, "grad_norm": 0.4887177348136902, "learning_rate": 4.201698598265973e-05, "loss": 0.0387, "step": 770 }, { "epoch": 14.181818181818182, "grad_norm": 0.19729413092136383, "learning_rate": 3.9549844805580706e-05, "loss": 0.0362, "step": 780 }, { "epoch": 14.363636363636363, "grad_norm": 0.3818693161010742, "learning_rate": 3.713937562489179e-05, "loss": 0.0425, "step": 790 }, { "epoch": 14.545454545454545, "grad_norm": 0.21857142448425293, "learning_rate": 3.4787838254488694e-05, "loss": 0.035, "step": 800 }, { "epoch": 14.727272727272727, "grad_norm": 0.26334837079048157, "learning_rate": 3.249743725971849e-05, "loss": 0.0332, "step": 810 }, { "epoch": 14.909090909090908, "grad_norm": 0.13971690833568573, "learning_rate": 3.0270319890600462e-05, "loss": 0.0362, "step": 820 }, { "epoch": 15.090909090909092, "grad_norm": 0.3234981298446655, "learning_rate": 2.810857406878009e-05, "loss": 0.0288, "step": 830 }, { "epoch": 15.272727272727273, "grad_norm": 0.29610171914100647, "learning_rate": 2.601422643010335e-05, "loss": 0.0363, "step": 840 }, { "epoch": 15.454545454545455, "grad_norm": 1.4063012599945068, "learning_rate": 2.3989240424646355e-05, "loss": 0.0322, "step": 850 }, { "epoch": 15.636363636363637, "grad_norm": 0.261489599943161, "learning_rate": 2.2035514475981756e-05, "loss": 0.037, "step": 860 }, { "epoch": 15.818181818181818, "grad_norm": 0.21999020874500275, "learning_rate": 2.0154880201407367e-05, "loss": 0.0319, "step": 870 }, { "epoch": 16.0, "grad_norm": 2.1982412338256836, "learning_rate": 1.834910069480571e-05, "loss": 0.0571, "step": 880 }, { "epoch": 16.181818181818183, "grad_norm": 0.295622855424881, "learning_rate": 1.6619868873744147e-05, "loss": 0.0313, "step": 890 }, { "epoch": 16.363636363636363, "grad_norm": 0.266347736120224, "learning_rate": 1.49688058923654e-05, "loss": 0.034, "step": 900 }, { "epoch": 16.545454545454547, "grad_norm": 0.3452344238758087, "learning_rate": 1.339745962155613e-05, "loss": 0.0391, "step": 910 }, { "epoch": 16.727272727272727, "grad_norm": 0.21851502358913422, "learning_rate": 1.1907303197818665e-05, "loss": 0.0307, "step": 920 }, { "epoch": 16.90909090909091, "grad_norm": 0.4546089470386505, "learning_rate": 1.0499733642206033e-05, "loss": 0.0273, "step": 930 }, { "epoch": 17.09090909090909, "grad_norm": 0.22212770581245422, "learning_rate": 9.176070550615378e-06, "loss": 0.0305, "step": 940 }, { "epoch": 17.272727272727273, "grad_norm": 0.24768735468387604, "learning_rate": 7.937554856667196e-06, "loss": 0.0333, "step": 950 }, { "epoch": 17.454545454545453, "grad_norm": 0.19499576091766357, "learning_rate": 6.785347668330777e-06, "loss": 0.0263, "step": 960 }, { "epoch": 17.636363636363637, "grad_norm": 0.16682682931423187, "learning_rate": 5.720529179385659e-06, "loss": 0.0381, "step": 970 }, { "epoch": 17.818181818181817, "grad_norm": 0.223773792386055, "learning_rate": 4.744097656740709e-06, "loss": 0.0312, "step": 980 }, { "epoch": 18.0, "grad_norm": 0.8798514604568481, "learning_rate": 3.856968504558989e-06, "loss": 0.0277, "step": 990 }, { "epoch": 18.181818181818183, "grad_norm": 0.18742257356643677, "learning_rate": 3.059973406066963e-06, "loss": 0.0288, "step": 1000 }, { "epoch": 18.363636363636363, "grad_norm": 0.19658038020133972, "learning_rate": 2.353859543851644e-06, "loss": 0.0378, "step": 1010 }, { "epoch": 18.545454545454547, "grad_norm": 0.18045732378959656, "learning_rate": 1.7392888993773005e-06, "loss": 0.0247, "step": 1020 }, { "epoch": 18.727272727272727, "grad_norm": 0.28070148825645447, "learning_rate": 1.216837632378065e-06, "loss": 0.0314, "step": 1030 }, { "epoch": 18.90909090909091, "grad_norm": 0.20971472561359406, "learning_rate": 7.86995540708424e-07, "loss": 0.0278, "step": 1040 }, { "epoch": 19.09090909090909, "grad_norm": 0.1779453009366989, "learning_rate": 4.501656011579036e-07, "loss": 0.0445, "step": 1050 }, { "epoch": 19.272727272727273, "grad_norm": 0.2201903909444809, "learning_rate": 2.066635916605386e-07, "loss": 0.0247, "step": 1060 }, { "epoch": 19.454545454545453, "grad_norm": 0.17639388144016266, "learning_rate": 5.6717795253113935e-08, "loss": 0.025, "step": 1070 }, { "epoch": 19.636363636363637, "grad_norm": 0.2432723492383957, "learning_rate": 4.687860599927873e-10, "loss": 0.0306, "step": 1080 }, { "epoch": 19.654545454545456, "step": 1081, "total_flos": 1.40579546183604e+17, "train_loss": 0.0782834448637122, "train_runtime": 1256.8447, "train_samples_per_second": 55.046, "train_steps_per_second": 0.86 } ], "logging_steps": 10, "max_steps": 1081, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.40579546183604e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }