{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.993485342019544, "eval_steps": 500, "global_step": 1533, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03257328990228013, "grad_norm": 10.128602027893066, "learning_rate": 2.5974025974025972e-05, "loss": 1.7762, "step": 10 }, { "epoch": 0.06514657980456026, "grad_norm": 11.557284355163574, "learning_rate": 5.1948051948051944e-05, "loss": 0.5874, "step": 20 }, { "epoch": 0.09771986970684039, "grad_norm": 1.7021989822387695, "learning_rate": 7.792207792207793e-05, "loss": 0.2361, "step": 30 }, { "epoch": 0.13029315960912052, "grad_norm": 2.088670015335083, "learning_rate": 0.00010389610389610389, "loss": 0.176, "step": 40 }, { "epoch": 0.16286644951140064, "grad_norm": 2.070906400680542, "learning_rate": 0.00012987012987012987, "loss": 0.1546, "step": 50 }, { "epoch": 0.19543973941368079, "grad_norm": 1.9704210758209229, "learning_rate": 0.00015584415584415587, "loss": 0.1463, "step": 60 }, { "epoch": 0.2280130293159609, "grad_norm": 1.2319754362106323, "learning_rate": 0.00018181818181818183, "loss": 0.1306, "step": 70 }, { "epoch": 0.26058631921824105, "grad_norm": 1.4384595155715942, "learning_rate": 0.0001999979049808708, "loss": 0.1218, "step": 80 }, { "epoch": 0.2931596091205212, "grad_norm": 1.3861066102981567, "learning_rate": 0.00019996066263830531, "loss": 0.1031, "step": 90 }, { "epoch": 0.3257328990228013, "grad_norm": 1.3069312572479248, "learning_rate": 0.00019987688427197897, "loss": 0.1125, "step": 100 }, { "epoch": 0.3583061889250814, "grad_norm": 1.1740531921386719, "learning_rate": 0.0001997466088843548, "loss": 0.0984, "step": 110 }, { "epoch": 0.39087947882736157, "grad_norm": 0.831253170967102, "learning_rate": 0.00019956989712427577, "loss": 0.1013, "step": 120 }, { "epoch": 0.4234527687296417, "grad_norm": 1.1512802839279175, "learning_rate": 0.0001993468312587303, "loss": 0.0827, "step": 130 }, { "epoch": 0.4560260586319218, "grad_norm": 0.7890095710754395, "learning_rate": 0.00019907751513455302, "loss": 0.0747, "step": 140 }, { "epoch": 0.48859934853420195, "grad_norm": 0.8635206818580627, "learning_rate": 0.00019876207413008015, "loss": 0.0911, "step": 150 }, { "epoch": 0.5211726384364821, "grad_norm": 0.984118640422821, "learning_rate": 0.00019840065509677988, "loss": 0.0708, "step": 160 }, { "epoch": 0.5537459283387622, "grad_norm": 1.1899775266647339, "learning_rate": 0.00019799342629088702, "loss": 0.0759, "step": 170 }, { "epoch": 0.5863192182410424, "grad_norm": 0.8736202120780945, "learning_rate": 0.00019754057729507227, "loss": 0.0747, "step": 180 }, { "epoch": 0.6188925081433225, "grad_norm": 0.6165608763694763, "learning_rate": 0.0001970423189301833, "loss": 0.0697, "step": 190 }, { "epoch": 0.6514657980456026, "grad_norm": 1.2145845890045166, "learning_rate": 0.00019649888315709843, "loss": 0.0768, "step": 200 }, { "epoch": 0.6840390879478827, "grad_norm": 0.7146447896957397, "learning_rate": 0.00019591052296873888, "loss": 0.0575, "step": 210 }, { "epoch": 0.7166123778501629, "grad_norm": 1.0241068601608276, "learning_rate": 0.00019527751227228963, "loss": 0.0688, "step": 220 }, { "epoch": 0.749185667752443, "grad_norm": 0.8579204678535461, "learning_rate": 0.00019460014576168358, "loss": 0.0544, "step": 230 }, { "epoch": 0.7817589576547231, "grad_norm": 0.5284867286682129, "learning_rate": 0.0001938787387804088, "loss": 0.0516, "step": 240 }, { "epoch": 0.8143322475570033, "grad_norm": 0.6213068962097168, "learning_rate": 0.00019311362717470268, "loss": 0.0501, "step": 250 }, { "epoch": 0.8469055374592834, "grad_norm": 0.7320097088813782, "learning_rate": 0.00019230516713720052, "loss": 0.0581, "step": 260 }, { "epoch": 0.8794788273615635, "grad_norm": 0.36288219690322876, "learning_rate": 0.00019145373504111279, "loss": 0.0478, "step": 270 }, { "epoch": 0.9120521172638436, "grad_norm": 0.6096987724304199, "learning_rate": 0.00019055972726500695, "loss": 0.0548, "step": 280 }, { "epoch": 0.9446254071661238, "grad_norm": 0.7391396760940552, "learning_rate": 0.0001896235600082759, "loss": 0.0444, "step": 290 }, { "epoch": 0.9771986970684039, "grad_norm": 0.7820376753807068, "learning_rate": 0.00018864566909737937, "loss": 0.0566, "step": 300 }, { "epoch": 1.009771986970684, "grad_norm": 0.3828499913215637, "learning_rate": 0.00018762650978294758, "loss": 0.0533, "step": 310 }, { "epoch": 1.0423452768729642, "grad_norm": 0.49720415472984314, "learning_rate": 0.0001865665565278424, "loss": 0.0582, "step": 320 }, { "epoch": 1.0749185667752443, "grad_norm": 0.4968441128730774, "learning_rate": 0.00018546630278627437, "loss": 0.0473, "step": 330 }, { "epoch": 1.1074918566775245, "grad_norm": 0.812972366809845, "learning_rate": 0.00018432626077407829, "loss": 0.0382, "step": 340 }, { "epoch": 1.1400651465798046, "grad_norm": 0.4055006802082062, "learning_rate": 0.00018314696123025454, "loss": 0.043, "step": 350 }, { "epoch": 1.1726384364820848, "grad_norm": 0.6709649562835693, "learning_rate": 0.0001819289531698871, "loss": 0.0343, "step": 360 }, { "epoch": 1.205211726384365, "grad_norm": 0.6343613862991333, "learning_rate": 0.0001806728036285532, "loss": 0.0388, "step": 370 }, { "epoch": 1.237785016286645, "grad_norm": 0.44080787897109985, "learning_rate": 0.00017937909739834367, "loss": 0.0419, "step": 380 }, { "epoch": 1.2703583061889252, "grad_norm": 0.714772641658783, "learning_rate": 0.00017804843675561677, "loss": 0.0476, "step": 390 }, { "epoch": 1.3029315960912053, "grad_norm": 0.4127221703529358, "learning_rate": 0.00017668144118061262, "loss": 0.0354, "step": 400 }, { "epoch": 1.3355048859934853, "grad_norm": 0.44271162152290344, "learning_rate": 0.00017527874706905805, "loss": 0.0385, "step": 410 }, { "epoch": 1.3680781758957654, "grad_norm": 0.6225533485412598, "learning_rate": 0.00017384100743589697, "loss": 0.0665, "step": 420 }, { "epoch": 1.4006514657980456, "grad_norm": 0.4418353736400604, "learning_rate": 0.0001723688916112835, "loss": 0.0554, "step": 430 }, { "epoch": 1.4332247557003257, "grad_norm": 0.3992595672607422, "learning_rate": 0.00017086308492897983, "loss": 0.0522, "step": 440 }, { "epoch": 1.4657980456026058, "grad_norm": 0.45478880405426025, "learning_rate": 0.0001693242884073035, "loss": 0.0398, "step": 450 }, { "epoch": 1.498371335504886, "grad_norm": 0.5688516497612, "learning_rate": 0.00016775321842277312, "loss": 0.032, "step": 460 }, { "epoch": 1.5309446254071661, "grad_norm": 0.4958943724632263, "learning_rate": 0.00016615060637660388, "loss": 0.0422, "step": 470 }, { "epoch": 1.5635179153094463, "grad_norm": 0.4975818991661072, "learning_rate": 0.00016451719835420877, "loss": 0.0317, "step": 480 }, { "epoch": 1.5960912052117264, "grad_norm": 0.5288175344467163, "learning_rate": 0.00016285375477786322, "loss": 0.0412, "step": 490 }, { "epoch": 1.6286644951140063, "grad_norm": 0.6416569352149963, "learning_rate": 0.0001611610500526957, "loss": 0.0275, "step": 500 }, { "epoch": 1.6612377850162865, "grad_norm": 0.42349156737327576, "learning_rate": 0.00015943987220616855, "loss": 0.029, "step": 510 }, { "epoch": 1.6938110749185666, "grad_norm": 0.2572161853313446, "learning_rate": 0.00015769102252121702, "loss": 0.0308, "step": 520 }, { "epoch": 1.7263843648208468, "grad_norm": 0.5944684147834778, "learning_rate": 0.0001559153151632171, "loss": 0.0354, "step": 530 }, { "epoch": 1.758957654723127, "grad_norm": 0.5154451727867126, "learning_rate": 0.0001541135768009566, "loss": 0.0424, "step": 540 }, { "epoch": 1.791530944625407, "grad_norm": 0.5298171043395996, "learning_rate": 0.00015228664622178467, "loss": 0.0263, "step": 550 }, { "epoch": 1.8241042345276872, "grad_norm": 0.3582160472869873, "learning_rate": 0.00015043537394112007, "loss": 0.0386, "step": 560 }, { "epoch": 1.8566775244299674, "grad_norm": 0.6350813508033752, "learning_rate": 0.0001485606218064993, "loss": 0.0444, "step": 570 }, { "epoch": 1.8892508143322475, "grad_norm": 0.6265957355499268, "learning_rate": 0.00014666326259634918, "loss": 0.0417, "step": 580 }, { "epoch": 1.9218241042345277, "grad_norm": 0.41339483857154846, "learning_rate": 0.00014474417961367065, "loss": 0.0309, "step": 590 }, { "epoch": 1.9543973941368078, "grad_norm": 0.41287094354629517, "learning_rate": 0.000142804266274823, "loss": 0.0308, "step": 600 }, { "epoch": 1.986970684039088, "grad_norm": 0.5833337903022766, "learning_rate": 0.00014084442569359964, "loss": 0.0346, "step": 610 }, { "epoch": 2.019543973941368, "grad_norm": 0.3714321255683899, "learning_rate": 0.00013886557026078955, "loss": 0.0337, "step": 620 }, { "epoch": 2.0521172638436482, "grad_norm": 0.2712138295173645, "learning_rate": 0.0001368686212194199, "loss": 0.0196, "step": 630 }, { "epoch": 2.0846905537459284, "grad_norm": 0.6208952069282532, "learning_rate": 0.00013485450823587725, "loss": 0.0288, "step": 640 }, { "epoch": 2.1172638436482085, "grad_norm": 0.34229573607444763, "learning_rate": 0.00013282416896710778, "loss": 0.0246, "step": 650 }, { "epoch": 2.1498371335504887, "grad_norm": 0.4573960304260254, "learning_rate": 0.00013077854862409696, "loss": 0.0249, "step": 660 }, { "epoch": 2.182410423452769, "grad_norm": 0.3675532042980194, "learning_rate": 0.0001287185995318333, "loss": 0.022, "step": 670 }, { "epoch": 2.214983713355049, "grad_norm": 0.5372172594070435, "learning_rate": 0.0001266452806859594, "loss": 0.0343, "step": 680 }, { "epoch": 2.247557003257329, "grad_norm": 0.46904900670051575, "learning_rate": 0.00012455955730631804, "loss": 0.0233, "step": 690 }, { "epoch": 2.2801302931596092, "grad_norm": 0.4394093155860901, "learning_rate": 0.00012246240038760043, "loss": 0.0209, "step": 700 }, { "epoch": 2.3127035830618894, "grad_norm": 0.3127492368221283, "learning_rate": 0.00012035478624730608, "loss": 0.0321, "step": 710 }, { "epoch": 2.3452768729641695, "grad_norm": 0.601370096206665, "learning_rate": 0.00011823769607122479, "loss": 0.0243, "step": 720 }, { "epoch": 2.3778501628664497, "grad_norm": 0.5871070623397827, "learning_rate": 0.00011611211545665184, "loss": 0.0337, "step": 730 }, { "epoch": 2.41042345276873, "grad_norm": 0.3546801805496216, "learning_rate": 0.00011397903395354996, "loss": 0.0288, "step": 740 }, { "epoch": 2.44299674267101, "grad_norm": 0.8319407105445862, "learning_rate": 0.0001118394446038708, "loss": 0.0337, "step": 750 }, { "epoch": 2.47557003257329, "grad_norm": 0.5210663080215454, "learning_rate": 0.00010969434347925076, "loss": 0.026, "step": 760 }, { "epoch": 2.5081433224755703, "grad_norm": 0.5834184288978577, "learning_rate": 0.00010754472921729661, "loss": 0.0282, "step": 770 }, { "epoch": 2.5407166123778504, "grad_norm": 0.42890864610671997, "learning_rate": 0.00010539160255667623, "loss": 0.028, "step": 780 }, { "epoch": 2.5732899022801305, "grad_norm": 0.4473400413990021, "learning_rate": 0.00010323596587123145, "loss": 0.025, "step": 790 }, { "epoch": 2.6058631921824107, "grad_norm": 0.5189303159713745, "learning_rate": 0.00010107882270332952, "loss": 0.0293, "step": 800 }, { "epoch": 2.6384364820846904, "grad_norm": 0.43365001678466797, "learning_rate": 9.892117729667052e-05, "loss": 0.0175, "step": 810 }, { "epoch": 2.6710097719869705, "grad_norm": 0.28346696496009827, "learning_rate": 9.676403412876856e-05, "loss": 0.0334, "step": 820 }, { "epoch": 2.7035830618892507, "grad_norm": 0.3956477642059326, "learning_rate": 9.460839744332378e-05, "loss": 0.0271, "step": 830 }, { "epoch": 2.736156351791531, "grad_norm": 0.30705949664115906, "learning_rate": 9.245527078270341e-05, "loss": 0.0217, "step": 840 }, { "epoch": 2.768729641693811, "grad_norm": 0.40188854932785034, "learning_rate": 9.030565652074926e-05, "loss": 0.019, "step": 850 }, { "epoch": 2.801302931596091, "grad_norm": 0.3447129428386688, "learning_rate": 8.816055539612924e-05, "loss": 0.028, "step": 860 }, { "epoch": 2.8338762214983713, "grad_norm": 0.38768622279167175, "learning_rate": 8.602096604645009e-05, "loss": 0.0218, "step": 870 }, { "epoch": 2.8664495114006514, "grad_norm": 0.26912721991539, "learning_rate": 8.388788454334817e-05, "loss": 0.0173, "step": 880 }, { "epoch": 2.8990228013029316, "grad_norm": 0.33078861236572266, "learning_rate": 8.176230392877523e-05, "loss": 0.0233, "step": 890 }, { "epoch": 2.9315960912052117, "grad_norm": 0.24832488596439362, "learning_rate": 7.964521375269396e-05, "loss": 0.0171, "step": 900 }, { "epoch": 2.964169381107492, "grad_norm": 0.6595136523246765, "learning_rate": 7.753759961239964e-05, "loss": 0.0272, "step": 910 }, { "epoch": 2.996742671009772, "grad_norm": 0.2780207097530365, "learning_rate": 7.544044269368197e-05, "loss": 0.0338, "step": 920 }, { "epoch": 3.029315960912052, "grad_norm": 0.7173179388046265, "learning_rate": 7.335471931404063e-05, "loss": 0.0365, "step": 930 }, { "epoch": 3.0618892508143323, "grad_norm": 0.33753442764282227, "learning_rate": 7.128140046816671e-05, "loss": 0.0195, "step": 940 }, { "epoch": 3.0944625407166124, "grad_norm": 0.35064950585365295, "learning_rate": 6.922145137590306e-05, "loss": 0.02, "step": 950 }, { "epoch": 3.1270358306188926, "grad_norm": 0.39598166942596436, "learning_rate": 6.717583103289229e-05, "loss": 0.0203, "step": 960 }, { "epoch": 3.1596091205211727, "grad_norm": 0.18257524073123932, "learning_rate": 6.514549176412275e-05, "loss": 0.0134, "step": 970 }, { "epoch": 3.192182410423453, "grad_norm": 0.4458347260951996, "learning_rate": 6.313137878058013e-05, "loss": 0.0236, "step": 980 }, { "epoch": 3.224755700325733, "grad_norm": 0.22742605209350586, "learning_rate": 6.113442973921046e-05, "loss": 0.0208, "step": 990 }, { "epoch": 3.257328990228013, "grad_norm": 0.1858537793159485, "learning_rate": 5.9155574306400395e-05, "loss": 0.0218, "step": 1000 }, { "epoch": 3.2899022801302933, "grad_norm": 0.24626286327838898, "learning_rate": 5.7195733725176994e-05, "loss": 0.0232, "step": 1010 }, { "epoch": 3.3224755700325734, "grad_norm": 0.2719153165817261, "learning_rate": 5.525582038632934e-05, "loss": 0.0148, "step": 1020 }, { "epoch": 3.3550488599348536, "grad_norm": 0.218730166554451, "learning_rate": 5.333673740365083e-05, "loss": 0.0157, "step": 1030 }, { "epoch": 3.3876221498371337, "grad_norm": 0.20292945206165314, "learning_rate": 5.1439378193500707e-05, "loss": 0.0143, "step": 1040 }, { "epoch": 3.420195439739414, "grad_norm": 0.2846449017524719, "learning_rate": 4.956462605887994e-05, "loss": 0.0177, "step": 1050 }, { "epoch": 3.4527687296416936, "grad_norm": 0.322721391916275, "learning_rate": 4.771335377821535e-05, "loss": 0.0224, "step": 1060 }, { "epoch": 3.4853420195439737, "grad_norm": 0.1719449758529663, "learning_rate": 4.588642319904343e-05, "loss": 0.0234, "step": 1070 }, { "epoch": 3.517915309446254, "grad_norm": 0.44704851508140564, "learning_rate": 4.408468483678293e-05, "loss": 0.019, "step": 1080 }, { "epoch": 3.550488599348534, "grad_norm": 0.4159814417362213, "learning_rate": 4.230897747878303e-05, "loss": 0.0156, "step": 1090 }, { "epoch": 3.583061889250814, "grad_norm": 0.19604472815990448, "learning_rate": 4.056012779383145e-05, "loss": 0.0158, "step": 1100 }, { "epoch": 3.6156351791530943, "grad_norm": 0.19116809964179993, "learning_rate": 3.883894994730428e-05, "loss": 0.0174, "step": 1110 }, { "epoch": 3.6482084690553744, "grad_norm": 0.3637801706790924, "learning_rate": 3.714624522213681e-05, "loss": 0.0162, "step": 1120 }, { "epoch": 3.6807817589576546, "grad_norm": 0.1877295821905136, "learning_rate": 3.548280164579126e-05, "loss": 0.0142, "step": 1130 }, { "epoch": 3.7133550488599347, "grad_norm": 0.1830226182937622, "learning_rate": 3.384939362339614e-05, "loss": 0.0119, "step": 1140 }, { "epoch": 3.745928338762215, "grad_norm": 0.15163740515708923, "learning_rate": 3.224678157722689e-05, "loss": 0.0181, "step": 1150 }, { "epoch": 3.778501628664495, "grad_norm": 0.2479788213968277, "learning_rate": 3.067571159269651e-05, "loss": 0.0138, "step": 1160 }, { "epoch": 3.811074918566775, "grad_norm": 0.6171669960021973, "learning_rate": 2.913691507102019e-05, "loss": 0.0197, "step": 1170 }, { "epoch": 3.8436482084690553, "grad_norm": 0.18519634008407593, "learning_rate": 2.763110838871651e-05, "loss": 0.0137, "step": 1180 }, { "epoch": 3.8762214983713354, "grad_norm": 0.26303982734680176, "learning_rate": 2.6158992564103058e-05, "loss": 0.0172, "step": 1190 }, { "epoch": 3.9087947882736156, "grad_norm": 0.28331807255744934, "learning_rate": 2.4721252930941974e-05, "loss": 0.0168, "step": 1200 }, { "epoch": 3.9413680781758957, "grad_norm": 0.20530906319618225, "learning_rate": 2.3318558819387404e-05, "loss": 0.0199, "step": 1210 }, { "epoch": 3.973941368078176, "grad_norm": 0.16924133896827698, "learning_rate": 2.1951563244383233e-05, "loss": 0.0146, "step": 1220 }, { "epoch": 4.006514657980456, "grad_norm": 0.13186028599739075, "learning_rate": 2.0620902601656345e-05, "loss": 0.0124, "step": 1230 }, { "epoch": 4.039087947882736, "grad_norm": 0.24360792338848114, "learning_rate": 1.9327196371446776e-05, "loss": 0.0119, "step": 1240 }, { "epoch": 4.071661237785016, "grad_norm": 0.09876150637865067, "learning_rate": 1.807104683011289e-05, "loss": 0.012, "step": 1250 }, { "epoch": 4.1042345276872965, "grad_norm": 0.2283184826374054, "learning_rate": 1.6853038769745467e-05, "loss": 0.0142, "step": 1260 }, { "epoch": 4.136807817589577, "grad_norm": 0.32383596897125244, "learning_rate": 1.5673739225921758e-05, "loss": 0.012, "step": 1270 }, { "epoch": 4.169381107491857, "grad_norm": 0.2783248722553253, "learning_rate": 1.4533697213725662e-05, "loss": 0.0163, "step": 1280 }, { "epoch": 4.201954397394137, "grad_norm": 0.17678265273571014, "learning_rate": 1.3433443472157613e-05, "loss": 0.012, "step": 1290 }, { "epoch": 4.234527687296417, "grad_norm": 0.25102487206459045, "learning_rate": 1.237349021705243e-05, "loss": 0.0158, "step": 1300 }, { "epoch": 4.267100977198697, "grad_norm": 0.15461167693138123, "learning_rate": 1.1354330902620636e-05, "loss": 0.0126, "step": 1310 }, { "epoch": 4.299674267100977, "grad_norm": 0.24122057855129242, "learning_rate": 1.0376439991724096e-05, "loss": 0.0168, "step": 1320 }, { "epoch": 4.3322475570032575, "grad_norm": 0.14669205248355865, "learning_rate": 9.440272734993072e-06, "loss": 0.0179, "step": 1330 }, { "epoch": 4.364820846905538, "grad_norm": 0.32440969347953796, "learning_rate": 8.546264958887219e-06, "loss": 0.0197, "step": 1340 }, { "epoch": 4.397394136807818, "grad_norm": 0.14456795156002045, "learning_rate": 7.694832862799505e-06, "loss": 0.0111, "step": 1350 }, { "epoch": 4.429967426710098, "grad_norm": 0.17956456542015076, "learning_rate": 6.886372825297349e-06, "loss": 0.0085, "step": 1360 }, { "epoch": 4.462540716612378, "grad_norm": 0.30424752831459045, "learning_rate": 6.12126121959119e-06, "loss": 0.0207, "step": 1370 }, { "epoch": 4.495114006514658, "grad_norm": 0.18671758472919464, "learning_rate": 5.399854238316437e-06, "loss": 0.013, "step": 1380 }, { "epoch": 4.527687296416938, "grad_norm": 0.3565406799316406, "learning_rate": 4.722487727710368e-06, "loss": 0.0165, "step": 1390 }, { "epoch": 4.5602605863192185, "grad_norm": 0.26344749331474304, "learning_rate": 4.089477031261113e-06, "loss": 0.0148, "step": 1400 }, { "epoch": 4.592833876221499, "grad_norm": 0.18339155614376068, "learning_rate": 3.5011168429016083e-06, "loss": 0.0195, "step": 1410 }, { "epoch": 4.625407166123779, "grad_norm": 0.2598022222518921, "learning_rate": 2.95768106981672e-06, "loss": 0.0135, "step": 1420 }, { "epoch": 4.657980456026059, "grad_norm": 0.3853515684604645, "learning_rate": 2.4594227049277386e-06, "loss": 0.0177, "step": 1430 }, { "epoch": 4.690553745928339, "grad_norm": 0.13664180040359497, "learning_rate": 2.006573709112991e-06, "loss": 0.0086, "step": 1440 }, { "epoch": 4.723127035830619, "grad_norm": 0.1015399917960167, "learning_rate": 1.5993449032201458e-06, "loss": 0.0116, "step": 1450 }, { "epoch": 4.755700325732899, "grad_norm": 0.18885648250579834, "learning_rate": 1.237925869919887e-06, "loss": 0.0175, "step": 1460 }, { "epoch": 4.7882736156351795, "grad_norm": 0.18131224811077118, "learning_rate": 9.224848654469931e-07, "loss": 0.0088, "step": 1470 }, { "epoch": 4.82084690553746, "grad_norm": 0.194551482796669, "learning_rate": 6.531687412697496e-07, "loss": 0.014, "step": 1480 }, { "epoch": 4.85342019543974, "grad_norm": 0.23798178136348724, "learning_rate": 4.3010287572422537e-07, "loss": 0.0097, "step": 1490 }, { "epoch": 4.88599348534202, "grad_norm": 0.141094371676445, "learning_rate": 2.5339111564521844e-07, "loss": 0.0151, "step": 1500 }, { "epoch": 4.918566775244299, "grad_norm": 0.17839759588241577, "learning_rate": 1.2311572802105043e-07, "loss": 0.0097, "step": 1510 }, { "epoch": 4.95114006514658, "grad_norm": 0.3124100863933563, "learning_rate": 3.933736169471347e-08, "loss": 0.0098, "step": 1520 }, { "epoch": 4.9837133550488595, "grad_norm": 0.17664480209350586, "learning_rate": 2.0950191292112842e-09, "loss": 0.0142, "step": 1530 }, { "epoch": 4.993485342019544, "step": 1533, "total_flos": 5.185032946443418e+16, "train_loss": 0.05182663513868756, "train_runtime": 722.6992, "train_samples_per_second": 33.939, "train_steps_per_second": 2.121 } ], "logging_steps": 10, "max_steps": 1533, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.185032946443418e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }