{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.993485342019544,
  "eval_steps": 500,
  "global_step": 1533,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03257328990228013,
      "grad_norm": 10.128602027893066,
      "learning_rate": 2.5974025974025972e-05,
      "loss": 1.7762,
      "step": 10
    },
    {
      "epoch": 0.06514657980456026,
      "grad_norm": 11.557284355163574,
      "learning_rate": 5.1948051948051944e-05,
      "loss": 0.5874,
      "step": 20
    },
    {
      "epoch": 0.09771986970684039,
      "grad_norm": 1.7021989822387695,
      "learning_rate": 7.792207792207793e-05,
      "loss": 0.2361,
      "step": 30
    },
    {
      "epoch": 0.13029315960912052,
      "grad_norm": 2.088670015335083,
      "learning_rate": 0.00010389610389610389,
      "loss": 0.176,
      "step": 40
    },
    {
      "epoch": 0.16286644951140064,
      "grad_norm": 2.070906400680542,
      "learning_rate": 0.00012987012987012987,
      "loss": 0.1546,
      "step": 50
    },
    {
      "epoch": 0.19543973941368079,
      "grad_norm": 1.9704210758209229,
      "learning_rate": 0.00015584415584415587,
      "loss": 0.1463,
      "step": 60
    },
    {
      "epoch": 0.2280130293159609,
      "grad_norm": 1.2319754362106323,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.1306,
      "step": 70
    },
    {
      "epoch": 0.26058631921824105,
      "grad_norm": 1.4384595155715942,
      "learning_rate": 0.0001999979049808708,
      "loss": 0.1218,
      "step": 80
    },
    {
      "epoch": 0.2931596091205212,
      "grad_norm": 1.3861066102981567,
      "learning_rate": 0.00019996066263830531,
      "loss": 0.1031,
      "step": 90
    },
    {
      "epoch": 0.3257328990228013,
      "grad_norm": 1.3069312572479248,
      "learning_rate": 0.00019987688427197897,
      "loss": 0.1125,
      "step": 100
    },
    {
      "epoch": 0.3583061889250814,
      "grad_norm": 1.1740531921386719,
      "learning_rate": 0.0001997466088843548,
      "loss": 0.0984,
      "step": 110
    },
    {
      "epoch": 0.39087947882736157,
      "grad_norm": 0.831253170967102,
      "learning_rate": 0.00019956989712427577,
      "loss": 0.1013,
      "step": 120
    },
    {
      "epoch": 0.4234527687296417,
      "grad_norm": 1.1512802839279175,
      "learning_rate": 0.0001993468312587303,
      "loss": 0.0827,
      "step": 130
    },
    {
      "epoch": 0.4560260586319218,
      "grad_norm": 0.7890095710754395,
      "learning_rate": 0.00019907751513455302,
      "loss": 0.0747,
      "step": 140
    },
    {
      "epoch": 0.48859934853420195,
      "grad_norm": 0.8635206818580627,
      "learning_rate": 0.00019876207413008015,
      "loss": 0.0911,
      "step": 150
    },
    {
      "epoch": 0.5211726384364821,
      "grad_norm": 0.984118640422821,
      "learning_rate": 0.00019840065509677988,
      "loss": 0.0708,
      "step": 160
    },
    {
      "epoch": 0.5537459283387622,
      "grad_norm": 1.1899775266647339,
      "learning_rate": 0.00019799342629088702,
      "loss": 0.0759,
      "step": 170
    },
    {
      "epoch": 0.5863192182410424,
      "grad_norm": 0.8736202120780945,
      "learning_rate": 0.00019754057729507227,
      "loss": 0.0747,
      "step": 180
    },
    {
      "epoch": 0.6188925081433225,
      "grad_norm": 0.6165608763694763,
      "learning_rate": 0.0001970423189301833,
      "loss": 0.0697,
      "step": 190
    },
    {
      "epoch": 0.6514657980456026,
      "grad_norm": 1.2145845890045166,
      "learning_rate": 0.00019649888315709843,
      "loss": 0.0768,
      "step": 200
    },
    {
      "epoch": 0.6840390879478827,
      "grad_norm": 0.7146447896957397,
      "learning_rate": 0.00019591052296873888,
      "loss": 0.0575,
      "step": 210
    },
    {
      "epoch": 0.7166123778501629,
      "grad_norm": 1.0241068601608276,
      "learning_rate": 0.00019527751227228963,
      "loss": 0.0688,
      "step": 220
    },
    {
      "epoch": 0.749185667752443,
      "grad_norm": 0.8579204678535461,
      "learning_rate": 0.00019460014576168358,
      "loss": 0.0544,
      "step": 230
    },
    {
      "epoch": 0.7817589576547231,
      "grad_norm": 0.5284867286682129,
      "learning_rate": 0.0001938787387804088,
      "loss": 0.0516,
      "step": 240
    },
    {
      "epoch": 0.8143322475570033,
      "grad_norm": 0.6213068962097168,
      "learning_rate": 0.00019311362717470268,
      "loss": 0.0501,
      "step": 250
    },
    {
      "epoch": 0.8469055374592834,
      "grad_norm": 0.7320097088813782,
      "learning_rate": 0.00019230516713720052,
      "loss": 0.0581,
      "step": 260
    },
    {
      "epoch": 0.8794788273615635,
      "grad_norm": 0.36288219690322876,
      "learning_rate": 0.00019145373504111279,
      "loss": 0.0478,
      "step": 270
    },
    {
      "epoch": 0.9120521172638436,
      "grad_norm": 0.6096987724304199,
      "learning_rate": 0.00019055972726500695,
      "loss": 0.0548,
      "step": 280
    },
    {
      "epoch": 0.9446254071661238,
      "grad_norm": 0.7391396760940552,
      "learning_rate": 0.0001896235600082759,
      "loss": 0.0444,
      "step": 290
    },
    {
      "epoch": 0.9771986970684039,
      "grad_norm": 0.7820376753807068,
      "learning_rate": 0.00018864566909737937,
      "loss": 0.0566,
      "step": 300
    },
    {
      "epoch": 1.009771986970684,
      "grad_norm": 0.3828499913215637,
      "learning_rate": 0.00018762650978294758,
      "loss": 0.0533,
      "step": 310
    },
    {
      "epoch": 1.0423452768729642,
      "grad_norm": 0.49720415472984314,
      "learning_rate": 0.0001865665565278424,
      "loss": 0.0582,
      "step": 320
    },
    {
      "epoch": 1.0749185667752443,
      "grad_norm": 0.4968441128730774,
      "learning_rate": 0.00018546630278627437,
      "loss": 0.0473,
      "step": 330
    },
    {
      "epoch": 1.1074918566775245,
      "grad_norm": 0.812972366809845,
      "learning_rate": 0.00018432626077407829,
      "loss": 0.0382,
      "step": 340
    },
    {
      "epoch": 1.1400651465798046,
      "grad_norm": 0.4055006802082062,
      "learning_rate": 0.00018314696123025454,
      "loss": 0.043,
      "step": 350
    },
    {
      "epoch": 1.1726384364820848,
      "grad_norm": 0.6709649562835693,
      "learning_rate": 0.0001819289531698871,
      "loss": 0.0343,
      "step": 360
    },
    {
      "epoch": 1.205211726384365,
      "grad_norm": 0.6343613862991333,
      "learning_rate": 0.0001806728036285532,
      "loss": 0.0388,
      "step": 370
    },
    {
      "epoch": 1.237785016286645,
      "grad_norm": 0.44080787897109985,
      "learning_rate": 0.00017937909739834367,
      "loss": 0.0419,
      "step": 380
    },
    {
      "epoch": 1.2703583061889252,
      "grad_norm": 0.714772641658783,
      "learning_rate": 0.00017804843675561677,
      "loss": 0.0476,
      "step": 390
    },
    {
      "epoch": 1.3029315960912053,
      "grad_norm": 0.4127221703529358,
      "learning_rate": 0.00017668144118061262,
      "loss": 0.0354,
      "step": 400
    },
    {
      "epoch": 1.3355048859934853,
      "grad_norm": 0.44271162152290344,
      "learning_rate": 0.00017527874706905805,
      "loss": 0.0385,
      "step": 410
    },
    {
      "epoch": 1.3680781758957654,
      "grad_norm": 0.6225533485412598,
      "learning_rate": 0.00017384100743589697,
      "loss": 0.0665,
      "step": 420
    },
    {
      "epoch": 1.4006514657980456,
      "grad_norm": 0.4418353736400604,
      "learning_rate": 0.0001723688916112835,
      "loss": 0.0554,
      "step": 430
    },
    {
      "epoch": 1.4332247557003257,
      "grad_norm": 0.3992595672607422,
      "learning_rate": 0.00017086308492897983,
      "loss": 0.0522,
      "step": 440
    },
    {
      "epoch": 1.4657980456026058,
      "grad_norm": 0.45478880405426025,
      "learning_rate": 0.0001693242884073035,
      "loss": 0.0398,
      "step": 450
    },
    {
      "epoch": 1.498371335504886,
      "grad_norm": 0.5688516497612,
      "learning_rate": 0.00016775321842277312,
      "loss": 0.032,
      "step": 460
    },
    {
      "epoch": 1.5309446254071661,
      "grad_norm": 0.4958943724632263,
      "learning_rate": 0.00016615060637660388,
      "loss": 0.0422,
      "step": 470
    },
    {
      "epoch": 1.5635179153094463,
      "grad_norm": 0.4975818991661072,
      "learning_rate": 0.00016451719835420877,
      "loss": 0.0317,
      "step": 480
    },
    {
      "epoch": 1.5960912052117264,
      "grad_norm": 0.5288175344467163,
      "learning_rate": 0.00016285375477786322,
      "loss": 0.0412,
      "step": 490
    },
    {
      "epoch": 1.6286644951140063,
      "grad_norm": 0.6416569352149963,
      "learning_rate": 0.0001611610500526957,
      "loss": 0.0275,
      "step": 500
    },
    {
      "epoch": 1.6612377850162865,
      "grad_norm": 0.42349156737327576,
      "learning_rate": 0.00015943987220616855,
      "loss": 0.029,
      "step": 510
    },
    {
      "epoch": 1.6938110749185666,
      "grad_norm": 0.2572161853313446,
      "learning_rate": 0.00015769102252121702,
      "loss": 0.0308,
      "step": 520
    },
    {
      "epoch": 1.7263843648208468,
      "grad_norm": 0.5944684147834778,
      "learning_rate": 0.0001559153151632171,
      "loss": 0.0354,
      "step": 530
    },
    {
      "epoch": 1.758957654723127,
      "grad_norm": 0.5154451727867126,
      "learning_rate": 0.0001541135768009566,
      "loss": 0.0424,
      "step": 540
    },
    {
      "epoch": 1.791530944625407,
      "grad_norm": 0.5298171043395996,
      "learning_rate": 0.00015228664622178467,
      "loss": 0.0263,
      "step": 550
    },
    {
      "epoch": 1.8241042345276872,
      "grad_norm": 0.3582160472869873,
      "learning_rate": 0.00015043537394112007,
      "loss": 0.0386,
      "step": 560
    },
    {
      "epoch": 1.8566775244299674,
      "grad_norm": 0.6350813508033752,
      "learning_rate": 0.0001485606218064993,
      "loss": 0.0444,
      "step": 570
    },
    {
      "epoch": 1.8892508143322475,
      "grad_norm": 0.6265957355499268,
      "learning_rate": 0.00014666326259634918,
      "loss": 0.0417,
      "step": 580
    },
    {
      "epoch": 1.9218241042345277,
      "grad_norm": 0.41339483857154846,
      "learning_rate": 0.00014474417961367065,
      "loss": 0.0309,
      "step": 590
    },
    {
      "epoch": 1.9543973941368078,
      "grad_norm": 0.41287094354629517,
      "learning_rate": 0.000142804266274823,
      "loss": 0.0308,
      "step": 600
    },
    {
      "epoch": 1.986970684039088,
      "grad_norm": 0.5833337903022766,
      "learning_rate": 0.00014084442569359964,
      "loss": 0.0346,
      "step": 610
    },
    {
      "epoch": 2.019543973941368,
      "grad_norm": 0.3714321255683899,
      "learning_rate": 0.00013886557026078955,
      "loss": 0.0337,
      "step": 620
    },
    {
      "epoch": 2.0521172638436482,
      "grad_norm": 0.2712138295173645,
      "learning_rate": 0.0001368686212194199,
      "loss": 0.0196,
      "step": 630
    },
    {
      "epoch": 2.0846905537459284,
      "grad_norm": 0.6208952069282532,
      "learning_rate": 0.00013485450823587725,
      "loss": 0.0288,
      "step": 640
    },
    {
      "epoch": 2.1172638436482085,
      "grad_norm": 0.34229573607444763,
      "learning_rate": 0.00013282416896710778,
      "loss": 0.0246,
      "step": 650
    },
    {
      "epoch": 2.1498371335504887,
      "grad_norm": 0.4573960304260254,
      "learning_rate": 0.00013077854862409696,
      "loss": 0.0249,
      "step": 660
    },
    {
      "epoch": 2.182410423452769,
      "grad_norm": 0.3675532042980194,
      "learning_rate": 0.0001287185995318333,
      "loss": 0.022,
      "step": 670
    },
    {
      "epoch": 2.214983713355049,
      "grad_norm": 0.5372172594070435,
      "learning_rate": 0.0001266452806859594,
      "loss": 0.0343,
      "step": 680
    },
    {
      "epoch": 2.247557003257329,
      "grad_norm": 0.46904900670051575,
      "learning_rate": 0.00012455955730631804,
      "loss": 0.0233,
      "step": 690
    },
    {
      "epoch": 2.2801302931596092,
      "grad_norm": 0.4394093155860901,
      "learning_rate": 0.00012246240038760043,
      "loss": 0.0209,
      "step": 700
    },
    {
      "epoch": 2.3127035830618894,
      "grad_norm": 0.3127492368221283,
      "learning_rate": 0.00012035478624730608,
      "loss": 0.0321,
      "step": 710
    },
    {
      "epoch": 2.3452768729641695,
      "grad_norm": 0.601370096206665,
      "learning_rate": 0.00011823769607122479,
      "loss": 0.0243,
      "step": 720
    },
    {
      "epoch": 2.3778501628664497,
      "grad_norm": 0.5871070623397827,
      "learning_rate": 0.00011611211545665184,
      "loss": 0.0337,
      "step": 730
    },
    {
      "epoch": 2.41042345276873,
      "grad_norm": 0.3546801805496216,
      "learning_rate": 0.00011397903395354996,
      "loss": 0.0288,
      "step": 740
    },
    {
      "epoch": 2.44299674267101,
      "grad_norm": 0.8319407105445862,
      "learning_rate": 0.0001118394446038708,
      "loss": 0.0337,
      "step": 750
    },
    {
      "epoch": 2.47557003257329,
      "grad_norm": 0.5210663080215454,
      "learning_rate": 0.00010969434347925076,
      "loss": 0.026,
      "step": 760
    },
    {
      "epoch": 2.5081433224755703,
      "grad_norm": 0.5834184288978577,
      "learning_rate": 0.00010754472921729661,
      "loss": 0.0282,
      "step": 770
    },
    {
      "epoch": 2.5407166123778504,
      "grad_norm": 0.42890864610671997,
      "learning_rate": 0.00010539160255667623,
      "loss": 0.028,
      "step": 780
    },
    {
      "epoch": 2.5732899022801305,
      "grad_norm": 0.4473400413990021,
      "learning_rate": 0.00010323596587123145,
      "loss": 0.025,
      "step": 790
    },
    {
      "epoch": 2.6058631921824107,
      "grad_norm": 0.5189303159713745,
      "learning_rate": 0.00010107882270332952,
      "loss": 0.0293,
      "step": 800
    },
    {
      "epoch": 2.6384364820846904,
      "grad_norm": 0.43365001678466797,
      "learning_rate": 9.892117729667052e-05,
      "loss": 0.0175,
      "step": 810
    },
    {
      "epoch": 2.6710097719869705,
      "grad_norm": 0.28346696496009827,
      "learning_rate": 9.676403412876856e-05,
      "loss": 0.0334,
      "step": 820
    },
    {
      "epoch": 2.7035830618892507,
      "grad_norm": 0.3956477642059326,
      "learning_rate": 9.460839744332378e-05,
      "loss": 0.0271,
      "step": 830
    },
    {
      "epoch": 2.736156351791531,
      "grad_norm": 0.30705949664115906,
      "learning_rate": 9.245527078270341e-05,
      "loss": 0.0217,
      "step": 840
    },
    {
      "epoch": 2.768729641693811,
      "grad_norm": 0.40188854932785034,
      "learning_rate": 9.030565652074926e-05,
      "loss": 0.019,
      "step": 850
    },
    {
      "epoch": 2.801302931596091,
      "grad_norm": 0.3447129428386688,
      "learning_rate": 8.816055539612924e-05,
      "loss": 0.028,
      "step": 860
    },
    {
      "epoch": 2.8338762214983713,
      "grad_norm": 0.38768622279167175,
      "learning_rate": 8.602096604645009e-05,
      "loss": 0.0218,
      "step": 870
    },
    {
      "epoch": 2.8664495114006514,
      "grad_norm": 0.26912721991539,
      "learning_rate": 8.388788454334817e-05,
      "loss": 0.0173,
      "step": 880
    },
    {
      "epoch": 2.8990228013029316,
      "grad_norm": 0.33078861236572266,
      "learning_rate": 8.176230392877523e-05,
      "loss": 0.0233,
      "step": 890
    },
    {
      "epoch": 2.9315960912052117,
      "grad_norm": 0.24832488596439362,
      "learning_rate": 7.964521375269396e-05,
      "loss": 0.0171,
      "step": 900
    },
    {
      "epoch": 2.964169381107492,
      "grad_norm": 0.6595136523246765,
      "learning_rate": 7.753759961239964e-05,
      "loss": 0.0272,
      "step": 910
    },
    {
      "epoch": 2.996742671009772,
      "grad_norm": 0.2780207097530365,
      "learning_rate": 7.544044269368197e-05,
      "loss": 0.0338,
      "step": 920
    },
    {
      "epoch": 3.029315960912052,
      "grad_norm": 0.7173179388046265,
      "learning_rate": 7.335471931404063e-05,
      "loss": 0.0365,
      "step": 930
    },
    {
      "epoch": 3.0618892508143323,
      "grad_norm": 0.33753442764282227,
      "learning_rate": 7.128140046816671e-05,
      "loss": 0.0195,
      "step": 940
    },
    {
      "epoch": 3.0944625407166124,
      "grad_norm": 0.35064950585365295,
      "learning_rate": 6.922145137590306e-05,
      "loss": 0.02,
      "step": 950
    },
    {
      "epoch": 3.1270358306188926,
      "grad_norm": 0.39598166942596436,
      "learning_rate": 6.717583103289229e-05,
      "loss": 0.0203,
      "step": 960
    },
    {
      "epoch": 3.1596091205211727,
      "grad_norm": 0.18257524073123932,
      "learning_rate": 6.514549176412275e-05,
      "loss": 0.0134,
      "step": 970
    },
    {
      "epoch": 3.192182410423453,
      "grad_norm": 0.4458347260951996,
      "learning_rate": 6.313137878058013e-05,
      "loss": 0.0236,
      "step": 980
    },
    {
      "epoch": 3.224755700325733,
      "grad_norm": 0.22742605209350586,
      "learning_rate": 6.113442973921046e-05,
      "loss": 0.0208,
      "step": 990
    },
    {
      "epoch": 3.257328990228013,
      "grad_norm": 0.1858537793159485,
      "learning_rate": 5.9155574306400395e-05,
      "loss": 0.0218,
      "step": 1000
    },
    {
      "epoch": 3.2899022801302933,
      "grad_norm": 0.24626286327838898,
      "learning_rate": 5.7195733725176994e-05,
      "loss": 0.0232,
      "step": 1010
    },
    {
      "epoch": 3.3224755700325734,
      "grad_norm": 0.2719153165817261,
      "learning_rate": 5.525582038632934e-05,
      "loss": 0.0148,
      "step": 1020
    },
    {
      "epoch": 3.3550488599348536,
      "grad_norm": 0.218730166554451,
      "learning_rate": 5.333673740365083e-05,
      "loss": 0.0157,
      "step": 1030
    },
    {
      "epoch": 3.3876221498371337,
      "grad_norm": 0.20292945206165314,
      "learning_rate": 5.1439378193500707e-05,
      "loss": 0.0143,
      "step": 1040
    },
    {
      "epoch": 3.420195439739414,
      "grad_norm": 0.2846449017524719,
      "learning_rate": 4.956462605887994e-05,
      "loss": 0.0177,
      "step": 1050
    },
    {
      "epoch": 3.4527687296416936,
      "grad_norm": 0.322721391916275,
      "learning_rate": 4.771335377821535e-05,
      "loss": 0.0224,
      "step": 1060
    },
    {
      "epoch": 3.4853420195439737,
      "grad_norm": 0.1719449758529663,
      "learning_rate": 4.588642319904343e-05,
      "loss": 0.0234,
      "step": 1070
    },
    {
      "epoch": 3.517915309446254,
      "grad_norm": 0.44704851508140564,
      "learning_rate": 4.408468483678293e-05,
      "loss": 0.019,
      "step": 1080
    },
    {
      "epoch": 3.550488599348534,
      "grad_norm": 0.4159814417362213,
      "learning_rate": 4.230897747878303e-05,
      "loss": 0.0156,
      "step": 1090
    },
    {
      "epoch": 3.583061889250814,
      "grad_norm": 0.19604472815990448,
      "learning_rate": 4.056012779383145e-05,
      "loss": 0.0158,
      "step": 1100
    },
    {
      "epoch": 3.6156351791530943,
      "grad_norm": 0.19116809964179993,
      "learning_rate": 3.883894994730428e-05,
      "loss": 0.0174,
      "step": 1110
    },
    {
      "epoch": 3.6482084690553744,
      "grad_norm": 0.3637801706790924,
      "learning_rate": 3.714624522213681e-05,
      "loss": 0.0162,
      "step": 1120
    },
    {
      "epoch": 3.6807817589576546,
      "grad_norm": 0.1877295821905136,
      "learning_rate": 3.548280164579126e-05,
      "loss": 0.0142,
      "step": 1130
    },
    {
      "epoch": 3.7133550488599347,
      "grad_norm": 0.1830226182937622,
      "learning_rate": 3.384939362339614e-05,
      "loss": 0.0119,
      "step": 1140
    },
    {
      "epoch": 3.745928338762215,
      "grad_norm": 0.15163740515708923,
      "learning_rate": 3.224678157722689e-05,
      "loss": 0.0181,
      "step": 1150
    },
    {
      "epoch": 3.778501628664495,
      "grad_norm": 0.2479788213968277,
      "learning_rate": 3.067571159269651e-05,
      "loss": 0.0138,
      "step": 1160
    },
    {
      "epoch": 3.811074918566775,
      "grad_norm": 0.6171669960021973,
      "learning_rate": 2.913691507102019e-05,
      "loss": 0.0197,
      "step": 1170
    },
    {
      "epoch": 3.8436482084690553,
      "grad_norm": 0.18519634008407593,
      "learning_rate": 2.763110838871651e-05,
      "loss": 0.0137,
      "step": 1180
    },
    {
      "epoch": 3.8762214983713354,
      "grad_norm": 0.26303982734680176,
      "learning_rate": 2.6158992564103058e-05,
      "loss": 0.0172,
      "step": 1190
    },
    {
      "epoch": 3.9087947882736156,
      "grad_norm": 0.28331807255744934,
      "learning_rate": 2.4721252930941974e-05,
      "loss": 0.0168,
      "step": 1200
    },
    {
      "epoch": 3.9413680781758957,
      "grad_norm": 0.20530906319618225,
      "learning_rate": 2.3318558819387404e-05,
      "loss": 0.0199,
      "step": 1210
    },
    {
      "epoch": 3.973941368078176,
      "grad_norm": 0.16924133896827698,
      "learning_rate": 2.1951563244383233e-05,
      "loss": 0.0146,
      "step": 1220
    },
    {
      "epoch": 4.006514657980456,
      "grad_norm": 0.13186028599739075,
      "learning_rate": 2.0620902601656345e-05,
      "loss": 0.0124,
      "step": 1230
    },
    {
      "epoch": 4.039087947882736,
      "grad_norm": 0.24360792338848114,
      "learning_rate": 1.9327196371446776e-05,
      "loss": 0.0119,
      "step": 1240
    },
    {
      "epoch": 4.071661237785016,
      "grad_norm": 0.09876150637865067,
      "learning_rate": 1.807104683011289e-05,
      "loss": 0.012,
      "step": 1250
    },
    {
      "epoch": 4.1042345276872965,
      "grad_norm": 0.2283184826374054,
      "learning_rate": 1.6853038769745467e-05,
      "loss": 0.0142,
      "step": 1260
    },
    {
      "epoch": 4.136807817589577,
      "grad_norm": 0.32383596897125244,
      "learning_rate": 1.5673739225921758e-05,
      "loss": 0.012,
      "step": 1270
    },
    {
      "epoch": 4.169381107491857,
      "grad_norm": 0.2783248722553253,
      "learning_rate": 1.4533697213725662e-05,
      "loss": 0.0163,
      "step": 1280
    },
    {
      "epoch": 4.201954397394137,
      "grad_norm": 0.17678265273571014,
      "learning_rate": 1.3433443472157613e-05,
      "loss": 0.012,
      "step": 1290
    },
    {
      "epoch": 4.234527687296417,
      "grad_norm": 0.25102487206459045,
      "learning_rate": 1.237349021705243e-05,
      "loss": 0.0158,
      "step": 1300
    },
    {
      "epoch": 4.267100977198697,
      "grad_norm": 0.15461167693138123,
      "learning_rate": 1.1354330902620636e-05,
      "loss": 0.0126,
      "step": 1310
    },
    {
      "epoch": 4.299674267100977,
      "grad_norm": 0.24122057855129242,
      "learning_rate": 1.0376439991724096e-05,
      "loss": 0.0168,
      "step": 1320
    },
    {
      "epoch": 4.3322475570032575,
      "grad_norm": 0.14669205248355865,
      "learning_rate": 9.440272734993072e-06,
      "loss": 0.0179,
      "step": 1330
    },
    {
      "epoch": 4.364820846905538,
      "grad_norm": 0.32440969347953796,
      "learning_rate": 8.546264958887219e-06,
      "loss": 0.0197,
      "step": 1340
    },
    {
      "epoch": 4.397394136807818,
      "grad_norm": 0.14456795156002045,
      "learning_rate": 7.694832862799505e-06,
      "loss": 0.0111,
      "step": 1350
    },
    {
      "epoch": 4.429967426710098,
      "grad_norm": 0.17956456542015076,
      "learning_rate": 6.886372825297349e-06,
      "loss": 0.0085,
      "step": 1360
    },
    {
      "epoch": 4.462540716612378,
      "grad_norm": 0.30424752831459045,
      "learning_rate": 6.12126121959119e-06,
      "loss": 0.0207,
      "step": 1370
    },
    {
      "epoch": 4.495114006514658,
      "grad_norm": 0.18671758472919464,
      "learning_rate": 5.399854238316437e-06,
      "loss": 0.013,
      "step": 1380
    },
    {
      "epoch": 4.527687296416938,
      "grad_norm": 0.3565406799316406,
      "learning_rate": 4.722487727710368e-06,
      "loss": 0.0165,
      "step": 1390
    },
    {
      "epoch": 4.5602605863192185,
      "grad_norm": 0.26344749331474304,
      "learning_rate": 4.089477031261113e-06,
      "loss": 0.0148,
      "step": 1400
    },
    {
      "epoch": 4.592833876221499,
      "grad_norm": 0.18339155614376068,
      "learning_rate": 3.5011168429016083e-06,
      "loss": 0.0195,
      "step": 1410
    },
    {
      "epoch": 4.625407166123779,
      "grad_norm": 0.2598022222518921,
      "learning_rate": 2.95768106981672e-06,
      "loss": 0.0135,
      "step": 1420
    },
    {
      "epoch": 4.657980456026059,
      "grad_norm": 0.3853515684604645,
      "learning_rate": 2.4594227049277386e-06,
      "loss": 0.0177,
      "step": 1430
    },
    {
      "epoch": 4.690553745928339,
      "grad_norm": 0.13664180040359497,
      "learning_rate": 2.006573709112991e-06,
      "loss": 0.0086,
      "step": 1440
    },
    {
      "epoch": 4.723127035830619,
      "grad_norm": 0.1015399917960167,
      "learning_rate": 1.5993449032201458e-06,
      "loss": 0.0116,
      "step": 1450
    },
    {
      "epoch": 4.755700325732899,
      "grad_norm": 0.18885648250579834,
      "learning_rate": 1.237925869919887e-06,
      "loss": 0.0175,
      "step": 1460
    },
    {
      "epoch": 4.7882736156351795,
      "grad_norm": 0.18131224811077118,
      "learning_rate": 9.224848654469931e-07,
      "loss": 0.0088,
      "step": 1470
    },
    {
      "epoch": 4.82084690553746,
      "grad_norm": 0.194551482796669,
      "learning_rate": 6.531687412697496e-07,
      "loss": 0.014,
      "step": 1480
    },
    {
      "epoch": 4.85342019543974,
      "grad_norm": 0.23798178136348724,
      "learning_rate": 4.3010287572422537e-07,
      "loss": 0.0097,
      "step": 1490
    },
    {
      "epoch": 4.88599348534202,
      "grad_norm": 0.141094371676445,
      "learning_rate": 2.5339111564521844e-07,
      "loss": 0.0151,
      "step": 1500
    },
    {
      "epoch": 4.918566775244299,
      "grad_norm": 0.17839759588241577,
      "learning_rate": 1.2311572802105043e-07,
      "loss": 0.0097,
      "step": 1510
    },
    {
      "epoch": 4.95114006514658,
      "grad_norm": 0.3124100863933563,
      "learning_rate": 3.933736169471347e-08,
      "loss": 0.0098,
      "step": 1520
    },
    {
      "epoch": 4.9837133550488595,
      "grad_norm": 0.17664480209350586,
      "learning_rate": 2.0950191292112842e-09,
      "loss": 0.0142,
      "step": 1530
    },
    {
      "epoch": 4.993485342019544,
      "step": 1533,
      "total_flos": 5.185032946443418e+16,
      "train_loss": 0.05182663513868756,
      "train_runtime": 722.6992,
      "train_samples_per_second": 33.939,
      "train_steps_per_second": 2.121
    }
  ],
  "logging_steps": 10,
  "max_steps": 1533,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.185032946443418e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}