{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.990259740259741, "eval_steps": 500, "global_step": 1537, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032467532467532464, "grad_norm": 11.862669944763184, "learning_rate": 2.5974025974025972e-05, "loss": 2.2746, "step": 10 }, { "epoch": 0.06493506493506493, "grad_norm": 14.773530006408691, "learning_rate": 5.1948051948051944e-05, "loss": 0.6306, "step": 20 }, { "epoch": 0.09740259740259741, "grad_norm": 2.444714069366455, "learning_rate": 7.792207792207793e-05, "loss": 0.2859, "step": 30 }, { "epoch": 0.12987012987012986, "grad_norm": 1.5860096216201782, "learning_rate": 0.00010389610389610389, "loss": 0.1714, "step": 40 }, { "epoch": 0.16233766233766234, "grad_norm": 1.3271065950393677, "learning_rate": 0.00012987012987012987, "loss": 0.1621, "step": 50 }, { "epoch": 0.19480519480519481, "grad_norm": 1.9922699928283691, "learning_rate": 0.00015584415584415587, "loss": 0.164, "step": 60 }, { "epoch": 0.22727272727272727, "grad_norm": 1.0023343563079834, "learning_rate": 0.00018181818181818183, "loss": 0.1308, "step": 70 }, { "epoch": 0.2597402597402597, "grad_norm": 0.9265744686126709, "learning_rate": 0.00019999791644466247, "loss": 0.1217, "step": 80 }, { "epoch": 0.2922077922077922, "grad_norm": 1.0755594968795776, "learning_rate": 0.00019996087787618624, "loss": 0.1126, "step": 90 }, { "epoch": 0.3246753246753247, "grad_norm": 1.6246726512908936, "learning_rate": 0.00019987755781707668, "loss": 0.1066, "step": 100 }, { "epoch": 0.35714285714285715, "grad_norm": 1.3438688516616821, "learning_rate": 0.00019974799484419106, "loss": 0.0931, "step": 110 }, { "epoch": 0.38961038961038963, "grad_norm": 1.0927611589431763, "learning_rate": 0.0001995722489446729, "loss": 0.0955, "step": 120 }, { "epoch": 0.42207792207792205, "grad_norm": 0.5384888648986816, "learning_rate": 0.0001993504014881777, "loss": 0.0543, "step": 130 }, { "epoch": 0.45454545454545453, "grad_norm": 1.163609504699707, "learning_rate": 0.0001990825551891994, "loss": 0.0649, "step": 140 }, { "epoch": 0.487012987012987, "grad_norm": 1.2434951066970825, "learning_rate": 0.00019876883405951377, "loss": 0.0798, "step": 150 }, { "epoch": 0.5194805194805194, "grad_norm": 0.865424633026123, "learning_rate": 0.00019840938335076162, "loss": 0.082, "step": 160 }, { "epoch": 0.551948051948052, "grad_norm": 0.8742974400520325, "learning_rate": 0.00019800436948719775, "loss": 0.0725, "step": 170 }, { "epoch": 0.5844155844155844, "grad_norm": 0.5820501446723938, "learning_rate": 0.0001975539799886372, "loss": 0.0602, "step": 180 }, { "epoch": 0.6168831168831169, "grad_norm": 0.6646164655685425, "learning_rate": 0.00019705842338363434, "loss": 0.0614, "step": 190 }, { "epoch": 0.6493506493506493, "grad_norm": 1.1510143280029297, "learning_rate": 0.00019651792911293508, "loss": 0.066, "step": 200 }, { "epoch": 0.6818181818181818, "grad_norm": 0.9399957060813904, "learning_rate": 0.0001959327474232464, "loss": 0.0627, "step": 210 }, { "epoch": 0.7142857142857143, "grad_norm": 0.9072800874710083, "learning_rate": 0.00019530314925137355, "loss": 0.0666, "step": 220 }, { "epoch": 0.7467532467532467, "grad_norm": 0.8320746421813965, "learning_rate": 0.00019462942609877696, "loss": 0.0518, "step": 230 }, { "epoch": 0.7792207792207793, "grad_norm": 0.820354163646698, "learning_rate": 0.00019391188989660848, "loss": 0.0857, "step": 240 }, { "epoch": 0.8116883116883117, "grad_norm": 0.9190356731414795, "learning_rate": 0.0001931508728612885, "loss": 0.0651, "step": 250 }, { "epoch": 0.8441558441558441, "grad_norm": 0.6117440462112427, "learning_rate": 0.0001923467273406908, "loss": 0.0526, "step": 260 }, { "epoch": 0.8766233766233766, "grad_norm": 0.8513985276222229, "learning_rate": 0.00019149982565100728, "loss": 0.0549, "step": 270 }, { "epoch": 0.9090909090909091, "grad_norm": 0.5364561080932617, "learning_rate": 0.00019061055990436665, "loss": 0.0612, "step": 280 }, { "epoch": 0.9415584415584416, "grad_norm": 0.8399089574813843, "learning_rate": 0.00018967934182728837, "loss": 0.0417, "step": 290 }, { "epoch": 0.974025974025974, "grad_norm": 0.7521604299545288, "learning_rate": 0.00018870660257005466, "loss": 0.0535, "step": 300 }, { "epoch": 1.0064935064935066, "grad_norm": 0.5698351263999939, "learning_rate": 0.00018769279250708976, "loss": 0.0489, "step": 310 }, { "epoch": 1.0389610389610389, "grad_norm": 0.6740209460258484, "learning_rate": 0.0001866383810284384, "loss": 0.0569, "step": 320 }, { "epoch": 1.0714285714285714, "grad_norm": 0.49634265899658203, "learning_rate": 0.00018554385632244, "loss": 0.0561, "step": 330 }, { "epoch": 1.103896103896104, "grad_norm": 0.6321184039115906, "learning_rate": 0.00018440972514969926, "loss": 0.0413, "step": 340 }, { "epoch": 1.1363636363636362, "grad_norm": 0.41812852025032043, "learning_rate": 0.00018323651260845832, "loss": 0.0411, "step": 350 }, { "epoch": 1.1688311688311688, "grad_norm": 0.3897482454776764, "learning_rate": 0.00018202476189147803, "loss": 0.0505, "step": 360 }, { "epoch": 1.2012987012987013, "grad_norm": 0.6317518353462219, "learning_rate": 0.00018077503403454216, "loss": 0.0387, "step": 370 }, { "epoch": 1.2337662337662338, "grad_norm": 0.49315449595451355, "learning_rate": 0.0001794879076566998, "loss": 0.0458, "step": 380 }, { "epoch": 1.2662337662337662, "grad_norm": 0.522301971912384, "learning_rate": 0.00017816397869236717, "loss": 0.0509, "step": 390 }, { "epoch": 1.2987012987012987, "grad_norm": 0.6012665629386902, "learning_rate": 0.00017680386011541222, "loss": 0.0379, "step": 400 }, { "epoch": 1.3311688311688312, "grad_norm": 0.783263087272644, "learning_rate": 0.0001754081816553504, "loss": 0.047, "step": 410 }, { "epoch": 1.3636363636363638, "grad_norm": 0.7606521248817444, "learning_rate": 0.00017397758950578207, "loss": 0.0733, "step": 420 }, { "epoch": 1.396103896103896, "grad_norm": 0.4706479012966156, "learning_rate": 0.00017251274602520766, "loss": 0.0556, "step": 430 }, { "epoch": 1.4285714285714286, "grad_norm": 0.5615910887718201, "learning_rate": 0.00017101432943035825, "loss": 0.0491, "step": 440 }, { "epoch": 1.4610389610389611, "grad_norm": 0.6203674674034119, "learning_rate": 0.0001694830334821838, "loss": 0.0414, "step": 450 }, { "epoch": 1.4935064935064934, "grad_norm": 0.6230823397636414, "learning_rate": 0.00016791956716464472, "loss": 0.0484, "step": 460 }, { "epoch": 1.525974025974026, "grad_norm": 0.49440866708755493, "learning_rate": 0.0001663246543564551, "loss": 0.056, "step": 470 }, { "epoch": 1.5584415584415585, "grad_norm": 0.5235504508018494, "learning_rate": 0.00016469903349592956, "loss": 0.036, "step": 480 }, { "epoch": 1.5909090909090908, "grad_norm": 0.7212300300598145, "learning_rate": 0.0001630434572390895, "loss": 0.0464, "step": 490 }, { "epoch": 1.6233766233766234, "grad_norm": 0.45477214455604553, "learning_rate": 0.00016135869211118603, "loss": 0.04, "step": 500 }, { "epoch": 1.655844155844156, "grad_norm": 0.6098753809928894, "learning_rate": 0.00015964551815180213, "loss": 0.0396, "step": 510 }, { "epoch": 1.6883116883116882, "grad_norm": 1.0758476257324219, "learning_rate": 0.00015790472855369716, "loss": 0.0567, "step": 520 }, { "epoch": 1.7207792207792207, "grad_norm": 0.8209078311920166, "learning_rate": 0.00015613712929556193, "loss": 0.0506, "step": 530 }, { "epoch": 1.7532467532467533, "grad_norm": 0.7815719246864319, "learning_rate": 0.00015434353876885362, "loss": 0.0474, "step": 540 }, { "epoch": 1.7857142857142856, "grad_norm": 0.3328838050365448, "learning_rate": 0.00015252478739888385, "loss": 0.0333, "step": 550 }, { "epoch": 1.8181818181818183, "grad_norm": 0.6883902549743652, "learning_rate": 0.0001506817172603351, "loss": 0.0342, "step": 560 }, { "epoch": 1.8506493506493507, "grad_norm": 0.4302406311035156, "learning_rate": 0.0001488151816873834, "loss": 0.0415, "step": 570 }, { "epoch": 1.883116883116883, "grad_norm": 0.5647683143615723, "learning_rate": 0.00014692604487860785, "loss": 0.0492, "step": 580 }, { "epoch": 1.9155844155844157, "grad_norm": 0.46459880471229553, "learning_rate": 0.00014501518149687042, "loss": 0.035, "step": 590 }, { "epoch": 1.948051948051948, "grad_norm": 0.49570703506469727, "learning_rate": 0.0001430834762643502, "loss": 0.0402, "step": 600 }, { "epoch": 1.9805194805194806, "grad_norm": 0.44045427441596985, "learning_rate": 0.00014113182355292078, "loss": 0.045, "step": 610 }, { "epoch": 2.012987012987013, "grad_norm": 0.22453299164772034, "learning_rate": 0.0001391611269700594, "loss": 0.0419, "step": 620 }, { "epoch": 2.0454545454545454, "grad_norm": 0.4719075560569763, "learning_rate": 0.00013717229894048038, "loss": 0.037, "step": 630 }, { "epoch": 2.0779220779220777, "grad_norm": 0.4710325598716736, "learning_rate": 0.0001351662602836861, "loss": 0.0368, "step": 640 }, { "epoch": 2.1103896103896105, "grad_norm": 0.5058510303497314, "learning_rate": 0.0001331439397876312, "loss": 0.0385, "step": 650 }, { "epoch": 2.142857142857143, "grad_norm": 0.5033309459686279, "learning_rate": 0.0001311062737786974, "loss": 0.0357, "step": 660 }, { "epoch": 2.175324675324675, "grad_norm": 0.36022046208381653, "learning_rate": 0.0001290542056881781, "loss": 0.0243, "step": 670 }, { "epoch": 2.207792207792208, "grad_norm": 0.3985787034034729, "learning_rate": 0.0001269886856154735, "loss": 0.0471, "step": 680 }, { "epoch": 2.24025974025974, "grad_norm": 0.47278791666030884, "learning_rate": 0.0001249106698881982, "loss": 0.0353, "step": 690 }, { "epoch": 2.2727272727272725, "grad_norm": 0.5603681802749634, "learning_rate": 0.0001228211206194055, "loss": 0.0324, "step": 700 }, { "epoch": 2.3051948051948052, "grad_norm": 0.28177714347839355, "learning_rate": 0.0001207210052621327, "loss": 0.0432, "step": 710 }, { "epoch": 2.3376623376623376, "grad_norm": 0.5803484320640564, "learning_rate": 0.00011861129616147418, "loss": 0.0322, "step": 720 }, { "epoch": 2.3701298701298703, "grad_norm": 0.298414409160614, "learning_rate": 0.00011649297010438956, "loss": 0.0428, "step": 730 }, { "epoch": 2.4025974025974026, "grad_norm": 0.21691694855690002, "learning_rate": 0.00011436700786745515, "loss": 0.0243, "step": 740 }, { "epoch": 2.435064935064935, "grad_norm": 0.5338732600212097, "learning_rate": 0.00011223439376276835, "loss": 0.0325, "step": 750 }, { "epoch": 2.4675324675324677, "grad_norm": 0.5570558905601501, "learning_rate": 0.00011009611518221489, "loss": 0.0368, "step": 760 }, { "epoch": 2.5, "grad_norm": 0.4559939503669739, "learning_rate": 0.00010795316214031048, "loss": 0.0268, "step": 770 }, { "epoch": 2.5324675324675323, "grad_norm": 0.33397603034973145, "learning_rate": 0.0001058065268158279, "loss": 0.0359, "step": 780 }, { "epoch": 2.564935064935065, "grad_norm": 0.22799064218997955, "learning_rate": 0.00010365720309242217, "loss": 0.0264, "step": 790 }, { "epoch": 2.5974025974025974, "grad_norm": 0.38165828585624695, "learning_rate": 0.00010150618609846638, "loss": 0.0301, "step": 800 }, { "epoch": 2.62987012987013, "grad_norm": 0.5442892909049988, "learning_rate": 9.935447174631119e-05, "loss": 0.0238, "step": 810 }, { "epoch": 2.6623376623376624, "grad_norm": 0.22587241232395172, "learning_rate": 9.720305627118126e-05, "loss": 0.0337, "step": 820 }, { "epoch": 2.6948051948051948, "grad_norm": 0.2878405451774597, "learning_rate": 9.505293576992251e-05, "loss": 0.0253, "step": 830 }, { "epoch": 2.7272727272727275, "grad_norm": 0.5154989957809448, "learning_rate": 9.290510573981298e-05, "loss": 0.0279, "step": 840 }, { "epoch": 2.75974025974026, "grad_norm": 0.42716383934020996, "learning_rate": 9.076056061765173e-05, "loss": 0.0358, "step": 850 }, { "epoch": 2.792207792207792, "grad_norm": 0.6430768370628357, "learning_rate": 8.862029331933828e-05, "loss": 0.0315, "step": 860 }, { "epoch": 2.824675324675325, "grad_norm": 0.6324707269668579, "learning_rate": 8.648529478015685e-05, "loss": 0.0305, "step": 870 }, { "epoch": 2.857142857142857, "grad_norm": 0.27602899074554443, "learning_rate": 8.435655349597689e-05, "loss": 0.026, "step": 880 }, { "epoch": 2.8896103896103895, "grad_norm": 0.5011958479881287, "learning_rate": 8.223505506558375e-05, "loss": 0.0274, "step": 890 }, { "epoch": 2.9220779220779223, "grad_norm": 0.3036370277404785, "learning_rate": 8.012178173434986e-05, "loss": 0.0306, "step": 900 }, { "epoch": 2.9545454545454546, "grad_norm": 0.3664824366569519, "learning_rate": 7.80177119394596e-05, "loss": 0.0241, "step": 910 }, { "epoch": 2.987012987012987, "grad_norm": 0.43233174085617065, "learning_rate": 7.592381985689628e-05, "loss": 0.0309, "step": 920 }, { "epoch": 3.0194805194805197, "grad_norm": 0.3537318706512451, "learning_rate": 7.384107495040284e-05, "loss": 0.0321, "step": 930 }, { "epoch": 3.051948051948052, "grad_norm": 0.2594110071659088, "learning_rate": 7.177044152262369e-05, "loss": 0.0245, "step": 940 }, { "epoch": 3.0844155844155843, "grad_norm": 0.3139798939228058, "learning_rate": 6.97128782686367e-05, "loss": 0.0218, "step": 950 }, { "epoch": 3.116883116883117, "grad_norm": 0.2899845540523529, "learning_rate": 6.766933783208092e-05, "loss": 0.029, "step": 960 }, { "epoch": 3.1493506493506493, "grad_norm": 0.320487916469574, "learning_rate": 6.564076636408656e-05, "loss": 0.0226, "step": 970 }, { "epoch": 3.1818181818181817, "grad_norm": 0.4478653371334076, "learning_rate": 6.362810308521054e-05, "loss": 0.0224, "step": 980 }, { "epoch": 3.2142857142857144, "grad_norm": 0.20480212569236755, "learning_rate": 6.16322798505813e-05, "loss": 0.0224, "step": 990 }, { "epoch": 3.2467532467532467, "grad_norm": 0.4546569287776947, "learning_rate": 5.9654220718453424e-05, "loss": 0.0311, "step": 1000 }, { "epoch": 3.279220779220779, "grad_norm": 0.2200387865304947, "learning_rate": 5.769484152237249e-05, "loss": 0.026, "step": 1010 }, { "epoch": 3.311688311688312, "grad_norm": 0.40406161546707153, "learning_rate": 5.5755049447147444e-05, "loss": 0.028, "step": 1020 }, { "epoch": 3.344155844155844, "grad_norm": 0.41747692227363586, "learning_rate": 5.383574260882802e-05, "loss": 0.0255, "step": 1030 }, { "epoch": 3.3766233766233764, "grad_norm": 0.4012701213359833, "learning_rate": 5.1937809638880374e-05, "loss": 0.0289, "step": 1040 }, { "epoch": 3.409090909090909, "grad_norm": 0.49661171436309814, "learning_rate": 5.0062129272754146e-05, "loss": 0.0249, "step": 1050 }, { "epoch": 3.4415584415584415, "grad_norm": 0.225033700466156, "learning_rate": 4.8209569943031516e-05, "loss": 0.0274, "step": 1060 }, { "epoch": 3.474025974025974, "grad_norm": 0.2046009600162506, "learning_rate": 4.638098937734648e-05, "loss": 0.0232, "step": 1070 }, { "epoch": 3.5064935064935066, "grad_norm": 0.4959232211112976, "learning_rate": 4.4577234201260196e-05, "loss": 0.0233, "step": 1080 }, { "epoch": 3.538961038961039, "grad_norm": 0.42567142844200134, "learning_rate": 4.279913954627667e-05, "loss": 0.0188, "step": 1090 }, { "epoch": 3.571428571428571, "grad_norm": 0.22492988407611847, "learning_rate": 4.104752866318026e-05, "loss": 0.0178, "step": 1100 }, { "epoch": 3.603896103896104, "grad_norm": 0.2631309926509857, "learning_rate": 3.932321254087389e-05, "loss": 0.0187, "step": 1110 }, { "epoch": 3.6363636363636362, "grad_norm": 0.45389196276664734, "learning_rate": 3.7626989530894285e-05, "loss": 0.0211, "step": 1120 }, { "epoch": 3.6688311688311686, "grad_norm": 0.20797108113765717, "learning_rate": 3.5959644977778386e-05, "loss": 0.024, "step": 1130 }, { "epoch": 3.7012987012987013, "grad_norm": 0.22460906207561493, "learning_rate": 3.432195085545191e-05, "loss": 0.0161, "step": 1140 }, { "epoch": 3.7337662337662336, "grad_norm": 0.3012770116329193, "learning_rate": 3.271466540980862e-05, "loss": 0.0232, "step": 1150 }, { "epoch": 3.7662337662337664, "grad_norm": 0.38262003660202026, "learning_rate": 3.1138532807645394e-05, "loss": 0.0174, "step": 1160 }, { "epoch": 3.7987012987012987, "grad_norm": 0.22977425158023834, "learning_rate": 2.9594282792115857e-05, "loss": 0.0227, "step": 1170 }, { "epoch": 3.8311688311688314, "grad_norm": 0.16646578907966614, "learning_rate": 2.808263034486226e-05, "loss": 0.0155, "step": 1180 }, { "epoch": 3.8636363636363638, "grad_norm": 0.257495254278183, "learning_rate": 2.660427535498191e-05, "loss": 0.0218, "step": 1190 }, { "epoch": 3.896103896103896, "grad_norm": 0.3283085227012634, "learning_rate": 2.5159902294981197e-05, "loss": 0.0224, "step": 1200 }, { "epoch": 3.928571428571429, "grad_norm": 0.36105039715766907, "learning_rate": 2.3750179903867443e-05, "loss": 0.0278, "step": 1210 }, { "epoch": 3.961038961038961, "grad_norm": 0.3740982115268707, "learning_rate": 2.237576087752554e-05, "loss": 0.0241, "step": 1220 }, { "epoch": 3.9935064935064934, "grad_norm": 0.19882728159427643, "learning_rate": 2.1037281566522304e-05, "loss": 0.0174, "step": 1230 }, { "epoch": 4.025974025974026, "grad_norm": 0.12800416350364685, "learning_rate": 1.973536168147867e-05, "loss": 0.0137, "step": 1240 }, { "epoch": 4.058441558441558, "grad_norm": 0.352993905544281, "learning_rate": 1.8470604006146064e-05, "loss": 0.0221, "step": 1250 }, { "epoch": 4.090909090909091, "grad_norm": 0.3981715738773346, "learning_rate": 1.7243594118319985e-05, "loss": 0.0162, "step": 1260 }, { "epoch": 4.123376623376624, "grad_norm": 0.23238126933574677, "learning_rate": 1.6054900118719807e-05, "loss": 0.0201, "step": 1270 }, { "epoch": 4.1558441558441555, "grad_norm": 0.3487814962863922, "learning_rate": 1.4905072367960437e-05, "loss": 0.0182, "step": 1280 }, { "epoch": 4.188311688311688, "grad_norm": 0.2413235455751419, "learning_rate": 1.3794643231737348e-05, "loss": 0.0161, "step": 1290 }, { "epoch": 4.220779220779221, "grad_norm": 0.3678736984729767, "learning_rate": 1.2724126834343564e-05, "loss": 0.02, "step": 1300 }, { "epoch": 4.253246753246753, "grad_norm": 0.0916026160120964, "learning_rate": 1.1694018820632068e-05, "loss": 0.0179, "step": 1310 }, { "epoch": 4.285714285714286, "grad_norm": 0.16847121715545654, "learning_rate": 1.0704796126534234e-05, "loss": 0.0182, "step": 1320 }, { "epoch": 4.318181818181818, "grad_norm": 0.10810457170009613, "learning_rate": 9.756916758240286e-06, "loss": 0.0219, "step": 1330 }, { "epoch": 4.35064935064935, "grad_norm": 0.31443580985069275, "learning_rate": 8.850819580144387e-06, "loss": 0.0254, "step": 1340 }, { "epoch": 4.383116883116883, "grad_norm": 0.2282458394765854, "learning_rate": 7.986924111652006e-06, "loss": 0.0177, "step": 1350 }, { "epoch": 4.415584415584416, "grad_norm": 0.34387949109077454, "learning_rate": 7.16563033294424e-06, "loss": 0.0167, "step": 1360 }, { "epoch": 4.448051948051948, "grad_norm": 0.12599095702171326, "learning_rate": 6.387318499788497e-06, "loss": 0.0205, "step": 1370 }, { "epoch": 4.48051948051948, "grad_norm": 0.14966098964214325, "learning_rate": 5.652348967481569e-06, "loss": 0.0201, "step": 1380 }, { "epoch": 4.512987012987013, "grad_norm": 0.28549131751060486, "learning_rate": 4.961062024006591e-06, "loss": 0.019, "step": 1390 }, { "epoch": 4.545454545454545, "grad_norm": 0.22848688066005707, "learning_rate": 4.313777732481039e-06, "loss": 0.0182, "step": 1400 }, { "epoch": 4.577922077922078, "grad_norm": 0.20382894575595856, "learning_rate": 3.7107957829688234e-06, "loss": 0.0228, "step": 1410 }, { "epoch": 4.6103896103896105, "grad_norm": 0.1720917969942093, "learning_rate": 3.1523953537248684e-06, "loss": 0.0189, "step": 1420 }, { "epoch": 4.642857142857143, "grad_norm": 0.10501822084188461, "learning_rate": 2.638834981936744e-06, "loss": 0.015, "step": 1430 }, { "epoch": 4.675324675324675, "grad_norm": 0.13112539052963257, "learning_rate": 2.1703524440230383e-06, "loss": 0.0118, "step": 1440 }, { "epoch": 4.707792207792208, "grad_norm": 0.17099051177501678, "learning_rate": 1.7471646455437085e-06, "loss": 0.0151, "step": 1450 }, { "epoch": 4.740259740259741, "grad_norm": 0.3271525204181671, "learning_rate": 1.3694675207737151e-06, "loss": 0.0223, "step": 1460 }, { "epoch": 4.7727272727272725, "grad_norm": 0.35389673709869385, "learning_rate": 1.03743594198622e-06, "loss": 0.0133, "step": 1470 }, { "epoch": 4.805194805194805, "grad_norm": 0.25904619693756104, "learning_rate": 7.512236384874305e-07, "loss": 0.0166, "step": 1480 }, { "epoch": 4.837662337662338, "grad_norm": 0.14534501731395721, "learning_rate": 5.109631254405445e-07, "loss": 0.0133, "step": 1490 }, { "epoch": 4.87012987012987, "grad_norm": 0.17412607371807098, "learning_rate": 3.1676564251171824e-07, "loss": 0.0188, "step": 1500 }, { "epoch": 4.902597402597403, "grad_norm": 0.20664988458156586, "learning_rate": 1.687211023665647e-07, "loss": 0.0114, "step": 1510 }, { "epoch": 4.935064935064935, "grad_norm": 0.19708259403705597, "learning_rate": 6.68980490409421e-08, "loss": 0.0179, "step": 1520 }, { "epoch": 4.967532467532467, "grad_norm": 0.5721464157104492, "learning_rate": 1.134362620534013e-08, "loss": 0.0192, "step": 1530 }, { "epoch": 4.990259740259741, "step": 1537, "total_flos": 5.134548117135398e+16, "train_loss": 0.05982626907687413, "train_runtime": 715.8651, "train_samples_per_second": 34.353, "train_steps_per_second": 2.147 } ], "logging_steps": 10, "max_steps": 1537, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.134548117135398e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }