{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.990259740259741,
  "eval_steps": 500,
  "global_step": 1537,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.032467532467532464,
      "grad_norm": 11.862669944763184,
      "learning_rate": 2.5974025974025972e-05,
      "loss": 2.2746,
      "step": 10
    },
    {
      "epoch": 0.06493506493506493,
      "grad_norm": 14.773530006408691,
      "learning_rate": 5.1948051948051944e-05,
      "loss": 0.6306,
      "step": 20
    },
    {
      "epoch": 0.09740259740259741,
      "grad_norm": 2.444714069366455,
      "learning_rate": 7.792207792207793e-05,
      "loss": 0.2859,
      "step": 30
    },
    {
      "epoch": 0.12987012987012986,
      "grad_norm": 1.5860096216201782,
      "learning_rate": 0.00010389610389610389,
      "loss": 0.1714,
      "step": 40
    },
    {
      "epoch": 0.16233766233766234,
      "grad_norm": 1.3271065950393677,
      "learning_rate": 0.00012987012987012987,
      "loss": 0.1621,
      "step": 50
    },
    {
      "epoch": 0.19480519480519481,
      "grad_norm": 1.9922699928283691,
      "learning_rate": 0.00015584415584415587,
      "loss": 0.164,
      "step": 60
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 1.0023343563079834,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.1308,
      "step": 70
    },
    {
      "epoch": 0.2597402597402597,
      "grad_norm": 0.9265744686126709,
      "learning_rate": 0.00019999791644466247,
      "loss": 0.1217,
      "step": 80
    },
    {
      "epoch": 0.2922077922077922,
      "grad_norm": 1.0755594968795776,
      "learning_rate": 0.00019996087787618624,
      "loss": 0.1126,
      "step": 90
    },
    {
      "epoch": 0.3246753246753247,
      "grad_norm": 1.6246726512908936,
      "learning_rate": 0.00019987755781707668,
      "loss": 0.1066,
      "step": 100
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 1.3438688516616821,
      "learning_rate": 0.00019974799484419106,
      "loss": 0.0931,
      "step": 110
    },
    {
      "epoch": 0.38961038961038963,
      "grad_norm": 1.0927611589431763,
      "learning_rate": 0.0001995722489446729,
      "loss": 0.0955,
      "step": 120
    },
    {
      "epoch": 0.42207792207792205,
      "grad_norm": 0.5384888648986816,
      "learning_rate": 0.0001993504014881777,
      "loss": 0.0543,
      "step": 130
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 1.163609504699707,
      "learning_rate": 0.0001990825551891994,
      "loss": 0.0649,
      "step": 140
    },
    {
      "epoch": 0.487012987012987,
      "grad_norm": 1.2434951066970825,
      "learning_rate": 0.00019876883405951377,
      "loss": 0.0798,
      "step": 150
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 0.865424633026123,
      "learning_rate": 0.00019840938335076162,
      "loss": 0.082,
      "step": 160
    },
    {
      "epoch": 0.551948051948052,
      "grad_norm": 0.8742974400520325,
      "learning_rate": 0.00019800436948719775,
      "loss": 0.0725,
      "step": 170
    },
    {
      "epoch": 0.5844155844155844,
      "grad_norm": 0.5820501446723938,
      "learning_rate": 0.0001975539799886372,
      "loss": 0.0602,
      "step": 180
    },
    {
      "epoch": 0.6168831168831169,
      "grad_norm": 0.6646164655685425,
      "learning_rate": 0.00019705842338363434,
      "loss": 0.0614,
      "step": 190
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 1.1510143280029297,
      "learning_rate": 0.00019651792911293508,
      "loss": 0.066,
      "step": 200
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 0.9399957060813904,
      "learning_rate": 0.0001959327474232464,
      "loss": 0.0627,
      "step": 210
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.9072800874710083,
      "learning_rate": 0.00019530314925137355,
      "loss": 0.0666,
      "step": 220
    },
    {
      "epoch": 0.7467532467532467,
      "grad_norm": 0.8320746421813965,
      "learning_rate": 0.00019462942609877696,
      "loss": 0.0518,
      "step": 230
    },
    {
      "epoch": 0.7792207792207793,
      "grad_norm": 0.820354163646698,
      "learning_rate": 0.00019391188989660848,
      "loss": 0.0857,
      "step": 240
    },
    {
      "epoch": 0.8116883116883117,
      "grad_norm": 0.9190356731414795,
      "learning_rate": 0.0001931508728612885,
      "loss": 0.0651,
      "step": 250
    },
    {
      "epoch": 0.8441558441558441,
      "grad_norm": 0.6117440462112427,
      "learning_rate": 0.0001923467273406908,
      "loss": 0.0526,
      "step": 260
    },
    {
      "epoch": 0.8766233766233766,
      "grad_norm": 0.8513985276222229,
      "learning_rate": 0.00019149982565100728,
      "loss": 0.0549,
      "step": 270
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.5364561080932617,
      "learning_rate": 0.00019061055990436665,
      "loss": 0.0612,
      "step": 280
    },
    {
      "epoch": 0.9415584415584416,
      "grad_norm": 0.8399089574813843,
      "learning_rate": 0.00018967934182728837,
      "loss": 0.0417,
      "step": 290
    },
    {
      "epoch": 0.974025974025974,
      "grad_norm": 0.7521604299545288,
      "learning_rate": 0.00018870660257005466,
      "loss": 0.0535,
      "step": 300
    },
    {
      "epoch": 1.0064935064935066,
      "grad_norm": 0.5698351263999939,
      "learning_rate": 0.00018769279250708976,
      "loss": 0.0489,
      "step": 310
    },
    {
      "epoch": 1.0389610389610389,
      "grad_norm": 0.6740209460258484,
      "learning_rate": 0.0001866383810284384,
      "loss": 0.0569,
      "step": 320
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.49634265899658203,
      "learning_rate": 0.00018554385632244,
      "loss": 0.0561,
      "step": 330
    },
    {
      "epoch": 1.103896103896104,
      "grad_norm": 0.6321184039115906,
      "learning_rate": 0.00018440972514969926,
      "loss": 0.0413,
      "step": 340
    },
    {
      "epoch": 1.1363636363636362,
      "grad_norm": 0.41812852025032043,
      "learning_rate": 0.00018323651260845832,
      "loss": 0.0411,
      "step": 350
    },
    {
      "epoch": 1.1688311688311688,
      "grad_norm": 0.3897482454776764,
      "learning_rate": 0.00018202476189147803,
      "loss": 0.0505,
      "step": 360
    },
    {
      "epoch": 1.2012987012987013,
      "grad_norm": 0.6317518353462219,
      "learning_rate": 0.00018077503403454216,
      "loss": 0.0387,
      "step": 370
    },
    {
      "epoch": 1.2337662337662338,
      "grad_norm": 0.49315449595451355,
      "learning_rate": 0.0001794879076566998,
      "loss": 0.0458,
      "step": 380
    },
    {
      "epoch": 1.2662337662337662,
      "grad_norm": 0.522301971912384,
      "learning_rate": 0.00017816397869236717,
      "loss": 0.0509,
      "step": 390
    },
    {
      "epoch": 1.2987012987012987,
      "grad_norm": 0.6012665629386902,
      "learning_rate": 0.00017680386011541222,
      "loss": 0.0379,
      "step": 400
    },
    {
      "epoch": 1.3311688311688312,
      "grad_norm": 0.783263087272644,
      "learning_rate": 0.0001754081816553504,
      "loss": 0.047,
      "step": 410
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 0.7606521248817444,
      "learning_rate": 0.00017397758950578207,
      "loss": 0.0733,
      "step": 420
    },
    {
      "epoch": 1.396103896103896,
      "grad_norm": 0.4706479012966156,
      "learning_rate": 0.00017251274602520766,
      "loss": 0.0556,
      "step": 430
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.5615910887718201,
      "learning_rate": 0.00017101432943035825,
      "loss": 0.0491,
      "step": 440
    },
    {
      "epoch": 1.4610389610389611,
      "grad_norm": 0.6203674674034119,
      "learning_rate": 0.0001694830334821838,
      "loss": 0.0414,
      "step": 450
    },
    {
      "epoch": 1.4935064935064934,
      "grad_norm": 0.6230823397636414,
      "learning_rate": 0.00016791956716464472,
      "loss": 0.0484,
      "step": 460
    },
    {
      "epoch": 1.525974025974026,
      "grad_norm": 0.49440866708755493,
      "learning_rate": 0.0001663246543564551,
      "loss": 0.056,
      "step": 470
    },
    {
      "epoch": 1.5584415584415585,
      "grad_norm": 0.5235504508018494,
      "learning_rate": 0.00016469903349592956,
      "loss": 0.036,
      "step": 480
    },
    {
      "epoch": 1.5909090909090908,
      "grad_norm": 0.7212300300598145,
      "learning_rate": 0.0001630434572390895,
      "loss": 0.0464,
      "step": 490
    },
    {
      "epoch": 1.6233766233766234,
      "grad_norm": 0.45477214455604553,
      "learning_rate": 0.00016135869211118603,
      "loss": 0.04,
      "step": 500
    },
    {
      "epoch": 1.655844155844156,
      "grad_norm": 0.6098753809928894,
      "learning_rate": 0.00015964551815180213,
      "loss": 0.0396,
      "step": 510
    },
    {
      "epoch": 1.6883116883116882,
      "grad_norm": 1.0758476257324219,
      "learning_rate": 0.00015790472855369716,
      "loss": 0.0567,
      "step": 520
    },
    {
      "epoch": 1.7207792207792207,
      "grad_norm": 0.8209078311920166,
      "learning_rate": 0.00015613712929556193,
      "loss": 0.0506,
      "step": 530
    },
    {
      "epoch": 1.7532467532467533,
      "grad_norm": 0.7815719246864319,
      "learning_rate": 0.00015434353876885362,
      "loss": 0.0474,
      "step": 540
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 0.3328838050365448,
      "learning_rate": 0.00015252478739888385,
      "loss": 0.0333,
      "step": 550
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.6883902549743652,
      "learning_rate": 0.0001506817172603351,
      "loss": 0.0342,
      "step": 560
    },
    {
      "epoch": 1.8506493506493507,
      "grad_norm": 0.4302406311035156,
      "learning_rate": 0.0001488151816873834,
      "loss": 0.0415,
      "step": 570
    },
    {
      "epoch": 1.883116883116883,
      "grad_norm": 0.5647683143615723,
      "learning_rate": 0.00014692604487860785,
      "loss": 0.0492,
      "step": 580
    },
    {
      "epoch": 1.9155844155844157,
      "grad_norm": 0.46459880471229553,
      "learning_rate": 0.00014501518149687042,
      "loss": 0.035,
      "step": 590
    },
    {
      "epoch": 1.948051948051948,
      "grad_norm": 0.49570703506469727,
      "learning_rate": 0.0001430834762643502,
      "loss": 0.0402,
      "step": 600
    },
    {
      "epoch": 1.9805194805194806,
      "grad_norm": 0.44045427441596985,
      "learning_rate": 0.00014113182355292078,
      "loss": 0.045,
      "step": 610
    },
    {
      "epoch": 2.012987012987013,
      "grad_norm": 0.22453299164772034,
      "learning_rate": 0.0001391611269700594,
      "loss": 0.0419,
      "step": 620
    },
    {
      "epoch": 2.0454545454545454,
      "grad_norm": 0.4719075560569763,
      "learning_rate": 0.00013717229894048038,
      "loss": 0.037,
      "step": 630
    },
    {
      "epoch": 2.0779220779220777,
      "grad_norm": 0.4710325598716736,
      "learning_rate": 0.0001351662602836861,
      "loss": 0.0368,
      "step": 640
    },
    {
      "epoch": 2.1103896103896105,
      "grad_norm": 0.5058510303497314,
      "learning_rate": 0.0001331439397876312,
      "loss": 0.0385,
      "step": 650
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.5033309459686279,
      "learning_rate": 0.0001311062737786974,
      "loss": 0.0357,
      "step": 660
    },
    {
      "epoch": 2.175324675324675,
      "grad_norm": 0.36022046208381653,
      "learning_rate": 0.0001290542056881781,
      "loss": 0.0243,
      "step": 670
    },
    {
      "epoch": 2.207792207792208,
      "grad_norm": 0.3985787034034729,
      "learning_rate": 0.0001269886856154735,
      "loss": 0.0471,
      "step": 680
    },
    {
      "epoch": 2.24025974025974,
      "grad_norm": 0.47278791666030884,
      "learning_rate": 0.0001249106698881982,
      "loss": 0.0353,
      "step": 690
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 0.5603681802749634,
      "learning_rate": 0.0001228211206194055,
      "loss": 0.0324,
      "step": 700
    },
    {
      "epoch": 2.3051948051948052,
      "grad_norm": 0.28177714347839355,
      "learning_rate": 0.0001207210052621327,
      "loss": 0.0432,
      "step": 710
    },
    {
      "epoch": 2.3376623376623376,
      "grad_norm": 0.5803484320640564,
      "learning_rate": 0.00011861129616147418,
      "loss": 0.0322,
      "step": 720
    },
    {
      "epoch": 2.3701298701298703,
      "grad_norm": 0.298414409160614,
      "learning_rate": 0.00011649297010438956,
      "loss": 0.0428,
      "step": 730
    },
    {
      "epoch": 2.4025974025974026,
      "grad_norm": 0.21691694855690002,
      "learning_rate": 0.00011436700786745515,
      "loss": 0.0243,
      "step": 740
    },
    {
      "epoch": 2.435064935064935,
      "grad_norm": 0.5338732600212097,
      "learning_rate": 0.00011223439376276835,
      "loss": 0.0325,
      "step": 750
    },
    {
      "epoch": 2.4675324675324677,
      "grad_norm": 0.5570558905601501,
      "learning_rate": 0.00011009611518221489,
      "loss": 0.0368,
      "step": 760
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.4559939503669739,
      "learning_rate": 0.00010795316214031048,
      "loss": 0.0268,
      "step": 770
    },
    {
      "epoch": 2.5324675324675323,
      "grad_norm": 0.33397603034973145,
      "learning_rate": 0.0001058065268158279,
      "loss": 0.0359,
      "step": 780
    },
    {
      "epoch": 2.564935064935065,
      "grad_norm": 0.22799064218997955,
      "learning_rate": 0.00010365720309242217,
      "loss": 0.0264,
      "step": 790
    },
    {
      "epoch": 2.5974025974025974,
      "grad_norm": 0.38165828585624695,
      "learning_rate": 0.00010150618609846638,
      "loss": 0.0301,
      "step": 800
    },
    {
      "epoch": 2.62987012987013,
      "grad_norm": 0.5442892909049988,
      "learning_rate": 9.935447174631119e-05,
      "loss": 0.0238,
      "step": 810
    },
    {
      "epoch": 2.6623376623376624,
      "grad_norm": 0.22587241232395172,
      "learning_rate": 9.720305627118126e-05,
      "loss": 0.0337,
      "step": 820
    },
    {
      "epoch": 2.6948051948051948,
      "grad_norm": 0.2878405451774597,
      "learning_rate": 9.505293576992251e-05,
      "loss": 0.0253,
      "step": 830
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.5154989957809448,
      "learning_rate": 9.290510573981298e-05,
      "loss": 0.0279,
      "step": 840
    },
    {
      "epoch": 2.75974025974026,
      "grad_norm": 0.42716383934020996,
      "learning_rate": 9.076056061765173e-05,
      "loss": 0.0358,
      "step": 850
    },
    {
      "epoch": 2.792207792207792,
      "grad_norm": 0.6430768370628357,
      "learning_rate": 8.862029331933828e-05,
      "loss": 0.0315,
      "step": 860
    },
    {
      "epoch": 2.824675324675325,
      "grad_norm": 0.6324707269668579,
      "learning_rate": 8.648529478015685e-05,
      "loss": 0.0305,
      "step": 870
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.27602899074554443,
      "learning_rate": 8.435655349597689e-05,
      "loss": 0.026,
      "step": 880
    },
    {
      "epoch": 2.8896103896103895,
      "grad_norm": 0.5011958479881287,
      "learning_rate": 8.223505506558375e-05,
      "loss": 0.0274,
      "step": 890
    },
    {
      "epoch": 2.9220779220779223,
      "grad_norm": 0.3036370277404785,
      "learning_rate": 8.012178173434986e-05,
      "loss": 0.0306,
      "step": 900
    },
    {
      "epoch": 2.9545454545454546,
      "grad_norm": 0.3664824366569519,
      "learning_rate": 7.80177119394596e-05,
      "loss": 0.0241,
      "step": 910
    },
    {
      "epoch": 2.987012987012987,
      "grad_norm": 0.43233174085617065,
      "learning_rate": 7.592381985689628e-05,
      "loss": 0.0309,
      "step": 920
    },
    {
      "epoch": 3.0194805194805197,
      "grad_norm": 0.3537318706512451,
      "learning_rate": 7.384107495040284e-05,
      "loss": 0.0321,
      "step": 930
    },
    {
      "epoch": 3.051948051948052,
      "grad_norm": 0.2594110071659088,
      "learning_rate": 7.177044152262369e-05,
      "loss": 0.0245,
      "step": 940
    },
    {
      "epoch": 3.0844155844155843,
      "grad_norm": 0.3139798939228058,
      "learning_rate": 6.97128782686367e-05,
      "loss": 0.0218,
      "step": 950
    },
    {
      "epoch": 3.116883116883117,
      "grad_norm": 0.2899845540523529,
      "learning_rate": 6.766933783208092e-05,
      "loss": 0.029,
      "step": 960
    },
    {
      "epoch": 3.1493506493506493,
      "grad_norm": 0.320487916469574,
      "learning_rate": 6.564076636408656e-05,
      "loss": 0.0226,
      "step": 970
    },
    {
      "epoch": 3.1818181818181817,
      "grad_norm": 0.4478653371334076,
      "learning_rate": 6.362810308521054e-05,
      "loss": 0.0224,
      "step": 980
    },
    {
      "epoch": 3.2142857142857144,
      "grad_norm": 0.20480212569236755,
      "learning_rate": 6.16322798505813e-05,
      "loss": 0.0224,
      "step": 990
    },
    {
      "epoch": 3.2467532467532467,
      "grad_norm": 0.4546569287776947,
      "learning_rate": 5.9654220718453424e-05,
      "loss": 0.0311,
      "step": 1000
    },
    {
      "epoch": 3.279220779220779,
      "grad_norm": 0.2200387865304947,
      "learning_rate": 5.769484152237249e-05,
      "loss": 0.026,
      "step": 1010
    },
    {
      "epoch": 3.311688311688312,
      "grad_norm": 0.40406161546707153,
      "learning_rate": 5.5755049447147444e-05,
      "loss": 0.028,
      "step": 1020
    },
    {
      "epoch": 3.344155844155844,
      "grad_norm": 0.41747692227363586,
      "learning_rate": 5.383574260882802e-05,
      "loss": 0.0255,
      "step": 1030
    },
    {
      "epoch": 3.3766233766233764,
      "grad_norm": 0.4012701213359833,
      "learning_rate": 5.1937809638880374e-05,
      "loss": 0.0289,
      "step": 1040
    },
    {
      "epoch": 3.409090909090909,
      "grad_norm": 0.49661171436309814,
      "learning_rate": 5.0062129272754146e-05,
      "loss": 0.0249,
      "step": 1050
    },
    {
      "epoch": 3.4415584415584415,
      "grad_norm": 0.225033700466156,
      "learning_rate": 4.8209569943031516e-05,
      "loss": 0.0274,
      "step": 1060
    },
    {
      "epoch": 3.474025974025974,
      "grad_norm": 0.2046009600162506,
      "learning_rate": 4.638098937734648e-05,
      "loss": 0.0232,
      "step": 1070
    },
    {
      "epoch": 3.5064935064935066,
      "grad_norm": 0.4959232211112976,
      "learning_rate": 4.4577234201260196e-05,
      "loss": 0.0233,
      "step": 1080
    },
    {
      "epoch": 3.538961038961039,
      "grad_norm": 0.42567142844200134,
      "learning_rate": 4.279913954627667e-05,
      "loss": 0.0188,
      "step": 1090
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 0.22492988407611847,
      "learning_rate": 4.104752866318026e-05,
      "loss": 0.0178,
      "step": 1100
    },
    {
      "epoch": 3.603896103896104,
      "grad_norm": 0.2631309926509857,
      "learning_rate": 3.932321254087389e-05,
      "loss": 0.0187,
      "step": 1110
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 0.45389196276664734,
      "learning_rate": 3.7626989530894285e-05,
      "loss": 0.0211,
      "step": 1120
    },
    {
      "epoch": 3.6688311688311686,
      "grad_norm": 0.20797108113765717,
      "learning_rate": 3.5959644977778386e-05,
      "loss": 0.024,
      "step": 1130
    },
    {
      "epoch": 3.7012987012987013,
      "grad_norm": 0.22460906207561493,
      "learning_rate": 3.432195085545191e-05,
      "loss": 0.0161,
      "step": 1140
    },
    {
      "epoch": 3.7337662337662336,
      "grad_norm": 0.3012770116329193,
      "learning_rate": 3.271466540980862e-05,
      "loss": 0.0232,
      "step": 1150
    },
    {
      "epoch": 3.7662337662337664,
      "grad_norm": 0.38262003660202026,
      "learning_rate": 3.1138532807645394e-05,
      "loss": 0.0174,
      "step": 1160
    },
    {
      "epoch": 3.7987012987012987,
      "grad_norm": 0.22977425158023834,
      "learning_rate": 2.9594282792115857e-05,
      "loss": 0.0227,
      "step": 1170
    },
    {
      "epoch": 3.8311688311688314,
      "grad_norm": 0.16646578907966614,
      "learning_rate": 2.808263034486226e-05,
      "loss": 0.0155,
      "step": 1180
    },
    {
      "epoch": 3.8636363636363638,
      "grad_norm": 0.257495254278183,
      "learning_rate": 2.660427535498191e-05,
      "loss": 0.0218,
      "step": 1190
    },
    {
      "epoch": 3.896103896103896,
      "grad_norm": 0.3283085227012634,
      "learning_rate": 2.5159902294981197e-05,
      "loss": 0.0224,
      "step": 1200
    },
    {
      "epoch": 3.928571428571429,
      "grad_norm": 0.36105039715766907,
      "learning_rate": 2.3750179903867443e-05,
      "loss": 0.0278,
      "step": 1210
    },
    {
      "epoch": 3.961038961038961,
      "grad_norm": 0.3740982115268707,
      "learning_rate": 2.237576087752554e-05,
      "loss": 0.0241,
      "step": 1220
    },
    {
      "epoch": 3.9935064935064934,
      "grad_norm": 0.19882728159427643,
      "learning_rate": 2.1037281566522304e-05,
      "loss": 0.0174,
      "step": 1230
    },
    {
      "epoch": 4.025974025974026,
      "grad_norm": 0.12800416350364685,
      "learning_rate": 1.973536168147867e-05,
      "loss": 0.0137,
      "step": 1240
    },
    {
      "epoch": 4.058441558441558,
      "grad_norm": 0.352993905544281,
      "learning_rate": 1.8470604006146064e-05,
      "loss": 0.0221,
      "step": 1250
    },
    {
      "epoch": 4.090909090909091,
      "grad_norm": 0.3981715738773346,
      "learning_rate": 1.7243594118319985e-05,
      "loss": 0.0162,
      "step": 1260
    },
    {
      "epoch": 4.123376623376624,
      "grad_norm": 0.23238126933574677,
      "learning_rate": 1.6054900118719807e-05,
      "loss": 0.0201,
      "step": 1270
    },
    {
      "epoch": 4.1558441558441555,
      "grad_norm": 0.3487814962863922,
      "learning_rate": 1.4905072367960437e-05,
      "loss": 0.0182,
      "step": 1280
    },
    {
      "epoch": 4.188311688311688,
      "grad_norm": 0.2413235455751419,
      "learning_rate": 1.3794643231737348e-05,
      "loss": 0.0161,
      "step": 1290
    },
    {
      "epoch": 4.220779220779221,
      "grad_norm": 0.3678736984729767,
      "learning_rate": 1.2724126834343564e-05,
      "loss": 0.02,
      "step": 1300
    },
    {
      "epoch": 4.253246753246753,
      "grad_norm": 0.0916026160120964,
      "learning_rate": 1.1694018820632068e-05,
      "loss": 0.0179,
      "step": 1310
    },
    {
      "epoch": 4.285714285714286,
      "grad_norm": 0.16847121715545654,
      "learning_rate": 1.0704796126534234e-05,
      "loss": 0.0182,
      "step": 1320
    },
    {
      "epoch": 4.318181818181818,
      "grad_norm": 0.10810457170009613,
      "learning_rate": 9.756916758240286e-06,
      "loss": 0.0219,
      "step": 1330
    },
    {
      "epoch": 4.35064935064935,
      "grad_norm": 0.31443580985069275,
      "learning_rate": 8.850819580144387e-06,
      "loss": 0.0254,
      "step": 1340
    },
    {
      "epoch": 4.383116883116883,
      "grad_norm": 0.2282458394765854,
      "learning_rate": 7.986924111652006e-06,
      "loss": 0.0177,
      "step": 1350
    },
    {
      "epoch": 4.415584415584416,
      "grad_norm": 0.34387949109077454,
      "learning_rate": 7.16563033294424e-06,
      "loss": 0.0167,
      "step": 1360
    },
    {
      "epoch": 4.448051948051948,
      "grad_norm": 0.12599095702171326,
      "learning_rate": 6.387318499788497e-06,
      "loss": 0.0205,
      "step": 1370
    },
    {
      "epoch": 4.48051948051948,
      "grad_norm": 0.14966098964214325,
      "learning_rate": 5.652348967481569e-06,
      "loss": 0.0201,
      "step": 1380
    },
    {
      "epoch": 4.512987012987013,
      "grad_norm": 0.28549131751060486,
      "learning_rate": 4.961062024006591e-06,
      "loss": 0.019,
      "step": 1390
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 0.22848688066005707,
      "learning_rate": 4.313777732481039e-06,
      "loss": 0.0182,
      "step": 1400
    },
    {
      "epoch": 4.577922077922078,
      "grad_norm": 0.20382894575595856,
      "learning_rate": 3.7107957829688234e-06,
      "loss": 0.0228,
      "step": 1410
    },
    {
      "epoch": 4.6103896103896105,
      "grad_norm": 0.1720917969942093,
      "learning_rate": 3.1523953537248684e-06,
      "loss": 0.0189,
      "step": 1420
    },
    {
      "epoch": 4.642857142857143,
      "grad_norm": 0.10501822084188461,
      "learning_rate": 2.638834981936744e-06,
      "loss": 0.015,
      "step": 1430
    },
    {
      "epoch": 4.675324675324675,
      "grad_norm": 0.13112539052963257,
      "learning_rate": 2.1703524440230383e-06,
      "loss": 0.0118,
      "step": 1440
    },
    {
      "epoch": 4.707792207792208,
      "grad_norm": 0.17099051177501678,
      "learning_rate": 1.7471646455437085e-06,
      "loss": 0.0151,
      "step": 1450
    },
    {
      "epoch": 4.740259740259741,
      "grad_norm": 0.3271525204181671,
      "learning_rate": 1.3694675207737151e-06,
      "loss": 0.0223,
      "step": 1460
    },
    {
      "epoch": 4.7727272727272725,
      "grad_norm": 0.35389673709869385,
      "learning_rate": 1.03743594198622e-06,
      "loss": 0.0133,
      "step": 1470
    },
    {
      "epoch": 4.805194805194805,
      "grad_norm": 0.25904619693756104,
      "learning_rate": 7.512236384874305e-07,
      "loss": 0.0166,
      "step": 1480
    },
    {
      "epoch": 4.837662337662338,
      "grad_norm": 0.14534501731395721,
      "learning_rate": 5.109631254405445e-07,
      "loss": 0.0133,
      "step": 1490
    },
    {
      "epoch": 4.87012987012987,
      "grad_norm": 0.17412607371807098,
      "learning_rate": 3.1676564251171824e-07,
      "loss": 0.0188,
      "step": 1500
    },
    {
      "epoch": 4.902597402597403,
      "grad_norm": 0.20664988458156586,
      "learning_rate": 1.687211023665647e-07,
      "loss": 0.0114,
      "step": 1510
    },
    {
      "epoch": 4.935064935064935,
      "grad_norm": 0.19708259403705597,
      "learning_rate": 6.68980490409421e-08,
      "loss": 0.0179,
      "step": 1520
    },
    {
      "epoch": 4.967532467532467,
      "grad_norm": 0.5721464157104492,
      "learning_rate": 1.134362620534013e-08,
      "loss": 0.0192,
      "step": 1530
    },
    {
      "epoch": 4.990259740259741,
      "step": 1537,
      "total_flos": 5.134548117135398e+16,
      "train_loss": 0.05982626907687413,
      "train_runtime": 715.8651,
      "train_samples_per_second": 34.353,
      "train_steps_per_second": 2.147
    }
  ],
  "logging_steps": 10,
  "max_steps": 1537,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.134548117135398e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}