|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 19.931506849315067, |
|
"eval_steps": 500, |
|
"global_step": 1455, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.136986301369863, |
|
"grad_norm": 7.088261127471924, |
|
"learning_rate": 2.7397260273972603e-05, |
|
"loss": 1.4774, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.273972602739726, |
|
"grad_norm": 3.0658137798309326, |
|
"learning_rate": 5.479452054794521e-05, |
|
"loss": 0.5535, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.410958904109589, |
|
"grad_norm": 1.415347695350647, |
|
"learning_rate": 8.219178082191781e-05, |
|
"loss": 0.2769, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.547945205479452, |
|
"grad_norm": 1.2462260723114014, |
|
"learning_rate": 0.00010958904109589041, |
|
"loss": 0.2071, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.684931506849315, |
|
"grad_norm": 1.1219278573989868, |
|
"learning_rate": 0.000136986301369863, |
|
"loss": 0.179, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.821917808219178, |
|
"grad_norm": 2.4104228019714355, |
|
"learning_rate": 0.00016438356164383562, |
|
"loss": 0.1587, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.958904109589041, |
|
"grad_norm": 1.2239787578582764, |
|
"learning_rate": 0.0001917808219178082, |
|
"loss": 0.1366, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.095890410958904, |
|
"grad_norm": 0.9942715167999268, |
|
"learning_rate": 0.00019998733979961563, |
|
"loss": 0.1218, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.2328767123287672, |
|
"grad_norm": 0.6293880939483643, |
|
"learning_rate": 0.0001999253383717226, |
|
"loss": 0.1168, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.36986301369863, |
|
"grad_norm": 0.7170248031616211, |
|
"learning_rate": 0.00019981170237143067, |
|
"loss": 0.1052, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5068493150684932, |
|
"grad_norm": 0.7464343905448914, |
|
"learning_rate": 0.00019964649051804355, |
|
"loss": 0.1066, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.643835616438356, |
|
"grad_norm": 0.6828764081001282, |
|
"learning_rate": 0.000199429788181734, |
|
"loss": 0.1057, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.7808219178082192, |
|
"grad_norm": 0.6028720736503601, |
|
"learning_rate": 0.0001991617073394306, |
|
"loss": 0.0843, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.9178082191780823, |
|
"grad_norm": 0.5440357327461243, |
|
"learning_rate": 0.00019884238651695556, |
|
"loss": 0.0948, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.0547945205479454, |
|
"grad_norm": 0.8612964749336243, |
|
"learning_rate": 0.00019847199071744415, |
|
"loss": 0.085, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.191780821917808, |
|
"grad_norm": 0.889124870300293, |
|
"learning_rate": 0.00019805071133608242, |
|
"loss": 0.0962, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.328767123287671, |
|
"grad_norm": 0.45466411113739014, |
|
"learning_rate": 0.0001975787660612072, |
|
"loss": 0.0763, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.4657534246575343, |
|
"grad_norm": 0.42088282108306885, |
|
"learning_rate": 0.00019705639876181969, |
|
"loss": 0.0635, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.602739726027397, |
|
"grad_norm": 0.5170985460281372, |
|
"learning_rate": 0.00019648387936157068, |
|
"loss": 0.0726, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.73972602739726, |
|
"grad_norm": 0.4313249886035919, |
|
"learning_rate": 0.00019586150369928245, |
|
"loss": 0.0669, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.8767123287671232, |
|
"grad_norm": 0.3355115056037903, |
|
"learning_rate": 0.00019518959337607957, |
|
"loss": 0.0682, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.0136986301369864, |
|
"grad_norm": 0.34427109360694885, |
|
"learning_rate": 0.0001944684955892075, |
|
"loss": 0.0638, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.1506849315068495, |
|
"grad_norm": 0.2929873466491699, |
|
"learning_rate": 0.0001936985829526247, |
|
"loss": 0.0632, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.287671232876712, |
|
"grad_norm": 0.3884938657283783, |
|
"learning_rate": 0.00019288025330446126, |
|
"loss": 0.0655, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.4246575342465753, |
|
"grad_norm": 0.27399152517318726, |
|
"learning_rate": 0.00019201392950144363, |
|
"loss": 0.0533, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.5616438356164384, |
|
"grad_norm": 0.2924444079399109, |
|
"learning_rate": 0.0001911000592003909, |
|
"loss": 0.0589, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.6986301369863015, |
|
"grad_norm": 0.43013861775398254, |
|
"learning_rate": 0.00019013911462689668, |
|
"loss": 0.0615, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.8356164383561646, |
|
"grad_norm": 0.5247001647949219, |
|
"learning_rate": 0.000189131592331315, |
|
"loss": 0.0583, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.9726027397260273, |
|
"grad_norm": 0.5796880722045898, |
|
"learning_rate": 0.00018807801293217735, |
|
"loss": 0.0556, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.109589041095891, |
|
"grad_norm": 0.5179729461669922, |
|
"learning_rate": 0.00018697892084717238, |
|
"loss": 0.056, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.2465753424657535, |
|
"grad_norm": 0.42960262298583984, |
|
"learning_rate": 0.00018583488401182843, |
|
"loss": 0.0637, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.383561643835616, |
|
"grad_norm": 0.3196163773536682, |
|
"learning_rate": 0.0001846464935860431, |
|
"loss": 0.0518, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.52054794520548, |
|
"grad_norm": 0.4424096643924713, |
|
"learning_rate": 0.0001834143636486124, |
|
"loss": 0.0524, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.657534246575342, |
|
"grad_norm": 0.50010746717453, |
|
"learning_rate": 0.00018213913087991685, |
|
"loss": 0.0629, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.794520547945205, |
|
"grad_norm": 0.4036540389060974, |
|
"learning_rate": 0.00018082145423292868, |
|
"loss": 0.0531, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.931506849315069, |
|
"grad_norm": 0.36036092042922974, |
|
"learning_rate": 0.0001794620145927101, |
|
"loss": 0.0556, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.068493150684931, |
|
"grad_norm": 0.22472509741783142, |
|
"learning_rate": 0.00017806151442457827, |
|
"loss": 0.0446, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.205479452054795, |
|
"grad_norm": 0.3514921963214874, |
|
"learning_rate": 0.00017662067741111974, |
|
"loss": 0.0443, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 5.342465753424658, |
|
"grad_norm": 0.2920095920562744, |
|
"learning_rate": 0.00017514024807824055, |
|
"loss": 0.0451, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.47945205479452, |
|
"grad_norm": 0.21051590144634247, |
|
"learning_rate": 0.00017362099141044626, |
|
"loss": 0.0476, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.616438356164384, |
|
"grad_norm": 0.36196619272232056, |
|
"learning_rate": 0.00017206369245555036, |
|
"loss": 0.0521, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 5.7534246575342465, |
|
"grad_norm": 0.3503723442554474, |
|
"learning_rate": 0.0001704691559190155, |
|
"loss": 0.0472, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 5.890410958904109, |
|
"grad_norm": 0.3881896734237671, |
|
"learning_rate": 0.0001688382057481364, |
|
"loss": 0.0537, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.027397260273973, |
|
"grad_norm": 0.29409492015838623, |
|
"learning_rate": 0.00016717168470628077, |
|
"loss": 0.0436, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 6.164383561643835, |
|
"grad_norm": 0.2455558031797409, |
|
"learning_rate": 0.0001654704539374066, |
|
"loss": 0.0429, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 6.301369863013699, |
|
"grad_norm": 0.30749672651290894, |
|
"learning_rate": 0.00016373539252108202, |
|
"loss": 0.042, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 6.438356164383562, |
|
"grad_norm": 0.4117829501628876, |
|
"learning_rate": 0.00016196739701823716, |
|
"loss": 0.0422, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 6.575342465753424, |
|
"grad_norm": 0.3047957718372345, |
|
"learning_rate": 0.00016016738100788297, |
|
"loss": 0.0456, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 6.712328767123288, |
|
"grad_norm": 0.3104310631752014, |
|
"learning_rate": 0.00015833627461503595, |
|
"loss": 0.0405, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 6.8493150684931505, |
|
"grad_norm": 0.3713166415691376, |
|
"learning_rate": 0.0001564750240300934, |
|
"loss": 0.0451, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 6.986301369863014, |
|
"grad_norm": 0.23804673552513123, |
|
"learning_rate": 0.00015458459101990693, |
|
"loss": 0.0387, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 7.123287671232877, |
|
"grad_norm": 0.4476951062679291, |
|
"learning_rate": 0.00015266595243080714, |
|
"loss": 0.0406, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 7.260273972602739, |
|
"grad_norm": 0.27973777055740356, |
|
"learning_rate": 0.00015072009968383656, |
|
"loss": 0.0464, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 7.397260273972603, |
|
"grad_norm": 0.3597777783870697, |
|
"learning_rate": 0.00014874803826245089, |
|
"loss": 0.0459, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 7.534246575342466, |
|
"grad_norm": 0.27027377486228943, |
|
"learning_rate": 0.00014675078719295415, |
|
"loss": 0.0375, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 7.671232876712329, |
|
"grad_norm": 0.27681443095207214, |
|
"learning_rate": 0.00014472937851793557, |
|
"loss": 0.0421, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 7.808219178082192, |
|
"grad_norm": 0.3312411904335022, |
|
"learning_rate": 0.00014268485676298078, |
|
"loss": 0.048, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 7.945205479452055, |
|
"grad_norm": 0.2358381599187851, |
|
"learning_rate": 0.0001406182783969324, |
|
"loss": 0.0409, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 8.082191780821917, |
|
"grad_norm": 0.19072838127613068, |
|
"learning_rate": 0.00013853071128597924, |
|
"loss": 0.0417, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 8.219178082191782, |
|
"grad_norm": 0.3328644931316376, |
|
"learning_rate": 0.0001364232341418564, |
|
"loss": 0.0397, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 8.356164383561644, |
|
"grad_norm": 0.27157458662986755, |
|
"learning_rate": 0.00013429693596444067, |
|
"loss": 0.0395, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 8.493150684931507, |
|
"grad_norm": 0.2969032824039459, |
|
"learning_rate": 0.00013215291547903006, |
|
"loss": 0.0406, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 8.63013698630137, |
|
"grad_norm": 0.2864357829093933, |
|
"learning_rate": 0.00012999228056859784, |
|
"loss": 0.0424, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 8.767123287671232, |
|
"grad_norm": 0.25885725021362305, |
|
"learning_rate": 0.00012781614770131442, |
|
"loss": 0.0392, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 8.904109589041095, |
|
"grad_norm": 0.2456735372543335, |
|
"learning_rate": 0.00012562564135363313, |
|
"loss": 0.0415, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 9.04109589041096, |
|
"grad_norm": 0.41431066393852234, |
|
"learning_rate": 0.0001234218934292376, |
|
"loss": 0.0407, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 9.178082191780822, |
|
"grad_norm": 0.260213702917099, |
|
"learning_rate": 0.00012120604267415172, |
|
"loss": 0.0393, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 9.315068493150685, |
|
"grad_norm": 0.3395901322364807, |
|
"learning_rate": 0.00011897923408831346, |
|
"loss": 0.035, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 9.452054794520548, |
|
"grad_norm": 0.3405311405658722, |
|
"learning_rate": 0.0001167426183339174, |
|
"loss": 0.0342, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 9.58904109589041, |
|
"grad_norm": 0.20802819728851318, |
|
"learning_rate": 0.00011449735114083127, |
|
"loss": 0.0347, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 9.726027397260275, |
|
"grad_norm": 0.5094506144523621, |
|
"learning_rate": 0.00011224459270939384, |
|
"loss": 0.0373, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 9.863013698630137, |
|
"grad_norm": 0.21799403429031372, |
|
"learning_rate": 0.000109985507110903, |
|
"loss": 0.0392, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.28433603048324585, |
|
"learning_rate": 0.00010772126168610325, |
|
"loss": 0.0373, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 10.136986301369863, |
|
"grad_norm": 0.3425813913345337, |
|
"learning_rate": 0.00010545302644198405, |
|
"loss": 0.0385, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 10.273972602739725, |
|
"grad_norm": 0.2662697434425354, |
|
"learning_rate": 0.00010318197344720018, |
|
"loss": 0.0347, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 10.41095890410959, |
|
"grad_norm": 0.2841816842556, |
|
"learning_rate": 0.0001009092762264271, |
|
"loss": 0.04, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 10.547945205479452, |
|
"grad_norm": 0.2933363914489746, |
|
"learning_rate": 9.863610915396365e-05, |
|
"loss": 0.0363, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 10.684931506849315, |
|
"grad_norm": 0.20692330598831177, |
|
"learning_rate": 9.63636468468959e-05, |
|
"loss": 0.0361, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 10.821917808219178, |
|
"grad_norm": 0.24741721153259277, |
|
"learning_rate": 9.409306355813529e-05, |
|
"loss": 0.0341, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 10.95890410958904, |
|
"grad_norm": 0.1948077529668808, |
|
"learning_rate": 9.18255325696454e-05, |
|
"loss": 0.0349, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 11.095890410958905, |
|
"grad_norm": 0.16165360808372498, |
|
"learning_rate": 8.956222558616998e-05, |
|
"loss": 0.0318, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 11.232876712328768, |
|
"grad_norm": 0.25702184438705444, |
|
"learning_rate": 8.730431212977625e-05, |
|
"loss": 0.0281, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 11.36986301369863, |
|
"grad_norm": 0.27587395906448364, |
|
"learning_rate": 8.505295893552594e-05, |
|
"loss": 0.0349, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 11.506849315068493, |
|
"grad_norm": 0.3140430152416229, |
|
"learning_rate": 8.280932934858652e-05, |
|
"loss": 0.0305, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 11.643835616438356, |
|
"grad_norm": 0.21165433526039124, |
|
"learning_rate": 8.05745827230941e-05, |
|
"loss": 0.0314, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 11.780821917808218, |
|
"grad_norm": 0.20445489883422852, |
|
"learning_rate": 7.834987382307861e-05, |
|
"loss": 0.0319, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 11.917808219178083, |
|
"grad_norm": 0.27832481265068054, |
|
"learning_rate": 7.613635222576072e-05, |
|
"loss": 0.0334, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 12.054794520547945, |
|
"grad_norm": 0.25728923082351685, |
|
"learning_rate": 7.393516172752919e-05, |
|
"loss": 0.033, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 12.191780821917808, |
|
"grad_norm": 0.2254086136817932, |
|
"learning_rate": 7.174743975290513e-05, |
|
"loss": 0.0346, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 12.32876712328767, |
|
"grad_norm": 0.31018713116645813, |
|
"learning_rate": 6.957431676679896e-05, |
|
"loss": 0.0329, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 12.465753424657533, |
|
"grad_norm": 0.32662343978881836, |
|
"learning_rate": 6.741691569036338e-05, |
|
"loss": 0.0342, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 12.602739726027398, |
|
"grad_norm": 0.2533169984817505, |
|
"learning_rate": 6.527635132074493e-05, |
|
"loss": 0.0264, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 12.73972602739726, |
|
"grad_norm": 0.27445635199546814, |
|
"learning_rate": 6.315372975503285e-05, |
|
"loss": 0.0281, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 12.876712328767123, |
|
"grad_norm": 0.21471256017684937, |
|
"learning_rate": 6.1050147818704e-05, |
|
"loss": 0.0321, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 13.013698630136986, |
|
"grad_norm": 0.19105984270572662, |
|
"learning_rate": 5.896669249885851e-05, |
|
"loss": 0.0273, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 13.150684931506849, |
|
"grad_norm": 0.3308360278606415, |
|
"learning_rate": 5.690444038253935e-05, |
|
"loss": 0.0343, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 13.287671232876713, |
|
"grad_norm": 0.1988590806722641, |
|
"learning_rate": 5.4864457100425783e-05, |
|
"loss": 0.028, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 13.424657534246576, |
|
"grad_norm": 0.1858794391155243, |
|
"learning_rate": 5.284779677618841e-05, |
|
"loss": 0.0273, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 13.561643835616438, |
|
"grad_norm": 0.29671627283096313, |
|
"learning_rate": 5.0855501481790305e-05, |
|
"loss": 0.0271, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 13.698630136986301, |
|
"grad_norm": 0.17693527042865753, |
|
"learning_rate": 4.8888600699015496e-05, |
|
"loss": 0.034, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 13.835616438356164, |
|
"grad_norm": 0.31038013100624084, |
|
"learning_rate": 4.694811078750338e-05, |
|
"loss": 0.0251, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 13.972602739726028, |
|
"grad_norm": 0.3317829668521881, |
|
"learning_rate": 4.50350344595635e-05, |
|
"loss": 0.0334, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 14.10958904109589, |
|
"grad_norm": 0.1818408966064453, |
|
"learning_rate": 4.315036026204262e-05, |
|
"loss": 0.0272, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 14.246575342465754, |
|
"grad_norm": 0.2105715572834015, |
|
"learning_rate": 4.129506206551138e-05, |
|
"loss": 0.025, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 14.383561643835616, |
|
"grad_norm": 0.18613150715827942, |
|
"learning_rate": 3.947009856103465e-05, |
|
"loss": 0.0238, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 14.520547945205479, |
|
"grad_norm": 0.2959461212158203, |
|
"learning_rate": 3.767641276478563e-05, |
|
"loss": 0.0249, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 14.657534246575342, |
|
"grad_norm": 0.18495745956897736, |
|
"learning_rate": 3.591493153075966e-05, |
|
"loss": 0.0214, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 14.794520547945206, |
|
"grad_norm": 0.1501263529062271, |
|
"learning_rate": 3.41865650718396e-05, |
|
"loss": 0.0266, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 14.931506849315069, |
|
"grad_norm": 0.3387095332145691, |
|
"learning_rate": 3.24922064894601e-05, |
|
"loss": 0.0268, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 15.068493150684931, |
|
"grad_norm": 0.23434942960739136, |
|
"learning_rate": 3.083273131211382e-05, |
|
"loss": 0.0272, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 15.205479452054794, |
|
"grad_norm": 0.163187175989151, |
|
"learning_rate": 2.920899704293849e-05, |
|
"loss": 0.0232, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 15.342465753424657, |
|
"grad_norm": 0.20000265538692474, |
|
"learning_rate": 2.762184271661785e-05, |
|
"loss": 0.0261, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 15.479452054794521, |
|
"grad_norm": 0.18943333625793457, |
|
"learning_rate": 2.6072088465826038e-05, |
|
"loss": 0.0246, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 15.616438356164384, |
|
"grad_norm": 0.2833252251148224, |
|
"learning_rate": 2.4560535097439108e-05, |
|
"loss": 0.0253, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 15.753424657534246, |
|
"grad_norm": 0.1302843540906906, |
|
"learning_rate": 2.308796367873296e-05, |
|
"loss": 0.0246, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 15.89041095890411, |
|
"grad_norm": 0.16615238785743713, |
|
"learning_rate": 2.165513513378121e-05, |
|
"loss": 0.0254, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 16.027397260273972, |
|
"grad_norm": 0.17113815248012543, |
|
"learning_rate": 2.0262789850261798e-05, |
|
"loss": 0.0288, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 16.164383561643834, |
|
"grad_norm": 0.21394069492816925, |
|
"learning_rate": 1.8911647296875147e-05, |
|
"loss": 0.025, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 16.301369863013697, |
|
"grad_norm": 0.2763649523258209, |
|
"learning_rate": 1.7602405651572275e-05, |
|
"loss": 0.0219, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 16.438356164383563, |
|
"grad_norm": 0.13925646245479584, |
|
"learning_rate": 1.6335741440784035e-05, |
|
"loss": 0.0217, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 16.575342465753426, |
|
"grad_norm": 0.20826192200183868, |
|
"learning_rate": 1.511230918983867e-05, |
|
"loss": 0.023, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 16.71232876712329, |
|
"grad_norm": 0.2256271094083786, |
|
"learning_rate": 1.3932741084747913e-05, |
|
"loss": 0.023, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 16.84931506849315, |
|
"grad_norm": 0.27016547322273254, |
|
"learning_rate": 1.2797646645536566e-05, |
|
"loss": 0.0211, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 16.986301369863014, |
|
"grad_norm": 0.26627489924430847, |
|
"learning_rate": 1.1707612411284253e-05, |
|
"loss": 0.0235, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 17.123287671232877, |
|
"grad_norm": 0.18498767912387848, |
|
"learning_rate": 1.0663201637042252e-05, |
|
"loss": 0.022, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 17.26027397260274, |
|
"grad_norm": 0.23852607607841492, |
|
"learning_rate": 9.664954002781745e-06, |
|
"loss": 0.0228, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 17.397260273972602, |
|
"grad_norm": 0.15411531925201416, |
|
"learning_rate": 8.713385334524283e-06, |
|
"loss": 0.0198, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 17.534246575342465, |
|
"grad_norm": 0.25403866171836853, |
|
"learning_rate": 7.808987337798158e-06, |
|
"loss": 0.0257, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 17.671232876712327, |
|
"grad_norm": 0.14403975009918213, |
|
"learning_rate": 6.952227343558671e-06, |
|
"loss": 0.0215, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 17.80821917808219, |
|
"grad_norm": 0.188527911901474, |
|
"learning_rate": 6.143548066703475e-06, |
|
"loss": 0.0224, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 17.945205479452056, |
|
"grad_norm": 0.1309424489736557, |
|
"learning_rate": 5.383367377307857e-06, |
|
"loss": 0.0215, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 18.08219178082192, |
|
"grad_norm": 0.11233002692461014, |
|
"learning_rate": 4.672078084698095e-06, |
|
"loss": 0.0211, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 18.21917808219178, |
|
"grad_norm": 0.22869743406772614, |
|
"learning_rate": 4.010047734474454e-06, |
|
"loss": 0.0215, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 18.356164383561644, |
|
"grad_norm": 0.11979719996452332, |
|
"learning_rate": 3.397618418588877e-06, |
|
"loss": 0.0273, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 18.493150684931507, |
|
"grad_norm": 0.2112375795841217, |
|
"learning_rate": 2.8351065985751766e-06, |
|
"loss": 0.0228, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 18.63013698630137, |
|
"grad_norm": 0.14134034514427185, |
|
"learning_rate": 2.322802942023461e-06, |
|
"loss": 0.0247, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 18.767123287671232, |
|
"grad_norm": 0.09884881973266602, |
|
"learning_rate": 1.8609721723830132e-06, |
|
"loss": 0.0196, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 18.904109589041095, |
|
"grad_norm": 0.14044946432113647, |
|
"learning_rate": 1.4498529321713584e-06, |
|
"loss": 0.0198, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 19.041095890410958, |
|
"grad_norm": 0.13853876292705536, |
|
"learning_rate": 1.0896576596600705e-06, |
|
"loss": 0.0182, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 19.17808219178082, |
|
"grad_norm": 0.1654110848903656, |
|
"learning_rate": 7.80572479101327e-07, |
|
"loss": 0.0229, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 19.315068493150687, |
|
"grad_norm": 0.15151838958263397, |
|
"learning_rate": 5.227571045515633e-07, |
|
"loss": 0.0202, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 19.45205479452055, |
|
"grad_norm": 0.2258201688528061, |
|
"learning_rate": 3.163447573422351e-07, |
|
"loss": 0.0197, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 19.589041095890412, |
|
"grad_norm": 0.24640779197216034, |
|
"learning_rate": 1.614420972401165e-07, |
|
"loss": 0.0187, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 19.726027397260275, |
|
"grad_norm": 0.21181590855121613, |
|
"learning_rate": 5.812916733284324e-08, |
|
"loss": 0.0198, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 19.863013698630137, |
|
"grad_norm": 0.14787183701992035, |
|
"learning_rate": 6.459352668164442e-09, |
|
"loss": 0.0186, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 19.931506849315067, |
|
"step": 1455, |
|
"total_flos": 1.1504025698630573e+17, |
|
"train_loss": 0.05927258820058554, |
|
"train_runtime": 1048.7401, |
|
"train_samples_per_second": 88.792, |
|
"train_steps_per_second": 1.387 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1455, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1504025698630573e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|