|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 19.821428571428573, |
|
"eval_steps": 500, |
|
"global_step": 1665, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11904761904761904, |
|
"grad_norm": 4.05410099029541, |
|
"learning_rate": 2.380952380952381e-05, |
|
"loss": 1.0474, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 1.3301610946655273, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 0.4554, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 1.262498378753662, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 0.3098, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 0.8813798427581787, |
|
"learning_rate": 9.523809523809524e-05, |
|
"loss": 0.2559, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"grad_norm": 1.2415088415145874, |
|
"learning_rate": 0.00011904761904761905, |
|
"loss": 0.2103, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 1.0791610479354858, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.1988, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.8729690313339233, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.1797, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 1.2665623426437378, |
|
"learning_rate": 0.00019047619047619048, |
|
"loss": 0.1687, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 0.9594415426254272, |
|
"learning_rate": 0.00019999289272096886, |
|
"loss": 0.1538, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.1904761904761905, |
|
"grad_norm": 1.446765422821045, |
|
"learning_rate": 0.00019994946300764274, |
|
"loss": 0.145, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.3095238095238095, |
|
"grad_norm": 0.7596405744552612, |
|
"learning_rate": 0.00019986656919636397, |
|
"loss": 0.1428, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.8046891093254089, |
|
"learning_rate": 0.00019974424401696808, |
|
"loss": 0.1262, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.5476190476190477, |
|
"grad_norm": 0.3929666578769684, |
|
"learning_rate": 0.00019958253576839256, |
|
"loss": 0.117, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 1.0657896995544434, |
|
"learning_rate": 0.00019938150829960634, |
|
"loss": 0.1223, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"grad_norm": 0.4697973132133484, |
|
"learning_rate": 0.00019914124098439974, |
|
"loss": 0.1152, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 0.40580397844314575, |
|
"learning_rate": 0.00019886182869004445, |
|
"loss": 0.104, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.0238095238095237, |
|
"grad_norm": 0.5756620168685913, |
|
"learning_rate": 0.00019854338173983614, |
|
"loss": 0.0907, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 0.33545899391174316, |
|
"learning_rate": 0.00019818602586953415, |
|
"loss": 0.0889, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.261904761904762, |
|
"grad_norm": 0.7573866844177246, |
|
"learning_rate": 0.00019778990217771621, |
|
"loss": 0.0932, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 0.45556336641311646, |
|
"learning_rate": 0.00019735516707006676, |
|
"loss": 0.0855, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.346583753824234, |
|
"learning_rate": 0.00019688199219762182, |
|
"loss": 0.0853, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.619047619047619, |
|
"grad_norm": 0.44082921743392944, |
|
"learning_rate": 0.0001963705643889941, |
|
"loss": 0.0818, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.738095238095238, |
|
"grad_norm": 0.5446610450744629, |
|
"learning_rate": 0.00019582108557660553, |
|
"loss": 0.079, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.5620890259742737, |
|
"learning_rate": 0.0001952337727169561, |
|
"loss": 0.0826, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"grad_norm": 0.27951428294181824, |
|
"learning_rate": 0.0001946088577049608, |
|
"loss": 0.077, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.0952380952380953, |
|
"grad_norm": 0.28334465622901917, |
|
"learning_rate": 0.00019394658728238794, |
|
"loss": 0.0791, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.2142857142857144, |
|
"grad_norm": 0.33841413259506226, |
|
"learning_rate": 0.00019324722294043558, |
|
"loss": 0.0832, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.3847584128379822, |
|
"learning_rate": 0.0001925110408164842, |
|
"loss": 0.0716, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.4523809523809526, |
|
"grad_norm": 0.277832567691803, |
|
"learning_rate": 0.00019173833158506648, |
|
"loss": 0.07, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 0.33352115750312805, |
|
"learning_rate": 0.00019092940034309722, |
|
"loss": 0.0709, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.6904761904761907, |
|
"grad_norm": 0.5264162421226501, |
|
"learning_rate": 0.0001900845664894086, |
|
"loss": 0.0731, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.8095238095238093, |
|
"grad_norm": 0.3690634071826935, |
|
"learning_rate": 0.00018920416359863887, |
|
"loss": 0.0635, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.928571428571429, |
|
"grad_norm": 0.33004167675971985, |
|
"learning_rate": 0.0001882885392895232, |
|
"loss": 0.0745, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.0476190476190474, |
|
"grad_norm": 0.4185870587825775, |
|
"learning_rate": 0.00018733805508764002, |
|
"loss": 0.0802, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 0.30809977650642395, |
|
"learning_rate": 0.00018635308628266585, |
|
"loss": 0.071, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 0.3416142463684082, |
|
"learning_rate": 0.00018533402178019594, |
|
"loss": 0.0746, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.404761904761905, |
|
"grad_norm": 0.3178706467151642, |
|
"learning_rate": 0.0001842812639481884, |
|
"loss": 0.0615, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.523809523809524, |
|
"grad_norm": 0.2836775779724121, |
|
"learning_rate": 0.00018319522845809306, |
|
"loss": 0.0595, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.642857142857143, |
|
"grad_norm": 0.3071931004524231, |
|
"learning_rate": 0.00018207634412072764, |
|
"loss": 0.0636, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.761904761904762, |
|
"grad_norm": 0.4010617434978485, |
|
"learning_rate": 0.0001809250527169658, |
|
"loss": 0.0593, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.880952380952381, |
|
"grad_norm": 0.2992096245288849, |
|
"learning_rate": 0.00017974180882330412, |
|
"loss": 0.0687, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.7678210735321045, |
|
"learning_rate": 0.0001785270796323769, |
|
"loss": 0.0647, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 5.119047619047619, |
|
"grad_norm": 0.2817954123020172, |
|
"learning_rate": 0.00017728134476848966, |
|
"loss": 0.0637, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.238095238095238, |
|
"grad_norm": 0.3238624930381775, |
|
"learning_rate": 0.00017600509609824388, |
|
"loss": 0.0654, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.357142857142857, |
|
"grad_norm": 0.4325568377971649, |
|
"learning_rate": 0.00017469883753632817, |
|
"loss": 0.0574, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.476190476190476, |
|
"grad_norm": 0.3361673653125763, |
|
"learning_rate": 0.0001733630848465525, |
|
"loss": 0.0614, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.595238095238095, |
|
"grad_norm": 0.2163662165403366, |
|
"learning_rate": 0.00017199836543820357, |
|
"loss": 0.0564, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 0.3433023989200592, |
|
"learning_rate": 0.00017060521815780223, |
|
"loss": 0.0566, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.833333333333333, |
|
"grad_norm": 0.2643953859806061, |
|
"learning_rate": 0.0001691841930763453, |
|
"loss": 0.0486, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 5.9523809523809526, |
|
"grad_norm": 0.2475927174091339, |
|
"learning_rate": 0.00016773585127211478, |
|
"loss": 0.0567, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 6.071428571428571, |
|
"grad_norm": 0.23853139579296112, |
|
"learning_rate": 0.00016626076460914198, |
|
"loss": 0.0516, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 6.190476190476191, |
|
"grad_norm": 0.22467206418514252, |
|
"learning_rate": 0.00016475951551141199, |
|
"loss": 0.0515, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 6.309523809523809, |
|
"grad_norm": 0.3674350380897522, |
|
"learning_rate": 0.0001632326967328993, |
|
"loss": 0.0513, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.428571428571429, |
|
"grad_norm": 0.2875892221927643, |
|
"learning_rate": 0.0001616809111235244, |
|
"loss": 0.0546, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.5476190476190474, |
|
"grad_norm": 0.27081480622291565, |
|
"learning_rate": 0.0001601047713911244, |
|
"loss": 0.0486, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.29003506898880005, |
|
"learning_rate": 0.00015850489985953076, |
|
"loss": 0.0519, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 6.785714285714286, |
|
"grad_norm": 0.267265647649765, |
|
"learning_rate": 0.00015688192822285117, |
|
"loss": 0.0534, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 6.904761904761905, |
|
"grad_norm": 0.2007722705602646, |
|
"learning_rate": 0.0001552364972960506, |
|
"loss": 0.0525, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 7.023809523809524, |
|
"grad_norm": 0.2781243324279785, |
|
"learning_rate": 0.0001535692567619319, |
|
"loss": 0.049, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 7.142857142857143, |
|
"grad_norm": 0.302306592464447, |
|
"learning_rate": 0.00015188086491461466, |
|
"loss": 0.0479, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 7.261904761904762, |
|
"grad_norm": 0.23352427780628204, |
|
"learning_rate": 0.0001501719883996139, |
|
"loss": 0.0522, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 7.380952380952381, |
|
"grad_norm": 0.3669947385787964, |
|
"learning_rate": 0.00014844330195062144, |
|
"loss": 0.0498, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.22105714678764343, |
|
"learning_rate": 0.00014669548812309388, |
|
"loss": 0.048, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 7.619047619047619, |
|
"grad_norm": 0.2889458239078522, |
|
"learning_rate": 0.00014492923702475182, |
|
"loss": 0.0475, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 7.738095238095238, |
|
"grad_norm": 0.22984769940376282, |
|
"learning_rate": 0.00014314524604309748, |
|
"loss": 0.0537, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 7.857142857142857, |
|
"grad_norm": 0.22611601650714874, |
|
"learning_rate": 0.00014134421957005775, |
|
"loss": 0.048, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 7.976190476190476, |
|
"grad_norm": 0.37012603878974915, |
|
"learning_rate": 0.00013952686872386195, |
|
"loss": 0.0489, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 8.095238095238095, |
|
"grad_norm": 0.27276352047920227, |
|
"learning_rate": 0.00013769391106826327, |
|
"loss": 0.0479, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 8.214285714285714, |
|
"grad_norm": 0.23436495661735535, |
|
"learning_rate": 0.00013584607032921566, |
|
"loss": 0.0433, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 8.333333333333334, |
|
"grad_norm": 0.2781357169151306, |
|
"learning_rate": 0.0001339840761091175, |
|
"loss": 0.043, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.452380952380953, |
|
"grad_norm": 0.2650757133960724, |
|
"learning_rate": 0.00013210866359873505, |
|
"loss": 0.0447, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 8.571428571428571, |
|
"grad_norm": 0.3430781364440918, |
|
"learning_rate": 0.00013022057328691914, |
|
"loss": 0.0452, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 8.69047619047619, |
|
"grad_norm": 0.21453280746936798, |
|
"learning_rate": 0.00012832055066823038, |
|
"loss": 0.0438, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 8.80952380952381, |
|
"grad_norm": 0.29838255047798157, |
|
"learning_rate": 0.00012640934594858774, |
|
"loss": 0.0479, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 8.928571428571429, |
|
"grad_norm": 0.19045886397361755, |
|
"learning_rate": 0.00012448771374905655, |
|
"loss": 0.0459, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 9.047619047619047, |
|
"grad_norm": 0.4510950744152069, |
|
"learning_rate": 0.00012255641280789386, |
|
"loss": 0.0512, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 9.166666666666666, |
|
"grad_norm": 0.24877306818962097, |
|
"learning_rate": 0.0001206162056809676, |
|
"loss": 0.0472, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 9.285714285714286, |
|
"grad_norm": 0.2406831532716751, |
|
"learning_rate": 0.00011866785844066883, |
|
"loss": 0.0476, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 9.404761904761905, |
|
"grad_norm": 0.2927370071411133, |
|
"learning_rate": 0.00011671214037343514, |
|
"loss": 0.0412, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 9.523809523809524, |
|
"grad_norm": 0.20210541784763336, |
|
"learning_rate": 0.00011474982367600525, |
|
"loss": 0.0424, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 9.642857142857142, |
|
"grad_norm": 0.2872091829776764, |
|
"learning_rate": 0.00011278168315052445, |
|
"loss": 0.0399, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 9.761904761904763, |
|
"grad_norm": 0.290270060300827, |
|
"learning_rate": 0.00011080849589862142, |
|
"loss": 0.0357, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 9.880952380952381, |
|
"grad_norm": 0.20899908244609833, |
|
"learning_rate": 0.0001088310410145768, |
|
"loss": 0.0401, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.248155415058136, |
|
"learning_rate": 0.00010685009927770542, |
|
"loss": 0.035, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 10.119047619047619, |
|
"grad_norm": 0.18403619527816772, |
|
"learning_rate": 0.00010486645284407281, |
|
"loss": 0.039, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 10.238095238095237, |
|
"grad_norm": 0.16922874748706818, |
|
"learning_rate": 0.00010288088493766845, |
|
"loss": 0.0361, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 10.357142857142858, |
|
"grad_norm": 0.19182036817073822, |
|
"learning_rate": 0.00010089417954115714, |
|
"loss": 0.0404, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 10.476190476190476, |
|
"grad_norm": 0.2680899202823639, |
|
"learning_rate": 9.890712108633076e-05, |
|
"loss": 0.0366, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 10.595238095238095, |
|
"grad_norm": 0.21088555455207825, |
|
"learning_rate": 9.692049414438299e-05, |
|
"loss": 0.0397, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 10.714285714285714, |
|
"grad_norm": 0.27201998233795166, |
|
"learning_rate": 9.493508311612874e-05, |
|
"loss": 0.0411, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 10.833333333333334, |
|
"grad_norm": 0.15937431156635284, |
|
"learning_rate": 9.295167192229093e-05, |
|
"loss": 0.0394, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 10.952380952380953, |
|
"grad_norm": 0.20022912323474884, |
|
"learning_rate": 9.097104369397682e-05, |
|
"loss": 0.0361, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 11.071428571428571, |
|
"grad_norm": 0.2788039743900299, |
|
"learning_rate": 8.899398046346608e-05, |
|
"loss": 0.0397, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 11.19047619047619, |
|
"grad_norm": 0.2813689708709717, |
|
"learning_rate": 8.702126285543286e-05, |
|
"loss": 0.0374, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 11.30952380952381, |
|
"grad_norm": 0.1632857620716095, |
|
"learning_rate": 8.505366977872336e-05, |
|
"loss": 0.0372, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 11.428571428571429, |
|
"grad_norm": 0.16908283531665802, |
|
"learning_rate": 8.309197811881127e-05, |
|
"loss": 0.0361, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 11.547619047619047, |
|
"grad_norm": 0.2518416941165924, |
|
"learning_rate": 8.113696243105176e-05, |
|
"loss": 0.0374, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 11.666666666666666, |
|
"grad_norm": 0.21874170005321503, |
|
"learning_rate": 7.918939463485568e-05, |
|
"loss": 0.0363, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 11.785714285714286, |
|
"grad_norm": 0.2551394999027252, |
|
"learning_rate": 7.72500437089046e-05, |
|
"loss": 0.0348, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 11.904761904761905, |
|
"grad_norm": 0.2511221170425415, |
|
"learning_rate": 7.531967538752656e-05, |
|
"loss": 0.0412, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 12.023809523809524, |
|
"grad_norm": 0.24874921143054962, |
|
"learning_rate": 7.33990518583535e-05, |
|
"loss": 0.0369, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 12.142857142857142, |
|
"grad_norm": 0.25549542903900146, |
|
"learning_rate": 7.148893146137852e-05, |
|
"loss": 0.0334, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 12.261904761904763, |
|
"grad_norm": 0.25841251015663147, |
|
"learning_rate": 6.95900683895325e-05, |
|
"loss": 0.0378, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 12.380952380952381, |
|
"grad_norm": 0.2499120533466339, |
|
"learning_rate": 6.770321239089826e-05, |
|
"loss": 0.0314, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"grad_norm": 0.19628271460533142, |
|
"learning_rate": 6.582910847267957e-05, |
|
"loss": 0.0287, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 12.619047619047619, |
|
"grad_norm": 0.18714749813079834, |
|
"learning_rate": 6.396849660704205e-05, |
|
"loss": 0.0302, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 12.738095238095237, |
|
"grad_norm": 0.19048580527305603, |
|
"learning_rate": 6.21221114389424e-05, |
|
"loss": 0.0276, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 12.857142857142858, |
|
"grad_norm": 0.21586266160011292, |
|
"learning_rate": 6.0290681996060605e-05, |
|
"loss": 0.0341, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 12.976190476190476, |
|
"grad_norm": 0.22894449532032013, |
|
"learning_rate": 5.847493140095029e-05, |
|
"loss": 0.0315, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 13.095238095238095, |
|
"grad_norm": 0.2722737491130829, |
|
"learning_rate": 5.6675576585520786e-05, |
|
"loss": 0.0288, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 13.214285714285714, |
|
"grad_norm": 0.14149416983127594, |
|
"learning_rate": 5.4893328007963094e-05, |
|
"loss": 0.0302, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 0.2547774314880371, |
|
"learning_rate": 5.312888937223244e-05, |
|
"loss": 0.0282, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 13.452380952380953, |
|
"grad_norm": 0.1506737470626831, |
|
"learning_rate": 5.1382957350197405e-05, |
|
"loss": 0.0285, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 13.571428571428571, |
|
"grad_norm": 0.19141684472560883, |
|
"learning_rate": 4.965622130656551e-05, |
|
"loss": 0.0323, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 13.69047619047619, |
|
"grad_norm": 0.18162524700164795, |
|
"learning_rate": 4.794936302669417e-05, |
|
"loss": 0.0306, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 13.80952380952381, |
|
"grad_norm": 0.20886124670505524, |
|
"learning_rate": 4.6263056447394347e-05, |
|
"loss": 0.0272, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 13.928571428571429, |
|
"grad_norm": 0.14155210554599762, |
|
"learning_rate": 4.459796739083274e-05, |
|
"loss": 0.0299, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 14.047619047619047, |
|
"grad_norm": 0.13192126154899597, |
|
"learning_rate": 4.2954753301638315e-05, |
|
"loss": 0.0337, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 14.166666666666666, |
|
"grad_norm": 0.17799627780914307, |
|
"learning_rate": 4.133406298731669e-05, |
|
"loss": 0.0278, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 14.285714285714286, |
|
"grad_norm": 0.1519002765417099, |
|
"learning_rate": 3.973653636207437e-05, |
|
"loss": 0.026, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 14.404761904761905, |
|
"grad_norm": 0.24323873221874237, |
|
"learning_rate": 3.8162804194154864e-05, |
|
"loss": 0.0329, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 14.523809523809524, |
|
"grad_norm": 0.25485578179359436, |
|
"learning_rate": 3.661348785678574e-05, |
|
"loss": 0.0282, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 14.642857142857142, |
|
"grad_norm": 0.14652098715305328, |
|
"learning_rate": 3.508919908283543e-05, |
|
"loss": 0.0263, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 14.761904761904763, |
|
"grad_norm": 0.26984453201293945, |
|
"learning_rate": 3.3590539723276083e-05, |
|
"loss": 0.029, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 14.880952380952381, |
|
"grad_norm": 0.13968226313591003, |
|
"learning_rate": 3.211810150954867e-05, |
|
"loss": 0.0267, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.29029059410095215, |
|
"learning_rate": 3.067246581992321e-05, |
|
"loss": 0.0255, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 15.119047619047619, |
|
"grad_norm": 0.16652598977088928, |
|
"learning_rate": 2.925420344994719e-05, |
|
"loss": 0.0268, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 15.238095238095237, |
|
"grad_norm": 0.27046117186546326, |
|
"learning_rate": 2.786387438707231e-05, |
|
"loss": 0.0245, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 15.357142857142858, |
|
"grad_norm": 0.22596995532512665, |
|
"learning_rate": 2.6502027589548862e-05, |
|
"loss": 0.0288, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 15.476190476190476, |
|
"grad_norm": 0.16162142157554626, |
|
"learning_rate": 2.516920076967455e-05, |
|
"loss": 0.0263, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 15.595238095238095, |
|
"grad_norm": 0.1669321060180664, |
|
"learning_rate": 2.3865920181484123e-05, |
|
"loss": 0.0284, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 15.714285714285714, |
|
"grad_norm": 0.20296935737133026, |
|
"learning_rate": 2.2592700412962777e-05, |
|
"loss": 0.026, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 15.833333333333334, |
|
"grad_norm": 0.17937733232975006, |
|
"learning_rate": 2.1350044182866025e-05, |
|
"loss": 0.0217, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 15.952380952380953, |
|
"grad_norm": 0.14423789083957672, |
|
"learning_rate": 2.0138442142226e-05, |
|
"loss": 0.0289, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 16.071428571428573, |
|
"grad_norm": 0.21349196135997772, |
|
"learning_rate": 1.895837268062256e-05, |
|
"loss": 0.029, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 16.19047619047619, |
|
"grad_norm": 0.17763833701610565, |
|
"learning_rate": 1.7810301737295588e-05, |
|
"loss": 0.0234, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 16.30952380952381, |
|
"grad_norm": 0.1566387116909027, |
|
"learning_rate": 1.6694682617173452e-05, |
|
"loss": 0.025, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 16.428571428571427, |
|
"grad_norm": 0.19837665557861328, |
|
"learning_rate": 1.5611955811889644e-05, |
|
"loss": 0.0254, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 16.547619047619047, |
|
"grad_norm": 0.17018267512321472, |
|
"learning_rate": 1.456254882585909e-05, |
|
"loss": 0.0212, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 16.666666666666668, |
|
"grad_norm": 0.11024856567382812, |
|
"learning_rate": 1.3546876007481845e-05, |
|
"loss": 0.0208, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 16.785714285714285, |
|
"grad_norm": 0.16558444499969482, |
|
"learning_rate": 1.2565338385541792e-05, |
|
"loss": 0.0234, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 16.904761904761905, |
|
"grad_norm": 0.14750836789608002, |
|
"learning_rate": 1.161832351086396e-05, |
|
"loss": 0.0234, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 17.023809523809526, |
|
"grad_norm": 0.14175614714622498, |
|
"learning_rate": 1.0706205303294026e-05, |
|
"loss": 0.0233, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 17.142857142857142, |
|
"grad_norm": 0.128941610455513, |
|
"learning_rate": 9.82934390405934e-06, |
|
"loss": 0.0228, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 17.261904761904763, |
|
"grad_norm": 0.17659620940685272, |
|
"learning_rate": 8.988085533570833e-06, |
|
"loss": 0.0244, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 17.38095238095238, |
|
"grad_norm": 0.1548050194978714, |
|
"learning_rate": 8.182762354720985e-06, |
|
"loss": 0.0203, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 17.5, |
|
"grad_norm": 0.16272428631782532, |
|
"learning_rate": 7.413692341732581e-06, |
|
"loss": 0.0285, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 17.61904761904762, |
|
"grad_norm": 0.22217531502246857, |
|
"learning_rate": 6.681179154609462e-06, |
|
"loss": 0.0195, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 17.738095238095237, |
|
"grad_norm": 0.16655217111110687, |
|
"learning_rate": 5.985512019239392e-06, |
|
"loss": 0.0236, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 17.857142857142858, |
|
"grad_norm": 0.19496409595012665, |
|
"learning_rate": 5.326965613195867e-06, |
|
"loss": 0.0246, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 17.976190476190474, |
|
"grad_norm": 0.18502101302146912, |
|
"learning_rate": 4.705799957284351e-06, |
|
"loss": 0.0209, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 18.095238095238095, |
|
"grad_norm": 0.20573672652244568, |
|
"learning_rate": 4.122260312875437e-06, |
|
"loss": 0.023, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 18.214285714285715, |
|
"grad_norm": 0.18070857226848602, |
|
"learning_rate": 3.576577085065824e-06, |
|
"loss": 0.0246, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 18.333333333333332, |
|
"grad_norm": 0.15732447803020477, |
|
"learning_rate": 3.0689657317049204e-06, |
|
"loss": 0.0232, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 18.452380952380953, |
|
"grad_norm": 0.18765419721603394, |
|
"learning_rate": 2.5996266783235078e-06, |
|
"loss": 0.0238, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 18.571428571428573, |
|
"grad_norm": 0.12155826389789581, |
|
"learning_rate": 2.1687452389974826e-06, |
|
"loss": 0.0223, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 18.69047619047619, |
|
"grad_norm": 0.12302082777023315, |
|
"learning_rate": 1.7764915431784378e-06, |
|
"loss": 0.0267, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 18.80952380952381, |
|
"grad_norm": 0.1826854795217514, |
|
"learning_rate": 1.4230204685196203e-06, |
|
"loss": 0.0238, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 18.928571428571427, |
|
"grad_norm": 0.1735326200723648, |
|
"learning_rate": 1.1084715797239798e-06, |
|
"loss": 0.0222, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 19.047619047619047, |
|
"grad_norm": 0.17233559489250183, |
|
"learning_rate": 8.329690734383277e-07, |
|
"loss": 0.0245, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 19.166666666666668, |
|
"grad_norm": 0.14161868393421173, |
|
"learning_rate": 5.966217292155296e-07, |
|
"loss": 0.0212, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 19.285714285714285, |
|
"grad_norm": 0.16612544655799866, |
|
"learning_rate": 3.99522866563895e-07, |
|
"loss": 0.0245, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 19.404761904761905, |
|
"grad_norm": 0.08409969508647919, |
|
"learning_rate": 2.417503081008632e-07, |
|
"loss": 0.0213, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 19.523809523809526, |
|
"grad_norm": 0.17151199281215668, |
|
"learning_rate": 1.2336634882544884e-07, |
|
"loss": 0.0176, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 19.642857142857142, |
|
"grad_norm": 0.13334044814109802, |
|
"learning_rate": 4.4417731521717574e-08, |
|
"loss": 0.0263, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 19.761904761904763, |
|
"grad_norm": 0.07808782160282135, |
|
"learning_rate": 4.935628302760176e-09, |
|
"loss": 0.0211, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 19.821428571428573, |
|
"step": 1665, |
|
"total_flos": 2.314379607040872e+17, |
|
"train_loss": 0.06144854805401496, |
|
"train_runtime": 1922.2885, |
|
"train_samples_per_second": 55.434, |
|
"train_steps_per_second": 0.866 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1665, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.314379607040872e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|