phospho-app
/

matcha_stir-kdzf5twob4

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 19.821428571428573,
+  "eval_steps": 500,
+  "global_step": 1665,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.11904761904761904,
+      "grad_norm": 4.05410099029541,
+      "learning_rate": 2.380952380952381e-05,
+      "loss": 1.0474,
+      "step": 10
+    },
+    {
+      "epoch": 0.23809523809523808,
+      "grad_norm": 1.3301610946655273,
+      "learning_rate": 4.761904761904762e-05,
+      "loss": 0.4554,
+      "step": 20
+    },
+    {
+      "epoch": 0.35714285714285715,
+      "grad_norm": 1.262498378753662,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 0.3098,
+      "step": 30
+    },
+    {
+      "epoch": 0.47619047619047616,
+      "grad_norm": 0.8813798427581787,
+      "learning_rate": 9.523809523809524e-05,
+      "loss": 0.2559,
+      "step": 40
+    },
+    {
+      "epoch": 0.5952380952380952,
+      "grad_norm": 1.2415088415145874,
+      "learning_rate": 0.00011904761904761905,
+      "loss": 0.2103,
+      "step": 50
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 1.0791610479354858,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 0.1988,
+      "step": 60
+    },
+    {
+      "epoch": 0.8333333333333334,
+      "grad_norm": 0.8729690313339233,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.1797,
+      "step": 70
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 1.2665623426437378,
+      "learning_rate": 0.00019047619047619048,
+      "loss": 0.1687,
+      "step": 80
+    },
+    {
+      "epoch": 1.0714285714285714,
+      "grad_norm": 0.9594415426254272,
+      "learning_rate": 0.00019999289272096886,
+      "loss": 0.1538,
+      "step": 90
+    },
+    {
+      "epoch": 1.1904761904761905,
+      "grad_norm": 1.446765422821045,
+      "learning_rate": 0.00019994946300764274,
+      "loss": 0.145,
+      "step": 100
+    },
+    {
+      "epoch": 1.3095238095238095,
+      "grad_norm": 0.7596405744552612,
+      "learning_rate": 0.00019986656919636397,
+      "loss": 0.1428,
+      "step": 110
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.8046891093254089,
+      "learning_rate": 0.00019974424401696808,
+      "loss": 0.1262,
+      "step": 120
+    },
+    {
+      "epoch": 1.5476190476190477,
+      "grad_norm": 0.3929666578769684,
+      "learning_rate": 0.00019958253576839256,
+      "loss": 0.117,
+      "step": 130
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 1.0657896995544434,
+      "learning_rate": 0.00019938150829960634,
+      "loss": 0.1223,
+      "step": 140
+    },
+    {
+      "epoch": 1.7857142857142856,
+      "grad_norm": 0.4697973132133484,
+      "learning_rate": 0.00019914124098439974,
+      "loss": 0.1152,
+      "step": 150
+    },
+    {
+      "epoch": 1.9047619047619047,
+      "grad_norm": 0.40580397844314575,
+      "learning_rate": 0.00019886182869004445,
+      "loss": 0.104,
+      "step": 160
+    },
+    {
+      "epoch": 2.0238095238095237,
+      "grad_norm": 0.5756620168685913,
+      "learning_rate": 0.00019854338173983614,
+      "loss": 0.0907,
+      "step": 170
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.33545899391174316,
+      "learning_rate": 0.00019818602586953415,
+      "loss": 0.0889,
+      "step": 180
+    },
+    {
+      "epoch": 2.261904761904762,
+      "grad_norm": 0.7573866844177246,
+      "learning_rate": 0.00019778990217771621,
+      "loss": 0.0932,
+      "step": 190
+    },
+    {
+      "epoch": 2.380952380952381,
+      "grad_norm": 0.45556336641311646,
+      "learning_rate": 0.00019735516707006676,
+      "loss": 0.0855,
+      "step": 200
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.346583753824234,
+      "learning_rate": 0.00019688199219762182,
+      "loss": 0.0853,
+      "step": 210
+    },
+    {
+      "epoch": 2.619047619047619,
+      "grad_norm": 0.44082921743392944,
+      "learning_rate": 0.0001963705643889941,
+      "loss": 0.0818,
+      "step": 220
+    },
+    {
+      "epoch": 2.738095238095238,
+      "grad_norm": 0.5446610450744629,
+      "learning_rate": 0.00019582108557660553,
+      "loss": 0.079,
+      "step": 230
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.5620890259742737,
+      "learning_rate": 0.0001952337727169561,
+      "loss": 0.0826,
+      "step": 240
+    },
+    {
+      "epoch": 2.9761904761904763,
+      "grad_norm": 0.27951428294181824,
+      "learning_rate": 0.0001946088577049608,
+      "loss": 0.077,
+      "step": 250
+    },
+    {
+      "epoch": 3.0952380952380953,
+      "grad_norm": 0.28334465622901917,
+      "learning_rate": 0.00019394658728238794,
+      "loss": 0.0791,
+      "step": 260
+    },
+    {
+      "epoch": 3.2142857142857144,
+      "grad_norm": 0.33841413259506226,
+      "learning_rate": 0.00019324722294043558,
+      "loss": 0.0832,
+      "step": 270
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 0.3847584128379822,
+      "learning_rate": 0.0001925110408164842,
+      "loss": 0.0716,
+      "step": 280
+    },
+    {
+      "epoch": 3.4523809523809526,
+      "grad_norm": 0.277832567691803,
+      "learning_rate": 0.00019173833158506648,
+      "loss": 0.07,
+      "step": 290
+    },
+    {
+      "epoch": 3.571428571428571,
+      "grad_norm": 0.33352115750312805,
+      "learning_rate": 0.00019092940034309722,
+      "loss": 0.0709,
+      "step": 300
+    },
+    {
+      "epoch": 3.6904761904761907,
+      "grad_norm": 0.5264162421226501,
+      "learning_rate": 0.0001900845664894086,
+      "loss": 0.0731,
+      "step": 310
+    },
+    {
+      "epoch": 3.8095238095238093,
+      "grad_norm": 0.3690634071826935,
+      "learning_rate": 0.00018920416359863887,
+      "loss": 0.0635,
+      "step": 320
+    },
+    {
+      "epoch": 3.928571428571429,
+      "grad_norm": 0.33004167675971985,
+      "learning_rate": 0.0001882885392895232,
+      "loss": 0.0745,
+      "step": 330
+    },
+    {
+      "epoch": 4.0476190476190474,
+      "grad_norm": 0.4185870587825775,
+      "learning_rate": 0.00018733805508764002,
+      "loss": 0.0802,
+      "step": 340
+    },
+    {
+      "epoch": 4.166666666666667,
+      "grad_norm": 0.30809977650642395,
+      "learning_rate": 0.00018635308628266585,
+      "loss": 0.071,
+      "step": 350
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 0.3416142463684082,
+      "learning_rate": 0.00018533402178019594,
+      "loss": 0.0746,
+      "step": 360
+    },
+    {
+      "epoch": 4.404761904761905,
+      "grad_norm": 0.3178706467151642,
+      "learning_rate": 0.0001842812639481884,
+      "loss": 0.0615,
+      "step": 370
+    },
+    {
+      "epoch": 4.523809523809524,
+      "grad_norm": 0.2836775779724121,
+      "learning_rate": 0.00018319522845809306,
+      "loss": 0.0595,
+      "step": 380
+    },
+    {
+      "epoch": 4.642857142857143,
+      "grad_norm": 0.3071931004524231,
+      "learning_rate": 0.00018207634412072764,
+      "loss": 0.0636,
+      "step": 390
+    },
+    {
+      "epoch": 4.761904761904762,
+      "grad_norm": 0.4010617434978485,
+      "learning_rate": 0.0001809250527169658,
+      "loss": 0.0593,
+      "step": 400
+    },
+    {
+      "epoch": 4.880952380952381,
+      "grad_norm": 0.2992096245288849,
+      "learning_rate": 0.00017974180882330412,
+      "loss": 0.0687,
+      "step": 410
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.7678210735321045,
+      "learning_rate": 0.0001785270796323769,
+      "loss": 0.0647,
+      "step": 420
+    },
+    {
+      "epoch": 5.119047619047619,
+      "grad_norm": 0.2817954123020172,
+      "learning_rate": 0.00017728134476848966,
+      "loss": 0.0637,
+      "step": 430
+    },
+    {
+      "epoch": 5.238095238095238,
+      "grad_norm": 0.3238624930381775,
+      "learning_rate": 0.00017600509609824388,
+      "loss": 0.0654,
+      "step": 440
+    },
+    {
+      "epoch": 5.357142857142857,
+      "grad_norm": 0.4325568377971649,
+      "learning_rate": 0.00017469883753632817,
+      "loss": 0.0574,
+      "step": 450
+    },
+    {
+      "epoch": 5.476190476190476,
+      "grad_norm": 0.3361673653125763,
+      "learning_rate": 0.0001733630848465525,
+      "loss": 0.0614,
+      "step": 460
+    },
+    {
+      "epoch": 5.595238095238095,
+      "grad_norm": 0.2163662165403366,
+      "learning_rate": 0.00017199836543820357,
+      "loss": 0.0564,
+      "step": 470
+    },
+    {
+      "epoch": 5.714285714285714,
+      "grad_norm": 0.3433023989200592,
+      "learning_rate": 0.00017060521815780223,
+      "loss": 0.0566,
+      "step": 480
+    },
+    {
+      "epoch": 5.833333333333333,
+      "grad_norm": 0.2643953859806061,
+      "learning_rate": 0.0001691841930763453,
+      "loss": 0.0486,
+      "step": 490
+    },
+    {
+      "epoch": 5.9523809523809526,
+      "grad_norm": 0.2475927174091339,
+      "learning_rate": 0.00016773585127211478,
+      "loss": 0.0567,
+      "step": 500
+    },
+    {
+      "epoch": 6.071428571428571,
+      "grad_norm": 0.23853139579296112,
+      "learning_rate": 0.00016626076460914198,
+      "loss": 0.0516,
+      "step": 510
+    },
+    {
+      "epoch": 6.190476190476191,
+      "grad_norm": 0.22467206418514252,
+      "learning_rate": 0.00016475951551141199,
+      "loss": 0.0515,
+      "step": 520
+    },
+    {
+      "epoch": 6.309523809523809,
+      "grad_norm": 0.3674350380897522,
+      "learning_rate": 0.0001632326967328993,
+      "loss": 0.0513,
+      "step": 530
+    },
+    {
+      "epoch": 6.428571428571429,
+      "grad_norm": 0.2875892221927643,
+      "learning_rate": 0.0001616809111235244,
+      "loss": 0.0546,
+      "step": 540
+    },
+    {
+      "epoch": 6.5476190476190474,
+      "grad_norm": 0.27081480622291565,
+      "learning_rate": 0.0001601047713911244,
+      "loss": 0.0486,
+      "step": 550
+    },
+    {
+      "epoch": 6.666666666666667,
+      "grad_norm": 0.29003506898880005,
+      "learning_rate": 0.00015850489985953076,
+      "loss": 0.0519,
+      "step": 560
+    },
+    {
+      "epoch": 6.785714285714286,
+      "grad_norm": 0.267265647649765,
+      "learning_rate": 0.00015688192822285117,
+      "loss": 0.0534,
+      "step": 570
+    },
+    {
+      "epoch": 6.904761904761905,
+      "grad_norm": 0.2007722705602646,
+      "learning_rate": 0.0001552364972960506,
+      "loss": 0.0525,
+      "step": 580
+    },
+    {
+      "epoch": 7.023809523809524,
+      "grad_norm": 0.2781243324279785,
+      "learning_rate": 0.0001535692567619319,
+      "loss": 0.049,
+      "step": 590
+    },
+    {
+      "epoch": 7.142857142857143,
+      "grad_norm": 0.302306592464447,
+      "learning_rate": 0.00015188086491461466,
+      "loss": 0.0479,
+      "step": 600
+    },
+    {
+      "epoch": 7.261904761904762,
+      "grad_norm": 0.23352427780628204,
+      "learning_rate": 0.0001501719883996139,
+      "loss": 0.0522,
+      "step": 610
+    },
+    {
+      "epoch": 7.380952380952381,
+      "grad_norm": 0.3669947385787964,
+      "learning_rate": 0.00014844330195062144,
+      "loss": 0.0498,
+      "step": 620
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.22105714678764343,
+      "learning_rate": 0.00014669548812309388,
+      "loss": 0.048,
+      "step": 630
+    },
+    {
+      "epoch": 7.619047619047619,
+      "grad_norm": 0.2889458239078522,
+      "learning_rate": 0.00014492923702475182,
+      "loss": 0.0475,
+      "step": 640
+    },
+    {
+      "epoch": 7.738095238095238,
+      "grad_norm": 0.22984769940376282,
+      "learning_rate": 0.00014314524604309748,
+      "loss": 0.0537,
+      "step": 650
+    },
+    {
+      "epoch": 7.857142857142857,
+      "grad_norm": 0.22611601650714874,
+      "learning_rate": 0.00014134421957005775,
+      "loss": 0.048,
+      "step": 660
+    },
+    {
+      "epoch": 7.976190476190476,
+      "grad_norm": 0.37012603878974915,
+      "learning_rate": 0.00013952686872386195,
+      "loss": 0.0489,
+      "step": 670
+    },
+    {
+      "epoch": 8.095238095238095,
+      "grad_norm": 0.27276352047920227,
+      "learning_rate": 0.00013769391106826327,
+      "loss": 0.0479,
+      "step": 680
+    },
+    {
+      "epoch": 8.214285714285714,
+      "grad_norm": 0.23436495661735535,
+      "learning_rate": 0.00013584607032921566,
+      "loss": 0.0433,
+      "step": 690
+    },
+    {
+      "epoch": 8.333333333333334,
+      "grad_norm": 0.2781357169151306,
+      "learning_rate": 0.0001339840761091175,
+      "loss": 0.043,
+      "step": 700
+    },
+    {
+      "epoch": 8.452380952380953,
+      "grad_norm": 0.2650757133960724,
+      "learning_rate": 0.00013210866359873505,
+      "loss": 0.0447,
+      "step": 710
+    },
+    {
+      "epoch": 8.571428571428571,
+      "grad_norm": 0.3430781364440918,
+      "learning_rate": 0.00013022057328691914,
+      "loss": 0.0452,
+      "step": 720
+    },
+    {
+      "epoch": 8.69047619047619,
+      "grad_norm": 0.21453280746936798,
+      "learning_rate": 0.00012832055066823038,
+      "loss": 0.0438,
+      "step": 730
+    },
+    {
+      "epoch": 8.80952380952381,
+      "grad_norm": 0.29838255047798157,
+      "learning_rate": 0.00012640934594858774,
+      "loss": 0.0479,
+      "step": 740
+    },
+    {
+      "epoch": 8.928571428571429,
+      "grad_norm": 0.19045886397361755,
+      "learning_rate": 0.00012448771374905655,
+      "loss": 0.0459,
+      "step": 750
+    },
+    {
+      "epoch": 9.047619047619047,
+      "grad_norm": 0.4510950744152069,
+      "learning_rate": 0.00012255641280789386,
+      "loss": 0.0512,
+      "step": 760
+    },
+    {
+      "epoch": 9.166666666666666,
+      "grad_norm": 0.24877306818962097,
+      "learning_rate": 0.0001206162056809676,
+      "loss": 0.0472,
+      "step": 770
+    },
+    {
+      "epoch": 9.285714285714286,
+      "grad_norm": 0.2406831532716751,
+      "learning_rate": 0.00011866785844066883,
+      "loss": 0.0476,
+      "step": 780
+    },
+    {
+      "epoch": 9.404761904761905,
+      "grad_norm": 0.2927370071411133,
+      "learning_rate": 0.00011671214037343514,
+      "loss": 0.0412,
+      "step": 790
+    },
+    {
+      "epoch": 9.523809523809524,
+      "grad_norm": 0.20210541784763336,
+      "learning_rate": 0.00011474982367600525,
+      "loss": 0.0424,
+      "step": 800
+    },
+    {
+      "epoch": 9.642857142857142,
+      "grad_norm": 0.2872091829776764,
+      "learning_rate": 0.00011278168315052445,
+      "loss": 0.0399,
+      "step": 810
+    },
+    {
+      "epoch": 9.761904761904763,
+      "grad_norm": 0.290270060300827,
+      "learning_rate": 0.00011080849589862142,
+      "loss": 0.0357,
+      "step": 820
+    },
+    {
+      "epoch": 9.880952380952381,
+      "grad_norm": 0.20899908244609833,
+      "learning_rate": 0.0001088310410145768,
+      "loss": 0.0401,
+      "step": 830
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.248155415058136,
+      "learning_rate": 0.00010685009927770542,
+      "loss": 0.035,
+      "step": 840
+    },
+    {
+      "epoch": 10.119047619047619,
+      "grad_norm": 0.18403619527816772,
+      "learning_rate": 0.00010486645284407281,
+      "loss": 0.039,
+      "step": 850
+    },
+    {
+      "epoch": 10.238095238095237,
+      "grad_norm": 0.16922874748706818,
+      "learning_rate": 0.00010288088493766845,
+      "loss": 0.0361,
+      "step": 860
+    },
+    {
+      "epoch": 10.357142857142858,
+      "grad_norm": 0.19182036817073822,
+      "learning_rate": 0.00010089417954115714,
+      "loss": 0.0404,
+      "step": 870
+    },
+    {
+      "epoch": 10.476190476190476,
+      "grad_norm": 0.2680899202823639,
+      "learning_rate": 9.890712108633076e-05,
+      "loss": 0.0366,
+      "step": 880
+    },
+    {
+      "epoch": 10.595238095238095,
+      "grad_norm": 0.21088555455207825,
+      "learning_rate": 9.692049414438299e-05,
+      "loss": 0.0397,
+      "step": 890
+    },
+    {
+      "epoch": 10.714285714285714,
+      "grad_norm": 0.27201998233795166,
+      "learning_rate": 9.493508311612874e-05,
+      "loss": 0.0411,
+      "step": 900
+    },
+    {
+      "epoch": 10.833333333333334,
+      "grad_norm": 0.15937431156635284,
+      "learning_rate": 9.295167192229093e-05,
+      "loss": 0.0394,
+      "step": 910
+    },
+    {
+      "epoch": 10.952380952380953,
+      "grad_norm": 0.20022912323474884,
+      "learning_rate": 9.097104369397682e-05,
+      "loss": 0.0361,
+      "step": 920
+    },
+    {
+      "epoch": 11.071428571428571,
+      "grad_norm": 0.2788039743900299,
+      "learning_rate": 8.899398046346608e-05,
+      "loss": 0.0397,
+      "step": 930
+    },
+    {
+      "epoch": 11.19047619047619,
+      "grad_norm": 0.2813689708709717,
+      "learning_rate": 8.702126285543286e-05,
+      "loss": 0.0374,
+      "step": 940
+    },
+    {
+      "epoch": 11.30952380952381,
+      "grad_norm": 0.1632857620716095,
+      "learning_rate": 8.505366977872336e-05,
+      "loss": 0.0372,
+      "step": 950
+    },
+    {
+      "epoch": 11.428571428571429,
+      "grad_norm": 0.16908283531665802,
+      "learning_rate": 8.309197811881127e-05,
+      "loss": 0.0361,
+      "step": 960
+    },
+    {
+      "epoch": 11.547619047619047,
+      "grad_norm": 0.2518416941165924,
+      "learning_rate": 8.113696243105176e-05,
+      "loss": 0.0374,
+      "step": 970
+    },
+    {
+      "epoch": 11.666666666666666,
+      "grad_norm": 0.21874170005321503,
+      "learning_rate": 7.918939463485568e-05,
+      "loss": 0.0363,
+      "step": 980
+    },
+    {
+      "epoch": 11.785714285714286,
+      "grad_norm": 0.2551394999027252,
+      "learning_rate": 7.72500437089046e-05,
+      "loss": 0.0348,
+      "step": 990
+    },
+    {
+      "epoch": 11.904761904761905,
+      "grad_norm": 0.2511221170425415,
+      "learning_rate": 7.531967538752656e-05,
+      "loss": 0.0412,
+      "step": 1000
+    },
+    {
+      "epoch": 12.023809523809524,
+      "grad_norm": 0.24874921143054962,
+      "learning_rate": 7.33990518583535e-05,
+      "loss": 0.0369,
+      "step": 1010
+    },
+    {
+      "epoch": 12.142857142857142,
+      "grad_norm": 0.25549542903900146,
+      "learning_rate": 7.148893146137852e-05,
+      "loss": 0.0334,
+      "step": 1020
+    },
+    {
+      "epoch": 12.261904761904763,
+      "grad_norm": 0.25841251015663147,
+      "learning_rate": 6.95900683895325e-05,
+      "loss": 0.0378,
+      "step": 1030
+    },
+    {
+      "epoch": 12.380952380952381,
+      "grad_norm": 0.2499120533466339,
+      "learning_rate": 6.770321239089826e-05,
+      "loss": 0.0314,
+      "step": 1040
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 0.19628271460533142,
+      "learning_rate": 6.582910847267957e-05,
+      "loss": 0.0287,
+      "step": 1050
+    },
+    {
+      "epoch": 12.619047619047619,
+      "grad_norm": 0.18714749813079834,
+      "learning_rate": 6.396849660704205e-05,
+      "loss": 0.0302,
+      "step": 1060
+    },
+    {
+      "epoch": 12.738095238095237,
+      "grad_norm": 0.19048580527305603,
+      "learning_rate": 6.21221114389424e-05,
+      "loss": 0.0276,
+      "step": 1070
+    },
+    {
+      "epoch": 12.857142857142858,
+      "grad_norm": 0.21586266160011292,
+      "learning_rate": 6.0290681996060605e-05,
+      "loss": 0.0341,
+      "step": 1080
+    },
+    {
+      "epoch": 12.976190476190476,
+      "grad_norm": 0.22894449532032013,
+      "learning_rate": 5.847493140095029e-05,
+      "loss": 0.0315,
+      "step": 1090
+    },
+    {
+      "epoch": 13.095238095238095,
+      "grad_norm": 0.2722737491130829,
+      "learning_rate": 5.6675576585520786e-05,
+      "loss": 0.0288,
+      "step": 1100
+    },
+    {
+      "epoch": 13.214285714285714,
+      "grad_norm": 0.14149416983127594,
+      "learning_rate": 5.4893328007963094e-05,
+      "loss": 0.0302,
+      "step": 1110
+    },
+    {
+      "epoch": 13.333333333333334,
+      "grad_norm": 0.2547774314880371,
+      "learning_rate": 5.312888937223244e-05,
+      "loss": 0.0282,
+      "step": 1120
+    },
+    {
+      "epoch": 13.452380952380953,
+      "grad_norm": 0.1506737470626831,
+      "learning_rate": 5.1382957350197405e-05,
+      "loss": 0.0285,
+      "step": 1130
+    },
+    {
+      "epoch": 13.571428571428571,
+      "grad_norm": 0.19141684472560883,
+      "learning_rate": 4.965622130656551e-05,
+      "loss": 0.0323,
+      "step": 1140
+    },
+    {
+      "epoch": 13.69047619047619,
+      "grad_norm": 0.18162524700164795,
+      "learning_rate": 4.794936302669417e-05,
+      "loss": 0.0306,
+      "step": 1150
+    },
+    {
+      "epoch": 13.80952380952381,
+      "grad_norm": 0.20886124670505524,
+      "learning_rate": 4.6263056447394347e-05,
+      "loss": 0.0272,
+      "step": 1160
+    },
+    {
+      "epoch": 13.928571428571429,
+      "grad_norm": 0.14155210554599762,
+      "learning_rate": 4.459796739083274e-05,
+      "loss": 0.0299,
+      "step": 1170
+    },
+    {
+      "epoch": 14.047619047619047,
+      "grad_norm": 0.13192126154899597,
+      "learning_rate": 4.2954753301638315e-05,
+      "loss": 0.0337,
+      "step": 1180
+    },
+    {
+      "epoch": 14.166666666666666,
+      "grad_norm": 0.17799627780914307,
+      "learning_rate": 4.133406298731669e-05,
+      "loss": 0.0278,
+      "step": 1190
+    },
+    {
+      "epoch": 14.285714285714286,
+      "grad_norm": 0.1519002765417099,
+      "learning_rate": 3.973653636207437e-05,
+      "loss": 0.026,
+      "step": 1200
+    },
+    {
+      "epoch": 14.404761904761905,
+      "grad_norm": 0.24323873221874237,
+      "learning_rate": 3.8162804194154864e-05,
+      "loss": 0.0329,
+      "step": 1210
+    },
+    {
+      "epoch": 14.523809523809524,
+      "grad_norm": 0.25485578179359436,
+      "learning_rate": 3.661348785678574e-05,
+      "loss": 0.0282,
+      "step": 1220
+    },
+    {
+      "epoch": 14.642857142857142,
+      "grad_norm": 0.14652098715305328,
+      "learning_rate": 3.508919908283543e-05,
+      "loss": 0.0263,
+      "step": 1230
+    },
+    {
+      "epoch": 14.761904761904763,
+      "grad_norm": 0.26984453201293945,
+      "learning_rate": 3.3590539723276083e-05,
+      "loss": 0.029,
+      "step": 1240
+    },
+    {
+      "epoch": 14.880952380952381,
+      "grad_norm": 0.13968226313591003,
+      "learning_rate": 3.211810150954867e-05,
+      "loss": 0.0267,
+      "step": 1250
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.29029059410095215,
+      "learning_rate": 3.067246581992321e-05,
+      "loss": 0.0255,
+      "step": 1260
+    },
+    {
+      "epoch": 15.119047619047619,
+      "grad_norm": 0.16652598977088928,
+      "learning_rate": 2.925420344994719e-05,
+      "loss": 0.0268,
+      "step": 1270
+    },
+    {
+      "epoch": 15.238095238095237,
+      "grad_norm": 0.27046117186546326,
+      "learning_rate": 2.786387438707231e-05,
+      "loss": 0.0245,
+      "step": 1280
+    },
+    {
+      "epoch": 15.357142857142858,
+      "grad_norm": 0.22596995532512665,
+      "learning_rate": 2.6502027589548862e-05,
+      "loss": 0.0288,
+      "step": 1290
+    },
+    {
+      "epoch": 15.476190476190476,
+      "grad_norm": 0.16162142157554626,
+      "learning_rate": 2.516920076967455e-05,
+      "loss": 0.0263,
+      "step": 1300
+    },
+    {
+      "epoch": 15.595238095238095,
+      "grad_norm": 0.1669321060180664,
+      "learning_rate": 2.3865920181484123e-05,
+      "loss": 0.0284,
+      "step": 1310
+    },
+    {
+      "epoch": 15.714285714285714,
+      "grad_norm": 0.20296935737133026,
+      "learning_rate": 2.2592700412962777e-05,
+      "loss": 0.026,
+      "step": 1320
+    },
+    {
+      "epoch": 15.833333333333334,
+      "grad_norm": 0.17937733232975006,
+      "learning_rate": 2.1350044182866025e-05,
+      "loss": 0.0217,
+      "step": 1330
+    },
+    {
+      "epoch": 15.952380952380953,
+      "grad_norm": 0.14423789083957672,
+      "learning_rate": 2.0138442142226e-05,
+      "loss": 0.0289,
+      "step": 1340
+    },
+    {
+      "epoch": 16.071428571428573,
+      "grad_norm": 0.21349196135997772,
+      "learning_rate": 1.895837268062256e-05,
+      "loss": 0.029,
+      "step": 1350
+    },
+    {
+      "epoch": 16.19047619047619,
+      "grad_norm": 0.17763833701610565,
+      "learning_rate": 1.7810301737295588e-05,
+      "loss": 0.0234,
+      "step": 1360
+    },
+    {
+      "epoch": 16.30952380952381,
+      "grad_norm": 0.1566387116909027,
+      "learning_rate": 1.6694682617173452e-05,
+      "loss": 0.025,
+      "step": 1370
+    },
+    {
+      "epoch": 16.428571428571427,
+      "grad_norm": 0.19837665557861328,
+      "learning_rate": 1.5611955811889644e-05,
+      "loss": 0.0254,
+      "step": 1380
+    },
+    {
+      "epoch": 16.547619047619047,
+      "grad_norm": 0.17018267512321472,
+      "learning_rate": 1.456254882585909e-05,
+      "loss": 0.0212,
+      "step": 1390
+    },
+    {
+      "epoch": 16.666666666666668,
+      "grad_norm": 0.11024856567382812,
+      "learning_rate": 1.3546876007481845e-05,
+      "loss": 0.0208,
+      "step": 1400
+    },
+    {
+      "epoch": 16.785714285714285,
+      "grad_norm": 0.16558444499969482,
+      "learning_rate": 1.2565338385541792e-05,
+      "loss": 0.0234,
+      "step": 1410
+    },
+    {
+      "epoch": 16.904761904761905,
+      "grad_norm": 0.14750836789608002,
+      "learning_rate": 1.161832351086396e-05,
+      "loss": 0.0234,
+      "step": 1420
+    },
+    {
+      "epoch": 17.023809523809526,
+      "grad_norm": 0.14175614714622498,
+      "learning_rate": 1.0706205303294026e-05,
+      "loss": 0.0233,
+      "step": 1430
+    },
+    {
+      "epoch": 17.142857142857142,
+      "grad_norm": 0.128941610455513,
+      "learning_rate": 9.82934390405934e-06,
+      "loss": 0.0228,
+      "step": 1440
+    },
+    {
+      "epoch": 17.261904761904763,
+      "grad_norm": 0.17659620940685272,
+      "learning_rate": 8.988085533570833e-06,
+      "loss": 0.0244,
+      "step": 1450
+    },
+    {
+      "epoch": 17.38095238095238,
+      "grad_norm": 0.1548050194978714,
+      "learning_rate": 8.182762354720985e-06,
+      "loss": 0.0203,
+      "step": 1460
+    },
+    {
+      "epoch": 17.5,
+      "grad_norm": 0.16272428631782532,
+      "learning_rate": 7.413692341732581e-06,
+      "loss": 0.0285,
+      "step": 1470
+    },
+    {
+      "epoch": 17.61904761904762,
+      "grad_norm": 0.22217531502246857,
+      "learning_rate": 6.681179154609462e-06,
+      "loss": 0.0195,
+      "step": 1480
+    },
+    {
+      "epoch": 17.738095238095237,
+      "grad_norm": 0.16655217111110687,
+      "learning_rate": 5.985512019239392e-06,
+      "loss": 0.0236,
+      "step": 1490
+    },
+    {
+      "epoch": 17.857142857142858,
+      "grad_norm": 0.19496409595012665,
+      "learning_rate": 5.326965613195867e-06,
+      "loss": 0.0246,
+      "step": 1500
+    },
+    {
+      "epoch": 17.976190476190474,
+      "grad_norm": 0.18502101302146912,
+      "learning_rate": 4.705799957284351e-06,
+      "loss": 0.0209,
+      "step": 1510
+    },
+    {
+      "epoch": 18.095238095238095,
+      "grad_norm": 0.20573672652244568,
+      "learning_rate": 4.122260312875437e-06,
+      "loss": 0.023,
+      "step": 1520
+    },
+    {
+      "epoch": 18.214285714285715,
+      "grad_norm": 0.18070857226848602,
+      "learning_rate": 3.576577085065824e-06,
+      "loss": 0.0246,
+      "step": 1530
+    },
+    {
+      "epoch": 18.333333333333332,
+      "grad_norm": 0.15732447803020477,
+      "learning_rate": 3.0689657317049204e-06,
+      "loss": 0.0232,
+      "step": 1540
+    },
+    {
+      "epoch": 18.452380952380953,
+      "grad_norm": 0.18765419721603394,
+      "learning_rate": 2.5996266783235078e-06,
+      "loss": 0.0238,
+      "step": 1550
+    },
+    {
+      "epoch": 18.571428571428573,
+      "grad_norm": 0.12155826389789581,
+      "learning_rate": 2.1687452389974826e-06,
+      "loss": 0.0223,
+      "step": 1560
+    },
+    {
+      "epoch": 18.69047619047619,
+      "grad_norm": 0.12302082777023315,
+      "learning_rate": 1.7764915431784378e-06,
+      "loss": 0.0267,
+      "step": 1570
+    },
+    {
+      "epoch": 18.80952380952381,
+      "grad_norm": 0.1826854795217514,
+      "learning_rate": 1.4230204685196203e-06,
+      "loss": 0.0238,
+      "step": 1580
+    },
+    {
+      "epoch": 18.928571428571427,
+      "grad_norm": 0.1735326200723648,
+      "learning_rate": 1.1084715797239798e-06,
+      "loss": 0.0222,
+      "step": 1590
+    },
+    {
+      "epoch": 19.047619047619047,
+      "grad_norm": 0.17233559489250183,
+      "learning_rate": 8.329690734383277e-07,
+      "loss": 0.0245,
+      "step": 1600
+    },
+    {
+      "epoch": 19.166666666666668,
+      "grad_norm": 0.14161868393421173,
+      "learning_rate": 5.966217292155296e-07,
+      "loss": 0.0212,
+      "step": 1610
+    },
+    {
+      "epoch": 19.285714285714285,
+      "grad_norm": 0.16612544655799866,
+      "learning_rate": 3.99522866563895e-07,
+      "loss": 0.0245,
+      "step": 1620
+    },
+    {
+      "epoch": 19.404761904761905,
+      "grad_norm": 0.08409969508647919,
+      "learning_rate": 2.417503081008632e-07,
+      "loss": 0.0213,
+      "step": 1630
+    },
+    {
+      "epoch": 19.523809523809526,
+      "grad_norm": 0.17151199281215668,
+      "learning_rate": 1.2336634882544884e-07,
+      "loss": 0.0176,
+      "step": 1640
+    },
+    {
+      "epoch": 19.642857142857142,
+      "grad_norm": 0.13334044814109802,
+      "learning_rate": 4.4417731521717574e-08,
+      "loss": 0.0263,
+      "step": 1650
+    },
+    {
+      "epoch": 19.761904761904763,
+      "grad_norm": 0.07808782160282135,
+      "learning_rate": 4.935628302760176e-09,
+      "loss": 0.0211,
+      "step": 1660
+    },
+    {
+      "epoch": 19.821428571428573,
+      "step": 1665,
+      "total_flos": 2.314379607040872e+17,
+      "train_loss": 0.06144854805401496,
+      "train_runtime": 1922.2885,
+      "train_samples_per_second": 55.434,
+      "train_steps_per_second": 0.866
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1665,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.314379607040872e+17,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}