phospho-app
/

GetTheRubber-qn7utf0wwa

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.988603988603988,
+  "eval_steps": 500,
+  "global_step": 1751,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02849002849002849,
+      "grad_norm": 4.866657257080078,
+      "learning_rate": 2.272727272727273e-05,
+      "loss": 1.0817,
+      "step": 10
+    },
+    {
+      "epoch": 0.05698005698005698,
+      "grad_norm": 2.505408525466919,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 0.553,
+      "step": 20
+    },
+    {
+      "epoch": 0.08547008547008547,
+      "grad_norm": 1.517915964126587,
+      "learning_rate": 6.818181818181818e-05,
+      "loss": 0.3016,
+      "step": 30
+    },
+    {
+      "epoch": 0.11396011396011396,
+      "grad_norm": 1.582049012184143,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 0.2392,
+      "step": 40
+    },
+    {
+      "epoch": 0.14245014245014245,
+      "grad_norm": 1.7641891241073608,
+      "learning_rate": 0.00011363636363636365,
+      "loss": 0.2087,
+      "step": 50
+    },
+    {
+      "epoch": 0.17094017094017094,
+      "grad_norm": 1.5408800840377808,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 0.2057,
+      "step": 60
+    },
+    {
+      "epoch": 0.19943019943019943,
+      "grad_norm": 1.1511545181274414,
+      "learning_rate": 0.0001590909090909091,
+      "loss": 0.1738,
+      "step": 70
+    },
+    {
+      "epoch": 0.22792022792022792,
+      "grad_norm": 1.0401157140731812,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 0.1387,
+      "step": 80
+    },
+    {
+      "epoch": 0.2564102564102564,
+      "grad_norm": 0.794109582901001,
+      "learning_rate": 0.0001999992862522931,
+      "loss": 0.1362,
+      "step": 90
+    },
+    {
+      "epoch": 0.2849002849002849,
+      "grad_norm": 1.0705766677856445,
+      "learning_rate": 0.00019997430615234976,
+      "loss": 0.1311,
+      "step": 100
+    },
+    {
+      "epoch": 0.31339031339031337,
+      "grad_norm": 0.7216269373893738,
+      "learning_rate": 0.0001999136488551224,
+      "loss": 0.1267,
+      "step": 110
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 0.9830422401428223,
+      "learning_rate": 0.00019981733600699645,
+      "loss": 0.1186,
+      "step": 120
+    },
+    {
+      "epoch": 0.37037037037037035,
+      "grad_norm": 0.8986383676528931,
+      "learning_rate": 0.00019968540197852787,
+      "loss": 0.1082,
+      "step": 130
+    },
+    {
+      "epoch": 0.39886039886039887,
+      "grad_norm": 0.9030706286430359,
+      "learning_rate": 0.00019951789385217757,
+      "loss": 0.1081,
+      "step": 140
+    },
+    {
+      "epoch": 0.42735042735042733,
+      "grad_norm": 0.8084211349487305,
+      "learning_rate": 0.00019931487140550935,
+      "loss": 0.1159,
+      "step": 150
+    },
+    {
+      "epoch": 0.45584045584045585,
+      "grad_norm": 0.4105013310909271,
+      "learning_rate": 0.00019907640708985766,
+      "loss": 0.1002,
+      "step": 160
+    },
+    {
+      "epoch": 0.4843304843304843,
+      "grad_norm": 0.6791099905967712,
+      "learning_rate": 0.0001988025860044721,
+      "loss": 0.0992,
+      "step": 170
+    },
+    {
+      "epoch": 0.5128205128205128,
+      "grad_norm": 0.611254870891571,
+      "learning_rate": 0.00019849350586614866,
+      "loss": 0.0741,
+      "step": 180
+    },
+    {
+      "epoch": 0.5413105413105413,
+      "grad_norm": 0.49187201261520386,
+      "learning_rate": 0.00019814927697435827,
+      "loss": 0.0913,
+      "step": 190
+    },
+    {
+      "epoch": 0.5698005698005698,
+      "grad_norm": 0.9148091077804565,
+      "learning_rate": 0.00019777002217188482,
+      "loss": 0.0858,
+      "step": 200
+    },
+    {
+      "epoch": 0.5982905982905983,
+      "grad_norm": 0.6754726767539978,
+      "learning_rate": 0.000197355876800987,
+      "loss": 0.0878,
+      "step": 210
+    },
+    {
+      "epoch": 0.6267806267806267,
+      "grad_norm": 0.5443580150604248,
+      "learning_rate": 0.00019690698865509966,
+      "loss": 0.0816,
+      "step": 220
+    },
+    {
+      "epoch": 0.6552706552706553,
+      "grad_norm": 0.5719950795173645,
+      "learning_rate": 0.00019642351792609165,
+      "loss": 0.0818,
+      "step": 230
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 0.570486843585968,
+      "learning_rate": 0.00019590563714709918,
+      "loss": 0.089,
+      "step": 240
+    },
+    {
+      "epoch": 0.7122507122507122,
+      "grad_norm": 0.5645405054092407,
+      "learning_rate": 0.00019535353113095494,
+      "loss": 0.0748,
+      "step": 250
+    },
+    {
+      "epoch": 0.7407407407407407,
+      "grad_norm": 0.4007212817668915,
+      "learning_rate": 0.00019476739690423532,
+      "loss": 0.0735,
+      "step": 260
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 0.5221670269966125,
+      "learning_rate": 0.00019414744363694845,
+      "loss": 0.0702,
+      "step": 270
+    },
+    {
+      "epoch": 0.7977207977207977,
+      "grad_norm": 0.4181097447872162,
+      "learning_rate": 0.00019349389256788943,
+      "loss": 0.0927,
+      "step": 280
+    },
+    {
+      "epoch": 0.8262108262108262,
+      "grad_norm": 0.42916539311408997,
+      "learning_rate": 0.0001928069769256879,
+      "loss": 0.0662,
+      "step": 290
+    },
+    {
+      "epoch": 0.8547008547008547,
+      "grad_norm": 0.4586508572101593,
+      "learning_rate": 0.00019208694184557736,
+      "loss": 0.0757,
+      "step": 300
+    },
+    {
+      "epoch": 0.8831908831908832,
+      "grad_norm": 0.49600309133529663,
+      "learning_rate": 0.00019133404428191533,
+      "loss": 0.0641,
+      "step": 310
+    },
+    {
+      "epoch": 0.9116809116809117,
+      "grad_norm": 0.5553969740867615,
+      "learning_rate": 0.00019054855291648562,
+      "loss": 0.0733,
+      "step": 320
+    },
+    {
+      "epoch": 0.9401709401709402,
+      "grad_norm": 0.508593738079071,
+      "learning_rate": 0.00018973074806261558,
+      "loss": 0.0747,
+      "step": 330
+    },
+    {
+      "epoch": 0.9686609686609686,
+      "grad_norm": 0.5126791000366211,
+      "learning_rate": 0.00018888092156514255,
+      "loss": 0.0563,
+      "step": 340
+    },
+    {
+      "epoch": 0.9971509971509972,
+      "grad_norm": 0.6333485841751099,
+      "learning_rate": 0.00018799937669626484,
+      "loss": 0.059,
+      "step": 350
+    },
+    {
+      "epoch": 1.0256410256410255,
+      "grad_norm": 0.5621979236602783,
+      "learning_rate": 0.00018708642804731517,
+      "loss": 0.0533,
+      "step": 360
+    },
+    {
+      "epoch": 1.0541310541310542,
+      "grad_norm": 0.25756293535232544,
+      "learning_rate": 0.0001861424014164941,
+      "loss": 0.0511,
+      "step": 370
+    },
+    {
+      "epoch": 1.0826210826210827,
+      "grad_norm": 0.4778578281402588,
+      "learning_rate": 0.00018516763369260493,
+      "loss": 0.061,
+      "step": 380
+    },
+    {
+      "epoch": 1.1111111111111112,
+      "grad_norm": 0.4505862295627594,
+      "learning_rate": 0.00018416247273482988,
+      "loss": 0.0707,
+      "step": 390
+    },
+    {
+      "epoch": 1.1396011396011396,
+      "grad_norm": 0.4163416624069214,
+      "learning_rate": 0.0001831272772485922,
+      "loss": 0.0514,
+      "step": 400
+    },
+    {
+      "epoch": 1.168091168091168,
+      "grad_norm": 0.44356927275657654,
+      "learning_rate": 0.00018206241665754688,
+      "loss": 0.057,
+      "step": 410
+    },
+    {
+      "epoch": 1.1965811965811965,
+      "grad_norm": 0.2967655658721924,
+      "learning_rate": 0.000180968270971747,
+      "loss": 0.0641,
+      "step": 420
+    },
+    {
+      "epoch": 1.225071225071225,
+      "grad_norm": 0.47910967469215393,
+      "learning_rate": 0.00017984523065203188,
+      "loss": 0.0618,
+      "step": 430
+    },
+    {
+      "epoch": 1.2535612535612537,
+      "grad_norm": 0.3292757570743561,
+      "learning_rate": 0.0001786936964706858,
+      "loss": 0.0517,
+      "step": 440
+    },
+    {
+      "epoch": 1.282051282051282,
+      "grad_norm": 0.37650346755981445,
+      "learning_rate": 0.00017751407936841688,
+      "loss": 0.0589,
+      "step": 450
+    },
+    {
+      "epoch": 1.3105413105413106,
+      "grad_norm": 0.4217515289783478,
+      "learning_rate": 0.00017630680030770735,
+      "loss": 0.0564,
+      "step": 460
+    },
+    {
+      "epoch": 1.339031339031339,
+      "grad_norm": 0.3700660467147827,
+      "learning_rate": 0.00017507229012258732,
+      "loss": 0.0588,
+      "step": 470
+    },
+    {
+      "epoch": 1.3675213675213675,
+      "grad_norm": 0.2905580401420593,
+      "learning_rate": 0.00017381098936488574,
+      "loss": 0.0464,
+      "step": 480
+    },
+    {
+      "epoch": 1.396011396011396,
+      "grad_norm": 0.39054426550865173,
+      "learning_rate": 0.00017252334814701353,
+      "loss": 0.0578,
+      "step": 490
+    },
+    {
+      "epoch": 1.4245014245014245,
+      "grad_norm": 0.3755801320075989,
+      "learning_rate": 0.00017120982598133456,
+      "loss": 0.0455,
+      "step": 500
+    },
+    {
+      "epoch": 1.452991452991453,
+      "grad_norm": 0.3761601448059082,
+      "learning_rate": 0.0001698708916161829,
+      "loss": 0.0471,
+      "step": 510
+    },
+    {
+      "epoch": 1.4814814814814814,
+      "grad_norm": 0.3036906123161316,
+      "learning_rate": 0.000168507022868583,
+      "loss": 0.0563,
+      "step": 520
+    },
+    {
+      "epoch": 1.50997150997151,
+      "grad_norm": 0.39106476306915283,
+      "learning_rate": 0.0001671187064537345,
+      "loss": 0.0587,
+      "step": 530
+    },
+    {
+      "epoch": 1.5384615384615383,
+      "grad_norm": 0.6104059219360352,
+      "learning_rate": 0.00016570643781132122,
+      "loss": 0.0612,
+      "step": 540
+    },
+    {
+      "epoch": 1.566951566951567,
+      "grad_norm": 0.3543621003627777,
+      "learning_rate": 0.00016427072092870651,
+      "loss": 0.0507,
+      "step": 550
+    },
+    {
+      "epoch": 1.5954415954415955,
+      "grad_norm": 0.4155671000480652,
+      "learning_rate": 0.0001628120681610789,
+      "loss": 0.0545,
+      "step": 560
+    },
+    {
+      "epoch": 1.623931623931624,
+      "grad_norm": 0.318392276763916,
+      "learning_rate": 0.00016133100004861082,
+      "loss": 0.0563,
+      "step": 570
+    },
+    {
+      "epoch": 1.6524216524216524,
+      "grad_norm": 0.34228989481925964,
+      "learning_rate": 0.00015982804513069664,
+      "loss": 0.0629,
+      "step": 580
+    },
+    {
+      "epoch": 1.6809116809116809,
+      "grad_norm": 0.3822249472141266,
+      "learning_rate": 0.0001583037397573366,
+      "loss": 0.0473,
+      "step": 590
+    },
+    {
+      "epoch": 1.7094017094017095,
+      "grad_norm": 0.3222091794013977,
+      "learning_rate": 0.00015675862789773243,
+      "loss": 0.0465,
+      "step": 600
+    },
+    {
+      "epoch": 1.7378917378917378,
+      "grad_norm": 0.4308369755744934,
+      "learning_rate": 0.00015519326094616507,
+      "loss": 0.0538,
+      "step": 610
+    },
+    {
+      "epoch": 1.7663817663817665,
+      "grad_norm": 0.33870652318000793,
+      "learning_rate": 0.00015360819752522164,
+      "loss": 0.0575,
+      "step": 620
+    },
+    {
+      "epoch": 1.7948717948717947,
+      "grad_norm": 0.2933538258075714,
+      "learning_rate": 0.00015200400328644356,
+      "loss": 0.0411,
+      "step": 630
+    },
+    {
+      "epoch": 1.8233618233618234,
+      "grad_norm": 0.39347216486930847,
+      "learning_rate": 0.000150381250708466,
+      "loss": 0.042,
+      "step": 640
+    },
+    {
+      "epoch": 1.8518518518518519,
+      "grad_norm": 0.2737712562084198,
+      "learning_rate": 0.00014874051889272107,
+      "loss": 0.0497,
+      "step": 650
+    },
+    {
+      "epoch": 1.8803418803418803,
+      "grad_norm": 0.23161457479000092,
+      "learning_rate": 0.0001470823933567776,
+      "loss": 0.044,
+      "step": 660
+    },
+    {
+      "epoch": 1.9088319088319088,
+      "grad_norm": 0.25970926880836487,
+      "learning_rate": 0.00014540746582539108,
+      "loss": 0.04,
+      "step": 670
+    },
+    {
+      "epoch": 1.9373219373219372,
+      "grad_norm": 0.33561354875564575,
+      "learning_rate": 0.00014371633401933872,
+      "loss": 0.0495,
+      "step": 680
+    },
+    {
+      "epoch": 1.965811965811966,
+      "grad_norm": 0.2616577744483948,
+      "learning_rate": 0.0001420096014421146,
+      "loss": 0.0411,
+      "step": 690
+    },
+    {
+      "epoch": 1.9943019943019942,
+      "grad_norm": 0.3040335774421692,
+      "learning_rate": 0.0001402878771645611,
+      "loss": 0.0418,
+      "step": 700
+    },
+    {
+      "epoch": 2.022792022792023,
+      "grad_norm": 0.4659029245376587,
+      "learning_rate": 0.00013855177560751376,
+      "loss": 0.0465,
+      "step": 710
+    },
+    {
+      "epoch": 2.051282051282051,
+      "grad_norm": 0.36660319566726685,
+      "learning_rate": 0.00013680191632253682,
+      "loss": 0.0441,
+      "step": 720
+    },
+    {
+      "epoch": 2.07977207977208,
+      "grad_norm": 0.3704872131347656,
+      "learning_rate": 0.00013503892377082763,
+      "loss": 0.0578,
+      "step": 730
+    },
+    {
+      "epoch": 2.1082621082621085,
+      "grad_norm": 0.26848968863487244,
+      "learning_rate": 0.00013326342710036934,
+      "loss": 0.0493,
+      "step": 740
+    },
+    {
+      "epoch": 2.1367521367521367,
+      "grad_norm": 0.23576417565345764,
+      "learning_rate": 0.00013147605992141066,
+      "loss": 0.0389,
+      "step": 750
+    },
+    {
+      "epoch": 2.1652421652421654,
+      "grad_norm": 0.31747928261756897,
+      "learning_rate": 0.00012967746008035348,
+      "loss": 0.0439,
+      "step": 760
+    },
+    {
+      "epoch": 2.1937321937321936,
+      "grad_norm": 0.41468170285224915,
+      "learning_rate": 0.00012786826943212874,
+      "loss": 0.0432,
+      "step": 770
+    },
+    {
+      "epoch": 2.2222222222222223,
+      "grad_norm": 0.27693742513656616,
+      "learning_rate": 0.00012604913361114148,
+      "loss": 0.0433,
+      "step": 780
+    },
+    {
+      "epoch": 2.2507122507122506,
+      "grad_norm": 0.31801000237464905,
+      "learning_rate": 0.00012422070180086772,
+      "loss": 0.0312,
+      "step": 790
+    },
+    {
+      "epoch": 2.2792022792022792,
+      "grad_norm": 0.5060178637504578,
+      "learning_rate": 0.0001223836265021841,
+      "loss": 0.0474,
+      "step": 800
+    },
+    {
+      "epoch": 2.3076923076923075,
+      "grad_norm": 0.4824029803276062,
+      "learning_rate": 0.0001205385633005144,
+      "loss": 0.0354,
+      "step": 810
+    },
+    {
+      "epoch": 2.336182336182336,
+      "grad_norm": 0.36395755410194397,
+      "learning_rate": 0.00011868617063187462,
+      "loss": 0.0485,
+      "step": 820
+    },
+    {
+      "epoch": 2.364672364672365,
+      "grad_norm": 0.43524518609046936,
+      "learning_rate": 0.0001168271095479012,
+      "loss": 0.0433,
+      "step": 830
+    },
+    {
+      "epoch": 2.393162393162393,
+      "grad_norm": 0.24890539050102234,
+      "learning_rate": 0.0001149620434799457,
+      "loss": 0.0396,
+      "step": 840
+    },
+    {
+      "epoch": 2.421652421652422,
+      "grad_norm": 0.2868204712867737,
+      "learning_rate": 0.00011309163800232039,
+      "loss": 0.0428,
+      "step": 850
+    },
+    {
+      "epoch": 2.45014245014245,
+      "grad_norm": 0.30886110663414,
+      "learning_rate": 0.00011121656059477874,
+      "loss": 0.048,
+      "step": 860
+    },
+    {
+      "epoch": 2.4786324786324787,
+      "grad_norm": 0.33962446451187134,
+      "learning_rate": 0.00010933748040431626,
+      "loss": 0.038,
+      "step": 870
+    },
+    {
+      "epoch": 2.5071225071225074,
+      "grad_norm": 0.414177268743515,
+      "learning_rate": 0.00010745506800637624,
+      "loss": 0.0376,
+      "step": 880
+    },
+    {
+      "epoch": 2.5356125356125356,
+      "grad_norm": 0.4189615547657013,
+      "learning_rate": 0.00010556999516554583,
+      "loss": 0.0406,
+      "step": 890
+    },
+    {
+      "epoch": 2.564102564102564,
+      "grad_norm": 0.32983464002609253,
+      "learning_rate": 0.00010368293459582743,
+      "loss": 0.0387,
+      "step": 900
+    },
+    {
+      "epoch": 2.5925925925925926,
+      "grad_norm": 0.37069281935691833,
+      "learning_rate": 0.00010179455972057182,
+      "loss": 0.0382,
+      "step": 910
+    },
+    {
+      "epoch": 2.6210826210826212,
+      "grad_norm": 0.28479892015457153,
+      "learning_rate": 9.990554443215748e-05,
+      "loss": 0.0443,
+      "step": 920
+    },
+    {
+      "epoch": 2.6495726495726495,
+      "grad_norm": 0.31271642446517944,
+      "learning_rate": 9.80165628515033e-05,
+      "loss": 0.0479,
+      "step": 930
+    },
+    {
+      "epoch": 2.678062678062678,
+      "grad_norm": 0.17170262336730957,
+      "learning_rate": 9.612828908749888e-05,
+      "loss": 0.035,
+      "step": 940
+    },
+    {
+      "epoch": 2.7065527065527064,
+      "grad_norm": 0.33237120509147644,
+      "learning_rate": 9.424139699643992e-05,
+      "loss": 0.033,
+      "step": 950
+    },
+    {
+      "epoch": 2.735042735042735,
+      "grad_norm": 0.2521449625492096,
+      "learning_rate": 9.235655994155326e-05,
+      "loss": 0.0399,
+      "step": 960
+    },
+    {
+      "epoch": 2.763532763532764,
+      "grad_norm": 0.20456038415431976,
+      "learning_rate": 9.047445055269784e-05,
+      "loss": 0.0299,
+      "step": 970
+    },
+    {
+      "epoch": 2.792022792022792,
+      "grad_norm": 0.2347419261932373,
+      "learning_rate": 8.859574048632786e-05,
+      "loss": 0.0344,
+      "step": 980
+    },
+    {
+      "epoch": 2.8205128205128203,
+      "grad_norm": 0.2831404507160187,
+      "learning_rate": 8.672110018580282e-05,
+      "loss": 0.0346,
+      "step": 990
+    },
+    {
+      "epoch": 2.849002849002849,
+      "grad_norm": 0.2277534157037735,
+      "learning_rate": 8.485119864213058e-05,
+      "loss": 0.0399,
+      "step": 1000
+    },
+    {
+      "epoch": 2.8774928774928776,
+      "grad_norm": 0.27118125557899475,
+      "learning_rate": 8.298670315522894e-05,
+      "loss": 0.0455,
+      "step": 1010
+    },
+    {
+      "epoch": 2.905982905982906,
+      "grad_norm": 0.21337424218654633,
+      "learning_rate": 8.112827909579045e-05,
+      "loss": 0.025,
+      "step": 1020
+    },
+    {
+      "epoch": 2.9344729344729346,
+      "grad_norm": 0.28749632835388184,
+      "learning_rate": 7.927658966783576e-05,
+      "loss": 0.0398,
+      "step": 1030
+    },
+    {
+      "epoch": 2.962962962962963,
+      "grad_norm": 0.26412132382392883,
+      "learning_rate": 7.743229567204036e-05,
+      "loss": 0.029,
+      "step": 1040
+    },
+    {
+      "epoch": 2.9914529914529915,
+      "grad_norm": 0.2315894365310669,
+      "learning_rate": 7.559605526991859e-05,
+      "loss": 0.0305,
+      "step": 1050
+    },
+    {
+      "epoch": 3.0199430199430197,
+      "grad_norm": 0.22662468254566193,
+      "learning_rate": 7.376852374894989e-05,
+      "loss": 0.0405,
+      "step": 1060
+    },
+    {
+      "epoch": 3.0484330484330484,
+      "grad_norm": 0.19257226586341858,
+      "learning_rate": 7.195035328873048e-05,
+      "loss": 0.0266,
+      "step": 1070
+    },
+    {
+      "epoch": 3.076923076923077,
+      "grad_norm": 0.3498987555503845,
+      "learning_rate": 7.014219272823407e-05,
+      "loss": 0.0335,
+      "step": 1080
+    },
+    {
+      "epoch": 3.1054131054131053,
+      "grad_norm": 0.20060497522354126,
+      "learning_rate": 6.834468733426498e-05,
+      "loss": 0.0273,
+      "step": 1090
+    },
+    {
+      "epoch": 3.133903133903134,
+      "grad_norm": 0.23581984639167786,
+      "learning_rate": 6.65584785711856e-05,
+      "loss": 0.0393,
+      "step": 1100
+    },
+    {
+      "epoch": 3.1623931623931623,
+      "grad_norm": 0.35779276490211487,
+      "learning_rate": 6.478420387200135e-05,
+      "loss": 0.0281,
+      "step": 1110
+    },
+    {
+      "epoch": 3.190883190883191,
+      "grad_norm": 0.46333932876586914,
+      "learning_rate": 6.302249641088384e-05,
+      "loss": 0.0319,
+      "step": 1120
+    },
+    {
+      "epoch": 3.219373219373219,
+      "grad_norm": 0.2620326280593872,
+      "learning_rate": 6.127398487721398e-05,
+      "loss": 0.0386,
+      "step": 1130
+    },
+    {
+      "epoch": 3.247863247863248,
+      "grad_norm": 0.28111764788627625,
+      "learning_rate": 5.953929325122578e-05,
+      "loss": 0.0248,
+      "step": 1140
+    },
+    {
+      "epoch": 3.2763532763532766,
+      "grad_norm": 0.2124904990196228,
+      "learning_rate": 5.781904058133016e-05,
+      "loss": 0.0359,
+      "step": 1150
+    },
+    {
+      "epoch": 3.304843304843305,
+      "grad_norm": 0.15361037850379944,
+      "learning_rate": 5.611384076319944e-05,
+      "loss": 0.0235,
+      "step": 1160
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 0.2945825159549713,
+      "learning_rate": 5.442430232069e-05,
+      "loss": 0.0316,
+      "step": 1170
+    },
+    {
+      "epoch": 3.3618233618233617,
+      "grad_norm": 0.2039274275302887,
+      "learning_rate": 5.2751028188682315e-05,
+      "loss": 0.0301,
+      "step": 1180
+    },
+    {
+      "epoch": 3.3903133903133904,
+      "grad_norm": 0.18716038763523102,
+      "learning_rate": 5.109461549791541e-05,
+      "loss": 0.0335,
+      "step": 1190
+    },
+    {
+      "epoch": 3.4188034188034186,
+      "grad_norm": 0.25252339243888855,
+      "learning_rate": 4.9455655361892663e-05,
+      "loss": 0.0295,
+      "step": 1200
+    },
+    {
+      "epoch": 3.4472934472934473,
+      "grad_norm": 0.3167603313922882,
+      "learning_rate": 4.783473266593471e-05,
+      "loss": 0.0365,
+      "step": 1210
+    },
+    {
+      "epoch": 3.4757834757834756,
+      "grad_norm": 0.3018173575401306,
+      "learning_rate": 4.62324258584553e-05,
+      "loss": 0.0361,
+      "step": 1220
+    },
+    {
+      "epoch": 3.5042735042735043,
+      "grad_norm": 0.3451387882232666,
+      "learning_rate": 4.464930674453393e-05,
+      "loss": 0.0321,
+      "step": 1230
+    },
+    {
+      "epoch": 3.532763532763533,
+      "grad_norm": 0.22405466437339783,
+      "learning_rate": 4.308594028185944e-05,
+      "loss": 0.0238,
+      "step": 1240
+    },
+    {
+      "epoch": 3.561253561253561,
+      "grad_norm": 0.37276986241340637,
+      "learning_rate": 4.154288437911732e-05,
+      "loss": 0.0341,
+      "step": 1250
+    },
+    {
+      "epoch": 3.58974358974359,
+      "grad_norm": 0.3159347474575043,
+      "learning_rate": 4.0020689696891944e-05,
+      "loss": 0.0227,
+      "step": 1260
+    },
+    {
+      "epoch": 3.618233618233618,
+      "grad_norm": 0.2659304738044739,
+      "learning_rate": 3.8519899451156325e-05,
+      "loss": 0.0239,
+      "step": 1270
+    },
+    {
+      "epoch": 3.646723646723647,
+      "grad_norm": 0.3740275502204895,
+      "learning_rate": 3.704104921941768e-05,
+      "loss": 0.0309,
+      "step": 1280
+    },
+    {
+      "epoch": 3.6752136752136755,
+      "grad_norm": 0.29278329014778137,
+      "learning_rate": 3.558466674958947e-05,
+      "loss": 0.0313,
+      "step": 1290
+    },
+    {
+      "epoch": 3.7037037037037037,
+      "grad_norm": 0.30194053053855896,
+      "learning_rate": 3.415127177165729e-05,
+      "loss": 0.0396,
+      "step": 1300
+    },
+    {
+      "epoch": 3.732193732193732,
+      "grad_norm": 0.1718418151140213,
+      "learning_rate": 3.274137581220614e-05,
+      "loss": 0.0225,
+      "step": 1310
+    },
+    {
+      "epoch": 3.7606837606837606,
+      "grad_norm": 0.22094787657260895,
+      "learning_rate": 3.135548201187514e-05,
+      "loss": 0.0225,
+      "step": 1320
+    },
+    {
+      "epoch": 3.7891737891737893,
+      "grad_norm": 0.19169935584068298,
+      "learning_rate": 2.9994084945805047e-05,
+      "loss": 0.0341,
+      "step": 1330
+    },
+    {
+      "epoch": 3.8176638176638176,
+      "grad_norm": 0.33710628747940063,
+      "learning_rate": 2.865767044714205e-05,
+      "loss": 0.0378,
+      "step": 1340
+    },
+    {
+      "epoch": 3.8461538461538463,
+      "grad_norm": 0.22694231569766998,
+      "learning_rate": 2.7346715433661985e-05,
+      "loss": 0.0299,
+      "step": 1350
+    },
+    {
+      "epoch": 3.8746438746438745,
+      "grad_norm": 0.4005008339881897,
+      "learning_rate": 2.6061687737575378e-05,
+      "loss": 0.029,
+      "step": 1360
+    },
+    {
+      "epoch": 3.903133903133903,
+      "grad_norm": 0.25156232714653015,
+      "learning_rate": 2.4803045938575287e-05,
+      "loss": 0.0242,
+      "step": 1370
+    },
+    {
+      "epoch": 3.931623931623932,
+      "grad_norm": 0.30339252948760986,
+      "learning_rate": 2.3571239200186844e-05,
+      "loss": 0.03,
+      "step": 1380
+    },
+    {
+      "epoch": 3.96011396011396,
+      "grad_norm": 0.2436976432800293,
+      "learning_rate": 2.236670710947707e-05,
+      "loss": 0.0292,
+      "step": 1390
+    },
+    {
+      "epoch": 3.9886039886039883,
+      "grad_norm": 0.2527185082435608,
+      "learning_rate": 2.1189879520182267e-05,
+      "loss": 0.0269,
+      "step": 1400
+    },
+    {
+      "epoch": 4.017094017094017,
+      "grad_norm": 0.32539618015289307,
+      "learning_rate": 2.0041176399308924e-05,
+      "loss": 0.0463,
+      "step": 1410
+    },
+    {
+      "epoch": 4.045584045584046,
+      "grad_norm": 0.28222835063934326,
+      "learning_rate": 1.892100767726247e-05,
+      "loss": 0.0275,
+      "step": 1420
+    },
+    {
+      "epoch": 4.074074074074074,
+      "grad_norm": 0.30946633219718933,
+      "learning_rate": 1.7829773101558365e-05,
+      "loss": 0.0261,
+      "step": 1430
+    },
+    {
+      "epoch": 4.102564102564102,
+      "grad_norm": 0.1978222280740738,
+      "learning_rate": 1.6767862094166498e-05,
+      "loss": 0.0226,
+      "step": 1440
+    },
+    {
+      "epoch": 4.131054131054131,
+      "grad_norm": 0.14639084041118622,
+      "learning_rate": 1.573565361254069e-05,
+      "loss": 0.0238,
+      "step": 1450
+    },
+    {
+      "epoch": 4.15954415954416,
+      "grad_norm": 0.25420257449150085,
+      "learning_rate": 1.4733516014382586e-05,
+      "loss": 0.0298,
+      "step": 1460
+    },
+    {
+      "epoch": 4.188034188034188,
+      "grad_norm": 0.281435489654541,
+      "learning_rate": 1.3761806926188148e-05,
+      "loss": 0.0197,
+      "step": 1470
+    },
+    {
+      "epoch": 4.216524216524217,
+      "grad_norm": 0.26053982973098755,
+      "learning_rate": 1.2820873115623722e-05,
+      "loss": 0.0229,
+      "step": 1480
+    },
+    {
+      "epoch": 4.245014245014245,
+      "grad_norm": 0.21949049830436707,
+      "learning_rate": 1.1911050367777388e-05,
+      "loss": 0.0205,
+      "step": 1490
+    },
+    {
+      "epoch": 4.273504273504273,
+      "grad_norm": 0.3151997923851013,
+      "learning_rate": 1.1032663365329255e-05,
+      "loss": 0.0325,
+      "step": 1500
+    },
+    {
+      "epoch": 4.301994301994302,
+      "grad_norm": 0.2235259711742401,
+      "learning_rate": 1.0186025572684189e-05,
+      "loss": 0.0208,
+      "step": 1510
+    },
+    {
+      "epoch": 4.330484330484331,
+      "grad_norm": 0.21038542687892914,
+      "learning_rate": 9.37143912410765e-06,
+      "loss": 0.0216,
+      "step": 1520
+    },
+    {
+      "epoch": 4.358974358974359,
+      "grad_norm": 0.21731996536254883,
+      "learning_rate": 8.589194715905036e-06,
+      "loss": 0.0283,
+      "step": 1530
+    },
+    {
+      "epoch": 4.387464387464387,
+      "grad_norm": 0.20443207025527954,
+      "learning_rate": 7.839571502682652e-06,
+      "loss": 0.0194,
+      "step": 1540
+    },
+    {
+      "epoch": 4.415954415954416,
+      "grad_norm": 0.32589828968048096,
+      "learning_rate": 7.122836997727611e-06,
+      "loss": 0.0313,
+      "step": 1550
+    },
+    {
+      "epoch": 4.444444444444445,
+      "grad_norm": 0.19662369787693024,
+      "learning_rate": 6.439246977542091e-06,
+      "loss": 0.0218,
+      "step": 1560
+    },
+    {
+      "epoch": 4.472934472934473,
+      "grad_norm": 0.24466551840305328,
+      "learning_rate": 5.789045390565951e-06,
+      "loss": 0.0196,
+      "step": 1570
+    },
+    {
+      "epoch": 4.501424501424501,
+      "grad_norm": 0.20958594977855682,
+      "learning_rate": 5.1724642701203916e-06,
+      "loss": 0.0215,
+      "step": 1580
+    },
+    {
+      "epoch": 4.52991452991453,
+      "grad_norm": 0.28344252705574036,
+      "learning_rate": 4.589723651603739e-06,
+      "loss": 0.0233,
+      "step": 1590
+    },
+    {
+      "epoch": 4.5584045584045585,
+      "grad_norm": 0.15124832093715668,
+      "learning_rate": 4.0410314939687725e-06,
+      "loss": 0.0225,
+      "step": 1600
+    },
+    {
+      "epoch": 4.586894586894587,
+      "grad_norm": 0.2302241027355194,
+      "learning_rate": 3.5265836055097258e-06,
+      "loss": 0.0239,
+      "step": 1610
+    },
+    {
+      "epoch": 4.615384615384615,
+      "grad_norm": 0.3110940456390381,
+      "learning_rate": 3.0465635739854725e-06,
+      "loss": 0.0357,
+      "step": 1620
+    },
+    {
+      "epoch": 4.643874643874644,
+      "grad_norm": 0.19868485629558563,
+      "learning_rate": 2.6011427011036317e-06,
+      "loss": 0.0296,
+      "step": 1630
+    },
+    {
+      "epoch": 4.672364672364672,
+      "grad_norm": 0.180753692984581,
+      "learning_rate": 2.1904799413893116e-06,
+      "loss": 0.0271,
+      "step": 1640
+    },
+    {
+      "epoch": 4.700854700854701,
+      "grad_norm": 0.24652840197086334,
+      "learning_rate": 1.814721845459977e-06,
+      "loss": 0.0192,
+      "step": 1650
+    },
+    {
+      "epoch": 4.72934472934473,
+      "grad_norm": 0.27937212586402893,
+      "learning_rate": 1.4740025077268128e-06,
+      "loss": 0.0265,
+      "step": 1660
+    },
+    {
+      "epoch": 4.7578347578347575,
+      "grad_norm": 0.2583824694156647,
+      "learning_rate": 1.1684435185413733e-06,
+      "loss": 0.0274,
+      "step": 1670
+    },
+    {
+      "epoch": 4.786324786324786,
+      "grad_norm": 0.2571396231651306,
+      "learning_rate": 8.981539208043788e-07,
+      "loss": 0.0248,
+      "step": 1680
+    },
+    {
+      "epoch": 4.814814814814815,
+      "grad_norm": 0.26664578914642334,
+      "learning_rate": 6.632301710522205e-07,
+      "loss": 0.022,
+      "step": 1690
+    },
+    {
+      "epoch": 4.843304843304844,
+      "grad_norm": 0.22444820404052734,
+      "learning_rate": 4.63756105035229e-07,
+      "loss": 0.0357,
+      "step": 1700
+    },
+    {
+      "epoch": 4.871794871794872,
+      "grad_norm": 0.21229873597621918,
+      "learning_rate": 2.9980290779960676e-07,
+      "loss": 0.0236,
+      "step": 1710
+    },
+    {
+      "epoch": 4.9002849002849,
+      "grad_norm": 0.2009340524673462,
+      "learning_rate": 1.71429088284214e-07,
+      "loss": 0.0201,
+      "step": 1720
+    },
+    {
+      "epoch": 4.928774928774929,
+      "grad_norm": 0.2139723300933838,
+      "learning_rate": 7.868045844073857e-08,
+      "loss": 0.0255,
+      "step": 1730
+    },
+    {
+      "epoch": 4.957264957264957,
+      "grad_norm": 0.10056009143590927,
+      "learning_rate": 2.1590116885139567e-08,
+      "loss": 0.0238,
+      "step": 1740
+    },
+    {
+      "epoch": 4.985754985754986,
+      "grad_norm": 0.21970975399017334,
+      "learning_rate": 1.7843708592923947e-10,
+      "loss": 0.0277,
+      "step": 1750
+    },
+    {
+      "epoch": 4.988603988603988,
+      "step": 1751,
+      "total_flos": 3.081693778301952e+16,
+      "train_loss": 0.060192157869157555,
+      "train_runtime": 593.7414,
+      "train_samples_per_second": 47.186,
+      "train_steps_per_second": 2.949
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1751,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.081693778301952e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}