Training in progress, step 2200, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1412 -4

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3315a011cd350f3414c7a89b58d02ae04b0c89a4818fe65d144ef3f07bf0fa5e
 size 500770656

 version https://git-lfs.github.com/spec/v1
+oid sha256:7f4451577bf82ae4c14fb8b5f6d15593c695f63d1bc7c8c377049e28c0b6f430
 size 500770656

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8917c47b8bee59dbb6cc723b84d580f20118d23257b96d0b64142c9d88b49cc1
 size 254918356

 version https://git-lfs.github.com/spec/v1
+oid sha256:435f4a73c69232486ea2c5684eb01e7449a2602d9445e4a4dbe0c21719127715
 size 254918356

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7783dc92c497f856cfd411b5c600bbbdcef47d49a460249bc87f8ae070b0745e
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:c6015ab40414177a8cb3a25519cffb5a624e999127e3ac742f7bf693b450cb8e
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:326280ad626dfb0afdd3fc527e3332b744f0a35ffa7ea6d8eb922af144c94bec
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:e66e55baeee62db229bddf3da45b85b2a91fe7343a6a75e11aba725017a7a321
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": 0.635880708694458,
   "best_model_checkpoint": "miner_id_24/checkpoint-1800",
-  "epoch": 0.2758810952479481,
   "eval_steps": 200,
-  "global_step": 2000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -14095,6 +14095,1414 @@
       "eval_samples_per_second": 2.508,
       "eval_steps_per_second": 2.508,
       "step": 2000
     }
   ],
   "logging_steps": 1,
@@ -14109,7 +15517,7 @@
         "early_stopping_threshold": 0.0
       },
       "attributes": {
-        "early_stopping_patience_counter": 1
       }
     },
     "TrainerControl": {
@@ -14123,7 +15531,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.1657976759844864e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": 0.635880708694458,
   "best_model_checkpoint": "miner_id_24/checkpoint-1800",
+  "epoch": 0.303469204772743,
   "eval_steps": 200,
+  "global_step": 2200,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 2.508,
       "eval_steps_per_second": 2.508,
       "step": 2000
+    },
+    {
+      "epoch": 0.27601903579557213,
+      "grad_norm": 0.7976175546646118,
+      "learning_rate": 0.00019596262479772337,
+      "loss": 0.8986,
+      "step": 2001
+    },
+    {
+      "epoch": 0.2761569763431961,
+      "grad_norm": 0.7986498475074768,
+      "learning_rate": 0.0001959585548074112,
+      "loss": 0.6112,
+      "step": 2002
+    },
+    {
+      "epoch": 0.27629491689082003,
+      "grad_norm": 0.6366571187973022,
+      "learning_rate": 0.00019595448280900626,
+      "loss": 0.4923,
+      "step": 2003
+    },
+    {
+      "epoch": 0.27643285743844404,
+      "grad_norm": 0.6254854798316956,
+      "learning_rate": 0.0001959504088025937,
+      "loss": 0.3532,
+      "step": 2004
+    },
+    {
+      "epoch": 0.276570797986068,
+      "grad_norm": 0.9452973008155823,
+      "learning_rate": 0.0001959463327882588,
+      "loss": 1.054,
+      "step": 2005
+    },
+    {
+      "epoch": 0.276708738533692,
+      "grad_norm": 0.8644076585769653,
+      "learning_rate": 0.00019594225476608686,
+      "loss": 0.6988,
+      "step": 2006
+    },
+    {
+      "epoch": 0.27684667908131594,
+      "grad_norm": 0.563485860824585,
+      "learning_rate": 0.00019593817473616322,
+      "loss": 0.5089,
+      "step": 2007
+    },
+    {
+      "epoch": 0.27698461962893994,
+      "grad_norm": 0.9795621037483215,
+      "learning_rate": 0.00019593409269857325,
+      "loss": 0.2905,
+      "step": 2008
+    },
+    {
+      "epoch": 0.2771225601765639,
+      "grad_norm": 0.6783512234687805,
+      "learning_rate": 0.00019593000865340238,
+      "loss": 0.3544,
+      "step": 2009
+    },
+    {
+      "epoch": 0.2772605007241879,
+      "grad_norm": 0.6736263036727905,
+      "learning_rate": 0.00019592592260073613,
+      "loss": 0.508,
+      "step": 2010
+    },
+    {
+      "epoch": 0.27739844127181185,
+      "grad_norm": 0.8978418111801147,
+      "learning_rate": 0.00019592183454065988,
+      "loss": 0.8027,
+      "step": 2011
+    },
+    {
+      "epoch": 0.2775363818194358,
+      "grad_norm": 0.877034604549408,
+      "learning_rate": 0.0001959177444732593,
+      "loss": 0.7846,
+      "step": 2012
+    },
+    {
+      "epoch": 0.2776743223670598,
+      "grad_norm": 0.6984195113182068,
+      "learning_rate": 0.00019591365239861994,
+      "loss": 0.559,
+      "step": 2013
+    },
+    {
+      "epoch": 0.27781226291468375,
+      "grad_norm": 1.1937421560287476,
+      "learning_rate": 0.00019590955831682742,
+      "loss": 0.7328,
+      "step": 2014
+    },
+    {
+      "epoch": 0.27795020346230775,
+      "grad_norm": 0.7599695920944214,
+      "learning_rate": 0.00019590546222796742,
+      "loss": 0.392,
+      "step": 2015
+    },
+    {
+      "epoch": 0.2780881440099317,
+      "grad_norm": 0.8356521129608154,
+      "learning_rate": 0.00019590136413212566,
+      "loss": 0.3812,
+      "step": 2016
+    },
+    {
+      "epoch": 0.2782260845575557,
+      "grad_norm": 0.6217128038406372,
+      "learning_rate": 0.00019589726402938792,
+      "loss": 0.5064,
+      "step": 2017
+    },
+    {
+      "epoch": 0.27836402510517966,
+      "grad_norm": 0.9336037039756775,
+      "learning_rate": 0.00019589316191984,
+      "loss": 0.7596,
+      "step": 2018
+    },
+    {
+      "epoch": 0.27850196565280366,
+      "grad_norm": 0.6983953714370728,
+      "learning_rate": 0.0001958890578035677,
+      "loss": 0.7579,
+      "step": 2019
+    },
+    {
+      "epoch": 0.2786399062004276,
+      "grad_norm": 0.6743526458740234,
+      "learning_rate": 0.00019588495168065692,
+      "loss": 0.4536,
+      "step": 2020
+    },
+    {
+      "epoch": 0.2787778467480516,
+      "grad_norm": 0.8309145569801331,
+      "learning_rate": 0.00019588084355119363,
+      "loss": 0.5478,
+      "step": 2021
+    },
+    {
+      "epoch": 0.27891578729567557,
+      "grad_norm": 0.9033045172691345,
+      "learning_rate": 0.00019587673341526376,
+      "loss": 0.4243,
+      "step": 2022
+    },
+    {
+      "epoch": 0.2790537278432995,
+      "grad_norm": 0.8193897604942322,
+      "learning_rate": 0.00019587262127295331,
+      "loss": 0.5688,
+      "step": 2023
+    },
+    {
+      "epoch": 0.2791916683909235,
+      "grad_norm": 0.6730914115905762,
+      "learning_rate": 0.0001958685071243484,
+      "loss": 0.5477,
+      "step": 2024
+    },
+    {
+      "epoch": 0.27932960893854747,
+      "grad_norm": 0.5275852680206299,
+      "learning_rate": 0.00019586439096953506,
+      "loss": 0.2861,
+      "step": 2025
+    },
+    {
+      "epoch": 0.2794675494861715,
+      "grad_norm": 0.7503786087036133,
+      "learning_rate": 0.00019586027280859945,
+      "loss": 0.7682,
+      "step": 2026
+    },
+    {
+      "epoch": 0.2796054900337954,
+      "grad_norm": 0.6576685309410095,
+      "learning_rate": 0.00019585615264162772,
+      "loss": 0.6214,
+      "step": 2027
+    },
+    {
+      "epoch": 0.2797434305814194,
+      "grad_norm": 0.7833530306816101,
+      "learning_rate": 0.00019585203046870614,
+      "loss": 0.9256,
+      "step": 2028
+    },
+    {
+      "epoch": 0.2798813711290434,
+      "grad_norm": 0.779478132724762,
+      "learning_rate": 0.00019584790628992098,
+      "loss": 0.6512,
+      "step": 2029
+    },
+    {
+      "epoch": 0.2800193116766674,
+      "grad_norm": 0.5535669922828674,
+      "learning_rate": 0.0001958437801053585,
+      "loss": 0.3869,
+      "step": 2030
+    },
+    {
+      "epoch": 0.28015725222429133,
+      "grad_norm": 0.694486141204834,
+      "learning_rate": 0.00019583965191510505,
+      "loss": 0.3586,
+      "step": 2031
+    },
+    {
+      "epoch": 0.2802951927719153,
+      "grad_norm": 0.7821094989776611,
+      "learning_rate": 0.00019583552171924704,
+      "loss": 0.5341,
+      "step": 2032
+    },
+    {
+      "epoch": 0.2804331333195393,
+      "grad_norm": 0.570767879486084,
+      "learning_rate": 0.0001958313895178709,
+      "loss": 0.4214,
+      "step": 2033
+    },
+    {
+      "epoch": 0.28057107386716323,
+      "grad_norm": 0.7766290307044983,
+      "learning_rate": 0.00019582725531106307,
+      "loss": 0.6409,
+      "step": 2034
+    },
+    {
+      "epoch": 0.28070901441478724,
+      "grad_norm": 0.7544063925743103,
+      "learning_rate": 0.00019582311909891012,
+      "loss": 0.5586,
+      "step": 2035
+    },
+    {
+      "epoch": 0.2808469549624112,
+      "grad_norm": 0.6841877102851868,
+      "learning_rate": 0.0001958189808814986,
+      "loss": 0.4377,
+      "step": 2036
+    },
+    {
+      "epoch": 0.2809848955100352,
+      "grad_norm": 0.584334135055542,
+      "learning_rate": 0.00019581484065891506,
+      "loss": 0.5649,
+      "step": 2037
+    },
+    {
+      "epoch": 0.28112283605765914,
+      "grad_norm": 0.7064344882965088,
+      "learning_rate": 0.00019581069843124617,
+      "loss": 0.8847,
+      "step": 2038
+    },
+    {
+      "epoch": 0.28126077660528315,
+      "grad_norm": 0.5461025834083557,
+      "learning_rate": 0.00019580655419857866,
+      "loss": 0.3344,
+      "step": 2039
+    },
+    {
+      "epoch": 0.2813987171529071,
+      "grad_norm": 1.2574125528335571,
+      "learning_rate": 0.00019580240796099915,
+      "loss": 0.7018,
+      "step": 2040
+    },
+    {
+      "epoch": 0.28153665770053105,
+      "grad_norm": 1.02732253074646,
+      "learning_rate": 0.00019579825971859452,
+      "loss": 1.1026,
+      "step": 2041
+    },
+    {
+      "epoch": 0.28167459824815505,
+      "grad_norm": 0.4866338074207306,
+      "learning_rate": 0.00019579410947145146,
+      "loss": 0.4095,
+      "step": 2042
+    },
+    {
+      "epoch": 0.281812538795779,
+      "grad_norm": 0.7297942042350769,
+      "learning_rate": 0.00019578995721965695,
+      "loss": 0.7477,
+      "step": 2043
+    },
+    {
+      "epoch": 0.281950479343403,
+      "grad_norm": 0.671257734298706,
+      "learning_rate": 0.0001957858029632978,
+      "loss": 0.6971,
+      "step": 2044
+    },
+    {
+      "epoch": 0.28208841989102695,
+      "grad_norm": 0.6661747097969055,
+      "learning_rate": 0.00019578164670246094,
+      "loss": 0.4219,
+      "step": 2045
+    },
+    {
+      "epoch": 0.28222636043865096,
+      "grad_norm": 1.152039885520935,
+      "learning_rate": 0.00019577748843723337,
+      "loss": 0.9014,
+      "step": 2046
+    },
+    {
+      "epoch": 0.2823643009862749,
+      "grad_norm": 0.8204615712165833,
+      "learning_rate": 0.0001957733281677021,
+      "loss": 0.7038,
+      "step": 2047
+    },
+    {
+      "epoch": 0.2825022415338989,
+      "grad_norm": 0.6705266237258911,
+      "learning_rate": 0.00019576916589395424,
+      "loss": 0.7392,
+      "step": 2048
+    },
+    {
+      "epoch": 0.28264018208152286,
+      "grad_norm": 1.229459524154663,
+      "learning_rate": 0.00019576500161607685,
+      "loss": 1.0651,
+      "step": 2049
+    },
+    {
+      "epoch": 0.2827781226291468,
+      "grad_norm": 0.685117244720459,
+      "learning_rate": 0.00019576083533415703,
+      "loss": 0.7557,
+      "step": 2050
+    },
+    {
+      "epoch": 0.2829160631767708,
+      "grad_norm": 0.8755848407745361,
+      "learning_rate": 0.00019575666704828206,
+      "loss": 0.931,
+      "step": 2051
+    },
+    {
+      "epoch": 0.28305400372439476,
+      "grad_norm": 0.6004536747932434,
+      "learning_rate": 0.00019575249675853908,
+      "loss": 0.5779,
+      "step": 2052
+    },
+    {
+      "epoch": 0.28319194427201877,
+      "grad_norm": 0.671427845954895,
+      "learning_rate": 0.00019574832446501544,
+      "loss": 0.4515,
+      "step": 2053
+    },
+    {
+      "epoch": 0.2833298848196427,
+      "grad_norm": 0.9582410454750061,
+      "learning_rate": 0.0001957441501677984,
+      "loss": 0.6176,
+      "step": 2054
+    },
+    {
+      "epoch": 0.2834678253672667,
+      "grad_norm": 0.8629324436187744,
+      "learning_rate": 0.00019573997386697532,
+      "loss": 0.7077,
+      "step": 2055
+    },
+    {
+      "epoch": 0.2836057659148907,
+      "grad_norm": 0.9006950259208679,
+      "learning_rate": 0.0001957357955626336,
+      "loss": 0.6964,
+      "step": 2056
+    },
+    {
+      "epoch": 0.2837437064625147,
+      "grad_norm": 0.6615795493125916,
+      "learning_rate": 0.0001957316152548607,
+      "loss": 0.4452,
+      "step": 2057
+    },
+    {
+      "epoch": 0.2838816470101386,
+      "grad_norm": 0.7859619855880737,
+      "learning_rate": 0.00019572743294374404,
+      "loss": 1.0109,
+      "step": 2058
+    },
+    {
+      "epoch": 0.28401958755776263,
+      "grad_norm": 0.6359809041023254,
+      "learning_rate": 0.00019572324862937124,
+      "loss": 0.452,
+      "step": 2059
+    },
+    {
+      "epoch": 0.2841575281053866,
+      "grad_norm": 0.780289351940155,
+      "learning_rate": 0.00019571906231182978,
+      "loss": 0.8381,
+      "step": 2060
+    },
+    {
+      "epoch": 0.28429546865301053,
+      "grad_norm": 0.8848547339439392,
+      "learning_rate": 0.0001957148739912073,
+      "loss": 0.6521,
+      "step": 2061
+    },
+    {
+      "epoch": 0.28443340920063453,
+      "grad_norm": 0.6815661787986755,
+      "learning_rate": 0.00019571068366759143,
+      "loss": 0.5813,
+      "step": 2062
+    },
+    {
+      "epoch": 0.2845713497482585,
+      "grad_norm": 0.5312855243682861,
+      "learning_rate": 0.00019570649134106985,
+      "loss": 0.3351,
+      "step": 2063
+    },
+    {
+      "epoch": 0.2847092902958825,
+      "grad_norm": 0.5981124043464661,
+      "learning_rate": 0.00019570229701173036,
+      "loss": 0.4126,
+      "step": 2064
+    },
+    {
+      "epoch": 0.28484723084350644,
+      "grad_norm": 0.9804319739341736,
+      "learning_rate": 0.00019569810067966066,
+      "loss": 0.8333,
+      "step": 2065
+    },
+    {
+      "epoch": 0.28498517139113044,
+      "grad_norm": 1.0361062288284302,
+      "learning_rate": 0.00019569390234494858,
+      "loss": 0.6087,
+      "step": 2066
+    },
+    {
+      "epoch": 0.2851231119387544,
+      "grad_norm": 0.7839725017547607,
+      "learning_rate": 0.000195689702007682,
+      "loss": 0.7404,
+      "step": 2067
+    },
+    {
+      "epoch": 0.2852610524863784,
+      "grad_norm": 1.3355668783187866,
+      "learning_rate": 0.0001956854996679488,
+      "loss": 0.4984,
+      "step": 2068
+    },
+    {
+      "epoch": 0.28539899303400235,
+      "grad_norm": 0.6724937558174133,
+      "learning_rate": 0.00019568129532583693,
+      "loss": 0.4341,
+      "step": 2069
+    },
+    {
+      "epoch": 0.2855369335816263,
+      "grad_norm": 0.7715407013893127,
+      "learning_rate": 0.00019567708898143437,
+      "loss": 0.6913,
+      "step": 2070
+    },
+    {
+      "epoch": 0.2856748741292503,
+      "grad_norm": 0.8403461575508118,
+      "learning_rate": 0.00019567288063482914,
+      "loss": 0.5184,
+      "step": 2071
+    },
+    {
+      "epoch": 0.28581281467687425,
+      "grad_norm": 0.6787713766098022,
+      "learning_rate": 0.0001956686702861093,
+      "loss": 0.5928,
+      "step": 2072
+    },
+    {
+      "epoch": 0.28595075522449825,
+      "grad_norm": 0.5545241832733154,
+      "learning_rate": 0.00019566445793536299,
+      "loss": 0.4176,
+      "step": 2073
+    },
+    {
+      "epoch": 0.2860886957721222,
+      "grad_norm": 0.5456835031509399,
+      "learning_rate": 0.00019566024358267834,
+      "loss": 0.409,
+      "step": 2074
+    },
+    {
+      "epoch": 0.2862266363197462,
+      "grad_norm": 1.8867385387420654,
+      "learning_rate": 0.00019565602722814354,
+      "loss": 0.6322,
+      "step": 2075
+    },
+    {
+      "epoch": 0.28636457686737016,
+      "grad_norm": 0.7244119644165039,
+      "learning_rate": 0.0001956518088718468,
+      "loss": 0.4894,
+      "step": 2076
+    },
+    {
+      "epoch": 0.28650251741499416,
+      "grad_norm": 0.7089682817459106,
+      "learning_rate": 0.00019564758851387649,
+      "loss": 0.5693,
+      "step": 2077
+    },
+    {
+      "epoch": 0.2866404579626181,
+      "grad_norm": 0.6970006823539734,
+      "learning_rate": 0.0001956433661543208,
+      "loss": 0.4493,
+      "step": 2078
+    },
+    {
+      "epoch": 0.28677839851024206,
+      "grad_norm": 0.7393503785133362,
+      "learning_rate": 0.00019563914179326818,
+      "loss": 0.5863,
+      "step": 2079
+    },
+    {
+      "epoch": 0.28691633905786607,
+      "grad_norm": 0.6624215841293335,
+      "learning_rate": 0.00019563491543080698,
+      "loss": 0.3739,
+      "step": 2080
+    },
+    {
+      "epoch": 0.28705427960549,
+      "grad_norm": 0.7205662727355957,
+      "learning_rate": 0.0001956306870670257,
+      "loss": 0.5124,
+      "step": 2081
+    },
+    {
+      "epoch": 0.287192220153114,
+      "grad_norm": 1.1564881801605225,
+      "learning_rate": 0.00019562645670201276,
+      "loss": 1.0517,
+      "step": 2082
+    },
+    {
+      "epoch": 0.28733016070073797,
+      "grad_norm": 0.7639877796173096,
+      "learning_rate": 0.00019562222433585673,
+      "loss": 0.9036,
+      "step": 2083
+    },
+    {
+      "epoch": 0.287468101248362,
+      "grad_norm": 0.6498881578445435,
+      "learning_rate": 0.00019561798996864618,
+      "loss": 0.618,
+      "step": 2084
+    },
+    {
+      "epoch": 0.2876060417959859,
+      "grad_norm": 0.7746434807777405,
+      "learning_rate": 0.0001956137536004697,
+      "loss": 0.5444,
+      "step": 2085
+    },
+    {
+      "epoch": 0.2877439823436099,
+      "grad_norm": 1.1528464555740356,
+      "learning_rate": 0.00019560951523141595,
+      "loss": 0.4188,
+      "step": 2086
+    },
+    {
+      "epoch": 0.2878819228912339,
+      "grad_norm": 0.6776193976402283,
+      "learning_rate": 0.00019560527486157364,
+      "loss": 0.4812,
+      "step": 2087
+    },
+    {
+      "epoch": 0.2880198634388578,
+      "grad_norm": 1.0938503742218018,
+      "learning_rate": 0.00019560103249103148,
+      "loss": 0.8737,
+      "step": 2088
+    },
+    {
+      "epoch": 0.28815780398648183,
+      "grad_norm": 0.6782721281051636,
+      "learning_rate": 0.00019559678811987828,
+      "loss": 0.5982,
+      "step": 2089
+    },
+    {
+      "epoch": 0.2882957445341058,
+      "grad_norm": 0.6858242154121399,
+      "learning_rate": 0.00019559254174820282,
+      "loss": 0.6636,
+      "step": 2090
+    },
+    {
+      "epoch": 0.2884336850817298,
+      "grad_norm": 0.8259555697441101,
+      "learning_rate": 0.00019558829337609402,
+      "loss": 0.4079,
+      "step": 2091
+    },
+    {
+      "epoch": 0.28857162562935373,
+      "grad_norm": 0.8771445155143738,
+      "learning_rate": 0.00019558404300364072,
+      "loss": 0.6069,
+      "step": 2092
+    },
+    {
+      "epoch": 0.28870956617697774,
+      "grad_norm": 0.5591592192649841,
+      "learning_rate": 0.00019557979063093188,
+      "loss": 0.373,
+      "step": 2093
+    },
+    {
+      "epoch": 0.2888475067246017,
+      "grad_norm": 0.7256616353988647,
+      "learning_rate": 0.00019557553625805657,
+      "loss": 0.6074,
+      "step": 2094
+    },
+    {
+      "epoch": 0.2889854472722257,
+      "grad_norm": 0.646175742149353,
+      "learning_rate": 0.00019557127988510372,
+      "loss": 0.6554,
+      "step": 2095
+    },
+    {
+      "epoch": 0.28912338781984964,
+      "grad_norm": 0.5466925501823425,
+      "learning_rate": 0.00019556702151216242,
+      "loss": 0.4869,
+      "step": 2096
+    },
+    {
+      "epoch": 0.2892613283674736,
+      "grad_norm": 0.8264899253845215,
+      "learning_rate": 0.00019556276113932183,
+      "loss": 0.5827,
+      "step": 2097
+    },
+    {
+      "epoch": 0.2893992689150976,
+      "grad_norm": 0.7389553189277649,
+      "learning_rate": 0.00019555849876667103,
+      "loss": 0.3154,
+      "step": 2098
+    },
+    {
+      "epoch": 0.28953720946272155,
+      "grad_norm": 0.6903903484344482,
+      "learning_rate": 0.0001955542343942993,
+      "loss": 0.7065,
+      "step": 2099
+    },
+    {
+      "epoch": 0.28967515001034555,
+      "grad_norm": 1.1037869453430176,
+      "learning_rate": 0.00019554996802229583,
+      "loss": 0.7192,
+      "step": 2100
+    },
+    {
+      "epoch": 0.2898130905579695,
+      "grad_norm": 1.640199065208435,
+      "learning_rate": 0.00019554569965074992,
+      "loss": 0.553,
+      "step": 2101
+    },
+    {
+      "epoch": 0.2899510311055935,
+      "grad_norm": 0.6174784302711487,
+      "learning_rate": 0.00019554142927975088,
+      "loss": 0.5931,
+      "step": 2102
+    },
+    {
+      "epoch": 0.29008897165321745,
+      "grad_norm": 0.9308575987815857,
+      "learning_rate": 0.0001955371569093881,
+      "loss": 0.5784,
+      "step": 2103
+    },
+    {
+      "epoch": 0.29022691220084146,
+      "grad_norm": 1.0464619398117065,
+      "learning_rate": 0.00019553288253975094,
+      "loss": 0.6852,
+      "step": 2104
+    },
+    {
+      "epoch": 0.2903648527484654,
+      "grad_norm": 0.8496968746185303,
+      "learning_rate": 0.00019552860617092887,
+      "loss": 0.7255,
+      "step": 2105
+    },
+    {
+      "epoch": 0.2905027932960894,
+      "grad_norm": 0.7223439812660217,
+      "learning_rate": 0.00019552432780301139,
+      "loss": 0.6047,
+      "step": 2106
+    },
+    {
+      "epoch": 0.29064073384371336,
+      "grad_norm": 0.7429842948913574,
+      "learning_rate": 0.00019552004743608804,
+      "loss": 0.4424,
+      "step": 2107
+    },
+    {
+      "epoch": 0.2907786743913373,
+      "grad_norm": 0.6475143432617188,
+      "learning_rate": 0.0001955157650702484,
+      "loss": 0.4936,
+      "step": 2108
+    },
+    {
+      "epoch": 0.2909166149389613,
+      "grad_norm": 0.744999885559082,
+      "learning_rate": 0.00019551148070558205,
+      "loss": 0.7856,
+      "step": 2109
+    },
+    {
+      "epoch": 0.29105455548658526,
+      "grad_norm": 0.6844127178192139,
+      "learning_rate": 0.00019550719434217865,
+      "loss": 0.6373,
+      "step": 2110
+    },
+    {
+      "epoch": 0.29119249603420927,
+      "grad_norm": 0.7517552375793457,
+      "learning_rate": 0.00019550290598012793,
+      "loss": 0.4843,
+      "step": 2111
+    },
+    {
+      "epoch": 0.2913304365818332,
+      "grad_norm": 0.7101173996925354,
+      "learning_rate": 0.00019549861561951959,
+      "loss": 0.3913,
+      "step": 2112
+    },
+    {
+      "epoch": 0.2914683771294572,
+      "grad_norm": 0.721660852432251,
+      "learning_rate": 0.00019549432326044345,
+      "loss": 0.302,
+      "step": 2113
+    },
+    {
+      "epoch": 0.2916063176770812,
+      "grad_norm": 0.6831248998641968,
+      "learning_rate": 0.00019549002890298934,
+      "loss": 0.5832,
+      "step": 2114
+    },
+    {
+      "epoch": 0.2917442582247052,
+      "grad_norm": 0.6875988245010376,
+      "learning_rate": 0.00019548573254724708,
+      "loss": 0.3342,
+      "step": 2115
+    },
+    {
+      "epoch": 0.2918821987723291,
+      "grad_norm": 0.5807436108589172,
+      "learning_rate": 0.00019548143419330661,
+      "loss": 0.7861,
+      "step": 2116
+    },
+    {
+      "epoch": 0.2920201393199531,
+      "grad_norm": 1.089648962020874,
+      "learning_rate": 0.0001954771338412579,
+      "loss": 0.6961,
+      "step": 2117
+    },
+    {
+      "epoch": 0.2921580798675771,
+      "grad_norm": 0.7699464559555054,
+      "learning_rate": 0.00019547283149119092,
+      "loss": 0.4305,
+      "step": 2118
+    },
+    {
+      "epoch": 0.29229602041520103,
+      "grad_norm": 0.7928016781806946,
+      "learning_rate": 0.0001954685271431957,
+      "loss": 0.6308,
+      "step": 2119
+    },
+    {
+      "epoch": 0.29243396096282503,
+      "grad_norm": 0.9115915298461914,
+      "learning_rate": 0.0001954642207973623,
+      "loss": 0.6821,
+      "step": 2120
+    },
+    {
+      "epoch": 0.292571901510449,
+      "grad_norm": 0.8448048233985901,
+      "learning_rate": 0.00019545991245378087,
+      "loss": 0.7006,
+      "step": 2121
+    },
+    {
+      "epoch": 0.292709842058073,
+      "grad_norm": 0.7800453305244446,
+      "learning_rate": 0.00019545560211254155,
+      "loss": 0.5068,
+      "step": 2122
+    },
+    {
+      "epoch": 0.29284778260569694,
+      "grad_norm": 0.8466667532920837,
+      "learning_rate": 0.00019545128977373454,
+      "loss": 0.8495,
+      "step": 2123
+    },
+    {
+      "epoch": 0.29298572315332094,
+      "grad_norm": 0.5569940805435181,
+      "learning_rate": 0.00019544697543745013,
+      "loss": 0.4394,
+      "step": 2124
+    },
+    {
+      "epoch": 0.2931236637009449,
+      "grad_norm": 1.0552653074264526,
+      "learning_rate": 0.0001954426591037785,
+      "loss": 0.932,
+      "step": 2125
+    },
+    {
+      "epoch": 0.29326160424856884,
+      "grad_norm": 0.8036336898803711,
+      "learning_rate": 0.00019543834077281007,
+      "loss": 0.5089,
+      "step": 2126
+    },
+    {
+      "epoch": 0.29339954479619285,
+      "grad_norm": 0.9038389325141907,
+      "learning_rate": 0.00019543402044463521,
+      "loss": 0.4724,
+      "step": 2127
+    },
+    {
+      "epoch": 0.2935374853438168,
+      "grad_norm": 0.5815970301628113,
+      "learning_rate": 0.00019542969811934426,
+      "loss": 0.3765,
+      "step": 2128
+    },
+    {
+      "epoch": 0.2936754258914408,
+      "grad_norm": 0.8801495432853699,
+      "learning_rate": 0.00019542537379702772,
+      "loss": 0.6568,
+      "step": 2129
+    },
+    {
+      "epoch": 0.29381336643906475,
+      "grad_norm": 0.9125809073448181,
+      "learning_rate": 0.0001954210474777761,
+      "loss": 1.0139,
+      "step": 2130
+    },
+    {
+      "epoch": 0.29395130698668875,
+      "grad_norm": 0.6941331624984741,
+      "learning_rate": 0.00019541671916167987,
+      "loss": 0.5135,
+      "step": 2131
+    },
+    {
+      "epoch": 0.2940892475343127,
+      "grad_norm": 0.721238911151886,
+      "learning_rate": 0.00019541238884882966,
+      "loss": 0.3743,
+      "step": 2132
+    },
+    {
+      "epoch": 0.2942271880819367,
+      "grad_norm": 1.1131097078323364,
+      "learning_rate": 0.00019540805653931609,
+      "loss": 0.8024,
+      "step": 2133
+    },
+    {
+      "epoch": 0.29436512862956066,
+      "grad_norm": 0.6035623550415039,
+      "learning_rate": 0.0001954037222332298,
+      "loss": 0.4709,
+      "step": 2134
+    },
+    {
+      "epoch": 0.2945030691771846,
+      "grad_norm": 0.6855323314666748,
+      "learning_rate": 0.0001953993859306615,
+      "loss": 0.4904,
+      "step": 2135
+    },
+    {
+      "epoch": 0.2946410097248086,
+      "grad_norm": 0.7864904403686523,
+      "learning_rate": 0.00019539504763170192,
+      "loss": 0.73,
+      "step": 2136
+    },
+    {
+      "epoch": 0.29477895027243256,
+      "grad_norm": 1.1502920389175415,
+      "learning_rate": 0.0001953907073364419,
+      "loss": 1.1485,
+      "step": 2137
+    },
+    {
+      "epoch": 0.29491689082005657,
+      "grad_norm": 0.8464686870574951,
+      "learning_rate": 0.0001953863650449722,
+      "loss": 0.6272,
+      "step": 2138
+    },
+    {
+      "epoch": 0.2950548313676805,
+      "grad_norm": 0.645497739315033,
+      "learning_rate": 0.00019538202075738373,
+      "loss": 0.5731,
+      "step": 2139
+    },
+    {
+      "epoch": 0.2951927719153045,
+      "grad_norm": 0.7950919270515442,
+      "learning_rate": 0.00019537767447376736,
+      "loss": 0.6039,
+      "step": 2140
+    },
+    {
+      "epoch": 0.29533071246292847,
+      "grad_norm": 0.7011622190475464,
+      "learning_rate": 0.0001953733261942141,
+      "loss": 0.4037,
+      "step": 2141
+    },
+    {
+      "epoch": 0.2954686530105525,
+      "grad_norm": 0.7136140465736389,
+      "learning_rate": 0.0001953689759188149,
+      "loss": 0.6339,
+      "step": 2142
+    },
+    {
+      "epoch": 0.2956065935581764,
+      "grad_norm": 1.0056228637695312,
+      "learning_rate": 0.0001953646236476608,
+      "loss": 0.5475,
+      "step": 2143
+    },
+    {
+      "epoch": 0.2957445341058004,
+      "grad_norm": 0.8086057901382446,
+      "learning_rate": 0.00019536026938084296,
+      "loss": 0.6254,
+      "step": 2144
+    },
+    {
+      "epoch": 0.2958824746534244,
+      "grad_norm": 0.8735644817352295,
+      "learning_rate": 0.00019535591311845235,
+      "loss": 0.5957,
+      "step": 2145
+    },
+    {
+      "epoch": 0.2960204152010483,
+      "grad_norm": 0.6294654607772827,
+      "learning_rate": 0.00019535155486058027,
+      "loss": 0.4358,
+      "step": 2146
+    },
+    {
+      "epoch": 0.29615835574867233,
+      "grad_norm": 0.6147223114967346,
+      "learning_rate": 0.00019534719460731785,
+      "loss": 0.5106,
+      "step": 2147
+    },
+    {
+      "epoch": 0.2962962962962963,
+      "grad_norm": 0.6865537166595459,
+      "learning_rate": 0.00019534283235875637,
+      "loss": 0.6796,
+      "step": 2148
+    },
+    {
+      "epoch": 0.2964342368439203,
+      "grad_norm": 0.8193590641021729,
+      "learning_rate": 0.0001953384681149871,
+      "loss": 0.6479,
+      "step": 2149
+    },
+    {
+      "epoch": 0.29657217739154423,
+      "grad_norm": 0.8016851544380188,
+      "learning_rate": 0.00019533410187610138,
+      "loss": 0.8757,
+      "step": 2150
+    },
+    {
+      "epoch": 0.29671011793916824,
+      "grad_norm": 0.8192347288131714,
+      "learning_rate": 0.00019532973364219054,
+      "loss": 0.8549,
+      "step": 2151
+    },
+    {
+      "epoch": 0.2968480584867922,
+      "grad_norm": 1.2745975255966187,
+      "learning_rate": 0.000195325363413346,
+      "loss": 0.8386,
+      "step": 2152
+    },
+    {
+      "epoch": 0.2969859990344162,
+      "grad_norm": 0.7096378207206726,
+      "learning_rate": 0.00019532099118965931,
+      "loss": 0.5653,
+      "step": 2153
+    },
+    {
+      "epoch": 0.29712393958204014,
+      "grad_norm": 0.5258468985557556,
+      "learning_rate": 0.00019531661697122184,
+      "loss": 0.3627,
+      "step": 2154
+    },
+    {
+      "epoch": 0.2972618801296641,
+      "grad_norm": 0.6194223165512085,
+      "learning_rate": 0.00019531224075812524,
+      "loss": 0.2661,
+      "step": 2155
+    },
+    {
+      "epoch": 0.2973998206772881,
+      "grad_norm": 0.760379433631897,
+      "learning_rate": 0.000195307862550461,
+      "loss": 0.5146,
+      "step": 2156
+    },
+    {
+      "epoch": 0.29753776122491205,
+      "grad_norm": 0.6956475973129272,
+      "learning_rate": 0.00019530348234832076,
+      "loss": 0.8747,
+      "step": 2157
+    },
+    {
+      "epoch": 0.29767570177253605,
+      "grad_norm": 0.6443366408348083,
+      "learning_rate": 0.0001952991001517962,
+      "loss": 0.3854,
+      "step": 2158
+    },
+    {
+      "epoch": 0.29781364232016,
+      "grad_norm": 0.7563418745994568,
+      "learning_rate": 0.00019529471596097902,
+      "loss": 0.4861,
+      "step": 2159
+    },
+    {
+      "epoch": 0.297951582867784,
+      "grad_norm": 1.3392752408981323,
+      "learning_rate": 0.000195290329775961,
+      "loss": 0.6267,
+      "step": 2160
+    },
+    {
+      "epoch": 0.29808952341540795,
+      "grad_norm": 0.5877766013145447,
+      "learning_rate": 0.00019528594159683385,
+      "loss": 0.4547,
+      "step": 2161
+    },
+    {
+      "epoch": 0.29822746396303196,
+      "grad_norm": 0.9212716817855835,
+      "learning_rate": 0.00019528155142368948,
+      "loss": 0.8183,
+      "step": 2162
+    },
+    {
+      "epoch": 0.2983654045106559,
+      "grad_norm": 2.37778902053833,
+      "learning_rate": 0.00019527715925661974,
+      "loss": 0.9423,
+      "step": 2163
+    },
+    {
+      "epoch": 0.29850334505827986,
+      "grad_norm": 0.6656827330589294,
+      "learning_rate": 0.0001952727650957165,
+      "loss": 0.4532,
+      "step": 2164
+    },
+    {
+      "epoch": 0.29864128560590386,
+      "grad_norm": 0.9072442650794983,
+      "learning_rate": 0.00019526836894107175,
+      "loss": 0.8813,
+      "step": 2165
+    },
+    {
+      "epoch": 0.2987792261535278,
+      "grad_norm": 1.0257484912872314,
+      "learning_rate": 0.00019526397079277748,
+      "loss": 0.8555,
+      "step": 2166
+    },
+    {
+      "epoch": 0.2989171667011518,
+      "grad_norm": 0.8982290029525757,
+      "learning_rate": 0.00019525957065092575,
+      "loss": 0.5986,
+      "step": 2167
+    },
+    {
+      "epoch": 0.29905510724877576,
+      "grad_norm": 0.6626170873641968,
+      "learning_rate": 0.00019525516851560859,
+      "loss": 0.6548,
+      "step": 2168
+    },
+    {
+      "epoch": 0.29919304779639977,
+      "grad_norm": 1.0469564199447632,
+      "learning_rate": 0.00019525076438691818,
+      "loss": 1.2987,
+      "step": 2169
+    },
+    {
+      "epoch": 0.2993309883440237,
+      "grad_norm": 0.9261853098869324,
+      "learning_rate": 0.00019524635826494665,
+      "loss": 0.7951,
+      "step": 2170
+    },
+    {
+      "epoch": 0.2994689288916477,
+      "grad_norm": 0.6855120062828064,
+      "learning_rate": 0.00019524195014978624,
+      "loss": 0.5499,
+      "step": 2171
+    },
+    {
+      "epoch": 0.2996068694392717,
+      "grad_norm": 0.8790969252586365,
+      "learning_rate": 0.00019523754004152912,
+      "loss": 0.6992,
+      "step": 2172
+    },
+    {
+      "epoch": 0.2997448099868956,
+      "grad_norm": 0.8488597869873047,
+      "learning_rate": 0.00019523312794026768,
+      "loss": 0.6438,
+      "step": 2173
+    },
+    {
+      "epoch": 0.2998827505345196,
+      "grad_norm": 1.2970049381256104,
+      "learning_rate": 0.00019522871384609417,
+      "loss": 0.7046,
+      "step": 2174
+    },
+    {
+      "epoch": 0.3000206910821436,
+      "grad_norm": 1.1517971754074097,
+      "learning_rate": 0.000195224297759101,
+      "loss": 0.8484,
+      "step": 2175
+    },
+    {
+      "epoch": 0.3001586316297676,
+      "grad_norm": 1.1928848028182983,
+      "learning_rate": 0.00019521987967938058,
+      "loss": 0.7795,
+      "step": 2176
+    },
+    {
+      "epoch": 0.30029657217739153,
+      "grad_norm": 0.7839152216911316,
+      "learning_rate": 0.00019521545960702534,
+      "loss": 0.337,
+      "step": 2177
+    },
+    {
+      "epoch": 0.30043451272501553,
+      "grad_norm": 0.6438336372375488,
+      "learning_rate": 0.0001952110375421278,
+      "loss": 0.5161,
+      "step": 2178
+    },
+    {
+      "epoch": 0.3005724532726395,
+      "grad_norm": 0.6859175562858582,
+      "learning_rate": 0.00019520661348478054,
+      "loss": 0.4835,
+      "step": 2179
+    },
+    {
+      "epoch": 0.3007103938202635,
+      "grad_norm": 0.6152809858322144,
+      "learning_rate": 0.00019520218743507606,
+      "loss": 0.394,
+      "step": 2180
+    },
+    {
+      "epoch": 0.30084833436788744,
+      "grad_norm": 0.6782438158988953,
+      "learning_rate": 0.00019519775939310705,
+      "loss": 0.6891,
+      "step": 2181
+    },
+    {
+      "epoch": 0.3009862749155114,
+      "grad_norm": 0.9672862887382507,
+      "learning_rate": 0.00019519332935896613,
+      "loss": 0.6395,
+      "step": 2182
+    },
+    {
+      "epoch": 0.3011242154631354,
+      "grad_norm": 0.6512202620506287,
+      "learning_rate": 0.000195188897332746,
+      "loss": 0.4909,
+      "step": 2183
+    },
+    {
+      "epoch": 0.30126215601075934,
+      "grad_norm": 1.2240195274353027,
+      "learning_rate": 0.00019518446331453948,
+      "loss": 0.9607,
+      "step": 2184
+    },
+    {
+      "epoch": 0.30140009655838335,
+      "grad_norm": 0.6230162978172302,
+      "learning_rate": 0.00019518002730443927,
+      "loss": 0.4991,
+      "step": 2185
+    },
+    {
+      "epoch": 0.3015380371060073,
+      "grad_norm": 1.0958621501922607,
+      "learning_rate": 0.00019517558930253826,
+      "loss": 0.6996,
+      "step": 2186
+    },
+    {
+      "epoch": 0.3016759776536313,
+      "grad_norm": 0.8363164067268372,
+      "learning_rate": 0.00019517114930892927,
+      "loss": 0.8149,
+      "step": 2187
+    },
+    {
+      "epoch": 0.30181391820125525,
+      "grad_norm": 0.7095656394958496,
+      "learning_rate": 0.00019516670732370528,
+      "loss": 0.4541,
+      "step": 2188
+    },
+    {
+      "epoch": 0.30195185874887925,
+      "grad_norm": 0.9975584149360657,
+      "learning_rate": 0.0001951622633469592,
+      "loss": 0.7558,
+      "step": 2189
+    },
+    {
+      "epoch": 0.3020897992965032,
+      "grad_norm": 0.5672247409820557,
+      "learning_rate": 0.00019515781737878402,
+      "loss": 0.2655,
+      "step": 2190
+    },
+    {
+      "epoch": 0.3022277398441272,
+      "grad_norm": 0.6647024154663086,
+      "learning_rate": 0.00019515336941927283,
+      "loss": 0.5032,
+      "step": 2191
+    },
+    {
+      "epoch": 0.30236568039175116,
+      "grad_norm": 0.7022714614868164,
+      "learning_rate": 0.00019514891946851868,
+      "loss": 0.5562,
+      "step": 2192
+    },
+    {
+      "epoch": 0.3025036209393751,
+      "grad_norm": 0.6081823706626892,
+      "learning_rate": 0.00019514446752661466,
+      "loss": 0.4645,
+      "step": 2193
+    },
+    {
+      "epoch": 0.3026415614869991,
+      "grad_norm": 0.6334623098373413,
+      "learning_rate": 0.00019514001359365399,
+      "loss": 0.6747,
+      "step": 2194
+    },
+    {
+      "epoch": 0.30277950203462306,
+      "grad_norm": 0.6264985799789429,
+      "learning_rate": 0.00019513555766972987,
+      "loss": 0.5149,
+      "step": 2195
+    },
+    {
+      "epoch": 0.30291744258224707,
+      "grad_norm": 0.784883975982666,
+      "learning_rate": 0.0001951310997549355,
+      "loss": 0.7089,
+      "step": 2196
+    },
+    {
+      "epoch": 0.303055383129871,
+      "grad_norm": 0.5658442974090576,
+      "learning_rate": 0.00019512663984936422,
+      "loss": 0.4461,
+      "step": 2197
+    },
+    {
+      "epoch": 0.303193323677495,
+      "grad_norm": 0.6339519023895264,
+      "learning_rate": 0.00019512217795310933,
+      "loss": 0.5541,
+      "step": 2198
+    },
+    {
+      "epoch": 0.30333126422511897,
+      "grad_norm": 0.8487290740013123,
+      "learning_rate": 0.0001951177140662642,
+      "loss": 0.7045,
+      "step": 2199
+    },
+    {
+      "epoch": 0.303469204772743,
+      "grad_norm": 0.5900312662124634,
+      "learning_rate": 0.00019511324818892228,
+      "loss": 0.42,
+      "step": 2200
+    },
+    {
+      "epoch": 0.303469204772743,
+      "eval_loss": 0.661845326423645,
+      "eval_runtime": 23.5093,
+      "eval_samples_per_second": 2.51,
+      "eval_steps_per_second": 2.51,
+      "step": 2200
     }
   ],
   "logging_steps": 1,
         "early_stopping_threshold": 0.0
       },
       "attributes": {
+        "early_stopping_patience_counter": 2
       }
     },
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 3.483236466111283e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null