{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.03305238803503553,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00016526194017517766,
"grad_norm": 0.555425226688385,
"learning_rate": 2e-05,
"loss": 2.2272,
"step": 1
},
{
"epoch": 0.00016526194017517766,
"eval_loss": 1.8130699396133423,
"eval_runtime": 42.0001,
"eval_samples_per_second": 60.667,
"eval_steps_per_second": 30.333,
"step": 1
},
{
"epoch": 0.0003305238803503553,
"grad_norm": 0.38278359174728394,
"learning_rate": 4e-05,
"loss": 1.7691,
"step": 2
},
{
"epoch": 0.0004957858205255329,
"grad_norm": 0.2836111783981323,
"learning_rate": 6e-05,
"loss": 1.6578,
"step": 3
},
{
"epoch": 0.0006610477607007106,
"grad_norm": 0.4891590178012848,
"learning_rate": 8e-05,
"loss": 2.1334,
"step": 4
},
{
"epoch": 0.0008263097008758883,
"grad_norm": 0.3973068296909332,
"learning_rate": 0.0001,
"loss": 1.8242,
"step": 5
},
{
"epoch": 0.0009915716410510659,
"grad_norm": 0.3104502856731415,
"learning_rate": 0.00012,
"loss": 1.6421,
"step": 6
},
{
"epoch": 0.0011568335812262437,
"grad_norm": 0.4168192446231842,
"learning_rate": 0.00014,
"loss": 1.8231,
"step": 7
},
{
"epoch": 0.0013220955214014213,
"grad_norm": 0.45680752396583557,
"learning_rate": 0.00016,
"loss": 1.7829,
"step": 8
},
{
"epoch": 0.001487357461576599,
"grad_norm": 0.40455353260040283,
"learning_rate": 0.00018,
"loss": 2.0192,
"step": 9
},
{
"epoch": 0.0016526194017517765,
"grad_norm": 0.3940141499042511,
"learning_rate": 0.0002,
"loss": 1.6667,
"step": 10
},
{
"epoch": 0.0018178813419269541,
"grad_norm": 0.3571617603302002,
"learning_rate": 0.0001999863304992469,
"loss": 1.6667,
"step": 11
},
{
"epoch": 0.0019831432821021317,
"grad_norm": 0.5306565761566162,
"learning_rate": 0.00019994532573409262,
"loss": 1.767,
"step": 12
},
{
"epoch": 0.0021484052222773093,
"grad_norm": 0.33301275968551636,
"learning_rate": 0.00019987699691483048,
"loss": 1.4221,
"step": 13
},
{
"epoch": 0.0023136671624524874,
"grad_norm": 0.3087587058544159,
"learning_rate": 0.00019978136272187747,
"loss": 1.5223,
"step": 14
},
{
"epoch": 0.002478929102627665,
"grad_norm": 0.5549050569534302,
"learning_rate": 0.000199658449300667,
"loss": 1.6935,
"step": 15
},
{
"epoch": 0.0026441910428028426,
"grad_norm": 0.43276911973953247,
"learning_rate": 0.00019950829025450114,
"loss": 1.7447,
"step": 16
},
{
"epoch": 0.00280945298297802,
"grad_norm": 0.41811099648475647,
"learning_rate": 0.00019933092663536382,
"loss": 1.6279,
"step": 17
},
{
"epoch": 0.002974714923153198,
"grad_norm": 0.7340661287307739,
"learning_rate": 0.00019912640693269752,
"loss": 1.7762,
"step": 18
},
{
"epoch": 0.0031399768633283754,
"grad_norm": 0.5529046654701233,
"learning_rate": 0.00019889478706014687,
"loss": 1.7278,
"step": 19
},
{
"epoch": 0.003305238803503553,
"grad_norm": 0.5795073509216309,
"learning_rate": 0.00019863613034027224,
"loss": 1.6349,
"step": 20
},
{
"epoch": 0.0034705007436787306,
"grad_norm": 0.4024738073348999,
"learning_rate": 0.00019835050748723824,
"loss": 1.478,
"step": 21
},
{
"epoch": 0.0036357626838539082,
"grad_norm": 0.5151705145835876,
"learning_rate": 0.00019803799658748094,
"loss": 1.3476,
"step": 22
},
{
"epoch": 0.0038010246240290863,
"grad_norm": 0.47924888134002686,
"learning_rate": 0.00019769868307835994,
"loss": 1.6887,
"step": 23
},
{
"epoch": 0.0039662865642042635,
"grad_norm": 0.414813756942749,
"learning_rate": 0.0001973326597248006,
"loss": 1.4679,
"step": 24
},
{
"epoch": 0.0041315485043794415,
"grad_norm": 0.5144776701927185,
"learning_rate": 0.00019694002659393305,
"loss": 1.6018,
"step": 25
},
{
"epoch": 0.004296810444554619,
"grad_norm": 0.6902279853820801,
"learning_rate": 0.00019652089102773488,
"loss": 1.6141,
"step": 26
},
{
"epoch": 0.004462072384729797,
"grad_norm": 0.5526428818702698,
"learning_rate": 0.00019607536761368484,
"loss": 1.4361,
"step": 27
},
{
"epoch": 0.004627334324904975,
"grad_norm": 0.9285733699798584,
"learning_rate": 0.00019560357815343577,
"loss": 1.9259,
"step": 28
},
{
"epoch": 0.004792596265080152,
"grad_norm": 0.37774184346199036,
"learning_rate": 0.00019510565162951537,
"loss": 1.4737,
"step": 29
},
{
"epoch": 0.00495785820525533,
"grad_norm": 0.36778566241264343,
"learning_rate": 0.00019458172417006347,
"loss": 1.5279,
"step": 30
},
{
"epoch": 0.005123120145430507,
"grad_norm": 0.5695986151695251,
"learning_rate": 0.00019403193901161613,
"loss": 1.8606,
"step": 31
},
{
"epoch": 0.005288382085605685,
"grad_norm": 0.3802303969860077,
"learning_rate": 0.0001934564464599461,
"loss": 1.6067,
"step": 32
},
{
"epoch": 0.005453644025780862,
"grad_norm": 0.7587772011756897,
"learning_rate": 0.00019285540384897073,
"loss": 1.8386,
"step": 33
},
{
"epoch": 0.00561890596595604,
"grad_norm": 0.6221739053726196,
"learning_rate": 0.00019222897549773848,
"loss": 1.4816,
"step": 34
},
{
"epoch": 0.005784167906131218,
"grad_norm": 0.583857536315918,
"learning_rate": 0.00019157733266550575,
"loss": 1.6782,
"step": 35
},
{
"epoch": 0.005949429846306396,
"grad_norm": 0.481245756149292,
"learning_rate": 0.00019090065350491626,
"loss": 1.4703,
"step": 36
},
{
"epoch": 0.006114691786481574,
"grad_norm": 0.44734448194503784,
"learning_rate": 0.00019019912301329592,
"loss": 1.5667,
"step": 37
},
{
"epoch": 0.006279953726656751,
"grad_norm": 0.4786554276943207,
"learning_rate": 0.00018947293298207635,
"loss": 1.5302,
"step": 38
},
{
"epoch": 0.006445215666831929,
"grad_norm": 0.47823983430862427,
"learning_rate": 0.0001887222819443612,
"loss": 1.4859,
"step": 39
},
{
"epoch": 0.006610477607007106,
"grad_norm": 0.4842425286769867,
"learning_rate": 0.0001879473751206489,
"loss": 1.3932,
"step": 40
},
{
"epoch": 0.006775739547182284,
"grad_norm": 0.3999955356121063,
"learning_rate": 0.00018714842436272773,
"loss": 1.4406,
"step": 41
},
{
"epoch": 0.006941001487357461,
"grad_norm": 0.4841754138469696,
"learning_rate": 0.00018632564809575742,
"loss": 1.4164,
"step": 42
},
{
"epoch": 0.007106263427532639,
"grad_norm": 0.5468775033950806,
"learning_rate": 0.0001854792712585539,
"loss": 1.7234,
"step": 43
},
{
"epoch": 0.0072715253677078165,
"grad_norm": 0.4681849479675293,
"learning_rate": 0.00018460952524209355,
"loss": 1.3656,
"step": 44
},
{
"epoch": 0.0074367873078829945,
"grad_norm": 0.5926626920700073,
"learning_rate": 0.00018371664782625287,
"loss": 1.5722,
"step": 45
},
{
"epoch": 0.007602049248058173,
"grad_norm": 0.5072199702262878,
"learning_rate": 0.00018280088311480201,
"loss": 1.4031,
"step": 46
},
{
"epoch": 0.00776731118823335,
"grad_norm": 0.48731309175491333,
"learning_rate": 0.00018186248146866927,
"loss": 1.4756,
"step": 47
},
{
"epoch": 0.007932573128408527,
"grad_norm": 0.4783889949321747,
"learning_rate": 0.00018090169943749476,
"loss": 1.0337,
"step": 48
},
{
"epoch": 0.008097835068583706,
"grad_norm": 0.458065927028656,
"learning_rate": 0.0001799187996894925,
"loss": 1.5369,
"step": 49
},
{
"epoch": 0.008263097008758883,
"grad_norm": 0.44745975732803345,
"learning_rate": 0.00017891405093963938,
"loss": 1.4635,
"step": 50
},
{
"epoch": 0.008263097008758883,
"eval_loss": 1.5854825973510742,
"eval_runtime": 42.2167,
"eval_samples_per_second": 60.355,
"eval_steps_per_second": 30.178,
"step": 50
},
{
"epoch": 0.00842835894893406,
"grad_norm": 0.47436589002609253,
"learning_rate": 0.00017788772787621126,
"loss": 1.667,
"step": 51
},
{
"epoch": 0.008593620889109237,
"grad_norm": 0.6580041646957397,
"learning_rate": 0.00017684011108568592,
"loss": 1.4318,
"step": 52
},
{
"epoch": 0.008758882829284416,
"grad_norm": 0.5122596621513367,
"learning_rate": 0.0001757714869760335,
"loss": 1.5615,
"step": 53
},
{
"epoch": 0.008924144769459593,
"grad_norm": 0.5801290273666382,
"learning_rate": 0.0001746821476984154,
"loss": 1.3862,
"step": 54
},
{
"epoch": 0.00908940670963477,
"grad_norm": 0.6132497191429138,
"learning_rate": 0.00017357239106731317,
"loss": 1.3698,
"step": 55
},
{
"epoch": 0.00925466864980995,
"grad_norm": 0.6059843301773071,
"learning_rate": 0.00017244252047910892,
"loss": 1.6441,
"step": 56
},
{
"epoch": 0.009419930589985127,
"grad_norm": 0.6696584224700928,
"learning_rate": 0.00017129284482913972,
"loss": 1.5325,
"step": 57
},
{
"epoch": 0.009585192530160304,
"grad_norm": 0.6695738434791565,
"learning_rate": 0.00017012367842724887,
"loss": 1.3917,
"step": 58
},
{
"epoch": 0.009750454470335481,
"grad_norm": 0.5053813457489014,
"learning_rate": 0.0001689353409118566,
"loss": 1.597,
"step": 59
},
{
"epoch": 0.00991571641051066,
"grad_norm": 0.568986713886261,
"learning_rate": 0.00016772815716257412,
"loss": 1.926,
"step": 60
},
{
"epoch": 0.010080978350685837,
"grad_norm": 0.5120527148246765,
"learning_rate": 0.0001665024572113848,
"loss": 1.5176,
"step": 61
},
{
"epoch": 0.010246240290861014,
"grad_norm": 0.3892988860607147,
"learning_rate": 0.00016525857615241687,
"loss": 1.6132,
"step": 62
},
{
"epoch": 0.010411502231036193,
"grad_norm": 0.5233981609344482,
"learning_rate": 0.00016399685405033167,
"loss": 1.501,
"step": 63
},
{
"epoch": 0.01057676417121137,
"grad_norm": 0.5761738419532776,
"learning_rate": 0.0001627176358473537,
"loss": 1.6147,
"step": 64
},
{
"epoch": 0.010742026111386548,
"grad_norm": 0.475273460149765,
"learning_rate": 0.0001614212712689668,
"loss": 1.6735,
"step": 65
},
{
"epoch": 0.010907288051561725,
"grad_norm": 0.4451874792575836,
"learning_rate": 0.00016010811472830252,
"loss": 1.3603,
"step": 66
},
{
"epoch": 0.011072549991736904,
"grad_norm": 0.3606850504875183,
"learning_rate": 0.00015877852522924732,
"loss": 1.2809,
"step": 67
},
{
"epoch": 0.01123781193191208,
"grad_norm": 0.4811137020587921,
"learning_rate": 0.00015743286626829437,
"loss": 1.5322,
"step": 68
},
{
"epoch": 0.011403073872087258,
"grad_norm": 0.40435728430747986,
"learning_rate": 0.0001560715057351673,
"loss": 1.4096,
"step": 69
},
{
"epoch": 0.011568335812262435,
"grad_norm": 0.4875733554363251,
"learning_rate": 0.00015469481581224272,
"loss": 1.694,
"step": 70
},
{
"epoch": 0.011733597752437614,
"grad_norm": 0.3643505871295929,
"learning_rate": 0.0001533031728727994,
"loss": 1.4144,
"step": 71
},
{
"epoch": 0.011898859692612791,
"grad_norm": 0.5411165952682495,
"learning_rate": 0.00015189695737812152,
"loss": 1.5356,
"step": 72
},
{
"epoch": 0.012064121632787968,
"grad_norm": 0.48910924792289734,
"learning_rate": 0.0001504765537734844,
"loss": 1.4965,
"step": 73
},
{
"epoch": 0.012229383572963147,
"grad_norm": 0.3826967477798462,
"learning_rate": 0.00014904235038305083,
"loss": 1.5359,
"step": 74
},
{
"epoch": 0.012394645513138325,
"grad_norm": 0.6282126903533936,
"learning_rate": 0.00014759473930370736,
"loss": 1.8296,
"step": 75
},
{
"epoch": 0.012559907453313502,
"grad_norm": 0.444775253534317,
"learning_rate": 0.0001461341162978688,
"loss": 1.3698,
"step": 76
},
{
"epoch": 0.012725169393488679,
"grad_norm": 0.5965347290039062,
"learning_rate": 0.00014466088068528068,
"loss": 1.8349,
"step": 77
},
{
"epoch": 0.012890431333663858,
"grad_norm": 0.6038627624511719,
"learning_rate": 0.00014317543523384928,
"loss": 1.4731,
"step": 78
},
{
"epoch": 0.013055693273839035,
"grad_norm": 0.5650402307510376,
"learning_rate": 0.00014167818604952906,
"loss": 1.5489,
"step": 79
},
{
"epoch": 0.013220955214014212,
"grad_norm": 0.6513015627861023,
"learning_rate": 0.00014016954246529696,
"loss": 1.6598,
"step": 80
},
{
"epoch": 0.013386217154189391,
"grad_norm": 0.7023486495018005,
"learning_rate": 0.00013864991692924523,
"loss": 1.5978,
"step": 81
},
{
"epoch": 0.013551479094364568,
"grad_norm": 0.4741460680961609,
"learning_rate": 0.00013711972489182208,
"loss": 1.4441,
"step": 82
},
{
"epoch": 0.013716741034539745,
"grad_norm": 0.5267840623855591,
"learning_rate": 0.00013557938469225167,
"loss": 1.6758,
"step": 83
},
{
"epoch": 0.013882002974714923,
"grad_norm": 0.4459191858768463,
"learning_rate": 0.00013402931744416433,
"loss": 1.6122,
"step": 84
},
{
"epoch": 0.014047264914890101,
"grad_norm": 0.4186781048774719,
"learning_rate": 0.00013246994692046836,
"loss": 1.4066,
"step": 85
},
{
"epoch": 0.014212526855065279,
"grad_norm": 0.6138430833816528,
"learning_rate": 0.00013090169943749476,
"loss": 1.6558,
"step": 86
},
{
"epoch": 0.014377788795240456,
"grad_norm": 0.5305865406990051,
"learning_rate": 0.0001293250037384465,
"loss": 1.5737,
"step": 87
},
{
"epoch": 0.014543050735415633,
"grad_norm": 0.44837602972984314,
"learning_rate": 0.00012774029087618446,
"loss": 1.4177,
"step": 88
},
{
"epoch": 0.014708312675590812,
"grad_norm": 0.3854394853115082,
"learning_rate": 0.00012614799409538198,
"loss": 1.3886,
"step": 89
},
{
"epoch": 0.014873574615765989,
"grad_norm": 0.4300711154937744,
"learning_rate": 0.00012454854871407994,
"loss": 1.4607,
"step": 90
},
{
"epoch": 0.015038836555941166,
"grad_norm": 0.5587463974952698,
"learning_rate": 0.00012294239200467516,
"loss": 1.5351,
"step": 91
},
{
"epoch": 0.015204098496116345,
"grad_norm": 0.5253020524978638,
"learning_rate": 0.0001213299630743747,
"loss": 1.3066,
"step": 92
},
{
"epoch": 0.015369360436291522,
"grad_norm": 0.46560612320899963,
"learning_rate": 0.00011971170274514802,
"loss": 1.5455,
"step": 93
},
{
"epoch": 0.0155346223764667,
"grad_norm": 0.5804600715637207,
"learning_rate": 0.000118088053433211,
"loss": 1.8134,
"step": 94
},
{
"epoch": 0.015699884316641877,
"grad_norm": 0.4481876790523529,
"learning_rate": 0.00011645945902807341,
"loss": 1.589,
"step": 95
},
{
"epoch": 0.015865146256817054,
"grad_norm": 0.43116191029548645,
"learning_rate": 0.0001148263647711842,
"loss": 1.601,
"step": 96
},
{
"epoch": 0.01603040819699223,
"grad_norm": 0.6765596270561218,
"learning_rate": 0.00011318921713420691,
"loss": 1.6353,
"step": 97
},
{
"epoch": 0.01619567013716741,
"grad_norm": 0.37676534056663513,
"learning_rate": 0.00011154846369695863,
"loss": 1.5367,
"step": 98
},
{
"epoch": 0.01636093207734259,
"grad_norm": 0.8401118516921997,
"learning_rate": 0.0001099045530250463,
"loss": 1.8154,
"step": 99
},
{
"epoch": 0.016526194017517766,
"grad_norm": 0.45617929100990295,
"learning_rate": 0.00010825793454723325,
"loss": 1.2815,
"step": 100
},
{
"epoch": 0.016526194017517766,
"eval_loss": 1.5545666217803955,
"eval_runtime": 41.38,
"eval_samples_per_second": 61.576,
"eval_steps_per_second": 30.788,
"step": 100
},
{
"epoch": 0.016691455957692943,
"grad_norm": 0.5702753663063049,
"learning_rate": 0.00010660905843256994,
"loss": 1.553,
"step": 101
},
{
"epoch": 0.01685671789786812,
"grad_norm": 0.5369312167167664,
"learning_rate": 0.00010495837546732224,
"loss": 1.5788,
"step": 102
},
{
"epoch": 0.017021979838043298,
"grad_norm": 0.4222307801246643,
"learning_rate": 0.00010330633693173082,
"loss": 1.5621,
"step": 103
},
{
"epoch": 0.017187241778218475,
"grad_norm": 0.5532189607620239,
"learning_rate": 0.00010165339447663587,
"loss": 1.4918,
"step": 104
},
{
"epoch": 0.017352503718393655,
"grad_norm": 0.7070478796958923,
"learning_rate": 0.0001,
"loss": 1.623,
"step": 105
},
{
"epoch": 0.017517765658568833,
"grad_norm": 0.6058260202407837,
"learning_rate": 9.834660552336415e-05,
"loss": 1.5853,
"step": 106
},
{
"epoch": 0.01768302759874401,
"grad_norm": 0.3898620009422302,
"learning_rate": 9.669366306826919e-05,
"loss": 1.5686,
"step": 107
},
{
"epoch": 0.017848289538919187,
"grad_norm": 0.5478883385658264,
"learning_rate": 9.504162453267777e-05,
"loss": 1.5867,
"step": 108
},
{
"epoch": 0.018013551479094364,
"grad_norm": 0.47779756784439087,
"learning_rate": 9.339094156743007e-05,
"loss": 1.4921,
"step": 109
},
{
"epoch": 0.01817881341926954,
"grad_norm": 0.8399332165718079,
"learning_rate": 9.174206545276677e-05,
"loss": 1.6164,
"step": 110
},
{
"epoch": 0.01834407535944472,
"grad_norm": 0.5388432145118713,
"learning_rate": 9.009544697495374e-05,
"loss": 1.6265,
"step": 111
},
{
"epoch": 0.0185093372996199,
"grad_norm": 0.5579386949539185,
"learning_rate": 8.845153630304139e-05,
"loss": 1.3753,
"step": 112
},
{
"epoch": 0.018674599239795076,
"grad_norm": 0.40802866220474243,
"learning_rate": 8.681078286579311e-05,
"loss": 1.3763,
"step": 113
},
{
"epoch": 0.018839861179970253,
"grad_norm": 0.552657425403595,
"learning_rate": 8.517363522881579e-05,
"loss": 1.3575,
"step": 114
},
{
"epoch": 0.01900512312014543,
"grad_norm": 0.8166072964668274,
"learning_rate": 8.35405409719266e-05,
"loss": 1.6855,
"step": 115
},
{
"epoch": 0.019170385060320608,
"grad_norm": 0.8131512999534607,
"learning_rate": 8.191194656678904e-05,
"loss": 1.8306,
"step": 116
},
{
"epoch": 0.019335647000495785,
"grad_norm": 0.42067432403564453,
"learning_rate": 8.028829725485199e-05,
"loss": 1.5663,
"step": 117
},
{
"epoch": 0.019500908940670962,
"grad_norm": 0.5358683466911316,
"learning_rate": 7.867003692562534e-05,
"loss": 1.7055,
"step": 118
},
{
"epoch": 0.019666170880846143,
"grad_norm": 0.6525923013687134,
"learning_rate": 7.705760799532485e-05,
"loss": 1.3311,
"step": 119
},
{
"epoch": 0.01983143282102132,
"grad_norm": 0.6140702962875366,
"learning_rate": 7.54514512859201e-05,
"loss": 1.7929,
"step": 120
},
{
"epoch": 0.019996694761196497,
"grad_norm": 0.6280947923660278,
"learning_rate": 7.385200590461803e-05,
"loss": 1.7175,
"step": 121
},
{
"epoch": 0.020161956701371674,
"grad_norm": 0.686147153377533,
"learning_rate": 7.225970912381556e-05,
"loss": 1.6716,
"step": 122
},
{
"epoch": 0.02032721864154685,
"grad_norm": 0.5507709383964539,
"learning_rate": 7.067499626155354e-05,
"loss": 1.3555,
"step": 123
},
{
"epoch": 0.02049248058172203,
"grad_norm": 0.5885040163993835,
"learning_rate": 6.909830056250527e-05,
"loss": 1.3161,
"step": 124
},
{
"epoch": 0.020657742521897206,
"grad_norm": 0.4891628623008728,
"learning_rate": 6.753005307953167e-05,
"loss": 1.4675,
"step": 125
},
{
"epoch": 0.020823004462072386,
"grad_norm": 0.8562346696853638,
"learning_rate": 6.59706825558357e-05,
"loss": 1.5319,
"step": 126
},
{
"epoch": 0.020988266402247564,
"grad_norm": 0.676142156124115,
"learning_rate": 6.442061530774834e-05,
"loss": 1.4428,
"step": 127
},
{
"epoch": 0.02115352834242274,
"grad_norm": 0.5416271686553955,
"learning_rate": 6.28802751081779e-05,
"loss": 1.4903,
"step": 128
},
{
"epoch": 0.021318790282597918,
"grad_norm": 0.4796253442764282,
"learning_rate": 6.135008307075481e-05,
"loss": 1.4259,
"step": 129
},
{
"epoch": 0.021484052222773095,
"grad_norm": 0.4909597635269165,
"learning_rate": 5.983045753470308e-05,
"loss": 1.3397,
"step": 130
},
{
"epoch": 0.021649314162948272,
"grad_norm": 0.5054872035980225,
"learning_rate": 5.832181395047098e-05,
"loss": 1.6404,
"step": 131
},
{
"epoch": 0.02181457610312345,
"grad_norm": 0.6180214285850525,
"learning_rate": 5.6824564766150726e-05,
"loss": 1.782,
"step": 132
},
{
"epoch": 0.021979838043298627,
"grad_norm": 0.657794177532196,
"learning_rate": 5.533911931471936e-05,
"loss": 1.5387,
"step": 133
},
{
"epoch": 0.022145099983473807,
"grad_norm": 0.5247397422790527,
"learning_rate": 5.386588370213124e-05,
"loss": 1.4955,
"step": 134
},
{
"epoch": 0.022310361923648984,
"grad_norm": 0.5147126913070679,
"learning_rate": 5.240526069629265e-05,
"loss": 1.4354,
"step": 135
},
{
"epoch": 0.02247562386382416,
"grad_norm": 0.4486481249332428,
"learning_rate": 5.095764961694922e-05,
"loss": 1.3835,
"step": 136
},
{
"epoch": 0.02264088580399934,
"grad_norm": 0.4644688367843628,
"learning_rate": 4.952344622651566e-05,
"loss": 1.5637,
"step": 137
},
{
"epoch": 0.022806147744174516,
"grad_norm": 0.7985265254974365,
"learning_rate": 4.810304262187852e-05,
"loss": 1.4771,
"step": 138
},
{
"epoch": 0.022971409684349693,
"grad_norm": 0.43993720412254333,
"learning_rate": 4.669682712720065e-05,
"loss": 1.4761,
"step": 139
},
{
"epoch": 0.02313667162452487,
"grad_norm": 0.597682535648346,
"learning_rate": 4.530518418775733e-05,
"loss": 1.5009,
"step": 140
},
{
"epoch": 0.02330193356470005,
"grad_norm": 0.46359845995903015,
"learning_rate": 4.392849426483274e-05,
"loss": 1.4938,
"step": 141
},
{
"epoch": 0.023467195504875228,
"grad_norm": 0.5101354718208313,
"learning_rate": 4.256713373170564e-05,
"loss": 1.4861,
"step": 142
},
{
"epoch": 0.023632457445050405,
"grad_norm": 0.5093466639518738,
"learning_rate": 4.12214747707527e-05,
"loss": 1.4807,
"step": 143
},
{
"epoch": 0.023797719385225583,
"grad_norm": 0.5195640325546265,
"learning_rate": 3.9891885271697496e-05,
"loss": 1.61,
"step": 144
},
{
"epoch": 0.02396298132540076,
"grad_norm": 0.500438392162323,
"learning_rate": 3.857872873103322e-05,
"loss": 1.4983,
"step": 145
},
{
"epoch": 0.024128243265575937,
"grad_norm": 0.4799457788467407,
"learning_rate": 3.7282364152646297e-05,
"loss": 1.3141,
"step": 146
},
{
"epoch": 0.024293505205751114,
"grad_norm": 0.5169847011566162,
"learning_rate": 3.600314594966834e-05,
"loss": 1.7889,
"step": 147
},
{
"epoch": 0.024458767145926295,
"grad_norm": 0.49090850353240967,
"learning_rate": 3.4741423847583134e-05,
"loss": 1.6389,
"step": 148
},
{
"epoch": 0.024624029086101472,
"grad_norm": 0.6075248122215271,
"learning_rate": 3.349754278861517e-05,
"loss": 1.5403,
"step": 149
},
{
"epoch": 0.02478929102627665,
"grad_norm": 0.5191071033477783,
"learning_rate": 3.227184283742591e-05,
"loss": 1.3608,
"step": 150
},
{
"epoch": 0.02478929102627665,
"eval_loss": 1.5432822704315186,
"eval_runtime": 41.6111,
"eval_samples_per_second": 61.234,
"eval_steps_per_second": 30.617,
"step": 150
},
{
"epoch": 0.024954552966451826,
"grad_norm": 1.1631395816802979,
"learning_rate": 3.106465908814342e-05,
"loss": 1.8673,
"step": 151
},
{
"epoch": 0.025119814906627003,
"grad_norm": 0.5590987205505371,
"learning_rate": 2.9876321572751144e-05,
"loss": 1.5469,
"step": 152
},
{
"epoch": 0.02528507684680218,
"grad_norm": 0.49430692195892334,
"learning_rate": 2.87071551708603e-05,
"loss": 1.5041,
"step": 153
},
{
"epoch": 0.025450338786977358,
"grad_norm": 0.539833128452301,
"learning_rate": 2.7557479520891104e-05,
"loss": 1.5439,
"step": 154
},
{
"epoch": 0.02561560072715254,
"grad_norm": 0.5095410346984863,
"learning_rate": 2.6427608932686843e-05,
"loss": 1.6017,
"step": 155
},
{
"epoch": 0.025780862667327716,
"grad_norm": 0.4884549379348755,
"learning_rate": 2.5317852301584643e-05,
"loss": 1.3461,
"step": 156
},
{
"epoch": 0.025946124607502893,
"grad_norm": 0.4909934997558594,
"learning_rate": 2.422851302396655e-05,
"loss": 1.4771,
"step": 157
},
{
"epoch": 0.02611138654767807,
"grad_norm": 0.6615016460418701,
"learning_rate": 2.315988891431412e-05,
"loss": 1.6216,
"step": 158
},
{
"epoch": 0.026276648487853247,
"grad_norm": 0.5424089431762695,
"learning_rate": 2.2112272123788768e-05,
"loss": 1.4016,
"step": 159
},
{
"epoch": 0.026441910428028424,
"grad_norm": 0.4783364236354828,
"learning_rate": 2.1085949060360654e-05,
"loss": 1.2278,
"step": 160
},
{
"epoch": 0.0266071723682036,
"grad_norm": 0.4361153841018677,
"learning_rate": 2.008120031050753e-05,
"loss": 1.5001,
"step": 161
},
{
"epoch": 0.026772434308378782,
"grad_norm": 0.4618135392665863,
"learning_rate": 1.9098300562505266e-05,
"loss": 1.5697,
"step": 162
},
{
"epoch": 0.02693769624855396,
"grad_norm": 0.5692691206932068,
"learning_rate": 1.8137518531330767e-05,
"loss": 1.7622,
"step": 163
},
{
"epoch": 0.027102958188729136,
"grad_norm": 0.7295346856117249,
"learning_rate": 1.7199116885197995e-05,
"loss": 1.8002,
"step": 164
},
{
"epoch": 0.027268220128904314,
"grad_norm": 0.6334387063980103,
"learning_rate": 1.6283352173747145e-05,
"loss": 1.3173,
"step": 165
},
{
"epoch": 0.02743348206907949,
"grad_norm": 0.5129164457321167,
"learning_rate": 1.5390474757906446e-05,
"loss": 1.1693,
"step": 166
},
{
"epoch": 0.027598744009254668,
"grad_norm": 0.56026291847229,
"learning_rate": 1.4520728741446089e-05,
"loss": 1.4209,
"step": 167
},
{
"epoch": 0.027764005949429845,
"grad_norm": 0.692166805267334,
"learning_rate": 1.3674351904242611e-05,
"loss": 1.315,
"step": 168
},
{
"epoch": 0.027929267889605022,
"grad_norm": 0.595594048500061,
"learning_rate": 1.2851575637272262e-05,
"loss": 1.4176,
"step": 169
},
{
"epoch": 0.028094529829780203,
"grad_norm": 0.36242327094078064,
"learning_rate": 1.2052624879351104e-05,
"loss": 1.6176,
"step": 170
},
{
"epoch": 0.02825979176995538,
"grad_norm": 0.4128834307193756,
"learning_rate": 1.1277718055638819e-05,
"loss": 1.4342,
"step": 171
},
{
"epoch": 0.028425053710130557,
"grad_norm": 0.533230185508728,
"learning_rate": 1.0527067017923654e-05,
"loss": 1.6252,
"step": 172
},
{
"epoch": 0.028590315650305734,
"grad_norm": 0.5379830598831177,
"learning_rate": 9.80087698670411e-06,
"loss": 1.3614,
"step": 173
},
{
"epoch": 0.02875557759048091,
"grad_norm": 0.5856308341026306,
"learning_rate": 9.09934649508375e-06,
"loss": 1.4459,
"step": 174
},
{
"epoch": 0.02892083953065609,
"grad_norm": 0.5911732316017151,
"learning_rate": 8.422667334494249e-06,
"loss": 1.4906,
"step": 175
},
{
"epoch": 0.029086101470831266,
"grad_norm": 0.8282976150512695,
"learning_rate": 7.771024502261526e-06,
"loss": 1.81,
"step": 176
},
{
"epoch": 0.029251363411006447,
"grad_norm": 0.5876789689064026,
"learning_rate": 7.144596151029303e-06,
"loss": 1.4096,
"step": 177
},
{
"epoch": 0.029416625351181624,
"grad_norm": 0.5349177122116089,
"learning_rate": 6.543553540053926e-06,
"loss": 1.4491,
"step": 178
},
{
"epoch": 0.0295818872913568,
"grad_norm": 0.6630465388298035,
"learning_rate": 5.968060988383883e-06,
"loss": 1.3134,
"step": 179
},
{
"epoch": 0.029747149231531978,
"grad_norm": 0.6198413968086243,
"learning_rate": 5.418275829936537e-06,
"loss": 1.746,
"step": 180
},
{
"epoch": 0.029912411171707155,
"grad_norm": 0.4724350571632385,
"learning_rate": 4.8943483704846475e-06,
"loss": 1.4524,
"step": 181
},
{
"epoch": 0.030077673111882332,
"grad_norm": 0.5511718988418579,
"learning_rate": 4.3964218465642355e-06,
"loss": 1.395,
"step": 182
},
{
"epoch": 0.03024293505205751,
"grad_norm": 0.6854819059371948,
"learning_rate": 3.924632386315186e-06,
"loss": 1.6976,
"step": 183
},
{
"epoch": 0.03040819699223269,
"grad_norm": 0.5091037750244141,
"learning_rate": 3.4791089722651436e-06,
"loss": 1.2758,
"step": 184
},
{
"epoch": 0.030573458932407867,
"grad_norm": 0.5180460810661316,
"learning_rate": 3.059973406066963e-06,
"loss": 1.4986,
"step": 185
},
{
"epoch": 0.030738720872583045,
"grad_norm": 0.8126318454742432,
"learning_rate": 2.667340275199426e-06,
"loss": 1.7087,
"step": 186
},
{
"epoch": 0.030903982812758222,
"grad_norm": 0.567361056804657,
"learning_rate": 2.3013169216400733e-06,
"loss": 1.5864,
"step": 187
},
{
"epoch": 0.0310692447529334,
"grad_norm": 0.6181171536445618,
"learning_rate": 1.9620034125190644e-06,
"loss": 1.6582,
"step": 188
},
{
"epoch": 0.031234506693108576,
"grad_norm": 0.4644870460033417,
"learning_rate": 1.6494925127617634e-06,
"loss": 1.3485,
"step": 189
},
{
"epoch": 0.03139976863328375,
"grad_norm": 0.6458631753921509,
"learning_rate": 1.3638696597277679e-06,
"loss": 1.6619,
"step": 190
},
{
"epoch": 0.031565030573458934,
"grad_norm": 0.6136733293533325,
"learning_rate": 1.1052129398531507e-06,
"loss": 1.453,
"step": 191
},
{
"epoch": 0.03173029251363411,
"grad_norm": 0.5048571825027466,
"learning_rate": 8.735930673024806e-07,
"loss": 1.7202,
"step": 192
},
{
"epoch": 0.03189555445380929,
"grad_norm": 0.4640374481678009,
"learning_rate": 6.690733646361857e-07,
"loss": 1.4326,
"step": 193
},
{
"epoch": 0.03206081639398446,
"grad_norm": 0.8511689901351929,
"learning_rate": 4.917097454988584e-07,
"loss": 1.701,
"step": 194
},
{
"epoch": 0.03222607833415964,
"grad_norm": 0.6593170166015625,
"learning_rate": 3.415506993330153e-07,
"loss": 1.2468,
"step": 195
},
{
"epoch": 0.03239134027433482,
"grad_norm": 0.5053160190582275,
"learning_rate": 2.1863727812254653e-07,
"loss": 1.7233,
"step": 196
},
{
"epoch": 0.03255660221451,
"grad_norm": 0.5864344239234924,
"learning_rate": 1.230030851695263e-07,
"loss": 1.5125,
"step": 197
},
{
"epoch": 0.03272186415468518,
"grad_norm": 0.5440968871116638,
"learning_rate": 5.467426590739511e-08,
"loss": 1.5666,
"step": 198
},
{
"epoch": 0.03288712609486035,
"grad_norm": 0.47717025876045227,
"learning_rate": 1.3669500753099585e-08,
"loss": 1.458,
"step": 199
},
{
"epoch": 0.03305238803503553,
"grad_norm": 0.5752529501914978,
"learning_rate": 0.0,
"loss": 1.4294,
"step": 200
},
{
"epoch": 0.03305238803503553,
"eval_loss": 1.540518879890442,
"eval_runtime": 42.0573,
"eval_samples_per_second": 60.584,
"eval_steps_per_second": 30.292,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5243735554129920.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}