{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.03305238803503553,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00016526194017517766,
"grad_norm": 0.555425226688385,
"learning_rate": 2e-05,
"loss": 2.2272,
"step": 1
},
{
"epoch": 0.00016526194017517766,
"eval_loss": 1.8130699396133423,
"eval_runtime": 42.0001,
"eval_samples_per_second": 60.667,
"eval_steps_per_second": 30.333,
"step": 1
},
{
"epoch": 0.0003305238803503553,
"grad_norm": 0.38278359174728394,
"learning_rate": 4e-05,
"loss": 1.7691,
"step": 2
},
{
"epoch": 0.0004957858205255329,
"grad_norm": 0.2836111783981323,
"learning_rate": 6e-05,
"loss": 1.6578,
"step": 3
},
{
"epoch": 0.0006610477607007106,
"grad_norm": 0.4891590178012848,
"learning_rate": 8e-05,
"loss": 2.1334,
"step": 4
},
{
"epoch": 0.0008263097008758883,
"grad_norm": 0.3973068296909332,
"learning_rate": 0.0001,
"loss": 1.8242,
"step": 5
},
{
"epoch": 0.0009915716410510659,
"grad_norm": 0.3104502856731415,
"learning_rate": 0.00012,
"loss": 1.6421,
"step": 6
},
{
"epoch": 0.0011568335812262437,
"grad_norm": 0.4168192446231842,
"learning_rate": 0.00014,
"loss": 1.8231,
"step": 7
},
{
"epoch": 0.0013220955214014213,
"grad_norm": 0.45680752396583557,
"learning_rate": 0.00016,
"loss": 1.7829,
"step": 8
},
{
"epoch": 0.001487357461576599,
"grad_norm": 0.40455353260040283,
"learning_rate": 0.00018,
"loss": 2.0192,
"step": 9
},
{
"epoch": 0.0016526194017517765,
"grad_norm": 0.3940141499042511,
"learning_rate": 0.0002,
"loss": 1.6667,
"step": 10
},
{
"epoch": 0.0018178813419269541,
"grad_norm": 0.3571617603302002,
"learning_rate": 0.0001999863304992469,
"loss": 1.6667,
"step": 11
},
{
"epoch": 0.0019831432821021317,
"grad_norm": 0.5306565761566162,
"learning_rate": 0.00019994532573409262,
"loss": 1.767,
"step": 12
},
{
"epoch": 0.0021484052222773093,
"grad_norm": 0.33301275968551636,
"learning_rate": 0.00019987699691483048,
"loss": 1.4221,
"step": 13
},
{
"epoch": 0.0023136671624524874,
"grad_norm": 0.3087587058544159,
"learning_rate": 0.00019978136272187747,
"loss": 1.5223,
"step": 14
},
{
"epoch": 0.002478929102627665,
"grad_norm": 0.5549050569534302,
"learning_rate": 0.000199658449300667,
"loss": 1.6935,
"step": 15
},
{
"epoch": 0.0026441910428028426,
"grad_norm": 0.43276911973953247,
"learning_rate": 0.00019950829025450114,
"loss": 1.7447,
"step": 16
},
{
"epoch": 0.00280945298297802,
"grad_norm": 0.41811099648475647,
"learning_rate": 0.00019933092663536382,
"loss": 1.6279,
"step": 17
},
{
"epoch": 0.002974714923153198,
"grad_norm": 0.7340661287307739,
"learning_rate": 0.00019912640693269752,
"loss": 1.7762,
"step": 18
},
{
"epoch": 0.0031399768633283754,
"grad_norm": 0.5529046654701233,
"learning_rate": 0.00019889478706014687,
"loss": 1.7278,
"step": 19
},
{
"epoch": 0.003305238803503553,
"grad_norm": 0.5795073509216309,
"learning_rate": 0.00019863613034027224,
"loss": 1.6349,
"step": 20
},
{
"epoch": 0.0034705007436787306,
"grad_norm": 0.4024738073348999,
"learning_rate": 0.00019835050748723824,
"loss": 1.478,
"step": 21
},
{
"epoch": 0.0036357626838539082,
"grad_norm": 0.5151705145835876,
"learning_rate": 0.00019803799658748094,
"loss": 1.3476,
"step": 22
},
{
"epoch": 0.0038010246240290863,
"grad_norm": 0.47924888134002686,
"learning_rate": 0.00019769868307835994,
"loss": 1.6887,
"step": 23
},
{
"epoch": 0.0039662865642042635,
"grad_norm": 0.414813756942749,
"learning_rate": 0.0001973326597248006,
"loss": 1.4679,
"step": 24
},
{
"epoch": 0.0041315485043794415,
"grad_norm": 0.5144776701927185,
"learning_rate": 0.00019694002659393305,
"loss": 1.6018,
"step": 25
},
{
"epoch": 0.004296810444554619,
"grad_norm": 0.6902279853820801,
"learning_rate": 0.00019652089102773488,
"loss": 1.6141,
"step": 26
},
{
"epoch": 0.004462072384729797,
"grad_norm": 0.5526428818702698,
"learning_rate": 0.00019607536761368484,
"loss": 1.4361,
"step": 27
},
{
"epoch": 0.004627334324904975,
"grad_norm": 0.9285733699798584,
"learning_rate": 0.00019560357815343577,
"loss": 1.9259,
"step": 28
},
{
"epoch": 0.004792596265080152,
"grad_norm": 0.37774184346199036,
"learning_rate": 0.00019510565162951537,
"loss": 1.4737,
"step": 29
},
{
"epoch": 0.00495785820525533,
"grad_norm": 0.36778566241264343,
"learning_rate": 0.00019458172417006347,
"loss": 1.5279,
"step": 30
},
{
"epoch": 0.005123120145430507,
"grad_norm": 0.5695986151695251,
"learning_rate": 0.00019403193901161613,
"loss": 1.8606,
"step": 31
},
{
"epoch": 0.005288382085605685,
"grad_norm": 0.3802303969860077,
"learning_rate": 0.0001934564464599461,
"loss": 1.6067,
"step": 32
},
{
"epoch": 0.005453644025780862,
"grad_norm": 0.7587772011756897,
"learning_rate": 0.00019285540384897073,
"loss": 1.8386,
"step": 33
},
{
"epoch": 0.00561890596595604,
"grad_norm": 0.6221739053726196,
"learning_rate": 0.00019222897549773848,
"loss": 1.4816,
"step": 34
},
{
"epoch": 0.005784167906131218,
"grad_norm": 0.583857536315918,
"learning_rate": 0.00019157733266550575,
"loss": 1.6782,
"step": 35
},
{
"epoch": 0.005949429846306396,
"grad_norm": 0.481245756149292,
"learning_rate": 0.00019090065350491626,
"loss": 1.4703,
"step": 36
},
{
"epoch": 0.006114691786481574,
"grad_norm": 0.44734448194503784,
"learning_rate": 0.00019019912301329592,
"loss": 1.5667,
"step": 37
},
{
"epoch": 0.006279953726656751,
"grad_norm": 0.4786554276943207,
"learning_rate": 0.00018947293298207635,
"loss": 1.5302,
"step": 38
},
{
"epoch": 0.006445215666831929,
"grad_norm": 0.47823983430862427,
"learning_rate": 0.0001887222819443612,
"loss": 1.4859,
"step": 39
},
{
"epoch": 0.006610477607007106,
"grad_norm": 0.4842425286769867,
"learning_rate": 0.0001879473751206489,
"loss": 1.3932,
"step": 40
},
{
"epoch": 0.006775739547182284,
"grad_norm": 0.3999955356121063,
"learning_rate": 0.00018714842436272773,
"loss": 1.4406,
"step": 41
},
{
"epoch": 0.006941001487357461,
"grad_norm": 0.4841754138469696,
"learning_rate": 0.00018632564809575742,
"loss": 1.4164,
"step": 42
},
{
"epoch": 0.007106263427532639,
"grad_norm": 0.5468775033950806,
"learning_rate": 0.0001854792712585539,
"loss": 1.7234,
"step": 43
},
{
"epoch": 0.0072715253677078165,
"grad_norm": 0.4681849479675293,
"learning_rate": 0.00018460952524209355,
"loss": 1.3656,
"step": 44
},
{
"epoch": 0.0074367873078829945,
"grad_norm": 0.5926626920700073,
"learning_rate": 0.00018371664782625287,
"loss": 1.5722,
"step": 45
},
{
"epoch": 0.007602049248058173,
"grad_norm": 0.5072199702262878,
"learning_rate": 0.00018280088311480201,
"loss": 1.4031,
"step": 46
},
{
"epoch": 0.00776731118823335,
"grad_norm": 0.48731309175491333,
"learning_rate": 0.00018186248146866927,
"loss": 1.4756,
"step": 47
},
{
"epoch": 0.007932573128408527,
"grad_norm": 0.4783889949321747,
"learning_rate": 0.00018090169943749476,
"loss": 1.0337,
"step": 48
},
{
"epoch": 0.008097835068583706,
"grad_norm": 0.458065927028656,
"learning_rate": 0.0001799187996894925,
"loss": 1.5369,
"step": 49
},
{
"epoch": 0.008263097008758883,
"grad_norm": 0.44745975732803345,
"learning_rate": 0.00017891405093963938,
"loss": 1.4635,
"step": 50
},
{
"epoch": 0.008263097008758883,
"eval_loss": 1.5854825973510742,
"eval_runtime": 42.2167,
"eval_samples_per_second": 60.355,
"eval_steps_per_second": 30.178,
"step": 50
},
{
"epoch": 0.00842835894893406,
"grad_norm": 0.47436589002609253,
"learning_rate": 0.00017788772787621126,
"loss": 1.667,
"step": 51
},
{
"epoch": 0.008593620889109237,
"grad_norm": 0.6580041646957397,
"learning_rate": 0.00017684011108568592,
"loss": 1.4318,
"step": 52
},
{
"epoch": 0.008758882829284416,
"grad_norm": 0.5122596621513367,
"learning_rate": 0.0001757714869760335,
"loss": 1.5615,
"step": 53
},
{
"epoch": 0.008924144769459593,
"grad_norm": 0.5801290273666382,
"learning_rate": 0.0001746821476984154,
"loss": 1.3862,
"step": 54
},
{
"epoch": 0.00908940670963477,
"grad_norm": 0.6132497191429138,
"learning_rate": 0.00017357239106731317,
"loss": 1.3698,
"step": 55
},
{
"epoch": 0.00925466864980995,
"grad_norm": 0.6059843301773071,
"learning_rate": 0.00017244252047910892,
"loss": 1.6441,
"step": 56
},
{
"epoch": 0.009419930589985127,
"grad_norm": 0.6696584224700928,
"learning_rate": 0.00017129284482913972,
"loss": 1.5325,
"step": 57
},
{
"epoch": 0.009585192530160304,
"grad_norm": 0.6695738434791565,
"learning_rate": 0.00017012367842724887,
"loss": 1.3917,
"step": 58
},
{
"epoch": 0.009750454470335481,
"grad_norm": 0.5053813457489014,
"learning_rate": 0.0001689353409118566,
"loss": 1.597,
"step": 59
},
{
"epoch": 0.00991571641051066,
"grad_norm": 0.568986713886261,
"learning_rate": 0.00016772815716257412,
"loss": 1.926,
"step": 60
},
{
"epoch": 0.010080978350685837,
"grad_norm": 0.5120527148246765,
"learning_rate": 0.0001665024572113848,
"loss": 1.5176,
"step": 61
},
{
"epoch": 0.010246240290861014,
"grad_norm": 0.3892988860607147,
"learning_rate": 0.00016525857615241687,
"loss": 1.6132,
"step": 62
},
{
"epoch": 0.010411502231036193,
"grad_norm": 0.5233981609344482,
"learning_rate": 0.00016399685405033167,
"loss": 1.501,
"step": 63
},
{
"epoch": 0.01057676417121137,
"grad_norm": 0.5761738419532776,
"learning_rate": 0.0001627176358473537,
"loss": 1.6147,
"step": 64
},
{
"epoch": 0.010742026111386548,
"grad_norm": 0.475273460149765,
"learning_rate": 0.0001614212712689668,
"loss": 1.6735,
"step": 65
},
{
"epoch": 0.010907288051561725,
"grad_norm": 0.4451874792575836,
"learning_rate": 0.00016010811472830252,
"loss": 1.3603,
"step": 66
},
{
"epoch": 0.011072549991736904,
"grad_norm": 0.3606850504875183,
"learning_rate": 0.00015877852522924732,
"loss": 1.2809,
"step": 67
},
{
"epoch": 0.01123781193191208,
"grad_norm": 0.4811137020587921,
"learning_rate": 0.00015743286626829437,
"loss": 1.5322,
"step": 68
},
{
"epoch": 0.011403073872087258,
"grad_norm": 0.40435728430747986,
"learning_rate": 0.0001560715057351673,
"loss": 1.4096,
"step": 69
},
{
"epoch": 0.011568335812262435,
"grad_norm": 0.4875733554363251,
"learning_rate": 0.00015469481581224272,
"loss": 1.694,
"step": 70
},
{
"epoch": 0.011733597752437614,
"grad_norm": 0.3643505871295929,
"learning_rate": 0.0001533031728727994,
"loss": 1.4144,
"step": 71
},
{
"epoch": 0.011898859692612791,
"grad_norm": 0.5411165952682495,
"learning_rate": 0.00015189695737812152,
"loss": 1.5356,
"step": 72
},
{
"epoch": 0.012064121632787968,
"grad_norm": 0.48910924792289734,
"learning_rate": 0.0001504765537734844,
"loss": 1.4965,
"step": 73
},
{
"epoch": 0.012229383572963147,
"grad_norm": 0.3826967477798462,
"learning_rate": 0.00014904235038305083,
"loss": 1.5359,
"step": 74
},
{
"epoch": 0.012394645513138325,
"grad_norm": 0.6282126903533936,
"learning_rate": 0.00014759473930370736,
"loss": 1.8296,
"step": 75
},
{
"epoch": 0.012559907453313502,
"grad_norm": 0.444775253534317,
"learning_rate": 0.0001461341162978688,
"loss": 1.3698,
"step": 76
},
{
"epoch": 0.012725169393488679,
"grad_norm": 0.5965347290039062,
"learning_rate": 0.00014466088068528068,
"loss": 1.8349,
"step": 77
},
{
"epoch": 0.012890431333663858,
"grad_norm": 0.6038627624511719,
"learning_rate": 0.00014317543523384928,
"loss": 1.4731,
"step": 78
},
{
"epoch": 0.013055693273839035,
"grad_norm": 0.5650402307510376,
"learning_rate": 0.00014167818604952906,
"loss": 1.5489,
"step": 79
},
{
"epoch": 0.013220955214014212,
"grad_norm": 0.6513015627861023,
"learning_rate": 0.00014016954246529696,
"loss": 1.6598,
"step": 80
},
{
"epoch": 0.013386217154189391,
"grad_norm": 0.7023486495018005,
"learning_rate": 0.00013864991692924523,
"loss": 1.5978,
"step": 81
},
{
"epoch": 0.013551479094364568,
"grad_norm": 0.4741460680961609,
"learning_rate": 0.00013711972489182208,
"loss": 1.4441,
"step": 82
},
{
"epoch": 0.013716741034539745,
"grad_norm": 0.5267840623855591,
"learning_rate": 0.00013557938469225167,
"loss": 1.6758,
"step": 83
},
{
"epoch": 0.013882002974714923,
"grad_norm": 0.4459191858768463,
"learning_rate": 0.00013402931744416433,
"loss": 1.6122,
"step": 84
},
{
"epoch": 0.014047264914890101,
"grad_norm": 0.4186781048774719,
"learning_rate": 0.00013246994692046836,
"loss": 1.4066,
"step": 85
},
{
"epoch": 0.014212526855065279,
"grad_norm": 0.6138430833816528,
"learning_rate": 0.00013090169943749476,
"loss": 1.6558,
"step": 86
},
{
"epoch": 0.014377788795240456,
"grad_norm": 0.5305865406990051,
"learning_rate": 0.0001293250037384465,
"loss": 1.5737,
"step": 87
},
{
"epoch": 0.014543050735415633,
"grad_norm": 0.44837602972984314,
"learning_rate": 0.00012774029087618446,
"loss": 1.4177,
"step": 88
},
{
"epoch": 0.014708312675590812,
"grad_norm": 0.3854394853115082,
"learning_rate": 0.00012614799409538198,
"loss": 1.3886,
"step": 89
},
{
"epoch": 0.014873574615765989,
"grad_norm": 0.4300711154937744,
"learning_rate": 0.00012454854871407994,
"loss": 1.4607,
"step": 90
},
{
"epoch": 0.015038836555941166,
"grad_norm": 0.5587463974952698,
"learning_rate": 0.00012294239200467516,
"loss": 1.5351,
"step": 91
},
{
"epoch": 0.015204098496116345,
"grad_norm": 0.5253020524978638,
"learning_rate": 0.0001213299630743747,
"loss": 1.3066,
"step": 92
},
{
"epoch": 0.015369360436291522,
"grad_norm": 0.46560612320899963,
"learning_rate": 0.00011971170274514802,
"loss": 1.5455,
"step": 93
},
{
"epoch": 0.0155346223764667,
"grad_norm": 0.5804600715637207,
"learning_rate": 0.000118088053433211,
"loss": 1.8134,
"step": 94
},
{
"epoch": 0.015699884316641877,
"grad_norm": 0.4481876790523529,
"learning_rate": 0.00011645945902807341,
"loss": 1.589,
"step": 95
},
{
"epoch": 0.015865146256817054,
"grad_norm": 0.43116191029548645,
"learning_rate": 0.0001148263647711842,
"loss": 1.601,
"step": 96
},
{
"epoch": 0.01603040819699223,
"grad_norm": 0.6765596270561218,
"learning_rate": 0.00011318921713420691,
"loss": 1.6353,
"step": 97
},
{
"epoch": 0.01619567013716741,
"grad_norm": 0.37676534056663513,
"learning_rate": 0.00011154846369695863,
"loss": 1.5367,
"step": 98
},
{
"epoch": 0.01636093207734259,
"grad_norm": 0.8401118516921997,
"learning_rate": 0.0001099045530250463,
"loss": 1.8154,
"step": 99
},
{
"epoch": 0.016526194017517766,
"grad_norm": 0.45617929100990295,
"learning_rate": 0.00010825793454723325,
"loss": 1.2815,
"step": 100
},
{
"epoch": 0.016526194017517766,
"eval_loss": 1.5545666217803955,
"eval_runtime": 41.38,
"eval_samples_per_second": 61.576,
"eval_steps_per_second": 30.788,
"step": 100
},
{
"epoch": 0.016691455957692943,
"grad_norm": 0.5702753663063049,
"learning_rate": 0.00010660905843256994,
"loss": 1.553,
"step": 101
},
{
"epoch": 0.01685671789786812,
"grad_norm": 0.5369312167167664,
"learning_rate": 0.00010495837546732224,
"loss": 1.5788,
"step": 102
},
{
"epoch": 0.017021979838043298,
"grad_norm": 0.4222307801246643,
"learning_rate": 0.00010330633693173082,
"loss": 1.5621,
"step": 103
},
{
"epoch": 0.017187241778218475,
"grad_norm": 0.5532189607620239,
"learning_rate": 0.00010165339447663587,
"loss": 1.4918,
"step": 104
},
{
"epoch": 0.017352503718393655,
"grad_norm": 0.7070478796958923,
"learning_rate": 0.0001,
"loss": 1.623,
"step": 105
},
{
"epoch": 0.017517765658568833,
"grad_norm": 0.6058260202407837,
"learning_rate": 9.834660552336415e-05,
"loss": 1.5853,
"step": 106
},
{
"epoch": 0.01768302759874401,
"grad_norm": 0.3898620009422302,
"learning_rate": 9.669366306826919e-05,
"loss": 1.5686,
"step": 107
},
{
"epoch": 0.017848289538919187,
"grad_norm": 0.5478883385658264,
"learning_rate": 9.504162453267777e-05,
"loss": 1.5867,
"step": 108
},
{
"epoch": 0.018013551479094364,
"grad_norm": 0.47779756784439087,
"learning_rate": 9.339094156743007e-05,
"loss": 1.4921,
"step": 109
},
{
"epoch": 0.01817881341926954,
"grad_norm": 0.8399332165718079,
"learning_rate": 9.174206545276677e-05,
"loss": 1.6164,
"step": 110
},
{
"epoch": 0.01834407535944472,
"grad_norm": 0.5388432145118713,
"learning_rate": 9.009544697495374e-05,
"loss": 1.6265,
"step": 111
},
{
"epoch": 0.0185093372996199,
"grad_norm": 0.5579386949539185,
"learning_rate": 8.845153630304139e-05,
"loss": 1.3753,
"step": 112
},
{
"epoch": 0.018674599239795076,
"grad_norm": 0.40802866220474243,
"learning_rate": 8.681078286579311e-05,
"loss": 1.3763,
"step": 113
},
{
"epoch": 0.018839861179970253,
"grad_norm": 0.552657425403595,
"learning_rate": 8.517363522881579e-05,
"loss": 1.3575,
"step": 114
},
{
"epoch": 0.01900512312014543,
"grad_norm": 0.8166072964668274,
"learning_rate": 8.35405409719266e-05,
"loss": 1.6855,
"step": 115
},
{
"epoch": 0.019170385060320608,
"grad_norm": 0.8131512999534607,
"learning_rate": 8.191194656678904e-05,
"loss": 1.8306,
"step": 116
},
{
"epoch": 0.019335647000495785,
"grad_norm": 0.42067432403564453,
"learning_rate": 8.028829725485199e-05,
"loss": 1.5663,
"step": 117
},
{
"epoch": 0.019500908940670962,
"grad_norm": 0.5358683466911316,
"learning_rate": 7.867003692562534e-05,
"loss": 1.7055,
"step": 118
},
{
"epoch": 0.019666170880846143,
"grad_norm": 0.6525923013687134,
"learning_rate": 7.705760799532485e-05,
"loss": 1.3311,
"step": 119
},
{
"epoch": 0.01983143282102132,
"grad_norm": 0.6140702962875366,
"learning_rate": 7.54514512859201e-05,
"loss": 1.7929,
"step": 120
},
{
"epoch": 0.019996694761196497,
"grad_norm": 0.6280947923660278,
"learning_rate": 7.385200590461803e-05,
"loss": 1.7175,
"step": 121
},
{
"epoch": 0.020161956701371674,
"grad_norm": 0.686147153377533,
"learning_rate": 7.225970912381556e-05,
"loss": 1.6716,
"step": 122
},
{
"epoch": 0.02032721864154685,
"grad_norm": 0.5507709383964539,
"learning_rate": 7.067499626155354e-05,
"loss": 1.3555,
"step": 123
},
{
"epoch": 0.02049248058172203,
"grad_norm": 0.5885040163993835,
"learning_rate": 6.909830056250527e-05,
"loss": 1.3161,
"step": 124
},
{
"epoch": 0.020657742521897206,
"grad_norm": 0.4891628623008728,
"learning_rate": 6.753005307953167e-05,
"loss": 1.4675,
"step": 125
},
{
"epoch": 0.020823004462072386,
"grad_norm": 0.8562346696853638,
"learning_rate": 6.59706825558357e-05,
"loss": 1.5319,
"step": 126
},
{
"epoch": 0.020988266402247564,
"grad_norm": 0.676142156124115,
"learning_rate": 6.442061530774834e-05,
"loss": 1.4428,
"step": 127
},
{
"epoch": 0.02115352834242274,
"grad_norm": 0.5416271686553955,
"learning_rate": 6.28802751081779e-05,
"loss": 1.4903,
"step": 128
},
{
"epoch": 0.021318790282597918,
"grad_norm": 0.4796253442764282,
"learning_rate": 6.135008307075481e-05,
"loss": 1.4259,
"step": 129
},
{
"epoch": 0.021484052222773095,
"grad_norm": 0.4909597635269165,
"learning_rate": 5.983045753470308e-05,
"loss": 1.3397,
"step": 130
},
{
"epoch": 0.021649314162948272,
"grad_norm": 0.5054872035980225,
"learning_rate": 5.832181395047098e-05,
"loss": 1.6404,
"step": 131
},
{
"epoch": 0.02181457610312345,
"grad_norm": 0.6180214285850525,
"learning_rate": 5.6824564766150726e-05,
"loss": 1.782,
"step": 132
},
{
"epoch": 0.021979838043298627,
"grad_norm": 0.657794177532196,
"learning_rate": 5.533911931471936e-05,
"loss": 1.5387,
"step": 133
},
{
"epoch": 0.022145099983473807,
"grad_norm": 0.5247397422790527,
"learning_rate": 5.386588370213124e-05,
"loss": 1.4955,
"step": 134
},
{
"epoch": 0.022310361923648984,
"grad_norm": 0.5147126913070679,
"learning_rate": 5.240526069629265e-05,
"loss": 1.4354,
"step": 135
},
{
"epoch": 0.02247562386382416,
"grad_norm": 0.4486481249332428,
"learning_rate": 5.095764961694922e-05,
"loss": 1.3835,
"step": 136
},
{
"epoch": 0.02264088580399934,
"grad_norm": 0.4644688367843628,
"learning_rate": 4.952344622651566e-05,
"loss": 1.5637,
"step": 137
},
{
"epoch": 0.022806147744174516,
"grad_norm": 0.7985265254974365,
"learning_rate": 4.810304262187852e-05,
"loss": 1.4771,
"step": 138
},
{
"epoch": 0.022971409684349693,
"grad_norm": 0.43993720412254333,
"learning_rate": 4.669682712720065e-05,
"loss": 1.4761,
"step": 139
},
{
"epoch": 0.02313667162452487,
"grad_norm": 0.597682535648346,
"learning_rate": 4.530518418775733e-05,
"loss": 1.5009,
"step": 140
},
{
"epoch": 0.02330193356470005,
"grad_norm": 0.46359845995903015,
"learning_rate": 4.392849426483274e-05,
"loss": 1.4938,
"step": 141
},
{
"epoch": 0.023467195504875228,
"grad_norm": 0.5101354718208313,
"learning_rate": 4.256713373170564e-05,
"loss": 1.4861,
"step": 142
},
{
"epoch": 0.023632457445050405,
"grad_norm": 0.5093466639518738,
"learning_rate": 4.12214747707527e-05,
"loss": 1.4807,
"step": 143
},
{
"epoch": 0.023797719385225583,
"grad_norm": 0.5195640325546265,
"learning_rate": 3.9891885271697496e-05,
"loss": 1.61,
"step": 144
},
{
"epoch": 0.02396298132540076,
"grad_norm": 0.500438392162323,
"learning_rate": 3.857872873103322e-05,
"loss": 1.4983,
"step": 145
},
{
"epoch": 0.024128243265575937,
"grad_norm": 0.4799457788467407,
"learning_rate": 3.7282364152646297e-05,
"loss": 1.3141,
"step": 146
},
{
"epoch": 0.024293505205751114,
"grad_norm": 0.5169847011566162,
"learning_rate": 3.600314594966834e-05,
"loss": 1.7889,
"step": 147
},
{
"epoch": 0.024458767145926295,
"grad_norm": 0.49090850353240967,
"learning_rate": 3.4741423847583134e-05,
"loss": 1.6389,
"step": 148
},
{
"epoch": 0.024624029086101472,
"grad_norm": 0.6075248122215271,
"learning_rate": 3.349754278861517e-05,
"loss": 1.5403,
"step": 149
},
{
"epoch": 0.02478929102627665,
"grad_norm": 0.5191071033477783,
"learning_rate": 3.227184283742591e-05,
"loss": 1.3608,
"step": 150
},
{
"epoch": 0.02478929102627665,
"eval_loss": 1.5432822704315186,
"eval_runtime": 41.6111,
"eval_samples_per_second": 61.234,
"eval_steps_per_second": 30.617,
"step": 150
},
{
"epoch": 0.024954552966451826,
"grad_norm": 1.1631395816802979,
"learning_rate": 3.106465908814342e-05,
"loss": 1.8673,
"step": 151
},
{
"epoch": 0.025119814906627003,
"grad_norm": 0.5590987205505371,
"learning_rate": 2.9876321572751144e-05,
"loss": 1.5469,
"step": 152
},
{
"epoch": 0.02528507684680218,
"grad_norm": 0.49430692195892334,
"learning_rate": 2.87071551708603e-05,
"loss": 1.5041,
"step": 153
},
{
"epoch": 0.025450338786977358,
"grad_norm": 0.539833128452301,
"learning_rate": 2.7557479520891104e-05,
"loss": 1.5439,
"step": 154
},
{
"epoch": 0.02561560072715254,
"grad_norm": 0.5095410346984863,
"learning_rate": 2.6427608932686843e-05,
"loss": 1.6017,
"step": 155
},
{
"epoch": 0.025780862667327716,
"grad_norm": 0.4884549379348755,
"learning_rate": 2.5317852301584643e-05,
"loss": 1.3461,
"step": 156
},
{
"epoch": 0.025946124607502893,
"grad_norm": 0.4909934997558594,
"learning_rate": 2.422851302396655e-05,
"loss": 1.4771,
"step": 157
},
{
"epoch": 0.02611138654767807,
"grad_norm": 0.6615016460418701,
"learning_rate": 2.315988891431412e-05,
"loss": 1.6216,
"step": 158
},
{
"epoch": 0.026276648487853247,
"grad_norm": 0.5424089431762695,
"learning_rate": 2.2112272123788768e-05,
"loss": 1.4016,
"step": 159
},
{
"epoch": 0.026441910428028424,
"grad_norm": 0.4783364236354828,
"learning_rate": 2.1085949060360654e-05,
"loss": 1.2278,
"step": 160
},
{
"epoch": 0.0266071723682036,
"grad_norm": 0.4361153841018677,
"learning_rate": 2.008120031050753e-05,
"loss": 1.5001,
"step": 161
},
{
"epoch": 0.026772434308378782,
"grad_norm": 0.4618135392665863,
"learning_rate": 1.9098300562505266e-05,
"loss": 1.5697,
"step": 162
},
{
"epoch": 0.02693769624855396,
"grad_norm": 0.5692691206932068,
"learning_rate": 1.8137518531330767e-05,
"loss": 1.7622,
"step": 163
},
{
"epoch": 0.027102958188729136,
"grad_norm": 0.7295346856117249,
"learning_rate": 1.7199116885197995e-05,
"loss": 1.8002,
"step": 164
},
{
"epoch": 0.027268220128904314,
"grad_norm": 0.6334387063980103,
"learning_rate": 1.6283352173747145e-05,
"loss": 1.3173,
"step": 165
},
{
"epoch": 0.02743348206907949,
"grad_norm": 0.5129164457321167,
"learning_rate": 1.5390474757906446e-05,
"loss": 1.1693,
"step": 166
},
{
"epoch": 0.027598744009254668,
"grad_norm": 0.56026291847229,
"learning_rate": 1.4520728741446089e-05,
"loss": 1.4209,
"step": 167
},
{
"epoch": 0.027764005949429845,
"grad_norm": 0.692166805267334,
"learning_rate": 1.3674351904242611e-05,
"loss": 1.315,
"step": 168
},
{
"epoch": 0.027929267889605022,
"grad_norm": 0.595594048500061,
"learning_rate": 1.2851575637272262e-05,
"loss": 1.4176,
"step": 169
},
{
"epoch": 0.028094529829780203,
"grad_norm": 0.36242327094078064,
"learning_rate": 1.2052624879351104e-05,
"loss": 1.6176,
"step": 170
},
{
"epoch": 0.02825979176995538,
"grad_norm": 0.4128834307193756,
"learning_rate": 1.1277718055638819e-05,
"loss": 1.4342,
"step": 171
},
{
"epoch": 0.028425053710130557,
"grad_norm": 0.533230185508728,
"learning_rate": 1.0527067017923654e-05,
"loss": 1.6252,
"step": 172
},
{
"epoch": 0.028590315650305734,
"grad_norm": 0.5379830598831177,
"learning_rate": 9.80087698670411e-06,
"loss": 1.3614,
"step": 173
},
{
"epoch": 0.02875557759048091,
"grad_norm": 0.5856308341026306,
"learning_rate": 9.09934649508375e-06,
"loss": 1.4459,
"step": 174
},
{
"epoch": 0.02892083953065609,
"grad_norm": 0.5911732316017151,
"learning_rate": 8.422667334494249e-06,
"loss": 1.4906,
"step": 175
},
{
"epoch": 0.029086101470831266,
"grad_norm": 0.8282976150512695,
"learning_rate": 7.771024502261526e-06,
"loss": 1.81,
"step": 176
},
{
"epoch": 0.029251363411006447,
"grad_norm": 0.5876789689064026,
"learning_rate": 7.144596151029303e-06,
"loss": 1.4096,
"step": 177
},
{
"epoch": 0.029416625351181624,
"grad_norm": 0.5349177122116089,
"learning_rate": 6.543553540053926e-06,
"loss": 1.4491,
"step": 178
},
{
"epoch": 0.0295818872913568,
"grad_norm": 0.6630465388298035,
"learning_rate": 5.968060988383883e-06,
"loss": 1.3134,
"step": 179
},
{
"epoch": 0.029747149231531978,
"grad_norm": 0.6198413968086243,
"learning_rate": 5.418275829936537e-06,
"loss": 1.746,
"step": 180
},
{
"epoch": 0.029912411171707155,
"grad_norm": 0.4724350571632385,
"learning_rate": 4.8943483704846475e-06,
"loss": 1.4524,
"step": 181
},
{
"epoch": 0.030077673111882332,
"grad_norm": 0.5511718988418579,
"learning_rate": 4.3964218465642355e-06,
"loss": 1.395,
"step": 182
},
{
"epoch": 0.03024293505205751,
"grad_norm": 0.6854819059371948,
"learning_rate": 3.924632386315186e-06,
"loss": 1.6976,
"step": 183
},
{
"epoch": 0.03040819699223269,
"grad_norm": 0.5091037750244141,
"learning_rate": 3.4791089722651436e-06,
"loss": 1.2758,
"step": 184
},
{
"epoch": 0.030573458932407867,
"grad_norm": 0.5180460810661316,
"learning_rate": 3.059973406066963e-06,
"loss": 1.4986,
"step": 185
},
{
"epoch": 0.030738720872583045,
"grad_norm": 0.8126318454742432,
"learning_rate": 2.667340275199426e-06,
"loss": 1.7087,
"step": 186
},
{
"epoch": 0.030903982812758222,
"grad_norm": 0.567361056804657,
"learning_rate": 2.3013169216400733e-06,
"loss": 1.5864,
"step": 187
},
{
"epoch": 0.0310692447529334,
"grad_norm": 0.6181171536445618,
"learning_rate": 1.9620034125190644e-06,
"loss": 1.6582,
"step": 188
},
{
"epoch": 0.031234506693108576,
"grad_norm": 0.4644870460033417,
"learning_rate": 1.6494925127617634e-06,
"loss": 1.3485,
"step": 189
},
{
"epoch": 0.03139976863328375,
"grad_norm": 0.6458631753921509,
"learning_rate": 1.3638696597277679e-06,
"loss": 1.6619,
"step": 190
},
{
"epoch": 0.031565030573458934,
"grad_norm": 0.6136733293533325,
"learning_rate": 1.1052129398531507e-06,
"loss": 1.453,
"step": 191
},
{
"epoch": 0.03173029251363411,
"grad_norm": 0.5048571825027466,
"learning_rate": 8.735930673024806e-07,
"loss": 1.7202,
"step": 192
},
{
"epoch": 0.03189555445380929,
"grad_norm": 0.4640374481678009,
"learning_rate": 6.690733646361857e-07,
"loss": 1.4326,
"step": 193
},
{
"epoch": 0.03206081639398446,
"grad_norm": 0.8511689901351929,
"learning_rate": 4.917097454988584e-07,
"loss": 1.701,
"step": 194
},
{
"epoch": 0.03222607833415964,
"grad_norm": 0.6593170166015625,
"learning_rate": 3.415506993330153e-07,
"loss": 1.2468,
"step": 195
},
{
"epoch": 0.03239134027433482,
"grad_norm": 0.5053160190582275,
"learning_rate": 2.1863727812254653e-07,
"loss": 1.7233,
"step": 196
},
{
"epoch": 0.03255660221451,
"grad_norm": 0.5864344239234924,
"learning_rate": 1.230030851695263e-07,
"loss": 1.5125,
"step": 197
},
{
"epoch": 0.03272186415468518,
"grad_norm": 0.5440968871116638,
"learning_rate": 5.467426590739511e-08,
"loss": 1.5666,
"step": 198
},
{
"epoch": 0.03288712609486035,
"grad_norm": 0.47717025876045227,
"learning_rate": 1.3669500753099585e-08,
"loss": 1.458,
"step": 199
},
{
"epoch": 0.03305238803503553,
"grad_norm": 0.5752529501914978,
"learning_rate": 0.0,
"loss": 1.4294,
"step": 200
},
{
"epoch": 0.03305238803503553,
"eval_loss": 1.540518879890442,
"eval_runtime": 42.0573,
"eval_samples_per_second": 60.584,
"eval_steps_per_second": 30.292,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5243735554129920.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}