{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.041528805092988834,
"eval_steps": 500,
"global_step": 16000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00012977751591559012,
"grad_norm": 29.529769897460938,
"learning_rate": 2.1196980511755674e-09,
"loss": 2.2098,
"step": 50
},
{
"epoch": 0.00025955503183118023,
"grad_norm": 30.7341251373291,
"learning_rate": 4.282655246252677e-09,
"loss": 2.2367,
"step": 100
},
{
"epoch": 0.00038933254774677035,
"grad_norm": 29.822181701660156,
"learning_rate": 6.4456124413297865e-09,
"loss": 2.198,
"step": 150
},
{
"epoch": 0.0005191100636623605,
"grad_norm": 28.564294815063477,
"learning_rate": 8.608569636406895e-09,
"loss": 2.2167,
"step": 200
},
{
"epoch": 0.0006488875795779505,
"grad_norm": 28.514026641845703,
"learning_rate": 1.0771526831484006e-08,
"loss": 2.2001,
"step": 250
},
{
"epoch": 0.0007786650954935407,
"grad_norm": 29.033863067626953,
"learning_rate": 1.2934484026561114e-08,
"loss": 2.1908,
"step": 300
},
{
"epoch": 0.0009084426114091308,
"grad_norm": 4.49394416809082,
"learning_rate": 1.5097441221638225e-08,
"loss": 2.1841,
"step": 350
},
{
"epoch": 0.001038220127324721,
"grad_norm": 2.1610796451568604,
"learning_rate": 1.7260398416715337e-08,
"loss": 2.1702,
"step": 400
},
{
"epoch": 0.0011679976432403109,
"grad_norm": 1.9879209995269775,
"learning_rate": 1.9423355611792444e-08,
"loss": 2.1613,
"step": 450
},
{
"epoch": 0.001297775159155901,
"grad_norm": 1.880299687385559,
"learning_rate": 2.1586312806869556e-08,
"loss": 2.1529,
"step": 500
},
{
"epoch": 0.0014275526750714912,
"grad_norm": 1.7597101926803589,
"learning_rate": 2.3749270001946664e-08,
"loss": 2.1641,
"step": 550
},
{
"epoch": 0.0015573301909870814,
"grad_norm": 1.5660640001296997,
"learning_rate": 2.591222719702377e-08,
"loss": 2.1501,
"step": 600
},
{
"epoch": 0.0016871077069026714,
"grad_norm": 1.5481892824172974,
"learning_rate": 2.8075184392100883e-08,
"loss": 2.1583,
"step": 650
},
{
"epoch": 0.0018168852228182615,
"grad_norm": 1.525515079498291,
"learning_rate": 3.0238141587177994e-08,
"loss": 2.1547,
"step": 700
},
{
"epoch": 0.0019466627387338517,
"grad_norm": 1.4705111980438232,
"learning_rate": 3.24010987822551e-08,
"loss": 2.1365,
"step": 750
},
{
"epoch": 0.002076440254649442,
"grad_norm": 1.3471802473068237,
"learning_rate": 3.4564055977332216e-08,
"loss": 2.1777,
"step": 800
},
{
"epoch": 0.002206217770565032,
"grad_norm": 1.3963252305984497,
"learning_rate": 3.6727013172409324e-08,
"loss": 2.1356,
"step": 850
},
{
"epoch": 0.0023359952864806218,
"grad_norm": 1.2665836811065674,
"learning_rate": 3.888997036748643e-08,
"loss": 2.1565,
"step": 900
},
{
"epoch": 0.002465772802396212,
"grad_norm": 1.4085878133773804,
"learning_rate": 4.105292756256354e-08,
"loss": 2.1497,
"step": 950
},
{
"epoch": 0.002595550318311802,
"grad_norm": 1.4020620584487915,
"learning_rate": 4.3215884757640654e-08,
"loss": 2.1549,
"step": 1000
},
{
"epoch": 0.002725327834227392,
"grad_norm": 1.5391197204589844,
"learning_rate": 4.5378841952717755e-08,
"loss": 2.1519,
"step": 1050
},
{
"epoch": 0.0028551053501429825,
"grad_norm": 1.2532999515533447,
"learning_rate": 4.754179914779487e-08,
"loss": 2.1411,
"step": 1100
},
{
"epoch": 0.0029848828660585724,
"grad_norm": 1.2143096923828125,
"learning_rate": 4.970475634287198e-08,
"loss": 2.1435,
"step": 1150
},
{
"epoch": 0.003114660381974163,
"grad_norm": 1.4227972030639648,
"learning_rate": 5.1867713537949086e-08,
"loss": 2.142,
"step": 1200
},
{
"epoch": 0.0032444378978897528,
"grad_norm": 1.0749537944793701,
"learning_rate": 5.40306707330262e-08,
"loss": 2.1134,
"step": 1250
},
{
"epoch": 0.0033742154138053427,
"grad_norm": 1.169149398803711,
"learning_rate": 5.619362792810331e-08,
"loss": 2.1215,
"step": 1300
},
{
"epoch": 0.003503992929720933,
"grad_norm": 1.3452564477920532,
"learning_rate": 5.8356585123180416e-08,
"loss": 2.1524,
"step": 1350
},
{
"epoch": 0.003633770445636523,
"grad_norm": 1.3610825538635254,
"learning_rate": 6.051954231825752e-08,
"loss": 2.1389,
"step": 1400
},
{
"epoch": 0.003763547961552113,
"grad_norm": 1.2172240018844604,
"learning_rate": 6.268249951333464e-08,
"loss": 2.1428,
"step": 1450
},
{
"epoch": 0.0038933254774677034,
"grad_norm": 1.1681610345840454,
"learning_rate": 6.484545670841174e-08,
"loss": 2.1253,
"step": 1500
},
{
"epoch": 0.004023102993383294,
"grad_norm": 1.0709235668182373,
"learning_rate": 6.700841390348886e-08,
"loss": 2.1533,
"step": 1550
},
{
"epoch": 0.004152880509298884,
"grad_norm": 1.2001404762268066,
"learning_rate": 6.917137109856597e-08,
"loss": 2.1371,
"step": 1600
},
{
"epoch": 0.004282658025214474,
"grad_norm": 1.2642266750335693,
"learning_rate": 7.133432829364308e-08,
"loss": 2.129,
"step": 1650
},
{
"epoch": 0.004412435541130064,
"grad_norm": 1.030280590057373,
"learning_rate": 7.349728548872018e-08,
"loss": 2.128,
"step": 1700
},
{
"epoch": 0.004542213057045654,
"grad_norm": 1.4037690162658691,
"learning_rate": 7.566024268379729e-08,
"loss": 2.1383,
"step": 1750
},
{
"epoch": 0.0046719905729612436,
"grad_norm": 1.1600065231323242,
"learning_rate": 7.78231998788744e-08,
"loss": 2.1193,
"step": 1800
},
{
"epoch": 0.004801768088876834,
"grad_norm": 1.2075531482696533,
"learning_rate": 7.998615707395151e-08,
"loss": 2.1126,
"step": 1850
},
{
"epoch": 0.004931545604792424,
"grad_norm": 1.1088111400604248,
"learning_rate": 8.214911426902863e-08,
"loss": 2.1394,
"step": 1900
},
{
"epoch": 0.005061323120708014,
"grad_norm": 1.195092797279358,
"learning_rate": 8.431207146410574e-08,
"loss": 2.1359,
"step": 1950
},
{
"epoch": 0.005191100636623604,
"grad_norm": 1.1595982313156128,
"learning_rate": 8.647502865918283e-08,
"loss": 2.1366,
"step": 2000
},
{
"epoch": 0.005320878152539194,
"grad_norm": 1.1727768182754517,
"learning_rate": 8.863798585425995e-08,
"loss": 2.1106,
"step": 2050
},
{
"epoch": 0.005450655668454784,
"grad_norm": 1.2424023151397705,
"learning_rate": 9.080094304933706e-08,
"loss": 2.1339,
"step": 2100
},
{
"epoch": 0.005580433184370375,
"grad_norm": 1.2954424619674683,
"learning_rate": 9.296390024441417e-08,
"loss": 2.1107,
"step": 2150
},
{
"epoch": 0.005710210700285965,
"grad_norm": 1.0388058423995972,
"learning_rate": 9.512685743949129e-08,
"loss": 2.118,
"step": 2200
},
{
"epoch": 0.005839988216201555,
"grad_norm": 4.136488914489746,
"learning_rate": 9.728981463456838e-08,
"loss": 2.1159,
"step": 2250
},
{
"epoch": 0.005969765732117145,
"grad_norm": 1.0597412586212158,
"learning_rate": 9.945277182964549e-08,
"loss": 2.1303,
"step": 2300
},
{
"epoch": 0.006099543248032735,
"grad_norm": 1.0780799388885498,
"learning_rate": 1.0161572902472261e-07,
"loss": 2.1095,
"step": 2350
},
{
"epoch": 0.006229320763948326,
"grad_norm": 1.2739732265472412,
"learning_rate": 1.0377868621979972e-07,
"loss": 2.1264,
"step": 2400
},
{
"epoch": 0.0063590982798639156,
"grad_norm": 3.3836748600006104,
"learning_rate": 1.0594164341487683e-07,
"loss": 2.1183,
"step": 2450
},
{
"epoch": 0.0064888757957795055,
"grad_norm": 1.269440770149231,
"learning_rate": 1.0810460060995395e-07,
"loss": 2.1243,
"step": 2500
},
{
"epoch": 0.0066186533116950955,
"grad_norm": 1.1834193468093872,
"learning_rate": 1.1026755780503104e-07,
"loss": 2.1207,
"step": 2550
},
{
"epoch": 0.006748430827610685,
"grad_norm": 1.2411448955535889,
"learning_rate": 1.1243051500010815e-07,
"loss": 2.1201,
"step": 2600
},
{
"epoch": 0.006878208343526275,
"grad_norm": 1.1330574750900269,
"learning_rate": 1.1459347219518527e-07,
"loss": 2.1125,
"step": 2650
},
{
"epoch": 0.007007985859441866,
"grad_norm": 1.199055790901184,
"learning_rate": 1.1675642939026238e-07,
"loss": 2.13,
"step": 2700
},
{
"epoch": 0.007137763375357456,
"grad_norm": 1.2272251844406128,
"learning_rate": 1.1891938658533949e-07,
"loss": 2.1239,
"step": 2750
},
{
"epoch": 0.007267540891273046,
"grad_norm": 1.1257809400558472,
"learning_rate": 1.2108234378041658e-07,
"loss": 2.12,
"step": 2800
},
{
"epoch": 0.007397318407188636,
"grad_norm": 1.1028845310211182,
"learning_rate": 1.2324530097549372e-07,
"loss": 2.1227,
"step": 2850
},
{
"epoch": 0.007527095923104226,
"grad_norm": 1.2196507453918457,
"learning_rate": 1.254082581705708e-07,
"loss": 2.1241,
"step": 2900
},
{
"epoch": 0.007656873439019816,
"grad_norm": 1.0342847108840942,
"learning_rate": 1.275712153656479e-07,
"loss": 2.1148,
"step": 2950
},
{
"epoch": 0.007786650954935407,
"grad_norm": 1.1587927341461182,
"learning_rate": 1.2973417256072504e-07,
"loss": 2.1293,
"step": 3000
},
{
"epoch": 0.007916428470850997,
"grad_norm": 1.1751174926757812,
"learning_rate": 1.3189712975580215e-07,
"loss": 2.1084,
"step": 3050
},
{
"epoch": 0.008046205986766588,
"grad_norm": 12.056378364562988,
"learning_rate": 1.3406008695087926e-07,
"loss": 2.103,
"step": 3100
},
{
"epoch": 0.008175983502682177,
"grad_norm": 1.0078307390213013,
"learning_rate": 1.3622304414595637e-07,
"loss": 2.1345,
"step": 3150
},
{
"epoch": 0.008305761018597767,
"grad_norm": 1.0840246677398682,
"learning_rate": 1.3838600134103347e-07,
"loss": 2.1151,
"step": 3200
},
{
"epoch": 0.008435538534513357,
"grad_norm": 1.213310956954956,
"learning_rate": 1.4054895853611058e-07,
"loss": 2.1275,
"step": 3250
},
{
"epoch": 0.008565316050428947,
"grad_norm": 1.2903615236282349,
"learning_rate": 1.427119157311877e-07,
"loss": 2.1105,
"step": 3300
},
{
"epoch": 0.008695093566344536,
"grad_norm": 1.1746351718902588,
"learning_rate": 1.448748729262648e-07,
"loss": 2.1344,
"step": 3350
},
{
"epoch": 0.008824871082260127,
"grad_norm": 1.1928184032440186,
"learning_rate": 1.470378301213419e-07,
"loss": 2.1278,
"step": 3400
},
{
"epoch": 0.008954648598175718,
"grad_norm": 1.1132676601409912,
"learning_rate": 1.4920078731641904e-07,
"loss": 2.1134,
"step": 3450
},
{
"epoch": 0.009084426114091307,
"grad_norm": 1.4539573192596436,
"learning_rate": 1.5136374451149612e-07,
"loss": 2.0943,
"step": 3500
},
{
"epoch": 0.009214203630006898,
"grad_norm": 1.176128625869751,
"learning_rate": 1.5352670170657323e-07,
"loss": 2.1025,
"step": 3550
},
{
"epoch": 0.009343981145922487,
"grad_norm": 1.0798020362854004,
"learning_rate": 1.5568965890165036e-07,
"loss": 2.0946,
"step": 3600
},
{
"epoch": 0.009473758661838078,
"grad_norm": 1.1570450067520142,
"learning_rate": 1.5785261609672747e-07,
"loss": 2.1119,
"step": 3650
},
{
"epoch": 0.009603536177753669,
"grad_norm": 0.9933484792709351,
"learning_rate": 1.6001557329180458e-07,
"loss": 2.1171,
"step": 3700
},
{
"epoch": 0.009733313693669258,
"grad_norm": 1.1642405986785889,
"learning_rate": 1.6217853048688166e-07,
"loss": 2.1047,
"step": 3750
},
{
"epoch": 0.009863091209584849,
"grad_norm": 1.266423225402832,
"learning_rate": 1.643414876819588e-07,
"loss": 2.0984,
"step": 3800
},
{
"epoch": 0.009992868725500438,
"grad_norm": 1.0179153680801392,
"learning_rate": 1.665044448770359e-07,
"loss": 2.0977,
"step": 3850
},
{
"epoch": 0.010122646241416029,
"grad_norm": 1.0938276052474976,
"learning_rate": 1.68667402072113e-07,
"loss": 2.1249,
"step": 3900
},
{
"epoch": 0.01025242375733162,
"grad_norm": 1.1981333494186401,
"learning_rate": 1.7083035926719012e-07,
"loss": 2.1186,
"step": 3950
},
{
"epoch": 0.010382201273247208,
"grad_norm": 1.238035798072815,
"learning_rate": 1.7299331646226725e-07,
"loss": 2.1254,
"step": 4000
},
{
"epoch": 0.0105119787891628,
"grad_norm": 1.0942214727401733,
"learning_rate": 1.7515627365734433e-07,
"loss": 2.0972,
"step": 4050
},
{
"epoch": 0.010641756305078388,
"grad_norm": 1.1418002843856812,
"learning_rate": 1.7731923085242144e-07,
"loss": 2.1286,
"step": 4100
},
{
"epoch": 0.01077153382099398,
"grad_norm": 2.164984703063965,
"learning_rate": 1.7948218804749858e-07,
"loss": 2.1195,
"step": 4150
},
{
"epoch": 0.010901311336909568,
"grad_norm": 1.119779109954834,
"learning_rate": 1.8164514524257566e-07,
"loss": 2.1242,
"step": 4200
},
{
"epoch": 0.011031088852825159,
"grad_norm": 1.0193780660629272,
"learning_rate": 1.8380810243765277e-07,
"loss": 2.0951,
"step": 4250
},
{
"epoch": 0.01116086636874075,
"grad_norm": 1.1878997087478638,
"learning_rate": 1.859710596327299e-07,
"loss": 2.1067,
"step": 4300
},
{
"epoch": 0.011290643884656339,
"grad_norm": 1.048115849494934,
"learning_rate": 1.8813401682780698e-07,
"loss": 2.0861,
"step": 4350
},
{
"epoch": 0.01142042140057193,
"grad_norm": 1.1059479713439941,
"learning_rate": 1.9029697402288412e-07,
"loss": 2.1204,
"step": 4400
},
{
"epoch": 0.011550198916487519,
"grad_norm": 0.942563533782959,
"learning_rate": 1.9245993121796122e-07,
"loss": 2.1256,
"step": 4450
},
{
"epoch": 0.01167997643240311,
"grad_norm": 3.292470932006836,
"learning_rate": 1.946228884130383e-07,
"loss": 2.0964,
"step": 4500
},
{
"epoch": 0.0118097539483187,
"grad_norm": 1.131181001663208,
"learning_rate": 1.9678584560811544e-07,
"loss": 2.1079,
"step": 4550
},
{
"epoch": 0.01193953146423429,
"grad_norm": 1.1684244871139526,
"learning_rate": 1.9894880280319255e-07,
"loss": 2.0918,
"step": 4600
},
{
"epoch": 0.01206930898014988,
"grad_norm": 1.094470500946045,
"learning_rate": 2.0111175999826965e-07,
"loss": 2.1129,
"step": 4650
},
{
"epoch": 0.01219908649606547,
"grad_norm": 1.1580520868301392,
"learning_rate": 2.0327471719334676e-07,
"loss": 2.1175,
"step": 4700
},
{
"epoch": 0.01232886401198106,
"grad_norm": 1.110461711883545,
"learning_rate": 2.054376743884239e-07,
"loss": 2.1188,
"step": 4750
},
{
"epoch": 0.012458641527896651,
"grad_norm": 1.1533987522125244,
"learning_rate": 2.0760063158350098e-07,
"loss": 2.1072,
"step": 4800
},
{
"epoch": 0.01258841904381224,
"grad_norm": 1.3358995914459229,
"learning_rate": 2.0976358877857809e-07,
"loss": 2.1031,
"step": 4850
},
{
"epoch": 0.012718196559727831,
"grad_norm": 1.100576639175415,
"learning_rate": 2.119265459736552e-07,
"loss": 2.0903,
"step": 4900
},
{
"epoch": 0.01284797407564342,
"grad_norm": 1.1505376100540161,
"learning_rate": 2.140895031687323e-07,
"loss": 2.123,
"step": 4950
},
{
"epoch": 0.012977751591559011,
"grad_norm": 1.063899278640747,
"learning_rate": 2.1625246036380944e-07,
"loss": 2.0802,
"step": 5000
},
{
"epoch": 0.0131075291074746,
"grad_norm": 0.9469916224479675,
"learning_rate": 2.1841541755888652e-07,
"loss": 2.0917,
"step": 5050
},
{
"epoch": 0.013237306623390191,
"grad_norm": 1.1475921869277954,
"learning_rate": 2.2057837475396363e-07,
"loss": 2.116,
"step": 5100
},
{
"epoch": 0.013367084139305782,
"grad_norm": 1.5712428092956543,
"learning_rate": 2.2274133194904076e-07,
"loss": 2.1019,
"step": 5150
},
{
"epoch": 0.01349686165522137,
"grad_norm": 1.0301058292388916,
"learning_rate": 2.2490428914411784e-07,
"loss": 2.0985,
"step": 5200
},
{
"epoch": 0.013626639171136962,
"grad_norm": 1.1751989126205444,
"learning_rate": 2.2706724633919495e-07,
"loss": 2.1023,
"step": 5250
},
{
"epoch": 0.01375641668705255,
"grad_norm": 2.097857713699341,
"learning_rate": 2.2923020353427208e-07,
"loss": 2.1226,
"step": 5300
},
{
"epoch": 0.013886194202968142,
"grad_norm": 1.0331202745437622,
"learning_rate": 2.3139316072934916e-07,
"loss": 2.1205,
"step": 5350
},
{
"epoch": 0.014015971718883732,
"grad_norm": 1.0770936012268066,
"learning_rate": 2.335561179244263e-07,
"loss": 2.1037,
"step": 5400
},
{
"epoch": 0.014145749234799321,
"grad_norm": 1.1082065105438232,
"learning_rate": 2.357190751195034e-07,
"loss": 2.1027,
"step": 5450
},
{
"epoch": 0.014275526750714912,
"grad_norm": 1.2101250886917114,
"learning_rate": 2.378820323145805e-07,
"loss": 2.1103,
"step": 5500
},
{
"epoch": 0.014405304266630501,
"grad_norm": 1.163560152053833,
"learning_rate": 2.400449895096576e-07,
"loss": 2.1035,
"step": 5550
},
{
"epoch": 0.014535081782546092,
"grad_norm": 1.1837356090545654,
"learning_rate": 2.4220794670473476e-07,
"loss": 2.0891,
"step": 5600
},
{
"epoch": 0.014664859298461683,
"grad_norm": 1.2113701105117798,
"learning_rate": 2.4437090389981184e-07,
"loss": 2.1096,
"step": 5650
},
{
"epoch": 0.014794636814377272,
"grad_norm": 1.1284654140472412,
"learning_rate": 2.4653386109488897e-07,
"loss": 2.0948,
"step": 5700
},
{
"epoch": 0.014924414330292863,
"grad_norm": 1.4470899105072021,
"learning_rate": 2.4869681828996605e-07,
"loss": 2.0954,
"step": 5750
},
{
"epoch": 0.015054191846208452,
"grad_norm": 1.0791606903076172,
"learning_rate": 2.5085977548504314e-07,
"loss": 2.1022,
"step": 5800
},
{
"epoch": 0.015183969362124043,
"grad_norm": 1.1813191175460815,
"learning_rate": 2.5302273268012027e-07,
"loss": 2.1102,
"step": 5850
},
{
"epoch": 0.015313746878039632,
"grad_norm": 1.1993714570999146,
"learning_rate": 2.551856898751974e-07,
"loss": 2.0967,
"step": 5900
},
{
"epoch": 0.015443524393955223,
"grad_norm": 1.1809765100479126,
"learning_rate": 2.573486470702745e-07,
"loss": 2.1054,
"step": 5950
},
{
"epoch": 0.015573301909870814,
"grad_norm": 1.0799180269241333,
"learning_rate": 2.595116042653516e-07,
"loss": 2.1144,
"step": 6000
},
{
"epoch": 0.015703079425786404,
"grad_norm": 1.0349640846252441,
"learning_rate": 2.6167456146042875e-07,
"loss": 2.0978,
"step": 6050
},
{
"epoch": 0.015832856941701993,
"grad_norm": 0.9997969269752502,
"learning_rate": 2.6383751865550584e-07,
"loss": 2.1274,
"step": 6100
},
{
"epoch": 0.015962634457617583,
"grad_norm": 1.3014293909072876,
"learning_rate": 2.660004758505829e-07,
"loss": 2.0857,
"step": 6150
},
{
"epoch": 0.016092411973533175,
"grad_norm": 1.1863785982131958,
"learning_rate": 2.6816343304566005e-07,
"loss": 2.1051,
"step": 6200
},
{
"epoch": 0.016222189489448764,
"grad_norm": 1.6769137382507324,
"learning_rate": 2.7032639024073713e-07,
"loss": 2.0934,
"step": 6250
},
{
"epoch": 0.016351967005364353,
"grad_norm": 1.0514180660247803,
"learning_rate": 2.7248934743581427e-07,
"loss": 2.0948,
"step": 6300
},
{
"epoch": 0.016481744521279942,
"grad_norm": 1.0475189685821533,
"learning_rate": 2.746523046308914e-07,
"loss": 2.088,
"step": 6350
},
{
"epoch": 0.016611522037195535,
"grad_norm": 2.3959105014801025,
"learning_rate": 2.768152618259685e-07,
"loss": 2.1053,
"step": 6400
},
{
"epoch": 0.016741299553111124,
"grad_norm": 1.291269302368164,
"learning_rate": 2.789782190210456e-07,
"loss": 2.0576,
"step": 6450
},
{
"epoch": 0.016871077069026713,
"grad_norm": 3.1083991527557373,
"learning_rate": 2.811411762161227e-07,
"loss": 2.103,
"step": 6500
},
{
"epoch": 0.017000854584942306,
"grad_norm": 1.2403531074523926,
"learning_rate": 2.833041334111998e-07,
"loss": 2.1099,
"step": 6550
},
{
"epoch": 0.017130632100857895,
"grad_norm": 1.0552589893341064,
"learning_rate": 2.854670906062769e-07,
"loss": 2.1089,
"step": 6600
},
{
"epoch": 0.017260409616773484,
"grad_norm": 1.15003502368927,
"learning_rate": 2.8763004780135405e-07,
"loss": 2.1039,
"step": 6650
},
{
"epoch": 0.017390187132689073,
"grad_norm": 1.0832091569900513,
"learning_rate": 2.8979300499643113e-07,
"loss": 2.1084,
"step": 6700
},
{
"epoch": 0.017519964648604665,
"grad_norm": 1.3141324520111084,
"learning_rate": 2.9195596219150826e-07,
"loss": 2.1135,
"step": 6750
},
{
"epoch": 0.017649742164520255,
"grad_norm": 1.1030374765396118,
"learning_rate": 2.941189193865854e-07,
"loss": 2.1015,
"step": 6800
},
{
"epoch": 0.017779519680435844,
"grad_norm": 1.1264283657073975,
"learning_rate": 2.962818765816625e-07,
"loss": 2.1046,
"step": 6850
},
{
"epoch": 0.017909297196351436,
"grad_norm": 1.0352332592010498,
"learning_rate": 2.9844483377673956e-07,
"loss": 2.1123,
"step": 6900
},
{
"epoch": 0.018039074712267025,
"grad_norm": 1.6451269388198853,
"learning_rate": 3.006077909718167e-07,
"loss": 2.0875,
"step": 6950
},
{
"epoch": 0.018168852228182614,
"grad_norm": 1.0955110788345337,
"learning_rate": 3.027707481668938e-07,
"loss": 2.0838,
"step": 7000
},
{
"epoch": 0.018298629744098207,
"grad_norm": 1.128531813621521,
"learning_rate": 3.049337053619709e-07,
"loss": 2.1029,
"step": 7050
},
{
"epoch": 0.018428407260013796,
"grad_norm": 1.0489044189453125,
"learning_rate": 3.0709666255704805e-07,
"loss": 2.0908,
"step": 7100
},
{
"epoch": 0.018558184775929385,
"grad_norm": 1.0876027345657349,
"learning_rate": 3.0925961975212513e-07,
"loss": 2.0817,
"step": 7150
},
{
"epoch": 0.018687962291844974,
"grad_norm": 1.025060772895813,
"learning_rate": 3.1142257694720226e-07,
"loss": 2.0924,
"step": 7200
},
{
"epoch": 0.018817739807760567,
"grad_norm": 1.3133209943771362,
"learning_rate": 3.1358553414227934e-07,
"loss": 2.0848,
"step": 7250
},
{
"epoch": 0.018947517323676156,
"grad_norm": 1.159995436668396,
"learning_rate": 3.157484913373565e-07,
"loss": 2.0938,
"step": 7300
},
{
"epoch": 0.019077294839591745,
"grad_norm": 1.151329755783081,
"learning_rate": 3.1791144853243356e-07,
"loss": 2.1002,
"step": 7350
},
{
"epoch": 0.019207072355507337,
"grad_norm": 1.123695731163025,
"learning_rate": 3.200744057275107e-07,
"loss": 2.0951,
"step": 7400
},
{
"epoch": 0.019336849871422927,
"grad_norm": 1.143547534942627,
"learning_rate": 3.222373629225878e-07,
"loss": 2.1034,
"step": 7450
},
{
"epoch": 0.019466627387338516,
"grad_norm": 1.093329906463623,
"learning_rate": 3.2440032011766486e-07,
"loss": 2.0921,
"step": 7500
},
{
"epoch": 0.019596404903254105,
"grad_norm": 1.3572251796722412,
"learning_rate": 3.2656327731274204e-07,
"loss": 2.0902,
"step": 7550
},
{
"epoch": 0.019726182419169697,
"grad_norm": 1.146531343460083,
"learning_rate": 3.287262345078191e-07,
"loss": 2.0899,
"step": 7600
},
{
"epoch": 0.019855959935085286,
"grad_norm": 1.0585743188858032,
"learning_rate": 3.308891917028962e-07,
"loss": 2.108,
"step": 7650
},
{
"epoch": 0.019985737451000875,
"grad_norm": 1.1923290491104126,
"learning_rate": 3.3305214889797334e-07,
"loss": 2.098,
"step": 7700
},
{
"epoch": 0.020115514966916468,
"grad_norm": 1.1357568502426147,
"learning_rate": 3.352151060930504e-07,
"loss": 2.1021,
"step": 7750
},
{
"epoch": 0.020245292482832057,
"grad_norm": 1.2182716131210327,
"learning_rate": 3.373780632881275e-07,
"loss": 2.0816,
"step": 7800
},
{
"epoch": 0.020375069998747646,
"grad_norm": 1.1091363430023193,
"learning_rate": 3.395410204832047e-07,
"loss": 2.0832,
"step": 7850
},
{
"epoch": 0.02050484751466324,
"grad_norm": 1.1325336694717407,
"learning_rate": 3.4170397767828177e-07,
"loss": 2.0872,
"step": 7900
},
{
"epoch": 0.020634625030578828,
"grad_norm": 1.020922064781189,
"learning_rate": 3.4386693487335885e-07,
"loss": 2.0798,
"step": 7950
},
{
"epoch": 0.020764402546494417,
"grad_norm": 1.1414934396743774,
"learning_rate": 3.46029892068436e-07,
"loss": 2.0745,
"step": 8000
},
{
"epoch": 0.020894180062410006,
"grad_norm": 1.1155861616134644,
"learning_rate": 3.481928492635131e-07,
"loss": 2.0975,
"step": 8050
},
{
"epoch": 0.0210239575783256,
"grad_norm": 0.9747071266174316,
"learning_rate": 3.503558064585902e-07,
"loss": 2.0796,
"step": 8100
},
{
"epoch": 0.021153735094241188,
"grad_norm": 1.2895739078521729,
"learning_rate": 3.5251876365366734e-07,
"loss": 2.0951,
"step": 8150
},
{
"epoch": 0.021283512610156777,
"grad_norm": 1.147414207458496,
"learning_rate": 3.546817208487444e-07,
"loss": 2.0955,
"step": 8200
},
{
"epoch": 0.02141329012607237,
"grad_norm": 1.1811184883117676,
"learning_rate": 3.568446780438215e-07,
"loss": 2.0895,
"step": 8250
},
{
"epoch": 0.02154306764198796,
"grad_norm": 1.2630963325500488,
"learning_rate": 3.590076352388987e-07,
"loss": 2.0668,
"step": 8300
},
{
"epoch": 0.021672845157903547,
"grad_norm": 1.0883618593215942,
"learning_rate": 3.6117059243397577e-07,
"loss": 2.0981,
"step": 8350
},
{
"epoch": 0.021802622673819137,
"grad_norm": 1.2500261068344116,
"learning_rate": 3.6333354962905285e-07,
"loss": 2.0911,
"step": 8400
},
{
"epoch": 0.02193240018973473,
"grad_norm": 1.133091926574707,
"learning_rate": 3.6549650682413e-07,
"loss": 2.082,
"step": 8450
},
{
"epoch": 0.022062177705650318,
"grad_norm": 1.1500440835952759,
"learning_rate": 3.6765946401920707e-07,
"loss": 2.091,
"step": 8500
},
{
"epoch": 0.022191955221565907,
"grad_norm": 1.1573790311813354,
"learning_rate": 3.6982242121428415e-07,
"loss": 2.091,
"step": 8550
},
{
"epoch": 0.0223217327374815,
"grad_norm": 1.0022162199020386,
"learning_rate": 3.7198537840936134e-07,
"loss": 2.0734,
"step": 8600
},
{
"epoch": 0.02245151025339709,
"grad_norm": 1.2101961374282837,
"learning_rate": 3.741483356044384e-07,
"loss": 2.0976,
"step": 8650
},
{
"epoch": 0.022581287769312678,
"grad_norm": 1.1183929443359375,
"learning_rate": 3.763112927995155e-07,
"loss": 2.0742,
"step": 8700
},
{
"epoch": 0.02271106528522827,
"grad_norm": 1.1698428392410278,
"learning_rate": 3.784742499945927e-07,
"loss": 2.1189,
"step": 8750
},
{
"epoch": 0.02284084280114386,
"grad_norm": 1.238348126411438,
"learning_rate": 3.8063720718966977e-07,
"loss": 2.0864,
"step": 8800
},
{
"epoch": 0.02297062031705945,
"grad_norm": 1.0891568660736084,
"learning_rate": 3.8280016438474685e-07,
"loss": 2.071,
"step": 8850
},
{
"epoch": 0.023100397832975038,
"grad_norm": 1.0950003862380981,
"learning_rate": 3.84963121579824e-07,
"loss": 2.0944,
"step": 8900
},
{
"epoch": 0.02323017534889063,
"grad_norm": 1.0031663179397583,
"learning_rate": 3.8712607877490106e-07,
"loss": 2.0688,
"step": 8950
},
{
"epoch": 0.02335995286480622,
"grad_norm": 1.1025946140289307,
"learning_rate": 3.8928903596997815e-07,
"loss": 2.0853,
"step": 9000
},
{
"epoch": 0.02348973038072181,
"grad_norm": 1.0795261859893799,
"learning_rate": 3.9145199316505533e-07,
"loss": 2.0813,
"step": 9050
},
{
"epoch": 0.0236195078966374,
"grad_norm": 1.1669412851333618,
"learning_rate": 3.936149503601324e-07,
"loss": 2.0802,
"step": 9100
},
{
"epoch": 0.02374928541255299,
"grad_norm": 1.186626672744751,
"learning_rate": 3.957779075552095e-07,
"loss": 2.1015,
"step": 9150
},
{
"epoch": 0.02387906292846858,
"grad_norm": 1.053902506828308,
"learning_rate": 3.9794086475028663e-07,
"loss": 2.1003,
"step": 9200
},
{
"epoch": 0.02400884044438417,
"grad_norm": 1.1948777437210083,
"learning_rate": 4.001038219453637e-07,
"loss": 2.0914,
"step": 9250
},
{
"epoch": 0.02413861796029976,
"grad_norm": 1.0830193758010864,
"learning_rate": 4.0226677914044085e-07,
"loss": 2.0892,
"step": 9300
},
{
"epoch": 0.02426839547621535,
"grad_norm": 1.0737528800964355,
"learning_rate": 4.04429736335518e-07,
"loss": 2.0949,
"step": 9350
},
{
"epoch": 0.02439817299213094,
"grad_norm": 1.2443790435791016,
"learning_rate": 4.0659269353059506e-07,
"loss": 2.0725,
"step": 9400
},
{
"epoch": 0.02452795050804653,
"grad_norm": 0.9910159111022949,
"learning_rate": 4.0875565072567214e-07,
"loss": 2.092,
"step": 9450
},
{
"epoch": 0.02465772802396212,
"grad_norm": 1.1105308532714844,
"learning_rate": 4.1091860792074933e-07,
"loss": 2.0894,
"step": 9500
},
{
"epoch": 0.02478750553987771,
"grad_norm": 1.3401215076446533,
"learning_rate": 4.130815651158264e-07,
"loss": 2.091,
"step": 9550
},
{
"epoch": 0.024917283055793302,
"grad_norm": 1.1136138439178467,
"learning_rate": 4.152445223109035e-07,
"loss": 2.1022,
"step": 9600
},
{
"epoch": 0.02504706057170889,
"grad_norm": 1.1129764318466187,
"learning_rate": 4.1740747950598063e-07,
"loss": 2.0841,
"step": 9650
},
{
"epoch": 0.02517683808762448,
"grad_norm": 1.1361297369003296,
"learning_rate": 4.195704367010577e-07,
"loss": 2.0987,
"step": 9700
},
{
"epoch": 0.02530661560354007,
"grad_norm": 1.2290136814117432,
"learning_rate": 4.217333938961348e-07,
"loss": 2.0967,
"step": 9750
},
{
"epoch": 0.025436393119455662,
"grad_norm": 1.1932119131088257,
"learning_rate": 4.23896351091212e-07,
"loss": 2.1018,
"step": 9800
},
{
"epoch": 0.02556617063537125,
"grad_norm": 1.1398112773895264,
"learning_rate": 4.2605930828628906e-07,
"loss": 2.076,
"step": 9850
},
{
"epoch": 0.02569594815128684,
"grad_norm": 1.255175232887268,
"learning_rate": 4.2822226548136614e-07,
"loss": 2.0979,
"step": 9900
},
{
"epoch": 0.025825725667202433,
"grad_norm": 1.063835620880127,
"learning_rate": 4.303852226764433e-07,
"loss": 2.0982,
"step": 9950
},
{
"epoch": 0.025955503183118022,
"grad_norm": 1.0199131965637207,
"learning_rate": 4.3254817987152036e-07,
"loss": 2.077,
"step": 10000
},
{
"epoch": 0.02608528069903361,
"grad_norm": 1.2938398122787476,
"learning_rate": 4.347111370665975e-07,
"loss": 2.0966,
"step": 10050
},
{
"epoch": 0.0262150582149492,
"grad_norm": 1.2516087293624878,
"learning_rate": 4.368740942616746e-07,
"loss": 2.0996,
"step": 10100
},
{
"epoch": 0.026344835730864793,
"grad_norm": 1.140458345413208,
"learning_rate": 4.390370514567517e-07,
"loss": 2.0862,
"step": 10150
},
{
"epoch": 0.026474613246780382,
"grad_norm": 1.245771884918213,
"learning_rate": 4.412000086518288e-07,
"loss": 2.095,
"step": 10200
},
{
"epoch": 0.02660439076269597,
"grad_norm": 1.1775400638580322,
"learning_rate": 4.43362965846906e-07,
"loss": 2.0818,
"step": 10250
},
{
"epoch": 0.026734168278611564,
"grad_norm": 1.043639898300171,
"learning_rate": 4.4552592304198306e-07,
"loss": 2.075,
"step": 10300
},
{
"epoch": 0.026863945794527153,
"grad_norm": 1.0813723802566528,
"learning_rate": 4.4768888023706014e-07,
"loss": 2.0525,
"step": 10350
},
{
"epoch": 0.02699372331044274,
"grad_norm": 1.0008471012115479,
"learning_rate": 4.4985183743213727e-07,
"loss": 2.0835,
"step": 10400
},
{
"epoch": 0.027123500826358334,
"grad_norm": 1.2055691480636597,
"learning_rate": 4.5201479462721435e-07,
"loss": 2.0877,
"step": 10450
},
{
"epoch": 0.027253278342273923,
"grad_norm": 1.2838592529296875,
"learning_rate": 4.5417775182229143e-07,
"loss": 2.0844,
"step": 10500
},
{
"epoch": 0.027383055858189512,
"grad_norm": 1.2694274187088013,
"learning_rate": 4.563407090173686e-07,
"loss": 2.0791,
"step": 10550
},
{
"epoch": 0.0275128333741051,
"grad_norm": 1.1208597421646118,
"learning_rate": 4.585036662124457e-07,
"loss": 2.0846,
"step": 10600
},
{
"epoch": 0.027642610890020694,
"grad_norm": 1.0968207120895386,
"learning_rate": 4.606666234075228e-07,
"loss": 2.0834,
"step": 10650
},
{
"epoch": 0.027772388405936283,
"grad_norm": 0.9584913849830627,
"learning_rate": 4.628295806025999e-07,
"loss": 2.0792,
"step": 10700
},
{
"epoch": 0.027902165921851872,
"grad_norm": 1.19157874584198,
"learning_rate": 4.6499253779767705e-07,
"loss": 2.0895,
"step": 10750
},
{
"epoch": 0.028031943437767465,
"grad_norm": 1.1074448823928833,
"learning_rate": 4.6715549499275413e-07,
"loss": 2.0725,
"step": 10800
},
{
"epoch": 0.028161720953683054,
"grad_norm": 1.2112274169921875,
"learning_rate": 4.6931845218783127e-07,
"loss": 2.097,
"step": 10850
},
{
"epoch": 0.028291498469598643,
"grad_norm": 1.1916898488998413,
"learning_rate": 4.7148140938290835e-07,
"loss": 2.0703,
"step": 10900
},
{
"epoch": 0.028421275985514232,
"grad_norm": 1.0195530652999878,
"learning_rate": 4.7364436657798543e-07,
"loss": 2.1164,
"step": 10950
},
{
"epoch": 0.028551053501429825,
"grad_norm": 1.1741176843643188,
"learning_rate": 4.758073237730626e-07,
"loss": 2.0762,
"step": 11000
},
{
"epoch": 0.028680831017345414,
"grad_norm": 1.100441336631775,
"learning_rate": 4.779702809681397e-07,
"loss": 2.0843,
"step": 11050
},
{
"epoch": 0.028810608533261003,
"grad_norm": 1.2614903450012207,
"learning_rate": 4.801332381632167e-07,
"loss": 2.0912,
"step": 11100
},
{
"epoch": 0.028940386049176595,
"grad_norm": 1.2177016735076904,
"learning_rate": 4.82296195358294e-07,
"loss": 2.0872,
"step": 11150
},
{
"epoch": 0.029070163565092184,
"grad_norm": 1.0506807565689087,
"learning_rate": 4.84459152553371e-07,
"loss": 2.0469,
"step": 11200
},
{
"epoch": 0.029199941081007773,
"grad_norm": 1.0414236783981323,
"learning_rate": 4.866221097484481e-07,
"loss": 2.0683,
"step": 11250
},
{
"epoch": 0.029329718596923366,
"grad_norm": 1.0935665369033813,
"learning_rate": 4.887850669435253e-07,
"loss": 2.0809,
"step": 11300
},
{
"epoch": 0.029459496112838955,
"grad_norm": 1.0495206117630005,
"learning_rate": 4.909480241386023e-07,
"loss": 2.0788,
"step": 11350
},
{
"epoch": 0.029589273628754544,
"grad_norm": 1.3227081298828125,
"learning_rate": 4.931109813336794e-07,
"loss": 2.0858,
"step": 11400
},
{
"epoch": 0.029719051144670133,
"grad_norm": 1.1815470457077026,
"learning_rate": 4.952739385287566e-07,
"loss": 2.0832,
"step": 11450
},
{
"epoch": 0.029848828660585726,
"grad_norm": 1.1781071424484253,
"learning_rate": 4.974368957238337e-07,
"loss": 2.0796,
"step": 11500
},
{
"epoch": 0.029978606176501315,
"grad_norm": 1.2186251878738403,
"learning_rate": 4.995998529189107e-07,
"loss": 2.0742,
"step": 11550
},
{
"epoch": 0.030108383692416904,
"grad_norm": 1.1277467012405396,
"learning_rate": 5.01762810113988e-07,
"loss": 2.0745,
"step": 11600
},
{
"epoch": 0.030238161208332497,
"grad_norm": 1.1841695308685303,
"learning_rate": 5.03925767309065e-07,
"loss": 2.0911,
"step": 11650
},
{
"epoch": 0.030367938724248086,
"grad_norm": 1.2147952318191528,
"learning_rate": 5.060887245041421e-07,
"loss": 2.0719,
"step": 11700
},
{
"epoch": 0.030497716240163675,
"grad_norm": 1.1648039817810059,
"learning_rate": 5.082516816992193e-07,
"loss": 2.0884,
"step": 11750
},
{
"epoch": 0.030627493756079264,
"grad_norm": 1.2530500888824463,
"learning_rate": 5.104146388942963e-07,
"loss": 2.0804,
"step": 11800
},
{
"epoch": 0.030757271271994856,
"grad_norm": 1.3491883277893066,
"learning_rate": 5.125775960893734e-07,
"loss": 2.0872,
"step": 11850
},
{
"epoch": 0.030887048787910446,
"grad_norm": 1.2144231796264648,
"learning_rate": 5.147405532844506e-07,
"loss": 2.0628,
"step": 11900
},
{
"epoch": 0.031016826303826035,
"grad_norm": 1.2478595972061157,
"learning_rate": 5.169035104795277e-07,
"loss": 2.0756,
"step": 11950
},
{
"epoch": 0.031146603819741627,
"grad_norm": 1.0973178148269653,
"learning_rate": 5.190664676746047e-07,
"loss": 2.0966,
"step": 12000
},
{
"epoch": 0.031276381335657216,
"grad_norm": 1.1409740447998047,
"learning_rate": 5.212294248696819e-07,
"loss": 2.0812,
"step": 12050
},
{
"epoch": 0.03140615885157281,
"grad_norm": 1.206933617591858,
"learning_rate": 5.23392382064759e-07,
"loss": 2.1006,
"step": 12100
},
{
"epoch": 0.031535936367488394,
"grad_norm": 1.085492491722107,
"learning_rate": 5.255553392598361e-07,
"loss": 2.1002,
"step": 12150
},
{
"epoch": 0.03166571388340399,
"grad_norm": 1.0168064832687378,
"learning_rate": 5.277182964549132e-07,
"loss": 2.066,
"step": 12200
},
{
"epoch": 0.03179549139931958,
"grad_norm": 1.0635970830917358,
"learning_rate": 5.298812536499903e-07,
"loss": 2.0697,
"step": 12250
},
{
"epoch": 0.031925268915235165,
"grad_norm": 1.1196577548980713,
"learning_rate": 5.320442108450674e-07,
"loss": 2.0862,
"step": 12300
},
{
"epoch": 0.03205504643115076,
"grad_norm": 1.4039437770843506,
"learning_rate": 5.342071680401445e-07,
"loss": 2.0736,
"step": 12350
},
{
"epoch": 0.03218482394706635,
"grad_norm": 1.1052871942520142,
"learning_rate": 5.363701252352217e-07,
"loss": 2.0877,
"step": 12400
},
{
"epoch": 0.032314601462981936,
"grad_norm": 0.966698408126831,
"learning_rate": 5.385330824302987e-07,
"loss": 2.0866,
"step": 12450
},
{
"epoch": 0.03244437897889753,
"grad_norm": 0.9846018552780151,
"learning_rate": 5.406960396253759e-07,
"loss": 2.0717,
"step": 12500
},
{
"epoch": 0.032574156494813114,
"grad_norm": 1.1824718713760376,
"learning_rate": 5.42858996820453e-07,
"loss": 2.0651,
"step": 12550
},
{
"epoch": 0.03270393401072871,
"grad_norm": 1.1409893035888672,
"learning_rate": 5.4502195401553e-07,
"loss": 2.0904,
"step": 12600
},
{
"epoch": 0.0328337115266443,
"grad_norm": 1.079150676727295,
"learning_rate": 5.471849112106072e-07,
"loss": 2.0842,
"step": 12650
},
{
"epoch": 0.032963489042559885,
"grad_norm": 1.0430059432983398,
"learning_rate": 5.493478684056843e-07,
"loss": 2.0869,
"step": 12700
},
{
"epoch": 0.03309326655847548,
"grad_norm": 1.085353136062622,
"learning_rate": 5.515108256007614e-07,
"loss": 2.0703,
"step": 12750
},
{
"epoch": 0.03322304407439107,
"grad_norm": 1.1028053760528564,
"learning_rate": 5.536737827958384e-07,
"loss": 2.0804,
"step": 12800
},
{
"epoch": 0.033352821590306655,
"grad_norm": 1.4301245212554932,
"learning_rate": 5.558367399909157e-07,
"loss": 2.0934,
"step": 12850
},
{
"epoch": 0.03348259910622225,
"grad_norm": 1.1223058700561523,
"learning_rate": 5.579996971859927e-07,
"loss": 2.0927,
"step": 12900
},
{
"epoch": 0.03361237662213784,
"grad_norm": 1.0447497367858887,
"learning_rate": 5.601626543810699e-07,
"loss": 2.0792,
"step": 12950
},
{
"epoch": 0.033742154138053426,
"grad_norm": 1.1141220331192017,
"learning_rate": 5.62325611576147e-07,
"loss": 2.0661,
"step": 13000
},
{
"epoch": 0.03387193165396902,
"grad_norm": 1.146681547164917,
"learning_rate": 5.64488568771224e-07,
"loss": 2.0906,
"step": 13050
},
{
"epoch": 0.03400170916988461,
"grad_norm": 1.0003130435943604,
"learning_rate": 5.666515259663012e-07,
"loss": 2.0913,
"step": 13100
},
{
"epoch": 0.0341314866858002,
"grad_norm": 1.2144840955734253,
"learning_rate": 5.688144831613783e-07,
"loss": 2.0802,
"step": 13150
},
{
"epoch": 0.03426126420171579,
"grad_norm": 1.11890709400177,
"learning_rate": 5.709774403564554e-07,
"loss": 2.0709,
"step": 13200
},
{
"epoch": 0.03439104171763138,
"grad_norm": 1.1874761581420898,
"learning_rate": 5.731403975515324e-07,
"loss": 2.0885,
"step": 13250
},
{
"epoch": 0.03452081923354697,
"grad_norm": 1.155839443206787,
"learning_rate": 5.753033547466096e-07,
"loss": 2.0676,
"step": 13300
},
{
"epoch": 0.03465059674946256,
"grad_norm": 1.1086000204086304,
"learning_rate": 5.774663119416867e-07,
"loss": 2.0721,
"step": 13350
},
{
"epoch": 0.034780374265378146,
"grad_norm": 1.3329235315322876,
"learning_rate": 5.796292691367639e-07,
"loss": 2.0794,
"step": 13400
},
{
"epoch": 0.03491015178129374,
"grad_norm": 1.0079444646835327,
"learning_rate": 5.81792226331841e-07,
"loss": 2.0946,
"step": 13450
},
{
"epoch": 0.03503992929720933,
"grad_norm": 1.1033422946929932,
"learning_rate": 5.83955183526918e-07,
"loss": 2.0648,
"step": 13500
},
{
"epoch": 0.03516970681312492,
"grad_norm": 1.1726226806640625,
"learning_rate": 5.861181407219951e-07,
"loss": 2.0772,
"step": 13550
},
{
"epoch": 0.03529948432904051,
"grad_norm": 1.2747085094451904,
"learning_rate": 5.882810979170723e-07,
"loss": 2.0655,
"step": 13600
},
{
"epoch": 0.0354292618449561,
"grad_norm": 1.3106458187103271,
"learning_rate": 5.904440551121494e-07,
"loss": 2.0703,
"step": 13650
},
{
"epoch": 0.03555903936087169,
"grad_norm": 1.2663166522979736,
"learning_rate": 5.926070123072264e-07,
"loss": 2.0841,
"step": 13700
},
{
"epoch": 0.03568881687678728,
"grad_norm": 1.0045231580734253,
"learning_rate": 5.947699695023036e-07,
"loss": 2.0815,
"step": 13750
},
{
"epoch": 0.03581859439270287,
"grad_norm": 1.0900031328201294,
"learning_rate": 5.969329266973807e-07,
"loss": 2.0818,
"step": 13800
},
{
"epoch": 0.03594837190861846,
"grad_norm": 1.065185308456421,
"learning_rate": 5.990958838924577e-07,
"loss": 2.0734,
"step": 13850
},
{
"epoch": 0.03607814942453405,
"grad_norm": 1.117867112159729,
"learning_rate": 6.01258841087535e-07,
"loss": 2.065,
"step": 13900
},
{
"epoch": 0.03620792694044964,
"grad_norm": 1.092624306678772,
"learning_rate": 6.03421798282612e-07,
"loss": 2.0847,
"step": 13950
},
{
"epoch": 0.03633770445636523,
"grad_norm": 1.2159847021102905,
"learning_rate": 6.055847554776891e-07,
"loss": 2.0553,
"step": 14000
},
{
"epoch": 0.03646748197228082,
"grad_norm": 1.0683104991912842,
"learning_rate": 6.077477126727663e-07,
"loss": 2.0834,
"step": 14050
},
{
"epoch": 0.036597259488196414,
"grad_norm": 1.0531790256500244,
"learning_rate": 6.099106698678434e-07,
"loss": 2.0854,
"step": 14100
},
{
"epoch": 0.036727037004112,
"grad_norm": 1.2325146198272705,
"learning_rate": 6.120736270629204e-07,
"loss": 2.0813,
"step": 14150
},
{
"epoch": 0.03685681452002759,
"grad_norm": 1.091143012046814,
"learning_rate": 6.142365842579976e-07,
"loss": 2.0767,
"step": 14200
},
{
"epoch": 0.03698659203594318,
"grad_norm": 1.235277533531189,
"learning_rate": 6.163995414530747e-07,
"loss": 2.1023,
"step": 14250
},
{
"epoch": 0.03711636955185877,
"grad_norm": 1.065708041191101,
"learning_rate": 6.185624986481517e-07,
"loss": 2.0525,
"step": 14300
},
{
"epoch": 0.03724614706777436,
"grad_norm": 1.121060848236084,
"learning_rate": 6.20725455843229e-07,
"loss": 2.086,
"step": 14350
},
{
"epoch": 0.03737592458368995,
"grad_norm": 1.087768316268921,
"learning_rate": 6.22888413038306e-07,
"loss": 2.0788,
"step": 14400
},
{
"epoch": 0.03750570209960554,
"grad_norm": 1.1760495901107788,
"learning_rate": 6.250513702333831e-07,
"loss": 2.0599,
"step": 14450
},
{
"epoch": 0.037635479615521134,
"grad_norm": 1.1613560914993286,
"learning_rate": 6.272143274284602e-07,
"loss": 2.0784,
"step": 14500
},
{
"epoch": 0.03776525713143672,
"grad_norm": 1.0611546039581299,
"learning_rate": 6.293772846235373e-07,
"loss": 2.0622,
"step": 14550
},
{
"epoch": 0.03789503464735231,
"grad_norm": 1.23395836353302,
"learning_rate": 6.315402418186145e-07,
"loss": 2.0801,
"step": 14600
},
{
"epoch": 0.038024812163267904,
"grad_norm": 1.3746421337127686,
"learning_rate": 6.337031990136916e-07,
"loss": 2.0865,
"step": 14650
},
{
"epoch": 0.03815458967918349,
"grad_norm": 1.2074872255325317,
"learning_rate": 6.358661562087687e-07,
"loss": 2.0814,
"step": 14700
},
{
"epoch": 0.03828436719509908,
"grad_norm": 1.3351000547409058,
"learning_rate": 6.380291134038457e-07,
"loss": 2.072,
"step": 14750
},
{
"epoch": 0.038414144711014675,
"grad_norm": 1.2587641477584839,
"learning_rate": 6.401920705989229e-07,
"loss": 2.0674,
"step": 14800
},
{
"epoch": 0.03854392222693026,
"grad_norm": 1.3082313537597656,
"learning_rate": 6.423550277939999e-07,
"loss": 2.0591,
"step": 14850
},
{
"epoch": 0.03867369974284585,
"grad_norm": 1.0227408409118652,
"learning_rate": 6.445179849890771e-07,
"loss": 2.0622,
"step": 14900
},
{
"epoch": 0.038803477258761446,
"grad_norm": 1.067315697669983,
"learning_rate": 6.466809421841543e-07,
"loss": 2.0742,
"step": 14950
},
{
"epoch": 0.03893325477467703,
"grad_norm": 1.0379241704940796,
"learning_rate": 6.488438993792313e-07,
"loss": 2.0816,
"step": 15000
},
{
"epoch": 0.039063032290592624,
"grad_norm": 1.2002571821212769,
"learning_rate": 6.510068565743084e-07,
"loss": 2.089,
"step": 15050
},
{
"epoch": 0.03919280980650821,
"grad_norm": 0.9409751296043396,
"learning_rate": 6.531698137693855e-07,
"loss": 2.0937,
"step": 15100
},
{
"epoch": 0.0393225873224238,
"grad_norm": 1.1032634973526,
"learning_rate": 6.553327709644626e-07,
"loss": 2.0728,
"step": 15150
},
{
"epoch": 0.039452364838339395,
"grad_norm": 1.1253653764724731,
"learning_rate": 6.574957281595398e-07,
"loss": 2.0654,
"step": 15200
},
{
"epoch": 0.03958214235425498,
"grad_norm": 1.1984907388687134,
"learning_rate": 6.596586853546169e-07,
"loss": 2.0635,
"step": 15250
},
{
"epoch": 0.03971191987017057,
"grad_norm": 1.8919013738632202,
"learning_rate": 6.61821642549694e-07,
"loss": 2.0908,
"step": 15300
},
{
"epoch": 0.039841697386086165,
"grad_norm": 1.084962248802185,
"learning_rate": 6.639845997447711e-07,
"loss": 2.0776,
"step": 15350
},
{
"epoch": 0.03997147490200175,
"grad_norm": 1.2666141986846924,
"learning_rate": 6.661475569398482e-07,
"loss": 2.0572,
"step": 15400
},
{
"epoch": 0.040101252417917344,
"grad_norm": 0.9767414331436157,
"learning_rate": 6.683105141349253e-07,
"loss": 2.0601,
"step": 15450
},
{
"epoch": 0.040231029933832936,
"grad_norm": 1.0183193683624268,
"learning_rate": 6.704734713300024e-07,
"loss": 2.0603,
"step": 15500
},
{
"epoch": 0.04036080744974852,
"grad_norm": 1.166793704032898,
"learning_rate": 6.726364285250796e-07,
"loss": 2.067,
"step": 15550
},
{
"epoch": 0.040490584965664114,
"grad_norm": 1.0793973207473755,
"learning_rate": 6.747993857201567e-07,
"loss": 2.0573,
"step": 15600
},
{
"epoch": 0.04062036248157971,
"grad_norm": 1.1123896837234497,
"learning_rate": 6.769623429152337e-07,
"loss": 2.0871,
"step": 15650
},
{
"epoch": 0.04075013999749529,
"grad_norm": 1.1629297733306885,
"learning_rate": 6.791253001103109e-07,
"loss": 2.084,
"step": 15700
},
{
"epoch": 0.040879917513410885,
"grad_norm": 1.1267614364624023,
"learning_rate": 6.812882573053879e-07,
"loss": 2.078,
"step": 15750
},
{
"epoch": 0.04100969502932648,
"grad_norm": 1.084740161895752,
"learning_rate": 6.834512145004651e-07,
"loss": 2.0553,
"step": 15800
},
{
"epoch": 0.04113947254524206,
"grad_norm": 1.520933747291565,
"learning_rate": 6.856141716955423e-07,
"loss": 2.0453,
"step": 15850
},
{
"epoch": 0.041269250061157656,
"grad_norm": 1.133069396018982,
"learning_rate": 6.877771288906193e-07,
"loss": 2.0711,
"step": 15900
},
{
"epoch": 0.04139902757707324,
"grad_norm": 1.0531269311904907,
"learning_rate": 6.899400860856964e-07,
"loss": 2.0538,
"step": 15950
},
{
"epoch": 0.041528805092988834,
"grad_norm": 1.2274492979049683,
"learning_rate": 6.921030432807735e-07,
"loss": 2.0566,
"step": 16000
}
],
"logging_steps": 50,
"max_steps": 1155822,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.781564350436475e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}