{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.025955503183118022,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00012977751591559012,
"grad_norm": 29.529769897460938,
"learning_rate": 2.1196980511755674e-09,
"loss": 2.2098,
"step": 50
},
{
"epoch": 0.00025955503183118023,
"grad_norm": 30.7341251373291,
"learning_rate": 4.282655246252677e-09,
"loss": 2.2367,
"step": 100
},
{
"epoch": 0.00038933254774677035,
"grad_norm": 29.822181701660156,
"learning_rate": 6.4456124413297865e-09,
"loss": 2.198,
"step": 150
},
{
"epoch": 0.0005191100636623605,
"grad_norm": 28.564294815063477,
"learning_rate": 8.608569636406895e-09,
"loss": 2.2167,
"step": 200
},
{
"epoch": 0.0006488875795779505,
"grad_norm": 28.514026641845703,
"learning_rate": 1.0771526831484006e-08,
"loss": 2.2001,
"step": 250
},
{
"epoch": 0.0007786650954935407,
"grad_norm": 29.033863067626953,
"learning_rate": 1.2934484026561114e-08,
"loss": 2.1908,
"step": 300
},
{
"epoch": 0.0009084426114091308,
"grad_norm": 4.49394416809082,
"learning_rate": 1.5097441221638225e-08,
"loss": 2.1841,
"step": 350
},
{
"epoch": 0.001038220127324721,
"grad_norm": 2.1610796451568604,
"learning_rate": 1.7260398416715337e-08,
"loss": 2.1702,
"step": 400
},
{
"epoch": 0.0011679976432403109,
"grad_norm": 1.9879209995269775,
"learning_rate": 1.9423355611792444e-08,
"loss": 2.1613,
"step": 450
},
{
"epoch": 0.001297775159155901,
"grad_norm": 1.880299687385559,
"learning_rate": 2.1586312806869556e-08,
"loss": 2.1529,
"step": 500
},
{
"epoch": 0.0014275526750714912,
"grad_norm": 1.7597101926803589,
"learning_rate": 2.3749270001946664e-08,
"loss": 2.1641,
"step": 550
},
{
"epoch": 0.0015573301909870814,
"grad_norm": 1.5660640001296997,
"learning_rate": 2.591222719702377e-08,
"loss": 2.1501,
"step": 600
},
{
"epoch": 0.0016871077069026714,
"grad_norm": 1.5481892824172974,
"learning_rate": 2.8075184392100883e-08,
"loss": 2.1583,
"step": 650
},
{
"epoch": 0.0018168852228182615,
"grad_norm": 1.525515079498291,
"learning_rate": 3.0238141587177994e-08,
"loss": 2.1547,
"step": 700
},
{
"epoch": 0.0019466627387338517,
"grad_norm": 1.4705111980438232,
"learning_rate": 3.24010987822551e-08,
"loss": 2.1365,
"step": 750
},
{
"epoch": 0.002076440254649442,
"grad_norm": 1.3471802473068237,
"learning_rate": 3.4564055977332216e-08,
"loss": 2.1777,
"step": 800
},
{
"epoch": 0.002206217770565032,
"grad_norm": 1.3963252305984497,
"learning_rate": 3.6727013172409324e-08,
"loss": 2.1356,
"step": 850
},
{
"epoch": 0.0023359952864806218,
"grad_norm": 1.2665836811065674,
"learning_rate": 3.888997036748643e-08,
"loss": 2.1565,
"step": 900
},
{
"epoch": 0.002465772802396212,
"grad_norm": 1.4085878133773804,
"learning_rate": 4.105292756256354e-08,
"loss": 2.1497,
"step": 950
},
{
"epoch": 0.002595550318311802,
"grad_norm": 1.4020620584487915,
"learning_rate": 4.3215884757640654e-08,
"loss": 2.1549,
"step": 1000
},
{
"epoch": 0.002725327834227392,
"grad_norm": 1.5391197204589844,
"learning_rate": 4.5378841952717755e-08,
"loss": 2.1519,
"step": 1050
},
{
"epoch": 0.0028551053501429825,
"grad_norm": 1.2532999515533447,
"learning_rate": 4.754179914779487e-08,
"loss": 2.1411,
"step": 1100
},
{
"epoch": 0.0029848828660585724,
"grad_norm": 1.2143096923828125,
"learning_rate": 4.970475634287198e-08,
"loss": 2.1435,
"step": 1150
},
{
"epoch": 0.003114660381974163,
"grad_norm": 1.4227972030639648,
"learning_rate": 5.1867713537949086e-08,
"loss": 2.142,
"step": 1200
},
{
"epoch": 0.0032444378978897528,
"grad_norm": 1.0749537944793701,
"learning_rate": 5.40306707330262e-08,
"loss": 2.1134,
"step": 1250
},
{
"epoch": 0.0033742154138053427,
"grad_norm": 1.169149398803711,
"learning_rate": 5.619362792810331e-08,
"loss": 2.1215,
"step": 1300
},
{
"epoch": 0.003503992929720933,
"grad_norm": 1.3452564477920532,
"learning_rate": 5.8356585123180416e-08,
"loss": 2.1524,
"step": 1350
},
{
"epoch": 0.003633770445636523,
"grad_norm": 1.3610825538635254,
"learning_rate": 6.051954231825752e-08,
"loss": 2.1389,
"step": 1400
},
{
"epoch": 0.003763547961552113,
"grad_norm": 1.2172240018844604,
"learning_rate": 6.268249951333464e-08,
"loss": 2.1428,
"step": 1450
},
{
"epoch": 0.0038933254774677034,
"grad_norm": 1.1681610345840454,
"learning_rate": 6.484545670841174e-08,
"loss": 2.1253,
"step": 1500
},
{
"epoch": 0.004023102993383294,
"grad_norm": 1.0709235668182373,
"learning_rate": 6.700841390348886e-08,
"loss": 2.1533,
"step": 1550
},
{
"epoch": 0.004152880509298884,
"grad_norm": 1.2001404762268066,
"learning_rate": 6.917137109856597e-08,
"loss": 2.1371,
"step": 1600
},
{
"epoch": 0.004282658025214474,
"grad_norm": 1.2642266750335693,
"learning_rate": 7.133432829364308e-08,
"loss": 2.129,
"step": 1650
},
{
"epoch": 0.004412435541130064,
"grad_norm": 1.030280590057373,
"learning_rate": 7.349728548872018e-08,
"loss": 2.128,
"step": 1700
},
{
"epoch": 0.004542213057045654,
"grad_norm": 1.4037690162658691,
"learning_rate": 7.566024268379729e-08,
"loss": 2.1383,
"step": 1750
},
{
"epoch": 0.0046719905729612436,
"grad_norm": 1.1600065231323242,
"learning_rate": 7.78231998788744e-08,
"loss": 2.1193,
"step": 1800
},
{
"epoch": 0.004801768088876834,
"grad_norm": 1.2075531482696533,
"learning_rate": 7.998615707395151e-08,
"loss": 2.1126,
"step": 1850
},
{
"epoch": 0.004931545604792424,
"grad_norm": 1.1088111400604248,
"learning_rate": 8.214911426902863e-08,
"loss": 2.1394,
"step": 1900
},
{
"epoch": 0.005061323120708014,
"grad_norm": 1.195092797279358,
"learning_rate": 8.431207146410574e-08,
"loss": 2.1359,
"step": 1950
},
{
"epoch": 0.005191100636623604,
"grad_norm": 1.1595982313156128,
"learning_rate": 8.647502865918283e-08,
"loss": 2.1366,
"step": 2000
},
{
"epoch": 0.005320878152539194,
"grad_norm": 1.1727768182754517,
"learning_rate": 8.863798585425995e-08,
"loss": 2.1106,
"step": 2050
},
{
"epoch": 0.005450655668454784,
"grad_norm": 1.2424023151397705,
"learning_rate": 9.080094304933706e-08,
"loss": 2.1339,
"step": 2100
},
{
"epoch": 0.005580433184370375,
"grad_norm": 1.2954424619674683,
"learning_rate": 9.296390024441417e-08,
"loss": 2.1107,
"step": 2150
},
{
"epoch": 0.005710210700285965,
"grad_norm": 1.0388058423995972,
"learning_rate": 9.512685743949129e-08,
"loss": 2.118,
"step": 2200
},
{
"epoch": 0.005839988216201555,
"grad_norm": 4.136488914489746,
"learning_rate": 9.728981463456838e-08,
"loss": 2.1159,
"step": 2250
},
{
"epoch": 0.005969765732117145,
"grad_norm": 1.0597412586212158,
"learning_rate": 9.945277182964549e-08,
"loss": 2.1303,
"step": 2300
},
{
"epoch": 0.006099543248032735,
"grad_norm": 1.0780799388885498,
"learning_rate": 1.0161572902472261e-07,
"loss": 2.1095,
"step": 2350
},
{
"epoch": 0.006229320763948326,
"grad_norm": 1.2739732265472412,
"learning_rate": 1.0377868621979972e-07,
"loss": 2.1264,
"step": 2400
},
{
"epoch": 0.0063590982798639156,
"grad_norm": 3.3836748600006104,
"learning_rate": 1.0594164341487683e-07,
"loss": 2.1183,
"step": 2450
},
{
"epoch": 0.0064888757957795055,
"grad_norm": 1.269440770149231,
"learning_rate": 1.0810460060995395e-07,
"loss": 2.1243,
"step": 2500
},
{
"epoch": 0.0066186533116950955,
"grad_norm": 1.1834193468093872,
"learning_rate": 1.1026755780503104e-07,
"loss": 2.1207,
"step": 2550
},
{
"epoch": 0.006748430827610685,
"grad_norm": 1.2411448955535889,
"learning_rate": 1.1243051500010815e-07,
"loss": 2.1201,
"step": 2600
},
{
"epoch": 0.006878208343526275,
"grad_norm": 1.1330574750900269,
"learning_rate": 1.1459347219518527e-07,
"loss": 2.1125,
"step": 2650
},
{
"epoch": 0.007007985859441866,
"grad_norm": 1.199055790901184,
"learning_rate": 1.1675642939026238e-07,
"loss": 2.13,
"step": 2700
},
{
"epoch": 0.007137763375357456,
"grad_norm": 1.2272251844406128,
"learning_rate": 1.1891938658533949e-07,
"loss": 2.1239,
"step": 2750
},
{
"epoch": 0.007267540891273046,
"grad_norm": 1.1257809400558472,
"learning_rate": 1.2108234378041658e-07,
"loss": 2.12,
"step": 2800
},
{
"epoch": 0.007397318407188636,
"grad_norm": 1.1028845310211182,
"learning_rate": 1.2324530097549372e-07,
"loss": 2.1227,
"step": 2850
},
{
"epoch": 0.007527095923104226,
"grad_norm": 1.2196507453918457,
"learning_rate": 1.254082581705708e-07,
"loss": 2.1241,
"step": 2900
},
{
"epoch": 0.007656873439019816,
"grad_norm": 1.0342847108840942,
"learning_rate": 1.275712153656479e-07,
"loss": 2.1148,
"step": 2950
},
{
"epoch": 0.007786650954935407,
"grad_norm": 1.1587927341461182,
"learning_rate": 1.2973417256072504e-07,
"loss": 2.1293,
"step": 3000
},
{
"epoch": 0.007916428470850997,
"grad_norm": 1.1751174926757812,
"learning_rate": 1.3189712975580215e-07,
"loss": 2.1084,
"step": 3050
},
{
"epoch": 0.008046205986766588,
"grad_norm": 12.056378364562988,
"learning_rate": 1.3406008695087926e-07,
"loss": 2.103,
"step": 3100
},
{
"epoch": 0.008175983502682177,
"grad_norm": 1.0078307390213013,
"learning_rate": 1.3622304414595637e-07,
"loss": 2.1345,
"step": 3150
},
{
"epoch": 0.008305761018597767,
"grad_norm": 1.0840246677398682,
"learning_rate": 1.3838600134103347e-07,
"loss": 2.1151,
"step": 3200
},
{
"epoch": 0.008435538534513357,
"grad_norm": 1.213310956954956,
"learning_rate": 1.4054895853611058e-07,
"loss": 2.1275,
"step": 3250
},
{
"epoch": 0.008565316050428947,
"grad_norm": 1.2903615236282349,
"learning_rate": 1.427119157311877e-07,
"loss": 2.1105,
"step": 3300
},
{
"epoch": 0.008695093566344536,
"grad_norm": 1.1746351718902588,
"learning_rate": 1.448748729262648e-07,
"loss": 2.1344,
"step": 3350
},
{
"epoch": 0.008824871082260127,
"grad_norm": 1.1928184032440186,
"learning_rate": 1.470378301213419e-07,
"loss": 2.1278,
"step": 3400
},
{
"epoch": 0.008954648598175718,
"grad_norm": 1.1132676601409912,
"learning_rate": 1.4920078731641904e-07,
"loss": 2.1134,
"step": 3450
},
{
"epoch": 0.009084426114091307,
"grad_norm": 1.4539573192596436,
"learning_rate": 1.5136374451149612e-07,
"loss": 2.0943,
"step": 3500
},
{
"epoch": 0.009214203630006898,
"grad_norm": 1.176128625869751,
"learning_rate": 1.5352670170657323e-07,
"loss": 2.1025,
"step": 3550
},
{
"epoch": 0.009343981145922487,
"grad_norm": 1.0798020362854004,
"learning_rate": 1.5568965890165036e-07,
"loss": 2.0946,
"step": 3600
},
{
"epoch": 0.009473758661838078,
"grad_norm": 1.1570450067520142,
"learning_rate": 1.5785261609672747e-07,
"loss": 2.1119,
"step": 3650
},
{
"epoch": 0.009603536177753669,
"grad_norm": 0.9933484792709351,
"learning_rate": 1.6001557329180458e-07,
"loss": 2.1171,
"step": 3700
},
{
"epoch": 0.009733313693669258,
"grad_norm": 1.1642405986785889,
"learning_rate": 1.6217853048688166e-07,
"loss": 2.1047,
"step": 3750
},
{
"epoch": 0.009863091209584849,
"grad_norm": 1.266423225402832,
"learning_rate": 1.643414876819588e-07,
"loss": 2.0984,
"step": 3800
},
{
"epoch": 0.009992868725500438,
"grad_norm": 1.0179153680801392,
"learning_rate": 1.665044448770359e-07,
"loss": 2.0977,
"step": 3850
},
{
"epoch": 0.010122646241416029,
"grad_norm": 1.0938276052474976,
"learning_rate": 1.68667402072113e-07,
"loss": 2.1249,
"step": 3900
},
{
"epoch": 0.01025242375733162,
"grad_norm": 1.1981333494186401,
"learning_rate": 1.7083035926719012e-07,
"loss": 2.1186,
"step": 3950
},
{
"epoch": 0.010382201273247208,
"grad_norm": 1.238035798072815,
"learning_rate": 1.7299331646226725e-07,
"loss": 2.1254,
"step": 4000
},
{
"epoch": 0.0105119787891628,
"grad_norm": 1.0942214727401733,
"learning_rate": 1.7515627365734433e-07,
"loss": 2.0972,
"step": 4050
},
{
"epoch": 0.010641756305078388,
"grad_norm": 1.1418002843856812,
"learning_rate": 1.7731923085242144e-07,
"loss": 2.1286,
"step": 4100
},
{
"epoch": 0.01077153382099398,
"grad_norm": 2.164984703063965,
"learning_rate": 1.7948218804749858e-07,
"loss": 2.1195,
"step": 4150
},
{
"epoch": 0.010901311336909568,
"grad_norm": 1.119779109954834,
"learning_rate": 1.8164514524257566e-07,
"loss": 2.1242,
"step": 4200
},
{
"epoch": 0.011031088852825159,
"grad_norm": 1.0193780660629272,
"learning_rate": 1.8380810243765277e-07,
"loss": 2.0951,
"step": 4250
},
{
"epoch": 0.01116086636874075,
"grad_norm": 1.1878997087478638,
"learning_rate": 1.859710596327299e-07,
"loss": 2.1067,
"step": 4300
},
{
"epoch": 0.011290643884656339,
"grad_norm": 1.048115849494934,
"learning_rate": 1.8813401682780698e-07,
"loss": 2.0861,
"step": 4350
},
{
"epoch": 0.01142042140057193,
"grad_norm": 1.1059479713439941,
"learning_rate": 1.9029697402288412e-07,
"loss": 2.1204,
"step": 4400
},
{
"epoch": 0.011550198916487519,
"grad_norm": 0.942563533782959,
"learning_rate": 1.9245993121796122e-07,
"loss": 2.1256,
"step": 4450
},
{
"epoch": 0.01167997643240311,
"grad_norm": 3.292470932006836,
"learning_rate": 1.946228884130383e-07,
"loss": 2.0964,
"step": 4500
},
{
"epoch": 0.0118097539483187,
"grad_norm": 1.131181001663208,
"learning_rate": 1.9678584560811544e-07,
"loss": 2.1079,
"step": 4550
},
{
"epoch": 0.01193953146423429,
"grad_norm": 1.1684244871139526,
"learning_rate": 1.9894880280319255e-07,
"loss": 2.0918,
"step": 4600
},
{
"epoch": 0.01206930898014988,
"grad_norm": 1.094470500946045,
"learning_rate": 2.0111175999826965e-07,
"loss": 2.1129,
"step": 4650
},
{
"epoch": 0.01219908649606547,
"grad_norm": 1.1580520868301392,
"learning_rate": 2.0327471719334676e-07,
"loss": 2.1175,
"step": 4700
},
{
"epoch": 0.01232886401198106,
"grad_norm": 1.110461711883545,
"learning_rate": 2.054376743884239e-07,
"loss": 2.1188,
"step": 4750
},
{
"epoch": 0.012458641527896651,
"grad_norm": 1.1533987522125244,
"learning_rate": 2.0760063158350098e-07,
"loss": 2.1072,
"step": 4800
},
{
"epoch": 0.01258841904381224,
"grad_norm": 1.3358995914459229,
"learning_rate": 2.0976358877857809e-07,
"loss": 2.1031,
"step": 4850
},
{
"epoch": 0.012718196559727831,
"grad_norm": 1.100576639175415,
"learning_rate": 2.119265459736552e-07,
"loss": 2.0903,
"step": 4900
},
{
"epoch": 0.01284797407564342,
"grad_norm": 1.1505376100540161,
"learning_rate": 2.140895031687323e-07,
"loss": 2.123,
"step": 4950
},
{
"epoch": 0.012977751591559011,
"grad_norm": 1.063899278640747,
"learning_rate": 2.1625246036380944e-07,
"loss": 2.0802,
"step": 5000
},
{
"epoch": 0.0131075291074746,
"grad_norm": 0.9469916224479675,
"learning_rate": 2.1841541755888652e-07,
"loss": 2.0917,
"step": 5050
},
{
"epoch": 0.013237306623390191,
"grad_norm": 1.1475921869277954,
"learning_rate": 2.2057837475396363e-07,
"loss": 2.116,
"step": 5100
},
{
"epoch": 0.013367084139305782,
"grad_norm": 1.5712428092956543,
"learning_rate": 2.2274133194904076e-07,
"loss": 2.1019,
"step": 5150
},
{
"epoch": 0.01349686165522137,
"grad_norm": 1.0301058292388916,
"learning_rate": 2.2490428914411784e-07,
"loss": 2.0985,
"step": 5200
},
{
"epoch": 0.013626639171136962,
"grad_norm": 1.1751989126205444,
"learning_rate": 2.2706724633919495e-07,
"loss": 2.1023,
"step": 5250
},
{
"epoch": 0.01375641668705255,
"grad_norm": 2.097857713699341,
"learning_rate": 2.2923020353427208e-07,
"loss": 2.1226,
"step": 5300
},
{
"epoch": 0.013886194202968142,
"grad_norm": 1.0331202745437622,
"learning_rate": 2.3139316072934916e-07,
"loss": 2.1205,
"step": 5350
},
{
"epoch": 0.014015971718883732,
"grad_norm": 1.0770936012268066,
"learning_rate": 2.335561179244263e-07,
"loss": 2.1037,
"step": 5400
},
{
"epoch": 0.014145749234799321,
"grad_norm": 1.1082065105438232,
"learning_rate": 2.357190751195034e-07,
"loss": 2.1027,
"step": 5450
},
{
"epoch": 0.014275526750714912,
"grad_norm": 1.2101250886917114,
"learning_rate": 2.378820323145805e-07,
"loss": 2.1103,
"step": 5500
},
{
"epoch": 0.014405304266630501,
"grad_norm": 1.163560152053833,
"learning_rate": 2.400449895096576e-07,
"loss": 2.1035,
"step": 5550
},
{
"epoch": 0.014535081782546092,
"grad_norm": 1.1837356090545654,
"learning_rate": 2.4220794670473476e-07,
"loss": 2.0891,
"step": 5600
},
{
"epoch": 0.014664859298461683,
"grad_norm": 1.2113701105117798,
"learning_rate": 2.4437090389981184e-07,
"loss": 2.1096,
"step": 5650
},
{
"epoch": 0.014794636814377272,
"grad_norm": 1.1284654140472412,
"learning_rate": 2.4653386109488897e-07,
"loss": 2.0948,
"step": 5700
},
{
"epoch": 0.014924414330292863,
"grad_norm": 1.4470899105072021,
"learning_rate": 2.4869681828996605e-07,
"loss": 2.0954,
"step": 5750
},
{
"epoch": 0.015054191846208452,
"grad_norm": 1.0791606903076172,
"learning_rate": 2.5085977548504314e-07,
"loss": 2.1022,
"step": 5800
},
{
"epoch": 0.015183969362124043,
"grad_norm": 1.1813191175460815,
"learning_rate": 2.5302273268012027e-07,
"loss": 2.1102,
"step": 5850
},
{
"epoch": 0.015313746878039632,
"grad_norm": 1.1993714570999146,
"learning_rate": 2.551856898751974e-07,
"loss": 2.0967,
"step": 5900
},
{
"epoch": 0.015443524393955223,
"grad_norm": 1.1809765100479126,
"learning_rate": 2.573486470702745e-07,
"loss": 2.1054,
"step": 5950
},
{
"epoch": 0.015573301909870814,
"grad_norm": 1.0799180269241333,
"learning_rate": 2.595116042653516e-07,
"loss": 2.1144,
"step": 6000
},
{
"epoch": 0.015703079425786404,
"grad_norm": 1.0349640846252441,
"learning_rate": 2.6167456146042875e-07,
"loss": 2.0978,
"step": 6050
},
{
"epoch": 0.015832856941701993,
"grad_norm": 0.9997969269752502,
"learning_rate": 2.6383751865550584e-07,
"loss": 2.1274,
"step": 6100
},
{
"epoch": 0.015962634457617583,
"grad_norm": 1.3014293909072876,
"learning_rate": 2.660004758505829e-07,
"loss": 2.0857,
"step": 6150
},
{
"epoch": 0.016092411973533175,
"grad_norm": 1.1863785982131958,
"learning_rate": 2.6816343304566005e-07,
"loss": 2.1051,
"step": 6200
},
{
"epoch": 0.016222189489448764,
"grad_norm": 1.6769137382507324,
"learning_rate": 2.7032639024073713e-07,
"loss": 2.0934,
"step": 6250
},
{
"epoch": 0.016351967005364353,
"grad_norm": 1.0514180660247803,
"learning_rate": 2.7248934743581427e-07,
"loss": 2.0948,
"step": 6300
},
{
"epoch": 0.016481744521279942,
"grad_norm": 1.0475189685821533,
"learning_rate": 2.746523046308914e-07,
"loss": 2.088,
"step": 6350
},
{
"epoch": 0.016611522037195535,
"grad_norm": 2.3959105014801025,
"learning_rate": 2.768152618259685e-07,
"loss": 2.1053,
"step": 6400
},
{
"epoch": 0.016741299553111124,
"grad_norm": 1.291269302368164,
"learning_rate": 2.789782190210456e-07,
"loss": 2.0576,
"step": 6450
},
{
"epoch": 0.016871077069026713,
"grad_norm": 3.1083991527557373,
"learning_rate": 2.811411762161227e-07,
"loss": 2.103,
"step": 6500
},
{
"epoch": 0.017000854584942306,
"grad_norm": 1.2403531074523926,
"learning_rate": 2.833041334111998e-07,
"loss": 2.1099,
"step": 6550
},
{
"epoch": 0.017130632100857895,
"grad_norm": 1.0552589893341064,
"learning_rate": 2.854670906062769e-07,
"loss": 2.1089,
"step": 6600
},
{
"epoch": 0.017260409616773484,
"grad_norm": 1.15003502368927,
"learning_rate": 2.8763004780135405e-07,
"loss": 2.1039,
"step": 6650
},
{
"epoch": 0.017390187132689073,
"grad_norm": 1.0832091569900513,
"learning_rate": 2.8979300499643113e-07,
"loss": 2.1084,
"step": 6700
},
{
"epoch": 0.017519964648604665,
"grad_norm": 1.3141324520111084,
"learning_rate": 2.9195596219150826e-07,
"loss": 2.1135,
"step": 6750
},
{
"epoch": 0.017649742164520255,
"grad_norm": 1.1030374765396118,
"learning_rate": 2.941189193865854e-07,
"loss": 2.1015,
"step": 6800
},
{
"epoch": 0.017779519680435844,
"grad_norm": 1.1264283657073975,
"learning_rate": 2.962818765816625e-07,
"loss": 2.1046,
"step": 6850
},
{
"epoch": 0.017909297196351436,
"grad_norm": 1.0352332592010498,
"learning_rate": 2.9844483377673956e-07,
"loss": 2.1123,
"step": 6900
},
{
"epoch": 0.018039074712267025,
"grad_norm": 1.6451269388198853,
"learning_rate": 3.006077909718167e-07,
"loss": 2.0875,
"step": 6950
},
{
"epoch": 0.018168852228182614,
"grad_norm": 1.0955110788345337,
"learning_rate": 3.027707481668938e-07,
"loss": 2.0838,
"step": 7000
},
{
"epoch": 0.018298629744098207,
"grad_norm": 1.128531813621521,
"learning_rate": 3.049337053619709e-07,
"loss": 2.1029,
"step": 7050
},
{
"epoch": 0.018428407260013796,
"grad_norm": 1.0489044189453125,
"learning_rate": 3.0709666255704805e-07,
"loss": 2.0908,
"step": 7100
},
{
"epoch": 0.018558184775929385,
"grad_norm": 1.0876027345657349,
"learning_rate": 3.0925961975212513e-07,
"loss": 2.0817,
"step": 7150
},
{
"epoch": 0.018687962291844974,
"grad_norm": 1.025060772895813,
"learning_rate": 3.1142257694720226e-07,
"loss": 2.0924,
"step": 7200
},
{
"epoch": 0.018817739807760567,
"grad_norm": 1.3133209943771362,
"learning_rate": 3.1358553414227934e-07,
"loss": 2.0848,
"step": 7250
},
{
"epoch": 0.018947517323676156,
"grad_norm": 1.159995436668396,
"learning_rate": 3.157484913373565e-07,
"loss": 2.0938,
"step": 7300
},
{
"epoch": 0.019077294839591745,
"grad_norm": 1.151329755783081,
"learning_rate": 3.1791144853243356e-07,
"loss": 2.1002,
"step": 7350
},
{
"epoch": 0.019207072355507337,
"grad_norm": 1.123695731163025,
"learning_rate": 3.200744057275107e-07,
"loss": 2.0951,
"step": 7400
},
{
"epoch": 0.019336849871422927,
"grad_norm": 1.143547534942627,
"learning_rate": 3.222373629225878e-07,
"loss": 2.1034,
"step": 7450
},
{
"epoch": 0.019466627387338516,
"grad_norm": 1.093329906463623,
"learning_rate": 3.2440032011766486e-07,
"loss": 2.0921,
"step": 7500
},
{
"epoch": 0.019596404903254105,
"grad_norm": 1.3572251796722412,
"learning_rate": 3.2656327731274204e-07,
"loss": 2.0902,
"step": 7550
},
{
"epoch": 0.019726182419169697,
"grad_norm": 1.146531343460083,
"learning_rate": 3.287262345078191e-07,
"loss": 2.0899,
"step": 7600
},
{
"epoch": 0.019855959935085286,
"grad_norm": 1.0585743188858032,
"learning_rate": 3.308891917028962e-07,
"loss": 2.108,
"step": 7650
},
{
"epoch": 0.019985737451000875,
"grad_norm": 1.1923290491104126,
"learning_rate": 3.3305214889797334e-07,
"loss": 2.098,
"step": 7700
},
{
"epoch": 0.020115514966916468,
"grad_norm": 1.1357568502426147,
"learning_rate": 3.352151060930504e-07,
"loss": 2.1021,
"step": 7750
},
{
"epoch": 0.020245292482832057,
"grad_norm": 1.2182716131210327,
"learning_rate": 3.373780632881275e-07,
"loss": 2.0816,
"step": 7800
},
{
"epoch": 0.020375069998747646,
"grad_norm": 1.1091363430023193,
"learning_rate": 3.395410204832047e-07,
"loss": 2.0832,
"step": 7850
},
{
"epoch": 0.02050484751466324,
"grad_norm": 1.1325336694717407,
"learning_rate": 3.4170397767828177e-07,
"loss": 2.0872,
"step": 7900
},
{
"epoch": 0.020634625030578828,
"grad_norm": 1.020922064781189,
"learning_rate": 3.4386693487335885e-07,
"loss": 2.0798,
"step": 7950
},
{
"epoch": 0.020764402546494417,
"grad_norm": 1.1414934396743774,
"learning_rate": 3.46029892068436e-07,
"loss": 2.0745,
"step": 8000
},
{
"epoch": 0.020894180062410006,
"grad_norm": 1.1155861616134644,
"learning_rate": 3.481928492635131e-07,
"loss": 2.0975,
"step": 8050
},
{
"epoch": 0.0210239575783256,
"grad_norm": 0.9747071266174316,
"learning_rate": 3.503558064585902e-07,
"loss": 2.0796,
"step": 8100
},
{
"epoch": 0.021153735094241188,
"grad_norm": 1.2895739078521729,
"learning_rate": 3.5251876365366734e-07,
"loss": 2.0951,
"step": 8150
},
{
"epoch": 0.021283512610156777,
"grad_norm": 1.147414207458496,
"learning_rate": 3.546817208487444e-07,
"loss": 2.0955,
"step": 8200
},
{
"epoch": 0.02141329012607237,
"grad_norm": 1.1811184883117676,
"learning_rate": 3.568446780438215e-07,
"loss": 2.0895,
"step": 8250
},
{
"epoch": 0.02154306764198796,
"grad_norm": 1.2630963325500488,
"learning_rate": 3.590076352388987e-07,
"loss": 2.0668,
"step": 8300
},
{
"epoch": 0.021672845157903547,
"grad_norm": 1.0883618593215942,
"learning_rate": 3.6117059243397577e-07,
"loss": 2.0981,
"step": 8350
},
{
"epoch": 0.021802622673819137,
"grad_norm": 1.2500261068344116,
"learning_rate": 3.6333354962905285e-07,
"loss": 2.0911,
"step": 8400
},
{
"epoch": 0.02193240018973473,
"grad_norm": 1.133091926574707,
"learning_rate": 3.6549650682413e-07,
"loss": 2.082,
"step": 8450
},
{
"epoch": 0.022062177705650318,
"grad_norm": 1.1500440835952759,
"learning_rate": 3.6765946401920707e-07,
"loss": 2.091,
"step": 8500
},
{
"epoch": 0.022191955221565907,
"grad_norm": 1.1573790311813354,
"learning_rate": 3.6982242121428415e-07,
"loss": 2.091,
"step": 8550
},
{
"epoch": 0.0223217327374815,
"grad_norm": 1.0022162199020386,
"learning_rate": 3.7198537840936134e-07,
"loss": 2.0734,
"step": 8600
},
{
"epoch": 0.02245151025339709,
"grad_norm": 1.2101961374282837,
"learning_rate": 3.741483356044384e-07,
"loss": 2.0976,
"step": 8650
},
{
"epoch": 0.022581287769312678,
"grad_norm": 1.1183929443359375,
"learning_rate": 3.763112927995155e-07,
"loss": 2.0742,
"step": 8700
},
{
"epoch": 0.02271106528522827,
"grad_norm": 1.1698428392410278,
"learning_rate": 3.784742499945927e-07,
"loss": 2.1189,
"step": 8750
},
{
"epoch": 0.02284084280114386,
"grad_norm": 1.238348126411438,
"learning_rate": 3.8063720718966977e-07,
"loss": 2.0864,
"step": 8800
},
{
"epoch": 0.02297062031705945,
"grad_norm": 1.0891568660736084,
"learning_rate": 3.8280016438474685e-07,
"loss": 2.071,
"step": 8850
},
{
"epoch": 0.023100397832975038,
"grad_norm": 1.0950003862380981,
"learning_rate": 3.84963121579824e-07,
"loss": 2.0944,
"step": 8900
},
{
"epoch": 0.02323017534889063,
"grad_norm": 1.0031663179397583,
"learning_rate": 3.8712607877490106e-07,
"loss": 2.0688,
"step": 8950
},
{
"epoch": 0.02335995286480622,
"grad_norm": 1.1025946140289307,
"learning_rate": 3.8928903596997815e-07,
"loss": 2.0853,
"step": 9000
},
{
"epoch": 0.02348973038072181,
"grad_norm": 1.0795261859893799,
"learning_rate": 3.9145199316505533e-07,
"loss": 2.0813,
"step": 9050
},
{
"epoch": 0.0236195078966374,
"grad_norm": 1.1669412851333618,
"learning_rate": 3.936149503601324e-07,
"loss": 2.0802,
"step": 9100
},
{
"epoch": 0.02374928541255299,
"grad_norm": 1.186626672744751,
"learning_rate": 3.957779075552095e-07,
"loss": 2.1015,
"step": 9150
},
{
"epoch": 0.02387906292846858,
"grad_norm": 1.053902506828308,
"learning_rate": 3.9794086475028663e-07,
"loss": 2.1003,
"step": 9200
},
{
"epoch": 0.02400884044438417,
"grad_norm": 1.1948777437210083,
"learning_rate": 4.001038219453637e-07,
"loss": 2.0914,
"step": 9250
},
{
"epoch": 0.02413861796029976,
"grad_norm": 1.0830193758010864,
"learning_rate": 4.0226677914044085e-07,
"loss": 2.0892,
"step": 9300
},
{
"epoch": 0.02426839547621535,
"grad_norm": 1.0737528800964355,
"learning_rate": 4.04429736335518e-07,
"loss": 2.0949,
"step": 9350
},
{
"epoch": 0.02439817299213094,
"grad_norm": 1.2443790435791016,
"learning_rate": 4.0659269353059506e-07,
"loss": 2.0725,
"step": 9400
},
{
"epoch": 0.02452795050804653,
"grad_norm": 0.9910159111022949,
"learning_rate": 4.0875565072567214e-07,
"loss": 2.092,
"step": 9450
},
{
"epoch": 0.02465772802396212,
"grad_norm": 1.1105308532714844,
"learning_rate": 4.1091860792074933e-07,
"loss": 2.0894,
"step": 9500
},
{
"epoch": 0.02478750553987771,
"grad_norm": 1.3401215076446533,
"learning_rate": 4.130815651158264e-07,
"loss": 2.091,
"step": 9550
},
{
"epoch": 0.024917283055793302,
"grad_norm": 1.1136138439178467,
"learning_rate": 4.152445223109035e-07,
"loss": 2.1022,
"step": 9600
},
{
"epoch": 0.02504706057170889,
"grad_norm": 1.1129764318466187,
"learning_rate": 4.1740747950598063e-07,
"loss": 2.0841,
"step": 9650
},
{
"epoch": 0.02517683808762448,
"grad_norm": 1.1361297369003296,
"learning_rate": 4.195704367010577e-07,
"loss": 2.0987,
"step": 9700
},
{
"epoch": 0.02530661560354007,
"grad_norm": 1.2290136814117432,
"learning_rate": 4.217333938961348e-07,
"loss": 2.0967,
"step": 9750
},
{
"epoch": 0.025436393119455662,
"grad_norm": 1.1932119131088257,
"learning_rate": 4.23896351091212e-07,
"loss": 2.1018,
"step": 9800
},
{
"epoch": 0.02556617063537125,
"grad_norm": 1.1398112773895264,
"learning_rate": 4.2605930828628906e-07,
"loss": 2.076,
"step": 9850
},
{
"epoch": 0.02569594815128684,
"grad_norm": 1.255175232887268,
"learning_rate": 4.2822226548136614e-07,
"loss": 2.0979,
"step": 9900
},
{
"epoch": 0.025825725667202433,
"grad_norm": 1.063835620880127,
"learning_rate": 4.303852226764433e-07,
"loss": 2.0982,
"step": 9950
},
{
"epoch": 0.025955503183118022,
"grad_norm": 1.0199131965637207,
"learning_rate": 4.3254817987152036e-07,
"loss": 2.077,
"step": 10000
}
],
"logging_steps": 50,
"max_steps": 1155822,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.488477719022797e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}