ModernBERT-MOSGrader / trainer_state.json
{
"best_metric": 0.6869426704202687,
"best_model_checkpoint": "/userstorage/modernbert-llm-grader/checkpoint-31216",
"epoch": 4.0,
"eval_steps": 500,
"global_step": 31216,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012813941568426449,
"grad_norm": 5.321617603302002,
"learning_rate": 4.987186058431574e-05,
"loss": 1.4033,
"step": 100
},
{
"epoch": 0.025627883136852898,
"grad_norm": 3.621730089187622,
"learning_rate": 4.974372116863147e-05,
"loss": 1.3035,
"step": 200
},
{
"epoch": 0.03844182470527934,
"grad_norm": 7.95962381362915,
"learning_rate": 4.961558175294721e-05,
"loss": 1.2506,
"step": 300
},
{
"epoch": 0.051255766273705795,
"grad_norm": 3.631398916244507,
"learning_rate": 4.9487442337262944e-05,
"loss": 1.2354,
"step": 400
},
{
"epoch": 0.06406970784213224,
"grad_norm": 2.6680333614349365,
"learning_rate": 4.935930292157868e-05,
"loss": 1.2397,
"step": 500
},
{
"epoch": 0.07688364941055868,
"grad_norm": 6.042360305786133,
"learning_rate": 4.9231163505894415e-05,
"loss": 1.1811,
"step": 600
},
{
"epoch": 0.08969759097898514,
"grad_norm": 6.7501959800720215,
"learning_rate": 4.9103024090210154e-05,
"loss": 1.1947,
"step": 700
},
{
"epoch": 0.10251153254741159,
"grad_norm": 3.4089736938476562,
"learning_rate": 4.8974884674525886e-05,
"loss": 1.1812,
"step": 800
},
{
"epoch": 0.11532547411583803,
"grad_norm": 5.0775604248046875,
"learning_rate": 4.884674525884162e-05,
"loss": 1.1752,
"step": 900
},
{
"epoch": 0.12813941568426448,
"grad_norm": 4.529630184173584,
"learning_rate": 4.8718605843157357e-05,
"loss": 1.1867,
"step": 1000
},
{
"epoch": 0.14095335725269093,
"grad_norm": 4.961220741271973,
"learning_rate": 4.859046642747309e-05,
"loss": 1.2129,
"step": 1100
},
{
"epoch": 0.15376729882111737,
"grad_norm": 4.113813400268555,
"learning_rate": 4.846232701178883e-05,
"loss": 1.1293,
"step": 1200
},
{
"epoch": 0.16658124038954383,
"grad_norm": 7.25917387008667,
"learning_rate": 4.8334187596104566e-05,
"loss": 1.1008,
"step": 1300
},
{
"epoch": 0.17939518195797027,
"grad_norm": 5.579372882843018,
"learning_rate": 4.82060481804203e-05,
"loss": 1.1327,
"step": 1400
},
{
"epoch": 0.1922091235263967,
"grad_norm": 9.794898986816406,
"learning_rate": 4.807790876473604e-05,
"loss": 1.1355,
"step": 1500
},
{
"epoch": 0.20502306509482318,
"grad_norm": 9.875951766967773,
"learning_rate": 4.794976934905177e-05,
"loss": 1.0057,
"step": 1600
},
{
"epoch": 0.21783700666324962,
"grad_norm": 7.271333694458008,
"learning_rate": 4.782162993336751e-05,
"loss": 1.1101,
"step": 1700
},
{
"epoch": 0.23065094823167606,
"grad_norm": 6.730026721954346,
"learning_rate": 4.769349051768324e-05,
"loss": 1.0762,
"step": 1800
},
{
"epoch": 0.2434648898001025,
"grad_norm": 5.596224784851074,
"learning_rate": 4.756535110199898e-05,
"loss": 1.0413,
"step": 1900
},
{
"epoch": 0.25627883136852897,
"grad_norm": 4.591865539550781,
"learning_rate": 4.743721168631472e-05,
"loss": 1.0593,
"step": 2000
},
{
"epoch": 0.2690927729369554,
"grad_norm": 6.357232570648193,
"learning_rate": 4.730907227063045e-05,
"loss": 1.0434,
"step": 2100
},
{
"epoch": 0.28190671450538185,
"grad_norm": 5.185873508453369,
"learning_rate": 4.718093285494619e-05,
"loss": 1.021,
"step": 2200
},
{
"epoch": 0.2947206560738083,
"grad_norm": 8.19482135772705,
"learning_rate": 4.705279343926192e-05,
"loss": 1.1102,
"step": 2300
},
{
"epoch": 0.30753459764223473,
"grad_norm": 6.1499176025390625,
"learning_rate": 4.692465402357765e-05,
"loss": 1.0115,
"step": 2400
},
{
"epoch": 0.3203485392106612,
"grad_norm": 11.092570304870605,
"learning_rate": 4.679651460789339e-05,
"loss": 0.9576,
"step": 2500
},
{
"epoch": 0.33316248077908767,
"grad_norm": 5.10243034362793,
"learning_rate": 4.666837519220912e-05,
"loss": 1.0674,
"step": 2600
},
{
"epoch": 0.3459764223475141,
"grad_norm": 4.633431434631348,
"learning_rate": 4.654023577652486e-05,
"loss": 1.0004,
"step": 2700
},
{
"epoch": 0.35879036391594055,
"grad_norm": 5.507874488830566,
"learning_rate": 4.6412096360840594e-05,
"loss": 1.0439,
"step": 2800
},
{
"epoch": 0.371604305484367,
"grad_norm": 5.591798305511475,
"learning_rate": 4.628395694515633e-05,
"loss": 1.0509,
"step": 2900
},
{
"epoch": 0.3844182470527934,
"grad_norm": 4.341959476470947,
"learning_rate": 4.6155817529472065e-05,
"loss": 0.9864,
"step": 3000
},
{
"epoch": 0.3972321886212199,
"grad_norm": 3.6542482376098633,
"learning_rate": 4.6027678113787804e-05,
"loss": 0.9897,
"step": 3100
},
{
"epoch": 0.41004613018964636,
"grad_norm": 6.769758701324463,
"learning_rate": 4.589953869810354e-05,
"loss": 1.0528,
"step": 3200
},
{
"epoch": 0.4228600717580728,
"grad_norm": 5.762277603149414,
"learning_rate": 4.5771399282419274e-05,
"loss": 1.0036,
"step": 3300
},
{
"epoch": 0.43567401332649924,
"grad_norm": 7.389179229736328,
"learning_rate": 4.564325986673501e-05,
"loss": 1.0304,
"step": 3400
},
{
"epoch": 0.44848795489492566,
"grad_norm": 3.9039294719696045,
"learning_rate": 4.5515120451050745e-05,
"loss": 0.9962,
"step": 3500
},
{
"epoch": 0.4613018964633521,
"grad_norm": 3.0561447143554688,
"learning_rate": 4.5386981035366484e-05,
"loss": 0.9777,
"step": 3600
},
{
"epoch": 0.4741158380317786,
"grad_norm": 6.340303897857666,
"learning_rate": 4.5258841619682216e-05,
"loss": 0.9603,
"step": 3700
},
{
"epoch": 0.486929779600205,
"grad_norm": 9.058144569396973,
"learning_rate": 4.5130702203997955e-05,
"loss": 0.9658,
"step": 3800
},
{
"epoch": 0.49974372116863147,
"grad_norm": 8.219672203063965,
"learning_rate": 4.500256278831369e-05,
"loss": 0.9856,
"step": 3900
},
{
"epoch": 0.5125576627370579,
"grad_norm": 3.6466543674468994,
"learning_rate": 4.487442337262942e-05,
"loss": 0.9734,
"step": 4000
},
{
"epoch": 0.5253716043054844,
"grad_norm": 7.289781093597412,
"learning_rate": 4.474628395694516e-05,
"loss": 0.9045,
"step": 4100
},
{
"epoch": 0.5381855458739108,
"grad_norm": 6.18227481842041,
"learning_rate": 4.461814454126089e-05,
"loss": 0.9679,
"step": 4200
},
{
"epoch": 0.5509994874423373,
"grad_norm": 3.994476318359375,
"learning_rate": 4.449000512557663e-05,
"loss": 0.8958,
"step": 4300
},
{
"epoch": 0.5638134290107637,
"grad_norm": 3.913896322250366,
"learning_rate": 4.436186570989236e-05,
"loss": 0.9453,
"step": 4400
},
{
"epoch": 0.5766273705791901,
"grad_norm": 4.39192008972168,
"learning_rate": 4.42337262942081e-05,
"loss": 0.9132,
"step": 4500
},
{
"epoch": 0.5894413121476166,
"grad_norm": 5.574671745300293,
"learning_rate": 4.410558687852384e-05,
"loss": 0.9069,
"step": 4600
},
{
"epoch": 0.602255253716043,
"grad_norm": 4.218778610229492,
"learning_rate": 4.397744746283957e-05,
"loss": 0.9631,
"step": 4700
},
{
"epoch": 0.6150691952844695,
"grad_norm": 7.804980754852295,
"learning_rate": 4.384930804715531e-05,
"loss": 0.9121,
"step": 4800
},
{
"epoch": 0.627883136852896,
"grad_norm": 7.064172744750977,
"learning_rate": 4.372116863147104e-05,
"loss": 0.9387,
"step": 4900
},
{
"epoch": 0.6406970784213224,
"grad_norm": 5.293111324310303,
"learning_rate": 4.359302921578678e-05,
"loss": 0.9264,
"step": 5000
},
{
"epoch": 0.6535110199897488,
"grad_norm": 7.019448757171631,
"learning_rate": 4.346488980010251e-05,
"loss": 0.9452,
"step": 5100
},
{
"epoch": 0.6663249615581753,
"grad_norm": 6.714709758758545,
"learning_rate": 4.333675038441825e-05,
"loss": 0.8648,
"step": 5200
},
{
"epoch": 0.6791389031266017,
"grad_norm": 8.232748031616211,
"learning_rate": 4.320861096873399e-05,
"loss": 0.904,
"step": 5300
},
{
"epoch": 0.6919528446950282,
"grad_norm": 9.853933334350586,
"learning_rate": 4.308047155304972e-05,
"loss": 0.8895,
"step": 5400
},
{
"epoch": 0.7047667862634547,
"grad_norm": 6.8710455894470215,
"learning_rate": 4.2952332137365454e-05,
"loss": 0.86,
"step": 5500
},
{
"epoch": 0.7175807278318811,
"grad_norm": 6.45287561416626,
"learning_rate": 4.2824192721681186e-05,
"loss": 0.8718,
"step": 5600
},
{
"epoch": 0.7303946694003075,
"grad_norm": 5.772899627685547,
"learning_rate": 4.2696053305996924e-05,
"loss": 0.8477,
"step": 5700
},
{
"epoch": 0.743208610968734,
"grad_norm": 6.193540573120117,
"learning_rate": 4.256791389031266e-05,
"loss": 0.9184,
"step": 5800
},
{
"epoch": 0.7560225525371604,
"grad_norm": 2.5397393703460693,
"learning_rate": 4.2439774474628395e-05,
"loss": 0.9537,
"step": 5900
},
{
"epoch": 0.7688364941055869,
"grad_norm": 8.280569076538086,
"learning_rate": 4.2311635058944134e-05,
"loss": 0.8988,
"step": 6000
},
{
"epoch": 0.7816504356740134,
"grad_norm": 10.563502311706543,
"learning_rate": 4.2183495643259866e-05,
"loss": 0.8518,
"step": 6100
},
{
"epoch": 0.7944643772424398,
"grad_norm": 3.090008497238159,
"learning_rate": 4.2055356227575605e-05,
"loss": 0.8731,
"step": 6200
},
{
"epoch": 0.8072783188108662,
"grad_norm": 4.051167011260986,
"learning_rate": 4.192721681189134e-05,
"loss": 0.8713,
"step": 6300
},
{
"epoch": 0.8200922603792927,
"grad_norm": 7.207763671875,
"learning_rate": 4.1799077396207076e-05,
"loss": 0.8781,
"step": 6400
},
{
"epoch": 0.8329062019477191,
"grad_norm": 6.396823883056641,
"learning_rate": 4.1670937980522815e-05,
"loss": 0.8231,
"step": 6500
},
{
"epoch": 0.8457201435161456,
"grad_norm": 6.260582447052002,
"learning_rate": 4.1542798564838547e-05,
"loss": 0.8658,
"step": 6600
},
{
"epoch": 0.858534085084572,
"grad_norm": 8.35356616973877,
"learning_rate": 4.1414659149154285e-05,
"loss": 0.8637,
"step": 6700
},
{
"epoch": 0.8713480266529985,
"grad_norm": 7.236725330352783,
"learning_rate": 4.128651973347002e-05,
"loss": 0.8525,
"step": 6800
},
{
"epoch": 0.8841619682214249,
"grad_norm": 14.001522064208984,
"learning_rate": 4.1158380317785756e-05,
"loss": 0.8628,
"step": 6900
},
{
"epoch": 0.8969759097898513,
"grad_norm": 4.257541179656982,
"learning_rate": 4.103024090210149e-05,
"loss": 0.8443,
"step": 7000
},
{
"epoch": 0.9097898513582778,
"grad_norm": 5.065970420837402,
"learning_rate": 4.090210148641722e-05,
"loss": 0.8329,
"step": 7100
},
{
"epoch": 0.9226037929267042,
"grad_norm": 6.647068977355957,
"learning_rate": 4.077396207073296e-05,
"loss": 0.8585,
"step": 7200
},
{
"epoch": 0.9354177344951307,
"grad_norm": 8.440242767333984,
"learning_rate": 4.064582265504869e-05,
"loss": 0.8749,
"step": 7300
},
{
"epoch": 0.9482316760635572,
"grad_norm": 7.684078216552734,
"learning_rate": 4.051768323936443e-05,
"loss": 0.7771,
"step": 7400
},
{
"epoch": 0.9610456176319836,
"grad_norm": 6.4709577560424805,
"learning_rate": 4.038954382368016e-05,
"loss": 0.8597,
"step": 7500
},
{
"epoch": 0.97385955920041,
"grad_norm": 4.3970489501953125,
"learning_rate": 4.02614044079959e-05,
"loss": 0.7852,
"step": 7600
},
{
"epoch": 0.9866735007688365,
"grad_norm": 9.167794227600098,
"learning_rate": 4.013326499231164e-05,
"loss": 0.8563,
"step": 7700
},
{
"epoch": 0.9994874423372629,
"grad_norm": 6.251096248626709,
"learning_rate": 4.000512557662737e-05,
"loss": 0.8243,
"step": 7800
},
{
"epoch": 1.0,
"eval_f1": 0.640692076906927,
"eval_loss": 0.8794865608215332,
"eval_runtime": 744.6214,
"eval_samples_per_second": 10.48,
"eval_steps_per_second": 2.62,
"step": 7804
},
{
"epoch": 1.0123013839056894,
"grad_norm": 5.928829669952393,
"learning_rate": 3.987698616094311e-05,
"loss": 0.7046,
"step": 7900
},
{
"epoch": 1.0251153254741159,
"grad_norm": 2.885106086730957,
"learning_rate": 3.974884674525884e-05,
"loss": 0.7663,
"step": 8000
},
{
"epoch": 1.0379292670425422,
"grad_norm": 5.951350212097168,
"learning_rate": 3.962070732957458e-05,
"loss": 0.7374,
"step": 8100
},
{
"epoch": 1.0507432086109687,
"grad_norm": 2.5160486698150635,
"learning_rate": 3.949256791389031e-05,
"loss": 0.7126,
"step": 8200
},
{
"epoch": 1.0635571501793952,
"grad_norm": 6.847401142120361,
"learning_rate": 3.936442849820605e-05,
"loss": 0.6785,
"step": 8300
},
{
"epoch": 1.0763710917478215,
"grad_norm": 4.729136943817139,
"learning_rate": 3.923628908252179e-05,
"loss": 0.7085,
"step": 8400
},
{
"epoch": 1.089185033316248,
"grad_norm": 5.535890102386475,
"learning_rate": 3.910814966683752e-05,
"loss": 0.7548,
"step": 8500
},
{
"epoch": 1.1019989748846746,
"grad_norm": 6.188892364501953,
"learning_rate": 3.8980010251153255e-05,
"loss": 0.7193,
"step": 8600
},
{
"epoch": 1.1148129164531009,
"grad_norm": 5.806282997131348,
"learning_rate": 3.885187083546899e-05,
"loss": 0.7143,
"step": 8700
},
{
"epoch": 1.1276268580215274,
"grad_norm": 10.726571083068848,
"learning_rate": 3.8723731419784726e-05,
"loss": 0.6892,
"step": 8800
},
{
"epoch": 1.140440799589954,
"grad_norm": 7.0307512283325195,
"learning_rate": 3.8595592004100465e-05,
"loss": 0.7264,
"step": 8900
},
{
"epoch": 1.1532547411583802,
"grad_norm": 20.715412139892578,
"learning_rate": 3.8467452588416197e-05,
"loss": 0.6987,
"step": 9000
},
{
"epoch": 1.1660686827268067,
"grad_norm": 6.620629787445068,
"learning_rate": 3.8339313172731935e-05,
"loss": 0.7041,
"step": 9100
},
{
"epoch": 1.1788826242952333,
"grad_norm": 5.27125883102417,
"learning_rate": 3.821117375704767e-05,
"loss": 0.67,
"step": 9200
},
{
"epoch": 1.1916965658636596,
"grad_norm": 6.010765552520752,
"learning_rate": 3.8083034341363406e-05,
"loss": 0.6737,
"step": 9300
},
{
"epoch": 1.204510507432086,
"grad_norm": 14.393863677978516,
"learning_rate": 3.795489492567914e-05,
"loss": 0.7097,
"step": 9400
},
{
"epoch": 1.2173244490005126,
"grad_norm": 6.37823486328125,
"learning_rate": 3.782675550999488e-05,
"loss": 0.7157,
"step": 9500
},
{
"epoch": 1.230138390568939,
"grad_norm": 11.626152992248535,
"learning_rate": 3.7698616094310616e-05,
"loss": 0.7066,
"step": 9600
},
{
"epoch": 1.2429523321373654,
"grad_norm": 5.520190238952637,
"learning_rate": 3.757047667862635e-05,
"loss": 0.7303,
"step": 9700
},
{
"epoch": 1.255766273705792,
"grad_norm": 9.865089416503906,
"learning_rate": 3.744233726294209e-05,
"loss": 0.7559,
"step": 9800
},
{
"epoch": 1.2685802152742183,
"grad_norm": 7.075952529907227,
"learning_rate": 3.731419784725782e-05,
"loss": 0.6941,
"step": 9900
},
{
"epoch": 1.2813941568426448,
"grad_norm": 3.4892656803131104,
"learning_rate": 3.718605843157356e-05,
"loss": 0.7164,
"step": 10000
},
{
"epoch": 1.2942080984110713,
"grad_norm": 9.843413352966309,
"learning_rate": 3.705791901588929e-05,
"loss": 0.695,
"step": 10100
},
{
"epoch": 1.3070220399794976,
"grad_norm": 12.128110885620117,
"learning_rate": 3.692977960020502e-05,
"loss": 0.6563,
"step": 10200
},
{
"epoch": 1.3198359815479241,
"grad_norm": 11.26876163482666,
"learning_rate": 3.680164018452076e-05,
"loss": 0.6803,
"step": 10300
},
{
"epoch": 1.3326499231163507,
"grad_norm": 12.95758056640625,
"learning_rate": 3.667350076883649e-05,
"loss": 0.6864,
"step": 10400
},
{
"epoch": 1.345463864684777,
"grad_norm": 4.91602897644043,
"learning_rate": 3.654536135315223e-05,
"loss": 0.7184,
"step": 10500
},
{
"epoch": 1.3582778062532035,
"grad_norm": 4.799069881439209,
"learning_rate": 3.641722193746796e-05,
"loss": 0.7558,
"step": 10600
},
{
"epoch": 1.37109174782163,
"grad_norm": 64.9485855102539,
"learning_rate": 3.62890825217837e-05,
"loss": 0.7292,
"step": 10700
},
{
"epoch": 1.3839056893900563,
"grad_norm": 6.147428512573242,
"learning_rate": 3.616094310609944e-05,
"loss": 0.6623,
"step": 10800
},
{
"epoch": 1.3967196309584828,
"grad_norm": 7.638481140136719,
"learning_rate": 3.603280369041517e-05,
"loss": 0.6981,
"step": 10900
},
{
"epoch": 1.4095335725269091,
"grad_norm": 4.798500061035156,
"learning_rate": 3.590466427473091e-05,
"loss": 0.7569,
"step": 11000
},
{
"epoch": 1.4223475140953357,
"grad_norm": 4.413691520690918,
"learning_rate": 3.5776524859046644e-05,
"loss": 0.6391,
"step": 11100
},
{
"epoch": 1.4351614556637622,
"grad_norm": 6.2526421546936035,
"learning_rate": 3.564838544336238e-05,
"loss": 0.7045,
"step": 11200
},
{
"epoch": 1.4479753972321885,
"grad_norm": 6.3732805252075195,
"learning_rate": 3.5520246027678114e-05,
"loss": 0.6916,
"step": 11300
},
{
"epoch": 1.460789338800615,
"grad_norm": 25.24698829650879,
"learning_rate": 3.539210661199385e-05,
"loss": 0.7652,
"step": 11400
},
{
"epoch": 1.4736032803690415,
"grad_norm": 4.716599941253662,
"learning_rate": 3.5263967196309585e-05,
"loss": 0.7199,
"step": 11500
},
{
"epoch": 1.4864172219374678,
"grad_norm": 13.750917434692383,
"learning_rate": 3.5135827780625324e-05,
"loss": 0.7032,
"step": 11600
},
{
"epoch": 1.4992311635058944,
"grad_norm": 3.6678273677825928,
"learning_rate": 3.500768836494106e-05,
"loss": 0.6821,
"step": 11700
},
{
"epoch": 1.5120451050743209,
"grad_norm": 7.891080856323242,
"learning_rate": 3.487954894925679e-05,
"loss": 0.7301,
"step": 11800
},
{
"epoch": 1.5248590466427472,
"grad_norm": 3.25317645072937,
"learning_rate": 3.475140953357253e-05,
"loss": 0.6665,
"step": 11900
},
{
"epoch": 1.5376729882111737,
"grad_norm": 12.75395679473877,
"learning_rate": 3.462327011788826e-05,
"loss": 0.733,
"step": 12000
},
{
"epoch": 1.5504869297796002,
"grad_norm": 10.9820556640625,
"learning_rate": 3.4495130702204e-05,
"loss": 0.7064,
"step": 12100
},
{
"epoch": 1.5633008713480265,
"grad_norm": 6.558383941650391,
"learning_rate": 3.4366991286519737e-05,
"loss": 0.7105,
"step": 12200
},
{
"epoch": 1.576114812916453,
"grad_norm": 8.5501070022583,
"learning_rate": 3.423885187083547e-05,
"loss": 0.706,
"step": 12300
},
{
"epoch": 1.5889287544848796,
"grad_norm": 5.319694995880127,
"learning_rate": 3.411071245515121e-05,
"loss": 0.7239,
"step": 12400
},
{
"epoch": 1.6017426960533059,
"grad_norm": 5.92519474029541,
"learning_rate": 3.398257303946694e-05,
"loss": 0.7043,
"step": 12500
},
{
"epoch": 1.6145566376217324,
"grad_norm": 8.853275299072266,
"learning_rate": 3.385443362378268e-05,
"loss": 0.6831,
"step": 12600
},
{
"epoch": 1.627370579190159,
"grad_norm": 9.30588150024414,
"learning_rate": 3.372629420809841e-05,
"loss": 0.6756,
"step": 12700
},
{
"epoch": 1.6401845207585852,
"grad_norm": 5.903197288513184,
"learning_rate": 3.359815479241415e-05,
"loss": 0.725,
"step": 12800
},
{
"epoch": 1.6529984623270118,
"grad_norm": 5.500326156616211,
"learning_rate": 3.347001537672989e-05,
"loss": 0.6801,
"step": 12900
},
{
"epoch": 1.6658124038954383,
"grad_norm": 7.896096229553223,
"learning_rate": 3.334187596104562e-05,
"loss": 0.6975,
"step": 13000
},
{
"epoch": 1.6786263454638646,
"grad_norm": 6.674001216888428,
"learning_rate": 3.321373654536136e-05,
"loss": 0.6681,
"step": 13100
},
{
"epoch": 1.691440287032291,
"grad_norm": 21.74435806274414,
"learning_rate": 3.308559712967709e-05,
"loss": 0.7045,
"step": 13200
},
{
"epoch": 1.7042542286007176,
"grad_norm": 6.329532146453857,
"learning_rate": 3.295745771399282e-05,
"loss": 0.6885,
"step": 13300
},
{
"epoch": 1.717068170169144,
"grad_norm": 24.047470092773438,
"learning_rate": 3.282931829830856e-05,
"loss": 0.7003,
"step": 13400
},
{
"epoch": 1.7298821117375704,
"grad_norm": 7.407759666442871,
"learning_rate": 3.2701178882624294e-05,
"loss": 0.6856,
"step": 13500
},
{
"epoch": 1.742696053305997,
"grad_norm": 5.755215167999268,
"learning_rate": 3.257303946694003e-05,
"loss": 0.7005,
"step": 13600
},
{
"epoch": 1.7555099948744233,
"grad_norm": 11.444562911987305,
"learning_rate": 3.2444900051255764e-05,
"loss": 0.7136,
"step": 13700
},
{
"epoch": 1.7683239364428498,
"grad_norm": 8.267853736877441,
"learning_rate": 3.23167606355715e-05,
"loss": 0.7029,
"step": 13800
},
{
"epoch": 1.7811378780112763,
"grad_norm": 6.73785924911499,
"learning_rate": 3.2188621219887235e-05,
"loss": 0.6572,
"step": 13900
},
{
"epoch": 1.7939518195797026,
"grad_norm": 5.369395732879639,
"learning_rate": 3.2060481804202974e-05,
"loss": 0.6617,
"step": 14000
},
{
"epoch": 1.8067657611481291,
"grad_norm": 2.288243293762207,
"learning_rate": 3.193234238851871e-05,
"loss": 0.6688,
"step": 14100
},
{
"epoch": 1.8195797027165557,
"grad_norm": 14.942804336547852,
"learning_rate": 3.1804202972834445e-05,
"loss": 0.6792,
"step": 14200
},
{
"epoch": 1.832393644284982,
"grad_norm": 8.988631248474121,
"learning_rate": 3.1676063557150184e-05,
"loss": 0.6518,
"step": 14300
},
{
"epoch": 1.8452075858534085,
"grad_norm": 7.9590630531311035,
"learning_rate": 3.1547924141465916e-05,
"loss": 0.6503,
"step": 14400
},
{
"epoch": 1.858021527421835,
"grad_norm": 9.33973503112793,
"learning_rate": 3.1419784725781655e-05,
"loss": 0.6647,
"step": 14500
},
{
"epoch": 1.8708354689902613,
"grad_norm": 9.39842700958252,
"learning_rate": 3.1291645310097387e-05,
"loss": 0.6515,
"step": 14600
},
{
"epoch": 1.8836494105586878,
"grad_norm": 10.142439842224121,
"learning_rate": 3.1163505894413125e-05,
"loss": 0.6794,
"step": 14700
},
{
"epoch": 1.8964633521271144,
"grad_norm": 11.658042907714844,
"learning_rate": 3.1035366478728864e-05,
"loss": 0.6931,
"step": 14800
},
{
"epoch": 1.9092772936955407,
"grad_norm": 8.672663688659668,
"learning_rate": 3.090722706304459e-05,
"loss": 0.6377,
"step": 14900
},
{
"epoch": 1.9220912352639672,
"grad_norm": 6.620725631713867,
"learning_rate": 3.077908764736033e-05,
"loss": 0.7044,
"step": 15000
},
{
"epoch": 1.9349051768323937,
"grad_norm": 8.3103609085083,
"learning_rate": 3.065094823167606e-05,
"loss": 0.641,
"step": 15100
},
{
"epoch": 1.94771911840082,
"grad_norm": 8.163315773010254,
"learning_rate": 3.05228088159918e-05,
"loss": 0.7094,
"step": 15200
},
{
"epoch": 1.9605330599692465,
"grad_norm": 3.6365621089935303,
"learning_rate": 3.0394669400307534e-05,
"loss": 0.7022,
"step": 15300
},
{
"epoch": 1.973347001537673,
"grad_norm": 4.264801502227783,
"learning_rate": 3.026652998462327e-05,
"loss": 0.6833,
"step": 15400
},
{
"epoch": 1.9861609431060994,
"grad_norm": 6.547428131103516,
"learning_rate": 3.0138390568939005e-05,
"loss": 0.6126,
"step": 15500
},
{
"epoch": 1.9989748846745259,
"grad_norm": 6.155936241149902,
"learning_rate": 3.0010251153254744e-05,
"loss": 0.6851,
"step": 15600
},
{
"epoch": 2.0,
"eval_f1": 0.6772664805551888,
"eval_loss": 0.781230092048645,
"eval_runtime": 778.3436,
"eval_samples_per_second": 10.026,
"eval_steps_per_second": 2.507,
"step": 15608
},
{
"epoch": 2.0117888262429524,
"grad_norm": 8.777030944824219,
"learning_rate": 2.988211173757048e-05,
"loss": 0.4707,
"step": 15700
},
{
"epoch": 2.0246027678113787,
"grad_norm": 4.798321723937988,
"learning_rate": 2.9753972321886215e-05,
"loss": 0.4366,
"step": 15800
},
{
"epoch": 2.037416709379805,
"grad_norm": 2.5244762897491455,
"learning_rate": 2.962583290620195e-05,
"loss": 0.504,
"step": 15900
},
{
"epoch": 2.0502306509482318,
"grad_norm": 15.636524200439453,
"learning_rate": 2.9497693490517686e-05,
"loss": 0.4234,
"step": 16000
},
{
"epoch": 2.063044592516658,
"grad_norm": 8.811060905456543,
"learning_rate": 2.936955407483342e-05,
"loss": 0.3911,
"step": 16100
},
{
"epoch": 2.0758585340850844,
"grad_norm": 4.1310930252075195,
"learning_rate": 2.9241414659149157e-05,
"loss": 0.4538,
"step": 16200
},
{
"epoch": 2.088672475653511,
"grad_norm": 9.516937255859375,
"learning_rate": 2.9113275243464892e-05,
"loss": 0.4461,
"step": 16300
},
{
"epoch": 2.1014864172219374,
"grad_norm": 4.6523756980896,
"learning_rate": 2.8985135827780624e-05,
"loss": 0.4808,
"step": 16400
},
{
"epoch": 2.1143003587903637,
"grad_norm": 4.160647392272949,
"learning_rate": 2.885699641209636e-05,
"loss": 0.4879,
"step": 16500
},
{
"epoch": 2.1271143003587905,
"grad_norm": 11.32701587677002,
"learning_rate": 2.8728856996412095e-05,
"loss": 0.4544,
"step": 16600
},
{
"epoch": 2.1399282419272168,
"grad_norm": 4.703444004058838,
"learning_rate": 2.860071758072783e-05,
"loss": 0.466,
"step": 16700
},
{
"epoch": 2.152742183495643,
"grad_norm": 8.985660552978516,
"learning_rate": 2.847257816504357e-05,
"loss": 0.4734,
"step": 16800
},
{
"epoch": 2.16555612506407,
"grad_norm": 12.306890487670898,
"learning_rate": 2.8344438749359304e-05,
"loss": 0.4287,
"step": 16900
},
{
"epoch": 2.178370066632496,
"grad_norm": 5.025609016418457,
"learning_rate": 2.821629933367504e-05,
"loss": 0.4657,
"step": 17000
},
{
"epoch": 2.1911840082009224,
"grad_norm": 31.554025650024414,
"learning_rate": 2.8088159917990775e-05,
"loss": 0.4378,
"step": 17100
},
{
"epoch": 2.203997949769349,
"grad_norm": 9.015434265136719,
"learning_rate": 2.796002050230651e-05,
"loss": 0.4538,
"step": 17200
},
{
"epoch": 2.2168118913377755,
"grad_norm": 15.61099624633789,
"learning_rate": 2.7831881086622246e-05,
"loss": 0.4134,
"step": 17300
},
{
"epoch": 2.2296258329062018,
"grad_norm": 10.191957473754883,
"learning_rate": 2.770374167093798e-05,
"loss": 0.5188,
"step": 17400
},
{
"epoch": 2.2424397744746285,
"grad_norm": 2.2506730556488037,
"learning_rate": 2.7575602255253717e-05,
"loss": 0.4028,
"step": 17500
},
{
"epoch": 2.255253716043055,
"grad_norm": 23.088764190673828,
"learning_rate": 2.7447462839569456e-05,
"loss": 0.4814,
"step": 17600
},
{
"epoch": 2.268067657611481,
"grad_norm": 4.473659515380859,
"learning_rate": 2.731932342388519e-05,
"loss": 0.4822,
"step": 17700
},
{
"epoch": 2.280881599179908,
"grad_norm": 2.1489970684051514,
"learning_rate": 2.7191184008200927e-05,
"loss": 0.4934,
"step": 17800
},
{
"epoch": 2.293695540748334,
"grad_norm": 1.4255170822143555,
"learning_rate": 2.7063044592516662e-05,
"loss": 0.4314,
"step": 17900
},
{
"epoch": 2.3065094823167605,
"grad_norm": 4.612204074859619,
"learning_rate": 2.693490517683239e-05,
"loss": 0.4322,
"step": 18000
},
{
"epoch": 2.319323423885187,
"grad_norm": 3.1022679805755615,
"learning_rate": 2.680676576114813e-05,
"loss": 0.424,
"step": 18100
},
{
"epoch": 2.3321373654536135,
"grad_norm": 3.745171070098877,
"learning_rate": 2.6678626345463865e-05,
"loss": 0.4269,
"step": 18200
},
{
"epoch": 2.34495130702204,
"grad_norm": 4.0442328453063965,
"learning_rate": 2.65504869297796e-05,
"loss": 0.4698,
"step": 18300
},
{
"epoch": 2.3577652485904665,
"grad_norm": 21.303607940673828,
"learning_rate": 2.6422347514095336e-05,
"loss": 0.4909,
"step": 18400
},
{
"epoch": 2.370579190158893,
"grad_norm": 9.175422668457031,
"learning_rate": 2.629420809841107e-05,
"loss": 0.4598,
"step": 18500
},
{
"epoch": 2.383393131727319,
"grad_norm": 5.787283420562744,
"learning_rate": 2.6166068682726807e-05,
"loss": 0.4409,
"step": 18600
},
{
"epoch": 2.396207073295746,
"grad_norm": 7.338250637054443,
"learning_rate": 2.6037929267042542e-05,
"loss": 0.4157,
"step": 18700
},
{
"epoch": 2.409021014864172,
"grad_norm": 13.879666328430176,
"learning_rate": 2.590978985135828e-05,
"loss": 0.4584,
"step": 18800
},
{
"epoch": 2.4218349564325985,
"grad_norm": 9.484577178955078,
"learning_rate": 2.5781650435674016e-05,
"loss": 0.4914,
"step": 18900
},
{
"epoch": 2.4346488980010252,
"grad_norm": 10.865300178527832,
"learning_rate": 2.565351101998975e-05,
"loss": 0.4259,
"step": 19000
},
{
"epoch": 2.4474628395694515,
"grad_norm": 16.69988441467285,
"learning_rate": 2.5525371604305487e-05,
"loss": 0.4563,
"step": 19100
},
{
"epoch": 2.460276781137878,
"grad_norm": 19.711631774902344,
"learning_rate": 2.5397232188621222e-05,
"loss": 0.4159,
"step": 19200
},
{
"epoch": 2.4730907227063046,
"grad_norm": 13.3755521774292,
"learning_rate": 2.5269092772936958e-05,
"loss": 0.537,
"step": 19300
},
{
"epoch": 2.485904664274731,
"grad_norm": 6.953076362609863,
"learning_rate": 2.5140953357252693e-05,
"loss": 0.4288,
"step": 19400
},
{
"epoch": 2.498718605843157,
"grad_norm": 47.91322708129883,
"learning_rate": 2.5012813941568432e-05,
"loss": 0.5049,
"step": 19500
},
{
"epoch": 2.511532547411584,
"grad_norm": 1.6553832292556763,
"learning_rate": 2.4884674525884164e-05,
"loss": 0.4779,
"step": 19600
},
{
"epoch": 2.5243464889800102,
"grad_norm": 12.199808120727539,
"learning_rate": 2.47565351101999e-05,
"loss": 0.4246,
"step": 19700
},
{
"epoch": 2.5371604305484365,
"grad_norm": 11.326825141906738,
"learning_rate": 2.4628395694515635e-05,
"loss": 0.4482,
"step": 19800
},
{
"epoch": 2.5499743721168633,
"grad_norm": 9.247246742248535,
"learning_rate": 2.450025627883137e-05,
"loss": 0.4656,
"step": 19900
},
{
"epoch": 2.5627883136852896,
"grad_norm": 1.773540735244751,
"learning_rate": 2.4372116863147106e-05,
"loss": 0.4776,
"step": 20000
},
{
"epoch": 2.575602255253716,
"grad_norm": 7.454749584197998,
"learning_rate": 2.424397744746284e-05,
"loss": 0.4161,
"step": 20100
},
{
"epoch": 2.5884161968221426,
"grad_norm": 19.77891731262207,
"learning_rate": 2.4115838031778577e-05,
"loss": 0.4609,
"step": 20200
},
{
"epoch": 2.601230138390569,
"grad_norm": 12.208200454711914,
"learning_rate": 2.3987698616094312e-05,
"loss": 0.453,
"step": 20300
},
{
"epoch": 2.6140440799589952,
"grad_norm": 11.438812255859375,
"learning_rate": 2.3859559200410047e-05,
"loss": 0.4439,
"step": 20400
},
{
"epoch": 2.626858021527422,
"grad_norm": 1.6863147020339966,
"learning_rate": 2.3731419784725783e-05,
"loss": 0.3987,
"step": 20500
},
{
"epoch": 2.6396719630958483,
"grad_norm": 1.3637946844100952,
"learning_rate": 2.3603280369041518e-05,
"loss": 0.4523,
"step": 20600
},
{
"epoch": 2.6524859046642746,
"grad_norm": 21.555208206176758,
"learning_rate": 2.3475140953357254e-05,
"loss": 0.4624,
"step": 20700
},
{
"epoch": 2.6652998462327013,
"grad_norm": 8.768684387207031,
"learning_rate": 2.334700153767299e-05,
"loss": 0.4585,
"step": 20800
},
{
"epoch": 2.6781137878011276,
"grad_norm": 3.2959704399108887,
"learning_rate": 2.3218862121988724e-05,
"loss": 0.4579,
"step": 20900
},
{
"epoch": 2.690927729369554,
"grad_norm": 16.97565269470215,
"learning_rate": 2.309072270630446e-05,
"loss": 0.4132,
"step": 21000
},
{
"epoch": 2.7037416709379807,
"grad_norm": 14.613641738891602,
"learning_rate": 2.2962583290620195e-05,
"loss": 0.4297,
"step": 21100
},
{
"epoch": 2.716555612506407,
"grad_norm": 28.61090087890625,
"learning_rate": 2.283444387493593e-05,
"loss": 0.4479,
"step": 21200
},
{
"epoch": 2.7293695540748333,
"grad_norm": 9.84257984161377,
"learning_rate": 2.2706304459251666e-05,
"loss": 0.4428,
"step": 21300
},
{
"epoch": 2.74218349564326,
"grad_norm": 8.199345588684082,
"learning_rate": 2.2578165043567405e-05,
"loss": 0.3999,
"step": 21400
},
{
"epoch": 2.7549974372116863,
"grad_norm": 15.411248207092285,
"learning_rate": 2.2450025627883137e-05,
"loss": 0.4423,
"step": 21500
},
{
"epoch": 2.7678113787801126,
"grad_norm": 7.122200012207031,
"learning_rate": 2.2321886212198872e-05,
"loss": 0.4675,
"step": 21600
},
{
"epoch": 2.7806253203485394,
"grad_norm": 11.358266830444336,
"learning_rate": 2.2193746796514608e-05,
"loss": 0.4885,
"step": 21700
},
{
"epoch": 2.7934392619169657,
"grad_norm": 9.456644058227539,
"learning_rate": 2.2065607380830343e-05,
"loss": 0.4973,
"step": 21800
},
{
"epoch": 2.806253203485392,
"grad_norm": 28.7235164642334,
"learning_rate": 2.193746796514608e-05,
"loss": 0.429,
"step": 21900
},
{
"epoch": 2.8190671450538183,
"grad_norm": 14.859136581420898,
"learning_rate": 2.1809328549461817e-05,
"loss": 0.4867,
"step": 22000
},
{
"epoch": 2.831881086622245,
"grad_norm": 3.089897394180298,
"learning_rate": 2.1681189133777553e-05,
"loss": 0.4249,
"step": 22100
},
{
"epoch": 2.8446950281906713,
"grad_norm": 14.606719970703125,
"learning_rate": 2.1553049718093288e-05,
"loss": 0.4429,
"step": 22200
},
{
"epoch": 2.857508969759098,
"grad_norm": 7.761451244354248,
"learning_rate": 2.142491030240902e-05,
"loss": 0.4639,
"step": 22300
},
{
"epoch": 2.8703229113275244,
"grad_norm": 6.9101362228393555,
"learning_rate": 2.1296770886724756e-05,
"loss": 0.4606,
"step": 22400
},
{
"epoch": 2.8831368528959507,
"grad_norm": 6.754969120025635,
"learning_rate": 2.116863147104049e-05,
"loss": 0.4784,
"step": 22500
},
{
"epoch": 2.895950794464377,
"grad_norm": 20.884119033813477,
"learning_rate": 2.104049205535623e-05,
"loss": 0.4625,
"step": 22600
},
{
"epoch": 2.9087647360328037,
"grad_norm": 18.428529739379883,
"learning_rate": 2.0912352639671965e-05,
"loss": 0.4121,
"step": 22700
},
{
"epoch": 2.92157867760123,
"grad_norm": 9.211915969848633,
"learning_rate": 2.07842132239877e-05,
"loss": 0.457,
"step": 22800
},
{
"epoch": 2.9343926191696568,
"grad_norm": 5.744906425476074,
"learning_rate": 2.0656073808303436e-05,
"loss": 0.4169,
"step": 22900
},
{
"epoch": 2.947206560738083,
"grad_norm": 10.679366111755371,
"learning_rate": 2.052793439261917e-05,
"loss": 0.4719,
"step": 23000
},
{
"epoch": 2.9600205023065094,
"grad_norm": 8.72630500793457,
"learning_rate": 2.0399794976934904e-05,
"loss": 0.4743,
"step": 23100
},
{
"epoch": 2.9728344438749357,
"grad_norm": 5.53284215927124,
"learning_rate": 2.0271655561250642e-05,
"loss": 0.4592,
"step": 23200
},
{
"epoch": 2.9856483854433624,
"grad_norm": 10.75283432006836,
"learning_rate": 2.0143516145566378e-05,
"loss": 0.3971,
"step": 23300
},
{
"epoch": 2.9984623270117887,
"grad_norm": 10.634764671325684,
"learning_rate": 2.0015376729882113e-05,
"loss": 0.4295,
"step": 23400
},
{
"epoch": 3.0,
"eval_f1": 0.6853715205850849,
"eval_loss": 1.0191140174865723,
"eval_runtime": 837.6905,
"eval_samples_per_second": 9.316,
"eval_steps_per_second": 2.329,
"step": 23412
},
{
"epoch": 3.0112762685802155,
"grad_norm": 3.43902587890625,
"learning_rate": 1.988723731419785e-05,
"loss": 0.2448,
"step": 23500
},
{
"epoch": 3.0240902101486418,
"grad_norm": 0.5649552941322327,
"learning_rate": 1.9759097898513584e-05,
"loss": 0.1908,
"step": 23600
},
{
"epoch": 3.036904151717068,
"grad_norm": 1.3035610914230347,
"learning_rate": 1.963095848282932e-05,
"loss": 0.275,
"step": 23700
},
{
"epoch": 3.049718093285495,
"grad_norm": 27.42232322692871,
"learning_rate": 1.9502819067145055e-05,
"loss": 0.2727,
"step": 23800
},
{
"epoch": 3.062532034853921,
"grad_norm": 1.675907015800476,
"learning_rate": 1.937467965146079e-05,
"loss": 0.2916,
"step": 23900
},
{
"epoch": 3.0753459764223474,
"grad_norm": 9.602179527282715,
"learning_rate": 1.9246540235776526e-05,
"loss": 0.2645,
"step": 24000
},
{
"epoch": 3.088159917990774,
"grad_norm": 16.757831573486328,
"learning_rate": 1.911840082009226e-05,
"loss": 0.2476,
"step": 24100
},
{
"epoch": 3.1009738595592005,
"grad_norm": 5.842043876647949,
"learning_rate": 1.8990261404407997e-05,
"loss": 0.2829,
"step": 24200
},
{
"epoch": 3.1137878011276268,
"grad_norm": 0.593449592590332,
"learning_rate": 1.8862121988723732e-05,
"loss": 0.289,
"step": 24300
},
{
"epoch": 3.1266017426960535,
"grad_norm": 5.712982177734375,
"learning_rate": 1.8733982573039467e-05,
"loss": 0.2355,
"step": 24400
},
{
"epoch": 3.13941568426448,
"grad_norm": 0.3152589201927185,
"learning_rate": 1.8605843157355203e-05,
"loss": 0.2491,
"step": 24500
},
{
"epoch": 3.152229625832906,
"grad_norm": 19.951833724975586,
"learning_rate": 1.8477703741670938e-05,
"loss": 0.271,
"step": 24600
},
{
"epoch": 3.165043567401333,
"grad_norm": 5.257028579711914,
"learning_rate": 1.8349564325986674e-05,
"loss": 0.277,
"step": 24700
},
{
"epoch": 3.177857508969759,
"grad_norm": 3.6717381477355957,
"learning_rate": 1.822142491030241e-05,
"loss": 0.2736,
"step": 24800
},
{
"epoch": 3.1906714505381855,
"grad_norm": 38.49631881713867,
"learning_rate": 1.8093285494618144e-05,
"loss": 0.2789,
"step": 24900
},
{
"epoch": 3.2034853921066118,
"grad_norm": 5.944704055786133,
"learning_rate": 1.796514607893388e-05,
"loss": 0.3111,
"step": 25000
},
{
"epoch": 3.2162993336750385,
"grad_norm": 3.278078079223633,
"learning_rate": 1.7837006663249615e-05,
"loss": 0.287,
"step": 25100
},
{
"epoch": 3.229113275243465,
"grad_norm": 13.320869445800781,
"learning_rate": 1.7708867247565354e-05,
"loss": 0.2708,
"step": 25200
},
{
"epoch": 3.2419272168118916,
"grad_norm": 9.01321029663086,
"learning_rate": 1.758072783188109e-05,
"loss": 0.2891,
"step": 25300
},
{
"epoch": 3.254741158380318,
"grad_norm": 14.35201644897461,
"learning_rate": 1.745258841619682e-05,
"loss": 0.1523,
"step": 25400
},
{
"epoch": 3.267555099948744,
"grad_norm": 5.268370628356934,
"learning_rate": 1.7324449000512557e-05,
"loss": 0.3608,
"step": 25500
},
{
"epoch": 3.2803690415171705,
"grad_norm": 3.338168144226074,
"learning_rate": 1.7196309584828292e-05,
"loss": 0.2829,
"step": 25600
},
{
"epoch": 3.293182983085597,
"grad_norm": 12.441572189331055,
"learning_rate": 1.7068170169144028e-05,
"loss": 0.2563,
"step": 25700
},
{
"epoch": 3.3059969246540235,
"grad_norm": 2.870978832244873,
"learning_rate": 1.6940030753459767e-05,
"loss": 0.2957,
"step": 25800
},
{
"epoch": 3.3188108662224503,
"grad_norm": 10.626642227172852,
"learning_rate": 1.6811891337775502e-05,
"loss": 0.3493,
"step": 25900
},
{
"epoch": 3.3316248077908766,
"grad_norm": 1.1796225309371948,
"learning_rate": 1.6683751922091237e-05,
"loss": 0.293,
"step": 26000
},
{
"epoch": 3.344438749359303,
"grad_norm": 46.64753341674805,
"learning_rate": 1.6555612506406973e-05,
"loss": 0.2739,
"step": 26100
},
{
"epoch": 3.357252690927729,
"grad_norm": 17.778207778930664,
"learning_rate": 1.6427473090722705e-05,
"loss": 0.2897,
"step": 26200
},
{
"epoch": 3.370066632496156,
"grad_norm": 1.6698403358459473,
"learning_rate": 1.629933367503844e-05,
"loss": 0.2661,
"step": 26300
},
{
"epoch": 3.382880574064582,
"grad_norm": 0.18206116557121277,
"learning_rate": 1.617119425935418e-05,
"loss": 0.2847,
"step": 26400
},
{
"epoch": 3.395694515633009,
"grad_norm": 6.839690208435059,
"learning_rate": 1.6043054843669915e-05,
"loss": 0.3044,
"step": 26500
},
{
"epoch": 3.4085084572014352,
"grad_norm": 0.6313930749893188,
"learning_rate": 1.591491542798565e-05,
"loss": 0.2623,
"step": 26600
},
{
"epoch": 3.4213223987698616,
"grad_norm": 70.23905181884766,
"learning_rate": 1.5786776012301385e-05,
"loss": 0.2573,
"step": 26700
},
{
"epoch": 3.434136340338288,
"grad_norm": 16.72913360595703,
"learning_rate": 1.565863659661712e-05,
"loss": 0.2626,
"step": 26800
},
{
"epoch": 3.4469502819067146,
"grad_norm": 43.662845611572266,
"learning_rate": 1.5530497180932856e-05,
"loss": 0.2679,
"step": 26900
},
{
"epoch": 3.459764223475141,
"grad_norm": 20.96466064453125,
"learning_rate": 1.540235776524859e-05,
"loss": 0.3082,
"step": 27000
},
{
"epoch": 3.4725781650435676,
"grad_norm": 45.02407455444336,
"learning_rate": 1.5274218349564327e-05,
"loss": 0.2492,
"step": 27100
},
{
"epoch": 3.485392106611994,
"grad_norm": 14.404077529907227,
"learning_rate": 1.5146078933880062e-05,
"loss": 0.2704,
"step": 27200
},
{
"epoch": 3.4982060481804202,
"grad_norm": 19.40283966064453,
"learning_rate": 1.5017939518195798e-05,
"loss": 0.3089,
"step": 27300
},
{
"epoch": 3.5110199897488465,
"grad_norm": 13.016902923583984,
"learning_rate": 1.4889800102511533e-05,
"loss": 0.2953,
"step": 27400
},
{
"epoch": 3.5238339313172733,
"grad_norm": 6.934922695159912,
"learning_rate": 1.4761660686827269e-05,
"loss": 0.2132,
"step": 27500
},
{
"epoch": 3.5366478728856996,
"grad_norm": 49.58895492553711,
"learning_rate": 1.4633521271143006e-05,
"loss": 0.271,
"step": 27600
},
{
"epoch": 3.5494618144541263,
"grad_norm": 4.814508438110352,
"learning_rate": 1.4505381855458741e-05,
"loss": 0.3195,
"step": 27700
},
{
"epoch": 3.5622757560225526,
"grad_norm": 28.65342903137207,
"learning_rate": 1.4377242439774475e-05,
"loss": 0.2869,
"step": 27800
},
{
"epoch": 3.575089697590979,
"grad_norm": 5.931487083435059,
"learning_rate": 1.424910302409021e-05,
"loss": 0.2982,
"step": 27900
},
{
"epoch": 3.5879036391594052,
"grad_norm": 0.22432470321655273,
"learning_rate": 1.4120963608405946e-05,
"loss": 0.3167,
"step": 28000
},
{
"epoch": 3.600717580727832,
"grad_norm": 27.89299964904785,
"learning_rate": 1.3992824192721681e-05,
"loss": 0.2831,
"step": 28100
},
{
"epoch": 3.6135315222962583,
"grad_norm": 6.232203006744385,
"learning_rate": 1.3864684777037418e-05,
"loss": 0.2328,
"step": 28200
},
{
"epoch": 3.626345463864685,
"grad_norm": 0.3798358738422394,
"learning_rate": 1.3736545361353154e-05,
"loss": 0.2565,
"step": 28300
},
{
"epoch": 3.6391594054331113,
"grad_norm": 2.3177566528320312,
"learning_rate": 1.3608405945668889e-05,
"loss": 0.2822,
"step": 28400
},
{
"epoch": 3.6519733470015376,
"grad_norm": 0.9287611246109009,
"learning_rate": 1.3480266529984623e-05,
"loss": 0.2206,
"step": 28500
},
{
"epoch": 3.664787288569964,
"grad_norm": 19.89398765563965,
"learning_rate": 1.3352127114300358e-05,
"loss": 0.2934,
"step": 28600
},
{
"epoch": 3.6776012301383907,
"grad_norm": 14.735712051391602,
"learning_rate": 1.3223987698616094e-05,
"loss": 0.2667,
"step": 28700
},
{
"epoch": 3.690415171706817,
"grad_norm": 2.782954454421997,
"learning_rate": 1.309584828293183e-05,
"loss": 0.2565,
"step": 28800
},
{
"epoch": 3.7032291132752437,
"grad_norm": 20.082395553588867,
"learning_rate": 1.2967708867247566e-05,
"loss": 0.3069,
"step": 28900
},
{
"epoch": 3.71604305484367,
"grad_norm": 1.8632967472076416,
"learning_rate": 1.2839569451563302e-05,
"loss": 0.2484,
"step": 29000
},
{
"epoch": 3.7288569964120963,
"grad_norm": 6.2880330085754395,
"learning_rate": 1.2711430035879037e-05,
"loss": 0.2769,
"step": 29100
},
{
"epoch": 3.7416709379805226,
"grad_norm": 18.328922271728516,
"learning_rate": 1.2583290620194774e-05,
"loss": 0.284,
"step": 29200
},
{
"epoch": 3.7544848795489494,
"grad_norm": 0.2658964991569519,
"learning_rate": 1.2455151204510508e-05,
"loss": 0.2725,
"step": 29300
},
{
"epoch": 3.7672988211173757,
"grad_norm": 7.819123268127441,
"learning_rate": 1.2327011788826243e-05,
"loss": 0.2513,
"step": 29400
},
{
"epoch": 3.7801127626858024,
"grad_norm": 4.6279144287109375,
"learning_rate": 1.2198872373141979e-05,
"loss": 0.2573,
"step": 29500
},
{
"epoch": 3.7929267042542287,
"grad_norm": 24.996662139892578,
"learning_rate": 1.2070732957457714e-05,
"loss": 0.2621,
"step": 29600
},
{
"epoch": 3.805740645822655,
"grad_norm": 20.87746810913086,
"learning_rate": 1.194259354177345e-05,
"loss": 0.2499,
"step": 29700
},
{
"epoch": 3.8185545873910813,
"grad_norm": 1.5061414241790771,
"learning_rate": 1.1814454126089187e-05,
"loss": 0.265,
"step": 29800
},
{
"epoch": 3.831368528959508,
"grad_norm": 2.7230064868927,
"learning_rate": 1.168631471040492e-05,
"loss": 0.2469,
"step": 29900
},
{
"epoch": 3.8441824705279344,
"grad_norm": 0.6768075823783875,
"learning_rate": 1.1558175294720656e-05,
"loss": 0.2686,
"step": 30000
},
{
"epoch": 3.8569964120963607,
"grad_norm": 0.08343211561441422,
"learning_rate": 1.1430035879036393e-05,
"loss": 0.2565,
"step": 30100
},
{
"epoch": 3.8698103536647874,
"grad_norm": 25.58348274230957,
"learning_rate": 1.1301896463352128e-05,
"loss": 0.2967,
"step": 30200
},
{
"epoch": 3.8826242952332137,
"grad_norm": 1.0459709167480469,
"learning_rate": 1.1173757047667862e-05,
"loss": 0.3028,
"step": 30300
},
{
"epoch": 3.89543823680164,
"grad_norm": 0.33878639340400696,
"learning_rate": 1.1045617631983599e-05,
"loss": 0.2243,
"step": 30400
},
{
"epoch": 3.9082521783700668,
"grad_norm": 2.021047592163086,
"learning_rate": 1.0917478216299335e-05,
"loss": 0.3656,
"step": 30500
},
{
"epoch": 3.921066119938493,
"grad_norm": 1.6855653524398804,
"learning_rate": 1.078933880061507e-05,
"loss": 0.2323,
"step": 30600
},
{
"epoch": 3.9338800615069194,
"grad_norm": 21.66104507446289,
"learning_rate": 1.0661199384930805e-05,
"loss": 0.2205,
"step": 30700
},
{
"epoch": 3.946694003075346,
"grad_norm": 2.4428458213806152,
"learning_rate": 1.053305996924654e-05,
"loss": 0.2436,
"step": 30800
},
{
"epoch": 3.9595079446437724,
"grad_norm": 39.37623596191406,
"learning_rate": 1.0404920553562276e-05,
"loss": 0.2831,
"step": 30900
},
{
"epoch": 3.9723218862121987,
"grad_norm": 44.4313850402832,
"learning_rate": 1.0276781137878012e-05,
"loss": 0.2522,
"step": 31000
},
{
"epoch": 3.9851358277806255,
"grad_norm": 2.6004929542541504,
"learning_rate": 1.0148641722193747e-05,
"loss": 0.3209,
"step": 31100
},
{
"epoch": 3.9979497693490518,
"grad_norm": 2.536029815673828,
"learning_rate": 1.0020502306509482e-05,
"loss": 0.2807,
"step": 31200
},
{
"epoch": 4.0,
"eval_f1": 0.6869426704202687,
"eval_loss": 2.03011155128479,
"eval_runtime": 825.2142,
"eval_samples_per_second": 9.457,
"eval_steps_per_second": 2.364,
"step": 31216
}
],
"logging_steps": 100,
"max_steps": 39020,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.105948110057636e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
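
The state above can be consumed programmatically, e.g. to recover the best checkpoint and the per-epoch eval_f1 values. A minimal sketch using only the standard library, assuming the file has been saved locally as trainer_state.json (the path is illustrative, not part of this repository):

    # Minimal sketch: parse trainer_state.json and summarize training/eval logs.
    # "trainer_state.json" is an assumed local path; point it at the checkpoint
    # directory that actually contains this file.
    import json

    with open("trainer_state.json") as f:
        state = json.load(f)

    # Top-level summary of the best evaluation seen so far.
    print("best eval_f1:", state["best_metric"])
    print("best checkpoint:", state["best_model_checkpoint"])

    # log_history mixes per-100-step training entries (which carry "loss") and
    # per-epoch evaluation entries (which carry "eval_f1"); split them first.
    train_logs = [e for e in state["log_history"] if "loss" in e]
    eval_logs = [e for e in state["log_history"] if "eval_f1" in e]

    for e in eval_logs:
        print(f'epoch {e["epoch"]:.0f}: '
              f'eval_f1={e["eval_f1"]:.4f} eval_loss={e["eval_loss"]:.4f}')

Run against this file, the loop would print one line per completed epoch (four here), which makes the pattern visible at a glance: eval_f1 keeps improving slightly while eval_loss rises after epoch 2, consistent with the best checkpoint being the last one at step 31216.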