{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9921259842519685,
"eval_steps": 500,
"global_step": 380,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005249343832020997,
"grad_norm": 4.262586140207107,
"learning_rate": 1.2500000000000002e-07,
"loss": 1.2143,
"step": 1
},
{
"epoch": 0.010498687664041995,
"grad_norm": 4.1559742669756154,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.2307,
"step": 2
},
{
"epoch": 0.015748031496062992,
"grad_norm": 4.2196647284049895,
"learning_rate": 3.75e-07,
"loss": 1.2286,
"step": 3
},
{
"epoch": 0.02099737532808399,
"grad_norm": 4.13634077943981,
"learning_rate": 5.000000000000001e-07,
"loss": 1.2002,
"step": 4
},
{
"epoch": 0.026246719160104987,
"grad_norm": 4.015668455829927,
"learning_rate": 6.25e-07,
"loss": 1.1672,
"step": 5
},
{
"epoch": 0.031496062992125984,
"grad_norm": 3.832855314884781,
"learning_rate": 7.5e-07,
"loss": 1.1993,
"step": 6
},
{
"epoch": 0.03674540682414698,
"grad_norm": 3.8323407788221733,
"learning_rate": 8.75e-07,
"loss": 1.1554,
"step": 7
},
{
"epoch": 0.04199475065616798,
"grad_norm": 3.7465244180174917,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.1672,
"step": 8
},
{
"epoch": 0.047244094488188976,
"grad_norm": 3.7827251172961986,
"learning_rate": 1.125e-06,
"loss": 1.1755,
"step": 9
},
{
"epoch": 0.05249343832020997,
"grad_norm": 3.470602675526565,
"learning_rate": 1.25e-06,
"loss": 1.1419,
"step": 10
},
{
"epoch": 0.05774278215223097,
"grad_norm": 3.556221853917274,
"learning_rate": 1.3750000000000002e-06,
"loss": 1.194,
"step": 11
},
{
"epoch": 0.06299212598425197,
"grad_norm": 3.324934060085957,
"learning_rate": 1.5e-06,
"loss": 1.1336,
"step": 12
},
{
"epoch": 0.06824146981627296,
"grad_norm": 2.965981688480075,
"learning_rate": 1.6250000000000001e-06,
"loss": 1.1349,
"step": 13
},
{
"epoch": 0.07349081364829396,
"grad_norm": 2.8658973663115046,
"learning_rate": 1.75e-06,
"loss": 1.1776,
"step": 14
},
{
"epoch": 0.07874015748031496,
"grad_norm": 2.720689909744523,
"learning_rate": 1.8750000000000003e-06,
"loss": 1.1549,
"step": 15
},
{
"epoch": 0.08398950131233596,
"grad_norm": 2.439062154183451,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.141,
"step": 16
},
{
"epoch": 0.08923884514435695,
"grad_norm": 2.1353279033918002,
"learning_rate": 2.125e-06,
"loss": 1.139,
"step": 17
},
{
"epoch": 0.09448818897637795,
"grad_norm": 1.2092875477650313,
"learning_rate": 2.25e-06,
"loss": 1.0516,
"step": 18
},
{
"epoch": 0.09973753280839895,
"grad_norm": 1.0763847439987342,
"learning_rate": 2.375e-06,
"loss": 1.0802,
"step": 19
},
{
"epoch": 0.10498687664041995,
"grad_norm": 1.0340250902529846,
"learning_rate": 2.5e-06,
"loss": 1.0607,
"step": 20
},
{
"epoch": 0.11023622047244094,
"grad_norm": 0.8630354040462489,
"learning_rate": 2.6250000000000003e-06,
"loss": 1.0496,
"step": 21
},
{
"epoch": 0.11548556430446194,
"grad_norm": 0.8072735722523627,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.08,
"step": 22
},
{
"epoch": 0.12073490813648294,
"grad_norm": 0.7305262800316248,
"learning_rate": 2.875e-06,
"loss": 1.0539,
"step": 23
},
{
"epoch": 0.12598425196850394,
"grad_norm": 0.7406394208995156,
"learning_rate": 3e-06,
"loss": 1.0529,
"step": 24
},
{
"epoch": 0.13123359580052493,
"grad_norm": 0.7903255346265977,
"learning_rate": 3.125e-06,
"loss": 0.9914,
"step": 25
},
{
"epoch": 0.13648293963254593,
"grad_norm": 0.8017049075586423,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.0313,
"step": 26
},
{
"epoch": 0.14173228346456693,
"grad_norm": 0.7764417012146556,
"learning_rate": 3.3750000000000003e-06,
"loss": 1.0422,
"step": 27
},
{
"epoch": 0.14698162729658792,
"grad_norm": 0.7112571206699242,
"learning_rate": 3.5e-06,
"loss": 1.0187,
"step": 28
},
{
"epoch": 0.15223097112860892,
"grad_norm": 0.6069574236656299,
"learning_rate": 3.625e-06,
"loss": 0.9958,
"step": 29
},
{
"epoch": 0.15748031496062992,
"grad_norm": 0.5747553586770598,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.9858,
"step": 30
},
{
"epoch": 0.16272965879265092,
"grad_norm": 0.590510044443011,
"learning_rate": 3.875e-06,
"loss": 0.9841,
"step": 31
},
{
"epoch": 0.1679790026246719,
"grad_norm": 0.5240510507345066,
"learning_rate": 4.000000000000001e-06,
"loss": 1.0171,
"step": 32
},
{
"epoch": 0.1732283464566929,
"grad_norm": 0.4913378134702597,
"learning_rate": 4.125e-06,
"loss": 1.0218,
"step": 33
},
{
"epoch": 0.1784776902887139,
"grad_norm": 0.47909247638813307,
"learning_rate": 4.25e-06,
"loss": 1.02,
"step": 34
},
{
"epoch": 0.1837270341207349,
"grad_norm": 0.4949722185401137,
"learning_rate": 4.3750000000000005e-06,
"loss": 1.0206,
"step": 35
},
{
"epoch": 0.1889763779527559,
"grad_norm": 0.48456911030733396,
"learning_rate": 4.5e-06,
"loss": 1.0321,
"step": 36
},
{
"epoch": 0.1942257217847769,
"grad_norm": 0.49965866704889284,
"learning_rate": 4.625000000000001e-06,
"loss": 1.0184,
"step": 37
},
{
"epoch": 0.1994750656167979,
"grad_norm": 0.4898360279427832,
"learning_rate": 4.75e-06,
"loss": 1.0165,
"step": 38
},
{
"epoch": 0.2047244094488189,
"grad_norm": 0.5039246597121783,
"learning_rate": 4.875e-06,
"loss": 0.9811,
"step": 39
},
{
"epoch": 0.2099737532808399,
"grad_norm": 0.462371012430454,
"learning_rate": 5e-06,
"loss": 0.9999,
"step": 40
},
{
"epoch": 0.2152230971128609,
"grad_norm": 0.4564370676360458,
"learning_rate": 4.99989327925842e-06,
"loss": 1.0056,
"step": 41
},
{
"epoch": 0.2204724409448819,
"grad_norm": 0.4838548423542603,
"learning_rate": 4.999573126145132e-06,
"loss": 1.0099,
"step": 42
},
{
"epoch": 0.22572178477690288,
"grad_norm": 0.47657399375831033,
"learning_rate": 4.999039567993719e-06,
"loss": 1.0059,
"step": 43
},
{
"epoch": 0.23097112860892388,
"grad_norm": 0.4609228968128241,
"learning_rate": 4.998292650357558e-06,
"loss": 0.9613,
"step": 44
},
{
"epoch": 0.23622047244094488,
"grad_norm": 0.5379441335347738,
"learning_rate": 4.997332437005932e-06,
"loss": 0.9912,
"step": 45
},
{
"epoch": 0.24146981627296588,
"grad_norm": 0.4488276134846175,
"learning_rate": 4.996159009918586e-06,
"loss": 0.9623,
"step": 46
},
{
"epoch": 0.24671916010498687,
"grad_norm": 0.7899879742350473,
"learning_rate": 4.994772469278726e-06,
"loss": 0.9373,
"step": 47
},
{
"epoch": 0.25196850393700787,
"grad_norm": 0.45351107525432893,
"learning_rate": 4.99317293346447e-06,
"loss": 0.9312,
"step": 48
},
{
"epoch": 0.2572178477690289,
"grad_norm": 0.4145057356223518,
"learning_rate": 4.991360539038737e-06,
"loss": 0.9133,
"step": 49
},
{
"epoch": 0.26246719160104987,
"grad_norm": 0.4371226648356658,
"learning_rate": 4.989335440737587e-06,
"loss": 0.9763,
"step": 50
},
{
"epoch": 0.2677165354330709,
"grad_norm": 0.49987252416920314,
"learning_rate": 4.987097811457015e-06,
"loss": 0.9753,
"step": 51
},
{
"epoch": 0.27296587926509186,
"grad_norm": 0.4737378597917066,
"learning_rate": 4.984647842238185e-06,
"loss": 0.9509,
"step": 52
},
{
"epoch": 0.2782152230971129,
"grad_norm": 0.4803218242006868,
"learning_rate": 4.981985742251123e-06,
"loss": 1.0008,
"step": 53
},
{
"epoch": 0.28346456692913385,
"grad_norm": 0.4238793960297473,
"learning_rate": 4.9791117387768575e-06,
"loss": 1.0024,
"step": 54
},
{
"epoch": 0.2887139107611549,
"grad_norm": 0.4135698693902407,
"learning_rate": 4.976026077188013e-06,
"loss": 0.9208,
"step": 55
},
{
"epoch": 0.29396325459317585,
"grad_norm": 0.49893312330659967,
"learning_rate": 4.972729020927866e-06,
"loss": 0.9771,
"step": 56
},
{
"epoch": 0.2992125984251969,
"grad_norm": 0.43878390791709027,
"learning_rate": 4.9692208514878445e-06,
"loss": 0.937,
"step": 57
},
{
"epoch": 0.30446194225721784,
"grad_norm": 0.42250251333050837,
"learning_rate": 4.965501868383507e-06,
"loss": 0.9287,
"step": 58
},
{
"epoch": 0.30971128608923887,
"grad_norm": 0.42426620742455357,
"learning_rate": 4.961572389128959e-06,
"loss": 0.9374,
"step": 59
},
{
"epoch": 0.31496062992125984,
"grad_norm": 0.4622588583575654,
"learning_rate": 4.957432749209755e-06,
"loss": 0.99,
"step": 60
},
{
"epoch": 0.32020997375328086,
"grad_norm": 0.4324798534582787,
"learning_rate": 4.953083302054247e-06,
"loss": 1.0035,
"step": 61
},
{
"epoch": 0.32545931758530183,
"grad_norm": 0.4303590460079348,
"learning_rate": 4.948524419003415e-06,
"loss": 0.9585,
"step": 62
},
{
"epoch": 0.33070866141732286,
"grad_norm": 0.42861048906851473,
"learning_rate": 4.943756489279164e-06,
"loss": 0.9772,
"step": 63
},
{
"epoch": 0.3359580052493438,
"grad_norm": 0.4115697149677722,
"learning_rate": 4.938779919951092e-06,
"loss": 0.9426,
"step": 64
},
{
"epoch": 0.34120734908136485,
"grad_norm": 0.417300147661056,
"learning_rate": 4.933595135901733e-06,
"loss": 0.9447,
"step": 65
},
{
"epoch": 0.3464566929133858,
"grad_norm": 0.4040601468423496,
"learning_rate": 4.928202579790285e-06,
"loss": 0.966,
"step": 66
},
{
"epoch": 0.35170603674540685,
"grad_norm": 0.3677161548087925,
"learning_rate": 4.9226027120148195e-06,
"loss": 0.941,
"step": 67
},
{
"epoch": 0.3569553805774278,
"grad_norm": 0.3832286188469758,
"learning_rate": 4.916796010672969e-06,
"loss": 0.9822,
"step": 68
},
{
"epoch": 0.36220472440944884,
"grad_norm": 0.4345291089971557,
"learning_rate": 4.910782971521112e-06,
"loss": 0.9687,
"step": 69
},
{
"epoch": 0.3674540682414698,
"grad_norm": 0.4133843702108161,
"learning_rate": 4.904564107932048e-06,
"loss": 0.9283,
"step": 70
},
{
"epoch": 0.37270341207349084,
"grad_norm": 0.38465081625519515,
"learning_rate": 4.898139950851163e-06,
"loss": 0.9479,
"step": 71
},
{
"epoch": 0.3779527559055118,
"grad_norm": 0.424340283488392,
"learning_rate": 4.891511048751102e-06,
"loss": 0.9593,
"step": 72
},
{
"epoch": 0.38320209973753283,
"grad_norm": 0.40638719244646565,
"learning_rate": 4.884677967584945e-06,
"loss": 0.9264,
"step": 73
},
{
"epoch": 0.3884514435695538,
"grad_norm": 0.4168684939745582,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.9399,
"step": 74
},
{
"epoch": 0.3937007874015748,
"grad_norm": 0.3928820462579945,
"learning_rate": 4.870401618977415e-06,
"loss": 0.9736,
"step": 75
},
{
"epoch": 0.3989501312335958,
"grad_norm": 0.48005159652101537,
"learning_rate": 4.86295957040205e-06,
"loss": 0.9935,
"step": 76
},
{
"epoch": 0.4041994750656168,
"grad_norm": 0.4229896582271501,
"learning_rate": 4.855315780388541e-06,
"loss": 0.9358,
"step": 77
},
{
"epoch": 0.4094488188976378,
"grad_norm": 0.4258651800976577,
"learning_rate": 4.847470901537642e-06,
"loss": 0.9583,
"step": 78
},
{
"epoch": 0.4146981627296588,
"grad_norm": 0.4069975102427097,
"learning_rate": 4.839425603618382e-06,
"loss": 0.9237,
"step": 79
},
{
"epoch": 0.4199475065616798,
"grad_norm": 0.3853018091795304,
"learning_rate": 4.83118057351089e-06,
"loss": 0.9528,
"step": 80
},
{
"epoch": 0.4251968503937008,
"grad_norm": 0.41640976933906576,
"learning_rate": 4.822736515147748e-06,
"loss": 0.9281,
"step": 81
},
{
"epoch": 0.4304461942257218,
"grad_norm": 0.4341414623727107,
"learning_rate": 4.814094149453891e-06,
"loss": 0.983,
"step": 82
},
{
"epoch": 0.4356955380577428,
"grad_norm": 0.41893835966431175,
"learning_rate": 4.805254214285061e-06,
"loss": 0.9691,
"step": 83
},
{
"epoch": 0.4409448818897638,
"grad_norm": 0.3778203088676148,
"learning_rate": 4.796217464364808e-06,
"loss": 0.9386,
"step": 84
},
{
"epoch": 0.4461942257217848,
"grad_norm": 0.4049971835755209,
"learning_rate": 4.786984671220053e-06,
"loss": 0.9604,
"step": 85
},
{
"epoch": 0.45144356955380577,
"grad_norm": 0.39430021496671025,
"learning_rate": 4.7775566231152216e-06,
"loss": 0.9625,
"step": 86
},
{
"epoch": 0.4566929133858268,
"grad_norm": 0.4303135030837173,
"learning_rate": 4.767934124984941e-06,
"loss": 0.9421,
"step": 87
},
{
"epoch": 0.46194225721784776,
"grad_norm": 0.4037289734626591,
"learning_rate": 4.7581179983653224e-06,
"loss": 0.9395,
"step": 88
},
{
"epoch": 0.4671916010498688,
"grad_norm": 0.389977925556389,
"learning_rate": 4.7481090813238145e-06,
"loss": 0.9494,
"step": 89
},
{
"epoch": 0.47244094488188976,
"grad_norm": 0.4221791671439025,
"learning_rate": 4.737908228387656e-06,
"loss": 0.9236,
"step": 90
},
{
"epoch": 0.4776902887139108,
"grad_norm": 0.38571752654505004,
"learning_rate": 4.72751631047092e-06,
"loss": 0.9836,
"step": 91
},
{
"epoch": 0.48293963254593175,
"grad_norm": 0.3756683416227555,
"learning_rate": 4.716934214800155e-06,
"loss": 0.9847,
"step": 92
},
{
"epoch": 0.4881889763779528,
"grad_norm": 0.43805821160946656,
"learning_rate": 4.70616284483864e-06,
"loss": 0.9759,
"step": 93
},
{
"epoch": 0.49343832020997375,
"grad_norm": 0.42186430904629324,
"learning_rate": 4.695203120209245e-06,
"loss": 0.9381,
"step": 94
},
{
"epoch": 0.49868766404199477,
"grad_norm": 0.3816672833085737,
"learning_rate": 4.684055976615924e-06,
"loss": 0.9381,
"step": 95
},
{
"epoch": 0.5039370078740157,
"grad_norm": 0.4151475033902527,
"learning_rate": 4.672722365763821e-06,
"loss": 0.9449,
"step": 96
},
{
"epoch": 0.5091863517060368,
"grad_norm": 0.39782263887391656,
"learning_rate": 4.66120325527802e-06,
"loss": 0.9617,
"step": 97
},
{
"epoch": 0.5144356955380578,
"grad_norm": 0.40910931809868095,
"learning_rate": 4.649499628620931e-06,
"loss": 0.9334,
"step": 98
},
{
"epoch": 0.5196850393700787,
"grad_norm": 0.3910070221772484,
"learning_rate": 4.637612485008328e-06,
"loss": 0.9344,
"step": 99
},
{
"epoch": 0.5249343832020997,
"grad_norm": 0.3983780342477285,
"learning_rate": 4.625542839324036e-06,
"loss": 0.9201,
"step": 100
},
{
"epoch": 0.5301837270341208,
"grad_norm": 0.41440216503056093,
"learning_rate": 4.613291722033285e-06,
"loss": 0.9854,
"step": 101
},
{
"epoch": 0.5354330708661418,
"grad_norm": 0.399617487921351,
"learning_rate": 4.600860179094732e-06,
"loss": 0.9317,
"step": 102
},
{
"epoch": 0.5406824146981627,
"grad_norm": 0.38269422826680943,
"learning_rate": 4.588249271871164e-06,
"loss": 0.9026,
"step": 103
},
{
"epoch": 0.5459317585301837,
"grad_norm": 0.4161308186315819,
"learning_rate": 4.575460077038877e-06,
"loss": 0.9402,
"step": 104
},
{
"epoch": 0.5511811023622047,
"grad_norm": 0.38768080702286994,
"learning_rate": 4.562493686495756e-06,
"loss": 0.9276,
"step": 105
},
{
"epoch": 0.5564304461942258,
"grad_norm": 0.4454064598197059,
"learning_rate": 4.5493512072680535e-06,
"loss": 0.9452,
"step": 106
},
{
"epoch": 0.5616797900262467,
"grad_norm": 0.3695333796660741,
"learning_rate": 4.536033761415871e-06,
"loss": 0.9493,
"step": 107
},
{
"epoch": 0.5669291338582677,
"grad_norm": 0.4111826761670116,
"learning_rate": 4.522542485937369e-06,
"loss": 0.9109,
"step": 108
},
{
"epoch": 0.5721784776902887,
"grad_norm": 0.384299356893817,
"learning_rate": 4.508878532671684e-06,
"loss": 0.938,
"step": 109
},
{
"epoch": 0.5774278215223098,
"grad_norm": 0.39579785919606036,
"learning_rate": 4.4950430682005995e-06,
"loss": 0.924,
"step": 110
},
{
"epoch": 0.5826771653543307,
"grad_norm": 0.40856237862457445,
"learning_rate": 4.481037273748935e-06,
"loss": 0.9092,
"step": 111
},
{
"epoch": 0.5879265091863517,
"grad_norm": 0.38651275760437465,
"learning_rate": 4.4668623450837085e-06,
"loss": 0.9311,
"step": 112
},
{
"epoch": 0.5931758530183727,
"grad_norm": 0.378464413540759,
"learning_rate": 4.452519492412039e-06,
"loss": 0.9255,
"step": 113
},
{
"epoch": 0.5984251968503937,
"grad_norm": 0.3779607392785478,
"learning_rate": 4.438009940277825e-06,
"loss": 0.9024,
"step": 114
},
{
"epoch": 0.6036745406824147,
"grad_norm": 0.410919812420786,
"learning_rate": 4.423334927457198e-06,
"loss": 0.9105,
"step": 115
},
{
"epoch": 0.6089238845144357,
"grad_norm": 0.41911457133998964,
"learning_rate": 4.408495706852758e-06,
"loss": 0.9483,
"step": 116
},
{
"epoch": 0.6141732283464567,
"grad_norm": 0.39458100856937656,
"learning_rate": 4.393493545386607e-06,
"loss": 0.9388,
"step": 117
},
{
"epoch": 0.6194225721784777,
"grad_norm": 0.40200990450921853,
"learning_rate": 4.378329723892184e-06,
"loss": 0.927,
"step": 118
},
{
"epoch": 0.6246719160104987,
"grad_norm": 0.42346347209583185,
"learning_rate": 4.3630055370049065e-06,
"loss": 0.9439,
"step": 119
},
{
"epoch": 0.6299212598425197,
"grad_norm": 0.40678132773256936,
"learning_rate": 4.3475222930516484e-06,
"loss": 0.994,
"step": 120
},
{
"epoch": 0.6351706036745407,
"grad_norm": 0.4215857320252809,
"learning_rate": 4.3318813139390295e-06,
"loss": 0.8946,
"step": 121
},
{
"epoch": 0.6404199475065617,
"grad_norm": 0.39311078218513845,
"learning_rate": 4.316083935040561e-06,
"loss": 0.9129,
"step": 122
},
{
"epoch": 0.6456692913385826,
"grad_norm": 0.38855202294392815,
"learning_rate": 4.300131505082637e-06,
"loss": 0.9229,
"step": 123
},
{
"epoch": 0.6509186351706037,
"grad_norm": 0.437029643792513,
"learning_rate": 4.284025386029381e-06,
"loss": 0.9151,
"step": 124
},
{
"epoch": 0.6561679790026247,
"grad_norm": 0.41975844377898647,
"learning_rate": 4.267766952966369e-06,
"loss": 0.9224,
"step": 125
},
{
"epoch": 0.6614173228346457,
"grad_norm": 0.39675424131353054,
"learning_rate": 4.251357593983228e-06,
"loss": 0.9371,
"step": 126
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.3980469727389334,
"learning_rate": 4.234798710055124e-06,
"loss": 0.928,
"step": 127
},
{
"epoch": 0.6719160104986877,
"grad_norm": 0.3868709015712275,
"learning_rate": 4.218091714923157e-06,
"loss": 0.925,
"step": 128
},
{
"epoch": 0.6771653543307087,
"grad_norm": 0.4150283358784954,
"learning_rate": 4.2012380349736544e-06,
"loss": 0.9632,
"step": 129
},
{
"epoch": 0.6824146981627297,
"grad_norm": 0.3965769285465639,
"learning_rate": 4.184239109116393e-06,
"loss": 0.916,
"step": 130
},
{
"epoch": 0.6876640419947506,
"grad_norm": 0.4040770175389451,
"learning_rate": 4.167096388661754e-06,
"loss": 0.9013,
"step": 131
},
{
"epoch": 0.6929133858267716,
"grad_norm": 0.4065859256340288,
"learning_rate": 4.149811337196808e-06,
"loss": 0.9115,
"step": 132
},
{
"epoch": 0.6981627296587927,
"grad_norm": 0.39095752489168806,
"learning_rate": 4.132385430460361e-06,
"loss": 0.9401,
"step": 133
},
{
"epoch": 0.7034120734908137,
"grad_norm": 0.37457240781741025,
"learning_rate": 4.114820156216969e-06,
"loss": 0.9178,
"step": 134
},
{
"epoch": 0.7086614173228346,
"grad_norm": 0.399797934048477,
"learning_rate": 4.097117014129903e-06,
"loss": 0.9579,
"step": 135
},
{
"epoch": 0.7139107611548556,
"grad_norm": 0.4477976289426918,
"learning_rate": 4.079277515633127e-06,
"loss": 0.9448,
"step": 136
},
{
"epoch": 0.7191601049868767,
"grad_norm": 0.38322925064614577,
"learning_rate": 4.061303183802248e-06,
"loss": 0.9192,
"step": 137
},
{
"epoch": 0.7244094488188977,
"grad_norm": 0.40594305556063015,
"learning_rate": 4.043195553224482e-06,
"loss": 0.9254,
"step": 138
},
{
"epoch": 0.7296587926509186,
"grad_norm": 0.36120175115175723,
"learning_rate": 4.024956169867642e-06,
"loss": 0.9591,
"step": 139
},
{
"epoch": 0.7349081364829396,
"grad_norm": 0.38003421628995726,
"learning_rate": 4.006586590948141e-06,
"loss": 0.9154,
"step": 140
},
{
"epoch": 0.7401574803149606,
"grad_norm": 0.3779196146369032,
"learning_rate": 3.9880883847980475e-06,
"loss": 0.9412,
"step": 141
},
{
"epoch": 0.7454068241469817,
"grad_norm": 0.3908051818324559,
"learning_rate": 3.969463130731183e-06,
"loss": 0.9037,
"step": 142
},
{
"epoch": 0.7506561679790026,
"grad_norm": 0.4043989800878751,
"learning_rate": 3.95071241890829e-06,
"loss": 0.9032,
"step": 143
},
{
"epoch": 0.7559055118110236,
"grad_norm": 0.396255171092691,
"learning_rate": 3.9318378502012636e-06,
"loss": 0.9288,
"step": 144
},
{
"epoch": 0.7611548556430446,
"grad_norm": 0.40757200005039956,
"learning_rate": 3.91284103605648e-06,
"loss": 0.9179,
"step": 145
},
{
"epoch": 0.7664041994750657,
"grad_norm": 0.3862826523020415,
"learning_rate": 3.893723598357214e-06,
"loss": 0.8894,
"step": 146
},
{
"epoch": 0.7716535433070866,
"grad_norm": 0.3770664118848218,
"learning_rate": 3.874487169285168e-06,
"loss": 0.8898,
"step": 147
},
{
"epoch": 0.7769028871391076,
"grad_norm": 0.3854873052422692,
"learning_rate": 3.855133391181124e-06,
"loss": 0.9135,
"step": 148
},
{
"epoch": 0.7821522309711286,
"grad_norm": 0.38997928953459443,
"learning_rate": 3.835663916404721e-06,
"loss": 0.8843,
"step": 149
},
{
"epoch": 0.7874015748031497,
"grad_norm": 0.45712052623884775,
"learning_rate": 3.81608040719339e-06,
"loss": 0.9694,
"step": 150
},
{
"epoch": 0.7926509186351706,
"grad_norm": 0.41182630339038373,
"learning_rate": 3.7963845355204303e-06,
"loss": 0.9194,
"step": 151
},
{
"epoch": 0.7979002624671916,
"grad_norm": 0.4065794661619882,
"learning_rate": 3.7765779829522674e-06,
"loss": 0.9278,
"step": 152
},
{
"epoch": 0.8031496062992126,
"grad_norm": 0.38122636394786497,
"learning_rate": 3.7566624405048847e-06,
"loss": 0.892,
"step": 153
},
{
"epoch": 0.8083989501312336,
"grad_norm": 0.4026809828765533,
"learning_rate": 3.736639608499448e-06,
"loss": 0.9246,
"step": 154
},
{
"epoch": 0.8136482939632546,
"grad_norm": 0.3884561226623423,
"learning_rate": 3.7165111964171407e-06,
"loss": 0.9438,
"step": 155
},
{
"epoch": 0.8188976377952756,
"grad_norm": 0.3844815855908215,
"learning_rate": 3.6962789227532165e-06,
"loss": 0.9316,
"step": 156
},
{
"epoch": 0.8241469816272966,
"grad_norm": 0.3715029261772477,
"learning_rate": 3.675944514870274e-06,
"loss": 0.924,
"step": 157
},
{
"epoch": 0.8293963254593176,
"grad_norm": 0.4062567635907239,
"learning_rate": 3.6555097088507837e-06,
"loss": 0.9616,
"step": 158
},
{
"epoch": 0.8346456692913385,
"grad_norm": 0.40370287012056855,
"learning_rate": 3.634976249348867e-06,
"loss": 0.9526,
"step": 159
},
{
"epoch": 0.8398950131233596,
"grad_norm": 0.3899308479039462,
"learning_rate": 3.6143458894413463e-06,
"loss": 0.9215,
"step": 160
},
{
"epoch": 0.8451443569553806,
"grad_norm": 0.3813874729696962,
"learning_rate": 3.5936203904780665e-06,
"loss": 0.953,
"step": 161
},
{
"epoch": 0.8503937007874016,
"grad_norm": 0.38224710953621555,
"learning_rate": 3.5728015219315226e-06,
"loss": 0.894,
"step": 162
},
{
"epoch": 0.8556430446194225,
"grad_norm": 0.4038493099805114,
"learning_rate": 3.5518910612457885e-06,
"loss": 0.9614,
"step": 163
},
{
"epoch": 0.8608923884514436,
"grad_norm": 0.3976228258032158,
"learning_rate": 3.530890793684759e-06,
"loss": 0.9364,
"step": 164
},
{
"epoch": 0.8661417322834646,
"grad_norm": 0.4141980981190029,
"learning_rate": 3.5098025121797375e-06,
"loss": 0.9316,
"step": 165
},
{
"epoch": 0.8713910761154856,
"grad_norm": 0.4156903411116242,
"learning_rate": 3.4886280171763563e-06,
"loss": 0.923,
"step": 166
},
{
"epoch": 0.8766404199475065,
"grad_norm": 0.38250613966133334,
"learning_rate": 3.467369116480864e-06,
"loss": 0.9153,
"step": 167
},
{
"epoch": 0.8818897637795275,
"grad_norm": 0.4070344049084728,
"learning_rate": 3.446027625105776e-06,
"loss": 0.9347,
"step": 168
},
{
"epoch": 0.8871391076115486,
"grad_norm": 0.3844877252304378,
"learning_rate": 3.424605365114923e-06,
"loss": 0.9214,
"step": 169
},
{
"epoch": 0.8923884514435696,
"grad_norm": 0.37585915847896717,
"learning_rate": 3.403104165467883e-06,
"loss": 0.9133,
"step": 170
},
{
"epoch": 0.8976377952755905,
"grad_norm": 0.6300998021233689,
"learning_rate": 3.3815258618638316e-06,
"loss": 0.9395,
"step": 171
},
{
"epoch": 0.9028871391076115,
"grad_norm": 0.3994171061317929,
"learning_rate": 3.359872296584821e-06,
"loss": 0.917,
"step": 172
},
{
"epoch": 0.9081364829396326,
"grad_norm": 0.3806190962949758,
"learning_rate": 3.338145318338485e-06,
"loss": 0.9408,
"step": 173
},
{
"epoch": 0.9133858267716536,
"grad_norm": 0.36777162273585867,
"learning_rate": 3.3163467821002082e-06,
"loss": 0.9346,
"step": 174
},
{
"epoch": 0.9186351706036745,
"grad_norm": 0.3927948155295108,
"learning_rate": 3.2944785489547544e-06,
"loss": 0.9121,
"step": 175
},
{
"epoch": 0.9238845144356955,
"grad_norm": 0.3958182964876464,
"learning_rate": 3.272542485937369e-06,
"loss": 0.9318,
"step": 176
},
{
"epoch": 0.9291338582677166,
"grad_norm": 0.39754399123912254,
"learning_rate": 3.250540465874382e-06,
"loss": 0.9244,
"step": 177
},
{
"epoch": 0.9343832020997376,
"grad_norm": 0.38861358420887904,
"learning_rate": 3.228474367223312e-06,
"loss": 0.9051,
"step": 178
},
{
"epoch": 0.9396325459317585,
"grad_norm": 0.3926071787199394,
"learning_rate": 3.206346073912488e-06,
"loss": 0.9409,
"step": 179
},
{
"epoch": 0.9448818897637795,
"grad_norm": 0.4008739814562732,
"learning_rate": 3.184157475180208e-06,
"loss": 0.9222,
"step": 180
},
{
"epoch": 0.9501312335958005,
"grad_norm": 0.3665607732753151,
"learning_rate": 3.1619104654134397e-06,
"loss": 0.913,
"step": 181
},
{
"epoch": 0.9553805774278216,
"grad_norm": 0.38983081632202093,
"learning_rate": 3.1396069439860894e-06,
"loss": 0.9297,
"step": 182
},
{
"epoch": 0.9606299212598425,
"grad_norm": 0.3645800135143814,
"learning_rate": 3.117248815096833e-06,
"loss": 0.8883,
"step": 183
},
{
"epoch": 0.9658792650918635,
"grad_norm": 0.3720108758786826,
"learning_rate": 3.094837987606547e-06,
"loss": 0.9204,
"step": 184
},
{
"epoch": 0.9711286089238845,
"grad_norm": 0.3818234182095755,
"learning_rate": 3.0723763748753354e-06,
"loss": 0.8814,
"step": 185
},
{
"epoch": 0.9763779527559056,
"grad_norm": 0.3847151463563777,
"learning_rate": 3.049865894599172e-06,
"loss": 0.9133,
"step": 186
},
{
"epoch": 0.9816272965879265,
"grad_norm": 0.37971775342950864,
"learning_rate": 3.027308468646175e-06,
"loss": 0.8906,
"step": 187
},
{
"epoch": 0.9868766404199475,
"grad_norm": 0.41572124992250203,
"learning_rate": 3.0047060228925256e-06,
"loss": 0.9672,
"step": 188
},
{
"epoch": 0.9921259842519685,
"grad_norm": 0.40219608509658256,
"learning_rate": 2.9820604870580426e-06,
"loss": 0.9011,
"step": 189
},
{
"epoch": 0.9973753280839895,
"grad_norm": 0.397234257384178,
"learning_rate": 2.9593737945414264e-06,
"loss": 0.9174,
"step": 190
},
{
"epoch": 1.0,
"grad_norm": 0.397234257384178,
"learning_rate": 2.9366478822551973e-06,
"loss": 0.9204,
"step": 191
},
{
"epoch": 1.005249343832021,
"grad_norm": 0.6279719871788059,
"learning_rate": 2.913884690460325e-06,
"loss": 0.8376,
"step": 192
},
{
"epoch": 1.010498687664042,
"grad_norm": 0.41242697308859033,
"learning_rate": 2.8910861626005774e-06,
"loss": 0.8369,
"step": 193
},
{
"epoch": 1.015748031496063,
"grad_norm": 0.38981163388237194,
"learning_rate": 2.8682542451365943e-06,
"loss": 0.8493,
"step": 194
},
{
"epoch": 1.020997375328084,
"grad_norm": 0.42318354204075453,
"learning_rate": 2.845390887379706e-06,
"loss": 0.8618,
"step": 195
},
{
"epoch": 1.026246719160105,
"grad_norm": 0.5052809667467608,
"learning_rate": 2.822498041325509e-06,
"loss": 0.8644,
"step": 196
},
{
"epoch": 1.031496062992126,
"grad_norm": 0.36960238923766053,
"learning_rate": 2.7995776614872083e-06,
"loss": 0.8484,
"step": 197
},
{
"epoch": 1.036745406824147,
"grad_norm": 0.3606472920225704,
"learning_rate": 2.776631704728752e-06,
"loss": 0.8413,
"step": 198
},
{
"epoch": 1.041994750656168,
"grad_norm": 0.38778609923815943,
"learning_rate": 2.753662130097758e-06,
"loss": 0.8266,
"step": 199
},
{
"epoch": 1.047244094488189,
"grad_norm": 0.3636856280047818,
"learning_rate": 2.730670898658255e-06,
"loss": 0.8285,
"step": 200
},
{
"epoch": 1.05249343832021,
"grad_norm": 0.3886565787437705,
"learning_rate": 2.70765997332326e-06,
"loss": 0.8628,
"step": 201
},
{
"epoch": 1.057742782152231,
"grad_norm": 0.41378173196429036,
"learning_rate": 2.684631318687185e-06,
"loss": 0.8549,
"step": 202
},
{
"epoch": 1.0629921259842519,
"grad_norm": 0.3657527566283362,
"learning_rate": 2.661586900858111e-06,
"loss": 0.8472,
"step": 203
},
{
"epoch": 1.068241469816273,
"grad_norm": 0.3666340150026852,
"learning_rate": 2.638528687289925e-06,
"loss": 0.8331,
"step": 204
},
{
"epoch": 1.073490813648294,
"grad_norm": 0.3661321106045701,
"learning_rate": 2.6154586466143495e-06,
"loss": 0.8706,
"step": 205
},
{
"epoch": 1.078740157480315,
"grad_norm": 0.3796204270047528,
"learning_rate": 2.592378748472863e-06,
"loss": 0.8329,
"step": 206
},
{
"epoch": 1.083989501312336,
"grad_norm": 0.4003268539729557,
"learning_rate": 2.5692909633485414e-06,
"loss": 0.8762,
"step": 207
},
{
"epoch": 1.0892388451443569,
"grad_norm": 0.3816091507612548,
"learning_rate": 2.546197262397825e-06,
"loss": 0.8499,
"step": 208
},
{
"epoch": 1.094488188976378,
"grad_norm": 0.4135433735229758,
"learning_rate": 2.5230996172822274e-06,
"loss": 0.8191,
"step": 209
},
{
"epoch": 1.099737532808399,
"grad_norm": 0.38519176727175336,
"learning_rate": 2.5e-06,
"loss": 0.8164,
"step": 210
},
{
"epoch": 1.10498687664042,
"grad_norm": 0.397224934471037,
"learning_rate": 2.4769003827177735e-06,
"loss": 0.8373,
"step": 211
},
{
"epoch": 1.110236220472441,
"grad_norm": 0.37696141546345585,
"learning_rate": 2.453802737602176e-06,
"loss": 0.8575,
"step": 212
},
{
"epoch": 1.1154855643044619,
"grad_norm": 0.38562550508165394,
"learning_rate": 2.4307090366514594e-06,
"loss": 0.8372,
"step": 213
},
{
"epoch": 1.120734908136483,
"grad_norm": 0.392610655726213,
"learning_rate": 2.4076212515271384e-06,
"loss": 0.8561,
"step": 214
},
{
"epoch": 1.125984251968504,
"grad_norm": 0.3752434088251031,
"learning_rate": 2.3845413533856517e-06,
"loss": 0.8539,
"step": 215
},
{
"epoch": 1.1312335958005248,
"grad_norm": 0.3769283394800778,
"learning_rate": 2.3614713127100752e-06,
"loss": 0.8134,
"step": 216
},
{
"epoch": 1.136482939632546,
"grad_norm": 0.39533971296250825,
"learning_rate": 2.3384130991418896e-06,
"loss": 0.8608,
"step": 217
},
{
"epoch": 1.141732283464567,
"grad_norm": 0.5791866799791944,
"learning_rate": 2.3153686813128153e-06,
"loss": 0.8334,
"step": 218
},
{
"epoch": 1.1469816272965878,
"grad_norm": 0.3810153033409976,
"learning_rate": 2.2923400266767406e-06,
"loss": 0.8472,
"step": 219
},
{
"epoch": 1.152230971128609,
"grad_norm": 0.3728143509479016,
"learning_rate": 2.269329101341745e-06,
"loss": 0.8376,
"step": 220
},
{
"epoch": 1.1574803149606299,
"grad_norm": 0.40305096887095054,
"learning_rate": 2.246337869902243e-06,
"loss": 0.8275,
"step": 221
},
{
"epoch": 1.162729658792651,
"grad_norm": 0.3692851271916895,
"learning_rate": 2.2233682952712484e-06,
"loss": 0.8413,
"step": 222
},
{
"epoch": 1.167979002624672,
"grad_norm": 0.3963527736664121,
"learning_rate": 2.2004223385127925e-06,
"loss": 0.8295,
"step": 223
},
{
"epoch": 1.1732283464566928,
"grad_norm": 0.3587695306587395,
"learning_rate": 2.1775019586744924e-06,
"loss": 0.8547,
"step": 224
},
{
"epoch": 1.178477690288714,
"grad_norm": 0.3614709935886563,
"learning_rate": 2.1546091126202955e-06,
"loss": 0.813,
"step": 225
},
{
"epoch": 1.1837270341207349,
"grad_norm": 0.4140010983968987,
"learning_rate": 2.131745754863406e-06,
"loss": 0.8398,
"step": 226
},
{
"epoch": 1.188976377952756,
"grad_norm": 0.45996885685295197,
"learning_rate": 2.1089138373994226e-06,
"loss": 0.8518,
"step": 227
},
{
"epoch": 1.194225721784777,
"grad_norm": 0.36254259181474985,
"learning_rate": 2.086115309539675e-06,
"loss": 0.8647,
"step": 228
},
{
"epoch": 1.1994750656167978,
"grad_norm": 0.4122269000782737,
"learning_rate": 2.063352117744803e-06,
"loss": 0.8881,
"step": 229
},
{
"epoch": 1.204724409448819,
"grad_norm": 0.40615776242804974,
"learning_rate": 2.040626205458574e-06,
"loss": 0.8328,
"step": 230
},
{
"epoch": 1.20997375328084,
"grad_norm": 0.41826600558691523,
"learning_rate": 2.017939512941958e-06,
"loss": 0.8281,
"step": 231
},
{
"epoch": 1.2152230971128608,
"grad_norm": 0.4153271729906844,
"learning_rate": 1.995293977107475e-06,
"loss": 0.8693,
"step": 232
},
{
"epoch": 1.220472440944882,
"grad_norm": 0.3654074693662248,
"learning_rate": 1.972691531353826e-06,
"loss": 0.821,
"step": 233
},
{
"epoch": 1.2257217847769029,
"grad_norm": 0.3981178374801672,
"learning_rate": 1.9501341054008292e-06,
"loss": 0.8962,
"step": 234
},
{
"epoch": 1.2309711286089238,
"grad_norm": 0.4049324135862524,
"learning_rate": 1.9276236251246655e-06,
"loss": 0.7905,
"step": 235
},
{
"epoch": 1.236220472440945,
"grad_norm": 0.33631867862267323,
"learning_rate": 1.9051620123934538e-06,
"loss": 0.8284,
"step": 236
},
{
"epoch": 1.2414698162729658,
"grad_norm": 0.37393386640045784,
"learning_rate": 1.882751184903167e-06,
"loss": 0.8405,
"step": 237
},
{
"epoch": 1.246719160104987,
"grad_norm": 0.3462880519364805,
"learning_rate": 1.860393056013911e-06,
"loss": 0.7939,
"step": 238
},
{
"epoch": 1.2519685039370079,
"grad_norm": 0.3848360075044728,
"learning_rate": 1.8380895345865603e-06,
"loss": 0.8375,
"step": 239
},
{
"epoch": 1.257217847769029,
"grad_norm": 0.3617106402936481,
"learning_rate": 1.8158425248197931e-06,
"loss": 0.8162,
"step": 240
},
{
"epoch": 1.26246719160105,
"grad_norm": 0.34772753646733273,
"learning_rate": 1.7936539260875125e-06,
"loss": 0.8408,
"step": 241
},
{
"epoch": 1.2677165354330708,
"grad_norm": 0.36406532287683085,
"learning_rate": 1.7715256327766887e-06,
"loss": 0.8103,
"step": 242
},
{
"epoch": 1.272965879265092,
"grad_norm": 0.3765574533307364,
"learning_rate": 1.7494595341256185e-06,
"loss": 0.8461,
"step": 243
},
{
"epoch": 1.2782152230971129,
"grad_norm": 0.3859435308304487,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.8761,
"step": 244
},
{
"epoch": 1.2834645669291338,
"grad_norm": 0.3936691957558663,
"learning_rate": 1.7055214510452462e-06,
"loss": 0.9159,
"step": 245
},
{
"epoch": 1.288713910761155,
"grad_norm": 0.5107365379023212,
"learning_rate": 1.6836532178997922e-06,
"loss": 0.8649,
"step": 246
},
{
"epoch": 1.2939632545931758,
"grad_norm": 0.388972979502565,
"learning_rate": 1.6618546816615162e-06,
"loss": 0.8734,
"step": 247
},
{
"epoch": 1.2992125984251968,
"grad_norm": 0.4410810635653612,
"learning_rate": 1.6401277034151798e-06,
"loss": 0.8405,
"step": 248
},
{
"epoch": 1.304461942257218,
"grad_norm": 0.38502463729196623,
"learning_rate": 1.6184741381361684e-06,
"loss": 0.8546,
"step": 249
},
{
"epoch": 1.3097112860892388,
"grad_norm": 0.38032181472927906,
"learning_rate": 1.5968958345321178e-06,
"loss": 0.8253,
"step": 250
},
{
"epoch": 1.3149606299212597,
"grad_norm": 0.36970392450888717,
"learning_rate": 1.5753946348850774e-06,
"loss": 0.8558,
"step": 251
},
{
"epoch": 1.3202099737532809,
"grad_norm": 0.3934484470589474,
"learning_rate": 1.5539723748942246e-06,
"loss": 0.8299,
"step": 252
},
{
"epoch": 1.3254593175853018,
"grad_norm": 0.3602834073019444,
"learning_rate": 1.5326308835191372e-06,
"loss": 0.8476,
"step": 253
},
{
"epoch": 1.330708661417323,
"grad_norm": 0.3482676328694225,
"learning_rate": 1.5113719828236439e-06,
"loss": 0.858,
"step": 254
},
{
"epoch": 1.3359580052493438,
"grad_norm": 0.44035367856347457,
"learning_rate": 1.490197487820263e-06,
"loss": 0.8351,
"step": 255
},
{
"epoch": 1.341207349081365,
"grad_norm": 0.371715281121202,
"learning_rate": 1.4691092063152417e-06,
"loss": 0.827,
"step": 256
},
{
"epoch": 1.3464566929133859,
"grad_norm": 0.35070038567136974,
"learning_rate": 1.4481089387542134e-06,
"loss": 0.8492,
"step": 257
},
{
"epoch": 1.3517060367454068,
"grad_norm": 0.3499117772233927,
"learning_rate": 1.4271984780684778e-06,
"loss": 0.8399,
"step": 258
},
{
"epoch": 1.356955380577428,
"grad_norm": 0.3811483044855955,
"learning_rate": 1.4063796095219345e-06,
"loss": 0.8688,
"step": 259
},
{
"epoch": 1.3622047244094488,
"grad_norm": 0.3714007196954483,
"learning_rate": 1.3856541105586545e-06,
"loss": 0.813,
"step": 260
},
{
"epoch": 1.3674540682414698,
"grad_norm": 0.3577252388966486,
"learning_rate": 1.3650237506511333e-06,
"loss": 0.8506,
"step": 261
},
{
"epoch": 1.372703412073491,
"grad_norm": 0.3670679328461459,
"learning_rate": 1.3444902911492174e-06,
"loss": 0.8267,
"step": 262
},
{
"epoch": 1.3779527559055118,
"grad_norm": 0.3521888751612076,
"learning_rate": 1.324055485129727e-06,
"loss": 0.8079,
"step": 263
},
{
"epoch": 1.3832020997375327,
"grad_norm": 0.3751986141403746,
"learning_rate": 1.303721077246784e-06,
"loss": 0.8491,
"step": 264
},
{
"epoch": 1.3884514435695539,
"grad_norm": 0.3589588384505892,
"learning_rate": 1.2834888035828597e-06,
"loss": 0.8768,
"step": 265
},
{
"epoch": 1.3937007874015748,
"grad_norm": 0.372280310354174,
"learning_rate": 1.2633603915005535e-06,
"loss": 0.8335,
"step": 266
},
{
"epoch": 1.3989501312335957,
"grad_norm": 0.40534475247811924,
"learning_rate": 1.2433375594951166e-06,
"loss": 0.8719,
"step": 267
},
{
"epoch": 1.4041994750656168,
"grad_norm": 0.40997625659212245,
"learning_rate": 1.2234220170477332e-06,
"loss": 0.8269,
"step": 268
},
{
"epoch": 1.4094488188976377,
"grad_norm": 0.3668404917652971,
"learning_rate": 1.2036154644795697e-06,
"loss": 0.7913,
"step": 269
},
{
"epoch": 1.4146981627296589,
"grad_norm": 0.3641040286199539,
"learning_rate": 1.1839195928066101e-06,
"loss": 0.8469,
"step": 270
},
{
"epoch": 1.4199475065616798,
"grad_norm": 0.36110674154533795,
"learning_rate": 1.164336083595279e-06,
"loss": 0.8698,
"step": 271
},
{
"epoch": 1.425196850393701,
"grad_norm": 0.376323216718325,
"learning_rate": 1.1448666088188766e-06,
"loss": 0.8355,
"step": 272
},
{
"epoch": 1.4304461942257218,
"grad_norm": 0.37039873117700245,
"learning_rate": 1.1255128307148319e-06,
"loss": 0.8219,
"step": 273
},
{
"epoch": 1.4356955380577427,
"grad_norm": 0.3612230977145218,
"learning_rate": 1.1062764016427864e-06,
"loss": 0.8568,
"step": 274
},
{
"epoch": 1.4409448818897639,
"grad_norm": 0.3962177187198973,
"learning_rate": 1.0871589639435204e-06,
"loss": 0.8115,
"step": 275
},
{
"epoch": 1.4461942257217848,
"grad_norm": 0.38303094180829605,
"learning_rate": 1.068162149798737e-06,
"loss": 0.818,
"step": 276
},
{
"epoch": 1.4514435695538057,
"grad_norm": 0.364448052838668,
"learning_rate": 1.049287581091711e-06,
"loss": 0.803,
"step": 277
},
{
"epoch": 1.4566929133858268,
"grad_norm": 0.37746023670056045,
"learning_rate": 1.0305368692688175e-06,
"loss": 0.8495,
"step": 278
},
{
"epoch": 1.4619422572178478,
"grad_norm": 0.40984945067844814,
"learning_rate": 1.0119116152019535e-06,
"loss": 0.8631,
"step": 279
},
{
"epoch": 1.4671916010498687,
"grad_norm": 0.3840327770579671,
"learning_rate": 9.934134090518593e-07,
"loss": 0.8318,
"step": 280
},
{
"epoch": 1.4724409448818898,
"grad_norm": 0.3575557636540703,
"learning_rate": 9.750438301323584e-07,
"loss": 0.8244,
"step": 281
},
{
"epoch": 1.4776902887139107,
"grad_norm": 0.363642443404998,
"learning_rate": 9.56804446775518e-07,
"loss": 0.8109,
"step": 282
},
{
"epoch": 1.4829396325459316,
"grad_norm": 0.3410295671236108,
"learning_rate": 9.386968161977528e-07,
"loss": 0.8227,
"step": 283
},
{
"epoch": 1.4881889763779528,
"grad_norm": 0.3693264048471325,
"learning_rate": 9.207224843668733e-07,
"loss": 0.8349,
"step": 284
},
{
"epoch": 1.4934383202099737,
"grad_norm": 0.3912247837861982,
"learning_rate": 9.028829858700974e-07,
"loss": 0.8218,
"step": 285
},
{
"epoch": 1.4986876640419948,
"grad_norm": 0.3771536484006796,
"learning_rate": 8.851798437830323e-07,
"loss": 0.8328,
"step": 286
},
{
"epoch": 1.5039370078740157,
"grad_norm": 0.3956914054034924,
"learning_rate": 8.676145695396399e-07,
"loss": 0.8779,
"step": 287
},
{
"epoch": 1.5091863517060369,
"grad_norm": 0.34503047792761804,
"learning_rate": 8.501886628031941e-07,
"loss": 0.8265,
"step": 288
},
{
"epoch": 1.5144356955380578,
"grad_norm": 0.3888965654379534,
"learning_rate": 8.329036113382474e-07,
"loss": 0.8354,
"step": 289
},
{
"epoch": 1.5196850393700787,
"grad_norm": 0.38477861699293,
"learning_rate": 8.157608908836071e-07,
"loss": 0.8109,
"step": 290
},
{
"epoch": 1.5249343832020998,
"grad_norm": 0.378643274506092,
"learning_rate": 7.987619650263462e-07,
"loss": 0.8507,
"step": 291
},
{
"epoch": 1.5301837270341208,
"grad_norm": 0.3619947185774947,
"learning_rate": 7.819082850768433e-07,
"loss": 0.8321,
"step": 292
},
{
"epoch": 1.5354330708661417,
"grad_norm": 0.3793890733316615,
"learning_rate": 7.652012899448761e-07,
"loss": 0.846,
"step": 293
},
{
"epoch": 1.5406824146981628,
"grad_norm": 0.37104232865461917,
"learning_rate": 7.486424060167726e-07,
"loss": 0.8113,
"step": 294
},
{
"epoch": 1.5459317585301837,
"grad_norm": 2.1051715634012864,
"learning_rate": 7.322330470336314e-07,
"loss": 0.8174,
"step": 295
},
{
"epoch": 1.5511811023622046,
"grad_norm": 0.45390059444688674,
"learning_rate": 7.159746139706194e-07,
"loss": 0.8414,
"step": 296
},
{
"epoch": 1.5564304461942258,
"grad_norm": 0.39115421496994207,
"learning_rate": 6.99868494917364e-07,
"loss": 0.8085,
"step": 297
},
{
"epoch": 1.5616797900262467,
"grad_norm": 0.37250458373292433,
"learning_rate": 6.839160649594401e-07,
"loss": 0.7906,
"step": 298
},
{
"epoch": 1.5669291338582676,
"grad_norm": 0.3746838213451273,
"learning_rate": 6.68118686060972e-07,
"loss": 0.8314,
"step": 299
},
{
"epoch": 1.5721784776902887,
"grad_norm": 0.5365749531577879,
"learning_rate": 6.524777069483526e-07,
"loss": 0.8332,
"step": 300
},
{
"epoch": 1.5774278215223099,
"grad_norm": 0.3788491507367419,
"learning_rate": 6.369944629950933e-07,
"loss": 0.8611,
"step": 301
},
{
"epoch": 1.5826771653543306,
"grad_norm": 0.3690009715495826,
"learning_rate": 6.216702761078167e-07,
"loss": 0.8099,
"step": 302
},
{
"epoch": 1.5879265091863517,
"grad_norm": 0.34643553497540036,
"learning_rate": 6.06506454613393e-07,
"loss": 0.8255,
"step": 303
},
{
"epoch": 1.5931758530183728,
"grad_norm": 0.37384009617772274,
"learning_rate": 5.915042931472426e-07,
"loss": 0.8024,
"step": 304
},
{
"epoch": 1.5984251968503937,
"grad_norm": 0.34112911330910745,
"learning_rate": 5.766650725428027e-07,
"loss": 0.8172,
"step": 305
},
{
"epoch": 1.6036745406824147,
"grad_norm": 0.3675924179740712,
"learning_rate": 5.619900597221753e-07,
"loss": 0.8195,
"step": 306
},
{
"epoch": 1.6089238845144358,
"grad_norm": 0.36296097997746496,
"learning_rate": 5.474805075879616e-07,
"loss": 0.808,
"step": 307
},
{
"epoch": 1.6141732283464567,
"grad_norm": 0.37901033900237063,
"learning_rate": 5.33137654916292e-07,
"loss": 0.8068,
"step": 308
},
{
"epoch": 1.6194225721784776,
"grad_norm": 0.3662350700773784,
"learning_rate": 5.189627262510655e-07,
"loss": 0.8695,
"step": 309
},
{
"epoch": 1.6246719160104988,
"grad_norm": 0.36967015418887056,
"learning_rate": 5.049569317994013e-07,
"loss": 0.8448,
"step": 310
},
{
"epoch": 1.6299212598425197,
"grad_norm": 0.3579476966594129,
"learning_rate": 4.911214673283157e-07,
"loss": 0.8375,
"step": 311
},
{
"epoch": 1.6351706036745406,
"grad_norm": 0.3611024965491316,
"learning_rate": 4.774575140626317e-07,
"loss": 0.8519,
"step": 312
},
{
"epoch": 1.6404199475065617,
"grad_norm": 0.3485470659035558,
"learning_rate": 4.639662385841293e-07,
"loss": 0.8217,
"step": 313
},
{
"epoch": 1.6456692913385826,
"grad_norm": 0.37287096492671606,
"learning_rate": 4.506487927319475e-07,
"loss": 0.8413,
"step": 314
},
{
"epoch": 1.6509186351706036,
"grad_norm": 0.38705573070810245,
"learning_rate": 4.3750631350424456e-07,
"loss": 0.8499,
"step": 315
},
{
"epoch": 1.6561679790026247,
"grad_norm": 0.3582574621798544,
"learning_rate": 4.2453992296112384e-07,
"loss": 0.8422,
"step": 316
},
{
"epoch": 1.6614173228346458,
"grad_norm": 0.36916485545424343,
"learning_rate": 4.117507281288366e-07,
"loss": 0.8312,
"step": 317
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.36367894508342596,
"learning_rate": 3.991398209052685e-07,
"loss": 0.8342,
"step": 318
},
{
"epoch": 1.6719160104986877,
"grad_norm": 0.39384159873675784,
"learning_rate": 3.8670827796671637e-07,
"loss": 0.8006,
"step": 319
},
{
"epoch": 1.6771653543307088,
"grad_norm": 0.39604366934016394,
"learning_rate": 3.7445716067596506e-07,
"loss": 0.8556,
"step": 320
},
{
"epoch": 1.6824146981627297,
"grad_norm": 0.3672901516398384,
"learning_rate": 3.623875149916725e-07,
"loss": 0.8621,
"step": 321
},
{
"epoch": 1.6876640419947506,
"grad_norm": 0.38389464950775926,
"learning_rate": 3.505003713790689e-07,
"loss": 0.8343,
"step": 322
},
{
"epoch": 1.6929133858267718,
"grad_norm": 0.36767539320644105,
"learning_rate": 3.387967447219803e-07,
"loss": 0.842,
"step": 323
},
{
"epoch": 1.6981627296587927,
"grad_norm": 0.3907752041858798,
"learning_rate": 3.2727763423617915e-07,
"loss": 0.8767,
"step": 324
},
{
"epoch": 1.7034120734908136,
"grad_norm": 0.37713933030901214,
"learning_rate": 3.1594402338407633e-07,
"loss": 0.8326,
"step": 325
},
{
"epoch": 1.7086614173228347,
"grad_norm": 0.38192173068300844,
"learning_rate": 3.047968797907552e-07,
"loss": 0.8168,
"step": 326
},
{
"epoch": 1.7139107611548556,
"grad_norm": 0.35632236660696437,
"learning_rate": 2.9383715516136083e-07,
"loss": 0.8344,
"step": 327
},
{
"epoch": 1.7191601049868765,
"grad_norm": 0.39281712398945773,
"learning_rate": 2.8306578519984526e-07,
"loss": 0.8057,
"step": 328
},
{
"epoch": 1.7244094488188977,
"grad_norm": 0.3713265865015316,
"learning_rate": 2.7248368952908055e-07,
"loss": 0.8128,
"step": 329
},
{
"epoch": 1.7296587926509186,
"grad_norm": 0.3704631773575278,
"learning_rate": 2.620917716123444e-07,
"loss": 0.8593,
"step": 330
},
{
"epoch": 1.7349081364829395,
"grad_norm": 0.3833563384401533,
"learning_rate": 2.5189091867618615e-07,
"loss": 0.8353,
"step": 331
},
{
"epoch": 1.7401574803149606,
"grad_norm": 0.39756909736931284,
"learning_rate": 2.418820016346779e-07,
"loss": 0.8596,
"step": 332
},
{
"epoch": 1.7454068241469818,
"grad_norm": 0.36212623044011794,
"learning_rate": 2.3206587501505866e-07,
"loss": 0.8311,
"step": 333
},
{
"epoch": 1.7506561679790025,
"grad_norm": 0.36098218836000906,
"learning_rate": 2.224433768847789e-07,
"loss": 0.8091,
"step": 334
},
{
"epoch": 1.7559055118110236,
"grad_norm": 0.3713293844163563,
"learning_rate": 2.1301532877994747e-07,
"loss": 0.8147,
"step": 335
},
{
"epoch": 1.7611548556430447,
"grad_norm": 0.34562968172819897,
"learning_rate": 2.0378253563519247e-07,
"loss": 0.8284,
"step": 336
},
{
"epoch": 1.7664041994750657,
"grad_norm": 0.39461152581923564,
"learning_rate": 1.9474578571493874e-07,
"loss": 0.8632,
"step": 337
},
{
"epoch": 1.7716535433070866,
"grad_norm": 0.3689417081438894,
"learning_rate": 1.859058505461095e-07,
"loss": 0.8259,
"step": 338
},
{
"epoch": 1.7769028871391077,
"grad_norm": 0.3926353728632124,
"learning_rate": 1.7726348485225337e-07,
"loss": 0.8364,
"step": 339
},
{
"epoch": 1.7821522309711286,
"grad_norm": 0.354022004999706,
"learning_rate": 1.6881942648911077e-07,
"loss": 0.8773,
"step": 340
},
{
"epoch": 1.7874015748031495,
"grad_norm": 0.3876142010040814,
"learning_rate": 1.6057439638161891e-07,
"loss": 0.82,
"step": 341
},
{
"epoch": 1.7926509186351707,
"grad_norm": 0.35474834193863947,
"learning_rate": 1.5252909846235898e-07,
"loss": 0.8193,
"step": 342
},
{
"epoch": 1.7979002624671916,
"grad_norm": 0.35916164129360717,
"learning_rate": 1.4468421961145924e-07,
"loss": 0.8398,
"step": 343
},
{
"epoch": 1.8031496062992125,
"grad_norm": 0.3592392678728242,
"learning_rate": 1.3704042959795132e-07,
"loss": 0.8384,
"step": 344
},
{
"epoch": 1.8083989501312336,
"grad_norm": 0.3717050715996407,
"learning_rate": 1.2959838102258537e-07,
"loss": 0.827,
"step": 345
},
{
"epoch": 1.8136482939632546,
"grad_norm": 0.3714676540830701,
"learning_rate": 1.223587092621162e-07,
"loss": 0.811,
"step": 346
},
{
"epoch": 1.8188976377952755,
"grad_norm": 0.36420858424939995,
"learning_rate": 1.1532203241505474e-07,
"loss": 0.8769,
"step": 347
},
{
"epoch": 1.8241469816272966,
"grad_norm": 0.3993728488042469,
"learning_rate": 1.0848895124889819e-07,
"loss": 0.8437,
"step": 348
},
{
"epoch": 1.8293963254593177,
"grad_norm": 0.36458761423411645,
"learning_rate": 1.0186004914883779e-07,
"loss": 0.8378,
"step": 349
},
{
"epoch": 1.8346456692913384,
"grad_norm": 0.3874822926899934,
"learning_rate": 9.54358920679524e-08,
"loss": 0.8284,
"step": 350
},
{
"epoch": 1.8398950131233596,
"grad_norm": 0.38256154200443654,
"learning_rate": 8.921702847888791e-08,
"loss": 0.8602,
"step": 351
},
{
"epoch": 1.8451443569553807,
"grad_norm": 0.36179666269244404,
"learning_rate": 8.320398932703145e-08,
"loss": 0.8274,
"step": 352
},
{
"epoch": 1.8503937007874016,
"grad_norm": 0.365807165050502,
"learning_rate": 7.739728798518115e-08,
"loss": 0.8709,
"step": 353
},
{
"epoch": 1.8556430446194225,
"grad_norm": 0.35458800620057285,
"learning_rate": 7.17974202097152e-08,
"loss": 0.8111,
"step": 354
},
{
"epoch": 1.8608923884514437,
"grad_norm": 0.37957333795063364,
"learning_rate": 6.640486409826785e-08,
"loss": 0.7994,
"step": 355
},
{
"epoch": 1.8661417322834646,
"grad_norm": 0.3527171222570775,
"learning_rate": 6.12200800489085e-08,
"loss": 0.853,
"step": 356
},
{
"epoch": 1.8713910761154855,
"grad_norm": 0.39242714191446193,
"learning_rate": 5.624351072083561e-08,
"loss": 0.8203,
"step": 357
},
{
"epoch": 1.8766404199475066,
"grad_norm": 0.35707175993984824,
"learning_rate": 5.1475580996585285e-08,
"loss": 0.7958,
"step": 358
},
{
"epoch": 1.8818897637795275,
"grad_norm": 0.3335686552470708,
"learning_rate": 4.691669794575388e-08,
"loss": 0.8433,
"step": 359
},
{
"epoch": 1.8871391076115485,
"grad_norm": 0.38462069858291403,
"learning_rate": 4.256725079024554e-08,
"loss": 0.7975,
"step": 360
},
{
"epoch": 1.8923884514435696,
"grad_norm": 0.34131290146851945,
"learning_rate": 3.8427610871041024e-08,
"loss": 0.8223,
"step": 361
},
{
"epoch": 1.8976377952755905,
"grad_norm": 0.35781280354041417,
"learning_rate": 3.449813161649357e-08,
"loss": 0.9063,
"step": 362
},
{
"epoch": 1.9028871391076114,
"grad_norm": 0.33992014890280325,
"learning_rate": 3.077914851215585e-08,
"loss": 0.8081,
"step": 363
},
{
"epoch": 1.9081364829396326,
"grad_norm": 0.3582145429279079,
"learning_rate": 2.7270979072135106e-08,
"loss": 0.8487,
"step": 364
},
{
"epoch": 1.9133858267716537,
"grad_norm": 0.36355378789809917,
"learning_rate": 2.3973922811987295e-08,
"loss": 0.8128,
"step": 365
},
{
"epoch": 1.9186351706036744,
"grad_norm": 0.36811282667120626,
"learning_rate": 2.0888261223143136e-08,
"loss": 0.8442,
"step": 366
},
{
"epoch": 1.9238845144356955,
"grad_norm": 0.36592765014539297,
"learning_rate": 1.8014257748877606e-08,
"loss": 0.8385,
"step": 367
},
{
"epoch": 1.9291338582677167,
"grad_norm": 0.3695353834129384,
"learning_rate": 1.5352157761815978e-08,
"loss": 0.809,
"step": 368
},
{
"epoch": 1.9343832020997376,
"grad_norm": 0.3907850295180158,
"learning_rate": 1.2902188542986139e-08,
"loss": 0.8295,
"step": 369
},
{
"epoch": 1.9396325459317585,
"grad_norm": 0.34933462418116523,
"learning_rate": 1.0664559262413831e-08,
"loss": 0.8238,
"step": 370
},
{
"epoch": 1.9448818897637796,
"grad_norm": 0.3537875486215043,
"learning_rate": 8.639460961263612e-09,
"loss": 0.8377,
"step": 371
},
{
"epoch": 1.9501312335958005,
"grad_norm": 0.35563927647325777,
"learning_rate": 6.827066535529947e-09,
"loss": 0.7943,
"step": 372
},
{
"epoch": 1.9553805774278215,
"grad_norm": 0.3474400330562922,
"learning_rate": 5.2275307212742986e-09,
"loss": 0.8511,
"step": 373
},
{
"epoch": 1.9606299212598426,
"grad_norm": 0.3593397542649248,
"learning_rate": 3.840990081415141e-09,
"loss": 0.8186,
"step": 374
},
{
"epoch": 1.9658792650918635,
"grad_norm": 0.3612580709923566,
"learning_rate": 2.6675629940689508e-09,
"loss": 0.848,
"step": 375
},
{
"epoch": 1.9711286089238844,
"grad_norm": 0.3614238458155997,
"learning_rate": 1.707349642442735e-09,
"loss": 0.7933,
"step": 376
},
{
"epoch": 1.9763779527559056,
"grad_norm": 0.39289720311660936,
"learning_rate": 9.604320062814309e-10,
"loss": 0.8323,
"step": 377
},
{
"epoch": 1.9816272965879265,
"grad_norm": 0.36167265662489084,
"learning_rate": 4.268738548682261e-10,
"loss": 0.8512,
"step": 378
},
{
"epoch": 1.9868766404199474,
"grad_norm": 0.3690750146631276,
"learning_rate": 1.0672074158030176e-10,
"loss": 0.832,
"step": 379
},
{
"epoch": 1.9921259842519685,
"grad_norm": 0.35408134628230764,
"learning_rate": 0.0,
"loss": 0.8172,
"step": 380
}
],
"logging_steps": 1,
"max_steps": 380,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 95,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.333503833071944e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}