{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 346,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005780346820809248,
"grad_norm": 12.858534195216157,
"learning_rate": 1.111111111111111e-07,
"loss": 1.0276,
"step": 1
},
{
"epoch": 0.011560693641618497,
"grad_norm": 12.232356830857812,
"learning_rate": 2.222222222222222e-07,
"loss": 1.0802,
"step": 2
},
{
"epoch": 0.017341040462427744,
"grad_norm": 12.065559434019754,
"learning_rate": 3.333333333333333e-07,
"loss": 1.0239,
"step": 3
},
{
"epoch": 0.023121387283236993,
"grad_norm": 11.287800396105506,
"learning_rate": 4.444444444444444e-07,
"loss": 1.0354,
"step": 4
},
{
"epoch": 0.028901734104046242,
"grad_norm": 12.118961503435317,
"learning_rate": 5.555555555555555e-07,
"loss": 1.0642,
"step": 5
},
{
"epoch": 0.03468208092485549,
"grad_norm": 10.614304274386905,
"learning_rate": 6.666666666666666e-07,
"loss": 1.0397,
"step": 6
},
{
"epoch": 0.04046242774566474,
"grad_norm": 10.029891779857477,
"learning_rate": 7.777777777777778e-07,
"loss": 1.0264,
"step": 7
},
{
"epoch": 0.046242774566473986,
"grad_norm": 8.67060643578771,
"learning_rate": 8.888888888888888e-07,
"loss": 0.9512,
"step": 8
},
{
"epoch": 0.05202312138728324,
"grad_norm": 7.757378710626247,
"learning_rate": 1e-06,
"loss": 0.9608,
"step": 9
},
{
"epoch": 0.057803468208092484,
"grad_norm": 7.142309385955806,
"learning_rate": 1.111111111111111e-06,
"loss": 0.9715,
"step": 10
},
{
"epoch": 0.06358381502890173,
"grad_norm": 6.115644634164,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.9309,
"step": 11
},
{
"epoch": 0.06936416184971098,
"grad_norm": 6.526598892938256,
"learning_rate": 1.3333333333333332e-06,
"loss": 0.9119,
"step": 12
},
{
"epoch": 0.07514450867052024,
"grad_norm": 5.701678407172686,
"learning_rate": 1.4444444444444443e-06,
"loss": 0.9509,
"step": 13
},
{
"epoch": 0.08092485549132948,
"grad_norm": 4.636147369080921,
"learning_rate": 1.5555555555555556e-06,
"loss": 0.9168,
"step": 14
},
{
"epoch": 0.08670520231213873,
"grad_norm": 4.795029784944483,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.8854,
"step": 15
},
{
"epoch": 0.09248554913294797,
"grad_norm": 4.272500187460561,
"learning_rate": 1.7777777777777775e-06,
"loss": 0.8752,
"step": 16
},
{
"epoch": 0.09826589595375723,
"grad_norm": 4.381820648769374,
"learning_rate": 1.8888888888888888e-06,
"loss": 0.8719,
"step": 17
},
{
"epoch": 0.10404624277456648,
"grad_norm": 4.2844649887933635,
"learning_rate": 2e-06,
"loss": 0.8252,
"step": 18
},
{
"epoch": 0.10982658959537572,
"grad_norm": 4.672757325891935,
"learning_rate": 1.9999541310559686e-06,
"loss": 0.7784,
"step": 19
},
{
"epoch": 0.11560693641618497,
"grad_norm": 4.140790454113861,
"learning_rate": 1.9998165284317942e-06,
"loss": 0.7805,
"step": 20
},
{
"epoch": 0.12138728323699421,
"grad_norm": 3.806744515631364,
"learning_rate": 1.999587204750851e-06,
"loss": 0.775,
"step": 21
},
{
"epoch": 0.12716763005780346,
"grad_norm": 4.3316680977985635,
"learning_rate": 1.99926618105081e-06,
"loss": 0.757,
"step": 22
},
{
"epoch": 0.1329479768786127,
"grad_norm": 6.658906531282003,
"learning_rate": 1.9988534867817065e-06,
"loss": 0.7762,
"step": 23
},
{
"epoch": 0.13872832369942195,
"grad_norm": 3.2840929759235498,
"learning_rate": 1.998349159803241e-06,
"loss": 0.7427,
"step": 24
},
{
"epoch": 0.14450867052023122,
"grad_norm": 3.1908462214509865,
"learning_rate": 1.9977532463813065e-06,
"loss": 0.7354,
"step": 25
},
{
"epoch": 0.15028901734104047,
"grad_norm": 3.3673488290162124,
"learning_rate": 1.9970658011837403e-06,
"loss": 0.6922,
"step": 26
},
{
"epoch": 0.15606936416184972,
"grad_norm": 3.682299563499068,
"learning_rate": 1.9962868872753143e-06,
"loss": 0.7066,
"step": 27
},
{
"epoch": 0.16184971098265896,
"grad_norm": 3.039028308153685,
"learning_rate": 1.9954165761119447e-06,
"loss": 0.6644,
"step": 28
},
{
"epoch": 0.1676300578034682,
"grad_norm": 3.0874127013971147,
"learning_rate": 1.99445494753414e-06,
"loss": 0.6828,
"step": 29
},
{
"epoch": 0.17341040462427745,
"grad_norm": 3.0558701865961884,
"learning_rate": 1.9934020897596747e-06,
"loss": 0.6854,
"step": 30
},
{
"epoch": 0.1791907514450867,
"grad_norm": 2.6355155421023206,
"learning_rate": 1.992258099375498e-06,
"loss": 0.7024,
"step": 31
},
{
"epoch": 0.18497109826589594,
"grad_norm": 2.671560023832056,
"learning_rate": 1.991023081328871e-06,
"loss": 0.6726,
"step": 32
},
{
"epoch": 0.1907514450867052,
"grad_norm": 2.5109346773716648,
"learning_rate": 1.9896971489177416e-06,
"loss": 0.6953,
"step": 33
},
{
"epoch": 0.19653179190751446,
"grad_norm": 2.671094772757736,
"learning_rate": 1.9882804237803485e-06,
"loss": 0.6749,
"step": 34
},
{
"epoch": 0.2023121387283237,
"grad_norm": 2.55280503478005,
"learning_rate": 1.986773035884064e-06,
"loss": 0.6313,
"step": 35
},
{
"epoch": 0.20809248554913296,
"grad_norm": 2.997693445850087,
"learning_rate": 1.98517512351347e-06,
"loss": 0.6912,
"step": 36
},
{
"epoch": 0.2138728323699422,
"grad_norm": 2.594837372299819,
"learning_rate": 1.9834868332576726e-06,
"loss": 0.7095,
"step": 37
},
{
"epoch": 0.21965317919075145,
"grad_norm": 2.248796606760258,
"learning_rate": 1.981708319996855e-06,
"loss": 0.6278,
"step": 38
},
{
"epoch": 0.2254335260115607,
"grad_norm": 2.4139138839205456,
"learning_rate": 1.9798397468880667e-06,
"loss": 0.6601,
"step": 39
},
{
"epoch": 0.23121387283236994,
"grad_norm": 5.055501773819069,
"learning_rate": 1.977881285350259e-06,
"loss": 0.6615,
"step": 40
},
{
"epoch": 0.23699421965317918,
"grad_norm": 2.3632268984701055,
"learning_rate": 1.975833115048557e-06,
"loss": 0.6483,
"step": 41
},
{
"epoch": 0.24277456647398843,
"grad_norm": 2.830666537018327,
"learning_rate": 1.973695423877779e-06,
"loss": 0.6755,
"step": 42
},
{
"epoch": 0.24855491329479767,
"grad_norm": 5.0327630976034765,
"learning_rate": 1.9714684079451977e-06,
"loss": 0.6932,
"step": 43
},
{
"epoch": 0.2543352601156069,
"grad_norm": 2.5324019369470454,
"learning_rate": 1.9691522715525517e-06,
"loss": 0.6662,
"step": 44
},
{
"epoch": 0.26011560693641617,
"grad_norm": 4.5913801929017515,
"learning_rate": 1.9667472271773023e-06,
"loss": 0.642,
"step": 45
},
{
"epoch": 0.2658959537572254,
"grad_norm": 2.5796270825002288,
"learning_rate": 1.964253495453141e-06,
"loss": 0.6501,
"step": 46
},
{
"epoch": 0.27167630057803466,
"grad_norm": 3.180271404782544,
"learning_rate": 1.9616713051497493e-06,
"loss": 0.6556,
"step": 47
},
{
"epoch": 0.2774566473988439,
"grad_norm": 2.4116071032049335,
"learning_rate": 1.959000893151813e-06,
"loss": 0.6506,
"step": 48
},
{
"epoch": 0.2832369942196532,
"grad_norm": 2.483272208248862,
"learning_rate": 1.9562425044372884e-06,
"loss": 0.6329,
"step": 49
},
{
"epoch": 0.28901734104046245,
"grad_norm": 2.3308296183451187,
"learning_rate": 1.9533963920549303e-06,
"loss": 0.6388,
"step": 50
},
{
"epoch": 0.2947976878612717,
"grad_norm": 2.622711013779675,
"learning_rate": 1.950462817101079e-06,
"loss": 0.6474,
"step": 51
},
{
"epoch": 0.30057803468208094,
"grad_norm": 2.1446274232932554,
"learning_rate": 1.947442048695704e-06,
"loss": 0.6592,
"step": 52
},
{
"epoch": 0.3063583815028902,
"grad_norm": 3.3168218759230275,
"learning_rate": 1.9443343639577202e-06,
"loss": 0.6504,
"step": 53
},
{
"epoch": 0.31213872832369943,
"grad_norm": 2.256777697180995,
"learning_rate": 1.9411400479795615e-06,
"loss": 0.6297,
"step": 54
},
{
"epoch": 0.3179190751445087,
"grad_norm": 2.1765404003111746,
"learning_rate": 1.93785939380103e-06,
"loss": 0.6336,
"step": 55
},
{
"epoch": 0.3236994219653179,
"grad_norm": 2.2217137619742906,
"learning_rate": 1.934492702382411e-06,
"loss": 0.6076,
"step": 56
},
{
"epoch": 0.32947976878612717,
"grad_norm": 5.281605039359051,
"learning_rate": 1.931040282576865e-06,
"loss": 0.6413,
"step": 57
},
{
"epoch": 0.3352601156069364,
"grad_norm": 2.681122616975899,
"learning_rate": 1.927502451102095e-06,
"loss": 0.6327,
"step": 58
},
{
"epoch": 0.34104046242774566,
"grad_norm": 2.3609007898823515,
"learning_rate": 1.9238795325112867e-06,
"loss": 0.6184,
"step": 59
},
{
"epoch": 0.3468208092485549,
"grad_norm": 2.6838363995230226,
"learning_rate": 1.9201718591633418e-06,
"loss": 0.6032,
"step": 60
},
{
"epoch": 0.35260115606936415,
"grad_norm": 2.4540324898863677,
"learning_rate": 1.9163797711923823e-06,
"loss": 0.5896,
"step": 61
},
{
"epoch": 0.3583815028901734,
"grad_norm": 2.298112606896863,
"learning_rate": 1.91250361647655e-06,
"loss": 0.6412,
"step": 62
},
{
"epoch": 0.36416184971098264,
"grad_norm": 2.2321684695854094,
"learning_rate": 1.9085437506060924e-06,
"loss": 0.5847,
"step": 63
},
{
"epoch": 0.3699421965317919,
"grad_norm": 2.1730547121243453,
"learning_rate": 1.9045005368507417e-06,
"loss": 0.6202,
"step": 64
},
{
"epoch": 0.37572254335260113,
"grad_norm": 2.648301208745142,
"learning_rate": 1.9003743461263883e-06,
"loss": 0.6246,
"step": 65
},
{
"epoch": 0.3815028901734104,
"grad_norm": 2.6270984399658377,
"learning_rate": 1.8961655569610556e-06,
"loss": 0.6336,
"step": 66
},
{
"epoch": 0.3872832369942196,
"grad_norm": 2.10356408342345,
"learning_rate": 1.8918745554601724e-06,
"loss": 0.6125,
"step": 67
},
{
"epoch": 0.3930635838150289,
"grad_norm": 2.0797656753425504,
"learning_rate": 1.8875017352711545e-06,
"loss": 0.6071,
"step": 68
},
{
"epoch": 0.3988439306358382,
"grad_norm": 2.3796052834987647,
"learning_rate": 1.8830474975472903e-06,
"loss": 0.6139,
"step": 69
},
{
"epoch": 0.4046242774566474,
"grad_norm": 2.195962808705189,
"learning_rate": 1.8785122509109423e-06,
"loss": 0.6376,
"step": 70
},
{
"epoch": 0.41040462427745666,
"grad_norm": 3.372968127914077,
"learning_rate": 1.8738964114160583e-06,
"loss": 0.6012,
"step": 71
},
{
"epoch": 0.4161849710982659,
"grad_norm": 2.4690022952436745,
"learning_rate": 1.8692004025100051e-06,
"loss": 0.6102,
"step": 72
},
{
"epoch": 0.42196531791907516,
"grad_norm": 2.9541703010934683,
"learning_rate": 1.8644246549947224e-06,
"loss": 0.5802,
"step": 73
},
{
"epoch": 0.4277456647398844,
"grad_norm": 2.219684411723982,
"learning_rate": 1.859569606987201e-06,
"loss": 0.6019,
"step": 74
},
{
"epoch": 0.43352601156069365,
"grad_norm": 3.2503821713287238,
"learning_rate": 1.8546357038792918e-06,
"loss": 0.6044,
"step": 75
},
{
"epoch": 0.4393063583815029,
"grad_norm": 2.3126944705292267,
"learning_rate": 1.8496233982968455e-06,
"loss": 0.6194,
"step": 76
},
{
"epoch": 0.44508670520231214,
"grad_norm": 2.2966676588642807,
"learning_rate": 1.8445331500581904e-06,
"loss": 0.6529,
"step": 77
},
{
"epoch": 0.4508670520231214,
"grad_norm": 2.2129730164894132,
"learning_rate": 1.83936542613195e-06,
"loss": 0.6032,
"step": 78
},
{
"epoch": 0.45664739884393063,
"grad_norm": 2.4969392606595764,
"learning_rate": 1.8341207005942032e-06,
"loss": 0.6165,
"step": 79
},
{
"epoch": 0.4624277456647399,
"grad_norm": 3.5760934505103146,
"learning_rate": 1.8287994545849945e-06,
"loss": 0.6135,
"step": 80
},
{
"epoch": 0.4682080924855491,
"grad_norm": 2.5392579698029802,
"learning_rate": 1.8234021762641945e-06,
"loss": 0.6132,
"step": 81
},
{
"epoch": 0.47398843930635837,
"grad_norm": 2.3783180990515156,
"learning_rate": 1.8179293607667177e-06,
"loss": 0.6026,
"step": 82
},
{
"epoch": 0.4797687861271676,
"grad_norm": 2.3017213740362674,
"learning_rate": 1.8123815101570995e-06,
"loss": 0.6068,
"step": 83
},
{
"epoch": 0.48554913294797686,
"grad_norm": 3.136830371172065,
"learning_rate": 1.806759133383438e-06,
"loss": 0.6198,
"step": 84
},
{
"epoch": 0.4913294797687861,
"grad_norm": 2.1335031297446356,
"learning_rate": 1.8010627462307046e-06,
"loss": 0.6317,
"step": 85
},
{
"epoch": 0.49710982658959535,
"grad_norm": 2.375395675829993,
"learning_rate": 1.7952928712734265e-06,
"loss": 0.5952,
"step": 86
},
{
"epoch": 0.5028901734104047,
"grad_norm": 2.338668259539757,
"learning_rate": 1.789450037827746e-06,
"loss": 0.6005,
"step": 87
},
{
"epoch": 0.5086705202312138,
"grad_norm": 2.0920820695913602,
"learning_rate": 1.783534781902864e-06,
"loss": 0.6108,
"step": 88
},
{
"epoch": 0.5144508670520231,
"grad_norm": 2.6397590095844516,
"learning_rate": 1.7775476461518666e-06,
"loss": 0.597,
"step": 89
},
{
"epoch": 0.5202312138728323,
"grad_norm": 2.2731273093251465,
"learning_rate": 1.771489179821943e-06,
"loss": 0.6205,
"step": 90
},
{
"epoch": 0.5260115606936416,
"grad_norm": 2.8578178406983854,
"learning_rate": 1.765359938703999e-06,
"loss": 0.624,
"step": 91
},
{
"epoch": 0.5317919075144508,
"grad_norm": 2.1683178371524385,
"learning_rate": 1.7591604850816704e-06,
"loss": 0.6008,
"step": 92
},
{
"epoch": 0.5375722543352601,
"grad_norm": 2.0931424578479216,
"learning_rate": 1.7528913876797397e-06,
"loss": 0.5781,
"step": 93
},
{
"epoch": 0.5433526011560693,
"grad_norm": 2.046087261006992,
"learning_rate": 1.7465532216119624e-06,
"loss": 0.5942,
"step": 94
},
{
"epoch": 0.5491329479768786,
"grad_norm": 2.0926363828893666,
"learning_rate": 1.740146568328308e-06,
"loss": 0.5845,
"step": 95
},
{
"epoch": 0.5549132947976878,
"grad_norm": 2.2826438145157253,
"learning_rate": 1.7336720155616185e-06,
"loss": 0.5771,
"step": 96
},
{
"epoch": 0.5606936416184971,
"grad_norm": 2.1729270459605075,
"learning_rate": 1.7271301572736903e-06,
"loss": 0.6072,
"step": 97
},
{
"epoch": 0.5664739884393064,
"grad_norm": 2.1958221501396804,
"learning_rate": 1.7205215936007869e-06,
"loss": 0.5795,
"step": 98
},
{
"epoch": 0.5722543352601156,
"grad_norm": 2.474781726279538,
"learning_rate": 1.713846930798583e-06,
"loss": 0.5904,
"step": 99
},
{
"epoch": 0.5780346820809249,
"grad_norm": 2.7388227639736753,
"learning_rate": 1.7071067811865474e-06,
"loss": 0.5781,
"step": 100
},
{
"epoch": 0.5838150289017341,
"grad_norm": 2.0078080924277963,
"learning_rate": 1.700301763091771e-06,
"loss": 0.6035,
"step": 101
},
{
"epoch": 0.5895953757225434,
"grad_norm": 2.057681142337808,
"learning_rate": 1.6934325007922417e-06,
"loss": 0.5993,
"step": 102
},
{
"epoch": 0.5953757225433526,
"grad_norm": 2.129426765761644,
"learning_rate": 1.6864996244595755e-06,
"loss": 0.556,
"step": 103
},
{
"epoch": 0.6011560693641619,
"grad_norm": 2.135875027874883,
"learning_rate": 1.6795037701012055e-06,
"loss": 0.6003,
"step": 104
},
{
"epoch": 0.6069364161849711,
"grad_norm": 2.254631204567563,
"learning_rate": 1.6724455795020357e-06,
"loss": 0.5819,
"step": 105
},
{
"epoch": 0.6127167630057804,
"grad_norm": 2.369367886126561,
"learning_rate": 1.665325700165565e-06,
"loss": 0.5825,
"step": 106
},
{
"epoch": 0.6184971098265896,
"grad_norm": 3.8018094119209382,
"learning_rate": 1.6581447852544877e-06,
"loss": 0.5584,
"step": 107
},
{
"epoch": 0.6242774566473989,
"grad_norm": 2.2913674882572663,
"learning_rate": 1.6509034935307714e-06,
"loss": 0.6143,
"step": 108
},
{
"epoch": 0.630057803468208,
"grad_norm": 3.297808364709142,
"learning_rate": 1.6436024892952253e-06,
"loss": 0.567,
"step": 109
},
{
"epoch": 0.6358381502890174,
"grad_norm": 2.1684588873033563,
"learning_rate": 1.6362424423265597e-06,
"loss": 0.5895,
"step": 110
},
{
"epoch": 0.6416184971098265,
"grad_norm": 2.1187621296515484,
"learning_rate": 1.6288240278199393e-06,
"loss": 0.5775,
"step": 111
},
{
"epoch": 0.6473988439306358,
"grad_norm": 2.0285848144538057,
"learning_rate": 1.6213479263250432e-06,
"loss": 0.5953,
"step": 112
},
{
"epoch": 0.653179190751445,
"grad_norm": 2.3252936492887484,
"learning_rate": 1.6138148236836337e-06,
"loss": 0.5773,
"step": 113
},
{
"epoch": 0.6589595375722543,
"grad_norm": 3.325919273645213,
"learning_rate": 1.606225410966638e-06,
"loss": 0.5909,
"step": 114
},
{
"epoch": 0.6647398843930635,
"grad_norm": 2.269916885150113,
"learning_rate": 1.5985803844107502e-06,
"loss": 0.5843,
"step": 115
},
{
"epoch": 0.6705202312138728,
"grad_norm": 2.3232673917342397,
"learning_rate": 1.5908804453545606e-06,
"loss": 0.5978,
"step": 116
},
{
"epoch": 0.6763005780346821,
"grad_norm": 3.4581168744539714,
"learning_rate": 1.5831263001742165e-06,
"loss": 0.5804,
"step": 117
},
{
"epoch": 0.6820809248554913,
"grad_norm": 2.0671980165005728,
"learning_rate": 1.5753186602186206e-06,
"loss": 0.5725,
"step": 118
},
{
"epoch": 0.6878612716763006,
"grad_norm": 2.3838726654032674,
"learning_rate": 1.5674582417441731e-06,
"loss": 0.5808,
"step": 119
},
{
"epoch": 0.6936416184971098,
"grad_norm": 2.2314237741431664,
"learning_rate": 1.559545765849064e-06,
"loss": 0.5965,
"step": 120
},
{
"epoch": 0.6994219653179191,
"grad_norm": 2.846135318582479,
"learning_rate": 1.5515819584071214e-06,
"loss": 0.5988,
"step": 121
},
{
"epoch": 0.7052023121387283,
"grad_norm": 2.0012935495630853,
"learning_rate": 1.5435675500012212e-06,
"loss": 0.5819,
"step": 122
},
{
"epoch": 0.7109826589595376,
"grad_norm": 2.217041365110391,
"learning_rate": 1.535503275856264e-06,
"loss": 0.5976,
"step": 123
},
{
"epoch": 0.7167630057803468,
"grad_norm": 2.032603573737414,
"learning_rate": 1.5273898757717292e-06,
"loss": 0.6074,
"step": 124
},
{
"epoch": 0.7225433526011561,
"grad_norm": 2.3350139620743864,
"learning_rate": 1.5192280940538055e-06,
"loss": 0.5844,
"step": 125
},
{
"epoch": 0.7283236994219653,
"grad_norm": 2.714512688497629,
"learning_rate": 1.5110186794471103e-06,
"loss": 0.5884,
"step": 126
},
{
"epoch": 0.7341040462427746,
"grad_norm": 2.6037265515423984,
"learning_rate": 1.502762385066002e-06,
"loss": 0.5895,
"step": 127
},
{
"epoch": 0.7398843930635838,
"grad_norm": 2.2313177421020667,
"learning_rate": 1.49445996832549e-06,
"loss": 0.5997,
"step": 128
},
{
"epoch": 0.7456647398843931,
"grad_norm": 1.964807059869446,
"learning_rate": 1.4861121908717526e-06,
"loss": 0.5605,
"step": 129
},
{
"epoch": 0.7514450867052023,
"grad_norm": 2.3890635979269703,
"learning_rate": 1.4777198185122628e-06,
"loss": 0.5816,
"step": 130
},
{
"epoch": 0.7572254335260116,
"grad_norm": 2.3155649540352687,
"learning_rate": 1.469283621145537e-06,
"loss": 0.5742,
"step": 131
},
{
"epoch": 0.7630057803468208,
"grad_norm": 2.224388579727507,
"learning_rate": 1.4608043726905049e-06,
"loss": 0.6063,
"step": 132
},
{
"epoch": 0.7687861271676301,
"grad_norm": 2.049353631849029,
"learning_rate": 1.4522828510155121e-06,
"loss": 0.588,
"step": 133
},
{
"epoch": 0.7745664739884393,
"grad_norm": 2.3198548514734147,
"learning_rate": 1.4437198378669597e-06,
"loss": 0.5607,
"step": 134
},
{
"epoch": 0.7803468208092486,
"grad_norm": 2.0249418082784727,
"learning_rate": 1.4351161187975902e-06,
"loss": 0.5691,
"step": 135
},
{
"epoch": 0.7861271676300579,
"grad_norm": 2.3694044016660745,
"learning_rate": 1.4264724830944197e-06,
"loss": 0.5925,
"step": 136
},
{
"epoch": 0.791907514450867,
"grad_norm": 2.249961753553928,
"learning_rate": 1.4177897237063335e-06,
"loss": 0.5661,
"step": 137
},
{
"epoch": 0.7976878612716763,
"grad_norm": 2.4659041973808735,
"learning_rate": 1.40906863717134e-06,
"loss": 0.5408,
"step": 138
},
{
"epoch": 0.8034682080924855,
"grad_norm": 2.0311840956249068,
"learning_rate": 1.4003100235434998e-06,
"loss": 0.5382,
"step": 139
},
{
"epoch": 0.8092485549132948,
"grad_norm": 2.176204821196535,
"learning_rate": 1.391514686319529e-06,
"loss": 0.5819,
"step": 140
},
{
"epoch": 0.815028901734104,
"grad_norm": 2.5319891016136324,
"learning_rate": 1.3826834323650898e-06,
"loss": 0.5612,
"step": 141
},
{
"epoch": 0.8208092485549133,
"grad_norm": 2.2126657534234933,
"learning_rate": 1.3738170718407686e-06,
"loss": 0.561,
"step": 142
},
{
"epoch": 0.8265895953757225,
"grad_norm": 2.039076902248122,
"learning_rate": 1.3649164181277553e-06,
"loss": 0.5701,
"step": 143
},
{
"epoch": 0.8323699421965318,
"grad_norm": 1.967704448699255,
"learning_rate": 1.3559822877532232e-06,
"loss": 0.5919,
"step": 144
},
{
"epoch": 0.838150289017341,
"grad_norm": 2.3220611845701677,
"learning_rate": 1.3470155003154248e-06,
"loss": 0.5737,
"step": 145
},
{
"epoch": 0.8439306358381503,
"grad_norm": 2.0704795325160252,
"learning_rate": 1.3380168784085026e-06,
"loss": 0.5594,
"step": 146
},
{
"epoch": 0.8497109826589595,
"grad_norm": 2.1453836439669374,
"learning_rate": 1.3289872475470256e-06,
"loss": 0.5531,
"step": 147
},
{
"epoch": 0.8554913294797688,
"grad_norm": 2.2895757955477274,
"learning_rate": 1.3199274360902588e-06,
"loss": 0.6007,
"step": 148
},
{
"epoch": 0.861271676300578,
"grad_norm": 5.3794532392535634,
"learning_rate": 1.310838275166172e-06,
"loss": 0.5744,
"step": 149
},
{
"epoch": 0.8670520231213873,
"grad_norm": 2.3811778937325054,
"learning_rate": 1.3017205985951924e-06,
"loss": 0.5676,
"step": 150
},
{
"epoch": 0.8728323699421965,
"grad_norm": 2.106851655179906,
"learning_rate": 1.2925752428137125e-06,
"loss": 0.6148,
"step": 151
},
{
"epoch": 0.8786127167630058,
"grad_norm": 2.0537876367376664,
"learning_rate": 1.2834030467973571e-06,
"loss": 0.5762,
"step": 152
},
{
"epoch": 0.884393063583815,
"grad_norm": 2.2071876047592727,
"learning_rate": 1.274204851984018e-06,
"loss": 0.5758,
"step": 153
},
{
"epoch": 0.8901734104046243,
"grad_norm": 2.13960824519687,
"learning_rate": 1.264981502196662e-06,
"loss": 0.5445,
"step": 154
},
{
"epoch": 0.8959537572254336,
"grad_norm": 2.4673552744509157,
"learning_rate": 1.255733843565918e-06,
"loss": 0.6071,
"step": 155
},
{
"epoch": 0.9017341040462428,
"grad_norm": 1.9871480766650433,
"learning_rate": 1.2464627244524593e-06,
"loss": 0.576,
"step": 156
},
{
"epoch": 0.9075144508670521,
"grad_norm": 2.19749359656742,
"learning_rate": 1.237168995369173e-06,
"loss": 0.5799,
"step": 157
},
{
"epoch": 0.9132947976878613,
"grad_norm": 2.1193685610704476,
"learning_rate": 1.2278535089031377e-06,
"loss": 0.5879,
"step": 158
},
{
"epoch": 0.9190751445086706,
"grad_norm": 2.1184302830201744,
"learning_rate": 1.2185171196374078e-06,
"loss": 0.5372,
"step": 159
},
{
"epoch": 0.9248554913294798,
"grad_norm": 2.0407022944958033,
"learning_rate": 1.2091606840726167e-06,
"loss": 0.585,
"step": 160
},
{
"epoch": 0.930635838150289,
"grad_norm": 2.168905043199937,
"learning_rate": 1.1997850605484032e-06,
"loss": 0.5604,
"step": 161
},
{
"epoch": 0.9364161849710982,
"grad_norm": 2.138142429910041,
"learning_rate": 1.1903911091646684e-06,
"loss": 0.593,
"step": 162
},
{
"epoch": 0.9421965317919075,
"grad_norm": 2.4892163650092516,
"learning_rate": 1.1809796917026728e-06,
"loss": 0.6056,
"step": 163
},
{
"epoch": 0.9479768786127167,
"grad_norm": 1.9576416066436553,
"learning_rate": 1.1715516715459784e-06,
"loss": 0.564,
"step": 164
},
{
"epoch": 0.953757225433526,
"grad_norm": 2.046605869264703,
"learning_rate": 1.1621079136012425e-06,
"loss": 0.5769,
"step": 165
},
{
"epoch": 0.9595375722543352,
"grad_norm": 2.094005375428543,
"learning_rate": 1.1526492842188744e-06,
"loss": 0.5847,
"step": 166
},
{
"epoch": 0.9653179190751445,
"grad_norm": 2.070280693183011,
"learning_rate": 1.143176651113558e-06,
"loss": 0.5564,
"step": 167
},
{
"epoch": 0.9710982658959537,
"grad_norm": 2.2148708162128212,
"learning_rate": 1.1336908832846483e-06,
"loss": 0.5263,
"step": 168
},
{
"epoch": 0.976878612716763,
"grad_norm": 2.1683895469622496,
"learning_rate": 1.124192850936453e-06,
"loss": 0.5375,
"step": 169
},
{
"epoch": 0.9826589595375722,
"grad_norm": 2.0595174642305216,
"learning_rate": 1.1146834253984005e-06,
"loss": 0.5859,
"step": 170
},
{
"epoch": 0.9884393063583815,
"grad_norm": 2.4501932894172653,
"learning_rate": 1.1051634790451058e-06,
"loss": 0.5525,
"step": 171
},
{
"epoch": 0.9942196531791907,
"grad_norm": 2.073182251777239,
"learning_rate": 1.0956338852163423e-06,
"loss": 0.5797,
"step": 172
},
{
"epoch": 1.0,
"grad_norm": 2.1339124056307996,
"learning_rate": 1.0860955181369217e-06,
"loss": 0.5445,
"step": 173
},
{
"epoch": 1.0057803468208093,
"grad_norm": 1.889514155050403,
"learning_rate": 1.076549252836496e-06,
"loss": 0.4816,
"step": 174
},
{
"epoch": 1.0115606936416186,
"grad_norm": 1.9256918662534441,
"learning_rate": 1.0669959650692818e-06,
"loss": 0.5073,
"step": 175
},
{
"epoch": 1.0173410404624277,
"grad_norm": 1.9262060377414805,
"learning_rate": 1.0574365312337234e-06,
"loss": 0.5085,
"step": 176
},
{
"epoch": 1.023121387283237,
"grad_norm": 2.123630125385134,
"learning_rate": 1.047871828292092e-06,
"loss": 0.4866,
"step": 177
},
{
"epoch": 1.0289017341040463,
"grad_norm": 2.188774815332412,
"learning_rate": 1.0383027336900353e-06,
"loss": 0.511,
"step": 178
},
{
"epoch": 1.0346820809248556,
"grad_norm": 2.0846346019353064,
"learning_rate": 1.028730125276083e-06,
"loss": 0.4731,
"step": 179
},
{
"epoch": 1.0404624277456647,
"grad_norm": 2.0139582535739042,
"learning_rate": 1.0191548812211142e-06,
"loss": 0.4332,
"step": 180
},
{
"epoch": 1.046242774566474,
"grad_norm": 2.2271331676676516,
"learning_rate": 1.0095778799377959e-06,
"loss": 0.4548,
"step": 181
},
{
"epoch": 1.0520231213872833,
"grad_norm": 2.143584429924543,
"learning_rate": 1e-06,
"loss": 0.5114,
"step": 182
},
{
"epoch": 1.0578034682080926,
"grad_norm": 1.9376604838146982,
"learning_rate": 9.904221200622043e-07,
"loss": 0.4628,
"step": 183
},
{
"epoch": 1.0635838150289016,
"grad_norm": 1.9520446252559986,
"learning_rate": 9.80845118778886e-07,
"loss": 0.4619,
"step": 184
},
{
"epoch": 1.069364161849711,
"grad_norm": 1.8153622741750448,
"learning_rate": 9.71269874723917e-07,
"loss": 0.4859,
"step": 185
},
{
"epoch": 1.0751445086705202,
"grad_norm": 2.0497117157639395,
"learning_rate": 9.616972663099646e-07,
"loss": 0.4682,
"step": 186
},
{
"epoch": 1.0809248554913296,
"grad_norm": 1.9422240692097994,
"learning_rate": 9.521281717079081e-07,
"loss": 0.4862,
"step": 187
},
{
"epoch": 1.0867052023121386,
"grad_norm": 2.0311048893088515,
"learning_rate": 9.425634687662766e-07,
"loss": 0.5078,
"step": 188
},
{
"epoch": 1.092485549132948,
"grad_norm": 2.573871459192162,
"learning_rate": 9.330040349307183e-07,
"loss": 0.4822,
"step": 189
},
{
"epoch": 1.0982658959537572,
"grad_norm": 2.179513115768778,
"learning_rate": 9.234507471635042e-07,
"loss": 0.4986,
"step": 190
},
{
"epoch": 1.1040462427745665,
"grad_norm": 2.2820802238060143,
"learning_rate": 9.139044818630783e-07,
"loss": 0.4922,
"step": 191
},
{
"epoch": 1.1098265895953756,
"grad_norm": 2.0023850906436405,
"learning_rate": 9.043661147836578e-07,
"loss": 0.4494,
"step": 192
},
{
"epoch": 1.115606936416185,
"grad_norm": 1.9343529979889897,
"learning_rate": 8.948365209548941e-07,
"loss": 0.4656,
"step": 193
},
{
"epoch": 1.1213872832369942,
"grad_norm": 2.2579648984673053,
"learning_rate": 8.853165746015995e-07,
"loss": 0.4793,
"step": 194
},
{
"epoch": 1.1271676300578035,
"grad_norm": 2.4825890947622167,
"learning_rate": 8.758071490635468e-07,
"loss": 0.473,
"step": 195
},
{
"epoch": 1.1329479768786128,
"grad_norm": 2.543399308768905,
"learning_rate": 8.663091167153514e-07,
"loss": 0.5026,
"step": 196
},
{
"epoch": 1.138728323699422,
"grad_norm": 2.196355049811149,
"learning_rate": 8.568233488864419e-07,
"loss": 0.4909,
"step": 197
},
{
"epoch": 1.1445086705202312,
"grad_norm": 1.8400038174670734,
"learning_rate": 8.473507157811254e-07,
"loss": 0.4852,
"step": 198
},
{
"epoch": 1.1502890173410405,
"grad_norm": 2.0651998028564305,
"learning_rate": 8.378920863987575e-07,
"loss": 0.4798,
"step": 199
},
{
"epoch": 1.1560693641618498,
"grad_norm": 1.7845307873709155,
"learning_rate": 8.284483284540216e-07,
"loss": 0.4613,
"step": 200
},
{
"epoch": 1.1618497109826589,
"grad_norm": 1.895202511289734,
"learning_rate": 8.190203082973271e-07,
"loss": 0.5084,
"step": 201
},
{
"epoch": 1.1676300578034682,
"grad_norm": 2.051181108200922,
"learning_rate": 8.096088908353315e-07,
"loss": 0.4672,
"step": 202
},
{
"epoch": 1.1734104046242775,
"grad_norm": 2.1934950530726445,
"learning_rate": 8.002149394515972e-07,
"loss": 0.4895,
"step": 203
},
{
"epoch": 1.1791907514450868,
"grad_norm": 2.3380870740629587,
"learning_rate": 7.908393159273836e-07,
"loss": 0.4666,
"step": 204
},
{
"epoch": 1.1849710982658959,
"grad_norm": 1.9114359910024667,
"learning_rate": 7.814828803625925e-07,
"loss": 0.4974,
"step": 205
},
{
"epoch": 1.1907514450867052,
"grad_norm": 2.054987235601307,
"learning_rate": 7.721464910968626e-07,
"loss": 0.4499,
"step": 206
},
{
"epoch": 1.1965317919075145,
"grad_norm": 2.0694031895214904,
"learning_rate": 7.628310046308272e-07,
"loss": 0.4853,
"step": 207
},
{
"epoch": 1.2023121387283238,
"grad_norm": 1.9497478503918264,
"learning_rate": 7.53537275547541e-07,
"loss": 0.4701,
"step": 208
},
{
"epoch": 1.208092485549133,
"grad_norm": 2.0471759991586578,
"learning_rate": 7.442661564340822e-07,
"loss": 0.4672,
"step": 209
},
{
"epoch": 1.2138728323699421,
"grad_norm": 2.1249362738674336,
"learning_rate": 7.350184978033385e-07,
"loss": 0.4671,
"step": 210
},
{
"epoch": 1.2196531791907514,
"grad_norm": 2.1197465337178047,
"learning_rate": 7.257951480159819e-07,
"loss": 0.462,
"step": 211
},
{
"epoch": 1.2254335260115607,
"grad_norm": 1.851706914708072,
"learning_rate": 7.165969532026429e-07,
"loss": 0.4583,
"step": 212
},
{
"epoch": 1.2312138728323698,
"grad_norm": 2.0504382329015627,
"learning_rate": 7.074247571862877e-07,
"loss": 0.4698,
"step": 213
},
{
"epoch": 1.2369942196531791,
"grad_norm": 1.9545030444177458,
"learning_rate": 6.982794014048077e-07,
"loss": 0.4832,
"step": 214
},
{
"epoch": 1.2427745664739884,
"grad_norm": 1.8828619878855422,
"learning_rate": 6.891617248338282e-07,
"loss": 0.4664,
"step": 215
},
{
"epoch": 1.2485549132947977,
"grad_norm": 1.8683663335668868,
"learning_rate": 6.800725639097411e-07,
"loss": 0.4735,
"step": 216
},
{
"epoch": 1.254335260115607,
"grad_norm": 2.0704005138553505,
"learning_rate": 6.710127524529745e-07,
"loss": 0.5023,
"step": 217
},
{
"epoch": 1.260115606936416,
"grad_norm": 2.272926759535781,
"learning_rate": 6.619831215914973e-07,
"loss": 0.4699,
"step": 218
},
{
"epoch": 1.2658959537572254,
"grad_norm": 2.091914123638361,
"learning_rate": 6.52984499684575e-07,
"loss": 0.4573,
"step": 219
},
{
"epoch": 1.2716763005780347,
"grad_norm": 1.995494161392343,
"learning_rate": 6.440177122467768e-07,
"loss": 0.4873,
"step": 220
},
{
"epoch": 1.2774566473988438,
"grad_norm": 2.041258798930975,
"learning_rate": 6.350835818722449e-07,
"loss": 0.4936,
"step": 221
},
{
"epoch": 1.2832369942196533,
"grad_norm": 1.9625875782578102,
"learning_rate": 6.261829281592312e-07,
"loss": 0.4748,
"step": 222
},
{
"epoch": 1.2890173410404624,
"grad_norm": 1.9356789263214738,
"learning_rate": 6.173165676349102e-07,
"loss": 0.4803,
"step": 223
},
{
"epoch": 1.2947976878612717,
"grad_norm": 1.942573776135112,
"learning_rate": 6.084853136804711e-07,
"loss": 0.4635,
"step": 224
},
{
"epoch": 1.300578034682081,
"grad_norm": 2.02929712164466,
"learning_rate": 5.996899764565005e-07,
"loss": 0.476,
"step": 225
},
{
"epoch": 1.30635838150289,
"grad_norm": 2.082626277063179,
"learning_rate": 5.9093136282866e-07,
"loss": 0.4856,
"step": 226
},
{
"epoch": 1.3121387283236994,
"grad_norm": 2.1016956249931775,
"learning_rate": 5.822102762936666e-07,
"loss": 0.506,
"step": 227
},
{
"epoch": 1.3179190751445087,
"grad_norm": 1.9312802366426565,
"learning_rate": 5.735275169055803e-07,
"loss": 0.4749,
"step": 228
},
{
"epoch": 1.323699421965318,
"grad_norm": 2.061206641650247,
"learning_rate": 5.648838812024099e-07,
"loss": 0.4512,
"step": 229
},
{
"epoch": 1.3294797687861273,
"grad_norm": 1.9814472946523185,
"learning_rate": 5.562801621330402e-07,
"loss": 0.4776,
"step": 230
},
{
"epoch": 1.3352601156069364,
"grad_norm": 1.8543611294874782,
"learning_rate": 5.477171489844881e-07,
"loss": 0.5093,
"step": 231
},
{
"epoch": 1.3410404624277457,
"grad_norm": 1.9906348566872387,
"learning_rate": 5.391956273094951e-07,
"loss": 0.4852,
"step": 232
},
{
"epoch": 1.346820809248555,
"grad_norm": 1.9315896688508982,
"learning_rate": 5.307163788544629e-07,
"loss": 0.4608,
"step": 233
},
{
"epoch": 1.352601156069364,
"grad_norm": 1.8959836891458495,
"learning_rate": 5.222801814877369e-07,
"loss": 0.449,
"step": 234
},
{
"epoch": 1.3583815028901733,
"grad_norm": 1.8907180611172163,
"learning_rate": 5.138878091282471e-07,
"loss": 0.4458,
"step": 235
},
{
"epoch": 1.3641618497109826,
"grad_norm": 1.9420465209074669,
"learning_rate": 5.055400316745095e-07,
"loss": 0.4756,
"step": 236
},
{
"epoch": 1.369942196531792,
"grad_norm": 1.94702430879285,
"learning_rate": 4.972376149339978e-07,
"loss": 0.457,
"step": 237
},
{
"epoch": 1.3757225433526012,
"grad_norm": 2.5499359496821277,
"learning_rate": 4.889813205528894e-07,
"loss": 0.4758,
"step": 238
},
{
"epoch": 1.3815028901734103,
"grad_norm": 1.9308947290061484,
"learning_rate": 4.807719059461942e-07,
"loss": 0.4611,
"step": 239
},
{
"epoch": 1.3872832369942196,
"grad_norm": 1.9505629087051528,
"learning_rate": 4.7261012422827074e-07,
"loss": 0.4719,
"step": 240
},
{
"epoch": 1.393063583815029,
"grad_norm": 2.267217020333816,
"learning_rate": 4.6449672414373597e-07,
"loss": 0.4802,
"step": 241
},
{
"epoch": 1.3988439306358382,
"grad_norm": 2.152359255572803,
"learning_rate": 4.5643244999877896e-07,
"loss": 0.4635,
"step": 242
},
{
"epoch": 1.4046242774566475,
"grad_norm": 1.9757352489131026,
"learning_rate": 4.4841804159287857e-07,
"loss": 0.4716,
"step": 243
},
{
"epoch": 1.4104046242774566,
"grad_norm": 2.0536715397019942,
"learning_rate": 4.40454234150936e-07,
"loss": 0.473,
"step": 244
},
{
"epoch": 1.416184971098266,
"grad_norm": 1.9534035655257582,
"learning_rate": 4.3254175825582693e-07,
"loss": 0.4803,
"step": 245
},
{
"epoch": 1.4219653179190752,
"grad_norm": 1.9057679988988923,
"learning_rate": 4.246813397813794e-07,
"loss": 0.4601,
"step": 246
},
{
"epoch": 1.4277456647398843,
"grad_norm": 1.9037622302779358,
"learning_rate": 4.1687369982578346e-07,
"loss": 0.4527,
"step": 247
},
{
"epoch": 1.4335260115606936,
"grad_norm": 2.9426991633061945,
"learning_rate": 4.0911955464543976e-07,
"loss": 0.4833,
"step": 248
},
{
"epoch": 1.439306358381503,
"grad_norm": 1.9189468581058093,
"learning_rate": 4.014196155892502e-07,
"loss": 0.4573,
"step": 249
},
{
"epoch": 1.4450867052023122,
"grad_norm": 2.0139556098301186,
"learning_rate": 3.9377458903336223e-07,
"loss": 0.4758,
"step": 250
},
{
"epoch": 1.4508670520231215,
"grad_norm": 2.0450079177733276,
"learning_rate": 3.861851763163665e-07,
"loss": 0.4663,
"step": 251
},
{
"epoch": 1.4566473988439306,
"grad_norm": 3.137157219646671,
"learning_rate": 3.786520736749571e-07,
"loss": 0.4744,
"step": 252
},
{
"epoch": 1.4624277456647399,
"grad_norm": 2.107997689195662,
"learning_rate": 3.71175972180061e-07,
"loss": 0.4675,
"step": 253
},
{
"epoch": 1.4682080924855492,
"grad_norm": 2.0169649469031823,
"learning_rate": 3.6375755767344043e-07,
"loss": 0.4654,
"step": 254
},
{
"epoch": 1.4739884393063583,
"grad_norm": 1.9965328414982166,
"learning_rate": 3.563975107047747e-07,
"loss": 0.4788,
"step": 255
},
{
"epoch": 1.4797687861271676,
"grad_norm": 1.909053888926535,
"learning_rate": 3.4909650646922894e-07,
"loss": 0.4864,
"step": 256
},
{
"epoch": 1.4855491329479769,
"grad_norm": 2.1284220971629098,
"learning_rate": 3.4185521474551247e-07,
"loss": 0.464,
"step": 257
},
{
"epoch": 1.4913294797687862,
"grad_norm": 2.033246742298473,
"learning_rate": 3.3467429983443476e-07,
"loss": 0.469,
"step": 258
},
{
"epoch": 1.4971098265895955,
"grad_norm": 2.1479521556785355,
"learning_rate": 3.2755442049796425e-07,
"loss": 0.4889,
"step": 259
},
{
"epoch": 1.5028901734104045,
"grad_norm": 2.0332714192782615,
"learning_rate": 3.204962298987944e-07,
"loss": 0.4686,
"step": 260
},
{
"epoch": 1.5086705202312138,
"grad_norm": 1.9047792343443468,
"learning_rate": 3.135003755404244e-07,
"loss": 0.4657,
"step": 261
},
{
"epoch": 1.5144508670520231,
"grad_norm": 1.9650730838682373,
"learning_rate": 3.065674992077584e-07,
"loss": 0.4754,
"step": 262
},
{
"epoch": 1.5202312138728322,
"grad_norm": 2.419000097894264,
"learning_rate": 2.9969823690822904e-07,
"loss": 0.4646,
"step": 263
},
{
"epoch": 1.5260115606936417,
"grad_norm": 1.971230314254081,
"learning_rate": 2.9289321881345254e-07,
"loss": 0.4667,
"step": 264
},
{
"epoch": 1.5317919075144508,
"grad_norm": 2.121737526252635,
"learning_rate": 2.861530692014169e-07,
"loss": 0.4674,
"step": 265
},
{
"epoch": 1.5375722543352601,
"grad_norm": 2.0795424904002515,
"learning_rate": 2.7947840639921303e-07,
"loss": 0.5152,
"step": 266
},
{
"epoch": 1.5433526011560694,
"grad_norm": 1.9716533778945817,
"learning_rate": 2.728698427263096e-07,
"loss": 0.4774,
"step": 267
},
{
"epoch": 1.5491329479768785,
"grad_norm": 1.8977607875089808,
"learning_rate": 2.6632798443838145e-07,
"loss": 0.4514,
"step": 268
},
{
"epoch": 1.5549132947976878,
"grad_norm": 1.9253571622286325,
"learning_rate": 2.598534316716917e-07,
"loss": 0.4607,
"step": 269
},
{
"epoch": 1.560693641618497,
"grad_norm": 2.078466033180376,
"learning_rate": 2.534467783880373e-07,
"loss": 0.4883,
"step": 270
},
{
"epoch": 1.5664739884393064,
"grad_norm": 2.5971424603146436,
"learning_rate": 2.4710861232026013e-07,
"loss": 0.4746,
"step": 271
},
{
"epoch": 1.5722543352601157,
"grad_norm": 1.9666452852090528,
"learning_rate": 2.408395149183294e-07,
"loss": 0.5058,
"step": 272
},
{
"epoch": 1.5780346820809248,
"grad_norm": 2.035908272574928,
"learning_rate": 2.346400612960009e-07,
"loss": 0.4849,
"step": 273
},
{
"epoch": 1.583815028901734,
"grad_norm": 2.0288243942210378,
"learning_rate": 2.28510820178057e-07,
"loss": 0.4778,
"step": 274
},
{
"epoch": 1.5895953757225434,
"grad_norm": 1.9735647171840094,
"learning_rate": 2.2245235384813332e-07,
"loss": 0.4851,
"step": 275
},
{
"epoch": 1.5953757225433525,
"grad_norm": 2.0907181002682877,
"learning_rate": 2.164652180971358e-07,
"loss": 0.4604,
"step": 276
},
{
"epoch": 1.601156069364162,
"grad_norm": 2.020880323505046,
"learning_rate": 2.1054996217225385e-07,
"loss": 0.4629,
"step": 277
},
{
"epoch": 1.606936416184971,
"grad_norm": 2.259358632099597,
"learning_rate": 2.0470712872657348e-07,
"loss": 0.4806,
"step": 278
},
{
"epoch": 1.6127167630057804,
"grad_norm": 2.8709064155831023,
"learning_rate": 1.9893725376929504e-07,
"loss": 0.4324,
"step": 279
},
{
"epoch": 1.6184971098265897,
"grad_norm": 2.073098194342015,
"learning_rate": 1.9324086661656168e-07,
"loss": 0.4731,
"step": 280
},
{
"epoch": 1.6242774566473988,
"grad_norm": 2.1188843044480965,
"learning_rate": 1.8761848984290062e-07,
"loss": 0.4616,
"step": 281
},
{
"epoch": 1.630057803468208,
"grad_norm": 1.9759976408035527,
"learning_rate": 1.8207063923328235e-07,
"loss": 0.481,
"step": 282
},
{
"epoch": 1.6358381502890174,
"grad_norm": 2.046719438470121,
"learning_rate": 1.7659782373580555e-07,
"loss": 0.4666,
"step": 283
},
{
"epoch": 1.6416184971098264,
"grad_norm": 1.9283494713604754,
"learning_rate": 1.712005454150055e-07,
"loss": 0.4581,
"step": 284
},
{
"epoch": 1.647398843930636,
"grad_norm": 1.9357329743637328,
"learning_rate": 1.658792994057968e-07,
"loss": 0.4426,
"step": 285
},
{
"epoch": 1.653179190751445,
"grad_norm": 1.942353186463166,
"learning_rate": 1.6063457386805003e-07,
"loss": 0.4805,
"step": 286
},
{
"epoch": 1.6589595375722543,
"grad_norm": 2.1127349981346053,
"learning_rate": 1.554668499418097e-07,
"loss": 0.4699,
"step": 287
},
{
"epoch": 1.6647398843930636,
"grad_norm": 1.947713066714842,
"learning_rate": 1.503766017031547e-07,
"loss": 0.4709,
"step": 288
},
{
"epoch": 1.6705202312138727,
"grad_norm": 1.931903633238329,
"learning_rate": 1.4536429612070843e-07,
"loss": 0.4887,
"step": 289
},
{
"epoch": 1.6763005780346822,
"grad_norm": 1.9911200450774031,
"learning_rate": 1.4043039301279903e-07,
"loss": 0.4904,
"step": 290
},
{
"epoch": 1.6820809248554913,
"grad_norm": 1.9439202742202357,
"learning_rate": 1.3557534500527768e-07,
"loss": 0.4531,
"step": 291
},
{
"epoch": 1.6878612716763006,
"grad_norm": 1.9164172510841055,
"learning_rate": 1.3079959748999493e-07,
"loss": 0.4563,
"step": 292
},
{
"epoch": 1.69364161849711,
"grad_norm": 2.0311554173466666,
"learning_rate": 1.2610358858394188e-07,
"loss": 0.4828,
"step": 293
},
{
"epoch": 1.699421965317919,
"grad_norm": 1.9998690205184846,
"learning_rate": 1.2148774908905778e-07,
"loss": 0.4786,
"step": 294
},
{
"epoch": 1.7052023121387283,
"grad_norm": 4.169828381438711,
"learning_rate": 1.169525024527096e-07,
"loss": 0.4677,
"step": 295
},
{
"epoch": 1.7109826589595376,
"grad_norm": 1.8989428779004813,
"learning_rate": 1.1249826472884571e-07,
"loss": 0.4401,
"step": 296
},
{
"epoch": 1.7167630057803467,
"grad_norm": 2.1129726622598093,
"learning_rate": 1.0812544453982764e-07,
"loss": 0.4903,
"step": 297
},
{
"epoch": 1.7225433526011562,
"grad_norm": 2.0629596593806934,
"learning_rate": 1.038344430389445e-07,
"loss": 0.4925,
"step": 298
},
{
"epoch": 1.7283236994219653,
"grad_norm": 2.0320095664676665,
"learning_rate": 9.962565387361166e-08,
"loss": 0.4614,
"step": 299
},
{
"epoch": 1.7341040462427746,
"grad_norm": 3.0540554132959183,
"learning_rate": 9.549946314925839e-08,
"loss": 0.4964,
"step": 300
},
{
"epoch": 1.739884393063584,
"grad_norm": 2.016687599667707,
"learning_rate": 9.145624939390761e-08,
"loss": 0.4527,
"step": 301
},
{
"epoch": 1.745664739884393,
"grad_norm": 1.8452228441209821,
"learning_rate": 8.749638352345001e-08,
"loss": 0.4878,
"step": 302
},
{
"epoch": 1.7514450867052023,
"grad_norm": 2.385887245808352,
"learning_rate": 8.362022880761776e-08,
"loss": 0.4974,
"step": 303
},
{
"epoch": 1.7572254335260116,
"grad_norm": 1.9832314558835376,
"learning_rate": 7.982814083665823e-08,
"loss": 0.4668,
"step": 304
},
{
"epoch": 1.7630057803468207,
"grad_norm": 2.564651988633928,
"learning_rate": 7.612046748871326e-08,
"loss": 0.4604,
"step": 305
},
{
"epoch": 1.7687861271676302,
"grad_norm": 2.135369579624982,
"learning_rate": 7.249754889790538e-08,
"loss": 0.4981,
"step": 306
},
{
"epoch": 1.7745664739884393,
"grad_norm": 2.167361327770021,
"learning_rate": 6.895971742313467e-08,
"loss": 0.4484,
"step": 307
},
{
"epoch": 1.7803468208092486,
"grad_norm": 1.824815893708037,
"learning_rate": 6.550729761758899e-08,
"loss": 0.467,
"step": 308
},
{
"epoch": 1.7861271676300579,
"grad_norm": 2.1085663211118777,
"learning_rate": 6.21406061989701e-08,
"loss": 0.4705,
"step": 309
},
{
"epoch": 1.791907514450867,
"grad_norm": 2.00560428209146,
"learning_rate": 5.885995202043847e-08,
"loss": 0.4708,
"step": 310
},
{
"epoch": 1.7976878612716765,
"grad_norm": 2.076853013886715,
"learning_rate": 5.5665636042279696e-08,
"loss": 0.4676,
"step": 311
},
{
"epoch": 1.8034682080924855,
"grad_norm": 1.9453450448573704,
"learning_rate": 5.2557951304295747e-08,
"loss": 0.4671,
"step": 312
},
{
"epoch": 1.8092485549132948,
"grad_norm": 2.3993736624613256,
"learning_rate": 4.953718289892106e-08,
"loss": 0.4652,
"step": 313
},
{
"epoch": 1.8150289017341041,
"grad_norm": 1.930078423666107,
"learning_rate": 4.6603607945069456e-08,
"loss": 0.5143,
"step": 314
},
{
"epoch": 1.8208092485549132,
"grad_norm": 1.9424714850195004,
"learning_rate": 4.375749556271169e-08,
"loss": 0.4978,
"step": 315
},
{
"epoch": 1.8265895953757225,
"grad_norm": 1.9534413743723884,
"learning_rate": 4.099910684818697e-08,
"loss": 0.46,
"step": 316
},
{
"epoch": 1.8323699421965318,
"grad_norm": 2.0016352335992575,
"learning_rate": 3.8328694850250475e-08,
"loss": 0.4765,
"step": 317
},
{
"epoch": 1.838150289017341,
"grad_norm": 2.009683412643769,
"learning_rate": 3.574650454685901e-08,
"loss": 0.4611,
"step": 318
},
{
"epoch": 1.8439306358381504,
"grad_norm": 1.9761898032699974,
"learning_rate": 3.325277282269756e-08,
"loss": 0.4924,
"step": 319
},
{
"epoch": 1.8497109826589595,
"grad_norm": 1.9213759865585416,
"learning_rate": 3.08477284474481e-08,
"loss": 0.4635,
"step": 320
},
{
"epoch": 1.8554913294797688,
"grad_norm": 2.1643446222196188,
"learning_rate": 2.8531592054802157e-08,
"loss": 0.49,
"step": 321
},
{
"epoch": 1.861271676300578,
"grad_norm": 1.930410206836703,
"learning_rate": 2.6304576122221034e-08,
"loss": 0.4694,
"step": 322
},
{
"epoch": 1.8670520231213872,
"grad_norm": 2.012454921101362,
"learning_rate": 2.4166884951442702e-08,
"loss": 0.4542,
"step": 323
},
{
"epoch": 1.8728323699421965,
"grad_norm": 2.211103638969066,
"learning_rate": 2.211871464974091e-08,
"loss": 0.4755,
"step": 324
},
{
"epoch": 1.8786127167630058,
"grad_norm": 2.1913424628785356,
"learning_rate": 2.0160253111933145e-08,
"loss": 0.4583,
"step": 325
},
{
"epoch": 1.8843930635838149,
"grad_norm": 2.407050555682028,
"learning_rate": 1.8291680003145073e-08,
"loss": 0.4787,
"step": 326
},
{
"epoch": 1.8901734104046244,
"grad_norm": 1.9694723993714849,
"learning_rate": 1.6513166742327168e-08,
"loss": 0.476,
"step": 327
},
{
"epoch": 1.8959537572254335,
"grad_norm": 2.0948624133751363,
"learning_rate": 1.482487648653008e-08,
"loss": 0.477,
"step": 328
},
{
"epoch": 1.9017341040462428,
"grad_norm": 1.9404754923970793,
"learning_rate": 1.3226964115936045e-08,
"loss": 0.4892,
"step": 329
},
{
"epoch": 1.907514450867052,
"grad_norm": 2.6043671145247234,
"learning_rate": 1.1719576219651584e-08,
"loss": 0.4509,
"step": 330
},
{
"epoch": 1.9132947976878611,
"grad_norm": 2.4502507284918273,
"learning_rate": 1.0302851082258367e-08,
"loss": 0.4524,
"step": 331
},
{
"epoch": 1.9190751445086707,
"grad_norm": 1.9921214924857538,
"learning_rate": 8.97691867112882e-09,
"loss": 0.4883,
"step": 332
},
{
"epoch": 1.9248554913294798,
"grad_norm": 1.9469866747763405,
"learning_rate": 7.741900624501974e-09,
"loss": 0.4591,
"step": 333
},
{
"epoch": 1.930635838150289,
"grad_norm": 2.0065329448073155,
"learning_rate": 6.5979102403249664e-09,
"loss": 0.4609,
"step": 334
},
{
"epoch": 1.9364161849710984,
"grad_norm": 5.000176973794001,
"learning_rate": 5.54505246585979e-09,
"loss": 0.4556,
"step": 335
},
{
"epoch": 1.9421965317919074,
"grad_norm": 2.1392791718201787,
"learning_rate": 4.583423888055105e-09,
"loss": 0.4574,
"step": 336
},
{
"epoch": 1.9479768786127167,
"grad_norm": 1.8931847358388805,
"learning_rate": 3.713112724685663e-09,
"loss": 0.4902,
"step": 337
},
{
"epoch": 1.953757225433526,
"grad_norm": 1.905567476983522,
"learning_rate": 2.934198816259559e-09,
"loss": 0.4923,
"step": 338
},
{
"epoch": 1.9595375722543351,
"grad_norm": 2.01728565941551,
"learning_rate": 2.246753618693753e-09,
"loss": 0.4533,
"step": 339
},
{
"epoch": 1.9653179190751446,
"grad_norm": 1.7782643184255449,
"learning_rate": 1.6508401967588736e-09,
"loss": 0.4655,
"step": 340
},
{
"epoch": 1.9710982658959537,
"grad_norm": 2.154854344890582,
"learning_rate": 1.146513218293621e-09,
"loss": 0.4473,
"step": 341
},
{
"epoch": 1.976878612716763,
"grad_norm": 2.1149088823103908,
"learning_rate": 7.338189491900015e-10,
"loss": 0.4753,
"step": 342
},
{
"epoch": 1.9826589595375723,
"grad_norm": 2.06214678310316,
"learning_rate": 4.1279524914861194e-10,
"loss": 0.4521,
"step": 343
},
{
"epoch": 1.9884393063583814,
"grad_norm": 2.0644533457335825,
"learning_rate": 1.834715682056398e-10,
"loss": 0.459,
"step": 344
},
{
"epoch": 1.9942196531791907,
"grad_norm": 2.137977836744415,
"learning_rate": 4.586894403146857e-11,
"loss": 0.5149,
"step": 345
},
{
"epoch": 2.0,
"grad_norm": 1.7708373617580409,
"learning_rate": 0.0,
"loss": 0.3975,
"step": 346
},
{
"epoch": 2.0,
"step": 346,
"total_flos": 1102036773634048.0,
"train_loss": 0.5610263916109338,
"train_runtime": 4562.3378,
"train_samples_per_second": 4.828,
"train_steps_per_second": 0.076
}
],
"logging_steps": 1,
"max_steps": 346,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1102036773634048.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}