{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9856,
"eval_steps": 500,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0064,
"grad_norm": 2.451486825942993,
"learning_rate": 2.1276595744680853e-06,
"loss": 1.5704,
"step": 1
},
{
"epoch": 0.0128,
"grad_norm": 1.047212839126587,
"learning_rate": 4.255319148936171e-06,
"loss": 1.3347,
"step": 2
},
{
"epoch": 0.0192,
"grad_norm": 1.2422709465026855,
"learning_rate": 6.3829787234042555e-06,
"loss": 1.4116,
"step": 3
},
{
"epoch": 0.0256,
"grad_norm": 2.0192582607269287,
"learning_rate": 8.510638297872341e-06,
"loss": 1.5004,
"step": 4
},
{
"epoch": 0.032,
"grad_norm": 1.6840580701828003,
"learning_rate": 1.0638297872340426e-05,
"loss": 1.478,
"step": 5
},
{
"epoch": 0.0384,
"grad_norm": 1.680579662322998,
"learning_rate": 1.2765957446808511e-05,
"loss": 1.3965,
"step": 6
},
{
"epoch": 0.0448,
"grad_norm": 1.8126763105392456,
"learning_rate": 1.4893617021276596e-05,
"loss": 1.4944,
"step": 7
},
{
"epoch": 0.0512,
"grad_norm": 2.288001537322998,
"learning_rate": 1.7021276595744682e-05,
"loss": 1.4845,
"step": 8
},
{
"epoch": 0.0576,
"grad_norm": 1.4367578029632568,
"learning_rate": 1.9148936170212766e-05,
"loss": 1.3447,
"step": 9
},
{
"epoch": 0.064,
"grad_norm": 1.576250433921814,
"learning_rate": 2.1276595744680852e-05,
"loss": 1.3918,
"step": 10
},
{
"epoch": 0.0704,
"grad_norm": 1.6590297222137451,
"learning_rate": 2.340425531914894e-05,
"loss": 1.3442,
"step": 11
},
{
"epoch": 0.0768,
"grad_norm": 2.338789701461792,
"learning_rate": 2.5531914893617022e-05,
"loss": 1.7413,
"step": 12
},
{
"epoch": 0.0832,
"grad_norm": 3.2776601314544678,
"learning_rate": 2.765957446808511e-05,
"loss": 1.7825,
"step": 13
},
{
"epoch": 0.0896,
"grad_norm": 1.463249921798706,
"learning_rate": 2.9787234042553192e-05,
"loss": 1.2387,
"step": 14
},
{
"epoch": 0.096,
"grad_norm": 2.5417563915252686,
"learning_rate": 3.191489361702128e-05,
"loss": 1.5182,
"step": 15
},
{
"epoch": 0.1024,
"grad_norm": 1.448764681816101,
"learning_rate": 3.4042553191489365e-05,
"loss": 1.1844,
"step": 16
},
{
"epoch": 0.1088,
"grad_norm": 1.5268467664718628,
"learning_rate": 3.617021276595745e-05,
"loss": 1.3816,
"step": 17
},
{
"epoch": 0.1152,
"grad_norm": 2.561483383178711,
"learning_rate": 3.829787234042553e-05,
"loss": 1.3909,
"step": 18
},
{
"epoch": 0.1216,
"grad_norm": 1.2603929042816162,
"learning_rate": 4.0425531914893614e-05,
"loss": 1.241,
"step": 19
},
{
"epoch": 0.128,
"grad_norm": 0.7591760158538818,
"learning_rate": 4.2553191489361704e-05,
"loss": 1.1528,
"step": 20
},
{
"epoch": 0.1344,
"grad_norm": 1.6874586343765259,
"learning_rate": 4.468085106382979e-05,
"loss": 1.3256,
"step": 21
},
{
"epoch": 0.1408,
"grad_norm": 1.5209052562713623,
"learning_rate": 4.680851063829788e-05,
"loss": 1.3031,
"step": 22
},
{
"epoch": 0.1472,
"grad_norm": 0.5414145588874817,
"learning_rate": 4.893617021276596e-05,
"loss": 1.192,
"step": 23
},
{
"epoch": 0.1536,
"grad_norm": 1.071704626083374,
"learning_rate": 5.1063829787234044e-05,
"loss": 1.0929,
"step": 24
},
{
"epoch": 0.16,
"grad_norm": 0.5227977633476257,
"learning_rate": 5.319148936170213e-05,
"loss": 1.1455,
"step": 25
},
{
"epoch": 0.1664,
"grad_norm": 0.5635082721710205,
"learning_rate": 5.531914893617022e-05,
"loss": 1.2057,
"step": 26
},
{
"epoch": 0.1728,
"grad_norm": 0.6073344945907593,
"learning_rate": 5.744680851063831e-05,
"loss": 1.2319,
"step": 27
},
{
"epoch": 0.1792,
"grad_norm": 1.3395496606826782,
"learning_rate": 5.9574468085106384e-05,
"loss": 1.3086,
"step": 28
},
{
"epoch": 0.1856,
"grad_norm": 0.5415111184120178,
"learning_rate": 6.170212765957447e-05,
"loss": 1.2653,
"step": 29
},
{
"epoch": 0.192,
"grad_norm": 0.7579400539398193,
"learning_rate": 6.382978723404256e-05,
"loss": 1.0565,
"step": 30
},
{
"epoch": 0.1984,
"grad_norm": 0.4650651514530182,
"learning_rate": 6.595744680851063e-05,
"loss": 1.3408,
"step": 31
},
{
"epoch": 0.2048,
"grad_norm": 0.31583133339881897,
"learning_rate": 6.808510638297873e-05,
"loss": 1.1341,
"step": 32
},
{
"epoch": 0.2112,
"grad_norm": 0.3642672896385193,
"learning_rate": 7.021276595744681e-05,
"loss": 1.2263,
"step": 33
},
{
"epoch": 0.2176,
"grad_norm": 0.40529701113700867,
"learning_rate": 7.23404255319149e-05,
"loss": 1.2229,
"step": 34
},
{
"epoch": 0.224,
"grad_norm": 0.2421422302722931,
"learning_rate": 7.446808510638298e-05,
"loss": 0.9803,
"step": 35
},
{
"epoch": 0.2304,
"grad_norm": 0.3055451810359955,
"learning_rate": 7.659574468085106e-05,
"loss": 1.0436,
"step": 36
},
{
"epoch": 0.2368,
"grad_norm": 0.26383817195892334,
"learning_rate": 7.872340425531916e-05,
"loss": 0.9907,
"step": 37
},
{
"epoch": 0.2432,
"grad_norm": 0.3040568232536316,
"learning_rate": 8.085106382978723e-05,
"loss": 1.1131,
"step": 38
},
{
"epoch": 0.2496,
"grad_norm": 0.20183025300502777,
"learning_rate": 8.297872340425533e-05,
"loss": 0.955,
"step": 39
},
{
"epoch": 0.256,
"grad_norm": 0.24510295689105988,
"learning_rate": 8.510638297872341e-05,
"loss": 1.0317,
"step": 40
},
{
"epoch": 0.2624,
"grad_norm": 0.22165146470069885,
"learning_rate": 8.723404255319149e-05,
"loss": 0.9927,
"step": 41
},
{
"epoch": 0.2688,
"grad_norm": 0.18096181750297546,
"learning_rate": 8.936170212765958e-05,
"loss": 0.9169,
"step": 42
},
{
"epoch": 0.2752,
"grad_norm": 0.2374049425125122,
"learning_rate": 9.148936170212766e-05,
"loss": 0.9629,
"step": 43
},
{
"epoch": 0.2816,
"grad_norm": 0.28989022970199585,
"learning_rate": 9.361702127659576e-05,
"loss": 1.0841,
"step": 44
},
{
"epoch": 0.288,
"grad_norm": 0.25799044966697693,
"learning_rate": 9.574468085106384e-05,
"loss": 1.0746,
"step": 45
},
{
"epoch": 0.2944,
"grad_norm": 0.22019030153751373,
"learning_rate": 9.787234042553192e-05,
"loss": 0.9699,
"step": 46
},
{
"epoch": 0.3008,
"grad_norm": 0.18696561455726624,
"learning_rate": 0.0001,
"loss": 0.8615,
"step": 47
},
{
"epoch": 0.3072,
"grad_norm": 0.2744862735271454,
"learning_rate": 9.999860789001946e-05,
"loss": 1.0763,
"step": 48
},
{
"epoch": 0.3136,
"grad_norm": 0.19078461825847626,
"learning_rate": 9.999443163759668e-05,
"loss": 0.8894,
"step": 49
},
{
"epoch": 0.32,
"grad_norm": 0.19642512500286102,
"learning_rate": 9.998747147528374e-05,
"loss": 0.8941,
"step": 50
},
{
"epoch": 0.3264,
"grad_norm": 0.1972423791885376,
"learning_rate": 9.997772779065312e-05,
"loss": 0.9058,
"step": 51
},
{
"epoch": 0.3328,
"grad_norm": 0.19428911805152893,
"learning_rate": 9.996520112627602e-05,
"loss": 0.9221,
"step": 52
},
{
"epoch": 0.3392,
"grad_norm": 0.20967088639736176,
"learning_rate": 9.994989217969224e-05,
"loss": 0.9024,
"step": 53
},
{
"epoch": 0.3456,
"grad_norm": 0.19962789118289948,
"learning_rate": 9.993180180337126e-05,
"loss": 0.9491,
"step": 54
},
{
"epoch": 0.352,
"grad_norm": 0.1546671986579895,
"learning_rate": 9.991093100466482e-05,
"loss": 0.7734,
"step": 55
},
{
"epoch": 0.3584,
"grad_norm": 0.15869870781898499,
"learning_rate": 9.988728094575082e-05,
"loss": 0.8626,
"step": 56
},
{
"epoch": 0.3648,
"grad_norm": 0.2690005302429199,
"learning_rate": 9.986085294356857e-05,
"loss": 0.905,
"step": 57
},
{
"epoch": 0.3712,
"grad_norm": 0.265375018119812,
"learning_rate": 9.983164846974548e-05,
"loss": 0.9522,
"step": 58
},
{
"epoch": 0.3776,
"grad_norm": 0.1881999671459198,
"learning_rate": 9.979966915051517e-05,
"loss": 0.8813,
"step": 59
},
{
"epoch": 0.384,
"grad_norm": 0.1687404066324234,
"learning_rate": 9.97649167666268e-05,
"loss": 0.7873,
"step": 60
},
{
"epoch": 0.3904,
"grad_norm": 0.16878949105739594,
"learning_rate": 9.972739325324596e-05,
"loss": 0.8416,
"step": 61
},
{
"epoch": 0.3968,
"grad_norm": 0.1852271556854248,
"learning_rate": 9.968710069984698e-05,
"loss": 0.8725,
"step": 62
},
{
"epoch": 0.4032,
"grad_norm": 0.20418129861354828,
"learning_rate": 9.964404135009648e-05,
"loss": 0.8924,
"step": 63
},
{
"epoch": 0.4096,
"grad_norm": 0.17017708718776703,
"learning_rate": 9.95982176017285e-05,
"loss": 0.8465,
"step": 64
},
{
"epoch": 0.416,
"grad_norm": 0.200260192155838,
"learning_rate": 9.95496320064109e-05,
"loss": 0.897,
"step": 65
},
{
"epoch": 0.4224,
"grad_norm": 0.19314530491828918,
"learning_rate": 9.94982872696034e-05,
"loss": 0.8314,
"step": 66
},
{
"epoch": 0.4288,
"grad_norm": 0.16122443974018097,
"learning_rate": 9.94441862504068e-05,
"loss": 0.8258,
"step": 67
},
{
"epoch": 0.4352,
"grad_norm": 0.1727280616760254,
"learning_rate": 9.938733196140386e-05,
"loss": 0.852,
"step": 68
},
{
"epoch": 0.4416,
"grad_norm": 0.1809166669845581,
"learning_rate": 9.932772756849153e-05,
"loss": 0.7956,
"step": 69
},
{
"epoch": 0.448,
"grad_norm": 0.13888217508792877,
"learning_rate": 9.926537639070457e-05,
"loss": 0.7456,
"step": 70
},
{
"epoch": 0.4544,
"grad_norm": 0.1737329661846161,
"learning_rate": 9.92002819000309e-05,
"loss": 0.7735,
"step": 71
},
{
"epoch": 0.4608,
"grad_norm": 0.21496297419071198,
"learning_rate": 9.91324477212181e-05,
"loss": 0.8925,
"step": 72
},
{
"epoch": 0.4672,
"grad_norm": 0.1533055156469345,
"learning_rate": 9.906187763157168e-05,
"loss": 0.7513,
"step": 73
},
{
"epoch": 0.4736,
"grad_norm": 0.17420215904712677,
"learning_rate": 9.898857556074468e-05,
"loss": 0.82,
"step": 74
},
{
"epoch": 0.48,
"grad_norm": 0.17036575078964233,
"learning_rate": 9.891254559051885e-05,
"loss": 0.8445,
"step": 75
},
{
"epoch": 0.4864,
"grad_norm": 0.19612562656402588,
"learning_rate": 9.883379195457746e-05,
"loss": 0.7657,
"step": 76
},
{
"epoch": 0.4928,
"grad_norm": 0.18145954608917236,
"learning_rate": 9.875231903826936e-05,
"loss": 0.8554,
"step": 77
},
{
"epoch": 0.4992,
"grad_norm": 0.19653621315956116,
"learning_rate": 9.866813137836499e-05,
"loss": 0.8247,
"step": 78
},
{
"epoch": 0.5056,
"grad_norm": 0.15563753247261047,
"learning_rate": 9.858123366280358e-05,
"loss": 0.8427,
"step": 79
},
{
"epoch": 0.512,
"grad_norm": 0.18898862600326538,
"learning_rate": 9.849163073043223e-05,
"loss": 0.8435,
"step": 80
},
{
"epoch": 0.5184,
"grad_norm": 0.2034827470779419,
"learning_rate": 9.839932757073638e-05,
"loss": 0.8755,
"step": 81
},
{
"epoch": 0.5248,
"grad_norm": 0.18574374914169312,
"learning_rate": 9.830432932356206e-05,
"loss": 0.7832,
"step": 82
},
{
"epoch": 0.5312,
"grad_norm": 0.1965630203485489,
"learning_rate": 9.820664127882957e-05,
"loss": 0.8227,
"step": 83
},
{
"epoch": 0.5376,
"grad_norm": 0.1712363362312317,
"learning_rate": 9.8106268876239e-05,
"loss": 0.7677,
"step": 84
},
{
"epoch": 0.544,
"grad_norm": 0.1882326453924179,
"learning_rate": 9.800321770496726e-05,
"loss": 0.8642,
"step": 85
},
{
"epoch": 0.5504,
"grad_norm": 0.21462184190750122,
"learning_rate": 9.789749350335693e-05,
"loss": 0.8289,
"step": 86
},
{
"epoch": 0.5568,
"grad_norm": 0.16833196580410004,
"learning_rate": 9.778910215859667e-05,
"loss": 0.7127,
"step": 87
},
{
"epoch": 0.5632,
"grad_norm": 0.23049375414848328,
"learning_rate": 9.767804970639339e-05,
"loss": 0.8281,
"step": 88
},
{
"epoch": 0.5696,
"grad_norm": 0.22619804739952087,
"learning_rate": 9.756434233063616e-05,
"loss": 0.8528,
"step": 89
},
{
"epoch": 0.576,
"grad_norm": 0.17294900119304657,
"learning_rate": 9.744798636305188e-05,
"loss": 0.7163,
"step": 90
},
{
"epoch": 0.5824,
"grad_norm": 0.21446748077869415,
"learning_rate": 9.732898828285273e-05,
"loss": 0.76,
"step": 91
},
{
"epoch": 0.5888,
"grad_norm": 0.19360435009002686,
"learning_rate": 9.72073547163753e-05,
"loss": 0.7694,
"step": 92
},
{
"epoch": 0.5952,
"grad_norm": 0.16693833470344543,
"learning_rate": 9.708309243671165e-05,
"loss": 0.7513,
"step": 93
},
{
"epoch": 0.6016,
"grad_norm": 0.2053443044424057,
"learning_rate": 9.695620836333219e-05,
"loss": 0.7768,
"step": 94
},
{
"epoch": 0.608,
"grad_norm": 0.20644915103912354,
"learning_rate": 9.68267095617003e-05,
"loss": 0.7645,
"step": 95
},
{
"epoch": 0.6144,
"grad_norm": 0.179234117269516,
"learning_rate": 9.669460324287898e-05,
"loss": 0.8299,
"step": 96
},
{
"epoch": 0.6208,
"grad_norm": 0.18114909529685974,
"learning_rate": 9.655989676312918e-05,
"loss": 0.7326,
"step": 97
},
{
"epoch": 0.6272,
"grad_norm": 0.19695445895195007,
"learning_rate": 9.642259762350033e-05,
"loss": 0.7831,
"step": 98
},
{
"epoch": 0.6336,
"grad_norm": 0.17625164985656738,
"learning_rate": 9.628271346941252e-05,
"loss": 0.7511,
"step": 99
},
{
"epoch": 0.64,
"grad_norm": 0.2164350301027298,
"learning_rate": 9.614025209023084e-05,
"loss": 0.7882,
"step": 100
},
{
"epoch": 0.6464,
"grad_norm": 0.18725153803825378,
"learning_rate": 9.59952214188316e-05,
"loss": 0.7671,
"step": 101
},
{
"epoch": 0.6528,
"grad_norm": 0.18197496235370636,
"learning_rate": 9.58476295311606e-05,
"loss": 0.7698,
"step": 102
},
{
"epoch": 0.6592,
"grad_norm": 0.19369439780712128,
"learning_rate": 9.569748464578343e-05,
"loss": 0.7534,
"step": 103
},
{
"epoch": 0.6656,
"grad_norm": 0.2009640485048294,
"learning_rate": 9.554479512342784e-05,
"loss": 0.7415,
"step": 104
},
{
"epoch": 0.672,
"grad_norm": 0.19398680329322815,
"learning_rate": 9.538956946651815e-05,
"loss": 0.7429,
"step": 105
},
{
"epoch": 0.6784,
"grad_norm": 0.20824606716632843,
"learning_rate": 9.52318163187018e-05,
"loss": 0.815,
"step": 106
},
{
"epoch": 0.6848,
"grad_norm": 0.21556821465492249,
"learning_rate": 9.507154446436805e-05,
"loss": 0.7476,
"step": 107
},
{
"epoch": 0.6912,
"grad_norm": 0.17891979217529297,
"learning_rate": 9.490876282815884e-05,
"loss": 0.7209,
"step": 108
},
{
"epoch": 0.6976,
"grad_norm": 0.21326586604118347,
"learning_rate": 9.474348047447177e-05,
"loss": 0.7386,
"step": 109
},
{
"epoch": 0.704,
"grad_norm": 0.17727015912532806,
"learning_rate": 9.457570660695541e-05,
"loss": 0.7015,
"step": 110
},
{
"epoch": 0.7104,
"grad_norm": 0.20510976016521454,
"learning_rate": 9.440545056799677e-05,
"loss": 0.7266,
"step": 111
},
{
"epoch": 0.7168,
"grad_norm": 0.17708668112754822,
"learning_rate": 9.423272183820108e-05,
"loss": 0.7472,
"step": 112
},
{
"epoch": 0.7232,
"grad_norm": 0.18563494086265564,
"learning_rate": 9.405753003586395e-05,
"loss": 0.7652,
"step": 113
},
{
"epoch": 0.7296,
"grad_norm": 0.17155848443508148,
"learning_rate": 9.387988491643558e-05,
"loss": 0.7821,
"step": 114
},
{
"epoch": 0.736,
"grad_norm": 0.22160275280475616,
"learning_rate": 9.369979637197775e-05,
"loss": 0.7631,
"step": 115
},
{
"epoch": 0.7424,
"grad_norm": 0.20093092322349548,
"learning_rate": 9.351727443061283e-05,
"loss": 0.7896,
"step": 116
},
{
"epoch": 0.7488,
"grad_norm": 0.19560478627681732,
"learning_rate": 9.333232925596552e-05,
"loss": 0.7933,
"step": 117
},
{
"epoch": 0.7552,
"grad_norm": 0.1992943435907364,
"learning_rate": 9.314497114659671e-05,
"loss": 0.7548,
"step": 118
},
{
"epoch": 0.7616,
"grad_norm": 0.19478629529476166,
"learning_rate": 9.295521053543019e-05,
"loss": 0.7775,
"step": 119
},
{
"epoch": 0.768,
"grad_norm": 0.18368546664714813,
"learning_rate": 9.276305798917159e-05,
"loss": 0.7601,
"step": 120
},
{
"epoch": 0.7744,
"grad_norm": 0.2044031322002411,
"learning_rate": 9.256852420771998e-05,
"loss": 0.7497,
"step": 121
},
{
"epoch": 0.7808,
"grad_norm": 0.1834166944026947,
"learning_rate": 9.237162002357214e-05,
"loss": 0.7278,
"step": 122
},
{
"epoch": 0.7872,
"grad_norm": 0.19447724521160126,
"learning_rate": 9.217235640121926e-05,
"loss": 0.72,
"step": 123
},
{
"epoch": 0.7936,
"grad_norm": 0.1712876558303833,
"learning_rate": 9.197074443653642e-05,
"loss": 0.7404,
"step": 124
},
{
"epoch": 0.8,
"grad_norm": 0.20997127890586853,
"learning_rate": 9.176679535616477e-05,
"loss": 0.7825,
"step": 125
},
{
"epoch": 0.8064,
"grad_norm": 0.1855696141719818,
"learning_rate": 9.156052051688632e-05,
"loss": 0.7547,
"step": 126
},
{
"epoch": 0.8128,
"grad_norm": 0.20914626121520996,
"learning_rate": 9.135193140499156e-05,
"loss": 0.7972,
"step": 127
},
{
"epoch": 0.8192,
"grad_norm": 0.21978524327278137,
"learning_rate": 9.114103963563986e-05,
"loss": 0.7325,
"step": 128
},
{
"epoch": 0.8256,
"grad_norm": 0.2079222947359085,
"learning_rate": 9.092785695221271e-05,
"loss": 0.8084,
"step": 129
},
{
"epoch": 0.832,
"grad_norm": 0.1860748827457428,
"learning_rate": 9.071239522565977e-05,
"loss": 0.7359,
"step": 130
},
{
"epoch": 0.8384,
"grad_norm": 0.1871631145477295,
"learning_rate": 9.049466645383784e-05,
"loss": 0.7138,
"step": 131
},
{
"epoch": 0.8448,
"grad_norm": 0.19846683740615845,
"learning_rate": 9.027468276084275e-05,
"loss": 0.7176,
"step": 132
},
{
"epoch": 0.8512,
"grad_norm": 0.2049422711133957,
"learning_rate": 9.00524563963343e-05,
"loss": 0.7595,
"step": 133
},
{
"epoch": 0.8576,
"grad_norm": 0.17989754676818848,
"learning_rate": 8.982799973485407e-05,
"loss": 0.7445,
"step": 134
},
{
"epoch": 0.864,
"grad_norm": 0.32045578956604004,
"learning_rate": 8.960132527513643e-05,
"loss": 0.7214,
"step": 135
},
{
"epoch": 0.8704,
"grad_norm": 0.2243393063545227,
"learning_rate": 8.937244563941247e-05,
"loss": 0.8011,
"step": 136
},
{
"epoch": 0.8768,
"grad_norm": 0.20574022829532623,
"learning_rate": 8.914137357270723e-05,
"loss": 0.768,
"step": 137
},
{
"epoch": 0.8832,
"grad_norm": 0.2292594015598297,
"learning_rate": 8.890812194212988e-05,
"loss": 0.7083,
"step": 138
},
{
"epoch": 0.8896,
"grad_norm": 0.2046431005001068,
"learning_rate": 8.867270373615734e-05,
"loss": 0.77,
"step": 139
},
{
"epoch": 0.896,
"grad_norm": 0.22153380513191223,
"learning_rate": 8.843513206391101e-05,
"loss": 0.7262,
"step": 140
},
{
"epoch": 0.9024,
"grad_norm": 0.214860737323761,
"learning_rate": 8.81954201544267e-05,
"loss": 0.8284,
"step": 141
},
{
"epoch": 0.9088,
"grad_norm": 0.21554942429065704,
"learning_rate": 8.795358135591811e-05,
"loss": 0.7712,
"step": 142
},
{
"epoch": 0.9152,
"grad_norm": 0.21957093477249146,
"learning_rate": 8.77096291350334e-05,
"loss": 0.7235,
"step": 143
},
{
"epoch": 0.9216,
"grad_norm": 0.20161031186580658,
"learning_rate": 8.746357707610545e-05,
"loss": 0.7071,
"step": 144
},
{
"epoch": 0.928,
"grad_norm": 0.20437265932559967,
"learning_rate": 8.721543888039533e-05,
"loss": 0.7134,
"step": 145
},
{
"epoch": 0.9344,
"grad_norm": 0.19950765371322632,
"learning_rate": 8.69652283653294e-05,
"loss": 0.6978,
"step": 146
},
{
"epoch": 0.9408,
"grad_norm": 0.19119159877300262,
"learning_rate": 8.671295946372988e-05,
"loss": 0.7583,
"step": 147
},
{
"epoch": 0.9472,
"grad_norm": 0.20924100279808044,
"learning_rate": 8.645864622303898e-05,
"loss": 0.7563,
"step": 148
},
{
"epoch": 0.9536,
"grad_norm": 0.18858124315738678,
"learning_rate": 8.620230280453673e-05,
"loss": 0.7121,
"step": 149
},
{
"epoch": 0.96,
"grad_norm": 0.17950883507728577,
"learning_rate": 8.594394348255238e-05,
"loss": 0.6578,
"step": 150
},
{
"epoch": 0.9664,
"grad_norm": 0.20606084167957306,
"learning_rate": 8.568358264366957e-05,
"loss": 0.71,
"step": 151
},
{
"epoch": 0.9728,
"grad_norm": 0.18285425007343292,
"learning_rate": 8.542123478592518e-05,
"loss": 0.6941,
"step": 152
},
{
"epoch": 0.9792,
"grad_norm": 0.20143665373325348,
"learning_rate": 8.515691451800205e-05,
"loss": 0.636,
"step": 153
},
{
"epoch": 0.9856,
"grad_norm": 0.20506098866462708,
"learning_rate": 8.489063655841551e-05,
"loss": 0.7111,
"step": 154
},
{
"epoch": 0.992,
"grad_norm": 0.18759530782699585,
"learning_rate": 8.462241573469379e-05,
"loss": 0.733,
"step": 155
},
{
"epoch": 0.9984,
"grad_norm": 0.22259001433849335,
"learning_rate": 8.435226698255227e-05,
"loss": 0.7405,
"step": 156
},
{
"epoch": 1.0,
"grad_norm": 0.3408335745334625,
"learning_rate": 8.408020534506195e-05,
"loss": 0.6648,
"step": 157
},
{
"epoch": 1.0064,
"grad_norm": 0.18838848173618317,
"learning_rate": 8.380624597181165e-05,
"loss": 0.6529,
"step": 158
},
{
"epoch": 1.0128,
"grad_norm": 0.20577505230903625,
"learning_rate": 8.353040411806447e-05,
"loss": 0.6837,
"step": 159
},
{
"epoch": 1.0192,
"grad_norm": 0.20288875699043274,
"learning_rate": 8.325269514390835e-05,
"loss": 0.7313,
"step": 160
},
{
"epoch": 1.0256,
"grad_norm": 0.2250303328037262,
"learning_rate": 8.297313451340064e-05,
"loss": 0.7675,
"step": 161
},
{
"epoch": 1.032,
"grad_norm": 0.21464917063713074,
"learning_rate": 8.269173779370711e-05,
"loss": 0.778,
"step": 162
},
{
"epoch": 1.0384,
"grad_norm": 0.19386309385299683,
"learning_rate": 8.240852065423506e-05,
"loss": 0.7163,
"step": 163
},
{
"epoch": 1.0448,
"grad_norm": 0.1796194165945053,
"learning_rate": 8.21234988657607e-05,
"loss": 0.6736,
"step": 164
},
{
"epoch": 1.0512,
"grad_norm": 0.21359069645404816,
"learning_rate": 8.183668829955111e-05,
"loss": 0.7748,
"step": 165
},
{
"epoch": 1.0576,
"grad_norm": 0.19759395718574524,
"learning_rate": 8.154810492648037e-05,
"loss": 0.746,
"step": 166
},
{
"epoch": 1.064,
"grad_norm": 0.19113849103450775,
"learning_rate": 8.125776481614024e-05,
"loss": 0.6629,
"step": 167
},
{
"epoch": 1.0704,
"grad_norm": 0.18363501131534576,
"learning_rate": 8.096568413594533e-05,
"loss": 0.674,
"step": 168
},
{
"epoch": 1.0768,
"grad_norm": 0.20529989898204803,
"learning_rate": 8.067187915023282e-05,
"loss": 0.7051,
"step": 169
},
{
"epoch": 1.0832,
"grad_norm": 0.176677405834198,
"learning_rate": 8.037636621935685e-05,
"loss": 0.7434,
"step": 170
},
{
"epoch": 1.0896,
"grad_norm": 0.2223319411277771,
"learning_rate": 8.007916179877741e-05,
"loss": 0.6931,
"step": 171
},
{
"epoch": 1.096,
"grad_norm": 0.18287041783332825,
"learning_rate": 7.978028243814415e-05,
"loss": 0.7043,
"step": 172
},
{
"epoch": 1.1024,
"grad_norm": 0.2017446905374527,
"learning_rate": 7.947974478037468e-05,
"loss": 0.7687,
"step": 173
},
{
"epoch": 1.1088,
"grad_norm": 0.17362761497497559,
"learning_rate": 7.91775655607279e-05,
"loss": 0.7475,
"step": 174
},
{
"epoch": 1.1152,
"grad_norm": 0.23194265365600586,
"learning_rate": 7.887376160587215e-05,
"loss": 0.7347,
"step": 175
},
{
"epoch": 1.1216,
"grad_norm": 0.19501113891601562,
"learning_rate": 7.85683498329481e-05,
"loss": 0.7044,
"step": 176
},
{
"epoch": 1.1280000000000001,
"grad_norm": 0.20629528164863586,
"learning_rate": 7.826134724862687e-05,
"loss": 0.7334,
"step": 177
},
{
"epoch": 1.1344,
"grad_norm": 0.21323485672473907,
"learning_rate": 7.795277094816291e-05,
"loss": 0.7211,
"step": 178
},
{
"epoch": 1.1408,
"grad_norm": 0.23606210947036743,
"learning_rate": 7.764263811444215e-05,
"loss": 0.6876,
"step": 179
},
{
"epoch": 1.1472,
"grad_norm": 0.1839892566204071,
"learning_rate": 7.733096601702507e-05,
"loss": 0.7583,
"step": 180
},
{
"epoch": 1.1536,
"grad_norm": 0.21783791482448578,
"learning_rate": 7.70177720111852e-05,
"loss": 0.6689,
"step": 181
},
{
"epoch": 1.16,
"grad_norm": 0.1926935911178589,
"learning_rate": 7.67030735369426e-05,
"loss": 0.76,
"step": 182
},
{
"epoch": 1.1663999999999999,
"grad_norm": 0.18627937138080597,
"learning_rate": 7.638688811809274e-05,
"loss": 0.6958,
"step": 183
},
{
"epoch": 1.1728,
"grad_norm": 0.20846357941627502,
"learning_rate": 7.60692333612307e-05,
"loss": 0.6813,
"step": 184
},
{
"epoch": 1.1792,
"grad_norm": 0.1936255395412445,
"learning_rate": 7.575012695477076e-05,
"loss": 0.6926,
"step": 185
},
{
"epoch": 1.1856,
"grad_norm": 0.18793527781963348,
"learning_rate": 7.54295866679615e-05,
"loss": 0.6974,
"step": 186
},
{
"epoch": 1.192,
"grad_norm": 0.19964028894901276,
"learning_rate": 7.510763034989617e-05,
"loss": 0.7603,
"step": 187
},
{
"epoch": 1.1984,
"grad_norm": 0.20926351845264435,
"learning_rate": 7.478427592851893e-05,
"loss": 0.6682,
"step": 188
},
{
"epoch": 1.2048,
"grad_norm": 0.22575032711029053,
"learning_rate": 7.44595414096265e-05,
"loss": 0.7939,
"step": 189
},
{
"epoch": 1.2112,
"grad_norm": 0.19395670294761658,
"learning_rate": 7.413344487586542e-05,
"loss": 0.6991,
"step": 190
},
{
"epoch": 1.2176,
"grad_norm": 0.19221967458724976,
"learning_rate": 7.380600448572531e-05,
"loss": 0.7237,
"step": 191
},
{
"epoch": 1.224,
"grad_norm": 0.2095392495393753,
"learning_rate": 7.347723847252756e-05,
"loss": 0.7686,
"step": 192
},
{
"epoch": 1.2304,
"grad_norm": 0.19817109405994415,
"learning_rate": 7.314716514341006e-05,
"loss": 0.711,
"step": 193
},
{
"epoch": 1.2368000000000001,
"grad_norm": 0.19115367531776428,
"learning_rate": 7.28158028783079e-05,
"loss": 0.6759,
"step": 194
},
{
"epoch": 1.2432,
"grad_norm": 0.1996709704399109,
"learning_rate": 7.248317012892969e-05,
"loss": 0.7026,
"step": 195
},
{
"epoch": 1.2496,
"grad_norm": 0.1884952187538147,
"learning_rate": 7.214928541773027e-05,
"loss": 0.7281,
"step": 196
},
{
"epoch": 1.256,
"grad_norm": 0.1934693157672882,
"learning_rate": 7.181416733687919e-05,
"loss": 0.6471,
"step": 197
},
{
"epoch": 1.2624,
"grad_norm": 0.19305215775966644,
"learning_rate": 7.147783454722545e-05,
"loss": 0.7218,
"step": 198
},
{
"epoch": 1.2688,
"grad_norm": 0.17377148568630219,
"learning_rate": 7.114030577725836e-05,
"loss": 0.6985,
"step": 199
},
{
"epoch": 1.2752,
"grad_norm": 0.19253912568092346,
"learning_rate": 7.080159982206471e-05,
"loss": 0.723,
"step": 200
},
{
"epoch": 1.2816,
"grad_norm": 0.2046697437763214,
"learning_rate": 7.046173554228213e-05,
"loss": 0.6556,
"step": 201
},
{
"epoch": 1.288,
"grad_norm": 0.18565595149993896,
"learning_rate": 7.012073186304886e-05,
"loss": 0.7111,
"step": 202
},
{
"epoch": 1.2944,
"grad_norm": 0.1991768181324005,
"learning_rate": 6.977860777294988e-05,
"loss": 0.7236,
"step": 203
},
{
"epoch": 1.3008,
"grad_norm": 0.20029094815254211,
"learning_rate": 6.943538232295964e-05,
"loss": 0.6902,
"step": 204
},
{
"epoch": 1.3072,
"grad_norm": 0.20728713274002075,
"learning_rate": 6.909107462538113e-05,
"loss": 0.723,
"step": 205
},
{
"epoch": 1.3136,
"grad_norm": 0.20966534316539764,
"learning_rate": 6.874570385278161e-05,
"loss": 0.7214,
"step": 206
},
{
"epoch": 1.32,
"grad_norm": 0.19986563920974731,
"learning_rate": 6.839928923692504e-05,
"loss": 0.7464,
"step": 207
},
{
"epoch": 1.3264,
"grad_norm": 0.22603270411491394,
"learning_rate": 6.805185006770124e-05,
"loss": 0.7287,
"step": 208
},
{
"epoch": 1.3328,
"grad_norm": 0.1987605094909668,
"learning_rate": 6.770340569205158e-05,
"loss": 0.7483,
"step": 209
},
{
"epoch": 1.3392,
"grad_norm": 0.23768657445907593,
"learning_rate": 6.735397551289178e-05,
"loss": 0.685,
"step": 210
},
{
"epoch": 1.3456000000000001,
"grad_norm": 0.19051390886306763,
"learning_rate": 6.700357898803145e-05,
"loss": 0.7484,
"step": 211
},
{
"epoch": 1.3519999999999999,
"grad_norm": 0.1968153864145279,
"learning_rate": 6.665223562909058e-05,
"loss": 0.7095,
"step": 212
},
{
"epoch": 1.3584,
"grad_norm": 0.21399809420108795,
"learning_rate": 6.629996500041299e-05,
"loss": 0.6991,
"step": 213
},
{
"epoch": 1.3648,
"grad_norm": 0.21878115832805634,
"learning_rate": 6.594678671797703e-05,
"loss": 0.7192,
"step": 214
},
{
"epoch": 1.3712,
"grad_norm": 0.19469957053661346,
"learning_rate": 6.559272044830317e-05,
"loss": 0.6861,
"step": 215
},
{
"epoch": 1.3776,
"grad_norm": 0.23191341757774353,
"learning_rate": 6.523778590735891e-05,
"loss": 0.7194,
"step": 216
},
{
"epoch": 1.384,
"grad_norm": 0.21132110059261322,
"learning_rate": 6.488200285946094e-05,
"loss": 0.685,
"step": 217
},
{
"epoch": 1.3904,
"grad_norm": 0.22124455869197845,
"learning_rate": 6.452539111617453e-05,
"loss": 0.6685,
"step": 218
},
{
"epoch": 1.3968,
"grad_norm": 0.19542793929576874,
"learning_rate": 6.416797053521038e-05,
"loss": 0.6813,
"step": 219
},
{
"epoch": 1.4032,
"grad_norm": 0.2289544641971588,
"learning_rate": 6.38097610193188e-05,
"loss": 0.6714,
"step": 220
},
{
"epoch": 1.4096,
"grad_norm": 0.2579493224620819,
"learning_rate": 6.345078251518143e-05,
"loss": 0.7159,
"step": 221
},
{
"epoch": 1.416,
"grad_norm": 0.19421054422855377,
"learning_rate": 6.309105501230067e-05,
"loss": 0.7322,
"step": 222
},
{
"epoch": 1.4224,
"grad_norm": 0.20962068438529968,
"learning_rate": 6.273059854188636e-05,
"loss": 0.7126,
"step": 223
},
{
"epoch": 1.4288,
"grad_norm": 0.19890280067920685,
"learning_rate": 6.236943317574054e-05,
"loss": 0.7368,
"step": 224
},
{
"epoch": 1.4352,
"grad_norm": 0.20143720507621765,
"learning_rate": 6.200757902513962e-05,
"loss": 0.7455,
"step": 225
},
{
"epoch": 1.4416,
"grad_norm": 0.23619429767131805,
"learning_rate": 6.164505623971457e-05,
"loss": 0.7332,
"step": 226
},
{
"epoch": 1.448,
"grad_norm": 0.18844173848628998,
"learning_rate": 6.128188500632892e-05,
"loss": 0.726,
"step": 227
},
{
"epoch": 1.4544000000000001,
"grad_norm": 0.2009935975074768,
"learning_rate": 6.091808554795462e-05,
"loss": 0.7047,
"step": 228
},
{
"epoch": 1.4607999999999999,
"grad_norm": 0.21597343683242798,
"learning_rate": 6.055367812254592e-05,
"loss": 0.6953,
"step": 229
},
{
"epoch": 1.4672,
"grad_norm": 0.21373218297958374,
"learning_rate": 6.0188683021911396e-05,
"loss": 0.7517,
"step": 230
},
{
"epoch": 1.4736,
"grad_norm": 0.2007780373096466,
"learning_rate": 5.9823120570583926e-05,
"loss": 0.6925,
"step": 231
},
{
"epoch": 1.48,
"grad_norm": 0.24474382400512695,
"learning_rate": 5.9457011124689023e-05,
"loss": 0.6962,
"step": 232
},
{
"epoch": 1.4864,
"grad_norm": 0.20391108095645905,
"learning_rate": 5.909037507081121e-05,
"loss": 0.75,
"step": 233
},
{
"epoch": 1.4928,
"grad_norm": 0.20164735615253448,
"learning_rate": 5.8723232824858886e-05,
"loss": 0.7141,
"step": 234
},
{
"epoch": 1.4992,
"grad_norm": 0.19647496938705444,
"learning_rate": 5.835560483092743e-05,
"loss": 0.7297,
"step": 235
},
{
"epoch": 1.5056,
"grad_norm": 0.1943722367286682,
"learning_rate": 5.798751156016085e-05,
"loss": 0.7056,
"step": 236
},
{
"epoch": 1.512,
"grad_norm": 0.1920538693666458,
"learning_rate": 5.761897350961175e-05,
"loss": 0.7046,
"step": 237
},
{
"epoch": 1.5184,
"grad_norm": 0.2010536789894104,
"learning_rate": 5.7250011201100095e-05,
"loss": 0.7222,
"step": 238
},
{
"epoch": 1.5248,
"grad_norm": 0.20179545879364014,
"learning_rate": 5.688064518007036e-05,
"loss": 0.6906,
"step": 239
},
{
"epoch": 1.5312000000000001,
"grad_norm": 0.19752287864685059,
"learning_rate": 5.6510896014447526e-05,
"loss": 0.7016,
"step": 240
},
{
"epoch": 1.5375999999999999,
"grad_norm": 0.20254461467266083,
"learning_rate": 5.6140784293491725e-05,
"loss": 0.6707,
"step": 241
},
{
"epoch": 1.544,
"grad_norm": 0.2074304223060608,
"learning_rate": 5.577033062665179e-05,
"loss": 0.7068,
"step": 242
},
{
"epoch": 1.5504,
"grad_norm": 0.1754094362258911,
"learning_rate": 5.53995556424176e-05,
"loss": 0.6778,
"step": 243
},
{
"epoch": 1.5568,
"grad_norm": 0.19732975959777832,
"learning_rate": 5.50284799871714e-05,
"loss": 0.7431,
"step": 244
},
{
"epoch": 1.5632000000000001,
"grad_norm": 0.22642773389816284,
"learning_rate": 5.465712432403812e-05,
"loss": 0.7276,
"step": 245
},
{
"epoch": 1.5695999999999999,
"grad_norm": 0.19384561479091644,
"learning_rate": 5.428550933173476e-05,
"loss": 0.6988,
"step": 246
},
{
"epoch": 1.576,
"grad_norm": 0.19731836020946503,
"learning_rate": 5.391365570341893e-05,
"loss": 0.6659,
"step": 247
},
{
"epoch": 1.5824,
"grad_norm": 0.1964057832956314,
"learning_rate": 5.3541584145536475e-05,
"loss": 0.6876,
"step": 248
},
{
"epoch": 1.5888,
"grad_norm": 0.21582961082458496,
"learning_rate": 5.316931537666856e-05,
"loss": 0.7048,
"step": 249
},
{
"epoch": 1.5952,
"grad_norm": 0.18675744533538818,
"learning_rate": 5.279687012637799e-05,
"loss": 0.657,
"step": 250
},
{
"epoch": 1.6016,
"grad_norm": 0.21063970029354095,
"learning_rate": 5.24242691340547e-05,
"loss": 0.6786,
"step": 251
},
{
"epoch": 1.608,
"grad_norm": 0.20948725938796997,
"learning_rate": 5.2051533147761155e-05,
"loss": 0.712,
"step": 252
},
{
"epoch": 1.6143999999999998,
"grad_norm": 0.2167293280363083,
"learning_rate": 5.167868292307678e-05,
"loss": 0.668,
"step": 253
},
{
"epoch": 1.6208,
"grad_norm": 0.19093649089336395,
"learning_rate": 5.1305739221942364e-05,
"loss": 0.7175,
"step": 254
},
{
"epoch": 1.6272,
"grad_norm": 0.2434680163860321,
"learning_rate": 5.093272281150383e-05,
"loss": 0.6888,
"step": 255
},
{
"epoch": 1.6336,
"grad_norm": 0.21365131437778473,
"learning_rate": 5.05596544629559e-05,
"loss": 0.6907,
"step": 256
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.24530403316020966,
"learning_rate": 5.018655495038541e-05,
"loss": 0.6995,
"step": 257
},
{
"epoch": 1.6463999999999999,
"grad_norm": 0.2155607044696808,
"learning_rate": 4.981344504961459e-05,
"loss": 0.6894,
"step": 258
},
{
"epoch": 1.6528,
"grad_norm": 0.193691223859787,
"learning_rate": 4.944034553704412e-05,
"loss": 0.6693,
"step": 259
},
{
"epoch": 1.6592,
"grad_norm": 0.23936335742473602,
"learning_rate": 4.9067277188496185e-05,
"loss": 0.7161,
"step": 260
},
{
"epoch": 1.6656,
"grad_norm": 0.19866809248924255,
"learning_rate": 4.869426077805765e-05,
"loss": 0.6558,
"step": 261
},
{
"epoch": 1.6720000000000002,
"grad_norm": 0.21519921720027924,
"learning_rate": 4.8321317076923223e-05,
"loss": 0.6388,
"step": 262
},
{
"epoch": 1.6784,
"grad_norm": 0.23403239250183105,
"learning_rate": 4.794846685223884e-05,
"loss": 0.723,
"step": 263
},
{
"epoch": 1.6848,
"grad_norm": 0.20778563618659973,
"learning_rate": 4.757573086594529e-05,
"loss": 0.7281,
"step": 264
},
{
"epoch": 1.6912,
"grad_norm": 0.2214445173740387,
"learning_rate": 4.7203129873622045e-05,
"loss": 0.6819,
"step": 265
},
{
"epoch": 1.6976,
"grad_norm": 0.23840227723121643,
"learning_rate": 4.6830684623331446e-05,
"loss": 0.6612,
"step": 266
},
{
"epoch": 1.704,
"grad_norm": 0.21336662769317627,
"learning_rate": 4.645841585446356e-05,
"loss": 0.6873,
"step": 267
},
{
"epoch": 1.7104,
"grad_norm": 0.20929306745529175,
"learning_rate": 4.60863442965811e-05,
"loss": 0.6937,
"step": 268
},
{
"epoch": 1.7168,
"grad_norm": 0.2372131198644638,
"learning_rate": 4.5714490668265245e-05,
"loss": 0.6603,
"step": 269
},
{
"epoch": 1.7231999999999998,
"grad_norm": 0.19182223081588745,
"learning_rate": 4.5342875675961885e-05,
"loss": 0.7286,
"step": 270
},
{
"epoch": 1.7296,
"grad_norm": 0.19139911234378815,
"learning_rate": 4.497152001282861e-05,
"loss": 0.6884,
"step": 271
},
{
"epoch": 1.736,
"grad_norm": 0.22471372783184052,
"learning_rate": 4.460044435758241e-05,
"loss": 0.6634,
"step": 272
},
{
"epoch": 1.7424,
"grad_norm": 0.1903676688671112,
"learning_rate": 4.4229669373348226e-05,
"loss": 0.6813,
"step": 273
},
{
"epoch": 1.7488000000000001,
"grad_norm": 0.18955154716968536,
"learning_rate": 4.3859215706508294e-05,
"loss": 0.6851,
"step": 274
},
{
"epoch": 1.7551999999999999,
"grad_norm": 0.208612322807312,
"learning_rate": 4.348910398555249e-05,
"loss": 0.6685,
"step": 275
},
{
"epoch": 1.7616,
"grad_norm": 0.19398434460163116,
"learning_rate": 4.3119354819929655e-05,
"loss": 0.6544,
"step": 276
},
{
"epoch": 1.768,
"grad_norm": 0.1975647658109665,
"learning_rate": 4.27499887988999e-05,
"loss": 0.6744,
"step": 277
},
{
"epoch": 1.7744,
"grad_norm": 0.19207845628261566,
"learning_rate": 4.2381026490388245e-05,
"loss": 0.6928,
"step": 278
},
{
"epoch": 1.7808000000000002,
"grad_norm": 0.19270960986614227,
"learning_rate": 4.201248843983918e-05,
"loss": 0.6818,
"step": 279
},
{
"epoch": 1.7872,
"grad_norm": 0.20096321403980255,
"learning_rate": 4.164439516907258e-05,
"loss": 0.6712,
"step": 280
},
{
"epoch": 1.7936,
"grad_norm": 0.20221710205078125,
"learning_rate": 4.127676717514113e-05,
"loss": 0.697,
"step": 281
},
{
"epoch": 1.8,
"grad_norm": 0.19503732025623322,
"learning_rate": 4.0909624929188805e-05,
"loss": 0.6939,
"step": 282
},
{
"epoch": 1.8064,
"grad_norm": 0.20334869623184204,
"learning_rate": 4.0542988875310995e-05,
"loss": 0.644,
"step": 283
},
{
"epoch": 1.8128,
"grad_norm": 0.20078979432582855,
"learning_rate": 4.0176879429416086e-05,
"loss": 0.7029,
"step": 284
},
{
"epoch": 1.8192,
"grad_norm": 0.20540271699428558,
"learning_rate": 3.981131697808862e-05,
"loss": 0.6893,
"step": 285
},
{
"epoch": 1.8256000000000001,
"grad_norm": 0.21139805018901825,
"learning_rate": 3.9446321877454094e-05,
"loss": 0.7459,
"step": 286
},
{
"epoch": 1.8319999999999999,
"grad_norm": 0.2058684229850769,
"learning_rate": 3.90819144520454e-05,
"loss": 0.6872,
"step": 287
},
{
"epoch": 1.8384,
"grad_norm": 0.20217950642108917,
"learning_rate": 3.8718114993671084e-05,
"loss": 0.679,
"step": 288
},
{
"epoch": 1.8448,
"grad_norm": 0.1952074021100998,
"learning_rate": 3.835494376028544e-05,
"loss": 0.7296,
"step": 289
},
{
"epoch": 1.8512,
"grad_norm": 0.20614856481552124,
"learning_rate": 3.7992420974860384e-05,
"loss": 0.6543,
"step": 290
},
{
"epoch": 1.8576000000000001,
"grad_norm": 0.19841645658016205,
"learning_rate": 3.7630566824259456e-05,
"loss": 0.6866,
"step": 291
},
{
"epoch": 1.8639999999999999,
"grad_norm": 0.20447316765785217,
"learning_rate": 3.726940145811363e-05,
"loss": 0.6684,
"step": 292
},
{
"epoch": 1.8704,
"grad_norm": 0.2253882735967636,
"learning_rate": 3.6908944987699345e-05,
"loss": 0.6934,
"step": 293
},
{
"epoch": 1.8768,
"grad_norm": 0.20601052045822144,
"learning_rate": 3.654921748481858e-05,
"loss": 0.6936,
"step": 294
},
{
"epoch": 1.8832,
"grad_norm": 0.19950821995735168,
"learning_rate": 3.6190238980681236e-05,
"loss": 0.6804,
"step": 295
},
{
"epoch": 1.8896,
"grad_norm": 0.23119482398033142,
"learning_rate": 3.583202946478963e-05,
"loss": 0.682,
"step": 296
},
{
"epoch": 1.896,
"grad_norm": 0.19434580206871033,
"learning_rate": 3.547460888382547e-05,
"loss": 0.6698,
"step": 297
},
{
"epoch": 1.9024,
"grad_norm": 0.19283151626586914,
"learning_rate": 3.511799714053907e-05,
"loss": 0.6838,
"step": 298
},
{
"epoch": 1.9088,
"grad_norm": 0.2131282240152359,
"learning_rate": 3.47622140926411e-05,
"loss": 0.7203,
"step": 299
},
{
"epoch": 1.9152,
"grad_norm": 0.20812910795211792,
"learning_rate": 3.4407279551696846e-05,
"loss": 0.697,
"step": 300
},
{
"epoch": 1.9216,
"grad_norm": 0.19355922937393188,
"learning_rate": 3.4053213282022984e-05,
"loss": 0.7057,
"step": 301
},
{
"epoch": 1.928,
"grad_norm": 0.19805647432804108,
"learning_rate": 3.370003499958703e-05,
"loss": 0.7373,
"step": 302
},
{
"epoch": 1.9344000000000001,
"grad_norm": 0.21588417887687683,
"learning_rate": 3.334776437090944e-05,
"loss": 0.7187,
"step": 303
},
{
"epoch": 1.9407999999999999,
"grad_norm": 0.17682795226573944,
"learning_rate": 3.299642101196854e-05,
"loss": 0.7133,
"step": 304
},
{
"epoch": 1.9472,
"grad_norm": 0.201612651348114,
"learning_rate": 3.2646024487108215e-05,
"loss": 0.6767,
"step": 305
},
{
"epoch": 1.9536,
"grad_norm": 0.20109272003173828,
"learning_rate": 3.2296594307948425e-05,
"loss": 0.6907,
"step": 306
},
{
"epoch": 1.96,
"grad_norm": 0.1948445439338684,
"learning_rate": 3.1948149932298774e-05,
"loss": 0.7196,
"step": 307
},
{
"epoch": 1.9664000000000001,
"grad_norm": 0.20404204726219177,
"learning_rate": 3.160071076307497e-05,
"loss": 0.7123,
"step": 308
},
{
"epoch": 1.9727999999999999,
"grad_norm": 0.20908042788505554,
"learning_rate": 3.125429614721842e-05,
"loss": 0.6982,
"step": 309
},
{
"epoch": 1.9792,
"grad_norm": 0.21416354179382324,
"learning_rate": 3.0908925374618895e-05,
"loss": 0.6476,
"step": 310
},
{
"epoch": 1.9856,
"grad_norm": 0.20553742349147797,
"learning_rate": 3.056461767704037e-05,
"loss": 0.729,
"step": 311
},
{
"epoch": 1.992,
"grad_norm": 0.2184964120388031,
"learning_rate": 3.0221392227050126e-05,
"loss": 0.6833,
"step": 312
},
{
"epoch": 1.9984,
"grad_norm": 0.19415715336799622,
"learning_rate": 2.987926813695116e-05,
"loss": 0.75,
"step": 313
},
{
"epoch": 2.0,
"grad_norm": 0.4077971279621124,
"learning_rate": 2.9538264457717878e-05,
"loss": 0.7006,
"step": 314
},
{
"epoch": 2.0064,
"grad_norm": 0.23124489188194275,
"learning_rate": 2.9198400177935305e-05,
"loss": 0.6592,
"step": 315
},
{
"epoch": 2.0128,
"grad_norm": 0.2515321969985962,
"learning_rate": 2.885969422274165e-05,
"loss": 0.6851,
"step": 316
},
{
"epoch": 2.0192,
"grad_norm": 0.21909447014331818,
"learning_rate": 2.8522165452774557e-05,
"loss": 0.7244,
"step": 317
},
{
"epoch": 2.0256,
"grad_norm": 0.20850086212158203,
"learning_rate": 2.8185832663120815e-05,
"loss": 0.7144,
"step": 318
},
{
"epoch": 2.032,
"grad_norm": 0.2121347337961197,
"learning_rate": 2.7850714582269722e-05,
"loss": 0.6756,
"step": 319
},
{
"epoch": 2.0384,
"grad_norm": 0.22937630116939545,
"learning_rate": 2.7516829871070292e-05,
"loss": 0.6907,
"step": 320
},
{
"epoch": 2.0448,
"grad_norm": 0.20349667966365814,
"learning_rate": 2.7184197121692127e-05,
"loss": 0.6523,
"step": 321
},
{
"epoch": 2.0512,
"grad_norm": 0.19774508476257324,
"learning_rate": 2.6852834856589947e-05,
"loss": 0.6745,
"step": 322
},
{
"epoch": 2.0576,
"grad_norm": 0.1831101030111313,
"learning_rate": 2.652276152747246e-05,
"loss": 0.6973,
"step": 323
},
{
"epoch": 2.064,
"grad_norm": 0.20878373086452484,
"learning_rate": 2.6193995514274705e-05,
"loss": 0.67,
"step": 324
},
{
"epoch": 2.0704,
"grad_norm": 0.21983082592487335,
"learning_rate": 2.5866555124134577e-05,
"loss": 0.6811,
"step": 325
},
{
"epoch": 2.0768,
"grad_norm": 0.19198405742645264,
"learning_rate": 2.5540458590373527e-05,
"loss": 0.6282,
"step": 326
},
{
"epoch": 2.0832,
"grad_norm": 0.19146208465099335,
"learning_rate": 2.5215724071481072e-05,
"loss": 0.7243,
"step": 327
},
{
"epoch": 2.0896,
"grad_norm": 0.22032876312732697,
"learning_rate": 2.4892369650103836e-05,
"loss": 0.6853,
"step": 328
},
{
"epoch": 2.096,
"grad_norm": 0.19633299112319946,
"learning_rate": 2.457041333203852e-05,
"loss": 0.6914,
"step": 329
},
{
"epoch": 2.1024,
"grad_norm": 0.17531536519527435,
"learning_rate": 2.4249873045229244e-05,
"loss": 0.7256,
"step": 330
},
{
"epoch": 2.1088,
"grad_norm": 0.2074475884437561,
"learning_rate": 2.3930766638769326e-05,
"loss": 0.7376,
"step": 331
},
{
"epoch": 2.1152,
"grad_norm": 0.20909680426120758,
"learning_rate": 2.3613111881907275e-05,
"loss": 0.6866,
"step": 332
},
{
"epoch": 2.1216,
"grad_norm": 0.19580084085464478,
"learning_rate": 2.3296926463057396e-05,
"loss": 0.7069,
"step": 333
},
{
"epoch": 2.128,
"grad_norm": 0.18059836328029633,
"learning_rate": 2.2982227988814796e-05,
"loss": 0.6449,
"step": 334
},
{
"epoch": 2.1344,
"grad_norm": 0.1941729485988617,
"learning_rate": 2.2669033982974945e-05,
"loss": 0.6295,
"step": 335
},
{
"epoch": 2.1408,
"grad_norm": 0.2014259248971939,
"learning_rate": 2.235736188555787e-05,
"loss": 0.6776,
"step": 336
},
{
"epoch": 2.1471999999999998,
"grad_norm": 0.21426711976528168,
"learning_rate": 2.2047229051837102e-05,
"loss": 0.6988,
"step": 337
},
{
"epoch": 2.1536,
"grad_norm": 0.21534332633018494,
"learning_rate": 2.173865275137314e-05,
"loss": 0.6451,
"step": 338
},
{
"epoch": 2.16,
"grad_norm": 0.21011656522750854,
"learning_rate": 2.143165016705192e-05,
"loss": 0.6699,
"step": 339
},
{
"epoch": 2.1664,
"grad_norm": 0.21305830776691437,
"learning_rate": 2.1126238394127868e-05,
"loss": 0.6535,
"step": 340
},
{
"epoch": 2.1728,
"grad_norm": 0.22721970081329346,
"learning_rate": 2.0822434439272122e-05,
"loss": 0.6317,
"step": 341
},
{
"epoch": 2.1792,
"grad_norm": 0.20172137022018433,
"learning_rate": 2.052025521962534e-05,
"loss": 0.6911,
"step": 342
},
{
"epoch": 2.1856,
"grad_norm": 0.19113659858703613,
"learning_rate": 2.0219717561855855e-05,
"loss": 0.7039,
"step": 343
},
{
"epoch": 2.192,
"grad_norm": 0.2135518491268158,
"learning_rate": 1.992083820122259e-05,
"loss": 0.684,
"step": 344
},
{
"epoch": 2.1984,
"grad_norm": 0.19133660197257996,
"learning_rate": 1.962363378064316e-05,
"loss": 0.6835,
"step": 345
},
{
"epoch": 2.2048,
"grad_norm": 0.19742318987846375,
"learning_rate": 1.9328120849767194e-05,
"loss": 0.6411,
"step": 346
},
{
"epoch": 2.2112,
"grad_norm": 0.2134242057800293,
"learning_rate": 1.903431586405468e-05,
"loss": 0.6987,
"step": 347
},
{
"epoch": 2.2176,
"grad_norm": 0.19351407885551453,
"learning_rate": 1.8742235183859747e-05,
"loss": 0.6387,
"step": 348
},
{
"epoch": 2.224,
"grad_norm": 0.19710959494113922,
"learning_rate": 1.8451895073519643e-05,
"loss": 0.6638,
"step": 349
},
{
"epoch": 2.2304,
"grad_norm": 0.18867988884449005,
"learning_rate": 1.8163311700448898e-05,
"loss": 0.6697,
"step": 350
},
{
"epoch": 2.2368,
"grad_norm": 0.18838374316692352,
"learning_rate": 1.7876501134239316e-05,
"loss": 0.7276,
"step": 351
},
{
"epoch": 2.2432,
"grad_norm": 0.1833486109972,
"learning_rate": 1.7591479345764973e-05,
"loss": 0.6753,
"step": 352
},
{
"epoch": 2.2496,
"grad_norm": 0.23941516876220703,
"learning_rate": 1.7308262206292897e-05,
"loss": 0.6962,
"step": 353
},
{
"epoch": 2.2560000000000002,
"grad_norm": 0.19911228120326996,
"learning_rate": 1.7026865486599377e-05,
"loss": 0.6764,
"step": 354
},
{
"epoch": 2.2624,
"grad_norm": 0.21614587306976318,
"learning_rate": 1.6747304856091662e-05,
"loss": 0.6218,
"step": 355
},
{
"epoch": 2.2688,
"grad_norm": 0.1931372880935669,
"learning_rate": 1.6469595881935525e-05,
"loss": 0.7262,
"step": 356
},
{
"epoch": 2.2752,
"grad_norm": 0.19275271892547607,
"learning_rate": 1.6193754028188364e-05,
"loss": 0.7365,
"step": 357
},
{
"epoch": 2.2816,
"grad_norm": 0.19287358224391937,
"learning_rate": 1.591979465493806e-05,
"loss": 0.6888,
"step": 358
},
{
"epoch": 2.288,
"grad_norm": 0.18417416512966156,
"learning_rate": 1.564773301744774e-05,
"loss": 0.6533,
"step": 359
},
{
"epoch": 2.2944,
"grad_norm": 0.1791851967573166,
"learning_rate": 1.5377584265306223e-05,
"loss": 0.6738,
"step": 360
},
{
"epoch": 2.3008,
"grad_norm": 0.1805466115474701,
"learning_rate": 1.510936344158448e-05,
"loss": 0.664,
"step": 361
},
{
"epoch": 2.3072,
"grad_norm": 0.1946578025817871,
"learning_rate": 1.4843085481997959e-05,
"loss": 0.7261,
"step": 362
},
{
"epoch": 2.3136,
"grad_norm": 0.1858345866203308,
"learning_rate": 1.457876521407484e-05,
"loss": 0.6374,
"step": 363
},
{
"epoch": 2.32,
"grad_norm": 0.19666247069835663,
"learning_rate": 1.431641735633044e-05,
"loss": 0.6469,
"step": 364
},
{
"epoch": 2.3264,
"grad_norm": 0.16774997115135193,
"learning_rate": 1.4056056517447635e-05,
"loss": 0.6785,
"step": 365
},
{
"epoch": 2.3327999999999998,
"grad_norm": 0.19413676857948303,
"learning_rate": 1.3797697195463278e-05,
"loss": 0.6797,
"step": 366
},
{
"epoch": 2.3392,
"grad_norm": 0.1797027438879013,
"learning_rate": 1.3541353776961036e-05,
"loss": 0.6968,
"step": 367
},
{
"epoch": 2.3456,
"grad_norm": 0.17702320218086243,
"learning_rate": 1.3287040536270135e-05,
"loss": 0.6696,
"step": 368
},
{
"epoch": 2.352,
"grad_norm": 0.19950032234191895,
"learning_rate": 1.3034771634670601e-05,
"loss": 0.6559,
"step": 369
},
{
"epoch": 2.3584,
"grad_norm": 0.19765135645866394,
"learning_rate": 1.2784561119604682e-05,
"loss": 0.6154,
"step": 370
},
{
"epoch": 2.3648,
"grad_norm": 0.21111543476581573,
"learning_rate": 1.2536422923894564e-05,
"loss": 0.6153,
"step": 371
},
{
"epoch": 2.3712,
"grad_norm": 0.20043286681175232,
"learning_rate": 1.2290370864966622e-05,
"loss": 0.6988,
"step": 372
},
{
"epoch": 2.3776,
"grad_norm": 0.19548365473747253,
"learning_rate": 1.2046418644081903e-05,
"loss": 0.7292,
"step": 373
},
{
"epoch": 2.384,
"grad_norm": 0.1912689507007599,
"learning_rate": 1.1804579845573289e-05,
"loss": 0.7139,
"step": 374
},
{
"epoch": 2.3904,
"grad_norm": 0.18108686804771423,
"learning_rate": 1.1564867936088992e-05,
"loss": 0.6878,
"step": 375
},
{
"epoch": 2.3968,
"grad_norm": 0.2090510129928589,
"learning_rate": 1.1327296263842652e-05,
"loss": 0.6882,
"step": 376
},
{
"epoch": 2.4032,
"grad_norm": 0.19618839025497437,
"learning_rate": 1.1091878057870136e-05,
"loss": 0.6613,
"step": 377
},
{
"epoch": 2.4096,
"grad_norm": 0.18068960309028625,
"learning_rate": 1.0858626427292795e-05,
"loss": 0.6645,
"step": 378
},
{
"epoch": 2.416,
"grad_norm": 0.1936527043581009,
"learning_rate": 1.0627554360587534e-05,
"loss": 0.6942,
"step": 379
},
{
"epoch": 2.4224,
"grad_norm": 0.19242267310619354,
"learning_rate": 1.0398674724863583e-05,
"loss": 0.707,
"step": 380
},
{
"epoch": 2.4288,
"grad_norm": 0.17682623863220215,
"learning_rate": 1.0172000265145936e-05,
"loss": 0.6429,
"step": 381
},
{
"epoch": 2.4352,
"grad_norm": 0.18140776455402374,
"learning_rate": 9.94754360366571e-06,
"loss": 0.7112,
"step": 382
},
{
"epoch": 2.4416,
"grad_norm": 0.1756039410829544,
"learning_rate": 9.72531723915726e-06,
"loss": 0.6616,
"step": 383
},
{
"epoch": 2.448,
"grad_norm": 0.19845832884311676,
"learning_rate": 9.505333546162171e-06,
"loss": 0.6774,
"step": 384
},
{
"epoch": 2.4544,
"grad_norm": 0.19389702379703522,
"learning_rate": 9.287604774340236e-06,
"loss": 0.7189,
"step": 385
},
{
"epoch": 2.4608,
"grad_norm": 0.20373724400997162,
"learning_rate": 9.07214304778729e-06,
"loss": 0.7034,
"step": 386
},
{
"epoch": 2.4672,
"grad_norm": 0.1839911788702011,
"learning_rate": 8.858960364360141e-06,
"loss": 0.6394,
"step": 387
},
{
"epoch": 2.4736000000000002,
"grad_norm": 0.18362964689731598,
"learning_rate": 8.648068595008457e-06,
"loss": 0.673,
"step": 388
},
{
"epoch": 2.48,
"grad_norm": 0.18894684314727783,
"learning_rate": 8.439479483113683e-06,
"loss": 0.6781,
"step": 389
},
{
"epoch": 2.4864,
"grad_norm": 0.1816522479057312,
"learning_rate": 8.233204643835236e-06,
"loss": 0.6894,
"step": 390
},
{
"epoch": 2.4928,
"grad_norm": 0.1781761348247528,
"learning_rate": 8.029255563463589e-06,
"loss": 0.681,
"step": 391
},
{
"epoch": 2.4992,
"grad_norm": 0.18166184425354004,
"learning_rate": 7.827643598780749e-06,
"loss": 0.6752,
"step": 392
},
{
"epoch": 2.5056000000000003,
"grad_norm": 0.19368097186088562,
"learning_rate": 7.628379976427868e-06,
"loss": 0.6476,
"step": 393
},
{
"epoch": 2.512,
"grad_norm": 0.1752738654613495,
"learning_rate": 7.431475792280018e-06,
"loss": 0.6838,
"step": 394
},
{
"epoch": 2.5183999999999997,
"grad_norm": 0.1779792755842209,
"learning_rate": 7.236942010828429e-06,
"loss": 0.7104,
"step": 395
},
{
"epoch": 2.5248,
"grad_norm": 0.18969357013702393,
"learning_rate": 7.0447894645698175e-06,
"loss": 0.6944,
"step": 396
},
{
"epoch": 2.5312,
"grad_norm": 0.18680952489376068,
"learning_rate": 6.855028853403294e-06,
"loss": 0.6931,
"step": 397
},
{
"epoch": 2.5376,
"grad_norm": 0.19200138747692108,
"learning_rate": 6.667670744034499e-06,
"loss": 0.732,
"step": 398
},
{
"epoch": 2.544,
"grad_norm": 0.1732376366853714,
"learning_rate": 6.482725569387172e-06,
"loss": 0.6958,
"step": 399
},
{
"epoch": 2.5504,
"grad_norm": 0.1787935346364975,
"learning_rate": 6.300203628022272e-06,
"loss": 0.6913,
"step": 400
},
{
"epoch": 2.5568,
"grad_norm": 0.18525420129299164,
"learning_rate": 6.120115083564431e-06,
"loss": 0.6651,
"step": 401
},
{
"epoch": 2.5632,
"grad_norm": 0.19965125620365143,
"learning_rate": 5.942469964136055e-06,
"loss": 0.6701,
"step": 402
},
{
"epoch": 2.5696,
"grad_norm": 0.20355017483234406,
"learning_rate": 5.767278161798911e-06,
"loss": 0.6546,
"step": 403
},
{
"epoch": 2.576,
"grad_norm": 0.18207506835460663,
"learning_rate": 5.5945494320032434e-06,
"loss": 0.651,
"step": 404
},
{
"epoch": 2.5824,
"grad_norm": 0.16914696991443634,
"learning_rate": 5.424293393044611e-06,
"loss": 0.6754,
"step": 405
},
{
"epoch": 2.5888,
"grad_norm": 0.17765933275222778,
"learning_rate": 5.256519525528253e-06,
"loss": 0.7015,
"step": 406
},
{
"epoch": 2.5952,
"grad_norm": 0.17559665441513062,
"learning_rate": 5.091237171841173e-06,
"loss": 0.7134,
"step": 407
},
{
"epoch": 2.6016,
"grad_norm": 0.17291496694087982,
"learning_rate": 4.928455535631959e-06,
"loss": 0.7031,
"step": 408
},
{
"epoch": 2.608,
"grad_norm": 0.18128253519535065,
"learning_rate": 4.768183681298211e-06,
"loss": 0.7073,
"step": 409
},
{
"epoch": 2.6144,
"grad_norm": 0.1818084418773651,
"learning_rate": 4.610430533481857e-06,
"loss": 0.7378,
"step": 410
},
{
"epoch": 2.6208,
"grad_norm": 0.17898201942443848,
"learning_rate": 4.455204876572172e-06,
"loss": 0.7126,
"step": 411
},
{
"epoch": 2.6272,
"grad_norm": 0.18443121016025543,
"learning_rate": 4.302515354216574e-06,
"loss": 0.6359,
"step": 412
},
{
"epoch": 2.6336,
"grad_norm": 0.16818998754024506,
"learning_rate": 4.1523704688394176e-06,
"loss": 0.6955,
"step": 413
},
{
"epoch": 2.64,
"grad_norm": 0.17661628127098083,
"learning_rate": 4.004778581168412e-06,
"loss": 0.6518,
"step": 414
},
{
"epoch": 2.6464,
"grad_norm": 0.17434482276439667,
"learning_rate": 3.859747909769162e-06,
"loss": 0.6369,
"step": 415
},
{
"epoch": 2.6528,
"grad_norm": 0.1895863562822342,
"learning_rate": 3.7172865305874826e-06,
"loss": 0.6902,
"step": 416
},
{
"epoch": 2.6592000000000002,
"grad_norm": 0.17753319442272186,
"learning_rate": 3.5774023764996723e-06,
"loss": 0.7559,
"step": 417
},
{
"epoch": 2.6656,
"grad_norm": 0.16980338096618652,
"learning_rate": 3.440103236870823e-06,
"loss": 0.6977,
"step": 418
},
{
"epoch": 2.672,
"grad_norm": 0.17221416532993317,
"learning_rate": 3.3053967571210378e-06,
"loss": 0.673,
"step": 419
},
{
"epoch": 2.6784,
"grad_norm": 0.18261651694774628,
"learning_rate": 3.1732904382996976e-06,
"loss": 0.6809,
"step": 420
},
{
"epoch": 2.6848,
"grad_norm": 0.18153472244739532,
"learning_rate": 3.04379163666782e-06,
"loss": 0.6428,
"step": 421
},
{
"epoch": 2.6912000000000003,
"grad_norm": 0.1869465857744217,
"learning_rate": 2.916907563288357e-06,
"loss": 0.6529,
"step": 422
},
{
"epoch": 2.6976,
"grad_norm": 0.18409712612628937,
"learning_rate": 2.792645283624712e-06,
"loss": 0.6518,
"step": 423
},
{
"epoch": 2.7039999999999997,
"grad_norm": 0.19606178998947144,
"learning_rate": 2.671011717147276e-06,
"loss": 0.6725,
"step": 424
},
{
"epoch": 2.7104,
"grad_norm": 0.17141272127628326,
"learning_rate": 2.5520136369481198e-06,
"loss": 0.6554,
"step": 425
},
{
"epoch": 2.7168,
"grad_norm": 0.16549499332904816,
"learning_rate": 2.4356576693638556e-06,
"loss": 0.6629,
"step": 426
},
{
"epoch": 2.7232,
"grad_norm": 0.18200421333312988,
"learning_rate": 2.321950293606623e-06,
"loss": 0.7158,
"step": 427
},
{
"epoch": 2.7296,
"grad_norm": 0.1918717324733734,
"learning_rate": 2.210897841403331e-06,
"loss": 0.7156,
"step": 428
},
{
"epoch": 2.7359999999999998,
"grad_norm": 0.16785910725593567,
"learning_rate": 2.1025064966430696e-06,
"loss": 0.6626,
"step": 429
},
{
"epoch": 2.7424,
"grad_norm": 0.17111513018608093,
"learning_rate": 1.9967822950327454e-06,
"loss": 0.6273,
"step": 430
},
{
"epoch": 2.7488,
"grad_norm": 0.18554243445396423,
"learning_rate": 1.8937311237610166e-06,
"loss": 0.6827,
"step": 431
},
{
"epoch": 2.7552,
"grad_norm": 0.1815970093011856,
"learning_rate": 1.793358721170435e-06,
"loss": 0.6945,
"step": 432
},
{
"epoch": 2.7616,
"grad_norm": 0.18188947439193726,
"learning_rate": 1.6956706764379438e-06,
"loss": 0.6751,
"step": 433
},
{
"epoch": 2.768,
"grad_norm": 0.17477133870124817,
"learning_rate": 1.6006724292636166e-06,
"loss": 0.6617,
"step": 434
},
{
"epoch": 2.7744,
"grad_norm": 0.17050299048423767,
"learning_rate": 1.5083692695677832e-06,
"loss": 0.6813,
"step": 435
},
{
"epoch": 2.7808,
"grad_norm": 0.18110086023807526,
"learning_rate": 1.418766337196431e-06,
"loss": 0.654,
"step": 436
},
{
"epoch": 2.7872,
"grad_norm": 0.17326004803180695,
"learning_rate": 1.331868621635024e-06,
"loss": 0.6791,
"step": 437
},
{
"epoch": 2.7936,
"grad_norm": 0.16975191235542297,
"learning_rate": 1.2476809617306407e-06,
"loss": 0.6558,
"step": 438
},
{
"epoch": 2.8,
"grad_norm": 0.18156126141548157,
"learning_rate": 1.166208045422551e-06,
"loss": 0.6519,
"step": 439
},
{
"epoch": 2.8064,
"grad_norm": 0.17490343749523163,
"learning_rate": 1.0874544094811423e-06,
"loss": 0.6796,
"step": 440
},
{
"epoch": 2.8128,
"grad_norm": 0.2003561407327652,
"learning_rate": 1.0114244392553317e-06,
"loss": 0.6837,
"step": 441
},
{
"epoch": 2.8192,
"grad_norm": 0.17029455304145813,
"learning_rate": 9.381223684283291e-07,
"loss": 0.619,
"step": 442
},
{
"epoch": 2.8256,
"grad_norm": 0.18762946128845215,
"learning_rate": 8.675522787819023e-07,
"loss": 0.7032,
"step": 443
},
{
"epoch": 2.832,
"grad_norm": 0.1766004115343094,
"learning_rate": 7.9971809996911e-07,
"loss": 0.6922,
"step": 444
},
{
"epoch": 2.8384,
"grad_norm": 0.17784501612186432,
"learning_rate": 7.346236092954318e-07,
"loss": 0.735,
"step": 445
},
{
"epoch": 2.8448,
"grad_norm": 0.1895914524793625,
"learning_rate": 6.722724315084805e-07,
"loss": 0.7254,
"step": 446
},
{
"epoch": 2.8512,
"grad_norm": 0.17988507449626923,
"learning_rate": 6.12668038596137e-07,
"loss": 0.6898,
"step": 447
},
{
"epoch": 2.8576,
"grad_norm": 0.18528032302856445,
"learning_rate": 5.558137495932037e-07,
"loss": 0.6719,
"step": 448
},
{
"epoch": 2.864,
"grad_norm": 0.1748681515455246,
"learning_rate": 5.017127303966085e-07,
"loss": 0.6558,
"step": 449
},
{
"epoch": 2.8704,
"grad_norm": 0.1760055422782898,
"learning_rate": 4.5036799358910697e-07,
"loss": 0.7183,
"step": 450
},
{
"epoch": 2.8768000000000002,
"grad_norm": 0.17728447914123535,
"learning_rate": 4.0178239827151075e-07,
"loss": 0.6566,
"step": 451
},
{
"epoch": 2.8832,
"grad_norm": 0.1707494556903839,
"learning_rate": 3.5595864990352056e-07,
"loss": 0.7113,
"step": 452
},
{
"epoch": 2.8895999999999997,
"grad_norm": 0.17298643290996552,
"learning_rate": 3.128993001530245e-07,
"loss": 0.636,
"step": 453
},
{
"epoch": 2.896,
"grad_norm": 0.18352839350700378,
"learning_rate": 2.72606746754045e-07,
"loss": 0.6611,
"step": 454
},
{
"epoch": 2.9024,
"grad_norm": 0.1850634068250656,
"learning_rate": 2.3508323337321226e-07,
"loss": 0.6805,
"step": 455
},
{
"epoch": 2.9088000000000003,
"grad_norm": 0.176837757229805,
"learning_rate": 2.0033084948483105e-07,
"loss": 0.7048,
"step": 456
},
{
"epoch": 2.9152,
"grad_norm": 0.1697586327791214,
"learning_rate": 1.6835153025451245e-07,
"loss": 0.6847,
"step": 457
},
{
"epoch": 2.9215999999999998,
"grad_norm": 0.16893458366394043,
"learning_rate": 1.3914705643143789e-07,
"loss": 0.6545,
"step": 458
},
{
"epoch": 2.928,
"grad_norm": 0.17263327538967133,
"learning_rate": 1.1271905424918295e-07,
"loss": 0.7085,
"step": 459
},
{
"epoch": 2.9344,
"grad_norm": 0.17428065836429596,
"learning_rate": 8.906899533517865e-08,
"loss": 0.6696,
"step": 460
},
{
"epoch": 2.9408,
"grad_norm": 0.17830806970596313,
"learning_rate": 6.819819662874371e-08,
"loss": 0.6562,
"step": 461
},
{
"epoch": 2.9472,
"grad_norm": 0.16810065507888794,
"learning_rate": 5.0107820307770947e-08,
"loss": 0.7167,
"step": 462
},
{
"epoch": 2.9536,
"grad_norm": 0.18890230357646942,
"learning_rate": 3.4798873723984605e-08,
"loss": 0.7027,
"step": 463
},
{
"epoch": 2.96,
"grad_norm": 0.181377574801445,
"learning_rate": 2.227220934688523e-08,
"loss": 0.6166,
"step": 464
},
{
"epoch": 2.9664,
"grad_norm": 0.16959308087825775,
"learning_rate": 1.2528524716259871e-08,
"loss": 0.7082,
"step": 465
},
{
"epoch": 2.9728,
"grad_norm": 0.16611914336681366,
"learning_rate": 5.568362403318705e-09,
"loss": 0.6582,
"step": 466
},
{
"epoch": 2.9792,
"grad_norm": 0.16966696083545685,
"learning_rate": 1.3921099805302984e-09,
"loss": 0.676,
"step": 467
},
{
"epoch": 2.9856,
"grad_norm": 0.16860099136829376,
"learning_rate": 0.0,
"loss": 0.6525,
"step": 468
},
{
"epoch": 2.9856,
"step": 468,
"total_flos": 6.618679011990045e+18,
"train_loss": 0.7691191567314996,
"train_runtime": 29137.5675,
"train_samples_per_second": 1.03,
"train_steps_per_second": 0.016
}
],
"logging_steps": 1,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.618679011990045e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}