{
"best_global_step": 900,
"best_metric": 0.4358259439468384,
"best_model_checkpoint": "./cv_jd_finetuned_model/checkpoint-900",
"epoch": 2.5003474635163307,
"eval_steps": 300,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002779708130646282,
"grad_norm": 0.47256213426589966,
"learning_rate": 0.0,
"loss": 1.7714,
"step": 1
},
{
"epoch": 0.005559416261292564,
"grad_norm": 0.43564292788505554,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.7393,
"step": 2
},
{
"epoch": 0.008339124391938846,
"grad_norm": 0.4533561170101166,
"learning_rate": 4.000000000000001e-06,
"loss": 1.7359,
"step": 3
},
{
"epoch": 0.011118832522585128,
"grad_norm": 0.4747747778892517,
"learning_rate": 6e-06,
"loss": 1.79,
"step": 4
},
{
"epoch": 0.01389854065323141,
"grad_norm": 0.4859880208969116,
"learning_rate": 8.000000000000001e-06,
"loss": 1.7613,
"step": 5
},
{
"epoch": 0.01667824878387769,
"grad_norm": 0.46851226687431335,
"learning_rate": 1e-05,
"loss": 1.7779,
"step": 6
},
{
"epoch": 0.019457956914523976,
"grad_norm": 0.468657910823822,
"learning_rate": 1.2e-05,
"loss": 1.7623,
"step": 7
},
{
"epoch": 0.022237665045170257,
"grad_norm": 0.468654602766037,
"learning_rate": 1.4000000000000001e-05,
"loss": 1.801,
"step": 8
},
{
"epoch": 0.02501737317581654,
"grad_norm": 0.46270835399627686,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.6855,
"step": 9
},
{
"epoch": 0.02779708130646282,
"grad_norm": 0.4720575511455536,
"learning_rate": 1.8e-05,
"loss": 1.7408,
"step": 10
},
{
"epoch": 0.030576789437109102,
"grad_norm": 0.4369213283061981,
"learning_rate": 2e-05,
"loss": 1.7412,
"step": 11
},
{
"epoch": 0.03335649756775538,
"grad_norm": 0.4069555401802063,
"learning_rate": 2.2000000000000003e-05,
"loss": 1.6908,
"step": 12
},
{
"epoch": 0.03613620569840167,
"grad_norm": 0.44282251596450806,
"learning_rate": 2.4e-05,
"loss": 1.7716,
"step": 13
},
{
"epoch": 0.03891591382904795,
"grad_norm": 0.4090988039970398,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.66,
"step": 14
},
{
"epoch": 0.04169562195969423,
"grad_norm": 0.44722557067871094,
"learning_rate": 2.8000000000000003e-05,
"loss": 1.7268,
"step": 15
},
{
"epoch": 0.04447533009034051,
"grad_norm": 0.423772931098938,
"learning_rate": 3e-05,
"loss": 1.7199,
"step": 16
},
{
"epoch": 0.0472550382209868,
"grad_norm": 0.3943701386451721,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.7025,
"step": 17
},
{
"epoch": 0.05003474635163308,
"grad_norm": 0.3880269527435303,
"learning_rate": 3.4000000000000007e-05,
"loss": 1.6418,
"step": 18
},
{
"epoch": 0.05281445448227936,
"grad_norm": 0.3725886046886444,
"learning_rate": 3.6e-05,
"loss": 1.6165,
"step": 19
},
{
"epoch": 0.05559416261292564,
"grad_norm": 0.3829341530799866,
"learning_rate": 3.8e-05,
"loss": 1.6348,
"step": 20
},
{
"epoch": 0.05837387074357193,
"grad_norm": 0.38574135303497314,
"learning_rate": 4e-05,
"loss": 1.6028,
"step": 21
},
{
"epoch": 0.061153578874218205,
"grad_norm": 0.34429848194122314,
"learning_rate": 4.2e-05,
"loss": 1.6449,
"step": 22
},
{
"epoch": 0.06393328700486449,
"grad_norm": 0.33892038464546204,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.627,
"step": 23
},
{
"epoch": 0.06671299513551077,
"grad_norm": 0.31090790033340454,
"learning_rate": 4.600000000000001e-05,
"loss": 1.5466,
"step": 24
},
{
"epoch": 0.06949270326615706,
"grad_norm": 0.3035236895084381,
"learning_rate": 4.8e-05,
"loss": 1.5175,
"step": 25
},
{
"epoch": 0.07227241139680333,
"grad_norm": 0.2932650148868561,
"learning_rate": 5e-05,
"loss": 1.4668,
"step": 26
},
{
"epoch": 0.07505211952744961,
"grad_norm": 0.27232736349105835,
"learning_rate": 5.2000000000000004e-05,
"loss": 1.5586,
"step": 27
},
{
"epoch": 0.0778318276580959,
"grad_norm": 0.25519007444381714,
"learning_rate": 5.4000000000000005e-05,
"loss": 1.5489,
"step": 28
},
{
"epoch": 0.08061153578874218,
"grad_norm": 0.23789168894290924,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.5095,
"step": 29
},
{
"epoch": 0.08339124391938846,
"grad_norm": 0.2281704694032669,
"learning_rate": 5.8e-05,
"loss": 1.5199,
"step": 30
},
{
"epoch": 0.08617095205003475,
"grad_norm": 0.23827943205833435,
"learning_rate": 6e-05,
"loss": 1.3646,
"step": 31
},
{
"epoch": 0.08895066018068103,
"grad_norm": 0.24246342480182648,
"learning_rate": 6.2e-05,
"loss": 1.4594,
"step": 32
},
{
"epoch": 0.0917303683113273,
"grad_norm": 0.2456520050764084,
"learning_rate": 6.400000000000001e-05,
"loss": 1.4243,
"step": 33
},
{
"epoch": 0.0945100764419736,
"grad_norm": 0.2509920001029968,
"learning_rate": 6.6e-05,
"loss": 1.3821,
"step": 34
},
{
"epoch": 0.09728978457261987,
"grad_norm": 0.2679535448551178,
"learning_rate": 6.800000000000001e-05,
"loss": 1.3136,
"step": 35
},
{
"epoch": 0.10006949270326616,
"grad_norm": 0.2893657684326172,
"learning_rate": 7e-05,
"loss": 1.3514,
"step": 36
},
{
"epoch": 0.10284920083391244,
"grad_norm": 0.31409314274787903,
"learning_rate": 7.2e-05,
"loss": 1.3186,
"step": 37
},
{
"epoch": 0.10562890896455872,
"grad_norm": 0.29165610671043396,
"learning_rate": 7.4e-05,
"loss": 1.2345,
"step": 38
},
{
"epoch": 0.10840861709520501,
"grad_norm": 0.31779947876930237,
"learning_rate": 7.6e-05,
"loss": 1.2938,
"step": 39
},
{
"epoch": 0.11118832522585129,
"grad_norm": 0.298592209815979,
"learning_rate": 7.800000000000001e-05,
"loss": 1.2937,
"step": 40
},
{
"epoch": 0.11396803335649756,
"grad_norm": 0.30116093158721924,
"learning_rate": 8e-05,
"loss": 1.2387,
"step": 41
},
{
"epoch": 0.11674774148714386,
"grad_norm": 0.28846633434295654,
"learning_rate": 8.2e-05,
"loss": 1.2688,
"step": 42
},
{
"epoch": 0.11952744961779013,
"grad_norm": 0.27421048283576965,
"learning_rate": 8.4e-05,
"loss": 1.3615,
"step": 43
},
{
"epoch": 0.12230715774843641,
"grad_norm": 0.25272664427757263,
"learning_rate": 8.6e-05,
"loss": 1.2837,
"step": 44
},
{
"epoch": 0.1250868658790827,
"grad_norm": 0.28714367747306824,
"learning_rate": 8.800000000000001e-05,
"loss": 1.1749,
"step": 45
},
{
"epoch": 0.12786657400972898,
"grad_norm": 0.27853626012802124,
"learning_rate": 9e-05,
"loss": 1.2653,
"step": 46
},
{
"epoch": 0.13064628214037527,
"grad_norm": 0.29935234785079956,
"learning_rate": 9.200000000000001e-05,
"loss": 1.0846,
"step": 47
},
{
"epoch": 0.13342599027102153,
"grad_norm": 0.2770155072212219,
"learning_rate": 9.4e-05,
"loss": 1.1273,
"step": 48
},
{
"epoch": 0.13620569840166782,
"grad_norm": 0.29866236448287964,
"learning_rate": 9.6e-05,
"loss": 1.1003,
"step": 49
},
{
"epoch": 0.13898540653231412,
"grad_norm": 0.3060087561607361,
"learning_rate": 9.8e-05,
"loss": 1.0825,
"step": 50
},
{
"epoch": 0.14176511466296038,
"grad_norm": 0.2942337095737457,
"learning_rate": 0.0001,
"loss": 1.1054,
"step": 51
},
{
"epoch": 0.14454482279360667,
"grad_norm": 0.2934916019439697,
"learning_rate": 0.00010200000000000001,
"loss": 1.0711,
"step": 52
},
{
"epoch": 0.14732453092425296,
"grad_norm": 0.33025407791137695,
"learning_rate": 0.00010400000000000001,
"loss": 0.9641,
"step": 53
},
{
"epoch": 0.15010423905489922,
"grad_norm": 0.3368203341960907,
"learning_rate": 0.00010600000000000002,
"loss": 0.9126,
"step": 54
},
{
"epoch": 0.15288394718554552,
"grad_norm": 0.3417721092700958,
"learning_rate": 0.00010800000000000001,
"loss": 1.0195,
"step": 55
},
{
"epoch": 0.1556636553161918,
"grad_norm": 0.33795756101608276,
"learning_rate": 0.00011000000000000002,
"loss": 0.8796,
"step": 56
},
{
"epoch": 0.15844336344683807,
"grad_norm": 0.341162770986557,
"learning_rate": 0.00011200000000000001,
"loss": 0.9361,
"step": 57
},
{
"epoch": 0.16122307157748436,
"grad_norm": 0.3660062849521637,
"learning_rate": 0.00011399999999999999,
"loss": 0.8179,
"step": 58
},
{
"epoch": 0.16400277970813065,
"grad_norm": 0.3034508526325226,
"learning_rate": 0.000116,
"loss": 0.8709,
"step": 59
},
{
"epoch": 0.16678248783877692,
"grad_norm": 0.31419962644577026,
"learning_rate": 0.000118,
"loss": 0.8925,
"step": 60
},
{
"epoch": 0.1695621959694232,
"grad_norm": 0.32202863693237305,
"learning_rate": 0.00012,
"loss": 0.9444,
"step": 61
},
{
"epoch": 0.1723419041000695,
"grad_norm": 0.29973354935646057,
"learning_rate": 0.000122,
"loss": 0.9334,
"step": 62
},
{
"epoch": 0.17512161223071576,
"grad_norm": 0.2913822531700134,
"learning_rate": 0.000124,
"loss": 0.8633,
"step": 63
},
{
"epoch": 0.17790132036136205,
"grad_norm": 0.3123023509979248,
"learning_rate": 0.000126,
"loss": 0.8474,
"step": 64
},
{
"epoch": 0.18068102849200834,
"grad_norm": 0.36691340804100037,
"learning_rate": 0.00012800000000000002,
"loss": 0.8491,
"step": 65
},
{
"epoch": 0.1834607366226546,
"grad_norm": 0.2867358922958374,
"learning_rate": 0.00013000000000000002,
"loss": 0.8812,
"step": 66
},
{
"epoch": 0.1862404447533009,
"grad_norm": 0.33814680576324463,
"learning_rate": 0.000132,
"loss": 0.7977,
"step": 67
},
{
"epoch": 0.1890201528839472,
"grad_norm": 0.2705424129962921,
"learning_rate": 0.000134,
"loss": 0.8189,
"step": 68
},
{
"epoch": 0.19179986101459348,
"grad_norm": 0.27234122157096863,
"learning_rate": 0.00013600000000000003,
"loss": 0.8661,
"step": 69
},
{
"epoch": 0.19457956914523974,
"grad_norm": 0.3268398642539978,
"learning_rate": 0.000138,
"loss": 0.7359,
"step": 70
},
{
"epoch": 0.19735927727588604,
"grad_norm": 0.29902949929237366,
"learning_rate": 0.00014,
"loss": 0.7943,
"step": 71
},
{
"epoch": 0.20013898540653233,
"grad_norm": 0.31011486053466797,
"learning_rate": 0.000142,
"loss": 1.0169,
"step": 72
},
{
"epoch": 0.2029186935371786,
"grad_norm": 0.28908616304397583,
"learning_rate": 0.000144,
"loss": 0.7617,
"step": 73
},
{
"epoch": 0.20569840166782488,
"grad_norm": 0.31617382168769836,
"learning_rate": 0.000146,
"loss": 0.7512,
"step": 74
},
{
"epoch": 0.20847810979847117,
"grad_norm": 0.32012176513671875,
"learning_rate": 0.000148,
"loss": 0.838,
"step": 75
},
{
"epoch": 0.21125781792911744,
"grad_norm": 0.32813915610313416,
"learning_rate": 0.00015000000000000001,
"loss": 0.8314,
"step": 76
},
{
"epoch": 0.21403752605976373,
"grad_norm": 0.311210572719574,
"learning_rate": 0.000152,
"loss": 0.7498,
"step": 77
},
{
"epoch": 0.21681723419041002,
"grad_norm": 0.2896203100681305,
"learning_rate": 0.000154,
"loss": 0.8062,
"step": 78
},
{
"epoch": 0.21959694232105628,
"grad_norm": 0.30190443992614746,
"learning_rate": 0.00015600000000000002,
"loss": 0.8198,
"step": 79
},
{
"epoch": 0.22237665045170257,
"grad_norm": 0.31988754868507385,
"learning_rate": 0.00015800000000000002,
"loss": 0.8781,
"step": 80
},
{
"epoch": 0.22515635858234886,
"grad_norm": 0.29290610551834106,
"learning_rate": 0.00016,
"loss": 0.7494,
"step": 81
},
{
"epoch": 0.22793606671299513,
"grad_norm": 0.29982951283454895,
"learning_rate": 0.000162,
"loss": 0.6786,
"step": 82
},
{
"epoch": 0.23071577484364142,
"grad_norm": 0.3004741966724396,
"learning_rate": 0.000164,
"loss": 0.6723,
"step": 83
},
{
"epoch": 0.2334954829742877,
"grad_norm": 0.30331557989120483,
"learning_rate": 0.000166,
"loss": 0.6686,
"step": 84
},
{
"epoch": 0.23627519110493397,
"grad_norm": 0.320073664188385,
"learning_rate": 0.000168,
"loss": 0.7309,
"step": 85
},
{
"epoch": 0.23905489923558026,
"grad_norm": 0.37164461612701416,
"learning_rate": 0.00017,
"loss": 0.883,
"step": 86
},
{
"epoch": 0.24183460736622656,
"grad_norm": 0.3153933584690094,
"learning_rate": 0.000172,
"loss": 0.7538,
"step": 87
},
{
"epoch": 0.24461431549687282,
"grad_norm": 0.37620604038238525,
"learning_rate": 0.000174,
"loss": 0.7233,
"step": 88
},
{
"epoch": 0.2473940236275191,
"grad_norm": 0.3310216963291168,
"learning_rate": 0.00017600000000000002,
"loss": 0.668,
"step": 89
},
{
"epoch": 0.2501737317581654,
"grad_norm": 0.3648437261581421,
"learning_rate": 0.00017800000000000002,
"loss": 0.7219,
"step": 90
},
{
"epoch": 0.2529534398888117,
"grad_norm": 0.33160319924354553,
"learning_rate": 0.00018,
"loss": 0.6968,
"step": 91
},
{
"epoch": 0.25573314801945796,
"grad_norm": 0.3842083513736725,
"learning_rate": 0.000182,
"loss": 0.7334,
"step": 92
},
{
"epoch": 0.2585128561501042,
"grad_norm": 0.34885042905807495,
"learning_rate": 0.00018400000000000003,
"loss": 0.6107,
"step": 93
},
{
"epoch": 0.26129256428075054,
"grad_norm": 0.4350070059299469,
"learning_rate": 0.00018600000000000002,
"loss": 0.6964,
"step": 94
},
{
"epoch": 0.2640722724113968,
"grad_norm": 0.3995032012462616,
"learning_rate": 0.000188,
"loss": 0.7111,
"step": 95
},
{
"epoch": 0.26685198054204307,
"grad_norm": 0.4035239517688751,
"learning_rate": 0.00019,
"loss": 0.6004,
"step": 96
},
{
"epoch": 0.2696316886726894,
"grad_norm": 0.39563143253326416,
"learning_rate": 0.000192,
"loss": 0.7174,
"step": 97
},
{
"epoch": 0.27241139680333565,
"grad_norm": 0.4484492540359497,
"learning_rate": 0.000194,
"loss": 0.732,
"step": 98
},
{
"epoch": 0.2751911049339819,
"grad_norm": 0.4170342683792114,
"learning_rate": 0.000196,
"loss": 0.6415,
"step": 99
},
{
"epoch": 0.27797081306462823,
"grad_norm": 0.36684471368789673,
"learning_rate": 0.00019800000000000002,
"loss": 0.7303,
"step": 100
},
{
"epoch": 0.2807505211952745,
"grad_norm": 0.417539119720459,
"learning_rate": 0.0002,
"loss": 0.7282,
"step": 101
},
{
"epoch": 0.28353022932592076,
"grad_norm": 0.3748982846736908,
"learning_rate": 0.00019979591836734694,
"loss": 0.7453,
"step": 102
},
{
"epoch": 0.2863099374565671,
"grad_norm": 0.3669414222240448,
"learning_rate": 0.0001995918367346939,
"loss": 0.7206,
"step": 103
},
{
"epoch": 0.28908964558721334,
"grad_norm": 0.43745529651641846,
"learning_rate": 0.00019938775510204082,
"loss": 0.7008,
"step": 104
},
{
"epoch": 0.2918693537178596,
"grad_norm": 0.4588426351547241,
"learning_rate": 0.00019918367346938775,
"loss": 0.6981,
"step": 105
},
{
"epoch": 0.2946490618485059,
"grad_norm": 0.46913576126098633,
"learning_rate": 0.0001989795918367347,
"loss": 0.7276,
"step": 106
},
{
"epoch": 0.2974287699791522,
"grad_norm": 0.38426539301872253,
"learning_rate": 0.00019877551020408164,
"loss": 0.6409,
"step": 107
},
{
"epoch": 0.30020847810979845,
"grad_norm": 0.4170157313346863,
"learning_rate": 0.0001985714285714286,
"loss": 0.6113,
"step": 108
},
{
"epoch": 0.30298818624044477,
"grad_norm": 0.4131574332714081,
"learning_rate": 0.00019836734693877553,
"loss": 0.5925,
"step": 109
},
{
"epoch": 0.30576789437109103,
"grad_norm": 0.4434458017349243,
"learning_rate": 0.00019816326530612246,
"loss": 0.7052,
"step": 110
},
{
"epoch": 0.3085476025017373,
"grad_norm": 0.45088446140289307,
"learning_rate": 0.00019795918367346938,
"loss": 0.5575,
"step": 111
},
{
"epoch": 0.3113273106323836,
"grad_norm": 0.48712992668151855,
"learning_rate": 0.00019775510204081634,
"loss": 0.6706,
"step": 112
},
{
"epoch": 0.3141070187630299,
"grad_norm": 0.44475221633911133,
"learning_rate": 0.00019755102040816327,
"loss": 0.6569,
"step": 113
},
{
"epoch": 0.31688672689367614,
"grad_norm": 0.4516242742538452,
"learning_rate": 0.0001973469387755102,
"loss": 0.5981,
"step": 114
},
{
"epoch": 0.31966643502432246,
"grad_norm": 0.4031848907470703,
"learning_rate": 0.00019714285714285716,
"loss": 0.7407,
"step": 115
},
{
"epoch": 0.3224461431549687,
"grad_norm": 0.5948619842529297,
"learning_rate": 0.00019693877551020409,
"loss": 0.5176,
"step": 116
},
{
"epoch": 0.325225851285615,
"grad_norm": 0.42586585879325867,
"learning_rate": 0.00019673469387755104,
"loss": 0.5305,
"step": 117
},
{
"epoch": 0.3280055594162613,
"grad_norm": 1.2202147245407104,
"learning_rate": 0.00019653061224489797,
"loss": 0.7352,
"step": 118
},
{
"epoch": 0.33078526754690757,
"grad_norm": 0.4561997354030609,
"learning_rate": 0.0001963265306122449,
"loss": 0.4617,
"step": 119
},
{
"epoch": 0.33356497567755383,
"grad_norm": 0.38021618127822876,
"learning_rate": 0.00019612244897959183,
"loss": 0.5754,
"step": 120
},
{
"epoch": 0.33634468380820015,
"grad_norm": 0.4197412431240082,
"learning_rate": 0.0001959183673469388,
"loss": 0.6426,
"step": 121
},
{
"epoch": 0.3391243919388464,
"grad_norm": 0.39121460914611816,
"learning_rate": 0.00019571428571428572,
"loss": 0.6242,
"step": 122
},
{
"epoch": 0.3419041000694927,
"grad_norm": 0.413143515586853,
"learning_rate": 0.00019551020408163265,
"loss": 0.6013,
"step": 123
},
{
"epoch": 0.344683808200139,
"grad_norm": 0.4672967195510864,
"learning_rate": 0.0001953061224489796,
"loss": 0.5439,
"step": 124
},
{
"epoch": 0.34746351633078526,
"grad_norm": 0.4725366532802582,
"learning_rate": 0.00019510204081632656,
"loss": 0.6042,
"step": 125
},
{
"epoch": 0.3502432244614315,
"grad_norm": 0.483952134847641,
"learning_rate": 0.0001948979591836735,
"loss": 0.4544,
"step": 126
},
{
"epoch": 0.35302293259207784,
"grad_norm": 0.39228469133377075,
"learning_rate": 0.00019469387755102042,
"loss": 0.6016,
"step": 127
},
{
"epoch": 0.3558026407227241,
"grad_norm": 0.4152607023715973,
"learning_rate": 0.00019448979591836735,
"loss": 0.4931,
"step": 128
},
{
"epoch": 0.35858234885337037,
"grad_norm": 0.402338445186615,
"learning_rate": 0.0001942857142857143,
"loss": 0.525,
"step": 129
},
{
"epoch": 0.3613620569840167,
"grad_norm": 0.42365092039108276,
"learning_rate": 0.00019408163265306123,
"loss": 0.6608,
"step": 130
},
{
"epoch": 0.36414176511466295,
"grad_norm": 0.4249265491962433,
"learning_rate": 0.00019387755102040816,
"loss": 0.5565,
"step": 131
},
{
"epoch": 0.3669214732453092,
"grad_norm": 0.6368371248245239,
"learning_rate": 0.0001936734693877551,
"loss": 0.4547,
"step": 132
},
{
"epoch": 0.36970118137595553,
"grad_norm": 0.37348538637161255,
"learning_rate": 0.00019346938775510205,
"loss": 0.4073,
"step": 133
},
{
"epoch": 0.3724808895066018,
"grad_norm": 0.3562554717063904,
"learning_rate": 0.000193265306122449,
"loss": 0.4968,
"step": 134
},
{
"epoch": 0.3752605976372481,
"grad_norm": 0.42278632521629333,
"learning_rate": 0.00019306122448979593,
"loss": 0.5421,
"step": 135
},
{
"epoch": 0.3780403057678944,
"grad_norm": 0.47804152965545654,
"learning_rate": 0.00019285714285714286,
"loss": 0.6516,
"step": 136
},
{
"epoch": 0.38082001389854064,
"grad_norm": 0.4298154413700104,
"learning_rate": 0.0001926530612244898,
"loss": 0.4563,
"step": 137
},
{
"epoch": 0.38359972202918696,
"grad_norm": 0.37862634658813477,
"learning_rate": 0.00019244897959183675,
"loss": 0.5742,
"step": 138
},
{
"epoch": 0.3863794301598332,
"grad_norm": 0.5051096081733704,
"learning_rate": 0.00019224489795918368,
"loss": 0.4427,
"step": 139
},
{
"epoch": 0.3891591382904795,
"grad_norm": 0.4495854079723358,
"learning_rate": 0.0001920408163265306,
"loss": 0.5915,
"step": 140
},
{
"epoch": 0.3919388464211258,
"grad_norm": 0.535527765750885,
"learning_rate": 0.00019183673469387756,
"loss": 0.4995,
"step": 141
},
{
"epoch": 0.39471855455177207,
"grad_norm": 0.4394996464252472,
"learning_rate": 0.00019163265306122452,
"loss": 0.4898,
"step": 142
},
{
"epoch": 0.39749826268241834,
"grad_norm": 0.3254806697368622,
"learning_rate": 0.00019142857142857145,
"loss": 0.4628,
"step": 143
},
{
"epoch": 0.40027797081306465,
"grad_norm": 0.4018654525279999,
"learning_rate": 0.00019122448979591838,
"loss": 0.571,
"step": 144
},
{
"epoch": 0.4030576789437109,
"grad_norm": 0.4496287703514099,
"learning_rate": 0.0001910204081632653,
"loss": 0.4502,
"step": 145
},
{
"epoch": 0.4058373870743572,
"grad_norm": 0.3519289195537567,
"learning_rate": 0.00019081632653061227,
"loss": 0.5355,
"step": 146
},
{
"epoch": 0.4086170952050035,
"grad_norm": 0.44861000776290894,
"learning_rate": 0.0001906122448979592,
"loss": 0.5026,
"step": 147
},
{
"epoch": 0.41139680333564976,
"grad_norm": 0.4265352487564087,
"learning_rate": 0.00019040816326530612,
"loss": 0.5444,
"step": 148
},
{
"epoch": 0.414176511466296,
"grad_norm": 0.3963538706302643,
"learning_rate": 0.00019020408163265305,
"loss": 0.5061,
"step": 149
},
{
"epoch": 0.41695621959694235,
"grad_norm": 0.4118248522281647,
"learning_rate": 0.00019,
"loss": 0.4564,
"step": 150
},
{
"epoch": 0.4197359277275886,
"grad_norm": 0.3554701507091522,
"learning_rate": 0.00018979591836734697,
"loss": 0.5173,
"step": 151
},
{
"epoch": 0.4225156358582349,
"grad_norm": 0.3614487648010254,
"learning_rate": 0.0001895918367346939,
"loss": 0.5661,
"step": 152
},
{
"epoch": 0.4252953439888812,
"grad_norm": 0.3155677318572998,
"learning_rate": 0.00018938775510204083,
"loss": 0.5626,
"step": 153
},
{
"epoch": 0.42807505211952745,
"grad_norm": 0.4833175241947174,
"learning_rate": 0.00018918367346938776,
"loss": 0.6273,
"step": 154
},
{
"epoch": 0.4308547602501737,
"grad_norm": 0.37056633830070496,
"learning_rate": 0.0001889795918367347,
"loss": 0.5984,
"step": 155
},
{
"epoch": 0.43363446838082004,
"grad_norm": 0.36759206652641296,
"learning_rate": 0.00018877551020408164,
"loss": 0.5022,
"step": 156
},
{
"epoch": 0.4364141765114663,
"grad_norm": 0.4722707271575928,
"learning_rate": 0.00018857142857142857,
"loss": 0.5679,
"step": 157
},
{
"epoch": 0.43919388464211256,
"grad_norm": 0.5502737760543823,
"learning_rate": 0.0001883673469387755,
"loss": 0.5484,
"step": 158
},
{
"epoch": 0.4419735927727589,
"grad_norm": 0.41092830896377563,
"learning_rate": 0.00018816326530612246,
"loss": 0.5346,
"step": 159
},
{
"epoch": 0.44475330090340515,
"grad_norm": 0.35556185245513916,
"learning_rate": 0.0001879591836734694,
"loss": 0.5094,
"step": 160
},
{
"epoch": 0.4475330090340514,
"grad_norm": 0.4490595757961273,
"learning_rate": 0.00018775510204081634,
"loss": 0.5659,
"step": 161
},
{
"epoch": 0.45031271716469773,
"grad_norm": 0.4202437102794647,
"learning_rate": 0.00018755102040816327,
"loss": 0.5904,
"step": 162
},
{
"epoch": 0.453092425295344,
"grad_norm": 0.39136362075805664,
"learning_rate": 0.00018734693877551023,
"loss": 0.5214,
"step": 163
},
{
"epoch": 0.45587213342599026,
"grad_norm": 0.3859161138534546,
"learning_rate": 0.00018714285714285716,
"loss": 0.5011,
"step": 164
},
{
"epoch": 0.4586518415566366,
"grad_norm": 0.5123438835144043,
"learning_rate": 0.0001869387755102041,
"loss": 0.6053,
"step": 165
},
{
"epoch": 0.46143154968728284,
"grad_norm": 0.466137170791626,
"learning_rate": 0.00018673469387755102,
"loss": 0.4499,
"step": 166
},
{
"epoch": 0.4642112578179291,
"grad_norm": 0.4053160548210144,
"learning_rate": 0.00018653061224489797,
"loss": 0.5452,
"step": 167
},
{
"epoch": 0.4669909659485754,
"grad_norm": 0.3608758747577667,
"learning_rate": 0.0001863265306122449,
"loss": 0.4941,
"step": 168
},
{
"epoch": 0.4697706740792217,
"grad_norm": 0.36142799258232117,
"learning_rate": 0.00018612244897959183,
"loss": 0.5056,
"step": 169
},
{
"epoch": 0.47255038220986795,
"grad_norm": 0.2707204818725586,
"learning_rate": 0.0001859183673469388,
"loss": 0.3958,
"step": 170
},
{
"epoch": 0.47533009034051427,
"grad_norm": 0.3678928017616272,
"learning_rate": 0.00018571428571428572,
"loss": 0.47,
"step": 171
},
{
"epoch": 0.47810979847116053,
"grad_norm": 0.37397581338882446,
"learning_rate": 0.00018551020408163267,
"loss": 0.5774,
"step": 172
},
{
"epoch": 0.4808895066018068,
"grad_norm": 0.3246667981147766,
"learning_rate": 0.0001853061224489796,
"loss": 0.4607,
"step": 173
},
{
"epoch": 0.4836692147324531,
"grad_norm": 0.34554117918014526,
"learning_rate": 0.00018510204081632653,
"loss": 0.3862,
"step": 174
},
{
"epoch": 0.4864489228630994,
"grad_norm": 0.356503963470459,
"learning_rate": 0.00018489795918367346,
"loss": 0.4969,
"step": 175
},
{
"epoch": 0.48922863099374564,
"grad_norm": 0.3777051866054535,
"learning_rate": 0.00018469387755102042,
"loss": 0.4337,
"step": 176
},
{
"epoch": 0.49200833912439196,
"grad_norm": 0.3148108422756195,
"learning_rate": 0.00018448979591836735,
"loss": 0.4244,
"step": 177
},
{
"epoch": 0.4947880472550382,
"grad_norm": 0.49584245681762695,
"learning_rate": 0.00018428571428571428,
"loss": 0.5775,
"step": 178
},
{
"epoch": 0.4975677553856845,
"grad_norm": 0.34690654277801514,
"learning_rate": 0.00018408163265306123,
"loss": 0.577,
"step": 179
},
{
"epoch": 0.5003474635163307,
"grad_norm": 0.36074724793434143,
"learning_rate": 0.0001838775510204082,
"loss": 0.441,
"step": 180
},
{
"epoch": 0.5031271716469771,
"grad_norm": 0.3747076392173767,
"learning_rate": 0.00018367346938775512,
"loss": 0.5022,
"step": 181
},
{
"epoch": 0.5059068797776234,
"grad_norm": 0.41926515102386475,
"learning_rate": 0.00018346938775510205,
"loss": 0.4178,
"step": 182
},
{
"epoch": 0.5086865879082696,
"grad_norm": 0.32807132601737976,
"learning_rate": 0.00018326530612244898,
"loss": 0.4493,
"step": 183
},
{
"epoch": 0.5114662960389159,
"grad_norm": 0.3409689962863922,
"learning_rate": 0.00018306122448979593,
"loss": 0.557,
"step": 184
},
{
"epoch": 0.5142460041695622,
"grad_norm": 0.4119493365287781,
"learning_rate": 0.00018285714285714286,
"loss": 0.4969,
"step": 185
},
{
"epoch": 0.5170257123002084,
"grad_norm": 0.2936202585697174,
"learning_rate": 0.0001826530612244898,
"loss": 0.4222,
"step": 186
},
{
"epoch": 0.5198054204308548,
"grad_norm": 0.4165465235710144,
"learning_rate": 0.00018244897959183672,
"loss": 0.5563,
"step": 187
},
{
"epoch": 0.5225851285615011,
"grad_norm": 0.34087347984313965,
"learning_rate": 0.00018224489795918368,
"loss": 0.5508,
"step": 188
},
{
"epoch": 0.5253648366921473,
"grad_norm": 0.39741185307502747,
"learning_rate": 0.00018204081632653064,
"loss": 0.4635,
"step": 189
},
{
"epoch": 0.5281445448227936,
"grad_norm": 0.373943954706192,
"learning_rate": 0.00018183673469387757,
"loss": 0.4667,
"step": 190
},
{
"epoch": 0.5309242529534399,
"grad_norm": 0.3398171067237854,
"learning_rate": 0.0001816326530612245,
"loss": 0.488,
"step": 191
},
{
"epoch": 0.5337039610840861,
"grad_norm": 0.5641401410102844,
"learning_rate": 0.00018142857142857142,
"loss": 0.377,
"step": 192
},
{
"epoch": 0.5364836692147325,
"grad_norm": 0.47961774468421936,
"learning_rate": 0.00018122448979591838,
"loss": 0.5046,
"step": 193
},
{
"epoch": 0.5392633773453788,
"grad_norm": 0.4699658155441284,
"learning_rate": 0.0001810204081632653,
"loss": 0.4936,
"step": 194
},
{
"epoch": 0.542043085476025,
"grad_norm": 0.2884581983089447,
"learning_rate": 0.00018081632653061224,
"loss": 0.4059,
"step": 195
},
{
"epoch": 0.5448227936066713,
"grad_norm": 0.4616682827472687,
"learning_rate": 0.00018061224489795917,
"loss": 0.553,
"step": 196
},
{
"epoch": 0.5476025017373176,
"grad_norm": 0.35249197483062744,
"learning_rate": 0.00018040816326530615,
"loss": 0.4637,
"step": 197
},
{
"epoch": 0.5503822098679638,
"grad_norm": 0.4296030104160309,
"learning_rate": 0.00018020408163265308,
"loss": 0.5117,
"step": 198
},
{
"epoch": 0.5531619179986101,
"grad_norm": 0.3835342228412628,
"learning_rate": 0.00018,
"loss": 0.5288,
"step": 199
},
{
"epoch": 0.5559416261292565,
"grad_norm": 0.3516342043876648,
"learning_rate": 0.00017979591836734694,
"loss": 0.5322,
"step": 200
},
{
"epoch": 0.5587213342599027,
"grad_norm": 0.4156709909439087,
"learning_rate": 0.0001795918367346939,
"loss": 0.4886,
"step": 201
},
{
"epoch": 0.561501042390549,
"grad_norm": 0.32229727506637573,
"learning_rate": 0.00017938775510204083,
"loss": 0.4377,
"step": 202
},
{
"epoch": 0.5642807505211953,
"grad_norm": 0.384962260723114,
"learning_rate": 0.00017918367346938776,
"loss": 0.5345,
"step": 203
},
{
"epoch": 0.5670604586518415,
"grad_norm": 0.41784927248954773,
"learning_rate": 0.00017897959183673469,
"loss": 0.5386,
"step": 204
},
{
"epoch": 0.5698401667824878,
"grad_norm": 0.46640586853027344,
"learning_rate": 0.00017877551020408164,
"loss": 0.4544,
"step": 205
},
{
"epoch": 0.5726198749131342,
"grad_norm": 0.34132063388824463,
"learning_rate": 0.0001785714285714286,
"loss": 0.454,
"step": 206
},
{
"epoch": 0.5753995830437804,
"grad_norm": 0.461137592792511,
"learning_rate": 0.00017836734693877553,
"loss": 0.478,
"step": 207
},
{
"epoch": 0.5781792911744267,
"grad_norm": 0.5866886377334595,
"learning_rate": 0.00017816326530612246,
"loss": 0.546,
"step": 208
},
{
"epoch": 0.580958999305073,
"grad_norm": 0.3185846209526062,
"learning_rate": 0.0001779591836734694,
"loss": 0.5385,
"step": 209
},
{
"epoch": 0.5837387074357192,
"grad_norm": 0.43802475929260254,
"learning_rate": 0.00017775510204081634,
"loss": 0.554,
"step": 210
},
{
"epoch": 0.5865184155663655,
"grad_norm": 0.2952940762042999,
"learning_rate": 0.00017755102040816327,
"loss": 0.4358,
"step": 211
},
{
"epoch": 0.5892981236970118,
"grad_norm": 0.32370179891586304,
"learning_rate": 0.0001773469387755102,
"loss": 0.4418,
"step": 212
},
{
"epoch": 0.592077831827658,
"grad_norm": 0.38936758041381836,
"learning_rate": 0.00017714285714285713,
"loss": 0.409,
"step": 213
},
{
"epoch": 0.5948575399583044,
"grad_norm": 0.3157341182231903,
"learning_rate": 0.0001769387755102041,
"loss": 0.4019,
"step": 214
},
{
"epoch": 0.5976372480889507,
"grad_norm": 0.39846348762512207,
"learning_rate": 0.00017673469387755104,
"loss": 0.4804,
"step": 215
},
{
"epoch": 0.6004169562195969,
"grad_norm": 0.3177434802055359,
"learning_rate": 0.00017653061224489797,
"loss": 0.4092,
"step": 216
},
{
"epoch": 0.6031966643502432,
"grad_norm": 0.39317747950553894,
"learning_rate": 0.0001763265306122449,
"loss": 0.5265,
"step": 217
},
{
"epoch": 0.6059763724808895,
"grad_norm": 0.26835039258003235,
"learning_rate": 0.00017612244897959186,
"loss": 0.3989,
"step": 218
},
{
"epoch": 0.6087560806115357,
"grad_norm": 0.4008518159389496,
"learning_rate": 0.0001759183673469388,
"loss": 0.498,
"step": 219
},
{
"epoch": 0.6115357887421821,
"grad_norm": 0.31940510869026184,
"learning_rate": 0.00017571428571428572,
"loss": 0.4493,
"step": 220
},
{
"epoch": 0.6143154968728284,
"grad_norm": 0.43578922748565674,
"learning_rate": 0.00017551020408163265,
"loss": 0.4742,
"step": 221
},
{
"epoch": 0.6170952050034746,
"grad_norm": 0.3231724500656128,
"learning_rate": 0.0001753061224489796,
"loss": 0.5019,
"step": 222
},
{
"epoch": 0.6198749131341209,
"grad_norm": 0.30763351917266846,
"learning_rate": 0.00017510204081632653,
"loss": 0.6015,
"step": 223
},
{
"epoch": 0.6226546212647672,
"grad_norm": 0.32532060146331787,
"learning_rate": 0.0001748979591836735,
"loss": 0.3062,
"step": 224
},
{
"epoch": 0.6254343293954134,
"grad_norm": 0.39833390712738037,
"learning_rate": 0.00017469387755102042,
"loss": 0.3969,
"step": 225
},
{
"epoch": 0.6282140375260598,
"grad_norm": 0.411516010761261,
"learning_rate": 0.00017448979591836735,
"loss": 0.4763,
"step": 226
},
{
"epoch": 0.6309937456567061,
"grad_norm": 0.40843451023101807,
"learning_rate": 0.0001742857142857143,
"loss": 0.4998,
"step": 227
},
{
"epoch": 0.6337734537873523,
"grad_norm": 0.39122384786605835,
"learning_rate": 0.00017408163265306123,
"loss": 0.4365,
"step": 228
},
{
"epoch": 0.6365531619179986,
"grad_norm": 0.37201231718063354,
"learning_rate": 0.00017387755102040816,
"loss": 0.4427,
"step": 229
},
{
"epoch": 0.6393328700486449,
"grad_norm": 0.4359400272369385,
"learning_rate": 0.0001736734693877551,
"loss": 0.4686,
"step": 230
},
{
"epoch": 0.6421125781792911,
"grad_norm": 0.4254358112812042,
"learning_rate": 0.00017346938775510205,
"loss": 0.4274,
"step": 231
},
{
"epoch": 0.6448922863099374,
"grad_norm": 0.383859783411026,
"learning_rate": 0.00017326530612244898,
"loss": 0.3683,
"step": 232
},
{
"epoch": 0.6476719944405838,
"grad_norm": 0.41463732719421387,
"learning_rate": 0.00017306122448979594,
"loss": 0.4606,
"step": 233
},
{
"epoch": 0.65045170257123,
"grad_norm": 0.32726776599884033,
"learning_rate": 0.00017285714285714287,
"loss": 0.3883,
"step": 234
},
{
"epoch": 0.6532314107018763,
"grad_norm": 0.4143235683441162,
"learning_rate": 0.00017265306122448982,
"loss": 0.5929,
"step": 235
},
{
"epoch": 0.6560111188325226,
"grad_norm": 0.419869601726532,
"learning_rate": 0.00017244897959183675,
"loss": 0.527,
"step": 236
},
{
"epoch": 0.6587908269631688,
"grad_norm": 0.4330616295337677,
"learning_rate": 0.00017224489795918368,
"loss": 0.523,
"step": 237
},
{
"epoch": 0.6615705350938151,
"grad_norm": 0.42590218782424927,
"learning_rate": 0.0001720408163265306,
"loss": 0.4894,
"step": 238
},
{
"epoch": 0.6643502432244615,
"grad_norm": 0.5940903425216675,
"learning_rate": 0.00017183673469387757,
"loss": 0.458,
"step": 239
},
{
"epoch": 0.6671299513551077,
"grad_norm": 0.3962993025779724,
"learning_rate": 0.0001716326530612245,
"loss": 0.4934,
"step": 240
},
{
"epoch": 0.669909659485754,
"grad_norm": 0.3732195496559143,
"learning_rate": 0.00017142857142857143,
"loss": 0.4779,
"step": 241
},
{
"epoch": 0.6726893676164003,
"grad_norm": 0.39993205666542053,
"learning_rate": 0.00017122448979591838,
"loss": 0.3835,
"step": 242
},
{
"epoch": 0.6754690757470465,
"grad_norm": 0.37663185596466064,
"learning_rate": 0.0001710204081632653,
"loss": 0.3876,
"step": 243
},
{
"epoch": 0.6782487838776928,
"grad_norm": 0.33526360988616943,
"learning_rate": 0.00017081632653061227,
"loss": 0.4415,
"step": 244
},
{
"epoch": 0.6810284920083391,
"grad_norm": 0.4333009421825409,
"learning_rate": 0.0001706122448979592,
"loss": 0.4897,
"step": 245
},
{
"epoch": 0.6838082001389854,
"grad_norm": 0.4983868896961212,
"learning_rate": 0.00017040816326530613,
"loss": 0.4477,
"step": 246
},
{
"epoch": 0.6865879082696317,
"grad_norm": 0.39352232217788696,
"learning_rate": 0.00017020408163265306,
"loss": 0.4524,
"step": 247
},
{
"epoch": 0.689367616400278,
"grad_norm": 0.40973153710365295,
"learning_rate": 0.00017,
"loss": 0.4884,
"step": 248
},
{
"epoch": 0.6921473245309242,
"grad_norm": 0.33771470189094543,
"learning_rate": 0.00016979591836734694,
"loss": 0.4938,
"step": 249
},
{
"epoch": 0.6949270326615705,
"grad_norm": 0.34451448917388916,
"learning_rate": 0.00016959183673469387,
"loss": 0.3722,
"step": 250
},
{
"epoch": 0.6977067407922168,
"grad_norm": 0.38862481713294983,
"learning_rate": 0.00016938775510204083,
"loss": 0.426,
"step": 251
},
{
"epoch": 0.700486448922863,
"grad_norm": 0.41708311438560486,
"learning_rate": 0.00016918367346938778,
"loss": 0.5239,
"step": 252
},
{
"epoch": 0.7032661570535094,
"grad_norm": 0.4317916929721832,
"learning_rate": 0.0001689795918367347,
"loss": 0.4556,
"step": 253
},
{
"epoch": 0.7060458651841557,
"grad_norm": 0.3338056802749634,
"learning_rate": 0.00016877551020408164,
"loss": 0.3332,
"step": 254
},
{
"epoch": 0.7088255733148019,
"grad_norm": 0.41748374700546265,
"learning_rate": 0.00016857142857142857,
"loss": 0.4594,
"step": 255
},
{
"epoch": 0.7116052814454482,
"grad_norm": 0.4123172461986542,
"learning_rate": 0.00016836734693877553,
"loss": 0.4289,
"step": 256
},
{
"epoch": 0.7143849895760945,
"grad_norm": 0.40377530455589294,
"learning_rate": 0.00016816326530612246,
"loss": 0.5744,
"step": 257
},
{
"epoch": 0.7171646977067407,
"grad_norm": 0.369667649269104,
"learning_rate": 0.0001679591836734694,
"loss": 0.5848,
"step": 258
},
{
"epoch": 0.7199444058373871,
"grad_norm": 0.39214134216308594,
"learning_rate": 0.00016775510204081632,
"loss": 0.3939,
"step": 259
},
{
"epoch": 0.7227241139680334,
"grad_norm": 0.3648947775363922,
"learning_rate": 0.00016755102040816327,
"loss": 0.3941,
"step": 260
},
{
"epoch": 0.7255038220986796,
"grad_norm": 0.3529266119003296,
"learning_rate": 0.00016734693877551023,
"loss": 0.3505,
"step": 261
},
{
"epoch": 0.7282835302293259,
"grad_norm": 0.3326796889305115,
"learning_rate": 0.00016714285714285716,
"loss": 0.3973,
"step": 262
},
{
"epoch": 0.7310632383599722,
"grad_norm": 0.37780439853668213,
"learning_rate": 0.0001669387755102041,
"loss": 0.4882,
"step": 263
},
{
"epoch": 0.7338429464906184,
"grad_norm": 0.4995975196361542,
"learning_rate": 0.00016673469387755102,
"loss": 0.4417,
"step": 264
},
{
"epoch": 0.7366226546212647,
"grad_norm": 0.40474021434783936,
"learning_rate": 0.00016653061224489797,
"loss": 0.4511,
"step": 265
},
{
"epoch": 0.7394023627519111,
"grad_norm": 0.8704133629798889,
"learning_rate": 0.0001663265306122449,
"loss": 0.6664,
"step": 266
},
{
"epoch": 0.7421820708825573,
"grad_norm": 0.7991705536842346,
"learning_rate": 0.00016612244897959183,
"loss": 0.5963,
"step": 267
},
{
"epoch": 0.7449617790132036,
"grad_norm": 0.4240580201148987,
"learning_rate": 0.00016591836734693876,
"loss": 0.4474,
"step": 268
},
{
"epoch": 0.7477414871438499,
"grad_norm": 0.4676007032394409,
"learning_rate": 0.00016571428571428575,
"loss": 0.5125,
"step": 269
},
{
"epoch": 0.7505211952744962,
"grad_norm": 0.2894349694252014,
"learning_rate": 0.00016551020408163268,
"loss": 0.4297,
"step": 270
},
{
"epoch": 0.7533009034051424,
"grad_norm": 0.4876716732978821,
"learning_rate": 0.0001653061224489796,
"loss": 0.391,
"step": 271
},
{
"epoch": 0.7560806115357888,
"grad_norm": 0.37176764011383057,
"learning_rate": 0.00016510204081632653,
"loss": 0.4825,
"step": 272
},
{
"epoch": 0.7588603196664351,
"grad_norm": 0.49970927834510803,
"learning_rate": 0.0001648979591836735,
"loss": 0.3729,
"step": 273
},
{
"epoch": 0.7616400277970813,
"grad_norm": 0.3654176890850067,
"learning_rate": 0.00016469387755102042,
"loss": 0.4255,
"step": 274
},
{
"epoch": 0.7644197359277276,
"grad_norm": 0.44572046399116516,
"learning_rate": 0.00016448979591836735,
"loss": 0.502,
"step": 275
},
{
"epoch": 0.7671994440583739,
"grad_norm": 0.4408852159976959,
"learning_rate": 0.00016428571428571428,
"loss": 0.4563,
"step": 276
},
{
"epoch": 0.7699791521890201,
"grad_norm": 0.36519378423690796,
"learning_rate": 0.00016408163265306124,
"loss": 0.4626,
"step": 277
},
{
"epoch": 0.7727588603196665,
"grad_norm": 0.48397397994995117,
"learning_rate": 0.0001638775510204082,
"loss": 0.4365,
"step": 278
},
{
"epoch": 0.7755385684503128,
"grad_norm": 0.39511287212371826,
"learning_rate": 0.00016367346938775512,
"loss": 0.4178,
"step": 279
},
{
"epoch": 0.778318276580959,
"grad_norm": 0.5128254890441895,
"learning_rate": 0.00016346938775510205,
"loss": 0.3998,
"step": 280
},
{
"epoch": 0.7810979847116053,
"grad_norm": 0.5283316969871521,
"learning_rate": 0.00016326530612244898,
"loss": 0.3428,
"step": 281
},
{
"epoch": 0.7838776928422516,
"grad_norm": 0.4386744201183319,
"learning_rate": 0.00016306122448979594,
"loss": 0.3976,
"step": 282
},
{
"epoch": 0.7866574009728978,
"grad_norm": 0.5863499641418457,
"learning_rate": 0.00016285714285714287,
"loss": 0.5529,
"step": 283
},
{
"epoch": 0.7894371091035441,
"grad_norm": 0.27297189831733704,
"learning_rate": 0.0001626530612244898,
"loss": 0.3282,
"step": 284
},
{
"epoch": 0.7922168172341905,
"grad_norm": 0.29970136284828186,
"learning_rate": 0.00016244897959183672,
"loss": 0.365,
"step": 285
},
{
"epoch": 0.7949965253648367,
"grad_norm": 0.3835904598236084,
"learning_rate": 0.00016224489795918368,
"loss": 0.4768,
"step": 286
},
{
"epoch": 0.797776233495483,
"grad_norm": 0.4071420729160309,
"learning_rate": 0.0001620408163265306,
"loss": 0.5426,
"step": 287
},
{
"epoch": 0.8005559416261293,
"grad_norm": 0.28784051537513733,
"learning_rate": 0.00016183673469387757,
"loss": 0.4108,
"step": 288
},
{
"epoch": 0.8033356497567755,
"grad_norm": 0.4735048711299896,
"learning_rate": 0.0001616326530612245,
"loss": 0.5495,
"step": 289
},
{
"epoch": 0.8061153578874218,
"grad_norm": 0.2624104917049408,
"learning_rate": 0.00016142857142857145,
"loss": 0.4767,
"step": 290
},
{
"epoch": 0.8088950660180682,
"grad_norm": 0.38945189118385315,
"learning_rate": 0.00016122448979591838,
"loss": 0.5055,
"step": 291
},
{
"epoch": 0.8116747741487144,
"grad_norm": 0.4181615114212036,
"learning_rate": 0.0001610204081632653,
"loss": 0.4497,
"step": 292
},
{
"epoch": 0.8144544822793607,
"grad_norm": 0.37034186720848083,
"learning_rate": 0.00016081632653061224,
"loss": 0.3471,
"step": 293
},
{
"epoch": 0.817234190410007,
"grad_norm": 0.3208980858325958,
"learning_rate": 0.0001606122448979592,
"loss": 0.3868,
"step": 294
},
{
"epoch": 0.8200138985406532,
"grad_norm": 0.4345311224460602,
"learning_rate": 0.00016040816326530613,
"loss": 0.439,
"step": 295
},
{
"epoch": 0.8227936066712995,
"grad_norm": 0.31438905000686646,
"learning_rate": 0.00016020408163265306,
"loss": 0.4661,
"step": 296
},
{
"epoch": 0.8255733148019458,
"grad_norm": 0.2713527977466583,
"learning_rate": 0.00016,
"loss": 0.3945,
"step": 297
},
{
"epoch": 0.828353022932592,
"grad_norm": 0.2937558591365814,
"learning_rate": 0.00015979591836734694,
"loss": 0.3287,
"step": 298
},
{
"epoch": 0.8311327310632384,
"grad_norm": 0.37041494250297546,
"learning_rate": 0.0001595918367346939,
"loss": 0.5133,
"step": 299
},
{
"epoch": 0.8339124391938847,
"grad_norm": 0.4814389646053314,
"learning_rate": 0.00015938775510204083,
"loss": 0.498,
"step": 300
},
{
"epoch": 0.8339124391938847,
"eval_loss": 0.4747912585735321,
"eval_runtime": 212.458,
"eval_samples_per_second": 1.694,
"eval_steps_per_second": 1.694,
"step": 300
},
{
"epoch": 0.8366921473245309,
"grad_norm": 0.3283718526363373,
"learning_rate": 0.00015918367346938776,
"loss": 0.4485,
"step": 301
},
{
"epoch": 0.8394718554551772,
"grad_norm": 0.38611340522766113,
"learning_rate": 0.0001589795918367347,
"loss": 0.4188,
"step": 302
},
{
"epoch": 0.8422515635858235,
"grad_norm": 0.29100102186203003,
"learning_rate": 0.00015877551020408164,
"loss": 0.3489,
"step": 303
},
{
"epoch": 0.8450312717164697,
"grad_norm": 0.32593274116516113,
"learning_rate": 0.00015857142857142857,
"loss": 0.391,
"step": 304
},
{
"epoch": 0.8478109798471161,
"grad_norm": 0.5677832365036011,
"learning_rate": 0.0001583673469387755,
"loss": 0.4837,
"step": 305
},
{
"epoch": 0.8505906879777624,
"grad_norm": 0.334756463766098,
"learning_rate": 0.00015816326530612246,
"loss": 0.3565,
"step": 306
},
{
"epoch": 0.8533703961084086,
"grad_norm": 0.5696679353713989,
"learning_rate": 0.00015795918367346942,
"loss": 0.4759,
"step": 307
},
{
"epoch": 0.8561501042390549,
"grad_norm": 0.25399741530418396,
"learning_rate": 0.00015775510204081634,
"loss": 0.4281,
"step": 308
},
{
"epoch": 0.8589298123697012,
"grad_norm": 0.3591265380382538,
"learning_rate": 0.00015755102040816327,
"loss": 0.4464,
"step": 309
},
{
"epoch": 0.8617095205003474,
"grad_norm": 0.3444579839706421,
"learning_rate": 0.0001573469387755102,
"loss": 0.4565,
"step": 310
},
{
"epoch": 0.8644892286309938,
"grad_norm": 0.3792060315608978,
"learning_rate": 0.00015714285714285716,
"loss": 0.5026,
"step": 311
},
{
"epoch": 0.8672689367616401,
"grad_norm": 0.2783966362476349,
"learning_rate": 0.0001569387755102041,
"loss": 0.4912,
"step": 312
},
{
"epoch": 0.8700486448922863,
"grad_norm": 0.31529495120048523,
"learning_rate": 0.00015673469387755102,
"loss": 0.3993,
"step": 313
},
{
"epoch": 0.8728283530229326,
"grad_norm": 0.3652310073375702,
"learning_rate": 0.00015653061224489795,
"loss": 0.5009,
"step": 314
},
{
"epoch": 0.8756080611535789,
"grad_norm": 0.3820590078830719,
"learning_rate": 0.0001563265306122449,
"loss": 0.3424,
"step": 315
},
{
"epoch": 0.8783877692842251,
"grad_norm": 0.3363693654537201,
"learning_rate": 0.00015612244897959186,
"loss": 0.5251,
"step": 316
},
{
"epoch": 0.8811674774148714,
"grad_norm": 0.31948599219322205,
"learning_rate": 0.0001559183673469388,
"loss": 0.4199,
"step": 317
},
{
"epoch": 0.8839471855455178,
"grad_norm": 0.4196965992450714,
"learning_rate": 0.00015571428571428572,
"loss": 0.493,
"step": 318
},
{
"epoch": 0.886726893676164,
"grad_norm": 0.4823121428489685,
"learning_rate": 0.00015551020408163265,
"loss": 0.43,
"step": 319
},
{
"epoch": 0.8895066018068103,
"grad_norm": 0.32050636410713196,
"learning_rate": 0.0001553061224489796,
"loss": 0.4247,
"step": 320
},
{
"epoch": 0.8922863099374566,
"grad_norm": 0.3208867609500885,
"learning_rate": 0.00015510204081632654,
"loss": 0.3224,
"step": 321
},
{
"epoch": 0.8950660180681028,
"grad_norm": 0.36326608061790466,
"learning_rate": 0.00015489795918367346,
"loss": 0.4366,
"step": 322
},
{
"epoch": 0.8978457261987491,
"grad_norm": 0.44612210988998413,
"learning_rate": 0.0001546938775510204,
"loss": 0.4212,
"step": 323
},
{
"epoch": 0.9006254343293955,
"grad_norm": 0.5350055694580078,
"learning_rate": 0.00015448979591836735,
"loss": 0.42,
"step": 324
},
{
"epoch": 0.9034051424600417,
"grad_norm": 0.5650726556777954,
"learning_rate": 0.0001542857142857143,
"loss": 0.5001,
"step": 325
},
{
"epoch": 0.906184850590688,
"grad_norm": 0.2960895001888275,
"learning_rate": 0.00015408163265306124,
"loss": 0.2943,
"step": 326
},
{
"epoch": 0.9089645587213343,
"grad_norm": 0.4059947729110718,
"learning_rate": 0.00015387755102040817,
"loss": 0.4147,
"step": 327
},
{
"epoch": 0.9117442668519805,
"grad_norm": 0.31508710980415344,
"learning_rate": 0.00015367346938775512,
"loss": 0.3567,
"step": 328
},
{
"epoch": 0.9145239749826268,
"grad_norm": 0.30250322818756104,
"learning_rate": 0.00015346938775510205,
"loss": 0.4534,
"step": 329
},
{
"epoch": 0.9173036831132731,
"grad_norm": 0.45266756415367126,
"learning_rate": 0.00015326530612244898,
"loss": 0.4675,
"step": 330
},
{
"epoch": 0.9200833912439194,
"grad_norm": 0.2555678188800812,
"learning_rate": 0.0001530612244897959,
"loss": 0.4132,
"step": 331
},
{
"epoch": 0.9228630993745657,
"grad_norm": 0.3064277768135071,
"learning_rate": 0.00015285714285714287,
"loss": 0.5,
"step": 332
},
{
"epoch": 0.925642807505212,
"grad_norm": 0.40955591201782227,
"learning_rate": 0.00015265306122448982,
"loss": 0.4131,
"step": 333
},
{
"epoch": 0.9284225156358582,
"grad_norm": 0.32479333877563477,
"learning_rate": 0.00015244897959183675,
"loss": 0.4303,
"step": 334
},
{
"epoch": 0.9312022237665045,
"grad_norm": 0.4044603705406189,
"learning_rate": 0.00015224489795918368,
"loss": 0.4498,
"step": 335
},
{
"epoch": 0.9339819318971508,
"grad_norm": 0.4078894555568695,
"learning_rate": 0.0001520408163265306,
"loss": 0.4442,
"step": 336
},
{
"epoch": 0.936761640027797,
"grad_norm": 0.4486389756202698,
"learning_rate": 0.00015183673469387757,
"loss": 0.3902,
"step": 337
},
{
"epoch": 0.9395413481584434,
"grad_norm": 0.3673665523529053,
"learning_rate": 0.0001516326530612245,
"loss": 0.4158,
"step": 338
},
{
"epoch": 0.9423210562890897,
"grad_norm": 0.28217577934265137,
"learning_rate": 0.00015142857142857143,
"loss": 0.4327,
"step": 339
},
{
"epoch": 0.9451007644197359,
"grad_norm": 0.30868950486183167,
"learning_rate": 0.00015122448979591836,
"loss": 0.411,
"step": 340
},
{
"epoch": 0.9478804725503822,
"grad_norm": 0.27955666184425354,
"learning_rate": 0.0001510204081632653,
"loss": 0.4125,
"step": 341
},
{
"epoch": 0.9506601806810285,
"grad_norm": 0.49935731291770935,
"learning_rate": 0.00015081632653061227,
"loss": 0.369,
"step": 342
},
{
"epoch": 0.9534398888116747,
"grad_norm": 0.26663827896118164,
"learning_rate": 0.0001506122448979592,
"loss": 0.3864,
"step": 343
},
{
"epoch": 0.9562195969423211,
"grad_norm": 0.32989761233329773,
"learning_rate": 0.00015040816326530613,
"loss": 0.3859,
"step": 344
},
{
"epoch": 0.9589993050729674,
"grad_norm": 0.37399861216545105,
"learning_rate": 0.00015020408163265306,
"loss": 0.4813,
"step": 345
},
{
"epoch": 0.9617790132036136,
"grad_norm": 0.3359721302986145,
"learning_rate": 0.00015000000000000001,
"loss": 0.4135,
"step": 346
},
{
"epoch": 0.9645587213342599,
"grad_norm": 0.29389145970344543,
"learning_rate": 0.00014979591836734694,
"loss": 0.407,
"step": 347
},
{
"epoch": 0.9673384294649062,
"grad_norm": 0.39862900972366333,
"learning_rate": 0.00014959183673469387,
"loss": 0.4659,
"step": 348
},
{
"epoch": 0.9701181375955524,
"grad_norm": 0.36769983172416687,
"learning_rate": 0.00014938775510204083,
"loss": 0.4076,
"step": 349
},
{
"epoch": 0.9728978457261988,
"grad_norm": 0.29756975173950195,
"learning_rate": 0.00014918367346938776,
"loss": 0.3922,
"step": 350
},
{
"epoch": 0.9756775538568451,
"grad_norm": 0.4466356337070465,
"learning_rate": 0.00014897959183673472,
"loss": 0.4123,
"step": 351
},
{
"epoch": 0.9784572619874913,
"grad_norm": 0.5409598350524902,
"learning_rate": 0.00014877551020408164,
"loss": 0.2904,
"step": 352
},
{
"epoch": 0.9812369701181376,
"grad_norm": 0.5088945627212524,
"learning_rate": 0.00014857142857142857,
"loss": 0.3555,
"step": 353
},
{
"epoch": 0.9840166782487839,
"grad_norm": 0.41926005482673645,
"learning_rate": 0.00014836734693877553,
"loss": 0.3168,
"step": 354
},
{
"epoch": 0.9867963863794301,
"grad_norm": 0.3465840220451355,
"learning_rate": 0.00014816326530612246,
"loss": 0.4542,
"step": 355
},
{
"epoch": 0.9895760945100764,
"grad_norm": 0.29874542355537415,
"learning_rate": 0.0001479591836734694,
"loss": 0.4541,
"step": 356
},
{
"epoch": 0.9923558026407228,
"grad_norm": 0.32591426372528076,
"learning_rate": 0.00014775510204081632,
"loss": 0.3355,
"step": 357
},
{
"epoch": 0.995135510771369,
"grad_norm": 0.6298475861549377,
"learning_rate": 0.00014755102040816328,
"loss": 0.4996,
"step": 358
},
{
"epoch": 0.9979152189020153,
"grad_norm": 0.6214368939399719,
"learning_rate": 0.0001473469387755102,
"loss": 0.5361,
"step": 359
},
{
"epoch": 1.0,
"grad_norm": 0.29986897110939026,
"learning_rate": 0.00014714285714285716,
"loss": 0.2962,
"step": 360
},
{
"epoch": 1.0027797081306462,
"grad_norm": 0.27898886799812317,
"learning_rate": 0.0001469387755102041,
"loss": 0.3485,
"step": 361
},
{
"epoch": 1.0055594162612926,
"grad_norm": 0.3615298569202423,
"learning_rate": 0.00014673469387755102,
"loss": 0.4651,
"step": 362
},
{
"epoch": 1.0083391243919388,
"grad_norm": 0.31298184394836426,
"learning_rate": 0.00014653061224489798,
"loss": 0.4272,
"step": 363
},
{
"epoch": 1.011118832522585,
"grad_norm": 0.27693745493888855,
"learning_rate": 0.0001463265306122449,
"loss": 0.36,
"step": 364
},
{
"epoch": 1.0138985406532315,
"grad_norm": 0.3590083122253418,
"learning_rate": 0.00014612244897959183,
"loss": 0.4548,
"step": 365
},
{
"epoch": 1.0166782487838777,
"grad_norm": 0.3620007336139679,
"learning_rate": 0.0001459183673469388,
"loss": 0.3749,
"step": 366
},
{
"epoch": 1.019457956914524,
"grad_norm": 0.46811267733573914,
"learning_rate": 0.00014571428571428572,
"loss": 0.4619,
"step": 367
},
{
"epoch": 1.0222376650451703,
"grad_norm": 0.3233739137649536,
"learning_rate": 0.00014551020408163265,
"loss": 0.4955,
"step": 368
},
{
"epoch": 1.0250173731758165,
"grad_norm": 0.3408876061439514,
"learning_rate": 0.0001453061224489796,
"loss": 0.3564,
"step": 369
},
{
"epoch": 1.0277970813064627,
"grad_norm": 0.33044156432151794,
"learning_rate": 0.00014510204081632654,
"loss": 0.4784,
"step": 370
},
{
"epoch": 1.0305767894371092,
"grad_norm": 0.4836321175098419,
"learning_rate": 0.0001448979591836735,
"loss": 0.2986,
"step": 371
},
{
"epoch": 1.0333564975677554,
"grad_norm": 0.3213842213153839,
"learning_rate": 0.00014469387755102042,
"loss": 0.3768,
"step": 372
},
{
"epoch": 1.0361362056984016,
"grad_norm": 0.4250502288341522,
"learning_rate": 0.00014448979591836735,
"loss": 0.4024,
"step": 373
},
{
"epoch": 1.038915913829048,
"grad_norm": 0.5815131068229675,
"learning_rate": 0.00014428571428571428,
"loss": 0.4631,
"step": 374
},
{
"epoch": 1.0416956219596942,
"grad_norm": 0.3646114468574524,
"learning_rate": 0.00014408163265306124,
"loss": 0.4658,
"step": 375
},
{
"epoch": 1.0444753300903404,
"grad_norm": 0.4233134388923645,
"learning_rate": 0.00014387755102040817,
"loss": 0.4287,
"step": 376
},
{
"epoch": 1.0472550382209869,
"grad_norm": 0.29038846492767334,
"learning_rate": 0.0001436734693877551,
"loss": 0.3537,
"step": 377
},
{
"epoch": 1.050034746351633,
"grad_norm": 0.3281858265399933,
"learning_rate": 0.00014346938775510205,
"loss": 0.3594,
"step": 378
},
{
"epoch": 1.0528144544822793,
"grad_norm": 0.3003385066986084,
"learning_rate": 0.00014326530612244898,
"loss": 0.429,
"step": 379
},
{
"epoch": 1.0555941626129257,
"grad_norm": 0.42301732301712036,
"learning_rate": 0.00014306122448979594,
"loss": 0.3517,
"step": 380
},
{
"epoch": 1.058373870743572,
"grad_norm": 0.3951142728328705,
"learning_rate": 0.00014285714285714287,
"loss": 0.3341,
"step": 381
},
{
"epoch": 1.0611535788742181,
"grad_norm": 0.3241204023361206,
"learning_rate": 0.0001426530612244898,
"loss": 0.4393,
"step": 382
},
{
"epoch": 1.0639332870048646,
"grad_norm": 0.37029820680618286,
"learning_rate": 0.00014244897959183673,
"loss": 0.2978,
"step": 383
},
{
"epoch": 1.0667129951355108,
"grad_norm": 0.2742837965488434,
"learning_rate": 0.00014224489795918368,
"loss": 0.3306,
"step": 384
},
{
"epoch": 1.069492703266157,
"grad_norm": 0.26682788133621216,
"learning_rate": 0.0001420408163265306,
"loss": 0.4009,
"step": 385
},
{
"epoch": 1.0722724113968034,
"grad_norm": 0.3267010450363159,
"learning_rate": 0.00014183673469387754,
"loss": 0.303,
"step": 386
},
{
"epoch": 1.0750521195274496,
"grad_norm": 0.3354584276676178,
"learning_rate": 0.0001416326530612245,
"loss": 0.3721,
"step": 387
},
{
"epoch": 1.0778318276580958,
"grad_norm": 0.33933278918266296,
"learning_rate": 0.00014142857142857145,
"loss": 0.437,
"step": 388
},
{
"epoch": 1.0806115357887423,
"grad_norm": 0.6241405606269836,
"learning_rate": 0.00014122448979591838,
"loss": 0.4561,
"step": 389
},
{
"epoch": 1.0833912439193885,
"grad_norm": 0.6265623569488525,
"learning_rate": 0.00014102040816326531,
"loss": 0.5705,
"step": 390
},
{
"epoch": 1.0861709520500347,
"grad_norm": 0.4538445770740509,
"learning_rate": 0.00014081632653061224,
"loss": 0.3084,
"step": 391
},
{
"epoch": 1.088950660180681,
"grad_norm": 0.2949851453304291,
"learning_rate": 0.0001406122448979592,
"loss": 0.3774,
"step": 392
},
{
"epoch": 1.0917303683113273,
"grad_norm": 0.25528761744499207,
"learning_rate": 0.00014040816326530613,
"loss": 0.362,
"step": 393
},
{
"epoch": 1.0945100764419735,
"grad_norm": 0.25837084650993347,
"learning_rate": 0.00014020408163265306,
"loss": 0.341,
"step": 394
},
{
"epoch": 1.09728978457262,
"grad_norm": 0.3381750285625458,
"learning_rate": 0.00014,
"loss": 0.409,
"step": 395
},
{
"epoch": 1.1000694927032661,
"grad_norm": 0.296481728553772,
"learning_rate": 0.00013979591836734694,
"loss": 0.3378,
"step": 396
},
{
"epoch": 1.1028492008339124,
"grad_norm": 0.2741848826408386,
"learning_rate": 0.0001395918367346939,
"loss": 0.3112,
"step": 397
},
{
"epoch": 1.1056289089645588,
"grad_norm": 0.2974790036678314,
"learning_rate": 0.00013938775510204083,
"loss": 0.434,
"step": 398
},
{
"epoch": 1.108408617095205,
"grad_norm": 0.5039945840835571,
"learning_rate": 0.00013918367346938776,
"loss": 0.4185,
"step": 399
},
{
"epoch": 1.1111883252258512,
"grad_norm": 0.3443140387535095,
"learning_rate": 0.0001389795918367347,
"loss": 0.2975,
"step": 400
},
{
"epoch": 1.1139680333564976,
"grad_norm": 0.38616564869880676,
"learning_rate": 0.00013877551020408165,
"loss": 0.5452,
"step": 401
},
{
"epoch": 1.1167477414871438,
"grad_norm": 0.34680166840553284,
"learning_rate": 0.00013857142857142857,
"loss": 0.3738,
"step": 402
},
{
"epoch": 1.11952744961779,
"grad_norm": 0.30565282702445984,
"learning_rate": 0.0001383673469387755,
"loss": 0.4436,
"step": 403
},
{
"epoch": 1.1223071577484365,
"grad_norm": 0.3291468322277069,
"learning_rate": 0.00013816326530612243,
"loss": 0.369,
"step": 404
},
{
"epoch": 1.1250868658790827,
"grad_norm": 0.36906489729881287,
"learning_rate": 0.00013795918367346942,
"loss": 0.4367,
"step": 405
},
{
"epoch": 1.127866574009729,
"grad_norm": 0.2758554220199585,
"learning_rate": 0.00013775510204081635,
"loss": 0.437,
"step": 406
},
{
"epoch": 1.1306462821403753,
"grad_norm": 0.3159145414829254,
"learning_rate": 0.00013755102040816328,
"loss": 0.4224,
"step": 407
},
{
"epoch": 1.1334259902710215,
"grad_norm": 0.25819459557533264,
"learning_rate": 0.0001373469387755102,
"loss": 0.331,
"step": 408
},
{
"epoch": 1.1362056984016677,
"grad_norm": 0.33512744307518005,
"learning_rate": 0.00013714285714285716,
"loss": 0.4196,
"step": 409
},
{
"epoch": 1.1389854065323142,
"grad_norm": 0.3556046187877655,
"learning_rate": 0.0001369387755102041,
"loss": 0.3745,
"step": 410
},
{
"epoch": 1.1417651146629604,
"grad_norm": 0.48709914088249207,
"learning_rate": 0.00013673469387755102,
"loss": 0.346,
"step": 411
},
{
"epoch": 1.1445448227936066,
"grad_norm": 0.3741767704486847,
"learning_rate": 0.00013653061224489795,
"loss": 0.4871,
"step": 412
},
{
"epoch": 1.147324530924253,
"grad_norm": 0.4699570834636688,
"learning_rate": 0.0001363265306122449,
"loss": 0.3674,
"step": 413
},
{
"epoch": 1.1501042390548992,
"grad_norm": 0.32232385873794556,
"learning_rate": 0.00013612244897959184,
"loss": 0.3821,
"step": 414
},
{
"epoch": 1.1528839471855454,
"grad_norm": 0.34662458300590515,
"learning_rate": 0.0001359183673469388,
"loss": 0.3121,
"step": 415
},
{
"epoch": 1.1556636553161919,
"grad_norm": 0.32288941740989685,
"learning_rate": 0.00013571428571428572,
"loss": 0.3808,
"step": 416
},
{
"epoch": 1.158443363446838,
"grad_norm": 0.3495519459247589,
"learning_rate": 0.00013551020408163265,
"loss": 0.352,
"step": 417
},
{
"epoch": 1.1612230715774843,
"grad_norm": 0.3813597559928894,
"learning_rate": 0.0001353061224489796,
"loss": 0.4044,
"step": 418
},
{
"epoch": 1.1640027797081307,
"grad_norm": 0.2824418246746063,
"learning_rate": 0.00013510204081632654,
"loss": 0.3902,
"step": 419
},
{
"epoch": 1.166782487838777,
"grad_norm": 0.25736352801322937,
"learning_rate": 0.00013489795918367347,
"loss": 0.3357,
"step": 420
},
{
"epoch": 1.1695621959694231,
"grad_norm": 0.33888882398605347,
"learning_rate": 0.0001346938775510204,
"loss": 0.3691,
"step": 421
},
{
"epoch": 1.1723419041000696,
"grad_norm": 0.3581472933292389,
"learning_rate": 0.00013448979591836735,
"loss": 0.3576,
"step": 422
},
{
"epoch": 1.1751216122307158,
"grad_norm": 0.3296295702457428,
"learning_rate": 0.00013428571428571428,
"loss": 0.3832,
"step": 423
},
{
"epoch": 1.177901320361362,
"grad_norm": 0.33110418915748596,
"learning_rate": 0.00013408163265306124,
"loss": 0.3247,
"step": 424
},
{
"epoch": 1.1806810284920084,
"grad_norm": 0.3340078592300415,
"learning_rate": 0.00013387755102040817,
"loss": 0.3718,
"step": 425
},
{
"epoch": 1.1834607366226546,
"grad_norm": 0.3025374114513397,
"learning_rate": 0.00013367346938775512,
"loss": 0.3766,
"step": 426
},
{
"epoch": 1.1862404447533008,
"grad_norm": 0.2760503590106964,
"learning_rate": 0.00013346938775510205,
"loss": 0.3311,
"step": 427
},
{
"epoch": 1.1890201528839472,
"grad_norm": 0.27493569254875183,
"learning_rate": 0.00013326530612244898,
"loss": 0.3487,
"step": 428
},
{
"epoch": 1.1917998610145935,
"grad_norm": 0.27025917172431946,
"learning_rate": 0.0001330612244897959,
"loss": 0.3817,
"step": 429
},
{
"epoch": 1.1945795691452397,
"grad_norm": 0.6159951686859131,
"learning_rate": 0.00013285714285714287,
"loss": 0.5815,
"step": 430
},
{
"epoch": 1.197359277275886,
"grad_norm": 0.30732738971710205,
"learning_rate": 0.0001326530612244898,
"loss": 0.3646,
"step": 431
},
{
"epoch": 1.2001389854065323,
"grad_norm": 0.3065405786037445,
"learning_rate": 0.00013244897959183673,
"loss": 0.4138,
"step": 432
},
{
"epoch": 1.2029186935371785,
"grad_norm": 0.4519249498844147,
"learning_rate": 0.00013224489795918368,
"loss": 0.5033,
"step": 433
},
{
"epoch": 1.205698401667825,
"grad_norm": 0.2661092281341553,
"learning_rate": 0.0001320408163265306,
"loss": 0.3642,
"step": 434
},
{
"epoch": 1.2084781097984711,
"grad_norm": 0.27004894614219666,
"learning_rate": 0.00013183673469387757,
"loss": 0.3814,
"step": 435
},
{
"epoch": 1.2112578179291174,
"grad_norm": 0.39225614070892334,
"learning_rate": 0.0001316326530612245,
"loss": 0.4773,
"step": 436
},
{
"epoch": 1.2140375260597638,
"grad_norm": 0.2685422897338867,
"learning_rate": 0.00013142857142857143,
"loss": 0.3679,
"step": 437
},
{
"epoch": 1.21681723419041,
"grad_norm": 0.32003405690193176,
"learning_rate": 0.00013122448979591836,
"loss": 0.4101,
"step": 438
},
{
"epoch": 1.2195969423210562,
"grad_norm": 0.2831343114376068,
"learning_rate": 0.00013102040816326531,
"loss": 0.3832,
"step": 439
},
{
"epoch": 1.2223766504517026,
"grad_norm": 0.327888548374176,
"learning_rate": 0.00013081632653061224,
"loss": 0.427,
"step": 440
},
{
"epoch": 1.2251563585823488,
"grad_norm": 0.37563320994377136,
"learning_rate": 0.00013061224489795917,
"loss": 0.4982,
"step": 441
},
{
"epoch": 1.227936066712995,
"grad_norm": 0.374957799911499,
"learning_rate": 0.00013040816326530613,
"loss": 0.3724,
"step": 442
},
{
"epoch": 1.2307157748436415,
"grad_norm": 0.3069044053554535,
"learning_rate": 0.00013020408163265309,
"loss": 0.4736,
"step": 443
},
{
"epoch": 1.2334954829742877,
"grad_norm": 0.2836786210536957,
"learning_rate": 0.00013000000000000002,
"loss": 0.3139,
"step": 444
},
{
"epoch": 1.236275191104934,
"grad_norm": 0.5021543502807617,
"learning_rate": 0.00012979591836734695,
"loss": 0.4687,
"step": 445
},
{
"epoch": 1.2390548992355803,
"grad_norm": 0.34825125336647034,
"learning_rate": 0.00012959183673469387,
"loss": 0.4624,
"step": 446
},
{
"epoch": 1.2418346073662265,
"grad_norm": 0.2713720202445984,
"learning_rate": 0.00012938775510204083,
"loss": 0.3841,
"step": 447
},
{
"epoch": 1.2446143154968727,
"grad_norm": 0.2995285093784332,
"learning_rate": 0.00012918367346938776,
"loss": 0.4582,
"step": 448
},
{
"epoch": 1.2473940236275192,
"grad_norm": 0.2539880871772766,
"learning_rate": 0.0001289795918367347,
"loss": 0.3232,
"step": 449
},
{
"epoch": 1.2501737317581654,
"grad_norm": 0.24956463277339935,
"learning_rate": 0.00012877551020408162,
"loss": 0.3282,
"step": 450
},
{
"epoch": 1.2529534398888118,
"grad_norm": 0.23669935762882233,
"learning_rate": 0.00012857142857142858,
"loss": 0.3036,
"step": 451
},
{
"epoch": 1.255733148019458,
"grad_norm": 0.31620267033576965,
"learning_rate": 0.00012836734693877553,
"loss": 0.4203,
"step": 452
},
{
"epoch": 1.2585128561501042,
"grad_norm": 0.37676119804382324,
"learning_rate": 0.00012816326530612246,
"loss": 0.4163,
"step": 453
},
{
"epoch": 1.2612925642807506,
"grad_norm": 0.2864663004875183,
"learning_rate": 0.0001279591836734694,
"loss": 0.4024,
"step": 454
},
{
"epoch": 1.2640722724113969,
"grad_norm": 0.27227628231048584,
"learning_rate": 0.00012775510204081632,
"loss": 0.32,
"step": 455
},
{
"epoch": 1.266851980542043,
"grad_norm": 0.402497261762619,
"learning_rate": 0.00012755102040816328,
"loss": 0.4626,
"step": 456
},
{
"epoch": 1.2696316886726895,
"grad_norm": 0.3603473901748657,
"learning_rate": 0.0001273469387755102,
"loss": 0.3218,
"step": 457
},
{
"epoch": 1.2724113968033357,
"grad_norm": 0.2873530685901642,
"learning_rate": 0.00012714285714285714,
"loss": 0.4026,
"step": 458
},
{
"epoch": 1.275191104933982,
"grad_norm": 0.29491978883743286,
"learning_rate": 0.00012693877551020406,
"loss": 0.2958,
"step": 459
},
{
"epoch": 1.2779708130646283,
"grad_norm": 0.3286297917366028,
"learning_rate": 0.00012673469387755105,
"loss": 0.3922,
"step": 460
},
{
"epoch": 1.2807505211952745,
"grad_norm": 0.3373448848724365,
"learning_rate": 0.00012653061224489798,
"loss": 0.4216,
"step": 461
},
{
"epoch": 1.2835302293259208,
"grad_norm": 0.305073082447052,
"learning_rate": 0.0001263265306122449,
"loss": 0.3676,
"step": 462
},
{
"epoch": 1.2863099374565672,
"grad_norm": 0.3803780674934387,
"learning_rate": 0.00012612244897959184,
"loss": 0.3714,
"step": 463
},
{
"epoch": 1.2890896455872134,
"grad_norm": 0.38346725702285767,
"learning_rate": 0.0001259183673469388,
"loss": 0.4816,
"step": 464
},
{
"epoch": 1.2918693537178596,
"grad_norm": 0.3128606379032135,
"learning_rate": 0.00012571428571428572,
"loss": 0.4051,
"step": 465
},
{
"epoch": 1.294649061848506,
"grad_norm": 0.2748924493789673,
"learning_rate": 0.00012551020408163265,
"loss": 0.383,
"step": 466
},
{
"epoch": 1.2974287699791522,
"grad_norm": 0.2946728765964508,
"learning_rate": 0.00012530612244897958,
"loss": 0.3495,
"step": 467
},
{
"epoch": 1.3002084781097984,
"grad_norm": 0.4133062958717346,
"learning_rate": 0.00012510204081632654,
"loss": 0.4196,
"step": 468
},
{
"epoch": 1.3029881862404449,
"grad_norm": 0.2911101281642914,
"learning_rate": 0.0001248979591836735,
"loss": 0.3733,
"step": 469
},
{
"epoch": 1.305767894371091,
"grad_norm": 0.35561490058898926,
"learning_rate": 0.00012469387755102042,
"loss": 0.4773,
"step": 470
},
{
"epoch": 1.3085476025017373,
"grad_norm": 0.23788857460021973,
"learning_rate": 0.00012448979591836735,
"loss": 0.3111,
"step": 471
},
{
"epoch": 1.3113273106323837,
"grad_norm": 0.3194721043109894,
"learning_rate": 0.00012428571428571428,
"loss": 0.3641,
"step": 472
},
{
"epoch": 1.31410701876303,
"grad_norm": 0.30017781257629395,
"learning_rate": 0.00012408163265306124,
"loss": 0.3239,
"step": 473
},
{
"epoch": 1.3168867268936761,
"grad_norm": 0.2877587676048279,
"learning_rate": 0.00012387755102040817,
"loss": 0.3622,
"step": 474
},
{
"epoch": 1.3196664350243226,
"grad_norm": 0.3146721422672272,
"learning_rate": 0.0001236734693877551,
"loss": 0.3629,
"step": 475
},
{
"epoch": 1.3224461431549688,
"grad_norm": 0.28086069226264954,
"learning_rate": 0.00012346938775510203,
"loss": 0.3967,
"step": 476
},
{
"epoch": 1.325225851285615,
"grad_norm": 0.2777217626571655,
"learning_rate": 0.00012326530612244898,
"loss": 0.3531,
"step": 477
},
{
"epoch": 1.3280055594162614,
"grad_norm": 0.2867282032966614,
"learning_rate": 0.00012306122448979594,
"loss": 0.293,
"step": 478
},
{
"epoch": 1.3307852675469076,
"grad_norm": 0.3004007339477539,
"learning_rate": 0.00012285714285714287,
"loss": 0.3577,
"step": 479
},
{
"epoch": 1.3335649756775538,
"grad_norm": 0.32095468044281006,
"learning_rate": 0.0001226530612244898,
"loss": 0.4233,
"step": 480
},
{
"epoch": 1.3363446838082003,
"grad_norm": 0.2812075614929199,
"learning_rate": 0.00012244897959183676,
"loss": 0.3405,
"step": 481
},
{
"epoch": 1.3391243919388465,
"grad_norm": 0.38106903433799744,
"learning_rate": 0.00012224489795918368,
"loss": 0.3918,
"step": 482
},
{
"epoch": 1.3419041000694927,
"grad_norm": 0.3127501606941223,
"learning_rate": 0.00012204081632653061,
"loss": 0.3959,
"step": 483
},
{
"epoch": 1.344683808200139,
"grad_norm": 0.31547605991363525,
"learning_rate": 0.00012183673469387756,
"loss": 0.4783,
"step": 484
},
{
"epoch": 1.3474635163307853,
"grad_norm": 0.3430207669734955,
"learning_rate": 0.00012163265306122449,
"loss": 0.4181,
"step": 485
},
{
"epoch": 1.3502432244614315,
"grad_norm": 0.30673524737358093,
"learning_rate": 0.00012142857142857143,
"loss": 0.4545,
"step": 486
},
{
"epoch": 1.353022932592078,
"grad_norm": 0.2744535505771637,
"learning_rate": 0.00012122448979591839,
"loss": 0.3568,
"step": 487
},
{
"epoch": 1.3558026407227242,
"grad_norm": 0.28088897466659546,
"learning_rate": 0.00012102040816326532,
"loss": 0.2957,
"step": 488
},
{
"epoch": 1.3585823488533704,
"grad_norm": 0.2807769775390625,
"learning_rate": 0.00012081632653061226,
"loss": 0.316,
"step": 489
},
{
"epoch": 1.3613620569840168,
"grad_norm": 0.33393171429634094,
"learning_rate": 0.00012061224489795919,
"loss": 0.3952,
"step": 490
},
{
"epoch": 1.364141765114663,
"grad_norm": 0.270470529794693,
"learning_rate": 0.00012040816326530613,
"loss": 0.3263,
"step": 491
},
{
"epoch": 1.3669214732453092,
"grad_norm": 0.26981666684150696,
"learning_rate": 0.00012020408163265306,
"loss": 0.3758,
"step": 492
},
{
"epoch": 1.3697011813759556,
"grad_norm": 0.41900643706321716,
"learning_rate": 0.00012,
"loss": 0.4978,
"step": 493
},
{
"epoch": 1.3724808895066019,
"grad_norm": 0.30232396721839905,
"learning_rate": 0.00011979591836734693,
"loss": 0.3857,
"step": 494
},
{
"epoch": 1.375260597637248,
"grad_norm": 0.302409827709198,
"learning_rate": 0.00011959183673469388,
"loss": 0.4455,
"step": 495
},
{
"epoch": 1.3780403057678945,
"grad_norm": 0.30566728115081787,
"learning_rate": 0.00011938775510204083,
"loss": 0.3365,
"step": 496
},
{
"epoch": 1.3808200138985407,
"grad_norm": 0.2815055847167969,
"learning_rate": 0.00011918367346938777,
"loss": 0.395,
"step": 497
},
{
"epoch": 1.383599722029187,
"grad_norm": 0.27659547328948975,
"learning_rate": 0.0001189795918367347,
"loss": 0.3635,
"step": 498
},
{
"epoch": 1.3863794301598333,
"grad_norm": 0.2691234052181244,
"learning_rate": 0.00011877551020408165,
"loss": 0.354,
"step": 499
},
{
"epoch": 1.3891591382904795,
"grad_norm": 0.26110008358955383,
"learning_rate": 0.00011857142857142858,
"loss": 0.3056,
"step": 500
},
{
"epoch": 1.3919388464211258,
"grad_norm": 0.3188588619232178,
"learning_rate": 0.00011836734693877552,
"loss": 0.3483,
"step": 501
},
{
"epoch": 1.3947185545517722,
"grad_norm": 0.26784244179725647,
"learning_rate": 0.00011816326530612245,
"loss": 0.4165,
"step": 502
},
{
"epoch": 1.3974982626824184,
"grad_norm": 0.27212825417518616,
"learning_rate": 0.00011795918367346939,
"loss": 0.3358,
"step": 503
},
{
"epoch": 1.4002779708130646,
"grad_norm": 0.27269822359085083,
"learning_rate": 0.00011775510204081632,
"loss": 0.3894,
"step": 504
},
{
"epoch": 1.403057678943711,
"grad_norm": 0.33584582805633545,
"learning_rate": 0.00011755102040816328,
"loss": 0.3801,
"step": 505
},
{
"epoch": 1.4058373870743572,
"grad_norm": 0.34887197613716125,
"learning_rate": 0.00011734693877551022,
"loss": 0.4057,
"step": 506
},
{
"epoch": 1.4086170952050034,
"grad_norm": 0.2805575728416443,
"learning_rate": 0.00011714285714285715,
"loss": 0.3556,
"step": 507
},
{
"epoch": 1.4113968033356499,
"grad_norm": 0.3376087546348572,
"learning_rate": 0.00011693877551020409,
"loss": 0.3511,
"step": 508
},
{
"epoch": 1.414176511466296,
"grad_norm": 0.29646357893943787,
"learning_rate": 0.00011673469387755102,
"loss": 0.3396,
"step": 509
},
{
"epoch": 1.4169562195969423,
"grad_norm": 0.2987593710422516,
"learning_rate": 0.00011653061224489797,
"loss": 0.3152,
"step": 510
},
{
"epoch": 1.4197359277275887,
"grad_norm": 0.27871114015579224,
"learning_rate": 0.0001163265306122449,
"loss": 0.3039,
"step": 511
},
{
"epoch": 1.422515635858235,
"grad_norm": 0.3051932752132416,
"learning_rate": 0.00011612244897959184,
"loss": 0.3619,
"step": 512
},
{
"epoch": 1.4252953439888811,
"grad_norm": 0.31153935194015503,
"learning_rate": 0.00011591836734693877,
"loss": 0.4433,
"step": 513
},
{
"epoch": 1.4280750521195276,
"grad_norm": 0.30891695618629456,
"learning_rate": 0.00011571428571428574,
"loss": 0.4737,
"step": 514
},
{
"epoch": 1.4308547602501738,
"grad_norm": 0.2646159529685974,
"learning_rate": 0.00011551020408163267,
"loss": 0.3155,
"step": 515
},
{
"epoch": 1.43363446838082,
"grad_norm": 0.31023016571998596,
"learning_rate": 0.00011530612244897961,
"loss": 0.468,
"step": 516
},
{
"epoch": 1.4364141765114664,
"grad_norm": 0.32484954595565796,
"learning_rate": 0.00011510204081632654,
"loss": 0.4442,
"step": 517
},
{
"epoch": 1.4391938846421126,
"grad_norm": 0.30216851830482483,
"learning_rate": 0.00011489795918367348,
"loss": 0.3959,
"step": 518
},
{
"epoch": 1.4419735927727588,
"grad_norm": 0.3013352155685425,
"learning_rate": 0.00011469387755102041,
"loss": 0.3682,
"step": 519
},
{
"epoch": 1.4447533009034053,
"grad_norm": 0.3074597716331482,
"learning_rate": 0.00011448979591836735,
"loss": 0.3796,
"step": 520
},
{
"epoch": 1.4475330090340515,
"grad_norm": 0.29347458481788635,
"learning_rate": 0.00011428571428571428,
"loss": 0.387,
"step": 521
},
{
"epoch": 1.4503127171646977,
"grad_norm": 0.2919844686985016,
"learning_rate": 0.00011408163265306123,
"loss": 0.3413,
"step": 522
},
{
"epoch": 1.453092425295344,
"grad_norm": 0.274503618478775,
"learning_rate": 0.00011387755102040818,
"loss": 0.3384,
"step": 523
},
{
"epoch": 1.4558721334259903,
"grad_norm": 0.30001312494277954,
"learning_rate": 0.00011367346938775511,
"loss": 0.3218,
"step": 524
},
{
"epoch": 1.4586518415566365,
"grad_norm": 0.2913724482059479,
"learning_rate": 0.00011346938775510206,
"loss": 0.3212,
"step": 525
},
{
"epoch": 1.461431549687283,
"grad_norm": 0.2810291051864624,
"learning_rate": 0.00011326530612244898,
"loss": 0.4783,
"step": 526
},
{
"epoch": 1.4642112578179292,
"grad_norm": 0.28154751658439636,
"learning_rate": 0.00011306122448979593,
"loss": 0.384,
"step": 527
},
{
"epoch": 1.4669909659485754,
"grad_norm": 0.3322899639606476,
"learning_rate": 0.00011285714285714286,
"loss": 0.345,
"step": 528
},
{
"epoch": 1.4697706740792218,
"grad_norm": 0.2776385247707367,
"learning_rate": 0.0001126530612244898,
"loss": 0.3287,
"step": 529
},
{
"epoch": 1.472550382209868,
"grad_norm": 0.3325115144252777,
"learning_rate": 0.00011244897959183673,
"loss": 0.4717,
"step": 530
},
{
"epoch": 1.4753300903405142,
"grad_norm": 0.30507102608680725,
"learning_rate": 0.00011224489795918367,
"loss": 0.3773,
"step": 531
},
{
"epoch": 1.4781097984711606,
"grad_norm": 0.28410202264785767,
"learning_rate": 0.0001120408163265306,
"loss": 0.3844,
"step": 532
},
{
"epoch": 1.4808895066018068,
"grad_norm": 0.3436387777328491,
"learning_rate": 0.00011183673469387757,
"loss": 0.4171,
"step": 533
},
{
"epoch": 1.483669214732453,
"grad_norm": 0.2561332881450653,
"learning_rate": 0.0001116326530612245,
"loss": 0.3597,
"step": 534
},
{
"epoch": 1.4864489228630995,
"grad_norm": 0.31354856491088867,
"learning_rate": 0.00011142857142857144,
"loss": 0.3473,
"step": 535
},
{
"epoch": 1.4892286309937457,
"grad_norm": 0.26828843355178833,
"learning_rate": 0.00011122448979591837,
"loss": 0.2624,
"step": 536
},
{
"epoch": 1.492008339124392,
"grad_norm": 0.3968587815761566,
"learning_rate": 0.00011102040816326532,
"loss": 0.387,
"step": 537
},
{
"epoch": 1.4947880472550383,
"grad_norm": 0.28001683950424194,
"learning_rate": 0.00011081632653061225,
"loss": 0.3883,
"step": 538
},
{
"epoch": 1.4975677553856845,
"grad_norm": 0.2872998118400574,
"learning_rate": 0.00011061224489795919,
"loss": 0.3847,
"step": 539
},
{
"epoch": 1.5003474635163307,
"grad_norm": 0.27555397152900696,
"learning_rate": 0.00011040816326530612,
"loss": 0.2961,
"step": 540
},
{
"epoch": 1.5031271716469772,
"grad_norm": 0.27156969904899597,
"learning_rate": 0.00011020408163265306,
"loss": 0.3722,
"step": 541
},
{
"epoch": 1.5059068797776234,
"grad_norm": 0.3042210638523102,
"learning_rate": 0.00011000000000000002,
"loss": 0.4093,
"step": 542
},
{
"epoch": 1.5086865879082696,
"grad_norm": 0.2673455774784088,
"learning_rate": 0.00010979591836734695,
"loss": 0.3467,
"step": 543
},
{
"epoch": 1.511466296038916,
"grad_norm": 0.4710778594017029,
"learning_rate": 0.00010959183673469389,
"loss": 0.4335,
"step": 544
},
{
"epoch": 1.5142460041695622,
"grad_norm": 0.28414055705070496,
"learning_rate": 0.00010938775510204082,
"loss": 0.3329,
"step": 545
},
{
"epoch": 1.5170257123002084,
"grad_norm": 0.28005218505859375,
"learning_rate": 0.00010918367346938776,
"loss": 0.3585,
"step": 546
},
{
"epoch": 1.5198054204308549,
"grad_norm": 0.36730489134788513,
"learning_rate": 0.00010897959183673469,
"loss": 0.3679,
"step": 547
},
{
"epoch": 1.522585128561501,
"grad_norm": 0.2829911410808563,
"learning_rate": 0.00010877551020408163,
"loss": 0.364,
"step": 548
},
{
"epoch": 1.5253648366921473,
"grad_norm": 0.27309224009513855,
"learning_rate": 0.00010857142857142856,
"loss": 0.413,
"step": 549
},
{
"epoch": 1.5281445448227937,
"grad_norm": 0.25464004278182983,
"learning_rate": 0.00010836734693877551,
"loss": 0.3196,
"step": 550
},
{
"epoch": 1.53092425295344,
"grad_norm": 0.2633742094039917,
"learning_rate": 0.00010816326530612246,
"loss": 0.312,
"step": 551
},
{
"epoch": 1.5337039610840861,
"grad_norm": 0.34395769238471985,
"learning_rate": 0.0001079591836734694,
"loss": 0.3483,
"step": 552
},
{
"epoch": 1.5364836692147326,
"grad_norm": 0.27649009227752686,
"learning_rate": 0.00010775510204081634,
"loss": 0.4565,
"step": 553
},
{
"epoch": 1.5392633773453788,
"grad_norm": 0.4239721894264221,
"learning_rate": 0.00010755102040816328,
"loss": 0.4087,
"step": 554
},
{
"epoch": 1.542043085476025,
"grad_norm": 0.5541340708732605,
"learning_rate": 0.00010734693877551021,
"loss": 0.4309,
"step": 555
},
{
"epoch": 1.5448227936066714,
"grad_norm": 0.2893712818622589,
"learning_rate": 0.00010714285714285715,
"loss": 0.3447,
"step": 556
},
{
"epoch": 1.5476025017373176,
"grad_norm": 0.26238903403282166,
"learning_rate": 0.00010693877551020408,
"loss": 0.3256,
"step": 557
},
{
"epoch": 1.5503822098679638,
"grad_norm": 0.29899922013282776,
"learning_rate": 0.00010673469387755102,
"loss": 0.3142,
"step": 558
},
{
"epoch": 1.5531619179986103,
"grad_norm": 0.28399357199668884,
"learning_rate": 0.00010653061224489795,
"loss": 0.3314,
"step": 559
},
{
"epoch": 1.5559416261292565,
"grad_norm": 0.3450354337692261,
"learning_rate": 0.00010632653061224491,
"loss": 0.3733,
"step": 560
},
{
"epoch": 1.5587213342599027,
"grad_norm": 0.283597856760025,
"learning_rate": 0.00010612244897959185,
"loss": 0.3828,
"step": 561
},
{
"epoch": 1.561501042390549,
"grad_norm": 0.28581732511520386,
"learning_rate": 0.00010591836734693878,
"loss": 0.3131,
"step": 562
},
{
"epoch": 1.5642807505211953,
"grad_norm": 0.2778254747390747,
"learning_rate": 0.00010571428571428572,
"loss": 0.4744,
"step": 563
},
{
"epoch": 1.5670604586518415,
"grad_norm": 0.2736669182777405,
"learning_rate": 0.00010551020408163265,
"loss": 0.4084,
"step": 564
},
{
"epoch": 1.569840166782488,
"grad_norm": 0.31702694296836853,
"learning_rate": 0.0001053061224489796,
"loss": 0.3877,
"step": 565
},
{
"epoch": 1.5726198749131342,
"grad_norm": 0.4662436246871948,
"learning_rate": 0.00010510204081632653,
"loss": 0.3619,
"step": 566
},
{
"epoch": 1.5753995830437804,
"grad_norm": 0.26047173142433167,
"learning_rate": 0.00010489795918367347,
"loss": 0.3031,
"step": 567
},
{
"epoch": 1.5781792911744268,
"grad_norm": 0.4014153778553009,
"learning_rate": 0.0001046938775510204,
"loss": 0.416,
"step": 568
},
{
"epoch": 1.580958999305073,
"grad_norm": 0.2595798969268799,
"learning_rate": 0.00010448979591836735,
"loss": 0.3635,
"step": 569
},
{
"epoch": 1.5837387074357192,
"grad_norm": 0.3327747881412506,
"learning_rate": 0.0001042857142857143,
"loss": 0.5032,
"step": 570
},
{
"epoch": 1.5865184155663656,
"grad_norm": 0.28941619396209717,
"learning_rate": 0.00010408163265306123,
"loss": 0.4001,
"step": 571
},
{
"epoch": 1.5892981236970118,
"grad_norm": 0.29737424850463867,
"learning_rate": 0.00010387755102040817,
"loss": 0.4263,
"step": 572
},
{
"epoch": 1.592077831827658,
"grad_norm": 0.3156750202178955,
"learning_rate": 0.00010367346938775511,
"loss": 0.326,
"step": 573
},
{
"epoch": 1.5948575399583045,
"grad_norm": 0.33826690912246704,
"learning_rate": 0.00010346938775510204,
"loss": 0.4291,
"step": 574
},
{
"epoch": 1.5976372480889507,
"grad_norm": 0.2737540900707245,
"learning_rate": 0.00010326530612244899,
"loss": 0.4464,
"step": 575
},
{
"epoch": 1.600416956219597,
"grad_norm": 0.3012180030345917,
"learning_rate": 0.00010306122448979591,
"loss": 0.4044,
"step": 576
},
{
"epoch": 1.6031966643502433,
"grad_norm": 0.3628576397895813,
"learning_rate": 0.00010285714285714286,
"loss": 0.4376,
"step": 577
},
{
"epoch": 1.6059763724808895,
"grad_norm": 0.3489641845226288,
"learning_rate": 0.00010265306122448981,
"loss": 0.3054,
"step": 578
},
{
"epoch": 1.6087560806115357,
"grad_norm": 0.28231537342071533,
"learning_rate": 0.00010244897959183674,
"loss": 0.2566,
"step": 579
},
{
"epoch": 1.6115357887421822,
"grad_norm": 0.2675357758998871,
"learning_rate": 0.00010224489795918369,
"loss": 0.3917,
"step": 580
},
{
"epoch": 1.6143154968728284,
"grad_norm": 0.25992777943611145,
"learning_rate": 0.00010204081632653062,
"loss": 0.3511,
"step": 581
},
{
"epoch": 1.6170952050034746,
"grad_norm": 0.2999131977558136,
"learning_rate": 0.00010183673469387756,
"loss": 0.3135,
"step": 582
},
{
"epoch": 1.619874913134121,
"grad_norm": 0.2928028404712677,
"learning_rate": 0.00010163265306122449,
"loss": 0.34,
"step": 583
},
{
"epoch": 1.6226546212647672,
"grad_norm": 0.24898597598075867,
"learning_rate": 0.00010142857142857143,
"loss": 0.3334,
"step": 584
},
{
"epoch": 1.6254343293954134,
"grad_norm": 0.309908002614975,
"learning_rate": 0.00010122448979591836,
"loss": 0.4202,
"step": 585
},
{
"epoch": 1.6282140375260599,
"grad_norm": 0.2774292230606079,
"learning_rate": 0.0001010204081632653,
"loss": 0.3986,
"step": 586
},
{
"epoch": 1.630993745656706,
"grad_norm": 0.4288952648639679,
"learning_rate": 0.00010081632653061226,
"loss": 0.4282,
"step": 587
},
{
"epoch": 1.6337734537873523,
"grad_norm": 0.2605331540107727,
"learning_rate": 0.00010061224489795919,
"loss": 0.356,
"step": 588
},
{
"epoch": 1.6365531619179987,
"grad_norm": 0.2896113395690918,
"learning_rate": 0.00010040816326530613,
"loss": 0.3454,
"step": 589
},
{
"epoch": 1.639332870048645,
"grad_norm": 0.26861974596977234,
"learning_rate": 0.00010020408163265306,
"loss": 0.4541,
"step": 590
},
{
"epoch": 1.6421125781792911,
"grad_norm": 0.3458561599254608,
"learning_rate": 0.0001,
"loss": 0.3547,
"step": 591
},
{
"epoch": 1.6448922863099376,
"grad_norm": 0.287142276763916,
"learning_rate": 9.979591836734695e-05,
"loss": 0.3585,
"step": 592
},
{
"epoch": 1.6476719944405838,
"grad_norm": 0.2605501711368561,
"learning_rate": 9.959183673469388e-05,
"loss": 0.345,
"step": 593
},
{
"epoch": 1.65045170257123,
"grad_norm": 0.2797659635543823,
"learning_rate": 9.938775510204082e-05,
"loss": 0.3176,
"step": 594
},
{
"epoch": 1.6532314107018764,
"grad_norm": 0.26743918657302856,
"learning_rate": 9.918367346938776e-05,
"loss": 0.2928,
"step": 595
},
{
"epoch": 1.6560111188325226,
"grad_norm": 0.2472202330827713,
"learning_rate": 9.897959183673469e-05,
"loss": 0.2373,
"step": 596
},
{
"epoch": 1.6587908269631688,
"grad_norm": 0.26467829942703247,
"learning_rate": 9.877551020408164e-05,
"loss": 0.3824,
"step": 597
},
{
"epoch": 1.6615705350938152,
"grad_norm": 0.272390753030777,
"learning_rate": 9.857142857142858e-05,
"loss": 0.322,
"step": 598
},
{
"epoch": 1.6643502432244615,
"grad_norm": 0.2785828709602356,
"learning_rate": 9.836734693877552e-05,
"loss": 0.3201,
"step": 599
},
{
"epoch": 1.6671299513551077,
"grad_norm": 0.28154081106185913,
"learning_rate": 9.816326530612245e-05,
"loss": 0.3767,
"step": 600
},
{
"epoch": 1.6671299513551077,
"eval_loss": 0.4453812837600708,
"eval_runtime": 212.689,
"eval_samples_per_second": 1.693,
"eval_steps_per_second": 1.693,
"step": 600
},
{
"epoch": 1.669909659485754,
"grad_norm": 0.26373493671417236,
"learning_rate": 9.79591836734694e-05,
"loss": 0.3756,
"step": 601
},
{
"epoch": 1.6726893676164003,
"grad_norm": 0.31205254793167114,
"learning_rate": 9.775510204081632e-05,
"loss": 0.2962,
"step": 602
},
{
"epoch": 1.6754690757470465,
"grad_norm": 0.2865038812160492,
"learning_rate": 9.755102040816328e-05,
"loss": 0.345,
"step": 603
},
{
"epoch": 1.678248783877693,
"grad_norm": 0.2950930893421173,
"learning_rate": 9.734693877551021e-05,
"loss": 0.3678,
"step": 604
},
{
"epoch": 1.6810284920083391,
"grad_norm": 0.35823482275009155,
"learning_rate": 9.714285714285715e-05,
"loss": 0.4096,
"step": 605
},
{
"epoch": 1.6838082001389854,
"grad_norm": 0.2647465169429779,
"learning_rate": 9.693877551020408e-05,
"loss": 0.4015,
"step": 606
},
{
"epoch": 1.6865879082696318,
"grad_norm": 0.30505606532096863,
"learning_rate": 9.673469387755102e-05,
"loss": 0.4116,
"step": 607
},
{
"epoch": 1.689367616400278,
"grad_norm": 0.3076132833957672,
"learning_rate": 9.653061224489797e-05,
"loss": 0.3867,
"step": 608
},
{
"epoch": 1.6921473245309242,
"grad_norm": 0.2777102589607239,
"learning_rate": 9.63265306122449e-05,
"loss": 0.3944,
"step": 609
},
{
"epoch": 1.6949270326615706,
"grad_norm": 0.3163359463214874,
"learning_rate": 9.612244897959184e-05,
"loss": 0.4266,
"step": 610
},
{
"epoch": 1.6977067407922168,
"grad_norm": 0.3227400779724121,
"learning_rate": 9.591836734693878e-05,
"loss": 0.397,
"step": 611
},
{
"epoch": 1.700486448922863,
"grad_norm": 0.27346518635749817,
"learning_rate": 9.571428571428573e-05,
"loss": 0.3463,
"step": 612
},
{
"epoch": 1.7032661570535095,
"grad_norm": 0.3154374957084656,
"learning_rate": 9.551020408163265e-05,
"loss": 0.3921,
"step": 613
},
{
"epoch": 1.7060458651841557,
"grad_norm": 0.2859618365764618,
"learning_rate": 9.53061224489796e-05,
"loss": 0.3736,
"step": 614
},
{
"epoch": 1.708825573314802,
"grad_norm": 0.28541162610054016,
"learning_rate": 9.510204081632653e-05,
"loss": 0.3689,
"step": 615
},
{
"epoch": 1.7116052814454483,
"grad_norm": 0.2870318293571472,
"learning_rate": 9.489795918367348e-05,
"loss": 0.2594,
"step": 616
},
{
"epoch": 1.7143849895760945,
"grad_norm": 0.2808590531349182,
"learning_rate": 9.469387755102041e-05,
"loss": 0.4471,
"step": 617
},
{
"epoch": 1.7171646977067407,
"grad_norm": 0.2777983248233795,
"learning_rate": 9.448979591836736e-05,
"loss": 0.304,
"step": 618
},
{
"epoch": 1.7199444058373872,
"grad_norm": 0.2791382372379303,
"learning_rate": 9.428571428571429e-05,
"loss": 0.3117,
"step": 619
},
{
"epoch": 1.7227241139680334,
"grad_norm": 0.3301670253276825,
"learning_rate": 9.408163265306123e-05,
"loss": 0.3445,
"step": 620
},
{
"epoch": 1.7255038220986796,
"grad_norm": 0.2752249240875244,
"learning_rate": 9.387755102040817e-05,
"loss": 0.3807,
"step": 621
},
{
"epoch": 1.728283530229326,
"grad_norm": 0.32973968982696533,
"learning_rate": 9.367346938775511e-05,
"loss": 0.4078,
"step": 622
},
{
"epoch": 1.7310632383599722,
"grad_norm": 0.30001533031463623,
"learning_rate": 9.346938775510204e-05,
"loss": 0.4355,
"step": 623
},
{
"epoch": 1.7338429464906184,
"grad_norm": 0.255593866109848,
"learning_rate": 9.326530612244899e-05,
"loss": 0.3447,
"step": 624
},
{
"epoch": 1.7366226546212649,
"grad_norm": 0.2974906861782074,
"learning_rate": 9.306122448979592e-05,
"loss": 0.4654,
"step": 625
},
{
"epoch": 1.739402362751911,
"grad_norm": 0.3247474133968353,
"learning_rate": 9.285714285714286e-05,
"loss": 0.3117,
"step": 626
},
{
"epoch": 1.7421820708825573,
"grad_norm": 0.27801772952079773,
"learning_rate": 9.26530612244898e-05,
"loss": 0.4573,
"step": 627
},
{
"epoch": 1.7449617790132037,
"grad_norm": 0.2884966731071472,
"learning_rate": 9.244897959183673e-05,
"loss": 0.3575,
"step": 628
},
{
"epoch": 1.74774148714385,
"grad_norm": 0.27776768803596497,
"learning_rate": 9.224489795918367e-05,
"loss": 0.4243,
"step": 629
},
{
"epoch": 1.7505211952744961,
"grad_norm": 0.2852678596973419,
"learning_rate": 9.204081632653062e-05,
"loss": 0.4214,
"step": 630
},
{
"epoch": 1.7533009034051426,
"grad_norm": 0.7854850888252258,
"learning_rate": 9.183673469387756e-05,
"loss": 0.4479,
"step": 631
},
{
"epoch": 1.7560806115357888,
"grad_norm": 0.3441762626171112,
"learning_rate": 9.163265306122449e-05,
"loss": 0.3926,
"step": 632
},
{
"epoch": 1.758860319666435,
"grad_norm": 0.27086740732192993,
"learning_rate": 9.142857142857143e-05,
"loss": 0.3389,
"step": 633
},
{
"epoch": 1.7616400277970814,
"grad_norm": 0.273234486579895,
"learning_rate": 9.122448979591836e-05,
"loss": 0.3376,
"step": 634
},
{
"epoch": 1.7644197359277276,
"grad_norm": 0.29030004143714905,
"learning_rate": 9.102040816326532e-05,
"loss": 0.3156,
"step": 635
},
{
"epoch": 1.7671994440583738,
"grad_norm": 0.2696784436702728,
"learning_rate": 9.081632653061225e-05,
"loss": 0.3006,
"step": 636
},
{
"epoch": 1.7699791521890202,
"grad_norm": 0.2931046485900879,
"learning_rate": 9.061224489795919e-05,
"loss": 0.3289,
"step": 637
},
{
"epoch": 1.7727588603196665,
"grad_norm": 0.2624610364437103,
"learning_rate": 9.040816326530612e-05,
"loss": 0.316,
"step": 638
},
{
"epoch": 1.7755385684503127,
"grad_norm": 0.2946118414402008,
"learning_rate": 9.020408163265308e-05,
"loss": 0.4373,
"step": 639
},
{
"epoch": 1.778318276580959,
"grad_norm": 0.32654690742492676,
"learning_rate": 9e-05,
"loss": 0.3699,
"step": 640
},
{
"epoch": 1.7810979847116053,
"grad_norm": 0.2824501693248749,
"learning_rate": 8.979591836734695e-05,
"loss": 0.3846,
"step": 641
},
{
"epoch": 1.7838776928422515,
"grad_norm": 0.3406214118003845,
"learning_rate": 8.959183673469388e-05,
"loss": 0.3932,
"step": 642
},
{
"epoch": 1.786657400972898,
"grad_norm": 0.26329147815704346,
"learning_rate": 8.938775510204082e-05,
"loss": 0.3152,
"step": 643
},
{
"epoch": 1.7894371091035441,
"grad_norm": 0.2712422311306,
"learning_rate": 8.918367346938776e-05,
"loss": 0.3595,
"step": 644
},
{
"epoch": 1.7922168172341904,
"grad_norm": 0.30618801712989807,
"learning_rate": 8.89795918367347e-05,
"loss": 0.3724,
"step": 645
},
{
"epoch": 1.7949965253648368,
"grad_norm": 0.2918386161327362,
"learning_rate": 8.877551020408164e-05,
"loss": 0.3537,
"step": 646
},
{
"epoch": 1.797776233495483,
"grad_norm": 0.27820420265197754,
"learning_rate": 8.857142857142857e-05,
"loss": 0.3404,
"step": 647
},
{
"epoch": 1.8005559416261292,
"grad_norm": 0.296987920999527,
"learning_rate": 8.836734693877552e-05,
"loss": 0.3644,
"step": 648
},
{
"epoch": 1.8033356497567756,
"grad_norm": 0.29092004895210266,
"learning_rate": 8.816326530612245e-05,
"loss": 0.428,
"step": 649
},
{
"epoch": 1.8061153578874218,
"grad_norm": 0.2843736410140991,
"learning_rate": 8.79591836734694e-05,
"loss": 0.3709,
"step": 650
},
{
"epoch": 1.808895066018068,
"grad_norm": 0.2677766978740692,
"learning_rate": 8.775510204081632e-05,
"loss": 0.2649,
"step": 651
},
{
"epoch": 1.8116747741487145,
"grad_norm": 0.2586863338947296,
"learning_rate": 8.755102040816327e-05,
"loss": 0.3241,
"step": 652
},
{
"epoch": 1.8144544822793607,
"grad_norm": 0.32978034019470215,
"learning_rate": 8.734693877551021e-05,
"loss": 0.3787,
"step": 653
},
{
"epoch": 1.8172341904100069,
"grad_norm": 0.27411404252052307,
"learning_rate": 8.714285714285715e-05,
"loss": 0.3447,
"step": 654
},
{
"epoch": 1.8200138985406533,
"grad_norm": 0.26756396889686584,
"learning_rate": 8.693877551020408e-05,
"loss": 0.3528,
"step": 655
},
{
"epoch": 1.8227936066712995,
"grad_norm": 0.30654609203338623,
"learning_rate": 8.673469387755102e-05,
"loss": 0.3805,
"step": 656
},
{
"epoch": 1.8255733148019457,
"grad_norm": 0.29328277707099915,
"learning_rate": 8.653061224489797e-05,
"loss": 0.3276,
"step": 657
},
{
"epoch": 1.8283530229325922,
"grad_norm": 0.2501872777938843,
"learning_rate": 8.632653061224491e-05,
"loss": 0.3368,
"step": 658
},
{
"epoch": 1.8311327310632384,
"grad_norm": 0.3712775707244873,
"learning_rate": 8.612244897959184e-05,
"loss": 0.3671,
"step": 659
},
{
"epoch": 1.8339124391938846,
"grad_norm": 0.24037517607212067,
"learning_rate": 8.591836734693878e-05,
"loss": 0.259,
"step": 660
},
{
"epoch": 1.836692147324531,
"grad_norm": 0.25154754519462585,
"learning_rate": 8.571428571428571e-05,
"loss": 0.2983,
"step": 661
},
{
"epoch": 1.8394718554551772,
"grad_norm": 0.2672985792160034,
"learning_rate": 8.551020408163266e-05,
"loss": 0.3657,
"step": 662
},
{
"epoch": 1.8422515635858234,
"grad_norm": 0.31283143162727356,
"learning_rate": 8.53061224489796e-05,
"loss": 0.3211,
"step": 663
},
{
"epoch": 1.8450312717164699,
"grad_norm": 0.26797670125961304,
"learning_rate": 8.510204081632653e-05,
"loss": 0.3665,
"step": 664
},
{
"epoch": 1.847810979847116,
"grad_norm": 0.2569994032382965,
"learning_rate": 8.489795918367347e-05,
"loss": 0.4167,
"step": 665
},
{
"epoch": 1.8505906879777623,
"grad_norm": 0.261764258146286,
"learning_rate": 8.469387755102041e-05,
"loss": 0.3976,
"step": 666
},
{
"epoch": 1.8533703961084087,
"grad_norm": 0.28137752413749695,
"learning_rate": 8.448979591836736e-05,
"loss": 0.2909,
"step": 667
},
{
"epoch": 1.856150104239055,
"grad_norm": 0.2962735891342163,
"learning_rate": 8.428571428571429e-05,
"loss": 0.3961,
"step": 668
},
{
"epoch": 1.8589298123697011,
"grad_norm": 0.2491084784269333,
"learning_rate": 8.408163265306123e-05,
"loss": 0.334,
"step": 669
},
{
"epoch": 1.8617095205003475,
"grad_norm": 0.30861160159111023,
"learning_rate": 8.387755102040816e-05,
"loss": 0.3896,
"step": 670
},
{
"epoch": 1.8644892286309938,
"grad_norm": 0.2767215371131897,
"learning_rate": 8.367346938775511e-05,
"loss": 0.457,
"step": 671
},
{
"epoch": 1.86726893676164,
"grad_norm": 0.2909168303012848,
"learning_rate": 8.346938775510204e-05,
"loss": 0.4469,
"step": 672
},
{
"epoch": 1.8700486448922864,
"grad_norm": 0.30907315015792847,
"learning_rate": 8.326530612244899e-05,
"loss": 0.4008,
"step": 673
},
{
"epoch": 1.8728283530229326,
"grad_norm": 0.26147618889808655,
"learning_rate": 8.306122448979592e-05,
"loss": 0.3456,
"step": 674
},
{
"epoch": 1.8756080611535788,
"grad_norm": 0.280230313539505,
"learning_rate": 8.285714285714287e-05,
"loss": 0.454,
"step": 675
},
{
"epoch": 1.8783877692842252,
"grad_norm": 0.2674858570098877,
"learning_rate": 8.26530612244898e-05,
"loss": 0.3247,
"step": 676
},
{
"epoch": 1.8811674774148714,
"grad_norm": 0.26258382201194763,
"learning_rate": 8.244897959183675e-05,
"loss": 0.3402,
"step": 677
},
{
"epoch": 1.8839471855455177,
"grad_norm": 0.2922073006629944,
"learning_rate": 8.224489795918367e-05,
"loss": 0.4958,
"step": 678
},
{
"epoch": 1.886726893676164,
"grad_norm": 0.2971295714378357,
"learning_rate": 8.204081632653062e-05,
"loss": 0.3341,
"step": 679
},
{
"epoch": 1.8895066018068103,
"grad_norm": 0.27569159865379333,
"learning_rate": 8.183673469387756e-05,
"loss": 0.3873,
"step": 680
},
{
"epoch": 1.8922863099374565,
"grad_norm": 0.30602556467056274,
"learning_rate": 8.163265306122449e-05,
"loss": 0.2462,
"step": 681
},
{
"epoch": 1.895066018068103,
"grad_norm": 0.298258900642395,
"learning_rate": 8.142857142857143e-05,
"loss": 0.3863,
"step": 682
},
{
"epoch": 1.8978457261987491,
"grad_norm": 0.2906138598918915,
"learning_rate": 8.122448979591836e-05,
"loss": 0.4014,
"step": 683
},
{
"epoch": 1.9006254343293953,
"grad_norm": 0.2641445994377136,
"learning_rate": 8.10204081632653e-05,
"loss": 0.367,
"step": 684
},
{
"epoch": 1.9034051424600418,
"grad_norm": 0.7982620596885681,
"learning_rate": 8.081632653061225e-05,
"loss": 0.6423,
"step": 685
},
{
"epoch": 1.906184850590688,
"grad_norm": 0.26612165570259094,
"learning_rate": 8.061224489795919e-05,
"loss": 0.3586,
"step": 686
},
{
"epoch": 1.9089645587213342,
"grad_norm": 0.27895480394363403,
"learning_rate": 8.040816326530612e-05,
"loss": 0.3828,
"step": 687
},
{
"epoch": 1.9117442668519806,
"grad_norm": 0.2650473713874817,
"learning_rate": 8.020408163265306e-05,
"loss": 0.403,
"step": 688
},
{
"epoch": 1.9145239749826268,
"grad_norm": 0.2670430541038513,
"learning_rate": 8e-05,
"loss": 0.2993,
"step": 689
},
{
"epoch": 1.917303683113273,
"grad_norm": 0.27033767104148865,
"learning_rate": 7.979591836734695e-05,
"loss": 0.3602,
"step": 690
},
{
"epoch": 1.9200833912439195,
"grad_norm": 0.279803603887558,
"learning_rate": 7.959183673469388e-05,
"loss": 0.4034,
"step": 691
},
{
"epoch": 1.9228630993745657,
"grad_norm": 0.2839685082435608,
"learning_rate": 7.938775510204082e-05,
"loss": 0.3458,
"step": 692
},
{
"epoch": 1.9256428075052119,
"grad_norm": 0.2718431353569031,
"learning_rate": 7.918367346938775e-05,
"loss": 0.3323,
"step": 693
},
{
"epoch": 1.9284225156358583,
"grad_norm": 0.2734295725822449,
"learning_rate": 7.897959183673471e-05,
"loss": 0.2751,
"step": 694
},
{
"epoch": 1.9312022237665045,
"grad_norm": 0.3010067641735077,
"learning_rate": 7.877551020408164e-05,
"loss": 0.3508,
"step": 695
},
{
"epoch": 1.9339819318971507,
"grad_norm": 0.3028511703014374,
"learning_rate": 7.857142857142858e-05,
"loss": 0.3794,
"step": 696
},
{
"epoch": 1.9367616400277972,
"grad_norm": 0.26215803623199463,
"learning_rate": 7.836734693877551e-05,
"loss": 0.3737,
"step": 697
},
{
"epoch": 1.9395413481584434,
"grad_norm": 0.3025459051132202,
"learning_rate": 7.816326530612245e-05,
"loss": 0.44,
"step": 698
},
{
"epoch": 1.9423210562890896,
"grad_norm": 0.29042086005210876,
"learning_rate": 7.79591836734694e-05,
"loss": 0.3401,
"step": 699
},
{
"epoch": 1.945100764419736,
"grad_norm": 0.31652623414993286,
"learning_rate": 7.775510204081632e-05,
"loss": 0.4041,
"step": 700
},
{
"epoch": 1.9478804725503822,
"grad_norm": 0.2801991403102875,
"learning_rate": 7.755102040816327e-05,
"loss": 0.3173,
"step": 701
},
{
"epoch": 1.9506601806810284,
"grad_norm": 0.30574724078178406,
"learning_rate": 7.73469387755102e-05,
"loss": 0.3477,
"step": 702
},
{
"epoch": 1.9534398888116749,
"grad_norm": 0.3101007640361786,
"learning_rate": 7.714285714285715e-05,
"loss": 0.3974,
"step": 703
},
{
"epoch": 1.956219596942321,
"grad_norm": 0.27257412672042847,
"learning_rate": 7.693877551020408e-05,
"loss": 0.3449,
"step": 704
},
{
"epoch": 1.9589993050729673,
"grad_norm": 0.2764023542404175,
"learning_rate": 7.673469387755103e-05,
"loss": 0.3517,
"step": 705
},
{
"epoch": 1.9617790132036137,
"grad_norm": 0.29003384709358215,
"learning_rate": 7.653061224489796e-05,
"loss": 0.3886,
"step": 706
},
{
"epoch": 1.96455872133426,
"grad_norm": 0.4509872794151306,
"learning_rate": 7.632653061224491e-05,
"loss": 0.4117,
"step": 707
},
{
"epoch": 1.9673384294649061,
"grad_norm": 0.28539761900901794,
"learning_rate": 7.612244897959184e-05,
"loss": 0.3671,
"step": 708
},
{
"epoch": 1.9701181375955525,
"grad_norm": 0.27598991990089417,
"learning_rate": 7.591836734693878e-05,
"loss": 0.3327,
"step": 709
},
{
"epoch": 1.9728978457261988,
"grad_norm": 0.36246633529663086,
"learning_rate": 7.571428571428571e-05,
"loss": 0.4338,
"step": 710
},
{
"epoch": 1.975677553856845,
"grad_norm": 0.29050305485725403,
"learning_rate": 7.551020408163266e-05,
"loss": 0.5175,
"step": 711
},
{
"epoch": 1.9784572619874914,
"grad_norm": 0.4746512174606323,
"learning_rate": 7.53061224489796e-05,
"loss": 0.4983,
"step": 712
},
{
"epoch": 1.9812369701181376,
"grad_norm": 0.27185899019241333,
"learning_rate": 7.510204081632653e-05,
"loss": 0.3781,
"step": 713
},
{
"epoch": 1.9840166782487838,
"grad_norm": 0.28276991844177246,
"learning_rate": 7.489795918367347e-05,
"loss": 0.3137,
"step": 714
},
{
"epoch": 1.9867963863794302,
"grad_norm": 0.27965956926345825,
"learning_rate": 7.469387755102041e-05,
"loss": 0.3681,
"step": 715
},
{
"epoch": 1.9895760945100764,
"grad_norm": 0.245852991938591,
"learning_rate": 7.448979591836736e-05,
"loss": 0.3879,
"step": 716
},
{
"epoch": 1.9923558026407227,
"grad_norm": 0.28121981024742126,
"learning_rate": 7.428571428571429e-05,
"loss": 0.4085,
"step": 717
},
{
"epoch": 1.995135510771369,
"grad_norm": 0.2986759543418884,
"learning_rate": 7.408163265306123e-05,
"loss": 0.2768,
"step": 718
},
{
"epoch": 1.9979152189020153,
"grad_norm": 0.26322054862976074,
"learning_rate": 7.387755102040816e-05,
"loss": 0.3901,
"step": 719
},
{
"epoch": 2.0,
"grad_norm": 0.3256114721298218,
"learning_rate": 7.36734693877551e-05,
"loss": 0.3634,
"step": 720
},
{
"epoch": 2.0027797081306464,
"grad_norm": 0.27563294768333435,
"learning_rate": 7.346938775510205e-05,
"loss": 0.3962,
"step": 721
},
{
"epoch": 2.0055594162612924,
"grad_norm": 0.36298710107803345,
"learning_rate": 7.326530612244899e-05,
"loss": 0.3589,
"step": 722
},
{
"epoch": 2.008339124391939,
"grad_norm": 0.28008803725242615,
"learning_rate": 7.306122448979592e-05,
"loss": 0.3883,
"step": 723
},
{
"epoch": 2.0111188325225853,
"grad_norm": 0.34973540902137756,
"learning_rate": 7.285714285714286e-05,
"loss": 0.4629,
"step": 724
},
{
"epoch": 2.0138985406532313,
"grad_norm": 0.38526445627212524,
"learning_rate": 7.26530612244898e-05,
"loss": 0.4442,
"step": 725
},
{
"epoch": 2.0166782487838777,
"grad_norm": 0.2718683183193207,
"learning_rate": 7.244897959183675e-05,
"loss": 0.3513,
"step": 726
},
{
"epoch": 2.019457956914524,
"grad_norm": 0.26923590898513794,
"learning_rate": 7.224489795918368e-05,
"loss": 0.326,
"step": 727
},
{
"epoch": 2.02223766504517,
"grad_norm": 0.2851525545120239,
"learning_rate": 7.204081632653062e-05,
"loss": 0.2804,
"step": 728
},
{
"epoch": 2.0250173731758165,
"grad_norm": 0.3298304080963135,
"learning_rate": 7.183673469387755e-05,
"loss": 0.4877,
"step": 729
},
{
"epoch": 2.027797081306463,
"grad_norm": 0.304770290851593,
"learning_rate": 7.163265306122449e-05,
"loss": 0.3777,
"step": 730
},
{
"epoch": 2.030576789437109,
"grad_norm": 0.2693670094013214,
"learning_rate": 7.142857142857143e-05,
"loss": 0.3315,
"step": 731
},
{
"epoch": 2.0333564975677554,
"grad_norm": 0.2542417049407959,
"learning_rate": 7.122448979591836e-05,
"loss": 0.3493,
"step": 732
},
{
"epoch": 2.036136205698402,
"grad_norm": 0.2779878079891205,
"learning_rate": 7.10204081632653e-05,
"loss": 0.3686,
"step": 733
},
{
"epoch": 2.038915913829048,
"grad_norm": 0.3229525685310364,
"learning_rate": 7.081632653061225e-05,
"loss": 0.4557,
"step": 734
},
{
"epoch": 2.0416956219596942,
"grad_norm": 0.30455026030540466,
"learning_rate": 7.061224489795919e-05,
"loss": 0.3291,
"step": 735
},
{
"epoch": 2.0444753300903407,
"grad_norm": 0.26810574531555176,
"learning_rate": 7.040816326530612e-05,
"loss": 0.3269,
"step": 736
},
{
"epoch": 2.0472550382209866,
"grad_norm": 0.26769372820854187,
"learning_rate": 7.020408163265306e-05,
"loss": 0.3385,
"step": 737
},
{
"epoch": 2.050034746351633,
"grad_norm": 0.3141888380050659,
"learning_rate": 7e-05,
"loss": 0.3584,
"step": 738
},
{
"epoch": 2.0528144544822795,
"grad_norm": 0.31162917613983154,
"learning_rate": 6.979591836734695e-05,
"loss": 0.3418,
"step": 739
},
{
"epoch": 2.0555941626129255,
"grad_norm": 0.28655368089675903,
"learning_rate": 6.959183673469388e-05,
"loss": 0.3356,
"step": 740
},
{
"epoch": 2.058373870743572,
"grad_norm": 0.3098087012767792,
"learning_rate": 6.938775510204082e-05,
"loss": 0.4558,
"step": 741
},
{
"epoch": 2.0611535788742184,
"grad_norm": 0.28569385409355164,
"learning_rate": 6.918367346938775e-05,
"loss": 0.2827,
"step": 742
},
{
"epoch": 2.0639332870048643,
"grad_norm": 0.2704540491104126,
"learning_rate": 6.897959183673471e-05,
"loss": 0.3118,
"step": 743
},
{
"epoch": 2.0667129951355108,
"grad_norm": 0.2655661702156067,
"learning_rate": 6.877551020408164e-05,
"loss": 0.3569,
"step": 744
},
{
"epoch": 2.069492703266157,
"grad_norm": 0.292889267206192,
"learning_rate": 6.857142857142858e-05,
"loss": 0.4,
"step": 745
},
{
"epoch": 2.072272411396803,
"grad_norm": 0.30672675371170044,
"learning_rate": 6.836734693877551e-05,
"loss": 0.4197,
"step": 746
},
{
"epoch": 2.0750521195274496,
"grad_norm": 0.2781777083873749,
"learning_rate": 6.816326530612245e-05,
"loss": 0.3949,
"step": 747
},
{
"epoch": 2.077831827658096,
"grad_norm": 0.2681523561477661,
"learning_rate": 6.79591836734694e-05,
"loss": 0.3073,
"step": 748
},
{
"epoch": 2.080611535788742,
"grad_norm": 0.2789252996444702,
"learning_rate": 6.775510204081633e-05,
"loss": 0.308,
"step": 749
},
{
"epoch": 2.0833912439193885,
"grad_norm": 0.3158692717552185,
"learning_rate": 6.755102040816327e-05,
"loss": 0.4182,
"step": 750
},
{
"epoch": 2.086170952050035,
"grad_norm": 0.2763406038284302,
"learning_rate": 6.73469387755102e-05,
"loss": 0.3707,
"step": 751
},
{
"epoch": 2.088950660180681,
"grad_norm": 0.2762833833694458,
"learning_rate": 6.714285714285714e-05,
"loss": 0.3287,
"step": 752
},
{
"epoch": 2.0917303683113273,
"grad_norm": 0.26660582423210144,
"learning_rate": 6.693877551020408e-05,
"loss": 0.2388,
"step": 753
},
{
"epoch": 2.0945100764419737,
"grad_norm": 0.28191235661506653,
"learning_rate": 6.673469387755103e-05,
"loss": 0.2989,
"step": 754
},
{
"epoch": 2.0972897845726197,
"grad_norm": 0.29017800092697144,
"learning_rate": 6.653061224489796e-05,
"loss": 0.3203,
"step": 755
},
{
"epoch": 2.100069492703266,
"grad_norm": 0.2984425127506256,
"learning_rate": 6.63265306122449e-05,
"loss": 0.3803,
"step": 756
},
{
"epoch": 2.1028492008339126,
"grad_norm": 0.30566540360450745,
"learning_rate": 6.612244897959184e-05,
"loss": 0.3248,
"step": 757
},
{
"epoch": 2.1056289089645586,
"grad_norm": 0.2737206220626831,
"learning_rate": 6.591836734693878e-05,
"loss": 0.3448,
"step": 758
},
{
"epoch": 2.108408617095205,
"grad_norm": 0.3234175145626068,
"learning_rate": 6.571428571428571e-05,
"loss": 0.3773,
"step": 759
},
{
"epoch": 2.1111883252258514,
"grad_norm": 0.3321143090724945,
"learning_rate": 6.551020408163266e-05,
"loss": 0.4053,
"step": 760
},
{
"epoch": 2.1139680333564974,
"grad_norm": 0.3120156228542328,
"learning_rate": 6.530612244897959e-05,
"loss": 0.3614,
"step": 761
},
{
"epoch": 2.116747741487144,
"grad_norm": 0.3069126307964325,
"learning_rate": 6.510204081632654e-05,
"loss": 0.3392,
"step": 762
},
{
"epoch": 2.1195274496177903,
"grad_norm": 0.2996152937412262,
"learning_rate": 6.489795918367347e-05,
"loss": 0.4007,
"step": 763
},
{
"epoch": 2.1223071577484363,
"grad_norm": 0.28609102964401245,
"learning_rate": 6.469387755102042e-05,
"loss": 0.3619,
"step": 764
},
{
"epoch": 2.1250868658790827,
"grad_norm": 0.28463420271873474,
"learning_rate": 6.448979591836734e-05,
"loss": 0.3684,
"step": 765
},
{
"epoch": 2.127866574009729,
"grad_norm": 0.2721307873725891,
"learning_rate": 6.428571428571429e-05,
"loss": 0.3153,
"step": 766
},
{
"epoch": 2.130646282140375,
"grad_norm": 0.29114431142807007,
"learning_rate": 6.408163265306123e-05,
"loss": 0.3558,
"step": 767
},
{
"epoch": 2.1334259902710215,
"grad_norm": 0.2995449900627136,
"learning_rate": 6.387755102040816e-05,
"loss": 0.3632,
"step": 768
},
{
"epoch": 2.136205698401668,
"grad_norm": 0.28987494111061096,
"learning_rate": 6.36734693877551e-05,
"loss": 0.3928,
"step": 769
},
{
"epoch": 2.138985406532314,
"grad_norm": 0.2857901155948639,
"learning_rate": 6.346938775510203e-05,
"loss": 0.4101,
"step": 770
},
{
"epoch": 2.1417651146629604,
"grad_norm": 0.2712436020374298,
"learning_rate": 6.326530612244899e-05,
"loss": 0.379,
"step": 771
},
{
"epoch": 2.144544822793607,
"grad_norm": 0.2805217504501343,
"learning_rate": 6.306122448979592e-05,
"loss": 0.3018,
"step": 772
},
{
"epoch": 2.147324530924253,
"grad_norm": 0.29896092414855957,
"learning_rate": 6.285714285714286e-05,
"loss": 0.3065,
"step": 773
},
{
"epoch": 2.1501042390548992,
"grad_norm": 0.27301499247550964,
"learning_rate": 6.265306122448979e-05,
"loss": 0.3213,
"step": 774
},
{
"epoch": 2.1528839471855457,
"grad_norm": 0.2966340482234955,
"learning_rate": 6.244897959183675e-05,
"loss": 0.5333,
"step": 775
},
{
"epoch": 2.1556636553161916,
"grad_norm": 0.31995972990989685,
"learning_rate": 6.224489795918368e-05,
"loss": 0.3398,
"step": 776
},
{
"epoch": 2.158443363446838,
"grad_norm": 0.2765233516693115,
"learning_rate": 6.204081632653062e-05,
"loss": 0.3369,
"step": 777
},
{
"epoch": 2.1612230715774845,
"grad_norm": 0.2692531645298004,
"learning_rate": 6.183673469387755e-05,
"loss": 0.3347,
"step": 778
},
{
"epoch": 2.1640027797081305,
"grad_norm": 0.2936429977416992,
"learning_rate": 6.163265306122449e-05,
"loss": 0.3794,
"step": 779
},
{
"epoch": 2.166782487838777,
"grad_norm": 0.28063851594924927,
"learning_rate": 6.142857142857143e-05,
"loss": 0.295,
"step": 780
},
{
"epoch": 2.1695621959694233,
"grad_norm": 0.29556742310523987,
"learning_rate": 6.122448979591838e-05,
"loss": 0.3784,
"step": 781
},
{
"epoch": 2.1723419041000693,
"grad_norm": 0.2919243276119232,
"learning_rate": 6.102040816326531e-05,
"loss": 0.2643,
"step": 782
},
{
"epoch": 2.1751216122307158,
"grad_norm": 0.28946730494499207,
"learning_rate": 6.081632653061224e-05,
"loss": 0.3054,
"step": 783
},
{
"epoch": 2.177901320361362,
"grad_norm": 0.3075507581233978,
"learning_rate": 6.061224489795919e-05,
"loss": 0.3919,
"step": 784
},
{
"epoch": 2.180681028492008,
"grad_norm": 0.27501824498176575,
"learning_rate": 6.040816326530613e-05,
"loss": 0.3273,
"step": 785
},
{
"epoch": 2.1834607366226546,
"grad_norm": 0.30170246958732605,
"learning_rate": 6.0204081632653065e-05,
"loss": 0.3575,
"step": 786
},
{
"epoch": 2.186240444753301,
"grad_norm": 0.3145295977592468,
"learning_rate": 6e-05,
"loss": 0.3849,
"step": 787
},
{
"epoch": 2.189020152883947,
"grad_norm": 0.2984488308429718,
"learning_rate": 5.979591836734694e-05,
"loss": 0.3593,
"step": 788
},
{
"epoch": 2.1917998610145935,
"grad_norm": 0.27264416217803955,
"learning_rate": 5.959183673469389e-05,
"loss": 0.2832,
"step": 789
},
{
"epoch": 2.19457956914524,
"grad_norm": 0.27420729398727417,
"learning_rate": 5.9387755102040824e-05,
"loss": 0.362,
"step": 790
},
{
"epoch": 2.197359277275886,
"grad_norm": 0.31026434898376465,
"learning_rate": 5.918367346938776e-05,
"loss": 0.3357,
"step": 791
},
{
"epoch": 2.2001389854065323,
"grad_norm": 0.31472471356391907,
"learning_rate": 5.8979591836734696e-05,
"loss": 0.3327,
"step": 792
},
{
"epoch": 2.2029186935371787,
"grad_norm": 0.2863774597644806,
"learning_rate": 5.877551020408164e-05,
"loss": 0.3819,
"step": 793
},
{
"epoch": 2.2056984016678247,
"grad_norm": 0.35021165013313293,
"learning_rate": 5.8571428571428575e-05,
"loss": 0.3515,
"step": 794
},
{
"epoch": 2.208478109798471,
"grad_norm": 0.2918795347213745,
"learning_rate": 5.836734693877551e-05,
"loss": 0.4171,
"step": 795
},
{
"epoch": 2.2112578179291176,
"grad_norm": 0.2973978817462921,
"learning_rate": 5.816326530612245e-05,
"loss": 0.3353,
"step": 796
},
{
"epoch": 2.2140375260597636,
"grad_norm": 0.3020112216472626,
"learning_rate": 5.7959183673469384e-05,
"loss": 0.3738,
"step": 797
},
{
"epoch": 2.21681723419041,
"grad_norm": 0.2840433716773987,
"learning_rate": 5.775510204081633e-05,
"loss": 0.3097,
"step": 798
},
{
"epoch": 2.2195969423210564,
"grad_norm": 0.29329997301101685,
"learning_rate": 5.755102040816327e-05,
"loss": 0.3839,
"step": 799
},
{
"epoch": 2.2223766504517024,
"grad_norm": 0.29126960039138794,
"learning_rate": 5.7346938775510206e-05,
"loss": 0.3542,
"step": 800
},
{
"epoch": 2.225156358582349,
"grad_norm": 0.30318763852119446,
"learning_rate": 5.714285714285714e-05,
"loss": 0.3761,
"step": 801
},
{
"epoch": 2.2279360667129953,
"grad_norm": 0.4189499616622925,
"learning_rate": 5.693877551020409e-05,
"loss": 0.405,
"step": 802
},
{
"epoch": 2.2307157748436413,
"grad_norm": 0.2965310513973236,
"learning_rate": 5.673469387755103e-05,
"loss": 0.3449,
"step": 803
},
{
"epoch": 2.2334954829742877,
"grad_norm": 0.2907022535800934,
"learning_rate": 5.6530612244897964e-05,
"loss": 0.3174,
"step": 804
},
{
"epoch": 2.236275191104934,
"grad_norm": 0.32027149200439453,
"learning_rate": 5.63265306122449e-05,
"loss": 0.3595,
"step": 805
},
{
"epoch": 2.23905489923558,
"grad_norm": 0.33651795983314514,
"learning_rate": 5.6122448979591836e-05,
"loss": 0.4608,
"step": 806
},
{
"epoch": 2.2418346073662265,
"grad_norm": 0.3164324462413788,
"learning_rate": 5.5918367346938786e-05,
"loss": 0.3974,
"step": 807
},
{
"epoch": 2.244614315496873,
"grad_norm": 0.28678157925605774,
"learning_rate": 5.571428571428572e-05,
"loss": 0.3312,
"step": 808
},
{
"epoch": 2.247394023627519,
"grad_norm": 0.27763816714286804,
"learning_rate": 5.551020408163266e-05,
"loss": 0.2562,
"step": 809
},
{
"epoch": 2.2501737317581654,
"grad_norm": 0.28213679790496826,
"learning_rate": 5.5306122448979594e-05,
"loss": 0.3686,
"step": 810
},
{
"epoch": 2.252953439888812,
"grad_norm": 0.28878074884414673,
"learning_rate": 5.510204081632653e-05,
"loss": 0.3424,
"step": 811
},
{
"epoch": 2.255733148019458,
"grad_norm": 0.3575231432914734,
"learning_rate": 5.4897959183673473e-05,
"loss": 0.3903,
"step": 812
},
{
"epoch": 2.258512856150104,
"grad_norm": 0.3188578486442566,
"learning_rate": 5.469387755102041e-05,
"loss": 0.2794,
"step": 813
},
{
"epoch": 2.2612925642807506,
"grad_norm": 0.2995283603668213,
"learning_rate": 5.4489795918367346e-05,
"loss": 0.3653,
"step": 814
},
{
"epoch": 2.2640722724113966,
"grad_norm": 0.288529634475708,
"learning_rate": 5.428571428571428e-05,
"loss": 0.2245,
"step": 815
},
{
"epoch": 2.266851980542043,
"grad_norm": 0.29823359847068787,
"learning_rate": 5.408163265306123e-05,
"loss": 0.4187,
"step": 816
},
{
"epoch": 2.2696316886726895,
"grad_norm": 0.2879655063152313,
"learning_rate": 5.387755102040817e-05,
"loss": 0.2789,
"step": 817
},
{
"epoch": 2.2724113968033355,
"grad_norm": 0.3046426773071289,
"learning_rate": 5.3673469387755104e-05,
"loss": 0.3757,
"step": 818
},
{
"epoch": 2.275191104933982,
"grad_norm": 0.28883394598960876,
"learning_rate": 5.346938775510204e-05,
"loss": 0.3508,
"step": 819
},
{
"epoch": 2.2779708130646283,
"grad_norm": 0.2815608084201813,
"learning_rate": 5.3265306122448976e-05,
"loss": 0.2869,
"step": 820
},
{
"epoch": 2.2807505211952743,
"grad_norm": 0.27973008155822754,
"learning_rate": 5.3061224489795926e-05,
"loss": 0.3415,
"step": 821
},
{
"epoch": 2.2835302293259208,
"grad_norm": 0.3324487805366516,
"learning_rate": 5.285714285714286e-05,
"loss": 0.3302,
"step": 822
},
{
"epoch": 2.286309937456567,
"grad_norm": 0.30007997155189514,
"learning_rate": 5.26530612244898e-05,
"loss": 0.3831,
"step": 823
},
{
"epoch": 2.289089645587213,
"grad_norm": 0.3399452269077301,
"learning_rate": 5.2448979591836735e-05,
"loss": 0.3788,
"step": 824
},
{
"epoch": 2.2918693537178596,
"grad_norm": 0.28804537653923035,
"learning_rate": 5.224489795918368e-05,
"loss": 0.2934,
"step": 825
},
{
"epoch": 2.294649061848506,
"grad_norm": 0.34307217597961426,
"learning_rate": 5.2040816326530614e-05,
"loss": 0.3915,
"step": 826
},
{
"epoch": 2.297428769979152,
"grad_norm": 0.26888802647590637,
"learning_rate": 5.1836734693877557e-05,
"loss": 0.2455,
"step": 827
},
{
"epoch": 2.3002084781097984,
"grad_norm": 0.2897135615348816,
"learning_rate": 5.163265306122449e-05,
"loss": 0.2991,
"step": 828
},
{
"epoch": 2.302988186240445,
"grad_norm": 0.2886922359466553,
"learning_rate": 5.142857142857143e-05,
"loss": 0.3311,
"step": 829
},
{
"epoch": 2.305767894371091,
"grad_norm": 0.28240638971328735,
"learning_rate": 5.122448979591837e-05,
"loss": 0.2945,
"step": 830
},
{
"epoch": 2.3085476025017373,
"grad_norm": 0.29429686069488525,
"learning_rate": 5.102040816326531e-05,
"loss": 0.3996,
"step": 831
},
{
"epoch": 2.3113273106323837,
"grad_norm": 0.30782943964004517,
"learning_rate": 5.0816326530612244e-05,
"loss": 0.3983,
"step": 832
},
{
"epoch": 2.3141070187630297,
"grad_norm": 0.2880532741546631,
"learning_rate": 5.061224489795918e-05,
"loss": 0.4122,
"step": 833
},
{
"epoch": 2.316886726893676,
"grad_norm": 0.29007789492607117,
"learning_rate": 5.040816326530613e-05,
"loss": 0.2972,
"step": 834
},
{
"epoch": 2.3196664350243226,
"grad_norm": 0.31411439180374146,
"learning_rate": 5.0204081632653066e-05,
"loss": 0.3198,
"step": 835
},
{
"epoch": 2.3224461431549686,
"grad_norm": 0.2791215479373932,
"learning_rate": 5e-05,
"loss": 0.2701,
"step": 836
},
{
"epoch": 2.325225851285615,
"grad_norm": 0.3492055833339691,
"learning_rate": 4.979591836734694e-05,
"loss": 0.2776,
"step": 837
},
{
"epoch": 2.3280055594162614,
"grad_norm": 0.2989236116409302,
"learning_rate": 4.959183673469388e-05,
"loss": 0.3567,
"step": 838
},
{
"epoch": 2.3307852675469074,
"grad_norm": 0.2757430970668793,
"learning_rate": 4.938775510204082e-05,
"loss": 0.311,
"step": 839
},
{
"epoch": 2.333564975677554,
"grad_norm": 0.27763327956199646,
"learning_rate": 4.918367346938776e-05,
"loss": 0.3006,
"step": 840
},
{
"epoch": 2.3363446838082003,
"grad_norm": 0.2845268249511719,
"learning_rate": 4.89795918367347e-05,
"loss": 0.3055,
"step": 841
},
{
"epoch": 2.3391243919388462,
"grad_norm": 0.29126036167144775,
"learning_rate": 4.877551020408164e-05,
"loss": 0.3033,
"step": 842
},
{
"epoch": 2.3419041000694927,
"grad_norm": 0.29777050018310547,
"learning_rate": 4.8571428571428576e-05,
"loss": 0.2959,
"step": 843
},
{
"epoch": 2.344683808200139,
"grad_norm": 0.27072009444236755,
"learning_rate": 4.836734693877551e-05,
"loss": 0.3122,
"step": 844
},
{
"epoch": 2.347463516330785,
"grad_norm": 0.31056344509124756,
"learning_rate": 4.816326530612245e-05,
"loss": 0.3475,
"step": 845
},
{
"epoch": 2.3502432244614315,
"grad_norm": 0.2831839323043823,
"learning_rate": 4.795918367346939e-05,
"loss": 0.3048,
"step": 846
},
{
"epoch": 2.353022932592078,
"grad_norm": 0.2710871398448944,
"learning_rate": 4.775510204081633e-05,
"loss": 0.3236,
"step": 847
},
{
"epoch": 2.355802640722724,
"grad_norm": 0.2876143753528595,
"learning_rate": 4.7551020408163263e-05,
"loss": 0.3534,
"step": 848
},
{
"epoch": 2.3585823488533704,
"grad_norm": 0.2987620532512665,
"learning_rate": 4.7346938775510206e-05,
"loss": 0.3324,
"step": 849
},
{
"epoch": 2.361362056984017,
"grad_norm": 0.2859969735145569,
"learning_rate": 4.714285714285714e-05,
"loss": 0.3564,
"step": 850
},
{
"epoch": 2.364141765114663,
"grad_norm": 0.3041338324546814,
"learning_rate": 4.6938775510204086e-05,
"loss": 0.4652,
"step": 851
},
{
"epoch": 2.366921473245309,
"grad_norm": 0.28285476565361023,
"learning_rate": 4.673469387755102e-05,
"loss": 0.3897,
"step": 852
},
{
"epoch": 2.3697011813759556,
"grad_norm": 0.307158499956131,
"learning_rate": 4.653061224489796e-05,
"loss": 0.3194,
"step": 853
},
{
"epoch": 2.3724808895066016,
"grad_norm": 0.31216737627983093,
"learning_rate": 4.63265306122449e-05,
"loss": 0.3554,
"step": 854
},
{
"epoch": 2.375260597637248,
"grad_norm": 0.3206304907798767,
"learning_rate": 4.612244897959184e-05,
"loss": 0.4214,
"step": 855
},
{
"epoch": 2.3780403057678945,
"grad_norm": 0.29540154337882996,
"learning_rate": 4.591836734693878e-05,
"loss": 0.3425,
"step": 856
},
{
"epoch": 2.3808200138985405,
"grad_norm": 0.2654211223125458,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.2876,
"step": 857
},
{
"epoch": 2.383599722029187,
"grad_norm": 0.2998722195625305,
"learning_rate": 4.551020408163266e-05,
"loss": 0.3234,
"step": 858
},
{
"epoch": 2.3863794301598333,
"grad_norm": 0.305711030960083,
"learning_rate": 4.5306122448979595e-05,
"loss": 0.3229,
"step": 859
},
{
"epoch": 2.3891591382904793,
"grad_norm": 0.29556146264076233,
"learning_rate": 4.510204081632654e-05,
"loss": 0.3013,
"step": 860
},
{
"epoch": 2.3919388464211258,
"grad_norm": 0.2814411222934723,
"learning_rate": 4.4897959183673474e-05,
"loss": 0.2751,
"step": 861
},
{
"epoch": 2.394718554551772,
"grad_norm": 0.29568272829055786,
"learning_rate": 4.469387755102041e-05,
"loss": 0.3107,
"step": 862
},
{
"epoch": 2.397498262682418,
"grad_norm": 0.28841209411621094,
"learning_rate": 4.448979591836735e-05,
"loss": 0.3088,
"step": 863
},
{
"epoch": 2.4002779708130646,
"grad_norm": 0.3105250298976898,
"learning_rate": 4.428571428571428e-05,
"loss": 0.3483,
"step": 864
},
{
"epoch": 2.403057678943711,
"grad_norm": 0.287725567817688,
"learning_rate": 4.4081632653061226e-05,
"loss": 0.3556,
"step": 865
},
{
"epoch": 2.405837387074357,
"grad_norm": 0.33084672689437866,
"learning_rate": 4.387755102040816e-05,
"loss": 0.4709,
"step": 866
},
{
"epoch": 2.4086170952050034,
"grad_norm": 0.3187430799007416,
"learning_rate": 4.3673469387755105e-05,
"loss": 0.3438,
"step": 867
},
{
"epoch": 2.41139680333565,
"grad_norm": 0.29251527786254883,
"learning_rate": 4.346938775510204e-05,
"loss": 0.4519,
"step": 868
},
{
"epoch": 2.414176511466296,
"grad_norm": 0.27169179916381836,
"learning_rate": 4.3265306122448984e-05,
"loss": 0.3039,
"step": 869
},
{
"epoch": 2.4169562195969423,
"grad_norm": 0.2867499589920044,
"learning_rate": 4.306122448979592e-05,
"loss": 0.3444,
"step": 870
},
{
"epoch": 2.4197359277275887,
"grad_norm": 0.33954930305480957,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.4085,
"step": 871
},
{
"epoch": 2.4225156358582347,
"grad_norm": 0.31500715017318726,
"learning_rate": 4.26530612244898e-05,
"loss": 0.3574,
"step": 872
},
{
"epoch": 2.425295343988881,
"grad_norm": 0.31862321496009827,
"learning_rate": 4.2448979591836735e-05,
"loss": 0.361,
"step": 873
},
{
"epoch": 2.4280750521195276,
"grad_norm": 0.28617167472839355,
"learning_rate": 4.224489795918368e-05,
"loss": 0.362,
"step": 874
},
{
"epoch": 2.4308547602501736,
"grad_norm": 0.3150579333305359,
"learning_rate": 4.2040816326530615e-05,
"loss": 0.281,
"step": 875
},
{
"epoch": 2.43363446838082,
"grad_norm": 0.28917694091796875,
"learning_rate": 4.183673469387756e-05,
"loss": 0.3215,
"step": 876
},
{
"epoch": 2.4364141765114664,
"grad_norm": 0.29830271005630493,
"learning_rate": 4.1632653061224494e-05,
"loss": 0.2891,
"step": 877
},
{
"epoch": 2.4391938846421124,
"grad_norm": 0.30335041880607605,
"learning_rate": 4.1428571428571437e-05,
"loss": 0.3507,
"step": 878
},
{
"epoch": 2.441973592772759,
"grad_norm": 0.2745303213596344,
"learning_rate": 4.122448979591837e-05,
"loss": 0.366,
"step": 879
},
{
"epoch": 2.4447533009034053,
"grad_norm": 0.26936864852905273,
"learning_rate": 4.102040816326531e-05,
"loss": 0.3247,
"step": 880
},
{
"epoch": 2.4475330090340512,
"grad_norm": 0.3225851058959961,
"learning_rate": 4.0816326530612245e-05,
"loss": 0.3757,
"step": 881
},
{
"epoch": 2.4503127171646977,
"grad_norm": 0.28156349062919617,
"learning_rate": 4.061224489795918e-05,
"loss": 0.219,
"step": 882
},
{
"epoch": 2.453092425295344,
"grad_norm": 0.46358248591423035,
"learning_rate": 4.0408163265306124e-05,
"loss": 0.3567,
"step": 883
},
{
"epoch": 2.45587213342599,
"grad_norm": 0.2787911295890808,
"learning_rate": 4.020408163265306e-05,
"loss": 0.275,
"step": 884
},
{
"epoch": 2.4586518415566365,
"grad_norm": 0.3119618892669678,
"learning_rate": 4e-05,
"loss": 0.3376,
"step": 885
},
{
"epoch": 2.461431549687283,
"grad_norm": 0.2867750823497772,
"learning_rate": 3.979591836734694e-05,
"loss": 0.2792,
"step": 886
},
{
"epoch": 2.464211257817929,
"grad_norm": 0.29150545597076416,
"learning_rate": 3.9591836734693876e-05,
"loss": 0.3543,
"step": 887
},
{
"epoch": 2.4669909659485754,
"grad_norm": 0.28618574142456055,
"learning_rate": 3.938775510204082e-05,
"loss": 0.3556,
"step": 888
},
{
"epoch": 2.469770674079222,
"grad_norm": 0.2973514199256897,
"learning_rate": 3.9183673469387755e-05,
"loss": 0.3071,
"step": 889
},
{
"epoch": 2.472550382209868,
"grad_norm": 0.36355265974998474,
"learning_rate": 3.89795918367347e-05,
"loss": 0.37,
"step": 890
},
{
"epoch": 2.475330090340514,
"grad_norm": 0.28134381771087646,
"learning_rate": 3.8775510204081634e-05,
"loss": 0.3189,
"step": 891
},
{
"epoch": 2.4781097984711606,
"grad_norm": 0.29104146361351013,
"learning_rate": 3.857142857142858e-05,
"loss": 0.3467,
"step": 892
},
{
"epoch": 2.4808895066018066,
"grad_norm": 0.3080955743789673,
"learning_rate": 3.836734693877551e-05,
"loss": 0.439,
"step": 893
},
{
"epoch": 2.483669214732453,
"grad_norm": 0.3061097264289856,
"learning_rate": 3.8163265306122456e-05,
"loss": 0.2978,
"step": 894
},
{
"epoch": 2.4864489228630995,
"grad_norm": 0.30751508474349976,
"learning_rate": 3.795918367346939e-05,
"loss": 0.3819,
"step": 895
},
{
"epoch": 2.4892286309937455,
"grad_norm": 0.2971366047859192,
"learning_rate": 3.775510204081633e-05,
"loss": 0.2825,
"step": 896
},
{
"epoch": 2.492008339124392,
"grad_norm": 0.29852065443992615,
"learning_rate": 3.7551020408163264e-05,
"loss": 0.341,
"step": 897
},
{
"epoch": 2.4947880472550383,
"grad_norm": 0.28704988956451416,
"learning_rate": 3.734693877551021e-05,
"loss": 0.3001,
"step": 898
},
{
"epoch": 2.4975677553856843,
"grad_norm": 0.3021228015422821,
"learning_rate": 3.7142857142857143e-05,
"loss": 0.3204,
"step": 899
},
{
"epoch": 2.5003474635163307,
"grad_norm": 0.30266740918159485,
"learning_rate": 3.693877551020408e-05,
"loss": 0.299,
"step": 900
},
{
"epoch": 2.5003474635163307,
"eval_loss": 0.4358259439468384,
"eval_runtime": 212.5847,
"eval_samples_per_second": 1.693,
"eval_steps_per_second": 1.693,
"step": 900
}
],
"logging_steps": 1,
"max_steps": 1080,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.8347978086418432e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}