atsuki-yamaguchi's picture
Upload folder using huggingface_hub
8af1188 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9055484126199189,
"eval_steps": 500,
"global_step": 27468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010219892526291499,
"grad_norm": 52.572940826416016,
"learning_rate": 1.0157273918741808e-06,
"loss": 9.6136,
"step": 31
},
{
"epoch": 0.0020439785052582997,
"grad_norm": 26.900487899780273,
"learning_rate": 2.0314547837483616e-06,
"loss": 7.35,
"step": 62
},
{
"epoch": 0.003065967757887449,
"grad_norm": 15.413382530212402,
"learning_rate": 3.0471821756225426e-06,
"loss": 5.872,
"step": 93
},
{
"epoch": 0.0040879570105165994,
"grad_norm": 9.475220680236816,
"learning_rate": 4.062909567496723e-06,
"loss": 4.8148,
"step": 124
},
{
"epoch": 0.005109946263145749,
"grad_norm": 16.33511734008789,
"learning_rate": 5.078636959370905e-06,
"loss": 4.2357,
"step": 155
},
{
"epoch": 0.006131935515774898,
"grad_norm": 17.79788589477539,
"learning_rate": 6.094364351245085e-06,
"loss": 3.9005,
"step": 186
},
{
"epoch": 0.007153924768404049,
"grad_norm": 7.385252952575684,
"learning_rate": 7.110091743119267e-06,
"loss": 3.6349,
"step": 217
},
{
"epoch": 0.008175914021033199,
"grad_norm": 7.6574835777282715,
"learning_rate": 8.125819134993446e-06,
"loss": 3.4197,
"step": 248
},
{
"epoch": 0.009197903273662348,
"grad_norm": 10.027671813964844,
"learning_rate": 9.141546526867629e-06,
"loss": 3.2883,
"step": 279
},
{
"epoch": 0.010219892526291498,
"grad_norm": 7.447092056274414,
"learning_rate": 1.015727391874181e-05,
"loss": 3.1606,
"step": 310
},
{
"epoch": 0.011241881778920647,
"grad_norm": 8.796594619750977,
"learning_rate": 1.117300131061599e-05,
"loss": 3.0724,
"step": 341
},
{
"epoch": 0.012263871031549797,
"grad_norm": 8.47999382019043,
"learning_rate": 1.218872870249017e-05,
"loss": 3.0249,
"step": 372
},
{
"epoch": 0.013285860284178948,
"grad_norm": 11.703062057495117,
"learning_rate": 1.3204456094364351e-05,
"loss": 2.9383,
"step": 403
},
{
"epoch": 0.014307849536808097,
"grad_norm": 7.270478248596191,
"learning_rate": 1.4220183486238533e-05,
"loss": 2.8753,
"step": 434
},
{
"epoch": 0.015329838789437247,
"grad_norm": 5.699134349822998,
"learning_rate": 1.5235910878112714e-05,
"loss": 2.8226,
"step": 465
},
{
"epoch": 0.016351828042066398,
"grad_norm": 6.3912529945373535,
"learning_rate": 1.6251638269986893e-05,
"loss": 2.7754,
"step": 496
},
{
"epoch": 0.017373817294695545,
"grad_norm": 8.240257263183594,
"learning_rate": 1.7267365661861077e-05,
"loss": 2.7351,
"step": 527
},
{
"epoch": 0.018395806547324697,
"grad_norm": 9.017704010009766,
"learning_rate": 1.8283093053735257e-05,
"loss": 2.7084,
"step": 558
},
{
"epoch": 0.019417795799953844,
"grad_norm": 6.017426013946533,
"learning_rate": 1.9298820445609438e-05,
"loss": 2.6741,
"step": 589
},
{
"epoch": 0.020439785052582995,
"grad_norm": 3.789416790008545,
"learning_rate": 2.031454783748362e-05,
"loss": 2.6605,
"step": 620
},
{
"epoch": 0.021461774305212147,
"grad_norm": 4.747366428375244,
"learning_rate": 2.13302752293578e-05,
"loss": 2.6245,
"step": 651
},
{
"epoch": 0.022483763557841294,
"grad_norm": 3.8344268798828125,
"learning_rate": 2.234600262123198e-05,
"loss": 2.5791,
"step": 682
},
{
"epoch": 0.023505752810470446,
"grad_norm": 4.5539116859436035,
"learning_rate": 2.336173001310616e-05,
"loss": 2.5648,
"step": 713
},
{
"epoch": 0.024527742063099593,
"grad_norm": 4.216089248657227,
"learning_rate": 2.437745740498034e-05,
"loss": 2.5125,
"step": 744
},
{
"epoch": 0.025549731315728744,
"grad_norm": 4.225554466247559,
"learning_rate": 2.5393184796854525e-05,
"loss": 2.4973,
"step": 775
},
{
"epoch": 0.026571720568357896,
"grad_norm": 3.357351541519165,
"learning_rate": 2.6408912188728702e-05,
"loss": 2.4532,
"step": 806
},
{
"epoch": 0.027593709820987043,
"grad_norm": 4.109170436859131,
"learning_rate": 2.7424639580602886e-05,
"loss": 2.4504,
"step": 837
},
{
"epoch": 0.028615699073616194,
"grad_norm": 17.247940063476562,
"learning_rate": 2.8440366972477066e-05,
"loss": 2.4226,
"step": 868
},
{
"epoch": 0.029637688326245342,
"grad_norm": 3.2851662635803223,
"learning_rate": 2.9456094364351244e-05,
"loss": 2.4053,
"step": 899
},
{
"epoch": 0.030659677578874493,
"grad_norm": 3.3796141147613525,
"learning_rate": 3.0471821756225428e-05,
"loss": 2.3977,
"step": 930
},
{
"epoch": 0.03168166683150364,
"grad_norm": 3.0984764099121094,
"learning_rate": 3.148754914809961e-05,
"loss": 2.3622,
"step": 961
},
{
"epoch": 0.032703656084132796,
"grad_norm": 2.5464305877685547,
"learning_rate": 3.2503276539973785e-05,
"loss": 2.348,
"step": 992
},
{
"epoch": 0.03372564533676194,
"grad_norm": 2.848860263824463,
"learning_rate": 3.351900393184797e-05,
"loss": 2.3208,
"step": 1023
},
{
"epoch": 0.03474763458939109,
"grad_norm": 2.5870606899261475,
"learning_rate": 3.453473132372215e-05,
"loss": 2.3023,
"step": 1054
},
{
"epoch": 0.03576962384202024,
"grad_norm": 2.6556804180145264,
"learning_rate": 3.555045871559633e-05,
"loss": 2.2698,
"step": 1085
},
{
"epoch": 0.03679161309464939,
"grad_norm": 2.779650926589966,
"learning_rate": 3.6566186107470514e-05,
"loss": 2.2614,
"step": 1116
},
{
"epoch": 0.03781360234727854,
"grad_norm": 2.624191999435425,
"learning_rate": 3.7581913499344695e-05,
"loss": 2.2422,
"step": 1147
},
{
"epoch": 0.03883559159990769,
"grad_norm": 2.5255484580993652,
"learning_rate": 3.8597640891218876e-05,
"loss": 2.2419,
"step": 1178
},
{
"epoch": 0.03985758085253684,
"grad_norm": 2.3195745944976807,
"learning_rate": 3.9613368283093056e-05,
"loss": 2.2468,
"step": 1209
},
{
"epoch": 0.04087957010516599,
"grad_norm": 2.4712162017822266,
"learning_rate": 4.062909567496724e-05,
"loss": 2.2072,
"step": 1240
},
{
"epoch": 0.04190155935779514,
"grad_norm": 4.451296806335449,
"learning_rate": 4.164482306684142e-05,
"loss": 2.2136,
"step": 1271
},
{
"epoch": 0.04292354861042429,
"grad_norm": 2.4180150032043457,
"learning_rate": 4.26605504587156e-05,
"loss": 2.1843,
"step": 1302
},
{
"epoch": 0.04394553786305344,
"grad_norm": 2.395840883255005,
"learning_rate": 4.367627785058978e-05,
"loss": 2.1447,
"step": 1333
},
{
"epoch": 0.04496752711568259,
"grad_norm": 3.072429895401001,
"learning_rate": 4.469200524246396e-05,
"loss": 2.1405,
"step": 1364
},
{
"epoch": 0.045989516368311736,
"grad_norm": 1.9566724300384521,
"learning_rate": 4.570773263433814e-05,
"loss": 2.1321,
"step": 1395
},
{
"epoch": 0.04701150562094089,
"grad_norm": 23.60377311706543,
"learning_rate": 4.672346002621232e-05,
"loss": 2.1289,
"step": 1426
},
{
"epoch": 0.04803349487357004,
"grad_norm": 2.117250919342041,
"learning_rate": 4.77391874180865e-05,
"loss": 2.1239,
"step": 1457
},
{
"epoch": 0.049055484126199186,
"grad_norm": 2.361362934112549,
"learning_rate": 4.875491480996068e-05,
"loss": 2.0838,
"step": 1488
},
{
"epoch": 0.05007747337882834,
"grad_norm": 2.1867992877960205,
"learning_rate": 4.977064220183487e-05,
"loss": 2.0622,
"step": 1519
},
{
"epoch": 0.05109946263145749,
"grad_norm": 1.9326456785202026,
"learning_rate": 4.9999915451558777e-05,
"loss": 2.0441,
"step": 1550
},
{
"epoch": 0.052121451884086636,
"grad_norm": 2.076503038406372,
"learning_rate": 4.999955597496219e-05,
"loss": 2.0503,
"step": 1581
},
{
"epoch": 0.05314344113671579,
"grad_norm": 3.051212787628174,
"learning_rate": 4.9998914381774255e-05,
"loss": 2.0357,
"step": 1612
},
{
"epoch": 0.05416543038934494,
"grad_norm": 1.7924102544784546,
"learning_rate": 4.999799067923527e-05,
"loss": 2.0326,
"step": 1643
},
{
"epoch": 0.055187419641974086,
"grad_norm": 1.8103365898132324,
"learning_rate": 4.999678487776908e-05,
"loss": 2.0023,
"step": 1674
},
{
"epoch": 0.056209408894603234,
"grad_norm": 1.913725733757019,
"learning_rate": 4.9995296990983006e-05,
"loss": 2.0035,
"step": 1705
},
{
"epoch": 0.05723139814723239,
"grad_norm": 1.7415859699249268,
"learning_rate": 4.999352703566763e-05,
"loss": 2.0063,
"step": 1736
},
{
"epoch": 0.058253387399861536,
"grad_norm": 1.8145827054977417,
"learning_rate": 4.999147503179668e-05,
"loss": 1.9731,
"step": 1767
},
{
"epoch": 0.059275376652490684,
"grad_norm": 1.8761731386184692,
"learning_rate": 4.998914100252672e-05,
"loss": 1.9883,
"step": 1798
},
{
"epoch": 0.06029736590511984,
"grad_norm": 6.824073791503906,
"learning_rate": 4.998652497419696e-05,
"loss": 1.9713,
"step": 1829
},
{
"epoch": 0.061319355157748986,
"grad_norm": 1.7231906652450562,
"learning_rate": 4.9983626976328927e-05,
"loss": 1.9492,
"step": 1860
},
{
"epoch": 0.062341344410378134,
"grad_norm": 1.6888827085494995,
"learning_rate": 4.998044704162613e-05,
"loss": 1.9449,
"step": 1891
},
{
"epoch": 0.06336333366300728,
"grad_norm": 1.5924124717712402,
"learning_rate": 4.9976985205973705e-05,
"loss": 1.938,
"step": 1922
},
{
"epoch": 0.06438532291563644,
"grad_norm": 1.7029409408569336,
"learning_rate": 4.997324150843799e-05,
"loss": 1.9285,
"step": 1953
},
{
"epoch": 0.06540731216826559,
"grad_norm": 1.5554330348968506,
"learning_rate": 4.99692159912661e-05,
"loss": 1.9091,
"step": 1984
},
{
"epoch": 0.06642930142089473,
"grad_norm": 1.5639121532440186,
"learning_rate": 4.996490869988546e-05,
"loss": 1.9105,
"step": 2015
},
{
"epoch": 0.06745129067352389,
"grad_norm": 1.6262747049331665,
"learning_rate": 4.996031968290326e-05,
"loss": 1.9164,
"step": 2046
},
{
"epoch": 0.06847327992615304,
"grad_norm": 1.5408495664596558,
"learning_rate": 4.995544899210594e-05,
"loss": 1.8728,
"step": 2077
},
{
"epoch": 0.06949526917878218,
"grad_norm": 1.5183970928192139,
"learning_rate": 4.9950296682458583e-05,
"loss": 1.8854,
"step": 2108
},
{
"epoch": 0.07051725843141134,
"grad_norm": 1.9362810850143433,
"learning_rate": 4.994486281210429e-05,
"loss": 1.8811,
"step": 2139
},
{
"epoch": 0.07153924768404048,
"grad_norm": 1.5646640062332153,
"learning_rate": 4.9939147442363566e-05,
"loss": 1.8744,
"step": 2170
},
{
"epoch": 0.07256123693666963,
"grad_norm": 1.5929124355316162,
"learning_rate": 4.9933150637733574e-05,
"loss": 1.867,
"step": 2201
},
{
"epoch": 0.07358322618929879,
"grad_norm": 1.7689995765686035,
"learning_rate": 4.992687246588743e-05,
"loss": 1.8659,
"step": 2232
},
{
"epoch": 0.07460521544192793,
"grad_norm": 1.375406265258789,
"learning_rate": 4.992031299767347e-05,
"loss": 1.8477,
"step": 2263
},
{
"epoch": 0.07562720469455708,
"grad_norm": 1.3596042394638062,
"learning_rate": 4.9913472307114386e-05,
"loss": 1.8498,
"step": 2294
},
{
"epoch": 0.07664919394718624,
"grad_norm": 1.3918544054031372,
"learning_rate": 4.9906350471406446e-05,
"loss": 1.8538,
"step": 2325
},
{
"epoch": 0.07767118319981538,
"grad_norm": 1.4614112377166748,
"learning_rate": 4.989894757091861e-05,
"loss": 1.8359,
"step": 2356
},
{
"epoch": 0.07869317245244453,
"grad_norm": 1.563186764717102,
"learning_rate": 4.989126368919158e-05,
"loss": 1.821,
"step": 2387
},
{
"epoch": 0.07971516170507369,
"grad_norm": 1.3217226266860962,
"learning_rate": 4.988329891293693e-05,
"loss": 1.8345,
"step": 2418
},
{
"epoch": 0.08073715095770283,
"grad_norm": 1.3906276226043701,
"learning_rate": 4.987505333203608e-05,
"loss": 1.8313,
"step": 2449
},
{
"epoch": 0.08175914021033198,
"grad_norm": 1.2736890316009521,
"learning_rate": 4.9866527039539276e-05,
"loss": 1.8234,
"step": 2480
},
{
"epoch": 0.08278112946296114,
"grad_norm": 1.3050850629806519,
"learning_rate": 4.9857720131664594e-05,
"loss": 1.8181,
"step": 2511
},
{
"epoch": 0.08380311871559028,
"grad_norm": 1.3847019672393799,
"learning_rate": 4.9848632707796773e-05,
"loss": 1.8128,
"step": 2542
},
{
"epoch": 0.08482510796821943,
"grad_norm": 1.3893433809280396,
"learning_rate": 4.9839264870486155e-05,
"loss": 1.7871,
"step": 2573
},
{
"epoch": 0.08584709722084859,
"grad_norm": 1.3469080924987793,
"learning_rate": 4.9829616725447526e-05,
"loss": 1.8212,
"step": 2604
},
{
"epoch": 0.08686908647347773,
"grad_norm": 1.269865870475769,
"learning_rate": 4.981968838155888e-05,
"loss": 1.7868,
"step": 2635
},
{
"epoch": 0.08789107572610688,
"grad_norm": 1.2972242832183838,
"learning_rate": 4.980947995086024e-05,
"loss": 1.7987,
"step": 2666
},
{
"epoch": 0.08891306497873604,
"grad_norm": 1.3488340377807617,
"learning_rate": 4.979899154855234e-05,
"loss": 1.7859,
"step": 2697
},
{
"epoch": 0.08993505423136518,
"grad_norm": 1.3487133979797363,
"learning_rate": 4.9788223292995386e-05,
"loss": 1.7851,
"step": 2728
},
{
"epoch": 0.09095704348399433,
"grad_norm": 1.2412410974502563,
"learning_rate": 4.977717530570768e-05,
"loss": 1.7802,
"step": 2759
},
{
"epoch": 0.09197903273662347,
"grad_norm": 1.3017562627792358,
"learning_rate": 4.976584771136425e-05,
"loss": 1.775,
"step": 2790
},
{
"epoch": 0.09300102198925263,
"grad_norm": 1.2923076152801514,
"learning_rate": 4.975424063779547e-05,
"loss": 1.76,
"step": 2821
},
{
"epoch": 0.09402301124188178,
"grad_norm": 1.2735223770141602,
"learning_rate": 4.974235421598557e-05,
"loss": 1.7604,
"step": 2852
},
{
"epoch": 0.09504500049451092,
"grad_norm": 1.303673505783081,
"learning_rate": 4.973018858007122e-05,
"loss": 1.7685,
"step": 2883
},
{
"epoch": 0.09606698974714008,
"grad_norm": 1.5336410999298096,
"learning_rate": 4.9717743867339963e-05,
"loss": 1.7682,
"step": 2914
},
{
"epoch": 0.09708897899976923,
"grad_norm": 5.014227867126465,
"learning_rate": 4.9705020218228695e-05,
"loss": 1.7617,
"step": 2945
},
{
"epoch": 0.09811096825239837,
"grad_norm": 1.2603938579559326,
"learning_rate": 4.969201777632205e-05,
"loss": 1.7621,
"step": 2976
},
{
"epoch": 0.09913295750502753,
"grad_norm": 1.32491934299469,
"learning_rate": 4.9678736688350846e-05,
"loss": 1.7411,
"step": 3007
},
{
"epoch": 0.10015494675765668,
"grad_norm": 1.206735372543335,
"learning_rate": 4.966517710419033e-05,
"loss": 1.7262,
"step": 3038
},
{
"epoch": 0.10117693601028582,
"grad_norm": 1.254231572151184,
"learning_rate": 4.965133917685858e-05,
"loss": 1.7454,
"step": 3069
},
{
"epoch": 0.10219892526291498,
"grad_norm": 1.3085408210754395,
"learning_rate": 4.9637223062514714e-05,
"loss": 1.7517,
"step": 3100
},
{
"epoch": 0.10322091451554413,
"grad_norm": 1.3184605836868286,
"learning_rate": 4.962282892045718e-05,
"loss": 1.7542,
"step": 3131
},
{
"epoch": 0.10424290376817327,
"grad_norm": 1.2284983396530151,
"learning_rate": 4.9608156913121904e-05,
"loss": 1.738,
"step": 3162
},
{
"epoch": 0.10526489302080243,
"grad_norm": 1.3870880603790283,
"learning_rate": 4.959320720608049e-05,
"loss": 1.7358,
"step": 3193
},
{
"epoch": 0.10628688227343158,
"grad_norm": 1.1335322856903076,
"learning_rate": 4.9577979968038354e-05,
"loss": 1.742,
"step": 3224
},
{
"epoch": 0.10730887152606072,
"grad_norm": 1.1794465780258179,
"learning_rate": 4.956247537083282e-05,
"loss": 1.7357,
"step": 3255
},
{
"epoch": 0.10833086077868988,
"grad_norm": 1.1701149940490723,
"learning_rate": 4.9546693589431145e-05,
"loss": 1.7289,
"step": 3286
},
{
"epoch": 0.10935285003131903,
"grad_norm": 1.169094443321228,
"learning_rate": 4.9530634801928595e-05,
"loss": 1.7238,
"step": 3317
},
{
"epoch": 0.11037483928394817,
"grad_norm": 1.2735379934310913,
"learning_rate": 4.9514299189546395e-05,
"loss": 1.7275,
"step": 3348
},
{
"epoch": 0.11139682853657733,
"grad_norm": 1.2806981801986694,
"learning_rate": 4.949768693662973e-05,
"loss": 1.7135,
"step": 3379
},
{
"epoch": 0.11241881778920647,
"grad_norm": 1.208024263381958,
"learning_rate": 4.948079823064559e-05,
"loss": 1.7251,
"step": 3410
},
{
"epoch": 0.11344080704183562,
"grad_norm": 1.275516152381897,
"learning_rate": 4.946363326218074e-05,
"loss": 1.7259,
"step": 3441
},
{
"epoch": 0.11446279629446478,
"grad_norm": 1.1761465072631836,
"learning_rate": 4.9446192224939525e-05,
"loss": 1.7024,
"step": 3472
},
{
"epoch": 0.11548478554709392,
"grad_norm": 1.1216075420379639,
"learning_rate": 4.942847531574167e-05,
"loss": 1.6849,
"step": 3503
},
{
"epoch": 0.11650677479972307,
"grad_norm": 1.1286563873291016,
"learning_rate": 4.941048273452008e-05,
"loss": 1.7004,
"step": 3534
},
{
"epoch": 0.11752876405235223,
"grad_norm": 1.1531614065170288,
"learning_rate": 4.9392214684318605e-05,
"loss": 1.7101,
"step": 3565
},
{
"epoch": 0.11855075330498137,
"grad_norm": 1.143051266670227,
"learning_rate": 4.93736713712897e-05,
"loss": 1.7127,
"step": 3596
},
{
"epoch": 0.11957274255761052,
"grad_norm": 1.1483272314071655,
"learning_rate": 4.9354853004692124e-05,
"loss": 1.6968,
"step": 3627
},
{
"epoch": 0.12059473181023968,
"grad_norm": 1.1267555952072144,
"learning_rate": 4.93357597968886e-05,
"loss": 1.7023,
"step": 3658
},
{
"epoch": 0.12161672106286882,
"grad_norm": 1.1552249193191528,
"learning_rate": 4.931639196334338e-05,
"loss": 1.7171,
"step": 3689
},
{
"epoch": 0.12263871031549797,
"grad_norm": 1.1388061046600342,
"learning_rate": 4.9296749722619826e-05,
"loss": 1.7061,
"step": 3720
},
{
"epoch": 0.12366069956812713,
"grad_norm": 1.127455234527588,
"learning_rate": 4.9276833296377966e-05,
"loss": 1.6879,
"step": 3751
},
{
"epoch": 0.12468268882075627,
"grad_norm": 1.1534373760223389,
"learning_rate": 4.925664290937196e-05,
"loss": 1.7091,
"step": 3782
},
{
"epoch": 0.12570467807338542,
"grad_norm": 1.2369154691696167,
"learning_rate": 4.9236178789447576e-05,
"loss": 1.6923,
"step": 3813
},
{
"epoch": 0.12672666732601456,
"grad_norm": 1.074436068534851,
"learning_rate": 4.921544116753962e-05,
"loss": 1.6805,
"step": 3844
},
{
"epoch": 0.12774865657864373,
"grad_norm": 1.112226963043213,
"learning_rate": 4.919443027766935e-05,
"loss": 1.6527,
"step": 3875
},
{
"epoch": 0.12877064583127287,
"grad_norm": 1.0999863147735596,
"learning_rate": 4.91731463569418e-05,
"loss": 1.6614,
"step": 3906
},
{
"epoch": 0.129792635083902,
"grad_norm": 1.098036527633667,
"learning_rate": 4.915158964554312e-05,
"loss": 1.6826,
"step": 3937
},
{
"epoch": 0.13081462433653118,
"grad_norm": 1.1108450889587402,
"learning_rate": 4.912976038673786e-05,
"loss": 1.6886,
"step": 3968
},
{
"epoch": 0.13183661358916032,
"grad_norm": 1.0916872024536133,
"learning_rate": 4.9107658826866254e-05,
"loss": 1.6782,
"step": 3999
},
{
"epoch": 0.13285860284178946,
"grad_norm": 1.0818581581115723,
"learning_rate": 4.908528521534139e-05,
"loss": 1.6796,
"step": 4030
},
{
"epoch": 0.13388059209441863,
"grad_norm": 1.0908610820770264,
"learning_rate": 4.906263980464644e-05,
"loss": 1.6662,
"step": 4061
},
{
"epoch": 0.13490258134704777,
"grad_norm": 1.0384143590927124,
"learning_rate": 4.903972285033178e-05,
"loss": 1.6811,
"step": 4092
},
{
"epoch": 0.1359245705996769,
"grad_norm": 1.0998533964157104,
"learning_rate": 4.901653461101213e-05,
"loss": 1.6817,
"step": 4123
},
{
"epoch": 0.13694655985230608,
"grad_norm": 1.1726231575012207,
"learning_rate": 4.8993075348363626e-05,
"loss": 1.6532,
"step": 4154
},
{
"epoch": 0.13796854910493522,
"grad_norm": 1.075464129447937,
"learning_rate": 4.896934532712084e-05,
"loss": 1.671,
"step": 4185
},
{
"epoch": 0.13899053835756436,
"grad_norm": 1.0557868480682373,
"learning_rate": 4.8945344815073846e-05,
"loss": 1.6548,
"step": 4216
},
{
"epoch": 0.14001252761019353,
"grad_norm": 1.0531095266342163,
"learning_rate": 4.892107408306516e-05,
"loss": 1.6526,
"step": 4247
},
{
"epoch": 0.14103451686282267,
"grad_norm": 1.138203501701355,
"learning_rate": 4.889653340498669e-05,
"loss": 1.678,
"step": 4278
},
{
"epoch": 0.1420565061154518,
"grad_norm": 1.0668121576309204,
"learning_rate": 4.8871723057776664e-05,
"loss": 1.6552,
"step": 4309
},
{
"epoch": 0.14307849536808095,
"grad_norm": 1.0312261581420898,
"learning_rate": 4.8846643321416476e-05,
"loss": 1.6731,
"step": 4340
},
{
"epoch": 0.14410048462071012,
"grad_norm": 1.0868667364120483,
"learning_rate": 4.882129447892753e-05,
"loss": 1.6713,
"step": 4371
},
{
"epoch": 0.14512247387333926,
"grad_norm": 1.0213130712509155,
"learning_rate": 4.8795676816368076e-05,
"loss": 1.6515,
"step": 4402
},
{
"epoch": 0.1461444631259684,
"grad_norm": 1.190875768661499,
"learning_rate": 4.876979062282995e-05,
"loss": 1.651,
"step": 4433
},
{
"epoch": 0.14716645237859757,
"grad_norm": 1.0551568269729614,
"learning_rate": 4.8743636190435325e-05,
"loss": 1.6751,
"step": 4464
},
{
"epoch": 0.1481884416312267,
"grad_norm": 1.0654323101043701,
"learning_rate": 4.871721381433344e-05,
"loss": 1.6499,
"step": 4495
},
{
"epoch": 0.14921043088385585,
"grad_norm": 1.04425048828125,
"learning_rate": 4.869052379269719e-05,
"loss": 1.6401,
"step": 4526
},
{
"epoch": 0.15023242013648502,
"grad_norm": 1.3416290283203125,
"learning_rate": 4.866356642671985e-05,
"loss": 1.6553,
"step": 4557
},
{
"epoch": 0.15125440938911416,
"grad_norm": 1.073529601097107,
"learning_rate": 4.8636342020611634e-05,
"loss": 1.6413,
"step": 4588
},
{
"epoch": 0.1522763986417433,
"grad_norm": 1.0386462211608887,
"learning_rate": 4.860885088159626e-05,
"loss": 1.6595,
"step": 4619
},
{
"epoch": 0.15329838789437247,
"grad_norm": 1.0544514656066895,
"learning_rate": 4.858109331990751e-05,
"loss": 1.6387,
"step": 4650
},
{
"epoch": 0.15432037714700161,
"grad_norm": 1.088112473487854,
"learning_rate": 4.855306964878567e-05,
"loss": 1.628,
"step": 4681
},
{
"epoch": 0.15534236639963075,
"grad_norm": 0.9930492639541626,
"learning_rate": 4.8524780184474084e-05,
"loss": 1.6554,
"step": 4712
},
{
"epoch": 0.15636435565225992,
"grad_norm": 0.9864984154701233,
"learning_rate": 4.8496225246215496e-05,
"loss": 1.6558,
"step": 4743
},
{
"epoch": 0.15738634490488906,
"grad_norm": 1.0170128345489502,
"learning_rate": 4.8467405156248505e-05,
"loss": 1.6289,
"step": 4774
},
{
"epoch": 0.1584083341575182,
"grad_norm": 1.0360223054885864,
"learning_rate": 4.843832023980392e-05,
"loss": 1.6314,
"step": 4805
},
{
"epoch": 0.15943032341014737,
"grad_norm": 1.0165129899978638,
"learning_rate": 4.840897082510106e-05,
"loss": 1.6294,
"step": 4836
},
{
"epoch": 0.16045231266277651,
"grad_norm": 1.079991102218628,
"learning_rate": 4.8379357243344084e-05,
"loss": 1.6204,
"step": 4867
},
{
"epoch": 0.16147430191540565,
"grad_norm": 1.0515645742416382,
"learning_rate": 4.8349479828718236e-05,
"loss": 1.6322,
"step": 4898
},
{
"epoch": 0.16249629116803482,
"grad_norm": 1.0226655006408691,
"learning_rate": 4.8319338918386075e-05,
"loss": 1.6476,
"step": 4929
},
{
"epoch": 0.16351828042066396,
"grad_norm": 1.0581114292144775,
"learning_rate": 4.828893485248369e-05,
"loss": 1.6302,
"step": 4960
},
{
"epoch": 0.1645402696732931,
"grad_norm": 1.0156742334365845,
"learning_rate": 4.825826797411682e-05,
"loss": 1.6292,
"step": 4991
},
{
"epoch": 0.16556225892592227,
"grad_norm": 1.0153559446334839,
"learning_rate": 4.822733862935702e-05,
"loss": 1.6367,
"step": 5022
},
{
"epoch": 0.16658424817855141,
"grad_norm": 1.0488505363464355,
"learning_rate": 4.819614716723775e-05,
"loss": 1.644,
"step": 5053
},
{
"epoch": 0.16760623743118055,
"grad_norm": 1.092781901359558,
"learning_rate": 4.8164693939750425e-05,
"loss": 1.6318,
"step": 5084
},
{
"epoch": 0.16862822668380972,
"grad_norm": 0.9972744584083557,
"learning_rate": 4.813297930184042e-05,
"loss": 1.6206,
"step": 5115
},
{
"epoch": 0.16965021593643886,
"grad_norm": 1.0455750226974487,
"learning_rate": 4.810100361140314e-05,
"loss": 1.6296,
"step": 5146
},
{
"epoch": 0.170672205189068,
"grad_norm": 0.9624122977256775,
"learning_rate": 4.8068767229279885e-05,
"loss": 1.6307,
"step": 5177
},
{
"epoch": 0.17169419444169717,
"grad_norm": 1.4828526973724365,
"learning_rate": 4.8036270519253854e-05,
"loss": 1.6377,
"step": 5208
},
{
"epoch": 0.17271618369432631,
"grad_norm": 1.0732771158218384,
"learning_rate": 4.8003513848046e-05,
"loss": 1.6148,
"step": 5239
},
{
"epoch": 0.17373817294695545,
"grad_norm": 1.0065757036209106,
"learning_rate": 4.79704975853109e-05,
"loss": 1.6192,
"step": 5270
},
{
"epoch": 0.17476016219958462,
"grad_norm": 1.7071099281311035,
"learning_rate": 4.793722210363262e-05,
"loss": 1.6205,
"step": 5301
},
{
"epoch": 0.17578215145221376,
"grad_norm": 1.009507417678833,
"learning_rate": 4.7903687778520414e-05,
"loss": 1.6261,
"step": 5332
},
{
"epoch": 0.1768041407048429,
"grad_norm": 1.0593280792236328,
"learning_rate": 4.7869894988404593e-05,
"loss": 1.6286,
"step": 5363
},
{
"epoch": 0.17782612995747207,
"grad_norm": 1.0053679943084717,
"learning_rate": 4.783584411463221e-05,
"loss": 1.6424,
"step": 5394
},
{
"epoch": 0.17884811921010121,
"grad_norm": 0.983214795589447,
"learning_rate": 4.780153554146274e-05,
"loss": 1.6292,
"step": 5425
},
{
"epoch": 0.17987010846273035,
"grad_norm": 0.9617491960525513,
"learning_rate": 4.7766969656063766e-05,
"loss": 1.6182,
"step": 5456
},
{
"epoch": 0.18089209771535952,
"grad_norm": 1.0862797498703003,
"learning_rate": 4.773214684850662e-05,
"loss": 1.6213,
"step": 5487
},
{
"epoch": 0.18191408696798866,
"grad_norm": 1.0468218326568604,
"learning_rate": 4.769706751176193e-05,
"loss": 1.6176,
"step": 5518
},
{
"epoch": 0.1829360762206178,
"grad_norm": 0.9474911093711853,
"learning_rate": 4.7661732041695264e-05,
"loss": 1.5864,
"step": 5549
},
{
"epoch": 0.18395806547324695,
"grad_norm": 0.9839109778404236,
"learning_rate": 4.762614083706258e-05,
"loss": 1.6177,
"step": 5580
},
{
"epoch": 0.18498005472587611,
"grad_norm": 1.9305601119995117,
"learning_rate": 4.759029429950581e-05,
"loss": 1.6132,
"step": 5611
},
{
"epoch": 0.18600204397850525,
"grad_norm": 0.9609850645065308,
"learning_rate": 4.7554192833548235e-05,
"loss": 1.6115,
"step": 5642
},
{
"epoch": 0.1870240332311344,
"grad_norm": 1.0501559972763062,
"learning_rate": 4.751783684659e-05,
"loss": 1.6017,
"step": 5673
},
{
"epoch": 0.18804602248376356,
"grad_norm": 0.9933464527130127,
"learning_rate": 4.748122674890348e-05,
"loss": 1.6136,
"step": 5704
},
{
"epoch": 0.1890680117363927,
"grad_norm": 0.9463350772857666,
"learning_rate": 4.7444362953628654e-05,
"loss": 1.6102,
"step": 5735
},
{
"epoch": 0.19009000098902185,
"grad_norm": 0.9974256753921509,
"learning_rate": 4.7407245876768424e-05,
"loss": 1.6101,
"step": 5766
},
{
"epoch": 0.19111199024165101,
"grad_norm": 0.9747878313064575,
"learning_rate": 4.736987593718397e-05,
"loss": 1.5967,
"step": 5797
},
{
"epoch": 0.19213397949428015,
"grad_norm": 0.9755719900131226,
"learning_rate": 4.733225355658999e-05,
"loss": 1.5987,
"step": 5828
},
{
"epoch": 0.1931559687469093,
"grad_norm": 0.9605233669281006,
"learning_rate": 4.7294379159549926e-05,
"loss": 1.6238,
"step": 5859
},
{
"epoch": 0.19417795799953846,
"grad_norm": 0.9777940511703491,
"learning_rate": 4.725625317347119e-05,
"loss": 1.5976,
"step": 5890
},
{
"epoch": 0.1951999472521676,
"grad_norm": 0.9619265198707581,
"learning_rate": 4.7217876028600374e-05,
"loss": 1.5985,
"step": 5921
},
{
"epoch": 0.19622193650479675,
"grad_norm": 0.970813512802124,
"learning_rate": 4.717924815801832e-05,
"loss": 1.6097,
"step": 5952
},
{
"epoch": 0.19724392575742591,
"grad_norm": 1.1910721063613892,
"learning_rate": 4.714036999763532e-05,
"loss": 1.6156,
"step": 5983
},
{
"epoch": 0.19826591501005505,
"grad_norm": 1.1106246709823608,
"learning_rate": 4.7101241986186116e-05,
"loss": 1.5761,
"step": 6014
},
{
"epoch": 0.1992879042626842,
"grad_norm": 1.0065436363220215,
"learning_rate": 4.7061864565225e-05,
"loss": 1.5857,
"step": 6045
},
{
"epoch": 0.20030989351531336,
"grad_norm": 0.9245477914810181,
"learning_rate": 4.702223817912081e-05,
"loss": 1.6099,
"step": 6076
},
{
"epoch": 0.2013318827679425,
"grad_norm": 0.9705063104629517,
"learning_rate": 4.698236327505195e-05,
"loss": 1.5995,
"step": 6107
},
{
"epoch": 0.20235387202057165,
"grad_norm": 0.9455100893974304,
"learning_rate": 4.694224030300127e-05,
"loss": 1.6302,
"step": 6138
},
{
"epoch": 0.20337586127320081,
"grad_norm": 0.9505909085273743,
"learning_rate": 4.690186971575107e-05,
"loss": 1.5799,
"step": 6169
},
{
"epoch": 0.20439785052582995,
"grad_norm": 0.9440078139305115,
"learning_rate": 4.6861251968877916e-05,
"loss": 1.5889,
"step": 6200
},
{
"epoch": 0.2054198397784591,
"grad_norm": 0.9610021710395813,
"learning_rate": 4.68203875207476e-05,
"loss": 1.6049,
"step": 6231
},
{
"epoch": 0.20644182903108826,
"grad_norm": 0.9686371684074402,
"learning_rate": 4.677927683250983e-05,
"loss": 1.5985,
"step": 6262
},
{
"epoch": 0.2074638182837174,
"grad_norm": 0.9532095789909363,
"learning_rate": 4.6737920368093156e-05,
"loss": 1.5763,
"step": 6293
},
{
"epoch": 0.20848580753634655,
"grad_norm": 0.9185531139373779,
"learning_rate": 4.669631859419965e-05,
"loss": 1.5719,
"step": 6324
},
{
"epoch": 0.20950779678897571,
"grad_norm": 0.917314350605011,
"learning_rate": 4.6654471980299676e-05,
"loss": 1.5634,
"step": 6355
},
{
"epoch": 0.21052978604160485,
"grad_norm": 0.9497798681259155,
"learning_rate": 4.661238099862658e-05,
"loss": 1.5836,
"step": 6386
},
{
"epoch": 0.211551775294234,
"grad_norm": 0.939011812210083,
"learning_rate": 4.657004612417138e-05,
"loss": 1.601,
"step": 6417
},
{
"epoch": 0.21257376454686316,
"grad_norm": 0.981315016746521,
"learning_rate": 4.6527467834677374e-05,
"loss": 1.5896,
"step": 6448
},
{
"epoch": 0.2135957537994923,
"grad_norm": 0.9485774636268616,
"learning_rate": 4.648464661063478e-05,
"loss": 1.5912,
"step": 6479
},
{
"epoch": 0.21461774305212145,
"grad_norm": 0.9504795670509338,
"learning_rate": 4.6441582935275264e-05,
"loss": 1.5827,
"step": 6510
},
{
"epoch": 0.21563973230475061,
"grad_norm": 0.9627436399459839,
"learning_rate": 4.6398277294566586e-05,
"loss": 1.5858,
"step": 6541
},
{
"epoch": 0.21666172155737976,
"grad_norm": 0.9468591809272766,
"learning_rate": 4.6354730177207e-05,
"loss": 1.5884,
"step": 6572
},
{
"epoch": 0.2176837108100089,
"grad_norm": 0.9847991466522217,
"learning_rate": 4.6310942074619787e-05,
"loss": 1.5744,
"step": 6603
},
{
"epoch": 0.21870570006263806,
"grad_norm": 0.9207347631454468,
"learning_rate": 4.626691348094777e-05,
"loss": 1.5675,
"step": 6634
},
{
"epoch": 0.2197276893152672,
"grad_norm": 0.9114487171173096,
"learning_rate": 4.622264489304762e-05,
"loss": 1.5726,
"step": 6665
},
{
"epoch": 0.22074967856789635,
"grad_norm": 0.9141913652420044,
"learning_rate": 4.617813681048434e-05,
"loss": 1.5757,
"step": 6696
},
{
"epoch": 0.2217716678205255,
"grad_norm": 0.9770637154579163,
"learning_rate": 4.61333897355256e-05,
"loss": 1.5819,
"step": 6727
},
{
"epoch": 0.22279365707315466,
"grad_norm": 0.9790964126586914,
"learning_rate": 4.608840417313604e-05,
"loss": 1.5793,
"step": 6758
},
{
"epoch": 0.2238156463257838,
"grad_norm": 0.9311193227767944,
"learning_rate": 4.6043180630971646e-05,
"loss": 1.5823,
"step": 6789
},
{
"epoch": 0.22483763557841294,
"grad_norm": 0.934339165687561,
"learning_rate": 4.599771961937391e-05,
"loss": 1.5643,
"step": 6820
},
{
"epoch": 0.2258596248310421,
"grad_norm": 0.9052058458328247,
"learning_rate": 4.5952021651364204e-05,
"loss": 1.5752,
"step": 6851
},
{
"epoch": 0.22688161408367125,
"grad_norm": 0.9528570175170898,
"learning_rate": 4.590608724263786e-05,
"loss": 1.5603,
"step": 6882
},
{
"epoch": 0.2279036033363004,
"grad_norm": 0.9073063135147095,
"learning_rate": 4.585991691155845e-05,
"loss": 1.555,
"step": 6913
},
{
"epoch": 0.22892559258892956,
"grad_norm": 0.9537662267684937,
"learning_rate": 4.581351117915188e-05,
"loss": 1.5662,
"step": 6944
},
{
"epoch": 0.2299475818415587,
"grad_norm": 0.9296181201934814,
"learning_rate": 4.5766870569100534e-05,
"loss": 1.5649,
"step": 6975
},
{
"epoch": 0.23096957109418784,
"grad_norm": 0.947211503982544,
"learning_rate": 4.571999560773736e-05,
"loss": 1.5834,
"step": 7006
},
{
"epoch": 0.231991560346817,
"grad_norm": 0.9705089330673218,
"learning_rate": 4.5672886824039915e-05,
"loss": 1.5817,
"step": 7037
},
{
"epoch": 0.23301354959944615,
"grad_norm": 0.9289253950119019,
"learning_rate": 4.5625544749624435e-05,
"loss": 1.5792,
"step": 7068
},
{
"epoch": 0.2340355388520753,
"grad_norm": 0.9166892766952515,
"learning_rate": 4.5577969918739794e-05,
"loss": 1.559,
"step": 7099
},
{
"epoch": 0.23505752810470446,
"grad_norm": 0.9205060601234436,
"learning_rate": 4.5530162868261486e-05,
"loss": 1.5564,
"step": 7130
},
{
"epoch": 0.2360795173573336,
"grad_norm": 0.9769343733787537,
"learning_rate": 4.548212413768558e-05,
"loss": 1.5499,
"step": 7161
},
{
"epoch": 0.23710150660996274,
"grad_norm": 1.0154651403427124,
"learning_rate": 4.543385426912261e-05,
"loss": 1.5832,
"step": 7192
},
{
"epoch": 0.2381234958625919,
"grad_norm": 0.9247255325317383,
"learning_rate": 4.53853538072915e-05,
"loss": 1.5844,
"step": 7223
},
{
"epoch": 0.23914548511522105,
"grad_norm": 0.8840001225471497,
"learning_rate": 4.533662329951336e-05,
"loss": 1.5456,
"step": 7254
},
{
"epoch": 0.2401674743678502,
"grad_norm": 0.986392080783844,
"learning_rate": 4.528766329570536e-05,
"loss": 1.5743,
"step": 7285
},
{
"epoch": 0.24118946362047936,
"grad_norm": 0.8750962615013123,
"learning_rate": 4.523847434837447e-05,
"loss": 1.5751,
"step": 7316
},
{
"epoch": 0.2422114528731085,
"grad_norm": 0.9039379954338074,
"learning_rate": 4.518905701261128e-05,
"loss": 1.5603,
"step": 7347
},
{
"epoch": 0.24323344212573764,
"grad_norm": 0.9081151485443115,
"learning_rate": 4.5139411846083715e-05,
"loss": 1.5542,
"step": 7378
},
{
"epoch": 0.2442554313783668,
"grad_norm": 1.532193899154663,
"learning_rate": 4.508953940903073e-05,
"loss": 1.5693,
"step": 7409
},
{
"epoch": 0.24527742063099595,
"grad_norm": 0.9598657488822937,
"learning_rate": 4.5039440264255994e-05,
"loss": 1.5672,
"step": 7440
},
{
"epoch": 0.2462994098836251,
"grad_norm": 1.3318407535552979,
"learning_rate": 4.498911497712155e-05,
"loss": 1.5739,
"step": 7471
},
{
"epoch": 0.24732139913625426,
"grad_norm": 0.9309579730033875,
"learning_rate": 4.493856411554142e-05,
"loss": 1.5504,
"step": 7502
},
{
"epoch": 0.2483433883888834,
"grad_norm": 0.9426462650299072,
"learning_rate": 4.4887788249975206e-05,
"loss": 1.5529,
"step": 7533
},
{
"epoch": 0.24936537764151254,
"grad_norm": 0.9388718605041504,
"learning_rate": 4.4836787953421656e-05,
"loss": 1.5589,
"step": 7564
},
{
"epoch": 0.2503873668941417,
"grad_norm": 0.8922486901283264,
"learning_rate": 4.478556380141218e-05,
"loss": 1.5453,
"step": 7595
},
{
"epoch": 0.25140935614677085,
"grad_norm": 0.9597366452217102,
"learning_rate": 4.4734116372004375e-05,
"loss": 1.5565,
"step": 7626
},
{
"epoch": 0.2524313453994,
"grad_norm": 0.880445659160614,
"learning_rate": 4.4682446245775477e-05,
"loss": 1.5429,
"step": 7657
},
{
"epoch": 0.2534533346520291,
"grad_norm": 0.8895862102508545,
"learning_rate": 4.463055400581586e-05,
"loss": 1.5462,
"step": 7688
},
{
"epoch": 0.25447532390465827,
"grad_norm": 0.8859049677848816,
"learning_rate": 4.4578440237722374e-05,
"loss": 1.5678,
"step": 7719
},
{
"epoch": 0.25549731315728746,
"grad_norm": 0.9074852466583252,
"learning_rate": 4.452610552959183e-05,
"loss": 1.5695,
"step": 7750
},
{
"epoch": 0.2565193024099166,
"grad_norm": 0.9472444653511047,
"learning_rate": 4.447355047201428e-05,
"loss": 1.5527,
"step": 7781
},
{
"epoch": 0.25754129166254575,
"grad_norm": 0.901016891002655,
"learning_rate": 4.4420775658066414e-05,
"loss": 1.5523,
"step": 7812
},
{
"epoch": 0.2585632809151749,
"grad_norm": 0.8963896632194519,
"learning_rate": 4.436778168330484e-05,
"loss": 1.5623,
"step": 7843
},
{
"epoch": 0.259585270167804,
"grad_norm": 0.9571655988693237,
"learning_rate": 4.4314569145759353e-05,
"loss": 1.5446,
"step": 7874
},
{
"epoch": 0.26060725942043317,
"grad_norm": 0.9321922659873962,
"learning_rate": 4.42611386459262e-05,
"loss": 1.5514,
"step": 7905
},
{
"epoch": 0.26162924867306236,
"grad_norm": 0.8753949403762817,
"learning_rate": 4.420749078676133e-05,
"loss": 1.5524,
"step": 7936
},
{
"epoch": 0.2626512379256915,
"grad_norm": 0.8667870759963989,
"learning_rate": 4.4153626173673516e-05,
"loss": 1.5435,
"step": 7967
},
{
"epoch": 0.26367322717832065,
"grad_norm": 0.926670491695404,
"learning_rate": 4.409954541451762e-05,
"loss": 1.5448,
"step": 7998
},
{
"epoch": 0.2646952164309498,
"grad_norm": 0.9438245892524719,
"learning_rate": 4.404524911958764e-05,
"loss": 1.5706,
"step": 8029
},
{
"epoch": 0.2657172056835789,
"grad_norm": 0.9131088256835938,
"learning_rate": 4.399073790160989e-05,
"loss": 1.5361,
"step": 8060
},
{
"epoch": 0.26673919493620807,
"grad_norm": 0.914857804775238,
"learning_rate": 4.393601237573607e-05,
"loss": 1.5588,
"step": 8091
},
{
"epoch": 0.26776118418883726,
"grad_norm": 0.9113429188728333,
"learning_rate": 4.388107315953628e-05,
"loss": 1.5603,
"step": 8122
},
{
"epoch": 0.2687831734414664,
"grad_norm": 0.8804867267608643,
"learning_rate": 4.382592087299212e-05,
"loss": 1.5577,
"step": 8153
},
{
"epoch": 0.26980516269409555,
"grad_norm": 0.8368428349494934,
"learning_rate": 4.377055613848964e-05,
"loss": 1.5369,
"step": 8184
},
{
"epoch": 0.2708271519467247,
"grad_norm": 0.9133582711219788,
"learning_rate": 4.3714979580812355e-05,
"loss": 1.5522,
"step": 8215
},
{
"epoch": 0.2718491411993538,
"grad_norm": 0.902574360370636,
"learning_rate": 4.365919182713416e-05,
"loss": 1.5572,
"step": 8246
},
{
"epoch": 0.27287113045198297,
"grad_norm": 0.9236746430397034,
"learning_rate": 4.360319350701226e-05,
"loss": 1.5477,
"step": 8277
},
{
"epoch": 0.27389311970461216,
"grad_norm": 0.8866250514984131,
"learning_rate": 4.3546985252380115e-05,
"loss": 1.5576,
"step": 8308
},
{
"epoch": 0.2749151089572413,
"grad_norm": 0.8597050905227661,
"learning_rate": 4.349056769754021e-05,
"loss": 1.5454,
"step": 8339
},
{
"epoch": 0.27593709820987045,
"grad_norm": 0.9166654348373413,
"learning_rate": 4.3433941479156994e-05,
"loss": 1.5342,
"step": 8370
},
{
"epoch": 0.2769590874624996,
"grad_norm": 0.8913152813911438,
"learning_rate": 4.3377107236249647e-05,
"loss": 1.5489,
"step": 8401
},
{
"epoch": 0.2779810767151287,
"grad_norm": 0.9009787440299988,
"learning_rate": 4.332006561018488e-05,
"loss": 1.5617,
"step": 8432
},
{
"epoch": 0.27900306596775787,
"grad_norm": 0.8625615239143372,
"learning_rate": 4.3262817244669683e-05,
"loss": 1.5545,
"step": 8463
},
{
"epoch": 0.28002505522038706,
"grad_norm": 0.8483917713165283,
"learning_rate": 4.3205362785744083e-05,
"loss": 1.5287,
"step": 8494
},
{
"epoch": 0.2810470444730162,
"grad_norm": 0.8959261178970337,
"learning_rate": 4.314770288177384e-05,
"loss": 1.5503,
"step": 8525
},
{
"epoch": 0.28206903372564535,
"grad_norm": 0.9090222716331482,
"learning_rate": 4.308983818344313e-05,
"loss": 1.5469,
"step": 8556
},
{
"epoch": 0.2830910229782745,
"grad_norm": 0.9215665459632874,
"learning_rate": 4.3031769343747206e-05,
"loss": 1.519,
"step": 8587
},
{
"epoch": 0.2841130122309036,
"grad_norm": 0.8699467778205872,
"learning_rate": 4.297349701798505e-05,
"loss": 1.5507,
"step": 8618
},
{
"epoch": 0.28513500148353277,
"grad_norm": 0.9755619764328003,
"learning_rate": 4.2915021863751916e-05,
"loss": 1.5542,
"step": 8649
},
{
"epoch": 0.2861569907361619,
"grad_norm": 0.8612878918647766,
"learning_rate": 4.285634454093198e-05,
"loss": 1.55,
"step": 8680
},
{
"epoch": 0.2871789799887911,
"grad_norm": 0.8708077073097229,
"learning_rate": 4.279746571169086e-05,
"loss": 1.5239,
"step": 8711
},
{
"epoch": 0.28820096924142025,
"grad_norm": 0.8952695727348328,
"learning_rate": 4.2738386040468136e-05,
"loss": 1.5275,
"step": 8742
},
{
"epoch": 0.2892229584940494,
"grad_norm": 0.8305310010910034,
"learning_rate": 4.2679106193969866e-05,
"loss": 1.5419,
"step": 8773
},
{
"epoch": 0.2902449477466785,
"grad_norm": 0.9172886610031128,
"learning_rate": 4.261962684116106e-05,
"loss": 1.5266,
"step": 8804
},
{
"epoch": 0.29126693699930767,
"grad_norm": 0.8972066044807434,
"learning_rate": 4.2559948653258145e-05,
"loss": 1.5428,
"step": 8835
},
{
"epoch": 0.2922889262519368,
"grad_norm": 0.8838576078414917,
"learning_rate": 4.250007230372134e-05,
"loss": 1.5685,
"step": 8866
},
{
"epoch": 0.293310915504566,
"grad_norm": 0.8614609241485596,
"learning_rate": 4.2439998468247126e-05,
"loss": 1.5624,
"step": 8897
},
{
"epoch": 0.29433290475719515,
"grad_norm": 0.8305181860923767,
"learning_rate": 4.2379727824760566e-05,
"loss": 1.5424,
"step": 8928
},
{
"epoch": 0.2953548940098243,
"grad_norm": 0.8790427446365356,
"learning_rate": 4.231926105340768e-05,
"loss": 1.5316,
"step": 8959
},
{
"epoch": 0.2963768832624534,
"grad_norm": 0.8905590772628784,
"learning_rate": 4.225859883654776e-05,
"loss": 1.5136,
"step": 8990
},
{
"epoch": 0.29739887251508257,
"grad_norm": 0.8998729586601257,
"learning_rate": 4.219774185874569e-05,
"loss": 1.5372,
"step": 9021
},
{
"epoch": 0.2984208617677117,
"grad_norm": 0.8840643763542175,
"learning_rate": 4.213669080676418e-05,
"loss": 1.5371,
"step": 9052
},
{
"epoch": 0.2994428510203409,
"grad_norm": 0.9000579714775085,
"learning_rate": 4.2075446369556056e-05,
"loss": 1.5369,
"step": 9083
},
{
"epoch": 0.30046484027297005,
"grad_norm": 0.9541018009185791,
"learning_rate": 4.201400923825648e-05,
"loss": 1.5537,
"step": 9114
},
{
"epoch": 0.3014868295255992,
"grad_norm": 0.8957076072692871,
"learning_rate": 4.195238010617511e-05,
"loss": 1.5409,
"step": 9145
},
{
"epoch": 0.30250881877822833,
"grad_norm": 0.8320883512496948,
"learning_rate": 4.1890559668788344e-05,
"loss": 1.5178,
"step": 9176
},
{
"epoch": 0.30353080803085747,
"grad_norm": 0.8729486465454102,
"learning_rate": 4.1828548623731405e-05,
"loss": 1.5498,
"step": 9207
},
{
"epoch": 0.3045527972834866,
"grad_norm": 0.8582361936569214,
"learning_rate": 4.1766347670790506e-05,
"loss": 1.5209,
"step": 9238
},
{
"epoch": 0.3055747865361158,
"grad_norm": 0.890997052192688,
"learning_rate": 4.170395751189495e-05,
"loss": 1.5423,
"step": 9269
},
{
"epoch": 0.30659677578874495,
"grad_norm": 0.9070558547973633,
"learning_rate": 4.164137885110921e-05,
"loss": 1.5519,
"step": 9300
},
{
"epoch": 0.3076187650413741,
"grad_norm": 0.9573651552200317,
"learning_rate": 4.157861239462495e-05,
"loss": 1.5439,
"step": 9331
},
{
"epoch": 0.30864075429400323,
"grad_norm": 0.9029926061630249,
"learning_rate": 4.1515658850753114e-05,
"loss": 1.539,
"step": 9362
},
{
"epoch": 0.30966274354663237,
"grad_norm": 0.8370001316070557,
"learning_rate": 4.145251892991588e-05,
"loss": 1.5401,
"step": 9393
},
{
"epoch": 0.3106847327992615,
"grad_norm": 0.8818012475967407,
"learning_rate": 4.138919334463868e-05,
"loss": 1.5166,
"step": 9424
},
{
"epoch": 0.3117067220518907,
"grad_norm": 0.8851699233055115,
"learning_rate": 4.1325682809542124e-05,
"loss": 1.5334,
"step": 9455
},
{
"epoch": 0.31272871130451985,
"grad_norm": 0.8938205242156982,
"learning_rate": 4.126198804133398e-05,
"loss": 1.5408,
"step": 9486
},
{
"epoch": 0.313750700557149,
"grad_norm": 0.8480469584465027,
"learning_rate": 4.1198109758801055e-05,
"loss": 1.5149,
"step": 9517
},
{
"epoch": 0.31477268980977813,
"grad_norm": 0.8756515383720398,
"learning_rate": 4.113404868280107e-05,
"loss": 1.5174,
"step": 9548
},
{
"epoch": 0.31579467906240727,
"grad_norm": 0.8261292576789856,
"learning_rate": 4.106980553625457e-05,
"loss": 1.5295,
"step": 9579
},
{
"epoch": 0.3168166683150364,
"grad_norm": 0.9329167604446411,
"learning_rate": 4.100538104413674e-05,
"loss": 1.5151,
"step": 9610
},
{
"epoch": 0.3178386575676656,
"grad_norm": 0.8843585848808289,
"learning_rate": 4.09407759334692e-05,
"loss": 1.5241,
"step": 9641
},
{
"epoch": 0.31886064682029475,
"grad_norm": 0.8441824316978455,
"learning_rate": 4.087599093331186e-05,
"loss": 1.5293,
"step": 9672
},
{
"epoch": 0.3198826360729239,
"grad_norm": 0.8727023005485535,
"learning_rate": 4.081102677475462e-05,
"loss": 1.5332,
"step": 9703
},
{
"epoch": 0.32090462532555303,
"grad_norm": 0.8603296279907227,
"learning_rate": 4.0745884190909194e-05,
"loss": 1.5205,
"step": 9734
},
{
"epoch": 0.32192661457818217,
"grad_norm": 0.8445816040039062,
"learning_rate": 4.0680563916900796e-05,
"loss": 1.541,
"step": 9765
},
{
"epoch": 0.3229486038308113,
"grad_norm": 0.8408164978027344,
"learning_rate": 4.0615066689859815e-05,
"loss": 1.5106,
"step": 9796
},
{
"epoch": 0.32397059308344045,
"grad_norm": 0.901888370513916,
"learning_rate": 4.0549393248913584e-05,
"loss": 1.5376,
"step": 9827
},
{
"epoch": 0.32499258233606965,
"grad_norm": 0.878149688243866,
"learning_rate": 4.048354433517794e-05,
"loss": 1.5197,
"step": 9858
},
{
"epoch": 0.3260145715886988,
"grad_norm": 0.8681669235229492,
"learning_rate": 4.0417520691748916e-05,
"loss": 1.524,
"step": 9889
},
{
"epoch": 0.32703656084132793,
"grad_norm": 0.8651006817817688,
"learning_rate": 4.035132306369438e-05,
"loss": 1.5257,
"step": 9920
},
{
"epoch": 0.32805855009395707,
"grad_norm": 0.8485890030860901,
"learning_rate": 4.028495219804555e-05,
"loss": 1.5325,
"step": 9951
},
{
"epoch": 0.3290805393465862,
"grad_norm": 0.8644028902053833,
"learning_rate": 4.021840884378864e-05,
"loss": 1.5432,
"step": 9982
},
{
"epoch": 0.33010252859921535,
"grad_norm": 0.9188100695610046,
"learning_rate": 4.015169375185633e-05,
"loss": 1.5381,
"step": 10013
},
{
"epoch": 0.33112451785184455,
"grad_norm": 0.9121026396751404,
"learning_rate": 4.0084807675119396e-05,
"loss": 1.5157,
"step": 10044
},
{
"epoch": 0.3321465071044737,
"grad_norm": 0.8835768699645996,
"learning_rate": 4.0017751368378106e-05,
"loss": 1.527,
"step": 10075
},
{
"epoch": 0.33316849635710283,
"grad_norm": 0.8608120679855347,
"learning_rate": 3.995052558835377e-05,
"loss": 1.5326,
"step": 10106
},
{
"epoch": 0.33419048560973197,
"grad_norm": 0.8970103859901428,
"learning_rate": 3.988313109368017e-05,
"loss": 1.5241,
"step": 10137
},
{
"epoch": 0.3352124748623611,
"grad_norm": 0.832877516746521,
"learning_rate": 3.981556864489504e-05,
"loss": 1.5093,
"step": 10168
},
{
"epoch": 0.33623446411499025,
"grad_norm": 0.8215609788894653,
"learning_rate": 3.974783900443142e-05,
"loss": 1.5243,
"step": 10199
},
{
"epoch": 0.33725645336761945,
"grad_norm": 0.8891729712486267,
"learning_rate": 3.9679942936609095e-05,
"loss": 1.527,
"step": 10230
},
{
"epoch": 0.3382784426202486,
"grad_norm": 0.8482518196105957,
"learning_rate": 3.961188120762596e-05,
"loss": 1.5171,
"step": 10261
},
{
"epoch": 0.33930043187287773,
"grad_norm": 0.8347421288490295,
"learning_rate": 3.954365458554938e-05,
"loss": 1.5209,
"step": 10292
},
{
"epoch": 0.34032242112550687,
"grad_norm": 0.8662514686584473,
"learning_rate": 3.947526384030751e-05,
"loss": 1.5225,
"step": 10323
},
{
"epoch": 0.341344410378136,
"grad_norm": 0.8288858532905579,
"learning_rate": 3.9406709743680624e-05,
"loss": 1.4988,
"step": 10354
},
{
"epoch": 0.34236639963076515,
"grad_norm": 0.8532800674438477,
"learning_rate": 3.9337993069292366e-05,
"loss": 1.5292,
"step": 10385
},
{
"epoch": 0.34338838888339435,
"grad_norm": 0.8530318140983582,
"learning_rate": 3.926911459260109e-05,
"loss": 1.5439,
"step": 10416
},
{
"epoch": 0.3444103781360235,
"grad_norm": 0.8556480407714844,
"learning_rate": 3.920007509089102e-05,
"loss": 1.502,
"step": 10447
},
{
"epoch": 0.34543236738865263,
"grad_norm": 0.834280252456665,
"learning_rate": 3.913087534326357e-05,
"loss": 1.5359,
"step": 10478
},
{
"epoch": 0.34645435664128177,
"grad_norm": 0.8753178119659424,
"learning_rate": 3.9061516130628475e-05,
"loss": 1.5247,
"step": 10509
},
{
"epoch": 0.3474763458939109,
"grad_norm": 0.8129472732543945,
"learning_rate": 3.8991998235695025e-05,
"loss": 1.509,
"step": 10540
},
{
"epoch": 0.34849833514654005,
"grad_norm": 0.8485814929008484,
"learning_rate": 3.8922322442963224e-05,
"loss": 1.5155,
"step": 10571
},
{
"epoch": 0.34952032439916925,
"grad_norm": 0.8839988708496094,
"learning_rate": 3.885248953871491e-05,
"loss": 1.5076,
"step": 10602
},
{
"epoch": 0.3505423136517984,
"grad_norm": 0.8462734818458557,
"learning_rate": 3.8782500311004915e-05,
"loss": 1.5016,
"step": 10633
},
{
"epoch": 0.35156430290442753,
"grad_norm": 0.8573621511459351,
"learning_rate": 3.871235554965218e-05,
"loss": 1.5418,
"step": 10664
},
{
"epoch": 0.35258629215705667,
"grad_norm": 0.8314201235771179,
"learning_rate": 3.864205604623078e-05,
"loss": 1.5167,
"step": 10695
},
{
"epoch": 0.3536082814096858,
"grad_norm": 0.8317237496376038,
"learning_rate": 3.857160259406107e-05,
"loss": 1.5121,
"step": 10726
},
{
"epoch": 0.35463027066231495,
"grad_norm": 0.8582017421722412,
"learning_rate": 3.8500995988200674e-05,
"loss": 1.5137,
"step": 10757
},
{
"epoch": 0.35565225991494415,
"grad_norm": 0.8427022695541382,
"learning_rate": 3.843023702543556e-05,
"loss": 1.5248,
"step": 10788
},
{
"epoch": 0.3566742491675733,
"grad_norm": 0.8414435386657715,
"learning_rate": 3.8359326504270984e-05,
"loss": 1.4923,
"step": 10819
},
{
"epoch": 0.35769623842020243,
"grad_norm": 0.8638574481010437,
"learning_rate": 3.828826522492255e-05,
"loss": 1.4947,
"step": 10850
},
{
"epoch": 0.35871822767283157,
"grad_norm": 0.8254904747009277,
"learning_rate": 3.821705398930713e-05,
"loss": 1.524,
"step": 10881
},
{
"epoch": 0.3597402169254607,
"grad_norm": 0.8651305437088013,
"learning_rate": 3.814569360103385e-05,
"loss": 1.5034,
"step": 10912
},
{
"epoch": 0.36076220617808985,
"grad_norm": 0.8567565679550171,
"learning_rate": 3.807418486539499e-05,
"loss": 1.4971,
"step": 10943
},
{
"epoch": 0.36178419543071905,
"grad_norm": 0.8213040828704834,
"learning_rate": 3.80025285893569e-05,
"loss": 1.5095,
"step": 10974
},
{
"epoch": 0.3628061846833482,
"grad_norm": 0.8153424859046936,
"learning_rate": 3.793072558155093e-05,
"loss": 1.5168,
"step": 11005
},
{
"epoch": 0.36382817393597733,
"grad_norm": 0.8211629390716553,
"learning_rate": 3.785877665226426e-05,
"loss": 1.5208,
"step": 11036
},
{
"epoch": 0.36485016318860647,
"grad_norm": 0.8744972348213196,
"learning_rate": 3.778668261343079e-05,
"loss": 1.5133,
"step": 11067
},
{
"epoch": 0.3658721524412356,
"grad_norm": 0.8211522698402405,
"learning_rate": 3.771444427862192e-05,
"loss": 1.5042,
"step": 11098
},
{
"epoch": 0.36689414169386475,
"grad_norm": 0.8390249609947205,
"learning_rate": 3.7642062463037465e-05,
"loss": 1.5214,
"step": 11129
},
{
"epoch": 0.3679161309464939,
"grad_norm": 0.8379174470901489,
"learning_rate": 3.7569537983496373e-05,
"loss": 1.5164,
"step": 11160
},
{
"epoch": 0.3689381201991231,
"grad_norm": 0.8449585437774658,
"learning_rate": 3.749687165842753e-05,
"loss": 1.5049,
"step": 11191
},
{
"epoch": 0.36996010945175223,
"grad_norm": 0.7959738969802856,
"learning_rate": 3.7424064307860536e-05,
"loss": 1.4941,
"step": 11222
},
{
"epoch": 0.37098209870438137,
"grad_norm": 0.8311371207237244,
"learning_rate": 3.735111675341645e-05,
"loss": 1.513,
"step": 11253
},
{
"epoch": 0.3720040879570105,
"grad_norm": 0.828087329864502,
"learning_rate": 3.7278029818298524e-05,
"loss": 1.4983,
"step": 11284
},
{
"epoch": 0.37302607720963965,
"grad_norm": 0.8467016220092773,
"learning_rate": 3.720480432728287e-05,
"loss": 1.5167,
"step": 11315
},
{
"epoch": 0.3740480664622688,
"grad_norm": 0.8619351983070374,
"learning_rate": 3.71314411067092e-05,
"loss": 1.5065,
"step": 11346
},
{
"epoch": 0.375070055714898,
"grad_norm": 0.8230463862419128,
"learning_rate": 3.70579409844715e-05,
"loss": 1.4987,
"step": 11377
},
{
"epoch": 0.37609204496752713,
"grad_norm": 0.8631263971328735,
"learning_rate": 3.698430479000865e-05,
"loss": 1.5023,
"step": 11408
},
{
"epoch": 0.37711403422015627,
"grad_norm": 0.8831419348716736,
"learning_rate": 3.691053335429509e-05,
"loss": 1.5046,
"step": 11439
},
{
"epoch": 0.3781360234727854,
"grad_norm": 0.8332011103630066,
"learning_rate": 3.683662750983147e-05,
"loss": 1.5002,
"step": 11470
},
{
"epoch": 0.37915801272541455,
"grad_norm": 0.8661298751831055,
"learning_rate": 3.676258809063518e-05,
"loss": 1.51,
"step": 11501
},
{
"epoch": 0.3801800019780437,
"grad_norm": 0.8714830279350281,
"learning_rate": 3.6688415932231004e-05,
"loss": 1.5031,
"step": 11532
},
{
"epoch": 0.3812019912306729,
"grad_norm": 0.8633294701576233,
"learning_rate": 3.661411187164166e-05,
"loss": 1.4854,
"step": 11563
},
{
"epoch": 0.38222398048330203,
"grad_norm": 0.8436555862426758,
"learning_rate": 3.65396767473784e-05,
"loss": 1.5213,
"step": 11594
},
{
"epoch": 0.38324596973593117,
"grad_norm": 0.8612047433853149,
"learning_rate": 3.6465111399431465e-05,
"loss": 1.5002,
"step": 11625
},
{
"epoch": 0.3842679589885603,
"grad_norm": 0.8069844245910645,
"learning_rate": 3.6390416669260674e-05,
"loss": 1.5034,
"step": 11656
},
{
"epoch": 0.38528994824118945,
"grad_norm": 0.8230804204940796,
"learning_rate": 3.63155933997859e-05,
"loss": 1.5155,
"step": 11687
},
{
"epoch": 0.3863119374938186,
"grad_norm": 0.8166376352310181,
"learning_rate": 3.624064243537758e-05,
"loss": 1.487,
"step": 11718
},
{
"epoch": 0.3873339267464478,
"grad_norm": 0.8213214874267578,
"learning_rate": 3.616556462184716e-05,
"loss": 1.5178,
"step": 11749
},
{
"epoch": 0.38835591599907693,
"grad_norm": 0.8418950438499451,
"learning_rate": 3.609036080643755e-05,
"loss": 1.5173,
"step": 11780
},
{
"epoch": 0.38937790525170607,
"grad_norm": 0.839227020740509,
"learning_rate": 3.60150318378136e-05,
"loss": 1.51,
"step": 11811
},
{
"epoch": 0.3903998945043352,
"grad_norm": 0.8407977819442749,
"learning_rate": 3.5939578566052465e-05,
"loss": 1.4967,
"step": 11842
},
{
"epoch": 0.39142188375696435,
"grad_norm": 0.8286086320877075,
"learning_rate": 3.586400184263408e-05,
"loss": 1.4817,
"step": 11873
},
{
"epoch": 0.3924438730095935,
"grad_norm": 0.8609039783477783,
"learning_rate": 3.578830252043148e-05,
"loss": 1.4842,
"step": 11904
},
{
"epoch": 0.3934658622622227,
"grad_norm": 0.8497804403305054,
"learning_rate": 3.571248145370125e-05,
"loss": 1.509,
"step": 11935
},
{
"epoch": 0.39448785151485183,
"grad_norm": 0.8288097977638245,
"learning_rate": 3.5636539498073794e-05,
"loss": 1.4914,
"step": 11966
},
{
"epoch": 0.39550984076748097,
"grad_norm": 0.8199227452278137,
"learning_rate": 3.556047751054378e-05,
"loss": 1.4754,
"step": 11997
},
{
"epoch": 0.3965318300201101,
"grad_norm": 0.8441981077194214,
"learning_rate": 3.548429634946039e-05,
"loss": 1.4826,
"step": 12028
},
{
"epoch": 0.39755381927273925,
"grad_norm": 0.7997297048568726,
"learning_rate": 3.540799687451768e-05,
"loss": 1.4804,
"step": 12059
},
{
"epoch": 0.3985758085253684,
"grad_norm": 0.8018672466278076,
"learning_rate": 3.533157994674485e-05,
"loss": 1.4995,
"step": 12090
},
{
"epoch": 0.3995977977779976,
"grad_norm": 0.8712153434753418,
"learning_rate": 3.5255046428496546e-05,
"loss": 1.5044,
"step": 12121
},
{
"epoch": 0.40061978703062673,
"grad_norm": 0.852215588092804,
"learning_rate": 3.517839718344311e-05,
"loss": 1.4931,
"step": 12152
},
{
"epoch": 0.40164177628325587,
"grad_norm": 0.8421558141708374,
"learning_rate": 3.510163307656086e-05,
"loss": 1.4967,
"step": 12183
},
{
"epoch": 0.402663765535885,
"grad_norm": 0.8428821563720703,
"learning_rate": 3.5024754974122324e-05,
"loss": 1.5042,
"step": 12214
},
{
"epoch": 0.40368575478851415,
"grad_norm": 0.8397454023361206,
"learning_rate": 3.494776374368643e-05,
"loss": 1.4987,
"step": 12245
},
{
"epoch": 0.4047077440411433,
"grad_norm": 0.8511505126953125,
"learning_rate": 3.4870660254088724e-05,
"loss": 1.5062,
"step": 12276
},
{
"epoch": 0.40572973329377243,
"grad_norm": 0.8236075639724731,
"learning_rate": 3.479344537543164e-05,
"loss": 1.4907,
"step": 12307
},
{
"epoch": 0.40675172254640163,
"grad_norm": 0.8540985584259033,
"learning_rate": 3.4716119979074565e-05,
"loss": 1.4898,
"step": 12338
},
{
"epoch": 0.40777371179903077,
"grad_norm": 0.8223760724067688,
"learning_rate": 3.463868493762412e-05,
"loss": 1.4794,
"step": 12369
},
{
"epoch": 0.4087957010516599,
"grad_norm": 0.8553142547607422,
"learning_rate": 3.456114112492418e-05,
"loss": 1.5021,
"step": 12400
},
{
"epoch": 0.40981769030428905,
"grad_norm": 0.83010333776474,
"learning_rate": 3.4483489416046164e-05,
"loss": 1.4776,
"step": 12431
},
{
"epoch": 0.4108396795569182,
"grad_norm": 0.8293822407722473,
"learning_rate": 3.440573068727905e-05,
"loss": 1.4883,
"step": 12462
},
{
"epoch": 0.41186166880954733,
"grad_norm": 0.8343706727027893,
"learning_rate": 3.4327865816119495e-05,
"loss": 1.4919,
"step": 12493
},
{
"epoch": 0.41288365806217653,
"grad_norm": 0.8471996784210205,
"learning_rate": 3.4249895681262025e-05,
"loss": 1.4999,
"step": 12524
},
{
"epoch": 0.41390564731480567,
"grad_norm": 0.8553738594055176,
"learning_rate": 3.417182116258899e-05,
"loss": 1.4816,
"step": 12555
},
{
"epoch": 0.4149276365674348,
"grad_norm": 0.8174490332603455,
"learning_rate": 3.409364314116074e-05,
"loss": 1.471,
"step": 12586
},
{
"epoch": 0.41594962582006395,
"grad_norm": 0.797466516494751,
"learning_rate": 3.401536249920559e-05,
"loss": 1.4922,
"step": 12617
},
{
"epoch": 0.4169716150726931,
"grad_norm": 0.8381075263023376,
"learning_rate": 3.393698012010998e-05,
"loss": 1.4955,
"step": 12648
},
{
"epoch": 0.41799360432532223,
"grad_norm": 0.8211097717285156,
"learning_rate": 3.385849688840839e-05,
"loss": 1.5054,
"step": 12679
},
{
"epoch": 0.41901559357795143,
"grad_norm": 0.8614106178283691,
"learning_rate": 3.3779913689773414e-05,
"loss": 1.4906,
"step": 12710
},
{
"epoch": 0.42003758283058057,
"grad_norm": 0.8300454616546631,
"learning_rate": 3.370123141100578e-05,
"loss": 1.5008,
"step": 12741
},
{
"epoch": 0.4210595720832097,
"grad_norm": 0.8297508955001831,
"learning_rate": 3.3622450940024305e-05,
"loss": 1.4885,
"step": 12772
},
{
"epoch": 0.42208156133583885,
"grad_norm": 0.8315346837043762,
"learning_rate": 3.35435731658559e-05,
"loss": 1.4839,
"step": 12803
},
{
"epoch": 0.423103550588468,
"grad_norm": 0.8035879135131836,
"learning_rate": 3.346459897862552e-05,
"loss": 1.4849,
"step": 12834
},
{
"epoch": 0.42412553984109713,
"grad_norm": 0.8297046422958374,
"learning_rate": 3.338552926954613e-05,
"loss": 1.4874,
"step": 12865
},
{
"epoch": 0.42514752909372633,
"grad_norm": 0.8357899188995361,
"learning_rate": 3.330636493090868e-05,
"loss": 1.4894,
"step": 12896
},
{
"epoch": 0.42616951834635547,
"grad_norm": 0.8307090997695923,
"learning_rate": 3.322710685607193e-05,
"loss": 1.4906,
"step": 12927
},
{
"epoch": 0.4271915075989846,
"grad_norm": 0.8082265853881836,
"learning_rate": 3.314775593945251e-05,
"loss": 1.4879,
"step": 12958
},
{
"epoch": 0.42821349685161375,
"grad_norm": 0.8400994539260864,
"learning_rate": 3.3068313076514714e-05,
"loss": 1.4902,
"step": 12989
},
{
"epoch": 0.4292354861042429,
"grad_norm": 0.8030775785446167,
"learning_rate": 3.298877916376047e-05,
"loss": 1.4844,
"step": 13020
},
{
"epoch": 0.43025747535687203,
"grad_norm": 0.8701795935630798,
"learning_rate": 3.290915509871915e-05,
"loss": 1.4803,
"step": 13051
},
{
"epoch": 0.43127946460950123,
"grad_norm": 0.8453795313835144,
"learning_rate": 3.282944177993753e-05,
"loss": 1.4841,
"step": 13082
},
{
"epoch": 0.43230145386213037,
"grad_norm": 0.8204758167266846,
"learning_rate": 3.274964010696957e-05,
"loss": 1.4879,
"step": 13113
},
{
"epoch": 0.4333234431147595,
"grad_norm": 0.8047789931297302,
"learning_rate": 3.266975098036629e-05,
"loss": 1.4869,
"step": 13144
},
{
"epoch": 0.43434543236738865,
"grad_norm": 0.8196751475334167,
"learning_rate": 3.258977530166562e-05,
"loss": 1.487,
"step": 13175
},
{
"epoch": 0.4353674216200178,
"grad_norm": 0.8294420838356018,
"learning_rate": 3.250971397338227e-05,
"loss": 1.4876,
"step": 13206
},
{
"epoch": 0.43638941087264693,
"grad_norm": 0.7943994998931885,
"learning_rate": 3.2429567898997404e-05,
"loss": 1.4601,
"step": 13237
},
{
"epoch": 0.43741140012527613,
"grad_norm": 0.8354049921035767,
"learning_rate": 3.234933798294859e-05,
"loss": 1.4914,
"step": 13268
},
{
"epoch": 0.43843338937790527,
"grad_norm": 0.8477530479431152,
"learning_rate": 3.2269025130619535e-05,
"loss": 1.4559,
"step": 13299
},
{
"epoch": 0.4394553786305344,
"grad_norm": 0.8300078511238098,
"learning_rate": 3.218863024832985e-05,
"loss": 1.4727,
"step": 13330
},
{
"epoch": 0.44047736788316355,
"grad_norm": 0.83632892370224,
"learning_rate": 3.2108154243324864e-05,
"loss": 1.4822,
"step": 13361
},
{
"epoch": 0.4414993571357927,
"grad_norm": 0.8274350166320801,
"learning_rate": 3.2027598023765345e-05,
"loss": 1.482,
"step": 13392
},
{
"epoch": 0.44252134638842183,
"grad_norm": 0.8118026852607727,
"learning_rate": 3.194696249871729e-05,
"loss": 1.4947,
"step": 13423
},
{
"epoch": 0.443543335641051,
"grad_norm": 0.7988345623016357,
"learning_rate": 3.186624857814164e-05,
"loss": 1.4634,
"step": 13454
},
{
"epoch": 0.44456532489368017,
"grad_norm": 0.8391137719154358,
"learning_rate": 3.178545717288401e-05,
"loss": 1.4715,
"step": 13485
},
{
"epoch": 0.4455873141463093,
"grad_norm": 0.8533878326416016,
"learning_rate": 3.170458919466444e-05,
"loss": 1.4667,
"step": 13516
},
{
"epoch": 0.44660930339893845,
"grad_norm": 0.8210632801055908,
"learning_rate": 3.1623645556067063e-05,
"loss": 1.475,
"step": 13547
},
{
"epoch": 0.4476312926515676,
"grad_norm": 0.8421019911766052,
"learning_rate": 3.154262717052985e-05,
"loss": 1.4814,
"step": 13578
},
{
"epoch": 0.44865328190419673,
"grad_norm": 0.8044966459274292,
"learning_rate": 3.146153495233426e-05,
"loss": 1.4739,
"step": 13609
},
{
"epoch": 0.4496752711568259,
"grad_norm": 0.8291748762130737,
"learning_rate": 3.1380369816594944e-05,
"loss": 1.4723,
"step": 13640
},
{
"epoch": 0.45069726040945507,
"grad_norm": 0.8362712264060974,
"learning_rate": 3.129913267924946e-05,
"loss": 1.4783,
"step": 13671
},
{
"epoch": 0.4517192496620842,
"grad_norm": 0.809481143951416,
"learning_rate": 3.121782445704782e-05,
"loss": 1.4771,
"step": 13702
},
{
"epoch": 0.45274123891471335,
"grad_norm": 0.8484416007995605,
"learning_rate": 3.11364460675423e-05,
"loss": 1.4615,
"step": 13733
},
{
"epoch": 0.4537632281673425,
"grad_norm": 0.8735551238059998,
"learning_rate": 3.1054998429076934e-05,
"loss": 1.4754,
"step": 13764
},
{
"epoch": 0.45478521741997163,
"grad_norm": 0.8091564774513245,
"learning_rate": 3.097348246077728e-05,
"loss": 1.4853,
"step": 13795
},
{
"epoch": 0.4558072066726008,
"grad_norm": 0.8234131932258606,
"learning_rate": 3.0891899082539924e-05,
"loss": 1.4786,
"step": 13826
},
{
"epoch": 0.45682919592522997,
"grad_norm": 0.8537734746932983,
"learning_rate": 3.0810249215022233e-05,
"loss": 1.4757,
"step": 13857
},
{
"epoch": 0.4578511851778591,
"grad_norm": 0.8478782773017883,
"learning_rate": 3.0728533779631865e-05,
"loss": 1.4621,
"step": 13888
},
{
"epoch": 0.45887317443048825,
"grad_norm": 0.8195151090621948,
"learning_rate": 3.064675369851637e-05,
"loss": 1.4727,
"step": 13919
},
{
"epoch": 0.4598951636831174,
"grad_norm": 0.8208017349243164,
"learning_rate": 3.056490989455289e-05,
"loss": 1.4744,
"step": 13950
},
{
"epoch": 0.46091715293574653,
"grad_norm": 0.7893416285514832,
"learning_rate": 3.0483003291337596e-05,
"loss": 1.4713,
"step": 13981
},
{
"epoch": 0.4619391421883757,
"grad_norm": 2.002336263656616,
"learning_rate": 3.040103481317539e-05,
"loss": 1.4523,
"step": 14012
},
{
"epoch": 0.46296113144100487,
"grad_norm": 0.7943160533905029,
"learning_rate": 3.03190053850694e-05,
"loss": 1.4635,
"step": 14043
},
{
"epoch": 0.463983120693634,
"grad_norm": 0.8262372612953186,
"learning_rate": 3.0236915932710573e-05,
"loss": 1.47,
"step": 14074
},
{
"epoch": 0.46500510994626315,
"grad_norm": 0.8457150459289551,
"learning_rate": 3.0154767382467232e-05,
"loss": 1.4757,
"step": 14105
},
{
"epoch": 0.4660270991988923,
"grad_norm": 0.8377997279167175,
"learning_rate": 3.0072560661374582e-05,
"loss": 1.4752,
"step": 14136
},
{
"epoch": 0.46704908845152143,
"grad_norm": 0.7971871495246887,
"learning_rate": 2.999029669712431e-05,
"loss": 1.4744,
"step": 14167
},
{
"epoch": 0.4680710777041506,
"grad_norm": 0.8239099383354187,
"learning_rate": 2.990797641805408e-05,
"loss": 1.4683,
"step": 14198
},
{
"epoch": 0.46909306695677977,
"grad_norm": 0.8354623317718506,
"learning_rate": 2.982560075313704e-05,
"loss": 1.4859,
"step": 14229
},
{
"epoch": 0.4701150562094089,
"grad_norm": 0.8527148962020874,
"learning_rate": 2.9743170631971368e-05,
"loss": 1.4569,
"step": 14260
},
{
"epoch": 0.47113704546203805,
"grad_norm": 0.8640620112419128,
"learning_rate": 2.9660686984769792e-05,
"loss": 1.4711,
"step": 14291
},
{
"epoch": 0.4721590347146672,
"grad_norm": 0.8033869862556458,
"learning_rate": 2.9578150742349047e-05,
"loss": 1.4803,
"step": 14322
},
{
"epoch": 0.47318102396729633,
"grad_norm": 0.8113462924957275,
"learning_rate": 2.949556283611942e-05,
"loss": 1.4644,
"step": 14353
},
{
"epoch": 0.4742030132199255,
"grad_norm": 0.8008654117584229,
"learning_rate": 2.9412924198074206e-05,
"loss": 1.4591,
"step": 14384
},
{
"epoch": 0.47522500247255467,
"grad_norm": 0.8297123312950134,
"learning_rate": 2.9330235760779208e-05,
"loss": 1.4772,
"step": 14415
},
{
"epoch": 0.4762469917251838,
"grad_norm": 0.8336069583892822,
"learning_rate": 2.9247498457362188e-05,
"loss": 1.4735,
"step": 14446
},
{
"epoch": 0.47726898097781295,
"grad_norm": 0.7794061899185181,
"learning_rate": 2.9164713221502373e-05,
"loss": 1.4655,
"step": 14477
},
{
"epoch": 0.4782909702304421,
"grad_norm": 0.8473496437072754,
"learning_rate": 2.9081880987419912e-05,
"loss": 1.4764,
"step": 14508
},
{
"epoch": 0.47931295948307123,
"grad_norm": 0.8163958191871643,
"learning_rate": 2.8999002689865296e-05,
"loss": 1.4735,
"step": 14539
},
{
"epoch": 0.4803349487357004,
"grad_norm": 0.8134874701499939,
"learning_rate": 2.8916079264108852e-05,
"loss": 1.4725,
"step": 14570
},
{
"epoch": 0.48135693798832957,
"grad_norm": 0.788411557674408,
"learning_rate": 2.883311164593017e-05,
"loss": 1.4641,
"step": 14601
},
{
"epoch": 0.4823789272409587,
"grad_norm": 0.8075402975082397,
"learning_rate": 2.875010077160754e-05,
"loss": 1.473,
"step": 14632
},
{
"epoch": 0.48340091649358785,
"grad_norm": 0.8053046464920044,
"learning_rate": 2.866704757790741e-05,
"loss": 1.4642,
"step": 14663
},
{
"epoch": 0.484422905746217,
"grad_norm": 0.8402507901191711,
"learning_rate": 2.858395300207376e-05,
"loss": 1.4768,
"step": 14694
},
{
"epoch": 0.48544489499884613,
"grad_norm": 0.8206636905670166,
"learning_rate": 2.8500817981817607e-05,
"loss": 1.448,
"step": 14725
},
{
"epoch": 0.4864668842514753,
"grad_norm": 0.8397791385650635,
"learning_rate": 2.8417643455306336e-05,
"loss": 1.4611,
"step": 14756
},
{
"epoch": 0.4874888735041044,
"grad_norm": 0.8360273838043213,
"learning_rate": 2.8334430361153185e-05,
"loss": 1.4681,
"step": 14787
},
{
"epoch": 0.4885108627567336,
"grad_norm": 0.7953571677207947,
"learning_rate": 2.8251179638406612e-05,
"loss": 1.4749,
"step": 14818
},
{
"epoch": 0.48953285200936275,
"grad_norm": 0.8476656079292297,
"learning_rate": 2.8167892226539704e-05,
"loss": 1.4774,
"step": 14849
},
{
"epoch": 0.4905548412619919,
"grad_norm": 0.7990041375160217,
"learning_rate": 2.8084569065439588e-05,
"loss": 1.4599,
"step": 14880
},
{
"epoch": 0.49157683051462103,
"grad_norm": 0.8252399563789368,
"learning_rate": 2.8001211095396807e-05,
"loss": 1.482,
"step": 14911
},
{
"epoch": 0.4925988197672502,
"grad_norm": 0.8177118897438049,
"learning_rate": 2.791781925709473e-05,
"loss": 1.476,
"step": 14942
},
{
"epoch": 0.4936208090198793,
"grad_norm": 0.8130631446838379,
"learning_rate": 2.7834394491598908e-05,
"loss": 1.4584,
"step": 14973
},
{
"epoch": 0.4946427982725085,
"grad_norm": 0.8575690388679504,
"learning_rate": 2.7750937740346485e-05,
"loss": 1.4567,
"step": 15004
},
{
"epoch": 0.49566478752513765,
"grad_norm": 0.8180646300315857,
"learning_rate": 2.7667449945135564e-05,
"loss": 1.4838,
"step": 15035
},
{
"epoch": 0.4966867767777668,
"grad_norm": 0.8085652589797974,
"learning_rate": 2.7583932048114557e-05,
"loss": 1.4473,
"step": 15066
},
{
"epoch": 0.49770876603039593,
"grad_norm": 0.7771590352058411,
"learning_rate": 2.7500384991771587e-05,
"loss": 1.4613,
"step": 15097
},
{
"epoch": 0.4987307552830251,
"grad_norm": 0.8312346339225769,
"learning_rate": 2.7416809718923825e-05,
"loss": 1.4561,
"step": 15128
},
{
"epoch": 0.4997527445356542,
"grad_norm": 0.8133281469345093,
"learning_rate": 2.7333207172706864e-05,
"loss": 1.4617,
"step": 15159
},
{
"epoch": 0.5007747337882834,
"grad_norm": 0.8043552041053772,
"learning_rate": 2.7249578296564088e-05,
"loss": 1.4814,
"step": 15190
},
{
"epoch": 0.5017967230409125,
"grad_norm": 0.8398190140724182,
"learning_rate": 2.7165924034235973e-05,
"loss": 1.4604,
"step": 15221
},
{
"epoch": 0.5028187122935417,
"grad_norm": 0.8101472854614258,
"learning_rate": 2.708224532974953e-05,
"loss": 1.4655,
"step": 15252
},
{
"epoch": 0.5038407015461709,
"grad_norm": 0.8169765472412109,
"learning_rate": 2.6998543127407538e-05,
"loss": 1.4649,
"step": 15283
},
{
"epoch": 0.5048626907988,
"grad_norm": 0.8130091428756714,
"learning_rate": 2.6914818371777988e-05,
"loss": 1.4532,
"step": 15314
},
{
"epoch": 0.5058846800514292,
"grad_norm": 0.8258066177368164,
"learning_rate": 2.6831072007683373e-05,
"loss": 1.4645,
"step": 15345
},
{
"epoch": 0.5069066693040583,
"grad_norm": 0.8129923939704895,
"learning_rate": 2.6747304980190018e-05,
"loss": 1.4486,
"step": 15376
},
{
"epoch": 0.5079286585566875,
"grad_norm": 0.8236402273178101,
"learning_rate": 2.6663518234597453e-05,
"loss": 1.4655,
"step": 15407
},
{
"epoch": 0.5089506478093165,
"grad_norm": 0.8337636590003967,
"learning_rate": 2.6579712716427696e-05,
"loss": 1.4617,
"step": 15438
},
{
"epoch": 0.5099726370619457,
"grad_norm": 0.8363469839096069,
"learning_rate": 2.6495889371414652e-05,
"loss": 1.4635,
"step": 15469
},
{
"epoch": 0.5109946263145749,
"grad_norm": 0.8415119647979736,
"learning_rate": 2.6412049145493367e-05,
"loss": 1.4664,
"step": 15500
},
{
"epoch": 0.512016615567204,
"grad_norm": 0.8207569718360901,
"learning_rate": 2.632819298478939e-05,
"loss": 1.4513,
"step": 15531
},
{
"epoch": 0.5130386048198332,
"grad_norm": 0.840300440788269,
"learning_rate": 2.6244321835608105e-05,
"loss": 1.4657,
"step": 15562
},
{
"epoch": 0.5140605940724623,
"grad_norm": 0.8326119780540466,
"learning_rate": 2.6160436644424024e-05,
"loss": 1.454,
"step": 15593
},
{
"epoch": 0.5150825833250915,
"grad_norm": 0.8397566080093384,
"learning_rate": 2.6076538357870133e-05,
"loss": 1.4827,
"step": 15624
},
{
"epoch": 0.5161045725777207,
"grad_norm": 0.8247369527816772,
"learning_rate": 2.5992627922727196e-05,
"loss": 1.4503,
"step": 15655
},
{
"epoch": 0.5171265618303498,
"grad_norm": 0.7943838238716125,
"learning_rate": 2.5908706285913066e-05,
"loss": 1.463,
"step": 15686
},
{
"epoch": 0.518148551082979,
"grad_norm": 0.8546155095100403,
"learning_rate": 2.5824774394472008e-05,
"loss": 1.4511,
"step": 15717
},
{
"epoch": 0.519170540335608,
"grad_norm": 0.8017929792404175,
"learning_rate": 2.5740833195563996e-05,
"loss": 1.4583,
"step": 15748
},
{
"epoch": 0.5201925295882373,
"grad_norm": 0.8227274417877197,
"learning_rate": 2.5656883636454067e-05,
"loss": 1.4633,
"step": 15779
},
{
"epoch": 0.5212145188408663,
"grad_norm": 0.8150655031204224,
"learning_rate": 2.557292666450159e-05,
"loss": 1.4549,
"step": 15810
},
{
"epoch": 0.5222365080934955,
"grad_norm": 0.8417273163795471,
"learning_rate": 2.5488963227149566e-05,
"loss": 1.4525,
"step": 15841
},
{
"epoch": 0.5232584973461247,
"grad_norm": 0.8203164935112,
"learning_rate": 2.5404994271913983e-05,
"loss": 1.4664,
"step": 15872
},
{
"epoch": 0.5242804865987538,
"grad_norm": 0.781090497970581,
"learning_rate": 2.5321020746373085e-05,
"loss": 1.4529,
"step": 15903
},
{
"epoch": 0.525302475851383,
"grad_norm": 0.7886222004890442,
"learning_rate": 2.52370435981567e-05,
"loss": 1.449,
"step": 15934
},
{
"epoch": 0.5263244651040121,
"grad_norm": 0.8068331480026245,
"learning_rate": 2.5153063774935533e-05,
"loss": 1.4468,
"step": 15965
},
{
"epoch": 0.5273464543566413,
"grad_norm": 0.8176882863044739,
"learning_rate": 2.506908222441045e-05,
"loss": 1.4431,
"step": 15996
},
{
"epoch": 0.5283684436092704,
"grad_norm": 0.8044219017028809,
"learning_rate": 2.498509989430187e-05,
"loss": 1.4565,
"step": 16027
},
{
"epoch": 0.5293904328618996,
"grad_norm": 0.8286380767822266,
"learning_rate": 2.4901117732338958e-05,
"loss": 1.438,
"step": 16058
},
{
"epoch": 0.5304124221145288,
"grad_norm": 0.7985462546348572,
"learning_rate": 2.481713668624899e-05,
"loss": 1.4596,
"step": 16089
},
{
"epoch": 0.5314344113671579,
"grad_norm": 0.8129749894142151,
"learning_rate": 2.4733157703746663e-05,
"loss": 1.4643,
"step": 16120
},
{
"epoch": 0.532456400619787,
"grad_norm": 0.818228542804718,
"learning_rate": 2.4649181732523392e-05,
"loss": 1.459,
"step": 16151
},
{
"epoch": 0.5334783898724161,
"grad_norm": 0.8294692039489746,
"learning_rate": 2.4565209720236582e-05,
"loss": 1.4608,
"step": 16182
},
{
"epoch": 0.5345003791250453,
"grad_norm": 0.8209260106086731,
"learning_rate": 2.4481242614498975e-05,
"loss": 1.4615,
"step": 16213
},
{
"epoch": 0.5355223683776745,
"grad_norm": 0.8328977227210999,
"learning_rate": 2.439728136286796e-05,
"loss": 1.4407,
"step": 16244
},
{
"epoch": 0.5365443576303036,
"grad_norm": 0.8058875799179077,
"learning_rate": 2.4313326912834852e-05,
"loss": 1.4548,
"step": 16275
},
{
"epoch": 0.5375663468829328,
"grad_norm": 0.8151506185531616,
"learning_rate": 2.4229380211814206e-05,
"loss": 1.4673,
"step": 16306
},
{
"epoch": 0.5385883361355619,
"grad_norm": 0.7918756008148193,
"learning_rate": 2.4145442207133124e-05,
"loss": 1.4552,
"step": 16337
},
{
"epoch": 0.5396103253881911,
"grad_norm": 0.8043615818023682,
"learning_rate": 2.406151384602059e-05,
"loss": 1.4384,
"step": 16368
},
{
"epoch": 0.5406323146408202,
"grad_norm": 0.7996934652328491,
"learning_rate": 2.3977596075596747e-05,
"loss": 1.4498,
"step": 16399
},
{
"epoch": 0.5416543038934494,
"grad_norm": 0.8233758211135864,
"learning_rate": 2.3893689842862223e-05,
"loss": 1.4576,
"step": 16430
},
{
"epoch": 0.5426762931460786,
"grad_norm": 0.8136016726493835,
"learning_rate": 2.3809796094687475e-05,
"loss": 1.4599,
"step": 16461
},
{
"epoch": 0.5436982823987077,
"grad_norm": 0.8084482550621033,
"learning_rate": 2.372591577780202e-05,
"loss": 1.4462,
"step": 16492
},
{
"epoch": 0.5447202716513369,
"grad_norm": 0.8046656847000122,
"learning_rate": 2.3642049838783838e-05,
"loss": 1.4618,
"step": 16523
},
{
"epoch": 0.5457422609039659,
"grad_norm": 0.8591692447662354,
"learning_rate": 2.3558199224048666e-05,
"loss": 1.4407,
"step": 16554
},
{
"epoch": 0.5467642501565951,
"grad_norm": 0.8005566000938416,
"learning_rate": 2.347436487983929e-05,
"loss": 1.4473,
"step": 16585
},
{
"epoch": 0.5477862394092243,
"grad_norm": 0.8062754273414612,
"learning_rate": 2.3390547752214888e-05,
"loss": 1.4475,
"step": 16616
},
{
"epoch": 0.5488082286618534,
"grad_norm": 0.7946265935897827,
"learning_rate": 2.330674878704035e-05,
"loss": 1.4386,
"step": 16647
},
{
"epoch": 0.5498302179144826,
"grad_norm": 0.7979186177253723,
"learning_rate": 2.322296892997561e-05,
"loss": 1.4294,
"step": 16678
},
{
"epoch": 0.5508522071671117,
"grad_norm": 0.8142803907394409,
"learning_rate": 2.313920912646497e-05,
"loss": 1.4448,
"step": 16709
},
{
"epoch": 0.5518741964197409,
"grad_norm": 0.8202729821205139,
"learning_rate": 2.305547032172643e-05,
"loss": 1.4665,
"step": 16740
},
{
"epoch": 0.55289618567237,
"grad_norm": 0.7969973087310791,
"learning_rate": 2.2971753460741014e-05,
"loss": 1.4441,
"step": 16771
},
{
"epoch": 0.5539181749249992,
"grad_norm": 0.7817745208740234,
"learning_rate": 2.288805948824212e-05,
"loss": 1.4618,
"step": 16802
},
{
"epoch": 0.5549401641776284,
"grad_norm": 0.8136980533599854,
"learning_rate": 2.2804389348704858e-05,
"loss": 1.44,
"step": 16833
},
{
"epoch": 0.5559621534302575,
"grad_norm": 0.8196117281913757,
"learning_rate": 2.2720743986335374e-05,
"loss": 1.4387,
"step": 16864
},
{
"epoch": 0.5569841426828867,
"grad_norm": 0.8157840371131897,
"learning_rate": 2.2637124345060233e-05,
"loss": 1.4549,
"step": 16895
},
{
"epoch": 0.5580061319355157,
"grad_norm": 0.8260380029678345,
"learning_rate": 2.2553531368515695e-05,
"loss": 1.4694,
"step": 16926
},
{
"epoch": 0.5590281211881449,
"grad_norm": 0.8016270399093628,
"learning_rate": 2.2469966000037144e-05,
"loss": 1.4579,
"step": 16957
},
{
"epoch": 0.5600501104407741,
"grad_norm": 0.816955029964447,
"learning_rate": 2.2386429182648417e-05,
"loss": 1.4503,
"step": 16988
},
{
"epoch": 0.5610720996934032,
"grad_norm": 0.7901566624641418,
"learning_rate": 2.230292185905114e-05,
"loss": 1.446,
"step": 17019
},
{
"epoch": 0.5620940889460324,
"grad_norm": 0.8029798865318298,
"learning_rate": 2.2219444971614116e-05,
"loss": 1.4631,
"step": 17050
},
{
"epoch": 0.5631160781986615,
"grad_norm": 0.8106251358985901,
"learning_rate": 2.2135999462362655e-05,
"loss": 1.4553,
"step": 17081
},
{
"epoch": 0.5641380674512907,
"grad_norm": 0.8000419735908508,
"learning_rate": 2.2052586272968003e-05,
"loss": 1.4503,
"step": 17112
},
{
"epoch": 0.5651600567039198,
"grad_norm": 0.7883618474006653,
"learning_rate": 2.196920634473666e-05,
"loss": 1.446,
"step": 17143
},
{
"epoch": 0.566182045956549,
"grad_norm": 0.7922521829605103,
"learning_rate": 2.1885860618599787e-05,
"loss": 1.4349,
"step": 17174
},
{
"epoch": 0.5672040352091782,
"grad_norm": 0.8302006721496582,
"learning_rate": 2.1802550035102577e-05,
"loss": 1.4421,
"step": 17205
},
{
"epoch": 0.5682260244618073,
"grad_norm": 0.8268263339996338,
"learning_rate": 2.171927553439363e-05,
"loss": 1.4415,
"step": 17236
},
{
"epoch": 0.5692480137144365,
"grad_norm": 0.815686047077179,
"learning_rate": 2.1636038056214376e-05,
"loss": 1.4394,
"step": 17267
},
{
"epoch": 0.5702700029670655,
"grad_norm": 0.8233394622802734,
"learning_rate": 2.155283853988844e-05,
"loss": 1.4607,
"step": 17298
},
{
"epoch": 0.5712919922196947,
"grad_norm": 0.8064000010490417,
"learning_rate": 2.146967792431106e-05,
"loss": 1.448,
"step": 17329
},
{
"epoch": 0.5723139814723238,
"grad_norm": 0.8143057823181152,
"learning_rate": 2.138655714793849e-05,
"loss": 1.4253,
"step": 17360
},
{
"epoch": 0.573335970724953,
"grad_norm": 0.8112401366233826,
"learning_rate": 2.1303477148777367e-05,
"loss": 1.4321,
"step": 17391
},
{
"epoch": 0.5743579599775822,
"grad_norm": 0.8024718761444092,
"learning_rate": 2.122043886437421e-05,
"loss": 1.445,
"step": 17422
},
{
"epoch": 0.5753799492302113,
"grad_norm": 0.8294973373413086,
"learning_rate": 2.1137443231804765e-05,
"loss": 1.4713,
"step": 17453
},
{
"epoch": 0.5764019384828405,
"grad_norm": 0.8058011531829834,
"learning_rate": 2.105449118766347e-05,
"loss": 1.4374,
"step": 17484
},
{
"epoch": 0.5774239277354696,
"grad_norm": 0.8186344504356384,
"learning_rate": 2.097158366805287e-05,
"loss": 1.4488,
"step": 17515
},
{
"epoch": 0.5784459169880988,
"grad_norm": 0.8200953602790833,
"learning_rate": 2.0888721608573047e-05,
"loss": 1.4464,
"step": 17546
},
{
"epoch": 0.579467906240728,
"grad_norm": 0.8406069278717041,
"learning_rate": 2.0805905944311087e-05,
"loss": 1.4484,
"step": 17577
},
{
"epoch": 0.580489895493357,
"grad_norm": 0.8079460859298706,
"learning_rate": 2.0723137609830497e-05,
"loss": 1.4437,
"step": 17608
},
{
"epoch": 0.5815118847459863,
"grad_norm": 0.7628518342971802,
"learning_rate": 2.0640417539160686e-05,
"loss": 1.398,
"step": 17639
},
{
"epoch": 0.5825338739986153,
"grad_norm": 0.8117976188659668,
"learning_rate": 2.0557746665786427e-05,
"loss": 1.4394,
"step": 17670
},
{
"epoch": 0.5835558632512445,
"grad_norm": 0.8591923713684082,
"learning_rate": 2.0475125922637256e-05,
"loss": 1.4379,
"step": 17701
},
{
"epoch": 0.5845778525038736,
"grad_norm": 0.8447268009185791,
"learning_rate": 2.0392556242077047e-05,
"loss": 1.4388,
"step": 17732
},
{
"epoch": 0.5855998417565028,
"grad_norm": 0.8177095055580139,
"learning_rate": 2.031003855589343e-05,
"loss": 1.4258,
"step": 17763
},
{
"epoch": 0.586621831009132,
"grad_norm": 0.8127830028533936,
"learning_rate": 2.022757379528727e-05,
"loss": 1.4361,
"step": 17794
},
{
"epoch": 0.5876438202617611,
"grad_norm": 0.8459563255310059,
"learning_rate": 2.0145162890862184e-05,
"loss": 1.425,
"step": 17825
},
{
"epoch": 0.5886658095143903,
"grad_norm": 0.7994370460510254,
"learning_rate": 2.0062806772614022e-05,
"loss": 1.4407,
"step": 17856
},
{
"epoch": 0.5896877987670194,
"grad_norm": 0.8096714019775391,
"learning_rate": 1.9980506369920392e-05,
"loss": 1.4396,
"step": 17887
},
{
"epoch": 0.5907097880196486,
"grad_norm": 0.760986864566803,
"learning_rate": 1.989826261153015e-05,
"loss": 1.4394,
"step": 17918
},
{
"epoch": 0.5917317772722778,
"grad_norm": 0.7998522520065308,
"learning_rate": 1.9816076425552923e-05,
"loss": 1.4465,
"step": 17949
},
{
"epoch": 0.5927537665249069,
"grad_norm": 0.78706294298172,
"learning_rate": 1.9733948739448676e-05,
"loss": 1.4537,
"step": 17980
},
{
"epoch": 0.593775755777536,
"grad_norm": 0.792362630367279,
"learning_rate": 1.9651880480017155e-05,
"loss": 1.4264,
"step": 18011
},
{
"epoch": 0.5947977450301651,
"grad_norm": 0.8106920123100281,
"learning_rate": 1.9569872573387516e-05,
"loss": 1.4326,
"step": 18042
},
{
"epoch": 0.5958197342827943,
"grad_norm": 0.8342007994651794,
"learning_rate": 1.9487925945007854e-05,
"loss": 1.4437,
"step": 18073
},
{
"epoch": 0.5968417235354234,
"grad_norm": 0.8332077860832214,
"learning_rate": 1.9406041519634726e-05,
"loss": 1.4551,
"step": 18104
},
{
"epoch": 0.5978637127880526,
"grad_norm": 0.7965781092643738,
"learning_rate": 1.932422022132275e-05,
"loss": 1.4367,
"step": 18135
},
{
"epoch": 0.5988857020406818,
"grad_norm": 0.8394030928611755,
"learning_rate": 1.924246297341414e-05,
"loss": 1.422,
"step": 18166
},
{
"epoch": 0.5999076912933109,
"grad_norm": 0.8186848163604736,
"learning_rate": 1.9160770698528338e-05,
"loss": 1.4482,
"step": 18197
},
{
"epoch": 0.6009296805459401,
"grad_norm": 0.7956410646438599,
"learning_rate": 1.907914431855156e-05,
"loss": 1.4269,
"step": 18228
},
{
"epoch": 0.6019516697985692,
"grad_norm": 0.8348777890205383,
"learning_rate": 1.8997584754626412e-05,
"loss": 1.4342,
"step": 18259
},
{
"epoch": 0.6029736590511984,
"grad_norm": 0.8190683126449585,
"learning_rate": 1.8916092927141486e-05,
"loss": 1.4463,
"step": 18290
},
{
"epoch": 0.6039956483038275,
"grad_norm": 0.7944843769073486,
"learning_rate": 1.883466975572098e-05,
"loss": 1.4435,
"step": 18321
},
{
"epoch": 0.6050176375564567,
"grad_norm": 0.8158681988716125,
"learning_rate": 1.8753316159214312e-05,
"loss": 1.4355,
"step": 18352
},
{
"epoch": 0.6060396268090859,
"grad_norm": 0.8052075505256653,
"learning_rate": 1.8672033055685766e-05,
"loss": 1.4379,
"step": 18383
},
{
"epoch": 0.6070616160617149,
"grad_norm": 0.7949001789093018,
"learning_rate": 1.8590821362404116e-05,
"loss": 1.4289,
"step": 18414
},
{
"epoch": 0.6080836053143441,
"grad_norm": 0.8260155916213989,
"learning_rate": 1.8509681995832294e-05,
"loss": 1.4317,
"step": 18445
},
{
"epoch": 0.6091055945669732,
"grad_norm": 0.8101741075515747,
"learning_rate": 1.8428615871617004e-05,
"loss": 1.4341,
"step": 18476
},
{
"epoch": 0.6101275838196024,
"grad_norm": 0.8563366532325745,
"learning_rate": 1.8347623904578448e-05,
"loss": 1.4432,
"step": 18507
},
{
"epoch": 0.6111495730722316,
"grad_norm": 0.7924138307571411,
"learning_rate": 1.8266707008699975e-05,
"loss": 1.4341,
"step": 18538
},
{
"epoch": 0.6121715623248607,
"grad_norm": 0.7919387221336365,
"learning_rate": 1.818586609711774e-05,
"loss": 1.4613,
"step": 18569
},
{
"epoch": 0.6131935515774899,
"grad_norm": 0.7915093302726746,
"learning_rate": 1.8105102082110462e-05,
"loss": 1.4332,
"step": 18600
},
{
"epoch": 0.614215540830119,
"grad_norm": 0.7878425717353821,
"learning_rate": 1.8024415875089058e-05,
"loss": 1.4372,
"step": 18631
},
{
"epoch": 0.6152375300827482,
"grad_norm": 0.7944186329841614,
"learning_rate": 1.7943808386586407e-05,
"loss": 1.431,
"step": 18662
},
{
"epoch": 0.6162595193353773,
"grad_norm": 0.7792430520057678,
"learning_rate": 1.7863280526247073e-05,
"loss": 1.43,
"step": 18693
},
{
"epoch": 0.6172815085880065,
"grad_norm": 0.838062047958374,
"learning_rate": 1.7782833202817003e-05,
"loss": 1.4377,
"step": 18724
},
{
"epoch": 0.6183034978406357,
"grad_norm": 0.8279454112052917,
"learning_rate": 1.7702467324133327e-05,
"loss": 1.4248,
"step": 18755
},
{
"epoch": 0.6193254870932647,
"grad_norm": 0.8460195064544678,
"learning_rate": 1.7622183797114042e-05,
"loss": 1.4058,
"step": 18786
},
{
"epoch": 0.6203474763458939,
"grad_norm": 0.8065218925476074,
"learning_rate": 1.7541983527747838e-05,
"loss": 1.419,
"step": 18817
},
{
"epoch": 0.621369465598523,
"grad_norm": 0.8114182353019714,
"learning_rate": 1.746186742108387e-05,
"loss": 1.4305,
"step": 18848
},
{
"epoch": 0.6223914548511522,
"grad_norm": 0.8157764673233032,
"learning_rate": 1.73818363812215e-05,
"loss": 1.4362,
"step": 18879
},
{
"epoch": 0.6234134441037814,
"grad_norm": 0.8182953596115112,
"learning_rate": 1.7301891311300153e-05,
"loss": 1.4292,
"step": 18910
},
{
"epoch": 0.6244354333564105,
"grad_norm": 0.8444055914878845,
"learning_rate": 1.7222033113489055e-05,
"loss": 1.4339,
"step": 18941
},
{
"epoch": 0.6254574226090397,
"grad_norm": 0.8451635837554932,
"learning_rate": 1.7142262688977127e-05,
"loss": 1.4272,
"step": 18972
},
{
"epoch": 0.6264794118616688,
"grad_norm": 0.7921797633171082,
"learning_rate": 1.7062580937962764e-05,
"loss": 1.4249,
"step": 19003
},
{
"epoch": 0.627501401114298,
"grad_norm": 0.8066070675849915,
"learning_rate": 1.698298875964369e-05,
"loss": 1.422,
"step": 19034
},
{
"epoch": 0.6285233903669271,
"grad_norm": 0.7984006404876709,
"learning_rate": 1.690348705220684e-05,
"loss": 1.4279,
"step": 19065
},
{
"epoch": 0.6295453796195563,
"grad_norm": 0.8582960367202759,
"learning_rate": 1.6824076712818156e-05,
"loss": 1.4224,
"step": 19096
},
{
"epoch": 0.6305673688721855,
"grad_norm": 0.8283519148826599,
"learning_rate": 1.6744758637612533e-05,
"loss": 1.4509,
"step": 19127
},
{
"epoch": 0.6315893581248145,
"grad_norm": 0.8150811791419983,
"learning_rate": 1.6665533721683664e-05,
"loss": 1.4216,
"step": 19158
},
{
"epoch": 0.6326113473774437,
"grad_norm": 0.8012325167655945,
"learning_rate": 1.6586402859073974e-05,
"loss": 1.4263,
"step": 19189
},
{
"epoch": 0.6336333366300728,
"grad_norm": 0.8255190849304199,
"learning_rate": 1.6507366942764463e-05,
"loss": 1.432,
"step": 19220
},
{
"epoch": 0.634655325882702,
"grad_norm": 0.8162024021148682,
"learning_rate": 1.6428426864664732e-05,
"loss": 1.4308,
"step": 19251
},
{
"epoch": 0.6356773151353312,
"grad_norm": 0.9059862494468689,
"learning_rate": 1.6349583515602816e-05,
"loss": 1.4258,
"step": 19282
},
{
"epoch": 0.6366993043879603,
"grad_norm": 0.8172095417976379,
"learning_rate": 1.6270837785315208e-05,
"loss": 1.426,
"step": 19313
},
{
"epoch": 0.6377212936405895,
"grad_norm": 0.8332613110542297,
"learning_rate": 1.619219056243676e-05,
"loss": 1.4275,
"step": 19344
},
{
"epoch": 0.6387432828932186,
"grad_norm": 0.8458919525146484,
"learning_rate": 1.6113642734490698e-05,
"loss": 1.4133,
"step": 19375
},
{
"epoch": 0.6397652721458478,
"grad_norm": 0.8014411926269531,
"learning_rate": 1.6035195187878577e-05,
"loss": 1.4323,
"step": 19406
},
{
"epoch": 0.6407872613984769,
"grad_norm": 0.8090296387672424,
"learning_rate": 1.5956848807870305e-05,
"loss": 1.4314,
"step": 19437
},
{
"epoch": 0.6418092506511061,
"grad_norm": 0.8195688128471375,
"learning_rate": 1.587860447859413e-05,
"loss": 1.4371,
"step": 19468
},
{
"epoch": 0.6428312399037353,
"grad_norm": 0.7982098460197449,
"learning_rate": 1.5800463083026686e-05,
"loss": 1.4314,
"step": 19499
},
{
"epoch": 0.6438532291563643,
"grad_norm": 0.8345074653625488,
"learning_rate": 1.572242550298298e-05,
"loss": 1.4289,
"step": 19530
},
{
"epoch": 0.6448752184089935,
"grad_norm": 0.8276827335357666,
"learning_rate": 1.56444926191065e-05,
"loss": 1.4358,
"step": 19561
},
{
"epoch": 0.6458972076616226,
"grad_norm": 0.8199188709259033,
"learning_rate": 1.5566665310859257e-05,
"loss": 1.4291,
"step": 19592
},
{
"epoch": 0.6469191969142518,
"grad_norm": 0.8202061057090759,
"learning_rate": 1.5488944456511846e-05,
"loss": 1.4319,
"step": 19623
},
{
"epoch": 0.6479411861668809,
"grad_norm": 0.8072242140769958,
"learning_rate": 1.5411330933133546e-05,
"loss": 1.428,
"step": 19654
},
{
"epoch": 0.6489631754195101,
"grad_norm": 0.8337129354476929,
"learning_rate": 1.533382561658241e-05,
"loss": 1.4317,
"step": 19685
},
{
"epoch": 0.6499851646721393,
"grad_norm": 0.8070532083511353,
"learning_rate": 1.525642938149541e-05,
"loss": 1.4263,
"step": 19716
},
{
"epoch": 0.6510071539247684,
"grad_norm": 0.7998501658439636,
"learning_rate": 1.5179143101278536e-05,
"loss": 1.4231,
"step": 19747
},
{
"epoch": 0.6520291431773976,
"grad_norm": 0.7996141314506531,
"learning_rate": 1.5101967648096955e-05,
"loss": 1.4295,
"step": 19778
},
{
"epoch": 0.6530511324300267,
"grad_norm": 0.8256941437721252,
"learning_rate": 1.5024903892865172e-05,
"loss": 1.4426,
"step": 19809
},
{
"epoch": 0.6540731216826559,
"grad_norm": 0.8443533182144165,
"learning_rate": 1.4947952705237184e-05,
"loss": 1.4206,
"step": 19840
},
{
"epoch": 0.655095110935285,
"grad_norm": 0.8303670287132263,
"learning_rate": 1.4871114953596682e-05,
"loss": 1.4442,
"step": 19871
},
{
"epoch": 0.6561171001879141,
"grad_norm": 0.8437788486480713,
"learning_rate": 1.4794391505047256e-05,
"loss": 1.4321,
"step": 19902
},
{
"epoch": 0.6571390894405433,
"grad_norm": 0.8075599670410156,
"learning_rate": 1.4717783225402596e-05,
"loss": 1.4304,
"step": 19933
},
{
"epoch": 0.6581610786931724,
"grad_norm": 0.8010841608047485,
"learning_rate": 1.4641290979176735e-05,
"loss": 1.4357,
"step": 19964
},
{
"epoch": 0.6591830679458016,
"grad_norm": 0.8034616112709045,
"learning_rate": 1.4564915629574246e-05,
"loss": 1.4194,
"step": 19995
},
{
"epoch": 0.6602050571984307,
"grad_norm": 0.8247554302215576,
"learning_rate": 1.4488658038480601e-05,
"loss": 1.4316,
"step": 20026
},
{
"epoch": 0.6612270464510599,
"grad_norm": 0.8289808034896851,
"learning_rate": 1.4412519066452323e-05,
"loss": 1.4277,
"step": 20057
},
{
"epoch": 0.6622490357036891,
"grad_norm": 0.8197374939918518,
"learning_rate": 1.4336499572707373e-05,
"loss": 1.421,
"step": 20088
},
{
"epoch": 0.6632710249563182,
"grad_norm": 0.832967221736908,
"learning_rate": 1.4260600415115433e-05,
"loss": 1.4165,
"step": 20119
},
{
"epoch": 0.6642930142089474,
"grad_norm": 0.8057476282119751,
"learning_rate": 1.4184822450188137e-05,
"loss": 1.4348,
"step": 20150
},
{
"epoch": 0.6653150034615765,
"grad_norm": 0.8722820281982422,
"learning_rate": 1.410916653306954e-05,
"loss": 1.4171,
"step": 20181
},
{
"epoch": 0.6663369927142057,
"grad_norm": 0.8184958696365356,
"learning_rate": 1.403363351752639e-05,
"loss": 1.4232,
"step": 20212
},
{
"epoch": 0.6673589819668349,
"grad_norm": 0.828702449798584,
"learning_rate": 1.3958224255938485e-05,
"loss": 1.4066,
"step": 20243
},
{
"epoch": 0.6683809712194639,
"grad_norm": 0.826805055141449,
"learning_rate": 1.388293959928911e-05,
"loss": 1.4177,
"step": 20274
},
{
"epoch": 0.6694029604720931,
"grad_norm": 0.8016971349716187,
"learning_rate": 1.3807780397155379e-05,
"loss": 1.4298,
"step": 20305
},
{
"epoch": 0.6704249497247222,
"grad_norm": 0.8369885683059692,
"learning_rate": 1.3732747497698655e-05,
"loss": 1.4286,
"step": 20336
},
{
"epoch": 0.6714469389773514,
"grad_norm": 0.8385952115058899,
"learning_rate": 1.3657841747655038e-05,
"loss": 1.4056,
"step": 20367
},
{
"epoch": 0.6724689282299805,
"grad_norm": 0.7958812713623047,
"learning_rate": 1.3583063992325706e-05,
"loss": 1.4153,
"step": 20398
},
{
"epoch": 0.6734909174826097,
"grad_norm": 0.815703809261322,
"learning_rate": 1.3508415075567496e-05,
"loss": 1.4171,
"step": 20429
},
{
"epoch": 0.6745129067352389,
"grad_norm": 0.8329752087593079,
"learning_rate": 1.343389583978327e-05,
"loss": 1.433,
"step": 20460
},
{
"epoch": 0.675534895987868,
"grad_norm": 0.8523135185241699,
"learning_rate": 1.3359507125912468e-05,
"loss": 1.4223,
"step": 20491
},
{
"epoch": 0.6765568852404972,
"grad_norm": 0.835186243057251,
"learning_rate": 1.3285249773421627e-05,
"loss": 1.4194,
"step": 20522
},
{
"epoch": 0.6775788744931263,
"grad_norm": 0.8810424208641052,
"learning_rate": 1.3211124620294884e-05,
"loss": 1.4261,
"step": 20553
},
{
"epoch": 0.6786008637457555,
"grad_norm": 0.841140627861023,
"learning_rate": 1.313713250302451e-05,
"loss": 1.4256,
"step": 20584
},
{
"epoch": 0.6796228529983847,
"grad_norm": 1.0968244075775146,
"learning_rate": 1.3063274256601479e-05,
"loss": 1.4261,
"step": 20615
},
{
"epoch": 0.6806448422510137,
"grad_norm": 0.8303719162940979,
"learning_rate": 1.2989550714506086e-05,
"loss": 1.4134,
"step": 20646
},
{
"epoch": 0.6816668315036429,
"grad_norm": 0.8236658573150635,
"learning_rate": 1.291596270869846e-05,
"loss": 1.4236,
"step": 20677
},
{
"epoch": 0.682688820756272,
"grad_norm": 0.8341703414916992,
"learning_rate": 1.284251106960927e-05,
"loss": 1.4296,
"step": 20708
},
{
"epoch": 0.6837108100089012,
"grad_norm": 0.8321917057037354,
"learning_rate": 1.2769196626130263e-05,
"loss": 1.4403,
"step": 20739
},
{
"epoch": 0.6847327992615303,
"grad_norm": 0.81299889087677,
"learning_rate": 1.2696020205604969e-05,
"loss": 1.4295,
"step": 20770
},
{
"epoch": 0.6857547885141595,
"grad_norm": 0.8226946592330933,
"learning_rate": 1.2622982633819359e-05,
"loss": 1.4248,
"step": 20801
},
{
"epoch": 0.6867767777667887,
"grad_norm": 0.8110581040382385,
"learning_rate": 1.2550084734992484e-05,
"loss": 1.4173,
"step": 20832
},
{
"epoch": 0.6877987670194178,
"grad_norm": 0.8225431442260742,
"learning_rate": 1.247732733176724e-05,
"loss": 1.4294,
"step": 20863
},
{
"epoch": 0.688820756272047,
"grad_norm": 0.8062965273857117,
"learning_rate": 1.2404711245201044e-05,
"loss": 1.4381,
"step": 20894
},
{
"epoch": 0.6898427455246761,
"grad_norm": 0.8408602476119995,
"learning_rate": 1.2332237294756535e-05,
"loss": 1.4233,
"step": 20925
},
{
"epoch": 0.6908647347773053,
"grad_norm": 0.8272174000740051,
"learning_rate": 1.225990629829241e-05,
"loss": 1.4153,
"step": 20956
},
{
"epoch": 0.6918867240299343,
"grad_norm": 0.8655986785888672,
"learning_rate": 1.2187719072054136e-05,
"loss": 1.4211,
"step": 20987
},
{
"epoch": 0.6929087132825635,
"grad_norm": 0.8234033584594727,
"learning_rate": 1.2115676430664735e-05,
"loss": 1.4125,
"step": 21018
},
{
"epoch": 0.6939307025351927,
"grad_norm": 0.812782883644104,
"learning_rate": 1.2043779187115647e-05,
"loss": 1.4384,
"step": 21049
},
{
"epoch": 0.6949526917878218,
"grad_norm": 0.8280140161514282,
"learning_rate": 1.1972028152757476e-05,
"loss": 1.4291,
"step": 21080
},
{
"epoch": 0.695974681040451,
"grad_norm": 0.8197391629219055,
"learning_rate": 1.1900424137290889e-05,
"loss": 1.418,
"step": 21111
},
{
"epoch": 0.6969966702930801,
"grad_norm": 0.8022619485855103,
"learning_rate": 1.1828967948757482e-05,
"loss": 1.421,
"step": 21142
},
{
"epoch": 0.6980186595457093,
"grad_norm": 0.8374767303466797,
"learning_rate": 1.175766039353062e-05,
"loss": 1.4228,
"step": 21173
},
{
"epoch": 0.6990406487983385,
"grad_norm": 0.8427351117134094,
"learning_rate": 1.1686502276306382e-05,
"loss": 1.4456,
"step": 21204
},
{
"epoch": 0.7000626380509676,
"grad_norm": 0.8131812214851379,
"learning_rate": 1.1615494400094445e-05,
"loss": 1.4265,
"step": 21235
},
{
"epoch": 0.7010846273035968,
"grad_norm": 0.7899516224861145,
"learning_rate": 1.1544637566209029e-05,
"loss": 1.4189,
"step": 21266
},
{
"epoch": 0.7021066165562259,
"grad_norm": 0.802736222743988,
"learning_rate": 1.1473932574259886e-05,
"loss": 1.4245,
"step": 21297
},
{
"epoch": 0.7031286058088551,
"grad_norm": 0.825494647026062,
"learning_rate": 1.1403380222143247e-05,
"loss": 1.4246,
"step": 21328
},
{
"epoch": 0.7041505950614841,
"grad_norm": 0.8523460626602173,
"learning_rate": 1.1332981306032808e-05,
"loss": 1.4244,
"step": 21359
},
{
"epoch": 0.7051725843141133,
"grad_norm": 0.7921078205108643,
"learning_rate": 1.1262736620370762e-05,
"loss": 1.4,
"step": 21390
},
{
"epoch": 0.7061945735667425,
"grad_norm": 0.833227813243866,
"learning_rate": 1.1192646957858854e-05,
"loss": 1.4373,
"step": 21421
},
{
"epoch": 0.7072165628193716,
"grad_norm": 0.8187243938446045,
"learning_rate": 1.1122713109449381e-05,
"loss": 1.41,
"step": 21452
},
{
"epoch": 0.7082385520720008,
"grad_norm": 0.8406710028648376,
"learning_rate": 1.105293586433634e-05,
"loss": 1.4207,
"step": 21483
},
{
"epoch": 0.7092605413246299,
"grad_norm": 0.8350772857666016,
"learning_rate": 1.0983316009946446e-05,
"loss": 1.4088,
"step": 21514
},
{
"epoch": 0.7102825305772591,
"grad_norm": 0.8365492820739746,
"learning_rate": 1.0913854331930282e-05,
"loss": 1.4168,
"step": 21545
},
{
"epoch": 0.7113045198298883,
"grad_norm": 0.822426974773407,
"learning_rate": 1.0844551614153456e-05,
"loss": 1.397,
"step": 21576
},
{
"epoch": 0.7123265090825174,
"grad_norm": 0.8318968415260315,
"learning_rate": 1.0775408638687725e-05,
"loss": 1.4036,
"step": 21607
},
{
"epoch": 0.7133484983351466,
"grad_norm": 0.8235137462615967,
"learning_rate": 1.0706426185802165e-05,
"loss": 1.429,
"step": 21638
},
{
"epoch": 0.7143704875877757,
"grad_norm": 0.8110867738723755,
"learning_rate": 1.0637605033954371e-05,
"loss": 1.4002,
"step": 21669
},
{
"epoch": 0.7153924768404049,
"grad_norm": 0.8007574081420898,
"learning_rate": 1.05689459597817e-05,
"loss": 1.4224,
"step": 21700
},
{
"epoch": 0.7164144660930339,
"grad_norm": 0.802810549736023,
"learning_rate": 1.050044973809246e-05,
"loss": 1.4197,
"step": 21731
},
{
"epoch": 0.7174364553456631,
"grad_norm": 0.8145677447319031,
"learning_rate": 1.043211714185722e-05,
"loss": 1.4084,
"step": 21762
},
{
"epoch": 0.7184584445982923,
"grad_norm": 0.8142935037612915,
"learning_rate": 1.036394894220003e-05,
"loss": 1.4101,
"step": 21793
},
{
"epoch": 0.7194804338509214,
"grad_norm": 0.8112974166870117,
"learning_rate": 1.0295945908389751e-05,
"loss": 1.4229,
"step": 21824
},
{
"epoch": 0.7205024231035506,
"grad_norm": 0.8134146332740784,
"learning_rate": 1.0228108807831393e-05,
"loss": 1.4103,
"step": 21855
},
{
"epoch": 0.7215244123561797,
"grad_norm": 0.8121820092201233,
"learning_rate": 1.01604384060574e-05,
"loss": 1.4103,
"step": 21886
},
{
"epoch": 0.7225464016088089,
"grad_norm": 0.8066170811653137,
"learning_rate": 1.009293546671907e-05,
"loss": 1.3994,
"step": 21917
},
{
"epoch": 0.7235683908614381,
"grad_norm": 0.8202908635139465,
"learning_rate": 1.002560075157791e-05,
"loss": 1.4099,
"step": 21948
},
{
"epoch": 0.7245903801140672,
"grad_norm": 0.8023785352706909,
"learning_rate": 9.958435020496995e-06,
"loss": 1.4208,
"step": 21979
},
{
"epoch": 0.7256123693666964,
"grad_norm": 0.8268675804138184,
"learning_rate": 9.89143903143249e-06,
"loss": 1.4068,
"step": 22010
},
{
"epoch": 0.7266343586193255,
"grad_norm": 0.8300354480743408,
"learning_rate": 9.824613540425038e-06,
"loss": 1.4195,
"step": 22041
},
{
"epoch": 0.7276563478719547,
"grad_norm": 0.8289726376533508,
"learning_rate": 9.757959301591197e-06,
"loss": 1.4189,
"step": 22072
},
{
"epoch": 0.7286783371245837,
"grad_norm": 0.8267973065376282,
"learning_rate": 9.691477067115017e-06,
"loss": 1.4047,
"step": 22103
},
{
"epoch": 0.7297003263772129,
"grad_norm": 0.8307402729988098,
"learning_rate": 9.625167587239467e-06,
"loss": 1.4101,
"step": 22134
},
{
"epoch": 0.7307223156298421,
"grad_norm": 0.8467045426368713,
"learning_rate": 9.559031610258007e-06,
"loss": 1.4119,
"step": 22165
},
{
"epoch": 0.7317443048824712,
"grad_norm": 0.842170774936676,
"learning_rate": 9.493069882506164e-06,
"loss": 1.408,
"step": 22196
},
{
"epoch": 0.7327662941351004,
"grad_norm": 0.80977863073349,
"learning_rate": 9.427283148353056e-06,
"loss": 1.4115,
"step": 22227
},
{
"epoch": 0.7337882833877295,
"grad_norm": 0.8142424821853638,
"learning_rate": 9.361672150193052e-06,
"loss": 1.4188,
"step": 22258
},
{
"epoch": 0.7348102726403587,
"grad_norm": 0.8256626725196838,
"learning_rate": 9.29623762843734e-06,
"loss": 1.4097,
"step": 22289
},
{
"epoch": 0.7358322618929878,
"grad_norm": 0.9075655341148376,
"learning_rate": 9.230980321505594e-06,
"loss": 1.412,
"step": 22320
},
{
"epoch": 0.736854251145617,
"grad_norm": 0.8277102708816528,
"learning_rate": 9.165900965817668e-06,
"loss": 1.3973,
"step": 22351
},
{
"epoch": 0.7378762403982462,
"grad_norm": 0.8417026400566101,
"learning_rate": 9.101000295785245e-06,
"loss": 1.4091,
"step": 22382
},
{
"epoch": 0.7388982296508753,
"grad_norm": 0.8252837657928467,
"learning_rate": 9.036279043803565e-06,
"loss": 1.4057,
"step": 22413
},
{
"epoch": 0.7399202189035045,
"grad_norm": 0.8327083587646484,
"learning_rate": 8.971737940243147e-06,
"loss": 1.4044,
"step": 22444
},
{
"epoch": 0.7409422081561335,
"grad_norm": 0.8233001232147217,
"learning_rate": 8.907377713441592e-06,
"loss": 1.4142,
"step": 22475
},
{
"epoch": 0.7419641974087627,
"grad_norm": 0.7973216772079468,
"learning_rate": 8.843199089695293e-06,
"loss": 1.4074,
"step": 22506
},
{
"epoch": 0.7429861866613919,
"grad_norm": 0.8216118812561035,
"learning_rate": 8.779202793251311e-06,
"loss": 1.3943,
"step": 22537
},
{
"epoch": 0.744008175914021,
"grad_norm": 0.8081651329994202,
"learning_rate": 8.715389546299149e-06,
"loss": 1.4008,
"step": 22568
},
{
"epoch": 0.7450301651666502,
"grad_norm": 0.8267298340797424,
"learning_rate": 8.651760068962617e-06,
"loss": 1.4318,
"step": 22599
},
{
"epoch": 0.7460521544192793,
"grad_norm": 0.8131210803985596,
"learning_rate": 8.588315079291733e-06,
"loss": 1.3889,
"step": 22630
},
{
"epoch": 0.7470741436719085,
"grad_norm": 0.8630332350730896,
"learning_rate": 8.52505529325457e-06,
"loss": 1.4196,
"step": 22661
},
{
"epoch": 0.7480961329245376,
"grad_norm": 0.8226453065872192,
"learning_rate": 8.461981424729216e-06,
"loss": 1.4265,
"step": 22692
},
{
"epoch": 0.7491181221771668,
"grad_norm": 0.8107773661613464,
"learning_rate": 8.399094185495725e-06,
"loss": 1.4202,
"step": 22723
},
{
"epoch": 0.750140111429796,
"grad_norm": 0.8276836276054382,
"learning_rate": 8.336394285228017e-06,
"loss": 1.4148,
"step": 22754
},
{
"epoch": 0.7511621006824251,
"grad_norm": 0.823788583278656,
"learning_rate": 8.273882431485952e-06,
"loss": 1.404,
"step": 22785
},
{
"epoch": 0.7521840899350543,
"grad_norm": 0.8203224539756775,
"learning_rate": 8.211559329707316e-06,
"loss": 1.4057,
"step": 22816
},
{
"epoch": 0.7532060791876833,
"grad_norm": 0.8357372283935547,
"learning_rate": 8.149425683199823e-06,
"loss": 1.4132,
"step": 22847
},
{
"epoch": 0.7542280684403125,
"grad_norm": 0.8468825221061707,
"learning_rate": 8.08748219313325e-06,
"loss": 1.4009,
"step": 22878
},
{
"epoch": 0.7552500576929417,
"grad_norm": 0.9028123617172241,
"learning_rate": 8.025729558531453e-06,
"loss": 1.4324,
"step": 22909
},
{
"epoch": 0.7562720469455708,
"grad_norm": 0.8219141960144043,
"learning_rate": 7.964168476264508e-06,
"loss": 1.4122,
"step": 22940
},
{
"epoch": 0.7572940361982,
"grad_norm": 0.8215070366859436,
"learning_rate": 7.902799641040884e-06,
"loss": 1.417,
"step": 22971
},
{
"epoch": 0.7583160254508291,
"grad_norm": 0.8290405869483948,
"learning_rate": 7.841623745399523e-06,
"loss": 1.4146,
"step": 23002
},
{
"epoch": 0.7593380147034583,
"grad_norm": 0.8678696751594543,
"learning_rate": 7.780641479702114e-06,
"loss": 1.42,
"step": 23033
},
{
"epoch": 0.7603600039560874,
"grad_norm": 0.8347179889678955,
"learning_rate": 7.719853532125227e-06,
"loss": 1.409,
"step": 23064
},
{
"epoch": 0.7613819932087166,
"grad_norm": 0.8424118757247925,
"learning_rate": 7.65926058865258e-06,
"loss": 1.4034,
"step": 23095
},
{
"epoch": 0.7624039824613458,
"grad_norm": 0.8058504462242126,
"learning_rate": 7.598863333067313e-06,
"loss": 1.3934,
"step": 23126
},
{
"epoch": 0.7634259717139749,
"grad_norm": 0.8032201528549194,
"learning_rate": 7.538662446944253e-06,
"loss": 1.4358,
"step": 23157
},
{
"epoch": 0.7644479609666041,
"grad_norm": 0.8517287373542786,
"learning_rate": 7.478658609642211e-06,
"loss": 1.4018,
"step": 23188
},
{
"epoch": 0.7654699502192331,
"grad_norm": 0.8032835125923157,
"learning_rate": 7.418852498296327e-06,
"loss": 1.4139,
"step": 23219
},
{
"epoch": 0.7664919394718623,
"grad_norm": 0.806354820728302,
"learning_rate": 7.359244787810457e-06,
"loss": 1.405,
"step": 23250
},
{
"epoch": 0.7675139287244914,
"grad_norm": 0.8073768615722656,
"learning_rate": 7.299836150849493e-06,
"loss": 1.4082,
"step": 23281
},
{
"epoch": 0.7685359179771206,
"grad_norm": 0.8103781938552856,
"learning_rate": 7.240627257831847e-06,
"loss": 1.3994,
"step": 23312
},
{
"epoch": 0.7695579072297498,
"grad_norm": 0.8298788666725159,
"learning_rate": 7.1816187769218195e-06,
"loss": 1.3971,
"step": 23343
},
{
"epoch": 0.7705798964823789,
"grad_norm": 0.8138905763626099,
"learning_rate": 7.1228113740220895e-06,
"loss": 1.3956,
"step": 23374
},
{
"epoch": 0.7716018857350081,
"grad_norm": 0.8347598314285278,
"learning_rate": 7.064205712766226e-06,
"loss": 1.4,
"step": 23405
},
{
"epoch": 0.7726238749876372,
"grad_norm": 0.8050975799560547,
"learning_rate": 7.005802454511129e-06,
"loss": 1.4171,
"step": 23436
},
{
"epoch": 0.7736458642402664,
"grad_norm": 0.8573086261749268,
"learning_rate": 6.947602258329639e-06,
"loss": 1.4181,
"step": 23467
},
{
"epoch": 0.7746678534928956,
"grad_norm": 0.7980222105979919,
"learning_rate": 6.889605781003078e-06,
"loss": 1.423,
"step": 23498
},
{
"epoch": 0.7756898427455247,
"grad_norm": 0.7984530329704285,
"learning_rate": 6.831813677013776e-06,
"loss": 1.4164,
"step": 23529
},
{
"epoch": 0.7767118319981539,
"grad_norm": 0.838580846786499,
"learning_rate": 6.774226598537792e-06,
"loss": 1.4215,
"step": 23560
},
{
"epoch": 0.7777338212507829,
"grad_norm": 0.8243645429611206,
"learning_rate": 6.716845195437482e-06,
"loss": 1.4311,
"step": 23591
},
{
"epoch": 0.7787558105034121,
"grad_norm": 0.8245717883110046,
"learning_rate": 6.659670115254168e-06,
"loss": 1.4217,
"step": 23622
},
{
"epoch": 0.7797777997560412,
"grad_norm": 0.8594714999198914,
"learning_rate": 6.602702003200872e-06,
"loss": 1.4103,
"step": 23653
},
{
"epoch": 0.7807997890086704,
"grad_norm": 0.8197819590568542,
"learning_rate": 6.545941502154992e-06,
"loss": 1.3962,
"step": 23684
},
{
"epoch": 0.7818217782612996,
"grad_norm": 0.8505415320396423,
"learning_rate": 6.489389252651057e-06,
"loss": 1.415,
"step": 23715
},
{
"epoch": 0.7828437675139287,
"grad_norm": 0.8418869376182556,
"learning_rate": 6.4330458928735325e-06,
"loss": 1.395,
"step": 23746
},
{
"epoch": 0.7838657567665579,
"grad_norm": 0.8336290717124939,
"learning_rate": 6.376912058649559e-06,
"loss": 1.4035,
"step": 23777
},
{
"epoch": 0.784887746019187,
"grad_norm": 0.8226044178009033,
"learning_rate": 6.320988383441845e-06,
"loss": 1.4041,
"step": 23808
},
{
"epoch": 0.7859097352718162,
"grad_norm": 0.821552038192749,
"learning_rate": 6.265275498341452e-06,
"loss": 1.3928,
"step": 23839
},
{
"epoch": 0.7869317245244454,
"grad_norm": 0.862799882888794,
"learning_rate": 6.209774032060714e-06,
"loss": 1.4082,
"step": 23870
},
{
"epoch": 0.7879537137770745,
"grad_norm": 0.8196310997009277,
"learning_rate": 6.1544846109261365e-06,
"loss": 1.41,
"step": 23901
},
{
"epoch": 0.7889757030297037,
"grad_norm": 0.8420688509941101,
"learning_rate": 6.099407858871342e-06,
"loss": 1.4144,
"step": 23932
},
{
"epoch": 0.7899976922823327,
"grad_norm": 0.8264843225479126,
"learning_rate": 6.044544397429958e-06,
"loss": 1.4082,
"step": 23963
},
{
"epoch": 0.7910196815349619,
"grad_norm": 0.8335662484169006,
"learning_rate": 5.989894845728708e-06,
"loss": 1.392,
"step": 23994
},
{
"epoch": 0.792041670787591,
"grad_norm": 0.8109620213508606,
"learning_rate": 5.9354598204803605e-06,
"loss": 1.4103,
"step": 24025
},
{
"epoch": 0.7930636600402202,
"grad_norm": 0.8195939064025879,
"learning_rate": 5.881239935976762e-06,
"loss": 1.4049,
"step": 24056
},
{
"epoch": 0.7940856492928494,
"grad_norm": 0.8132045865058899,
"learning_rate": 5.827235804081954e-06,
"loss": 1.4137,
"step": 24087
},
{
"epoch": 0.7951076385454785,
"grad_norm": 0.8163465261459351,
"learning_rate": 5.773448034225221e-06,
"loss": 1.3913,
"step": 24118
},
{
"epoch": 0.7961296277981077,
"grad_norm": 0.8209108710289001,
"learning_rate": 5.719877233394228e-06,
"loss": 1.3912,
"step": 24149
},
{
"epoch": 0.7971516170507368,
"grad_norm": 0.8075419664382935,
"learning_rate": 5.666524006128191e-06,
"loss": 1.3968,
"step": 24180
},
{
"epoch": 0.798173606303366,
"grad_norm": 0.8444731831550598,
"learning_rate": 5.613388954511015e-06,
"loss": 1.3818,
"step": 24211
},
{
"epoch": 0.7991955955559952,
"grad_norm": 0.8630134463310242,
"learning_rate": 5.560472678164552e-06,
"loss": 1.4201,
"step": 24242
},
{
"epoch": 0.8002175848086243,
"grad_norm": 0.8386521339416504,
"learning_rate": 5.507775774241775e-06,
"loss": 1.3984,
"step": 24273
},
{
"epoch": 0.8012395740612535,
"grad_norm": 0.8426861763000488,
"learning_rate": 5.4552988374200945e-06,
"loss": 1.4069,
"step": 24304
},
{
"epoch": 0.8022615633138825,
"grad_norm": 0.7985621690750122,
"learning_rate": 5.403042459894597e-06,
"loss": 1.4115,
"step": 24335
},
{
"epoch": 0.8032835525665117,
"grad_norm": 0.8200878500938416,
"learning_rate": 5.3510072313714135e-06,
"loss": 1.4055,
"step": 24366
},
{
"epoch": 0.8043055418191408,
"grad_norm": 0.816440761089325,
"learning_rate": 5.2991937390610205e-06,
"loss": 1.4062,
"step": 24397
},
{
"epoch": 0.80532753107177,
"grad_norm": 0.8369777798652649,
"learning_rate": 5.247602567671625e-06,
"loss": 1.3964,
"step": 24428
},
{
"epoch": 0.8063495203243992,
"grad_norm": 0.8133443593978882,
"learning_rate": 5.196234299402603e-06,
"loss": 1.4207,
"step": 24459
},
{
"epoch": 0.8073715095770283,
"grad_norm": 0.8752077221870422,
"learning_rate": 5.145089513937865e-06,
"loss": 1.3898,
"step": 24490
},
{
"epoch": 0.8083934988296575,
"grad_norm": 0.8062163591384888,
"learning_rate": 5.094168788439369e-06,
"loss": 1.4005,
"step": 24521
},
{
"epoch": 0.8094154880822866,
"grad_norm": 0.8645463585853577,
"learning_rate": 5.043472697540594e-06,
"loss": 1.3918,
"step": 24552
},
{
"epoch": 0.8104374773349158,
"grad_norm": 0.8500407934188843,
"learning_rate": 4.993001813340012e-06,
"loss": 1.4103,
"step": 24583
},
{
"epoch": 0.8114594665875449,
"grad_norm": 0.8387671113014221,
"learning_rate": 4.942756705394702e-06,
"loss": 1.4072,
"step": 24614
},
{
"epoch": 0.8124814558401741,
"grad_norm": 0.8170669674873352,
"learning_rate": 4.892737940713884e-06,
"loss": 1.3783,
"step": 24645
},
{
"epoch": 0.8135034450928033,
"grad_norm": 0.873763382434845,
"learning_rate": 4.842946083752511e-06,
"loss": 1.3995,
"step": 24676
},
{
"epoch": 0.8145254343454323,
"grad_norm": 0.8309612274169922,
"learning_rate": 4.79338169640493e-06,
"loss": 1.4044,
"step": 24707
},
{
"epoch": 0.8155474235980615,
"grad_norm": 0.8278691172599792,
"learning_rate": 4.74404533799851e-06,
"loss": 1.4097,
"step": 24738
},
{
"epoch": 0.8165694128506906,
"grad_norm": 0.7878281474113464,
"learning_rate": 4.694937565287344e-06,
"loss": 1.4075,
"step": 24769
},
{
"epoch": 0.8175914021033198,
"grad_norm": 0.8145809769630432,
"learning_rate": 4.646058932445985e-06,
"loss": 1.4032,
"step": 24800
},
{
"epoch": 0.818613391355949,
"grad_norm": 0.8413559794425964,
"learning_rate": 4.597409991063148e-06,
"loss": 1.4097,
"step": 24831
},
{
"epoch": 0.8196353806085781,
"grad_norm": 0.8196817636489868,
"learning_rate": 4.5489912901355375e-06,
"loss": 1.3989,
"step": 24862
},
{
"epoch": 0.8206573698612073,
"grad_norm": 0.8410577774047852,
"learning_rate": 4.500803376061608e-06,
"loss": 1.4059,
"step": 24893
},
{
"epoch": 0.8216793591138364,
"grad_norm": 0.8097809553146362,
"learning_rate": 4.45284679263541e-06,
"loss": 1.3922,
"step": 24924
},
{
"epoch": 0.8227013483664656,
"grad_norm": 0.8499380350112915,
"learning_rate": 4.4051220810404775e-06,
"loss": 1.4004,
"step": 24955
},
{
"epoch": 0.8237233376190947,
"grad_norm": 0.8219107985496521,
"learning_rate": 4.3576297798437025e-06,
"loss": 1.3987,
"step": 24986
},
{
"epoch": 0.8247453268717239,
"grad_norm": 0.850868284702301,
"learning_rate": 4.3103704249892436e-06,
"loss": 1.3938,
"step": 25017
},
{
"epoch": 0.8257673161243531,
"grad_norm": 0.8800864815711975,
"learning_rate": 4.263344549792487e-06,
"loss": 1.4216,
"step": 25048
},
{
"epoch": 0.8267893053769821,
"grad_norm": 0.8150456547737122,
"learning_rate": 4.216552684934056e-06,
"loss": 1.4157,
"step": 25079
},
{
"epoch": 0.8278112946296113,
"grad_norm": 0.9749488234519958,
"learning_rate": 4.169995358453777e-06,
"loss": 1.4015,
"step": 25110
},
{
"epoch": 0.8288332838822404,
"grad_norm": 0.8335748910903931,
"learning_rate": 4.123673095744757e-06,
"loss": 1.4005,
"step": 25141
},
{
"epoch": 0.8298552731348696,
"grad_norm": 0.8542735576629639,
"learning_rate": 4.077586419547435e-06,
"loss": 1.4014,
"step": 25172
},
{
"epoch": 0.8308772623874988,
"grad_norm": 0.8510357141494751,
"learning_rate": 4.03173584994368e-06,
"loss": 1.3915,
"step": 25203
},
{
"epoch": 0.8318992516401279,
"grad_norm": 0.884564995765686,
"learning_rate": 3.986121904350948e-06,
"loss": 1.4081,
"step": 25234
},
{
"epoch": 0.8329212408927571,
"grad_norm": 0.8385151624679565,
"learning_rate": 3.940745097516407e-06,
"loss": 1.4065,
"step": 25265
},
{
"epoch": 0.8339432301453862,
"grad_norm": 0.8125067353248596,
"learning_rate": 3.89560594151116e-06,
"loss": 1.4129,
"step": 25296
},
{
"epoch": 0.8349652193980154,
"grad_norm": 0.8358818292617798,
"learning_rate": 3.850704945724456e-06,
"loss": 1.4278,
"step": 25327
},
{
"epoch": 0.8359872086506445,
"grad_norm": 0.8372805714607239,
"learning_rate": 3.8060426168579077e-06,
"loss": 1.3907,
"step": 25358
},
{
"epoch": 0.8370091979032737,
"grad_norm": 0.8480520248413086,
"learning_rate": 3.7616194589198407e-06,
"loss": 1.3941,
"step": 25389
},
{
"epoch": 0.8380311871559029,
"grad_norm": 0.8412033319473267,
"learning_rate": 3.7174359732195574e-06,
"loss": 1.404,
"step": 25420
},
{
"epoch": 0.8390531764085319,
"grad_norm": 0.8560240268707275,
"learning_rate": 3.673492658361677e-06,
"loss": 1.4272,
"step": 25451
},
{
"epoch": 0.8400751656611611,
"grad_norm": 0.8140300512313843,
"learning_rate": 3.6297900102405467e-06,
"loss": 1.414,
"step": 25482
},
{
"epoch": 0.8410971549137902,
"grad_norm": 0.8061606884002686,
"learning_rate": 3.586328522034607e-06,
"loss": 1.414,
"step": 25513
},
{
"epoch": 0.8421191441664194,
"grad_norm": 0.8134278059005737,
"learning_rate": 3.543108684200838e-06,
"loss": 1.4152,
"step": 25544
},
{
"epoch": 0.8431411334190486,
"grad_norm": 0.8117665648460388,
"learning_rate": 3.5001309844692464e-06,
"loss": 1.3981,
"step": 25575
},
{
"epoch": 0.8441631226716777,
"grad_norm": 0.834642767906189,
"learning_rate": 3.4573959078373215e-06,
"loss": 1.381,
"step": 25606
},
{
"epoch": 0.8451851119243069,
"grad_norm": 0.8252747058868408,
"learning_rate": 3.4149039365646063e-06,
"loss": 1.4088,
"step": 25637
},
{
"epoch": 0.846207101176936,
"grad_norm": 0.8141157627105713,
"learning_rate": 3.3726555501672143e-06,
"loss": 1.4095,
"step": 25668
},
{
"epoch": 0.8472290904295652,
"grad_norm": 0.8327375054359436,
"learning_rate": 3.33065122541244e-06,
"loss": 1.3955,
"step": 25699
},
{
"epoch": 0.8482510796821943,
"grad_norm": 0.8337382078170776,
"learning_rate": 3.288891436313385e-06,
"loss": 1.4118,
"step": 25730
},
{
"epoch": 0.8492730689348235,
"grad_norm": 0.855179488658905,
"learning_rate": 3.2473766541235963e-06,
"loss": 1.3819,
"step": 25761
},
{
"epoch": 0.8502950581874527,
"grad_norm": 0.8489012718200684,
"learning_rate": 3.2061073473317466e-06,
"loss": 1.3896,
"step": 25792
},
{
"epoch": 0.8513170474400817,
"grad_norm": 0.8339117765426636,
"learning_rate": 3.1650839816563444e-06,
"loss": 1.4018,
"step": 25823
},
{
"epoch": 0.8523390366927109,
"grad_norm": 0.8457059264183044,
"learning_rate": 3.1243070200405093e-06,
"loss": 1.4051,
"step": 25854
},
{
"epoch": 0.85336102594534,
"grad_norm": 0.8139636516571045,
"learning_rate": 3.0837769226467e-06,
"loss": 1.4022,
"step": 25885
},
{
"epoch": 0.8543830151979692,
"grad_norm": 0.8181917071342468,
"learning_rate": 3.0434941468515666e-06,
"loss": 1.3857,
"step": 25916
},
{
"epoch": 0.8554050044505983,
"grad_norm": 0.827022910118103,
"learning_rate": 3.003459147240753e-06,
"loss": 1.4097,
"step": 25947
},
{
"epoch": 0.8564269937032275,
"grad_norm": 0.8474435210227966,
"learning_rate": 2.9636723756037875e-06,
"loss": 1.3959,
"step": 25978
},
{
"epoch": 0.8574489829558567,
"grad_norm": 0.8524008393287659,
"learning_rate": 2.9241342809289833e-06,
"loss": 1.4108,
"step": 26009
},
{
"epoch": 0.8584709722084858,
"grad_norm": 0.8360846638679504,
"learning_rate": 2.8848453093983594e-06,
"loss": 1.3946,
"step": 26040
},
{
"epoch": 0.859492961461115,
"grad_norm": 0.8294342160224915,
"learning_rate": 2.8458059043826257e-06,
"loss": 1.4134,
"step": 26071
},
{
"epoch": 0.8605149507137441,
"grad_norm": 0.85257887840271,
"learning_rate": 2.807016506436172e-06,
"loss": 1.3838,
"step": 26102
},
{
"epoch": 0.8615369399663733,
"grad_norm": 0.8425765633583069,
"learning_rate": 2.7684775532920566e-06,
"loss": 1.3965,
"step": 26133
},
{
"epoch": 0.8625589292190025,
"grad_norm": 0.8341211080551147,
"learning_rate": 2.7301894798571425e-06,
"loss": 1.4012,
"step": 26164
},
{
"epoch": 0.8635809184716315,
"grad_norm": 0.8640623688697815,
"learning_rate": 2.6921527182071386e-06,
"loss": 1.386,
"step": 26195
},
{
"epoch": 0.8646029077242607,
"grad_norm": 0.8298767805099487,
"learning_rate": 2.654367697581725e-06,
"loss": 1.3828,
"step": 26226
},
{
"epoch": 0.8656248969768898,
"grad_norm": 0.8219887018203735,
"learning_rate": 2.6168348443797175e-06,
"loss": 1.3955,
"step": 26257
},
{
"epoch": 0.866646886229519,
"grad_norm": 0.8416712284088135,
"learning_rate": 2.5795545821542757e-06,
"loss": 1.4063,
"step": 26288
},
{
"epoch": 0.8676688754821481,
"grad_norm": 0.823794960975647,
"learning_rate": 2.54252733160808e-06,
"loss": 1.3832,
"step": 26319
},
{
"epoch": 0.8686908647347773,
"grad_norm": 0.8214231133460999,
"learning_rate": 2.5057535105886294e-06,
"loss": 1.3872,
"step": 26350
},
{
"epoch": 0.8697128539874065,
"grad_norm": 0.8258060812950134,
"learning_rate": 2.4692335340834953e-06,
"loss": 1.3815,
"step": 26381
},
{
"epoch": 0.8707348432400356,
"grad_norm": 0.8566368222236633,
"learning_rate": 2.432967814215639e-06,
"loss": 1.4109,
"step": 26412
},
{
"epoch": 0.8717568324926648,
"grad_norm": 0.8273147344589233,
"learning_rate": 2.396956760238794e-06,
"loss": 1.4059,
"step": 26443
},
{
"epoch": 0.8727788217452939,
"grad_norm": 0.7971722483634949,
"learning_rate": 2.361200778532796e-06,
"loss": 1.3898,
"step": 26474
},
{
"epoch": 0.8738008109979231,
"grad_norm": 0.9347776174545288,
"learning_rate": 2.325700272599049e-06,
"loss": 1.42,
"step": 26505
},
{
"epoch": 0.8748228002505523,
"grad_norm": 0.8639031052589417,
"learning_rate": 2.2904556430559415e-06,
"loss": 1.4068,
"step": 26536
},
{
"epoch": 0.8758447895031813,
"grad_norm": 0.8582850098609924,
"learning_rate": 2.2554672876343106e-06,
"loss": 1.4089,
"step": 26567
},
{
"epoch": 0.8768667787558105,
"grad_norm": 0.8343849182128906,
"learning_rate": 2.220735601173002e-06,
"loss": 1.3902,
"step": 26598
},
{
"epoch": 0.8778887680084396,
"grad_norm": 0.8293885588645935,
"learning_rate": 2.186260975614382e-06,
"loss": 1.4027,
"step": 26629
},
{
"epoch": 0.8789107572610688,
"grad_norm": 0.8358698487281799,
"learning_rate": 2.1520437999999034e-06,
"loss": 1.3941,
"step": 26660
},
{
"epoch": 0.8799327465136979,
"grad_norm": 0.8251244425773621,
"learning_rate": 2.1180844604657526e-06,
"loss": 1.4191,
"step": 26691
},
{
"epoch": 0.8809547357663271,
"grad_norm": 0.8112486600875854,
"learning_rate": 2.084383340238455e-06,
"loss": 1.4123,
"step": 26722
},
{
"epoch": 0.8819767250189563,
"grad_norm": 0.833122193813324,
"learning_rate": 2.0509408196305704e-06,
"loss": 1.3796,
"step": 26753
},
{
"epoch": 0.8829987142715854,
"grad_norm": 0.8029480576515198,
"learning_rate": 2.017757276036403e-06,
"loss": 1.3847,
"step": 26784
},
{
"epoch": 0.8840207035242146,
"grad_norm": 0.8319761753082275,
"learning_rate": 1.984833083927726e-06,
"loss": 1.4015,
"step": 26815
},
{
"epoch": 0.8850426927768437,
"grad_norm": 0.8120732307434082,
"learning_rate": 1.952168614849581e-06,
"loss": 1.4064,
"step": 26846
},
{
"epoch": 0.8860646820294729,
"grad_norm": 0.8120958209037781,
"learning_rate": 1.919764237416058e-06,
"loss": 1.3988,
"step": 26877
},
{
"epoch": 0.887086671282102,
"grad_norm": 0.8618322014808655,
"learning_rate": 1.8876203173061463e-06,
"loss": 1.4116,
"step": 26908
},
{
"epoch": 0.8881086605347311,
"grad_norm": 0.8309136033058167,
"learning_rate": 1.8557372172596206e-06,
"loss": 1.4013,
"step": 26939
},
{
"epoch": 0.8891306497873603,
"grad_norm": 0.8589499592781067,
"learning_rate": 1.8241152970729341e-06,
"loss": 1.3942,
"step": 26970
},
{
"epoch": 0.8901526390399894,
"grad_norm": 0.8159814476966858,
"learning_rate": 1.7927549135951572e-06,
"loss": 1.3885,
"step": 27001
},
{
"epoch": 0.8911746282926186,
"grad_norm": 0.854448676109314,
"learning_rate": 1.7616564207239477e-06,
"loss": 1.3914,
"step": 27032
},
{
"epoch": 0.8921966175452477,
"grad_norm": 0.8111308217048645,
"learning_rate": 1.730820169401584e-06,
"loss": 1.3782,
"step": 27063
},
{
"epoch": 0.8932186067978769,
"grad_norm": 0.8496434092521667,
"learning_rate": 1.7002465076109558e-06,
"loss": 1.3927,
"step": 27094
},
{
"epoch": 0.8942405960505061,
"grad_norm": 0.8473883271217346,
"learning_rate": 1.6699357803716898e-06,
"loss": 1.3887,
"step": 27125
},
{
"epoch": 0.8952625853031352,
"grad_norm": 0.8557064533233643,
"learning_rate": 1.6398883297362305e-06,
"loss": 1.3985,
"step": 27156
},
{
"epoch": 0.8962845745557644,
"grad_norm": 0.8828888535499573,
"learning_rate": 1.6101044947859606e-06,
"loss": 1.3875,
"step": 27187
},
{
"epoch": 0.8973065638083935,
"grad_norm": 0.8234705328941345,
"learning_rate": 1.5805846116274114e-06,
"loss": 1.4123,
"step": 27218
},
{
"epoch": 0.8983285530610227,
"grad_norm": 0.814002513885498,
"learning_rate": 1.5513290133884611e-06,
"loss": 1.4021,
"step": 27249
},
{
"epoch": 0.8993505423136517,
"grad_norm": 0.8468136191368103,
"learning_rate": 1.5223380302145512e-06,
"loss": 1.4031,
"step": 27280
},
{
"epoch": 0.9003725315662809,
"grad_norm": 0.823250949382782,
"learning_rate": 1.4936119892649925e-06,
"loss": 1.3922,
"step": 27311
},
{
"epoch": 0.9013945208189101,
"grad_norm": 0.8171223998069763,
"learning_rate": 1.4651512147092482e-06,
"loss": 1.3878,
"step": 27342
},
{
"epoch": 0.9024165100715392,
"grad_norm": 0.823663055896759,
"learning_rate": 1.4369560277232908e-06,
"loss": 1.4193,
"step": 27373
},
{
"epoch": 0.9034384993241684,
"grad_norm": 0.8320732712745667,
"learning_rate": 1.409026746485978e-06,
"loss": 1.3795,
"step": 27404
},
{
"epoch": 0.9044604885767975,
"grad_norm": 0.8476362228393555,
"learning_rate": 1.3813636861754464e-06,
"loss": 1.3872,
"step": 27435
},
{
"epoch": 0.9054824778294267,
"grad_norm": 0.8781965374946594,
"learning_rate": 1.3539671589655773e-06,
"loss": 1.3957,
"step": 27466
}
],
"logging_steps": 31,
"max_steps": 30517,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 3052,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.9185168775775257e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}