Hamanasu-QwQ-V2-RP / trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
578d393 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9850107066381155,
"eval_steps": 500,
"global_step": 932,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004282655246252677,
"grad_norm": 5.760822510102549,
"learning_rate": 6.25e-07,
"loss": 1.9283,
"step": 1
},
{
"epoch": 0.008565310492505354,
"grad_norm": 9.541601401962827,
"learning_rate": 1.25e-06,
"loss": 1.9234,
"step": 2
},
{
"epoch": 0.01284796573875803,
"grad_norm": 6.538784535229611,
"learning_rate": 1.875e-06,
"loss": 1.8408,
"step": 3
},
{
"epoch": 0.017130620985010708,
"grad_norm": 7.213295700964454,
"learning_rate": 2.5e-06,
"loss": 1.9104,
"step": 4
},
{
"epoch": 0.021413276231263382,
"grad_norm": 6.314491554982707,
"learning_rate": 3.125e-06,
"loss": 1.9888,
"step": 5
},
{
"epoch": 0.02569593147751606,
"grad_norm": 2.828604192300556,
"learning_rate": 3.75e-06,
"loss": 1.8774,
"step": 6
},
{
"epoch": 0.029978586723768737,
"grad_norm": 1.9892932329122277,
"learning_rate": 4.375e-06,
"loss": 1.8232,
"step": 7
},
{
"epoch": 0.034261241970021415,
"grad_norm": 2.301797599943145,
"learning_rate": 5e-06,
"loss": 1.7929,
"step": 8
},
{
"epoch": 0.03854389721627409,
"grad_norm": 2.642325145637469,
"learning_rate": 5.625e-06,
"loss": 1.6999,
"step": 9
},
{
"epoch": 0.042826552462526764,
"grad_norm": 1.54457145688109,
"learning_rate": 6.25e-06,
"loss": 1.797,
"step": 10
},
{
"epoch": 0.047109207708779445,
"grad_norm": 0.6231435902899881,
"learning_rate": 6.875000000000001e-06,
"loss": 1.8327,
"step": 11
},
{
"epoch": 0.05139186295503212,
"grad_norm": 1.9278562906780023,
"learning_rate": 7.5e-06,
"loss": 1.8551,
"step": 12
},
{
"epoch": 0.055674518201284794,
"grad_norm": 2.1201436104190123,
"learning_rate": 8.125000000000001e-06,
"loss": 1.713,
"step": 13
},
{
"epoch": 0.059957173447537475,
"grad_norm": 1.6833290019534897,
"learning_rate": 8.75e-06,
"loss": 1.7723,
"step": 14
},
{
"epoch": 0.06423982869379015,
"grad_norm": 1.7592896430688332,
"learning_rate": 9.375000000000001e-06,
"loss": 1.7786,
"step": 15
},
{
"epoch": 0.06852248394004283,
"grad_norm": 1.3527651484156915,
"learning_rate": 1e-05,
"loss": 1.8679,
"step": 16
},
{
"epoch": 0.0728051391862955,
"grad_norm": 0.487619361955017,
"learning_rate": 1.0625e-05,
"loss": 1.7012,
"step": 17
},
{
"epoch": 0.07708779443254818,
"grad_norm": 1.429523434604011,
"learning_rate": 1.125e-05,
"loss": 1.6743,
"step": 18
},
{
"epoch": 0.08137044967880086,
"grad_norm": 1.1313491131725162,
"learning_rate": 1.1875e-05,
"loss": 1.7325,
"step": 19
},
{
"epoch": 0.08565310492505353,
"grad_norm": 0.7296310910231132,
"learning_rate": 1.25e-05,
"loss": 1.6433,
"step": 20
},
{
"epoch": 0.08993576017130621,
"grad_norm": 1.0873378233145565,
"learning_rate": 1.3125e-05,
"loss": 1.8281,
"step": 21
},
{
"epoch": 0.09421841541755889,
"grad_norm": 0.4193072086415473,
"learning_rate": 1.3750000000000002e-05,
"loss": 1.7653,
"step": 22
},
{
"epoch": 0.09850107066381156,
"grad_norm": 0.6318751609453201,
"learning_rate": 1.4374999999999999e-05,
"loss": 1.806,
"step": 23
},
{
"epoch": 0.10278372591006424,
"grad_norm": 0.7998125137748582,
"learning_rate": 1.5e-05,
"loss": 1.7682,
"step": 24
},
{
"epoch": 0.10706638115631692,
"grad_norm": 0.4607962108022345,
"learning_rate": 1.5625e-05,
"loss": 1.7188,
"step": 25
},
{
"epoch": 0.11134903640256959,
"grad_norm": 0.6691122325494083,
"learning_rate": 1.6250000000000002e-05,
"loss": 1.7694,
"step": 26
},
{
"epoch": 0.11563169164882227,
"grad_norm": 0.5966739618315919,
"learning_rate": 1.6875000000000004e-05,
"loss": 1.709,
"step": 27
},
{
"epoch": 0.11991434689507495,
"grad_norm": 0.5572079216591783,
"learning_rate": 1.75e-05,
"loss": 1.7757,
"step": 28
},
{
"epoch": 0.12419700214132762,
"grad_norm": 0.5765542135816161,
"learning_rate": 1.8125e-05,
"loss": 1.6293,
"step": 29
},
{
"epoch": 0.1284796573875803,
"grad_norm": 0.43151210853354827,
"learning_rate": 1.8750000000000002e-05,
"loss": 1.7835,
"step": 30
},
{
"epoch": 0.13276231263383298,
"grad_norm": 0.6485552906377018,
"learning_rate": 1.9375e-05,
"loss": 1.5803,
"step": 31
},
{
"epoch": 0.13704496788008566,
"grad_norm": 0.4482132894148142,
"learning_rate": 2e-05,
"loss": 1.6477,
"step": 32
},
{
"epoch": 0.14132762312633834,
"grad_norm": 0.611457842177946,
"learning_rate": 2.0625e-05,
"loss": 1.6426,
"step": 33
},
{
"epoch": 0.145610278372591,
"grad_norm": 0.38871825671554094,
"learning_rate": 2.125e-05,
"loss": 1.5813,
"step": 34
},
{
"epoch": 0.14989293361884368,
"grad_norm": 0.5250386322112104,
"learning_rate": 2.1875e-05,
"loss": 1.6655,
"step": 35
},
{
"epoch": 0.15417558886509636,
"grad_norm": 0.41266647411410223,
"learning_rate": 2.25e-05,
"loss": 1.7505,
"step": 36
},
{
"epoch": 0.15845824411134904,
"grad_norm": 0.4161967413290072,
"learning_rate": 2.3125000000000003e-05,
"loss": 1.6562,
"step": 37
},
{
"epoch": 0.16274089935760172,
"grad_norm": 0.4023472892837264,
"learning_rate": 2.375e-05,
"loss": 1.5572,
"step": 38
},
{
"epoch": 0.1670235546038544,
"grad_norm": 0.4716743391777267,
"learning_rate": 2.4375e-05,
"loss": 1.654,
"step": 39
},
{
"epoch": 0.17130620985010706,
"grad_norm": 0.41899791933237945,
"learning_rate": 2.5e-05,
"loss": 1.6262,
"step": 40
},
{
"epoch": 0.17558886509635974,
"grad_norm": 0.4691927568541724,
"learning_rate": 2.4999922473602244e-05,
"loss": 1.7226,
"step": 41
},
{
"epoch": 0.17987152034261242,
"grad_norm": 0.3788848931279516,
"learning_rate": 2.499968989537063e-05,
"loss": 1.5837,
"step": 42
},
{
"epoch": 0.1841541755888651,
"grad_norm": 0.4140681800260881,
"learning_rate": 2.4999302268190118e-05,
"loss": 1.6646,
"step": 43
},
{
"epoch": 0.18843683083511778,
"grad_norm": 0.44590521442058745,
"learning_rate": 2.4998759596868908e-05,
"loss": 1.6526,
"step": 44
},
{
"epoch": 0.19271948608137046,
"grad_norm": 0.42731019400707126,
"learning_rate": 2.499806188813843e-05,
"loss": 1.6234,
"step": 45
},
{
"epoch": 0.19700214132762311,
"grad_norm": 0.41939827714864014,
"learning_rate": 2.4997209150653212e-05,
"loss": 1.6093,
"step": 46
},
{
"epoch": 0.2012847965738758,
"grad_norm": 0.4139349892587555,
"learning_rate": 2.4996201394990805e-05,
"loss": 1.6361,
"step": 47
},
{
"epoch": 0.20556745182012848,
"grad_norm": 0.4278900632702424,
"learning_rate": 2.4995038633651627e-05,
"loss": 1.6978,
"step": 48
},
{
"epoch": 0.20985010706638116,
"grad_norm": 0.3616303314921443,
"learning_rate": 2.499372088105884e-05,
"loss": 1.4552,
"step": 49
},
{
"epoch": 0.21413276231263384,
"grad_norm": 0.42854114017469835,
"learning_rate": 2.4992248153558134e-05,
"loss": 1.7517,
"step": 50
},
{
"epoch": 0.21841541755888652,
"grad_norm": 0.526378364056868,
"learning_rate": 2.4990620469417554e-05,
"loss": 1.6058,
"step": 51
},
{
"epoch": 0.22269807280513917,
"grad_norm": 0.4059991411502784,
"learning_rate": 2.498883784882726e-05,
"loss": 1.5755,
"step": 52
},
{
"epoch": 0.22698072805139186,
"grad_norm": 0.4066265032195638,
"learning_rate": 2.4986900313899273e-05,
"loss": 1.6502,
"step": 53
},
{
"epoch": 0.23126338329764454,
"grad_norm": 0.4679382599100356,
"learning_rate": 2.498480788866721e-05,
"loss": 1.5904,
"step": 54
},
{
"epoch": 0.23554603854389722,
"grad_norm": 0.4146983639334321,
"learning_rate": 2.4982560599085984e-05,
"loss": 1.7578,
"step": 55
},
{
"epoch": 0.2398286937901499,
"grad_norm": 0.5011668519133488,
"learning_rate": 2.4980158473031472e-05,
"loss": 1.6348,
"step": 56
},
{
"epoch": 0.24411134903640258,
"grad_norm": 0.39760241005080976,
"learning_rate": 2.4977601540300188e-05,
"loss": 1.6521,
"step": 57
},
{
"epoch": 0.24839400428265523,
"grad_norm": 0.3842980574877057,
"learning_rate": 2.49748898326089e-05,
"loss": 1.5468,
"step": 58
},
{
"epoch": 0.25267665952890794,
"grad_norm": 0.5027132906331951,
"learning_rate": 2.497202338359423e-05,
"loss": 1.6786,
"step": 59
},
{
"epoch": 0.2569593147751606,
"grad_norm": 0.42843610006674887,
"learning_rate": 2.4969002228812256e-05,
"loss": 1.5481,
"step": 60
},
{
"epoch": 0.26124197002141325,
"grad_norm": 0.418048089271474,
"learning_rate": 2.4965826405738054e-05,
"loss": 1.5,
"step": 61
},
{
"epoch": 0.26552462526766596,
"grad_norm": 0.38187537005853855,
"learning_rate": 2.4962495953765248e-05,
"loss": 1.5241,
"step": 62
},
{
"epoch": 0.2698072805139186,
"grad_norm": 0.41994694502120095,
"learning_rate": 2.495901091420551e-05,
"loss": 1.5668,
"step": 63
},
{
"epoch": 0.2740899357601713,
"grad_norm": 0.41737827691699964,
"learning_rate": 2.4955371330288045e-05,
"loss": 1.6447,
"step": 64
},
{
"epoch": 0.278372591006424,
"grad_norm": 0.42240231204308587,
"learning_rate": 2.4951577247159068e-05,
"loss": 1.5265,
"step": 65
},
{
"epoch": 0.2826552462526767,
"grad_norm": 0.4053695799459516,
"learning_rate": 2.494762871188124e-05,
"loss": 1.6668,
"step": 66
},
{
"epoch": 0.28693790149892934,
"grad_norm": 0.3755341743631125,
"learning_rate": 2.4943525773433063e-05,
"loss": 1.4097,
"step": 67
},
{
"epoch": 0.291220556745182,
"grad_norm": 0.3698075541817392,
"learning_rate": 2.4939268482708318e-05,
"loss": 1.7374,
"step": 68
},
{
"epoch": 0.2955032119914347,
"grad_norm": 0.35883020884289013,
"learning_rate": 2.4934856892515378e-05,
"loss": 1.7297,
"step": 69
},
{
"epoch": 0.29978586723768735,
"grad_norm": 0.42882530161941707,
"learning_rate": 2.4930291057576603e-05,
"loss": 1.6139,
"step": 70
},
{
"epoch": 0.30406852248394006,
"grad_norm": 0.43658211064964164,
"learning_rate": 2.4925571034527633e-05,
"loss": 1.6844,
"step": 71
},
{
"epoch": 0.3083511777301927,
"grad_norm": 0.375017122269398,
"learning_rate": 2.492069688191668e-05,
"loss": 1.5154,
"step": 72
},
{
"epoch": 0.31263383297644537,
"grad_norm": 0.4040881085038259,
"learning_rate": 2.4915668660203827e-05,
"loss": 1.6869,
"step": 73
},
{
"epoch": 0.3169164882226981,
"grad_norm": 0.5315682285098243,
"learning_rate": 2.4910486431760266e-05,
"loss": 1.6036,
"step": 74
},
{
"epoch": 0.32119914346895073,
"grad_norm": 0.6819504727092934,
"learning_rate": 2.490515026086751e-05,
"loss": 1.7321,
"step": 75
},
{
"epoch": 0.32548179871520344,
"grad_norm": 0.4459093235436832,
"learning_rate": 2.489966021371662e-05,
"loss": 1.6316,
"step": 76
},
{
"epoch": 0.3297644539614561,
"grad_norm": 0.41265976791945247,
"learning_rate": 2.4894016358407368e-05,
"loss": 1.6822,
"step": 77
},
{
"epoch": 0.3340471092077088,
"grad_norm": 0.40455952502188075,
"learning_rate": 2.4888218764947397e-05,
"loss": 1.6279,
"step": 78
},
{
"epoch": 0.33832976445396146,
"grad_norm": 0.39048708108607677,
"learning_rate": 2.488226750525135e-05,
"loss": 1.67,
"step": 79
},
{
"epoch": 0.3426124197002141,
"grad_norm": 0.37437295904257595,
"learning_rate": 2.487616265313999e-05,
"loss": 1.6237,
"step": 80
},
{
"epoch": 0.3468950749464668,
"grad_norm": 0.5090276930456816,
"learning_rate": 2.486990428433926e-05,
"loss": 1.6003,
"step": 81
},
{
"epoch": 0.3511777301927195,
"grad_norm": 0.46215544091371435,
"learning_rate": 2.486349247647938e-05,
"loss": 1.6227,
"step": 82
},
{
"epoch": 0.3554603854389722,
"grad_norm": 0.41822651733354704,
"learning_rate": 2.485692730909383e-05,
"loss": 1.669,
"step": 83
},
{
"epoch": 0.35974304068522484,
"grad_norm": 0.441655220825228,
"learning_rate": 2.4850208863618425e-05,
"loss": 1.4542,
"step": 84
},
{
"epoch": 0.3640256959314775,
"grad_norm": 0.3786999347152407,
"learning_rate": 2.4843337223390267e-05,
"loss": 1.4966,
"step": 85
},
{
"epoch": 0.3683083511777302,
"grad_norm": 0.363991509035686,
"learning_rate": 2.483631247364671e-05,
"loss": 1.4573,
"step": 86
},
{
"epoch": 0.37259100642398285,
"grad_norm": 0.36392542721746446,
"learning_rate": 2.482913470152433e-05,
"loss": 1.5823,
"step": 87
},
{
"epoch": 0.37687366167023556,
"grad_norm": 0.3644244797395943,
"learning_rate": 2.482180399605781e-05,
"loss": 1.5918,
"step": 88
},
{
"epoch": 0.3811563169164882,
"grad_norm": 0.3517233506762531,
"learning_rate": 2.481432044817887e-05,
"loss": 1.6118,
"step": 89
},
{
"epoch": 0.3854389721627409,
"grad_norm": 0.44516577454752954,
"learning_rate": 2.4806684150715097e-05,
"loss": 1.5337,
"step": 90
},
{
"epoch": 0.3897216274089936,
"grad_norm": 0.4170570804452654,
"learning_rate": 2.4798895198388845e-05,
"loss": 1.6465,
"step": 91
},
{
"epoch": 0.39400428265524623,
"grad_norm": 0.36685661291454347,
"learning_rate": 2.4790953687816017e-05,
"loss": 1.6072,
"step": 92
},
{
"epoch": 0.39828693790149894,
"grad_norm": 0.4465652273066297,
"learning_rate": 2.4782859717504883e-05,
"loss": 1.648,
"step": 93
},
{
"epoch": 0.4025695931477516,
"grad_norm": 0.551115294286871,
"learning_rate": 2.4774613387854866e-05,
"loss": 1.6789,
"step": 94
},
{
"epoch": 0.4068522483940043,
"grad_norm": 0.4583820418024637,
"learning_rate": 2.4766214801155276e-05,
"loss": 1.5697,
"step": 95
},
{
"epoch": 0.41113490364025695,
"grad_norm": 0.41125366081563586,
"learning_rate": 2.475766406158407e-05,
"loss": 1.5489,
"step": 96
},
{
"epoch": 0.41541755888650966,
"grad_norm": 0.5258069733050229,
"learning_rate": 2.4748961275206527e-05,
"loss": 1.5782,
"step": 97
},
{
"epoch": 0.4197002141327623,
"grad_norm": 0.5370435285656707,
"learning_rate": 2.4740106549973953e-05,
"loss": 1.4463,
"step": 98
},
{
"epoch": 0.42398286937901497,
"grad_norm": 0.3656167120256986,
"learning_rate": 2.4731099995722353e-05,
"loss": 1.503,
"step": 99
},
{
"epoch": 0.4282655246252677,
"grad_norm": 0.37413674489512233,
"learning_rate": 2.4721941724171025e-05,
"loss": 1.5299,
"step": 100
},
{
"epoch": 0.43254817987152033,
"grad_norm": 0.5442125290315152,
"learning_rate": 2.4712631848921224e-05,
"loss": 1.47,
"step": 101
},
{
"epoch": 0.43683083511777304,
"grad_norm": 0.41142510883515865,
"learning_rate": 2.470317048545473e-05,
"loss": 1.6346,
"step": 102
},
{
"epoch": 0.4411134903640257,
"grad_norm": 0.36843680594934913,
"learning_rate": 2.4693557751132405e-05,
"loss": 1.5707,
"step": 103
},
{
"epoch": 0.44539614561027835,
"grad_norm": 0.40580526926230925,
"learning_rate": 2.4683793765192753e-05,
"loss": 1.611,
"step": 104
},
{
"epoch": 0.44967880085653106,
"grad_norm": 0.43640536110068956,
"learning_rate": 2.4673878648750446e-05,
"loss": 1.6646,
"step": 105
},
{
"epoch": 0.4539614561027837,
"grad_norm": 0.36401706952184854,
"learning_rate": 2.4663812524794803e-05,
"loss": 1.501,
"step": 106
},
{
"epoch": 0.4582441113490364,
"grad_norm": 0.4597685050788604,
"learning_rate": 2.4653595518188276e-05,
"loss": 1.4702,
"step": 107
},
{
"epoch": 0.4625267665952891,
"grad_norm": 0.4198721125351821,
"learning_rate": 2.4643227755664898e-05,
"loss": 1.5426,
"step": 108
},
{
"epoch": 0.4668094218415418,
"grad_norm": 0.4415892969218905,
"learning_rate": 2.463270936582872e-05,
"loss": 1.5348,
"step": 109
},
{
"epoch": 0.47109207708779444,
"grad_norm": 0.468616016936323,
"learning_rate": 2.4622040479152195e-05,
"loss": 1.5948,
"step": 110
},
{
"epoch": 0.4753747323340471,
"grad_norm": 0.7486771610195644,
"learning_rate": 2.4611221227974584e-05,
"loss": 1.683,
"step": 111
},
{
"epoch": 0.4796573875802998,
"grad_norm": 0.779148114510847,
"learning_rate": 2.4600251746500296e-05,
"loss": 1.4869,
"step": 112
},
{
"epoch": 0.48394004282655245,
"grad_norm": 0.4689233006931303,
"learning_rate": 2.4589132170797234e-05,
"loss": 1.561,
"step": 113
},
{
"epoch": 0.48822269807280516,
"grad_norm": 0.7040850097227628,
"learning_rate": 2.4577862638795098e-05,
"loss": 1.6254,
"step": 114
},
{
"epoch": 0.4925053533190578,
"grad_norm": 0.4769716615847163,
"learning_rate": 2.456644329028369e-05,
"loss": 1.5774,
"step": 115
},
{
"epoch": 0.49678800856531047,
"grad_norm": 0.6632040162872483,
"learning_rate": 2.4554874266911157e-05,
"loss": 1.5463,
"step": 116
},
{
"epoch": 0.5010706638115632,
"grad_norm": 1.1457031665772415,
"learning_rate": 2.4543155712182252e-05,
"loss": 1.642,
"step": 117
},
{
"epoch": 0.5053533190578159,
"grad_norm": 0.5148470344333809,
"learning_rate": 2.4531287771456556e-05,
"loss": 1.5455,
"step": 118
},
{
"epoch": 0.5096359743040685,
"grad_norm": 0.9100598002476826,
"learning_rate": 2.4519270591946653e-05,
"loss": 1.555,
"step": 119
},
{
"epoch": 0.5139186295503212,
"grad_norm": 1.3104358729746841,
"learning_rate": 2.4507104322716326e-05,
"loss": 1.5604,
"step": 120
},
{
"epoch": 0.5182012847965739,
"grad_norm": 0.5181336432498789,
"learning_rate": 2.44947891146787e-05,
"loss": 1.5029,
"step": 121
},
{
"epoch": 0.5224839400428265,
"grad_norm": 1.0934800951662504,
"learning_rate": 2.4482325120594374e-05,
"loss": 1.5449,
"step": 122
},
{
"epoch": 0.5267665952890792,
"grad_norm": 0.6352277468903285,
"learning_rate": 2.4469712495069507e-05,
"loss": 1.588,
"step": 123
},
{
"epoch": 0.5310492505353319,
"grad_norm": 1.4867821380058142,
"learning_rate": 2.445695139455394e-05,
"loss": 1.6408,
"step": 124
},
{
"epoch": 0.5353319057815846,
"grad_norm": 0.6556668793792217,
"learning_rate": 2.444404197733921e-05,
"loss": 1.5059,
"step": 125
},
{
"epoch": 0.5396145610278372,
"grad_norm": 1.3366811637363765,
"learning_rate": 2.4430984403556613e-05,
"loss": 1.6334,
"step": 126
},
{
"epoch": 0.5438972162740899,
"grad_norm": 0.8391779284464247,
"learning_rate": 2.441777883517522e-05,
"loss": 1.5342,
"step": 127
},
{
"epoch": 0.5481798715203426,
"grad_norm": 1.5151547233227163,
"learning_rate": 2.4404425435999857e-05,
"loss": 1.4767,
"step": 128
},
{
"epoch": 0.5524625267665952,
"grad_norm": 0.8456634115358744,
"learning_rate": 2.4390924371669065e-05,
"loss": 1.3985,
"step": 129
},
{
"epoch": 0.556745182012848,
"grad_norm": 0.9899617700169978,
"learning_rate": 2.437727580965307e-05,
"loss": 1.547,
"step": 130
},
{
"epoch": 0.5610278372591007,
"grad_norm": 0.8748009025292892,
"learning_rate": 2.436347991925169e-05,
"loss": 1.5895,
"step": 131
},
{
"epoch": 0.5653104925053534,
"grad_norm": 0.8284458411110256,
"learning_rate": 2.4349536871592227e-05,
"loss": 1.5536,
"step": 132
},
{
"epoch": 0.569593147751606,
"grad_norm": 0.918716629707354,
"learning_rate": 2.4335446839627375e-05,
"loss": 1.6851,
"step": 133
},
{
"epoch": 0.5738758029978587,
"grad_norm": 1.0628279716423659,
"learning_rate": 2.4321209998133025e-05,
"loss": 1.6705,
"step": 134
},
{
"epoch": 0.5781584582441114,
"grad_norm": 0.73934998026875,
"learning_rate": 2.430682652370616e-05,
"loss": 1.6545,
"step": 135
},
{
"epoch": 0.582441113490364,
"grad_norm": 0.8203168050853737,
"learning_rate": 2.4292296594762602e-05,
"loss": 1.6182,
"step": 136
},
{
"epoch": 0.5867237687366167,
"grad_norm": 0.6116686513293031,
"learning_rate": 2.4277620391534845e-05,
"loss": 1.6446,
"step": 137
},
{
"epoch": 0.5910064239828694,
"grad_norm": 0.9614170914314591,
"learning_rate": 2.4262798096069788e-05,
"loss": 1.494,
"step": 138
},
{
"epoch": 0.5952890792291221,
"grad_norm": 0.3676545315742134,
"learning_rate": 2.424782989222651e-05,
"loss": 1.595,
"step": 139
},
{
"epoch": 0.5995717344753747,
"grad_norm": 0.7408509458451011,
"learning_rate": 2.4232715965673952e-05,
"loss": 1.6386,
"step": 140
},
{
"epoch": 0.6038543897216274,
"grad_norm": 0.40233518217652775,
"learning_rate": 2.421745650388864e-05,
"loss": 1.5558,
"step": 141
},
{
"epoch": 0.6081370449678801,
"grad_norm": 0.5148389113634867,
"learning_rate": 2.4202051696152353e-05,
"loss": 1.5015,
"step": 142
},
{
"epoch": 0.6124197002141327,
"grad_norm": 0.3807057141875052,
"learning_rate": 2.418650173354977e-05,
"loss": 1.6467,
"step": 143
},
{
"epoch": 0.6167023554603854,
"grad_norm": 0.3896991503471914,
"learning_rate": 2.41708068089661e-05,
"loss": 1.6053,
"step": 144
},
{
"epoch": 0.6209850107066381,
"grad_norm": 0.35553342191514337,
"learning_rate": 2.4154967117084705e-05,
"loss": 1.5364,
"step": 145
},
{
"epoch": 0.6252676659528907,
"grad_norm": 0.36567417738111496,
"learning_rate": 2.4138982854384663e-05,
"loss": 1.6348,
"step": 146
},
{
"epoch": 0.6295503211991434,
"grad_norm": 0.38816867602696453,
"learning_rate": 2.412285421913834e-05,
"loss": 1.4694,
"step": 147
},
{
"epoch": 0.6338329764453962,
"grad_norm": 0.35173888322190433,
"learning_rate": 2.410658141140894e-05,
"loss": 1.646,
"step": 148
},
{
"epoch": 0.6381156316916489,
"grad_norm": 0.36815567692224666,
"learning_rate": 2.4090164633048e-05,
"loss": 1.6168,
"step": 149
},
{
"epoch": 0.6423982869379015,
"grad_norm": 0.3789787801030716,
"learning_rate": 2.4073604087692925e-05,
"loss": 1.5451,
"step": 150
},
{
"epoch": 0.6466809421841542,
"grad_norm": 0.4139676112725167,
"learning_rate": 2.4056899980764407e-05,
"loss": 1.5772,
"step": 151
},
{
"epoch": 0.6509635974304069,
"grad_norm": 0.4317710716550067,
"learning_rate": 2.404005251946394e-05,
"loss": 1.5901,
"step": 152
},
{
"epoch": 0.6552462526766595,
"grad_norm": 0.3793107950355877,
"learning_rate": 2.4023061912771188e-05,
"loss": 1.4831,
"step": 153
},
{
"epoch": 0.6595289079229122,
"grad_norm": 0.36255246115756395,
"learning_rate": 2.4005928371441444e-05,
"loss": 1.5417,
"step": 154
},
{
"epoch": 0.6638115631691649,
"grad_norm": 0.35515016194574406,
"learning_rate": 2.3988652108002984e-05,
"loss": 1.4822,
"step": 155
},
{
"epoch": 0.6680942184154176,
"grad_norm": 0.3462285743933349,
"learning_rate": 2.3971233336754444e-05,
"loss": 1.5157,
"step": 156
},
{
"epoch": 0.6723768736616702,
"grad_norm": 0.3669326112622935,
"learning_rate": 2.395367227376216e-05,
"loss": 1.5652,
"step": 157
},
{
"epoch": 0.6766595289079229,
"grad_norm": 0.3704783452888347,
"learning_rate": 2.393596913685748e-05,
"loss": 1.5836,
"step": 158
},
{
"epoch": 0.6809421841541756,
"grad_norm": 0.3829979392497551,
"learning_rate": 2.391812414563408e-05,
"loss": 1.5023,
"step": 159
},
{
"epoch": 0.6852248394004282,
"grad_norm": 0.3630273112296912,
"learning_rate": 2.390013752144521e-05,
"loss": 1.6907,
"step": 160
},
{
"epoch": 0.6895074946466809,
"grad_norm": 0.3351207679536815,
"learning_rate": 2.3882009487400993e-05,
"loss": 1.4393,
"step": 161
},
{
"epoch": 0.6937901498929336,
"grad_norm": 0.3497511991840534,
"learning_rate": 2.386374026836561e-05,
"loss": 1.598,
"step": 162
},
{
"epoch": 0.6980728051391863,
"grad_norm": 0.3337980565250301,
"learning_rate": 2.3845330090954542e-05,
"loss": 1.4704,
"step": 163
},
{
"epoch": 0.702355460385439,
"grad_norm": 0.36707456896757124,
"learning_rate": 2.3826779183531744e-05,
"loss": 1.5851,
"step": 164
},
{
"epoch": 0.7066381156316917,
"grad_norm": 0.37164461991634257,
"learning_rate": 2.380808777620682e-05,
"loss": 1.531,
"step": 165
},
{
"epoch": 0.7109207708779444,
"grad_norm": 0.3505467917592193,
"learning_rate": 2.3789256100832173e-05,
"loss": 1.4713,
"step": 166
},
{
"epoch": 0.715203426124197,
"grad_norm": 0.47758470060633207,
"learning_rate": 2.3770284391000113e-05,
"loss": 1.5102,
"step": 167
},
{
"epoch": 0.7194860813704497,
"grad_norm": 0.33053663778093284,
"learning_rate": 2.375117288203997e-05,
"loss": 1.4791,
"step": 168
},
{
"epoch": 0.7237687366167024,
"grad_norm": 0.3393970208208402,
"learning_rate": 2.3731921811015175e-05,
"loss": 1.6291,
"step": 169
},
{
"epoch": 0.728051391862955,
"grad_norm": 0.35855145219326184,
"learning_rate": 2.3712531416720317e-05,
"loss": 1.5539,
"step": 170
},
{
"epoch": 0.7323340471092077,
"grad_norm": 0.369911611756327,
"learning_rate": 2.3693001939678183e-05,
"loss": 1.4999,
"step": 171
},
{
"epoch": 0.7366167023554604,
"grad_norm": 0.5505558187826747,
"learning_rate": 2.367333362213678e-05,
"loss": 1.5852,
"step": 172
},
{
"epoch": 0.7408993576017131,
"grad_norm": 0.3528456056150531,
"learning_rate": 2.3653526708066314e-05,
"loss": 1.5358,
"step": 173
},
{
"epoch": 0.7451820128479657,
"grad_norm": 0.3968210406914177,
"learning_rate": 2.3633581443156178e-05,
"loss": 1.5028,
"step": 174
},
{
"epoch": 0.7494646680942184,
"grad_norm": 0.47087903951900106,
"learning_rate": 2.361349807481189e-05,
"loss": 1.6258,
"step": 175
},
{
"epoch": 0.7537473233404711,
"grad_norm": 0.380904082979793,
"learning_rate": 2.3593276852152056e-05,
"loss": 1.5982,
"step": 176
},
{
"epoch": 0.7580299785867237,
"grad_norm": 0.40302033351805244,
"learning_rate": 2.3572918026005235e-05,
"loss": 1.6539,
"step": 177
},
{
"epoch": 0.7623126338329764,
"grad_norm": 0.41272981679464077,
"learning_rate": 2.355242184890686e-05,
"loss": 1.4144,
"step": 178
},
{
"epoch": 0.7665952890792291,
"grad_norm": 0.3606085218359927,
"learning_rate": 2.35317885750961e-05,
"loss": 1.5244,
"step": 179
},
{
"epoch": 0.7708779443254818,
"grad_norm": 0.34295993724517143,
"learning_rate": 2.3511018460512696e-05,
"loss": 1.4102,
"step": 180
},
{
"epoch": 0.7751605995717344,
"grad_norm": 0.4192738060845751,
"learning_rate": 2.349011176279379e-05,
"loss": 1.5336,
"step": 181
},
{
"epoch": 0.7794432548179872,
"grad_norm": 0.3651804066614457,
"learning_rate": 2.3469068741270744e-05,
"loss": 1.5337,
"step": 182
},
{
"epoch": 0.7837259100642399,
"grad_norm": 0.3621028477405051,
"learning_rate": 2.3447889656965896e-05,
"loss": 1.6515,
"step": 183
},
{
"epoch": 0.7880085653104925,
"grad_norm": 0.4192540795103203,
"learning_rate": 2.342657477258935e-05,
"loss": 1.6674,
"step": 184
},
{
"epoch": 0.7922912205567452,
"grad_norm": 0.40348346440086696,
"learning_rate": 2.340512435253569e-05,
"loss": 1.5162,
"step": 185
},
{
"epoch": 0.7965738758029979,
"grad_norm": 0.39794985457766996,
"learning_rate": 2.3383538662880732e-05,
"loss": 1.4518,
"step": 186
},
{
"epoch": 0.8008565310492506,
"grad_norm": 0.40289663289027905,
"learning_rate": 2.3361817971378197e-05,
"loss": 1.6306,
"step": 187
},
{
"epoch": 0.8051391862955032,
"grad_norm": 0.35469529427153196,
"learning_rate": 2.3339962547456397e-05,
"loss": 1.3989,
"step": 188
},
{
"epoch": 0.8094218415417559,
"grad_norm": 0.3727961066406737,
"learning_rate": 2.3317972662214898e-05,
"loss": 1.5999,
"step": 189
},
{
"epoch": 0.8137044967880086,
"grad_norm": 0.4333692956220233,
"learning_rate": 2.329584858842116e-05,
"loss": 1.5081,
"step": 190
},
{
"epoch": 0.8179871520342612,
"grad_norm": 0.3789155310493327,
"learning_rate": 2.3273590600507135e-05,
"loss": 1.4586,
"step": 191
},
{
"epoch": 0.8222698072805139,
"grad_norm": 0.3768886247305229,
"learning_rate": 2.3251198974565887e-05,
"loss": 1.5521,
"step": 192
},
{
"epoch": 0.8265524625267666,
"grad_norm": 0.45287917232766545,
"learning_rate": 2.322867398834815e-05,
"loss": 1.6411,
"step": 193
},
{
"epoch": 0.8308351177730193,
"grad_norm": 0.36693401699800615,
"learning_rate": 2.320601592125889e-05,
"loss": 1.5276,
"step": 194
},
{
"epoch": 0.8351177730192719,
"grad_norm": 0.44127614034536217,
"learning_rate": 2.318322505435384e-05,
"loss": 1.4782,
"step": 195
},
{
"epoch": 0.8394004282655246,
"grad_norm": 0.3608904748036842,
"learning_rate": 2.316030167033601e-05,
"loss": 1.5273,
"step": 196
},
{
"epoch": 0.8436830835117773,
"grad_norm": 0.38846305560083205,
"learning_rate": 2.313724605355218e-05,
"loss": 1.4738,
"step": 197
},
{
"epoch": 0.8479657387580299,
"grad_norm": 0.3509606665662544,
"learning_rate": 2.3114058489989378e-05,
"loss": 1.4431,
"step": 198
},
{
"epoch": 0.8522483940042827,
"grad_norm": 0.3858549090015476,
"learning_rate": 2.3090739267271332e-05,
"loss": 1.515,
"step": 199
},
{
"epoch": 0.8565310492505354,
"grad_norm": 0.4113088516859706,
"learning_rate": 2.306728867465489e-05,
"loss": 1.5244,
"step": 200
},
{
"epoch": 0.860813704496788,
"grad_norm": 0.44586677646136047,
"learning_rate": 2.3043707003026452e-05,
"loss": 1.4043,
"step": 201
},
{
"epoch": 0.8650963597430407,
"grad_norm": 0.43850720329826914,
"learning_rate": 2.3019994544898345e-05,
"loss": 1.5149,
"step": 202
},
{
"epoch": 0.8693790149892934,
"grad_norm": 0.4409370304445262,
"learning_rate": 2.2996151594405196e-05,
"loss": 1.5645,
"step": 203
},
{
"epoch": 0.8736616702355461,
"grad_norm": 0.5520670678955565,
"learning_rate": 2.2972178447300305e-05,
"loss": 1.5525,
"step": 204
},
{
"epoch": 0.8779443254817987,
"grad_norm": 0.44992372464956326,
"learning_rate": 2.2948075400951946e-05,
"loss": 1.5927,
"step": 205
},
{
"epoch": 0.8822269807280514,
"grad_norm": 0.5250810847046828,
"learning_rate": 2.2923842754339696e-05,
"loss": 1.5617,
"step": 206
},
{
"epoch": 0.8865096359743041,
"grad_norm": 0.43126340615021524,
"learning_rate": 2.2899480808050724e-05,
"loss": 1.6348,
"step": 207
},
{
"epoch": 0.8907922912205567,
"grad_norm": 0.5913654606733179,
"learning_rate": 2.2874989864276058e-05,
"loss": 1.5646,
"step": 208
},
{
"epoch": 0.8950749464668094,
"grad_norm": 0.5253786434201022,
"learning_rate": 2.2850370226806846e-05,
"loss": 1.5984,
"step": 209
},
{
"epoch": 0.8993576017130621,
"grad_norm": 0.4009456934819743,
"learning_rate": 2.2825622201030572e-05,
"loss": 1.5283,
"step": 210
},
{
"epoch": 0.9036402569593148,
"grad_norm": 0.5333990945105044,
"learning_rate": 2.280074609392729e-05,
"loss": 1.5867,
"step": 211
},
{
"epoch": 0.9079229122055674,
"grad_norm": 0.3887789131541451,
"learning_rate": 2.2775742214065786e-05,
"loss": 1.3414,
"step": 212
},
{
"epoch": 0.9122055674518201,
"grad_norm": 0.5198803692192113,
"learning_rate": 2.2750610871599782e-05,
"loss": 1.5405,
"step": 213
},
{
"epoch": 0.9164882226980728,
"grad_norm": 0.3926454337534817,
"learning_rate": 2.2725352378264074e-05,
"loss": 1.509,
"step": 214
},
{
"epoch": 0.9207708779443254,
"grad_norm": 0.42675935243666635,
"learning_rate": 2.2699967047370656e-05,
"loss": 1.5438,
"step": 215
},
{
"epoch": 0.9250535331905781,
"grad_norm": 0.3709378032432874,
"learning_rate": 2.2674455193804857e-05,
"loss": 1.6725,
"step": 216
},
{
"epoch": 0.9293361884368309,
"grad_norm": 0.40669761633617474,
"learning_rate": 2.26488171340214e-05,
"loss": 1.485,
"step": 217
},
{
"epoch": 0.9336188436830836,
"grad_norm": 0.41102950360303664,
"learning_rate": 2.2623053186040533e-05,
"loss": 1.6809,
"step": 218
},
{
"epoch": 0.9379014989293362,
"grad_norm": 0.40461859144094875,
"learning_rate": 2.259716366944401e-05,
"loss": 1.4951,
"step": 219
},
{
"epoch": 0.9421841541755889,
"grad_norm": 0.3897126856825778,
"learning_rate": 2.25711489053712e-05,
"loss": 1.5844,
"step": 220
},
{
"epoch": 0.9464668094218416,
"grad_norm": 0.42222904373725634,
"learning_rate": 2.2545009216515038e-05,
"loss": 1.4944,
"step": 221
},
{
"epoch": 0.9507494646680942,
"grad_norm": 0.40547118703731166,
"learning_rate": 2.2518744927118085e-05,
"loss": 1.5574,
"step": 222
},
{
"epoch": 0.9550321199143469,
"grad_norm": 0.3513543405028927,
"learning_rate": 2.2492356362968452e-05,
"loss": 1.4118,
"step": 223
},
{
"epoch": 0.9593147751605996,
"grad_norm": 0.48633500004889796,
"learning_rate": 2.2465843851395796e-05,
"loss": 1.5477,
"step": 224
},
{
"epoch": 0.9635974304068522,
"grad_norm": 0.3590985254593397,
"learning_rate": 2.2439207721267236e-05,
"loss": 1.4816,
"step": 225
},
{
"epoch": 0.9678800856531049,
"grad_norm": 0.3702932493860504,
"learning_rate": 2.2412448302983286e-05,
"loss": 1.5548,
"step": 226
},
{
"epoch": 0.9721627408993576,
"grad_norm": 0.40425531625329014,
"learning_rate": 2.2385565928473758e-05,
"loss": 1.6429,
"step": 227
},
{
"epoch": 0.9764453961456103,
"grad_norm": 0.4058276769467583,
"learning_rate": 2.2358560931193636e-05,
"loss": 1.4335,
"step": 228
},
{
"epoch": 0.9807280513918629,
"grad_norm": 0.3312315245440172,
"learning_rate": 2.2331433646118946e-05,
"loss": 1.3716,
"step": 229
},
{
"epoch": 0.9850107066381156,
"grad_norm": 0.45936537843711933,
"learning_rate": 2.2304184409742602e-05,
"loss": 1.6051,
"step": 230
},
{
"epoch": 0.9892933618843683,
"grad_norm": 0.41972232909317975,
"learning_rate": 2.227681356007022e-05,
"loss": 1.5685,
"step": 231
},
{
"epoch": 0.9935760171306209,
"grad_norm": 0.3634109524654273,
"learning_rate": 2.224932143661594e-05,
"loss": 1.5598,
"step": 232
},
{
"epoch": 0.9978586723768736,
"grad_norm": 0.45907719960230176,
"learning_rate": 2.222170838039822e-05,
"loss": 1.5116,
"step": 233
},
{
"epoch": 1.0,
"grad_norm": 0.45907719960230176,
"learning_rate": 2.2193974733935573e-05,
"loss": 1.6087,
"step": 234
},
{
"epoch": 1.0042826552462527,
"grad_norm": 0.5861411036899304,
"learning_rate": 2.216612084124236e-05,
"loss": 1.3689,
"step": 235
},
{
"epoch": 1.0085653104925054,
"grad_norm": 0.4289041901369656,
"learning_rate": 2.213814704782449e-05,
"loss": 1.5579,
"step": 236
},
{
"epoch": 1.0128479657387581,
"grad_norm": 0.6259476055605661,
"learning_rate": 2.2110053700675153e-05,
"loss": 1.4052,
"step": 237
},
{
"epoch": 1.0171306209850106,
"grad_norm": 0.38820446634590455,
"learning_rate": 2.2081841148270517e-05,
"loss": 1.4333,
"step": 238
},
{
"epoch": 1.0214132762312633,
"grad_norm": 0.5061006213518089,
"learning_rate": 2.205350974056538e-05,
"loss": 1.356,
"step": 239
},
{
"epoch": 1.025695931477516,
"grad_norm": 0.3610425739202918,
"learning_rate": 2.2025059828988873e-05,
"loss": 1.3948,
"step": 240
},
{
"epoch": 1.0299785867237687,
"grad_norm": 0.423679381495652,
"learning_rate": 2.1996491766440047e-05,
"loss": 1.3546,
"step": 241
},
{
"epoch": 1.0342612419700214,
"grad_norm": 0.34897540177436914,
"learning_rate": 2.196780590728355e-05,
"loss": 1.4721,
"step": 242
},
{
"epoch": 1.0385438972162742,
"grad_norm": 0.425385319438199,
"learning_rate": 2.193900260734519e-05,
"loss": 1.4658,
"step": 243
},
{
"epoch": 1.0428265524625269,
"grad_norm": 0.3792487113919495,
"learning_rate": 2.191008222390754e-05,
"loss": 1.4699,
"step": 244
},
{
"epoch": 1.0471092077087794,
"grad_norm": 0.40281504819932906,
"learning_rate": 2.188104511570551e-05,
"loss": 1.3331,
"step": 245
},
{
"epoch": 1.051391862955032,
"grad_norm": 0.395699301044668,
"learning_rate": 2.1851891642921875e-05,
"loss": 1.4023,
"step": 246
},
{
"epoch": 1.0556745182012848,
"grad_norm": 0.37492910340499946,
"learning_rate": 2.1822622167182837e-05,
"loss": 1.4737,
"step": 247
},
{
"epoch": 1.0599571734475375,
"grad_norm": 0.3952955885524941,
"learning_rate": 2.1793237051553516e-05,
"loss": 1.3771,
"step": 248
},
{
"epoch": 1.0642398286937902,
"grad_norm": 0.3870229140110392,
"learning_rate": 2.176373666053346e-05,
"loss": 1.4438,
"step": 249
},
{
"epoch": 1.068522483940043,
"grad_norm": 0.40050568793681735,
"learning_rate": 2.1734121360052117e-05,
"loss": 1.3037,
"step": 250
},
{
"epoch": 1.0728051391862956,
"grad_norm": 0.36180001178651866,
"learning_rate": 2.1704391517464297e-05,
"loss": 1.4278,
"step": 251
},
{
"epoch": 1.077087794432548,
"grad_norm": 0.4411737907590586,
"learning_rate": 2.1674547501545615e-05,
"loss": 1.3945,
"step": 252
},
{
"epoch": 1.0813704496788008,
"grad_norm": 0.49343649178046994,
"learning_rate": 2.164458968248792e-05,
"loss": 1.3915,
"step": 253
},
{
"epoch": 1.0856531049250535,
"grad_norm": 0.3296867039273728,
"learning_rate": 2.16145184318947e-05,
"loss": 1.3265,
"step": 254
},
{
"epoch": 1.0899357601713062,
"grad_norm": 0.39840035584346023,
"learning_rate": 2.158433412277647e-05,
"loss": 1.3751,
"step": 255
},
{
"epoch": 1.094218415417559,
"grad_norm": 0.3633584286546075,
"learning_rate": 2.1554037129546153e-05,
"loss": 1.354,
"step": 256
},
{
"epoch": 1.0985010706638116,
"grad_norm": 0.4160505299653988,
"learning_rate": 2.152362782801443e-05,
"loss": 1.4007,
"step": 257
},
{
"epoch": 1.1027837259100641,
"grad_norm": 0.41007015982955497,
"learning_rate": 2.1493106595385075e-05,
"loss": 1.5213,
"step": 258
},
{
"epoch": 1.1070663811563168,
"grad_norm": 0.4650280917344183,
"learning_rate": 2.1462473810250283e-05,
"loss": 1.3312,
"step": 259
},
{
"epoch": 1.1113490364025695,
"grad_norm": 0.4266636624788006,
"learning_rate": 2.1431729852585973e-05,
"loss": 1.4889,
"step": 260
},
{
"epoch": 1.1156316916488223,
"grad_norm": 0.3484736446907606,
"learning_rate": 2.140087510374707e-05,
"loss": 1.3312,
"step": 261
},
{
"epoch": 1.119914346895075,
"grad_norm": 0.41911843923802033,
"learning_rate": 2.1369909946462785e-05,
"loss": 1.3692,
"step": 262
},
{
"epoch": 1.1241970021413277,
"grad_norm": 0.3732407300025524,
"learning_rate": 2.1338834764831845e-05,
"loss": 1.3838,
"step": 263
},
{
"epoch": 1.1284796573875804,
"grad_norm": 0.38178586641917484,
"learning_rate": 2.1307649944317757e-05,
"loss": 1.2793,
"step": 264
},
{
"epoch": 1.132762312633833,
"grad_norm": 0.3673713909731938,
"learning_rate": 2.1276355871744014e-05,
"loss": 1.4399,
"step": 265
},
{
"epoch": 1.1370449678800856,
"grad_norm": 0.3901268012108484,
"learning_rate": 2.124495293528928e-05,
"loss": 1.4587,
"step": 266
},
{
"epoch": 1.1413276231263383,
"grad_norm": 0.3360533239959902,
"learning_rate": 2.121344152448261e-05,
"loss": 1.243,
"step": 267
},
{
"epoch": 1.145610278372591,
"grad_norm": 0.3771399946534415,
"learning_rate": 2.118182203019859e-05,
"loss": 1.3957,
"step": 268
},
{
"epoch": 1.1498929336188437,
"grad_norm": 0.4880244995913143,
"learning_rate": 2.1150094844652493e-05,
"loss": 1.3888,
"step": 269
},
{
"epoch": 1.1541755888650964,
"grad_norm": 0.3578978890422881,
"learning_rate": 2.1118260361395428e-05,
"loss": 1.4619,
"step": 270
},
{
"epoch": 1.1584582441113491,
"grad_norm": 0.4201377835773034,
"learning_rate": 2.108631897530945e-05,
"loss": 1.4785,
"step": 271
},
{
"epoch": 1.1627408993576016,
"grad_norm": 0.4499980376910688,
"learning_rate": 2.1054271082602646e-05,
"loss": 1.4159,
"step": 272
},
{
"epoch": 1.1670235546038543,
"grad_norm": 0.3320870014261129,
"learning_rate": 2.102211708080425e-05,
"loss": 1.3894,
"step": 273
},
{
"epoch": 1.171306209850107,
"grad_norm": 0.42013650446350975,
"learning_rate": 2.0989857368759686e-05,
"loss": 1.3316,
"step": 274
},
{
"epoch": 1.1755888650963597,
"grad_norm": 0.35386203059819066,
"learning_rate": 2.0957492346625647e-05,
"loss": 1.4005,
"step": 275
},
{
"epoch": 1.1798715203426124,
"grad_norm": 0.3484835954332615,
"learning_rate": 2.0925022415865093e-05,
"loss": 1.275,
"step": 276
},
{
"epoch": 1.1841541755888652,
"grad_norm": 0.4266307426695914,
"learning_rate": 2.0892447979242314e-05,
"loss": 1.3413,
"step": 277
},
{
"epoch": 1.1884368308351179,
"grad_norm": 0.4145417718791916,
"learning_rate": 2.085976944081791e-05,
"loss": 1.4286,
"step": 278
},
{
"epoch": 1.1927194860813706,
"grad_norm": 0.4464633405061637,
"learning_rate": 2.0826987205943772e-05,
"loss": 1.4146,
"step": 279
},
{
"epoch": 1.197002141327623,
"grad_norm": 0.3813440974126778,
"learning_rate": 2.0794101681258077e-05,
"loss": 1.4651,
"step": 280
},
{
"epoch": 1.2012847965738758,
"grad_norm": 0.37367647405069787,
"learning_rate": 2.0761113274680227e-05,
"loss": 1.3905,
"step": 281
},
{
"epoch": 1.2055674518201285,
"grad_norm": 0.4209973043589035,
"learning_rate": 2.0728022395405794e-05,
"loss": 1.3164,
"step": 282
},
{
"epoch": 1.2098501070663812,
"grad_norm": 0.35285764889842397,
"learning_rate": 2.069482945390145e-05,
"loss": 1.3184,
"step": 283
},
{
"epoch": 1.214132762312634,
"grad_norm": 0.6553038505857459,
"learning_rate": 2.0661534861899858e-05,
"loss": 1.2821,
"step": 284
},
{
"epoch": 1.2184154175588866,
"grad_norm": 0.4444549917679711,
"learning_rate": 2.0628139032394582e-05,
"loss": 1.3502,
"step": 285
},
{
"epoch": 1.222698072805139,
"grad_norm": 0.3352896065598441,
"learning_rate": 2.0594642379634972e-05,
"loss": 1.4577,
"step": 286
},
{
"epoch": 1.2269807280513918,
"grad_norm": 0.47069617049270435,
"learning_rate": 2.0561045319120986e-05,
"loss": 1.4025,
"step": 287
},
{
"epoch": 1.2312633832976445,
"grad_norm": 0.3991774380744109,
"learning_rate": 2.0527348267598085e-05,
"loss": 1.3674,
"step": 288
},
{
"epoch": 1.2355460385438972,
"grad_norm": 0.45298444147723504,
"learning_rate": 2.049355164305203e-05,
"loss": 1.2552,
"step": 289
},
{
"epoch": 1.23982869379015,
"grad_norm": 0.33638821026760457,
"learning_rate": 2.0459655864703708e-05,
"loss": 1.2414,
"step": 290
},
{
"epoch": 1.2441113490364026,
"grad_norm": 0.4270670356767359,
"learning_rate": 2.0425661353003932e-05,
"loss": 1.261,
"step": 291
},
{
"epoch": 1.2483940042826553,
"grad_norm": 0.40636537980947196,
"learning_rate": 2.0391568529628237e-05,
"loss": 1.3725,
"step": 292
},
{
"epoch": 1.252676659528908,
"grad_norm": 0.36195547030323016,
"learning_rate": 2.035737781747162e-05,
"loss": 1.3342,
"step": 293
},
{
"epoch": 1.2569593147751605,
"grad_norm": 0.3539734470288324,
"learning_rate": 2.0323089640643326e-05,
"loss": 1.2697,
"step": 294
},
{
"epoch": 1.2612419700214133,
"grad_norm": 0.3540155063008326,
"learning_rate": 2.0288704424461565e-05,
"loss": 1.3329,
"step": 295
},
{
"epoch": 1.265524625267666,
"grad_norm": 0.4090169739563911,
"learning_rate": 2.0254222595448248e-05,
"loss": 1.4402,
"step": 296
},
{
"epoch": 1.2698072805139187,
"grad_norm": 0.4193574818141074,
"learning_rate": 2.0219644581323698e-05,
"loss": 1.3086,
"step": 297
},
{
"epoch": 1.2740899357601714,
"grad_norm": 0.38365729947629434,
"learning_rate": 2.0184970811001337e-05,
"loss": 1.4018,
"step": 298
},
{
"epoch": 1.2783725910064239,
"grad_norm": 0.4219737883083424,
"learning_rate": 2.0150201714582356e-05,
"loss": 1.3844,
"step": 299
},
{
"epoch": 1.2826552462526766,
"grad_norm": 0.43507834104776355,
"learning_rate": 2.011533772335041e-05,
"loss": 1.3706,
"step": 300
},
{
"epoch": 1.2869379014989293,
"grad_norm": 0.4133280809903553,
"learning_rate": 2.008037926976625e-05,
"loss": 1.376,
"step": 301
},
{
"epoch": 1.291220556745182,
"grad_norm": 0.36852825890998525,
"learning_rate": 2.0045326787462333e-05,
"loss": 1.328,
"step": 302
},
{
"epoch": 1.2955032119914347,
"grad_norm": 0.4205230066377953,
"learning_rate": 2.001018071123751e-05,
"loss": 1.2974,
"step": 303
},
{
"epoch": 1.2997858672376874,
"grad_norm": 0.4329679857419846,
"learning_rate": 1.9974941477051558e-05,
"loss": 1.3526,
"step": 304
},
{
"epoch": 1.3040685224839401,
"grad_norm": 0.3705004730863205,
"learning_rate": 1.9939609522019818e-05,
"loss": 1.2298,
"step": 305
},
{
"epoch": 1.3083511777301928,
"grad_norm": 0.39436925521218896,
"learning_rate": 1.9904185284407772e-05,
"loss": 1.3945,
"step": 306
},
{
"epoch": 1.3126338329764453,
"grad_norm": 0.35298924796738734,
"learning_rate": 1.986866920362558e-05,
"loss": 1.3016,
"step": 307
},
{
"epoch": 1.316916488222698,
"grad_norm": 0.3894071215590034,
"learning_rate": 1.9833061720222647e-05,
"loss": 1.2325,
"step": 308
},
{
"epoch": 1.3211991434689507,
"grad_norm": 0.3213378234068627,
"learning_rate": 1.9797363275882165e-05,
"loss": 1.2817,
"step": 309
},
{
"epoch": 1.3254817987152034,
"grad_norm": 0.4084287292776311,
"learning_rate": 1.9761574313415617e-05,
"loss": 1.4881,
"step": 310
},
{
"epoch": 1.3297644539614561,
"grad_norm": 0.40532300063738275,
"learning_rate": 1.9725695276757302e-05,
"loss": 1.4029,
"step": 311
},
{
"epoch": 1.3340471092077089,
"grad_norm": 0.3507190637097869,
"learning_rate": 1.9689726610958814e-05,
"loss": 1.4194,
"step": 312
},
{
"epoch": 1.3383297644539613,
"grad_norm": 0.3805072033067047,
"learning_rate": 1.9653668762183526e-05,
"loss": 1.3264,
"step": 313
},
{
"epoch": 1.342612419700214,
"grad_norm": 0.3367128120964735,
"learning_rate": 1.9617522177701058e-05,
"loss": 1.3298,
"step": 314
},
{
"epoch": 1.3468950749464668,
"grad_norm": 0.3977736636900147,
"learning_rate": 1.9581287305881733e-05,
"loss": 1.3487,
"step": 315
},
{
"epoch": 1.3511777301927195,
"grad_norm": 0.3236399137428874,
"learning_rate": 1.9544964596190996e-05,
"loss": 1.2795,
"step": 316
},
{
"epoch": 1.3554603854389722,
"grad_norm": 0.4410261852426088,
"learning_rate": 1.9508554499183867e-05,
"loss": 1.2954,
"step": 317
},
{
"epoch": 1.359743040685225,
"grad_norm": 0.33824185574060495,
"learning_rate": 1.9472057466499332e-05,
"loss": 1.2966,
"step": 318
},
{
"epoch": 1.3640256959314776,
"grad_norm": 0.5560403035800862,
"learning_rate": 1.9435473950854745e-05,
"loss": 1.4434,
"step": 319
},
{
"epoch": 1.3683083511777303,
"grad_norm": 0.36625625108883125,
"learning_rate": 1.939880440604021e-05,
"loss": 1.2226,
"step": 320
},
{
"epoch": 1.3725910064239828,
"grad_norm": 0.35699181136533303,
"learning_rate": 1.9362049286912976e-05,
"loss": 1.2464,
"step": 321
},
{
"epoch": 1.3768736616702355,
"grad_norm": 0.3813490989402076,
"learning_rate": 1.9325209049391745e-05,
"loss": 1.3279,
"step": 322
},
{
"epoch": 1.3811563169164882,
"grad_norm": 0.37459529309165335,
"learning_rate": 1.9288284150451075e-05,
"loss": 1.4422,
"step": 323
},
{
"epoch": 1.385438972162741,
"grad_norm": 0.39667372726355776,
"learning_rate": 1.9251275048115664e-05,
"loss": 1.5061,
"step": 324
},
{
"epoch": 1.3897216274089936,
"grad_norm": 0.34082355171490486,
"learning_rate": 1.9214182201454695e-05,
"loss": 1.3049,
"step": 325
},
{
"epoch": 1.3940042826552461,
"grad_norm": 0.4260735758035037,
"learning_rate": 1.917700607057613e-05,
"loss": 1.3912,
"step": 326
},
{
"epoch": 1.3982869379014988,
"grad_norm": 0.4021033157629882,
"learning_rate": 1.9139747116621015e-05,
"loss": 1.4421,
"step": 327
},
{
"epoch": 1.4025695931477515,
"grad_norm": 0.4034799522400383,
"learning_rate": 1.910240580175775e-05,
"loss": 1.3598,
"step": 328
},
{
"epoch": 1.4068522483940042,
"grad_norm": 0.44358114185104625,
"learning_rate": 1.906498258917635e-05,
"loss": 1.4136,
"step": 329
},
{
"epoch": 1.411134903640257,
"grad_norm": 0.3945332504871927,
"learning_rate": 1.9027477943082713e-05,
"loss": 1.2517,
"step": 330
},
{
"epoch": 1.4154175588865097,
"grad_norm": 0.3778742839914516,
"learning_rate": 1.8989892328692864e-05,
"loss": 1.333,
"step": 331
},
{
"epoch": 1.4197002141327624,
"grad_norm": 0.3796237837136356,
"learning_rate": 1.895222621222716e-05,
"loss": 1.3931,
"step": 332
},
{
"epoch": 1.423982869379015,
"grad_norm": 0.38301575785071823,
"learning_rate": 1.8914480060904537e-05,
"loss": 1.424,
"step": 333
},
{
"epoch": 1.4282655246252678,
"grad_norm": 0.421930928101693,
"learning_rate": 1.88766543429367e-05,
"loss": 1.402,
"step": 334
},
{
"epoch": 1.4325481798715203,
"grad_norm": 0.3699757863435036,
"learning_rate": 1.8838749527522315e-05,
"loss": 1.4079,
"step": 335
},
{
"epoch": 1.436830835117773,
"grad_norm": 0.42666319657235885,
"learning_rate": 1.8800766084841183e-05,
"loss": 1.3614,
"step": 336
},
{
"epoch": 1.4411134903640257,
"grad_norm": 0.35291694731273704,
"learning_rate": 1.8762704486048427e-05,
"loss": 1.3407,
"step": 337
},
{
"epoch": 1.4453961456102784,
"grad_norm": 0.37044240049931565,
"learning_rate": 1.872456520326863e-05,
"loss": 1.3531,
"step": 338
},
{
"epoch": 1.4496788008565311,
"grad_norm": 0.374037870809853,
"learning_rate": 1.8686348709589982e-05,
"loss": 1.4962,
"step": 339
},
{
"epoch": 1.4539614561027836,
"grad_norm": 0.39143283644429916,
"learning_rate": 1.8648055479058422e-05,
"loss": 1.3451,
"step": 340
},
{
"epoch": 1.4582441113490363,
"grad_norm": 0.3862274046133055,
"learning_rate": 1.8609685986671744e-05,
"loss": 1.4157,
"step": 341
},
{
"epoch": 1.462526766595289,
"grad_norm": 0.32589359289541453,
"learning_rate": 1.8571240708373707e-05,
"loss": 1.3611,
"step": 342
},
{
"epoch": 1.4668094218415417,
"grad_norm": 0.38467743700470014,
"learning_rate": 1.853272012104815e-05,
"loss": 1.4441,
"step": 343
},
{
"epoch": 1.4710920770877944,
"grad_norm": 0.3740956575298423,
"learning_rate": 1.849412470251305e-05,
"loss": 1.4004,
"step": 344
},
{
"epoch": 1.4753747323340471,
"grad_norm": 0.30848423646912154,
"learning_rate": 1.8455454931514605e-05,
"loss": 1.262,
"step": 345
},
{
"epoch": 1.4796573875802999,
"grad_norm": 0.3740097120746422,
"learning_rate": 1.8416711287721303e-05,
"loss": 1.2179,
"step": 346
},
{
"epoch": 1.4839400428265526,
"grad_norm": 0.4082863839360843,
"learning_rate": 1.8377894251717974e-05,
"loss": 1.4259,
"step": 347
},
{
"epoch": 1.4882226980728053,
"grad_norm": 0.3948652596870541,
"learning_rate": 1.8339004304999806e-05,
"loss": 1.3442,
"step": 348
},
{
"epoch": 1.4925053533190578,
"grad_norm": 0.4678512487151559,
"learning_rate": 1.8300041929966404e-05,
"loss": 1.4306,
"step": 349
},
{
"epoch": 1.4967880085653105,
"grad_norm": 0.45548221851750526,
"learning_rate": 1.8261007609915773e-05,
"loss": 1.3257,
"step": 350
},
{
"epoch": 1.5010706638115632,
"grad_norm": 0.3961504677246392,
"learning_rate": 1.8221901829038347e-05,
"loss": 1.4226,
"step": 351
},
{
"epoch": 1.5053533190578159,
"grad_norm": 0.48575304661026586,
"learning_rate": 1.818272507241099e-05,
"loss": 1.3101,
"step": 352
},
{
"epoch": 1.5096359743040684,
"grad_norm": 0.4223474689775986,
"learning_rate": 1.8143477825990938e-05,
"loss": 1.3738,
"step": 353
},
{
"epoch": 1.513918629550321,
"grad_norm": 0.4328835573924883,
"learning_rate": 1.8104160576609828e-05,
"loss": 1.4613,
"step": 354
},
{
"epoch": 1.5182012847965738,
"grad_norm": 0.36894215625076815,
"learning_rate": 1.80647738119676e-05,
"loss": 1.4421,
"step": 355
},
{
"epoch": 1.5224839400428265,
"grad_norm": 0.42960329602264624,
"learning_rate": 1.8025318020626497e-05,
"loss": 1.4449,
"step": 356
},
{
"epoch": 1.5267665952890792,
"grad_norm": 0.4381808830561339,
"learning_rate": 1.7985793692004983e-05,
"loss": 1.3895,
"step": 357
},
{
"epoch": 1.531049250535332,
"grad_norm": 0.511639740310659,
"learning_rate": 1.7946201316371665e-05,
"loss": 1.5033,
"step": 358
},
{
"epoch": 1.5353319057815846,
"grad_norm": 0.30935207991898406,
"learning_rate": 1.7906541384839226e-05,
"loss": 1.2179,
"step": 359
},
{
"epoch": 1.5396145610278373,
"grad_norm": 0.5149363491855712,
"learning_rate": 1.7866814389358323e-05,
"loss": 1.3692,
"step": 360
},
{
"epoch": 1.54389721627409,
"grad_norm": 0.3768568355085642,
"learning_rate": 1.7827020822711493e-05,
"loss": 1.4404,
"step": 361
},
{
"epoch": 1.5481798715203428,
"grad_norm": 0.5075668454602467,
"learning_rate": 1.7787161178507045e-05,
"loss": 1.4351,
"step": 362
},
{
"epoch": 1.5524625267665952,
"grad_norm": 0.429005671047687,
"learning_rate": 1.7747235951172908e-05,
"loss": 1.2954,
"step": 363
},
{
"epoch": 1.556745182012848,
"grad_norm": 0.4773307561454311,
"learning_rate": 1.7707245635950536e-05,
"loss": 1.3229,
"step": 364
},
{
"epoch": 1.5610278372591007,
"grad_norm": 0.46224461269568345,
"learning_rate": 1.7667190728888743e-05,
"loss": 1.4701,
"step": 365
},
{
"epoch": 1.5653104925053534,
"grad_norm": 0.4398714446841838,
"learning_rate": 1.7627071726837556e-05,
"loss": 1.3617,
"step": 366
},
{
"epoch": 1.5695931477516059,
"grad_norm": 0.3774107684610511,
"learning_rate": 1.7586889127442045e-05,
"loss": 1.3137,
"step": 367
},
{
"epoch": 1.5738758029978586,
"grad_norm": 0.4646696934362882,
"learning_rate": 1.754664342913616e-05,
"loss": 1.3487,
"step": 368
},
{
"epoch": 1.5781584582441113,
"grad_norm": 0.3570064846109861,
"learning_rate": 1.7506335131136548e-05,
"loss": 1.3087,
"step": 369
},
{
"epoch": 1.582441113490364,
"grad_norm": 0.4493705452348863,
"learning_rate": 1.7465964733436342e-05,
"loss": 1.5064,
"step": 370
},
{
"epoch": 1.5867237687366167,
"grad_norm": 0.35347935083263654,
"learning_rate": 1.7425532736798994e-05,
"loss": 1.354,
"step": 371
},
{
"epoch": 1.5910064239828694,
"grad_norm": 0.38802945271200445,
"learning_rate": 1.7385039642752026e-05,
"loss": 1.3905,
"step": 372
},
{
"epoch": 1.595289079229122,
"grad_norm": 0.3971847941983123,
"learning_rate": 1.7344485953580834e-05,
"loss": 1.3172,
"step": 373
},
{
"epoch": 1.5995717344753748,
"grad_norm": 0.4063900151850949,
"learning_rate": 1.730387217232245e-05,
"loss": 1.3902,
"step": 374
},
{
"epoch": 1.6038543897216275,
"grad_norm": 0.3482101582890047,
"learning_rate": 1.72631988027593e-05,
"loss": 1.4267,
"step": 375
},
{
"epoch": 1.6081370449678802,
"grad_norm": 0.3907023409634497,
"learning_rate": 1.7222466349412953e-05,
"loss": 1.3657,
"step": 376
},
{
"epoch": 1.6124197002141327,
"grad_norm": 0.39648365466974855,
"learning_rate": 1.718167531753787e-05,
"loss": 1.3757,
"step": 377
},
{
"epoch": 1.6167023554603854,
"grad_norm": 0.3482003705389042,
"learning_rate": 1.7140826213115134e-05,
"loss": 1.3889,
"step": 378
},
{
"epoch": 1.6209850107066381,
"grad_norm": 0.43357670792552266,
"learning_rate": 1.7099919542846174e-05,
"loss": 1.3975,
"step": 379
},
{
"epoch": 1.6252676659528906,
"grad_norm": 0.344012746609685,
"learning_rate": 1.705895581414647e-05,
"loss": 1.3761,
"step": 380
},
{
"epoch": 1.6295503211991433,
"grad_norm": 0.3912736883863624,
"learning_rate": 1.7017935535139286e-05,
"loss": 1.2256,
"step": 381
},
{
"epoch": 1.633832976445396,
"grad_norm": 0.32389309159432333,
"learning_rate": 1.697685921464932e-05,
"loss": 1.2611,
"step": 382
},
{
"epoch": 1.6381156316916488,
"grad_norm": 0.3808112089261434,
"learning_rate": 1.6935727362196453e-05,
"loss": 1.3773,
"step": 383
},
{
"epoch": 1.6423982869379015,
"grad_norm": 0.3815707909378436,
"learning_rate": 1.6894540487989374e-05,
"loss": 1.4341,
"step": 384
},
{
"epoch": 1.6466809421841542,
"grad_norm": 0.3707311578105496,
"learning_rate": 1.6853299102919278e-05,
"loss": 1.3912,
"step": 385
},
{
"epoch": 1.6509635974304069,
"grad_norm": 0.3477881955581895,
"learning_rate": 1.681200371855354e-05,
"loss": 1.4454,
"step": 386
},
{
"epoch": 1.6552462526766596,
"grad_norm": 0.3749155440303463,
"learning_rate": 1.6770654847129336e-05,
"loss": 1.3565,
"step": 387
},
{
"epoch": 1.6595289079229123,
"grad_norm": 0.37356126951976065,
"learning_rate": 1.6729253001547313e-05,
"loss": 1.2841,
"step": 388
},
{
"epoch": 1.663811563169165,
"grad_norm": 0.3479511050011833,
"learning_rate": 1.6687798695365224e-05,
"loss": 1.3371,
"step": 389
},
{
"epoch": 1.6680942184154177,
"grad_norm": 0.3581912213414331,
"learning_rate": 1.6646292442791557e-05,
"loss": 1.232,
"step": 390
},
{
"epoch": 1.6723768736616702,
"grad_norm": 0.3215446113048358,
"learning_rate": 1.6604734758679147e-05,
"loss": 1.3963,
"step": 391
},
{
"epoch": 1.676659528907923,
"grad_norm": 0.4376359515021747,
"learning_rate": 1.6563126158518806e-05,
"loss": 1.3747,
"step": 392
},
{
"epoch": 1.6809421841541756,
"grad_norm": 0.3060677115981459,
"learning_rate": 1.6521467158432916e-05,
"loss": 1.3455,
"step": 393
},
{
"epoch": 1.685224839400428,
"grad_norm": 0.39842372210368826,
"learning_rate": 1.647975827516902e-05,
"loss": 1.3162,
"step": 394
},
{
"epoch": 1.6895074946466808,
"grad_norm": 0.32860459996161495,
"learning_rate": 1.6438000026093447e-05,
"loss": 1.4114,
"step": 395
},
{
"epoch": 1.6937901498929335,
"grad_norm": 0.42177195772773357,
"learning_rate": 1.6396192929184852e-05,
"loss": 1.3835,
"step": 396
},
{
"epoch": 1.6980728051391862,
"grad_norm": 0.37483985613490883,
"learning_rate": 1.6354337503027817e-05,
"loss": 1.4495,
"step": 397
},
{
"epoch": 1.702355460385439,
"grad_norm": 0.3287442844969753,
"learning_rate": 1.6312434266806406e-05,
"loss": 1.3417,
"step": 398
},
{
"epoch": 1.7066381156316917,
"grad_norm": 0.3409487933679222,
"learning_rate": 1.627048374029773e-05,
"loss": 1.3727,
"step": 399
},
{
"epoch": 1.7109207708779444,
"grad_norm": 0.364966633180017,
"learning_rate": 1.622848644386551e-05,
"loss": 1.3445,
"step": 400
},
{
"epoch": 1.715203426124197,
"grad_norm": 0.40782880089567125,
"learning_rate": 1.6186442898453593e-05,
"loss": 1.4314,
"step": 401
},
{
"epoch": 1.7194860813704498,
"grad_norm": 0.35338981155106325,
"learning_rate": 1.614435362557953e-05,
"loss": 1.2992,
"step": 402
},
{
"epoch": 1.7237687366167025,
"grad_norm": 0.3458710703190408,
"learning_rate": 1.6102219147328064e-05,
"loss": 1.2444,
"step": 403
},
{
"epoch": 1.728051391862955,
"grad_norm": 0.34047208337511875,
"learning_rate": 1.6060039986344692e-05,
"loss": 1.3841,
"step": 404
},
{
"epoch": 1.7323340471092077,
"grad_norm": 0.34973667960604016,
"learning_rate": 1.601781666582916e-05,
"loss": 1.3197,
"step": 405
},
{
"epoch": 1.7366167023554604,
"grad_norm": 0.3619484642212399,
"learning_rate": 1.5975549709528977e-05,
"loss": 1.3597,
"step": 406
},
{
"epoch": 1.740899357601713,
"grad_norm": 0.3485323431598921,
"learning_rate": 1.593323964173292e-05,
"loss": 1.3541,
"step": 407
},
{
"epoch": 1.7451820128479656,
"grad_norm": 0.3722079995799495,
"learning_rate": 1.5890886987264536e-05,
"loss": 1.3639,
"step": 408
},
{
"epoch": 1.7494646680942183,
"grad_norm": 0.32734387518519825,
"learning_rate": 1.5848492271475622e-05,
"loss": 1.4136,
"step": 409
},
{
"epoch": 1.753747323340471,
"grad_norm": 0.3864261811647076,
"learning_rate": 1.5806056020239714e-05,
"loss": 1.4231,
"step": 410
},
{
"epoch": 1.7580299785867237,
"grad_norm": 0.341163146089911,
"learning_rate": 1.576357875994556e-05,
"loss": 1.3912,
"step": 411
},
{
"epoch": 1.7623126338329764,
"grad_norm": 0.4322424139588224,
"learning_rate": 1.5721061017490594e-05,
"loss": 1.3543,
"step": 412
},
{
"epoch": 1.7665952890792291,
"grad_norm": 0.3430090140811513,
"learning_rate": 1.5678503320274407e-05,
"loss": 1.4195,
"step": 413
},
{
"epoch": 1.7708779443254818,
"grad_norm": 0.39442054888019096,
"learning_rate": 1.5635906196192194e-05,
"loss": 1.3609,
"step": 414
},
{
"epoch": 1.7751605995717346,
"grad_norm": 0.39246818337147305,
"learning_rate": 1.5593270173628208e-05,
"loss": 1.3496,
"step": 415
},
{
"epoch": 1.7794432548179873,
"grad_norm": 0.3896357465642991,
"learning_rate": 1.5550595781449205e-05,
"loss": 1.2962,
"step": 416
},
{
"epoch": 1.78372591006424,
"grad_norm": 0.40875227853762397,
"learning_rate": 1.550788354899789e-05,
"loss": 1.2827,
"step": 417
},
{
"epoch": 1.7880085653104925,
"grad_norm": 0.32384312840403434,
"learning_rate": 1.5465134006086347e-05,
"loss": 1.4018,
"step": 418
},
{
"epoch": 1.7922912205567452,
"grad_norm": 0.4319845932792659,
"learning_rate": 1.5422347682989467e-05,
"loss": 1.2837,
"step": 419
},
{
"epoch": 1.7965738758029979,
"grad_norm": 0.4015204521770257,
"learning_rate": 1.5379525110438374e-05,
"loss": 1.445,
"step": 420
},
{
"epoch": 1.8008565310492506,
"grad_norm": 0.3636542581207264,
"learning_rate": 1.5336666819613832e-05,
"loss": 1.3278,
"step": 421
},
{
"epoch": 1.805139186295503,
"grad_norm": 0.42635584079656125,
"learning_rate": 1.5293773342139662e-05,
"loss": 1.3899,
"step": 422
},
{
"epoch": 1.8094218415417558,
"grad_norm": 0.3796172113574308,
"learning_rate": 1.5250845210076151e-05,
"loss": 1.2944,
"step": 423
},
{
"epoch": 1.8137044967880085,
"grad_norm": 0.4222877528683101,
"learning_rate": 1.5207882955913457e-05,
"loss": 1.4121,
"step": 424
},
{
"epoch": 1.8179871520342612,
"grad_norm": 0.6206094866942423,
"learning_rate": 1.5164887112564985e-05,
"loss": 1.3037,
"step": 425
},
{
"epoch": 1.822269807280514,
"grad_norm": 0.338186939979986,
"learning_rate": 1.5121858213360793e-05,
"loss": 1.4515,
"step": 426
},
{
"epoch": 1.8265524625267666,
"grad_norm": 0.42085883637300137,
"learning_rate": 1.507879679204096e-05,
"loss": 1.3801,
"step": 427
},
{
"epoch": 1.8308351177730193,
"grad_norm": 0.33029638552346774,
"learning_rate": 1.5035703382749e-05,
"loss": 1.3197,
"step": 428
},
{
"epoch": 1.835117773019272,
"grad_norm": 0.3796212349112593,
"learning_rate": 1.4992578520025194e-05,
"loss": 1.3341,
"step": 429
},
{
"epoch": 1.8394004282655247,
"grad_norm": 0.3416764792743133,
"learning_rate": 1.4949422738799982e-05,
"loss": 1.2933,
"step": 430
},
{
"epoch": 1.8436830835117775,
"grad_norm": 0.37923918821239594,
"learning_rate": 1.4906236574387326e-05,
"loss": 1.3359,
"step": 431
},
{
"epoch": 1.84796573875803,
"grad_norm": 0.30907027792758374,
"learning_rate": 1.4863020562478064e-05,
"loss": 1.2737,
"step": 432
},
{
"epoch": 1.8522483940042827,
"grad_norm": 0.3903264898543205,
"learning_rate": 1.4819775239133283e-05,
"loss": 1.3131,
"step": 433
},
{
"epoch": 1.8565310492505354,
"grad_norm": 0.3841336756186868,
"learning_rate": 1.4776501140777637e-05,
"loss": 1.3649,
"step": 434
},
{
"epoch": 1.8608137044967878,
"grad_norm": 0.4074493999576374,
"learning_rate": 1.4733198804192724e-05,
"loss": 1.2991,
"step": 435
},
{
"epoch": 1.8650963597430406,
"grad_norm": 0.3855125688098399,
"learning_rate": 1.4689868766510406e-05,
"loss": 1.3823,
"step": 436
},
{
"epoch": 1.8693790149892933,
"grad_norm": 0.37126874922918807,
"learning_rate": 1.4646511565206164e-05,
"loss": 1.3426,
"step": 437
},
{
"epoch": 1.873661670235546,
"grad_norm": 0.3714258164077467,
"learning_rate": 1.4603127738092423e-05,
"loss": 1.2718,
"step": 438
},
{
"epoch": 1.8779443254817987,
"grad_norm": 0.3429261958678687,
"learning_rate": 1.455971782331187e-05,
"loss": 1.3858,
"step": 439
},
{
"epoch": 1.8822269807280514,
"grad_norm": 0.38495602247470384,
"learning_rate": 1.4516282359330801e-05,
"loss": 1.2777,
"step": 440
},
{
"epoch": 1.886509635974304,
"grad_norm": 0.3699329784967151,
"learning_rate": 1.4472821884932426e-05,
"loss": 1.3578,
"step": 441
},
{
"epoch": 1.8907922912205568,
"grad_norm": 0.3599785136664482,
"learning_rate": 1.442933693921018e-05,
"loss": 1.416,
"step": 442
},
{
"epoch": 1.8950749464668095,
"grad_norm": 0.33538664994930595,
"learning_rate": 1.4385828061561066e-05,
"loss": 1.3407,
"step": 443
},
{
"epoch": 1.8993576017130622,
"grad_norm": 0.36336031298257154,
"learning_rate": 1.434229579167893e-05,
"loss": 1.2169,
"step": 444
},
{
"epoch": 1.903640256959315,
"grad_norm": 0.31518334287029476,
"learning_rate": 1.429874066954778e-05,
"loss": 1.3974,
"step": 445
},
{
"epoch": 1.9079229122055674,
"grad_norm": 0.380470589989531,
"learning_rate": 1.425516323543509e-05,
"loss": 1.3915,
"step": 446
},
{
"epoch": 1.9122055674518201,
"grad_norm": 0.3510136894640434,
"learning_rate": 1.4211564029885102e-05,
"loss": 1.3113,
"step": 447
},
{
"epoch": 1.9164882226980728,
"grad_norm": 0.34050831451001196,
"learning_rate": 1.4167943593712113e-05,
"loss": 1.3751,
"step": 448
},
{
"epoch": 1.9207708779443253,
"grad_norm": 0.3583661125603097,
"learning_rate": 1.4124302467993769e-05,
"loss": 1.3255,
"step": 449
},
{
"epoch": 1.925053533190578,
"grad_norm": 0.3389101579476846,
"learning_rate": 1.4080641194064348e-05,
"loss": 1.4168,
"step": 450
},
{
"epoch": 1.9293361884368307,
"grad_norm": 0.3834913291170707,
"learning_rate": 1.403696031350806e-05,
"loss": 1.3644,
"step": 451
},
{
"epoch": 1.9336188436830835,
"grad_norm": 0.4308322141053784,
"learning_rate": 1.3993260368152317e-05,
"loss": 1.4786,
"step": 452
},
{
"epoch": 1.9379014989293362,
"grad_norm": 0.3537841876121041,
"learning_rate": 1.3949541900061014e-05,
"loss": 1.2849,
"step": 453
},
{
"epoch": 1.9421841541755889,
"grad_norm": 0.3739024334028022,
"learning_rate": 1.3905805451527806e-05,
"loss": 1.2974,
"step": 454
},
{
"epoch": 1.9464668094218416,
"grad_norm": 0.3756096151923131,
"learning_rate": 1.386205156506938e-05,
"loss": 1.2532,
"step": 455
},
{
"epoch": 1.9507494646680943,
"grad_norm": 0.3642163049913141,
"learning_rate": 1.381828078341873e-05,
"loss": 1.3066,
"step": 456
},
{
"epoch": 1.955032119914347,
"grad_norm": 0.4016856878315503,
"learning_rate": 1.3774493649518424e-05,
"loss": 1.3514,
"step": 457
},
{
"epoch": 1.9593147751605997,
"grad_norm": 0.3570908964430489,
"learning_rate": 1.373069070651386e-05,
"loss": 1.3798,
"step": 458
},
{
"epoch": 1.9635974304068522,
"grad_norm": 0.4546768723455663,
"learning_rate": 1.3686872497746539e-05,
"loss": 1.2297,
"step": 459
},
{
"epoch": 1.967880085653105,
"grad_norm": 0.39770363928777963,
"learning_rate": 1.364303956674732e-05,
"loss": 1.3251,
"step": 460
},
{
"epoch": 1.9721627408993576,
"grad_norm": 0.4625841972208585,
"learning_rate": 1.359919245722969e-05,
"loss": 1.4199,
"step": 461
},
{
"epoch": 1.9764453961456103,
"grad_norm": 0.4133274366928544,
"learning_rate": 1.3555331713082991e-05,
"loss": 1.3047,
"step": 462
},
{
"epoch": 1.9807280513918628,
"grad_norm": 0.4108939632332837,
"learning_rate": 1.351145787836571e-05,
"loss": 1.3929,
"step": 463
},
{
"epoch": 1.9850107066381155,
"grad_norm": 0.37835291483581496,
"learning_rate": 1.3467571497298703e-05,
"loss": 1.1941,
"step": 464
},
{
"epoch": 1.9892933618843682,
"grad_norm": 0.37813972695047565,
"learning_rate": 1.342367311425845e-05,
"loss": 1.4973,
"step": 465
},
{
"epoch": 1.993576017130621,
"grad_norm": 1.6403016895398341,
"learning_rate": 1.3379763273770324e-05,
"loss": 1.3624,
"step": 466
},
{
"epoch": 1.9978586723768736,
"grad_norm": 0.4830892612436795,
"learning_rate": 1.3335842520501795e-05,
"loss": 1.302,
"step": 467
},
{
"epoch": 2.0,
"grad_norm": 0.6829899377473765,
"learning_rate": 1.3291911399255713e-05,
"loss": 1.2285,
"step": 468
},
{
"epoch": 2.0042826552462527,
"grad_norm": 0.8136378650415125,
"learning_rate": 1.3247970454963531e-05,
"loss": 1.1863,
"step": 469
},
{
"epoch": 2.0085653104925054,
"grad_norm": 0.6124913953543332,
"learning_rate": 1.3204020232678549e-05,
"loss": 1.1323,
"step": 470
},
{
"epoch": 2.012847965738758,
"grad_norm": 0.9415264304617837,
"learning_rate": 1.3160061277569156e-05,
"loss": 1.1341,
"step": 471
},
{
"epoch": 2.017130620985011,
"grad_norm": 0.5598470498427739,
"learning_rate": 1.3116094134912055e-05,
"loss": 1.0978,
"step": 472
},
{
"epoch": 2.0214132762312635,
"grad_norm": 0.5199782381878686,
"learning_rate": 1.3072119350085524e-05,
"loss": 1.15,
"step": 473
},
{
"epoch": 2.0256959314775163,
"grad_norm": 0.4796395014344232,
"learning_rate": 1.3028137468562624e-05,
"loss": 1.2802,
"step": 474
},
{
"epoch": 2.0299785867237685,
"grad_norm": 0.4542325665519593,
"learning_rate": 1.2984149035904447e-05,
"loss": 1.0659,
"step": 475
},
{
"epoch": 2.0342612419700212,
"grad_norm": 0.4431903012032383,
"learning_rate": 1.2940154597753356e-05,
"loss": 1.0986,
"step": 476
},
{
"epoch": 2.038543897216274,
"grad_norm": 0.46952279850037054,
"learning_rate": 1.2896154699826201e-05,
"loss": 1.1216,
"step": 477
},
{
"epoch": 2.0428265524625266,
"grad_norm": 0.45033430393074514,
"learning_rate": 1.2852149887907553e-05,
"loss": 1.1881,
"step": 478
},
{
"epoch": 2.0471092077087794,
"grad_norm": 0.4606628838219141,
"learning_rate": 1.2808140707842936e-05,
"loss": 1.0762,
"step": 479
},
{
"epoch": 2.051391862955032,
"grad_norm": 0.4522706754261223,
"learning_rate": 1.276412770553207e-05,
"loss": 1.1182,
"step": 480
},
{
"epoch": 2.0556745182012848,
"grad_norm": 0.4275410449005914,
"learning_rate": 1.2720111426922072e-05,
"loss": 1.1262,
"step": 481
},
{
"epoch": 2.0599571734475375,
"grad_norm": 0.4117922922818347,
"learning_rate": 1.2676092418000709e-05,
"loss": 1.0937,
"step": 482
},
{
"epoch": 2.06423982869379,
"grad_norm": 0.4076420511090681,
"learning_rate": 1.2632071224789613e-05,
"loss": 1.1588,
"step": 483
},
{
"epoch": 2.068522483940043,
"grad_norm": 0.39985814020478855,
"learning_rate": 1.2588048393337503e-05,
"loss": 1.2315,
"step": 484
},
{
"epoch": 2.0728051391862956,
"grad_norm": 0.41357202132909343,
"learning_rate": 1.2544024469713437e-05,
"loss": 1.1924,
"step": 485
},
{
"epoch": 2.0770877944325483,
"grad_norm": 0.39780940223532485,
"learning_rate": 1.25e-05,
"loss": 1.1816,
"step": 486
},
{
"epoch": 2.081370449678801,
"grad_norm": 0.42899527932620385,
"learning_rate": 1.245597553028657e-05,
"loss": 1.1841,
"step": 487
},
{
"epoch": 2.0856531049250537,
"grad_norm": 0.39083398721432966,
"learning_rate": 1.2411951606662498e-05,
"loss": 1.1098,
"step": 488
},
{
"epoch": 2.089935760171306,
"grad_norm": 0.43420822774302814,
"learning_rate": 1.2367928775210393e-05,
"loss": 1.1627,
"step": 489
},
{
"epoch": 2.0942184154175587,
"grad_norm": 0.3732705280561028,
"learning_rate": 1.2323907581999292e-05,
"loss": 1.129,
"step": 490
},
{
"epoch": 2.0985010706638114,
"grad_norm": 0.41632399144455645,
"learning_rate": 1.2279888573077935e-05,
"loss": 0.9738,
"step": 491
},
{
"epoch": 2.102783725910064,
"grad_norm": 0.38659287989811325,
"learning_rate": 1.2235872294467934e-05,
"loss": 1.1593,
"step": 492
},
{
"epoch": 2.107066381156317,
"grad_norm": 0.3920026187084851,
"learning_rate": 1.2191859292157066e-05,
"loss": 1.0827,
"step": 493
},
{
"epoch": 2.1113490364025695,
"grad_norm": 0.3994514767198869,
"learning_rate": 1.2147850112092448e-05,
"loss": 1.1405,
"step": 494
},
{
"epoch": 2.1156316916488223,
"grad_norm": 0.43445357298460374,
"learning_rate": 1.2103845300173801e-05,
"loss": 1.0986,
"step": 495
},
{
"epoch": 2.119914346895075,
"grad_norm": 0.4042400771293127,
"learning_rate": 1.2059845402246642e-05,
"loss": 1.1418,
"step": 496
},
{
"epoch": 2.1241970021413277,
"grad_norm": 0.3788718739976897,
"learning_rate": 1.2015850964095555e-05,
"loss": 1.1349,
"step": 497
},
{
"epoch": 2.1284796573875804,
"grad_norm": 0.3821076969792679,
"learning_rate": 1.197186253143738e-05,
"loss": 1.2081,
"step": 498
},
{
"epoch": 2.132762312633833,
"grad_norm": 0.4411851187923958,
"learning_rate": 1.192788064991448e-05,
"loss": 1.1522,
"step": 499
},
{
"epoch": 2.137044967880086,
"grad_norm": 0.404962832392533,
"learning_rate": 1.1883905865087944e-05,
"loss": 1.1383,
"step": 500
},
{
"epoch": 2.1413276231263385,
"grad_norm": 0.39962573083698255,
"learning_rate": 1.1839938722430849e-05,
"loss": 1.0717,
"step": 501
},
{
"epoch": 2.145610278372591,
"grad_norm": 0.4004973819254198,
"learning_rate": 1.1795979767321451e-05,
"loss": 1.2155,
"step": 502
},
{
"epoch": 2.1498929336188435,
"grad_norm": 0.42839296529898985,
"learning_rate": 1.175202954503647e-05,
"loss": 1.1801,
"step": 503
},
{
"epoch": 2.154175588865096,
"grad_norm": 0.39581686357900003,
"learning_rate": 1.1708088600744292e-05,
"loss": 1.1871,
"step": 504
},
{
"epoch": 2.158458244111349,
"grad_norm": 0.3515337940814968,
"learning_rate": 1.166415747949821e-05,
"loss": 1.0689,
"step": 505
},
{
"epoch": 2.1627408993576016,
"grad_norm": 0.38280355472311695,
"learning_rate": 1.1620236726229684e-05,
"loss": 1.1653,
"step": 506
},
{
"epoch": 2.1670235546038543,
"grad_norm": 0.3601455061997376,
"learning_rate": 1.157632688574155e-05,
"loss": 1.1316,
"step": 507
},
{
"epoch": 2.171306209850107,
"grad_norm": 0.4036025468502878,
"learning_rate": 1.1532428502701303e-05,
"loss": 1.1332,
"step": 508
},
{
"epoch": 2.1755888650963597,
"grad_norm": 0.3689501638767867,
"learning_rate": 1.1488542121634292e-05,
"loss": 1.1398,
"step": 509
},
{
"epoch": 2.1798715203426124,
"grad_norm": 0.44516877676862204,
"learning_rate": 1.1444668286917013e-05,
"loss": 1.1009,
"step": 510
},
{
"epoch": 2.184154175588865,
"grad_norm": 0.35171086043635746,
"learning_rate": 1.1400807542770314e-05,
"loss": 1.1452,
"step": 511
},
{
"epoch": 2.188436830835118,
"grad_norm": 0.37133980314166626,
"learning_rate": 1.135696043325268e-05,
"loss": 1.1579,
"step": 512
},
{
"epoch": 2.1927194860813706,
"grad_norm": 0.34968878321273367,
"learning_rate": 1.1313127502253462e-05,
"loss": 1.1296,
"step": 513
},
{
"epoch": 2.1970021413276233,
"grad_norm": 0.35409451711365186,
"learning_rate": 1.1269309293486144e-05,
"loss": 1.149,
"step": 514
},
{
"epoch": 2.201284796573876,
"grad_norm": 0.39987353315213703,
"learning_rate": 1.1225506350481577e-05,
"loss": 1.0483,
"step": 515
},
{
"epoch": 2.2055674518201283,
"grad_norm": 0.37950153309424184,
"learning_rate": 1.1181719216581272e-05,
"loss": 1.123,
"step": 516
},
{
"epoch": 2.209850107066381,
"grad_norm": 0.3738479054688087,
"learning_rate": 1.1137948434930622e-05,
"loss": 1.1478,
"step": 517
},
{
"epoch": 2.2141327623126337,
"grad_norm": 0.37447253121660345,
"learning_rate": 1.1094194548472197e-05,
"loss": 1.1929,
"step": 518
},
{
"epoch": 2.2184154175588864,
"grad_norm": 0.36554010421344446,
"learning_rate": 1.1050458099938985e-05,
"loss": 1.1651,
"step": 519
},
{
"epoch": 2.222698072805139,
"grad_norm": 0.35742517390118567,
"learning_rate": 1.1006739631847684e-05,
"loss": 1.0415,
"step": 520
},
{
"epoch": 2.226980728051392,
"grad_norm": 0.3678474681557672,
"learning_rate": 1.0963039686491942e-05,
"loss": 1.0773,
"step": 521
},
{
"epoch": 2.2312633832976445,
"grad_norm": 0.35021617103631075,
"learning_rate": 1.0919358805935653e-05,
"loss": 1.0147,
"step": 522
},
{
"epoch": 2.235546038543897,
"grad_norm": 0.3725259580183268,
"learning_rate": 1.0875697532006237e-05,
"loss": 1.1326,
"step": 523
},
{
"epoch": 2.23982869379015,
"grad_norm": 0.36036157437462213,
"learning_rate": 1.0832056406287888e-05,
"loss": 1.1178,
"step": 524
},
{
"epoch": 2.2441113490364026,
"grad_norm": 0.38080054734059177,
"learning_rate": 1.0788435970114902e-05,
"loss": 1.2065,
"step": 525
},
{
"epoch": 2.2483940042826553,
"grad_norm": 0.3744350777602071,
"learning_rate": 1.0744836764564914e-05,
"loss": 1.1504,
"step": 526
},
{
"epoch": 2.252676659528908,
"grad_norm": 0.37119670203538174,
"learning_rate": 1.0701259330452227e-05,
"loss": 1.1754,
"step": 527
},
{
"epoch": 2.2569593147751608,
"grad_norm": 0.3450626261101503,
"learning_rate": 1.0657704208321073e-05,
"loss": 1.1758,
"step": 528
},
{
"epoch": 2.2612419700214135,
"grad_norm": 0.3761085257204848,
"learning_rate": 1.0614171938438937e-05,
"loss": 1.1058,
"step": 529
},
{
"epoch": 2.265524625267666,
"grad_norm": 0.3534345956983803,
"learning_rate": 1.0570663060789819e-05,
"loss": 1.0396,
"step": 530
},
{
"epoch": 2.2698072805139184,
"grad_norm": 0.3339089724596173,
"learning_rate": 1.0527178115067577e-05,
"loss": 1.0607,
"step": 531
},
{
"epoch": 2.274089935760171,
"grad_norm": 0.36758786848355013,
"learning_rate": 1.0483717640669198e-05,
"loss": 1.096,
"step": 532
},
{
"epoch": 2.278372591006424,
"grad_norm": 0.37103014849499344,
"learning_rate": 1.0440282176688132e-05,
"loss": 1.2022,
"step": 533
},
{
"epoch": 2.2826552462526766,
"grad_norm": 0.3933653572064292,
"learning_rate": 1.0396872261907578e-05,
"loss": 1.1886,
"step": 534
},
{
"epoch": 2.2869379014989293,
"grad_norm": 0.3478722741253696,
"learning_rate": 1.0353488434793839e-05,
"loss": 1.1061,
"step": 535
},
{
"epoch": 2.291220556745182,
"grad_norm": 0.38454344787523614,
"learning_rate": 1.0310131233489595e-05,
"loss": 1.1058,
"step": 536
},
{
"epoch": 2.2955032119914347,
"grad_norm": 0.3964599267526657,
"learning_rate": 1.0266801195807279e-05,
"loss": 1.1536,
"step": 537
},
{
"epoch": 2.2997858672376874,
"grad_norm": 0.3505311887204956,
"learning_rate": 1.0223498859222367e-05,
"loss": 1.005,
"step": 538
},
{
"epoch": 2.30406852248394,
"grad_norm": 0.42646591198465056,
"learning_rate": 1.018022476086672e-05,
"loss": 1.1385,
"step": 539
},
{
"epoch": 2.308351177730193,
"grad_norm": 0.3516417735648486,
"learning_rate": 1.0136979437521937e-05,
"loss": 1.1299,
"step": 540
},
{
"epoch": 2.3126338329764455,
"grad_norm": 0.37292041166385276,
"learning_rate": 1.0093763425612677e-05,
"loss": 1.1697,
"step": 541
},
{
"epoch": 2.3169164882226982,
"grad_norm": 0.37139285774167097,
"learning_rate": 1.0050577261200025e-05,
"loss": 1.0958,
"step": 542
},
{
"epoch": 2.3211991434689505,
"grad_norm": 0.36732514272211636,
"learning_rate": 1.000742147997481e-05,
"loss": 1.0663,
"step": 543
},
{
"epoch": 2.325481798715203,
"grad_norm": 0.425696024428236,
"learning_rate": 9.964296617251004e-06,
"loss": 1.0172,
"step": 544
},
{
"epoch": 2.329764453961456,
"grad_norm": 0.45633961518765603,
"learning_rate": 9.92120320795904e-06,
"loss": 1.2115,
"step": 545
},
{
"epoch": 2.3340471092077086,
"grad_norm": 0.42776392011984465,
"learning_rate": 9.878141786639212e-06,
"loss": 1.1263,
"step": 546
},
{
"epoch": 2.3383297644539613,
"grad_norm": 0.4063925688250011,
"learning_rate": 9.835112887435014e-06,
"loss": 1.1167,
"step": 547
},
{
"epoch": 2.342612419700214,
"grad_norm": 0.347005382841865,
"learning_rate": 9.792117044086544e-06,
"loss": 1.0471,
"step": 548
},
{
"epoch": 2.3468950749464668,
"grad_norm": 0.41426650830417605,
"learning_rate": 9.749154789923847e-06,
"loss": 1.2857,
"step": 549
},
{
"epoch": 2.3511777301927195,
"grad_norm": 0.3732639695626659,
"learning_rate": 9.70622665786034e-06,
"loss": 1.133,
"step": 550
},
{
"epoch": 2.355460385438972,
"grad_norm": 0.3953893693115576,
"learning_rate": 9.663333180386169e-06,
"loss": 1.1723,
"step": 551
},
{
"epoch": 2.359743040685225,
"grad_norm": 0.3945096837746996,
"learning_rate": 9.620474889561629e-06,
"loss": 1.1853,
"step": 552
},
{
"epoch": 2.3640256959314776,
"grad_norm": 0.353229521713685,
"learning_rate": 9.57765231701053e-06,
"loss": 1.224,
"step": 553
},
{
"epoch": 2.3683083511777303,
"grad_norm": 0.38038911754225274,
"learning_rate": 9.534865993913656e-06,
"loss": 1.0707,
"step": 554
},
{
"epoch": 2.372591006423983,
"grad_norm": 0.40137304773118665,
"learning_rate": 9.492116451002114e-06,
"loss": 1.0614,
"step": 555
},
{
"epoch": 2.3768736616702357,
"grad_norm": 0.3799373348779043,
"learning_rate": 9.4494042185508e-06,
"loss": 1.0317,
"step": 556
},
{
"epoch": 2.3811563169164884,
"grad_norm": 0.35846465331360783,
"learning_rate": 9.4067298263718e-06,
"loss": 1.0816,
"step": 557
},
{
"epoch": 2.385438972162741,
"grad_norm": 0.3892380274193281,
"learning_rate": 9.364093803807807e-06,
"loss": 1.0922,
"step": 558
},
{
"epoch": 2.3897216274089934,
"grad_norm": 0.40336093781540333,
"learning_rate": 9.321496679725596e-06,
"loss": 1.0938,
"step": 559
},
{
"epoch": 2.394004282655246,
"grad_norm": 0.3817697005333532,
"learning_rate": 9.278938982509409e-06,
"loss": 1.0803,
"step": 560
},
{
"epoch": 2.398286937901499,
"grad_norm": 0.3881301113148313,
"learning_rate": 9.236421240054449e-06,
"loss": 1.1377,
"step": 561
},
{
"epoch": 2.4025695931477515,
"grad_norm": 0.445891116690163,
"learning_rate": 9.193943979760292e-06,
"loss": 1.0991,
"step": 562
},
{
"epoch": 2.4068522483940042,
"grad_norm": 0.4010581039655185,
"learning_rate": 9.151507728524382e-06,
"loss": 1.041,
"step": 563
},
{
"epoch": 2.411134903640257,
"grad_norm": 0.3694140168350837,
"learning_rate": 9.109113012735467e-06,
"loss": 0.9861,
"step": 564
},
{
"epoch": 2.4154175588865097,
"grad_norm": 0.38742555130206846,
"learning_rate": 9.066760358267081e-06,
"loss": 1.0938,
"step": 565
},
{
"epoch": 2.4197002141327624,
"grad_norm": 0.3559783185134848,
"learning_rate": 9.024450290471026e-06,
"loss": 1.0395,
"step": 566
},
{
"epoch": 2.423982869379015,
"grad_norm": 0.3636369864702618,
"learning_rate": 8.982183334170844e-06,
"loss": 1.0933,
"step": 567
},
{
"epoch": 2.428265524625268,
"grad_norm": 0.35525649048200675,
"learning_rate": 8.939960013655311e-06,
"loss": 1.0766,
"step": 568
},
{
"epoch": 2.4325481798715205,
"grad_norm": 0.3775765508703813,
"learning_rate": 8.897780852671939e-06,
"loss": 1.0256,
"step": 569
},
{
"epoch": 2.436830835117773,
"grad_norm": 0.42139839896816106,
"learning_rate": 8.855646374420472e-06,
"loss": 1.1425,
"step": 570
},
{
"epoch": 2.4411134903640255,
"grad_norm": 0.3511194625690293,
"learning_rate": 8.813557101546408e-06,
"loss": 0.9875,
"step": 571
},
{
"epoch": 2.445396145610278,
"grad_norm": 0.35870293115859425,
"learning_rate": 8.771513556134497e-06,
"loss": 1.1143,
"step": 572
},
{
"epoch": 2.449678800856531,
"grad_norm": 0.3511476581215571,
"learning_rate": 8.729516259702272e-06,
"loss": 1.1216,
"step": 573
},
{
"epoch": 2.4539614561027836,
"grad_norm": 0.3896756471995198,
"learning_rate": 8.6875657331936e-06,
"loss": 1.2131,
"step": 574
},
{
"epoch": 2.4582441113490363,
"grad_norm": 0.346301000515738,
"learning_rate": 8.645662496972186e-06,
"loss": 1.1267,
"step": 575
},
{
"epoch": 2.462526766595289,
"grad_norm": 0.3279075184069246,
"learning_rate": 8.603807070815152e-06,
"loss": 1.0078,
"step": 576
},
{
"epoch": 2.4668094218415417,
"grad_norm": 0.3524877782412061,
"learning_rate": 8.561999973906554e-06,
"loss": 1.1589,
"step": 577
},
{
"epoch": 2.4710920770877944,
"grad_norm": 0.3744186526110544,
"learning_rate": 8.520241724830983e-06,
"loss": 1.1987,
"step": 578
},
{
"epoch": 2.475374732334047,
"grad_norm": 0.37193508975714884,
"learning_rate": 8.478532841567089e-06,
"loss": 1.143,
"step": 579
},
{
"epoch": 2.4796573875803,
"grad_norm": 0.3563664250992986,
"learning_rate": 8.436873841481197e-06,
"loss": 1.1024,
"step": 580
},
{
"epoch": 2.4839400428265526,
"grad_norm": 0.3621802163845544,
"learning_rate": 8.395265241320852e-06,
"loss": 1.1237,
"step": 581
},
{
"epoch": 2.4882226980728053,
"grad_norm": 0.3534462614928483,
"learning_rate": 8.353707557208448e-06,
"loss": 0.9731,
"step": 582
},
{
"epoch": 2.492505353319058,
"grad_norm": 0.3756351095987366,
"learning_rate": 8.312201304634775e-06,
"loss": 1.0517,
"step": 583
},
{
"epoch": 2.4967880085653107,
"grad_norm": 0.3810521940082933,
"learning_rate": 8.270746998452688e-06,
"loss": 1.0853,
"step": 584
},
{
"epoch": 2.5010706638115634,
"grad_norm": 0.39222567553145227,
"learning_rate": 8.229345152870666e-06,
"loss": 1.1764,
"step": 585
},
{
"epoch": 2.505353319057816,
"grad_norm": 0.3739366136336243,
"learning_rate": 8.18799628144646e-06,
"loss": 1.1238,
"step": 586
},
{
"epoch": 2.5096359743040684,
"grad_norm": 0.38711368859863554,
"learning_rate": 8.14670089708072e-06,
"loss": 1.1465,
"step": 587
},
{
"epoch": 2.513918629550321,
"grad_norm": 0.41098527253509576,
"learning_rate": 8.105459512010629e-06,
"loss": 1.041,
"step": 588
},
{
"epoch": 2.518201284796574,
"grad_norm": 0.406134178093035,
"learning_rate": 8.064272637803553e-06,
"loss": 1.1861,
"step": 589
},
{
"epoch": 2.5224839400428265,
"grad_norm": 0.3736862306104564,
"learning_rate": 8.02314078535068e-06,
"loss": 1.0904,
"step": 590
},
{
"epoch": 2.526766595289079,
"grad_norm": 0.3781276058410365,
"learning_rate": 7.982064464860722e-06,
"loss": 1.1083,
"step": 591
},
{
"epoch": 2.531049250535332,
"grad_norm": 0.40011911371910797,
"learning_rate": 7.94104418585353e-06,
"loss": 1.0687,
"step": 592
},
{
"epoch": 2.5353319057815846,
"grad_norm": 0.3683735339293543,
"learning_rate": 7.90008045715383e-06,
"loss": 1.1211,
"step": 593
},
{
"epoch": 2.5396145610278373,
"grad_norm": 0.3878127219742661,
"learning_rate": 7.859173786884867e-06,
"loss": 1.086,
"step": 594
},
{
"epoch": 2.54389721627409,
"grad_norm": 0.37501963993427256,
"learning_rate": 7.818324682462135e-06,
"loss": 1.0673,
"step": 595
},
{
"epoch": 2.5481798715203428,
"grad_norm": 0.37276593704270844,
"learning_rate": 7.77753365058705e-06,
"loss": 1.1055,
"step": 596
},
{
"epoch": 2.552462526766595,
"grad_norm": 0.38843603696651813,
"learning_rate": 7.736801197240703e-06,
"loss": 1.0339,
"step": 597
},
{
"epoch": 2.5567451820128477,
"grad_norm": 0.4110286435387141,
"learning_rate": 7.696127827677551e-06,
"loss": 1.0975,
"step": 598
},
{
"epoch": 2.5610278372591004,
"grad_norm": 0.3610377475070173,
"learning_rate": 7.655514046419169e-06,
"loss": 1.0753,
"step": 599
},
{
"epoch": 2.565310492505353,
"grad_norm": 0.46624031730321613,
"learning_rate": 7.614960357247974e-06,
"loss": 1.0819,
"step": 600
},
{
"epoch": 2.569593147751606,
"grad_norm": 0.35714403479890183,
"learning_rate": 7.57446726320101e-06,
"loss": 1.0661,
"step": 601
},
{
"epoch": 2.5738758029978586,
"grad_norm": 0.3537005412507155,
"learning_rate": 7.534035266563657e-06,
"loss": 1.0783,
"step": 602
},
{
"epoch": 2.5781584582441113,
"grad_norm": 0.3609965104402262,
"learning_rate": 7.493664868863456e-06,
"loss": 1.1183,
"step": 603
},
{
"epoch": 2.582441113490364,
"grad_norm": 0.3414893487662722,
"learning_rate": 7.453356570863838e-06,
"loss": 1.1513,
"step": 604
},
{
"epoch": 2.5867237687366167,
"grad_norm": 0.34768494822065116,
"learning_rate": 7.413110872557957e-06,
"loss": 1.075,
"step": 605
},
{
"epoch": 2.5910064239828694,
"grad_norm": 0.35110711512371934,
"learning_rate": 7.372928273162444e-06,
"loss": 1.0302,
"step": 606
},
{
"epoch": 2.595289079229122,
"grad_norm": 0.37389978926958345,
"learning_rate": 7.332809271111258e-06,
"loss": 1.127,
"step": 607
},
{
"epoch": 2.599571734475375,
"grad_norm": 0.36202234697320473,
"learning_rate": 7.2927543640494675e-06,
"loss": 1.0841,
"step": 608
},
{
"epoch": 2.6038543897216275,
"grad_norm": 0.3692912620672064,
"learning_rate": 7.252764048827096e-06,
"loss": 1.0937,
"step": 609
},
{
"epoch": 2.6081370449678802,
"grad_norm": 0.371407363782464,
"learning_rate": 7.212838821492962e-06,
"loss": 1.1222,
"step": 610
},
{
"epoch": 2.612419700214133,
"grad_norm": 0.34843882518833746,
"learning_rate": 7.172979177288505e-06,
"loss": 0.945,
"step": 611
},
{
"epoch": 2.6167023554603857,
"grad_norm": 0.3677558592711015,
"learning_rate": 7.133185610641683e-06,
"loss": 1.1127,
"step": 612
},
{
"epoch": 2.6209850107066384,
"grad_norm": 0.36958952805111067,
"learning_rate": 7.0934586151607764e-06,
"loss": 1.1137,
"step": 613
},
{
"epoch": 2.6252676659528906,
"grad_norm": 0.3474020257100841,
"learning_rate": 7.053798683628335e-06,
"loss": 0.9744,
"step": 614
},
{
"epoch": 2.6295503211991433,
"grad_norm": 0.3558866341734782,
"learning_rate": 7.014206307995016e-06,
"loss": 1.1125,
"step": 615
},
{
"epoch": 2.633832976445396,
"grad_norm": 0.3614597470882593,
"learning_rate": 6.974681979373501e-06,
"loss": 1.1009,
"step": 616
},
{
"epoch": 2.6381156316916488,
"grad_norm": 0.3714477690148325,
"learning_rate": 6.935226188032401e-06,
"loss": 0.9984,
"step": 617
},
{
"epoch": 2.6423982869379015,
"grad_norm": 0.3317262663806771,
"learning_rate": 6.895839423390175e-06,
"loss": 1.0966,
"step": 618
},
{
"epoch": 2.646680942184154,
"grad_norm": 0.36917263116104493,
"learning_rate": 6.856522174009061e-06,
"loss": 1.0764,
"step": 619
},
{
"epoch": 2.650963597430407,
"grad_norm": 0.3777881832761566,
"learning_rate": 6.817274927589014e-06,
"loss": 1.0345,
"step": 620
},
{
"epoch": 2.6552462526766596,
"grad_norm": 0.35567953357582066,
"learning_rate": 6.7780981709616495e-06,
"loss": 1.1184,
"step": 621
},
{
"epoch": 2.6595289079229123,
"grad_norm": 0.3719255516818532,
"learning_rate": 6.738992390084232e-06,
"loss": 1.1226,
"step": 622
},
{
"epoch": 2.663811563169165,
"grad_norm": 0.3829939577200986,
"learning_rate": 6.699958070033596e-06,
"loss": 1.0708,
"step": 623
},
{
"epoch": 2.6680942184154177,
"grad_norm": 0.36003883214692967,
"learning_rate": 6.660995695000191e-06,
"loss": 1.1787,
"step": 624
},
{
"epoch": 2.67237687366167,
"grad_norm": 0.3688924024392204,
"learning_rate": 6.622105748282031e-06,
"loss": 1.0507,
"step": 625
},
{
"epoch": 2.6766595289079227,
"grad_norm": 0.37105335768283265,
"learning_rate": 6.583288712278697e-06,
"loss": 1.0864,
"step": 626
},
{
"epoch": 2.6809421841541754,
"grad_norm": 0.3676936052384596,
"learning_rate": 6.544545068485404e-06,
"loss": 1.1649,
"step": 627
},
{
"epoch": 2.685224839400428,
"grad_norm": 0.35833428730388167,
"learning_rate": 6.5058752974869545e-06,
"loss": 1.0467,
"step": 628
},
{
"epoch": 2.689507494646681,
"grad_norm": 0.3560192973325353,
"learning_rate": 6.4672798789518515e-06,
"loss": 1.0385,
"step": 629
},
{
"epoch": 2.6937901498929335,
"grad_norm": 0.3422819495514087,
"learning_rate": 6.428759291626294e-06,
"loss": 1.0643,
"step": 630
},
{
"epoch": 2.6980728051391862,
"grad_norm": 0.3596524934289582,
"learning_rate": 6.39031401332826e-06,
"loss": 1.0874,
"step": 631
},
{
"epoch": 2.702355460385439,
"grad_norm": 0.3581329395952061,
"learning_rate": 6.35194452094158e-06,
"loss": 1.029,
"step": 632
},
{
"epoch": 2.7066381156316917,
"grad_norm": 0.3646878019734804,
"learning_rate": 6.313651290410021e-06,
"loss": 1.1463,
"step": 633
},
{
"epoch": 2.7109207708779444,
"grad_norm": 0.46965105187278144,
"learning_rate": 6.2754347967313694e-06,
"loss": 1.1599,
"step": 634
},
{
"epoch": 2.715203426124197,
"grad_norm": 0.35199634686850134,
"learning_rate": 6.237295513951577e-06,
"loss": 1.0447,
"step": 635
},
{
"epoch": 2.71948608137045,
"grad_norm": 0.3552040815294978,
"learning_rate": 6.199233915158817e-06,
"loss": 1.0355,
"step": 636
},
{
"epoch": 2.7237687366167025,
"grad_norm": 0.3701464344073716,
"learning_rate": 6.161250472477692e-06,
"loss": 1.1069,
"step": 637
},
{
"epoch": 2.728051391862955,
"grad_norm": 0.3481745786199797,
"learning_rate": 6.123345657063299e-06,
"loss": 1.0379,
"step": 638
},
{
"epoch": 2.732334047109208,
"grad_norm": 0.34908887773290137,
"learning_rate": 6.085519939095463e-06,
"loss": 1.0759,
"step": 639
},
{
"epoch": 2.7366167023554606,
"grad_norm": 0.406969071848584,
"learning_rate": 6.047773787772843e-06,
"loss": 1.1397,
"step": 640
},
{
"epoch": 2.7408993576017133,
"grad_norm": 0.369214552502764,
"learning_rate": 6.01010767130714e-06,
"loss": 1.1652,
"step": 641
},
{
"epoch": 2.7451820128479656,
"grad_norm": 0.35958281005557274,
"learning_rate": 5.972522056917287e-06,
"loss": 1.0651,
"step": 642
},
{
"epoch": 2.7494646680942183,
"grad_norm": 0.34773227498527454,
"learning_rate": 5.9350174108236525e-06,
"loss": 1.2105,
"step": 643
},
{
"epoch": 2.753747323340471,
"grad_norm": 0.3785529745910018,
"learning_rate": 5.897594198242253e-06,
"loss": 1.1186,
"step": 644
},
{
"epoch": 2.7580299785867237,
"grad_norm": 0.3476745823127357,
"learning_rate": 5.860252883378986e-06,
"loss": 1.1053,
"step": 645
},
{
"epoch": 2.7623126338329764,
"grad_norm": 0.35740833434939384,
"learning_rate": 5.822993929423872e-06,
"loss": 1.156,
"step": 646
},
{
"epoch": 2.766595289079229,
"grad_norm": 0.3461287440443304,
"learning_rate": 5.78581779854531e-06,
"loss": 1.034,
"step": 647
},
{
"epoch": 2.770877944325482,
"grad_norm": 0.3484778190549007,
"learning_rate": 5.748724951884339e-06,
"loss": 1.147,
"step": 648
},
{
"epoch": 2.7751605995717346,
"grad_norm": 0.3463824371518374,
"learning_rate": 5.711715849548924e-06,
"loss": 1.2487,
"step": 649
},
{
"epoch": 2.7794432548179873,
"grad_norm": 0.3609765242563188,
"learning_rate": 5.674790950608257e-06,
"loss": 1.0038,
"step": 650
},
{
"epoch": 2.78372591006424,
"grad_norm": 0.3678624338311653,
"learning_rate": 5.6379507130870245e-06,
"loss": 1.1145,
"step": 651
},
{
"epoch": 2.7880085653104922,
"grad_norm": 0.35376315009965914,
"learning_rate": 5.601195593959788e-06,
"loss": 1.0577,
"step": 652
},
{
"epoch": 2.792291220556745,
"grad_norm": 0.3363214828483723,
"learning_rate": 5.5645260491452575e-06,
"loss": 1.0486,
"step": 653
},
{
"epoch": 2.7965738758029977,
"grad_norm": 0.3622636185655521,
"learning_rate": 5.52794253350067e-06,
"loss": 1.0547,
"step": 654
},
{
"epoch": 2.8008565310492504,
"grad_norm": 0.5067875911549902,
"learning_rate": 5.491445500816134e-06,
"loss": 1.1395,
"step": 655
},
{
"epoch": 2.805139186295503,
"grad_norm": 0.34289895282316957,
"learning_rate": 5.4550354038090055e-06,
"loss": 1.1781,
"step": 656
},
{
"epoch": 2.809421841541756,
"grad_norm": 0.35445697790502123,
"learning_rate": 5.41871269411827e-06,
"loss": 1.1037,
"step": 657
},
{
"epoch": 2.8137044967880085,
"grad_norm": 0.360842710721591,
"learning_rate": 5.3824778222989424e-06,
"loss": 1.1276,
"step": 658
},
{
"epoch": 2.817987152034261,
"grad_norm": 0.3432929406538927,
"learning_rate": 5.346331237816477e-06,
"loss": 1.0847,
"step": 659
},
{
"epoch": 2.822269807280514,
"grad_norm": 0.34235194233646365,
"learning_rate": 5.31027338904119e-06,
"loss": 1.099,
"step": 660
},
{
"epoch": 2.8265524625267666,
"grad_norm": 0.3494573350685968,
"learning_rate": 5.274304723242701e-06,
"loss": 1.0714,
"step": 661
},
{
"epoch": 2.8308351177730193,
"grad_norm": 0.36423601172734904,
"learning_rate": 5.238425686584383e-06,
"loss": 1.0917,
"step": 662
},
{
"epoch": 2.835117773019272,
"grad_norm": 0.3390326644331241,
"learning_rate": 5.2026367241178415e-06,
"loss": 1.0927,
"step": 663
},
{
"epoch": 2.8394004282655247,
"grad_norm": 0.3389574380550951,
"learning_rate": 5.166938279777356e-06,
"loss": 1.0654,
"step": 664
},
{
"epoch": 2.8436830835117775,
"grad_norm": 0.3558059969945493,
"learning_rate": 5.131330796374428e-06,
"loss": 1.2394,
"step": 665
},
{
"epoch": 2.84796573875803,
"grad_norm": 0.3449281004788474,
"learning_rate": 5.095814715592229e-06,
"loss": 1.104,
"step": 666
},
{
"epoch": 2.852248394004283,
"grad_norm": 0.5741950084872994,
"learning_rate": 5.060390477980181e-06,
"loss": 1.1246,
"step": 667
},
{
"epoch": 2.8565310492505356,
"grad_norm": 0.3518602777082471,
"learning_rate": 5.0250585229484445e-06,
"loss": 1.0384,
"step": 668
},
{
"epoch": 2.860813704496788,
"grad_norm": 0.33201611617766386,
"learning_rate": 4.9898192887624946e-06,
"loss": 0.99,
"step": 669
},
{
"epoch": 2.8650963597430406,
"grad_norm": 0.33654063236244514,
"learning_rate": 4.954673212537668e-06,
"loss": 1.0835,
"step": 670
},
{
"epoch": 2.8693790149892933,
"grad_norm": 0.35749153943774153,
"learning_rate": 4.9196207302337564e-06,
"loss": 1.238,
"step": 671
},
{
"epoch": 2.873661670235546,
"grad_norm": 0.3963712296443138,
"learning_rate": 4.884662276649588e-06,
"loss": 1.0847,
"step": 672
},
{
"epoch": 2.8779443254817987,
"grad_norm": 0.33900776494342877,
"learning_rate": 4.8497982854176475e-06,
"loss": 0.9872,
"step": 673
},
{
"epoch": 2.8822269807280514,
"grad_norm": 0.3390240674831931,
"learning_rate": 4.8150291889986655e-06,
"loss": 1.1353,
"step": 674
},
{
"epoch": 2.886509635974304,
"grad_norm": 0.3789710837716194,
"learning_rate": 4.780355418676305e-06,
"loss": 1.1636,
"step": 675
},
{
"epoch": 2.890792291220557,
"grad_norm": 0.3773675590887804,
"learning_rate": 4.745777404551755e-06,
"loss": 1.1598,
"step": 676
},
{
"epoch": 2.8950749464668095,
"grad_norm": 0.350034350612991,
"learning_rate": 4.711295575538437e-06,
"loss": 0.9807,
"step": 677
},
{
"epoch": 2.8993576017130622,
"grad_norm": 0.35389009806788396,
"learning_rate": 4.6769103593566805e-06,
"loss": 1.1225,
"step": 678
},
{
"epoch": 2.903640256959315,
"grad_norm": 0.3480099705955127,
"learning_rate": 4.6426221825283804e-06,
"loss": 1.0797,
"step": 679
},
{
"epoch": 2.907922912205567,
"grad_norm": 0.4017077706255267,
"learning_rate": 4.608431470371764e-06,
"loss": 1.0613,
"step": 680
},
{
"epoch": 2.91220556745182,
"grad_norm": 0.3918078161458431,
"learning_rate": 4.574338646996068e-06,
"loss": 1.1085,
"step": 681
},
{
"epoch": 2.9164882226980726,
"grad_norm": 0.32920278218913035,
"learning_rate": 4.540344135296296e-06,
"loss": 0.9627,
"step": 682
},
{
"epoch": 2.9207708779443253,
"grad_norm": 0.3684497632182809,
"learning_rate": 4.506448356947973e-06,
"loss": 1.1601,
"step": 683
},
{
"epoch": 2.925053533190578,
"grad_norm": 0.3433737649981929,
"learning_rate": 4.4726517324019165e-06,
"loss": 1.0455,
"step": 684
},
{
"epoch": 2.9293361884368307,
"grad_norm": 0.35325748706550913,
"learning_rate": 4.438954680879015e-06,
"loss": 1.0403,
"step": 685
},
{
"epoch": 2.9336188436830835,
"grad_norm": 0.34196653123502885,
"learning_rate": 4.405357620365032e-06,
"loss": 1.2242,
"step": 686
},
{
"epoch": 2.937901498929336,
"grad_norm": 0.3473358887939904,
"learning_rate": 4.371860967605413e-06,
"loss": 0.9848,
"step": 687
},
{
"epoch": 2.942184154175589,
"grad_norm": 0.3408666843863744,
"learning_rate": 4.338465138100147e-06,
"loss": 1.0415,
"step": 688
},
{
"epoch": 2.9464668094218416,
"grad_norm": 0.3480886088157686,
"learning_rate": 4.305170546098551e-06,
"loss": 1.0479,
"step": 689
},
{
"epoch": 2.9507494646680943,
"grad_norm": 0.35083424116981776,
"learning_rate": 4.271977604594206e-06,
"loss": 1.1681,
"step": 690
},
{
"epoch": 2.955032119914347,
"grad_norm": 0.35317744200985374,
"learning_rate": 4.238886725319774e-06,
"loss": 1.1004,
"step": 691
},
{
"epoch": 2.9593147751605997,
"grad_norm": 0.36992718168834315,
"learning_rate": 4.205898318741925e-06,
"loss": 1.1501,
"step": 692
},
{
"epoch": 2.9635974304068524,
"grad_norm": 0.368258055811205,
"learning_rate": 4.173012794056235e-06,
"loss": 1.0589,
"step": 693
},
{
"epoch": 2.967880085653105,
"grad_norm": 0.3542218292326262,
"learning_rate": 4.1402305591820945e-06,
"loss": 1.1059,
"step": 694
},
{
"epoch": 2.972162740899358,
"grad_norm": 0.34221816300659097,
"learning_rate": 4.107552020757688e-06,
"loss": 0.9976,
"step": 695
},
{
"epoch": 2.9764453961456105,
"grad_norm": 0.3798509842359927,
"learning_rate": 4.07497758413491e-06,
"loss": 1.0692,
"step": 696
},
{
"epoch": 2.980728051391863,
"grad_norm": 0.3371568887516198,
"learning_rate": 4.0425076533743585e-06,
"loss": 1.1132,
"step": 697
},
{
"epoch": 2.9850107066381155,
"grad_norm": 0.34200886091760746,
"learning_rate": 4.010142631240317e-06,
"loss": 1.1367,
"step": 698
},
{
"epoch": 2.9892933618843682,
"grad_norm": 0.3874331285336969,
"learning_rate": 3.977882919195755e-06,
"loss": 1.1251,
"step": 699
},
{
"epoch": 2.993576017130621,
"grad_norm": 0.6572496407131426,
"learning_rate": 3.945728917397355e-06,
"loss": 1.1292,
"step": 700
},
{
"epoch": 2.9978586723768736,
"grad_norm": 0.8967911622926727,
"learning_rate": 3.913681024690556e-06,
"loss": 1.2485,
"step": 701
},
{
"epoch": 3.0,
"grad_norm": 0.8967911622926727,
"learning_rate": 3.88173963860457e-06,
"loss": 1.1349,
"step": 702
},
{
"epoch": 3.0042826552462527,
"grad_norm": 0.7045871892163175,
"learning_rate": 3.849905155347512e-06,
"loss": 0.919,
"step": 703
},
{
"epoch": 3.0085653104925054,
"grad_norm": 0.8731451662221503,
"learning_rate": 3.818177969801412e-06,
"loss": 0.9352,
"step": 704
},
{
"epoch": 3.012847965738758,
"grad_norm": 0.5862193210847736,
"learning_rate": 3.7865584755173907e-06,
"loss": 0.8273,
"step": 705
},
{
"epoch": 3.017130620985011,
"grad_norm": 0.4530975739265527,
"learning_rate": 3.7550470647107205e-06,
"loss": 0.8568,
"step": 706
},
{
"epoch": 3.0214132762312635,
"grad_norm": 0.775182178811676,
"learning_rate": 3.723644128255989e-06,
"loss": 0.8563,
"step": 707
},
{
"epoch": 3.0256959314775163,
"grad_norm": 0.8036787462194873,
"learning_rate": 3.6923500556822433e-06,
"loss": 0.9373,
"step": 708
},
{
"epoch": 3.0299785867237685,
"grad_norm": 0.938179132189991,
"learning_rate": 3.6611652351681568e-06,
"loss": 0.9144,
"step": 709
},
{
"epoch": 3.0342612419700212,
"grad_norm": 0.533475946230046,
"learning_rate": 3.630090053537219e-06,
"loss": 0.9413,
"step": 710
},
{
"epoch": 3.038543897216274,
"grad_norm": 0.4769499859035611,
"learning_rate": 3.5991248962529313e-06,
"loss": 0.8983,
"step": 711
},
{
"epoch": 3.0428265524625266,
"grad_norm": 0.7175275279939133,
"learning_rate": 3.568270147414031e-06,
"loss": 1.0184,
"step": 712
},
{
"epoch": 3.0471092077087794,
"grad_norm": 0.6710751659916476,
"learning_rate": 3.5375261897497208e-06,
"loss": 0.8867,
"step": 713
},
{
"epoch": 3.051391862955032,
"grad_norm": 0.5533721206962046,
"learning_rate": 3.5068934046149303e-06,
"loss": 0.9861,
"step": 714
},
{
"epoch": 3.0556745182012848,
"grad_norm": 0.5096487279270119,
"learning_rate": 3.47637217198557e-06,
"loss": 0.9957,
"step": 715
},
{
"epoch": 3.0599571734475375,
"grad_norm": 0.392777064751308,
"learning_rate": 3.4459628704538503e-06,
"loss": 0.8717,
"step": 716
},
{
"epoch": 3.06423982869379,
"grad_norm": 0.5848251912632335,
"learning_rate": 3.41566587722353e-06,
"loss": 0.9097,
"step": 717
},
{
"epoch": 3.068522483940043,
"grad_norm": 0.6598671290081435,
"learning_rate": 3.3854815681053045e-06,
"loss": 0.8214,
"step": 718
},
{
"epoch": 3.0728051391862956,
"grad_norm": 0.5792866171130799,
"learning_rate": 3.355410317512081e-06,
"loss": 0.939,
"step": 719
},
{
"epoch": 3.0770877944325483,
"grad_norm": 0.5597042015871566,
"learning_rate": 3.3254524984543858e-06,
"loss": 0.973,
"step": 720
},
{
"epoch": 3.081370449678801,
"grad_norm": 0.43120275321986723,
"learning_rate": 3.2956084825357046e-06,
"loss": 0.9494,
"step": 721
},
{
"epoch": 3.0856531049250537,
"grad_norm": 0.43798987398686245,
"learning_rate": 3.265878639947885e-06,
"loss": 0.9386,
"step": 722
},
{
"epoch": 3.089935760171306,
"grad_norm": 0.5043861622984578,
"learning_rate": 3.2362633394665414e-06,
"loss": 0.8571,
"step": 723
},
{
"epoch": 3.0942184154175587,
"grad_norm": 0.47877006992255494,
"learning_rate": 3.206762948446486e-06,
"loss": 0.8921,
"step": 724
},
{
"epoch": 3.0985010706638114,
"grad_norm": 0.48382021072189335,
"learning_rate": 3.177377832817163e-06,
"loss": 0.9232,
"step": 725
},
{
"epoch": 3.102783725910064,
"grad_norm": 0.4428791922415224,
"learning_rate": 3.148108357078128e-06,
"loss": 0.8745,
"step": 726
},
{
"epoch": 3.107066381156317,
"grad_norm": 0.3690822664254283,
"learning_rate": 3.118954884294495e-06,
"loss": 0.9788,
"step": 727
},
{
"epoch": 3.1113490364025695,
"grad_norm": 0.43897184340546713,
"learning_rate": 3.0899177760924616e-06,
"loss": 0.9244,
"step": 728
},
{
"epoch": 3.1156316916488223,
"grad_norm": 0.4770826738507552,
"learning_rate": 3.060997392654813e-06,
"loss": 0.8922,
"step": 729
},
{
"epoch": 3.119914346895075,
"grad_norm": 0.4254105042307734,
"learning_rate": 3.032194092716449e-06,
"loss": 0.8362,
"step": 730
},
{
"epoch": 3.1241970021413277,
"grad_norm": 0.4468366976539863,
"learning_rate": 3.0035082335599555e-06,
"loss": 0.87,
"step": 731
},
{
"epoch": 3.1284796573875804,
"grad_norm": 0.4429010036845597,
"learning_rate": 2.9749401710111286e-06,
"loss": 0.9305,
"step": 732
},
{
"epoch": 3.132762312633833,
"grad_norm": 0.4127010809706913,
"learning_rate": 2.9464902594346185e-06,
"loss": 0.9775,
"step": 733
},
{
"epoch": 3.137044967880086,
"grad_norm": 0.4086014968435575,
"learning_rate": 2.9181588517294857e-06,
"loss": 0.999,
"step": 734
},
{
"epoch": 3.1413276231263385,
"grad_norm": 0.3990791790573375,
"learning_rate": 2.8899462993248473e-06,
"loss": 0.9982,
"step": 735
},
{
"epoch": 3.145610278372591,
"grad_norm": 0.39305406800729714,
"learning_rate": 2.861852952175513e-06,
"loss": 0.8755,
"step": 736
},
{
"epoch": 3.1498929336188435,
"grad_norm": 0.42386938503526844,
"learning_rate": 2.8338791587576435e-06,
"loss": 0.9166,
"step": 737
},
{
"epoch": 3.154175588865096,
"grad_norm": 0.39610798719172463,
"learning_rate": 2.80602526606443e-06,
"loss": 0.8548,
"step": 738
},
{
"epoch": 3.158458244111349,
"grad_norm": 0.39866226920058223,
"learning_rate": 2.7782916196017846e-06,
"loss": 0.9252,
"step": 739
},
{
"epoch": 3.1627408993576016,
"grad_norm": 0.37822843502350373,
"learning_rate": 2.7506785633840583e-06,
"loss": 0.9459,
"step": 740
},
{
"epoch": 3.1670235546038543,
"grad_norm": 0.3821806357973024,
"learning_rate": 2.7231864399297856e-06,
"loss": 0.8745,
"step": 741
},
{
"epoch": 3.171306209850107,
"grad_norm": 0.42244290458780526,
"learning_rate": 2.6958155902574e-06,
"loss": 0.8758,
"step": 742
},
{
"epoch": 3.1755888650963597,
"grad_norm": 0.3891254431155144,
"learning_rate": 2.6685663538810536e-06,
"loss": 0.8505,
"step": 743
},
{
"epoch": 3.1798715203426124,
"grad_norm": 0.40848076108585224,
"learning_rate": 2.6414390688063687e-06,
"loss": 0.9505,
"step": 744
},
{
"epoch": 3.184154175588865,
"grad_norm": 0.3911863355408845,
"learning_rate": 2.6144340715262437e-06,
"loss": 0.9777,
"step": 745
},
{
"epoch": 3.188436830835118,
"grad_norm": 0.3568604123347815,
"learning_rate": 2.58755169701672e-06,
"loss": 0.9195,
"step": 746
},
{
"epoch": 3.1927194860813706,
"grad_norm": 0.4017015638792494,
"learning_rate": 2.560792278732768e-06,
"loss": 0.9821,
"step": 747
},
{
"epoch": 3.1970021413276233,
"grad_norm": 0.4407901593054486,
"learning_rate": 2.534156148604207e-06,
"loss": 0.8664,
"step": 748
},
{
"epoch": 3.201284796573876,
"grad_norm": 0.3486898375672858,
"learning_rate": 2.5076436370315496e-06,
"loss": 0.9108,
"step": 749
},
{
"epoch": 3.2055674518201283,
"grad_norm": 0.38504490186433393,
"learning_rate": 2.4812550728819188e-06,
"loss": 0.9088,
"step": 750
},
{
"epoch": 3.209850107066381,
"grad_norm": 0.4564848674737477,
"learning_rate": 2.4549907834849644e-06,
"loss": 0.9815,
"step": 751
},
{
"epoch": 3.2141327623126337,
"grad_norm": 0.3627458052575124,
"learning_rate": 2.4288510946288063e-06,
"loss": 0.9947,
"step": 752
},
{
"epoch": 3.2184154175588864,
"grad_norm": 0.39127133347387394,
"learning_rate": 2.4028363305559894e-06,
"loss": 0.855,
"step": 753
},
{
"epoch": 3.222698072805139,
"grad_norm": 1.0193828924775918,
"learning_rate": 2.3769468139594727e-06,
"loss": 0.9804,
"step": 754
},
{
"epoch": 3.226980728051392,
"grad_norm": 0.37456178415299207,
"learning_rate": 2.3511828659785975e-06,
"loss": 0.9075,
"step": 755
},
{
"epoch": 3.2312633832976445,
"grad_norm": 0.39856388723773317,
"learning_rate": 2.3255448061951514e-06,
"loss": 0.8887,
"step": 756
},
{
"epoch": 3.235546038543897,
"grad_norm": 0.38837064304140856,
"learning_rate": 2.3000329526293456e-06,
"loss": 0.9574,
"step": 757
},
{
"epoch": 3.23982869379015,
"grad_norm": 0.40519546139819784,
"learning_rate": 2.2746476217359285e-06,
"loss": 0.9492,
"step": 758
},
{
"epoch": 3.2441113490364026,
"grad_norm": 0.37621301359779613,
"learning_rate": 2.249389128400219e-06,
"loss": 0.9414,
"step": 759
},
{
"epoch": 3.2483940042826553,
"grad_norm": 0.40315000345725827,
"learning_rate": 2.224257785934217e-06,
"loss": 0.8958,
"step": 760
},
{
"epoch": 3.252676659528908,
"grad_norm": 0.3767855628173954,
"learning_rate": 2.1992539060727137e-06,
"loss": 0.8632,
"step": 761
},
{
"epoch": 3.2569593147751608,
"grad_norm": 0.38794589527885637,
"learning_rate": 2.1743777989694292e-06,
"loss": 0.8607,
"step": 762
},
{
"epoch": 3.2612419700214135,
"grad_norm": 0.365416432156038,
"learning_rate": 2.1496297731931557e-06,
"loss": 0.9429,
"step": 763
},
{
"epoch": 3.265524625267666,
"grad_norm": 0.38839779583938294,
"learning_rate": 2.1250101357239426e-06,
"loss": 0.8837,
"step": 764
},
{
"epoch": 3.2698072805139184,
"grad_norm": 0.3983702485606278,
"learning_rate": 2.1005191919492795e-06,
"loss": 0.9003,
"step": 765
},
{
"epoch": 3.274089935760171,
"grad_norm": 0.36784959637004716,
"learning_rate": 2.0761572456603066e-06,
"loss": 0.9904,
"step": 766
},
{
"epoch": 3.278372591006424,
"grad_norm": 0.4086191337846277,
"learning_rate": 2.051924599048058e-06,
"loss": 0.9865,
"step": 767
},
{
"epoch": 3.2826552462526766,
"grad_norm": 0.385807346281981,
"learning_rate": 2.027821552699695e-06,
"loss": 0.8834,
"step": 768
},
{
"epoch": 3.2869379014989293,
"grad_norm": 0.38623842578363365,
"learning_rate": 2.0038484055948076e-06,
"loss": 0.8881,
"step": 769
},
{
"epoch": 3.291220556745182,
"grad_norm": 0.43545389555296216,
"learning_rate": 1.9800054551016593e-06,
"loss": 0.9753,
"step": 770
},
{
"epoch": 3.2955032119914347,
"grad_norm": 0.41514320812303884,
"learning_rate": 1.9562929969735494e-06,
"loss": 0.9497,
"step": 771
},
{
"epoch": 3.2997858672376874,
"grad_norm": 0.4038608215680401,
"learning_rate": 1.93271132534511e-06,
"loss": 0.8644,
"step": 772
},
{
"epoch": 3.30406852248394,
"grad_norm": 0.3644719902383785,
"learning_rate": 1.909260732728668e-06,
"loss": 0.9556,
"step": 773
},
{
"epoch": 3.308351177730193,
"grad_norm": 0.42036574911137053,
"learning_rate": 1.885941510010622e-06,
"loss": 0.8886,
"step": 774
},
{
"epoch": 3.3126338329764455,
"grad_norm": 0.42796972706377573,
"learning_rate": 1.8627539464478219e-06,
"loss": 0.9207,
"step": 775
},
{
"epoch": 3.3169164882226982,
"grad_norm": 0.42284493016560876,
"learning_rate": 1.8396983296639928e-06,
"loss": 0.9094,
"step": 776
},
{
"epoch": 3.3211991434689505,
"grad_norm": 0.34934919011874943,
"learning_rate": 1.816774945646163e-06,
"loss": 0.8775,
"step": 777
},
{
"epoch": 3.325481798715203,
"grad_norm": 0.6600800009141096,
"learning_rate": 1.7939840787411135e-06,
"loss": 1.0994,
"step": 778
},
{
"epoch": 3.329764453961456,
"grad_norm": 0.3976354396493046,
"learning_rate": 1.771326011651854e-06,
"loss": 0.9024,
"step": 779
},
{
"epoch": 3.3340471092077086,
"grad_norm": 0.376362118495897,
"learning_rate": 1.7488010254341172e-06,
"loss": 0.8615,
"step": 780
},
{
"epoch": 3.3383297644539613,
"grad_norm": 0.40607166419814433,
"learning_rate": 1.7264093994928648e-06,
"loss": 0.912,
"step": 781
},
{
"epoch": 3.342612419700214,
"grad_norm": 0.4191724820681144,
"learning_rate": 1.7041514115788428e-06,
"loss": 0.8292,
"step": 782
},
{
"epoch": 3.3468950749464668,
"grad_norm": 0.3781354302914862,
"learning_rate": 1.6820273377850997e-06,
"loss": 0.8707,
"step": 783
},
{
"epoch": 3.3511777301927195,
"grad_norm": 0.42426853842502676,
"learning_rate": 1.6600374525436057e-06,
"loss": 0.7958,
"step": 784
},
{
"epoch": 3.355460385438972,
"grad_norm": 0.39253316989568815,
"learning_rate": 1.6381820286218027e-06,
"loss": 0.9362,
"step": 785
},
{
"epoch": 3.359743040685225,
"grad_norm": 0.42081804014283164,
"learning_rate": 1.6164613371192668e-06,
"loss": 0.8808,
"step": 786
},
{
"epoch": 3.3640256959314776,
"grad_norm": 0.3805908666364616,
"learning_rate": 1.5948756474643098e-06,
"loss": 0.9281,
"step": 787
},
{
"epoch": 3.3683083511777303,
"grad_norm": 0.3931155152751046,
"learning_rate": 1.5734252274106549e-06,
"loss": 0.8649,
"step": 788
},
{
"epoch": 3.372591006423983,
"grad_norm": 0.36893746954226686,
"learning_rate": 1.5521103430341063e-06,
"loss": 0.9245,
"step": 789
},
{
"epoch": 3.3768736616702357,
"grad_norm": 0.421055167309563,
"learning_rate": 1.5309312587292595e-06,
"loss": 0.9075,
"step": 790
},
{
"epoch": 3.3811563169164884,
"grad_norm": 0.39708496701725404,
"learning_rate": 1.5098882372062084e-06,
"loss": 0.9268,
"step": 791
},
{
"epoch": 3.385438972162741,
"grad_norm": 0.4147457741610103,
"learning_rate": 1.488981539487308e-06,
"loss": 0.9095,
"step": 792
},
{
"epoch": 3.3897216274089934,
"grad_norm": 0.3870528540533133,
"learning_rate": 1.4682114249039007e-06,
"loss": 0.9108,
"step": 793
},
{
"epoch": 3.394004282655246,
"grad_norm": 0.37912624135371875,
"learning_rate": 1.447578151093143e-06,
"loss": 0.8086,
"step": 794
},
{
"epoch": 3.398286937901499,
"grad_norm": 0.39893951982141634,
"learning_rate": 1.427081973994769e-06,
"loss": 0.8207,
"step": 795
},
{
"epoch": 3.4025695931477515,
"grad_norm": 0.41813759081817203,
"learning_rate": 1.4067231478479465e-06,
"loss": 0.8587,
"step": 796
},
{
"epoch": 3.4068522483940042,
"grad_norm": 0.37522463321771077,
"learning_rate": 1.386501925188112e-06,
"loss": 0.9387,
"step": 797
},
{
"epoch": 3.411134903640257,
"grad_norm": 0.40082201779472715,
"learning_rate": 1.3664185568438252e-06,
"loss": 0.8501,
"step": 798
},
{
"epoch": 3.4154175588865097,
"grad_norm": 0.4044778971930763,
"learning_rate": 1.3464732919336877e-06,
"loss": 0.9708,
"step": 799
},
{
"epoch": 3.4197002141327624,
"grad_norm": 0.3999055484285562,
"learning_rate": 1.32666637786322e-06,
"loss": 0.832,
"step": 800
},
{
"epoch": 3.423982869379015,
"grad_norm": 0.3940297074656928,
"learning_rate": 1.3069980603218165e-06,
"loss": 0.8606,
"step": 801
},
{
"epoch": 3.428265524625268,
"grad_norm": 0.4037209018320114,
"learning_rate": 1.2874685832796856e-06,
"loss": 0.9606,
"step": 802
},
{
"epoch": 3.4325481798715205,
"grad_norm": 0.36235619726375323,
"learning_rate": 1.2680781889848296e-06,
"loss": 0.8037,
"step": 803
},
{
"epoch": 3.436830835117773,
"grad_norm": 0.4134563817140967,
"learning_rate": 1.248827117960033e-06,
"loss": 0.9296,
"step": 804
},
{
"epoch": 3.4411134903640255,
"grad_norm": 0.37477385004204616,
"learning_rate": 1.2297156089998887e-06,
"loss": 0.8875,
"step": 805
},
{
"epoch": 3.445396145610278,
"grad_norm": 0.3598044961225808,
"learning_rate": 1.2107438991678252e-06,
"loss": 0.9181,
"step": 806
},
{
"epoch": 3.449678800856531,
"grad_norm": 0.4068544774348545,
"learning_rate": 1.191912223793179e-06,
"loss": 0.802,
"step": 807
},
{
"epoch": 3.4539614561027836,
"grad_norm": 0.39025679795801216,
"learning_rate": 1.1732208164682567e-06,
"loss": 0.9481,
"step": 808
},
{
"epoch": 3.4582441113490363,
"grad_norm": 0.40099768389636997,
"learning_rate": 1.1546699090454596e-06,
"loss": 0.8793,
"step": 809
},
{
"epoch": 3.462526766595289,
"grad_norm": 0.3527515368666591,
"learning_rate": 1.1362597316343897e-06,
"loss": 0.8926,
"step": 810
},
{
"epoch": 3.4668094218415417,
"grad_norm": 0.3960092351592858,
"learning_rate": 1.117990512599007e-06,
"loss": 0.8198,
"step": 811
},
{
"epoch": 3.4710920770877944,
"grad_norm": 0.37647074443425715,
"learning_rate": 1.0998624785547916e-06,
"loss": 0.8726,
"step": 812
},
{
"epoch": 3.475374732334047,
"grad_norm": 0.4260177464381465,
"learning_rate": 1.081875854365924e-06,
"loss": 0.8411,
"step": 813
},
{
"epoch": 3.4796573875803,
"grad_norm": 0.3678229667943419,
"learning_rate": 1.0640308631425206e-06,
"loss": 0.9303,
"step": 814
},
{
"epoch": 3.4839400428265526,
"grad_norm": 0.40562771211697285,
"learning_rate": 1.0463277262378418e-06,
"loss": 0.9258,
"step": 815
},
{
"epoch": 3.4882226980728053,
"grad_norm": 0.39758544559495274,
"learning_rate": 1.0287666632455562e-06,
"loss": 0.8981,
"step": 816
},
{
"epoch": 3.492505353319058,
"grad_norm": 0.4330255432907014,
"learning_rate": 1.0113478919970166e-06,
"loss": 0.877,
"step": 817
},
{
"epoch": 3.4967880085653107,
"grad_norm": 0.4091350493182955,
"learning_rate": 9.940716285585572e-07,
"loss": 0.8589,
"step": 818
},
{
"epoch": 3.5010706638115634,
"grad_norm": 0.3756040003940408,
"learning_rate": 9.769380872288112e-07,
"loss": 0.8303,
"step": 819
},
{
"epoch": 3.505353319057816,
"grad_norm": 0.3845542537371508,
"learning_rate": 9.599474805360636e-07,
"loss": 0.8673,
"step": 820
},
{
"epoch": 3.5096359743040684,
"grad_norm": 0.3621491496685947,
"learning_rate": 9.431000192355904e-07,
"loss": 0.8285,
"step": 821
},
{
"epoch": 3.513918629550321,
"grad_norm": 0.38581119937487457,
"learning_rate": 9.263959123070792e-07,
"loss": 0.9607,
"step": 822
},
{
"epoch": 3.518201284796574,
"grad_norm": 0.40699298803550954,
"learning_rate": 9.098353669519985e-07,
"loss": 0.9999,
"step": 823
},
{
"epoch": 3.5224839400428265,
"grad_norm": 0.36404111618752655,
"learning_rate": 8.934185885910634e-07,
"loss": 0.9621,
"step": 824
},
{
"epoch": 3.526766595289079,
"grad_norm": 0.4080837339902542,
"learning_rate": 8.771457808616615e-07,
"loss": 0.9385,
"step": 825
},
{
"epoch": 3.531049250535332,
"grad_norm": 0.37542101809408207,
"learning_rate": 8.610171456153407e-07,
"loss": 0.8838,
"step": 826
},
{
"epoch": 3.5353319057815846,
"grad_norm": 0.3622139219889446,
"learning_rate": 8.450328829152962e-07,
"loss": 0.9147,
"step": 827
},
{
"epoch": 3.5396145610278373,
"grad_norm": 0.41604941573448845,
"learning_rate": 8.291931910339016e-07,
"loss": 1.0337,
"step": 828
},
{
"epoch": 3.54389721627409,
"grad_norm": 0.3702662014383576,
"learning_rate": 8.134982664502313e-07,
"loss": 0.8722,
"step": 829
},
{
"epoch": 3.5481798715203428,
"grad_norm": 0.3968324847661136,
"learning_rate": 7.979483038476496e-07,
"loss": 0.8719,
"step": 830
},
{
"epoch": 3.552462526766595,
"grad_norm": 0.37196472198781777,
"learning_rate": 7.825434961113612e-07,
"loss": 0.9101,
"step": 831
},
{
"epoch": 3.5567451820128477,
"grad_norm": 0.404292826856257,
"learning_rate": 7.672840343260503e-07,
"loss": 0.883,
"step": 832
},
{
"epoch": 3.5610278372591004,
"grad_norm": 0.3986607359258053,
"learning_rate": 7.521701077734921e-07,
"loss": 0.914,
"step": 833
},
{
"epoch": 3.565310492505353,
"grad_norm": 0.37342839604299854,
"learning_rate": 7.372019039302111e-07,
"loss": 0.8733,
"step": 834
},
{
"epoch": 3.569593147751606,
"grad_norm": 0.3789431810782268,
"learning_rate": 7.223796084651596e-07,
"loss": 1.0656,
"step": 835
},
{
"epoch": 3.5738758029978586,
"grad_norm": 0.4143391476747435,
"learning_rate": 7.077034052373991e-07,
"loss": 0.9481,
"step": 836
},
{
"epoch": 3.5781584582441113,
"grad_norm": 0.3802282910841205,
"learning_rate": 6.931734762938416e-07,
"loss": 0.8704,
"step": 837
},
{
"epoch": 3.582441113490364,
"grad_norm": 0.4383295863697292,
"learning_rate": 6.787900018669747e-07,
"loss": 0.8664,
"step": 838
},
{
"epoch": 3.5867237687366167,
"grad_norm": 0.3620529674823113,
"learning_rate": 6.645531603726287e-07,
"loss": 0.8701,
"step": 839
},
{
"epoch": 3.5910064239828694,
"grad_norm": 0.4003391688371413,
"learning_rate": 6.50463128407773e-07,
"loss": 0.956,
"step": 840
},
{
"epoch": 3.595289079229122,
"grad_norm": 0.35710168185845254,
"learning_rate": 6.365200807483138e-07,
"loss": 0.9395,
"step": 841
},
{
"epoch": 3.599571734475375,
"grad_norm": 0.3888127985496108,
"learning_rate": 6.227241903469322e-07,
"loss": 0.868,
"step": 842
},
{
"epoch": 3.6038543897216275,
"grad_norm": 0.3788842530917126,
"learning_rate": 6.090756283309379e-07,
"loss": 0.9023,
"step": 843
},
{
"epoch": 3.6081370449678802,
"grad_norm": 0.4293764780811211,
"learning_rate": 5.955745640001453e-07,
"loss": 0.912,
"step": 844
},
{
"epoch": 3.612419700214133,
"grad_norm": 0.36701829937079145,
"learning_rate": 5.822211648247797e-07,
"loss": 0.9178,
"step": 845
},
{
"epoch": 3.6167023554603857,
"grad_norm": 0.420252154230346,
"learning_rate": 5.690155964433868e-07,
"loss": 0.9341,
"step": 846
},
{
"epoch": 3.6209850107066384,
"grad_norm": 0.4321448436806155,
"learning_rate": 5.559580226607921e-07,
"loss": 0.9177,
"step": 847
},
{
"epoch": 3.6252676659528906,
"grad_norm": 0.37257126041542216,
"learning_rate": 5.430486054460629e-07,
"loss": 0.9424,
"step": 848
},
{
"epoch": 3.6295503211991433,
"grad_norm": 0.3772731501472801,
"learning_rate": 5.30287504930492e-07,
"loss": 0.9146,
"step": 849
},
{
"epoch": 3.633832976445396,
"grad_norm": 0.3877711033336446,
"learning_rate": 5.176748794056316e-07,
"loss": 0.912,
"step": 850
},
{
"epoch": 3.6381156316916488,
"grad_norm": 0.3770006556479151,
"learning_rate": 5.052108853213e-07,
"loss": 1.0339,
"step": 851
},
{
"epoch": 3.6423982869379015,
"grad_norm": 0.40082811910610466,
"learning_rate": 4.928956772836751e-07,
"loss": 0.9,
"step": 852
},
{
"epoch": 3.646680942184154,
"grad_norm": 0.4080349447803649,
"learning_rate": 4.807294080533486e-07,
"loss": 0.9017,
"step": 853
},
{
"epoch": 3.650963597430407,
"grad_norm": 0.3750103705444987,
"learning_rate": 4.687122285434456e-07,
"loss": 0.9218,
"step": 854
},
{
"epoch": 3.6552462526766596,
"grad_norm": 0.4168122554116308,
"learning_rate": 4.568442878177467e-07,
"loss": 0.9165,
"step": 855
},
{
"epoch": 3.6595289079229123,
"grad_norm": 0.42052436299883195,
"learning_rate": 4.451257330888442e-07,
"loss": 1.0046,
"step": 856
},
{
"epoch": 3.663811563169165,
"grad_norm": 0.3775819321872966,
"learning_rate": 4.33556709716311e-07,
"loss": 0.8148,
"step": 857
},
{
"epoch": 3.6680942184154177,
"grad_norm": 0.40588411521050055,
"learning_rate": 4.2213736120490373e-07,
"loss": 0.9766,
"step": 858
},
{
"epoch": 3.67237687366167,
"grad_norm": 0.3879183257896917,
"learning_rate": 4.1086782920276845e-07,
"loss": 0.9038,
"step": 859
},
{
"epoch": 3.6766595289079227,
"grad_norm": 0.371088950938356,
"learning_rate": 3.997482534997071e-07,
"loss": 0.9691,
"step": 860
},
{
"epoch": 3.6809421841541754,
"grad_norm": 0.3974254305078794,
"learning_rate": 3.8877877202541793e-07,
"loss": 0.9505,
"step": 861
},
{
"epoch": 3.685224839400428,
"grad_norm": 0.38333801357842573,
"learning_rate": 3.779595208478065e-07,
"loss": 0.8308,
"step": 862
},
{
"epoch": 3.689507494646681,
"grad_norm": 0.37315579927328224,
"learning_rate": 3.6729063417128285e-07,
"loss": 0.8951,
"step": 863
},
{
"epoch": 3.6937901498929335,
"grad_norm": 0.41169860046752177,
"learning_rate": 3.567722443351032e-07,
"loss": 0.856,
"step": 864
},
{
"epoch": 3.6980728051391862,
"grad_norm": 0.3540168865001641,
"learning_rate": 3.464044818117268e-07,
"loss": 0.9567,
"step": 865
},
{
"epoch": 3.702355460385439,
"grad_norm": 0.41805384086496045,
"learning_rate": 3.361874752051991e-07,
"loss": 0.8485,
"step": 866
},
{
"epoch": 3.7066381156316917,
"grad_norm": 0.3932453571640372,
"learning_rate": 3.2612135124955453e-07,
"loss": 0.8981,
"step": 867
},
{
"epoch": 3.7109207708779444,
"grad_norm": 0.35556756655208993,
"learning_rate": 3.1620623480724807e-07,
"loss": 0.7991,
"step": 868
},
{
"epoch": 3.715203426124197,
"grad_norm": 0.38025591039841,
"learning_rate": 3.064422488675986e-07,
"loss": 0.921,
"step": 869
},
{
"epoch": 3.71948608137045,
"grad_norm": 0.39447979117902376,
"learning_rate": 2.968295145452715e-07,
"loss": 0.8516,
"step": 870
},
{
"epoch": 3.7237687366167025,
"grad_norm": 0.36729974047622016,
"learning_rate": 2.8736815107877626e-07,
"loss": 0.9292,
"step": 871
},
{
"epoch": 3.728051391862955,
"grad_norm": 0.3892287341045359,
"learning_rate": 2.7805827582897683e-07,
"loss": 0.8804,
"step": 872
},
{
"epoch": 3.732334047109208,
"grad_norm": 0.41914843746271097,
"learning_rate": 2.6890000427765157e-07,
"loss": 0.8756,
"step": 873
},
{
"epoch": 3.7366167023554606,
"grad_norm": 0.39627355945962395,
"learning_rate": 2.598934500260455e-07,
"loss": 0.9612,
"step": 874
},
{
"epoch": 3.7408993576017133,
"grad_norm": 0.40215083865929563,
"learning_rate": 2.510387247934759e-07,
"loss": 1.0171,
"step": 875
},
{
"epoch": 3.7451820128479656,
"grad_norm": 0.3908638307412036,
"learning_rate": 2.4233593841593295e-07,
"loss": 0.8599,
"step": 876
},
{
"epoch": 3.7494646680942183,
"grad_norm": 0.4326871280589204,
"learning_rate": 2.3378519884472428e-07,
"loss": 1.0263,
"step": 877
},
{
"epoch": 3.753747323340471,
"grad_norm": 0.38245250647594886,
"learning_rate": 2.25386612145137e-07,
"loss": 0.9593,
"step": 878
},
{
"epoch": 3.7580299785867237,
"grad_norm": 0.3778573404558164,
"learning_rate": 2.1714028249511798e-07,
"loss": 0.9466,
"step": 879
},
{
"epoch": 3.7623126338329764,
"grad_norm": 0.3700075006593136,
"learning_rate": 2.0904631218398445e-07,
"loss": 0.8128,
"step": 880
},
{
"epoch": 3.766595289079229,
"grad_norm": 0.3843775256635492,
"learning_rate": 2.011048016111544e-07,
"loss": 0.9134,
"step": 881
},
{
"epoch": 3.770877944325482,
"grad_norm": 0.385219325392379,
"learning_rate": 1.9331584928490159e-07,
"loss": 0.8527,
"step": 882
},
{
"epoch": 3.7751605995717346,
"grad_norm": 0.36661581147669026,
"learning_rate": 1.8567955182113295e-07,
"loss": 0.8592,
"step": 883
},
{
"epoch": 3.7794432548179873,
"grad_norm": 0.401361109957553,
"learning_rate": 1.7819600394218956e-07,
"loss": 0.9088,
"step": 884
},
{
"epoch": 3.78372591006424,
"grad_norm": 0.32988480791991265,
"learning_rate": 1.7086529847566979e-07,
"loss": 0.7957,
"step": 885
},
{
"epoch": 3.7880085653104922,
"grad_norm": 0.37989640262936986,
"learning_rate": 1.6368752635328998e-07,
"loss": 0.8675,
"step": 886
},
{
"epoch": 3.792291220556745,
"grad_norm": 0.3937658078234294,
"learning_rate": 1.5666277660973533e-07,
"loss": 0.8864,
"step": 887
},
{
"epoch": 3.7965738758029977,
"grad_norm": 0.3722219853982238,
"learning_rate": 1.49791136381576e-07,
"loss": 0.9096,
"step": 888
},
{
"epoch": 3.8008565310492504,
"grad_norm": 0.37559569493426515,
"learning_rate": 1.430726909061722e-07,
"loss": 0.8924,
"step": 889
},
{
"epoch": 3.805139186295503,
"grad_norm": 0.38719709372883876,
"learning_rate": 1.3650752352062508e-07,
"loss": 0.8479,
"step": 890
},
{
"epoch": 3.809421841541756,
"grad_norm": 0.3911144136584381,
"learning_rate": 1.3009571566073853e-07,
"loss": 0.9491,
"step": 891
},
{
"epoch": 3.8137044967880085,
"grad_norm": 0.37807417768830925,
"learning_rate": 1.238373468600118e-07,
"loss": 0.9301,
"step": 892
},
{
"epoch": 3.817987152034261,
"grad_norm": 0.37694080855509665,
"learning_rate": 1.1773249474865133e-07,
"loss": 0.8065,
"step": 893
},
{
"epoch": 3.822269807280514,
"grad_norm": 0.388921594089528,
"learning_rate": 1.1178123505260623e-07,
"loss": 0.9592,
"step": 894
},
{
"epoch": 3.8265524625267666,
"grad_norm": 0.4116324419131167,
"learning_rate": 1.0598364159263436e-07,
"loss": 0.8211,
"step": 895
},
{
"epoch": 3.8308351177730193,
"grad_norm": 0.36448244518924466,
"learning_rate": 1.0033978628338214e-07,
"loss": 0.8574,
"step": 896
},
{
"epoch": 3.835117773019272,
"grad_norm": 0.37097780876337194,
"learning_rate": 9.484973913249096e-08,
"loss": 0.9514,
"step": 897
},
{
"epoch": 3.8394004282655247,
"grad_norm": 0.36937494307460916,
"learning_rate": 8.95135682397366e-08,
"loss": 1.0152,
"step": 898
},
{
"epoch": 3.8436830835117775,
"grad_norm": 0.38701761947361546,
"learning_rate": 8.433133979617313e-08,
"loss": 0.944,
"step": 899
},
{
"epoch": 3.84796573875803,
"grad_norm": 0.4062184881145919,
"learning_rate": 7.930311808332092e-08,
"loss": 0.9758,
"step": 900
},
{
"epoch": 3.852248394004283,
"grad_norm": 0.37343762843807315,
"learning_rate": 7.442896547237011e-08,
"loss": 0.8735,
"step": 901
},
{
"epoch": 3.8565310492505356,
"grad_norm": 0.3671379727642055,
"learning_rate": 6.970894242339516e-08,
"loss": 0.8647,
"step": 902
},
{
"epoch": 3.860813704496788,
"grad_norm": 0.3958355267876771,
"learning_rate": 6.514310748462205e-08,
"loss": 0.9561,
"step": 903
},
{
"epoch": 3.8650963597430406,
"grad_norm": 0.382409326734392,
"learning_rate": 6.073151729168585e-08,
"loss": 0.8091,
"step": 904
},
{
"epoch": 3.8693790149892933,
"grad_norm": 0.4074968347015751,
"learning_rate": 5.6474226566938236e-08,
"loss": 0.9165,
"step": 905
},
{
"epoch": 3.873661670235546,
"grad_norm": 0.3600503231444295,
"learning_rate": 5.2371288118764626e-08,
"loss": 0.8608,
"step": 906
},
{
"epoch": 3.8779443254817987,
"grad_norm": 0.4385570932475021,
"learning_rate": 4.8422752840933393e-08,
"loss": 1.0001,
"step": 907
},
{
"epoch": 3.8822269807280514,
"grad_norm": 0.3526075337659528,
"learning_rate": 4.462866971195745e-08,
"loss": 0.8845,
"step": 908
},
{
"epoch": 3.886509635974304,
"grad_norm": 0.3912267742586606,
"learning_rate": 4.098908579449334e-08,
"loss": 0.9521,
"step": 909
},
{
"epoch": 3.890792291220557,
"grad_norm": 0.38329918065994895,
"learning_rate": 3.750404623475284e-08,
"loss": 0.9337,
"step": 910
},
{
"epoch": 3.8950749464668095,
"grad_norm": 0.3671570660694989,
"learning_rate": 3.4173594261947826e-08,
"loss": 0.8763,
"step": 911
},
{
"epoch": 3.8993576017130622,
"grad_norm": 0.36467443117322795,
"learning_rate": 3.099777118774766e-08,
"loss": 0.7929,
"step": 912
},
{
"epoch": 3.903640256959315,
"grad_norm": 0.3850495904484138,
"learning_rate": 2.797661640577265e-08,
"loss": 0.8685,
"step": 913
},
{
"epoch": 3.907922912205567,
"grad_norm": 0.3947552978578375,
"learning_rate": 2.511016739110139e-08,
"loss": 1.0001,
"step": 914
},
{
"epoch": 3.91220556745182,
"grad_norm": 0.35654913515444236,
"learning_rate": 2.2398459699811415e-08,
"loss": 0.8357,
"step": 915
},
{
"epoch": 3.9164882226980726,
"grad_norm": 0.3755511134463352,
"learning_rate": 1.9841526968528145e-08,
"loss": 0.8337,
"step": 916
},
{
"epoch": 3.9207708779443253,
"grad_norm": 0.4178807150767805,
"learning_rate": 1.74394009140183e-08,
"loss": 1.0103,
"step": 917
},
{
"epoch": 3.925053533190578,
"grad_norm": 0.36343375952599793,
"learning_rate": 1.5192111332791582e-08,
"loss": 1.0066,
"step": 918
},
{
"epoch": 3.9293361884368307,
"grad_norm": 0.42967159160836504,
"learning_rate": 1.3099686100728758e-08,
"loss": 0.8981,
"step": 919
},
{
"epoch": 3.9336188436830835,
"grad_norm": 0.37863498483420355,
"learning_rate": 1.1162151172741664e-08,
"loss": 0.9011,
"step": 920
},
{
"epoch": 3.937901498929336,
"grad_norm": 0.36158296723772976,
"learning_rate": 9.379530582445672e-09,
"loss": 0.9935,
"step": 921
},
{
"epoch": 3.942184154175589,
"grad_norm": 0.3992757868964545,
"learning_rate": 7.751846441866883e-09,
"loss": 0.9523,
"step": 922
},
{
"epoch": 3.9464668094218416,
"grad_norm": 0.4093754834796768,
"learning_rate": 6.279118941163176e-09,
"loss": 0.9193,
"step": 923
},
{
"epoch": 3.9507494646680943,
"grad_norm": 0.3812402104730706,
"learning_rate": 4.961366348374408e-09,
"loss": 0.8255,
"step": 924
},
{
"epoch": 3.955032119914347,
"grad_norm": 0.3932024496097198,
"learning_rate": 3.798605009198986e-09,
"loss": 0.8468,
"step": 925
},
{
"epoch": 3.9593147751605997,
"grad_norm": 0.36137752700716075,
"learning_rate": 2.790849346788471e-09,
"loss": 0.8799,
"step": 926
},
{
"epoch": 3.9635974304068524,
"grad_norm": 0.39672575824023565,
"learning_rate": 1.9381118615699467e-09,
"loss": 0.9367,
"step": 927
},
{
"epoch": 3.967880085653105,
"grad_norm": 0.4049246679049995,
"learning_rate": 1.240403131090584e-09,
"loss": 0.9305,
"step": 928
},
{
"epoch": 3.972162740899358,
"grad_norm": 0.36851044379383624,
"learning_rate": 6.977318098844165e-10,
"loss": 0.8928,
"step": 929
},
{
"epoch": 3.9764453961456105,
"grad_norm": 0.3887303558382742,
"learning_rate": 3.1010462936825745e-10,
"loss": 0.8732,
"step": 930
},
{
"epoch": 3.980728051391863,
"grad_norm": 0.38791626187967704,
"learning_rate": 7.752639775565618e-11,
"loss": 0.9141,
"step": 931
},
{
"epoch": 3.9850107066381155,
"grad_norm": 0.3676480337759505,
"learning_rate": 0.0,
"loss": 0.9296,
"step": 932
}
],
"logging_steps": 1,
"max_steps": 932,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 117,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.2867186494210048e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}