{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9879518072289155,
"eval_steps": 500,
"global_step": 664,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030120481927710845,
"grad_norm": 0.4040583074092865,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.684,
"step": 1
},
{
"epoch": 0.006024096385542169,
"grad_norm": 0.4095500409603119,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.6744,
"step": 2
},
{
"epoch": 0.009036144578313253,
"grad_norm": 0.43334975838661194,
"learning_rate": 3e-06,
"loss": 1.5956,
"step": 3
},
{
"epoch": 0.012048192771084338,
"grad_norm": 0.39737147092819214,
"learning_rate": 4.000000000000001e-06,
"loss": 1.6404,
"step": 4
},
{
"epoch": 0.015060240963855422,
"grad_norm": 0.41804248094558716,
"learning_rate": 5e-06,
"loss": 1.6398,
"step": 5
},
{
"epoch": 0.018072289156626505,
"grad_norm": 0.41640806198120117,
"learning_rate": 6e-06,
"loss": 1.6439,
"step": 6
},
{
"epoch": 0.02108433734939759,
"grad_norm": 0.41058269143104553,
"learning_rate": 7e-06,
"loss": 1.595,
"step": 7
},
{
"epoch": 0.024096385542168676,
"grad_norm": 0.3926224708557129,
"learning_rate": 8.000000000000001e-06,
"loss": 1.6345,
"step": 8
},
{
"epoch": 0.02710843373493976,
"grad_norm": 0.4289781451225281,
"learning_rate": 9e-06,
"loss": 1.644,
"step": 9
},
{
"epoch": 0.030120481927710843,
"grad_norm": 0.40193450450897217,
"learning_rate": 1e-05,
"loss": 1.5528,
"step": 10
},
{
"epoch": 0.03313253012048193,
"grad_norm": 0.4374159574508667,
"learning_rate": 9.999942312273667e-06,
"loss": 1.6132,
"step": 11
},
{
"epoch": 0.03614457831325301,
"grad_norm": 0.42067164182662964,
"learning_rate": 9.999769250425817e-06,
"loss": 1.5811,
"step": 12
},
{
"epoch": 0.0391566265060241,
"grad_norm": 0.45214423537254333,
"learning_rate": 9.999480818449868e-06,
"loss": 1.5985,
"step": 13
},
{
"epoch": 0.04216867469879518,
"grad_norm": 0.42575493454933167,
"learning_rate": 9.999077023001411e-06,
"loss": 1.6494,
"step": 14
},
{
"epoch": 0.045180722891566265,
"grad_norm": 0.4473470151424408,
"learning_rate": 9.998557873398066e-06,
"loss": 1.6773,
"step": 15
},
{
"epoch": 0.04819277108433735,
"grad_norm": 0.4473958909511566,
"learning_rate": 9.997923381619257e-06,
"loss": 1.6342,
"step": 16
},
{
"epoch": 0.05120481927710843,
"grad_norm": 0.46099603176116943,
"learning_rate": 9.997173562305937e-06,
"loss": 1.5608,
"step": 17
},
{
"epoch": 0.05421686746987952,
"grad_norm": 0.44749438762664795,
"learning_rate": 9.996308432760257e-06,
"loss": 1.6081,
"step": 18
},
{
"epoch": 0.0572289156626506,
"grad_norm": 0.46720796823501587,
"learning_rate": 9.995328012945158e-06,
"loss": 1.595,
"step": 19
},
{
"epoch": 0.060240963855421686,
"grad_norm": 0.436519056558609,
"learning_rate": 9.994232325483917e-06,
"loss": 1.6167,
"step": 20
},
{
"epoch": 0.06325301204819277,
"grad_norm": 0.42265141010284424,
"learning_rate": 9.99302139565962e-06,
"loss": 1.6195,
"step": 21
},
{
"epoch": 0.06626506024096386,
"grad_norm": 0.3948360085487366,
"learning_rate": 9.991695251414584e-06,
"loss": 1.5915,
"step": 22
},
{
"epoch": 0.06927710843373494,
"grad_norm": 0.4320020377635956,
"learning_rate": 9.990253923349706e-06,
"loss": 1.5906,
"step": 23
},
{
"epoch": 0.07228915662650602,
"grad_norm": 0.44574347138404846,
"learning_rate": 9.988697444723763e-06,
"loss": 1.5712,
"step": 24
},
{
"epoch": 0.07530120481927711,
"grad_norm": 0.41239818930625916,
"learning_rate": 9.98702585145264e-06,
"loss": 1.5752,
"step": 25
},
{
"epoch": 0.0783132530120482,
"grad_norm": 0.42800942063331604,
"learning_rate": 9.9852391821085e-06,
"loss": 1.5531,
"step": 26
},
{
"epoch": 0.08132530120481928,
"grad_norm": 0.41428813338279724,
"learning_rate": 9.983337477918904e-06,
"loss": 1.5882,
"step": 27
},
{
"epoch": 0.08433734939759036,
"grad_norm": 0.4180893898010254,
"learning_rate": 9.981320782765847e-06,
"loss": 1.5694,
"step": 28
},
{
"epoch": 0.08734939759036145,
"grad_norm": 0.4115102291107178,
"learning_rate": 9.97918914318475e-06,
"loss": 1.5347,
"step": 29
},
{
"epoch": 0.09036144578313253,
"grad_norm": 0.42680180072784424,
"learning_rate": 9.976942608363394e-06,
"loss": 1.5275,
"step": 30
},
{
"epoch": 0.09337349397590361,
"grad_norm": 0.39122140407562256,
"learning_rate": 9.97458123014077e-06,
"loss": 1.4961,
"step": 31
},
{
"epoch": 0.0963855421686747,
"grad_norm": 0.39151236414909363,
"learning_rate": 9.972105063005895e-06,
"loss": 1.5359,
"step": 32
},
{
"epoch": 0.09939759036144578,
"grad_norm": 0.38214412331581116,
"learning_rate": 9.969514164096548e-06,
"loss": 1.5103,
"step": 33
},
{
"epoch": 0.10240963855421686,
"grad_norm": 0.39758872985839844,
"learning_rate": 9.966808593197959e-06,
"loss": 1.4839,
"step": 34
},
{
"epoch": 0.10542168674698796,
"grad_norm": 0.3730682134628296,
"learning_rate": 9.96398841274142e-06,
"loss": 1.4908,
"step": 35
},
{
"epoch": 0.10843373493975904,
"grad_norm": 0.367106556892395,
"learning_rate": 9.96105368780285e-06,
"loss": 1.4372,
"step": 36
},
{
"epoch": 0.11144578313253012,
"grad_norm": 0.3657713532447815,
"learning_rate": 9.958004486101293e-06,
"loss": 1.4791,
"step": 37
},
{
"epoch": 0.1144578313253012,
"grad_norm": 0.3604431450366974,
"learning_rate": 9.954840877997356e-06,
"loss": 1.443,
"step": 38
},
{
"epoch": 0.11746987951807229,
"grad_norm": 0.3567802906036377,
"learning_rate": 9.95156293649158e-06,
"loss": 1.4589,
"step": 39
},
{
"epoch": 0.12048192771084337,
"grad_norm": 0.3510221838951111,
"learning_rate": 9.948170737222763e-06,
"loss": 1.378,
"step": 40
},
{
"epoch": 0.12349397590361445,
"grad_norm": 0.35590696334838867,
"learning_rate": 9.94466435846621e-06,
"loss": 1.4064,
"step": 41
},
{
"epoch": 0.12650602409638553,
"grad_norm": 0.3688894510269165,
"learning_rate": 9.941043881131928e-06,
"loss": 1.3728,
"step": 42
},
{
"epoch": 0.12951807228915663,
"grad_norm": 0.35922420024871826,
"learning_rate": 9.93730938876276e-06,
"loss": 1.3809,
"step": 43
},
{
"epoch": 0.13253012048192772,
"grad_norm": 0.34513840079307556,
"learning_rate": 9.933460967532454e-06,
"loss": 1.4276,
"step": 44
},
{
"epoch": 0.1355421686746988,
"grad_norm": 0.34767019748687744,
"learning_rate": 9.929498706243681e-06,
"loss": 1.3542,
"step": 45
},
{
"epoch": 0.13855421686746988,
"grad_norm": 0.3442816138267517,
"learning_rate": 9.925422696325976e-06,
"loss": 1.3512,
"step": 46
},
{
"epoch": 0.14156626506024098,
"grad_norm": 0.36368539929389954,
"learning_rate": 9.921233031833639e-06,
"loss": 1.3736,
"step": 47
},
{
"epoch": 0.14457831325301204,
"grad_norm": 0.33587586879730225,
"learning_rate": 9.916929809443555e-06,
"loss": 1.3906,
"step": 48
},
{
"epoch": 0.14759036144578314,
"grad_norm": 0.34448426961898804,
"learning_rate": 9.912513128452974e-06,
"loss": 1.362,
"step": 49
},
{
"epoch": 0.15060240963855423,
"grad_norm": 0.3427204489707947,
"learning_rate": 9.907983090777206e-06,
"loss": 1.3292,
"step": 50
},
{
"epoch": 0.1536144578313253,
"grad_norm": 0.38191652297973633,
"learning_rate": 9.903339800947284e-06,
"loss": 1.3735,
"step": 51
},
{
"epoch": 0.1566265060240964,
"grad_norm": 0.3497113287448883,
"learning_rate": 9.898583366107539e-06,
"loss": 1.3776,
"step": 52
},
{
"epoch": 0.15963855421686746,
"grad_norm": 0.31867995858192444,
"learning_rate": 9.893713896013134e-06,
"loss": 1.3058,
"step": 53
},
{
"epoch": 0.16265060240963855,
"grad_norm": 0.3136507272720337,
"learning_rate": 9.888731503027535e-06,
"loss": 1.3463,
"step": 54
},
{
"epoch": 0.16566265060240964,
"grad_norm": 0.3268043100833893,
"learning_rate": 9.883636302119911e-06,
"loss": 1.3189,
"step": 55
},
{
"epoch": 0.1686746987951807,
"grad_norm": 0.3265782296657562,
"learning_rate": 9.878428410862484e-06,
"loss": 1.2983,
"step": 56
},
{
"epoch": 0.1716867469879518,
"grad_norm": 0.30159902572631836,
"learning_rate": 9.873107949427815e-06,
"loss": 1.3545,
"step": 57
},
{
"epoch": 0.1746987951807229,
"grad_norm": 0.3109259307384491,
"learning_rate": 9.867675040586035e-06,
"loss": 1.3894,
"step": 58
},
{
"epoch": 0.17771084337349397,
"grad_norm": 0.30744504928588867,
"learning_rate": 9.862129809702006e-06,
"loss": 1.3777,
"step": 59
},
{
"epoch": 0.18072289156626506,
"grad_norm": 0.3043947219848633,
"learning_rate": 9.856472384732432e-06,
"loss": 1.354,
"step": 60
},
{
"epoch": 0.18373493975903615,
"grad_norm": 0.3052617013454437,
"learning_rate": 9.850702896222908e-06,
"loss": 1.3074,
"step": 61
},
{
"epoch": 0.18674698795180722,
"grad_norm": 0.3007952570915222,
"learning_rate": 9.844821477304904e-06,
"loss": 1.2124,
"step": 62
},
{
"epoch": 0.1897590361445783,
"grad_norm": 0.2832448482513428,
"learning_rate": 9.838828263692693e-06,
"loss": 1.2841,
"step": 63
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.27628499269485474,
"learning_rate": 9.832723393680222e-06,
"loss": 1.2425,
"step": 64
},
{
"epoch": 0.19578313253012047,
"grad_norm": 0.2700969874858856,
"learning_rate": 9.826507008137919e-06,
"loss": 1.2543,
"step": 65
},
{
"epoch": 0.19879518072289157,
"grad_norm": 0.2948736548423767,
"learning_rate": 9.820179250509442e-06,
"loss": 1.2708,
"step": 66
},
{
"epoch": 0.20180722891566266,
"grad_norm": 0.29045990109443665,
"learning_rate": 9.813740266808375e-06,
"loss": 1.3043,
"step": 67
},
{
"epoch": 0.20481927710843373,
"grad_norm": 0.27807915210723877,
"learning_rate": 9.807190205614847e-06,
"loss": 1.206,
"step": 68
},
{
"epoch": 0.20783132530120482,
"grad_norm": 0.267451673746109,
"learning_rate": 9.800529218072112e-06,
"loss": 1.2255,
"step": 69
},
{
"epoch": 0.21084337349397592,
"grad_norm": 0.2782948613166809,
"learning_rate": 9.793757457883062e-06,
"loss": 1.2236,
"step": 70
},
{
"epoch": 0.21385542168674698,
"grad_norm": 0.276692271232605,
"learning_rate": 9.786875081306677e-06,
"loss": 1.2588,
"step": 71
},
{
"epoch": 0.21686746987951808,
"grad_norm": 0.2745719254016876,
"learning_rate": 9.779882247154419e-06,
"loss": 1.215,
"step": 72
},
{
"epoch": 0.21987951807228914,
"grad_norm": 0.2591319978237152,
"learning_rate": 9.772779116786568e-06,
"loss": 1.2833,
"step": 73
},
{
"epoch": 0.22289156626506024,
"grad_norm": 0.27248722314834595,
"learning_rate": 9.765565854108503e-06,
"loss": 1.2575,
"step": 74
},
{
"epoch": 0.22590361445783133,
"grad_norm": 0.273562490940094,
"learning_rate": 9.758242625566912e-06,
"loss": 1.2134,
"step": 75
},
{
"epoch": 0.2289156626506024,
"grad_norm": 0.29504141211509705,
"learning_rate": 9.750809600145955e-06,
"loss": 1.2222,
"step": 76
},
{
"epoch": 0.2319277108433735,
"grad_norm": 0.2649330496788025,
"learning_rate": 9.743266949363368e-06,
"loss": 1.1992,
"step": 77
},
{
"epoch": 0.23493975903614459,
"grad_norm": 0.26566869020462036,
"learning_rate": 9.735614847266502e-06,
"loss": 1.2432,
"step": 78
},
{
"epoch": 0.23795180722891565,
"grad_norm": 0.25488752126693726,
"learning_rate": 9.727853470428301e-06,
"loss": 1.1646,
"step": 79
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.280771404504776,
"learning_rate": 9.719982997943245e-06,
"loss": 1.2075,
"step": 80
},
{
"epoch": 0.24397590361445784,
"grad_norm": 0.3047221899032593,
"learning_rate": 9.712003611423194e-06,
"loss": 1.2378,
"step": 81
},
{
"epoch": 0.2469879518072289,
"grad_norm": 0.2466488480567932,
"learning_rate": 9.703915494993215e-06,
"loss": 1.2169,
"step": 82
},
{
"epoch": 0.25,
"grad_norm": 0.25351837277412415,
"learning_rate": 9.695718835287328e-06,
"loss": 1.1995,
"step": 83
},
{
"epoch": 0.25301204819277107,
"grad_norm": 0.26343056559562683,
"learning_rate": 9.6874138214442e-06,
"loss": 1.2329,
"step": 84
},
{
"epoch": 0.2560240963855422,
"grad_norm": 0.2685820460319519,
"learning_rate": 9.679000645102771e-06,
"loss": 1.2054,
"step": 85
},
{
"epoch": 0.25903614457831325,
"grad_norm": 0.2713760733604431,
"learning_rate": 9.670479500397854e-06,
"loss": 1.2417,
"step": 86
},
{
"epoch": 0.2620481927710843,
"grad_norm": 0.2718029320240021,
"learning_rate": 9.66185058395563e-06,
"loss": 1.2587,
"step": 87
},
{
"epoch": 0.26506024096385544,
"grad_norm": 0.26352736353874207,
"learning_rate": 9.653114094889128e-06,
"loss": 1.2541,
"step": 88
},
{
"epoch": 0.2680722891566265,
"grad_norm": 0.2755894362926483,
"learning_rate": 9.644270234793625e-06,
"loss": 1.2091,
"step": 89
},
{
"epoch": 0.2710843373493976,
"grad_norm": 0.25112101435661316,
"learning_rate": 9.63531920774199e-06,
"loss": 1.2425,
"step": 90
},
{
"epoch": 0.2740963855421687,
"grad_norm": 0.26138511300086975,
"learning_rate": 9.62626122027999e-06,
"loss": 1.2252,
"step": 91
},
{
"epoch": 0.27710843373493976,
"grad_norm": 0.2561100721359253,
"learning_rate": 9.617096481421498e-06,
"loss": 1.2206,
"step": 92
},
{
"epoch": 0.28012048192771083,
"grad_norm": 0.26238083839416504,
"learning_rate": 9.607825202643696e-06,
"loss": 1.1859,
"step": 93
},
{
"epoch": 0.28313253012048195,
"grad_norm": 0.27538710832595825,
"learning_rate": 9.598447597882181e-06,
"loss": 1.2062,
"step": 94
},
{
"epoch": 0.286144578313253,
"grad_norm": 0.25858640670776367,
"learning_rate": 9.588963883526033e-06,
"loss": 1.2354,
"step": 95
},
{
"epoch": 0.2891566265060241,
"grad_norm": 0.2807197570800781,
"learning_rate": 9.579374278412819e-06,
"loss": 1.2433,
"step": 96
},
{
"epoch": 0.2921686746987952,
"grad_norm": 0.28452298045158386,
"learning_rate": 9.569679003823542e-06,
"loss": 1.2191,
"step": 97
},
{
"epoch": 0.29518072289156627,
"grad_norm": 0.25671708583831787,
"learning_rate": 9.559878283477546e-06,
"loss": 1.2095,
"step": 98
},
{
"epoch": 0.29819277108433734,
"grad_norm": 0.25289785861968994,
"learning_rate": 9.549972343527336e-06,
"loss": 1.2033,
"step": 99
},
{
"epoch": 0.30120481927710846,
"grad_norm": 0.27585139870643616,
"learning_rate": 9.539961412553375e-06,
"loss": 1.149,
"step": 100
},
{
"epoch": 0.3042168674698795,
"grad_norm": 0.2492348849773407,
"learning_rate": 9.529845721558802e-06,
"loss": 1.1271,
"step": 101
},
{
"epoch": 0.3072289156626506,
"grad_norm": 0.254409521818161,
"learning_rate": 9.5196255039641e-06,
"loss": 1.2528,
"step": 102
},
{
"epoch": 0.3102409638554217,
"grad_norm": 0.3059585690498352,
"learning_rate": 9.50930099560172e-06,
"loss": 1.2058,
"step": 103
},
{
"epoch": 0.3132530120481928,
"grad_norm": 0.2655487656593323,
"learning_rate": 9.498872434710624e-06,
"loss": 1.1311,
"step": 104
},
{
"epoch": 0.31626506024096385,
"grad_norm": 0.271914005279541,
"learning_rate": 9.488340061930797e-06,
"loss": 1.1831,
"step": 105
},
{
"epoch": 0.3192771084337349,
"grad_norm": 0.29053163528442383,
"learning_rate": 9.477704120297698e-06,
"loss": 1.1585,
"step": 106
},
{
"epoch": 0.32228915662650603,
"grad_norm": 0.26874732971191406,
"learning_rate": 9.46696485523664e-06,
"loss": 1.2012,
"step": 107
},
{
"epoch": 0.3253012048192771,
"grad_norm": 0.25582486391067505,
"learning_rate": 9.45612251455714e-06,
"loss": 1.1397,
"step": 108
},
{
"epoch": 0.32831325301204817,
"grad_norm": 0.26407524943351746,
"learning_rate": 9.445177348447187e-06,
"loss": 1.1887,
"step": 109
},
{
"epoch": 0.3313253012048193,
"grad_norm": 0.2506115734577179,
"learning_rate": 9.434129609467484e-06,
"loss": 1.2219,
"step": 110
},
{
"epoch": 0.33433734939759036,
"grad_norm": 0.2572745084762573,
"learning_rate": 9.422979552545604e-06,
"loss": 1.1362,
"step": 111
},
{
"epoch": 0.3373493975903614,
"grad_norm": 0.28277891874313354,
"learning_rate": 9.411727434970121e-06,
"loss": 1.1409,
"step": 112
},
{
"epoch": 0.34036144578313254,
"grad_norm": 0.30223405361175537,
"learning_rate": 9.400373516384671e-06,
"loss": 1.1546,
"step": 113
},
{
"epoch": 0.3433734939759036,
"grad_norm": 0.2697835862636566,
"learning_rate": 9.388918058781947e-06,
"loss": 1.2384,
"step": 114
},
{
"epoch": 0.3463855421686747,
"grad_norm": 0.2695978283882141,
"learning_rate": 9.377361326497673e-06,
"loss": 1.1364,
"step": 115
},
{
"epoch": 0.3493975903614458,
"grad_norm": 0.25360485911369324,
"learning_rate": 9.365703586204495e-06,
"loss": 1.178,
"step": 116
},
{
"epoch": 0.35240963855421686,
"grad_norm": 0.27773186564445496,
"learning_rate": 9.353945106905822e-06,
"loss": 1.1682,
"step": 117
},
{
"epoch": 0.35542168674698793,
"grad_norm": 0.27416011691093445,
"learning_rate": 9.342086159929629e-06,
"loss": 1.1477,
"step": 118
},
{
"epoch": 0.35843373493975905,
"grad_norm": 0.27577441930770874,
"learning_rate": 9.330127018922195e-06,
"loss": 1.1497,
"step": 119
},
{
"epoch": 0.3614457831325301,
"grad_norm": 0.2513567805290222,
"learning_rate": 9.318067959841776e-06,
"loss": 1.1339,
"step": 120
},
{
"epoch": 0.3644578313253012,
"grad_norm": 0.27182286977767944,
"learning_rate": 9.305909260952255e-06,
"loss": 1.1362,
"step": 121
},
{
"epoch": 0.3674698795180723,
"grad_norm": 0.26553475856781006,
"learning_rate": 9.29365120281671e-06,
"loss": 1.1576,
"step": 122
},
{
"epoch": 0.3704819277108434,
"grad_norm": 0.25539693236351013,
"learning_rate": 9.28129406829094e-06,
"loss": 1.1384,
"step": 123
},
{
"epoch": 0.37349397590361444,
"grad_norm": 0.2685853838920593,
"learning_rate": 9.268838142516943e-06,
"loss": 1.1842,
"step": 124
},
{
"epoch": 0.37650602409638556,
"grad_norm": 0.2612561881542206,
"learning_rate": 9.256283712916337e-06,
"loss": 1.1578,
"step": 125
},
{
"epoch": 0.3795180722891566,
"grad_norm": 0.26739126443862915,
"learning_rate": 9.24363106918372e-06,
"loss": 1.1477,
"step": 126
},
{
"epoch": 0.3825301204819277,
"grad_norm": 0.2942097783088684,
"learning_rate": 9.230880503279991e-06,
"loss": 1.1747,
"step": 127
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.2746829688549042,
"learning_rate": 9.218032309425613e-06,
"loss": 1.1651,
"step": 128
},
{
"epoch": 0.3885542168674699,
"grad_norm": 0.27550533413887024,
"learning_rate": 9.205086784093823e-06,
"loss": 1.1361,
"step": 129
},
{
"epoch": 0.39156626506024095,
"grad_norm": 0.31240707635879517,
"learning_rate": 9.19204422600379e-06,
"loss": 1.1933,
"step": 130
},
{
"epoch": 0.39457831325301207,
"grad_norm": 0.24383339285850525,
"learning_rate": 9.178904936113719e-06,
"loss": 1.1739,
"step": 131
},
{
"epoch": 0.39759036144578314,
"grad_norm": 0.3256170153617859,
"learning_rate": 9.165669217613919e-06,
"loss": 1.1709,
"step": 132
},
{
"epoch": 0.4006024096385542,
"grad_norm": 0.2967703938484192,
"learning_rate": 9.152337375919792e-06,
"loss": 1.1379,
"step": 133
},
{
"epoch": 0.4036144578313253,
"grad_norm": 0.2854821979999542,
"learning_rate": 9.138909718664788e-06,
"loss": 1.1741,
"step": 134
},
{
"epoch": 0.4066265060240964,
"grad_norm": 0.33066266775131226,
"learning_rate": 9.125386555693316e-06,
"loss": 1.1779,
"step": 135
},
{
"epoch": 0.40963855421686746,
"grad_norm": 0.27965742349624634,
"learning_rate": 9.111768199053588e-06,
"loss": 1.1717,
"step": 136
},
{
"epoch": 0.4126506024096386,
"grad_norm": 0.29010990262031555,
"learning_rate": 9.098054962990415e-06,
"loss": 1.1526,
"step": 137
},
{
"epoch": 0.41566265060240964,
"grad_norm": 0.2726079523563385,
"learning_rate": 9.084247163937959e-06,
"loss": 1.1136,
"step": 138
},
{
"epoch": 0.4186746987951807,
"grad_norm": 0.2590181231498718,
"learning_rate": 9.070345120512436e-06,
"loss": 1.1267,
"step": 139
},
{
"epoch": 0.42168674698795183,
"grad_norm": 0.291429340839386,
"learning_rate": 9.056349153504753e-06,
"loss": 1.1429,
"step": 140
},
{
"epoch": 0.4246987951807229,
"grad_norm": 0.2864663004875183,
"learning_rate": 9.042259585873119e-06,
"loss": 1.1161,
"step": 141
},
{
"epoch": 0.42771084337349397,
"grad_norm": 0.29812097549438477,
"learning_rate": 9.028076742735583e-06,
"loss": 1.157,
"step": 142
},
{
"epoch": 0.4307228915662651,
"grad_norm": 0.29142752289772034,
"learning_rate": 9.013800951362532e-06,
"loss": 1.0919,
"step": 143
},
{
"epoch": 0.43373493975903615,
"grad_norm": 0.2857559621334076,
"learning_rate": 8.999432541169145e-06,
"loss": 1.1391,
"step": 144
},
{
"epoch": 0.4367469879518072,
"grad_norm": 0.29825499653816223,
"learning_rate": 8.984971843707787e-06,
"loss": 1.1589,
"step": 145
},
{
"epoch": 0.4397590361445783,
"grad_norm": 0.26081719994544983,
"learning_rate": 8.970419192660366e-06,
"loss": 1.1411,
"step": 146
},
{
"epoch": 0.4427710843373494,
"grad_norm": 0.3022754490375519,
"learning_rate": 8.955774923830618e-06,
"loss": 1.1528,
"step": 147
},
{
"epoch": 0.4457831325301205,
"grad_norm": 0.28860539197921753,
"learning_rate": 8.94103937513637e-06,
"loss": 1.1784,
"step": 148
},
{
"epoch": 0.44879518072289154,
"grad_norm": 0.25238746404647827,
"learning_rate": 8.92621288660175e-06,
"loss": 1.1447,
"step": 149
},
{
"epoch": 0.45180722891566266,
"grad_norm": 0.2728082239627838,
"learning_rate": 8.911295800349316e-06,
"loss": 1.0984,
"step": 150
},
{
"epoch": 0.45481927710843373,
"grad_norm": 0.26758912205696106,
"learning_rate": 8.896288460592187e-06,
"loss": 1.0918,
"step": 151
},
{
"epoch": 0.4578313253012048,
"grad_norm": 0.27047985792160034,
"learning_rate": 8.881191213626084e-06,
"loss": 1.1279,
"step": 152
},
{
"epoch": 0.4608433734939759,
"grad_norm": 0.309121698141098,
"learning_rate": 8.86600440782135e-06,
"loss": 1.1366,
"step": 153
},
{
"epoch": 0.463855421686747,
"grad_norm": 0.2778535485267639,
"learning_rate": 8.850728393614903e-06,
"loss": 1.1423,
"step": 154
},
{
"epoch": 0.46686746987951805,
"grad_norm": 0.2797792851924896,
"learning_rate": 8.835363523502154e-06,
"loss": 1.1664,
"step": 155
},
{
"epoch": 0.46987951807228917,
"grad_norm": 0.3094732463359833,
"learning_rate": 8.819910152028872e-06,
"loss": 1.1295,
"step": 156
},
{
"epoch": 0.47289156626506024,
"grad_norm": 0.2910013496875763,
"learning_rate": 8.804368635783002e-06,
"loss": 1.0793,
"step": 157
},
{
"epoch": 0.4759036144578313,
"grad_norm": 0.26490893959999084,
"learning_rate": 8.788739333386443e-06,
"loss": 1.092,
"step": 158
},
{
"epoch": 0.4789156626506024,
"grad_norm": 0.25550705194473267,
"learning_rate": 8.773022605486755e-06,
"loss": 1.1325,
"step": 159
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.2488010972738266,
"learning_rate": 8.75721881474886e-06,
"loss": 1.0885,
"step": 160
},
{
"epoch": 0.48493975903614456,
"grad_norm": 0.3159677982330322,
"learning_rate": 8.741328325846663e-06,
"loss": 1.1544,
"step": 161
},
{
"epoch": 0.4879518072289157,
"grad_norm": 0.30506086349487305,
"learning_rate": 8.725351505454631e-06,
"loss": 1.1716,
"step": 162
},
{
"epoch": 0.49096385542168675,
"grad_norm": 0.29045408964157104,
"learning_rate": 8.709288722239345e-06,
"loss": 1.1199,
"step": 163
},
{
"epoch": 0.4939759036144578,
"grad_norm": 0.2709057033061981,
"learning_rate": 8.693140346850975e-06,
"loss": 1.113,
"step": 164
},
{
"epoch": 0.49698795180722893,
"grad_norm": 0.28410249948501587,
"learning_rate": 8.67690675191475e-06,
"loss": 1.1383,
"step": 165
},
{
"epoch": 0.5,
"grad_norm": 0.29826584458351135,
"learning_rate": 8.660588312022345e-06,
"loss": 1.0619,
"step": 166
},
{
"epoch": 0.5030120481927711,
"grad_norm": 0.3092498779296875,
"learning_rate": 8.644185403723231e-06,
"loss": 1.1101,
"step": 167
},
{
"epoch": 0.5060240963855421,
"grad_norm": 0.30253866314888,
"learning_rate": 8.627698405516007e-06,
"loss": 1.0649,
"step": 168
},
{
"epoch": 0.5090361445783133,
"grad_norm": 0.2906908690929413,
"learning_rate": 8.611127697839649e-06,
"loss": 1.1436,
"step": 169
},
{
"epoch": 0.5120481927710844,
"grad_norm": 0.30768147110939026,
"learning_rate": 8.594473663064735e-06,
"loss": 1.1116,
"step": 170
},
{
"epoch": 0.5150602409638554,
"grad_norm": 0.3316003680229187,
"learning_rate": 8.577736685484626e-06,
"loss": 1.1484,
"step": 171
},
{
"epoch": 0.5180722891566265,
"grad_norm": 0.3070067763328552,
"learning_rate": 8.560917151306594e-06,
"loss": 1.144,
"step": 172
},
{
"epoch": 0.5210843373493976,
"grad_norm": 0.27163851261138916,
"learning_rate": 8.544015448642916e-06,
"loss": 1.1071,
"step": 173
},
{
"epoch": 0.5240963855421686,
"grad_norm": 0.2992447316646576,
"learning_rate": 8.527031967501906e-06,
"loss": 1.1647,
"step": 174
},
{
"epoch": 0.5271084337349398,
"grad_norm": 0.262173593044281,
"learning_rate": 8.509967099778934e-06,
"loss": 1.2107,
"step": 175
},
{
"epoch": 0.5301204819277109,
"grad_norm": 0.33722054958343506,
"learning_rate": 8.492821239247365e-06,
"loss": 1.0553,
"step": 176
},
{
"epoch": 0.5331325301204819,
"grad_norm": 0.27636295557022095,
"learning_rate": 8.475594781549483e-06,
"loss": 1.1275,
"step": 177
},
{
"epoch": 0.536144578313253,
"grad_norm": 0.2799915671348572,
"learning_rate": 8.45828812418736e-06,
"loss": 1.0764,
"step": 178
},
{
"epoch": 0.5391566265060241,
"grad_norm": 0.27971795201301575,
"learning_rate": 8.44090166651368e-06,
"loss": 1.0634,
"step": 179
},
{
"epoch": 0.5421686746987951,
"grad_norm": 0.3047524690628052,
"learning_rate": 8.42343580972253e-06,
"loss": 1.1203,
"step": 180
},
{
"epoch": 0.5451807228915663,
"grad_norm": 0.3009694218635559,
"learning_rate": 8.405890956840136e-06,
"loss": 1.1168,
"step": 181
},
{
"epoch": 0.5481927710843374,
"grad_norm": 0.30559536814689636,
"learning_rate": 8.388267512715565e-06,
"loss": 1.113,
"step": 182
},
{
"epoch": 0.5512048192771084,
"grad_norm": 0.3450864851474762,
"learning_rate": 8.370565884011389e-06,
"loss": 1.0621,
"step": 183
},
{
"epoch": 0.5542168674698795,
"grad_norm": 0.3391083776950836,
"learning_rate": 8.352786479194288e-06,
"loss": 1.1276,
"step": 184
},
{
"epoch": 0.5572289156626506,
"grad_norm": 0.3621962070465088,
"learning_rate": 8.33492970852564e-06,
"loss": 1.081,
"step": 185
},
{
"epoch": 0.5602409638554217,
"grad_norm": 0.28517264127731323,
"learning_rate": 8.316995984052048e-06,
"loss": 1.0723,
"step": 186
},
{
"epoch": 0.5632530120481928,
"grad_norm": 0.3252887427806854,
"learning_rate": 8.298985719595824e-06,
"loss": 1.0727,
"step": 187
},
{
"epoch": 0.5662650602409639,
"grad_norm": 0.3289787769317627,
"learning_rate": 8.280899330745452e-06,
"loss": 1.0726,
"step": 188
},
{
"epoch": 0.5692771084337349,
"grad_norm": 0.3431254029273987,
"learning_rate": 8.262737234845993e-06,
"loss": 1.0908,
"step": 189
},
{
"epoch": 0.572289156626506,
"grad_norm": 0.295175164937973,
"learning_rate": 8.244499850989453e-06,
"loss": 1.1408,
"step": 190
},
{
"epoch": 0.5753012048192772,
"grad_norm": 0.3039282262325287,
"learning_rate": 8.226187600005116e-06,
"loss": 1.1105,
"step": 191
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.30349868535995483,
"learning_rate": 8.207800904449829e-06,
"loss": 1.109,
"step": 192
},
{
"epoch": 0.5813253012048193,
"grad_norm": 0.3329324722290039,
"learning_rate": 8.189340188598263e-06,
"loss": 1.0828,
"step": 193
},
{
"epoch": 0.5843373493975904,
"grad_norm": 0.32696786522865295,
"learning_rate": 8.1708058784331e-06,
"loss": 1.116,
"step": 194
},
{
"epoch": 0.5873493975903614,
"grad_norm": 0.30085158348083496,
"learning_rate": 8.15219840163523e-06,
"loss": 1.141,
"step": 195
},
{
"epoch": 0.5903614457831325,
"grad_norm": 0.30034953355789185,
"learning_rate": 8.133518187573864e-06,
"loss": 1.1254,
"step": 196
},
{
"epoch": 0.5933734939759037,
"grad_norm": 0.35607779026031494,
"learning_rate": 8.114765667296628e-06,
"loss": 1.0621,
"step": 197
},
{
"epoch": 0.5963855421686747,
"grad_norm": 0.30774402618408203,
"learning_rate": 8.095941273519634e-06,
"loss": 1.0462,
"step": 198
},
{
"epoch": 0.5993975903614458,
"grad_norm": 0.3458847999572754,
"learning_rate": 8.077045440617465e-06,
"loss": 1.0695,
"step": 199
},
{
"epoch": 0.6024096385542169,
"grad_norm": 0.3302537202835083,
"learning_rate": 8.058078604613178e-06,
"loss": 1.1314,
"step": 200
},
{
"epoch": 0.6054216867469879,
"grad_norm": 0.32025519013404846,
"learning_rate": 8.039041203168233e-06,
"loss": 1.1179,
"step": 201
},
{
"epoch": 0.608433734939759,
"grad_norm": 0.32808589935302734,
"learning_rate": 8.019933675572389e-06,
"loss": 1.1393,
"step": 202
},
{
"epoch": 0.6114457831325302,
"grad_norm": 0.31607547402381897,
"learning_rate": 8.000756462733577e-06,
"loss": 1.1027,
"step": 203
},
{
"epoch": 0.6144578313253012,
"grad_norm": 0.33204394578933716,
"learning_rate": 7.981510007167719e-06,
"loss": 1.0795,
"step": 204
},
{
"epoch": 0.6174698795180723,
"grad_norm": 0.3012982904911041,
"learning_rate": 7.962194752988519e-06,
"loss": 1.104,
"step": 205
},
{
"epoch": 0.6204819277108434,
"grad_norm": 0.28379830718040466,
"learning_rate": 7.942811145897215e-06,
"loss": 1.1108,
"step": 206
},
{
"epoch": 0.6234939759036144,
"grad_norm": 0.3218439817428589,
"learning_rate": 7.923359633172299e-06,
"loss": 1.0856,
"step": 207
},
{
"epoch": 0.6265060240963856,
"grad_norm": 0.2985135614871979,
"learning_rate": 7.903840663659186e-06,
"loss": 1.1621,
"step": 208
},
{
"epoch": 0.6295180722891566,
"grad_norm": 0.3362099528312683,
"learning_rate": 7.884254687759863e-06,
"loss": 1.1173,
"step": 209
},
{
"epoch": 0.6325301204819277,
"grad_norm": 0.32187989354133606,
"learning_rate": 7.864602157422501e-06,
"loss": 1.1293,
"step": 210
},
{
"epoch": 0.6355421686746988,
"grad_norm": 0.34748998284339905,
"learning_rate": 7.844883526131014e-06,
"loss": 1.1501,
"step": 211
},
{
"epoch": 0.6385542168674698,
"grad_norm": 0.2776443660259247,
"learning_rate": 7.8250992488946e-06,
"loss": 1.1272,
"step": 212
},
{
"epoch": 0.641566265060241,
"grad_norm": 0.34776571393013,
"learning_rate": 7.805249782237256e-06,
"loss": 1.0993,
"step": 213
},
{
"epoch": 0.6445783132530121,
"grad_norm": 0.3251356780529022,
"learning_rate": 7.78533558418722e-06,
"loss": 1.0717,
"step": 214
},
{
"epoch": 0.6475903614457831,
"grad_norm": 0.32606494426727295,
"learning_rate": 7.765357114266409e-06,
"loss": 1.1061,
"step": 215
},
{
"epoch": 0.6506024096385542,
"grad_norm": 0.32897332310676575,
"learning_rate": 7.745314833479834e-06,
"loss": 1.065,
"step": 216
},
{
"epoch": 0.6536144578313253,
"grad_norm": 0.34086140990257263,
"learning_rate": 7.72520920430493e-06,
"loss": 1.1221,
"step": 217
},
{
"epoch": 0.6566265060240963,
"grad_norm": 0.395309180021286,
"learning_rate": 7.705040690680915e-06,
"loss": 1.0839,
"step": 218
},
{
"epoch": 0.6596385542168675,
"grad_norm": 0.3107753396034241,
"learning_rate": 7.684809757998066e-06,
"loss": 1.0287,
"step": 219
},
{
"epoch": 0.6626506024096386,
"grad_norm": 0.32579633593559265,
"learning_rate": 7.664516873086987e-06,
"loss": 1.0925,
"step": 220
},
{
"epoch": 0.6656626506024096,
"grad_norm": 0.32496118545532227,
"learning_rate": 7.644162504207834e-06,
"loss": 1.0225,
"step": 221
},
{
"epoch": 0.6686746987951807,
"grad_norm": 0.34487584233283997,
"learning_rate": 7.623747121039512e-06,
"loss": 1.1216,
"step": 222
},
{
"epoch": 0.6716867469879518,
"grad_norm": 0.28649845719337463,
"learning_rate": 7.603271194668835e-06,
"loss": 1.0989,
"step": 223
},
{
"epoch": 0.6746987951807228,
"grad_norm": 0.3071340024471283,
"learning_rate": 7.582735197579657e-06,
"loss": 1.0908,
"step": 224
},
{
"epoch": 0.677710843373494,
"grad_norm": 0.33348020911216736,
"learning_rate": 7.562139603641971e-06,
"loss": 1.0497,
"step": 225
},
{
"epoch": 0.6807228915662651,
"grad_norm": 0.3527333736419678,
"learning_rate": 7.541484888100974e-06,
"loss": 1.1121,
"step": 226
},
{
"epoch": 0.6837349397590361,
"grad_norm": 0.3623991310596466,
"learning_rate": 7.520771527566093e-06,
"loss": 1.0675,
"step": 227
},
{
"epoch": 0.6867469879518072,
"grad_norm": 0.3683350384235382,
"learning_rate": 7.500000000000001e-06,
"loss": 1.082,
"step": 228
},
{
"epoch": 0.6897590361445783,
"grad_norm": 0.30012479424476624,
"learning_rate": 7.479170784707574e-06,
"loss": 1.1421,
"step": 229
},
{
"epoch": 0.6927710843373494,
"grad_norm": 0.32032355666160583,
"learning_rate": 7.458284362324844e-06,
"loss": 1.0996,
"step": 230
},
{
"epoch": 0.6957831325301205,
"grad_norm": 0.30791187286376953,
"learning_rate": 7.437341214807895e-06,
"loss": 1.1221,
"step": 231
},
{
"epoch": 0.6987951807228916,
"grad_norm": 0.3271755576133728,
"learning_rate": 7.416341825421755e-06,
"loss": 1.0937,
"step": 232
},
{
"epoch": 0.7018072289156626,
"grad_norm": 0.320961594581604,
"learning_rate": 7.395286678729232e-06,
"loss": 1.0727,
"step": 233
},
{
"epoch": 0.7048192771084337,
"grad_norm": 0.31970059871673584,
"learning_rate": 7.374176260579746e-06,
"loss": 1.104,
"step": 234
},
{
"epoch": 0.7078313253012049,
"grad_norm": 0.3410727083683014,
"learning_rate": 7.353011058098104e-06,
"loss": 1.0866,
"step": 235
},
{
"epoch": 0.7108433734939759,
"grad_norm": 0.3979572653770447,
"learning_rate": 7.33179155967327e-06,
"loss": 1.0773,
"step": 236
},
{
"epoch": 0.713855421686747,
"grad_norm": 0.3386681079864502,
"learning_rate": 7.310518254947092e-06,
"loss": 1.0943,
"step": 237
},
{
"epoch": 0.7168674698795181,
"grad_norm": 0.32043731212615967,
"learning_rate": 7.289191634803002e-06,
"loss": 1.1104,
"step": 238
},
{
"epoch": 0.7198795180722891,
"grad_norm": 0.3244670331478119,
"learning_rate": 7.267812191354691e-06,
"loss": 1.1137,
"step": 239
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.32313597202301025,
"learning_rate": 7.246380417934752e-06,
"loss": 1.1296,
"step": 240
},
{
"epoch": 0.7259036144578314,
"grad_norm": 0.4050733149051666,
"learning_rate": 7.224896809083297e-06,
"loss": 1.0725,
"step": 241
},
{
"epoch": 0.7289156626506024,
"grad_norm": 0.2902127206325531,
"learning_rate": 7.203361860536544e-06,
"loss": 1.119,
"step": 242
},
{
"epoch": 0.7319277108433735,
"grad_norm": 0.31548964977264404,
"learning_rate": 7.181776069215382e-06,
"loss": 1.0712,
"step": 243
},
{
"epoch": 0.7349397590361446,
"grad_norm": 0.31955307722091675,
"learning_rate": 7.160139933213899e-06,
"loss": 1.0925,
"step": 244
},
{
"epoch": 0.7379518072289156,
"grad_norm": 0.37396878004074097,
"learning_rate": 7.138453951787894e-06,
"loss": 1.1029,
"step": 245
},
{
"epoch": 0.7409638554216867,
"grad_norm": 0.3100704550743103,
"learning_rate": 7.1167186253433474e-06,
"loss": 1.1001,
"step": 246
},
{
"epoch": 0.7439759036144579,
"grad_norm": 0.32318195700645447,
"learning_rate": 7.094934455424889e-06,
"loss": 1.0909,
"step": 247
},
{
"epoch": 0.7469879518072289,
"grad_norm": 0.40869641304016113,
"learning_rate": 7.073101944704209e-06,
"loss": 1.0925,
"step": 248
},
{
"epoch": 0.75,
"grad_norm": 0.31567490100860596,
"learning_rate": 7.051221596968471e-06,
"loss": 1.0973,
"step": 249
},
{
"epoch": 0.7530120481927711,
"grad_norm": 0.32018548250198364,
"learning_rate": 7.029293917108678e-06,
"loss": 1.0222,
"step": 250
},
{
"epoch": 0.7560240963855421,
"grad_norm": 0.3648555874824524,
"learning_rate": 7.0073194111080315e-06,
"loss": 1.075,
"step": 251
},
{
"epoch": 0.7590361445783133,
"grad_norm": 0.3636914789676666,
"learning_rate": 6.985298586030241e-06,
"loss": 1.1419,
"step": 252
},
{
"epoch": 0.7620481927710844,
"grad_norm": 0.337217777967453,
"learning_rate": 6.963231950007845e-06,
"loss": 1.0848,
"step": 253
},
{
"epoch": 0.7650602409638554,
"grad_norm": 0.34433966875076294,
"learning_rate": 6.941120012230464e-06,
"loss": 1.0675,
"step": 254
},
{
"epoch": 0.7680722891566265,
"grad_norm": 0.31864967942237854,
"learning_rate": 6.918963282933063e-06,
"loss": 1.0576,
"step": 255
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.37333500385284424,
"learning_rate": 6.896762273384179e-06,
"loss": 1.0536,
"step": 256
},
{
"epoch": 0.7740963855421686,
"grad_norm": 0.389068603515625,
"learning_rate": 6.8745174958741164e-06,
"loss": 1.0992,
"step": 257
},
{
"epoch": 0.7771084337349398,
"grad_norm": 0.3978760540485382,
"learning_rate": 6.852229463703131e-06,
"loss": 1.124,
"step": 258
},
{
"epoch": 0.7801204819277109,
"grad_norm": 0.37109795212745667,
"learning_rate": 6.829898691169581e-06,
"loss": 1.065,
"step": 259
},
{
"epoch": 0.7831325301204819,
"grad_norm": 0.3223637044429779,
"learning_rate": 6.8075256935580655e-06,
"loss": 1.0475,
"step": 260
},
{
"epoch": 0.786144578313253,
"grad_norm": 0.32434552907943726,
"learning_rate": 6.78511098712753e-06,
"loss": 1.0797,
"step": 261
},
{
"epoch": 0.7891566265060241,
"grad_norm": 0.3082960546016693,
"learning_rate": 6.762655089099353e-06,
"loss": 1.0889,
"step": 262
},
{
"epoch": 0.7921686746987951,
"grad_norm": 0.33072763681411743,
"learning_rate": 6.740158517645418e-06,
"loss": 1.0575,
"step": 263
},
{
"epoch": 0.7951807228915663,
"grad_norm": 0.3404625952243805,
"learning_rate": 6.717621791876147e-06,
"loss": 1.0192,
"step": 264
},
{
"epoch": 0.7981927710843374,
"grad_norm": 0.31751227378845215,
"learning_rate": 6.695045431828524e-06,
"loss": 1.105,
"step": 265
},
{
"epoch": 0.8012048192771084,
"grad_norm": 0.3528308868408203,
"learning_rate": 6.672429958454103e-06,
"loss": 1.0803,
"step": 266
},
{
"epoch": 0.8042168674698795,
"grad_norm": 0.3395234942436218,
"learning_rate": 6.649775893606982e-06,
"loss": 1.1057,
"step": 267
},
{
"epoch": 0.8072289156626506,
"grad_norm": 0.37763112783432007,
"learning_rate": 6.627083760031755e-06,
"loss": 1.0911,
"step": 268
},
{
"epoch": 0.8102409638554217,
"grad_norm": 0.3695107400417328,
"learning_rate": 6.604354081351461e-06,
"loss": 1.1105,
"step": 269
},
{
"epoch": 0.8132530120481928,
"grad_norm": 0.3498575687408447,
"learning_rate": 6.5815873820554925e-06,
"loss": 1.0347,
"step": 270
},
{
"epoch": 0.8162650602409639,
"grad_norm": 0.3670216202735901,
"learning_rate": 6.558784187487495e-06,
"loss": 1.009,
"step": 271
},
{
"epoch": 0.8192771084337349,
"grad_norm": 0.38344910740852356,
"learning_rate": 6.535945023833249e-06,
"loss": 1.0132,
"step": 272
},
{
"epoch": 0.822289156626506,
"grad_norm": 0.3509382903575897,
"learning_rate": 6.513070418108525e-06,
"loss": 1.0768,
"step": 273
},
{
"epoch": 0.8253012048192772,
"grad_norm": 0.37638577818870544,
"learning_rate": 6.490160898146919e-06,
"loss": 1.0435,
"step": 274
},
{
"epoch": 0.8283132530120482,
"grad_norm": 0.36278653144836426,
"learning_rate": 6.467216992587679e-06,
"loss": 1.1227,
"step": 275
},
{
"epoch": 0.8313253012048193,
"grad_norm": 0.34076735377311707,
"learning_rate": 6.444239230863505e-06,
"loss": 1.042,
"step": 276
},
{
"epoch": 0.8343373493975904,
"grad_norm": 0.3733161687850952,
"learning_rate": 6.421228143188325e-06,
"loss": 1.0266,
"step": 277
},
{
"epoch": 0.8373493975903614,
"grad_norm": 0.3508923351764679,
"learning_rate": 6.398184260545072e-06,
"loss": 1.0716,
"step": 278
},
{
"epoch": 0.8403614457831325,
"grad_norm": 0.38440215587615967,
"learning_rate": 6.375108114673425e-06,
"loss": 1.1266,
"step": 279
},
{
"epoch": 0.8433734939759037,
"grad_norm": 0.38378679752349854,
"learning_rate": 6.3520002380575395e-06,
"loss": 1.1126,
"step": 280
},
{
"epoch": 0.8463855421686747,
"grad_norm": 0.36522167921066284,
"learning_rate": 6.32886116391376e-06,
"loss": 1.1011,
"step": 281
},
{
"epoch": 0.8493975903614458,
"grad_norm": 0.345027357339859,
"learning_rate": 6.305691426178316e-06,
"loss": 1.1179,
"step": 282
},
{
"epoch": 0.8524096385542169,
"grad_norm": 0.3252032697200775,
"learning_rate": 6.282491559495005e-06,
"loss": 1.0666,
"step": 283
},
{
"epoch": 0.8554216867469879,
"grad_norm": 0.3353135585784912,
"learning_rate": 6.259262099202849e-06,
"loss": 1.045,
"step": 284
},
{
"epoch": 0.858433734939759,
"grad_norm": 0.35613197088241577,
"learning_rate": 6.23600358132375e-06,
"loss": 1.0619,
"step": 285
},
{
"epoch": 0.8614457831325302,
"grad_norm": 0.3165310025215149,
"learning_rate": 6.212716542550112e-06,
"loss": 1.0846,
"step": 286
},
{
"epoch": 0.8644578313253012,
"grad_norm": 0.3935311436653137,
"learning_rate": 6.189401520232464e-06,
"loss": 1.0634,
"step": 287
},
{
"epoch": 0.8674698795180723,
"grad_norm": 0.3519918620586395,
"learning_rate": 6.166059052367055e-06,
"loss": 1.1106,
"step": 288
},
{
"epoch": 0.8704819277108434,
"grad_norm": 0.34923064708709717,
"learning_rate": 6.142689677583447e-06,
"loss": 1.0479,
"step": 289
},
{
"epoch": 0.8734939759036144,
"grad_norm": 0.327006459236145,
"learning_rate": 6.119293935132076e-06,
"loss": 1.0652,
"step": 290
},
{
"epoch": 0.8765060240963856,
"grad_norm": 0.3877696692943573,
"learning_rate": 6.095872364871818e-06,
"loss": 1.0686,
"step": 291
},
{
"epoch": 0.8795180722891566,
"grad_norm": 0.3664681613445282,
"learning_rate": 6.072425507257528e-06,
"loss": 1.0205,
"step": 292
},
{
"epoch": 0.8825301204819277,
"grad_norm": 0.35179319977760315,
"learning_rate": 6.048953903327568e-06,
"loss": 1.0839,
"step": 293
},
{
"epoch": 0.8855421686746988,
"grad_norm": 0.3436523675918579,
"learning_rate": 6.025458094691323e-06,
"loss": 1.1028,
"step": 294
},
{
"epoch": 0.8885542168674698,
"grad_norm": 0.3567025363445282,
"learning_rate": 6.0019386235167055e-06,
"loss": 1.0638,
"step": 295
},
{
"epoch": 0.891566265060241,
"grad_norm": 0.34170979261398315,
"learning_rate": 5.978396032517641e-06,
"loss": 1.1007,
"step": 296
},
{
"epoch": 0.8945783132530121,
"grad_norm": 0.32985955476760864,
"learning_rate": 5.9548308649415486e-06,
"loss": 1.1342,
"step": 297
},
{
"epoch": 0.8975903614457831,
"grad_norm": 0.3722776174545288,
"learning_rate": 5.931243664556803e-06,
"loss": 1.1253,
"step": 298
},
{
"epoch": 0.9006024096385542,
"grad_norm": 0.3619896173477173,
"learning_rate": 5.90763497564019e-06,
"loss": 1.0155,
"step": 299
},
{
"epoch": 0.9036144578313253,
"grad_norm": 0.39154163002967834,
"learning_rate": 5.884005342964343e-06,
"loss": 1.151,
"step": 300
},
{
"epoch": 0.9066265060240963,
"grad_norm": 0.35082048177719116,
"learning_rate": 5.860355311785175e-06,
"loss": 1.0529,
"step": 301
},
{
"epoch": 0.9096385542168675,
"grad_norm": 0.3391878306865692,
"learning_rate": 5.836685427829296e-06,
"loss": 1.1057,
"step": 302
},
{
"epoch": 0.9126506024096386,
"grad_norm": 0.34184518456459045,
"learning_rate": 5.812996237281423e-06,
"loss": 1.0481,
"step": 303
},
{
"epoch": 0.9156626506024096,
"grad_norm": 0.3681842088699341,
"learning_rate": 5.7892882867717705e-06,
"loss": 1.0455,
"step": 304
},
{
"epoch": 0.9186746987951807,
"grad_norm": 0.3326142728328705,
"learning_rate": 5.765562123363445e-06,
"loss": 1.071,
"step": 305
},
{
"epoch": 0.9216867469879518,
"grad_norm": 0.36516857147216797,
"learning_rate": 5.7418182945398136e-06,
"loss": 1.0701,
"step": 306
},
{
"epoch": 0.9246987951807228,
"grad_norm": 0.3817295730113983,
"learning_rate": 5.718057348191874e-06,
"loss": 1.0718,
"step": 307
},
{
"epoch": 0.927710843373494,
"grad_norm": 0.3265933096408844,
"learning_rate": 5.6942798326056205e-06,
"loss": 1.0765,
"step": 308
},
{
"epoch": 0.9307228915662651,
"grad_norm": 0.3510778248310089,
"learning_rate": 5.670486296449373e-06,
"loss": 1.1283,
"step": 309
},
{
"epoch": 0.9337349397590361,
"grad_norm": 0.32118940353393555,
"learning_rate": 5.646677288761132e-06,
"loss": 1.0592,
"step": 310
},
{
"epoch": 0.9367469879518072,
"grad_norm": 0.3270410895347595,
"learning_rate": 5.622853358935908e-06,
"loss": 1.0876,
"step": 311
},
{
"epoch": 0.9397590361445783,
"grad_norm": 0.35170766711235046,
"learning_rate": 5.599015056713037e-06,
"loss": 1.0684,
"step": 312
},
{
"epoch": 0.9427710843373494,
"grad_norm": 0.3354102671146393,
"learning_rate": 5.575162932163501e-06,
"loss": 1.0861,
"step": 313
},
{
"epoch": 0.9457831325301205,
"grad_norm": 0.37484198808670044,
"learning_rate": 5.551297535677236e-06,
"loss": 1.0697,
"step": 314
},
{
"epoch": 0.9487951807228916,
"grad_norm": 0.3812544047832489,
"learning_rate": 5.527419417950424e-06,
"loss": 1.0526,
"step": 315
},
{
"epoch": 0.9518072289156626,
"grad_norm": 0.3424987196922302,
"learning_rate": 5.503529129972792e-06,
"loss": 1.0456,
"step": 316
},
{
"epoch": 0.9548192771084337,
"grad_norm": 0.37978166341781616,
"learning_rate": 5.479627223014902e-06,
"loss": 1.0712,
"step": 317
},
{
"epoch": 0.9578313253012049,
"grad_norm": 0.37075453996658325,
"learning_rate": 5.455714248615417e-06,
"loss": 1.0659,
"step": 318
},
{
"epoch": 0.9608433734939759,
"grad_norm": 0.3791234791278839,
"learning_rate": 5.431790758568388e-06,
"loss": 1.0408,
"step": 319
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.3565094769001007,
"learning_rate": 5.4078573049105135e-06,
"loss": 1.0777,
"step": 320
},
{
"epoch": 0.9668674698795181,
"grad_norm": 0.3704342246055603,
"learning_rate": 5.383914439908403e-06,
"loss": 1.1454,
"step": 321
},
{
"epoch": 0.9698795180722891,
"grad_norm": 0.3541310429573059,
"learning_rate": 5.359962716045836e-06,
"loss": 1.0392,
"step": 322
},
{
"epoch": 0.9728915662650602,
"grad_norm": 0.380628377199173,
"learning_rate": 5.336002686011007e-06,
"loss": 1.137,
"step": 323
},
{
"epoch": 0.9759036144578314,
"grad_norm": 0.39858028292655945,
"learning_rate": 5.312034902683779e-06,
"loss": 1.1154,
"step": 324
},
{
"epoch": 0.9789156626506024,
"grad_norm": 0.3649790585041046,
"learning_rate": 5.288059919122922e-06,
"loss": 0.9955,
"step": 325
},
{
"epoch": 0.9819277108433735,
"grad_norm": 0.41761839389801025,
"learning_rate": 5.2640782885533515e-06,
"loss": 1.0635,
"step": 326
},
{
"epoch": 0.9849397590361446,
"grad_norm": 0.43014606833457947,
"learning_rate": 5.240090564353365e-06,
"loss": 1.0369,
"step": 327
},
{
"epoch": 0.9879518072289156,
"grad_norm": 0.36708715558052063,
"learning_rate": 5.21609730004187e-06,
"loss": 1.0698,
"step": 328
},
{
"epoch": 0.9909638554216867,
"grad_norm": 0.34585151076316833,
"learning_rate": 5.1920990492656135e-06,
"loss": 1.1109,
"step": 329
},
{
"epoch": 0.9939759036144579,
"grad_norm": 0.3957839906215668,
"learning_rate": 5.168096365786402e-06,
"loss": 1.0439,
"step": 330
},
{
"epoch": 0.9969879518072289,
"grad_norm": 0.35115697979927063,
"learning_rate": 5.144089803468333e-06,
"loss": 1.1163,
"step": 331
},
{
"epoch": 1.0,
"grad_norm": 0.38435807824134827,
"learning_rate": 5.1200799162650035e-06,
"loss": 1.0951,
"step": 332
},
{
"epoch": 1.0030120481927711,
"grad_norm": 0.31511375308036804,
"learning_rate": 5.096067258206735e-06,
"loss": 1.1165,
"step": 333
},
{
"epoch": 1.0060240963855422,
"grad_norm": 0.41014838218688965,
"learning_rate": 5.072052383387787e-06,
"loss": 1.1078,
"step": 334
},
{
"epoch": 1.0090361445783131,
"grad_norm": 0.36247673630714417,
"learning_rate": 5.048035845953569e-06,
"loss": 0.9971,
"step": 335
},
{
"epoch": 1.0120481927710843,
"grad_norm": 0.3278728127479553,
"learning_rate": 5.024018200087855e-06,
"loss": 1.1189,
"step": 336
},
{
"epoch": 1.0030120481927711,
"grad_norm": 0.3709251880645752,
"learning_rate": 5e-06,
"loss": 1.0844,
"step": 337
},
{
"epoch": 1.0060240963855422,
"grad_norm": 0.3526400029659271,
"learning_rate": 4.975981799912147e-06,
"loss": 1.0526,
"step": 338
},
{
"epoch": 1.0090361445783131,
"grad_norm": 0.3683416545391083,
"learning_rate": 4.951964154046432e-06,
"loss": 1.0687,
"step": 339
},
{
"epoch": 1.0120481927710843,
"grad_norm": 0.3674441874027252,
"learning_rate": 4.927947616612216e-06,
"loss": 1.048,
"step": 340
},
{
"epoch": 1.0150602409638554,
"grad_norm": 0.33722299337387085,
"learning_rate": 4.903932741793266e-06,
"loss": 0.9881,
"step": 341
},
{
"epoch": 1.0180722891566265,
"grad_norm": 0.38287389278411865,
"learning_rate": 4.879920083734997e-06,
"loss": 1.0368,
"step": 342
},
{
"epoch": 1.0210843373493976,
"grad_norm": 0.38486406207084656,
"learning_rate": 4.855910196531669e-06,
"loss": 1.036,
"step": 343
},
{
"epoch": 1.0240963855421688,
"grad_norm": 0.3734920620918274,
"learning_rate": 4.8319036342135985e-06,
"loss": 1.0488,
"step": 344
},
{
"epoch": 1.0271084337349397,
"grad_norm": 0.40039393305778503,
"learning_rate": 4.807900950734388e-06,
"loss": 1.0315,
"step": 345
},
{
"epoch": 1.0301204819277108,
"grad_norm": 0.3681272566318512,
"learning_rate": 4.78390269995813e-06,
"loss": 1.0515,
"step": 346
},
{
"epoch": 1.033132530120482,
"grad_norm": 0.3793656826019287,
"learning_rate": 4.759909435646636e-06,
"loss": 1.1554,
"step": 347
},
{
"epoch": 1.036144578313253,
"grad_norm": 0.3956240117549896,
"learning_rate": 4.735921711446649e-06,
"loss": 1.0764,
"step": 348
},
{
"epoch": 1.0391566265060241,
"grad_norm": 0.3506197929382324,
"learning_rate": 4.711940080877079e-06,
"loss": 1.0664,
"step": 349
},
{
"epoch": 1.0421686746987953,
"grad_norm": 0.33289358019828796,
"learning_rate": 4.687965097316223e-06,
"loss": 1.0989,
"step": 350
},
{
"epoch": 1.0451807228915662,
"grad_norm": 0.38886559009552,
"learning_rate": 4.6639973139889944e-06,
"loss": 1.0367,
"step": 351
},
{
"epoch": 1.0481927710843373,
"grad_norm": 0.36474519968032837,
"learning_rate": 4.640037283954165e-06,
"loss": 1.041,
"step": 352
},
{
"epoch": 1.0512048192771084,
"grad_norm": 0.3719565272331238,
"learning_rate": 4.616085560091596e-06,
"loss": 1.0481,
"step": 353
},
{
"epoch": 1.0542168674698795,
"grad_norm": 0.38648533821105957,
"learning_rate": 4.592142695089489e-06,
"loss": 1.0783,
"step": 354
},
{
"epoch": 1.0572289156626506,
"grad_norm": 0.4134596884250641,
"learning_rate": 4.568209241431615e-06,
"loss": 1.0206,
"step": 355
},
{
"epoch": 1.0602409638554218,
"grad_norm": 0.35198330879211426,
"learning_rate": 4.544285751384585e-06,
"loss": 1.0578,
"step": 356
},
{
"epoch": 1.0632530120481927,
"grad_norm": 0.4130623936653137,
"learning_rate": 4.520372776985101e-06,
"loss": 1.0467,
"step": 357
},
{
"epoch": 1.0662650602409638,
"grad_norm": 0.35723182559013367,
"learning_rate": 4.496470870027209e-06,
"loss": 1.0781,
"step": 358
},
{
"epoch": 1.069277108433735,
"grad_norm": 0.35406294465065,
"learning_rate": 4.472580582049578e-06,
"loss": 1.001,
"step": 359
},
{
"epoch": 1.072289156626506,
"grad_norm": 0.38317278027534485,
"learning_rate": 4.448702464322764e-06,
"loss": 1.0656,
"step": 360
},
{
"epoch": 1.0753012048192772,
"grad_norm": 0.338810533285141,
"learning_rate": 4.4248370678364995e-06,
"loss": 0.9687,
"step": 361
},
{
"epoch": 1.0783132530120483,
"grad_norm": 0.33876633644104004,
"learning_rate": 4.400984943286965e-06,
"loss": 1.0505,
"step": 362
},
{
"epoch": 1.0813253012048192,
"grad_norm": 0.3846857249736786,
"learning_rate": 4.377146641064093e-06,
"loss": 1.0058,
"step": 363
},
{
"epoch": 1.0843373493975903,
"grad_norm": 0.4660666286945343,
"learning_rate": 4.3533227112388694e-06,
"loss": 1.0146,
"step": 364
},
{
"epoch": 1.0873493975903614,
"grad_norm": 0.4065142869949341,
"learning_rate": 4.329513703550628e-06,
"loss": 1.0294,
"step": 365
},
{
"epoch": 1.0903614457831325,
"grad_norm": 0.39198002219200134,
"learning_rate": 4.305720167394381e-06,
"loss": 1.0866,
"step": 366
},
{
"epoch": 1.0933734939759037,
"grad_norm": 0.35108157992362976,
"learning_rate": 4.2819426518081265e-06,
"loss": 1.0525,
"step": 367
},
{
"epoch": 1.0963855421686748,
"grad_norm": 0.34058380126953125,
"learning_rate": 4.258181705460188e-06,
"loss": 1.0815,
"step": 368
},
{
"epoch": 1.0993975903614457,
"grad_norm": 0.4144446849822998,
"learning_rate": 4.234437876636557e-06,
"loss": 1.0305,
"step": 369
},
{
"epoch": 1.1024096385542168,
"grad_norm": 0.3802807927131653,
"learning_rate": 4.21071171322823e-06,
"loss": 1.0463,
"step": 370
},
{
"epoch": 1.105421686746988,
"grad_norm": 0.3633134067058563,
"learning_rate": 4.1870037627185785e-06,
"loss": 1.0386,
"step": 371
},
{
"epoch": 1.108433734939759,
"grad_norm": 0.4094638526439667,
"learning_rate": 4.163314572170704e-06,
"loss": 1.0414,
"step": 372
},
{
"epoch": 1.1114457831325302,
"grad_norm": 0.37921878695487976,
"learning_rate": 4.139644688214827e-06,
"loss": 1.0142,
"step": 373
},
{
"epoch": 1.1144578313253013,
"grad_norm": 0.413327157497406,
"learning_rate": 4.115994657035659e-06,
"loss": 1.0886,
"step": 374
},
{
"epoch": 1.1174698795180722,
"grad_norm": 0.37856829166412354,
"learning_rate": 4.0923650243598104e-06,
"loss": 1.084,
"step": 375
},
{
"epoch": 1.1204819277108433,
"grad_norm": 0.41401001811027527,
"learning_rate": 4.0687563354431986e-06,
"loss": 1.118,
"step": 376
},
{
"epoch": 1.1234939759036144,
"grad_norm": 0.3299630582332611,
"learning_rate": 4.045169135058452e-06,
"loss": 0.9993,
"step": 377
},
{
"epoch": 1.1265060240963856,
"grad_norm": 0.40372124314308167,
"learning_rate": 4.021603967482361e-06,
"loss": 0.9855,
"step": 378
},
{
"epoch": 1.1295180722891567,
"grad_norm": 0.360078364610672,
"learning_rate": 3.998061376483298e-06,
"loss": 1.0382,
"step": 379
},
{
"epoch": 1.1325301204819278,
"grad_norm": 0.3652278184890747,
"learning_rate": 3.974541905308679e-06,
"loss": 1.0232,
"step": 380
},
{
"epoch": 1.1355421686746987,
"grad_norm": 0.3333640396595001,
"learning_rate": 3.951046096672434e-06,
"loss": 1.0304,
"step": 381
},
{
"epoch": 1.1385542168674698,
"grad_norm": 0.3765230178833008,
"learning_rate": 3.927574492742473e-06,
"loss": 1.0738,
"step": 382
},
{
"epoch": 1.141566265060241,
"grad_norm": 0.3517187833786011,
"learning_rate": 3.904127635128184e-06,
"loss": 1.0491,
"step": 383
},
{
"epoch": 1.144578313253012,
"grad_norm": 0.35913482308387756,
"learning_rate": 3.880706064867927e-06,
"loss": 1.0509,
"step": 384
},
{
"epoch": 1.1475903614457832,
"grad_norm": 0.3901945650577545,
"learning_rate": 3.857310322416555e-06,
"loss": 1.0653,
"step": 385
},
{
"epoch": 1.1506024096385543,
"grad_norm": 0.3298746347427368,
"learning_rate": 3.833940947632947e-06,
"loss": 0.9943,
"step": 386
},
{
"epoch": 1.1536144578313252,
"grad_norm": 0.3933880031108856,
"learning_rate": 3.8105984797675364e-06,
"loss": 1.06,
"step": 387
},
{
"epoch": 1.1566265060240963,
"grad_norm": 0.42192092537879944,
"learning_rate": 3.7872834574498894e-06,
"loss": 1.0453,
"step": 388
},
{
"epoch": 1.1596385542168675,
"grad_norm": 0.38652369379997253,
"learning_rate": 3.7639964186762506e-06,
"loss": 1.035,
"step": 389
},
{
"epoch": 1.1626506024096386,
"grad_norm": 0.44319620728492737,
"learning_rate": 3.740737900797151e-06,
"loss": 1.1098,
"step": 390
},
{
"epoch": 1.1656626506024097,
"grad_norm": 0.3664276599884033,
"learning_rate": 3.7175084405049978e-06,
"loss": 0.991,
"step": 391
},
{
"epoch": 1.1686746987951806,
"grad_norm": 0.3838660717010498,
"learning_rate": 3.6943085738216855e-06,
"loss": 1.092,
"step": 392
},
{
"epoch": 1.1716867469879517,
"grad_norm": 0.38596200942993164,
"learning_rate": 3.6711388360862417e-06,
"loss": 1.077,
"step": 393
},
{
"epoch": 1.1746987951807228,
"grad_norm": 0.337519109249115,
"learning_rate": 3.6479997619424605e-06,
"loss": 1.0932,
"step": 394
},
{
"epoch": 1.177710843373494,
"grad_norm": 0.350619912147522,
"learning_rate": 3.6248918853265756e-06,
"loss": 1.0796,
"step": 395
},
{
"epoch": 1.180722891566265,
"grad_norm": 0.38858625292778015,
"learning_rate": 3.6018157394549287e-06,
"loss": 1.0689,
"step": 396
},
{
"epoch": 1.1837349397590362,
"grad_norm": 0.38901758193969727,
"learning_rate": 3.5787718568116764e-06,
"loss": 1.1019,
"step": 397
},
{
"epoch": 1.1867469879518073,
"grad_norm": 0.34919285774230957,
"learning_rate": 3.5557607691364983e-06,
"loss": 1.0646,
"step": 398
},
{
"epoch": 1.1897590361445782,
"grad_norm": 0.41810017824172974,
"learning_rate": 3.5327830074123214e-06,
"loss": 1.0429,
"step": 399
},
{
"epoch": 1.1927710843373494,
"grad_norm": 0.3683408796787262,
"learning_rate": 3.509839101853082e-06,
"loss": 0.9905,
"step": 400
},
{
"epoch": 1.1957831325301205,
"grad_norm": 0.3720911741256714,
"learning_rate": 3.486929581891476e-06,
"loss": 1.0213,
"step": 401
},
{
"epoch": 1.1987951807228916,
"grad_norm": 0.3495194911956787,
"learning_rate": 3.464054976166753e-06,
"loss": 1.0386,
"step": 402
},
{
"epoch": 1.2018072289156627,
"grad_norm": 0.36551299691200256,
"learning_rate": 3.441215812512508e-06,
"loss": 1.0043,
"step": 403
},
{
"epoch": 1.2048192771084336,
"grad_norm": 0.3687341809272766,
"learning_rate": 3.41841261794451e-06,
"loss": 1.0313,
"step": 404
},
{
"epoch": 1.2078313253012047,
"grad_norm": 0.3739585280418396,
"learning_rate": 3.3956459186485414e-06,
"loss": 1.0326,
"step": 405
},
{
"epoch": 1.2108433734939759,
"grad_norm": 0.38974305987358093,
"learning_rate": 3.372916239968246e-06,
"loss": 1.0665,
"step": 406
},
{
"epoch": 1.213855421686747,
"grad_norm": 0.4061500132083893,
"learning_rate": 3.3502241063930196e-06,
"loss": 1.114,
"step": 407
},
{
"epoch": 1.216867469879518,
"grad_norm": 0.398306280374527,
"learning_rate": 3.327570041545897e-06,
"loss": 1.0584,
"step": 408
},
{
"epoch": 1.2198795180722892,
"grad_norm": 0.36864137649536133,
"learning_rate": 3.304954568171478e-06,
"loss": 1.081,
"step": 409
},
{
"epoch": 1.2228915662650603,
"grad_norm": 0.3283785581588745,
"learning_rate": 3.282378208123856e-06,
"loss": 1.0605,
"step": 410
},
{
"epoch": 1.2259036144578312,
"grad_norm": 0.38243263959884644,
"learning_rate": 3.259841482354582e-06,
"loss": 1.0161,
"step": 411
},
{
"epoch": 1.2289156626506024,
"grad_norm": 0.38818714022636414,
"learning_rate": 3.2373449109006476e-06,
"loss": 1.0602,
"step": 412
},
{
"epoch": 1.2319277108433735,
"grad_norm": 0.3809143304824829,
"learning_rate": 3.21488901287247e-06,
"loss": 1.0088,
"step": 413
},
{
"epoch": 1.2349397590361446,
"grad_norm": 0.37948790192604065,
"learning_rate": 3.192474306441936e-06,
"loss": 1.0532,
"step": 414
},
{
"epoch": 1.2379518072289157,
"grad_norm": 0.44067418575286865,
"learning_rate": 3.170101308830421e-06,
"loss": 1.0377,
"step": 415
},
{
"epoch": 1.2409638554216866,
"grad_norm": 0.3667253255844116,
"learning_rate": 3.1477705362968702e-06,
"loss": 1.0234,
"step": 416
},
{
"epoch": 1.2439759036144578,
"grad_norm": 0.37526583671569824,
"learning_rate": 3.1254825041258852e-06,
"loss": 1.0344,
"step": 417
},
{
"epoch": 1.2469879518072289,
"grad_norm": 0.42664584517478943,
"learning_rate": 3.103237726615822e-06,
"loss": 1.0439,
"step": 418
},
{
"epoch": 1.25,
"grad_norm": 0.3878503441810608,
"learning_rate": 3.081036717066938e-06,
"loss": 1.1294,
"step": 419
},
{
"epoch": 1.2530120481927711,
"grad_norm": 0.4370405972003937,
"learning_rate": 3.0588799877695375e-06,
"loss": 1.0563,
"step": 420
},
{
"epoch": 1.2560240963855422,
"grad_norm": 0.38727104663848877,
"learning_rate": 3.036768049992157e-06,
"loss": 1.0561,
"step": 421
},
{
"epoch": 1.2590361445783134,
"grad_norm": 0.3639293909072876,
"learning_rate": 3.0147014139697596e-06,
"loss": 1.0747,
"step": 422
},
{
"epoch": 1.2620481927710843,
"grad_norm": 0.3889468014240265,
"learning_rate": 2.99268058889197e-06,
"loss": 1.0575,
"step": 423
},
{
"epoch": 1.2650602409638554,
"grad_norm": 0.3735024929046631,
"learning_rate": 2.9707060828913226e-06,
"loss": 1.0432,
"step": 424
},
{
"epoch": 1.2680722891566265,
"grad_norm": 0.3623259365558624,
"learning_rate": 2.9487784030315297e-06,
"loss": 1.0929,
"step": 425
},
{
"epoch": 1.2710843373493976,
"grad_norm": 0.38363751769065857,
"learning_rate": 2.9268980552957917e-06,
"loss": 1.018,
"step": 426
},
{
"epoch": 1.2740963855421688,
"grad_norm": 0.36796835064888,
"learning_rate": 2.905065544575114e-06,
"loss": 1.0636,
"step": 427
},
{
"epoch": 1.2771084337349397,
"grad_norm": 0.3460337817668915,
"learning_rate": 2.8832813746566546e-06,
"loss": 1.1039,
"step": 428
},
{
"epoch": 1.2801204819277108,
"grad_norm": 0.37609270215034485,
"learning_rate": 2.86154604821211e-06,
"loss": 1.0746,
"step": 429
},
{
"epoch": 1.283132530120482,
"grad_norm": 0.39871373772621155,
"learning_rate": 2.8398600667861032e-06,
"loss": 1.0095,
"step": 430
},
{
"epoch": 1.286144578313253,
"grad_norm": 0.38184547424316406,
"learning_rate": 2.8182239307846195e-06,
"loss": 1.0278,
"step": 431
},
{
"epoch": 1.2891566265060241,
"grad_norm": 0.40051835775375366,
"learning_rate": 2.796638139463456e-06,
"loss": 1.0261,
"step": 432
},
{
"epoch": 1.2921686746987953,
"grad_norm": 0.38206747174263,
"learning_rate": 2.7751031909167046e-06,
"loss": 1.0817,
"step": 433
},
{
"epoch": 1.2951807228915664,
"grad_norm": 0.42132294178009033,
"learning_rate": 2.7536195820652506e-06,
"loss": 1.0253,
"step": 434
},
{
"epoch": 1.2981927710843373,
"grad_norm": 0.37671446800231934,
"learning_rate": 2.73218780864531e-06,
"loss": 1.0555,
"step": 435
},
{
"epoch": 1.3012048192771084,
"grad_norm": 0.405241459608078,
"learning_rate": 2.710808365197e-06,
"loss": 1.0957,
"step": 436
},
{
"epoch": 1.3042168674698795,
"grad_norm": 0.3754029870033264,
"learning_rate": 2.689481745052908e-06,
"loss": 0.9929,
"step": 437
},
{
"epoch": 1.3072289156626506,
"grad_norm": 0.3823848068714142,
"learning_rate": 2.6682084403267305e-06,
"loss": 1.0884,
"step": 438
},
{
"epoch": 1.3102409638554218,
"grad_norm": 0.3721786439418793,
"learning_rate": 2.6469889419018985e-06,
"loss": 1.0173,
"step": 439
},
{
"epoch": 1.3132530120481927,
"grad_norm": 0.3947805166244507,
"learning_rate": 2.6258237394202556e-06,
"loss": 1.0628,
"step": 440
},
{
"epoch": 1.3162650602409638,
"grad_norm": 0.3939521908760071,
"learning_rate": 2.60471332127077e-06,
"loss": 1.0576,
"step": 441
},
{
"epoch": 1.319277108433735,
"grad_norm": 0.40392783284187317,
"learning_rate": 2.5836581745782474e-06,
"loss": 1.0515,
"step": 442
},
{
"epoch": 1.322289156626506,
"grad_norm": 0.39871639013290405,
"learning_rate": 2.5626587851921053e-06,
"loss": 1.0039,
"step": 443
},
{
"epoch": 1.3253012048192772,
"grad_norm": 0.409939706325531,
"learning_rate": 2.541715637675156e-06,
"loss": 1.0394,
"step": 444
},
{
"epoch": 1.3283132530120483,
"grad_norm": 0.3738921880722046,
"learning_rate": 2.520829215292426e-06,
"loss": 1.0766,
"step": 445
},
{
"epoch": 1.3313253012048194,
"grad_norm": 0.3500833213329315,
"learning_rate": 2.5000000000000015e-06,
"loss": 1.0533,
"step": 446
},
{
"epoch": 1.3343373493975903,
"grad_norm": 0.3490578532218933,
"learning_rate": 2.4792284724339077e-06,
"loss": 1.0512,
"step": 447
},
{
"epoch": 1.3373493975903614,
"grad_norm": 0.32971325516700745,
"learning_rate": 2.4585151118990286e-06,
"loss": 1.0417,
"step": 448
},
{
"epoch": 1.3403614457831325,
"grad_norm": 0.36310428380966187,
"learning_rate": 2.4378603963580293e-06,
"loss": 1.122,
"step": 449
},
{
"epoch": 1.3433734939759037,
"grad_norm": 0.40908730030059814,
"learning_rate": 2.417264802420343e-06,
"loss": 1.0535,
"step": 450
},
{
"epoch": 1.3463855421686746,
"grad_norm": 0.3725447654724121,
"learning_rate": 2.396728805331167e-06,
"loss": 1.0547,
"step": 451
},
{
"epoch": 1.3493975903614457,
"grad_norm": 0.39754316210746765,
"learning_rate": 2.3762528789604887e-06,
"loss": 1.0292,
"step": 452
},
{
"epoch": 1.3524096385542168,
"grad_norm": 0.39532670378685,
"learning_rate": 2.3558374957921678e-06,
"loss": 1.0182,
"step": 453
},
{
"epoch": 1.355421686746988,
"grad_norm": 0.40215209126472473,
"learning_rate": 2.3354831269130133e-06,
"loss": 1.0495,
"step": 454
},
{
"epoch": 1.358433734939759,
"grad_norm": 0.421367347240448,
"learning_rate": 2.3151902420019357e-06,
"loss": 1.0389,
"step": 455
},
{
"epoch": 1.3614457831325302,
"grad_norm": 0.38005751371383667,
"learning_rate": 2.2949593093190863e-06,
"loss": 1.0681,
"step": 456
},
{
"epoch": 1.3644578313253013,
"grad_norm": 0.3765680193901062,
"learning_rate": 2.274790795695071e-06,
"loss": 1.0338,
"step": 457
},
{
"epoch": 1.3674698795180724,
"grad_norm": 0.34579628705978394,
"learning_rate": 2.2546851665201692e-06,
"loss": 1.0749,
"step": 458
},
{
"epoch": 1.3704819277108433,
"grad_norm": 0.3837708830833435,
"learning_rate": 2.2346428857335904e-06,
"loss": 1.0642,
"step": 459
},
{
"epoch": 1.3734939759036144,
"grad_norm": 0.3635129928588867,
"learning_rate": 2.2146644158127827e-06,
"loss": 1.0432,
"step": 460
},
{
"epoch": 1.3765060240963856,
"grad_norm": 0.40961354970932007,
"learning_rate": 2.1947502177627437e-06,
"loss": 1.0437,
"step": 461
},
{
"epoch": 1.3795180722891567,
"grad_norm": 0.37368935346603394,
"learning_rate": 2.1749007511054005e-06,
"loss": 1.0578,
"step": 462
},
{
"epoch": 1.3825301204819276,
"grad_norm": 0.40420466661453247,
"learning_rate": 2.1551164738689896e-06,
"loss": 1.0743,
"step": 463
},
{
"epoch": 1.3855421686746987,
"grad_norm": 0.3825657069683075,
"learning_rate": 2.1353978425775006e-06,
"loss": 1.0327,
"step": 464
},
{
"epoch": 1.3885542168674698,
"grad_norm": 0.39921796321868896,
"learning_rate": 2.1157453122401385e-06,
"loss": 1.0576,
"step": 465
},
{
"epoch": 1.391566265060241,
"grad_norm": 0.36656901240348816,
"learning_rate": 2.0961593363408154e-06,
"loss": 1.0264,
"step": 466
},
{
"epoch": 1.394578313253012,
"grad_norm": 0.3587695360183716,
"learning_rate": 2.076640366827703e-06,
"loss": 1.071,
"step": 467
},
{
"epoch": 1.3975903614457832,
"grad_norm": 0.3668745756149292,
"learning_rate": 2.0571888541027857e-06,
"loss": 0.9852,
"step": 468
},
{
"epoch": 1.4006024096385543,
"grad_norm": 0.41092541813850403,
"learning_rate": 2.0378052470114822e-06,
"loss": 1.0234,
"step": 469
},
{
"epoch": 1.4036144578313254,
"grad_norm": 0.42871734499931335,
"learning_rate": 2.018489992832283e-06,
"loss": 1.0427,
"step": 470
},
{
"epoch": 1.4066265060240963,
"grad_norm": 0.3699125349521637,
"learning_rate": 1.999243537266424e-06,
"loss": 1.0422,
"step": 471
},
{
"epoch": 1.4096385542168675,
"grad_norm": 0.36434075236320496,
"learning_rate": 1.980066324427613e-06,
"loss": 1.0588,
"step": 472
},
{
"epoch": 1.4126506024096386,
"grad_norm": 0.4026855528354645,
"learning_rate": 1.960958796831769e-06,
"loss": 1.0295,
"step": 473
},
{
"epoch": 1.4156626506024097,
"grad_norm": 0.3882656395435333,
"learning_rate": 1.9419213953868236e-06,
"loss": 1.0366,
"step": 474
},
{
"epoch": 1.4186746987951806,
"grad_norm": 0.40121057629585266,
"learning_rate": 1.9229545593825367e-06,
"loss": 1.0806,
"step": 475
},
{
"epoch": 1.4216867469879517,
"grad_norm": 0.3884546756744385,
"learning_rate": 1.9040587264803673e-06,
"loss": 1.1063,
"step": 476
},
{
"epoch": 1.4246987951807228,
"grad_norm": 0.3452583849430084,
"learning_rate": 1.8852343327033717e-06,
"loss": 1.0373,
"step": 477
},
{
"epoch": 1.427710843373494,
"grad_norm": 0.39576640725135803,
"learning_rate": 1.8664818124261375e-06,
"loss": 1.0804,
"step": 478
},
{
"epoch": 1.430722891566265,
"grad_norm": 0.40806901454925537,
"learning_rate": 1.8478015983647718e-06,
"loss": 1.0341,
"step": 479
},
{
"epoch": 1.4337349397590362,
"grad_norm": 0.37504813075065613,
"learning_rate": 1.8291941215669024e-06,
"loss": 1.0557,
"step": 480
},
{
"epoch": 1.4367469879518073,
"grad_norm": 0.39833274483680725,
"learning_rate": 1.8106598114017398e-06,
"loss": 1.0336,
"step": 481
},
{
"epoch": 1.4397590361445782,
"grad_norm": 0.39540019631385803,
"learning_rate": 1.7921990955501705e-06,
"loss": 1.0473,
"step": 482
},
{
"epoch": 1.4427710843373494,
"grad_norm": 0.40363839268684387,
"learning_rate": 1.7738123999948853e-06,
"loss": 1.0193,
"step": 483
},
{
"epoch": 1.4457831325301205,
"grad_norm": 0.37323495745658875,
"learning_rate": 1.755500149010549e-06,
"loss": 0.9827,
"step": 484
},
{
"epoch": 1.4487951807228916,
"grad_norm": 0.41902491450309753,
"learning_rate": 1.737262765154008e-06,
"loss": 1.069,
"step": 485
},
{
"epoch": 1.4518072289156627,
"grad_norm": 0.40718671679496765,
"learning_rate": 1.7191006692545493e-06,
"loss": 1.0873,
"step": 486
},
{
"epoch": 1.4548192771084336,
"grad_norm": 0.4020282030105591,
"learning_rate": 1.7010142804041785e-06,
"loss": 1.0425,
"step": 487
},
{
"epoch": 1.4578313253012047,
"grad_norm": 0.3684733510017395,
"learning_rate": 1.6830040159479521e-06,
"loss": 1.0121,
"step": 488
},
{
"epoch": 1.4608433734939759,
"grad_norm": 0.3506666421890259,
"learning_rate": 1.66507029147436e-06,
"loss": 1.0484,
"step": 489
},
{
"epoch": 1.463855421686747,
"grad_norm": 0.468654602766037,
"learning_rate": 1.6472135208057128e-06,
"loss": 1.0682,
"step": 490
},
{
"epoch": 1.466867469879518,
"grad_norm": 0.4075433313846588,
"learning_rate": 1.629434115988614e-06,
"loss": 1.0589,
"step": 491
},
{
"epoch": 1.4698795180722892,
"grad_norm": 0.3535695970058441,
"learning_rate": 1.611732487284437e-06,
"loss": 1.0628,
"step": 492
},
{
"epoch": 1.4728915662650603,
"grad_norm": 0.37299081683158875,
"learning_rate": 1.5941090431598654e-06,
"loss": 1.019,
"step": 493
},
{
"epoch": 1.4759036144578312,
"grad_norm": 0.34906429052352905,
"learning_rate": 1.5765641902774704e-06,
"loss": 1.0281,
"step": 494
},
{
"epoch": 1.4789156626506024,
"grad_norm": 0.4228847920894623,
"learning_rate": 1.5590983334863191e-06,
"loss": 1.0176,
"step": 495
},
{
"epoch": 1.4819277108433735,
"grad_norm": 0.4109274446964264,
"learning_rate": 1.5417118758126408e-06,
"loss": 1.0818,
"step": 496
},
{
"epoch": 1.4849397590361446,
"grad_norm": 0.3916458189487457,
"learning_rate": 1.524405218450517e-06,
"loss": 1.0299,
"step": 497
},
{
"epoch": 1.4879518072289157,
"grad_norm": 0.3761802911758423,
"learning_rate": 1.5071787607526366e-06,
"loss": 1.0152,
"step": 498
},
{
"epoch": 1.4909638554216866,
"grad_norm": 0.3690100610256195,
"learning_rate": 1.4900329002210684e-06,
"loss": 1.0818,
"step": 499
},
{
"epoch": 1.4939759036144578,
"grad_norm": 0.36231639981269836,
"learning_rate": 1.472968032498095e-06,
"loss": 1.0708,
"step": 500
},
{
"epoch": 1.4969879518072289,
"grad_norm": 0.3943842053413391,
"learning_rate": 1.4559845513570859e-06,
"loss": 1.0399,
"step": 501
},
{
"epoch": 1.5,
"grad_norm": 0.380312979221344,
"learning_rate": 1.439082848693406e-06,
"loss": 0.9593,
"step": 502
},
{
"epoch": 1.5030120481927711,
"grad_norm": 0.43198204040527344,
"learning_rate": 1.4222633145153758e-06,
"loss": 0.9807,
"step": 503
},
{
"epoch": 1.5060240963855422,
"grad_norm": 0.3783879578113556,
"learning_rate": 1.4055263369352673e-06,
"loss": 1.0255,
"step": 504
},
{
"epoch": 1.5090361445783134,
"grad_norm": 0.3918922543525696,
"learning_rate": 1.388872302160353e-06,
"loss": 1.0401,
"step": 505
},
{
"epoch": 1.5120481927710845,
"grad_norm": 0.39092695713043213,
"learning_rate": 1.3723015944839947e-06,
"loss": 1.0715,
"step": 506
},
{
"epoch": 1.5150602409638554,
"grad_norm": 0.33539846539497375,
"learning_rate": 1.35581459627677e-06,
"loss": 1.0185,
"step": 507
},
{
"epoch": 1.5180722891566265,
"grad_norm": 0.3622112572193146,
"learning_rate": 1.339411687977657e-06,
"loss": 1.0932,
"step": 508
},
{
"epoch": 1.5210843373493976,
"grad_norm": 0.3799549341201782,
"learning_rate": 1.3230932480852487e-06,
"loss": 1.0413,
"step": 509
},
{
"epoch": 1.5240963855421685,
"grad_norm": 0.37101662158966064,
"learning_rate": 1.3068596531490253e-06,
"loss": 1.0402,
"step": 510
},
{
"epoch": 1.5271084337349397,
"grad_norm": 0.3901662826538086,
"learning_rate": 1.290711277760658e-06,
"loss": 1.0245,
"step": 511
},
{
"epoch": 1.5301204819277108,
"grad_norm": 0.37363961338996887,
"learning_rate": 1.2746484945453691e-06,
"loss": 1.0387,
"step": 512
},
{
"epoch": 1.533132530120482,
"grad_norm": 0.376298725605011,
"learning_rate": 1.2586716741533389e-06,
"loss": 1.0305,
"step": 513
},
{
"epoch": 1.536144578313253,
"grad_norm": 0.35384973883628845,
"learning_rate": 1.2427811852511396e-06,
"loss": 1.0001,
"step": 514
},
{
"epoch": 1.5391566265060241,
"grad_norm": 0.3355305790901184,
"learning_rate": 1.226977394513247e-06,
"loss": 1.0756,
"step": 515
},
{
"epoch": 1.5421686746987953,
"grad_norm": 0.3982202112674713,
"learning_rate": 1.2112606666135602e-06,
"loss": 1.0102,
"step": 516
},
{
"epoch": 1.5451807228915664,
"grad_norm": 0.33996695280075073,
"learning_rate": 1.1956313642169974e-06,
"loss": 1.0388,
"step": 517
},
{
"epoch": 1.5481927710843375,
"grad_norm": 0.3969401717185974,
"learning_rate": 1.1800898479711293e-06,
"loss": 1.0541,
"step": 518
},
{
"epoch": 1.5512048192771084,
"grad_norm": 0.3649154603481293,
"learning_rate": 1.1646364764978468e-06,
"loss": 1.0625,
"step": 519
},
{
"epoch": 1.5542168674698795,
"grad_norm": 0.39856594800949097,
"learning_rate": 1.1492716063850973e-06,
"loss": 1.0405,
"step": 520
},
{
"epoch": 1.5572289156626506,
"grad_norm": 0.3574175238609314,
"learning_rate": 1.1339955921786504e-06,
"loss": 1.0486,
"step": 521
},
{
"epoch": 1.5602409638554215,
"grad_norm": 0.36913472414016724,
"learning_rate": 1.1188087863739173e-06,
"loss": 0.9595,
"step": 522
},
{
"epoch": 1.5632530120481927,
"grad_norm": 0.32440900802612305,
"learning_rate": 1.1037115394078162e-06,
"loss": 1.0586,
"step": 523
},
{
"epoch": 1.5662650602409638,
"grad_norm": 0.41809505224227905,
"learning_rate": 1.0887041996506858e-06,
"loss": 1.0959,
"step": 524
},
{
"epoch": 1.569277108433735,
"grad_norm": 0.3481323719024658,
"learning_rate": 1.0737871133982524e-06,
"loss": 1.0388,
"step": 525
},
{
"epoch": 1.572289156626506,
"grad_norm": 0.3880089223384857,
"learning_rate": 1.0589606248636291e-06,
"loss": 1.0153,
"step": 526
},
{
"epoch": 1.5753012048192772,
"grad_norm": 0.3808007836341858,
"learning_rate": 1.0442250761693829e-06,
"loss": 1.0111,
"step": 527
},
{
"epoch": 1.5783132530120483,
"grad_norm": 0.38831576704978943,
"learning_rate": 1.0295808073396352e-06,
"loss": 0.9816,
"step": 528
},
{
"epoch": 1.5813253012048194,
"grad_norm": 0.41834479570388794,
"learning_rate": 1.015028156292212e-06,
"loss": 1.0189,
"step": 529
},
{
"epoch": 1.5843373493975905,
"grad_norm": 0.3809266984462738,
"learning_rate": 1.0005674588308566e-06,
"loss": 1.0146,
"step": 530
},
{
"epoch": 1.5873493975903614,
"grad_norm": 0.4059775471687317,
"learning_rate": 9.861990486374695e-07,
"loss": 0.9792,
"step": 531
},
{
"epoch": 1.5903614457831325,
"grad_norm": 0.36427873373031616,
"learning_rate": 9.719232572644189e-07,
"loss": 1.0827,
"step": 532
},
{
"epoch": 1.5933734939759037,
"grad_norm": 0.3794417679309845,
"learning_rate": 9.577404141268815e-07,
"loss": 1.0314,
"step": 533
},
{
"epoch": 1.5963855421686746,
"grad_norm": 0.40571263432502747,
"learning_rate": 9.436508464952471e-07,
"loss": 1.0521,
"step": 534
},
{
"epoch": 1.5993975903614457,
"grad_norm": 0.36858484148979187,
"learning_rate": 9.296548794875659e-07,
"loss": 1.0314,
"step": 535
},
{
"epoch": 1.6024096385542168,
"grad_norm": 0.35998910665512085,
"learning_rate": 9.157528360620416e-07,
"loss": 1.0451,
"step": 536
},
{
"epoch": 1.605421686746988,
"grad_norm": 0.3696284294128418,
"learning_rate": 9.019450370095867e-07,
"loss": 0.9977,
"step": 537
},
{
"epoch": 1.608433734939759,
"grad_norm": 0.4475997984409332,
"learning_rate": 8.882318009464124e-07,
"loss": 1.0073,
"step": 538
},
{
"epoch": 1.6114457831325302,
"grad_norm": 0.40017929673194885,
"learning_rate": 8.74613444306684e-07,
"loss": 0.9603,
"step": 539
},
{
"epoch": 1.6144578313253013,
"grad_norm": 0.3758133053779602,
"learning_rate": 8.61090281335214e-07,
"loss": 0.9584,
"step": 540
},
{
"epoch": 1.6174698795180724,
"grad_norm": 0.35535839200019836,
"learning_rate": 8.476626240802099e-07,
"loss": 1.1102,
"step": 541
},
{
"epoch": 1.6204819277108435,
"grad_norm": 0.43646156787872314,
"learning_rate": 8.343307823860819e-07,
"loss": 1.0792,
"step": 542
},
{
"epoch": 1.6234939759036144,
"grad_norm": 0.39517444372177124,
"learning_rate": 8.210950638862813e-07,
"loss": 1.0216,
"step": 543
},
{
"epoch": 1.6265060240963856,
"grad_norm": 0.42866745591163635,
"learning_rate": 8.079557739962129e-07,
"loss": 1.0596,
"step": 544
},
{
"epoch": 1.6295180722891565,
"grad_norm": 0.3550488352775574,
"learning_rate": 7.949132159061784e-07,
"loss": 1.0535,
"step": 545
},
{
"epoch": 1.6325301204819276,
"grad_norm": 0.3993145823478699,
"learning_rate": 7.819676905743872e-07,
"loss": 1.008,
"step": 546
},
{
"epoch": 1.6355421686746987,
"grad_norm": 0.39240461587905884,
"learning_rate": 7.691194967200099e-07,
"loss": 1.0231,
"step": 547
},
{
"epoch": 1.6385542168674698,
"grad_norm": 0.356810063123703,
"learning_rate": 7.563689308162803e-07,
"loss": 1.0048,
"step": 548
},
{
"epoch": 1.641566265060241,
"grad_norm": 0.36379274725914,
"learning_rate": 7.43716287083664e-07,
"loss": 1.0925,
"step": 549
},
{
"epoch": 1.644578313253012,
"grad_norm": 0.4245232045650482,
"learning_rate": 7.31161857483057e-07,
"loss": 1.0368,
"step": 550
},
{
"epoch": 1.6475903614457832,
"grad_norm": 0.3779962658882141,
"learning_rate": 7.187059317090622e-07,
"loss": 1.1019,
"step": 551
},
{
"epoch": 1.6506024096385543,
"grad_norm": 0.41444671154022217,
"learning_rate": 7.063487971832922e-07,
"loss": 1.084,
"step": 552
},
{
"epoch": 1.6536144578313254,
"grad_norm": 0.369693398475647,
"learning_rate": 6.940907390477458e-07,
"loss": 1.0164,
"step": 553
},
{
"epoch": 1.6566265060240963,
"grad_norm": 0.43131789565086365,
"learning_rate": 6.819320401582258e-07,
"loss": 1.0915,
"step": 554
},
{
"epoch": 1.6596385542168675,
"grad_norm": 0.41402745246887207,
"learning_rate": 6.698729810778065e-07,
"loss": 1.0014,
"step": 555
},
{
"epoch": 1.6626506024096386,
"grad_norm": 0.38247060775756836,
"learning_rate": 6.579138400703716e-07,
"loss": 1.0127,
"step": 556
},
{
"epoch": 1.6656626506024095,
"grad_norm": 0.45507028698921204,
"learning_rate": 6.460548930941801e-07,
"loss": 1.0202,
"step": 557
},
{
"epoch": 1.6686746987951806,
"grad_norm": 0.381002813577652,
"learning_rate": 6.342964137955071e-07,
"loss": 1.035,
"step": 558
},
{
"epoch": 1.6716867469879517,
"grad_norm": 0.4605034291744232,
"learning_rate": 6.226386735023271e-07,
"loss": 1.0472,
"step": 559
},
{
"epoch": 1.6746987951807228,
"grad_norm": 0.3616805970668793,
"learning_rate": 6.110819412180535e-07,
"loss": 1.0302,
"step": 560
},
{
"epoch": 1.677710843373494,
"grad_norm": 0.3862994313240051,
"learning_rate": 5.99626483615331e-07,
"loss": 1.024,
"step": 561
},
{
"epoch": 1.680722891566265,
"grad_norm": 0.406364381313324,
"learning_rate": 5.882725650298787e-07,
"loss": 1.0184,
"step": 562
},
{
"epoch": 1.6837349397590362,
"grad_norm": 0.42682695388793945,
"learning_rate": 5.770204474543978e-07,
"loss": 1.0347,
"step": 563
},
{
"epoch": 1.6867469879518073,
"grad_norm": 0.4065680503845215,
"learning_rate": 5.658703905325186e-07,
"loss": 1.0352,
"step": 564
},
{
"epoch": 1.6897590361445785,
"grad_norm": 0.402649462223053,
"learning_rate": 5.548226515528133e-07,
"loss": 1.0293,
"step": 565
},
{
"epoch": 1.6927710843373494,
"grad_norm": 0.38777557015419006,
"learning_rate": 5.438774854428614e-07,
"loss": 1.0521,
"step": 566
},
{
"epoch": 1.6957831325301205,
"grad_norm": 0.42119914293289185,
"learning_rate": 5.330351447633603e-07,
"loss": 1.0862,
"step": 567
},
{
"epoch": 1.6987951807228916,
"grad_norm": 0.3981137275695801,
"learning_rate": 5.222958797023036e-07,
"loss": 1.0312,
"step": 568
},
{
"epoch": 1.7018072289156625,
"grad_norm": 0.40969544649124146,
"learning_rate": 5.11659938069205e-07,
"loss": 1.0397,
"step": 569
},
{
"epoch": 1.7048192771084336,
"grad_norm": 0.373832643032074,
"learning_rate": 5.011275652893782e-07,
"loss": 1.0546,
"step": 570
},
{
"epoch": 1.7078313253012047,
"grad_norm": 0.4301709532737732,
"learning_rate": 4.906990043982813e-07,
"loss": 1.0475,
"step": 571
},
{
"epoch": 1.7108433734939759,
"grad_norm": 0.4075815975666046,
"learning_rate": 4.803744960358992e-07,
"loss": 0.9895,
"step": 572
},
{
"epoch": 1.713855421686747,
"grad_norm": 0.41060760617256165,
"learning_rate": 4.701542784411994e-07,
"loss": 1.032,
"step": 573
},
{
"epoch": 1.716867469879518,
"grad_norm": 0.38388729095458984,
"learning_rate": 4.6003858744662564e-07,
"loss": 1.0629,
"step": 574
},
{
"epoch": 1.7198795180722892,
"grad_norm": 0.37711286544799805,
"learning_rate": 4.500276564726652e-07,
"loss": 1.0032,
"step": 575
},
{
"epoch": 1.7228915662650603,
"grad_norm": 0.4005860388278961,
"learning_rate": 4.401217165224564e-07,
"loss": 1.0953,
"step": 576
},
{
"epoch": 1.7259036144578315,
"grad_norm": 0.39737778902053833,
"learning_rate": 4.3032099617645874e-07,
"loss": 1.0731,
"step": 577
},
{
"epoch": 1.7289156626506024,
"grad_norm": 0.39624249935150146,
"learning_rate": 4.2062572158718284e-07,
"loss": 1.0633,
"step": 578
},
{
"epoch": 1.7319277108433735,
"grad_norm": 0.3743440508842468,
"learning_rate": 4.1103611647396734e-07,
"loss": 1.0415,
"step": 579
},
{
"epoch": 1.7349397590361446,
"grad_norm": 0.3983217477798462,
"learning_rate": 4.0155240211781966e-07,
"loss": 1.0129,
"step": 580
},
{
"epoch": 1.7379518072289155,
"grad_norm": 0.4027600586414337,
"learning_rate": 3.921747973563056e-07,
"loss": 1.0909,
"step": 581
},
{
"epoch": 1.7409638554216866,
"grad_norm": 0.4163624942302704,
"learning_rate": 3.829035185785035e-07,
"loss": 1.0766,
"step": 582
},
{
"epoch": 1.7439759036144578,
"grad_norm": 0.3989628255367279,
"learning_rate": 3.737387797200126e-07,
"loss": 1.0506,
"step": 583
},
{
"epoch": 1.7469879518072289,
"grad_norm": 0.339167058467865,
"learning_rate": 3.646807922580098e-07,
"loss": 1.027,
"step": 584
},
{
"epoch": 1.75,
"grad_norm": 0.44778549671173096,
"learning_rate": 3.557297652063768e-07,
"loss": 1.0107,
"step": 585
},
{
"epoch": 1.7530120481927711,
"grad_norm": 0.43992355465888977,
"learning_rate": 3.4688590511087304e-07,
"loss": 1.0068,
"step": 586
},
{
"epoch": 1.7560240963855422,
"grad_norm": 0.36404716968536377,
"learning_rate": 3.3814941604437155e-07,
"loss": 1.0631,
"step": 587
},
{
"epoch": 1.7590361445783134,
"grad_norm": 0.39936619997024536,
"learning_rate": 3.2952049960214785e-07,
"loss": 0.9933,
"step": 588
},
{
"epoch": 1.7620481927710845,
"grad_norm": 0.42165055871009827,
"learning_rate": 3.20999354897229e-07,
"loss": 1.0362,
"step": 589
},
{
"epoch": 1.7650602409638554,
"grad_norm": 0.41388002038002014,
"learning_rate": 3.1258617855580155e-07,
"loss": 1.048,
"step": 590
},
{
"epoch": 1.7680722891566265,
"grad_norm": 0.37040451169013977,
"learning_rate": 3.0428116471267146e-07,
"loss": 1.073,
"step": 591
},
{
"epoch": 1.7710843373493976,
"grad_norm": 0.4236885607242584,
"learning_rate": 2.9608450500678566e-07,
"loss": 1.076,
"step": 592
},
{
"epoch": 1.7740963855421685,
"grad_norm": 0.442690908908844,
"learning_rate": 2.879963885768083e-07,
"loss": 1.0546,
"step": 593
},
{
"epoch": 1.7771084337349397,
"grad_norm": 0.4121167063713074,
"learning_rate": 2.800170020567566e-07,
"loss": 1.0169,
"step": 594
},
{
"epoch": 1.7801204819277108,
"grad_norm": 0.37887606024742126,
"learning_rate": 2.721465295716996e-07,
"loss": 1.0828,
"step": 595
},
{
"epoch": 1.783132530120482,
"grad_norm": 0.380744993686676,
"learning_rate": 2.643851527335006e-07,
"loss": 1.0376,
"step": 596
},
{
"epoch": 1.786144578313253,
"grad_norm": 0.3593333065509796,
"learning_rate": 2.5673305063663335e-07,
"loss": 0.9841,
"step": 597
},
{
"epoch": 1.7891566265060241,
"grad_norm": 0.40235635638237,
"learning_rate": 2.4919039985404626e-07,
"loss": 1.0454,
"step": 598
},
{
"epoch": 1.7921686746987953,
"grad_norm": 0.3604947030544281,
"learning_rate": 2.4175737443308976e-07,
"loss": 1.0195,
"step": 599
},
{
"epoch": 1.7951807228915664,
"grad_norm": 0.3955729007720947,
"learning_rate": 2.3443414589149838e-07,
"loss": 1.0324,
"step": 600
},
{
"epoch": 1.7981927710843375,
"grad_norm": 0.3765583038330078,
"learning_rate": 2.272208832134326e-07,
"loss": 1.0905,
"step": 601
},
{
"epoch": 1.8012048192771084,
"grad_norm": 0.3759413957595825,
"learning_rate": 2.201177528455828e-07,
"loss": 1.0711,
"step": 602
},
{
"epoch": 1.8042168674698795,
"grad_norm": 0.3818155825138092,
"learning_rate": 2.131249186933243e-07,
"loss": 1.0911,
"step": 603
},
{
"epoch": 1.8072289156626506,
"grad_norm": 0.39899012446403503,
"learning_rate": 2.0624254211693894e-07,
"loss": 1.0562,
"step": 604
},
{
"epoch": 1.8102409638554215,
"grad_norm": 0.390240341424942,
"learning_rate": 1.994707819278896e-07,
"loss": 1.02,
"step": 605
},
{
"epoch": 1.8132530120481927,
"grad_norm": 0.4145658016204834,
"learning_rate": 1.9280979438515479e-07,
"loss": 1.022,
"step": 606
},
{
"epoch": 1.8162650602409638,
"grad_norm": 0.4315582811832428,
"learning_rate": 1.8625973319162605e-07,
"loss": 1.0332,
"step": 607
},
{
"epoch": 1.819277108433735,
"grad_norm": 0.40108925104141235,
"learning_rate": 1.7982074949055794e-07,
"loss": 1.0494,
"step": 608
},
{
"epoch": 1.822289156626506,
"grad_norm": 0.37406808137893677,
"learning_rate": 1.7349299186208258e-07,
"loss": 1.0744,
"step": 609
},
{
"epoch": 1.8253012048192772,
"grad_norm": 0.49371138215065,
"learning_rate": 1.6727660631977894e-07,
"loss": 1.0319,
"step": 610
},
{
"epoch": 1.8283132530120483,
"grad_norm": 0.3623702824115753,
"learning_rate": 1.6117173630730787e-07,
"loss": 1.1106,
"step": 611
},
{
"epoch": 1.8313253012048194,
"grad_norm": 0.3635117709636688,
"learning_rate": 1.5517852269509692e-07,
"loss": 1.0454,
"step": 612
},
{
"epoch": 1.8343373493975905,
"grad_norm": 0.3568932116031647,
"learning_rate": 1.492971037770924e-07,
"loss": 1.0008,
"step": 613
},
{
"epoch": 1.8373493975903614,
"grad_norm": 0.3618745803833008,
"learning_rate": 1.435276152675691e-07,
"loss": 1.0265,
"step": 614
},
{
"epoch": 1.8403614457831325,
"grad_norm": 0.3803001940250397,
"learning_rate": 1.378701902979962e-07,
"loss": 1.0643,
"step": 615
},
{
"epoch": 1.8433734939759037,
"grad_norm": 0.459064781665802,
"learning_rate": 1.323249594139664e-07,
"loss": 1.0108,
"step": 616
},
{
"epoch": 1.8463855421686746,
"grad_norm": 0.3967600166797638,
"learning_rate": 1.2689205057218602e-07,
"loss": 0.9983,
"step": 617
},
{
"epoch": 1.8493975903614457,
"grad_norm": 0.3921276926994324,
"learning_rate": 1.2157158913751687e-07,
"loss": 0.9829,
"step": 618
},
{
"epoch": 1.8524096385542168,
"grad_norm": 0.3510358929634094,
"learning_rate": 1.1636369788008973e-07,
"loss": 1.0848,
"step": 619
},
{
"epoch": 1.855421686746988,
"grad_norm": 0.33366602659225464,
"learning_rate": 1.1126849697246533e-07,
"loss": 1.0474,
"step": 620
},
{
"epoch": 1.858433734939759,
"grad_norm": 0.3874680697917938,
"learning_rate": 1.0628610398686679e-07,
"loss": 1.0968,
"step": 621
},
{
"epoch": 1.8614457831325302,
"grad_norm": 0.3490632474422455,
"learning_rate": 1.014166338924627e-07,
"loss": 1.0689,
"step": 622
},
{
"epoch": 1.8644578313253013,
"grad_norm": 0.44556349515914917,
"learning_rate": 9.666019905271662e-08,
"loss": 1.0402,
"step": 623
},
{
"epoch": 1.8674698795180724,
"grad_norm": 0.4002796411514282,
"learning_rate": 9.201690922279405e-08,
"loss": 1.0333,
"step": 624
},
{
"epoch": 1.8704819277108435,
"grad_norm": 0.4069937467575073,
"learning_rate": 8.748687154702673e-08,
"loss": 1.1043,
"step": 625
},
{
"epoch": 1.8734939759036144,
"grad_norm": 0.4305992126464844,
"learning_rate": 8.307019055644517e-08,
"loss": 1.0116,
"step": 626
},
{
"epoch": 1.8765060240963856,
"grad_norm": 0.36993542313575745,
"learning_rate": 7.876696816636276e-08,
"loss": 1.0075,
"step": 627
},
{
"epoch": 1.8795180722891565,
"grad_norm": 0.3679683208465576,
"learning_rate": 7.45773036740255e-08,
"loss": 1.0131,
"step": 628
},
{
"epoch": 1.8825301204819276,
"grad_norm": 0.39285150170326233,
"learning_rate": 7.050129375632098e-08,
"loss": 1.0376,
"step": 629
},
{
"epoch": 1.8855421686746987,
"grad_norm": 0.4058956801891327,
"learning_rate": 6.65390324675469e-08,
"loss": 1.0214,
"step": 630
},
{
"epoch": 1.8885542168674698,
"grad_norm": 0.3686175048351288,
"learning_rate": 6.269061123724163e-08,
"loss": 1.0229,
"step": 631
},
{
"epoch": 1.891566265060241,
"grad_norm": 0.37797847390174866,
"learning_rate": 5.895611886807317e-08,
"loss": 1.0389,
"step": 632
},
{
"epoch": 1.894578313253012,
"grad_norm": 0.4053489565849304,
"learning_rate": 5.533564153379134e-08,
"loss": 1.0475,
"step": 633
},
{
"epoch": 1.8975903614457832,
"grad_norm": 0.4105575382709503,
"learning_rate": 5.182926277723821e-08,
"loss": 1.029,
"step": 634
},
{
"epoch": 1.9006024096385543,
"grad_norm": 0.3795925974845886,
"learning_rate": 4.843706350842081e-08,
"loss": 1.0502,
"step": 635
},
{
"epoch": 1.9036144578313254,
"grad_norm": 0.3822070062160492,
"learning_rate": 4.515912200264427e-08,
"loss": 1.025,
"step": 636
},
{
"epoch": 1.9066265060240963,
"grad_norm": 0.389304518699646,
"learning_rate": 4.19955138987066e-08,
"loss": 1.0387,
"step": 637
},
{
"epoch": 1.9096385542168675,
"grad_norm": 0.4106118381023407,
"learning_rate": 3.894631219715006e-08,
"loss": 1.0442,
"step": 638
},
{
"epoch": 1.9126506024096386,
"grad_norm": 0.39749521017074585,
"learning_rate": 3.601158725858034e-08,
"loss": 1.0183,
"step": 639
},
{
"epoch": 1.9156626506024095,
"grad_norm": 0.37224337458610535,
"learning_rate": 3.3191406802041693e-08,
"loss": 0.9505,
"step": 640
},
{
"epoch": 1.9186746987951806,
"grad_norm": 0.4012593924999237,
"learning_rate": 3.048583590345266e-08,
"loss": 0.9986,
"step": 641
},
{
"epoch": 1.9216867469879517,
"grad_norm": 0.39663243293762207,
"learning_rate": 2.7894936994106724e-08,
"loss": 1.0163,
"step": 642
},
{
"epoch": 1.9246987951807228,
"grad_norm": 0.467464804649353,
"learning_rate": 2.5418769859231194e-08,
"loss": 1.0142,
"step": 643
},
{
"epoch": 1.927710843373494,
"grad_norm": 0.3819372355937958,
"learning_rate": 2.3057391636606698e-08,
"loss": 0.993,
"step": 644
},
{
"epoch": 1.930722891566265,
"grad_norm": 0.3579060733318329,
"learning_rate": 2.081085681524986e-08,
"loss": 1.0213,
"step": 645
},
{
"epoch": 1.9337349397590362,
"grad_norm": 0.33856555819511414,
"learning_rate": 1.8679217234154335e-08,
"loss": 1.0315,
"step": 646
},
{
"epoch": 1.9367469879518073,
"grad_norm": 0.36222004890441895,
"learning_rate": 1.6662522081097308e-08,
"loss": 0.9624,
"step": 647
},
{
"epoch": 1.9397590361445785,
"grad_norm": 0.3850827217102051,
"learning_rate": 1.4760817891500966e-08,
"loss": 1.0241,
"step": 648
},
{
"epoch": 1.9427710843373494,
"grad_norm": 0.42454764246940613,
"learning_rate": 1.2974148547362231e-08,
"loss": 0.96,
"step": 649
},
{
"epoch": 1.9457831325301205,
"grad_norm": 0.3735847473144531,
"learning_rate": 1.1302555276238581e-08,
"loss": 1.007,
"step": 650
},
{
"epoch": 1.9487951807228916,
"grad_norm": 0.3734501302242279,
"learning_rate": 9.746076650294922e-09,
"loss": 1.0119,
"step": 651
},
{
"epoch": 1.9518072289156625,
"grad_norm": 0.431612491607666,
"learning_rate": 8.304748585417077e-09,
"loss": 1.0564,
"step": 652
},
{
"epoch": 1.9548192771084336,
"grad_norm": 0.3824908137321472,
"learning_rate": 6.978604340380779e-09,
"loss": 0.9928,
"step": 653
},
{
"epoch": 1.9578313253012047,
"grad_norm": 0.38838639855384827,
"learning_rate": 5.767674516083954e-09,
"loss": 1.0136,
"step": 654
},
{
"epoch": 1.9608433734939759,
"grad_norm": 0.3842463493347168,
"learning_rate": 4.671987054842842e-09,
"loss": 1.1033,
"step": 655
},
{
"epoch": 1.963855421686747,
"grad_norm": 0.3982362449169159,
"learning_rate": 3.6915672397436208e-09,
"loss": 0.9286,
"step": 656
},
{
"epoch": 1.966867469879518,
"grad_norm": 0.4161483645439148,
"learning_rate": 2.8264376940634332e-09,
"loss": 1.0393,
"step": 657
},
{
"epoch": 1.9698795180722892,
"grad_norm": 0.37065836787223816,
"learning_rate": 2.076618380744133e-09,
"loss": 1.0168,
"step": 658
},
{
"epoch": 1.9728915662650603,
"grad_norm": 0.3696196973323822,
"learning_rate": 1.4421266019348789e-09,
"loss": 1.0211,
"step": 659
},
{
"epoch": 1.9759036144578315,
"grad_norm": 0.4084506630897522,
"learning_rate": 9.229769985902304e-10,
"loss": 1.0667,
"step": 660
},
{
"epoch": 1.9789156626506024,
"grad_norm": 0.4110511839389801,
"learning_rate": 5.191815501343067e-10,
"loss": 1.044,
"step": 661
},
{
"epoch": 1.9819277108433735,
"grad_norm": 0.36027991771698,
"learning_rate": 2.307495741843413e-10,
"loss": 1.0415,
"step": 662
},
{
"epoch": 1.9849397590361446,
"grad_norm": 0.3866373598575592,
"learning_rate": 5.768772633363284e-11,
"loss": 1.0336,
"step": 663
},
{
"epoch": 1.9879518072289155,
"grad_norm": 0.3646948039531708,
"learning_rate": 0.0,
"loss": 1.0516,
"step": 664
}
],
"logging_steps": 1,
"max_steps": 664,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 166,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.6730319840173097e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}