{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9273525109702585,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005850804485616773,
"grad_norm": 4.189145565032959,
"learning_rate": 1.9607843137254904e-07,
"loss": 0.6022,
"step": 1
},
{
"epoch": 0.011701608971233545,
"grad_norm": 4.088385105133057,
"learning_rate": 3.921568627450981e-07,
"loss": 0.6105,
"step": 2
},
{
"epoch": 0.017552413456850317,
"grad_norm": 4.105137348175049,
"learning_rate": 5.882352941176471e-07,
"loss": 0.6234,
"step": 3
},
{
"epoch": 0.02340321794246709,
"grad_norm": 4.010756015777588,
"learning_rate": 7.843137254901962e-07,
"loss": 0.5629,
"step": 4
},
{
"epoch": 0.02925402242808386,
"grad_norm": 4.201730728149414,
"learning_rate": 9.80392156862745e-07,
"loss": 0.6236,
"step": 5
},
{
"epoch": 0.035104826913700635,
"grad_norm": 4.13097620010376,
"learning_rate": 1.1764705882352942e-06,
"loss": 0.6058,
"step": 6
},
{
"epoch": 0.040955631399317405,
"grad_norm": 3.753781318664551,
"learning_rate": 1.3725490196078434e-06,
"loss": 0.5798,
"step": 7
},
{
"epoch": 0.04680643588493418,
"grad_norm": 3.1203114986419678,
"learning_rate": 1.5686274509803923e-06,
"loss": 0.5575,
"step": 8
},
{
"epoch": 0.05265724037055095,
"grad_norm": 3.1326870918273926,
"learning_rate": 1.7647058823529414e-06,
"loss": 0.5794,
"step": 9
},
{
"epoch": 0.05850804485616772,
"grad_norm": 3.01350736618042,
"learning_rate": 1.96078431372549e-06,
"loss": 0.5721,
"step": 10
},
{
"epoch": 0.0643588493417845,
"grad_norm": 2.0586817264556885,
"learning_rate": 2.1568627450980393e-06,
"loss": 0.5389,
"step": 11
},
{
"epoch": 0.07020965382740127,
"grad_norm": 2.056138753890991,
"learning_rate": 2.3529411764705885e-06,
"loss": 0.5578,
"step": 12
},
{
"epoch": 0.07606045831301804,
"grad_norm": 1.8458319902420044,
"learning_rate": 2.549019607843137e-06,
"loss": 0.5432,
"step": 13
},
{
"epoch": 0.08191126279863481,
"grad_norm": 1.3385547399520874,
"learning_rate": 2.7450980392156867e-06,
"loss": 0.5375,
"step": 14
},
{
"epoch": 0.08776206728425158,
"grad_norm": 2.10184383392334,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.4834,
"step": 15
},
{
"epoch": 0.09361287176986836,
"grad_norm": 2.354717254638672,
"learning_rate": 3.1372549019607846e-06,
"loss": 0.5087,
"step": 16
},
{
"epoch": 0.09946367625548513,
"grad_norm": 2.4186935424804688,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.5408,
"step": 17
},
{
"epoch": 0.1053144807411019,
"grad_norm": 2.02093243598938,
"learning_rate": 3.529411764705883e-06,
"loss": 0.4967,
"step": 18
},
{
"epoch": 0.11116528522671867,
"grad_norm": 1.9769740104675293,
"learning_rate": 3.7254901960784316e-06,
"loss": 0.5429,
"step": 19
},
{
"epoch": 0.11701608971233544,
"grad_norm": 1.4087600708007812,
"learning_rate": 3.92156862745098e-06,
"loss": 0.4855,
"step": 20
},
{
"epoch": 0.12286689419795221,
"grad_norm": 1.4071195125579834,
"learning_rate": 4.11764705882353e-06,
"loss": 0.4956,
"step": 21
},
{
"epoch": 0.128717698683569,
"grad_norm": 1.4400174617767334,
"learning_rate": 4.313725490196079e-06,
"loss": 0.4966,
"step": 22
},
{
"epoch": 0.13456850316918575,
"grad_norm": 1.2176562547683716,
"learning_rate": 4.509803921568628e-06,
"loss": 0.4892,
"step": 23
},
{
"epoch": 0.14041930765480254,
"grad_norm": 1.0557763576507568,
"learning_rate": 4.705882352941177e-06,
"loss": 0.4664,
"step": 24
},
{
"epoch": 0.1462701121404193,
"grad_norm": 1.0654219388961792,
"learning_rate": 4.901960784313726e-06,
"loss": 0.4427,
"step": 25
},
{
"epoch": 0.15212091662603608,
"grad_norm": 0.8639155626296997,
"learning_rate": 5.098039215686274e-06,
"loss": 0.4676,
"step": 26
},
{
"epoch": 0.15797172111165286,
"grad_norm": 0.8091264963150024,
"learning_rate": 5.294117647058824e-06,
"loss": 0.4339,
"step": 27
},
{
"epoch": 0.16382252559726962,
"grad_norm": 0.7697594165802002,
"learning_rate": 5.4901960784313735e-06,
"loss": 0.4164,
"step": 28
},
{
"epoch": 0.1696733300828864,
"grad_norm": 0.8522382378578186,
"learning_rate": 5.686274509803922e-06,
"loss": 0.4512,
"step": 29
},
{
"epoch": 0.17552413456850316,
"grad_norm": 0.7640376687049866,
"learning_rate": 5.882352941176471e-06,
"loss": 0.432,
"step": 30
},
{
"epoch": 0.18137493905411994,
"grad_norm": 0.6247867941856384,
"learning_rate": 6.07843137254902e-06,
"loss": 0.408,
"step": 31
},
{
"epoch": 0.18722574353973673,
"grad_norm": 0.6288900971412659,
"learning_rate": 6.274509803921569e-06,
"loss": 0.4611,
"step": 32
},
{
"epoch": 0.19307654802535348,
"grad_norm": 0.6182562708854675,
"learning_rate": 6.470588235294119e-06,
"loss": 0.4257,
"step": 33
},
{
"epoch": 0.19892735251097027,
"grad_norm": 0.6193389892578125,
"learning_rate": 6.666666666666667e-06,
"loss": 0.4063,
"step": 34
},
{
"epoch": 0.20477815699658702,
"grad_norm": 0.6892727017402649,
"learning_rate": 6.862745098039216e-06,
"loss": 0.3967,
"step": 35
},
{
"epoch": 0.2106289614822038,
"grad_norm": 0.6725057363510132,
"learning_rate": 7.058823529411766e-06,
"loss": 0.4428,
"step": 36
},
{
"epoch": 0.21647976596782056,
"grad_norm": 0.5203535556793213,
"learning_rate": 7.2549019607843145e-06,
"loss": 0.4151,
"step": 37
},
{
"epoch": 0.22233057045343735,
"grad_norm": 0.45232418179512024,
"learning_rate": 7.450980392156863e-06,
"loss": 0.3666,
"step": 38
},
{
"epoch": 0.22818137493905413,
"grad_norm": 0.5872768759727478,
"learning_rate": 7.647058823529411e-06,
"loss": 0.4144,
"step": 39
},
{
"epoch": 0.2340321794246709,
"grad_norm": 0.526172399520874,
"learning_rate": 7.84313725490196e-06,
"loss": 0.4346,
"step": 40
},
{
"epoch": 0.23988298391028767,
"grad_norm": 0.5474228858947754,
"learning_rate": 8.03921568627451e-06,
"loss": 0.3965,
"step": 41
},
{
"epoch": 0.24573378839590443,
"grad_norm": 0.46727877855300903,
"learning_rate": 8.23529411764706e-06,
"loss": 0.4417,
"step": 42
},
{
"epoch": 0.2515845928815212,
"grad_norm": 0.40532198548316956,
"learning_rate": 8.43137254901961e-06,
"loss": 0.3851,
"step": 43
},
{
"epoch": 0.257435397367138,
"grad_norm": 0.4897397458553314,
"learning_rate": 8.627450980392157e-06,
"loss": 0.4013,
"step": 44
},
{
"epoch": 0.26328620185275475,
"grad_norm": 0.4565890431404114,
"learning_rate": 8.823529411764707e-06,
"loss": 0.3745,
"step": 45
},
{
"epoch": 0.2691370063383715,
"grad_norm": 0.38417261838912964,
"learning_rate": 9.019607843137256e-06,
"loss": 0.3783,
"step": 46
},
{
"epoch": 0.2749878108239883,
"grad_norm": 0.40912356972694397,
"learning_rate": 9.215686274509804e-06,
"loss": 0.3879,
"step": 47
},
{
"epoch": 0.2808386153096051,
"grad_norm": 0.42792415618896484,
"learning_rate": 9.411764705882354e-06,
"loss": 0.3837,
"step": 48
},
{
"epoch": 0.28668941979522183,
"grad_norm": 0.4394405484199524,
"learning_rate": 9.607843137254903e-06,
"loss": 0.4004,
"step": 49
},
{
"epoch": 0.2925402242808386,
"grad_norm": 0.4622238576412201,
"learning_rate": 9.803921568627451e-06,
"loss": 0.409,
"step": 50
},
{
"epoch": 0.2983910287664554,
"grad_norm": 0.3894466757774353,
"learning_rate": 1e-05,
"loss": 0.3766,
"step": 51
},
{
"epoch": 0.30424183325207216,
"grad_norm": 0.39314836263656616,
"learning_rate": 9.999882884955554e-06,
"loss": 0.3418,
"step": 52
},
{
"epoch": 0.3100926377376889,
"grad_norm": 0.44764766097068787,
"learning_rate": 9.999531545308584e-06,
"loss": 0.3909,
"step": 53
},
{
"epoch": 0.3159434422233057,
"grad_norm": 0.403144896030426,
"learning_rate": 9.998945997517957e-06,
"loss": 0.3716,
"step": 54
},
{
"epoch": 0.3217942467089225,
"grad_norm": 0.4303280711174011,
"learning_rate": 9.998126269014255e-06,
"loss": 0.4026,
"step": 55
},
{
"epoch": 0.32764505119453924,
"grad_norm": 0.4083136022090912,
"learning_rate": 9.997072398198492e-06,
"loss": 0.3842,
"step": 56
},
{
"epoch": 0.333495855680156,
"grad_norm": 0.3750261664390564,
"learning_rate": 9.99578443444032e-06,
"loss": 0.3605,
"step": 57
},
{
"epoch": 0.3393466601657728,
"grad_norm": 0.43343302607536316,
"learning_rate": 9.994262438075713e-06,
"loss": 0.4119,
"step": 58
},
{
"epoch": 0.34519746465138956,
"grad_norm": 0.3778004050254822,
"learning_rate": 9.992506480404137e-06,
"loss": 0.3616,
"step": 59
},
{
"epoch": 0.3510482691370063,
"grad_norm": 0.36973798274993896,
"learning_rate": 9.990516643685222e-06,
"loss": 0.3793,
"step": 60
},
{
"epoch": 0.35689907362262313,
"grad_norm": 0.3836229145526886,
"learning_rate": 9.988293021134888e-06,
"loss": 0.3492,
"step": 61
},
{
"epoch": 0.3627498781082399,
"grad_norm": 0.3700697720050812,
"learning_rate": 9.985835716921e-06,
"loss": 0.3583,
"step": 62
},
{
"epoch": 0.36860068259385664,
"grad_norm": 0.4023352861404419,
"learning_rate": 9.983144846158472e-06,
"loss": 0.3697,
"step": 63
},
{
"epoch": 0.37445148707947346,
"grad_norm": 0.38035494089126587,
"learning_rate": 9.980220534903889e-06,
"loss": 0.3772,
"step": 64
},
{
"epoch": 0.3803022915650902,
"grad_norm": 0.3641819953918457,
"learning_rate": 9.977062920149583e-06,
"loss": 0.3562,
"step": 65
},
{
"epoch": 0.38615309605070697,
"grad_norm": 0.39018484950065613,
"learning_rate": 9.973672149817232e-06,
"loss": 0.3377,
"step": 66
},
{
"epoch": 0.3920039005363237,
"grad_norm": 0.351622998714447,
"learning_rate": 9.970048382750925e-06,
"loss": 0.351,
"step": 67
},
{
"epoch": 0.39785470502194054,
"grad_norm": 0.40039461851119995,
"learning_rate": 9.966191788709716e-06,
"loss": 0.3775,
"step": 68
},
{
"epoch": 0.4037055095075573,
"grad_norm": 0.3892274796962738,
"learning_rate": 9.96210254835968e-06,
"loss": 0.4034,
"step": 69
},
{
"epoch": 0.40955631399317405,
"grad_norm": 0.4052744507789612,
"learning_rate": 9.957780853265441e-06,
"loss": 0.4079,
"step": 70
},
{
"epoch": 0.41540711847879086,
"grad_norm": 0.3877456486225128,
"learning_rate": 9.953226905881208e-06,
"loss": 0.3342,
"step": 71
},
{
"epoch": 0.4212579229644076,
"grad_norm": 0.4107078015804291,
"learning_rate": 9.948440919541277e-06,
"loss": 0.358,
"step": 72
},
{
"epoch": 0.4271087274500244,
"grad_norm": 0.37597158551216125,
"learning_rate": 9.943423118450051e-06,
"loss": 0.3948,
"step": 73
},
{
"epoch": 0.43295953193564113,
"grad_norm": 0.4590906798839569,
"learning_rate": 9.938173737671531e-06,
"loss": 0.3847,
"step": 74
},
{
"epoch": 0.43881033642125794,
"grad_norm": 0.48799118399620056,
"learning_rate": 9.932693023118299e-06,
"loss": 0.3845,
"step": 75
},
{
"epoch": 0.4446611409068747,
"grad_norm": 0.39222586154937744,
"learning_rate": 9.926981231540007e-06,
"loss": 0.3872,
"step": 76
},
{
"epoch": 0.45051194539249145,
"grad_norm": 0.4158020615577698,
"learning_rate": 9.921038630511345e-06,
"loss": 0.388,
"step": 77
},
{
"epoch": 0.45636274987810826,
"grad_norm": 0.40331101417541504,
"learning_rate": 9.91486549841951e-06,
"loss": 0.3705,
"step": 78
},
{
"epoch": 0.462213554363725,
"grad_norm": 0.4275971055030823,
"learning_rate": 9.908462124451152e-06,
"loss": 0.3849,
"step": 79
},
{
"epoch": 0.4680643588493418,
"grad_norm": 0.3466413915157318,
"learning_rate": 9.901828808578846e-06,
"loss": 0.347,
"step": 80
},
{
"epoch": 0.47391516333495853,
"grad_norm": 0.44375771284103394,
"learning_rate": 9.894965861547023e-06,
"loss": 0.373,
"step": 81
},
{
"epoch": 0.47976596782057535,
"grad_norm": 0.38661712408065796,
"learning_rate": 9.887873604857424e-06,
"loss": 0.3702,
"step": 82
},
{
"epoch": 0.4856167723061921,
"grad_norm": 0.41488274931907654,
"learning_rate": 9.88055237075403e-06,
"loss": 0.3574,
"step": 83
},
{
"epoch": 0.49146757679180886,
"grad_norm": 0.41137149930000305,
"learning_rate": 9.873002502207502e-06,
"loss": 0.3901,
"step": 84
},
{
"epoch": 0.49731838127742567,
"grad_norm": 0.39136987924575806,
"learning_rate": 9.86522435289912e-06,
"loss": 0.38,
"step": 85
},
{
"epoch": 0.5031691857630424,
"grad_norm": 0.37086671590805054,
"learning_rate": 9.857218287204204e-06,
"loss": 0.3541,
"step": 86
},
{
"epoch": 0.5090199902486592,
"grad_norm": 0.43105342984199524,
"learning_rate": 9.848984680175049e-06,
"loss": 0.4087,
"step": 87
},
{
"epoch": 0.514870794734276,
"grad_norm": 0.36811238527297974,
"learning_rate": 9.840523917523354e-06,
"loss": 0.3639,
"step": 88
},
{
"epoch": 0.5207215992198927,
"grad_norm": 0.378967821598053,
"learning_rate": 9.831836395602164e-06,
"loss": 0.3251,
"step": 89
},
{
"epoch": 0.5265724037055095,
"grad_norm": 0.36341214179992676,
"learning_rate": 9.822922521387277e-06,
"loss": 0.3705,
"step": 90
},
{
"epoch": 0.5324232081911263,
"grad_norm": 0.37682002782821655,
"learning_rate": 9.813782712458206e-06,
"loss": 0.3513,
"step": 91
},
{
"epoch": 0.538274012676743,
"grad_norm": 0.4142582416534424,
"learning_rate": 9.804417396978605e-06,
"loss": 0.3716,
"step": 92
},
{
"epoch": 0.5441248171623598,
"grad_norm": 0.4432157278060913,
"learning_rate": 9.794827013676206e-06,
"loss": 0.4126,
"step": 93
},
{
"epoch": 0.5499756216479766,
"grad_norm": 0.47457224130630493,
"learning_rate": 9.78501201182228e-06,
"loss": 0.3941,
"step": 94
},
{
"epoch": 0.5558264261335933,
"grad_norm": 0.35374128818511963,
"learning_rate": 9.774972851210572e-06,
"loss": 0.3893,
"step": 95
},
{
"epoch": 0.5616772306192102,
"grad_norm": 0.37110310792922974,
"learning_rate": 9.764710002135784e-06,
"loss": 0.3453,
"step": 96
},
{
"epoch": 0.567528035104827,
"grad_norm": 0.4286816716194153,
"learning_rate": 9.754223945371524e-06,
"loss": 0.3674,
"step": 97
},
{
"epoch": 0.5733788395904437,
"grad_norm": 0.3735758662223816,
"learning_rate": 9.743515172147793e-06,
"loss": 0.3572,
"step": 98
},
{
"epoch": 0.5792296440760605,
"grad_norm": 0.3784080445766449,
"learning_rate": 9.732584184127973e-06,
"loss": 0.3864,
"step": 99
},
{
"epoch": 0.5850804485616772,
"grad_norm": 0.40882179141044617,
"learning_rate": 9.721431493385322e-06,
"loss": 0.3458,
"step": 100
},
{
"epoch": 0.590931253047294,
"grad_norm": 0.3924429416656494,
"learning_rate": 9.710057622378992e-06,
"loss": 0.3497,
"step": 101
},
{
"epoch": 0.5967820575329108,
"grad_norm": 0.41799789667129517,
"learning_rate": 9.698463103929542e-06,
"loss": 0.3915,
"step": 102
},
{
"epoch": 0.6026328620185275,
"grad_norm": 0.4201458990573883,
"learning_rate": 9.686648481193994e-06,
"loss": 0.3797,
"step": 103
},
{
"epoch": 0.6084836665041443,
"grad_norm": 0.3876160979270935,
"learning_rate": 9.674614307640368e-06,
"loss": 0.3667,
"step": 104
},
{
"epoch": 0.6143344709897611,
"grad_norm": 0.39733994007110596,
"learning_rate": 9.66236114702178e-06,
"loss": 0.3746,
"step": 105
},
{
"epoch": 0.6201852754753778,
"grad_norm": 0.4422380030155182,
"learning_rate": 9.649889573350006e-06,
"loss": 0.3657,
"step": 106
},
{
"epoch": 0.6260360799609946,
"grad_norm": 0.34534451365470886,
"learning_rate": 9.637200170868607e-06,
"loss": 0.3173,
"step": 107
},
{
"epoch": 0.6318868844466115,
"grad_norm": 0.49448907375335693,
"learning_rate": 9.62429353402556e-06,
"loss": 0.3528,
"step": 108
},
{
"epoch": 0.6377376889322282,
"grad_norm": 0.4157074987888336,
"learning_rate": 9.611170267445401e-06,
"loss": 0.3647,
"step": 109
},
{
"epoch": 0.643588493417845,
"grad_norm": 0.3649308383464813,
"learning_rate": 9.597830985900913e-06,
"loss": 0.3592,
"step": 110
},
{
"epoch": 0.6494392979034618,
"grad_norm": 0.38802069425582886,
"learning_rate": 9.584276314284316e-06,
"loss": 0.3749,
"step": 111
},
{
"epoch": 0.6552901023890785,
"grad_norm": 0.41905415058135986,
"learning_rate": 9.570506887577994e-06,
"loss": 0.3761,
"step": 112
},
{
"epoch": 0.6611409068746953,
"grad_norm": 0.34973040223121643,
"learning_rate": 9.556523350824759e-06,
"loss": 0.3377,
"step": 113
},
{
"epoch": 0.666991711360312,
"grad_norm": 0.42152735590934753,
"learning_rate": 9.542326359097619e-06,
"loss": 0.3758,
"step": 114
},
{
"epoch": 0.6728425158459288,
"grad_norm": 0.34654316306114197,
"learning_rate": 9.527916577469104e-06,
"loss": 0.3612,
"step": 115
},
{
"epoch": 0.6786933203315456,
"grad_norm": 0.3440297842025757,
"learning_rate": 9.5132946809801e-06,
"loss": 0.37,
"step": 116
},
{
"epoch": 0.6845441248171623,
"grad_norm": 0.36565279960632324,
"learning_rate": 9.498461354608228e-06,
"loss": 0.352,
"step": 117
},
{
"epoch": 0.6903949293027791,
"grad_norm": 0.3970431983470917,
"learning_rate": 9.483417293235759e-06,
"loss": 0.3694,
"step": 118
},
{
"epoch": 0.6962457337883959,
"grad_norm": 0.3433384895324707,
"learning_rate": 9.468163201617063e-06,
"loss": 0.3657,
"step": 119
},
{
"epoch": 0.7020965382740126,
"grad_norm": 0.39245930314064026,
"learning_rate": 9.452699794345583e-06,
"loss": 0.362,
"step": 120
},
{
"epoch": 0.7079473427596294,
"grad_norm": 0.38453614711761475,
"learning_rate": 9.437027795820373e-06,
"loss": 0.3675,
"step": 121
},
{
"epoch": 0.7137981472452463,
"grad_norm": 0.369517058134079,
"learning_rate": 9.421147940212152e-06,
"loss": 0.3634,
"step": 122
},
{
"epoch": 0.719648951730863,
"grad_norm": 0.38849949836730957,
"learning_rate": 9.405060971428924e-06,
"loss": 0.3387,
"step": 123
},
{
"epoch": 0.7254997562164798,
"grad_norm": 0.4063083231449127,
"learning_rate": 9.388767643081109e-06,
"loss": 0.3719,
"step": 124
},
{
"epoch": 0.7313505607020966,
"grad_norm": 0.40234676003456116,
"learning_rate": 9.372268718446259e-06,
"loss": 0.3939,
"step": 125
},
{
"epoch": 0.7372013651877133,
"grad_norm": 0.3845783770084381,
"learning_rate": 9.355564970433288e-06,
"loss": 0.3699,
"step": 126
},
{
"epoch": 0.7430521696733301,
"grad_norm": 0.3887750506401062,
"learning_rate": 9.338657181546277e-06,
"loss": 0.3686,
"step": 127
},
{
"epoch": 0.7489029741589469,
"grad_norm": 0.3700850307941437,
"learning_rate": 9.321546143847802e-06,
"loss": 0.3431,
"step": 128
},
{
"epoch": 0.7547537786445636,
"grad_norm": 0.44235607981681824,
"learning_rate": 9.30423265892184e-06,
"loss": 0.3836,
"step": 129
},
{
"epoch": 0.7606045831301804,
"grad_norm": 0.39945074915885925,
"learning_rate": 9.286717537836211e-06,
"loss": 0.3706,
"step": 130
},
{
"epoch": 0.7664553876157971,
"grad_norm": 0.42615601420402527,
"learning_rate": 9.269001601104593e-06,
"loss": 0.369,
"step": 131
},
{
"epoch": 0.7723061921014139,
"grad_norm": 0.4713898003101349,
"learning_rate": 9.251085678648072e-06,
"loss": 0.3818,
"step": 132
},
{
"epoch": 0.7781569965870307,
"grad_norm": 0.3744489550590515,
"learning_rate": 9.232970609756267e-06,
"loss": 0.3542,
"step": 133
},
{
"epoch": 0.7840078010726474,
"grad_norm": 0.3802720308303833,
"learning_rate": 9.214657243048021e-06,
"loss": 0.3346,
"step": 134
},
{
"epoch": 0.7898586055582643,
"grad_norm": 0.45320552587509155,
"learning_rate": 9.196146436431635e-06,
"loss": 0.3766,
"step": 135
},
{
"epoch": 0.7957094100438811,
"grad_norm": 0.3729214370250702,
"learning_rate": 9.177439057064684e-06,
"loss": 0.3694,
"step": 136
},
{
"epoch": 0.8015602145294978,
"grad_norm": 0.3678078055381775,
"learning_rate": 9.158535981313395e-06,
"loss": 0.3515,
"step": 137
},
{
"epoch": 0.8074110190151146,
"grad_norm": 0.4144746959209442,
"learning_rate": 9.13943809471159e-06,
"loss": 0.3756,
"step": 138
},
{
"epoch": 0.8132618235007314,
"grad_norm": 0.3548150658607483,
"learning_rate": 9.120146291919206e-06,
"loss": 0.3494,
"step": 139
},
{
"epoch": 0.8191126279863481,
"grad_norm": 0.3966399133205414,
"learning_rate": 9.100661476680379e-06,
"loss": 0.3427,
"step": 140
},
{
"epoch": 0.8249634324719649,
"grad_norm": 0.4523519277572632,
"learning_rate": 9.08098456178111e-06,
"loss": 0.3641,
"step": 141
},
{
"epoch": 0.8308142369575817,
"grad_norm": 0.45737963914871216,
"learning_rate": 9.061116469006504e-06,
"loss": 0.3643,
"step": 142
},
{
"epoch": 0.8366650414431984,
"grad_norm": 0.34355804324150085,
"learning_rate": 9.041058129097586e-06,
"loss": 0.3227,
"step": 143
},
{
"epoch": 0.8425158459288152,
"grad_norm": 0.4239197373390198,
"learning_rate": 9.020810481707709e-06,
"loss": 0.3604,
"step": 144
},
{
"epoch": 0.8483666504144319,
"grad_norm": 0.4363431930541992,
"learning_rate": 9.00037447535852e-06,
"loss": 0.3785,
"step": 145
},
{
"epoch": 0.8542174549000487,
"grad_norm": 0.383635550737381,
"learning_rate": 8.979751067395534e-06,
"loss": 0.355,
"step": 146
},
{
"epoch": 0.8600682593856656,
"grad_norm": 0.3972126543521881,
"learning_rate": 8.958941223943292e-06,
"loss": 0.394,
"step": 147
},
{
"epoch": 0.8659190638712823,
"grad_norm": 0.3762996196746826,
"learning_rate": 8.937945919860086e-06,
"loss": 0.3779,
"step": 148
},
{
"epoch": 0.8717698683568991,
"grad_norm": 0.40220147371292114,
"learning_rate": 8.916766138692303e-06,
"loss": 0.3725,
"step": 149
},
{
"epoch": 0.8776206728425159,
"grad_norm": 0.35849395394325256,
"learning_rate": 8.895402872628352e-06,
"loss": 0.3533,
"step": 150
},
{
"epoch": 0.8834714773281326,
"grad_norm": 0.3301231861114502,
"learning_rate": 8.873857122452174e-06,
"loss": 0.3156,
"step": 151
},
{
"epoch": 0.8893222818137494,
"grad_norm": 0.39462047815322876,
"learning_rate": 8.852129897496367e-06,
"loss": 0.3538,
"step": 152
},
{
"epoch": 0.8951730862993662,
"grad_norm": 0.3844425082206726,
"learning_rate": 8.83022221559489e-06,
"loss": 0.3913,
"step": 153
},
{
"epoch": 0.9010238907849829,
"grad_norm": 0.37792298197746277,
"learning_rate": 8.808135103035407e-06,
"loss": 0.3495,
"step": 154
},
{
"epoch": 0.9068746952705997,
"grad_norm": 0.39290040731430054,
"learning_rate": 8.785869594511182e-06,
"loss": 0.3784,
"step": 155
},
{
"epoch": 0.9127254997562165,
"grad_norm": 0.3619037866592407,
"learning_rate": 8.763426733072624e-06,
"loss": 0.3614,
"step": 156
},
{
"epoch": 0.9185763042418332,
"grad_norm": 0.3633933663368225,
"learning_rate": 8.740807570078419e-06,
"loss": 0.3902,
"step": 157
},
{
"epoch": 0.92442710872745,
"grad_norm": 0.3714929223060608,
"learning_rate": 8.718013165146275e-06,
"loss": 0.3274,
"step": 158
},
{
"epoch": 0.9302779132130667,
"grad_norm": 0.38371893763542175,
"learning_rate": 8.695044586103297e-06,
"loss": 0.3507,
"step": 159
},
{
"epoch": 0.9361287176986836,
"grad_norm": 0.34635236859321594,
"learning_rate": 8.671902908935942e-06,
"loss": 0.3275,
"step": 160
},
{
"epoch": 0.9419795221843004,
"grad_norm": 0.34420835971832275,
"learning_rate": 8.648589217739635e-06,
"loss": 0.3461,
"step": 161
},
{
"epoch": 0.9478303266699171,
"grad_norm": 0.3969476819038391,
"learning_rate": 8.625104604667965e-06,
"loss": 0.3579,
"step": 162
},
{
"epoch": 0.9536811311555339,
"grad_norm": 0.3697619140148163,
"learning_rate": 8.601450169881533e-06,
"loss": 0.3476,
"step": 163
},
{
"epoch": 0.9595319356411507,
"grad_norm": 0.3809903860092163,
"learning_rate": 8.577627021496413e-06,
"loss": 0.36,
"step": 164
},
{
"epoch": 0.9653827401267674,
"grad_norm": 0.3934761881828308,
"learning_rate": 8.553636275532236e-06,
"loss": 0.3704,
"step": 165
},
{
"epoch": 0.9712335446123842,
"grad_norm": 0.3420058786869049,
"learning_rate": 8.529479055859918e-06,
"loss": 0.3335,
"step": 166
},
{
"epoch": 0.977084349098001,
"grad_norm": 0.3801231384277344,
"learning_rate": 8.505156494148997e-06,
"loss": 0.3723,
"step": 167
},
{
"epoch": 0.9829351535836177,
"grad_norm": 0.38984423875808716,
"learning_rate": 8.480669729814635e-06,
"loss": 0.3563,
"step": 168
},
{
"epoch": 0.9887859580692345,
"grad_norm": 0.369872123003006,
"learning_rate": 8.456019909964224e-06,
"loss": 0.3494,
"step": 169
},
{
"epoch": 0.9946367625548513,
"grad_norm": 0.3835128843784332,
"learning_rate": 8.43120818934367e-06,
"loss": 0.3672,
"step": 170
},
{
"epoch": 1.0014627011214041,
"grad_norm": 0.4482472538948059,
"learning_rate": 8.40623573028327e-06,
"loss": 0.4454,
"step": 171
},
{
"epoch": 1.007313505607021,
"grad_norm": 0.45144927501678467,
"learning_rate": 8.381103702643295e-06,
"loss": 0.3454,
"step": 172
},
{
"epoch": 1.0131643100926377,
"grad_norm": 0.3322243094444275,
"learning_rate": 8.35581328375915e-06,
"loss": 0.2828,
"step": 173
},
{
"epoch": 1.0190151145782544,
"grad_norm": 0.397659033536911,
"learning_rate": 8.330365658386252e-06,
"loss": 0.3287,
"step": 174
},
{
"epoch": 1.0248659190638714,
"grad_norm": 0.3485862910747528,
"learning_rate": 8.30476201864451e-06,
"loss": 0.2744,
"step": 175
},
{
"epoch": 1.030716723549488,
"grad_norm": 0.3832169473171234,
"learning_rate": 8.27900356396249e-06,
"loss": 0.2868,
"step": 176
},
{
"epoch": 1.0365675280351048,
"grad_norm": 0.4184396266937256,
"learning_rate": 8.25309150102121e-06,
"loss": 0.3291,
"step": 177
},
{
"epoch": 1.0424183325207217,
"grad_norm": 0.45518970489501953,
"learning_rate": 8.227027043697642e-06,
"loss": 0.3489,
"step": 178
},
{
"epoch": 1.0482691370063384,
"grad_norm": 0.3730817437171936,
"learning_rate": 8.200811413007808e-06,
"loss": 0.3055,
"step": 179
},
{
"epoch": 1.054119941491955,
"grad_norm": 0.398185133934021,
"learning_rate": 8.174445837049614e-06,
"loss": 0.326,
"step": 180
},
{
"epoch": 1.059970745977572,
"grad_norm": 0.4147329032421112,
"learning_rate": 8.147931550945301e-06,
"loss": 0.2961,
"step": 181
},
{
"epoch": 1.0658215504631887,
"grad_norm": 0.4088496267795563,
"learning_rate": 8.121269796783585e-06,
"loss": 0.3239,
"step": 182
},
{
"epoch": 1.0716723549488054,
"grad_norm": 0.35450735688209534,
"learning_rate": 8.094461823561473e-06,
"loss": 0.2851,
"step": 183
},
{
"epoch": 1.0775231594344223,
"grad_norm": 0.4081903100013733,
"learning_rate": 8.06750888712576e-06,
"loss": 0.3188,
"step": 184
},
{
"epoch": 1.083373963920039,
"grad_norm": 0.3934895396232605,
"learning_rate": 8.040412250114184e-06,
"loss": 0.2891,
"step": 185
},
{
"epoch": 1.0892247684056557,
"grad_norm": 0.35631951689720154,
"learning_rate": 8.013173181896283e-06,
"loss": 0.2667,
"step": 186
},
{
"epoch": 1.0950755728912727,
"grad_norm": 0.42703738808631897,
"learning_rate": 7.985792958513932e-06,
"loss": 0.312,
"step": 187
},
{
"epoch": 1.1009263773768894,
"grad_norm": 0.4023725986480713,
"learning_rate": 7.958272862621562e-06,
"loss": 0.3343,
"step": 188
},
{
"epoch": 1.106777181862506,
"grad_norm": 0.3514081537723541,
"learning_rate": 7.930614183426074e-06,
"loss": 0.2959,
"step": 189
},
{
"epoch": 1.1126279863481228,
"grad_norm": 0.40648946166038513,
"learning_rate": 7.902818216626446e-06,
"loss": 0.3529,
"step": 190
},
{
"epoch": 1.1184787908337397,
"grad_norm": 0.38296204805374146,
"learning_rate": 7.874886264353035e-06,
"loss": 0.2988,
"step": 191
},
{
"epoch": 1.1243295953193564,
"grad_norm": 0.4062958061695099,
"learning_rate": 7.846819635106569e-06,
"loss": 0.3344,
"step": 192
},
{
"epoch": 1.130180399804973,
"grad_norm": 0.3408312499523163,
"learning_rate": 7.818619643696863e-06,
"loss": 0.2857,
"step": 193
},
{
"epoch": 1.13603120429059,
"grad_norm": 0.3789331316947937,
"learning_rate": 7.790287611181217e-06,
"loss": 0.3077,
"step": 194
},
{
"epoch": 1.1418820087762067,
"grad_norm": 0.38520050048828125,
"learning_rate": 7.76182486480253e-06,
"loss": 0.3025,
"step": 195
},
{
"epoch": 1.1477328132618234,
"grad_norm": 0.3634053170681,
"learning_rate": 7.733232737927123e-06,
"loss": 0.3037,
"step": 196
},
{
"epoch": 1.1535836177474403,
"grad_norm": 0.42052581906318665,
"learning_rate": 7.70451256998228e-06,
"loss": 0.304,
"step": 197
},
{
"epoch": 1.159434422233057,
"grad_norm": 0.3758928179740906,
"learning_rate": 7.675665706393502e-06,
"loss": 0.2755,
"step": 198
},
{
"epoch": 1.1652852267186737,
"grad_norm": 0.35784485936164856,
"learning_rate": 7.646693498521472e-06,
"loss": 0.2876,
"step": 199
},
{
"epoch": 1.1711360312042907,
"grad_norm": 0.38650694489479065,
"learning_rate": 7.617597303598754e-06,
"loss": 0.288,
"step": 200
},
{
"epoch": 1.1769868356899074,
"grad_norm": 0.3944965898990631,
"learning_rate": 7.588378484666214e-06,
"loss": 0.3211,
"step": 201
},
{
"epoch": 1.182837640175524,
"grad_norm": 0.3851556181907654,
"learning_rate": 7.559038410509161e-06,
"loss": 0.3389,
"step": 202
},
{
"epoch": 1.188688444661141,
"grad_norm": 0.3507968783378601,
"learning_rate": 7.529578455593232e-06,
"loss": 0.2943,
"step": 203
},
{
"epoch": 1.1945392491467577,
"grad_norm": 0.3462185561656952,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3112,
"step": 204
},
{
"epoch": 1.2003900536323744,
"grad_norm": 0.3465600609779358,
"learning_rate": 7.47030442936232e-06,
"loss": 0.3165,
"step": 205
},
{
"epoch": 1.2062408581179913,
"grad_norm": 0.3432478904724121,
"learning_rate": 7.440493134799425e-06,
"loss": 0.2977,
"step": 206
},
{
"epoch": 1.212091662603608,
"grad_norm": 0.3325629234313965,
"learning_rate": 7.4105675128517456e-06,
"loss": 0.2809,
"step": 207
},
{
"epoch": 1.2179424670892247,
"grad_norm": 0.37305665016174316,
"learning_rate": 7.380528965415501e-06,
"loss": 0.3494,
"step": 208
},
{
"epoch": 1.2237932715748416,
"grad_norm": 0.3855370283126831,
"learning_rate": 7.35037889967702e-06,
"loss": 0.331,
"step": 209
},
{
"epoch": 1.2296440760604583,
"grad_norm": 0.38624921441078186,
"learning_rate": 7.320118728046818e-06,
"loss": 0.3249,
"step": 210
},
{
"epoch": 1.235494880546075,
"grad_norm": 0.339275985956192,
"learning_rate": 7.289749868093432e-06,
"loss": 0.2979,
"step": 211
},
{
"epoch": 1.2413456850316917,
"grad_norm": 0.362403929233551,
"learning_rate": 7.259273742477017e-06,
"loss": 0.3071,
"step": 212
},
{
"epoch": 1.2471964895173087,
"grad_norm": 0.331527978181839,
"learning_rate": 7.2286917788826926e-06,
"loss": 0.2959,
"step": 213
},
{
"epoch": 1.2530472940029254,
"grad_norm": 0.34029752016067505,
"learning_rate": 7.19800540995367e-06,
"loss": 0.2873,
"step": 214
},
{
"epoch": 1.258898098488542,
"grad_norm": 0.38359367847442627,
"learning_rate": 7.167216073224136e-06,
"loss": 0.3215,
"step": 215
},
{
"epoch": 1.264748902974159,
"grad_norm": 0.3701342046260834,
"learning_rate": 7.136325211051905e-06,
"loss": 0.2931,
"step": 216
},
{
"epoch": 1.2705997074597757,
"grad_norm": 0.3997856080532074,
"learning_rate": 7.1053342705508564e-06,
"loss": 0.319,
"step": 217
},
{
"epoch": 1.2764505119453924,
"grad_norm": 0.3141786456108093,
"learning_rate": 7.074244703523137e-06,
"loss": 0.2628,
"step": 218
},
{
"epoch": 1.2823013164310093,
"grad_norm": 0.363447368144989,
"learning_rate": 7.043057966391158e-06,
"loss": 0.3079,
"step": 219
},
{
"epoch": 1.288152120916626,
"grad_norm": 0.3675538897514343,
"learning_rate": 7.011775520129363e-06,
"loss": 0.2912,
"step": 220
},
{
"epoch": 1.2940029254022427,
"grad_norm": 0.3745831251144409,
"learning_rate": 6.980398830195785e-06,
"loss": 0.287,
"step": 221
},
{
"epoch": 1.2998537298878596,
"grad_norm": 0.34273862838745117,
"learning_rate": 6.948929366463397e-06,
"loss": 0.2739,
"step": 222
},
{
"epoch": 1.3057045343734763,
"grad_norm": 0.38599085807800293,
"learning_rate": 6.9173686031512595e-06,
"loss": 0.3386,
"step": 223
},
{
"epoch": 1.311555338859093,
"grad_norm": 0.35338225960731506,
"learning_rate": 6.885718018755448e-06,
"loss": 0.3034,
"step": 224
},
{
"epoch": 1.31740614334471,
"grad_norm": 0.35684457421302795,
"learning_rate": 6.8539790959798045e-06,
"loss": 0.3159,
"step": 225
},
{
"epoch": 1.3232569478303267,
"grad_norm": 0.342815101146698,
"learning_rate": 6.822153321666469e-06,
"loss": 0.3237,
"step": 226
},
{
"epoch": 1.3291077523159434,
"grad_norm": 0.36875948309898376,
"learning_rate": 6.790242186726231e-06,
"loss": 0.3084,
"step": 227
},
{
"epoch": 1.3349585568015603,
"grad_norm": 0.37179967761039734,
"learning_rate": 6.758247186068684e-06,
"loss": 0.3171,
"step": 228
},
{
"epoch": 1.340809361287177,
"grad_norm": 0.35630038380622864,
"learning_rate": 6.7261698185322e-06,
"loss": 0.3041,
"step": 229
},
{
"epoch": 1.3466601657727937,
"grad_norm": 0.39249274134635925,
"learning_rate": 6.6940115868137065e-06,
"loss": 0.2953,
"step": 230
},
{
"epoch": 1.3525109702584106,
"grad_norm": 0.3363463878631592,
"learning_rate": 6.6617739973982985e-06,
"loss": 0.3005,
"step": 231
},
{
"epoch": 1.3583617747440273,
"grad_norm": 0.36309415102005005,
"learning_rate": 6.629458560488664e-06,
"loss": 0.3415,
"step": 232
},
{
"epoch": 1.364212579229644,
"grad_norm": 0.3635103106498718,
"learning_rate": 6.597066789934336e-06,
"loss": 0.3117,
"step": 233
},
{
"epoch": 1.370063383715261,
"grad_norm": 0.3717254102230072,
"learning_rate": 6.5646002031607726e-06,
"loss": 0.3336,
"step": 234
},
{
"epoch": 1.3759141882008776,
"grad_norm": 0.3539208173751831,
"learning_rate": 6.5320603210982745e-06,
"loss": 0.3335,
"step": 235
},
{
"epoch": 1.3817649926864943,
"grad_norm": 0.3605196475982666,
"learning_rate": 6.499448668110735e-06,
"loss": 0.319,
"step": 236
},
{
"epoch": 1.3876157971721113,
"grad_norm": 0.39067190885543823,
"learning_rate": 6.466766771924231e-06,
"loss": 0.3104,
"step": 237
},
{
"epoch": 1.393466601657728,
"grad_norm": 0.3777407705783844,
"learning_rate": 6.434016163555452e-06,
"loss": 0.3069,
"step": 238
},
{
"epoch": 1.3993174061433447,
"grad_norm": 0.34741804003715515,
"learning_rate": 6.401198377239979e-06,
"loss": 0.2852,
"step": 239
},
{
"epoch": 1.4051682106289616,
"grad_norm": 0.3834282457828522,
"learning_rate": 6.368314950360416e-06,
"loss": 0.3474,
"step": 240
},
{
"epoch": 1.4110190151145783,
"grad_norm": 0.3760935664176941,
"learning_rate": 6.3353674233743585e-06,
"loss": 0.3136,
"step": 241
},
{
"epoch": 1.416869819600195,
"grad_norm": 0.3629906475543976,
"learning_rate": 6.302357339742245e-06,
"loss": 0.3403,
"step": 242
},
{
"epoch": 1.422720624085812,
"grad_norm": 0.342675119638443,
"learning_rate": 6.269286245855039e-06,
"loss": 0.2915,
"step": 243
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.3933790326118469,
"learning_rate": 6.236155690961795e-06,
"loss": 0.3048,
"step": 244
},
{
"epoch": 1.4344222330570453,
"grad_norm": 0.35148119926452637,
"learning_rate": 6.202967227097073e-06,
"loss": 0.3072,
"step": 245
},
{
"epoch": 1.4402730375426622,
"grad_norm": 0.3553239405155182,
"learning_rate": 6.169722409008244e-06,
"loss": 0.2988,
"step": 246
},
{
"epoch": 1.446123842028279,
"grad_norm": 0.39217159152030945,
"learning_rate": 6.136422794082645e-06,
"loss": 0.2945,
"step": 247
},
{
"epoch": 1.4519746465138956,
"grad_norm": 0.39117711782455444,
"learning_rate": 6.10306994227463e-06,
"loss": 0.3038,
"step": 248
},
{
"epoch": 1.4578254509995126,
"grad_norm": 0.3591575026512146,
"learning_rate": 6.0696654160324875e-06,
"loss": 0.3136,
"step": 249
},
{
"epoch": 1.4636762554851293,
"grad_norm": 0.4656267464160919,
"learning_rate": 6.0362107802252486e-06,
"loss": 0.3496,
"step": 250
},
{
"epoch": 1.469527059970746,
"grad_norm": 0.3674546778202057,
"learning_rate": 6.002707602069377e-06,
"loss": 0.3121,
"step": 251
},
{
"epoch": 1.4753778644563629,
"grad_norm": 0.4174729585647583,
"learning_rate": 5.9691574510553505e-06,
"loss": 0.3121,
"step": 252
},
{
"epoch": 1.4812286689419796,
"grad_norm": 0.3748752176761627,
"learning_rate": 5.935561898874142e-06,
"loss": 0.3125,
"step": 253
},
{
"epoch": 1.4870794734275963,
"grad_norm": 0.3187505006790161,
"learning_rate": 5.901922519343586e-06,
"loss": 0.3013,
"step": 254
},
{
"epoch": 1.4929302779132132,
"grad_norm": 0.34686118364334106,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.3099,
"step": 255
},
{
"epoch": 1.49878108239883,
"grad_norm": 0.38693419098854065,
"learning_rate": 5.834518583697628e-06,
"loss": 0.343,
"step": 256
},
{
"epoch": 1.5046318868844466,
"grad_norm": 0.38468196988105774,
"learning_rate": 5.800757185188195e-06,
"loss": 0.3152,
"step": 257
},
{
"epoch": 1.5104826913700635,
"grad_norm": 0.3720076084136963,
"learning_rate": 5.766958274393428e-06,
"loss": 0.3289,
"step": 258
},
{
"epoch": 1.51633349585568,
"grad_norm": 0.3495715260505676,
"learning_rate": 5.733123434657704e-06,
"loss": 0.3268,
"step": 259
},
{
"epoch": 1.522184300341297,
"grad_norm": 0.33257222175598145,
"learning_rate": 5.699254251008524e-06,
"loss": 0.306,
"step": 260
},
{
"epoch": 1.5280351048269138,
"grad_norm": 0.35938987135887146,
"learning_rate": 5.66535231008227e-06,
"loss": 0.3221,
"step": 261
},
{
"epoch": 1.5338859093125303,
"grad_norm": 0.3358217477798462,
"learning_rate": 5.631419200049867e-06,
"loss": 0.3109,
"step": 262
},
{
"epoch": 1.5397367137981472,
"grad_norm": 0.3260052502155304,
"learning_rate": 5.597456510542395e-06,
"loss": 0.2735,
"step": 263
},
{
"epoch": 1.5455875182837642,
"grad_norm": 0.3558763861656189,
"learning_rate": 5.5634658325766066e-06,
"loss": 0.3133,
"step": 264
},
{
"epoch": 1.5514383227693807,
"grad_norm": 0.34226661920547485,
"learning_rate": 5.529448758480408e-06,
"loss": 0.301,
"step": 265
},
{
"epoch": 1.5572891272549976,
"grad_norm": 0.40270325541496277,
"learning_rate": 5.495406881818256e-06,
"loss": 0.3427,
"step": 266
},
{
"epoch": 1.5631399317406145,
"grad_norm": 0.3240657150745392,
"learning_rate": 5.46134179731651e-06,
"loss": 0.2948,
"step": 267
},
{
"epoch": 1.568990736226231,
"grad_norm": 0.36010023951530457,
"learning_rate": 5.427255100788726e-06,
"loss": 0.2869,
"step": 268
},
{
"epoch": 1.574841540711848,
"grad_norm": 0.3521655797958374,
"learning_rate": 5.393148389060893e-06,
"loss": 0.2908,
"step": 269
},
{
"epoch": 1.5806923451974646,
"grad_norm": 0.3522508442401886,
"learning_rate": 5.359023259896638e-06,
"loss": 0.3222,
"step": 270
},
{
"epoch": 1.5865431496830813,
"grad_norm": 0.358254075050354,
"learning_rate": 5.3248813119223665e-06,
"loss": 0.3191,
"step": 271
},
{
"epoch": 1.5923939541686982,
"grad_norm": 0.36198315024375916,
"learning_rate": 5.290724144552379e-06,
"loss": 0.315,
"step": 272
},
{
"epoch": 1.598244758654315,
"grad_norm": 0.353097528219223,
"learning_rate": 5.2565533579139484e-06,
"loss": 0.3015,
"step": 273
},
{
"epoch": 1.6040955631399316,
"grad_norm": 0.35641244053840637,
"learning_rate": 5.222370552772353e-06,
"loss": 0.3108,
"step": 274
},
{
"epoch": 1.6099463676255485,
"grad_norm": 0.35300660133361816,
"learning_rate": 5.188177330455886e-06,
"loss": 0.3443,
"step": 275
},
{
"epoch": 1.6157971721111652,
"grad_norm": 0.33080846071243286,
"learning_rate": 5.153975292780852e-06,
"loss": 0.2871,
"step": 276
},
{
"epoch": 1.621647976596782,
"grad_norm": 0.33396315574645996,
"learning_rate": 5.119766041976516e-06,
"loss": 0.3089,
"step": 277
},
{
"epoch": 1.6274987810823989,
"grad_norm": 0.34597212076187134,
"learning_rate": 5.085551180610046e-06,
"loss": 0.2817,
"step": 278
},
{
"epoch": 1.6333495855680156,
"grad_norm": 0.3279144763946533,
"learning_rate": 5.05133231151145e-06,
"loss": 0.2944,
"step": 279
},
{
"epoch": 1.6392003900536323,
"grad_norm": 0.3529197871685028,
"learning_rate": 5.017111037698477e-06,
"loss": 0.3195,
"step": 280
},
{
"epoch": 1.6450511945392492,
"grad_norm": 0.36540284752845764,
"learning_rate": 4.9828889623015265e-06,
"loss": 0.3282,
"step": 281
},
{
"epoch": 1.650901999024866,
"grad_norm": 0.33339953422546387,
"learning_rate": 4.948667688488552e-06,
"loss": 0.2907,
"step": 282
},
{
"epoch": 1.6567528035104826,
"grad_norm": 0.32981109619140625,
"learning_rate": 4.9144488193899546e-06,
"loss": 0.2982,
"step": 283
},
{
"epoch": 1.6626036079960995,
"grad_norm": 0.33798947930336,
"learning_rate": 4.880233958023486e-06,
"loss": 0.2964,
"step": 284
},
{
"epoch": 1.6684544124817162,
"grad_norm": 0.3474103808403015,
"learning_rate": 4.846024707219149e-06,
"loss": 0.3301,
"step": 285
},
{
"epoch": 1.674305216967333,
"grad_norm": 0.3323943316936493,
"learning_rate": 4.811822669544115e-06,
"loss": 0.3014,
"step": 286
},
{
"epoch": 1.6801560214529498,
"grad_norm": 0.38225099444389343,
"learning_rate": 4.777629447227649e-06,
"loss": 0.3389,
"step": 287
},
{
"epoch": 1.6860068259385665,
"grad_norm": 0.3148108720779419,
"learning_rate": 4.7434466420860515e-06,
"loss": 0.298,
"step": 288
},
{
"epoch": 1.6918576304241832,
"grad_norm": 0.3262878656387329,
"learning_rate": 4.7092758554476215e-06,
"loss": 0.29,
"step": 289
},
{
"epoch": 1.6977084349098002,
"grad_norm": 0.3702300190925598,
"learning_rate": 4.675118688077634e-06,
"loss": 0.327,
"step": 290
},
{
"epoch": 1.7035592393954169,
"grad_norm": 0.3070249855518341,
"learning_rate": 4.640976740103363e-06,
"loss": 0.2918,
"step": 291
},
{
"epoch": 1.7094100438810336,
"grad_norm": 0.3508608937263489,
"learning_rate": 4.606851610939108e-06,
"loss": 0.3251,
"step": 292
},
{
"epoch": 1.7152608483666505,
"grad_norm": 0.3425685465335846,
"learning_rate": 4.572744899211275e-06,
"loss": 0.3039,
"step": 293
},
{
"epoch": 1.7211116528522672,
"grad_norm": 0.33032500743865967,
"learning_rate": 4.53865820268349e-06,
"loss": 0.2874,
"step": 294
},
{
"epoch": 1.726962457337884,
"grad_norm": 0.34354081749916077,
"learning_rate": 4.504593118181745e-06,
"loss": 0.293,
"step": 295
},
{
"epoch": 1.7328132618235008,
"grad_norm": 0.35744139552116394,
"learning_rate": 4.470551241519594e-06,
"loss": 0.3136,
"step": 296
},
{
"epoch": 1.7386640663091175,
"grad_norm": 0.34493860602378845,
"learning_rate": 4.436534167423395e-06,
"loss": 0.2967,
"step": 297
},
{
"epoch": 1.7445148707947342,
"grad_norm": 0.35344043374061584,
"learning_rate": 4.402543489457607e-06,
"loss": 0.3073,
"step": 298
},
{
"epoch": 1.7503656752803511,
"grad_norm": 0.3236096203327179,
"learning_rate": 4.368580799950133e-06,
"loss": 0.3045,
"step": 299
},
{
"epoch": 1.7562164797659678,
"grad_norm": 0.32016465067863464,
"learning_rate": 4.334647689917734e-06,
"loss": 0.2846,
"step": 300
},
{
"epoch": 1.7620672842515845,
"grad_norm": 0.3745932877063751,
"learning_rate": 4.300745748991478e-06,
"loss": 0.3333,
"step": 301
},
{
"epoch": 1.7679180887372015,
"grad_norm": 0.387076735496521,
"learning_rate": 4.266876565342298e-06,
"loss": 0.3218,
"step": 302
},
{
"epoch": 1.7737688932228182,
"grad_norm": 0.3995639979839325,
"learning_rate": 4.233041725606573e-06,
"loss": 0.3007,
"step": 303
},
{
"epoch": 1.7796196977084349,
"grad_norm": 0.3345247507095337,
"learning_rate": 4.199242814811807e-06,
"loss": 0.3214,
"step": 304
},
{
"epoch": 1.7854705021940518,
"grad_norm": 0.3709820806980133,
"learning_rate": 4.1654814163023735e-06,
"loss": 0.3168,
"step": 305
},
{
"epoch": 1.7913213066796685,
"grad_norm": 0.34402501583099365,
"learning_rate": 4.131759111665349e-06,
"loss": 0.2967,
"step": 306
},
{
"epoch": 1.7971721111652852,
"grad_norm": 0.3674980103969574,
"learning_rate": 4.098077480656415e-06,
"loss": 0.3069,
"step": 307
},
{
"epoch": 1.8030229156509021,
"grad_norm": 0.35379621386528015,
"learning_rate": 4.064438101125859e-06,
"loss": 0.3105,
"step": 308
},
{
"epoch": 1.8088737201365188,
"grad_norm": 0.41910186409950256,
"learning_rate": 4.03084254894465e-06,
"loss": 0.3471,
"step": 309
},
{
"epoch": 1.8147245246221355,
"grad_norm": 0.3440791964530945,
"learning_rate": 3.997292397930624e-06,
"loss": 0.2799,
"step": 310
},
{
"epoch": 1.8205753291077524,
"grad_norm": 0.3493747413158417,
"learning_rate": 3.963789219774753e-06,
"loss": 0.3011,
"step": 311
},
{
"epoch": 1.8264261335933691,
"grad_norm": 0.3454689681529999,
"learning_rate": 3.930334583967514e-06,
"loss": 0.2977,
"step": 312
},
{
"epoch": 1.8322769380789858,
"grad_norm": 0.3456018567085266,
"learning_rate": 3.896930057725372e-06,
"loss": 0.3083,
"step": 313
},
{
"epoch": 1.8381277425646028,
"grad_norm": 0.3650881052017212,
"learning_rate": 3.863577205917356e-06,
"loss": 0.292,
"step": 314
},
{
"epoch": 1.8439785470502192,
"grad_norm": 0.37091773748397827,
"learning_rate": 3.8302775909917585e-06,
"loss": 0.3371,
"step": 315
},
{
"epoch": 1.8498293515358362,
"grad_norm": 0.34685999155044556,
"learning_rate": 3.7970327729029288e-06,
"loss": 0.303,
"step": 316
},
{
"epoch": 1.855680156021453,
"grad_norm": 0.3407152593135834,
"learning_rate": 3.7638443090382067e-06,
"loss": 0.3268,
"step": 317
},
{
"epoch": 1.8615309605070696,
"grad_norm": 0.3154624104499817,
"learning_rate": 3.730713754144961e-06,
"loss": 0.2752,
"step": 318
},
{
"epoch": 1.8673817649926865,
"grad_norm": 0.3909953534603119,
"learning_rate": 3.6976426602577565e-06,
"loss": 0.3347,
"step": 319
},
{
"epoch": 1.8732325694783034,
"grad_norm": 0.3309001922607422,
"learning_rate": 3.6646325766256423e-06,
"loss": 0.2865,
"step": 320
},
{
"epoch": 1.87908337396392,
"grad_norm": 0.32067787647247314,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.3059,
"step": 321
},
{
"epoch": 1.8849341784495368,
"grad_norm": 0.35044965147972107,
"learning_rate": 3.598801622760021e-06,
"loss": 0.311,
"step": 322
},
{
"epoch": 1.8907849829351537,
"grad_norm": 0.3437960743904114,
"learning_rate": 3.5659838364445505e-06,
"loss": 0.2987,
"step": 323
},
{
"epoch": 1.8966357874207702,
"grad_norm": 0.372930645942688,
"learning_rate": 3.5332332280757706e-06,
"loss": 0.3277,
"step": 324
},
{
"epoch": 1.9024865919063871,
"grad_norm": 0.33684802055358887,
"learning_rate": 3.5005513318892666e-06,
"loss": 0.304,
"step": 325
},
{
"epoch": 1.908337396392004,
"grad_norm": 0.34415948390960693,
"learning_rate": 3.4679396789017263e-06,
"loss": 0.3257,
"step": 326
},
{
"epoch": 1.9141882008776205,
"grad_norm": 0.3257143795490265,
"learning_rate": 3.4353997968392295e-06,
"loss": 0.2821,
"step": 327
},
{
"epoch": 1.9200390053632375,
"grad_norm": 0.33275625109672546,
"learning_rate": 3.402933210065665e-06,
"loss": 0.3002,
"step": 328
},
{
"epoch": 1.9258898098488544,
"grad_norm": 0.31569424271583557,
"learning_rate": 3.3705414395113354e-06,
"loss": 0.2999,
"step": 329
},
{
"epoch": 1.9317406143344709,
"grad_norm": 0.3612264394760132,
"learning_rate": 3.3382260026017027e-06,
"loss": 0.3035,
"step": 330
},
{
"epoch": 1.9375914188200878,
"grad_norm": 0.3221355378627777,
"learning_rate": 3.305988413186295e-06,
"loss": 0.2915,
"step": 331
},
{
"epoch": 1.9434422233057045,
"grad_norm": 0.34144338965415955,
"learning_rate": 3.2738301814678015e-06,
"loss": 0.288,
"step": 332
},
{
"epoch": 1.9492930277913212,
"grad_norm": 0.35333582758903503,
"learning_rate": 3.241752813931316e-06,
"loss": 0.3185,
"step": 333
},
{
"epoch": 1.955143832276938,
"grad_norm": 0.3483865559101105,
"learning_rate": 3.2097578132737716e-06,
"loss": 0.294,
"step": 334
},
{
"epoch": 1.9609946367625548,
"grad_norm": 0.4015137851238251,
"learning_rate": 3.1778466783335328e-06,
"loss": 0.3608,
"step": 335
},
{
"epoch": 1.9668454412481715,
"grad_norm": 0.35391393303871155,
"learning_rate": 3.1460209040201967e-06,
"loss": 0.2948,
"step": 336
},
{
"epoch": 1.9726962457337884,
"grad_norm": 0.3409406244754791,
"learning_rate": 3.114281981244553e-06,
"loss": 0.2983,
"step": 337
},
{
"epoch": 1.9785470502194051,
"grad_norm": 0.36691051721572876,
"learning_rate": 3.082631396848743e-06,
"loss": 0.3138,
"step": 338
},
{
"epoch": 1.9843978547050218,
"grad_norm": 0.33103111386299133,
"learning_rate": 3.0510706335366034e-06,
"loss": 0.2874,
"step": 339
},
{
"epoch": 1.9902486591906388,
"grad_norm": 0.3499497175216675,
"learning_rate": 3.019601169804216e-06,
"loss": 0.3114,
"step": 340
},
{
"epoch": 1.9960994636762555,
"grad_norm": 0.36926743388175964,
"learning_rate": 2.9882244798706372e-06,
"loss": 0.3,
"step": 341
},
{
"epoch": 2.0029254022428082,
"grad_norm": 0.37020495533943176,
"learning_rate": 2.956942033608843e-06,
"loss": 0.2719,
"step": 342
},
{
"epoch": 2.008776206728425,
"grad_norm": 0.39733192324638367,
"learning_rate": 2.9257552964768644e-06,
"loss": 0.2797,
"step": 343
},
{
"epoch": 2.014627011214042,
"grad_norm": 0.4049554169178009,
"learning_rate": 2.8946657294491452e-06,
"loss": 0.2898,
"step": 344
},
{
"epoch": 2.0204778156996586,
"grad_norm": 0.33150407671928406,
"learning_rate": 2.863674788948097e-06,
"loss": 0.2544,
"step": 345
},
{
"epoch": 2.0263286201852755,
"grad_norm": 0.34981128573417664,
"learning_rate": 2.832783926775865e-06,
"loss": 0.3092,
"step": 346
},
{
"epoch": 2.0321794246708924,
"grad_norm": 0.3597969114780426,
"learning_rate": 2.8019945900463307e-06,
"loss": 0.2516,
"step": 347
},
{
"epoch": 2.038030229156509,
"grad_norm": 0.3807888925075531,
"learning_rate": 2.771308221117309e-06,
"loss": 0.2399,
"step": 348
},
{
"epoch": 2.043881033642126,
"grad_norm": 0.4420805275440216,
"learning_rate": 2.740726257522987e-06,
"loss": 0.2623,
"step": 349
},
{
"epoch": 2.0497318381277427,
"grad_norm": 0.3532399535179138,
"learning_rate": 2.7102501319065706e-06,
"loss": 0.2603,
"step": 350
},
{
"epoch": 2.055582642613359,
"grad_norm": 0.35583576560020447,
"learning_rate": 2.6798812719531843e-06,
"loss": 0.2784,
"step": 351
},
{
"epoch": 2.061433447098976,
"grad_norm": 0.3376619517803192,
"learning_rate": 2.6496211003229795e-06,
"loss": 0.2687,
"step": 352
},
{
"epoch": 2.067284251584593,
"grad_norm": 0.35551944375038147,
"learning_rate": 2.6194710345845e-06,
"loss": 0.2666,
"step": 353
},
{
"epoch": 2.0731350560702095,
"grad_norm": 0.3540509343147278,
"learning_rate": 2.5894324871482557e-06,
"loss": 0.2553,
"step": 354
},
{
"epoch": 2.0789858605558265,
"grad_norm": 0.354968786239624,
"learning_rate": 2.559506865200576e-06,
"loss": 0.2533,
"step": 355
},
{
"epoch": 2.0848366650414434,
"grad_norm": 0.36224791407585144,
"learning_rate": 2.529695570637679e-06,
"loss": 0.2621,
"step": 356
},
{
"epoch": 2.09068746952706,
"grad_norm": 0.3591248095035553,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.2911,
"step": 357
},
{
"epoch": 2.096538274012677,
"grad_norm": 0.3431352376937866,
"learning_rate": 2.4704215444067684e-06,
"loss": 0.2552,
"step": 358
},
{
"epoch": 2.1023890784982937,
"grad_norm": 0.3626757562160492,
"learning_rate": 2.4409615894908407e-06,
"loss": 0.2895,
"step": 359
},
{
"epoch": 2.10823988298391,
"grad_norm": 0.3287983238697052,
"learning_rate": 2.411621515333788e-06,
"loss": 0.259,
"step": 360
},
{
"epoch": 2.114090687469527,
"grad_norm": 0.35708674788475037,
"learning_rate": 2.3824026964012487e-06,
"loss": 0.2709,
"step": 361
},
{
"epoch": 2.119941491955144,
"grad_norm": 0.3249606192111969,
"learning_rate": 2.35330650147853e-06,
"loss": 0.2618,
"step": 362
},
{
"epoch": 2.1257922964407605,
"grad_norm": 0.309447705745697,
"learning_rate": 2.324334293606499e-06,
"loss": 0.2439,
"step": 363
},
{
"epoch": 2.1316431009263774,
"grad_norm": 0.328646719455719,
"learning_rate": 2.2954874300177197e-06,
"loss": 0.2303,
"step": 364
},
{
"epoch": 2.1374939054119944,
"grad_norm": 0.3447718024253845,
"learning_rate": 2.266767262072878e-06,
"loss": 0.2685,
"step": 365
},
{
"epoch": 2.143344709897611,
"grad_norm": 0.3506672978401184,
"learning_rate": 2.238175135197471e-06,
"loss": 0.2728,
"step": 366
},
{
"epoch": 2.1491955143832278,
"grad_norm": 0.34329918026924133,
"learning_rate": 2.2097123888187825e-06,
"loss": 0.2646,
"step": 367
},
{
"epoch": 2.1550463188688447,
"grad_norm": 0.3055090606212616,
"learning_rate": 2.181380356303139e-06,
"loss": 0.239,
"step": 368
},
{
"epoch": 2.160897123354461,
"grad_norm": 0.30475035309791565,
"learning_rate": 2.1531803648934333e-06,
"loss": 0.2683,
"step": 369
},
{
"epoch": 2.166747927840078,
"grad_norm": 0.32849615812301636,
"learning_rate": 2.1251137356469677e-06,
"loss": 0.2491,
"step": 370
},
{
"epoch": 2.172598732325695,
"grad_norm": 0.3533441126346588,
"learning_rate": 2.0971817833735548e-06,
"loss": 0.2781,
"step": 371
},
{
"epoch": 2.1784495368113115,
"grad_norm": 0.30950412154197693,
"learning_rate": 2.069385816573928e-06,
"loss": 0.2258,
"step": 372
},
{
"epoch": 2.1843003412969284,
"grad_norm": 0.34565675258636475,
"learning_rate": 2.0417271373784403e-06,
"loss": 0.3049,
"step": 373
},
{
"epoch": 2.1901511457825453,
"grad_norm": 0.32770001888275146,
"learning_rate": 2.0142070414860704e-06,
"loss": 0.254,
"step": 374
},
{
"epoch": 2.196001950268162,
"grad_norm": 0.34241920709609985,
"learning_rate": 1.9868268181037186e-06,
"loss": 0.2612,
"step": 375
},
{
"epoch": 2.2018527547537787,
"grad_norm": 0.33506783843040466,
"learning_rate": 1.9595877498858175e-06,
"loss": 0.2748,
"step": 376
},
{
"epoch": 2.2077035592393957,
"grad_norm": 0.3262109160423279,
"learning_rate": 1.9324911128742406e-06,
"loss": 0.2665,
"step": 377
},
{
"epoch": 2.213554363725012,
"grad_norm": 0.3280249536037445,
"learning_rate": 1.9055381764385272e-06,
"loss": 0.2591,
"step": 378
},
{
"epoch": 2.219405168210629,
"grad_norm": 0.33232155442237854,
"learning_rate": 1.8787302032164168e-06,
"loss": 0.2833,
"step": 379
},
{
"epoch": 2.2252559726962455,
"grad_norm": 0.3800523579120636,
"learning_rate": 1.8520684490547014e-06,
"loss": 0.2895,
"step": 380
},
{
"epoch": 2.2311067771818625,
"grad_norm": 0.3366720974445343,
"learning_rate": 1.8255541629503865e-06,
"loss": 0.2682,
"step": 381
},
{
"epoch": 2.2369575816674794,
"grad_norm": 0.31780189275741577,
"learning_rate": 1.7991885869921928e-06,
"loss": 0.2567,
"step": 382
},
{
"epoch": 2.242808386153096,
"grad_norm": 0.3226467967033386,
"learning_rate": 1.7729729563023613e-06,
"loss": 0.2575,
"step": 383
},
{
"epoch": 2.248659190638713,
"grad_norm": 0.3137516677379608,
"learning_rate": 1.746908498978791e-06,
"loss": 0.2464,
"step": 384
},
{
"epoch": 2.2545099951243297,
"grad_norm": 0.3348909914493561,
"learning_rate": 1.7209964360375137e-06,
"loss": 0.2779,
"step": 385
},
{
"epoch": 2.260360799609946,
"grad_norm": 0.3146172761917114,
"learning_rate": 1.6952379813554914e-06,
"loss": 0.2533,
"step": 386
},
{
"epoch": 2.266211604095563,
"grad_norm": 0.32093656063079834,
"learning_rate": 1.6696343416137495e-06,
"loss": 0.2626,
"step": 387
},
{
"epoch": 2.27206240858118,
"grad_norm": 0.3114534914493561,
"learning_rate": 1.6441867162408514e-06,
"loss": 0.2435,
"step": 388
},
{
"epoch": 2.2779132130667965,
"grad_norm": 0.326259583234787,
"learning_rate": 1.6188962973567068e-06,
"loss": 0.2572,
"step": 389
},
{
"epoch": 2.2837640175524134,
"grad_norm": 0.3404834568500519,
"learning_rate": 1.5937642697167288e-06,
"loss": 0.2941,
"step": 390
},
{
"epoch": 2.2896148220380304,
"grad_norm": 0.28849631547927856,
"learning_rate": 1.5687918106563326e-06,
"loss": 0.2377,
"step": 391
},
{
"epoch": 2.295465626523647,
"grad_norm": 0.2880316972732544,
"learning_rate": 1.5439800900357765e-06,
"loss": 0.2376,
"step": 392
},
{
"epoch": 2.3013164310092638,
"grad_norm": 0.3132326900959015,
"learning_rate": 1.5193302701853674e-06,
"loss": 0.2483,
"step": 393
},
{
"epoch": 2.3071672354948807,
"grad_norm": 0.34062352776527405,
"learning_rate": 1.4948435058510036e-06,
"loss": 0.2695,
"step": 394
},
{
"epoch": 2.313018039980497,
"grad_norm": 0.36842963099479675,
"learning_rate": 1.4705209441400841e-06,
"loss": 0.2897,
"step": 395
},
{
"epoch": 2.318868844466114,
"grad_norm": 0.3079017996788025,
"learning_rate": 1.4463637244677648e-06,
"loss": 0.2234,
"step": 396
},
{
"epoch": 2.324719648951731,
"grad_norm": 0.3228384852409363,
"learning_rate": 1.422372978503589e-06,
"loss": 0.2654,
"step": 397
},
{
"epoch": 2.3305704534373475,
"grad_norm": 0.32149839401245117,
"learning_rate": 1.3985498301184685e-06,
"loss": 0.2642,
"step": 398
},
{
"epoch": 2.3364212579229644,
"grad_norm": 0.3348996937274933,
"learning_rate": 1.374895395332037e-06,
"loss": 0.2639,
"step": 399
},
{
"epoch": 2.3422720624085813,
"grad_norm": 0.3477461338043213,
"learning_rate": 1.351410782260366e-06,
"loss": 0.278,
"step": 400
},
{
"epoch": 2.348122866894198,
"grad_norm": 0.3569883108139038,
"learning_rate": 1.3280970910640573e-06,
"loss": 0.2847,
"step": 401
},
{
"epoch": 2.3539736713798147,
"grad_norm": 0.3160240352153778,
"learning_rate": 1.3049554138967052e-06,
"loss": 0.2462,
"step": 402
},
{
"epoch": 2.3598244758654316,
"grad_norm": 0.34394779801368713,
"learning_rate": 1.2819868348537263e-06,
"loss": 0.2942,
"step": 403
},
{
"epoch": 2.365675280351048,
"grad_norm": 0.31302496790885925,
"learning_rate": 1.259192429921584e-06,
"loss": 0.2607,
"step": 404
},
{
"epoch": 2.371526084836665,
"grad_norm": 0.3243591785430908,
"learning_rate": 1.2365732669273778e-06,
"loss": 0.2544,
"step": 405
},
{
"epoch": 2.377376889322282,
"grad_norm": 0.31446540355682373,
"learning_rate": 1.2141304054888204e-06,
"loss": 0.2604,
"step": 406
},
{
"epoch": 2.3832276938078985,
"grad_norm": 0.31701111793518066,
"learning_rate": 1.1918648969645947e-06,
"loss": 0.2832,
"step": 407
},
{
"epoch": 2.3890784982935154,
"grad_norm": 0.28770485520362854,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.2387,
"step": 408
},
{
"epoch": 2.3949293027791323,
"grad_norm": 0.3169179856777191,
"learning_rate": 1.1478701025036359e-06,
"loss": 0.2501,
"step": 409
},
{
"epoch": 2.4007801072647488,
"grad_norm": 0.3161545991897583,
"learning_rate": 1.126142877547826e-06,
"loss": 0.2478,
"step": 410
},
{
"epoch": 2.4066309117503657,
"grad_norm": 0.3387742340564728,
"learning_rate": 1.1045971273716476e-06,
"loss": 0.2776,
"step": 411
},
{
"epoch": 2.4124817162359826,
"grad_norm": 0.3186393678188324,
"learning_rate": 1.083233861307697e-06,
"loss": 0.2634,
"step": 412
},
{
"epoch": 2.418332520721599,
"grad_norm": 0.31719592213630676,
"learning_rate": 1.062054080139916e-06,
"loss": 0.2877,
"step": 413
},
{
"epoch": 2.424183325207216,
"grad_norm": 0.3102681338787079,
"learning_rate": 1.0410587760567104e-06,
"loss": 0.2717,
"step": 414
},
{
"epoch": 2.430034129692833,
"grad_norm": 0.33135566115379333,
"learning_rate": 1.0202489326044663e-06,
"loss": 0.2508,
"step": 415
},
{
"epoch": 2.4358849341784494,
"grad_norm": 0.3247663378715515,
"learning_rate": 9.99625524641481e-07,
"loss": 0.252,
"step": 416
},
{
"epoch": 2.4417357386640663,
"grad_norm": 0.3152580261230469,
"learning_rate": 9.791895182922911e-07,
"loss": 0.2627,
"step": 417
},
{
"epoch": 2.4475865431496833,
"grad_norm": 0.3281997740268707,
"learning_rate": 9.589418709024146e-07,
"loss": 0.2724,
"step": 418
},
{
"epoch": 2.4534373476352997,
"grad_norm": 0.3040336072444916,
"learning_rate": 9.388835309934985e-07,
"loss": 0.2691,
"step": 419
},
{
"epoch": 2.4592881521209167,
"grad_norm": 0.3166237473487854,
"learning_rate": 9.190154382188921e-07,
"loss": 0.2651,
"step": 420
},
{
"epoch": 2.465138956606533,
"grad_norm": 0.29170283675193787,
"learning_rate": 8.993385233196223e-07,
"loss": 0.2381,
"step": 421
},
{
"epoch": 2.47098976109215,
"grad_norm": 0.33377811312675476,
"learning_rate": 8.79853708080795e-07,
"loss": 0.2942,
"step": 422
},
{
"epoch": 2.476840565577767,
"grad_norm": 0.3009023070335388,
"learning_rate": 8.605619052884106e-07,
"loss": 0.2434,
"step": 423
},
{
"epoch": 2.4826913700633835,
"grad_norm": 0.2991524040699005,
"learning_rate": 8.414640186866063e-07,
"loss": 0.2672,
"step": 424
},
{
"epoch": 2.4885421745490004,
"grad_norm": 0.31273865699768066,
"learning_rate": 8.225609429353187e-07,
"loss": 0.2444,
"step": 425
},
{
"epoch": 2.4943929790346173,
"grad_norm": 0.33205345273017883,
"learning_rate": 8.03853563568367e-07,
"loss": 0.2763,
"step": 426
},
{
"epoch": 2.500243783520234,
"grad_norm": 0.3440842032432556,
"learning_rate": 7.8534275695198e-07,
"loss": 0.3004,
"step": 427
},
{
"epoch": 2.5060945880058507,
"grad_norm": 0.33286216855049133,
"learning_rate": 7.670293902437331e-07,
"loss": 0.2864,
"step": 428
},
{
"epoch": 2.5119453924914676,
"grad_norm": 0.335589736700058,
"learning_rate": 7.489143213519301e-07,
"loss": 0.2877,
"step": 429
},
{
"epoch": 2.517796196977084,
"grad_norm": 0.32931384444236755,
"learning_rate": 7.309983988954078e-07,
"loss": 0.2758,
"step": 430
},
{
"epoch": 2.523647001462701,
"grad_norm": 0.3004399240016937,
"learning_rate": 7.132824621637891e-07,
"loss": 0.2271,
"step": 431
},
{
"epoch": 2.529497805948318,
"grad_norm": 0.34769243001937866,
"learning_rate": 6.957673410781617e-07,
"loss": 0.2831,
"step": 432
},
{
"epoch": 2.5353486104339344,
"grad_norm": 0.3238453269004822,
"learning_rate": 6.784538561521986e-07,
"loss": 0.2627,
"step": 433
},
{
"epoch": 2.5411994149195514,
"grad_norm": 0.3278021812438965,
"learning_rate": 6.613428184537235e-07,
"loss": 0.2921,
"step": 434
},
{
"epoch": 2.5470502194051683,
"grad_norm": 0.3187845051288605,
"learning_rate": 6.444350295667112e-07,
"loss": 0.2546,
"step": 435
},
{
"epoch": 2.5529010238907848,
"grad_norm": 0.3074102997779846,
"learning_rate": 6.277312815537423e-07,
"loss": 0.2628,
"step": 436
},
{
"epoch": 2.5587518283764017,
"grad_norm": 0.3095274269580841,
"learning_rate": 6.112323569188927e-07,
"loss": 0.2473,
"step": 437
},
{
"epoch": 2.5646026328620186,
"grad_norm": 0.3091028034687042,
"learning_rate": 5.949390285710777e-07,
"loss": 0.2445,
"step": 438
},
{
"epoch": 2.570453437347635,
"grad_norm": 0.35130709409713745,
"learning_rate": 5.788520597878477e-07,
"loss": 0.2736,
"step": 439
},
{
"epoch": 2.576304241833252,
"grad_norm": 0.30751124024391174,
"learning_rate": 5.629722041796292e-07,
"loss": 0.2456,
"step": 440
},
{
"epoch": 2.582155046318869,
"grad_norm": 0.33394762873649597,
"learning_rate": 5.473002056544191e-07,
"loss": 0.2653,
"step": 441
},
{
"epoch": 2.5880058508044854,
"grad_norm": 0.344565749168396,
"learning_rate": 5.318367983829393e-07,
"loss": 0.2534,
"step": 442
},
{
"epoch": 2.5938566552901023,
"grad_norm": 0.3401236832141876,
"learning_rate": 5.165827067642415e-07,
"loss": 0.264,
"step": 443
},
{
"epoch": 2.5997074597757193,
"grad_norm": 0.33671292662620544,
"learning_rate": 5.015386453917742e-07,
"loss": 0.2675,
"step": 444
},
{
"epoch": 2.6055582642613357,
"grad_norm": 0.32880690693855286,
"learning_rate": 4.867053190199011e-07,
"loss": 0.2684,
"step": 445
},
{
"epoch": 2.6114090687469527,
"grad_norm": 0.3041062355041504,
"learning_rate": 4.720834225308962e-07,
"loss": 0.277,
"step": 446
},
{
"epoch": 2.6172598732325696,
"grad_norm": 0.33339980244636536,
"learning_rate": 4.576736409023813e-07,
"loss": 0.2821,
"step": 447
},
{
"epoch": 2.623110677718186,
"grad_norm": 0.30308592319488525,
"learning_rate": 4.4347664917524293e-07,
"loss": 0.2541,
"step": 448
},
{
"epoch": 2.628961482203803,
"grad_norm": 0.29520875215530396,
"learning_rate": 4.29493112422007e-07,
"loss": 0.2448,
"step": 449
},
{
"epoch": 2.63481228668942,
"grad_norm": 0.32247990369796753,
"learning_rate": 4.15723685715686e-07,
"loss": 0.2579,
"step": 450
},
{
"epoch": 2.6406630911750364,
"grad_norm": 0.3348802626132965,
"learning_rate": 4.0216901409908695e-07,
"loss": 0.2717,
"step": 451
},
{
"epoch": 2.6465138956606533,
"grad_norm": 0.2945747673511505,
"learning_rate": 3.8882973255459975e-07,
"loss": 0.2349,
"step": 452
},
{
"epoch": 2.6523647001462702,
"grad_norm": 0.31064727902412415,
"learning_rate": 3.7570646597444196e-07,
"loss": 0.2615,
"step": 453
},
{
"epoch": 2.6582155046318867,
"grad_norm": 0.32341885566711426,
"learning_rate": 3.627998291313939e-07,
"loss": 0.2697,
"step": 454
},
{
"epoch": 2.6640663091175036,
"grad_norm": 0.3337382674217224,
"learning_rate": 3.5011042664999663e-07,
"loss": 0.2919,
"step": 455
},
{
"epoch": 2.6699171136031206,
"grad_norm": 0.3329227566719055,
"learning_rate": 3.3763885297822153e-07,
"loss": 0.2909,
"step": 456
},
{
"epoch": 2.675767918088737,
"grad_norm": 0.2781767249107361,
"learning_rate": 3.2538569235963216e-07,
"loss": 0.2345,
"step": 457
},
{
"epoch": 2.681618722574354,
"grad_norm": 0.29668092727661133,
"learning_rate": 3.133515188060077e-07,
"loss": 0.2309,
"step": 458
},
{
"epoch": 2.687469527059971,
"grad_norm": 0.31288012862205505,
"learning_rate": 3.015368960704584e-07,
"loss": 0.2409,
"step": 459
},
{
"epoch": 2.6933203315455874,
"grad_norm": 0.3310166597366333,
"learning_rate": 2.899423776210092e-07,
"loss": 0.283,
"step": 460
},
{
"epoch": 2.6991711360312043,
"grad_norm": 0.3168513774871826,
"learning_rate": 2.785685066146776e-07,
"loss": 0.2916,
"step": 461
},
{
"epoch": 2.705021940516821,
"grad_norm": 0.3137360215187073,
"learning_rate": 2.6741581587202747e-07,
"loss": 0.2432,
"step": 462
},
{
"epoch": 2.7108727450024377,
"grad_norm": 0.3316532075405121,
"learning_rate": 2.5648482785220865e-07,
"loss": 0.2855,
"step": 463
},
{
"epoch": 2.7167235494880546,
"grad_norm": 0.30476975440979004,
"learning_rate": 2.4577605462847764e-07,
"loss": 0.2482,
"step": 464
},
{
"epoch": 2.7225743539736715,
"grad_norm": 0.3032921552658081,
"learning_rate": 2.3528999786421758e-07,
"loss": 0.2486,
"step": 465
},
{
"epoch": 2.728425158459288,
"grad_norm": 0.32038578391075134,
"learning_rate": 2.25027148789429e-07,
"loss": 0.27,
"step": 466
},
{
"epoch": 2.734275962944905,
"grad_norm": 0.31940269470214844,
"learning_rate": 2.1498798817772281e-07,
"loss": 0.2665,
"step": 467
},
{
"epoch": 2.740126767430522,
"grad_norm": 0.3099590539932251,
"learning_rate": 2.0517298632379445e-07,
"loss": 0.2576,
"step": 468
},
{
"epoch": 2.7459775719161383,
"grad_norm": 0.33475908637046814,
"learning_rate": 1.9558260302139642e-07,
"loss": 0.2681,
"step": 469
},
{
"epoch": 2.7518283764017553,
"grad_norm": 0.2855778634548187,
"learning_rate": 1.8621728754179392e-07,
"loss": 0.2205,
"step": 470
},
{
"epoch": 2.757679180887372,
"grad_norm": 0.31299954652786255,
"learning_rate": 1.770774786127244e-07,
"loss": 0.2728,
"step": 471
},
{
"epoch": 2.7635299853729887,
"grad_norm": 0.31098952889442444,
"learning_rate": 1.6816360439783797e-07,
"loss": 0.26,
"step": 472
},
{
"epoch": 2.7693807898586056,
"grad_norm": 0.33787840604782104,
"learning_rate": 1.5947608247664558e-07,
"loss": 0.2757,
"step": 473
},
{
"epoch": 2.7752315943442225,
"grad_norm": 0.3227766156196594,
"learning_rate": 1.510153198249531e-07,
"loss": 0.2614,
"step": 474
},
{
"epoch": 2.781082398829839,
"grad_norm": 0.31379225850105286,
"learning_rate": 1.4278171279579757e-07,
"loss": 0.235,
"step": 475
},
{
"epoch": 2.786933203315456,
"grad_norm": 0.33349987864494324,
"learning_rate": 1.3477564710088097e-07,
"loss": 0.2613,
"step": 476
},
{
"epoch": 2.792784007801073,
"grad_norm": 0.33754172921180725,
"learning_rate": 1.2699749779249926e-07,
"loss": 0.2939,
"step": 477
},
{
"epoch": 2.7986348122866893,
"grad_norm": 0.31575557589530945,
"learning_rate": 1.1944762924597286e-07,
"loss": 0.2397,
"step": 478
},
{
"epoch": 2.8044856167723062,
"grad_norm": 0.3182400166988373,
"learning_rate": 1.1212639514257829e-07,
"loss": 0.2793,
"step": 479
},
{
"epoch": 2.810336421257923,
"grad_norm": 0.29797741770744324,
"learning_rate": 1.0503413845297739e-07,
"loss": 0.2578,
"step": 480
},
{
"epoch": 2.8161872257435396,
"grad_norm": 0.2968318462371826,
"learning_rate": 9.817119142115472e-08,
"loss": 0.2663,
"step": 481
},
{
"epoch": 2.8220380302291566,
"grad_norm": 0.30038806796073914,
"learning_rate": 9.15378755488483e-08,
"loss": 0.2645,
"step": 482
},
{
"epoch": 2.8278888347147735,
"grad_norm": 0.3152545392513275,
"learning_rate": 8.513450158049109e-08,
"loss": 0.2726,
"step": 483
},
{
"epoch": 2.83373963920039,
"grad_norm": 0.2907554805278778,
"learning_rate": 7.896136948865429e-08,
"loss": 0.2451,
"step": 484
},
{
"epoch": 2.839590443686007,
"grad_norm": 0.32411476969718933,
"learning_rate": 7.301876845999368e-08,
"loss": 0.2812,
"step": 485
},
{
"epoch": 2.845441248171624,
"grad_norm": 0.3213962912559509,
"learning_rate": 6.730697688170251e-08,
"loss": 0.2732,
"step": 486
},
{
"epoch": 2.8512920526572403,
"grad_norm": 0.31291988492012024,
"learning_rate": 6.182626232847044e-08,
"loss": 0.2915,
"step": 487
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.31493186950683594,
"learning_rate": 5.6576881549949e-08,
"loss": 0.25,
"step": 488
},
{
"epoch": 2.862993661628474,
"grad_norm": 0.29286989569664,
"learning_rate": 5.155908045872349e-08,
"loss": 0.2645,
"step": 489
},
{
"epoch": 2.8688444661140906,
"grad_norm": 0.28838157653808594,
"learning_rate": 4.677309411879327e-08,
"loss": 0.2534,
"step": 490
},
{
"epoch": 2.8746952705997075,
"grad_norm": 0.3015024662017822,
"learning_rate": 4.221914673455896e-08,
"loss": 0.2519,
"step": 491
},
{
"epoch": 2.8805460750853245,
"grad_norm": 0.3019368052482605,
"learning_rate": 3.7897451640321326e-08,
"loss": 0.2436,
"step": 492
},
{
"epoch": 2.886396879570941,
"grad_norm": 0.33348119258880615,
"learning_rate": 3.3808211290284886e-08,
"loss": 0.29,
"step": 493
},
{
"epoch": 2.892247684056558,
"grad_norm": 0.32297229766845703,
"learning_rate": 2.995161724907658e-08,
"loss": 0.2394,
"step": 494
},
{
"epoch": 2.8980984885421748,
"grad_norm": 0.35386550426483154,
"learning_rate": 2.6327850182769065e-08,
"loss": 0.2698,
"step": 495
},
{
"epoch": 2.9039492930277913,
"grad_norm": 0.3366953134536743,
"learning_rate": 2.29370798504186e-08,
"loss": 0.2629,
"step": 496
},
{
"epoch": 2.909800097513408,
"grad_norm": 0.34689807891845703,
"learning_rate": 1.9779465096112505e-08,
"loss": 0.278,
"step": 497
},
{
"epoch": 2.915650901999025,
"grad_norm": 0.31642869114875793,
"learning_rate": 1.6855153841527915e-08,
"loss": 0.2569,
"step": 498
},
{
"epoch": 2.9215017064846416,
"grad_norm": 0.31688442826271057,
"learning_rate": 1.4164283079001196e-08,
"loss": 0.2722,
"step": 499
},
{
"epoch": 2.9273525109702585,
"grad_norm": 0.2896118462085724,
"learning_rate": 1.1706978865113072e-08,
"loss": 0.2399,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 510,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.25354585607268e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}