Text Generation
Transformers
Safetensors
English
llama
code
text-generation-inference
Inference Endpoints
CodeDrafter-500M / trainer_state.json
Zhuominc's picture
Upload folder using huggingface_hub
a509497
{
"best_metric": 0.798653244972229,
"best_model_checkpoint": "FastCoderL4-ITX/checkpoint-500",
"epoch": 1.0,
"eval_steps": 250,
"global_step": 547,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018281535648994515,
"grad_norm": 16.024444580078125,
"learning_rate": 1.2000000000000002e-07,
"loss": 1.6383,
"step": 1
},
{
"epoch": 0.003656307129798903,
"grad_norm": 16.114477157592773,
"learning_rate": 2.4000000000000003e-07,
"loss": 1.7323,
"step": 2
},
{
"epoch": 0.005484460694698354,
"grad_norm": 14.292167663574219,
"learning_rate": 3.6e-07,
"loss": 1.4207,
"step": 3
},
{
"epoch": 0.007312614259597806,
"grad_norm": 15.010176658630371,
"learning_rate": 4.800000000000001e-07,
"loss": 1.5956,
"step": 4
},
{
"epoch": 0.009140767824497258,
"grad_norm": 13.827630996704102,
"learning_rate": 6.000000000000001e-07,
"loss": 1.49,
"step": 5
},
{
"epoch": 0.010968921389396709,
"grad_norm": 15.43071174621582,
"learning_rate": 7.2e-07,
"loss": 1.6081,
"step": 6
},
{
"epoch": 0.012797074954296161,
"grad_norm": 14.97592544555664,
"learning_rate": 8.4e-07,
"loss": 1.6164,
"step": 7
},
{
"epoch": 0.014625228519195612,
"grad_norm": 11.73971939086914,
"learning_rate": 9.600000000000001e-07,
"loss": 1.4299,
"step": 8
},
{
"epoch": 0.016453382084095063,
"grad_norm": 12.449714660644531,
"learning_rate": 1.08e-06,
"loss": 1.3328,
"step": 9
},
{
"epoch": 0.018281535648994516,
"grad_norm": 12.710100173950195,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.4129,
"step": 10
},
{
"epoch": 0.02010968921389397,
"grad_norm": 12.13203239440918,
"learning_rate": 1.3199999999999999e-06,
"loss": 1.3971,
"step": 11
},
{
"epoch": 0.021937842778793418,
"grad_norm": 10.500185012817383,
"learning_rate": 1.44e-06,
"loss": 1.4321,
"step": 12
},
{
"epoch": 0.02376599634369287,
"grad_norm": 10.064560890197754,
"learning_rate": 1.5599999999999999e-06,
"loss": 1.2872,
"step": 13
},
{
"epoch": 0.025594149908592323,
"grad_norm": 7.85143518447876,
"learning_rate": 1.68e-06,
"loss": 1.2345,
"step": 14
},
{
"epoch": 0.027422303473491772,
"grad_norm": 7.530126094818115,
"learning_rate": 1.8e-06,
"loss": 1.1803,
"step": 15
},
{
"epoch": 0.029250457038391225,
"grad_norm": 6.091775417327881,
"learning_rate": 1.9200000000000003e-06,
"loss": 1.2247,
"step": 16
},
{
"epoch": 0.031078610603290677,
"grad_norm": 4.9651384353637695,
"learning_rate": 2.0400000000000004e-06,
"loss": 1.1655,
"step": 17
},
{
"epoch": 0.03290676416819013,
"grad_norm": 6.209571361541748,
"learning_rate": 2.16e-06,
"loss": 1.0649,
"step": 18
},
{
"epoch": 0.03473491773308958,
"grad_norm": 4.946502208709717,
"learning_rate": 2.28e-06,
"loss": 1.1046,
"step": 19
},
{
"epoch": 0.03656307129798903,
"grad_norm": 4.954932689666748,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.0964,
"step": 20
},
{
"epoch": 0.038391224862888484,
"grad_norm": 3.8354671001434326,
"learning_rate": 2.52e-06,
"loss": 1.2277,
"step": 21
},
{
"epoch": 0.04021937842778794,
"grad_norm": 4.310220718383789,
"learning_rate": 2.6399999999999997e-06,
"loss": 1.042,
"step": 22
},
{
"epoch": 0.04204753199268738,
"grad_norm": 3.9748997688293457,
"learning_rate": 2.76e-06,
"loss": 1.0234,
"step": 23
},
{
"epoch": 0.043875685557586835,
"grad_norm": 3.9019360542297363,
"learning_rate": 2.88e-06,
"loss": 1.1286,
"step": 24
},
{
"epoch": 0.04570383912248629,
"grad_norm": 4.246694564819336,
"learning_rate": 3e-06,
"loss": 0.9793,
"step": 25
},
{
"epoch": 0.04753199268738574,
"grad_norm": 3.8797051906585693,
"learning_rate": 3.1199999999999998e-06,
"loss": 1.0747,
"step": 26
},
{
"epoch": 0.04936014625228519,
"grad_norm": 4.0023908615112305,
"learning_rate": 3.24e-06,
"loss": 1.1031,
"step": 27
},
{
"epoch": 0.051188299817184646,
"grad_norm": 4.26245641708374,
"learning_rate": 3.36e-06,
"loss": 1.003,
"step": 28
},
{
"epoch": 0.05301645338208409,
"grad_norm": 4.6040215492248535,
"learning_rate": 3.48e-06,
"loss": 0.9311,
"step": 29
},
{
"epoch": 0.054844606946983544,
"grad_norm": 4.464705467224121,
"learning_rate": 3.6e-06,
"loss": 1.0341,
"step": 30
},
{
"epoch": 0.056672760511883,
"grad_norm": 3.787562608718872,
"learning_rate": 3.72e-06,
"loss": 0.984,
"step": 31
},
{
"epoch": 0.05850091407678245,
"grad_norm": 3.2259016036987305,
"learning_rate": 3.8400000000000005e-06,
"loss": 0.9167,
"step": 32
},
{
"epoch": 0.0603290676416819,
"grad_norm": 3.7597789764404297,
"learning_rate": 3.96e-06,
"loss": 1.0784,
"step": 33
},
{
"epoch": 0.062157221206581355,
"grad_norm": 3.173090934753418,
"learning_rate": 4.080000000000001e-06,
"loss": 0.9436,
"step": 34
},
{
"epoch": 0.06398537477148081,
"grad_norm": 3.336909055709839,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.8013,
"step": 35
},
{
"epoch": 0.06581352833638025,
"grad_norm": 2.738156318664551,
"learning_rate": 4.32e-06,
"loss": 1.1238,
"step": 36
},
{
"epoch": 0.06764168190127971,
"grad_norm": 3.3270339965820312,
"learning_rate": 4.44e-06,
"loss": 0.8423,
"step": 37
},
{
"epoch": 0.06946983546617916,
"grad_norm": 2.872663736343384,
"learning_rate": 4.56e-06,
"loss": 0.9931,
"step": 38
},
{
"epoch": 0.0712979890310786,
"grad_norm": 3.2571451663970947,
"learning_rate": 4.68e-06,
"loss": 0.9323,
"step": 39
},
{
"epoch": 0.07312614259597806,
"grad_norm": 2.999234437942505,
"learning_rate": 4.800000000000001e-06,
"loss": 0.9247,
"step": 40
},
{
"epoch": 0.07495429616087751,
"grad_norm": 2.9580419063568115,
"learning_rate": 4.92e-06,
"loss": 0.8751,
"step": 41
},
{
"epoch": 0.07678244972577697,
"grad_norm": 2.8437395095825195,
"learning_rate": 5.04e-06,
"loss": 0.8857,
"step": 42
},
{
"epoch": 0.07861060329067641,
"grad_norm": 3.175656318664551,
"learning_rate": 5.16e-06,
"loss": 0.8942,
"step": 43
},
{
"epoch": 0.08043875685557587,
"grad_norm": 2.684788703918457,
"learning_rate": 5.279999999999999e-06,
"loss": 0.8725,
"step": 44
},
{
"epoch": 0.08226691042047532,
"grad_norm": 3.000286340713501,
"learning_rate": 5.4e-06,
"loss": 0.8803,
"step": 45
},
{
"epoch": 0.08409506398537477,
"grad_norm": 2.856066942214966,
"learning_rate": 5.52e-06,
"loss": 0.9705,
"step": 46
},
{
"epoch": 0.08592321755027423,
"grad_norm": 3.0575389862060547,
"learning_rate": 5.64e-06,
"loss": 0.8106,
"step": 47
},
{
"epoch": 0.08775137111517367,
"grad_norm": 2.649608612060547,
"learning_rate": 5.76e-06,
"loss": 1.0701,
"step": 48
},
{
"epoch": 0.08957952468007313,
"grad_norm": 3.1014580726623535,
"learning_rate": 5.8800000000000005e-06,
"loss": 0.9607,
"step": 49
},
{
"epoch": 0.09140767824497258,
"grad_norm": 2.6570193767547607,
"learning_rate": 6e-06,
"loss": 0.9685,
"step": 50
},
{
"epoch": 0.09323583180987204,
"grad_norm": 3.082258462905884,
"learning_rate": 6.12e-06,
"loss": 1.0039,
"step": 51
},
{
"epoch": 0.09506398537477148,
"grad_norm": 2.4003512859344482,
"learning_rate": 6.2399999999999995e-06,
"loss": 0.8934,
"step": 52
},
{
"epoch": 0.09689213893967093,
"grad_norm": 2.605583667755127,
"learning_rate": 6.36e-06,
"loss": 0.8891,
"step": 53
},
{
"epoch": 0.09872029250457039,
"grad_norm": 2.541799306869507,
"learning_rate": 6.48e-06,
"loss": 0.8183,
"step": 54
},
{
"epoch": 0.10054844606946983,
"grad_norm": 2.594459056854248,
"learning_rate": 6.6e-06,
"loss": 0.9906,
"step": 55
},
{
"epoch": 0.10237659963436929,
"grad_norm": 2.9506289958953857,
"learning_rate": 6.72e-06,
"loss": 0.8263,
"step": 56
},
{
"epoch": 0.10420475319926874,
"grad_norm": 2.8362669944763184,
"learning_rate": 6.840000000000001e-06,
"loss": 0.9,
"step": 57
},
{
"epoch": 0.10603290676416818,
"grad_norm": 2.6192896366119385,
"learning_rate": 6.96e-06,
"loss": 1.05,
"step": 58
},
{
"epoch": 0.10786106032906764,
"grad_norm": 2.7502949237823486,
"learning_rate": 7.08e-06,
"loss": 0.87,
"step": 59
},
{
"epoch": 0.10968921389396709,
"grad_norm": 2.6745474338531494,
"learning_rate": 7.2e-06,
"loss": 0.8163,
"step": 60
},
{
"epoch": 0.11151736745886655,
"grad_norm": 2.6584086418151855,
"learning_rate": 7.32e-06,
"loss": 0.8813,
"step": 61
},
{
"epoch": 0.113345521023766,
"grad_norm": 2.689574956893921,
"learning_rate": 7.44e-06,
"loss": 0.9404,
"step": 62
},
{
"epoch": 0.11517367458866545,
"grad_norm": 2.754441738128662,
"learning_rate": 7.5600000000000005e-06,
"loss": 0.7416,
"step": 63
},
{
"epoch": 0.1170018281535649,
"grad_norm": 2.8178014755249023,
"learning_rate": 7.680000000000001e-06,
"loss": 0.8377,
"step": 64
},
{
"epoch": 0.11882998171846434,
"grad_norm": 2.8821122646331787,
"learning_rate": 7.8e-06,
"loss": 0.7101,
"step": 65
},
{
"epoch": 0.1206581352833638,
"grad_norm": 2.6646909713745117,
"learning_rate": 7.92e-06,
"loss": 1.0581,
"step": 66
},
{
"epoch": 0.12248628884826325,
"grad_norm": 2.9155476093292236,
"learning_rate": 8.040000000000001e-06,
"loss": 0.8417,
"step": 67
},
{
"epoch": 0.12431444241316271,
"grad_norm": 2.7877771854400635,
"learning_rate": 8.160000000000001e-06,
"loss": 0.9266,
"step": 68
},
{
"epoch": 0.12614259597806216,
"grad_norm": 2.625126361846924,
"learning_rate": 8.28e-06,
"loss": 1.0048,
"step": 69
},
{
"epoch": 0.12797074954296161,
"grad_norm": 2.7259960174560547,
"learning_rate": 8.400000000000001e-06,
"loss": 0.9485,
"step": 70
},
{
"epoch": 0.12979890310786105,
"grad_norm": 2.743478536605835,
"learning_rate": 8.52e-06,
"loss": 0.9221,
"step": 71
},
{
"epoch": 0.1316270566727605,
"grad_norm": 2.586174964904785,
"learning_rate": 8.64e-06,
"loss": 0.8967,
"step": 72
},
{
"epoch": 0.13345521023765997,
"grad_norm": 2.817873954772949,
"learning_rate": 8.759999999999999e-06,
"loss": 0.943,
"step": 73
},
{
"epoch": 0.13528336380255943,
"grad_norm": 2.692861557006836,
"learning_rate": 8.88e-06,
"loss": 0.8334,
"step": 74
},
{
"epoch": 0.13711151736745886,
"grad_norm": 2.9305572509765625,
"learning_rate": 9e-06,
"loss": 0.8215,
"step": 75
},
{
"epoch": 0.13893967093235832,
"grad_norm": 2.898930072784424,
"learning_rate": 9.12e-06,
"loss": 0.8979,
"step": 76
},
{
"epoch": 0.14076782449725778,
"grad_norm": 2.8066327571868896,
"learning_rate": 9.24e-06,
"loss": 1.0717,
"step": 77
},
{
"epoch": 0.1425959780621572,
"grad_norm": 3.126624584197998,
"learning_rate": 9.36e-06,
"loss": 0.8887,
"step": 78
},
{
"epoch": 0.14442413162705667,
"grad_norm": 2.469200611114502,
"learning_rate": 9.48e-06,
"loss": 0.9542,
"step": 79
},
{
"epoch": 0.14625228519195613,
"grad_norm": 2.6940770149230957,
"learning_rate": 9.600000000000001e-06,
"loss": 0.9756,
"step": 80
},
{
"epoch": 0.1480804387568556,
"grad_norm": 2.847891330718994,
"learning_rate": 9.72e-06,
"loss": 0.8966,
"step": 81
},
{
"epoch": 0.14990859232175502,
"grad_norm": 2.9159109592437744,
"learning_rate": 9.84e-06,
"loss": 0.8055,
"step": 82
},
{
"epoch": 0.15173674588665448,
"grad_norm": 2.9693570137023926,
"learning_rate": 9.960000000000001e-06,
"loss": 0.8913,
"step": 83
},
{
"epoch": 0.15356489945155394,
"grad_norm": 2.6382272243499756,
"learning_rate": 1.008e-05,
"loss": 0.8565,
"step": 84
},
{
"epoch": 0.15539305301645337,
"grad_norm": 2.7299423217773438,
"learning_rate": 1.02e-05,
"loss": 0.8096,
"step": 85
},
{
"epoch": 0.15722120658135283,
"grad_norm": 2.7661237716674805,
"learning_rate": 1.032e-05,
"loss": 0.9193,
"step": 86
},
{
"epoch": 0.1590493601462523,
"grad_norm": 3.0896854400634766,
"learning_rate": 1.044e-05,
"loss": 0.7745,
"step": 87
},
{
"epoch": 0.16087751371115175,
"grad_norm": 2.6443893909454346,
"learning_rate": 1.0559999999999999e-05,
"loss": 0.8674,
"step": 88
},
{
"epoch": 0.16270566727605118,
"grad_norm": 3.047353506088257,
"learning_rate": 1.068e-05,
"loss": 0.9062,
"step": 89
},
{
"epoch": 0.16453382084095064,
"grad_norm": 2.7751214504241943,
"learning_rate": 1.08e-05,
"loss": 0.8222,
"step": 90
},
{
"epoch": 0.1663619744058501,
"grad_norm": 2.5556681156158447,
"learning_rate": 1.092e-05,
"loss": 0.7737,
"step": 91
},
{
"epoch": 0.16819012797074953,
"grad_norm": 2.840104103088379,
"learning_rate": 1.104e-05,
"loss": 0.9967,
"step": 92
},
{
"epoch": 0.170018281535649,
"grad_norm": 2.784130811691284,
"learning_rate": 1.116e-05,
"loss": 0.8571,
"step": 93
},
{
"epoch": 0.17184643510054845,
"grad_norm": 2.5982677936553955,
"learning_rate": 1.128e-05,
"loss": 0.7934,
"step": 94
},
{
"epoch": 0.1736745886654479,
"grad_norm": 3.1838393211364746,
"learning_rate": 1.1400000000000001e-05,
"loss": 0.8569,
"step": 95
},
{
"epoch": 0.17550274223034734,
"grad_norm": 2.793653726577759,
"learning_rate": 1.152e-05,
"loss": 0.9144,
"step": 96
},
{
"epoch": 0.1773308957952468,
"grad_norm": 2.6756796836853027,
"learning_rate": 1.164e-05,
"loss": 0.8517,
"step": 97
},
{
"epoch": 0.17915904936014626,
"grad_norm": 2.6979010105133057,
"learning_rate": 1.1760000000000001e-05,
"loss": 0.7551,
"step": 98
},
{
"epoch": 0.1809872029250457,
"grad_norm": 2.9032483100891113,
"learning_rate": 1.1880000000000001e-05,
"loss": 0.777,
"step": 99
},
{
"epoch": 0.18281535648994515,
"grad_norm": 2.555727243423462,
"learning_rate": 1.2e-05,
"loss": 0.7583,
"step": 100
},
{
"epoch": 0.1846435100548446,
"grad_norm": 2.7780463695526123,
"learning_rate": 1.2120000000000001e-05,
"loss": 1.0916,
"step": 101
},
{
"epoch": 0.18647166361974407,
"grad_norm": 2.791424512863159,
"learning_rate": 1.224e-05,
"loss": 0.9344,
"step": 102
},
{
"epoch": 0.1882998171846435,
"grad_norm": 2.590106248855591,
"learning_rate": 1.236e-05,
"loss": 0.8391,
"step": 103
},
{
"epoch": 0.19012797074954296,
"grad_norm": 2.7519073486328125,
"learning_rate": 1.2479999999999999e-05,
"loss": 0.7809,
"step": 104
},
{
"epoch": 0.19195612431444242,
"grad_norm": 2.8074002265930176,
"learning_rate": 1.26e-05,
"loss": 0.8258,
"step": 105
},
{
"epoch": 0.19378427787934185,
"grad_norm": 2.6220719814300537,
"learning_rate": 1.272e-05,
"loss": 0.7542,
"step": 106
},
{
"epoch": 0.1956124314442413,
"grad_norm": 2.8143625259399414,
"learning_rate": 1.284e-05,
"loss": 0.8587,
"step": 107
},
{
"epoch": 0.19744058500914077,
"grad_norm": 2.4876911640167236,
"learning_rate": 1.296e-05,
"loss": 0.8425,
"step": 108
},
{
"epoch": 0.19926873857404023,
"grad_norm": 2.7102651596069336,
"learning_rate": 1.308e-05,
"loss": 0.9726,
"step": 109
},
{
"epoch": 0.20109689213893966,
"grad_norm": 2.375572919845581,
"learning_rate": 1.32e-05,
"loss": 0.8122,
"step": 110
},
{
"epoch": 0.20292504570383912,
"grad_norm": 2.485874652862549,
"learning_rate": 1.3320000000000001e-05,
"loss": 0.7726,
"step": 111
},
{
"epoch": 0.20475319926873858,
"grad_norm": 2.5263822078704834,
"learning_rate": 1.344e-05,
"loss": 0.9219,
"step": 112
},
{
"epoch": 0.20658135283363802,
"grad_norm": 2.5467567443847656,
"learning_rate": 1.356e-05,
"loss": 0.8116,
"step": 113
},
{
"epoch": 0.20840950639853748,
"grad_norm": 2.3540358543395996,
"learning_rate": 1.3680000000000001e-05,
"loss": 1.0343,
"step": 114
},
{
"epoch": 0.21023765996343693,
"grad_norm": 2.6379354000091553,
"learning_rate": 1.3800000000000002e-05,
"loss": 0.8242,
"step": 115
},
{
"epoch": 0.21206581352833637,
"grad_norm": 2.5178139209747314,
"learning_rate": 1.392e-05,
"loss": 0.8899,
"step": 116
},
{
"epoch": 0.21389396709323583,
"grad_norm": 2.802619695663452,
"learning_rate": 1.4040000000000001e-05,
"loss": 0.8031,
"step": 117
},
{
"epoch": 0.21572212065813529,
"grad_norm": 2.7448935508728027,
"learning_rate": 1.416e-05,
"loss": 0.8676,
"step": 118
},
{
"epoch": 0.21755027422303475,
"grad_norm": 2.626340627670288,
"learning_rate": 1.428e-05,
"loss": 0.9465,
"step": 119
},
{
"epoch": 0.21937842778793418,
"grad_norm": 2.5691044330596924,
"learning_rate": 1.44e-05,
"loss": 0.712,
"step": 120
},
{
"epoch": 0.22120658135283364,
"grad_norm": 2.877453565597534,
"learning_rate": 1.452e-05,
"loss": 0.8605,
"step": 121
},
{
"epoch": 0.2230347349177331,
"grad_norm": 2.409876585006714,
"learning_rate": 1.464e-05,
"loss": 0.8972,
"step": 122
},
{
"epoch": 0.22486288848263253,
"grad_norm": 2.517220973968506,
"learning_rate": 1.4760000000000001e-05,
"loss": 0.822,
"step": 123
},
{
"epoch": 0.226691042047532,
"grad_norm": 2.53521728515625,
"learning_rate": 1.488e-05,
"loss": 0.7721,
"step": 124
},
{
"epoch": 0.22851919561243145,
"grad_norm": 2.533579111099243,
"learning_rate": 1.5e-05,
"loss": 0.7182,
"step": 125
},
{
"epoch": 0.2303473491773309,
"grad_norm": 2.8807780742645264,
"learning_rate": 1.5120000000000001e-05,
"loss": 0.8755,
"step": 126
},
{
"epoch": 0.23217550274223034,
"grad_norm": 2.8886823654174805,
"learning_rate": 1.524e-05,
"loss": 0.8119,
"step": 127
},
{
"epoch": 0.2340036563071298,
"grad_norm": 2.710432529449463,
"learning_rate": 1.5360000000000002e-05,
"loss": 0.7054,
"step": 128
},
{
"epoch": 0.23583180987202926,
"grad_norm": 2.3780925273895264,
"learning_rate": 1.548e-05,
"loss": 0.9101,
"step": 129
},
{
"epoch": 0.2376599634369287,
"grad_norm": 2.6293869018554688,
"learning_rate": 1.56e-05,
"loss": 0.7895,
"step": 130
},
{
"epoch": 0.23948811700182815,
"grad_norm": 2.584303617477417,
"learning_rate": 1.5720000000000002e-05,
"loss": 1.0317,
"step": 131
},
{
"epoch": 0.2413162705667276,
"grad_norm": 2.4637179374694824,
"learning_rate": 1.584e-05,
"loss": 0.7805,
"step": 132
},
{
"epoch": 0.24314442413162707,
"grad_norm": 2.4105379581451416,
"learning_rate": 1.596e-05,
"loss": 0.8044,
"step": 133
},
{
"epoch": 0.2449725776965265,
"grad_norm": 2.476205825805664,
"learning_rate": 1.6080000000000002e-05,
"loss": 0.7283,
"step": 134
},
{
"epoch": 0.24680073126142596,
"grad_norm": 2.620548725128174,
"learning_rate": 1.62e-05,
"loss": 0.8035,
"step": 135
},
{
"epoch": 0.24862888482632542,
"grad_norm": 2.4662225246429443,
"learning_rate": 1.6320000000000003e-05,
"loss": 0.8235,
"step": 136
},
{
"epoch": 0.25045703839122485,
"grad_norm": 2.405362367630005,
"learning_rate": 1.6440000000000002e-05,
"loss": 0.8681,
"step": 137
},
{
"epoch": 0.2522851919561243,
"grad_norm": 2.331638813018799,
"learning_rate": 1.656e-05,
"loss": 0.8784,
"step": 138
},
{
"epoch": 0.25411334552102377,
"grad_norm": 2.796093463897705,
"learning_rate": 1.6680000000000003e-05,
"loss": 0.9942,
"step": 139
},
{
"epoch": 0.25594149908592323,
"grad_norm": 2.3736331462860107,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.7229,
"step": 140
},
{
"epoch": 0.2577696526508227,
"grad_norm": 2.4110031127929688,
"learning_rate": 1.6919999999999997e-05,
"loss": 0.8202,
"step": 141
},
{
"epoch": 0.2595978062157221,
"grad_norm": 2.3349928855895996,
"learning_rate": 1.704e-05,
"loss": 0.7966,
"step": 142
},
{
"epoch": 0.26142595978062155,
"grad_norm": 2.4862008094787598,
"learning_rate": 1.716e-05,
"loss": 0.8141,
"step": 143
},
{
"epoch": 0.263254113345521,
"grad_norm": 2.787587881088257,
"learning_rate": 1.728e-05,
"loss": 0.7861,
"step": 144
},
{
"epoch": 0.26508226691042047,
"grad_norm": 2.687865972518921,
"learning_rate": 1.74e-05,
"loss": 0.9085,
"step": 145
},
{
"epoch": 0.26691042047531993,
"grad_norm": 2.517024278640747,
"learning_rate": 1.7519999999999998e-05,
"loss": 0.8719,
"step": 146
},
{
"epoch": 0.2687385740402194,
"grad_norm": 2.4157791137695312,
"learning_rate": 1.764e-05,
"loss": 0.8469,
"step": 147
},
{
"epoch": 0.27056672760511885,
"grad_norm": 2.647015333175659,
"learning_rate": 1.776e-05,
"loss": 0.8133,
"step": 148
},
{
"epoch": 0.27239488117001825,
"grad_norm": 2.7705986499786377,
"learning_rate": 1.7879999999999998e-05,
"loss": 0.8819,
"step": 149
},
{
"epoch": 0.2742230347349177,
"grad_norm": 2.2369964122772217,
"learning_rate": 1.8e-05,
"loss": 0.88,
"step": 150
},
{
"epoch": 0.2760511882998172,
"grad_norm": 2.239433765411377,
"learning_rate": 1.812e-05,
"loss": 0.7873,
"step": 151
},
{
"epoch": 0.27787934186471663,
"grad_norm": 2.493117332458496,
"learning_rate": 1.824e-05,
"loss": 0.8111,
"step": 152
},
{
"epoch": 0.2797074954296161,
"grad_norm": 2.5309877395629883,
"learning_rate": 1.836e-05,
"loss": 0.7235,
"step": 153
},
{
"epoch": 0.28153564899451555,
"grad_norm": 2.403522491455078,
"learning_rate": 1.848e-05,
"loss": 0.816,
"step": 154
},
{
"epoch": 0.283363802559415,
"grad_norm": 2.8262531757354736,
"learning_rate": 1.86e-05,
"loss": 0.9069,
"step": 155
},
{
"epoch": 0.2851919561243144,
"grad_norm": 2.51188588142395,
"learning_rate": 1.872e-05,
"loss": 0.8979,
"step": 156
},
{
"epoch": 0.2870201096892139,
"grad_norm": 2.493990659713745,
"learning_rate": 1.884e-05,
"loss": 0.798,
"step": 157
},
{
"epoch": 0.28884826325411334,
"grad_norm": 2.5412824153900146,
"learning_rate": 1.896e-05,
"loss": 0.7898,
"step": 158
},
{
"epoch": 0.2906764168190128,
"grad_norm": 2.4731011390686035,
"learning_rate": 1.908e-05,
"loss": 0.8854,
"step": 159
},
{
"epoch": 0.29250457038391225,
"grad_norm": 2.6185050010681152,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.8163,
"step": 160
},
{
"epoch": 0.2943327239488117,
"grad_norm": 2.384073495864868,
"learning_rate": 1.932e-05,
"loss": 0.7888,
"step": 161
},
{
"epoch": 0.2961608775137112,
"grad_norm": 2.566452741622925,
"learning_rate": 1.944e-05,
"loss": 0.8344,
"step": 162
},
{
"epoch": 0.2979890310786106,
"grad_norm": 2.4498672485351562,
"learning_rate": 1.9560000000000002e-05,
"loss": 0.8288,
"step": 163
},
{
"epoch": 0.29981718464351004,
"grad_norm": 2.7561299800872803,
"learning_rate": 1.968e-05,
"loss": 0.8321,
"step": 164
},
{
"epoch": 0.3016453382084095,
"grad_norm": 2.5148916244506836,
"learning_rate": 1.98e-05,
"loss": 0.8343,
"step": 165
},
{
"epoch": 0.30347349177330896,
"grad_norm": 2.444960594177246,
"learning_rate": 1.9920000000000002e-05,
"loss": 0.6833,
"step": 166
},
{
"epoch": 0.3053016453382084,
"grad_norm": 2.5153768062591553,
"learning_rate": 2.004e-05,
"loss": 0.9192,
"step": 167
},
{
"epoch": 0.3071297989031079,
"grad_norm": 2.301560640335083,
"learning_rate": 2.016e-05,
"loss": 0.7864,
"step": 168
},
{
"epoch": 0.30895795246800734,
"grad_norm": 2.628103733062744,
"learning_rate": 2.0280000000000002e-05,
"loss": 0.8426,
"step": 169
},
{
"epoch": 0.31078610603290674,
"grad_norm": 2.4587066173553467,
"learning_rate": 2.04e-05,
"loss": 0.8344,
"step": 170
},
{
"epoch": 0.3126142595978062,
"grad_norm": 2.4356703758239746,
"learning_rate": 2.0520000000000003e-05,
"loss": 0.7558,
"step": 171
},
{
"epoch": 0.31444241316270566,
"grad_norm": 2.531304121017456,
"learning_rate": 2.064e-05,
"loss": 0.855,
"step": 172
},
{
"epoch": 0.3162705667276051,
"grad_norm": 2.2168610095977783,
"learning_rate": 2.0759999999999998e-05,
"loss": 0.8551,
"step": 173
},
{
"epoch": 0.3180987202925046,
"grad_norm": 2.4772465229034424,
"learning_rate": 2.088e-05,
"loss": 0.8782,
"step": 174
},
{
"epoch": 0.31992687385740404,
"grad_norm": 2.4406375885009766,
"learning_rate": 2.1e-05,
"loss": 0.775,
"step": 175
},
{
"epoch": 0.3217550274223035,
"grad_norm": 2.638505697250366,
"learning_rate": 2.1119999999999998e-05,
"loss": 0.9181,
"step": 176
},
{
"epoch": 0.3235831809872029,
"grad_norm": 2.452930212020874,
"learning_rate": 2.124e-05,
"loss": 0.8452,
"step": 177
},
{
"epoch": 0.32541133455210236,
"grad_norm": 2.370314836502075,
"learning_rate": 2.136e-05,
"loss": 1.0293,
"step": 178
},
{
"epoch": 0.3272394881170018,
"grad_norm": 2.4259750843048096,
"learning_rate": 2.148e-05,
"loss": 0.7744,
"step": 179
},
{
"epoch": 0.3290676416819013,
"grad_norm": 2.374286413192749,
"learning_rate": 2.16e-05,
"loss": 0.8336,
"step": 180
},
{
"epoch": 0.33089579524680074,
"grad_norm": 2.4372458457946777,
"learning_rate": 2.172e-05,
"loss": 0.9673,
"step": 181
},
{
"epoch": 0.3327239488117002,
"grad_norm": 2.6595754623413086,
"learning_rate": 2.184e-05,
"loss": 0.8805,
"step": 182
},
{
"epoch": 0.33455210237659966,
"grad_norm": 2.521261692047119,
"learning_rate": 2.196e-05,
"loss": 0.962,
"step": 183
},
{
"epoch": 0.33638025594149906,
"grad_norm": 2.559983015060425,
"learning_rate": 2.208e-05,
"loss": 0.8236,
"step": 184
},
{
"epoch": 0.3382084095063985,
"grad_norm": 2.5021865367889404,
"learning_rate": 2.22e-05,
"loss": 0.7696,
"step": 185
},
{
"epoch": 0.340036563071298,
"grad_norm": 2.389669418334961,
"learning_rate": 2.232e-05,
"loss": 0.9296,
"step": 186
},
{
"epoch": 0.34186471663619744,
"grad_norm": 2.8006410598754883,
"learning_rate": 2.2440000000000002e-05,
"loss": 1.1051,
"step": 187
},
{
"epoch": 0.3436928702010969,
"grad_norm": 2.246638774871826,
"learning_rate": 2.256e-05,
"loss": 0.67,
"step": 188
},
{
"epoch": 0.34552102376599636,
"grad_norm": 2.3323843479156494,
"learning_rate": 2.268e-05,
"loss": 0.7483,
"step": 189
},
{
"epoch": 0.3473491773308958,
"grad_norm": 2.599168539047241,
"learning_rate": 2.2800000000000002e-05,
"loss": 0.7095,
"step": 190
},
{
"epoch": 0.3491773308957952,
"grad_norm": 2.5335357189178467,
"learning_rate": 2.292e-05,
"loss": 0.7943,
"step": 191
},
{
"epoch": 0.3510054844606947,
"grad_norm": 2.523808717727661,
"learning_rate": 2.304e-05,
"loss": 0.8714,
"step": 192
},
{
"epoch": 0.35283363802559414,
"grad_norm": 2.3433940410614014,
"learning_rate": 2.3160000000000002e-05,
"loss": 0.7879,
"step": 193
},
{
"epoch": 0.3546617915904936,
"grad_norm": 2.5101304054260254,
"learning_rate": 2.328e-05,
"loss": 0.9299,
"step": 194
},
{
"epoch": 0.35648994515539306,
"grad_norm": 2.652029275894165,
"learning_rate": 2.3400000000000003e-05,
"loss": 0.813,
"step": 195
},
{
"epoch": 0.3583180987202925,
"grad_norm": 2.250645160675049,
"learning_rate": 2.3520000000000002e-05,
"loss": 0.9784,
"step": 196
},
{
"epoch": 0.360146252285192,
"grad_norm": 2.2848877906799316,
"learning_rate": 2.364e-05,
"loss": 0.9483,
"step": 197
},
{
"epoch": 0.3619744058500914,
"grad_norm": 2.4996519088745117,
"learning_rate": 2.3760000000000003e-05,
"loss": 0.8746,
"step": 198
},
{
"epoch": 0.36380255941499084,
"grad_norm": 2.451387882232666,
"learning_rate": 2.3880000000000002e-05,
"loss": 0.8514,
"step": 199
},
{
"epoch": 0.3656307129798903,
"grad_norm": 2.382949113845825,
"learning_rate": 2.4e-05,
"loss": 1.0895,
"step": 200
},
{
"epoch": 0.36745886654478976,
"grad_norm": 2.407252788543701,
"learning_rate": 2.4120000000000003e-05,
"loss": 0.9273,
"step": 201
},
{
"epoch": 0.3692870201096892,
"grad_norm": 2.554053544998169,
"learning_rate": 2.4240000000000002e-05,
"loss": 0.8187,
"step": 202
},
{
"epoch": 0.3711151736745887,
"grad_norm": 2.1548268795013428,
"learning_rate": 2.4360000000000004e-05,
"loss": 0.9683,
"step": 203
},
{
"epoch": 0.37294332723948814,
"grad_norm": 2.419849395751953,
"learning_rate": 2.448e-05,
"loss": 0.8276,
"step": 204
},
{
"epoch": 0.37477148080438755,
"grad_norm": 2.300262451171875,
"learning_rate": 2.4599999999999998e-05,
"loss": 0.8748,
"step": 205
},
{
"epoch": 0.376599634369287,
"grad_norm": 2.4870543479919434,
"learning_rate": 2.472e-05,
"loss": 0.8901,
"step": 206
},
{
"epoch": 0.37842778793418647,
"grad_norm": 2.703481435775757,
"learning_rate": 2.484e-05,
"loss": 0.871,
"step": 207
},
{
"epoch": 0.3802559414990859,
"grad_norm": 2.597571611404419,
"learning_rate": 2.4959999999999998e-05,
"loss": 0.747,
"step": 208
},
{
"epoch": 0.3820840950639854,
"grad_norm": 2.4933812618255615,
"learning_rate": 2.508e-05,
"loss": 0.7869,
"step": 209
},
{
"epoch": 0.38391224862888484,
"grad_norm": 2.566986322402954,
"learning_rate": 2.52e-05,
"loss": 0.9081,
"step": 210
},
{
"epoch": 0.3857404021937843,
"grad_norm": 2.4893436431884766,
"learning_rate": 2.5319999999999998e-05,
"loss": 0.866,
"step": 211
},
{
"epoch": 0.3875685557586837,
"grad_norm": 2.5950074195861816,
"learning_rate": 2.544e-05,
"loss": 0.8783,
"step": 212
},
{
"epoch": 0.38939670932358317,
"grad_norm": 2.3816328048706055,
"learning_rate": 2.556e-05,
"loss": 0.8963,
"step": 213
},
{
"epoch": 0.3912248628884826,
"grad_norm": 2.064539670944214,
"learning_rate": 2.568e-05,
"loss": 0.8979,
"step": 214
},
{
"epoch": 0.3930530164533821,
"grad_norm": 2.43748140335083,
"learning_rate": 2.58e-05,
"loss": 0.8466,
"step": 215
},
{
"epoch": 0.39488117001828155,
"grad_norm": 2.2571210861206055,
"learning_rate": 2.592e-05,
"loss": 0.8433,
"step": 216
},
{
"epoch": 0.396709323583181,
"grad_norm": 2.3223443031311035,
"learning_rate": 2.604e-05,
"loss": 0.7485,
"step": 217
},
{
"epoch": 0.39853747714808047,
"grad_norm": 2.435385227203369,
"learning_rate": 2.616e-05,
"loss": 0.8868,
"step": 218
},
{
"epoch": 0.40036563071297987,
"grad_norm": 2.4609930515289307,
"learning_rate": 2.628e-05,
"loss": 0.7649,
"step": 219
},
{
"epoch": 0.40219378427787933,
"grad_norm": 2.3334007263183594,
"learning_rate": 2.64e-05,
"loss": 0.8722,
"step": 220
},
{
"epoch": 0.4040219378427788,
"grad_norm": 2.4103660583496094,
"learning_rate": 2.652e-05,
"loss": 0.8687,
"step": 221
},
{
"epoch": 0.40585009140767825,
"grad_norm": 2.386665105819702,
"learning_rate": 2.6640000000000002e-05,
"loss": 0.9062,
"step": 222
},
{
"epoch": 0.4076782449725777,
"grad_norm": 2.420870065689087,
"learning_rate": 2.676e-05,
"loss": 0.9941,
"step": 223
},
{
"epoch": 0.40950639853747717,
"grad_norm": 2.643944025039673,
"learning_rate": 2.688e-05,
"loss": 0.8953,
"step": 224
},
{
"epoch": 0.4113345521023766,
"grad_norm": 2.400880813598633,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.8583,
"step": 225
},
{
"epoch": 0.41316270566727603,
"grad_norm": 2.415785312652588,
"learning_rate": 2.712e-05,
"loss": 0.7549,
"step": 226
},
{
"epoch": 0.4149908592321755,
"grad_norm": 2.6550943851470947,
"learning_rate": 2.724e-05,
"loss": 0.9005,
"step": 227
},
{
"epoch": 0.41681901279707495,
"grad_norm": 2.31974720954895,
"learning_rate": 2.7360000000000002e-05,
"loss": 0.9962,
"step": 228
},
{
"epoch": 0.4186471663619744,
"grad_norm": 2.463061571121216,
"learning_rate": 2.748e-05,
"loss": 0.7754,
"step": 229
},
{
"epoch": 0.42047531992687387,
"grad_norm": 2.5701842308044434,
"learning_rate": 2.7600000000000003e-05,
"loss": 0.772,
"step": 230
},
{
"epoch": 0.42230347349177333,
"grad_norm": 2.3573224544525146,
"learning_rate": 2.7720000000000002e-05,
"loss": 0.8872,
"step": 231
},
{
"epoch": 0.42413162705667273,
"grad_norm": 2.345667600631714,
"learning_rate": 2.784e-05,
"loss": 0.7977,
"step": 232
},
{
"epoch": 0.4259597806215722,
"grad_norm": 2.583740234375,
"learning_rate": 2.7960000000000003e-05,
"loss": 0.9406,
"step": 233
},
{
"epoch": 0.42778793418647165,
"grad_norm": 2.51877760887146,
"learning_rate": 2.8080000000000002e-05,
"loss": 0.8245,
"step": 234
},
{
"epoch": 0.4296160877513711,
"grad_norm": 2.6624832153320312,
"learning_rate": 2.8199999999999998e-05,
"loss": 0.8747,
"step": 235
},
{
"epoch": 0.43144424131627057,
"grad_norm": 2.6126315593719482,
"learning_rate": 2.832e-05,
"loss": 0.881,
"step": 236
},
{
"epoch": 0.43327239488117003,
"grad_norm": 2.533567428588867,
"learning_rate": 2.844e-05,
"loss": 0.9505,
"step": 237
},
{
"epoch": 0.4351005484460695,
"grad_norm": 2.4115335941314697,
"learning_rate": 2.856e-05,
"loss": 0.9703,
"step": 238
},
{
"epoch": 0.4369287020109689,
"grad_norm": 2.2946977615356445,
"learning_rate": 2.868e-05,
"loss": 0.8025,
"step": 239
},
{
"epoch": 0.43875685557586835,
"grad_norm": 2.7821929454803467,
"learning_rate": 2.88e-05,
"loss": 0.8108,
"step": 240
},
{
"epoch": 0.4405850091407678,
"grad_norm": 2.5924153327941895,
"learning_rate": 2.892e-05,
"loss": 0.7716,
"step": 241
},
{
"epoch": 0.4424131627056673,
"grad_norm": 2.484504222869873,
"learning_rate": 2.904e-05,
"loss": 0.8917,
"step": 242
},
{
"epoch": 0.44424131627056673,
"grad_norm": 2.4044761657714844,
"learning_rate": 2.916e-05,
"loss": 0.9806,
"step": 243
},
{
"epoch": 0.4460694698354662,
"grad_norm": 2.3332765102386475,
"learning_rate": 2.928e-05,
"loss": 0.7616,
"step": 244
},
{
"epoch": 0.44789762340036565,
"grad_norm": 2.3703112602233887,
"learning_rate": 2.94e-05,
"loss": 0.8937,
"step": 245
},
{
"epoch": 0.44972577696526506,
"grad_norm": 2.3351054191589355,
"learning_rate": 2.9520000000000002e-05,
"loss": 0.83,
"step": 246
},
{
"epoch": 0.4515539305301645,
"grad_norm": 2.3738510608673096,
"learning_rate": 2.964e-05,
"loss": 0.904,
"step": 247
},
{
"epoch": 0.453382084095064,
"grad_norm": 2.5012619495391846,
"learning_rate": 2.976e-05,
"loss": 0.8809,
"step": 248
},
{
"epoch": 0.45521023765996343,
"grad_norm": 2.5719287395477295,
"learning_rate": 2.9880000000000002e-05,
"loss": 0.773,
"step": 249
},
{
"epoch": 0.4570383912248629,
"grad_norm": 2.3036999702453613,
"learning_rate": 3e-05,
"loss": 0.7487,
"step": 250
},
{
"epoch": 0.4570383912248629,
"eval_loss": 0.8340924382209778,
"eval_runtime": 11.3221,
"eval_samples_per_second": 98.215,
"eval_steps_per_second": 3.091,
"step": 250
},
{
"epoch": 0.45886654478976235,
"grad_norm": 2.355015754699707,
"learning_rate": 2.9999160841378727e-05,
"loss": 0.7973,
"step": 251
},
{
"epoch": 0.4606946983546618,
"grad_norm": 2.296038866043091,
"learning_rate": 2.9996643459406528e-05,
"loss": 0.8632,
"step": 252
},
{
"epoch": 0.4625228519195612,
"grad_norm": 2.2504048347473145,
"learning_rate": 2.999244813574778e-05,
"loss": 0.704,
"step": 253
},
{
"epoch": 0.4643510054844607,
"grad_norm": 2.4145545959472656,
"learning_rate": 2.9986575339808077e-05,
"loss": 0.7892,
"step": 254
},
{
"epoch": 0.46617915904936014,
"grad_norm": 2.3196182250976562,
"learning_rate": 2.997902572868174e-05,
"loss": 0.9237,
"step": 255
},
{
"epoch": 0.4680073126142596,
"grad_norm": 2.5195236206054688,
"learning_rate": 2.9969800147078265e-05,
"loss": 0.8632,
"step": 256
},
{
"epoch": 0.46983546617915906,
"grad_norm": 2.3776962757110596,
"learning_rate": 2.995889962722784e-05,
"loss": 0.8948,
"step": 257
},
{
"epoch": 0.4716636197440585,
"grad_norm": 2.3582563400268555,
"learning_rate": 2.9946325388765812e-05,
"loss": 0.8258,
"step": 258
},
{
"epoch": 0.473491773308958,
"grad_norm": 2.4774725437164307,
"learning_rate": 2.993207883859627e-05,
"loss": 0.8687,
"step": 259
},
{
"epoch": 0.4753199268738574,
"grad_norm": 2.2049193382263184,
"learning_rate": 2.99161615707346e-05,
"loss": 0.9289,
"step": 260
},
{
"epoch": 0.47714808043875684,
"grad_norm": 2.2471542358398438,
"learning_rate": 2.9898575366129145e-05,
"loss": 0.8769,
"step": 261
},
{
"epoch": 0.4789762340036563,
"grad_norm": 2.2609918117523193,
"learning_rate": 2.9879322192461932e-05,
"loss": 1.0632,
"step": 262
},
{
"epoch": 0.48080438756855576,
"grad_norm": 2.3569087982177734,
"learning_rate": 2.985840420392851e-05,
"loss": 0.854,
"step": 263
},
{
"epoch": 0.4826325411334552,
"grad_norm": 2.398346185684204,
"learning_rate": 2.9835823740996944e-05,
"loss": 0.7765,
"step": 264
},
{
"epoch": 0.4844606946983547,
"grad_norm": 2.251390218734741,
"learning_rate": 2.9811583330145915e-05,
"loss": 0.8045,
"step": 265
},
{
"epoch": 0.48628884826325414,
"grad_norm": 2.3630456924438477,
"learning_rate": 2.9785685683582057e-05,
"loss": 0.8945,
"step": 266
},
{
"epoch": 0.48811700182815354,
"grad_norm": 2.259655714035034,
"learning_rate": 2.975813369893649e-05,
"loss": 0.7409,
"step": 267
},
{
"epoch": 0.489945155393053,
"grad_norm": 2.4072036743164062,
"learning_rate": 2.97289304589406e-05,
"loss": 0.8358,
"step": 268
},
{
"epoch": 0.49177330895795246,
"grad_norm": 2.3019490242004395,
"learning_rate": 2.9698079231081144e-05,
"loss": 0.8837,
"step": 269
},
{
"epoch": 0.4936014625228519,
"grad_norm": 2.3812527656555176,
"learning_rate": 2.966558346723463e-05,
"loss": 0.8772,
"step": 270
},
{
"epoch": 0.4954296160877514,
"grad_norm": 2.3249640464782715,
"learning_rate": 2.963144680328111e-05,
"loss": 0.7369,
"step": 271
},
{
"epoch": 0.49725776965265084,
"grad_norm": 2.431414842605591,
"learning_rate": 2.959567305869736e-05,
"loss": 0.8207,
"step": 272
},
{
"epoch": 0.4990859232175503,
"grad_norm": 2.3795621395111084,
"learning_rate": 2.955826623612954e-05,
"loss": 0.73,
"step": 273
},
{
"epoch": 0.5009140767824497,
"grad_norm": 2.426405906677246,
"learning_rate": 2.9519230520945346e-05,
"loss": 0.9324,
"step": 274
},
{
"epoch": 0.5027422303473492,
"grad_norm": 2.2649593353271484,
"learning_rate": 2.947857028076569e-05,
"loss": 0.8003,
"step": 275
},
{
"epoch": 0.5045703839122486,
"grad_norm": 2.481842041015625,
"learning_rate": 2.943629006497606e-05,
"loss": 0.7915,
"step": 276
},
{
"epoch": 0.506398537477148,
"grad_norm": 2.5210118293762207,
"learning_rate": 2.939239460421746e-05,
"loss": 0.7953,
"step": 277
},
{
"epoch": 0.5082266910420475,
"grad_norm": 2.3630707263946533,
"learning_rate": 2.934688880985714e-05,
"loss": 0.8232,
"step": 278
},
{
"epoch": 0.5100548446069469,
"grad_norm": 2.3418996334075928,
"learning_rate": 2.9299777773439056e-05,
"loss": 0.909,
"step": 279
},
{
"epoch": 0.5118829981718465,
"grad_norm": 2.34122633934021,
"learning_rate": 2.925106676611418e-05,
"loss": 0.7633,
"step": 280
},
{
"epoch": 0.5137111517367459,
"grad_norm": 2.499547243118286,
"learning_rate": 2.9200761238050756e-05,
"loss": 0.851,
"step": 281
},
{
"epoch": 0.5155393053016454,
"grad_norm": 2.456969738006592,
"learning_rate": 2.9148866817824454e-05,
"loss": 0.8803,
"step": 282
},
{
"epoch": 0.5173674588665448,
"grad_norm": 2.2602295875549316,
"learning_rate": 2.9095389311788626e-05,
"loss": 0.8049,
"step": 283
},
{
"epoch": 0.5191956124314442,
"grad_norm": 2.1520049571990967,
"learning_rate": 2.9040334703424637e-05,
"loss": 0.7233,
"step": 284
},
{
"epoch": 0.5210237659963437,
"grad_norm": 2.4685440063476562,
"learning_rate": 2.8983709152672386e-05,
"loss": 0.9514,
"step": 285
},
{
"epoch": 0.5228519195612431,
"grad_norm": 2.296013593673706,
"learning_rate": 2.892551899524109e-05,
"loss": 0.7938,
"step": 286
},
{
"epoch": 0.5246800731261426,
"grad_norm": 2.3713924884796143,
"learning_rate": 2.8865770741900382e-05,
"loss": 0.93,
"step": 287
},
{
"epoch": 0.526508226691042,
"grad_norm": 2.6389975547790527,
"learning_rate": 2.8804471077751847e-05,
"loss": 0.9036,
"step": 288
},
{
"epoch": 0.5283363802559415,
"grad_norm": 2.4582440853118896,
"learning_rate": 2.8741626861481043e-05,
"loss": 0.9437,
"step": 289
},
{
"epoch": 0.5301645338208409,
"grad_norm": 2.3008275032043457,
"learning_rate": 2.8677245124590087e-05,
"loss": 0.7939,
"step": 290
},
{
"epoch": 0.5319926873857403,
"grad_norm": 2.319469928741455,
"learning_rate": 2.8611333070610918e-05,
"loss": 0.8535,
"step": 291
},
{
"epoch": 0.5338208409506399,
"grad_norm": 2.295746088027954,
"learning_rate": 2.8543898074299322e-05,
"loss": 0.736,
"step": 292
},
{
"epoch": 0.5356489945155393,
"grad_norm": 2.5527262687683105,
"learning_rate": 2.8474947680809754e-05,
"loss": 0.8192,
"step": 293
},
{
"epoch": 0.5374771480804388,
"grad_norm": 2.308958053588867,
"learning_rate": 2.8404489604851186e-05,
"loss": 0.9077,
"step": 294
},
{
"epoch": 0.5393053016453382,
"grad_norm": 2.524796724319458,
"learning_rate": 2.8332531729823853e-05,
"loss": 0.8038,
"step": 295
},
{
"epoch": 0.5411334552102377,
"grad_norm": 2.420640468597412,
"learning_rate": 2.8259082106937255e-05,
"loss": 0.7417,
"step": 296
},
{
"epoch": 0.5429616087751371,
"grad_norm": 2.364328384399414,
"learning_rate": 2.8184148954309295e-05,
"loss": 0.8791,
"step": 297
},
{
"epoch": 0.5447897623400365,
"grad_norm": 2.412336587905884,
"learning_rate": 2.8107740656046775e-05,
"loss": 0.83,
"step": 298
},
{
"epoch": 0.546617915904936,
"grad_norm": 2.5241622924804688,
"learning_rate": 2.802986576130733e-05,
"loss": 0.8886,
"step": 299
},
{
"epoch": 0.5484460694698354,
"grad_norm": 2.330146074295044,
"learning_rate": 2.7950532983342863e-05,
"loss": 0.8117,
"step": 300
},
{
"epoch": 0.5502742230347349,
"grad_norm": 2.1738884449005127,
"learning_rate": 2.7869751198524656e-05,
"loss": 0.8588,
"step": 301
},
{
"epoch": 0.5521023765996343,
"grad_norm": 2.343388319015503,
"learning_rate": 2.7787529445350192e-05,
"loss": 0.7355,
"step": 302
},
{
"epoch": 0.5539305301645339,
"grad_norm": 2.2163190841674805,
"learning_rate": 2.7703876923431882e-05,
"loss": 0.8508,
"step": 303
},
{
"epoch": 0.5557586837294333,
"grad_norm": 2.1025807857513428,
"learning_rate": 2.7618802992467718e-05,
"loss": 0.7909,
"step": 304
},
{
"epoch": 0.5575868372943327,
"grad_norm": 2.4115538597106934,
"learning_rate": 2.753231717119405e-05,
"loss": 0.7964,
"step": 305
},
{
"epoch": 0.5594149908592322,
"grad_norm": 2.2953007221221924,
"learning_rate": 2.744442913632054e-05,
"loss": 0.8284,
"step": 306
},
{
"epoch": 0.5612431444241316,
"grad_norm": 2.4674270153045654,
"learning_rate": 2.7355148721447492e-05,
"loss": 0.9302,
"step": 307
},
{
"epoch": 0.5630712979890311,
"grad_norm": 2.447037935256958,
"learning_rate": 2.7264485915965548e-05,
"loss": 0.9281,
"step": 308
},
{
"epoch": 0.5648994515539305,
"grad_norm": 2.1784889698028564,
"learning_rate": 2.717245086393801e-05,
"loss": 0.7989,
"step": 309
},
{
"epoch": 0.56672760511883,
"grad_norm": 2.2562270164489746,
"learning_rate": 2.707905386296588e-05,
"loss": 0.8856,
"step": 310
},
{
"epoch": 0.5685557586837294,
"grad_norm": 2.272416591644287,
"learning_rate": 2.6984305363035616e-05,
"loss": 1.0322,
"step": 311
},
{
"epoch": 0.5703839122486288,
"grad_norm": 2.2202160358428955,
"learning_rate": 2.6888215965349974e-05,
"loss": 0.9454,
"step": 312
},
{
"epoch": 0.5722120658135283,
"grad_norm": 2.4724793434143066,
"learning_rate": 2.6790796421141813e-05,
"loss": 0.8584,
"step": 313
},
{
"epoch": 0.5740402193784278,
"grad_norm": 2.3383536338806152,
"learning_rate": 2.6692057630471184e-05,
"loss": 0.978,
"step": 314
},
{
"epoch": 0.5758683729433273,
"grad_norm": 2.173809766769409,
"learning_rate": 2.6592010641005745e-05,
"loss": 0.8318,
"step": 315
},
{
"epoch": 0.5776965265082267,
"grad_norm": 2.306762456893921,
"learning_rate": 2.649066664678467e-05,
"loss": 0.841,
"step": 316
},
{
"epoch": 0.5795246800731262,
"grad_norm": 2.038734197616577,
"learning_rate": 2.638803698696615e-05,
"loss": 0.8219,
"step": 317
},
{
"epoch": 0.5813528336380256,
"grad_norm": 2.2740612030029297,
"learning_rate": 2.6284133144558697e-05,
"loss": 0.8945,
"step": 318
},
{
"epoch": 0.583180987202925,
"grad_norm": 2.338181972503662,
"learning_rate": 2.6178966745136322e-05,
"loss": 1.0114,
"step": 319
},
{
"epoch": 0.5850091407678245,
"grad_norm": 2.357879877090454,
"learning_rate": 2.60725495555378e-05,
"loss": 0.7024,
"step": 320
},
{
"epoch": 0.5868372943327239,
"grad_norm": 2.271117925643921,
"learning_rate": 2.5964893482550076e-05,
"loss": 0.8802,
"step": 321
},
{
"epoch": 0.5886654478976234,
"grad_norm": 2.092961072921753,
"learning_rate": 2.5856010571576052e-05,
"loss": 0.8343,
"step": 322
},
{
"epoch": 0.5904936014625228,
"grad_norm": 2.297849655151367,
"learning_rate": 2.574591300528686e-05,
"loss": 0.8124,
"step": 323
},
{
"epoch": 0.5923217550274223,
"grad_norm": 2.293593645095825,
"learning_rate": 2.563461310225875e-05,
"loss": 0.7819,
"step": 324
},
{
"epoch": 0.5941499085923218,
"grad_norm": 2.2364585399627686,
"learning_rate": 2.552212331559482e-05,
"loss": 0.9649,
"step": 325
},
{
"epoch": 0.5959780621572212,
"grad_norm": 2.2145204544067383,
"learning_rate": 2.5408456231531634e-05,
"loss": 0.8959,
"step": 326
},
{
"epoch": 0.5978062157221207,
"grad_norm": 2.4612884521484375,
"learning_rate": 2.5293624568031008e-05,
"loss": 0.929,
"step": 327
},
{
"epoch": 0.5996343692870201,
"grad_norm": 2.4367892742156982,
"learning_rate": 2.5177641173356985e-05,
"loss": 0.7942,
"step": 328
},
{
"epoch": 0.6014625228519196,
"grad_norm": 2.5621209144592285,
"learning_rate": 2.5060519024638312e-05,
"loss": 0.9107,
"step": 329
},
{
"epoch": 0.603290676416819,
"grad_norm": 2.2086422443389893,
"learning_rate": 2.4942271226416444e-05,
"loss": 0.7485,
"step": 330
},
{
"epoch": 0.6051188299817185,
"grad_norm": 2.4878604412078857,
"learning_rate": 2.482291100917928e-05,
"loss": 0.8663,
"step": 331
},
{
"epoch": 0.6069469835466179,
"grad_norm": 2.4622035026550293,
"learning_rate": 2.4702451727880862e-05,
"loss": 0.9976,
"step": 332
},
{
"epoch": 0.6087751371115173,
"grad_norm": 2.313488245010376,
"learning_rate": 2.458090686044712e-05,
"loss": 0.86,
"step": 333
},
{
"epoch": 0.6106032906764168,
"grad_norm": 2.495249032974243,
"learning_rate": 2.445829000626784e-05,
"loss": 0.7586,
"step": 334
},
{
"epoch": 0.6124314442413162,
"grad_norm": 2.2994625568389893,
"learning_rate": 2.433461488467505e-05,
"loss": 0.9011,
"step": 335
},
{
"epoch": 0.6142595978062158,
"grad_norm": 2.410585403442383,
"learning_rate": 2.4209895333408028e-05,
"loss": 0.7784,
"step": 336
},
{
"epoch": 0.6160877513711152,
"grad_norm": 2.371408462524414,
"learning_rate": 2.4084145307065e-05,
"loss": 0.9034,
"step": 337
},
{
"epoch": 0.6179159049360147,
"grad_norm": 2.2253592014312744,
"learning_rate": 2.3957378875541795e-05,
"loss": 0.8581,
"step": 338
},
{
"epoch": 0.6197440585009141,
"grad_norm": 2.18859601020813,
"learning_rate": 2.382961022245759e-05,
"loss": 0.8338,
"step": 339
},
{
"epoch": 0.6215722120658135,
"grad_norm": 2.1277389526367188,
"learning_rate": 2.3700853643567973e-05,
"loss": 0.7985,
"step": 340
},
{
"epoch": 0.623400365630713,
"grad_norm": 2.2631025314331055,
"learning_rate": 2.3571123545165362e-05,
"loss": 0.865,
"step": 341
},
{
"epoch": 0.6252285191956124,
"grad_norm": 2.4531781673431396,
"learning_rate": 2.3440434442467155e-05,
"loss": 0.8673,
"step": 342
},
{
"epoch": 0.6270566727605119,
"grad_norm": 2.3396685123443604,
"learning_rate": 2.3308800957991657e-05,
"loss": 0.868,
"step": 343
},
{
"epoch": 0.6288848263254113,
"grad_norm": 2.2110092639923096,
"learning_rate": 2.3176237819921975e-05,
"loss": 0.7553,
"step": 344
},
{
"epoch": 0.6307129798903108,
"grad_norm": 2.3857622146606445,
"learning_rate": 2.3042759860458142e-05,
"loss": 0.7463,
"step": 345
},
{
"epoch": 0.6325411334552102,
"grad_norm": 2.304614782333374,
"learning_rate": 2.2908382014157536e-05,
"loss": 0.939,
"step": 346
},
{
"epoch": 0.6343692870201096,
"grad_norm": 2.360813617706299,
"learning_rate": 2.2773119316263935e-05,
"loss": 0.7792,
"step": 347
},
{
"epoch": 0.6361974405850092,
"grad_norm": 2.41550612449646,
"learning_rate": 2.2636986901025208e-05,
"loss": 0.8776,
"step": 348
},
{
"epoch": 0.6380255941499086,
"grad_norm": 2.514841318130493,
"learning_rate": 2.25e-05,
"loss": 0.8356,
"step": 349
},
{
"epoch": 0.6398537477148081,
"grad_norm": 2.2054624557495117,
"learning_rate": 2.2362173940353522e-05,
"loss": 0.7899,
"step": 350
},
{
"epoch": 0.6416819012797075,
"grad_norm": 2.144213914871216,
"learning_rate": 2.2223524143142595e-05,
"loss": 0.8054,
"step": 351
},
{
"epoch": 0.643510054844607,
"grad_norm": 2.340751886367798,
"learning_rate": 2.2084066121590242e-05,
"loss": 0.8224,
"step": 352
},
{
"epoch": 0.6453382084095064,
"grad_norm": 2.3917925357818604,
"learning_rate": 2.194381547934994e-05,
"loss": 0.8739,
"step": 353
},
{
"epoch": 0.6471663619744058,
"grad_norm": 2.30846905708313,
"learning_rate": 2.1802787908759767e-05,
"loss": 0.866,
"step": 354
},
{
"epoch": 0.6489945155393053,
"grad_norm": 2.0527448654174805,
"learning_rate": 2.1660999189086613e-05,
"loss": 0.8253,
"step": 355
},
{
"epoch": 0.6508226691042047,
"grad_norm": 2.263025999069214,
"learning_rate": 2.1518465184760686e-05,
"loss": 0.8838,
"step": 356
},
{
"epoch": 0.6526508226691042,
"grad_norm": 2.3904080390930176,
"learning_rate": 2.1375201843600452e-05,
"loss": 0.9442,
"step": 357
},
{
"epoch": 0.6544789762340036,
"grad_norm": 2.1965222358703613,
"learning_rate": 2.12312251950283e-05,
"loss": 0.6803,
"step": 358
},
{
"epoch": 0.6563071297989032,
"grad_norm": 2.2777087688446045,
"learning_rate": 2.108655134827701e-05,
"loss": 0.8077,
"step": 359
},
{
"epoch": 0.6581352833638026,
"grad_norm": 2.2738406658172607,
"learning_rate": 2.0941196490587352e-05,
"loss": 0.855,
"step": 360
},
{
"epoch": 0.659963436928702,
"grad_norm": 2.04484486579895,
"learning_rate": 2.0795176885396928e-05,
"loss": 0.8816,
"step": 361
},
{
"epoch": 0.6617915904936015,
"grad_norm": 2.364666223526001,
"learning_rate": 2.064850887052048e-05,
"loss": 0.9707,
"step": 362
},
{
"epoch": 0.6636197440585009,
"grad_norm": 2.2735183238983154,
"learning_rate": 2.0501208856321895e-05,
"loss": 0.8226,
"step": 363
},
{
"epoch": 0.6654478976234004,
"grad_norm": 2.370248794555664,
"learning_rate": 2.035329332387808e-05,
"loss": 0.797,
"step": 364
},
{
"epoch": 0.6672760511882998,
"grad_norm": 2.614694595336914,
"learning_rate": 2.0204778823134936e-05,
"loss": 0.8665,
"step": 365
},
{
"epoch": 0.6691042047531993,
"grad_norm": 2.3441321849823,
"learning_rate": 2.0055681971055626e-05,
"loss": 0.8658,
"step": 366
},
{
"epoch": 0.6709323583180987,
"grad_norm": 2.3217623233795166,
"learning_rate": 1.990601944976133e-05,
"loss": 0.8256,
"step": 367
},
{
"epoch": 0.6727605118829981,
"grad_norm": 2.209233522415161,
"learning_rate": 1.9755808004664702e-05,
"loss": 0.7482,
"step": 368
},
{
"epoch": 0.6745886654478976,
"grad_norm": 2.4364049434661865,
"learning_rate": 1.9605064442596316e-05,
"loss": 0.8031,
"step": 369
},
{
"epoch": 0.676416819012797,
"grad_norm": 2.168339967727661,
"learning_rate": 1.9453805629924126e-05,
"loss": 0.8416,
"step": 370
},
{
"epoch": 0.6782449725776966,
"grad_norm": 2.428342580795288,
"learning_rate": 1.9302048490666356e-05,
"loss": 0.8554,
"step": 371
},
{
"epoch": 0.680073126142596,
"grad_norm": 1.9630411863327026,
"learning_rate": 1.9149810004597906e-05,
"loss": 0.7988,
"step": 372
},
{
"epoch": 0.6819012797074955,
"grad_norm": 2.591010570526123,
"learning_rate": 1.8997107205350525e-05,
"loss": 1.048,
"step": 373
},
{
"epoch": 0.6837294332723949,
"grad_norm": 2.476414442062378,
"learning_rate": 1.884395717850694e-05,
"loss": 0.8041,
"step": 374
},
{
"epoch": 0.6855575868372943,
"grad_norm": 2.514333486557007,
"learning_rate": 1.8690377059689202e-05,
"loss": 0.8906,
"step": 375
},
{
"epoch": 0.6873857404021938,
"grad_norm": 2.299752712249756,
"learning_rate": 1.853638403264141e-05,
"loss": 0.9203,
"step": 376
},
{
"epoch": 0.6892138939670932,
"grad_norm": 2.3039369583129883,
"learning_rate": 1.8381995327307067e-05,
"loss": 0.8833,
"step": 377
},
{
"epoch": 0.6910420475319927,
"grad_norm": 2.3373348712921143,
"learning_rate": 1.822722821790126e-05,
"loss": 0.7324,
"step": 378
},
{
"epoch": 0.6928702010968921,
"grad_norm": 2.774083137512207,
"learning_rate": 1.807210002097786e-05,
"loss": 0.8778,
"step": 379
},
{
"epoch": 0.6946983546617916,
"grad_norm": 2.214552402496338,
"learning_rate": 1.791662809349206e-05,
"loss": 0.8044,
"step": 380
},
{
"epoch": 0.696526508226691,
"grad_norm": 2.298497438430786,
"learning_rate": 1.7760829830858305e-05,
"loss": 0.8667,
"step": 381
},
{
"epoch": 0.6983546617915904,
"grad_norm": 2.23805570602417,
"learning_rate": 1.760472266500396e-05,
"loss": 0.7938,
"step": 382
},
{
"epoch": 0.70018281535649,
"grad_norm": 2.18110990524292,
"learning_rate": 1.744832406241889e-05,
"loss": 0.8147,
"step": 383
},
{
"epoch": 0.7020109689213894,
"grad_norm": 2.2718112468719482,
"learning_rate": 1.7291651522201208e-05,
"loss": 0.973,
"step": 384
},
{
"epoch": 0.7038391224862889,
"grad_norm": 2.254279375076294,
"learning_rate": 1.713472257409928e-05,
"loss": 0.7439,
"step": 385
},
{
"epoch": 0.7056672760511883,
"grad_norm": 2.268983840942383,
"learning_rate": 1.6977554776550403e-05,
"loss": 0.8309,
"step": 386
},
{
"epoch": 0.7074954296160878,
"grad_norm": 2.189608097076416,
"learning_rate": 1.682016571471623e-05,
"loss": 0.8748,
"step": 387
},
{
"epoch": 0.7093235831809872,
"grad_norm": 2.231454610824585,
"learning_rate": 1.6662572998515166e-05,
"loss": 0.8759,
"step": 388
},
{
"epoch": 0.7111517367458866,
"grad_norm": 2.324653148651123,
"learning_rate": 1.6504794260652077e-05,
"loss": 0.7731,
"step": 389
},
{
"epoch": 0.7129798903107861,
"grad_norm": 2.113718271255493,
"learning_rate": 1.6346847154645376e-05,
"loss": 0.7961,
"step": 390
},
{
"epoch": 0.7148080438756855,
"grad_norm": 2.413463830947876,
"learning_rate": 1.6188749352851825e-05,
"loss": 0.9315,
"step": 391
},
{
"epoch": 0.716636197440585,
"grad_norm": 2.175915002822876,
"learning_rate": 1.6030518544489215e-05,
"loss": 0.7061,
"step": 392
},
{
"epoch": 0.7184643510054844,
"grad_norm": 2.2238268852233887,
"learning_rate": 1.587217243365714e-05,
"loss": 0.8585,
"step": 393
},
{
"epoch": 0.720292504570384,
"grad_norm": 2.3010525703430176,
"learning_rate": 1.5713728737356138e-05,
"loss": 0.8064,
"step": 394
},
{
"epoch": 0.7221206581352834,
"grad_norm": 2.2713418006896973,
"learning_rate": 1.555520518350537e-05,
"loss": 0.8125,
"step": 395
},
{
"epoch": 0.7239488117001828,
"grad_norm": 2.311316967010498,
"learning_rate": 1.5396619508959102e-05,
"loss": 0.7494,
"step": 396
},
{
"epoch": 0.7257769652650823,
"grad_norm": 2.3094563484191895,
"learning_rate": 1.523798945752212e-05,
"loss": 0.8135,
"step": 397
},
{
"epoch": 0.7276051188299817,
"grad_norm": 2.1408050060272217,
"learning_rate": 1.5079332777964467e-05,
"loss": 0.8519,
"step": 398
},
{
"epoch": 0.7294332723948812,
"grad_norm": 2.196596622467041,
"learning_rate": 1.4920667222035532e-05,
"loss": 0.9019,
"step": 399
},
{
"epoch": 0.7312614259597806,
"grad_norm": 2.4077069759368896,
"learning_rate": 1.4762010542477881e-05,
"loss": 0.8437,
"step": 400
},
{
"epoch": 0.7330895795246801,
"grad_norm": 2.138925075531006,
"learning_rate": 1.46033804910409e-05,
"loss": 0.7867,
"step": 401
},
{
"epoch": 0.7349177330895795,
"grad_norm": 2.280134439468384,
"learning_rate": 1.4444794816494629e-05,
"loss": 1.0417,
"step": 402
},
{
"epoch": 0.7367458866544789,
"grad_norm": 2.484534502029419,
"learning_rate": 1.4286271262643866e-05,
"loss": 0.7929,
"step": 403
},
{
"epoch": 0.7385740402193784,
"grad_norm": 2.2009499073028564,
"learning_rate": 1.4127827566342864e-05,
"loss": 0.7963,
"step": 404
},
{
"epoch": 0.7404021937842779,
"grad_norm": 2.313990831375122,
"learning_rate": 1.3969481455510787e-05,
"loss": 0.9538,
"step": 405
},
{
"epoch": 0.7422303473491774,
"grad_norm": 2.1209707260131836,
"learning_rate": 1.3811250647148172e-05,
"loss": 0.8327,
"step": 406
},
{
"epoch": 0.7440585009140768,
"grad_norm": 2.3821375370025635,
"learning_rate": 1.3653152845354625e-05,
"loss": 0.8677,
"step": 407
},
{
"epoch": 0.7458866544789763,
"grad_norm": 2.179967164993286,
"learning_rate": 1.3495205739347925e-05,
"loss": 0.8095,
"step": 408
},
{
"epoch": 0.7477148080438757,
"grad_norm": 2.5116395950317383,
"learning_rate": 1.3337427001484836e-05,
"loss": 0.9218,
"step": 409
},
{
"epoch": 0.7495429616087751,
"grad_norm": 2.173802375793457,
"learning_rate": 1.3179834285283773e-05,
"loss": 0.7475,
"step": 410
},
{
"epoch": 0.7513711151736746,
"grad_norm": 2.0795040130615234,
"learning_rate": 1.3022445223449596e-05,
"loss": 0.8749,
"step": 411
},
{
"epoch": 0.753199268738574,
"grad_norm": 2.1474385261535645,
"learning_rate": 1.2865277425900725e-05,
"loss": 0.8277,
"step": 412
},
{
"epoch": 0.7550274223034735,
"grad_norm": 2.243417978286743,
"learning_rate": 1.2708348477798795e-05,
"loss": 0.8147,
"step": 413
},
{
"epoch": 0.7568555758683729,
"grad_norm": 2.3106589317321777,
"learning_rate": 1.255167593758111e-05,
"loss": 0.7848,
"step": 414
},
{
"epoch": 0.7586837294332724,
"grad_norm": 2.397627830505371,
"learning_rate": 1.2395277334996045e-05,
"loss": 0.9778,
"step": 415
},
{
"epoch": 0.7605118829981719,
"grad_norm": 2.3535757064819336,
"learning_rate": 1.2239170169141696e-05,
"loss": 0.7996,
"step": 416
},
{
"epoch": 0.7623400365630713,
"grad_norm": 2.224731922149658,
"learning_rate": 1.2083371906507939e-05,
"loss": 0.8442,
"step": 417
},
{
"epoch": 0.7641681901279708,
"grad_norm": 2.4303503036499023,
"learning_rate": 1.1927899979022143e-05,
"loss": 0.8317,
"step": 418
},
{
"epoch": 0.7659963436928702,
"grad_norm": 2.4696667194366455,
"learning_rate": 1.1772771782098748e-05,
"loss": 0.8581,
"step": 419
},
{
"epoch": 0.7678244972577697,
"grad_norm": 2.2766096591949463,
"learning_rate": 1.1618004672692937e-05,
"loss": 0.781,
"step": 420
},
{
"epoch": 0.7696526508226691,
"grad_norm": 2.2170205116271973,
"learning_rate": 1.146361596735859e-05,
"loss": 0.6847,
"step": 421
},
{
"epoch": 0.7714808043875686,
"grad_norm": 2.301888942718506,
"learning_rate": 1.1309622940310798e-05,
"loss": 0.9334,
"step": 422
},
{
"epoch": 0.773308957952468,
"grad_norm": 2.0786006450653076,
"learning_rate": 1.1156042821493062e-05,
"loss": 0.8339,
"step": 423
},
{
"epoch": 0.7751371115173674,
"grad_norm": 2.1867787837982178,
"learning_rate": 1.1002892794649478e-05,
"loss": 0.8398,
"step": 424
},
{
"epoch": 0.7769652650822669,
"grad_norm": 2.1924829483032227,
"learning_rate": 1.0850189995402096e-05,
"loss": 0.8241,
"step": 425
},
{
"epoch": 0.7787934186471663,
"grad_norm": 2.104240655899048,
"learning_rate": 1.069795150933365e-05,
"loss": 0.83,
"step": 426
},
{
"epoch": 0.7806215722120659,
"grad_norm": 2.301518201828003,
"learning_rate": 1.0546194370075882e-05,
"loss": 0.7494,
"step": 427
},
{
"epoch": 0.7824497257769653,
"grad_norm": 2.3547585010528564,
"learning_rate": 1.0394935557403684e-05,
"loss": 0.7907,
"step": 428
},
{
"epoch": 0.7842778793418648,
"grad_norm": 2.225034713745117,
"learning_rate": 1.0244191995335299e-05,
"loss": 0.8484,
"step": 429
},
{
"epoch": 0.7861060329067642,
"grad_norm": 2.3130884170532227,
"learning_rate": 1.0093980550238676e-05,
"loss": 0.8425,
"step": 430
},
{
"epoch": 0.7879341864716636,
"grad_norm": 2.425241708755493,
"learning_rate": 9.944318028944374e-06,
"loss": 0.9269,
"step": 431
},
{
"epoch": 0.7897623400365631,
"grad_norm": 2.1149165630340576,
"learning_rate": 9.795221176865065e-06,
"loss": 0.7503,
"step": 432
},
{
"epoch": 0.7915904936014625,
"grad_norm": 2.3856897354125977,
"learning_rate": 9.646706676121924e-06,
"loss": 0.8628,
"step": 433
},
{
"epoch": 0.793418647166362,
"grad_norm": 2.1912615299224854,
"learning_rate": 9.49879114367811e-06,
"loss": 0.8198,
"step": 434
},
{
"epoch": 0.7952468007312614,
"grad_norm": 2.1112685203552246,
"learning_rate": 9.351491129479519e-06,
"loss": 0.8933,
"step": 435
},
{
"epoch": 0.7970749542961609,
"grad_norm": 2.3817248344421387,
"learning_rate": 9.20482311460307e-06,
"loss": 0.8212,
"step": 436
},
{
"epoch": 0.7989031078610603,
"grad_norm": 2.216339349746704,
"learning_rate": 9.058803509412647e-06,
"loss": 0.7964,
"step": 437
},
{
"epoch": 0.8007312614259597,
"grad_norm": 2.2197396755218506,
"learning_rate": 8.913448651722994e-06,
"loss": 0.7535,
"step": 438
},
{
"epoch": 0.8025594149908593,
"grad_norm": 2.083980083465576,
"learning_rate": 8.768774804971705e-06,
"loss": 0.9009,
"step": 439
},
{
"epoch": 0.8043875685557587,
"grad_norm": 2.0909934043884277,
"learning_rate": 8.624798156399554e-06,
"loss": 0.8016,
"step": 440
},
{
"epoch": 0.8062157221206582,
"grad_norm": 2.4581222534179688,
"learning_rate": 8.481534815239323e-06,
"loss": 0.9227,
"step": 441
},
{
"epoch": 0.8080438756855576,
"grad_norm": 2.1503217220306396,
"learning_rate": 8.339000810913388e-06,
"loss": 0.7305,
"step": 442
},
{
"epoch": 0.8098720292504571,
"grad_norm": 1.9855475425720215,
"learning_rate": 8.197212091240237e-06,
"loss": 0.7195,
"step": 443
},
{
"epoch": 0.8117001828153565,
"grad_norm": 2.25361967086792,
"learning_rate": 8.056184520650064e-06,
"loss": 0.7594,
"step": 444
},
{
"epoch": 0.8135283363802559,
"grad_norm": 2.2054708003997803,
"learning_rate": 7.915933878409762e-06,
"loss": 0.7931,
"step": 445
},
{
"epoch": 0.8153564899451554,
"grad_norm": 2.134115219116211,
"learning_rate": 7.776475856857409e-06,
"loss": 0.7195,
"step": 446
},
{
"epoch": 0.8171846435100548,
"grad_norm": 1.9758131504058838,
"learning_rate": 7.63782605964648e-06,
"loss": 0.872,
"step": 447
},
{
"epoch": 0.8190127970749543,
"grad_norm": 2.291642904281616,
"learning_rate": 7.500000000000004e-06,
"loss": 0.8467,
"step": 448
},
{
"epoch": 0.8208409506398537,
"grad_norm": 2.2243387699127197,
"learning_rate": 7.3630130989748e-06,
"loss": 0.9038,
"step": 449
},
{
"epoch": 0.8226691042047533,
"grad_norm": 2.283393383026123,
"learning_rate": 7.226880683736066e-06,
"loss": 0.8102,
"step": 450
},
{
"epoch": 0.8244972577696527,
"grad_norm": 2.078200101852417,
"learning_rate": 7.091617985842463e-06,
"loss": 0.761,
"step": 451
},
{
"epoch": 0.8263254113345521,
"grad_norm": 2.3057701587677,
"learning_rate": 6.9572401395418615e-06,
"loss": 0.8682,
"step": 452
},
{
"epoch": 0.8281535648994516,
"grad_norm": 2.171827793121338,
"learning_rate": 6.8237621800780255e-06,
"loss": 0.7561,
"step": 453
},
{
"epoch": 0.829981718464351,
"grad_norm": 2.3417348861694336,
"learning_rate": 6.691199042008346e-06,
"loss": 0.8277,
"step": 454
},
{
"epoch": 0.8318098720292505,
"grad_norm": 2.1309165954589844,
"learning_rate": 6.559565557532847e-06,
"loss": 0.8441,
"step": 455
},
{
"epoch": 0.8336380255941499,
"grad_norm": 2.3415029048919678,
"learning_rate": 6.428876454834643e-06,
"loss": 0.787,
"step": 456
},
{
"epoch": 0.8354661791590493,
"grad_norm": 2.2141568660736084,
"learning_rate": 6.2991463564320296e-06,
"loss": 0.8158,
"step": 457
},
{
"epoch": 0.8372943327239488,
"grad_norm": 2.0096514225006104,
"learning_rate": 6.170389777542409e-06,
"loss": 0.7489,
"step": 458
},
{
"epoch": 0.8391224862888482,
"grad_norm": 2.125929355621338,
"learning_rate": 6.0426211244582105e-06,
"loss": 0.8803,
"step": 459
},
{
"epoch": 0.8409506398537477,
"grad_norm": 2.0805740356445312,
"learning_rate": 5.915854692935002e-06,
"loss": 0.773,
"step": 460
},
{
"epoch": 0.8427787934186471,
"grad_norm": 2.357139825820923,
"learning_rate": 5.790104666591974e-06,
"loss": 0.7609,
"step": 461
},
{
"epoch": 0.8446069469835467,
"grad_norm": 2.277031898498535,
"learning_rate": 5.665385115324954e-06,
"loss": 0.8573,
"step": 462
},
{
"epoch": 0.8464351005484461,
"grad_norm": 2.2020912170410156,
"learning_rate": 5.541709993732168e-06,
"loss": 0.9261,
"step": 463
},
{
"epoch": 0.8482632541133455,
"grad_norm": 2.294649362564087,
"learning_rate": 5.419093139552878e-06,
"loss": 0.8164,
"step": 464
},
{
"epoch": 0.850091407678245,
"grad_norm": 2.047896385192871,
"learning_rate": 5.297548272119138e-06,
"loss": 0.8419,
"step": 465
},
{
"epoch": 0.8519195612431444,
"grad_norm": 2.4558777809143066,
"learning_rate": 5.177088990820725e-06,
"loss": 0.8319,
"step": 466
},
{
"epoch": 0.8537477148080439,
"grad_norm": 2.008725643157959,
"learning_rate": 5.05772877358356e-06,
"loss": 0.7503,
"step": 467
},
{
"epoch": 0.8555758683729433,
"grad_norm": 2.16011643409729,
"learning_rate": 4.939480975361687e-06,
"loss": 0.7007,
"step": 468
},
{
"epoch": 0.8574040219378428,
"grad_norm": 2.166571855545044,
"learning_rate": 4.822358826643019e-06,
"loss": 0.7383,
"step": 469
},
{
"epoch": 0.8592321755027422,
"grad_norm": 2.3428239822387695,
"learning_rate": 4.706375431968998e-06,
"loss": 0.792,
"step": 470
},
{
"epoch": 0.8610603290676416,
"grad_norm": 2.3133058547973633,
"learning_rate": 4.591543768468364e-06,
"loss": 0.7791,
"step": 471
},
{
"epoch": 0.8628884826325411,
"grad_norm": 2.227383852005005,
"learning_rate": 4.4778766844051795e-06,
"loss": 0.8838,
"step": 472
},
{
"epoch": 0.8647166361974405,
"grad_norm": 1.9852975606918335,
"learning_rate": 4.365386897741249e-06,
"loss": 0.8375,
"step": 473
},
{
"epoch": 0.8665447897623401,
"grad_norm": 2.151278018951416,
"learning_rate": 4.254086994713141e-06,
"loss": 0.7966,
"step": 474
},
{
"epoch": 0.8683729433272395,
"grad_norm": 2.355102777481079,
"learning_rate": 4.1439894284239474e-06,
"loss": 0.8264,
"step": 475
},
{
"epoch": 0.870201096892139,
"grad_norm": 2.390646457672119,
"learning_rate": 4.035106517449926e-06,
"loss": 0.8292,
"step": 476
},
{
"epoch": 0.8720292504570384,
"grad_norm": 2.1484568119049072,
"learning_rate": 3.9274504444622025e-06,
"loss": 0.8624,
"step": 477
},
{
"epoch": 0.8738574040219378,
"grad_norm": 2.134361505508423,
"learning_rate": 3.82103325486368e-06,
"loss": 0.8226,
"step": 478
},
{
"epoch": 0.8756855575868373,
"grad_norm": 2.1799209117889404,
"learning_rate": 3.715866855441309e-06,
"loss": 0.7563,
"step": 479
},
{
"epoch": 0.8775137111517367,
"grad_norm": 2.338834285736084,
"learning_rate": 3.6119630130338537e-06,
"loss": 0.8319,
"step": 480
},
{
"epoch": 0.8793418647166362,
"grad_norm": 2.032010555267334,
"learning_rate": 3.5093333532153316e-06,
"loss": 0.7693,
"step": 481
},
{
"epoch": 0.8811700182815356,
"grad_norm": 2.1978771686553955,
"learning_rate": 3.4079893589942544e-06,
"loss": 0.7642,
"step": 482
},
{
"epoch": 0.8829981718464351,
"grad_norm": 2.5220754146575928,
"learning_rate": 3.3079423695288204e-06,
"loss": 0.9182,
"step": 483
},
{
"epoch": 0.8848263254113345,
"grad_norm": 2.1148622035980225,
"learning_rate": 3.2092035788581907e-06,
"loss": 0.8411,
"step": 484
},
{
"epoch": 0.886654478976234,
"grad_norm": 2.1336936950683594,
"learning_rate": 3.1117840346500287e-06,
"loss": 0.7711,
"step": 485
},
{
"epoch": 0.8884826325411335,
"grad_norm": 2.175741672515869,
"learning_rate": 3.0156946369643803e-06,
"loss": 0.9526,
"step": 486
},
{
"epoch": 0.8903107861060329,
"grad_norm": 2.207550525665283,
"learning_rate": 2.9209461370341204e-06,
"loss": 0.7538,
"step": 487
},
{
"epoch": 0.8921389396709324,
"grad_norm": 2.0048232078552246,
"learning_rate": 2.8275491360619875e-06,
"loss": 0.8079,
"step": 488
},
{
"epoch": 0.8939670932358318,
"grad_norm": 2.2302756309509277,
"learning_rate": 2.735514084034457e-06,
"loss": 0.8385,
"step": 489
},
{
"epoch": 0.8957952468007313,
"grad_norm": 2.7533788681030273,
"learning_rate": 2.64485127855251e-06,
"loss": 0.7718,
"step": 490
},
{
"epoch": 0.8976234003656307,
"grad_norm": 2.3614344596862793,
"learning_rate": 2.5555708636794594e-06,
"loss": 0.7767,
"step": 491
},
{
"epoch": 0.8994515539305301,
"grad_norm": 2.726402521133423,
"learning_rate": 2.467682828805956e-06,
"loss": 0.7917,
"step": 492
},
{
"epoch": 0.9012797074954296,
"grad_norm": 2.2285687923431396,
"learning_rate": 2.38119700753228e-06,
"loss": 0.8958,
"step": 493
},
{
"epoch": 0.903107861060329,
"grad_norm": 2.1934146881103516,
"learning_rate": 2.2961230765681158e-06,
"loss": 0.7796,
"step": 494
},
{
"epoch": 0.9049360146252285,
"grad_norm": 2.349043607711792,
"learning_rate": 2.212470554649805e-06,
"loss": 0.8538,
"step": 495
},
{
"epoch": 0.906764168190128,
"grad_norm": 1.995997667312622,
"learning_rate": 2.130248801475344e-06,
"loss": 0.8433,
"step": 496
},
{
"epoch": 0.9085923217550275,
"grad_norm": 2.1767685413360596,
"learning_rate": 2.0494670166571356e-06,
"loss": 0.8276,
"step": 497
},
{
"epoch": 0.9104204753199269,
"grad_norm": 2.255619525909424,
"learning_rate": 1.9701342386926712e-06,
"loss": 0.7797,
"step": 498
},
{
"epoch": 0.9122486288848263,
"grad_norm": 2.3576643466949463,
"learning_rate": 1.892259343953226e-06,
"loss": 0.9015,
"step": 499
},
{
"epoch": 0.9140767824497258,
"grad_norm": 1.9980827569961548,
"learning_rate": 1.815851045690708e-06,
"loss": 0.6846,
"step": 500
},
{
"epoch": 0.9140767824497258,
"eval_loss": 0.798653244972229,
"eval_runtime": 11.4055,
"eval_samples_per_second": 97.497,
"eval_steps_per_second": 3.069,
"step": 500
},
{
"epoch": 0.9159049360146252,
"grad_norm": 2.24575138092041,
"learning_rate": 1.7409178930627473e-06,
"loss": 0.8362,
"step": 501
},
{
"epoch": 0.9177330895795247,
"grad_norm": 2.058715343475342,
"learning_rate": 1.6674682701761496e-06,
"loss": 0.8225,
"step": 502
},
{
"epoch": 0.9195612431444241,
"grad_norm": 2.0738391876220703,
"learning_rate": 1.5955103951488177e-06,
"loss": 0.7747,
"step": 503
},
{
"epoch": 0.9213893967093236,
"grad_norm": 2.142606735229492,
"learning_rate": 1.5250523191902455e-06,
"loss": 0.8331,
"step": 504
},
{
"epoch": 0.923217550274223,
"grad_norm": 2.2022759914398193,
"learning_rate": 1.456101925700684e-06,
"loss": 0.8037,
"step": 505
},
{
"epoch": 0.9250457038391224,
"grad_norm": 2.1481759548187256,
"learning_rate": 1.3886669293890837e-06,
"loss": 0.7431,
"step": 506
},
{
"epoch": 0.926873857404022,
"grad_norm": 2.3185274600982666,
"learning_rate": 1.322754875409915e-06,
"loss": 0.7726,
"step": 507
},
{
"epoch": 0.9287020109689214,
"grad_norm": 2.315138816833496,
"learning_rate": 1.2583731385189562e-06,
"loss": 0.7026,
"step": 508
},
{
"epoch": 0.9305301645338209,
"grad_norm": 2.050353527069092,
"learning_rate": 1.1955289222481513e-06,
"loss": 0.7373,
"step": 509
},
{
"epoch": 0.9323583180987203,
"grad_norm": 2.3529744148254395,
"learning_rate": 1.1342292580996195e-06,
"loss": 0.8461,
"step": 510
},
{
"epoch": 0.9341864716636198,
"grad_norm": 2.264411687850952,
"learning_rate": 1.0744810047589116e-06,
"loss": 1.05,
"step": 511
},
{
"epoch": 0.9360146252285192,
"grad_norm": 2.2528390884399414,
"learning_rate": 1.0162908473276133e-06,
"loss": 0.8218,
"step": 512
},
{
"epoch": 0.9378427787934186,
"grad_norm": 2.23812198638916,
"learning_rate": 9.596652965753632e-07,
"loss": 0.8533,
"step": 513
},
{
"epoch": 0.9396709323583181,
"grad_norm": 2.4503235816955566,
"learning_rate": 9.046106882113753e-07,
"loss": 0.8821,
"step": 514
},
{
"epoch": 0.9414990859232175,
"grad_norm": 2.152954578399658,
"learning_rate": 8.511331821755459e-07,
"loss": 0.7932,
"step": 515
},
{
"epoch": 0.943327239488117,
"grad_norm": 2.1594455242156982,
"learning_rate": 7.992387619492436e-07,
"loss": 0.7988,
"step": 516
},
{
"epoch": 0.9451553930530164,
"grad_norm": 2.086651086807251,
"learning_rate": 7.489332338858202e-07,
"loss": 0.8552,
"step": 517
},
{
"epoch": 0.946983546617916,
"grad_norm": 2.134727954864502,
"learning_rate": 7.002222265609476e-07,
"loss": 0.8825,
"step": 518
},
{
"epoch": 0.9488117001828154,
"grad_norm": 2.169853448867798,
"learning_rate": 6.53111190142861e-07,
"loss": 0.8105,
"step": 519
},
{
"epoch": 0.9506398537477148,
"grad_norm": 2.000743865966797,
"learning_rate": 6.076053957825411e-07,
"loss": 0.6882,
"step": 520
},
{
"epoch": 0.9524680073126143,
"grad_norm": 2.1314992904663086,
"learning_rate": 5.637099350239427e-07,
"loss": 0.7354,
"step": 521
},
{
"epoch": 0.9542961608775137,
"grad_norm": 2.3546230792999268,
"learning_rate": 5.214297192343104e-07,
"loss": 0.8793,
"step": 522
},
{
"epoch": 0.9561243144424132,
"grad_norm": 2.054684638977051,
"learning_rate": 4.807694790546563e-07,
"loss": 0.8644,
"step": 523
},
{
"epoch": 0.9579524680073126,
"grad_norm": 2.0605905055999756,
"learning_rate": 4.417337638704588e-07,
"loss": 0.675,
"step": 524
},
{
"epoch": 0.9597806215722121,
"grad_norm": 2.196253776550293,
"learning_rate": 4.043269413026429e-07,
"loss": 0.8171,
"step": 525
},
{
"epoch": 0.9616087751371115,
"grad_norm": 2.239720582962036,
"learning_rate": 3.6855319671889433e-07,
"loss": 0.7863,
"step": 526
},
{
"epoch": 0.9634369287020109,
"grad_norm": 2.3303980827331543,
"learning_rate": 3.3441653276537253e-07,
"loss": 0.7169,
"step": 527
},
{
"epoch": 0.9652650822669104,
"grad_norm": 2.10151743888855,
"learning_rate": 3.0192076891885745e-07,
"loss": 0.8925,
"step": 528
},
{
"epoch": 0.9670932358318098,
"grad_norm": 2.475900411605835,
"learning_rate": 2.710695410593994e-07,
"loss": 0.8043,
"step": 529
},
{
"epoch": 0.9689213893967094,
"grad_norm": 2.0351574420928955,
"learning_rate": 2.418663010635114e-07,
"loss": 0.6677,
"step": 530
},
{
"epoch": 0.9707495429616088,
"grad_norm": 2.2573163509368896,
"learning_rate": 2.1431431641794287e-07,
"loss": 0.8685,
"step": 531
},
{
"epoch": 0.9725776965265083,
"grad_norm": 2.2806551456451416,
"learning_rate": 1.8841666985408566e-07,
"loss": 1.0264,
"step": 532
},
{
"epoch": 0.9744058500914077,
"grad_norm": 2.0971100330352783,
"learning_rate": 1.6417625900305656e-07,
"loss": 0.663,
"step": 533
},
{
"epoch": 0.9762340036563071,
"grad_norm": 2.1479334831237793,
"learning_rate": 1.4159579607148976e-07,
"loss": 0.7461,
"step": 534
},
{
"epoch": 0.9780621572212066,
"grad_norm": 2.1846601963043213,
"learning_rate": 1.206778075380699e-07,
"loss": 0.7843,
"step": 535
},
{
"epoch": 0.979890310786106,
"grad_norm": 2.18355131149292,
"learning_rate": 1.0142463387085465e-07,
"loss": 0.8233,
"step": 536
},
{
"epoch": 0.9817184643510055,
"grad_norm": 1.9972505569458008,
"learning_rate": 8.38384292653993e-08,
"loss": 0.6412,
"step": 537
},
{
"epoch": 0.9835466179159049,
"grad_norm": 2.2325432300567627,
"learning_rate": 6.792116140373117e-08,
"loss": 0.7315,
"step": 538
},
{
"epoch": 0.9853747714808044,
"grad_norm": 2.270096778869629,
"learning_rate": 5.367461123419071e-08,
"loss": 0.7166,
"step": 539
},
{
"epoch": 0.9872029250457038,
"grad_norm": 2.084451675415039,
"learning_rate": 4.110037277216427e-08,
"loss": 0.7703,
"step": 540
},
{
"epoch": 0.9890310786106032,
"grad_norm": 2.1931207180023193,
"learning_rate": 3.0199852921735104e-08,
"loss": 0.9388,
"step": 541
},
{
"epoch": 0.9908592321755028,
"grad_norm": 2.08048939704895,
"learning_rate": 2.0974271318260907e-08,
"loss": 0.669,
"step": 542
},
{
"epoch": 0.9926873857404022,
"grad_norm": 2.402120351791382,
"learning_rate": 1.342466019192301e-08,
"loss": 0.8257,
"step": 543
},
{
"epoch": 0.9945155393053017,
"grad_norm": 2.3177034854888916,
"learning_rate": 7.551864252223762e-09,
"loss": 0.8117,
"step": 544
},
{
"epoch": 0.9963436928702011,
"grad_norm": 2.3477089405059814,
"learning_rate": 3.3565405934721237e-09,
"loss": 0.8285,
"step": 545
},
{
"epoch": 0.9981718464351006,
"grad_norm": 2.4188199043273926,
"learning_rate": 8.391586212741498e-10,
"loss": 0.8643,
"step": 546
},
{
"epoch": 1.0,
"grad_norm": 2.4587948322296143,
"learning_rate": 0.0,
"loss": 0.8511,
"step": 547
}
],
"logging_steps": 1,
"max_steps": 547,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.615833264128e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}