{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9836065573770494,
"eval_steps": 500,
"global_step": 426,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00702576112412178,
"grad_norm": 0.14412136375904083,
"learning_rate": 2.3255813953488374e-07,
"loss": 0.7429,
"step": 1
},
{
"epoch": 0.01405152224824356,
"grad_norm": 0.12180829793214798,
"learning_rate": 4.651162790697675e-07,
"loss": 0.6834,
"step": 2
},
{
"epoch": 0.02107728337236534,
"grad_norm": 0.13547690212726593,
"learning_rate": 6.976744186046513e-07,
"loss": 0.7261,
"step": 3
},
{
"epoch": 0.02810304449648712,
"grad_norm": 0.15328572690486908,
"learning_rate": 9.30232558139535e-07,
"loss": 0.7679,
"step": 4
},
{
"epoch": 0.0351288056206089,
"grad_norm": 0.11058026552200317,
"learning_rate": 1.1627906976744188e-06,
"loss": 0.642,
"step": 5
},
{
"epoch": 0.04215456674473068,
"grad_norm": 0.1449006348848343,
"learning_rate": 1.3953488372093025e-06,
"loss": 0.7518,
"step": 6
},
{
"epoch": 0.04918032786885246,
"grad_norm": 0.14010243117809296,
"learning_rate": 1.6279069767441862e-06,
"loss": 0.734,
"step": 7
},
{
"epoch": 0.05620608899297424,
"grad_norm": 0.11249666661024094,
"learning_rate": 1.86046511627907e-06,
"loss": 0.6573,
"step": 8
},
{
"epoch": 0.06323185011709602,
"grad_norm": 0.12850059568881989,
"learning_rate": 2.0930232558139536e-06,
"loss": 0.6856,
"step": 9
},
{
"epoch": 0.0702576112412178,
"grad_norm": 0.09605734795331955,
"learning_rate": 2.3255813953488376e-06,
"loss": 0.6657,
"step": 10
},
{
"epoch": 0.07728337236533958,
"grad_norm": 0.09640911221504211,
"learning_rate": 2.558139534883721e-06,
"loss": 0.7243,
"step": 11
},
{
"epoch": 0.08430913348946135,
"grad_norm": 0.09078703820705414,
"learning_rate": 2.790697674418605e-06,
"loss": 0.7013,
"step": 12
},
{
"epoch": 0.09133489461358314,
"grad_norm": 0.09065765142440796,
"learning_rate": 3.0232558139534885e-06,
"loss": 0.6744,
"step": 13
},
{
"epoch": 0.09836065573770492,
"grad_norm": 0.06950970739126205,
"learning_rate": 3.2558139534883724e-06,
"loss": 0.6745,
"step": 14
},
{
"epoch": 0.1053864168618267,
"grad_norm": 0.07783018797636032,
"learning_rate": 3.4883720930232564e-06,
"loss": 0.6294,
"step": 15
},
{
"epoch": 0.11241217798594848,
"grad_norm": 0.0778842344880104,
"learning_rate": 3.72093023255814e-06,
"loss": 0.609,
"step": 16
},
{
"epoch": 0.11943793911007025,
"grad_norm": 0.07574747502803802,
"learning_rate": 3.953488372093024e-06,
"loss": 0.5765,
"step": 17
},
{
"epoch": 0.12646370023419204,
"grad_norm": 0.07703674584627151,
"learning_rate": 4.186046511627907e-06,
"loss": 0.6541,
"step": 18
},
{
"epoch": 0.13348946135831383,
"grad_norm": 0.06555178016424179,
"learning_rate": 4.418604651162791e-06,
"loss": 0.5956,
"step": 19
},
{
"epoch": 0.1405152224824356,
"grad_norm": 0.08275623619556427,
"learning_rate": 4.651162790697675e-06,
"loss": 0.7029,
"step": 20
},
{
"epoch": 0.14754098360655737,
"grad_norm": 0.0675705149769783,
"learning_rate": 4.883720930232559e-06,
"loss": 0.5811,
"step": 21
},
{
"epoch": 0.15456674473067916,
"grad_norm": 0.07332336902618408,
"learning_rate": 5.116279069767442e-06,
"loss": 0.6043,
"step": 22
},
{
"epoch": 0.16159250585480095,
"grad_norm": 0.0802861675620079,
"learning_rate": 5.348837209302326e-06,
"loss": 0.6869,
"step": 23
},
{
"epoch": 0.1686182669789227,
"grad_norm": 0.06729163229465485,
"learning_rate": 5.58139534883721e-06,
"loss": 0.576,
"step": 24
},
{
"epoch": 0.1756440281030445,
"grad_norm": 0.06569469720125198,
"learning_rate": 5.8139534883720935e-06,
"loss": 0.6487,
"step": 25
},
{
"epoch": 0.18266978922716628,
"grad_norm": 0.06604806333780289,
"learning_rate": 6.046511627906977e-06,
"loss": 0.585,
"step": 26
},
{
"epoch": 0.18969555035128804,
"grad_norm": 0.06028835102915764,
"learning_rate": 6.279069767441861e-06,
"loss": 0.606,
"step": 27
},
{
"epoch": 0.19672131147540983,
"grad_norm": 0.05667126551270485,
"learning_rate": 6.511627906976745e-06,
"loss": 0.5471,
"step": 28
},
{
"epoch": 0.20374707259953162,
"grad_norm": 0.06196806579828262,
"learning_rate": 6.744186046511628e-06,
"loss": 0.5737,
"step": 29
},
{
"epoch": 0.2107728337236534,
"grad_norm": 0.058749496936798096,
"learning_rate": 6.976744186046513e-06,
"loss": 0.5741,
"step": 30
},
{
"epoch": 0.21779859484777517,
"grad_norm": 0.059330519288778305,
"learning_rate": 7.209302325581395e-06,
"loss": 0.5934,
"step": 31
},
{
"epoch": 0.22482435597189696,
"grad_norm": 0.0617949403822422,
"learning_rate": 7.44186046511628e-06,
"loss": 0.5195,
"step": 32
},
{
"epoch": 0.23185011709601874,
"grad_norm": 0.0625167116522789,
"learning_rate": 7.674418604651164e-06,
"loss": 0.5721,
"step": 33
},
{
"epoch": 0.2388758782201405,
"grad_norm": 0.0701775774359703,
"learning_rate": 7.906976744186048e-06,
"loss": 0.6073,
"step": 34
},
{
"epoch": 0.2459016393442623,
"grad_norm": 0.06266116350889206,
"learning_rate": 8.139534883720931e-06,
"loss": 0.556,
"step": 35
},
{
"epoch": 0.2529274004683841,
"grad_norm": 0.06269484013319016,
"learning_rate": 8.372093023255815e-06,
"loss": 0.5486,
"step": 36
},
{
"epoch": 0.25995316159250587,
"grad_norm": 0.05984916910529137,
"learning_rate": 8.604651162790698e-06,
"loss": 0.5585,
"step": 37
},
{
"epoch": 0.26697892271662765,
"grad_norm": 0.0559409074485302,
"learning_rate": 8.837209302325582e-06,
"loss": 0.4833,
"step": 38
},
{
"epoch": 0.27400468384074944,
"grad_norm": 0.056493211537599564,
"learning_rate": 9.069767441860465e-06,
"loss": 0.4957,
"step": 39
},
{
"epoch": 0.2810304449648712,
"grad_norm": 0.0535312257707119,
"learning_rate": 9.30232558139535e-06,
"loss": 0.5069,
"step": 40
},
{
"epoch": 0.28805620608899296,
"grad_norm": 0.05835776776075363,
"learning_rate": 9.534883720930234e-06,
"loss": 0.5302,
"step": 41
},
{
"epoch": 0.29508196721311475,
"grad_norm": 0.06827262043952942,
"learning_rate": 9.767441860465117e-06,
"loss": 0.5361,
"step": 42
},
{
"epoch": 0.30210772833723654,
"grad_norm": 0.06111191585659981,
"learning_rate": 1e-05,
"loss": 0.5522,
"step": 43
},
{
"epoch": 0.3091334894613583,
"grad_norm": 0.05481801554560661,
"learning_rate": 9.99983179466314e-06,
"loss": 0.4842,
"step": 44
},
{
"epoch": 0.3161592505854801,
"grad_norm": 0.06906605511903763,
"learning_rate": 9.999327189969768e-06,
"loss": 0.4833,
"step": 45
},
{
"epoch": 0.3231850117096019,
"grad_norm": 0.05364421010017395,
"learning_rate": 9.998486219870769e-06,
"loss": 0.5137,
"step": 46
},
{
"epoch": 0.33021077283372363,
"grad_norm": 0.06280182301998138,
"learning_rate": 9.997308940948405e-06,
"loss": 0.5609,
"step": 47
},
{
"epoch": 0.3372365339578454,
"grad_norm": 0.0569726936519146,
"learning_rate": 9.995795432412513e-06,
"loss": 0.5416,
"step": 48
},
{
"epoch": 0.3442622950819672,
"grad_norm": 0.06173605099320412,
"learning_rate": 9.993945796095183e-06,
"loss": 0.5162,
"step": 49
},
{
"epoch": 0.351288056206089,
"grad_norm": 0.05827682837843895,
"learning_rate": 9.991760156443892e-06,
"loss": 0.4773,
"step": 50
},
{
"epoch": 0.3583138173302108,
"grad_norm": 0.06012466922402382,
"learning_rate": 9.989238660513141e-06,
"loss": 0.5915,
"step": 51
},
{
"epoch": 0.36533957845433257,
"grad_norm": 0.05899444967508316,
"learning_rate": 9.98638147795456e-06,
"loss": 0.5253,
"step": 52
},
{
"epoch": 0.37236533957845436,
"grad_norm": 0.05364922806620598,
"learning_rate": 9.983188801005492e-06,
"loss": 0.486,
"step": 53
},
{
"epoch": 0.3793911007025761,
"grad_norm": 0.05417551472783089,
"learning_rate": 9.979660844476056e-06,
"loss": 0.5291,
"step": 54
},
{
"epoch": 0.3864168618266979,
"grad_norm": 0.062476933002471924,
"learning_rate": 9.975797845734699e-06,
"loss": 0.548,
"step": 55
},
{
"epoch": 0.39344262295081966,
"grad_norm": 0.06160496175289154,
"learning_rate": 9.971600064692222e-06,
"loss": 0.4919,
"step": 56
},
{
"epoch": 0.40046838407494145,
"grad_norm": 0.054618533700704575,
"learning_rate": 9.967067783784297e-06,
"loss": 0.5071,
"step": 57
},
{
"epoch": 0.40749414519906324,
"grad_norm": 0.059714607894420624,
"learning_rate": 9.962201307952455e-06,
"loss": 0.5347,
"step": 58
},
{
"epoch": 0.41451990632318503,
"grad_norm": 0.06326267123222351,
"learning_rate": 9.957000964623585e-06,
"loss": 0.5288,
"step": 59
},
{
"epoch": 0.4215456674473068,
"grad_norm": 0.05612269043922424,
"learning_rate": 9.951467103687879e-06,
"loss": 0.4878,
"step": 60
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.04823453351855278,
"learning_rate": 9.945600097475322e-06,
"loss": 0.4699,
"step": 61
},
{
"epoch": 0.43559718969555034,
"grad_norm": 0.057434167712926865,
"learning_rate": 9.939400340730611e-06,
"loss": 0.5389,
"step": 62
},
{
"epoch": 0.4426229508196721,
"grad_norm": 0.052870023995637894,
"learning_rate": 9.932868250586619e-06,
"loss": 0.4969,
"step": 63
},
{
"epoch": 0.4496487119437939,
"grad_norm": 0.050160668790340424,
"learning_rate": 9.926004266536314e-06,
"loss": 0.4506,
"step": 64
},
{
"epoch": 0.4566744730679157,
"grad_norm": 0.054149653762578964,
"learning_rate": 9.918808850403192e-06,
"loss": 0.5076,
"step": 65
},
{
"epoch": 0.4637002341920375,
"grad_norm": 0.056766077876091,
"learning_rate": 9.911282486310214e-06,
"loss": 0.4922,
"step": 66
},
{
"epoch": 0.4707259953161593,
"grad_norm": 0.06275495141744614,
"learning_rate": 9.903425680647225e-06,
"loss": 0.5696,
"step": 67
},
{
"epoch": 0.477751756440281,
"grad_norm": 0.05460723116993904,
"learning_rate": 9.895238962036878e-06,
"loss": 0.441,
"step": 68
},
{
"epoch": 0.4847775175644028,
"grad_norm": 0.06439048796892166,
"learning_rate": 9.88672288129908e-06,
"loss": 0.5162,
"step": 69
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.0556609220802784,
"learning_rate": 9.877878011413924e-06,
"loss": 0.4695,
"step": 70
},
{
"epoch": 0.49882903981264637,
"grad_norm": 0.0601162388920784,
"learning_rate": 9.868704947483134e-06,
"loss": 0.526,
"step": 71
},
{
"epoch": 0.5058548009367682,
"grad_norm": 0.05364784598350525,
"learning_rate": 9.859204306690038e-06,
"loss": 0.4963,
"step": 72
},
{
"epoch": 0.5128805620608899,
"grad_norm": 0.052927836775779724,
"learning_rate": 9.849376728258024e-06,
"loss": 0.514,
"step": 73
},
{
"epoch": 0.5199063231850117,
"grad_norm": 0.052183471620082855,
"learning_rate": 9.839222873407553e-06,
"loss": 0.484,
"step": 74
},
{
"epoch": 0.5269320843091335,
"grad_norm": 0.056657999753952026,
"learning_rate": 9.828743425311654e-06,
"loss": 0.4871,
"step": 75
},
{
"epoch": 0.5339578454332553,
"grad_norm": 0.056676190346479416,
"learning_rate": 9.817939089049964e-06,
"loss": 0.4665,
"step": 76
},
{
"epoch": 0.5409836065573771,
"grad_norm": 0.05315388739109039,
"learning_rate": 9.806810591561295e-06,
"loss": 0.4489,
"step": 77
},
{
"epoch": 0.5480093676814989,
"grad_norm": 0.05231834575533867,
"learning_rate": 9.795358681594712e-06,
"loss": 0.4952,
"step": 78
},
{
"epoch": 0.5550351288056206,
"grad_norm": 0.06096820533275604,
"learning_rate": 9.783584129659162e-06,
"loss": 0.5192,
"step": 79
},
{
"epoch": 0.5620608899297423,
"grad_norm": 0.07792366296052933,
"learning_rate": 9.771487727971642e-06,
"loss": 0.5,
"step": 80
},
{
"epoch": 0.5690866510538641,
"grad_norm": 0.05434305965900421,
"learning_rate": 9.759070290403873e-06,
"loss": 0.4835,
"step": 81
},
{
"epoch": 0.5761124121779859,
"grad_norm": 0.049161121249198914,
"learning_rate": 9.746332652427566e-06,
"loss": 0.4817,
"step": 82
},
{
"epoch": 0.5831381733021077,
"grad_norm": 0.06948748230934143,
"learning_rate": 9.733275671058195e-06,
"loss": 0.5137,
"step": 83
},
{
"epoch": 0.5901639344262295,
"grad_norm": 0.05787486582994461,
"learning_rate": 9.71990022479734e-06,
"loss": 0.4925,
"step": 84
},
{
"epoch": 0.5971896955503513,
"grad_norm": 0.0546451136469841,
"learning_rate": 9.70620721357358e-06,
"loss": 0.4758,
"step": 85
},
{
"epoch": 0.6042154566744731,
"grad_norm": 0.05614368990063667,
"learning_rate": 9.69219755868194e-06,
"loss": 0.4793,
"step": 86
},
{
"epoch": 0.6112412177985949,
"grad_norm": 0.055385395884513855,
"learning_rate": 9.677872202721906e-06,
"loss": 0.512,
"step": 87
},
{
"epoch": 0.6182669789227166,
"grad_norm": 0.05747217312455177,
"learning_rate": 9.663232109534011e-06,
"loss": 0.5597,
"step": 88
},
{
"epoch": 0.6252927400468384,
"grad_norm": 0.05803421884775162,
"learning_rate": 9.648278264134977e-06,
"loss": 0.4618,
"step": 89
},
{
"epoch": 0.6323185011709602,
"grad_norm": 0.05690561234951019,
"learning_rate": 9.633011672651443e-06,
"loss": 0.4276,
"step": 90
},
{
"epoch": 0.639344262295082,
"grad_norm": 0.05551106855273247,
"learning_rate": 9.617433362252277e-06,
"loss": 0.5031,
"step": 91
},
{
"epoch": 0.6463700234192038,
"grad_norm": 0.04912833124399185,
"learning_rate": 9.601544381079457e-06,
"loss": 0.4666,
"step": 92
},
{
"epoch": 0.6533957845433255,
"grad_norm": 0.0519762858748436,
"learning_rate": 9.585345798177557e-06,
"loss": 0.522,
"step": 93
},
{
"epoch": 0.6604215456674473,
"grad_norm": 0.06260818988084793,
"learning_rate": 9.56883870342181e-06,
"loss": 0.4578,
"step": 94
},
{
"epoch": 0.667447306791569,
"grad_norm": 0.06875967979431152,
"learning_rate": 9.552024207444794e-06,
"loss": 0.4448,
"step": 95
},
{
"epoch": 0.6744730679156908,
"grad_norm": 0.058077067136764526,
"learning_rate": 9.534903441561693e-06,
"loss": 0.5177,
"step": 96
},
{
"epoch": 0.6814988290398126,
"grad_norm": 0.06001827120780945,
"learning_rate": 9.517477557694182e-06,
"loss": 0.5171,
"step": 97
},
{
"epoch": 0.6885245901639344,
"grad_norm": 0.05816899985074997,
"learning_rate": 9.499747728292928e-06,
"loss": 0.5271,
"step": 98
},
{
"epoch": 0.6955503512880562,
"grad_norm": 0.054729413241147995,
"learning_rate": 9.481715146258699e-06,
"loss": 0.446,
"step": 99
},
{
"epoch": 0.702576112412178,
"grad_norm": 0.055416759103536606,
"learning_rate": 9.463381024862116e-06,
"loss": 0.5345,
"step": 100
},
{
"epoch": 0.7096018735362998,
"grad_norm": 0.06506048887968063,
"learning_rate": 9.444746597662e-06,
"loss": 0.5212,
"step": 101
},
{
"epoch": 0.7166276346604216,
"grad_norm": 0.052193962037563324,
"learning_rate": 9.425813118422393e-06,
"loss": 0.4809,
"step": 102
},
{
"epoch": 0.7236533957845434,
"grad_norm": 0.056404754519462585,
"learning_rate": 9.406581861028199e-06,
"loss": 0.5527,
"step": 103
},
{
"epoch": 0.7306791569086651,
"grad_norm": 0.05873854085803032,
"learning_rate": 9.387054119399466e-06,
"loss": 0.4389,
"step": 104
},
{
"epoch": 0.7377049180327869,
"grad_norm": 0.05391126498579979,
"learning_rate": 9.36723120740434e-06,
"loss": 0.4684,
"step": 105
},
{
"epoch": 0.7447306791569087,
"grad_norm": 0.06326638162136078,
"learning_rate": 9.347114458770656e-06,
"loss": 0.4912,
"step": 106
},
{
"epoch": 0.7517564402810304,
"grad_norm": 0.05993535369634628,
"learning_rate": 9.326705226996207e-06,
"loss": 0.4747,
"step": 107
},
{
"epoch": 0.7587822014051522,
"grad_norm": 0.06042395904660225,
"learning_rate": 9.306004885257675e-06,
"loss": 0.477,
"step": 108
},
{
"epoch": 0.765807962529274,
"grad_norm": 0.059888120740652084,
"learning_rate": 9.28501482631824e-06,
"loss": 0.4948,
"step": 109
},
{
"epoch": 0.7728337236533958,
"grad_norm": 0.05696633458137512,
"learning_rate": 9.26373646243388e-06,
"loss": 0.5053,
"step": 110
},
{
"epoch": 0.7798594847775175,
"grad_norm": 0.05638626217842102,
"learning_rate": 9.242171225258336e-06,
"loss": 0.4918,
"step": 111
},
{
"epoch": 0.7868852459016393,
"grad_norm": 0.05654750391840935,
"learning_rate": 9.220320565746806e-06,
"loss": 0.4604,
"step": 112
},
{
"epoch": 0.7939110070257611,
"grad_norm": 0.04910074546933174,
"learning_rate": 9.198185954058305e-06,
"loss": 0.486,
"step": 113
},
{
"epoch": 0.8009367681498829,
"grad_norm": 0.05476020276546478,
"learning_rate": 9.175768879456759e-06,
"loss": 0.4701,
"step": 114
},
{
"epoch": 0.8079625292740047,
"grad_norm": 0.05878998339176178,
"learning_rate": 9.153070850210803e-06,
"loss": 0.4583,
"step": 115
},
{
"epoch": 0.8149882903981265,
"grad_norm": 0.050011828541755676,
"learning_rate": 9.130093393492302e-06,
"loss": 0.441,
"step": 116
},
{
"epoch": 0.8220140515222483,
"grad_norm": 0.0516488291323185,
"learning_rate": 9.106838055273589e-06,
"loss": 0.4663,
"step": 117
},
{
"epoch": 0.8290398126463701,
"grad_norm": 0.058606114238500595,
"learning_rate": 9.083306400223465e-06,
"loss": 0.5017,
"step": 118
},
{
"epoch": 0.8360655737704918,
"grad_norm": 0.05311114713549614,
"learning_rate": 9.059500011601919e-06,
"loss": 0.4615,
"step": 119
},
{
"epoch": 0.8430913348946136,
"grad_norm": 0.05270574986934662,
"learning_rate": 9.035420491153596e-06,
"loss": 0.4469,
"step": 120
},
{
"epoch": 0.8501170960187353,
"grad_norm": 0.05161169916391373,
"learning_rate": 9.011069459000035e-06,
"loss": 0.4882,
"step": 121
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.0560365691781044,
"learning_rate": 8.986448553530665e-06,
"loss": 0.4454,
"step": 122
},
{
"epoch": 0.8641686182669789,
"grad_norm": 0.050510723143815994,
"learning_rate": 8.961559431292562e-06,
"loss": 0.4535,
"step": 123
},
{
"epoch": 0.8711943793911007,
"grad_norm": 0.05732515826821327,
"learning_rate": 8.936403766879003e-06,
"loss": 0.4369,
"step": 124
},
{
"epoch": 0.8782201405152225,
"grad_norm": 0.05295110121369362,
"learning_rate": 8.910983252816794e-06,
"loss": 0.4286,
"step": 125
},
{
"epoch": 0.8852459016393442,
"grad_norm": 0.050234604626894,
"learning_rate": 8.885299599452381e-06,
"loss": 0.4578,
"step": 126
},
{
"epoch": 0.892271662763466,
"grad_norm": 0.06070755422115326,
"learning_rate": 8.859354534836797e-06,
"loss": 0.4594,
"step": 127
},
{
"epoch": 0.8992974238875878,
"grad_norm": 0.05537045747041702,
"learning_rate": 8.833149804609372e-06,
"loss": 0.425,
"step": 128
},
{
"epoch": 0.9063231850117096,
"grad_norm": 0.051650241017341614,
"learning_rate": 8.806687171880298e-06,
"loss": 0.4714,
"step": 129
},
{
"epoch": 0.9133489461358314,
"grad_norm": 0.051121823489665985,
"learning_rate": 8.779968417111991e-06,
"loss": 0.4549,
"step": 130
},
{
"epoch": 0.9203747072599532,
"grad_norm": 0.056628625839948654,
"learning_rate": 8.752995337999316e-06,
"loss": 0.5337,
"step": 131
},
{
"epoch": 0.927400468384075,
"grad_norm": 0.05635831505060196,
"learning_rate": 8.725769749348612e-06,
"loss": 0.4747,
"step": 132
},
{
"epoch": 0.9344262295081968,
"grad_norm": 0.055249571800231934,
"learning_rate": 8.698293482955605e-06,
"loss": 0.4773,
"step": 133
},
{
"epoch": 0.9414519906323185,
"grad_norm": 0.056761760264635086,
"learning_rate": 8.670568387482153e-06,
"loss": 0.4751,
"step": 134
},
{
"epoch": 0.9484777517564403,
"grad_norm": 0.06461669504642487,
"learning_rate": 8.642596328331864e-06,
"loss": 0.4715,
"step": 135
},
{
"epoch": 0.955503512880562,
"grad_norm": 0.054414063692092896,
"learning_rate": 8.614379187524593e-06,
"loss": 0.4489,
"step": 136
},
{
"epoch": 0.9625292740046838,
"grad_norm": 0.053149014711380005,
"learning_rate": 8.585918863569806e-06,
"loss": 0.4493,
"step": 137
},
{
"epoch": 0.9695550351288056,
"grad_norm": 0.052316196262836456,
"learning_rate": 8.55721727133886e-06,
"loss": 0.4691,
"step": 138
},
{
"epoch": 0.9765807962529274,
"grad_norm": 0.052749183028936386,
"learning_rate": 8.528276341936146e-06,
"loss": 0.4877,
"step": 139
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.05662725865840912,
"learning_rate": 8.499098022569177e-06,
"loss": 0.4579,
"step": 140
},
{
"epoch": 0.990632318501171,
"grad_norm": 0.05714595317840576,
"learning_rate": 8.469684276417568e-06,
"loss": 0.5069,
"step": 141
},
{
"epoch": 0.9976580796252927,
"grad_norm": 0.05158586427569389,
"learning_rate": 8.440037082500953e-06,
"loss": 0.4544,
"step": 142
},
{
"epoch": 1.0,
"grad_norm": 0.05158586427569389,
"learning_rate": 8.410158435545825e-06,
"loss": 0.5067,
"step": 143
},
{
"epoch": 1.0070257611241218,
"grad_norm": 0.10291286557912827,
"learning_rate": 8.380050345851338e-06,
"loss": 0.3535,
"step": 144
},
{
"epoch": 1.0140515222482436,
"grad_norm": 0.047048892825841904,
"learning_rate": 8.349714839154035e-06,
"loss": 0.3635,
"step": 145
},
{
"epoch": 1.0210772833723654,
"grad_norm": 0.052344005554914474,
"learning_rate": 8.319153956491567e-06,
"loss": 0.3643,
"step": 146
},
{
"epoch": 1.0281030444964872,
"grad_norm": 0.050871554762125015,
"learning_rate": 8.288369754065362e-06,
"loss": 0.3487,
"step": 147
},
{
"epoch": 1.035128805620609,
"grad_norm": 0.04831859841942787,
"learning_rate": 8.257364303102275e-06,
"loss": 0.3836,
"step": 148
},
{
"epoch": 1.0421545667447307,
"grad_norm": 0.05533618479967117,
"learning_rate": 8.226139689715233e-06,
"loss": 0.3699,
"step": 149
},
{
"epoch": 1.0491803278688525,
"grad_norm": 0.05083422362804413,
"learning_rate": 8.19469801476288e-06,
"loss": 0.4104,
"step": 150
},
{
"epoch": 1.0562060889929743,
"grad_norm": 0.05335497856140137,
"learning_rate": 8.16304139370823e-06,
"loss": 0.3209,
"step": 151
},
{
"epoch": 1.063231850117096,
"grad_norm": 0.054009512066841125,
"learning_rate": 8.131171956476328e-06,
"loss": 0.3853,
"step": 152
},
{
"epoch": 1.0702576112412179,
"grad_norm": 0.049055177718400955,
"learning_rate": 8.09909184731094e-06,
"loss": 0.3542,
"step": 153
},
{
"epoch": 1.0772833723653397,
"grad_norm": 0.06105168163776398,
"learning_rate": 8.066803224630295e-06,
"loss": 0.3527,
"step": 154
},
{
"epoch": 1.0843091334894615,
"grad_norm": 0.05668722093105316,
"learning_rate": 8.034308260881854e-06,
"loss": 0.3725,
"step": 155
},
{
"epoch": 1.0913348946135832,
"grad_norm": 0.05870070680975914,
"learning_rate": 8.00160914239615e-06,
"loss": 0.3502,
"step": 156
},
{
"epoch": 1.098360655737705,
"grad_norm": 0.05561830475926399,
"learning_rate": 7.968708069239672e-06,
"loss": 0.4132,
"step": 157
},
{
"epoch": 1.1053864168618266,
"grad_norm": 0.05985680967569351,
"learning_rate": 7.935607255066867e-06,
"loss": 0.387,
"step": 158
},
{
"epoch": 1.1124121779859484,
"grad_norm": 0.05309848487377167,
"learning_rate": 7.902308926971166e-06,
"loss": 0.3512,
"step": 159
},
{
"epoch": 1.1194379391100702,
"grad_norm": 0.057192280888557434,
"learning_rate": 7.868815325335168e-06,
"loss": 0.3755,
"step": 160
},
{
"epoch": 1.126463700234192,
"grad_norm": 0.06615495681762695,
"learning_rate": 7.835128703679896e-06,
"loss": 0.3666,
"step": 161
},
{
"epoch": 1.1334894613583137,
"grad_norm": 0.05387312173843384,
"learning_rate": 7.801251328513164e-06,
"loss": 0.3669,
"step": 162
},
{
"epoch": 1.1405152224824355,
"grad_norm": 0.06274469196796417,
"learning_rate": 7.767185479177092e-06,
"loss": 0.3513,
"step": 163
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.06123442202806473,
"learning_rate": 7.732933447694748e-06,
"loss": 0.3789,
"step": 164
},
{
"epoch": 1.154566744730679,
"grad_norm": 0.05250508710741997,
"learning_rate": 7.698497538615928e-06,
"loss": 0.337,
"step": 165
},
{
"epoch": 1.161592505854801,
"grad_norm": 0.05276589095592499,
"learning_rate": 7.663880068862106e-06,
"loss": 0.3281,
"step": 166
},
{
"epoch": 1.1686182669789227,
"grad_norm": 0.05875389277935028,
"learning_rate": 7.629083367570547e-06,
"loss": 0.3786,
"step": 167
},
{
"epoch": 1.1756440281030445,
"grad_norm": 0.06692057847976685,
"learning_rate": 7.594109775937595e-06,
"loss": 0.3656,
"step": 168
},
{
"epoch": 1.1826697892271663,
"grad_norm": 0.05434219911694527,
"learning_rate": 7.558961647061156e-06,
"loss": 0.3733,
"step": 169
},
{
"epoch": 1.189695550351288,
"grad_norm": 0.06000547111034393,
"learning_rate": 7.5236413457823745e-06,
"loss": 0.3174,
"step": 170
},
{
"epoch": 1.1967213114754098,
"grad_norm": 0.05446619912981987,
"learning_rate": 7.488151248526518e-06,
"loss": 0.3304,
"step": 171
},
{
"epoch": 1.2037470725995316,
"grad_norm": 0.05152672156691551,
"learning_rate": 7.452493743143092e-06,
"loss": 0.3353,
"step": 172
},
{
"epoch": 1.2107728337236534,
"grad_norm": 0.06070106849074364,
"learning_rate": 7.416671228745181e-06,
"loss": 0.4031,
"step": 173
},
{
"epoch": 1.2177985948477752,
"grad_norm": 0.05588310956954956,
"learning_rate": 7.380686115548024e-06,
"loss": 0.3465,
"step": 174
},
{
"epoch": 1.224824355971897,
"grad_norm": 0.05709127336740494,
"learning_rate": 7.344540824706855e-06,
"loss": 0.3529,
"step": 175
},
{
"epoch": 1.2318501170960188,
"grad_norm": 0.05777303874492645,
"learning_rate": 7.3082377881540025e-06,
"loss": 0.3622,
"step": 176
},
{
"epoch": 1.2388758782201406,
"grad_norm": 0.05582602322101593,
"learning_rate": 7.271779448435265e-06,
"loss": 0.3663,
"step": 177
},
{
"epoch": 1.2459016393442623,
"grad_norm": 0.060280896723270416,
"learning_rate": 7.235168258545569e-06,
"loss": 0.3681,
"step": 178
},
{
"epoch": 1.2529274004683841,
"grad_norm": 0.059871841222047806,
"learning_rate": 7.198406681763925e-06,
"loss": 0.3706,
"step": 179
},
{
"epoch": 1.259953161592506,
"grad_norm": 0.06064627692103386,
"learning_rate": 7.161497191487693e-06,
"loss": 0.379,
"step": 180
},
{
"epoch": 1.2669789227166277,
"grad_norm": 0.04873732104897499,
"learning_rate": 7.124442271066174e-06,
"loss": 0.3558,
"step": 181
},
{
"epoch": 1.2740046838407495,
"grad_norm": 0.06199304386973381,
"learning_rate": 7.087244413633516e-06,
"loss": 0.3575,
"step": 182
},
{
"epoch": 1.281030444964871,
"grad_norm": 0.058650556951761246,
"learning_rate": 7.049906121940974e-06,
"loss": 0.3685,
"step": 183
},
{
"epoch": 1.288056206088993,
"grad_norm": 0.06006557121872902,
"learning_rate": 7.012429908188523e-06,
"loss": 0.345,
"step": 184
},
{
"epoch": 1.2950819672131146,
"grad_norm": 0.059959061443805695,
"learning_rate": 6.9748182938558225e-06,
"loss": 0.3252,
"step": 185
},
{
"epoch": 1.3021077283372366,
"grad_norm": 0.05382518842816353,
"learning_rate": 6.937073809532581e-06,
"loss": 0.4011,
"step": 186
},
{
"epoch": 1.3091334894613582,
"grad_norm": 0.05975024029612541,
"learning_rate": 6.899198994748274e-06,
"loss": 0.3351,
"step": 187
},
{
"epoch": 1.3161592505854802,
"grad_norm": 0.05733481049537659,
"learning_rate": 6.861196397801297e-06,
"loss": 0.309,
"step": 188
},
{
"epoch": 1.3231850117096018,
"grad_norm": 0.051539335399866104,
"learning_rate": 6.823068575587496e-06,
"loss": 0.3394,
"step": 189
},
{
"epoch": 1.3302107728337236,
"grad_norm": 0.05479830130934715,
"learning_rate": 6.784818093428144e-06,
"loss": 0.3243,
"step": 190
},
{
"epoch": 1.3372365339578454,
"grad_norm": 0.05627552792429924,
"learning_rate": 6.746447524897335e-06,
"loss": 0.3534,
"step": 191
},
{
"epoch": 1.3442622950819672,
"grad_norm": 0.05136909708380699,
"learning_rate": 6.70795945164883e-06,
"loss": 0.36,
"step": 192
},
{
"epoch": 1.351288056206089,
"grad_norm": 0.06510506570339203,
"learning_rate": 6.6693564632423626e-06,
"loss": 0.3617,
"step": 193
},
{
"epoch": 1.3583138173302107,
"grad_norm": 0.06433955579996109,
"learning_rate": 6.630641156969397e-06,
"loss": 0.334,
"step": 194
},
{
"epoch": 1.3653395784543325,
"grad_norm": 0.05501256510615349,
"learning_rate": 6.591816137678388e-06,
"loss": 0.3502,
"step": 195
},
{
"epoch": 1.3723653395784543,
"grad_norm": 0.06037002056837082,
"learning_rate": 6.552884017599517e-06,
"loss": 0.3673,
"step": 196
},
{
"epoch": 1.379391100702576,
"grad_norm": 0.06693354994058609,
"learning_rate": 6.513847416168929e-06,
"loss": 0.3842,
"step": 197
},
{
"epoch": 1.3864168618266979,
"grad_norm": 0.06090663745999336,
"learning_rate": 6.474708959852504e-06,
"loss": 0.31,
"step": 198
},
{
"epoch": 1.3934426229508197,
"grad_norm": 0.05243751406669617,
"learning_rate": 6.435471281969133e-06,
"loss": 0.329,
"step": 199
},
{
"epoch": 1.4004683840749415,
"grad_norm": 0.057668376713991165,
"learning_rate": 6.396137022513545e-06,
"loss": 0.3504,
"step": 200
},
{
"epoch": 1.4074941451990632,
"grad_norm": 0.05978507921099663,
"learning_rate": 6.3567088279786885e-06,
"loss": 0.3664,
"step": 201
},
{
"epoch": 1.414519906323185,
"grad_norm": 0.0557040311396122,
"learning_rate": 6.317189351177657e-06,
"loss": 0.3667,
"step": 202
},
{
"epoch": 1.4215456674473068,
"grad_norm": 0.06627603620290756,
"learning_rate": 6.277581251065217e-06,
"loss": 0.3627,
"step": 203
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.05487382784485817,
"learning_rate": 6.237887192558894e-06,
"loss": 0.3806,
"step": 204
},
{
"epoch": 1.4355971896955504,
"grad_norm": 0.06525876373052597,
"learning_rate": 6.198109846359682e-06,
"loss": 0.3483,
"step": 205
},
{
"epoch": 1.4426229508196722,
"grad_norm": 0.061048902571201324,
"learning_rate": 6.15825188877235e-06,
"loss": 0.4125,
"step": 206
},
{
"epoch": 1.449648711943794,
"grad_norm": 0.057914849370718,
"learning_rate": 6.118316001525368e-06,
"loss": 0.3748,
"step": 207
},
{
"epoch": 1.4566744730679158,
"grad_norm": 0.0596294067800045,
"learning_rate": 6.078304871590485e-06,
"loss": 0.3302,
"step": 208
},
{
"epoch": 1.4637002341920375,
"grad_norm": 0.056739531457424164,
"learning_rate": 6.038221191001935e-06,
"loss": 0.3529,
"step": 209
},
{
"epoch": 1.4707259953161593,
"grad_norm": 0.050509966909885406,
"learning_rate": 5.998067656675318e-06,
"loss": 0.3776,
"step": 210
},
{
"epoch": 1.4777517564402811,
"grad_norm": 0.05876694247126579,
"learning_rate": 5.95784697022614e-06,
"loss": 0.3552,
"step": 211
},
{
"epoch": 1.4847775175644027,
"grad_norm": 0.05890351161360741,
"learning_rate": 5.917561837788046e-06,
"loss": 0.3556,
"step": 212
},
{
"epoch": 1.4918032786885247,
"grad_norm": 0.05058097094297409,
"learning_rate": 5.877214969830746e-06,
"loss": 0.3184,
"step": 213
},
{
"epoch": 1.4988290398126463,
"grad_norm": 0.057646844536066055,
"learning_rate": 5.836809080977644e-06,
"loss": 0.3577,
"step": 214
},
{
"epoch": 1.5058548009367683,
"grad_norm": 0.05617011711001396,
"learning_rate": 5.7963468898232026e-06,
"loss": 0.351,
"step": 215
},
{
"epoch": 1.5128805620608898,
"grad_norm": 0.07923811674118042,
"learning_rate": 5.755831118750016e-06,
"loss": 0.3816,
"step": 216
},
{
"epoch": 1.5199063231850118,
"grad_norm": 0.05769532546401024,
"learning_rate": 5.715264493745652e-06,
"loss": 0.355,
"step": 217
},
{
"epoch": 1.5269320843091334,
"grad_norm": 0.05954836681485176,
"learning_rate": 5.6746497442192425e-06,
"loss": 0.3514,
"step": 218
},
{
"epoch": 1.5339578454332554,
"grad_norm": 0.056089069694280624,
"learning_rate": 5.633989602817837e-06,
"loss": 0.3369,
"step": 219
},
{
"epoch": 1.540983606557377,
"grad_norm": 0.0603008046746254,
"learning_rate": 5.593286805242549e-06,
"loss": 0.3669,
"step": 220
},
{
"epoch": 1.548009367681499,
"grad_norm": 0.06486702710390091,
"learning_rate": 5.552544090064487e-06,
"loss": 0.3657,
"step": 221
},
{
"epoch": 1.5550351288056206,
"grad_norm": 0.06320104748010635,
"learning_rate": 5.5117641985405055e-06,
"loss": 0.3354,
"step": 222
},
{
"epoch": 1.5620608899297423,
"grad_norm": 0.05017773061990738,
"learning_rate": 5.47094987442876e-06,
"loss": 0.3442,
"step": 223
},
{
"epoch": 1.5690866510538641,
"grad_norm": 0.05650470405817032,
"learning_rate": 5.430103863804107e-06,
"loss": 0.3522,
"step": 224
},
{
"epoch": 1.576112412177986,
"grad_norm": 0.061710115522146225,
"learning_rate": 5.389228914873334e-06,
"loss": 0.3705,
"step": 225
},
{
"epoch": 1.5831381733021077,
"grad_norm": 0.04975937306880951,
"learning_rate": 5.348327777790262e-06,
"loss": 0.3351,
"step": 226
},
{
"epoch": 1.5901639344262295,
"grad_norm": 0.054747324436903,
"learning_rate": 5.307403204470711e-06,
"loss": 0.3588,
"step": 227
},
{
"epoch": 1.5971896955503513,
"grad_norm": 0.06445419788360596,
"learning_rate": 5.266457948407336e-06,
"loss": 0.3728,
"step": 228
},
{
"epoch": 1.604215456674473,
"grad_norm": 0.0565023347735405,
"learning_rate": 5.2254947644843735e-06,
"loss": 0.3523,
"step": 229
},
{
"epoch": 1.6112412177985949,
"grad_norm": 0.05139541998505592,
"learning_rate": 5.18451640879228e-06,
"loss": 0.3392,
"step": 230
},
{
"epoch": 1.6182669789227166,
"grad_norm": 0.05746756121516228,
"learning_rate": 5.14352563844231e-06,
"loss": 0.2981,
"step": 231
},
{
"epoch": 1.6252927400468384,
"grad_norm": 0.05003352463245392,
"learning_rate": 5.1025252113809945e-06,
"loss": 0.3195,
"step": 232
},
{
"epoch": 1.6323185011709602,
"grad_norm": 0.05454004183411598,
"learning_rate": 5.061517886204592e-06,
"loss": 0.3319,
"step": 233
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.054113056510686874,
"learning_rate": 5.02050642197348e-06,
"loss": 0.3514,
"step": 234
},
{
"epoch": 1.6463700234192038,
"grad_norm": 0.06152534484863281,
"learning_rate": 4.979493578026523e-06,
"loss": 0.3455,
"step": 235
},
{
"epoch": 1.6533957845433256,
"grad_norm": 0.05544520169496536,
"learning_rate": 4.9384821137954106e-06,
"loss": 0.3751,
"step": 236
},
{
"epoch": 1.6604215456674472,
"grad_norm": 0.07202091068029404,
"learning_rate": 4.897474788619007e-06,
"loss": 0.3418,
"step": 237
},
{
"epoch": 1.6674473067915692,
"grad_norm": 0.059407323598861694,
"learning_rate": 4.856474361557692e-06,
"loss": 0.3178,
"step": 238
},
{
"epoch": 1.6744730679156907,
"grad_norm": 0.051997967064380646,
"learning_rate": 4.815483591207721e-06,
"loss": 0.3754,
"step": 239
},
{
"epoch": 1.6814988290398127,
"grad_norm": 0.05671803653240204,
"learning_rate": 4.774505235515628e-06,
"loss": 0.3784,
"step": 240
},
{
"epoch": 1.6885245901639343,
"grad_norm": 0.056249409914016724,
"learning_rate": 4.733542051592665e-06,
"loss": 0.3327,
"step": 241
},
{
"epoch": 1.6955503512880563,
"grad_norm": 0.05896645039319992,
"learning_rate": 4.69259679552929e-06,
"loss": 0.3588,
"step": 242
},
{
"epoch": 1.7025761124121779,
"grad_norm": 0.070353664457798,
"learning_rate": 4.651672222209738e-06,
"loss": 0.3816,
"step": 243
},
{
"epoch": 1.7096018735362999,
"grad_norm": 0.05775173380970955,
"learning_rate": 4.6107710851266695e-06,
"loss": 0.3384,
"step": 244
},
{
"epoch": 1.7166276346604215,
"grad_norm": 0.05771046131849289,
"learning_rate": 4.5698961361958955e-06,
"loss": 0.3377,
"step": 245
},
{
"epoch": 1.7236533957845435,
"grad_norm": 0.055200010538101196,
"learning_rate": 4.529050125571241e-06,
"loss": 0.3456,
"step": 246
},
{
"epoch": 1.730679156908665,
"grad_norm": 0.05298285186290741,
"learning_rate": 4.488235801459495e-06,
"loss": 0.3166,
"step": 247
},
{
"epoch": 1.737704918032787,
"grad_norm": 0.05973465368151665,
"learning_rate": 4.447455909935513e-06,
"loss": 0.3711,
"step": 248
},
{
"epoch": 1.7447306791569086,
"grad_norm": 0.07082070410251617,
"learning_rate": 4.4067131947574515e-06,
"loss": 0.335,
"step": 249
},
{
"epoch": 1.7517564402810304,
"grad_norm": 0.05250892415642738,
"learning_rate": 4.3660103971821635e-06,
"loss": 0.3443,
"step": 250
},
{
"epoch": 1.7587822014051522,
"grad_norm": 0.06379300355911255,
"learning_rate": 4.3253502557807575e-06,
"loss": 0.3399,
"step": 251
},
{
"epoch": 1.765807962529274,
"grad_norm": 0.058025211095809937,
"learning_rate": 4.28473550625435e-06,
"loss": 0.3706,
"step": 252
},
{
"epoch": 1.7728337236533958,
"grad_norm": 0.05636170506477356,
"learning_rate": 4.244168881249986e-06,
"loss": 0.3737,
"step": 253
},
{
"epoch": 1.7798594847775175,
"grad_norm": 0.05882354453206062,
"learning_rate": 4.203653110176798e-06,
"loss": 0.3033,
"step": 254
},
{
"epoch": 1.7868852459016393,
"grad_norm": 0.05535350739955902,
"learning_rate": 4.163190919022357e-06,
"loss": 0.338,
"step": 255
},
{
"epoch": 1.7939110070257611,
"grad_norm": 0.0554145909845829,
"learning_rate": 4.122785030169256e-06,
"loss": 0.371,
"step": 256
},
{
"epoch": 1.800936768149883,
"grad_norm": 0.05324379727244377,
"learning_rate": 4.082438162211955e-06,
"loss": 0.3402,
"step": 257
},
{
"epoch": 1.8079625292740047,
"grad_norm": 0.06222432479262352,
"learning_rate": 4.042153029773861e-06,
"loss": 0.3405,
"step": 258
},
{
"epoch": 1.8149882903981265,
"grad_norm": 0.054615411907434464,
"learning_rate": 4.001932343324683e-06,
"loss": 0.3555,
"step": 259
},
{
"epoch": 1.8220140515222483,
"grad_norm": 0.0694437026977539,
"learning_rate": 3.961778808998066e-06,
"loss": 0.3863,
"step": 260
},
{
"epoch": 1.82903981264637,
"grad_norm": 0.05631214752793312,
"learning_rate": 3.921695128409517e-06,
"loss": 0.38,
"step": 261
},
{
"epoch": 1.8360655737704918,
"grad_norm": 0.05278317630290985,
"learning_rate": 3.8816839984746334e-06,
"loss": 0.3073,
"step": 262
},
{
"epoch": 1.8430913348946136,
"grad_norm": 0.05563074350357056,
"learning_rate": 3.841748111227652e-06,
"loss": 0.3417,
"step": 263
},
{
"epoch": 1.8501170960187352,
"grad_norm": 0.0652734711766243,
"learning_rate": 3.8018901536403198e-06,
"loss": 0.3819,
"step": 264
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.05986921489238739,
"learning_rate": 3.762112807441108e-06,
"loss": 0.3887,
"step": 265
},
{
"epoch": 1.8641686182669788,
"grad_norm": 0.060439128428697586,
"learning_rate": 3.7224187489347847e-06,
"loss": 0.3564,
"step": 266
},
{
"epoch": 1.8711943793911008,
"grad_norm": 0.06072353571653366,
"learning_rate": 3.682810648822343e-06,
"loss": 0.3868,
"step": 267
},
{
"epoch": 1.8782201405152223,
"grad_norm": 0.053765103220939636,
"learning_rate": 3.6432911720213127e-06,
"loss": 0.3699,
"step": 268
},
{
"epoch": 1.8852459016393444,
"grad_norm": 0.05896330624818802,
"learning_rate": 3.6038629774864563e-06,
"loss": 0.3384,
"step": 269
},
{
"epoch": 1.892271662763466,
"grad_norm": 0.06258895993232727,
"learning_rate": 3.56452871803087e-06,
"loss": 0.3837,
"step": 270
},
{
"epoch": 1.899297423887588,
"grad_norm": 0.06024617701768875,
"learning_rate": 3.525291040147498e-06,
"loss": 0.3078,
"step": 271
},
{
"epoch": 1.9063231850117095,
"grad_norm": 0.0663781389594078,
"learning_rate": 3.486152583831072e-06,
"loss": 0.3401,
"step": 272
},
{
"epoch": 1.9133489461358315,
"grad_norm": 0.0578744150698185,
"learning_rate": 3.447115982400485e-06,
"loss": 0.3572,
"step": 273
},
{
"epoch": 1.920374707259953,
"grad_norm": 0.05523526668548584,
"learning_rate": 3.4081838623216124e-06,
"loss": 0.381,
"step": 274
},
{
"epoch": 1.927400468384075,
"grad_norm": 0.0603664331138134,
"learning_rate": 3.3693588430306035e-06,
"loss": 0.3118,
"step": 275
},
{
"epoch": 1.9344262295081966,
"grad_norm": 0.05586825683712959,
"learning_rate": 3.330643536757638e-06,
"loss": 0.3449,
"step": 276
},
{
"epoch": 1.9414519906323187,
"grad_norm": 0.05573516711592674,
"learning_rate": 3.2920405483511702e-06,
"loss": 0.3405,
"step": 277
},
{
"epoch": 1.9484777517564402,
"grad_norm": 0.058013953268527985,
"learning_rate": 3.253552475102668e-06,
"loss": 0.3462,
"step": 278
},
{
"epoch": 1.955503512880562,
"grad_norm": 0.055303193628787994,
"learning_rate": 3.215181906571858e-06,
"loss": 0.3719,
"step": 279
},
{
"epoch": 1.9625292740046838,
"grad_norm": 0.05314116179943085,
"learning_rate": 3.1769314244125056e-06,
"loss": 0.3472,
"step": 280
},
{
"epoch": 1.9695550351288056,
"grad_norm": 0.06574741005897522,
"learning_rate": 3.1388036021987047e-06,
"loss": 0.3987,
"step": 281
},
{
"epoch": 1.9765807962529274,
"grad_norm": 0.053546082228422165,
"learning_rate": 3.100801005251727e-06,
"loss": 0.3384,
"step": 282
},
{
"epoch": 1.9836065573770492,
"grad_norm": 0.06009920313954353,
"learning_rate": 3.0629261904674206e-06,
"loss": 0.3563,
"step": 283
},
{
"epoch": 1.990632318501171,
"grad_norm": 0.05714387819170952,
"learning_rate": 3.025181706144178e-06,
"loss": 0.3169,
"step": 284
},
{
"epoch": 1.9976580796252927,
"grad_norm": 0.05756373330950737,
"learning_rate": 2.987570091811479e-06,
"loss": 0.3797,
"step": 285
},
{
"epoch": 2.0,
"grad_norm": 0.05756373330950737,
"learning_rate": 2.9500938780590276e-06,
"loss": 0.3294,
"step": 286
},
{
"epoch": 2.0070257611241216,
"grad_norm": 0.11346685141324997,
"learning_rate": 2.9127555863664857e-06,
"loss": 0.2824,
"step": 287
},
{
"epoch": 2.0140515222482436,
"grad_norm": 0.11799391359090805,
"learning_rate": 2.8755577289338267e-06,
"loss": 0.2677,
"step": 288
},
{
"epoch": 2.021077283372365,
"grad_norm": 0.0632392093539238,
"learning_rate": 2.838502808512309e-06,
"loss": 0.2369,
"step": 289
},
{
"epoch": 2.028103044496487,
"grad_norm": 0.07154154777526855,
"learning_rate": 2.801593318236078e-06,
"loss": 0.2623,
"step": 290
},
{
"epoch": 2.0351288056206087,
"grad_norm": 0.060281310230493546,
"learning_rate": 2.764831741454432e-06,
"loss": 0.286,
"step": 291
},
{
"epoch": 2.0421545667447307,
"grad_norm": 0.06496189534664154,
"learning_rate": 2.7282205515647348e-06,
"loss": 0.2172,
"step": 292
},
{
"epoch": 2.0491803278688523,
"grad_norm": 0.05627848207950592,
"learning_rate": 2.6917622118459975e-06,
"loss": 0.2247,
"step": 293
},
{
"epoch": 2.0562060889929743,
"grad_norm": 0.061047762632369995,
"learning_rate": 2.655459175293146e-06,
"loss": 0.2094,
"step": 294
},
{
"epoch": 2.063231850117096,
"grad_norm": 0.05773235112428665,
"learning_rate": 2.6193138844519785e-06,
"loss": 0.273,
"step": 295
},
{
"epoch": 2.070257611241218,
"grad_norm": 0.0726071298122406,
"learning_rate": 2.58332877125482e-06,
"loss": 0.2392,
"step": 296
},
{
"epoch": 2.0772833723653394,
"grad_norm": 0.06737970560789108,
"learning_rate": 2.5475062568569077e-06,
"loss": 0.2721,
"step": 297
},
{
"epoch": 2.0843091334894615,
"grad_norm": 0.1003628745675087,
"learning_rate": 2.511848751473485e-06,
"loss": 0.2392,
"step": 298
},
{
"epoch": 2.091334894613583,
"grad_norm": 0.05960209295153618,
"learning_rate": 2.476358654217627e-06,
"loss": 0.2195,
"step": 299
},
{
"epoch": 2.098360655737705,
"grad_norm": 0.07600712776184082,
"learning_rate": 2.4410383529388448e-06,
"loss": 0.2397,
"step": 300
},
{
"epoch": 2.1053864168618266,
"grad_norm": 0.05939944460988045,
"learning_rate": 2.405890224062406e-06,
"loss": 0.2456,
"step": 301
},
{
"epoch": 2.1124121779859486,
"grad_norm": 0.05378476157784462,
"learning_rate": 2.370916632429455e-06,
"loss": 0.2124,
"step": 302
},
{
"epoch": 2.11943793911007,
"grad_norm": 0.07477736473083496,
"learning_rate": 2.336119931137897e-06,
"loss": 0.2426,
"step": 303
},
{
"epoch": 2.126463700234192,
"grad_norm": 0.06331060081720352,
"learning_rate": 2.3015024613840742e-06,
"loss": 0.2446,
"step": 304
},
{
"epoch": 2.1334894613583137,
"grad_norm": 0.059247083961963654,
"learning_rate": 2.2670665523052534e-06,
"loss": 0.2388,
"step": 305
},
{
"epoch": 2.1405152224824358,
"grad_norm": 0.055599454790353775,
"learning_rate": 2.2328145208229096e-06,
"loss": 0.2119,
"step": 306
},
{
"epoch": 2.1475409836065573,
"grad_norm": 0.05521732196211815,
"learning_rate": 2.1987486714868384e-06,
"loss": 0.2212,
"step": 307
},
{
"epoch": 2.1545667447306793,
"grad_norm": 0.0638333410024643,
"learning_rate": 2.164871296320106e-06,
"loss": 0.2423,
"step": 308
},
{
"epoch": 2.161592505854801,
"grad_norm": 0.06265348196029663,
"learning_rate": 2.1311846746648325e-06,
"loss": 0.214,
"step": 309
},
{
"epoch": 2.168618266978923,
"grad_norm": 0.0508870929479599,
"learning_rate": 2.097691073028836e-06,
"loss": 0.2307,
"step": 310
},
{
"epoch": 2.1756440281030445,
"grad_norm": 0.05442043021321297,
"learning_rate": 2.064392744933135e-06,
"loss": 0.2381,
"step": 311
},
{
"epoch": 2.1826697892271665,
"grad_norm": 0.06381048262119293,
"learning_rate": 2.0312919307603286e-06,
"loss": 0.2056,
"step": 312
},
{
"epoch": 2.189695550351288,
"grad_norm": 0.05382630601525307,
"learning_rate": 1.998390857603853e-06,
"loss": 0.2282,
"step": 313
},
{
"epoch": 2.19672131147541,
"grad_norm": 0.06103895604610443,
"learning_rate": 1.965691739118146e-06,
"loss": 0.2176,
"step": 314
},
{
"epoch": 2.2037470725995316,
"grad_norm": 0.07506411522626877,
"learning_rate": 1.9331967753697077e-06,
"loss": 0.2111,
"step": 315
},
{
"epoch": 2.210772833723653,
"grad_norm": 0.061480604112148285,
"learning_rate": 1.9009081526890622e-06,
"loss": 0.2162,
"step": 316
},
{
"epoch": 2.217798594847775,
"grad_norm": 0.0700734481215477,
"learning_rate": 1.8688280435236732e-06,
"loss": 0.2104,
"step": 317
},
{
"epoch": 2.2248243559718968,
"grad_norm": 0.06788410246372223,
"learning_rate": 1.8369586062917693e-06,
"loss": 0.2528,
"step": 318
},
{
"epoch": 2.2318501170960188,
"grad_norm": 0.06086277589201927,
"learning_rate": 1.8053019852371195e-06,
"loss": 0.2433,
"step": 319
},
{
"epoch": 2.2388758782201403,
"grad_norm": 0.06438933312892914,
"learning_rate": 1.7738603102847696e-06,
"loss": 0.2306,
"step": 320
},
{
"epoch": 2.2459016393442623,
"grad_norm": 0.05693851783871651,
"learning_rate": 1.7426356968977265e-06,
"loss": 0.2603,
"step": 321
},
{
"epoch": 2.252927400468384,
"grad_norm": 0.07242682576179504,
"learning_rate": 1.711630245934638e-06,
"loss": 0.2595,
"step": 322
},
{
"epoch": 2.259953161592506,
"grad_norm": 0.057473134249448776,
"learning_rate": 1.6808460435084316e-06,
"loss": 0.2465,
"step": 323
},
{
"epoch": 2.2669789227166275,
"grad_norm": 0.060898784548044205,
"learning_rate": 1.6502851608459668e-06,
"loss": 0.2364,
"step": 324
},
{
"epoch": 2.2740046838407495,
"grad_norm": 0.05470450222492218,
"learning_rate": 1.6199496541486647e-06,
"loss": 0.2162,
"step": 325
},
{
"epoch": 2.281030444964871,
"grad_norm": 0.058283645659685135,
"learning_rate": 1.589841564454176e-06,
"loss": 0.2432,
"step": 326
},
{
"epoch": 2.288056206088993,
"grad_norm": 0.05749303847551346,
"learning_rate": 1.5599629174990482e-06,
"loss": 0.2493,
"step": 327
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.0628993809223175,
"learning_rate": 1.5303157235824323e-06,
"loss": 0.227,
"step": 328
},
{
"epoch": 2.3021077283372366,
"grad_norm": 0.054379165172576904,
"learning_rate": 1.5009019774308249e-06,
"loss": 0.2256,
"step": 329
},
{
"epoch": 2.309133489461358,
"grad_norm": 0.0579175241291523,
"learning_rate": 1.471723658063856e-06,
"loss": 0.2536,
"step": 330
},
{
"epoch": 2.3161592505854802,
"grad_norm": 0.06285049021244049,
"learning_rate": 1.4427827286611412e-06,
"loss": 0.2514,
"step": 331
},
{
"epoch": 2.323185011709602,
"grad_norm": 0.06163005530834198,
"learning_rate": 1.4140811364301931e-06,
"loss": 0.1979,
"step": 332
},
{
"epoch": 2.330210772833724,
"grad_norm": 0.06147882342338562,
"learning_rate": 1.385620812475409e-06,
"loss": 0.2382,
"step": 333
},
{
"epoch": 2.3372365339578454,
"grad_norm": 0.06549369543790817,
"learning_rate": 1.3574036716681366e-06,
"loss": 0.2688,
"step": 334
},
{
"epoch": 2.3442622950819674,
"grad_norm": 0.05481060966849327,
"learning_rate": 1.3294316125178474e-06,
"loss": 0.2419,
"step": 335
},
{
"epoch": 2.351288056206089,
"grad_norm": 0.059850070625543594,
"learning_rate": 1.301706517044395e-06,
"loss": 0.2359,
"step": 336
},
{
"epoch": 2.358313817330211,
"grad_norm": 0.05873354524374008,
"learning_rate": 1.2742302506513894e-06,
"loss": 0.2394,
"step": 337
},
{
"epoch": 2.3653395784543325,
"grad_norm": 0.06211516261100769,
"learning_rate": 1.247004662000686e-06,
"loss": 0.2564,
"step": 338
},
{
"epoch": 2.3723653395784545,
"grad_norm": 0.06391850858926773,
"learning_rate": 1.2200315828880094e-06,
"loss": 0.241,
"step": 339
},
{
"epoch": 2.379391100702576,
"grad_norm": 0.07204084098339081,
"learning_rate": 1.1933128281197042e-06,
"loss": 0.2272,
"step": 340
},
{
"epoch": 2.3864168618266977,
"grad_norm": 0.06479175388813019,
"learning_rate": 1.166850195390628e-06,
"loss": 0.2684,
"step": 341
},
{
"epoch": 2.3934426229508197,
"grad_norm": 0.06499191373586655,
"learning_rate": 1.1406454651632042e-06,
"loss": 0.2646,
"step": 342
},
{
"epoch": 2.4004683840749417,
"grad_norm": 0.06663113832473755,
"learning_rate": 1.1147004005476192e-06,
"loss": 0.2644,
"step": 343
},
{
"epoch": 2.4074941451990632,
"grad_norm": 0.0672060027718544,
"learning_rate": 1.089016747183208e-06,
"loss": 0.2051,
"step": 344
},
{
"epoch": 2.414519906323185,
"grad_norm": 0.055617500096559525,
"learning_rate": 1.063596233120997e-06,
"loss": 0.2343,
"step": 345
},
{
"epoch": 2.421545667447307,
"grad_norm": 0.05767429992556572,
"learning_rate": 1.03844056870744e-06,
"loss": 0.248,
"step": 346
},
{
"epoch": 2.4285714285714284,
"grad_norm": 0.05945609137415886,
"learning_rate": 1.013551446469337e-06,
"loss": 0.2329,
"step": 347
},
{
"epoch": 2.4355971896955504,
"grad_norm": 0.061543092131614685,
"learning_rate": 9.889305409999656e-07,
"loss": 0.2217,
"step": 348
},
{
"epoch": 2.442622950819672,
"grad_norm": 0.06861676275730133,
"learning_rate": 9.64579508846405e-07,
"loss": 0.2869,
"step": 349
},
{
"epoch": 2.449648711943794,
"grad_norm": 0.06629474461078644,
"learning_rate": 9.40499988398082e-07,
"loss": 0.2398,
"step": 350
},
{
"epoch": 2.4566744730679155,
"grad_norm": 0.06839589029550552,
"learning_rate": 9.166935997765364e-07,
"loss": 0.2631,
"step": 351
},
{
"epoch": 2.4637002341920375,
"grad_norm": 0.07068444043397903,
"learning_rate": 8.93161944726414e-07,
"loss": 0.2439,
"step": 352
},
{
"epoch": 2.470725995316159,
"grad_norm": 0.06256645917892456,
"learning_rate": 8.699066065077005e-07,
"loss": 0.243,
"step": 353
},
{
"epoch": 2.477751756440281,
"grad_norm": 0.053054191172122955,
"learning_rate": 8.469291497891979e-07,
"loss": 0.2405,
"step": 354
},
{
"epoch": 2.4847775175644027,
"grad_norm": 0.0644262358546257,
"learning_rate": 8.242311205432418e-07,
"loss": 0.2141,
"step": 355
},
{
"epoch": 2.4918032786885247,
"grad_norm": 0.05528811737895012,
"learning_rate": 8.018140459416962e-07,
"loss": 0.2511,
"step": 356
},
{
"epoch": 2.4988290398126463,
"grad_norm": 0.06793423742055893,
"learning_rate": 7.796794342531949e-07,
"loss": 0.2414,
"step": 357
},
{
"epoch": 2.5058548009367683,
"grad_norm": 0.06469738483428955,
"learning_rate": 7.57828774741664e-07,
"loss": 0.2389,
"step": 358
},
{
"epoch": 2.51288056206089,
"grad_norm": 0.05862313508987427,
"learning_rate": 7.362635375661225e-07,
"loss": 0.2483,
"step": 359
},
{
"epoch": 2.519906323185012,
"grad_norm": 0.06477522104978561,
"learning_rate": 7.149851736817609e-07,
"loss": 0.2725,
"step": 360
},
{
"epoch": 2.5269320843091334,
"grad_norm": 0.06782645732164383,
"learning_rate": 6.939951147423269e-07,
"loss": 0.2171,
"step": 361
},
{
"epoch": 2.5339578454332554,
"grad_norm": 0.06721869856119156,
"learning_rate": 6.732947730037936e-07,
"loss": 0.2272,
"step": 362
},
{
"epoch": 2.540983606557377,
"grad_norm": 0.06349222362041473,
"learning_rate": 6.52885541229345e-07,
"loss": 0.2361,
"step": 363
},
{
"epoch": 2.548009367681499,
"grad_norm": 0.0673484057188034,
"learning_rate": 6.327687925956616e-07,
"loss": 0.2242,
"step": 364
},
{
"epoch": 2.5550351288056206,
"grad_norm": 0.05453097075223923,
"learning_rate": 6.12945880600535e-07,
"loss": 0.2066,
"step": 365
},
{
"epoch": 2.562060889929742,
"grad_norm": 0.05890359729528427,
"learning_rate": 5.93418138971803e-07,
"loss": 0.27,
"step": 366
},
{
"epoch": 2.569086651053864,
"grad_norm": 0.06143670156598091,
"learning_rate": 5.741868815776081e-07,
"loss": 0.2532,
"step": 367
},
{
"epoch": 2.576112412177986,
"grad_norm": 0.06809256970882416,
"learning_rate": 5.552534023380024e-07,
"loss": 0.2507,
"step": 368
},
{
"epoch": 2.5831381733021077,
"grad_norm": 0.06404256820678711,
"learning_rate": 5.366189751378858e-07,
"loss": 0.2079,
"step": 369
},
{
"epoch": 2.5901639344262293,
"grad_norm": 0.05889306962490082,
"learning_rate": 5.18284853741301e-07,
"loss": 0.2592,
"step": 370
},
{
"epoch": 2.5971896955503513,
"grad_norm": 0.06670808047056198,
"learning_rate": 5.002522717070751e-07,
"loss": 0.2246,
"step": 371
},
{
"epoch": 2.6042154566744733,
"grad_norm": 0.060436200350522995,
"learning_rate": 4.8252244230582e-07,
"loss": 0.2291,
"step": 372
},
{
"epoch": 2.611241217798595,
"grad_norm": 0.06188951060175896,
"learning_rate": 4.6509655843830827e-07,
"loss": 0.2375,
"step": 373
},
{
"epoch": 2.6182669789227164,
"grad_norm": 0.06185289844870567,
"learning_rate": 4.4797579255520585e-07,
"loss": 0.2412,
"step": 374
},
{
"epoch": 2.6252927400468384,
"grad_norm": 0.061204344034194946,
"learning_rate": 4.311612965781903e-07,
"loss": 0.2453,
"step": 375
},
{
"epoch": 2.6323185011709604,
"grad_norm": 0.0526043102145195,
"learning_rate": 4.1465420182244476e-07,
"loss": 0.2399,
"step": 376
},
{
"epoch": 2.639344262295082,
"grad_norm": 0.06435679644346237,
"learning_rate": 3.984556189205441e-07,
"loss": 0.2267,
"step": 377
},
{
"epoch": 2.6463700234192036,
"grad_norm": 0.06311339884996414,
"learning_rate": 3.8256663774772383e-07,
"loss": 0.2584,
"step": 378
},
{
"epoch": 2.6533957845433256,
"grad_norm": 0.06288459897041321,
"learning_rate": 3.669883273485575e-07,
"loss": 0.2621,
"step": 379
},
{
"epoch": 2.660421545667447,
"grad_norm": 0.06034472957253456,
"learning_rate": 3.5172173586502543e-07,
"loss": 0.2132,
"step": 380
},
{
"epoch": 2.667447306791569,
"grad_norm": 0.06609684228897095,
"learning_rate": 3.3676789046599045e-07,
"loss": 0.219,
"step": 381
},
{
"epoch": 2.6744730679156907,
"grad_norm": 0.06798077374696732,
"learning_rate": 3.2212779727809504e-07,
"loss": 0.2571,
"step": 382
},
{
"epoch": 2.6814988290398127,
"grad_norm": 0.06018667295575142,
"learning_rate": 3.0780244131806193e-07,
"loss": 0.2567,
"step": 383
},
{
"epoch": 2.6885245901639343,
"grad_norm": 0.06950001418590546,
"learning_rate": 2.937927864264206e-07,
"loss": 0.2567,
"step": 384
},
{
"epoch": 2.6955503512880563,
"grad_norm": 0.06350544840097427,
"learning_rate": 2.800997752026596e-07,
"loss": 0.2359,
"step": 385
},
{
"epoch": 2.702576112412178,
"grad_norm": 0.06425828486680984,
"learning_rate": 2.667243289418059e-07,
"loss": 0.2229,
"step": 386
},
{
"epoch": 2.7096018735363,
"grad_norm": 0.061528194695711136,
"learning_rate": 2.5366734757243496e-07,
"loss": 0.1988,
"step": 387
},
{
"epoch": 2.7166276346604215,
"grad_norm": 0.06735736131668091,
"learning_rate": 2.4092970959612885e-07,
"loss": 0.2168,
"step": 388
},
{
"epoch": 2.7236533957845435,
"grad_norm": 0.051086440682411194,
"learning_rate": 2.2851227202836002e-07,
"loss": 0.2529,
"step": 389
},
{
"epoch": 2.730679156908665,
"grad_norm": 0.06252393126487732,
"learning_rate": 2.1641587034083756e-07,
"loss": 0.2351,
"step": 390
},
{
"epoch": 2.737704918032787,
"grad_norm": 0.05734705179929733,
"learning_rate": 2.0464131840528978e-07,
"loss": 0.2152,
"step": 391
},
{
"epoch": 2.7447306791569086,
"grad_norm": 0.055918820202350616,
"learning_rate": 1.9318940843870594e-07,
"loss": 0.2105,
"step": 392
},
{
"epoch": 2.7517564402810306,
"grad_norm": 0.05873579904437065,
"learning_rate": 1.8206091095003543e-07,
"loss": 0.2531,
"step": 393
},
{
"epoch": 2.758782201405152,
"grad_norm": 0.059994373470544815,
"learning_rate": 1.7125657468834656e-07,
"loss": 0.222,
"step": 394
},
{
"epoch": 2.7658079625292737,
"grad_norm": 0.05768108740448952,
"learning_rate": 1.6077712659244792e-07,
"loss": 0.2338,
"step": 395
},
{
"epoch": 2.7728337236533958,
"grad_norm": 0.0613434873521328,
"learning_rate": 1.5062327174197645e-07,
"loss": 0.2134,
"step": 396
},
{
"epoch": 2.7798594847775178,
"grad_norm": 0.0738491341471672,
"learning_rate": 1.4079569330996412e-07,
"loss": 0.2774,
"step": 397
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.0638870969414711,
"learning_rate": 1.3129505251686603e-07,
"loss": 0.2163,
"step": 398
},
{
"epoch": 2.793911007025761,
"grad_norm": 0.06510435044765472,
"learning_rate": 1.2212198858607694e-07,
"loss": 0.2268,
"step": 399
},
{
"epoch": 2.800936768149883,
"grad_norm": 0.060825150460004807,
"learning_rate": 1.1327711870091963e-07,
"loss": 0.2397,
"step": 400
},
{
"epoch": 2.807962529274005,
"grad_norm": 0.06411723047494888,
"learning_rate": 1.0476103796312254e-07,
"loss": 0.2509,
"step": 401
},
{
"epoch": 2.8149882903981265,
"grad_norm": 0.06250890344381332,
"learning_rate": 9.657431935277629e-08,
"loss": 0.2432,
"step": 402
},
{
"epoch": 2.822014051522248,
"grad_norm": 0.05637110397219658,
"learning_rate": 8.871751368978554e-08,
"loss": 0.2307,
"step": 403
},
{
"epoch": 2.82903981264637,
"grad_norm": 0.0648709312081337,
"learning_rate": 8.119114959680929e-08,
"loss": 0.2077,
"step": 404
},
{
"epoch": 2.836065573770492,
"grad_norm": 0.07058855891227722,
"learning_rate": 7.399573346368871e-08,
"loss": 0.2623,
"step": 405
},
{
"epoch": 2.8430913348946136,
"grad_norm": 0.06563594192266464,
"learning_rate": 6.713174941338163e-08,
"loss": 0.2555,
"step": 406
},
{
"epoch": 2.850117096018735,
"grad_norm": 0.06609778106212616,
"learning_rate": 6.05996592693886e-08,
"loss": 0.1752,
"step": 407
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.053535059094429016,
"learning_rate": 5.439990252467886e-08,
"loss": 0.2673,
"step": 408
},
{
"epoch": 2.8641686182669788,
"grad_norm": 0.05849752202630043,
"learning_rate": 4.853289631212066e-08,
"loss": 0.2596,
"step": 409
},
{
"epoch": 2.871194379391101,
"grad_norm": 0.06690798699855804,
"learning_rate": 4.299903537641703e-08,
"loss": 0.2261,
"step": 410
},
{
"epoch": 2.8782201405152223,
"grad_norm": 0.06300397962331772,
"learning_rate": 3.779869204754427e-08,
"loss": 0.2502,
"step": 411
},
{
"epoch": 2.8852459016393444,
"grad_norm": 0.06787782162427902,
"learning_rate": 3.2932216215704195e-08,
"loss": 0.2246,
"step": 412
},
{
"epoch": 2.892271662763466,
"grad_norm": 0.05957074463367462,
"learning_rate": 2.8399935307778516e-08,
"loss": 0.2237,
"step": 413
},
{
"epoch": 2.899297423887588,
"grad_norm": 0.06010279804468155,
"learning_rate": 2.420215426530259e-08,
"loss": 0.2146,
"step": 414
},
{
"epoch": 2.9063231850117095,
"grad_norm": 0.07070771604776382,
"learning_rate": 2.0339155523945164e-08,
"loss": 0.2146,
"step": 415
},
{
"epoch": 2.9133489461358315,
"grad_norm": 0.06385600566864014,
"learning_rate": 1.681119899450856e-08,
"loss": 0.2336,
"step": 416
},
{
"epoch": 2.920374707259953,
"grad_norm": 0.06282834708690643,
"learning_rate": 1.3618522045439897e-08,
"loss": 0.2257,
"step": 417
},
{
"epoch": 2.927400468384075,
"grad_norm": 0.06092951446771622,
"learning_rate": 1.0761339486859424e-08,
"loss": 0.2299,
"step": 418
},
{
"epoch": 2.9344262295081966,
"grad_norm": 0.05854687839746475,
"learning_rate": 8.239843556108739e-09,
"loss": 0.191,
"step": 419
},
{
"epoch": 2.9414519906323187,
"grad_norm": 0.06888988614082336,
"learning_rate": 6.054203904817812e-09,
"loss": 0.2607,
"step": 420
},
{
"epoch": 2.9484777517564402,
"grad_norm": 0.059768833220005035,
"learning_rate": 4.204567587486885e-09,
"loss": 0.261,
"step": 421
},
{
"epoch": 2.9555035128805622,
"grad_norm": 0.06158865615725517,
"learning_rate": 2.6910590515966117e-09,
"loss": 0.2063,
"step": 422
},
{
"epoch": 2.962529274004684,
"grad_norm": 0.06061291694641113,
"learning_rate": 1.5137801292325338e-09,
"loss": 0.2551,
"step": 423
},
{
"epoch": 2.9695550351288054,
"grad_norm": 0.0802014172077179,
"learning_rate": 6.728100302327844e-10,
"loss": 0.2163,
"step": 424
},
{
"epoch": 2.9765807962529274,
"grad_norm": 0.05840716511011124,
"learning_rate": 1.6820533686179308e-10,
"loss": 0.2531,
"step": 425
},
{
"epoch": 2.9836065573770494,
"grad_norm": 0.06497234106063843,
"learning_rate": 0.0,
"loss": 0.2231,
"step": 426
},
{
"epoch": 2.9836065573770494,
"step": 426,
"total_flos": 1.0634701552012493e+17,
"train_loss": 0.3742912175230017,
"train_runtime": 25220.8812,
"train_samples_per_second": 0.406,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 426,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0634701552012493e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}