{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.98159509202454,
"eval_steps": 500,
"global_step": 366,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0081799591002045,
"grad_norm": null,
"learning_rate": 0.0,
"loss": 1.978,
"step": 1
},
{
"epoch": 0.016359918200409,
"grad_norm": null,
"learning_rate": 0.0,
"loss": 2.0417,
"step": 2
},
{
"epoch": 0.024539877300613498,
"grad_norm": 3.960280656814575,
"learning_rate": 0.0,
"loss": 2.4526,
"step": 3
},
{
"epoch": 0.032719836400818,
"grad_norm": 4.871670722961426,
"learning_rate": 2.702702702702703e-06,
"loss": 2.3708,
"step": 4
},
{
"epoch": 0.0408997955010225,
"grad_norm": null,
"learning_rate": 5.405405405405406e-06,
"loss": 2.7402,
"step": 5
},
{
"epoch": 0.049079754601226995,
"grad_norm": 2.961730718612671,
"learning_rate": 5.405405405405406e-06,
"loss": 2.6255,
"step": 6
},
{
"epoch": 0.05725971370143149,
"grad_norm": 2.4267706871032715,
"learning_rate": 8.108108108108109e-06,
"loss": 2.1672,
"step": 7
},
{
"epoch": 0.065439672801636,
"grad_norm": 4.815489768981934,
"learning_rate": 1.0810810810810812e-05,
"loss": 2.6853,
"step": 8
},
{
"epoch": 0.0736196319018405,
"grad_norm": 5.672784805297852,
"learning_rate": 1.3513513513513515e-05,
"loss": 2.5256,
"step": 9
},
{
"epoch": 0.081799591002045,
"grad_norm": 2.97552490234375,
"learning_rate": 1.6216216216216218e-05,
"loss": 2.1108,
"step": 10
},
{
"epoch": 0.08997955010224949,
"grad_norm": 2.305542469024658,
"learning_rate": 1.891891891891892e-05,
"loss": 2.2621,
"step": 11
},
{
"epoch": 0.09815950920245399,
"grad_norm": 2.330063581466675,
"learning_rate": 2.1621621621621624e-05,
"loss": 2.2973,
"step": 12
},
{
"epoch": 0.10633946830265849,
"grad_norm": 2.395848274230957,
"learning_rate": 2.4324324324324327e-05,
"loss": 2.2853,
"step": 13
},
{
"epoch": 0.11451942740286299,
"grad_norm": 3.5902745723724365,
"learning_rate": 2.702702702702703e-05,
"loss": 2.3628,
"step": 14
},
{
"epoch": 0.12269938650306748,
"grad_norm": 3.785466194152832,
"learning_rate": 2.9729729729729733e-05,
"loss": 1.6333,
"step": 15
},
{
"epoch": 0.130879345603272,
"grad_norm": 2.845073699951172,
"learning_rate": 3.2432432432432436e-05,
"loss": 1.6592,
"step": 16
},
{
"epoch": 0.1390593047034765,
"grad_norm": 2.9714362621307373,
"learning_rate": 3.513513513513514e-05,
"loss": 2.4303,
"step": 17
},
{
"epoch": 0.147239263803681,
"grad_norm": 3.2374515533447266,
"learning_rate": 3.783783783783784e-05,
"loss": 1.6276,
"step": 18
},
{
"epoch": 0.1554192229038855,
"grad_norm": 2.4501020908355713,
"learning_rate": 4.0540540540540545e-05,
"loss": 2.1786,
"step": 19
},
{
"epoch": 0.16359918200409,
"grad_norm": 1.689795970916748,
"learning_rate": 4.324324324324325e-05,
"loss": 1.7335,
"step": 20
},
{
"epoch": 0.17177914110429449,
"grad_norm": 1.5767645835876465,
"learning_rate": 4.594594594594595e-05,
"loss": 1.9214,
"step": 21
},
{
"epoch": 0.17995910020449898,
"grad_norm": 2.6853578090667725,
"learning_rate": 4.8648648648648654e-05,
"loss": 1.9089,
"step": 22
},
{
"epoch": 0.18813905930470348,
"grad_norm": 3.5681397914886475,
"learning_rate": 5.135135135135135e-05,
"loss": 2.092,
"step": 23
},
{
"epoch": 0.19631901840490798,
"grad_norm": 3.208242416381836,
"learning_rate": 5.405405405405406e-05,
"loss": 1.9184,
"step": 24
},
{
"epoch": 0.20449897750511248,
"grad_norm": 3.677274227142334,
"learning_rate": 5.6756756756756757e-05,
"loss": 1.6762,
"step": 25
},
{
"epoch": 0.21267893660531698,
"grad_norm": 1.9500216245651245,
"learning_rate": 5.9459459459459466e-05,
"loss": 1.8447,
"step": 26
},
{
"epoch": 0.22085889570552147,
"grad_norm": 2.72743821144104,
"learning_rate": 6.216216216216216e-05,
"loss": 1.659,
"step": 27
},
{
"epoch": 0.22903885480572597,
"grad_norm": 1.4266787767410278,
"learning_rate": 6.486486486486487e-05,
"loss": 1.5518,
"step": 28
},
{
"epoch": 0.23721881390593047,
"grad_norm": 1.8338373899459839,
"learning_rate": 6.756756756756757e-05,
"loss": 1.7725,
"step": 29
},
{
"epoch": 0.24539877300613497,
"grad_norm": 2.702836751937866,
"learning_rate": 7.027027027027028e-05,
"loss": 1.3214,
"step": 30
},
{
"epoch": 0.25357873210633947,
"grad_norm": 3.3664143085479736,
"learning_rate": 7.297297297297297e-05,
"loss": 1.5599,
"step": 31
},
{
"epoch": 0.261758691206544,
"grad_norm": 1.7983371019363403,
"learning_rate": 7.567567567567568e-05,
"loss": 1.4007,
"step": 32
},
{
"epoch": 0.26993865030674846,
"grad_norm": 1.4321403503417969,
"learning_rate": 7.837837837837838e-05,
"loss": 1.4209,
"step": 33
},
{
"epoch": 0.278118609406953,
"grad_norm": 1.7886905670166016,
"learning_rate": 8.108108108108109e-05,
"loss": 1.1963,
"step": 34
},
{
"epoch": 0.28629856850715746,
"grad_norm": 2.0502827167510986,
"learning_rate": 8.378378378378379e-05,
"loss": 1.6071,
"step": 35
},
{
"epoch": 0.294478527607362,
"grad_norm": 2.351100206375122,
"learning_rate": 8.64864864864865e-05,
"loss": 1.1902,
"step": 36
},
{
"epoch": 0.30265848670756645,
"grad_norm": 3.3446481227874756,
"learning_rate": 8.918918918918919e-05,
"loss": 1.1838,
"step": 37
},
{
"epoch": 0.310838445807771,
"grad_norm": 1.3906322717666626,
"learning_rate": 9.18918918918919e-05,
"loss": 0.9948,
"step": 38
},
{
"epoch": 0.31901840490797545,
"grad_norm": 1.9603602886199951,
"learning_rate": 9.45945945945946e-05,
"loss": 1.0997,
"step": 39
},
{
"epoch": 0.32719836400818,
"grad_norm": 1.3630380630493164,
"learning_rate": 9.729729729729731e-05,
"loss": 0.6781,
"step": 40
},
{
"epoch": 0.33537832310838445,
"grad_norm": 2.0062906742095947,
"learning_rate": 0.0001,
"loss": 1.4761,
"step": 41
},
{
"epoch": 0.34355828220858897,
"grad_norm": 1.2718311548233032,
"learning_rate": 9.99977204734326e-05,
"loss": 1.0233,
"step": 42
},
{
"epoch": 0.35173824130879344,
"grad_norm": 1.5002365112304688,
"learning_rate": 9.999088210158001e-05,
"loss": 0.8199,
"step": 43
},
{
"epoch": 0.35991820040899797,
"grad_norm": 1.430582880973816,
"learning_rate": 9.997948550797227e-05,
"loss": 1.1087,
"step": 44
},
{
"epoch": 0.36809815950920244,
"grad_norm": 2.0703279972076416,
"learning_rate": 9.996353173176289e-05,
"loss": 0.8149,
"step": 45
},
{
"epoch": 0.37627811860940696,
"grad_norm": 1.5086464881896973,
"learning_rate": 9.994302222763414e-05,
"loss": 0.9549,
"step": 46
},
{
"epoch": 0.38445807770961143,
"grad_norm": 1.5020016431808472,
"learning_rate": 9.991795886566441e-05,
"loss": 0.9542,
"step": 47
},
{
"epoch": 0.39263803680981596,
"grad_norm": 1.6791132688522339,
"learning_rate": 9.988834393115767e-05,
"loss": 0.8864,
"step": 48
},
{
"epoch": 0.40081799591002043,
"grad_norm": 2.7517454624176025,
"learning_rate": 9.98541801244351e-05,
"loss": 1.0826,
"step": 49
},
{
"epoch": 0.40899795501022496,
"grad_norm": 1.6971582174301147,
"learning_rate": 9.981547056058893e-05,
"loss": 0.9114,
"step": 50
},
{
"epoch": 0.4171779141104294,
"grad_norm": 1.4928799867630005,
"learning_rate": 9.977221876919833e-05,
"loss": 0.9755,
"step": 51
},
{
"epoch": 0.42535787321063395,
"grad_norm": 1.1075421571731567,
"learning_rate": 9.972442869400759e-05,
"loss": 0.741,
"step": 52
},
{
"epoch": 0.4335378323108384,
"grad_norm": 1.3488271236419678,
"learning_rate": 9.967210469256656e-05,
"loss": 1.0441,
"step": 53
},
{
"epoch": 0.44171779141104295,
"grad_norm": 3.3010079860687256,
"learning_rate": 9.961525153583327e-05,
"loss": 1.2885,
"step": 54
},
{
"epoch": 0.4498977505112474,
"grad_norm": 1.24274742603302,
"learning_rate": 9.9553874407739e-05,
"loss": 0.8107,
"step": 55
},
{
"epoch": 0.45807770961145194,
"grad_norm": 1.1659082174301147,
"learning_rate": 9.948797890471551e-05,
"loss": 0.6129,
"step": 56
},
{
"epoch": 0.4662576687116564,
"grad_norm": 1.5525306463241577,
"learning_rate": 9.941757103518478e-05,
"loss": 0.9262,
"step": 57
},
{
"epoch": 0.47443762781186094,
"grad_norm": 1.3923064470291138,
"learning_rate": 9.93426572190112e-05,
"loss": 0.7552,
"step": 58
},
{
"epoch": 0.48261758691206547,
"grad_norm": 1.166669487953186,
"learning_rate": 9.926324428691611e-05,
"loss": 0.6346,
"step": 59
},
{
"epoch": 0.49079754601226994,
"grad_norm": 2.103994131088257,
"learning_rate": 9.917933947985507e-05,
"loss": 0.8199,
"step": 60
},
{
"epoch": 0.49897750511247446,
"grad_norm": 1.633812665939331,
"learning_rate": 9.909095044835754e-05,
"loss": 0.7485,
"step": 61
},
{
"epoch": 0.5071574642126789,
"grad_norm": 1.4533647298812866,
"learning_rate": 9.899808525182935e-05,
"loss": 1.1649,
"step": 62
},
{
"epoch": 0.5153374233128835,
"grad_norm": 1.6267237663269043,
"learning_rate": 9.890075235781779e-05,
"loss": 1.1159,
"step": 63
},
{
"epoch": 0.523517382413088,
"grad_norm": 1.2796165943145752,
"learning_rate": 9.879896064123961e-05,
"loss": 0.9613,
"step": 64
},
{
"epoch": 0.5316973415132924,
"grad_norm": 1.3240866661071777,
"learning_rate": 9.869271938357167e-05,
"loss": 1.047,
"step": 65
},
{
"epoch": 0.5398773006134969,
"grad_norm": 1.190612554550171,
"learning_rate": 9.858203827200476e-05,
"loss": 1.1846,
"step": 66
},
{
"epoch": 0.5480572597137015,
"grad_norm": 1.1655223369598389,
"learning_rate": 9.846692739856024e-05,
"loss": 0.9566,
"step": 67
},
{
"epoch": 0.556237218813906,
"grad_norm": 1.2617253065109253,
"learning_rate": 9.834739725916988e-05,
"loss": 1.1108,
"step": 68
},
{
"epoch": 0.5644171779141104,
"grad_norm": 1.3576512336730957,
"learning_rate": 9.822345875271883e-05,
"loss": 1.1265,
"step": 69
},
{
"epoch": 0.5725971370143149,
"grad_norm": 1.4342156648635864,
"learning_rate": 9.809512318005181e-05,
"loss": 0.7757,
"step": 70
},
{
"epoch": 0.5807770961145194,
"grad_norm": 1.0733706951141357,
"learning_rate": 9.796240224294271e-05,
"loss": 0.9006,
"step": 71
},
{
"epoch": 0.588957055214724,
"grad_norm": 1.323440432548523,
"learning_rate": 9.782530804302763e-05,
"loss": 0.9322,
"step": 72
},
{
"epoch": 0.5971370143149284,
"grad_norm": 1.2899342775344849,
"learning_rate": 9.768385308070138e-05,
"loss": 0.8403,
"step": 73
},
{
"epoch": 0.6053169734151329,
"grad_norm": 1.2755167484283447,
"learning_rate": 9.753805025397779e-05,
"loss": 0.8397,
"step": 74
},
{
"epoch": 0.6134969325153374,
"grad_norm": 1.265972375869751,
"learning_rate": 9.738791285731352e-05,
"loss": 0.8143,
"step": 75
},
{
"epoch": 0.621676891615542,
"grad_norm": 1.1493557691574097,
"learning_rate": 9.723345458039594e-05,
"loss": 1.059,
"step": 76
},
{
"epoch": 0.6298568507157464,
"grad_norm": 1.1361910104751587,
"learning_rate": 9.707468950689491e-05,
"loss": 0.9112,
"step": 77
},
{
"epoch": 0.6380368098159509,
"grad_norm": 1.4090393781661987,
"learning_rate": 9.691163211317853e-05,
"loss": 0.7847,
"step": 78
},
{
"epoch": 0.6462167689161554,
"grad_norm": 1.300688624382019,
"learning_rate": 9.674429726699323e-05,
"loss": 0.9806,
"step": 79
},
{
"epoch": 0.65439672801636,
"grad_norm": 1.0143762826919556,
"learning_rate": 9.657270022610813e-05,
"loss": 0.6648,
"step": 80
},
{
"epoch": 0.6625766871165644,
"grad_norm": 1.2058460712432861,
"learning_rate": 9.63968566369238e-05,
"loss": 0.9702,
"step": 81
},
{
"epoch": 0.6707566462167689,
"grad_norm": 1.0876555442810059,
"learning_rate": 9.62167825330455e-05,
"loss": 0.7861,
"step": 82
},
{
"epoch": 0.6789366053169734,
"grad_norm": 1.502429723739624,
"learning_rate": 9.603249433382144e-05,
"loss": 1.1252,
"step": 83
},
{
"epoch": 0.6871165644171779,
"grad_norm": 4.1136860847473145,
"learning_rate": 9.584400884284545e-05,
"loss": 0.7109,
"step": 84
},
{
"epoch": 0.6952965235173824,
"grad_norm": 0.9980061650276184,
"learning_rate": 9.56513432464249e-05,
"loss": 0.6421,
"step": 85
},
{
"epoch": 0.7034764826175869,
"grad_norm": 1.1087136268615723,
"learning_rate": 9.545451511201364e-05,
"loss": 0.6337,
"step": 86
},
{
"epoch": 0.7116564417177914,
"grad_norm": 1.4353466033935547,
"learning_rate": 9.525354238661009e-05,
"loss": 1.0757,
"step": 87
},
{
"epoch": 0.7198364008179959,
"grad_norm": 1.1413462162017822,
"learning_rate": 9.504844339512095e-05,
"loss": 0.7056,
"step": 88
},
{
"epoch": 0.7280163599182005,
"grad_norm": 1.5157971382141113,
"learning_rate": 9.483923683869024e-05,
"loss": 0.8767,
"step": 89
},
{
"epoch": 0.7361963190184049,
"grad_norm": 0.999251663684845,
"learning_rate": 9.462594179299406e-05,
"loss": 0.922,
"step": 90
},
{
"epoch": 0.7443762781186094,
"grad_norm": 1.2393922805786133,
"learning_rate": 9.440857770650138e-05,
"loss": 0.8301,
"step": 91
},
{
"epoch": 0.7525562372188139,
"grad_norm": 1.1513807773590088,
"learning_rate": 9.418716439870057e-05,
"loss": 0.5308,
"step": 92
},
{
"epoch": 0.7607361963190185,
"grad_norm": 1.4229981899261475,
"learning_rate": 9.396172205829234e-05,
"loss": 1.1116,
"step": 93
},
{
"epoch": 0.7689161554192229,
"grad_norm": 1.4916852712631226,
"learning_rate": 9.373227124134888e-05,
"loss": 0.8806,
"step": 94
},
{
"epoch": 0.7770961145194274,
"grad_norm": 1.693434715270996,
"learning_rate": 9.34988328694395e-05,
"loss": 0.7924,
"step": 95
},
{
"epoch": 0.7852760736196319,
"grad_norm": 1.1455564498901367,
"learning_rate": 9.326142822772302e-05,
"loss": 0.7091,
"step": 96
},
{
"epoch": 0.7934560327198364,
"grad_norm": 1.031115174293518,
"learning_rate": 9.302007896300698e-05,
"loss": 0.8828,
"step": 97
},
{
"epoch": 0.8016359918200409,
"grad_norm": 1.5678527355194092,
"learning_rate": 9.27748070817738e-05,
"loss": 1.0531,
"step": 98
},
{
"epoch": 0.8098159509202454,
"grad_norm": 1.2558964490890503,
"learning_rate": 9.252563494817425e-05,
"loss": 0.9222,
"step": 99
},
{
"epoch": 0.8179959100204499,
"grad_norm": 1.2573111057281494,
"learning_rate": 9.227258528198831e-05,
"loss": 0.8988,
"step": 100
},
{
"epoch": 0.8261758691206544,
"grad_norm": 1.3229575157165527,
"learning_rate": 9.201568115655342e-05,
"loss": 1.0139,
"step": 101
},
{
"epoch": 0.8343558282208589,
"grad_norm": 0.913388729095459,
"learning_rate": 9.175494599666077e-05,
"loss": 0.6802,
"step": 102
},
{
"epoch": 0.8425357873210634,
"grad_norm": 3.0073134899139404,
"learning_rate": 9.149040357641929e-05,
"loss": 0.8834,
"step": 103
},
{
"epoch": 0.8507157464212679,
"grad_norm": 0.9436314105987549,
"learning_rate": 9.122207801708802e-05,
"loss": 0.6559,
"step": 104
},
{
"epoch": 0.8588957055214724,
"grad_norm": 1.1913410425186157,
"learning_rate": 9.094999378487659e-05,
"loss": 0.7427,
"step": 105
},
{
"epoch": 0.8670756646216768,
"grad_norm": 1.03123140335083,
"learning_rate": 9.067417568871445e-05,
"loss": 0.6253,
"step": 106
},
{
"epoch": 0.8752556237218814,
"grad_norm": 0.9424878358840942,
"learning_rate": 9.03946488779887e-05,
"loss": 0.7601,
"step": 107
},
{
"epoch": 0.8834355828220859,
"grad_norm": 1.2202138900756836,
"learning_rate": 9.011143884025101e-05,
"loss": 1.024,
"step": 108
},
{
"epoch": 0.8916155419222904,
"grad_norm": 1.3849170207977295,
"learning_rate": 8.982457139889357e-05,
"loss": 0.8283,
"step": 109
},
{
"epoch": 0.8997955010224948,
"grad_norm": 1.2288554906845093,
"learning_rate": 8.953407271079455e-05,
"loss": 0.8297,
"step": 110
},
{
"epoch": 0.9079754601226994,
"grad_norm": 0.9206739664077759,
"learning_rate": 8.923996926393305e-05,
"loss": 0.5966,
"step": 111
},
{
"epoch": 0.9161554192229039,
"grad_norm": 0.9282044172286987,
"learning_rate": 8.894228787497389e-05,
"loss": 0.7775,
"step": 112
},
{
"epoch": 0.9243353783231084,
"grad_norm": 1.2421815395355225,
"learning_rate": 8.864105568682244e-05,
"loss": 0.7838,
"step": 113
},
{
"epoch": 0.9325153374233128,
"grad_norm": 1.1172124147415161,
"learning_rate": 8.833630016614976e-05,
"loss": 0.5921,
"step": 114
},
{
"epoch": 0.9406952965235174,
"grad_norm": 1.310473918914795,
"learning_rate": 8.802804910088809e-05,
"loss": 1.0578,
"step": 115
},
{
"epoch": 0.9488752556237219,
"grad_norm": 1.1656848192214966,
"learning_rate": 8.771633059769711e-05,
"loss": 0.8205,
"step": 116
},
{
"epoch": 0.9570552147239264,
"grad_norm": 1.0159672498703003,
"learning_rate": 8.740117307940123e-05,
"loss": 0.8237,
"step": 117
},
{
"epoch": 0.9652351738241309,
"grad_norm": 1.2568827867507935,
"learning_rate": 8.708260528239788e-05,
"loss": 1.0473,
"step": 118
},
{
"epoch": 0.9734151329243353,
"grad_norm": 1.2711869478225708,
"learning_rate": 8.676065625403733e-05,
"loss": 1.0789,
"step": 119
},
{
"epoch": 0.9815950920245399,
"grad_norm": 1.2562803030014038,
"learning_rate": 8.64353553499741e-05,
"loss": 0.5315,
"step": 120
},
{
"epoch": 0.9897750511247444,
"grad_norm": 1.4107680320739746,
"learning_rate": 8.610673223149034e-05,
"loss": 0.9738,
"step": 121
},
{
"epoch": 0.9979550102249489,
"grad_norm": 1.0771377086639404,
"learning_rate": 8.577481686279123e-05,
"loss": 0.6114,
"step": 122
},
{
"epoch": 1.0,
"grad_norm": 0.6168379187583923,
"learning_rate": 8.543963950827279e-05,
"loss": 0.1857,
"step": 123
},
{
"epoch": 1.0081799591002045,
"grad_norm": 1.0099263191223145,
"learning_rate": 8.510123072976239e-05,
"loss": 0.77,
"step": 124
},
{
"epoch": 1.016359918200409,
"grad_norm": 1.0608845949172974,
"learning_rate": 8.475962138373213e-05,
"loss": 0.588,
"step": 125
},
{
"epoch": 1.0245398773006136,
"grad_norm": 1.2696729898452759,
"learning_rate": 8.441484261848514e-05,
"loss": 0.8879,
"step": 126
},
{
"epoch": 1.032719836400818,
"grad_norm": 1.556289792060852,
"learning_rate": 8.406692587131568e-05,
"loss": 1.1292,
"step": 127
},
{
"epoch": 1.0408997955010224,
"grad_norm": 1.027779459953308,
"learning_rate": 8.371590286564247e-05,
"loss": 0.6589,
"step": 128
},
{
"epoch": 1.049079754601227,
"grad_norm": 1.1132937669754028,
"learning_rate": 8.336180560811619e-05,
"loss": 0.6267,
"step": 129
},
{
"epoch": 1.0572597137014315,
"grad_norm": 1.7684708833694458,
"learning_rate": 8.30046663857011e-05,
"loss": 0.7399,
"step": 130
},
{
"epoch": 1.065439672801636,
"grad_norm": 1.372917890548706,
"learning_rate": 8.264451776273104e-05,
"loss": 0.8849,
"step": 131
},
{
"epoch": 1.0736196319018405,
"grad_norm": 1.1678389310836792,
"learning_rate": 8.228139257794012e-05,
"loss": 0.901,
"step": 132
},
{
"epoch": 1.081799591002045,
"grad_norm": 1.0002321004867554,
"learning_rate": 8.191532394146865e-05,
"loss": 0.3923,
"step": 133
},
{
"epoch": 1.0899795501022496,
"grad_norm": 1.0493892431259155,
"learning_rate": 8.154634523184388e-05,
"loss": 0.77,
"step": 134
},
{
"epoch": 1.098159509202454,
"grad_norm": 1.4020309448242188,
"learning_rate": 8.117449009293668e-05,
"loss": 0.7849,
"step": 135
},
{
"epoch": 1.1063394683026584,
"grad_norm": 1.3133962154388428,
"learning_rate": 8.07997924308938e-05,
"loss": 0.7912,
"step": 136
},
{
"epoch": 1.114519427402863,
"grad_norm": 1.3940467834472656,
"learning_rate": 8.042228641104622e-05,
"loss": 0.7142,
"step": 137
},
{
"epoch": 1.1226993865030674,
"grad_norm": 1.3877304792404175,
"learning_rate": 8.004200645479403e-05,
"loss": 0.5454,
"step": 138
},
{
"epoch": 1.130879345603272,
"grad_norm": 1.1132571697235107,
"learning_rate": 7.965898723646776e-05,
"loss": 0.7387,
"step": 139
},
{
"epoch": 1.1390593047034765,
"grad_norm": 1.206382155418396,
"learning_rate": 7.927326368016677e-05,
"loss": 0.7271,
"step": 140
},
{
"epoch": 1.147239263803681,
"grad_norm": 1.2813421487808228,
"learning_rate": 7.888487095657484e-05,
"loss": 0.8452,
"step": 141
},
{
"epoch": 1.1554192229038855,
"grad_norm": 1.1103274822235107,
"learning_rate": 7.849384447975321e-05,
"loss": 0.5735,
"step": 142
},
{
"epoch": 1.16359918200409,
"grad_norm": 1.1904572248458862,
"learning_rate": 7.810021990391164e-05,
"loss": 0.486,
"step": 143
},
{
"epoch": 1.1717791411042944,
"grad_norm": 1.361222743988037,
"learning_rate": 7.770403312015721e-05,
"loss": 0.9265,
"step": 144
},
{
"epoch": 1.179959100204499,
"grad_norm": 1.1453652381896973,
"learning_rate": 7.73053202532219e-05,
"loss": 0.6186,
"step": 145
},
{
"epoch": 1.1881390593047034,
"grad_norm": 1.2070741653442383,
"learning_rate": 7.690411765816864e-05,
"loss": 0.7012,
"step": 146
},
{
"epoch": 1.196319018404908,
"grad_norm": 1.4246371984481812,
"learning_rate": 7.650046191707641e-05,
"loss": 0.7644,
"step": 147
},
{
"epoch": 1.2044989775051125,
"grad_norm": 1.2275187969207764,
"learning_rate": 7.60943898357046e-05,
"loss": 0.614,
"step": 148
},
{
"epoch": 1.212678936605317,
"grad_norm": 1.292330265045166,
"learning_rate": 7.568593844013718e-05,
"loss": 0.6722,
"step": 149
},
{
"epoch": 1.2208588957055215,
"grad_norm": 1.54197359085083,
"learning_rate": 7.527514497340642e-05,
"loss": 0.6981,
"step": 150
},
{
"epoch": 1.229038854805726,
"grad_norm": 1.605914831161499,
"learning_rate": 7.48620468920972e-05,
"loss": 0.7524,
"step": 151
},
{
"epoch": 1.2372188139059306,
"grad_norm": 1.2442176342010498,
"learning_rate": 7.444668186293153e-05,
"loss": 0.6238,
"step": 152
},
{
"epoch": 1.2453987730061349,
"grad_norm": 1.4838721752166748,
"learning_rate": 7.402908775933419e-05,
"loss": 0.7599,
"step": 153
},
{
"epoch": 1.2535787321063394,
"grad_norm": 1.8454984426498413,
"learning_rate": 7.360930265797935e-05,
"loss": 1.1331,
"step": 154
},
{
"epoch": 1.261758691206544,
"grad_norm": 1.3571646213531494,
"learning_rate": 7.31873648353186e-05,
"loss": 0.6468,
"step": 155
},
{
"epoch": 1.2699386503067485,
"grad_norm": 1.3795866966247559,
"learning_rate": 7.276331276409106e-05,
"loss": 0.7253,
"step": 156
},
{
"epoch": 1.278118609406953,
"grad_norm": 1.4821308851242065,
"learning_rate": 7.23371851098152e-05,
"loss": 0.842,
"step": 157
},
{
"epoch": 1.2862985685071575,
"grad_norm": 1.0921138525009155,
"learning_rate": 7.190902072726335e-05,
"loss": 0.5379,
"step": 158
},
{
"epoch": 1.294478527607362,
"grad_norm": 1.5662935972213745,
"learning_rate": 7.147885865691899e-05,
"loss": 0.918,
"step": 159
},
{
"epoch": 1.3026584867075663,
"grad_norm": 1.3333555459976196,
"learning_rate": 7.104673812141675e-05,
"loss": 0.6727,
"step": 160
},
{
"epoch": 1.310838445807771,
"grad_norm": 1.1487497091293335,
"learning_rate": 7.061269852196632e-05,
"loss": 0.4279,
"step": 161
},
{
"epoch": 1.3190184049079754,
"grad_norm": 1.1033565998077393,
"learning_rate": 7.017677943475961e-05,
"loss": 0.6372,
"step": 162
},
{
"epoch": 1.32719836400818,
"grad_norm": 1.2100588083267212,
"learning_rate": 6.973902060736226e-05,
"loss": 0.7071,
"step": 163
},
{
"epoch": 1.3353783231083844,
"grad_norm": 1.421066403388977,
"learning_rate": 6.929946195508932e-05,
"loss": 0.767,
"step": 164
},
{
"epoch": 1.343558282208589,
"grad_norm": 1.2306902408599854,
"learning_rate": 6.885814355736586e-05,
"loss": 0.5587,
"step": 165
},
{
"epoch": 1.3517382413087935,
"grad_norm": 1.5315287113189697,
"learning_rate": 6.841510565407235e-05,
"loss": 0.7519,
"step": 166
},
{
"epoch": 1.359918200408998,
"grad_norm": 1.2497670650482178,
"learning_rate": 6.797038864187564e-05,
"loss": 0.5612,
"step": 167
},
{
"epoch": 1.3680981595092025,
"grad_norm": 1.6106078624725342,
"learning_rate": 6.752403307054549e-05,
"loss": 0.7194,
"step": 168
},
{
"epoch": 1.3762781186094069,
"grad_norm": 1.2407530546188354,
"learning_rate": 6.707607963925724e-05,
"loss": 0.531,
"step": 169
},
{
"epoch": 1.3844580777096114,
"grad_norm": 1.663898229598999,
"learning_rate": 6.66265691928808e-05,
"loss": 0.7906,
"step": 170
},
{
"epoch": 1.392638036809816,
"grad_norm": 1.3650121688842773,
"learning_rate": 6.617554271825636e-05,
"loss": 0.7207,
"step": 171
},
{
"epoch": 1.4008179959100204,
"grad_norm": 1.1001300811767578,
"learning_rate": 6.572304134045717e-05,
"loss": 0.5145,
"step": 172
},
{
"epoch": 1.408997955010225,
"grad_norm": 1.0687707662582397,
"learning_rate": 6.526910631903973e-05,
"loss": 0.3521,
"step": 173
},
{
"epoch": 1.4171779141104295,
"grad_norm": 1.2442213296890259,
"learning_rate": 6.481377904428171e-05,
"loss": 0.7026,
"step": 174
},
{
"epoch": 1.425357873210634,
"grad_norm": 1.31452214717865,
"learning_rate": 6.435710103340786e-05,
"loss": 0.7313,
"step": 175
},
{
"epoch": 1.4335378323108383,
"grad_norm": 1.5573769807815552,
"learning_rate": 6.389911392680456e-05,
"loss": 0.7659,
"step": 176
},
{
"epoch": 1.441717791411043,
"grad_norm": 1.2089431285858154,
"learning_rate": 6.343985948422287e-05,
"loss": 0.6916,
"step": 177
},
{
"epoch": 1.4498977505112474,
"grad_norm": 1.5785194635391235,
"learning_rate": 6.297937958097094e-05,
"loss": 0.8101,
"step": 178
},
{
"epoch": 1.4580777096114519,
"grad_norm": 1.4134269952774048,
"learning_rate": 6.251771620409563e-05,
"loss": 0.7504,
"step": 179
},
{
"epoch": 1.4662576687116564,
"grad_norm": 1.4751485586166382,
"learning_rate": 6.205491144855432e-05,
"loss": 0.5948,
"step": 180
},
{
"epoch": 1.474437627811861,
"grad_norm": 1.31548273563385,
"learning_rate": 6.159100751337642e-05,
"loss": 0.7057,
"step": 181
},
{
"epoch": 1.4826175869120655,
"grad_norm": 1.8151648044586182,
"learning_rate": 6.112604669781572e-05,
"loss": 0.9037,
"step": 182
},
{
"epoch": 1.49079754601227,
"grad_norm": 1.3681972026824951,
"learning_rate": 6.0660071397493514e-05,
"loss": 0.7223,
"step": 183
},
{
"epoch": 1.4989775051124745,
"grad_norm": 1.6292760372161865,
"learning_rate": 6.019312410053286e-05,
"loss": 0.6083,
"step": 184
},
{
"epoch": 1.5071574642126788,
"grad_norm": 1.8144514560699463,
"learning_rate": 5.972524738368452e-05,
"loss": 0.7662,
"step": 185
},
{
"epoch": 1.5153374233128836,
"grad_norm": 1.650654911994934,
"learning_rate": 5.925648390844476e-05,
"loss": 0.902,
"step": 186
},
{
"epoch": 1.5235173824130879,
"grad_norm": 1.4780257940292358,
"learning_rate": 5.878687641716538e-05,
"loss": 0.6566,
"step": 187
},
{
"epoch": 1.5316973415132924,
"grad_norm": 1.1706862449645996,
"learning_rate": 5.831646772915651e-05,
"loss": 0.4189,
"step": 188
},
{
"epoch": 1.539877300613497,
"grad_norm": 1.287718653678894,
"learning_rate": 5.7845300736782204e-05,
"loss": 0.5549,
"step": 189
},
{
"epoch": 1.5480572597137015,
"grad_norm": 1.3776918649673462,
"learning_rate": 5.737341840154956e-05,
"loss": 0.5456,
"step": 190
},
{
"epoch": 1.556237218813906,
"grad_norm": 1.2569301128387451,
"learning_rate": 5.6900863750191347e-05,
"loss": 0.6808,
"step": 191
},
{
"epoch": 1.5644171779141103,
"grad_norm": 1.7013508081436157,
"learning_rate": 5.642767987074288e-05,
"loss": 0.7974,
"step": 192
},
{
"epoch": 1.572597137014315,
"grad_norm": 1.5190353393554688,
"learning_rate": 5.5953909908613114e-05,
"loss": 0.5416,
"step": 193
},
{
"epoch": 1.5807770961145193,
"grad_norm": 1.4736334085464478,
"learning_rate": 5.547959706265068e-05,
"loss": 0.6788,
"step": 194
},
{
"epoch": 1.588957055214724,
"grad_norm": 2.006303548812866,
"learning_rate": 5.5004784581204927e-05,
"loss": 0.9634,
"step": 195
},
{
"epoch": 1.5971370143149284,
"grad_norm": 1.3578423261642456,
"learning_rate": 5.4529515758182506e-05,
"loss": 0.6563,
"step": 196
},
{
"epoch": 1.605316973415133,
"grad_norm": 1.4116990566253662,
"learning_rate": 5.405383392909973e-05,
"loss": 0.6062,
"step": 197
},
{
"epoch": 1.6134969325153374,
"grad_norm": 1.2362536191940308,
"learning_rate": 5.357778246713131e-05,
"loss": 0.4829,
"step": 198
},
{
"epoch": 1.621676891615542,
"grad_norm": 1.1780372858047485,
"learning_rate": 5.310140477915544e-05,
"loss": 0.465,
"step": 199
},
{
"epoch": 1.6298568507157465,
"grad_norm": 1.3919291496276855,
"learning_rate": 5.262474430179597e-05,
"loss": 0.6967,
"step": 200
},
{
"epoch": 1.6380368098159508,
"grad_norm": 1.7452629804611206,
"learning_rate": 5.214784449746174e-05,
"loss": 0.9096,
"step": 201
},
{
"epoch": 1.6462167689161555,
"grad_norm": 1.4730846881866455,
"learning_rate": 5.167074885038373e-05,
"loss": 0.7473,
"step": 202
},
{
"epoch": 1.6543967280163598,
"grad_norm": 1.5404870510101318,
"learning_rate": 5.119350086265004e-05,
"loss": 0.6233,
"step": 203
},
{
"epoch": 1.6625766871165644,
"grad_norm": 1.4780898094177246,
"learning_rate": 5.0716144050239375e-05,
"loss": 0.7599,
"step": 204
},
{
"epoch": 1.670756646216769,
"grad_norm": 1.194542407989502,
"learning_rate": 5.023872193905316e-05,
"loss": 0.5638,
"step": 205
},
{
"epoch": 1.6789366053169734,
"grad_norm": 1.3504347801208496,
"learning_rate": 4.976127806094684e-05,
"loss": 0.4701,
"step": 206
},
{
"epoch": 1.687116564417178,
"grad_norm": 2.1446099281311035,
"learning_rate": 4.928385594976063e-05,
"loss": 0.9383,
"step": 207
},
{
"epoch": 1.6952965235173822,
"grad_norm": 1.4745142459869385,
"learning_rate": 4.880649913734996e-05,
"loss": 0.5817,
"step": 208
},
{
"epoch": 1.703476482617587,
"grad_norm": 1.3029444217681885,
"learning_rate": 4.832925114961629e-05,
"loss": 0.3481,
"step": 209
},
{
"epoch": 1.7116564417177913,
"grad_norm": 1.1414580345153809,
"learning_rate": 4.785215550253826e-05,
"loss": 0.4348,
"step": 210
},
{
"epoch": 1.719836400817996,
"grad_norm": 1.4996669292449951,
"learning_rate": 4.7375255698204045e-05,
"loss": 0.653,
"step": 211
},
{
"epoch": 1.7280163599182004,
"grad_norm": 1.66719388961792,
"learning_rate": 4.6898595220844574e-05,
"loss": 0.7181,
"step": 212
},
{
"epoch": 1.7361963190184049,
"grad_norm": 1.476560354232788,
"learning_rate": 4.64222175328687e-05,
"loss": 0.6183,
"step": 213
},
{
"epoch": 1.7443762781186094,
"grad_norm": 1.7405219078063965,
"learning_rate": 4.594616607090028e-05,
"loss": 0.689,
"step": 214
},
{
"epoch": 1.752556237218814,
"grad_norm": 1.1866732835769653,
"learning_rate": 4.547048424181751e-05,
"loss": 0.4616,
"step": 215
},
{
"epoch": 1.7607361963190185,
"grad_norm": 1.7068077325820923,
"learning_rate": 4.4995215418795085e-05,
"loss": 0.6318,
"step": 216
},
{
"epoch": 1.7689161554192228,
"grad_norm": 1.4736443758010864,
"learning_rate": 4.452040293734934e-05,
"loss": 0.4611,
"step": 217
},
{
"epoch": 1.7770961145194275,
"grad_norm": 0.8084559440612793,
"learning_rate": 4.404609009138689e-05,
"loss": 0.1962,
"step": 218
},
{
"epoch": 1.7852760736196318,
"grad_norm": 1.1126220226287842,
"learning_rate": 4.357232012925714e-05,
"loss": 0.3804,
"step": 219
},
{
"epoch": 1.7934560327198366,
"grad_norm": 1.4977810382843018,
"learning_rate": 4.3099136249808665e-05,
"loss": 0.5431,
"step": 220
},
{
"epoch": 1.8016359918200409,
"grad_norm": 1.47788405418396,
"learning_rate": 4.262658159845046e-05,
"loss": 0.6498,
"step": 221
},
{
"epoch": 1.8098159509202454,
"grad_norm": 1.2339309453964233,
"learning_rate": 4.215469926321779e-05,
"loss": 0.4812,
"step": 222
},
{
"epoch": 1.81799591002045,
"grad_norm": 1.4342414140701294,
"learning_rate": 4.1683532270843504e-05,
"loss": 0.5703,
"step": 223
},
{
"epoch": 1.8261758691206544,
"grad_norm": 1.795954942703247,
"learning_rate": 4.121312358283463e-05,
"loss": 0.992,
"step": 224
},
{
"epoch": 1.834355828220859,
"grad_norm": 1.3253310918807983,
"learning_rate": 4.074351609155527e-05,
"loss": 0.5907,
"step": 225
},
{
"epoch": 1.8425357873210633,
"grad_norm": 1.4935518503189087,
"learning_rate": 4.027475261631548e-05,
"loss": 0.7448,
"step": 226
},
{
"epoch": 1.850715746421268,
"grad_norm": 1.8565064668655396,
"learning_rate": 3.980687589946715e-05,
"loss": 0.8506,
"step": 227
},
{
"epoch": 1.8588957055214723,
"grad_norm": 2.0860605239868164,
"learning_rate": 3.9339928602506505e-05,
"loss": 0.4935,
"step": 228
},
{
"epoch": 1.8670756646216768,
"grad_norm": 1.6324408054351807,
"learning_rate": 3.887395330218429e-05,
"loss": 0.586,
"step": 229
},
{
"epoch": 1.8752556237218814,
"grad_norm": 1.8466233015060425,
"learning_rate": 3.840899248662358e-05,
"loss": 0.7596,
"step": 230
},
{
"epoch": 1.883435582822086,
"grad_norm": 1.867876648902893,
"learning_rate": 3.7945088551445693e-05,
"loss": 0.7946,
"step": 231
},
{
"epoch": 1.8916155419222904,
"grad_norm": 1.3713316917419434,
"learning_rate": 3.748228379590438e-05,
"loss": 0.5414,
"step": 232
},
{
"epoch": 1.8997955010224947,
"grad_norm": 1.6689939498901367,
"learning_rate": 3.7020620419029094e-05,
"loss": 0.6574,
"step": 233
},
{
"epoch": 1.9079754601226995,
"grad_norm": 1.4076114892959595,
"learning_rate": 3.656014051577713e-05,
"loss": 0.5052,
"step": 234
},
{
"epoch": 1.9161554192229038,
"grad_norm": 1.6957688331604004,
"learning_rate": 3.610088607319544e-05,
"loss": 0.4209,
"step": 235
},
{
"epoch": 1.9243353783231085,
"grad_norm": 1.465134620666504,
"learning_rate": 3.564289896659214e-05,
"loss": 0.562,
"step": 236
},
{
"epoch": 1.9325153374233128,
"grad_norm": 1.626769781112671,
"learning_rate": 3.5186220955718306e-05,
"loss": 0.6494,
"step": 237
},
{
"epoch": 1.9406952965235174,
"grad_norm": 1.2987111806869507,
"learning_rate": 3.473089368096026e-05,
"loss": 0.5365,
"step": 238
},
{
"epoch": 1.9488752556237219,
"grad_norm": 1.7133764028549194,
"learning_rate": 3.427695865954284e-05,
"loss": 0.7972,
"step": 239
},
{
"epoch": 1.9570552147239264,
"grad_norm": 1.067958116531372,
"learning_rate": 3.3824457281743646e-05,
"loss": 0.2413,
"step": 240
},
{
"epoch": 1.965235173824131,
"grad_norm": 1.5035715103149414,
"learning_rate": 3.337343080711921e-05,
"loss": 0.655,
"step": 241
},
{
"epoch": 1.9734151329243352,
"grad_norm": 1.9790688753128052,
"learning_rate": 3.2923920360742774e-05,
"loss": 0.7517,
"step": 242
},
{
"epoch": 1.98159509202454,
"grad_norm": 1.79633367061615,
"learning_rate": 3.2475966929454504e-05,
"loss": 0.527,
"step": 243
},
{
"epoch": 1.9897750511247443,
"grad_norm": 1.59013032913208,
"learning_rate": 3.202961135812437e-05,
"loss": 0.5922,
"step": 244
},
{
"epoch": 1.997955010224949,
"grad_norm": 1.6466726064682007,
"learning_rate": 3.158489434592766e-05,
"loss": 0.6738,
"step": 245
},
{
"epoch": 2.0,
"grad_norm": 0.8072463274002075,
"learning_rate": 3.114185644263415e-05,
"loss": 0.1228,
"step": 246
},
{
"epoch": 2.0081799591002043,
"grad_norm": 1.412455439567566,
"learning_rate": 3.070053804491068e-05,
"loss": 0.5372,
"step": 247
},
{
"epoch": 2.016359918200409,
"grad_norm": 1.15187406539917,
"learning_rate": 3.026097939263775e-05,
"loss": 0.3056,
"step": 248
},
{
"epoch": 2.0245398773006134,
"grad_norm": 1.3967753648757935,
"learning_rate": 2.9823220565240394e-05,
"loss": 0.5469,
"step": 249
},
{
"epoch": 2.032719836400818,
"grad_norm": 1.5053709745407104,
"learning_rate": 2.938730147803369e-05,
"loss": 0.5333,
"step": 250
},
{
"epoch": 2.0408997955010224,
"grad_norm": 1.2719367742538452,
"learning_rate": 2.895326187858326e-05,
"loss": 0.4873,
"step": 251
},
{
"epoch": 2.049079754601227,
"grad_norm": 1.3638321161270142,
"learning_rate": 2.852114134308104e-05,
"loss": 0.4676,
"step": 252
},
{
"epoch": 2.0572597137014315,
"grad_norm": 1.4079426527023315,
"learning_rate": 2.8090979272736662e-05,
"loss": 0.5474,
"step": 253
},
{
"epoch": 2.065439672801636,
"grad_norm": 1.3648539781570435,
"learning_rate": 2.7662814890184818e-05,
"loss": 0.3774,
"step": 254
},
{
"epoch": 2.0736196319018405,
"grad_norm": 1.411365032196045,
"learning_rate": 2.7236687235908953e-05,
"loss": 0.5021,
"step": 255
},
{
"epoch": 2.081799591002045,
"grad_norm": 0.8350080251693726,
"learning_rate": 2.6812635164681386e-05,
"loss": 0.295,
"step": 256
},
{
"epoch": 2.0899795501022496,
"grad_norm": 1.4837121963500977,
"learning_rate": 2.6390697342020665e-05,
"loss": 0.4359,
"step": 257
},
{
"epoch": 2.098159509202454,
"grad_norm": 1.447041392326355,
"learning_rate": 2.5970912240665813e-05,
"loss": 0.4699,
"step": 258
},
{
"epoch": 2.1063394683026586,
"grad_norm": 1.5087660551071167,
"learning_rate": 2.555331813706847e-05,
"loss": 0.5016,
"step": 259
},
{
"epoch": 2.114519427402863,
"grad_norm": 1.4970585107803345,
"learning_rate": 2.5137953107902813e-05,
"loss": 0.4827,
"step": 260
},
{
"epoch": 2.1226993865030677,
"grad_norm": 1.5823018550872803,
"learning_rate": 2.472485502659358e-05,
"loss": 0.3951,
"step": 261
},
{
"epoch": 2.130879345603272,
"grad_norm": 1.208630919456482,
"learning_rate": 2.4314061559862833e-05,
"loss": 0.3384,
"step": 262
},
{
"epoch": 2.1390593047034763,
"grad_norm": 1.6956486701965332,
"learning_rate": 2.3905610164295394e-05,
"loss": 0.4982,
"step": 263
},
{
"epoch": 2.147239263803681,
"grad_norm": 1.4397342205047607,
"learning_rate": 2.3499538082923606e-05,
"loss": 0.4574,
"step": 264
},
{
"epoch": 2.1554192229038853,
"grad_norm": 1.3102678060531616,
"learning_rate": 2.3095882341831372e-05,
"loss": 0.3559,
"step": 265
},
{
"epoch": 2.16359918200409,
"grad_norm": 1.2937331199645996,
"learning_rate": 2.2694679746778115e-05,
"loss": 0.3721,
"step": 266
},
{
"epoch": 2.1717791411042944,
"grad_norm": 1.5506526231765747,
"learning_rate": 2.22959668798428e-05,
"loss": 0.4909,
"step": 267
},
{
"epoch": 2.179959100204499,
"grad_norm": 1.8627556562423706,
"learning_rate": 2.1899780096088375e-05,
"loss": 0.7858,
"step": 268
},
{
"epoch": 2.1881390593047034,
"grad_norm": 1.7848111391067505,
"learning_rate": 2.1506155520246797e-05,
"loss": 0.6337,
"step": 269
},
{
"epoch": 2.196319018404908,
"grad_norm": 1.2659337520599365,
"learning_rate": 2.1115129043425187e-05,
"loss": 0.2693,
"step": 270
},
{
"epoch": 2.2044989775051125,
"grad_norm": 1.6412032842636108,
"learning_rate": 2.0726736319833228e-05,
"loss": 0.5306,
"step": 271
},
{
"epoch": 2.212678936605317,
"grad_norm": 1.611624002456665,
"learning_rate": 2.0341012763532243e-05,
"loss": 0.3388,
"step": 272
},
{
"epoch": 2.2208588957055215,
"grad_norm": 1.1925326585769653,
"learning_rate": 1.995799354520598e-05,
"loss": 0.3615,
"step": 273
},
{
"epoch": 2.229038854805726,
"grad_norm": 1.7512476444244385,
"learning_rate": 1.9577713588953795e-05,
"loss": 0.5129,
"step": 274
},
{
"epoch": 2.2372188139059306,
"grad_norm": 1.5006930828094482,
"learning_rate": 1.9200207569106216e-05,
"loss": 0.4129,
"step": 275
},
{
"epoch": 2.245398773006135,
"grad_norm": 1.8680585622787476,
"learning_rate": 1.8825509907063327e-05,
"loss": 0.5374,
"step": 276
},
{
"epoch": 2.2535787321063396,
"grad_norm": 1.8856024742126465,
"learning_rate": 1.8453654768156138e-05,
"loss": 0.562,
"step": 277
},
{
"epoch": 2.261758691206544,
"grad_norm": 1.9243358373641968,
"learning_rate": 1.8084676058531373e-05,
"loss": 0.6637,
"step": 278
},
{
"epoch": 2.2699386503067487,
"grad_norm": 2.3150854110717773,
"learning_rate": 1.771860742205988e-05,
"loss": 0.5932,
"step": 279
},
{
"epoch": 2.278118609406953,
"grad_norm": 1.2950345277786255,
"learning_rate": 1.7355482237268983e-05,
"loss": 0.341,
"step": 280
},
{
"epoch": 2.2862985685071573,
"grad_norm": 1.5685244798660278,
"learning_rate": 1.699533361429891e-05,
"loss": 0.4248,
"step": 281
},
{
"epoch": 2.294478527607362,
"grad_norm": 1.7234948873519897,
"learning_rate": 1.663819439188382e-05,
"loss": 0.7139,
"step": 282
},
{
"epoch": 2.3026584867075663,
"grad_norm": 1.5493229627609253,
"learning_rate": 1.6284097134357536e-05,
"loss": 0.4609,
"step": 283
},
{
"epoch": 2.310838445807771,
"grad_norm": 1.262978196144104,
"learning_rate": 1.5933074128684332e-05,
"loss": 0.3572,
"step": 284
},
{
"epoch": 2.3190184049079754,
"grad_norm": 1.7874940633773804,
"learning_rate": 1.5585157381514875e-05,
"loss": 0.5078,
"step": 285
},
{
"epoch": 2.32719836400818,
"grad_norm": 1.7057137489318848,
"learning_rate": 1.5240378616267886e-05,
"loss": 0.5262,
"step": 286
},
{
"epoch": 2.3353783231083844,
"grad_norm": 1.5174486637115479,
"learning_rate": 1.489876927023761e-05,
"loss": 0.4075,
"step": 287
},
{
"epoch": 2.3435582822085887,
"grad_norm": 1.473712682723999,
"learning_rate": 1.4560360491727231e-05,
"loss": 0.4237,
"step": 288
},
{
"epoch": 2.3517382413087935,
"grad_norm": 2.0275111198425293,
"learning_rate": 1.4225183137208776e-05,
"loss": 0.7344,
"step": 289
},
{
"epoch": 2.359918200408998,
"grad_norm": 1.5504990816116333,
"learning_rate": 1.389326776850966e-05,
"loss": 0.5226,
"step": 290
},
{
"epoch": 2.3680981595092025,
"grad_norm": 0.9763877987861633,
"learning_rate": 1.3564644650025893e-05,
"loss": 0.2004,
"step": 291
},
{
"epoch": 2.376278118609407,
"grad_norm": 1.6431723833084106,
"learning_rate": 1.3239343745962679e-05,
"loss": 0.5426,
"step": 292
},
{
"epoch": 2.3844580777096116,
"grad_norm": 1.4661204814910889,
"learning_rate": 1.2917394717602121e-05,
"loss": 0.3689,
"step": 293
},
{
"epoch": 2.392638036809816,
"grad_norm": 1.3995070457458496,
"learning_rate": 1.2598826920598772e-05,
"loss": 0.3994,
"step": 294
},
{
"epoch": 2.40081799591002,
"grad_norm": 1.6375926733016968,
"learning_rate": 1.2283669402302878e-05,
"loss": 0.4635,
"step": 295
},
{
"epoch": 2.408997955010225,
"grad_norm": 1.6579980850219727,
"learning_rate": 1.197195089911191e-05,
"loss": 0.44,
"step": 296
},
{
"epoch": 2.4171779141104293,
"grad_norm": 2.057859420776367,
"learning_rate": 1.1663699833850238e-05,
"loss": 0.809,
"step": 297
},
{
"epoch": 2.425357873210634,
"grad_norm": 1.9846243858337402,
"learning_rate": 1.1358944313177567e-05,
"loss": 0.526,
"step": 298
},
{
"epoch": 2.4335378323108383,
"grad_norm": 1.8454967737197876,
"learning_rate": 1.1057712125026116e-05,
"loss": 0.4943,
"step": 299
},
{
"epoch": 2.441717791411043,
"grad_norm": 1.3751471042633057,
"learning_rate": 1.0760030736066951e-05,
"loss": 0.2973,
"step": 300
},
{
"epoch": 2.4498977505112474,
"grad_norm": 1.7352081537246704,
"learning_rate": 1.0465927289205452e-05,
"loss": 0.4647,
"step": 301
},
{
"epoch": 2.458077709611452,
"grad_norm": 1.6583192348480225,
"learning_rate": 1.017542860110644e-05,
"loss": 0.5614,
"step": 302
},
{
"epoch": 2.4662576687116564,
"grad_norm": 1.1086567640304565,
"learning_rate": 9.888561159748993e-06,
"loss": 0.2343,
"step": 303
},
{
"epoch": 2.474437627811861,
"grad_norm": 1.2182183265686035,
"learning_rate": 9.605351122011309e-06,
"loss": 0.5084,
"step": 304
},
{
"epoch": 2.4826175869120655,
"grad_norm": 1.5897687673568726,
"learning_rate": 9.325824311285564e-06,
"loss": 0.4916,
"step": 305
},
{
"epoch": 2.4907975460122698,
"grad_norm": 1.7576637268066406,
"learning_rate": 9.050006215123419e-06,
"loss": 0.5896,
"step": 306
},
{
"epoch": 2.4989775051124745,
"grad_norm": 1.3375118970870972,
"learning_rate": 8.777921982911996e-06,
"loss": 0.3472,
"step": 307
},
{
"epoch": 2.507157464212679,
"grad_norm": 1.643762230873108,
"learning_rate": 8.509596423580712e-06,
"loss": 0.6561,
"step": 308
},
{
"epoch": 2.5153374233128836,
"grad_norm": 1.8207759857177734,
"learning_rate": 8.245054003339247e-06,
"loss": 0.446,
"step": 309
},
{
"epoch": 2.523517382413088,
"grad_norm": 1.7931218147277832,
"learning_rate": 7.984318843446593e-06,
"loss": 0.6626,
"step": 310
},
{
"epoch": 2.5316973415132926,
"grad_norm": 1.5871256589889526,
"learning_rate": 7.727414718011704e-06,
"loss": 0.6779,
"step": 311
},
{
"epoch": 2.539877300613497,
"grad_norm": 1.6045511960983276,
"learning_rate": 7.474365051825749e-06,
"loss": 0.4369,
"step": 312
},
{
"epoch": 2.5480572597137012,
"grad_norm": 1.9614039659500122,
"learning_rate": 7.225192918226214e-06,
"loss": 0.5339,
"step": 313
},
{
"epoch": 2.556237218813906,
"grad_norm": 1.6761356592178345,
"learning_rate": 6.979921036993042e-06,
"loss": 0.4714,
"step": 314
},
{
"epoch": 2.5644171779141103,
"grad_norm": 1.268598198890686,
"learning_rate": 6.738571772276997e-06,
"loss": 0.3589,
"step": 315
},
{
"epoch": 2.572597137014315,
"grad_norm": 1.9515974521636963,
"learning_rate": 6.501167130560515e-06,
"loss": 0.7677,
"step": 316
},
{
"epoch": 2.5807770961145193,
"grad_norm": 1.752503514289856,
"learning_rate": 6.267728758651132e-06,
"loss": 0.6019,
"step": 317
},
{
"epoch": 2.588957055214724,
"grad_norm": 1.6404023170471191,
"learning_rate": 6.03827794170767e-06,
"loss": 0.3813,
"step": 318
},
{
"epoch": 2.5971370143149284,
"grad_norm": 1.6431866884231567,
"learning_rate": 5.8128356012994375e-06,
"loss": 0.5397,
"step": 319
},
{
"epoch": 2.6053169734151327,
"grad_norm": 1.604200005531311,
"learning_rate": 5.591422293498633e-06,
"loss": 0.5326,
"step": 320
},
{
"epoch": 2.6134969325153374,
"grad_norm": 1.955712080001831,
"learning_rate": 5.374058207005944e-06,
"loss": 0.6279,
"step": 321
},
{
"epoch": 2.621676891615542,
"grad_norm": 1.9583613872528076,
"learning_rate": 5.160763161309767e-06,
"loss": 0.7064,
"step": 322
},
{
"epoch": 2.6298568507157465,
"grad_norm": 1.465756893157959,
"learning_rate": 4.951556604879048e-06,
"loss": 0.3176,
"step": 323
},
{
"epoch": 2.638036809815951,
"grad_norm": 1.0220084190368652,
"learning_rate": 4.746457613389904e-06,
"loss": 0.1989,
"step": 324
},
{
"epoch": 2.6462167689161555,
"grad_norm": 1.9900139570236206,
"learning_rate": 4.545484887986368e-06,
"loss": 0.4488,
"step": 325
},
{
"epoch": 2.65439672801636,
"grad_norm": 1.8389681577682495,
"learning_rate": 4.348656753575092e-06,
"loss": 0.8159,
"step": 326
},
{
"epoch": 2.662576687116564,
"grad_norm": 1.8046656847000122,
"learning_rate": 4.155991157154554e-06,
"loss": 0.5941,
"step": 327
},
{
"epoch": 2.670756646216769,
"grad_norm": 1.5946298837661743,
"learning_rate": 3.967505666178556e-06,
"loss": 0.6167,
"step": 328
},
{
"epoch": 2.6789366053169736,
"grad_norm": 1.6215424537658691,
"learning_rate": 3.783217466954503e-06,
"loss": 0.5432,
"step": 329
},
{
"epoch": 2.687116564417178,
"grad_norm": 1.5136370658874512,
"learning_rate": 3.603143363076217e-06,
"loss": 0.2688,
"step": 330
},
{
"epoch": 2.6952965235173822,
"grad_norm": 2.0225648880004883,
"learning_rate": 3.427299773891868e-06,
"loss": 0.3968,
"step": 331
},
{
"epoch": 2.703476482617587,
"grad_norm": 1.170069694519043,
"learning_rate": 3.2557027330067658e-06,
"loss": 0.3143,
"step": 332
},
{
"epoch": 2.7116564417177913,
"grad_norm": 1.2336766719818115,
"learning_rate": 3.0883678868214806e-06,
"loss": 0.4023,
"step": 333
},
{
"epoch": 2.719836400817996,
"grad_norm": 1.8785996437072754,
"learning_rate": 2.925310493105099e-06,
"loss": 0.6501,
"step": 334
},
{
"epoch": 2.7280163599182004,
"grad_norm": 1.7136589288711548,
"learning_rate": 2.7665454196040664e-06,
"loss": 0.3418,
"step": 335
},
{
"epoch": 2.736196319018405,
"grad_norm": 1.5453672409057617,
"learning_rate": 2.612087142686487e-06,
"loss": 0.4047,
"step": 336
},
{
"epoch": 2.7443762781186094,
"grad_norm": 1.5091831684112549,
"learning_rate": 2.4619497460222184e-06,
"loss": 0.3707,
"step": 337
},
{
"epoch": 2.7525562372188137,
"grad_norm": 1.996533751487732,
"learning_rate": 2.316146919298623e-06,
"loss": 0.7776,
"step": 338
},
{
"epoch": 2.7607361963190185,
"grad_norm": 2.2293291091918945,
"learning_rate": 2.1746919569723855e-06,
"loss": 0.7055,
"step": 339
},
{
"epoch": 2.7689161554192228,
"grad_norm": 1.906553864479065,
"learning_rate": 2.0375977570572967e-06,
"loss": 0.6423,
"step": 340
},
{
"epoch": 2.7770961145194275,
"grad_norm": 1.684910535812378,
"learning_rate": 1.9048768199481982e-06,
"loss": 0.5679,
"step": 341
},
{
"epoch": 2.785276073619632,
"grad_norm": 1.3118062019348145,
"learning_rate": 1.7765412472811771e-06,
"loss": 0.3036,
"step": 342
},
{
"epoch": 2.7934560327198366,
"grad_norm": 1.9178974628448486,
"learning_rate": 1.6526027408301226e-06,
"loss": 0.5829,
"step": 343
},
{
"epoch": 2.801635991820041,
"grad_norm": 1.860939860343933,
"learning_rate": 1.5330726014397668e-06,
"loss": 0.4617,
"step": 344
},
{
"epoch": 2.809815950920245,
"grad_norm": 1.7959818840026855,
"learning_rate": 1.417961727995254e-06,
"loss": 0.4604,
"step": 345
},
{
"epoch": 2.81799591002045,
"grad_norm": 1.414788842201233,
"learning_rate": 1.3072806164283358e-06,
"loss": 0.3398,
"step": 346
},
{
"epoch": 2.8261758691206547,
"grad_norm": 1.316179633140564,
"learning_rate": 1.2010393587603974e-06,
"loss": 0.3707,
"step": 347
},
{
"epoch": 2.834355828220859,
"grad_norm": 2.140214443206787,
"learning_rate": 1.099247642182205e-06,
"loss": 0.6991,
"step": 348
},
{
"epoch": 2.8425357873210633,
"grad_norm": 1.6871670484542847,
"learning_rate": 1.0019147481706625e-06,
"loss": 0.6069,
"step": 349
},
{
"epoch": 2.850715746421268,
"grad_norm": 1.7937754392623901,
"learning_rate": 9.090495516424713e-07,
"loss": 0.3841,
"step": 350
},
{
"epoch": 2.8588957055214723,
"grad_norm": 1.3567599058151245,
"learning_rate": 8.206605201449447e-07,
"loss": 0.3186,
"step": 351
},
{
"epoch": 2.8670756646216766,
"grad_norm": 1.0344539880752563,
"learning_rate": 7.36755713083892e-07,
"loss": 0.1676,
"step": 352
},
{
"epoch": 2.8752556237218814,
"grad_norm": 1.908477783203125,
"learning_rate": 6.573427809888067e-07,
"loss": 0.7295,
"step": 353
},
{
"epoch": 2.883435582822086,
"grad_norm": 2.14554500579834,
"learning_rate": 5.824289648152126e-07,
"loss": 0.8187,
"step": 354
},
{
"epoch": 2.8916155419222904,
"grad_norm": 1.6814268827438354,
"learning_rate": 5.120210952844872e-07,
"loss": 0.5205,
"step": 355
},
{
"epoch": 2.8997955010224947,
"grad_norm": 1.6498082876205444,
"learning_rate": 4.461255922609986e-07,
"loss": 0.4557,
"step": 356
},
{
"epoch": 2.9079754601226995,
"grad_norm": 1.4337708950042725,
"learning_rate": 3.8474846416672874e-07,
"loss": 0.3251,
"step": 357
},
{
"epoch": 2.9161554192229038,
"grad_norm": 1.875313401222229,
"learning_rate": 3.278953074334512e-07,
"loss": 0.5001,
"step": 358
},
{
"epoch": 2.9243353783231085,
"grad_norm": 1.350846529006958,
"learning_rate": 2.75571305992417e-07,
"loss": 0.2414,
"step": 359
},
{
"epoch": 2.932515337423313,
"grad_norm": 1.5978336334228516,
"learning_rate": 2.2778123080167135e-07,
"loss": 0.4585,
"step": 360
},
{
"epoch": 2.9406952965235176,
"grad_norm": 1.672541856765747,
"learning_rate": 1.8452943941106859e-07,
"loss": 0.5382,
"step": 361
},
{
"epoch": 2.948875255623722,
"grad_norm": 1.3987324237823486,
"learning_rate": 1.4581987556490095e-07,
"loss": 0.3326,
"step": 362
},
{
"epoch": 2.957055214723926,
"grad_norm": 1.4565430879592896,
"learning_rate": 1.1165606884234181e-07,
"loss": 0.4901,
"step": 363
},
{
"epoch": 2.965235173824131,
"grad_norm": 1.3861486911773682,
"learning_rate": 8.204113433559201e-08,
"loss": 0.2756,
"step": 364
},
{
"epoch": 2.9734151329243352,
"grad_norm": 1.4839295148849487,
"learning_rate": 5.697777236585711e-08,
"loss": 0.3303,
"step": 365
},
{
"epoch": 2.98159509202454,
"grad_norm": 1.6138904094696045,
"learning_rate": 3.6468268237105366e-08,
"loss": 0.558,
"step": 366
},
{
"epoch": 2.98159509202454,
"step": 366,
"total_flos": 3.142935032247091e+16,
"train_loss": 0.7803121163341843,
"train_runtime": 701.3998,
"train_samples_per_second": 4.183,
"train_steps_per_second": 0.522
}
],
"logging_steps": 1,
"max_steps": 366,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.142935032247091e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}