{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1.0,
"global_step": 246,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004073319755600814,
"grad_norm": 34.300819396972656,
"learning_rate": 8.130081300813008e-09,
"loss": 1.59619802236557,
"step": 1
},
{
"epoch": 0.008146639511201629,
"grad_norm": 30.720197677612305,
"learning_rate": 1.6260162601626016e-08,
"loss": 1.468272864818573,
"step": 2
},
{
"epoch": 0.012219959266802444,
"grad_norm": 30.16754722595215,
"learning_rate": 2.4390243902439023e-08,
"loss": 1.3843095302581787,
"step": 3
},
{
"epoch": 0.016293279022403257,
"grad_norm": 38.58047103881836,
"learning_rate": 3.252032520325203e-08,
"loss": 1.7031245231628418,
"step": 4
},
{
"epoch": 0.020366598778004074,
"grad_norm": 30.89760971069336,
"learning_rate": 4.065040650406504e-08,
"loss": 1.4844104647636414,
"step": 5
},
{
"epoch": 0.024439918533604887,
"grad_norm": 34.434993743896484,
"learning_rate": 4.878048780487805e-08,
"loss": 1.574910283088684,
"step": 6
},
{
"epoch": 0.028513238289205704,
"grad_norm": 32.540470123291016,
"learning_rate": 5.6910569105691055e-08,
"loss": 1.4606674909591675,
"step": 7
},
{
"epoch": 0.032586558044806514,
"grad_norm": 36.41299819946289,
"learning_rate": 6.504065040650406e-08,
"loss": 1.553576111793518,
"step": 8
},
{
"epoch": 0.03665987780040733,
"grad_norm": 34.50511932373047,
"learning_rate": 7.317073170731706e-08,
"loss": 1.3344553709030151,
"step": 9
},
{
"epoch": 0.04073319755600815,
"grad_norm": 27.898704528808594,
"learning_rate": 8.130081300813008e-08,
"loss": 1.3406395316123962,
"step": 10
},
{
"epoch": 0.04480651731160896,
"grad_norm": 29.29271125793457,
"learning_rate": 8.943089430894309e-08,
"loss": 1.4415303468704224,
"step": 11
},
{
"epoch": 0.048879837067209775,
"grad_norm": 28.2354736328125,
"learning_rate": 9.75609756097561e-08,
"loss": 1.2696096301078796,
"step": 12
},
{
"epoch": 0.05295315682281059,
"grad_norm": 35.44163131713867,
"learning_rate": 1.0569105691056911e-07,
"loss": 1.598312497138977,
"step": 13
},
{
"epoch": 0.05702647657841141,
"grad_norm": 26.94402313232422,
"learning_rate": 1.1382113821138211e-07,
"loss": 1.3497812747955322,
"step": 14
},
{
"epoch": 0.06109979633401222,
"grad_norm": 37.78248977661133,
"learning_rate": 1.219512195121951e-07,
"loss": 1.5689660906791687,
"step": 15
},
{
"epoch": 0.06517311608961303,
"grad_norm": 31.73078155517578,
"learning_rate": 1.3008130081300813e-07,
"loss": 1.525648295879364,
"step": 16
},
{
"epoch": 0.06924643584521385,
"grad_norm": 27.77250862121582,
"learning_rate": 1.3821138211382114e-07,
"loss": 1.304672360420227,
"step": 17
},
{
"epoch": 0.07331975560081466,
"grad_norm": 28.092498779296875,
"learning_rate": 1.4634146341463413e-07,
"loss": 1.346445381641388,
"step": 18
},
{
"epoch": 0.07739307535641547,
"grad_norm": 30.995866775512695,
"learning_rate": 1.5447154471544717e-07,
"loss": 1.447025179862976,
"step": 19
},
{
"epoch": 0.0814663951120163,
"grad_norm": 28.858421325683594,
"learning_rate": 1.6260162601626016e-07,
"loss": 1.3801668882369995,
"step": 20
},
{
"epoch": 0.0855397148676171,
"grad_norm": 31.91228485107422,
"learning_rate": 1.7073170731707317e-07,
"loss": 1.4577875137329102,
"step": 21
},
{
"epoch": 0.08961303462321792,
"grad_norm": 31.215259552001953,
"learning_rate": 1.7886178861788619e-07,
"loss": 1.4091373682022095,
"step": 22
},
{
"epoch": 0.09368635437881874,
"grad_norm": 30.24734115600586,
"learning_rate": 1.8699186991869917e-07,
"loss": 1.4649581909179688,
"step": 23
},
{
"epoch": 0.09775967413441955,
"grad_norm": 31.560291290283203,
"learning_rate": 1.951219512195122e-07,
"loss": 1.5308585166931152,
"step": 24
},
{
"epoch": 0.10183299389002037,
"grad_norm": 27.27391242980957,
"learning_rate": 2.032520325203252e-07,
"loss": 1.5144553780555725,
"step": 25
},
{
"epoch": 0.10590631364562118,
"grad_norm": 29.813785552978516,
"learning_rate": 2.1138211382113822e-07,
"loss": 1.519466757774353,
"step": 26
},
{
"epoch": 0.109979633401222,
"grad_norm": 24.201751708984375,
"learning_rate": 2.195121951219512e-07,
"loss": 1.3116011023521423,
"step": 27
},
{
"epoch": 0.11405295315682282,
"grad_norm": 27.95865249633789,
"learning_rate": 2.2764227642276422e-07,
"loss": 1.4637184143066406,
"step": 28
},
{
"epoch": 0.11812627291242363,
"grad_norm": 26.65915870666504,
"learning_rate": 2.3577235772357723e-07,
"loss": 1.4885194301605225,
"step": 29
},
{
"epoch": 0.12219959266802444,
"grad_norm": 27.386289596557617,
"learning_rate": 2.439024390243902e-07,
"loss": 1.3836334347724915,
"step": 30
},
{
"epoch": 0.12627291242362526,
"grad_norm": 25.87419319152832,
"learning_rate": 2.520325203252032e-07,
"loss": 1.3642336130142212,
"step": 31
},
{
"epoch": 0.13034623217922606,
"grad_norm": 26.620105743408203,
"learning_rate": 2.6016260162601625e-07,
"loss": 1.3461121916770935,
"step": 32
},
{
"epoch": 0.13441955193482688,
"grad_norm": 22.665058135986328,
"learning_rate": 2.682926829268293e-07,
"loss": 1.2577590942382812,
"step": 33
},
{
"epoch": 0.1384928716904277,
"grad_norm": 23.679920196533203,
"learning_rate": 2.764227642276423e-07,
"loss": 1.2572017908096313,
"step": 34
},
{
"epoch": 0.1425661914460285,
"grad_norm": 25.136371612548828,
"learning_rate": 2.8455284552845527e-07,
"loss": 1.2670851349830627,
"step": 35
},
{
"epoch": 0.14663951120162932,
"grad_norm": 21.567337036132812,
"learning_rate": 2.9268292682926825e-07,
"loss": 1.242683231830597,
"step": 36
},
{
"epoch": 0.15071283095723015,
"grad_norm": 20.61647605895996,
"learning_rate": 3.008130081300813e-07,
"loss": 1.279579222202301,
"step": 37
},
{
"epoch": 0.15478615071283094,
"grad_norm": 20.656513214111328,
"learning_rate": 3.0894308943089434e-07,
"loss": 1.2040475606918335,
"step": 38
},
{
"epoch": 0.15885947046843177,
"grad_norm": 22.86530876159668,
"learning_rate": 3.170731707317073e-07,
"loss": 1.2522715330123901,
"step": 39
},
{
"epoch": 0.1629327902240326,
"grad_norm": 20.22757911682129,
"learning_rate": 3.252032520325203e-07,
"loss": 1.2012774348258972,
"step": 40
},
{
"epoch": 0.1670061099796334,
"grad_norm": 23.09739875793457,
"learning_rate": 3.333333333333333e-07,
"loss": 1.2088268399238586,
"step": 41
},
{
"epoch": 0.1710794297352342,
"grad_norm": 22.845685958862305,
"learning_rate": 3.4146341463414634e-07,
"loss": 1.0982880592346191,
"step": 42
},
{
"epoch": 0.17515274949083504,
"grad_norm": 19.80814552307129,
"learning_rate": 3.4959349593495933e-07,
"loss": 1.1271469593048096,
"step": 43
},
{
"epoch": 0.17922606924643583,
"grad_norm": 20.553686141967773,
"learning_rate": 3.5772357723577237e-07,
"loss": 1.0008204579353333,
"step": 44
},
{
"epoch": 0.18329938900203666,
"grad_norm": 16.66282844543457,
"learning_rate": 3.6585365853658536e-07,
"loss": 0.9251897931098938,
"step": 45
},
{
"epoch": 0.18737270875763748,
"grad_norm": 15.797308921813965,
"learning_rate": 3.7398373983739835e-07,
"loss": 1.0191328525543213,
"step": 46
},
{
"epoch": 0.19144602851323828,
"grad_norm": 13.579208374023438,
"learning_rate": 3.821138211382114e-07,
"loss": 0.774791806936264,
"step": 47
},
{
"epoch": 0.1955193482688391,
"grad_norm": 14.556002616882324,
"learning_rate": 3.902439024390244e-07,
"loss": 1.0026790797710419,
"step": 48
},
{
"epoch": 0.19959266802443992,
"grad_norm": 14.489509582519531,
"learning_rate": 3.9837398373983736e-07,
"loss": 0.9430837631225586,
"step": 49
},
{
"epoch": 0.20366598778004075,
"grad_norm": 12.495223999023438,
"learning_rate": 4.065040650406504e-07,
"loss": 0.8999880254268646,
"step": 50
},
{
"epoch": 0.20773930753564154,
"grad_norm": 11.441575050354004,
"learning_rate": 4.146341463414634e-07,
"loss": 0.8320233225822449,
"step": 51
},
{
"epoch": 0.21181262729124237,
"grad_norm": 10.894216537475586,
"learning_rate": 4.2276422764227643e-07,
"loss": 0.8139239549636841,
"step": 52
},
{
"epoch": 0.2158859470468432,
"grad_norm": 10.404220581054688,
"learning_rate": 4.308943089430894e-07,
"loss": 0.8323288261890411,
"step": 53
},
{
"epoch": 0.219959266802444,
"grad_norm": 10.463072776794434,
"learning_rate": 4.390243902439024e-07,
"loss": 0.882573276758194,
"step": 54
},
{
"epoch": 0.2240325865580448,
"grad_norm": 10.669075012207031,
"learning_rate": 4.471544715447154e-07,
"loss": 0.749780923128128,
"step": 55
},
{
"epoch": 0.22810590631364563,
"grad_norm": 10.453638076782227,
"learning_rate": 4.5528455284552844e-07,
"loss": 0.7727148830890656,
"step": 56
},
{
"epoch": 0.23217922606924643,
"grad_norm": 11.427080154418945,
"learning_rate": 4.634146341463415e-07,
"loss": 0.8585084676742554,
"step": 57
},
{
"epoch": 0.23625254582484725,
"grad_norm": 8.558117866516113,
"learning_rate": 4.7154471544715447e-07,
"loss": 0.7314337491989136,
"step": 58
},
{
"epoch": 0.24032586558044808,
"grad_norm": 9.031648635864258,
"learning_rate": 4.796747967479675e-07,
"loss": 0.701579749584198,
"step": 59
},
{
"epoch": 0.24439918533604887,
"grad_norm": 8.817708969116211,
"learning_rate": 4.878048780487804e-07,
"loss": 0.7815204560756683,
"step": 60
},
{
"epoch": 0.2484725050916497,
"grad_norm": 8.00804615020752,
"learning_rate": 4.959349593495934e-07,
"loss": 0.655106246471405,
"step": 61
},
{
"epoch": 0.2525458248472505,
"grad_norm": 6.538842678070068,
"learning_rate": 5.040650406504064e-07,
"loss": 0.6697916388511658,
"step": 62
},
{
"epoch": 0.25661914460285135,
"grad_norm": 7.5446553230285645,
"learning_rate": 5.121951219512195e-07,
"loss": 0.7426944077014923,
"step": 63
},
{
"epoch": 0.2606924643584521,
"grad_norm": 6.402474403381348,
"learning_rate": 5.203252032520325e-07,
"loss": 0.6401277780532837,
"step": 64
},
{
"epoch": 0.26476578411405294,
"grad_norm": 7.257569313049316,
"learning_rate": 5.284552845528455e-07,
"loss": 0.6731106042861938,
"step": 65
},
{
"epoch": 0.26883910386965376,
"grad_norm": 6.263636589050293,
"learning_rate": 5.365853658536586e-07,
"loss": 0.5806022882461548,
"step": 66
},
{
"epoch": 0.2729124236252546,
"grad_norm": 5.273800849914551,
"learning_rate": 5.447154471544715e-07,
"loss": 0.5338439792394638,
"step": 67
},
{
"epoch": 0.2769857433808554,
"grad_norm": 5.2786149978637695,
"learning_rate": 5.528455284552846e-07,
"loss": 0.5390533208847046,
"step": 68
},
{
"epoch": 0.28105906313645623,
"grad_norm": 4.901702404022217,
"learning_rate": 5.609756097560975e-07,
"loss": 0.5899032056331635,
"step": 69
},
{
"epoch": 0.285132382892057,
"grad_norm": 4.853933811187744,
"learning_rate": 5.691056910569105e-07,
"loss": 0.5600310862064362,
"step": 70
},
{
"epoch": 0.2892057026476578,
"grad_norm": 4.680273532867432,
"learning_rate": 5.772357723577236e-07,
"loss": 0.5319355428218842,
"step": 71
},
{
"epoch": 0.29327902240325865,
"grad_norm": 3.7406885623931885,
"learning_rate": 5.853658536585365e-07,
"loss": 0.508156955242157,
"step": 72
},
{
"epoch": 0.2973523421588595,
"grad_norm": 4.389779567718506,
"learning_rate": 5.934959349593496e-07,
"loss": 0.49855048954486847,
"step": 73
},
{
"epoch": 0.3014256619144603,
"grad_norm": 4.23866081237793,
"learning_rate": 6.016260162601626e-07,
"loss": 0.5242476612329483,
"step": 74
},
{
"epoch": 0.3054989816700611,
"grad_norm": 4.1824951171875,
"learning_rate": 6.097560975609756e-07,
"loss": 0.532037615776062,
"step": 75
},
{
"epoch": 0.3095723014256619,
"grad_norm": 3.7223150730133057,
"learning_rate": 6.178861788617887e-07,
"loss": 0.46959882974624634,
"step": 76
},
{
"epoch": 0.3136456211812627,
"grad_norm": 3.545388698577881,
"learning_rate": 6.260162601626016e-07,
"loss": 0.4825982600450516,
"step": 77
},
{
"epoch": 0.31771894093686354,
"grad_norm": 3.6351099014282227,
"learning_rate": 6.341463414634146e-07,
"loss": 0.5095209777355194,
"step": 78
},
{
"epoch": 0.32179226069246436,
"grad_norm": 3.243072271347046,
"learning_rate": 6.422764227642276e-07,
"loss": 0.4842926263809204,
"step": 79
},
{
"epoch": 0.3258655804480652,
"grad_norm": 3.5646300315856934,
"learning_rate": 6.504065040650406e-07,
"loss": 0.4908552020788193,
"step": 80
},
{
"epoch": 0.329938900203666,
"grad_norm": 3.5380759239196777,
"learning_rate": 6.585365853658536e-07,
"loss": 0.4536065459251404,
"step": 81
},
{
"epoch": 0.3340122199592668,
"grad_norm": 3.128525495529175,
"learning_rate": 6.666666666666666e-07,
"loss": 0.47657161951065063,
"step": 82
},
{
"epoch": 0.3380855397148676,
"grad_norm": 3.3621485233306885,
"learning_rate": 6.747967479674797e-07,
"loss": 0.43791596591472626,
"step": 83
},
{
"epoch": 0.3421588594704684,
"grad_norm": 3.39066219329834,
"learning_rate": 6.829268292682927e-07,
"loss": 0.42947711050510406,
"step": 84
},
{
"epoch": 0.34623217922606925,
"grad_norm": 3.7795698642730713,
"learning_rate": 6.910569105691057e-07,
"loss": 0.4219910502433777,
"step": 85
},
{
"epoch": 0.35030549898167007,
"grad_norm": 3.633206367492676,
"learning_rate": 6.991869918699187e-07,
"loss": 0.4253977984189987,
"step": 86
},
{
"epoch": 0.3543788187372709,
"grad_norm": 3.6160175800323486,
"learning_rate": 7.073170731707316e-07,
"loss": 0.449339896440506,
"step": 87
},
{
"epoch": 0.35845213849287166,
"grad_norm": 3.30557918548584,
"learning_rate": 7.154471544715447e-07,
"loss": 0.45001736283302307,
"step": 88
},
{
"epoch": 0.3625254582484725,
"grad_norm": 3.1727640628814697,
"learning_rate": 7.235772357723577e-07,
"loss": 0.4165496975183487,
"step": 89
},
{
"epoch": 0.3665987780040733,
"grad_norm": 3.073976516723633,
"learning_rate": 7.317073170731707e-07,
"loss": 0.4443822205066681,
"step": 90
},
{
"epoch": 0.37067209775967414,
"grad_norm": 3.129105567932129,
"learning_rate": 7.398373983739837e-07,
"loss": 0.4265598952770233,
"step": 91
},
{
"epoch": 0.37474541751527496,
"grad_norm": 3.1485190391540527,
"learning_rate": 7.479674796747967e-07,
"loss": 0.3882734924554825,
"step": 92
},
{
"epoch": 0.3788187372708758,
"grad_norm": 3.1610565185546875,
"learning_rate": 7.560975609756097e-07,
"loss": 0.37010858952999115,
"step": 93
},
{
"epoch": 0.38289205702647655,
"grad_norm": 3.039264440536499,
"learning_rate": 7.642276422764228e-07,
"loss": 0.400989294052124,
"step": 94
},
{
"epoch": 0.3869653767820774,
"grad_norm": 2.9321980476379395,
"learning_rate": 7.723577235772358e-07,
"loss": 0.3771343380212784,
"step": 95
},
{
"epoch": 0.3910386965376782,
"grad_norm": 2.807072162628174,
"learning_rate": 7.804878048780488e-07,
"loss": 0.4001482129096985,
"step": 96
},
{
"epoch": 0.395112016293279,
"grad_norm": 2.8286941051483154,
"learning_rate": 7.886178861788617e-07,
"loss": 0.4234430640935898,
"step": 97
},
{
"epoch": 0.39918533604887985,
"grad_norm": 2.9245986938476562,
"learning_rate": 7.967479674796747e-07,
"loss": 0.3854667395353317,
"step": 98
},
{
"epoch": 0.40325865580448067,
"grad_norm": 3.015875816345215,
"learning_rate": 8.048780487804878e-07,
"loss": 0.38027653098106384,
"step": 99
},
{
"epoch": 0.4073319755600815,
"grad_norm": 2.907216787338257,
"learning_rate": 8.130081300813008e-07,
"loss": 0.34937676787376404,
"step": 100
},
{
"epoch": 0.41140529531568226,
"grad_norm": 3.131850004196167,
"learning_rate": 8.211382113821138e-07,
"loss": 0.4414845108985901,
"step": 101
},
{
"epoch": 0.4154786150712831,
"grad_norm": 2.9019775390625,
"learning_rate": 8.292682926829268e-07,
"loss": 0.3990558981895447,
"step": 102
},
{
"epoch": 0.4195519348268839,
"grad_norm": 2.9362523555755615,
"learning_rate": 8.373983739837398e-07,
"loss": 0.41413092613220215,
"step": 103
},
{
"epoch": 0.42362525458248473,
"grad_norm": 3.0895473957061768,
"learning_rate": 8.455284552845529e-07,
"loss": 0.3904542028903961,
"step": 104
},
{
"epoch": 0.42769857433808556,
"grad_norm": 2.9235992431640625,
"learning_rate": 8.536585365853657e-07,
"loss": 0.3995140939950943,
"step": 105
},
{
"epoch": 0.4317718940936864,
"grad_norm": 2.919102668762207,
"learning_rate": 8.617886178861788e-07,
"loss": 0.32857778668403625,
"step": 106
},
{
"epoch": 0.43584521384928715,
"grad_norm": 2.831698417663574,
"learning_rate": 8.699186991869918e-07,
"loss": 0.3507983237504959,
"step": 107
},
{
"epoch": 0.439918533604888,
"grad_norm": 2.952693223953247,
"learning_rate": 8.780487804878048e-07,
"loss": 0.37046514451503754,
"step": 108
},
{
"epoch": 0.4439918533604888,
"grad_norm": 3.315002679824829,
"learning_rate": 8.861788617886179e-07,
"loss": 0.391086682677269,
"step": 109
},
{
"epoch": 0.4480651731160896,
"grad_norm": 2.7241294384002686,
"learning_rate": 8.943089430894308e-07,
"loss": 0.3864188492298126,
"step": 110
},
{
"epoch": 0.45213849287169044,
"grad_norm": 2.782064199447632,
"learning_rate": 9.024390243902439e-07,
"loss": 0.38219109177589417,
"step": 111
},
{
"epoch": 0.45621181262729127,
"grad_norm": 4.001572132110596,
"learning_rate": 9.105691056910569e-07,
"loss": 0.3784598410129547,
"step": 112
},
{
"epoch": 0.46028513238289204,
"grad_norm": 2.607434034347534,
"learning_rate": 9.186991869918699e-07,
"loss": 0.3763512521982193,
"step": 113
},
{
"epoch": 0.46435845213849286,
"grad_norm": 2.97188138961792,
"learning_rate": 9.26829268292683e-07,
"loss": 0.36788034439086914,
"step": 114
},
{
"epoch": 0.4684317718940937,
"grad_norm": 2.9631524085998535,
"learning_rate": 9.349593495934958e-07,
"loss": 0.3696867823600769,
"step": 115
},
{
"epoch": 0.4725050916496945,
"grad_norm": 2.5895049571990967,
"learning_rate": 9.430894308943089e-07,
"loss": 0.3349902927875519,
"step": 116
},
{
"epoch": 0.47657841140529533,
"grad_norm": 2.600832462310791,
"learning_rate": 9.512195121951218e-07,
"loss": 0.34966227412223816,
"step": 117
},
{
"epoch": 0.48065173116089616,
"grad_norm": 3.0639443397521973,
"learning_rate": 9.59349593495935e-07,
"loss": 0.38310858607292175,
"step": 118
},
{
"epoch": 0.4847250509164969,
"grad_norm": 2.6944706439971924,
"learning_rate": 9.67479674796748e-07,
"loss": 0.3360476493835449,
"step": 119
},
{
"epoch": 0.48879837067209775,
"grad_norm": 2.8398237228393555,
"learning_rate": 9.756097560975609e-07,
"loss": 0.39176714420318604,
"step": 120
},
{
"epoch": 0.49287169042769857,
"grad_norm": 2.8028745651245117,
"learning_rate": 9.83739837398374e-07,
"loss": 0.37909021973609924,
"step": 121
},
{
"epoch": 0.4969450101832994,
"grad_norm": 2.6169185638427734,
"learning_rate": 9.918699186991869e-07,
"loss": 0.37069061398506165,
"step": 122
},
{
"epoch": 0.5010183299389002,
"grad_norm": 2.572046995162964,
"learning_rate": 1e-06,
"loss": 0.3428824096918106,
"step": 123
},
{
"epoch": 0.505091649694501,
"grad_norm": 2.7804417610168457,
"learning_rate": 9.999979682219186e-07,
"loss": 0.3680119812488556,
"step": 124
},
{
"epoch": 0.5091649694501018,
"grad_norm": 2.5910799503326416,
"learning_rate": 9.999918729041868e-07,
"loss": 0.33467385172843933,
"step": 125
},
{
"epoch": 0.5132382892057027,
"grad_norm": 2.8417587280273438,
"learning_rate": 9.999817140963419e-07,
"loss": 0.35100705921649933,
"step": 126
},
{
"epoch": 0.5173116089613035,
"grad_norm": 2.905728340148926,
"learning_rate": 9.999674918809457e-07,
"loss": 0.32811686396598816,
"step": 127
},
{
"epoch": 0.5213849287169042,
"grad_norm": 2.5878095626831055,
"learning_rate": 9.99949206373584e-07,
"loss": 0.32490645349025726,
"step": 128
},
{
"epoch": 0.5254582484725051,
"grad_norm": 2.9762229919433594,
"learning_rate": 9.999268577228648e-07,
"loss": 0.3934018760919571,
"step": 129
},
{
"epoch": 0.5295315682281059,
"grad_norm": 2.792989492416382,
"learning_rate": 9.99900446110418e-07,
"loss": 0.3315049111843109,
"step": 130
},
{
"epoch": 0.5336048879837068,
"grad_norm": 2.6891062259674072,
"learning_rate": 9.998699717508945e-07,
"loss": 0.3097301423549652,
"step": 131
},
{
"epoch": 0.5376782077393075,
"grad_norm": 2.92191481590271,
"learning_rate": 9.99835434891962e-07,
"loss": 0.34749817848205566,
"step": 132
},
{
"epoch": 0.5417515274949084,
"grad_norm": 2.980543851852417,
"learning_rate": 9.99796835814306e-07,
"loss": 0.3367327153682709,
"step": 133
},
{
"epoch": 0.5458248472505092,
"grad_norm": 2.50433611869812,
"learning_rate": 9.99754174831625e-07,
"loss": 0.3090934008359909,
"step": 134
},
{
"epoch": 0.5498981670061099,
"grad_norm": 2.869647979736328,
"learning_rate": 9.9970745229063e-07,
"loss": 0.35603591799736023,
"step": 135
},
{
"epoch": 0.5539714867617108,
"grad_norm": 2.6435837745666504,
"learning_rate": 9.9965666857104e-07,
"loss": 0.3288918733596802,
"step": 136
},
{
"epoch": 0.5580448065173116,
"grad_norm": 2.7970142364501953,
"learning_rate": 9.996018240855806e-07,
"loss": 0.3878723680973053,
"step": 137
},
{
"epoch": 0.5621181262729125,
"grad_norm": 2.593043327331543,
"learning_rate": 9.995429192799788e-07,
"loss": 0.3534126281738281,
"step": 138
},
{
"epoch": 0.5661914460285132,
"grad_norm": 2.8867013454437256,
"learning_rate": 9.994799546329602e-07,
"loss": 0.38061630725860596,
"step": 139
},
{
"epoch": 0.570264765784114,
"grad_norm": 2.589017152786255,
"learning_rate": 9.994129306562458e-07,
"loss": 0.37725748121738434,
"step": 140
},
{
"epoch": 0.5743380855397149,
"grad_norm": 2.369696617126465,
"learning_rate": 9.993418478945472e-07,
"loss": 0.32034583389759064,
"step": 141
},
{
"epoch": 0.5784114052953157,
"grad_norm": 2.6410069465637207,
"learning_rate": 9.992667069255618e-07,
"loss": 0.36017628014087677,
"step": 142
},
{
"epoch": 0.5824847250509165,
"grad_norm": 2.597259283065796,
"learning_rate": 9.991875083599688e-07,
"loss": 0.32577911019325256,
"step": 143
},
{
"epoch": 0.5865580448065173,
"grad_norm": 2.761859655380249,
"learning_rate": 9.991042528414237e-07,
"loss": 0.33353830873966217,
"step": 144
},
{
"epoch": 0.5906313645621182,
"grad_norm": 2.7634713649749756,
"learning_rate": 9.990169410465536e-07,
"loss": 0.33604632318019867,
"step": 145
},
{
"epoch": 0.594704684317719,
"grad_norm": 2.820897340774536,
"learning_rate": 9.98925573684951e-07,
"loss": 0.3069554716348648,
"step": 146
},
{
"epoch": 0.5987780040733197,
"grad_norm": 2.856700897216797,
"learning_rate": 9.98830151499169e-07,
"loss": 0.33896636962890625,
"step": 147
},
{
"epoch": 0.6028513238289206,
"grad_norm": 2.9203782081604004,
"learning_rate": 9.987306752647142e-07,
"loss": 0.35070909559726715,
"step": 148
},
{
"epoch": 0.6069246435845214,
"grad_norm": 2.679352283477783,
"learning_rate": 9.986271457900414e-07,
"loss": 0.3325359970331192,
"step": 149
},
{
"epoch": 0.6109979633401222,
"grad_norm": 2.4953606128692627,
"learning_rate": 9.98519563916546e-07,
"loss": 0.32330869138240814,
"step": 150
},
{
"epoch": 0.615071283095723,
"grad_norm": 2.618744134902954,
"learning_rate": 9.98407930518558e-07,
"loss": 0.33912393450737,
"step": 151
},
{
"epoch": 0.6191446028513238,
"grad_norm": 2.6512296199798584,
"learning_rate": 9.982922465033348e-07,
"loss": 0.3045920431613922,
"step": 152
},
{
"epoch": 0.6232179226069247,
"grad_norm": 2.7606050968170166,
"learning_rate": 9.981725128110532e-07,
"loss": 0.32916732132434845,
"step": 153
},
{
"epoch": 0.6272912423625254,
"grad_norm": 2.95037841796875,
"learning_rate": 9.980487304148024e-07,
"loss": 0.36757831275463104,
"step": 154
},
{
"epoch": 0.6313645621181263,
"grad_norm": 2.890489339828491,
"learning_rate": 9.97920900320576e-07,
"loss": 0.36117151379585266,
"step": 155
},
{
"epoch": 0.6354378818737271,
"grad_norm": 2.7488858699798584,
"learning_rate": 9.97789023567263e-07,
"loss": 0.35026322305202484,
"step": 156
},
{
"epoch": 0.639511201629328,
"grad_norm": 2.5479671955108643,
"learning_rate": 9.976531012266413e-07,
"loss": 0.308156818151474,
"step": 157
},
{
"epoch": 0.6435845213849287,
"grad_norm": 2.717344045639038,
"learning_rate": 9.975131344033664e-07,
"loss": 0.29827529191970825,
"step": 158
},
{
"epoch": 0.6476578411405295,
"grad_norm": 2.569551467895508,
"learning_rate": 9.973691242349648e-07,
"loss": 0.3232528269290924,
"step": 159
},
{
"epoch": 0.6517311608961304,
"grad_norm": 3.0013420581817627,
"learning_rate": 9.972210718918233e-07,
"loss": 0.3270832598209381,
"step": 160
},
{
"epoch": 0.6558044806517311,
"grad_norm": 2.7339162826538086,
"learning_rate": 9.970689785771798e-07,
"loss": 0.3668155074119568,
"step": 161
},
{
"epoch": 0.659877800407332,
"grad_norm": 2.6689724922180176,
"learning_rate": 9.969128455271137e-07,
"loss": 0.32853490114212036,
"step": 162
},
{
"epoch": 0.6639511201629328,
"grad_norm": 3.042081117630005,
"learning_rate": 9.967526740105358e-07,
"loss": 0.3487651199102402,
"step": 163
},
{
"epoch": 0.6680244399185336,
"grad_norm": 2.4641284942626953,
"learning_rate": 9.965884653291783e-07,
"loss": 0.35704147815704346,
"step": 164
},
{
"epoch": 0.6720977596741344,
"grad_norm": 2.6836225986480713,
"learning_rate": 9.964202208175833e-07,
"loss": 0.33587950468063354,
"step": 165
},
{
"epoch": 0.6761710794297352,
"grad_norm": 2.2905988693237305,
"learning_rate": 9.962479418430932e-07,
"loss": 0.3061918318271637,
"step": 166
},
{
"epoch": 0.6802443991853361,
"grad_norm": 2.4772934913635254,
"learning_rate": 9.960716298058381e-07,
"loss": 0.2896444499492645,
"step": 167
},
{
"epoch": 0.6843177189409368,
"grad_norm": 2.6987321376800537,
"learning_rate": 9.958912861387258e-07,
"loss": 0.3374595195055008,
"step": 168
},
{
"epoch": 0.6883910386965377,
"grad_norm": 2.6165449619293213,
"learning_rate": 9.9570691230743e-07,
"loss": 0.33027225732803345,
"step": 169
},
{
"epoch": 0.6924643584521385,
"grad_norm": 3.1326680183410645,
"learning_rate": 9.955185098103771e-07,
"loss": 0.3138381540775299,
"step": 170
},
{
"epoch": 0.6965376782077393,
"grad_norm": 2.5313732624053955,
"learning_rate": 9.953260801787356e-07,
"loss": 0.31824737787246704,
"step": 171
},
{
"epoch": 0.7006109979633401,
"grad_norm": 2.529325008392334,
"learning_rate": 9.951296249764025e-07,
"loss": 0.298155277967453,
"step": 172
},
{
"epoch": 0.7046843177189409,
"grad_norm": 2.6821744441986084,
"learning_rate": 9.949291457999916e-07,
"loss": 0.33296874165534973,
"step": 173
},
{
"epoch": 0.7087576374745418,
"grad_norm": 2.588157892227173,
"learning_rate": 9.947246442788193e-07,
"loss": 0.31226691603660583,
"step": 174
},
{
"epoch": 0.7128309572301426,
"grad_norm": 2.7822420597076416,
"learning_rate": 9.945161220748927e-07,
"loss": 0.322743222117424,
"step": 175
},
{
"epoch": 0.7169042769857433,
"grad_norm": 2.379702091217041,
"learning_rate": 9.943035808828953e-07,
"loss": 0.3056500107049942,
"step": 176
},
{
"epoch": 0.7209775967413442,
"grad_norm": 2.4450721740722656,
"learning_rate": 9.94087022430173e-07,
"loss": 0.3037564754486084,
"step": 177
},
{
"epoch": 0.725050916496945,
"grad_norm": 2.5885887145996094,
"learning_rate": 9.938664484767205e-07,
"loss": 0.327587828040123,
"step": 178
},
{
"epoch": 0.7291242362525459,
"grad_norm": 2.613290309906006,
"learning_rate": 9.936418608151675e-07,
"loss": 0.33323927223682404,
"step": 179
},
{
"epoch": 0.7331975560081466,
"grad_norm": 2.6541707515716553,
"learning_rate": 9.93413261270763e-07,
"loss": 0.3316569924354553,
"step": 180
},
{
"epoch": 0.7372708757637475,
"grad_norm": 2.646383047103882,
"learning_rate": 9.931806517013612e-07,
"loss": 0.35486292839050293,
"step": 181
},
{
"epoch": 0.7413441955193483,
"grad_norm": 2.5270328521728516,
"learning_rate": 9.92944033997406e-07,
"loss": 0.3157142102718353,
"step": 182
},
{
"epoch": 0.745417515274949,
"grad_norm": 2.5851869583129883,
"learning_rate": 9.927034100819163e-07,
"loss": 0.3013855814933777,
"step": 183
},
{
"epoch": 0.7494908350305499,
"grad_norm": 2.75219988822937,
"learning_rate": 9.924587819104695e-07,
"loss": 0.3420049250125885,
"step": 184
},
{
"epoch": 0.7535641547861507,
"grad_norm": 2.436596632003784,
"learning_rate": 9.922101514711865e-07,
"loss": 0.3062688261270523,
"step": 185
},
{
"epoch": 0.7576374745417516,
"grad_norm": 2.9479236602783203,
"learning_rate": 9.919575207847145e-07,
"loss": 0.31793762743473053,
"step": 186
},
{
"epoch": 0.7617107942973523,
"grad_norm": 2.5482208728790283,
"learning_rate": 9.917008919042116e-07,
"loss": 0.3306496888399124,
"step": 187
},
{
"epoch": 0.7657841140529531,
"grad_norm": 2.609839677810669,
"learning_rate": 9.914402669153295e-07,
"loss": 0.29324449598789215,
"step": 188
},
{
"epoch": 0.769857433808554,
"grad_norm": 2.5740039348602295,
"learning_rate": 9.91175647936197e-07,
"loss": 0.3193310797214508,
"step": 189
},
{
"epoch": 0.7739307535641547,
"grad_norm": 2.3878629207611084,
"learning_rate": 9.909070371174019e-07,
"loss": 0.3040658235549927,
"step": 190
},
{
"epoch": 0.7780040733197556,
"grad_norm": 2.755152463912964,
"learning_rate": 9.906344366419746e-07,
"loss": 0.33930477499961853,
"step": 191
},
{
"epoch": 0.7820773930753564,
"grad_norm": 2.58367657661438,
"learning_rate": 9.9035784872537e-07,
"loss": 0.3244568109512329,
"step": 192
},
{
"epoch": 0.7861507128309573,
"grad_norm": 2.350712537765503,
"learning_rate": 9.90077275615449e-07,
"loss": 0.2779058516025543,
"step": 193
},
{
"epoch": 0.790224032586558,
"grad_norm": 2.7418465614318848,
"learning_rate": 9.897927195924608e-07,
"loss": 0.32641272246837616,
"step": 194
},
{
"epoch": 0.7942973523421588,
"grad_norm": 2.516510009765625,
"learning_rate": 9.895041829690238e-07,
"loss": 0.3083319664001465,
"step": 195
},
{
"epoch": 0.7983706720977597,
"grad_norm": 2.7772316932678223,
"learning_rate": 9.892116680901084e-07,
"loss": 0.30357757210731506,
"step": 196
},
{
"epoch": 0.8024439918533605,
"grad_norm": 2.5389041900634766,
"learning_rate": 9.88915177333015e-07,
"loss": 0.30694054067134857,
"step": 197
},
{
"epoch": 0.8065173116089613,
"grad_norm": 2.7129383087158203,
"learning_rate": 9.886147131073579e-07,
"loss": 0.3402569591999054,
"step": 198
},
{
"epoch": 0.8105906313645621,
"grad_norm": 2.654186248779297,
"learning_rate": 9.883102778550434e-07,
"loss": 0.3343619704246521,
"step": 199
},
{
"epoch": 0.814663951120163,
"grad_norm": 2.380168914794922,
"learning_rate": 9.880018740502508e-07,
"loss": 0.3020651191473007,
"step": 200
},
{
"epoch": 0.8187372708757638,
"grad_norm": 2.771951198577881,
"learning_rate": 9.876895041994127e-07,
"loss": 0.30565840005874634,
"step": 201
},
{
"epoch": 0.8228105906313645,
"grad_norm": 2.4966540336608887,
"learning_rate": 9.873731708411939e-07,
"loss": 0.3085058331489563,
"step": 202
},
{
"epoch": 0.8268839103869654,
"grad_norm": 2.5919551849365234,
"learning_rate": 9.870528765464711e-07,
"loss": 0.34540820121765137,
"step": 203
},
{
"epoch": 0.8309572301425662,
"grad_norm": 3.0668885707855225,
"learning_rate": 9.867286239183122e-07,
"loss": 0.3307037353515625,
"step": 204
},
{
"epoch": 0.835030549898167,
"grad_norm": 2.4281554222106934,
"learning_rate": 9.864004155919544e-07,
"loss": 0.28929875791072845,
"step": 205
},
{
"epoch": 0.8391038696537678,
"grad_norm": 2.5561623573303223,
"learning_rate": 9.860682542347838e-07,
"loss": 0.3272414803504944,
"step": 206
},
{
"epoch": 0.8431771894093686,
"grad_norm": 2.824591636657715,
"learning_rate": 9.85732142546313e-07,
"loss": 0.3192295432090759,
"step": 207
},
{
"epoch": 0.8472505091649695,
"grad_norm": 2.643718719482422,
"learning_rate": 9.853920832581597e-07,
"loss": 0.31284041702747345,
"step": 208
},
{
"epoch": 0.8513238289205702,
"grad_norm": 2.6777195930480957,
"learning_rate": 9.850480791340236e-07,
"loss": 0.3136574327945709,
"step": 209
},
{
"epoch": 0.8553971486761711,
"grad_norm": 2.5229766368865967,
"learning_rate": 9.847001329696652e-07,
"loss": 0.3047819435596466,
"step": 210
},
{
"epoch": 0.8594704684317719,
"grad_norm": 2.659447431564331,
"learning_rate": 9.843482475928818e-07,
"loss": 0.3642407953739166,
"step": 211
},
{
"epoch": 0.8635437881873728,
"grad_norm": 2.697049379348755,
"learning_rate": 9.839924258634853e-07,
"loss": 0.3134022653102875,
"step": 212
},
{
"epoch": 0.8676171079429735,
"grad_norm": 2.629868745803833,
"learning_rate": 9.83632670673279e-07,
"loss": 0.306331992149353,
"step": 213
},
{
"epoch": 0.8716904276985743,
"grad_norm": 2.4997003078460693,
"learning_rate": 9.832689849460339e-07,
"loss": 0.3142865002155304,
"step": 214
},
{
"epoch": 0.8757637474541752,
"grad_norm": 2.826869010925293,
"learning_rate": 9.829013716374647e-07,
"loss": 0.2904099076986313,
"step": 215
},
{
"epoch": 0.879837067209776,
"grad_norm": 2.6697499752044678,
"learning_rate": 9.825298337352058e-07,
"loss": 0.29838354885578156,
"step": 216
},
{
"epoch": 0.8839103869653768,
"grad_norm": 2.5330023765563965,
"learning_rate": 9.821543742587876e-07,
"loss": 0.3052047789096832,
"step": 217
},
{
"epoch": 0.8879837067209776,
"grad_norm": 2.806683301925659,
"learning_rate": 9.817749962596114e-07,
"loss": 0.3121778964996338,
"step": 218
},
{
"epoch": 0.8920570264765784,
"grad_norm": 2.718122720718384,
"learning_rate": 9.81391702820925e-07,
"loss": 0.32955022156238556,
"step": 219
},
{
"epoch": 0.8961303462321792,
"grad_norm": 2.346466541290283,
"learning_rate": 9.81004497057797e-07,
"loss": 0.291049063205719,
"step": 220
},
{
"epoch": 0.90020366598778,
"grad_norm": 2.4048361778259277,
"learning_rate": 9.806133821170924e-07,
"loss": 0.30249159038066864,
"step": 221
},
{
"epoch": 0.9042769857433809,
"grad_norm": 2.681546688079834,
"learning_rate": 9.80218361177446e-07,
"loss": 0.362154021859169,
"step": 222
},
{
"epoch": 0.9083503054989817,
"grad_norm": 2.792266368865967,
"learning_rate": 9.798194374492375e-07,
"loss": 0.28344525396823883,
"step": 223
},
{
"epoch": 0.9124236252545825,
"grad_norm": 2.507050037384033,
"learning_rate": 9.794166141745646e-07,
"loss": 0.2935172915458679,
"step": 224
},
{
"epoch": 0.9164969450101833,
"grad_norm": 2.7160379886627197,
"learning_rate": 9.790098946272177e-07,
"loss": 0.3005199581384659,
"step": 225
},
{
"epoch": 0.9205702647657841,
"grad_norm": 2.666494131088257,
"learning_rate": 9.785992821126518e-07,
"loss": 0.30710943043231964,
"step": 226
},
{
"epoch": 0.924643584521385,
"grad_norm": 2.699313163757324,
"learning_rate": 9.781847799679615e-07,
"loss": 0.3164513558149338,
"step": 227
},
{
"epoch": 0.9287169042769857,
"grad_norm": 2.49406099319458,
"learning_rate": 9.777663915618517e-07,
"loss": 0.3061770647764206,
"step": 228
},
{
"epoch": 0.9327902240325866,
"grad_norm": 2.552093029022217,
"learning_rate": 9.773441202946121e-07,
"loss": 0.2973909080028534,
"step": 229
},
{
"epoch": 0.9368635437881874,
"grad_norm": 2.5773231983184814,
"learning_rate": 9.76917969598089e-07,
"loss": 0.31120532751083374,
"step": 230
},
{
"epoch": 0.9409368635437881,
"grad_norm": 2.653515100479126,
"learning_rate": 9.76487942935657e-07,
"loss": 0.3365926146507263,
"step": 231
},
{
"epoch": 0.945010183299389,
"grad_norm": 2.670433282852173,
"learning_rate": 9.760540438021907e-07,
"loss": 0.3196941614151001,
"step": 232
},
{
"epoch": 0.9490835030549898,
"grad_norm": 2.892035961151123,
"learning_rate": 9.756162757240373e-07,
"loss": 0.33982205390930176,
"step": 233
},
{
"epoch": 0.9531568228105907,
"grad_norm": 2.5157856941223145,
"learning_rate": 9.751746422589872e-07,
"loss": 0.2537951096892357,
"step": 234
},
{
"epoch": 0.9572301425661914,
"grad_norm": 2.6808388233184814,
"learning_rate": 9.747291469962452e-07,
"loss": 0.2846526652574539,
"step": 235
},
{
"epoch": 0.9613034623217923,
"grad_norm": 2.451559066772461,
"learning_rate": 9.742797935564011e-07,
"loss": 0.29611095786094666,
"step": 236
},
{
"epoch": 0.9653767820773931,
"grad_norm": 2.7313358783721924,
"learning_rate": 9.738265855914012e-07,
"loss": 0.3275996297597885,
"step": 237
},
{
"epoch": 0.9694501018329938,
"grad_norm": 2.5593299865722656,
"learning_rate": 9.733695267845171e-07,
"loss": 0.2993656247854233,
"step": 238
},
{
"epoch": 0.9735234215885947,
"grad_norm": 2.6013288497924805,
"learning_rate": 9.729086208503173e-07,
"loss": 0.31615155935287476,
"step": 239
},
{
"epoch": 0.9775967413441955,
"grad_norm": 2.5403575897216797,
"learning_rate": 9.72443871534636e-07,
"loss": 0.2843424677848816,
"step": 240
},
{
"epoch": 0.9816700610997964,
"grad_norm": 2.4495410919189453,
"learning_rate": 9.719752826145432e-07,
"loss": 0.2987358868122101,
"step": 241
},
{
"epoch": 0.9857433808553971,
"grad_norm": 2.719775676727295,
"learning_rate": 9.715028578983136e-07,
"loss": 0.34320636093616486,
"step": 242
},
{
"epoch": 0.9898167006109979,
"grad_norm": 2.7152929306030273,
"learning_rate": 9.71026601225396e-07,
"loss": 0.2937510758638382,
"step": 243
},
{
"epoch": 0.9938900203665988,
"grad_norm": 2.4305663108825684,
"learning_rate": 9.705465164663817e-07,
"loss": 0.29807206988334656,
"step": 244
},
{
"epoch": 0.9979633401221996,
"grad_norm": 2.322704792022705,
"learning_rate": 9.700626075229738e-07,
"loss": 0.3189048618078232,
"step": 245
},
{
"epoch": 1.0,
"grad_norm": 2.322704792022705,
"learning_rate": 9.695748783279544e-07,
"loss": 0.3195984363555908,
"step": 246
}
],
"logging_steps": 1.0,
"max_steps": 1225,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}