{
"best_metric": 1.2467516660690308,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 1.103448275862069,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005517241379310344,
"grad_norm": 81.85233306884766,
"learning_rate": 6e-06,
"loss": 12.4813,
"step": 1
},
{
"epoch": 0.005517241379310344,
"eval_loss": 3.5472052097320557,
"eval_runtime": 21.8033,
"eval_samples_per_second": 13.989,
"eval_steps_per_second": 3.532,
"step": 1
},
{
"epoch": 0.011034482758620689,
"grad_norm": 67.96113586425781,
"learning_rate": 1.2e-05,
"loss": 10.6517,
"step": 2
},
{
"epoch": 0.016551724137931035,
"grad_norm": 87.34710693359375,
"learning_rate": 1.8e-05,
"loss": 12.4284,
"step": 3
},
{
"epoch": 0.022068965517241378,
"grad_norm": 68.4135513305664,
"learning_rate": 2.4e-05,
"loss": 11.0609,
"step": 4
},
{
"epoch": 0.027586206896551724,
"grad_norm": 75.67364501953125,
"learning_rate": 3e-05,
"loss": 10.8972,
"step": 5
},
{
"epoch": 0.03310344827586207,
"grad_norm": 76.12530517578125,
"learning_rate": 3.6e-05,
"loss": 10.0061,
"step": 6
},
{
"epoch": 0.038620689655172416,
"grad_norm": 79.84320068359375,
"learning_rate": 4.2e-05,
"loss": 9.8832,
"step": 7
},
{
"epoch": 0.044137931034482755,
"grad_norm": 50.45161056518555,
"learning_rate": 4.8e-05,
"loss": 9.4741,
"step": 8
},
{
"epoch": 0.0496551724137931,
"grad_norm": 37.255496978759766,
"learning_rate": 5.4000000000000005e-05,
"loss": 8.7538,
"step": 9
},
{
"epoch": 0.05517241379310345,
"grad_norm": 45.016075134277344,
"learning_rate": 6e-05,
"loss": 8.3654,
"step": 10
},
{
"epoch": 0.060689655172413794,
"grad_norm": 33.683189392089844,
"learning_rate": 5.999589914977407e-05,
"loss": 7.7713,
"step": 11
},
{
"epoch": 0.06620689655172414,
"grad_norm": 25.923715591430664,
"learning_rate": 5.998359772022778e-05,
"loss": 6.825,
"step": 12
},
{
"epoch": 0.07172413793103448,
"grad_norm": 36.873512268066406,
"learning_rate": 5.996309907444915e-05,
"loss": 8.1533,
"step": 13
},
{
"epoch": 0.07724137931034483,
"grad_norm": 23.235549926757812,
"learning_rate": 5.9934408816563236e-05,
"loss": 7.3939,
"step": 14
},
{
"epoch": 0.08275862068965517,
"grad_norm": 24.117935180664062,
"learning_rate": 5.98975347902001e-05,
"loss": 7.3477,
"step": 15
},
{
"epoch": 0.08827586206896551,
"grad_norm": 21.500791549682617,
"learning_rate": 5.9852487076350345e-05,
"loss": 7.0144,
"step": 16
},
{
"epoch": 0.09379310344827586,
"grad_norm": 23.357107162475586,
"learning_rate": 5.979927799060915e-05,
"loss": 6.9306,
"step": 17
},
{
"epoch": 0.0993103448275862,
"grad_norm": 26.277986526489258,
"learning_rate": 5.9737922079809257e-05,
"loss": 7.2077,
"step": 18
},
{
"epoch": 0.10482758620689656,
"grad_norm": 20.46823501586914,
"learning_rate": 5.9668436118044054e-05,
"loss": 6.7012,
"step": 19
},
{
"epoch": 0.1103448275862069,
"grad_norm": 35.533565521240234,
"learning_rate": 5.959083910208167e-05,
"loss": 6.5318,
"step": 20
},
{
"epoch": 0.11586206896551725,
"grad_norm": 21.680561065673828,
"learning_rate": 5.9505152246171474e-05,
"loss": 6.8213,
"step": 21
},
{
"epoch": 0.12137931034482759,
"grad_norm": 38.167606353759766,
"learning_rate": 5.941139897624428e-05,
"loss": 6.7067,
"step": 22
},
{
"epoch": 0.12689655172413794,
"grad_norm": 22.853790283203125,
"learning_rate": 5.9309604923507984e-05,
"loss": 6.8576,
"step": 23
},
{
"epoch": 0.13241379310344828,
"grad_norm": 23.914487838745117,
"learning_rate": 5.9199797917440176e-05,
"loss": 6.7643,
"step": 24
},
{
"epoch": 0.13793103448275862,
"grad_norm": 36.852081298828125,
"learning_rate": 5.908200797817991e-05,
"loss": 6.7685,
"step": 25
},
{
"epoch": 0.14344827586206896,
"grad_norm": 28.959129333496094,
"learning_rate": 5.895626730832046e-05,
"loss": 7.1442,
"step": 26
},
{
"epoch": 0.1489655172413793,
"grad_norm": 23.524133682250977,
"learning_rate": 5.882261028410545e-05,
"loss": 6.6208,
"step": 27
},
{
"epoch": 0.15448275862068966,
"grad_norm": 29.11441993713379,
"learning_rate": 5.8681073446030734e-05,
"loss": 8.4364,
"step": 28
},
{
"epoch": 0.16,
"grad_norm": 29.907983779907227,
"learning_rate": 5.853169548885461e-05,
"loss": 6.3796,
"step": 29
},
{
"epoch": 0.16551724137931034,
"grad_norm": 28.542919158935547,
"learning_rate": 5.8374517251019035e-05,
"loss": 7.1945,
"step": 30
},
{
"epoch": 0.17103448275862068,
"grad_norm": 33.40464401245117,
"learning_rate": 5.820958170348484e-05,
"loss": 8.0262,
"step": 31
},
{
"epoch": 0.17655172413793102,
"grad_norm": 23.497774124145508,
"learning_rate": 5.8036933937983825e-05,
"loss": 7.4088,
"step": 32
},
{
"epoch": 0.1820689655172414,
"grad_norm": 25.99003791809082,
"learning_rate": 5.7856621154691217e-05,
"loss": 8.0957,
"step": 33
},
{
"epoch": 0.18758620689655173,
"grad_norm": 20.609542846679688,
"learning_rate": 5.766869264932154e-05,
"loss": 6.9359,
"step": 34
},
{
"epoch": 0.19310344827586207,
"grad_norm": 20.997133255004883,
"learning_rate": 5.747319979965172e-05,
"loss": 8.0183,
"step": 35
},
{
"epoch": 0.1986206896551724,
"grad_norm": 27.197574615478516,
"learning_rate": 5.727019605147488e-05,
"loss": 7.0162,
"step": 36
},
{
"epoch": 0.20413793103448277,
"grad_norm": 23.199996948242188,
"learning_rate": 5.7059736903988775e-05,
"loss": 7.9867,
"step": 37
},
{
"epoch": 0.2096551724137931,
"grad_norm": 29.606868743896484,
"learning_rate": 5.684187989462291e-05,
"loss": 6.414,
"step": 38
},
{
"epoch": 0.21517241379310345,
"grad_norm": 24.353099822998047,
"learning_rate": 5.661668458330836e-05,
"loss": 6.8272,
"step": 39
},
{
"epoch": 0.2206896551724138,
"grad_norm": 22.26854133605957,
"learning_rate": 5.638421253619467e-05,
"loss": 5.6701,
"step": 40
},
{
"epoch": 0.22620689655172413,
"grad_norm": 23.756772994995117,
"learning_rate": 5.614452730881832e-05,
"loss": 6.4688,
"step": 41
},
{
"epoch": 0.2317241379310345,
"grad_norm": 26.136266708374023,
"learning_rate": 5.589769442872722e-05,
"loss": 6.3717,
"step": 42
},
{
"epoch": 0.23724137931034484,
"grad_norm": 23.154205322265625,
"learning_rate": 5.5643781377566175e-05,
"loss": 6.0505,
"step": 43
},
{
"epoch": 0.24275862068965517,
"grad_norm": 36.596099853515625,
"learning_rate": 5.538285757262806e-05,
"loss": 6.3005,
"step": 44
},
{
"epoch": 0.2482758620689655,
"grad_norm": 52.90260314941406,
"learning_rate": 5.5114994347875856e-05,
"loss": 7.8155,
"step": 45
},
{
"epoch": 0.2537931034482759,
"grad_norm": 121.07559967041016,
"learning_rate": 5.48402649344406e-05,
"loss": 11.1405,
"step": 46
},
{
"epoch": 0.2593103448275862,
"grad_norm": 63.700775146484375,
"learning_rate": 5.455874444060078e-05,
"loss": 10.3049,
"step": 47
},
{
"epoch": 0.26482758620689656,
"grad_norm": 27.780960083007812,
"learning_rate": 5.427050983124843e-05,
"loss": 7.6377,
"step": 48
},
{
"epoch": 0.27034482758620687,
"grad_norm": 36.80820846557617,
"learning_rate": 5.397563990684774e-05,
"loss": 7.0712,
"step": 49
},
{
"epoch": 0.27586206896551724,
"grad_norm": 45.82158660888672,
"learning_rate": 5.367421528189181e-05,
"loss": 7.8169,
"step": 50
},
{
"epoch": 0.27586206896551724,
"eval_loss": 1.749925971031189,
"eval_runtime": 22.2372,
"eval_samples_per_second": 13.716,
"eval_steps_per_second": 3.463,
"step": 50
},
{
"epoch": 0.2813793103448276,
"grad_norm": 32.10503005981445,
"learning_rate": 5.336631836286338e-05,
"loss": 7.0049,
"step": 51
},
{
"epoch": 0.2868965517241379,
"grad_norm": 20.947622299194336,
"learning_rate": 5.3052033325705774e-05,
"loss": 6.132,
"step": 52
},
{
"epoch": 0.2924137931034483,
"grad_norm": 18.891435623168945,
"learning_rate": 5.2731446092810044e-05,
"loss": 6.8589,
"step": 53
},
{
"epoch": 0.2979310344827586,
"grad_norm": 18.95319175720215,
"learning_rate": 5.240464430952462e-05,
"loss": 6.297,
"step": 54
},
{
"epoch": 0.30344827586206896,
"grad_norm": 18.678953170776367,
"learning_rate": 5.207171732019395e-05,
"loss": 6.4108,
"step": 55
},
{
"epoch": 0.30896551724137933,
"grad_norm": 17.936857223510742,
"learning_rate": 5.1732756143732675e-05,
"loss": 6.0579,
"step": 56
},
{
"epoch": 0.31448275862068964,
"grad_norm": 21.307292938232422,
"learning_rate": 5.1387853448741916e-05,
"loss": 5.6681,
"step": 57
},
{
"epoch": 0.32,
"grad_norm": 20.082971572875977,
"learning_rate": 5.103710352817465e-05,
"loss": 6.8433,
"step": 58
},
{
"epoch": 0.3255172413793103,
"grad_norm": 16.341541290283203,
"learning_rate": 5.068060227355698e-05,
"loss": 5.6629,
"step": 59
},
{
"epoch": 0.3310344827586207,
"grad_norm": 17.958162307739258,
"learning_rate": 5.0318447148772234e-05,
"loss": 5.8787,
"step": 60
},
{
"epoch": 0.33655172413793105,
"grad_norm": 16.37387466430664,
"learning_rate": 4.995073716341545e-05,
"loss": 5.8848,
"step": 61
},
{
"epoch": 0.34206896551724136,
"grad_norm": 18.893905639648438,
"learning_rate": 4.957757284572506e-05,
"loss": 5.7301,
"step": 62
},
{
"epoch": 0.34758620689655173,
"grad_norm": 20.43461036682129,
"learning_rate": 4.91990562150995e-05,
"loss": 5.7722,
"step": 63
},
{
"epoch": 0.35310344827586204,
"grad_norm": 18.65955924987793,
"learning_rate": 4.881529075420611e-05,
"loss": 5.825,
"step": 64
},
{
"epoch": 0.3586206896551724,
"grad_norm": 22.72260284423828,
"learning_rate": 4.8426381380690036e-05,
"loss": 5.7372,
"step": 65
},
{
"epoch": 0.3641379310344828,
"grad_norm": 19.48175811767578,
"learning_rate": 4.8032434418490753e-05,
"loss": 6.3996,
"step": 66
},
{
"epoch": 0.3696551724137931,
"grad_norm": 19.140289306640625,
"learning_rate": 4.7633557568774194e-05,
"loss": 6.5187,
"step": 67
},
{
"epoch": 0.37517241379310345,
"grad_norm": 17.06688690185547,
"learning_rate": 4.722985988048831e-05,
"loss": 6.3792,
"step": 68
},
{
"epoch": 0.38068965517241377,
"grad_norm": 18.04470443725586,
"learning_rate": 4.6821451720550184e-05,
"loss": 5.2939,
"step": 69
},
{
"epoch": 0.38620689655172413,
"grad_norm": 17.178627014160156,
"learning_rate": 4.640844474367282e-05,
"loss": 6.0978,
"step": 70
},
{
"epoch": 0.3917241379310345,
"grad_norm": 19.80453109741211,
"learning_rate": 4.5990951861839815e-05,
"loss": 6.1981,
"step": 71
},
{
"epoch": 0.3972413793103448,
"grad_norm": 21.434978485107422,
"learning_rate": 4.5569087213436455e-05,
"loss": 6.0233,
"step": 72
},
{
"epoch": 0.4027586206896552,
"grad_norm": 26.16228675842285,
"learning_rate": 4.514296613204532e-05,
"loss": 6.1622,
"step": 73
},
{
"epoch": 0.40827586206896554,
"grad_norm": 28.807147979736328,
"learning_rate": 4.471270511491525e-05,
"loss": 6.3984,
"step": 74
},
{
"epoch": 0.41379310344827586,
"grad_norm": 23.963605880737305,
"learning_rate": 4.427842179111221e-05,
"loss": 6.9674,
"step": 75
},
{
"epoch": 0.4193103448275862,
"grad_norm": 28.31536102294922,
"learning_rate": 4.3840234889360634e-05,
"loss": 7.5399,
"step": 76
},
{
"epoch": 0.42482758620689653,
"grad_norm": 17.923322677612305,
"learning_rate": 4.33982642055842e-05,
"loss": 6.0038,
"step": 77
},
{
"epoch": 0.4303448275862069,
"grad_norm": 21.877260208129883,
"learning_rate": 4.2952630570154785e-05,
"loss": 6.4209,
"step": 78
},
{
"epoch": 0.43586206896551727,
"grad_norm": 18.959318161010742,
"learning_rate": 4.250345581485871e-05,
"loss": 6.9811,
"step": 79
},
{
"epoch": 0.4413793103448276,
"grad_norm": 24.570106506347656,
"learning_rate": 4.205086273958909e-05,
"loss": 7.1458,
"step": 80
},
{
"epoch": 0.44689655172413795,
"grad_norm": 25.716232299804688,
"learning_rate": 4.1594975078773565e-05,
"loss": 6.8472,
"step": 81
},
{
"epoch": 0.45241379310344826,
"grad_norm": 21.59646987915039,
"learning_rate": 4.113591746754662e-05,
"loss": 6.0973,
"step": 82
},
{
"epoch": 0.4579310344827586,
"grad_norm": 24.108993530273438,
"learning_rate": 4.06738154076755e-05,
"loss": 5.961,
"step": 83
},
{
"epoch": 0.463448275862069,
"grad_norm": 23.924850463867188,
"learning_rate": 4.020879523324929e-05,
"loss": 5.7052,
"step": 84
},
{
"epoch": 0.4689655172413793,
"grad_norm": 18.18124008178711,
"learning_rate": 3.974098407614051e-05,
"loss": 5.0366,
"step": 85
},
{
"epoch": 0.47448275862068967,
"grad_norm": 24.268354415893555,
"learning_rate": 3.927050983124842e-05,
"loss": 5.2907,
"step": 86
},
{
"epoch": 0.48,
"grad_norm": 24.5296630859375,
"learning_rate": 3.8797501121533946e-05,
"loss": 6.244,
"step": 87
},
{
"epoch": 0.48551724137931035,
"grad_norm": 22.945091247558594,
"learning_rate": 3.832208726285534e-05,
"loss": 4.8176,
"step": 88
},
{
"epoch": 0.4910344827586207,
"grad_norm": 27.000350952148438,
"learning_rate": 3.784439822861459e-05,
"loss": 4.9201,
"step": 89
},
{
"epoch": 0.496551724137931,
"grad_norm": 40.46442413330078,
"learning_rate": 3.7364564614223976e-05,
"loss": 5.4079,
"step": 90
},
{
"epoch": 0.5020689655172413,
"grad_norm": 44.84676742553711,
"learning_rate": 3.688271760140255e-05,
"loss": 8.7851,
"step": 91
},
{
"epoch": 0.5075862068965518,
"grad_norm": 38.57492446899414,
"learning_rate": 3.6398988922312406e-05,
"loss": 8.1502,
"step": 92
},
{
"epoch": 0.5131034482758621,
"grad_norm": 23.454004287719727,
"learning_rate": 3.591351082354441e-05,
"loss": 6.3463,
"step": 93
},
{
"epoch": 0.5186206896551724,
"grad_norm": 21.795522689819336,
"learning_rate": 3.54264160299633e-05,
"loss": 7.3336,
"step": 94
},
{
"epoch": 0.5241379310344828,
"grad_norm": 18.910308837890625,
"learning_rate": 3.493783770842202e-05,
"loss": 5.9417,
"step": 95
},
{
"epoch": 0.5296551724137931,
"grad_norm": 20.068740844726562,
"learning_rate": 3.444790943135526e-05,
"loss": 7.0276,
"step": 96
},
{
"epoch": 0.5351724137931034,
"grad_norm": 16.92316436767578,
"learning_rate": 3.3956765140262074e-05,
"loss": 5.9382,
"step": 97
},
{
"epoch": 0.5406896551724137,
"grad_norm": 15.357147216796875,
"learning_rate": 3.346453910908759e-05,
"loss": 5.2924,
"step": 98
},
{
"epoch": 0.5462068965517242,
"grad_norm": 15.970285415649414,
"learning_rate": 3.297136590751389e-05,
"loss": 5.3965,
"step": 99
},
{
"epoch": 0.5517241379310345,
"grad_norm": 17.879304885864258,
"learning_rate": 3.247738036416998e-05,
"loss": 5.2484,
"step": 100
},
{
"epoch": 0.5517241379310345,
"eval_loss": 1.4723836183547974,
"eval_runtime": 22.2418,
"eval_samples_per_second": 13.713,
"eval_steps_per_second": 3.462,
"step": 100
},
{
"epoch": 0.5572413793103448,
"grad_norm": 16.214435577392578,
"learning_rate": 3.1982717529770985e-05,
"loss": 5.8451,
"step": 101
},
{
"epoch": 0.5627586206896552,
"grad_norm": 14.805839538574219,
"learning_rate": 3.148751264019667e-05,
"loss": 5.5145,
"step": 102
},
{
"epoch": 0.5682758620689655,
"grad_norm": 18.47629165649414,
"learning_rate": 3.099190107951924e-05,
"loss": 6.4664,
"step": 103
},
{
"epoch": 0.5737931034482758,
"grad_norm": 20.32671356201172,
"learning_rate": 3.049601834299076e-05,
"loss": 4.8782,
"step": 104
},
{
"epoch": 0.5793103448275863,
"grad_norm": 14.552179336547852,
"learning_rate": 3e-05,
"loss": 4.837,
"step": 105
},
{
"epoch": 0.5848275862068966,
"grad_norm": 16.728355407714844,
"learning_rate": 2.9503981657009246e-05,
"loss": 4.9243,
"step": 106
},
{
"epoch": 0.5903448275862069,
"grad_norm": 15.932158470153809,
"learning_rate": 2.9008098920480752e-05,
"loss": 5.0072,
"step": 107
},
{
"epoch": 0.5958620689655172,
"grad_norm": 14.092022895812988,
"learning_rate": 2.851248735980333e-05,
"loss": 4.4422,
"step": 108
},
{
"epoch": 0.6013793103448276,
"grad_norm": 14.16683292388916,
"learning_rate": 2.801728247022902e-05,
"loss": 5.0976,
"step": 109
},
{
"epoch": 0.6068965517241379,
"grad_norm": 15.384170532226562,
"learning_rate": 2.7522619635830034e-05,
"loss": 5.0471,
"step": 110
},
{
"epoch": 0.6124137931034482,
"grad_norm": 14.895685195922852,
"learning_rate": 2.702863409248612e-05,
"loss": 4.8804,
"step": 111
},
{
"epoch": 0.6179310344827587,
"grad_norm": 16.85907554626465,
"learning_rate": 2.6535460890912416e-05,
"loss": 5.7392,
"step": 112
},
{
"epoch": 0.623448275862069,
"grad_norm": 17.640703201293945,
"learning_rate": 2.604323485973793e-05,
"loss": 5.387,
"step": 113
},
{
"epoch": 0.6289655172413793,
"grad_norm": 16.179893493652344,
"learning_rate": 2.555209056864474e-05,
"loss": 5.212,
"step": 114
},
{
"epoch": 0.6344827586206897,
"grad_norm": 17.897340774536133,
"learning_rate": 2.5062162291577978e-05,
"loss": 4.851,
"step": 115
},
{
"epoch": 0.64,
"grad_norm": 17.833797454833984,
"learning_rate": 2.4573583970036712e-05,
"loss": 5.0259,
"step": 116
},
{
"epoch": 0.6455172413793103,
"grad_norm": 16.975656509399414,
"learning_rate": 2.4086489176455595e-05,
"loss": 4.5732,
"step": 117
},
{
"epoch": 0.6510344827586206,
"grad_norm": 17.853879928588867,
"learning_rate": 2.36010110776876e-05,
"loss": 4.9148,
"step": 118
},
{
"epoch": 0.6565517241379311,
"grad_norm": 17.815895080566406,
"learning_rate": 2.3117282398597456e-05,
"loss": 5.7234,
"step": 119
},
{
"epoch": 0.6620689655172414,
"grad_norm": 18.00694465637207,
"learning_rate": 2.263543538577603e-05,
"loss": 5.6019,
"step": 120
},
{
"epoch": 0.6675862068965517,
"grad_norm": 21.1002140045166,
"learning_rate": 2.215560177138541e-05,
"loss": 5.9787,
"step": 121
},
{
"epoch": 0.6731034482758621,
"grad_norm": 18.753433227539062,
"learning_rate": 2.167791273714467e-05,
"loss": 5.7944,
"step": 122
},
{
"epoch": 0.6786206896551724,
"grad_norm": 20.584749221801758,
"learning_rate": 2.1202498878466062e-05,
"loss": 5.4823,
"step": 123
},
{
"epoch": 0.6841379310344827,
"grad_norm": 21.03537940979004,
"learning_rate": 2.072949016875158e-05,
"loss": 5.1549,
"step": 124
},
{
"epoch": 0.6896551724137931,
"grad_norm": 19.885255813598633,
"learning_rate": 2.0259015923859498e-05,
"loss": 6.1,
"step": 125
},
{
"epoch": 0.6951724137931035,
"grad_norm": 20.665145874023438,
"learning_rate": 1.979120476675071e-05,
"loss": 5.7938,
"step": 126
},
{
"epoch": 0.7006896551724138,
"grad_norm": 23.18328094482422,
"learning_rate": 1.9326184592324503e-05,
"loss": 6.5563,
"step": 127
},
{
"epoch": 0.7062068965517241,
"grad_norm": 24.821685791015625,
"learning_rate": 1.8864082532453373e-05,
"loss": 5.9187,
"step": 128
},
{
"epoch": 0.7117241379310345,
"grad_norm": 20.381574630737305,
"learning_rate": 1.840502492122644e-05,
"loss": 5.0224,
"step": 129
},
{
"epoch": 0.7172413793103448,
"grad_norm": 25.46938705444336,
"learning_rate": 1.7949137260410924e-05,
"loss": 5.9761,
"step": 130
},
{
"epoch": 0.7227586206896551,
"grad_norm": 24.690874099731445,
"learning_rate": 1.7496544185141295e-05,
"loss": 6.0875,
"step": 131
},
{
"epoch": 0.7282758620689656,
"grad_norm": 20.75514793395996,
"learning_rate": 1.7047369429845216e-05,
"loss": 4.9455,
"step": 132
},
{
"epoch": 0.7337931034482759,
"grad_norm": 18.397716522216797,
"learning_rate": 1.6601735794415806e-05,
"loss": 3.7152,
"step": 133
},
{
"epoch": 0.7393103448275862,
"grad_norm": 24.97144317626953,
"learning_rate": 1.615976511063937e-05,
"loss": 4.8452,
"step": 134
},
{
"epoch": 0.7448275862068966,
"grad_norm": 32.98619842529297,
"learning_rate": 1.5721578208887793e-05,
"loss": 4.977,
"step": 135
},
{
"epoch": 0.7503448275862069,
"grad_norm": 23.18657684326172,
"learning_rate": 1.5287294885084766e-05,
"loss": 8.0823,
"step": 136
},
{
"epoch": 0.7558620689655172,
"grad_norm": 18.28014373779297,
"learning_rate": 1.4857033867954697e-05,
"loss": 7.2086,
"step": 137
},
{
"epoch": 0.7613793103448275,
"grad_norm": 16.946495056152344,
"learning_rate": 1.4430912786563554e-05,
"loss": 5.8691,
"step": 138
},
{
"epoch": 0.766896551724138,
"grad_norm": 15.615433692932129,
"learning_rate": 1.4009048138160195e-05,
"loss": 5.6876,
"step": 139
},
{
"epoch": 0.7724137931034483,
"grad_norm": 15.042305946350098,
"learning_rate": 1.3591555256327199e-05,
"loss": 5.1846,
"step": 140
},
{
"epoch": 0.7779310344827586,
"grad_norm": 14.744568824768066,
"learning_rate": 1.3178548279449822e-05,
"loss": 5.5824,
"step": 141
},
{
"epoch": 0.783448275862069,
"grad_norm": 15.294870376586914,
"learning_rate": 1.2770140119511693e-05,
"loss": 5.23,
"step": 142
},
{
"epoch": 0.7889655172413793,
"grad_norm": 15.637053489685059,
"learning_rate": 1.2366442431225809e-05,
"loss": 4.4227,
"step": 143
},
{
"epoch": 0.7944827586206896,
"grad_norm": 17.9448184967041,
"learning_rate": 1.1967565581509248e-05,
"loss": 6.4883,
"step": 144
},
{
"epoch": 0.8,
"grad_norm": 17.575916290283203,
"learning_rate": 1.1573618619309965e-05,
"loss": 4.7533,
"step": 145
},
{
"epoch": 0.8055172413793104,
"grad_norm": 12.778429985046387,
"learning_rate": 1.1184709245793889e-05,
"loss": 4.6147,
"step": 146
},
{
"epoch": 0.8110344827586207,
"grad_norm": 16.762371063232422,
"learning_rate": 1.0800943784900502e-05,
"loss": 5.0578,
"step": 147
},
{
"epoch": 0.8165517241379311,
"grad_norm": 15.866377830505371,
"learning_rate": 1.042242715427494e-05,
"loss": 4.8278,
"step": 148
},
{
"epoch": 0.8220689655172414,
"grad_norm": 16.081371307373047,
"learning_rate": 1.004926283658455e-05,
"loss": 5.0605,
"step": 149
},
{
"epoch": 0.8275862068965517,
"grad_norm": 17.62371826171875,
"learning_rate": 9.681552851227774e-06,
"loss": 4.6947,
"step": 150
},
{
"epoch": 0.8275862068965517,
"eval_loss": 1.2990185022354126,
"eval_runtime": 22.2401,
"eval_samples_per_second": 13.714,
"eval_steps_per_second": 3.462,
"step": 150
},
{
"epoch": 0.833103448275862,
"grad_norm": 15.228355407714844,
"learning_rate": 9.319397726443026e-06,
"loss": 4.537,
"step": 151
},
{
"epoch": 0.8386206896551724,
"grad_norm": 16.1376953125,
"learning_rate": 8.962896471825342e-06,
"loss": 4.2811,
"step": 152
},
{
"epoch": 0.8441379310344828,
"grad_norm": 17.54079246520996,
"learning_rate": 8.61214655125809e-06,
"loss": 5.1807,
"step": 153
},
{
"epoch": 0.8496551724137931,
"grad_norm": 15.536772727966309,
"learning_rate": 8.267243856267331e-06,
"loss": 4.5729,
"step": 154
},
{
"epoch": 0.8551724137931035,
"grad_norm": 19.280027389526367,
"learning_rate": 7.928282679806052e-06,
"loss": 4.8167,
"step": 155
},
{
"epoch": 0.8606896551724138,
"grad_norm": 16.457712173461914,
"learning_rate": 7.595355690475393e-06,
"loss": 4.7762,
"step": 156
},
{
"epoch": 0.8662068965517241,
"grad_norm": 18.711694717407227,
"learning_rate": 7.268553907189964e-06,
"loss": 4.7609,
"step": 157
},
{
"epoch": 0.8717241379310345,
"grad_norm": 15.429764747619629,
"learning_rate": 6.947966674294236e-06,
"loss": 4.4401,
"step": 158
},
{
"epoch": 0.8772413793103448,
"grad_norm": 18.525232315063477,
"learning_rate": 6.6336816371366305e-06,
"loss": 4.5169,
"step": 159
},
{
"epoch": 0.8827586206896552,
"grad_norm": 20.283931732177734,
"learning_rate": 6.325784718108196e-06,
"loss": 5.5306,
"step": 160
},
{
"epoch": 0.8882758620689655,
"grad_norm": 23.284629821777344,
"learning_rate": 6.0243600931522595e-06,
"loss": 4.9871,
"step": 161
},
{
"epoch": 0.8937931034482759,
"grad_norm": 16.778202056884766,
"learning_rate": 5.72949016875158e-06,
"loss": 5.2333,
"step": 162
},
{
"epoch": 0.8993103448275862,
"grad_norm": 21.578655242919922,
"learning_rate": 5.44125555939923e-06,
"loss": 5.4952,
"step": 163
},
{
"epoch": 0.9048275862068965,
"grad_norm": 18.722185134887695,
"learning_rate": 5.159735065559399e-06,
"loss": 4.6659,
"step": 164
},
{
"epoch": 0.9103448275862069,
"grad_norm": 19.489261627197266,
"learning_rate": 4.885005652124144e-06,
"loss": 4.7271,
"step": 165
},
{
"epoch": 0.9158620689655173,
"grad_norm": 19.362234115600586,
"learning_rate": 4.617142427371934e-06,
"loss": 4.8474,
"step": 166
},
{
"epoch": 0.9213793103448276,
"grad_norm": 21.526710510253906,
"learning_rate": 4.3562186224338265e-06,
"loss": 5.7556,
"step": 167
},
{
"epoch": 0.926896551724138,
"grad_norm": 18.980236053466797,
"learning_rate": 4.102305571272783e-06,
"loss": 5.0587,
"step": 168
},
{
"epoch": 0.9324137931034483,
"grad_norm": 18.678552627563477,
"learning_rate": 3.855472691181678e-06,
"loss": 4.3548,
"step": 169
},
{
"epoch": 0.9379310344827586,
"grad_norm": 25.089895248413086,
"learning_rate": 3.615787463805331e-06,
"loss": 5.2909,
"step": 170
},
{
"epoch": 0.9434482758620689,
"grad_norm": 24.198055267333984,
"learning_rate": 3.383315416691646e-06,
"loss": 4.7572,
"step": 171
},
{
"epoch": 0.9489655172413793,
"grad_norm": 19.62432861328125,
"learning_rate": 3.158120105377096e-06,
"loss": 5.0689,
"step": 172
},
{
"epoch": 0.9544827586206897,
"grad_norm": 20.130558013916016,
"learning_rate": 2.940263096011233e-06,
"loss": 4.3257,
"step": 173
},
{
"epoch": 0.96,
"grad_norm": 25.325763702392578,
"learning_rate": 2.729803948525125e-06,
"loss": 5.8623,
"step": 174
},
{
"epoch": 0.9655172413793104,
"grad_norm": 21.457735061645508,
"learning_rate": 2.526800200348275e-06,
"loss": 4.4341,
"step": 175
},
{
"epoch": 0.9710344827586207,
"grad_norm": 22.726511001586914,
"learning_rate": 2.3313073506784575e-06,
"loss": 4.9613,
"step": 176
},
{
"epoch": 0.976551724137931,
"grad_norm": 27.022869110107422,
"learning_rate": 2.143378845308791e-06,
"loss": 5.0634,
"step": 177
},
{
"epoch": 0.9820689655172414,
"grad_norm": 25.477567672729492,
"learning_rate": 1.9630660620161777e-06,
"loss": 4.4496,
"step": 178
},
{
"epoch": 0.9875862068965517,
"grad_norm": 24.415048599243164,
"learning_rate": 1.790418296515165e-06,
"loss": 4.149,
"step": 179
},
{
"epoch": 0.993103448275862,
"grad_norm": 34.604026794433594,
"learning_rate": 1.625482748980961e-06,
"loss": 5.0238,
"step": 180
},
{
"epoch": 0.9986206896551724,
"grad_norm": 16.760923385620117,
"learning_rate": 1.4683045111453942e-06,
"loss": 5.2373,
"step": 181
},
{
"epoch": 1.0041379310344827,
"grad_norm": 16.47662925720215,
"learning_rate": 1.3189265539692707e-06,
"loss": 4.4056,
"step": 182
},
{
"epoch": 1.0096551724137932,
"grad_norm": 16.61656951904297,
"learning_rate": 1.1773897158945557e-06,
"loss": 5.7166,
"step": 183
},
{
"epoch": 1.0151724137931035,
"grad_norm": 16.779102325439453,
"learning_rate": 1.0437326916795432e-06,
"loss": 5.355,
"step": 184
},
{
"epoch": 1.0206896551724138,
"grad_norm": 15.03419017791748,
"learning_rate": 9.179920218200888e-07,
"loss": 4.5611,
"step": 185
},
{
"epoch": 1.0262068965517241,
"grad_norm": 16.868610382080078,
"learning_rate": 8.002020825598277e-07,
"loss": 4.6968,
"step": 186
},
{
"epoch": 1.0317241379310345,
"grad_norm": 16.311511993408203,
"learning_rate": 6.90395076492022e-07,
"loss": 4.3882,
"step": 187
},
{
"epoch": 1.0372413793103448,
"grad_norm": 14.302980422973633,
"learning_rate": 5.886010237557194e-07,
"loss": 4.1967,
"step": 188
},
{
"epoch": 1.042758620689655,
"grad_norm": 15.19272232055664,
"learning_rate": 4.94847753828529e-07,
"loss": 3.7039,
"step": 189
},
{
"epoch": 1.0482758620689656,
"grad_norm": 13.517196655273438,
"learning_rate": 4.091608979183303e-07,
"loss": 3.6311,
"step": 190
},
{
"epoch": 1.053793103448276,
"grad_norm": 18.231815338134766,
"learning_rate": 3.315638819559452e-07,
"loss": 3.5249,
"step": 191
},
{
"epoch": 1.0593103448275862,
"grad_norm": 13.879122734069824,
"learning_rate": 2.6207792019074414e-07,
"loss": 3.1093,
"step": 192
},
{
"epoch": 1.0648275862068965,
"grad_norm": 14.321069717407227,
"learning_rate": 2.0072200939085573e-07,
"loss": 4.4564,
"step": 193
},
{
"epoch": 1.0703448275862069,
"grad_norm": 15.723977088928223,
"learning_rate": 1.475129236496575e-07,
"loss": 3.9035,
"step": 194
},
{
"epoch": 1.0758620689655172,
"grad_norm": 14.243907928466797,
"learning_rate": 1.0246520979990459e-07,
"loss": 3.327,
"step": 195
},
{
"epoch": 1.0813793103448275,
"grad_norm": 11.917428970336914,
"learning_rate": 6.559118343676396e-08,
"loss": 2.6171,
"step": 196
},
{
"epoch": 1.086896551724138,
"grad_norm": 11.877168655395508,
"learning_rate": 3.690092555085789e-08,
"loss": 2.8428,
"step": 197
},
{
"epoch": 1.0924137931034483,
"grad_norm": 12.58403491973877,
"learning_rate": 1.640227977221853e-08,
"loss": 3.6764,
"step": 198
},
{
"epoch": 1.0979310344827586,
"grad_norm": 12.414690017700195,
"learning_rate": 4.1008502259298755e-09,
"loss": 2.7963,
"step": 199
},
{
"epoch": 1.103448275862069,
"grad_norm": 11.689571380615234,
"learning_rate": 0.0,
"loss": 2.3991,
"step": 200
},
{
"epoch": 1.103448275862069,
"eval_loss": 1.2467516660690308,
"eval_runtime": 22.2287,
"eval_samples_per_second": 13.721,
"eval_steps_per_second": 3.464,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 4,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.8598115829284864e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}