MS2501-24b-Ink-apollo-ep1 / trainer_state.json
Fizzarolli's picture
Upload folder using huggingface_hub
e642053 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 544,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001838235294117647,
"grad_norm": 0.0,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.9627,
"step": 1
},
{
"epoch": 0.003676470588235294,
"grad_norm": 0.0,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.6036,
"step": 2
},
{
"epoch": 0.0055147058823529415,
"grad_norm": 0.0,
"learning_rate": 6.000000000000001e-07,
"loss": 1.7332,
"step": 3
},
{
"epoch": 0.007352941176470588,
"grad_norm": 0.0,
"learning_rate": 8.000000000000001e-07,
"loss": 1.8943,
"step": 4
},
{
"epoch": 0.009191176470588236,
"grad_norm": 0.0,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.9555,
"step": 5
},
{
"epoch": 0.011029411764705883,
"grad_norm": 0.0,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.972,
"step": 6
},
{
"epoch": 0.012867647058823529,
"grad_norm": 0.0,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.7141,
"step": 7
},
{
"epoch": 0.014705882352941176,
"grad_norm": 0.0,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.8038,
"step": 8
},
{
"epoch": 0.016544117647058824,
"grad_norm": 0.0,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.9202,
"step": 9
},
{
"epoch": 0.01838235294117647,
"grad_norm": 0.0,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.6748,
"step": 10
},
{
"epoch": 0.02022058823529412,
"grad_norm": 0.0,
"learning_rate": 2.2e-06,
"loss": 1.8197,
"step": 11
},
{
"epoch": 0.022058823529411766,
"grad_norm": 0.0,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.7396,
"step": 12
},
{
"epoch": 0.02389705882352941,
"grad_norm": 0.0,
"learning_rate": 2.6e-06,
"loss": 1.654,
"step": 13
},
{
"epoch": 0.025735294117647058,
"grad_norm": 0.0,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.7685,
"step": 14
},
{
"epoch": 0.027573529411764705,
"grad_norm": 0.0,
"learning_rate": 3e-06,
"loss": 1.5536,
"step": 15
},
{
"epoch": 0.029411764705882353,
"grad_norm": 0.0,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.7035,
"step": 16
},
{
"epoch": 0.03125,
"grad_norm": 0.0,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.9268,
"step": 17
},
{
"epoch": 0.03308823529411765,
"grad_norm": 0.0,
"learning_rate": 3.6000000000000003e-06,
"loss": 2.0697,
"step": 18
},
{
"epoch": 0.034926470588235295,
"grad_norm": 0.0,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.9225,
"step": 19
},
{
"epoch": 0.03676470588235294,
"grad_norm": 0.0,
"learning_rate": 4.000000000000001e-06,
"loss": 1.8257,
"step": 20
},
{
"epoch": 0.03860294117647059,
"grad_norm": 0.0,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.8509,
"step": 21
},
{
"epoch": 0.04044117647058824,
"grad_norm": 0.0,
"learning_rate": 4.4e-06,
"loss": 1.8072,
"step": 22
},
{
"epoch": 0.042279411764705885,
"grad_norm": 0.0,
"learning_rate": 4.600000000000001e-06,
"loss": 1.7604,
"step": 23
},
{
"epoch": 0.04411764705882353,
"grad_norm": 0.0,
"learning_rate": 4.800000000000001e-06,
"loss": 1.7735,
"step": 24
},
{
"epoch": 0.04595588235294118,
"grad_norm": 0.0,
"learning_rate": 5e-06,
"loss": 1.8244,
"step": 25
},
{
"epoch": 0.04779411764705882,
"grad_norm": 0.0,
"learning_rate": 4.999989082004443e-06,
"loss": 1.9756,
"step": 26
},
{
"epoch": 0.04963235294117647,
"grad_norm": 0.0,
"learning_rate": 4.999956328113134e-06,
"loss": 1.6347,
"step": 27
},
{
"epoch": 0.051470588235294115,
"grad_norm": 0.0,
"learning_rate": 4.999901738612159e-06,
"loss": 1.817,
"step": 28
},
{
"epoch": 0.05330882352941176,
"grad_norm": 0.0,
"learning_rate": 4.999825313978322e-06,
"loss": 1.6143,
"step": 29
},
{
"epoch": 0.05514705882352941,
"grad_norm": 0.0,
"learning_rate": 4.999727054879149e-06,
"loss": 1.7087,
"step": 30
},
{
"epoch": 0.05698529411764706,
"grad_norm": 0.0,
"learning_rate": 4.999606962172872e-06,
"loss": 1.9148,
"step": 31
},
{
"epoch": 0.058823529411764705,
"grad_norm": 0.0,
"learning_rate": 4.999465036908429e-06,
"loss": 1.7264,
"step": 32
},
{
"epoch": 0.06066176470588235,
"grad_norm": 0.0,
"learning_rate": 4.999301280325452e-06,
"loss": 1.5771,
"step": 33
},
{
"epoch": 0.0625,
"grad_norm": 0.0,
"learning_rate": 4.999115693854255e-06,
"loss": 1.5012,
"step": 34
},
{
"epoch": 0.06433823529411764,
"grad_norm": 0.0,
"learning_rate": 4.998908279115825e-06,
"loss": 1.8459,
"step": 35
},
{
"epoch": 0.0661764705882353,
"grad_norm": 0.0,
"learning_rate": 4.998679037921803e-06,
"loss": 1.7001,
"step": 36
},
{
"epoch": 0.06801470588235294,
"grad_norm": 0.0,
"learning_rate": 4.998427972274473e-06,
"loss": 1.719,
"step": 37
},
{
"epoch": 0.06985294117647059,
"grad_norm": 0.0,
"learning_rate": 4.998155084366744e-06,
"loss": 1.9945,
"step": 38
},
{
"epoch": 0.07169117647058823,
"grad_norm": 0.0,
"learning_rate": 4.997860376582123e-06,
"loss": 1.8024,
"step": 39
},
{
"epoch": 0.07352941176470588,
"grad_norm": 0.0,
"learning_rate": 4.997543851494709e-06,
"loss": 1.7099,
"step": 40
},
{
"epoch": 0.07536764705882353,
"grad_norm": 0.0,
"learning_rate": 4.9972055118691545e-06,
"loss": 1.5121,
"step": 41
},
{
"epoch": 0.07720588235294118,
"grad_norm": 0.0,
"learning_rate": 4.996845360660652e-06,
"loss": 1.7949,
"step": 42
},
{
"epoch": 0.07904411764705882,
"grad_norm": 0.0,
"learning_rate": 4.996463401014908e-06,
"loss": 1.6773,
"step": 43
},
{
"epoch": 0.08088235294117647,
"grad_norm": 0.0,
"learning_rate": 4.9960596362681054e-06,
"loss": 1.7022,
"step": 44
},
{
"epoch": 0.08272058823529412,
"grad_norm": 0.0,
"learning_rate": 4.9956340699468896e-06,
"loss": 1.7003,
"step": 45
},
{
"epoch": 0.08455882352941177,
"grad_norm": 0.0,
"learning_rate": 4.995186705768322e-06,
"loss": 1.5486,
"step": 46
},
{
"epoch": 0.08639705882352941,
"grad_norm": 0.0,
"learning_rate": 4.9947175476398606e-06,
"loss": 1.7186,
"step": 47
},
{
"epoch": 0.08823529411764706,
"grad_norm": 0.0,
"learning_rate": 4.994226599659319e-06,
"loss": 1.6925,
"step": 48
},
{
"epoch": 0.0900735294117647,
"grad_norm": 0.0,
"learning_rate": 4.993713866114829e-06,
"loss": 1.9749,
"step": 49
},
{
"epoch": 0.09191176470588236,
"grad_norm": 0.0,
"learning_rate": 4.993179351484811e-06,
"loss": 2.0403,
"step": 50
},
{
"epoch": 0.09375,
"grad_norm": 0.0,
"learning_rate": 4.9926230604379275e-06,
"loss": 1.8667,
"step": 51
},
{
"epoch": 0.09558823529411764,
"grad_norm": 0.0,
"learning_rate": 4.992044997833044e-06,
"loss": 1.8817,
"step": 52
},
{
"epoch": 0.0974264705882353,
"grad_norm": 0.0,
"learning_rate": 4.991445168719189e-06,
"loss": 1.909,
"step": 53
},
{
"epoch": 0.09926470588235294,
"grad_norm": 0.0,
"learning_rate": 4.9908235783355075e-06,
"loss": 1.6816,
"step": 54
},
{
"epoch": 0.10110294117647059,
"grad_norm": 0.0,
"learning_rate": 4.990180232111217e-06,
"loss": 1.627,
"step": 55
},
{
"epoch": 0.10294117647058823,
"grad_norm": 0.0,
"learning_rate": 4.989515135665558e-06,
"loss": 1.7085,
"step": 56
},
{
"epoch": 0.10477941176470588,
"grad_norm": 0.0,
"learning_rate": 4.988828294807746e-06,
"loss": 1.884,
"step": 57
},
{
"epoch": 0.10661764705882353,
"grad_norm": 0.0,
"learning_rate": 4.988119715536922e-06,
"loss": 1.7246,
"step": 58
},
{
"epoch": 0.10845588235294118,
"grad_norm": 0.0,
"learning_rate": 4.987389404042098e-06,
"loss": 1.7751,
"step": 59
},
{
"epoch": 0.11029411764705882,
"grad_norm": 0.0,
"learning_rate": 4.986637366702105e-06,
"loss": 1.7463,
"step": 60
},
{
"epoch": 0.11213235294117647,
"grad_norm": 0.0,
"learning_rate": 4.985863610085534e-06,
"loss": 1.695,
"step": 61
},
{
"epoch": 0.11397058823529412,
"grad_norm": 0.0,
"learning_rate": 4.985068140950683e-06,
"loss": 1.8484,
"step": 62
},
{
"epoch": 0.11580882352941177,
"grad_norm": 0.0,
"learning_rate": 4.984250966245495e-06,
"loss": 1.9041,
"step": 63
},
{
"epoch": 0.11764705882352941,
"grad_norm": 0.0,
"learning_rate": 4.983412093107496e-06,
"loss": 1.9226,
"step": 64
},
{
"epoch": 0.11948529411764706,
"grad_norm": 0.0,
"learning_rate": 4.982551528863738e-06,
"loss": 1.8654,
"step": 65
},
{
"epoch": 0.1213235294117647,
"grad_norm": 0.0,
"learning_rate": 4.981669281030731e-06,
"loss": 1.7118,
"step": 66
},
{
"epoch": 0.12316176470588236,
"grad_norm": 0.0,
"learning_rate": 4.980765357314376e-06,
"loss": 1.7896,
"step": 67
},
{
"epoch": 0.125,
"grad_norm": 0.0,
"learning_rate": 4.9798397656099005e-06,
"loss": 1.5778,
"step": 68
},
{
"epoch": 0.12683823529411764,
"grad_norm": 0.0,
"learning_rate": 4.978892514001792e-06,
"loss": 1.9112,
"step": 69
},
{
"epoch": 0.12867647058823528,
"grad_norm": 0.0,
"learning_rate": 4.977923610763719e-06,
"loss": 1.8526,
"step": 70
},
{
"epoch": 0.13051470588235295,
"grad_norm": 0.0,
"learning_rate": 4.976933064358467e-06,
"loss": 1.6893,
"step": 71
},
{
"epoch": 0.1323529411764706,
"grad_norm": 0.0,
"learning_rate": 4.975920883437862e-06,
"loss": 1.8083,
"step": 72
},
{
"epoch": 0.13419117647058823,
"grad_norm": 0.0,
"learning_rate": 4.974887076842694e-06,
"loss": 1.9307,
"step": 73
},
{
"epoch": 0.13602941176470587,
"grad_norm": 0.0,
"learning_rate": 4.973831653602637e-06,
"loss": 1.828,
"step": 74
},
{
"epoch": 0.13786764705882354,
"grad_norm": 0.0,
"learning_rate": 4.972754622936178e-06,
"loss": 1.7038,
"step": 75
},
{
"epoch": 0.13970588235294118,
"grad_norm": 0.0,
"learning_rate": 4.971655994250529e-06,
"loss": 1.6064,
"step": 76
},
{
"epoch": 0.14154411764705882,
"grad_norm": 0.0,
"learning_rate": 4.97053577714155e-06,
"loss": 1.7252,
"step": 77
},
{
"epoch": 0.14338235294117646,
"grad_norm": 0.0,
"learning_rate": 4.96939398139366e-06,
"loss": 1.979,
"step": 78
},
{
"epoch": 0.14522058823529413,
"grad_norm": 0.0,
"learning_rate": 4.968230616979755e-06,
"loss": 1.9525,
"step": 79
},
{
"epoch": 0.14705882352941177,
"grad_norm": 0.0,
"learning_rate": 4.967045694061122e-06,
"loss": 1.912,
"step": 80
},
{
"epoch": 0.1488970588235294,
"grad_norm": 0.0,
"learning_rate": 4.965839222987348e-06,
"loss": 1.6298,
"step": 81
},
{
"epoch": 0.15073529411764705,
"grad_norm": 0.0,
"learning_rate": 4.9646112142962295e-06,
"loss": 1.6537,
"step": 82
},
{
"epoch": 0.15257352941176472,
"grad_norm": 0.0,
"learning_rate": 4.96336167871368e-06,
"loss": 1.9089,
"step": 83
},
{
"epoch": 0.15441176470588236,
"grad_norm": 0.0,
"learning_rate": 4.96209062715364e-06,
"loss": 1.8703,
"step": 84
},
{
"epoch": 0.15625,
"grad_norm": 0.0,
"learning_rate": 4.960798070717977e-06,
"loss": 1.9559,
"step": 85
},
{
"epoch": 0.15808823529411764,
"grad_norm": 0.0,
"learning_rate": 4.959484020696392e-06,
"loss": 1.8444,
"step": 86
},
{
"epoch": 0.15992647058823528,
"grad_norm": 0.0,
"learning_rate": 4.9581484885663175e-06,
"loss": 1.6396,
"step": 87
},
{
"epoch": 0.16176470588235295,
"grad_norm": 0.0,
"learning_rate": 4.956791485992823e-06,
"loss": 1.9488,
"step": 88
},
{
"epoch": 0.1636029411764706,
"grad_norm": 0.0,
"learning_rate": 4.955413024828504e-06,
"loss": 1.8038,
"step": 89
},
{
"epoch": 0.16544117647058823,
"grad_norm": 0.0,
"learning_rate": 4.9540131171133884e-06,
"loss": 1.7477,
"step": 90
},
{
"epoch": 0.16727941176470587,
"grad_norm": 0.0,
"learning_rate": 4.952591775074825e-06,
"loss": 1.7757,
"step": 91
},
{
"epoch": 0.16911764705882354,
"grad_norm": 0.0,
"learning_rate": 4.951149011127379e-06,
"loss": 1.7452,
"step": 92
},
{
"epoch": 0.17095588235294118,
"grad_norm": 0.0,
"learning_rate": 4.949684837872723e-06,
"loss": 1.6137,
"step": 93
},
{
"epoch": 0.17279411764705882,
"grad_norm": 0.0,
"learning_rate": 4.948199268099525e-06,
"loss": 1.8074,
"step": 94
},
{
"epoch": 0.17463235294117646,
"grad_norm": 0.0,
"learning_rate": 4.946692314783342e-06,
"loss": 1.7006,
"step": 95
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.0,
"learning_rate": 4.9451639910865016e-06,
"loss": 1.6746,
"step": 96
},
{
"epoch": 0.17830882352941177,
"grad_norm": 0.0,
"learning_rate": 4.943614310357987e-06,
"loss": 1.5338,
"step": 97
},
{
"epoch": 0.1801470588235294,
"grad_norm": 0.0,
"learning_rate": 4.942043286133326e-06,
"loss": 1.639,
"step": 98
},
{
"epoch": 0.18198529411764705,
"grad_norm": 0.0,
"learning_rate": 4.940450932134467e-06,
"loss": 1.8445,
"step": 99
},
{
"epoch": 0.18382352941176472,
"grad_norm": 0.0,
"learning_rate": 4.9388372622696605e-06,
"loss": 1.6577,
"step": 100
},
{
"epoch": 0.18566176470588236,
"grad_norm": 0.0,
"learning_rate": 4.937202290633337e-06,
"loss": 1.9322,
"step": 101
},
{
"epoch": 0.1875,
"grad_norm": 0.0,
"learning_rate": 4.935546031505991e-06,
"loss": 1.8592,
"step": 102
},
{
"epoch": 0.18933823529411764,
"grad_norm": 0.0,
"learning_rate": 4.933868499354043e-06,
"loss": 1.8238,
"step": 103
},
{
"epoch": 0.19117647058823528,
"grad_norm": 0.0,
"learning_rate": 4.932169708829725e-06,
"loss": 1.8892,
"step": 104
},
{
"epoch": 0.19301470588235295,
"grad_norm": 0.0,
"learning_rate": 4.930449674770947e-06,
"loss": 1.6668,
"step": 105
},
{
"epoch": 0.1948529411764706,
"grad_norm": 0.0,
"learning_rate": 4.928708412201169e-06,
"loss": 2.0112,
"step": 106
},
{
"epoch": 0.19669117647058823,
"grad_norm": 0.0,
"learning_rate": 4.926945936329266e-06,
"loss": 1.8705,
"step": 107
},
{
"epoch": 0.19852941176470587,
"grad_norm": 0.0,
"learning_rate": 4.925162262549405e-06,
"loss": 1.8025,
"step": 108
},
{
"epoch": 0.20036764705882354,
"grad_norm": 0.0,
"learning_rate": 4.923357406440896e-06,
"loss": 1.9824,
"step": 109
},
{
"epoch": 0.20220588235294118,
"grad_norm": 0.0,
"learning_rate": 4.921531383768071e-06,
"loss": 1.6375,
"step": 110
},
{
"epoch": 0.20404411764705882,
"grad_norm": 0.0,
"learning_rate": 4.919684210480134e-06,
"loss": 1.9491,
"step": 111
},
{
"epoch": 0.20588235294117646,
"grad_norm": 0.0,
"learning_rate": 4.917815902711029e-06,
"loss": 2.0238,
"step": 112
},
{
"epoch": 0.20772058823529413,
"grad_norm": 0.0,
"learning_rate": 4.915926476779297e-06,
"loss": 1.8125,
"step": 113
},
{
"epoch": 0.20955882352941177,
"grad_norm": 0.0,
"learning_rate": 4.914015949187934e-06,
"loss": 1.7428,
"step": 114
},
{
"epoch": 0.2113970588235294,
"grad_norm": 0.0,
"learning_rate": 4.912084336624243e-06,
"loss": 1.6849,
"step": 115
},
{
"epoch": 0.21323529411764705,
"grad_norm": 0.0,
"learning_rate": 4.910131655959697e-06,
"loss": 1.8793,
"step": 116
},
{
"epoch": 0.21507352941176472,
"grad_norm": 0.0,
"learning_rate": 4.908157924249781e-06,
"loss": 1.9526,
"step": 117
},
{
"epoch": 0.21691176470588236,
"grad_norm": 0.0,
"learning_rate": 4.906163158733851e-06,
"loss": 1.7916,
"step": 118
},
{
"epoch": 0.21875,
"grad_norm": 0.0,
"learning_rate": 4.904147376834979e-06,
"loss": 1.8987,
"step": 119
},
{
"epoch": 0.22058823529411764,
"grad_norm": 0.0,
"learning_rate": 4.9021105961598046e-06,
"loss": 1.877,
"step": 120
},
{
"epoch": 0.22242647058823528,
"grad_norm": 0.0,
"learning_rate": 4.900052834498377e-06,
"loss": 1.7896,
"step": 121
},
{
"epoch": 0.22426470588235295,
"grad_norm": 0.0,
"learning_rate": 4.897974109824002e-06,
"loss": 1.8914,
"step": 122
},
{
"epoch": 0.2261029411764706,
"grad_norm": 0.0,
"learning_rate": 4.895874440293085e-06,
"loss": 1.7302,
"step": 123
},
{
"epoch": 0.22794117647058823,
"grad_norm": 0.0,
"learning_rate": 4.8937538442449724e-06,
"loss": 1.807,
"step": 124
},
{
"epoch": 0.22977941176470587,
"grad_norm": 0.0,
"learning_rate": 4.891612340201791e-06,
"loss": 1.6097,
"step": 125
},
{
"epoch": 0.23161764705882354,
"grad_norm": 0.0,
"learning_rate": 4.8894499468682865e-06,
"loss": 1.9383,
"step": 126
},
{
"epoch": 0.23345588235294118,
"grad_norm": 0.0,
"learning_rate": 4.887266683131659e-06,
"loss": 1.6959,
"step": 127
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.0,
"learning_rate": 4.885062568061399e-06,
"loss": 1.9403,
"step": 128
},
{
"epoch": 0.23713235294117646,
"grad_norm": 0.0,
"learning_rate": 4.882837620909121e-06,
"loss": 1.6888,
"step": 129
},
{
"epoch": 0.23897058823529413,
"grad_norm": 0.0,
"learning_rate": 4.880591861108397e-06,
"loss": 1.7798,
"step": 130
},
{
"epoch": 0.24080882352941177,
"grad_norm": 0.0,
"learning_rate": 4.878325308274583e-06,
"loss": 1.796,
"step": 131
},
{
"epoch": 0.2426470588235294,
"grad_norm": 0.0,
"learning_rate": 4.876037982204649e-06,
"loss": 1.6234,
"step": 132
},
{
"epoch": 0.24448529411764705,
"grad_norm": 0.0,
"learning_rate": 4.873729902877009e-06,
"loss": 1.8065,
"step": 133
},
{
"epoch": 0.24632352941176472,
"grad_norm": 0.0,
"learning_rate": 4.871401090451342e-06,
"loss": 1.9266,
"step": 134
},
{
"epoch": 0.24816176470588236,
"grad_norm": 0.0,
"learning_rate": 4.869051565268419e-06,
"loss": 1.772,
"step": 135
},
{
"epoch": 0.25,
"grad_norm": 0.0,
"learning_rate": 4.866681347849925e-06,
"loss": 1.5869,
"step": 136
},
{
"epoch": 0.25183823529411764,
"grad_norm": 0.0,
"learning_rate": 4.8642904588982785e-06,
"loss": 1.9012,
"step": 137
},
{
"epoch": 0.2536764705882353,
"grad_norm": 0.0,
"learning_rate": 4.861878919296451e-06,
"loss": 1.9242,
"step": 138
},
{
"epoch": 0.2555147058823529,
"grad_norm": 0.0,
"learning_rate": 4.859446750107786e-06,
"loss": 2.0885,
"step": 139
},
{
"epoch": 0.25735294117647056,
"grad_norm": 0.0,
"learning_rate": 4.856993972575813e-06,
"loss": 1.5305,
"step": 140
},
{
"epoch": 0.25919117647058826,
"grad_norm": 0.0,
"learning_rate": 4.854520608124063e-06,
"loss": 1.8923,
"step": 141
},
{
"epoch": 0.2610294117647059,
"grad_norm": 0.0,
"learning_rate": 4.8520266783558825e-06,
"loss": 1.8581,
"step": 142
},
{
"epoch": 0.26286764705882354,
"grad_norm": 0.0,
"learning_rate": 4.849512205054242e-06,
"loss": 1.6467,
"step": 143
},
{
"epoch": 0.2647058823529412,
"grad_norm": 0.0,
"learning_rate": 4.846977210181549e-06,
"loss": 1.8146,
"step": 144
},
{
"epoch": 0.2665441176470588,
"grad_norm": 0.0,
"learning_rate": 4.844421715879453e-06,
"loss": 1.555,
"step": 145
},
{
"epoch": 0.26838235294117646,
"grad_norm": 0.0,
"learning_rate": 4.841845744468655e-06,
"loss": 1.7029,
"step": 146
},
{
"epoch": 0.2702205882352941,
"grad_norm": 0.0,
"learning_rate": 4.83924931844871e-06,
"loss": 1.7241,
"step": 147
},
{
"epoch": 0.27205882352941174,
"grad_norm": 0.0,
"learning_rate": 4.836632460497832e-06,
"loss": 1.667,
"step": 148
},
{
"epoch": 0.27389705882352944,
"grad_norm": 0.0,
"learning_rate": 4.833995193472697e-06,
"loss": 1.5294,
"step": 149
},
{
"epoch": 0.2757352941176471,
"grad_norm": 0.0,
"learning_rate": 4.831337540408239e-06,
"loss": 1.7341,
"step": 150
},
{
"epoch": 0.2775735294117647,
"grad_norm": 0.0,
"learning_rate": 4.828659524517455e-06,
"loss": 1.7731,
"step": 151
},
{
"epoch": 0.27941176470588236,
"grad_norm": 0.0,
"learning_rate": 4.825961169191196e-06,
"loss": 1.891,
"step": 152
},
{
"epoch": 0.28125,
"grad_norm": 0.0,
"learning_rate": 4.8232424979979684e-06,
"loss": 1.5459,
"step": 153
},
{
"epoch": 0.28308823529411764,
"grad_norm": 0.0,
"learning_rate": 4.820503534683725e-06,
"loss": 1.7663,
"step": 154
},
{
"epoch": 0.2849264705882353,
"grad_norm": 0.0,
"learning_rate": 4.8177443031716545e-06,
"loss": 1.9843,
"step": 155
},
{
"epoch": 0.2867647058823529,
"grad_norm": 0.0,
"learning_rate": 4.814964827561981e-06,
"loss": 1.9345,
"step": 156
},
{
"epoch": 0.28860294117647056,
"grad_norm": 0.0,
"learning_rate": 4.812165132131746e-06,
"loss": 1.651,
"step": 157
},
{
"epoch": 0.29044117647058826,
"grad_norm": 0.0,
"learning_rate": 4.809345241334598e-06,
"loss": 1.7562,
"step": 158
},
{
"epoch": 0.2922794117647059,
"grad_norm": 0.0,
"learning_rate": 4.806505179800583e-06,
"loss": 1.7144,
"step": 159
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.0,
"learning_rate": 4.803644972335925e-06,
"loss": 1.8868,
"step": 160
},
{
"epoch": 0.2959558823529412,
"grad_norm": 0.0,
"learning_rate": 4.800764643922806e-06,
"loss": 1.7201,
"step": 161
},
{
"epoch": 0.2977941176470588,
"grad_norm": 0.0,
"learning_rate": 4.797864219719161e-06,
"loss": 2.0389,
"step": 162
},
{
"epoch": 0.29963235294117646,
"grad_norm": 0.0,
"learning_rate": 4.794943725058441e-06,
"loss": 1.7262,
"step": 163
},
{
"epoch": 0.3014705882352941,
"grad_norm": 0.0,
"learning_rate": 4.792003185449406e-06,
"loss": 1.8069,
"step": 164
},
{
"epoch": 0.30330882352941174,
"grad_norm": 0.0,
"learning_rate": 4.789042626575895e-06,
"loss": 1.8573,
"step": 165
},
{
"epoch": 0.30514705882352944,
"grad_norm": 0.0,
"learning_rate": 4.786062074296602e-06,
"loss": 1.7839,
"step": 166
},
{
"epoch": 0.3069852941176471,
"grad_norm": 0.0,
"learning_rate": 4.783061554644853e-06,
"loss": 1.8461,
"step": 167
},
{
"epoch": 0.3088235294117647,
"grad_norm": 0.0,
"learning_rate": 4.780041093828376e-06,
"loss": 1.7538,
"step": 168
},
{
"epoch": 0.31066176470588236,
"grad_norm": 0.0,
"learning_rate": 4.777000718229072e-06,
"loss": 1.6497,
"step": 169
},
{
"epoch": 0.3125,
"grad_norm": 0.0,
"learning_rate": 4.773940454402789e-06,
"loss": 1.6723,
"step": 170
},
{
"epoch": 0.31433823529411764,
"grad_norm": 0.0,
"learning_rate": 4.770860329079083e-06,
"loss": 1.8927,
"step": 171
},
{
"epoch": 0.3161764705882353,
"grad_norm": 0.0,
"learning_rate": 4.7677603691609905e-06,
"loss": 1.7236,
"step": 172
},
{
"epoch": 0.3180147058823529,
"grad_norm": 0.0,
"learning_rate": 4.7646406017247895e-06,
"loss": 1.971,
"step": 173
},
{
"epoch": 0.31985294117647056,
"grad_norm": 0.0,
"learning_rate": 4.761501054019766e-06,
"loss": 1.8082,
"step": 174
},
{
"epoch": 0.32169117647058826,
"grad_norm": 0.0,
"learning_rate": 4.758341753467975e-06,
"loss": 1.9078,
"step": 175
},
{
"epoch": 0.3235294117647059,
"grad_norm": 0.0,
"learning_rate": 4.755162727663998e-06,
"loss": 1.6387,
"step": 176
},
{
"epoch": 0.32536764705882354,
"grad_norm": 0.0,
"learning_rate": 4.751964004374709e-06,
"loss": 2.0215,
"step": 177
},
{
"epoch": 0.3272058823529412,
"grad_norm": 0.0,
"learning_rate": 4.748745611539024e-06,
"loss": 1.8042,
"step": 178
},
{
"epoch": 0.3290441176470588,
"grad_norm": 0.0,
"learning_rate": 4.745507577267663e-06,
"loss": 2.1742,
"step": 179
},
{
"epoch": 0.33088235294117646,
"grad_norm": 0.0,
"learning_rate": 4.7422499298429e-06,
"loss": 1.8744,
"step": 180
},
{
"epoch": 0.3327205882352941,
"grad_norm": 0.0,
"learning_rate": 4.738972697718319e-06,
"loss": 1.7443,
"step": 181
},
{
"epoch": 0.33455882352941174,
"grad_norm": 0.0,
"learning_rate": 4.735675909518565e-06,
"loss": 1.9355,
"step": 182
},
{
"epoch": 0.33639705882352944,
"grad_norm": 0.0,
"learning_rate": 4.732359594039094e-06,
"loss": 1.5572,
"step": 183
},
{
"epoch": 0.3382352941176471,
"grad_norm": 0.0,
"learning_rate": 4.729023780245919e-06,
"loss": 1.6816,
"step": 184
},
{
"epoch": 0.3400735294117647,
"grad_norm": 0.0,
"learning_rate": 4.725668497275361e-06,
"loss": 1.8776,
"step": 185
},
{
"epoch": 0.34191176470588236,
"grad_norm": 0.0,
"learning_rate": 4.72229377443379e-06,
"loss": 1.7296,
"step": 186
},
{
"epoch": 0.34375,
"grad_norm": 0.0,
"learning_rate": 4.718899641197375e-06,
"loss": 1.8298,
"step": 187
},
{
"epoch": 0.34558823529411764,
"grad_norm": 0.0,
"learning_rate": 4.71548612721182e-06,
"loss": 1.7406,
"step": 188
},
{
"epoch": 0.3474264705882353,
"grad_norm": 0.0,
"learning_rate": 4.712053262292111e-06,
"loss": 1.953,
"step": 189
},
{
"epoch": 0.3492647058823529,
"grad_norm": 0.0,
"learning_rate": 4.70860107642225e-06,
"loss": 1.8692,
"step": 190
},
{
"epoch": 0.35110294117647056,
"grad_norm": 0.0,
"learning_rate": 4.7051295997549964e-06,
"loss": 1.8754,
"step": 191
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.0,
"learning_rate": 4.701638862611605e-06,
"loss": 1.8684,
"step": 192
},
{
"epoch": 0.3547794117647059,
"grad_norm": 0.0,
"learning_rate": 4.698128895481557e-06,
"loss": 1.5358,
"step": 193
},
{
"epoch": 0.35661764705882354,
"grad_norm": 0.0,
"learning_rate": 4.694599729022297e-06,
"loss": 1.56,
"step": 194
},
{
"epoch": 0.3584558823529412,
"grad_norm": 0.0,
"learning_rate": 4.691051394058965e-06,
"loss": 1.7223,
"step": 195
},
{
"epoch": 0.3602941176470588,
"grad_norm": 0.0,
"learning_rate": 4.687483921584124e-06,
"loss": 1.6848,
"step": 196
},
{
"epoch": 0.36213235294117646,
"grad_norm": 0.0,
"learning_rate": 4.683897342757493e-06,
"loss": 1.654,
"step": 197
},
{
"epoch": 0.3639705882352941,
"grad_norm": 0.0,
"learning_rate": 4.680291688905674e-06,
"loss": 1.6973,
"step": 198
},
{
"epoch": 0.36580882352941174,
"grad_norm": 0.0,
"learning_rate": 4.676666991521876e-06,
"loss": 1.6474,
"step": 199
},
{
"epoch": 0.36764705882352944,
"grad_norm": 0.0,
"learning_rate": 4.673023282265645e-06,
"loss": 1.5936,
"step": 200
},
{
"epoch": 0.3694852941176471,
"grad_norm": 0.0,
"learning_rate": 4.669360592962581e-06,
"loss": 1.6647,
"step": 201
},
{
"epoch": 0.3713235294117647,
"grad_norm": 0.0,
"learning_rate": 4.665678955604064e-06,
"loss": 1.9738,
"step": 202
},
{
"epoch": 0.37316176470588236,
"grad_norm": 0.0,
"learning_rate": 4.661978402346974e-06,
"loss": 1.7933,
"step": 203
},
{
"epoch": 0.375,
"grad_norm": 0.0,
"learning_rate": 4.658258965513412e-06,
"loss": 1.9133,
"step": 204
},
{
"epoch": 0.37683823529411764,
"grad_norm": 0.0,
"learning_rate": 4.654520677590412e-06,
"loss": 1.8377,
"step": 205
},
{
"epoch": 0.3786764705882353,
"grad_norm": 0.0,
"learning_rate": 4.650763571229664e-06,
"loss": 1.79,
"step": 206
},
{
"epoch": 0.3805147058823529,
"grad_norm": 0.0,
"learning_rate": 4.646987679247223e-06,
"loss": 1.5877,
"step": 207
},
{
"epoch": 0.38235294117647056,
"grad_norm": 0.0,
"learning_rate": 4.643193034623229e-06,
"loss": 1.7125,
"step": 208
},
{
"epoch": 0.38419117647058826,
"grad_norm": 0.0,
"learning_rate": 4.6393796705016105e-06,
"loss": 1.7207,
"step": 209
},
{
"epoch": 0.3860294117647059,
"grad_norm": 0.0,
"learning_rate": 4.635547620189802e-06,
"loss": 1.6849,
"step": 210
},
{
"epoch": 0.38786764705882354,
"grad_norm": 0.0,
"learning_rate": 4.631696917158449e-06,
"loss": 1.716,
"step": 211
},
{
"epoch": 0.3897058823529412,
"grad_norm": 0.0,
"learning_rate": 4.62782759504112e-06,
"loss": 1.7206,
"step": 212
},
{
"epoch": 0.3915441176470588,
"grad_norm": 0.0,
"learning_rate": 4.623939687634009e-06,
"loss": 1.4938,
"step": 213
},
{
"epoch": 0.39338235294117646,
"grad_norm": 0.0,
"learning_rate": 4.620033228895639e-06,
"loss": 1.9391,
"step": 214
},
{
"epoch": 0.3952205882352941,
"grad_norm": 0.0,
"learning_rate": 4.616108252946568e-06,
"loss": 1.688,
"step": 215
},
{
"epoch": 0.39705882352941174,
"grad_norm": 0.0,
"learning_rate": 4.612164794069096e-06,
"loss": 1.9585,
"step": 216
},
{
"epoch": 0.39889705882352944,
"grad_norm": 0.0,
"learning_rate": 4.608202886706953e-06,
"loss": 1.6469,
"step": 217
},
{
"epoch": 0.4007352941176471,
"grad_norm": 0.0,
"learning_rate": 4.6042225654650096e-06,
"loss": 1.8181,
"step": 218
},
{
"epoch": 0.4025735294117647,
"grad_norm": 0.0,
"learning_rate": 4.60022386510897e-06,
"loss": 1.8259,
"step": 219
},
{
"epoch": 0.40441176470588236,
"grad_norm": 0.0,
"learning_rate": 4.5962068205650674e-06,
"loss": 1.8962,
"step": 220
},
{
"epoch": 0.40625,
"grad_norm": 0.0,
"learning_rate": 4.592171466919762e-06,
"loss": 1.868,
"step": 221
},
{
"epoch": 0.40808823529411764,
"grad_norm": 0.0,
"learning_rate": 4.588117839419432e-06,
"loss": 1.7946,
"step": 222
},
{
"epoch": 0.4099264705882353,
"grad_norm": 0.0,
"learning_rate": 4.584045973470067e-06,
"loss": 1.6068,
"step": 223
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.0,
"learning_rate": 4.579955904636959e-06,
"loss": 1.8194,
"step": 224
},
{
"epoch": 0.41360294117647056,
"grad_norm": 0.0,
"learning_rate": 4.5758476686443905e-06,
"loss": 1.8958,
"step": 225
},
{
"epoch": 0.41544117647058826,
"grad_norm": 0.0,
"learning_rate": 4.571721301375323e-06,
"loss": 1.5318,
"step": 226
},
{
"epoch": 0.4172794117647059,
"grad_norm": 0.0,
"learning_rate": 4.5675768388710855e-06,
"loss": 1.6046,
"step": 227
},
{
"epoch": 0.41911764705882354,
"grad_norm": 0.0,
"learning_rate": 4.563414317331053e-06,
"loss": 1.6724,
"step": 228
},
{
"epoch": 0.4209558823529412,
"grad_norm": 0.0,
"learning_rate": 4.559233773112343e-06,
"loss": 1.8096,
"step": 229
},
{
"epoch": 0.4227941176470588,
"grad_norm": 0.0,
"learning_rate": 4.5550352427294836e-06,
"loss": 1.4821,
"step": 230
},
{
"epoch": 0.42463235294117646,
"grad_norm": 0.0,
"learning_rate": 4.550818762854105e-06,
"loss": 1.6695,
"step": 231
},
{
"epoch": 0.4264705882352941,
"grad_norm": 0.0,
"learning_rate": 4.546584370314613e-06,
"loss": 1.6973,
"step": 232
},
{
"epoch": 0.42830882352941174,
"grad_norm": 0.0,
"learning_rate": 4.542332102095871e-06,
"loss": 1.8328,
"step": 233
},
{
"epoch": 0.43014705882352944,
"grad_norm": 0.0,
"learning_rate": 4.538061995338875e-06,
"loss": 1.6589,
"step": 234
},
{
"epoch": 0.4319852941176471,
"grad_norm": 0.0,
"learning_rate": 4.533774087340431e-06,
"loss": 1.7145,
"step": 235
},
{
"epoch": 0.4338235294117647,
"grad_norm": 0.0,
"learning_rate": 4.529468415552829e-06,
"loss": 1.4717,
"step": 236
},
{
"epoch": 0.43566176470588236,
"grad_norm": 0.0,
"learning_rate": 4.52514501758351e-06,
"loss": 1.7362,
"step": 237
},
{
"epoch": 0.4375,
"grad_norm": 0.0,
"learning_rate": 4.520803931194747e-06,
"loss": 1.8571,
"step": 238
},
{
"epoch": 0.43933823529411764,
"grad_norm": 0.0,
"learning_rate": 4.5164451943033105e-06,
"loss": 1.9605,
"step": 239
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.0,
"learning_rate": 4.512068844980136e-06,
"loss": 1.9368,
"step": 240
},
{
"epoch": 0.4430147058823529,
"grad_norm": 0.0,
"learning_rate": 4.507674921449994e-06,
"loss": 1.45,
"step": 241
},
{
"epoch": 0.44485294117647056,
"grad_norm": 0.0,
"learning_rate": 4.503263462091153e-06,
"loss": 1.6417,
"step": 242
},
{
"epoch": 0.44669117647058826,
"grad_norm": 0.0,
"learning_rate": 4.49883450543505e-06,
"loss": 1.7531,
"step": 243
},
{
"epoch": 0.4485294117647059,
"grad_norm": 0.0,
"learning_rate": 4.494388090165947e-06,
"loss": 1.826,
"step": 244
},
{
"epoch": 0.45036764705882354,
"grad_norm": 0.0,
"learning_rate": 4.489924255120597e-06,
"loss": 1.5047,
"step": 245
},
{
"epoch": 0.4522058823529412,
"grad_norm": 0.0,
"learning_rate": 4.485443039287907e-06,
"loss": 1.7405,
"step": 246
},
{
"epoch": 0.4540441176470588,
"grad_norm": 0.0,
"learning_rate": 4.48094448180859e-06,
"loss": 1.7201,
"step": 247
},
{
"epoch": 0.45588235294117646,
"grad_norm": 0.0,
"learning_rate": 4.476428621974833e-06,
"loss": 1.9913,
"step": 248
},
{
"epoch": 0.4577205882352941,
"grad_norm": 0.0,
"learning_rate": 4.471895499229946e-06,
"loss": 1.5852,
"step": 249
},
{
"epoch": 0.45955882352941174,
"grad_norm": 0.0,
"learning_rate": 4.467345153168018e-06,
"loss": 1.5358,
"step": 250
},
{
"epoch": 0.46139705882352944,
"grad_norm": 0.0,
"learning_rate": 4.462777623533577e-06,
"loss": 1.7271,
"step": 251
},
{
"epoch": 0.4632352941176471,
"grad_norm": 0.0,
"learning_rate": 4.458192950221237e-06,
"loss": 1.5025,
"step": 252
},
{
"epoch": 0.4650735294117647,
"grad_norm": 0.0,
"learning_rate": 4.4535911732753535e-06,
"loss": 1.9186,
"step": 253
},
{
"epoch": 0.46691176470588236,
"grad_norm": 0.0,
"learning_rate": 4.448972332889669e-06,
"loss": 1.8936,
"step": 254
},
{
"epoch": 0.46875,
"grad_norm": 0.0,
"learning_rate": 4.444336469406968e-06,
"loss": 1.699,
"step": 255
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.0,
"learning_rate": 4.4396836233187195e-06,
"loss": 1.9617,
"step": 256
},
{
"epoch": 0.4724264705882353,
"grad_norm": 0.0,
"learning_rate": 4.435013835264725e-06,
"loss": 1.9323,
"step": 257
},
{
"epoch": 0.4742647058823529,
"grad_norm": 0.0,
"learning_rate": 4.4303271460327655e-06,
"loss": 1.6515,
"step": 258
},
{
"epoch": 0.47610294117647056,
"grad_norm": 0.0,
"learning_rate": 4.425623596558243e-06,
"loss": 1.6436,
"step": 259
},
{
"epoch": 0.47794117647058826,
"grad_norm": 0.0,
"learning_rate": 4.420903227923823e-06,
"loss": 1.9221,
"step": 260
},
{
"epoch": 0.4797794117647059,
"grad_norm": 0.0,
"learning_rate": 4.416166081359077e-06,
"loss": 1.9025,
"step": 261
},
{
"epoch": 0.48161764705882354,
"grad_norm": 0.0,
"learning_rate": 4.411412198240119e-06,
"loss": 1.866,
"step": 262
},
{
"epoch": 0.4834558823529412,
"grad_norm": 0.0,
"learning_rate": 4.406641620089252e-06,
"loss": 1.6989,
"step": 263
},
{
"epoch": 0.4852941176470588,
"grad_norm": 0.0,
"learning_rate": 4.401854388574595e-06,
"loss": 1.7039,
"step": 264
},
{
"epoch": 0.48713235294117646,
"grad_norm": 0.0,
"learning_rate": 4.397050545509726e-06,
"loss": 1.9074,
"step": 265
},
{
"epoch": 0.4889705882352941,
"grad_norm": 0.0,
"learning_rate": 4.392230132853316e-06,
"loss": 1.5768,
"step": 266
},
{
"epoch": 0.49080882352941174,
"grad_norm": 0.0,
"learning_rate": 4.387393192708758e-06,
"loss": 1.6722,
"step": 267
},
{
"epoch": 0.49264705882352944,
"grad_norm": 0.0,
"learning_rate": 4.382539767323805e-06,
"loss": 1.8256,
"step": 268
},
{
"epoch": 0.4944852941176471,
"grad_norm": 0.0,
"learning_rate": 4.377669899090202e-06,
"loss": 1.6908,
"step": 269
},
{
"epoch": 0.4963235294117647,
"grad_norm": 0.0,
"learning_rate": 4.372783630543305e-06,
"loss": 1.6795,
"step": 270
},
{
"epoch": 0.49816176470588236,
"grad_norm": 0.0,
"learning_rate": 4.3678810043617215e-06,
"loss": 1.7813,
"step": 271
},
{
"epoch": 0.5,
"grad_norm": 0.0,
"learning_rate": 4.362962063366933e-06,
"loss": 1.6982,
"step": 272
},
{
"epoch": 0.5018382352941176,
"grad_norm": 0.0,
"learning_rate": 4.358026850522919e-06,
"loss": 2.0328,
"step": 273
},
{
"epoch": 0.5036764705882353,
"grad_norm": 0.0,
"learning_rate": 4.353075408935787e-06,
"loss": 1.7353,
"step": 274
},
{
"epoch": 0.5055147058823529,
"grad_norm": 0.0,
"learning_rate": 4.348107781853389e-06,
"loss": 1.7333,
"step": 275
},
{
"epoch": 0.5073529411764706,
"grad_norm": 0.0,
"learning_rate": 4.34312401266495e-06,
"loss": 1.914,
"step": 276
},
{
"epoch": 0.5091911764705882,
"grad_norm": 0.0,
"learning_rate": 4.338124144900685e-06,
"loss": 1.6887,
"step": 277
},
{
"epoch": 0.5110294117647058,
"grad_norm": 0.0,
"learning_rate": 4.333108222231423e-06,
"loss": 1.6988,
"step": 278
},
{
"epoch": 0.5128676470588235,
"grad_norm": 0.0,
"learning_rate": 4.32807628846822e-06,
"loss": 1.5326,
"step": 279
},
{
"epoch": 0.5147058823529411,
"grad_norm": 0.0,
"learning_rate": 4.3230283875619815e-06,
"loss": 1.9346,
"step": 280
},
{
"epoch": 0.5165441176470589,
"grad_norm": 0.0,
"learning_rate": 4.317964563603073e-06,
"loss": 1.6371,
"step": 281
},
{
"epoch": 0.5183823529411765,
"grad_norm": 0.0,
"learning_rate": 4.312884860820942e-06,
"loss": 2.1047,
"step": 282
},
{
"epoch": 0.5202205882352942,
"grad_norm": 0.0,
"learning_rate": 4.307789323583727e-06,
"loss": 1.5355,
"step": 283
},
{
"epoch": 0.5220588235294118,
"grad_norm": 0.0,
"learning_rate": 4.302677996397868e-06,
"loss": 1.5629,
"step": 284
},
{
"epoch": 0.5238970588235294,
"grad_norm": 0.0,
"learning_rate": 4.297550923907726e-06,
"loss": 1.9965,
"step": 285
},
{
"epoch": 0.5257352941176471,
"grad_norm": 0.0,
"learning_rate": 4.2924081508951824e-06,
"loss": 1.8088,
"step": 286
},
{
"epoch": 0.5275735294117647,
"grad_norm": 0.0,
"learning_rate": 4.287249722279257e-06,
"loss": 1.8258,
"step": 287
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.0,
"learning_rate": 4.28207568311571e-06,
"loss": 1.7162,
"step": 288
},
{
"epoch": 0.53125,
"grad_norm": 0.0,
"learning_rate": 4.27688607859665e-06,
"loss": 1.6649,
"step": 289
},
{
"epoch": 0.5330882352941176,
"grad_norm": 0.0,
"learning_rate": 4.27168095405014e-06,
"loss": 1.8048,
"step": 290
},
{
"epoch": 0.5349264705882353,
"grad_norm": 0.0,
"learning_rate": 4.266460354939803e-06,
"loss": 1.7108,
"step": 291
},
{
"epoch": 0.5367647058823529,
"grad_norm": 0.0,
"learning_rate": 4.26122432686442e-06,
"loss": 1.9229,
"step": 292
},
{
"epoch": 0.5386029411764706,
"grad_norm": 0.0,
"learning_rate": 4.255972915557537e-06,
"loss": 1.9534,
"step": 293
},
{
"epoch": 0.5404411764705882,
"grad_norm": 0.0,
"learning_rate": 4.250706166887061e-06,
"loss": 1.9423,
"step": 294
},
{
"epoch": 0.5422794117647058,
"grad_norm": 0.0,
"learning_rate": 4.245424126854864e-06,
"loss": 1.7946,
"step": 295
},
{
"epoch": 0.5441176470588235,
"grad_norm": 0.0,
"learning_rate": 4.240126841596377e-06,
"loss": 1.6288,
"step": 296
},
{
"epoch": 0.5459558823529411,
"grad_norm": 0.0,
"learning_rate": 4.234814357380189e-06,
"loss": 1.9226,
"step": 297
},
{
"epoch": 0.5477941176470589,
"grad_norm": 0.0,
"learning_rate": 4.229486720607645e-06,
"loss": 1.5787,
"step": 298
},
{
"epoch": 0.5496323529411765,
"grad_norm": 0.0,
"learning_rate": 4.224143977812435e-06,
"loss": 1.6782,
"step": 299
},
{
"epoch": 0.5514705882352942,
"grad_norm": 0.0,
"learning_rate": 4.218786175660194e-06,
"loss": 1.4888,
"step": 300
},
{
"epoch": 0.5533088235294118,
"grad_norm": 0.0,
"learning_rate": 4.213413360948089e-06,
"loss": 1.7777,
"step": 301
},
{
"epoch": 0.5551470588235294,
"grad_norm": 0.0,
"learning_rate": 4.208025580604413e-06,
"loss": 1.7884,
"step": 302
},
{
"epoch": 0.5569852941176471,
"grad_norm": 0.0,
"learning_rate": 4.202622881688178e-06,
"loss": 1.6578,
"step": 303
},
{
"epoch": 0.5588235294117647,
"grad_norm": 0.0,
"learning_rate": 4.197205311388698e-06,
"loss": 1.991,
"step": 304
},
{
"epoch": 0.5606617647058824,
"grad_norm": 0.0,
"learning_rate": 4.1917729170251765e-06,
"loss": 1.7002,
"step": 305
},
{
"epoch": 0.5625,
"grad_norm": 0.0,
"learning_rate": 4.186325746046302e-06,
"loss": 1.9256,
"step": 306
},
{
"epoch": 0.5643382352941176,
"grad_norm": 0.0,
"learning_rate": 4.180863846029825e-06,
"loss": 1.8477,
"step": 307
},
{
"epoch": 0.5661764705882353,
"grad_norm": 0.0,
"learning_rate": 4.175387264682146e-06,
"loss": 1.9161,
"step": 308
},
{
"epoch": 0.5680147058823529,
"grad_norm": 0.0,
"learning_rate": 4.169896049837899e-06,
"loss": 1.6926,
"step": 309
},
{
"epoch": 0.5698529411764706,
"grad_norm": 0.0,
"learning_rate": 4.164390249459526e-06,
"loss": 1.8339,
"step": 310
},
{
"epoch": 0.5716911764705882,
"grad_norm": 0.0,
"learning_rate": 4.158869911636876e-06,
"loss": 1.8295,
"step": 311
},
{
"epoch": 0.5735294117647058,
"grad_norm": 0.0,
"learning_rate": 4.153335084586766e-06,
"loss": 1.8681,
"step": 312
},
{
"epoch": 0.5753676470588235,
"grad_norm": 0.0,
"learning_rate": 4.147785816652569e-06,
"loss": 1.6268,
"step": 313
},
{
"epoch": 0.5772058823529411,
"grad_norm": 0.0,
"learning_rate": 4.142222156303792e-06,
"loss": 1.773,
"step": 314
},
{
"epoch": 0.5790441176470589,
"grad_norm": 0.0,
"learning_rate": 4.13664415213565e-06,
"loss": 2.0425,
"step": 315
},
{
"epoch": 0.5808823529411765,
"grad_norm": 0.0,
"learning_rate": 4.131051852868643e-06,
"loss": 1.8064,
"step": 316
},
{
"epoch": 0.5827205882352942,
"grad_norm": 0.0,
"learning_rate": 4.125445307348129e-06,
"loss": 1.5052,
"step": 317
},
{
"epoch": 0.5845588235294118,
"grad_norm": 0.0,
"learning_rate": 4.119824564543901e-06,
"loss": 1.7783,
"step": 318
},
{
"epoch": 0.5863970588235294,
"grad_norm": 0.0,
"learning_rate": 4.114189673549752e-06,
"loss": 1.6945,
"step": 319
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.0,
"learning_rate": 4.108540683583057e-06,
"loss": 1.7935,
"step": 320
},
{
"epoch": 0.5900735294117647,
"grad_norm": 0.0,
"learning_rate": 4.102877643984332e-06,
"loss": 2.0515,
"step": 321
},
{
"epoch": 0.5919117647058824,
"grad_norm": 0.0,
"learning_rate": 4.097200604216811e-06,
"loss": 1.7803,
"step": 322
},
{
"epoch": 0.59375,
"grad_norm": 0.0,
"learning_rate": 4.09150961386601e-06,
"loss": 1.7374,
"step": 323
},
{
"epoch": 0.5955882352941176,
"grad_norm": 0.0,
"learning_rate": 4.085804722639293e-06,
"loss": 1.6242,
"step": 324
},
{
"epoch": 0.5974264705882353,
"grad_norm": 0.0,
"learning_rate": 4.0800859803654436e-06,
"loss": 1.8858,
"step": 325
},
{
"epoch": 0.5992647058823529,
"grad_norm": 0.0,
"learning_rate": 4.074353436994223e-06,
"loss": 1.8843,
"step": 326
},
{
"epoch": 0.6011029411764706,
"grad_norm": 0.0,
"learning_rate": 4.068607142595939e-06,
"loss": 1.4963,
"step": 327
},
{
"epoch": 0.6029411764705882,
"grad_norm": 0.0,
"learning_rate": 4.062847147361003e-06,
"loss": 1.6638,
"step": 328
},
{
"epoch": 0.6047794117647058,
"grad_norm": 0.0,
"learning_rate": 4.0570735015994986e-06,
"loss": 1.9207,
"step": 329
},
{
"epoch": 0.6066176470588235,
"grad_norm": 0.0,
"learning_rate": 4.0512862557407365e-06,
"loss": 1.5746,
"step": 330
},
{
"epoch": 0.6084558823529411,
"grad_norm": 0.0,
"learning_rate": 4.045485460332815e-06,
"loss": 1.8553,
"step": 331
},
{
"epoch": 0.6102941176470589,
"grad_norm": 0.0,
"learning_rate": 4.0396711660421825e-06,
"loss": 1.8915,
"step": 332
},
{
"epoch": 0.6121323529411765,
"grad_norm": 0.0,
"learning_rate": 4.03384342365319e-06,
"loss": 1.8034,
"step": 333
},
{
"epoch": 0.6139705882352942,
"grad_norm": 0.0,
"learning_rate": 4.02800228406765e-06,
"loss": 1.9337,
"step": 334
},
{
"epoch": 0.6158088235294118,
"grad_norm": 0.0,
"learning_rate": 4.02214779830439e-06,
"loss": 1.9172,
"step": 335
},
{
"epoch": 0.6176470588235294,
"grad_norm": 0.0,
"learning_rate": 4.016280017498812e-06,
"loss": 1.5344,
"step": 336
},
{
"epoch": 0.6194852941176471,
"grad_norm": 0.0,
"learning_rate": 4.010398992902437e-06,
"loss": 1.6145,
"step": 337
},
{
"epoch": 0.6213235294117647,
"grad_norm": 0.0,
"learning_rate": 4.004504775882467e-06,
"loss": 1.6857,
"step": 338
},
{
"epoch": 0.6231617647058824,
"grad_norm": 0.0,
"learning_rate": 3.998597417921331e-06,
"loss": 1.6453,
"step": 339
},
{
"epoch": 0.625,
"grad_norm": 0.0,
"learning_rate": 3.992676970616233e-06,
"loss": 1.9115,
"step": 340
},
{
"epoch": 0.6268382352941176,
"grad_norm": 0.0,
"learning_rate": 3.98674348567871e-06,
"loss": 1.7092,
"step": 341
},
{
"epoch": 0.6286764705882353,
"grad_norm": 0.0,
"learning_rate": 3.980797014934169e-06,
"loss": 1.7614,
"step": 342
},
{
"epoch": 0.6305147058823529,
"grad_norm": 0.0,
"learning_rate": 3.974837610321445e-06,
"loss": 1.6805,
"step": 343
},
{
"epoch": 0.6323529411764706,
"grad_norm": 0.0,
"learning_rate": 3.968865323892339e-06,
"loss": 1.9099,
"step": 344
},
{
"epoch": 0.6341911764705882,
"grad_norm": 0.0,
"learning_rate": 3.962880207811168e-06,
"loss": 1.5895,
"step": 345
},
{
"epoch": 0.6360294117647058,
"grad_norm": 0.0,
"learning_rate": 3.95688231435431e-06,
"loss": 1.6966,
"step": 346
},
{
"epoch": 0.6378676470588235,
"grad_norm": 0.0,
"learning_rate": 3.950871695909744e-06,
"loss": 1.6833,
"step": 347
},
{
"epoch": 0.6397058823529411,
"grad_norm": 0.0,
"learning_rate": 3.944848404976593e-06,
"loss": 1.8615,
"step": 348
},
{
"epoch": 0.6415441176470589,
"grad_norm": 0.0,
"learning_rate": 3.93881249416467e-06,
"loss": 1.93,
"step": 349
},
{
"epoch": 0.6433823529411765,
"grad_norm": 0.0,
"learning_rate": 3.932764016194013e-06,
"loss": 1.8436,
"step": 350
},
{
"epoch": 0.6452205882352942,
"grad_norm": 0.0,
"learning_rate": 3.926703023894424e-06,
"loss": 1.4891,
"step": 351
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.0,
"learning_rate": 3.920629570205014e-06,
"loss": 2.0484,
"step": 352
},
{
"epoch": 0.6488970588235294,
"grad_norm": 0.0,
"learning_rate": 3.914543708173735e-06,
"loss": 1.7981,
"step": 353
},
{
"epoch": 0.6507352941176471,
"grad_norm": 0.0,
"learning_rate": 3.90844549095692e-06,
"loss": 1.8744,
"step": 354
},
{
"epoch": 0.6525735294117647,
"grad_norm": 0.0,
"learning_rate": 3.9023349718188155e-06,
"loss": 1.9961,
"step": 355
},
{
"epoch": 0.6544117647058824,
"grad_norm": 0.0,
"learning_rate": 3.8962122041311155e-06,
"loss": 1.8839,
"step": 356
},
{
"epoch": 0.65625,
"grad_norm": 0.0,
"learning_rate": 3.890077241372503e-06,
"loss": 1.7395,
"step": 357
},
{
"epoch": 0.6580882352941176,
"grad_norm": 0.0,
"learning_rate": 3.883930137128175e-06,
"loss": 1.6163,
"step": 358
},
{
"epoch": 0.6599264705882353,
"grad_norm": 0.0,
"learning_rate": 3.877770945089377e-06,
"loss": 1.7127,
"step": 359
},
{
"epoch": 0.6617647058823529,
"grad_norm": 0.0,
"learning_rate": 3.871599719052931e-06,
"loss": 1.6822,
"step": 360
},
{
"epoch": 0.6636029411764706,
"grad_norm": 0.0,
"learning_rate": 3.865416512920776e-06,
"loss": 2.0061,
"step": 361
},
{
"epoch": 0.6654411764705882,
"grad_norm": 0.0,
"learning_rate": 3.859221380699482e-06,
"loss": 1.4916,
"step": 362
},
{
"epoch": 0.6672794117647058,
"grad_norm": 0.0,
"learning_rate": 3.853014376499792e-06,
"loss": 1.5192,
"step": 363
},
{
"epoch": 0.6691176470588235,
"grad_norm": 0.0,
"learning_rate": 3.846795554536141e-06,
"loss": 1.8608,
"step": 364
},
{
"epoch": 0.6709558823529411,
"grad_norm": 0.0,
"learning_rate": 3.840564969126186e-06,
"loss": 1.7084,
"step": 365
},
{
"epoch": 0.6727941176470589,
"grad_norm": 0.0,
"learning_rate": 3.834322674690329e-06,
"loss": 1.6686,
"step": 366
},
{
"epoch": 0.6746323529411765,
"grad_norm": 0.0,
"learning_rate": 3.828068725751245e-06,
"loss": 1.7066,
"step": 367
},
{
"epoch": 0.6764705882352942,
"grad_norm": 0.0,
"learning_rate": 3.8218031769334024e-06,
"loss": 1.7413,
"step": 368
},
{
"epoch": 0.6783088235294118,
"grad_norm": 0.0,
"learning_rate": 3.81552608296259e-06,
"loss": 1.9639,
"step": 369
},
{
"epoch": 0.6801470588235294,
"grad_norm": 0.0,
"learning_rate": 3.809237498665434e-06,
"loss": 1.6569,
"step": 370
},
{
"epoch": 0.6819852941176471,
"grad_norm": 0.0,
"learning_rate": 3.8029374789689234e-06,
"loss": 1.7029,
"step": 371
},
{
"epoch": 0.6838235294117647,
"grad_norm": 0.0,
"learning_rate": 3.7966260788999278e-06,
"loss": 1.4536,
"step": 372
},
{
"epoch": 0.6856617647058824,
"grad_norm": 0.0,
"learning_rate": 3.7903033535847167e-06,
"loss": 1.7632,
"step": 373
},
{
"epoch": 0.6875,
"grad_norm": 0.0,
"learning_rate": 3.7839693582484806e-06,
"loss": 1.698,
"step": 374
},
{
"epoch": 0.6893382352941176,
"grad_norm": 0.0,
"learning_rate": 3.7776241482148452e-06,
"loss": 1.6655,
"step": 375
},
{
"epoch": 0.6911764705882353,
"grad_norm": 0.0,
"learning_rate": 3.771267778905391e-06,
"loss": 1.8655,
"step": 376
},
{
"epoch": 0.6930147058823529,
"grad_norm": 0.0,
"learning_rate": 3.7649003058391664e-06,
"loss": 1.7093,
"step": 377
},
{
"epoch": 0.6948529411764706,
"grad_norm": 0.0,
"learning_rate": 3.7585217846322075e-06,
"loss": 1.6746,
"step": 378
},
{
"epoch": 0.6966911764705882,
"grad_norm": 0.0,
"learning_rate": 3.7521322709970454e-06,
"loss": 1.9697,
"step": 379
},
{
"epoch": 0.6985294117647058,
"grad_norm": 0.0,
"learning_rate": 3.745731820742227e-06,
"loss": 2.0496,
"step": 380
},
{
"epoch": 0.7003676470588235,
"grad_norm": 0.0,
"learning_rate": 3.7393204897718194e-06,
"loss": 1.8899,
"step": 381
},
{
"epoch": 0.7022058823529411,
"grad_norm": 0.0,
"learning_rate": 3.7328983340849324e-06,
"loss": 1.9481,
"step": 382
},
{
"epoch": 0.7040441176470589,
"grad_norm": 0.0,
"learning_rate": 3.7264654097752173e-06,
"loss": 1.8767,
"step": 383
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.0,
"learning_rate": 3.7200217730303865e-06,
"loss": 1.7622,
"step": 384
},
{
"epoch": 0.7077205882352942,
"grad_norm": 0.0,
"learning_rate": 3.713567480131718e-06,
"loss": 1.8596,
"step": 385
},
{
"epoch": 0.7095588235294118,
"grad_norm": 0.0,
"learning_rate": 3.7071025874535643e-06,
"loss": 1.7558,
"step": 386
},
{
"epoch": 0.7113970588235294,
"grad_norm": 0.0,
"learning_rate": 3.7006271514628617e-06,
"loss": 2.0891,
"step": 387
},
{
"epoch": 0.7132352941176471,
"grad_norm": 0.0,
"learning_rate": 3.694141228718634e-06,
"loss": 1.5486,
"step": 388
},
{
"epoch": 0.7150735294117647,
"grad_norm": 0.0,
"learning_rate": 3.6876448758715028e-06,
"loss": 1.6308,
"step": 389
},
{
"epoch": 0.7169117647058824,
"grad_norm": 0.0,
"learning_rate": 3.681138149663189e-06,
"loss": 1.9086,
"step": 390
},
{
"epoch": 0.71875,
"grad_norm": 0.0,
"learning_rate": 3.6746211069260197e-06,
"loss": 1.9397,
"step": 391
},
{
"epoch": 0.7205882352941176,
"grad_norm": 0.0,
"learning_rate": 3.6680938045824284e-06,
"loss": 1.6673,
"step": 392
},
{
"epoch": 0.7224264705882353,
"grad_norm": 0.0,
"learning_rate": 3.661556299644462e-06,
"loss": 1.7074,
"step": 393
},
{
"epoch": 0.7242647058823529,
"grad_norm": 0.0,
"learning_rate": 3.6550086492132804e-06,
"loss": 1.298,
"step": 394
},
{
"epoch": 0.7261029411764706,
"grad_norm": 0.0,
"learning_rate": 3.6484509104786582e-06,
"loss": 1.711,
"step": 395
},
{
"epoch": 0.7279411764705882,
"grad_norm": 0.0,
"learning_rate": 3.6418831407184856e-06,
"loss": 1.5623,
"step": 396
},
{
"epoch": 0.7297794117647058,
"grad_norm": 0.0,
"learning_rate": 3.6353053972982676e-06,
"loss": 1.7687,
"step": 397
},
{
"epoch": 0.7316176470588235,
"grad_norm": 0.0,
"learning_rate": 3.628717737670623e-06,
"loss": 1.6687,
"step": 398
},
{
"epoch": 0.7334558823529411,
"grad_norm": 0.0,
"learning_rate": 3.6221202193747818e-06,
"loss": 1.6686,
"step": 399
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.0,
"learning_rate": 3.6155129000360846e-06,
"loss": 1.4463,
"step": 400
},
{
"epoch": 0.7371323529411765,
"grad_norm": 0.0,
"learning_rate": 3.6088958373654794e-06,
"loss": 1.8492,
"step": 401
},
{
"epoch": 0.7389705882352942,
"grad_norm": 0.0,
"learning_rate": 3.602269089159013e-06,
"loss": 1.7782,
"step": 402
},
{
"epoch": 0.7408088235294118,
"grad_norm": 0.0,
"learning_rate": 3.5956327132973313e-06,
"loss": 1.7213,
"step": 403
},
{
"epoch": 0.7426470588235294,
"grad_norm": 0.0,
"learning_rate": 3.588986767745174e-06,
"loss": 1.6712,
"step": 404
},
{
"epoch": 0.7444852941176471,
"grad_norm": 0.0,
"learning_rate": 3.5823313105508626e-06,
"loss": 1.679,
"step": 405
},
{
"epoch": 0.7463235294117647,
"grad_norm": 0.0,
"learning_rate": 3.575666399845799e-06,
"loss": 1.8439,
"step": 406
},
{
"epoch": 0.7481617647058824,
"grad_norm": 0.0,
"learning_rate": 3.568992093843956e-06,
"loss": 2.0623,
"step": 407
},
{
"epoch": 0.75,
"grad_norm": 0.0,
"learning_rate": 3.5623084508413685e-06,
"loss": 1.6133,
"step": 408
},
{
"epoch": 0.7518382352941176,
"grad_norm": 0.0,
"learning_rate": 3.555615529215623e-06,
"loss": 1.8713,
"step": 409
},
{
"epoch": 0.7536764705882353,
"grad_norm": 0.0,
"learning_rate": 3.5489133874253516e-06,
"loss": 1.8986,
"step": 410
},
{
"epoch": 0.7555147058823529,
"grad_norm": 0.0,
"learning_rate": 3.5422020840097173e-06,
"loss": 1.6793,
"step": 411
},
{
"epoch": 0.7573529411764706,
"grad_norm": 0.0,
"learning_rate": 3.535481677587904e-06,
"loss": 1.6225,
"step": 412
},
{
"epoch": 0.7591911764705882,
"grad_norm": 0.0,
"learning_rate": 3.5287522268586074e-06,
"loss": 1.7254,
"step": 413
},
{
"epoch": 0.7610294117647058,
"grad_norm": 0.0,
"learning_rate": 3.5220137905995165e-06,
"loss": 1.7691,
"step": 414
},
{
"epoch": 0.7628676470588235,
"grad_norm": 0.0,
"learning_rate": 3.515266427666806e-06,
"loss": 1.7055,
"step": 415
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.0,
"learning_rate": 3.508510196994618e-06,
"loss": 1.7593,
"step": 416
},
{
"epoch": 0.7665441176470589,
"grad_norm": 0.0,
"learning_rate": 3.50174515759455e-06,
"loss": 1.7012,
"step": 417
},
{
"epoch": 0.7683823529411765,
"grad_norm": 0.0,
"learning_rate": 3.4949713685551377e-06,
"loss": 1.887,
"step": 418
},
{
"epoch": 0.7702205882352942,
"grad_norm": 0.0,
"learning_rate": 3.488188889041341e-06,
"loss": 1.7187,
"step": 419
},
{
"epoch": 0.7720588235294118,
"grad_norm": 0.0,
"learning_rate": 3.4813977782940234e-06,
"loss": 1.8475,
"step": 420
},
{
"epoch": 0.7738970588235294,
"grad_norm": 0.0,
"learning_rate": 3.4745980956294396e-06,
"loss": 1.6388,
"step": 421
},
{
"epoch": 0.7757352941176471,
"grad_norm": 0.0,
"learning_rate": 3.4677899004387134e-06,
"loss": 1.9088,
"step": 422
},
{
"epoch": 0.7775735294117647,
"grad_norm": 0.0,
"learning_rate": 3.460973252187321e-06,
"loss": 1.6816,
"step": 423
},
{
"epoch": 0.7794117647058824,
"grad_norm": 0.0,
"learning_rate": 3.4541482104145695e-06,
"loss": 1.6671,
"step": 424
},
{
"epoch": 0.78125,
"grad_norm": 0.0,
"learning_rate": 3.447314834733081e-06,
"loss": 1.6012,
"step": 425
},
{
"epoch": 0.7830882352941176,
"grad_norm": 0.0,
"learning_rate": 3.440473184828266e-06,
"loss": 1.6968,
"step": 426
},
{
"epoch": 0.7849264705882353,
"grad_norm": 0.0,
"learning_rate": 3.433623320457809e-06,
"loss": 1.5466,
"step": 427
},
{
"epoch": 0.7867647058823529,
"grad_norm": 0.0,
"learning_rate": 3.4267653014511405e-06,
"loss": 1.8788,
"step": 428
},
{
"epoch": 0.7886029411764706,
"grad_norm": 0.0,
"learning_rate": 3.419899187708917e-06,
"loss": 1.8398,
"step": 429
},
{
"epoch": 0.7904411764705882,
"grad_norm": 0.0,
"learning_rate": 3.4130250392024973e-06,
"loss": 1.7668,
"step": 430
},
{
"epoch": 0.7922794117647058,
"grad_norm": 0.0,
"learning_rate": 3.4061429159734207e-06,
"loss": 1.7916,
"step": 431
},
{
"epoch": 0.7941176470588235,
"grad_norm": 0.0,
"learning_rate": 3.3992528781328793e-06,
"loss": 1.628,
"step": 432
},
{
"epoch": 0.7959558823529411,
"grad_norm": 0.0,
"learning_rate": 3.3923549858611958e-06,
"loss": 1.6921,
"step": 433
},
{
"epoch": 0.7977941176470589,
"grad_norm": 0.0,
"learning_rate": 3.385449299407296e-06,
"loss": 1.7011,
"step": 434
},
{
"epoch": 0.7996323529411765,
"grad_norm": 0.0,
"learning_rate": 3.378535879088182e-06,
"loss": 1.6465,
"step": 435
},
{
"epoch": 0.8014705882352942,
"grad_norm": 0.0,
"learning_rate": 3.3716147852884073e-06,
"loss": 1.7443,
"step": 436
},
{
"epoch": 0.8033088235294118,
"grad_norm": 0.0,
"learning_rate": 3.3646860784595512e-06,
"loss": 1.7987,
"step": 437
},
{
"epoch": 0.8051470588235294,
"grad_norm": 0.0,
"learning_rate": 3.357749819119685e-06,
"loss": 1.9102,
"step": 438
},
{
"epoch": 0.8069852941176471,
"grad_norm": 0.0,
"learning_rate": 3.3508060678528464e-06,
"loss": 1.6303,
"step": 439
},
{
"epoch": 0.8088235294117647,
"grad_norm": 0.0,
"learning_rate": 3.3438548853085135e-06,
"loss": 1.5065,
"step": 440
},
{
"epoch": 0.8106617647058824,
"grad_norm": 0.0,
"learning_rate": 3.3368963322010695e-06,
"loss": 1.7563,
"step": 441
},
{
"epoch": 0.8125,
"grad_norm": 0.0,
"learning_rate": 3.329930469309276e-06,
"loss": 1.6226,
"step": 442
},
{
"epoch": 0.8143382352941176,
"grad_norm": 0.0,
"learning_rate": 3.322957357475741e-06,
"loss": 1.8419,
"step": 443
},
{
"epoch": 0.8161764705882353,
"grad_norm": 0.0,
"learning_rate": 3.315977057606388e-06,
"loss": 1.7456,
"step": 444
},
{
"epoch": 0.8180147058823529,
"grad_norm": 0.0,
"learning_rate": 3.3089896306699233e-06,
"loss": 1.9603,
"step": 445
},
{
"epoch": 0.8198529411764706,
"grad_norm": 0.0,
"learning_rate": 3.301995137697304e-06,
"loss": 1.6238,
"step": 446
},
{
"epoch": 0.8216911764705882,
"grad_norm": 0.0,
"learning_rate": 3.2949936397812055e-06,
"loss": 1.6546,
"step": 447
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.0,
"learning_rate": 3.287985198075484e-06,
"loss": 1.5644,
"step": 448
},
{
"epoch": 0.8253676470588235,
"grad_norm": 0.0,
"learning_rate": 3.2809698737946494e-06,
"loss": 1.9652,
"step": 449
},
{
"epoch": 0.8272058823529411,
"grad_norm": 0.0,
"learning_rate": 3.2739477282133253e-06,
"loss": 1.6981,
"step": 450
},
{
"epoch": 0.8290441176470589,
"grad_norm": 0.0,
"learning_rate": 3.266918822665715e-06,
"loss": 1.8254,
"step": 451
},
{
"epoch": 0.8308823529411765,
"grad_norm": 0.0,
"learning_rate": 3.259883218545065e-06,
"loss": 1.5648,
"step": 452
},
{
"epoch": 0.8327205882352942,
"grad_norm": 0.0,
"learning_rate": 3.2528409773031322e-06,
"loss": 1.649,
"step": 453
},
{
"epoch": 0.8345588235294118,
"grad_norm": 0.0,
"learning_rate": 3.2457921604496435e-06,
"loss": 1.9707,
"step": 454
},
{
"epoch": 0.8363970588235294,
"grad_norm": 0.0,
"learning_rate": 3.2387368295517586e-06,
"loss": 1.4134,
"step": 455
},
{
"epoch": 0.8382352941176471,
"grad_norm": 0.0,
"learning_rate": 3.231675046233536e-06,
"loss": 1.834,
"step": 456
},
{
"epoch": 0.8400735294117647,
"grad_norm": 0.0,
"learning_rate": 3.22460687217539e-06,
"loss": 1.7787,
"step": 457
},
{
"epoch": 0.8419117647058824,
"grad_norm": 0.0,
"learning_rate": 3.217532369113555e-06,
"loss": 1.7776,
"step": 458
},
{
"epoch": 0.84375,
"grad_norm": 0.0,
"learning_rate": 3.2104515988395456e-06,
"loss": 1.8862,
"step": 459
},
{
"epoch": 0.8455882352941176,
"grad_norm": 0.0,
"learning_rate": 3.2033646231996167e-06,
"loss": 1.5536,
"step": 460
},
{
"epoch": 0.8474264705882353,
"grad_norm": 0.0,
"learning_rate": 3.196271504094223e-06,
"loss": 1.6952,
"step": 461
},
{
"epoch": 0.8492647058823529,
"grad_norm": 0.0,
"learning_rate": 3.189172303477478e-06,
"loss": 1.8626,
"step": 462
},
{
"epoch": 0.8511029411764706,
"grad_norm": 0.0,
"learning_rate": 3.182067083356616e-06,
"loss": 1.7898,
"step": 463
},
{
"epoch": 0.8529411764705882,
"grad_norm": 0.0,
"learning_rate": 3.174955905791444e-06,
"loss": 1.7797,
"step": 464
},
{
"epoch": 0.8547794117647058,
"grad_norm": 0.0,
"learning_rate": 3.1678388328938093e-06,
"loss": 1.6529,
"step": 465
},
{
"epoch": 0.8566176470588235,
"grad_norm": 0.0,
"learning_rate": 3.1607159268270447e-06,
"loss": 1.6602,
"step": 466
},
{
"epoch": 0.8584558823529411,
"grad_norm": 0.0,
"learning_rate": 3.153587249805438e-06,
"loss": 1.6258,
"step": 467
},
{
"epoch": 0.8602941176470589,
"grad_norm": 0.0,
"learning_rate": 3.1464528640936797e-06,
"loss": 1.7756,
"step": 468
},
{
"epoch": 0.8621323529411765,
"grad_norm": 0.0,
"learning_rate": 3.139312832006323e-06,
"loss": 1.771,
"step": 469
},
{
"epoch": 0.8639705882352942,
"grad_norm": 0.0,
"learning_rate": 3.132167215907238e-06,
"loss": 1.9377,
"step": 470
},
{
"epoch": 0.8658088235294118,
"grad_norm": 0.0,
"learning_rate": 3.12501607820907e-06,
"loss": 1.7851,
"step": 471
},
{
"epoch": 0.8676470588235294,
"grad_norm": 0.0,
"learning_rate": 3.11785948137269e-06,
"loss": 1.9384,
"step": 472
},
{
"epoch": 0.8694852941176471,
"grad_norm": 0.0,
"learning_rate": 3.1106974879066514e-06,
"loss": 1.4842,
"step": 473
},
{
"epoch": 0.8713235294117647,
"grad_norm": 0.0,
"learning_rate": 3.1035301603666456e-06,
"loss": 1.7289,
"step": 474
},
{
"epoch": 0.8731617647058824,
"grad_norm": 0.0,
"learning_rate": 3.0963575613549523e-06,
"loss": 1.8963,
"step": 475
},
{
"epoch": 0.875,
"grad_norm": 0.0,
"learning_rate": 3.089179753519894e-06,
"loss": 1.8238,
"step": 476
},
{
"epoch": 0.8768382352941176,
"grad_norm": 0.0,
"learning_rate": 3.0819967995552913e-06,
"loss": 2.1243,
"step": 477
},
{
"epoch": 0.8786764705882353,
"grad_norm": 0.0,
"learning_rate": 3.074808762199911e-06,
"loss": 1.7607,
"step": 478
},
{
"epoch": 0.8805147058823529,
"grad_norm": 0.0,
"learning_rate": 3.0676157042369213e-06,
"loss": 1.7313,
"step": 479
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.0,
"learning_rate": 3.0604176884933422e-06,
"loss": 1.6726,
"step": 480
},
{
"epoch": 0.8841911764705882,
"grad_norm": 0.0,
"learning_rate": 3.053214777839496e-06,
"loss": 1.7602,
"step": 481
},
{
"epoch": 0.8860294117647058,
"grad_norm": 0.0,
"learning_rate": 3.0460070351884614e-06,
"loss": 1.7777,
"step": 482
},
{
"epoch": 0.8878676470588235,
"grad_norm": 0.0,
"learning_rate": 3.0387945234955187e-06,
"loss": 1.759,
"step": 483
},
{
"epoch": 0.8897058823529411,
"grad_norm": 0.0,
"learning_rate": 3.031577305757605e-06,
"loss": 2.0917,
"step": 484
},
{
"epoch": 0.8915441176470589,
"grad_norm": 0.0,
"learning_rate": 3.024355445012761e-06,
"loss": 1.7402,
"step": 485
},
{
"epoch": 0.8933823529411765,
"grad_norm": 0.0,
"learning_rate": 3.0171290043395823e-06,
"loss": 1.9261,
"step": 486
},
{
"epoch": 0.8952205882352942,
"grad_norm": 0.0,
"learning_rate": 3.0098980468566663e-06,
"loss": 1.6524,
"step": 487
},
{
"epoch": 0.8970588235294118,
"grad_norm": 0.0,
"learning_rate": 3.0026626357220623e-06,
"loss": 1.8296,
"step": 488
},
{
"epoch": 0.8988970588235294,
"grad_norm": 0.0,
"learning_rate": 2.9954228341327192e-06,
"loss": 1.8665,
"step": 489
},
{
"epoch": 0.9007352941176471,
"grad_norm": 0.0,
"learning_rate": 2.988178705323934e-06,
"loss": 1.8146,
"step": 490
},
{
"epoch": 0.9025735294117647,
"grad_norm": 0.0,
"learning_rate": 2.9809303125688004e-06,
"loss": 1.7391,
"step": 491
},
{
"epoch": 0.9044117647058824,
"grad_norm": 0.0,
"learning_rate": 2.9736777191776543e-06,
"loss": 1.6417,
"step": 492
},
{
"epoch": 0.90625,
"grad_norm": 0.0,
"learning_rate": 2.966420988497522e-06,
"loss": 1.8464,
"step": 493
},
{
"epoch": 0.9080882352941176,
"grad_norm": 0.0,
"learning_rate": 2.959160183911565e-06,
"loss": 1.8636,
"step": 494
},
{
"epoch": 0.9099264705882353,
"grad_norm": 0.0,
"learning_rate": 2.9518953688385298e-06,
"loss": 1.6568,
"step": 495
},
{
"epoch": 0.9117647058823529,
"grad_norm": 0.0,
"learning_rate": 2.9446266067321904e-06,
"loss": 1.9179,
"step": 496
},
{
"epoch": 0.9136029411764706,
"grad_norm": 0.0,
"learning_rate": 2.9373539610807983e-06,
"loss": 1.9894,
"step": 497
},
{
"epoch": 0.9154411764705882,
"grad_norm": 0.0,
"learning_rate": 2.930077495406523e-06,
"loss": 1.8537,
"step": 498
},
{
"epoch": 0.9172794117647058,
"grad_norm": 0.0,
"learning_rate": 2.9227972732649e-06,
"loss": 1.8176,
"step": 499
},
{
"epoch": 0.9191176470588235,
"grad_norm": 0.0,
"learning_rate": 2.915513358244276e-06,
"loss": 1.7762,
"step": 500
},
{
"epoch": 0.9209558823529411,
"grad_norm": 0.0,
"learning_rate": 2.9082258139652536e-06,
"loss": 1.7569,
"step": 501
},
{
"epoch": 0.9227941176470589,
"grad_norm": 0.0,
"learning_rate": 2.900934704080133e-06,
"loss": 1.7657,
"step": 502
},
{
"epoch": 0.9246323529411765,
"grad_norm": 0.0,
"learning_rate": 2.893640092272357e-06,
"loss": 1.7845,
"step": 503
},
{
"epoch": 0.9264705882352942,
"grad_norm": 0.0,
"learning_rate": 2.8863420422559577e-06,
"loss": 1.4962,
"step": 504
},
{
"epoch": 0.9283088235294118,
"grad_norm": 0.0,
"learning_rate": 2.8790406177749985e-06,
"loss": 1.6051,
"step": 505
},
{
"epoch": 0.9301470588235294,
"grad_norm": 0.0,
"learning_rate": 2.8717358826030158e-06,
"loss": 1.8549,
"step": 506
},
{
"epoch": 0.9319852941176471,
"grad_norm": 0.0,
"learning_rate": 2.86442790054246e-06,
"loss": 1.7021,
"step": 507
},
{
"epoch": 0.9338235294117647,
"grad_norm": 0.0,
"learning_rate": 2.8571167354241445e-06,
"loss": 1.6309,
"step": 508
},
{
"epoch": 0.9356617647058824,
"grad_norm": 0.0,
"learning_rate": 2.849802451106685e-06,
"loss": 1.77,
"step": 509
},
{
"epoch": 0.9375,
"grad_norm": 0.0,
"learning_rate": 2.84248511147594e-06,
"loss": 1.7613,
"step": 510
},
{
"epoch": 0.9393382352941176,
"grad_norm": 0.0,
"learning_rate": 2.835164780444455e-06,
"loss": 1.9886,
"step": 511
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.0,
"learning_rate": 2.8278415219509025e-06,
"loss": 1.6941,
"step": 512
},
{
"epoch": 0.9430147058823529,
"grad_norm": 0.0,
"learning_rate": 2.8205153999595253e-06,
"loss": 1.641,
"step": 513
},
{
"epoch": 0.9448529411764706,
"grad_norm": 0.0,
"learning_rate": 2.8131864784595788e-06,
"loss": 1.8998,
"step": 514
},
{
"epoch": 0.9466911764705882,
"grad_norm": 0.0,
"learning_rate": 2.8058548214647674e-06,
"loss": 1.7034,
"step": 515
},
{
"epoch": 0.9485294117647058,
"grad_norm": 0.0,
"learning_rate": 2.798520493012691e-06,
"loss": 1.7346,
"step": 516
},
{
"epoch": 0.9503676470588235,
"grad_norm": 0.0,
"learning_rate": 2.7911835571642816e-06,
"loss": 2.0461,
"step": 517
},
{
"epoch": 0.9522058823529411,
"grad_norm": 0.0,
"learning_rate": 2.783844078003245e-06,
"loss": 1.7676,
"step": 518
},
{
"epoch": 0.9540441176470589,
"grad_norm": 0.0,
"learning_rate": 2.7765021196355023e-06,
"loss": 1.592,
"step": 519
},
{
"epoch": 0.9558823529411765,
"grad_norm": 0.0,
"learning_rate": 2.76915774618863e-06,
"loss": 1.5685,
"step": 520
},
{
"epoch": 0.9577205882352942,
"grad_norm": 0.0,
"learning_rate": 2.761811021811295e-06,
"loss": 1.7379,
"step": 521
},
{
"epoch": 0.9595588235294118,
"grad_norm": 0.0,
"learning_rate": 2.754462010672701e-06,
"loss": 1.9914,
"step": 522
},
{
"epoch": 0.9613970588235294,
"grad_norm": 0.0,
"learning_rate": 2.7471107769620258e-06,
"loss": 1.8213,
"step": 523
},
{
"epoch": 0.9632352941176471,
"grad_norm": 0.0,
"learning_rate": 2.739757384887859e-06,
"loss": 1.6564,
"step": 524
},
{
"epoch": 0.9650735294117647,
"grad_norm": 0.0,
"learning_rate": 2.732401898677642e-06,
"loss": 1.8431,
"step": 525
},
{
"epoch": 0.9669117647058824,
"grad_norm": 0.0,
"learning_rate": 2.725044382577107e-06,
"loss": 1.8911,
"step": 526
},
{
"epoch": 0.96875,
"grad_norm": 0.0,
"learning_rate": 2.7176849008497165e-06,
"loss": 1.6735,
"step": 527
},
{
"epoch": 0.9705882352941176,
"grad_norm": 0.0,
"learning_rate": 2.7103235177761018e-06,
"loss": 1.6467,
"step": 528
},
{
"epoch": 0.9724264705882353,
"grad_norm": 0.0,
"learning_rate": 2.702960297653501e-06,
"loss": 1.7016,
"step": 529
},
{
"epoch": 0.9742647058823529,
"grad_norm": 0.0,
"learning_rate": 2.695595304795197e-06,
"loss": 1.8497,
"step": 530
},
{
"epoch": 0.9761029411764706,
"grad_norm": 0.0,
"learning_rate": 2.688228603529959e-06,
"loss": 1.9022,
"step": 531
},
{
"epoch": 0.9779411764705882,
"grad_norm": 0.0,
"learning_rate": 2.680860258201475e-06,
"loss": 1.6943,
"step": 532
},
{
"epoch": 0.9797794117647058,
"grad_norm": 0.0,
"learning_rate": 2.6734903331677946e-06,
"loss": 1.886,
"step": 533
},
{
"epoch": 0.9816176470588235,
"grad_norm": 0.0,
"learning_rate": 2.666118892800765e-06,
"loss": 1.715,
"step": 534
},
{
"epoch": 0.9834558823529411,
"grad_norm": 0.0,
"learning_rate": 2.658746001485469e-06,
"loss": 1.7098,
"step": 535
},
{
"epoch": 0.9852941176470589,
"grad_norm": 0.0,
"learning_rate": 2.651371723619661e-06,
"loss": 1.6282,
"step": 536
},
{
"epoch": 0.9871323529411765,
"grad_norm": 0.0,
"learning_rate": 2.6439961236132083e-06,
"loss": 1.8106,
"step": 537
},
{
"epoch": 0.9889705882352942,
"grad_norm": 0.0,
"learning_rate": 2.6366192658875256e-06,
"loss": 1.95,
"step": 538
},
{
"epoch": 0.9908088235294118,
"grad_norm": 0.0,
"learning_rate": 2.629241214875013e-06,
"loss": 1.5364,
"step": 539
},
{
"epoch": 0.9926470588235294,
"grad_norm": 0.0,
"learning_rate": 2.621862035018492e-06,
"loss": 1.8866,
"step": 540
},
{
"epoch": 0.9944852941176471,
"grad_norm": 0.0,
"learning_rate": 2.6144817907706453e-06,
"loss": 1.6631,
"step": 541
},
{
"epoch": 0.9963235294117647,
"grad_norm": 0.0,
"learning_rate": 2.607100546593453e-06,
"loss": 1.7325,
"step": 542
},
{
"epoch": 0.9981617647058824,
"grad_norm": 0.0,
"learning_rate": 2.5997183669576264e-06,
"loss": 1.6731,
"step": 543
},
{
"epoch": 1.0,
"grad_norm": 0.0,
"learning_rate": 2.59233531634205e-06,
"loss": 1.481,
"step": 544
}
],
"logging_steps": 1,
"max_steps": 1088,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 272,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.898808838477578e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}