backward_model / trainer_state.json
luckyrabbits's picture
Upload trainer_state.json with huggingface_hub
7de4c8f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.11865211200759374,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00023730422401518748,
"grad_norm": 0.19874855875968933,
"learning_rate": 9.980000000000001e-06,
"loss": 2.3055,
"step": 1
},
{
"epoch": 0.00047460844803037496,
"grad_norm": 0.3101324141025543,
"learning_rate": 9.960000000000001e-06,
"loss": 0.8548,
"step": 2
},
{
"epoch": 0.0007119126720455624,
"grad_norm": 0.18851338326931,
"learning_rate": 9.940000000000001e-06,
"loss": 1.3783,
"step": 3
},
{
"epoch": 0.0009492168960607499,
"grad_norm": 0.5474942326545715,
"learning_rate": 9.920000000000002e-06,
"loss": 2.5833,
"step": 4
},
{
"epoch": 0.0011865211200759373,
"grad_norm": 0.28883224725723267,
"learning_rate": 9.9e-06,
"loss": 2.4045,
"step": 5
},
{
"epoch": 0.0014238253440911248,
"grad_norm": 0.4798765778541565,
"learning_rate": 9.88e-06,
"loss": 1.2589,
"step": 6
},
{
"epoch": 0.0016611295681063123,
"grad_norm": 0.48112648725509644,
"learning_rate": 9.86e-06,
"loss": 2.9112,
"step": 7
},
{
"epoch": 0.0018984337921214998,
"grad_norm": 0.8760956525802612,
"learning_rate": 9.84e-06,
"loss": 2.3175,
"step": 8
},
{
"epoch": 0.0021357380161366873,
"grad_norm": 0.44891074299812317,
"learning_rate": 9.820000000000001e-06,
"loss": 1.3196,
"step": 9
},
{
"epoch": 0.0023730422401518746,
"grad_norm": 0.3395187556743622,
"learning_rate": 9.800000000000001e-06,
"loss": 1.588,
"step": 10
},
{
"epoch": 0.0026103464641670624,
"grad_norm": 0.587505578994751,
"learning_rate": 9.780000000000001e-06,
"loss": 1.2509,
"step": 11
},
{
"epoch": 0.0028476506881822496,
"grad_norm": 0.4817255437374115,
"learning_rate": 9.760000000000001e-06,
"loss": 1.3541,
"step": 12
},
{
"epoch": 0.003084954912197437,
"grad_norm": 0.312285840511322,
"learning_rate": 9.74e-06,
"loss": 1.8525,
"step": 13
},
{
"epoch": 0.0033222591362126247,
"grad_norm": 0.7621486783027649,
"learning_rate": 9.72e-06,
"loss": 1.3197,
"step": 14
},
{
"epoch": 0.003559563360227812,
"grad_norm": 0.5916957259178162,
"learning_rate": 9.7e-06,
"loss": 1.0805,
"step": 15
},
{
"epoch": 0.0037968675842429997,
"grad_norm": 0.7447299361228943,
"learning_rate": 9.68e-06,
"loss": 2.5366,
"step": 16
},
{
"epoch": 0.004034171808258187,
"grad_norm": 0.38069915771484375,
"learning_rate": 9.66e-06,
"loss": 1.6038,
"step": 17
},
{
"epoch": 0.004271476032273375,
"grad_norm": 0.4375569224357605,
"learning_rate": 9.640000000000001e-06,
"loss": 2.8857,
"step": 18
},
{
"epoch": 0.004508780256288562,
"grad_norm": 0.2266787886619568,
"learning_rate": 9.620000000000001e-06,
"loss": 2.045,
"step": 19
},
{
"epoch": 0.004746084480303749,
"grad_norm": 0.3714630901813507,
"learning_rate": 9.600000000000001e-06,
"loss": 1.6152,
"step": 20
},
{
"epoch": 0.0049833887043189366,
"grad_norm": 0.45660167932510376,
"learning_rate": 9.58e-06,
"loss": 2.1712,
"step": 21
},
{
"epoch": 0.005220692928334125,
"grad_norm": 0.3804182708263397,
"learning_rate": 9.56e-06,
"loss": 1.7326,
"step": 22
},
{
"epoch": 0.005457997152349312,
"grad_norm": 0.5945218205451965,
"learning_rate": 9.54e-06,
"loss": 2.8499,
"step": 23
},
{
"epoch": 0.005695301376364499,
"grad_norm": 0.49986812472343445,
"learning_rate": 9.52e-06,
"loss": 1.887,
"step": 24
},
{
"epoch": 0.005932605600379687,
"grad_norm": 0.41545894742012024,
"learning_rate": 9.5e-06,
"loss": 1.0656,
"step": 25
},
{
"epoch": 0.006169909824394874,
"grad_norm": 0.6399343609809875,
"learning_rate": 9.48e-06,
"loss": 1.8672,
"step": 26
},
{
"epoch": 0.006407214048410062,
"grad_norm": 0.3765011727809906,
"learning_rate": 9.460000000000001e-06,
"loss": 0.9629,
"step": 27
},
{
"epoch": 0.006644518272425249,
"grad_norm": 0.48257043957710266,
"learning_rate": 9.440000000000001e-06,
"loss": 1.9923,
"step": 28
},
{
"epoch": 0.006881822496440437,
"grad_norm": 0.5844452977180481,
"learning_rate": 9.42e-06,
"loss": 2.0896,
"step": 29
},
{
"epoch": 0.007119126720455624,
"grad_norm": 0.6092790961265564,
"learning_rate": 9.4e-06,
"loss": 2.4446,
"step": 30
},
{
"epoch": 0.007356430944470811,
"grad_norm": 0.6862596869468689,
"learning_rate": 9.38e-06,
"loss": 1.9678,
"step": 31
},
{
"epoch": 0.007593735168485999,
"grad_norm": 0.5207828879356384,
"learning_rate": 9.360000000000002e-06,
"loss": 1.611,
"step": 32
},
{
"epoch": 0.007831039392501186,
"grad_norm": 0.5294057130813599,
"learning_rate": 9.340000000000002e-06,
"loss": 1.8447,
"step": 33
},
{
"epoch": 0.008068343616516375,
"grad_norm": 0.4702499210834503,
"learning_rate": 9.32e-06,
"loss": 2.035,
"step": 34
},
{
"epoch": 0.008305647840531562,
"grad_norm": 0.5244491696357727,
"learning_rate": 9.3e-06,
"loss": 1.4395,
"step": 35
},
{
"epoch": 0.00854295206454675,
"grad_norm": 0.644615650177002,
"learning_rate": 9.280000000000001e-06,
"loss": 1.7666,
"step": 36
},
{
"epoch": 0.008780256288561937,
"grad_norm": 0.5552751421928406,
"learning_rate": 9.260000000000001e-06,
"loss": 1.5753,
"step": 37
},
{
"epoch": 0.009017560512577124,
"grad_norm": 0.2882249057292938,
"learning_rate": 9.240000000000001e-06,
"loss": 1.5176,
"step": 38
},
{
"epoch": 0.009254864736592311,
"grad_norm": 0.38219153881073,
"learning_rate": 9.220000000000002e-06,
"loss": 1.7808,
"step": 39
},
{
"epoch": 0.009492168960607499,
"grad_norm": 0.6532744765281677,
"learning_rate": 9.200000000000002e-06,
"loss": 1.8763,
"step": 40
},
{
"epoch": 0.009729473184622686,
"grad_norm": 0.7350605726242065,
"learning_rate": 9.180000000000002e-06,
"loss": 2.5947,
"step": 41
},
{
"epoch": 0.009966777408637873,
"grad_norm": 0.44435158371925354,
"learning_rate": 9.16e-06,
"loss": 2.4829,
"step": 42
},
{
"epoch": 0.01020408163265306,
"grad_norm": 0.3937893509864807,
"learning_rate": 9.14e-06,
"loss": 2.106,
"step": 43
},
{
"epoch": 0.01044138585666825,
"grad_norm": 0.8209621906280518,
"learning_rate": 9.12e-06,
"loss": 1.8711,
"step": 44
},
{
"epoch": 0.010678690080683437,
"grad_norm": 1.220982313156128,
"learning_rate": 9.100000000000001e-06,
"loss": 2.4524,
"step": 45
},
{
"epoch": 0.010915994304698624,
"grad_norm": 0.4203420877456665,
"learning_rate": 9.080000000000001e-06,
"loss": 1.775,
"step": 46
},
{
"epoch": 0.011153298528713811,
"grad_norm": 0.28424403071403503,
"learning_rate": 9.060000000000001e-06,
"loss": 1.7594,
"step": 47
},
{
"epoch": 0.011390602752728999,
"grad_norm": 0.7268348336219788,
"learning_rate": 9.040000000000002e-06,
"loss": 0.9244,
"step": 48
},
{
"epoch": 0.011627906976744186,
"grad_norm": 0.4843103885650635,
"learning_rate": 9.020000000000002e-06,
"loss": 1.9955,
"step": 49
},
{
"epoch": 0.011865211200759373,
"grad_norm": 0.3405970335006714,
"learning_rate": 9e-06,
"loss": 1.6384,
"step": 50
},
{
"epoch": 0.01210251542477456,
"grad_norm": 1.1265524625778198,
"learning_rate": 8.98e-06,
"loss": 1.7042,
"step": 51
},
{
"epoch": 0.012339819648789748,
"grad_norm": 0.13442721962928772,
"learning_rate": 8.96e-06,
"loss": 1.3843,
"step": 52
},
{
"epoch": 0.012577123872804937,
"grad_norm": 0.4972473382949829,
"learning_rate": 8.94e-06,
"loss": 0.8437,
"step": 53
},
{
"epoch": 0.012814428096820124,
"grad_norm": 0.28667712211608887,
"learning_rate": 8.920000000000001e-06,
"loss": 0.9238,
"step": 54
},
{
"epoch": 0.013051732320835311,
"grad_norm": 0.2998306155204773,
"learning_rate": 8.900000000000001e-06,
"loss": 2.2332,
"step": 55
},
{
"epoch": 0.013289036544850499,
"grad_norm": 0.6133009195327759,
"learning_rate": 8.880000000000001e-06,
"loss": 2.2289,
"step": 56
},
{
"epoch": 0.013526340768865686,
"grad_norm": 0.48921748995780945,
"learning_rate": 8.860000000000002e-06,
"loss": 0.6597,
"step": 57
},
{
"epoch": 0.013763644992880873,
"grad_norm": 0.8161422610282898,
"learning_rate": 8.84e-06,
"loss": 1.3381,
"step": 58
},
{
"epoch": 0.01400094921689606,
"grad_norm": 0.4998335540294647,
"learning_rate": 8.82e-06,
"loss": 1.5505,
"step": 59
},
{
"epoch": 0.014238253440911248,
"grad_norm": 0.8967633843421936,
"learning_rate": 8.8e-06,
"loss": 1.2807,
"step": 60
},
{
"epoch": 0.014475557664926435,
"grad_norm": 0.8106015920639038,
"learning_rate": 8.78e-06,
"loss": 1.2792,
"step": 61
},
{
"epoch": 0.014712861888941622,
"grad_norm": 0.6022857427597046,
"learning_rate": 8.76e-06,
"loss": 2.023,
"step": 62
},
{
"epoch": 0.014950166112956811,
"grad_norm": 0.6330555081367493,
"learning_rate": 8.740000000000001e-06,
"loss": 0.931,
"step": 63
},
{
"epoch": 0.015187470336971999,
"grad_norm": 0.6975427269935608,
"learning_rate": 8.720000000000001e-06,
"loss": 1.5096,
"step": 64
},
{
"epoch": 0.015424774560987186,
"grad_norm": 0.9109779596328735,
"learning_rate": 8.700000000000001e-06,
"loss": 1.4295,
"step": 65
},
{
"epoch": 0.01566207878500237,
"grad_norm": 0.5679107904434204,
"learning_rate": 8.68e-06,
"loss": 1.6368,
"step": 66
},
{
"epoch": 0.01589938300901756,
"grad_norm": 0.4659746587276459,
"learning_rate": 8.66e-06,
"loss": 1.292,
"step": 67
},
{
"epoch": 0.01613668723303275,
"grad_norm": 0.6395153403282166,
"learning_rate": 8.64e-06,
"loss": 2.0611,
"step": 68
},
{
"epoch": 0.016373991457047935,
"grad_norm": 0.37920185923576355,
"learning_rate": 8.62e-06,
"loss": 1.1706,
"step": 69
},
{
"epoch": 0.016611295681063124,
"grad_norm": 0.2950354218482971,
"learning_rate": 8.6e-06,
"loss": 1.4619,
"step": 70
},
{
"epoch": 0.01684859990507831,
"grad_norm": 0.664068877696991,
"learning_rate": 8.580000000000001e-06,
"loss": 2.3014,
"step": 71
},
{
"epoch": 0.0170859041290935,
"grad_norm": 0.4015841782093048,
"learning_rate": 8.560000000000001e-06,
"loss": 2.6305,
"step": 72
},
{
"epoch": 0.017323208353108684,
"grad_norm": 0.8365151882171631,
"learning_rate": 8.540000000000001e-06,
"loss": 2.4402,
"step": 73
},
{
"epoch": 0.017560512577123873,
"grad_norm": 0.4587593376636505,
"learning_rate": 8.52e-06,
"loss": 1.1948,
"step": 74
},
{
"epoch": 0.01779781680113906,
"grad_norm": 0.5275629162788391,
"learning_rate": 8.5e-06,
"loss": 1.2889,
"step": 75
},
{
"epoch": 0.018035121025154248,
"grad_norm": 0.5322698354721069,
"learning_rate": 8.48e-06,
"loss": 2.5775,
"step": 76
},
{
"epoch": 0.018272425249169437,
"grad_norm": 0.7846812009811401,
"learning_rate": 8.46e-06,
"loss": 1.2905,
"step": 77
},
{
"epoch": 0.018509729473184623,
"grad_norm": 0.4759507179260254,
"learning_rate": 8.44e-06,
"loss": 1.1162,
"step": 78
},
{
"epoch": 0.01874703369719981,
"grad_norm": 0.596358597278595,
"learning_rate": 8.42e-06,
"loss": 1.6618,
"step": 79
},
{
"epoch": 0.018984337921214997,
"grad_norm": 0.5133060812950134,
"learning_rate": 8.400000000000001e-06,
"loss": 1.9102,
"step": 80
},
{
"epoch": 0.019221642145230186,
"grad_norm": 0.8989421129226685,
"learning_rate": 8.380000000000001e-06,
"loss": 2.1852,
"step": 81
},
{
"epoch": 0.01945894636924537,
"grad_norm": 0.4702143669128418,
"learning_rate": 8.36e-06,
"loss": 1.2668,
"step": 82
},
{
"epoch": 0.01969625059326056,
"grad_norm": 1.2159205675125122,
"learning_rate": 8.34e-06,
"loss": 1.1101,
"step": 83
},
{
"epoch": 0.019933554817275746,
"grad_norm": 0.5116935968399048,
"learning_rate": 8.32e-06,
"loss": 1.222,
"step": 84
},
{
"epoch": 0.020170859041290935,
"grad_norm": 1.3710129261016846,
"learning_rate": 8.3e-06,
"loss": 3.1337,
"step": 85
},
{
"epoch": 0.02040816326530612,
"grad_norm": 0.7467148303985596,
"learning_rate": 8.28e-06,
"loss": 2.1887,
"step": 86
},
{
"epoch": 0.02064546748932131,
"grad_norm": 0.6403272151947021,
"learning_rate": 8.26e-06,
"loss": 1.2013,
"step": 87
},
{
"epoch": 0.0208827717133365,
"grad_norm": 0.4310401678085327,
"learning_rate": 8.24e-06,
"loss": 1.4044,
"step": 88
},
{
"epoch": 0.021120075937351684,
"grad_norm": 0.6710259318351746,
"learning_rate": 8.220000000000001e-06,
"loss": 2.5627,
"step": 89
},
{
"epoch": 0.021357380161366873,
"grad_norm": 0.5828210115432739,
"learning_rate": 8.2e-06,
"loss": 1.7674,
"step": 90
},
{
"epoch": 0.02159468438538206,
"grad_norm": 0.5417571663856506,
"learning_rate": 8.18e-06,
"loss": 2.2125,
"step": 91
},
{
"epoch": 0.021831988609397248,
"grad_norm": 0.5095130205154419,
"learning_rate": 8.16e-06,
"loss": 3.1429,
"step": 92
},
{
"epoch": 0.022069292833412434,
"grad_norm": 0.2329273670911789,
"learning_rate": 8.14e-06,
"loss": 1.84,
"step": 93
},
{
"epoch": 0.022306597057427623,
"grad_norm": 0.4823262691497803,
"learning_rate": 8.120000000000002e-06,
"loss": 1.7104,
"step": 94
},
{
"epoch": 0.022543901281442808,
"grad_norm": 0.6276722550392151,
"learning_rate": 8.1e-06,
"loss": 0.7729,
"step": 95
},
{
"epoch": 0.022781205505457997,
"grad_norm": 0.642092764377594,
"learning_rate": 8.08e-06,
"loss": 2.1965,
"step": 96
},
{
"epoch": 0.023018509729473186,
"grad_norm": 0.34217798709869385,
"learning_rate": 8.06e-06,
"loss": 0.9981,
"step": 97
},
{
"epoch": 0.023255813953488372,
"grad_norm": 0.47839802503585815,
"learning_rate": 8.040000000000001e-06,
"loss": 1.594,
"step": 98
},
{
"epoch": 0.02349311817750356,
"grad_norm": 1.0722686052322388,
"learning_rate": 8.020000000000001e-06,
"loss": 2.0473,
"step": 99
},
{
"epoch": 0.023730422401518746,
"grad_norm": 0.7368118166923523,
"learning_rate": 8.000000000000001e-06,
"loss": 2.4467,
"step": 100
},
{
"epoch": 0.023967726625533935,
"grad_norm": 1.064618706703186,
"learning_rate": 7.980000000000002e-06,
"loss": 1.9095,
"step": 101
},
{
"epoch": 0.02420503084954912,
"grad_norm": 0.43928399682044983,
"learning_rate": 7.960000000000002e-06,
"loss": 1.7245,
"step": 102
},
{
"epoch": 0.02444233507356431,
"grad_norm": 0.6588628888130188,
"learning_rate": 7.94e-06,
"loss": 0.8569,
"step": 103
},
{
"epoch": 0.024679639297579496,
"grad_norm": 0.5403575301170349,
"learning_rate": 7.92e-06,
"loss": 1.2521,
"step": 104
},
{
"epoch": 0.024916943521594685,
"grad_norm": 0.6686379313468933,
"learning_rate": 7.9e-06,
"loss": 1.6869,
"step": 105
},
{
"epoch": 0.025154247745609874,
"grad_norm": 0.3803173005580902,
"learning_rate": 7.88e-06,
"loss": 2.1051,
"step": 106
},
{
"epoch": 0.02539155196962506,
"grad_norm": 0.12461218237876892,
"learning_rate": 7.860000000000001e-06,
"loss": 1.5027,
"step": 107
},
{
"epoch": 0.025628856193640248,
"grad_norm": 0.47309207916259766,
"learning_rate": 7.840000000000001e-06,
"loss": 1.156,
"step": 108
},
{
"epoch": 0.025866160417655434,
"grad_norm": 0.5996202826499939,
"learning_rate": 7.820000000000001e-06,
"loss": 1.8978,
"step": 109
},
{
"epoch": 0.026103464641670623,
"grad_norm": 1.3080424070358276,
"learning_rate": 7.800000000000002e-06,
"loss": 1.2111,
"step": 110
},
{
"epoch": 0.02634076886568581,
"grad_norm": 0.5753285884857178,
"learning_rate": 7.78e-06,
"loss": 1.2555,
"step": 111
},
{
"epoch": 0.026578073089700997,
"grad_norm": 0.4644084870815277,
"learning_rate": 7.76e-06,
"loss": 0.5773,
"step": 112
},
{
"epoch": 0.026815377313716183,
"grad_norm": 0.7260199189186096,
"learning_rate": 7.74e-06,
"loss": 1.3806,
"step": 113
},
{
"epoch": 0.027052681537731372,
"grad_norm": 3.247457265853882,
"learning_rate": 7.72e-06,
"loss": 3.6028,
"step": 114
},
{
"epoch": 0.027289985761746557,
"grad_norm": 0.6819869875907898,
"learning_rate": 7.7e-06,
"loss": 1.3892,
"step": 115
},
{
"epoch": 0.027527289985761746,
"grad_norm": 0.5939836502075195,
"learning_rate": 7.680000000000001e-06,
"loss": 0.8986,
"step": 116
},
{
"epoch": 0.027764594209776935,
"grad_norm": 0.9432902336120605,
"learning_rate": 7.660000000000001e-06,
"loss": 1.9631,
"step": 117
},
{
"epoch": 0.02800189843379212,
"grad_norm": 0.7726979851722717,
"learning_rate": 7.640000000000001e-06,
"loss": 1.8061,
"step": 118
},
{
"epoch": 0.02823920265780731,
"grad_norm": 1.1715900897979736,
"learning_rate": 7.620000000000001e-06,
"loss": 1.6768,
"step": 119
},
{
"epoch": 0.028476506881822496,
"grad_norm": 0.422097772359848,
"learning_rate": 7.600000000000001e-06,
"loss": 1.2938,
"step": 120
},
{
"epoch": 0.028713811105837685,
"grad_norm": 0.4177633225917816,
"learning_rate": 7.58e-06,
"loss": 2.4758,
"step": 121
},
{
"epoch": 0.02895111532985287,
"grad_norm": 0.8148231506347656,
"learning_rate": 7.5600000000000005e-06,
"loss": 1.6307,
"step": 122
},
{
"epoch": 0.02918841955386806,
"grad_norm": 0.36970993876457214,
"learning_rate": 7.540000000000001e-06,
"loss": 1.923,
"step": 123
},
{
"epoch": 0.029425723777883245,
"grad_norm": 0.9572102427482605,
"learning_rate": 7.520000000000001e-06,
"loss": 2.4889,
"step": 124
},
{
"epoch": 0.029663028001898434,
"grad_norm": 1.023463487625122,
"learning_rate": 7.500000000000001e-06,
"loss": 0.753,
"step": 125
},
{
"epoch": 0.029900332225913623,
"grad_norm": 0.7689042091369629,
"learning_rate": 7.48e-06,
"loss": 1.8955,
"step": 126
},
{
"epoch": 0.03013763644992881,
"grad_norm": 0.8892776370048523,
"learning_rate": 7.4600000000000006e-06,
"loss": 1.907,
"step": 127
},
{
"epoch": 0.030374940673943997,
"grad_norm": 0.6348279118537903,
"learning_rate": 7.440000000000001e-06,
"loss": 0.9907,
"step": 128
},
{
"epoch": 0.030612244897959183,
"grad_norm": 0.8271303772926331,
"learning_rate": 7.420000000000001e-06,
"loss": 0.8349,
"step": 129
},
{
"epoch": 0.030849549121974372,
"grad_norm": 0.567034900188446,
"learning_rate": 7.4e-06,
"loss": 1.5324,
"step": 130
},
{
"epoch": 0.031086853345989558,
"grad_norm": 0.7354723215103149,
"learning_rate": 7.3800000000000005e-06,
"loss": 1.8572,
"step": 131
},
{
"epoch": 0.03132415757000474,
"grad_norm": 0.7156671285629272,
"learning_rate": 7.360000000000001e-06,
"loss": 1.6312,
"step": 132
},
{
"epoch": 0.03156146179401993,
"grad_norm": 0.36890867352485657,
"learning_rate": 7.340000000000001e-06,
"loss": 0.8384,
"step": 133
},
{
"epoch": 0.03179876601803512,
"grad_norm": 0.6410567760467529,
"learning_rate": 7.32e-06,
"loss": 1.1728,
"step": 134
},
{
"epoch": 0.03203607024205031,
"grad_norm": 0.5395240187644958,
"learning_rate": 7.3e-06,
"loss": 1.3754,
"step": 135
},
{
"epoch": 0.0322733744660655,
"grad_norm": 0.9049538373947144,
"learning_rate": 7.280000000000001e-06,
"loss": 1.1323,
"step": 136
},
{
"epoch": 0.03251067869008068,
"grad_norm": 0.7486042380332947,
"learning_rate": 7.260000000000001e-06,
"loss": 2.6436,
"step": 137
},
{
"epoch": 0.03274798291409587,
"grad_norm": 0.2955069839954376,
"learning_rate": 7.24e-06,
"loss": 1.6258,
"step": 138
},
{
"epoch": 0.03298528713811106,
"grad_norm": 0.5585281848907471,
"learning_rate": 7.22e-06,
"loss": 1.0526,
"step": 139
},
{
"epoch": 0.03322259136212625,
"grad_norm": 0.6172239780426025,
"learning_rate": 7.2000000000000005e-06,
"loss": 2.0741,
"step": 140
},
{
"epoch": 0.03345989558614143,
"grad_norm": 1.0919370651245117,
"learning_rate": 7.180000000000001e-06,
"loss": 2.6546,
"step": 141
},
{
"epoch": 0.03369719981015662,
"grad_norm": 0.6277972459793091,
"learning_rate": 7.16e-06,
"loss": 1.4516,
"step": 142
},
{
"epoch": 0.03393450403417181,
"grad_norm": 1.262568712234497,
"learning_rate": 7.14e-06,
"loss": 1.3782,
"step": 143
},
{
"epoch": 0.034171808258187,
"grad_norm": 0.9131320714950562,
"learning_rate": 7.1200000000000004e-06,
"loss": 1.3814,
"step": 144
},
{
"epoch": 0.03440911248220219,
"grad_norm": 0.4619258642196655,
"learning_rate": 7.100000000000001e-06,
"loss": 2.8978,
"step": 145
},
{
"epoch": 0.03464641670621737,
"grad_norm": 0.17557378113269806,
"learning_rate": 7.08e-06,
"loss": 1.5882,
"step": 146
},
{
"epoch": 0.03488372093023256,
"grad_norm": 0.6798960566520691,
"learning_rate": 7.06e-06,
"loss": 2.5057,
"step": 147
},
{
"epoch": 0.03512102515424775,
"grad_norm": 0.7229902744293213,
"learning_rate": 7.04e-06,
"loss": 1.195,
"step": 148
},
{
"epoch": 0.035358329378262936,
"grad_norm": 0.5356243848800659,
"learning_rate": 7.0200000000000006e-06,
"loss": 2.3255,
"step": 149
},
{
"epoch": 0.03559563360227812,
"grad_norm": 0.735577404499054,
"learning_rate": 7e-06,
"loss": 0.5763,
"step": 150
},
{
"epoch": 0.03583293782629331,
"grad_norm": 0.7191525101661682,
"learning_rate": 6.98e-06,
"loss": 1.5225,
"step": 151
},
{
"epoch": 0.036070242050308496,
"grad_norm": 0.5553634166717529,
"learning_rate": 6.96e-06,
"loss": 2.0026,
"step": 152
},
{
"epoch": 0.036307546274323685,
"grad_norm": 0.9029828310012817,
"learning_rate": 6.9400000000000005e-06,
"loss": 0.8248,
"step": 153
},
{
"epoch": 0.036544850498338874,
"grad_norm": 0.8566996455192566,
"learning_rate": 6.92e-06,
"loss": 1.9291,
"step": 154
},
{
"epoch": 0.036782154722354056,
"grad_norm": 0.677769124507904,
"learning_rate": 6.9e-06,
"loss": 1.5933,
"step": 155
},
{
"epoch": 0.037019458946369245,
"grad_norm": 0.5915787220001221,
"learning_rate": 6.88e-06,
"loss": 0.9965,
"step": 156
},
{
"epoch": 0.037256763170384434,
"grad_norm": 0.511048436164856,
"learning_rate": 6.860000000000001e-06,
"loss": 1.8256,
"step": 157
},
{
"epoch": 0.03749406739439962,
"grad_norm": 0.49128812551498413,
"learning_rate": 6.8400000000000014e-06,
"loss": 1.6187,
"step": 158
},
{
"epoch": 0.037731371618414805,
"grad_norm": 0.554414689540863,
"learning_rate": 6.820000000000001e-06,
"loss": 1.1991,
"step": 159
},
{
"epoch": 0.037968675842429994,
"grad_norm": 0.9458298683166504,
"learning_rate": 6.800000000000001e-06,
"loss": 2.2793,
"step": 160
},
{
"epoch": 0.03820598006644518,
"grad_norm": 2.0072669982910156,
"learning_rate": 6.780000000000001e-06,
"loss": 1.89,
"step": 161
},
{
"epoch": 0.03844328429046037,
"grad_norm": 1.146154761314392,
"learning_rate": 6.760000000000001e-06,
"loss": 2.607,
"step": 162
},
{
"epoch": 0.038680588514475554,
"grad_norm": 0.6168065667152405,
"learning_rate": 6.740000000000001e-06,
"loss": 1.459,
"step": 163
},
{
"epoch": 0.03891789273849074,
"grad_norm": 0.4497089684009552,
"learning_rate": 6.720000000000001e-06,
"loss": 1.2466,
"step": 164
},
{
"epoch": 0.03915519696250593,
"grad_norm": 0.7007705569267273,
"learning_rate": 6.700000000000001e-06,
"loss": 2.4729,
"step": 165
},
{
"epoch": 0.03939250118652112,
"grad_norm": 0.8613377809524536,
"learning_rate": 6.680000000000001e-06,
"loss": 2.2219,
"step": 166
},
{
"epoch": 0.03962980541053631,
"grad_norm": 0.539036750793457,
"learning_rate": 6.660000000000001e-06,
"loss": 0.5526,
"step": 167
},
{
"epoch": 0.03986710963455149,
"grad_norm": 0.6085006594657898,
"learning_rate": 6.640000000000001e-06,
"loss": 2.1238,
"step": 168
},
{
"epoch": 0.04010441385856668,
"grad_norm": 1.8861020803451538,
"learning_rate": 6.620000000000001e-06,
"loss": 2.6934,
"step": 169
},
{
"epoch": 0.04034171808258187,
"grad_norm": 0.5803074240684509,
"learning_rate": 6.600000000000001e-06,
"loss": 1.2254,
"step": 170
},
{
"epoch": 0.04057902230659706,
"grad_norm": 1.0310598611831665,
"learning_rate": 6.5800000000000005e-06,
"loss": 1.1757,
"step": 171
},
{
"epoch": 0.04081632653061224,
"grad_norm": 0.9412042498588562,
"learning_rate": 6.560000000000001e-06,
"loss": 1.3748,
"step": 172
},
{
"epoch": 0.04105363075462743,
"grad_norm": 0.6556461453437805,
"learning_rate": 6.540000000000001e-06,
"loss": 1.0809,
"step": 173
},
{
"epoch": 0.04129093497864262,
"grad_norm": 0.4990858733654022,
"learning_rate": 6.520000000000001e-06,
"loss": 1.2934,
"step": 174
},
{
"epoch": 0.04152823920265781,
"grad_norm": 0.6699053645133972,
"learning_rate": 6.5000000000000004e-06,
"loss": 2.4814,
"step": 175
},
{
"epoch": 0.041765543426673,
"grad_norm": 0.8843134641647339,
"learning_rate": 6.480000000000001e-06,
"loss": 1.9615,
"step": 176
},
{
"epoch": 0.04200284765068818,
"grad_norm": 0.6757798790931702,
"learning_rate": 6.460000000000001e-06,
"loss": 1.7584,
"step": 177
},
{
"epoch": 0.04224015187470337,
"grad_norm": 0.46641185879707336,
"learning_rate": 6.440000000000001e-06,
"loss": 1.3054,
"step": 178
},
{
"epoch": 0.04247745609871856,
"grad_norm": 0.7104260325431824,
"learning_rate": 6.42e-06,
"loss": 1.1761,
"step": 179
},
{
"epoch": 0.04271476032273375,
"grad_norm": 0.3888971209526062,
"learning_rate": 6.4000000000000006e-06,
"loss": 2.2055,
"step": 180
},
{
"epoch": 0.04295206454674893,
"grad_norm": 1.1661548614501953,
"learning_rate": 6.380000000000001e-06,
"loss": 1.1329,
"step": 181
},
{
"epoch": 0.04318936877076412,
"grad_norm": 0.40143099427223206,
"learning_rate": 6.360000000000001e-06,
"loss": 1.0135,
"step": 182
},
{
"epoch": 0.04342667299477931,
"grad_norm": 0.692574679851532,
"learning_rate": 6.34e-06,
"loss": 1.3553,
"step": 183
},
{
"epoch": 0.043663977218794496,
"grad_norm": 0.6210284233093262,
"learning_rate": 6.3200000000000005e-06,
"loss": 1.4226,
"step": 184
},
{
"epoch": 0.043901281442809685,
"grad_norm": 0.6218913197517395,
"learning_rate": 6.300000000000001e-06,
"loss": 1.8475,
"step": 185
},
{
"epoch": 0.04413858566682487,
"grad_norm": 0.6004898548126221,
"learning_rate": 6.280000000000001e-06,
"loss": 1.8829,
"step": 186
},
{
"epoch": 0.044375889890840056,
"grad_norm": 0.8992873430252075,
"learning_rate": 6.26e-06,
"loss": 1.0599,
"step": 187
},
{
"epoch": 0.044613194114855245,
"grad_norm": 0.5328947305679321,
"learning_rate": 6.24e-06,
"loss": 0.7189,
"step": 188
},
{
"epoch": 0.044850498338870434,
"grad_norm": 0.5015738010406494,
"learning_rate": 6.220000000000001e-06,
"loss": 2.0798,
"step": 189
},
{
"epoch": 0.045087802562885616,
"grad_norm": 0.47242438793182373,
"learning_rate": 6.200000000000001e-06,
"loss": 1.3423,
"step": 190
},
{
"epoch": 0.045325106786900805,
"grad_norm": 0.9236852526664734,
"learning_rate": 6.18e-06,
"loss": 1.2769,
"step": 191
},
{
"epoch": 0.045562411010915994,
"grad_norm": 0.47883141040802,
"learning_rate": 6.16e-06,
"loss": 1.5209,
"step": 192
},
{
"epoch": 0.04579971523493118,
"grad_norm": 0.760746955871582,
"learning_rate": 6.1400000000000005e-06,
"loss": 1.6353,
"step": 193
},
{
"epoch": 0.04603701945894637,
"grad_norm": 0.5924590826034546,
"learning_rate": 6.120000000000001e-06,
"loss": 1.3738,
"step": 194
},
{
"epoch": 0.046274323682961555,
"grad_norm": 0.7453778386116028,
"learning_rate": 6.1e-06,
"loss": 0.8792,
"step": 195
},
{
"epoch": 0.046511627906976744,
"grad_norm": 0.8486724495887756,
"learning_rate": 6.08e-06,
"loss": 1.7324,
"step": 196
},
{
"epoch": 0.04674893213099193,
"grad_norm": 0.5349167585372925,
"learning_rate": 6.0600000000000004e-06,
"loss": 1.1938,
"step": 197
},
{
"epoch": 0.04698623635500712,
"grad_norm": 0.4522935748100281,
"learning_rate": 6.040000000000001e-06,
"loss": 1.282,
"step": 198
},
{
"epoch": 0.047223540579022304,
"grad_norm": 0.6473549008369446,
"learning_rate": 6.02e-06,
"loss": 0.9753,
"step": 199
},
{
"epoch": 0.04746084480303749,
"grad_norm": 0.8137974739074707,
"learning_rate": 6e-06,
"loss": 1.6364,
"step": 200
},
{
"epoch": 0.04769814902705268,
"grad_norm": 0.48920387029647827,
"learning_rate": 5.98e-06,
"loss": 1.8822,
"step": 201
},
{
"epoch": 0.04793545325106787,
"grad_norm": 0.7745688557624817,
"learning_rate": 5.9600000000000005e-06,
"loss": 1.1183,
"step": 202
},
{
"epoch": 0.04817275747508306,
"grad_norm": 1.2582346200942993,
"learning_rate": 5.94e-06,
"loss": 2.3109,
"step": 203
},
{
"epoch": 0.04841006169909824,
"grad_norm": 0.6862124800682068,
"learning_rate": 5.92e-06,
"loss": 1.972,
"step": 204
},
{
"epoch": 0.04864736592311343,
"grad_norm": 0.2611483931541443,
"learning_rate": 5.9e-06,
"loss": 1.657,
"step": 205
},
{
"epoch": 0.04888467014712862,
"grad_norm": 0.6788746118545532,
"learning_rate": 5.8800000000000005e-06,
"loss": 1.4215,
"step": 206
},
{
"epoch": 0.04912197437114381,
"grad_norm": 0.7344043254852295,
"learning_rate": 5.86e-06,
"loss": 1.8161,
"step": 207
},
{
"epoch": 0.04935927859515899,
"grad_norm": 0.9730067849159241,
"learning_rate": 5.84e-06,
"loss": 2.009,
"step": 208
},
{
"epoch": 0.04959658281917418,
"grad_norm": 0.5687656998634338,
"learning_rate": 5.82e-06,
"loss": 1.371,
"step": 209
},
{
"epoch": 0.04983388704318937,
"grad_norm": 0.9074623584747314,
"learning_rate": 5.8e-06,
"loss": 0.9096,
"step": 210
},
{
"epoch": 0.05007119126720456,
"grad_norm": 0.5359849333763123,
"learning_rate": 5.78e-06,
"loss": 1.7314,
"step": 211
},
{
"epoch": 0.05030849549121975,
"grad_norm": 0.5226656198501587,
"learning_rate": 5.76e-06,
"loss": 0.7389,
"step": 212
},
{
"epoch": 0.05054579971523493,
"grad_norm": 0.8956894874572754,
"learning_rate": 5.74e-06,
"loss": 1.1937,
"step": 213
},
{
"epoch": 0.05078310393925012,
"grad_norm": 1.2422311305999756,
"learning_rate": 5.72e-06,
"loss": 1.8545,
"step": 214
},
{
"epoch": 0.05102040816326531,
"grad_norm": 0.5634399056434631,
"learning_rate": 5.7e-06,
"loss": 1.446,
"step": 215
},
{
"epoch": 0.051257712387280496,
"grad_norm": 0.7447414398193359,
"learning_rate": 5.68e-06,
"loss": 1.7231,
"step": 216
},
{
"epoch": 0.05149501661129568,
"grad_norm": 1.131425380706787,
"learning_rate": 5.66e-06,
"loss": 1.4652,
"step": 217
},
{
"epoch": 0.05173232083531087,
"grad_norm": 0.8668593764305115,
"learning_rate": 5.64e-06,
"loss": 2.0443,
"step": 218
},
{
"epoch": 0.051969625059326056,
"grad_norm": 1.2437993288040161,
"learning_rate": 5.620000000000001e-06,
"loss": 2.1323,
"step": 219
},
{
"epoch": 0.052206929283341245,
"grad_norm": 0.914955198764801,
"learning_rate": 5.600000000000001e-06,
"loss": 2.2205,
"step": 220
},
{
"epoch": 0.05244423350735643,
"grad_norm": 1.0641348361968994,
"learning_rate": 5.580000000000001e-06,
"loss": 2.4159,
"step": 221
},
{
"epoch": 0.05268153773137162,
"grad_norm": 0.17957435548305511,
"learning_rate": 5.560000000000001e-06,
"loss": 1.1683,
"step": 222
},
{
"epoch": 0.052918841955386806,
"grad_norm": 1.058635950088501,
"learning_rate": 5.540000000000001e-06,
"loss": 1.8851,
"step": 223
},
{
"epoch": 0.053156146179401995,
"grad_norm": 0.6216885447502136,
"learning_rate": 5.5200000000000005e-06,
"loss": 2.0638,
"step": 224
},
{
"epoch": 0.053393450403417184,
"grad_norm": 0.32197320461273193,
"learning_rate": 5.500000000000001e-06,
"loss": 4.2387,
"step": 225
},
{
"epoch": 0.053630754627432366,
"grad_norm": 0.5334311127662659,
"learning_rate": 5.480000000000001e-06,
"loss": 1.199,
"step": 226
},
{
"epoch": 0.053868058851447555,
"grad_norm": 0.8656753301620483,
"learning_rate": 5.460000000000001e-06,
"loss": 1.0053,
"step": 227
},
{
"epoch": 0.054105363075462744,
"grad_norm": 0.9633522629737854,
"learning_rate": 5.4400000000000004e-06,
"loss": 2.6078,
"step": 228
},
{
"epoch": 0.05434266729947793,
"grad_norm": 0.8262597322463989,
"learning_rate": 5.420000000000001e-06,
"loss": 2.4638,
"step": 229
},
{
"epoch": 0.054579971523493115,
"grad_norm": 0.9816875457763672,
"learning_rate": 5.400000000000001e-06,
"loss": 2.3545,
"step": 230
},
{
"epoch": 0.054817275747508304,
"grad_norm": 0.7079796195030212,
"learning_rate": 5.380000000000001e-06,
"loss": 1.7035,
"step": 231
},
{
"epoch": 0.05505457997152349,
"grad_norm": 0.2551076412200928,
"learning_rate": 5.36e-06,
"loss": 2.3941,
"step": 232
},
{
"epoch": 0.05529188419553868,
"grad_norm": 0.8667798042297363,
"learning_rate": 5.3400000000000005e-06,
"loss": 2.5876,
"step": 233
},
{
"epoch": 0.05552918841955387,
"grad_norm": 0.48972687125205994,
"learning_rate": 5.320000000000001e-06,
"loss": 1.9343,
"step": 234
},
{
"epoch": 0.05576649264356905,
"grad_norm": 1.166282057762146,
"learning_rate": 5.300000000000001e-06,
"loss": 1.5903,
"step": 235
},
{
"epoch": 0.05600379686758424,
"grad_norm": 1.0136897563934326,
"learning_rate": 5.28e-06,
"loss": 1.9003,
"step": 236
},
{
"epoch": 0.05624110109159943,
"grad_norm": 0.9301249384880066,
"learning_rate": 5.2600000000000005e-06,
"loss": 2.2514,
"step": 237
},
{
"epoch": 0.05647840531561462,
"grad_norm": 0.6378384232521057,
"learning_rate": 5.240000000000001e-06,
"loss": 1.0407,
"step": 238
},
{
"epoch": 0.0567157095396298,
"grad_norm": 0.7191042900085449,
"learning_rate": 5.220000000000001e-06,
"loss": 1.8487,
"step": 239
},
{
"epoch": 0.05695301376364499,
"grad_norm": 0.8724852204322815,
"learning_rate": 5.2e-06,
"loss": 1.9766,
"step": 240
},
{
"epoch": 0.05719031798766018,
"grad_norm": 1.2465623617172241,
"learning_rate": 5.18e-06,
"loss": 2.0123,
"step": 241
},
{
"epoch": 0.05742762221167537,
"grad_norm": 0.6831521987915039,
"learning_rate": 5.1600000000000006e-06,
"loss": 2.5164,
"step": 242
},
{
"epoch": 0.05766492643569056,
"grad_norm": 0.9678359627723694,
"learning_rate": 5.140000000000001e-06,
"loss": 1.1767,
"step": 243
},
{
"epoch": 0.05790223065970574,
"grad_norm": 0.7171100378036499,
"learning_rate": 5.12e-06,
"loss": 1.5144,
"step": 244
},
{
"epoch": 0.05813953488372093,
"grad_norm": 1.8189458847045898,
"learning_rate": 5.1e-06,
"loss": 0.8019,
"step": 245
},
{
"epoch": 0.05837683910773612,
"grad_norm": 0.8519846796989441,
"learning_rate": 5.0800000000000005e-06,
"loss": 0.5605,
"step": 246
},
{
"epoch": 0.05861414333175131,
"grad_norm": 0.8352647423744202,
"learning_rate": 5.060000000000001e-06,
"loss": 1.83,
"step": 247
},
{
"epoch": 0.05885144755576649,
"grad_norm": 0.8553798198699951,
"learning_rate": 5.04e-06,
"loss": 0.8078,
"step": 248
},
{
"epoch": 0.05908875177978168,
"grad_norm": 0.5836890935897827,
"learning_rate": 5.02e-06,
"loss": 1.805,
"step": 249
},
{
"epoch": 0.05932605600379687,
"grad_norm": 1.2686045169830322,
"learning_rate": 5e-06,
"loss": 2.9273,
"step": 250
},
{
"epoch": 0.05956336022781206,
"grad_norm": 1.0224002599716187,
"learning_rate": 4.980000000000001e-06,
"loss": 1.5689,
"step": 251
},
{
"epoch": 0.059800664451827246,
"grad_norm": 0.6084955334663391,
"learning_rate": 4.960000000000001e-06,
"loss": 1.4633,
"step": 252
},
{
"epoch": 0.06003796867584243,
"grad_norm": 1.0595405101776123,
"learning_rate": 4.94e-06,
"loss": 1.799,
"step": 253
},
{
"epoch": 0.06027527289985762,
"grad_norm": 0.6184794306755066,
"learning_rate": 4.92e-06,
"loss": 0.936,
"step": 254
},
{
"epoch": 0.060512577123872806,
"grad_norm": 0.8224856853485107,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.2481,
"step": 255
},
{
"epoch": 0.060749881347887995,
"grad_norm": 0.4918515086174011,
"learning_rate": 4.880000000000001e-06,
"loss": 1.6237,
"step": 256
},
{
"epoch": 0.06098718557190318,
"grad_norm": 0.7177821397781372,
"learning_rate": 4.86e-06,
"loss": 1.6721,
"step": 257
},
{
"epoch": 0.061224489795918366,
"grad_norm": 0.654577374458313,
"learning_rate": 4.84e-06,
"loss": 1.6336,
"step": 258
},
{
"epoch": 0.061461794019933555,
"grad_norm": 0.6433861255645752,
"learning_rate": 4.8200000000000004e-06,
"loss": 0.7603,
"step": 259
},
{
"epoch": 0.061699098243948744,
"grad_norm": 0.8999320864677429,
"learning_rate": 4.800000000000001e-06,
"loss": 1.5193,
"step": 260
},
{
"epoch": 0.06193640246796393,
"grad_norm": 1.071006417274475,
"learning_rate": 4.78e-06,
"loss": 2.1792,
"step": 261
},
{
"epoch": 0.062173706691979115,
"grad_norm": 0.950939416885376,
"learning_rate": 4.76e-06,
"loss": 1.7475,
"step": 262
},
{
"epoch": 0.062411010915994304,
"grad_norm": 0.6791463494300842,
"learning_rate": 4.74e-06,
"loss": 1.5734,
"step": 263
},
{
"epoch": 0.06264831514000949,
"grad_norm": 0.8757117986679077,
"learning_rate": 4.7200000000000005e-06,
"loss": 0.9807,
"step": 264
},
{
"epoch": 0.06288561936402468,
"grad_norm": 1.6328972578048706,
"learning_rate": 4.7e-06,
"loss": 1.3517,
"step": 265
},
{
"epoch": 0.06312292358803986,
"grad_norm": 1.2624143362045288,
"learning_rate": 4.680000000000001e-06,
"loss": 1.935,
"step": 266
},
{
"epoch": 0.06336022781205505,
"grad_norm": 0.8143572211265564,
"learning_rate": 4.66e-06,
"loss": 1.2027,
"step": 267
},
{
"epoch": 0.06359753203607024,
"grad_norm": 1.05904221534729,
"learning_rate": 4.6400000000000005e-06,
"loss": 2.0852,
"step": 268
},
{
"epoch": 0.06383483626008543,
"grad_norm": 0.5743423104286194,
"learning_rate": 4.620000000000001e-06,
"loss": 1.649,
"step": 269
},
{
"epoch": 0.06407214048410062,
"grad_norm": 1.2721295356750488,
"learning_rate": 4.600000000000001e-06,
"loss": 1.7225,
"step": 270
},
{
"epoch": 0.06430944470811581,
"grad_norm": 0.8935225009918213,
"learning_rate": 4.58e-06,
"loss": 1.7969,
"step": 271
},
{
"epoch": 0.064546748932131,
"grad_norm": 0.4470404088497162,
"learning_rate": 4.56e-06,
"loss": 1.4834,
"step": 272
},
{
"epoch": 0.06478405315614617,
"grad_norm": 0.41209957003593445,
"learning_rate": 4.540000000000001e-06,
"loss": 1.5577,
"step": 273
},
{
"epoch": 0.06502135738016136,
"grad_norm": 1.3055024147033691,
"learning_rate": 4.520000000000001e-06,
"loss": 2.0445,
"step": 274
},
{
"epoch": 0.06525866160417655,
"grad_norm": 1.182127594947815,
"learning_rate": 4.5e-06,
"loss": 2.5339,
"step": 275
},
{
"epoch": 0.06549596582819174,
"grad_norm": 1.030988335609436,
"learning_rate": 4.48e-06,
"loss": 2.8977,
"step": 276
},
{
"epoch": 0.06573327005220693,
"grad_norm": 1.023729920387268,
"learning_rate": 4.4600000000000005e-06,
"loss": 3.7561,
"step": 277
},
{
"epoch": 0.06597057427622212,
"grad_norm": 0.8441396951675415,
"learning_rate": 4.440000000000001e-06,
"loss": 0.9105,
"step": 278
},
{
"epoch": 0.06620787850023731,
"grad_norm": 0.781200647354126,
"learning_rate": 4.42e-06,
"loss": 1.1585,
"step": 279
},
{
"epoch": 0.0664451827242525,
"grad_norm": 1.0872159004211426,
"learning_rate": 4.4e-06,
"loss": 1.8595,
"step": 280
},
{
"epoch": 0.06668248694826769,
"grad_norm": 0.7548374533653259,
"learning_rate": 4.38e-06,
"loss": 1.5494,
"step": 281
},
{
"epoch": 0.06691979117228286,
"grad_norm": 0.7101930975914001,
"learning_rate": 4.360000000000001e-06,
"loss": 0.9779,
"step": 282
},
{
"epoch": 0.06715709539629805,
"grad_norm": 2.0798416137695312,
"learning_rate": 4.34e-06,
"loss": 2.2541,
"step": 283
},
{
"epoch": 0.06739439962031324,
"grad_norm": 0.45076173543930054,
"learning_rate": 4.32e-06,
"loss": 2.0375,
"step": 284
},
{
"epoch": 0.06763170384432843,
"grad_norm": 1.132407784461975,
"learning_rate": 4.3e-06,
"loss": 1.8917,
"step": 285
},
{
"epoch": 0.06786900806834362,
"grad_norm": 0.4013515114784241,
"learning_rate": 4.2800000000000005e-06,
"loss": 0.3564,
"step": 286
},
{
"epoch": 0.0681063122923588,
"grad_norm": 0.7896368503570557,
"learning_rate": 4.26e-06,
"loss": 1.5522,
"step": 287
},
{
"epoch": 0.068343616516374,
"grad_norm": 0.5332828760147095,
"learning_rate": 4.24e-06,
"loss": 0.56,
"step": 288
},
{
"epoch": 0.06858092074038918,
"grad_norm": 0.6611258387565613,
"learning_rate": 4.22e-06,
"loss": 1.5975,
"step": 289
},
{
"epoch": 0.06881822496440437,
"grad_norm": 0.8199064135551453,
"learning_rate": 4.2000000000000004e-06,
"loss": 2.1785,
"step": 290
},
{
"epoch": 0.06905552918841955,
"grad_norm": 0.8175731301307678,
"learning_rate": 4.18e-06,
"loss": 2.1028,
"step": 291
},
{
"epoch": 0.06929283341243474,
"grad_norm": 0.7817385196685791,
"learning_rate": 4.16e-06,
"loss": 1.6518,
"step": 292
},
{
"epoch": 0.06953013763644993,
"grad_norm": 0.7397461533546448,
"learning_rate": 4.14e-06,
"loss": 1.4586,
"step": 293
},
{
"epoch": 0.06976744186046512,
"grad_norm": 0.39525383710861206,
"learning_rate": 4.12e-06,
"loss": 2.1693,
"step": 294
},
{
"epoch": 0.0700047460844803,
"grad_norm": 0.16452622413635254,
"learning_rate": 4.1e-06,
"loss": 2.0744,
"step": 295
},
{
"epoch": 0.0702420503084955,
"grad_norm": 0.49008700251579285,
"learning_rate": 4.08e-06,
"loss": 1.5107,
"step": 296
},
{
"epoch": 0.07047935453251068,
"grad_norm": 0.6975173354148865,
"learning_rate": 4.060000000000001e-06,
"loss": 1.4174,
"step": 297
},
{
"epoch": 0.07071665875652587,
"grad_norm": 0.6213851571083069,
"learning_rate": 4.04e-06,
"loss": 1.4038,
"step": 298
},
{
"epoch": 0.07095396298054106,
"grad_norm": 0.5320644378662109,
"learning_rate": 4.0200000000000005e-06,
"loss": 0.8553,
"step": 299
},
{
"epoch": 0.07119126720455624,
"grad_norm": 1.0871286392211914,
"learning_rate": 4.000000000000001e-06,
"loss": 2.0419,
"step": 300
},
{
"epoch": 0.07142857142857142,
"grad_norm": 0.3801209330558777,
"learning_rate": 3.980000000000001e-06,
"loss": 0.5386,
"step": 301
},
{
"epoch": 0.07166587565258661,
"grad_norm": 0.39513254165649414,
"learning_rate": 3.96e-06,
"loss": 2.0508,
"step": 302
},
{
"epoch": 0.0719031798766018,
"grad_norm": 0.45403411984443665,
"learning_rate": 3.94e-06,
"loss": 1.3929,
"step": 303
},
{
"epoch": 0.07214048410061699,
"grad_norm": 0.9575373530387878,
"learning_rate": 3.920000000000001e-06,
"loss": 2.3445,
"step": 304
},
{
"epoch": 0.07237778832463218,
"grad_norm": 0.7246173620223999,
"learning_rate": 3.900000000000001e-06,
"loss": 1.1485,
"step": 305
},
{
"epoch": 0.07261509254864737,
"grad_norm": 0.4268713891506195,
"learning_rate": 3.88e-06,
"loss": 1.7111,
"step": 306
},
{
"epoch": 0.07285239677266256,
"grad_norm": 0.3135124742984772,
"learning_rate": 3.86e-06,
"loss": 1.4731,
"step": 307
},
{
"epoch": 0.07308970099667775,
"grad_norm": 0.647866427898407,
"learning_rate": 3.8400000000000005e-06,
"loss": 0.9345,
"step": 308
},
{
"epoch": 0.07332700522069292,
"grad_norm": 0.6480103135108948,
"learning_rate": 3.820000000000001e-06,
"loss": 0.5553,
"step": 309
},
{
"epoch": 0.07356430944470811,
"grad_norm": 0.7226047515869141,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.1579,
"step": 310
},
{
"epoch": 0.0738016136687233,
"grad_norm": 0.6003654599189758,
"learning_rate": 3.7800000000000002e-06,
"loss": 1.2483,
"step": 311
},
{
"epoch": 0.07403891789273849,
"grad_norm": 0.6066475510597229,
"learning_rate": 3.7600000000000004e-06,
"loss": 0.7838,
"step": 312
},
{
"epoch": 0.07427622211675368,
"grad_norm": 0.5357272624969482,
"learning_rate": 3.74e-06,
"loss": 1.8871,
"step": 313
},
{
"epoch": 0.07451352634076887,
"grad_norm": 0.8201131820678711,
"learning_rate": 3.7200000000000004e-06,
"loss": 1.6833,
"step": 314
},
{
"epoch": 0.07475083056478406,
"grad_norm": 0.6600767970085144,
"learning_rate": 3.7e-06,
"loss": 1.5297,
"step": 315
},
{
"epoch": 0.07498813478879925,
"grad_norm": 0.6373748779296875,
"learning_rate": 3.6800000000000003e-06,
"loss": 1.5093,
"step": 316
},
{
"epoch": 0.07522543901281442,
"grad_norm": 0.7496886849403381,
"learning_rate": 3.66e-06,
"loss": 2.4157,
"step": 317
},
{
"epoch": 0.07546274323682961,
"grad_norm": 0.9333056211471558,
"learning_rate": 3.6400000000000003e-06,
"loss": 1.5617,
"step": 318
},
{
"epoch": 0.0757000474608448,
"grad_norm": 1.0693997144699097,
"learning_rate": 3.62e-06,
"loss": 1.5375,
"step": 319
},
{
"epoch": 0.07593735168485999,
"grad_norm": 0.5746883749961853,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.4276,
"step": 320
},
{
"epoch": 0.07617465590887518,
"grad_norm": 0.9793761968612671,
"learning_rate": 3.58e-06,
"loss": 0.9466,
"step": 321
},
{
"epoch": 0.07641196013289037,
"grad_norm": 0.6006602048873901,
"learning_rate": 3.5600000000000002e-06,
"loss": 1.028,
"step": 322
},
{
"epoch": 0.07664926435690556,
"grad_norm": 0.7533923983573914,
"learning_rate": 3.54e-06,
"loss": 1.9837,
"step": 323
},
{
"epoch": 0.07688656858092074,
"grad_norm": 1.0659205913543701,
"learning_rate": 3.52e-06,
"loss": 1.4845,
"step": 324
},
{
"epoch": 0.07712387280493593,
"grad_norm": 0.9382888078689575,
"learning_rate": 3.5e-06,
"loss": 1.8868,
"step": 325
},
{
"epoch": 0.07736117702895111,
"grad_norm": 0.5263766050338745,
"learning_rate": 3.48e-06,
"loss": 1.6957,
"step": 326
},
{
"epoch": 0.0775984812529663,
"grad_norm": 1.1845793724060059,
"learning_rate": 3.46e-06,
"loss": 1.7971,
"step": 327
},
{
"epoch": 0.07783578547698149,
"grad_norm": 0.7983663082122803,
"learning_rate": 3.44e-06,
"loss": 1.7826,
"step": 328
},
{
"epoch": 0.07807308970099668,
"grad_norm": 0.999782145023346,
"learning_rate": 3.4200000000000007e-06,
"loss": 1.4238,
"step": 329
},
{
"epoch": 0.07831039392501186,
"grad_norm": 0.44168537855148315,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.3762,
"step": 330
},
{
"epoch": 0.07854769814902705,
"grad_norm": 3.451951026916504,
"learning_rate": 3.3800000000000007e-06,
"loss": 1.8961,
"step": 331
},
{
"epoch": 0.07878500237304224,
"grad_norm": 1.2203079462051392,
"learning_rate": 3.3600000000000004e-06,
"loss": 2.4091,
"step": 332
},
{
"epoch": 0.07902230659705743,
"grad_norm": 0.7909596562385559,
"learning_rate": 3.3400000000000006e-06,
"loss": 1.288,
"step": 333
},
{
"epoch": 0.07925961082107262,
"grad_norm": 1.0289673805236816,
"learning_rate": 3.3200000000000004e-06,
"loss": 2.7401,
"step": 334
},
{
"epoch": 0.0794969150450878,
"grad_norm": 1.550726056098938,
"learning_rate": 3.3000000000000006e-06,
"loss": 2.0937,
"step": 335
},
{
"epoch": 0.07973421926910298,
"grad_norm": 0.9550947546958923,
"learning_rate": 3.2800000000000004e-06,
"loss": 1.3554,
"step": 336
},
{
"epoch": 0.07997152349311817,
"grad_norm": 0.5482783913612366,
"learning_rate": 3.2600000000000006e-06,
"loss": 2.0287,
"step": 337
},
{
"epoch": 0.08020882771713336,
"grad_norm": 1.071254014968872,
"learning_rate": 3.2400000000000003e-06,
"loss": 1.6622,
"step": 338
},
{
"epoch": 0.08044613194114855,
"grad_norm": 1.0661407709121704,
"learning_rate": 3.2200000000000005e-06,
"loss": 0.9957,
"step": 339
},
{
"epoch": 0.08068343616516374,
"grad_norm": 0.91053307056427,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.8996,
"step": 340
},
{
"epoch": 0.08092074038917893,
"grad_norm": 1.1073331832885742,
"learning_rate": 3.1800000000000005e-06,
"loss": 2.591,
"step": 341
},
{
"epoch": 0.08115804461319412,
"grad_norm": 1.430469274520874,
"learning_rate": 3.1600000000000002e-06,
"loss": 2.6224,
"step": 342
},
{
"epoch": 0.08139534883720931,
"grad_norm": 0.8856847882270813,
"learning_rate": 3.1400000000000004e-06,
"loss": 1.8773,
"step": 343
},
{
"epoch": 0.08163265306122448,
"grad_norm": 0.7149975895881653,
"learning_rate": 3.12e-06,
"loss": 1.9744,
"step": 344
},
{
"epoch": 0.08186995728523967,
"grad_norm": 1.156784176826477,
"learning_rate": 3.1000000000000004e-06,
"loss": 2.718,
"step": 345
},
{
"epoch": 0.08210726150925486,
"grad_norm": 1.512473464012146,
"learning_rate": 3.08e-06,
"loss": 1.0725,
"step": 346
},
{
"epoch": 0.08234456573327005,
"grad_norm": 0.5760719776153564,
"learning_rate": 3.0600000000000003e-06,
"loss": 1.3986,
"step": 347
},
{
"epoch": 0.08258186995728524,
"grad_norm": 1.1456931829452515,
"learning_rate": 3.04e-06,
"loss": 2.6081,
"step": 348
},
{
"epoch": 0.08281917418130043,
"grad_norm": 0.6848717927932739,
"learning_rate": 3.0200000000000003e-06,
"loss": 2.8127,
"step": 349
},
{
"epoch": 0.08305647840531562,
"grad_norm": 1.2799925804138184,
"learning_rate": 3e-06,
"loss": 1.1195,
"step": 350
},
{
"epoch": 0.0832937826293308,
"grad_norm": 1.1718430519104004,
"learning_rate": 2.9800000000000003e-06,
"loss": 1.6587,
"step": 351
},
{
"epoch": 0.083531086853346,
"grad_norm": 0.5741757750511169,
"learning_rate": 2.96e-06,
"loss": 1.6265,
"step": 352
},
{
"epoch": 0.08376839107736117,
"grad_norm": 0.8194566965103149,
"learning_rate": 2.9400000000000002e-06,
"loss": 1.2456,
"step": 353
},
{
"epoch": 0.08400569530137636,
"grad_norm": 0.49410197138786316,
"learning_rate": 2.92e-06,
"loss": 1.5949,
"step": 354
},
{
"epoch": 0.08424299952539155,
"grad_norm": 0.9407163858413696,
"learning_rate": 2.9e-06,
"loss": 1.1365,
"step": 355
},
{
"epoch": 0.08448030374940674,
"grad_norm": 0.764671266078949,
"learning_rate": 2.88e-06,
"loss": 0.9231,
"step": 356
},
{
"epoch": 0.08471760797342193,
"grad_norm": 0.6322979927062988,
"learning_rate": 2.86e-06,
"loss": 0.477,
"step": 357
},
{
"epoch": 0.08495491219743712,
"grad_norm": 0.7397903800010681,
"learning_rate": 2.84e-06,
"loss": 1.2955,
"step": 358
},
{
"epoch": 0.0851922164214523,
"grad_norm": 1.3564809560775757,
"learning_rate": 2.82e-06,
"loss": 1.0156,
"step": 359
},
{
"epoch": 0.0854295206454675,
"grad_norm": 0.9099704027175903,
"learning_rate": 2.8000000000000003e-06,
"loss": 2.5199,
"step": 360
},
{
"epoch": 0.08566682486948268,
"grad_norm": 0.596339225769043,
"learning_rate": 2.7800000000000005e-06,
"loss": 1.9148,
"step": 361
},
{
"epoch": 0.08590412909349786,
"grad_norm": 0.8687242269515991,
"learning_rate": 2.7600000000000003e-06,
"loss": 1.6878,
"step": 362
},
{
"epoch": 0.08614143331751305,
"grad_norm": 1.1088467836380005,
"learning_rate": 2.7400000000000004e-06,
"loss": 1.3883,
"step": 363
},
{
"epoch": 0.08637873754152824,
"grad_norm": 0.26222410798072815,
"learning_rate": 2.7200000000000002e-06,
"loss": 1.6821,
"step": 364
},
{
"epoch": 0.08661604176554343,
"grad_norm": 0.9348676800727844,
"learning_rate": 2.7000000000000004e-06,
"loss": 1.3761,
"step": 365
},
{
"epoch": 0.08685334598955861,
"grad_norm": 0.6078274250030518,
"learning_rate": 2.68e-06,
"loss": 0.8237,
"step": 366
},
{
"epoch": 0.0870906502135738,
"grad_norm": 2.2705135345458984,
"learning_rate": 2.6600000000000004e-06,
"loss": 2.1154,
"step": 367
},
{
"epoch": 0.08732795443758899,
"grad_norm": 0.5769693851470947,
"learning_rate": 2.64e-06,
"loss": 0.7956,
"step": 368
},
{
"epoch": 0.08756525866160418,
"grad_norm": 0.9362378120422363,
"learning_rate": 2.6200000000000003e-06,
"loss": 0.8264,
"step": 369
},
{
"epoch": 0.08780256288561937,
"grad_norm": 1.3028643131256104,
"learning_rate": 2.6e-06,
"loss": 2.0418,
"step": 370
},
{
"epoch": 0.08803986710963455,
"grad_norm": 1.020676612854004,
"learning_rate": 2.5800000000000003e-06,
"loss": 1.6203,
"step": 371
},
{
"epoch": 0.08827717133364973,
"grad_norm": 0.5925426483154297,
"learning_rate": 2.56e-06,
"loss": 2.0088,
"step": 372
},
{
"epoch": 0.08851447555766492,
"grad_norm": 0.9247467517852783,
"learning_rate": 2.5400000000000002e-06,
"loss": 1.4702,
"step": 373
},
{
"epoch": 0.08875177978168011,
"grad_norm": 0.8355708718299866,
"learning_rate": 2.52e-06,
"loss": 1.511,
"step": 374
},
{
"epoch": 0.0889890840056953,
"grad_norm": 0.8599538207054138,
"learning_rate": 2.5e-06,
"loss": 1.1829,
"step": 375
},
{
"epoch": 0.08922638822971049,
"grad_norm": 0.7480601668357849,
"learning_rate": 2.4800000000000004e-06,
"loss": 0.5197,
"step": 376
},
{
"epoch": 0.08946369245372568,
"grad_norm": 0.6589898467063904,
"learning_rate": 2.46e-06,
"loss": 1.8403,
"step": 377
},
{
"epoch": 0.08970099667774087,
"grad_norm": 1.1234686374664307,
"learning_rate": 2.4400000000000004e-06,
"loss": 1.1402,
"step": 378
},
{
"epoch": 0.08993830090175606,
"grad_norm": 0.4725819528102875,
"learning_rate": 2.42e-06,
"loss": 1.3948,
"step": 379
},
{
"epoch": 0.09017560512577123,
"grad_norm": 0.579430878162384,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.2087,
"step": 380
},
{
"epoch": 0.09041290934978642,
"grad_norm": 0.9052660465240479,
"learning_rate": 2.38e-06,
"loss": 0.9077,
"step": 381
},
{
"epoch": 0.09065021357380161,
"grad_norm": 0.5743672847747803,
"learning_rate": 2.3600000000000003e-06,
"loss": 2.4246,
"step": 382
},
{
"epoch": 0.0908875177978168,
"grad_norm": 0.5894416570663452,
"learning_rate": 2.3400000000000005e-06,
"loss": 0.9541,
"step": 383
},
{
"epoch": 0.09112482202183199,
"grad_norm": 0.5887079238891602,
"learning_rate": 2.3200000000000002e-06,
"loss": 1.5845,
"step": 384
},
{
"epoch": 0.09136212624584718,
"grad_norm": 0.6027985215187073,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.8314,
"step": 385
},
{
"epoch": 0.09159943046986237,
"grad_norm": 1.0281093120574951,
"learning_rate": 2.28e-06,
"loss": 1.0648,
"step": 386
},
{
"epoch": 0.09183673469387756,
"grad_norm": 1.0283626317977905,
"learning_rate": 2.2600000000000004e-06,
"loss": 2.3067,
"step": 387
},
{
"epoch": 0.09207403891789274,
"grad_norm": 1.1482547521591187,
"learning_rate": 2.24e-06,
"loss": 1.8681,
"step": 388
},
{
"epoch": 0.09231134314190792,
"grad_norm": 0.8530061841011047,
"learning_rate": 2.2200000000000003e-06,
"loss": 0.8857,
"step": 389
},
{
"epoch": 0.09254864736592311,
"grad_norm": 0.836930513381958,
"learning_rate": 2.2e-06,
"loss": 1.8586,
"step": 390
},
{
"epoch": 0.0927859515899383,
"grad_norm": 0.8930790424346924,
"learning_rate": 2.1800000000000003e-06,
"loss": 1.6977,
"step": 391
},
{
"epoch": 0.09302325581395349,
"grad_norm": 0.4565262496471405,
"learning_rate": 2.16e-06,
"loss": 2.1539,
"step": 392
},
{
"epoch": 0.09326056003796868,
"grad_norm": 1.635286569595337,
"learning_rate": 2.1400000000000003e-06,
"loss": 2.4099,
"step": 393
},
{
"epoch": 0.09349786426198387,
"grad_norm": 1.1868668794631958,
"learning_rate": 2.12e-06,
"loss": 1.6283,
"step": 394
},
{
"epoch": 0.09373516848599905,
"grad_norm": 1.4009878635406494,
"learning_rate": 2.1000000000000002e-06,
"loss": 1.4061,
"step": 395
},
{
"epoch": 0.09397247271001424,
"grad_norm": 0.46099644899368286,
"learning_rate": 2.08e-06,
"loss": 0.4871,
"step": 396
},
{
"epoch": 0.09420977693402943,
"grad_norm": 0.7012650370597839,
"learning_rate": 2.06e-06,
"loss": 1.5595,
"step": 397
},
{
"epoch": 0.09444708115804461,
"grad_norm": 0.7766276001930237,
"learning_rate": 2.04e-06,
"loss": 1.0104,
"step": 398
},
{
"epoch": 0.0946843853820598,
"grad_norm": 1.029155969619751,
"learning_rate": 2.02e-06,
"loss": 2.6892,
"step": 399
},
{
"epoch": 0.09492168960607499,
"grad_norm": 1.6015249490737915,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.7859,
"step": 400
},
{
"epoch": 0.09515899383009017,
"grad_norm": 0.3838267922401428,
"learning_rate": 1.98e-06,
"loss": 1.5704,
"step": 401
},
{
"epoch": 0.09539629805410536,
"grad_norm": 0.6478832364082336,
"learning_rate": 1.9600000000000003e-06,
"loss": 1.2718,
"step": 402
},
{
"epoch": 0.09563360227812055,
"grad_norm": 0.9483569860458374,
"learning_rate": 1.94e-06,
"loss": 1.9627,
"step": 403
},
{
"epoch": 0.09587090650213574,
"grad_norm": 0.5800157785415649,
"learning_rate": 1.9200000000000003e-06,
"loss": 0.9114,
"step": 404
},
{
"epoch": 0.09610821072615093,
"grad_norm": 0.5951704978942871,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.2684,
"step": 405
},
{
"epoch": 0.09634551495016612,
"grad_norm": 0.7555820345878601,
"learning_rate": 1.8800000000000002e-06,
"loss": 1.7506,
"step": 406
},
{
"epoch": 0.0965828191741813,
"grad_norm": 1.0840675830841064,
"learning_rate": 1.8600000000000002e-06,
"loss": 2.3729,
"step": 407
},
{
"epoch": 0.09682012339819648,
"grad_norm": 1.0009734630584717,
"learning_rate": 1.8400000000000002e-06,
"loss": 1.0095,
"step": 408
},
{
"epoch": 0.09705742762221167,
"grad_norm": 0.8436226844787598,
"learning_rate": 1.8200000000000002e-06,
"loss": 1.6425,
"step": 409
},
{
"epoch": 0.09729473184622686,
"grad_norm": 0.6967753767967224,
"learning_rate": 1.8000000000000001e-06,
"loss": 2.2186,
"step": 410
},
{
"epoch": 0.09753203607024205,
"grad_norm": 0.9439252614974976,
"learning_rate": 1.7800000000000001e-06,
"loss": 1.0762,
"step": 411
},
{
"epoch": 0.09776934029425724,
"grad_norm": 1.6090588569641113,
"learning_rate": 1.76e-06,
"loss": 1.6685,
"step": 412
},
{
"epoch": 0.09800664451827243,
"grad_norm": 0.6204804182052612,
"learning_rate": 1.74e-06,
"loss": 0.7148,
"step": 413
},
{
"epoch": 0.09824394874228762,
"grad_norm": 0.9542770385742188,
"learning_rate": 1.72e-06,
"loss": 0.7676,
"step": 414
},
{
"epoch": 0.0984812529663028,
"grad_norm": 1.0385842323303223,
"learning_rate": 1.7000000000000002e-06,
"loss": 1.6284,
"step": 415
},
{
"epoch": 0.09871855719031798,
"grad_norm": 0.643661379814148,
"learning_rate": 1.6800000000000002e-06,
"loss": 0.9284,
"step": 416
},
{
"epoch": 0.09895586141433317,
"grad_norm": 0.7566413283348083,
"learning_rate": 1.6600000000000002e-06,
"loss": 1.8893,
"step": 417
},
{
"epoch": 0.09919316563834836,
"grad_norm": 0.805566132068634,
"learning_rate": 1.6400000000000002e-06,
"loss": 1.8842,
"step": 418
},
{
"epoch": 0.09943046986236355,
"grad_norm": 0.503933310508728,
"learning_rate": 1.6200000000000002e-06,
"loss": 1.4513,
"step": 419
},
{
"epoch": 0.09966777408637874,
"grad_norm": 2.3548262119293213,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.381,
"step": 420
},
{
"epoch": 0.09990507831039393,
"grad_norm": 1.1577256917953491,
"learning_rate": 1.5800000000000001e-06,
"loss": 2.0068,
"step": 421
},
{
"epoch": 0.10014238253440912,
"grad_norm": 0.8693385124206543,
"learning_rate": 1.56e-06,
"loss": 1.38,
"step": 422
},
{
"epoch": 0.1003796867584243,
"grad_norm": 0.27834704518318176,
"learning_rate": 1.54e-06,
"loss": 0.3649,
"step": 423
},
{
"epoch": 0.1006169909824395,
"grad_norm": 0.6906237006187439,
"learning_rate": 1.52e-06,
"loss": 2.2498,
"step": 424
},
{
"epoch": 0.10085429520645467,
"grad_norm": 1.5801548957824707,
"learning_rate": 1.5e-06,
"loss": 1.6734,
"step": 425
},
{
"epoch": 0.10109159943046986,
"grad_norm": 0.6525102853775024,
"learning_rate": 1.48e-06,
"loss": 1.7765,
"step": 426
},
{
"epoch": 0.10132890365448505,
"grad_norm": 0.6370388865470886,
"learning_rate": 1.46e-06,
"loss": 1.497,
"step": 427
},
{
"epoch": 0.10156620787850024,
"grad_norm": 0.9169662594795227,
"learning_rate": 1.44e-06,
"loss": 1.7245,
"step": 428
},
{
"epoch": 0.10180351210251543,
"grad_norm": 0.8349008560180664,
"learning_rate": 1.42e-06,
"loss": 1.2525,
"step": 429
},
{
"epoch": 0.10204081632653061,
"grad_norm": 0.5627273917198181,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.5604,
"step": 430
},
{
"epoch": 0.1022781205505458,
"grad_norm": 0.6167902946472168,
"learning_rate": 1.3800000000000001e-06,
"loss": 0.4832,
"step": 431
},
{
"epoch": 0.10251542477456099,
"grad_norm": 0.6913139224052429,
"learning_rate": 1.3600000000000001e-06,
"loss": 0.4277,
"step": 432
},
{
"epoch": 0.10275272899857618,
"grad_norm": 0.9053479433059692,
"learning_rate": 1.34e-06,
"loss": 1.1996,
"step": 433
},
{
"epoch": 0.10299003322259136,
"grad_norm": 0.5853822827339172,
"learning_rate": 1.32e-06,
"loss": 1.5833,
"step": 434
},
{
"epoch": 0.10322733744660655,
"grad_norm": 0.6833449602127075,
"learning_rate": 1.3e-06,
"loss": 1.5597,
"step": 435
},
{
"epoch": 0.10346464167062173,
"grad_norm": 0.48403409123420715,
"learning_rate": 1.28e-06,
"loss": 1.7934,
"step": 436
},
{
"epoch": 0.10370194589463692,
"grad_norm": 0.8773030638694763,
"learning_rate": 1.26e-06,
"loss": 0.5695,
"step": 437
},
{
"epoch": 0.10393925011865211,
"grad_norm": 0.6054906249046326,
"learning_rate": 1.2400000000000002e-06,
"loss": 2.3307,
"step": 438
},
{
"epoch": 0.1041765543426673,
"grad_norm": 0.6956624388694763,
"learning_rate": 1.2200000000000002e-06,
"loss": 1.0714,
"step": 439
},
{
"epoch": 0.10441385856668249,
"grad_norm": 1.519985556602478,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.695,
"step": 440
},
{
"epoch": 0.10465116279069768,
"grad_norm": 1.0551714897155762,
"learning_rate": 1.1800000000000001e-06,
"loss": 2.457,
"step": 441
},
{
"epoch": 0.10488846701471286,
"grad_norm": 0.7493646740913391,
"learning_rate": 1.1600000000000001e-06,
"loss": 1.965,
"step": 442
},
{
"epoch": 0.10512577123872804,
"grad_norm": 0.7913329601287842,
"learning_rate": 1.14e-06,
"loss": 1.5939,
"step": 443
},
{
"epoch": 0.10536307546274323,
"grad_norm": 1.10833740234375,
"learning_rate": 1.12e-06,
"loss": 2.1043,
"step": 444
},
{
"epoch": 0.10560037968675842,
"grad_norm": 0.8675681948661804,
"learning_rate": 1.1e-06,
"loss": 1.6333,
"step": 445
},
{
"epoch": 0.10583768391077361,
"grad_norm": 0.8735470771789551,
"learning_rate": 1.08e-06,
"loss": 2.1604,
"step": 446
},
{
"epoch": 0.1060749881347888,
"grad_norm": 0.9015608429908752,
"learning_rate": 1.06e-06,
"loss": 0.9548,
"step": 447
},
{
"epoch": 0.10631229235880399,
"grad_norm": 0.7339662313461304,
"learning_rate": 1.04e-06,
"loss": 1.3065,
"step": 448
},
{
"epoch": 0.10654959658281918,
"grad_norm": 0.5532211661338806,
"learning_rate": 1.02e-06,
"loss": 1.8098,
"step": 449
},
{
"epoch": 0.10678690080683437,
"grad_norm": 0.8225467801094055,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.2966,
"step": 450
},
{
"epoch": 0.10702420503084954,
"grad_norm": 0.5000866651535034,
"learning_rate": 9.800000000000001e-07,
"loss": 0.2921,
"step": 451
},
{
"epoch": 0.10726150925486473,
"grad_norm": 1.0391067266464233,
"learning_rate": 9.600000000000001e-07,
"loss": 2.2974,
"step": 452
},
{
"epoch": 0.10749881347887992,
"grad_norm": 0.685451865196228,
"learning_rate": 9.400000000000001e-07,
"loss": 2.0784,
"step": 453
},
{
"epoch": 0.10773611770289511,
"grad_norm": 0.5864785313606262,
"learning_rate": 9.200000000000001e-07,
"loss": 2.2062,
"step": 454
},
{
"epoch": 0.1079734219269103,
"grad_norm": 0.6014403104782104,
"learning_rate": 9.000000000000001e-07,
"loss": 0.8348,
"step": 455
},
{
"epoch": 0.10821072615092549,
"grad_norm": 1.6633024215698242,
"learning_rate": 8.8e-07,
"loss": 1.2606,
"step": 456
},
{
"epoch": 0.10844803037494068,
"grad_norm": 0.7936999797821045,
"learning_rate": 8.6e-07,
"loss": 2.0804,
"step": 457
},
{
"epoch": 0.10868533459895587,
"grad_norm": 0.22339628636837006,
"learning_rate": 8.400000000000001e-07,
"loss": 2.1541,
"step": 458
},
{
"epoch": 0.10892263882297105,
"grad_norm": 0.6188535690307617,
"learning_rate": 8.200000000000001e-07,
"loss": 1.4218,
"step": 459
},
{
"epoch": 0.10915994304698623,
"grad_norm": 0.8845232129096985,
"learning_rate": 8.000000000000001e-07,
"loss": 1.4453,
"step": 460
},
{
"epoch": 0.10939724727100142,
"grad_norm": 0.5522229671478271,
"learning_rate": 7.8e-07,
"loss": 1.4562,
"step": 461
},
{
"epoch": 0.10963455149501661,
"grad_norm": 0.49053800106048584,
"learning_rate": 7.6e-07,
"loss": 0.6361,
"step": 462
},
{
"epoch": 0.1098718557190318,
"grad_norm": 0.5572338104248047,
"learning_rate": 7.4e-07,
"loss": 0.8394,
"step": 463
},
{
"epoch": 0.11010915994304699,
"grad_norm": 0.9283513426780701,
"learning_rate": 7.2e-07,
"loss": 1.6316,
"step": 464
},
{
"epoch": 0.11034646416706217,
"grad_norm": 0.8812064528465271,
"learning_rate": 7.000000000000001e-07,
"loss": 1.2302,
"step": 465
},
{
"epoch": 0.11058376839107736,
"grad_norm": 0.7392125129699707,
"learning_rate": 6.800000000000001e-07,
"loss": 0.8159,
"step": 466
},
{
"epoch": 0.11082107261509255,
"grad_norm": 0.7616608738899231,
"learning_rate": 6.6e-07,
"loss": 1.0404,
"step": 467
},
{
"epoch": 0.11105837683910774,
"grad_norm": 0.7175336480140686,
"learning_rate": 6.4e-07,
"loss": 1.7509,
"step": 468
},
{
"epoch": 0.11129568106312292,
"grad_norm": 0.7180752754211426,
"learning_rate": 6.200000000000001e-07,
"loss": 0.9795,
"step": 469
},
{
"epoch": 0.1115329852871381,
"grad_norm": 0.875347912311554,
"learning_rate": 6.000000000000001e-07,
"loss": 1.8271,
"step": 470
},
{
"epoch": 0.1117702895111533,
"grad_norm": 0.6006546020507812,
"learning_rate": 5.800000000000001e-07,
"loss": 2.0545,
"step": 471
},
{
"epoch": 0.11200759373516848,
"grad_norm": 1.1124011278152466,
"learning_rate": 5.6e-07,
"loss": 2.2172,
"step": 472
},
{
"epoch": 0.11224489795918367,
"grad_norm": 1.5857324600219727,
"learning_rate": 5.4e-07,
"loss": 1.2116,
"step": 473
},
{
"epoch": 0.11248220218319886,
"grad_norm": 0.6902075409889221,
"learning_rate": 5.2e-07,
"loss": 2.0822,
"step": 474
},
{
"epoch": 0.11271950640721405,
"grad_norm": 0.9990330934524536,
"learning_rate": 5.000000000000001e-07,
"loss": 1.9775,
"step": 475
},
{
"epoch": 0.11295681063122924,
"grad_norm": 1.281016230583191,
"learning_rate": 4.800000000000001e-07,
"loss": 1.9021,
"step": 476
},
{
"epoch": 0.11319411485524443,
"grad_norm": 0.3223126232624054,
"learning_rate": 4.6000000000000004e-07,
"loss": 1.8383,
"step": 477
},
{
"epoch": 0.1134314190792596,
"grad_norm": 1.1316232681274414,
"learning_rate": 4.4e-07,
"loss": 1.6243,
"step": 478
},
{
"epoch": 0.1136687233032748,
"grad_norm": 0.6887989640235901,
"learning_rate": 4.2000000000000006e-07,
"loss": 1.5909,
"step": 479
},
{
"epoch": 0.11390602752728998,
"grad_norm": 0.8150919675827026,
"learning_rate": 4.0000000000000003e-07,
"loss": 2.1949,
"step": 480
},
{
"epoch": 0.11414333175130517,
"grad_norm": 0.6823549866676331,
"learning_rate": 3.8e-07,
"loss": 2.095,
"step": 481
},
{
"epoch": 0.11438063597532036,
"grad_norm": 1.052901268005371,
"learning_rate": 3.6e-07,
"loss": 1.7599,
"step": 482
},
{
"epoch": 0.11461794019933555,
"grad_norm": 1.048052430152893,
"learning_rate": 3.4000000000000003e-07,
"loss": 2.1143,
"step": 483
},
{
"epoch": 0.11485524442335074,
"grad_norm": 1.2748647928237915,
"learning_rate": 3.2e-07,
"loss": 1.8183,
"step": 484
},
{
"epoch": 0.11509254864736593,
"grad_norm": 1.2471035718917847,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.8442,
"step": 485
},
{
"epoch": 0.11532985287138112,
"grad_norm": 0.46195486187934875,
"learning_rate": 2.8e-07,
"loss": 1.1617,
"step": 486
},
{
"epoch": 0.11556715709539629,
"grad_norm": 0.6743305325508118,
"learning_rate": 2.6e-07,
"loss": 1.4748,
"step": 487
},
{
"epoch": 0.11580446131941148,
"grad_norm": 1.0564024448394775,
"learning_rate": 2.4000000000000003e-07,
"loss": 1.5775,
"step": 488
},
{
"epoch": 0.11604176554342667,
"grad_norm": 0.6965152025222778,
"learning_rate": 2.2e-07,
"loss": 2.5251,
"step": 489
},
{
"epoch": 0.11627906976744186,
"grad_norm": 0.8700504899024963,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.6123,
"step": 490
},
{
"epoch": 0.11651637399145705,
"grad_norm": 0.7157378196716309,
"learning_rate": 1.8e-07,
"loss": 1.7864,
"step": 491
},
{
"epoch": 0.11675367821547224,
"grad_norm": 0.9464967250823975,
"learning_rate": 1.6e-07,
"loss": 2.0064,
"step": 492
},
{
"epoch": 0.11699098243948743,
"grad_norm": 0.734511137008667,
"learning_rate": 1.4e-07,
"loss": 0.8449,
"step": 493
},
{
"epoch": 0.11722828666350261,
"grad_norm": 1.0827577114105225,
"learning_rate": 1.2000000000000002e-07,
"loss": 1.8062,
"step": 494
},
{
"epoch": 0.1174655908875178,
"grad_norm": 1.1060395240783691,
"learning_rate": 1.0000000000000001e-07,
"loss": 1.9935,
"step": 495
},
{
"epoch": 0.11770289511153298,
"grad_norm": 0.9821236729621887,
"learning_rate": 8e-08,
"loss": 1.1501,
"step": 496
},
{
"epoch": 0.11794019933554817,
"grad_norm": 0.46705755591392517,
"learning_rate": 6.000000000000001e-08,
"loss": 2.2727,
"step": 497
},
{
"epoch": 0.11817750355956336,
"grad_norm": 0.7528263926506042,
"learning_rate": 4e-08,
"loss": 2.2196,
"step": 498
},
{
"epoch": 0.11841480778357855,
"grad_norm": 1.029765009880066,
"learning_rate": 2e-08,
"loss": 1.3983,
"step": 499
},
{
"epoch": 0.11865211200759374,
"grad_norm": 0.9231687188148499,
"learning_rate": 9.000000000000001e-11,
"loss": 0.7083,
"step": 500
},
{
"epoch": 0.11865211200759374,
"step": 500,
"total_flos": 8.201567207424e+16,
"train_loss": 1.6423830435276032,
"train_runtime": 704.5977,
"train_samples_per_second": 1.419,
"train_steps_per_second": 0.71
}
],
"logging_steps": 1.0,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.201567207424e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}