|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 740,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006756756756756757,
      "grad_norm": 3.1352566768118555,
      "learning_rate": 1.0810810810810812e-06,
      "loss": 1.0885,
      "step": 1
    },
    {
      "epoch": 0.013513513513513514,
      "grad_norm": 3.0707759310558114,
      "learning_rate": 2.1621621621621623e-06,
      "loss": 1.081,
      "step": 2
    },
    {
      "epoch": 0.02027027027027027,
      "grad_norm": 3.1041877911160487,
      "learning_rate": 3.2432432432432437e-06,
      "loss": 1.0779,
      "step": 3
    },
    {
      "epoch": 0.02702702702702703,
      "grad_norm": 2.894341433491889,
      "learning_rate": 4.324324324324325e-06,
      "loss": 1.066,
      "step": 4
    },
    {
      "epoch": 0.033783783783783786,
      "grad_norm": 2.4474574648074654,
      "learning_rate": 5.405405405405406e-06,
      "loss": 1.0464,
      "step": 5
    },
    {
      "epoch": 0.04054054054054054,
      "grad_norm": 1.7416847938696791,
      "learning_rate": 6.486486486486487e-06,
      "loss": 1.0062,
      "step": 6
    },
    {
      "epoch": 0.0472972972972973,
      "grad_norm": 1.737012288070095,
      "learning_rate": 7.567567567567569e-06,
      "loss": 0.9661,
      "step": 7
    },
    {
      "epoch": 0.05405405405405406,
      "grad_norm": 1.54166088046008,
      "learning_rate": 8.64864864864865e-06,
      "loss": 0.9625,
      "step": 8
    },
    {
      "epoch": 0.060810810810810814,
      "grad_norm": 1.1305140237848132,
      "learning_rate": 9.729729729729732e-06,
      "loss": 0.9355,
      "step": 9
    },
    {
      "epoch": 0.06756756756756757,
      "grad_norm": 1.6829884643585495,
      "learning_rate": 1.0810810810810812e-05,
      "loss": 0.8814,
      "step": 10
    },
    {
      "epoch": 0.07432432432432433,
      "grad_norm": 1.2528889517867892,
      "learning_rate": 1.1891891891891894e-05,
      "loss": 0.8733,
      "step": 11
    },
    {
      "epoch": 0.08108108108108109,
      "grad_norm": 0.9520381902650208,
      "learning_rate": 1.2972972972972975e-05,
      "loss": 0.8579,
      "step": 12
    },
    {
      "epoch": 0.08783783783783784,
      "grad_norm": 1.4164792552755203,
      "learning_rate": 1.4054054054054055e-05,
      "loss": 0.82,
      "step": 13
    },
    {
      "epoch": 0.0945945945945946,
      "grad_norm": 1.1661768167320599,
      "learning_rate": 1.5135135135135138e-05,
      "loss": 0.823,
      "step": 14
    },
    {
      "epoch": 0.10135135135135136,
      "grad_norm": 0.966545525495581,
      "learning_rate": 1.6216216216216218e-05,
      "loss": 0.8177,
      "step": 15
    },
    {
      "epoch": 0.10810810810810811,
      "grad_norm": 0.9970915856247566,
      "learning_rate": 1.72972972972973e-05,
      "loss": 0.7828,
      "step": 16
    },
    {
      "epoch": 0.11486486486486487,
      "grad_norm": 0.7510807795940354,
      "learning_rate": 1.8378378378378383e-05,
      "loss": 0.7752,
      "step": 17
    },
    {
      "epoch": 0.12162162162162163,
      "grad_norm": 0.6801364883047033,
      "learning_rate": 1.9459459459459463e-05,
      "loss": 0.7802,
      "step": 18
    },
    {
      "epoch": 0.12837837837837837,
      "grad_norm": 0.8052966224978529,
      "learning_rate": 2.054054054054054e-05,
      "loss": 0.7803,
      "step": 19
    },
    {
      "epoch": 0.13513513513513514,
      "grad_norm": 0.6731724951990521,
      "learning_rate": 2.1621621621621624e-05,
      "loss": 0.774,
      "step": 20
    },
    {
      "epoch": 0.14189189189189189,
      "grad_norm": 0.5240064900372621,
      "learning_rate": 2.2702702702702705e-05,
      "loss": 0.7503,
      "step": 21
    },
    {
      "epoch": 0.14864864864864866,
      "grad_norm": 0.5238422824665931,
      "learning_rate": 2.378378378378379e-05,
      "loss": 0.7452,
      "step": 22
    },
    {
      "epoch": 0.1554054054054054,
      "grad_norm": 0.5685661395814289,
      "learning_rate": 2.4864864864864866e-05,
      "loss": 0.7357,
      "step": 23
    },
    {
      "epoch": 0.16216216216216217,
      "grad_norm": 0.5559414871229672,
      "learning_rate": 2.594594594594595e-05,
      "loss": 0.7529,
      "step": 24
    },
    {
      "epoch": 0.16891891891891891,
      "grad_norm": 0.44354381932505604,
      "learning_rate": 2.702702702702703e-05,
      "loss": 0.7531,
      "step": 25
    },
    {
      "epoch": 0.17567567567567569,
      "grad_norm": 0.5353496542990213,
      "learning_rate": 2.810810810810811e-05,
      "loss": 0.7443,
      "step": 26
    },
    {
      "epoch": 0.18243243243243243,
      "grad_norm": 0.5011572381597165,
      "learning_rate": 2.918918918918919e-05,
      "loss": 0.7309,
      "step": 27
    },
    {
      "epoch": 0.1891891891891892,
      "grad_norm": 0.46554848262029747,
      "learning_rate": 3.0270270270270275e-05,
      "loss": 0.73,
      "step": 28
    },
    {
      "epoch": 0.19594594594594594,
      "grad_norm": 0.5645942632228904,
      "learning_rate": 3.135135135135135e-05,
      "loss": 0.7218,
      "step": 29
    },
    {
      "epoch": 0.20270270270270271,
      "grad_norm": 0.48122141248903777,
      "learning_rate": 3.2432432432432436e-05,
      "loss": 0.7272,
      "step": 30
    },
    {
      "epoch": 0.20945945945945946,
      "grad_norm": 0.4302278552572059,
      "learning_rate": 3.351351351351351e-05,
      "loss": 0.7138,
      "step": 31
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 0.38359657484495446,
      "learning_rate": 3.45945945945946e-05,
      "loss": 0.716,
      "step": 32
    },
    {
      "epoch": 0.22297297297297297,
      "grad_norm": 0.37734924853300067,
      "learning_rate": 3.567567567567568e-05,
      "loss": 0.7258,
      "step": 33
    },
    {
      "epoch": 0.22972972972972974,
      "grad_norm": 0.5860873173612458,
      "learning_rate": 3.6756756756756765e-05,
      "loss": 0.7146,
      "step": 34
    },
    {
      "epoch": 0.23648648648648649,
      "grad_norm": 0.7521759711776034,
      "learning_rate": 3.783783783783784e-05,
      "loss": 0.727,
      "step": 35
    },
    {
      "epoch": 0.24324324324324326,
      "grad_norm": 0.46287517636393716,
      "learning_rate": 3.8918918918918926e-05,
      "loss": 0.6978,
      "step": 36
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.3949073220604405,
      "learning_rate": 4e-05,
      "loss": 0.7143,
      "step": 37
    },
    {
      "epoch": 0.25675675675675674,
      "grad_norm": 0.5390567372094507,
      "learning_rate": 4.108108108108108e-05,
      "loss": 0.7147,
      "step": 38
    },
    {
      "epoch": 0.2635135135135135,
      "grad_norm": 0.5260020316144047,
      "learning_rate": 4.2162162162162164e-05,
      "loss": 0.7042,
      "step": 39
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 0.6245421084330798,
      "learning_rate": 4.324324324324325e-05,
      "loss": 0.7066,
      "step": 40
    },
    {
      "epoch": 0.27702702702702703,
      "grad_norm": 0.4274410936399283,
      "learning_rate": 4.4324324324324325e-05,
      "loss": 0.7002,
      "step": 41
    },
    {
      "epoch": 0.28378378378378377,
      "grad_norm": 0.5645116869141522,
      "learning_rate": 4.540540540540541e-05,
      "loss": 0.7017,
      "step": 42
    },
    {
      "epoch": 0.2905405405405405,
      "grad_norm": 0.8531985478315304,
      "learning_rate": 4.6486486486486486e-05,
      "loss": 0.7105,
      "step": 43
    },
    {
      "epoch": 0.2972972972972973,
      "grad_norm": 0.8463718591267281,
      "learning_rate": 4.756756756756758e-05,
      "loss": 0.6926,
      "step": 44
    },
    {
      "epoch": 0.30405405405405406,
      "grad_norm": 0.527363152011757,
      "learning_rate": 4.8648648648648654e-05,
      "loss": 0.698,
      "step": 45
    },
    {
      "epoch": 0.3108108108108108,
      "grad_norm": 0.8921529578247511,
      "learning_rate": 4.972972972972973e-05,
      "loss": 0.7113,
      "step": 46
    },
    {
      "epoch": 0.31756756756756754,
      "grad_norm": 0.844585341198514,
      "learning_rate": 5.081081081081081e-05,
      "loss": 0.6918,
      "step": 47
    },
    {
      "epoch": 0.32432432432432434,
      "grad_norm": 0.6610922858373763,
      "learning_rate": 5.18918918918919e-05,
      "loss": 0.6874,
      "step": 48
    },
    {
      "epoch": 0.3310810810810811,
      "grad_norm": 0.9289696025390548,
      "learning_rate": 5.2972972972972976e-05,
      "loss": 0.6965,
      "step": 49
    },
    {
      "epoch": 0.33783783783783783,
      "grad_norm": 0.7104021291109865,
      "learning_rate": 5.405405405405406e-05,
      "loss": 0.7025,
      "step": 50
    },
    {
      "epoch": 0.34459459459459457,
      "grad_norm": 0.7179610269369078,
      "learning_rate": 5.513513513513514e-05,
      "loss": 0.6906,
      "step": 51
    },
    {
      "epoch": 0.35135135135135137,
      "grad_norm": 0.6612006329248038,
      "learning_rate": 5.621621621621622e-05,
      "loss": 0.7026,
      "step": 52
    },
    {
      "epoch": 0.3581081081081081,
      "grad_norm": 0.6920205226216654,
      "learning_rate": 5.7297297297297305e-05,
      "loss": 0.6849,
      "step": 53
    },
    {
      "epoch": 0.36486486486486486,
      "grad_norm": 0.5619919459693192,
      "learning_rate": 5.837837837837838e-05,
      "loss": 0.6827,
      "step": 54
    },
    {
      "epoch": 0.3716216216216216,
      "grad_norm": 0.789835519140916,
      "learning_rate": 5.945945945945946e-05,
      "loss": 0.6753,
      "step": 55
    },
    {
      "epoch": 0.3783783783783784,
      "grad_norm": 0.8468946375056802,
      "learning_rate": 6.054054054054055e-05,
      "loss": 0.6803,
      "step": 56
    },
    {
      "epoch": 0.38513513513513514,
      "grad_norm": 1.1480634289873959,
      "learning_rate": 6.162162162162163e-05,
      "loss": 0.6849,
      "step": 57
    },
    {
      "epoch": 0.3918918918918919,
      "grad_norm": 0.8338876354018716,
      "learning_rate": 6.27027027027027e-05,
      "loss": 0.6822,
      "step": 58
    },
    {
      "epoch": 0.39864864864864863,
      "grad_norm": 0.8619635833656949,
      "learning_rate": 6.378378378378379e-05,
      "loss": 0.6885,
      "step": 59
    },
    {
      "epoch": 0.40540540540540543,
      "grad_norm": 0.9841339840879071,
      "learning_rate": 6.486486486486487e-05,
      "loss": 0.6941,
      "step": 60
    },
    {
      "epoch": 0.41216216216216217,
      "grad_norm": 0.7846924650076373,
      "learning_rate": 6.594594594594596e-05,
      "loss": 0.6734,
      "step": 61
    },
    {
      "epoch": 0.4189189189189189,
      "grad_norm": 0.6084845178399872,
      "learning_rate": 6.702702702702703e-05,
      "loss": 0.6766,
      "step": 62
    },
    {
      "epoch": 0.42567567567567566,
      "grad_norm": 0.8121605538110969,
      "learning_rate": 6.810810810810811e-05,
      "loss": 0.6882,
      "step": 63
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 0.9962451486752875,
      "learning_rate": 6.91891891891892e-05,
      "loss": 0.6837,
      "step": 64
    },
    {
      "epoch": 0.4391891891891892,
      "grad_norm": 0.6516040658377318,
      "learning_rate": 7.027027027027028e-05,
      "loss": 0.6841,
      "step": 65
    },
    {
      "epoch": 0.44594594594594594,
      "grad_norm": 0.683694787120212,
      "learning_rate": 7.135135135135136e-05,
      "loss": 0.6952,
      "step": 66
    },
    {
      "epoch": 0.4527027027027027,
      "grad_norm": 0.6582713522549495,
      "learning_rate": 7.243243243243243e-05,
      "loss": 0.6727,
      "step": 67
    },
    {
      "epoch": 0.4594594594594595,
      "grad_norm": 0.673908455931949,
      "learning_rate": 7.351351351351353e-05,
      "loss": 0.6841,
      "step": 68
    },
    {
      "epoch": 0.46621621621621623,
      "grad_norm": 0.8017390116132226,
      "learning_rate": 7.45945945945946e-05,
      "loss": 0.67,
      "step": 69
    },
    {
      "epoch": 0.47297297297297297,
      "grad_norm": 0.9799335757769192,
      "learning_rate": 7.567567567567568e-05,
      "loss": 0.6914,
      "step": 70
    },
    {
      "epoch": 0.4797297297297297,
      "grad_norm": 1.5088822309237815,
      "learning_rate": 7.675675675675675e-05,
      "loss": 0.6925,
      "step": 71
    },
    {
      "epoch": 0.4864864864864865,
      "grad_norm": 0.7235799329993131,
      "learning_rate": 7.783783783783785e-05,
      "loss": 0.689,
      "step": 72
    },
    {
      "epoch": 0.49324324324324326,
      "grad_norm": 1.467550439828714,
      "learning_rate": 7.891891891891892e-05,
      "loss": 0.6927,
      "step": 73
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.9680632895596434,
      "learning_rate": 8e-05,
      "loss": 0.6729,
      "step": 74
    },
    {
      "epoch": 0.5067567567567568,
      "grad_norm": 1.4048772257060618,
      "learning_rate": 7.999955497902857e-05,
      "loss": 0.6949,
      "step": 75
    },
    {
      "epoch": 0.5135135135135135,
      "grad_norm": 1.187179427424338,
      "learning_rate": 7.999821992601645e-05,
      "loss": 0.693,
      "step": 76
    },
    {
      "epoch": 0.5202702702702703,
      "grad_norm": 0.9529005283405405,
      "learning_rate": 7.999599487066996e-05,
      "loss": 0.6839,
      "step": 77
    },
    {
      "epoch": 0.527027027027027,
      "grad_norm": 0.9173343088851188,
      "learning_rate": 7.999287986249894e-05,
      "loss": 0.6766,
      "step": 78
    },
    {
      "epoch": 0.5337837837837838,
      "grad_norm": 0.7318824071301274,
      "learning_rate": 7.998887497081555e-05,
      "loss": 0.6847,
      "step": 79
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 0.6529765700425837,
      "learning_rate": 7.998398028473287e-05,
      "loss": 0.678,
      "step": 80
    },
    {
      "epoch": 0.5472972972972973,
      "grad_norm": 0.6928101408737816,
      "learning_rate": 7.997819591316278e-05,
      "loss": 0.6954,
      "step": 81
    },
    {
      "epoch": 0.5540540540540541,
      "grad_norm": 0.5879827240121825,
      "learning_rate": 7.99715219848136e-05,
      "loss": 0.6642,
      "step": 82
    },
    {
      "epoch": 0.5608108108108109,
      "grad_norm": 0.680605729950147,
      "learning_rate": 7.996395864818727e-05,
      "loss": 0.6804,
      "step": 83
    },
    {
      "epoch": 0.5675675675675675,
      "grad_norm": 0.5423663061022331,
      "learning_rate": 7.995550607157592e-05,
      "loss": 0.6688,
      "step": 84
    },
    {
      "epoch": 0.5743243243243243,
      "grad_norm": 0.606163015240631,
      "learning_rate": 7.994616444305826e-05,
      "loss": 0.6667,
      "step": 85
    },
    {
      "epoch": 0.581081081081081,
      "grad_norm": 0.5950010094955055,
      "learning_rate": 7.993593397049533e-05,
      "loss": 0.679,
      "step": 86
    },
    {
      "epoch": 0.5878378378378378,
      "grad_norm": 0.6429149705782599,
      "learning_rate": 7.992481488152585e-05,
      "loss": 0.6685,
      "step": 87
    },
    {
      "epoch": 0.5945945945945946,
      "grad_norm": 0.46140817883542623,
      "learning_rate": 7.991280742356124e-05,
      "loss": 0.6603,
      "step": 88
    },
    {
      "epoch": 0.6013513513513513,
      "grad_norm": 0.6795669467936494,
      "learning_rate": 7.989991186378e-05,
      "loss": 0.6699,
      "step": 89
    },
    {
      "epoch": 0.6081081081081081,
      "grad_norm": 0.5738680266825441,
      "learning_rate": 7.988612848912186e-05,
      "loss": 0.6819,
      "step": 90
    },
    {
      "epoch": 0.6148648648648649,
      "grad_norm": 0.5789217573047567,
      "learning_rate": 7.987145760628138e-05,
      "loss": 0.6708,
      "step": 91
    },
    {
      "epoch": 0.6216216216216216,
      "grad_norm": 0.49449419241667797,
      "learning_rate": 7.985589954170107e-05,
      "loss": 0.6665,
      "step": 92
    },
    {
      "epoch": 0.6283783783783784,
      "grad_norm": 0.4987045947630836,
      "learning_rate": 7.983945464156419e-05,
      "loss": 0.6597,
      "step": 93
    },
    {
      "epoch": 0.6351351351351351,
      "grad_norm": 0.40082592041703324,
      "learning_rate": 7.982212327178699e-05,
      "loss": 0.6675,
      "step": 94
    },
    {
      "epoch": 0.6418918918918919,
      "grad_norm": 0.3431463481882859,
      "learning_rate": 7.980390581801064e-05,
      "loss": 0.6616,
      "step": 95
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 0.44148520818681297,
      "learning_rate": 7.97848026855926e-05,
      "loss": 0.6761,
      "step": 96
    },
    {
      "epoch": 0.6554054054054054,
      "grad_norm": 0.4038247824809848,
      "learning_rate": 7.976481429959758e-05,
      "loss": 0.6596,
      "step": 97
    },
    {
      "epoch": 0.6621621621621622,
      "grad_norm": 0.3448527603506924,
      "learning_rate": 7.974394110478813e-05,
      "loss": 0.6536,
      "step": 98
    },
    {
      "epoch": 0.668918918918919,
      "grad_norm": 0.30698753802117784,
      "learning_rate": 7.972218356561471e-05,
      "loss": 0.6772,
      "step": 99
    },
    {
      "epoch": 0.6756756756756757,
      "grad_norm": 0.30195068085071514,
      "learning_rate": 7.96995421662054e-05,
      "loss": 0.6527,
      "step": 100
    },
    {
      "epoch": 0.6824324324324325,
      "grad_norm": 0.3166797371131855,
      "learning_rate": 7.967601741035507e-05,
      "loss": 0.6555,
      "step": 101
    },
    {
      "epoch": 0.6891891891891891,
      "grad_norm": 0.32820963864069536,
      "learning_rate": 7.965160982151422e-05,
      "loss": 0.6696,
      "step": 102
    },
    {
      "epoch": 0.6959459459459459,
      "grad_norm": 0.3483528954236439,
      "learning_rate": 7.962631994277728e-05,
      "loss": 0.663,
      "step": 103
    },
    {
      "epoch": 0.7027027027027027,
      "grad_norm": 0.4983197508582807,
      "learning_rate": 7.960014833687055e-05,
      "loss": 0.6633,
      "step": 104
    },
    {
      "epoch": 0.7094594594594594,
      "grad_norm": 0.7805957154226273,
      "learning_rate": 7.957309558613974e-05,
      "loss": 0.6587,
      "step": 105
    },
    {
      "epoch": 0.7162162162162162,
      "grad_norm": 1.106633618810969,
      "learning_rate": 7.954516229253691e-05,
      "loss": 0.662,
      "step": 106
    },
    {
      "epoch": 0.722972972972973,
      "grad_norm": 0.757015003567712,
      "learning_rate": 7.951634907760713e-05,
      "loss": 0.6593,
      "step": 107
    },
    {
      "epoch": 0.7297297297297297,
      "grad_norm": 0.8427739970040843,
      "learning_rate": 7.948665658247463e-05,
      "loss": 0.6601,
      "step": 108
    },
    {
      "epoch": 0.7364864864864865,
      "grad_norm": 0.7431541528418906,
      "learning_rate": 7.945608546782858e-05,
      "loss": 0.667,
      "step": 109
    },
    {
      "epoch": 0.7432432432432432,
      "grad_norm": 0.6845263081020915,
      "learning_rate": 7.942463641390834e-05,
      "loss": 0.6569,
      "step": 110
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.8712514657793697,
      "learning_rate": 7.939231012048833e-05,
      "loss": 0.6632,
      "step": 111
    },
    {
      "epoch": 0.7567567567567568,
      "grad_norm": 0.6774356142156878,
      "learning_rate": 7.935910730686246e-05,
      "loss": 0.6706,
      "step": 112
    },
    {
      "epoch": 0.7635135135135135,
      "grad_norm": 0.7702128806997793,
      "learning_rate": 7.932502871182818e-05,
      "loss": 0.6679,
      "step": 113
    },
    {
      "epoch": 0.7702702702702703,
      "grad_norm": 0.6032796273856512,
      "learning_rate": 7.929007509366994e-05,
      "loss": 0.6612,
      "step": 114
    },
    {
      "epoch": 0.777027027027027,
      "grad_norm": 0.7262342972595842,
      "learning_rate": 7.925424723014239e-05,
      "loss": 0.6639,
      "step": 115
    },
    {
      "epoch": 0.7837837837837838,
      "grad_norm": 0.6319433116490037,
      "learning_rate": 7.921754591845307e-05,
      "loss": 0.662,
      "step": 116
    },
    {
      "epoch": 0.7905405405405406,
      "grad_norm": 0.6175612756087657,
      "learning_rate": 7.917997197524467e-05,
      "loss": 0.6566,
      "step": 117
    },
    {
      "epoch": 0.7972972972972973,
      "grad_norm": 0.7923618524838222,
      "learning_rate": 7.914152623657678e-05,
      "loss": 0.6469,
      "step": 118
    },
    {
      "epoch": 0.8040540540540541,
      "grad_norm": 0.7707195088131242,
      "learning_rate": 7.910220955790746e-05,
      "loss": 0.6563,
      "step": 119
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 0.4152288654221399,
      "learning_rate": 7.906202281407398e-05,
      "loss": 0.6519,
      "step": 120
    },
    {
      "epoch": 0.8175675675675675,
      "grad_norm": 0.5439040195272984,
      "learning_rate": 7.902096689927355e-05,
      "loss": 0.6727,
      "step": 121
    },
    {
      "epoch": 0.8243243243243243,
      "grad_norm": 0.5159119091886112,
      "learning_rate": 7.897904272704333e-05,
      "loss": 0.6563,
      "step": 122
    },
    {
      "epoch": 0.831081081081081,
      "grad_norm": 0.49623067194442205,
      "learning_rate": 7.893625123024011e-05,
      "loss": 0.6523,
      "step": 123
    },
    {
      "epoch": 0.8378378378378378,
      "grad_norm": 0.4176229579696035,
      "learning_rate": 7.889259336101957e-05,
      "loss": 0.6461,
      "step": 124
    },
    {
      "epoch": 0.8445945945945946,
      "grad_norm": 0.5424146541267013,
      "learning_rate": 7.884807009081506e-05,
      "loss": 0.6535,
      "step": 125
    },
    {
      "epoch": 0.8513513513513513,
      "grad_norm": 0.53245673115827,
      "learning_rate": 7.880268241031604e-05,
      "loss": 0.6516,
      "step": 126
    },
    {
      "epoch": 0.8581081081081081,
      "grad_norm": 0.4218647663890849,
      "learning_rate": 7.875643132944599e-05,
      "loss": 0.6557,
      "step": 127
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 0.31388453196671723,
      "learning_rate": 7.870931787733996e-05,
      "loss": 0.6504,
      "step": 128
    },
    {
      "epoch": 0.8716216216216216,
      "grad_norm": 0.42120451899053063,
      "learning_rate": 7.866134310232167e-05,
      "loss": 0.6637,
      "step": 129
    },
    {
      "epoch": 0.8783783783783784,
      "grad_norm": 0.3569049182508329,
      "learning_rate": 7.861250807188014e-05,
      "loss": 0.6565,
      "step": 130
    },
    {
      "epoch": 0.8851351351351351,
      "grad_norm": 0.46980061573845006,
      "learning_rate": 7.856281387264603e-05,
      "loss": 0.6643,
      "step": 131
    },
    {
      "epoch": 0.8918918918918919,
      "grad_norm": 0.5602330500767254,
      "learning_rate": 7.851226161036739e-05,
      "loss": 0.6541,
      "step": 132
    },
    {
      "epoch": 0.8986486486486487,
      "grad_norm": 0.555307896703473,
      "learning_rate": 7.846085240988503e-05,
      "loss": 0.6498,
      "step": 133
    },
    {
      "epoch": 0.9054054054054054,
      "grad_norm": 0.4906170522953401,
      "learning_rate": 7.840858741510758e-05,
      "loss": 0.6482,
      "step": 134
    },
    {
      "epoch": 0.9121621621621622,
      "grad_norm": 0.4486181714024936,
      "learning_rate": 7.835546778898599e-05,
      "loss": 0.6457,
      "step": 135
    },
    {
      "epoch": 0.918918918918919,
      "grad_norm": 0.5704154766929971,
      "learning_rate": 7.830149471348763e-05,
      "loss": 0.6508,
      "step": 136
    },
    {
      "epoch": 0.9256756756756757,
      "grad_norm": 0.738054877738817,
      "learning_rate": 7.824666938957004e-05,
      "loss": 0.6531,
      "step": 137
    },
    {
      "epoch": 0.9324324324324325,
      "grad_norm": 0.7131460924145132,
      "learning_rate": 7.819099303715414e-05,
      "loss": 0.6577,
      "step": 138
    },
    {
      "epoch": 0.9391891891891891,
      "grad_norm": 0.5314442328007468,
      "learning_rate": 7.813446689509718e-05,
      "loss": 0.6472,
      "step": 139
    },
    {
      "epoch": 0.9459459459459459,
      "grad_norm": 0.4354387291682998,
      "learning_rate": 7.807709222116506e-05,
      "loss": 0.6566,
      "step": 140
    },
    {
      "epoch": 0.9527027027027027,
      "grad_norm": 0.4367558414891408,
      "learning_rate": 7.801887029200448e-05,
      "loss": 0.6548,
      "step": 141
    },
    {
      "epoch": 0.9594594594594594,
      "grad_norm": 0.49114789228090255,
      "learning_rate": 7.795980240311436e-05,
      "loss": 0.6651,
      "step": 142
    },
    {
      "epoch": 0.9662162162162162,
      "grad_norm": 0.5563118895714915,
      "learning_rate": 7.789988986881719e-05,
      "loss": 0.6534,
      "step": 143
    },
    {
      "epoch": 0.972972972972973,
      "grad_norm": 0.5068498475757595,
      "learning_rate": 7.78391340222297e-05,
      "loss": 0.6635,
      "step": 144
    },
    {
      "epoch": 0.9797297297297297,
      "grad_norm": 0.40377422580342515,
      "learning_rate": 7.777753621523316e-05,
      "loss": 0.662,
      "step": 145
    },
    {
      "epoch": 0.9864864864864865,
      "grad_norm": 0.48943576691657636,
      "learning_rate": 7.771509781844338e-05,
      "loss": 0.6465,
      "step": 146
    },
    {
      "epoch": 0.9932432432432432,
      "grad_norm": 0.45222666736741574,
      "learning_rate": 7.765182022118014e-05,
      "loss": 0.6576,
      "step": 147
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.3524115634430567,
      "learning_rate": 7.758770483143634e-05,
      "loss": 0.66,
      "step": 148
    },
    {
      "epoch": 1.0067567567567568,
      "grad_norm": 0.3465470718278503,
      "learning_rate": 7.752275307584664e-05,
      "loss": 0.6528,
      "step": 149
    },
    {
      "epoch": 1.0135135135135136,
      "grad_norm": 0.4289730330622286,
      "learning_rate": 7.745696639965569e-05,
      "loss": 0.6284,
      "step": 150
    },
    {
      "epoch": 1.0202702702702702,
      "grad_norm": 0.4078417015334056,
      "learning_rate": 7.739034626668605e-05,
      "loss": 0.6409,
      "step": 151
    },
    {
      "epoch": 1.027027027027027,
      "grad_norm": 0.4224099719890233,
      "learning_rate": 7.732289415930549e-05,
      "loss": 0.63,
      "step": 152
    },
    {
      "epoch": 1.0337837837837838,
      "grad_norm": 0.43135712270900994,
      "learning_rate": 7.725461157839417e-05,
      "loss": 0.6606,
      "step": 153
    },
    {
      "epoch": 1.0405405405405406,
      "grad_norm": 0.4832560652031643,
      "learning_rate": 7.71855000433111e-05,
      "loss": 0.6439,
      "step": 154
    },
    {
      "epoch": 1.0472972972972974,
      "grad_norm": 0.5148651119907238,
      "learning_rate": 7.711556109186039e-05,
      "loss": 0.6405,
      "step": 155
    },
    {
      "epoch": 1.054054054054054,
      "grad_norm": 0.49381906164540984,
      "learning_rate": 7.704479628025704e-05,
      "loss": 0.635,
      "step": 156
    },
    {
      "epoch": 1.0608108108108107,
      "grad_norm": 0.4238088291952583,
      "learning_rate": 7.697320718309235e-05,
      "loss": 0.6363,
      "step": 157
    },
    {
      "epoch": 1.0675675675675675,
      "grad_norm": 0.34639148621347143,
      "learning_rate": 7.690079539329875e-05,
      "loss": 0.6434,
      "step": 158
    },
    {
      "epoch": 1.0743243243243243,
      "grad_norm": 0.3670601156648982,
      "learning_rate": 7.682756252211453e-05,
      "loss": 0.6321,
      "step": 159
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 0.48931637412654205,
      "learning_rate": 7.675351019904785e-05,
      "loss": 0.6371,
      "step": 160
    },
    {
      "epoch": 1.087837837837838,
      "grad_norm": 0.5514281223158661,
      "learning_rate": 7.667864007184054e-05,
      "loss": 0.6313,
      "step": 161
    },
    {
      "epoch": 1.0945945945945945,
      "grad_norm": 0.48466826872522856,
      "learning_rate": 7.660295380643144e-05,
      "loss": 0.6358,
      "step": 162
    },
    {
      "epoch": 1.1013513513513513,
      "grad_norm": 0.46142889218279065,
      "learning_rate": 7.652645308691933e-05,
      "loss": 0.6418,
      "step": 163
    },
    {
      "epoch": 1.1081081081081081,
      "grad_norm": 0.4371908363892829,
      "learning_rate": 7.644913961552544e-05,
      "loss": 0.6327,
      "step": 164
    },
    {
      "epoch": 1.114864864864865,
      "grad_norm": 0.4159116225859721,
      "learning_rate": 7.637101511255554e-05,
      "loss": 0.6332,
      "step": 165
    },
    {
      "epoch": 1.1216216216216217,
      "grad_norm": 0.48524137415448176,
      "learning_rate": 7.629208131636179e-05,
      "loss": 0.6385,
      "step": 166
    },
    {
      "epoch": 1.1283783783783783,
      "grad_norm": 0.502011807585537,
      "learning_rate": 7.621233998330387e-05,
      "loss": 0.6534,
      "step": 167
    },
    {
      "epoch": 1.135135135135135,
      "grad_norm": 0.4849012769427631,
      "learning_rate": 7.61317928877101e-05,
      "loss": 0.6382,
      "step": 168
    },
    {
      "epoch": 1.1418918918918919,
      "grad_norm": 0.40497697606883803,
      "learning_rate": 7.605044182183779e-05,
      "loss": 0.6335,
      "step": 169
    },
    {
      "epoch": 1.1486486486486487,
      "grad_norm": 0.26814241993209303,
      "learning_rate": 7.596828859583347e-05,
      "loss": 0.6226,
      "step": 170
    },
    {
      "epoch": 1.1554054054054055,
      "grad_norm": 0.36135092127292284,
      "learning_rate": 7.588533503769257e-05,
      "loss": 0.6286,
      "step": 171
    },
    {
      "epoch": 1.1621621621621623,
      "grad_norm": 0.4608397427011372,
      "learning_rate": 7.580158299321872e-05,
      "loss": 0.6438,
      "step": 172
    },
    {
      "epoch": 1.1689189189189189,
      "grad_norm": 0.4171913645594413,
      "learning_rate": 7.571703432598275e-05,
      "loss": 0.6356,
      "step": 173
    },
    {
      "epoch": 1.1756756756756757,
      "grad_norm": 0.4370902558666279,
      "learning_rate": 7.563169091728115e-05,
      "loss": 0.6422,
      "step": 174
    },
    {
      "epoch": 1.1824324324324325,
      "grad_norm": 0.4283980244016275,
      "learning_rate": 7.554555466609425e-05,
      "loss": 0.6407,
      "step": 175
    },
    {
      "epoch": 1.1891891891891893,
      "grad_norm": 0.4381556639196284,
      "learning_rate": 7.545862748904394e-05,
      "loss": 0.6434,
      "step": 176
    },
    {
      "epoch": 1.195945945945946,
      "grad_norm": 0.44668924620805633,
      "learning_rate": 7.537091132035111e-05,
      "loss": 0.6219,
      "step": 177
    },
    {
      "epoch": 1.2027027027027026,
      "grad_norm": 0.37602920076180757,
      "learning_rate": 7.528240811179245e-05,
      "loss": 0.6419,
      "step": 178
    },
    {
      "epoch": 1.2094594594594594,
      "grad_norm": 0.2672376131285308,
      "learning_rate": 7.519311983265718e-05,
      "loss": 0.6366,
      "step": 179
    },
    {
      "epoch": 1.2162162162162162,
      "grad_norm": 0.29369317421661273,
      "learning_rate": 7.510304846970311e-05,
      "loss": 0.6341,
      "step": 180
    },
    {
      "epoch": 1.222972972972973,
      "grad_norm": 0.3602685800886801,
      "learning_rate": 7.501219602711253e-05,
      "loss": 0.6432,
      "step": 181
    },
    {
      "epoch": 1.2297297297297298,
      "grad_norm": 0.3237536586353385,
      "learning_rate": 7.492056452644753e-05,
      "loss": 0.6415,
      "step": 182
    },
    {
      "epoch": 1.2364864864864864,
      "grad_norm": 0.28545557770169117,
      "learning_rate": 7.48281560066051e-05,
      "loss": 0.6364,
      "step": 183
    },
    {
      "epoch": 1.2432432432432432,
      "grad_norm": 0.2623342902523158,
      "learning_rate": 7.473497252377171e-05,
      "loss": 0.6333,
      "step": 184
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.3160766346456171,
      "learning_rate": 7.464101615137756e-05,
      "loss": 0.6256,
      "step": 185
    },
    {
      "epoch": 1.2567567567567568,
      "grad_norm": 0.4173934303596852,
      "learning_rate": 7.454628898005043e-05,
      "loss": 0.6521,
      "step": 186
    },
    {
      "epoch": 1.2635135135135136,
      "grad_norm": 0.4424346331081593,
      "learning_rate": 7.445079311756924e-05,
      "loss": 0.6541,
      "step": 187
    },
    {
      "epoch": 1.2702702702702702,
      "grad_norm": 0.38764273759912543,
      "learning_rate": 7.435453068881706e-05,
      "loss": 0.6385,
      "step": 188
    },
    {
      "epoch": 1.277027027027027,
      "grad_norm": 0.3811463582022792,
      "learning_rate": 7.425750383573384e-05,
      "loss": 0.6295,
      "step": 189
    },
    {
      "epoch": 1.2837837837837838,
      "grad_norm": 0.5272316642769805,
      "learning_rate": 7.415971471726884e-05,
      "loss": 0.6307,
      "step": 190
    },
    {
      "epoch": 1.2905405405405406,
      "grad_norm": 0.7635509141776905,
      "learning_rate": 7.406116550933246e-05,
      "loss": 0.6313,
      "step": 191
    },
    {
      "epoch": 1.2972972972972974,
      "grad_norm": 0.9673085818387788,
      "learning_rate": 7.396185840474792e-05,
      "loss": 0.6495,
      "step": 192
    },
    {
      "epoch": 1.304054054054054,
      "grad_norm": 0.8691278543652353,
      "learning_rate": 7.386179561320243e-05,
      "loss": 0.6285,
      "step": 193
    },
    {
      "epoch": 1.3108108108108107,
      "grad_norm": 0.7233647856746338,
      "learning_rate": 7.376097936119803e-05,
      "loss": 0.6435,
      "step": 194
    },
    {
      "epoch": 1.3175675675675675,
      "grad_norm": 0.4888365541378487,
      "learning_rate": 7.365941189200201e-05,
      "loss": 0.6345,
      "step": 195
    },
    {
      "epoch": 1.3243243243243243,
      "grad_norm": 0.41440516733521193,
      "learning_rate": 7.35570954655971e-05,
      "loss": 0.6522,
      "step": 196
    },
    {
      "epoch": 1.3310810810810811,
      "grad_norm": 0.39733170188465466,
      "learning_rate": 7.345403235863105e-05,
      "loss": 0.636,
      "step": 197
    },
    {
      "epoch": 1.3378378378378377,
      "grad_norm": 0.37538788641282456,
      "learning_rate": 7.335022486436608e-05,
      "loss": 0.6405,
      "step": 198
    },
    {
      "epoch": 1.3445945945945945,
      "grad_norm": 0.4912257786205075,
      "learning_rate": 7.324567529262775e-05,
      "loss": 0.6341,
      "step": 199
    },
    {
      "epoch": 1.3513513513513513,
      "grad_norm": 0.3964231351313485,
      "learning_rate": 7.31403859697537e-05,
      "loss": 0.6329,
      "step": 200
    },
    {
      "epoch": 1.3581081081081081,
      "grad_norm": 0.23258287711574407,
      "learning_rate": 7.303435923854172e-05,
      "loss": 0.6346,
      "step": 201
    },
    {
      "epoch": 1.364864864864865,
      "grad_norm": 0.36875878813699686,
      "learning_rate": 7.292759745819781e-05,
      "loss": 0.6423,
      "step": 202
    },
    {
      "epoch": 1.3716216216216215,
      "grad_norm": 0.3816088768921796,
      "learning_rate": 7.282010300428351e-05,
      "loss": 0.6247,
      "step": 203
    },
    {
      "epoch": 1.3783783783783785,
      "grad_norm": 0.32935135005176236,
      "learning_rate": 7.271187826866312e-05,
      "loss": 0.6306,
      "step": 204
    },
    {
      "epoch": 1.385135135135135,
      "grad_norm": 0.24849408204445086,
      "learning_rate": 7.260292565945049e-05,
      "loss": 0.6425,
      "step": 205
    },
    {
      "epoch": 1.3918918918918919,
      "grad_norm": 0.22496521412221002,
      "learning_rate": 7.249324760095544e-05,
      "loss": 0.64,
      "step": 206
    },
    {
      "epoch": 1.3986486486486487,
      "grad_norm": 0.30450316180118336,
      "learning_rate": 7.238284653362977e-05,
      "loss": 0.6497,
      "step": 207
    },
    {
      "epoch": 1.4054054054054055,
      "grad_norm": 0.2922608158162527,
      "learning_rate": 7.227172491401299e-05,
      "loss": 0.6339,
      "step": 208
    },
    {
      "epoch": 1.4121621621621623,
      "grad_norm": 0.23745259799881574,
      "learning_rate": 7.215988521467763e-05,
      "loss": 0.629,
      "step": 209
    },
    {
      "epoch": 1.4189189189189189,
      "grad_norm": 0.29205738332283676,
      "learning_rate": 7.204732992417431e-05,
      "loss": 0.6276,
      "step": 210
    },
    {
      "epoch": 1.4256756756756757,
      "grad_norm": 0.2764709836681376,
      "learning_rate": 7.193406154697625e-05,
      "loss": 0.6325,
      "step": 211
    },
    {
      "epoch": 1.4324324324324325,
      "grad_norm": 0.30413188245977635,
      "learning_rate": 7.18200826034236e-05,
      "loss": 0.6347,
      "step": 212
    },
    {
      "epoch": 1.4391891891891893,
      "grad_norm": 0.32848243797717097,
      "learning_rate": 7.17053956296674e-05,
      "loss": 0.6173,
      "step": 213
    },
    {
      "epoch": 1.445945945945946,
      "grad_norm": 0.3056033484058255,
      "learning_rate": 7.159000317761305e-05,
      "loss": 0.6339,
      "step": 214
    },
    {
      "epoch": 1.4527027027027026,
      "grad_norm": 0.3085891817121419,
      "learning_rate": 7.14739078148636e-05,
      "loss": 0.6364,
      "step": 215
    },
    {
      "epoch": 1.4594594594594594,
      "grad_norm": 0.3326838165486728,
      "learning_rate": 7.135711212466264e-05,
      "loss": 0.6422,
      "step": 216
    },
    {
      "epoch": 1.4662162162162162,
      "grad_norm": 0.3216850978230415,
      "learning_rate": 7.123961870583671e-05,
      "loss": 0.6342,
      "step": 217
    },
    {
      "epoch": 1.472972972972973,
      "grad_norm": 0.35372809908349234,
      "learning_rate": 7.112143017273759e-05,
      "loss": 0.637,
      "step": 218
    },
    {
      "epoch": 1.4797297297297298,
      "grad_norm": 0.3689688937687713,
      "learning_rate": 7.100254915518408e-05,
      "loss": 0.6342,
      "step": 219
    },
    {
      "epoch": 1.4864864864864864,
      "grad_norm": 0.4017045063635094,
      "learning_rate": 7.088297829840346e-05,
      "loss": 0.6286,
      "step": 220
    },
    {
      "epoch": 1.4932432432432432,
      "grad_norm": 0.4446976891407491,
      "learning_rate": 7.076272026297268e-05,
      "loss": 0.6342,
      "step": 221
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.432808144889846,
      "learning_rate": 7.064177772475912e-05,
      "loss": 0.6452,
      "step": 222
    },
    {
      "epoch": 1.5067567567567568,
      "grad_norm": 0.3443564755955631,
      "learning_rate": 7.052015337486109e-05,
      "loss": 0.6378,
      "step": 223
    },
    {
      "epoch": 1.5135135135135136,
      "grad_norm": 0.30789638902879063,
      "learning_rate": 7.03978499195479e-05,
      "loss": 0.6284,
      "step": 224
    },
    {
      "epoch": 1.5202702702702702,
      "grad_norm": 0.3444085722973672,
      "learning_rate": 7.027487008019969e-05,
      "loss": 0.6439,
      "step": 225
    },
    {
      "epoch": 1.527027027027027,
      "grad_norm": 0.3698979548144048,
      "learning_rate": 7.015121659324678e-05,
      "loss": 0.6328,
      "step": 226
    },
    {
      "epoch": 1.5337837837837838,
      "grad_norm": 0.4166719230003736,
      "learning_rate": 7.002689221010897e-05,
      "loss": 0.6295,
      "step": 227
    },
    {
      "epoch": 1.5405405405405406,
      "grad_norm": 0.4418395627095002,
      "learning_rate": 6.990189969713416e-05,
      "loss": 0.6303,
      "step": 228
    },
    {
      "epoch": 1.5472972972972974,
      "grad_norm": 0.4102988370573012,
      "learning_rate": 6.977624183553676e-05,
      "loss": 0.6431,
      "step": 229
    },
    {
      "epoch": 1.554054054054054,
      "grad_norm": 0.39270575425283927,
      "learning_rate": 6.964992142133602e-05,
      "loss": 0.6333,
      "step": 230
    },
    {
      "epoch": 1.560810810810811,
      "grad_norm": 0.3396456921939933,
      "learning_rate": 6.952294126529356e-05,
      "loss": 0.6274,
      "step": 231
    },
    {
      "epoch": 1.5675675675675675,
      "grad_norm": 0.29161162699565907,
      "learning_rate": 6.939530419285104e-05,
      "loss": 0.6346,
      "step": 232
    },
    {
      "epoch": 1.5743243243243243,
      "grad_norm": 0.3270645478162509,
      "learning_rate": 6.926701304406713e-05,
      "loss": 0.6307,
      "step": 233
    },
    {
      "epoch": 1.5810810810810811,
      "grad_norm": 0.33343232181767685,
      "learning_rate": 6.913807067355445e-05,
      "loss": 0.6338,
      "step": 234
    },
    {
      "epoch": 1.5878378378378377,
      "grad_norm": 0.34421561990305105,
      "learning_rate": 6.90084799504159e-05,
      "loss": 0.6417,
      "step": 235
    },
    {
      "epoch": 1.5945945945945947,
      "grad_norm": 0.31139357648881777,
      "learning_rate": 6.887824375818099e-05,
      "loss": 0.6399,
      "step": 236
    },
    {
      "epoch": 1.6013513513513513,
      "grad_norm": 0.2657664210184094,
      "learning_rate": 6.874736499474154e-05,
      "loss": 0.6411,
      "step": 237
    },
    {
      "epoch": 1.6081081081081081,
      "grad_norm": 0.25481549866865094,
      "learning_rate": 6.861584657228728e-05,
      "loss": 0.6418,
      "step": 238
    },
    {
      "epoch": 1.614864864864865,
      "grad_norm": 0.27638932514157166,
      "learning_rate": 6.848369141724104e-05,
      "loss": 0.6435,
      "step": 239
    },
    {
      "epoch": 1.6216216216216215,
      "grad_norm": 0.32918191799417873,
      "learning_rate": 6.835090247019354e-05,
      "loss": 0.6269,
      "step": 240
    },
    {
      "epoch": 1.6283783783783785,
      "grad_norm": 0.334162128338808,
      "learning_rate": 6.821748268583813e-05,
      "loss": 0.6382,
      "step": 241
    },
    {
      "epoch": 1.635135135135135,
      "grad_norm": 0.3172895628393372,
      "learning_rate": 6.808343503290491e-05,
      "loss": 0.627,
      "step": 242
    },
    {
      "epoch": 1.6418918918918919,
      "grad_norm": 0.3353087016871893,
      "learning_rate": 6.79487624940947e-05,
      "loss": 0.6228,
      "step": 243
    },
    {
      "epoch": 1.6486486486486487,
      "grad_norm": 0.3718369962163942,
      "learning_rate": 6.781346806601273e-05,
      "loss": 0.6305,
      "step": 244
    },
    {
      "epoch": 1.6554054054054053,
      "grad_norm": 0.39658980608696004,
      "learning_rate": 6.767755475910185e-05,
      "loss": 0.6227,
      "step": 245
    },
    {
      "epoch": 1.6621621621621623,
      "grad_norm": 0.4269305577093925,
      "learning_rate": 6.754102559757569e-05,
      "loss": 0.6338,
      "step": 246
    },
    {
      "epoch": 1.6689189189189189,
      "grad_norm": 0.48834528749062367,
      "learning_rate": 6.740388361935125e-05,
      "loss": 0.6285,
      "step": 247
    },
    {
      "epoch": 1.6756756756756757,
      "grad_norm": 0.5054797184183533,
      "learning_rate": 6.726613187598132e-05,
      "loss": 0.6363,
      "step": 248
    },
    {
      "epoch": 1.6824324324324325,
      "grad_norm": 0.46558382405228205,
      "learning_rate": 6.712777343258666e-05,
      "loss": 0.6342,
      "step": 249
    },
    {
      "epoch": 1.689189189189189,
      "grad_norm": 0.3349216901360824,
      "learning_rate": 6.698881136778771e-05,
      "loss": 0.638,
      "step": 250
    },
    {
      "epoch": 1.695945945945946,
      "grad_norm": 0.296031271653257,
      "learning_rate": 6.684924877363613e-05,
      "loss": 0.6449,
      "step": 251
    },
    {
      "epoch": 1.7027027027027026,
      "grad_norm": 0.34651680386096584,
      "learning_rate": 6.670908875554594e-05,
      "loss": 0.6373,
      "step": 252
    },
    {
      "epoch": 1.7094594594594594,
      "grad_norm": 0.34431135065378676,
      "learning_rate": 6.656833443222458e-05,
      "loss": 0.6222,
      "step": 253
    },
    {
      "epoch": 1.7162162162162162,
      "grad_norm": 0.3856861975419152,
      "learning_rate": 6.642698893560327e-05,
      "loss": 0.6389,
      "step": 254
    },
    {
      "epoch": 1.722972972972973,
      "grad_norm": 0.3419900662624624,
      "learning_rate": 6.628505541076755e-05,
      "loss": 0.6345,
      "step": 255
    },
    {
      "epoch": 1.7297297297297298,
      "grad_norm": 0.25513443874276925,
      "learning_rate": 6.614253701588718e-05,
      "loss": 0.6337,
      "step": 256
    },
    {
      "epoch": 1.7364864864864864,
      "grad_norm": 0.40205274571625865,
      "learning_rate": 6.599943692214587e-05,
      "loss": 0.6329,
      "step": 257
    },
    {
      "epoch": 1.7432432432432432,
      "grad_norm": 0.38134230469861735,
      "learning_rate": 6.585575831367078e-05,
      "loss": 0.6315,
      "step": 258
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.2108025719904083,
      "learning_rate": 6.571150438746157e-05,
      "loss": 0.6332,
      "step": 259
    },
    {
      "epoch": 1.7567567567567568,
      "grad_norm": 0.2896853669566718,
      "learning_rate": 6.55666783533194e-05,
      "loss": 0.6254,
      "step": 260
    },
    {
      "epoch": 1.7635135135135136,
      "grad_norm": 0.3811309603864002,
      "learning_rate": 6.542128343377536e-05,
      "loss": 0.6355,
      "step": 261
    },
    {
      "epoch": 1.7702702702702702,
      "grad_norm": 0.30773816113154867,
      "learning_rate": 6.527532286401889e-05,
      "loss": 0.6337,
      "step": 262
    },
    {
      "epoch": 1.777027027027027,
      "grad_norm": 0.22231601239252788,
      "learning_rate": 6.51287998918257e-05,
      "loss": 0.6204,
      "step": 263
    },
    {
      "epoch": 1.7837837837837838,
      "grad_norm": 0.2581820667256423,
      "learning_rate": 6.498171777748557e-05,
      "loss": 0.6335,
      "step": 264
    },
    {
      "epoch": 1.7905405405405406,
      "grad_norm": 0.3087437556000368,
      "learning_rate": 6.483407979372975e-05,
      "loss": 0.6221,
      "step": 265
    },
    {
      "epoch": 1.7972972972972974,
      "grad_norm": 0.229624863667569,
      "learning_rate": 6.468588922565822e-05,
      "loss": 0.6305,
      "step": 266
    },
    {
      "epoch": 1.804054054054054,
      "grad_norm": 0.15965170109349433,
      "learning_rate": 6.453714937066648e-05,
      "loss": 0.6277,
      "step": 267
    },
    {
      "epoch": 1.810810810810811,
      "grad_norm": 0.16500327907249202,
      "learning_rate": 6.438786353837228e-05,
      "loss": 0.6279,
      "step": 268
    },
    {
      "epoch": 1.8175675675675675,
      "grad_norm": 0.19228879276238184,
      "learning_rate": 6.423803505054193e-05,
      "loss": 0.637,
      "step": 269
    },
    {
      "epoch": 1.8243243243243243,
      "grad_norm": 0.1960334960484057,
      "learning_rate": 6.408766724101638e-05,
      "loss": 0.6344,
      "step": 270
    },
    {
      "epoch": 1.8310810810810811,
      "grad_norm": 0.22246057335109667,
      "learning_rate": 6.393676345563708e-05,
      "loss": 0.6315,
      "step": 271
    },
    {
      "epoch": 1.8378378378378377,
      "grad_norm": 0.25500558074002594,
      "learning_rate": 6.378532705217148e-05,
      "loss": 0.6267,
      "step": 272
    },
    {
      "epoch": 1.8445945945945947,
      "grad_norm": 0.2797665813089635,
      "learning_rate": 6.363336140023833e-05,
      "loss": 0.6198,
      "step": 273
    },
    {
      "epoch": 1.8513513513513513,
      "grad_norm": 0.32237396509938987,
      "learning_rate": 6.348086988123274e-05,
      "loss": 0.6302,
      "step": 274
    },
    {
      "epoch": 1.8581081081081081,
      "grad_norm": 0.3837879616068762,
      "learning_rate": 6.332785588825094e-05,
      "loss": 0.6366,
      "step": 275
    },
    {
      "epoch": 1.864864864864865,
      "grad_norm": 0.3847808366177522,
      "learning_rate": 6.317432282601469e-05,
      "loss": 0.6405,
      "step": 276
    },
    {
      "epoch": 1.8716216216216215,
      "grad_norm": 0.3485054619073992,
      "learning_rate": 6.302027411079562e-05,
      "loss": 0.6273,
      "step": 277
    },
    {
      "epoch": 1.8783783783783785,
      "grad_norm": 0.3171200624449193,
      "learning_rate": 6.286571317033915e-05,
      "loss": 0.6337,
      "step": 278
    },
    {
      "epoch": 1.885135135135135,
      "grad_norm": 0.27373328598715324,
      "learning_rate": 6.271064344378832e-05,
      "loss": 0.6403,
      "step": 279
    },
    {
      "epoch": 1.8918918918918919,
      "grad_norm": 0.3045549423645127,
      "learning_rate": 6.255506838160711e-05,
      "loss": 0.6317,
      "step": 280
    },
    {
      "epoch": 1.8986486486486487,
      "grad_norm": 0.23258625831405336,
      "learning_rate": 6.239899144550383e-05,
      "loss": 0.6195,
      "step": 281
    },
    {
      "epoch": 1.9054054054054053,
      "grad_norm": 0.30422094244714903,
      "learning_rate": 6.224241610835391e-05,
      "loss": 0.6422,
      "step": 282
    },
    {
      "epoch": 1.9121621621621623,
      "grad_norm": 0.3974220280836707,
      "learning_rate": 6.208534585412282e-05,
      "loss": 0.6341,
      "step": 283
    },
    {
      "epoch": 1.9189189189189189,
      "grad_norm": 0.3588942161699914,
      "learning_rate": 6.19277841777884e-05,
      "loss": 0.6277,
      "step": 284
    },
    {
      "epoch": 1.9256756756756757,
      "grad_norm": 0.23348715119029995,
      "learning_rate": 6.176973458526317e-05,
      "loss": 0.6196,
      "step": 285
    },
    {
      "epoch": 1.9324324324324325,
      "grad_norm": 0.18377971391888995,
      "learning_rate": 6.161120059331628e-05,
      "loss": 0.6332,
      "step": 286
    },
    {
      "epoch": 1.939189189189189,
      "grad_norm": 0.2385215605440726,
      "learning_rate": 6.14521857294953e-05,
      "loss": 0.6479,
      "step": 287
    },
    {
      "epoch": 1.945945945945946,
      "grad_norm": 0.2797923846654065,
      "learning_rate": 6.129269353204769e-05,
      "loss": 0.6354,
      "step": 288
    },
    {
      "epoch": 1.9527027027027026,
      "grad_norm": 0.3046327128914545,
      "learning_rate": 6.113272754984206e-05,
      "loss": 0.6394,
      "step": 289
    },
    {
      "epoch": 1.9594594594594594,
      "grad_norm": 0.2685120513408917,
      "learning_rate": 6.0972291342289274e-05,
      "loss": 0.6285,
      "step": 290
    },
    {
      "epoch": 1.9662162162162162,
      "grad_norm": 0.19923838754652695,
      "learning_rate": 6.081138847926317e-05,
      "loss": 0.6239,
      "step": 291
    },
    {
      "epoch": 1.972972972972973,
      "grad_norm": 0.23688012061160577,
      "learning_rate": 6.065002254102116e-05,
      "loss": 0.6391,
      "step": 292
    },
    {
      "epoch": 1.9797297297297298,
      "grad_norm": 0.2444120597492392,
      "learning_rate": 6.048819711812457e-05,
      "loss": 0.6315,
      "step": 293
    },
    {
      "epoch": 1.9864864864864864,
      "grad_norm": 0.2374660611243683,
      "learning_rate": 6.032591581135878e-05,
      "loss": 0.635,
      "step": 294
    },
    {
      "epoch": 1.9932432432432432,
      "grad_norm": 0.19031195961334121,
      "learning_rate": 6.0163182231652985e-05,
      "loss": 0.6266,
      "step": 295
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.2396442769161295,
      "learning_rate": 6.000000000000001e-05,
      "loss": 0.6384,
      "step": 296
    },
    {
      "epoch": 2.0067567567567566,
      "grad_norm": 0.2791243307653228,
      "learning_rate": 5.983637274737558e-05,
      "loss": 0.6251,
      "step": 297
    },
    {
      "epoch": 2.0135135135135136,
      "grad_norm": 0.3160836645945337,
      "learning_rate": 5.967230411465768e-05,
      "loss": 0.6261,
      "step": 298
    },
    {
      "epoch": 2.02027027027027,
      "grad_norm": 0.2816061126247031,
      "learning_rate": 5.950779775254539e-05,
      "loss": 0.6146,
      "step": 299
    },
    {
      "epoch": 2.027027027027027,
      "grad_norm": 0.3070948777190884,
      "learning_rate": 5.934285732147778e-05,
      "loss": 0.6234,
      "step": 300
    },
    {
      "epoch": 2.0337837837837838,
      "grad_norm": 0.31200072769112625,
      "learning_rate": 5.91774864915524e-05,
      "loss": 0.6129,
      "step": 301
    },
    {
      "epoch": 2.0405405405405403,
      "grad_norm": 0.28687016611033417,
      "learning_rate": 5.90116889424436e-05,
      "loss": 0.6144,
      "step": 302
    },
    {
      "epoch": 2.0472972972972974,
      "grad_norm": 0.3063930815670496,
      "learning_rate": 5.884546836332072e-05,
      "loss": 0.6176,
      "step": 303
    },
    {
      "epoch": 2.054054054054054,
      "grad_norm": 0.24478115745724477,
      "learning_rate": 5.867882845276593e-05,
      "loss": 0.6164,
      "step": 304
    },
    {
      "epoch": 2.060810810810811,
      "grad_norm": 0.17845425335065362,
      "learning_rate": 5.851177291869197e-05,
      "loss": 0.6118,
      "step": 305
    },
    {
      "epoch": 2.0675675675675675,
      "grad_norm": 0.20058855750068053,
      "learning_rate": 5.834430547825964e-05,
      "loss": 0.6146,
      "step": 306
    },
    {
      "epoch": 2.074324324324324,
      "grad_norm": 0.21828538712863907,
      "learning_rate": 5.8176429857795104e-05,
      "loss": 0.6176,
      "step": 307
    },
    {
      "epoch": 2.081081081081081,
      "grad_norm": 0.2218763340787443,
      "learning_rate": 5.8008149792706936e-05,
      "loss": 0.6059,
      "step": 308
    },
    {
      "epoch": 2.0878378378378377,
      "grad_norm": 0.26010217837913424,
      "learning_rate": 5.783946902740304e-05,
      "loss": 0.6172,
      "step": 309
    },
    {
      "epoch": 2.0945945945945947,
      "grad_norm": 0.2968234389164342,
      "learning_rate": 5.767039131520733e-05,
      "loss": 0.6176,
      "step": 310
    },
    {
      "epoch": 2.1013513513513513,
      "grad_norm": 0.29905738657768327,
      "learning_rate": 5.750092041827618e-05,
      "loss": 0.5998,
      "step": 311
    },
    {
      "epoch": 2.108108108108108,
      "grad_norm": 0.287828129320029,
      "learning_rate": 5.7331060107514754e-05,
      "loss": 0.597,
      "step": 312
    },
    {
      "epoch": 2.114864864864865,
      "grad_norm": 0.26831745019142034,
      "learning_rate": 5.716081416249307e-05,
      "loss": 0.6131,
      "step": 313
    },
    {
      "epoch": 2.1216216216216215,
      "grad_norm": 0.28531963857984527,
      "learning_rate": 5.699018637136192e-05,
      "loss": 0.6056,
      "step": 314
    },
    {
      "epoch": 2.1283783783783785,
      "grad_norm": 0.18651675715743835,
      "learning_rate": 5.681918053076858e-05,
      "loss": 0.6167,
      "step": 315
    },
    {
      "epoch": 2.135135135135135,
      "grad_norm": 0.21775207949583675,
      "learning_rate": 5.664780044577231e-05,
      "loss": 0.6219,
      "step": 316
    },
    {
      "epoch": 2.141891891891892,
      "grad_norm": 0.25323829124729375,
      "learning_rate": 5.6476049929759714e-05,
      "loss": 0.6196,
      "step": 317
    },
    {
      "epoch": 2.1486486486486487,
      "grad_norm": 0.19732115665472313,
      "learning_rate": 5.6303932804359857e-05,
      "loss": 0.6156,
      "step": 318
    },
    {
      "epoch": 2.1554054054054053,
      "grad_norm": 0.16849505056239242,
      "learning_rate": 5.613145289935926e-05,
      "loss": 0.6042,
      "step": 319
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 0.19969762470742003,
      "learning_rate": 5.595861405261666e-05,
      "loss": 0.6031,
      "step": 320
    },
    {
      "epoch": 2.168918918918919,
      "grad_norm": 0.2036043502047779,
      "learning_rate": 5.578542010997764e-05,
      "loss": 0.633,
      "step": 321
    },
    {
      "epoch": 2.175675675675676,
      "grad_norm": 0.16807494045865312,
      "learning_rate": 5.561187492518903e-05,
      "loss": 0.6118,
      "step": 322
    },
    {
      "epoch": 2.1824324324324325,
      "grad_norm": 0.1717048805694881,
      "learning_rate": 5.5437982359813156e-05,
      "loss": 0.6116,
      "step": 323
    },
    {
      "epoch": 2.189189189189189,
      "grad_norm": 0.17594564977890167,
      "learning_rate": 5.526374628314195e-05,
      "loss": 0.6162,
      "step": 324
    },
    {
      "epoch": 2.195945945945946,
      "grad_norm": 0.21489547073195236,
      "learning_rate": 5.50891705721108e-05,
      "loss": 0.5984,
      "step": 325
    },
    {
      "epoch": 2.2027027027027026,
      "grad_norm": 0.21447613988889852,
      "learning_rate": 5.4914259111212355e-05,
      "loss": 0.6198,
      "step": 326
    },
    {
      "epoch": 2.2094594594594597,
      "grad_norm": 0.2207232176733451,
      "learning_rate": 5.473901579241e-05,
      "loss": 0.6092,
      "step": 327
    },
    {
      "epoch": 2.2162162162162162,
      "grad_norm": 0.19937702101442842,
      "learning_rate": 5.4563444515051354e-05,
      "loss": 0.6065,
      "step": 328
    },
    {
      "epoch": 2.222972972972973,
      "grad_norm": 0.19293343910472205,
      "learning_rate": 5.438754918578144e-05,
      "loss": 0.6063,
      "step": 329
    },
    {
      "epoch": 2.22972972972973,
      "grad_norm": 0.17732267437010826,
      "learning_rate": 5.4211333718455756e-05,
      "loss": 0.6076,
      "step": 330
    },
    {
      "epoch": 2.2364864864864864,
      "grad_norm": 0.18101481051912677,
      "learning_rate": 5.4034802034053223e-05,
      "loss": 0.6154,
      "step": 331
    },
    {
      "epoch": 2.2432432432432434,
      "grad_norm": 0.19133561762748252,
      "learning_rate": 5.3857958060588955e-05,
      "loss": 0.5988,
      "step": 332
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.18341857255842758,
      "learning_rate": 5.368080573302676e-05,
      "loss": 0.6093,
      "step": 333
    },
    {
      "epoch": 2.2567567567567566,
      "grad_norm": 0.2486112524831487,
      "learning_rate": 5.3503348993191706e-05,
      "loss": 0.6159,
      "step": 334
    },
    {
      "epoch": 2.2635135135135136,
      "grad_norm": 0.21869922307823292,
      "learning_rate": 5.332559178968231e-05,
      "loss": 0.6128,
      "step": 335
    },
    {
      "epoch": 2.27027027027027,
      "grad_norm": 0.1808704522354482,
      "learning_rate": 5.314753807778276e-05,
      "loss": 0.6157,
      "step": 336
    },
    {
      "epoch": 2.277027027027027,
      "grad_norm": 0.19637386821805067,
      "learning_rate": 5.296919181937485e-05,
      "loss": 0.6206,
      "step": 337
    },
    {
      "epoch": 2.2837837837837838,
      "grad_norm": 0.1867094463718795,
      "learning_rate": 5.279055698284982e-05,
      "loss": 0.6209,
      "step": 338
    },
    {
      "epoch": 2.2905405405405403,
      "grad_norm": 0.17359407157166826,
      "learning_rate": 5.261163754302011e-05,
      "loss": 0.6012,
      "step": 339
    },
    {
      "epoch": 2.2972972972972974,
      "grad_norm": 0.18569999759822897,
      "learning_rate": 5.2432437481030855e-05,
      "loss": 0.6048,
      "step": 340
    },
    {
      "epoch": 2.304054054054054,
      "grad_norm": 0.1773148171307398,
      "learning_rate": 5.225296078427135e-05,
      "loss": 0.5963,
      "step": 341
    },
    {
      "epoch": 2.310810810810811,
      "grad_norm": 0.16243539870707618,
      "learning_rate": 5.207321144628628e-05,
      "loss": 0.6178,
      "step": 342
    },
    {
      "epoch": 2.3175675675675675,
      "grad_norm": 0.18693063507419128,
      "learning_rate": 5.18931934666869e-05,
      "loss": 0.6136,
      "step": 343
    },
    {
      "epoch": 2.3243243243243246,
      "grad_norm": 0.1661326777898068,
      "learning_rate": 5.171291085106202e-05,
      "loss": 0.6235,
      "step": 344
    },
    {
      "epoch": 2.331081081081081,
      "grad_norm": 0.15331403054989767,
      "learning_rate": 5.153236761088888e-05,
      "loss": 0.6098,
      "step": 345
    },
    {
      "epoch": 2.3378378378378377,
      "grad_norm": 0.1570395157158325,
      "learning_rate": 5.135156776344389e-05,
      "loss": 0.6137,
      "step": 346
    },
    {
      "epoch": 2.3445945945945947,
      "grad_norm": 0.17430865513764107,
      "learning_rate": 5.117051533171321e-05,
      "loss": 0.6177,
      "step": 347
    },
    {
      "epoch": 2.3513513513513513,
      "grad_norm": 0.21717070798995733,
      "learning_rate": 5.098921434430333e-05,
      "loss": 0.6203,
      "step": 348
    },
    {
      "epoch": 2.358108108108108,
      "grad_norm": 0.20392614118792024,
      "learning_rate": 5.080766883535129e-05,
      "loss": 0.6136,
      "step": 349
    },
    {
      "epoch": 2.364864864864865,
      "grad_norm": 0.21588477709369264,
      "learning_rate": 5.062588284443505e-05,
      "loss": 0.6145,
      "step": 350
    },
    {
      "epoch": 2.3716216216216215,
      "grad_norm": 0.21049543340334342,
      "learning_rate": 5.0443860416483536e-05,
      "loss": 0.6019,
      "step": 351
    },
    {
      "epoch": 2.3783783783783785,
      "grad_norm": 0.1730662628064374,
      "learning_rate": 5.026160560168661e-05,
      "loss": 0.6058,
      "step": 352
    },
    {
      "epoch": 2.385135135135135,
      "grad_norm": 0.17831683554874725,
      "learning_rate": 5.0079122455405014e-05,
      "loss": 0.6208,
      "step": 353
    },
    {
      "epoch": 2.391891891891892,
      "grad_norm": 0.17744701669503546,
      "learning_rate": 4.989641503808011e-05,
      "loss": 0.609,
      "step": 354
    },
    {
      "epoch": 2.3986486486486487,
      "grad_norm": 0.17994097783132235,
      "learning_rate": 4.971348741514349e-05,
      "loss": 0.6066,
      "step": 355
    },
    {
      "epoch": 2.4054054054054053,
      "grad_norm": 0.17459787928555573,
      "learning_rate": 4.95303436569266e-05,
      "loss": 0.6146,
      "step": 356
    },
    {
      "epoch": 2.4121621621621623,
      "grad_norm": 0.16284962487352878,
      "learning_rate": 4.934698783857011e-05,
      "loss": 0.6044,
      "step": 357
    },
    {
      "epoch": 2.418918918918919,
      "grad_norm": 0.17767021224773352,
      "learning_rate": 4.91634240399332e-05,
      "loss": 0.6211,
      "step": 358
    },
    {
      "epoch": 2.4256756756756754,
      "grad_norm": 0.1496311992502912,
      "learning_rate": 4.8979656345502904e-05,
      "loss": 0.602,
      "step": 359
    },
    {
      "epoch": 2.4324324324324325,
      "grad_norm": 0.16981969578628817,
      "learning_rate": 4.8795688844303114e-05,
      "loss": 0.6148,
      "step": 360
    },
    {
      "epoch": 2.439189189189189,
      "grad_norm": 0.2089769520148497,
      "learning_rate": 4.861152562980362e-05,
      "loss": 0.6123,
      "step": 361
    },
    {
      "epoch": 2.445945945945946,
      "grad_norm": 0.20528877980388496,
      "learning_rate": 4.8427170799829055e-05,
      "loss": 0.6095,
      "step": 362
    },
    {
      "epoch": 2.4527027027027026,
      "grad_norm": 0.1827881621296737,
      "learning_rate": 4.824262845646771e-05,
      "loss": 0.6112,
      "step": 363
    },
    {
      "epoch": 2.4594594594594597,
      "grad_norm": 0.19580070665324928,
      "learning_rate": 4.805790270598021e-05,
      "loss": 0.6209,
      "step": 364
    },
    {
      "epoch": 2.4662162162162162,
      "grad_norm": 0.23276386582983333,
      "learning_rate": 4.787299765870822e-05,
      "loss": 0.6083,
      "step": 365
    },
    {
      "epoch": 2.472972972972973,
      "grad_norm": 0.20338848706607754,
      "learning_rate": 4.768791742898292e-05,
      "loss": 0.615,
      "step": 366
    },
    {
      "epoch": 2.47972972972973,
      "grad_norm": 0.1521765603212478,
      "learning_rate": 4.7502666135033486e-05,
      "loss": 0.6154,
      "step": 367
    },
    {
      "epoch": 2.4864864864864864,
      "grad_norm": 0.1559952589700639,
      "learning_rate": 4.731724789889547e-05,
      "loss": 0.6026,
      "step": 368
    },
    {
      "epoch": 2.4932432432432434,
      "grad_norm": 0.14084577612463858,
      "learning_rate": 4.7131666846319036e-05,
      "loss": 0.607,
      "step": 369
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.1450823104838612,
      "learning_rate": 4.694592710667723e-05,
      "loss": 0.6173,
      "step": 370
    },
    {
      "epoch": 2.506756756756757,
      "grad_norm": 0.14280439260452932,
      "learning_rate": 4.676003281287397e-05,
|
"loss": 0.6035, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.5135135135135136, |
|
"grad_norm": 0.1731629109803712, |
|
"learning_rate": 4.657398810125225e-05, |
|
"loss": 0.6044, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.52027027027027, |
|
"grad_norm": 0.14551348661799535, |
|
"learning_rate": 4.638779711150198e-05, |
|
"loss": 0.6223, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.527027027027027, |
|
"grad_norm": 0.15724571828258985, |
|
"learning_rate": 4.620146398656792e-05, |
|
"loss": 0.6174, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.5337837837837838, |
|
"grad_norm": 0.1622156186595054, |
|
"learning_rate": 4.601499287255748e-05, |
|
"loss": 0.6159, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.5405405405405403, |
|
"grad_norm": 0.16326776251404446, |
|
"learning_rate": 4.582838791864846e-05, |
|
"loss": 0.6178, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.5472972972972974, |
|
"grad_norm": 0.16705938089441114, |
|
"learning_rate": 4.5641653276996774e-05, |
|
"loss": 0.6069, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.554054054054054, |
|
"grad_norm": 0.1240389952109112, |
|
"learning_rate": 4.5454793102644006e-05, |
|
"loss": 0.6129, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.560810810810811, |
|
"grad_norm": 0.16500872440232617, |
|
"learning_rate": 4.5267811553424945e-05, |
|
"loss": 0.6213, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.5675675675675675, |
|
"grad_norm": 0.17252309025914744, |
|
"learning_rate": 4.5080712789875154e-05, |
|
"loss": 0.6172, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.5743243243243246, |
|
"grad_norm": 0.163801171559839, |
|
"learning_rate": 4.489350097513829e-05, |
|
"loss": 0.6205, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.581081081081081, |
|
"grad_norm": 0.14139376100405637, |
|
"learning_rate": 4.470618027487354e-05, |
|
"loss": 0.6101, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.5878378378378377, |
|
"grad_norm": 0.14839912258194626, |
|
"learning_rate": 4.451875485716292e-05, |
|
"loss": 0.6214, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.5945945945945947, |
|
"grad_norm": 0.13849384656147448, |
|
"learning_rate": 4.4331228892418473e-05, |
|
"loss": 0.6111, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.6013513513513513, |
|
"grad_norm": 0.16314353286789263, |
|
"learning_rate": 4.414360655328957e-05, |
|
"loss": 0.6201, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.608108108108108, |
|
"grad_norm": 0.13166529990196998, |
|
"learning_rate": 4.395589201457e-05, |
|
"loss": 0.6167, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.614864864864865, |
|
"grad_norm": 0.14086485553732253, |
|
"learning_rate": 4.376808945310505e-05, |
|
"loss": 0.6219, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.6216216216216215, |
|
"grad_norm": 0.15193755776745674, |
|
"learning_rate": 4.358020304769867e-05, |
|
"loss": 0.5994, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.6283783783783785, |
|
"grad_norm": 0.14946480137653986, |
|
"learning_rate": 4.339223697902037e-05, |
|
"loss": 0.61, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.635135135135135, |
|
"grad_norm": 0.15531265023719382, |
|
"learning_rate": 4.320419542951228e-05, |
|
"loss": 0.6179, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.641891891891892, |
|
"grad_norm": 0.14264508419628688, |
|
"learning_rate": 4.3016082583296067e-05, |
|
"loss": 0.6167, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.6486486486486487, |
|
"grad_norm": 0.12288922241404748, |
|
"learning_rate": 4.2827902626079784e-05, |
|
"loss": 0.6195, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.6554054054054053, |
|
"grad_norm": 0.19330457974395565, |
|
"learning_rate": 4.263965974506483e-05, |
|
"loss": 0.6207, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.6621621621621623, |
|
"grad_norm": 0.16883088734435447, |
|
"learning_rate": 4.2451358128852654e-05, |
|
"loss": 0.5989, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.668918918918919, |
|
"grad_norm": 0.1599604515688246, |
|
"learning_rate": 4.22630019673517e-05, |
|
"loss": 0.6142, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.6756756756756754, |
|
"grad_norm": 0.15518611864455714, |
|
"learning_rate": 4.207459545168405e-05, |
|
"loss": 0.6102, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.6824324324324325, |
|
"grad_norm": 0.16742006705568305, |
|
"learning_rate": 4.188614277409224e-05, |
|
"loss": 0.6171, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.689189189189189, |
|
"grad_norm": 0.15553602245497833, |
|
"learning_rate": 4.169764812784594e-05, |
|
"loss": 0.6143, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.695945945945946, |
|
"grad_norm": 0.1595818544532621, |
|
"learning_rate": 4.1509115707148695e-05, |
|
"loss": 0.6047, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 0.13690560623925258, |
|
"learning_rate": 4.132054970704454e-05, |
|
"loss": 0.6092, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.7094594594594597, |
|
"grad_norm": 0.1341753352699138, |
|
"learning_rate": 4.1131954323324734e-05, |
|
"loss": 0.6023, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.7162162162162162, |
|
"grad_norm": 0.14452976105100054, |
|
"learning_rate": 4.094333375243428e-05, |
|
"loss": 0.6121, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.722972972972973, |
|
"grad_norm": 0.17855418886582433, |
|
"learning_rate": 4.075469219137868e-05, |
|
"loss": 0.6192, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.72972972972973, |
|
"grad_norm": 0.15337873200421298, |
|
"learning_rate": 4.056603383763049e-05, |
|
"loss": 0.6137, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.7364864864864864, |
|
"grad_norm": 0.14193840867721655, |
|
"learning_rate": 4.0377362889035875e-05, |
|
"loss": 0.5971, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.743243243243243, |
|
"grad_norm": 0.14049198222822518, |
|
"learning_rate": 4.0188683543721295e-05, |
|
"loss": 0.61, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.1643045132978014, |
|
"learning_rate": 4e-05, |
|
"loss": 0.6224, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.756756756756757, |
|
"grad_norm": 0.14527602919509278, |
|
"learning_rate": 3.981131645627872e-05, |
|
"loss": 0.6164, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.7635135135135136, |
|
"grad_norm": 0.1384596570378767, |
|
"learning_rate": 3.9622637110964125e-05, |
|
"loss": 0.6158, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.77027027027027, |
|
"grad_norm": 0.18210804820042478, |
|
"learning_rate": 3.943396616236953e-05, |
|
"loss": 0.6149, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.777027027027027, |
|
"grad_norm": 0.16943522003893646, |
|
"learning_rate": 3.9245307808621325e-05, |
|
"loss": 0.6159, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.7837837837837838, |
|
"grad_norm": 0.16603181052517613, |
|
"learning_rate": 3.905666624756573e-05, |
|
"loss": 0.6186, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.7905405405405403, |
|
"grad_norm": 0.15368628411505494, |
|
"learning_rate": 3.886804567667528e-05, |
|
"loss": 0.6155, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.7972972972972974, |
|
"grad_norm": 0.1721774875064551, |
|
"learning_rate": 3.867945029295546e-05, |
|
"loss": 0.6099, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.804054054054054, |
|
"grad_norm": 0.1893984092920587, |
|
"learning_rate": 3.8490884292851325e-05, |
|
"loss": 0.6146, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.810810810810811, |
|
"grad_norm": 0.1792007727966493, |
|
"learning_rate": 3.830235187215408e-05, |
|
"loss": 0.6195, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.8175675675675675, |
|
"grad_norm": 0.1569338853010942, |
|
"learning_rate": 3.8113857225907783e-05, |
|
"loss": 0.6173, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.8243243243243246, |
|
"grad_norm": 0.17031740028829662, |
|
"learning_rate": 3.792540454831596e-05, |
|
"loss": 0.6163, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.831081081081081, |
|
"grad_norm": 0.15004651120970428, |
|
"learning_rate": 3.7736998032648305e-05, |
|
"loss": 0.6047, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.8378378378378377, |
|
"grad_norm": 0.142075802466718, |
|
"learning_rate": 3.754864187114736e-05, |
|
"loss": 0.6253, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.8445945945945947, |
|
"grad_norm": 0.15599780395707422, |
|
"learning_rate": 3.736034025493519e-05, |
|
"loss": 0.593, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.8513513513513513, |
|
"grad_norm": 0.1563201754706643, |
|
"learning_rate": 3.717209737392022e-05, |
|
"loss": 0.6128, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.858108108108108, |
|
"grad_norm": 0.15805265922848155, |
|
"learning_rate": 3.698391741670394e-05, |
|
"loss": 0.6059, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.864864864864865, |
|
"grad_norm": 0.17933457522155424, |
|
"learning_rate": 3.679580457048772e-05, |
|
"loss": 0.6086, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.8716216216216215, |
|
"grad_norm": 0.17371124474695815, |
|
"learning_rate": 3.660776302097965e-05, |
|
"loss": 0.6079, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.8783783783783785, |
|
"grad_norm": 0.1315584411453832, |
|
"learning_rate": 3.641979695230135e-05, |
|
"loss": 0.6039, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.885135135135135, |
|
"grad_norm": 0.148272614562961, |
|
"learning_rate": 3.6231910546894956e-05, |
|
"loss": 0.6161, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.891891891891892, |
|
"grad_norm": 0.12013666775227018, |
|
"learning_rate": 3.6044107985430015e-05, |
|
"loss": 0.6113, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.8986486486486487, |
|
"grad_norm": 0.15454206210883253, |
|
"learning_rate": 3.585639344671043e-05, |
|
"loss": 0.6113, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.9054054054054053, |
|
"grad_norm": 0.130251935264837, |
|
"learning_rate": 3.5668771107581526e-05, |
|
"loss": 0.613, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.9121621621621623, |
|
"grad_norm": 0.1370783380801396, |
|
"learning_rate": 3.5481245142837095e-05, |
|
"loss": 0.6168, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.918918918918919, |
|
"grad_norm": 0.13116479132749334, |
|
"learning_rate": 3.5293819725126464e-05, |
|
"loss": 0.6136, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.9256756756756754, |
|
"grad_norm": 0.14031976128589324, |
|
"learning_rate": 3.5106499024861715e-05, |
|
"loss": 0.6175, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.9324324324324325, |
|
"grad_norm": 0.12054376627925889, |
|
"learning_rate": 3.491928721012485e-05, |
|
"loss": 0.6134, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.939189189189189, |
|
"grad_norm": 0.14272909957039542, |
|
"learning_rate": 3.4732188446575055e-05, |
|
"loss": 0.6096, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.945945945945946, |
|
"grad_norm": 0.1343117824779506, |
|
"learning_rate": 3.454520689735602e-05, |
|
"loss": 0.6203, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.9527027027027026, |
|
"grad_norm": 0.11376422264610185, |
|
"learning_rate": 3.435834672300324e-05, |
|
"loss": 0.6057, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.9594594594594597, |
|
"grad_norm": 0.14208820005659473, |
|
"learning_rate": 3.417161208135155e-05, |
|
"loss": 0.6193, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.9662162162162162, |
|
"grad_norm": 0.11247340517088436, |
|
"learning_rate": 3.398500712744254e-05, |
|
"loss": 0.6218, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.972972972972973, |
|
"grad_norm": 0.12141011100880744, |
|
"learning_rate": 3.379853601343209e-05, |
|
"loss": 0.6124, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.97972972972973, |
|
"grad_norm": 0.11441162073496848, |
|
"learning_rate": 3.361220288849804e-05, |
|
"loss": 0.6146, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.9864864864864864, |
|
"grad_norm": 0.11128267132483809, |
|
"learning_rate": 3.342601189874777e-05, |
|
"loss": 0.6141, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.993243243243243, |
|
"grad_norm": 0.13308500191665273, |
|
"learning_rate": 3.323996718712605e-05, |
|
"loss": 0.6154, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.13634885719680506, |
|
"learning_rate": 3.305407289332279e-05, |
|
"loss": 0.6063, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 3.0067567567567566, |
|
"grad_norm": 0.14117643344569666, |
|
"learning_rate": 3.2868333153680964e-05, |
|
"loss": 0.5952, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.0135135135135136, |
|
"grad_norm": 0.1441355574419299, |
|
"learning_rate": 3.2682752101104536e-05, |
|
"loss": 0.5807, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 3.02027027027027, |
|
"grad_norm": 0.13317791462913253, |
|
"learning_rate": 3.249733386496653e-05, |
|
"loss": 0.6011, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 3.027027027027027, |
|
"grad_norm": 0.1488227563399244, |
|
"learning_rate": 3.231208257101709e-05, |
|
"loss": 0.6055, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 3.0337837837837838, |
|
"grad_norm": 0.13698170944523078, |
|
"learning_rate": 3.212700234129179e-05, |
|
"loss": 0.6012, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 3.0405405405405403, |
|
"grad_norm": 0.13426258482203746, |
|
"learning_rate": 3.194209729401979e-05, |
|
"loss": 0.5955, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.0472972972972974, |
|
"grad_norm": 0.12763528667729904, |
|
"learning_rate": 3.175737154353231e-05, |
|
"loss": 0.5829, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 3.054054054054054, |
|
"grad_norm": 0.12302649236306719, |
|
"learning_rate": 3.157282920017096e-05, |
|
"loss": 0.6031, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 3.060810810810811, |
|
"grad_norm": 0.12836088172431226, |
|
"learning_rate": 3.1388474370196395e-05, |
|
"loss": 0.6029, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 3.0675675675675675, |
|
"grad_norm": 0.12713682511389784, |
|
"learning_rate": 3.12043111556969e-05, |
|
"loss": 0.5898, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 3.074324324324324, |
|
"grad_norm": 0.13954610740683865, |
|
"learning_rate": 3.1020343654497096e-05, |
|
"loss": 0.5882, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.081081081081081, |
|
"grad_norm": 0.14394153328876694, |
|
"learning_rate": 3.083657596006681e-05, |
|
"loss": 0.5878, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 3.0878378378378377, |
|
"grad_norm": 0.11738094672449853, |
|
"learning_rate": 3.065301216142991e-05, |
|
"loss": 0.5916, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 3.0945945945945947, |
|
"grad_norm": 0.1473396296692266, |
|
"learning_rate": 3.046965634307341e-05, |
|
"loss": 0.6003, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 3.1013513513513513, |
|
"grad_norm": 0.12128542181194722, |
|
"learning_rate": 3.028651258485652e-05, |
|
"loss": 0.5807, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 3.108108108108108, |
|
"grad_norm": 0.12076772408917641, |
|
"learning_rate": 3.010358496191991e-05, |
|
"loss": 0.5968, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.114864864864865, |
|
"grad_norm": 0.12207943508497456, |
|
"learning_rate": 2.9920877544595002e-05, |
|
"loss": 0.5911, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 3.1216216216216215, |
|
"grad_norm": 0.1193967138970814, |
|
"learning_rate": 2.9738394398313405e-05, |
|
"loss": 0.5867, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 3.1283783783783785, |
|
"grad_norm": 0.11942196780407818, |
|
"learning_rate": 2.955613958351647e-05, |
|
"loss": 0.5927, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 3.135135135135135, |
|
"grad_norm": 0.12137977118056532, |
|
"learning_rate": 2.9374117155564957e-05, |
|
"loss": 0.5923, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.141891891891892, |
|
"grad_norm": 0.1204663577707489, |
|
"learning_rate": 2.919233116464872e-05, |
|
"loss": 0.5807, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.1486486486486487, |
|
"grad_norm": 0.1277645131662919, |
|
"learning_rate": 2.9010785655696698e-05, |
|
"loss": 0.602, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.1554054054054053, |
|
"grad_norm": 0.14373007650684674, |
|
"learning_rate": 2.88294846682868e-05, |
|
"loss": 0.5952, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.1621621621621623, |
|
"grad_norm": 0.12531967044629988, |
|
"learning_rate": 2.864843223655613e-05, |
|
"loss": 0.6065, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.168918918918919, |
|
"grad_norm": 0.1344060345828822, |
|
"learning_rate": 2.8467632389111126e-05, |
|
"loss": 0.5879, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.175675675675676, |
|
"grad_norm": 0.11307986878514827, |
|
"learning_rate": 2.828708914893799e-05, |
|
"loss": 0.5923, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.1824324324324325, |
|
"grad_norm": 0.1205066746149822, |
|
"learning_rate": 2.8106806533313106e-05, |
|
"loss": 0.6042, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.189189189189189, |
|
"grad_norm": 0.11358248639127554, |
|
"learning_rate": 2.7926788553713734e-05, |
|
"loss": 0.6017, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.195945945945946, |
|
"grad_norm": 0.11273620191303046, |
|
"learning_rate": 2.7747039215728667e-05, |
|
"loss": 0.5909, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.2027027027027026, |
|
"grad_norm": 0.1527832963288818, |
|
"learning_rate": 2.7567562518969155e-05, |
|
"loss": 0.5884, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.2094594594594597, |
|
"grad_norm": 0.1251866201941187, |
|
"learning_rate": 2.7388362456979906e-05, |
|
"loss": 0.5942, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.2162162162162162, |
|
"grad_norm": 0.131897788334417, |
|
"learning_rate": 2.7209443017150193e-05, |
|
"loss": 0.6028, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.222972972972973, |
|
"grad_norm": 0.10949010701632424, |
|
"learning_rate": 2.703080818062517e-05, |
|
"loss": 0.5899, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.22972972972973, |
|
"grad_norm": 0.1085593433907961, |
|
"learning_rate": 2.6852461922217253e-05, |
|
"loss": 0.5915, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.2364864864864864, |
|
"grad_norm": 0.10851474030600673, |
|
"learning_rate": 2.66744082103177e-05, |
|
"loss": 0.5956, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.2432432432432434, |
|
"grad_norm": 0.10556652124147405, |
|
"learning_rate": 2.6496651006808308e-05, |
|
"loss": 0.5923, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.11511281568946971, |
|
"learning_rate": 2.6319194266973256e-05, |
|
"loss": 0.5982, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.2567567567567566, |
|
"grad_norm": 0.10661745620059551, |
|
"learning_rate": 2.614204193941107e-05, |
|
"loss": 0.6029, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.2635135135135136, |
|
"grad_norm": 0.12822931878201987, |
|
"learning_rate": 2.5965197965946783e-05, |
|
"loss": 0.5935, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.27027027027027, |
|
"grad_norm": 0.1156142465379123, |
|
"learning_rate": 2.5788666281544258e-05, |
|
"loss": 0.6142, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.277027027027027, |
|
"grad_norm": 0.13148336347712752, |
|
"learning_rate": 2.561245081421857e-05, |
|
"loss": 0.5942, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.2837837837837838, |
|
"grad_norm": 0.10123146621891457, |
|
"learning_rate": 2.5436555484948643e-05, |
|
"loss": 0.5989, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.2905405405405403, |
|
"grad_norm": 0.1375355982170109, |
|
"learning_rate": 2.5260984207590015e-05, |
|
"loss": 0.5913, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.2972972972972974, |
|
"grad_norm": 0.1071253679793487, |
|
"learning_rate": 2.5085740888787662e-05, |
|
"loss": 0.6034, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.304054054054054, |
|
"grad_norm": 0.13183672834367724, |
|
"learning_rate": 2.4910829427889205e-05, |
|
"loss": 0.6016, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.310810810810811, |
|
"grad_norm": 0.11417300702406326, |
|
"learning_rate": 2.473625371685806e-05, |
|
"loss": 0.5964, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.3175675675675675, |
|
"grad_norm": 0.11783207513787236, |
|
"learning_rate": 2.4562017640186847e-05, |
|
"loss": 0.5983, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.3243243243243246, |
|
"grad_norm": 0.12341779828696402, |
|
"learning_rate": 2.4388125074810986e-05, |
|
"loss": 0.5925, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.331081081081081, |
|
"grad_norm": 0.11637168120668157, |
|
"learning_rate": 2.4214579890022373e-05, |
|
"loss": 0.5771, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.3378378378378377, |
|
"grad_norm": 0.10923183772815541, |
|
"learning_rate": 2.404138594738335e-05, |
|
"loss": 0.5919, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.3445945945945947, |
|
"grad_norm": 0.10526844464204445, |
|
"learning_rate": 2.386854710064075e-05, |
|
"loss": 0.607, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.3513513513513513, |
|
"grad_norm": 0.10474734642926019, |
|
"learning_rate": 2.369606719564015e-05, |
|
"loss": 0.5907, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.358108108108108, |
|
"grad_norm": 0.11107800655708087, |
|
"learning_rate": 2.35239500702403e-05, |
|
"loss": 0.5981, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.364864864864865, |
|
"grad_norm": 0.10187850179282668, |
|
"learning_rate": 2.3352199554227698e-05, |
|
"loss": 0.5938, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.3716216216216215, |
|
"grad_norm": 0.12632250115529653, |
|
"learning_rate": 2.318081946923144e-05, |
|
"loss": 0.6117, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 3.3783783783783785, |
|
"grad_norm": 0.10055327704116511, |
|
"learning_rate": 2.3009813628638085e-05, |
|
"loss": 0.5964, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.385135135135135, |
|
"grad_norm": 0.11916682604562856, |
|
"learning_rate": 2.283918583750695e-05, |
|
"loss": 0.5871, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 3.391891891891892, |
|
"grad_norm": 0.1010245925435501, |
|
"learning_rate": 2.266893989248527e-05, |
|
"loss": 0.6004, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.3986486486486487, |
|
"grad_norm": 0.10446456567129823, |
|
"learning_rate": 2.2499079581723846e-05, |
|
"loss": 0.5942, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 3.4054054054054053, |
|
"grad_norm": 0.10099772014538502, |
|
"learning_rate": 2.2329608684792676e-05, |
|
"loss": 0.5956, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.4121621621621623, |
|
"grad_norm": 0.1005267977448251, |
|
"learning_rate": 2.216053097259697e-05, |
|
"loss": 0.5898, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.418918918918919, |
|
"grad_norm": 0.10513240417556334, |
|
"learning_rate": 2.1991850207293064e-05, |
|
"loss": 0.5997, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 3.4256756756756754, |
|
"grad_norm": 0.09929924802833451, |
|
"learning_rate": 2.1823570142204902e-05, |
|
"loss": 0.5932, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 3.4324324324324325, |
|
"grad_norm": 0.10494976279274904, |
|
"learning_rate": 2.1655694521740376e-05, |
|
"loss": 0.5935, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 3.439189189189189, |
|
"grad_norm": 0.10732590635113842, |
|
"learning_rate": 2.1488227081308054e-05, |
|
"loss": 0.589, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 3.445945945945946, |
|
"grad_norm": 0.1154273309217794, |
|
"learning_rate": 2.132117154723408e-05, |
|
"loss": 0.5927, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.4527027027027026, |
|
"grad_norm": 0.10250291685305624, |
|
"learning_rate": 2.115453163667929e-05, |
|
"loss": 0.6047, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 3.4594594594594597, |
|
"grad_norm": 0.1053185431025155, |
|
"learning_rate": 2.0988311057556397e-05, |
|
"loss": 0.5841, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 3.4662162162162162, |
|
"grad_norm": 0.09414135194355852, |
|
"learning_rate": 2.0822513508447608e-05, |
|
"loss": 0.5863, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 3.472972972972973, |
|
"grad_norm": 0.10071155506936441, |
|
"learning_rate": 2.065714267852223e-05, |
|
"loss": 0.5923, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 3.47972972972973, |
|
"grad_norm": 0.10121850558468308, |
|
"learning_rate": 2.049220224745463e-05, |
|
"loss": 0.5962, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.4864864864864864, |
|
"grad_norm": 0.10161225481650848, |
|
"learning_rate": 2.032769588534233e-05, |
|
"loss": 0.5925, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 3.4932432432432434, |
|
"grad_norm": 0.09239639095614875, |
|
"learning_rate": 2.0163627252624427e-05, |
|
"loss": 0.5944, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.11360448222663692, |
|
"learning_rate": 2.0000000000000012e-05, |
|
"loss": 0.6116, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 3.506756756756757, |
|
"grad_norm": 0.09631002455489597, |
|
"learning_rate": 1.9836817768347015e-05, |
|
"loss": 0.5947, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 3.5135135135135136, |
|
"grad_norm": 0.1167172974336559, |
|
"learning_rate": 1.9674084188641235e-05, |
|
"loss": 0.5843, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.52027027027027, |
|
"grad_norm": 0.10349743444704453, |
|
"learning_rate": 1.9511802881875438e-05, |
|
"loss": 0.6026, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 3.527027027027027, |
|
"grad_norm": 0.11753598782061575, |
|
"learning_rate": 1.9349977458978846e-05, |
|
"loss": 0.5892, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.5337837837837838, |
|
"grad_norm": 0.09146602359405204, |
|
"learning_rate": 1.9188611520736846e-05, |
|
"loss": 0.5916, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 3.5405405405405403, |
|
"grad_norm": 0.10892529356211986, |
|
"learning_rate": 1.902770865771074e-05, |
|
"loss": 0.5854, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 3.5472972972972974, |
|
"grad_norm": 0.09418036132529382, |
|
"learning_rate": 1.886727245015794e-05, |
|
"loss": 0.5993, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.554054054054054, |
|
"grad_norm": 0.1025611805390442, |
|
"learning_rate": 1.8707306467952323e-05, |
|
"loss": 0.5916, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 3.560810810810811, |
|
"grad_norm": 0.08852437916251449, |
|
"learning_rate": 1.8547814270504705e-05, |
|
"loss": 0.5946, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 3.5675675675675675, |
|
"grad_norm": 0.0999923376365888, |
|
"learning_rate": 1.838879940668373e-05, |
|
"loss": 0.6, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 3.5743243243243246, |
|
"grad_norm": 0.10873141849605454, |
|
"learning_rate": 1.823026541473684e-05, |
|
"loss": 0.6017, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 3.581081081081081, |
|
"grad_norm": 0.10599260470783008, |
|
"learning_rate": 1.8072215822211613e-05, |
|
"loss": 0.5968, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.5878378378378377, |
|
"grad_norm": 0.11188333764585037, |
|
"learning_rate": 1.7914654145877187e-05, |
|
"loss": 0.6064, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 3.5945945945945947, |
|
"grad_norm": 0.09583819123841314, |
|
"learning_rate": 1.77575838916461e-05, |
|
"loss": 0.5876, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 3.6013513513513513, |
|
"grad_norm": 0.10206469144841168, |
|
"learning_rate": 1.760100855449619e-05, |
|
"loss": 0.5948, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 3.608108108108108, |
|
"grad_norm": 0.10036289538603702, |
|
"learning_rate": 1.7444931618392894e-05, |
|
"loss": 0.6107, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 3.614864864864865, |
|
"grad_norm": 0.08314263844698841, |
|
"learning_rate": 1.7289356556211687e-05, |
|
"loss": 0.6059, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.6216216216216215, |
|
"grad_norm": 0.10484951196819213, |
|
"learning_rate": 1.7134286829660855e-05, |
|
"loss": 0.5961, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 3.6283783783783785, |
|
"grad_norm": 0.07757732001833932, |
|
"learning_rate": 1.697972588920439e-05, |
|
"loss": 0.603, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 3.635135135135135, |
|
"grad_norm": 0.09636592692846442, |
|
"learning_rate": 1.6825677173985332e-05, |
|
"loss": 0.597, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 3.641891891891892, |
|
"grad_norm": 0.0934961698224681, |
|
"learning_rate": 1.6672144111749066e-05, |
|
"loss": 0.6065, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 3.6486486486486487, |
|
"grad_norm": 0.0956796061229755, |
|
"learning_rate": 1.6519130118767258e-05, |
|
"loss": 0.5978, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.6554054054054053, |
|
"grad_norm": 0.1161980558323059, |
|
"learning_rate": 1.6366638599761676e-05, |
|
"loss": 0.5914, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 3.6621621621621623, |
|
"grad_norm": 0.08312306306871219, |
|
"learning_rate": 1.621467294782854e-05, |
|
"loss": 0.5855, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 3.668918918918919, |
|
"grad_norm": 0.11701156024970683, |
|
"learning_rate": 1.606323654436293e-05, |
|
"loss": 0.5904, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 3.6756756756756754, |
|
"grad_norm": 0.09466465341207426, |
|
"learning_rate": 1.591233275898363e-05, |
|
"loss": 0.5915, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 3.6824324324324325, |
|
"grad_norm": 0.10284506462525071, |
|
"learning_rate": 1.5761964949458076e-05, |
|
"loss": 0.5847, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.689189189189189, |
|
"grad_norm": 0.10382209320021844, |
|
"learning_rate": 1.5612136461627726e-05, |
|
"loss": 0.596, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 3.695945945945946, |
|
"grad_norm": 0.09709377064685741, |
|
"learning_rate": 1.546285062933352e-05, |
|
"loss": 0.5909, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 3.7027027027027026, |
|
"grad_norm": 0.11272439075208067, |
|
"learning_rate": 1.5314110774341803e-05, |
|
"loss": 0.5953, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 3.7094594594594597, |
|
"grad_norm": 0.11904326038877579, |
|
"learning_rate": 1.5165920206270257e-05, |
|
"loss": 0.5929, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 3.7162162162162162, |
|
"grad_norm": 0.1039606133991536, |
|
"learning_rate": 1.5018282222514451e-05, |
|
"loss": 0.5877, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.722972972972973, |
|
"grad_norm": 0.11190632151336002, |
|
"learning_rate": 1.4871200108174306e-05, |
|
"loss": 0.5917, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 3.72972972972973, |
|
"grad_norm": 0.09272507731002902, |
|
"learning_rate": 1.4724677135981118e-05, |
|
"loss": 0.5959, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 3.7364864864864864, |
|
"grad_norm": 0.09431866458492948, |
|
"learning_rate": 1.457871656622463e-05, |
|
"loss": 0.5887, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 3.743243243243243, |
|
"grad_norm": 0.08680813466014699, |
|
"learning_rate": 1.4433321646680614e-05, |
|
"loss": 0.6032, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.0844092691423774, |
|
"learning_rate": 1.4288495612538427e-05, |
|
"loss": 0.6062, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.756756756756757, |
|
"grad_norm": 0.0874893569172525, |
|
"learning_rate": 1.4144241686329236e-05, |
|
"loss": 0.5833, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 3.7635135135135136, |
|
"grad_norm": 0.10957613038759607, |
|
"learning_rate": 1.400056307785413e-05, |
|
"loss": 0.6056, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 3.77027027027027, |
|
"grad_norm": 0.07882052024046089, |
|
"learning_rate": 1.3857462984112831e-05, |
|
"loss": 0.5912, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 3.777027027027027, |
|
"grad_norm": 0.08565900595306047, |
|
"learning_rate": 1.371494458923246e-05, |
|
"loss": 0.5919, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 3.7837837837837838, |
|
"grad_norm": 0.09674275812299338, |
|
"learning_rate": 1.3573011064396751e-05, |
|
"loss": 0.5985, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.7905405405405403, |
|
"grad_norm": 0.08500752883113108, |
|
"learning_rate": 1.3431665567775439e-05, |
|
"loss": 0.5983, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 3.7972972972972974, |
|
"grad_norm": 0.11583654081493171, |
|
"learning_rate": 1.3290911244454066e-05, |
|
"loss": 0.5903, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 3.804054054054054, |
|
"grad_norm": 0.07940239999432974, |
|
"learning_rate": 1.3150751226363886e-05, |
|
"loss": 0.5931, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 3.810810810810811, |
|
"grad_norm": 0.10093820387387048, |
|
"learning_rate": 1.3011188632212307e-05, |
|
"loss": 0.6033, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 3.8175675675675675, |
|
"grad_norm": 0.0890166406955722, |
|
"learning_rate": 1.2872226567413346e-05, |
|
"loss": 0.5924, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.8243243243243246, |
|
"grad_norm": 0.07708649898799794, |
|
"learning_rate": 1.2733868124018694e-05, |
|
"loss": 0.5891, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 3.831081081081081, |
|
"grad_norm": 0.11011562257155211, |
|
"learning_rate": 1.2596116380648761e-05, |
|
"loss": 0.5898, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 3.8378378378378377, |
|
"grad_norm": 0.07996964493884025, |
|
"learning_rate": 1.2458974402424312e-05, |
|
"loss": 0.5937, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 3.8445945945945947, |
|
"grad_norm": 0.09078629614798739, |
|
"learning_rate": 1.2322445240898158e-05, |
|
"loss": 0.5908, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 3.8513513513513513, |
|
"grad_norm": 0.08453528616636899, |
|
"learning_rate": 1.2186531933987294e-05, |
|
"loss": 0.5895, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.858108108108108, |
|
"grad_norm": 0.08207256114306587, |
|
"learning_rate": 1.2051237505905302e-05, |
|
"loss": 0.6, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 3.864864864864865, |
|
"grad_norm": 0.08328736066936966, |
|
"learning_rate": 1.19165649670951e-05, |
|
"loss": 0.5901, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 3.8716216216216215, |
|
"grad_norm": 0.0796730356081974, |
|
"learning_rate": 1.1782517314161872e-05, |
|
"loss": 0.5825, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 3.8783783783783785, |
|
"grad_norm": 0.09614075098109777, |
|
"learning_rate": 1.164909752980648e-05, |
|
"loss": 0.5949, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 3.885135135135135, |
|
"grad_norm": 0.08310475221153353, |
|
"learning_rate": 1.1516308582758983e-05, |
|
"loss": 0.604, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.891891891891892, |
|
"grad_norm": 0.08282668808548053, |
|
"learning_rate": 1.1384153427712729e-05, |
|
"loss": 0.6002, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 3.8986486486486487, |
|
"grad_norm": 0.0986705467432139, |
|
"learning_rate": 1.1252635005258466e-05, |
|
"loss": 0.5853, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 3.9054054054054053, |
|
"grad_norm": 0.08062037787963591, |
|
"learning_rate": 1.1121756241819023e-05, |
|
"loss": 0.6047, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 3.9121621621621623, |
|
"grad_norm": 0.07523397720678549, |
|
"learning_rate": 1.0991520049584112e-05, |
|
"loss": 0.5842, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 3.918918918918919, |
|
"grad_norm": 0.08533444706267539, |
|
"learning_rate": 1.0861929326445572e-05, |
|
"loss": 0.5952, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.9256756756756754, |
|
"grad_norm": 0.0867340533978853, |
|
"learning_rate": 1.0732986955932869e-05, |
|
"loss": 0.5978, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 3.9324324324324325, |
|
"grad_norm": 0.06754168833386048, |
|
"learning_rate": 1.0604695807148971e-05, |
|
"loss": 0.5987, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 3.939189189189189, |
|
"grad_norm": 0.0796662865647217, |
|
"learning_rate": 1.0477058734706436e-05, |
|
"loss": 0.588, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 3.945945945945946, |
|
"grad_norm": 0.0813837559182795, |
|
"learning_rate": 1.0350078578664005e-05, |
|
"loss": 0.6009, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 3.9527027027027026, |
|
"grad_norm": 0.06912060191449508, |
|
"learning_rate": 1.0223758164463246e-05, |
|
"loss": 0.6055, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.9594594594594597, |
|
"grad_norm": 0.06784642823506538, |
|
"learning_rate": 1.0098100302865865e-05, |
|
"loss": 0.5882, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 3.9662162162162162, |
|
"grad_norm": 0.0760135339338161, |
|
"learning_rate": 9.973107789891024e-06, |
|
"loss": 0.596, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 3.972972972972973, |
|
"grad_norm": 0.07657252850751217, |
|
"learning_rate": 9.848783406753224e-06, |
|
"loss": 0.5962, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 3.97972972972973, |
|
"grad_norm": 0.08050666474861323, |
|
"learning_rate": 9.725129919800339e-06, |
|
"loss": 0.5957, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 3.9864864864864864, |
|
"grad_norm": 0.0778632825822388, |
|
"learning_rate": 9.60215008045211e-06, |
|
"loss": 0.5952, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.993243243243243, |
|
"grad_norm": 0.07984922923288139, |
|
"learning_rate": 9.479846625138909e-06, |
|
"loss": 0.6023, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.0916488655332951, |
|
"learning_rate": 9.358222275240884e-06, |
|
"loss": 0.5842, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 4.006756756756757, |
|
"grad_norm": 0.08766439343096606, |
|
"learning_rate": 9.237279737027326e-06, |
|
"loss": 0.5851, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 4.013513513513513, |
|
"grad_norm": 0.07964740730111751, |
|
"learning_rate": 9.117021701596567e-06, |
|
"loss": 0.5776, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 4.02027027027027, |
|
"grad_norm": 0.08933138991359177, |
|
"learning_rate": 8.99745084481594e-06, |
|
"loss": 0.5953, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 4.027027027027027, |
|
"grad_norm": 0.07914528740585995, |
|
"learning_rate": 8.87856982726243e-06, |
|
"loss": 0.5892, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 4.033783783783784, |
|
"grad_norm": 0.07297723048835614, |
|
"learning_rate": 8.7603812941633e-06, |
|
"loss": 0.5823, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 4.04054054054054, |
|
"grad_norm": 0.10229836912930287, |
|
"learning_rate": 8.642887875337376e-06, |
|
"loss": 0.5767, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 4.047297297297297, |
|
"grad_norm": 0.0781799178126612, |
|
"learning_rate": 8.526092185136394e-06, |
|
"loss": 0.5784, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 4.054054054054054, |
|
"grad_norm": 0.09402345888004014, |
|
"learning_rate": 8.409996822386972e-06, |
|
"loss": 0.5828, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.0608108108108105, |
|
"grad_norm": 0.07679009840525959, |
|
"learning_rate": 8.294604370332613e-06, |
|
"loss": 0.5754, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 4.0675675675675675, |
|
"grad_norm": 0.07923822298946714, |
|
"learning_rate": 8.17991739657641e-06, |
|
"loss": 0.573, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 4.074324324324325, |
|
"grad_norm": 0.08082893460744046, |
|
"learning_rate": 8.06593845302376e-06, |
|
"loss": 0.5916, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 4.081081081081081, |
|
"grad_norm": 0.07578230123409459, |
|
"learning_rate": 7.952670075825702e-06, |
|
"loss": 0.5851, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 4.087837837837838, |
|
"grad_norm": 0.07539180401042113, |
|
"learning_rate": 7.840114785322384e-06, |
|
"loss": 0.5939, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 4.094594594594595, |
|
"grad_norm": 0.07932706141071251, |
|
"learning_rate": 7.728275085987041e-06, |
|
"loss": 0.5908, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 4.101351351351352, |
|
"grad_norm": 0.07344646624670495, |
|
"learning_rate": 7.6171534663702416e-06, |
|
"loss": 0.5915, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 4.108108108108108, |
|
"grad_norm": 0.07204192586718106, |
|
"learning_rate": 7.50675239904457e-06, |
|
"loss": 0.5855, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 4.114864864864865, |
|
"grad_norm": 0.06717242761020562, |
|
"learning_rate": 7.397074340549508e-06, |
|
"loss": 0.5919, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 4.121621621621622, |
|
"grad_norm": 0.07615198090948691, |
|
"learning_rate": 7.288121731336901e-06, |
|
"loss": 0.5883, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.128378378378378, |
|
"grad_norm": 0.07186386521547546, |
|
"learning_rate": 7.1798969957165025e-06, |
|
"loss": 0.5666, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 4.135135135135135, |
|
"grad_norm": 0.07771245688676058, |
|
"learning_rate": 7.072402541802197e-06, |
|
"loss": 0.572, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 4.141891891891892, |
|
"grad_norm": 0.06560942291485741, |
|
"learning_rate": 6.965640761458274e-06, |
|
"loss": 0.5853, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 4.148648648648648, |
|
"grad_norm": 0.07554536141277478, |
|
"learning_rate": 6.859614030246318e-06, |
|
"loss": 0.5746, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 4.155405405405405, |
|
"grad_norm": 0.0766866599574075, |
|
"learning_rate": 6.754324707372264e-06, |
|
"loss": 0.5827, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 4.162162162162162, |
|
"grad_norm": 0.0685589990578116, |
|
"learning_rate": 6.649775135633944e-06, |
|
"loss": 0.5708, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 4.168918918918919, |
|
"grad_norm": 0.06830496975914782, |
|
"learning_rate": 6.545967641368958e-06, |
|
"loss": 0.5772, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 4.175675675675675, |
|
"grad_norm": 0.06792930742316021, |
|
"learning_rate": 6.4429045344029136e-06, |
|
"loss": 0.5801, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 4.1824324324324325, |
|
"grad_norm": 0.0728029236733438, |
|
"learning_rate": 6.340588107997994e-06, |
|
"loss": 0.5666, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 4.1891891891891895, |
|
"grad_norm": 0.07156462363762543, |
|
"learning_rate": 6.239020638801987e-06, |
|
"loss": 0.5848, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.195945945945946, |
|
"grad_norm": 0.06757664374848818, |
|
"learning_rate": 6.1382043867975836e-06, |
|
"loss": 0.5679, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 4.202702702702703, |
|
"grad_norm": 0.07016074613879056, |
|
"learning_rate": 6.038141595252094e-06, |
|
"loss": 0.5839, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 4.20945945945946, |
|
"grad_norm": 0.06809385798805276, |
|
"learning_rate": 5.9388344906675485e-06, |
|
"loss": 0.575, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 4.216216216216216, |
|
"grad_norm": 0.061129888925191694, |
|
"learning_rate": 5.840285282731173e-06, |
|
"loss": 0.5914, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 4.222972972972973, |
|
"grad_norm": 0.0728841561672064, |
|
"learning_rate": 5.742496164266174e-06, |
|
"loss": 0.578, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.22972972972973, |
|
"grad_norm": 0.07404139765077528, |
|
"learning_rate": 5.645469311182958e-06, |
|
"loss": 0.5788, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 4.236486486486487, |
|
"grad_norm": 0.06599738371244251, |
|
"learning_rate": 5.549206882430773e-06, |
|
"loss": 0.5938, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 4.243243243243243, |
|
"grad_norm": 0.07571258452222435, |
|
"learning_rate": 5.453711019949581e-06, |
|
"loss": 0.5761, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.06873039122654918, |
|
"learning_rate": 5.358983848622452e-06, |
|
"loss": 0.5865, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 4.256756756756757, |
|
"grad_norm": 0.05980111880094078, |
|
"learning_rate": 5.265027476228297e-06, |
|
"loss": 0.58, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.263513513513513, |
|
"grad_norm": 0.07210626853825096, |
|
"learning_rate": 5.171843993394903e-06, |
|
"loss": 0.5756, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 4.27027027027027, |
|
"grad_norm": 0.07396976700448737, |
|
"learning_rate": 5.079435473552474e-06, |
|
"loss": 0.5811, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 4.277027027027027, |
|
"grad_norm": 0.09171081358728068, |
|
"learning_rate": 4.987803972887482e-06, |
|
"loss": 0.5871, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 4.283783783783784, |
|
"grad_norm": 0.06701824447243687, |
|
"learning_rate": 4.896951530296896e-06, |
|
"loss": 0.579, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 4.29054054054054, |
|
"grad_norm": 0.07518459749761504, |
|
"learning_rate": 4.806880167342831e-06, |
|
"loss": 0.5822, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.297297297297297, |
|
"grad_norm": 0.07098354334790964, |
|
"learning_rate": 4.7175918882075465e-06, |
|
"loss": 0.5861, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 4.304054054054054, |
|
"grad_norm": 0.07479138842797883, |
|
"learning_rate": 4.6290886796488946e-06, |
|
"loss": 0.5863, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 4.3108108108108105, |
|
"grad_norm": 0.06686921193950773, |
|
"learning_rate": 4.54137251095605e-06, |
|
"loss": 0.5916, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 4.3175675675675675, |
|
"grad_norm": 0.07897960962457094, |
|
"learning_rate": 4.454445333905768e-06, |
|
"loss": 0.5948, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 4.324324324324325, |
|
"grad_norm": 0.07496951321971122, |
|
"learning_rate": 4.3683090827188666e-06, |
|
"loss": 0.5813, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.331081081081081, |
|
"grad_norm": 0.07536718259174548, |
|
"learning_rate": 4.282965674017265e-06, |
|
"loss": 0.5872, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 4.337837837837838, |
|
"grad_norm": 0.06854859712850378, |
|
"learning_rate": 4.198417006781283e-06, |
|
"loss": 0.5841, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 4.344594594594595, |
|
"grad_norm": 0.07618804161631497, |
|
"learning_rate": 4.114664962307439e-06, |
|
"loss": 0.5817, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 4.351351351351352, |
|
"grad_norm": 0.07601070844404975, |
|
"learning_rate": 4.031711404166525e-06, |
|
"loss": 0.584, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 4.358108108108108, |
|
"grad_norm": 0.06914624176338743, |
|
"learning_rate": 3.949558178162209e-06, |
|
"loss": 0.5888, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.364864864864865, |
|
"grad_norm": 0.07027655538494314, |
|
"learning_rate": 3.86820711228991e-06, |
|
"loss": 0.5867, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 4.371621621621622, |
|
"grad_norm": 0.0760270326599887, |
|
"learning_rate": 3.7876600166961353e-06, |
|
"loss": 0.5961, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 4.378378378378378, |
|
"grad_norm": 0.06782313812781374, |
|
"learning_rate": 3.707918683638223e-06, |
|
"loss": 0.5944, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 4.385135135135135, |
|
"grad_norm": 0.06192386606423058, |
|
"learning_rate": 3.628984887444462e-06, |
|
"loss": 0.5969, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 4.391891891891892, |
|
"grad_norm": 0.062017594322198465, |
|
"learning_rate": 3.550860384474568e-06, |
|
"loss": 0.5863, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.398648648648648, |
|
"grad_norm": 0.058946604304473224, |
|
"learning_rate": 3.473546913080674e-06, |
|
"loss": 0.5847, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 4.405405405405405, |
|
"grad_norm": 0.08803265906901768, |
|
"learning_rate": 3.397046193568558e-06, |
|
"loss": 0.5707, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 4.412162162162162, |
|
"grad_norm": 0.06583037908170926, |
|
"learning_rate": 3.3213599281594688e-06, |
|
"loss": 0.582, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 4.418918918918919, |
|
"grad_norm": 0.06086938296974132, |
|
"learning_rate": 3.246489800952155e-06, |
|
"loss": 0.5953, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 4.425675675675675, |
|
"grad_norm": 0.05932401857518893, |
|
"learning_rate": 3.172437477885475e-06, |
|
"loss": 0.5819, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.4324324324324325, |
|
"grad_norm": 0.057752799006265246, |
|
"learning_rate": 3.099204606701256e-06, |
|
"loss": 0.5766, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 4.4391891891891895, |
|
"grad_norm": 0.06397066535508446, |
|
"learning_rate": 3.026792816907671e-06, |
|
"loss": 0.588, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 4.445945945945946, |
|
"grad_norm": 0.06029045959058417, |
|
"learning_rate": 2.955203719742965e-06, |
|
"loss": 0.591, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 4.452702702702703, |
|
"grad_norm": 0.05838249627908107, |
|
"learning_rate": 2.884438908139626e-06, |
|
"loss": 0.5929, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 4.45945945945946, |
|
"grad_norm": 0.06075718886251464, |
|
"learning_rate": 2.814499956688912e-06, |
|
"loss": 0.5813, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.466216216216216, |
|
"grad_norm": 0.0719277704750579, |
|
"learning_rate": 2.7453884216058368e-06, |
|
"loss": 0.5795, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 4.472972972972973, |
|
"grad_norm": 0.06075719822797747, |
|
"learning_rate": 2.677105840694507e-06, |
|
"loss": 0.5674, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 4.47972972972973, |
|
"grad_norm": 0.05975207139447743, |
|
"learning_rate": 2.6096537333139616e-06, |
|
"loss": 0.5805, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 4.486486486486487, |
|
"grad_norm": 0.06213586333263577, |
|
"learning_rate": 2.5430336003443045e-06, |
|
"loss": 0.5906, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 4.493243243243243, |
|
"grad_norm": 0.05895116376323496, |
|
"learning_rate": 2.4772469241533648e-06, |
|
"loss": 0.5844, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.06765046020471, |
|
"learning_rate": 2.4122951685636674e-06, |
|
"loss": 0.5835, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 4.506756756756757, |
|
"grad_norm": 0.06650006977809148, |
|
"learning_rate": 2.3481797788198745e-06, |
|
"loss": 0.5789, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 4.513513513513513, |
|
"grad_norm": 0.05846137750044436, |
|
"learning_rate": 2.284902181556632e-06, |
|
"loss": 0.5745, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 4.52027027027027, |
|
"grad_norm": 0.05989337419115531, |
|
"learning_rate": 2.2224637847668484e-06, |
|
"loss": 0.5733, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 4.527027027027027, |
|
"grad_norm": 0.06939319651650724, |
|
"learning_rate": 2.1608659777703033e-06, |
|
"loss": 0.581, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.533783783783784, |
|
"grad_norm": 0.054679826375991857, |
|
"learning_rate": 2.100110131182813e-06, |
|
"loss": 0.5931, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 4.54054054054054, |
|
"grad_norm": 0.05548818045052255, |
|
"learning_rate": 2.0401975968856514e-06, |
|
"loss": 0.5786, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 4.547297297297297, |
|
"grad_norm": 0.059389572689147435, |
|
"learning_rate": 1.981129707995542e-06, |
|
"loss": 0.5923, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 4.554054054054054, |
|
"grad_norm": 0.06975826873383643, |
|
"learning_rate": 1.9229077788349393e-06, |
|
"loss": 0.577, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 4.5608108108108105, |
|
"grad_norm": 0.07220930223960455, |
|
"learning_rate": 1.865533104902828e-06, |
|
"loss": 0.5916, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.5675675675675675, |
|
"grad_norm": 0.053811020975530956, |
|
"learning_rate": 1.8090069628458583e-06, |
|
"loss": 0.5704, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 4.574324324324325, |
|
"grad_norm": 0.05538660298263909, |
|
"learning_rate": 1.7533306104299663e-06, |
|
"loss": 0.5856, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 4.581081081081081, |
|
"grad_norm": 0.05829509686058199, |
|
"learning_rate": 1.6985052865123641e-06, |
|
"loss": 0.5754, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 4.587837837837838, |
|
"grad_norm": 0.06323909786498662, |
|
"learning_rate": 1.6445322110140116e-06, |
|
"loss": 0.5814, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 4.594594594594595, |
|
"grad_norm": 0.056972721072935675, |
|
"learning_rate": 1.59141258489242e-06, |
|
"loss": 0.5756, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.601351351351351, |
|
"grad_norm": 0.05816167859998497, |
|
"learning_rate": 1.53914759011498e-06, |
|
"loss": 0.5889, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 4.608108108108108, |
|
"grad_norm": 0.059044794864098894, |
|
"learning_rate": 1.4877383896326269e-06, |
|
"loss": 0.5781, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 4.614864864864865, |
|
"grad_norm": 0.06237941099681927, |
|
"learning_rate": 1.4371861273539778e-06, |
|
"loss": 0.576, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 4.621621621621622, |
|
"grad_norm": 0.06531112971211704, |
|
"learning_rate": 1.3874919281198662e-06, |
|
"loss": 0.5799, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 4.628378378378378, |
|
"grad_norm": 0.05265860386748638, |
|
"learning_rate": 1.3386568976783453e-06, |
|
"loss": 0.5891, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.635135135135135, |
|
"grad_norm": 0.05785779557411594, |
|
"learning_rate": 1.2906821226600453e-06, |
|
"loss": 0.5768, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 4.641891891891892, |
|
"grad_norm": 0.05639941023635892, |
|
"learning_rate": 1.2435686705540228e-06, |
|
"loss": 0.5842, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 4.648648648648649, |
|
"grad_norm": 0.06338285927327728, |
|
"learning_rate": 1.1973175896839684e-06, |
|
"loss": 0.573, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 4.655405405405405, |
|
"grad_norm": 0.054671739680034734, |
|
"learning_rate": 1.1519299091849523e-06, |
|
"loss": 0.5774, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 4.662162162162162, |
|
"grad_norm": 0.057698821845826635, |
|
"learning_rate": 1.1074066389804395e-06, |
|
"loss": 0.5932, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.668918918918919, |
|
"grad_norm": 0.05697783615575523, |
|
"learning_rate": 1.0637487697598937e-06, |
|
"loss": 0.5823, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 4.675675675675675, |
|
"grad_norm": 0.054051563368746565, |
|
"learning_rate": 1.0209572729566708e-06, |
|
"loss": 0.5787, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 4.6824324324324325, |
|
"grad_norm": 0.05344229805841218, |
|
"learning_rate": 9.790331007264543e-07, |
|
"loss": 0.5844, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 4.6891891891891895, |
|
"grad_norm": 0.06959576323504683, |
|
"learning_rate": 9.379771859260267e-07, |
|
"loss": 0.5817, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 4.695945945945946, |
|
"grad_norm": 0.06318026825084985, |
|
"learning_rate": 8.977904420925543e-07, |
|
"loss": 0.5892, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 4.702702702702703, |
|
"grad_norm": 0.056039993579189235, |
|
"learning_rate": 8.584737634232154e-07, |
|
"loss": 0.5835, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 4.70945945945946, |
|
"grad_norm": 0.05269309541455996, |
|
"learning_rate": 8.200280247553461e-07, |
|
"loss": 0.5803, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 4.716216216216216, |
|
"grad_norm": 0.05510521164357771, |
|
"learning_rate": 7.824540815469306e-07, |
|
"loss": 0.5709, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 4.722972972972973, |
|
"grad_norm": 0.05770652073739158, |
|
"learning_rate": 7.457527698576217e-07, |
|
"loss": 0.5855, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 4.72972972972973, |
|
"grad_norm": 0.055593016439349656, |
|
"learning_rate": 7.099249063300751e-07, |
|
"loss": 0.5847, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.736486486486487, |
|
"grad_norm": 0.05730248759901012, |
|
"learning_rate": 6.749712881718306e-07, |
|
"loss": 0.5639, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 4.743243243243243, |
|
"grad_norm": 0.05973974788088902, |
|
"learning_rate": 6.408926931375403e-07, |
|
"loss": 0.5808, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.05557100230846073, |
|
"learning_rate": 6.076898795116792e-07, |
|
"loss": 0.5848, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 4.756756756756757, |
|
"grad_norm": 0.0563081652270169, |
|
"learning_rate": 5.753635860916617e-07, |
|
"loss": 0.5888, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 4.763513513513513, |
|
"grad_norm": 0.0536729552491486, |
|
"learning_rate": 5.43914532171419e-07, |
|
"loss": 0.5873, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 4.77027027027027, |
|
"grad_norm": 0.049916139412812977, |
|
"learning_rate": 5.133434175253715e-07, |
|
"loss": 0.5792, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 4.777027027027027, |
|
"grad_norm": 0.05386272187182479, |
|
"learning_rate": 4.83650922392882e-07, |
|
"loss": 0.5829, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 4.783783783783784, |
|
"grad_norm": 0.05503179443312982, |
|
"learning_rate": 4.5483770746309383e-07, |
|
"loss": 0.5873, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 4.79054054054054, |
|
"grad_norm": 0.05614293253442416, |
|
"learning_rate": 4.269044138602585e-07, |
|
"loss": 0.5879, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 4.797297297297297, |
|
"grad_norm": 0.053484276503556256, |
|
"learning_rate": 3.998516631294491e-07, |
|
"loss": 0.5794, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.804054054054054, |
|
"grad_norm": 0.055818505320336, |
|
"learning_rate": 3.736800572227317e-07, |
|
"loss": 0.5905, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 4.8108108108108105, |
|
"grad_norm": 0.05644903653041531, |
|
"learning_rate": 3.483901784857846e-07, |
|
"loss": 0.5721, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 4.8175675675675675, |
|
"grad_norm": 0.04844049723060667, |
|
"learning_rate": 3.239825896449267e-07, |
|
"loss": 0.5826, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 4.824324324324325, |
|
"grad_norm": 0.053088243917807275, |
|
"learning_rate": 3.004578337945985e-07, |
|
"loss": 0.5811, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 4.831081081081081, |
|
"grad_norm": 0.05140464370774195, |
|
"learning_rate": 2.778164343852918e-07, |
|
"loss": 0.5791, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 4.837837837837838, |
|
"grad_norm": 0.05679451911188569, |
|
"learning_rate": 2.5605889521188364e-07, |
|
"loss": 0.5787, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 4.844594594594595, |
|
"grad_norm": 0.06082278783885325, |
|
"learning_rate": 2.351857004024316e-07, |
|
"loss": 0.5868, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 4.851351351351351, |
|
"grad_norm": 0.05433042972998646, |
|
"learning_rate": 2.1519731440740487e-07, |
|
"loss": 0.5812, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 4.858108108108108, |
|
"grad_norm": 0.055224715989184006, |
|
"learning_rate": 1.9609418198935916e-07, |
|
"loss": 0.5881, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 4.864864864864865, |
|
"grad_norm": 0.05348313369078817, |
|
"learning_rate": 1.778767282130156e-07, |
|
"loss": 0.5945, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.871621621621622, |
|
"grad_norm": 0.05163946442780112, |
|
"learning_rate": 1.6054535843582854e-07, |
|
"loss": 0.5891, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 4.878378378378378, |
|
"grad_norm": 0.055008800596469655, |
|
"learning_rate": 1.4410045829893915e-07, |
|
"loss": 0.5817, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 4.885135135135135, |
|
"grad_norm": 0.05385755694710143, |
|
"learning_rate": 1.2854239371863142e-07, |
|
"loss": 0.5803, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 4.891891891891892, |
|
"grad_norm": 0.050256418798137074, |
|
"learning_rate": 1.1387151087814297e-07, |
|
"loss": 0.5776, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 4.898648648648649, |
|
"grad_norm": 0.06422788348062423, |
|
"learning_rate": 1.0008813622001345e-07, |
|
"loss": 0.5865, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.905405405405405, |
|
"grad_norm": 0.058912661052812065, |
|
"learning_rate": 8.719257643877044e-08, |
|
"loss": 0.5904, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 4.912162162162162, |
|
"grad_norm": 0.05385241384320565, |
|
"learning_rate": 7.51851184741481e-08, |
|
"loss": 0.5944, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 4.918918918918919, |
|
"grad_norm": 0.05443985642845785, |
|
"learning_rate": 6.40660295046791e-08, |
|
"loss": 0.5857, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 4.925675675675675, |
|
"grad_norm": 0.05281227387100857, |
|
"learning_rate": 5.3835556941743695e-08, |
|
"loss": 0.5742, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 4.9324324324324325, |
|
"grad_norm": 0.05531460046068238, |
|
"learning_rate": 4.449392842408529e-08, |
|
"loss": 0.5743, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.9391891891891895, |
|
"grad_norm": 0.05538814823209324, |
|
"learning_rate": 3.6041351812743374e-08, |
|
"loss": 0.5877, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 4.945945945945946, |
|
"grad_norm": 0.055186199974581227, |
|
"learning_rate": 2.8478015186399477e-08, |
|
"loss": 0.5842, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 4.952702702702703, |
|
"grad_norm": 0.055151710910165075, |
|
"learning_rate": 2.1804086837229344e-08, |
|
"loss": 0.5846, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 4.95945945945946, |
|
"grad_norm": 0.05558334617444456, |
|
"learning_rate": 1.601971526713708e-08, |
|
"loss": 0.5766, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 4.966216216216216, |
|
"grad_norm": 0.049437407136724947, |
|
"learning_rate": 1.112502918445113e-08, |
|
"loss": 0.586, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 4.972972972972973, |
|
"grad_norm": 0.05838258812809817, |
|
"learning_rate": 7.12013750107321e-09, |
|
"loss": 0.5918, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 4.97972972972973, |
|
"grad_norm": 0.05252990242194222, |
|
"learning_rate": 4.00512933004471e-09, |
|
"loss": 0.5853, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 4.986486486486487, |
|
"grad_norm": 0.05539911735758133, |
|
"learning_rate": 1.7800739835616143e-09, |
|
"loss": 0.5731, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 4.993243243243243, |
|
"grad_norm": 0.059325115613032664, |
|
"learning_rate": 4.450209714379483e-10, |
|
"loss": 0.5814, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.059385070432276595, |
|
"learning_rate": 0.0, |
|
"loss": 0.577, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 740, |
|
"total_flos": 4.168626538020864e+16, |
|
"train_loss": 0.6272868978010642, |
|
"train_runtime": 42334.4318, |
|
"train_samples_per_second": 8.913, |
|
"train_steps_per_second": 0.017 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 740, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.168626538020864e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|