{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.00100150225338,
  "eval_steps": 50,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00200300450676014,
      "grad_norm": 1.1490445137023926,
      "learning_rate": 6.666666666666667e-07,
      "loss": 1.2042,
      "step": 2
    },
    {
      "epoch": 0.00400600901352028,
      "grad_norm": 1.0771784782409668,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 1.307,
      "step": 4
    },
    {
      "epoch": 0.006009013520280421,
      "grad_norm": 1.569467306137085,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.3554,
      "step": 6
    },
    {
      "epoch": 0.00801201802704056,
      "grad_norm": 1.3319200277328491,
      "learning_rate": 2.666666666666667e-06,
      "loss": 1.293,
      "step": 8
    },
    {
      "epoch": 0.010015022533800702,
      "grad_norm": 1.0480939149856567,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.2224,
      "step": 10
    },
    {
      "epoch": 0.012018027040560842,
      "grad_norm": 1.1433207988739014,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.2897,
      "step": 12
    },
    {
      "epoch": 0.014021031547320982,
      "grad_norm": 1.073263168334961,
      "learning_rate": 4.666666666666667e-06,
      "loss": 1.2234,
      "step": 14
    },
    {
      "epoch": 0.01602403605408112,
      "grad_norm": 1.4199696779251099,
      "learning_rate": 5.333333333333334e-06,
      "loss": 1.2185,
      "step": 16
    },
    {
      "epoch": 0.018027040560841263,
      "grad_norm": 1.0688554048538208,
      "learning_rate": 6e-06,
      "loss": 1.1962,
      "step": 18
    },
    {
      "epoch": 0.020030045067601403,
      "grad_norm": 0.7635518312454224,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.2015,
      "step": 20
    },
    {
      "epoch": 0.022033049574361543,
      "grad_norm": 0.898193895816803,
      "learning_rate": 7.333333333333334e-06,
      "loss": 1.2538,
      "step": 22
    },
    {
      "epoch": 0.024036054081121683,
      "grad_norm": 0.5049487948417664,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.1567,
      "step": 24
    },
    {
      "epoch": 0.026039058587881823,
      "grad_norm": 0.5218433141708374,
      "learning_rate": 8.666666666666668e-06,
      "loss": 1.1553,
      "step": 26
    },
    {
      "epoch": 0.028042063094641963,
      "grad_norm": 0.6712301969528198,
      "learning_rate": 9.333333333333334e-06,
      "loss": 1.2026,
      "step": 28
    },
    {
      "epoch": 0.030045067601402103,
      "grad_norm": 0.5391427278518677,
      "learning_rate": 1e-05,
      "loss": 1.2703,
      "step": 30
    },
    {
      "epoch": 0.03204807210816224,
      "grad_norm": 0.462812602519989,
      "learning_rate": 1.0666666666666667e-05,
      "loss": 1.1924,
      "step": 32
    },
    {
      "epoch": 0.03405107661492238,
      "grad_norm": 0.6354833245277405,
      "learning_rate": 1.1333333333333334e-05,
      "loss": 1.1828,
      "step": 34
    },
    {
      "epoch": 0.03605408112168253,
      "grad_norm": 0.41658806800842285,
      "learning_rate": 1.2e-05,
      "loss": 1.0911,
      "step": 36
    },
    {
      "epoch": 0.03805708562844266,
      "grad_norm": 0.40210819244384766,
      "learning_rate": 1.2666666666666668e-05,
      "loss": 1.1088,
      "step": 38
    },
    {
      "epoch": 0.04006009013520281,
      "grad_norm": 0.4195331931114197,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.1107,
      "step": 40
    },
    {
      "epoch": 0.04206309464196294,
      "grad_norm": 0.4773981273174286,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 1.0895,
      "step": 42
    },
    {
      "epoch": 0.04406609914872309,
      "grad_norm": 0.44729089736938477,
      "learning_rate": 1.4666666666666668e-05,
      "loss": 1.1064,
      "step": 44
    },
    {
      "epoch": 0.04606910365548322,
      "grad_norm": 0.4262336492538452,
      "learning_rate": 1.5333333333333334e-05,
      "loss": 1.1339,
      "step": 46
    },
    {
      "epoch": 0.04807210816224337,
      "grad_norm": 0.48148858547210693,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.0981,
      "step": 48
    },
    {
      "epoch": 0.0500751126690035,
      "grad_norm": 0.39283275604248047,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.093,
      "step": 50
    },
    {
      "epoch": 0.0500751126690035,
      "eval_loss": 1.2894173860549927,
      "eval_runtime": 3.763,
      "eval_samples_per_second": 15.413,
      "eval_steps_per_second": 7.707,
      "step": 50
    },
    {
      "epoch": 0.05207811717576365,
      "grad_norm": 0.37658512592315674,
      "learning_rate": 1.7333333333333336e-05,
      "loss": 1.139,
      "step": 52
    },
    {
      "epoch": 0.05408112168252378,
      "grad_norm": 0.5392587184906006,
      "learning_rate": 1.8e-05,
      "loss": 1.125,
      "step": 54
    },
    {
      "epoch": 0.05608412618928393,
      "grad_norm": 0.4281522333621979,
      "learning_rate": 1.866666666666667e-05,
      "loss": 1.142,
      "step": 56
    },
    {
      "epoch": 0.05808713069604406,
      "grad_norm": 0.3900790512561798,
      "learning_rate": 1.9333333333333333e-05,
      "loss": 1.0687,
      "step": 58
    },
    {
      "epoch": 0.06009013520280421,
      "grad_norm": 0.43412598967552185,
      "learning_rate": 2e-05,
      "loss": 1.0266,
      "step": 60
    },
    {
      "epoch": 0.06209313970956434,
      "grad_norm": 0.35002750158309937,
      "learning_rate": 2.0666666666666666e-05,
      "loss": 1.0595,
      "step": 62
    },
    {
      "epoch": 0.06409614421632448,
      "grad_norm": 0.4777143597602844,
      "learning_rate": 2.1333333333333335e-05,
      "loss": 1.1136,
      "step": 64
    },
    {
      "epoch": 0.06609914872308463,
      "grad_norm": 0.49310263991355896,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 1.0185,
      "step": 66
    },
    {
      "epoch": 0.06810215322984477,
      "grad_norm": 0.449856698513031,
      "learning_rate": 2.2666666666666668e-05,
      "loss": 1.1279,
      "step": 68
    },
    {
      "epoch": 0.0701051577366049,
      "grad_norm": 0.38826239109039307,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 1.0885,
      "step": 70
    },
    {
      "epoch": 0.07210816224336505,
      "grad_norm": 0.4807354509830475,
      "learning_rate": 2.4e-05,
      "loss": 1.0903,
      "step": 72
    },
    {
      "epoch": 0.07411116675012519,
      "grad_norm": 0.4949500262737274,
      "learning_rate": 2.466666666666667e-05,
      "loss": 1.127,
      "step": 74
    },
    {
      "epoch": 0.07611417125688533,
      "grad_norm": 0.3626649081707001,
      "learning_rate": 2.5333333333333337e-05,
      "loss": 1.0255,
      "step": 76
    },
    {
      "epoch": 0.07811717576364546,
      "grad_norm": 0.5750380754470825,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 1.1275,
      "step": 78
    },
    {
      "epoch": 0.08012018027040561,
      "grad_norm": 0.39814862608909607,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 1.0341,
      "step": 80
    },
    {
      "epoch": 0.08212318477716575,
      "grad_norm": 0.4639066457748413,
      "learning_rate": 2.733333333333333e-05,
      "loss": 1.0229,
      "step": 82
    },
    {
      "epoch": 0.08412618928392589,
      "grad_norm": 0.4696304500102997,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 0.9657,
      "step": 84
    },
    {
      "epoch": 0.08612919379068602,
      "grad_norm": 0.4721640646457672,
      "learning_rate": 2.8666666666666668e-05,
      "loss": 1.0449,
      "step": 86
    },
    {
      "epoch": 0.08813219829744617,
      "grad_norm": 0.538497805595398,
      "learning_rate": 2.9333333333333336e-05,
      "loss": 1.0298,
      "step": 88
    },
    {
      "epoch": 0.09013520280420631,
      "grad_norm": 0.4559970498085022,
      "learning_rate": 3e-05,
      "loss": 1.1037,
      "step": 90
    },
    {
      "epoch": 0.09213820731096645,
      "grad_norm": 0.5490939617156982,
      "learning_rate": 3.066666666666667e-05,
      "loss": 1.0027,
      "step": 92
    },
    {
      "epoch": 0.09414121181772658,
      "grad_norm": 0.45646870136260986,
      "learning_rate": 3.1333333333333334e-05,
      "loss": 0.9897,
      "step": 94
    },
    {
      "epoch": 0.09614421632448673,
      "grad_norm": 0.43321868777275085,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.0761,
      "step": 96
    },
    {
      "epoch": 0.09814722083124687,
      "grad_norm": 0.5118622183799744,
      "learning_rate": 3.266666666666667e-05,
      "loss": 1.02,
      "step": 98
    },
    {
      "epoch": 0.100150225338007,
      "grad_norm": 0.496593177318573,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.0625,
      "step": 100
    },
    {
      "epoch": 0.100150225338007,
      "eval_loss": 1.240967035293579,
      "eval_runtime": 3.786,
      "eval_samples_per_second": 15.32,
      "eval_steps_per_second": 7.66,
      "step": 100
    },
    {
      "epoch": 0.10215322984476716,
      "grad_norm": 0.4841687083244324,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 1.0166,
      "step": 102
    },
    {
      "epoch": 0.1041562343515273,
      "grad_norm": 0.5562867522239685,
      "learning_rate": 3.466666666666667e-05,
      "loss": 1.0716,
      "step": 104
    },
    {
      "epoch": 0.10615923885828743,
      "grad_norm": 0.5093795657157898,
      "learning_rate": 3.5333333333333336e-05,
      "loss": 1.0912,
      "step": 106
    },
    {
      "epoch": 0.10816224336504757,
      "grad_norm": 0.4446066915988922,
      "learning_rate": 3.6e-05,
      "loss": 1.0152,
      "step": 108
    },
    {
      "epoch": 0.11016524787180772,
      "grad_norm": 0.518335223197937,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 1.0098,
      "step": 110
    },
    {
      "epoch": 0.11216825237856785,
      "grad_norm": 0.47020334005355835,
      "learning_rate": 3.733333333333334e-05,
      "loss": 1.0347,
      "step": 112
    },
    {
      "epoch": 0.11417125688532799,
      "grad_norm": 0.5809981226921082,
      "learning_rate": 3.8e-05,
      "loss": 1.0242,
      "step": 114
    },
    {
      "epoch": 0.11617426139208813,
      "grad_norm": 0.49666646122932434,
      "learning_rate": 3.866666666666667e-05,
      "loss": 1.053,
      "step": 116
    },
    {
      "epoch": 0.11817726589884828,
      "grad_norm": 0.47094520926475525,
      "learning_rate": 3.933333333333333e-05,
      "loss": 1.0258,
      "step": 118
    },
    {
      "epoch": 0.12018027040560841,
      "grad_norm": 0.5577300786972046,
      "learning_rate": 4e-05,
      "loss": 1.0197,
      "step": 120
    },
    {
      "epoch": 0.12218327491236855,
      "grad_norm": 0.5453508496284485,
      "learning_rate": 4.066666666666667e-05,
      "loss": 0.9842,
      "step": 122
    },
    {
      "epoch": 0.12418627941912869,
      "grad_norm": 0.5353218913078308,
      "learning_rate": 4.133333333333333e-05,
      "loss": 1.1579,
      "step": 124
    },
    {
      "epoch": 0.12618928392588882,
      "grad_norm": 0.617546021938324,
      "learning_rate": 4.2e-05,
      "loss": 1.0052,
      "step": 126
    },
    {
      "epoch": 0.12819228843264896,
      "grad_norm": 0.48849716782569885,
      "learning_rate": 4.266666666666667e-05,
      "loss": 1.0416,
      "step": 128
    },
    {
      "epoch": 0.13019529293940912,
      "grad_norm": 0.5549625754356384,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 1.0562,
      "step": 130
    },
    {
      "epoch": 0.13219829744616926,
      "grad_norm": 0.6010375618934631,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 1.046,
      "step": 132
    },
    {
      "epoch": 0.1342013019529294,
      "grad_norm": 0.481374591588974,
      "learning_rate": 4.466666666666667e-05,
      "loss": 1.0136,
      "step": 134
    },
    {
      "epoch": 0.13620430645968953,
      "grad_norm": 0.4886944591999054,
      "learning_rate": 4.5333333333333335e-05,
      "loss": 0.9658,
      "step": 136
    },
    {
      "epoch": 0.13820731096644967,
      "grad_norm": 0.6117609739303589,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.0545,
      "step": 138
    },
    {
      "epoch": 0.1402103154732098,
      "grad_norm": 0.5340180397033691,
      "learning_rate": 4.666666666666667e-05,
      "loss": 0.9826,
      "step": 140
    },
    {
      "epoch": 0.14221331997996994,
      "grad_norm": 0.5061513781547546,
      "learning_rate": 4.7333333333333336e-05,
      "loss": 0.9832,
      "step": 142
    },
    {
      "epoch": 0.1442163244867301,
      "grad_norm": 0.5090388059616089,
      "learning_rate": 4.8e-05,
      "loss": 0.9733,
      "step": 144
    },
    {
      "epoch": 0.14621932899349024,
      "grad_norm": 0.5136658549308777,
      "learning_rate": 4.866666666666667e-05,
      "loss": 0.9673,
      "step": 146
    },
    {
      "epoch": 0.14822233350025038,
      "grad_norm": 0.5653979778289795,
      "learning_rate": 4.933333333333334e-05,
      "loss": 0.9908,
      "step": 148
    },
    {
      "epoch": 0.15022533800701052,
      "grad_norm": 0.5377776026725769,
      "learning_rate": 5e-05,
      "loss": 0.9428,
      "step": 150
    },
    {
      "epoch": 0.15022533800701052,
      "eval_loss": 1.2466219663619995,
      "eval_runtime": 3.7908,
      "eval_samples_per_second": 15.3,
      "eval_steps_per_second": 7.65,
      "step": 150
    },
    {
      "epoch": 0.15222834251377065,
      "grad_norm": 0.5484976768493652,
      "learning_rate": 4.9999728022003156e-05,
      "loss": 0.9523,
      "step": 152
    },
    {
      "epoch": 0.1542313470205308,
      "grad_norm": 0.4431094229221344,
      "learning_rate": 4.999891209393037e-05,
      "loss": 0.9937,
      "step": 154
    },
    {
      "epoch": 0.15623435152729093,
      "grad_norm": 0.6066553592681885,
      "learning_rate": 4.999755223353482e-05,
      "loss": 1.0431,
      "step": 156
    },
    {
      "epoch": 0.15823735603405106,
      "grad_norm": 0.6024964451789856,
      "learning_rate": 4.9995648470404664e-05,
      "loss": 0.9671,
      "step": 158
    },
    {
      "epoch": 0.16024036054081123,
      "grad_norm": 0.486589640378952,
      "learning_rate": 4.9993200845962434e-05,
      "loss": 0.949,
      "step": 160
    },
    {
      "epoch": 0.16224336504757136,
      "grad_norm": 0.505987823009491,
      "learning_rate": 4.9990209413464136e-05,
      "loss": 1.0444,
      "step": 162
    },
    {
      "epoch": 0.1642463695543315,
      "grad_norm": 0.46255800127983093,
      "learning_rate": 4.998667423799807e-05,
      "loss": 0.971,
      "step": 164
    },
    {
      "epoch": 0.16624937406109164,
      "grad_norm": 0.5792336463928223,
      "learning_rate": 4.9982595396483435e-05,
      "loss": 0.9869,
      "step": 166
    },
    {
      "epoch": 0.16825237856785177,
      "grad_norm": 0.48192256689071655,
      "learning_rate": 4.997797297766864e-05,
      "loss": 1.0234,
      "step": 168
    },
    {
      "epoch": 0.1702553830746119,
      "grad_norm": 0.5370559692382812,
      "learning_rate": 4.997280708212939e-05,
      "loss": 0.9721,
      "step": 170
    },
    {
      "epoch": 0.17225838758137205,
      "grad_norm": 0.4554755985736847,
      "learning_rate": 4.996709782226646e-05,
      "loss": 0.9292,
      "step": 172
    },
    {
      "epoch": 0.1742613920881322,
      "grad_norm": 0.4883841872215271,
      "learning_rate": 4.9960845322303315e-05,
      "loss": 0.9815,
      "step": 174
    },
    {
      "epoch": 0.17626439659489235,
      "grad_norm": 0.5221249461174011,
      "learning_rate": 4.995404971828333e-05,
      "loss": 1.0216,
      "step": 176
    },
    {
      "epoch": 0.17826740110165248,
      "grad_norm": 0.5130178332328796,
      "learning_rate": 4.994671115806691e-05,
      "loss": 0.9855,
      "step": 178
    },
    {
      "epoch": 0.18027040560841262,
      "grad_norm": 0.46121644973754883,
      "learning_rate": 4.993882980132819e-05,
      "loss": 0.9196,
      "step": 180
    },
    {
      "epoch": 0.18227341011517276,
      "grad_norm": 0.49680426716804504,
      "learning_rate": 4.9930405819551627e-05,
      "loss": 1.0151,
      "step": 182
    },
    {
      "epoch": 0.1842764146219329,
      "grad_norm": 0.48591047525405884,
      "learning_rate": 4.992143939602823e-05,
      "loss": 0.9901,
      "step": 184
    },
    {
      "epoch": 0.18627941912869303,
      "grad_norm": 0.47092878818511963,
      "learning_rate": 4.9911930725851583e-05,
      "loss": 0.9632,
      "step": 186
    },
    {
      "epoch": 0.18828242363545317,
      "grad_norm": 0.40838295221328735,
      "learning_rate": 4.990188001591363e-05,
      "loss": 0.9469,
      "step": 188
    },
    {
      "epoch": 0.19028542814221333,
      "grad_norm": 0.4728156626224518,
      "learning_rate": 4.9891287484900124e-05,
      "loss": 0.9667,
      "step": 190
    },
    {
      "epoch": 0.19228843264897347,
      "grad_norm": 0.534322202205658,
      "learning_rate": 4.988015336328589e-05,
      "loss": 0.9982,
      "step": 192
    },
    {
      "epoch": 0.1942914371557336,
      "grad_norm": 0.43927860260009766,
      "learning_rate": 4.986847789332981e-05,
      "loss": 0.9898,
      "step": 194
    },
    {
      "epoch": 0.19629444166249374,
      "grad_norm": 0.40531125664711,
      "learning_rate": 4.985626132906957e-05,
      "loss": 0.9442,
      "step": 196
    },
    {
      "epoch": 0.19829744616925388,
      "grad_norm": 0.5949648022651672,
      "learning_rate": 4.9843503936316095e-05,
      "loss": 1.0381,
      "step": 198
    },
    {
      "epoch": 0.200300450676014,
      "grad_norm": 0.43230050802230835,
      "learning_rate": 4.983020599264781e-05,
      "loss": 1.0166,
      "step": 200
    },
    {
      "epoch": 0.200300450676014,
      "eval_loss": 1.2184284925460815,
      "eval_runtime": 3.8028,
      "eval_samples_per_second": 15.252,
      "eval_steps_per_second": 7.626,
      "step": 200
    },
    {
      "epoch": 0.20230345518277415,
      "grad_norm": 0.4429769814014435,
      "learning_rate": 4.9816367787404534e-05,
      "loss": 0.9594,
      "step": 202
    },
    {
      "epoch": 0.20430645968953431,
      "grad_norm": 0.5523216724395752,
      "learning_rate": 4.980198962168128e-05,
      "loss": 1.0446,
      "step": 204
    },
    {
      "epoch": 0.20630946419629445,
      "grad_norm": 0.4551699459552765,
      "learning_rate": 4.978707180832161e-05,
      "loss": 0.9913,
      "step": 206
    },
    {
      "epoch": 0.2083124687030546,
      "grad_norm": 0.41649895906448364,
      "learning_rate": 4.977161467191089e-05,
      "loss": 0.9163,
      "step": 208
    },
    {
      "epoch": 0.21031547320981472,
      "grad_norm": 0.4184020459651947,
      "learning_rate": 4.97556185487692e-05,
      "loss": 0.9463,
      "step": 210
    },
    {
      "epoch": 0.21231847771657486,
      "grad_norm": 0.6365268230438232,
      "learning_rate": 4.9739083786944016e-05,
      "loss": 0.9992,
      "step": 212
    },
    {
      "epoch": 0.214321482223335,
      "grad_norm": 0.5223124027252197,
      "learning_rate": 4.9722010746202664e-05,
      "loss": 0.923,
      "step": 214
    },
    {
      "epoch": 0.21632448673009513,
      "grad_norm": 0.42879560589790344,
      "learning_rate": 4.970439979802445e-05,
      "loss": 0.9788,
      "step": 216
    },
    {
      "epoch": 0.21832749123685527,
      "grad_norm": 0.4171353578567505,
      "learning_rate": 4.96862513255926e-05,
      "loss": 1.0101,
      "step": 218
    },
    {
      "epoch": 0.22033049574361543,
      "grad_norm": 0.42286214232444763,
      "learning_rate": 4.966756572378593e-05,
      "loss": 0.981,
      "step": 220
    },
    {
      "epoch": 0.22233350025037557,
      "grad_norm": 0.6001223921775818,
      "learning_rate": 4.964834339917025e-05,
      "loss": 1.0276,
      "step": 222
    },
    {
      "epoch": 0.2243365047571357,
      "grad_norm": 0.6153950095176697,
      "learning_rate": 4.9628584769989504e-05,
      "loss": 1.0437,
      "step": 224
    },
    {
      "epoch": 0.22633950926389584,
      "grad_norm": 0.419117271900177,
      "learning_rate": 4.9608290266156695e-05,
      "loss": 1.0168,
      "step": 226
    },
    {
      "epoch": 0.22834251377065598,
      "grad_norm": 0.40286022424697876,
      "learning_rate": 4.958746032924448e-05,
      "loss": 0.988,
      "step": 228
    },
    {
      "epoch": 0.23034551827741612,
      "grad_norm": 0.5287054181098938,
      "learning_rate": 4.9566095412475636e-05,
      "loss": 1.019,
      "step": 230
    },
    {
      "epoch": 0.23234852278417625,
      "grad_norm": 0.43865758180618286,
      "learning_rate": 4.9544195980713136e-05,
      "loss": 0.9563,
      "step": 232
    },
    {
      "epoch": 0.23435152729093642,
      "grad_norm": 0.5529116988182068,
      "learning_rate": 4.952176251045008e-05,
      "loss": 0.9288,
      "step": 234
    },
    {
      "epoch": 0.23635453179769655,
      "grad_norm": 0.5552803874015808,
      "learning_rate": 4.9498795489799276e-05,
      "loss": 0.8924,
      "step": 236
    },
    {
      "epoch": 0.2383575363044567,
      "grad_norm": 0.722111165523529,
      "learning_rate": 4.947529541848268e-05,
      "loss": 0.9598,
      "step": 238
    },
    {
      "epoch": 0.24036054081121683,
      "grad_norm": 0.4804269075393677,
      "learning_rate": 4.9451262807820466e-05,
      "loss": 0.9757,
      "step": 240
    },
    {
      "epoch": 0.24236354531797696,
      "grad_norm": 0.5181965231895447,
      "learning_rate": 4.942669818071994e-05,
      "loss": 1.0138,
      "step": 242
    },
    {
      "epoch": 0.2443665498247371,
      "grad_norm": 0.43212518095970154,
      "learning_rate": 4.9401602071664155e-05,
      "loss": 0.9027,
      "step": 244
    },
    {
      "epoch": 0.24636955433149724,
      "grad_norm": 0.5169520974159241,
      "learning_rate": 4.937597502670027e-05,
      "loss": 0.9668,
      "step": 246
    },
    {
      "epoch": 0.24837255883825737,
      "grad_norm": 0.4116087555885315,
      "learning_rate": 4.934981760342766e-05,
      "loss": 0.9634,
      "step": 248
    },
    {
      "epoch": 0.25037556334501754,
      "grad_norm": 0.5354374647140503,
      "learning_rate": 4.932313037098582e-05,
      "loss": 0.9993,
      "step": 250
    },
    {
      "epoch": 0.25037556334501754,
      "eval_loss": 1.207343339920044,
      "eval_runtime": 3.8101,
      "eval_samples_per_second": 15.223,
      "eval_steps_per_second": 7.611,
      "step": 250
    },
    {
      "epoch": 0.25237856785177765,
      "grad_norm": 0.5648212432861328,
      "learning_rate": 4.929591391004196e-05,
      "loss": 1.0219,
      "step": 252
    },
    {
      "epoch": 0.2543815723585378,
      "grad_norm": 0.6550512909889221,
      "learning_rate": 4.926816881277834e-05,
      "loss": 0.9505,
      "step": 254
    },
    {
      "epoch": 0.2563845768652979,
      "grad_norm": 0.4034920334815979,
      "learning_rate": 4.923989568287946e-05,
      "loss": 0.929,
      "step": 256
    },
    {
      "epoch": 0.2583875813720581,
      "grad_norm": 0.475777804851532,
      "learning_rate": 4.921109513551885e-05,
      "loss": 0.9811,
      "step": 258
    },
    {
      "epoch": 0.26039058587881825,
      "grad_norm": 0.47418224811553955,
      "learning_rate": 4.9181767797345724e-05,
      "loss": 1.0354,
      "step": 260
    },
    {
      "epoch": 0.26239359038557836,
      "grad_norm": 0.5102671384811401,
      "learning_rate": 4.9151914306471345e-05,
      "loss": 1.0212,
      "step": 262
    },
    {
      "epoch": 0.2643965948923385,
      "grad_norm": 0.4163782298564911,
      "learning_rate": 4.912153531245511e-05,
      "loss": 0.9191,
      "step": 264
    },
    {
      "epoch": 0.26639959939909863,
      "grad_norm": 0.5019692182540894,
      "learning_rate": 4.909063147629046e-05,
      "loss": 0.9337,
      "step": 266
    },
    {
      "epoch": 0.2684026039058588,
      "grad_norm": 0.5193113088607788,
      "learning_rate": 4.905920347039048e-05,
      "loss": 0.9746,
      "step": 268
    },
    {
      "epoch": 0.2704056084126189,
      "grad_norm": 0.4991247355937958,
      "learning_rate": 4.9027251978573244e-05,
      "loss": 0.9568,
      "step": 270
    },
    {
      "epoch": 0.27240861291937907,
      "grad_norm": 0.3833785951137543,
      "learning_rate": 4.8994777696046984e-05,
      "loss": 0.9621,
      "step": 272
    },
    {
      "epoch": 0.27441161742613923,
      "grad_norm": 0.5187920331954956,
      "learning_rate": 4.8961781329394915e-05,
      "loss": 0.9393,
      "step": 274
    },
    {
      "epoch": 0.27641462193289934,
      "grad_norm": 0.6128193736076355,
      "learning_rate": 4.89282635965599e-05,
      "loss": 0.9734,
      "step": 276
    },
    {
      "epoch": 0.2784176264396595,
      "grad_norm": 0.47504886984825134,
      "learning_rate": 4.8894225226828795e-05,
      "loss": 0.9592,
      "step": 278
    },
    {
      "epoch": 0.2804206309464196,
      "grad_norm": 0.44938042759895325,
      "learning_rate": 4.885966696081663e-05,
      "loss": 0.9999,
      "step": 280
    },
    {
      "epoch": 0.2824236354531798,
      "grad_norm": 0.48498111963272095,
      "learning_rate": 4.8824589550450415e-05,
      "loss": 0.9597,
      "step": 282
    },
    {
      "epoch": 0.2844266399599399,
      "grad_norm": 0.582253098487854,
      "learning_rate": 4.8788993758952875e-05,
      "loss": 0.9322,
      "step": 284
    },
    {
      "epoch": 0.28642964446670005,
      "grad_norm": 0.5211949944496155,
      "learning_rate": 4.875288036082577e-05,
      "loss": 0.9913,
      "step": 286
    },
    {
      "epoch": 0.2884326489734602,
      "grad_norm": 0.5122332572937012,
      "learning_rate": 4.8716250141833075e-05,
      "loss": 0.92,
      "step": 288
    },
    {
      "epoch": 0.2904356534802203,
      "grad_norm": 0.509671151638031,
      "learning_rate": 4.867910389898387e-05,
      "loss": 0.9686,
      "step": 290
    },
    {
      "epoch": 0.2924386579869805,
      "grad_norm": 0.42992913722991943,
      "learning_rate": 4.864144244051503e-05,
      "loss": 0.8937,
      "step": 292
    },
    {
      "epoch": 0.2944416624937406,
      "grad_norm": 0.558230996131897,
      "learning_rate": 4.860326658587358e-05,
      "loss": 1.005,
      "step": 294
    },
    {
      "epoch": 0.29644466700050076,
      "grad_norm": 0.3904726505279541,
      "learning_rate": 4.856457716569891e-05,
      "loss": 0.9927,
      "step": 296
    },
    {
      "epoch": 0.29844767150726087,
      "grad_norm": 0.377273827791214,
      "learning_rate": 4.852537502180473e-05,
      "loss": 0.9042,
      "step": 298
    },
    {
      "epoch": 0.30045067601402103,
      "grad_norm": 0.4523603320121765,
      "learning_rate": 4.848566100716066e-05,
      "loss": 0.978,
      "step": 300
    },
    {
      "epoch": 0.30045067601402103,
      "eval_loss": 1.191455602645874,
      "eval_runtime": 3.8019,
      "eval_samples_per_second": 15.256,
      "eval_steps_per_second": 7.628,
      "step": 300
    },
    {
      "epoch": 0.3024536805207812,
      "grad_norm": 0.39940956234931946,
      "learning_rate": 4.8445435985873775e-05,
      "loss": 1.0145,
      "step": 302
    },
    {
      "epoch": 0.3044566850275413,
      "grad_norm": 0.42715466022491455,
      "learning_rate": 4.84047008331697e-05,
      "loss": 0.9933,
      "step": 304
    },
    {
      "epoch": 0.30645968953430147,
      "grad_norm": 0.5550795793533325,
      "learning_rate": 4.8363456435373686e-05,
      "loss": 0.8994,
      "step": 306
    },
    {
      "epoch": 0.3084626940410616,
      "grad_norm": 0.50642329454422,
      "learning_rate": 4.832170368989121e-05,
      "loss": 0.9708,
      "step": 308
    },
    {
      "epoch": 0.31046569854782174,
      "grad_norm": 0.4395250976085663,
      "learning_rate": 4.827944350518852e-05,
      "loss": 1.055,
      "step": 310
    },
    {
      "epoch": 0.31246870305458185,
      "grad_norm": 0.40183037519454956,
      "learning_rate": 4.8236676800772845e-05,
      "loss": 0.9564,
      "step": 312
    },
    {
      "epoch": 0.314471707561342,
      "grad_norm": 0.4325483441352844,
      "learning_rate": 4.8193404507172405e-05,
      "loss": 0.9437,
      "step": 314
    },
    {
      "epoch": 0.3164747120681021,
      "grad_norm": 0.5079526305198669,
      "learning_rate": 4.814962756591612e-05,
      "loss": 0.9426,
      "step": 316
    },
    {
      "epoch": 0.3184777165748623,
      "grad_norm": 0.6221234202384949,
      "learning_rate": 4.8105346929513195e-05,
      "loss": 0.9674,
      "step": 318
    },
    {
      "epoch": 0.32048072108162245,
      "grad_norm": 0.5088761448860168,
      "learning_rate": 4.8060563561432313e-05,
      "loss": 0.953,
      "step": 320
    },
    {
      "epoch": 0.32248372558838256,
      "grad_norm": 0.4460401237010956,
      "learning_rate": 4.801527843608075e-05,
      "loss": 0.935,
      "step": 322
    },
    {
      "epoch": 0.32448673009514273,
      "grad_norm": 0.39005428552627563,
      "learning_rate": 4.796949253878311e-05,
      "loss": 0.9204,
      "step": 324
    },
    {
      "epoch": 0.32648973460190284,
      "grad_norm": 0.4077945351600647,
      "learning_rate": 4.792320686575993e-05,
      "loss": 1.0509,
      "step": 326
    },
    {
      "epoch": 0.328492739108663,
      "grad_norm": 0.4249040186405182,
      "learning_rate": 4.787642242410597e-05,
      "loss": 0.9549,
      "step": 328
    },
    {
      "epoch": 0.3304957436154231,
      "grad_norm": 0.4203990697860718,
      "learning_rate": 4.7829140231768335e-05,
      "loss": 0.9996,
      "step": 330
    },
    {
      "epoch": 0.3324987481221833,
      "grad_norm": 0.4657137095928192,
      "learning_rate": 4.778136131752431e-05,
      "loss": 1.0336,
      "step": 332
    },
    {
      "epoch": 0.33450175262894344,
      "grad_norm": 0.4463610053062439,
      "learning_rate": 4.773308672095895e-05,
      "loss": 0.936,
      "step": 334
    },
    {
      "epoch": 0.33650475713570355,
      "grad_norm": 0.46322551369667053,
      "learning_rate": 4.768431749244251e-05,
      "loss": 0.8727,
      "step": 336
    },
    {
      "epoch": 0.3385077616424637,
      "grad_norm": 0.4579392671585083,
      "learning_rate": 4.7635054693107553e-05,
      "loss": 0.9551,
      "step": 338
    },
    {
      "epoch": 0.3405107661492238,
      "grad_norm": 0.40763622522354126,
      "learning_rate": 4.758529939482588e-05,
      "loss": 0.8965,
      "step": 340
    },
    {
      "epoch": 0.342513770655984,
      "grad_norm": 0.5640069246292114,
      "learning_rate": 4.75350526801852e-05,
      "loss": 1.019,
      "step": 342
    },
    {
      "epoch": 0.3445167751627441,
      "grad_norm": 0.378750741481781,
      "learning_rate": 4.748431564246557e-05,
      "loss": 0.974,
      "step": 344
    },
    {
      "epoch": 0.34651977966950426,
      "grad_norm": 0.5434790849685669,
      "learning_rate": 4.7433089385615634e-05,
      "loss": 0.9863,
      "step": 346
    },
    {
      "epoch": 0.3485227841762644,
      "grad_norm": 0.5737304091453552,
      "learning_rate": 4.7381375024228556e-05,
      "loss": 0.9044,
      "step": 348
    },
    {
      "epoch": 0.35052578868302453,
      "grad_norm": 0.5187863707542419,
      "learning_rate": 4.7329173683517825e-05,
      "loss": 0.8692,
      "step": 350
    },
    {
      "epoch": 0.35052578868302453,
      "eval_loss": 1.1893218755722046,
      "eval_runtime": 3.7963,
      "eval_samples_per_second": 15.278,
      "eval_steps_per_second": 7.639,
      "step": 350
    },
    {
      "epoch": 0.3525287931897847,
      "grad_norm": 0.417603462934494,
      "learning_rate": 4.727648649929271e-05,
      "loss": 0.9013,
      "step": 352
    },
    {
      "epoch": 0.3545317976965448,
      "grad_norm": 0.5028386116027832,
      "learning_rate": 4.7223314617933605e-05,
      "loss": 0.9508,
      "step": 354
    },
    {
      "epoch": 0.35653480220330497,
      "grad_norm": 0.3822748064994812,
      "learning_rate": 4.7169659196367056e-05,
      "loss": 0.9452,
      "step": 356
    },
    {
      "epoch": 0.3585378067100651,
      "grad_norm": 0.44049903750419617,
      "learning_rate": 4.711552140204059e-05,
      "loss": 0.9455,
      "step": 358
    },
    {
      "epoch": 0.36054081121682524,
      "grad_norm": 0.45998480916023254,
      "learning_rate": 4.7060902412897304e-05,
      "loss": 0.9731,
      "step": 360
    },
    {
      "epoch": 0.3625438157235854,
      "grad_norm": 0.5747750401496887,
      "learning_rate": 4.700580341735026e-05,
      "loss": 0.9197,
      "step": 362
    },
    {
      "epoch": 0.3645468202303455,
      "grad_norm": 0.39996007084846497,
      "learning_rate": 4.695022561425663e-05,
      "loss": 0.9464,
      "step": 364
    },
    {
      "epoch": 0.3665498247371057,
      "grad_norm": 0.4300011396408081,
      "learning_rate": 4.689417021289157e-05,
      "loss": 0.8947,
      "step": 366
    },
    {
      "epoch": 0.3685528292438658,
      "grad_norm": 0.38185784220695496,
      "learning_rate": 4.6837638432921925e-05,
      "loss": 0.9521,
      "step": 368
    },
    {
      "epoch": 0.37055583375062595,
      "grad_norm": 0.48808950185775757,
      "learning_rate": 4.6780631504379736e-05,
      "loss": 0.9326,
      "step": 370
    },
    {
      "epoch": 0.37255883825738606,
      "grad_norm": 0.40927746891975403,
      "learning_rate": 4.672315066763542e-05,
      "loss": 0.9949,
      "step": 372
    },
    {
      "epoch": 0.3745618427641462,
      "grad_norm": 0.473628968000412,
      "learning_rate": 4.666519717337079e-05,
      "loss": 0.9808,
      "step": 374
    },
    {
      "epoch": 0.37656484727090633,
      "grad_norm": 0.45377451181411743,
      "learning_rate": 4.6606772282551894e-05,
      "loss": 0.9978,
      "step": 376
    },
    {
      "epoch": 0.3785678517776665,
      "grad_norm": 0.5329418182373047,
      "learning_rate": 4.65478772664015e-05,
      "loss": 0.9531,
      "step": 378
    },
    {
      "epoch": 0.38057085628442666,
      "grad_norm": 0.4209918677806854,
      "learning_rate": 4.648851340637147e-05,
      "loss": 0.914,
      "step": 380
    },
    {
      "epoch": 0.38257386079118677,
      "grad_norm": 0.40193280577659607,
      "learning_rate": 4.642868199411493e-05,
      "loss": 0.8853,
      "step": 382
    },
    {
      "epoch": 0.38457686529794693,
      "grad_norm": 0.39280131459236145,
      "learning_rate": 4.6368384331458085e-05,
      "loss": 0.8992,
      "step": 384
    },
    {
      "epoch": 0.38657986980470704,
      "grad_norm": 0.44302472472190857,
      "learning_rate": 4.6307621730371934e-05,
      "loss": 0.9454,
      "step": 386
    },
    {
      "epoch": 0.3885828743114672,
      "grad_norm": 0.4578077793121338,
      "learning_rate": 4.6246395512943716e-05,
      "loss": 0.957,
      "step": 388
    },
    {
      "epoch": 0.3905858788182273,
      "grad_norm": 0.4635055959224701,
      "learning_rate": 4.618470701134815e-05,
      "loss": 0.9978,
      "step": 390
    },
    {
      "epoch": 0.3925888833249875,
      "grad_norm": 0.49186405539512634,
      "learning_rate": 4.612255756781845e-05,
      "loss": 0.9792,
      "step": 392
    },
    {
      "epoch": 0.39459188783174765,
      "grad_norm": 0.42530110478401184,
      "learning_rate": 4.605994853461709e-05,
      "loss": 1.0054,
      "step": 394
    },
    {
      "epoch": 0.39659489233850775,
      "grad_norm": 0.4250572919845581,
      "learning_rate": 4.5996881274006446e-05,
      "loss": 0.8744,
      "step": 396
    },
    {
      "epoch": 0.3985978968452679,
      "grad_norm": 0.4212440550327301,
      "learning_rate": 4.593335715821909e-05,
      "loss": 0.9451,
      "step": 398
    },
    {
      "epoch": 0.400600901352028,
      "grad_norm": 0.35784921050071716,
      "learning_rate": 4.586937756942796e-05,
      "loss": 0.9179,
      "step": 400
    },
    {
      "epoch": 0.400600901352028,
      "eval_loss": 1.1884177923202515,
      "eval_runtime": 3.8058,
      "eval_samples_per_second": 15.24,
      "eval_steps_per_second": 7.62,
      "step": 400
    },
    {
      "epoch": 0.4026039058587882,
      "grad_norm": 0.4087256193161011,
      "learning_rate": 4.580494389971628e-05,
      "loss": 0.8817,
      "step": 402
    },
    {
      "epoch": 0.4046069103655483,
      "grad_norm": 0.40662136673927307,
      "learning_rate": 4.5740057551047294e-05,
      "loss": 0.9219,
      "step": 404
    },
    {
      "epoch": 0.40660991487230846,
      "grad_norm": 0.4162129759788513,
      "learning_rate": 4.5674719935233726e-05,
      "loss": 0.8831,
      "step": 406
    },
    {
      "epoch": 0.40861291937906863,
      "grad_norm": 0.40978914499282837,
      "learning_rate": 4.56089324739071e-05,
      "loss": 0.9601,
      "step": 408
    },
    {
      "epoch": 0.41061592388582874,
      "grad_norm": 0.42754805088043213,
      "learning_rate": 4.554269659848675e-05,
      "loss": 0.9463,
      "step": 410
    },
    {
      "epoch": 0.4126189283925889,
      "grad_norm": 0.48228365182876587,
      "learning_rate": 4.547601375014875e-05,
      "loss": 0.9418,
      "step": 412
    },
    {
      "epoch": 0.414621932899349,
      "grad_norm": 0.4946666657924652,
      "learning_rate": 4.5408885379794494e-05,
      "loss": 0.9011,
      "step": 414
    },
    {
      "epoch": 0.4166249374061092,
      "grad_norm": 0.4881949722766876,
      "learning_rate": 4.5341312948019155e-05,
      "loss": 0.9794,
      "step": 416
    },
    {
      "epoch": 0.4186279419128693,
      "grad_norm": 0.39862060546875,
      "learning_rate": 4.527329792507991e-05,
      "loss": 0.9116,
      "step": 418
    },
    {
      "epoch": 0.42063094641962945,
      "grad_norm": 0.3882657587528229,
      "learning_rate": 4.520484179086394e-05,
      "loss": 0.9337,
      "step": 420
    },
    {
      "epoch": 0.42263395092638956,
      "grad_norm": 0.3756396770477295,
      "learning_rate": 4.51359460348562e-05,
      "loss": 0.9272,
      "step": 422
    },
    {
      "epoch": 0.4246369554331497,
      "grad_norm": 0.451297402381897,
      "learning_rate": 4.50666121561071e-05,
      "loss": 0.9306,
      "step": 424
    },
    {
      "epoch": 0.4266399599399099,
      "grad_norm": 0.41500887274742126,
      "learning_rate": 4.499684166319978e-05,
      "loss": 0.9472,
      "step": 426
    },
    {
      "epoch": 0.42864296444667,
      "grad_norm": 0.4838218688964844,
      "learning_rate": 4.492663607421736e-05,
      "loss": 0.8738,
      "step": 428
    },
    {
      "epoch": 0.43064596895343016,
      "grad_norm": 0.3867829442024231,
      "learning_rate": 4.4855996916709865e-05,
      "loss": 1.0112,
      "step": 430
    },
    {
      "epoch": 0.43264897346019027,
      "grad_norm": 0.40715524554252625,
      "learning_rate": 4.478492572766102e-05,
      "loss": 0.9571,
      "step": 432
    },
    {
      "epoch": 0.43465197796695043,
      "grad_norm": 0.5042704343795776,
      "learning_rate": 4.47134240534548e-05,
      "loss": 0.9304,
      "step": 434
    },
    {
      "epoch": 0.43665498247371054,
      "grad_norm": 0.4030342400074005,
      "learning_rate": 4.464149344984178e-05,
      "loss": 0.9479,
      "step": 436
    },
    {
      "epoch": 0.4386579869804707,
      "grad_norm": 0.3429213762283325,
      "learning_rate": 4.456913548190527e-05,
      "loss": 0.9511,
      "step": 438
    },
    {
      "epoch": 0.44066099148723087,
      "grad_norm": 0.4278419315814972,
      "learning_rate": 4.44963517240273e-05,
      "loss": 1.1125,
      "step": 440
    },
    {
      "epoch": 0.442663995993991,
      "grad_norm": 0.4170474708080292,
      "learning_rate": 4.44231437598543e-05,
      "loss": 0.9498,
      "step": 442
    },
    {
      "epoch": 0.44466700050075114,
      "grad_norm": 0.39053234457969666,
      "learning_rate": 4.4349513182262715e-05,
      "loss": 0.9796,
      "step": 444
    },
    {
      "epoch": 0.44667000500751125,
      "grad_norm": 0.5083168148994446,
      "learning_rate": 4.4275461593324306e-05,
      "loss": 0.9236,
      "step": 446
    },
    {
      "epoch": 0.4486730095142714,
      "grad_norm": 0.3927271068096161,
      "learning_rate": 4.420099060427131e-05,
      "loss": 1.011,
      "step": 448
    },
    {
      "epoch": 0.4506760140210315,
      "grad_norm": 0.4185622036457062,
      "learning_rate": 4.4126101835461346e-05,
      "loss": 0.9671,
      "step": 450
    },
    {
      "epoch": 0.4506760140210315,
      "eval_loss": 1.1852179765701294,
      "eval_runtime": 3.8121,
      "eval_samples_per_second": 15.215,
      "eval_steps_per_second": 7.607,
      "step": 450
    },
    {
      "epoch": 0.4526790185277917,
      "grad_norm": 0.5305806398391724,
      "learning_rate": 4.405079691634221e-05,
      "loss": 0.9388,
      "step": 452
    },
    {
      "epoch": 0.45468202303455185,
      "grad_norm": 0.4585268497467041,
      "learning_rate": 4.3975077485416377e-05,
      "loss": 0.8841,
      "step": 454
    },
    {
      "epoch": 0.45668502754131196,
      "grad_norm": 0.39412179589271545,
      "learning_rate": 4.3898945190205386e-05,
      "loss": 0.9371,
      "step": 456
    },
    {
      "epoch": 0.4586880320480721,
      "grad_norm": 0.5423275828361511,
      "learning_rate": 4.382240168721396e-05,
      "loss": 0.9923,
      "step": 458
    },
    {
      "epoch": 0.46069103655483223,
      "grad_norm": 0.3563918471336365,
      "learning_rate": 4.3745448641894e-05,
      "loss": 0.9546,
      "step": 460
    },
    {
      "epoch": 0.4626940410615924,
      "grad_norm": 0.7710307836532593,
      "learning_rate": 4.3668087728608316e-05,
      "loss": 0.9195,
      "step": 462
    },
    {
      "epoch": 0.4646970455683525,
      "grad_norm": 0.4273247718811035,
      "learning_rate": 4.359032063059419e-05,
      "loss": 0.9674,
      "step": 464
    },
    {
      "epoch": 0.46670005007511267,
      "grad_norm": 0.41480231285095215,
      "learning_rate": 4.3512149039926796e-05,
      "loss": 0.8851,
      "step": 466
    },
    {
      "epoch": 0.46870305458187284,
      "grad_norm": 0.559946596622467,
      "learning_rate": 4.343357465748235e-05,
      "loss": 0.8949,
      "step": 468
    },
    {
      "epoch": 0.47070605908863294,
      "grad_norm": 0.5360729098320007,
      "learning_rate": 4.33545991929011e-05,
      "loss": 0.9014,
      "step": 470
    },
    {
      "epoch": 0.4727090635953931,
      "grad_norm": 0.5606299042701721,
      "learning_rate": 4.327522436455013e-05,
      "loss": 0.9091,
      "step": 472
    },
    {
      "epoch": 0.4747120681021532,
      "grad_norm": 0.49291422963142395,
      "learning_rate": 4.3195451899485994e-05,
      "loss": 0.9076,
      "step": 474
    },
    {
      "epoch": 0.4767150726089134,
      "grad_norm": 0.3711169958114624,
      "learning_rate": 4.3115283533417105e-05,
      "loss": 0.9644,
      "step": 476
    },
    {
      "epoch": 0.4787180771156735,
      "grad_norm": 0.4362380802631378,
      "learning_rate": 4.3034721010666e-05,
      "loss": 0.9263,
      "step": 478
    },
    {
      "epoch": 0.48072108162243365,
      "grad_norm": 0.5104102492332458,
      "learning_rate": 4.295376608413137e-05,
      "loss": 0.96,
      "step": 480
    },
    {
      "epoch": 0.48272408612919376,
      "grad_norm": 0.4157417416572571,
      "learning_rate": 4.287242051524989e-05,
      "loss": 0.9594,
      "step": 482
    },
    {
      "epoch": 0.4847270906359539,
      "grad_norm": 0.36849111318588257,
      "learning_rate": 4.2790686073957976e-05,
      "loss": 0.8976,
      "step": 484
    },
    {
      "epoch": 0.4867300951427141,
      "grad_norm": 0.6290056109428406,
      "learning_rate": 4.270856453865318e-05,
      "loss": 0.9248,
      "step": 486
    },
    {
      "epoch": 0.4887330996494742,
      "grad_norm": 0.4833918511867523,
      "learning_rate": 4.262605769615557e-05,
      "loss": 1.0118,
      "step": 488
    },
    {
      "epoch": 0.49073610415623437,
      "grad_norm": 0.6724058985710144,
      "learning_rate": 4.25431673416688e-05,
      "loss": 0.8823,
      "step": 490
    },
    {
      "epoch": 0.4927391086629945,
      "grad_norm": 0.45951318740844727,
      "learning_rate": 4.245989527874107e-05,
      "loss": 0.9822,
      "step": 492
    },
    {
      "epoch": 0.49474211316975464,
      "grad_norm": 0.4734819829463959,
      "learning_rate": 4.237624331922589e-05,
      "loss": 0.9181,
      "step": 494
    },
    {
      "epoch": 0.49674511767651475,
      "grad_norm": 0.9102823138237,
      "learning_rate": 4.229221328324265e-05,
      "loss": 0.8974,
      "step": 496
    },
    {
      "epoch": 0.4987481221832749,
      "grad_norm": 0.35548609495162964,
      "learning_rate": 4.2207806999137035e-05,
      "loss": 0.9309,
      "step": 498
    },
    {
      "epoch": 0.5007511266900351,
      "grad_norm": 0.46587055921554565,
      "learning_rate": 4.21230263034412e-05,
      "loss": 0.9114,
      "step": 500
    },
    {
      "epoch": 0.5007511266900351,
      "eval_loss": 1.174551248550415,
      "eval_runtime": 3.8166,
      "eval_samples_per_second": 15.197,
      "eval_steps_per_second": 7.598,
      "step": 500
    },
    {
      "epoch": 0.5027541311967952,
      "grad_norm": 0.3687826097011566,
      "learning_rate": 4.2037873040833845e-05,
      "loss": 0.9322,
      "step": 502
    },
    {
      "epoch": 0.5047571357035553,
      "grad_norm": 0.5049874782562256,
      "learning_rate": 4.1952349064100074e-05,
      "loss": 0.9975,
      "step": 504
    },
    {
      "epoch": 0.5067601402103155,
      "grad_norm": 0.4126236140727997,
      "learning_rate": 4.1866456234091076e-05,
      "loss": 0.929,
      "step": 506
    },
    {
      "epoch": 0.5087631447170756,
      "grad_norm": 0.44455772638320923,
      "learning_rate": 4.178019641968364e-05,
      "loss": 0.9345,
      "step": 508
    },
    {
      "epoch": 0.5107661492238358,
      "grad_norm": 0.4278281033039093,
      "learning_rate": 4.1693571497739495e-05,
      "loss": 0.8941,
      "step": 510
    },
    {
      "epoch": 0.5127691537305958,
      "grad_norm": 0.3606776297092438,
      "learning_rate": 4.160658335306446e-05,
      "loss": 0.9442,
      "step": 512
    },
    {
      "epoch": 0.514772158237356,
      "grad_norm": 0.5303627848625183,
      "learning_rate": 4.1519233878367424e-05,
      "loss": 0.8712,
      "step": 514
    },
    {
      "epoch": 0.5167751627441162,
      "grad_norm": 0.3978877067565918,
      "learning_rate": 4.143152497421922e-05,
      "loss": 0.8558,
      "step": 516
    },
    {
      "epoch": 0.5187781672508763,
      "grad_norm": 0.68426513671875,
      "learning_rate": 4.134345854901121e-05,
      "loss": 0.9229,
      "step": 518
    },
    {
      "epoch": 0.5207811717576365,
      "grad_norm": 0.5070856809616089,
      "learning_rate": 4.125503651891377e-05,
      "loss": 0.8383,
      "step": 520
    },
    {
      "epoch": 0.5227841762643965,
      "grad_norm": 0.5237690806388855,
      "learning_rate": 4.1166260807834644e-05,
      "loss": 0.8836,
      "step": 522
    },
    {
      "epoch": 0.5247871807711567,
      "grad_norm": 0.38217777013778687,
      "learning_rate": 4.107713334737704e-05,
      "loss": 0.953,
      "step": 524
    },
    {
      "epoch": 0.5267901852779169,
      "grad_norm": 0.4001261591911316,
      "learning_rate": 4.098765607679761e-05,
      "loss": 0.9681,
      "step": 526
    },
    {
      "epoch": 0.528793189784677,
      "grad_norm": 0.4185451567173004,
      "learning_rate": 4.0897830942964255e-05,
      "loss": 0.9023,
      "step": 528
    },
    {
      "epoch": 0.5307961942914372,
      "grad_norm": 0.4268343150615692,
      "learning_rate": 4.080765990031377e-05,
      "loss": 0.9154,
      "step": 530
    },
    {
      "epoch": 0.5327991987981973,
      "grad_norm": 0.46939241886138916,
      "learning_rate": 4.071714491080932e-05,
      "loss": 0.9013,
      "step": 532
    },
    {
      "epoch": 0.5348022033049574,
      "grad_norm": 0.3804875910282135,
      "learning_rate": 4.0626287943897764e-05,
      "loss": 0.9091,
      "step": 534
    },
    {
      "epoch": 0.5368052078117176,
      "grad_norm": 0.5679438710212708,
      "learning_rate": 4.053509097646674e-05,
      "loss": 0.9361,
      "step": 536
    },
    {
      "epoch": 0.5388082123184778,
      "grad_norm": 0.47385266423225403,
      "learning_rate": 4.044355599280175e-05,
      "loss": 0.9549,
      "step": 538
    },
    {
      "epoch": 0.5408112168252378,
      "grad_norm": 0.48675286769866943,
      "learning_rate": 4.035168498454292e-05,
      "loss": 0.8835,
      "step": 540
    },
    {
      "epoch": 0.542814221331998,
      "grad_norm": 0.46679016947746277,
      "learning_rate": 4.025947995064166e-05,
      "loss": 0.9377,
      "step": 542
    },
    {
      "epoch": 0.5448172258387581,
      "grad_norm": 0.4926673471927643,
      "learning_rate": 4.0166942897317205e-05,
      "loss": 0.9036,
      "step": 544
    },
    {
      "epoch": 0.5468202303455183,
      "grad_norm": 0.38182321190834045,
      "learning_rate": 4.007407583801295e-05,
      "loss": 0.9616,
      "step": 546
    },
    {
      "epoch": 0.5488232348522785,
      "grad_norm": 0.45545268058776855,
      "learning_rate": 3.9980880793352635e-05,
      "loss": 0.9747,
      "step": 548
    },
    {
      "epoch": 0.5508262393590385,
      "grad_norm": 0.47782036662101746,
      "learning_rate": 3.988735979109638e-05,
      "loss": 0.8995,
      "step": 550
    },
    {
      "epoch": 0.5508262393590385,
      "eval_loss": 1.1662517786026,
      "eval_runtime": 3.8013,
      "eval_samples_per_second": 15.258,
      "eval_steps_per_second": 7.629,
      "step": 550
    },
    {
      "epoch": 0.5528292438657987,
      "grad_norm": 0.5856130123138428,
      "learning_rate": 3.979351486609658e-05,
      "loss": 0.8887,
      "step": 552
    },
    {
      "epoch": 0.5548322483725588,
      "grad_norm": 0.3920418620109558,
      "learning_rate": 3.969934806025361e-05,
      "loss": 0.8773,
      "step": 554
    },
    {
      "epoch": 0.556835252879319,
      "grad_norm": 0.43775448203086853,
      "learning_rate": 3.960486142247142e-05,
      "loss": 0.8969,
      "step": 556
    },
    {
      "epoch": 0.5588382573860792,
      "grad_norm": 0.42693212628364563,
      "learning_rate": 3.951005700861291e-05,
      "loss": 0.9114,
      "step": 558
    },
    {
      "epoch": 0.5608412618928392,
      "grad_norm": 0.45931047201156616,
      "learning_rate": 3.9414936881455254e-05,
      "loss": 0.9111,
      "step": 560
    },
    {
      "epoch": 0.5628442663995994,
      "grad_norm": 0.5036295652389526,
      "learning_rate": 3.931950311064498e-05,
      "loss": 0.9606,
      "step": 562
    },
    {
      "epoch": 0.5648472709063596,
      "grad_norm": 0.5762202143669128,
      "learning_rate": 3.9223757772652956e-05,
      "loss": 0.8566,
      "step": 564
    },
    {
      "epoch": 0.5668502754131197,
      "grad_norm": 0.40658578276634216,
      "learning_rate": 3.91277029507292e-05,
      "loss": 0.9485,
      "step": 566
    },
    {
      "epoch": 0.5688532799198798,
      "grad_norm": 0.3851291835308075,
      "learning_rate": 3.903134073485756e-05,
      "loss": 0.8902,
      "step": 568
    },
    {
      "epoch": 0.5708562844266399,
      "grad_norm": 0.3543303906917572,
      "learning_rate": 3.8934673221710215e-05,
      "loss": 0.9411,
      "step": 570
    },
    {
      "epoch": 0.5728592889334001,
      "grad_norm": 0.3977811336517334,
      "learning_rate": 3.883770251460212e-05,
      "loss": 0.9258,
      "step": 572
    },
    {
      "epoch": 0.5748622934401603,
      "grad_norm": 0.4081217050552368,
      "learning_rate": 3.8740430723445156e-05,
      "loss": 0.9201,
      "step": 574
    },
    {
      "epoch": 0.5768652979469204,
      "grad_norm": 0.4058239459991455,
      "learning_rate": 3.864285996470226e-05,
      "loss": 0.9428,
      "step": 576
    },
    {
      "epoch": 0.5788683024536805,
      "grad_norm": 0.40673911571502686,
      "learning_rate": 3.854499236134141e-05,
      "loss": 0.985,
      "step": 578
    },
    {
      "epoch": 0.5808713069604406,
      "grad_norm": 0.4199845790863037,
      "learning_rate": 3.844683004278939e-05,
      "loss": 0.9476,
      "step": 580
    },
    {
      "epoch": 0.5828743114672008,
      "grad_norm": 0.4016932547092438,
      "learning_rate": 3.834837514488544e-05,
      "loss": 0.9464,
      "step": 582
    },
    {
      "epoch": 0.584877315973961,
      "grad_norm": 0.41921266913414,
      "learning_rate": 3.8249629809834845e-05,
      "loss": 0.9651,
      "step": 584
    },
    {
      "epoch": 0.586880320480721,
      "grad_norm": 0.4465863108634949,
      "learning_rate": 3.8150596186162286e-05,
      "loss": 0.8847,
      "step": 586
    },
    {
      "epoch": 0.5888833249874812,
      "grad_norm": 0.4515509009361267,
      "learning_rate": 3.805127642866507e-05,
      "loss": 0.951,
      "step": 588
    },
    {
      "epoch": 0.5908863294942414,
      "grad_norm": 0.44146063923835754,
      "learning_rate": 3.795167269836631e-05,
      "loss": 0.8924,
      "step": 590
    },
    {
      "epoch": 0.5928893340010015,
      "grad_norm": 0.538754940032959,
      "learning_rate": 3.785178716246786e-05,
      "loss": 0.9536,
      "step": 592
    },
    {
      "epoch": 0.5948923385077617,
      "grad_norm": 0.3271295130252838,
      "learning_rate": 3.775162199430312e-05,
      "loss": 0.8724,
      "step": 594
    },
    {
      "epoch": 0.5968953430145217,
      "grad_norm": 0.4394945800304413,
      "learning_rate": 3.765117937328986e-05,
      "loss": 0.9133,
      "step": 596
    },
    {
      "epoch": 0.5988983475212819,
      "grad_norm": 0.40261757373809814,
      "learning_rate": 3.75504614848827e-05,
      "loss": 0.9253,
      "step": 598
    },
    {
      "epoch": 0.6009013520280421,
      "grad_norm": 0.4515800178050995,
      "learning_rate": 3.744947052052562e-05,
      "loss": 0.918,
      "step": 600
    },
    {
      "epoch": 0.6009013520280421,
      "eval_loss": 1.1564297676086426,
      "eval_runtime": 3.8109,
      "eval_samples_per_second": 15.219,
      "eval_steps_per_second": 7.61,
      "step": 600
    },
    {
      "epoch": 0.6029043565348022,
      "grad_norm": 0.4420590400695801,
      "learning_rate": 3.734820867760421e-05,
      "loss": 0.8758,
      "step": 602
    },
    {
      "epoch": 0.6049073610415624,
      "grad_norm": 0.41104549169540405,
      "learning_rate": 3.724667815939794e-05,
      "loss": 1.0595,
      "step": 604
    },
    {
      "epoch": 0.6069103655483225,
      "grad_norm": 0.4642109274864197,
      "learning_rate": 3.7144881175032174e-05,
      "loss": 0.9576,
      "step": 606
    },
    {
      "epoch": 0.6089133700550826,
      "grad_norm": 0.4654678404331207,
      "learning_rate": 3.704281993943008e-05,
      "loss": 0.9196,
      "step": 608
    },
    {
      "epoch": 0.6109163745618428,
      "grad_norm": 0.44470739364624023,
      "learning_rate": 3.694049667326451e-05,
      "loss": 0.9326,
      "step": 610
    },
    {
      "epoch": 0.6129193790686029,
      "grad_norm": 0.4389815330505371,
      "learning_rate": 3.683791360290961e-05,
      "loss": 0.9633,
      "step": 612
    },
    {
      "epoch": 0.614922383575363,
      "grad_norm": 0.366268515586853,
      "learning_rate": 3.673507296039243e-05,
      "loss": 0.9876,
      "step": 614
    },
    {
      "epoch": 0.6169253880821232,
      "grad_norm": 0.40563082695007324,
      "learning_rate": 3.663197698334432e-05,
      "loss": 0.8903,
      "step": 616
    },
    {
      "epoch": 0.6189283925888833,
      "grad_norm": 0.35876786708831787,
      "learning_rate": 3.6528627914952266e-05,
      "loss": 0.9025,
      "step": 618
    },
    {
      "epoch": 0.6209313970956435,
      "grad_norm": 0.44777098298072815,
      "learning_rate": 3.6425028003910074e-05,
      "loss": 0.9048,
      "step": 620
    },
    {
      "epoch": 0.6229344016024037,
      "grad_norm": 0.40352246165275574,
      "learning_rate": 3.6321179504369444e-05,
      "loss": 0.9176,
      "step": 622
    },
    {
      "epoch": 0.6249374061091637,
      "grad_norm": 0.4628620445728302,
      "learning_rate": 3.6217084675890935e-05,
      "loss": 0.9208,
      "step": 624
    },
    {
      "epoch": 0.6269404106159239,
      "grad_norm": 0.45699334144592285,
      "learning_rate": 3.611274578339477e-05,
      "loss": 0.9284,
      "step": 626
    },
    {
      "epoch": 0.628943415122684,
      "grad_norm": 0.45050838589668274,
      "learning_rate": 3.60081650971116e-05,
      "loss": 0.9417,
      "step": 628
    },
    {
      "epoch": 0.6309464196294442,
      "grad_norm": 0.4145865738391876,
      "learning_rate": 3.590334489253306e-05,
      "loss": 0.9526,
      "step": 630
    },
    {
      "epoch": 0.6329494241362043,
      "grad_norm": 0.4078468084335327,
      "learning_rate": 3.5798287450362306e-05,
      "loss": 0.8913,
      "step": 632
    },
    {
      "epoch": 0.6349524286429644,
      "grad_norm": 0.49246945977211,
      "learning_rate": 3.569299505646433e-05,
      "loss": 0.862,
      "step": 634
    },
    {
      "epoch": 0.6369554331497246,
      "grad_norm": 0.4269583523273468,
      "learning_rate": 3.55874700018163e-05,
      "loss": 0.9608,
      "step": 636
    },
    {
      "epoch": 0.6389584376564847,
      "grad_norm": 0.4796135723590851,
      "learning_rate": 3.548171458245765e-05,
      "loss": 0.9123,
      "step": 638
    },
    {
      "epoch": 0.6409614421632449,
      "grad_norm": 0.41421452164649963,
      "learning_rate": 3.5375731099440135e-05,
      "loss": 0.9702,
      "step": 640
    },
    {
      "epoch": 0.642964446670005,
      "grad_norm": 0.4892091751098633,
      "learning_rate": 3.526952185877781e-05,
      "loss": 0.877,
      "step": 642
    },
    {
      "epoch": 0.6449674511767651,
      "grad_norm": 0.39520540833473206,
      "learning_rate": 3.516308917139678e-05,
      "loss": 0.9643,
      "step": 644
    },
    {
      "epoch": 0.6469704556835253,
      "grad_norm": 0.5455682873725891,
      "learning_rate": 3.505643535308499e-05,
      "loss": 0.9473,
      "step": 646
    },
    {
      "epoch": 0.6489734601902855,
      "grad_norm": 0.40943270921707153,
      "learning_rate": 3.494956272444177e-05,
      "loss": 0.9506,
      "step": 648
    },
    {
      "epoch": 0.6509764646970456,
      "grad_norm": 0.3957885503768921,
      "learning_rate": 3.484247361082741e-05,
      "loss": 0.8854,
      "step": 650
    },
    {
      "epoch": 0.6509764646970456,
      "eval_loss": 1.1660796403884888,
      "eval_runtime": 3.827,
      "eval_samples_per_second": 15.155,
      "eval_steps_per_second": 7.578,
      "step": 650
    },
    {
      "epoch": 0.6529794692038057,
      "grad_norm": 0.4576199948787689,
      "learning_rate": 3.473517034231251e-05,
      "loss": 0.8848,
      "step": 652
    },
    {
      "epoch": 0.6549824737105658,
      "grad_norm": 0.45555633306503296,
      "learning_rate": 3.4627655253627323e-05,
      "loss": 0.954,
      "step": 654
    },
    {
      "epoch": 0.656985478217326,
      "grad_norm": 0.45799553394317627,
      "learning_rate": 3.451993068411092e-05,
      "loss": 0.9766,
      "step": 656
    },
    {
      "epoch": 0.6589884827240862,
      "grad_norm": 0.44451501965522766,
      "learning_rate": 3.441199897766031e-05,
      "loss": 0.9934,
      "step": 658
    },
    {
      "epoch": 0.6609914872308462,
      "grad_norm": 0.43687155842781067,
      "learning_rate": 3.430386248267943e-05,
      "loss": 0.8342,
      "step": 660
    },
    {
      "epoch": 0.6629944917376064,
      "grad_norm": 0.385002076625824,
      "learning_rate": 3.419552355202807e-05,
      "loss": 0.9195,
      "step": 662
    },
    {
      "epoch": 0.6649974962443665,
      "grad_norm": 0.4921188950538635,
      "learning_rate": 3.408698454297067e-05,
      "loss": 0.894,
      "step": 664
    },
    {
      "epoch": 0.6670005007511267,
      "grad_norm": 0.45717331767082214,
      "learning_rate": 3.397824781712499e-05,
      "loss": 0.9223,
      "step": 666
    },
    {
      "epoch": 0.6690035052578869,
      "grad_norm": 0.6077693700790405,
      "learning_rate": 3.386931574041079e-05,
      "loss": 0.8307,
      "step": 668
    },
    {
      "epoch": 0.6710065097646469,
      "grad_norm": 0.5416433215141296,
      "learning_rate": 3.376019068299832e-05,
      "loss": 0.9084,
      "step": 670
    },
    {
      "epoch": 0.6730095142714071,
      "grad_norm": 0.48100745677948,
      "learning_rate": 3.365087501925673e-05,
      "loss": 0.8687,
      "step": 672
    },
    {
      "epoch": 0.6750125187781673,
      "grad_norm": 0.4744812846183777,
      "learning_rate": 3.354137112770244e-05,
      "loss": 0.9819,
      "step": 674
    },
    {
      "epoch": 0.6770155232849274,
      "grad_norm": 0.5188727378845215,
      "learning_rate": 3.343168139094738e-05,
      "loss": 0.8702,
      "step": 676
    },
    {
      "epoch": 0.6790185277916875,
      "grad_norm": 0.42871081829071045,
      "learning_rate": 3.332180819564714e-05,
      "loss": 0.9244,
      "step": 678
    },
    {
      "epoch": 0.6810215322984476,
      "grad_norm": 0.3858610689640045,
      "learning_rate": 3.321175393244904e-05,
      "loss": 0.8371,
      "step": 680
    },
    {
      "epoch": 0.6830245368052078,
      "grad_norm": 0.459778368473053,
      "learning_rate": 3.310152099594013e-05,
      "loss": 0.9146,
      "step": 682
    },
    {
      "epoch": 0.685027541311968,
      "grad_norm": 0.36012330651283264,
      "learning_rate": 3.299111178459507e-05,
      "loss": 0.9806,
      "step": 684
    },
    {
      "epoch": 0.6870305458187281,
      "grad_norm": 0.4208768606185913,
      "learning_rate": 3.288052870072395e-05,
      "loss": 0.8729,
      "step": 686
    },
    {
      "epoch": 0.6890335503254882,
      "grad_norm": 0.4012265205383301,
      "learning_rate": 3.2769774150420015e-05,
      "loss": 0.8586,
      "step": 688
    },
    {
      "epoch": 0.6910365548322484,
      "grad_norm": 0.442624032497406,
      "learning_rate": 3.2658850543507334e-05,
      "loss": 0.931,
      "step": 690
    },
    {
      "epoch": 0.6930395593390085,
      "grad_norm": 0.3907168209552765,
      "learning_rate": 3.2547760293488335e-05,
      "loss": 0.9246,
      "step": 692
    },
    {
      "epoch": 0.6950425638457687,
      "grad_norm": 0.4578626751899719,
      "learning_rate": 3.2436505817491305e-05,
      "loss": 0.9339,
      "step": 694
    },
    {
      "epoch": 0.6970455683525288,
      "grad_norm": 0.49979129433631897,
      "learning_rate": 3.2325089536217815e-05,
      "loss": 0.9637,
      "step": 696
    },
    {
      "epoch": 0.6990485728592889,
      "grad_norm": 0.41651976108551025,
      "learning_rate": 3.2213513873890026e-05,
      "loss": 0.9365,
      "step": 698
    },
    {
      "epoch": 0.7010515773660491,
      "grad_norm": 0.4993303120136261,
      "learning_rate": 3.210178125819795e-05,
      "loss": 0.8978,
      "step": 700
    },
    {
      "epoch": 0.7010515773660491,
      "eval_loss": 1.1489382982254028,
      "eval_runtime": 3.8105,
      "eval_samples_per_second": 15.221,
      "eval_steps_per_second": 7.611,
      "step": 700
    },
    {
      "epoch": 0.7030545818728092,
      "grad_norm": 0.5267933011054993,
      "learning_rate": 3.1989894120246614e-05,
      "loss": 0.8641,
      "step": 702
    },
    {
      "epoch": 0.7050575863795694,
      "grad_norm": 0.5193835496902466,
      "learning_rate": 3.1877854894503204e-05,
      "loss": 0.9497,
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.7070605908863294, |
|
"grad_norm": 0.43787896633148193, |
|
"learning_rate": 3.1765666018744046e-05, |
|
"loss": 0.8907, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.7090635953930896, |
|
"grad_norm": 0.418584406375885, |
|
"learning_rate": 3.1653329934001584e-05, |
|
"loss": 0.9517, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.7110665998998498, |
|
"grad_norm": 0.6064937114715576, |
|
"learning_rate": 3.154084908451131e-05, |
|
"loss": 0.8603, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7130696044066099, |
|
"grad_norm": 0.37019243836402893, |
|
"learning_rate": 3.142822591765851e-05, |
|
"loss": 0.8974, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.7150726089133701, |
|
"grad_norm": 0.38166865706443787, |
|
"learning_rate": 3.1315462883925025e-05, |
|
"loss": 0.9558, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.7170756134201302, |
|
"grad_norm": 0.45281273126602173, |
|
"learning_rate": 3.1202562436836e-05, |
|
"loss": 0.9325, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.7190786179268903, |
|
"grad_norm": 0.4501991868019104, |
|
"learning_rate": 3.1089527032906425e-05, |
|
"loss": 0.9862, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.7210816224336505, |
|
"grad_norm": 0.43729260563850403, |
|
"learning_rate": 3.097635913158772e-05, |
|
"loss": 0.9339, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.7230846269404106, |
|
"grad_norm": 0.5757997632026672, |
|
"learning_rate": 3.08630611952142e-05, |
|
"loss": 0.8904, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.7250876314471708, |
|
"grad_norm": 0.4715934991836548, |
|
"learning_rate": 3.0749635688949545e-05, |
|
"loss": 0.8899, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.7270906359539309, |
|
"grad_norm": 0.5050368905067444, |
|
"learning_rate": 3.063608508073311e-05, |
|
"loss": 0.9324, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.729093640460691, |
|
"grad_norm": 0.6013456583023071, |
|
"learning_rate": 3.052241184122625e-05, |
|
"loss": 0.9626, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.7310966449674512, |
|
"grad_norm": 0.45164185762405396, |
|
"learning_rate": 3.0408618443758557e-05, |
|
"loss": 0.8899, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7330996494742114, |
|
"grad_norm": 0.4240935444831848, |
|
"learning_rate": 3.0294707364274067e-05, |
|
"loss": 0.9151, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.7351026539809714, |
|
"grad_norm": 0.548370361328125, |
|
"learning_rate": 3.018068108127735e-05, |
|
"loss": 0.8976, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.7371056584877316, |
|
"grad_norm": 0.4141191840171814, |
|
"learning_rate": 3.0066542075779602e-05, |
|
"loss": 0.9035, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.7391086629944917, |
|
"grad_norm": 0.4236369729042053, |
|
"learning_rate": 2.9952292831244676e-05, |
|
"loss": 0.8906, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.7411116675012519, |
|
"grad_norm": 0.3607020974159241, |
|
"learning_rate": 2.9837935833535037e-05, |
|
"loss": 0.9423, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7431146720080121, |
|
"grad_norm": 0.4230390191078186, |
|
"learning_rate": 2.9723473570857642e-05, |
|
"loss": 0.9092, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.7451176765147721, |
|
"grad_norm": 0.3703189492225647, |
|
"learning_rate": 2.960890853370985e-05, |
|
"loss": 0.8663, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.7471206810215323, |
|
"grad_norm": 0.49546095728874207, |
|
"learning_rate": 2.9494243214825208e-05, |
|
"loss": 0.8875, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.7491236855282924, |
|
"grad_norm": 0.44254347681999207, |
|
"learning_rate": 2.9379480109119213e-05, |
|
"loss": 0.923, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.7511266900350526, |
|
"grad_norm": 0.4102881848812103, |
|
"learning_rate": 2.9264621713635028e-05, |
|
"loss": 0.9357, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7511266900350526, |
|
"eval_loss": 1.1563700437545776, |
|
"eval_runtime": 3.8041, |
|
"eval_samples_per_second": 15.247, |
|
"eval_steps_per_second": 7.623, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7531296945418127, |
|
"grad_norm": 0.42651745676994324, |
|
"learning_rate": 2.914967052748917e-05, |
|
"loss": 0.9277, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.7551326990485728, |
|
"grad_norm": 0.37917560338974, |
|
"learning_rate": 2.9034629051817096e-05, |
|
"loss": 0.9717, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.757135703555333, |
|
"grad_norm": 0.4591340720653534, |
|
"learning_rate": 2.891949978971883e-05, |
|
"loss": 0.9336, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.7591387080620932, |
|
"grad_norm": 0.5880463719367981, |
|
"learning_rate": 2.8804285246204438e-05, |
|
"loss": 0.9098, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.7611417125688533, |
|
"grad_norm": 0.39928752183914185, |
|
"learning_rate": 2.8688987928139588e-05, |
|
"loss": 0.8258, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.7631447170756134, |
|
"grad_norm": 0.5559530258178711, |
|
"learning_rate": 2.8573610344190975e-05, |
|
"loss": 0.8728, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.7651477215823735, |
|
"grad_norm": 0.49999016523361206, |
|
"learning_rate": 2.8458155004771724e-05, |
|
"loss": 1.0135, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.7671507260891337, |
|
"grad_norm": 0.35017403960227966, |
|
"learning_rate": 2.8342624421986797e-05, |
|
"loss": 0.8929, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.7691537305958939, |
|
"grad_norm": 0.48860040307044983, |
|
"learning_rate": 2.822702110957831e-05, |
|
"loss": 0.8784, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.771156735102654, |
|
"grad_norm": 0.4092211425304413, |
|
"learning_rate": 2.811134758287085e-05, |
|
"loss": 0.8643, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.7731597396094141, |
|
"grad_norm": 0.517197847366333, |
|
"learning_rate": 2.799560635871675e-05, |
|
"loss": 0.9033, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.7751627441161743, |
|
"grad_norm": 0.40133723616600037, |
|
"learning_rate": 2.78797999554413e-05, |
|
"loss": 0.9308, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.7771657486229344, |
|
"grad_norm": 0.4061048626899719, |
|
"learning_rate": 2.7763930892787992e-05, |
|
"loss": 0.9076, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.7791687531296946, |
|
"grad_norm": 0.5977723002433777, |
|
"learning_rate": 2.7648001691863673e-05, |
|
"loss": 0.8699, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.7811717576364546, |
|
"grad_norm": 0.3865041136741638, |
|
"learning_rate": 2.753201487508369e-05, |
|
"loss": 0.9565, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.7831747621432148, |
|
"grad_norm": 0.49114081263542175, |
|
"learning_rate": 2.7415972966117014e-05, |
|
"loss": 0.8533, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.785177766649975, |
|
"grad_norm": 0.3852551281452179, |
|
"learning_rate": 2.7299878489831316e-05, |
|
"loss": 0.8556, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.7871807711567351, |
|
"grad_norm": 0.4888080060482025, |
|
"learning_rate": 2.718373397223804e-05, |
|
"loss": 0.8734, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.7891837756634953, |
|
"grad_norm": 0.4077546298503876, |
|
"learning_rate": 2.706754194043746e-05, |
|
"loss": 0.9392, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.7911867801702553, |
|
"grad_norm": 0.408587247133255, |
|
"learning_rate": 2.6951304922563642e-05, |
|
"loss": 0.8565, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.7931897846770155, |
|
"grad_norm": 0.45802196860313416, |
|
"learning_rate": 2.6835025447729495e-05, |
|
"loss": 0.9535, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.7951927891837757, |
|
"grad_norm": 0.4353581964969635, |
|
"learning_rate": 2.6718706045971726e-05, |
|
"loss": 0.8428, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.7971957936905358, |
|
"grad_norm": 0.4018676280975342, |
|
"learning_rate": 2.6602349248195746e-05, |
|
"loss": 0.8754, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.7991987981972959, |
|
"grad_norm": 0.4653930068016052, |
|
"learning_rate": 2.6485957586120663e-05, |
|
"loss": 0.7725, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.801201802704056, |
|
"grad_norm": 0.5806179642677307, |
|
"learning_rate": 2.6369533592224172e-05, |
|
"loss": 0.8955, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.801201802704056, |
|
"eval_loss": 1.1470181941986084, |
|
"eval_runtime": 3.7967, |
|
"eval_samples_per_second": 15.277, |
|
"eval_steps_per_second": 7.638, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8032048072108162, |
|
"grad_norm": 0.4590522348880768, |
|
"learning_rate": 2.6253079799687435e-05, |
|
"loss": 0.9738, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.8052078117175764, |
|
"grad_norm": 0.5188782811164856, |
|
"learning_rate": 2.613659874233999e-05, |
|
"loss": 0.9573, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.8072108162243365, |
|
"grad_norm": 0.4585997760295868, |
|
"learning_rate": 2.6020092954604614e-05, |
|
"loss": 0.948, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.8092138207310966, |
|
"grad_norm": 0.39974266290664673, |
|
"learning_rate": 2.5903564971442167e-05, |
|
"loss": 1.0123, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.8112168252378568, |
|
"grad_norm": 0.4484356641769409, |
|
"learning_rate": 2.5787017328296447e-05, |
|
"loss": 0.8262, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8132198297446169, |
|
"grad_norm": 0.4441506862640381, |
|
"learning_rate": 2.5670452561039004e-05, |
|
"loss": 0.8683, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.8152228342513771, |
|
"grad_norm": 0.6077110171318054, |
|
"learning_rate": 2.555387320591401e-05, |
|
"loss": 0.8657, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.8172258387581373, |
|
"grad_norm": 0.3740634322166443, |
|
"learning_rate": 2.5437281799483004e-05, |
|
"loss": 0.9215, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.8192288432648973, |
|
"grad_norm": 0.516426682472229, |
|
"learning_rate": 2.5320680878569768e-05, |
|
"loss": 0.8907, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.8212318477716575, |
|
"grad_norm": 0.42550894618034363, |
|
"learning_rate": 2.5204072980205092e-05, |
|
"loss": 0.9188, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8232348522784176, |
|
"grad_norm": 0.5615983605384827, |
|
"learning_rate": 2.508746064157159e-05, |
|
"loss": 1.0489, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.8252378567851778, |
|
"grad_norm": 0.4470774233341217, |
|
"learning_rate": 2.4970846399948487e-05, |
|
"loss": 0.8668, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.8272408612919379, |
|
"grad_norm": 0.440336138010025, |
|
"learning_rate": 2.4854232792656394e-05, |
|
"loss": 0.8658, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.829243865798698, |
|
"grad_norm": 0.41719090938568115, |
|
"learning_rate": 2.473762235700214e-05, |
|
"loss": 0.9103, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.8312468703054582, |
|
"grad_norm": 0.4663768410682678, |
|
"learning_rate": 2.462101763022356e-05, |
|
"loss": 0.8621, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8332498748122183, |
|
"grad_norm": 0.4149011969566345, |
|
"learning_rate": 2.4504421149434233e-05, |
|
"loss": 0.82, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.8352528793189785, |
|
"grad_norm": 0.4140399992465973, |
|
"learning_rate": 2.4387835451568355e-05, |
|
"loss": 0.9775, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.8372558838257386, |
|
"grad_norm": 0.44181761145591736, |
|
"learning_rate": 2.427126307332549e-05, |
|
"loss": 0.8591, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.8392588883324987, |
|
"grad_norm": 0.4710381031036377, |
|
"learning_rate": 2.4154706551115384e-05, |
|
"loss": 0.8738, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.8412618928392589, |
|
"grad_norm": 0.5030112266540527, |
|
"learning_rate": 2.4038168421002794e-05, |
|
"loss": 0.9506, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.8432648973460191, |
|
"grad_norm": 0.5199030041694641, |
|
"learning_rate": 2.3921651218652293e-05, |
|
"loss": 0.8508, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.8452679018527791, |
|
"grad_norm": 0.5105124115943909, |
|
"learning_rate": 2.380515747927312e-05, |
|
"loss": 0.8432, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.8472709063595393, |
|
"grad_norm": 0.49101004004478455, |
|
"learning_rate": 2.3688689737563967e-05, |
|
"loss": 0.9014, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.8492739108662994, |
|
"grad_norm": 0.4043116569519043, |
|
"learning_rate": 2.3572250527657895e-05, |
|
"loss": 0.9011, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.8512769153730596, |
|
"grad_norm": 0.4326643645763397, |
|
"learning_rate": 2.345584238306713e-05, |
|
"loss": 0.8597, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8512769153730596, |
|
"eval_loss": 1.1495444774627686, |
|
"eval_runtime": 3.7962, |
|
"eval_samples_per_second": 15.279, |
|
"eval_steps_per_second": 7.639, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8532799198798198, |
|
"grad_norm": 0.5106630325317383, |
|
"learning_rate": 2.3339467836628017e-05, |
|
"loss": 0.9167, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.8552829243865798, |
|
"grad_norm": 0.42315831780433655, |
|
"learning_rate": 2.322312942044581e-05, |
|
"loss": 0.9248, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.85728592889334, |
|
"grad_norm": 0.4706262946128845, |
|
"learning_rate": 2.3106829665839677e-05, |
|
"loss": 0.8772, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.8592889334001002, |
|
"grad_norm": 0.7145017385482788, |
|
"learning_rate": 2.2990571103287567e-05, |
|
"loss": 0.9167, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.8612919379068603, |
|
"grad_norm": 0.47455379366874695, |
|
"learning_rate": 2.2874356262371134e-05, |
|
"loss": 0.9008, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.8632949424136205, |
|
"grad_norm": 0.41509053111076355, |
|
"learning_rate": 2.2758187671720772e-05, |
|
"loss": 0.8976, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.8652979469203805, |
|
"grad_norm": 0.5434259176254272, |
|
"learning_rate": 2.2642067858960514e-05, |
|
"loss": 0.8593, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.8673009514271407, |
|
"grad_norm": 0.43615275621414185, |
|
"learning_rate": 2.2525999350653095e-05, |
|
"loss": 0.9305, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.8693039559339009, |
|
"grad_norm": 0.5843902230262756, |
|
"learning_rate": 2.2409984672244934e-05, |
|
"loss": 0.8521, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.871306960440661, |
|
"grad_norm": 0.35046350955963135, |
|
"learning_rate": 2.2294026348011223e-05, |
|
"loss": 0.8392, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.8733099649474211, |
|
"grad_norm": 0.4275960624217987, |
|
"learning_rate": 2.2178126901000996e-05, |
|
"loss": 0.8883, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.8753129694541812, |
|
"grad_norm": 1.0779649019241333, |
|
"learning_rate": 2.2062288852982182e-05, |
|
"loss": 0.9226, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.8773159739609414, |
|
"grad_norm": 0.43578073382377625, |
|
"learning_rate": 2.1946514724386828e-05, |
|
"loss": 0.877, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.8793189784677016, |
|
"grad_norm": 0.5768626928329468, |
|
"learning_rate": 2.1830807034256154e-05, |
|
"loss": 0.8844, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.8813219829744617, |
|
"grad_norm": 0.4431218206882477, |
|
"learning_rate": 2.1715168300185848e-05, |
|
"loss": 0.9106, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.8833249874812218, |
|
"grad_norm": 0.44507092237472534, |
|
"learning_rate": 2.1599601038271186e-05, |
|
"loss": 0.9349, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.885327991987982, |
|
"grad_norm": 0.42408713698387146, |
|
"learning_rate": 2.148410776305237e-05, |
|
"loss": 0.8704, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.8873309964947421, |
|
"grad_norm": 0.45474737882614136, |
|
"learning_rate": 2.136869098745978e-05, |
|
"loss": 0.8854, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.8893340010015023, |
|
"grad_norm": 0.42297935485839844, |
|
"learning_rate": 2.125335322275928e-05, |
|
"loss": 0.8438, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.8913370055082624, |
|
"grad_norm": 0.5911722779273987, |
|
"learning_rate": 2.1138096978497617e-05, |
|
"loss": 0.8021, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.8933400100150225, |
|
"grad_norm": 0.5190030336380005, |
|
"learning_rate": 2.1022924762447767e-05, |
|
"loss": 0.8814, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.8953430145217827, |
|
"grad_norm": 0.4616602957248688, |
|
"learning_rate": 2.0907839080554443e-05, |
|
"loss": 0.9051, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.8973460190285428, |
|
"grad_norm": 0.6448442935943604, |
|
"learning_rate": 2.079284243687948e-05, |
|
"loss": 0.8667, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.899349023535303, |
|
"grad_norm": 0.46473053097724915, |
|
"learning_rate": 2.067793733354743e-05, |
|
"loss": 0.8543, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.901352028042063, |
|
"grad_norm": 0.47952961921691895, |
|
"learning_rate": 2.0563126270691097e-05, |
|
"loss": 0.869, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.901352028042063, |
|
"eval_loss": 1.1418862342834473, |
|
"eval_runtime": 3.8348, |
|
"eval_samples_per_second": 15.125, |
|
"eval_steps_per_second": 7.562, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9033550325488232, |
|
"grad_norm": 0.4736415147781372, |
|
"learning_rate": 2.044841174639708e-05, |
|
"loss": 0.8937, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.9053580370555834, |
|
"grad_norm": 0.48480942845344543, |
|
"learning_rate": 2.0333796256651533e-05, |
|
"loss": 0.9146, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.9073610415623435, |
|
"grad_norm": 0.519432544708252, |
|
"learning_rate": 2.0219282295285737e-05, |
|
"loss": 0.8845, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.9093640460691037, |
|
"grad_norm": 0.47801777720451355, |
|
"learning_rate": 2.0104872353921927e-05, |
|
"loss": 0.8701, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.9113670505758638, |
|
"grad_norm": 0.5259170532226562, |
|
"learning_rate": 1.999056892191904e-05, |
|
"loss": 0.9299, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9133700550826239, |
|
"grad_norm": 0.503354549407959, |
|
"learning_rate": 1.9876374486318543e-05, |
|
"loss": 0.8895, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.9153730595893841, |
|
"grad_norm": 0.5313873887062073, |
|
"learning_rate": 1.9762291531790355e-05, |
|
"loss": 0.8254, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.9173760640961443, |
|
"grad_norm": 0.5693700313568115, |
|
"learning_rate": 1.9648322540578744e-05, |
|
"loss": 0.8246, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.9193790686029043, |
|
"grad_norm": 0.5147340893745422, |
|
"learning_rate": 1.9534469992448358e-05, |
|
"loss": 0.8987, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.9213820731096645, |
|
"grad_norm": 0.718410849571228, |
|
"learning_rate": 1.9420736364630215e-05, |
|
"loss": 0.8385, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.9233850776164246, |
|
"grad_norm": 0.49588289856910706, |
|
"learning_rate": 1.9307124131767877e-05, |
|
"loss": 0.8652, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.9253880821231848, |
|
"grad_norm": 0.6265762448310852, |
|
"learning_rate": 1.9193635765863523e-05, |
|
"loss": 0.8964, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.927391086629945, |
|
"grad_norm": 0.4153289496898651, |
|
"learning_rate": 1.9080273736224236e-05, |
|
"loss": 0.9286, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.929394091136705, |
|
"grad_norm": 0.6794211864471436, |
|
"learning_rate": 1.8967040509408253e-05, |
|
"loss": 0.9141, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.9313970956434652, |
|
"grad_norm": 0.595132052898407, |
|
"learning_rate": 1.885393854917124e-05, |
|
"loss": 0.8353, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9334001001502253, |
|
"grad_norm": 0.4146586060523987, |
|
"learning_rate": 1.8740970316412793e-05, |
|
"loss": 0.898, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.9354031046569855, |
|
"grad_norm": 0.5133841633796692, |
|
"learning_rate": 1.8628138269122773e-05, |
|
"loss": 0.8648, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.9374061091637457, |
|
"grad_norm": 0.4042494595050812, |
|
"learning_rate": 1.8515444862327946e-05, |
|
"loss": 0.9285, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.9394091136705057, |
|
"grad_norm": 0.4541870057582855, |
|
"learning_rate": 1.8402892548038453e-05, |
|
"loss": 0.905, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.9414121181772659, |
|
"grad_norm": 0.4241974949836731, |
|
"learning_rate": 1.829048377519455e-05, |
|
"loss": 0.9802, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.943415122684026, |
|
"grad_norm": 0.5843325257301331, |
|
"learning_rate": 1.8178220989613254e-05, |
|
"loss": 0.8694, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.9454181271907862, |
|
"grad_norm": 0.3579271137714386, |
|
"learning_rate": 1.806610663393517e-05, |
|
"loss": 0.9004, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.9474211316975463, |
|
"grad_norm": 0.409402459859848, |
|
"learning_rate": 1.795414314757134e-05, |
|
"loss": 0.9436, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.9494241362043064, |
|
"grad_norm": 0.40799620747566223, |
|
"learning_rate": 1.784233296665012e-05, |
|
"loss": 0.8883, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.9514271407110666, |
|
"grad_norm": 0.45501673221588135, |
|
"learning_rate": 1.773067852396426e-05, |
|
"loss": 0.9641, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.9514271407110666, |
|
"eval_loss": 1.1456818580627441, |
|
"eval_runtime": 3.8046, |
|
"eval_samples_per_second": 15.245, |
|
"eval_steps_per_second": 7.622, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.9534301452178268, |
|
"grad_norm": 0.4748212695121765, |
|
"learning_rate": 1.761918224891787e-05, |
|
"loss": 0.8753, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.9554331497245869, |
|
"grad_norm": 0.6242424249649048, |
|
"learning_rate": 1.7507846567473644e-05, |
|
"loss": 0.8713, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.957436154231347, |
|
"grad_norm": 0.42941513657569885, |
|
"learning_rate": 1.7396673902100035e-05, |
|
"loss": 0.9128, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.9594391587381071, |
|
"grad_norm": 0.44053131341934204, |
|
"learning_rate": 1.728566667171854e-05, |
|
"loss": 0.8996, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.9614421632448673, |
|
"grad_norm": 0.6191515922546387, |
|
"learning_rate": 1.71748272916511e-05, |
|
"loss": 0.8114, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.9634451677516275, |
|
"grad_norm": 0.40307995676994324, |
|
"learning_rate": 1.7064158173567514e-05, |
|
"loss": 0.8587, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.9654481722583875, |
|
"grad_norm": 0.3541308641433716, |
|
"learning_rate": 1.695366172543299e-05, |
|
"loss": 0.9487, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.9674511767651477, |
|
"grad_norm": 0.4575124979019165, |
|
"learning_rate": 1.6843340351455726e-05, |
|
"loss": 0.9219, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.9694541812719079, |
|
"grad_norm": 0.4024929702281952, |
|
"learning_rate": 1.6733196452034653e-05, |
|
"loss": 0.9609, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.971457185778668, |
|
"grad_norm": 0.4288537800312042, |
|
"learning_rate": 1.662323242370711e-05, |
|
"loss": 0.9131, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.9734601902854282, |
|
"grad_norm": 0.3629342317581177, |
|
"learning_rate": 1.6513450659096804e-05, |
|
"loss": 0.8327, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.9754631947921882, |
|
"grad_norm": 0.40302079916000366, |
|
"learning_rate": 1.64038535468617e-05, |
|
"loss": 0.9035, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.9774661992989484, |
|
"grad_norm": 0.44683897495269775, |
|
"learning_rate": 1.629444347164202e-05, |
|
"loss": 0.9142, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.9794692038057086, |
|
"grad_norm": 0.6119024157524109, |
|
"learning_rate": 1.6185222814008433e-05, |
|
"loss": 0.8105, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.9814722083124687, |
|
"grad_norm": 0.39314714074134827, |
|
"learning_rate": 1.6076193950410172e-05, |
|
"loss": 0.8817, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.9834752128192289, |
|
"grad_norm": 0.465087354183197, |
|
"learning_rate": 1.5967359253123403e-05, |
|
"loss": 0.8979, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.985478217325989, |
|
"grad_norm": 0.5371639728546143, |
|
"learning_rate": 1.5858721090199565e-05, |
|
"loss": 0.9335, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.9874812218327491, |
|
"grad_norm": 0.5564991235733032, |
|
"learning_rate": 1.5750281825413836e-05, |
|
"loss": 0.9051, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.9894842263395093, |
|
"grad_norm": 0.40404555201530457, |
|
"learning_rate": 1.5642043818213757e-05, |
|
"loss": 0.9676, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.9914872308462694, |
|
"grad_norm": 0.4462992548942566, |
|
"learning_rate": 1.5534009423667827e-05, |
|
"loss": 0.8869, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.9934902353530295, |
|
"grad_norm": 0.4584622085094452, |
|
"learning_rate": 1.5426180992414318e-05, |
|
"loss": 0.9093, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.9954932398597897, |
|
"grad_norm": 0.48583951592445374, |
|
"learning_rate": 1.5318560870610065e-05, |
|
"loss": 0.8587, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.9974962443665498, |
|
"grad_norm": 0.5246539115905762, |
|
"learning_rate": 1.5211151399879506e-05, |
|
"loss": 0.8145, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.99949924887331, |
|
"grad_norm": 0.5616730451583862, |
|
"learning_rate": 1.510395491726363e-05, |
|
"loss": 0.9115, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 1.00100150225338, |
|
"grad_norm": 0.398170530796051, |
|
"learning_rate": 1.4996973755169219e-05, |
|
"loss": 0.674, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.00100150225338, |
|
"eval_loss": 1.1421712636947632, |
|
"eval_runtime": 3.813, |
|
"eval_samples_per_second": 15.211, |
|
"eval_steps_per_second": 7.606, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 1497, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.382713588973568e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
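
The object above is the tail of a transformers Trainer state file: train records in "log_history" carry a "loss" key (every "logging_steps": 2), eval records carry "eval_loss" (here every 50 steps), and the run stops at step 1000 of "max_steps": 1497. Below is a minimal sketch, not part of the file itself, of how one might split the log back into its train and eval curves; the path "trainer_state.json" and the variable names are illustrative assumptions.

import json

# Load the state dump written by transformers' Trainer (the path is an
# assumption; the file normally lives inside each checkpoint directory).
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes two record shapes: train logs are keyed by "loss",
# eval logs by "eval_loss"; split on whichever key is present.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(len(train_logs), "train points;", len(eval_logs), "eval points")
print("last train loss:", train_logs[-1]["loss"])
print("best eval loss:", min(e["eval_loss"] for e in eval_logs))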
|
|