|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.985172981878089, |
|
"eval_steps": 500, |
|
"global_step": 453, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006589785831960461, |
|
"grad_norm": 1333.285400390625, |
|
"learning_rate": 0.0, |
|
"loss": 1.1178, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.013179571663920923, |
|
"grad_norm": 1591.2745361328125, |
|
"learning_rate": 1.0869565217391306e-06, |
|
"loss": 0.902, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.019769357495881382, |
|
"grad_norm": 574.3883056640625, |
|
"learning_rate": 2.173913043478261e-06, |
|
"loss": 0.9008, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.026359143327841845, |
|
"grad_norm": 787.6026611328125, |
|
"learning_rate": 3.2608695652173914e-06, |
|
"loss": 0.9099, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.032948929159802305, |
|
"grad_norm": 1140.8524169921875, |
|
"learning_rate": 4.347826086956522e-06, |
|
"loss": 0.8861, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.039538714991762765, |
|
"grad_norm": 572.358642578125, |
|
"learning_rate": 5.4347826086956525e-06, |
|
"loss": 0.8068, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04612850082372323, |
|
"grad_norm": 1086.734130859375, |
|
"learning_rate": 6.521739130434783e-06, |
|
"loss": 0.863, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05271828665568369, |
|
"grad_norm": 630.6577758789062, |
|
"learning_rate": 7.608695652173914e-06, |
|
"loss": 0.8086, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05930807248764415, |
|
"grad_norm": 602.673828125, |
|
"learning_rate": 8.695652173913044e-06, |
|
"loss": 0.7029, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06589785831960461, |
|
"grad_norm": 486.8932189941406, |
|
"learning_rate": 9.782608695652175e-06, |
|
"loss": 0.7151, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07248764415156507, |
|
"grad_norm": 502.26641845703125, |
|
"learning_rate": 1.0869565217391305e-05, |
|
"loss": 0.9117, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07907742998352553, |
|
"grad_norm": 510.6407470703125, |
|
"learning_rate": 1.1956521739130435e-05, |
|
"loss": 0.7578, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.085667215815486, |
|
"grad_norm": 280.19989013671875, |
|
"learning_rate": 1.3043478260869566e-05, |
|
"loss": 0.8804, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09225700164744646, |
|
"grad_norm": 660.488525390625, |
|
"learning_rate": 1.4130434782608694e-05, |
|
"loss": 0.7351, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.09884678747940692, |
|
"grad_norm": 543.288330078125, |
|
"learning_rate": 1.5217391304347828e-05, |
|
"loss": 0.8505, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.10543657331136738, |
|
"grad_norm": 449.69940185546875, |
|
"learning_rate": 1.630434782608696e-05, |
|
"loss": 0.8146, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.11202635914332784, |
|
"grad_norm": 194.49293518066406, |
|
"learning_rate": 1.739130434782609e-05, |
|
"loss": 0.8442, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.1186161449752883, |
|
"grad_norm": 445.9173583984375, |
|
"learning_rate": 1.8478260869565216e-05, |
|
"loss": 0.7377, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12520593080724876, |
|
"grad_norm": 215.03514099121094, |
|
"learning_rate": 1.956521739130435e-05, |
|
"loss": 0.7561, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.13179571663920922, |
|
"grad_norm": 389.9723815917969, |
|
"learning_rate": 2.065217391304348e-05, |
|
"loss": 0.7568, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13838550247116968, |
|
"grad_norm": 613.5521850585938, |
|
"learning_rate": 2.173913043478261e-05, |
|
"loss": 0.7034, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.14497528830313014, |
|
"grad_norm": 405.9554748535156, |
|
"learning_rate": 2.282608695652174e-05, |
|
"loss": 0.7053, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1515650741350906, |
|
"grad_norm": 215.52670288085938, |
|
"learning_rate": 2.391304347826087e-05, |
|
"loss": 0.7889, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.15815485996705106, |
|
"grad_norm": 205.6142120361328, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.7658, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16474464579901152, |
|
"grad_norm": 987.6297607421875, |
|
"learning_rate": 2.608695652173913e-05, |
|
"loss": 0.788, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.171334431630972, |
|
"grad_norm": 222.69949340820312, |
|
"learning_rate": 2.7173913043478262e-05, |
|
"loss": 0.677, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.17792421746293247, |
|
"grad_norm": 345.14007568359375, |
|
"learning_rate": 2.826086956521739e-05, |
|
"loss": 0.781, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.18451400329489293, |
|
"grad_norm": 463.8091125488281, |
|
"learning_rate": 2.9347826086956526e-05, |
|
"loss": 0.719, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.19110378912685339, |
|
"grad_norm": 305.9866943359375, |
|
"learning_rate": 3.0434782608695656e-05, |
|
"loss": 0.913, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.19769357495881384, |
|
"grad_norm": 181.99391174316406, |
|
"learning_rate": 3.152173913043479e-05, |
|
"loss": 0.8097, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2042833607907743, |
|
"grad_norm": 218.90301513671875, |
|
"learning_rate": 3.260869565217392e-05, |
|
"loss": 0.7213, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.21087314662273476, |
|
"grad_norm": 661.6412353515625, |
|
"learning_rate": 3.369565217391305e-05, |
|
"loss": 0.845, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.21746293245469522, |
|
"grad_norm": 454.7393493652344, |
|
"learning_rate": 3.478260869565218e-05, |
|
"loss": 0.8505, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.22405271828665568, |
|
"grad_norm": 410.3249816894531, |
|
"learning_rate": 3.58695652173913e-05, |
|
"loss": 0.8294, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.23064250411861614, |
|
"grad_norm": 152.13143920898438, |
|
"learning_rate": 3.695652173913043e-05, |
|
"loss": 0.8209, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2372322899505766, |
|
"grad_norm": 271.0032653808594, |
|
"learning_rate": 3.804347826086957e-05, |
|
"loss": 0.8357, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.24382207578253706, |
|
"grad_norm": 635.7935791015625, |
|
"learning_rate": 3.91304347826087e-05, |
|
"loss": 0.7799, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2504118616144975, |
|
"grad_norm": 461.5861511230469, |
|
"learning_rate": 4.021739130434783e-05, |
|
"loss": 0.765, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.257001647446458, |
|
"grad_norm": 279.9590148925781, |
|
"learning_rate": 4.130434782608696e-05, |
|
"loss": 0.8731, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.26359143327841844, |
|
"grad_norm": 765.4867553710938, |
|
"learning_rate": 4.239130434782609e-05, |
|
"loss": 0.7388, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2701812191103789, |
|
"grad_norm": 575.533447265625, |
|
"learning_rate": 4.347826086956522e-05, |
|
"loss": 0.7368, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.27677100494233936, |
|
"grad_norm": 405.68023681640625, |
|
"learning_rate": 4.456521739130435e-05, |
|
"loss": 0.7146, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2833607907742998, |
|
"grad_norm": 320.5788269042969, |
|
"learning_rate": 4.565217391304348e-05, |
|
"loss": 0.9472, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2899505766062603, |
|
"grad_norm": 159.42025756835938, |
|
"learning_rate": 4.673913043478261e-05, |
|
"loss": 0.748, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.29654036243822074, |
|
"grad_norm": 343.9827575683594, |
|
"learning_rate": 4.782608695652174e-05, |
|
"loss": 0.7369, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3031301482701812, |
|
"grad_norm": 2192.0439453125, |
|
"learning_rate": 4.891304347826087e-05, |
|
"loss": 0.8394, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.30971993410214166, |
|
"grad_norm": 379.729248046875, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8388, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3163097199341021, |
|
"grad_norm": 154.18643188476562, |
|
"learning_rate": 4.987714987714988e-05, |
|
"loss": 0.9252, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3228995057660626, |
|
"grad_norm": 561.9174194335938, |
|
"learning_rate": 4.9754299754299756e-05, |
|
"loss": 0.8088, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.32948929159802304, |
|
"grad_norm": 434.27325439453125, |
|
"learning_rate": 4.963144963144963e-05, |
|
"loss": 0.9263, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.33607907742998355, |
|
"grad_norm": 122.2130126953125, |
|
"learning_rate": 4.950859950859951e-05, |
|
"loss": 0.8773, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.342668863261944, |
|
"grad_norm": 745.0607299804688, |
|
"learning_rate": 4.9385749385749387e-05, |
|
"loss": 0.7825, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.34925864909390447, |
|
"grad_norm": 328.8779602050781, |
|
"learning_rate": 4.926289926289926e-05, |
|
"loss": 0.8562, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.35584843492586493, |
|
"grad_norm": 192.5826873779297, |
|
"learning_rate": 4.914004914004915e-05, |
|
"loss": 0.8408, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3624382207578254, |
|
"grad_norm": 290.76776123046875, |
|
"learning_rate": 4.901719901719902e-05, |
|
"loss": 0.769, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.36902800658978585, |
|
"grad_norm": 212.2420654296875, |
|
"learning_rate": 4.8894348894348894e-05, |
|
"loss": 0.7944, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3756177924217463, |
|
"grad_norm": 141.33392333984375, |
|
"learning_rate": 4.877149877149878e-05, |
|
"loss": 0.9014, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.38220757825370677, |
|
"grad_norm": 210.45494079589844, |
|
"learning_rate": 4.8648648648648654e-05, |
|
"loss": 0.8414, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.38879736408566723, |
|
"grad_norm": 160.95689392089844, |
|
"learning_rate": 4.8525798525798524e-05, |
|
"loss": 0.7485, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.3953871499176277, |
|
"grad_norm": 303.22906494140625, |
|
"learning_rate": 4.840294840294841e-05, |
|
"loss": 0.8196, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.40197693574958815, |
|
"grad_norm": 327.06805419921875, |
|
"learning_rate": 4.8280098280098285e-05, |
|
"loss": 0.7513, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4085667215815486, |
|
"grad_norm": 1190.357421875, |
|
"learning_rate": 4.8157248157248155e-05, |
|
"loss": 0.9952, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.41515650741350907, |
|
"grad_norm": 206.6424102783203, |
|
"learning_rate": 4.803439803439804e-05, |
|
"loss": 0.8309, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.42174629324546953, |
|
"grad_norm": 534.4395141601562, |
|
"learning_rate": 4.7911547911547915e-05, |
|
"loss": 0.8531, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.42833607907743, |
|
"grad_norm": 341.8865966796875, |
|
"learning_rate": 4.778869778869779e-05, |
|
"loss": 0.9542, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.43492586490939045, |
|
"grad_norm": 228.7908172607422, |
|
"learning_rate": 4.766584766584767e-05, |
|
"loss": 0.9572, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.4415156507413509, |
|
"grad_norm": 352.82086181640625, |
|
"learning_rate": 4.7542997542997546e-05, |
|
"loss": 0.8549, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.44810543657331137, |
|
"grad_norm": 105.19104766845703, |
|
"learning_rate": 4.742014742014742e-05, |
|
"loss": 0.8718, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4546952224052718, |
|
"grad_norm": 99.13899230957031, |
|
"learning_rate": 4.72972972972973e-05, |
|
"loss": 0.9738, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.4612850082372323, |
|
"grad_norm": 228.2894287109375, |
|
"learning_rate": 4.7174447174447176e-05, |
|
"loss": 0.8689, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.46787479406919275, |
|
"grad_norm": 157.54298400878906, |
|
"learning_rate": 4.705159705159705e-05, |
|
"loss": 0.8553, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4744645799011532, |
|
"grad_norm": 328.7658996582031, |
|
"learning_rate": 4.692874692874693e-05, |
|
"loss": 0.8788, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.48105436573311366, |
|
"grad_norm": 1948.38916015625, |
|
"learning_rate": 4.680589680589681e-05, |
|
"loss": 0.8151, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4876441515650741, |
|
"grad_norm": 320.9216003417969, |
|
"learning_rate": 4.6683046683046684e-05, |
|
"loss": 0.8086, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4942339373970346, |
|
"grad_norm": 1094.80517578125, |
|
"learning_rate": 4.656019656019656e-05, |
|
"loss": 0.881, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.500823723228995, |
|
"grad_norm": 262.46636962890625, |
|
"learning_rate": 4.6437346437346444e-05, |
|
"loss": 0.9475, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5074135090609555, |
|
"grad_norm": 395.812744140625, |
|
"learning_rate": 4.6314496314496314e-05, |
|
"loss": 0.9287, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.514003294892916, |
|
"grad_norm": 312.8116149902344, |
|
"learning_rate": 4.619164619164619e-05, |
|
"loss": 0.898, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5205930807248764, |
|
"grad_norm": 124.0872802734375, |
|
"learning_rate": 4.6068796068796074e-05, |
|
"loss": 0.9626, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5271828665568369, |
|
"grad_norm": 180.38021850585938, |
|
"learning_rate": 4.594594594594595e-05, |
|
"loss": 0.8902, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5337726523887973, |
|
"grad_norm": 190.2543182373047, |
|
"learning_rate": 4.582309582309582e-05, |
|
"loss": 0.8404, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5403624382207578, |
|
"grad_norm": 119.05390167236328, |
|
"learning_rate": 4.5700245700245705e-05, |
|
"loss": 0.9087, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5469522240527183, |
|
"grad_norm": 564.9111938476562, |
|
"learning_rate": 4.557739557739558e-05, |
|
"loss": 0.7668, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5535420098846787, |
|
"grad_norm": 131.78086853027344, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 0.8434, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5601317957166392, |
|
"grad_norm": 453.88775634765625, |
|
"learning_rate": 4.5331695331695335e-05, |
|
"loss": 0.8631, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5667215815485996, |
|
"grad_norm": 192.94564819335938, |
|
"learning_rate": 4.520884520884521e-05, |
|
"loss": 0.8508, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5733113673805601, |
|
"grad_norm": 178.88607788085938, |
|
"learning_rate": 4.508599508599509e-05, |
|
"loss": 0.8746, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5799011532125206, |
|
"grad_norm": 355.49322509765625, |
|
"learning_rate": 4.4963144963144966e-05, |
|
"loss": 0.8221, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.586490939044481, |
|
"grad_norm": 727.7778930664062, |
|
"learning_rate": 4.484029484029484e-05, |
|
"loss": 1.0285, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5930807248764415, |
|
"grad_norm": 1586.21337890625, |
|
"learning_rate": 4.471744471744472e-05, |
|
"loss": 0.9701, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5996705107084019, |
|
"grad_norm": 558.633544921875, |
|
"learning_rate": 4.4594594594594596e-05, |
|
"loss": 0.8067, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.6062602965403624, |
|
"grad_norm": 969.2847900390625, |
|
"learning_rate": 4.447174447174447e-05, |
|
"loss": 0.7854, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6128500823723229, |
|
"grad_norm": 437.51397705078125, |
|
"learning_rate": 4.434889434889435e-05, |
|
"loss": 0.9254, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6194398682042833, |
|
"grad_norm": 436.55853271484375, |
|
"learning_rate": 4.422604422604423e-05, |
|
"loss": 0.8167, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6260296540362438, |
|
"grad_norm": 213.31967163085938, |
|
"learning_rate": 4.4103194103194104e-05, |
|
"loss": 0.8767, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6326194398682042, |
|
"grad_norm": 121.2298583984375, |
|
"learning_rate": 4.398034398034398e-05, |
|
"loss": 0.8475, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6392092257001647, |
|
"grad_norm": 275.9543762207031, |
|
"learning_rate": 4.385749385749386e-05, |
|
"loss": 0.7616, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6457990115321252, |
|
"grad_norm": 211.8274688720703, |
|
"learning_rate": 4.373464373464374e-05, |
|
"loss": 0.8941, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6523887973640856, |
|
"grad_norm": 85.87938690185547, |
|
"learning_rate": 4.361179361179362e-05, |
|
"loss": 0.8319, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6589785831960461, |
|
"grad_norm": 160.2537841796875, |
|
"learning_rate": 4.348894348894349e-05, |
|
"loss": 0.8626, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6655683690280065, |
|
"grad_norm": 458.2138977050781, |
|
"learning_rate": 4.336609336609337e-05, |
|
"loss": 0.8062, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6721581548599671, |
|
"grad_norm": 162.5004425048828, |
|
"learning_rate": 4.324324324324325e-05, |
|
"loss": 0.8782, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6787479406919276, |
|
"grad_norm": 95.31204223632812, |
|
"learning_rate": 4.312039312039312e-05, |
|
"loss": 0.888, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.685337726523888, |
|
"grad_norm": 224.08078002929688, |
|
"learning_rate": 4.2997542997543e-05, |
|
"loss": 0.8597, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6919275123558485, |
|
"grad_norm": 1092.45458984375, |
|
"learning_rate": 4.287469287469288e-05, |
|
"loss": 0.6954, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6985172981878089, |
|
"grad_norm": 440.71038818359375, |
|
"learning_rate": 4.2751842751842756e-05, |
|
"loss": 0.9465, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.7051070840197694, |
|
"grad_norm": 530.1962280273438, |
|
"learning_rate": 4.262899262899263e-05, |
|
"loss": 0.8514, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.7116968698517299, |
|
"grad_norm": 177.9505157470703, |
|
"learning_rate": 4.250614250614251e-05, |
|
"loss": 0.9099, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7182866556836903, |
|
"grad_norm": 445.9535217285156, |
|
"learning_rate": 4.2383292383292386e-05, |
|
"loss": 0.8832, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7248764415156508, |
|
"grad_norm": 381.5651550292969, |
|
"learning_rate": 4.226044226044226e-05, |
|
"loss": 0.8822, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7314662273476112, |
|
"grad_norm": 304.6574401855469, |
|
"learning_rate": 4.213759213759214e-05, |
|
"loss": 0.9179, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7380560131795717, |
|
"grad_norm": 493.38702392578125, |
|
"learning_rate": 4.2014742014742017e-05, |
|
"loss": 0.8883, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7446457990115322, |
|
"grad_norm": 308.3809814453125, |
|
"learning_rate": 4.189189189189189e-05, |
|
"loss": 0.9063, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7512355848434926, |
|
"grad_norm": 566.2634887695312, |
|
"learning_rate": 4.176904176904177e-05, |
|
"loss": 0.8362, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7578253706754531, |
|
"grad_norm": 351.1834716796875, |
|
"learning_rate": 4.164619164619165e-05, |
|
"loss": 0.7975, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7644151565074135, |
|
"grad_norm": 431.1800842285156, |
|
"learning_rate": 4.1523341523341524e-05, |
|
"loss": 0.9101, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.771004942339374, |
|
"grad_norm": 162.2163848876953, |
|
"learning_rate": 4.14004914004914e-05, |
|
"loss": 0.9346, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7775947281713345, |
|
"grad_norm": 380.3731994628906, |
|
"learning_rate": 4.127764127764128e-05, |
|
"loss": 0.8159, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7841845140032949, |
|
"grad_norm": 162.47447204589844, |
|
"learning_rate": 4.1154791154791154e-05, |
|
"loss": 0.7163, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7907742998352554, |
|
"grad_norm": 302.619873046875, |
|
"learning_rate": 4.103194103194104e-05, |
|
"loss": 0.7443, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7973640856672158, |
|
"grad_norm": 263.76385498046875, |
|
"learning_rate": 4.0909090909090915e-05, |
|
"loss": 0.8107, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.8039538714991763, |
|
"grad_norm": 175.19789123535156, |
|
"learning_rate": 4.0786240786240785e-05, |
|
"loss": 0.7875, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.8105436573311368, |
|
"grad_norm": 201.381591796875, |
|
"learning_rate": 4.066339066339067e-05, |
|
"loss": 0.7453, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.8171334431630972, |
|
"grad_norm": 66.85758972167969, |
|
"learning_rate": 4.0540540540540545e-05, |
|
"loss": 0.7854, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8237232289950577, |
|
"grad_norm": 497.2497253417969, |
|
"learning_rate": 4.0417690417690415e-05, |
|
"loss": 0.8332, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8303130148270181, |
|
"grad_norm": 446.8744812011719, |
|
"learning_rate": 4.02948402948403e-05, |
|
"loss": 0.8435, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8369028006589786, |
|
"grad_norm": 149.99560546875, |
|
"learning_rate": 4.0171990171990176e-05, |
|
"loss": 0.7904, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8434925864909391, |
|
"grad_norm": 95.36408996582031, |
|
"learning_rate": 4.004914004914005e-05, |
|
"loss": 0.8921, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.8500823723228995, |
|
"grad_norm": 329.3395080566406, |
|
"learning_rate": 3.992628992628993e-05, |
|
"loss": 0.938, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.85667215815486, |
|
"grad_norm": 105.44376373291016, |
|
"learning_rate": 3.9803439803439806e-05, |
|
"loss": 0.7651, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8632619439868204, |
|
"grad_norm": 365.7022399902344, |
|
"learning_rate": 3.968058968058968e-05, |
|
"loss": 0.9926, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8698517298187809, |
|
"grad_norm": 157.09010314941406, |
|
"learning_rate": 3.955773955773956e-05, |
|
"loss": 0.8138, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8764415156507414, |
|
"grad_norm": 199.70140075683594, |
|
"learning_rate": 3.943488943488944e-05, |
|
"loss": 0.8207, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.8830313014827018, |
|
"grad_norm": 170.55154418945312, |
|
"learning_rate": 3.9312039312039314e-05, |
|
"loss": 0.8112, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8896210873146623, |
|
"grad_norm": 101.48678588867188, |
|
"learning_rate": 3.918918918918919e-05, |
|
"loss": 0.9179, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8962108731466227, |
|
"grad_norm": 134.48329162597656, |
|
"learning_rate": 3.906633906633907e-05, |
|
"loss": 0.7232, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.9028006589785832, |
|
"grad_norm": 55.03281784057617, |
|
"learning_rate": 3.8943488943488944e-05, |
|
"loss": 0.7995, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.9093904448105437, |
|
"grad_norm": 150.32440185546875, |
|
"learning_rate": 3.882063882063882e-05, |
|
"loss": 0.809, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.9159802306425041, |
|
"grad_norm": 338.85614013671875, |
|
"learning_rate": 3.86977886977887e-05, |
|
"loss": 0.8687, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9225700164744646, |
|
"grad_norm": 151.74453735351562, |
|
"learning_rate": 3.857493857493858e-05, |
|
"loss": 0.7813, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.929159802306425, |
|
"grad_norm": 138.03311157226562, |
|
"learning_rate": 3.845208845208845e-05, |
|
"loss": 0.8763, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.9357495881383855, |
|
"grad_norm": 391.97857666015625, |
|
"learning_rate": 3.8329238329238335e-05, |
|
"loss": 0.8187, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.942339373970346, |
|
"grad_norm": 266.4914245605469, |
|
"learning_rate": 3.820638820638821e-05, |
|
"loss": 0.6941, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.9489291598023064, |
|
"grad_norm": 79.93824005126953, |
|
"learning_rate": 3.808353808353808e-05, |
|
"loss": 0.8388, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.9555189456342669, |
|
"grad_norm": 596.604736328125, |
|
"learning_rate": 3.7960687960687965e-05, |
|
"loss": 0.7768, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9621087314662273, |
|
"grad_norm": 70.91590118408203, |
|
"learning_rate": 3.783783783783784e-05, |
|
"loss": 0.7389, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9686985172981878, |
|
"grad_norm": 305.1685485839844, |
|
"learning_rate": 3.771498771498771e-05, |
|
"loss": 0.9451, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9752883031301482, |
|
"grad_norm": 96.85557556152344, |
|
"learning_rate": 3.7592137592137596e-05, |
|
"loss": 0.8449, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9818780889621087, |
|
"grad_norm": 480.3149108886719, |
|
"learning_rate": 3.746928746928747e-05, |
|
"loss": 0.9493, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.9884678747940692, |
|
"grad_norm": 238.29258728027344, |
|
"learning_rate": 3.734643734643735e-05, |
|
"loss": 0.8697, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9950576606260296, |
|
"grad_norm": 106.33365631103516, |
|
"learning_rate": 3.7223587223587226e-05, |
|
"loss": 0.8836, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.00164744645799, |
|
"grad_norm": 439.60009765625, |
|
"learning_rate": 3.71007371007371e-05, |
|
"loss": 0.7164, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.0082372322899507, |
|
"grad_norm": 109.26066589355469, |
|
"learning_rate": 3.697788697788698e-05, |
|
"loss": 0.632, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.014827018121911, |
|
"grad_norm": 415.2508239746094, |
|
"learning_rate": 3.685503685503686e-05, |
|
"loss": 0.6479, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.0214168039538716, |
|
"grad_norm": 260.052734375, |
|
"learning_rate": 3.6732186732186734e-05, |
|
"loss": 0.5934, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.028006589785832, |
|
"grad_norm": 1057.008056640625, |
|
"learning_rate": 3.660933660933661e-05, |
|
"loss": 0.5648, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.0345963756177925, |
|
"grad_norm": 106.66386413574219, |
|
"learning_rate": 3.648648648648649e-05, |
|
"loss": 0.5957, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.0411861614497528, |
|
"grad_norm": 130.90151977539062, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.5769, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.0477759472817134, |
|
"grad_norm": 262.9067077636719, |
|
"learning_rate": 3.624078624078625e-05, |
|
"loss": 0.6341, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.0543657331136738, |
|
"grad_norm": 59.18153762817383, |
|
"learning_rate": 3.611793611793612e-05, |
|
"loss": 0.5328, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0609555189456343, |
|
"grad_norm": 122.3954849243164, |
|
"learning_rate": 3.5995085995085995e-05, |
|
"loss": 0.5212, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.0675453047775947, |
|
"grad_norm": 219.09283447265625, |
|
"learning_rate": 3.587223587223588e-05, |
|
"loss": 0.5569, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0741350906095553, |
|
"grad_norm": 321.9674987792969, |
|
"learning_rate": 3.574938574938575e-05, |
|
"loss": 0.5809, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.0807248764415156, |
|
"grad_norm": 83.09851837158203, |
|
"learning_rate": 3.562653562653563e-05, |
|
"loss": 0.5576, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.0873146622734762, |
|
"grad_norm": 138.06068420410156, |
|
"learning_rate": 3.550368550368551e-05, |
|
"loss": 0.5051, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.0939044481054365, |
|
"grad_norm": 152.32656860351562, |
|
"learning_rate": 3.538083538083538e-05, |
|
"loss": 0.5291, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.100494233937397, |
|
"grad_norm": 327.38824462890625, |
|
"learning_rate": 3.525798525798526e-05, |
|
"loss": 0.6476, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.1070840197693574, |
|
"grad_norm": 121.95663452148438, |
|
"learning_rate": 3.513513513513514e-05, |
|
"loss": 0.5158, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.113673805601318, |
|
"grad_norm": 92.62237548828125, |
|
"learning_rate": 3.501228501228501e-05, |
|
"loss": 0.5702, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.1202635914332784, |
|
"grad_norm": 683.2556762695312, |
|
"learning_rate": 3.488943488943489e-05, |
|
"loss": 0.621, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.126853377265239, |
|
"grad_norm": 229.4330291748047, |
|
"learning_rate": 3.476658476658477e-05, |
|
"loss": 0.5686, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.1334431630971993, |
|
"grad_norm": 213.6857147216797, |
|
"learning_rate": 3.4643734643734647e-05, |
|
"loss": 0.5534, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.1400329489291599, |
|
"grad_norm": 150.42703247070312, |
|
"learning_rate": 3.452088452088452e-05, |
|
"loss": 0.4501, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.1466227347611202, |
|
"grad_norm": 405.1623840332031, |
|
"learning_rate": 3.43980343980344e-05, |
|
"loss": 0.4762, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.1532125205930808, |
|
"grad_norm": 213.4350128173828, |
|
"learning_rate": 3.427518427518428e-05, |
|
"loss": 0.5208, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.1598023064250411, |
|
"grad_norm": 527.8944702148438, |
|
"learning_rate": 3.4152334152334154e-05, |
|
"loss": 0.6235, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.1663920922570017, |
|
"grad_norm": 192.6360626220703, |
|
"learning_rate": 3.402948402948403e-05, |
|
"loss": 0.4193, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.172981878088962, |
|
"grad_norm": 203.14984130859375, |
|
"learning_rate": 3.390663390663391e-05, |
|
"loss": 0.6234, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.1795716639209226, |
|
"grad_norm": 206.81492614746094, |
|
"learning_rate": 3.3783783783783784e-05, |
|
"loss": 0.4987, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.186161449752883, |
|
"grad_norm": 243.38145446777344, |
|
"learning_rate": 3.366093366093366e-05, |
|
"loss": 0.5247, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1927512355848435, |
|
"grad_norm": 161.61740112304688, |
|
"learning_rate": 3.3538083538083545e-05, |
|
"loss": 0.5509, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.1993410214168039, |
|
"grad_norm": 291.75469970703125, |
|
"learning_rate": 3.3415233415233415e-05, |
|
"loss": 0.5825, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.2059308072487644, |
|
"grad_norm": 133.4263458251953, |
|
"learning_rate": 3.329238329238329e-05, |
|
"loss": 0.6691, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.2125205930807248, |
|
"grad_norm": 1017.546875, |
|
"learning_rate": 3.3169533169533175e-05, |
|
"loss": 0.4931, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.2191103789126854, |
|
"grad_norm": 108.6457748413086, |
|
"learning_rate": 3.3046683046683045e-05, |
|
"loss": 0.5175, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.2257001647446457, |
|
"grad_norm": 146.6004638671875, |
|
"learning_rate": 3.292383292383293e-05, |
|
"loss": 0.4866, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.2322899505766063, |
|
"grad_norm": 178.45260620117188, |
|
"learning_rate": 3.2800982800982806e-05, |
|
"loss": 0.5496, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.2388797364085666, |
|
"grad_norm": 373.0599365234375, |
|
"learning_rate": 3.2678132678132676e-05, |
|
"loss": 0.5361, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.2454695222405272, |
|
"grad_norm": 146.2403106689453, |
|
"learning_rate": 3.255528255528256e-05, |
|
"loss": 0.5619, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.2520593080724876, |
|
"grad_norm": 49.14468002319336, |
|
"learning_rate": 3.2432432432432436e-05, |
|
"loss": 0.4598, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.2586490939044481, |
|
"grad_norm": 1165.1783447265625, |
|
"learning_rate": 3.2309582309582306e-05, |
|
"loss": 0.5666, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.2652388797364087, |
|
"grad_norm": 61.536949157714844, |
|
"learning_rate": 3.218673218673219e-05, |
|
"loss": 0.5534, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.271828665568369, |
|
"grad_norm": 595.9248046875, |
|
"learning_rate": 3.206388206388207e-05, |
|
"loss": 0.4841, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.2784184514003294, |
|
"grad_norm": 186.39930725097656, |
|
"learning_rate": 3.1941031941031943e-05, |
|
"loss": 0.4922, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.28500823723229, |
|
"grad_norm": 325.35980224609375, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 0.5436, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.2915980230642505, |
|
"grad_norm": 75.9457015991211, |
|
"learning_rate": 3.16953316953317e-05, |
|
"loss": 0.5036, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.2981878088962109, |
|
"grad_norm": 67.08493041992188, |
|
"learning_rate": 3.1572481572481574e-05, |
|
"loss": 0.5175, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.3047775947281712, |
|
"grad_norm": 74.11741638183594, |
|
"learning_rate": 3.144963144963145e-05, |
|
"loss": 0.518, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.3113673805601318, |
|
"grad_norm": 280.2093811035156, |
|
"learning_rate": 3.132678132678133e-05, |
|
"loss": 0.5865, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.3179571663920924, |
|
"grad_norm": 168.95388793945312, |
|
"learning_rate": 3.120393120393121e-05, |
|
"loss": 0.6006, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.3245469522240527, |
|
"grad_norm": 411.6920471191406, |
|
"learning_rate": 3.108108108108108e-05, |
|
"loss": 0.5912, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.331136738056013, |
|
"grad_norm": 208.7516632080078, |
|
"learning_rate": 3.095823095823096e-05, |
|
"loss": 0.4938, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.3377265238879736, |
|
"grad_norm": 49.95132827758789, |
|
"learning_rate": 3.083538083538084e-05, |
|
"loss": 0.5094, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.3443163097199342, |
|
"grad_norm": 73.55326080322266, |
|
"learning_rate": 3.071253071253071e-05, |
|
"loss": 0.584, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.3509060955518946, |
|
"grad_norm": 102.9446792602539, |
|
"learning_rate": 3.058968058968059e-05, |
|
"loss": 0.5473, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.357495881383855, |
|
"grad_norm": 457.49359130859375, |
|
"learning_rate": 3.046683046683047e-05, |
|
"loss": 0.6395, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.3640856672158155, |
|
"grad_norm": 236.29953002929688, |
|
"learning_rate": 3.0343980343980342e-05, |
|
"loss": 0.5853, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.370675453047776, |
|
"grad_norm": 39.58445358276367, |
|
"learning_rate": 3.0221130221130222e-05, |
|
"loss": 0.5631, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.3772652388797364, |
|
"grad_norm": 275.3215026855469, |
|
"learning_rate": 3.0098280098280103e-05, |
|
"loss": 0.5391, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.3838550247116967, |
|
"grad_norm": 231.88194274902344, |
|
"learning_rate": 2.9975429975429976e-05, |
|
"loss": 0.5002, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3904448105436573, |
|
"grad_norm": 294.2489929199219, |
|
"learning_rate": 2.9852579852579853e-05, |
|
"loss": 0.5846, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.3970345963756179, |
|
"grad_norm": 203.10426330566406, |
|
"learning_rate": 2.9729729729729733e-05, |
|
"loss": 0.6386, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.4036243822075782, |
|
"grad_norm": 84.11065673828125, |
|
"learning_rate": 2.9606879606879607e-05, |
|
"loss": 0.4882, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.4102141680395386, |
|
"grad_norm": 220.28628540039062, |
|
"learning_rate": 2.9484029484029483e-05, |
|
"loss": 0.6756, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.4168039538714992, |
|
"grad_norm": 236.40895080566406, |
|
"learning_rate": 2.9361179361179364e-05, |
|
"loss": 0.5164, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.4233937397034597, |
|
"grad_norm": 229.29913330078125, |
|
"learning_rate": 2.9238329238329237e-05, |
|
"loss": 0.5916, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.42998352553542, |
|
"grad_norm": 137.1915740966797, |
|
"learning_rate": 2.9115479115479117e-05, |
|
"loss": 0.6021, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.4365733113673804, |
|
"grad_norm": 173.90122985839844, |
|
"learning_rate": 2.8992628992628994e-05, |
|
"loss": 0.6061, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.443163097199341, |
|
"grad_norm": 99.96955108642578, |
|
"learning_rate": 2.8869778869778868e-05, |
|
"loss": 0.5733, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.4497528830313016, |
|
"grad_norm": 56.602989196777344, |
|
"learning_rate": 2.8746928746928748e-05, |
|
"loss": 0.5788, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.456342668863262, |
|
"grad_norm": 69.04216003417969, |
|
"learning_rate": 2.8624078624078625e-05, |
|
"loss": 0.4894, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.4629324546952225, |
|
"grad_norm": 265.542724609375, |
|
"learning_rate": 2.8501228501228505e-05, |
|
"loss": 0.5877, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.4695222405271828, |
|
"grad_norm": 203.73353576660156, |
|
"learning_rate": 2.8378378378378378e-05, |
|
"loss": 0.5662, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.4761120263591434, |
|
"grad_norm": 174.82192993164062, |
|
"learning_rate": 2.825552825552826e-05, |
|
"loss": 0.567, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.4827018121911038, |
|
"grad_norm": 197.1634063720703, |
|
"learning_rate": 2.8132678132678135e-05, |
|
"loss": 0.5622, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4892915980230643, |
|
"grad_norm": 313.1665954589844, |
|
"learning_rate": 2.800982800982801e-05, |
|
"loss": 0.5647, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.4958813838550247, |
|
"grad_norm": 337.4092102050781, |
|
"learning_rate": 2.788697788697789e-05, |
|
"loss": 0.5896, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.5024711696869852, |
|
"grad_norm": 335.1864318847656, |
|
"learning_rate": 2.776412776412777e-05, |
|
"loss": 0.5274, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.5090609555189456, |
|
"grad_norm": 149.53665161132812, |
|
"learning_rate": 2.764127764127764e-05, |
|
"loss": 0.6525, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.515650741350906, |
|
"grad_norm": 211.15191650390625, |
|
"learning_rate": 2.751842751842752e-05, |
|
"loss": 0.5824, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.5222405271828665, |
|
"grad_norm": 96.61034393310547, |
|
"learning_rate": 2.73955773955774e-05, |
|
"loss": 0.5774, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.528830313014827, |
|
"grad_norm": 158.73837280273438, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 0.5415, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.5354200988467874, |
|
"grad_norm": 50.02914810180664, |
|
"learning_rate": 2.714987714987715e-05, |
|
"loss": 0.5729, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.5420098846787478, |
|
"grad_norm": 57.91206359863281, |
|
"learning_rate": 2.702702702702703e-05, |
|
"loss": 0.6118, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.5485996705107083, |
|
"grad_norm": 134.28807067871094, |
|
"learning_rate": 2.6904176904176904e-05, |
|
"loss": 0.4875, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.555189456342669, |
|
"grad_norm": 111.96310424804688, |
|
"learning_rate": 2.678132678132678e-05, |
|
"loss": 0.5112, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.5617792421746293, |
|
"grad_norm": 210.6829376220703, |
|
"learning_rate": 2.665847665847666e-05, |
|
"loss": 0.5276, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.5683690280065898, |
|
"grad_norm": 160.88055419921875, |
|
"learning_rate": 2.6535626535626534e-05, |
|
"loss": 0.502, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.5749588138385504, |
|
"grad_norm": 230.14341735839844, |
|
"learning_rate": 2.6412776412776414e-05, |
|
"loss": 0.6196, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.5815485996705108, |
|
"grad_norm": 407.249267578125, |
|
"learning_rate": 2.628992628992629e-05, |
|
"loss": 0.4606, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.588138385502471, |
|
"grad_norm": 261.7560119628906, |
|
"learning_rate": 2.616707616707617e-05, |
|
"loss": 0.6383, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.5947281713344317, |
|
"grad_norm": 52.85226821899414, |
|
"learning_rate": 2.6044226044226045e-05, |
|
"loss": 0.5304, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.6013179571663922, |
|
"grad_norm": 123.97418975830078, |
|
"learning_rate": 2.5921375921375925e-05, |
|
"loss": 0.5707, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.6079077429983526, |
|
"grad_norm": 230.69007873535156, |
|
"learning_rate": 2.5798525798525802e-05, |
|
"loss": 0.4895, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.614497528830313, |
|
"grad_norm": 228.80450439453125, |
|
"learning_rate": 2.5675675675675675e-05, |
|
"loss": 0.6058, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.6210873146622735, |
|
"grad_norm": 94.60694122314453, |
|
"learning_rate": 2.5552825552825555e-05, |
|
"loss": 0.5854, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.627677100494234, |
|
"grad_norm": 170.16766357421875, |
|
"learning_rate": 2.5429975429975432e-05, |
|
"loss": 0.587, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.6342668863261944, |
|
"grad_norm": 374.7227783203125, |
|
"learning_rate": 2.5307125307125306e-05, |
|
"loss": 0.468, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.6408566721581548, |
|
"grad_norm": 304.4844665527344, |
|
"learning_rate": 2.5184275184275186e-05, |
|
"loss": 0.4362, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.6474464579901154, |
|
"grad_norm": 555.0403442382812, |
|
"learning_rate": 2.5061425061425066e-05, |
|
"loss": 0.4977, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.654036243822076, |
|
"grad_norm": 282.910888671875, |
|
"learning_rate": 2.493857493857494e-05, |
|
"loss": 0.6255, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.6606260296540363, |
|
"grad_norm": 141.1566925048828, |
|
"learning_rate": 2.4815724815724816e-05, |
|
"loss": 0.5125, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.6672158154859966, |
|
"grad_norm": 141.13299560546875, |
|
"learning_rate": 2.4692874692874693e-05, |
|
"loss": 0.563, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.6738056013179572, |
|
"grad_norm": 167.4251708984375, |
|
"learning_rate": 2.4570024570024573e-05, |
|
"loss": 0.4985, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.6803953871499178, |
|
"grad_norm": 214.25567626953125, |
|
"learning_rate": 2.4447174447174447e-05, |
|
"loss": 0.5863, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.6869851729818781, |
|
"grad_norm": 183.01986694335938, |
|
"learning_rate": 2.4324324324324327e-05, |
|
"loss": 0.4937, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.6935749588138385, |
|
"grad_norm": 131.54083251953125, |
|
"learning_rate": 2.4201474201474204e-05, |
|
"loss": 0.6094, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.700164744645799, |
|
"grad_norm": 443.8067626953125, |
|
"learning_rate": 2.4078624078624077e-05, |
|
"loss": 0.4898, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.7067545304777596, |
|
"grad_norm": 133.2246551513672, |
|
"learning_rate": 2.3955773955773958e-05, |
|
"loss": 0.4859, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.71334431630972, |
|
"grad_norm": 259.9535217285156, |
|
"learning_rate": 2.3832923832923834e-05, |
|
"loss": 0.5411, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.7199341021416803, |
|
"grad_norm": 380.90997314453125, |
|
"learning_rate": 2.371007371007371e-05, |
|
"loss": 0.5762, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.7265238879736409, |
|
"grad_norm": 423.51702880859375, |
|
"learning_rate": 2.3587223587223588e-05, |
|
"loss": 0.5791, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.7331136738056014, |
|
"grad_norm": 256.31378173828125, |
|
"learning_rate": 2.3464373464373465e-05, |
|
"loss": 0.5474, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.7397034596375618, |
|
"grad_norm": 352.33868408203125, |
|
"learning_rate": 2.3341523341523342e-05, |
|
"loss": 0.5265, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.7462932454695221, |
|
"grad_norm": 378.3638000488281, |
|
"learning_rate": 2.3218673218673222e-05, |
|
"loss": 0.5491, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.7528830313014827, |
|
"grad_norm": 209.05747985839844, |
|
"learning_rate": 2.3095823095823095e-05, |
|
"loss": 0.5141, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.7594728171334433, |
|
"grad_norm": 141.59524536132812, |
|
"learning_rate": 2.2972972972972976e-05, |
|
"loss": 0.6506, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.7660626029654036, |
|
"grad_norm": 219.4475555419922, |
|
"learning_rate": 2.2850122850122852e-05, |
|
"loss": 0.7009, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.772652388797364, |
|
"grad_norm": 80.54459381103516, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 0.5327, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.7792421746293245, |
|
"grad_norm": 699.1453247070312, |
|
"learning_rate": 2.2604422604422606e-05, |
|
"loss": 0.5236, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.7858319604612851, |
|
"grad_norm": 289.4430236816406, |
|
"learning_rate": 2.2481572481572483e-05, |
|
"loss": 0.4824, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.7924217462932455, |
|
"grad_norm": 182.82986450195312, |
|
"learning_rate": 2.235872235872236e-05, |
|
"loss": 0.5676, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.7990115321252058, |
|
"grad_norm": 225.81126403808594, |
|
"learning_rate": 2.2235872235872237e-05, |
|
"loss": 0.5194, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.8056013179571664, |
|
"grad_norm": 304.61309814453125, |
|
"learning_rate": 2.2113022113022113e-05, |
|
"loss": 0.626, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.812191103789127, |
|
"grad_norm": 154.47415161132812, |
|
"learning_rate": 2.199017199017199e-05, |
|
"loss": 0.5235, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.8187808896210873, |
|
"grad_norm": 530.3298950195312, |
|
"learning_rate": 2.186732186732187e-05, |
|
"loss": 0.466, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.8253706754530477, |
|
"grad_norm": 61.56108093261719, |
|
"learning_rate": 2.1744471744471744e-05, |
|
"loss": 0.4229, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.8319604612850082, |
|
"grad_norm": 427.62469482421875, |
|
"learning_rate": 2.1621621621621624e-05, |
|
"loss": 0.644, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.8385502471169688, |
|
"grad_norm": 95.06147003173828, |
|
"learning_rate": 2.14987714987715e-05, |
|
"loss": 0.5268, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.8451400329489291, |
|
"grad_norm": 85.69621276855469, |
|
"learning_rate": 2.1375921375921378e-05, |
|
"loss": 0.4855, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.8517298187808895, |
|
"grad_norm": 526.4759521484375, |
|
"learning_rate": 2.1253071253071255e-05, |
|
"loss": 0.5573, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.85831960461285, |
|
"grad_norm": 265.5906677246094, |
|
"learning_rate": 2.113022113022113e-05, |
|
"loss": 0.555, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.8649093904448106, |
|
"grad_norm": 414.3144226074219, |
|
"learning_rate": 2.1007371007371008e-05, |
|
"loss": 0.5584, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.871499176276771, |
|
"grad_norm": 304.21405029296875, |
|
"learning_rate": 2.0884520884520885e-05, |
|
"loss": 0.4671, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.8780889621087313, |
|
"grad_norm": 414.1387023925781, |
|
"learning_rate": 2.0761670761670762e-05, |
|
"loss": 0.6691, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.884678747940692, |
|
"grad_norm": 208.69493103027344, |
|
"learning_rate": 2.063882063882064e-05, |
|
"loss": 0.6271, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.8912685337726525, |
|
"grad_norm": 430.6809387207031, |
|
"learning_rate": 2.051597051597052e-05, |
|
"loss": 0.5414, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.8978583196046128, |
|
"grad_norm": 115.23016357421875, |
|
"learning_rate": 2.0393120393120392e-05, |
|
"loss": 0.5869, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.9044481054365732, |
|
"grad_norm": 242.86927795410156, |
|
"learning_rate": 2.0270270270270273e-05, |
|
"loss": 0.5224, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.9110378912685337, |
|
"grad_norm": 250.8336944580078, |
|
"learning_rate": 2.014742014742015e-05, |
|
"loss": 0.6022, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.9176276771004943, |
|
"grad_norm": 104.50414276123047, |
|
"learning_rate": 2.0024570024570026e-05, |
|
"loss": 0.6064, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.9242174629324547, |
|
"grad_norm": 466.93768310546875, |
|
"learning_rate": 1.9901719901719903e-05, |
|
"loss": 0.6291, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.930807248764415, |
|
"grad_norm": 138.8919219970703, |
|
"learning_rate": 1.977886977886978e-05, |
|
"loss": 0.514, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.9373970345963756, |
|
"grad_norm": 532.3485717773438, |
|
"learning_rate": 1.9656019656019657e-05, |
|
"loss": 0.4918, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.9439868204283361, |
|
"grad_norm": 116.6861572265625, |
|
"learning_rate": 1.9533169533169534e-05, |
|
"loss": 0.7242, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.9505766062602965, |
|
"grad_norm": 338.7384338378906, |
|
"learning_rate": 1.941031941031941e-05, |
|
"loss": 0.5006, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.9571663920922568, |
|
"grad_norm": 57.919403076171875, |
|
"learning_rate": 1.928746928746929e-05, |
|
"loss": 0.5343, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.9637561779242174, |
|
"grad_norm": 300.79095458984375, |
|
"learning_rate": 1.9164619164619167e-05, |
|
"loss": 0.4868, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.970345963756178, |
|
"grad_norm": 81.19691467285156, |
|
"learning_rate": 1.904176904176904e-05, |
|
"loss": 0.4897, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.9769357495881383, |
|
"grad_norm": 159.11351013183594, |
|
"learning_rate": 1.891891891891892e-05, |
|
"loss": 0.5667, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9835255354200987, |
|
"grad_norm": 80.84410095214844, |
|
"learning_rate": 1.8796068796068798e-05, |
|
"loss": 0.5623, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.9901153212520593, |
|
"grad_norm": 185.26185607910156, |
|
"learning_rate": 1.8673218673218675e-05, |
|
"loss": 0.5102, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.9967051070840198, |
|
"grad_norm": 110.74467468261719, |
|
"learning_rate": 1.855036855036855e-05, |
|
"loss": 0.4176, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.00329489291598, |
|
"grad_norm": 175.4639892578125, |
|
"learning_rate": 1.842751842751843e-05, |
|
"loss": 0.4222, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.0098846787479405, |
|
"grad_norm": 123.6357192993164, |
|
"learning_rate": 1.8304668304668305e-05, |
|
"loss": 0.3096, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.0164744645799013, |
|
"grad_norm": 237.6382598876953, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.4244, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.0230642504118617, |
|
"grad_norm": 303.2618103027344, |
|
"learning_rate": 1.805896805896806e-05, |
|
"loss": 0.2701, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.029654036243822, |
|
"grad_norm": 338.3935241699219, |
|
"learning_rate": 1.793611793611794e-05, |
|
"loss": 0.3519, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.0362438220757824, |
|
"grad_norm": 246.98533630371094, |
|
"learning_rate": 1.7813267813267816e-05, |
|
"loss": 0.3403, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.042833607907743, |
|
"grad_norm": 376.6452941894531, |
|
"learning_rate": 1.769041769041769e-05, |
|
"loss": 0.336, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.0494233937397035, |
|
"grad_norm": 134.3882293701172, |
|
"learning_rate": 1.756756756756757e-05, |
|
"loss": 0.311, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.056013179571664, |
|
"grad_norm": 45.66189956665039, |
|
"learning_rate": 1.7444717444717446e-05, |
|
"loss": 0.3463, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.062602965403624, |
|
"grad_norm": 251.7926788330078, |
|
"learning_rate": 1.7321867321867323e-05, |
|
"loss": 0.2922, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.069192751235585, |
|
"grad_norm": 86.86029815673828, |
|
"learning_rate": 1.71990171990172e-05, |
|
"loss": 0.2848, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.0757825370675453, |
|
"grad_norm": 72.99238586425781, |
|
"learning_rate": 1.7076167076167077e-05, |
|
"loss": 0.2699, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.0823723228995057, |
|
"grad_norm": 348.0635681152344, |
|
"learning_rate": 1.6953316953316954e-05, |
|
"loss": 0.2766, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.088962108731466, |
|
"grad_norm": 307.6921691894531, |
|
"learning_rate": 1.683046683046683e-05, |
|
"loss": 0.3256, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.095551894563427, |
|
"grad_norm": 92.43419647216797, |
|
"learning_rate": 1.6707616707616707e-05, |
|
"loss": 0.312, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.102141680395387, |
|
"grad_norm": 365.3904113769531, |
|
"learning_rate": 1.6584766584766588e-05, |
|
"loss": 0.3238, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.1087314662273475, |
|
"grad_norm": 243.0485076904297, |
|
"learning_rate": 1.6461916461916464e-05, |
|
"loss": 0.3406, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.115321252059308, |
|
"grad_norm": 70.53246307373047, |
|
"learning_rate": 1.6339066339066338e-05, |
|
"loss": 0.2882, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.1219110378912687, |
|
"grad_norm": 271.1737060546875, |
|
"learning_rate": 1.6216216216216218e-05, |
|
"loss": 0.318, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.128500823723229, |
|
"grad_norm": 944.7637329101562, |
|
"learning_rate": 1.6093366093366095e-05, |
|
"loss": 0.3186, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.1350906095551894, |
|
"grad_norm": 286.0992736816406, |
|
"learning_rate": 1.5970515970515972e-05, |
|
"loss": 0.3693, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.1416803953871497, |
|
"grad_norm": 331.73931884765625, |
|
"learning_rate": 1.584766584766585e-05, |
|
"loss": 0.3566, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.1482701812191105, |
|
"grad_norm": 68.8238754272461, |
|
"learning_rate": 1.5724815724815725e-05, |
|
"loss": 0.3458, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.154859967051071, |
|
"grad_norm": 117.55406188964844, |
|
"learning_rate": 1.5601965601965606e-05, |
|
"loss": 0.2875, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.161449752883031, |
|
"grad_norm": 137.7025909423828, |
|
"learning_rate": 1.547911547911548e-05, |
|
"loss": 0.2866, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.168039538714992, |
|
"grad_norm": 49.644142150878906, |
|
"learning_rate": 1.5356265356265356e-05, |
|
"loss": 0.2936, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.1746293245469523, |
|
"grad_norm": 228.79408264160156, |
|
"learning_rate": 1.5233415233415234e-05, |
|
"loss": 0.3444, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.1812191103789127, |
|
"grad_norm": 197.12803649902344, |
|
"learning_rate": 1.5110565110565111e-05, |
|
"loss": 0.38, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.187808896210873, |
|
"grad_norm": 239.77589416503906, |
|
"learning_rate": 1.4987714987714988e-05, |
|
"loss": 0.3348, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.1943986820428334, |
|
"grad_norm": 52.3128547668457, |
|
"learning_rate": 1.4864864864864867e-05, |
|
"loss": 0.3723, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.200988467874794, |
|
"grad_norm": 136.6421661376953, |
|
"learning_rate": 1.4742014742014742e-05, |
|
"loss": 0.2879, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.2075782537067545, |
|
"grad_norm": 104.56753540039062, |
|
"learning_rate": 1.4619164619164619e-05, |
|
"loss": 0.2591, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.214168039538715, |
|
"grad_norm": 98.5406265258789, |
|
"learning_rate": 1.4496314496314497e-05, |
|
"loss": 0.3729, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.2207578253706757, |
|
"grad_norm": 200.8502960205078, |
|
"learning_rate": 1.4373464373464374e-05, |
|
"loss": 0.3363, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.227347611202636, |
|
"grad_norm": 66.05599212646484, |
|
"learning_rate": 1.4250614250614252e-05, |
|
"loss": 0.3238, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.2339373970345964, |
|
"grad_norm": 229.73007202148438, |
|
"learning_rate": 1.412776412776413e-05, |
|
"loss": 0.3587, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.2405271828665567, |
|
"grad_norm": 117.70530700683594, |
|
"learning_rate": 1.4004914004914004e-05, |
|
"loss": 0.4228, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.247116968698517, |
|
"grad_norm": 132.34347534179688, |
|
"learning_rate": 1.3882063882063885e-05, |
|
"loss": 0.2763, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.253706754530478, |
|
"grad_norm": 133.9874267578125, |
|
"learning_rate": 1.375921375921376e-05, |
|
"loss": 0.2553, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.260296540362438, |
|
"grad_norm": 106.14327239990234, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 0.3593, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.2668863261943986, |
|
"grad_norm": 188.0239715576172, |
|
"learning_rate": 1.3513513513513515e-05, |
|
"loss": 0.2971, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.2734761120263594, |
|
"grad_norm": 255.7284698486328, |
|
"learning_rate": 1.339066339066339e-05, |
|
"loss": 0.3132, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.2800658978583197, |
|
"grad_norm": 400.6213073730469, |
|
"learning_rate": 1.3267813267813267e-05, |
|
"loss": 0.3044, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.28665568369028, |
|
"grad_norm": 104.66104888916016, |
|
"learning_rate": 1.3144963144963146e-05, |
|
"loss": 0.3143, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.2932454695222404, |
|
"grad_norm": 49.61936569213867, |
|
"learning_rate": 1.3022113022113022e-05, |
|
"loss": 0.3229, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.2998352553542007, |
|
"grad_norm": 349.636962890625, |
|
"learning_rate": 1.2899262899262901e-05, |
|
"loss": 0.3035, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.3064250411861615, |
|
"grad_norm": 284.7281494140625, |
|
"learning_rate": 1.2776412776412778e-05, |
|
"loss": 0.3167, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.313014827018122, |
|
"grad_norm": 49.98039245605469, |
|
"learning_rate": 1.2653562653562653e-05, |
|
"loss": 0.278, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.3196046128500822, |
|
"grad_norm": 67.77862548828125, |
|
"learning_rate": 1.2530712530712533e-05, |
|
"loss": 0.3302, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.326194398682043, |
|
"grad_norm": 118.573486328125, |
|
"learning_rate": 1.2407862407862408e-05, |
|
"loss": 0.3006, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.3327841845140034, |
|
"grad_norm": 33.04819107055664, |
|
"learning_rate": 1.2285012285012287e-05, |
|
"loss": 0.3596, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.3393739703459637, |
|
"grad_norm": 161.4209747314453, |
|
"learning_rate": 1.2162162162162164e-05, |
|
"loss": 0.3984, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.345963756177924, |
|
"grad_norm": 44.77053451538086, |
|
"learning_rate": 1.2039312039312039e-05, |
|
"loss": 0.4111, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.352553542009885, |
|
"grad_norm": 343.490966796875, |
|
"learning_rate": 1.1916461916461917e-05, |
|
"loss": 0.3359, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.359143327841845, |
|
"grad_norm": 864.7401733398438, |
|
"learning_rate": 1.1793611793611794e-05, |
|
"loss": 0.2895, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.3657331136738056, |
|
"grad_norm": 148.0756378173828, |
|
"learning_rate": 1.1670761670761671e-05, |
|
"loss": 0.335, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.372322899505766, |
|
"grad_norm": 78.04084777832031, |
|
"learning_rate": 1.1547911547911548e-05, |
|
"loss": 0.2999, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.3789126853377267, |
|
"grad_norm": 77.24546813964844, |
|
"learning_rate": 1.1425061425061426e-05, |
|
"loss": 0.3043, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.385502471169687, |
|
"grad_norm": 60.59427261352539, |
|
"learning_rate": 1.1302211302211303e-05, |
|
"loss": 0.3838, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.3920922570016474, |
|
"grad_norm": 208.8542938232422, |
|
"learning_rate": 1.117936117936118e-05, |
|
"loss": 0.2956, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.3986820428336078, |
|
"grad_norm": 403.25823974609375, |
|
"learning_rate": 1.1056511056511057e-05, |
|
"loss": 0.2976, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.4052718286655685, |
|
"grad_norm": 152.59671020507812, |
|
"learning_rate": 1.0933660933660935e-05, |
|
"loss": 0.3533, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.411861614497529, |
|
"grad_norm": 533.1868286132812, |
|
"learning_rate": 1.0810810810810812e-05, |
|
"loss": 0.3418, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.4184514003294892, |
|
"grad_norm": 191.50588989257812, |
|
"learning_rate": 1.0687960687960689e-05, |
|
"loss": 0.3042, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.4250411861614496, |
|
"grad_norm": 81.26240539550781, |
|
"learning_rate": 1.0565110565110566e-05, |
|
"loss": 0.3436, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.4316309719934104, |
|
"grad_norm": 281.524169921875, |
|
"learning_rate": 1.0442260442260443e-05, |
|
"loss": 0.3453, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.4382207578253707, |
|
"grad_norm": 221.48391723632812, |
|
"learning_rate": 1.031941031941032e-05, |
|
"loss": 0.3288, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.444810543657331, |
|
"grad_norm": 89.54031372070312, |
|
"learning_rate": 1.0196560196560196e-05, |
|
"loss": 0.3415, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.4514003294892914, |
|
"grad_norm": 102.45128631591797, |
|
"learning_rate": 1.0073710073710075e-05, |
|
"loss": 0.3264, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.4579901153212522, |
|
"grad_norm": 93.46699523925781, |
|
"learning_rate": 9.950859950859952e-06, |
|
"loss": 0.347, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.4645799011532126, |
|
"grad_norm": 288.0792541503906, |
|
"learning_rate": 9.828009828009828e-06, |
|
"loss": 0.3834, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.471169686985173, |
|
"grad_norm": 119.23399353027344, |
|
"learning_rate": 9.705159705159705e-06, |
|
"loss": 0.2886, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.4777594728171333, |
|
"grad_norm": 1750.0697021484375, |
|
"learning_rate": 9.582309582309584e-06, |
|
"loss": 0.339, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.484349258649094, |
|
"grad_norm": 188.45376586914062, |
|
"learning_rate": 9.45945945945946e-06, |
|
"loss": 0.2951, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.4909390444810544, |
|
"grad_norm": 68.5443115234375, |
|
"learning_rate": 9.336609336609337e-06, |
|
"loss": 0.3369, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.4975288303130148, |
|
"grad_norm": 43.438167572021484, |
|
"learning_rate": 9.213759213759214e-06, |
|
"loss": 0.2915, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.504118616144975, |
|
"grad_norm": 70.3156967163086, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 0.3346, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.510708401976936, |
|
"grad_norm": 150.3292694091797, |
|
"learning_rate": 8.96805896805897e-06, |
|
"loss": 0.4153, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.5172981878088962, |
|
"grad_norm": 57.9390983581543, |
|
"learning_rate": 8.845208845208845e-06, |
|
"loss": 0.3337, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.5238879736408566, |
|
"grad_norm": 238.041259765625, |
|
"learning_rate": 8.722358722358723e-06, |
|
"loss": 0.3022, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.5304777594728174, |
|
"grad_norm": 181.7864227294922, |
|
"learning_rate": 8.5995085995086e-06, |
|
"loss": 0.3024, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.5370675453047777, |
|
"grad_norm": 170.37905883789062, |
|
"learning_rate": 8.476658476658477e-06, |
|
"loss": 0.3579, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.543657331136738, |
|
"grad_norm": 36.57583999633789, |
|
"learning_rate": 8.353808353808354e-06, |
|
"loss": 0.3481, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.5502471169686984, |
|
"grad_norm": 66.17654418945312, |
|
"learning_rate": 8.230958230958232e-06, |
|
"loss": 0.2793, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.556836902800659, |
|
"grad_norm": 156.1625518798828, |
|
"learning_rate": 8.108108108108109e-06, |
|
"loss": 0.3333, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.5634266886326196, |
|
"grad_norm": 79.08843994140625, |
|
"learning_rate": 7.985257985257986e-06, |
|
"loss": 0.3328, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.57001647446458, |
|
"grad_norm": 37.87118148803711, |
|
"learning_rate": 7.862407862407863e-06, |
|
"loss": 0.3145, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.5766062602965403, |
|
"grad_norm": 135.13316345214844, |
|
"learning_rate": 7.73955773955774e-06, |
|
"loss": 0.2399, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.583196046128501, |
|
"grad_norm": 150.52328491210938, |
|
"learning_rate": 7.616707616707617e-06, |
|
"loss": 0.2809, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.5897858319604614, |
|
"grad_norm": 76.37353515625, |
|
"learning_rate": 7.493857493857494e-06, |
|
"loss": 0.3387, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.5963756177924218, |
|
"grad_norm": 57.05943298339844, |
|
"learning_rate": 7.371007371007371e-06, |
|
"loss": 0.2754, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.602965403624382, |
|
"grad_norm": 92.42174530029297, |
|
"learning_rate": 7.2481572481572485e-06, |
|
"loss": 0.2883, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.6095551894563425, |
|
"grad_norm": 187.5482177734375, |
|
"learning_rate": 7.125307125307126e-06, |
|
"loss": 0.3647, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.6161449752883033, |
|
"grad_norm": 183.51123046875, |
|
"learning_rate": 7.002457002457002e-06, |
|
"loss": 0.294, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.6227347611202636, |
|
"grad_norm": 100.82892608642578, |
|
"learning_rate": 6.87960687960688e-06, |
|
"loss": 0.3716, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.629324546952224, |
|
"grad_norm": 328.86724853515625, |
|
"learning_rate": 6.7567567567567575e-06, |
|
"loss": 0.2346, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.6359143327841847, |
|
"grad_norm": 414.6925964355469, |
|
"learning_rate": 6.6339066339066335e-06, |
|
"loss": 0.3206, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.642504118616145, |
|
"grad_norm": 321.9985656738281, |
|
"learning_rate": 6.511056511056511e-06, |
|
"loss": 0.2855, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.6490939044481054, |
|
"grad_norm": 150.3809814453125, |
|
"learning_rate": 6.388206388206389e-06, |
|
"loss": 0.3489, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.655683690280066, |
|
"grad_norm": 139.02951049804688, |
|
"learning_rate": 6.2653562653562665e-06, |
|
"loss": 0.2419, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.662273476112026, |
|
"grad_norm": 151.592529296875, |
|
"learning_rate": 6.142506142506143e-06, |
|
"loss": 0.3109, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.668863261943987, |
|
"grad_norm": 47.01968765258789, |
|
"learning_rate": 6.019656019656019e-06, |
|
"loss": 0.2973, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.6754530477759473, |
|
"grad_norm": 300.16302490234375, |
|
"learning_rate": 5.896805896805897e-06, |
|
"loss": 0.3582, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.6820428336079076, |
|
"grad_norm": 380.8616027832031, |
|
"learning_rate": 5.773955773955774e-06, |
|
"loss": 0.2839, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.6886326194398684, |
|
"grad_norm": 119.63872528076172, |
|
"learning_rate": 5.6511056511056515e-06, |
|
"loss": 0.2836, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.6952224052718288, |
|
"grad_norm": 97.77668762207031, |
|
"learning_rate": 5.528255528255528e-06, |
|
"loss": 0.2978, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.701812191103789, |
|
"grad_norm": 32.76837921142578, |
|
"learning_rate": 5.405405405405406e-06, |
|
"loss": 0.3184, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.7084019769357495, |
|
"grad_norm": 148.2511749267578, |
|
"learning_rate": 5.282555282555283e-06, |
|
"loss": 0.3299, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.71499176276771, |
|
"grad_norm": 78.83771514892578, |
|
"learning_rate": 5.15970515970516e-06, |
|
"loss": 0.3379, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.7215815485996706, |
|
"grad_norm": 43.07817459106445, |
|
"learning_rate": 5.036855036855037e-06, |
|
"loss": 0.3325, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.728171334431631, |
|
"grad_norm": 34.94389343261719, |
|
"learning_rate": 4.914004914004914e-06, |
|
"loss": 0.2558, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.7347611202635913, |
|
"grad_norm": 241.07553100585938, |
|
"learning_rate": 4.791154791154792e-06, |
|
"loss": 0.3095, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.741350906095552, |
|
"grad_norm": 173.70211791992188, |
|
"learning_rate": 4.668304668304669e-06, |
|
"loss": 0.3556, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.7479406919275124, |
|
"grad_norm": 131.01210021972656, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 0.3794, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.754530477759473, |
|
"grad_norm": 80.76969146728516, |
|
"learning_rate": 4.422604422604422e-06, |
|
"loss": 0.2821, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.761120263591433, |
|
"grad_norm": 473.2821960449219, |
|
"learning_rate": 4.2997542997543e-06, |
|
"loss": 0.3323, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.7677100494233935, |
|
"grad_norm": 97.82337951660156, |
|
"learning_rate": 4.176904176904177e-06, |
|
"loss": 0.2755, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.7742998352553543, |
|
"grad_norm": 316.9011535644531, |
|
"learning_rate": 4.0540540540540545e-06, |
|
"loss": 0.2792, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.7808896210873146, |
|
"grad_norm": 260.79034423828125, |
|
"learning_rate": 3.931203931203931e-06, |
|
"loss": 0.3694, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.787479406919275, |
|
"grad_norm": 62.82155990600586, |
|
"learning_rate": 3.8083538083538086e-06, |
|
"loss": 0.2944, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.7940691927512358, |
|
"grad_norm": 157.30381774902344, |
|
"learning_rate": 3.6855036855036854e-06, |
|
"loss": 0.2965, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.800658978583196, |
|
"grad_norm": 170.7925567626953, |
|
"learning_rate": 3.562653562653563e-06, |
|
"loss": 0.3216, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.8072487644151565, |
|
"grad_norm": 78.40022277832031, |
|
"learning_rate": 3.43980343980344e-06, |
|
"loss": 0.3452, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.813838550247117, |
|
"grad_norm": 89.03524017333984, |
|
"learning_rate": 3.3169533169533168e-06, |
|
"loss": 0.2611, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.820428336079077, |
|
"grad_norm": 38.65609359741211, |
|
"learning_rate": 3.1941031941031944e-06, |
|
"loss": 0.3284, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.827018121911038, |
|
"grad_norm": 608.3867797851562, |
|
"learning_rate": 3.0712530712530717e-06, |
|
"loss": 0.3454, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.8336079077429983, |
|
"grad_norm": 76.90625762939453, |
|
"learning_rate": 2.9484029484029485e-06, |
|
"loss": 0.2576, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.8401976935749587, |
|
"grad_norm": 53.53964614868164, |
|
"learning_rate": 2.8255528255528258e-06, |
|
"loss": 0.3003, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.8467874794069195, |
|
"grad_norm": 70.24076843261719, |
|
"learning_rate": 2.702702702702703e-06, |
|
"loss": 0.3142, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.85337726523888, |
|
"grad_norm": 33.27001190185547, |
|
"learning_rate": 2.57985257985258e-06, |
|
"loss": 0.3152, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.85996705107084, |
|
"grad_norm": 163.56005859375, |
|
"learning_rate": 2.457002457002457e-06, |
|
"loss": 0.2981, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.8665568369028005, |
|
"grad_norm": 126.6738510131836, |
|
"learning_rate": 2.3341523341523343e-06, |
|
"loss": 0.3119, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.873146622734761, |
|
"grad_norm": 39.26594924926758, |
|
"learning_rate": 2.211302211302211e-06, |
|
"loss": 0.3173, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.8797364085667216, |
|
"grad_norm": 106.17019653320312, |
|
"learning_rate": 2.0884520884520884e-06, |
|
"loss": 0.342, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.886326194398682, |
|
"grad_norm": 119.90926361083984, |
|
"learning_rate": 1.9656019656019657e-06, |
|
"loss": 0.2775, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.892915980230643, |
|
"grad_norm": 278.5794677734375, |
|
"learning_rate": 1.8427518427518427e-06, |
|
"loss": 0.2704, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.899505766062603, |
|
"grad_norm": 95.96866607666016, |
|
"learning_rate": 1.71990171990172e-06, |
|
"loss": 0.2633, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.9060955518945635, |
|
"grad_norm": 94.9200210571289, |
|
"learning_rate": 1.5970515970515972e-06, |
|
"loss": 0.2662, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.912685337726524, |
|
"grad_norm": 331.02508544921875, |
|
"learning_rate": 1.4742014742014743e-06, |
|
"loss": 0.2685, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.919275123558484, |
|
"grad_norm": 474.9095764160156, |
|
"learning_rate": 1.3513513513513515e-06, |
|
"loss": 0.2664, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.925864909390445, |
|
"grad_norm": 146.21035766601562, |
|
"learning_rate": 1.2285012285012285e-06, |
|
"loss": 0.2524, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.9324546952224053, |
|
"grad_norm": 67.15187072753906, |
|
"learning_rate": 1.1056511056511056e-06, |
|
"loss": 0.2769, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.9390444810543657, |
|
"grad_norm": 68.6742172241211, |
|
"learning_rate": 9.828009828009828e-07, |
|
"loss": 0.2768, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.9456342668863265, |
|
"grad_norm": 82.91572570800781, |
|
"learning_rate": 8.5995085995086e-07, |
|
"loss": 0.2923, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.952224052718287, |
|
"grad_norm": 236.94644165039062, |
|
"learning_rate": 7.371007371007371e-07, |
|
"loss": 0.3512, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.958813838550247, |
|
"grad_norm": 84.61334991455078, |
|
"learning_rate": 6.142506142506143e-07, |
|
"loss": 0.254, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.9654036243822075, |
|
"grad_norm": 160.33612060546875, |
|
"learning_rate": 4.914004914004914e-07, |
|
"loss": 0.2808, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.971993410214168, |
|
"grad_norm": 96.69217681884766, |
|
"learning_rate": 3.6855036855036856e-07, |
|
"loss": 0.2926, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.9785831960461286, |
|
"grad_norm": 315.9173278808594, |
|
"learning_rate": 2.457002457002457e-07, |
|
"loss": 0.2809, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.985172981878089, |
|
"grad_norm": 46.96442413330078, |
|
"learning_rate": 1.2285012285012285e-07, |
|
"loss": 0.3087, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.985172981878089, |
|
"step": 453, |
|
"total_flos": 1.0318088260361912e+18, |
|
"train_loss": 0.5700862745509768, |
|
"train_runtime": 62496.1524, |
|
"train_samples_per_second": 0.35, |
|
"train_steps_per_second": 0.007 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 453, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0318088260361912e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|