diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7138 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.9815950920245395, + "eval_steps": 500, + "global_step": 1015, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0049079754601227, + "grad_norm": 2.422234535217285, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.5835, + "step": 1 + }, + { + "epoch": 0.0098159509202454, + "grad_norm": 0.8900374174118042, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.2599, + "step": 2 + }, + { + "epoch": 0.014723926380368098, + "grad_norm": 0.9694510102272034, + "learning_rate": 4.2e-06, + "loss": 0.1512, + "step": 3 + }, + { + "epoch": 0.0196319018404908, + "grad_norm": 2.383556604385376, + "learning_rate": 5.600000000000001e-06, + "loss": 0.5828, + "step": 4 + }, + { + "epoch": 0.024539877300613498, + "grad_norm": 0.5773197412490845, + "learning_rate": 7e-06, + "loss": 0.156, + "step": 5 + }, + { + "epoch": 0.029447852760736196, + "grad_norm": 0.8293938636779785, + "learning_rate": 6.993069306930693e-06, + "loss": 0.1723, + "step": 6 + }, + { + "epoch": 0.0343558282208589, + "grad_norm": 0.02590053714811802, + "learning_rate": 6.986138613861386e-06, + "loss": 0.0011, + "step": 7 + }, + { + "epoch": 0.0392638036809816, + "grad_norm": 0.627031683921814, + "learning_rate": 6.979207920792079e-06, + "loss": 0.2592, + "step": 8 + }, + { + "epoch": 0.044171779141104296, + "grad_norm": 6.028290271759033, + "learning_rate": 6.972277227722772e-06, + "loss": 0.7911, + "step": 9 + }, + { + "epoch": 0.049079754601226995, + "grad_norm": 1.6350908279418945, + "learning_rate": 6.965346534653465e-06, + "loss": 0.2866, + "step": 10 + }, + { + "epoch": 0.053987730061349694, + "grad_norm": 1.3962496519088745, + "learning_rate": 6.958415841584158e-06, + "loss": 0.2447, + "step": 11 + }, + { + "epoch": 0.05889570552147239, + "grad_norm": 0.385162889957428, + "learning_rate": 6.951485148514851e-06, + "loss": 0.1153, + "step": 12 + }, + { + "epoch": 0.0638036809815951, + "grad_norm": 0.6873381733894348, + "learning_rate": 6.9445544554455444e-06, + "loss": 0.6868, + "step": 13 + }, + { + "epoch": 0.0687116564417178, + "grad_norm": 0.3449646234512329, + "learning_rate": 6.9376237623762375e-06, + "loss": 0.1523, + "step": 14 + }, + { + "epoch": 0.0736196319018405, + "grad_norm": 2.022113084793091, + "learning_rate": 6.930693069306931e-06, + "loss": 0.3527, + "step": 15 + }, + { + "epoch": 0.0785276073619632, + "grad_norm": 0.661878764629364, + "learning_rate": 6.923762376237624e-06, + "loss": 0.2449, + "step": 16 + }, + { + "epoch": 0.0834355828220859, + "grad_norm": 0.7884008884429932, + "learning_rate": 6.916831683168317e-06, + "loss": 0.1496, + "step": 17 + }, + { + "epoch": 0.08834355828220859, + "grad_norm": 0.8798872828483582, + "learning_rate": 6.90990099009901e-06, + "loss": 0.2151, + "step": 18 + }, + { + "epoch": 0.09325153374233129, + "grad_norm": 1.3640304803848267, + "learning_rate": 6.902970297029703e-06, + "loss": 0.356, + "step": 19 + }, + { + "epoch": 0.09815950920245399, + "grad_norm": 1.1427268981933594, + "learning_rate": 6.896039603960396e-06, + "loss": 0.2645, + "step": 20 + }, + { + "epoch": 0.10306748466257669, + "grad_norm": 0.9606547951698303, + "learning_rate": 6.889108910891089e-06, + "loss": 0.3541, + "step": 21 + }, + { + "epoch": 0.10797546012269939, + "grad_norm": 0.23967993259429932, + "learning_rate": 6.882178217821782e-06, + "loss": 0.0407, + "step": 22 + }, + { + "epoch": 0.11288343558282209, + "grad_norm": 5.597607612609863, + "learning_rate": 6.875247524752475e-06, + "loss": 1.0892, + "step": 23 + }, + { + "epoch": 0.11779141104294479, + "grad_norm": 0.6763832569122314, + "learning_rate": 6.868316831683168e-06, + "loss": 0.4406, + "step": 24 + }, + { + "epoch": 0.12269938650306748, + "grad_norm": 5.346296310424805, + "learning_rate": 6.861386138613861e-06, + "loss": 1.2512, + "step": 25 + }, + { + "epoch": 0.1276073619631902, + "grad_norm": 0.9197913408279419, + "learning_rate": 6.854455445544554e-06, + "loss": 0.1855, + "step": 26 + }, + { + "epoch": 0.1325153374233129, + "grad_norm": 0.6145322918891907, + "learning_rate": 6.847524752475247e-06, + "loss": 0.0562, + "step": 27 + }, + { + "epoch": 0.1374233128834356, + "grad_norm": 1.5995681285858154, + "learning_rate": 6.8405940594059405e-06, + "loss": 0.405, + "step": 28 + }, + { + "epoch": 0.1423312883435583, + "grad_norm": 0.6084582805633545, + "learning_rate": 6.8336633663366335e-06, + "loss": 0.0953, + "step": 29 + }, + { + "epoch": 0.147239263803681, + "grad_norm": 0.09425808489322662, + "learning_rate": 6.826732673267327e-06, + "loss": 0.0128, + "step": 30 + }, + { + "epoch": 0.1521472392638037, + "grad_norm": 2.628293991088867, + "learning_rate": 6.81980198019802e-06, + "loss": 0.4032, + "step": 31 + }, + { + "epoch": 0.1570552147239264, + "grad_norm": 0.4179680347442627, + "learning_rate": 6.812871287128713e-06, + "loss": 0.0317, + "step": 32 + }, + { + "epoch": 0.1619631901840491, + "grad_norm": 1.106016755104065, + "learning_rate": 6.805940594059406e-06, + "loss": 0.1495, + "step": 33 + }, + { + "epoch": 0.1668711656441718, + "grad_norm": 0.3168316185474396, + "learning_rate": 6.799009900990099e-06, + "loss": 0.1907, + "step": 34 + }, + { + "epoch": 0.17177914110429449, + "grad_norm": 0.4211124777793884, + "learning_rate": 6.792079207920792e-06, + "loss": 0.0519, + "step": 35 + }, + { + "epoch": 0.17668711656441718, + "grad_norm": 1.7166955471038818, + "learning_rate": 6.785148514851485e-06, + "loss": 0.1693, + "step": 36 + }, + { + "epoch": 0.18159509202453988, + "grad_norm": 1.6126145124435425, + "learning_rate": 6.778217821782178e-06, + "loss": 0.5255, + "step": 37 + }, + { + "epoch": 0.18650306748466258, + "grad_norm": 0.6241395473480225, + "learning_rate": 6.771287128712871e-06, + "loss": 0.2787, + "step": 38 + }, + { + "epoch": 0.19141104294478528, + "grad_norm": 0.8319867253303528, + "learning_rate": 6.764356435643564e-06, + "loss": 0.0874, + "step": 39 + }, + { + "epoch": 0.19631901840490798, + "grad_norm": 2.6518406867980957, + "learning_rate": 6.757425742574257e-06, + "loss": 0.1322, + "step": 40 + }, + { + "epoch": 0.20122699386503068, + "grad_norm": 0.830451488494873, + "learning_rate": 6.75049504950495e-06, + "loss": 0.0785, + "step": 41 + }, + { + "epoch": 0.20613496932515338, + "grad_norm": 0.9671738147735596, + "learning_rate": 6.7435643564356434e-06, + "loss": 0.1281, + "step": 42 + }, + { + "epoch": 0.21104294478527608, + "grad_norm": 0.2914385497570038, + "learning_rate": 6.7366336633663365e-06, + "loss": 0.0565, + "step": 43 + }, + { + "epoch": 0.21595092024539878, + "grad_norm": 1.4892910718917847, + "learning_rate": 6.7297029702970296e-06, + "loss": 0.3326, + "step": 44 + }, + { + "epoch": 0.22085889570552147, + "grad_norm": 0.3593141734600067, + "learning_rate": 6.722772277227723e-06, + "loss": 0.0499, + "step": 45 + }, + { + "epoch": 0.22576687116564417, + "grad_norm": 0.8170046210289001, + "learning_rate": 6.715841584158416e-06, + "loss": 0.028, + "step": 46 + }, + { + "epoch": 0.23067484662576687, + "grad_norm": 1.7005876302719116, + "learning_rate": 6.708910891089109e-06, + "loss": 0.1811, + "step": 47 + }, + { + "epoch": 0.23558282208588957, + "grad_norm": 3.5889787673950195, + "learning_rate": 6.701980198019802e-06, + "loss": 0.5892, + "step": 48 + }, + { + "epoch": 0.24049079754601227, + "grad_norm": 0.26327264308929443, + "learning_rate": 6.695049504950495e-06, + "loss": 0.0134, + "step": 49 + }, + { + "epoch": 0.24539877300613497, + "grad_norm": 3.6891543865203857, + "learning_rate": 6.688118811881188e-06, + "loss": 0.2651, + "step": 50 + }, + { + "epoch": 0.25030674846625767, + "grad_norm": 0.229234978556633, + "learning_rate": 6.681188118811881e-06, + "loss": 0.0307, + "step": 51 + }, + { + "epoch": 0.2552147239263804, + "grad_norm": 0.4476153552532196, + "learning_rate": 6.674257425742574e-06, + "loss": 0.0974, + "step": 52 + }, + { + "epoch": 0.26012269938650306, + "grad_norm": 0.8651221394538879, + "learning_rate": 6.667326732673267e-06, + "loss": 0.136, + "step": 53 + }, + { + "epoch": 0.2650306748466258, + "grad_norm": 0.8453481197357178, + "learning_rate": 6.66039603960396e-06, + "loss": 0.0714, + "step": 54 + }, + { + "epoch": 0.26993865030674846, + "grad_norm": 0.8162591457366943, + "learning_rate": 6.653465346534653e-06, + "loss": 0.0607, + "step": 55 + }, + { + "epoch": 0.2748466257668712, + "grad_norm": 1.0115143060684204, + "learning_rate": 6.646534653465346e-06, + "loss": 0.0377, + "step": 56 + }, + { + "epoch": 0.27975460122699386, + "grad_norm": 0.18834809958934784, + "learning_rate": 6.6396039603960395e-06, + "loss": 0.0062, + "step": 57 + }, + { + "epoch": 0.2846625766871166, + "grad_norm": 0.06441894173622131, + "learning_rate": 6.6326732673267325e-06, + "loss": 0.0094, + "step": 58 + }, + { + "epoch": 0.28957055214723926, + "grad_norm": 1.1535841226577759, + "learning_rate": 6.625742574257426e-06, + "loss": 0.0234, + "step": 59 + }, + { + "epoch": 0.294478527607362, + "grad_norm": 0.1546783298254013, + "learning_rate": 6.618811881188119e-06, + "loss": 0.0192, + "step": 60 + }, + { + "epoch": 0.29938650306748466, + "grad_norm": 6.198948383331299, + "learning_rate": 6.611881188118812e-06, + "loss": 0.0637, + "step": 61 + }, + { + "epoch": 0.3042944785276074, + "grad_norm": 1.3115315437316895, + "learning_rate": 6.604950495049505e-06, + "loss": 0.074, + "step": 62 + }, + { + "epoch": 0.30920245398773005, + "grad_norm": 0.5492228269577026, + "learning_rate": 6.598019801980198e-06, + "loss": 0.0291, + "step": 63 + }, + { + "epoch": 0.3141104294478528, + "grad_norm": 0.46307000517845154, + "learning_rate": 6.591089108910891e-06, + "loss": 0.1191, + "step": 64 + }, + { + "epoch": 0.31901840490797545, + "grad_norm": 0.16621133685112, + "learning_rate": 6.584158415841584e-06, + "loss": 0.0317, + "step": 65 + }, + { + "epoch": 0.3239263803680982, + "grad_norm": 0.56168532371521, + "learning_rate": 6.577227722772277e-06, + "loss": 0.0797, + "step": 66 + }, + { + "epoch": 0.32883435582822085, + "grad_norm": 0.21208958327770233, + "learning_rate": 6.57029702970297e-06, + "loss": 0.0039, + "step": 67 + }, + { + "epoch": 0.3337423312883436, + "grad_norm": 0.5419512391090393, + "learning_rate": 6.563366336633663e-06, + "loss": 0.0269, + "step": 68 + }, + { + "epoch": 0.33865030674846625, + "grad_norm": 0.278899222612381, + "learning_rate": 6.556435643564357e-06, + "loss": 0.0593, + "step": 69 + }, + { + "epoch": 0.34355828220858897, + "grad_norm": 0.6312870979309082, + "learning_rate": 6.54950495049505e-06, + "loss": 0.0751, + "step": 70 + }, + { + "epoch": 0.34846625766871164, + "grad_norm": 0.19221121072769165, + "learning_rate": 6.542574257425743e-06, + "loss": 0.0037, + "step": 71 + }, + { + "epoch": 0.35337423312883437, + "grad_norm": 0.29226556420326233, + "learning_rate": 6.5356435643564355e-06, + "loss": 0.0417, + "step": 72 + }, + { + "epoch": 0.35828220858895704, + "grad_norm": 0.32889777421951294, + "learning_rate": 6.5287128712871286e-06, + "loss": 0.0767, + "step": 73 + }, + { + "epoch": 0.36319018404907977, + "grad_norm": 0.665745735168457, + "learning_rate": 6.521782178217822e-06, + "loss": 0.1935, + "step": 74 + }, + { + "epoch": 0.36809815950920244, + "grad_norm": 0.10821861028671265, + "learning_rate": 6.514851485148515e-06, + "loss": 0.0087, + "step": 75 + }, + { + "epoch": 0.37300613496932516, + "grad_norm": 1.0068309307098389, + "learning_rate": 6.507920792079208e-06, + "loss": 0.0232, + "step": 76 + }, + { + "epoch": 0.37791411042944784, + "grad_norm": 0.03962863236665726, + "learning_rate": 6.500990099009901e-06, + "loss": 0.0051, + "step": 77 + }, + { + "epoch": 0.38282208588957056, + "grad_norm": 9.40488052368164, + "learning_rate": 6.494059405940594e-06, + "loss": 0.2292, + "step": 78 + }, + { + "epoch": 0.38773006134969323, + "grad_norm": 0.49305009841918945, + "learning_rate": 6.487128712871287e-06, + "loss": 0.0924, + "step": 79 + }, + { + "epoch": 0.39263803680981596, + "grad_norm": 0.72562575340271, + "learning_rate": 6.48019801980198e-06, + "loss": 0.0549, + "step": 80 + }, + { + "epoch": 0.39754601226993863, + "grad_norm": 0.015473410487174988, + "learning_rate": 6.473267326732673e-06, + "loss": 0.0017, + "step": 81 + }, + { + "epoch": 0.40245398773006136, + "grad_norm": 0.13165496289730072, + "learning_rate": 6.466336633663366e-06, + "loss": 0.0068, + "step": 82 + }, + { + "epoch": 0.40736196319018403, + "grad_norm": 0.42630520462989807, + "learning_rate": 6.459405940594059e-06, + "loss": 0.0927, + "step": 83 + }, + { + "epoch": 0.41226993865030676, + "grad_norm": 0.059238456189632416, + "learning_rate": 6.452475247524752e-06, + "loss": 0.0036, + "step": 84 + }, + { + "epoch": 0.4171779141104294, + "grad_norm": 0.8484250903129578, + "learning_rate": 6.445544554455445e-06, + "loss": 0.2075, + "step": 85 + }, + { + "epoch": 0.42208588957055215, + "grad_norm": 0.3800269067287445, + "learning_rate": 6.4386138613861384e-06, + "loss": 0.1287, + "step": 86 + }, + { + "epoch": 0.4269938650306748, + "grad_norm": 0.002063202438876033, + "learning_rate": 6.4316831683168315e-06, + "loss": 0.0008, + "step": 87 + }, + { + "epoch": 0.43190184049079755, + "grad_norm": 0.05038560926914215, + "learning_rate": 6.424752475247525e-06, + "loss": 0.0017, + "step": 88 + }, + { + "epoch": 0.4368098159509202, + "grad_norm": 0.0683569684624672, + "learning_rate": 6.417821782178218e-06, + "loss": 0.0035, + "step": 89 + }, + { + "epoch": 0.44171779141104295, + "grad_norm": 0.028999239206314087, + "learning_rate": 6.410891089108911e-06, + "loss": 0.002, + "step": 90 + }, + { + "epoch": 0.4466257668711656, + "grad_norm": 0.1129206195473671, + "learning_rate": 6.403960396039604e-06, + "loss": 0.0045, + "step": 91 + }, + { + "epoch": 0.45153374233128835, + "grad_norm": 0.012081787921488285, + "learning_rate": 6.397029702970297e-06, + "loss": 0.0016, + "step": 92 + }, + { + "epoch": 0.456441717791411, + "grad_norm": 0.29454007744789124, + "learning_rate": 6.39009900990099e-06, + "loss": 0.0047, + "step": 93 + }, + { + "epoch": 0.46134969325153374, + "grad_norm": 0.342790812253952, + "learning_rate": 6.383168316831683e-06, + "loss": 0.0182, + "step": 94 + }, + { + "epoch": 0.4662576687116564, + "grad_norm": 0.3073101341724396, + "learning_rate": 6.376237623762376e-06, + "loss": 0.1144, + "step": 95 + }, + { + "epoch": 0.47116564417177914, + "grad_norm": 0.028703156858682632, + "learning_rate": 6.369306930693069e-06, + "loss": 0.0017, + "step": 96 + }, + { + "epoch": 0.47607361963190187, + "grad_norm": 0.2344651073217392, + "learning_rate": 6.362376237623762e-06, + "loss": 0.1945, + "step": 97 + }, + { + "epoch": 0.48098159509202454, + "grad_norm": 0.4256807565689087, + "learning_rate": 6.355445544554455e-06, + "loss": 0.0432, + "step": 98 + }, + { + "epoch": 0.48588957055214727, + "grad_norm": 0.690371572971344, + "learning_rate": 6.348514851485148e-06, + "loss": 0.1223, + "step": 99 + }, + { + "epoch": 0.49079754601226994, + "grad_norm": 0.006785046309232712, + "learning_rate": 6.341584158415841e-06, + "loss": 0.0009, + "step": 100 + }, + { + "epoch": 0.49570552147239266, + "grad_norm": 0.3789716362953186, + "learning_rate": 6.3346534653465345e-06, + "loss": 0.0753, + "step": 101 + }, + { + "epoch": 0.5006134969325153, + "grad_norm": 0.2657737731933594, + "learning_rate": 6.3277227722772275e-06, + "loss": 0.0068, + "step": 102 + }, + { + "epoch": 0.505521472392638, + "grad_norm": 0.0025652130134403706, + "learning_rate": 6.320792079207921e-06, + "loss": 0.0005, + "step": 103 + }, + { + "epoch": 0.5104294478527608, + "grad_norm": 0.026512114331126213, + "learning_rate": 6.313861386138614e-06, + "loss": 0.0012, + "step": 104 + }, + { + "epoch": 0.5153374233128835, + "grad_norm": 0.020907960832118988, + "learning_rate": 6.306930693069307e-06, + "loss": 0.002, + "step": 105 + }, + { + "epoch": 0.5202453987730061, + "grad_norm": 0.024951621890068054, + "learning_rate": 6.3e-06, + "loss": 0.0019, + "step": 106 + }, + { + "epoch": 0.5251533742331288, + "grad_norm": 0.03137391433119774, + "learning_rate": 6.293069306930693e-06, + "loss": 0.0022, + "step": 107 + }, + { + "epoch": 0.5300613496932516, + "grad_norm": 0.01158374547958374, + "learning_rate": 6.286138613861386e-06, + "loss": 0.002, + "step": 108 + }, + { + "epoch": 0.5349693251533743, + "grad_norm": 0.20386900007724762, + "learning_rate": 6.279207920792079e-06, + "loss": 0.0438, + "step": 109 + }, + { + "epoch": 0.5398773006134969, + "grad_norm": 0.21870765089988708, + "learning_rate": 6.272277227722772e-06, + "loss": 0.0147, + "step": 110 + }, + { + "epoch": 0.5447852760736196, + "grad_norm": 0.01832536980509758, + "learning_rate": 6.265346534653465e-06, + "loss": 0.0013, + "step": 111 + }, + { + "epoch": 0.5496932515337424, + "grad_norm": 1.1292037963867188, + "learning_rate": 6.258415841584158e-06, + "loss": 0.0233, + "step": 112 + }, + { + "epoch": 0.554601226993865, + "grad_norm": 0.23187655210494995, + "learning_rate": 6.251485148514851e-06, + "loss": 0.0964, + "step": 113 + }, + { + "epoch": 0.5595092024539877, + "grad_norm": 0.05976148694753647, + "learning_rate": 6.244554455445544e-06, + "loss": 0.0026, + "step": 114 + }, + { + "epoch": 0.5644171779141104, + "grad_norm": 0.2549298107624054, + "learning_rate": 6.2376237623762374e-06, + "loss": 0.0204, + "step": 115 + }, + { + "epoch": 0.5693251533742332, + "grad_norm": 0.5240129232406616, + "learning_rate": 6.2306930693069305e-06, + "loss": 0.0091, + "step": 116 + }, + { + "epoch": 0.5742331288343558, + "grad_norm": 0.008397325873374939, + "learning_rate": 6.2237623762376236e-06, + "loss": 0.0031, + "step": 117 + }, + { + "epoch": 0.5791411042944785, + "grad_norm": 0.17320525646209717, + "learning_rate": 6.216831683168317e-06, + "loss": 0.0696, + "step": 118 + }, + { + "epoch": 0.5840490797546012, + "grad_norm": 0.8707240223884583, + "learning_rate": 6.20990099009901e-06, + "loss": 0.0563, + "step": 119 + }, + { + "epoch": 0.588957055214724, + "grad_norm": 0.03828573226928711, + "learning_rate": 6.202970297029703e-06, + "loss": 0.0025, + "step": 120 + }, + { + "epoch": 0.5938650306748466, + "grad_norm": 0.39433979988098145, + "learning_rate": 6.196039603960396e-06, + "loss": 0.0286, + "step": 121 + }, + { + "epoch": 0.5987730061349693, + "grad_norm": 0.24125856161117554, + "learning_rate": 6.189108910891089e-06, + "loss": 0.0474, + "step": 122 + }, + { + "epoch": 0.603680981595092, + "grad_norm": 0.3200415074825287, + "learning_rate": 6.182178217821782e-06, + "loss": 0.0138, + "step": 123 + }, + { + "epoch": 0.6085889570552148, + "grad_norm": 0.016078324988484383, + "learning_rate": 6.175247524752475e-06, + "loss": 0.0015, + "step": 124 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.023426106199622154, + "learning_rate": 6.168316831683168e-06, + "loss": 0.0025, + "step": 125 + }, + { + "epoch": 0.6184049079754601, + "grad_norm": 0.255380779504776, + "learning_rate": 6.161386138613861e-06, + "loss": 0.0091, + "step": 126 + }, + { + "epoch": 0.6233128834355828, + "grad_norm": 0.02590188756585121, + "learning_rate": 6.154455445544554e-06, + "loss": 0.0025, + "step": 127 + }, + { + "epoch": 0.6282208588957056, + "grad_norm": 1.4449337720870972, + "learning_rate": 6.147524752475247e-06, + "loss": 0.1513, + "step": 128 + }, + { + "epoch": 0.6331288343558282, + "grad_norm": 0.27828818559646606, + "learning_rate": 6.14059405940594e-06, + "loss": 0.0542, + "step": 129 + }, + { + "epoch": 0.6380368098159509, + "grad_norm": 0.0807948037981987, + "learning_rate": 6.1336633663366335e-06, + "loss": 0.0033, + "step": 130 + }, + { + "epoch": 0.6429447852760736, + "grad_norm": 0.5472558736801147, + "learning_rate": 6.1267326732673265e-06, + "loss": 0.1161, + "step": 131 + }, + { + "epoch": 0.6478527607361964, + "grad_norm": 0.18089702725410461, + "learning_rate": 6.11980198019802e-06, + "loss": 0.0464, + "step": 132 + }, + { + "epoch": 0.652760736196319, + "grad_norm": 0.025230491533875465, + "learning_rate": 6.112871287128713e-06, + "loss": 0.0016, + "step": 133 + }, + { + "epoch": 0.6576687116564417, + "grad_norm": 0.2929389178752899, + "learning_rate": 6.105940594059406e-06, + "loss": 0.0404, + "step": 134 + }, + { + "epoch": 0.6625766871165644, + "grad_norm": 0.2341107577085495, + "learning_rate": 6.099009900990099e-06, + "loss": 0.0401, + "step": 135 + }, + { + "epoch": 0.6674846625766871, + "grad_norm": 0.11038243025541306, + "learning_rate": 6.092079207920792e-06, + "loss": 0.0045, + "step": 136 + }, + { + "epoch": 0.6723926380368098, + "grad_norm": 0.19471372663974762, + "learning_rate": 6.085148514851485e-06, + "loss": 0.0296, + "step": 137 + }, + { + "epoch": 0.6773006134969325, + "grad_norm": 0.3493005037307739, + "learning_rate": 6.078217821782178e-06, + "loss": 0.0435, + "step": 138 + }, + { + "epoch": 0.6822085889570552, + "grad_norm": 0.2767382860183716, + "learning_rate": 6.071287128712871e-06, + "loss": 0.0392, + "step": 139 + }, + { + "epoch": 0.6871165644171779, + "grad_norm": 0.2326585054397583, + "learning_rate": 6.064356435643564e-06, + "loss": 0.0372, + "step": 140 + }, + { + "epoch": 0.6920245398773006, + "grad_norm": 0.023760871961712837, + "learning_rate": 6.057425742574257e-06, + "loss": 0.0017, + "step": 141 + }, + { + "epoch": 0.6969325153374233, + "grad_norm": 0.02614918164908886, + "learning_rate": 6.05049504950495e-06, + "loss": 0.0019, + "step": 142 + }, + { + "epoch": 0.701840490797546, + "grad_norm": 0.21519997715950012, + "learning_rate": 6.043564356435643e-06, + "loss": 0.0249, + "step": 143 + }, + { + "epoch": 0.7067484662576687, + "grad_norm": 0.04296768456697464, + "learning_rate": 6.0366336633663364e-06, + "loss": 0.002, + "step": 144 + }, + { + "epoch": 0.7116564417177914, + "grad_norm": 0.0280557032674551, + "learning_rate": 6.0297029702970295e-06, + "loss": 0.0012, + "step": 145 + }, + { + "epoch": 0.7165644171779141, + "grad_norm": 0.016091475263237953, + "learning_rate": 6.0227722772277226e-06, + "loss": 0.001, + "step": 146 + }, + { + "epoch": 0.7214723926380369, + "grad_norm": 0.4104843735694885, + "learning_rate": 6.015841584158416e-06, + "loss": 0.0495, + "step": 147 + }, + { + "epoch": 0.7263803680981595, + "grad_norm": 0.28999537229537964, + "learning_rate": 6.008910891089109e-06, + "loss": 0.0377, + "step": 148 + }, + { + "epoch": 0.7312883435582822, + "grad_norm": 0.005074529442936182, + "learning_rate": 6.001980198019802e-06, + "loss": 0.0006, + "step": 149 + }, + { + "epoch": 0.7361963190184049, + "grad_norm": 0.031069966033101082, + "learning_rate": 5.995049504950495e-06, + "loss": 0.0013, + "step": 150 + }, + { + "epoch": 0.7411042944785277, + "grad_norm": 0.0037307555321604013, + "learning_rate": 5.988118811881188e-06, + "loss": 0.0004, + "step": 151 + }, + { + "epoch": 0.7460122699386503, + "grad_norm": 0.05543583631515503, + "learning_rate": 5.981188118811881e-06, + "loss": 0.0056, + "step": 152 + }, + { + "epoch": 0.750920245398773, + "grad_norm": 0.15376004576683044, + "learning_rate": 5.974257425742574e-06, + "loss": 0.0072, + "step": 153 + }, + { + "epoch": 0.7558282208588957, + "grad_norm": 0.17796596884727478, + "learning_rate": 5.967326732673267e-06, + "loss": 0.0329, + "step": 154 + }, + { + "epoch": 0.7607361963190185, + "grad_norm": 0.15298157930374146, + "learning_rate": 5.96039603960396e-06, + "loss": 0.014, + "step": 155 + }, + { + "epoch": 0.7656441717791411, + "grad_norm": 0.017613040283322334, + "learning_rate": 5.953465346534653e-06, + "loss": 0.0016, + "step": 156 + }, + { + "epoch": 0.7705521472392638, + "grad_norm": 0.33242282271385193, + "learning_rate": 5.946534653465346e-06, + "loss": 0.0333, + "step": 157 + }, + { + "epoch": 0.7754601226993865, + "grad_norm": 0.010239041410386562, + "learning_rate": 5.939603960396039e-06, + "loss": 0.0005, + "step": 158 + }, + { + "epoch": 0.7803680981595092, + "grad_norm": 0.01592794805765152, + "learning_rate": 5.9326732673267325e-06, + "loss": 0.0025, + "step": 159 + }, + { + "epoch": 0.7852760736196319, + "grad_norm": 0.3407064974308014, + "learning_rate": 5.9257425742574255e-06, + "loss": 0.0106, + "step": 160 + }, + { + "epoch": 0.7901840490797546, + "grad_norm": 0.11607719957828522, + "learning_rate": 5.918811881188119e-06, + "loss": 0.0197, + "step": 161 + }, + { + "epoch": 0.7950920245398773, + "grad_norm": 0.2373722642660141, + "learning_rate": 5.911881188118812e-06, + "loss": 0.009, + "step": 162 + }, + { + "epoch": 0.8, + "grad_norm": 0.013574715703725815, + "learning_rate": 5.904950495049505e-06, + "loss": 0.0012, + "step": 163 + }, + { + "epoch": 0.8049079754601227, + "grad_norm": 0.1712418496608734, + "learning_rate": 5.898019801980198e-06, + "loss": 0.0033, + "step": 164 + }, + { + "epoch": 0.8098159509202454, + "grad_norm": 0.06470532715320587, + "learning_rate": 5.891089108910891e-06, + "loss": 0.0022, + "step": 165 + }, + { + "epoch": 0.8147239263803681, + "grad_norm": 0.2027478665113449, + "learning_rate": 5.884158415841584e-06, + "loss": 0.0075, + "step": 166 + }, + { + "epoch": 0.8196319018404908, + "grad_norm": 0.018224777653813362, + "learning_rate": 5.877227722772277e-06, + "loss": 0.0022, + "step": 167 + }, + { + "epoch": 0.8245398773006135, + "grad_norm": 0.2861013114452362, + "learning_rate": 5.87029702970297e-06, + "loss": 0.0213, + "step": 168 + }, + { + "epoch": 0.8294478527607362, + "grad_norm": 0.1993497610092163, + "learning_rate": 5.863366336633663e-06, + "loss": 0.0047, + "step": 169 + }, + { + "epoch": 0.8343558282208589, + "grad_norm": 0.010481802746653557, + "learning_rate": 5.856435643564356e-06, + "loss": 0.001, + "step": 170 + }, + { + "epoch": 0.8392638036809816, + "grad_norm": 0.018094172701239586, + "learning_rate": 5.849504950495049e-06, + "loss": 0.0018, + "step": 171 + }, + { + "epoch": 0.8441717791411043, + "grad_norm": 0.06920409202575684, + "learning_rate": 5.842574257425742e-06, + "loss": 0.0042, + "step": 172 + }, + { + "epoch": 0.849079754601227, + "grad_norm": 0.18376444280147552, + "learning_rate": 5.835643564356435e-06, + "loss": 0.0066, + "step": 173 + }, + { + "epoch": 0.8539877300613496, + "grad_norm": 0.1790069192647934, + "learning_rate": 5.8287128712871285e-06, + "loss": 0.0147, + "step": 174 + }, + { + "epoch": 0.8588957055214724, + "grad_norm": 0.018982654437422752, + "learning_rate": 5.8217821782178216e-06, + "loss": 0.003, + "step": 175 + }, + { + "epoch": 0.8638036809815951, + "grad_norm": 0.13061672449111938, + "learning_rate": 5.814851485148515e-06, + "loss": 0.0016, + "step": 176 + }, + { + "epoch": 0.8687116564417178, + "grad_norm": 0.0860314592719078, + "learning_rate": 5.807920792079208e-06, + "loss": 0.0047, + "step": 177 + }, + { + "epoch": 0.8736196319018404, + "grad_norm": 0.05230861157178879, + "learning_rate": 5.800990099009901e-06, + "loss": 0.0076, + "step": 178 + }, + { + "epoch": 0.8785276073619632, + "grad_norm": 0.3662849962711334, + "learning_rate": 5.794059405940594e-06, + "loss": 0.026, + "step": 179 + }, + { + "epoch": 0.8834355828220859, + "grad_norm": 0.2983264923095703, + "learning_rate": 5.787128712871287e-06, + "loss": 0.0203, + "step": 180 + }, + { + "epoch": 0.8883435582822086, + "grad_norm": 0.0335027277469635, + "learning_rate": 5.78019801980198e-06, + "loss": 0.0022, + "step": 181 + }, + { + "epoch": 0.8932515337423312, + "grad_norm": 0.4801461100578308, + "learning_rate": 5.773267326732673e-06, + "loss": 0.0082, + "step": 182 + }, + { + "epoch": 0.898159509202454, + "grad_norm": 0.014546169899404049, + "learning_rate": 5.766336633663366e-06, + "loss": 0.0031, + "step": 183 + }, + { + "epoch": 0.9030674846625767, + "grad_norm": 0.05124737694859505, + "learning_rate": 5.759405940594059e-06, + "loss": 0.002, + "step": 184 + }, + { + "epoch": 0.9079754601226994, + "grad_norm": 0.01893136277794838, + "learning_rate": 5.752475247524752e-06, + "loss": 0.0013, + "step": 185 + }, + { + "epoch": 0.912883435582822, + "grad_norm": 0.15197233855724335, + "learning_rate": 5.745544554455445e-06, + "loss": 0.0026, + "step": 186 + }, + { + "epoch": 0.9177914110429448, + "grad_norm": 0.8003377318382263, + "learning_rate": 5.738613861386138e-06, + "loss": 0.0146, + "step": 187 + }, + { + "epoch": 0.9226993865030675, + "grad_norm": 0.023129871115088463, + "learning_rate": 5.7316831683168314e-06, + "loss": 0.0035, + "step": 188 + }, + { + "epoch": 0.9276073619631902, + "grad_norm": 0.5904732346534729, + "learning_rate": 5.7247524752475245e-06, + "loss": 0.0223, + "step": 189 + }, + { + "epoch": 0.9325153374233128, + "grad_norm": 0.02393723465502262, + "learning_rate": 5.717821782178218e-06, + "loss": 0.0017, + "step": 190 + }, + { + "epoch": 0.9374233128834356, + "grad_norm": 0.2537831664085388, + "learning_rate": 5.710891089108911e-06, + "loss": 0.0113, + "step": 191 + }, + { + "epoch": 0.9423312883435583, + "grad_norm": 0.005700583104044199, + "learning_rate": 5.703960396039604e-06, + "loss": 0.0007, + "step": 192 + }, + { + "epoch": 0.947239263803681, + "grad_norm": 0.020626569166779518, + "learning_rate": 5.697029702970297e-06, + "loss": 0.002, + "step": 193 + }, + { + "epoch": 0.9521472392638037, + "grad_norm": 0.009778481908142567, + "learning_rate": 5.69009900990099e-06, + "loss": 0.0011, + "step": 194 + }, + { + "epoch": 0.9570552147239264, + "grad_norm": 0.020891468971967697, + "learning_rate": 5.683168316831684e-06, + "loss": 0.0017, + "step": 195 + }, + { + "epoch": 0.9619631901840491, + "grad_norm": 0.01422948855906725, + "learning_rate": 5.676237623762377e-06, + "loss": 0.0017, + "step": 196 + }, + { + "epoch": 0.9668711656441717, + "grad_norm": 0.3881951868534088, + "learning_rate": 5.66930693069307e-06, + "loss": 0.0239, + "step": 197 + }, + { + "epoch": 0.9717791411042945, + "grad_norm": 0.19655589759349823, + "learning_rate": 5.662376237623763e-06, + "loss": 0.0166, + "step": 198 + }, + { + "epoch": 0.9766871165644172, + "grad_norm": 0.22294123470783234, + "learning_rate": 5.655445544554456e-06, + "loss": 0.0106, + "step": 199 + }, + { + "epoch": 0.9815950920245399, + "grad_norm": 0.13569732010364532, + "learning_rate": 5.648514851485149e-06, + "loss": 0.0054, + "step": 200 + }, + { + "epoch": 0.9865030674846625, + "grad_norm": 0.1216830313205719, + "learning_rate": 5.641584158415842e-06, + "loss": 0.0075, + "step": 201 + }, + { + "epoch": 0.9914110429447853, + "grad_norm": 0.14887885749340057, + "learning_rate": 5.634653465346535e-06, + "loss": 0.0059, + "step": 202 + }, + { + "epoch": 0.996319018404908, + "grad_norm": 0.34408119320869446, + "learning_rate": 5.627722772277228e-06, + "loss": 0.0052, + "step": 203 + }, + { + "epoch": 1.0012269938650307, + "grad_norm": 0.022362129762768745, + "learning_rate": 5.620792079207921e-06, + "loss": 0.0016, + "step": 204 + }, + { + "epoch": 1.0061349693251533, + "grad_norm": 0.2365986853837967, + "learning_rate": 5.6138613861386145e-06, + "loss": 0.0205, + "step": 205 + }, + { + "epoch": 1.011042944785276, + "grad_norm": 0.007400259375572205, + "learning_rate": 5.6069306930693075e-06, + "loss": 0.0007, + "step": 206 + }, + { + "epoch": 1.0159509202453987, + "grad_norm": 0.016991781070828438, + "learning_rate": 5.600000000000001e-06, + "loss": 0.0019, + "step": 207 + }, + { + "epoch": 1.0208588957055216, + "grad_norm": 0.035710081458091736, + "learning_rate": 5.593069306930694e-06, + "loss": 0.0016, + "step": 208 + }, + { + "epoch": 1.0257668711656442, + "grad_norm": 0.16605080664157867, + "learning_rate": 5.586138613861387e-06, + "loss": 0.0127, + "step": 209 + }, + { + "epoch": 1.030674846625767, + "grad_norm": 0.029172230511903763, + "learning_rate": 5.57920792079208e-06, + "loss": 0.0026, + "step": 210 + }, + { + "epoch": 1.0355828220858896, + "grad_norm": 0.012522108852863312, + "learning_rate": 5.572277227722773e-06, + "loss": 0.0013, + "step": 211 + }, + { + "epoch": 1.0404907975460123, + "grad_norm": 0.2723598778247833, + "learning_rate": 5.565346534653466e-06, + "loss": 0.0244, + "step": 212 + }, + { + "epoch": 1.045398773006135, + "grad_norm": 0.10059408843517303, + "learning_rate": 5.558415841584159e-06, + "loss": 0.0052, + "step": 213 + }, + { + "epoch": 1.0503067484662576, + "grad_norm": 0.040718916803598404, + "learning_rate": 5.551485148514852e-06, + "loss": 0.0027, + "step": 214 + }, + { + "epoch": 1.0552147239263803, + "grad_norm": 0.044893424957990646, + "learning_rate": 5.544554455445545e-06, + "loss": 0.0025, + "step": 215 + }, + { + "epoch": 1.0601226993865032, + "grad_norm": 0.0643078088760376, + "learning_rate": 5.537623762376238e-06, + "loss": 0.0026, + "step": 216 + }, + { + "epoch": 1.0650306748466258, + "grad_norm": 0.23729190230369568, + "learning_rate": 5.530693069306931e-06, + "loss": 0.0073, + "step": 217 + }, + { + "epoch": 1.0699386503067485, + "grad_norm": 0.03545796498656273, + "learning_rate": 5.5237623762376235e-06, + "loss": 0.0043, + "step": 218 + }, + { + "epoch": 1.0748466257668712, + "grad_norm": 0.12645600736141205, + "learning_rate": 5.5168316831683166e-06, + "loss": 0.0168, + "step": 219 + }, + { + "epoch": 1.0797546012269938, + "grad_norm": 0.22768345475196838, + "learning_rate": 5.50990099009901e-06, + "loss": 0.0066, + "step": 220 + }, + { + "epoch": 1.0846625766871165, + "grad_norm": 0.008956272155046463, + "learning_rate": 5.502970297029703e-06, + "loss": 0.002, + "step": 221 + }, + { + "epoch": 1.0895705521472392, + "grad_norm": 0.14142072200775146, + "learning_rate": 5.496039603960396e-06, + "loss": 0.005, + "step": 222 + }, + { + "epoch": 1.0944785276073619, + "grad_norm": 0.3319880962371826, + "learning_rate": 5.489108910891089e-06, + "loss": 0.0116, + "step": 223 + }, + { + "epoch": 1.0993865030674848, + "grad_norm": 0.03383160009980202, + "learning_rate": 5.482178217821782e-06, + "loss": 0.0021, + "step": 224 + }, + { + "epoch": 1.1042944785276074, + "grad_norm": 0.03335406631231308, + "learning_rate": 5.475247524752475e-06, + "loss": 0.0022, + "step": 225 + }, + { + "epoch": 1.10920245398773, + "grad_norm": 0.06353459507226944, + "learning_rate": 5.468316831683168e-06, + "loss": 0.0062, + "step": 226 + }, + { + "epoch": 1.1141104294478528, + "grad_norm": 0.2315845489501953, + "learning_rate": 5.461386138613861e-06, + "loss": 0.0221, + "step": 227 + }, + { + "epoch": 1.1190184049079754, + "grad_norm": 0.04464046284556389, + "learning_rate": 5.454455445544554e-06, + "loss": 0.0023, + "step": 228 + }, + { + "epoch": 1.123926380368098, + "grad_norm": 0.49593213200569153, + "learning_rate": 5.447524752475247e-06, + "loss": 0.0484, + "step": 229 + }, + { + "epoch": 1.1288343558282208, + "grad_norm": 0.050679679960012436, + "learning_rate": 5.44059405940594e-06, + "loss": 0.0021, + "step": 230 + }, + { + "epoch": 1.1337423312883437, + "grad_norm": 0.08641895651817322, + "learning_rate": 5.433663366336633e-06, + "loss": 0.0022, + "step": 231 + }, + { + "epoch": 1.1386503067484663, + "grad_norm": 0.046292319893836975, + "learning_rate": 5.4267326732673265e-06, + "loss": 0.0034, + "step": 232 + }, + { + "epoch": 1.143558282208589, + "grad_norm": 0.11266610026359558, + "learning_rate": 5.4198019801980195e-06, + "loss": 0.0048, + "step": 233 + }, + { + "epoch": 1.1484662576687117, + "grad_norm": 0.4683886170387268, + "learning_rate": 5.412871287128713e-06, + "loss": 0.0095, + "step": 234 + }, + { + "epoch": 1.1533742331288344, + "grad_norm": 0.17976729571819305, + "learning_rate": 5.405940594059406e-06, + "loss": 0.0221, + "step": 235 + }, + { + "epoch": 1.158282208588957, + "grad_norm": 0.08795922249555588, + "learning_rate": 5.399009900990099e-06, + "loss": 0.003, + "step": 236 + }, + { + "epoch": 1.1631901840490797, + "grad_norm": 0.012228120118379593, + "learning_rate": 5.392079207920792e-06, + "loss": 0.0007, + "step": 237 + }, + { + "epoch": 1.1680981595092024, + "grad_norm": 0.04390159249305725, + "learning_rate": 5.385148514851485e-06, + "loss": 0.0019, + "step": 238 + }, + { + "epoch": 1.173006134969325, + "grad_norm": 0.03781568259000778, + "learning_rate": 5.378217821782178e-06, + "loss": 0.002, + "step": 239 + }, + { + "epoch": 1.177914110429448, + "grad_norm": 0.18868562579154968, + "learning_rate": 5.371287128712871e-06, + "loss": 0.0078, + "step": 240 + }, + { + "epoch": 1.1828220858895706, + "grad_norm": 0.12037086486816406, + "learning_rate": 5.364356435643564e-06, + "loss": 0.0047, + "step": 241 + }, + { + "epoch": 1.1877300613496933, + "grad_norm": 0.21462522447109222, + "learning_rate": 5.357425742574257e-06, + "loss": 0.0075, + "step": 242 + }, + { + "epoch": 1.192638036809816, + "grad_norm": 0.14189103245735168, + "learning_rate": 5.35049504950495e-06, + "loss": 0.0109, + "step": 243 + }, + { + "epoch": 1.1975460122699386, + "grad_norm": 0.30977487564086914, + "learning_rate": 5.343564356435643e-06, + "loss": 0.0142, + "step": 244 + }, + { + "epoch": 1.2024539877300613, + "grad_norm": 0.03001783974468708, + "learning_rate": 5.336633663366336e-06, + "loss": 0.0011, + "step": 245 + }, + { + "epoch": 1.207361963190184, + "grad_norm": 0.35600486397743225, + "learning_rate": 5.329702970297029e-06, + "loss": 0.0343, + "step": 246 + }, + { + "epoch": 1.2122699386503069, + "grad_norm": 0.023076960816979408, + "learning_rate": 5.3227722772277225e-06, + "loss": 0.0021, + "step": 247 + }, + { + "epoch": 1.2171779141104295, + "grad_norm": 0.23361201584339142, + "learning_rate": 5.3158415841584156e-06, + "loss": 0.0068, + "step": 248 + }, + { + "epoch": 1.2220858895705522, + "grad_norm": 0.21508128941059113, + "learning_rate": 5.308910891089109e-06, + "loss": 0.0247, + "step": 249 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.1582375317811966, + "learning_rate": 5.301980198019802e-06, + "loss": 0.0037, + "step": 250 + }, + { + "epoch": 1.2319018404907975, + "grad_norm": 0.037315189838409424, + "learning_rate": 5.295049504950495e-06, + "loss": 0.0014, + "step": 251 + }, + { + "epoch": 1.2368098159509202, + "grad_norm": 0.07501725107431412, + "learning_rate": 5.288118811881188e-06, + "loss": 0.0037, + "step": 252 + }, + { + "epoch": 1.2417177914110429, + "grad_norm": 0.0727037638425827, + "learning_rate": 5.281188118811881e-06, + "loss": 0.0028, + "step": 253 + }, + { + "epoch": 1.2466257668711656, + "grad_norm": 0.04310867562890053, + "learning_rate": 5.274257425742574e-06, + "loss": 0.0039, + "step": 254 + }, + { + "epoch": 1.2515337423312882, + "grad_norm": 0.042814526706933975, + "learning_rate": 5.267326732673267e-06, + "loss": 0.0025, + "step": 255 + }, + { + "epoch": 1.2564417177914111, + "grad_norm": 0.11775332689285278, + "learning_rate": 5.26039603960396e-06, + "loss": 0.0127, + "step": 256 + }, + { + "epoch": 1.2613496932515338, + "grad_norm": 0.040224045515060425, + "learning_rate": 5.253465346534653e-06, + "loss": 0.0017, + "step": 257 + }, + { + "epoch": 1.2662576687116565, + "grad_norm": 0.011041068471968174, + "learning_rate": 5.246534653465346e-06, + "loss": 0.0011, + "step": 258 + }, + { + "epoch": 1.2711656441717791, + "grad_norm": 0.019592339172959328, + "learning_rate": 5.239603960396039e-06, + "loss": 0.0013, + "step": 259 + }, + { + "epoch": 1.2760736196319018, + "grad_norm": 0.0652371346950531, + "learning_rate": 5.232673267326732e-06, + "loss": 0.0026, + "step": 260 + }, + { + "epoch": 1.2809815950920245, + "grad_norm": 0.5446717143058777, + "learning_rate": 5.2257425742574254e-06, + "loss": 0.0305, + "step": 261 + }, + { + "epoch": 1.2858895705521474, + "grad_norm": 0.02951255440711975, + "learning_rate": 5.2188118811881185e-06, + "loss": 0.001, + "step": 262 + }, + { + "epoch": 1.29079754601227, + "grad_norm": 0.1153414323925972, + "learning_rate": 5.211881188118812e-06, + "loss": 0.0088, + "step": 263 + }, + { + "epoch": 1.2957055214723927, + "grad_norm": 0.17192097008228302, + "learning_rate": 5.204950495049505e-06, + "loss": 0.0043, + "step": 264 + }, + { + "epoch": 1.3006134969325154, + "grad_norm": 0.04299500212073326, + "learning_rate": 5.198019801980198e-06, + "loss": 0.0042, + "step": 265 + }, + { + "epoch": 1.305521472392638, + "grad_norm": 0.02218775264918804, + "learning_rate": 5.191089108910891e-06, + "loss": 0.001, + "step": 266 + }, + { + "epoch": 1.3104294478527607, + "grad_norm": 0.7823259830474854, + "learning_rate": 5.184158415841584e-06, + "loss": 0.0556, + "step": 267 + }, + { + "epoch": 1.3153374233128834, + "grad_norm": 0.09460531175136566, + "learning_rate": 5.177227722772277e-06, + "loss": 0.0091, + "step": 268 + }, + { + "epoch": 1.320245398773006, + "grad_norm": 0.04762015491724014, + "learning_rate": 5.17029702970297e-06, + "loss": 0.0039, + "step": 269 + }, + { + "epoch": 1.3251533742331287, + "grad_norm": 0.033739686012268066, + "learning_rate": 5.163366336633663e-06, + "loss": 0.0017, + "step": 270 + }, + { + "epoch": 1.3300613496932514, + "grad_norm": 0.06530934572219849, + "learning_rate": 5.156435643564356e-06, + "loss": 0.0047, + "step": 271 + }, + { + "epoch": 1.3349693251533743, + "grad_norm": 0.014586723409593105, + "learning_rate": 5.149504950495049e-06, + "loss": 0.001, + "step": 272 + }, + { + "epoch": 1.339877300613497, + "grad_norm": 0.05435250699520111, + "learning_rate": 5.142574257425742e-06, + "loss": 0.0028, + "step": 273 + }, + { + "epoch": 1.3447852760736196, + "grad_norm": 0.1281612366437912, + "learning_rate": 5.135643564356435e-06, + "loss": 0.0026, + "step": 274 + }, + { + "epoch": 1.3496932515337423, + "grad_norm": 0.18499918282032013, + "learning_rate": 5.128712871287128e-06, + "loss": 0.0057, + "step": 275 + }, + { + "epoch": 1.354601226993865, + "grad_norm": 0.07541365176439285, + "learning_rate": 5.1217821782178215e-06, + "loss": 0.003, + "step": 276 + }, + { + "epoch": 1.3595092024539877, + "grad_norm": 0.061677657067775726, + "learning_rate": 5.1148514851485145e-06, + "loss": 0.002, + "step": 277 + }, + { + "epoch": 1.3644171779141105, + "grad_norm": 0.16630059480667114, + "learning_rate": 5.107920792079208e-06, + "loss": 0.0138, + "step": 278 + }, + { + "epoch": 1.3693251533742332, + "grad_norm": 0.13260015845298767, + "learning_rate": 5.100990099009901e-06, + "loss": 0.0015, + "step": 279 + }, + { + "epoch": 1.3742331288343559, + "grad_norm": 0.04029810056090355, + "learning_rate": 5.094059405940594e-06, + "loss": 0.0022, + "step": 280 + }, + { + "epoch": 1.3791411042944786, + "grad_norm": 0.07820600271224976, + "learning_rate": 5.087128712871287e-06, + "loss": 0.0012, + "step": 281 + }, + { + "epoch": 1.3840490797546012, + "grad_norm": 0.11421211808919907, + "learning_rate": 5.08019801980198e-06, + "loss": 0.0036, + "step": 282 + }, + { + "epoch": 1.388957055214724, + "grad_norm": 0.02426535077393055, + "learning_rate": 5.073267326732673e-06, + "loss": 0.0015, + "step": 283 + }, + { + "epoch": 1.3938650306748466, + "grad_norm": 0.00859303679317236, + "learning_rate": 5.066336633663366e-06, + "loss": 0.001, + "step": 284 + }, + { + "epoch": 1.3987730061349692, + "grad_norm": 0.03431880474090576, + "learning_rate": 5.059405940594059e-06, + "loss": 0.0021, + "step": 285 + }, + { + "epoch": 1.403680981595092, + "grad_norm": 0.013164886273443699, + "learning_rate": 5.052475247524752e-06, + "loss": 0.0009, + "step": 286 + }, + { + "epoch": 1.4085889570552146, + "grad_norm": 0.1018059104681015, + "learning_rate": 5.045544554455445e-06, + "loss": 0.003, + "step": 287 + }, + { + "epoch": 1.4134969325153375, + "grad_norm": 0.3569484055042267, + "learning_rate": 5.038613861386138e-06, + "loss": 0.0193, + "step": 288 + }, + { + "epoch": 1.4184049079754601, + "grad_norm": 0.03560711443424225, + "learning_rate": 5.031683168316831e-06, + "loss": 0.0041, + "step": 289 + }, + { + "epoch": 1.4233128834355828, + "grad_norm": 0.049912016838788986, + "learning_rate": 5.0247524752475244e-06, + "loss": 0.0019, + "step": 290 + }, + { + "epoch": 1.4282208588957055, + "grad_norm": 0.00541495019569993, + "learning_rate": 5.0178217821782175e-06, + "loss": 0.0008, + "step": 291 + }, + { + "epoch": 1.4331288343558282, + "grad_norm": 0.188345804810524, + "learning_rate": 5.0108910891089106e-06, + "loss": 0.0138, + "step": 292 + }, + { + "epoch": 1.438036809815951, + "grad_norm": 1.1841977834701538, + "learning_rate": 5.003960396039604e-06, + "loss": 0.0285, + "step": 293 + }, + { + "epoch": 1.4429447852760737, + "grad_norm": 0.01752946898341179, + "learning_rate": 4.997029702970297e-06, + "loss": 0.003, + "step": 294 + }, + { + "epoch": 1.4478527607361964, + "grad_norm": 0.009354379959404469, + "learning_rate": 4.99009900990099e-06, + "loss": 0.001, + "step": 295 + }, + { + "epoch": 1.452760736196319, + "grad_norm": 0.5457541346549988, + "learning_rate": 4.983168316831683e-06, + "loss": 0.0028, + "step": 296 + }, + { + "epoch": 1.4576687116564417, + "grad_norm": 0.012560038827359676, + "learning_rate": 4.976237623762376e-06, + "loss": 0.0014, + "step": 297 + }, + { + "epoch": 1.4625766871165644, + "grad_norm": 0.08460962772369385, + "learning_rate": 4.969306930693069e-06, + "loss": 0.0054, + "step": 298 + }, + { + "epoch": 1.467484662576687, + "grad_norm": 0.02863544225692749, + "learning_rate": 4.962376237623762e-06, + "loss": 0.0019, + "step": 299 + }, + { + "epoch": 1.4723926380368098, + "grad_norm": 0.12013754993677139, + "learning_rate": 4.955445544554455e-06, + "loss": 0.0108, + "step": 300 + }, + { + "epoch": 1.4773006134969324, + "grad_norm": 0.06849315017461777, + "learning_rate": 4.948514851485148e-06, + "loss": 0.0023, + "step": 301 + }, + { + "epoch": 1.482208588957055, + "grad_norm": 0.07875117659568787, + "learning_rate": 4.941584158415841e-06, + "loss": 0.003, + "step": 302 + }, + { + "epoch": 1.487116564417178, + "grad_norm": 0.03466253727674484, + "learning_rate": 4.934653465346534e-06, + "loss": 0.0023, + "step": 303 + }, + { + "epoch": 1.4920245398773007, + "grad_norm": 0.02622813917696476, + "learning_rate": 4.927722772277227e-06, + "loss": 0.0019, + "step": 304 + }, + { + "epoch": 1.4969325153374233, + "grad_norm": 0.08774946630001068, + "learning_rate": 4.9207920792079205e-06, + "loss": 0.0062, + "step": 305 + }, + { + "epoch": 1.501840490797546, + "grad_norm": 0.038571231067180634, + "learning_rate": 4.9138613861386135e-06, + "loss": 0.002, + "step": 306 + }, + { + "epoch": 1.5067484662576687, + "grad_norm": 0.058768339455127716, + "learning_rate": 4.906930693069307e-06, + "loss": 0.0024, + "step": 307 + }, + { + "epoch": 1.5116564417177916, + "grad_norm": 0.01673268899321556, + "learning_rate": 4.9e-06, + "loss": 0.0016, + "step": 308 + }, + { + "epoch": 1.5165644171779142, + "grad_norm": 0.1858752965927124, + "learning_rate": 4.893069306930693e-06, + "loss": 0.0042, + "step": 309 + }, + { + "epoch": 1.521472392638037, + "grad_norm": 0.017566069960594177, + "learning_rate": 4.886138613861386e-06, + "loss": 0.0012, + "step": 310 + }, + { + "epoch": 1.5263803680981596, + "grad_norm": 0.021138716489076614, + "learning_rate": 4.879207920792079e-06, + "loss": 0.0018, + "step": 311 + }, + { + "epoch": 1.5312883435582823, + "grad_norm": 0.005446423310786486, + "learning_rate": 4.872277227722772e-06, + "loss": 0.0012, + "step": 312 + }, + { + "epoch": 1.536196319018405, + "grad_norm": 0.034122321754693985, + "learning_rate": 4.865346534653465e-06, + "loss": 0.0029, + "step": 313 + }, + { + "epoch": 1.5411042944785276, + "grad_norm": 0.015392904169857502, + "learning_rate": 4.858415841584158e-06, + "loss": 0.001, + "step": 314 + }, + { + "epoch": 1.5460122699386503, + "grad_norm": 0.09795702993869781, + "learning_rate": 4.851485148514851e-06, + "loss": 0.006, + "step": 315 + }, + { + "epoch": 1.550920245398773, + "grad_norm": 0.18449151515960693, + "learning_rate": 4.844554455445544e-06, + "loss": 0.0543, + "step": 316 + }, + { + "epoch": 1.5558282208588956, + "grad_norm": 0.3047311007976532, + "learning_rate": 4.837623762376237e-06, + "loss": 0.0148, + "step": 317 + }, + { + "epoch": 1.5607361963190183, + "grad_norm": 0.3502406179904938, + "learning_rate": 4.83069306930693e-06, + "loss": 0.0083, + "step": 318 + }, + { + "epoch": 1.565644171779141, + "grad_norm": 0.08116799592971802, + "learning_rate": 4.8237623762376234e-06, + "loss": 0.011, + "step": 319 + }, + { + "epoch": 1.5705521472392638, + "grad_norm": 6.128328323364258, + "learning_rate": 4.8168316831683165e-06, + "loss": 0.0234, + "step": 320 + }, + { + "epoch": 1.5754601226993865, + "grad_norm": 0.31991851329803467, + "learning_rate": 4.80990099009901e-06, + "loss": 0.0161, + "step": 321 + }, + { + "epoch": 1.5803680981595092, + "grad_norm": 0.08932381123304367, + "learning_rate": 4.8029702970297035e-06, + "loss": 0.0024, + "step": 322 + }, + { + "epoch": 1.5852760736196319, + "grad_norm": 0.03641815483570099, + "learning_rate": 4.7960396039603965e-06, + "loss": 0.0013, + "step": 323 + }, + { + "epoch": 1.5901840490797547, + "grad_norm": 0.12057910859584808, + "learning_rate": 4.78910891089109e-06, + "loss": 0.0045, + "step": 324 + }, + { + "epoch": 1.5950920245398774, + "grad_norm": 0.022239159792661667, + "learning_rate": 4.782178217821783e-06, + "loss": 0.002, + "step": 325 + }, + { + "epoch": 1.6, + "grad_norm": 0.02001642808318138, + "learning_rate": 4.775247524752476e-06, + "loss": 0.0011, + "step": 326 + }, + { + "epoch": 1.6049079754601228, + "grad_norm": 0.0681408941745758, + "learning_rate": 4.768316831683169e-06, + "loss": 0.0034, + "step": 327 + }, + { + "epoch": 1.6098159509202454, + "grad_norm": 0.07200402021408081, + "learning_rate": 4.761386138613862e-06, + "loss": 0.0023, + "step": 328 + }, + { + "epoch": 1.614723926380368, + "grad_norm": 0.11577661335468292, + "learning_rate": 4.754455445544555e-06, + "loss": 0.0122, + "step": 329 + }, + { + "epoch": 1.6196319018404908, + "grad_norm": 0.12311868369579315, + "learning_rate": 4.747524752475248e-06, + "loss": 0.0148, + "step": 330 + }, + { + "epoch": 1.6245398773006134, + "grad_norm": 0.07884922623634338, + "learning_rate": 4.740594059405941e-06, + "loss": 0.0026, + "step": 331 + }, + { + "epoch": 1.6294478527607361, + "grad_norm": 0.013971218839287758, + "learning_rate": 4.733663366336634e-06, + "loss": 0.0014, + "step": 332 + }, + { + "epoch": 1.6343558282208588, + "grad_norm": 0.17173822224140167, + "learning_rate": 4.726732673267327e-06, + "loss": 0.0041, + "step": 333 + }, + { + "epoch": 1.6392638036809815, + "grad_norm": 0.24089990556240082, + "learning_rate": 4.71980198019802e-06, + "loss": 0.0114, + "step": 334 + }, + { + "epoch": 1.6441717791411041, + "grad_norm": 0.026243751868605614, + "learning_rate": 4.712871287128713e-06, + "loss": 0.0011, + "step": 335 + }, + { + "epoch": 1.649079754601227, + "grad_norm": 0.08318141102790833, + "learning_rate": 4.7059405940594064e-06, + "loss": 0.0034, + "step": 336 + }, + { + "epoch": 1.6539877300613497, + "grad_norm": 0.010400490835309029, + "learning_rate": 4.6990099009900995e-06, + "loss": 0.0022, + "step": 337 + }, + { + "epoch": 1.6588957055214724, + "grad_norm": 0.044309504330158234, + "learning_rate": 4.692079207920793e-06, + "loss": 0.004, + "step": 338 + }, + { + "epoch": 1.6638036809815953, + "grad_norm": 0.02144954912364483, + "learning_rate": 4.685148514851486e-06, + "loss": 0.0018, + "step": 339 + }, + { + "epoch": 1.668711656441718, + "grad_norm": 0.0410357341170311, + "learning_rate": 4.678217821782179e-06, + "loss": 0.002, + "step": 340 + }, + { + "epoch": 1.6736196319018406, + "grad_norm": 0.02522197738289833, + "learning_rate": 4.671287128712872e-06, + "loss": 0.0014, + "step": 341 + }, + { + "epoch": 1.6785276073619633, + "grad_norm": 0.008343094028532505, + "learning_rate": 4.664356435643565e-06, + "loss": 0.0016, + "step": 342 + }, + { + "epoch": 1.683435582822086, + "grad_norm": 0.19290384650230408, + "learning_rate": 4.657425742574258e-06, + "loss": 0.0063, + "step": 343 + }, + { + "epoch": 1.6883435582822086, + "grad_norm": 0.038624536246061325, + "learning_rate": 4.650495049504951e-06, + "loss": 0.0016, + "step": 344 + }, + { + "epoch": 1.6932515337423313, + "grad_norm": 0.21920740604400635, + "learning_rate": 4.643564356435644e-06, + "loss": 0.0105, + "step": 345 + }, + { + "epoch": 1.698159509202454, + "grad_norm": 0.057425156235694885, + "learning_rate": 4.636633663366337e-06, + "loss": 0.0033, + "step": 346 + }, + { + "epoch": 1.7030674846625766, + "grad_norm": 0.024981388822197914, + "learning_rate": 4.62970297029703e-06, + "loss": 0.0025, + "step": 347 + }, + { + "epoch": 1.7079754601226993, + "grad_norm": 0.020687798038125038, + "learning_rate": 4.622772277227723e-06, + "loss": 0.0022, + "step": 348 + }, + { + "epoch": 1.712883435582822, + "grad_norm": 0.028213948011398315, + "learning_rate": 4.615841584158416e-06, + "loss": 0.0014, + "step": 349 + }, + { + "epoch": 1.7177914110429446, + "grad_norm": 0.05201176926493645, + "learning_rate": 4.608910891089109e-06, + "loss": 0.0068, + "step": 350 + }, + { + "epoch": 1.7226993865030675, + "grad_norm": 0.057419948279857635, + "learning_rate": 4.6019801980198025e-06, + "loss": 0.0027, + "step": 351 + }, + { + "epoch": 1.7276073619631902, + "grad_norm": 0.026078782975673676, + "learning_rate": 4.5950495049504955e-06, + "loss": 0.0017, + "step": 352 + }, + { + "epoch": 1.7325153374233129, + "grad_norm": 0.09312699735164642, + "learning_rate": 4.588118811881189e-06, + "loss": 0.0048, + "step": 353 + }, + { + "epoch": 1.7374233128834355, + "grad_norm": 0.027963656932115555, + "learning_rate": 4.581188118811882e-06, + "loss": 0.0018, + "step": 354 + }, + { + "epoch": 1.7423312883435584, + "grad_norm": 1.0171724557876587, + "learning_rate": 4.574257425742575e-06, + "loss": 0.044, + "step": 355 + }, + { + "epoch": 1.747239263803681, + "grad_norm": 0.0031565092504024506, + "learning_rate": 4.567326732673268e-06, + "loss": 0.0003, + "step": 356 + }, + { + "epoch": 1.7521472392638038, + "grad_norm": 0.023803675547242165, + "learning_rate": 4.560396039603961e-06, + "loss": 0.003, + "step": 357 + }, + { + "epoch": 1.7570552147239265, + "grad_norm": 0.11927448213100433, + "learning_rate": 4.553465346534654e-06, + "loss": 0.0046, + "step": 358 + }, + { + "epoch": 1.7619631901840491, + "grad_norm": 0.20977821946144104, + "learning_rate": 4.546534653465347e-06, + "loss": 0.0287, + "step": 359 + }, + { + "epoch": 1.7668711656441718, + "grad_norm": 0.024297788739204407, + "learning_rate": 4.53960396039604e-06, + "loss": 0.0016, + "step": 360 + }, + { + "epoch": 1.7717791411042945, + "grad_norm": 0.0834914818406105, + "learning_rate": 4.532673267326733e-06, + "loss": 0.0032, + "step": 361 + }, + { + "epoch": 1.7766871165644171, + "grad_norm": 0.07108546048402786, + "learning_rate": 4.525742574257426e-06, + "loss": 0.0021, + "step": 362 + }, + { + "epoch": 1.7815950920245398, + "grad_norm": 0.037000950425863266, + "learning_rate": 4.518811881188119e-06, + "loss": 0.0019, + "step": 363 + }, + { + "epoch": 1.7865030674846625, + "grad_norm": 0.050357453525066376, + "learning_rate": 4.5118811881188115e-06, + "loss": 0.0017, + "step": 364 + }, + { + "epoch": 1.7914110429447851, + "grad_norm": 0.15029194951057434, + "learning_rate": 4.504950495049505e-06, + "loss": 0.0054, + "step": 365 + }, + { + "epoch": 1.7963190184049078, + "grad_norm": 0.10820876061916351, + "learning_rate": 4.498019801980198e-06, + "loss": 0.0029, + "step": 366 + }, + { + "epoch": 1.8012269938650307, + "grad_norm": 0.014579812064766884, + "learning_rate": 4.491089108910891e-06, + "loss": 0.0014, + "step": 367 + }, + { + "epoch": 1.8061349693251534, + "grad_norm": 0.022982638329267502, + "learning_rate": 4.484158415841584e-06, + "loss": 0.0012, + "step": 368 + }, + { + "epoch": 1.811042944785276, + "grad_norm": 0.12965136766433716, + "learning_rate": 4.477227722772277e-06, + "loss": 0.0059, + "step": 369 + }, + { + "epoch": 1.8159509202453987, + "grad_norm": 0.2906913161277771, + "learning_rate": 4.47029702970297e-06, + "loss": 0.0061, + "step": 370 + }, + { + "epoch": 1.8208588957055216, + "grad_norm": 0.04156769439578056, + "learning_rate": 4.463366336633663e-06, + "loss": 0.0031, + "step": 371 + }, + { + "epoch": 1.8257668711656443, + "grad_norm": 0.07649008929729462, + "learning_rate": 4.456435643564356e-06, + "loss": 0.0027, + "step": 372 + }, + { + "epoch": 1.830674846625767, + "grad_norm": 0.11019697785377502, + "learning_rate": 4.449504950495049e-06, + "loss": 0.0023, + "step": 373 + }, + { + "epoch": 1.8355828220858896, + "grad_norm": 0.03163640573620796, + "learning_rate": 4.442574257425742e-06, + "loss": 0.0024, + "step": 374 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.2059604823589325, + "learning_rate": 4.435643564356435e-06, + "loss": 0.0154, + "step": 375 + }, + { + "epoch": 1.845398773006135, + "grad_norm": 0.00596796628087759, + "learning_rate": 4.428712871287128e-06, + "loss": 0.0003, + "step": 376 + }, + { + "epoch": 1.8503067484662576, + "grad_norm": 0.1259843111038208, + "learning_rate": 4.421782178217821e-06, + "loss": 0.002, + "step": 377 + }, + { + "epoch": 1.8552147239263803, + "grad_norm": 0.051318589597940445, + "learning_rate": 4.4148514851485145e-06, + "loss": 0.0054, + "step": 378 + }, + { + "epoch": 1.860122699386503, + "grad_norm": 0.022389305755496025, + "learning_rate": 4.4079207920792075e-06, + "loss": 0.0011, + "step": 379 + }, + { + "epoch": 1.8650306748466257, + "grad_norm": 0.03204929828643799, + "learning_rate": 4.400990099009901e-06, + "loss": 0.002, + "step": 380 + }, + { + "epoch": 1.8699386503067483, + "grad_norm": 0.14024299383163452, + "learning_rate": 4.394059405940594e-06, + "loss": 0.0044, + "step": 381 + }, + { + "epoch": 1.874846625766871, + "grad_norm": 0.1390925794839859, + "learning_rate": 4.387128712871287e-06, + "loss": 0.0125, + "step": 382 + }, + { + "epoch": 1.879754601226994, + "grad_norm": 0.19391588866710663, + "learning_rate": 4.38019801980198e-06, + "loss": 0.0053, + "step": 383 + }, + { + "epoch": 1.8846625766871166, + "grad_norm": 0.059033554047346115, + "learning_rate": 4.373267326732673e-06, + "loss": 0.0036, + "step": 384 + }, + { + "epoch": 1.8895705521472392, + "grad_norm": 0.03923160582780838, + "learning_rate": 4.366336633663366e-06, + "loss": 0.0021, + "step": 385 + }, + { + "epoch": 1.8944785276073621, + "grad_norm": 0.2464819848537445, + "learning_rate": 4.359405940594059e-06, + "loss": 0.0157, + "step": 386 + }, + { + "epoch": 1.8993865030674848, + "grad_norm": 0.02465728111565113, + "learning_rate": 4.352475247524752e-06, + "loss": 0.002, + "step": 387 + }, + { + "epoch": 1.9042944785276075, + "grad_norm": 0.017552992329001427, + "learning_rate": 4.345544554455445e-06, + "loss": 0.0013, + "step": 388 + }, + { + "epoch": 1.9092024539877301, + "grad_norm": 0.07727736979722977, + "learning_rate": 4.338613861386138e-06, + "loss": 0.004, + "step": 389 + }, + { + "epoch": 1.9141104294478528, + "grad_norm": 0.0594131238758564, + "learning_rate": 4.331683168316831e-06, + "loss": 0.0021, + "step": 390 + }, + { + "epoch": 1.9190184049079755, + "grad_norm": 0.03132615610957146, + "learning_rate": 4.324752475247524e-06, + "loss": 0.0057, + "step": 391 + }, + { + "epoch": 1.9239263803680982, + "grad_norm": 0.02295020781457424, + "learning_rate": 4.3178217821782174e-06, + "loss": 0.0014, + "step": 392 + }, + { + "epoch": 1.9288343558282208, + "grad_norm": 0.04230022802948952, + "learning_rate": 4.3108910891089105e-06, + "loss": 0.0035, + "step": 393 + }, + { + "epoch": 1.9337423312883435, + "grad_norm": 0.012272198684513569, + "learning_rate": 4.3039603960396036e-06, + "loss": 0.0027, + "step": 394 + }, + { + "epoch": 1.9386503067484662, + "grad_norm": 0.017140038311481476, + "learning_rate": 4.297029702970297e-06, + "loss": 0.0015, + "step": 395 + }, + { + "epoch": 1.9435582822085888, + "grad_norm": 0.011837287805974483, + "learning_rate": 4.29009900990099e-06, + "loss": 0.0017, + "step": 396 + }, + { + "epoch": 1.9484662576687115, + "grad_norm": 0.03815172612667084, + "learning_rate": 4.283168316831683e-06, + "loss": 0.0036, + "step": 397 + }, + { + "epoch": 1.9533742331288344, + "grad_norm": 0.029975654557347298, + "learning_rate": 4.276237623762376e-06, + "loss": 0.0017, + "step": 398 + }, + { + "epoch": 1.958282208588957, + "grad_norm": 0.09029775857925415, + "learning_rate": 4.269306930693069e-06, + "loss": 0.0034, + "step": 399 + }, + { + "epoch": 1.9631901840490797, + "grad_norm": 0.013478988781571388, + "learning_rate": 4.262376237623762e-06, + "loss": 0.0012, + "step": 400 + }, + { + "epoch": 1.9680981595092024, + "grad_norm": 0.14020872116088867, + "learning_rate": 4.255445544554455e-06, + "loss": 0.0036, + "step": 401 + }, + { + "epoch": 1.9730061349693253, + "grad_norm": 0.08427014946937561, + "learning_rate": 4.248514851485148e-06, + "loss": 0.0032, + "step": 402 + }, + { + "epoch": 1.977914110429448, + "grad_norm": 0.02368674799799919, + "learning_rate": 4.241584158415841e-06, + "loss": 0.0018, + "step": 403 + }, + { + "epoch": 1.9828220858895707, + "grad_norm": 0.17322583496570587, + "learning_rate": 4.234653465346534e-06, + "loss": 0.0075, + "step": 404 + }, + { + "epoch": 1.9877300613496933, + "grad_norm": 0.2020941525697708, + "learning_rate": 4.227722772277227e-06, + "loss": 0.0056, + "step": 405 + }, + { + "epoch": 1.992638036809816, + "grad_norm": 0.01503934245556593, + "learning_rate": 4.22079207920792e-06, + "loss": 0.0014, + "step": 406 + }, + { + "epoch": 1.9975460122699387, + "grad_norm": 0.08337133377790451, + "learning_rate": 4.2138613861386135e-06, + "loss": 0.0022, + "step": 407 + }, + { + "epoch": 2.0024539877300613, + "grad_norm": 0.03418401628732681, + "learning_rate": 4.2069306930693065e-06, + "loss": 0.0014, + "step": 408 + }, + { + "epoch": 2.007361963190184, + "grad_norm": 0.15800650417804718, + "learning_rate": 4.2e-06, + "loss": 0.0152, + "step": 409 + }, + { + "epoch": 2.0122699386503067, + "grad_norm": 0.07395298033952713, + "learning_rate": 4.193069306930693e-06, + "loss": 0.0027, + "step": 410 + }, + { + "epoch": 2.0171779141104293, + "grad_norm": 0.013158326968550682, + "learning_rate": 4.186138613861386e-06, + "loss": 0.0015, + "step": 411 + }, + { + "epoch": 2.022085889570552, + "grad_norm": 0.0122208371758461, + "learning_rate": 4.179207920792079e-06, + "loss": 0.0015, + "step": 412 + }, + { + "epoch": 2.0269938650306747, + "grad_norm": 0.04391651973128319, + "learning_rate": 4.172277227722772e-06, + "loss": 0.0028, + "step": 413 + }, + { + "epoch": 2.0319018404907974, + "grad_norm": 0.007677063811570406, + "learning_rate": 4.165346534653465e-06, + "loss": 0.0004, + "step": 414 + }, + { + "epoch": 2.03680981595092, + "grad_norm": 0.01934950426220894, + "learning_rate": 4.158415841584158e-06, + "loss": 0.0013, + "step": 415 + }, + { + "epoch": 2.041717791411043, + "grad_norm": 0.004791139159351587, + "learning_rate": 4.151485148514851e-06, + "loss": 0.0005, + "step": 416 + }, + { + "epoch": 2.046625766871166, + "grad_norm": 0.03984725847840309, + "learning_rate": 4.144554455445544e-06, + "loss": 0.0015, + "step": 417 + }, + { + "epoch": 2.0515337423312885, + "grad_norm": 0.6277703642845154, + "learning_rate": 4.137623762376237e-06, + "loss": 0.0269, + "step": 418 + }, + { + "epoch": 2.056441717791411, + "grad_norm": 0.1700657159090042, + "learning_rate": 4.13069306930693e-06, + "loss": 0.0196, + "step": 419 + }, + { + "epoch": 2.061349693251534, + "grad_norm": 0.045236505568027496, + "learning_rate": 4.123762376237623e-06, + "loss": 0.003, + "step": 420 + }, + { + "epoch": 2.0662576687116565, + "grad_norm": 0.034617915749549866, + "learning_rate": 4.116831683168316e-06, + "loss": 0.0045, + "step": 421 + }, + { + "epoch": 2.071165644171779, + "grad_norm": 0.022638553753495216, + "learning_rate": 4.1099009900990095e-06, + "loss": 0.0014, + "step": 422 + }, + { + "epoch": 2.076073619631902, + "grad_norm": 0.011074609123170376, + "learning_rate": 4.1029702970297026e-06, + "loss": 0.0006, + "step": 423 + }, + { + "epoch": 2.0809815950920245, + "grad_norm": 0.007294974289834499, + "learning_rate": 4.096039603960396e-06, + "loss": 0.0013, + "step": 424 + }, + { + "epoch": 2.085889570552147, + "grad_norm": 0.09458526223897934, + "learning_rate": 4.089108910891089e-06, + "loss": 0.0018, + "step": 425 + }, + { + "epoch": 2.09079754601227, + "grad_norm": 0.02894427254796028, + "learning_rate": 4.082178217821782e-06, + "loss": 0.0016, + "step": 426 + }, + { + "epoch": 2.0957055214723925, + "grad_norm": 0.030396826565265656, + "learning_rate": 4.075247524752475e-06, + "loss": 0.0018, + "step": 427 + }, + { + "epoch": 2.100613496932515, + "grad_norm": 0.11630258709192276, + "learning_rate": 4.068316831683168e-06, + "loss": 0.0081, + "step": 428 + }, + { + "epoch": 2.105521472392638, + "grad_norm": 0.010553350672125816, + "learning_rate": 4.061386138613861e-06, + "loss": 0.0008, + "step": 429 + }, + { + "epoch": 2.1104294478527605, + "grad_norm": 0.04130052775144577, + "learning_rate": 4.054455445544554e-06, + "loss": 0.0026, + "step": 430 + }, + { + "epoch": 2.1153374233128837, + "grad_norm": 0.10501858592033386, + "learning_rate": 4.047524752475247e-06, + "loss": 0.0396, + "step": 431 + }, + { + "epoch": 2.1202453987730063, + "grad_norm": 0.13223466277122498, + "learning_rate": 4.04059405940594e-06, + "loss": 0.0111, + "step": 432 + }, + { + "epoch": 2.125153374233129, + "grad_norm": 0.015712805092334747, + "learning_rate": 4.033663366336633e-06, + "loss": 0.0012, + "step": 433 + }, + { + "epoch": 2.1300613496932517, + "grad_norm": 0.023326637223362923, + "learning_rate": 4.026732673267326e-06, + "loss": 0.0017, + "step": 434 + }, + { + "epoch": 2.1349693251533743, + "grad_norm": 0.03263499587774277, + "learning_rate": 4.019801980198019e-06, + "loss": 0.002, + "step": 435 + }, + { + "epoch": 2.139877300613497, + "grad_norm": 0.018768969923257828, + "learning_rate": 4.0128712871287124e-06, + "loss": 0.0015, + "step": 436 + }, + { + "epoch": 2.1447852760736197, + "grad_norm": 0.4300386905670166, + "learning_rate": 4.0059405940594055e-06, + "loss": 0.064, + "step": 437 + }, + { + "epoch": 2.1496932515337424, + "grad_norm": 0.01180424727499485, + "learning_rate": 3.999009900990099e-06, + "loss": 0.0018, + "step": 438 + }, + { + "epoch": 2.154601226993865, + "grad_norm": 0.007693049497902393, + "learning_rate": 3.992079207920792e-06, + "loss": 0.0007, + "step": 439 + }, + { + "epoch": 2.1595092024539877, + "grad_norm": 0.04772064834833145, + "learning_rate": 3.985148514851485e-06, + "loss": 0.0031, + "step": 440 + }, + { + "epoch": 2.1644171779141104, + "grad_norm": 0.017260396853089333, + "learning_rate": 3.978217821782178e-06, + "loss": 0.001, + "step": 441 + }, + { + "epoch": 2.169325153374233, + "grad_norm": 0.08507797867059708, + "learning_rate": 3.971287128712871e-06, + "loss": 0.0063, + "step": 442 + }, + { + "epoch": 2.1742331288343557, + "grad_norm": 0.02232922799885273, + "learning_rate": 3.964356435643564e-06, + "loss": 0.002, + "step": 443 + }, + { + "epoch": 2.1791411042944784, + "grad_norm": 0.06088268384337425, + "learning_rate": 3.957425742574257e-06, + "loss": 0.0058, + "step": 444 + }, + { + "epoch": 2.184049079754601, + "grad_norm": 0.15273047983646393, + "learning_rate": 3.95049504950495e-06, + "loss": 0.0035, + "step": 445 + }, + { + "epoch": 2.1889570552147237, + "grad_norm": 0.014187682420015335, + "learning_rate": 3.943564356435643e-06, + "loss": 0.0014, + "step": 446 + }, + { + "epoch": 2.1938650306748464, + "grad_norm": 0.1353190392255783, + "learning_rate": 3.936633663366337e-06, + "loss": 0.003, + "step": 447 + }, + { + "epoch": 2.1987730061349695, + "grad_norm": 0.1862998902797699, + "learning_rate": 3.92970297029703e-06, + "loss": 0.0232, + "step": 448 + }, + { + "epoch": 2.203680981595092, + "grad_norm": 0.06980832666158676, + "learning_rate": 3.922772277227723e-06, + "loss": 0.0018, + "step": 449 + }, + { + "epoch": 2.208588957055215, + "grad_norm": 0.012548700906336308, + "learning_rate": 3.915841584158416e-06, + "loss": 0.0009, + "step": 450 + }, + { + "epoch": 2.2134969325153375, + "grad_norm": 0.0944492369890213, + "learning_rate": 3.908910891089109e-06, + "loss": 0.0071, + "step": 451 + }, + { + "epoch": 2.21840490797546, + "grad_norm": 0.07349948585033417, + "learning_rate": 3.901980198019802e-06, + "loss": 0.0073, + "step": 452 + }, + { + "epoch": 2.223312883435583, + "grad_norm": 0.016509253531694412, + "learning_rate": 3.8950495049504955e-06, + "loss": 0.0012, + "step": 453 + }, + { + "epoch": 2.2282208588957055, + "grad_norm": 0.0530993714928627, + "learning_rate": 3.8881188118811885e-06, + "loss": 0.0025, + "step": 454 + }, + { + "epoch": 2.233128834355828, + "grad_norm": 0.014129874296486378, + "learning_rate": 3.881188118811882e-06, + "loss": 0.002, + "step": 455 + }, + { + "epoch": 2.238036809815951, + "grad_norm": 0.16294129192829132, + "learning_rate": 3.874257425742575e-06, + "loss": 0.0049, + "step": 456 + }, + { + "epoch": 2.2429447852760735, + "grad_norm": 0.03273903578519821, + "learning_rate": 3.867326732673268e-06, + "loss": 0.0019, + "step": 457 + }, + { + "epoch": 2.247852760736196, + "grad_norm": 0.04015541449189186, + "learning_rate": 3.860396039603961e-06, + "loss": 0.0021, + "step": 458 + }, + { + "epoch": 2.252760736196319, + "grad_norm": 0.02188378944993019, + "learning_rate": 3.853465346534654e-06, + "loss": 0.001, + "step": 459 + }, + { + "epoch": 2.2576687116564416, + "grad_norm": 0.02690410614013672, + "learning_rate": 3.846534653465347e-06, + "loss": 0.0013, + "step": 460 + }, + { + "epoch": 2.2625766871165642, + "grad_norm": 0.08112508058547974, + "learning_rate": 3.83960396039604e-06, + "loss": 0.0027, + "step": 461 + }, + { + "epoch": 2.2674846625766873, + "grad_norm": 0.015935292467474937, + "learning_rate": 3.832673267326733e-06, + "loss": 0.0024, + "step": 462 + }, + { + "epoch": 2.27239263803681, + "grad_norm": 0.26772162318229675, + "learning_rate": 3.825742574257426e-06, + "loss": 0.0118, + "step": 463 + }, + { + "epoch": 2.2773006134969327, + "grad_norm": 0.035102855414152145, + "learning_rate": 3.818811881188119e-06, + "loss": 0.001, + "step": 464 + }, + { + "epoch": 2.2822085889570554, + "grad_norm": 0.03706807270646095, + "learning_rate": 3.8118811881188123e-06, + "loss": 0.0019, + "step": 465 + }, + { + "epoch": 2.287116564417178, + "grad_norm": 0.012720320373773575, + "learning_rate": 3.8049504950495054e-06, + "loss": 0.0012, + "step": 466 + }, + { + "epoch": 2.2920245398773007, + "grad_norm": 0.07940246909856796, + "learning_rate": 3.7980198019801984e-06, + "loss": 0.0045, + "step": 467 + }, + { + "epoch": 2.2969325153374234, + "grad_norm": 0.017415238544344902, + "learning_rate": 3.7910891089108915e-06, + "loss": 0.0012, + "step": 468 + }, + { + "epoch": 2.301840490797546, + "grad_norm": 0.1183788999915123, + "learning_rate": 3.7841584158415846e-06, + "loss": 0.0076, + "step": 469 + }, + { + "epoch": 2.3067484662576687, + "grad_norm": 0.01171356625854969, + "learning_rate": 3.7772277227722776e-06, + "loss": 0.001, + "step": 470 + }, + { + "epoch": 2.3116564417177914, + "grad_norm": 0.010136552155017853, + "learning_rate": 3.7702970297029703e-06, + "loss": 0.0009, + "step": 471 + }, + { + "epoch": 2.316564417177914, + "grad_norm": 0.028086962178349495, + "learning_rate": 3.7633663366336633e-06, + "loss": 0.0011, + "step": 472 + }, + { + "epoch": 2.3214723926380367, + "grad_norm": 0.00958373211324215, + "learning_rate": 3.7564356435643564e-06, + "loss": 0.0014, + "step": 473 + }, + { + "epoch": 2.3263803680981594, + "grad_norm": 0.034549593925476074, + "learning_rate": 3.7495049504950495e-06, + "loss": 0.0016, + "step": 474 + }, + { + "epoch": 2.331288343558282, + "grad_norm": 0.02567419782280922, + "learning_rate": 3.7425742574257425e-06, + "loss": 0.0017, + "step": 475 + }, + { + "epoch": 2.3361963190184047, + "grad_norm": 0.0217987522482872, + "learning_rate": 3.7356435643564356e-06, + "loss": 0.0012, + "step": 476 + }, + { + "epoch": 2.3411042944785274, + "grad_norm": 0.00880768895149231, + "learning_rate": 3.7287128712871287e-06, + "loss": 0.0012, + "step": 477 + }, + { + "epoch": 2.34601226993865, + "grad_norm": 0.0350213460624218, + "learning_rate": 3.7217821782178218e-06, + "loss": 0.0038, + "step": 478 + }, + { + "epoch": 2.3509202453987728, + "grad_norm": 0.009084700606763363, + "learning_rate": 3.714851485148515e-06, + "loss": 0.0018, + "step": 479 + }, + { + "epoch": 2.355828220858896, + "grad_norm": 0.056382663547992706, + "learning_rate": 3.707920792079208e-06, + "loss": 0.0018, + "step": 480 + }, + { + "epoch": 2.3607361963190185, + "grad_norm": 0.03614302724599838, + "learning_rate": 3.700990099009901e-06, + "loss": 0.0025, + "step": 481 + }, + { + "epoch": 2.365644171779141, + "grad_norm": 0.03676525875926018, + "learning_rate": 3.694059405940594e-06, + "loss": 0.001, + "step": 482 + }, + { + "epoch": 2.370552147239264, + "grad_norm": 0.1925876885652542, + "learning_rate": 3.687128712871287e-06, + "loss": 0.0141, + "step": 483 + }, + { + "epoch": 2.3754601226993866, + "grad_norm": 0.013960366137325764, + "learning_rate": 3.68019801980198e-06, + "loss": 0.0012, + "step": 484 + }, + { + "epoch": 2.3803680981595092, + "grad_norm": 0.03343435749411583, + "learning_rate": 3.6732673267326732e-06, + "loss": 0.0014, + "step": 485 + }, + { + "epoch": 2.385276073619632, + "grad_norm": 0.022218871861696243, + "learning_rate": 3.6663366336633663e-06, + "loss": 0.0026, + "step": 486 + }, + { + "epoch": 2.3901840490797546, + "grad_norm": 0.015421630814671516, + "learning_rate": 3.6594059405940594e-06, + "loss": 0.001, + "step": 487 + }, + { + "epoch": 2.3950920245398772, + "grad_norm": 0.011735438369214535, + "learning_rate": 3.6524752475247524e-06, + "loss": 0.0011, + "step": 488 + }, + { + "epoch": 2.4, + "grad_norm": 0.06969886273145676, + "learning_rate": 3.6455445544554455e-06, + "loss": 0.0035, + "step": 489 + }, + { + "epoch": 2.4049079754601226, + "grad_norm": 0.009630633518099785, + "learning_rate": 3.6386138613861386e-06, + "loss": 0.0012, + "step": 490 + }, + { + "epoch": 2.4098159509202453, + "grad_norm": 0.18442873656749725, + "learning_rate": 3.6316831683168316e-06, + "loss": 0.0034, + "step": 491 + }, + { + "epoch": 2.414723926380368, + "grad_norm": 0.0310515183955431, + "learning_rate": 3.6247524752475247e-06, + "loss": 0.002, + "step": 492 + }, + { + "epoch": 2.419631901840491, + "grad_norm": 0.107456274330616, + "learning_rate": 3.6178217821782178e-06, + "loss": 0.0036, + "step": 493 + }, + { + "epoch": 2.4245398773006137, + "grad_norm": 0.018645675852894783, + "learning_rate": 3.610891089108911e-06, + "loss": 0.0018, + "step": 494 + }, + { + "epoch": 2.4294478527607364, + "grad_norm": 0.03003678098320961, + "learning_rate": 3.603960396039604e-06, + "loss": 0.0026, + "step": 495 + }, + { + "epoch": 2.434355828220859, + "grad_norm": 0.00237216055393219, + "learning_rate": 3.597029702970297e-06, + "loss": 0.0014, + "step": 496 + }, + { + "epoch": 2.4392638036809817, + "grad_norm": 0.11389046162366867, + "learning_rate": 3.59009900990099e-06, + "loss": 0.0036, + "step": 497 + }, + { + "epoch": 2.4441717791411044, + "grad_norm": 0.16872182488441467, + "learning_rate": 3.583168316831683e-06, + "loss": 0.0127, + "step": 498 + }, + { + "epoch": 2.449079754601227, + "grad_norm": 0.011766748502850533, + "learning_rate": 3.576237623762376e-06, + "loss": 0.0006, + "step": 499 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.014331258833408356, + "learning_rate": 3.5693069306930693e-06, + "loss": 0.0014, + "step": 500 + }, + { + "epoch": 2.4588957055214724, + "grad_norm": 0.009422739036381245, + "learning_rate": 3.5623762376237623e-06, + "loss": 0.0012, + "step": 501 + }, + { + "epoch": 2.463803680981595, + "grad_norm": 0.04546496644616127, + "learning_rate": 3.5554455445544554e-06, + "loss": 0.0067, + "step": 502 + }, + { + "epoch": 2.4687116564417177, + "grad_norm": 0.026389574632048607, + "learning_rate": 3.5485148514851485e-06, + "loss": 0.0016, + "step": 503 + }, + { + "epoch": 2.4736196319018404, + "grad_norm": 0.025073140859603882, + "learning_rate": 3.5415841584158415e-06, + "loss": 0.0011, + "step": 504 + }, + { + "epoch": 2.478527607361963, + "grad_norm": 0.032442186027765274, + "learning_rate": 3.5346534653465346e-06, + "loss": 0.0018, + "step": 505 + }, + { + "epoch": 2.4834355828220858, + "grad_norm": 0.07158058136701584, + "learning_rate": 3.5277227722772277e-06, + "loss": 0.003, + "step": 506 + }, + { + "epoch": 2.4883435582822084, + "grad_norm": 0.005822812672704458, + "learning_rate": 3.5207920792079207e-06, + "loss": 0.0009, + "step": 507 + }, + { + "epoch": 2.493251533742331, + "grad_norm": 0.0328555554151535, + "learning_rate": 3.513861386138614e-06, + "loss": 0.0013, + "step": 508 + }, + { + "epoch": 2.4981595092024538, + "grad_norm": 0.010760471224784851, + "learning_rate": 3.506930693069307e-06, + "loss": 0.001, + "step": 509 + }, + { + "epoch": 2.5030674846625764, + "grad_norm": 0.02453591674566269, + "learning_rate": 3.5e-06, + "loss": 0.0015, + "step": 510 + }, + { + "epoch": 2.507975460122699, + "grad_norm": 0.025182075798511505, + "learning_rate": 3.493069306930693e-06, + "loss": 0.0015, + "step": 511 + }, + { + "epoch": 2.5128834355828222, + "grad_norm": 0.25042542815208435, + "learning_rate": 3.486138613861386e-06, + "loss": 0.009, + "step": 512 + }, + { + "epoch": 2.517791411042945, + "grad_norm": 0.03413018584251404, + "learning_rate": 3.479207920792079e-06, + "loss": 0.0018, + "step": 513 + }, + { + "epoch": 2.5226993865030676, + "grad_norm": 0.16198301315307617, + "learning_rate": 3.4722772277227722e-06, + "loss": 0.0134, + "step": 514 + }, + { + "epoch": 2.5276073619631902, + "grad_norm": 0.17140725255012512, + "learning_rate": 3.4653465346534653e-06, + "loss": 0.0338, + "step": 515 + }, + { + "epoch": 2.532515337423313, + "grad_norm": 0.06337208300828934, + "learning_rate": 3.4584158415841584e-06, + "loss": 0.0042, + "step": 516 + }, + { + "epoch": 2.5374233128834356, + "grad_norm": 0.00469438498839736, + "learning_rate": 3.4514851485148514e-06, + "loss": 0.0005, + "step": 517 + }, + { + "epoch": 2.5423312883435583, + "grad_norm": 0.053614478558301926, + "learning_rate": 3.4445544554455445e-06, + "loss": 0.0063, + "step": 518 + }, + { + "epoch": 2.547239263803681, + "grad_norm": 0.019361039623618126, + "learning_rate": 3.4376237623762376e-06, + "loss": 0.0011, + "step": 519 + }, + { + "epoch": 2.5521472392638036, + "grad_norm": 0.017689630389213562, + "learning_rate": 3.4306930693069306e-06, + "loss": 0.0011, + "step": 520 + }, + { + "epoch": 2.5570552147239263, + "grad_norm": 0.03599102422595024, + "learning_rate": 3.4237623762376237e-06, + "loss": 0.0021, + "step": 521 + }, + { + "epoch": 2.561963190184049, + "grad_norm": 0.6623808145523071, + "learning_rate": 3.4168316831683168e-06, + "loss": 0.0231, + "step": 522 + }, + { + "epoch": 2.5668711656441716, + "grad_norm": 0.013597175478935242, + "learning_rate": 3.40990099009901e-06, + "loss": 0.0018, + "step": 523 + }, + { + "epoch": 2.5717791411042947, + "grad_norm": 0.07440067827701569, + "learning_rate": 3.402970297029703e-06, + "loss": 0.0043, + "step": 524 + }, + { + "epoch": 2.5766871165644174, + "grad_norm": 0.016382288187742233, + "learning_rate": 3.396039603960396e-06, + "loss": 0.0013, + "step": 525 + }, + { + "epoch": 2.58159509202454, + "grad_norm": 0.04037494957447052, + "learning_rate": 3.389108910891089e-06, + "loss": 0.0034, + "step": 526 + }, + { + "epoch": 2.5865030674846627, + "grad_norm": 0.08392734825611115, + "learning_rate": 3.382178217821782e-06, + "loss": 0.0022, + "step": 527 + }, + { + "epoch": 2.5914110429447854, + "grad_norm": 0.041856564581394196, + "learning_rate": 3.375247524752475e-06, + "loss": 0.0019, + "step": 528 + }, + { + "epoch": 2.596319018404908, + "grad_norm": 0.0511699840426445, + "learning_rate": 3.3683168316831683e-06, + "loss": 0.0016, + "step": 529 + }, + { + "epoch": 2.6012269938650308, + "grad_norm": 0.012307001277804375, + "learning_rate": 3.3613861386138613e-06, + "loss": 0.0011, + "step": 530 + }, + { + "epoch": 2.6061349693251534, + "grad_norm": 0.033212386071681976, + "learning_rate": 3.3544554455445544e-06, + "loss": 0.0018, + "step": 531 + }, + { + "epoch": 2.611042944785276, + "grad_norm": 0.033674679696559906, + "learning_rate": 3.3475247524752475e-06, + "loss": 0.0009, + "step": 532 + }, + { + "epoch": 2.6159509202453988, + "grad_norm": 0.022887440398335457, + "learning_rate": 3.3405940594059405e-06, + "loss": 0.0009, + "step": 533 + }, + { + "epoch": 2.6208588957055214, + "grad_norm": 0.08867110311985016, + "learning_rate": 3.3336633663366336e-06, + "loss": 0.0033, + "step": 534 + }, + { + "epoch": 2.625766871165644, + "grad_norm": 0.015654679387807846, + "learning_rate": 3.3267326732673267e-06, + "loss": 0.0011, + "step": 535 + }, + { + "epoch": 2.630674846625767, + "grad_norm": 0.07898303866386414, + "learning_rate": 3.3198019801980197e-06, + "loss": 0.0029, + "step": 536 + }, + { + "epoch": 2.6355828220858895, + "grad_norm": 0.1686147004365921, + "learning_rate": 3.312871287128713e-06, + "loss": 0.0115, + "step": 537 + }, + { + "epoch": 2.640490797546012, + "grad_norm": 0.03158680722117424, + "learning_rate": 3.305940594059406e-06, + "loss": 0.0017, + "step": 538 + }, + { + "epoch": 2.645398773006135, + "grad_norm": 0.11354830116033554, + "learning_rate": 3.299009900990099e-06, + "loss": 0.0091, + "step": 539 + }, + { + "epoch": 2.6503067484662575, + "grad_norm": 0.08498039096593857, + "learning_rate": 3.292079207920792e-06, + "loss": 0.0038, + "step": 540 + }, + { + "epoch": 2.65521472392638, + "grad_norm": 0.08356107771396637, + "learning_rate": 3.285148514851485e-06, + "loss": 0.0019, + "step": 541 + }, + { + "epoch": 2.660122699386503, + "grad_norm": 0.24042771756649017, + "learning_rate": 3.2782178217821786e-06, + "loss": 0.0071, + "step": 542 + }, + { + "epoch": 2.665030674846626, + "grad_norm": 0.034460801631212234, + "learning_rate": 3.2712871287128716e-06, + "loss": 0.0017, + "step": 543 + }, + { + "epoch": 2.6699386503067486, + "grad_norm": 0.09137725085020065, + "learning_rate": 3.2643564356435643e-06, + "loss": 0.0022, + "step": 544 + }, + { + "epoch": 2.6748466257668713, + "grad_norm": 0.04045404493808746, + "learning_rate": 3.2574257425742573e-06, + "loss": 0.0016, + "step": 545 + }, + { + "epoch": 2.679754601226994, + "grad_norm": 0.24448804557323456, + "learning_rate": 3.2504950495049504e-06, + "loss": 0.0228, + "step": 546 + }, + { + "epoch": 2.6846625766871166, + "grad_norm": 0.02107204869389534, + "learning_rate": 3.2435643564356435e-06, + "loss": 0.0022, + "step": 547 + }, + { + "epoch": 2.6895705521472393, + "grad_norm": 0.02561573125422001, + "learning_rate": 3.2366336633663366e-06, + "loss": 0.0014, + "step": 548 + }, + { + "epoch": 2.694478527607362, + "grad_norm": 0.02062409184873104, + "learning_rate": 3.2297029702970296e-06, + "loss": 0.0009, + "step": 549 + }, + { + "epoch": 2.6993865030674846, + "grad_norm": 0.053445011377334595, + "learning_rate": 3.2227722772277227e-06, + "loss": 0.0025, + "step": 550 + }, + { + "epoch": 2.7042944785276073, + "grad_norm": 0.0313737653195858, + "learning_rate": 3.2158415841584158e-06, + "loss": 0.0021, + "step": 551 + }, + { + "epoch": 2.70920245398773, + "grad_norm": 0.04768161103129387, + "learning_rate": 3.208910891089109e-06, + "loss": 0.006, + "step": 552 + }, + { + "epoch": 2.7141104294478526, + "grad_norm": 0.1395123153924942, + "learning_rate": 3.201980198019802e-06, + "loss": 0.0095, + "step": 553 + }, + { + "epoch": 2.7190184049079753, + "grad_norm": 0.022861650213599205, + "learning_rate": 3.195049504950495e-06, + "loss": 0.0014, + "step": 554 + }, + { + "epoch": 2.7239263803680984, + "grad_norm": 0.08337707072496414, + "learning_rate": 3.188118811881188e-06, + "loss": 0.0053, + "step": 555 + }, + { + "epoch": 2.728834355828221, + "grad_norm": 0.1470242589712143, + "learning_rate": 3.181188118811881e-06, + "loss": 0.0089, + "step": 556 + }, + { + "epoch": 2.7337423312883438, + "grad_norm": 0.1716291755437851, + "learning_rate": 3.174257425742574e-06, + "loss": 0.014, + "step": 557 + }, + { + "epoch": 2.7386503067484664, + "grad_norm": 0.015148639678955078, + "learning_rate": 3.1673267326732672e-06, + "loss": 0.001, + "step": 558 + }, + { + "epoch": 2.743558282208589, + "grad_norm": 0.019988784566521645, + "learning_rate": 3.1603960396039603e-06, + "loss": 0.0015, + "step": 559 + }, + { + "epoch": 2.7484662576687118, + "grad_norm": 0.2989899814128876, + "learning_rate": 3.1534653465346534e-06, + "loss": 0.0092, + "step": 560 + }, + { + "epoch": 2.7533742331288344, + "grad_norm": 0.1491839736700058, + "learning_rate": 3.1465346534653464e-06, + "loss": 0.0046, + "step": 561 + }, + { + "epoch": 2.758282208588957, + "grad_norm": 0.02539440244436264, + "learning_rate": 3.1396039603960395e-06, + "loss": 0.0014, + "step": 562 + }, + { + "epoch": 2.76319018404908, + "grad_norm": 0.31437137722969055, + "learning_rate": 3.1326732673267326e-06, + "loss": 0.0109, + "step": 563 + }, + { + "epoch": 2.7680981595092025, + "grad_norm": 0.013294244185090065, + "learning_rate": 3.1257425742574257e-06, + "loss": 0.0016, + "step": 564 + }, + { + "epoch": 2.773006134969325, + "grad_norm": 0.025412971153855324, + "learning_rate": 3.1188118811881187e-06, + "loss": 0.002, + "step": 565 + }, + { + "epoch": 2.777914110429448, + "grad_norm": 0.05206981673836708, + "learning_rate": 3.1118811881188118e-06, + "loss": 0.0017, + "step": 566 + }, + { + "epoch": 2.7828220858895705, + "grad_norm": 0.0227506086230278, + "learning_rate": 3.104950495049505e-06, + "loss": 0.001, + "step": 567 + }, + { + "epoch": 2.787730061349693, + "grad_norm": 0.019159123301506042, + "learning_rate": 3.098019801980198e-06, + "loss": 0.0012, + "step": 568 + }, + { + "epoch": 2.792638036809816, + "grad_norm": 0.035373397171497345, + "learning_rate": 3.091089108910891e-06, + "loss": 0.0011, + "step": 569 + }, + { + "epoch": 2.7975460122699385, + "grad_norm": 0.06474697589874268, + "learning_rate": 3.084158415841584e-06, + "loss": 0.0017, + "step": 570 + }, + { + "epoch": 2.802453987730061, + "grad_norm": 0.01330810971558094, + "learning_rate": 3.077227722772277e-06, + "loss": 0.0006, + "step": 571 + }, + { + "epoch": 2.807361963190184, + "grad_norm": 0.018572993576526642, + "learning_rate": 3.07029702970297e-06, + "loss": 0.0012, + "step": 572 + }, + { + "epoch": 2.8122699386503065, + "grad_norm": 0.04764172062277794, + "learning_rate": 3.0633663366336633e-06, + "loss": 0.0027, + "step": 573 + }, + { + "epoch": 2.817177914110429, + "grad_norm": 0.05075710266828537, + "learning_rate": 3.0564356435643563e-06, + "loss": 0.0018, + "step": 574 + }, + { + "epoch": 2.8220858895705523, + "grad_norm": 0.0751049742102623, + "learning_rate": 3.0495049504950494e-06, + "loss": 0.0023, + "step": 575 + }, + { + "epoch": 2.826993865030675, + "grad_norm": 0.21285609900951385, + "learning_rate": 3.0425742574257425e-06, + "loss": 0.0089, + "step": 576 + }, + { + "epoch": 2.8319018404907976, + "grad_norm": 0.048560746014118195, + "learning_rate": 3.0356435643564355e-06, + "loss": 0.0029, + "step": 577 + }, + { + "epoch": 2.8368098159509203, + "grad_norm": 0.09979841113090515, + "learning_rate": 3.0287128712871286e-06, + "loss": 0.0035, + "step": 578 + }, + { + "epoch": 2.841717791411043, + "grad_norm": 0.012869380414485931, + "learning_rate": 3.0217821782178217e-06, + "loss": 0.0011, + "step": 579 + }, + { + "epoch": 2.8466257668711656, + "grad_norm": 0.04724825918674469, + "learning_rate": 3.0148514851485147e-06, + "loss": 0.0021, + "step": 580 + }, + { + "epoch": 2.8515337423312883, + "grad_norm": 0.10769210010766983, + "learning_rate": 3.007920792079208e-06, + "loss": 0.0097, + "step": 581 + }, + { + "epoch": 2.856441717791411, + "grad_norm": 0.00834321416914463, + "learning_rate": 3.000990099009901e-06, + "loss": 0.0008, + "step": 582 + }, + { + "epoch": 2.8613496932515337, + "grad_norm": 0.04464031755924225, + "learning_rate": 2.994059405940594e-06, + "loss": 0.0026, + "step": 583 + }, + { + "epoch": 2.8662576687116563, + "grad_norm": 0.01902065984904766, + "learning_rate": 2.987128712871287e-06, + "loss": 0.0014, + "step": 584 + }, + { + "epoch": 2.871165644171779, + "grad_norm": 0.038237448781728745, + "learning_rate": 2.98019801980198e-06, + "loss": 0.0015, + "step": 585 + }, + { + "epoch": 2.876073619631902, + "grad_norm": 0.013939250260591507, + "learning_rate": 2.973267326732673e-06, + "loss": 0.001, + "step": 586 + }, + { + "epoch": 2.880981595092025, + "grad_norm": 0.03710507974028587, + "learning_rate": 2.9663366336633662e-06, + "loss": 0.0027, + "step": 587 + }, + { + "epoch": 2.8858895705521475, + "grad_norm": 0.04779844731092453, + "learning_rate": 2.9594059405940593e-06, + "loss": 0.0012, + "step": 588 + }, + { + "epoch": 2.89079754601227, + "grad_norm": 0.00711404625326395, + "learning_rate": 2.9524752475247524e-06, + "loss": 0.001, + "step": 589 + }, + { + "epoch": 2.895705521472393, + "grad_norm": 0.017887057736516, + "learning_rate": 2.9455445544554454e-06, + "loss": 0.0015, + "step": 590 + }, + { + "epoch": 2.9006134969325155, + "grad_norm": 0.01937202550470829, + "learning_rate": 2.9386138613861385e-06, + "loss": 0.0012, + "step": 591 + }, + { + "epoch": 2.905521472392638, + "grad_norm": 0.005528996232897043, + "learning_rate": 2.9316831683168316e-06, + "loss": 0.0011, + "step": 592 + }, + { + "epoch": 2.910429447852761, + "grad_norm": 0.025270938873291016, + "learning_rate": 2.9247524752475246e-06, + "loss": 0.0011, + "step": 593 + }, + { + "epoch": 2.9153374233128835, + "grad_norm": 0.022803837433457375, + "learning_rate": 2.9178217821782177e-06, + "loss": 0.0012, + "step": 594 + }, + { + "epoch": 2.920245398773006, + "grad_norm": 0.4284270405769348, + "learning_rate": 2.9108910891089108e-06, + "loss": 0.0206, + "step": 595 + }, + { + "epoch": 2.925153374233129, + "grad_norm": 0.03455421328544617, + "learning_rate": 2.903960396039604e-06, + "loss": 0.0016, + "step": 596 + }, + { + "epoch": 2.9300613496932515, + "grad_norm": 0.06015002727508545, + "learning_rate": 2.897029702970297e-06, + "loss": 0.0026, + "step": 597 + }, + { + "epoch": 2.934969325153374, + "grad_norm": 0.07220069319009781, + "learning_rate": 2.89009900990099e-06, + "loss": 0.0035, + "step": 598 + }, + { + "epoch": 2.939877300613497, + "grad_norm": 0.0011812745360657573, + "learning_rate": 2.883168316831683e-06, + "loss": 0.0003, + "step": 599 + }, + { + "epoch": 2.9447852760736195, + "grad_norm": 0.030697235837578773, + "learning_rate": 2.876237623762376e-06, + "loss": 0.0022, + "step": 600 + }, + { + "epoch": 2.949693251533742, + "grad_norm": 0.028459953144192696, + "learning_rate": 2.869306930693069e-06, + "loss": 0.0014, + "step": 601 + }, + { + "epoch": 2.954601226993865, + "grad_norm": 0.03279620781540871, + "learning_rate": 2.8623762376237623e-06, + "loss": 0.0019, + "step": 602 + }, + { + "epoch": 2.9595092024539875, + "grad_norm": 0.03629060834646225, + "learning_rate": 2.8554455445544553e-06, + "loss": 0.0023, + "step": 603 + }, + { + "epoch": 2.96441717791411, + "grad_norm": 0.008510543964803219, + "learning_rate": 2.8485148514851484e-06, + "loss": 0.0006, + "step": 604 + }, + { + "epoch": 2.969325153374233, + "grad_norm": 0.014275304973125458, + "learning_rate": 2.841584158415842e-06, + "loss": 0.001, + "step": 605 + }, + { + "epoch": 2.974233128834356, + "grad_norm": 0.06183558329939842, + "learning_rate": 2.834653465346535e-06, + "loss": 0.0021, + "step": 606 + }, + { + "epoch": 2.9791411042944786, + "grad_norm": 0.07287970185279846, + "learning_rate": 2.827722772277228e-06, + "loss": 0.0029, + "step": 607 + }, + { + "epoch": 2.9840490797546013, + "grad_norm": 0.02359975501894951, + "learning_rate": 2.820792079207921e-06, + "loss": 0.0015, + "step": 608 + }, + { + "epoch": 2.988957055214724, + "grad_norm": 0.02087857946753502, + "learning_rate": 2.813861386138614e-06, + "loss": 0.0027, + "step": 609 + }, + { + "epoch": 2.9938650306748467, + "grad_norm": 0.036885183304548264, + "learning_rate": 2.8069306930693072e-06, + "loss": 0.0016, + "step": 610 + }, + { + "epoch": 2.9987730061349693, + "grad_norm": 0.08759594708681107, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.013, + "step": 611 + }, + { + "epoch": 3.003680981595092, + "grad_norm": 0.0203255582600832, + "learning_rate": 2.7930693069306934e-06, + "loss": 0.0046, + "step": 612 + }, + { + "epoch": 3.0085889570552147, + "grad_norm": 0.04549151286482811, + "learning_rate": 2.7861386138613864e-06, + "loss": 0.0032, + "step": 613 + }, + { + "epoch": 3.0134969325153373, + "grad_norm": 0.02024606615304947, + "learning_rate": 2.7792079207920795e-06, + "loss": 0.0014, + "step": 614 + }, + { + "epoch": 3.01840490797546, + "grad_norm": 0.03339584916830063, + "learning_rate": 2.7722772277227726e-06, + "loss": 0.0018, + "step": 615 + }, + { + "epoch": 3.0233128834355827, + "grad_norm": 0.02375354804098606, + "learning_rate": 2.7653465346534656e-06, + "loss": 0.0028, + "step": 616 + }, + { + "epoch": 3.0282208588957054, + "grad_norm": 0.021728744730353355, + "learning_rate": 2.7584158415841583e-06, + "loss": 0.0031, + "step": 617 + }, + { + "epoch": 3.033128834355828, + "grad_norm": 0.032941028475761414, + "learning_rate": 2.7514851485148514e-06, + "loss": 0.0018, + "step": 618 + }, + { + "epoch": 3.038036809815951, + "grad_norm": 0.020893137902021408, + "learning_rate": 2.7445544554455444e-06, + "loss": 0.0013, + "step": 619 + }, + { + "epoch": 3.042944785276074, + "grad_norm": 0.012155863456428051, + "learning_rate": 2.7376237623762375e-06, + "loss": 0.0013, + "step": 620 + }, + { + "epoch": 3.0478527607361965, + "grad_norm": 0.0235965047031641, + "learning_rate": 2.7306930693069306e-06, + "loss": 0.0019, + "step": 621 + }, + { + "epoch": 3.052760736196319, + "grad_norm": 0.03598389774560928, + "learning_rate": 2.7237623762376236e-06, + "loss": 0.002, + "step": 622 + }, + { + "epoch": 3.057668711656442, + "grad_norm": 0.033302754163742065, + "learning_rate": 2.7168316831683167e-06, + "loss": 0.0011, + "step": 623 + }, + { + "epoch": 3.0625766871165645, + "grad_norm": 0.011287711560726166, + "learning_rate": 2.7099009900990098e-06, + "loss": 0.0015, + "step": 624 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.04231281206011772, + "learning_rate": 2.702970297029703e-06, + "loss": 0.0032, + "step": 625 + }, + { + "epoch": 3.07239263803681, + "grad_norm": 0.020790139213204384, + "learning_rate": 2.696039603960396e-06, + "loss": 0.001, + "step": 626 + }, + { + "epoch": 3.0773006134969325, + "grad_norm": 0.020422058179974556, + "learning_rate": 2.689108910891089e-06, + "loss": 0.0017, + "step": 627 + }, + { + "epoch": 3.082208588957055, + "grad_norm": 0.02271398715674877, + "learning_rate": 2.682178217821782e-06, + "loss": 0.0013, + "step": 628 + }, + { + "epoch": 3.087116564417178, + "grad_norm": 0.013147884979844093, + "learning_rate": 2.675247524752475e-06, + "loss": 0.0009, + "step": 629 + }, + { + "epoch": 3.0920245398773005, + "grad_norm": 0.023107754066586494, + "learning_rate": 2.668316831683168e-06, + "loss": 0.0021, + "step": 630 + }, + { + "epoch": 3.096932515337423, + "grad_norm": 0.027083350345492363, + "learning_rate": 2.6613861386138612e-06, + "loss": 0.0016, + "step": 631 + }, + { + "epoch": 3.101840490797546, + "grad_norm": 0.01890239305794239, + "learning_rate": 2.6544554455445543e-06, + "loss": 0.0021, + "step": 632 + }, + { + "epoch": 3.1067484662576685, + "grad_norm": 0.0020133615471422672, + "learning_rate": 2.6475247524752474e-06, + "loss": 0.0004, + "step": 633 + }, + { + "epoch": 3.111656441717791, + "grad_norm": 0.022084610536694527, + "learning_rate": 2.6405940594059405e-06, + "loss": 0.0015, + "step": 634 + }, + { + "epoch": 3.116564417177914, + "grad_norm": 0.008074000477790833, + "learning_rate": 2.6336633663366335e-06, + "loss": 0.0011, + "step": 635 + }, + { + "epoch": 3.121472392638037, + "grad_norm": 0.03898672014474869, + "learning_rate": 2.6267326732673266e-06, + "loss": 0.002, + "step": 636 + }, + { + "epoch": 3.1263803680981597, + "grad_norm": 0.00946191418915987, + "learning_rate": 2.6198019801980197e-06, + "loss": 0.001, + "step": 637 + }, + { + "epoch": 3.1312883435582823, + "grad_norm": 0.1219155415892601, + "learning_rate": 2.6128712871287127e-06, + "loss": 0.0079, + "step": 638 + }, + { + "epoch": 3.136196319018405, + "grad_norm": 0.025132469832897186, + "learning_rate": 2.605940594059406e-06, + "loss": 0.0015, + "step": 639 + }, + { + "epoch": 3.1411042944785277, + "grad_norm": 0.01082681119441986, + "learning_rate": 2.599009900990099e-06, + "loss": 0.0008, + "step": 640 + }, + { + "epoch": 3.1460122699386504, + "grad_norm": 0.14690011739730835, + "learning_rate": 2.592079207920792e-06, + "loss": 0.0117, + "step": 641 + }, + { + "epoch": 3.150920245398773, + "grad_norm": 0.010698092170059681, + "learning_rate": 2.585148514851485e-06, + "loss": 0.0013, + "step": 642 + }, + { + "epoch": 3.1558282208588957, + "grad_norm": 0.2589493691921234, + "learning_rate": 2.578217821782178e-06, + "loss": 0.0089, + "step": 643 + }, + { + "epoch": 3.1607361963190184, + "grad_norm": 0.01592063717544079, + "learning_rate": 2.571287128712871e-06, + "loss": 0.0011, + "step": 644 + }, + { + "epoch": 3.165644171779141, + "grad_norm": 0.02394460327923298, + "learning_rate": 2.564356435643564e-06, + "loss": 0.0014, + "step": 645 + }, + { + "epoch": 3.1705521472392637, + "grad_norm": 0.02034229226410389, + "learning_rate": 2.5574257425742573e-06, + "loss": 0.0023, + "step": 646 + }, + { + "epoch": 3.1754601226993864, + "grad_norm": 0.032998789101839066, + "learning_rate": 2.5504950495049503e-06, + "loss": 0.0021, + "step": 647 + }, + { + "epoch": 3.180368098159509, + "grad_norm": 0.013732580468058586, + "learning_rate": 2.5435643564356434e-06, + "loss": 0.0005, + "step": 648 + }, + { + "epoch": 3.1852760736196317, + "grad_norm": 0.046656981110572815, + "learning_rate": 2.5366336633663365e-06, + "loss": 0.0031, + "step": 649 + }, + { + "epoch": 3.190184049079755, + "grad_norm": 0.02960779331624508, + "learning_rate": 2.5297029702970295e-06, + "loss": 0.0013, + "step": 650 + }, + { + "epoch": 3.1950920245398775, + "grad_norm": 0.051251210272312164, + "learning_rate": 2.5227722772277226e-06, + "loss": 0.0025, + "step": 651 + }, + { + "epoch": 3.2, + "grad_norm": 0.03321881592273712, + "learning_rate": 2.5158415841584157e-06, + "loss": 0.0047, + "step": 652 + }, + { + "epoch": 3.204907975460123, + "grad_norm": 0.005035923328250647, + "learning_rate": 2.5089108910891088e-06, + "loss": 0.0007, + "step": 653 + }, + { + "epoch": 3.2098159509202455, + "grad_norm": 0.07169622927904129, + "learning_rate": 2.501980198019802e-06, + "loss": 0.0026, + "step": 654 + }, + { + "epoch": 3.214723926380368, + "grad_norm": 0.017996247857809067, + "learning_rate": 2.495049504950495e-06, + "loss": 0.0012, + "step": 655 + }, + { + "epoch": 3.219631901840491, + "grad_norm": 0.025565218180418015, + "learning_rate": 2.488118811881188e-06, + "loss": 0.0013, + "step": 656 + }, + { + "epoch": 3.2245398773006135, + "grad_norm": 0.014627007767558098, + "learning_rate": 2.481188118811881e-06, + "loss": 0.0015, + "step": 657 + }, + { + "epoch": 3.229447852760736, + "grad_norm": 0.011715116910636425, + "learning_rate": 2.474257425742574e-06, + "loss": 0.001, + "step": 658 + }, + { + "epoch": 3.234355828220859, + "grad_norm": 0.01887853443622589, + "learning_rate": 2.467326732673267e-06, + "loss": 0.0013, + "step": 659 + }, + { + "epoch": 3.2392638036809815, + "grad_norm": 0.028289880603551865, + "learning_rate": 2.4603960396039602e-06, + "loss": 0.0024, + "step": 660 + }, + { + "epoch": 3.244171779141104, + "grad_norm": 0.15456917881965637, + "learning_rate": 2.4534653465346533e-06, + "loss": 0.021, + "step": 661 + }, + { + "epoch": 3.249079754601227, + "grad_norm": 0.012570716440677643, + "learning_rate": 2.4465346534653464e-06, + "loss": 0.0012, + "step": 662 + }, + { + "epoch": 3.2539877300613496, + "grad_norm": 0.006434513721615076, + "learning_rate": 2.4396039603960394e-06, + "loss": 0.0006, + "step": 663 + }, + { + "epoch": 3.2588957055214722, + "grad_norm": 0.003889314830303192, + "learning_rate": 2.4326732673267325e-06, + "loss": 0.0007, + "step": 664 + }, + { + "epoch": 3.263803680981595, + "grad_norm": 0.16952529549598694, + "learning_rate": 2.4257425742574256e-06, + "loss": 0.008, + "step": 665 + }, + { + "epoch": 3.2687116564417176, + "grad_norm": 0.02936733514070511, + "learning_rate": 2.4188118811881186e-06, + "loss": 0.0016, + "step": 666 + }, + { + "epoch": 3.2736196319018402, + "grad_norm": 0.10733254253864288, + "learning_rate": 2.4118811881188117e-06, + "loss": 0.0029, + "step": 667 + }, + { + "epoch": 3.2785276073619634, + "grad_norm": 0.02705569751560688, + "learning_rate": 2.404950495049505e-06, + "loss": 0.0017, + "step": 668 + }, + { + "epoch": 3.283435582822086, + "grad_norm": 0.024813305586576462, + "learning_rate": 2.3980198019801983e-06, + "loss": 0.0011, + "step": 669 + }, + { + "epoch": 3.2883435582822087, + "grad_norm": 0.11823663115501404, + "learning_rate": 2.3910891089108913e-06, + "loss": 0.0034, + "step": 670 + }, + { + "epoch": 3.2932515337423314, + "grad_norm": 0.15911535918712616, + "learning_rate": 2.3841584158415844e-06, + "loss": 0.0038, + "step": 671 + }, + { + "epoch": 3.298159509202454, + "grad_norm": 0.03651705011725426, + "learning_rate": 2.3772277227722775e-06, + "loss": 0.0013, + "step": 672 + }, + { + "epoch": 3.3030674846625767, + "grad_norm": 0.20837312936782837, + "learning_rate": 2.3702970297029705e-06, + "loss": 0.0076, + "step": 673 + }, + { + "epoch": 3.3079754601226994, + "grad_norm": 0.013823213055729866, + "learning_rate": 2.3633663366336636e-06, + "loss": 0.0013, + "step": 674 + }, + { + "epoch": 3.312883435582822, + "grad_norm": 0.0133537407964468, + "learning_rate": 2.3564356435643567e-06, + "loss": 0.0018, + "step": 675 + }, + { + "epoch": 3.3177914110429447, + "grad_norm": 0.01537949126213789, + "learning_rate": 2.3495049504950498e-06, + "loss": 0.0013, + "step": 676 + }, + { + "epoch": 3.3226993865030674, + "grad_norm": 0.03161802887916565, + "learning_rate": 2.342574257425743e-06, + "loss": 0.0013, + "step": 677 + }, + { + "epoch": 3.32760736196319, + "grad_norm": 0.01877240464091301, + "learning_rate": 2.335643564356436e-06, + "loss": 0.001, + "step": 678 + }, + { + "epoch": 3.3325153374233127, + "grad_norm": 0.20145629346370697, + "learning_rate": 2.328712871287129e-06, + "loss": 0.0099, + "step": 679 + }, + { + "epoch": 3.3374233128834354, + "grad_norm": 0.17651820182800293, + "learning_rate": 2.321782178217822e-06, + "loss": 0.0298, + "step": 680 + }, + { + "epoch": 3.3423312883435585, + "grad_norm": 0.031260013580322266, + "learning_rate": 2.314851485148515e-06, + "loss": 0.0017, + "step": 681 + }, + { + "epoch": 3.347239263803681, + "grad_norm": 0.049838535487651825, + "learning_rate": 2.307920792079208e-06, + "loss": 0.0016, + "step": 682 + }, + { + "epoch": 3.352147239263804, + "grad_norm": 0.01825704425573349, + "learning_rate": 2.3009900990099012e-06, + "loss": 0.0009, + "step": 683 + }, + { + "epoch": 3.3570552147239265, + "grad_norm": 0.2820407450199127, + "learning_rate": 2.2940594059405943e-06, + "loss": 0.0095, + "step": 684 + }, + { + "epoch": 3.361963190184049, + "grad_norm": 0.013412845320999622, + "learning_rate": 2.2871287128712874e-06, + "loss": 0.0016, + "step": 685 + }, + { + "epoch": 3.366871165644172, + "grad_norm": 0.043872177600860596, + "learning_rate": 2.2801980198019804e-06, + "loss": 0.0023, + "step": 686 + }, + { + "epoch": 3.3717791411042946, + "grad_norm": 0.024329353123903275, + "learning_rate": 2.2732673267326735e-06, + "loss": 0.001, + "step": 687 + }, + { + "epoch": 3.3766871165644172, + "grad_norm": 0.016059909015893936, + "learning_rate": 2.2663366336633666e-06, + "loss": 0.0012, + "step": 688 + }, + { + "epoch": 3.38159509202454, + "grad_norm": 0.014483323320746422, + "learning_rate": 2.2594059405940596e-06, + "loss": 0.0011, + "step": 689 + }, + { + "epoch": 3.3865030674846626, + "grad_norm": 0.05332216992974281, + "learning_rate": 2.2524752475247523e-06, + "loss": 0.0023, + "step": 690 + }, + { + "epoch": 3.3914110429447852, + "grad_norm": 0.005560703109949827, + "learning_rate": 2.2455445544554454e-06, + "loss": 0.0006, + "step": 691 + }, + { + "epoch": 3.396319018404908, + "grad_norm": 0.029570411890745163, + "learning_rate": 2.2386138613861384e-06, + "loss": 0.0026, + "step": 692 + }, + { + "epoch": 3.4012269938650306, + "grad_norm": 0.02160765416920185, + "learning_rate": 2.2316831683168315e-06, + "loss": 0.0014, + "step": 693 + }, + { + "epoch": 3.4061349693251532, + "grad_norm": 0.012106803245842457, + "learning_rate": 2.2247524752475246e-06, + "loss": 0.0008, + "step": 694 + }, + { + "epoch": 3.411042944785276, + "grad_norm": 0.027164770290255547, + "learning_rate": 2.2178217821782176e-06, + "loss": 0.001, + "step": 695 + }, + { + "epoch": 3.4159509202453986, + "grad_norm": 0.03465467691421509, + "learning_rate": 2.2108910891089107e-06, + "loss": 0.0008, + "step": 696 + }, + { + "epoch": 3.4208588957055213, + "grad_norm": 0.01086588017642498, + "learning_rate": 2.2039603960396038e-06, + "loss": 0.0016, + "step": 697 + }, + { + "epoch": 3.425766871165644, + "grad_norm": 0.2833847105503082, + "learning_rate": 2.197029702970297e-06, + "loss": 0.0092, + "step": 698 + }, + { + "epoch": 3.430674846625767, + "grad_norm": 0.01620599813759327, + "learning_rate": 2.19009900990099e-06, + "loss": 0.0005, + "step": 699 + }, + { + "epoch": 3.4355828220858897, + "grad_norm": 0.0964425802230835, + "learning_rate": 2.183168316831683e-06, + "loss": 0.0059, + "step": 700 + }, + { + "epoch": 3.4404907975460124, + "grad_norm": 0.051153287291526794, + "learning_rate": 2.176237623762376e-06, + "loss": 0.003, + "step": 701 + }, + { + "epoch": 3.445398773006135, + "grad_norm": 0.12504975497722626, + "learning_rate": 2.169306930693069e-06, + "loss": 0.0023, + "step": 702 + }, + { + "epoch": 3.4503067484662577, + "grad_norm": 0.18094071745872498, + "learning_rate": 2.162376237623762e-06, + "loss": 0.0175, + "step": 703 + }, + { + "epoch": 3.4552147239263804, + "grad_norm": 0.011514030396938324, + "learning_rate": 2.1554455445544553e-06, + "loss": 0.001, + "step": 704 + }, + { + "epoch": 3.460122699386503, + "grad_norm": 0.015152939595282078, + "learning_rate": 2.1485148514851483e-06, + "loss": 0.0011, + "step": 705 + }, + { + "epoch": 3.4650306748466257, + "grad_norm": 0.05039620399475098, + "learning_rate": 2.1415841584158414e-06, + "loss": 0.0049, + "step": 706 + }, + { + "epoch": 3.4699386503067484, + "grad_norm": 0.044066257774829865, + "learning_rate": 2.1346534653465345e-06, + "loss": 0.0011, + "step": 707 + }, + { + "epoch": 3.474846625766871, + "grad_norm": 0.06301417946815491, + "learning_rate": 2.1277227722772275e-06, + "loss": 0.002, + "step": 708 + }, + { + "epoch": 3.4797546012269938, + "grad_norm": 0.05275435373187065, + "learning_rate": 2.1207920792079206e-06, + "loss": 0.0026, + "step": 709 + }, + { + "epoch": 3.4846625766871164, + "grad_norm": 0.05170956999063492, + "learning_rate": 2.1138613861386137e-06, + "loss": 0.0033, + "step": 710 + }, + { + "epoch": 3.489570552147239, + "grad_norm": 0.12438485026359558, + "learning_rate": 2.1069306930693067e-06, + "loss": 0.0028, + "step": 711 + }, + { + "epoch": 3.4944785276073618, + "grad_norm": 0.07120586186647415, + "learning_rate": 2.1e-06, + "loss": 0.0021, + "step": 712 + }, + { + "epoch": 3.499386503067485, + "grad_norm": 0.027411244809627533, + "learning_rate": 2.093069306930693e-06, + "loss": 0.0011, + "step": 713 + }, + { + "epoch": 3.5042944785276076, + "grad_norm": 0.005563246086239815, + "learning_rate": 2.086138613861386e-06, + "loss": 0.0007, + "step": 714 + }, + { + "epoch": 3.5092024539877302, + "grad_norm": 0.04439758136868477, + "learning_rate": 2.079207920792079e-06, + "loss": 0.0017, + "step": 715 + }, + { + "epoch": 3.514110429447853, + "grad_norm": 0.04545675963163376, + "learning_rate": 2.072277227722772e-06, + "loss": 0.0045, + "step": 716 + }, + { + "epoch": 3.5190184049079756, + "grad_norm": 0.019012991338968277, + "learning_rate": 2.065346534653465e-06, + "loss": 0.001, + "step": 717 + }, + { + "epoch": 3.5239263803680982, + "grad_norm": 0.5096023678779602, + "learning_rate": 2.058415841584158e-06, + "loss": 0.0205, + "step": 718 + }, + { + "epoch": 3.528834355828221, + "grad_norm": 0.03077244944870472, + "learning_rate": 2.0514851485148513e-06, + "loss": 0.0018, + "step": 719 + }, + { + "epoch": 3.5337423312883436, + "grad_norm": 0.052647169679403305, + "learning_rate": 2.0445544554455443e-06, + "loss": 0.0021, + "step": 720 + }, + { + "epoch": 3.5386503067484663, + "grad_norm": 0.014248156920075417, + "learning_rate": 2.0376237623762374e-06, + "loss": 0.0007, + "step": 721 + }, + { + "epoch": 3.543558282208589, + "grad_norm": 0.11832743138074875, + "learning_rate": 2.0306930693069305e-06, + "loss": 0.0088, + "step": 722 + }, + { + "epoch": 3.5484662576687116, + "grad_norm": 0.025962911546230316, + "learning_rate": 2.0237623762376236e-06, + "loss": 0.0009, + "step": 723 + }, + { + "epoch": 3.5533742331288343, + "grad_norm": 0.06493301689624786, + "learning_rate": 2.0168316831683166e-06, + "loss": 0.0024, + "step": 724 + }, + { + "epoch": 3.558282208588957, + "grad_norm": 0.023671971634030342, + "learning_rate": 2.0099009900990097e-06, + "loss": 0.0019, + "step": 725 + }, + { + "epoch": 3.5631901840490796, + "grad_norm": 0.02273421734571457, + "learning_rate": 2.0029702970297028e-06, + "loss": 0.0008, + "step": 726 + }, + { + "epoch": 3.5680981595092023, + "grad_norm": 0.18828389048576355, + "learning_rate": 1.996039603960396e-06, + "loss": 0.0182, + "step": 727 + }, + { + "epoch": 3.573006134969325, + "grad_norm": 0.01973796635866165, + "learning_rate": 1.989108910891089e-06, + "loss": 0.0023, + "step": 728 + }, + { + "epoch": 3.5779141104294476, + "grad_norm": 0.03295096009969711, + "learning_rate": 1.982178217821782e-06, + "loss": 0.0017, + "step": 729 + }, + { + "epoch": 3.5828220858895703, + "grad_norm": 0.012010748498141766, + "learning_rate": 1.975247524752475e-06, + "loss": 0.0009, + "step": 730 + }, + { + "epoch": 3.5877300613496934, + "grad_norm": 0.008494194597005844, + "learning_rate": 1.9683168316831685e-06, + "loss": 0.0009, + "step": 731 + }, + { + "epoch": 3.592638036809816, + "grad_norm": 0.016253001987934113, + "learning_rate": 1.9613861386138616e-06, + "loss": 0.0015, + "step": 732 + }, + { + "epoch": 3.5975460122699388, + "grad_norm": 0.007456593681126833, + "learning_rate": 1.9544554455445547e-06, + "loss": 0.0007, + "step": 733 + }, + { + "epoch": 3.6024539877300614, + "grad_norm": 0.00861444789916277, + "learning_rate": 1.9475247524752477e-06, + "loss": 0.0007, + "step": 734 + }, + { + "epoch": 3.607361963190184, + "grad_norm": 0.0065794652327895164, + "learning_rate": 1.940594059405941e-06, + "loss": 0.001, + "step": 735 + }, + { + "epoch": 3.6122699386503068, + "grad_norm": 0.015389169566333294, + "learning_rate": 1.933663366336634e-06, + "loss": 0.0015, + "step": 736 + }, + { + "epoch": 3.6171779141104294, + "grad_norm": 0.025337016209959984, + "learning_rate": 1.926732673267327e-06, + "loss": 0.0018, + "step": 737 + }, + { + "epoch": 3.622085889570552, + "grad_norm": 0.00653579318895936, + "learning_rate": 1.91980198019802e-06, + "loss": 0.0008, + "step": 738 + }, + { + "epoch": 3.626993865030675, + "grad_norm": 0.10997878760099411, + "learning_rate": 1.912871287128713e-06, + "loss": 0.0068, + "step": 739 + }, + { + "epoch": 3.6319018404907975, + "grad_norm": 0.05580228194594383, + "learning_rate": 1.9059405940594061e-06, + "loss": 0.0056, + "step": 740 + }, + { + "epoch": 3.63680981595092, + "grad_norm": 0.06799723207950592, + "learning_rate": 1.8990099009900992e-06, + "loss": 0.0019, + "step": 741 + }, + { + "epoch": 3.641717791411043, + "grad_norm": 0.20822834968566895, + "learning_rate": 1.8920792079207923e-06, + "loss": 0.0039, + "step": 742 + }, + { + "epoch": 3.646625766871166, + "grad_norm": 0.03920517861843109, + "learning_rate": 1.8851485148514851e-06, + "loss": 0.0017, + "step": 743 + }, + { + "epoch": 3.6515337423312886, + "grad_norm": 0.06821847707033157, + "learning_rate": 1.8782178217821782e-06, + "loss": 0.0027, + "step": 744 + }, + { + "epoch": 3.6564417177914113, + "grad_norm": 0.09687570482492447, + "learning_rate": 1.8712871287128713e-06, + "loss": 0.002, + "step": 745 + }, + { + "epoch": 3.661349693251534, + "grad_norm": 0.05403744429349899, + "learning_rate": 1.8643564356435643e-06, + "loss": 0.0019, + "step": 746 + }, + { + "epoch": 3.6662576687116566, + "grad_norm": 0.019597845152020454, + "learning_rate": 1.8574257425742574e-06, + "loss": 0.0009, + "step": 747 + }, + { + "epoch": 3.6711656441717793, + "grad_norm": 0.04923088103532791, + "learning_rate": 1.8504950495049505e-06, + "loss": 0.0013, + "step": 748 + }, + { + "epoch": 3.676073619631902, + "grad_norm": 0.0967707633972168, + "learning_rate": 1.8435643564356435e-06, + "loss": 0.0039, + "step": 749 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.037127815186977386, + "learning_rate": 1.8366336633663366e-06, + "loss": 0.0012, + "step": 750 + }, + { + "epoch": 3.6858895705521473, + "grad_norm": 0.013236461207270622, + "learning_rate": 1.8297029702970297e-06, + "loss": 0.0045, + "step": 751 + }, + { + "epoch": 3.69079754601227, + "grad_norm": 0.031925372779369354, + "learning_rate": 1.8227722772277228e-06, + "loss": 0.002, + "step": 752 + }, + { + "epoch": 3.6957055214723926, + "grad_norm": 0.023648735135793686, + "learning_rate": 1.8158415841584158e-06, + "loss": 0.0012, + "step": 753 + }, + { + "epoch": 3.7006134969325153, + "grad_norm": 0.01484636776149273, + "learning_rate": 1.8089108910891089e-06, + "loss": 0.0011, + "step": 754 + }, + { + "epoch": 3.705521472392638, + "grad_norm": 0.02288316749036312, + "learning_rate": 1.801980198019802e-06, + "loss": 0.0018, + "step": 755 + }, + { + "epoch": 3.7104294478527606, + "grad_norm": 0.005614751018583775, + "learning_rate": 1.795049504950495e-06, + "loss": 0.001, + "step": 756 + }, + { + "epoch": 3.7153374233128833, + "grad_norm": 0.03587134927511215, + "learning_rate": 1.788118811881188e-06, + "loss": 0.0013, + "step": 757 + }, + { + "epoch": 3.720245398773006, + "grad_norm": 0.048482466489076614, + "learning_rate": 1.7811881188118812e-06, + "loss": 0.0014, + "step": 758 + }, + { + "epoch": 3.7251533742331286, + "grad_norm": 0.06541978567838669, + "learning_rate": 1.7742574257425742e-06, + "loss": 0.0023, + "step": 759 + }, + { + "epoch": 3.7300613496932513, + "grad_norm": 0.040501050651073456, + "learning_rate": 1.7673267326732673e-06, + "loss": 0.0014, + "step": 760 + }, + { + "epoch": 3.734969325153374, + "grad_norm": 0.006551014259457588, + "learning_rate": 1.7603960396039604e-06, + "loss": 0.0009, + "step": 761 + }, + { + "epoch": 3.7398773006134967, + "grad_norm": 0.11849401146173477, + "learning_rate": 1.7534653465346534e-06, + "loss": 0.0099, + "step": 762 + }, + { + "epoch": 3.7447852760736198, + "grad_norm": 0.004786093719303608, + "learning_rate": 1.7465346534653465e-06, + "loss": 0.0008, + "step": 763 + }, + { + "epoch": 3.7496932515337424, + "grad_norm": 0.02577151544392109, + "learning_rate": 1.7396039603960396e-06, + "loss": 0.0016, + "step": 764 + }, + { + "epoch": 3.754601226993865, + "grad_norm": 0.014097603037953377, + "learning_rate": 1.7326732673267326e-06, + "loss": 0.0008, + "step": 765 + }, + { + "epoch": 3.759509202453988, + "grad_norm": 0.05258313938975334, + "learning_rate": 1.7257425742574257e-06, + "loss": 0.0009, + "step": 766 + }, + { + "epoch": 3.7644171779141105, + "grad_norm": 0.09022804349660873, + "learning_rate": 1.7188118811881188e-06, + "loss": 0.005, + "step": 767 + }, + { + "epoch": 3.769325153374233, + "grad_norm": 0.008886247873306274, + "learning_rate": 1.7118811881188119e-06, + "loss": 0.0008, + "step": 768 + }, + { + "epoch": 3.774233128834356, + "grad_norm": 0.036997053772211075, + "learning_rate": 1.704950495049505e-06, + "loss": 0.0011, + "step": 769 + }, + { + "epoch": 3.7791411042944785, + "grad_norm": 0.05569405481219292, + "learning_rate": 1.698019801980198e-06, + "loss": 0.0018, + "step": 770 + }, + { + "epoch": 3.784049079754601, + "grad_norm": 0.0031505110673606396, + "learning_rate": 1.691089108910891e-06, + "loss": 0.0004, + "step": 771 + }, + { + "epoch": 3.788957055214724, + "grad_norm": 0.014605509117245674, + "learning_rate": 1.6841584158415841e-06, + "loss": 0.0011, + "step": 772 + }, + { + "epoch": 3.7938650306748465, + "grad_norm": 0.09325973689556122, + "learning_rate": 1.6772277227722772e-06, + "loss": 0.0052, + "step": 773 + }, + { + "epoch": 3.7987730061349696, + "grad_norm": 0.059272442013025284, + "learning_rate": 1.6702970297029703e-06, + "loss": 0.0022, + "step": 774 + }, + { + "epoch": 3.8036809815950923, + "grad_norm": 0.01452575996518135, + "learning_rate": 1.6633663366336633e-06, + "loss": 0.0021, + "step": 775 + }, + { + "epoch": 3.808588957055215, + "grad_norm": 0.17578046023845673, + "learning_rate": 1.6564356435643564e-06, + "loss": 0.0093, + "step": 776 + }, + { + "epoch": 3.8134969325153376, + "grad_norm": 0.007930277846753597, + "learning_rate": 1.6495049504950495e-06, + "loss": 0.0006, + "step": 777 + }, + { + "epoch": 3.8184049079754603, + "grad_norm": 0.07230112701654434, + "learning_rate": 1.6425742574257425e-06, + "loss": 0.0025, + "step": 778 + }, + { + "epoch": 3.823312883435583, + "grad_norm": 0.03507319092750549, + "learning_rate": 1.6356435643564358e-06, + "loss": 0.0017, + "step": 779 + }, + { + "epoch": 3.8282208588957056, + "grad_norm": 0.06336654722690582, + "learning_rate": 1.6287128712871287e-06, + "loss": 0.0025, + "step": 780 + }, + { + "epoch": 3.8331288343558283, + "grad_norm": 0.14077608287334442, + "learning_rate": 1.6217821782178217e-06, + "loss": 0.0086, + "step": 781 + }, + { + "epoch": 3.838036809815951, + "grad_norm": 0.015772581100463867, + "learning_rate": 1.6148514851485148e-06, + "loss": 0.001, + "step": 782 + }, + { + "epoch": 3.8429447852760736, + "grad_norm": 0.01927962154150009, + "learning_rate": 1.6079207920792079e-06, + "loss": 0.0015, + "step": 783 + }, + { + "epoch": 3.8478527607361963, + "grad_norm": 0.011015449650585651, + "learning_rate": 1.600990099009901e-06, + "loss": 0.0008, + "step": 784 + }, + { + "epoch": 3.852760736196319, + "grad_norm": 0.40098482370376587, + "learning_rate": 1.594059405940594e-06, + "loss": 0.0314, + "step": 785 + }, + { + "epoch": 3.8576687116564417, + "grad_norm": 0.02672453783452511, + "learning_rate": 1.587128712871287e-06, + "loss": 0.0025, + "step": 786 + }, + { + "epoch": 3.8625766871165643, + "grad_norm": 0.022412395104765892, + "learning_rate": 1.5801980198019802e-06, + "loss": 0.0013, + "step": 787 + }, + { + "epoch": 3.867484662576687, + "grad_norm": 0.023978037759661674, + "learning_rate": 1.5732673267326732e-06, + "loss": 0.0017, + "step": 788 + }, + { + "epoch": 3.8723926380368097, + "grad_norm": 0.017764659598469734, + "learning_rate": 1.5663366336633663e-06, + "loss": 0.002, + "step": 789 + }, + { + "epoch": 3.8773006134969323, + "grad_norm": 0.012586713768541813, + "learning_rate": 1.5594059405940594e-06, + "loss": 0.0008, + "step": 790 + }, + { + "epoch": 3.882208588957055, + "grad_norm": 0.056462038308382034, + "learning_rate": 1.5524752475247524e-06, + "loss": 0.0036, + "step": 791 + }, + { + "epoch": 3.8871165644171777, + "grad_norm": 0.05329478159546852, + "learning_rate": 1.5455445544554455e-06, + "loss": 0.0041, + "step": 792 + }, + { + "epoch": 3.8920245398773003, + "grad_norm": 0.0013215028448030353, + "learning_rate": 1.5386138613861386e-06, + "loss": 0.0004, + "step": 793 + }, + { + "epoch": 3.8969325153374235, + "grad_norm": 0.05318621173501015, + "learning_rate": 1.5316831683168316e-06, + "loss": 0.0011, + "step": 794 + }, + { + "epoch": 3.901840490797546, + "grad_norm": 0.3169184625148773, + "learning_rate": 1.5247524752475247e-06, + "loss": 0.0221, + "step": 795 + }, + { + "epoch": 3.906748466257669, + "grad_norm": 0.04726627469062805, + "learning_rate": 1.5178217821782178e-06, + "loss": 0.0015, + "step": 796 + }, + { + "epoch": 3.9116564417177915, + "grad_norm": 0.13995185494422913, + "learning_rate": 1.5108910891089108e-06, + "loss": 0.0092, + "step": 797 + }, + { + "epoch": 3.916564417177914, + "grad_norm": 0.01544391643255949, + "learning_rate": 1.503960396039604e-06, + "loss": 0.0012, + "step": 798 + }, + { + "epoch": 3.921472392638037, + "grad_norm": 0.1588226556777954, + "learning_rate": 1.497029702970297e-06, + "loss": 0.0087, + "step": 799 + }, + { + "epoch": 3.9263803680981595, + "grad_norm": 0.011546803638339043, + "learning_rate": 1.49009900990099e-06, + "loss": 0.0015, + "step": 800 + }, + { + "epoch": 3.931288343558282, + "grad_norm": 0.04798766225576401, + "learning_rate": 1.4831683168316831e-06, + "loss": 0.0013, + "step": 801 + }, + { + "epoch": 3.936196319018405, + "grad_norm": 0.01064328383654356, + "learning_rate": 1.4762376237623762e-06, + "loss": 0.0011, + "step": 802 + }, + { + "epoch": 3.9411042944785275, + "grad_norm": 0.1379479169845581, + "learning_rate": 1.4693069306930693e-06, + "loss": 0.0142, + "step": 803 + }, + { + "epoch": 3.94601226993865, + "grad_norm": 0.054966770112514496, + "learning_rate": 1.4623762376237623e-06, + "loss": 0.0014, + "step": 804 + }, + { + "epoch": 3.950920245398773, + "grad_norm": 0.035458799451589584, + "learning_rate": 1.4554455445544554e-06, + "loss": 0.002, + "step": 805 + }, + { + "epoch": 3.955828220858896, + "grad_norm": 0.011258352547883987, + "learning_rate": 1.4485148514851485e-06, + "loss": 0.0005, + "step": 806 + }, + { + "epoch": 3.9607361963190186, + "grad_norm": 0.022768640890717506, + "learning_rate": 1.4415841584158415e-06, + "loss": 0.0009, + "step": 807 + }, + { + "epoch": 3.9656441717791413, + "grad_norm": 0.0772656723856926, + "learning_rate": 1.4346534653465346e-06, + "loss": 0.0014, + "step": 808 + }, + { + "epoch": 3.970552147239264, + "grad_norm": 0.06587695330381393, + "learning_rate": 1.4277227722772277e-06, + "loss": 0.0034, + "step": 809 + }, + { + "epoch": 3.9754601226993866, + "grad_norm": 0.01118537038564682, + "learning_rate": 1.420792079207921e-06, + "loss": 0.0009, + "step": 810 + }, + { + "epoch": 3.9803680981595093, + "grad_norm": 0.06560896337032318, + "learning_rate": 1.413861386138614e-06, + "loss": 0.0019, + "step": 811 + }, + { + "epoch": 3.985276073619632, + "grad_norm": 0.014048455283045769, + "learning_rate": 1.406930693069307e-06, + "loss": 0.0018, + "step": 812 + }, + { + "epoch": 3.9901840490797547, + "grad_norm": 0.01656423695385456, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.001, + "step": 813 + }, + { + "epoch": 3.9950920245398773, + "grad_norm": 0.0036234341096132994, + "learning_rate": 1.3930693069306932e-06, + "loss": 0.0007, + "step": 814 + }, + { + "epoch": 4.0, + "grad_norm": 0.012785837985575199, + "learning_rate": 1.3861386138613863e-06, + "loss": 0.0009, + "step": 815 + }, + { + "epoch": 4.004907975460123, + "grad_norm": 0.025322729721665382, + "learning_rate": 1.3792079207920791e-06, + "loss": 0.002, + "step": 816 + }, + { + "epoch": 4.009815950920245, + "grad_norm": 0.2052641361951828, + "learning_rate": 1.3722772277227722e-06, + "loss": 0.0056, + "step": 817 + }, + { + "epoch": 4.014723926380368, + "grad_norm": 0.057693980634212494, + "learning_rate": 1.3653465346534653e-06, + "loss": 0.0021, + "step": 818 + }, + { + "epoch": 4.019631901840491, + "grad_norm": 0.009920844808220863, + "learning_rate": 1.3584158415841583e-06, + "loss": 0.0009, + "step": 819 + }, + { + "epoch": 4.024539877300613, + "grad_norm": 0.18843849003314972, + "learning_rate": 1.3514851485148514e-06, + "loss": 0.0148, + "step": 820 + }, + { + "epoch": 4.029447852760736, + "grad_norm": 0.003511168295517564, + "learning_rate": 1.3445544554455445e-06, + "loss": 0.0003, + "step": 821 + }, + { + "epoch": 4.034355828220859, + "grad_norm": 0.02023676596581936, + "learning_rate": 1.3376237623762376e-06, + "loss": 0.001, + "step": 822 + }, + { + "epoch": 4.039263803680981, + "grad_norm": 0.010772217065095901, + "learning_rate": 1.3306930693069306e-06, + "loss": 0.0013, + "step": 823 + }, + { + "epoch": 4.044171779141104, + "grad_norm": 0.023414717987179756, + "learning_rate": 1.3237623762376237e-06, + "loss": 0.0023, + "step": 824 + }, + { + "epoch": 4.049079754601227, + "grad_norm": 0.019114743918180466, + "learning_rate": 1.3168316831683168e-06, + "loss": 0.0007, + "step": 825 + }, + { + "epoch": 4.053987730061349, + "grad_norm": 0.012856281362473965, + "learning_rate": 1.3099009900990098e-06, + "loss": 0.0015, + "step": 826 + }, + { + "epoch": 4.058895705521472, + "grad_norm": 0.00855772290378809, + "learning_rate": 1.302970297029703e-06, + "loss": 0.0015, + "step": 827 + }, + { + "epoch": 4.063803680981595, + "grad_norm": 0.021148694679141045, + "learning_rate": 1.296039603960396e-06, + "loss": 0.0012, + "step": 828 + }, + { + "epoch": 4.068711656441717, + "grad_norm": 0.07430653274059296, + "learning_rate": 1.289108910891089e-06, + "loss": 0.0049, + "step": 829 + }, + { + "epoch": 4.07361963190184, + "grad_norm": 0.10033933073282242, + "learning_rate": 1.282178217821782e-06, + "loss": 0.0026, + "step": 830 + }, + { + "epoch": 4.078527607361964, + "grad_norm": 0.006094958167523146, + "learning_rate": 1.2752475247524752e-06, + "loss": 0.0006, + "step": 831 + }, + { + "epoch": 4.083435582822086, + "grad_norm": 0.06336677074432373, + "learning_rate": 1.2683168316831682e-06, + "loss": 0.0038, + "step": 832 + }, + { + "epoch": 4.088343558282209, + "grad_norm": 0.005283738486468792, + "learning_rate": 1.2613861386138613e-06, + "loss": 0.0015, + "step": 833 + }, + { + "epoch": 4.093251533742332, + "grad_norm": 0.006675936747342348, + "learning_rate": 1.2544554455445544e-06, + "loss": 0.0006, + "step": 834 + }, + { + "epoch": 4.098159509202454, + "grad_norm": 0.008660698309540749, + "learning_rate": 1.2475247524752474e-06, + "loss": 0.0018, + "step": 835 + }, + { + "epoch": 4.103067484662577, + "grad_norm": 0.02305518463253975, + "learning_rate": 1.2405940594059405e-06, + "loss": 0.0007, + "step": 836 + }, + { + "epoch": 4.1079754601227, + "grad_norm": 0.024816259741783142, + "learning_rate": 1.2336633663366336e-06, + "loss": 0.0007, + "step": 837 + }, + { + "epoch": 4.112883435582822, + "grad_norm": 0.026276560500264168, + "learning_rate": 1.2267326732673267e-06, + "loss": 0.0016, + "step": 838 + }, + { + "epoch": 4.117791411042945, + "grad_norm": 0.029642153531312943, + "learning_rate": 1.2198019801980197e-06, + "loss": 0.0038, + "step": 839 + }, + { + "epoch": 4.122699386503068, + "grad_norm": 0.05285963416099548, + "learning_rate": 1.2128712871287128e-06, + "loss": 0.0019, + "step": 840 + }, + { + "epoch": 4.12760736196319, + "grad_norm": 0.014620939269661903, + "learning_rate": 1.2059405940594059e-06, + "loss": 0.0012, + "step": 841 + }, + { + "epoch": 4.132515337423313, + "grad_norm": 0.012532511726021767, + "learning_rate": 1.1990099009900991e-06, + "loss": 0.0007, + "step": 842 + }, + { + "epoch": 4.137423312883436, + "grad_norm": 0.04022945091128349, + "learning_rate": 1.1920792079207922e-06, + "loss": 0.0032, + "step": 843 + }, + { + "epoch": 4.142331288343558, + "grad_norm": 0.021105729043483734, + "learning_rate": 1.1851485148514853e-06, + "loss": 0.0015, + "step": 844 + }, + { + "epoch": 4.147239263803681, + "grad_norm": 0.07277761399745941, + "learning_rate": 1.1782178217821783e-06, + "loss": 0.0038, + "step": 845 + }, + { + "epoch": 4.152147239263804, + "grad_norm": 0.015429302118718624, + "learning_rate": 1.1712871287128714e-06, + "loss": 0.0014, + "step": 846 + }, + { + "epoch": 4.157055214723926, + "grad_norm": 0.02602989971637726, + "learning_rate": 1.1643564356435645e-06, + "loss": 0.0013, + "step": 847 + }, + { + "epoch": 4.161963190184049, + "grad_norm": 0.018687183037400246, + "learning_rate": 1.1574257425742575e-06, + "loss": 0.0016, + "step": 848 + }, + { + "epoch": 4.166871165644172, + "grad_norm": 0.019744986668229103, + "learning_rate": 1.1504950495049506e-06, + "loss": 0.0013, + "step": 849 + }, + { + "epoch": 4.171779141104294, + "grad_norm": 0.029573217034339905, + "learning_rate": 1.1435643564356437e-06, + "loss": 0.0023, + "step": 850 + }, + { + "epoch": 4.176687116564417, + "grad_norm": 0.020479142665863037, + "learning_rate": 1.1366336633663368e-06, + "loss": 0.0011, + "step": 851 + }, + { + "epoch": 4.18159509202454, + "grad_norm": 0.11432457715272903, + "learning_rate": 1.1297029702970298e-06, + "loss": 0.0041, + "step": 852 + }, + { + "epoch": 4.186503067484662, + "grad_norm": 0.012511249631643295, + "learning_rate": 1.1227722772277227e-06, + "loss": 0.0006, + "step": 853 + }, + { + "epoch": 4.191411042944785, + "grad_norm": 0.01595146209001541, + "learning_rate": 1.1158415841584157e-06, + "loss": 0.0009, + "step": 854 + }, + { + "epoch": 4.196319018404908, + "grad_norm": 0.007092094514518976, + "learning_rate": 1.1089108910891088e-06, + "loss": 0.0007, + "step": 855 + }, + { + "epoch": 4.20122699386503, + "grad_norm": 0.03979247063398361, + "learning_rate": 1.1019801980198019e-06, + "loss": 0.0018, + "step": 856 + }, + { + "epoch": 4.206134969325153, + "grad_norm": 0.008829467929899693, + "learning_rate": 1.095049504950495e-06, + "loss": 0.0007, + "step": 857 + }, + { + "epoch": 4.211042944785276, + "grad_norm": 0.09763351082801819, + "learning_rate": 1.088118811881188e-06, + "loss": 0.0048, + "step": 858 + }, + { + "epoch": 4.215950920245398, + "grad_norm": 0.08337781578302383, + "learning_rate": 1.081188118811881e-06, + "loss": 0.0014, + "step": 859 + }, + { + "epoch": 4.220858895705521, + "grad_norm": 0.029353009536862373, + "learning_rate": 1.0742574257425742e-06, + "loss": 0.001, + "step": 860 + }, + { + "epoch": 4.225766871165644, + "grad_norm": 0.121429443359375, + "learning_rate": 1.0673267326732672e-06, + "loss": 0.0148, + "step": 861 + }, + { + "epoch": 4.230674846625767, + "grad_norm": 0.01580023020505905, + "learning_rate": 1.0603960396039603e-06, + "loss": 0.001, + "step": 862 + }, + { + "epoch": 4.23558282208589, + "grad_norm": 0.013746123760938644, + "learning_rate": 1.0534653465346534e-06, + "loss": 0.0012, + "step": 863 + }, + { + "epoch": 4.240490797546013, + "grad_norm": 0.011870300397276878, + "learning_rate": 1.0465346534653464e-06, + "loss": 0.0007, + "step": 864 + }, + { + "epoch": 4.245398773006135, + "grad_norm": 0.05088931694626808, + "learning_rate": 1.0396039603960395e-06, + "loss": 0.0017, + "step": 865 + }, + { + "epoch": 4.250306748466258, + "grad_norm": 0.38851794600486755, + "learning_rate": 1.0326732673267326e-06, + "loss": 0.014, + "step": 866 + }, + { + "epoch": 4.255214723926381, + "grad_norm": 0.01347925141453743, + "learning_rate": 1.0257425742574256e-06, + "loss": 0.0012, + "step": 867 + }, + { + "epoch": 4.260122699386503, + "grad_norm": 0.010402753949165344, + "learning_rate": 1.0188118811881187e-06, + "loss": 0.0007, + "step": 868 + }, + { + "epoch": 4.265030674846626, + "grad_norm": 0.03338263928890228, + "learning_rate": 1.0118811881188118e-06, + "loss": 0.0017, + "step": 869 + }, + { + "epoch": 4.269938650306749, + "grad_norm": 0.007806051056832075, + "learning_rate": 1.0049504950495048e-06, + "loss": 0.0006, + "step": 870 + }, + { + "epoch": 4.274846625766871, + "grad_norm": 0.0339755155146122, + "learning_rate": 9.98019801980198e-07, + "loss": 0.0027, + "step": 871 + }, + { + "epoch": 4.279754601226994, + "grad_norm": 0.02654801867902279, + "learning_rate": 9.91089108910891e-07, + "loss": 0.0012, + "step": 872 + }, + { + "epoch": 4.284662576687117, + "grad_norm": 0.018412116914987564, + "learning_rate": 9.841584158415843e-07, + "loss": 0.0022, + "step": 873 + }, + { + "epoch": 4.289570552147239, + "grad_norm": 0.0640820562839508, + "learning_rate": 9.772277227722773e-07, + "loss": 0.0021, + "step": 874 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.03333529084920883, + "learning_rate": 9.702970297029704e-07, + "loss": 0.0023, + "step": 875 + }, + { + "epoch": 4.299386503067485, + "grad_norm": 0.022033028304576874, + "learning_rate": 9.633663366336635e-07, + "loss": 0.0007, + "step": 876 + }, + { + "epoch": 4.304294478527607, + "grad_norm": 0.010194915346801281, + "learning_rate": 9.564356435643565e-07, + "loss": 0.0008, + "step": 877 + }, + { + "epoch": 4.30920245398773, + "grad_norm": 0.015077208168804646, + "learning_rate": 9.495049504950496e-07, + "loss": 0.0012, + "step": 878 + }, + { + "epoch": 4.314110429447853, + "grad_norm": 0.029076164588332176, + "learning_rate": 9.425742574257426e-07, + "loss": 0.0015, + "step": 879 + }, + { + "epoch": 4.319018404907975, + "grad_norm": 0.0363786481320858, + "learning_rate": 9.356435643564356e-07, + "loss": 0.001, + "step": 880 + }, + { + "epoch": 4.323926380368098, + "grad_norm": 0.21520403027534485, + "learning_rate": 9.287128712871287e-07, + "loss": 0.0094, + "step": 881 + }, + { + "epoch": 4.328834355828221, + "grad_norm": 0.003572958754375577, + "learning_rate": 9.217821782178218e-07, + "loss": 0.0005, + "step": 882 + }, + { + "epoch": 4.333742331288343, + "grad_norm": 0.01643703132867813, + "learning_rate": 9.148514851485148e-07, + "loss": 0.0009, + "step": 883 + }, + { + "epoch": 4.338650306748466, + "grad_norm": 0.13475348055362701, + "learning_rate": 9.079207920792079e-07, + "loss": 0.0077, + "step": 884 + }, + { + "epoch": 4.343558282208589, + "grad_norm": 0.10863371193408966, + "learning_rate": 9.00990099009901e-07, + "loss": 0.0294, + "step": 885 + }, + { + "epoch": 4.348466257668711, + "grad_norm": 0.02765970304608345, + "learning_rate": 8.94059405940594e-07, + "loss": 0.0017, + "step": 886 + }, + { + "epoch": 4.353374233128834, + "grad_norm": 0.011608476750552654, + "learning_rate": 8.871287128712871e-07, + "loss": 0.001, + "step": 887 + }, + { + "epoch": 4.358282208588957, + "grad_norm": 0.005024611949920654, + "learning_rate": 8.801980198019802e-07, + "loss": 0.0006, + "step": 888 + }, + { + "epoch": 4.363190184049079, + "grad_norm": 0.007748506963253021, + "learning_rate": 8.732673267326733e-07, + "loss": 0.0006, + "step": 889 + }, + { + "epoch": 4.368098159509202, + "grad_norm": 0.23455409705638885, + "learning_rate": 8.663366336633663e-07, + "loss": 0.0057, + "step": 890 + }, + { + "epoch": 4.373006134969325, + "grad_norm": 0.03380454331636429, + "learning_rate": 8.594059405940594e-07, + "loss": 0.0011, + "step": 891 + }, + { + "epoch": 4.3779141104294474, + "grad_norm": 0.03481479734182358, + "learning_rate": 8.524752475247525e-07, + "loss": 0.0016, + "step": 892 + }, + { + "epoch": 4.38282208588957, + "grad_norm": 0.022679351270198822, + "learning_rate": 8.455445544554455e-07, + "loss": 0.0022, + "step": 893 + }, + { + "epoch": 4.387730061349693, + "grad_norm": 0.093803271651268, + "learning_rate": 8.386138613861386e-07, + "loss": 0.0022, + "step": 894 + }, + { + "epoch": 4.392638036809816, + "grad_norm": 0.05329536274075508, + "learning_rate": 8.316831683168317e-07, + "loss": 0.0018, + "step": 895 + }, + { + "epoch": 4.397546012269939, + "grad_norm": 0.05470538139343262, + "learning_rate": 8.247524752475247e-07, + "loss": 0.0014, + "step": 896 + }, + { + "epoch": 4.402453987730062, + "grad_norm": 0.02288208343088627, + "learning_rate": 8.178217821782179e-07, + "loss": 0.0014, + "step": 897 + }, + { + "epoch": 4.407361963190184, + "grad_norm": 0.04405367746949196, + "learning_rate": 8.108910891089109e-07, + "loss": 0.001, + "step": 898 + }, + { + "epoch": 4.412269938650307, + "grad_norm": 0.030512019991874695, + "learning_rate": 8.039603960396039e-07, + "loss": 0.0013, + "step": 899 + }, + { + "epoch": 4.41717791411043, + "grad_norm": 0.12844492495059967, + "learning_rate": 7.97029702970297e-07, + "loss": 0.0043, + "step": 900 + }, + { + "epoch": 4.422085889570552, + "grad_norm": 0.02055547758936882, + "learning_rate": 7.900990099009901e-07, + "loss": 0.001, + "step": 901 + }, + { + "epoch": 4.426993865030675, + "grad_norm": 0.04747156798839569, + "learning_rate": 7.831683168316831e-07, + "loss": 0.0061, + "step": 902 + }, + { + "epoch": 4.431901840490798, + "grad_norm": 0.010140195488929749, + "learning_rate": 7.762376237623762e-07, + "loss": 0.001, + "step": 903 + }, + { + "epoch": 4.43680981595092, + "grad_norm": 0.009971629828214645, + "learning_rate": 7.693069306930693e-07, + "loss": 0.0011, + "step": 904 + }, + { + "epoch": 4.441717791411043, + "grad_norm": 0.006146845407783985, + "learning_rate": 7.623762376237624e-07, + "loss": 0.0004, + "step": 905 + }, + { + "epoch": 4.446625766871166, + "grad_norm": 0.027412964031100273, + "learning_rate": 7.554455445544554e-07, + "loss": 0.0024, + "step": 906 + }, + { + "epoch": 4.451533742331288, + "grad_norm": 0.021934248507022858, + "learning_rate": 7.485148514851485e-07, + "loss": 0.0012, + "step": 907 + }, + { + "epoch": 4.456441717791411, + "grad_norm": 0.3181805908679962, + "learning_rate": 7.415841584158416e-07, + "loss": 0.0261, + "step": 908 + }, + { + "epoch": 4.461349693251534, + "grad_norm": 0.010769632644951344, + "learning_rate": 7.346534653465346e-07, + "loss": 0.0008, + "step": 909 + }, + { + "epoch": 4.466257668711656, + "grad_norm": 0.15605410933494568, + "learning_rate": 7.277227722772277e-07, + "loss": 0.006, + "step": 910 + }, + { + "epoch": 4.471165644171779, + "grad_norm": 0.1743585467338562, + "learning_rate": 7.207920792079208e-07, + "loss": 0.0075, + "step": 911 + }, + { + "epoch": 4.476073619631902, + "grad_norm": 0.011531657539308071, + "learning_rate": 7.138613861386138e-07, + "loss": 0.0008, + "step": 912 + }, + { + "epoch": 4.480981595092024, + "grad_norm": 0.15488475561141968, + "learning_rate": 7.06930693069307e-07, + "loss": 0.0054, + "step": 913 + }, + { + "epoch": 4.485889570552147, + "grad_norm": 0.12285412847995758, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0067, + "step": 914 + }, + { + "epoch": 4.49079754601227, + "grad_norm": 0.02667032927274704, + "learning_rate": 6.930693069306931e-07, + "loss": 0.001, + "step": 915 + }, + { + "epoch": 4.495705521472392, + "grad_norm": 0.027680931612849236, + "learning_rate": 6.861386138613861e-07, + "loss": 0.0014, + "step": 916 + }, + { + "epoch": 4.500613496932515, + "grad_norm": 0.01782669499516487, + "learning_rate": 6.792079207920792e-07, + "loss": 0.0019, + "step": 917 + }, + { + "epoch": 4.505521472392638, + "grad_norm": 0.061316560953855515, + "learning_rate": 6.722772277227722e-07, + "loss": 0.0026, + "step": 918 + }, + { + "epoch": 4.5104294478527605, + "grad_norm": 0.052529476583004, + "learning_rate": 6.653465346534653e-07, + "loss": 0.0023, + "step": 919 + }, + { + "epoch": 4.515337423312883, + "grad_norm": 0.0037185668479651213, + "learning_rate": 6.584158415841584e-07, + "loss": 0.001, + "step": 920 + }, + { + "epoch": 4.520245398773006, + "grad_norm": 0.06986022740602493, + "learning_rate": 6.514851485148514e-07, + "loss": 0.0022, + "step": 921 + }, + { + "epoch": 4.5251533742331285, + "grad_norm": 0.006407030858099461, + "learning_rate": 6.445544554455445e-07, + "loss": 0.0005, + "step": 922 + }, + { + "epoch": 4.530061349693252, + "grad_norm": 0.005136528518050909, + "learning_rate": 6.376237623762376e-07, + "loss": 0.0007, + "step": 923 + }, + { + "epoch": 4.534969325153375, + "grad_norm": 0.063414067029953, + "learning_rate": 6.306930693069307e-07, + "loss": 0.0031, + "step": 924 + }, + { + "epoch": 4.539877300613497, + "grad_norm": 0.029516983777284622, + "learning_rate": 6.237623762376237e-07, + "loss": 0.0015, + "step": 925 + }, + { + "epoch": 4.54478527607362, + "grad_norm": 0.07440595328807831, + "learning_rate": 6.168316831683168e-07, + "loss": 0.0016, + "step": 926 + }, + { + "epoch": 4.549693251533743, + "grad_norm": 0.044622018933296204, + "learning_rate": 6.099009900990099e-07, + "loss": 0.0023, + "step": 927 + }, + { + "epoch": 4.554601226993865, + "grad_norm": 0.04841621220111847, + "learning_rate": 6.029702970297029e-07, + "loss": 0.0049, + "step": 928 + }, + { + "epoch": 4.559509202453988, + "grad_norm": 0.026555247604846954, + "learning_rate": 5.960396039603961e-07, + "loss": 0.001, + "step": 929 + }, + { + "epoch": 4.564417177914111, + "grad_norm": 0.1271572858095169, + "learning_rate": 5.891089108910892e-07, + "loss": 0.0026, + "step": 930 + }, + { + "epoch": 4.569325153374233, + "grad_norm": 0.015365286730229855, + "learning_rate": 5.821782178217822e-07, + "loss": 0.0017, + "step": 931 + }, + { + "epoch": 4.574233128834356, + "grad_norm": 0.024656543508172035, + "learning_rate": 5.752475247524753e-07, + "loss": 0.0013, + "step": 932 + }, + { + "epoch": 4.579141104294479, + "grad_norm": 0.05672885477542877, + "learning_rate": 5.683168316831684e-07, + "loss": 0.0029, + "step": 933 + }, + { + "epoch": 4.584049079754601, + "grad_norm": 0.023147093132138252, + "learning_rate": 5.613861386138613e-07, + "loss": 0.0026, + "step": 934 + }, + { + "epoch": 4.588957055214724, + "grad_norm": 0.13222621381282806, + "learning_rate": 5.544554455445544e-07, + "loss": 0.0144, + "step": 935 + }, + { + "epoch": 4.593865030674847, + "grad_norm": 0.022834930568933487, + "learning_rate": 5.475247524752475e-07, + "loss": 0.0012, + "step": 936 + }, + { + "epoch": 4.598773006134969, + "grad_norm": 0.014577291905879974, + "learning_rate": 5.405940594059405e-07, + "loss": 0.0009, + "step": 937 + }, + { + "epoch": 4.603680981595092, + "grad_norm": 0.026476260274648666, + "learning_rate": 5.336633663366336e-07, + "loss": 0.0015, + "step": 938 + }, + { + "epoch": 4.608588957055215, + "grad_norm": 0.0048033553175628185, + "learning_rate": 5.267326732673267e-07, + "loss": 0.0001, + "step": 939 + }, + { + "epoch": 4.613496932515337, + "grad_norm": 0.05348242446780205, + "learning_rate": 5.198019801980198e-07, + "loss": 0.0017, + "step": 940 + }, + { + "epoch": 4.61840490797546, + "grad_norm": 0.027225324884057045, + "learning_rate": 5.128712871287128e-07, + "loss": 0.0012, + "step": 941 + }, + { + "epoch": 4.623312883435583, + "grad_norm": 0.10507699847221375, + "learning_rate": 5.059405940594059e-07, + "loss": 0.0071, + "step": 942 + }, + { + "epoch": 4.6282208588957054, + "grad_norm": 0.12814861536026, + "learning_rate": 4.99009900990099e-07, + "loss": 0.0073, + "step": 943 + }, + { + "epoch": 4.633128834355828, + "grad_norm": 0.03401525318622589, + "learning_rate": 4.920792079207921e-07, + "loss": 0.0014, + "step": 944 + }, + { + "epoch": 4.638036809815951, + "grad_norm": 0.021818110719323158, + "learning_rate": 4.851485148514852e-07, + "loss": 0.001, + "step": 945 + }, + { + "epoch": 4.6429447852760735, + "grad_norm": 0.13617785274982452, + "learning_rate": 4.782178217821783e-07, + "loss": 0.0047, + "step": 946 + }, + { + "epoch": 4.647852760736196, + "grad_norm": 0.15220874547958374, + "learning_rate": 4.712871287128713e-07, + "loss": 0.0053, + "step": 947 + }, + { + "epoch": 4.652760736196319, + "grad_norm": 0.30194413661956787, + "learning_rate": 4.6435643564356435e-07, + "loss": 0.0218, + "step": 948 + }, + { + "epoch": 4.6576687116564415, + "grad_norm": 0.1869322508573532, + "learning_rate": 4.574257425742574e-07, + "loss": 0.0622, + "step": 949 + }, + { + "epoch": 4.662576687116564, + "grad_norm": 0.024376358836889267, + "learning_rate": 4.504950495049505e-07, + "loss": 0.001, + "step": 950 + }, + { + "epoch": 4.667484662576687, + "grad_norm": 0.04647885262966156, + "learning_rate": 4.4356435643564356e-07, + "loss": 0.0019, + "step": 951 + }, + { + "epoch": 4.6723926380368095, + "grad_norm": 0.09357151389122009, + "learning_rate": 4.3663366336633663e-07, + "loss": 0.0046, + "step": 952 + }, + { + "epoch": 4.677300613496932, + "grad_norm": 0.12137161940336227, + "learning_rate": 4.297029702970297e-07, + "loss": 0.0024, + "step": 953 + }, + { + "epoch": 4.682208588957055, + "grad_norm": 0.017510604113340378, + "learning_rate": 4.2277227722772276e-07, + "loss": 0.0007, + "step": 954 + }, + { + "epoch": 4.6871165644171775, + "grad_norm": 0.011096769012510777, + "learning_rate": 4.1584158415841583e-07, + "loss": 0.0009, + "step": 955 + }, + { + "epoch": 4.6920245398773, + "grad_norm": 0.075267493724823, + "learning_rate": 4.0891089108910895e-07, + "loss": 0.0019, + "step": 956 + }, + { + "epoch": 4.696932515337423, + "grad_norm": 0.0064629483968019485, + "learning_rate": 4.0198019801980197e-07, + "loss": 0.0017, + "step": 957 + }, + { + "epoch": 4.7018404907975455, + "grad_norm": 0.018965771421790123, + "learning_rate": 3.9504950495049504e-07, + "loss": 0.0021, + "step": 958 + }, + { + "epoch": 4.706748466257669, + "grad_norm": 0.06010276451706886, + "learning_rate": 3.881188118811881e-07, + "loss": 0.0014, + "step": 959 + }, + { + "epoch": 4.711656441717792, + "grad_norm": 0.03155827522277832, + "learning_rate": 3.811881188118812e-07, + "loss": 0.0017, + "step": 960 + }, + { + "epoch": 4.716564417177914, + "grad_norm": 0.1321091651916504, + "learning_rate": 3.7425742574257424e-07, + "loss": 0.0066, + "step": 961 + }, + { + "epoch": 4.721472392638037, + "grad_norm": 0.018097804859280586, + "learning_rate": 3.673267326732673e-07, + "loss": 0.0011, + "step": 962 + }, + { + "epoch": 4.72638036809816, + "grad_norm": 0.05625467747449875, + "learning_rate": 3.603960396039604e-07, + "loss": 0.0021, + "step": 963 + }, + { + "epoch": 4.731288343558282, + "grad_norm": 0.010586952790617943, + "learning_rate": 3.534653465346535e-07, + "loss": 0.0004, + "step": 964 + }, + { + "epoch": 4.736196319018405, + "grad_norm": 0.0075930338352918625, + "learning_rate": 3.4653465346534657e-07, + "loss": 0.001, + "step": 965 + }, + { + "epoch": 4.741104294478528, + "grad_norm": 0.011664043180644512, + "learning_rate": 3.396039603960396e-07, + "loss": 0.0014, + "step": 966 + }, + { + "epoch": 4.74601226993865, + "grad_norm": 0.04409307986497879, + "learning_rate": 3.3267326732673266e-07, + "loss": 0.0017, + "step": 967 + }, + { + "epoch": 4.750920245398773, + "grad_norm": 0.01488639134913683, + "learning_rate": 3.257425742574257e-07, + "loss": 0.0012, + "step": 968 + }, + { + "epoch": 4.755828220858896, + "grad_norm": 0.02663021720945835, + "learning_rate": 3.188118811881188e-07, + "loss": 0.0014, + "step": 969 + }, + { + "epoch": 4.7607361963190185, + "grad_norm": 0.019124912098050117, + "learning_rate": 3.1188118811881186e-07, + "loss": 0.0013, + "step": 970 + }, + { + "epoch": 4.765644171779141, + "grad_norm": 0.023948566988110542, + "learning_rate": 3.0495049504950493e-07, + "loss": 0.0027, + "step": 971 + }, + { + "epoch": 4.770552147239264, + "grad_norm": 0.04264827072620392, + "learning_rate": 2.9801980198019805e-07, + "loss": 0.0015, + "step": 972 + }, + { + "epoch": 4.7754601226993865, + "grad_norm": 0.11865667253732681, + "learning_rate": 2.910891089108911e-07, + "loss": 0.0037, + "step": 973 + }, + { + "epoch": 4.780368098159509, + "grad_norm": 0.0415462963283062, + "learning_rate": 2.841584158415842e-07, + "loss": 0.0027, + "step": 974 + }, + { + "epoch": 4.785276073619632, + "grad_norm": 0.16526491940021515, + "learning_rate": 2.772277227722772e-07, + "loss": 0.0149, + "step": 975 + }, + { + "epoch": 4.7901840490797545, + "grad_norm": 0.017350684851408005, + "learning_rate": 2.7029702970297027e-07, + "loss": 0.0008, + "step": 976 + }, + { + "epoch": 4.795092024539877, + "grad_norm": 0.020096784457564354, + "learning_rate": 2.6336633663366334e-07, + "loss": 0.0018, + "step": 977 + }, + { + "epoch": 4.8, + "grad_norm": 0.019672604277729988, + "learning_rate": 2.564356435643564e-07, + "loss": 0.0009, + "step": 978 + }, + { + "epoch": 4.8049079754601225, + "grad_norm": 0.050094954669475555, + "learning_rate": 2.495049504950495e-07, + "loss": 0.0015, + "step": 979 + }, + { + "epoch": 4.809815950920245, + "grad_norm": 0.07061317563056946, + "learning_rate": 2.425742574257426e-07, + "loss": 0.0039, + "step": 980 + }, + { + "epoch": 4.814723926380368, + "grad_norm": 0.023730693385004997, + "learning_rate": 2.3564356435643564e-07, + "loss": 0.0012, + "step": 981 + }, + { + "epoch": 4.8196319018404905, + "grad_norm": 0.02949446439743042, + "learning_rate": 2.287128712871287e-07, + "loss": 0.0013, + "step": 982 + }, + { + "epoch": 4.824539877300613, + "grad_norm": 0.04460925608873367, + "learning_rate": 2.2178217821782178e-07, + "loss": 0.002, + "step": 983 + }, + { + "epoch": 4.829447852760736, + "grad_norm": 0.032420773059129715, + "learning_rate": 2.1485148514851485e-07, + "loss": 0.0011, + "step": 984 + }, + { + "epoch": 4.8343558282208585, + "grad_norm": 0.16345536708831787, + "learning_rate": 2.0792079207920792e-07, + "loss": 0.0052, + "step": 985 + }, + { + "epoch": 4.839263803680982, + "grad_norm": 0.016612550243735313, + "learning_rate": 2.0099009900990098e-07, + "loss": 0.0014, + "step": 986 + }, + { + "epoch": 4.844171779141105, + "grad_norm": 0.022284861654043198, + "learning_rate": 1.9405940594059405e-07, + "loss": 0.0015, + "step": 987 + }, + { + "epoch": 4.849079754601227, + "grad_norm": 0.018358217552304268, + "learning_rate": 1.8712871287128712e-07, + "loss": 0.001, + "step": 988 + }, + { + "epoch": 4.85398773006135, + "grad_norm": 0.012662719935178757, + "learning_rate": 1.801980198019802e-07, + "loss": 0.0009, + "step": 989 + }, + { + "epoch": 4.858895705521473, + "grad_norm": 0.020140303298830986, + "learning_rate": 1.7326732673267329e-07, + "loss": 0.0014, + "step": 990 + }, + { + "epoch": 4.863803680981595, + "grad_norm": 0.006523944437503815, + "learning_rate": 1.6633663366336633e-07, + "loss": 0.0004, + "step": 991 + }, + { + "epoch": 4.868711656441718, + "grad_norm": 0.02763935551047325, + "learning_rate": 1.594059405940594e-07, + "loss": 0.0014, + "step": 992 + }, + { + "epoch": 4.873619631901841, + "grad_norm": 0.05238136649131775, + "learning_rate": 1.5247524752475246e-07, + "loss": 0.0015, + "step": 993 + }, + { + "epoch": 4.8785276073619634, + "grad_norm": 0.026852233335375786, + "learning_rate": 1.4554455445544556e-07, + "loss": 0.0011, + "step": 994 + }, + { + "epoch": 4.883435582822086, + "grad_norm": 0.01658753491938114, + "learning_rate": 1.386138613861386e-07, + "loss": 0.0011, + "step": 995 + }, + { + "epoch": 4.888343558282209, + "grad_norm": 0.025397639721632004, + "learning_rate": 1.3168316831683167e-07, + "loss": 0.0018, + "step": 996 + }, + { + "epoch": 4.8932515337423315, + "grad_norm": 0.05166466534137726, + "learning_rate": 1.2475247524752474e-07, + "loss": 0.0014, + "step": 997 + }, + { + "epoch": 4.898159509202454, + "grad_norm": 0.06475819647312164, + "learning_rate": 1.1782178217821782e-07, + "loss": 0.0017, + "step": 998 + }, + { + "epoch": 4.903067484662577, + "grad_norm": 0.03307437151670456, + "learning_rate": 1.1089108910891089e-07, + "loss": 0.0013, + "step": 999 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 0.014889244921505451, + "learning_rate": 1.0396039603960396e-07, + "loss": 0.0012, + "step": 1000 + }, + { + "epoch": 4.912883435582822, + "grad_norm": 0.02157243713736534, + "learning_rate": 9.702970297029703e-08, + "loss": 0.0013, + "step": 1001 + }, + { + "epoch": 4.917791411042945, + "grad_norm": 0.011875314638018608, + "learning_rate": 9.00990099009901e-08, + "loss": 0.0009, + "step": 1002 + }, + { + "epoch": 4.9226993865030675, + "grad_norm": 0.01703159138560295, + "learning_rate": 8.316831683168316e-08, + "loss": 0.0015, + "step": 1003 + }, + { + "epoch": 4.92760736196319, + "grad_norm": 0.042403411120176315, + "learning_rate": 7.623762376237623e-08, + "loss": 0.0024, + "step": 1004 + }, + { + "epoch": 4.932515337423313, + "grad_norm": 0.007990752346813679, + "learning_rate": 6.93069306930693e-08, + "loss": 0.0011, + "step": 1005 + }, + { + "epoch": 4.9374233128834355, + "grad_norm": 0.024830589070916176, + "learning_rate": 6.237623762376237e-08, + "loss": 0.0013, + "step": 1006 + }, + { + "epoch": 4.942331288343558, + "grad_norm": 0.023844977840781212, + "learning_rate": 5.5445544554455445e-08, + "loss": 0.0011, + "step": 1007 + }, + { + "epoch": 4.947239263803681, + "grad_norm": 0.0047842771746218204, + "learning_rate": 4.8514851485148513e-08, + "loss": 0.0002, + "step": 1008 + }, + { + "epoch": 4.9521472392638035, + "grad_norm": 0.20831891894340515, + "learning_rate": 4.158415841584158e-08, + "loss": 0.0057, + "step": 1009 + }, + { + "epoch": 4.957055214723926, + "grad_norm": 0.026182973757386208, + "learning_rate": 3.465346534653465e-08, + "loss": 0.0017, + "step": 1010 + }, + { + "epoch": 4.961963190184049, + "grad_norm": 0.03703535720705986, + "learning_rate": 2.7722772277227722e-08, + "loss": 0.002, + "step": 1011 + }, + { + "epoch": 4.9668711656441715, + "grad_norm": 0.09641406685113907, + "learning_rate": 2.079207920792079e-08, + "loss": 0.0031, + "step": 1012 + }, + { + "epoch": 4.971779141104294, + "grad_norm": 0.007641744799911976, + "learning_rate": 1.3861386138613861e-08, + "loss": 0.0006, + "step": 1013 + }, + { + "epoch": 4.976687116564417, + "grad_norm": 0.15017706155776978, + "learning_rate": 6.930693069306931e-09, + "loss": 0.0045, + "step": 1014 + }, + { + "epoch": 4.9815950920245395, + "grad_norm": 0.014618399553000927, + "learning_rate": 0.0, + "loss": 0.001, + "step": 1015 + } + ], + "logging_steps": 1, + "max_steps": 1015, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2124067535414374e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}