{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.995221408091749, "eval_steps": 500, "global_step": 3920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012742911755336094, "grad_norm": 4.710488926738518, "learning_rate": 5.1020408163265303e-08, "loss": 0.4551, "step": 1 }, { "epoch": 0.002548582351067219, "grad_norm": 4.830025905938851, "learning_rate": 1.0204081632653061e-07, "loss": 0.482, "step": 2 }, { "epoch": 0.003822873526600828, "grad_norm": 5.030611556172937, "learning_rate": 1.5306122448979592e-07, "loss": 0.5167, "step": 3 }, { "epoch": 0.005097164702134438, "grad_norm": 4.715950568558145, "learning_rate": 2.0408163265306121e-07, "loss": 0.4583, "step": 4 }, { "epoch": 0.0063714558776680474, "grad_norm": 4.665512979793282, "learning_rate": 2.5510204081632656e-07, "loss": 0.4423, "step": 5 }, { "epoch": 0.007645747053201656, "grad_norm": 4.788258673241894, "learning_rate": 3.0612244897959183e-07, "loss": 0.4665, "step": 6 }, { "epoch": 0.008920038228735267, "grad_norm": 5.203681783546096, "learning_rate": 3.5714285714285716e-07, "loss": 0.4783, "step": 7 }, { "epoch": 0.010194329404268876, "grad_norm": 4.841106761566023, "learning_rate": 4.0816326530612243e-07, "loss": 0.4571, "step": 8 }, { "epoch": 0.011468620579802484, "grad_norm": 4.249485950682216, "learning_rate": 4.591836734693878e-07, "loss": 0.4104, "step": 9 }, { "epoch": 0.012742911755336095, "grad_norm": 4.655892311292604, "learning_rate": 5.102040816326531e-07, "loss": 0.4915, "step": 10 }, { "epoch": 0.014017202930869704, "grad_norm": 4.409839264937263, "learning_rate": 5.612244897959184e-07, "loss": 0.4811, "step": 11 }, { "epoch": 0.015291494106403312, "grad_norm": 4.087444445423412, "learning_rate": 6.122448979591837e-07, "loss": 0.4349, "step": 12 }, { "epoch": 0.01656578528193692, "grad_norm": 4.148259031315218, "learning_rate": 6.632653061224491e-07, "loss": 0.4292, "step": 13 }, { "epoch": 0.017840076457470534, "grad_norm": 3.0734368824796046, "learning_rate": 7.142857142857143e-07, "loss": 0.3757, "step": 14 }, { "epoch": 0.019114367633004142, "grad_norm": 3.114226508993055, "learning_rate": 7.653061224489796e-07, "loss": 0.3869, "step": 15 }, { "epoch": 0.02038865880853775, "grad_norm": 3.0008628643450965, "learning_rate": 8.163265306122449e-07, "loss": 0.3881, "step": 16 }, { "epoch": 0.02166294998407136, "grad_norm": 2.861092659288368, "learning_rate": 8.673469387755103e-07, "loss": 0.4431, "step": 17 }, { "epoch": 0.02293724115960497, "grad_norm": 2.9681615461530155, "learning_rate": 9.183673469387756e-07, "loss": 0.4148, "step": 18 }, { "epoch": 0.024211532335138577, "grad_norm": 1.778108913205758, "learning_rate": 9.69387755102041e-07, "loss": 0.3302, "step": 19 }, { "epoch": 0.02548582351067219, "grad_norm": 1.7071757468120783, "learning_rate": 1.0204081632653063e-06, "loss": 0.3639, "step": 20 }, { "epoch": 0.0267601146862058, "grad_norm": 1.5586290488426222, "learning_rate": 1.0714285714285714e-06, "loss": 0.3408, "step": 21 }, { "epoch": 0.028034405861739407, "grad_norm": 1.6733655486646275, "learning_rate": 1.122448979591837e-06, "loss": 0.3395, "step": 22 }, { "epoch": 0.029308697037273016, "grad_norm": 1.6053585651604683, "learning_rate": 1.1734693877551022e-06, "loss": 0.3191, "step": 23 }, { "epoch": 0.030582988212806625, "grad_norm": 1.6840093444854802, "learning_rate": 1.2244897959183673e-06, "loss": 0.3416, "step": 24 }, { "epoch": 0.03185727938834024, "grad_norm": 1.4384875052704835, "learning_rate": 1.2755102040816329e-06, "loss": 0.3446, "step": 25 }, { "epoch": 0.03313157056387384, "grad_norm": 1.0107046682974377, "learning_rate": 1.3265306122448982e-06, "loss": 0.3181, "step": 26 }, { "epoch": 0.034405861739407455, "grad_norm": 0.9437125451372907, "learning_rate": 1.3775510204081633e-06, "loss": 0.2971, "step": 27 }, { "epoch": 0.03568015291494107, "grad_norm": 0.9779793280047246, "learning_rate": 1.4285714285714286e-06, "loss": 0.3047, "step": 28 }, { "epoch": 0.03695444409047467, "grad_norm": 0.9054150574116914, "learning_rate": 1.479591836734694e-06, "loss": 0.2857, "step": 29 }, { "epoch": 0.038228735266008285, "grad_norm": 0.938625937310584, "learning_rate": 1.5306122448979593e-06, "loss": 0.3141, "step": 30 }, { "epoch": 0.03950302644154189, "grad_norm": 0.7919950597206402, "learning_rate": 1.5816326530612248e-06, "loss": 0.2724, "step": 31 }, { "epoch": 0.0407773176170755, "grad_norm": 0.736644531370653, "learning_rate": 1.6326530612244897e-06, "loss": 0.2813, "step": 32 }, { "epoch": 0.042051608792609114, "grad_norm": 0.6554756318187847, "learning_rate": 1.6836734693877552e-06, "loss": 0.2725, "step": 33 }, { "epoch": 0.04332589996814272, "grad_norm": 0.6332644048648629, "learning_rate": 1.7346938775510206e-06, "loss": 0.29, "step": 34 }, { "epoch": 0.04460019114367633, "grad_norm": 0.5529560030640578, "learning_rate": 1.7857142857142859e-06, "loss": 0.264, "step": 35 }, { "epoch": 0.04587448231920994, "grad_norm": 0.5259381596250441, "learning_rate": 1.8367346938775512e-06, "loss": 0.2748, "step": 36 }, { "epoch": 0.04714877349474355, "grad_norm": 0.5865301103473198, "learning_rate": 1.8877551020408163e-06, "loss": 0.3044, "step": 37 }, { "epoch": 0.048423064670277155, "grad_norm": 0.5807544236080664, "learning_rate": 1.938775510204082e-06, "loss": 0.2844, "step": 38 }, { "epoch": 0.04969735584581077, "grad_norm": 0.5709358676400673, "learning_rate": 1.989795918367347e-06, "loss": 0.2742, "step": 39 }, { "epoch": 0.05097164702134438, "grad_norm": 0.7047627533657499, "learning_rate": 2.0408163265306125e-06, "loss": 0.3134, "step": 40 }, { "epoch": 0.052245938196877985, "grad_norm": 0.716312478969161, "learning_rate": 2.0918367346938776e-06, "loss": 0.2744, "step": 41 }, { "epoch": 0.0535202293724116, "grad_norm": 0.6787928896628119, "learning_rate": 2.1428571428571427e-06, "loss": 0.3068, "step": 42 }, { "epoch": 0.0547945205479452, "grad_norm": 0.6914907898125237, "learning_rate": 2.1938775510204083e-06, "loss": 0.2722, "step": 43 }, { "epoch": 0.056068811723478815, "grad_norm": 0.580855818705434, "learning_rate": 2.244897959183674e-06, "loss": 0.2454, "step": 44 }, { "epoch": 0.05734310289901243, "grad_norm": 0.580702307033356, "learning_rate": 2.295918367346939e-06, "loss": 0.2793, "step": 45 }, { "epoch": 0.05861739407454603, "grad_norm": 0.5149403164427327, "learning_rate": 2.3469387755102044e-06, "loss": 0.2763, "step": 46 }, { "epoch": 0.059891685250079645, "grad_norm": 0.4690030862608283, "learning_rate": 2.3979591836734696e-06, "loss": 0.2577, "step": 47 }, { "epoch": 0.06116597642561325, "grad_norm": 0.4681844181067997, "learning_rate": 2.4489795918367347e-06, "loss": 0.27, "step": 48 }, { "epoch": 0.06244026760114686, "grad_norm": 0.47097523948274983, "learning_rate": 2.5e-06, "loss": 0.2842, "step": 49 }, { "epoch": 0.06371455877668047, "grad_norm": 0.4743551429705717, "learning_rate": 2.5510204081632657e-06, "loss": 0.2724, "step": 50 }, { "epoch": 0.06498884995221409, "grad_norm": 0.5305391842286102, "learning_rate": 2.602040816326531e-06, "loss": 0.2989, "step": 51 }, { "epoch": 0.06626314112774769, "grad_norm": 0.48976346014610456, "learning_rate": 2.6530612244897964e-06, "loss": 0.2463, "step": 52 }, { "epoch": 0.0675374323032813, "grad_norm": 0.5008625326403693, "learning_rate": 2.7040816326530615e-06, "loss": 0.268, "step": 53 }, { "epoch": 0.06881172347881491, "grad_norm": 0.4752730932423671, "learning_rate": 2.7551020408163266e-06, "loss": 0.2737, "step": 54 }, { "epoch": 0.07008601465434852, "grad_norm": 0.47233101525840143, "learning_rate": 2.8061224489795917e-06, "loss": 0.2555, "step": 55 }, { "epoch": 0.07136030582988213, "grad_norm": 0.4586572680427324, "learning_rate": 2.8571428571428573e-06, "loss": 0.2735, "step": 56 }, { "epoch": 0.07263459700541573, "grad_norm": 0.40777026467357796, "learning_rate": 2.908163265306123e-06, "loss": 0.2405, "step": 57 }, { "epoch": 0.07390888818094934, "grad_norm": 0.4304749229319922, "learning_rate": 2.959183673469388e-06, "loss": 0.2663, "step": 58 }, { "epoch": 0.07518317935648296, "grad_norm": 0.43158949393403995, "learning_rate": 3.0102040816326534e-06, "loss": 0.2661, "step": 59 }, { "epoch": 0.07645747053201657, "grad_norm": 0.44334679618757034, "learning_rate": 3.0612244897959185e-06, "loss": 0.2583, "step": 60 }, { "epoch": 0.07773176170755018, "grad_norm": 0.4291285483817974, "learning_rate": 3.112244897959184e-06, "loss": 0.2557, "step": 61 }, { "epoch": 0.07900605288308378, "grad_norm": 0.43653771107945155, "learning_rate": 3.1632653061224496e-06, "loss": 0.2602, "step": 62 }, { "epoch": 0.08028034405861739, "grad_norm": 0.443884472042199, "learning_rate": 3.2142857142857147e-06, "loss": 0.2487, "step": 63 }, { "epoch": 0.081554635234151, "grad_norm": 0.43329728769740317, "learning_rate": 3.2653061224489794e-06, "loss": 0.2308, "step": 64 }, { "epoch": 0.08282892640968462, "grad_norm": 0.4345153913319177, "learning_rate": 3.316326530612245e-06, "loss": 0.2536, "step": 65 }, { "epoch": 0.08410321758521823, "grad_norm": 0.44730318476750924, "learning_rate": 3.3673469387755105e-06, "loss": 0.254, "step": 66 }, { "epoch": 0.08537750876075183, "grad_norm": 0.47706223986606683, "learning_rate": 3.4183673469387756e-06, "loss": 0.2342, "step": 67 }, { "epoch": 0.08665179993628544, "grad_norm": 0.43927349227572404, "learning_rate": 3.469387755102041e-06, "loss": 0.2626, "step": 68 }, { "epoch": 0.08792609111181905, "grad_norm": 0.41641363846723245, "learning_rate": 3.5204081632653062e-06, "loss": 0.2501, "step": 69 }, { "epoch": 0.08920038228735266, "grad_norm": 0.43257629229618105, "learning_rate": 3.5714285714285718e-06, "loss": 0.2341, "step": 70 }, { "epoch": 0.09047467346288628, "grad_norm": 0.4462370735591886, "learning_rate": 3.6224489795918373e-06, "loss": 0.2556, "step": 71 }, { "epoch": 0.09174896463841987, "grad_norm": 0.4257031835212425, "learning_rate": 3.6734693877551024e-06, "loss": 0.2372, "step": 72 }, { "epoch": 0.09302325581395349, "grad_norm": 0.43814401057027536, "learning_rate": 3.724489795918368e-06, "loss": 0.2652, "step": 73 }, { "epoch": 0.0942975469894871, "grad_norm": 0.44842286786971225, "learning_rate": 3.7755102040816327e-06, "loss": 0.2764, "step": 74 }, { "epoch": 0.09557183816502071, "grad_norm": 0.4473205402806749, "learning_rate": 3.826530612244898e-06, "loss": 0.2558, "step": 75 }, { "epoch": 0.09684612934055431, "grad_norm": 0.4319953781273743, "learning_rate": 3.877551020408164e-06, "loss": 0.268, "step": 76 }, { "epoch": 0.09812042051608792, "grad_norm": 0.418396633300371, "learning_rate": 3.928571428571429e-06, "loss": 0.2375, "step": 77 }, { "epoch": 0.09939471169162153, "grad_norm": 0.42453498681348056, "learning_rate": 3.979591836734694e-06, "loss": 0.2237, "step": 78 }, { "epoch": 0.10066900286715515, "grad_norm": 0.4350796925110571, "learning_rate": 4.03061224489796e-06, "loss": 0.253, "step": 79 }, { "epoch": 0.10194329404268876, "grad_norm": 0.42414092276093845, "learning_rate": 4.081632653061225e-06, "loss": 0.2307, "step": 80 }, { "epoch": 0.10321758521822236, "grad_norm": 0.41527684015401567, "learning_rate": 4.13265306122449e-06, "loss": 0.2379, "step": 81 }, { "epoch": 0.10449187639375597, "grad_norm": 0.41873170219356587, "learning_rate": 4.183673469387755e-06, "loss": 0.2266, "step": 82 }, { "epoch": 0.10576616756928958, "grad_norm": 0.4378860390577355, "learning_rate": 4.234693877551021e-06, "loss": 0.2696, "step": 83 }, { "epoch": 0.1070404587448232, "grad_norm": 0.4310802240342111, "learning_rate": 4.2857142857142855e-06, "loss": 0.2514, "step": 84 }, { "epoch": 0.1083147499203568, "grad_norm": 0.432116144755952, "learning_rate": 4.336734693877551e-06, "loss": 0.2431, "step": 85 }, { "epoch": 0.1095890410958904, "grad_norm": 0.39899804304150294, "learning_rate": 4.3877551020408165e-06, "loss": 0.245, "step": 86 }, { "epoch": 0.11086333227142402, "grad_norm": 0.41483397458813454, "learning_rate": 4.438775510204082e-06, "loss": 0.2267, "step": 87 }, { "epoch": 0.11213762344695763, "grad_norm": 0.4268304149967129, "learning_rate": 4.489795918367348e-06, "loss": 0.2643, "step": 88 }, { "epoch": 0.11341191462249124, "grad_norm": 0.3904350951056931, "learning_rate": 4.540816326530613e-06, "loss": 0.2494, "step": 89 }, { "epoch": 0.11468620579802485, "grad_norm": 0.43729118492201935, "learning_rate": 4.591836734693878e-06, "loss": 0.2535, "step": 90 }, { "epoch": 0.11596049697355845, "grad_norm": 0.39954582263965904, "learning_rate": 4.642857142857144e-06, "loss": 0.2353, "step": 91 }, { "epoch": 0.11723478814909206, "grad_norm": 0.41398581974372695, "learning_rate": 4.693877551020409e-06, "loss": 0.247, "step": 92 }, { "epoch": 0.11850907932462568, "grad_norm": 0.4265095609869584, "learning_rate": 4.744897959183674e-06, "loss": 0.2555, "step": 93 }, { "epoch": 0.11978337050015929, "grad_norm": 0.4093729413202519, "learning_rate": 4.795918367346939e-06, "loss": 0.2445, "step": 94 }, { "epoch": 0.1210576616756929, "grad_norm": 0.42448929573305605, "learning_rate": 4.846938775510204e-06, "loss": 0.226, "step": 95 }, { "epoch": 0.1223319528512265, "grad_norm": 0.4142812482316476, "learning_rate": 4.897959183673469e-06, "loss": 0.2384, "step": 96 }, { "epoch": 0.12360624402676011, "grad_norm": 0.42195077092306715, "learning_rate": 4.948979591836735e-06, "loss": 0.2214, "step": 97 }, { "epoch": 0.12488053520229372, "grad_norm": 0.3971712875942616, "learning_rate": 5e-06, "loss": 0.221, "step": 98 }, { "epoch": 0.12615482637782732, "grad_norm": 0.46395737498401723, "learning_rate": 5.0510204081632655e-06, "loss": 0.2612, "step": 99 }, { "epoch": 0.12742911755336095, "grad_norm": 0.40668197469302936, "learning_rate": 5.1020408163265315e-06, "loss": 0.2387, "step": 100 }, { "epoch": 0.12870340872889455, "grad_norm": 0.43095623433020447, "learning_rate": 5.153061224489796e-06, "loss": 0.2484, "step": 101 }, { "epoch": 0.12997769990442817, "grad_norm": 0.4212821110269773, "learning_rate": 5.204081632653062e-06, "loss": 0.2315, "step": 102 }, { "epoch": 0.13125199107996177, "grad_norm": 0.3959846091109773, "learning_rate": 5.255102040816327e-06, "loss": 0.2337, "step": 103 }, { "epoch": 0.13252628225549537, "grad_norm": 0.4279426516641407, "learning_rate": 5.306122448979593e-06, "loss": 0.2393, "step": 104 }, { "epoch": 0.133800573431029, "grad_norm": 0.4027932693030174, "learning_rate": 5.357142857142857e-06, "loss": 0.2556, "step": 105 }, { "epoch": 0.1350748646065626, "grad_norm": 0.38820069359708675, "learning_rate": 5.408163265306123e-06, "loss": 0.2162, "step": 106 }, { "epoch": 0.13634915578209622, "grad_norm": 0.4582575893803094, "learning_rate": 5.459183673469388e-06, "loss": 0.2632, "step": 107 }, { "epoch": 0.13762344695762982, "grad_norm": 0.40525672509684746, "learning_rate": 5.510204081632653e-06, "loss": 0.2453, "step": 108 }, { "epoch": 0.13889773813316342, "grad_norm": 0.4722345412549772, "learning_rate": 5.561224489795919e-06, "loss": 0.2556, "step": 109 }, { "epoch": 0.14017202930869704, "grad_norm": 0.4217348714280782, "learning_rate": 5.6122448979591834e-06, "loss": 0.2171, "step": 110 }, { "epoch": 0.14144632048423064, "grad_norm": 0.4363741930577028, "learning_rate": 5.663265306122449e-06, "loss": 0.2568, "step": 111 }, { "epoch": 0.14272061165976427, "grad_norm": 0.4159085915856548, "learning_rate": 5.7142857142857145e-06, "loss": 0.2292, "step": 112 }, { "epoch": 0.14399490283529787, "grad_norm": 0.423718574302985, "learning_rate": 5.7653061224489805e-06, "loss": 0.2477, "step": 113 }, { "epoch": 0.14526919401083146, "grad_norm": 0.45200196215345384, "learning_rate": 5.816326530612246e-06, "loss": 0.2545, "step": 114 }, { "epoch": 0.1465434851863651, "grad_norm": 0.3947059950335911, "learning_rate": 5.867346938775511e-06, "loss": 0.2267, "step": 115 }, { "epoch": 0.1478177763618987, "grad_norm": 0.44180425579685495, "learning_rate": 5.918367346938776e-06, "loss": 0.2341, "step": 116 }, { "epoch": 0.14909206753743232, "grad_norm": 0.4616816782070588, "learning_rate": 5.969387755102042e-06, "loss": 0.2616, "step": 117 }, { "epoch": 0.15036635871296591, "grad_norm": 0.41991336277995833, "learning_rate": 6.020408163265307e-06, "loss": 0.2192, "step": 118 }, { "epoch": 0.1516406498884995, "grad_norm": 0.45185293809768096, "learning_rate": 6.071428571428571e-06, "loss": 0.2389, "step": 119 }, { "epoch": 0.15291494106403314, "grad_norm": 0.4441142661560909, "learning_rate": 6.122448979591837e-06, "loss": 0.2336, "step": 120 }, { "epoch": 0.15418923223956674, "grad_norm": 0.41778543461281886, "learning_rate": 6.173469387755102e-06, "loss": 0.272, "step": 121 }, { "epoch": 0.15546352341510036, "grad_norm": 0.42658374315226566, "learning_rate": 6.224489795918368e-06, "loss": 0.2394, "step": 122 }, { "epoch": 0.15673781459063396, "grad_norm": 0.4262085950851368, "learning_rate": 6.275510204081633e-06, "loss": 0.2375, "step": 123 }, { "epoch": 0.15801210576616756, "grad_norm": 0.4594811856362058, "learning_rate": 6.326530612244899e-06, "loss": 0.2641, "step": 124 }, { "epoch": 0.15928639694170119, "grad_norm": 0.3942636412612114, "learning_rate": 6.3775510204081635e-06, "loss": 0.2161, "step": 125 }, { "epoch": 0.16056068811723478, "grad_norm": 0.39867685295525757, "learning_rate": 6.4285714285714295e-06, "loss": 0.221, "step": 126 }, { "epoch": 0.1618349792927684, "grad_norm": 0.4454199231095862, "learning_rate": 6.4795918367346946e-06, "loss": 0.2617, "step": 127 }, { "epoch": 0.163109270468302, "grad_norm": 0.4099701198985473, "learning_rate": 6.530612244897959e-06, "loss": 0.2265, "step": 128 }, { "epoch": 0.1643835616438356, "grad_norm": 0.4249592123037324, "learning_rate": 6.581632653061225e-06, "loss": 0.2429, "step": 129 }, { "epoch": 0.16565785281936923, "grad_norm": 0.3952459423333401, "learning_rate": 6.63265306122449e-06, "loss": 0.2226, "step": 130 }, { "epoch": 0.16693214399490283, "grad_norm": 0.42848072572721546, "learning_rate": 6.683673469387756e-06, "loss": 0.2164, "step": 131 }, { "epoch": 0.16820643517043646, "grad_norm": 0.47276384329671495, "learning_rate": 6.734693877551021e-06, "loss": 0.2249, "step": 132 }, { "epoch": 0.16948072634597006, "grad_norm": 0.42893027989673577, "learning_rate": 6.785714285714287e-06, "loss": 0.229, "step": 133 }, { "epoch": 0.17075501752150365, "grad_norm": 0.4183486956930159, "learning_rate": 6.836734693877551e-06, "loss": 0.2289, "step": 134 }, { "epoch": 0.17202930869703728, "grad_norm": 0.42410596583502363, "learning_rate": 6.887755102040817e-06, "loss": 0.2294, "step": 135 }, { "epoch": 0.17330359987257088, "grad_norm": 0.39390745640873664, "learning_rate": 6.938775510204082e-06, "loss": 0.2393, "step": 136 }, { "epoch": 0.1745778910481045, "grad_norm": 0.4250113845677091, "learning_rate": 6.989795918367348e-06, "loss": 0.2316, "step": 137 }, { "epoch": 0.1758521822236381, "grad_norm": 0.4447753537356294, "learning_rate": 7.0408163265306125e-06, "loss": 0.2326, "step": 138 }, { "epoch": 0.1771264733991717, "grad_norm": 0.43410386750386654, "learning_rate": 7.091836734693878e-06, "loss": 0.2498, "step": 139 }, { "epoch": 0.17840076457470533, "grad_norm": 0.44308494096268847, "learning_rate": 7.1428571428571436e-06, "loss": 0.2421, "step": 140 }, { "epoch": 0.17967505575023893, "grad_norm": 0.42110120878613233, "learning_rate": 7.193877551020409e-06, "loss": 0.2261, "step": 141 }, { "epoch": 0.18094934692577255, "grad_norm": 0.46454124265220936, "learning_rate": 7.244897959183675e-06, "loss": 0.241, "step": 142 }, { "epoch": 0.18222363810130615, "grad_norm": 0.43933684022588243, "learning_rate": 7.295918367346939e-06, "loss": 0.2508, "step": 143 }, { "epoch": 0.18349792927683975, "grad_norm": 0.39300783175915416, "learning_rate": 7.346938775510205e-06, "loss": 0.2179, "step": 144 }, { "epoch": 0.18477222045237338, "grad_norm": 0.41673552211713577, "learning_rate": 7.39795918367347e-06, "loss": 0.218, "step": 145 }, { "epoch": 0.18604651162790697, "grad_norm": 0.42577587531903016, "learning_rate": 7.448979591836736e-06, "loss": 0.2184, "step": 146 }, { "epoch": 0.18732080280344057, "grad_norm": 0.4362318326162692, "learning_rate": 7.500000000000001e-06, "loss": 0.2364, "step": 147 }, { "epoch": 0.1885950939789742, "grad_norm": 0.38888948405102514, "learning_rate": 7.551020408163265e-06, "loss": 0.2103, "step": 148 }, { "epoch": 0.1898693851545078, "grad_norm": 0.45757750194521774, "learning_rate": 7.602040816326531e-06, "loss": 0.2616, "step": 149 }, { "epoch": 0.19114367633004142, "grad_norm": 0.4058992110945755, "learning_rate": 7.653061224489796e-06, "loss": 0.2249, "step": 150 }, { "epoch": 0.19241796750557502, "grad_norm": 0.4762767045615976, "learning_rate": 7.704081632653061e-06, "loss": 0.2205, "step": 151 }, { "epoch": 0.19369225868110862, "grad_norm": 0.40639069929224236, "learning_rate": 7.755102040816327e-06, "loss": 0.2129, "step": 152 }, { "epoch": 0.19496654985664225, "grad_norm": 0.4034232849204309, "learning_rate": 7.806122448979593e-06, "loss": 0.2119, "step": 153 }, { "epoch": 0.19624084103217584, "grad_norm": 0.5178672859990732, "learning_rate": 7.857142857142858e-06, "loss": 0.2669, "step": 154 }, { "epoch": 0.19751513220770947, "grad_norm": 0.45529142651168586, "learning_rate": 7.908163265306124e-06, "loss": 0.2284, "step": 155 }, { "epoch": 0.19878942338324307, "grad_norm": 0.4577505828424473, "learning_rate": 7.959183673469388e-06, "loss": 0.222, "step": 156 }, { "epoch": 0.20006371455877667, "grad_norm": 0.40397544389648876, "learning_rate": 8.010204081632654e-06, "loss": 0.1981, "step": 157 }, { "epoch": 0.2013380057343103, "grad_norm": 0.4017800024963611, "learning_rate": 8.06122448979592e-06, "loss": 0.251, "step": 158 }, { "epoch": 0.2026122969098439, "grad_norm": 0.4756346950405561, "learning_rate": 8.112244897959184e-06, "loss": 0.238, "step": 159 }, { "epoch": 0.20388658808537752, "grad_norm": 0.4473107652158951, "learning_rate": 8.16326530612245e-06, "loss": 0.2485, "step": 160 }, { "epoch": 0.20516087926091112, "grad_norm": 0.4379510394157001, "learning_rate": 8.214285714285714e-06, "loss": 0.217, "step": 161 }, { "epoch": 0.20643517043644471, "grad_norm": 0.51561126269841, "learning_rate": 8.26530612244898e-06, "loss": 0.2343, "step": 162 }, { "epoch": 0.20770946161197834, "grad_norm": 0.4172658202810388, "learning_rate": 8.316326530612246e-06, "loss": 0.2392, "step": 163 }, { "epoch": 0.20898375278751194, "grad_norm": 0.47215311429946616, "learning_rate": 8.36734693877551e-06, "loss": 0.2146, "step": 164 }, { "epoch": 0.21025804396304557, "grad_norm": 0.4831555821681171, "learning_rate": 8.418367346938776e-06, "loss": 0.2372, "step": 165 }, { "epoch": 0.21153233513857916, "grad_norm": 0.4405323327880247, "learning_rate": 8.469387755102042e-06, "loss": 0.2185, "step": 166 }, { "epoch": 0.21280662631411276, "grad_norm": 0.43115459972688663, "learning_rate": 8.520408163265307e-06, "loss": 0.2244, "step": 167 }, { "epoch": 0.2140809174896464, "grad_norm": 0.4297179580960813, "learning_rate": 8.571428571428571e-06, "loss": 0.198, "step": 168 }, { "epoch": 0.21535520866518, "grad_norm": 0.4245904499264464, "learning_rate": 8.622448979591837e-06, "loss": 0.2325, "step": 169 }, { "epoch": 0.2166294998407136, "grad_norm": 0.4625499995004702, "learning_rate": 8.673469387755103e-06, "loss": 0.2349, "step": 170 }, { "epoch": 0.2179037910162472, "grad_norm": 0.4461386621246171, "learning_rate": 8.724489795918369e-06, "loss": 0.2475, "step": 171 }, { "epoch": 0.2191780821917808, "grad_norm": 0.4240571505546288, "learning_rate": 8.775510204081633e-06, "loss": 0.217, "step": 172 }, { "epoch": 0.22045237336731444, "grad_norm": 0.4489113016833761, "learning_rate": 8.826530612244899e-06, "loss": 0.2348, "step": 173 }, { "epoch": 0.22172666454284803, "grad_norm": 0.4370220075966619, "learning_rate": 8.877551020408163e-06, "loss": 0.2344, "step": 174 }, { "epoch": 0.22300095571838166, "grad_norm": 0.4101750946114224, "learning_rate": 8.92857142857143e-06, "loss": 0.2247, "step": 175 }, { "epoch": 0.22427524689391526, "grad_norm": 0.41897759962524445, "learning_rate": 8.979591836734695e-06, "loss": 0.2379, "step": 176 }, { "epoch": 0.22554953806944886, "grad_norm": 0.422278778234788, "learning_rate": 9.03061224489796e-06, "loss": 0.2394, "step": 177 }, { "epoch": 0.22682382924498248, "grad_norm": 0.4263263682577897, "learning_rate": 9.081632653061225e-06, "loss": 0.2398, "step": 178 }, { "epoch": 0.22809812042051608, "grad_norm": 0.4094907298899685, "learning_rate": 9.13265306122449e-06, "loss": 0.2428, "step": 179 }, { "epoch": 0.2293724115960497, "grad_norm": 0.47511382898447374, "learning_rate": 9.183673469387756e-06, "loss": 0.2459, "step": 180 }, { "epoch": 0.2306467027715833, "grad_norm": 0.4244800446244923, "learning_rate": 9.234693877551022e-06, "loss": 0.2327, "step": 181 }, { "epoch": 0.2319209939471169, "grad_norm": 0.4345668033403279, "learning_rate": 9.285714285714288e-06, "loss": 0.2188, "step": 182 }, { "epoch": 0.23319528512265053, "grad_norm": 0.43163096277188406, "learning_rate": 9.336734693877552e-06, "loss": 0.2386, "step": 183 }, { "epoch": 0.23446957629818413, "grad_norm": 0.42270381973963433, "learning_rate": 9.387755102040818e-06, "loss": 0.2143, "step": 184 }, { "epoch": 0.23574386747371776, "grad_norm": 0.44034233994500255, "learning_rate": 9.438775510204082e-06, "loss": 0.2445, "step": 185 }, { "epoch": 0.23701815864925135, "grad_norm": 0.42577103371926844, "learning_rate": 9.489795918367348e-06, "loss": 0.206, "step": 186 }, { "epoch": 0.23829244982478495, "grad_norm": 0.4692331524521377, "learning_rate": 9.540816326530612e-06, "loss": 0.2184, "step": 187 }, { "epoch": 0.23956674100031858, "grad_norm": 0.45469783344216436, "learning_rate": 9.591836734693878e-06, "loss": 0.2268, "step": 188 }, { "epoch": 0.24084103217585218, "grad_norm": 0.43779489829913076, "learning_rate": 9.642857142857144e-06, "loss": 0.2202, "step": 189 }, { "epoch": 0.2421153233513858, "grad_norm": 0.47357623129181153, "learning_rate": 9.693877551020408e-06, "loss": 0.2377, "step": 190 }, { "epoch": 0.2433896145269194, "grad_norm": 0.4804806294852291, "learning_rate": 9.744897959183674e-06, "loss": 0.243, "step": 191 }, { "epoch": 0.244663905702453, "grad_norm": 0.46019267153580407, "learning_rate": 9.795918367346939e-06, "loss": 0.2425, "step": 192 }, { "epoch": 0.24593819687798663, "grad_norm": 0.4216623910374409, "learning_rate": 9.846938775510205e-06, "loss": 0.2358, "step": 193 }, { "epoch": 0.24721248805352022, "grad_norm": 0.39180777187054516, "learning_rate": 9.89795918367347e-06, "loss": 0.2015, "step": 194 }, { "epoch": 0.24848677922905385, "grad_norm": 0.4285237551686229, "learning_rate": 9.948979591836737e-06, "loss": 0.2051, "step": 195 }, { "epoch": 0.24976107040458745, "grad_norm": 0.42321131099284687, "learning_rate": 1e-05, "loss": 0.1972, "step": 196 }, { "epoch": 0.2510353615801211, "grad_norm": 0.43475027600962407, "learning_rate": 1.0051020408163265e-05, "loss": 0.2325, "step": 197 }, { "epoch": 0.25230965275565465, "grad_norm": 0.4406247192208196, "learning_rate": 1.0102040816326531e-05, "loss": 0.2337, "step": 198 }, { "epoch": 0.25358394393118827, "grad_norm": 0.419133152381463, "learning_rate": 1.0153061224489797e-05, "loss": 0.22, "step": 199 }, { "epoch": 0.2548582351067219, "grad_norm": 0.39579609971638097, "learning_rate": 1.0204081632653063e-05, "loss": 0.2318, "step": 200 }, { "epoch": 0.25613252628225547, "grad_norm": 0.4317348920140541, "learning_rate": 1.0255102040816327e-05, "loss": 0.2473, "step": 201 }, { "epoch": 0.2574068174577891, "grad_norm": 0.43300118324511194, "learning_rate": 1.0306122448979591e-05, "loss": 0.2435, "step": 202 }, { "epoch": 0.2586811086333227, "grad_norm": 0.4141125794488895, "learning_rate": 1.0357142857142859e-05, "loss": 0.2349, "step": 203 }, { "epoch": 0.25995539980885635, "grad_norm": 0.4152345483654535, "learning_rate": 1.0408163265306123e-05, "loss": 0.2082, "step": 204 }, { "epoch": 0.2612296909843899, "grad_norm": 0.4081967076472384, "learning_rate": 1.045918367346939e-05, "loss": 0.2085, "step": 205 }, { "epoch": 0.26250398215992354, "grad_norm": 0.45051078008532286, "learning_rate": 1.0510204081632654e-05, "loss": 0.2293, "step": 206 }, { "epoch": 0.26377827333545717, "grad_norm": 0.42411045346141396, "learning_rate": 1.0561224489795918e-05, "loss": 0.2471, "step": 207 }, { "epoch": 0.26505256451099074, "grad_norm": 0.43346604958491347, "learning_rate": 1.0612244897959186e-05, "loss": 0.2257, "step": 208 }, { "epoch": 0.26632685568652437, "grad_norm": 0.40832788867769154, "learning_rate": 1.066326530612245e-05, "loss": 0.2006, "step": 209 }, { "epoch": 0.267601146862058, "grad_norm": 0.4408636817456974, "learning_rate": 1.0714285714285714e-05, "loss": 0.2677, "step": 210 }, { "epoch": 0.26887543803759156, "grad_norm": 0.42641989002134156, "learning_rate": 1.076530612244898e-05, "loss": 0.2419, "step": 211 }, { "epoch": 0.2701497292131252, "grad_norm": 0.46743381287307156, "learning_rate": 1.0816326530612246e-05, "loss": 0.2241, "step": 212 }, { "epoch": 0.2714240203886588, "grad_norm": 0.4298133385464412, "learning_rate": 1.0867346938775512e-05, "loss": 0.2172, "step": 213 }, { "epoch": 0.27269831156419244, "grad_norm": 0.4530257896841251, "learning_rate": 1.0918367346938776e-05, "loss": 0.2525, "step": 214 }, { "epoch": 0.273972602739726, "grad_norm": 0.43741036736536665, "learning_rate": 1.096938775510204e-05, "loss": 0.2303, "step": 215 }, { "epoch": 0.27524689391525964, "grad_norm": 0.4682562035051021, "learning_rate": 1.1020408163265306e-05, "loss": 0.2517, "step": 216 }, { "epoch": 0.27652118509079326, "grad_norm": 0.4460921906348255, "learning_rate": 1.1071428571428572e-05, "loss": 0.2211, "step": 217 }, { "epoch": 0.27779547626632684, "grad_norm": 0.47435913980552596, "learning_rate": 1.1122448979591838e-05, "loss": 0.2467, "step": 218 }, { "epoch": 0.27906976744186046, "grad_norm": 0.4959344232255165, "learning_rate": 1.1173469387755103e-05, "loss": 0.2658, "step": 219 }, { "epoch": 0.2803440586173941, "grad_norm": 0.417000906173577, "learning_rate": 1.1224489795918367e-05, "loss": 0.23, "step": 220 }, { "epoch": 0.28161834979292766, "grad_norm": 0.4598815868500389, "learning_rate": 1.1275510204081635e-05, "loss": 0.2248, "step": 221 }, { "epoch": 0.2828926409684613, "grad_norm": 0.407570261885253, "learning_rate": 1.1326530612244899e-05, "loss": 0.2151, "step": 222 }, { "epoch": 0.2841669321439949, "grad_norm": 0.40684754012053387, "learning_rate": 1.1377551020408165e-05, "loss": 0.2168, "step": 223 }, { "epoch": 0.28544122331952854, "grad_norm": 0.471545279794649, "learning_rate": 1.1428571428571429e-05, "loss": 0.2446, "step": 224 }, { "epoch": 0.2867155144950621, "grad_norm": 0.44904322948135916, "learning_rate": 1.1479591836734697e-05, "loss": 0.2263, "step": 225 }, { "epoch": 0.28798980567059573, "grad_norm": 0.4077875540090231, "learning_rate": 1.1530612244897961e-05, "loss": 0.2282, "step": 226 }, { "epoch": 0.28926409684612936, "grad_norm": 0.40993116529816864, "learning_rate": 1.1581632653061225e-05, "loss": 0.2088, "step": 227 }, { "epoch": 0.29053838802166293, "grad_norm": 0.49721802742004506, "learning_rate": 1.1632653061224491e-05, "loss": 0.2546, "step": 228 }, { "epoch": 0.29181267919719656, "grad_norm": 0.4066550608344539, "learning_rate": 1.1683673469387755e-05, "loss": 0.2271, "step": 229 }, { "epoch": 0.2930869703727302, "grad_norm": 0.45041928884019616, "learning_rate": 1.1734693877551021e-05, "loss": 0.2437, "step": 230 }, { "epoch": 0.29436126154826375, "grad_norm": 0.37827341091287786, "learning_rate": 1.1785714285714287e-05, "loss": 0.1971, "step": 231 }, { "epoch": 0.2956355527237974, "grad_norm": 0.40977190638423455, "learning_rate": 1.1836734693877552e-05, "loss": 0.2322, "step": 232 }, { "epoch": 0.296909843899331, "grad_norm": 0.3779527242034764, "learning_rate": 1.1887755102040816e-05, "loss": 0.2095, "step": 233 }, { "epoch": 0.29818413507486463, "grad_norm": 0.4285362571155404, "learning_rate": 1.1938775510204084e-05, "loss": 0.2322, "step": 234 }, { "epoch": 0.2994584262503982, "grad_norm": 0.3891671566500837, "learning_rate": 1.1989795918367348e-05, "loss": 0.213, "step": 235 }, { "epoch": 0.30073271742593183, "grad_norm": 0.41208429081378795, "learning_rate": 1.2040816326530614e-05, "loss": 0.2266, "step": 236 }, { "epoch": 0.30200700860146545, "grad_norm": 0.38555927410457097, "learning_rate": 1.2091836734693878e-05, "loss": 0.2189, "step": 237 }, { "epoch": 0.303281299776999, "grad_norm": 0.4201348327580863, "learning_rate": 1.2142857142857142e-05, "loss": 0.238, "step": 238 }, { "epoch": 0.30455559095253265, "grad_norm": 0.4267308769966644, "learning_rate": 1.219387755102041e-05, "loss": 0.2082, "step": 239 }, { "epoch": 0.3058298821280663, "grad_norm": 0.41529012965905493, "learning_rate": 1.2244897959183674e-05, "loss": 0.2075, "step": 240 }, { "epoch": 0.30710417330359985, "grad_norm": 0.42267852439118603, "learning_rate": 1.229591836734694e-05, "loss": 0.236, "step": 241 }, { "epoch": 0.3083784644791335, "grad_norm": 0.4312128359693889, "learning_rate": 1.2346938775510204e-05, "loss": 0.2421, "step": 242 }, { "epoch": 0.3096527556546671, "grad_norm": 0.42630266706686754, "learning_rate": 1.2397959183673472e-05, "loss": 0.2268, "step": 243 }, { "epoch": 0.3109270468302007, "grad_norm": 0.4248845893162084, "learning_rate": 1.2448979591836736e-05, "loss": 0.2245, "step": 244 }, { "epoch": 0.3122013380057343, "grad_norm": 0.43940633335700785, "learning_rate": 1.25e-05, "loss": 0.232, "step": 245 }, { "epoch": 0.3134756291812679, "grad_norm": 0.4274297428916915, "learning_rate": 1.2551020408163267e-05, "loss": 0.2291, "step": 246 }, { "epoch": 0.31474992035680155, "grad_norm": 0.44272200941481504, "learning_rate": 1.260204081632653e-05, "loss": 0.2277, "step": 247 }, { "epoch": 0.3160242115323351, "grad_norm": 0.4238827601708107, "learning_rate": 1.2653061224489798e-05, "loss": 0.2317, "step": 248 }, { "epoch": 0.31729850270786875, "grad_norm": 0.4286886136568041, "learning_rate": 1.2704081632653063e-05, "loss": 0.2215, "step": 249 }, { "epoch": 0.31857279388340237, "grad_norm": 0.44325789418969463, "learning_rate": 1.2755102040816327e-05, "loss": 0.2311, "step": 250 }, { "epoch": 0.31984708505893594, "grad_norm": 0.49361417405125707, "learning_rate": 1.2806122448979591e-05, "loss": 0.2385, "step": 251 }, { "epoch": 0.32112137623446957, "grad_norm": 0.4154712303408394, "learning_rate": 1.2857142857142859e-05, "loss": 0.2432, "step": 252 }, { "epoch": 0.3223956674100032, "grad_norm": 0.417022290111515, "learning_rate": 1.2908163265306123e-05, "loss": 0.2247, "step": 253 }, { "epoch": 0.3236699585855368, "grad_norm": 0.4587353259061068, "learning_rate": 1.2959183673469389e-05, "loss": 0.2324, "step": 254 }, { "epoch": 0.3249442497610704, "grad_norm": 0.40008328609617516, "learning_rate": 1.3010204081632653e-05, "loss": 0.2145, "step": 255 }, { "epoch": 0.326218540936604, "grad_norm": 0.4086653556277809, "learning_rate": 1.3061224489795918e-05, "loss": 0.2141, "step": 256 }, { "epoch": 0.32749283211213764, "grad_norm": 0.4345125968683316, "learning_rate": 1.3112244897959185e-05, "loss": 0.1954, "step": 257 }, { "epoch": 0.3287671232876712, "grad_norm": 0.4838407464613246, "learning_rate": 1.316326530612245e-05, "loss": 0.2199, "step": 258 }, { "epoch": 0.33004141446320484, "grad_norm": 0.4270941852883788, "learning_rate": 1.3214285714285716e-05, "loss": 0.2189, "step": 259 }, { "epoch": 0.33131570563873847, "grad_norm": 0.4254129412897361, "learning_rate": 1.326530612244898e-05, "loss": 0.2175, "step": 260 }, { "epoch": 0.33258999681427204, "grad_norm": 0.44585484745761256, "learning_rate": 1.3316326530612247e-05, "loss": 0.2275, "step": 261 }, { "epoch": 0.33386428798980566, "grad_norm": 0.4360936211469829, "learning_rate": 1.3367346938775512e-05, "loss": 0.2378, "step": 262 }, { "epoch": 0.3351385791653393, "grad_norm": 0.37012161638655966, "learning_rate": 1.3418367346938776e-05, "loss": 0.1958, "step": 263 }, { "epoch": 0.3364128703408729, "grad_norm": 0.4300650809140122, "learning_rate": 1.3469387755102042e-05, "loss": 0.2218, "step": 264 }, { "epoch": 0.3376871615164065, "grad_norm": 0.40726494800178104, "learning_rate": 1.3520408163265306e-05, "loss": 0.2025, "step": 265 }, { "epoch": 0.3389614526919401, "grad_norm": 0.4222069530845393, "learning_rate": 1.3571428571428574e-05, "loss": 0.2267, "step": 266 }, { "epoch": 0.34023574386747374, "grad_norm": 0.3872719021881651, "learning_rate": 1.3622448979591838e-05, "loss": 0.21, "step": 267 }, { "epoch": 0.3415100350430073, "grad_norm": 0.4141025939204291, "learning_rate": 1.3673469387755102e-05, "loss": 0.2383, "step": 268 }, { "epoch": 0.34278432621854094, "grad_norm": 0.400178487847877, "learning_rate": 1.3724489795918368e-05, "loss": 0.224, "step": 269 }, { "epoch": 0.34405861739407456, "grad_norm": 0.4033589939266943, "learning_rate": 1.3775510204081634e-05, "loss": 0.216, "step": 270 }, { "epoch": 0.34533290856960813, "grad_norm": 0.3870661031247386, "learning_rate": 1.38265306122449e-05, "loss": 0.226, "step": 271 }, { "epoch": 0.34660719974514176, "grad_norm": 0.4414124794520188, "learning_rate": 1.3877551020408165e-05, "loss": 0.2671, "step": 272 }, { "epoch": 0.3478814909206754, "grad_norm": 0.37711420426568676, "learning_rate": 1.3928571428571429e-05, "loss": 0.1982, "step": 273 }, { "epoch": 0.349155782096209, "grad_norm": 0.39988903451391217, "learning_rate": 1.3979591836734696e-05, "loss": 0.233, "step": 274 }, { "epoch": 0.3504300732717426, "grad_norm": 0.3858667604009974, "learning_rate": 1.403061224489796e-05, "loss": 0.1919, "step": 275 }, { "epoch": 0.3517043644472762, "grad_norm": 0.4007078202465616, "learning_rate": 1.4081632653061225e-05, "loss": 0.233, "step": 276 }, { "epoch": 0.35297865562280983, "grad_norm": 0.3898565337171368, "learning_rate": 1.4132653061224491e-05, "loss": 0.2088, "step": 277 }, { "epoch": 0.3542529467983434, "grad_norm": 0.39154734343591674, "learning_rate": 1.4183673469387755e-05, "loss": 0.2071, "step": 278 }, { "epoch": 0.35552723797387703, "grad_norm": 0.4629553155706541, "learning_rate": 1.4234693877551023e-05, "loss": 0.237, "step": 279 }, { "epoch": 0.35680152914941066, "grad_norm": 0.37683998459766, "learning_rate": 1.4285714285714287e-05, "loss": 0.2094, "step": 280 }, { "epoch": 0.3580758203249442, "grad_norm": 0.38940658094811215, "learning_rate": 1.4336734693877551e-05, "loss": 0.2395, "step": 281 }, { "epoch": 0.35935011150047785, "grad_norm": 0.38073969521293083, "learning_rate": 1.4387755102040817e-05, "loss": 0.1964, "step": 282 }, { "epoch": 0.3606244026760115, "grad_norm": 0.45870148154336937, "learning_rate": 1.4438775510204083e-05, "loss": 0.2491, "step": 283 }, { "epoch": 0.3618986938515451, "grad_norm": 0.40994464266127617, "learning_rate": 1.448979591836735e-05, "loss": 0.2032, "step": 284 }, { "epoch": 0.3631729850270787, "grad_norm": 0.39953387811182306, "learning_rate": 1.4540816326530614e-05, "loss": 0.2154, "step": 285 }, { "epoch": 0.3644472762026123, "grad_norm": 0.3940256088444142, "learning_rate": 1.4591836734693878e-05, "loss": 0.2147, "step": 286 }, { "epoch": 0.36572156737814593, "grad_norm": 0.406714846661028, "learning_rate": 1.4642857142857144e-05, "loss": 0.227, "step": 287 }, { "epoch": 0.3669958585536795, "grad_norm": 0.4367963848915288, "learning_rate": 1.469387755102041e-05, "loss": 0.2087, "step": 288 }, { "epoch": 0.3682701497292131, "grad_norm": 0.4094395873134934, "learning_rate": 1.4744897959183676e-05, "loss": 0.2371, "step": 289 }, { "epoch": 0.36954444090474675, "grad_norm": 0.4277900098446667, "learning_rate": 1.479591836734694e-05, "loss": 0.2276, "step": 290 }, { "epoch": 0.3708187320802803, "grad_norm": 0.41251736084434865, "learning_rate": 1.4846938775510204e-05, "loss": 0.2254, "step": 291 }, { "epoch": 0.37209302325581395, "grad_norm": 0.3683550438934192, "learning_rate": 1.4897959183673472e-05, "loss": 0.1996, "step": 292 }, { "epoch": 0.3733673144313476, "grad_norm": 0.40056621338332193, "learning_rate": 1.4948979591836736e-05, "loss": 0.2339, "step": 293 }, { "epoch": 0.37464160560688115, "grad_norm": 0.43779482309734447, "learning_rate": 1.5000000000000002e-05, "loss": 0.2177, "step": 294 }, { "epoch": 0.37591589678241477, "grad_norm": 0.37971393643547363, "learning_rate": 1.5051020408163266e-05, "loss": 0.1937, "step": 295 }, { "epoch": 0.3771901879579484, "grad_norm": 0.414311189063705, "learning_rate": 1.510204081632653e-05, "loss": 0.2219, "step": 296 }, { "epoch": 0.378464479133482, "grad_norm": 0.4298480385401111, "learning_rate": 1.5153061224489798e-05, "loss": 0.2256, "step": 297 }, { "epoch": 0.3797387703090156, "grad_norm": 0.4463350692894898, "learning_rate": 1.5204081632653063e-05, "loss": 0.2521, "step": 298 }, { "epoch": 0.3810130614845492, "grad_norm": 0.40106153201966066, "learning_rate": 1.5255102040816327e-05, "loss": 0.2228, "step": 299 }, { "epoch": 0.38228735266008285, "grad_norm": 0.4114154903070887, "learning_rate": 1.530612244897959e-05, "loss": 0.2067, "step": 300 }, { "epoch": 0.3835616438356164, "grad_norm": 0.4377814576818658, "learning_rate": 1.535714285714286e-05, "loss": 0.2263, "step": 301 }, { "epoch": 0.38483593501115004, "grad_norm": 0.39764348960068696, "learning_rate": 1.5408163265306123e-05, "loss": 0.2327, "step": 302 }, { "epoch": 0.38611022618668367, "grad_norm": 0.4047141074540197, "learning_rate": 1.545918367346939e-05, "loss": 0.2331, "step": 303 }, { "epoch": 0.38738451736221724, "grad_norm": 0.3987445177708362, "learning_rate": 1.5510204081632655e-05, "loss": 0.2024, "step": 304 }, { "epoch": 0.38865880853775087, "grad_norm": 0.4171832434577859, "learning_rate": 1.556122448979592e-05, "loss": 0.2314, "step": 305 }, { "epoch": 0.3899330997132845, "grad_norm": 0.4105254120360544, "learning_rate": 1.5612244897959187e-05, "loss": 0.214, "step": 306 }, { "epoch": 0.3912073908888181, "grad_norm": 0.400770148439822, "learning_rate": 1.566326530612245e-05, "loss": 0.2202, "step": 307 }, { "epoch": 0.3924816820643517, "grad_norm": 0.40635600510152897, "learning_rate": 1.5714285714285715e-05, "loss": 0.2474, "step": 308 }, { "epoch": 0.3937559732398853, "grad_norm": 0.4055288268154526, "learning_rate": 1.576530612244898e-05, "loss": 0.2348, "step": 309 }, { "epoch": 0.39503026441541894, "grad_norm": 0.4597566360618283, "learning_rate": 1.5816326530612247e-05, "loss": 0.2401, "step": 310 }, { "epoch": 0.3963045555909525, "grad_norm": 0.3637213963213138, "learning_rate": 1.586734693877551e-05, "loss": 0.2085, "step": 311 }, { "epoch": 0.39757884676648614, "grad_norm": 0.39334539004134467, "learning_rate": 1.5918367346938776e-05, "loss": 0.2195, "step": 312 }, { "epoch": 0.39885313794201976, "grad_norm": 0.3980031875338257, "learning_rate": 1.596938775510204e-05, "loss": 0.2278, "step": 313 }, { "epoch": 0.40012742911755333, "grad_norm": 0.38344336732062084, "learning_rate": 1.6020408163265308e-05, "loss": 0.2228, "step": 314 }, { "epoch": 0.40140172029308696, "grad_norm": 0.3726459426766659, "learning_rate": 1.6071428571428572e-05, "loss": 0.2321, "step": 315 }, { "epoch": 0.4026760114686206, "grad_norm": 0.3973366613410824, "learning_rate": 1.612244897959184e-05, "loss": 0.2147, "step": 316 }, { "epoch": 0.4039503026441542, "grad_norm": 0.44767918743315616, "learning_rate": 1.6173469387755104e-05, "loss": 0.2554, "step": 317 }, { "epoch": 0.4052245938196878, "grad_norm": 0.3911340882553635, "learning_rate": 1.6224489795918368e-05, "loss": 0.2332, "step": 318 }, { "epoch": 0.4064988849952214, "grad_norm": 0.3570505329630474, "learning_rate": 1.6275510204081636e-05, "loss": 0.2002, "step": 319 }, { "epoch": 0.40777317617075504, "grad_norm": 0.41388556326276166, "learning_rate": 1.63265306122449e-05, "loss": 0.2273, "step": 320 }, { "epoch": 0.4090474673462886, "grad_norm": 0.4152593848721792, "learning_rate": 1.6377551020408164e-05, "loss": 0.2147, "step": 321 }, { "epoch": 0.41032175852182223, "grad_norm": 0.4001289088757174, "learning_rate": 1.642857142857143e-05, "loss": 0.2109, "step": 322 }, { "epoch": 0.41159604969735586, "grad_norm": 0.39900172967265807, "learning_rate": 1.6479591836734696e-05, "loss": 0.2284, "step": 323 }, { "epoch": 0.41287034087288943, "grad_norm": 0.4173178470866038, "learning_rate": 1.653061224489796e-05, "loss": 0.2177, "step": 324 }, { "epoch": 0.41414463204842306, "grad_norm": 0.4414645420748386, "learning_rate": 1.6581632653061225e-05, "loss": 0.2249, "step": 325 }, { "epoch": 0.4154189232239567, "grad_norm": 0.4105849164442034, "learning_rate": 1.6632653061224492e-05, "loss": 0.2349, "step": 326 }, { "epoch": 0.4166932143994903, "grad_norm": 0.4396968936108067, "learning_rate": 1.6683673469387757e-05, "loss": 0.2471, "step": 327 }, { "epoch": 0.4179675055750239, "grad_norm": 0.421615785705994, "learning_rate": 1.673469387755102e-05, "loss": 0.2179, "step": 328 }, { "epoch": 0.4192417967505575, "grad_norm": 0.3954715671666931, "learning_rate": 1.678571428571429e-05, "loss": 0.2236, "step": 329 }, { "epoch": 0.42051608792609113, "grad_norm": 0.4042072862425245, "learning_rate": 1.6836734693877553e-05, "loss": 0.2294, "step": 330 }, { "epoch": 0.4217903791016247, "grad_norm": 0.43605412559894957, "learning_rate": 1.6887755102040817e-05, "loss": 0.2277, "step": 331 }, { "epoch": 0.42306467027715833, "grad_norm": 0.3573892563531731, "learning_rate": 1.6938775510204085e-05, "loss": 0.2036, "step": 332 }, { "epoch": 0.42433896145269195, "grad_norm": 0.3632076793951797, "learning_rate": 1.698979591836735e-05, "loss": 0.2107, "step": 333 }, { "epoch": 0.4256132526282255, "grad_norm": 0.41883341625654014, "learning_rate": 1.7040816326530613e-05, "loss": 0.2427, "step": 334 }, { "epoch": 0.42688754380375915, "grad_norm": 0.39972719219802944, "learning_rate": 1.7091836734693878e-05, "loss": 0.2366, "step": 335 }, { "epoch": 0.4281618349792928, "grad_norm": 0.39327774816984284, "learning_rate": 1.7142857142857142e-05, "loss": 0.2167, "step": 336 }, { "epoch": 0.4294361261548264, "grad_norm": 0.427612456712139, "learning_rate": 1.719387755102041e-05, "loss": 0.2262, "step": 337 }, { "epoch": 0.43071041733036, "grad_norm": 0.3972351279219083, "learning_rate": 1.7244897959183674e-05, "loss": 0.2123, "step": 338 }, { "epoch": 0.4319847085058936, "grad_norm": 0.3904545620480958, "learning_rate": 1.729591836734694e-05, "loss": 0.2049, "step": 339 }, { "epoch": 0.4332589996814272, "grad_norm": 0.40127048002540067, "learning_rate": 1.7346938775510206e-05, "loss": 0.2275, "step": 340 }, { "epoch": 0.4345332908569608, "grad_norm": 0.39653273486127344, "learning_rate": 1.7397959183673473e-05, "loss": 0.2132, "step": 341 }, { "epoch": 0.4358075820324944, "grad_norm": 0.40082297721462407, "learning_rate": 1.7448979591836738e-05, "loss": 0.2377, "step": 342 }, { "epoch": 0.43708187320802805, "grad_norm": 0.37441247686370405, "learning_rate": 1.7500000000000002e-05, "loss": 0.2133, "step": 343 }, { "epoch": 0.4383561643835616, "grad_norm": 0.3946095731249267, "learning_rate": 1.7551020408163266e-05, "loss": 0.2108, "step": 344 }, { "epoch": 0.43963045555909525, "grad_norm": 0.3506018519483648, "learning_rate": 1.760204081632653e-05, "loss": 0.194, "step": 345 }, { "epoch": 0.44090474673462887, "grad_norm": 0.38060804848516816, "learning_rate": 1.7653061224489798e-05, "loss": 0.2011, "step": 346 }, { "epoch": 0.4421790379101625, "grad_norm": 0.4163504902773949, "learning_rate": 1.7704081632653062e-05, "loss": 0.2226, "step": 347 }, { "epoch": 0.44345332908569607, "grad_norm": 0.4035022550229339, "learning_rate": 1.7755102040816327e-05, "loss": 0.232, "step": 348 }, { "epoch": 0.4447276202612297, "grad_norm": 0.36869076458629607, "learning_rate": 1.780612244897959e-05, "loss": 0.2059, "step": 349 }, { "epoch": 0.4460019114367633, "grad_norm": 0.38651212053238704, "learning_rate": 1.785714285714286e-05, "loss": 0.2075, "step": 350 }, { "epoch": 0.4472762026122969, "grad_norm": 0.3825132339828071, "learning_rate": 1.7908163265306123e-05, "loss": 0.1999, "step": 351 }, { "epoch": 0.4485504937878305, "grad_norm": 0.3569948546479903, "learning_rate": 1.795918367346939e-05, "loss": 0.2001, "step": 352 }, { "epoch": 0.44982478496336414, "grad_norm": 0.4231122899046842, "learning_rate": 1.8010204081632655e-05, "loss": 0.2293, "step": 353 }, { "epoch": 0.4510990761388977, "grad_norm": 0.39723669553118934, "learning_rate": 1.806122448979592e-05, "loss": 0.2307, "step": 354 }, { "epoch": 0.45237336731443134, "grad_norm": 0.377545602374054, "learning_rate": 1.8112244897959187e-05, "loss": 0.233, "step": 355 }, { "epoch": 0.45364765848996497, "grad_norm": 0.44673228291624256, "learning_rate": 1.816326530612245e-05, "loss": 0.2597, "step": 356 }, { "epoch": 0.4549219496654986, "grad_norm": 0.41290785571645544, "learning_rate": 1.8214285714285715e-05, "loss": 0.2435, "step": 357 }, { "epoch": 0.45619624084103216, "grad_norm": 0.3499771390283294, "learning_rate": 1.826530612244898e-05, "loss": 0.2129, "step": 358 }, { "epoch": 0.4574705320165658, "grad_norm": 0.36300429847969845, "learning_rate": 1.8316326530612247e-05, "loss": 0.2055, "step": 359 }, { "epoch": 0.4587448231920994, "grad_norm": 0.3894968220791071, "learning_rate": 1.836734693877551e-05, "loss": 0.2242, "step": 360 }, { "epoch": 0.460019114367633, "grad_norm": 0.36569437720804127, "learning_rate": 1.8418367346938776e-05, "loss": 0.2189, "step": 361 }, { "epoch": 0.4612934055431666, "grad_norm": 0.37333054014051054, "learning_rate": 1.8469387755102043e-05, "loss": 0.2146, "step": 362 }, { "epoch": 0.46256769671870024, "grad_norm": 0.37638014079774845, "learning_rate": 1.8520408163265307e-05, "loss": 0.2484, "step": 363 }, { "epoch": 0.4638419878942338, "grad_norm": 0.37521590395337817, "learning_rate": 1.8571428571428575e-05, "loss": 0.2181, "step": 364 }, { "epoch": 0.46511627906976744, "grad_norm": 0.42234979803847983, "learning_rate": 1.862244897959184e-05, "loss": 0.221, "step": 365 }, { "epoch": 0.46639057024530106, "grad_norm": 0.4106258446550735, "learning_rate": 1.8673469387755104e-05, "loss": 0.2357, "step": 366 }, { "epoch": 0.4676648614208347, "grad_norm": 0.3870990756155334, "learning_rate": 1.8724489795918368e-05, "loss": 0.2327, "step": 367 }, { "epoch": 0.46893915259636826, "grad_norm": 0.36946179409387814, "learning_rate": 1.8775510204081636e-05, "loss": 0.2076, "step": 368 }, { "epoch": 0.4702134437719019, "grad_norm": 0.35648039036991774, "learning_rate": 1.88265306122449e-05, "loss": 0.2067, "step": 369 }, { "epoch": 0.4714877349474355, "grad_norm": 0.3896150431755743, "learning_rate": 1.8877551020408164e-05, "loss": 0.2179, "step": 370 }, { "epoch": 0.4727620261229691, "grad_norm": 0.3840593016837107, "learning_rate": 1.892857142857143e-05, "loss": 0.206, "step": 371 }, { "epoch": 0.4740363172985027, "grad_norm": 0.378879696280546, "learning_rate": 1.8979591836734696e-05, "loss": 0.2208, "step": 372 }, { "epoch": 0.47531060847403633, "grad_norm": 0.3473009130002769, "learning_rate": 1.903061224489796e-05, "loss": 0.2106, "step": 373 }, { "epoch": 0.4765848996495699, "grad_norm": 0.36968342306503144, "learning_rate": 1.9081632653061225e-05, "loss": 0.2249, "step": 374 }, { "epoch": 0.47785919082510353, "grad_norm": 0.3743218371314938, "learning_rate": 1.9132653061224492e-05, "loss": 0.2154, "step": 375 }, { "epoch": 0.47913348200063716, "grad_norm": 0.3761219965113974, "learning_rate": 1.9183673469387756e-05, "loss": 0.2184, "step": 376 }, { "epoch": 0.4804077731761708, "grad_norm": 0.35393271314878416, "learning_rate": 1.9234693877551024e-05, "loss": 0.2145, "step": 377 }, { "epoch": 0.48168206435170435, "grad_norm": 0.36060287151070336, "learning_rate": 1.928571428571429e-05, "loss": 0.204, "step": 378 }, { "epoch": 0.482956355527238, "grad_norm": 0.34034704701893354, "learning_rate": 1.9336734693877553e-05, "loss": 0.1895, "step": 379 }, { "epoch": 0.4842306467027716, "grad_norm": 0.3814117563193966, "learning_rate": 1.9387755102040817e-05, "loss": 0.2049, "step": 380 }, { "epoch": 0.4855049378783052, "grad_norm": 0.3739418819055641, "learning_rate": 1.9438775510204085e-05, "loss": 0.2114, "step": 381 }, { "epoch": 0.4867792290538388, "grad_norm": 0.37254149312295526, "learning_rate": 1.948979591836735e-05, "loss": 0.204, "step": 382 }, { "epoch": 0.48805352022937243, "grad_norm": 0.4034240027175894, "learning_rate": 1.9540816326530613e-05, "loss": 0.2187, "step": 383 }, { "epoch": 0.489327811404906, "grad_norm": 0.42614291155370376, "learning_rate": 1.9591836734693877e-05, "loss": 0.2532, "step": 384 }, { "epoch": 0.4906021025804396, "grad_norm": 0.38742027160331616, "learning_rate": 1.9642857142857145e-05, "loss": 0.2316, "step": 385 }, { "epoch": 0.49187639375597325, "grad_norm": 0.39573741829279646, "learning_rate": 1.969387755102041e-05, "loss": 0.2165, "step": 386 }, { "epoch": 0.4931506849315068, "grad_norm": 0.38239193569662683, "learning_rate": 1.9744897959183677e-05, "loss": 0.2206, "step": 387 }, { "epoch": 0.49442497610704045, "grad_norm": 0.3653048130388125, "learning_rate": 1.979591836734694e-05, "loss": 0.2092, "step": 388 }, { "epoch": 0.4956992672825741, "grad_norm": 0.4331382951084842, "learning_rate": 1.9846938775510205e-05, "loss": 0.2161, "step": 389 }, { "epoch": 0.4969735584581077, "grad_norm": 0.3921918474926603, "learning_rate": 1.9897959183673473e-05, "loss": 0.2204, "step": 390 }, { "epoch": 0.49824784963364127, "grad_norm": 0.3904987343763615, "learning_rate": 1.9948979591836737e-05, "loss": 0.223, "step": 391 }, { "epoch": 0.4995221408091749, "grad_norm": 0.40056026513710036, "learning_rate": 2e-05, "loss": 0.2218, "step": 392 }, { "epoch": 0.5007964319847085, "grad_norm": 0.3967977062537578, "learning_rate": 1.999999603527958e-05, "loss": 0.199, "step": 393 }, { "epoch": 0.5020707231602421, "grad_norm": 0.3538270244154982, "learning_rate": 1.9999984141121447e-05, "loss": 0.1841, "step": 394 }, { "epoch": 0.5033450143357757, "grad_norm": 0.3613070386440572, "learning_rate": 1.9999964317535044e-05, "loss": 0.2274, "step": 395 }, { "epoch": 0.5046193055113093, "grad_norm": 0.3955131550993394, "learning_rate": 1.9999936564536085e-05, "loss": 0.2468, "step": 396 }, { "epoch": 0.505893596686843, "grad_norm": 0.3598349603400646, "learning_rate": 1.9999900882146578e-05, "loss": 0.2149, "step": 397 }, { "epoch": 0.5071678878623765, "grad_norm": 0.3947722182436942, "learning_rate": 1.9999857270394818e-05, "loss": 0.2348, "step": 398 }, { "epoch": 0.5084421790379101, "grad_norm": 0.37905254889668283, "learning_rate": 1.9999805729315383e-05, "loss": 0.2416, "step": 399 }, { "epoch": 0.5097164702134438, "grad_norm": 0.34270702565902517, "learning_rate": 1.9999746258949146e-05, "loss": 0.194, "step": 400 }, { "epoch": 0.5109907613889774, "grad_norm": 0.37240828854251184, "learning_rate": 1.9999678859343264e-05, "loss": 0.2384, "step": 401 }, { "epoch": 0.5122650525645109, "grad_norm": 0.36046384109470403, "learning_rate": 1.9999603530551178e-05, "loss": 0.226, "step": 402 }, { "epoch": 0.5135393437400446, "grad_norm": 0.3807745487141532, "learning_rate": 1.999952027263262e-05, "loss": 0.2178, "step": 403 }, { "epoch": 0.5148136349155782, "grad_norm": 0.41368520909531004, "learning_rate": 1.999942908565361e-05, "loss": 0.2509, "step": 404 }, { "epoch": 0.5160879260911119, "grad_norm": 0.38626981247714054, "learning_rate": 1.9999329969686458e-05, "loss": 0.226, "step": 405 }, { "epoch": 0.5173622172666454, "grad_norm": 0.3965054291488143, "learning_rate": 1.999922292480975e-05, "loss": 0.2243, "step": 406 }, { "epoch": 0.518636508442179, "grad_norm": 0.470250951831843, "learning_rate": 1.9999107951108372e-05, "loss": 0.2494, "step": 407 }, { "epoch": 0.5199107996177127, "grad_norm": 0.3874492559483313, "learning_rate": 1.9998985048673486e-05, "loss": 0.2156, "step": 408 }, { "epoch": 0.5211850907932463, "grad_norm": 0.4105971618636439, "learning_rate": 1.9998854217602554e-05, "loss": 0.2068, "step": 409 }, { "epoch": 0.5224593819687798, "grad_norm": 0.35609174342483874, "learning_rate": 1.9998715457999313e-05, "loss": 0.1926, "step": 410 }, { "epoch": 0.5237336731443135, "grad_norm": 0.39996250152307056, "learning_rate": 1.9998568769973794e-05, "loss": 0.2417, "step": 411 }, { "epoch": 0.5250079643198471, "grad_norm": 0.40936904073729613, "learning_rate": 1.999841415364231e-05, "loss": 0.2205, "step": 412 }, { "epoch": 0.5262822554953807, "grad_norm": 0.38484042943118785, "learning_rate": 1.9998251609127465e-05, "loss": 0.2298, "step": 413 }, { "epoch": 0.5275565466709143, "grad_norm": 0.39676106426807894, "learning_rate": 1.999808113655815e-05, "loss": 0.2251, "step": 414 }, { "epoch": 0.5288308378464479, "grad_norm": 0.4001700780997169, "learning_rate": 1.9997902736069533e-05, "loss": 0.2276, "step": 415 }, { "epoch": 0.5301051290219815, "grad_norm": 0.35078905247903436, "learning_rate": 1.999771640780308e-05, "loss": 0.2041, "step": 416 }, { "epoch": 0.5313794201975152, "grad_norm": 0.3649352309455932, "learning_rate": 1.999752215190654e-05, "loss": 0.196, "step": 417 }, { "epoch": 0.5326537113730487, "grad_norm": 0.3928472273861351, "learning_rate": 1.999731996853395e-05, "loss": 0.2115, "step": 418 }, { "epoch": 0.5339280025485823, "grad_norm": 0.3382162096903793, "learning_rate": 1.999710985784562e-05, "loss": 0.1984, "step": 419 }, { "epoch": 0.535202293724116, "grad_norm": 0.34204079102141194, "learning_rate": 1.9996891820008165e-05, "loss": 0.2049, "step": 420 }, { "epoch": 0.5364765848996496, "grad_norm": 0.3657252438861584, "learning_rate": 1.999666585519447e-05, "loss": 0.2262, "step": 421 }, { "epoch": 0.5377508760751831, "grad_norm": 0.38416355494676624, "learning_rate": 1.9996431963583724e-05, "loss": 0.242, "step": 422 }, { "epoch": 0.5390251672507168, "grad_norm": 0.3768980517367842, "learning_rate": 1.9996190145361377e-05, "loss": 0.245, "step": 423 }, { "epoch": 0.5402994584262504, "grad_norm": 0.3578234269253679, "learning_rate": 1.9995940400719184e-05, "loss": 0.215, "step": 424 }, { "epoch": 0.5415737496017841, "grad_norm": 0.36797447925048243, "learning_rate": 1.9995682729855175e-05, "loss": 0.2418, "step": 425 }, { "epoch": 0.5428480407773176, "grad_norm": 0.372742975011764, "learning_rate": 1.9995417132973674e-05, "loss": 0.2501, "step": 426 }, { "epoch": 0.5441223319528512, "grad_norm": 0.34291469008066255, "learning_rate": 1.9995143610285275e-05, "loss": 0.2063, "step": 427 }, { "epoch": 0.5453966231283849, "grad_norm": 0.3688151589480023, "learning_rate": 1.999486216200688e-05, "loss": 0.2175, "step": 428 }, { "epoch": 0.5466709143039185, "grad_norm": 0.35955139298940686, "learning_rate": 1.999457278836165e-05, "loss": 0.2286, "step": 429 }, { "epoch": 0.547945205479452, "grad_norm": 0.3879107561451551, "learning_rate": 1.999427548957905e-05, "loss": 0.2237, "step": 430 }, { "epoch": 0.5492194966549857, "grad_norm": 0.40616509104514836, "learning_rate": 1.9993970265894816e-05, "loss": 0.2527, "step": 431 }, { "epoch": 0.5504937878305193, "grad_norm": 0.3686189962449617, "learning_rate": 1.9993657117550972e-05, "loss": 0.2141, "step": 432 }, { "epoch": 0.5517680790060528, "grad_norm": 0.3323254390603725, "learning_rate": 1.999333604479583e-05, "loss": 0.221, "step": 433 }, { "epoch": 0.5530423701815865, "grad_norm": 0.35116407197973315, "learning_rate": 1.9993007047883988e-05, "loss": 0.2092, "step": 434 }, { "epoch": 0.5543166613571201, "grad_norm": 0.3798562462137546, "learning_rate": 1.999267012707631e-05, "loss": 0.2071, "step": 435 }, { "epoch": 0.5555909525326537, "grad_norm": 0.38271230694547437, "learning_rate": 1.999232528263997e-05, "loss": 0.2174, "step": 436 }, { "epoch": 0.5568652437081874, "grad_norm": 0.36785119490217677, "learning_rate": 1.99919725148484e-05, "loss": 0.2138, "step": 437 }, { "epoch": 0.5581395348837209, "grad_norm": 0.35944886760706757, "learning_rate": 1.9991611823981322e-05, "loss": 0.1946, "step": 438 }, { "epoch": 0.5594138260592545, "grad_norm": 0.3878620215251886, "learning_rate": 1.9991243210324756e-05, "loss": 0.2254, "step": 439 }, { "epoch": 0.5606881172347882, "grad_norm": 0.40950625358633364, "learning_rate": 1.9990866674170984e-05, "loss": 0.2125, "step": 440 }, { "epoch": 0.5619624084103217, "grad_norm": 0.42641302871856124, "learning_rate": 1.999048221581858e-05, "loss": 0.2425, "step": 441 }, { "epoch": 0.5632366995858553, "grad_norm": 0.3635407073246463, "learning_rate": 1.99900898355724e-05, "loss": 0.2162, "step": 442 }, { "epoch": 0.564510990761389, "grad_norm": 0.383626333281287, "learning_rate": 1.998968953374357e-05, "loss": 0.21, "step": 443 }, { "epoch": 0.5657852819369226, "grad_norm": 0.39318443192778657, "learning_rate": 1.9989281310649516e-05, "loss": 0.2339, "step": 444 }, { "epoch": 0.5670595731124562, "grad_norm": 0.3944258922758596, "learning_rate": 1.998886516661394e-05, "loss": 0.2208, "step": 445 }, { "epoch": 0.5683338642879898, "grad_norm": 0.35098275737945095, "learning_rate": 1.9988441101966807e-05, "loss": 0.2099, "step": 446 }, { "epoch": 0.5696081554635234, "grad_norm": 0.3695223683453514, "learning_rate": 1.9988009117044392e-05, "loss": 0.2246, "step": 447 }, { "epoch": 0.5708824466390571, "grad_norm": 0.3927238691338747, "learning_rate": 1.9987569212189224e-05, "loss": 0.2347, "step": 448 }, { "epoch": 0.5721567378145906, "grad_norm": 0.39090205545236867, "learning_rate": 1.998712138775013e-05, "loss": 0.2385, "step": 449 }, { "epoch": 0.5734310289901242, "grad_norm": 0.40307298163779326, "learning_rate": 1.9986665644082204e-05, "loss": 0.2443, "step": 450 }, { "epoch": 0.5747053201656579, "grad_norm": 0.35408606998165093, "learning_rate": 1.9986201981546825e-05, "loss": 0.1919, "step": 451 }, { "epoch": 0.5759796113411915, "grad_norm": 0.35115607733686555, "learning_rate": 1.9985730400511658e-05, "loss": 0.2163, "step": 452 }, { "epoch": 0.577253902516725, "grad_norm": 0.40141121469599106, "learning_rate": 1.9985250901350636e-05, "loss": 0.2266, "step": 453 }, { "epoch": 0.5785281936922587, "grad_norm": 0.353747811754171, "learning_rate": 1.998476348444397e-05, "loss": 0.2063, "step": 454 }, { "epoch": 0.5798024848677923, "grad_norm": 0.3959496982957287, "learning_rate": 1.998426815017817e-05, "loss": 0.2072, "step": 455 }, { "epoch": 0.5810767760433259, "grad_norm": 0.379550676439884, "learning_rate": 1.998376489894599e-05, "loss": 0.2095, "step": 456 }, { "epoch": 0.5823510672188595, "grad_norm": 0.374961551949453, "learning_rate": 1.998325373114649e-05, "loss": 0.1981, "step": 457 }, { "epoch": 0.5836253583943931, "grad_norm": 0.39682505874677315, "learning_rate": 1.9982734647184997e-05, "loss": 0.2412, "step": 458 }, { "epoch": 0.5848996495699267, "grad_norm": 0.46556723443643105, "learning_rate": 1.9982207647473112e-05, "loss": 0.247, "step": 459 }, { "epoch": 0.5861739407454604, "grad_norm": 0.3891983140517711, "learning_rate": 1.998167273242872e-05, "loss": 0.2121, "step": 460 }, { "epoch": 0.5874482319209939, "grad_norm": 0.43761634391375187, "learning_rate": 1.998112990247598e-05, "loss": 0.2239, "step": 461 }, { "epoch": 0.5887225230965275, "grad_norm": 0.39806508283153574, "learning_rate": 1.9980579158045322e-05, "loss": 0.2271, "step": 462 }, { "epoch": 0.5899968142720612, "grad_norm": 0.407216331561388, "learning_rate": 1.9980020499573452e-05, "loss": 0.2192, "step": 463 }, { "epoch": 0.5912711054475948, "grad_norm": 0.40286871374855715, "learning_rate": 1.9979453927503366e-05, "loss": 0.2415, "step": 464 }, { "epoch": 0.5925453966231284, "grad_norm": 0.3960568992042459, "learning_rate": 1.9978879442284313e-05, "loss": 0.2134, "step": 465 }, { "epoch": 0.593819687798662, "grad_norm": 0.4227034622534551, "learning_rate": 1.9978297044371834e-05, "loss": 0.2443, "step": 466 }, { "epoch": 0.5950939789741956, "grad_norm": 0.38061362537008925, "learning_rate": 1.997770673422774e-05, "loss": 0.2205, "step": 467 }, { "epoch": 0.5963682701497293, "grad_norm": 0.3725936937546987, "learning_rate": 1.9977108512320103e-05, "loss": 0.2066, "step": 468 }, { "epoch": 0.5976425613252628, "grad_norm": 0.40450578876343046, "learning_rate": 1.997650237912329e-05, "loss": 0.2083, "step": 469 }, { "epoch": 0.5989168525007964, "grad_norm": 0.33859132661892277, "learning_rate": 1.9975888335117927e-05, "loss": 0.2332, "step": 470 }, { "epoch": 0.6001911436763301, "grad_norm": 0.4639505185628793, "learning_rate": 1.9975266380790917e-05, "loss": 0.2493, "step": 471 }, { "epoch": 0.6014654348518637, "grad_norm": 0.3755096959390672, "learning_rate": 1.9974636516635436e-05, "loss": 0.2262, "step": 472 }, { "epoch": 0.6027397260273972, "grad_norm": 0.39225407546056645, "learning_rate": 1.997399874315093e-05, "loss": 0.2498, "step": 473 }, { "epoch": 0.6040140172029309, "grad_norm": 0.35808329582242254, "learning_rate": 1.9973353060843118e-05, "loss": 0.2135, "step": 474 }, { "epoch": 0.6052883083784645, "grad_norm": 0.36750401831033636, "learning_rate": 1.9972699470223988e-05, "loss": 0.2235, "step": 475 }, { "epoch": 0.606562599553998, "grad_norm": 0.37386655286619924, "learning_rate": 1.9972037971811802e-05, "loss": 0.2329, "step": 476 }, { "epoch": 0.6078368907295317, "grad_norm": 0.36175666686518193, "learning_rate": 1.997136856613109e-05, "loss": 0.2226, "step": 477 }, { "epoch": 0.6091111819050653, "grad_norm": 0.3859554048235689, "learning_rate": 1.9970691253712663e-05, "loss": 0.261, "step": 478 }, { "epoch": 0.6103854730805989, "grad_norm": 0.3928539021331901, "learning_rate": 1.9970006035093578e-05, "loss": 0.2413, "step": 479 }, { "epoch": 0.6116597642561326, "grad_norm": 0.3972689328001196, "learning_rate": 1.9969312910817183e-05, "loss": 0.2406, "step": 480 }, { "epoch": 0.6129340554316661, "grad_norm": 0.3874450530091615, "learning_rate": 1.9968611881433084e-05, "loss": 0.2389, "step": 481 }, { "epoch": 0.6142083466071997, "grad_norm": 0.3486101478398947, "learning_rate": 1.9967902947497158e-05, "loss": 0.2438, "step": 482 }, { "epoch": 0.6154826377827334, "grad_norm": 0.3969908899268155, "learning_rate": 1.996718610957155e-05, "loss": 0.2115, "step": 483 }, { "epoch": 0.616756928958267, "grad_norm": 0.3787069064149702, "learning_rate": 1.9966461368224676e-05, "loss": 0.2185, "step": 484 }, { "epoch": 0.6180312201338005, "grad_norm": 0.3556549969203126, "learning_rate": 1.996572872403121e-05, "loss": 0.2006, "step": 485 }, { "epoch": 0.6193055113093342, "grad_norm": 0.3508877882237792, "learning_rate": 1.9964988177572106e-05, "loss": 0.2177, "step": 486 }, { "epoch": 0.6205798024848678, "grad_norm": 0.36166486155771127, "learning_rate": 1.9964239729434563e-05, "loss": 0.2176, "step": 487 }, { "epoch": 0.6218540936604015, "grad_norm": 0.408574377539691, "learning_rate": 1.996348338021207e-05, "loss": 0.2191, "step": 488 }, { "epoch": 0.623128384835935, "grad_norm": 0.36583719247484625, "learning_rate": 1.9962719130504365e-05, "loss": 0.2339, "step": 489 }, { "epoch": 0.6244026760114686, "grad_norm": 0.35470502521057595, "learning_rate": 1.9961946980917457e-05, "loss": 0.2028, "step": 490 }, { "epoch": 0.6256769671870023, "grad_norm": 0.37155687488709715, "learning_rate": 1.9961166932063615e-05, "loss": 0.2197, "step": 491 }, { "epoch": 0.6269512583625358, "grad_norm": 0.4285286639911743, "learning_rate": 1.9960378984561377e-05, "loss": 0.2537, "step": 492 }, { "epoch": 0.6282255495380694, "grad_norm": 0.34633197516744374, "learning_rate": 1.9959583139035537e-05, "loss": 0.2124, "step": 493 }, { "epoch": 0.6294998407136031, "grad_norm": 0.37411161118590863, "learning_rate": 1.9958779396117162e-05, "loss": 0.2061, "step": 494 }, { "epoch": 0.6307741318891367, "grad_norm": 0.377887792347339, "learning_rate": 1.9957967756443567e-05, "loss": 0.2199, "step": 495 }, { "epoch": 0.6320484230646702, "grad_norm": 0.3363420476702184, "learning_rate": 1.9957148220658348e-05, "loss": 0.2221, "step": 496 }, { "epoch": 0.6333227142402039, "grad_norm": 0.38891249506182646, "learning_rate": 1.9956320789411338e-05, "loss": 0.2241, "step": 497 }, { "epoch": 0.6345970054157375, "grad_norm": 0.3895372720235669, "learning_rate": 1.9955485463358655e-05, "loss": 0.2353, "step": 498 }, { "epoch": 0.6358712965912711, "grad_norm": 0.34373052433868284, "learning_rate": 1.9954642243162663e-05, "loss": 0.2102, "step": 499 }, { "epoch": 0.6371455877668047, "grad_norm": 0.34861118802746355, "learning_rate": 1.9953791129491985e-05, "loss": 0.2027, "step": 500 }, { "epoch": 0.6384198789423383, "grad_norm": 0.37239874840146986, "learning_rate": 1.9952932123021508e-05, "loss": 0.2348, "step": 501 }, { "epoch": 0.6396941701178719, "grad_norm": 0.34090347974149215, "learning_rate": 1.9952065224432376e-05, "loss": 0.2166, "step": 502 }, { "epoch": 0.6409684612934056, "grad_norm": 0.3569386974690665, "learning_rate": 1.995119043441199e-05, "loss": 0.2151, "step": 503 }, { "epoch": 0.6422427524689391, "grad_norm": 0.33472727199906915, "learning_rate": 1.9950307753654016e-05, "loss": 0.1947, "step": 504 }, { "epoch": 0.6435170436444727, "grad_norm": 0.34003557774498927, "learning_rate": 1.9949417182858363e-05, "loss": 0.2143, "step": 505 }, { "epoch": 0.6447913348200064, "grad_norm": 0.31763859979317843, "learning_rate": 1.9948518722731208e-05, "loss": 0.2032, "step": 506 }, { "epoch": 0.64606562599554, "grad_norm": 0.351775230853701, "learning_rate": 1.9947612373984973e-05, "loss": 0.2116, "step": 507 }, { "epoch": 0.6473399171710736, "grad_norm": 0.3532942496835766, "learning_rate": 1.9946698137338357e-05, "loss": 0.2058, "step": 508 }, { "epoch": 0.6486142083466072, "grad_norm": 0.3498049408391037, "learning_rate": 1.994577601351628e-05, "loss": 0.2235, "step": 509 }, { "epoch": 0.6498884995221408, "grad_norm": 0.36478219743449236, "learning_rate": 1.994484600324995e-05, "loss": 0.2282, "step": 510 }, { "epoch": 0.6511627906976745, "grad_norm": 0.349364597744813, "learning_rate": 1.99439081072768e-05, "loss": 0.2323, "step": 511 }, { "epoch": 0.652437081873208, "grad_norm": 0.3509801251837777, "learning_rate": 1.994296232634054e-05, "loss": 0.1934, "step": 512 }, { "epoch": 0.6537113730487416, "grad_norm": 0.34143078776182934, "learning_rate": 1.994200866119111e-05, "loss": 0.2222, "step": 513 }, { "epoch": 0.6549856642242753, "grad_norm": 0.36504392595933643, "learning_rate": 1.994104711258473e-05, "loss": 0.2073, "step": 514 }, { "epoch": 0.6562599553998089, "grad_norm": 0.36716720357806915, "learning_rate": 1.9940077681283835e-05, "loss": 0.2205, "step": 515 }, { "epoch": 0.6575342465753424, "grad_norm": 0.3533895981975939, "learning_rate": 1.9939100368057144e-05, "loss": 0.2124, "step": 516 }, { "epoch": 0.6588085377508761, "grad_norm": 0.3818546943313715, "learning_rate": 1.9938115173679605e-05, "loss": 0.1965, "step": 517 }, { "epoch": 0.6600828289264097, "grad_norm": 0.3918061691403369, "learning_rate": 1.9937122098932428e-05, "loss": 0.2232, "step": 518 }, { "epoch": 0.6613571201019433, "grad_norm": 0.3846780412460366, "learning_rate": 1.9936121144603057e-05, "loss": 0.2454, "step": 519 }, { "epoch": 0.6626314112774769, "grad_norm": 0.4152446478910813, "learning_rate": 1.99351123114852e-05, "loss": 0.238, "step": 520 }, { "epoch": 0.6639057024530105, "grad_norm": 0.38306703889958055, "learning_rate": 1.9934095600378802e-05, "loss": 0.2338, "step": 521 }, { "epoch": 0.6651799936285441, "grad_norm": 0.4096616580874316, "learning_rate": 1.993307101209006e-05, "loss": 0.2154, "step": 522 }, { "epoch": 0.6664542848040778, "grad_norm": 0.3637671348691741, "learning_rate": 1.9932038547431413e-05, "loss": 0.2332, "step": 523 }, { "epoch": 0.6677285759796113, "grad_norm": 0.36739329868032083, "learning_rate": 1.993099820722155e-05, "loss": 0.2229, "step": 524 }, { "epoch": 0.6690028671551449, "grad_norm": 0.4382202157730589, "learning_rate": 1.9929949992285397e-05, "loss": 0.2317, "step": 525 }, { "epoch": 0.6702771583306786, "grad_norm": 0.3634207374548749, "learning_rate": 1.992889390345414e-05, "loss": 0.2198, "step": 526 }, { "epoch": 0.6715514495062122, "grad_norm": 0.3999959941264889, "learning_rate": 1.9927829941565187e-05, "loss": 0.2249, "step": 527 }, { "epoch": 0.6728257406817458, "grad_norm": 0.41323093654865883, "learning_rate": 1.9926758107462208e-05, "loss": 0.2287, "step": 528 }, { "epoch": 0.6741000318572794, "grad_norm": 0.40360689321562115, "learning_rate": 1.992567840199511e-05, "loss": 0.2409, "step": 529 }, { "epoch": 0.675374323032813, "grad_norm": 0.36299317507650214, "learning_rate": 1.9924590826020027e-05, "loss": 0.2288, "step": 530 }, { "epoch": 0.6766486142083467, "grad_norm": 0.3764836595340217, "learning_rate": 1.9923495380399355e-05, "loss": 0.2176, "step": 531 }, { "epoch": 0.6779229053838802, "grad_norm": 0.42235115631179715, "learning_rate": 1.9922392066001724e-05, "loss": 0.2219, "step": 532 }, { "epoch": 0.6791971965594138, "grad_norm": 0.3553121827724239, "learning_rate": 1.9921280883701993e-05, "loss": 0.2055, "step": 533 }, { "epoch": 0.6804714877349475, "grad_norm": 0.349222545571268, "learning_rate": 1.992016183438127e-05, "loss": 0.2351, "step": 534 }, { "epoch": 0.681745778910481, "grad_norm": 0.3240754267377525, "learning_rate": 1.99190349189269e-05, "loss": 0.2023, "step": 535 }, { "epoch": 0.6830200700860146, "grad_norm": 0.33391409152582363, "learning_rate": 1.991790013823246e-05, "loss": 0.2316, "step": 536 }, { "epoch": 0.6842943612615483, "grad_norm": 0.32322243809816403, "learning_rate": 1.991675749319778e-05, "loss": 0.2003, "step": 537 }, { "epoch": 0.6855686524370819, "grad_norm": 0.3945036773152765, "learning_rate": 1.9915606984728896e-05, "loss": 0.2413, "step": 538 }, { "epoch": 0.6868429436126154, "grad_norm": 0.34182143142694227, "learning_rate": 1.9914448613738107e-05, "loss": 0.211, "step": 539 }, { "epoch": 0.6881172347881491, "grad_norm": 0.36730397216136723, "learning_rate": 1.9913282381143934e-05, "loss": 0.2209, "step": 540 }, { "epoch": 0.6893915259636827, "grad_norm": 0.3803618739125287, "learning_rate": 1.9912108287871134e-05, "loss": 0.2109, "step": 541 }, { "epoch": 0.6906658171392163, "grad_norm": 0.35149711126826966, "learning_rate": 1.99109263348507e-05, "loss": 0.2393, "step": 542 }, { "epoch": 0.69194010831475, "grad_norm": 0.33594206987924485, "learning_rate": 1.9909736523019855e-05, "loss": 0.2002, "step": 543 }, { "epoch": 0.6932143994902835, "grad_norm": 0.3610211839975422, "learning_rate": 1.9908538853322046e-05, "loss": 0.1922, "step": 544 }, { "epoch": 0.6944886906658171, "grad_norm": 0.3740279393151904, "learning_rate": 1.990733332670697e-05, "loss": 0.2279, "step": 545 }, { "epoch": 0.6957629818413508, "grad_norm": 0.35543750758330817, "learning_rate": 1.9906119944130527e-05, "loss": 0.2119, "step": 546 }, { "epoch": 0.6970372730168843, "grad_norm": 0.3399296782817474, "learning_rate": 1.9904898706554875e-05, "loss": 0.2091, "step": 547 }, { "epoch": 0.698311564192418, "grad_norm": 0.38032652940343104, "learning_rate": 1.9903669614948382e-05, "loss": 0.2206, "step": 548 }, { "epoch": 0.6995858553679516, "grad_norm": 0.354510807876217, "learning_rate": 1.9902432670285647e-05, "loss": 0.2169, "step": 549 }, { "epoch": 0.7008601465434852, "grad_norm": 0.3378790054284872, "learning_rate": 1.9901187873547504e-05, "loss": 0.1974, "step": 550 }, { "epoch": 0.7021344377190188, "grad_norm": 0.38679019553015437, "learning_rate": 1.9899935225721e-05, "loss": 0.2121, "step": 551 }, { "epoch": 0.7034087288945524, "grad_norm": 0.3910519613813281, "learning_rate": 1.9898674727799418e-05, "loss": 0.2292, "step": 552 }, { "epoch": 0.704683020070086, "grad_norm": 0.3751720942678561, "learning_rate": 1.9897406380782262e-05, "loss": 0.2195, "step": 553 }, { "epoch": 0.7059573112456197, "grad_norm": 0.3521954215802129, "learning_rate": 1.9896130185675263e-05, "loss": 0.2367, "step": 554 }, { "epoch": 0.7072316024211532, "grad_norm": 0.4181469262230446, "learning_rate": 1.9894846143490367e-05, "loss": 0.2444, "step": 555 }, { "epoch": 0.7085058935966868, "grad_norm": 0.3892204705982601, "learning_rate": 1.9893554255245748e-05, "loss": 0.2449, "step": 556 }, { "epoch": 0.7097801847722205, "grad_norm": 0.31911096186469, "learning_rate": 1.9892254521965805e-05, "loss": 0.1941, "step": 557 }, { "epoch": 0.7110544759477541, "grad_norm": 0.36748812003501574, "learning_rate": 1.9890946944681157e-05, "loss": 0.238, "step": 558 }, { "epoch": 0.7123287671232876, "grad_norm": 0.3676547203294174, "learning_rate": 1.988963152442863e-05, "loss": 0.2322, "step": 559 }, { "epoch": 0.7136030582988213, "grad_norm": 0.36233442079469713, "learning_rate": 1.9888308262251286e-05, "loss": 0.227, "step": 560 }, { "epoch": 0.7148773494743549, "grad_norm": 0.39033451753281845, "learning_rate": 1.9886977159198397e-05, "loss": 0.2357, "step": 561 }, { "epoch": 0.7161516406498885, "grad_norm": 0.35244067608164326, "learning_rate": 1.988563821632545e-05, "loss": 0.2211, "step": 562 }, { "epoch": 0.7174259318254221, "grad_norm": 0.3193162477665193, "learning_rate": 1.9884291434694155e-05, "loss": 0.2121, "step": 563 }, { "epoch": 0.7187002230009557, "grad_norm": 0.36029311248540347, "learning_rate": 1.9882936815372432e-05, "loss": 0.2264, "step": 564 }, { "epoch": 0.7199745141764893, "grad_norm": 0.3510970235740379, "learning_rate": 1.988157435943442e-05, "loss": 0.2255, "step": 565 }, { "epoch": 0.721248805352023, "grad_norm": 0.3688263988385087, "learning_rate": 1.9880204067960473e-05, "loss": 0.2391, "step": 566 }, { "epoch": 0.7225230965275565, "grad_norm": 0.3479835040150408, "learning_rate": 1.9878825942037147e-05, "loss": 0.2188, "step": 567 }, { "epoch": 0.7237973877030902, "grad_norm": 0.38130283866792064, "learning_rate": 1.9877439982757228e-05, "loss": 0.2425, "step": 568 }, { "epoch": 0.7250716788786238, "grad_norm": 0.389457386068127, "learning_rate": 1.98760461912197e-05, "loss": 0.2411, "step": 569 }, { "epoch": 0.7263459700541574, "grad_norm": 0.37694790296036523, "learning_rate": 1.9874644568529763e-05, "loss": 0.2281, "step": 570 }, { "epoch": 0.727620261229691, "grad_norm": 0.3513776908887931, "learning_rate": 1.9873235115798827e-05, "loss": 0.2176, "step": 571 }, { "epoch": 0.7288945524052246, "grad_norm": 0.3275876100258956, "learning_rate": 1.9871817834144506e-05, "loss": 0.2307, "step": 572 }, { "epoch": 0.7301688435807582, "grad_norm": 0.34037790655397315, "learning_rate": 1.9870392724690622e-05, "loss": 0.1928, "step": 573 }, { "epoch": 0.7314431347562919, "grad_norm": 0.3420344932784863, "learning_rate": 1.9868959788567213e-05, "loss": 0.2145, "step": 574 }, { "epoch": 0.7327174259318254, "grad_norm": 0.36017997731840673, "learning_rate": 1.986751902691052e-05, "loss": 0.2299, "step": 575 }, { "epoch": 0.733991717107359, "grad_norm": 0.3454288903265858, "learning_rate": 1.9866070440862977e-05, "loss": 0.2237, "step": 576 }, { "epoch": 0.7352660082828927, "grad_norm": 0.3422127714541957, "learning_rate": 1.9864614031573236e-05, "loss": 0.2116, "step": 577 }, { "epoch": 0.7365402994584263, "grad_norm": 0.39607510239648874, "learning_rate": 1.9863149800196152e-05, "loss": 0.2369, "step": 578 }, { "epoch": 0.7378145906339598, "grad_norm": 0.3838949837312765, "learning_rate": 1.9861677747892773e-05, "loss": 0.2413, "step": 579 }, { "epoch": 0.7390888818094935, "grad_norm": 0.36650202628689504, "learning_rate": 1.9860197875830355e-05, "loss": 0.2169, "step": 580 }, { "epoch": 0.7403631729850271, "grad_norm": 0.33011171306120607, "learning_rate": 1.985871018518236e-05, "loss": 0.2145, "step": 581 }, { "epoch": 0.7416374641605606, "grad_norm": 0.3559604578213164, "learning_rate": 1.9857214677128436e-05, "loss": 0.2132, "step": 582 }, { "epoch": 0.7429117553360943, "grad_norm": 0.3538942789924032, "learning_rate": 1.985571135285444e-05, "loss": 0.2103, "step": 583 }, { "epoch": 0.7441860465116279, "grad_norm": 0.36796820878586706, "learning_rate": 1.9854200213552426e-05, "loss": 0.2256, "step": 584 }, { "epoch": 0.7454603376871615, "grad_norm": 0.40877540649897415, "learning_rate": 1.985268126042064e-05, "loss": 0.2595, "step": 585 }, { "epoch": 0.7467346288626951, "grad_norm": 0.33291993705680906, "learning_rate": 1.985115449466353e-05, "loss": 0.202, "step": 586 }, { "epoch": 0.7480089200382287, "grad_norm": 0.40166425910524056, "learning_rate": 1.9849619917491732e-05, "loss": 0.2103, "step": 587 }, { "epoch": 0.7492832112137623, "grad_norm": 0.359998720085186, "learning_rate": 1.9848077530122083e-05, "loss": 0.2262, "step": 588 }, { "epoch": 0.750557502389296, "grad_norm": 0.3440694303593378, "learning_rate": 1.9846527333777606e-05, "loss": 0.1861, "step": 589 }, { "epoch": 0.7518317935648295, "grad_norm": 0.34785179702754865, "learning_rate": 1.9844969329687526e-05, "loss": 0.2023, "step": 590 }, { "epoch": 0.7531060847403632, "grad_norm": 0.3920344517996339, "learning_rate": 1.984340351908725e-05, "loss": 0.2293, "step": 591 }, { "epoch": 0.7543803759158968, "grad_norm": 0.36003278962758417, "learning_rate": 1.9841829903218377e-05, "loss": 0.2241, "step": 592 }, { "epoch": 0.7556546670914304, "grad_norm": 0.3421126847788359, "learning_rate": 1.9840248483328697e-05, "loss": 0.2252, "step": 593 }, { "epoch": 0.756928958266964, "grad_norm": 0.36985963956493495, "learning_rate": 1.983865926067219e-05, "loss": 0.2207, "step": 594 }, { "epoch": 0.7582032494424976, "grad_norm": 0.35163272503908405, "learning_rate": 1.9837062236509013e-05, "loss": 0.2131, "step": 595 }, { "epoch": 0.7594775406180312, "grad_norm": 0.3575070681240194, "learning_rate": 1.983545741210553e-05, "loss": 0.2404, "step": 596 }, { "epoch": 0.7607518317935649, "grad_norm": 0.35387176630730094, "learning_rate": 1.9833844788734265e-05, "loss": 0.2288, "step": 597 }, { "epoch": 0.7620261229690984, "grad_norm": 0.3434347425745176, "learning_rate": 1.9832224367673945e-05, "loss": 0.2204, "step": 598 }, { "epoch": 0.763300414144632, "grad_norm": 0.34754448739473065, "learning_rate": 1.983059615020947e-05, "loss": 0.2118, "step": 599 }, { "epoch": 0.7645747053201657, "grad_norm": 0.34477664332576324, "learning_rate": 1.9828960137631927e-05, "loss": 0.2346, "step": 600 }, { "epoch": 0.7658489964956993, "grad_norm": 0.3443386801622639, "learning_rate": 1.9827316331238583e-05, "loss": 0.2125, "step": 601 }, { "epoch": 0.7671232876712328, "grad_norm": 0.3982962452755178, "learning_rate": 1.9825664732332886e-05, "loss": 0.2376, "step": 602 }, { "epoch": 0.7683975788467665, "grad_norm": 0.3911866468456412, "learning_rate": 1.9824005342224454e-05, "loss": 0.2477, "step": 603 }, { "epoch": 0.7696718700223001, "grad_norm": 0.3341132284326214, "learning_rate": 1.98223381622291e-05, "loss": 0.2208, "step": 604 }, { "epoch": 0.7709461611978337, "grad_norm": 0.37389320581699886, "learning_rate": 1.9820663193668798e-05, "loss": 0.2266, "step": 605 }, { "epoch": 0.7722204523733673, "grad_norm": 0.33530651822034957, "learning_rate": 1.9818980437871707e-05, "loss": 0.216, "step": 606 }, { "epoch": 0.7734947435489009, "grad_norm": 0.34175911748160875, "learning_rate": 1.981728989617216e-05, "loss": 0.2284, "step": 607 }, { "epoch": 0.7747690347244345, "grad_norm": 0.320500528419636, "learning_rate": 1.9815591569910654e-05, "loss": 0.2042, "step": 608 }, { "epoch": 0.7760433258999682, "grad_norm": 0.343257757298485, "learning_rate": 1.981388546043388e-05, "loss": 0.2043, "step": 609 }, { "epoch": 0.7773176170755017, "grad_norm": 0.3405149001992135, "learning_rate": 1.9812171569094675e-05, "loss": 0.2179, "step": 610 }, { "epoch": 0.7785919082510354, "grad_norm": 0.3470328636431666, "learning_rate": 1.9810449897252067e-05, "loss": 0.2279, "step": 611 }, { "epoch": 0.779866199426569, "grad_norm": 0.3636863818947501, "learning_rate": 1.980872044627124e-05, "loss": 0.2131, "step": 612 }, { "epoch": 0.7811404906021026, "grad_norm": 0.36898710789870676, "learning_rate": 1.980698321752356e-05, "loss": 0.2139, "step": 613 }, { "epoch": 0.7824147817776362, "grad_norm": 0.352595831076149, "learning_rate": 1.980523821238654e-05, "loss": 0.2367, "step": 614 }, { "epoch": 0.7836890729531698, "grad_norm": 0.3321332180810396, "learning_rate": 1.9803485432243884e-05, "loss": 0.2339, "step": 615 }, { "epoch": 0.7849633641287034, "grad_norm": 0.37219209475594645, "learning_rate": 1.9801724878485438e-05, "loss": 0.2322, "step": 616 }, { "epoch": 0.7862376553042371, "grad_norm": 0.3810864962713846, "learning_rate": 1.9799956552507235e-05, "loss": 0.2251, "step": 617 }, { "epoch": 0.7875119464797706, "grad_norm": 0.3507580312427943, "learning_rate": 1.9798180455711445e-05, "loss": 0.2362, "step": 618 }, { "epoch": 0.7887862376553042, "grad_norm": 0.33742722424594274, "learning_rate": 1.979639658950642e-05, "loss": 0.2, "step": 619 }, { "epoch": 0.7900605288308379, "grad_norm": 0.3478715964781658, "learning_rate": 1.9794604955306668e-05, "loss": 0.21, "step": 620 }, { "epoch": 0.7913348200063715, "grad_norm": 0.38003002670457253, "learning_rate": 1.9792805554532855e-05, "loss": 0.2467, "step": 621 }, { "epoch": 0.792609111181905, "grad_norm": 0.33133134221959937, "learning_rate": 1.97909983886118e-05, "loss": 0.1986, "step": 622 }, { "epoch": 0.7938834023574387, "grad_norm": 0.32729690705699604, "learning_rate": 1.9789183458976485e-05, "loss": 0.2058, "step": 623 }, { "epoch": 0.7951576935329723, "grad_norm": 0.3597763301939917, "learning_rate": 1.9787360767066054e-05, "loss": 0.2206, "step": 624 }, { "epoch": 0.7964319847085058, "grad_norm": 0.3340418205087883, "learning_rate": 1.9785530314325796e-05, "loss": 0.2228, "step": 625 }, { "epoch": 0.7977062758840395, "grad_norm": 0.35242656000408473, "learning_rate": 1.9783692102207156e-05, "loss": 0.2188, "step": 626 }, { "epoch": 0.7989805670595731, "grad_norm": 0.36579752201691074, "learning_rate": 1.9781846132167735e-05, "loss": 0.2454, "step": 627 }, { "epoch": 0.8002548582351067, "grad_norm": 0.38121576158990206, "learning_rate": 1.9779992405671284e-05, "loss": 0.2516, "step": 628 }, { "epoch": 0.8015291494106404, "grad_norm": 0.3364019003967843, "learning_rate": 1.9778130924187703e-05, "loss": 0.2029, "step": 629 }, { "epoch": 0.8028034405861739, "grad_norm": 0.35739175714600663, "learning_rate": 1.977626168919305e-05, "loss": 0.227, "step": 630 }, { "epoch": 0.8040777317617076, "grad_norm": 0.388246930605751, "learning_rate": 1.977438470216951e-05, "loss": 0.2418, "step": 631 }, { "epoch": 0.8053520229372412, "grad_norm": 0.3572778139787018, "learning_rate": 1.977249996460544e-05, "loss": 0.2209, "step": 632 }, { "epoch": 0.8066263141127747, "grad_norm": 0.38407840963090467, "learning_rate": 1.9770607477995328e-05, "loss": 0.2135, "step": 633 }, { "epoch": 0.8079006052883084, "grad_norm": 0.3581831302386865, "learning_rate": 1.976870724383981e-05, "loss": 0.2283, "step": 634 }, { "epoch": 0.809174896463842, "grad_norm": 0.3905385325753702, "learning_rate": 1.9766799263645672e-05, "loss": 0.2059, "step": 635 }, { "epoch": 0.8104491876393756, "grad_norm": 0.38790669655428656, "learning_rate": 1.9764883538925822e-05, "loss": 0.2279, "step": 636 }, { "epoch": 0.8117234788149092, "grad_norm": 0.3363465864601257, "learning_rate": 1.9762960071199334e-05, "loss": 0.2232, "step": 637 }, { "epoch": 0.8129977699904428, "grad_norm": 0.39715456647292113, "learning_rate": 1.9761028861991406e-05, "loss": 0.2384, "step": 638 }, { "epoch": 0.8142720611659764, "grad_norm": 0.3850926637647466, "learning_rate": 1.975908991283338e-05, "loss": 0.219, "step": 639 }, { "epoch": 0.8155463523415101, "grad_norm": 0.3592042525912691, "learning_rate": 1.975714322526273e-05, "loss": 0.2455, "step": 640 }, { "epoch": 0.8168206435170436, "grad_norm": 0.3277598659637951, "learning_rate": 1.975518880082308e-05, "loss": 0.188, "step": 641 }, { "epoch": 0.8180949346925772, "grad_norm": 0.3670143125960265, "learning_rate": 1.9753226641064164e-05, "loss": 0.1985, "step": 642 }, { "epoch": 0.8193692258681109, "grad_norm": 0.3571446872225631, "learning_rate": 1.9751256747541882e-05, "loss": 0.2252, "step": 643 }, { "epoch": 0.8206435170436445, "grad_norm": 0.3443393359560378, "learning_rate": 1.9749279121818235e-05, "loss": 0.2297, "step": 644 }, { "epoch": 0.821917808219178, "grad_norm": 0.36578501139948044, "learning_rate": 1.9747293765461385e-05, "loss": 0.2248, "step": 645 }, { "epoch": 0.8231920993947117, "grad_norm": 0.3694197272053032, "learning_rate": 1.974530068004559e-05, "loss": 0.2271, "step": 646 }, { "epoch": 0.8244663905702453, "grad_norm": 0.3692640904218449, "learning_rate": 1.9743299867151272e-05, "loss": 0.2511, "step": 647 }, { "epoch": 0.8257406817457789, "grad_norm": 0.3371992798674826, "learning_rate": 1.9741291328364955e-05, "loss": 0.2233, "step": 648 }, { "epoch": 0.8270149729213125, "grad_norm": 0.3252484853802431, "learning_rate": 1.9739275065279302e-05, "loss": 0.2084, "step": 649 }, { "epoch": 0.8282892640968461, "grad_norm": 0.32157871113960806, "learning_rate": 1.973725107949309e-05, "loss": 0.2193, "step": 650 }, { "epoch": 0.8295635552723798, "grad_norm": 0.33504659369072937, "learning_rate": 1.9735219372611232e-05, "loss": 0.208, "step": 651 }, { "epoch": 0.8308378464479134, "grad_norm": 0.32444217943988696, "learning_rate": 1.973317994624476e-05, "loss": 0.2133, "step": 652 }, { "epoch": 0.8321121376234469, "grad_norm": 0.35153121839015977, "learning_rate": 1.973113280201082e-05, "loss": 0.2197, "step": 653 }, { "epoch": 0.8333864287989806, "grad_norm": 0.3311907304860089, "learning_rate": 1.9729077941532687e-05, "loss": 0.2383, "step": 654 }, { "epoch": 0.8346607199745142, "grad_norm": 0.34047081714440547, "learning_rate": 1.9727015366439752e-05, "loss": 0.213, "step": 655 }, { "epoch": 0.8359350111500478, "grad_norm": 0.3299552699274239, "learning_rate": 1.9724945078367513e-05, "loss": 0.2153, "step": 656 }, { "epoch": 0.8372093023255814, "grad_norm": 0.34067519289577114, "learning_rate": 1.97228670789576e-05, "loss": 0.2143, "step": 657 }, { "epoch": 0.838483593501115, "grad_norm": 0.32361874845413835, "learning_rate": 1.9720781369857747e-05, "loss": 0.2247, "step": 658 }, { "epoch": 0.8397578846766486, "grad_norm": 0.3205564845781714, "learning_rate": 1.9718687952721808e-05, "loss": 0.1944, "step": 659 }, { "epoch": 0.8410321758521823, "grad_norm": 0.3442459702005005, "learning_rate": 1.9716586829209743e-05, "loss": 0.2127, "step": 660 }, { "epoch": 0.8423064670277158, "grad_norm": 0.3578506126692383, "learning_rate": 1.9714478000987626e-05, "loss": 0.2037, "step": 661 }, { "epoch": 0.8435807582032494, "grad_norm": 0.32327018385755624, "learning_rate": 1.971236146972764e-05, "loss": 0.2253, "step": 662 }, { "epoch": 0.8448550493787831, "grad_norm": 0.32356209805886055, "learning_rate": 1.9710237237108077e-05, "loss": 0.2074, "step": 663 }, { "epoch": 0.8461293405543167, "grad_norm": 0.3802155318181099, "learning_rate": 1.9708105304813333e-05, "loss": 0.2276, "step": 664 }, { "epoch": 0.8474036317298502, "grad_norm": 0.34319377847701893, "learning_rate": 1.970596567453391e-05, "loss": 0.2136, "step": 665 }, { "epoch": 0.8486779229053839, "grad_norm": 0.35720419490254246, "learning_rate": 1.970381834796642e-05, "loss": 0.2076, "step": 666 }, { "epoch": 0.8499522140809175, "grad_norm": 0.3524073967186098, "learning_rate": 1.9701663326813568e-05, "loss": 0.2282, "step": 667 }, { "epoch": 0.851226505256451, "grad_norm": 0.33579575880324586, "learning_rate": 1.969950061278417e-05, "loss": 0.2153, "step": 668 }, { "epoch": 0.8525007964319847, "grad_norm": 0.3246628283958329, "learning_rate": 1.9697330207593133e-05, "loss": 0.223, "step": 669 }, { "epoch": 0.8537750876075183, "grad_norm": 0.3175123885648078, "learning_rate": 1.969515211296147e-05, "loss": 0.1952, "step": 670 }, { "epoch": 0.8550493787830519, "grad_norm": 0.34088301974960616, "learning_rate": 1.9692966330616285e-05, "loss": 0.2384, "step": 671 }, { "epoch": 0.8563236699585856, "grad_norm": 0.29863229819400655, "learning_rate": 1.969077286229078e-05, "loss": 0.1787, "step": 672 }, { "epoch": 0.8575979611341191, "grad_norm": 0.33940370090713573, "learning_rate": 1.968857170972426e-05, "loss": 0.2107, "step": 673 }, { "epoch": 0.8588722523096528, "grad_norm": 0.38255025979124574, "learning_rate": 1.968636287466211e-05, "loss": 0.2224, "step": 674 }, { "epoch": 0.8601465434851864, "grad_norm": 0.34036677782944297, "learning_rate": 1.9684146358855814e-05, "loss": 0.2104, "step": 675 }, { "epoch": 0.86142083466072, "grad_norm": 0.37154749813202204, "learning_rate": 1.9681922164062945e-05, "loss": 0.2314, "step": 676 }, { "epoch": 0.8626951258362536, "grad_norm": 0.33741626069879943, "learning_rate": 1.9679690292047165e-05, "loss": 0.2186, "step": 677 }, { "epoch": 0.8639694170117872, "grad_norm": 0.35602739599137995, "learning_rate": 1.967745074457823e-05, "loss": 0.2352, "step": 678 }, { "epoch": 0.8652437081873208, "grad_norm": 0.3255063899946342, "learning_rate": 1.9675203523431964e-05, "loss": 0.2272, "step": 679 }, { "epoch": 0.8665179993628545, "grad_norm": 0.35000866359119764, "learning_rate": 1.9672948630390296e-05, "loss": 0.2109, "step": 680 }, { "epoch": 0.867792290538388, "grad_norm": 0.32926917937114264, "learning_rate": 1.9670686067241226e-05, "loss": 0.2152, "step": 681 }, { "epoch": 0.8690665817139216, "grad_norm": 0.31074551082553337, "learning_rate": 1.9668415835778845e-05, "loss": 0.2065, "step": 682 }, { "epoch": 0.8703408728894553, "grad_norm": 0.3681292188719548, "learning_rate": 1.9666137937803315e-05, "loss": 0.23, "step": 683 }, { "epoch": 0.8716151640649888, "grad_norm": 0.37750508292491125, "learning_rate": 1.9663852375120882e-05, "loss": 0.2309, "step": 684 }, { "epoch": 0.8728894552405224, "grad_norm": 0.32666141358612605, "learning_rate": 1.9661559149543873e-05, "loss": 0.2198, "step": 685 }, { "epoch": 0.8741637464160561, "grad_norm": 0.3254172643144567, "learning_rate": 1.9659258262890683e-05, "loss": 0.208, "step": 686 }, { "epoch": 0.8754380375915897, "grad_norm": 0.33195090963916485, "learning_rate": 1.9656949716985792e-05, "loss": 0.2092, "step": 687 }, { "epoch": 0.8767123287671232, "grad_norm": 0.3739671686069409, "learning_rate": 1.9654633513659743e-05, "loss": 0.2138, "step": 688 }, { "epoch": 0.8779866199426569, "grad_norm": 0.3343295725075647, "learning_rate": 1.9652309654749156e-05, "loss": 0.2259, "step": 689 }, { "epoch": 0.8792609111181905, "grad_norm": 0.3465178183407973, "learning_rate": 1.9649978142096726e-05, "loss": 0.2168, "step": 690 }, { "epoch": 0.8805352022937241, "grad_norm": 0.3919315115395554, "learning_rate": 1.9647638977551206e-05, "loss": 0.2602, "step": 691 }, { "epoch": 0.8818094934692577, "grad_norm": 0.34521985536852184, "learning_rate": 1.9645292162967426e-05, "loss": 0.2157, "step": 692 }, { "epoch": 0.8830837846447913, "grad_norm": 0.3588863503223611, "learning_rate": 1.964293770020628e-05, "loss": 0.2174, "step": 693 }, { "epoch": 0.884358075820325, "grad_norm": 0.3507155253415044, "learning_rate": 1.964057559113472e-05, "loss": 0.2249, "step": 694 }, { "epoch": 0.8856323669958586, "grad_norm": 0.3372131657951967, "learning_rate": 1.9638205837625774e-05, "loss": 0.2251, "step": 695 }, { "epoch": 0.8869066581713921, "grad_norm": 0.3155419250580592, "learning_rate": 1.9635828441558515e-05, "loss": 0.1985, "step": 696 }, { "epoch": 0.8881809493469258, "grad_norm": 0.3776749068799371, "learning_rate": 1.9633443404818094e-05, "loss": 0.2182, "step": 697 }, { "epoch": 0.8894552405224594, "grad_norm": 0.3224257724403975, "learning_rate": 1.9631050729295705e-05, "loss": 0.2047, "step": 698 }, { "epoch": 0.890729531697993, "grad_norm": 0.32498475840386704, "learning_rate": 1.962865041688861e-05, "loss": 0.2159, "step": 699 }, { "epoch": 0.8920038228735266, "grad_norm": 0.372214704714643, "learning_rate": 1.962624246950012e-05, "loss": 0.2179, "step": 700 }, { "epoch": 0.8932781140490602, "grad_norm": 0.3746533717183213, "learning_rate": 1.9623826889039606e-05, "loss": 0.2422, "step": 701 }, { "epoch": 0.8945524052245938, "grad_norm": 0.3603950053213733, "learning_rate": 1.9621403677422487e-05, "loss": 0.2258, "step": 702 }, { "epoch": 0.8958266964001275, "grad_norm": 0.33354464468320183, "learning_rate": 1.9618972836570233e-05, "loss": 0.2118, "step": 703 }, { "epoch": 0.897100987575661, "grad_norm": 0.31204290232276044, "learning_rate": 1.9616534368410364e-05, "loss": 0.1877, "step": 704 }, { "epoch": 0.8983752787511946, "grad_norm": 0.33406037528597743, "learning_rate": 1.961408827487645e-05, "loss": 0.224, "step": 705 }, { "epoch": 0.8996495699267283, "grad_norm": 0.33824992427379186, "learning_rate": 1.961163455790811e-05, "loss": 0.233, "step": 706 }, { "epoch": 0.9009238611022619, "grad_norm": 0.38003152087224223, "learning_rate": 1.9609173219450998e-05, "loss": 0.2188, "step": 707 }, { "epoch": 0.9021981522777954, "grad_norm": 0.35035398580329635, "learning_rate": 1.960670426145682e-05, "loss": 0.2438, "step": 708 }, { "epoch": 0.9034724434533291, "grad_norm": 0.32539592593773226, "learning_rate": 1.9604227685883325e-05, "loss": 0.2082, "step": 709 }, { "epoch": 0.9047467346288627, "grad_norm": 0.3232598566873783, "learning_rate": 1.9601743494694295e-05, "loss": 0.2127, "step": 710 }, { "epoch": 0.9060210258043963, "grad_norm": 0.3455722648048569, "learning_rate": 1.9599251689859556e-05, "loss": 0.2085, "step": 711 }, { "epoch": 0.9072953169799299, "grad_norm": 0.3569796814071175, "learning_rate": 1.959675227335497e-05, "loss": 0.2108, "step": 712 }, { "epoch": 0.9085696081554635, "grad_norm": 0.3156627337675661, "learning_rate": 1.9594245247162435e-05, "loss": 0.2119, "step": 713 }, { "epoch": 0.9098438993309972, "grad_norm": 0.32383364382283353, "learning_rate": 1.9591730613269878e-05, "loss": 0.1908, "step": 714 }, { "epoch": 0.9111181905065308, "grad_norm": 0.3565993921024688, "learning_rate": 1.9589208373671272e-05, "loss": 0.211, "step": 715 }, { "epoch": 0.9123924816820643, "grad_norm": 0.38242750448947777, "learning_rate": 1.9586678530366607e-05, "loss": 0.2145, "step": 716 }, { "epoch": 0.913666772857598, "grad_norm": 0.3686807906925094, "learning_rate": 1.9584141085361907e-05, "loss": 0.2378, "step": 717 }, { "epoch": 0.9149410640331316, "grad_norm": 0.335261975168693, "learning_rate": 1.9581596040669225e-05, "loss": 0.1971, "step": 718 }, { "epoch": 0.9162153552086651, "grad_norm": 0.34630477226099704, "learning_rate": 1.957904339830664e-05, "loss": 0.2334, "step": 719 }, { "epoch": 0.9174896463841988, "grad_norm": 0.33276645127076965, "learning_rate": 1.9576483160298246e-05, "loss": 0.215, "step": 720 }, { "epoch": 0.9187639375597324, "grad_norm": 0.3279858099593534, "learning_rate": 1.957391532867418e-05, "loss": 0.213, "step": 721 }, { "epoch": 0.920038228735266, "grad_norm": 0.3576087659430861, "learning_rate": 1.9571339905470587e-05, "loss": 0.2331, "step": 722 }, { "epoch": 0.9213125199107997, "grad_norm": 0.3389388598363892, "learning_rate": 1.956875689272963e-05, "loss": 0.2056, "step": 723 }, { "epoch": 0.9225868110863332, "grad_norm": 0.33513601942783344, "learning_rate": 1.9566166292499497e-05, "loss": 0.2162, "step": 724 }, { "epoch": 0.9238611022618668, "grad_norm": 0.32529462310621415, "learning_rate": 1.9563568106834385e-05, "loss": 0.2129, "step": 725 }, { "epoch": 0.9251353934374005, "grad_norm": 0.3270672952039892, "learning_rate": 1.956096233779451e-05, "loss": 0.2144, "step": 726 }, { "epoch": 0.926409684612934, "grad_norm": 0.35668238602850016, "learning_rate": 1.9558348987446104e-05, "loss": 0.2191, "step": 727 }, { "epoch": 0.9276839757884676, "grad_norm": 0.34290496109313795, "learning_rate": 1.955572805786141e-05, "loss": 0.2061, "step": 728 }, { "epoch": 0.9289582669640013, "grad_norm": 0.3270747300293292, "learning_rate": 1.955309955111867e-05, "loss": 0.2152, "step": 729 }, { "epoch": 0.9302325581395349, "grad_norm": 0.36695166694373665, "learning_rate": 1.9550463469302156e-05, "loss": 0.2335, "step": 730 }, { "epoch": 0.9315068493150684, "grad_norm": 0.3574887542112085, "learning_rate": 1.954781981450212e-05, "loss": 0.2142, "step": 731 }, { "epoch": 0.9327811404906021, "grad_norm": 0.3759534448283029, "learning_rate": 1.954516858881484e-05, "loss": 0.2523, "step": 732 }, { "epoch": 0.9340554316661357, "grad_norm": 0.3288967763368863, "learning_rate": 1.9542509794342588e-05, "loss": 0.231, "step": 733 }, { "epoch": 0.9353297228416694, "grad_norm": 0.3148627314988309, "learning_rate": 1.953984343319364e-05, "loss": 0.1893, "step": 734 }, { "epoch": 0.936604014017203, "grad_norm": 0.32973024667690615, "learning_rate": 1.953716950748227e-05, "loss": 0.2246, "step": 735 }, { "epoch": 0.9378783051927365, "grad_norm": 0.3367845653970647, "learning_rate": 1.953448801932875e-05, "loss": 0.2047, "step": 736 }, { "epoch": 0.9391525963682702, "grad_norm": 0.3163055251246609, "learning_rate": 1.953179897085936e-05, "loss": 0.2072, "step": 737 }, { "epoch": 0.9404268875438038, "grad_norm": 0.31770227439685417, "learning_rate": 1.952910236420635e-05, "loss": 0.2037, "step": 738 }, { "epoch": 0.9417011787193373, "grad_norm": 0.3404553102455374, "learning_rate": 1.952639820150799e-05, "loss": 0.2144, "step": 739 }, { "epoch": 0.942975469894871, "grad_norm": 0.3383909071241823, "learning_rate": 1.9523686484908523e-05, "loss": 0.2285, "step": 740 }, { "epoch": 0.9442497610704046, "grad_norm": 0.35855096415908205, "learning_rate": 1.9520967216558194e-05, "loss": 0.2288, "step": 741 }, { "epoch": 0.9455240522459382, "grad_norm": 0.3294296754231244, "learning_rate": 1.9518240398613226e-05, "loss": 0.2059, "step": 742 }, { "epoch": 0.9467983434214718, "grad_norm": 0.32836846874879777, "learning_rate": 1.9515506033235834e-05, "loss": 0.2094, "step": 743 }, { "epoch": 0.9480726345970054, "grad_norm": 0.3241620384315652, "learning_rate": 1.951276412259422e-05, "loss": 0.2103, "step": 744 }, { "epoch": 0.949346925772539, "grad_norm": 0.3555979949988686, "learning_rate": 1.9510014668862565e-05, "loss": 0.2164, "step": 745 }, { "epoch": 0.9506212169480727, "grad_norm": 0.34360454829117826, "learning_rate": 1.950725767422103e-05, "loss": 0.2122, "step": 746 }, { "epoch": 0.9518955081236062, "grad_norm": 0.3582832422305749, "learning_rate": 1.950449314085576e-05, "loss": 0.2225, "step": 747 }, { "epoch": 0.9531697992991398, "grad_norm": 0.3475733778341504, "learning_rate": 1.9501721070958868e-05, "loss": 0.2385, "step": 748 }, { "epoch": 0.9544440904746735, "grad_norm": 0.3744467946535353, "learning_rate": 1.9498941466728462e-05, "loss": 0.2262, "step": 749 }, { "epoch": 0.9557183816502071, "grad_norm": 0.39196542967901543, "learning_rate": 1.9496154330368605e-05, "loss": 0.2079, "step": 750 }, { "epoch": 0.9569926728257406, "grad_norm": 0.3255952550076811, "learning_rate": 1.949335966408934e-05, "loss": 0.2354, "step": 751 }, { "epoch": 0.9582669640012743, "grad_norm": 0.35199738775981265, "learning_rate": 1.949055747010669e-05, "loss": 0.2404, "step": 752 }, { "epoch": 0.9595412551768079, "grad_norm": 0.32253524732207045, "learning_rate": 1.9487747750642626e-05, "loss": 0.2122, "step": 753 }, { "epoch": 0.9608155463523416, "grad_norm": 0.30354885881800536, "learning_rate": 1.9484930507925105e-05, "loss": 0.2154, "step": 754 }, { "epoch": 0.9620898375278751, "grad_norm": 0.3611903965891733, "learning_rate": 1.948210574418804e-05, "loss": 0.226, "step": 755 }, { "epoch": 0.9633641287034087, "grad_norm": 0.32845457620057306, "learning_rate": 1.947927346167132e-05, "loss": 0.2227, "step": 756 }, { "epoch": 0.9646384198789424, "grad_norm": 0.3311585575585294, "learning_rate": 1.9476433662620775e-05, "loss": 0.2032, "step": 757 }, { "epoch": 0.965912711054476, "grad_norm": 0.3473529306441084, "learning_rate": 1.9473586349288213e-05, "loss": 0.2237, "step": 758 }, { "epoch": 0.9671870022300095, "grad_norm": 0.3716886234506083, "learning_rate": 1.9470731523931387e-05, "loss": 0.2387, "step": 759 }, { "epoch": 0.9684612934055432, "grad_norm": 0.3352359015521493, "learning_rate": 1.9467869188814024e-05, "loss": 0.221, "step": 760 }, { "epoch": 0.9697355845810768, "grad_norm": 0.37473393251467657, "learning_rate": 1.946499934620579e-05, "loss": 0.2093, "step": 761 }, { "epoch": 0.9710098757566104, "grad_norm": 0.3563280709085956, "learning_rate": 1.946212199838231e-05, "loss": 0.2455, "step": 762 }, { "epoch": 0.972284166932144, "grad_norm": 0.36206770454706316, "learning_rate": 1.945923714762516e-05, "loss": 0.2265, "step": 763 }, { "epoch": 0.9735584581076776, "grad_norm": 0.3510949454516064, "learning_rate": 1.945634479622187e-05, "loss": 0.2178, "step": 764 }, { "epoch": 0.9748327492832112, "grad_norm": 0.3523224893070203, "learning_rate": 1.9453444946465904e-05, "loss": 0.2124, "step": 765 }, { "epoch": 0.9761070404587449, "grad_norm": 2.1600801734268678, "learning_rate": 1.9450537600656688e-05, "loss": 0.2242, "step": 766 }, { "epoch": 0.9773813316342784, "grad_norm": 0.35169527866888495, "learning_rate": 1.9447622761099585e-05, "loss": 0.2203, "step": 767 }, { "epoch": 0.978655622809812, "grad_norm": 0.32869690640300764, "learning_rate": 1.9444700430105892e-05, "loss": 0.2036, "step": 768 }, { "epoch": 0.9799299139853457, "grad_norm": 0.3252771410484795, "learning_rate": 1.9441770609992866e-05, "loss": 0.2058, "step": 769 }, { "epoch": 0.9812042051608793, "grad_norm": 0.33916968478445064, "learning_rate": 1.9438833303083677e-05, "loss": 0.2384, "step": 770 }, { "epoch": 0.9824784963364128, "grad_norm": 0.30160541106748195, "learning_rate": 1.9435888511707456e-05, "loss": 0.1929, "step": 771 }, { "epoch": 0.9837527875119465, "grad_norm": 0.37146694509073674, "learning_rate": 1.943293623819925e-05, "loss": 0.2441, "step": 772 }, { "epoch": 0.9850270786874801, "grad_norm": 0.3121357055273709, "learning_rate": 1.9429976484900057e-05, "loss": 0.2187, "step": 773 }, { "epoch": 0.9863013698630136, "grad_norm": 0.33742343040045986, "learning_rate": 1.9427009254156783e-05, "loss": 0.2302, "step": 774 }, { "epoch": 0.9875756610385473, "grad_norm": 0.32671896875725437, "learning_rate": 1.9424034548322284e-05, "loss": 0.2237, "step": 775 }, { "epoch": 0.9888499522140809, "grad_norm": 0.3370928979422002, "learning_rate": 1.9421052369755335e-05, "loss": 0.213, "step": 776 }, { "epoch": 0.9901242433896146, "grad_norm": 0.34684972747553416, "learning_rate": 1.9418062720820636e-05, "loss": 0.2322, "step": 777 }, { "epoch": 0.9913985345651481, "grad_norm": 0.3162663436398391, "learning_rate": 1.9415065603888813e-05, "loss": 0.2059, "step": 778 }, { "epoch": 0.9926728257406817, "grad_norm": 0.35395907493019674, "learning_rate": 1.9412061021336404e-05, "loss": 0.2141, "step": 779 }, { "epoch": 0.9939471169162154, "grad_norm": 0.33618051273355165, "learning_rate": 1.940904897554589e-05, "loss": 0.205, "step": 780 }, { "epoch": 0.995221408091749, "grad_norm": 0.3051287950927697, "learning_rate": 1.940602946890564e-05, "loss": 0.197, "step": 781 }, { "epoch": 0.9964956992672825, "grad_norm": 0.3244040006620515, "learning_rate": 1.940300250380996e-05, "loss": 0.2355, "step": 782 }, { "epoch": 0.9977699904428162, "grad_norm": 0.3612783587790868, "learning_rate": 1.939996808265907e-05, "loss": 0.2188, "step": 783 }, { "epoch": 0.9990442816183498, "grad_norm": 0.32579050630920325, "learning_rate": 1.9396926207859085e-05, "loss": 0.2139, "step": 784 }, { "epoch": 1.0003185727938835, "grad_norm": 0.4077135703806676, "learning_rate": 1.939387688182205e-05, "loss": 0.2767, "step": 785 }, { "epoch": 1.001592863969417, "grad_norm": 0.36237363237440356, "learning_rate": 1.9390820106965908e-05, "loss": 0.1743, "step": 786 }, { "epoch": 1.0028671551449506, "grad_norm": 0.31681308450971557, "learning_rate": 1.9387755885714507e-05, "loss": 0.1523, "step": 787 }, { "epoch": 1.0041414463204843, "grad_norm": 0.33559845217014256, "learning_rate": 1.9384684220497605e-05, "loss": 0.1951, "step": 788 }, { "epoch": 1.0054157374960178, "grad_norm": 0.3051008290037514, "learning_rate": 1.9381605113750865e-05, "loss": 0.1451, "step": 789 }, { "epoch": 1.0066900286715514, "grad_norm": 0.36174090955524474, "learning_rate": 1.9378518567915842e-05, "loss": 0.1717, "step": 790 }, { "epoch": 1.0079643198470851, "grad_norm": 0.35959279835976915, "learning_rate": 1.9375424585439994e-05, "loss": 0.1484, "step": 791 }, { "epoch": 1.0092386110226186, "grad_norm": 0.9979215226879139, "learning_rate": 1.937232316877668e-05, "loss": 0.1463, "step": 792 }, { "epoch": 1.0105129021981523, "grad_norm": 0.3839745690999066, "learning_rate": 1.9369214320385143e-05, "loss": 0.1553, "step": 793 }, { "epoch": 1.011787193373686, "grad_norm": 0.3553166875627695, "learning_rate": 1.9366098042730534e-05, "loss": 0.158, "step": 794 }, { "epoch": 1.0130614845492194, "grad_norm": 0.35647528528362493, "learning_rate": 1.936297433828388e-05, "loss": 0.1626, "step": 795 }, { "epoch": 1.014335775724753, "grad_norm": 0.3812924097545644, "learning_rate": 1.9359843209522112e-05, "loss": 0.1567, "step": 796 }, { "epoch": 1.0156100669002868, "grad_norm": 0.36301309512796337, "learning_rate": 1.9356704658928036e-05, "loss": 0.1784, "step": 797 }, { "epoch": 1.0168843580758202, "grad_norm": 0.35328159425272077, "learning_rate": 1.935355868899034e-05, "loss": 0.1653, "step": 798 }, { "epoch": 1.018158649251354, "grad_norm": 0.3431100654014239, "learning_rate": 1.9350405302203613e-05, "loss": 0.1376, "step": 799 }, { "epoch": 1.0194329404268876, "grad_norm": 0.36952593986129917, "learning_rate": 1.934724450106831e-05, "loss": 0.155, "step": 800 }, { "epoch": 1.020707231602421, "grad_norm": 0.36293532517353616, "learning_rate": 1.934407628809077e-05, "loss": 0.1545, "step": 801 }, { "epoch": 1.0219815227779547, "grad_norm": 0.3760440896878709, "learning_rate": 1.934090066578321e-05, "loss": 0.1522, "step": 802 }, { "epoch": 1.0232558139534884, "grad_norm": 0.3699194475660459, "learning_rate": 1.933771763666372e-05, "loss": 0.1917, "step": 803 }, { "epoch": 1.0245301051290219, "grad_norm": 0.3615558358869559, "learning_rate": 1.933452720325626e-05, "loss": 0.1511, "step": 804 }, { "epoch": 1.0258043963045556, "grad_norm": 0.36603981502313915, "learning_rate": 1.9331329368090664e-05, "loss": 0.1676, "step": 805 }, { "epoch": 1.0270786874800892, "grad_norm": 0.3283824959663613, "learning_rate": 1.932812413370265e-05, "loss": 0.1695, "step": 806 }, { "epoch": 1.028352978655623, "grad_norm": 0.3328580629881427, "learning_rate": 1.9324911502633778e-05, "loss": 0.1631, "step": 807 }, { "epoch": 1.0296272698311564, "grad_norm": 0.34391160437179463, "learning_rate": 1.9321691477431487e-05, "loss": 0.1569, "step": 808 }, { "epoch": 1.03090156100669, "grad_norm": 0.35593996181703474, "learning_rate": 1.9318464060649077e-05, "loss": 0.1724, "step": 809 }, { "epoch": 1.0321758521822237, "grad_norm": 0.34296979035844755, "learning_rate": 1.9315229254845712e-05, "loss": 0.1682, "step": 810 }, { "epoch": 1.0334501433577572, "grad_norm": 0.3382647055441128, "learning_rate": 1.931198706258641e-05, "loss": 0.1386, "step": 811 }, { "epoch": 1.0347244345332909, "grad_norm": 0.3382752347910683, "learning_rate": 1.9308737486442045e-05, "loss": 0.1381, "step": 812 }, { "epoch": 1.0359987257088246, "grad_norm": 0.3472131401377067, "learning_rate": 1.9305480528989354e-05, "loss": 0.1572, "step": 813 }, { "epoch": 1.037273016884358, "grad_norm": 0.3726358965496347, "learning_rate": 1.930221619281092e-05, "loss": 0.1759, "step": 814 }, { "epoch": 1.0385473080598917, "grad_norm": 0.32765947508099985, "learning_rate": 1.9298944480495177e-05, "loss": 0.1541, "step": 815 }, { "epoch": 1.0398215992354254, "grad_norm": 0.35272290808031026, "learning_rate": 1.9295665394636414e-05, "loss": 0.1527, "step": 816 }, { "epoch": 1.0410958904109588, "grad_norm": 0.31253501757674723, "learning_rate": 1.9292378937834762e-05, "loss": 0.1469, "step": 817 }, { "epoch": 1.0423701815864925, "grad_norm": 0.3451715763911577, "learning_rate": 1.92890851126962e-05, "loss": 0.1689, "step": 818 }, { "epoch": 1.0436444727620262, "grad_norm": 0.3202448105619759, "learning_rate": 1.9285783921832537e-05, "loss": 0.1503, "step": 819 }, { "epoch": 1.0449187639375597, "grad_norm": 0.3461794393793676, "learning_rate": 1.9282475367861444e-05, "loss": 0.159, "step": 820 }, { "epoch": 1.0461930551130934, "grad_norm": 0.37369577551904076, "learning_rate": 1.927915945340641e-05, "loss": 0.1913, "step": 821 }, { "epoch": 1.047467346288627, "grad_norm": 0.36564276909644716, "learning_rate": 1.927583618109678e-05, "loss": 0.1741, "step": 822 }, { "epoch": 1.0487416374641605, "grad_norm": 0.31930718951845305, "learning_rate": 1.9272505553567716e-05, "loss": 0.1383, "step": 823 }, { "epoch": 1.0500159286396942, "grad_norm": 0.347602539624832, "learning_rate": 1.926916757346022e-05, "loss": 0.1756, "step": 824 }, { "epoch": 1.0512902198152279, "grad_norm": 0.32893178380687244, "learning_rate": 1.9265822243421122e-05, "loss": 0.1398, "step": 825 }, { "epoch": 1.0525645109907613, "grad_norm": 0.31152618156722905, "learning_rate": 1.926246956610309e-05, "loss": 0.1405, "step": 826 }, { "epoch": 1.053838802166295, "grad_norm": 0.332750629390068, "learning_rate": 1.92591095441646e-05, "loss": 0.1629, "step": 827 }, { "epoch": 1.0551130933418287, "grad_norm": 0.4246450982649215, "learning_rate": 1.9255742180269967e-05, "loss": 0.1748, "step": 828 }, { "epoch": 1.0563873845173621, "grad_norm": 0.3232018899741788, "learning_rate": 1.925236747708932e-05, "loss": 0.1456, "step": 829 }, { "epoch": 1.0576616756928958, "grad_norm": 0.30400297902683693, "learning_rate": 1.924898543729861e-05, "loss": 0.1442, "step": 830 }, { "epoch": 1.0589359668684295, "grad_norm": 0.2998468070316888, "learning_rate": 1.9245596063579608e-05, "loss": 0.1385, "step": 831 }, { "epoch": 1.060210258043963, "grad_norm": 0.35452408203089675, "learning_rate": 1.9242199358619897e-05, "loss": 0.1838, "step": 832 }, { "epoch": 1.0614845492194966, "grad_norm": 0.3326304447772767, "learning_rate": 1.9238795325112867e-05, "loss": 0.1509, "step": 833 }, { "epoch": 1.0627588403950303, "grad_norm": 0.33422212905548204, "learning_rate": 1.923538396575774e-05, "loss": 0.1507, "step": 834 }, { "epoch": 1.0640331315705638, "grad_norm": 0.32243623899458407, "learning_rate": 1.923196528325952e-05, "loss": 0.1421, "step": 835 }, { "epoch": 1.0653074227460975, "grad_norm": 0.345134548318952, "learning_rate": 1.922853928032904e-05, "loss": 0.1717, "step": 836 }, { "epoch": 1.0665817139216311, "grad_norm": 0.3281281784871195, "learning_rate": 1.9225105959682922e-05, "loss": 0.1564, "step": 837 }, { "epoch": 1.0678560050971646, "grad_norm": 0.3341428459833406, "learning_rate": 1.92216653240436e-05, "loss": 0.1727, "step": 838 }, { "epoch": 1.0691302962726983, "grad_norm": 0.3341298745775056, "learning_rate": 1.921821737613931e-05, "loss": 0.1555, "step": 839 }, { "epoch": 1.070404587448232, "grad_norm": 0.3083802287713848, "learning_rate": 1.921476211870408e-05, "loss": 0.1471, "step": 840 }, { "epoch": 1.0716788786237654, "grad_norm": 0.3675195737774269, "learning_rate": 1.921129955447773e-05, "loss": 0.1891, "step": 841 }, { "epoch": 1.0729531697992991, "grad_norm": 0.32008730723534956, "learning_rate": 1.9207829686205882e-05, "loss": 0.1532, "step": 842 }, { "epoch": 1.0742274609748328, "grad_norm": 0.33070598335983026, "learning_rate": 1.9204352516639954e-05, "loss": 0.1603, "step": 843 }, { "epoch": 1.0755017521503665, "grad_norm": 0.3658104672815928, "learning_rate": 1.920086804853714e-05, "loss": 0.159, "step": 844 }, { "epoch": 1.0767760433259, "grad_norm": 0.3358487434191787, "learning_rate": 1.9197376284660433e-05, "loss": 0.1514, "step": 845 }, { "epoch": 1.0780503345014336, "grad_norm": 0.3442186939410531, "learning_rate": 1.9193877227778604e-05, "loss": 0.1734, "step": 846 }, { "epoch": 1.0793246256769673, "grad_norm": 0.34643487296398173, "learning_rate": 1.9190370880666206e-05, "loss": 0.1668, "step": 847 }, { "epoch": 1.0805989168525008, "grad_norm": 0.3464627294333521, "learning_rate": 1.9186857246103586e-05, "loss": 0.1407, "step": 848 }, { "epoch": 1.0818732080280344, "grad_norm": 0.31973880545922284, "learning_rate": 1.918333632687685e-05, "loss": 0.1487, "step": 849 }, { "epoch": 1.0831474992035681, "grad_norm": 0.346465248958031, "learning_rate": 1.91798081257779e-05, "loss": 0.1697, "step": 850 }, { "epoch": 1.0844217903791016, "grad_norm": 0.34368088265497754, "learning_rate": 1.9176272645604387e-05, "loss": 0.1576, "step": 851 }, { "epoch": 1.0856960815546353, "grad_norm": 0.35843397948031214, "learning_rate": 1.917272988915976e-05, "loss": 0.1689, "step": 852 }, { "epoch": 1.086970372730169, "grad_norm": 0.3287164086198712, "learning_rate": 1.9169179859253232e-05, "loss": 0.1571, "step": 853 }, { "epoch": 1.0882446639057024, "grad_norm": 0.34474489028229605, "learning_rate": 1.9165622558699763e-05, "loss": 0.155, "step": 854 }, { "epoch": 1.089518955081236, "grad_norm": 0.3450907932892261, "learning_rate": 1.9162057990320107e-05, "loss": 0.1698, "step": 855 }, { "epoch": 1.0907932462567698, "grad_norm": 0.37486997041131137, "learning_rate": 1.915848615694076e-05, "loss": 0.1658, "step": 856 }, { "epoch": 1.0920675374323032, "grad_norm": 0.33910279580380387, "learning_rate": 1.9154907061393986e-05, "loss": 0.1465, "step": 857 }, { "epoch": 1.093341828607837, "grad_norm": 0.3855905837821074, "learning_rate": 1.9151320706517814e-05, "loss": 0.2001, "step": 858 }, { "epoch": 1.0946161197833706, "grad_norm": 0.3345976465133606, "learning_rate": 1.9147727095156014e-05, "loss": 0.1611, "step": 859 }, { "epoch": 1.095890410958904, "grad_norm": 0.31846093308391754, "learning_rate": 1.9144126230158127e-05, "loss": 0.1448, "step": 860 }, { "epoch": 1.0971647021344377, "grad_norm": 0.31998584352863024, "learning_rate": 1.9140518114379433e-05, "loss": 0.1477, "step": 861 }, { "epoch": 1.0984389933099714, "grad_norm": 0.3246885741673696, "learning_rate": 1.913690275068097e-05, "loss": 0.1598, "step": 862 }, { "epoch": 1.0997132844855049, "grad_norm": 0.3335677154847916, "learning_rate": 1.9133280141929512e-05, "loss": 0.1632, "step": 863 }, { "epoch": 1.1009875756610386, "grad_norm": 0.33670476238101704, "learning_rate": 1.912965029099759e-05, "loss": 0.1594, "step": 864 }, { "epoch": 1.1022618668365722, "grad_norm": 0.3246858289285923, "learning_rate": 1.9126013200763473e-05, "loss": 0.1619, "step": 865 }, { "epoch": 1.1035361580121057, "grad_norm": 0.31098351992143836, "learning_rate": 1.9122368874111172e-05, "loss": 0.1369, "step": 866 }, { "epoch": 1.1048104491876394, "grad_norm": 0.3531418904004904, "learning_rate": 1.911871731393043e-05, "loss": 0.1675, "step": 867 }, { "epoch": 1.106084740363173, "grad_norm": 0.3857917738600464, "learning_rate": 1.9115058523116734e-05, "loss": 0.1864, "step": 868 }, { "epoch": 1.1073590315387065, "grad_norm": 0.32651776005438615, "learning_rate": 1.9111392504571295e-05, "loss": 0.1525, "step": 869 }, { "epoch": 1.1086333227142402, "grad_norm": 0.3617770679600062, "learning_rate": 1.9107719261201066e-05, "loss": 0.1645, "step": 870 }, { "epoch": 1.1099076138897739, "grad_norm": 0.34230701603213765, "learning_rate": 1.9104038795918723e-05, "loss": 0.1446, "step": 871 }, { "epoch": 1.1111819050653073, "grad_norm": 0.31999173288440136, "learning_rate": 1.9100351111642666e-05, "loss": 0.1471, "step": 872 }, { "epoch": 1.112456196240841, "grad_norm": 0.3395249899084806, "learning_rate": 1.909665621129703e-05, "loss": 0.1517, "step": 873 }, { "epoch": 1.1137304874163747, "grad_norm": 0.3507546591743051, "learning_rate": 1.9092954097811654e-05, "loss": 0.1513, "step": 874 }, { "epoch": 1.1150047785919082, "grad_norm": 0.3522105903349611, "learning_rate": 1.908924477412211e-05, "loss": 0.1638, "step": 875 }, { "epoch": 1.1162790697674418, "grad_norm": 0.3609794096946363, "learning_rate": 1.908552824316969e-05, "loss": 0.1724, "step": 876 }, { "epoch": 1.1175533609429755, "grad_norm": 0.34105709672771367, "learning_rate": 1.908180450790139e-05, "loss": 0.1656, "step": 877 }, { "epoch": 1.118827652118509, "grad_norm": 0.32570041481435424, "learning_rate": 1.9078073571269922e-05, "loss": 0.1487, "step": 878 }, { "epoch": 1.1201019432940427, "grad_norm": 0.36591855914177657, "learning_rate": 1.9074335436233715e-05, "loss": 0.1735, "step": 879 }, { "epoch": 1.1213762344695763, "grad_norm": 0.31401368045692185, "learning_rate": 1.90705901057569e-05, "loss": 0.1515, "step": 880 }, { "epoch": 1.1226505256451098, "grad_norm": 0.35288329381425376, "learning_rate": 1.9066837582809317e-05, "loss": 0.1523, "step": 881 }, { "epoch": 1.1239248168206435, "grad_norm": 0.4025643604241915, "learning_rate": 1.9063077870366504e-05, "loss": 0.168, "step": 882 }, { "epoch": 1.1251991079961772, "grad_norm": 0.34920511771443136, "learning_rate": 1.9059310971409696e-05, "loss": 0.1563, "step": 883 }, { "epoch": 1.1264733991717106, "grad_norm": 0.3249655975564027, "learning_rate": 1.9055536888925844e-05, "loss": 0.1593, "step": 884 }, { "epoch": 1.1277476903472443, "grad_norm": 0.3517386958836968, "learning_rate": 1.905175562590758e-05, "loss": 0.1483, "step": 885 }, { "epoch": 1.129021981522778, "grad_norm": 0.3576666313238568, "learning_rate": 1.9047967185353236e-05, "loss": 0.1666, "step": 886 }, { "epoch": 1.1302962726983115, "grad_norm": 0.31895398467860236, "learning_rate": 1.904417157026683e-05, "loss": 0.1484, "step": 887 }, { "epoch": 1.1315705638738451, "grad_norm": 0.35298980650982575, "learning_rate": 1.9040368783658075e-05, "loss": 0.1637, "step": 888 }, { "epoch": 1.1328448550493788, "grad_norm": 0.3554424850495419, "learning_rate": 1.903655882854237e-05, "loss": 0.1736, "step": 889 }, { "epoch": 1.1341191462249123, "grad_norm": 0.3419586958558602, "learning_rate": 1.903274170794079e-05, "loss": 0.1565, "step": 890 }, { "epoch": 1.135393437400446, "grad_norm": 0.3138815585742941, "learning_rate": 1.90289174248801e-05, "loss": 0.1418, "step": 891 }, { "epoch": 1.1366677285759796, "grad_norm": 0.3440510640968001, "learning_rate": 1.9025085982392753e-05, "loss": 0.1643, "step": 892 }, { "epoch": 1.137942019751513, "grad_norm": 0.33714222699667795, "learning_rate": 1.9021247383516856e-05, "loss": 0.1696, "step": 893 }, { "epoch": 1.1392163109270468, "grad_norm": 0.34755479551060586, "learning_rate": 1.9017401631296208e-05, "loss": 0.1756, "step": 894 }, { "epoch": 1.1404906021025805, "grad_norm": 0.32768513185096915, "learning_rate": 1.9013548728780275e-05, "loss": 0.1502, "step": 895 }, { "epoch": 1.1417648932781141, "grad_norm": 0.36514608150269834, "learning_rate": 1.900968867902419e-05, "loss": 0.1656, "step": 896 }, { "epoch": 1.1430391844536476, "grad_norm": 0.37830595908127934, "learning_rate": 1.9005821485088767e-05, "loss": 0.1721, "step": 897 }, { "epoch": 1.1443134756291813, "grad_norm": 0.35464156635795835, "learning_rate": 1.9001947150040462e-05, "loss": 0.152, "step": 898 }, { "epoch": 1.145587766804715, "grad_norm": 0.3298193106544779, "learning_rate": 1.8998065676951416e-05, "loss": 0.1694, "step": 899 }, { "epoch": 1.1468620579802484, "grad_norm": 0.3317207583539093, "learning_rate": 1.8994177068899414e-05, "loss": 0.1533, "step": 900 }, { "epoch": 1.148136349155782, "grad_norm": 0.3907045206018723, "learning_rate": 1.8990281328967906e-05, "loss": 0.182, "step": 901 }, { "epoch": 1.1494106403313158, "grad_norm": 0.33905732765874463, "learning_rate": 1.8986378460246e-05, "loss": 0.1523, "step": 902 }, { "epoch": 1.1506849315068493, "grad_norm": 0.3201447776660888, "learning_rate": 1.898246846582844e-05, "loss": 0.1511, "step": 903 }, { "epoch": 1.151959222682383, "grad_norm": 0.33101241088851213, "learning_rate": 1.8978551348815653e-05, "loss": 0.1434, "step": 904 }, { "epoch": 1.1532335138579166, "grad_norm": 0.3169989517076557, "learning_rate": 1.897462711231368e-05, "loss": 0.148, "step": 905 }, { "epoch": 1.15450780503345, "grad_norm": 0.32853298045469614, "learning_rate": 1.897069575943422e-05, "loss": 0.161, "step": 906 }, { "epoch": 1.1557820962089838, "grad_norm": 0.3302054792614519, "learning_rate": 1.8966757293294623e-05, "loss": 0.1556, "step": 907 }, { "epoch": 1.1570563873845174, "grad_norm": 0.3484638523964253, "learning_rate": 1.896281171701787e-05, "loss": 0.1643, "step": 908 }, { "epoch": 1.158330678560051, "grad_norm": 0.31969485624836824, "learning_rate": 1.895885903373258e-05, "loss": 0.1588, "step": 909 }, { "epoch": 1.1596049697355846, "grad_norm": 0.3298762803102977, "learning_rate": 1.895489924657301e-05, "loss": 0.1647, "step": 910 }, { "epoch": 1.1608792609111183, "grad_norm": 0.3145878366043628, "learning_rate": 1.8950932358679055e-05, "loss": 0.1617, "step": 911 }, { "epoch": 1.1621535520866517, "grad_norm": 0.3190635230329701, "learning_rate": 1.894695837319623e-05, "loss": 0.151, "step": 912 }, { "epoch": 1.1634278432621854, "grad_norm": 0.328849052289355, "learning_rate": 1.8942977293275687e-05, "loss": 0.167, "step": 913 }, { "epoch": 1.164702134437719, "grad_norm": 0.3441361519362561, "learning_rate": 1.8938989122074195e-05, "loss": 0.1622, "step": 914 }, { "epoch": 1.1659764256132525, "grad_norm": 0.35477149744515696, "learning_rate": 1.8934993862754155e-05, "loss": 0.1703, "step": 915 }, { "epoch": 1.1672507167887862, "grad_norm": 0.331355814641835, "learning_rate": 1.8930991518483586e-05, "loss": 0.1629, "step": 916 }, { "epoch": 1.16852500796432, "grad_norm": 0.3035883944011101, "learning_rate": 1.8926982092436117e-05, "loss": 0.1403, "step": 917 }, { "epoch": 1.1697992991398534, "grad_norm": 0.334574632663933, "learning_rate": 1.8922965587791e-05, "loss": 0.1515, "step": 918 }, { "epoch": 1.171073590315387, "grad_norm": 0.37724339030369636, "learning_rate": 1.8918942007733103e-05, "loss": 0.1745, "step": 919 }, { "epoch": 1.1723478814909207, "grad_norm": 0.3362314052161255, "learning_rate": 1.8914911355452895e-05, "loss": 0.1597, "step": 920 }, { "epoch": 1.1736221726664542, "grad_norm": 0.33411523642152824, "learning_rate": 1.8910873634146464e-05, "loss": 0.1591, "step": 921 }, { "epoch": 1.1748964638419879, "grad_norm": 0.35392436830210205, "learning_rate": 1.890682884701549e-05, "loss": 0.169, "step": 922 }, { "epoch": 1.1761707550175216, "grad_norm": 0.33461237729664317, "learning_rate": 1.890277699726727e-05, "loss": 0.1657, "step": 923 }, { "epoch": 1.1774450461930552, "grad_norm": 0.343673773251541, "learning_rate": 1.8898718088114688e-05, "loss": 0.1707, "step": 924 }, { "epoch": 1.1787193373685887, "grad_norm": 0.342338074053164, "learning_rate": 1.8894652122776236e-05, "loss": 0.1536, "step": 925 }, { "epoch": 1.1799936285441224, "grad_norm": 0.3498342075625197, "learning_rate": 1.8890579104475996e-05, "loss": 0.1755, "step": 926 }, { "epoch": 1.181267919719656, "grad_norm": 0.3227463245026185, "learning_rate": 1.888649903644364e-05, "loss": 0.1706, "step": 927 }, { "epoch": 1.1825422108951895, "grad_norm": 0.3451291730554549, "learning_rate": 1.8882411921914442e-05, "loss": 0.157, "step": 928 }, { "epoch": 1.1838165020707232, "grad_norm": 0.33711959823175014, "learning_rate": 1.8878317764129247e-05, "loss": 0.1662, "step": 929 }, { "epoch": 1.1850907932462569, "grad_norm": 0.3541357526746469, "learning_rate": 1.8874216566334502e-05, "loss": 0.1792, "step": 930 }, { "epoch": 1.1863650844217903, "grad_norm": 0.3494943441167571, "learning_rate": 1.887010833178222e-05, "loss": 0.1668, "step": 931 }, { "epoch": 1.187639375597324, "grad_norm": 0.3313143298444086, "learning_rate": 1.8865993063730003e-05, "loss": 0.15, "step": 932 }, { "epoch": 1.1889136667728577, "grad_norm": 0.3243376822888926, "learning_rate": 1.886187076544103e-05, "loss": 0.1497, "step": 933 }, { "epoch": 1.1901879579483912, "grad_norm": 0.338867550187727, "learning_rate": 1.885774144018405e-05, "loss": 0.1601, "step": 934 }, { "epoch": 1.1914622491239248, "grad_norm": 0.34920208150203613, "learning_rate": 1.8853605091233392e-05, "loss": 0.1725, "step": 935 }, { "epoch": 1.1927365402994585, "grad_norm": 0.3249518059793066, "learning_rate": 1.8849461721868948e-05, "loss": 0.16, "step": 936 }, { "epoch": 1.194010831474992, "grad_norm": 0.3533320819172393, "learning_rate": 1.8845311335376174e-05, "loss": 0.1852, "step": 937 }, { "epoch": 1.1952851226505257, "grad_norm": 0.3422123476825937, "learning_rate": 1.8841153935046098e-05, "loss": 0.1555, "step": 938 }, { "epoch": 1.1965594138260593, "grad_norm": 0.3123469741369779, "learning_rate": 1.8836989524175307e-05, "loss": 0.1495, "step": 939 }, { "epoch": 1.1978337050015928, "grad_norm": 0.34999541699153347, "learning_rate": 1.8832818106065943e-05, "loss": 0.1705, "step": 940 }, { "epoch": 1.1991079961771265, "grad_norm": 0.33427079629159984, "learning_rate": 1.882863968402571e-05, "loss": 0.1697, "step": 941 }, { "epoch": 1.2003822873526602, "grad_norm": 0.32090464564874577, "learning_rate": 1.8824454261367862e-05, "loss": 0.155, "step": 942 }, { "epoch": 1.2016565785281936, "grad_norm": 0.32693329544830746, "learning_rate": 1.8820261841411203e-05, "loss": 0.1573, "step": 943 }, { "epoch": 1.2029308697037273, "grad_norm": 0.3594018728061415, "learning_rate": 1.881606242748009e-05, "loss": 0.176, "step": 944 }, { "epoch": 1.204205160879261, "grad_norm": 0.32835405105463983, "learning_rate": 1.8811856022904423e-05, "loss": 0.1676, "step": 945 }, { "epoch": 1.2054794520547945, "grad_norm": 0.2914306718200428, "learning_rate": 1.8807642631019648e-05, "loss": 0.1445, "step": 946 }, { "epoch": 1.2067537432303281, "grad_norm": 0.32181506877496024, "learning_rate": 1.8803422255166745e-05, "loss": 0.1602, "step": 947 }, { "epoch": 1.2080280344058618, "grad_norm": 0.3310145003308486, "learning_rate": 1.8799194898692238e-05, "loss": 0.171, "step": 948 }, { "epoch": 1.2093023255813953, "grad_norm": 0.33098212547735734, "learning_rate": 1.8794960564948183e-05, "loss": 0.1591, "step": 949 }, { "epoch": 1.210576616756929, "grad_norm": 0.33207808758884366, "learning_rate": 1.8790719257292175e-05, "loss": 0.1635, "step": 950 }, { "epoch": 1.2118509079324626, "grad_norm": 0.3526670399491904, "learning_rate": 1.8786470979087327e-05, "loss": 0.1724, "step": 951 }, { "epoch": 1.213125199107996, "grad_norm": 0.33247389293726315, "learning_rate": 1.8782215733702286e-05, "loss": 0.1485, "step": 952 }, { "epoch": 1.2143994902835298, "grad_norm": 0.32551215946602224, "learning_rate": 1.877795352451123e-05, "loss": 0.1585, "step": 953 }, { "epoch": 1.2156737814590635, "grad_norm": 0.3292008630369106, "learning_rate": 1.8773684354893848e-05, "loss": 0.1692, "step": 954 }, { "epoch": 1.216948072634597, "grad_norm": 0.35774549438472797, "learning_rate": 1.8769408228235354e-05, "loss": 0.1817, "step": 955 }, { "epoch": 1.2182223638101306, "grad_norm": 0.34415329944815326, "learning_rate": 1.8765125147926477e-05, "loss": 0.1696, "step": 956 }, { "epoch": 1.2194966549856643, "grad_norm": 0.3378742102877926, "learning_rate": 1.8760835117363463e-05, "loss": 0.1699, "step": 957 }, { "epoch": 1.2207709461611977, "grad_norm": 0.3137944584415906, "learning_rate": 1.875653813994806e-05, "loss": 0.1423, "step": 958 }, { "epoch": 1.2220452373367314, "grad_norm": 0.3279757105726268, "learning_rate": 1.8752234219087538e-05, "loss": 0.1555, "step": 959 }, { "epoch": 1.223319528512265, "grad_norm": 0.3325433684028432, "learning_rate": 1.874792335819466e-05, "loss": 0.1692, "step": 960 }, { "epoch": 1.2245938196877986, "grad_norm": 0.3579427452633211, "learning_rate": 1.87436055606877e-05, "loss": 0.1811, "step": 961 }, { "epoch": 1.2258681108633322, "grad_norm": 0.32466191174581854, "learning_rate": 1.873928082999043e-05, "loss": 0.1597, "step": 962 }, { "epoch": 1.227142402038866, "grad_norm": 0.3892743906266123, "learning_rate": 1.8734949169532123e-05, "loss": 0.1727, "step": 963 }, { "epoch": 1.2284166932143994, "grad_norm": 0.3668024567654228, "learning_rate": 1.8730610582747538e-05, "loss": 0.1751, "step": 964 }, { "epoch": 1.229690984389933, "grad_norm": 0.3383915436364459, "learning_rate": 1.8726265073076932e-05, "loss": 0.1673, "step": 965 }, { "epoch": 1.2309652755654668, "grad_norm": 0.3302133911415919, "learning_rate": 1.8721912643966055e-05, "loss": 0.1781, "step": 966 }, { "epoch": 1.2322395667410002, "grad_norm": 0.35842774485301926, "learning_rate": 1.8717553298866136e-05, "loss": 0.1946, "step": 967 }, { "epoch": 1.233513857916534, "grad_norm": 0.3385724347412503, "learning_rate": 1.8713187041233896e-05, "loss": 0.1653, "step": 968 }, { "epoch": 1.2347881490920676, "grad_norm": 0.3356809016737896, "learning_rate": 1.8708813874531528e-05, "loss": 0.169, "step": 969 }, { "epoch": 1.236062440267601, "grad_norm": 0.3577307134459536, "learning_rate": 1.8704433802226714e-05, "loss": 0.1658, "step": 970 }, { "epoch": 1.2373367314431347, "grad_norm": 0.3631993782215604, "learning_rate": 1.8700046827792604e-05, "loss": 0.1764, "step": 971 }, { "epoch": 1.2386110226186684, "grad_norm": 0.3150820424593532, "learning_rate": 1.8695652954707823e-05, "loss": 0.1461, "step": 972 }, { "epoch": 1.2398853137942019, "grad_norm": 0.3437412922700779, "learning_rate": 1.8691252186456465e-05, "loss": 0.1467, "step": 973 }, { "epoch": 1.2411596049697355, "grad_norm": 0.33907472332076505, "learning_rate": 1.86868445265281e-05, "loss": 0.168, "step": 974 }, { "epoch": 1.2424338961452692, "grad_norm": 0.32849545533444374, "learning_rate": 1.8682429978417748e-05, "loss": 0.1647, "step": 975 }, { "epoch": 1.2437081873208027, "grad_norm": 0.3578203769441888, "learning_rate": 1.86780085456259e-05, "loss": 0.1502, "step": 976 }, { "epoch": 1.2449824784963364, "grad_norm": 0.3716259038749953, "learning_rate": 1.867358023165851e-05, "loss": 0.1741, "step": 977 }, { "epoch": 1.24625676967187, "grad_norm": 0.3650894527654957, "learning_rate": 1.866914504002698e-05, "loss": 0.1726, "step": 978 }, { "epoch": 1.2475310608474037, "grad_norm": 0.34794711035213194, "learning_rate": 1.866470297424817e-05, "loss": 0.1767, "step": 979 }, { "epoch": 1.2488053520229372, "grad_norm": 0.3669800801806064, "learning_rate": 1.866025403784439e-05, "loss": 0.1558, "step": 980 }, { "epoch": 1.2500796431984709, "grad_norm": 0.3401024366830932, "learning_rate": 1.8655798234343394e-05, "loss": 0.1404, "step": 981 }, { "epoch": 1.2513539343740043, "grad_norm": 0.3316438520002876, "learning_rate": 1.865133556727839e-05, "loss": 0.1595, "step": 982 }, { "epoch": 1.252628225549538, "grad_norm": 0.35079447773608824, "learning_rate": 1.864686604018802e-05, "loss": 0.1495, "step": 983 }, { "epoch": 1.2539025167250717, "grad_norm": 0.31105469108793904, "learning_rate": 1.864238965661637e-05, "loss": 0.1535, "step": 984 }, { "epoch": 1.2551768079006052, "grad_norm": 0.3356564884542775, "learning_rate": 1.8637906420112963e-05, "loss": 0.1579, "step": 985 }, { "epoch": 1.2564510990761388, "grad_norm": 0.35613428536891734, "learning_rate": 1.8633416334232754e-05, "loss": 0.1725, "step": 986 }, { "epoch": 1.2577253902516725, "grad_norm": 0.3674103117061376, "learning_rate": 1.862891940253613e-05, "loss": 0.1704, "step": 987 }, { "epoch": 1.2589996814272062, "grad_norm": 0.3433543063526119, "learning_rate": 1.862441562858891e-05, "loss": 0.1735, "step": 988 }, { "epoch": 1.2602739726027397, "grad_norm": 0.35790239814547675, "learning_rate": 1.8619905015962327e-05, "loss": 0.1617, "step": 989 }, { "epoch": 1.2615482637782733, "grad_norm": 0.322706806548815, "learning_rate": 1.861538756823305e-05, "loss": 0.1504, "step": 990 }, { "epoch": 1.262822554953807, "grad_norm": 0.33059405056609004, "learning_rate": 1.8610863288983163e-05, "loss": 0.1589, "step": 991 }, { "epoch": 1.2640968461293405, "grad_norm": 0.3199611263301758, "learning_rate": 1.8606332181800165e-05, "loss": 0.152, "step": 992 }, { "epoch": 1.2653711373048742, "grad_norm": 0.3267561376963887, "learning_rate": 1.8601794250276968e-05, "loss": 0.1489, "step": 993 }, { "epoch": 1.2666454284804078, "grad_norm": 0.35622299227645005, "learning_rate": 1.8597249498011906e-05, "loss": 0.1713, "step": 994 }, { "epoch": 1.2679197196559413, "grad_norm": 0.3285274748478752, "learning_rate": 1.8592697928608702e-05, "loss": 0.159, "step": 995 }, { "epoch": 1.269194010831475, "grad_norm": 0.3153725707735201, "learning_rate": 1.8588139545676506e-05, "loss": 0.1443, "step": 996 }, { "epoch": 1.2704683020070087, "grad_norm": 0.3472512988399753, "learning_rate": 1.8583574352829855e-05, "loss": 0.1623, "step": 997 }, { "epoch": 1.2717425931825423, "grad_norm": 0.3396920352132982, "learning_rate": 1.8579002353688695e-05, "loss": 0.1629, "step": 998 }, { "epoch": 1.2730168843580758, "grad_norm": 0.32259483043951365, "learning_rate": 1.8574423551878363e-05, "loss": 0.1588, "step": 999 }, { "epoch": 1.2742911755336095, "grad_norm": 0.3351528757326111, "learning_rate": 1.8569837951029597e-05, "loss": 0.1628, "step": 1000 }, { "epoch": 1.2755654667091432, "grad_norm": 0.3496014059930826, "learning_rate": 1.8565245554778516e-05, "loss": 0.1569, "step": 1001 }, { "epoch": 1.2768397578846766, "grad_norm": 0.4202949814502593, "learning_rate": 1.8560646366766637e-05, "loss": 0.1908, "step": 1002 }, { "epoch": 1.2781140490602103, "grad_norm": 0.3245964011690443, "learning_rate": 1.855604039064086e-05, "loss": 0.1464, "step": 1003 }, { "epoch": 1.279388340235744, "grad_norm": 0.3199509038301857, "learning_rate": 1.8551427630053464e-05, "loss": 0.1545, "step": 1004 }, { "epoch": 1.2806626314112775, "grad_norm": 0.33386664319766324, "learning_rate": 1.8546808088662112e-05, "loss": 0.1565, "step": 1005 }, { "epoch": 1.2819369225868111, "grad_norm": 0.354141953339447, "learning_rate": 1.8542181770129838e-05, "loss": 0.159, "step": 1006 }, { "epoch": 1.2832112137623448, "grad_norm": 0.3345815291319348, "learning_rate": 1.8537548678125058e-05, "loss": 0.1724, "step": 1007 }, { "epoch": 1.2844855049378783, "grad_norm": 0.3344135146284365, "learning_rate": 1.8532908816321557e-05, "loss": 0.1754, "step": 1008 }, { "epoch": 1.285759796113412, "grad_norm": 0.31549490421015675, "learning_rate": 1.8528262188398484e-05, "loss": 0.1395, "step": 1009 }, { "epoch": 1.2870340872889456, "grad_norm": 0.3509136781316956, "learning_rate": 1.852360879804035e-05, "loss": 0.174, "step": 1010 }, { "epoch": 1.288308378464479, "grad_norm": 0.35985136337367063, "learning_rate": 1.851894864893704e-05, "loss": 0.2021, "step": 1011 }, { "epoch": 1.2895826696400128, "grad_norm": 0.3262264956993357, "learning_rate": 1.851428174478379e-05, "loss": 0.1515, "step": 1012 }, { "epoch": 1.2908569608155465, "grad_norm": 0.3288521825189567, "learning_rate": 1.850960808928119e-05, "loss": 0.149, "step": 1013 }, { "epoch": 1.29213125199108, "grad_norm": 0.3595483132271591, "learning_rate": 1.8504927686135194e-05, "loss": 0.1732, "step": 1014 }, { "epoch": 1.2934055431666136, "grad_norm": 0.305748896324228, "learning_rate": 1.8500240539057093e-05, "loss": 0.1497, "step": 1015 }, { "epoch": 1.2946798343421473, "grad_norm": 0.3272890703468722, "learning_rate": 1.849554665176354e-05, "loss": 0.1624, "step": 1016 }, { "epoch": 1.2959541255176807, "grad_norm": 0.3640776330718339, "learning_rate": 1.8490846027976517e-05, "loss": 0.1655, "step": 1017 }, { "epoch": 1.2972284166932144, "grad_norm": 0.3258596357311298, "learning_rate": 1.8486138671423366e-05, "loss": 0.1543, "step": 1018 }, { "epoch": 1.298502707868748, "grad_norm": 0.33078991349676173, "learning_rate": 1.848142458583675e-05, "loss": 0.1517, "step": 1019 }, { "epoch": 1.2997769990442816, "grad_norm": 0.35354469432662605, "learning_rate": 1.8476703774954676e-05, "loss": 0.1839, "step": 1020 }, { "epoch": 1.3010512902198152, "grad_norm": 0.30541035720406384, "learning_rate": 1.8471976242520484e-05, "loss": 0.1523, "step": 1021 }, { "epoch": 1.302325581395349, "grad_norm": 0.3312912014762265, "learning_rate": 1.8467241992282842e-05, "loss": 0.1532, "step": 1022 }, { "epoch": 1.3035998725708824, "grad_norm": 0.34515042236028487, "learning_rate": 1.846250102799575e-05, "loss": 0.1661, "step": 1023 }, { "epoch": 1.304874163746416, "grad_norm": 0.3364609392601133, "learning_rate": 1.845775335341852e-05, "loss": 0.1679, "step": 1024 }, { "epoch": 1.3061484549219498, "grad_norm": 0.331329063443882, "learning_rate": 1.84529989723158e-05, "loss": 0.1656, "step": 1025 }, { "epoch": 1.3074227460974832, "grad_norm": 0.3138696100702932, "learning_rate": 1.8448237888457546e-05, "loss": 0.1535, "step": 1026 }, { "epoch": 1.308697037273017, "grad_norm": 0.3287092996366417, "learning_rate": 1.8443470105619027e-05, "loss": 0.1532, "step": 1027 }, { "epoch": 1.3099713284485506, "grad_norm": 0.35541767911785616, "learning_rate": 1.8438695627580832e-05, "loss": 0.1776, "step": 1028 }, { "epoch": 1.311245619624084, "grad_norm": 0.344470981285816, "learning_rate": 1.843391445812886e-05, "loss": 0.1861, "step": 1029 }, { "epoch": 1.3125199107996177, "grad_norm": 0.32259179406017224, "learning_rate": 1.8429126601054302e-05, "loss": 0.1614, "step": 1030 }, { "epoch": 1.3137942019751514, "grad_norm": 0.33513129183377127, "learning_rate": 1.8424332060153664e-05, "loss": 0.1691, "step": 1031 }, { "epoch": 1.3150684931506849, "grad_norm": 0.326792892951927, "learning_rate": 1.841953083922875e-05, "loss": 0.1558, "step": 1032 }, { "epoch": 1.3163427843262185, "grad_norm": 0.35929239050901174, "learning_rate": 1.841472294208666e-05, "loss": 0.1921, "step": 1033 }, { "epoch": 1.3176170755017522, "grad_norm": 0.2992135023670159, "learning_rate": 1.8409908372539788e-05, "loss": 0.1451, "step": 1034 }, { "epoch": 1.3188913666772857, "grad_norm": 0.3373249810383071, "learning_rate": 1.8405087134405815e-05, "loss": 0.1731, "step": 1035 }, { "epoch": 1.3201656578528194, "grad_norm": 0.3052835066872669, "learning_rate": 1.8400259231507716e-05, "loss": 0.1465, "step": 1036 }, { "epoch": 1.321439949028353, "grad_norm": 0.3706680557201969, "learning_rate": 1.839542466767375e-05, "loss": 0.1825, "step": 1037 }, { "epoch": 1.3227142402038865, "grad_norm": 0.34872632953596056, "learning_rate": 1.8390583446737448e-05, "loss": 0.1575, "step": 1038 }, { "epoch": 1.3239885313794202, "grad_norm": 0.3239355432152859, "learning_rate": 1.838573557253764e-05, "loss": 0.1534, "step": 1039 }, { "epoch": 1.3252628225549539, "grad_norm": 0.35986193070366096, "learning_rate": 1.8380881048918406e-05, "loss": 0.1737, "step": 1040 }, { "epoch": 1.3265371137304873, "grad_norm": 0.3722355646818584, "learning_rate": 1.8376019879729124e-05, "loss": 0.1833, "step": 1041 }, { "epoch": 1.327811404906021, "grad_norm": 0.3146406404539141, "learning_rate": 1.837115206882442e-05, "loss": 0.1494, "step": 1042 }, { "epoch": 1.3290856960815547, "grad_norm": 0.36519327545401525, "learning_rate": 1.83662776200642e-05, "loss": 0.1629, "step": 1043 }, { "epoch": 1.3303599872570882, "grad_norm": 0.3350626705598752, "learning_rate": 1.8361396537313628e-05, "loss": 0.1481, "step": 1044 }, { "epoch": 1.3316342784326218, "grad_norm": 0.35087892878776483, "learning_rate": 1.835650882444313e-05, "loss": 0.1783, "step": 1045 }, { "epoch": 1.3329085696081555, "grad_norm": 0.3407264593970327, "learning_rate": 1.835161448532839e-05, "loss": 0.1744, "step": 1046 }, { "epoch": 1.334182860783689, "grad_norm": 0.3254157883058351, "learning_rate": 1.8346713523850342e-05, "loss": 0.1676, "step": 1047 }, { "epoch": 1.3354571519592227, "grad_norm": 0.3163544946420383, "learning_rate": 1.8341805943895178e-05, "loss": 0.1622, "step": 1048 }, { "epoch": 1.3367314431347563, "grad_norm": 0.3445629869418156, "learning_rate": 1.8336891749354337e-05, "loss": 0.174, "step": 1049 }, { "epoch": 1.3380057343102898, "grad_norm": 0.32632564665530434, "learning_rate": 1.833197094412449e-05, "loss": 0.1601, "step": 1050 }, { "epoch": 1.3392800254858235, "grad_norm": 0.3260340280259183, "learning_rate": 1.8327043532107575e-05, "loss": 0.1645, "step": 1051 }, { "epoch": 1.3405543166613572, "grad_norm": 0.33749736086774823, "learning_rate": 1.832210951721074e-05, "loss": 0.1831, "step": 1052 }, { "epoch": 1.3418286078368906, "grad_norm": 0.3371252567919534, "learning_rate": 1.831716890334639e-05, "loss": 0.1674, "step": 1053 }, { "epoch": 1.3431028990124243, "grad_norm": 0.3275815018708262, "learning_rate": 1.831222169443216e-05, "loss": 0.1533, "step": 1054 }, { "epoch": 1.344377190187958, "grad_norm": 0.30442552966931324, "learning_rate": 1.83072678943909e-05, "loss": 0.1503, "step": 1055 }, { "epoch": 1.3456514813634914, "grad_norm": 0.3308474495303055, "learning_rate": 1.8302307507150703e-05, "loss": 0.1636, "step": 1056 }, { "epoch": 1.3469257725390251, "grad_norm": 0.3054787327886702, "learning_rate": 1.8297340536644877e-05, "loss": 0.1282, "step": 1057 }, { "epoch": 1.3482000637145588, "grad_norm": 0.37011365152783415, "learning_rate": 1.8292366986811952e-05, "loss": 0.1827, "step": 1058 }, { "epoch": 1.3494743548900923, "grad_norm": 0.3401939423308665, "learning_rate": 1.8287386861595675e-05, "loss": 0.1594, "step": 1059 }, { "epoch": 1.350748646065626, "grad_norm": 0.33090371656309037, "learning_rate": 1.8282400164945006e-05, "loss": 0.155, "step": 1060 }, { "epoch": 1.3520229372411596, "grad_norm": 0.3571918545254773, "learning_rate": 1.827740690081412e-05, "loss": 0.1715, "step": 1061 }, { "epoch": 1.353297228416693, "grad_norm": 0.3324467916642027, "learning_rate": 1.8272407073162393e-05, "loss": 0.1632, "step": 1062 }, { "epoch": 1.3545715195922268, "grad_norm": 0.33407150346481795, "learning_rate": 1.8267400685954407e-05, "loss": 0.1698, "step": 1063 }, { "epoch": 1.3558458107677605, "grad_norm": 0.3299657817925513, "learning_rate": 1.826238774315995e-05, "loss": 0.1523, "step": 1064 }, { "epoch": 1.357120101943294, "grad_norm": 0.3358310342524182, "learning_rate": 1.8257368248754005e-05, "loss": 0.1713, "step": 1065 }, { "epoch": 1.3583943931188276, "grad_norm": 0.3321560540959325, "learning_rate": 1.8252342206716754e-05, "loss": 0.1659, "step": 1066 }, { "epoch": 1.3596686842943613, "grad_norm": 0.3686851001318763, "learning_rate": 1.824730962103356e-05, "loss": 0.1722, "step": 1067 }, { "epoch": 1.3609429754698947, "grad_norm": 0.3029583957895652, "learning_rate": 1.8242270495694985e-05, "loss": 0.1388, "step": 1068 }, { "epoch": 1.3622172666454284, "grad_norm": 0.337213517035378, "learning_rate": 1.8237224834696774e-05, "loss": 0.1727, "step": 1069 }, { "epoch": 1.363491557820962, "grad_norm": 0.35127095851105117, "learning_rate": 1.8232172642039856e-05, "loss": 0.1677, "step": 1070 }, { "epoch": 1.3647658489964958, "grad_norm": 0.3668050209522345, "learning_rate": 1.8227113921730336e-05, "loss": 0.1688, "step": 1071 }, { "epoch": 1.3660401401720292, "grad_norm": 0.31235902692924655, "learning_rate": 1.8222048677779495e-05, "loss": 0.1509, "step": 1072 }, { "epoch": 1.367314431347563, "grad_norm": 0.33106226358767793, "learning_rate": 1.8216976914203788e-05, "loss": 0.1749, "step": 1073 }, { "epoch": 1.3685887225230966, "grad_norm": 0.3282560497559168, "learning_rate": 1.821189863502484e-05, "loss": 0.1609, "step": 1074 }, { "epoch": 1.36986301369863, "grad_norm": 0.3377456987610564, "learning_rate": 1.820681384426945e-05, "loss": 0.1614, "step": 1075 }, { "epoch": 1.3711373048741637, "grad_norm": 0.32647979821448725, "learning_rate": 1.820172254596956e-05, "loss": 0.1759, "step": 1076 }, { "epoch": 1.3724115960496974, "grad_norm": 0.36185202788809157, "learning_rate": 1.8196624744162294e-05, "loss": 0.1809, "step": 1077 }, { "epoch": 1.3736858872252309, "grad_norm": 0.34825769862548617, "learning_rate": 1.819152044288992e-05, "loss": 0.1703, "step": 1078 }, { "epoch": 1.3749601784007646, "grad_norm": 0.3232392114591538, "learning_rate": 1.8186409646199864e-05, "loss": 0.1539, "step": 1079 }, { "epoch": 1.3762344695762982, "grad_norm": 0.33888517233353166, "learning_rate": 1.8181292358144703e-05, "loss": 0.1676, "step": 1080 }, { "epoch": 1.377508760751832, "grad_norm": 0.33955599632067307, "learning_rate": 1.8176168582782157e-05, "loss": 0.1601, "step": 1081 }, { "epoch": 1.3787830519273654, "grad_norm": 0.33218278162118287, "learning_rate": 1.81710383241751e-05, "loss": 0.1532, "step": 1082 }, { "epoch": 1.380057343102899, "grad_norm": 0.34884378268278493, "learning_rate": 1.8165901586391536e-05, "loss": 0.1577, "step": 1083 }, { "epoch": 1.3813316342784328, "grad_norm": 0.3233200648956245, "learning_rate": 1.816075837350461e-05, "loss": 0.1523, "step": 1084 }, { "epoch": 1.3826059254539662, "grad_norm": 0.330766249725018, "learning_rate": 1.8155608689592604e-05, "loss": 0.1603, "step": 1085 }, { "epoch": 1.3838802166295, "grad_norm": 0.3561094130478623, "learning_rate": 1.815045253873893e-05, "loss": 0.1657, "step": 1086 }, { "epoch": 1.3851545078050336, "grad_norm": 0.3342457715817938, "learning_rate": 1.8145289925032122e-05, "loss": 0.163, "step": 1087 }, { "epoch": 1.386428798980567, "grad_norm": 0.3665826587186195, "learning_rate": 1.814012085256585e-05, "loss": 0.176, "step": 1088 }, { "epoch": 1.3877030901561007, "grad_norm": 0.33575117445563685, "learning_rate": 1.81349453254389e-05, "loss": 0.158, "step": 1089 }, { "epoch": 1.3889773813316344, "grad_norm": 0.33413170839660394, "learning_rate": 1.812976334775517e-05, "loss": 0.168, "step": 1090 }, { "epoch": 1.3902516725071679, "grad_norm": 0.3189492945297881, "learning_rate": 1.812457492362368e-05, "loss": 0.1569, "step": 1091 }, { "epoch": 1.3915259636827015, "grad_norm": 0.3416792361040837, "learning_rate": 1.811938005715857e-05, "loss": 0.1601, "step": 1092 }, { "epoch": 1.3928002548582352, "grad_norm": 0.3197640842845869, "learning_rate": 1.8114178752479062e-05, "loss": 0.1521, "step": 1093 }, { "epoch": 1.3940745460337687, "grad_norm": 0.33488034661043115, "learning_rate": 1.8108971013709512e-05, "loss": 0.1707, "step": 1094 }, { "epoch": 1.3953488372093024, "grad_norm": 0.3165376119125541, "learning_rate": 1.810375684497936e-05, "loss": 0.1601, "step": 1095 }, { "epoch": 1.396623128384836, "grad_norm": 0.3211485991853899, "learning_rate": 1.8098536250423154e-05, "loss": 0.1452, "step": 1096 }, { "epoch": 1.3978974195603695, "grad_norm": 0.3614090243602088, "learning_rate": 1.8093309234180534e-05, "loss": 0.1734, "step": 1097 }, { "epoch": 1.3991717107359032, "grad_norm": 0.3311878544900267, "learning_rate": 1.8088075800396227e-05, "loss": 0.1663, "step": 1098 }, { "epoch": 1.4004460019114369, "grad_norm": 0.3200006352613332, "learning_rate": 1.8082835953220055e-05, "loss": 0.1639, "step": 1099 }, { "epoch": 1.4017202930869703, "grad_norm": 0.32415521701376987, "learning_rate": 1.8077589696806925e-05, "loss": 0.1751, "step": 1100 }, { "epoch": 1.402994584262504, "grad_norm": 0.298217757092312, "learning_rate": 1.8072337035316826e-05, "loss": 0.145, "step": 1101 }, { "epoch": 1.4042688754380377, "grad_norm": 0.36977252902169344, "learning_rate": 1.8067077972914822e-05, "loss": 0.1862, "step": 1102 }, { "epoch": 1.4055431666135711, "grad_norm": 0.33444097140827617, "learning_rate": 1.8061812513771056e-05, "loss": 0.1623, "step": 1103 }, { "epoch": 1.4068174577891048, "grad_norm": 0.3199129045894729, "learning_rate": 1.8056540662060747e-05, "loss": 0.1439, "step": 1104 }, { "epoch": 1.4080917489646385, "grad_norm": 0.33430226140006086, "learning_rate": 1.8051262421964174e-05, "loss": 0.1464, "step": 1105 }, { "epoch": 1.409366040140172, "grad_norm": 0.3292876042585294, "learning_rate": 1.8045977797666685e-05, "loss": 0.1576, "step": 1106 }, { "epoch": 1.4106403313157057, "grad_norm": 0.3996809191810182, "learning_rate": 1.8040686793358695e-05, "loss": 0.1872, "step": 1107 }, { "epoch": 1.4119146224912393, "grad_norm": 0.3225608305675859, "learning_rate": 1.8035389413235672e-05, "loss": 0.1554, "step": 1108 }, { "epoch": 1.4131889136667728, "grad_norm": 0.3815898337605482, "learning_rate": 1.803008566149815e-05, "loss": 0.174, "step": 1109 }, { "epoch": 1.4144632048423065, "grad_norm": 0.3558159849963359, "learning_rate": 1.8024775542351695e-05, "loss": 0.1662, "step": 1110 }, { "epoch": 1.4157374960178402, "grad_norm": 0.3399255530297972, "learning_rate": 1.8019459060006945e-05, "loss": 0.1735, "step": 1111 }, { "epoch": 1.4170117871933736, "grad_norm": 0.3450151810581896, "learning_rate": 1.8014136218679566e-05, "loss": 0.1625, "step": 1112 }, { "epoch": 1.4182860783689073, "grad_norm": 0.32897650373253035, "learning_rate": 1.8008807022590283e-05, "loss": 0.1653, "step": 1113 }, { "epoch": 1.419560369544441, "grad_norm": 0.3195977318320727, "learning_rate": 1.8003471475964837e-05, "loss": 0.1533, "step": 1114 }, { "epoch": 1.4208346607199744, "grad_norm": 0.31084236011482175, "learning_rate": 1.7998129583034027e-05, "loss": 0.1528, "step": 1115 }, { "epoch": 1.4221089518955081, "grad_norm": 0.3562352527362941, "learning_rate": 1.7992781348033678e-05, "loss": 0.1831, "step": 1116 }, { "epoch": 1.4233832430710418, "grad_norm": 0.3321351730191935, "learning_rate": 1.7987426775204632e-05, "loss": 0.1537, "step": 1117 }, { "epoch": 1.4246575342465753, "grad_norm": 0.36882838406709695, "learning_rate": 1.7982065868792772e-05, "loss": 0.1922, "step": 1118 }, { "epoch": 1.425931825422109, "grad_norm": 0.34452833479018224, "learning_rate": 1.7976698633049e-05, "loss": 0.1675, "step": 1119 }, { "epoch": 1.4272061165976426, "grad_norm": 0.32118430091153544, "learning_rate": 1.7971325072229227e-05, "loss": 0.1661, "step": 1120 }, { "epoch": 1.428480407773176, "grad_norm": 0.303472110270881, "learning_rate": 1.796594519059439e-05, "loss": 0.1372, "step": 1121 }, { "epoch": 1.4297546989487098, "grad_norm": 0.3249030829541066, "learning_rate": 1.7960558992410432e-05, "loss": 0.1633, "step": 1122 }, { "epoch": 1.4310289901242434, "grad_norm": 0.35047311786168367, "learning_rate": 1.795516648194831e-05, "loss": 0.1535, "step": 1123 }, { "epoch": 1.432303281299777, "grad_norm": 0.31608658379645727, "learning_rate": 1.794976766348398e-05, "loss": 0.1524, "step": 1124 }, { "epoch": 1.4335775724753106, "grad_norm": 0.32412247012673157, "learning_rate": 1.7944362541298407e-05, "loss": 0.1659, "step": 1125 }, { "epoch": 1.4348518636508443, "grad_norm": 0.31441350478940705, "learning_rate": 1.7938951119677544e-05, "loss": 0.1546, "step": 1126 }, { "epoch": 1.4361261548263777, "grad_norm": 0.3399769659219759, "learning_rate": 1.7933533402912354e-05, "loss": 0.1642, "step": 1127 }, { "epoch": 1.4374004460019114, "grad_norm": 0.34451868393939206, "learning_rate": 1.7928109395298777e-05, "loss": 0.1561, "step": 1128 }, { "epoch": 1.438674737177445, "grad_norm": 0.35833996231223625, "learning_rate": 1.7922679101137753e-05, "loss": 0.1715, "step": 1129 }, { "epoch": 1.4399490283529786, "grad_norm": 0.356276047234566, "learning_rate": 1.79172425247352e-05, "loss": 0.1857, "step": 1130 }, { "epoch": 1.4412233195285122, "grad_norm": 0.30495799657097483, "learning_rate": 1.7911799670402015e-05, "loss": 0.1465, "step": 1131 }, { "epoch": 1.442497610704046, "grad_norm": 0.30784998946347697, "learning_rate": 1.7906350542454084e-05, "loss": 0.1438, "step": 1132 }, { "epoch": 1.4437719018795794, "grad_norm": 0.33503616423160276, "learning_rate": 1.7900895145212255e-05, "loss": 0.1616, "step": 1133 }, { "epoch": 1.445046193055113, "grad_norm": 0.35033926468740706, "learning_rate": 1.7895433483002356e-05, "loss": 0.1633, "step": 1134 }, { "epoch": 1.4463204842306467, "grad_norm": 0.31793243974274726, "learning_rate": 1.7889965560155178e-05, "loss": 0.1465, "step": 1135 }, { "epoch": 1.4475947754061802, "grad_norm": 0.3298720019421075, "learning_rate": 1.788449138100648e-05, "loss": 0.1703, "step": 1136 }, { "epoch": 1.4488690665817139, "grad_norm": 0.33309386669613433, "learning_rate": 1.7879010949896977e-05, "loss": 0.1611, "step": 1137 }, { "epoch": 1.4501433577572476, "grad_norm": 0.3214624392909001, "learning_rate": 1.787352427117235e-05, "loss": 0.1568, "step": 1138 }, { "epoch": 1.451417648932781, "grad_norm": 0.33180258654370864, "learning_rate": 1.786803134918322e-05, "loss": 0.1522, "step": 1139 }, { "epoch": 1.4526919401083147, "grad_norm": 0.33439972347403324, "learning_rate": 1.7862532188285176e-05, "loss": 0.175, "step": 1140 }, { "epoch": 1.4539662312838484, "grad_norm": 0.35857723769958966, "learning_rate": 1.785702679283874e-05, "loss": 0.1723, "step": 1141 }, { "epoch": 1.4552405224593818, "grad_norm": 0.34350229416396866, "learning_rate": 1.785151516720938e-05, "loss": 0.1791, "step": 1142 }, { "epoch": 1.4565148136349155, "grad_norm": 0.3167963734715147, "learning_rate": 1.7845997315767513e-05, "loss": 0.1494, "step": 1143 }, { "epoch": 1.4577891048104492, "grad_norm": 0.3377115794648749, "learning_rate": 1.7840473242888486e-05, "loss": 0.1659, "step": 1144 }, { "epoch": 1.4590633959859827, "grad_norm": 0.2954391120264447, "learning_rate": 1.783494295295258e-05, "loss": 0.1394, "step": 1145 }, { "epoch": 1.4603376871615164, "grad_norm": 0.344502533020949, "learning_rate": 1.7829406450344998e-05, "loss": 0.1706, "step": 1146 }, { "epoch": 1.46161197833705, "grad_norm": 0.32225274956230215, "learning_rate": 1.7823863739455886e-05, "loss": 0.1466, "step": 1147 }, { "epoch": 1.4628862695125835, "grad_norm": 0.3240814967932419, "learning_rate": 1.78183148246803e-05, "loss": 0.1589, "step": 1148 }, { "epoch": 1.4641605606881172, "grad_norm": 0.31860267691744865, "learning_rate": 1.7812759710418223e-05, "loss": 0.1612, "step": 1149 }, { "epoch": 1.4654348518636509, "grad_norm": 0.3583251795517931, "learning_rate": 1.780719840107454e-05, "loss": 0.1696, "step": 1150 }, { "epoch": 1.4667091430391843, "grad_norm": 0.3359403413730353, "learning_rate": 1.780163090105907e-05, "loss": 0.1686, "step": 1151 }, { "epoch": 1.467983434214718, "grad_norm": 0.33889106745221853, "learning_rate": 1.779605721478652e-05, "loss": 0.1733, "step": 1152 }, { "epoch": 1.4692577253902517, "grad_norm": 0.34159320489674855, "learning_rate": 1.7790477346676523e-05, "loss": 0.1729, "step": 1153 }, { "epoch": 1.4705320165657854, "grad_norm": 0.33795861266425414, "learning_rate": 1.778489130115359e-05, "loss": 0.1712, "step": 1154 }, { "epoch": 1.4718063077413188, "grad_norm": 0.34373038801246825, "learning_rate": 1.777929908264715e-05, "loss": 0.1679, "step": 1155 }, { "epoch": 1.4730805989168525, "grad_norm": 0.3052461038138122, "learning_rate": 1.777370069559152e-05, "loss": 0.1536, "step": 1156 }, { "epoch": 1.4743548900923862, "grad_norm": 0.37320553250861976, "learning_rate": 1.7768096144425903e-05, "loss": 0.2055, "step": 1157 }, { "epoch": 1.4756291812679196, "grad_norm": 0.35229095142012073, "learning_rate": 1.7762485433594398e-05, "loss": 0.1924, "step": 1158 }, { "epoch": 1.4769034724434533, "grad_norm": 0.3161384176393004, "learning_rate": 1.775686856754598e-05, "loss": 0.1531, "step": 1159 }, { "epoch": 1.478177763618987, "grad_norm": 0.35844304567940954, "learning_rate": 1.775124555073452e-05, "loss": 0.1836, "step": 1160 }, { "epoch": 1.4794520547945205, "grad_norm": 0.29829481164702076, "learning_rate": 1.774561638761875e-05, "loss": 0.1365, "step": 1161 }, { "epoch": 1.4807263459700541, "grad_norm": 0.3275713846427648, "learning_rate": 1.7739981082662275e-05, "loss": 0.1532, "step": 1162 }, { "epoch": 1.4820006371455878, "grad_norm": 0.32284769439926714, "learning_rate": 1.7734339640333588e-05, "loss": 0.1542, "step": 1163 }, { "epoch": 1.4832749283211215, "grad_norm": 0.3293826825988233, "learning_rate": 1.7728692065106032e-05, "loss": 0.1595, "step": 1164 }, { "epoch": 1.484549219496655, "grad_norm": 0.3266050782159053, "learning_rate": 1.772303836145782e-05, "loss": 0.1591, "step": 1165 }, { "epoch": 1.4858235106721887, "grad_norm": 0.3370102854083008, "learning_rate": 1.771737853387202e-05, "loss": 0.1626, "step": 1166 }, { "epoch": 1.4870978018477223, "grad_norm": 0.32150995357629863, "learning_rate": 1.771171258683656e-05, "loss": 0.1576, "step": 1167 }, { "epoch": 1.4883720930232558, "grad_norm": 0.34467919008078335, "learning_rate": 1.7706040524844222e-05, "loss": 0.1892, "step": 1168 }, { "epoch": 1.4896463841987895, "grad_norm": 0.32229348403230895, "learning_rate": 1.7700362352392632e-05, "loss": 0.157, "step": 1169 }, { "epoch": 1.4909206753743232, "grad_norm": 0.34665723214242133, "learning_rate": 1.769467807398426e-05, "loss": 0.1769, "step": 1170 }, { "epoch": 1.4921949665498566, "grad_norm": 0.31919156597362985, "learning_rate": 1.7688987694126425e-05, "loss": 0.1574, "step": 1171 }, { "epoch": 1.4934692577253903, "grad_norm": 0.33222607358370415, "learning_rate": 1.768329121733128e-05, "loss": 0.1638, "step": 1172 }, { "epoch": 1.494743548900924, "grad_norm": 0.3117186642722703, "learning_rate": 1.767758864811581e-05, "loss": 0.1574, "step": 1173 }, { "epoch": 1.4960178400764574, "grad_norm": 0.349765642250591, "learning_rate": 1.7671879991001838e-05, "loss": 0.1927, "step": 1174 }, { "epoch": 1.4972921312519911, "grad_norm": 0.3050480485637174, "learning_rate": 1.7666165250516006e-05, "loss": 0.1497, "step": 1175 }, { "epoch": 1.4985664224275248, "grad_norm": 0.3176952324856837, "learning_rate": 1.766044443118978e-05, "loss": 0.1554, "step": 1176 }, { "epoch": 1.4998407136030583, "grad_norm": 0.3378675898404009, "learning_rate": 1.765471753755946e-05, "loss": 0.1676, "step": 1177 }, { "epoch": 1.501115004778592, "grad_norm": 0.2998470418416109, "learning_rate": 1.7648984574166145e-05, "loss": 0.1363, "step": 1178 }, { "epoch": 1.5023892959541256, "grad_norm": 0.3295959631767332, "learning_rate": 1.7643245545555755e-05, "loss": 0.1639, "step": 1179 }, { "epoch": 1.503663587129659, "grad_norm": 0.31733029209342634, "learning_rate": 1.7637500456279025e-05, "loss": 0.1543, "step": 1180 }, { "epoch": 1.5049378783051928, "grad_norm": 0.32529217933678267, "learning_rate": 1.7631749310891483e-05, "loss": 0.1627, "step": 1181 }, { "epoch": 1.5062121694807264, "grad_norm": 0.3318792385769225, "learning_rate": 1.7625992113953465e-05, "loss": 0.1561, "step": 1182 }, { "epoch": 1.50748646065626, "grad_norm": 0.3320916489893646, "learning_rate": 1.762022887003011e-05, "loss": 0.1714, "step": 1183 }, { "epoch": 1.5087607518317936, "grad_norm": 0.33326726684272395, "learning_rate": 1.7614459583691346e-05, "loss": 0.1647, "step": 1184 }, { "epoch": 1.5100350430073273, "grad_norm": 0.3193790257663728, "learning_rate": 1.7608684259511897e-05, "loss": 0.1533, "step": 1185 }, { "epoch": 1.5113093341828607, "grad_norm": 0.3361782542179231, "learning_rate": 1.7602902902071267e-05, "loss": 0.1696, "step": 1186 }, { "epoch": 1.5125836253583944, "grad_norm": 0.32041801303100215, "learning_rate": 1.7597115515953754e-05, "loss": 0.1571, "step": 1187 }, { "epoch": 1.513857916533928, "grad_norm": 0.3121829461831611, "learning_rate": 1.7591322105748434e-05, "loss": 0.1505, "step": 1188 }, { "epoch": 1.5151322077094616, "grad_norm": 0.3248280803178264, "learning_rate": 1.7585522676049152e-05, "loss": 0.1583, "step": 1189 }, { "epoch": 1.5164064988849952, "grad_norm": 0.3489627616441865, "learning_rate": 1.757971723145453e-05, "loss": 0.1703, "step": 1190 }, { "epoch": 1.517680790060529, "grad_norm": 0.32454287321096303, "learning_rate": 1.7573905776567966e-05, "loss": 0.1654, "step": 1191 }, { "epoch": 1.5189550812360624, "grad_norm": 0.3569241257180926, "learning_rate": 1.756808831599762e-05, "loss": 0.1727, "step": 1192 }, { "epoch": 1.520229372411596, "grad_norm": 0.33986120989136454, "learning_rate": 1.7562264854356405e-05, "loss": 0.1657, "step": 1193 }, { "epoch": 1.5215036635871297, "grad_norm": 0.32490778697342787, "learning_rate": 1.755643539626201e-05, "loss": 0.1673, "step": 1194 }, { "epoch": 1.5227779547626632, "grad_norm": 0.3339641565324367, "learning_rate": 1.755059994633686e-05, "loss": 0.1655, "step": 1195 }, { "epoch": 1.5240522459381969, "grad_norm": 0.34319428247730954, "learning_rate": 1.7544758509208148e-05, "loss": 0.1677, "step": 1196 }, { "epoch": 1.5253265371137306, "grad_norm": 0.34528018949046374, "learning_rate": 1.75389110895078e-05, "loss": 0.1662, "step": 1197 }, { "epoch": 1.526600828289264, "grad_norm": 0.32949235860154574, "learning_rate": 1.7533057691872502e-05, "loss": 0.1671, "step": 1198 }, { "epoch": 1.5278751194647977, "grad_norm": 0.32270438788703326, "learning_rate": 1.7527198320943662e-05, "loss": 0.1631, "step": 1199 }, { "epoch": 1.5291494106403314, "grad_norm": 0.32979339863538193, "learning_rate": 1.752133298136744e-05, "loss": 0.1588, "step": 1200 }, { "epoch": 1.5304237018158648, "grad_norm": 0.3700839529687084, "learning_rate": 1.751546167779472e-05, "loss": 0.1877, "step": 1201 }, { "epoch": 1.5316979929913985, "grad_norm": 0.34184144503035546, "learning_rate": 1.7509584414881114e-05, "loss": 0.1592, "step": 1202 }, { "epoch": 1.5329722841669322, "grad_norm": 0.323259940510379, "learning_rate": 1.750370119728697e-05, "loss": 0.1584, "step": 1203 }, { "epoch": 1.5342465753424657, "grad_norm": 0.34007378693717416, "learning_rate": 1.7497812029677344e-05, "loss": 0.17, "step": 1204 }, { "epoch": 1.5355208665179993, "grad_norm": 0.32674634864668917, "learning_rate": 1.7491916916722022e-05, "loss": 0.1629, "step": 1205 }, { "epoch": 1.536795157693533, "grad_norm": 0.30847831978842905, "learning_rate": 1.7486015863095493e-05, "loss": 0.1362, "step": 1206 }, { "epoch": 1.5380694488690665, "grad_norm": 0.34570927116170797, "learning_rate": 1.7480108873476968e-05, "loss": 0.1612, "step": 1207 }, { "epoch": 1.5393437400446002, "grad_norm": 0.32747856983595064, "learning_rate": 1.7474195952550355e-05, "loss": 0.167, "step": 1208 }, { "epoch": 1.5406180312201339, "grad_norm": 0.362975290812171, "learning_rate": 1.7468277105004273e-05, "loss": 0.1717, "step": 1209 }, { "epoch": 1.5418923223956673, "grad_norm": 0.3206260437802884, "learning_rate": 1.7462352335532037e-05, "loss": 0.1574, "step": 1210 }, { "epoch": 1.543166613571201, "grad_norm": 0.32487070632189075, "learning_rate": 1.7456421648831658e-05, "loss": 0.1563, "step": 1211 }, { "epoch": 1.5444409047467347, "grad_norm": 0.3302336500516661, "learning_rate": 1.7450485049605838e-05, "loss": 0.1674, "step": 1212 }, { "epoch": 1.5457151959222681, "grad_norm": 0.34572231884449395, "learning_rate": 1.7444542542561967e-05, "loss": 0.1883, "step": 1213 }, { "epoch": 1.5469894870978018, "grad_norm": 0.3208067793907175, "learning_rate": 1.743859413241212e-05, "loss": 0.1614, "step": 1214 }, { "epoch": 1.5482637782733355, "grad_norm": 0.3147439498364997, "learning_rate": 1.7432639823873057e-05, "loss": 0.1527, "step": 1215 }, { "epoch": 1.549538069448869, "grad_norm": 0.32274519003823915, "learning_rate": 1.742667962166621e-05, "loss": 0.1615, "step": 1216 }, { "epoch": 1.5508123606244026, "grad_norm": 0.32349301958605886, "learning_rate": 1.742071353051769e-05, "loss": 0.1698, "step": 1217 }, { "epoch": 1.5520866517999363, "grad_norm": 0.33955220490695304, "learning_rate": 1.741474155515827e-05, "loss": 0.17, "step": 1218 }, { "epoch": 1.5533609429754698, "grad_norm": 0.31832862132429823, "learning_rate": 1.740876370032339e-05, "loss": 0.1629, "step": 1219 }, { "epoch": 1.5546352341510035, "grad_norm": 0.32695591320179296, "learning_rate": 1.7402779970753156e-05, "loss": 0.1695, "step": 1220 }, { "epoch": 1.5559095253265371, "grad_norm": 0.3268335873771595, "learning_rate": 1.7396790371192333e-05, "loss": 0.1691, "step": 1221 }, { "epoch": 1.5571838165020706, "grad_norm": 0.32063262706884205, "learning_rate": 1.7390794906390343e-05, "loss": 0.1572, "step": 1222 }, { "epoch": 1.5584581076776043, "grad_norm": 0.31715179965569995, "learning_rate": 1.7384793581101242e-05, "loss": 0.156, "step": 1223 }, { "epoch": 1.559732398853138, "grad_norm": 0.2986657071449468, "learning_rate": 1.7378786400083756e-05, "loss": 0.1322, "step": 1224 }, { "epoch": 1.5610066900286714, "grad_norm": 0.3736606086542583, "learning_rate": 1.737277336810124e-05, "loss": 0.2011, "step": 1225 }, { "epoch": 1.562280981204205, "grad_norm": 0.3609414314169792, "learning_rate": 1.7366754489921694e-05, "loss": 0.193, "step": 1226 }, { "epoch": 1.5635552723797388, "grad_norm": 0.32348916821884727, "learning_rate": 1.7360729770317746e-05, "loss": 0.1508, "step": 1227 }, { "epoch": 1.5648295635552723, "grad_norm": 0.3018154507480951, "learning_rate": 1.735469921406667e-05, "loss": 0.1566, "step": 1228 }, { "epoch": 1.5661038547308062, "grad_norm": 0.36398861458375964, "learning_rate": 1.7348662825950356e-05, "loss": 0.1658, "step": 1229 }, { "epoch": 1.5673781459063396, "grad_norm": 0.32953602984440833, "learning_rate": 1.734262061075532e-05, "loss": 0.1676, "step": 1230 }, { "epoch": 1.568652437081873, "grad_norm": 0.3364507610649358, "learning_rate": 1.7336572573272708e-05, "loss": 0.1631, "step": 1231 }, { "epoch": 1.569926728257407, "grad_norm": 0.3338336296063149, "learning_rate": 1.7330518718298263e-05, "loss": 0.1718, "step": 1232 }, { "epoch": 1.5712010194329404, "grad_norm": 0.3452791823402832, "learning_rate": 1.7324459050632368e-05, "loss": 0.1518, "step": 1233 }, { "epoch": 1.572475310608474, "grad_norm": 0.32771349472823863, "learning_rate": 1.731839357507999e-05, "loss": 0.1428, "step": 1234 }, { "epoch": 1.5737496017840078, "grad_norm": 0.32302700563708314, "learning_rate": 1.7312322296450714e-05, "loss": 0.1646, "step": 1235 }, { "epoch": 1.5750238929595413, "grad_norm": 0.30889369365533714, "learning_rate": 1.730624521955873e-05, "loss": 0.1477, "step": 1236 }, { "epoch": 1.5762981841350747, "grad_norm": 0.3322979547202734, "learning_rate": 1.7300162349222814e-05, "loss": 0.1709, "step": 1237 }, { "epoch": 1.5775724753106086, "grad_norm": 0.33841846983596985, "learning_rate": 1.7294073690266343e-05, "loss": 0.1644, "step": 1238 }, { "epoch": 1.578846766486142, "grad_norm": 0.33868026750755004, "learning_rate": 1.7287979247517285e-05, "loss": 0.1672, "step": 1239 }, { "epoch": 1.5801210576616755, "grad_norm": 0.32190256555458235, "learning_rate": 1.7281879025808193e-05, "loss": 0.1623, "step": 1240 }, { "epoch": 1.5813953488372094, "grad_norm": 0.3028498842049276, "learning_rate": 1.7275773029976202e-05, "loss": 0.1631, "step": 1241 }, { "epoch": 1.582669640012743, "grad_norm": 0.3210781110136137, "learning_rate": 1.726966126486302e-05, "loss": 0.1584, "step": 1242 }, { "epoch": 1.5839439311882764, "grad_norm": 0.3264430815419093, "learning_rate": 1.7263543735314942e-05, "loss": 0.152, "step": 1243 }, { "epoch": 1.5852182223638103, "grad_norm": 0.3212946122487262, "learning_rate": 1.725742044618282e-05, "loss": 0.1606, "step": 1244 }, { "epoch": 1.5864925135393437, "grad_norm": 0.3362293154302193, "learning_rate": 1.7251291402322087e-05, "loss": 0.1824, "step": 1245 }, { "epoch": 1.5877668047148772, "grad_norm": 0.3357828404540511, "learning_rate": 1.7245156608592727e-05, "loss": 0.1653, "step": 1246 }, { "epoch": 1.589041095890411, "grad_norm": 0.3259358825937956, "learning_rate": 1.7239016069859292e-05, "loss": 0.1572, "step": 1247 }, { "epoch": 1.5903153870659446, "grad_norm": 0.3274377847372634, "learning_rate": 1.723286979099088e-05, "loss": 0.1672, "step": 1248 }, { "epoch": 1.591589678241478, "grad_norm": 0.3378215551155529, "learning_rate": 1.7226717776861152e-05, "loss": 0.1686, "step": 1249 }, { "epoch": 1.592863969417012, "grad_norm": 0.298827929354981, "learning_rate": 1.7220560032348313e-05, "loss": 0.1458, "step": 1250 }, { "epoch": 1.5941382605925454, "grad_norm": 0.3294626324162144, "learning_rate": 1.7214396562335102e-05, "loss": 0.1633, "step": 1251 }, { "epoch": 1.595412551768079, "grad_norm": 0.335174035129033, "learning_rate": 1.7208227371708814e-05, "loss": 0.1697, "step": 1252 }, { "epoch": 1.5966868429436127, "grad_norm": 0.31976349718056185, "learning_rate": 1.7202052465361268e-05, "loss": 0.1557, "step": 1253 }, { "epoch": 1.5979611341191462, "grad_norm": 0.3187126066589782, "learning_rate": 1.719587184818882e-05, "loss": 0.1616, "step": 1254 }, { "epoch": 1.5992354252946799, "grad_norm": 0.3605947418393162, "learning_rate": 1.718968552509235e-05, "loss": 0.1928, "step": 1255 }, { "epoch": 1.6005097164702136, "grad_norm": 0.3286997374039332, "learning_rate": 1.7183493500977277e-05, "loss": 0.1509, "step": 1256 }, { "epoch": 1.601784007645747, "grad_norm": 0.30690211580089144, "learning_rate": 1.717729578075352e-05, "loss": 0.1491, "step": 1257 }, { "epoch": 1.6030582988212807, "grad_norm": 0.3438306368576811, "learning_rate": 1.717109236933553e-05, "loss": 0.1778, "step": 1258 }, { "epoch": 1.6043325899968144, "grad_norm": 0.33919617056045837, "learning_rate": 1.7164883271642262e-05, "loss": 0.168, "step": 1259 }, { "epoch": 1.6056068811723478, "grad_norm": 0.34230149378177555, "learning_rate": 1.7158668492597186e-05, "loss": 0.183, "step": 1260 }, { "epoch": 1.6068811723478815, "grad_norm": 0.3099892965382403, "learning_rate": 1.7152448037128273e-05, "loss": 0.1634, "step": 1261 }, { "epoch": 1.6081554635234152, "grad_norm": 0.32874666518905765, "learning_rate": 1.7146221910167994e-05, "loss": 0.1774, "step": 1262 }, { "epoch": 1.6094297546989487, "grad_norm": 0.35958071260791713, "learning_rate": 1.7139990116653324e-05, "loss": 0.1892, "step": 1263 }, { "epoch": 1.6107040458744823, "grad_norm": 0.31206540578601955, "learning_rate": 1.7133752661525722e-05, "loss": 0.1487, "step": 1264 }, { "epoch": 1.611978337050016, "grad_norm": 0.3520195059741248, "learning_rate": 1.712750954973115e-05, "loss": 0.1829, "step": 1265 }, { "epoch": 1.6132526282255495, "grad_norm": 0.3186659072708785, "learning_rate": 1.7121260786220033e-05, "loss": 0.1704, "step": 1266 }, { "epoch": 1.6145269194010832, "grad_norm": 0.32145287202564315, "learning_rate": 1.7115006375947304e-05, "loss": 0.1491, "step": 1267 }, { "epoch": 1.6158012105766169, "grad_norm": 0.35770087773138587, "learning_rate": 1.710874632387235e-05, "loss": 0.1701, "step": 1268 }, { "epoch": 1.6170755017521503, "grad_norm": 0.3048146940930051, "learning_rate": 1.7102480634959055e-05, "loss": 0.1443, "step": 1269 }, { "epoch": 1.618349792927684, "grad_norm": 0.3344319463985712, "learning_rate": 1.7096209314175744e-05, "loss": 0.1662, "step": 1270 }, { "epoch": 1.6196240841032177, "grad_norm": 0.33286743993137563, "learning_rate": 1.7089932366495237e-05, "loss": 0.167, "step": 1271 }, { "epoch": 1.6208983752787511, "grad_norm": 0.3344382520731154, "learning_rate": 1.7083649796894798e-05, "loss": 0.158, "step": 1272 }, { "epoch": 1.6221726664542848, "grad_norm": 0.35956024974600687, "learning_rate": 1.707736161035615e-05, "loss": 0.1905, "step": 1273 }, { "epoch": 1.6234469576298185, "grad_norm": 0.31230989661154357, "learning_rate": 1.7071067811865477e-05, "loss": 0.1492, "step": 1274 }, { "epoch": 1.624721248805352, "grad_norm": 0.3403379464022347, "learning_rate": 1.706476840641341e-05, "loss": 0.1699, "step": 1275 }, { "epoch": 1.6259955399808856, "grad_norm": 0.32522199423846065, "learning_rate": 1.7058463398995024e-05, "loss": 0.1565, "step": 1276 }, { "epoch": 1.6272698311564193, "grad_norm": 0.3560804208883112, "learning_rate": 1.7052152794609835e-05, "loss": 0.1845, "step": 1277 }, { "epoch": 1.6285441223319528, "grad_norm": 0.3613202150509468, "learning_rate": 1.70458365982618e-05, "loss": 0.1679, "step": 1278 }, { "epoch": 1.6298184135074865, "grad_norm": 0.33522856545989244, "learning_rate": 1.7039514814959316e-05, "loss": 0.168, "step": 1279 }, { "epoch": 1.6310927046830201, "grad_norm": 0.31763614814066526, "learning_rate": 1.7033187449715195e-05, "loss": 0.1489, "step": 1280 }, { "epoch": 1.6323669958585536, "grad_norm": 0.34056606446749343, "learning_rate": 1.7026854507546694e-05, "loss": 0.1754, "step": 1281 }, { "epoch": 1.6336412870340873, "grad_norm": 0.3036682751836827, "learning_rate": 1.702051599347547e-05, "loss": 0.1412, "step": 1282 }, { "epoch": 1.634915578209621, "grad_norm": 0.30461458512413, "learning_rate": 1.7014171912527616e-05, "loss": 0.1481, "step": 1283 }, { "epoch": 1.6361898693851544, "grad_norm": 0.35476531158721897, "learning_rate": 1.7007822269733637e-05, "loss": 0.1755, "step": 1284 }, { "epoch": 1.637464160560688, "grad_norm": 0.31772588771445637, "learning_rate": 1.7001467070128436e-05, "loss": 0.1571, "step": 1285 }, { "epoch": 1.6387384517362218, "grad_norm": 0.37169268055996274, "learning_rate": 1.699510631875134e-05, "loss": 0.1998, "step": 1286 }, { "epoch": 1.6400127429117553, "grad_norm": 0.31777581373499186, "learning_rate": 1.6988740020646067e-05, "loss": 0.1543, "step": 1287 }, { "epoch": 1.641287034087289, "grad_norm": 0.32074832254143687, "learning_rate": 1.698236818086073e-05, "loss": 0.1464, "step": 1288 }, { "epoch": 1.6425613252628226, "grad_norm": 0.30670238388353743, "learning_rate": 1.6975990804447845e-05, "loss": 0.1447, "step": 1289 }, { "epoch": 1.643835616438356, "grad_norm": 0.336233246987471, "learning_rate": 1.6969607896464316e-05, "loss": 0.1634, "step": 1290 }, { "epoch": 1.6451099076138898, "grad_norm": 0.3035035564941, "learning_rate": 1.6963219461971433e-05, "loss": 0.1417, "step": 1291 }, { "epoch": 1.6463841987894234, "grad_norm": 0.3430554553990617, "learning_rate": 1.6956825506034866e-05, "loss": 0.1759, "step": 1292 }, { "epoch": 1.647658489964957, "grad_norm": 0.34009476943142036, "learning_rate": 1.695042603372466e-05, "loss": 0.1629, "step": 1293 }, { "epoch": 1.6489327811404906, "grad_norm": 0.35968837878435583, "learning_rate": 1.6944021050115246e-05, "loss": 0.1795, "step": 1294 }, { "epoch": 1.6502070723160243, "grad_norm": 0.30918063685756936, "learning_rate": 1.693761056028542e-05, "loss": 0.1533, "step": 1295 }, { "epoch": 1.6514813634915577, "grad_norm": 0.3296380231417119, "learning_rate": 1.6931194569318327e-05, "loss": 0.16, "step": 1296 }, { "epoch": 1.6527556546670914, "grad_norm": 0.36465356656914577, "learning_rate": 1.6924773082301506e-05, "loss": 0.1756, "step": 1297 }, { "epoch": 1.654029945842625, "grad_norm": 0.3205195986726275, "learning_rate": 1.691834610432683e-05, "loss": 0.1377, "step": 1298 }, { "epoch": 1.6553042370181585, "grad_norm": 0.3315641539659696, "learning_rate": 1.6911913640490528e-05, "loss": 0.1628, "step": 1299 }, { "epoch": 1.6565785281936922, "grad_norm": 0.32214711907690075, "learning_rate": 1.6905475695893193e-05, "loss": 0.1469, "step": 1300 }, { "epoch": 1.657852819369226, "grad_norm": 0.33912797743449263, "learning_rate": 1.689903227563975e-05, "loss": 0.1562, "step": 1301 }, { "epoch": 1.6591271105447594, "grad_norm": 0.3479240737064437, "learning_rate": 1.689258338483947e-05, "loss": 0.1622, "step": 1302 }, { "epoch": 1.660401401720293, "grad_norm": 0.35500310514124817, "learning_rate": 1.688612902860597e-05, "loss": 0.1878, "step": 1303 }, { "epoch": 1.6616756928958267, "grad_norm": 0.3140107038891957, "learning_rate": 1.6879669212057187e-05, "loss": 0.1605, "step": 1304 }, { "epoch": 1.6629499840713602, "grad_norm": 0.3138394942109429, "learning_rate": 1.6873203940315396e-05, "loss": 0.1572, "step": 1305 }, { "epoch": 1.6642242752468939, "grad_norm": 0.30129957179754085, "learning_rate": 1.6866733218507198e-05, "loss": 0.1544, "step": 1306 }, { "epoch": 1.6654985664224276, "grad_norm": 0.33145917089671945, "learning_rate": 1.6860257051763512e-05, "loss": 0.1917, "step": 1307 }, { "epoch": 1.666772857597961, "grad_norm": 0.3125724832831799, "learning_rate": 1.6853775445219575e-05, "loss": 0.1553, "step": 1308 }, { "epoch": 1.6680471487734947, "grad_norm": 0.3157699144503776, "learning_rate": 1.6847288404014937e-05, "loss": 0.1468, "step": 1309 }, { "epoch": 1.6693214399490284, "grad_norm": 0.31837743854526923, "learning_rate": 1.6840795933293464e-05, "loss": 0.1484, "step": 1310 }, { "epoch": 1.6705957311245618, "grad_norm": 0.3346164010043758, "learning_rate": 1.6834298038203317e-05, "loss": 0.1612, "step": 1311 }, { "epoch": 1.6718700223000957, "grad_norm": 0.32358109128806706, "learning_rate": 1.6827794723896968e-05, "loss": 0.1466, "step": 1312 }, { "epoch": 1.6731443134756292, "grad_norm": 0.3266737812904919, "learning_rate": 1.682128599553118e-05, "loss": 0.1664, "step": 1313 }, { "epoch": 1.6744186046511627, "grad_norm": 0.3107801016809454, "learning_rate": 1.681477185826701e-05, "loss": 0.1545, "step": 1314 }, { "epoch": 1.6756928958266966, "grad_norm": 0.3295192547634502, "learning_rate": 1.6808252317269806e-05, "loss": 0.1497, "step": 1315 }, { "epoch": 1.67696718700223, "grad_norm": 0.3186639557309208, "learning_rate": 1.6801727377709195e-05, "loss": 0.1619, "step": 1316 }, { "epoch": 1.6782414781777635, "grad_norm": 0.35788042230022576, "learning_rate": 1.6795197044759094e-05, "loss": 0.1764, "step": 1317 }, { "epoch": 1.6795157693532974, "grad_norm": 0.3348992604814869, "learning_rate": 1.6788661323597693e-05, "loss": 0.1767, "step": 1318 }, { "epoch": 1.6807900605288308, "grad_norm": 0.3325090822790715, "learning_rate": 1.678212021940745e-05, "loss": 0.1711, "step": 1319 }, { "epoch": 1.6820643517043643, "grad_norm": 0.31164473100694984, "learning_rate": 1.6775573737375098e-05, "loss": 0.1504, "step": 1320 }, { "epoch": 1.6833386428798982, "grad_norm": 0.32267228074688764, "learning_rate": 1.6769021882691624e-05, "loss": 0.1493, "step": 1321 }, { "epoch": 1.6846129340554317, "grad_norm": 0.3188735298211966, "learning_rate": 1.676246466055229e-05, "loss": 0.1575, "step": 1322 }, { "epoch": 1.6858872252309651, "grad_norm": 0.32364008446943787, "learning_rate": 1.6755902076156606e-05, "loss": 0.1631, "step": 1323 }, { "epoch": 1.687161516406499, "grad_norm": 0.3295483374483157, "learning_rate": 1.6749334134708327e-05, "loss": 0.1595, "step": 1324 }, { "epoch": 1.6884358075820325, "grad_norm": 0.30092820078265914, "learning_rate": 1.6742760841415474e-05, "loss": 0.1503, "step": 1325 }, { "epoch": 1.689710098757566, "grad_norm": 0.34024112686642516, "learning_rate": 1.6736182201490295e-05, "loss": 0.1729, "step": 1326 }, { "epoch": 1.6909843899330999, "grad_norm": 0.3181573018318563, "learning_rate": 1.6729598220149285e-05, "loss": 0.1613, "step": 1327 }, { "epoch": 1.6922586811086333, "grad_norm": 0.3122727923400097, "learning_rate": 1.672300890261317e-05, "loss": 0.161, "step": 1328 }, { "epoch": 1.6935329722841668, "grad_norm": 0.3348310257278924, "learning_rate": 1.6716414254106912e-05, "loss": 0.1698, "step": 1329 }, { "epoch": 1.6948072634597007, "grad_norm": 0.3382506651002816, "learning_rate": 1.67098142798597e-05, "loss": 0.1547, "step": 1330 }, { "epoch": 1.6960815546352341, "grad_norm": 0.3655104474557299, "learning_rate": 1.6703208985104947e-05, "loss": 0.1832, "step": 1331 }, { "epoch": 1.6973558458107676, "grad_norm": 0.3582695196977606, "learning_rate": 1.669659837508028e-05, "loss": 0.188, "step": 1332 }, { "epoch": 1.6986301369863015, "grad_norm": 0.32608387976180714, "learning_rate": 1.668998245502754e-05, "loss": 0.1604, "step": 1333 }, { "epoch": 1.699904428161835, "grad_norm": 0.32520500510883427, "learning_rate": 1.6683361230192784e-05, "loss": 0.1644, "step": 1334 }, { "epoch": 1.7011787193373686, "grad_norm": 0.35862020937755323, "learning_rate": 1.6676734705826275e-05, "loss": 0.1803, "step": 1335 }, { "epoch": 1.7024530105129023, "grad_norm": 0.28387564770624474, "learning_rate": 1.6670102887182472e-05, "loss": 0.141, "step": 1336 }, { "epoch": 1.7037273016884358, "grad_norm": 0.308515889241501, "learning_rate": 1.6663465779520042e-05, "loss": 0.1538, "step": 1337 }, { "epoch": 1.7050015928639695, "grad_norm": 0.32679525292882416, "learning_rate": 1.6656823388101835e-05, "loss": 0.1708, "step": 1338 }, { "epoch": 1.7062758840395031, "grad_norm": 0.32067972990713817, "learning_rate": 1.66501757181949e-05, "loss": 0.1632, "step": 1339 }, { "epoch": 1.7075501752150366, "grad_norm": 0.35278420649067116, "learning_rate": 1.6643522775070462e-05, "loss": 0.1825, "step": 1340 }, { "epoch": 1.7088244663905703, "grad_norm": 0.32856926803581216, "learning_rate": 1.6636864564003937e-05, "loss": 0.1584, "step": 1341 }, { "epoch": 1.710098757566104, "grad_norm": 0.3266500222169605, "learning_rate": 1.6630201090274916e-05, "loss": 0.1591, "step": 1342 }, { "epoch": 1.7113730487416374, "grad_norm": 0.2989516378342305, "learning_rate": 1.662353235916716e-05, "loss": 0.1508, "step": 1343 }, { "epoch": 1.712647339917171, "grad_norm": 0.3230721476138676, "learning_rate": 1.6616858375968596e-05, "loss": 0.1606, "step": 1344 }, { "epoch": 1.7139216310927048, "grad_norm": 0.3458222472262258, "learning_rate": 1.6610179145971324e-05, "loss": 0.2049, "step": 1345 }, { "epoch": 1.7151959222682382, "grad_norm": 0.34026275852318744, "learning_rate": 1.6603494674471595e-05, "loss": 0.1646, "step": 1346 }, { "epoch": 1.716470213443772, "grad_norm": 0.31977876980242187, "learning_rate": 1.6596804966769827e-05, "loss": 0.1696, "step": 1347 }, { "epoch": 1.7177445046193056, "grad_norm": 0.31927134459964657, "learning_rate": 1.6590110028170577e-05, "loss": 0.1737, "step": 1348 }, { "epoch": 1.719018795794839, "grad_norm": 0.3247960086942944, "learning_rate": 1.6583409863982567e-05, "loss": 0.1673, "step": 1349 }, { "epoch": 1.7202930869703728, "grad_norm": 0.318224660608633, "learning_rate": 1.6576704479518647e-05, "loss": 0.1539, "step": 1350 }, { "epoch": 1.7215673781459064, "grad_norm": 0.34437560734628564, "learning_rate": 1.6569993880095807e-05, "loss": 0.1947, "step": 1351 }, { "epoch": 1.72284166932144, "grad_norm": 0.3251965420579689, "learning_rate": 1.6563278071035182e-05, "loss": 0.1696, "step": 1352 }, { "epoch": 1.7241159604969736, "grad_norm": 0.34727169878456793, "learning_rate": 1.6556557057662038e-05, "loss": 0.1819, "step": 1353 }, { "epoch": 1.7253902516725073, "grad_norm": 0.28634288230320265, "learning_rate": 1.6549830845305753e-05, "loss": 0.1417, "step": 1354 }, { "epoch": 1.7266645428480407, "grad_norm": 0.3233942587969784, "learning_rate": 1.6543099439299847e-05, "loss": 0.1797, "step": 1355 }, { "epoch": 1.7279388340235744, "grad_norm": 0.3204252816324564, "learning_rate": 1.6536362844981937e-05, "loss": 0.1679, "step": 1356 }, { "epoch": 1.729213125199108, "grad_norm": 0.32251207027670215, "learning_rate": 1.6529621067693775e-05, "loss": 0.1642, "step": 1357 }, { "epoch": 1.7304874163746415, "grad_norm": 0.34358652750994595, "learning_rate": 1.6522874112781213e-05, "loss": 0.1755, "step": 1358 }, { "epoch": 1.7317617075501752, "grad_norm": 0.32055966878742465, "learning_rate": 1.6516121985594205e-05, "loss": 0.1531, "step": 1359 }, { "epoch": 1.733035998725709, "grad_norm": 0.324900712084612, "learning_rate": 1.650936469148681e-05, "loss": 0.1738, "step": 1360 }, { "epoch": 1.7343102899012424, "grad_norm": 0.3159255888277183, "learning_rate": 1.650260223581719e-05, "loss": 0.1472, "step": 1361 }, { "epoch": 1.735584581076776, "grad_norm": 0.3228661759920223, "learning_rate": 1.649583462394759e-05, "loss": 0.1657, "step": 1362 }, { "epoch": 1.7368588722523097, "grad_norm": 0.33730034414061605, "learning_rate": 1.648906186124435e-05, "loss": 0.171, "step": 1363 }, { "epoch": 1.7381331634278432, "grad_norm": 0.327372684808721, "learning_rate": 1.6482283953077887e-05, "loss": 0.1649, "step": 1364 }, { "epoch": 1.7394074546033769, "grad_norm": 0.3254820617089296, "learning_rate": 1.6475500904822707e-05, "loss": 0.1731, "step": 1365 }, { "epoch": 1.7406817457789105, "grad_norm": 0.345083819324634, "learning_rate": 1.6468712721857388e-05, "loss": 0.1732, "step": 1366 }, { "epoch": 1.741956036954444, "grad_norm": 0.34290504047180864, "learning_rate": 1.6461919409564578e-05, "loss": 0.177, "step": 1367 }, { "epoch": 1.7432303281299777, "grad_norm": 0.33325004046860457, "learning_rate": 1.6455120973330997e-05, "loss": 0.1608, "step": 1368 }, { "epoch": 1.7445046193055114, "grad_norm": 0.3061871246919976, "learning_rate": 1.6448317418547422e-05, "loss": 0.149, "step": 1369 }, { "epoch": 1.7457789104810448, "grad_norm": 0.3005919734666616, "learning_rate": 1.6441508750608695e-05, "loss": 0.1463, "step": 1370 }, { "epoch": 1.7470532016565785, "grad_norm": 0.3469957613429539, "learning_rate": 1.6434694974913706e-05, "loss": 0.1835, "step": 1371 }, { "epoch": 1.7483274928321122, "grad_norm": 0.33413438282329855, "learning_rate": 1.6427876096865394e-05, "loss": 0.1672, "step": 1372 }, { "epoch": 1.7496017840076457, "grad_norm": 0.31227645052394387, "learning_rate": 1.6421052121870755e-05, "loss": 0.1545, "step": 1373 }, { "epoch": 1.7508760751831793, "grad_norm": 0.3195911496850961, "learning_rate": 1.641422305534082e-05, "loss": 0.1526, "step": 1374 }, { "epoch": 1.752150366358713, "grad_norm": 0.3803856096157082, "learning_rate": 1.640738890269065e-05, "loss": 0.204, "step": 1375 }, { "epoch": 1.7534246575342465, "grad_norm": 0.33753710536723325, "learning_rate": 1.640054966933935e-05, "loss": 0.1711, "step": 1376 }, { "epoch": 1.7546989487097802, "grad_norm": 0.31074018367713846, "learning_rate": 1.639370536071005e-05, "loss": 0.1452, "step": 1377 }, { "epoch": 1.7559732398853138, "grad_norm": 0.3469877392744879, "learning_rate": 1.6386855982229906e-05, "loss": 0.1635, "step": 1378 }, { "epoch": 1.7572475310608473, "grad_norm": 0.3242926232982265, "learning_rate": 1.6380001539330088e-05, "loss": 0.1618, "step": 1379 }, { "epoch": 1.758521822236381, "grad_norm": 0.33142659351931053, "learning_rate": 1.6373142037445787e-05, "loss": 0.1802, "step": 1380 }, { "epoch": 1.7597961134119147, "grad_norm": 0.30009838624897006, "learning_rate": 1.6366277482016208e-05, "loss": 0.151, "step": 1381 }, { "epoch": 1.7610704045874481, "grad_norm": 0.3554607825944423, "learning_rate": 1.635940787848455e-05, "loss": 0.1954, "step": 1382 }, { "epoch": 1.7623446957629818, "grad_norm": 0.3380884955868545, "learning_rate": 1.635253323229804e-05, "loss": 0.1622, "step": 1383 }, { "epoch": 1.7636189869385155, "grad_norm": 0.30271407149150764, "learning_rate": 1.6345653548907873e-05, "loss": 0.1472, "step": 1384 }, { "epoch": 1.764893278114049, "grad_norm": 0.3478079820445306, "learning_rate": 1.6338768833769264e-05, "loss": 0.1911, "step": 1385 }, { "epoch": 1.7661675692895826, "grad_norm": 0.3321773408210662, "learning_rate": 1.6331879092341402e-05, "loss": 0.1546, "step": 1386 }, { "epoch": 1.7674418604651163, "grad_norm": 0.34188804504393533, "learning_rate": 1.6324984330087462e-05, "loss": 0.1829, "step": 1387 }, { "epoch": 1.7687161516406498, "grad_norm": 0.3084614616433242, "learning_rate": 1.6318084552474616e-05, "loss": 0.1565, "step": 1388 }, { "epoch": 1.7699904428161835, "grad_norm": 0.338104415825706, "learning_rate": 1.631117976497399e-05, "loss": 0.1766, "step": 1389 }, { "epoch": 1.7712647339917171, "grad_norm": 0.33331138535349514, "learning_rate": 1.6304269973060707e-05, "loss": 0.1566, "step": 1390 }, { "epoch": 1.7725390251672506, "grad_norm": 0.32258173184270067, "learning_rate": 1.6297355182213837e-05, "loss": 0.1659, "step": 1391 }, { "epoch": 1.7738133163427843, "grad_norm": 0.31508445297200055, "learning_rate": 1.6290435397916426e-05, "loss": 0.169, "step": 1392 }, { "epoch": 1.775087607518318, "grad_norm": 0.31259456534264374, "learning_rate": 1.6283510625655474e-05, "loss": 0.1546, "step": 1393 }, { "epoch": 1.7763618986938514, "grad_norm": 0.3389713485304518, "learning_rate": 1.6276580870921937e-05, "loss": 0.1724, "step": 1394 }, { "epoch": 1.7776361898693853, "grad_norm": 0.351698378358092, "learning_rate": 1.626964613921073e-05, "loss": 0.1717, "step": 1395 }, { "epoch": 1.7789104810449188, "grad_norm": 0.3210877964550346, "learning_rate": 1.6262706436020695e-05, "loss": 0.1651, "step": 1396 }, { "epoch": 1.7801847722204522, "grad_norm": 0.34948825819481905, "learning_rate": 1.625576176685464e-05, "loss": 0.166, "step": 1397 }, { "epoch": 1.7814590633959861, "grad_norm": 0.338028219929353, "learning_rate": 1.6248812137219296e-05, "loss": 0.1668, "step": 1398 }, { "epoch": 1.7827333545715196, "grad_norm": 0.3050611268332842, "learning_rate": 1.624185755262533e-05, "loss": 0.1433, "step": 1399 }, { "epoch": 1.784007645747053, "grad_norm": 0.3108599567356789, "learning_rate": 1.6234898018587336e-05, "loss": 0.1482, "step": 1400 }, { "epoch": 1.785281936922587, "grad_norm": 0.3173384770325718, "learning_rate": 1.622793354062384e-05, "loss": 0.1487, "step": 1401 }, { "epoch": 1.7865562280981204, "grad_norm": 0.3208417242702024, "learning_rate": 1.6220964124257285e-05, "loss": 0.1543, "step": 1402 }, { "epoch": 1.7878305192736539, "grad_norm": 0.3736093135474125, "learning_rate": 1.621398977501402e-05, "loss": 0.195, "step": 1403 }, { "epoch": 1.7891048104491878, "grad_norm": 0.30269360116568916, "learning_rate": 1.620701049842432e-05, "loss": 0.1478, "step": 1404 }, { "epoch": 1.7903791016247212, "grad_norm": 0.3219662197809928, "learning_rate": 1.6200026300022365e-05, "loss": 0.1636, "step": 1405 }, { "epoch": 1.7916533928002547, "grad_norm": 0.3140624175295645, "learning_rate": 1.6193037185346225e-05, "loss": 0.155, "step": 1406 }, { "epoch": 1.7929276839757886, "grad_norm": 0.33379568798958625, "learning_rate": 1.6186043159937884e-05, "loss": 0.1795, "step": 1407 }, { "epoch": 1.794201975151322, "grad_norm": 0.3003326220899053, "learning_rate": 1.6179044229343206e-05, "loss": 0.1475, "step": 1408 }, { "epoch": 1.7954762663268555, "grad_norm": 0.3235183681370796, "learning_rate": 1.617204039911196e-05, "loss": 0.1721, "step": 1409 }, { "epoch": 1.7967505575023894, "grad_norm": 0.3072492630541757, "learning_rate": 1.6165031674797783e-05, "loss": 0.1598, "step": 1410 }, { "epoch": 1.798024848677923, "grad_norm": 0.3541657882373632, "learning_rate": 1.6158018061958213e-05, "loss": 0.1904, "step": 1411 }, { "epoch": 1.7992991398534564, "grad_norm": 0.32493822916711823, "learning_rate": 1.6150999566154642e-05, "loss": 0.1633, "step": 1412 }, { "epoch": 1.8005734310289903, "grad_norm": 0.33565066973320945, "learning_rate": 1.6143976192952348e-05, "loss": 0.1629, "step": 1413 }, { "epoch": 1.8018477222045237, "grad_norm": 0.36681596208100303, "learning_rate": 1.6136947947920477e-05, "loss": 0.1836, "step": 1414 }, { "epoch": 1.8031220133800574, "grad_norm": 0.336047567255069, "learning_rate": 1.6129914836632028e-05, "loss": 0.1652, "step": 1415 }, { "epoch": 1.804396304555591, "grad_norm": 0.32414969263154764, "learning_rate": 1.612287686466387e-05, "loss": 0.1668, "step": 1416 }, { "epoch": 1.8056705957311245, "grad_norm": 0.3245895648020217, "learning_rate": 1.611583403759672e-05, "loss": 0.1724, "step": 1417 }, { "epoch": 1.8069448869066582, "grad_norm": 0.3164613150270618, "learning_rate": 1.6108786361015145e-05, "loss": 0.1507, "step": 1418 }, { "epoch": 1.808219178082192, "grad_norm": 0.3344793894679832, "learning_rate": 1.6101733840507557e-05, "loss": 0.1798, "step": 1419 }, { "epoch": 1.8094934692577254, "grad_norm": 0.3335956670487311, "learning_rate": 1.6094676481666215e-05, "loss": 0.1716, "step": 1420 }, { "epoch": 1.810767760433259, "grad_norm": 0.33919536012168905, "learning_rate": 1.608761429008721e-05, "loss": 0.1831, "step": 1421 }, { "epoch": 1.8120420516087927, "grad_norm": 0.30496585122240194, "learning_rate": 1.6080547271370455e-05, "loss": 0.1434, "step": 1422 }, { "epoch": 1.8133163427843262, "grad_norm": 0.3400620038670421, "learning_rate": 1.6073475431119715e-05, "loss": 0.1871, "step": 1423 }, { "epoch": 1.8145906339598599, "grad_norm": 0.30512675109608733, "learning_rate": 1.6066398774942556e-05, "loss": 0.1559, "step": 1424 }, { "epoch": 1.8158649251353935, "grad_norm": 0.3044396490204371, "learning_rate": 1.6059317308450372e-05, "loss": 0.1511, "step": 1425 }, { "epoch": 1.817139216310927, "grad_norm": 0.32128102705617384, "learning_rate": 1.6052231037258367e-05, "loss": 0.1603, "step": 1426 }, { "epoch": 1.8184135074864607, "grad_norm": 0.32573955477344385, "learning_rate": 1.6045139966985562e-05, "loss": 0.1819, "step": 1427 }, { "epoch": 1.8196877986619944, "grad_norm": 0.31486099917787885, "learning_rate": 1.6038044103254775e-05, "loss": 0.1756, "step": 1428 }, { "epoch": 1.8209620898375278, "grad_norm": 0.3465016676541453, "learning_rate": 1.6030943451692635e-05, "loss": 0.1774, "step": 1429 }, { "epoch": 1.8222363810130615, "grad_norm": 0.3054785527506253, "learning_rate": 1.6023838017929558e-05, "loss": 0.1425, "step": 1430 }, { "epoch": 1.8235106721885952, "grad_norm": 0.3494905481858278, "learning_rate": 1.6016727807599758e-05, "loss": 0.1808, "step": 1431 }, { "epoch": 1.8247849633641287, "grad_norm": 0.3131273684602286, "learning_rate": 1.6009612826341226e-05, "loss": 0.1487, "step": 1432 }, { "epoch": 1.8260592545396623, "grad_norm": 0.3546057929519698, "learning_rate": 1.6002493079795754e-05, "loss": 0.1756, "step": 1433 }, { "epoch": 1.827333545715196, "grad_norm": 0.31018438724084724, "learning_rate": 1.5995368573608904e-05, "loss": 0.1574, "step": 1434 }, { "epoch": 1.8286078368907295, "grad_norm": 0.3426103890929251, "learning_rate": 1.5988239313430004e-05, "loss": 0.1633, "step": 1435 }, { "epoch": 1.8298821280662632, "grad_norm": 0.33806482079224726, "learning_rate": 1.598110530491216e-05, "loss": 0.1623, "step": 1436 }, { "epoch": 1.8311564192417968, "grad_norm": 0.34863203898052775, "learning_rate": 1.5973966553712245e-05, "loss": 0.1847, "step": 1437 }, { "epoch": 1.8324307104173303, "grad_norm": 0.33735012158298927, "learning_rate": 1.5966823065490887e-05, "loss": 0.1695, "step": 1438 }, { "epoch": 1.833705001592864, "grad_norm": 0.2877788216265123, "learning_rate": 1.5959674845912473e-05, "loss": 0.1396, "step": 1439 }, { "epoch": 1.8349792927683977, "grad_norm": 0.32091126790632096, "learning_rate": 1.5952521900645143e-05, "loss": 0.1601, "step": 1440 }, { "epoch": 1.8362535839439311, "grad_norm": 0.343885178554332, "learning_rate": 1.594536423536078e-05, "loss": 0.1812, "step": 1441 }, { "epoch": 1.8375278751194648, "grad_norm": 0.33633368255167284, "learning_rate": 1.5938201855735017e-05, "loss": 0.1649, "step": 1442 }, { "epoch": 1.8388021662949985, "grad_norm": 0.3164106199851661, "learning_rate": 1.593103476744722e-05, "loss": 0.1576, "step": 1443 }, { "epoch": 1.840076457470532, "grad_norm": 0.3096623617300062, "learning_rate": 1.592386297618048e-05, "loss": 0.1438, "step": 1444 }, { "epoch": 1.8413507486460656, "grad_norm": 0.330637963310964, "learning_rate": 1.5916686487621636e-05, "loss": 0.1681, "step": 1445 }, { "epoch": 1.8426250398215993, "grad_norm": 0.32078174577453095, "learning_rate": 1.5909505307461238e-05, "loss": 0.1614, "step": 1446 }, { "epoch": 1.8438993309971328, "grad_norm": 0.30999696186107006, "learning_rate": 1.5902319441393562e-05, "loss": 0.1478, "step": 1447 }, { "epoch": 1.8451736221726664, "grad_norm": 0.2955535288466078, "learning_rate": 1.58951288951166e-05, "loss": 0.1533, "step": 1448 }, { "epoch": 1.8464479133482001, "grad_norm": 0.323015245216486, "learning_rate": 1.5887933674332048e-05, "loss": 0.153, "step": 1449 }, { "epoch": 1.8477222045237336, "grad_norm": 0.36396409450230216, "learning_rate": 1.5880733784745318e-05, "loss": 0.1733, "step": 1450 }, { "epoch": 1.8489964956992673, "grad_norm": 0.33256170047976236, "learning_rate": 1.587352923206552e-05, "loss": 0.1691, "step": 1451 }, { "epoch": 1.850270786874801, "grad_norm": 0.29858777248902635, "learning_rate": 1.5866320022005457e-05, "loss": 0.1381, "step": 1452 }, { "epoch": 1.8515450780503344, "grad_norm": 0.3556542504683024, "learning_rate": 1.5859106160281634e-05, "loss": 0.1715, "step": 1453 }, { "epoch": 1.852819369225868, "grad_norm": 0.32963585515566907, "learning_rate": 1.5851887652614238e-05, "loss": 0.1708, "step": 1454 }, { "epoch": 1.8540936604014018, "grad_norm": 0.3287840674710993, "learning_rate": 1.5844664504727142e-05, "loss": 0.1689, "step": 1455 }, { "epoch": 1.8553679515769352, "grad_norm": 0.3032459770138912, "learning_rate": 1.5837436722347902e-05, "loss": 0.1493, "step": 1456 }, { "epoch": 1.856642242752469, "grad_norm": 0.31583232356576507, "learning_rate": 1.583020431120774e-05, "loss": 0.153, "step": 1457 }, { "epoch": 1.8579165339280026, "grad_norm": 0.3208398784903584, "learning_rate": 1.5822967277041553e-05, "loss": 0.1654, "step": 1458 }, { "epoch": 1.859190825103536, "grad_norm": 0.30771153074712476, "learning_rate": 1.5815725625587906e-05, "loss": 0.1575, "step": 1459 }, { "epoch": 1.8604651162790697, "grad_norm": 0.32195106088236963, "learning_rate": 1.580847936258903e-05, "loss": 0.1573, "step": 1460 }, { "epoch": 1.8617394074546034, "grad_norm": 0.3261313379034259, "learning_rate": 1.58012284937908e-05, "loss": 0.1733, "step": 1461 }, { "epoch": 1.8630136986301369, "grad_norm": 0.3263160701486365, "learning_rate": 1.579397302494275e-05, "loss": 0.1729, "step": 1462 }, { "epoch": 1.8642879898056706, "grad_norm": 0.32451894392560426, "learning_rate": 1.578671296179806e-05, "loss": 0.1614, "step": 1463 }, { "epoch": 1.8655622809812042, "grad_norm": 0.3205193014263159, "learning_rate": 1.5779448310113553e-05, "loss": 0.1631, "step": 1464 }, { "epoch": 1.8668365721567377, "grad_norm": 0.3168692386090936, "learning_rate": 1.5772179075649702e-05, "loss": 0.1633, "step": 1465 }, { "epoch": 1.8681108633322714, "grad_norm": 0.32079269133593385, "learning_rate": 1.576490526417059e-05, "loss": 0.1661, "step": 1466 }, { "epoch": 1.869385154507805, "grad_norm": 0.3249664056306592, "learning_rate": 1.5757626881443956e-05, "loss": 0.1637, "step": 1467 }, { "epoch": 1.8706594456833385, "grad_norm": 0.34792857071555056, "learning_rate": 1.5750343933241133e-05, "loss": 0.1829, "step": 1468 }, { "epoch": 1.8719337368588722, "grad_norm": 0.29324205422323246, "learning_rate": 1.574305642533711e-05, "loss": 0.1364, "step": 1469 }, { "epoch": 1.873208028034406, "grad_norm": 0.3160218044717073, "learning_rate": 1.573576436351046e-05, "loss": 0.147, "step": 1470 }, { "epoch": 1.8744823192099394, "grad_norm": 0.3703812730627399, "learning_rate": 1.572846775354339e-05, "loss": 0.1713, "step": 1471 }, { "epoch": 1.875756610385473, "grad_norm": 0.3198540850925982, "learning_rate": 1.5721166601221697e-05, "loss": 0.1776, "step": 1472 }, { "epoch": 1.8770309015610067, "grad_norm": 0.31740360340602436, "learning_rate": 1.571386091233479e-05, "loss": 0.1545, "step": 1473 }, { "epoch": 1.8783051927365402, "grad_norm": 0.31187119317808343, "learning_rate": 1.570655069267567e-05, "loss": 0.1539, "step": 1474 }, { "epoch": 1.8795794839120739, "grad_norm": 0.3227287126067436, "learning_rate": 1.5699235948040934e-05, "loss": 0.1494, "step": 1475 }, { "epoch": 1.8808537750876075, "grad_norm": 0.3449477182396641, "learning_rate": 1.569191668423076e-05, "loss": 0.1732, "step": 1476 }, { "epoch": 1.882128066263141, "grad_norm": 0.31854324052426297, "learning_rate": 1.5684592907048925e-05, "loss": 0.1595, "step": 1477 }, { "epoch": 1.883402357438675, "grad_norm": 0.31951905898977073, "learning_rate": 1.5677264622302768e-05, "loss": 0.1732, "step": 1478 }, { "epoch": 1.8846766486142084, "grad_norm": 0.31255253205016736, "learning_rate": 1.566993183580321e-05, "loss": 0.1585, "step": 1479 }, { "epoch": 1.8859509397897418, "grad_norm": 0.3180128258584048, "learning_rate": 1.566259455336474e-05, "loss": 0.1557, "step": 1480 }, { "epoch": 1.8872252309652757, "grad_norm": 0.3313216366027495, "learning_rate": 1.5655252780805414e-05, "loss": 0.1666, "step": 1481 }, { "epoch": 1.8884995221408092, "grad_norm": 0.3124161584839295, "learning_rate": 1.5647906523946845e-05, "loss": 0.1506, "step": 1482 }, { "epoch": 1.8897738133163426, "grad_norm": 0.316400235225101, "learning_rate": 1.5640555788614207e-05, "loss": 0.1633, "step": 1483 }, { "epoch": 1.8910481044918765, "grad_norm": 0.30819978858571323, "learning_rate": 1.563320058063622e-05, "loss": 0.1582, "step": 1484 }, { "epoch": 1.89232239566741, "grad_norm": 0.3335424505538857, "learning_rate": 1.5625840905845157e-05, "loss": 0.1532, "step": 1485 }, { "epoch": 1.8935966868429435, "grad_norm": 0.31367858549980465, "learning_rate": 1.5618476770076823e-05, "loss": 0.1491, "step": 1486 }, { "epoch": 1.8948709780184774, "grad_norm": 0.3101615067702912, "learning_rate": 1.5611108179170567e-05, "loss": 0.145, "step": 1487 }, { "epoch": 1.8961452691940108, "grad_norm": 0.3379851430320423, "learning_rate": 1.560373513896927e-05, "loss": 0.1744, "step": 1488 }, { "epoch": 1.8974195603695443, "grad_norm": 0.30929928665653234, "learning_rate": 1.5596357655319346e-05, "loss": 0.1583, "step": 1489 }, { "epoch": 1.8986938515450782, "grad_norm": 0.3175509539841176, "learning_rate": 1.5588975734070717e-05, "loss": 0.1456, "step": 1490 }, { "epoch": 1.8999681427206117, "grad_norm": 0.31471061302917636, "learning_rate": 1.5581589381076843e-05, "loss": 0.156, "step": 1491 }, { "epoch": 1.9012424338961451, "grad_norm": 0.33646135893815815, "learning_rate": 1.557419860219468e-05, "loss": 0.176, "step": 1492 }, { "epoch": 1.902516725071679, "grad_norm": 0.31904758794800436, "learning_rate": 1.5566803403284712e-05, "loss": 0.1689, "step": 1493 }, { "epoch": 1.9037910162472125, "grad_norm": 0.3211653251216465, "learning_rate": 1.555940379021091e-05, "loss": 0.1606, "step": 1494 }, { "epoch": 1.905065307422746, "grad_norm": 0.33599843405229146, "learning_rate": 1.5551999768840758e-05, "loss": 0.1735, "step": 1495 }, { "epoch": 1.9063395985982798, "grad_norm": 0.3237586913261879, "learning_rate": 1.554459134504523e-05, "loss": 0.162, "step": 1496 }, { "epoch": 1.9076138897738133, "grad_norm": 0.3060166890532267, "learning_rate": 1.5537178524698792e-05, "loss": 0.1414, "step": 1497 }, { "epoch": 1.908888180949347, "grad_norm": 0.31145600910035925, "learning_rate": 1.5529761313679396e-05, "loss": 0.1547, "step": 1498 }, { "epoch": 1.9101624721248807, "grad_norm": 0.35579703733078977, "learning_rate": 1.5522339717868475e-05, "loss": 0.1764, "step": 1499 }, { "epoch": 1.9114367633004141, "grad_norm": 0.3279362971111204, "learning_rate": 1.551491374315094e-05, "loss": 0.1526, "step": 1500 }, { "epoch": 1.9127110544759478, "grad_norm": 0.334041697412138, "learning_rate": 1.5507483395415173e-05, "loss": 0.1808, "step": 1501 }, { "epoch": 1.9139853456514815, "grad_norm": 0.31868341116275867, "learning_rate": 1.5500048680553025e-05, "loss": 0.156, "step": 1502 }, { "epoch": 1.915259636827015, "grad_norm": 0.35479957845160304, "learning_rate": 1.549260960445981e-05, "loss": 0.1604, "step": 1503 }, { "epoch": 1.9165339280025486, "grad_norm": 0.3111225758801434, "learning_rate": 1.5485166173034302e-05, "loss": 0.165, "step": 1504 }, { "epoch": 1.9178082191780823, "grad_norm": 0.3686607947313647, "learning_rate": 1.5477718392178716e-05, "loss": 0.1888, "step": 1505 }, { "epoch": 1.9190825103536158, "grad_norm": 0.324360382138455, "learning_rate": 1.5470266267798733e-05, "loss": 0.1477, "step": 1506 }, { "epoch": 1.9203568015291494, "grad_norm": 0.33276499937601445, "learning_rate": 1.546280980580347e-05, "loss": 0.1683, "step": 1507 }, { "epoch": 1.9216310927046831, "grad_norm": 0.31273035076260425, "learning_rate": 1.5455349012105488e-05, "loss": 0.1511, "step": 1508 }, { "epoch": 1.9229053838802166, "grad_norm": 0.3186252169979847, "learning_rate": 1.5447883892620774e-05, "loss": 0.1529, "step": 1509 }, { "epoch": 1.9241796750557503, "grad_norm": 0.33206122714864533, "learning_rate": 1.544041445326875e-05, "loss": 0.1708, "step": 1510 }, { "epoch": 1.925453966231284, "grad_norm": 0.32256847887810036, "learning_rate": 1.5432940699972268e-05, "loss": 0.1539, "step": 1511 }, { "epoch": 1.9267282574068174, "grad_norm": 0.3278780060964058, "learning_rate": 1.5425462638657597e-05, "loss": 0.1604, "step": 1512 }, { "epoch": 1.928002548582351, "grad_norm": 0.3370519357615476, "learning_rate": 1.5417980275254418e-05, "loss": 0.16, "step": 1513 }, { "epoch": 1.9292768397578848, "grad_norm": 0.3379479915681676, "learning_rate": 1.541049361569582e-05, "loss": 0.1757, "step": 1514 }, { "epoch": 1.9305511309334182, "grad_norm": 0.3293100205948855, "learning_rate": 1.540300266591832e-05, "loss": 0.1712, "step": 1515 }, { "epoch": 1.931825422108952, "grad_norm": 0.2860571011213142, "learning_rate": 1.5395507431861813e-05, "loss": 0.1381, "step": 1516 }, { "epoch": 1.9330997132844856, "grad_norm": 0.32393803852205194, "learning_rate": 1.5388007919469604e-05, "loss": 0.1755, "step": 1517 }, { "epoch": 1.934374004460019, "grad_norm": 0.32641903757403484, "learning_rate": 1.5380504134688387e-05, "loss": 0.1608, "step": 1518 }, { "epoch": 1.9356482956355527, "grad_norm": 0.34569406162794314, "learning_rate": 1.5372996083468242e-05, "loss": 0.1781, "step": 1519 }, { "epoch": 1.9369225868110864, "grad_norm": 0.2930744657787216, "learning_rate": 1.536548377176263e-05, "loss": 0.1402, "step": 1520 }, { "epoch": 1.9381968779866199, "grad_norm": 0.3187532552220501, "learning_rate": 1.5357967205528402e-05, "loss": 0.1561, "step": 1521 }, { "epoch": 1.9394711691621536, "grad_norm": 0.35223568247171216, "learning_rate": 1.5350446390725772e-05, "loss": 0.1704, "step": 1522 }, { "epoch": 1.9407454603376872, "grad_norm": 0.3395970288700865, "learning_rate": 1.5342921333318323e-05, "loss": 0.1724, "step": 1523 }, { "epoch": 1.9420197515132207, "grad_norm": 0.3227620064082231, "learning_rate": 1.5335392039273008e-05, "loss": 0.1569, "step": 1524 }, { "epoch": 1.9432940426887544, "grad_norm": 0.31646175781924873, "learning_rate": 1.5327858514560133e-05, "loss": 0.1631, "step": 1525 }, { "epoch": 1.944568333864288, "grad_norm": 0.33840599755662976, "learning_rate": 1.5320320765153367e-05, "loss": 0.1552, "step": 1526 }, { "epoch": 1.9458426250398215, "grad_norm": 0.33154890607347276, "learning_rate": 1.531277879702972e-05, "loss": 0.1532, "step": 1527 }, { "epoch": 1.9471169162153552, "grad_norm": 0.30909855552034415, "learning_rate": 1.5305232616169548e-05, "loss": 0.1509, "step": 1528 }, { "epoch": 1.948391207390889, "grad_norm": 0.32883769731755, "learning_rate": 1.5297682228556555e-05, "loss": 0.163, "step": 1529 }, { "epoch": 1.9496654985664224, "grad_norm": 0.3129134325965204, "learning_rate": 1.529012764017778e-05, "loss": 0.1563, "step": 1530 }, { "epoch": 1.950939789741956, "grad_norm": 0.32031539549969945, "learning_rate": 1.528256885702358e-05, "loss": 0.1514, "step": 1531 }, { "epoch": 1.9522140809174897, "grad_norm": 0.3500224803815126, "learning_rate": 1.527500588508765e-05, "loss": 0.1914, "step": 1532 }, { "epoch": 1.9534883720930232, "grad_norm": 0.3037022249150242, "learning_rate": 1.526743873036701e-05, "loss": 0.1491, "step": 1533 }, { "epoch": 1.9547626632685569, "grad_norm": 0.31014816702583986, "learning_rate": 1.5259867398861983e-05, "loss": 0.1546, "step": 1534 }, { "epoch": 1.9560369544440905, "grad_norm": 0.3565385938582045, "learning_rate": 1.5252291896576214e-05, "loss": 0.1656, "step": 1535 }, { "epoch": 1.957311245619624, "grad_norm": 0.3444651027711253, "learning_rate": 1.5244712229516656e-05, "loss": 0.1662, "step": 1536 }, { "epoch": 1.9585855367951577, "grad_norm": 0.3129093466144767, "learning_rate": 1.5237128403693558e-05, "loss": 0.1633, "step": 1537 }, { "epoch": 1.9598598279706914, "grad_norm": 0.32187020271041505, "learning_rate": 1.5229540425120468e-05, "loss": 0.1714, "step": 1538 }, { "epoch": 1.9611341191462248, "grad_norm": 0.32864291341450974, "learning_rate": 1.5221948299814234e-05, "loss": 0.1516, "step": 1539 }, { "epoch": 1.9624084103217585, "grad_norm": 0.3093608814928511, "learning_rate": 1.5214352033794981e-05, "loss": 0.1514, "step": 1540 }, { "epoch": 1.9636827014972922, "grad_norm": 0.3394073100898759, "learning_rate": 1.520675163308613e-05, "loss": 0.163, "step": 1541 }, { "epoch": 1.9649569926728256, "grad_norm": 0.36707255670340955, "learning_rate": 1.5199147103714368e-05, "loss": 0.1903, "step": 1542 }, { "epoch": 1.9662312838483593, "grad_norm": 0.30327827456627743, "learning_rate": 1.5191538451709665e-05, "loss": 0.1557, "step": 1543 }, { "epoch": 1.967505575023893, "grad_norm": 0.33694267706371006, "learning_rate": 1.5183925683105254e-05, "loss": 0.1701, "step": 1544 }, { "epoch": 1.9687798661994265, "grad_norm": 0.36863993269864004, "learning_rate": 1.5176308803937633e-05, "loss": 0.1947, "step": 1545 }, { "epoch": 1.9700541573749601, "grad_norm": 0.2908569934433535, "learning_rate": 1.5168687820246567e-05, "loss": 0.1454, "step": 1546 }, { "epoch": 1.9713284485504938, "grad_norm": 0.35011183443463084, "learning_rate": 1.5161062738075068e-05, "loss": 0.1687, "step": 1547 }, { "epoch": 1.9726027397260273, "grad_norm": 0.3316928442282249, "learning_rate": 1.5153433563469398e-05, "loss": 0.1625, "step": 1548 }, { "epoch": 1.973877030901561, "grad_norm": 0.3192560945009996, "learning_rate": 1.5145800302479065e-05, "loss": 0.1686, "step": 1549 }, { "epoch": 1.9751513220770947, "grad_norm": 0.2930375552273462, "learning_rate": 1.5138162961156826e-05, "loss": 0.1316, "step": 1550 }, { "epoch": 1.976425613252628, "grad_norm": 0.3235387678049313, "learning_rate": 1.5130521545558654e-05, "loss": 0.1654, "step": 1551 }, { "epoch": 1.9776999044281618, "grad_norm": 0.3216257493442198, "learning_rate": 1.5122876061743772e-05, "loss": 0.1595, "step": 1552 }, { "epoch": 1.9789741956036955, "grad_norm": 0.322943807368077, "learning_rate": 1.511522651577462e-05, "loss": 0.1605, "step": 1553 }, { "epoch": 1.980248486779229, "grad_norm": 0.30923686974556097, "learning_rate": 1.5107572913716859e-05, "loss": 0.1519, "step": 1554 }, { "epoch": 1.9815227779547626, "grad_norm": 0.3391769944625233, "learning_rate": 1.5099915261639367e-05, "loss": 0.1843, "step": 1555 }, { "epoch": 1.9827970691302963, "grad_norm": 0.30058652989108825, "learning_rate": 1.5092253565614234e-05, "loss": 0.1512, "step": 1556 }, { "epoch": 1.9840713603058298, "grad_norm": 0.34695368637184626, "learning_rate": 1.5084587831716758e-05, "loss": 0.1932, "step": 1557 }, { "epoch": 1.9853456514813634, "grad_norm": 0.3235445703608707, "learning_rate": 1.5076918066025436e-05, "loss": 0.1645, "step": 1558 }, { "epoch": 1.9866199426568971, "grad_norm": 0.3368643883167936, "learning_rate": 1.5069244274621966e-05, "loss": 0.1715, "step": 1559 }, { "epoch": 1.9878942338324306, "grad_norm": 0.3249998040407997, "learning_rate": 1.506156646359123e-05, "loss": 0.1923, "step": 1560 }, { "epoch": 1.9891685250079645, "grad_norm": 0.30237808896306195, "learning_rate": 1.505388463902131e-05, "loss": 0.1521, "step": 1561 }, { "epoch": 1.990442816183498, "grad_norm": 0.30213862972190797, "learning_rate": 1.504619880700346e-05, "loss": 0.1599, "step": 1562 }, { "epoch": 1.9917171073590314, "grad_norm": 0.30786348284839515, "learning_rate": 1.5038508973632108e-05, "loss": 0.1587, "step": 1563 }, { "epoch": 1.9929913985345653, "grad_norm": 0.30582408877973905, "learning_rate": 1.5030815145004876e-05, "loss": 0.1602, "step": 1564 }, { "epoch": 1.9942656897100988, "grad_norm": 0.3080360964562665, "learning_rate": 1.502311732722253e-05, "loss": 0.1532, "step": 1565 }, { "epoch": 1.9955399808856322, "grad_norm": 0.3234465343566477, "learning_rate": 1.5015415526389013e-05, "loss": 0.1678, "step": 1566 }, { "epoch": 1.9968142720611661, "grad_norm": 0.3173204296834856, "learning_rate": 1.5007709748611419e-05, "loss": 0.1577, "step": 1567 }, { "epoch": 1.9980885632366996, "grad_norm": 0.3205053810070002, "learning_rate": 1.5000000000000002e-05, "loss": 0.164, "step": 1568 }, { "epoch": 1.999362854412233, "grad_norm": 0.3611910018664256, "learning_rate": 1.499228628666816e-05, "loss": 0.1924, "step": 1569 }, { "epoch": 2.000637145587767, "grad_norm": 0.3546803907102201, "learning_rate": 1.4984568614732435e-05, "loss": 0.1259, "step": 1570 }, { "epoch": 2.0019114367633004, "grad_norm": 0.3403728300327264, "learning_rate": 1.4976846990312515e-05, "loss": 0.0931, "step": 1571 }, { "epoch": 2.003185727938834, "grad_norm": 0.36112646720045943, "learning_rate": 1.496912141953121e-05, "loss": 0.0943, "step": 1572 }, { "epoch": 2.0044600191143678, "grad_norm": 0.2909823276264976, "learning_rate": 1.4961391908514469e-05, "loss": 0.0793, "step": 1573 }, { "epoch": 2.0057343102899012, "grad_norm": 0.44777830301943644, "learning_rate": 1.495365846339136e-05, "loss": 0.0884, "step": 1574 }, { "epoch": 2.0070086014654347, "grad_norm": 0.4767284935390453, "learning_rate": 1.4945921090294076e-05, "loss": 0.0866, "step": 1575 }, { "epoch": 2.0082828926409686, "grad_norm": 0.34881203543191736, "learning_rate": 1.4938179795357916e-05, "loss": 0.0771, "step": 1576 }, { "epoch": 2.009557183816502, "grad_norm": 0.3493023912076856, "learning_rate": 1.4930434584721299e-05, "loss": 0.0997, "step": 1577 }, { "epoch": 2.0108314749920355, "grad_norm": 0.35872700645471833, "learning_rate": 1.492268546452574e-05, "loss": 0.1031, "step": 1578 }, { "epoch": 2.0121057661675694, "grad_norm": 0.3931341451432176, "learning_rate": 1.4914932440915863e-05, "loss": 0.1139, "step": 1579 }, { "epoch": 2.013380057343103, "grad_norm": 0.3083374204076508, "learning_rate": 1.4907175520039381e-05, "loss": 0.0729, "step": 1580 }, { "epoch": 2.0146543485186363, "grad_norm": 0.3163438267244767, "learning_rate": 1.4899414708047094e-05, "loss": 0.0904, "step": 1581 }, { "epoch": 2.0159286396941702, "grad_norm": 0.3013262937015894, "learning_rate": 1.4891650011092896e-05, "loss": 0.0755, "step": 1582 }, { "epoch": 2.0172029308697037, "grad_norm": 0.3186507297264456, "learning_rate": 1.4883881435333753e-05, "loss": 0.0843, "step": 1583 }, { "epoch": 2.018477222045237, "grad_norm": 0.33801988693641866, "learning_rate": 1.4876108986929719e-05, "loss": 0.0949, "step": 1584 }, { "epoch": 2.019751513220771, "grad_norm": 0.33834596105727677, "learning_rate": 1.4868332672043901e-05, "loss": 0.0893, "step": 1585 }, { "epoch": 2.0210258043963045, "grad_norm": 0.33024157811996696, "learning_rate": 1.4860552496842493e-05, "loss": 0.0916, "step": 1586 }, { "epoch": 2.022300095571838, "grad_norm": 0.3327274786936272, "learning_rate": 1.485276846749473e-05, "loss": 0.0894, "step": 1587 }, { "epoch": 2.023574386747372, "grad_norm": 0.32001456222387187, "learning_rate": 1.4844980590172914e-05, "loss": 0.0853, "step": 1588 }, { "epoch": 2.0248486779229053, "grad_norm": 0.3304872055059335, "learning_rate": 1.4837188871052399e-05, "loss": 0.0961, "step": 1589 }, { "epoch": 2.026122969098439, "grad_norm": 0.3225386153262728, "learning_rate": 1.4829393316311583e-05, "loss": 0.0838, "step": 1590 }, { "epoch": 2.0273972602739727, "grad_norm": 0.3075963164246764, "learning_rate": 1.4821593932131901e-05, "loss": 0.0872, "step": 1591 }, { "epoch": 2.028671551449506, "grad_norm": 0.3028457138359727, "learning_rate": 1.4813790724697832e-05, "loss": 0.0802, "step": 1592 }, { "epoch": 2.0299458426250396, "grad_norm": 0.31667494084643255, "learning_rate": 1.480598370019688e-05, "loss": 0.0908, "step": 1593 }, { "epoch": 2.0312201338005735, "grad_norm": 0.2909324624367697, "learning_rate": 1.479817286481958e-05, "loss": 0.072, "step": 1594 }, { "epoch": 2.032494424976107, "grad_norm": 0.3219213904866198, "learning_rate": 1.4790358224759491e-05, "loss": 0.0883, "step": 1595 }, { "epoch": 2.0337687161516405, "grad_norm": 0.3448999040780737, "learning_rate": 1.4782539786213184e-05, "loss": 0.0915, "step": 1596 }, { "epoch": 2.0350430073271744, "grad_norm": 0.3232025893562507, "learning_rate": 1.477471755538024e-05, "loss": 0.0832, "step": 1597 }, { "epoch": 2.036317298502708, "grad_norm": 0.32506813849442057, "learning_rate": 1.4766891538463255e-05, "loss": 0.0949, "step": 1598 }, { "epoch": 2.0375915896782413, "grad_norm": 0.3108895451406324, "learning_rate": 1.4759061741667821e-05, "loss": 0.0796, "step": 1599 }, { "epoch": 2.038865880853775, "grad_norm": 0.3621391528157858, "learning_rate": 1.475122817120253e-05, "loss": 0.1095, "step": 1600 }, { "epoch": 2.0401401720293086, "grad_norm": 0.3217742089067684, "learning_rate": 1.4743390833278961e-05, "loss": 0.0897, "step": 1601 }, { "epoch": 2.041414463204842, "grad_norm": 0.32293360309893865, "learning_rate": 1.4735549734111692e-05, "loss": 0.0924, "step": 1602 }, { "epoch": 2.042688754380376, "grad_norm": 0.3262606989544506, "learning_rate": 1.4727704879918272e-05, "loss": 0.0972, "step": 1603 }, { "epoch": 2.0439630455559095, "grad_norm": 0.29629923937917046, "learning_rate": 1.471985627691923e-05, "loss": 0.0768, "step": 1604 }, { "epoch": 2.045237336731443, "grad_norm": 0.31077966682560243, "learning_rate": 1.4712003931338074e-05, "loss": 0.0833, "step": 1605 }, { "epoch": 2.046511627906977, "grad_norm": 0.3278456739486212, "learning_rate": 1.4704147849401274e-05, "loss": 0.0877, "step": 1606 }, { "epoch": 2.0477859190825103, "grad_norm": 0.31995155123200114, "learning_rate": 1.4696288037338258e-05, "loss": 0.0916, "step": 1607 }, { "epoch": 2.0490602102580437, "grad_norm": 0.3155976190588742, "learning_rate": 1.4688424501381425e-05, "loss": 0.0921, "step": 1608 }, { "epoch": 2.0503345014335776, "grad_norm": 0.28415582840570036, "learning_rate": 1.4680557247766114e-05, "loss": 0.0668, "step": 1609 }, { "epoch": 2.051608792609111, "grad_norm": 0.30731338388966595, "learning_rate": 1.4672686282730622e-05, "loss": 0.0873, "step": 1610 }, { "epoch": 2.0528830837846446, "grad_norm": 0.31420173163908033, "learning_rate": 1.466481161251618e-05, "loss": 0.0845, "step": 1611 }, { "epoch": 2.0541573749601785, "grad_norm": 0.3544693703819501, "learning_rate": 1.4656933243366962e-05, "loss": 0.1016, "step": 1612 }, { "epoch": 2.055431666135712, "grad_norm": 0.3450667506314578, "learning_rate": 1.4649051181530075e-05, "loss": 0.0976, "step": 1613 }, { "epoch": 2.056705957311246, "grad_norm": 0.3205888433527418, "learning_rate": 1.4641165433255556e-05, "loss": 0.0907, "step": 1614 }, { "epoch": 2.0579802484867793, "grad_norm": 0.30755951385000796, "learning_rate": 1.4633276004796355e-05, "loss": 0.0762, "step": 1615 }, { "epoch": 2.0592545396623128, "grad_norm": 0.33380885086564605, "learning_rate": 1.4625382902408356e-05, "loss": 0.0877, "step": 1616 }, { "epoch": 2.0605288308378467, "grad_norm": 0.298447662826546, "learning_rate": 1.4617486132350343e-05, "loss": 0.0721, "step": 1617 }, { "epoch": 2.06180312201338, "grad_norm": 0.3345999386915, "learning_rate": 1.4609585700884013e-05, "loss": 0.0889, "step": 1618 }, { "epoch": 2.0630774131889136, "grad_norm": 0.3343701892254835, "learning_rate": 1.4601681614273966e-05, "loss": 0.0962, "step": 1619 }, { "epoch": 2.0643517043644475, "grad_norm": 0.3324656132456174, "learning_rate": 1.4593773878787704e-05, "loss": 0.0875, "step": 1620 }, { "epoch": 2.065625995539981, "grad_norm": 0.34003050040148647, "learning_rate": 1.458586250069562e-05, "loss": 0.0994, "step": 1621 }, { "epoch": 2.0669002867155144, "grad_norm": 0.36253676408090335, "learning_rate": 1.457794748627099e-05, "loss": 0.1016, "step": 1622 }, { "epoch": 2.0681745778910483, "grad_norm": 0.29451480373062755, "learning_rate": 1.457002884178998e-05, "loss": 0.0753, "step": 1623 }, { "epoch": 2.0694488690665818, "grad_norm": 0.30729717146375274, "learning_rate": 1.4562106573531632e-05, "loss": 0.078, "step": 1624 }, { "epoch": 2.0707231602421152, "grad_norm": 0.31577737382801974, "learning_rate": 1.4554180687777862e-05, "loss": 0.0863, "step": 1625 }, { "epoch": 2.071997451417649, "grad_norm": 0.3221239807317423, "learning_rate": 1.4546251190813452e-05, "loss": 0.0843, "step": 1626 }, { "epoch": 2.0732717425931826, "grad_norm": 0.3142257953166396, "learning_rate": 1.4538318088926054e-05, "loss": 0.0792, "step": 1627 }, { "epoch": 2.074546033768716, "grad_norm": 0.29466109317499467, "learning_rate": 1.4530381388406172e-05, "loss": 0.0721, "step": 1628 }, { "epoch": 2.07582032494425, "grad_norm": 0.31245996542616783, "learning_rate": 1.4522441095547164e-05, "loss": 0.0805, "step": 1629 }, { "epoch": 2.0770946161197834, "grad_norm": 0.34728803772822153, "learning_rate": 1.451449721664524e-05, "loss": 0.1057, "step": 1630 }, { "epoch": 2.078368907295317, "grad_norm": 0.3311033870652477, "learning_rate": 1.4506549757999456e-05, "loss": 0.0826, "step": 1631 }, { "epoch": 2.0796431984708508, "grad_norm": 0.3158198331413611, "learning_rate": 1.4498598725911693e-05, "loss": 0.0822, "step": 1632 }, { "epoch": 2.0809174896463842, "grad_norm": 0.30291043060326467, "learning_rate": 1.449064412668668e-05, "loss": 0.0767, "step": 1633 }, { "epoch": 2.0821917808219177, "grad_norm": 0.3244400977105374, "learning_rate": 1.448268596663197e-05, "loss": 0.0845, "step": 1634 }, { "epoch": 2.0834660719974516, "grad_norm": 0.33928065467657664, "learning_rate": 1.4474724252057941e-05, "loss": 0.0923, "step": 1635 }, { "epoch": 2.084740363172985, "grad_norm": 0.3066974014459918, "learning_rate": 1.4466758989277782e-05, "loss": 0.0791, "step": 1636 }, { "epoch": 2.0860146543485185, "grad_norm": 0.32675004252529705, "learning_rate": 1.4458790184607506e-05, "loss": 0.0965, "step": 1637 }, { "epoch": 2.0872889455240524, "grad_norm": 0.30456628835902694, "learning_rate": 1.4450817844365924e-05, "loss": 0.0855, "step": 1638 }, { "epoch": 2.088563236699586, "grad_norm": 0.3226129849210433, "learning_rate": 1.4442841974874661e-05, "loss": 0.0954, "step": 1639 }, { "epoch": 2.0898375278751193, "grad_norm": 0.306757621341492, "learning_rate": 1.4434862582458136e-05, "loss": 0.0762, "step": 1640 }, { "epoch": 2.0911118190506532, "grad_norm": 0.32318824046372135, "learning_rate": 1.4426879673443557e-05, "loss": 0.0904, "step": 1641 }, { "epoch": 2.0923861102261867, "grad_norm": 0.3100853727861643, "learning_rate": 1.4418893254160926e-05, "loss": 0.0856, "step": 1642 }, { "epoch": 2.09366040140172, "grad_norm": 0.3031141089237483, "learning_rate": 1.441090333094303e-05, "loss": 0.0757, "step": 1643 }, { "epoch": 2.094934692577254, "grad_norm": 0.3092952604083337, "learning_rate": 1.4402909910125425e-05, "loss": 0.083, "step": 1644 }, { "epoch": 2.0962089837527875, "grad_norm": 0.3454000946046223, "learning_rate": 1.4394912998046451e-05, "loss": 0.0993, "step": 1645 }, { "epoch": 2.097483274928321, "grad_norm": 0.2896747373377177, "learning_rate": 1.4386912601047214e-05, "loss": 0.0702, "step": 1646 }, { "epoch": 2.098757566103855, "grad_norm": 0.3239549123798449, "learning_rate": 1.4378908725471577e-05, "loss": 0.0839, "step": 1647 }, { "epoch": 2.1000318572793883, "grad_norm": 0.3147767477757901, "learning_rate": 1.4370901377666166e-05, "loss": 0.0829, "step": 1648 }, { "epoch": 2.101306148454922, "grad_norm": 0.3120466027439021, "learning_rate": 1.4362890563980364e-05, "loss": 0.0836, "step": 1649 }, { "epoch": 2.1025804396304557, "grad_norm": 0.3200254656921956, "learning_rate": 1.4354876290766295e-05, "loss": 0.0896, "step": 1650 }, { "epoch": 2.103854730805989, "grad_norm": 0.33307395422342606, "learning_rate": 1.4346858564378832e-05, "loss": 0.0933, "step": 1651 }, { "epoch": 2.1051290219815226, "grad_norm": 0.30441603598977085, "learning_rate": 1.4338837391175582e-05, "loss": 0.0775, "step": 1652 }, { "epoch": 2.1064033131570565, "grad_norm": 0.3349544853681926, "learning_rate": 1.4330812777516888e-05, "loss": 0.0906, "step": 1653 }, { "epoch": 2.10767760433259, "grad_norm": 0.33964746524845507, "learning_rate": 1.4322784729765819e-05, "loss": 0.092, "step": 1654 }, { "epoch": 2.1089518955081235, "grad_norm": 0.3079161831283532, "learning_rate": 1.4314753254288168e-05, "loss": 0.08, "step": 1655 }, { "epoch": 2.1102261866836574, "grad_norm": 0.3380428548938303, "learning_rate": 1.4306718357452445e-05, "loss": 0.0876, "step": 1656 }, { "epoch": 2.111500477859191, "grad_norm": 0.3030510825276236, "learning_rate": 1.4298680045629876e-05, "loss": 0.0823, "step": 1657 }, { "epoch": 2.1127747690347243, "grad_norm": 0.3279313474599316, "learning_rate": 1.4290638325194392e-05, "loss": 0.0907, "step": 1658 }, { "epoch": 2.114049060210258, "grad_norm": 0.32728829548204796, "learning_rate": 1.4282593202522627e-05, "loss": 0.0934, "step": 1659 }, { "epoch": 2.1153233513857916, "grad_norm": 0.305965083467469, "learning_rate": 1.4274544683993917e-05, "loss": 0.0701, "step": 1660 }, { "epoch": 2.116597642561325, "grad_norm": 0.3274612747579627, "learning_rate": 1.4266492775990281e-05, "loss": 0.0814, "step": 1661 }, { "epoch": 2.117871933736859, "grad_norm": 0.32361989643040406, "learning_rate": 1.4258437484896437e-05, "loss": 0.0857, "step": 1662 }, { "epoch": 2.1191462249123925, "grad_norm": 0.3156449271699728, "learning_rate": 1.4250378817099775e-05, "loss": 0.082, "step": 1663 }, { "epoch": 2.120420516087926, "grad_norm": 0.3177235658278051, "learning_rate": 1.4242316778990373e-05, "loss": 0.0829, "step": 1664 }, { "epoch": 2.12169480726346, "grad_norm": 0.31204881895431186, "learning_rate": 1.4234251376960978e-05, "loss": 0.0771, "step": 1665 }, { "epoch": 2.1229690984389933, "grad_norm": 0.3159460637819134, "learning_rate": 1.4226182617406996e-05, "loss": 0.0839, "step": 1666 }, { "epoch": 2.1242433896145267, "grad_norm": 0.3175054519649733, "learning_rate": 1.4218110506726506e-05, "loss": 0.0859, "step": 1667 }, { "epoch": 2.1255176807900606, "grad_norm": 0.30652556104449424, "learning_rate": 1.421003505132024e-05, "loss": 0.0842, "step": 1668 }, { "epoch": 2.126791971965594, "grad_norm": 0.3267537710707511, "learning_rate": 1.4201956257591585e-05, "loss": 0.0906, "step": 1669 }, { "epoch": 2.1280662631411276, "grad_norm": 0.32454172236963613, "learning_rate": 1.419387413194657e-05, "loss": 0.0849, "step": 1670 }, { "epoch": 2.1293405543166615, "grad_norm": 0.3161814096068083, "learning_rate": 1.4185788680793867e-05, "loss": 0.0813, "step": 1671 }, { "epoch": 2.130614845492195, "grad_norm": 0.32319455057474694, "learning_rate": 1.4177699910544793e-05, "loss": 0.0911, "step": 1672 }, { "epoch": 2.1318891366677284, "grad_norm": 0.31388394857063845, "learning_rate": 1.4169607827613284e-05, "loss": 0.083, "step": 1673 }, { "epoch": 2.1331634278432623, "grad_norm": 0.30403755396759663, "learning_rate": 1.4161512438415911e-05, "loss": 0.0746, "step": 1674 }, { "epoch": 2.1344377190187958, "grad_norm": 0.31556514701431443, "learning_rate": 1.4153413749371864e-05, "loss": 0.0854, "step": 1675 }, { "epoch": 2.135712010194329, "grad_norm": 0.32702710045252786, "learning_rate": 1.4145311766902956e-05, "loss": 0.0891, "step": 1676 }, { "epoch": 2.136986301369863, "grad_norm": 0.3108885090708535, "learning_rate": 1.41372064974336e-05, "loss": 0.0746, "step": 1677 }, { "epoch": 2.1382605925453966, "grad_norm": 0.32067438246114643, "learning_rate": 1.4129097947390825e-05, "loss": 0.0797, "step": 1678 }, { "epoch": 2.13953488372093, "grad_norm": 0.32647380826324335, "learning_rate": 1.4120986123204257e-05, "loss": 0.0815, "step": 1679 }, { "epoch": 2.140809174896464, "grad_norm": 0.33427477179243875, "learning_rate": 1.4112871031306118e-05, "loss": 0.087, "step": 1680 }, { "epoch": 2.1420834660719974, "grad_norm": 0.3310908686675031, "learning_rate": 1.4104752678131221e-05, "loss": 0.0837, "step": 1681 }, { "epoch": 2.143357757247531, "grad_norm": 0.3549499909979723, "learning_rate": 1.409663107011697e-05, "loss": 0.0971, "step": 1682 }, { "epoch": 2.1446320484230648, "grad_norm": 0.3096414964554508, "learning_rate": 1.4088506213703343e-05, "loss": 0.0749, "step": 1683 }, { "epoch": 2.1459063395985982, "grad_norm": 0.3233009511092528, "learning_rate": 1.4080378115332899e-05, "loss": 0.0832, "step": 1684 }, { "epoch": 2.1471806307741317, "grad_norm": 0.3341164009621491, "learning_rate": 1.4072246781450762e-05, "loss": 0.0853, "step": 1685 }, { "epoch": 2.1484549219496656, "grad_norm": 0.32547710279659453, "learning_rate": 1.4064112218504632e-05, "loss": 0.0923, "step": 1686 }, { "epoch": 2.149729213125199, "grad_norm": 0.2917773046694318, "learning_rate": 1.4055974432944753e-05, "loss": 0.0721, "step": 1687 }, { "epoch": 2.151003504300733, "grad_norm": 0.342257839270584, "learning_rate": 1.4047833431223938e-05, "loss": 0.0992, "step": 1688 }, { "epoch": 2.1522777954762664, "grad_norm": 0.3252787780053984, "learning_rate": 1.403968921979755e-05, "loss": 0.0912, "step": 1689 }, { "epoch": 2.1535520866518, "grad_norm": 0.3190666776230112, "learning_rate": 1.403154180512349e-05, "loss": 0.0749, "step": 1690 }, { "epoch": 2.1548263778273338, "grad_norm": 0.3626391621348905, "learning_rate": 1.4023391193662204e-05, "loss": 0.0991, "step": 1691 }, { "epoch": 2.1561006690028672, "grad_norm": 0.3231175478375092, "learning_rate": 1.4015237391876668e-05, "loss": 0.0861, "step": 1692 }, { "epoch": 2.1573749601784007, "grad_norm": 0.31233085426760826, "learning_rate": 1.4007080406232394e-05, "loss": 0.0763, "step": 1693 }, { "epoch": 2.1586492513539346, "grad_norm": 0.32750745263936437, "learning_rate": 1.3998920243197408e-05, "loss": 0.0879, "step": 1694 }, { "epoch": 2.159923542529468, "grad_norm": 0.3354184753202159, "learning_rate": 1.3990756909242275e-05, "loss": 0.0911, "step": 1695 }, { "epoch": 2.1611978337050015, "grad_norm": 0.31551290444209723, "learning_rate": 1.3982590410840056e-05, "loss": 0.0777, "step": 1696 }, { "epoch": 2.1624721248805354, "grad_norm": 0.3447774485306984, "learning_rate": 1.397442075446633e-05, "loss": 0.0885, "step": 1697 }, { "epoch": 2.163746416056069, "grad_norm": 0.30040509912398755, "learning_rate": 1.3966247946599172e-05, "loss": 0.0756, "step": 1698 }, { "epoch": 2.1650207072316023, "grad_norm": 0.3244245026610155, "learning_rate": 1.3958071993719169e-05, "loss": 0.0877, "step": 1699 }, { "epoch": 2.1662949984071362, "grad_norm": 0.3099888768819473, "learning_rate": 1.3949892902309387e-05, "loss": 0.0831, "step": 1700 }, { "epoch": 2.1675692895826697, "grad_norm": 0.32420316947225664, "learning_rate": 1.3941710678855396e-05, "loss": 0.0968, "step": 1701 }, { "epoch": 2.168843580758203, "grad_norm": 0.32468814591587, "learning_rate": 1.3933525329845234e-05, "loss": 0.0925, "step": 1702 }, { "epoch": 2.170117871933737, "grad_norm": 0.3046889843064151, "learning_rate": 1.3925336861769432e-05, "loss": 0.0758, "step": 1703 }, { "epoch": 2.1713921631092705, "grad_norm": 0.3268940252199531, "learning_rate": 1.3917145281120983e-05, "loss": 0.089, "step": 1704 }, { "epoch": 2.172666454284804, "grad_norm": 0.31113077363267133, "learning_rate": 1.3908950594395354e-05, "loss": 0.0804, "step": 1705 }, { "epoch": 2.173940745460338, "grad_norm": 0.31927444808081923, "learning_rate": 1.390075280809047e-05, "loss": 0.0854, "step": 1706 }, { "epoch": 2.1752150366358713, "grad_norm": 0.3018702309723153, "learning_rate": 1.3892551928706723e-05, "loss": 0.0845, "step": 1707 }, { "epoch": 2.176489327811405, "grad_norm": 0.3209332320557841, "learning_rate": 1.3884347962746949e-05, "loss": 0.0905, "step": 1708 }, { "epoch": 2.1777636189869387, "grad_norm": 0.33416777551978744, "learning_rate": 1.3876140916716434e-05, "loss": 0.0977, "step": 1709 }, { "epoch": 2.179037910162472, "grad_norm": 0.32515677559898376, "learning_rate": 1.3867930797122907e-05, "loss": 0.0867, "step": 1710 }, { "epoch": 2.1803122013380056, "grad_norm": 0.2943094403108182, "learning_rate": 1.3859717610476534e-05, "loss": 0.0764, "step": 1711 }, { "epoch": 2.1815864925135395, "grad_norm": 0.3217329618347697, "learning_rate": 1.3851501363289907e-05, "loss": 0.0841, "step": 1712 }, { "epoch": 2.182860783689073, "grad_norm": 0.3285890347153238, "learning_rate": 1.3843282062078063e-05, "loss": 0.0927, "step": 1713 }, { "epoch": 2.1841350748646065, "grad_norm": 0.31518324190489366, "learning_rate": 1.3835059713358438e-05, "loss": 0.0841, "step": 1714 }, { "epoch": 2.1854093660401404, "grad_norm": 0.31210289474791286, "learning_rate": 1.3826834323650899e-05, "loss": 0.0848, "step": 1715 }, { "epoch": 2.186683657215674, "grad_norm": 0.36426295021197413, "learning_rate": 1.381860589947772e-05, "loss": 0.1131, "step": 1716 }, { "epoch": 2.1879579483912073, "grad_norm": 0.3275675028205472, "learning_rate": 1.381037444736358e-05, "loss": 0.0989, "step": 1717 }, { "epoch": 2.189232239566741, "grad_norm": 0.29387051948113396, "learning_rate": 1.380213997383556e-05, "loss": 0.0709, "step": 1718 }, { "epoch": 2.1905065307422746, "grad_norm": 0.33001457236679427, "learning_rate": 1.3793902485423134e-05, "loss": 0.0941, "step": 1719 }, { "epoch": 2.191780821917808, "grad_norm": 0.3276649597154484, "learning_rate": 1.3785661988658178e-05, "loss": 0.0898, "step": 1720 }, { "epoch": 2.193055113093342, "grad_norm": 0.32258506657787683, "learning_rate": 1.377741849007494e-05, "loss": 0.0847, "step": 1721 }, { "epoch": 2.1943294042688755, "grad_norm": 0.2998411423595114, "learning_rate": 1.3769171996210053e-05, "loss": 0.0737, "step": 1722 }, { "epoch": 2.195603695444409, "grad_norm": 0.3478276906428922, "learning_rate": 1.376092251360253e-05, "loss": 0.1021, "step": 1723 }, { "epoch": 2.196877986619943, "grad_norm": 0.32064514498356783, "learning_rate": 1.3752670048793744e-05, "loss": 0.0826, "step": 1724 }, { "epoch": 2.1981522777954763, "grad_norm": 0.331516472258475, "learning_rate": 1.3744414608327436e-05, "loss": 0.0818, "step": 1725 }, { "epoch": 2.1994265689710097, "grad_norm": 0.3406091661901719, "learning_rate": 1.3736156198749717e-05, "loss": 0.0949, "step": 1726 }, { "epoch": 2.2007008601465436, "grad_norm": 0.31323084274148755, "learning_rate": 1.3727894826609041e-05, "loss": 0.0765, "step": 1727 }, { "epoch": 2.201975151322077, "grad_norm": 0.29676088769922215, "learning_rate": 1.3719630498456211e-05, "loss": 0.0705, "step": 1728 }, { "epoch": 2.2032494424976106, "grad_norm": 0.3320282881824555, "learning_rate": 1.371136322084438e-05, "loss": 0.0923, "step": 1729 }, { "epoch": 2.2045237336731445, "grad_norm": 0.3270671107465866, "learning_rate": 1.3703093000329037e-05, "loss": 0.0883, "step": 1730 }, { "epoch": 2.205798024848678, "grad_norm": 0.3272819260799941, "learning_rate": 1.3694819843468e-05, "loss": 0.0891, "step": 1731 }, { "epoch": 2.2070723160242114, "grad_norm": 0.3215732735440148, "learning_rate": 1.3686543756821429e-05, "loss": 0.0961, "step": 1732 }, { "epoch": 2.2083466071997453, "grad_norm": 0.3139497018168009, "learning_rate": 1.3678264746951789e-05, "loss": 0.0803, "step": 1733 }, { "epoch": 2.2096208983752788, "grad_norm": 0.32877804841603414, "learning_rate": 1.3669982820423875e-05, "loss": 0.0853, "step": 1734 }, { "epoch": 2.210895189550812, "grad_norm": 0.30696319059708715, "learning_rate": 1.3661697983804794e-05, "loss": 0.0853, "step": 1735 }, { "epoch": 2.212169480726346, "grad_norm": 0.31347507618974585, "learning_rate": 1.3653410243663953e-05, "loss": 0.0828, "step": 1736 }, { "epoch": 2.2134437719018796, "grad_norm": 0.33072022098258597, "learning_rate": 1.364511960657307e-05, "loss": 0.0899, "step": 1737 }, { "epoch": 2.214718063077413, "grad_norm": 0.30798223784885603, "learning_rate": 1.3636826079106156e-05, "loss": 0.0789, "step": 1738 }, { "epoch": 2.215992354252947, "grad_norm": 0.32768000274758047, "learning_rate": 1.3628529667839517e-05, "loss": 0.0864, "step": 1739 }, { "epoch": 2.2172666454284804, "grad_norm": 0.32112643913350225, "learning_rate": 1.362023037935174e-05, "loss": 0.0822, "step": 1740 }, { "epoch": 2.218540936604014, "grad_norm": 0.33015689544276217, "learning_rate": 1.3611928220223695e-05, "loss": 0.0952, "step": 1741 }, { "epoch": 2.2198152277795478, "grad_norm": 0.3193399317749901, "learning_rate": 1.3603623197038536e-05, "loss": 0.0861, "step": 1742 }, { "epoch": 2.221089518955081, "grad_norm": 0.31822627573576606, "learning_rate": 1.3595315316381676e-05, "loss": 0.0883, "step": 1743 }, { "epoch": 2.2223638101306147, "grad_norm": 0.3306442799053977, "learning_rate": 1.3587004584840803e-05, "loss": 0.0879, "step": 1744 }, { "epoch": 2.2236381013061486, "grad_norm": 0.3384931403515, "learning_rate": 1.3578691009005864e-05, "loss": 0.0892, "step": 1745 }, { "epoch": 2.224912392481682, "grad_norm": 0.30996023684529017, "learning_rate": 1.3570374595469058e-05, "loss": 0.0771, "step": 1746 }, { "epoch": 2.2261866836572155, "grad_norm": 0.3041655477699378, "learning_rate": 1.3562055350824835e-05, "loss": 0.0818, "step": 1747 }, { "epoch": 2.2274609748327494, "grad_norm": 0.33313407065836226, "learning_rate": 1.3553733281669889e-05, "loss": 0.0962, "step": 1748 }, { "epoch": 2.228735266008283, "grad_norm": 0.3406793057925996, "learning_rate": 1.354540839460316e-05, "loss": 0.0932, "step": 1749 }, { "epoch": 2.2300095571838163, "grad_norm": 0.29791810116179873, "learning_rate": 1.3537080696225815e-05, "loss": 0.0796, "step": 1750 }, { "epoch": 2.2312838483593502, "grad_norm": 0.35693635222048364, "learning_rate": 1.3528750193141255e-05, "loss": 0.1058, "step": 1751 }, { "epoch": 2.2325581395348837, "grad_norm": 0.29592974715362874, "learning_rate": 1.3520416891955101e-05, "loss": 0.0734, "step": 1752 }, { "epoch": 2.233832430710417, "grad_norm": 0.35121281184173553, "learning_rate": 1.3512080799275198e-05, "loss": 0.1073, "step": 1753 }, { "epoch": 2.235106721885951, "grad_norm": 0.335434303989275, "learning_rate": 1.35037419217116e-05, "loss": 0.0929, "step": 1754 }, { "epoch": 2.2363810130614845, "grad_norm": 0.3093170503980418, "learning_rate": 1.3495400265876569e-05, "loss": 0.0807, "step": 1755 }, { "epoch": 2.237655304237018, "grad_norm": 0.3234840198258806, "learning_rate": 1.348705583838457e-05, "loss": 0.0866, "step": 1756 }, { "epoch": 2.238929595412552, "grad_norm": 0.3282473375708985, "learning_rate": 1.3478708645852272e-05, "loss": 0.0916, "step": 1757 }, { "epoch": 2.2402038865880853, "grad_norm": 0.3112158920153638, "learning_rate": 1.3470358694898531e-05, "loss": 0.0855, "step": 1758 }, { "epoch": 2.241478177763619, "grad_norm": 0.3222642574266862, "learning_rate": 1.3462005992144392e-05, "loss": 0.0869, "step": 1759 }, { "epoch": 2.2427524689391527, "grad_norm": 0.3075136679412157, "learning_rate": 1.3453650544213078e-05, "loss": 0.0754, "step": 1760 }, { "epoch": 2.244026760114686, "grad_norm": 0.33955016393936943, "learning_rate": 1.3445292357729992e-05, "loss": 0.0984, "step": 1761 }, { "epoch": 2.2453010512902196, "grad_norm": 0.33975209540776286, "learning_rate": 1.3436931439322713e-05, "loss": 0.0919, "step": 1762 }, { "epoch": 2.2465753424657535, "grad_norm": 0.34059960393414357, "learning_rate": 1.3428567795620977e-05, "loss": 0.0868, "step": 1763 }, { "epoch": 2.247849633641287, "grad_norm": 0.3363574909880289, "learning_rate": 1.342020143325669e-05, "loss": 0.0872, "step": 1764 }, { "epoch": 2.2491239248168204, "grad_norm": 0.31972490167630396, "learning_rate": 1.3411832358863906e-05, "loss": 0.0865, "step": 1765 }, { "epoch": 2.2503982159923543, "grad_norm": 0.29917757664946043, "learning_rate": 1.3403460579078833e-05, "loss": 0.0834, "step": 1766 }, { "epoch": 2.251672507167888, "grad_norm": 0.3262379161368011, "learning_rate": 1.3395086100539827e-05, "loss": 0.0905, "step": 1767 }, { "epoch": 2.2529467983434213, "grad_norm": 0.3054951962745276, "learning_rate": 1.3386708929887378e-05, "loss": 0.0797, "step": 1768 }, { "epoch": 2.254221089518955, "grad_norm": 0.3215432141732919, "learning_rate": 1.3378329073764118e-05, "loss": 0.0862, "step": 1769 }, { "epoch": 2.2554953806944886, "grad_norm": 0.3302625844186871, "learning_rate": 1.33699465388148e-05, "loss": 0.0951, "step": 1770 }, { "epoch": 2.256769671870022, "grad_norm": 0.31910986552539394, "learning_rate": 1.336156133168631e-05, "loss": 0.08, "step": 1771 }, { "epoch": 2.258043963045556, "grad_norm": 0.3466760127157683, "learning_rate": 1.3353173459027646e-05, "loss": 0.0892, "step": 1772 }, { "epoch": 2.2593182542210895, "grad_norm": 0.3129445877411577, "learning_rate": 1.334478292748992e-05, "loss": 0.0841, "step": 1773 }, { "epoch": 2.260592545396623, "grad_norm": 0.3329417857942078, "learning_rate": 1.3336389743726358e-05, "loss": 0.0937, "step": 1774 }, { "epoch": 2.261866836572157, "grad_norm": 0.3396404037354038, "learning_rate": 1.3327993914392281e-05, "loss": 0.0957, "step": 1775 }, { "epoch": 2.2631411277476903, "grad_norm": 0.330543702637141, "learning_rate": 1.3319595446145116e-05, "loss": 0.0882, "step": 1776 }, { "epoch": 2.2644154189232237, "grad_norm": 0.32353957893750696, "learning_rate": 1.331119434564438e-05, "loss": 0.0862, "step": 1777 }, { "epoch": 2.2656897100987576, "grad_norm": 0.33954632971506366, "learning_rate": 1.3302790619551673e-05, "loss": 0.0944, "step": 1778 }, { "epoch": 2.266964001274291, "grad_norm": 0.3160525995275369, "learning_rate": 1.3294384274530678e-05, "loss": 0.0904, "step": 1779 }, { "epoch": 2.2682382924498246, "grad_norm": 0.3185677159215593, "learning_rate": 1.3285975317247162e-05, "loss": 0.0877, "step": 1780 }, { "epoch": 2.2695125836253585, "grad_norm": 0.28810530752110747, "learning_rate": 1.3277563754368954e-05, "loss": 0.0786, "step": 1781 }, { "epoch": 2.270786874800892, "grad_norm": 0.3049968047908316, "learning_rate": 1.3269149592565953e-05, "loss": 0.0795, "step": 1782 }, { "epoch": 2.2720611659764254, "grad_norm": 0.3058381985970805, "learning_rate": 1.3260732838510121e-05, "loss": 0.0835, "step": 1783 }, { "epoch": 2.2733354571519593, "grad_norm": 0.34120940177347514, "learning_rate": 1.3252313498875473e-05, "loss": 0.0961, "step": 1784 }, { "epoch": 2.2746097483274927, "grad_norm": 0.32856780828554843, "learning_rate": 1.3243891580338074e-05, "loss": 0.0913, "step": 1785 }, { "epoch": 2.275884039503026, "grad_norm": 0.32023072876421266, "learning_rate": 1.3235467089576033e-05, "loss": 0.0898, "step": 1786 }, { "epoch": 2.27715833067856, "grad_norm": 0.31543534917975224, "learning_rate": 1.3227040033269501e-05, "loss": 0.0864, "step": 1787 }, { "epoch": 2.2784326218540936, "grad_norm": 0.2983469265632472, "learning_rate": 1.3218610418100663e-05, "loss": 0.0738, "step": 1788 }, { "epoch": 2.279706913029627, "grad_norm": 0.3485899569031978, "learning_rate": 1.3210178250753734e-05, "loss": 0.1026, "step": 1789 }, { "epoch": 2.280981204205161, "grad_norm": 0.3122746930521024, "learning_rate": 1.3201743537914948e-05, "loss": 0.088, "step": 1790 }, { "epoch": 2.2822554953806944, "grad_norm": 0.30404888555195214, "learning_rate": 1.3193306286272565e-05, "loss": 0.0734, "step": 1791 }, { "epoch": 2.2835297865562283, "grad_norm": 0.3001842195208922, "learning_rate": 1.3184866502516846e-05, "loss": 0.0747, "step": 1792 }, { "epoch": 2.2848040777317618, "grad_norm": 0.33910153180941077, "learning_rate": 1.3176424193340073e-05, "loss": 0.1123, "step": 1793 }, { "epoch": 2.286078368907295, "grad_norm": 0.3188564398706012, "learning_rate": 1.316797936543653e-05, "loss": 0.0954, "step": 1794 }, { "epoch": 2.287352660082829, "grad_norm": 0.30301720422933737, "learning_rate": 1.3159532025502484e-05, "loss": 0.0865, "step": 1795 }, { "epoch": 2.2886269512583626, "grad_norm": 0.3165517398519209, "learning_rate": 1.315108218023621e-05, "loss": 0.0918, "step": 1796 }, { "epoch": 2.289901242433896, "grad_norm": 0.31854991187503856, "learning_rate": 1.3142629836337956e-05, "loss": 0.091, "step": 1797 }, { "epoch": 2.29117553360943, "grad_norm": 0.31571183291745164, "learning_rate": 1.3134175000509965e-05, "loss": 0.0864, "step": 1798 }, { "epoch": 2.2924498247849634, "grad_norm": 0.32756122392534837, "learning_rate": 1.3125717679456447e-05, "loss": 0.0909, "step": 1799 }, { "epoch": 2.293724115960497, "grad_norm": 0.3139815785327954, "learning_rate": 1.3117257879883583e-05, "loss": 0.0875, "step": 1800 }, { "epoch": 2.2949984071360308, "grad_norm": 0.32484498017516056, "learning_rate": 1.3108795608499523e-05, "loss": 0.0978, "step": 1801 }, { "epoch": 2.296272698311564, "grad_norm": 0.3118598788100557, "learning_rate": 1.310033087201437e-05, "loss": 0.0837, "step": 1802 }, { "epoch": 2.2975469894870977, "grad_norm": 0.31698568040850156, "learning_rate": 1.3091863677140196e-05, "loss": 0.08, "step": 1803 }, { "epoch": 2.2988212806626316, "grad_norm": 0.322801321339186, "learning_rate": 1.3083394030591006e-05, "loss": 0.093, "step": 1804 }, { "epoch": 2.300095571838165, "grad_norm": 0.32419121494545505, "learning_rate": 1.3074921939082757e-05, "loss": 0.0964, "step": 1805 }, { "epoch": 2.3013698630136985, "grad_norm": 0.3068777977659285, "learning_rate": 1.3066447409333345e-05, "loss": 0.079, "step": 1806 }, { "epoch": 2.3026441541892324, "grad_norm": 0.3419564966778406, "learning_rate": 1.3057970448062598e-05, "loss": 0.095, "step": 1807 }, { "epoch": 2.303918445364766, "grad_norm": 0.3223980952018451, "learning_rate": 1.3049491061992274e-05, "loss": 0.0887, "step": 1808 }, { "epoch": 2.3051927365402993, "grad_norm": 0.318253833372108, "learning_rate": 1.3041009257846049e-05, "loss": 0.0857, "step": 1809 }, { "epoch": 2.3064670277158332, "grad_norm": 0.326341177705114, "learning_rate": 1.3032525042349522e-05, "loss": 0.0954, "step": 1810 }, { "epoch": 2.3077413188913667, "grad_norm": 0.3123924923599207, "learning_rate": 1.3024038422230198e-05, "loss": 0.083, "step": 1811 }, { "epoch": 2.3090156100669, "grad_norm": 0.310865336651445, "learning_rate": 1.3015549404217495e-05, "loss": 0.083, "step": 1812 }, { "epoch": 2.310289901242434, "grad_norm": 0.29196221580629955, "learning_rate": 1.300705799504273e-05, "loss": 0.0737, "step": 1813 }, { "epoch": 2.3115641924179675, "grad_norm": 0.3327751866812695, "learning_rate": 1.2998564201439117e-05, "loss": 0.0969, "step": 1814 }, { "epoch": 2.312838483593501, "grad_norm": 0.31347866357088683, "learning_rate": 1.2990068030141756e-05, "loss": 0.0863, "step": 1815 }, { "epoch": 2.314112774769035, "grad_norm": 0.3595049927940409, "learning_rate": 1.2981569487887638e-05, "loss": 0.0976, "step": 1816 }, { "epoch": 2.3153870659445683, "grad_norm": 0.3233779618482471, "learning_rate": 1.297306858141563e-05, "loss": 0.0836, "step": 1817 }, { "epoch": 2.316661357120102, "grad_norm": 0.33270843298941327, "learning_rate": 1.2964565317466474e-05, "loss": 0.0883, "step": 1818 }, { "epoch": 2.3179356482956357, "grad_norm": 0.3016289465491878, "learning_rate": 1.2956059702782788e-05, "loss": 0.0771, "step": 1819 }, { "epoch": 2.319209939471169, "grad_norm": 0.2972900519195985, "learning_rate": 1.2947551744109044e-05, "loss": 0.0721, "step": 1820 }, { "epoch": 2.3204842306467026, "grad_norm": 0.305502758068269, "learning_rate": 1.293904144819158e-05, "loss": 0.0781, "step": 1821 }, { "epoch": 2.3217585218222365, "grad_norm": 0.32612750781995936, "learning_rate": 1.2930528821778588e-05, "loss": 0.0863, "step": 1822 }, { "epoch": 2.32303281299777, "grad_norm": 0.3241756047122143, "learning_rate": 1.2922013871620096e-05, "loss": 0.0917, "step": 1823 }, { "epoch": 2.3243071041733034, "grad_norm": 0.3276105077521939, "learning_rate": 1.291349660446799e-05, "loss": 0.0919, "step": 1824 }, { "epoch": 2.3255813953488373, "grad_norm": 0.3432910581026233, "learning_rate": 1.2904977027075984e-05, "loss": 0.0918, "step": 1825 }, { "epoch": 2.326855686524371, "grad_norm": 0.32432001040961717, "learning_rate": 1.289645514619963e-05, "loss": 0.0888, "step": 1826 }, { "epoch": 2.3281299776999043, "grad_norm": 0.3394258015254914, "learning_rate": 1.28879309685963e-05, "loss": 0.0945, "step": 1827 }, { "epoch": 2.329404268875438, "grad_norm": 0.32000900879314315, "learning_rate": 1.2879404501025192e-05, "loss": 0.0938, "step": 1828 }, { "epoch": 2.3306785600509716, "grad_norm": 0.31389858450320024, "learning_rate": 1.2870875750247316e-05, "loss": 0.0914, "step": 1829 }, { "epoch": 2.331952851226505, "grad_norm": 0.33433789300101513, "learning_rate": 1.286234472302549e-05, "loss": 0.0975, "step": 1830 }, { "epoch": 2.333227142402039, "grad_norm": 0.3340225087479513, "learning_rate": 1.2853811426124355e-05, "loss": 0.0908, "step": 1831 }, { "epoch": 2.3345014335775724, "grad_norm": 0.320795904799697, "learning_rate": 1.2845275866310325e-05, "loss": 0.0842, "step": 1832 }, { "epoch": 2.335775724753106, "grad_norm": 0.3085820011097478, "learning_rate": 1.2836738050351629e-05, "loss": 0.0823, "step": 1833 }, { "epoch": 2.33705001592864, "grad_norm": 0.3335534561172022, "learning_rate": 1.2828197985018276e-05, "loss": 0.0956, "step": 1834 }, { "epoch": 2.3383243071041733, "grad_norm": 0.33883550121211825, "learning_rate": 1.2819655677082058e-05, "loss": 0.0921, "step": 1835 }, { "epoch": 2.3395985982797067, "grad_norm": 0.31391166936125237, "learning_rate": 1.2811111133316552e-05, "loss": 0.0784, "step": 1836 }, { "epoch": 2.3408728894552406, "grad_norm": 0.3380589394314144, "learning_rate": 1.2802564360497099e-05, "loss": 0.0958, "step": 1837 }, { "epoch": 2.342147180630774, "grad_norm": 0.3072981306969887, "learning_rate": 1.2794015365400814e-05, "loss": 0.0762, "step": 1838 }, { "epoch": 2.3434214718063076, "grad_norm": 0.2947952107563689, "learning_rate": 1.278546415480657e-05, "loss": 0.0738, "step": 1839 }, { "epoch": 2.3446957629818415, "grad_norm": 0.321258655509259, "learning_rate": 1.2776910735495005e-05, "loss": 0.0924, "step": 1840 }, { "epoch": 2.345970054157375, "grad_norm": 0.3195465564096787, "learning_rate": 1.2768355114248493e-05, "loss": 0.0889, "step": 1841 }, { "epoch": 2.3472443453329084, "grad_norm": 0.30192888145709573, "learning_rate": 1.2759797297851173e-05, "loss": 0.0858, "step": 1842 }, { "epoch": 2.3485186365084423, "grad_norm": 0.3198406855019385, "learning_rate": 1.2751237293088908e-05, "loss": 0.0801, "step": 1843 }, { "epoch": 2.3497929276839757, "grad_norm": 0.3195631385827557, "learning_rate": 1.2742675106749305e-05, "loss": 0.0824, "step": 1844 }, { "epoch": 2.3510672188595096, "grad_norm": 0.3249665053503958, "learning_rate": 1.27341107456217e-05, "loss": 0.088, "step": 1845 }, { "epoch": 2.352341510035043, "grad_norm": 0.3352989775821952, "learning_rate": 1.2725544216497151e-05, "loss": 0.0928, "step": 1846 }, { "epoch": 2.3536158012105766, "grad_norm": 0.3215646875818252, "learning_rate": 1.2716975526168437e-05, "loss": 0.0851, "step": 1847 }, { "epoch": 2.3548900923861105, "grad_norm": 0.3378847341331752, "learning_rate": 1.2708404681430054e-05, "loss": 0.0994, "step": 1848 }, { "epoch": 2.356164383561644, "grad_norm": 0.3073183549092637, "learning_rate": 1.2699831689078196e-05, "loss": 0.0783, "step": 1849 }, { "epoch": 2.3574386747371774, "grad_norm": 0.3107952422373644, "learning_rate": 1.2691256555910769e-05, "loss": 0.0803, "step": 1850 }, { "epoch": 2.3587129659127113, "grad_norm": 0.3363614632913453, "learning_rate": 1.2682679288727378e-05, "loss": 0.1043, "step": 1851 }, { "epoch": 2.3599872570882447, "grad_norm": 0.3217198886370579, "learning_rate": 1.2674099894329312e-05, "loss": 0.1014, "step": 1852 }, { "epoch": 2.361261548263778, "grad_norm": 0.2989174126495248, "learning_rate": 1.2665518379519553e-05, "loss": 0.0798, "step": 1853 }, { "epoch": 2.362535839439312, "grad_norm": 0.303269079628164, "learning_rate": 1.2656934751102761e-05, "loss": 0.0825, "step": 1854 }, { "epoch": 2.3638101306148456, "grad_norm": 0.3157751007406204, "learning_rate": 1.2648349015885272e-05, "loss": 0.0908, "step": 1855 }, { "epoch": 2.365084421790379, "grad_norm": 0.33547469562738635, "learning_rate": 1.2639761180675098e-05, "loss": 0.0969, "step": 1856 }, { "epoch": 2.366358712965913, "grad_norm": 0.3135832969582743, "learning_rate": 1.2631171252281914e-05, "loss": 0.0844, "step": 1857 }, { "epoch": 2.3676330041414464, "grad_norm": 0.31450906229287484, "learning_rate": 1.2622579237517045e-05, "loss": 0.0851, "step": 1858 }, { "epoch": 2.36890729531698, "grad_norm": 0.34233137175230965, "learning_rate": 1.2613985143193483e-05, "loss": 0.0961, "step": 1859 }, { "epoch": 2.3701815864925138, "grad_norm": 0.29380699493390156, "learning_rate": 1.2605388976125863e-05, "loss": 0.0752, "step": 1860 }, { "epoch": 2.371455877668047, "grad_norm": 0.3312418920005819, "learning_rate": 1.2596790743130465e-05, "loss": 0.0955, "step": 1861 }, { "epoch": 2.3727301688435807, "grad_norm": 0.3192854447308205, "learning_rate": 1.2588190451025209e-05, "loss": 0.0851, "step": 1862 }, { "epoch": 2.3740044600191146, "grad_norm": 0.3071091148333995, "learning_rate": 1.2579588106629643e-05, "loss": 0.0754, "step": 1863 }, { "epoch": 2.375278751194648, "grad_norm": 0.3410825675709281, "learning_rate": 1.2570983716764949e-05, "loss": 0.0915, "step": 1864 }, { "epoch": 2.3765530423701815, "grad_norm": 0.34552790630636104, "learning_rate": 1.2562377288253922e-05, "loss": 0.0887, "step": 1865 }, { "epoch": 2.3778273335457154, "grad_norm": 0.3381274358181014, "learning_rate": 1.2553768827920983e-05, "loss": 0.0895, "step": 1866 }, { "epoch": 2.379101624721249, "grad_norm": 0.3107715725464954, "learning_rate": 1.2545158342592157e-05, "loss": 0.0855, "step": 1867 }, { "epoch": 2.3803759158967823, "grad_norm": 0.31629079658112647, "learning_rate": 1.2536545839095074e-05, "loss": 0.0897, "step": 1868 }, { "epoch": 2.3816502070723162, "grad_norm": 0.3090792110186697, "learning_rate": 1.2527931324258975e-05, "loss": 0.0846, "step": 1869 }, { "epoch": 2.3829244982478497, "grad_norm": 0.328857677665131, "learning_rate": 1.2519314804914687e-05, "loss": 0.0854, "step": 1870 }, { "epoch": 2.384198789423383, "grad_norm": 0.29618219890082115, "learning_rate": 1.2510696287894626e-05, "loss": 0.0785, "step": 1871 }, { "epoch": 2.385473080598917, "grad_norm": 0.3043435245564651, "learning_rate": 1.2502075780032792e-05, "loss": 0.0846, "step": 1872 }, { "epoch": 2.3867473717744505, "grad_norm": 0.3327699477079075, "learning_rate": 1.2493453288164769e-05, "loss": 0.0996, "step": 1873 }, { "epoch": 2.388021662949984, "grad_norm": 0.3338631092567987, "learning_rate": 1.248482881912771e-05, "loss": 0.0912, "step": 1874 }, { "epoch": 2.389295954125518, "grad_norm": 0.31902684863772623, "learning_rate": 1.2476202379760339e-05, "loss": 0.088, "step": 1875 }, { "epoch": 2.3905702453010513, "grad_norm": 0.31883020965812775, "learning_rate": 1.2467573976902936e-05, "loss": 0.0857, "step": 1876 }, { "epoch": 2.391844536476585, "grad_norm": 0.3182787746099529, "learning_rate": 1.2458943617397346e-05, "loss": 0.0909, "step": 1877 }, { "epoch": 2.3931188276521187, "grad_norm": 0.33610044861966637, "learning_rate": 1.2450311308086957e-05, "loss": 0.1001, "step": 1878 }, { "epoch": 2.394393118827652, "grad_norm": 0.3443892309574436, "learning_rate": 1.2441677055816712e-05, "loss": 0.0995, "step": 1879 }, { "epoch": 2.3956674100031856, "grad_norm": 0.3118737461859822, "learning_rate": 1.2433040867433087e-05, "loss": 0.0757, "step": 1880 }, { "epoch": 2.3969417011787195, "grad_norm": 0.32558287088530063, "learning_rate": 1.24244027497841e-05, "loss": 0.0847, "step": 1881 }, { "epoch": 2.398215992354253, "grad_norm": 0.3247901644293056, "learning_rate": 1.2415762709719293e-05, "loss": 0.0869, "step": 1882 }, { "epoch": 2.3994902835297864, "grad_norm": 0.3277243116759496, "learning_rate": 1.2407120754089733e-05, "loss": 0.0876, "step": 1883 }, { "epoch": 2.4007645747053203, "grad_norm": 0.3415800948971748, "learning_rate": 1.239847688974801e-05, "loss": 0.0999, "step": 1884 }, { "epoch": 2.402038865880854, "grad_norm": 0.30821540385501905, "learning_rate": 1.2389831123548223e-05, "loss": 0.0773, "step": 1885 }, { "epoch": 2.4033131570563873, "grad_norm": 0.31355694269222983, "learning_rate": 1.2381183462345983e-05, "loss": 0.0808, "step": 1886 }, { "epoch": 2.404587448231921, "grad_norm": 0.3160327342770207, "learning_rate": 1.2372533912998402e-05, "loss": 0.0809, "step": 1887 }, { "epoch": 2.4058617394074546, "grad_norm": 0.3168407994298447, "learning_rate": 1.2363882482364089e-05, "loss": 0.0782, "step": 1888 }, { "epoch": 2.407136030582988, "grad_norm": 0.3076066741199318, "learning_rate": 1.2355229177303145e-05, "loss": 0.0769, "step": 1889 }, { "epoch": 2.408410321758522, "grad_norm": 0.315584094564645, "learning_rate": 1.2346574004677154e-05, "loss": 0.0785, "step": 1890 }, { "epoch": 2.4096846129340554, "grad_norm": 0.325289133273613, "learning_rate": 1.233791697134919e-05, "loss": 0.0897, "step": 1891 }, { "epoch": 2.410958904109589, "grad_norm": 0.3231852801532556, "learning_rate": 1.2329258084183788e-05, "loss": 0.0851, "step": 1892 }, { "epoch": 2.412233195285123, "grad_norm": 0.30932674755782097, "learning_rate": 1.2320597350046966e-05, "loss": 0.0788, "step": 1893 }, { "epoch": 2.4135074864606563, "grad_norm": 0.339099039401066, "learning_rate": 1.2311934775806205e-05, "loss": 0.0936, "step": 1894 }, { "epoch": 2.4147817776361897, "grad_norm": 0.3224226604926453, "learning_rate": 1.2303270368330438e-05, "loss": 0.0887, "step": 1895 }, { "epoch": 2.4160560688117236, "grad_norm": 0.31270397176170933, "learning_rate": 1.2294604134490055e-05, "loss": 0.0766, "step": 1896 }, { "epoch": 2.417330359987257, "grad_norm": 0.3131659058702623, "learning_rate": 1.2285936081156897e-05, "loss": 0.088, "step": 1897 }, { "epoch": 2.4186046511627906, "grad_norm": 0.31346306585018446, "learning_rate": 1.2277266215204247e-05, "loss": 0.0823, "step": 1898 }, { "epoch": 2.4198789423383245, "grad_norm": 0.3116451822137916, "learning_rate": 1.2268594543506817e-05, "loss": 0.0783, "step": 1899 }, { "epoch": 2.421153233513858, "grad_norm": 0.3111628373464386, "learning_rate": 1.2259921072940765e-05, "loss": 0.0858, "step": 1900 }, { "epoch": 2.4224275246893914, "grad_norm": 0.31759036967524046, "learning_rate": 1.225124581038367e-05, "loss": 0.0868, "step": 1901 }, { "epoch": 2.4237018158649253, "grad_norm": 0.33669019890466445, "learning_rate": 1.2242568762714522e-05, "loss": 0.0959, "step": 1902 }, { "epoch": 2.4249761070404587, "grad_norm": 0.3146816333001631, "learning_rate": 1.223388993681374e-05, "loss": 0.0761, "step": 1903 }, { "epoch": 2.426250398215992, "grad_norm": 0.33771299246830705, "learning_rate": 1.2225209339563144e-05, "loss": 0.0921, "step": 1904 }, { "epoch": 2.427524689391526, "grad_norm": 0.3297168556888754, "learning_rate": 1.2216526977845968e-05, "loss": 0.0866, "step": 1905 }, { "epoch": 2.4287989805670596, "grad_norm": 0.3104602660736942, "learning_rate": 1.2207842858546834e-05, "loss": 0.088, "step": 1906 }, { "epoch": 2.430073271742593, "grad_norm": 0.327972805707389, "learning_rate": 1.2199156988551766e-05, "loss": 0.0875, "step": 1907 }, { "epoch": 2.431347562918127, "grad_norm": 0.32125443805836634, "learning_rate": 1.219046937474817e-05, "loss": 0.0911, "step": 1908 }, { "epoch": 2.4326218540936604, "grad_norm": 0.33599116299448584, "learning_rate": 1.2181780024024842e-05, "loss": 0.1093, "step": 1909 }, { "epoch": 2.433896145269194, "grad_norm": 0.33848393223041545, "learning_rate": 1.2173088943271949e-05, "loss": 0.1059, "step": 1910 }, { "epoch": 2.4351704364447277, "grad_norm": 0.2929763742084531, "learning_rate": 1.2164396139381029e-05, "loss": 0.0774, "step": 1911 }, { "epoch": 2.436444727620261, "grad_norm": 0.30980957334814335, "learning_rate": 1.2155701619244997e-05, "loss": 0.085, "step": 1912 }, { "epoch": 2.4377190187957947, "grad_norm": 0.3189288412003184, "learning_rate": 1.2147005389758117e-05, "loss": 0.0802, "step": 1913 }, { "epoch": 2.4389933099713286, "grad_norm": 0.32690508603263135, "learning_rate": 1.2138307457816012e-05, "loss": 0.0964, "step": 1914 }, { "epoch": 2.440267601146862, "grad_norm": 0.3129071555821518, "learning_rate": 1.2129607830315657e-05, "loss": 0.0847, "step": 1915 }, { "epoch": 2.4415418923223955, "grad_norm": 0.33056725832897377, "learning_rate": 1.2120906514155371e-05, "loss": 0.0878, "step": 1916 }, { "epoch": 2.4428161834979294, "grad_norm": 0.31494595742658993, "learning_rate": 1.2112203516234805e-05, "loss": 0.0815, "step": 1917 }, { "epoch": 2.444090474673463, "grad_norm": 0.3258238370755792, "learning_rate": 1.210349884345496e-05, "loss": 0.0926, "step": 1918 }, { "epoch": 2.4453647658489963, "grad_norm": 0.29868914541417, "learning_rate": 1.2094792502718147e-05, "loss": 0.0791, "step": 1919 }, { "epoch": 2.44663905702453, "grad_norm": 0.32471480586064855, "learning_rate": 1.208608450092801e-05, "loss": 0.0969, "step": 1920 }, { "epoch": 2.4479133482000637, "grad_norm": 0.34580230815154184, "learning_rate": 1.2077374844989507e-05, "loss": 0.0943, "step": 1921 }, { "epoch": 2.449187639375597, "grad_norm": 0.29719778378873757, "learning_rate": 1.206866354180891e-05, "loss": 0.0806, "step": 1922 }, { "epoch": 2.450461930551131, "grad_norm": 0.3087548125920462, "learning_rate": 1.2059950598293795e-05, "loss": 0.0816, "step": 1923 }, { "epoch": 2.4517362217266645, "grad_norm": 0.3257011139485945, "learning_rate": 1.2051236021353032e-05, "loss": 0.1045, "step": 1924 }, { "epoch": 2.453010512902198, "grad_norm": 0.31667110128443005, "learning_rate": 1.2042519817896805e-05, "loss": 0.0886, "step": 1925 }, { "epoch": 2.454284804077732, "grad_norm": 0.3035041065028543, "learning_rate": 1.2033801994836567e-05, "loss": 0.0805, "step": 1926 }, { "epoch": 2.4555590952532653, "grad_norm": 0.29176270912030494, "learning_rate": 1.2025082559085068e-05, "loss": 0.0742, "step": 1927 }, { "epoch": 2.456833386428799, "grad_norm": 0.31595737534853774, "learning_rate": 1.2016361517556334e-05, "loss": 0.08, "step": 1928 }, { "epoch": 2.4581076776043327, "grad_norm": 0.345455170495746, "learning_rate": 1.2007638877165662e-05, "loss": 0.1058, "step": 1929 }, { "epoch": 2.459381968779866, "grad_norm": 0.3219345941591984, "learning_rate": 1.1998914644829613e-05, "loss": 0.0898, "step": 1930 }, { "epoch": 2.4606562599553996, "grad_norm": 0.3514435291484228, "learning_rate": 1.1990188827466025e-05, "loss": 0.0981, "step": 1931 }, { "epoch": 2.4619305511309335, "grad_norm": 0.31540015293922846, "learning_rate": 1.1981461431993978e-05, "loss": 0.0771, "step": 1932 }, { "epoch": 2.463204842306467, "grad_norm": 0.34261643072586295, "learning_rate": 1.1972732465333806e-05, "loss": 0.0969, "step": 1933 }, { "epoch": 2.4644791334820004, "grad_norm": 0.32017773680605366, "learning_rate": 1.1964001934407096e-05, "loss": 0.0891, "step": 1934 }, { "epoch": 2.4657534246575343, "grad_norm": 0.32894438074413745, "learning_rate": 1.195526984613667e-05, "loss": 0.0987, "step": 1935 }, { "epoch": 2.467027715833068, "grad_norm": 0.2868030538781784, "learning_rate": 1.1946536207446587e-05, "loss": 0.0655, "step": 1936 }, { "epoch": 2.4683020070086013, "grad_norm": 0.340849075519848, "learning_rate": 1.193780102526213e-05, "loss": 0.1039, "step": 1937 }, { "epoch": 2.469576298184135, "grad_norm": 0.2979843243983558, "learning_rate": 1.1929064306509813e-05, "loss": 0.0765, "step": 1938 }, { "epoch": 2.4708505893596686, "grad_norm": 0.31039707234367797, "learning_rate": 1.1920326058117364e-05, "loss": 0.0814, "step": 1939 }, { "epoch": 2.472124880535202, "grad_norm": 0.30515102650836734, "learning_rate": 1.1911586287013726e-05, "loss": 0.0747, "step": 1940 }, { "epoch": 2.473399171710736, "grad_norm": 0.3358662038407144, "learning_rate": 1.190284500012905e-05, "loss": 0.0891, "step": 1941 }, { "epoch": 2.4746734628862694, "grad_norm": 0.33476674543178714, "learning_rate": 1.1894102204394682e-05, "loss": 0.0931, "step": 1942 }, { "epoch": 2.475947754061803, "grad_norm": 0.32670885368860525, "learning_rate": 1.188535790674318e-05, "loss": 0.0886, "step": 1943 }, { "epoch": 2.477222045237337, "grad_norm": 0.3302213157461682, "learning_rate": 1.1876612114108278e-05, "loss": 0.0933, "step": 1944 }, { "epoch": 2.4784963364128703, "grad_norm": 0.3239794931016664, "learning_rate": 1.1867864833424897e-05, "loss": 0.0898, "step": 1945 }, { "epoch": 2.4797706275884037, "grad_norm": 0.31354126807109123, "learning_rate": 1.1859116071629148e-05, "loss": 0.0703, "step": 1946 }, { "epoch": 2.4810449187639376, "grad_norm": 0.3209761168968572, "learning_rate": 1.1850365835658306e-05, "loss": 0.0833, "step": 1947 }, { "epoch": 2.482319209939471, "grad_norm": 0.34190873781642267, "learning_rate": 1.1841614132450818e-05, "loss": 0.1028, "step": 1948 }, { "epoch": 2.4835935011150045, "grad_norm": 0.3140638227538553, "learning_rate": 1.1832860968946298e-05, "loss": 0.0768, "step": 1949 }, { "epoch": 2.4848677922905384, "grad_norm": 0.35399255094646037, "learning_rate": 1.1824106352085517e-05, "loss": 0.092, "step": 1950 }, { "epoch": 2.486142083466072, "grad_norm": 0.3511493604679209, "learning_rate": 1.181535028881039e-05, "loss": 0.093, "step": 1951 }, { "epoch": 2.4874163746416054, "grad_norm": 0.3149089434880097, "learning_rate": 1.1806592786063991e-05, "loss": 0.0845, "step": 1952 }, { "epoch": 2.4886906658171393, "grad_norm": 0.3244840442246634, "learning_rate": 1.1797833850790527e-05, "loss": 0.0857, "step": 1953 }, { "epoch": 2.4899649569926727, "grad_norm": 0.3049755448520856, "learning_rate": 1.1789073489935349e-05, "loss": 0.0794, "step": 1954 }, { "epoch": 2.491239248168206, "grad_norm": 0.28635355457069134, "learning_rate": 1.1780311710444925e-05, "loss": 0.0725, "step": 1955 }, { "epoch": 2.49251353934374, "grad_norm": 0.29649469106266174, "learning_rate": 1.1771548519266864e-05, "loss": 0.078, "step": 1956 }, { "epoch": 2.4937878305192736, "grad_norm": 0.3490018809727298, "learning_rate": 1.1762783923349883e-05, "loss": 0.1004, "step": 1957 }, { "epoch": 2.4950621216948075, "grad_norm": 0.31204434573109163, "learning_rate": 1.1754017929643818e-05, "loss": 0.0894, "step": 1958 }, { "epoch": 2.496336412870341, "grad_norm": 0.33461097188066163, "learning_rate": 1.1745250545099609e-05, "loss": 0.0915, "step": 1959 }, { "epoch": 2.4976107040458744, "grad_norm": 0.3149384946841901, "learning_rate": 1.1736481776669307e-05, "loss": 0.0797, "step": 1960 }, { "epoch": 2.4988849952214083, "grad_norm": 0.3386365009921545, "learning_rate": 1.1727711631306044e-05, "loss": 0.0986, "step": 1961 }, { "epoch": 2.5001592863969417, "grad_norm": 0.29894478343653874, "learning_rate": 1.171894011596407e-05, "loss": 0.0705, "step": 1962 }, { "epoch": 2.501433577572475, "grad_norm": 0.3091759041765266, "learning_rate": 1.1710167237598695e-05, "loss": 0.0836, "step": 1963 }, { "epoch": 2.5027078687480087, "grad_norm": 0.32629254181827216, "learning_rate": 1.1701393003166328e-05, "loss": 0.0849, "step": 1964 }, { "epoch": 2.5039821599235426, "grad_norm": 0.3259611086285988, "learning_rate": 1.169261741962444e-05, "loss": 0.0941, "step": 1965 }, { "epoch": 2.505256451099076, "grad_norm": 0.32620483690854674, "learning_rate": 1.1683840493931582e-05, "loss": 0.0877, "step": 1966 }, { "epoch": 2.5065307422746095, "grad_norm": 0.32242492531158135, "learning_rate": 1.1675062233047365e-05, "loss": 0.0833, "step": 1967 }, { "epoch": 2.5078050334501434, "grad_norm": 0.34786838444237644, "learning_rate": 1.1666282643932459e-05, "loss": 0.0995, "step": 1968 }, { "epoch": 2.509079324625677, "grad_norm": 0.3108859036344349, "learning_rate": 1.1657501733548587e-05, "loss": 0.083, "step": 1969 }, { "epoch": 2.5103536158012103, "grad_norm": 0.3294174362199036, "learning_rate": 1.1648719508858517e-05, "loss": 0.092, "step": 1970 }, { "epoch": 2.511627906976744, "grad_norm": 0.31309522466310447, "learning_rate": 1.1639935976826067e-05, "loss": 0.0892, "step": 1971 }, { "epoch": 2.5129021981522777, "grad_norm": 0.3251174518450331, "learning_rate": 1.1631151144416084e-05, "loss": 0.1006, "step": 1972 }, { "epoch": 2.5141764893278116, "grad_norm": 0.3230502915591523, "learning_rate": 1.1622365018594448e-05, "loss": 0.0971, "step": 1973 }, { "epoch": 2.515450780503345, "grad_norm": 0.3185563827404331, "learning_rate": 1.1613577606328068e-05, "loss": 0.0845, "step": 1974 }, { "epoch": 2.5167250716788785, "grad_norm": 0.3294848323209769, "learning_rate": 1.160478891458487e-05, "loss": 0.0978, "step": 1975 }, { "epoch": 2.5179993628544124, "grad_norm": 0.3062684784791015, "learning_rate": 1.1595998950333794e-05, "loss": 0.074, "step": 1976 }, { "epoch": 2.519273654029946, "grad_norm": 0.3226398606758775, "learning_rate": 1.1587207720544791e-05, "loss": 0.085, "step": 1977 }, { "epoch": 2.5205479452054793, "grad_norm": 0.30604934532225586, "learning_rate": 1.1578415232188816e-05, "loss": 0.0831, "step": 1978 }, { "epoch": 2.521822236381013, "grad_norm": 0.3337626443265537, "learning_rate": 1.1569621492237814e-05, "loss": 0.0971, "step": 1979 }, { "epoch": 2.5230965275565467, "grad_norm": 0.31116815109506313, "learning_rate": 1.156082650766474e-05, "loss": 0.0744, "step": 1980 }, { "epoch": 2.52437081873208, "grad_norm": 0.3302907618695284, "learning_rate": 1.1552030285443516e-05, "loss": 0.0925, "step": 1981 }, { "epoch": 2.525645109907614, "grad_norm": 0.29644232720432806, "learning_rate": 1.154323283254906e-05, "loss": 0.0766, "step": 1982 }, { "epoch": 2.5269194010831475, "grad_norm": 0.31983142001563797, "learning_rate": 1.1534434155957257e-05, "loss": 0.0877, "step": 1983 }, { "epoch": 2.528193692258681, "grad_norm": 0.31497773359641384, "learning_rate": 1.1525634262644964e-05, "loss": 0.0884, "step": 1984 }, { "epoch": 2.529467983434215, "grad_norm": 0.3205312609276482, "learning_rate": 1.1516833159590011e-05, "loss": 0.0797, "step": 1985 }, { "epoch": 2.5307422746097483, "grad_norm": 0.33467434410547603, "learning_rate": 1.150803085377117e-05, "loss": 0.0929, "step": 1986 }, { "epoch": 2.532016565785282, "grad_norm": 0.3186536894089304, "learning_rate": 1.1499227352168187e-05, "loss": 0.0877, "step": 1987 }, { "epoch": 2.5332908569608157, "grad_norm": 0.3089754557451753, "learning_rate": 1.1490422661761744e-05, "loss": 0.078, "step": 1988 }, { "epoch": 2.534565148136349, "grad_norm": 0.33491517900649054, "learning_rate": 1.148161678953347e-05, "loss": 0.1035, "step": 1989 }, { "epoch": 2.5358394393118826, "grad_norm": 0.3155769075512334, "learning_rate": 1.1472809742465923e-05, "loss": 0.0889, "step": 1990 }, { "epoch": 2.5371137304874165, "grad_norm": 0.3083065308703718, "learning_rate": 1.1464001527542603e-05, "loss": 0.0778, "step": 1991 }, { "epoch": 2.53838802166295, "grad_norm": 0.3403318766284883, "learning_rate": 1.1455192151747931e-05, "loss": 0.1026, "step": 1992 }, { "epoch": 2.539662312838484, "grad_norm": 0.31636524983134506, "learning_rate": 1.144638162206725e-05, "loss": 0.0793, "step": 1993 }, { "epoch": 2.5409366040140173, "grad_norm": 0.33684899377989835, "learning_rate": 1.143756994548682e-05, "loss": 0.0974, "step": 1994 }, { "epoch": 2.542210895189551, "grad_norm": 0.3408734517176862, "learning_rate": 1.1428757128993801e-05, "loss": 0.0981, "step": 1995 }, { "epoch": 2.5434851863650847, "grad_norm": 0.2956727201195911, "learning_rate": 1.1419943179576272e-05, "loss": 0.0695, "step": 1996 }, { "epoch": 2.544759477540618, "grad_norm": 0.32360810519852506, "learning_rate": 1.1411128104223194e-05, "loss": 0.0892, "step": 1997 }, { "epoch": 2.5460337687161516, "grad_norm": 0.31557601297611737, "learning_rate": 1.1402311909924433e-05, "loss": 0.0844, "step": 1998 }, { "epoch": 2.5473080598916855, "grad_norm": 0.3262756826349067, "learning_rate": 1.1393494603670738e-05, "loss": 0.0908, "step": 1999 }, { "epoch": 2.548582351067219, "grad_norm": 0.3285387597831467, "learning_rate": 1.138467619245374e-05, "loss": 0.0957, "step": 2000 }, { "epoch": 2.5498566422427524, "grad_norm": 0.3281999476094268, "learning_rate": 1.137585668326595e-05, "loss": 0.0927, "step": 2001 }, { "epoch": 2.5511309334182863, "grad_norm": 0.31706230006049824, "learning_rate": 1.1367036083100735e-05, "loss": 0.0822, "step": 2002 }, { "epoch": 2.55240522459382, "grad_norm": 0.31584824968940806, "learning_rate": 1.1358214398952348e-05, "loss": 0.0809, "step": 2003 }, { "epoch": 2.5536795157693533, "grad_norm": 0.3110073902482843, "learning_rate": 1.1349391637815886e-05, "loss": 0.0822, "step": 2004 }, { "epoch": 2.554953806944887, "grad_norm": 0.30345317694246793, "learning_rate": 1.1340567806687305e-05, "loss": 0.0807, "step": 2005 }, { "epoch": 2.5562280981204206, "grad_norm": 0.3135009478826194, "learning_rate": 1.1331742912563413e-05, "loss": 0.0767, "step": 2006 }, { "epoch": 2.557502389295954, "grad_norm": 0.3222698967832311, "learning_rate": 1.1322916962441857e-05, "loss": 0.0823, "step": 2007 }, { "epoch": 2.558776680471488, "grad_norm": 0.33637044199059957, "learning_rate": 1.131408996332112e-05, "loss": 0.1069, "step": 2008 }, { "epoch": 2.5600509716470214, "grad_norm": 0.30697702383400094, "learning_rate": 1.130526192220052e-05, "loss": 0.0797, "step": 2009 }, { "epoch": 2.561325262822555, "grad_norm": 0.3169993014498793, "learning_rate": 1.1296432846080196e-05, "loss": 0.0775, "step": 2010 }, { "epoch": 2.562599553998089, "grad_norm": 0.340989926634693, "learning_rate": 1.1287602741961116e-05, "loss": 0.0896, "step": 2011 }, { "epoch": 2.5638738451736223, "grad_norm": 0.3210645209260389, "learning_rate": 1.1278771616845061e-05, "loss": 0.0917, "step": 2012 }, { "epoch": 2.5651481363491557, "grad_norm": 0.3142385652199279, "learning_rate": 1.1269939477734614e-05, "loss": 0.0852, "step": 2013 }, { "epoch": 2.5664224275246896, "grad_norm": 0.33413625680103143, "learning_rate": 1.1261106331633173e-05, "loss": 0.0969, "step": 2014 }, { "epoch": 2.567696718700223, "grad_norm": 0.31206143634212363, "learning_rate": 1.1252272185544926e-05, "loss": 0.0837, "step": 2015 }, { "epoch": 2.5689710098757566, "grad_norm": 0.3240925163925932, "learning_rate": 1.1243437046474854e-05, "loss": 0.0884, "step": 2016 }, { "epoch": 2.5702453010512905, "grad_norm": 0.3286864332413904, "learning_rate": 1.123460092142873e-05, "loss": 0.0897, "step": 2017 }, { "epoch": 2.571519592226824, "grad_norm": 0.3408936331409148, "learning_rate": 1.122576381741311e-05, "loss": 0.0901, "step": 2018 }, { "epoch": 2.5727938834023574, "grad_norm": 0.32237177859330124, "learning_rate": 1.1216925741435323e-05, "loss": 0.0919, "step": 2019 }, { "epoch": 2.5740681745778913, "grad_norm": 0.31334736814266256, "learning_rate": 1.1208086700503466e-05, "loss": 0.0755, "step": 2020 }, { "epoch": 2.5753424657534247, "grad_norm": 0.327271565776877, "learning_rate": 1.1199246701626405e-05, "loss": 0.0761, "step": 2021 }, { "epoch": 2.576616756928958, "grad_norm": 0.32555008834600513, "learning_rate": 1.1190405751813766e-05, "loss": 0.086, "step": 2022 }, { "epoch": 2.577891048104492, "grad_norm": 0.33652242391985926, "learning_rate": 1.118156385807593e-05, "loss": 0.0936, "step": 2023 }, { "epoch": 2.5791653392800256, "grad_norm": 0.32867239191051495, "learning_rate": 1.1172721027424021e-05, "loss": 0.0906, "step": 2024 }, { "epoch": 2.580439630455559, "grad_norm": 0.3209711562826182, "learning_rate": 1.1163877266869907e-05, "loss": 0.0897, "step": 2025 }, { "epoch": 2.581713921631093, "grad_norm": 0.3113130080428712, "learning_rate": 1.1155032583426202e-05, "loss": 0.0839, "step": 2026 }, { "epoch": 2.5829882128066264, "grad_norm": 0.33417198839440493, "learning_rate": 1.114618698410624e-05, "loss": 0.1002, "step": 2027 }, { "epoch": 2.58426250398216, "grad_norm": 0.3068826859883825, "learning_rate": 1.113734047592409e-05, "loss": 0.0779, "step": 2028 }, { "epoch": 2.5855367951576937, "grad_norm": 0.3237842427324694, "learning_rate": 1.1128493065894535e-05, "loss": 0.0864, "step": 2029 }, { "epoch": 2.586811086333227, "grad_norm": 0.3065834578764402, "learning_rate": 1.1119644761033079e-05, "loss": 0.0803, "step": 2030 }, { "epoch": 2.5880853775087607, "grad_norm": 0.3128375471542288, "learning_rate": 1.1110795568355935e-05, "loss": 0.0833, "step": 2031 }, { "epoch": 2.5893596686842946, "grad_norm": 0.33902393352355725, "learning_rate": 1.1101945494880013e-05, "loss": 0.0892, "step": 2032 }, { "epoch": 2.590633959859828, "grad_norm": 0.32219254334905445, "learning_rate": 1.109309454762293e-05, "loss": 0.075, "step": 2033 }, { "epoch": 2.5919082510353615, "grad_norm": 0.3287213371282918, "learning_rate": 1.1084242733602992e-05, "loss": 0.0805, "step": 2034 }, { "epoch": 2.5931825422108954, "grad_norm": 0.31325426431350156, "learning_rate": 1.107539005983919e-05, "loss": 0.0841, "step": 2035 }, { "epoch": 2.594456833386429, "grad_norm": 0.31409752902295945, "learning_rate": 1.1066536533351202e-05, "loss": 0.0863, "step": 2036 }, { "epoch": 2.5957311245619623, "grad_norm": 0.31880178643966833, "learning_rate": 1.105768216115938e-05, "loss": 0.0839, "step": 2037 }, { "epoch": 2.597005415737496, "grad_norm": 0.33873620786428427, "learning_rate": 1.1048826950284743e-05, "loss": 0.0935, "step": 2038 }, { "epoch": 2.5982797069130297, "grad_norm": 0.32805129236266783, "learning_rate": 1.1039970907748981e-05, "loss": 0.089, "step": 2039 }, { "epoch": 2.599553998088563, "grad_norm": 0.3524470265715568, "learning_rate": 1.1031114040574439e-05, "loss": 0.098, "step": 2040 }, { "epoch": 2.600828289264097, "grad_norm": 0.3068827519168772, "learning_rate": 1.1022256355784115e-05, "loss": 0.0751, "step": 2041 }, { "epoch": 2.6021025804396305, "grad_norm": 0.32582526331543765, "learning_rate": 1.101339786040166e-05, "loss": 0.0909, "step": 2042 }, { "epoch": 2.603376871615164, "grad_norm": 0.3226132847295285, "learning_rate": 1.100453856145137e-05, "loss": 0.0848, "step": 2043 }, { "epoch": 2.604651162790698, "grad_norm": 0.3343438221893248, "learning_rate": 1.0995678465958168e-05, "loss": 0.1013, "step": 2044 }, { "epoch": 2.6059254539662313, "grad_norm": 0.30261602369834584, "learning_rate": 1.0986817580947616e-05, "loss": 0.0863, "step": 2045 }, { "epoch": 2.607199745141765, "grad_norm": 0.30706185013068976, "learning_rate": 1.09779559134459e-05, "loss": 0.0847, "step": 2046 }, { "epoch": 2.6084740363172987, "grad_norm": 0.3052769869543475, "learning_rate": 1.0969093470479828e-05, "loss": 0.0831, "step": 2047 }, { "epoch": 2.609748327492832, "grad_norm": 0.34162289792745965, "learning_rate": 1.0960230259076819e-05, "loss": 0.0997, "step": 2048 }, { "epoch": 2.6110226186683656, "grad_norm": 0.3425847533582344, "learning_rate": 1.0951366286264907e-05, "loss": 0.0978, "step": 2049 }, { "epoch": 2.6122969098438995, "grad_norm": 0.3158656852363034, "learning_rate": 1.0942501559072725e-05, "loss": 0.0824, "step": 2050 }, { "epoch": 2.613571201019433, "grad_norm": 0.3267874877397459, "learning_rate": 1.0933636084529507e-05, "loss": 0.0795, "step": 2051 }, { "epoch": 2.6148454921949664, "grad_norm": 0.32272140607027366, "learning_rate": 1.0924769869665079e-05, "loss": 0.0755, "step": 2052 }, { "epoch": 2.6161197833705003, "grad_norm": 0.32386665358674543, "learning_rate": 1.0915902921509853e-05, "loss": 0.0837, "step": 2053 }, { "epoch": 2.617394074546034, "grad_norm": 0.3464507853938489, "learning_rate": 1.0907035247094823e-05, "loss": 0.0906, "step": 2054 }, { "epoch": 2.6186683657215672, "grad_norm": 0.3285536207636223, "learning_rate": 1.089816685345156e-05, "loss": 0.0875, "step": 2055 }, { "epoch": 2.619942656897101, "grad_norm": 0.31290835401103695, "learning_rate": 1.0889297747612202e-05, "loss": 0.0793, "step": 2056 }, { "epoch": 2.6212169480726346, "grad_norm": 0.30293198038848784, "learning_rate": 1.0880427936609455e-05, "loss": 0.0759, "step": 2057 }, { "epoch": 2.622491239248168, "grad_norm": 0.33387031579182563, "learning_rate": 1.0871557427476585e-05, "loss": 0.0942, "step": 2058 }, { "epoch": 2.623765530423702, "grad_norm": 0.3177381341878901, "learning_rate": 1.0862686227247407e-05, "loss": 0.0959, "step": 2059 }, { "epoch": 2.6250398215992354, "grad_norm": 0.31671280315043826, "learning_rate": 1.0853814342956286e-05, "loss": 0.0861, "step": 2060 }, { "epoch": 2.626314112774769, "grad_norm": 0.3181282462903609, "learning_rate": 1.0844941781638133e-05, "loss": 0.0855, "step": 2061 }, { "epoch": 2.627588403950303, "grad_norm": 0.36393724542648953, "learning_rate": 1.0836068550328395e-05, "loss": 0.1047, "step": 2062 }, { "epoch": 2.6288626951258363, "grad_norm": 0.3192120844290509, "learning_rate": 1.0827194656063044e-05, "loss": 0.082, "step": 2063 }, { "epoch": 2.6301369863013697, "grad_norm": 0.3082305387964181, "learning_rate": 1.0818320105878584e-05, "loss": 0.0875, "step": 2064 }, { "epoch": 2.6314112774769036, "grad_norm": 0.3320428090950603, "learning_rate": 1.0809444906812034e-05, "loss": 0.0952, "step": 2065 }, { "epoch": 2.632685568652437, "grad_norm": 0.34922960274544423, "learning_rate": 1.0800569065900935e-05, "loss": 0.0979, "step": 2066 }, { "epoch": 2.6339598598279705, "grad_norm": 0.33710775944956695, "learning_rate": 1.0791692590183328e-05, "loss": 0.0857, "step": 2067 }, { "epoch": 2.6352341510035044, "grad_norm": 0.31400668122961467, "learning_rate": 1.0782815486697768e-05, "loss": 0.089, "step": 2068 }, { "epoch": 2.636508442179038, "grad_norm": 0.3061696679707814, "learning_rate": 1.0773937762483297e-05, "loss": 0.0822, "step": 2069 }, { "epoch": 2.6377827333545714, "grad_norm": 0.3190974777310587, "learning_rate": 1.0765059424579457e-05, "loss": 0.0818, "step": 2070 }, { "epoch": 2.6390570245301053, "grad_norm": 0.3374979859797057, "learning_rate": 1.075618048002627e-05, "loss": 0.0902, "step": 2071 }, { "epoch": 2.6403313157056387, "grad_norm": 0.3264869703985541, "learning_rate": 1.0747300935864245e-05, "loss": 0.0848, "step": 2072 }, { "epoch": 2.641605606881172, "grad_norm": 0.32088844495604935, "learning_rate": 1.0738420799134359e-05, "loss": 0.0811, "step": 2073 }, { "epoch": 2.642879898056706, "grad_norm": 0.3109576294303115, "learning_rate": 1.0729540076878073e-05, "loss": 0.0811, "step": 2074 }, { "epoch": 2.6441541892322395, "grad_norm": 0.3058215217316027, "learning_rate": 1.0720658776137298e-05, "loss": 0.0727, "step": 2075 }, { "epoch": 2.645428480407773, "grad_norm": 0.3202307093252086, "learning_rate": 1.0711776903954407e-05, "loss": 0.0855, "step": 2076 }, { "epoch": 2.646702771583307, "grad_norm": 0.3353282107483531, "learning_rate": 1.0702894467372234e-05, "loss": 0.0975, "step": 2077 }, { "epoch": 2.6479770627588404, "grad_norm": 0.3201525661364687, "learning_rate": 1.0694011473434048e-05, "loss": 0.0902, "step": 2078 }, { "epoch": 2.649251353934374, "grad_norm": 0.3164845584803567, "learning_rate": 1.0685127929183567e-05, "loss": 0.0793, "step": 2079 }, { "epoch": 2.6505256451099077, "grad_norm": 0.3288717219795119, "learning_rate": 1.0676243841664951e-05, "loss": 0.0923, "step": 2080 }, { "epoch": 2.651799936285441, "grad_norm": 0.32685724051438364, "learning_rate": 1.0667359217922778e-05, "loss": 0.0942, "step": 2081 }, { "epoch": 2.6530742274609747, "grad_norm": 0.31448312849271354, "learning_rate": 1.0658474065002062e-05, "loss": 0.0878, "step": 2082 }, { "epoch": 2.6543485186365086, "grad_norm": 0.33562881722461013, "learning_rate": 1.0649588389948234e-05, "loss": 0.0894, "step": 2083 }, { "epoch": 2.655622809812042, "grad_norm": 0.3202177298691848, "learning_rate": 1.064070219980713e-05, "loss": 0.0807, "step": 2084 }, { "epoch": 2.6568971009875755, "grad_norm": 0.3224018783106983, "learning_rate": 1.0631815501625008e-05, "loss": 0.081, "step": 2085 }, { "epoch": 2.6581713921631094, "grad_norm": 0.3301886594287629, "learning_rate": 1.0622928302448523e-05, "loss": 0.0946, "step": 2086 }, { "epoch": 2.659445683338643, "grad_norm": 0.3550386630056699, "learning_rate": 1.0614040609324723e-05, "loss": 0.094, "step": 2087 }, { "epoch": 2.6607199745141763, "grad_norm": 0.3472849693802295, "learning_rate": 1.0605152429301055e-05, "loss": 0.0996, "step": 2088 }, { "epoch": 2.66199426568971, "grad_norm": 0.3179225386675906, "learning_rate": 1.0596263769425348e-05, "loss": 0.0849, "step": 2089 }, { "epoch": 2.6632685568652437, "grad_norm": 0.3060436630532248, "learning_rate": 1.0587374636745814e-05, "loss": 0.0871, "step": 2090 }, { "epoch": 2.664542848040777, "grad_norm": 0.31190435046107134, "learning_rate": 1.057848503831103e-05, "loss": 0.088, "step": 2091 }, { "epoch": 2.665817139216311, "grad_norm": 0.32766898524759686, "learning_rate": 1.0569594981169959e-05, "loss": 0.1029, "step": 2092 }, { "epoch": 2.6670914303918445, "grad_norm": 0.3204421973053537, "learning_rate": 1.0560704472371919e-05, "loss": 0.0837, "step": 2093 }, { "epoch": 2.668365721567378, "grad_norm": 0.3193161876132494, "learning_rate": 1.0551813518966585e-05, "loss": 0.0863, "step": 2094 }, { "epoch": 2.669640012742912, "grad_norm": 0.30830173173416214, "learning_rate": 1.0542922128003982e-05, "loss": 0.0772, "step": 2095 }, { "epoch": 2.6709143039184453, "grad_norm": 0.3069794252835358, "learning_rate": 1.0534030306534491e-05, "loss": 0.0873, "step": 2096 }, { "epoch": 2.6721885950939788, "grad_norm": 0.31084585090595723, "learning_rate": 1.0525138061608825e-05, "loss": 0.0834, "step": 2097 }, { "epoch": 2.6734628862695127, "grad_norm": 0.3182807825287064, "learning_rate": 1.0516245400278043e-05, "loss": 0.0887, "step": 2098 }, { "epoch": 2.674737177445046, "grad_norm": 0.3159883110311833, "learning_rate": 1.050735232959352e-05, "loss": 0.0841, "step": 2099 }, { "epoch": 2.6760114686205796, "grad_norm": 0.31091669941076133, "learning_rate": 1.0498458856606972e-05, "loss": 0.0792, "step": 2100 }, { "epoch": 2.6772857597961135, "grad_norm": 0.33174697043391677, "learning_rate": 1.0489564988370422e-05, "loss": 0.0871, "step": 2101 }, { "epoch": 2.678560050971647, "grad_norm": 0.3166760006701938, "learning_rate": 1.0480670731936209e-05, "loss": 0.085, "step": 2102 }, { "epoch": 2.6798343421471804, "grad_norm": 0.33070186470395735, "learning_rate": 1.0471776094356983e-05, "loss": 0.0874, "step": 2103 }, { "epoch": 2.6811086333227143, "grad_norm": 0.3182038295159733, "learning_rate": 1.0462881082685692e-05, "loss": 0.0875, "step": 2104 }, { "epoch": 2.682382924498248, "grad_norm": 0.31675365017502033, "learning_rate": 1.0453985703975587e-05, "loss": 0.0864, "step": 2105 }, { "epoch": 2.6836572156737812, "grad_norm": 0.3414494717165716, "learning_rate": 1.0445089965280201e-05, "loss": 0.1038, "step": 2106 }, { "epoch": 2.684931506849315, "grad_norm": 0.32527577401240787, "learning_rate": 1.0436193873653362e-05, "loss": 0.0891, "step": 2107 }, { "epoch": 2.6862057980248486, "grad_norm": 0.30862641024209886, "learning_rate": 1.0427297436149168e-05, "loss": 0.0786, "step": 2108 }, { "epoch": 2.687480089200382, "grad_norm": 0.3439040042091565, "learning_rate": 1.0418400659822003e-05, "loss": 0.0934, "step": 2109 }, { "epoch": 2.688754380375916, "grad_norm": 0.32561736942923014, "learning_rate": 1.0409503551726507e-05, "loss": 0.0867, "step": 2110 }, { "epoch": 2.6900286715514494, "grad_norm": 0.319558368218282, "learning_rate": 1.0400606118917593e-05, "loss": 0.0836, "step": 2111 }, { "epoch": 2.691302962726983, "grad_norm": 0.3164080955916794, "learning_rate": 1.0391708368450429e-05, "loss": 0.0825, "step": 2112 }, { "epoch": 2.692577253902517, "grad_norm": 0.29780739765213443, "learning_rate": 1.0382810307380429e-05, "loss": 0.0804, "step": 2113 }, { "epoch": 2.6938515450780502, "grad_norm": 0.31298592716816004, "learning_rate": 1.037391194276326e-05, "loss": 0.0898, "step": 2114 }, { "epoch": 2.6951258362535837, "grad_norm": 0.3301752403252408, "learning_rate": 1.0365013281654827e-05, "loss": 0.0935, "step": 2115 }, { "epoch": 2.6964001274291176, "grad_norm": 0.29499217641619874, "learning_rate": 1.0356114331111272e-05, "loss": 0.0744, "step": 2116 }, { "epoch": 2.697674418604651, "grad_norm": 0.3489145713176813, "learning_rate": 1.0347215098188963e-05, "loss": 0.0977, "step": 2117 }, { "epoch": 2.6989487097801845, "grad_norm": 0.30731144490391954, "learning_rate": 1.0338315589944497e-05, "loss": 0.0797, "step": 2118 }, { "epoch": 2.7002230009557184, "grad_norm": 0.33086822803901833, "learning_rate": 1.0329415813434687e-05, "loss": 0.0833, "step": 2119 }, { "epoch": 2.701497292131252, "grad_norm": 0.3060898966573367, "learning_rate": 1.0320515775716556e-05, "loss": 0.0819, "step": 2120 }, { "epoch": 2.7027715833067854, "grad_norm": 0.33720616662065755, "learning_rate": 1.0311615483847333e-05, "loss": 0.1005, "step": 2121 }, { "epoch": 2.7040458744823193, "grad_norm": 0.30490358919322524, "learning_rate": 1.0302714944884455e-05, "loss": 0.0756, "step": 2122 }, { "epoch": 2.7053201656578527, "grad_norm": 0.32894426662661025, "learning_rate": 1.0293814165885556e-05, "loss": 0.0812, "step": 2123 }, { "epoch": 2.706594456833386, "grad_norm": 0.3422210911245815, "learning_rate": 1.028491315390845e-05, "loss": 0.0904, "step": 2124 }, { "epoch": 2.70786874800892, "grad_norm": 0.3358562182900451, "learning_rate": 1.0276011916011146e-05, "loss": 0.087, "step": 2125 }, { "epoch": 2.7091430391844535, "grad_norm": 0.32061810476945474, "learning_rate": 1.0267110459251824e-05, "loss": 0.0818, "step": 2126 }, { "epoch": 2.710417330359987, "grad_norm": 0.33561122756302453, "learning_rate": 1.0258208790688844e-05, "loss": 0.0947, "step": 2127 }, { "epoch": 2.711691621535521, "grad_norm": 0.31877311777594936, "learning_rate": 1.0249306917380731e-05, "loss": 0.0805, "step": 2128 }, { "epoch": 2.7129659127110544, "grad_norm": 0.3281711432974659, "learning_rate": 1.024040484638617e-05, "loss": 0.0901, "step": 2129 }, { "epoch": 2.714240203886588, "grad_norm": 0.3292276849958303, "learning_rate": 1.023150258476401e-05, "loss": 0.0843, "step": 2130 }, { "epoch": 2.7155144950621217, "grad_norm": 0.34164895194752304, "learning_rate": 1.0222600139573246e-05, "loss": 0.0918, "step": 2131 }, { "epoch": 2.716788786237655, "grad_norm": 0.3187093777601241, "learning_rate": 1.0213697517873015e-05, "loss": 0.0836, "step": 2132 }, { "epoch": 2.7180630774131886, "grad_norm": 0.32528896917456224, "learning_rate": 1.0204794726722604e-05, "loss": 0.0814, "step": 2133 }, { "epoch": 2.7193373685887225, "grad_norm": 0.35985223743000333, "learning_rate": 1.0195891773181426e-05, "loss": 0.1075, "step": 2134 }, { "epoch": 2.720611659764256, "grad_norm": 0.3222285984821216, "learning_rate": 1.0186988664309023e-05, "loss": 0.0831, "step": 2135 }, { "epoch": 2.7218859509397895, "grad_norm": 0.31846462930042535, "learning_rate": 1.0178085407165066e-05, "loss": 0.0922, "step": 2136 }, { "epoch": 2.7231602421153234, "grad_norm": 0.3432954428427078, "learning_rate": 1.0169182008809339e-05, "loss": 0.1059, "step": 2137 }, { "epoch": 2.724434533290857, "grad_norm": 0.3110287289487861, "learning_rate": 1.0160278476301739e-05, "loss": 0.079, "step": 2138 }, { "epoch": 2.7257088244663907, "grad_norm": 0.33457112857340227, "learning_rate": 1.0151374816702269e-05, "loss": 0.0898, "step": 2139 }, { "epoch": 2.726983115641924, "grad_norm": 0.30980035995797184, "learning_rate": 1.0142471037071033e-05, "loss": 0.0851, "step": 2140 }, { "epoch": 2.7282574068174577, "grad_norm": 0.30656299855643987, "learning_rate": 1.013356714446823e-05, "loss": 0.0779, "step": 2141 }, { "epoch": 2.7295316979929916, "grad_norm": 0.33618687619378046, "learning_rate": 1.0124663145954152e-05, "loss": 0.0969, "step": 2142 }, { "epoch": 2.730805989168525, "grad_norm": 0.32230225143316027, "learning_rate": 1.011575904858917e-05, "loss": 0.086, "step": 2143 }, { "epoch": 2.7320802803440585, "grad_norm": 0.3223571570475713, "learning_rate": 1.0106854859433734e-05, "loss": 0.0874, "step": 2144 }, { "epoch": 2.7333545715195924, "grad_norm": 0.3389555929224567, "learning_rate": 1.0097950585548368e-05, "loss": 0.091, "step": 2145 }, { "epoch": 2.734628862695126, "grad_norm": 0.3155762649009353, "learning_rate": 1.0089046233993667e-05, "loss": 0.0788, "step": 2146 }, { "epoch": 2.7359031538706593, "grad_norm": 0.3187114139987543, "learning_rate": 1.0080141811830277e-05, "loss": 0.0811, "step": 2147 }, { "epoch": 2.737177445046193, "grad_norm": 0.3259397373666377, "learning_rate": 1.0071237326118917e-05, "loss": 0.0876, "step": 2148 }, { "epoch": 2.7384517362217267, "grad_norm": 0.3485697217975192, "learning_rate": 1.0062332783920337e-05, "loss": 0.1097, "step": 2149 }, { "epoch": 2.73972602739726, "grad_norm": 0.3408180299270795, "learning_rate": 1.0053428192295347e-05, "loss": 0.0931, "step": 2150 }, { "epoch": 2.741000318572794, "grad_norm": 0.3292452540659901, "learning_rate": 1.0044523558304786e-05, "loss": 0.0919, "step": 2151 }, { "epoch": 2.7422746097483275, "grad_norm": 0.3431638852459463, "learning_rate": 1.0035618889009535e-05, "loss": 0.1013, "step": 2152 }, { "epoch": 2.743548900923861, "grad_norm": 0.312830281923475, "learning_rate": 1.0026714191470492e-05, "loss": 0.089, "step": 2153 }, { "epoch": 2.744823192099395, "grad_norm": 0.3151030897043156, "learning_rate": 1.0017809472748594e-05, "loss": 0.086, "step": 2154 }, { "epoch": 2.7460974832749283, "grad_norm": 0.3111922961074806, "learning_rate": 1.000890473990478e-05, "loss": 0.0912, "step": 2155 }, { "epoch": 2.7473717744504618, "grad_norm": 0.31215073214507405, "learning_rate": 1e-05, "loss": 0.0867, "step": 2156 }, { "epoch": 2.7486460656259957, "grad_norm": 0.3223311360670985, "learning_rate": 9.991095260095222e-06, "loss": 0.0937, "step": 2157 }, { "epoch": 2.749920356801529, "grad_norm": 0.31937663755231877, "learning_rate": 9.98219052725141e-06, "loss": 0.0854, "step": 2158 }, { "epoch": 2.751194647977063, "grad_norm": 0.3190587331019367, "learning_rate": 9.973285808529508e-06, "loss": 0.0792, "step": 2159 }, { "epoch": 2.7524689391525965, "grad_norm": 0.33636927255596427, "learning_rate": 9.964381110990471e-06, "loss": 0.0912, "step": 2160 }, { "epoch": 2.75374323032813, "grad_norm": 0.3341890543561746, "learning_rate": 9.955476441695216e-06, "loss": 0.0852, "step": 2161 }, { "epoch": 2.755017521503664, "grad_norm": 0.3095453828835524, "learning_rate": 9.946571807704658e-06, "loss": 0.0829, "step": 2162 }, { "epoch": 2.7562918126791973, "grad_norm": 0.32022272466218293, "learning_rate": 9.937667216079665e-06, "loss": 0.0834, "step": 2163 }, { "epoch": 2.7575661038547308, "grad_norm": 0.29335499523808084, "learning_rate": 9.928762673881085e-06, "loss": 0.0712, "step": 2164 }, { "epoch": 2.7588403950302647, "grad_norm": 0.33161366746089865, "learning_rate": 9.919858188169724e-06, "loss": 0.0971, "step": 2165 }, { "epoch": 2.760114686205798, "grad_norm": 0.3299304705738946, "learning_rate": 9.910953766006337e-06, "loss": 0.0968, "step": 2166 }, { "epoch": 2.7613889773813316, "grad_norm": 0.31740004583763226, "learning_rate": 9.902049414451637e-06, "loss": 0.0857, "step": 2167 }, { "epoch": 2.7626632685568655, "grad_norm": 0.3229820537683465, "learning_rate": 9.89314514056627e-06, "loss": 0.0882, "step": 2168 }, { "epoch": 2.763937559732399, "grad_norm": 0.32547224039896017, "learning_rate": 9.884240951410834e-06, "loss": 0.0886, "step": 2169 }, { "epoch": 2.7652118509079324, "grad_norm": 0.32933036989915015, "learning_rate": 9.87533685404585e-06, "loss": 0.0893, "step": 2170 }, { "epoch": 2.7664861420834663, "grad_norm": 0.3294441209075176, "learning_rate": 9.866432855531773e-06, "loss": 0.097, "step": 2171 }, { "epoch": 2.767760433259, "grad_norm": 0.29595252357223467, "learning_rate": 9.85752896292897e-06, "loss": 0.0766, "step": 2172 }, { "epoch": 2.7690347244345332, "grad_norm": 0.33778234722818773, "learning_rate": 9.848625183297734e-06, "loss": 0.0974, "step": 2173 }, { "epoch": 2.770309015610067, "grad_norm": 0.31144068185362844, "learning_rate": 9.839721523698265e-06, "loss": 0.0897, "step": 2174 }, { "epoch": 2.7715833067856006, "grad_norm": 0.32830706416420036, "learning_rate": 9.830817991190664e-06, "loss": 0.1016, "step": 2175 }, { "epoch": 2.772857597961134, "grad_norm": 0.2920964974654342, "learning_rate": 9.821914592834934e-06, "loss": 0.0745, "step": 2176 }, { "epoch": 2.774131889136668, "grad_norm": 0.3019460342565638, "learning_rate": 9.81301133569098e-06, "loss": 0.0713, "step": 2177 }, { "epoch": 2.7754061803122014, "grad_norm": 0.3326186227975093, "learning_rate": 9.804108226818576e-06, "loss": 0.0881, "step": 2178 }, { "epoch": 2.776680471487735, "grad_norm": 0.3151867928186656, "learning_rate": 9.795205273277399e-06, "loss": 0.0756, "step": 2179 }, { "epoch": 2.777954762663269, "grad_norm": 0.327978006887396, "learning_rate": 9.786302482126986e-06, "loss": 0.0837, "step": 2180 }, { "epoch": 2.7792290538388023, "grad_norm": 0.3183325320471964, "learning_rate": 9.777399860426755e-06, "loss": 0.0856, "step": 2181 }, { "epoch": 2.7805033450143357, "grad_norm": 0.3218719241344016, "learning_rate": 9.768497415235993e-06, "loss": 0.0862, "step": 2182 }, { "epoch": 2.7817776361898696, "grad_norm": 0.34373588825902773, "learning_rate": 9.75959515361383e-06, "loss": 0.0902, "step": 2183 }, { "epoch": 2.783051927365403, "grad_norm": 0.33043174604297443, "learning_rate": 9.750693082619274e-06, "loss": 0.0945, "step": 2184 }, { "epoch": 2.7843262185409365, "grad_norm": 0.34668744730233686, "learning_rate": 9.74179120931116e-06, "loss": 0.1055, "step": 2185 }, { "epoch": 2.7856005097164704, "grad_norm": 0.31376829158536984, "learning_rate": 9.732889540748181e-06, "loss": 0.0819, "step": 2186 }, { "epoch": 2.786874800892004, "grad_norm": 0.33010029966731647, "learning_rate": 9.723988083988857e-06, "loss": 0.0894, "step": 2187 }, { "epoch": 2.7881490920675374, "grad_norm": 0.2996087400214196, "learning_rate": 9.71508684609155e-06, "loss": 0.08, "step": 2188 }, { "epoch": 2.7894233832430713, "grad_norm": 0.32221955380807993, "learning_rate": 9.706185834114447e-06, "loss": 0.0978, "step": 2189 }, { "epoch": 2.7906976744186047, "grad_norm": 0.2871721942769033, "learning_rate": 9.697285055115545e-06, "loss": 0.0746, "step": 2190 }, { "epoch": 2.791971965594138, "grad_norm": 0.338267863484846, "learning_rate": 9.68838451615267e-06, "loss": 0.0822, "step": 2191 }, { "epoch": 2.793246256769672, "grad_norm": 0.3180556616306214, "learning_rate": 9.67948422428345e-06, "loss": 0.0875, "step": 2192 }, { "epoch": 2.7945205479452055, "grad_norm": 0.3507453441565575, "learning_rate": 9.67058418656532e-06, "loss": 0.1084, "step": 2193 }, { "epoch": 2.795794839120739, "grad_norm": 0.30773606161138, "learning_rate": 9.661684410055505e-06, "loss": 0.0812, "step": 2194 }, { "epoch": 2.797069130296273, "grad_norm": 0.3499320253769412, "learning_rate": 9.652784901811037e-06, "loss": 0.1036, "step": 2195 }, { "epoch": 2.7983434214718064, "grad_norm": 0.31778795998987436, "learning_rate": 9.643885668888733e-06, "loss": 0.0878, "step": 2196 }, { "epoch": 2.79961771264734, "grad_norm": 0.328013520475171, "learning_rate": 9.634986718345176e-06, "loss": 0.0916, "step": 2197 }, { "epoch": 2.8008920038228737, "grad_norm": 0.3165592794422496, "learning_rate": 9.626088057236745e-06, "loss": 0.0861, "step": 2198 }, { "epoch": 2.802166294998407, "grad_norm": 0.33008472688962887, "learning_rate": 9.617189692619574e-06, "loss": 0.0969, "step": 2199 }, { "epoch": 2.8034405861739407, "grad_norm": 0.33284531596088973, "learning_rate": 9.608291631549574e-06, "loss": 0.0962, "step": 2200 }, { "epoch": 2.8047148773494746, "grad_norm": 0.3305493639508405, "learning_rate": 9.599393881082409e-06, "loss": 0.0892, "step": 2201 }, { "epoch": 2.805989168525008, "grad_norm": 0.3460479215535693, "learning_rate": 9.590496448273495e-06, "loss": 0.0986, "step": 2202 }, { "epoch": 2.8072634597005415, "grad_norm": 0.306399458830251, "learning_rate": 9.581599340178e-06, "loss": 0.0745, "step": 2203 }, { "epoch": 2.8085377508760754, "grad_norm": 0.3256383471613711, "learning_rate": 9.572702563850834e-06, "loss": 0.0918, "step": 2204 }, { "epoch": 2.809812042051609, "grad_norm": 0.30656371982496583, "learning_rate": 9.563806126346643e-06, "loss": 0.0777, "step": 2205 }, { "epoch": 2.8110863332271423, "grad_norm": 0.34380740834343515, "learning_rate": 9.554910034719802e-06, "loss": 0.0902, "step": 2206 }, { "epoch": 2.812360624402676, "grad_norm": 0.31568706029614213, "learning_rate": 9.546014296024415e-06, "loss": 0.084, "step": 2207 }, { "epoch": 2.8136349155782097, "grad_norm": 0.3314614960494566, "learning_rate": 9.537118917314312e-06, "loss": 0.0926, "step": 2208 }, { "epoch": 2.814909206753743, "grad_norm": 0.33947910551815563, "learning_rate": 9.528223905643019e-06, "loss": 0.0949, "step": 2209 }, { "epoch": 2.816183497929277, "grad_norm": 0.3289286878564529, "learning_rate": 9.519329268063795e-06, "loss": 0.0879, "step": 2210 }, { "epoch": 2.8174577891048105, "grad_norm": 0.33834838513898585, "learning_rate": 9.510435011629581e-06, "loss": 0.1009, "step": 2211 }, { "epoch": 2.818732080280344, "grad_norm": 0.32494931672451793, "learning_rate": 9.501541143393028e-06, "loss": 0.0866, "step": 2212 }, { "epoch": 2.820006371455878, "grad_norm": 0.3078933317764079, "learning_rate": 9.49264767040648e-06, "loss": 0.0814, "step": 2213 }, { "epoch": 2.8212806626314113, "grad_norm": 0.32146409979111357, "learning_rate": 9.483754599721959e-06, "loss": 0.087, "step": 2214 }, { "epoch": 2.8225549538069448, "grad_norm": 0.3369990899778483, "learning_rate": 9.474861938391178e-06, "loss": 0.087, "step": 2215 }, { "epoch": 2.8238292449824787, "grad_norm": 0.32356106792548406, "learning_rate": 9.46596969346551e-06, "loss": 0.0787, "step": 2216 }, { "epoch": 2.825103536158012, "grad_norm": 0.342003144780689, "learning_rate": 9.457077871996021e-06, "loss": 0.1009, "step": 2217 }, { "epoch": 2.8263778273335456, "grad_norm": 0.3296712727428846, "learning_rate": 9.44818648103342e-06, "loss": 0.0933, "step": 2218 }, { "epoch": 2.8276521185090795, "grad_norm": 0.32168487233317433, "learning_rate": 9.439295527628083e-06, "loss": 0.0812, "step": 2219 }, { "epoch": 2.828926409684613, "grad_norm": 0.31880456703348076, "learning_rate": 9.430405018830043e-06, "loss": 0.0825, "step": 2220 }, { "epoch": 2.8302007008601464, "grad_norm": 0.3257418541671967, "learning_rate": 9.421514961688971e-06, "loss": 0.0841, "step": 2221 }, { "epoch": 2.8314749920356803, "grad_norm": 0.312733541507434, "learning_rate": 9.412625363254193e-06, "loss": 0.091, "step": 2222 }, { "epoch": 2.8327492832112138, "grad_norm": 0.3366927640520674, "learning_rate": 9.403736230574655e-06, "loss": 0.0939, "step": 2223 }, { "epoch": 2.8340235743867472, "grad_norm": 0.33675897625405143, "learning_rate": 9.39484757069895e-06, "loss": 0.0968, "step": 2224 }, { "epoch": 2.835297865562281, "grad_norm": 0.3377003360714769, "learning_rate": 9.385959390675279e-06, "loss": 0.0986, "step": 2225 }, { "epoch": 2.8365721567378146, "grad_norm": 0.3116788653389688, "learning_rate": 9.377071697551479e-06, "loss": 0.081, "step": 2226 }, { "epoch": 2.837846447913348, "grad_norm": 0.3272159650795636, "learning_rate": 9.368184498374993e-06, "loss": 0.0935, "step": 2227 }, { "epoch": 2.839120739088882, "grad_norm": 0.3259425334007691, "learning_rate": 9.359297800192873e-06, "loss": 0.0877, "step": 2228 }, { "epoch": 2.8403950302644154, "grad_norm": 0.3158912386665505, "learning_rate": 9.350411610051771e-06, "loss": 0.0826, "step": 2229 }, { "epoch": 2.841669321439949, "grad_norm": 0.3086676986079805, "learning_rate": 9.341525934997941e-06, "loss": 0.079, "step": 2230 }, { "epoch": 2.842943612615483, "grad_norm": 0.343424126142451, "learning_rate": 9.332640782077223e-06, "loss": 0.1106, "step": 2231 }, { "epoch": 2.8442179037910162, "grad_norm": 0.3315115094409133, "learning_rate": 9.323756158335054e-06, "loss": 0.0928, "step": 2232 }, { "epoch": 2.8454921949665497, "grad_norm": 0.32476950497454493, "learning_rate": 9.314872070816435e-06, "loss": 0.0885, "step": 2233 }, { "epoch": 2.8467664861420836, "grad_norm": 0.33703796631755645, "learning_rate": 9.305988526565957e-06, "loss": 0.0879, "step": 2234 }, { "epoch": 2.848040777317617, "grad_norm": 0.33209070948664415, "learning_rate": 9.29710553262777e-06, "loss": 0.0883, "step": 2235 }, { "epoch": 2.8493150684931505, "grad_norm": 0.3282837341752183, "learning_rate": 9.288223096045596e-06, "loss": 0.0948, "step": 2236 }, { "epoch": 2.8505893596686844, "grad_norm": 0.3262295520482969, "learning_rate": 9.279341223862705e-06, "loss": 0.0889, "step": 2237 }, { "epoch": 2.851863650844218, "grad_norm": 0.3236412918026824, "learning_rate": 9.270459923121927e-06, "loss": 0.0813, "step": 2238 }, { "epoch": 2.8531379420197513, "grad_norm": 0.3313749518342593, "learning_rate": 9.261579200865643e-06, "loss": 0.092, "step": 2239 }, { "epoch": 2.8544122331952853, "grad_norm": 0.3218612613786791, "learning_rate": 9.252699064135759e-06, "loss": 0.0895, "step": 2240 }, { "epoch": 2.8556865243708187, "grad_norm": 0.28449616668972894, "learning_rate": 9.243819519973734e-06, "loss": 0.0758, "step": 2241 }, { "epoch": 2.856960815546352, "grad_norm": 0.3156534765673723, "learning_rate": 9.234940575420545e-06, "loss": 0.0862, "step": 2242 }, { "epoch": 2.858235106721886, "grad_norm": 0.3069215146148821, "learning_rate": 9.226062237516703e-06, "loss": 0.0795, "step": 2243 }, { "epoch": 2.8595093978974195, "grad_norm": 0.31215104737204447, "learning_rate": 9.217184513302234e-06, "loss": 0.0868, "step": 2244 }, { "epoch": 2.860783689072953, "grad_norm": 0.30436978948003673, "learning_rate": 9.208307409816672e-06, "loss": 0.0798, "step": 2245 }, { "epoch": 2.862057980248487, "grad_norm": 0.3281625923730742, "learning_rate": 9.199430934099068e-06, "loss": 0.0953, "step": 2246 }, { "epoch": 2.8633322714240204, "grad_norm": 0.31412560813145984, "learning_rate": 9.190555093187968e-06, "loss": 0.0857, "step": 2247 }, { "epoch": 2.864606562599554, "grad_norm": 0.3245235433096404, "learning_rate": 9.181679894121421e-06, "loss": 0.0953, "step": 2248 }, { "epoch": 2.8658808537750877, "grad_norm": 0.3025908187245448, "learning_rate": 9.172805343936959e-06, "loss": 0.069, "step": 2249 }, { "epoch": 2.867155144950621, "grad_norm": 0.31358680469002626, "learning_rate": 9.163931449671606e-06, "loss": 0.0803, "step": 2250 }, { "epoch": 2.8684294361261546, "grad_norm": 0.3193010162703646, "learning_rate": 9.155058218361868e-06, "loss": 0.0822, "step": 2251 }, { "epoch": 2.8697037273016885, "grad_norm": 0.3224420152912573, "learning_rate": 9.146185657043714e-06, "loss": 0.0798, "step": 2252 }, { "epoch": 2.870978018477222, "grad_norm": 0.3245576337815501, "learning_rate": 9.137313772752598e-06, "loss": 0.086, "step": 2253 }, { "epoch": 2.8722523096527555, "grad_norm": 0.3196192792698883, "learning_rate": 9.128442572523418e-06, "loss": 0.0862, "step": 2254 }, { "epoch": 2.8735266008282894, "grad_norm": 0.3146165407824861, "learning_rate": 9.11957206339055e-06, "loss": 0.0839, "step": 2255 }, { "epoch": 2.874800892003823, "grad_norm": 0.31855446539853594, "learning_rate": 9.110702252387801e-06, "loss": 0.0829, "step": 2256 }, { "epoch": 2.8760751831793563, "grad_norm": 0.31830925012429057, "learning_rate": 9.101833146548443e-06, "loss": 0.0788, "step": 2257 }, { "epoch": 2.87734947435489, "grad_norm": 0.3459483824601095, "learning_rate": 9.092964752905178e-06, "loss": 0.0946, "step": 2258 }, { "epoch": 2.8786237655304237, "grad_norm": 0.33034204073584106, "learning_rate": 9.084097078490149e-06, "loss": 0.0901, "step": 2259 }, { "epoch": 2.879898056705957, "grad_norm": 0.3217175321974846, "learning_rate": 9.075230130334923e-06, "loss": 0.0887, "step": 2260 }, { "epoch": 2.881172347881491, "grad_norm": 0.31225134518126463, "learning_rate": 9.066363915470494e-06, "loss": 0.0914, "step": 2261 }, { "epoch": 2.8824466390570245, "grad_norm": 0.3045654790307603, "learning_rate": 9.057498440927275e-06, "loss": 0.0811, "step": 2262 }, { "epoch": 2.883720930232558, "grad_norm": 0.3094494021616613, "learning_rate": 9.048633713735098e-06, "loss": 0.0864, "step": 2263 }, { "epoch": 2.884995221408092, "grad_norm": 0.32401777564194323, "learning_rate": 9.039769740923183e-06, "loss": 0.0887, "step": 2264 }, { "epoch": 2.8862695125836253, "grad_norm": 0.33547579001256533, "learning_rate": 9.030906529520179e-06, "loss": 0.0847, "step": 2265 }, { "epoch": 2.8875438037591588, "grad_norm": 0.3135996188796138, "learning_rate": 9.022044086554102e-06, "loss": 0.0798, "step": 2266 }, { "epoch": 2.8888180949346927, "grad_norm": 0.31615722219038045, "learning_rate": 9.01318241905239e-06, "loss": 0.0836, "step": 2267 }, { "epoch": 2.890092386110226, "grad_norm": 0.3290505652089717, "learning_rate": 9.004321534041836e-06, "loss": 0.0918, "step": 2268 }, { "epoch": 2.8913666772857596, "grad_norm": 0.31639833923910476, "learning_rate": 8.995461438548632e-06, "loss": 0.0783, "step": 2269 }, { "epoch": 2.8926409684612935, "grad_norm": 0.3194022396428669, "learning_rate": 8.986602139598341e-06, "loss": 0.0895, "step": 2270 }, { "epoch": 2.893915259636827, "grad_norm": 0.3355631126811517, "learning_rate": 8.977743644215887e-06, "loss": 0.0881, "step": 2271 }, { "epoch": 2.8951895508123604, "grad_norm": 0.35581466731156913, "learning_rate": 8.968885959425567e-06, "loss": 0.0969, "step": 2272 }, { "epoch": 2.8964638419878943, "grad_norm": 0.32528234440722187, "learning_rate": 8.960029092251022e-06, "loss": 0.0827, "step": 2273 }, { "epoch": 2.8977381331634278, "grad_norm": 0.3171758610239743, "learning_rate": 8.951173049715263e-06, "loss": 0.0863, "step": 2274 }, { "epoch": 2.8990124243389612, "grad_norm": 0.313258959743367, "learning_rate": 8.942317838840625e-06, "loss": 0.0875, "step": 2275 }, { "epoch": 2.900286715514495, "grad_norm": 0.3372996431674686, "learning_rate": 8.933463466648798e-06, "loss": 0.0958, "step": 2276 }, { "epoch": 2.9015610066900286, "grad_norm": 0.31127818254598316, "learning_rate": 8.924609940160814e-06, "loss": 0.0788, "step": 2277 }, { "epoch": 2.902835297865562, "grad_norm": 0.30604333303504155, "learning_rate": 8.91575726639701e-06, "loss": 0.0776, "step": 2278 }, { "epoch": 2.904109589041096, "grad_norm": 0.3237775420543755, "learning_rate": 8.906905452377073e-06, "loss": 0.0824, "step": 2279 }, { "epoch": 2.9053838802166294, "grad_norm": 0.32210567965981446, "learning_rate": 8.898054505119988e-06, "loss": 0.0895, "step": 2280 }, { "epoch": 2.906658171392163, "grad_norm": 0.3357924341740863, "learning_rate": 8.889204431644067e-06, "loss": 0.0895, "step": 2281 }, { "epoch": 2.9079324625676968, "grad_norm": 0.3221316686072319, "learning_rate": 8.880355238966923e-06, "loss": 0.0822, "step": 2282 }, { "epoch": 2.9092067537432302, "grad_norm": 0.31836065304097727, "learning_rate": 8.871506934105465e-06, "loss": 0.0864, "step": 2283 }, { "epoch": 2.9104810449187637, "grad_norm": 0.31096867148303736, "learning_rate": 8.862659524075915e-06, "loss": 0.079, "step": 2284 }, { "epoch": 2.9117553360942976, "grad_norm": 0.33436510678377185, "learning_rate": 8.853813015893762e-06, "loss": 0.0934, "step": 2285 }, { "epoch": 2.913029627269831, "grad_norm": 0.328490239740507, "learning_rate": 8.844967416573803e-06, "loss": 0.0959, "step": 2286 }, { "epoch": 2.9143039184453645, "grad_norm": 0.32248665651279057, "learning_rate": 8.836122733130094e-06, "loss": 0.0945, "step": 2287 }, { "epoch": 2.9155782096208984, "grad_norm": 0.3086736283668569, "learning_rate": 8.827278972575984e-06, "loss": 0.0821, "step": 2288 }, { "epoch": 2.916852500796432, "grad_norm": 0.3030415800433782, "learning_rate": 8.818436141924072e-06, "loss": 0.0756, "step": 2289 }, { "epoch": 2.9181267919719653, "grad_norm": 0.3353687130726042, "learning_rate": 8.809594248186235e-06, "loss": 0.084, "step": 2290 }, { "epoch": 2.9194010831474992, "grad_norm": 0.30734776286407534, "learning_rate": 8.800753298373597e-06, "loss": 0.0727, "step": 2291 }, { "epoch": 2.9206753743230327, "grad_norm": 0.3206402863662217, "learning_rate": 8.791913299496537e-06, "loss": 0.0912, "step": 2292 }, { "epoch": 2.921949665498566, "grad_norm": 0.31543256892002935, "learning_rate": 8.783074258564679e-06, "loss": 0.0835, "step": 2293 }, { "epoch": 2.9232239566741, "grad_norm": 0.3468758032708957, "learning_rate": 8.774236182586893e-06, "loss": 0.0893, "step": 2294 }, { "epoch": 2.9244982478496335, "grad_norm": 0.3213852885404236, "learning_rate": 8.76539907857127e-06, "loss": 0.0825, "step": 2295 }, { "epoch": 2.925772539025167, "grad_norm": 0.30285794098643876, "learning_rate": 8.756562953525151e-06, "loss": 0.0832, "step": 2296 }, { "epoch": 2.927046830200701, "grad_norm": 0.32320271026417885, "learning_rate": 8.747727814455077e-06, "loss": 0.0911, "step": 2297 }, { "epoch": 2.9283211213762343, "grad_norm": 0.30335663862824774, "learning_rate": 8.738893668366832e-06, "loss": 0.0814, "step": 2298 }, { "epoch": 2.929595412551768, "grad_norm": 0.34975113745659503, "learning_rate": 8.730060522265388e-06, "loss": 0.1047, "step": 2299 }, { "epoch": 2.9308697037273017, "grad_norm": 0.3377436259565959, "learning_rate": 8.721228383154939e-06, "loss": 0.087, "step": 2300 }, { "epoch": 2.932143994902835, "grad_norm": 0.31260326356560486, "learning_rate": 8.712397258038885e-06, "loss": 0.0757, "step": 2301 }, { "epoch": 2.9334182860783686, "grad_norm": 0.3223895518785607, "learning_rate": 8.703567153919806e-06, "loss": 0.0868, "step": 2302 }, { "epoch": 2.9346925772539025, "grad_norm": 0.32461644395641864, "learning_rate": 8.694738077799487e-06, "loss": 0.0863, "step": 2303 }, { "epoch": 2.935966868429436, "grad_norm": 0.29321880934164557, "learning_rate": 8.685910036678885e-06, "loss": 0.0753, "step": 2304 }, { "epoch": 2.93724115960497, "grad_norm": 0.33350454575339666, "learning_rate": 8.67708303755815e-06, "loss": 0.0934, "step": 2305 }, { "epoch": 2.9385154507805034, "grad_norm": 0.32006391449329763, "learning_rate": 8.66825708743659e-06, "loss": 0.0837, "step": 2306 }, { "epoch": 2.939789741956037, "grad_norm": 0.3120501249266672, "learning_rate": 8.659432193312696e-06, "loss": 0.0793, "step": 2307 }, { "epoch": 2.9410640331315707, "grad_norm": 0.3244870736598853, "learning_rate": 8.650608362184119e-06, "loss": 0.0871, "step": 2308 }, { "epoch": 2.942338324307104, "grad_norm": 0.35851914838321947, "learning_rate": 8.641785601047654e-06, "loss": 0.0997, "step": 2309 }, { "epoch": 2.9436126154826376, "grad_norm": 0.3181236612449183, "learning_rate": 8.632963916899268e-06, "loss": 0.0868, "step": 2310 }, { "epoch": 2.9448869066581715, "grad_norm": 0.3083425621851181, "learning_rate": 8.624143316734054e-06, "loss": 0.0763, "step": 2311 }, { "epoch": 2.946161197833705, "grad_norm": 0.3190655022298892, "learning_rate": 8.615323807546258e-06, "loss": 0.077, "step": 2312 }, { "epoch": 2.9474354890092385, "grad_norm": 0.3019236588838287, "learning_rate": 8.606505396329264e-06, "loss": 0.0824, "step": 2313 }, { "epoch": 2.9487097801847724, "grad_norm": 0.32083588093911, "learning_rate": 8.597688090075569e-06, "loss": 0.0857, "step": 2314 }, { "epoch": 2.949984071360306, "grad_norm": 0.3079088548430019, "learning_rate": 8.58887189577681e-06, "loss": 0.074, "step": 2315 }, { "epoch": 2.9512583625358393, "grad_norm": 0.3310273539935576, "learning_rate": 8.580056820423731e-06, "loss": 0.0822, "step": 2316 }, { "epoch": 2.952532653711373, "grad_norm": 0.33846987957164193, "learning_rate": 8.571242871006202e-06, "loss": 0.0898, "step": 2317 }, { "epoch": 2.9538069448869066, "grad_norm": 0.3150705429822359, "learning_rate": 8.562430054513184e-06, "loss": 0.0864, "step": 2318 }, { "epoch": 2.95508123606244, "grad_norm": 0.32987436948240884, "learning_rate": 8.553618377932752e-06, "loss": 0.0957, "step": 2319 }, { "epoch": 2.956355527237974, "grad_norm": 0.322826540033335, "learning_rate": 8.54480784825207e-06, "loss": 0.0876, "step": 2320 }, { "epoch": 2.9576298184135075, "grad_norm": 0.3400756620305067, "learning_rate": 8.5359984724574e-06, "loss": 0.0966, "step": 2321 }, { "epoch": 2.958904109589041, "grad_norm": 0.3081342082917406, "learning_rate": 8.52719025753408e-06, "loss": 0.0736, "step": 2322 }, { "epoch": 2.960178400764575, "grad_norm": 0.3391390845200881, "learning_rate": 8.518383210466535e-06, "loss": 0.0847, "step": 2323 }, { "epoch": 2.9614526919401083, "grad_norm": 0.30004575384505705, "learning_rate": 8.509577338238255e-06, "loss": 0.0723, "step": 2324 }, { "epoch": 2.962726983115642, "grad_norm": 0.33201434866721197, "learning_rate": 8.500772647831815e-06, "loss": 0.0925, "step": 2325 }, { "epoch": 2.9640012742911757, "grad_norm": 0.3271678645732867, "learning_rate": 8.49196914622883e-06, "loss": 0.0862, "step": 2326 }, { "epoch": 2.965275565466709, "grad_norm": 0.3074530791894688, "learning_rate": 8.483166840409996e-06, "loss": 0.0813, "step": 2327 }, { "epoch": 2.966549856642243, "grad_norm": 0.33393682352830034, "learning_rate": 8.474365737355037e-06, "loss": 0.088, "step": 2328 }, { "epoch": 2.9678241478177765, "grad_norm": 0.3290438280829267, "learning_rate": 8.46556584404275e-06, "loss": 0.0823, "step": 2329 }, { "epoch": 2.96909843899331, "grad_norm": 0.3210686079777984, "learning_rate": 8.456767167450943e-06, "loss": 0.0829, "step": 2330 }, { "epoch": 2.970372730168844, "grad_norm": 0.3064496040504939, "learning_rate": 8.447969714556484e-06, "loss": 0.0818, "step": 2331 }, { "epoch": 2.9716470213443773, "grad_norm": 0.31265852276321343, "learning_rate": 8.439173492335265e-06, "loss": 0.0739, "step": 2332 }, { "epoch": 2.9729213125199108, "grad_norm": 0.30829634194841177, "learning_rate": 8.430378507762186e-06, "loss": 0.0785, "step": 2333 }, { "epoch": 2.9741956036954447, "grad_norm": 0.306121210744977, "learning_rate": 8.42158476781119e-06, "loss": 0.0795, "step": 2334 }, { "epoch": 2.975469894870978, "grad_norm": 0.32859789070955847, "learning_rate": 8.41279227945521e-06, "loss": 0.082, "step": 2335 }, { "epoch": 2.9767441860465116, "grad_norm": 0.3287454618006645, "learning_rate": 8.404001049666211e-06, "loss": 0.0841, "step": 2336 }, { "epoch": 2.9780184772220455, "grad_norm": 0.32523678829009606, "learning_rate": 8.395211085415133e-06, "loss": 0.0891, "step": 2337 }, { "epoch": 2.979292768397579, "grad_norm": 0.3063008875146568, "learning_rate": 8.386422393671934e-06, "loss": 0.0771, "step": 2338 }, { "epoch": 2.9805670595731124, "grad_norm": 0.3197550310317637, "learning_rate": 8.377634981405555e-06, "loss": 0.091, "step": 2339 }, { "epoch": 2.9818413507486463, "grad_norm": 0.3159854277283979, "learning_rate": 8.36884885558392e-06, "loss": 0.086, "step": 2340 }, { "epoch": 2.9831156419241798, "grad_norm": 0.32893180788310483, "learning_rate": 8.360064023173938e-06, "loss": 0.0849, "step": 2341 }, { "epoch": 2.9843899330997132, "grad_norm": 0.3454509539430804, "learning_rate": 8.351280491141485e-06, "loss": 0.0894, "step": 2342 }, { "epoch": 2.985664224275247, "grad_norm": 0.32007905065811126, "learning_rate": 8.342498266451418e-06, "loss": 0.0928, "step": 2343 }, { "epoch": 2.9869385154507806, "grad_norm": 0.32387040418695234, "learning_rate": 8.333717356067543e-06, "loss": 0.087, "step": 2344 }, { "epoch": 2.988212806626314, "grad_norm": 0.3438181759307985, "learning_rate": 8.324937766952638e-06, "loss": 0.0981, "step": 2345 }, { "epoch": 2.989487097801848, "grad_norm": 0.32062041589802714, "learning_rate": 8.31615950606842e-06, "loss": 0.0813, "step": 2346 }, { "epoch": 2.9907613889773814, "grad_norm": 0.32835915165769414, "learning_rate": 8.307382580375563e-06, "loss": 0.0798, "step": 2347 }, { "epoch": 2.992035680152915, "grad_norm": 0.33680826222448845, "learning_rate": 8.298606996833675e-06, "loss": 0.0879, "step": 2348 }, { "epoch": 2.993309971328449, "grad_norm": 0.3088593973904839, "learning_rate": 8.289832762401307e-06, "loss": 0.0708, "step": 2349 }, { "epoch": 2.9945842625039822, "grad_norm": 0.35170245789822074, "learning_rate": 8.281059884035931e-06, "loss": 0.1021, "step": 2350 }, { "epoch": 2.9958585536795157, "grad_norm": 0.31208745237469054, "learning_rate": 8.272288368693958e-06, "loss": 0.082, "step": 2351 }, { "epoch": 2.9971328448550496, "grad_norm": 0.3460878143067917, "learning_rate": 8.263518223330698e-06, "loss": 0.0946, "step": 2352 }, { "epoch": 2.998407136030583, "grad_norm": 0.2969292265852705, "learning_rate": 8.254749454900394e-06, "loss": 0.0765, "step": 2353 }, { "epoch": 2.9996814272061165, "grad_norm": 0.37510784018517884, "learning_rate": 8.245982070356186e-06, "loss": 0.1055, "step": 2354 }, { "epoch": 3.0009557183816504, "grad_norm": 0.30572862904103454, "learning_rate": 8.237216076650117e-06, "loss": 0.0498, "step": 2355 }, { "epoch": 3.002230009557184, "grad_norm": 0.25568434839100757, "learning_rate": 8.228451480733138e-06, "loss": 0.0313, "step": 2356 }, { "epoch": 3.0035043007327173, "grad_norm": 0.25894446136114163, "learning_rate": 8.219688289555075e-06, "loss": 0.0341, "step": 2357 }, { "epoch": 3.0047785919082513, "grad_norm": 0.23566311219293365, "learning_rate": 8.210926510064656e-06, "loss": 0.0309, "step": 2358 }, { "epoch": 3.0060528830837847, "grad_norm": 0.25887080719177136, "learning_rate": 8.202166149209475e-06, "loss": 0.0382, "step": 2359 }, { "epoch": 3.007327174259318, "grad_norm": 0.25131368123484094, "learning_rate": 8.193407213936014e-06, "loss": 0.027, "step": 2360 }, { "epoch": 3.008601465434852, "grad_norm": 0.32089804591552124, "learning_rate": 8.184649711189613e-06, "loss": 0.036, "step": 2361 }, { "epoch": 3.0098757566103855, "grad_norm": 0.36054481161767143, "learning_rate": 8.175893647914485e-06, "loss": 0.0433, "step": 2362 }, { "epoch": 3.011150047785919, "grad_norm": 0.371972101679109, "learning_rate": 8.167139031053705e-06, "loss": 0.0426, "step": 2363 }, { "epoch": 3.012424338961453, "grad_norm": 0.3463660145944976, "learning_rate": 8.158385867549183e-06, "loss": 0.035, "step": 2364 }, { "epoch": 3.0136986301369864, "grad_norm": 0.32202831916087654, "learning_rate": 8.149634164341699e-06, "loss": 0.0317, "step": 2365 }, { "epoch": 3.01497292131252, "grad_norm": 0.32223306974521587, "learning_rate": 8.140883928370855e-06, "loss": 0.0296, "step": 2366 }, { "epoch": 3.0162472124880537, "grad_norm": 0.2980658691586398, "learning_rate": 8.132135166575108e-06, "loss": 0.037, "step": 2367 }, { "epoch": 3.017521503663587, "grad_norm": 0.2962623853293126, "learning_rate": 8.123387885891726e-06, "loss": 0.0323, "step": 2368 }, { "epoch": 3.0187957948391206, "grad_norm": 0.29036527686289704, "learning_rate": 8.11464209325682e-06, "loss": 0.0335, "step": 2369 }, { "epoch": 3.0200700860146545, "grad_norm": 0.27962470617761975, "learning_rate": 8.10589779560532e-06, "loss": 0.038, "step": 2370 }, { "epoch": 3.021344377190188, "grad_norm": 0.2795296804644245, "learning_rate": 8.097154999870952e-06, "loss": 0.0368, "step": 2371 }, { "epoch": 3.0226186683657215, "grad_norm": 0.2814309048009109, "learning_rate": 8.08841371298628e-06, "loss": 0.0355, "step": 2372 }, { "epoch": 3.0238929595412554, "grad_norm": 0.2681892671501909, "learning_rate": 8.079673941882639e-06, "loss": 0.0309, "step": 2373 }, { "epoch": 3.025167250716789, "grad_norm": 0.2710131467401901, "learning_rate": 8.07093569349019e-06, "loss": 0.0346, "step": 2374 }, { "epoch": 3.0264415418923223, "grad_norm": 0.27910361120804594, "learning_rate": 8.062198974737874e-06, "loss": 0.0353, "step": 2375 }, { "epoch": 3.027715833067856, "grad_norm": 0.271030167382027, "learning_rate": 8.053463792553417e-06, "loss": 0.0304, "step": 2376 }, { "epoch": 3.0289901242433896, "grad_norm": 0.298651489814036, "learning_rate": 8.044730153863331e-06, "loss": 0.0343, "step": 2377 }, { "epoch": 3.030264415418923, "grad_norm": 0.2931933868828526, "learning_rate": 8.035998065592905e-06, "loss": 0.0368, "step": 2378 }, { "epoch": 3.031538706594457, "grad_norm": 0.3070123121417954, "learning_rate": 8.027267534666197e-06, "loss": 0.034, "step": 2379 }, { "epoch": 3.0328129977699905, "grad_norm": 0.283260732897124, "learning_rate": 8.018538568006027e-06, "loss": 0.0306, "step": 2380 }, { "epoch": 3.034087288945524, "grad_norm": 0.29214471112077495, "learning_rate": 8.009811172533977e-06, "loss": 0.0342, "step": 2381 }, { "epoch": 3.035361580121058, "grad_norm": 0.288163839187727, "learning_rate": 8.001085355170389e-06, "loss": 0.0315, "step": 2382 }, { "epoch": 3.0366358712965913, "grad_norm": 0.2925242945809144, "learning_rate": 7.992361122834341e-06, "loss": 0.0337, "step": 2383 }, { "epoch": 3.0379101624721248, "grad_norm": 0.2929644303465763, "learning_rate": 7.983638482443671e-06, "loss": 0.033, "step": 2384 }, { "epoch": 3.0391844536476587, "grad_norm": 0.2749556011433751, "learning_rate": 7.974917440914934e-06, "loss": 0.0316, "step": 2385 }, { "epoch": 3.040458744823192, "grad_norm": 0.28809933788929803, "learning_rate": 7.966198005163433e-06, "loss": 0.0331, "step": 2386 }, { "epoch": 3.0417330359987256, "grad_norm": 0.26494994262169674, "learning_rate": 7.957480182103198e-06, "loss": 0.0322, "step": 2387 }, { "epoch": 3.0430073271742595, "grad_norm": 0.2711049732587876, "learning_rate": 7.948763978646968e-06, "loss": 0.0322, "step": 2388 }, { "epoch": 3.044281618349793, "grad_norm": 0.2508907178046757, "learning_rate": 7.940049401706211e-06, "loss": 0.0302, "step": 2389 }, { "epoch": 3.0455559095253264, "grad_norm": 0.27744604129592204, "learning_rate": 7.931336458191092e-06, "loss": 0.0374, "step": 2390 }, { "epoch": 3.0468302007008603, "grad_norm": 0.269393501047693, "learning_rate": 7.922625155010496e-06, "loss": 0.0349, "step": 2391 }, { "epoch": 3.0481044918763938, "grad_norm": 0.27167774477104306, "learning_rate": 7.913915499071994e-06, "loss": 0.0302, "step": 2392 }, { "epoch": 3.049378783051927, "grad_norm": 0.2836120515793547, "learning_rate": 7.905207497281855e-06, "loss": 0.0351, "step": 2393 }, { "epoch": 3.050653074227461, "grad_norm": 0.2598366320368792, "learning_rate": 7.896501156545044e-06, "loss": 0.029, "step": 2394 }, { "epoch": 3.0519273654029946, "grad_norm": 0.29412708204912996, "learning_rate": 7.887796483765195e-06, "loss": 0.036, "step": 2395 }, { "epoch": 3.053201656578528, "grad_norm": 0.26446409120817777, "learning_rate": 7.879093485844635e-06, "loss": 0.0291, "step": 2396 }, { "epoch": 3.054475947754062, "grad_norm": 0.2808293577936084, "learning_rate": 7.870392169684347e-06, "loss": 0.0376, "step": 2397 }, { "epoch": 3.0557502389295954, "grad_norm": 0.2682221434750384, "learning_rate": 7.861692542183993e-06, "loss": 0.0298, "step": 2398 }, { "epoch": 3.057024530105129, "grad_norm": 0.30575957653239805, "learning_rate": 7.852994610241886e-06, "loss": 0.038, "step": 2399 }, { "epoch": 3.0582988212806628, "grad_norm": 0.27641379559144813, "learning_rate": 7.844298380755003e-06, "loss": 0.0284, "step": 2400 }, { "epoch": 3.0595731124561962, "grad_norm": 0.26734835291681364, "learning_rate": 7.835603860618973e-06, "loss": 0.0284, "step": 2401 }, { "epoch": 3.0608474036317297, "grad_norm": 0.3045629864026343, "learning_rate": 7.826911056728054e-06, "loss": 0.0405, "step": 2402 }, { "epoch": 3.0621216948072636, "grad_norm": 0.2922277172936814, "learning_rate": 7.818219975975163e-06, "loss": 0.0343, "step": 2403 }, { "epoch": 3.063395985982797, "grad_norm": 0.2701427691272676, "learning_rate": 7.809530625251831e-06, "loss": 0.0268, "step": 2404 }, { "epoch": 3.0646702771583305, "grad_norm": 0.2753187882724374, "learning_rate": 7.800843011448237e-06, "loss": 0.0345, "step": 2405 }, { "epoch": 3.0659445683338644, "grad_norm": 0.2952003343268054, "learning_rate": 7.792157141453168e-06, "loss": 0.0397, "step": 2406 }, { "epoch": 3.067218859509398, "grad_norm": 0.28569602028422075, "learning_rate": 7.783473022154035e-06, "loss": 0.0323, "step": 2407 }, { "epoch": 3.0684931506849313, "grad_norm": 0.27585986630830756, "learning_rate": 7.774790660436857e-06, "loss": 0.0353, "step": 2408 }, { "epoch": 3.0697674418604652, "grad_norm": 0.26887715385128413, "learning_rate": 7.766110063186263e-06, "loss": 0.0336, "step": 2409 }, { "epoch": 3.0710417330359987, "grad_norm": 0.2980084751859483, "learning_rate": 7.757431237285482e-06, "loss": 0.0394, "step": 2410 }, { "epoch": 3.072316024211532, "grad_norm": 0.25839005759865347, "learning_rate": 7.748754189616335e-06, "loss": 0.0272, "step": 2411 }, { "epoch": 3.073590315387066, "grad_norm": 0.28051319951009845, "learning_rate": 7.740078927059233e-06, "loss": 0.0316, "step": 2412 }, { "epoch": 3.0748646065625995, "grad_norm": 0.2772184114070292, "learning_rate": 7.731405456493185e-06, "loss": 0.0364, "step": 2413 }, { "epoch": 3.076138897738133, "grad_norm": 0.2856269960083792, "learning_rate": 7.722733784795756e-06, "loss": 0.0362, "step": 2414 }, { "epoch": 3.077413188913667, "grad_norm": 0.259939748113813, "learning_rate": 7.714063918843106e-06, "loss": 0.0264, "step": 2415 }, { "epoch": 3.0786874800892003, "grad_norm": 0.2993131241183177, "learning_rate": 7.705395865509948e-06, "loss": 0.0353, "step": 2416 }, { "epoch": 3.079961771264734, "grad_norm": 0.26525638168100013, "learning_rate": 7.696729631669563e-06, "loss": 0.0259, "step": 2417 }, { "epoch": 3.0812360624402677, "grad_norm": 0.2787315771136146, "learning_rate": 7.688065224193798e-06, "loss": 0.0327, "step": 2418 }, { "epoch": 3.082510353615801, "grad_norm": 0.30158051368470834, "learning_rate": 7.679402649953034e-06, "loss": 0.0358, "step": 2419 }, { "epoch": 3.0837846447913346, "grad_norm": 0.28092322301302247, "learning_rate": 7.670741915816217e-06, "loss": 0.0357, "step": 2420 }, { "epoch": 3.0850589359668685, "grad_norm": 0.27199737824821546, "learning_rate": 7.662083028650816e-06, "loss": 0.0312, "step": 2421 }, { "epoch": 3.086333227142402, "grad_norm": 0.2979797074431902, "learning_rate": 7.653425995322852e-06, "loss": 0.0321, "step": 2422 }, { "epoch": 3.0876075183179355, "grad_norm": 0.30720114534878107, "learning_rate": 7.644770822696859e-06, "loss": 0.0365, "step": 2423 }, { "epoch": 3.0888818094934694, "grad_norm": 0.2977961710761861, "learning_rate": 7.636117517635911e-06, "loss": 0.0373, "step": 2424 }, { "epoch": 3.090156100669003, "grad_norm": 0.2740517645531552, "learning_rate": 7.627466087001601e-06, "loss": 0.0339, "step": 2425 }, { "epoch": 3.0914303918445363, "grad_norm": 0.2865522591652486, "learning_rate": 7.618816537654018e-06, "loss": 0.0342, "step": 2426 }, { "epoch": 3.09270468302007, "grad_norm": 0.2889338232129669, "learning_rate": 7.610168876451781e-06, "loss": 0.0427, "step": 2427 }, { "epoch": 3.0939789741956036, "grad_norm": 0.2968273308176045, "learning_rate": 7.601523110251994e-06, "loss": 0.0446, "step": 2428 }, { "epoch": 3.095253265371137, "grad_norm": 0.2621611787372769, "learning_rate": 7.592879245910273e-06, "loss": 0.0316, "step": 2429 }, { "epoch": 3.096527556546671, "grad_norm": 0.2828676494823298, "learning_rate": 7.5842372902807115e-06, "loss": 0.0322, "step": 2430 }, { "epoch": 3.0978018477222045, "grad_norm": 0.2671310925035987, "learning_rate": 7.575597250215903e-06, "loss": 0.0313, "step": 2431 }, { "epoch": 3.099076138897738, "grad_norm": 0.2933853878609721, "learning_rate": 7.566959132566914e-06, "loss": 0.033, "step": 2432 }, { "epoch": 3.100350430073272, "grad_norm": 0.2794396412954194, "learning_rate": 7.558322944183291e-06, "loss": 0.0339, "step": 2433 }, { "epoch": 3.1016247212488053, "grad_norm": 0.28012957851099746, "learning_rate": 7.549688691913044e-06, "loss": 0.0394, "step": 2434 }, { "epoch": 3.1028990124243387, "grad_norm": 0.29656109410344933, "learning_rate": 7.541056382602657e-06, "loss": 0.0373, "step": 2435 }, { "epoch": 3.1041733035998726, "grad_norm": 0.27551364801317907, "learning_rate": 7.532426023097063e-06, "loss": 0.033, "step": 2436 }, { "epoch": 3.105447594775406, "grad_norm": 0.27795946662442467, "learning_rate": 7.523797620239663e-06, "loss": 0.0324, "step": 2437 }, { "epoch": 3.1067218859509396, "grad_norm": 0.28919813733437383, "learning_rate": 7.5151711808722895e-06, "loss": 0.0325, "step": 2438 }, { "epoch": 3.1079961771264735, "grad_norm": 0.2586395904355803, "learning_rate": 7.506546711835234e-06, "loss": 0.0262, "step": 2439 }, { "epoch": 3.109270468302007, "grad_norm": 0.27677502508567836, "learning_rate": 7.49792421996721e-06, "loss": 0.0306, "step": 2440 }, { "epoch": 3.110544759477541, "grad_norm": 0.28515469797736553, "learning_rate": 7.4893037121053806e-06, "loss": 0.0309, "step": 2441 }, { "epoch": 3.1118190506530743, "grad_norm": 0.2921498370382978, "learning_rate": 7.4806851950853165e-06, "loss": 0.0396, "step": 2442 }, { "epoch": 3.1130933418286078, "grad_norm": 0.27615614482426737, "learning_rate": 7.472068675741024e-06, "loss": 0.032, "step": 2443 }, { "epoch": 3.1143676330041417, "grad_norm": 0.27289279443495756, "learning_rate": 7.463454160904928e-06, "loss": 0.0309, "step": 2444 }, { "epoch": 3.115641924179675, "grad_norm": 0.3177250559673331, "learning_rate": 7.454841657407847e-06, "loss": 0.0493, "step": 2445 }, { "epoch": 3.1169162153552086, "grad_norm": 0.260531651740554, "learning_rate": 7.446231172079024e-06, "loss": 0.0287, "step": 2446 }, { "epoch": 3.1181905065307425, "grad_norm": 0.2608712238556602, "learning_rate": 7.437622711746081e-06, "loss": 0.033, "step": 2447 }, { "epoch": 3.119464797706276, "grad_norm": 0.27965674688224246, "learning_rate": 7.429016283235054e-06, "loss": 0.0354, "step": 2448 }, { "epoch": 3.1207390888818094, "grad_norm": 0.26695556881086124, "learning_rate": 7.420411893370358e-06, "loss": 0.029, "step": 2449 }, { "epoch": 3.1220133800573433, "grad_norm": 0.29680322579741336, "learning_rate": 7.411809548974792e-06, "loss": 0.0337, "step": 2450 }, { "epoch": 3.1232876712328768, "grad_norm": 0.29833854132272297, "learning_rate": 7.403209256869539e-06, "loss": 0.0338, "step": 2451 }, { "epoch": 3.12456196240841, "grad_norm": 0.2784434186045109, "learning_rate": 7.39461102387414e-06, "loss": 0.0383, "step": 2452 }, { "epoch": 3.125836253583944, "grad_norm": 0.29467585855021744, "learning_rate": 7.3860148568065225e-06, "loss": 0.0366, "step": 2453 }, { "epoch": 3.1271105447594776, "grad_norm": 0.2977343117181243, "learning_rate": 7.377420762482958e-06, "loss": 0.035, "step": 2454 }, { "epoch": 3.128384835935011, "grad_norm": 0.2843156676109088, "learning_rate": 7.368828747718089e-06, "loss": 0.0325, "step": 2455 }, { "epoch": 3.129659127110545, "grad_norm": 0.2729373642712642, "learning_rate": 7.360238819324903e-06, "loss": 0.0287, "step": 2456 }, { "epoch": 3.1309334182860784, "grad_norm": 0.29416686609651566, "learning_rate": 7.3516509841147276e-06, "loss": 0.0358, "step": 2457 }, { "epoch": 3.132207709461612, "grad_norm": 0.2689679884298644, "learning_rate": 7.3430652488972436e-06, "loss": 0.0304, "step": 2458 }, { "epoch": 3.1334820006371458, "grad_norm": 0.2969380828340744, "learning_rate": 7.3344816204804494e-06, "loss": 0.0355, "step": 2459 }, { "epoch": 3.1347562918126792, "grad_norm": 0.2826678517676651, "learning_rate": 7.325900105670693e-06, "loss": 0.0323, "step": 2460 }, { "epoch": 3.1360305829882127, "grad_norm": 0.2947196020960831, "learning_rate": 7.317320711272624e-06, "loss": 0.0431, "step": 2461 }, { "epoch": 3.1373048741637466, "grad_norm": 0.27339969689543814, "learning_rate": 7.308743444089232e-06, "loss": 0.0356, "step": 2462 }, { "epoch": 3.13857916533928, "grad_norm": 0.2662999901329611, "learning_rate": 7.300168310921807e-06, "loss": 0.0301, "step": 2463 }, { "epoch": 3.1398534565148135, "grad_norm": 0.27065610980443777, "learning_rate": 7.291595318569951e-06, "loss": 0.0288, "step": 2464 }, { "epoch": 3.1411277476903474, "grad_norm": 0.2719865350110708, "learning_rate": 7.283024473831566e-06, "loss": 0.0351, "step": 2465 }, { "epoch": 3.142402038865881, "grad_norm": 0.28499140866448075, "learning_rate": 7.274455783502852e-06, "loss": 0.0309, "step": 2466 }, { "epoch": 3.1436763300414143, "grad_norm": 0.29407975548007803, "learning_rate": 7.265889254378302e-06, "loss": 0.037, "step": 2467 }, { "epoch": 3.1449506212169482, "grad_norm": 0.2753751903253368, "learning_rate": 7.257324893250699e-06, "loss": 0.0311, "step": 2468 }, { "epoch": 3.1462249123924817, "grad_norm": 0.29006268567067806, "learning_rate": 7.248762706911094e-06, "loss": 0.0342, "step": 2469 }, { "epoch": 3.147499203568015, "grad_norm": 0.3217028591912313, "learning_rate": 7.240202702148831e-06, "loss": 0.0474, "step": 2470 }, { "epoch": 3.148773494743549, "grad_norm": 0.2741491485772739, "learning_rate": 7.2316448857515076e-06, "loss": 0.0287, "step": 2471 }, { "epoch": 3.1500477859190825, "grad_norm": 0.2804215974722851, "learning_rate": 7.223089264505001e-06, "loss": 0.0311, "step": 2472 }, { "epoch": 3.151322077094616, "grad_norm": 0.2807314408460376, "learning_rate": 7.2145358451934314e-06, "loss": 0.0314, "step": 2473 }, { "epoch": 3.15259636827015, "grad_norm": 0.3009514232514309, "learning_rate": 7.205984634599188e-06, "loss": 0.039, "step": 2474 }, { "epoch": 3.1538706594456833, "grad_norm": 0.29512486926067677, "learning_rate": 7.197435639502906e-06, "loss": 0.038, "step": 2475 }, { "epoch": 3.155144950621217, "grad_norm": 0.2878738801322945, "learning_rate": 7.18888886668345e-06, "loss": 0.0317, "step": 2476 }, { "epoch": 3.1564192417967507, "grad_norm": 0.2781842746375863, "learning_rate": 7.180344322917945e-06, "loss": 0.0288, "step": 2477 }, { "epoch": 3.157693532972284, "grad_norm": 0.2927190393233188, "learning_rate": 7.171802014981726e-06, "loss": 0.0342, "step": 2478 }, { "epoch": 3.1589678241478176, "grad_norm": 0.2866642856144122, "learning_rate": 7.16326194964837e-06, "loss": 0.0286, "step": 2479 }, { "epoch": 3.1602421153233515, "grad_norm": 0.2808806455737492, "learning_rate": 7.154724133689677e-06, "loss": 0.0304, "step": 2480 }, { "epoch": 3.161516406498885, "grad_norm": 0.2642639191403345, "learning_rate": 7.146188573875648e-06, "loss": 0.0295, "step": 2481 }, { "epoch": 3.1627906976744184, "grad_norm": 0.277569587717572, "learning_rate": 7.137655276974511e-06, "loss": 0.0291, "step": 2482 }, { "epoch": 3.1640649888499524, "grad_norm": 0.27208675164654267, "learning_rate": 7.129124249752688e-06, "loss": 0.0291, "step": 2483 }, { "epoch": 3.165339280025486, "grad_norm": 0.2818342876743979, "learning_rate": 7.120595498974814e-06, "loss": 0.037, "step": 2484 }, { "epoch": 3.1666135712010193, "grad_norm": 0.29784674645595044, "learning_rate": 7.112069031403704e-06, "loss": 0.0385, "step": 2485 }, { "epoch": 3.167887862376553, "grad_norm": 0.2796481250161984, "learning_rate": 7.1035448538003706e-06, "loss": 0.0332, "step": 2486 }, { "epoch": 3.1691621535520866, "grad_norm": 0.2753356856404222, "learning_rate": 7.095022972924017e-06, "loss": 0.0282, "step": 2487 }, { "epoch": 3.17043644472762, "grad_norm": 0.29701219875924756, "learning_rate": 7.086503395532012e-06, "loss": 0.0304, "step": 2488 }, { "epoch": 3.171710735903154, "grad_norm": 0.29271480061463445, "learning_rate": 7.077986128379908e-06, "loss": 0.0346, "step": 2489 }, { "epoch": 3.1729850270786875, "grad_norm": 0.2917483026849266, "learning_rate": 7.069471178221416e-06, "loss": 0.0354, "step": 2490 }, { "epoch": 3.174259318254221, "grad_norm": 0.2727882289476046, "learning_rate": 7.060958551808423e-06, "loss": 0.0318, "step": 2491 }, { "epoch": 3.175533609429755, "grad_norm": 0.2929288787749883, "learning_rate": 7.052448255890958e-06, "loss": 0.035, "step": 2492 }, { "epoch": 3.1768079006052883, "grad_norm": 0.2869139353570008, "learning_rate": 7.043940297217215e-06, "loss": 0.0332, "step": 2493 }, { "epoch": 3.1780821917808217, "grad_norm": 0.2831342712018922, "learning_rate": 7.035434682533528e-06, "loss": 0.0315, "step": 2494 }, { "epoch": 3.1793564829563556, "grad_norm": 0.28120804848217734, "learning_rate": 7.0269314185843755e-06, "loss": 0.032, "step": 2495 }, { "epoch": 3.180630774131889, "grad_norm": 0.29884379378083875, "learning_rate": 7.018430512112367e-06, "loss": 0.0326, "step": 2496 }, { "epoch": 3.1819050653074226, "grad_norm": 0.2943485722726961, "learning_rate": 7.009931969858247e-06, "loss": 0.0407, "step": 2497 }, { "epoch": 3.1831793564829565, "grad_norm": 0.28521844560284915, "learning_rate": 7.001435798560884e-06, "loss": 0.0319, "step": 2498 }, { "epoch": 3.18445364765849, "grad_norm": 0.2812142537394185, "learning_rate": 6.992942004957271e-06, "loss": 0.0314, "step": 2499 }, { "epoch": 3.1857279388340234, "grad_norm": 0.29496334846819316, "learning_rate": 6.9844505957825045e-06, "loss": 0.0333, "step": 2500 }, { "epoch": 3.1870022300095573, "grad_norm": 0.2779924898694907, "learning_rate": 6.975961577769805e-06, "loss": 0.034, "step": 2501 }, { "epoch": 3.1882765211850908, "grad_norm": 0.28594185848638903, "learning_rate": 6.967474957650482e-06, "loss": 0.0342, "step": 2502 }, { "epoch": 3.189550812360624, "grad_norm": 0.2862502329793712, "learning_rate": 6.958990742153956e-06, "loss": 0.0359, "step": 2503 }, { "epoch": 3.190825103536158, "grad_norm": 0.2769067021055001, "learning_rate": 6.95050893800773e-06, "loss": 0.0308, "step": 2504 }, { "epoch": 3.1920993947116916, "grad_norm": 0.29189294910834707, "learning_rate": 6.942029551937403e-06, "loss": 0.0355, "step": 2505 }, { "epoch": 3.193373685887225, "grad_norm": 0.27403364189722634, "learning_rate": 6.933552590666659e-06, "loss": 0.0301, "step": 2506 }, { "epoch": 3.194647977062759, "grad_norm": 0.2804159646025589, "learning_rate": 6.925078060917245e-06, "loss": 0.0339, "step": 2507 }, { "epoch": 3.1959222682382924, "grad_norm": 0.2946512756907755, "learning_rate": 6.916605969408999e-06, "loss": 0.0279, "step": 2508 }, { "epoch": 3.197196559413826, "grad_norm": 0.27546167268267807, "learning_rate": 6.9081363228598064e-06, "loss": 0.03, "step": 2509 }, { "epoch": 3.1984708505893598, "grad_norm": 0.2703622395704297, "learning_rate": 6.8996691279856335e-06, "loss": 0.0288, "step": 2510 }, { "epoch": 3.199745141764893, "grad_norm": 0.3008977309169871, "learning_rate": 6.891204391500481e-06, "loss": 0.0331, "step": 2511 }, { "epoch": 3.201019432940427, "grad_norm": 0.26754863722423167, "learning_rate": 6.882742120116419e-06, "loss": 0.0286, "step": 2512 }, { "epoch": 3.2022937241159606, "grad_norm": 0.2886411913111421, "learning_rate": 6.874282320543557e-06, "loss": 0.0309, "step": 2513 }, { "epoch": 3.203568015291494, "grad_norm": 0.28646489205113296, "learning_rate": 6.865824999490036e-06, "loss": 0.0356, "step": 2514 }, { "epoch": 3.204842306467028, "grad_norm": 0.28807246516963647, "learning_rate": 6.857370163662047e-06, "loss": 0.0373, "step": 2515 }, { "epoch": 3.2061165976425614, "grad_norm": 0.2724178593911178, "learning_rate": 6.848917819763794e-06, "loss": 0.0297, "step": 2516 }, { "epoch": 3.207390888818095, "grad_norm": 0.29287056517988674, "learning_rate": 6.840467974497516e-06, "loss": 0.0351, "step": 2517 }, { "epoch": 3.2086651799936288, "grad_norm": 0.2791271723539614, "learning_rate": 6.832020634563474e-06, "loss": 0.0284, "step": 2518 }, { "epoch": 3.2099394711691622, "grad_norm": 0.27221634786307486, "learning_rate": 6.823575806659926e-06, "loss": 0.0334, "step": 2519 }, { "epoch": 3.2112137623446957, "grad_norm": 0.27875681536215224, "learning_rate": 6.815133497483157e-06, "loss": 0.0357, "step": 2520 }, { "epoch": 3.2124880535202296, "grad_norm": 0.27936746047560007, "learning_rate": 6.8066937137274395e-06, "loss": 0.0366, "step": 2521 }, { "epoch": 3.213762344695763, "grad_norm": 0.2955761570343824, "learning_rate": 6.798256462085055e-06, "loss": 0.0331, "step": 2522 }, { "epoch": 3.2150366358712965, "grad_norm": 0.27981413956253903, "learning_rate": 6.789821749246268e-06, "loss": 0.0325, "step": 2523 }, { "epoch": 3.2163109270468304, "grad_norm": 0.2628634864270936, "learning_rate": 6.781389581899339e-06, "loss": 0.0322, "step": 2524 }, { "epoch": 3.217585218222364, "grad_norm": 0.2776342002584423, "learning_rate": 6.772959966730502e-06, "loss": 0.0336, "step": 2525 }, { "epoch": 3.2188595093978973, "grad_norm": 0.26707570576143635, "learning_rate": 6.764532910423971e-06, "loss": 0.0292, "step": 2526 }, { "epoch": 3.2201338005734312, "grad_norm": 0.27787469920344066, "learning_rate": 6.7561084196619306e-06, "loss": 0.0337, "step": 2527 }, { "epoch": 3.2214080917489647, "grad_norm": 0.2695683613958592, "learning_rate": 6.747686501124531e-06, "loss": 0.0283, "step": 2528 }, { "epoch": 3.222682382924498, "grad_norm": 0.2845409881844208, "learning_rate": 6.73926716148988e-06, "loss": 0.0306, "step": 2529 }, { "epoch": 3.223956674100032, "grad_norm": 0.3067559466778094, "learning_rate": 6.73085040743405e-06, "loss": 0.0412, "step": 2530 }, { "epoch": 3.2252309652755655, "grad_norm": 0.27682995573376395, "learning_rate": 6.7224362456310475e-06, "loss": 0.0305, "step": 2531 }, { "epoch": 3.226505256451099, "grad_norm": 0.2725809175825222, "learning_rate": 6.714024682752842e-06, "loss": 0.03, "step": 2532 }, { "epoch": 3.227779547626633, "grad_norm": 0.28750783211061454, "learning_rate": 6.705615725469323e-06, "loss": 0.03, "step": 2533 }, { "epoch": 3.2290538388021663, "grad_norm": 0.28934088583805456, "learning_rate": 6.697209380448333e-06, "loss": 0.0352, "step": 2534 }, { "epoch": 3.2303281299777, "grad_norm": 0.27658121803468516, "learning_rate": 6.688805654355623e-06, "loss": 0.0336, "step": 2535 }, { "epoch": 3.2316024211532337, "grad_norm": 0.26571078382722757, "learning_rate": 6.6804045538548844e-06, "loss": 0.0291, "step": 2536 }, { "epoch": 3.232876712328767, "grad_norm": 0.2814622021083522, "learning_rate": 6.672006085607722e-06, "loss": 0.0322, "step": 2537 }, { "epoch": 3.2341510035043006, "grad_norm": 0.2919556715796648, "learning_rate": 6.663610256273645e-06, "loss": 0.0368, "step": 2538 }, { "epoch": 3.2354252946798345, "grad_norm": 0.2852431720902996, "learning_rate": 6.655217072510085e-06, "loss": 0.0403, "step": 2539 }, { "epoch": 3.236699585855368, "grad_norm": 0.3014778676806303, "learning_rate": 6.646826540972357e-06, "loss": 0.0373, "step": 2540 }, { "epoch": 3.2379738770309014, "grad_norm": 0.30104153060227623, "learning_rate": 6.638438668313695e-06, "loss": 0.0407, "step": 2541 }, { "epoch": 3.2392481682064354, "grad_norm": 0.27388871525225417, "learning_rate": 6.630053461185202e-06, "loss": 0.0336, "step": 2542 }, { "epoch": 3.240522459381969, "grad_norm": 0.30617235715343993, "learning_rate": 6.621670926235884e-06, "loss": 0.0458, "step": 2543 }, { "epoch": 3.2417967505575023, "grad_norm": 0.2704680901500627, "learning_rate": 6.613291070112624e-06, "loss": 0.0346, "step": 2544 }, { "epoch": 3.243071041733036, "grad_norm": 0.272247840753155, "learning_rate": 6.604913899460175e-06, "loss": 0.034, "step": 2545 }, { "epoch": 3.2443453329085696, "grad_norm": 0.28716254979327877, "learning_rate": 6.596539420921171e-06, "loss": 0.0319, "step": 2546 }, { "epoch": 3.245619624084103, "grad_norm": 0.28058965591325447, "learning_rate": 6.5881676411360976e-06, "loss": 0.0336, "step": 2547 }, { "epoch": 3.246893915259637, "grad_norm": 0.3008112377318998, "learning_rate": 6.579798566743314e-06, "loss": 0.0465, "step": 2548 }, { "epoch": 3.2481682064351705, "grad_norm": 0.26568422100186156, "learning_rate": 6.571432204379025e-06, "loss": 0.0311, "step": 2549 }, { "epoch": 3.249442497610704, "grad_norm": 0.2583432634757905, "learning_rate": 6.56306856067729e-06, "loss": 0.0309, "step": 2550 }, { "epoch": 3.250716788786238, "grad_norm": 0.30055501126301615, "learning_rate": 6.554707642270009e-06, "loss": 0.0379, "step": 2551 }, { "epoch": 3.2519910799617713, "grad_norm": 0.27200765566789786, "learning_rate": 6.546349455786926e-06, "loss": 0.0367, "step": 2552 }, { "epoch": 3.2532653711373047, "grad_norm": 0.2695872074964264, "learning_rate": 6.5379940078556116e-06, "loss": 0.0298, "step": 2553 }, { "epoch": 3.2545396623128386, "grad_norm": 0.27304724715456036, "learning_rate": 6.529641305101471e-06, "loss": 0.0278, "step": 2554 }, { "epoch": 3.255813953488372, "grad_norm": 0.2798404922864379, "learning_rate": 6.521291354147727e-06, "loss": 0.032, "step": 2555 }, { "epoch": 3.2570882446639056, "grad_norm": 0.28867422031400025, "learning_rate": 6.512944161615433e-06, "loss": 0.0409, "step": 2556 }, { "epoch": 3.2583625358394395, "grad_norm": 0.3097391403601258, "learning_rate": 6.504599734123434e-06, "loss": 0.035, "step": 2557 }, { "epoch": 3.259636827014973, "grad_norm": 0.28548584997767706, "learning_rate": 6.496258078288407e-06, "loss": 0.0309, "step": 2558 }, { "epoch": 3.2609111181905064, "grad_norm": 0.29028920106407824, "learning_rate": 6.487919200724805e-06, "loss": 0.032, "step": 2559 }, { "epoch": 3.2621854093660403, "grad_norm": 0.30033752227576627, "learning_rate": 6.4795831080448986e-06, "loss": 0.032, "step": 2560 }, { "epoch": 3.2634597005415737, "grad_norm": 0.2830820665795423, "learning_rate": 6.471249806858748e-06, "loss": 0.033, "step": 2561 }, { "epoch": 3.264733991717107, "grad_norm": 0.2907307854968144, "learning_rate": 6.462919303774186e-06, "loss": 0.0293, "step": 2562 }, { "epoch": 3.266008282892641, "grad_norm": 0.28745958471506594, "learning_rate": 6.454591605396844e-06, "loss": 0.0352, "step": 2563 }, { "epoch": 3.2672825740681746, "grad_norm": 0.2817338561686668, "learning_rate": 6.4462667183301135e-06, "loss": 0.0331, "step": 2564 }, { "epoch": 3.268556865243708, "grad_norm": 0.2818697005369847, "learning_rate": 6.437944649175171e-06, "loss": 0.0341, "step": 2565 }, { "epoch": 3.269831156419242, "grad_norm": 0.2904684972052638, "learning_rate": 6.429625404530946e-06, "loss": 0.0372, "step": 2566 }, { "epoch": 3.2711054475947754, "grad_norm": 0.2844563994952312, "learning_rate": 6.421308990994136e-06, "loss": 0.0329, "step": 2567 }, { "epoch": 3.272379738770309, "grad_norm": 0.28671643344704856, "learning_rate": 6.4129954151591976e-06, "loss": 0.0318, "step": 2568 }, { "epoch": 3.2736540299458428, "grad_norm": 0.304409522671762, "learning_rate": 6.404684683618325e-06, "loss": 0.0377, "step": 2569 }, { "epoch": 3.274928321121376, "grad_norm": 0.2777698224894076, "learning_rate": 6.396376802961468e-06, "loss": 0.0333, "step": 2570 }, { "epoch": 3.2762026122969097, "grad_norm": 0.29019485805228934, "learning_rate": 6.388071779776307e-06, "loss": 0.0362, "step": 2571 }, { "epoch": 3.2774769034724436, "grad_norm": 0.29643177362084844, "learning_rate": 6.379769620648266e-06, "loss": 0.0376, "step": 2572 }, { "epoch": 3.278751194647977, "grad_norm": 0.3071046355446382, "learning_rate": 6.371470332160488e-06, "loss": 0.0389, "step": 2573 }, { "epoch": 3.2800254858235105, "grad_norm": 0.2751479868755522, "learning_rate": 6.363173920893845e-06, "loss": 0.0312, "step": 2574 }, { "epoch": 3.2812997769990444, "grad_norm": 0.29325720647933795, "learning_rate": 6.3548803934269345e-06, "loss": 0.0349, "step": 2575 }, { "epoch": 3.282574068174578, "grad_norm": 0.2679583700730433, "learning_rate": 6.34658975633605e-06, "loss": 0.03, "step": 2576 }, { "epoch": 3.2838483593501113, "grad_norm": 0.2631098955558639, "learning_rate": 6.338302016195213e-06, "loss": 0.0257, "step": 2577 }, { "epoch": 3.2851226505256452, "grad_norm": 0.29508708831458963, "learning_rate": 6.3300171795761265e-06, "loss": 0.0383, "step": 2578 }, { "epoch": 3.2863969417011787, "grad_norm": 0.2797036031796874, "learning_rate": 6.321735253048214e-06, "loss": 0.032, "step": 2579 }, { "epoch": 3.287671232876712, "grad_norm": 0.30224761001815875, "learning_rate": 6.3134562431785736e-06, "loss": 0.0379, "step": 2580 }, { "epoch": 3.288945524052246, "grad_norm": 0.30394128324697334, "learning_rate": 6.305180156532e-06, "loss": 0.0412, "step": 2581 }, { "epoch": 3.2902198152277795, "grad_norm": 0.2828152921216571, "learning_rate": 6.2969069996709664e-06, "loss": 0.0341, "step": 2582 }, { "epoch": 3.291494106403313, "grad_norm": 0.27679073922357245, "learning_rate": 6.288636779155621e-06, "loss": 0.028, "step": 2583 }, { "epoch": 3.292768397578847, "grad_norm": 0.2706337895110626, "learning_rate": 6.28036950154379e-06, "loss": 0.0337, "step": 2584 }, { "epoch": 3.2940426887543803, "grad_norm": 0.3288891566595104, "learning_rate": 6.272105173390962e-06, "loss": 0.0468, "step": 2585 }, { "epoch": 3.295316979929914, "grad_norm": 0.27721333440570967, "learning_rate": 6.263843801250282e-06, "loss": 0.0348, "step": 2586 }, { "epoch": 3.2965912711054477, "grad_norm": 0.27620251290556513, "learning_rate": 6.255585391672565e-06, "loss": 0.0357, "step": 2587 }, { "epoch": 3.297865562280981, "grad_norm": 0.3012755478741211, "learning_rate": 6.24732995120626e-06, "loss": 0.0375, "step": 2588 }, { "epoch": 3.2991398534565146, "grad_norm": 0.2711793025834749, "learning_rate": 6.239077486397475e-06, "loss": 0.0313, "step": 2589 }, { "epoch": 3.3004141446320485, "grad_norm": 0.3138087242451018, "learning_rate": 6.230828003789949e-06, "loss": 0.0405, "step": 2590 }, { "epoch": 3.301688435807582, "grad_norm": 0.2809352961385747, "learning_rate": 6.222581509925061e-06, "loss": 0.0364, "step": 2591 }, { "epoch": 3.3029627269831154, "grad_norm": 0.28260514062634423, "learning_rate": 6.214338011341825e-06, "loss": 0.0344, "step": 2592 }, { "epoch": 3.3042370181586493, "grad_norm": 0.2913247733338444, "learning_rate": 6.206097514576866e-06, "loss": 0.0329, "step": 2593 }, { "epoch": 3.305511309334183, "grad_norm": 0.289589074354242, "learning_rate": 6.197860026164446e-06, "loss": 0.0336, "step": 2594 }, { "epoch": 3.3067856005097163, "grad_norm": 0.28899471436211666, "learning_rate": 6.1896255526364245e-06, "loss": 0.0356, "step": 2595 }, { "epoch": 3.30805989168525, "grad_norm": 0.298151498445111, "learning_rate": 6.181394100522286e-06, "loss": 0.0325, "step": 2596 }, { "epoch": 3.3093341828607836, "grad_norm": 0.2789000399152283, "learning_rate": 6.173165676349103e-06, "loss": 0.0299, "step": 2597 }, { "epoch": 3.310608474036317, "grad_norm": 0.30296736164618526, "learning_rate": 6.164940286641563e-06, "loss": 0.039, "step": 2598 }, { "epoch": 3.311882765211851, "grad_norm": 0.27444752095363006, "learning_rate": 6.156717937921941e-06, "loss": 0.0289, "step": 2599 }, { "epoch": 3.3131570563873844, "grad_norm": 0.30441985401690863, "learning_rate": 6.148498636710092e-06, "loss": 0.0406, "step": 2600 }, { "epoch": 3.314431347562918, "grad_norm": 0.2985552419555918, "learning_rate": 6.140282389523472e-06, "loss": 0.0337, "step": 2601 }, { "epoch": 3.315705638738452, "grad_norm": 0.31385986421364365, "learning_rate": 6.132069202877096e-06, "loss": 0.037, "step": 2602 }, { "epoch": 3.3169799299139853, "grad_norm": 0.2762141522226466, "learning_rate": 6.123859083283571e-06, "loss": 0.0303, "step": 2603 }, { "epoch": 3.3182542210895187, "grad_norm": 0.2823558538489269, "learning_rate": 6.115652037253054e-06, "loss": 0.035, "step": 2604 }, { "epoch": 3.3195285122650526, "grad_norm": 0.2888679073352574, "learning_rate": 6.107448071293278e-06, "loss": 0.0367, "step": 2605 }, { "epoch": 3.320802803440586, "grad_norm": 0.28153437648367236, "learning_rate": 6.099247191909532e-06, "loss": 0.0363, "step": 2606 }, { "epoch": 3.3220770946161196, "grad_norm": 0.28354950348804664, "learning_rate": 6.091049405604649e-06, "loss": 0.0347, "step": 2607 }, { "epoch": 3.3233513857916535, "grad_norm": 0.27695669236062614, "learning_rate": 6.082854718879021e-06, "loss": 0.0322, "step": 2608 }, { "epoch": 3.324625676967187, "grad_norm": 0.2846933309719446, "learning_rate": 6.074663138230571e-06, "loss": 0.0349, "step": 2609 }, { "epoch": 3.3258999681427204, "grad_norm": 0.26465562565016854, "learning_rate": 6.066474670154767e-06, "loss": 0.0265, "step": 2610 }, { "epoch": 3.3271742593182543, "grad_norm": 0.2873758143106062, "learning_rate": 6.058289321144608e-06, "loss": 0.0345, "step": 2611 }, { "epoch": 3.3284485504937877, "grad_norm": 0.28538849851050146, "learning_rate": 6.050107097690615e-06, "loss": 0.0396, "step": 2612 }, { "epoch": 3.329722841669321, "grad_norm": 0.2800188284028141, "learning_rate": 6.041928006280835e-06, "loss": 0.0394, "step": 2613 }, { "epoch": 3.330997132844855, "grad_norm": 0.2867826245183348, "learning_rate": 6.03375205340083e-06, "loss": 0.0405, "step": 2614 }, { "epoch": 3.3322714240203886, "grad_norm": 0.2671875649957764, "learning_rate": 6.0255792455336735e-06, "loss": 0.0281, "step": 2615 }, { "epoch": 3.333545715195922, "grad_norm": 0.27807647101838767, "learning_rate": 6.017409589159946e-06, "loss": 0.0306, "step": 2616 }, { "epoch": 3.334820006371456, "grad_norm": 0.2908026332929906, "learning_rate": 6.009243090757724e-06, "loss": 0.0316, "step": 2617 }, { "epoch": 3.3360942975469894, "grad_norm": 0.2900180480188803, "learning_rate": 6.001079756802592e-06, "loss": 0.0298, "step": 2618 }, { "epoch": 3.3373685887225233, "grad_norm": 0.30448952507376276, "learning_rate": 5.99291959376761e-06, "loss": 0.0373, "step": 2619 }, { "epoch": 3.3386428798980567, "grad_norm": 0.26938112950914117, "learning_rate": 5.984762608123337e-06, "loss": 0.0255, "step": 2620 }, { "epoch": 3.33991717107359, "grad_norm": 0.27302172704489974, "learning_rate": 5.976608806337799e-06, "loss": 0.0337, "step": 2621 }, { "epoch": 3.341191462249124, "grad_norm": 0.2992692945728911, "learning_rate": 5.96845819487651e-06, "loss": 0.0413, "step": 2622 }, { "epoch": 3.3424657534246576, "grad_norm": 0.29589462925551946, "learning_rate": 5.960310780202452e-06, "loss": 0.0375, "step": 2623 }, { "epoch": 3.343740044600191, "grad_norm": 0.28369712705980854, "learning_rate": 5.952166568776062e-06, "loss": 0.0357, "step": 2624 }, { "epoch": 3.345014335775725, "grad_norm": 0.28368935779955284, "learning_rate": 5.944025567055251e-06, "loss": 0.029, "step": 2625 }, { "epoch": 3.3462886269512584, "grad_norm": 0.30233642775753417, "learning_rate": 5.935887781495373e-06, "loss": 0.0391, "step": 2626 }, { "epoch": 3.347562918126792, "grad_norm": 0.30360138730390923, "learning_rate": 5.927753218549241e-06, "loss": 0.0381, "step": 2627 }, { "epoch": 3.3488372093023258, "grad_norm": 0.2679577927764333, "learning_rate": 5.919621884667104e-06, "loss": 0.0327, "step": 2628 }, { "epoch": 3.350111500477859, "grad_norm": 0.26693107848869013, "learning_rate": 5.911493786296658e-06, "loss": 0.0302, "step": 2629 }, { "epoch": 3.3513857916533927, "grad_norm": 0.27462282511563013, "learning_rate": 5.903368929883033e-06, "loss": 0.031, "step": 2630 }, { "epoch": 3.3526600828289266, "grad_norm": 0.26277891173986245, "learning_rate": 5.89524732186878e-06, "loss": 0.0257, "step": 2631 }, { "epoch": 3.35393437400446, "grad_norm": 0.28345042742176707, "learning_rate": 5.887128968693887e-06, "loss": 0.0311, "step": 2632 }, { "epoch": 3.3552086651799935, "grad_norm": 0.3009051544505191, "learning_rate": 5.879013876795745e-06, "loss": 0.0319, "step": 2633 }, { "epoch": 3.3564829563555274, "grad_norm": 0.27399013727110894, "learning_rate": 5.8709020526091795e-06, "loss": 0.0345, "step": 2634 }, { "epoch": 3.357757247531061, "grad_norm": 0.2892253877163521, "learning_rate": 5.862793502566402e-06, "loss": 0.0281, "step": 2635 }, { "epoch": 3.3590315387065943, "grad_norm": 0.30906207687280646, "learning_rate": 5.8546882330970454e-06, "loss": 0.0353, "step": 2636 }, { "epoch": 3.3603058298821282, "grad_norm": 0.28070539424017965, "learning_rate": 5.8465862506281376e-06, "loss": 0.0306, "step": 2637 }, { "epoch": 3.3615801210576617, "grad_norm": 0.2761339333878904, "learning_rate": 5.838487561584092e-06, "loss": 0.0299, "step": 2638 }, { "epoch": 3.362854412233195, "grad_norm": 0.2761388684965611, "learning_rate": 5.830392172386723e-06, "loss": 0.0342, "step": 2639 }, { "epoch": 3.364128703408729, "grad_norm": 0.320854190302929, "learning_rate": 5.822300089455211e-06, "loss": 0.0463, "step": 2640 }, { "epoch": 3.3654029945842625, "grad_norm": 0.28020052138400864, "learning_rate": 5.814211319206133e-06, "loss": 0.0311, "step": 2641 }, { "epoch": 3.366677285759796, "grad_norm": 0.27901073411096156, "learning_rate": 5.806125868053433e-06, "loss": 0.0282, "step": 2642 }, { "epoch": 3.36795157693533, "grad_norm": 0.27078773497014547, "learning_rate": 5.798043742408417e-06, "loss": 0.0295, "step": 2643 }, { "epoch": 3.3692258681108633, "grad_norm": 0.29691837109576164, "learning_rate": 5.789964948679761e-06, "loss": 0.0342, "step": 2644 }, { "epoch": 3.370500159286397, "grad_norm": 0.2839824248131987, "learning_rate": 5.781889493273496e-06, "loss": 0.0341, "step": 2645 }, { "epoch": 3.3717744504619307, "grad_norm": 0.2896691453884496, "learning_rate": 5.773817382593008e-06, "loss": 0.037, "step": 2646 }, { "epoch": 3.373048741637464, "grad_norm": 0.27468696313544433, "learning_rate": 5.765748623039027e-06, "loss": 0.0346, "step": 2647 }, { "epoch": 3.3743230328129976, "grad_norm": 0.2953746373886004, "learning_rate": 5.757683221009625e-06, "loss": 0.0322, "step": 2648 }, { "epoch": 3.3755973239885315, "grad_norm": 0.30114522054703746, "learning_rate": 5.749621182900228e-06, "loss": 0.0375, "step": 2649 }, { "epoch": 3.376871615164065, "grad_norm": 0.265341516257212, "learning_rate": 5.741562515103565e-06, "loss": 0.0319, "step": 2650 }, { "epoch": 3.3781459063395984, "grad_norm": 0.2722744925274297, "learning_rate": 5.733507224009723e-06, "loss": 0.0337, "step": 2651 }, { "epoch": 3.3794201975151323, "grad_norm": 0.2963254509894029, "learning_rate": 5.725455316006084e-06, "loss": 0.042, "step": 2652 }, { "epoch": 3.380694488690666, "grad_norm": 0.2761133722304446, "learning_rate": 5.717406797477371e-06, "loss": 0.028, "step": 2653 }, { "epoch": 3.3819687798661993, "grad_norm": 0.3090729556986827, "learning_rate": 5.709361674805608e-06, "loss": 0.0377, "step": 2654 }, { "epoch": 3.383243071041733, "grad_norm": 0.28536467146677397, "learning_rate": 5.701319954370124e-06, "loss": 0.0352, "step": 2655 }, { "epoch": 3.3845173622172666, "grad_norm": 0.279197497731106, "learning_rate": 5.6932816425475554e-06, "loss": 0.0268, "step": 2656 }, { "epoch": 3.3857916533928, "grad_norm": 0.27841467417554133, "learning_rate": 5.6852467457118345e-06, "loss": 0.0417, "step": 2657 }, { "epoch": 3.387065944568334, "grad_norm": 0.30466509535906905, "learning_rate": 5.677215270234183e-06, "loss": 0.034, "step": 2658 }, { "epoch": 3.3883402357438674, "grad_norm": 0.2767499408278351, "learning_rate": 5.669187222483115e-06, "loss": 0.0304, "step": 2659 }, { "epoch": 3.389614526919401, "grad_norm": 0.2750776818065855, "learning_rate": 5.66116260882442e-06, "loss": 0.0274, "step": 2660 }, { "epoch": 3.390888818094935, "grad_norm": 0.2765731021933583, "learning_rate": 5.65314143562117e-06, "loss": 0.0293, "step": 2661 }, { "epoch": 3.3921631092704683, "grad_norm": 0.28754881006342664, "learning_rate": 5.645123709233707e-06, "loss": 0.0325, "step": 2662 }, { "epoch": 3.3934374004460017, "grad_norm": 0.27265748218952673, "learning_rate": 5.637109436019639e-06, "loss": 0.0297, "step": 2663 }, { "epoch": 3.3947116916215356, "grad_norm": 0.3145711952749484, "learning_rate": 5.629098622333837e-06, "loss": 0.0437, "step": 2664 }, { "epoch": 3.395985982797069, "grad_norm": 0.29468353440086764, "learning_rate": 5.62109127452843e-06, "loss": 0.0311, "step": 2665 }, { "epoch": 3.3972602739726026, "grad_norm": 0.2962854996995033, "learning_rate": 5.6130873989527925e-06, "loss": 0.0412, "step": 2666 }, { "epoch": 3.3985345651481365, "grad_norm": 0.2735092654330064, "learning_rate": 5.6050870019535496e-06, "loss": 0.0311, "step": 2667 }, { "epoch": 3.39980885632367, "grad_norm": 0.2833417412695173, "learning_rate": 5.59709008987458e-06, "loss": 0.0315, "step": 2668 }, { "epoch": 3.4010831474992034, "grad_norm": 0.28805677468920354, "learning_rate": 5.589096669056972e-06, "loss": 0.0359, "step": 2669 }, { "epoch": 3.4023574386747373, "grad_norm": 0.2812267569261894, "learning_rate": 5.5811067458390785e-06, "loss": 0.0338, "step": 2670 }, { "epoch": 3.4036317298502707, "grad_norm": 0.2733599997604341, "learning_rate": 5.573120326556445e-06, "loss": 0.0323, "step": 2671 }, { "epoch": 3.4049060210258046, "grad_norm": 0.27552158437869273, "learning_rate": 5.565137417541866e-06, "loss": 0.0344, "step": 2672 }, { "epoch": 3.406180312201338, "grad_norm": 0.2951444424534809, "learning_rate": 5.55715802512534e-06, "loss": 0.0451, "step": 2673 }, { "epoch": 3.4074546033768716, "grad_norm": 0.2756893300164202, "learning_rate": 5.549182155634076e-06, "loss": 0.0321, "step": 2674 }, { "epoch": 3.4087288945524055, "grad_norm": 0.2841178677555785, "learning_rate": 5.5412098153924966e-06, "loss": 0.0337, "step": 2675 }, { "epoch": 3.410003185727939, "grad_norm": 0.2700272382410671, "learning_rate": 5.533241010722219e-06, "loss": 0.0363, "step": 2676 }, { "epoch": 3.4112774769034724, "grad_norm": 0.29254627711053727, "learning_rate": 5.525275747942063e-06, "loss": 0.037, "step": 2677 }, { "epoch": 3.4125517680790063, "grad_norm": 0.2900386532751595, "learning_rate": 5.517314033368031e-06, "loss": 0.0343, "step": 2678 }, { "epoch": 3.4138260592545397, "grad_norm": 0.2710774691233637, "learning_rate": 5.509355873313318e-06, "loss": 0.0377, "step": 2679 }, { "epoch": 3.415100350430073, "grad_norm": 0.2807912380189886, "learning_rate": 5.501401274088311e-06, "loss": 0.0363, "step": 2680 }, { "epoch": 3.416374641605607, "grad_norm": 0.287238787995564, "learning_rate": 5.493450242000546e-06, "loss": 0.0364, "step": 2681 }, { "epoch": 3.4176489327811406, "grad_norm": 0.2791766643115219, "learning_rate": 5.4855027833547635e-06, "loss": 0.0291, "step": 2682 }, { "epoch": 3.418923223956674, "grad_norm": 0.274660765196115, "learning_rate": 5.477558904452836e-06, "loss": 0.0327, "step": 2683 }, { "epoch": 3.420197515132208, "grad_norm": 0.27888785539245303, "learning_rate": 5.46961861159383e-06, "loss": 0.0305, "step": 2684 }, { "epoch": 3.4214718063077414, "grad_norm": 0.2702974682948982, "learning_rate": 5.4616819110739475e-06, "loss": 0.0291, "step": 2685 }, { "epoch": 3.422746097483275, "grad_norm": 0.3137766998433384, "learning_rate": 5.453748809186548e-06, "loss": 0.0343, "step": 2686 }, { "epoch": 3.4240203886588088, "grad_norm": 0.2713227943997949, "learning_rate": 5.4458193122221405e-06, "loss": 0.0286, "step": 2687 }, { "epoch": 3.425294679834342, "grad_norm": 0.282717717570652, "learning_rate": 5.43789342646837e-06, "loss": 0.0335, "step": 2688 }, { "epoch": 3.4265689710098757, "grad_norm": 0.27019991702776397, "learning_rate": 5.429971158210024e-06, "loss": 0.0277, "step": 2689 }, { "epoch": 3.4278432621854096, "grad_norm": 0.27763957262848243, "learning_rate": 5.422052513729013e-06, "loss": 0.0283, "step": 2690 }, { "epoch": 3.429117553360943, "grad_norm": 0.2956608281288841, "learning_rate": 5.414137499304382e-06, "loss": 0.0296, "step": 2691 }, { "epoch": 3.4303918445364765, "grad_norm": 0.2898693260643239, "learning_rate": 5.406226121212297e-06, "loss": 0.0362, "step": 2692 }, { "epoch": 3.4316661357120104, "grad_norm": 0.2842535779040862, "learning_rate": 5.398318385726036e-06, "loss": 0.033, "step": 2693 }, { "epoch": 3.432940426887544, "grad_norm": 0.3160345658077761, "learning_rate": 5.390414299115991e-06, "loss": 0.0422, "step": 2694 }, { "epoch": 3.4342147180630773, "grad_norm": 0.2989620760132331, "learning_rate": 5.382513867649663e-06, "loss": 0.0334, "step": 2695 }, { "epoch": 3.4354890092386112, "grad_norm": 0.30302676003426865, "learning_rate": 5.37461709759165e-06, "loss": 0.0348, "step": 2696 }, { "epoch": 3.4367633004141447, "grad_norm": 0.2878253461398174, "learning_rate": 5.3667239952036484e-06, "loss": 0.0335, "step": 2697 }, { "epoch": 3.438037591589678, "grad_norm": 0.29132465144756853, "learning_rate": 5.358834566744447e-06, "loss": 0.0373, "step": 2698 }, { "epoch": 3.439311882765212, "grad_norm": 0.28374853510223186, "learning_rate": 5.35094881846993e-06, "loss": 0.0316, "step": 2699 }, { "epoch": 3.4405861739407455, "grad_norm": 0.2874251437914817, "learning_rate": 5.34306675663304e-06, "loss": 0.0285, "step": 2700 }, { "epoch": 3.441860465116279, "grad_norm": 0.2789813622918718, "learning_rate": 5.335188387483825e-06, "loss": 0.0297, "step": 2701 }, { "epoch": 3.443134756291813, "grad_norm": 0.27784049233390173, "learning_rate": 5.32731371726938e-06, "loss": 0.0321, "step": 2702 }, { "epoch": 3.4444090474673463, "grad_norm": 0.28521189096992977, "learning_rate": 5.3194427522338865e-06, "loss": 0.036, "step": 2703 }, { "epoch": 3.44568333864288, "grad_norm": 0.2899337420497098, "learning_rate": 5.311575498618577e-06, "loss": 0.0356, "step": 2704 }, { "epoch": 3.4469576298184137, "grad_norm": 0.28652696046870446, "learning_rate": 5.303711962661744e-06, "loss": 0.032, "step": 2705 }, { "epoch": 3.448231920993947, "grad_norm": 0.33206689690680974, "learning_rate": 5.29585215059873e-06, "loss": 0.0377, "step": 2706 }, { "epoch": 3.4495062121694806, "grad_norm": 0.28851005523013284, "learning_rate": 5.287996068661927e-06, "loss": 0.0328, "step": 2707 }, { "epoch": 3.4507805033450145, "grad_norm": 0.29755415892597153, "learning_rate": 5.2801437230807705e-06, "loss": 0.0372, "step": 2708 }, { "epoch": 3.452054794520548, "grad_norm": 0.27211965612634004, "learning_rate": 5.2722951200817315e-06, "loss": 0.0303, "step": 2709 }, { "epoch": 3.4533290856960814, "grad_norm": 0.2806388171673152, "learning_rate": 5.264450265888311e-06, "loss": 0.0389, "step": 2710 }, { "epoch": 3.4546033768716153, "grad_norm": 0.27704633716505306, "learning_rate": 5.25660916672104e-06, "loss": 0.033, "step": 2711 }, { "epoch": 3.455877668047149, "grad_norm": 0.2896112459351967, "learning_rate": 5.248771828797474e-06, "loss": 0.0313, "step": 2712 }, { "epoch": 3.4571519592226823, "grad_norm": 0.26965115599789047, "learning_rate": 5.240938258332183e-06, "loss": 0.0281, "step": 2713 }, { "epoch": 3.458426250398216, "grad_norm": 0.2923413254097372, "learning_rate": 5.233108461536749e-06, "loss": 0.0344, "step": 2714 }, { "epoch": 3.4597005415737496, "grad_norm": 0.26858960877497434, "learning_rate": 5.225282444619764e-06, "loss": 0.0308, "step": 2715 }, { "epoch": 3.460974832749283, "grad_norm": 0.31586708690114124, "learning_rate": 5.217460213786822e-06, "loss": 0.0355, "step": 2716 }, { "epoch": 3.462249123924817, "grad_norm": 0.2876316311019967, "learning_rate": 5.209641775240508e-06, "loss": 0.0375, "step": 2717 }, { "epoch": 3.4635234151003504, "grad_norm": 0.28047389082996965, "learning_rate": 5.201827135180424e-06, "loss": 0.0287, "step": 2718 }, { "epoch": 3.464797706275884, "grad_norm": 0.2996038416289805, "learning_rate": 5.194016299803122e-06, "loss": 0.039, "step": 2719 }, { "epoch": 3.466071997451418, "grad_norm": 0.2874374987479985, "learning_rate": 5.186209275302175e-06, "loss": 0.0347, "step": 2720 }, { "epoch": 3.4673462886269513, "grad_norm": 0.27258761413756166, "learning_rate": 5.178406067868102e-06, "loss": 0.0333, "step": 2721 }, { "epoch": 3.4686205798024847, "grad_norm": 0.28987934638993323, "learning_rate": 5.170606683688421e-06, "loss": 0.0333, "step": 2722 }, { "epoch": 3.4698948709780186, "grad_norm": 0.2671441862069403, "learning_rate": 5.1628111289476025e-06, "loss": 0.0276, "step": 2723 }, { "epoch": 3.471169162153552, "grad_norm": 0.2710877310606162, "learning_rate": 5.155019409827089e-06, "loss": 0.0278, "step": 2724 }, { "epoch": 3.4724434533290855, "grad_norm": 0.26849375235559614, "learning_rate": 5.147231532505275e-06, "loss": 0.0292, "step": 2725 }, { "epoch": 3.4737177445046195, "grad_norm": 0.27329926597502985, "learning_rate": 5.139447503157513e-06, "loss": 0.0307, "step": 2726 }, { "epoch": 3.474992035680153, "grad_norm": 0.2810341568949425, "learning_rate": 5.131667327956102e-06, "loss": 0.0336, "step": 2727 }, { "epoch": 3.4762663268556864, "grad_norm": 0.29317977037120724, "learning_rate": 5.123891013070288e-06, "loss": 0.0389, "step": 2728 }, { "epoch": 3.4775406180312203, "grad_norm": 0.28937663840920325, "learning_rate": 5.116118564666247e-06, "loss": 0.0313, "step": 2729 }, { "epoch": 3.4788149092067537, "grad_norm": 0.28136843488686175, "learning_rate": 5.108349988907111e-06, "loss": 0.0367, "step": 2730 }, { "epoch": 3.480089200382287, "grad_norm": 0.28739959635996354, "learning_rate": 5.100585291952908e-06, "loss": 0.0342, "step": 2731 }, { "epoch": 3.481363491557821, "grad_norm": 0.26056492552203, "learning_rate": 5.092824479960625e-06, "loss": 0.0266, "step": 2732 }, { "epoch": 3.4826377827333546, "grad_norm": 0.2865651602432039, "learning_rate": 5.085067559084136e-06, "loss": 0.0313, "step": 2733 }, { "epoch": 3.483912073908888, "grad_norm": 0.28625245530685395, "learning_rate": 5.077314535474258e-06, "loss": 0.037, "step": 2734 }, { "epoch": 3.485186365084422, "grad_norm": 0.281467594797991, "learning_rate": 5.069565415278701e-06, "loss": 0.0315, "step": 2735 }, { "epoch": 3.4864606562599554, "grad_norm": 0.2671066596186284, "learning_rate": 5.061820204642085e-06, "loss": 0.0272, "step": 2736 }, { "epoch": 3.487734947435489, "grad_norm": 0.2791266833380361, "learning_rate": 5.054078909705926e-06, "loss": 0.0298, "step": 2737 }, { "epoch": 3.4890092386110227, "grad_norm": 0.29443545690231543, "learning_rate": 5.046341536608641e-06, "loss": 0.0358, "step": 2738 }, { "epoch": 3.490283529786556, "grad_norm": 0.28628582513479395, "learning_rate": 5.038608091485534e-06, "loss": 0.031, "step": 2739 }, { "epoch": 3.4915578209620897, "grad_norm": 0.29713448778534607, "learning_rate": 5.030878580468793e-06, "loss": 0.0293, "step": 2740 }, { "epoch": 3.4928321121376236, "grad_norm": 0.290522787731622, "learning_rate": 5.023153009687489e-06, "loss": 0.0338, "step": 2741 }, { "epoch": 3.494106403313157, "grad_norm": 0.3021301466938592, "learning_rate": 5.015431385267566e-06, "loss": 0.0375, "step": 2742 }, { "epoch": 3.4953806944886905, "grad_norm": 0.2797353761597556, "learning_rate": 5.007713713331844e-06, "loss": 0.0312, "step": 2743 }, { "epoch": 3.4966549856642244, "grad_norm": 0.29630084349011515, "learning_rate": 5.000000000000003e-06, "loss": 0.0375, "step": 2744 }, { "epoch": 3.497929276839758, "grad_norm": 0.2800956482046207, "learning_rate": 4.992290251388585e-06, "loss": 0.0374, "step": 2745 }, { "epoch": 3.4992035680152913, "grad_norm": 0.2578917547496444, "learning_rate": 4.9845844736109915e-06, "loss": 0.0259, "step": 2746 }, { "epoch": 3.500477859190825, "grad_norm": 0.2591209970575524, "learning_rate": 4.976882672777474e-06, "loss": 0.028, "step": 2747 }, { "epoch": 3.5017521503663587, "grad_norm": 0.2683225965469169, "learning_rate": 4.969184854995125e-06, "loss": 0.0298, "step": 2748 }, { "epoch": 3.503026441541892, "grad_norm": 0.28034573352991665, "learning_rate": 4.961491026367894e-06, "loss": 0.0317, "step": 2749 }, { "epoch": 3.504300732717426, "grad_norm": 0.27137665590444493, "learning_rate": 4.9538011929965436e-06, "loss": 0.0266, "step": 2750 }, { "epoch": 3.5055750238929595, "grad_norm": 0.27744445929889255, "learning_rate": 4.946115360978696e-06, "loss": 0.0367, "step": 2751 }, { "epoch": 3.506849315068493, "grad_norm": 0.2782818548745389, "learning_rate": 4.938433536408771e-06, "loss": 0.0345, "step": 2752 }, { "epoch": 3.508123606244027, "grad_norm": 0.29164332581968555, "learning_rate": 4.930755725378038e-06, "loss": 0.037, "step": 2753 }, { "epoch": 3.5093978974195603, "grad_norm": 0.27768779841029817, "learning_rate": 4.923081933974566e-06, "loss": 0.0334, "step": 2754 }, { "epoch": 3.510672188595094, "grad_norm": 0.2795833606442896, "learning_rate": 4.915412168283246e-06, "loss": 0.0305, "step": 2755 }, { "epoch": 3.5119464797706277, "grad_norm": 0.2771423721933488, "learning_rate": 4.9077464343857694e-06, "loss": 0.0287, "step": 2756 }, { "epoch": 3.513220770946161, "grad_norm": 0.2674403847681032, "learning_rate": 4.900084738360637e-06, "loss": 0.0302, "step": 2757 }, { "epoch": 3.5144950621216946, "grad_norm": 0.2705205916006833, "learning_rate": 4.892427086283147e-06, "loss": 0.0278, "step": 2758 }, { "epoch": 3.5157693532972285, "grad_norm": 0.29348091895035683, "learning_rate": 4.884773484225385e-06, "loss": 0.0345, "step": 2759 }, { "epoch": 3.517043644472762, "grad_norm": 0.290869794039887, "learning_rate": 4.877123938256229e-06, "loss": 0.0338, "step": 2760 }, { "epoch": 3.5183179356482954, "grad_norm": 0.26070332884348846, "learning_rate": 4.86947845444135e-06, "loss": 0.0277, "step": 2761 }, { "epoch": 3.5195922268238293, "grad_norm": 0.27352499728434804, "learning_rate": 4.861837038843177e-06, "loss": 0.0344, "step": 2762 }, { "epoch": 3.520866517999363, "grad_norm": 0.2670940126820318, "learning_rate": 4.854199697520937e-06, "loss": 0.0293, "step": 2763 }, { "epoch": 3.5221408091748962, "grad_norm": 0.3048762615055129, "learning_rate": 4.846566436530603e-06, "loss": 0.0413, "step": 2764 }, { "epoch": 3.52341510035043, "grad_norm": 0.2914245831635303, "learning_rate": 4.838937261924933e-06, "loss": 0.0329, "step": 2765 }, { "epoch": 3.5246893915259636, "grad_norm": 0.2723289164877737, "learning_rate": 4.831312179753433e-06, "loss": 0.0275, "step": 2766 }, { "epoch": 3.525963682701497, "grad_norm": 0.29920195683945516, "learning_rate": 4.823691196062368e-06, "loss": 0.0327, "step": 2767 }, { "epoch": 3.527237973877031, "grad_norm": 0.2712259692426145, "learning_rate": 4.81607431689475e-06, "loss": 0.0277, "step": 2768 }, { "epoch": 3.5285122650525644, "grad_norm": 0.36463160671064143, "learning_rate": 4.80846154829034e-06, "loss": 0.0399, "step": 2769 }, { "epoch": 3.529786556228098, "grad_norm": 0.3060012908128444, "learning_rate": 4.800852896285635e-06, "loss": 0.037, "step": 2770 }, { "epoch": 3.531060847403632, "grad_norm": 0.2836418092274033, "learning_rate": 4.793248366913873e-06, "loss": 0.0304, "step": 2771 }, { "epoch": 3.5323351385791653, "grad_norm": 0.3131404801054386, "learning_rate": 4.78564796620502e-06, "loss": 0.039, "step": 2772 }, { "epoch": 3.5336094297546987, "grad_norm": 0.2747486152165612, "learning_rate": 4.778051700185769e-06, "loss": 0.0329, "step": 2773 }, { "epoch": 3.5348837209302326, "grad_norm": 0.26329860290998236, "learning_rate": 4.770459574879536e-06, "loss": 0.0259, "step": 2774 }, { "epoch": 3.536158012105766, "grad_norm": 0.2912145459523636, "learning_rate": 4.762871596306448e-06, "loss": 0.0347, "step": 2775 }, { "epoch": 3.5374323032812995, "grad_norm": 0.2902124591096066, "learning_rate": 4.755287770483349e-06, "loss": 0.0369, "step": 2776 }, { "epoch": 3.5387065944568334, "grad_norm": 0.2877911466067013, "learning_rate": 4.747708103423791e-06, "loss": 0.0294, "step": 2777 }, { "epoch": 3.539980885632367, "grad_norm": 0.2805344174388197, "learning_rate": 4.740132601138023e-06, "loss": 0.0311, "step": 2778 }, { "epoch": 3.5412551768079004, "grad_norm": 0.2615420104917065, "learning_rate": 4.732561269632992e-06, "loss": 0.0252, "step": 2779 }, { "epoch": 3.5425294679834343, "grad_norm": 0.27934343500603387, "learning_rate": 4.7249941149123545e-06, "loss": 0.0353, "step": 2780 }, { "epoch": 3.5438037591589677, "grad_norm": 0.2833058470804149, "learning_rate": 4.717431142976423e-06, "loss": 0.0347, "step": 2781 }, { "epoch": 3.545078050334501, "grad_norm": 0.2677735218859445, "learning_rate": 4.709872359822227e-06, "loss": 0.0292, "step": 2782 }, { "epoch": 3.546352341510035, "grad_norm": 0.2718960559037918, "learning_rate": 4.7023177714434455e-06, "loss": 0.0297, "step": 2783 }, { "epoch": 3.5476266326855685, "grad_norm": 0.253669390476054, "learning_rate": 4.694767383830453e-06, "loss": 0.0249, "step": 2784 }, { "epoch": 3.548900923861102, "grad_norm": 0.29044677668387847, "learning_rate": 4.687221202970283e-06, "loss": 0.0371, "step": 2785 }, { "epoch": 3.550175215036636, "grad_norm": 0.2889050632454083, "learning_rate": 4.679679234846636e-06, "loss": 0.0307, "step": 2786 }, { "epoch": 3.5514495062121694, "grad_norm": 0.26337839499592275, "learning_rate": 4.672141485439867e-06, "loss": 0.023, "step": 2787 }, { "epoch": 3.552723797387703, "grad_norm": 0.29058128727768756, "learning_rate": 4.664607960726994e-06, "loss": 0.0382, "step": 2788 }, { "epoch": 3.5539980885632367, "grad_norm": 0.30078343172144917, "learning_rate": 4.65707866668168e-06, "loss": 0.0462, "step": 2789 }, { "epoch": 3.55527237973877, "grad_norm": 0.27597770970730634, "learning_rate": 4.649553609274231e-06, "loss": 0.0299, "step": 2790 }, { "epoch": 3.5565466709143037, "grad_norm": 0.2832145236713277, "learning_rate": 4.642032794471597e-06, "loss": 0.0351, "step": 2791 }, { "epoch": 3.5578209620898376, "grad_norm": 0.28267491073114165, "learning_rate": 4.634516228237372e-06, "loss": 0.032, "step": 2792 }, { "epoch": 3.559095253265371, "grad_norm": 0.2799245716709409, "learning_rate": 4.627003916531761e-06, "loss": 0.0332, "step": 2793 }, { "epoch": 3.5603695444409045, "grad_norm": 0.2806916813903494, "learning_rate": 4.6194958653116185e-06, "loss": 0.0342, "step": 2794 }, { "epoch": 3.5616438356164384, "grad_norm": 0.29112750030653894, "learning_rate": 4.611992080530396e-06, "loss": 0.0339, "step": 2795 }, { "epoch": 3.562918126791972, "grad_norm": 0.3011496110446059, "learning_rate": 4.6044925681381865e-06, "loss": 0.0366, "step": 2796 }, { "epoch": 3.5641924179675053, "grad_norm": 0.2629905186507639, "learning_rate": 4.596997334081681e-06, "loss": 0.0297, "step": 2797 }, { "epoch": 3.565466709143039, "grad_norm": 0.3030878153338874, "learning_rate": 4.58950638430418e-06, "loss": 0.0349, "step": 2798 }, { "epoch": 3.5667410003185727, "grad_norm": 0.27124181942914183, "learning_rate": 4.5820197247455875e-06, "loss": 0.0264, "step": 2799 }, { "epoch": 3.5680152914941066, "grad_norm": 0.28265389682584724, "learning_rate": 4.5745373613424075e-06, "loss": 0.0295, "step": 2800 }, { "epoch": 3.56928958266964, "grad_norm": 0.2704145489742213, "learning_rate": 4.567059300027733e-06, "loss": 0.0326, "step": 2801 }, { "epoch": 3.5705638738451735, "grad_norm": 0.28635548533074023, "learning_rate": 4.559585546731251e-06, "loss": 0.0286, "step": 2802 }, { "epoch": 3.5718381650207074, "grad_norm": 0.27604257402793403, "learning_rate": 4.552116107379229e-06, "loss": 0.0319, "step": 2803 }, { "epoch": 3.573112456196241, "grad_norm": 0.2935900975228814, "learning_rate": 4.544650987894514e-06, "loss": 0.033, "step": 2804 }, { "epoch": 3.5743867473717743, "grad_norm": 0.27229682364078767, "learning_rate": 4.537190194196531e-06, "loss": 0.0323, "step": 2805 }, { "epoch": 3.575661038547308, "grad_norm": 0.27668616841033095, "learning_rate": 4.52973373220127e-06, "loss": 0.0395, "step": 2806 }, { "epoch": 3.5769353297228417, "grad_norm": 0.30101660114003165, "learning_rate": 4.522281607821288e-06, "loss": 0.0378, "step": 2807 }, { "epoch": 3.578209620898375, "grad_norm": 0.2960173809971566, "learning_rate": 4.514833826965706e-06, "loss": 0.0374, "step": 2808 }, { "epoch": 3.579483912073909, "grad_norm": 0.29918651768204163, "learning_rate": 4.507390395540193e-06, "loss": 0.0367, "step": 2809 }, { "epoch": 3.5807582032494425, "grad_norm": 0.2742542077934481, "learning_rate": 4.4999513194469744e-06, "loss": 0.0289, "step": 2810 }, { "epoch": 3.582032494424976, "grad_norm": 0.29305886155580463, "learning_rate": 4.492516604584831e-06, "loss": 0.038, "step": 2811 }, { "epoch": 3.58330678560051, "grad_norm": 0.28643277114329513, "learning_rate": 4.4850862568490605e-06, "loss": 0.0342, "step": 2812 }, { "epoch": 3.5845810767760433, "grad_norm": 0.27902455969402434, "learning_rate": 4.47766028213153e-06, "loss": 0.0308, "step": 2813 }, { "epoch": 3.5858553679515768, "grad_norm": 0.305063115698769, "learning_rate": 4.470238686320606e-06, "loss": 0.035, "step": 2814 }, { "epoch": 3.5871296591271107, "grad_norm": 0.2741536639603262, "learning_rate": 4.46282147530121e-06, "loss": 0.0303, "step": 2815 }, { "epoch": 3.588403950302644, "grad_norm": 0.2863004637318461, "learning_rate": 4.455408654954771e-06, "loss": 0.0361, "step": 2816 }, { "epoch": 3.5896782414781776, "grad_norm": 0.3046459195366541, "learning_rate": 4.448000231159244e-06, "loss": 0.0429, "step": 2817 }, { "epoch": 3.5909525326537115, "grad_norm": 0.28752790012191487, "learning_rate": 4.440596209789093e-06, "loss": 0.0356, "step": 2818 }, { "epoch": 3.592226823829245, "grad_norm": 0.291102570472058, "learning_rate": 4.433196596715292e-06, "loss": 0.0335, "step": 2819 }, { "epoch": 3.593501115004779, "grad_norm": 0.2878378284960005, "learning_rate": 4.4258013978053224e-06, "loss": 0.034, "step": 2820 }, { "epoch": 3.5947754061803123, "grad_norm": 0.26386671714911986, "learning_rate": 4.418410618923163e-06, "loss": 0.0284, "step": 2821 }, { "epoch": 3.596049697355846, "grad_norm": 0.2785814727401307, "learning_rate": 4.411024265929283e-06, "loss": 0.0314, "step": 2822 }, { "epoch": 3.5973239885313797, "grad_norm": 0.29028701629074244, "learning_rate": 4.4036423446806595e-06, "loss": 0.0378, "step": 2823 }, { "epoch": 3.598598279706913, "grad_norm": 0.27139738564534543, "learning_rate": 4.396264861030729e-06, "loss": 0.0342, "step": 2824 }, { "epoch": 3.5998725708824466, "grad_norm": 0.28008129450280905, "learning_rate": 4.3888918208294375e-06, "loss": 0.0328, "step": 2825 }, { "epoch": 3.6011468620579805, "grad_norm": 0.2876620346700433, "learning_rate": 4.3815232299231784e-06, "loss": 0.0356, "step": 2826 }, { "epoch": 3.602421153233514, "grad_norm": 0.2697592385100587, "learning_rate": 4.374159094154845e-06, "loss": 0.0285, "step": 2827 }, { "epoch": 3.6036954444090474, "grad_norm": 0.3155119406232697, "learning_rate": 4.3667994193637794e-06, "loss": 0.0357, "step": 2828 }, { "epoch": 3.6049697355845813, "grad_norm": 0.2677708120731037, "learning_rate": 4.3594442113857935e-06, "loss": 0.0299, "step": 2829 }, { "epoch": 3.606244026760115, "grad_norm": 0.28204467012152473, "learning_rate": 4.352093476053156e-06, "loss": 0.0344, "step": 2830 }, { "epoch": 3.6075183179356483, "grad_norm": 0.2790420807253091, "learning_rate": 4.344747219194589e-06, "loss": 0.0333, "step": 2831 }, { "epoch": 3.608792609111182, "grad_norm": 0.30413127844510374, "learning_rate": 4.337405446635264e-06, "loss": 0.0435, "step": 2832 }, { "epoch": 3.6100669002867156, "grad_norm": 0.28010978152062255, "learning_rate": 4.330068164196795e-06, "loss": 0.0316, "step": 2833 }, { "epoch": 3.611341191462249, "grad_norm": 0.27836502345544956, "learning_rate": 4.322735377697236e-06, "loss": 0.0326, "step": 2834 }, { "epoch": 3.612615482637783, "grad_norm": 0.2697691697726528, "learning_rate": 4.315407092951078e-06, "loss": 0.0298, "step": 2835 }, { "epoch": 3.6138897738133164, "grad_norm": 0.2869227257570003, "learning_rate": 4.3080833157692415e-06, "loss": 0.0364, "step": 2836 }, { "epoch": 3.61516406498885, "grad_norm": 0.281232316013185, "learning_rate": 4.300764051959071e-06, "loss": 0.0371, "step": 2837 }, { "epoch": 3.616438356164384, "grad_norm": 0.2645651608192952, "learning_rate": 4.293449307324334e-06, "loss": 0.0272, "step": 2838 }, { "epoch": 3.6177126473399173, "grad_norm": 0.27821615038171044, "learning_rate": 4.286139087665214e-06, "loss": 0.035, "step": 2839 }, { "epoch": 3.6189869385154507, "grad_norm": 0.2791500043038482, "learning_rate": 4.278833398778306e-06, "loss": 0.0344, "step": 2840 }, { "epoch": 3.6202612296909846, "grad_norm": 0.2751992796822959, "learning_rate": 4.27153224645661e-06, "loss": 0.0298, "step": 2841 }, { "epoch": 3.621535520866518, "grad_norm": 0.29845005607277153, "learning_rate": 4.264235636489542e-06, "loss": 0.0362, "step": 2842 }, { "epoch": 3.6228098120420515, "grad_norm": 0.2887936364055913, "learning_rate": 4.256943574662892e-06, "loss": 0.0364, "step": 2843 }, { "epoch": 3.6240841032175855, "grad_norm": 0.2754686102357569, "learning_rate": 4.249656066758869e-06, "loss": 0.03, "step": 2844 }, { "epoch": 3.625358394393119, "grad_norm": 0.27388430367553235, "learning_rate": 4.242373118556049e-06, "loss": 0.0287, "step": 2845 }, { "epoch": 3.6266326855686524, "grad_norm": 0.2809612421614265, "learning_rate": 4.23509473582941e-06, "loss": 0.0338, "step": 2846 }, { "epoch": 3.6279069767441863, "grad_norm": 0.2939166269968554, "learning_rate": 4.227820924350301e-06, "loss": 0.037, "step": 2847 }, { "epoch": 3.6291812679197197, "grad_norm": 0.281735030005632, "learning_rate": 4.2205516898864465e-06, "loss": 0.0308, "step": 2848 }, { "epoch": 3.630455559095253, "grad_norm": 0.29601883238283583, "learning_rate": 4.213287038201943e-06, "loss": 0.0377, "step": 2849 }, { "epoch": 3.631729850270787, "grad_norm": 0.28996693255053047, "learning_rate": 4.2060269750572545e-06, "loss": 0.0389, "step": 2850 }, { "epoch": 3.6330041414463206, "grad_norm": 0.28490190945195665, "learning_rate": 4.1987715062092046e-06, "loss": 0.0336, "step": 2851 }, { "epoch": 3.634278432621854, "grad_norm": 0.28557400298448576, "learning_rate": 4.191520637410974e-06, "loss": 0.0309, "step": 2852 }, { "epoch": 3.635552723797388, "grad_norm": 0.2826255083467464, "learning_rate": 4.184274374412092e-06, "loss": 0.03, "step": 2853 }, { "epoch": 3.6368270149729214, "grad_norm": 0.2636045390353191, "learning_rate": 4.1770327229584516e-06, "loss": 0.0269, "step": 2854 }, { "epoch": 3.638101306148455, "grad_norm": 0.291107571714573, "learning_rate": 4.169795688792263e-06, "loss": 0.032, "step": 2855 }, { "epoch": 3.6393755973239887, "grad_norm": 0.28809514068949, "learning_rate": 4.162563277652104e-06, "loss": 0.0299, "step": 2856 }, { "epoch": 3.640649888499522, "grad_norm": 0.29162257548037646, "learning_rate": 4.155335495272858e-06, "loss": 0.0314, "step": 2857 }, { "epoch": 3.6419241796750557, "grad_norm": 0.3080484675647144, "learning_rate": 4.148112347385762e-06, "loss": 0.0354, "step": 2858 }, { "epoch": 3.6431984708505896, "grad_norm": 0.27901794062887264, "learning_rate": 4.140893839718368e-06, "loss": 0.0348, "step": 2859 }, { "epoch": 3.644472762026123, "grad_norm": 0.27429214312269756, "learning_rate": 4.133679977994543e-06, "loss": 0.0297, "step": 2860 }, { "epoch": 3.6457470532016565, "grad_norm": 0.28046204683249937, "learning_rate": 4.126470767934483e-06, "loss": 0.0329, "step": 2861 }, { "epoch": 3.6470213443771904, "grad_norm": 0.26903615627278904, "learning_rate": 4.119266215254684e-06, "loss": 0.0296, "step": 2862 }, { "epoch": 3.648295635552724, "grad_norm": 0.2875342544882768, "learning_rate": 4.112066325667954e-06, "loss": 0.0332, "step": 2863 }, { "epoch": 3.6495699267282573, "grad_norm": 0.2744063078645399, "learning_rate": 4.104871104883403e-06, "loss": 0.0347, "step": 2864 }, { "epoch": 3.650844217903791, "grad_norm": 0.2821372643349482, "learning_rate": 4.09768055860644e-06, "loss": 0.0361, "step": 2865 }, { "epoch": 3.6521185090793247, "grad_norm": 0.2608359099161076, "learning_rate": 4.0904946925387645e-06, "loss": 0.0279, "step": 2866 }, { "epoch": 3.653392800254858, "grad_norm": 0.2686331649097684, "learning_rate": 4.083313512378368e-06, "loss": 0.0323, "step": 2867 }, { "epoch": 3.654667091430392, "grad_norm": 0.2907938879994239, "learning_rate": 4.076137023819524e-06, "loss": 0.0332, "step": 2868 }, { "epoch": 3.6559413826059255, "grad_norm": 0.27851027687610924, "learning_rate": 4.068965232552788e-06, "loss": 0.0284, "step": 2869 }, { "epoch": 3.657215673781459, "grad_norm": 0.274134720073091, "learning_rate": 4.061798144264986e-06, "loss": 0.0267, "step": 2870 }, { "epoch": 3.658489964956993, "grad_norm": 0.2783058944791837, "learning_rate": 4.054635764639222e-06, "loss": 0.0281, "step": 2871 }, { "epoch": 3.6597642561325263, "grad_norm": 0.2997865947224815, "learning_rate": 4.047478099354857e-06, "loss": 0.039, "step": 2872 }, { "epoch": 3.6610385473080598, "grad_norm": 0.2848024859011276, "learning_rate": 4.0403251540875295e-06, "loss": 0.0327, "step": 2873 }, { "epoch": 3.6623128384835937, "grad_norm": 0.2674704631282829, "learning_rate": 4.0331769345091136e-06, "loss": 0.0281, "step": 2874 }, { "epoch": 3.663587129659127, "grad_norm": 0.28434126731263354, "learning_rate": 4.02603344628776e-06, "loss": 0.035, "step": 2875 }, { "epoch": 3.6648614208346606, "grad_norm": 0.29221687297788174, "learning_rate": 4.01889469508784e-06, "loss": 0.0366, "step": 2876 }, { "epoch": 3.6661357120101945, "grad_norm": 0.28318571647924967, "learning_rate": 4.0117606865699975e-06, "loss": 0.0328, "step": 2877 }, { "epoch": 3.667410003185728, "grad_norm": 0.28384652999063076, "learning_rate": 4.004631426391098e-06, "loss": 0.0318, "step": 2878 }, { "epoch": 3.6686842943612614, "grad_norm": 0.2780913117609636, "learning_rate": 3.997506920204245e-06, "loss": 0.0263, "step": 2879 }, { "epoch": 3.6699585855367953, "grad_norm": 0.2862261036948377, "learning_rate": 3.990387173658774e-06, "loss": 0.0317, "step": 2880 }, { "epoch": 3.671232876712329, "grad_norm": 0.28857354852719935, "learning_rate": 3.983272192400247e-06, "loss": 0.0331, "step": 2881 }, { "epoch": 3.6725071678878622, "grad_norm": 0.28132860120526726, "learning_rate": 3.976161982070446e-06, "loss": 0.0312, "step": 2882 }, { "epoch": 3.673781459063396, "grad_norm": 0.3082823573585818, "learning_rate": 3.969056548307368e-06, "loss": 0.0373, "step": 2883 }, { "epoch": 3.6750557502389296, "grad_norm": 0.27525838748240083, "learning_rate": 3.961955896745224e-06, "loss": 0.0343, "step": 2884 }, { "epoch": 3.676330041414463, "grad_norm": 0.28088195766575796, "learning_rate": 3.954860033014444e-06, "loss": 0.0349, "step": 2885 }, { "epoch": 3.677604332589997, "grad_norm": 0.2889660036108633, "learning_rate": 3.9477689627416345e-06, "loss": 0.0343, "step": 2886 }, { "epoch": 3.6788786237655304, "grad_norm": 0.27800139596346307, "learning_rate": 3.940682691549634e-06, "loss": 0.0353, "step": 2887 }, { "epoch": 3.680152914941064, "grad_norm": 0.2522689443483428, "learning_rate": 3.933601225057446e-06, "loss": 0.0298, "step": 2888 }, { "epoch": 3.681427206116598, "grad_norm": 0.28222265081306, "learning_rate": 3.9265245688802855e-06, "loss": 0.031, "step": 2889 }, { "epoch": 3.6827014972921313, "grad_norm": 0.2733760636275521, "learning_rate": 3.919452728629544e-06, "loss": 0.0296, "step": 2890 }, { "epoch": 3.6839757884676647, "grad_norm": 0.27621686028834774, "learning_rate": 3.912385709912794e-06, "loss": 0.0313, "step": 2891 }, { "epoch": 3.6852500796431986, "grad_norm": 0.28438558720400176, "learning_rate": 3.905323518333786e-06, "loss": 0.032, "step": 2892 }, { "epoch": 3.686524370818732, "grad_norm": 0.2785813151616231, "learning_rate": 3.898266159492443e-06, "loss": 0.0314, "step": 2893 }, { "epoch": 3.6877986619942655, "grad_norm": 0.27835407865498185, "learning_rate": 3.891213638984858e-06, "loss": 0.0306, "step": 2894 }, { "epoch": 3.6890729531697994, "grad_norm": 0.2745798980674577, "learning_rate": 3.8841659624032825e-06, "loss": 0.0312, "step": 2895 }, { "epoch": 3.690347244345333, "grad_norm": 0.2615538846712907, "learning_rate": 3.877123135336133e-06, "loss": 0.0267, "step": 2896 }, { "epoch": 3.6916215355208664, "grad_norm": 0.27298312868697183, "learning_rate": 3.870085163367975e-06, "loss": 0.0297, "step": 2897 }, { "epoch": 3.6928958266964003, "grad_norm": 0.2900299642961502, "learning_rate": 3.8630520520795275e-06, "loss": 0.0375, "step": 2898 }, { "epoch": 3.6941701178719337, "grad_norm": 0.2520477189596081, "learning_rate": 3.8560238070476554e-06, "loss": 0.0232, "step": 2899 }, { "epoch": 3.695444409047467, "grad_norm": 0.2779544319641114, "learning_rate": 3.849000433845363e-06, "loss": 0.0351, "step": 2900 }, { "epoch": 3.696718700223001, "grad_norm": 0.2881885505543536, "learning_rate": 3.841981938041792e-06, "loss": 0.0308, "step": 2901 }, { "epoch": 3.6979929913985345, "grad_norm": 0.26715972735318966, "learning_rate": 3.834968325202219e-06, "loss": 0.0266, "step": 2902 }, { "epoch": 3.699267282574068, "grad_norm": 0.2866653835664917, "learning_rate": 3.827959600888042e-06, "loss": 0.031, "step": 2903 }, { "epoch": 3.700541573749602, "grad_norm": 0.281565435690609, "learning_rate": 3.820955770656798e-06, "loss": 0.0277, "step": 2904 }, { "epoch": 3.7018158649251354, "grad_norm": 0.2683710278425354, "learning_rate": 3.8139568400621184e-06, "loss": 0.0316, "step": 2905 }, { "epoch": 3.703090156100669, "grad_norm": 0.281007873529498, "learning_rate": 3.8069628146537796e-06, "loss": 0.0357, "step": 2906 }, { "epoch": 3.7043644472762027, "grad_norm": 0.2734238440284353, "learning_rate": 3.7999736999776383e-06, "loss": 0.0336, "step": 2907 }, { "epoch": 3.705638738451736, "grad_norm": 0.2639797794026608, "learning_rate": 3.7929895015756803e-06, "loss": 0.0243, "step": 2908 }, { "epoch": 3.7069130296272697, "grad_norm": 0.28147483830232933, "learning_rate": 3.7860102249859807e-06, "loss": 0.0315, "step": 2909 }, { "epoch": 3.7081873208028036, "grad_norm": 0.28157283335631417, "learning_rate": 3.779035875742719e-06, "loss": 0.0322, "step": 2910 }, { "epoch": 3.709461611978337, "grad_norm": 0.3050034132310102, "learning_rate": 3.7720664593761612e-06, "loss": 0.0381, "step": 2911 }, { "epoch": 3.7107359031538705, "grad_norm": 0.2825924411914077, "learning_rate": 3.7651019814126656e-06, "loss": 0.0311, "step": 2912 }, { "epoch": 3.7120101943294044, "grad_norm": 0.26734560450765227, "learning_rate": 3.758142447374674e-06, "loss": 0.0302, "step": 2913 }, { "epoch": 3.713284485504938, "grad_norm": 0.295112729534289, "learning_rate": 3.7511878627807073e-06, "loss": 0.0361, "step": 2914 }, { "epoch": 3.7145587766804713, "grad_norm": 0.2838222812319785, "learning_rate": 3.74423823314536e-06, "loss": 0.0314, "step": 2915 }, { "epoch": 3.715833067856005, "grad_norm": 0.29992318529578216, "learning_rate": 3.7372935639793084e-06, "loss": 0.0354, "step": 2916 }, { "epoch": 3.7171073590315387, "grad_norm": 0.26524866399028657, "learning_rate": 3.730353860789273e-06, "loss": 0.0279, "step": 2917 }, { "epoch": 3.718381650207072, "grad_norm": 0.27745790930965364, "learning_rate": 3.7234191290780664e-06, "loss": 0.0342, "step": 2918 }, { "epoch": 3.719655941382606, "grad_norm": 0.28362427630523523, "learning_rate": 3.7164893743445274e-06, "loss": 0.0318, "step": 2919 }, { "epoch": 3.7209302325581395, "grad_norm": 0.27065225835676626, "learning_rate": 3.709564602083575e-06, "loss": 0.0313, "step": 2920 }, { "epoch": 3.722204523733673, "grad_norm": 0.280539316219938, "learning_rate": 3.702644817786163e-06, "loss": 0.0293, "step": 2921 }, { "epoch": 3.723478814909207, "grad_norm": 0.26765449394033625, "learning_rate": 3.695730026939294e-06, "loss": 0.0324, "step": 2922 }, { "epoch": 3.7247531060847403, "grad_norm": 0.27015049397091495, "learning_rate": 3.688820235026008e-06, "loss": 0.0311, "step": 2923 }, { "epoch": 3.7260273972602738, "grad_norm": 0.2932379346081581, "learning_rate": 3.681915447525387e-06, "loss": 0.0384, "step": 2924 }, { "epoch": 3.7273016884358077, "grad_norm": 0.2848768644651165, "learning_rate": 3.6750156699125395e-06, "loss": 0.0309, "step": 2925 }, { "epoch": 3.728575979611341, "grad_norm": 0.30451476299152014, "learning_rate": 3.6681209076586035e-06, "loss": 0.0352, "step": 2926 }, { "epoch": 3.7298502707868746, "grad_norm": 0.27982027986460184, "learning_rate": 3.6612311662307398e-06, "loss": 0.0295, "step": 2927 }, { "epoch": 3.7311245619624085, "grad_norm": 0.2672286884846164, "learning_rate": 3.6543464510921287e-06, "loss": 0.0293, "step": 2928 }, { "epoch": 3.732398853137942, "grad_norm": 0.28099875973353866, "learning_rate": 3.647466767701964e-06, "loss": 0.03, "step": 2929 }, { "epoch": 3.7336731443134754, "grad_norm": 0.2878726091660295, "learning_rate": 3.6405921215154492e-06, "loss": 0.0327, "step": 2930 }, { "epoch": 3.7349474354890093, "grad_norm": 0.2824548061029509, "learning_rate": 3.6337225179837965e-06, "loss": 0.0319, "step": 2931 }, { "epoch": 3.7362217266645428, "grad_norm": 0.28335256797360514, "learning_rate": 3.6268579625542155e-06, "loss": 0.0333, "step": 2932 }, { "epoch": 3.7374960178400762, "grad_norm": 0.31245732309862934, "learning_rate": 3.619998460669916e-06, "loss": 0.0448, "step": 2933 }, { "epoch": 3.73877030901561, "grad_norm": 0.2893989224163246, "learning_rate": 3.613144017770095e-06, "loss": 0.0344, "step": 2934 }, { "epoch": 3.7400446001911436, "grad_norm": 0.29506129925075986, "learning_rate": 3.6062946392899523e-06, "loss": 0.0369, "step": 2935 }, { "epoch": 3.741318891366677, "grad_norm": 0.27544461735950626, "learning_rate": 3.5994503306606497e-06, "loss": 0.0362, "step": 2936 }, { "epoch": 3.742593182542211, "grad_norm": 0.29327323929459215, "learning_rate": 3.592611097309355e-06, "loss": 0.0339, "step": 2937 }, { "epoch": 3.7438674737177444, "grad_norm": 0.29620253740896546, "learning_rate": 3.585776944659183e-06, "loss": 0.0335, "step": 2938 }, { "epoch": 3.745141764893278, "grad_norm": 0.26770772222733324, "learning_rate": 3.5789478781292454e-06, "loss": 0.0306, "step": 2939 }, { "epoch": 3.746416056068812, "grad_norm": 0.2880078100391554, "learning_rate": 3.5721239031346067e-06, "loss": 0.0364, "step": 2940 }, { "epoch": 3.7476903472443452, "grad_norm": 0.27865457208957434, "learning_rate": 3.5653050250862973e-06, "loss": 0.0326, "step": 2941 }, { "epoch": 3.7489646384198787, "grad_norm": 0.2798563557125254, "learning_rate": 3.558491249391307e-06, "loss": 0.0325, "step": 2942 }, { "epoch": 3.7502389295954126, "grad_norm": 0.28045732080735475, "learning_rate": 3.551682581452579e-06, "loss": 0.0338, "step": 2943 }, { "epoch": 3.751513220770946, "grad_norm": 0.2759254693031387, "learning_rate": 3.544879026669005e-06, "loss": 0.0348, "step": 2944 }, { "epoch": 3.7527875119464795, "grad_norm": 0.2665846647165658, "learning_rate": 3.5380805904354234e-06, "loss": 0.0316, "step": 2945 }, { "epoch": 3.7540618031220134, "grad_norm": 0.28019698619822364, "learning_rate": 3.5312872781426156e-06, "loss": 0.0366, "step": 2946 }, { "epoch": 3.755336094297547, "grad_norm": 0.2622698463877213, "learning_rate": 3.5244990951772972e-06, "loss": 0.0266, "step": 2947 }, { "epoch": 3.7566103854730803, "grad_norm": 0.26312691825655876, "learning_rate": 3.5177160469221184e-06, "loss": 0.0244, "step": 2948 }, { "epoch": 3.7578846766486143, "grad_norm": 0.33171433917562465, "learning_rate": 3.510938138755656e-06, "loss": 0.049, "step": 2949 }, { "epoch": 3.7591589678241477, "grad_norm": 0.2732425226880388, "learning_rate": 3.504165376052414e-06, "loss": 0.0338, "step": 2950 }, { "epoch": 3.760433258999681, "grad_norm": 0.2791640719075987, "learning_rate": 3.4973977641828127e-06, "loss": 0.0275, "step": 2951 }, { "epoch": 3.761707550175215, "grad_norm": 0.27731522470415965, "learning_rate": 3.4906353085131917e-06, "loss": 0.0312, "step": 2952 }, { "epoch": 3.7629818413507485, "grad_norm": 0.2823618083679205, "learning_rate": 3.483878014405796e-06, "loss": 0.0328, "step": 2953 }, { "epoch": 3.764256132526282, "grad_norm": 0.2978562915509758, "learning_rate": 3.4771258872187917e-06, "loss": 0.0327, "step": 2954 }, { "epoch": 3.765530423701816, "grad_norm": 0.2870537785709819, "learning_rate": 3.4703789323062254e-06, "loss": 0.0317, "step": 2955 }, { "epoch": 3.7668047148773494, "grad_norm": 0.3039638593559658, "learning_rate": 3.463637155018067e-06, "loss": 0.0375, "step": 2956 }, { "epoch": 3.768079006052883, "grad_norm": 0.2765145845631702, "learning_rate": 3.456900560700158e-06, "loss": 0.0311, "step": 2957 }, { "epoch": 3.7693532972284167, "grad_norm": 0.27919616319690527, "learning_rate": 3.4501691546942483e-06, "loss": 0.0286, "step": 2958 }, { "epoch": 3.77062758840395, "grad_norm": 0.2933832726152079, "learning_rate": 3.4434429423379658e-06, "loss": 0.0316, "step": 2959 }, { "epoch": 3.7719018795794836, "grad_norm": 0.27843373169190616, "learning_rate": 3.4367219289648192e-06, "loss": 0.0326, "step": 2960 }, { "epoch": 3.7731761707550175, "grad_norm": 0.2906111578950875, "learning_rate": 3.4300061199041967e-06, "loss": 0.0373, "step": 2961 }, { "epoch": 3.774450461930551, "grad_norm": 0.27070362917388796, "learning_rate": 3.423295520481359e-06, "loss": 0.0316, "step": 2962 }, { "epoch": 3.7757247531060845, "grad_norm": 0.2837938685482003, "learning_rate": 3.416590136017436e-06, "loss": 0.0321, "step": 2963 }, { "epoch": 3.7769990442816184, "grad_norm": 0.29382187176562236, "learning_rate": 3.4098899718294243e-06, "loss": 0.0381, "step": 2964 }, { "epoch": 3.778273335457152, "grad_norm": 0.2646549338603065, "learning_rate": 3.4031950332301744e-06, "loss": 0.0269, "step": 2965 }, { "epoch": 3.7795476266326857, "grad_norm": 0.29560572777564126, "learning_rate": 3.3965053255284085e-06, "loss": 0.0359, "step": 2966 }, { "epoch": 3.780821917808219, "grad_norm": 0.3005360820731623, "learning_rate": 3.389820854028678e-06, "loss": 0.0366, "step": 2967 }, { "epoch": 3.7820962089837526, "grad_norm": 0.29013810549545965, "learning_rate": 3.3831416240314085e-06, "loss": 0.0358, "step": 2968 }, { "epoch": 3.7833705001592866, "grad_norm": 0.2938272328060303, "learning_rate": 3.3764676408328423e-06, "loss": 0.0356, "step": 2969 }, { "epoch": 3.78464479133482, "grad_norm": 0.2820947639861058, "learning_rate": 3.369798909725085e-06, "loss": 0.0336, "step": 2970 }, { "epoch": 3.7859190825103535, "grad_norm": 0.2818157393625263, "learning_rate": 3.3631354359960632e-06, "loss": 0.0361, "step": 2971 }, { "epoch": 3.7871933736858874, "grad_norm": 0.2757220050129857, "learning_rate": 3.35647722492954e-06, "loss": 0.0306, "step": 2972 }, { "epoch": 3.788467664861421, "grad_norm": 0.27708501657399776, "learning_rate": 3.349824281805104e-06, "loss": 0.0352, "step": 2973 }, { "epoch": 3.7897419560369543, "grad_norm": 0.26344189410496277, "learning_rate": 3.343176611898168e-06, "loss": 0.0276, "step": 2974 }, { "epoch": 3.791016247212488, "grad_norm": 0.30534506485907087, "learning_rate": 3.3365342204799613e-06, "loss": 0.037, "step": 2975 }, { "epoch": 3.7922905383880217, "grad_norm": 0.2634562886984334, "learning_rate": 3.3298971128175293e-06, "loss": 0.0279, "step": 2976 }, { "epoch": 3.793564829563555, "grad_norm": 0.2684412575265242, "learning_rate": 3.3232652941737287e-06, "loss": 0.0312, "step": 2977 }, { "epoch": 3.794839120739089, "grad_norm": 0.28253097431002516, "learning_rate": 3.316638769807219e-06, "loss": 0.0336, "step": 2978 }, { "epoch": 3.7961134119146225, "grad_norm": 0.2695786034511391, "learning_rate": 3.3100175449724635e-06, "loss": 0.0289, "step": 2979 }, { "epoch": 3.797387703090156, "grad_norm": 0.2753152410374136, "learning_rate": 3.3034016249197244e-06, "loss": 0.029, "step": 2980 }, { "epoch": 3.79866199426569, "grad_norm": 0.2509724424996821, "learning_rate": 3.2967910148950556e-06, "loss": 0.0291, "step": 2981 }, { "epoch": 3.7999362854412233, "grad_norm": 0.27855719476769947, "learning_rate": 3.290185720140301e-06, "loss": 0.0327, "step": 2982 }, { "epoch": 3.8012105766167568, "grad_norm": 0.26422785937433885, "learning_rate": 3.283585745893091e-06, "loss": 0.0297, "step": 2983 }, { "epoch": 3.8024848677922907, "grad_norm": 0.290263807946993, "learning_rate": 3.2769910973868314e-06, "loss": 0.0349, "step": 2984 }, { "epoch": 3.803759158967824, "grad_norm": 0.280279712109141, "learning_rate": 3.2704017798507216e-06, "loss": 0.0365, "step": 2985 }, { "epoch": 3.805033450143358, "grad_norm": 0.2776520023913355, "learning_rate": 3.2638177985097075e-06, "loss": 0.0327, "step": 2986 }, { "epoch": 3.8063077413188915, "grad_norm": 0.28377849562787927, "learning_rate": 3.257239158584531e-06, "loss": 0.0275, "step": 2987 }, { "epoch": 3.807582032494425, "grad_norm": 0.2778059704839215, "learning_rate": 3.2506658652916735e-06, "loss": 0.0268, "step": 2988 }, { "epoch": 3.808856323669959, "grad_norm": 0.2921394450692709, "learning_rate": 3.2440979238433977e-06, "loss": 0.0317, "step": 2989 }, { "epoch": 3.8101306148454923, "grad_norm": 0.2832390038053147, "learning_rate": 3.2375353394477127e-06, "loss": 0.0298, "step": 2990 }, { "epoch": 3.8114049060210258, "grad_norm": 0.27677527178324174, "learning_rate": 3.2309781173083786e-06, "loss": 0.0318, "step": 2991 }, { "epoch": 3.8126791971965597, "grad_norm": 0.27367228553234757, "learning_rate": 3.224426262624908e-06, "loss": 0.027, "step": 2992 }, { "epoch": 3.813953488372093, "grad_norm": 0.2650969304468575, "learning_rate": 3.2178797805925534e-06, "loss": 0.027, "step": 2993 }, { "epoch": 3.8152277795476266, "grad_norm": 0.2667274590192862, "learning_rate": 3.2113386764023103e-06, "loss": 0.0276, "step": 2994 }, { "epoch": 3.8165020707231605, "grad_norm": 0.2864051863311626, "learning_rate": 3.2048029552409076e-06, "loss": 0.0323, "step": 2995 }, { "epoch": 3.817776361898694, "grad_norm": 0.2808396399533059, "learning_rate": 3.1982726222908046e-06, "loss": 0.0282, "step": 2996 }, { "epoch": 3.8190506530742274, "grad_norm": 0.3135452616241531, "learning_rate": 3.1917476827302e-06, "loss": 0.0438, "step": 2997 }, { "epoch": 3.8203249442497613, "grad_norm": 0.28243256095779445, "learning_rate": 3.1852281417329913e-06, "loss": 0.0321, "step": 2998 }, { "epoch": 3.821599235425295, "grad_norm": 0.28236295821244, "learning_rate": 3.178714004468825e-06, "loss": 0.0336, "step": 2999 }, { "epoch": 3.8228735266008282, "grad_norm": 0.2772683586348577, "learning_rate": 3.172205276103033e-06, "loss": 0.0284, "step": 3000 }, { "epoch": 3.824147817776362, "grad_norm": 0.2852334025476906, "learning_rate": 3.1657019617966843e-06, "loss": 0.0326, "step": 3001 }, { "epoch": 3.8254221089518956, "grad_norm": 0.3051331019662718, "learning_rate": 3.1592040667065393e-06, "loss": 0.0418, "step": 3002 }, { "epoch": 3.826696400127429, "grad_norm": 0.272148306847022, "learning_rate": 3.152711595985065e-06, "loss": 0.0302, "step": 3003 }, { "epoch": 3.827970691302963, "grad_norm": 0.2783316265034093, "learning_rate": 3.1462245547804294e-06, "loss": 0.0348, "step": 3004 }, { "epoch": 3.8292449824784964, "grad_norm": 0.2732479708542214, "learning_rate": 3.1397429482364917e-06, "loss": 0.0338, "step": 3005 }, { "epoch": 3.83051927365403, "grad_norm": 0.2705921379901925, "learning_rate": 3.133266781492804e-06, "loss": 0.0316, "step": 3006 }, { "epoch": 3.831793564829564, "grad_norm": 0.27745580005268894, "learning_rate": 3.1267960596846047e-06, "loss": 0.0308, "step": 3007 }, { "epoch": 3.8330678560050973, "grad_norm": 0.27025627482598286, "learning_rate": 3.1203307879428146e-06, "loss": 0.0316, "step": 3008 }, { "epoch": 3.8343421471806307, "grad_norm": 0.27353819407208524, "learning_rate": 3.113870971394032e-06, "loss": 0.0354, "step": 3009 }, { "epoch": 3.8356164383561646, "grad_norm": 0.2752314412490087, "learning_rate": 3.10741661516053e-06, "loss": 0.0309, "step": 3010 }, { "epoch": 3.836890729531698, "grad_norm": 0.27452038137201445, "learning_rate": 3.1009677243602544e-06, "loss": 0.0317, "step": 3011 }, { "epoch": 3.8381650207072315, "grad_norm": 0.2680836238726768, "learning_rate": 3.094524304106812e-06, "loss": 0.0263, "step": 3012 }, { "epoch": 3.8394393118827654, "grad_norm": 0.29757816117775876, "learning_rate": 3.088086359509476e-06, "loss": 0.0354, "step": 3013 }, { "epoch": 3.840713603058299, "grad_norm": 0.2949583904245425, "learning_rate": 3.0816538956731767e-06, "loss": 0.0344, "step": 3014 }, { "epoch": 3.8419878942338324, "grad_norm": 0.26204490576388934, "learning_rate": 3.0752269176984963e-06, "loss": 0.0269, "step": 3015 }, { "epoch": 3.8432621854093663, "grad_norm": 0.2982767533033082, "learning_rate": 3.068805430681675e-06, "loss": 0.0334, "step": 3016 }, { "epoch": 3.8445364765848997, "grad_norm": 0.26214628518021454, "learning_rate": 3.0623894397145837e-06, "loss": 0.0252, "step": 3017 }, { "epoch": 3.845810767760433, "grad_norm": 0.2911962454866576, "learning_rate": 3.055978949884756e-06, "loss": 0.0342, "step": 3018 }, { "epoch": 3.847085058935967, "grad_norm": 0.26621854876049283, "learning_rate": 3.049573966275339e-06, "loss": 0.0298, "step": 3019 }, { "epoch": 3.8483593501115005, "grad_norm": 0.27580747145146833, "learning_rate": 3.0431744939651365e-06, "loss": 0.031, "step": 3020 }, { "epoch": 3.849633641287034, "grad_norm": 0.28213748816724354, "learning_rate": 3.0367805380285685e-06, "loss": 0.0301, "step": 3021 }, { "epoch": 3.850907932462568, "grad_norm": 0.28689016665922096, "learning_rate": 3.030392103535684e-06, "loss": 0.0284, "step": 3022 }, { "epoch": 3.8521822236381014, "grad_norm": 0.25783205737054227, "learning_rate": 3.024009195552157e-06, "loss": 0.0288, "step": 3023 }, { "epoch": 3.853456514813635, "grad_norm": 0.2909525731829429, "learning_rate": 3.017631819139273e-06, "loss": 0.0322, "step": 3024 }, { "epoch": 3.8547308059891687, "grad_norm": 0.2709315488759595, "learning_rate": 3.011259979353938e-06, "loss": 0.0297, "step": 3025 }, { "epoch": 3.856005097164702, "grad_norm": 0.2996879360480063, "learning_rate": 3.004893681248662e-06, "loss": 0.0334, "step": 3026 }, { "epoch": 3.8572793883402356, "grad_norm": 0.2855271917450751, "learning_rate": 2.9985329298715614e-06, "loss": 0.0294, "step": 3027 }, { "epoch": 3.8585536795157696, "grad_norm": 0.30501288344315913, "learning_rate": 2.9921777302663667e-06, "loss": 0.0483, "step": 3028 }, { "epoch": 3.859827970691303, "grad_norm": 0.2786662613164814, "learning_rate": 2.9858280874723833e-06, "loss": 0.0332, "step": 3029 }, { "epoch": 3.8611022618668365, "grad_norm": 0.2702131072231038, "learning_rate": 2.9794840065245347e-06, "loss": 0.029, "step": 3030 }, { "epoch": 3.8623765530423704, "grad_norm": 0.2892064410274852, "learning_rate": 2.9731454924533086e-06, "loss": 0.0352, "step": 3031 }, { "epoch": 3.863650844217904, "grad_norm": 0.2703105541234649, "learning_rate": 2.9668125502848035e-06, "loss": 0.0268, "step": 3032 }, { "epoch": 3.8649251353934373, "grad_norm": 0.2734828371635564, "learning_rate": 2.9604851850406845e-06, "loss": 0.0349, "step": 3033 }, { "epoch": 3.866199426568971, "grad_norm": 0.2877606194036162, "learning_rate": 2.954163401738199e-06, "loss": 0.0363, "step": 3034 }, { "epoch": 3.8674737177445047, "grad_norm": 0.2838074114213903, "learning_rate": 2.9478472053901675e-06, "loss": 0.0281, "step": 3035 }, { "epoch": 3.868748008920038, "grad_norm": 0.26632442298983305, "learning_rate": 2.9415366010049795e-06, "loss": 0.0278, "step": 3036 }, { "epoch": 3.870022300095572, "grad_norm": 0.264132019532677, "learning_rate": 2.9352315935865928e-06, "loss": 0.028, "step": 3037 }, { "epoch": 3.8712965912711055, "grad_norm": 0.27322761962079556, "learning_rate": 2.9289321881345257e-06, "loss": 0.0283, "step": 3038 }, { "epoch": 3.872570882446639, "grad_norm": 0.28661835462946145, "learning_rate": 2.922638389643854e-06, "loss": 0.034, "step": 3039 }, { "epoch": 3.873845173622173, "grad_norm": 0.2637634060323692, "learning_rate": 2.916350203105207e-06, "loss": 0.029, "step": 3040 }, { "epoch": 3.8751194647977063, "grad_norm": 0.2819891520823707, "learning_rate": 2.910067633504766e-06, "loss": 0.0301, "step": 3041 }, { "epoch": 3.8763937559732398, "grad_norm": 0.27603775081719933, "learning_rate": 2.903790685824258e-06, "loss": 0.0293, "step": 3042 }, { "epoch": 3.8776680471487737, "grad_norm": 0.2672904143883503, "learning_rate": 2.8975193650409515e-06, "loss": 0.0291, "step": 3043 }, { "epoch": 3.878942338324307, "grad_norm": 0.2638285055792088, "learning_rate": 2.8912536761276524e-06, "loss": 0.0269, "step": 3044 }, { "epoch": 3.8802166294998406, "grad_norm": 0.27035303642987357, "learning_rate": 2.884993624052701e-06, "loss": 0.026, "step": 3045 }, { "epoch": 3.8814909206753745, "grad_norm": 0.26165104987676, "learning_rate": 2.8787392137799665e-06, "loss": 0.0259, "step": 3046 }, { "epoch": 3.882765211850908, "grad_norm": 0.2877535808729278, "learning_rate": 2.8724904502688566e-06, "loss": 0.0326, "step": 3047 }, { "epoch": 3.8840395030264414, "grad_norm": 0.2815532689073331, "learning_rate": 2.866247338474277e-06, "loss": 0.0304, "step": 3048 }, { "epoch": 3.8853137942019753, "grad_norm": 0.28704231064186725, "learning_rate": 2.8600098833466805e-06, "loss": 0.0301, "step": 3049 }, { "epoch": 3.8865880853775088, "grad_norm": 0.27572952360672526, "learning_rate": 2.8537780898320067e-06, "loss": 0.0262, "step": 3050 }, { "epoch": 3.8878623765530422, "grad_norm": 0.282859229137795, "learning_rate": 2.8475519628717295e-06, "loss": 0.0317, "step": 3051 }, { "epoch": 3.889136667728576, "grad_norm": 0.29308353904712103, "learning_rate": 2.8413315074028157e-06, "loss": 0.0336, "step": 3052 }, { "epoch": 3.8904109589041096, "grad_norm": 0.24930463797016275, "learning_rate": 2.8351167283577396e-06, "loss": 0.0229, "step": 3053 }, { "epoch": 3.891685250079643, "grad_norm": 0.281047725587366, "learning_rate": 2.8289076306644724e-06, "loss": 0.0329, "step": 3054 }, { "epoch": 3.892959541255177, "grad_norm": 0.264115742851523, "learning_rate": 2.8227042192464813e-06, "loss": 0.027, "step": 3055 }, { "epoch": 3.8942338324307104, "grad_norm": 0.29962107809089794, "learning_rate": 2.8165064990227255e-06, "loss": 0.0353, "step": 3056 }, { "epoch": 3.895508123606244, "grad_norm": 0.2913418097950279, "learning_rate": 2.8103144749076506e-06, "loss": 0.035, "step": 3057 }, { "epoch": 3.896782414781778, "grad_norm": 0.27288373875794725, "learning_rate": 2.804128151811182e-06, "loss": 0.0288, "step": 3058 }, { "epoch": 3.8980567059573112, "grad_norm": 0.29867201268235544, "learning_rate": 2.7979475346387363e-06, "loss": 0.0397, "step": 3059 }, { "epoch": 3.8993309971328447, "grad_norm": 0.281408819258212, "learning_rate": 2.7917726282911874e-06, "loss": 0.0377, "step": 3060 }, { "epoch": 3.9006052883083786, "grad_norm": 0.29488414782670624, "learning_rate": 2.785603437664901e-06, "loss": 0.0307, "step": 3061 }, { "epoch": 3.901879579483912, "grad_norm": 0.28371054220542175, "learning_rate": 2.779439967651688e-06, "loss": 0.0312, "step": 3062 }, { "epoch": 3.9031538706594455, "grad_norm": 0.27159158915904413, "learning_rate": 2.7732822231388467e-06, "loss": 0.0311, "step": 3063 }, { "epoch": 3.9044281618349794, "grad_norm": 0.2631290098416748, "learning_rate": 2.76713020900912e-06, "loss": 0.0271, "step": 3064 }, { "epoch": 3.905702453010513, "grad_norm": 0.29892777179317426, "learning_rate": 2.7609839301407102e-06, "loss": 0.0338, "step": 3065 }, { "epoch": 3.9069767441860463, "grad_norm": 0.25582982674996574, "learning_rate": 2.7548433914072736e-06, "loss": 0.0258, "step": 3066 }, { "epoch": 3.9082510353615803, "grad_norm": 0.27447251703025755, "learning_rate": 2.7487085976779137e-06, "loss": 0.032, "step": 3067 }, { "epoch": 3.9095253265371137, "grad_norm": 0.28235783240072926, "learning_rate": 2.7425795538171806e-06, "loss": 0.0358, "step": 3068 }, { "epoch": 3.910799617712647, "grad_norm": 0.2950073941687582, "learning_rate": 2.7364562646850613e-06, "loss": 0.0351, "step": 3069 }, { "epoch": 3.912073908888181, "grad_norm": 0.28309600090325726, "learning_rate": 2.7303387351369813e-06, "loss": 0.0358, "step": 3070 }, { "epoch": 3.9133482000637145, "grad_norm": 0.2778777029848874, "learning_rate": 2.7242269700238024e-06, "loss": 0.0344, "step": 3071 }, { "epoch": 3.914622491239248, "grad_norm": 0.2865967974763273, "learning_rate": 2.7181209741918093e-06, "loss": 0.0351, "step": 3072 }, { "epoch": 3.915896782414782, "grad_norm": 0.2799787050337901, "learning_rate": 2.712020752482717e-06, "loss": 0.0306, "step": 3073 }, { "epoch": 3.9171710735903154, "grad_norm": 0.28840588150057433, "learning_rate": 2.7059263097336595e-06, "loss": 0.0353, "step": 3074 }, { "epoch": 3.918445364765849, "grad_norm": 0.257304548833529, "learning_rate": 2.699837650777191e-06, "loss": 0.028, "step": 3075 }, { "epoch": 3.9197196559413827, "grad_norm": 0.27653075032118307, "learning_rate": 2.6937547804412756e-06, "loss": 0.0335, "step": 3076 }, { "epoch": 3.920993947116916, "grad_norm": 0.30316232005768157, "learning_rate": 2.6876777035492863e-06, "loss": 0.0362, "step": 3077 }, { "epoch": 3.9222682382924496, "grad_norm": 0.2848494134830094, "learning_rate": 2.6816064249200157e-06, "loss": 0.0325, "step": 3078 }, { "epoch": 3.9235425294679835, "grad_norm": 0.29187147489573034, "learning_rate": 2.675540949367634e-06, "loss": 0.0389, "step": 3079 }, { "epoch": 3.924816820643517, "grad_norm": 0.28882687562014103, "learning_rate": 2.669481281701739e-06, "loss": 0.0328, "step": 3080 }, { "epoch": 3.9260911118190505, "grad_norm": 0.28598599011463816, "learning_rate": 2.6634274267272955e-06, "loss": 0.033, "step": 3081 }, { "epoch": 3.9273654029945844, "grad_norm": 0.27623675741864734, "learning_rate": 2.6573793892446797e-06, "loss": 0.0281, "step": 3082 }, { "epoch": 3.928639694170118, "grad_norm": 0.27815771553859686, "learning_rate": 2.6513371740496453e-06, "loss": 0.0293, "step": 3083 }, { "epoch": 3.9299139853456513, "grad_norm": 0.289239764323704, "learning_rate": 2.64530078593333e-06, "loss": 0.0356, "step": 3084 }, { "epoch": 3.931188276521185, "grad_norm": 0.2819227378703565, "learning_rate": 2.6392702296822526e-06, "loss": 0.0311, "step": 3085 }, { "epoch": 3.9324625676967186, "grad_norm": 0.26756311190684395, "learning_rate": 2.6332455100783084e-06, "loss": 0.0237, "step": 3086 }, { "epoch": 3.933736858872252, "grad_norm": 0.2753960398065858, "learning_rate": 2.6272266318987606e-06, "loss": 0.0353, "step": 3087 }, { "epoch": 3.935011150047786, "grad_norm": 0.27844680013022266, "learning_rate": 2.6212135999162447e-06, "loss": 0.0334, "step": 3088 }, { "epoch": 3.9362854412233195, "grad_norm": 0.2831311490152155, "learning_rate": 2.615206418898756e-06, "loss": 0.0338, "step": 3089 }, { "epoch": 3.937559732398853, "grad_norm": 0.2957080191437815, "learning_rate": 2.6092050936096626e-06, "loss": 0.0376, "step": 3090 }, { "epoch": 3.938834023574387, "grad_norm": 0.27933994930243744, "learning_rate": 2.603209628807666e-06, "loss": 0.0339, "step": 3091 }, { "epoch": 3.9401083147499203, "grad_norm": 0.28531389504803806, "learning_rate": 2.597220029246846e-06, "loss": 0.0328, "step": 3092 }, { "epoch": 3.9413826059254538, "grad_norm": 0.29196439635753674, "learning_rate": 2.5912362996766115e-06, "loss": 0.0312, "step": 3093 }, { "epoch": 3.9426568971009877, "grad_norm": 0.28186184477135107, "learning_rate": 2.5852584448417327e-06, "loss": 0.0305, "step": 3094 }, { "epoch": 3.943931188276521, "grad_norm": 0.3052190225488228, "learning_rate": 2.5792864694823107e-06, "loss": 0.0393, "step": 3095 }, { "epoch": 3.9452054794520546, "grad_norm": 0.2861847178422171, "learning_rate": 2.573320378333789e-06, "loss": 0.0347, "step": 3096 }, { "epoch": 3.9464797706275885, "grad_norm": 0.2895821118323678, "learning_rate": 2.567360176126943e-06, "loss": 0.0297, "step": 3097 }, { "epoch": 3.947754061803122, "grad_norm": 0.273651871927539, "learning_rate": 2.5614058675878806e-06, "loss": 0.037, "step": 3098 }, { "epoch": 3.9490283529786554, "grad_norm": 0.29564505907556055, "learning_rate": 2.5554574574380364e-06, "loss": 0.0384, "step": 3099 }, { "epoch": 3.9503026441541893, "grad_norm": 0.27889027343853484, "learning_rate": 2.5495149503941652e-06, "loss": 0.0293, "step": 3100 }, { "epoch": 3.9515769353297228, "grad_norm": 0.2726474747983429, "learning_rate": 2.5435783511683444e-06, "loss": 0.0291, "step": 3101 }, { "epoch": 3.952851226505256, "grad_norm": 0.28493361472785894, "learning_rate": 2.5376476644679647e-06, "loss": 0.0323, "step": 3102 }, { "epoch": 3.95412551768079, "grad_norm": 0.28658980625914887, "learning_rate": 2.5317228949957284e-06, "loss": 0.0319, "step": 3103 }, { "epoch": 3.9553998088563236, "grad_norm": 0.28657414590622093, "learning_rate": 2.5258040474496483e-06, "loss": 0.0356, "step": 3104 }, { "epoch": 3.956674100031857, "grad_norm": 0.2619746166184021, "learning_rate": 2.5198911265230365e-06, "loss": 0.0284, "step": 3105 }, { "epoch": 3.957948391207391, "grad_norm": 0.267659963138299, "learning_rate": 2.5139841369045114e-06, "loss": 0.0271, "step": 3106 }, { "epoch": 3.9592226823829244, "grad_norm": 0.2876025999195318, "learning_rate": 2.5080830832779834e-06, "loss": 0.0353, "step": 3107 }, { "epoch": 3.960496973558458, "grad_norm": 0.2708489238995171, "learning_rate": 2.502187970322657e-06, "loss": 0.0309, "step": 3108 }, { "epoch": 3.9617712647339918, "grad_norm": 0.2896000598466499, "learning_rate": 2.496298802713035e-06, "loss": 0.0329, "step": 3109 }, { "epoch": 3.9630455559095252, "grad_norm": 0.28213813430735674, "learning_rate": 2.490415585118887e-06, "loss": 0.031, "step": 3110 }, { "epoch": 3.9643198470850587, "grad_norm": 0.29199660272703465, "learning_rate": 2.4845383222052865e-06, "loss": 0.0323, "step": 3111 }, { "epoch": 3.9655941382605926, "grad_norm": 0.2722632487182155, "learning_rate": 2.478667018632562e-06, "loss": 0.0303, "step": 3112 }, { "epoch": 3.966868429436126, "grad_norm": 0.2885684985281771, "learning_rate": 2.47280167905634e-06, "loss": 0.0329, "step": 3113 }, { "epoch": 3.9681427206116595, "grad_norm": 0.27734697277802783, "learning_rate": 2.466942308127501e-06, "loss": 0.0326, "step": 3114 }, { "epoch": 3.9694170117871934, "grad_norm": 0.26363914037877767, "learning_rate": 2.461088910492202e-06, "loss": 0.0274, "step": 3115 }, { "epoch": 3.970691302962727, "grad_norm": 0.2881251303650039, "learning_rate": 2.4552414907918565e-06, "loss": 0.0382, "step": 3116 }, { "epoch": 3.9719655941382603, "grad_norm": 0.27093472804561075, "learning_rate": 2.4494000536631437e-06, "loss": 0.0297, "step": 3117 }, { "epoch": 3.9732398853137942, "grad_norm": 0.27854468660890636, "learning_rate": 2.4435646037379946e-06, "loss": 0.0316, "step": 3118 }, { "epoch": 3.9745141764893277, "grad_norm": 0.2742768826746829, "learning_rate": 2.437735145643597e-06, "loss": 0.0285, "step": 3119 }, { "epoch": 3.975788467664861, "grad_norm": 0.261476917983254, "learning_rate": 2.4319116840023814e-06, "loss": 0.0255, "step": 3120 }, { "epoch": 3.977062758840395, "grad_norm": 0.25631565208999635, "learning_rate": 2.426094223432035e-06, "loss": 0.0241, "step": 3121 }, { "epoch": 3.9783370500159285, "grad_norm": 0.28010431699538546, "learning_rate": 2.420282768545469e-06, "loss": 0.0323, "step": 3122 }, { "epoch": 3.979611341191462, "grad_norm": 0.26392357205851097, "learning_rate": 2.4144773239508524e-06, "loss": 0.0279, "step": 3123 }, { "epoch": 3.980885632366996, "grad_norm": 0.3030297997004185, "learning_rate": 2.4086778942515666e-06, "loss": 0.0333, "step": 3124 }, { "epoch": 3.9821599235425293, "grad_norm": 0.2750854960032198, "learning_rate": 2.402884484046244e-06, "loss": 0.0301, "step": 3125 }, { "epoch": 3.983434214718063, "grad_norm": 0.2980051021174673, "learning_rate": 2.397097097928732e-06, "loss": 0.0325, "step": 3126 }, { "epoch": 3.9847085058935967, "grad_norm": 0.30163068946321814, "learning_rate": 2.391315740488105e-06, "loss": 0.0412, "step": 3127 }, { "epoch": 3.98598279706913, "grad_norm": 0.28611082876905863, "learning_rate": 2.3855404163086558e-06, "loss": 0.0347, "step": 3128 }, { "epoch": 3.987257088244664, "grad_norm": 0.2723939203923576, "learning_rate": 2.3797711299698924e-06, "loss": 0.0333, "step": 3129 }, { "epoch": 3.9885313794201975, "grad_norm": 0.2688778028631701, "learning_rate": 2.3740078860465378e-06, "loss": 0.0296, "step": 3130 }, { "epoch": 3.989805670595731, "grad_norm": 0.28246902060488516, "learning_rate": 2.368250689108521e-06, "loss": 0.0322, "step": 3131 }, { "epoch": 3.991079961771265, "grad_norm": 0.28407627377893835, "learning_rate": 2.3624995437209775e-06, "loss": 0.0293, "step": 3132 }, { "epoch": 3.9923542529467984, "grad_norm": 0.2730273994736395, "learning_rate": 2.3567544544442444e-06, "loss": 0.0273, "step": 3133 }, { "epoch": 3.993628544122332, "grad_norm": 0.2833579922754267, "learning_rate": 2.351015425833857e-06, "loss": 0.0349, "step": 3134 }, { "epoch": 3.9949028352978657, "grad_norm": 0.2715994444853879, "learning_rate": 2.345282462440542e-06, "loss": 0.0296, "step": 3135 }, { "epoch": 3.996177126473399, "grad_norm": 0.2930823398223464, "learning_rate": 2.339555568810221e-06, "loss": 0.0362, "step": 3136 }, { "epoch": 3.9974514176489326, "grad_norm": 0.282291894184794, "learning_rate": 2.333834749484e-06, "loss": 0.0309, "step": 3137 }, { "epoch": 3.9987257088244665, "grad_norm": 0.2773094689276124, "learning_rate": 2.328120008998167e-06, "loss": 0.0354, "step": 3138 }, { "epoch": 4.0, "grad_norm": 0.3289017405474692, "learning_rate": 2.3224113518841908e-06, "loss": 0.0332, "step": 3139 }, { "epoch": 4.001274291175534, "grad_norm": 0.17096653077993995, "learning_rate": 2.316708782668724e-06, "loss": 0.0129, "step": 3140 }, { "epoch": 4.002548582351067, "grad_norm": 0.16069708217677486, "learning_rate": 2.311012305873577e-06, "loss": 0.0116, "step": 3141 }, { "epoch": 4.003822873526601, "grad_norm": 0.15377201782909306, "learning_rate": 2.3053219260157445e-06, "loss": 0.0112, "step": 3142 }, { "epoch": 4.005097164702135, "grad_norm": 0.1653681899063459, "learning_rate": 2.2996376476073724e-06, "loss": 0.0125, "step": 3143 }, { "epoch": 4.006371455877668, "grad_norm": 0.1557902340725346, "learning_rate": 2.2939594751557804e-06, "loss": 0.0129, "step": 3144 }, { "epoch": 4.007645747053202, "grad_norm": 0.16596840963182216, "learning_rate": 2.2882874131634415e-06, "loss": 0.014, "step": 3145 }, { "epoch": 4.0089200382287355, "grad_norm": 0.1459425655919067, "learning_rate": 2.282621466127982e-06, "loss": 0.0093, "step": 3146 }, { "epoch": 4.010194329404269, "grad_norm": 0.17195199993419608, "learning_rate": 2.276961638542183e-06, "loss": 0.0137, "step": 3147 }, { "epoch": 4.0114686205798025, "grad_norm": 0.170269307909041, "learning_rate": 2.2713079348939704e-06, "loss": 0.0138, "step": 3148 }, { "epoch": 4.012742911755336, "grad_norm": 0.16501009305806089, "learning_rate": 2.265660359666414e-06, "loss": 0.0147, "step": 3149 }, { "epoch": 4.014017202930869, "grad_norm": 0.15872872098025503, "learning_rate": 2.2600189173377263e-06, "loss": 0.0117, "step": 3150 }, { "epoch": 4.015291494106403, "grad_norm": 0.18171203583682757, "learning_rate": 2.2543836123812557e-06, "loss": 0.0146, "step": 3151 }, { "epoch": 4.016565785281937, "grad_norm": 0.16757513307342137, "learning_rate": 2.2487544492654832e-06, "loss": 0.0102, "step": 3152 }, { "epoch": 4.01784007645747, "grad_norm": 0.17960979062659746, "learning_rate": 2.243131432454021e-06, "loss": 0.0124, "step": 3153 }, { "epoch": 4.019114367633004, "grad_norm": 0.1610170152918828, "learning_rate": 2.2375145664056062e-06, "loss": 0.0094, "step": 3154 }, { "epoch": 4.020388658808538, "grad_norm": 0.19137712121196662, "learning_rate": 2.2319038555741014e-06, "loss": 0.0126, "step": 3155 }, { "epoch": 4.021662949984071, "grad_norm": 0.18612958853777029, "learning_rate": 2.226299304408485e-06, "loss": 0.0101, "step": 3156 }, { "epoch": 4.022937241159605, "grad_norm": 0.19336074493671992, "learning_rate": 2.2207009173528528e-06, "loss": 0.011, "step": 3157 }, { "epoch": 4.024211532335139, "grad_norm": 0.2168129477419246, "learning_rate": 2.2151086988464097e-06, "loss": 0.013, "step": 3158 }, { "epoch": 4.025485823510672, "grad_norm": 0.22124515668998251, "learning_rate": 2.2095226533234816e-06, "loss": 0.0159, "step": 3159 }, { "epoch": 4.026760114686206, "grad_norm": 0.20823494026147463, "learning_rate": 2.203942785213479e-06, "loss": 0.0122, "step": 3160 }, { "epoch": 4.02803440586174, "grad_norm": 0.1898726502131526, "learning_rate": 2.1983690989409345e-06, "loss": 0.0086, "step": 3161 }, { "epoch": 4.029308697037273, "grad_norm": 0.1827128529260741, "learning_rate": 2.1928015989254615e-06, "loss": 0.0092, "step": 3162 }, { "epoch": 4.030582988212807, "grad_norm": 0.19051698634072076, "learning_rate": 2.1872402895817824e-06, "loss": 0.01, "step": 3163 }, { "epoch": 4.0318572793883405, "grad_norm": 0.20171063459612903, "learning_rate": 2.1816851753197023e-06, "loss": 0.0106, "step": 3164 }, { "epoch": 4.0331315705638735, "grad_norm": 0.19403261566125005, "learning_rate": 2.176136260544117e-06, "loss": 0.01, "step": 3165 }, { "epoch": 4.034405861739407, "grad_norm": 0.21627911675251366, "learning_rate": 2.1705935496550045e-06, "loss": 0.0128, "step": 3166 }, { "epoch": 4.035680152914941, "grad_norm": 0.19229530186565, "learning_rate": 2.165057047047426e-06, "loss": 0.0086, "step": 3167 }, { "epoch": 4.036954444090474, "grad_norm": 0.18457044484585938, "learning_rate": 2.1595267571115163e-06, "loss": 0.0089, "step": 3168 }, { "epoch": 4.038228735266008, "grad_norm": 0.19798259725686682, "learning_rate": 2.154002684232489e-06, "loss": 0.011, "step": 3169 }, { "epoch": 4.039503026441542, "grad_norm": 0.17558820493596383, "learning_rate": 2.148484832790619e-06, "loss": 0.0102, "step": 3170 }, { "epoch": 4.040777317617075, "grad_norm": 0.1991163800509066, "learning_rate": 2.1429732071612653e-06, "loss": 0.0092, "step": 3171 }, { "epoch": 4.042051608792609, "grad_norm": 0.1751333113324747, "learning_rate": 2.137467811714826e-06, "loss": 0.0102, "step": 3172 }, { "epoch": 4.043325899968143, "grad_norm": 0.19915661269535945, "learning_rate": 2.1319686508167838e-06, "loss": 0.0123, "step": 3173 }, { "epoch": 4.044600191143676, "grad_norm": 0.1674848742134163, "learning_rate": 2.1264757288276528e-06, "loss": 0.0094, "step": 3174 }, { "epoch": 4.04587448231921, "grad_norm": 0.18097879985931475, "learning_rate": 2.1209890501030238e-06, "loss": 0.0094, "step": 3175 }, { "epoch": 4.047148773494744, "grad_norm": 0.2000924072322688, "learning_rate": 2.1155086189935227e-06, "loss": 0.0134, "step": 3176 }, { "epoch": 4.048423064670277, "grad_norm": 0.20064060082498392, "learning_rate": 2.1100344398448237e-06, "loss": 0.011, "step": 3177 }, { "epoch": 4.049697355845811, "grad_norm": 0.17440551851091488, "learning_rate": 2.104566516997647e-06, "loss": 0.0104, "step": 3178 }, { "epoch": 4.050971647021345, "grad_norm": 0.18857338550041117, "learning_rate": 2.099104854787747e-06, "loss": 0.0114, "step": 3179 }, { "epoch": 4.052245938196878, "grad_norm": 0.18821994791367794, "learning_rate": 2.0936494575459186e-06, "loss": 0.0106, "step": 3180 }, { "epoch": 4.0535202293724115, "grad_norm": 0.18095865422283888, "learning_rate": 2.0882003295979858e-06, "loss": 0.0123, "step": 3181 }, { "epoch": 4.054794520547945, "grad_norm": 0.1770406958025884, "learning_rate": 2.082757475264804e-06, "loss": 0.0117, "step": 3182 }, { "epoch": 4.056068811723478, "grad_norm": 0.1692110653071131, "learning_rate": 2.0773208988622497e-06, "loss": 0.0089, "step": 3183 }, { "epoch": 4.057343102899012, "grad_norm": 0.17825339815462835, "learning_rate": 2.0718906047012245e-06, "loss": 0.0103, "step": 3184 }, { "epoch": 4.058617394074546, "grad_norm": 0.16610682267778265, "learning_rate": 2.0664665970876496e-06, "loss": 0.009, "step": 3185 }, { "epoch": 4.059891685250079, "grad_norm": 0.18024195293410825, "learning_rate": 2.061048880322459e-06, "loss": 0.0079, "step": 3186 }, { "epoch": 4.061165976425613, "grad_norm": 0.20382408261204885, "learning_rate": 2.055637458701599e-06, "loss": 0.0152, "step": 3187 }, { "epoch": 4.062440267601147, "grad_norm": 0.176800940832041, "learning_rate": 2.050232336516025e-06, "loss": 0.0098, "step": 3188 }, { "epoch": 4.06371455877668, "grad_norm": 0.1868453088763031, "learning_rate": 2.0448335180516933e-06, "loss": 0.012, "step": 3189 }, { "epoch": 4.064988849952214, "grad_norm": 0.18429151936539492, "learning_rate": 2.039441007589573e-06, "loss": 0.0123, "step": 3190 }, { "epoch": 4.066263141127748, "grad_norm": 0.19555271561939416, "learning_rate": 2.0340548094056136e-06, "loss": 0.0133, "step": 3191 }, { "epoch": 4.067537432303281, "grad_norm": 0.18915600697840423, "learning_rate": 2.0286749277707783e-06, "loss": 0.0146, "step": 3192 }, { "epoch": 4.068811723478815, "grad_norm": 0.18094754335228552, "learning_rate": 2.0233013669510027e-06, "loss": 0.0091, "step": 3193 }, { "epoch": 4.070086014654349, "grad_norm": 0.192778758343806, "learning_rate": 2.0179341312072275e-06, "loss": 0.0104, "step": 3194 }, { "epoch": 4.071360305829882, "grad_norm": 0.20065982602904692, "learning_rate": 2.012573224795369e-06, "loss": 0.0116, "step": 3195 }, { "epoch": 4.072634597005416, "grad_norm": 0.1754884201077641, "learning_rate": 2.007218651966325e-06, "loss": 0.0114, "step": 3196 }, { "epoch": 4.0739088881809495, "grad_norm": 0.19365754728652068, "learning_rate": 2.0018704169659743e-06, "loss": 0.0123, "step": 3197 }, { "epoch": 4.0751831793564826, "grad_norm": 0.1726981643436029, "learning_rate": 1.996528524035165e-06, "loss": 0.0104, "step": 3198 }, { "epoch": 4.0764574705320165, "grad_norm": 0.1737011044706962, "learning_rate": 1.9911929774097216e-06, "loss": 0.0099, "step": 3199 }, { "epoch": 4.07773176170755, "grad_norm": 0.17870726656438402, "learning_rate": 1.9858637813204352e-06, "loss": 0.009, "step": 3200 }, { "epoch": 4.079006052883083, "grad_norm": 0.18561358107079393, "learning_rate": 1.9805409399930554e-06, "loss": 0.0105, "step": 3201 }, { "epoch": 4.080280344058617, "grad_norm": 0.19289102987305706, "learning_rate": 1.975224457648307e-06, "loss": 0.0101, "step": 3202 }, { "epoch": 4.081554635234151, "grad_norm": 0.20247607340080492, "learning_rate": 1.9699143385018515e-06, "loss": 0.0125, "step": 3203 }, { "epoch": 4.082828926409684, "grad_norm": 0.18657627803954643, "learning_rate": 1.9646105867643285e-06, "loss": 0.0095, "step": 3204 }, { "epoch": 4.084103217585218, "grad_norm": 0.18045038051646387, "learning_rate": 1.9593132066413054e-06, "loss": 0.0087, "step": 3205 }, { "epoch": 4.085377508760752, "grad_norm": 0.1811559554535187, "learning_rate": 1.9540222023333165e-06, "loss": 0.0098, "step": 3206 }, { "epoch": 4.086651799936285, "grad_norm": 0.17441453956193004, "learning_rate": 1.9487375780358297e-06, "loss": 0.0083, "step": 3207 }, { "epoch": 4.087926091111819, "grad_norm": 0.16825269267721416, "learning_rate": 1.9434593379392565e-06, "loss": 0.0085, "step": 3208 }, { "epoch": 4.089200382287353, "grad_norm": 0.18091249403834514, "learning_rate": 1.938187486228945e-06, "loss": 0.0094, "step": 3209 }, { "epoch": 4.090474673462886, "grad_norm": 0.1936192556464192, "learning_rate": 1.932922027085181e-06, "loss": 0.0129, "step": 3210 }, { "epoch": 4.09174896463842, "grad_norm": 0.1793339153938476, "learning_rate": 1.927662964683178e-06, "loss": 0.0091, "step": 3211 }, { "epoch": 4.093023255813954, "grad_norm": 0.1841435575031982, "learning_rate": 1.9224103031930776e-06, "loss": 0.0107, "step": 3212 }, { "epoch": 4.094297546989487, "grad_norm": 0.20583312489927585, "learning_rate": 1.9171640467799478e-06, "loss": 0.0118, "step": 3213 }, { "epoch": 4.095571838165021, "grad_norm": 0.18320834965311478, "learning_rate": 1.9119241996037774e-06, "loss": 0.0099, "step": 3214 }, { "epoch": 4.0968461293405545, "grad_norm": 0.17661071233158288, "learning_rate": 1.906690765819471e-06, "loss": 0.0083, "step": 3215 }, { "epoch": 4.0981204205160875, "grad_norm": 0.20486361050102173, "learning_rate": 1.9014637495768485e-06, "loss": 0.0113, "step": 3216 }, { "epoch": 4.099394711691621, "grad_norm": 0.1958040002507632, "learning_rate": 1.8962431550206427e-06, "loss": 0.0109, "step": 3217 }, { "epoch": 4.100669002867155, "grad_norm": 0.19110355537830265, "learning_rate": 1.8910289862904917e-06, "loss": 0.0135, "step": 3218 }, { "epoch": 4.101943294042689, "grad_norm": 0.2212103095421087, "learning_rate": 1.8858212475209415e-06, "loss": 0.0129, "step": 3219 }, { "epoch": 4.103217585218222, "grad_norm": 0.18534054500696248, "learning_rate": 1.880619942841435e-06, "loss": 0.0105, "step": 3220 }, { "epoch": 4.104491876393756, "grad_norm": 0.16919137097593723, "learning_rate": 1.8754250763763215e-06, "loss": 0.0083, "step": 3221 }, { "epoch": 4.105766167569289, "grad_norm": 0.18169275916383767, "learning_rate": 1.8702366522448322e-06, "loss": 0.0116, "step": 3222 }, { "epoch": 4.107040458744823, "grad_norm": 0.17670071269184068, "learning_rate": 1.8650546745611064e-06, "loss": 0.01, "step": 3223 }, { "epoch": 4.108314749920357, "grad_norm": 0.17991299781843434, "learning_rate": 1.8598791474341516e-06, "loss": 0.0096, "step": 3224 }, { "epoch": 4.109589041095891, "grad_norm": 0.19287667859614271, "learning_rate": 1.8547100749678803e-06, "loss": 0.0112, "step": 3225 }, { "epoch": 4.110863332271424, "grad_norm": 0.1961727118225022, "learning_rate": 1.8495474612610741e-06, "loss": 0.0106, "step": 3226 }, { "epoch": 4.112137623446958, "grad_norm": 0.17801797374541997, "learning_rate": 1.8443913104073984e-06, "loss": 0.009, "step": 3227 }, { "epoch": 4.113411914622492, "grad_norm": 0.16808504181947675, "learning_rate": 1.8392416264953926e-06, "loss": 0.0099, "step": 3228 }, { "epoch": 4.114686205798025, "grad_norm": 0.17891581584105268, "learning_rate": 1.8340984136084672e-06, "loss": 0.0108, "step": 3229 }, { "epoch": 4.115960496973559, "grad_norm": 0.20193755514281966, "learning_rate": 1.8289616758249019e-06, "loss": 0.0118, "step": 3230 }, { "epoch": 4.1172347881490925, "grad_norm": 0.17225206889179204, "learning_rate": 1.823831417217844e-06, "loss": 0.0098, "step": 3231 }, { "epoch": 4.1185090793246255, "grad_norm": 0.1837172019381377, "learning_rate": 1.8187076418552974e-06, "loss": 0.0101, "step": 3232 }, { "epoch": 4.119783370500159, "grad_norm": 0.2014376468377108, "learning_rate": 1.8135903538001399e-06, "loss": 0.0138, "step": 3233 }, { "epoch": 4.121057661675693, "grad_norm": 0.1888847768835253, "learning_rate": 1.808479557110081e-06, "loss": 0.0112, "step": 3234 }, { "epoch": 4.122331952851226, "grad_norm": 0.1637301274006771, "learning_rate": 1.8033752558377104e-06, "loss": 0.0083, "step": 3235 }, { "epoch": 4.12360624402676, "grad_norm": 0.207176007716488, "learning_rate": 1.7982774540304404e-06, "loss": 0.0128, "step": 3236 }, { "epoch": 4.124880535202294, "grad_norm": 0.1745647128174031, "learning_rate": 1.793186155730553e-06, "loss": 0.0104, "step": 3237 }, { "epoch": 4.126154826377827, "grad_norm": 0.19260144975806426, "learning_rate": 1.788101364975159e-06, "loss": 0.0083, "step": 3238 }, { "epoch": 4.127429117553361, "grad_norm": 0.17987017083229132, "learning_rate": 1.7830230857962128e-06, "loss": 0.0123, "step": 3239 }, { "epoch": 4.128703408728895, "grad_norm": 0.2091948583186605, "learning_rate": 1.777951322220508e-06, "loss": 0.0149, "step": 3240 }, { "epoch": 4.129977699904428, "grad_norm": 0.18218339510234066, "learning_rate": 1.7728860782696666e-06, "loss": 0.0087, "step": 3241 }, { "epoch": 4.131251991079962, "grad_norm": 0.18507766109675255, "learning_rate": 1.7678273579601458e-06, "loss": 0.011, "step": 3242 }, { "epoch": 4.132526282255496, "grad_norm": 0.18558391929039683, "learning_rate": 1.7627751653032277e-06, "loss": 0.0085, "step": 3243 }, { "epoch": 4.133800573431029, "grad_norm": 0.18009023727104156, "learning_rate": 1.757729504305018e-06, "loss": 0.0104, "step": 3244 }, { "epoch": 4.135074864606563, "grad_norm": 0.18470664730257158, "learning_rate": 1.752690378966444e-06, "loss": 0.0089, "step": 3245 }, { "epoch": 4.136349155782097, "grad_norm": 0.17468201267499792, "learning_rate": 1.7476577932832507e-06, "loss": 0.012, "step": 3246 }, { "epoch": 4.13762344695763, "grad_norm": 0.1829052287345842, "learning_rate": 1.7426317512459967e-06, "loss": 0.0117, "step": 3247 }, { "epoch": 4.1388977381331635, "grad_norm": 0.19540609803467487, "learning_rate": 1.7376122568400533e-06, "loss": 0.0147, "step": 3248 }, { "epoch": 4.140172029308697, "grad_norm": 0.19857533865237886, "learning_rate": 1.7325993140455966e-06, "loss": 0.0107, "step": 3249 }, { "epoch": 4.1414463204842304, "grad_norm": 0.19305406030997693, "learning_rate": 1.7275929268376124e-06, "loss": 0.0151, "step": 3250 }, { "epoch": 4.142720611659764, "grad_norm": 0.19537355990367683, "learning_rate": 1.722593099185882e-06, "loss": 0.0081, "step": 3251 }, { "epoch": 4.143994902835298, "grad_norm": 0.21219814593224262, "learning_rate": 1.7175998350549972e-06, "loss": 0.0173, "step": 3252 }, { "epoch": 4.145269194010831, "grad_norm": 0.20017375179843983, "learning_rate": 1.7126131384043264e-06, "loss": 0.0098, "step": 3253 }, { "epoch": 4.146543485186365, "grad_norm": 0.20553017602157014, "learning_rate": 1.7076330131880525e-06, "loss": 0.014, "step": 3254 }, { "epoch": 4.147817776361899, "grad_norm": 0.19355861829888468, "learning_rate": 1.7026594633551252e-06, "loss": 0.0121, "step": 3255 }, { "epoch": 4.149092067537432, "grad_norm": 0.18559844926863153, "learning_rate": 1.697692492849299e-06, "loss": 0.0097, "step": 3256 }, { "epoch": 4.150366358712966, "grad_norm": 0.1707147302163719, "learning_rate": 1.6927321056091029e-06, "loss": 0.0096, "step": 3257 }, { "epoch": 4.1516406498885, "grad_norm": 0.17909927225866637, "learning_rate": 1.6877783055678443e-06, "loss": 0.0093, "step": 3258 }, { "epoch": 4.152914941064033, "grad_norm": 0.18410088570577507, "learning_rate": 1.682831096653611e-06, "loss": 0.0088, "step": 3259 }, { "epoch": 4.154189232239567, "grad_norm": 0.18880077990060803, "learning_rate": 1.6778904827892628e-06, "loss": 0.0119, "step": 3260 }, { "epoch": 4.155463523415101, "grad_norm": 0.21616000640763267, "learning_rate": 1.6729564678924304e-06, "loss": 0.0132, "step": 3261 }, { "epoch": 4.156737814590634, "grad_norm": 0.20453374146112277, "learning_rate": 1.6680290558755119e-06, "loss": 0.0121, "step": 3262 }, { "epoch": 4.158012105766168, "grad_norm": 0.1671666301739125, "learning_rate": 1.6631082506456664e-06, "loss": 0.008, "step": 3263 }, { "epoch": 4.1592863969417015, "grad_norm": 0.18952402367850074, "learning_rate": 1.6581940561048249e-06, "loss": 0.0104, "step": 3264 }, { "epoch": 4.160560688117235, "grad_norm": 0.1878836133733596, "learning_rate": 1.6532864761496593e-06, "loss": 0.0095, "step": 3265 }, { "epoch": 4.1618349792927685, "grad_norm": 0.1921155714545867, "learning_rate": 1.6483855146716155e-06, "loss": 0.0102, "step": 3266 }, { "epoch": 4.163109270468302, "grad_norm": 0.19186830838569002, "learning_rate": 1.6434911755568728e-06, "loss": 0.0088, "step": 3267 }, { "epoch": 4.164383561643835, "grad_norm": 0.19815179661872256, "learning_rate": 1.6386034626863744e-06, "loss": 0.0126, "step": 3268 }, { "epoch": 4.165657852819369, "grad_norm": 0.21746629999943076, "learning_rate": 1.6337223799358025e-06, "loss": 0.0114, "step": 3269 }, { "epoch": 4.166932143994903, "grad_norm": 0.18629230360546056, "learning_rate": 1.6288479311755822e-06, "loss": 0.0109, "step": 3270 }, { "epoch": 4.168206435170436, "grad_norm": 0.199030819044895, "learning_rate": 1.6239801202708783e-06, "loss": 0.0114, "step": 3271 }, { "epoch": 4.16948072634597, "grad_norm": 0.18330234107436316, "learning_rate": 1.6191189510815942e-06, "loss": 0.012, "step": 3272 }, { "epoch": 4.170755017521504, "grad_norm": 0.19407279172985098, "learning_rate": 1.614264427462363e-06, "loss": 0.011, "step": 3273 }, { "epoch": 4.172029308697037, "grad_norm": 0.1867579838362995, "learning_rate": 1.6094165532625516e-06, "loss": 0.0125, "step": 3274 }, { "epoch": 4.173303599872571, "grad_norm": 0.19353410416690411, "learning_rate": 1.6045753323262535e-06, "loss": 0.0114, "step": 3275 }, { "epoch": 4.174577891048105, "grad_norm": 0.18759032745543816, "learning_rate": 1.599740768492286e-06, "loss": 0.0125, "step": 3276 }, { "epoch": 4.175852182223638, "grad_norm": 0.1924709446403215, "learning_rate": 1.594912865594188e-06, "loss": 0.0083, "step": 3277 }, { "epoch": 4.177126473399172, "grad_norm": 0.17981426453618707, "learning_rate": 1.5900916274602163e-06, "loss": 0.0091, "step": 3278 }, { "epoch": 4.178400764574706, "grad_norm": 0.18253429645990876, "learning_rate": 1.5852770579133435e-06, "loss": 0.0104, "step": 3279 }, { "epoch": 4.179675055750239, "grad_norm": 0.18619912023493057, "learning_rate": 1.580469160771253e-06, "loss": 0.0128, "step": 3280 }, { "epoch": 4.180949346925773, "grad_norm": 0.19908424523412546, "learning_rate": 1.5756679398463404e-06, "loss": 0.0145, "step": 3281 }, { "epoch": 4.1822236381013065, "grad_norm": 0.18645294096138046, "learning_rate": 1.5708733989457003e-06, "loss": 0.0095, "step": 3282 }, { "epoch": 4.1834979292768395, "grad_norm": 0.18852307915320884, "learning_rate": 1.566085541871145e-06, "loss": 0.0116, "step": 3283 }, { "epoch": 4.184772220452373, "grad_norm": 0.18240371288887994, "learning_rate": 1.5613043724191667e-06, "loss": 0.0117, "step": 3284 }, { "epoch": 4.186046511627907, "grad_norm": 0.1936217433096085, "learning_rate": 1.556529894380976e-06, "loss": 0.0122, "step": 3285 }, { "epoch": 4.18732080280344, "grad_norm": 0.22006034827972454, "learning_rate": 1.5517621115424564e-06, "loss": 0.0131, "step": 3286 }, { "epoch": 4.188595093978974, "grad_norm": 0.18658778781542082, "learning_rate": 1.5470010276842006e-06, "loss": 0.0109, "step": 3287 }, { "epoch": 4.189869385154508, "grad_norm": 0.1829019485946128, "learning_rate": 1.5422466465814801e-06, "loss": 0.0107, "step": 3288 }, { "epoch": 4.191143676330041, "grad_norm": 0.17796256611966693, "learning_rate": 1.5374989720042531e-06, "loss": 0.0112, "step": 3289 }, { "epoch": 4.192417967505575, "grad_norm": 0.17759740943671967, "learning_rate": 1.5327580077171589e-06, "loss": 0.0098, "step": 3290 }, { "epoch": 4.193692258681109, "grad_norm": 0.19183990381946084, "learning_rate": 1.5280237574795186e-06, "loss": 0.0118, "step": 3291 }, { "epoch": 4.194966549856642, "grad_norm": 0.20961972099646708, "learning_rate": 1.5232962250453265e-06, "loss": 0.0137, "step": 3292 }, { "epoch": 4.196240841032176, "grad_norm": 0.18906027009709703, "learning_rate": 1.5185754141632537e-06, "loss": 0.0102, "step": 3293 }, { "epoch": 4.19751513220771, "grad_norm": 0.19261760097691996, "learning_rate": 1.5138613285766335e-06, "loss": 0.0116, "step": 3294 }, { "epoch": 4.198789423383243, "grad_norm": 0.17440882667144392, "learning_rate": 1.5091539720234827e-06, "loss": 0.0103, "step": 3295 }, { "epoch": 4.200063714558777, "grad_norm": 0.1912339926868582, "learning_rate": 1.504453348236461e-06, "loss": 0.0117, "step": 3296 }, { "epoch": 4.201338005734311, "grad_norm": 0.19711323644536002, "learning_rate": 1.499759460942909e-06, "loss": 0.0127, "step": 3297 }, { "epoch": 4.202612296909844, "grad_norm": 0.21364935207166685, "learning_rate": 1.4950723138648083e-06, "loss": 0.014, "step": 3298 }, { "epoch": 4.2038865880853775, "grad_norm": 0.1997049752680904, "learning_rate": 1.4903919107188102e-06, "loss": 0.0089, "step": 3299 }, { "epoch": 4.205160879260911, "grad_norm": 0.18320825830014612, "learning_rate": 1.485718255216212e-06, "loss": 0.0115, "step": 3300 }, { "epoch": 4.206435170436444, "grad_norm": 0.1827270604563539, "learning_rate": 1.481051351062961e-06, "loss": 0.0107, "step": 3301 }, { "epoch": 4.207709461611978, "grad_norm": 0.1913782137065129, "learning_rate": 1.4763912019596505e-06, "loss": 0.0103, "step": 3302 }, { "epoch": 4.208983752787512, "grad_norm": 0.18084120536764534, "learning_rate": 1.471737811601519e-06, "loss": 0.0104, "step": 3303 }, { "epoch": 4.210258043963045, "grad_norm": 0.18948272875453798, "learning_rate": 1.467091183678444e-06, "loss": 0.0098, "step": 3304 }, { "epoch": 4.211532335138579, "grad_norm": 0.1959894913793582, "learning_rate": 1.4624513218749415e-06, "loss": 0.0127, "step": 3305 }, { "epoch": 4.212806626314113, "grad_norm": 0.17681047868197394, "learning_rate": 1.4578182298701637e-06, "loss": 0.0109, "step": 3306 }, { "epoch": 4.214080917489646, "grad_norm": 0.19063133665168003, "learning_rate": 1.4531919113378923e-06, "loss": 0.0103, "step": 3307 }, { "epoch": 4.21535520866518, "grad_norm": 0.18777578114198912, "learning_rate": 1.4485723699465392e-06, "loss": 0.0104, "step": 3308 }, { "epoch": 4.216629499840714, "grad_norm": 0.18386934876904265, "learning_rate": 1.4439596093591434e-06, "loss": 0.0104, "step": 3309 }, { "epoch": 4.217903791016247, "grad_norm": 0.21648461471254032, "learning_rate": 1.4393536332333658e-06, "loss": 0.0156, "step": 3310 }, { "epoch": 4.219178082191781, "grad_norm": 0.1948703943241203, "learning_rate": 1.4347544452214869e-06, "loss": 0.0107, "step": 3311 }, { "epoch": 4.220452373367315, "grad_norm": 0.20060024768593662, "learning_rate": 1.4301620489704072e-06, "loss": 0.0106, "step": 3312 }, { "epoch": 4.221726664542848, "grad_norm": 0.16548094567790508, "learning_rate": 1.4255764481216372e-06, "loss": 0.0091, "step": 3313 }, { "epoch": 4.223000955718382, "grad_norm": 0.17418789341627444, "learning_rate": 1.4209976463113085e-06, "loss": 0.009, "step": 3314 }, { "epoch": 4.2242752468939155, "grad_norm": 0.19404654983337952, "learning_rate": 1.4164256471701455e-06, "loss": 0.0083, "step": 3315 }, { "epoch": 4.2255495380694486, "grad_norm": 0.18008623165797186, "learning_rate": 1.411860454323497e-06, "loss": 0.0103, "step": 3316 }, { "epoch": 4.2268238292449825, "grad_norm": 0.20102790310523605, "learning_rate": 1.4073020713912988e-06, "loss": 0.0132, "step": 3317 }, { "epoch": 4.228098120420516, "grad_norm": 0.19205443296023775, "learning_rate": 1.4027505019880972e-06, "loss": 0.011, "step": 3318 }, { "epoch": 4.229372411596049, "grad_norm": 0.18481444821402015, "learning_rate": 1.3982057497230328e-06, "loss": 0.0092, "step": 3319 }, { "epoch": 4.230646702771583, "grad_norm": 0.17843942136897198, "learning_rate": 1.3936678181998376e-06, "loss": 0.0115, "step": 3320 }, { "epoch": 4.231920993947117, "grad_norm": 0.1987090482936114, "learning_rate": 1.3891367110168397e-06, "loss": 0.0114, "step": 3321 }, { "epoch": 4.23319528512265, "grad_norm": 0.1898940892422021, "learning_rate": 1.3846124317669518e-06, "loss": 0.0123, "step": 3322 }, { "epoch": 4.234469576298184, "grad_norm": 0.19625218465835617, "learning_rate": 1.3800949840376766e-06, "loss": 0.0115, "step": 3323 }, { "epoch": 4.235743867473718, "grad_norm": 0.19203167425468562, "learning_rate": 1.3755843714110951e-06, "loss": 0.0113, "step": 3324 }, { "epoch": 4.237018158649251, "grad_norm": 0.17877529703851222, "learning_rate": 1.3710805974638697e-06, "loss": 0.0078, "step": 3325 }, { "epoch": 4.238292449824785, "grad_norm": 0.22167895804838408, "learning_rate": 1.3665836657672493e-06, "loss": 0.0114, "step": 3326 }, { "epoch": 4.239566741000319, "grad_norm": 0.18956258692381206, "learning_rate": 1.3620935798870383e-06, "loss": 0.0119, "step": 3327 }, { "epoch": 4.240841032175852, "grad_norm": 0.20804110737518328, "learning_rate": 1.357610343383634e-06, "loss": 0.0142, "step": 3328 }, { "epoch": 4.242115323351386, "grad_norm": 0.20561093373880532, "learning_rate": 1.353133959811983e-06, "loss": 0.0122, "step": 3329 }, { "epoch": 4.24338961452692, "grad_norm": 0.20978423436310278, "learning_rate": 1.3486644327216126e-06, "loss": 0.0149, "step": 3330 }, { "epoch": 4.244663905702453, "grad_norm": 0.19775577144019044, "learning_rate": 1.3442017656566086e-06, "loss": 0.0118, "step": 3331 }, { "epoch": 4.245938196877987, "grad_norm": 0.2409946643749105, "learning_rate": 1.339745962155613e-06, "loss": 0.018, "step": 3332 }, { "epoch": 4.2472124880535205, "grad_norm": 0.18353838830655367, "learning_rate": 1.3352970257518316e-06, "loss": 0.0102, "step": 3333 }, { "epoch": 4.2484867792290535, "grad_norm": 0.20284597147608677, "learning_rate": 1.3308549599730213e-06, "loss": 0.0128, "step": 3334 }, { "epoch": 4.249761070404587, "grad_norm": 0.17925297590868838, "learning_rate": 1.3264197683414915e-06, "loss": 0.0117, "step": 3335 }, { "epoch": 4.251035361580121, "grad_norm": 0.18299431730546883, "learning_rate": 1.321991454374101e-06, "loss": 0.0091, "step": 3336 }, { "epoch": 4.252309652755654, "grad_norm": 0.17755979172728556, "learning_rate": 1.3175700215822552e-06, "loss": 0.0099, "step": 3337 }, { "epoch": 4.253583943931188, "grad_norm": 0.18117216321015217, "learning_rate": 1.3131554734719032e-06, "loss": 0.0097, "step": 3338 }, { "epoch": 4.254858235106722, "grad_norm": 0.17877222985914137, "learning_rate": 1.3087478135435361e-06, "loss": 0.0118, "step": 3339 }, { "epoch": 4.256132526282255, "grad_norm": 0.1962930531568466, "learning_rate": 1.3043470452921802e-06, "loss": 0.01, "step": 3340 }, { "epoch": 4.257406817457789, "grad_norm": 0.20601091491766926, "learning_rate": 1.2999531722073988e-06, "loss": 0.0154, "step": 3341 }, { "epoch": 4.258681108633323, "grad_norm": 0.19176812550326072, "learning_rate": 1.2955661977732892e-06, "loss": 0.0092, "step": 3342 }, { "epoch": 4.259955399808856, "grad_norm": 0.21084761471540325, "learning_rate": 1.2911861254684744e-06, "loss": 0.0101, "step": 3343 }, { "epoch": 4.26122969098439, "grad_norm": 0.20077174010912746, "learning_rate": 1.286812958766106e-06, "loss": 0.0113, "step": 3344 }, { "epoch": 4.262503982159924, "grad_norm": 0.19042756778339603, "learning_rate": 1.282446701133867e-06, "loss": 0.0111, "step": 3345 }, { "epoch": 4.263778273335457, "grad_norm": 0.1856061634304757, "learning_rate": 1.278087356033947e-06, "loss": 0.0086, "step": 3346 }, { "epoch": 4.265052564510991, "grad_norm": 0.17225847399155497, "learning_rate": 1.2737349269230713e-06, "loss": 0.0108, "step": 3347 }, { "epoch": 4.266326855686525, "grad_norm": 0.2011585649940464, "learning_rate": 1.2693894172524646e-06, "loss": 0.0151, "step": 3348 }, { "epoch": 4.267601146862058, "grad_norm": 0.20706582761022824, "learning_rate": 1.2650508304678789e-06, "loss": 0.0119, "step": 3349 }, { "epoch": 4.2688754380375915, "grad_norm": 0.20662270375419292, "learning_rate": 1.2607191700095689e-06, "loss": 0.0132, "step": 3350 }, { "epoch": 4.270149729213125, "grad_norm": 0.19944815070607128, "learning_rate": 1.2563944393122996e-06, "loss": 0.0092, "step": 3351 }, { "epoch": 4.271424020388658, "grad_norm": 0.18549423327402445, "learning_rate": 1.2520766418053408e-06, "loss": 0.0113, "step": 3352 }, { "epoch": 4.272698311564192, "grad_norm": 0.18988876205345576, "learning_rate": 1.2477657809124632e-06, "loss": 0.0086, "step": 3353 }, { "epoch": 4.273972602739726, "grad_norm": 0.20422679287157486, "learning_rate": 1.24346186005194e-06, "loss": 0.0123, "step": 3354 }, { "epoch": 4.275246893915259, "grad_norm": 0.20759855722253587, "learning_rate": 1.23916488263654e-06, "loss": 0.0078, "step": 3355 }, { "epoch": 4.276521185090793, "grad_norm": 0.21115656301126406, "learning_rate": 1.2348748520735221e-06, "loss": 0.0134, "step": 3356 }, { "epoch": 4.277795476266327, "grad_norm": 0.19930861855770612, "learning_rate": 1.2305917717646476e-06, "loss": 0.0116, "step": 3357 }, { "epoch": 4.27906976744186, "grad_norm": 0.19106858630826043, "learning_rate": 1.226315645106152e-06, "loss": 0.0099, "step": 3358 }, { "epoch": 4.280344058617394, "grad_norm": 0.1851867510794748, "learning_rate": 1.2220464754887728e-06, "loss": 0.0098, "step": 3359 }, { "epoch": 4.281618349792928, "grad_norm": 0.1779862443293049, "learning_rate": 1.2177842662977136e-06, "loss": 0.0095, "step": 3360 }, { "epoch": 4.282892640968461, "grad_norm": 0.20598714635535295, "learning_rate": 1.213529020912676e-06, "loss": 0.012, "step": 3361 }, { "epoch": 4.284166932143995, "grad_norm": 0.1895012640022538, "learning_rate": 1.209280742707828e-06, "loss": 0.0133, "step": 3362 }, { "epoch": 4.285441223319529, "grad_norm": 0.18102886737273757, "learning_rate": 1.205039435051817e-06, "loss": 0.0102, "step": 3363 }, { "epoch": 4.286715514495062, "grad_norm": 0.19642484508214764, "learning_rate": 1.2008051013077626e-06, "loss": 0.0106, "step": 3364 }, { "epoch": 4.287989805670596, "grad_norm": 0.1735215421692016, "learning_rate": 1.196577744833256e-06, "loss": 0.0105, "step": 3365 }, { "epoch": 4.2892640968461295, "grad_norm": 0.18076375049424692, "learning_rate": 1.1923573689803525e-06, "loss": 0.011, "step": 3366 }, { "epoch": 4.2905383880216625, "grad_norm": 0.19343616778172576, "learning_rate": 1.188143977095576e-06, "loss": 0.0105, "step": 3367 }, { "epoch": 4.2918126791971964, "grad_norm": 0.18509526628806006, "learning_rate": 1.1839375725199098e-06, "loss": 0.0086, "step": 3368 }, { "epoch": 4.29308697037273, "grad_norm": 0.21185079095147413, "learning_rate": 1.1797381585887978e-06, "loss": 0.0123, "step": 3369 }, { "epoch": 4.294361261548263, "grad_norm": 0.22181199639235175, "learning_rate": 1.17554573863214e-06, "loss": 0.0176, "step": 3370 }, { "epoch": 4.295635552723797, "grad_norm": 0.17570436372210946, "learning_rate": 1.1713603159742914e-06, "loss": 0.0078, "step": 3371 }, { "epoch": 4.296909843899331, "grad_norm": 0.19068580424533427, "learning_rate": 1.1671818939340585e-06, "loss": 0.0132, "step": 3372 }, { "epoch": 4.298184135074864, "grad_norm": 0.19047491873496866, "learning_rate": 1.1630104758246951e-06, "loss": 0.0099, "step": 3373 }, { "epoch": 4.299458426250398, "grad_norm": 0.19753236219765147, "learning_rate": 1.1588460649539036e-06, "loss": 0.011, "step": 3374 }, { "epoch": 4.300732717425932, "grad_norm": 0.19797187429853144, "learning_rate": 1.1546886646238265e-06, "loss": 0.0149, "step": 3375 }, { "epoch": 4.302007008601466, "grad_norm": 0.18815373160820775, "learning_rate": 1.1505382781310559e-06, "loss": 0.0099, "step": 3376 }, { "epoch": 4.303281299776999, "grad_norm": 0.18322170161916418, "learning_rate": 1.1463949087666081e-06, "loss": 0.0091, "step": 3377 }, { "epoch": 4.304555590952533, "grad_norm": 0.1862545080034151, "learning_rate": 1.1422585598159519e-06, "loss": 0.0108, "step": 3378 }, { "epoch": 4.305829882128066, "grad_norm": 0.20354991170447462, "learning_rate": 1.1381292345589722e-06, "loss": 0.0119, "step": 3379 }, { "epoch": 4.3071041733036, "grad_norm": 0.1745345592754981, "learning_rate": 1.134006936269999e-06, "loss": 0.009, "step": 3380 }, { "epoch": 4.308378464479134, "grad_norm": 0.18564501271718406, "learning_rate": 1.129891668217783e-06, "loss": 0.0104, "step": 3381 }, { "epoch": 4.3096527556546675, "grad_norm": 0.21250764954850146, "learning_rate": 1.1257834336655005e-06, "loss": 0.0155, "step": 3382 }, { "epoch": 4.310927046830201, "grad_norm": 0.21463506031855267, "learning_rate": 1.1216822358707535e-06, "loss": 0.0152, "step": 3383 }, { "epoch": 4.3122013380057345, "grad_norm": 0.19012858275272454, "learning_rate": 1.1175880780855608e-06, "loss": 0.0118, "step": 3384 }, { "epoch": 4.3134756291812675, "grad_norm": 0.18979365473937687, "learning_rate": 1.113500963556361e-06, "loss": 0.0127, "step": 3385 }, { "epoch": 4.314749920356801, "grad_norm": 0.187336069716658, "learning_rate": 1.1094208955240083e-06, "loss": 0.0104, "step": 3386 }, { "epoch": 4.316024211532335, "grad_norm": 0.20213222708509823, "learning_rate": 1.105347877223768e-06, "loss": 0.0104, "step": 3387 }, { "epoch": 4.317298502707869, "grad_norm": 0.1726316165509515, "learning_rate": 1.1012819118853147e-06, "loss": 0.0082, "step": 3388 }, { "epoch": 4.318572793883402, "grad_norm": 0.18721023996706643, "learning_rate": 1.0972230027327335e-06, "loss": 0.0096, "step": 3389 }, { "epoch": 4.319847085058936, "grad_norm": 0.19379253088785547, "learning_rate": 1.0931711529845112e-06, "loss": 0.011, "step": 3390 }, { "epoch": 4.321121376234469, "grad_norm": 0.18867710003295896, "learning_rate": 1.089126365853539e-06, "loss": 0.011, "step": 3391 }, { "epoch": 4.322395667410003, "grad_norm": 0.19462175964916378, "learning_rate": 1.0850886445471055e-06, "loss": 0.0114, "step": 3392 }, { "epoch": 4.323669958585537, "grad_norm": 0.21127621770358024, "learning_rate": 1.0810579922668996e-06, "loss": 0.0106, "step": 3393 }, { "epoch": 4.324944249761071, "grad_norm": 0.2003411639724992, "learning_rate": 1.0770344122089994e-06, "loss": 0.011, "step": 3394 }, { "epoch": 4.326218540936604, "grad_norm": 0.17959411615806023, "learning_rate": 1.073017907563887e-06, "loss": 0.0087, "step": 3395 }, { "epoch": 4.327492832112138, "grad_norm": 0.21258877182627473, "learning_rate": 1.0690084815164159e-06, "loss": 0.0156, "step": 3396 }, { "epoch": 4.328767123287671, "grad_norm": 0.19604000922952264, "learning_rate": 1.0650061372458465e-06, "loss": 0.0123, "step": 3397 }, { "epoch": 4.330041414463205, "grad_norm": 0.18996962606593468, "learning_rate": 1.0610108779258043e-06, "loss": 0.0108, "step": 3398 }, { "epoch": 4.331315705638739, "grad_norm": 0.1862025035046483, "learning_rate": 1.0570227067243144e-06, "loss": 0.0096, "step": 3399 }, { "epoch": 4.3325899968142725, "grad_norm": 0.17598766686537914, "learning_rate": 1.0530416268037702e-06, "loss": 0.009, "step": 3400 }, { "epoch": 4.3338642879898055, "grad_norm": 0.18818974345683423, "learning_rate": 1.0490676413209466e-06, "loss": 0.0097, "step": 3401 }, { "epoch": 4.335138579165339, "grad_norm": 0.17618407102782094, "learning_rate": 1.0451007534269908e-06, "loss": 0.0085, "step": 3402 }, { "epoch": 4.336412870340873, "grad_norm": 0.18608959039457185, "learning_rate": 1.0411409662674232e-06, "loss": 0.0086, "step": 3403 }, { "epoch": 4.337687161516406, "grad_norm": 0.20451902004986003, "learning_rate": 1.0371882829821345e-06, "loss": 0.0113, "step": 3404 }, { "epoch": 4.33896145269194, "grad_norm": 0.19102661755656145, "learning_rate": 1.0332427067053796e-06, "loss": 0.0108, "step": 3405 }, { "epoch": 4.340235743867474, "grad_norm": 0.19058532807985923, "learning_rate": 1.0293042405657805e-06, "loss": 0.0098, "step": 3406 }, { "epoch": 4.341510035043007, "grad_norm": 0.18332864272509208, "learning_rate": 1.0253728876863256e-06, "loss": 0.01, "step": 3407 }, { "epoch": 4.342784326218541, "grad_norm": 0.18497651650694158, "learning_rate": 1.0214486511843492e-06, "loss": 0.0093, "step": 3408 }, { "epoch": 4.344058617394075, "grad_norm": 0.18485059524868622, "learning_rate": 1.0175315341715598e-06, "loss": 0.0079, "step": 3409 }, { "epoch": 4.345332908569608, "grad_norm": 0.17442482477936008, "learning_rate": 1.013621539754004e-06, "loss": 0.0097, "step": 3410 }, { "epoch": 4.346607199745142, "grad_norm": 0.2098511241186144, "learning_rate": 1.009718671032095e-06, "loss": 0.013, "step": 3411 }, { "epoch": 4.347881490920676, "grad_norm": 0.2018412523967623, "learning_rate": 1.0058229311005873e-06, "loss": 0.015, "step": 3412 }, { "epoch": 4.349155782096209, "grad_norm": 0.19949497377951833, "learning_rate": 1.0019343230485856e-06, "loss": 0.0121, "step": 3413 }, { "epoch": 4.350430073271743, "grad_norm": 0.18835836418456303, "learning_rate": 9.980528499595377e-07, "loss": 0.0133, "step": 3414 }, { "epoch": 4.351704364447277, "grad_norm": 0.21452120096931793, "learning_rate": 9.941785149112359e-07, "loss": 0.0103, "step": 3415 }, { "epoch": 4.35297865562281, "grad_norm": 0.20466238657968155, "learning_rate": 9.903113209758098e-07, "loss": 0.0106, "step": 3416 }, { "epoch": 4.3542529467983435, "grad_norm": 0.19116948845770856, "learning_rate": 9.864512712197283e-07, "loss": 0.0102, "step": 3417 }, { "epoch": 4.355527237973877, "grad_norm": 0.18689593564685644, "learning_rate": 9.825983687037954e-07, "loss": 0.0132, "step": 3418 }, { "epoch": 4.35680152914941, "grad_norm": 0.17438838205023985, "learning_rate": 9.787526164831473e-07, "loss": 0.0083, "step": 3419 }, { "epoch": 4.358075820324944, "grad_norm": 0.20258324291249094, "learning_rate": 9.749140176072503e-07, "loss": 0.0129, "step": 3420 }, { "epoch": 4.359350111500478, "grad_norm": 0.1870972682860878, "learning_rate": 9.710825751198993e-07, "loss": 0.0108, "step": 3421 }, { "epoch": 4.360624402676011, "grad_norm": 0.1839989390315373, "learning_rate": 9.672582920592133e-07, "loss": 0.0129, "step": 3422 }, { "epoch": 4.361898693851545, "grad_norm": 0.1843556015344246, "learning_rate": 9.634411714576353e-07, "loss": 0.0114, "step": 3423 }, { "epoch": 4.363172985027079, "grad_norm": 0.1893933652647473, "learning_rate": 9.596312163419275e-07, "loss": 0.0104, "step": 3424 }, { "epoch": 4.364447276202612, "grad_norm": 0.18252190839009835, "learning_rate": 9.55828429733171e-07, "loss": 0.0097, "step": 3425 }, { "epoch": 4.365721567378146, "grad_norm": 0.17147445140330983, "learning_rate": 9.520328146467672e-07, "loss": 0.0085, "step": 3426 }, { "epoch": 4.36699585855368, "grad_norm": 0.21056142994327728, "learning_rate": 9.482443740924197e-07, "loss": 0.0139, "step": 3427 }, { "epoch": 4.368270149729213, "grad_norm": 0.18430656981064794, "learning_rate": 9.444631110741586e-07, "loss": 0.011, "step": 3428 }, { "epoch": 4.369544440904747, "grad_norm": 0.20700799575275086, "learning_rate": 9.406890285903047e-07, "loss": 0.009, "step": 3429 }, { "epoch": 4.370818732080281, "grad_norm": 0.17659891664855085, "learning_rate": 9.369221296335007e-07, "loss": 0.0084, "step": 3430 }, { "epoch": 4.372093023255814, "grad_norm": 0.23295448282027784, "learning_rate": 9.33162417190685e-07, "loss": 0.015, "step": 3431 }, { "epoch": 4.373367314431348, "grad_norm": 0.17805548732819093, "learning_rate": 9.294098942430996e-07, "loss": 0.0096, "step": 3432 }, { "epoch": 4.3746416056068815, "grad_norm": 0.18527834353227943, "learning_rate": 9.256645637662854e-07, "loss": 0.0099, "step": 3433 }, { "epoch": 4.3759158967824145, "grad_norm": 0.18450821473797008, "learning_rate": 9.2192642873008e-07, "loss": 0.0112, "step": 3434 }, { "epoch": 4.3771901879579485, "grad_norm": 0.19661392080851864, "learning_rate": 9.181954920986147e-07, "loss": 0.0121, "step": 3435 }, { "epoch": 4.378464479133482, "grad_norm": 0.179108486483961, "learning_rate": 9.144717568303141e-07, "loss": 0.0077, "step": 3436 }, { "epoch": 4.379738770309015, "grad_norm": 0.19790996666061403, "learning_rate": 9.107552258778907e-07, "loss": 0.0124, "step": 3437 }, { "epoch": 4.381013061484549, "grad_norm": 0.2061838527284871, "learning_rate": 9.070459021883516e-07, "loss": 0.011, "step": 3438 }, { "epoch": 4.382287352660083, "grad_norm": 0.19383088485462174, "learning_rate": 9.033437887029739e-07, "loss": 0.0086, "step": 3439 }, { "epoch": 4.383561643835616, "grad_norm": 0.18858412084496035, "learning_rate": 8.996488883573351e-07, "loss": 0.009, "step": 3440 }, { "epoch": 4.38483593501115, "grad_norm": 0.19823098878461157, "learning_rate": 8.959612040812782e-07, "loss": 0.0107, "step": 3441 }, { "epoch": 4.386110226186684, "grad_norm": 0.1746336336716844, "learning_rate": 8.922807387989341e-07, "loss": 0.008, "step": 3442 }, { "epoch": 4.387384517362217, "grad_norm": 0.1973416633013931, "learning_rate": 8.886074954287049e-07, "loss": 0.0114, "step": 3443 }, { "epoch": 4.388658808537751, "grad_norm": 0.1934840773926233, "learning_rate": 8.849414768832687e-07, "loss": 0.0117, "step": 3444 }, { "epoch": 4.389933099713285, "grad_norm": 0.19152569870662933, "learning_rate": 8.812826860695712e-07, "loss": 0.0097, "step": 3445 }, { "epoch": 4.391207390888818, "grad_norm": 0.19612434343722596, "learning_rate": 8.776311258888303e-07, "loss": 0.0121, "step": 3446 }, { "epoch": 4.392481682064352, "grad_norm": 0.17799806258669434, "learning_rate": 8.739867992365281e-07, "loss": 0.0097, "step": 3447 }, { "epoch": 4.393755973239886, "grad_norm": 0.17904966167793832, "learning_rate": 8.703497090024116e-07, "loss": 0.0088, "step": 3448 }, { "epoch": 4.395030264415419, "grad_norm": 0.2043167058601488, "learning_rate": 8.667198580704916e-07, "loss": 0.0134, "step": 3449 }, { "epoch": 4.396304555590953, "grad_norm": 0.19918616259246177, "learning_rate": 8.630972493190359e-07, "loss": 0.0136, "step": 3450 }, { "epoch": 4.3975788467664865, "grad_norm": 0.1914715009108661, "learning_rate": 8.5948188562057e-07, "loss": 0.0143, "step": 3451 }, { "epoch": 4.3988531379420195, "grad_norm": 0.20013735708968658, "learning_rate": 8.558737698418762e-07, "loss": 0.0118, "step": 3452 }, { "epoch": 4.400127429117553, "grad_norm": 0.18170905628864642, "learning_rate": 8.522729048439881e-07, "loss": 0.0105, "step": 3453 }, { "epoch": 4.401401720293087, "grad_norm": 0.18272440155242714, "learning_rate": 8.486792934821908e-07, "loss": 0.0118, "step": 3454 }, { "epoch": 4.40267601146862, "grad_norm": 0.20832452624452077, "learning_rate": 8.450929386060169e-07, "loss": 0.0153, "step": 3455 }, { "epoch": 4.403950302644154, "grad_norm": 0.19977292623404086, "learning_rate": 8.415138430592429e-07, "loss": 0.0144, "step": 3456 }, { "epoch": 4.405224593819688, "grad_norm": 0.19844905471979404, "learning_rate": 8.379420096798973e-07, "loss": 0.009, "step": 3457 }, { "epoch": 4.406498884995221, "grad_norm": 0.18211015876409317, "learning_rate": 8.343774413002382e-07, "loss": 0.0077, "step": 3458 }, { "epoch": 4.407773176170755, "grad_norm": 0.20767114566131856, "learning_rate": 8.308201407467742e-07, "loss": 0.0108, "step": 3459 }, { "epoch": 4.409047467346289, "grad_norm": 0.20747327597787632, "learning_rate": 8.272701108402392e-07, "loss": 0.01, "step": 3460 }, { "epoch": 4.410321758521822, "grad_norm": 0.18380363607223674, "learning_rate": 8.237273543956147e-07, "loss": 0.0094, "step": 3461 }, { "epoch": 4.411596049697356, "grad_norm": 0.20564614532155895, "learning_rate": 8.20191874222106e-07, "loss": 0.0149, "step": 3462 }, { "epoch": 4.41287034087289, "grad_norm": 0.21352522192596915, "learning_rate": 8.166636731231514e-07, "loss": 0.0116, "step": 3463 }, { "epoch": 4.414144632048423, "grad_norm": 0.18081181216190306, "learning_rate": 8.131427538964165e-07, "loss": 0.0092, "step": 3464 }, { "epoch": 4.415418923223957, "grad_norm": 0.1767031111768963, "learning_rate": 8.096291193337935e-07, "loss": 0.0096, "step": 3465 }, { "epoch": 4.416693214399491, "grad_norm": 0.19345666145575643, "learning_rate": 8.061227722213993e-07, "loss": 0.0125, "step": 3466 }, { "epoch": 4.417967505575024, "grad_norm": 0.2006634413488598, "learning_rate": 8.026237153395688e-07, "loss": 0.0119, "step": 3467 }, { "epoch": 4.4192417967505575, "grad_norm": 0.19673972135725876, "learning_rate": 7.991319514628593e-07, "loss": 0.0111, "step": 3468 }, { "epoch": 4.420516087926091, "grad_norm": 0.18324447921589312, "learning_rate": 7.95647483360048e-07, "loss": 0.0084, "step": 3469 }, { "epoch": 4.421790379101624, "grad_norm": 0.1810182572489793, "learning_rate": 7.921703137941172e-07, "loss": 0.0118, "step": 3470 }, { "epoch": 4.423064670277158, "grad_norm": 0.1825110567617673, "learning_rate": 7.887004455222735e-07, "loss": 0.0095, "step": 3471 }, { "epoch": 4.424338961452692, "grad_norm": 0.22631363999046927, "learning_rate": 7.852378812959227e-07, "loss": 0.0131, "step": 3472 }, { "epoch": 4.425613252628225, "grad_norm": 0.21976826729382123, "learning_rate": 7.817826238606896e-07, "loss": 0.0152, "step": 3473 }, { "epoch": 4.426887543803759, "grad_norm": 0.16795263996873247, "learning_rate": 7.783346759563992e-07, "loss": 0.0092, "step": 3474 }, { "epoch": 4.428161834979293, "grad_norm": 0.1923404340013452, "learning_rate": 7.74894040317079e-07, "loss": 0.0091, "step": 3475 }, { "epoch": 4.429436126154826, "grad_norm": 0.17555728098253662, "learning_rate": 7.714607196709633e-07, "loss": 0.0097, "step": 3476 }, { "epoch": 4.43071041733036, "grad_norm": 0.17715502884165732, "learning_rate": 7.680347167404811e-07, "loss": 0.0088, "step": 3477 }, { "epoch": 4.431984708505894, "grad_norm": 0.20594509868490343, "learning_rate": 7.646160342422637e-07, "loss": 0.0138, "step": 3478 }, { "epoch": 4.433258999681427, "grad_norm": 0.20057421441935538, "learning_rate": 7.612046748871327e-07, "loss": 0.0112, "step": 3479 }, { "epoch": 4.434533290856961, "grad_norm": 0.18970137507882823, "learning_rate": 7.578006413801076e-07, "loss": 0.0088, "step": 3480 }, { "epoch": 4.435807582032495, "grad_norm": 0.2004997659481339, "learning_rate": 7.544039364203937e-07, "loss": 0.0141, "step": 3481 }, { "epoch": 4.437081873208028, "grad_norm": 0.16681672750632648, "learning_rate": 7.510145627013909e-07, "loss": 0.01, "step": 3482 }, { "epoch": 4.438356164383562, "grad_norm": 0.18258870167901756, "learning_rate": 7.476325229106818e-07, "loss": 0.0099, "step": 3483 }, { "epoch": 4.4396304555590955, "grad_norm": 0.18303745761557175, "learning_rate": 7.442578197300355e-07, "loss": 0.0089, "step": 3484 }, { "epoch": 4.4409047467346285, "grad_norm": 0.19117692273016967, "learning_rate": 7.408904558354024e-07, "loss": 0.0125, "step": 3485 }, { "epoch": 4.442179037910162, "grad_norm": 0.1972470016962745, "learning_rate": 7.375304338969135e-07, "loss": 0.0122, "step": 3486 }, { "epoch": 4.443453329085696, "grad_norm": 0.17886052804039484, "learning_rate": 7.341777565788766e-07, "loss": 0.0091, "step": 3487 }, { "epoch": 4.444727620261229, "grad_norm": 0.1820063995983355, "learning_rate": 7.308324265397837e-07, "loss": 0.0094, "step": 3488 }, { "epoch": 4.446001911436763, "grad_norm": 0.2011961458242377, "learning_rate": 7.274944464322864e-07, "loss": 0.0132, "step": 3489 }, { "epoch": 4.447276202612297, "grad_norm": 0.18466763325107532, "learning_rate": 7.241638189032241e-07, "loss": 0.0101, "step": 3490 }, { "epoch": 4.44855049378783, "grad_norm": 0.18694330980048351, "learning_rate": 7.208405465935897e-07, "loss": 0.009, "step": 3491 }, { "epoch": 4.449824784963364, "grad_norm": 0.1813591480608024, "learning_rate": 7.175246321385587e-07, "loss": 0.0086, "step": 3492 }, { "epoch": 4.451099076138898, "grad_norm": 0.2004741795233097, "learning_rate": 7.142160781674645e-07, "loss": 0.0118, "step": 3493 }, { "epoch": 4.452373367314431, "grad_norm": 0.1912390761279818, "learning_rate": 7.109148873038041e-07, "loss": 0.0099, "step": 3494 }, { "epoch": 4.453647658489965, "grad_norm": 0.2033468328370331, "learning_rate": 7.07621062165239e-07, "loss": 0.0126, "step": 3495 }, { "epoch": 4.454921949665499, "grad_norm": 0.18739897352618687, "learning_rate": 7.043346053635869e-07, "loss": 0.0101, "step": 3496 }, { "epoch": 4.456196240841032, "grad_norm": 0.18880126345497125, "learning_rate": 7.010555195048241e-07, "loss": 0.0092, "step": 3497 }, { "epoch": 4.457470532016566, "grad_norm": 0.19683941750893824, "learning_rate": 6.977838071890842e-07, "loss": 0.0101, "step": 3498 }, { "epoch": 4.4587448231921, "grad_norm": 0.17876489188885397, "learning_rate": 6.945194710106485e-07, "loss": 0.0079, "step": 3499 }, { "epoch": 4.460019114367633, "grad_norm": 0.1801905909037071, "learning_rate": 6.912625135579587e-07, "loss": 0.0093, "step": 3500 }, { "epoch": 4.461293405543167, "grad_norm": 0.21099865940476645, "learning_rate": 6.880129374135935e-07, "loss": 0.012, "step": 3501 }, { "epoch": 4.4625676967187005, "grad_norm": 0.202521490203397, "learning_rate": 6.847707451542918e-07, "loss": 0.0117, "step": 3502 }, { "epoch": 4.4638419878942335, "grad_norm": 0.19375936190778387, "learning_rate": 6.815359393509235e-07, "loss": 0.0118, "step": 3503 }, { "epoch": 4.465116279069767, "grad_norm": 0.20196256122336106, "learning_rate": 6.783085225685149e-07, "loss": 0.0155, "step": 3504 }, { "epoch": 4.466390570245301, "grad_norm": 0.18575336562961217, "learning_rate": 6.750884973662242e-07, "loss": 0.0098, "step": 3505 }, { "epoch": 4.467664861420834, "grad_norm": 0.18792819374588035, "learning_rate": 6.718758662973524e-07, "loss": 0.0136, "step": 3506 }, { "epoch": 4.468939152596368, "grad_norm": 0.2036611629278816, "learning_rate": 6.68670631909335e-07, "loss": 0.0115, "step": 3507 }, { "epoch": 4.470213443771902, "grad_norm": 0.17271363825241937, "learning_rate": 6.654727967437446e-07, "loss": 0.0075, "step": 3508 }, { "epoch": 4.471487734947435, "grad_norm": 0.19811273939824572, "learning_rate": 6.622823633362852e-07, "loss": 0.0116, "step": 3509 }, { "epoch": 4.472762026122969, "grad_norm": 0.19330721044358204, "learning_rate": 6.590993342167929e-07, "loss": 0.0092, "step": 3510 }, { "epoch": 4.474036317298503, "grad_norm": 0.2033248675210619, "learning_rate": 6.55923711909231e-07, "loss": 0.0118, "step": 3511 }, { "epoch": 4.475310608474036, "grad_norm": 0.1926881000808301, "learning_rate": 6.527554989316898e-07, "loss": 0.0152, "step": 3512 }, { "epoch": 4.47658489964957, "grad_norm": 0.19318917002721483, "learning_rate": 6.495946977963874e-07, "loss": 0.0119, "step": 3513 }, { "epoch": 4.477859190825104, "grad_norm": 0.22680761459308257, "learning_rate": 6.464413110096601e-07, "loss": 0.0165, "step": 3514 }, { "epoch": 4.479133482000637, "grad_norm": 0.17736226596734933, "learning_rate": 6.432953410719678e-07, "loss": 0.0098, "step": 3515 }, { "epoch": 4.480407773176171, "grad_norm": 0.2019463332703258, "learning_rate": 6.401567904778894e-07, "loss": 0.011, "step": 3516 }, { "epoch": 4.481682064351705, "grad_norm": 0.20862926971547605, "learning_rate": 6.370256617161197e-07, "loss": 0.0118, "step": 3517 }, { "epoch": 4.482956355527238, "grad_norm": 0.17918581747214565, "learning_rate": 6.339019572694671e-07, "loss": 0.0102, "step": 3518 }, { "epoch": 4.4842306467027715, "grad_norm": 0.20286497551143737, "learning_rate": 6.307856796148581e-07, "loss": 0.0121, "step": 3519 }, { "epoch": 4.485504937878305, "grad_norm": 0.2177970931804103, "learning_rate": 6.276768312233228e-07, "loss": 0.0144, "step": 3520 }, { "epoch": 4.486779229053838, "grad_norm": 0.1840938280365912, "learning_rate": 6.245754145600091e-07, "loss": 0.0075, "step": 3521 }, { "epoch": 4.488053520229372, "grad_norm": 0.1932652148118395, "learning_rate": 6.214814320841611e-07, "loss": 0.01, "step": 3522 }, { "epoch": 4.489327811404906, "grad_norm": 0.19179407848372587, "learning_rate": 6.183948862491373e-07, "loss": 0.0106, "step": 3523 }, { "epoch": 4.490602102580439, "grad_norm": 0.1816676703023874, "learning_rate": 6.153157795023956e-07, "loss": 0.0087, "step": 3524 }, { "epoch": 4.491876393755973, "grad_norm": 0.19971817786602444, "learning_rate": 6.122441142854962e-07, "loss": 0.0128, "step": 3525 }, { "epoch": 4.493150684931507, "grad_norm": 0.16002243540975128, "learning_rate": 6.091798930340964e-07, "loss": 0.0086, "step": 3526 }, { "epoch": 4.49442497610704, "grad_norm": 0.1628644840675576, "learning_rate": 6.061231181779525e-07, "loss": 0.0092, "step": 3527 }, { "epoch": 4.495699267282574, "grad_norm": 0.19064052961959047, "learning_rate": 6.030737921409169e-07, "loss": 0.0087, "step": 3528 }, { "epoch": 4.496973558458108, "grad_norm": 0.20080834795904073, "learning_rate": 6.000319173409342e-07, "loss": 0.013, "step": 3529 }, { "epoch": 4.498247849633641, "grad_norm": 0.1962929584751617, "learning_rate": 5.969974961900382e-07, "loss": 0.0134, "step": 3530 }, { "epoch": 4.499522140809175, "grad_norm": 0.23860870844042506, "learning_rate": 5.939705310943622e-07, "loss": 0.0138, "step": 3531 }, { "epoch": 4.500796431984709, "grad_norm": 0.18248359036199782, "learning_rate": 5.909510244541128e-07, "loss": 0.0081, "step": 3532 }, { "epoch": 4.502070723160243, "grad_norm": 0.1936118711102676, "learning_rate": 5.879389786635958e-07, "loss": 0.0116, "step": 3533 }, { "epoch": 4.503345014335776, "grad_norm": 0.1976077639347187, "learning_rate": 5.849343961111897e-07, "loss": 0.0117, "step": 3534 }, { "epoch": 4.5046193055113095, "grad_norm": 0.19527599506444573, "learning_rate": 5.819372791793654e-07, "loss": 0.0111, "step": 3535 }, { "epoch": 4.5058935966868425, "grad_norm": 0.19283513697352717, "learning_rate": 5.789476302446662e-07, "loss": 0.0112, "step": 3536 }, { "epoch": 4.507167887862376, "grad_norm": 0.19568353723478976, "learning_rate": 5.759654516777169e-07, "loss": 0.0117, "step": 3537 }, { "epoch": 4.50844217903791, "grad_norm": 0.1847077971101268, "learning_rate": 5.729907458432193e-07, "loss": 0.0112, "step": 3538 }, { "epoch": 4.509716470213444, "grad_norm": 0.17110599368336385, "learning_rate": 5.700235150999478e-07, "loss": 0.0081, "step": 3539 }, { "epoch": 4.510990761388977, "grad_norm": 0.1851617142014938, "learning_rate": 5.670637618007502e-07, "loss": 0.0106, "step": 3540 }, { "epoch": 4.512265052564511, "grad_norm": 0.2124892916823035, "learning_rate": 5.64111488292547e-07, "loss": 0.0119, "step": 3541 }, { "epoch": 4.513539343740044, "grad_norm": 0.19284834184287783, "learning_rate": 5.611666969163243e-07, "loss": 0.0113, "step": 3542 }, { "epoch": 4.514813634915578, "grad_norm": 0.18611183559547398, "learning_rate": 5.582293900071389e-07, "loss": 0.0106, "step": 3543 }, { "epoch": 4.516087926091112, "grad_norm": 0.18262200515281188, "learning_rate": 5.552995698941088e-07, "loss": 0.0078, "step": 3544 }, { "epoch": 4.517362217266646, "grad_norm": 0.2024060761563803, "learning_rate": 5.523772389004178e-07, "loss": 0.0105, "step": 3545 }, { "epoch": 4.518636508442179, "grad_norm": 0.1845787793957702, "learning_rate": 5.494623993433124e-07, "loss": 0.0129, "step": 3546 }, { "epoch": 4.519910799617713, "grad_norm": 0.20556662320835634, "learning_rate": 5.465550535340968e-07, "loss": 0.0108, "step": 3547 }, { "epoch": 4.521185090793246, "grad_norm": 0.2132409799160858, "learning_rate": 5.43655203778134e-07, "loss": 0.0101, "step": 3548 }, { "epoch": 4.52245938196878, "grad_norm": 0.19785018125130083, "learning_rate": 5.407628523748398e-07, "loss": 0.012, "step": 3549 }, { "epoch": 4.523733673144314, "grad_norm": 0.20496773690034825, "learning_rate": 5.378780016176932e-07, "loss": 0.0086, "step": 3550 }, { "epoch": 4.5250079643198475, "grad_norm": 0.19119229182291458, "learning_rate": 5.350006537942121e-07, "loss": 0.0137, "step": 3551 }, { "epoch": 4.5262822554953805, "grad_norm": 0.2004419994889415, "learning_rate": 5.321308111859791e-07, "loss": 0.0123, "step": 3552 }, { "epoch": 4.5275565466709145, "grad_norm": 0.21166919063780873, "learning_rate": 5.29268476068614e-07, "loss": 0.011, "step": 3553 }, { "epoch": 4.5288308378464475, "grad_norm": 0.20929712296224545, "learning_rate": 5.264136507117911e-07, "loss": 0.011, "step": 3554 }, { "epoch": 4.530105129021981, "grad_norm": 0.1772865061303166, "learning_rate": 5.235663373792277e-07, "loss": 0.0084, "step": 3555 }, { "epoch": 4.531379420197515, "grad_norm": 0.18871305752231116, "learning_rate": 5.207265383286831e-07, "loss": 0.0087, "step": 3556 }, { "epoch": 4.532653711373049, "grad_norm": 0.1961304696790029, "learning_rate": 5.178942558119582e-07, "loss": 0.0147, "step": 3557 }, { "epoch": 4.533928002548582, "grad_norm": 0.20322860545627627, "learning_rate": 5.150694920748967e-07, "loss": 0.0129, "step": 3558 }, { "epoch": 4.535202293724116, "grad_norm": 0.18099410460640264, "learning_rate": 5.122522493573756e-07, "loss": 0.0099, "step": 3559 }, { "epoch": 4.536476584899649, "grad_norm": 0.17242783585830415, "learning_rate": 5.094425298933136e-07, "loss": 0.0086, "step": 3560 }, { "epoch": 4.537750876075183, "grad_norm": 0.19260700785642057, "learning_rate": 5.066403359106586e-07, "loss": 0.0122, "step": 3561 }, { "epoch": 4.539025167250717, "grad_norm": 0.19852425726332826, "learning_rate": 5.038456696313975e-07, "loss": 0.0136, "step": 3562 }, { "epoch": 4.540299458426251, "grad_norm": 0.19751101541276747, "learning_rate": 5.010585332715401e-07, "loss": 0.0098, "step": 3563 }, { "epoch": 4.541573749601784, "grad_norm": 0.19285195464994487, "learning_rate": 4.982789290411338e-07, "loss": 0.0118, "step": 3564 }, { "epoch": 4.542848040777318, "grad_norm": 0.21444058517336337, "learning_rate": 4.955068591442447e-07, "loss": 0.0138, "step": 3565 }, { "epoch": 4.544122331952851, "grad_norm": 0.1605195171428037, "learning_rate": 4.927423257789721e-07, "loss": 0.0081, "step": 3566 }, { "epoch": 4.545396623128385, "grad_norm": 0.1858647523249062, "learning_rate": 4.899853311374369e-07, "loss": 0.0112, "step": 3567 }, { "epoch": 4.546670914303919, "grad_norm": 0.19897258868770917, "learning_rate": 4.872358774057806e-07, "loss": 0.0113, "step": 3568 }, { "epoch": 4.5479452054794525, "grad_norm": 0.1722276990939248, "learning_rate": 4.844939667641669e-07, "loss": 0.0085, "step": 3569 }, { "epoch": 4.5492194966549855, "grad_norm": 0.20394157124445297, "learning_rate": 4.817596013867765e-07, "loss": 0.0116, "step": 3570 }, { "epoch": 4.550493787830519, "grad_norm": 0.16959711362653124, "learning_rate": 4.790327834418085e-07, "loss": 0.009, "step": 3571 }, { "epoch": 4.551768079006052, "grad_norm": 0.20441476920055218, "learning_rate": 4.763135150914777e-07, "loss": 0.0134, "step": 3572 }, { "epoch": 4.553042370181586, "grad_norm": 0.1979508800421983, "learning_rate": 4.736017984920127e-07, "loss": 0.0124, "step": 3573 }, { "epoch": 4.55431666135712, "grad_norm": 0.19730422363689423, "learning_rate": 4.708976357936512e-07, "loss": 0.0093, "step": 3574 }, { "epoch": 4.555590952532654, "grad_norm": 0.17668855082092905, "learning_rate": 4.6820102914064357e-07, "loss": 0.0122, "step": 3575 }, { "epoch": 4.556865243708187, "grad_norm": 0.21033140905098915, "learning_rate": 4.655119806712483e-07, "loss": 0.0157, "step": 3576 }, { "epoch": 4.558139534883721, "grad_norm": 0.19634492646904733, "learning_rate": 4.628304925177318e-07, "loss": 0.0128, "step": 3577 }, { "epoch": 4.559413826059254, "grad_norm": 0.1932997424333379, "learning_rate": 4.6015656680636234e-07, "loss": 0.012, "step": 3578 }, { "epoch": 4.560688117234788, "grad_norm": 0.18624308615422963, "learning_rate": 4.574902056574138e-07, "loss": 0.0097, "step": 3579 }, { "epoch": 4.561962408410322, "grad_norm": 0.19261273252829272, "learning_rate": 4.548314111851604e-07, "loss": 0.0119, "step": 3580 }, { "epoch": 4.563236699585856, "grad_norm": 0.1881050076369926, "learning_rate": 4.5218018549788247e-07, "loss": 0.0095, "step": 3581 }, { "epoch": 4.564510990761389, "grad_norm": 0.2029119332631572, "learning_rate": 4.495365306978472e-07, "loss": 0.0127, "step": 3582 }, { "epoch": 4.565785281936923, "grad_norm": 0.18376666451921186, "learning_rate": 4.4690044888133e-07, "loss": 0.0099, "step": 3583 }, { "epoch": 4.567059573112457, "grad_norm": 0.1797380371620065, "learning_rate": 4.4427194213859216e-07, "loss": 0.0084, "step": 3584 }, { "epoch": 4.56833386428799, "grad_norm": 0.17923252804503872, "learning_rate": 4.416510125538964e-07, "loss": 0.0092, "step": 3585 }, { "epoch": 4.5696081554635235, "grad_norm": 0.18087162338072127, "learning_rate": 4.390376622054915e-07, "loss": 0.0121, "step": 3586 }, { "epoch": 4.570882446639057, "grad_norm": 0.1978661920249358, "learning_rate": 4.3643189316561864e-07, "loss": 0.0093, "step": 3587 }, { "epoch": 4.57215673781459, "grad_norm": 0.1883480573628707, "learning_rate": 4.3383370750050723e-07, "loss": 0.0096, "step": 3588 }, { "epoch": 4.573431028990124, "grad_norm": 0.185036811686083, "learning_rate": 4.312431072703716e-07, "loss": 0.0094, "step": 3589 }, { "epoch": 4.574705320165658, "grad_norm": 0.19144427467783873, "learning_rate": 4.286600945294139e-07, "loss": 0.0079, "step": 3590 }, { "epoch": 4.575979611341191, "grad_norm": 0.1919367201393151, "learning_rate": 4.2608467132581934e-07, "loss": 0.0106, "step": 3591 }, { "epoch": 4.577253902516725, "grad_norm": 0.18900528469192032, "learning_rate": 4.235168397017542e-07, "loss": 0.0091, "step": 3592 }, { "epoch": 4.578528193692259, "grad_norm": 0.20372064310259705, "learning_rate": 4.2095660169336527e-07, "loss": 0.0134, "step": 3593 }, { "epoch": 4.579802484867792, "grad_norm": 0.17410219081667944, "learning_rate": 4.184039593307776e-07, "loss": 0.0074, "step": 3594 }, { "epoch": 4.581076776043326, "grad_norm": 0.18999336671703945, "learning_rate": 4.158589146380954e-07, "loss": 0.0091, "step": 3595 }, { "epoch": 4.58235106721886, "grad_norm": 0.19538568041967633, "learning_rate": 4.133214696333943e-07, "loss": 0.0129, "step": 3596 }, { "epoch": 4.583625358394393, "grad_norm": 0.20130249565109112, "learning_rate": 4.1079162632872926e-07, "loss": 0.0134, "step": 3597 }, { "epoch": 4.584899649569927, "grad_norm": 0.20601988538425592, "learning_rate": 4.082693867301224e-07, "loss": 0.0096, "step": 3598 }, { "epoch": 4.586173940745461, "grad_norm": 0.20227928182448654, "learning_rate": 4.0575475283756717e-07, "loss": 0.011, "step": 3599 }, { "epoch": 4.587448231920994, "grad_norm": 0.19028392760304025, "learning_rate": 4.03247726645033e-07, "loss": 0.0097, "step": 3600 }, { "epoch": 4.588722523096528, "grad_norm": 0.17156746398123385, "learning_rate": 4.0074831014044637e-07, "loss": 0.0084, "step": 3601 }, { "epoch": 4.5899968142720615, "grad_norm": 0.18423716843342774, "learning_rate": 3.982565053057086e-07, "loss": 0.0095, "step": 3602 }, { "epoch": 4.5912711054475945, "grad_norm": 0.21220495102609555, "learning_rate": 3.9577231411667803e-07, "loss": 0.0107, "step": 3603 }, { "epoch": 4.592545396623128, "grad_norm": 0.1985757175211246, "learning_rate": 3.9329573854318127e-07, "loss": 0.0111, "step": 3604 }, { "epoch": 4.593819687798662, "grad_norm": 0.16813058418136317, "learning_rate": 3.908267805490051e-07, "loss": 0.0084, "step": 3605 }, { "epoch": 4.595093978974195, "grad_norm": 0.18046752839087943, "learning_rate": 3.8836544209189366e-07, "loss": 0.0091, "step": 3606 }, { "epoch": 4.596368270149729, "grad_norm": 0.188305267892646, "learning_rate": 3.8591172512355114e-07, "loss": 0.0102, "step": 3607 }, { "epoch": 4.597642561325263, "grad_norm": 0.1702988325553907, "learning_rate": 3.834656315896379e-07, "loss": 0.008, "step": 3608 }, { "epoch": 4.598916852500796, "grad_norm": 0.17938286202807102, "learning_rate": 3.810271634297691e-07, "loss": 0.0097, "step": 3609 }, { "epoch": 4.60019114367633, "grad_norm": 0.20486757168640102, "learning_rate": 3.7859632257751465e-07, "loss": 0.0125, "step": 3610 }, { "epoch": 4.601465434851864, "grad_norm": 0.1949868564586066, "learning_rate": 3.7617311096039276e-07, "loss": 0.0114, "step": 3611 }, { "epoch": 4.602739726027397, "grad_norm": 0.20032358069883152, "learning_rate": 3.7375753049987974e-07, "loss": 0.011, "step": 3612 }, { "epoch": 4.604014017202931, "grad_norm": 0.1922728716011469, "learning_rate": 3.7134958311139115e-07, "loss": 0.0106, "step": 3613 }, { "epoch": 4.605288308378465, "grad_norm": 0.20257691209843742, "learning_rate": 3.6894927070429744e-07, "loss": 0.0104, "step": 3614 }, { "epoch": 4.606562599553998, "grad_norm": 0.20701528488060492, "learning_rate": 3.665565951819083e-07, "loss": 0.0108, "step": 3615 }, { "epoch": 4.607836890729532, "grad_norm": 0.18971965505649488, "learning_rate": 3.641715584414862e-07, "loss": 0.0093, "step": 3616 }, { "epoch": 4.609111181905066, "grad_norm": 0.1990742252533431, "learning_rate": 3.617941623742305e-07, "loss": 0.0118, "step": 3617 }, { "epoch": 4.610385473080599, "grad_norm": 0.21135819894856755, "learning_rate": 3.594244088652821e-07, "loss": 0.0103, "step": 3618 }, { "epoch": 4.6116597642561326, "grad_norm": 0.21849761606862064, "learning_rate": 3.570622997937234e-07, "loss": 0.017, "step": 3619 }, { "epoch": 4.6129340554316665, "grad_norm": 0.20189403637504746, "learning_rate": 3.5470783703257627e-07, "loss": 0.0087, "step": 3620 }, { "epoch": 4.6142083466071995, "grad_norm": 0.20950767351941318, "learning_rate": 3.5236102244879613e-07, "loss": 0.0124, "step": 3621 }, { "epoch": 4.615482637782733, "grad_norm": 0.1901472300207998, "learning_rate": 3.500218579032766e-07, "loss": 0.0087, "step": 3622 }, { "epoch": 4.616756928958267, "grad_norm": 0.19386519993109824, "learning_rate": 3.476903452508451e-07, "loss": 0.0107, "step": 3623 }, { "epoch": 4.6180312201338, "grad_norm": 0.19416235574260737, "learning_rate": 3.453664863402595e-07, "loss": 0.0148, "step": 3624 }, { "epoch": 4.619305511309334, "grad_norm": 0.18306346150857275, "learning_rate": 3.4305028301421015e-07, "loss": 0.0084, "step": 3625 }, { "epoch": 4.620579802484868, "grad_norm": 0.1960761050697681, "learning_rate": 3.4074173710931804e-07, "loss": 0.0115, "step": 3626 }, { "epoch": 4.621854093660401, "grad_norm": 0.17789531023511332, "learning_rate": 3.3844085045612985e-07, "loss": 0.0108, "step": 3627 }, { "epoch": 4.623128384835935, "grad_norm": 0.19166913894551035, "learning_rate": 3.361476248791196e-07, "loss": 0.0119, "step": 3628 }, { "epoch": 4.624402676011469, "grad_norm": 0.18956752209218913, "learning_rate": 3.3386206219668816e-07, "loss": 0.0095, "step": 3629 }, { "epoch": 4.625676967187002, "grad_norm": 0.20049792680653827, "learning_rate": 3.3158416422115705e-07, "loss": 0.0144, "step": 3630 }, { "epoch": 4.626951258362536, "grad_norm": 0.18826979125742235, "learning_rate": 3.2931393275877577e-07, "loss": 0.0107, "step": 3631 }, { "epoch": 4.62822554953807, "grad_norm": 0.18462177568433227, "learning_rate": 3.2705136960970554e-07, "loss": 0.0077, "step": 3632 }, { "epoch": 4.629499840713603, "grad_norm": 0.20893526399117265, "learning_rate": 3.247964765680389e-07, "loss": 0.0117, "step": 3633 }, { "epoch": 4.630774131889137, "grad_norm": 0.1689167425376152, "learning_rate": 3.2254925542177353e-07, "loss": 0.0095, "step": 3634 }, { "epoch": 4.632048423064671, "grad_norm": 0.20839603568790382, "learning_rate": 3.203097079528339e-07, "loss": 0.0141, "step": 3635 }, { "epoch": 4.633322714240204, "grad_norm": 0.187067476672125, "learning_rate": 3.180778359370551e-07, "loss": 0.0112, "step": 3636 }, { "epoch": 4.6345970054157375, "grad_norm": 0.20570569954124834, "learning_rate": 3.15853641144187e-07, "loss": 0.0122, "step": 3637 }, { "epoch": 4.635871296591271, "grad_norm": 0.2047448977918382, "learning_rate": 3.1363712533789224e-07, "loss": 0.0165, "step": 3638 }, { "epoch": 4.637145587766804, "grad_norm": 0.20514719400655665, "learning_rate": 3.114282902757426e-07, "loss": 0.0118, "step": 3639 }, { "epoch": 4.638419878942338, "grad_norm": 0.1987534139300179, "learning_rate": 3.0922713770922155e-07, "loss": 0.0142, "step": 3640 }, { "epoch": 4.639694170117872, "grad_norm": 0.19966739708825149, "learning_rate": 3.0703366938371946e-07, "loss": 0.013, "step": 3641 }, { "epoch": 4.640968461293405, "grad_norm": 0.186626851581732, "learning_rate": 3.0484788703853295e-07, "loss": 0.0143, "step": 3642 }, { "epoch": 4.642242752468939, "grad_norm": 0.19605148865648236, "learning_rate": 3.0266979240687e-07, "loss": 0.009, "step": 3643 }, { "epoch": 4.643517043644473, "grad_norm": 0.18148356976796784, "learning_rate": 3.0049938721583127e-07, "loss": 0.0106, "step": 3644 }, { "epoch": 4.644791334820006, "grad_norm": 0.22008856665488538, "learning_rate": 2.9833667318643345e-07, "loss": 0.0126, "step": 3645 }, { "epoch": 4.64606562599554, "grad_norm": 0.2107935869203905, "learning_rate": 2.961816520335814e-07, "loss": 0.0107, "step": 3646 }, { "epoch": 4.647339917171074, "grad_norm": 0.18344391903176754, "learning_rate": 2.940343254660905e-07, "loss": 0.0093, "step": 3647 }, { "epoch": 4.648614208346607, "grad_norm": 0.20620937342655649, "learning_rate": 2.918946951866697e-07, "loss": 0.0153, "step": 3648 }, { "epoch": 4.649888499522141, "grad_norm": 0.2024239233431427, "learning_rate": 2.897627628919253e-07, "loss": 0.0127, "step": 3649 }, { "epoch": 4.651162790697675, "grad_norm": 0.1799240702424615, "learning_rate": 2.8763853027236277e-07, "loss": 0.0088, "step": 3650 }, { "epoch": 4.652437081873208, "grad_norm": 0.18357157765206517, "learning_rate": 2.85521999012377e-07, "loss": 0.009, "step": 3651 }, { "epoch": 4.653711373048742, "grad_norm": 0.21142213724474448, "learning_rate": 2.8341317079025986e-07, "loss": 0.0123, "step": 3652 }, { "epoch": 4.6549856642242755, "grad_norm": 0.18636916355979946, "learning_rate": 2.8131204727819493e-07, "loss": 0.0118, "step": 3653 }, { "epoch": 4.6562599553998085, "grad_norm": 0.18503221827067298, "learning_rate": 2.7921863014225504e-07, "loss": 0.0098, "step": 3654 }, { "epoch": 4.657534246575342, "grad_norm": 0.15949527306391564, "learning_rate": 2.771329210424023e-07, "loss": 0.0077, "step": 3655 }, { "epoch": 4.658808537750876, "grad_norm": 0.18844871053677822, "learning_rate": 2.750549216324894e-07, "loss": 0.0134, "step": 3656 }, { "epoch": 4.660082828926409, "grad_norm": 0.17751971091305987, "learning_rate": 2.729846335602515e-07, "loss": 0.0089, "step": 3657 }, { "epoch": 4.661357120101943, "grad_norm": 0.2031741626466744, "learning_rate": 2.7092205846731337e-07, "loss": 0.0117, "step": 3658 }, { "epoch": 4.662631411277477, "grad_norm": 0.20456279569922062, "learning_rate": 2.6886719798917995e-07, "loss": 0.0152, "step": 3659 }, { "epoch": 4.66390570245301, "grad_norm": 0.19741816649599975, "learning_rate": 2.6682005375524124e-07, "loss": 0.0138, "step": 3660 }, { "epoch": 4.665179993628544, "grad_norm": 0.17537769700048814, "learning_rate": 2.6478062738876654e-07, "loss": 0.0092, "step": 3661 }, { "epoch": 4.666454284804078, "grad_norm": 0.18155993401870316, "learning_rate": 2.6274892050691115e-07, "loss": 0.0079, "step": 3662 }, { "epoch": 4.667728575979611, "grad_norm": 0.1815426028200693, "learning_rate": 2.607249347207008e-07, "loss": 0.0085, "step": 3663 }, { "epoch": 4.669002867155145, "grad_norm": 0.2184516902680329, "learning_rate": 2.587086716350473e-07, "loss": 0.0097, "step": 3664 }, { "epoch": 4.670277158330679, "grad_norm": 0.17371771957721052, "learning_rate": 2.567001328487284e-07, "loss": 0.0086, "step": 3665 }, { "epoch": 4.671551449506212, "grad_norm": 0.2072601323756521, "learning_rate": 2.54699319954409e-07, "loss": 0.0161, "step": 3666 }, { "epoch": 4.672825740681746, "grad_norm": 0.19421382894548062, "learning_rate": 2.5270623453861887e-07, "loss": 0.012, "step": 3667 }, { "epoch": 4.67410003185728, "grad_norm": 0.18120451056615833, "learning_rate": 2.507208781817638e-07, "loss": 0.01, "step": 3668 }, { "epoch": 4.675374323032813, "grad_norm": 0.20727795231641927, "learning_rate": 2.4874325245812124e-07, "loss": 0.0122, "step": 3669 }, { "epoch": 4.6766486142083465, "grad_norm": 0.20712692543158814, "learning_rate": 2.4677335893583677e-07, "loss": 0.0103, "step": 3670 }, { "epoch": 4.6779229053838804, "grad_norm": 0.19450404489627285, "learning_rate": 2.448111991769253e-07, "loss": 0.0106, "step": 3671 }, { "epoch": 4.6791971965594135, "grad_norm": 0.19081244476581669, "learning_rate": 2.4285677473727123e-07, "loss": 0.0116, "step": 3672 }, { "epoch": 4.680471487734947, "grad_norm": 0.21500711178209167, "learning_rate": 2.4091008716662255e-07, "loss": 0.0098, "step": 3673 }, { "epoch": 4.681745778910481, "grad_norm": 0.1729483170408439, "learning_rate": 2.389711380085957e-07, "loss": 0.008, "step": 3674 }, { "epoch": 4.683020070086014, "grad_norm": 0.18170711953716617, "learning_rate": 2.370399288006664e-07, "loss": 0.0096, "step": 3675 }, { "epoch": 4.684294361261548, "grad_norm": 0.18460759310902192, "learning_rate": 2.351164610741774e-07, "loss": 0.0096, "step": 3676 }, { "epoch": 4.685568652437082, "grad_norm": 0.1949748887835691, "learning_rate": 2.3320073635432984e-07, "loss": 0.0099, "step": 3677 }, { "epoch": 4.686842943612615, "grad_norm": 0.21191771196621023, "learning_rate": 2.3129275616018742e-07, "loss": 0.0117, "step": 3678 }, { "epoch": 4.688117234788149, "grad_norm": 0.17234716287946805, "learning_rate": 2.293925220046711e-07, "loss": 0.0084, "step": 3679 }, { "epoch": 4.689391525963683, "grad_norm": 0.14724120253190226, "learning_rate": 2.2750003539456e-07, "loss": 0.006, "step": 3680 }, { "epoch": 4.690665817139216, "grad_norm": 0.204281934471982, "learning_rate": 2.256152978304904e-07, "loss": 0.0109, "step": 3681 }, { "epoch": 4.69194010831475, "grad_norm": 0.19505114298043125, "learning_rate": 2.2373831080695463e-07, "loss": 0.0123, "step": 3682 }, { "epoch": 4.693214399490284, "grad_norm": 0.20203617721721384, "learning_rate": 2.2186907581229766e-07, "loss": 0.0127, "step": 3683 }, { "epoch": 4.694488690665817, "grad_norm": 0.17213447300328957, "learning_rate": 2.2000759432871832e-07, "loss": 0.0092, "step": 3684 }, { "epoch": 4.695762981841351, "grad_norm": 0.2018551591594235, "learning_rate": 2.181538678322681e-07, "loss": 0.0139, "step": 3685 }, { "epoch": 4.697037273016885, "grad_norm": 0.17526826643520854, "learning_rate": 2.1630789779284677e-07, "loss": 0.0092, "step": 3686 }, { "epoch": 4.698311564192418, "grad_norm": 0.19439822758509087, "learning_rate": 2.144696856742079e-07, "loss": 0.0092, "step": 3687 }, { "epoch": 4.6995858553679515, "grad_norm": 0.17495278215542082, "learning_rate": 2.1263923293394774e-07, "loss": 0.009, "step": 3688 }, { "epoch": 4.700860146543485, "grad_norm": 0.22424770687169368, "learning_rate": 2.1081654102351634e-07, "loss": 0.014, "step": 3689 }, { "epoch": 4.702134437719019, "grad_norm": 0.19780284997491482, "learning_rate": 2.090016113882043e-07, "loss": 0.0121, "step": 3690 }, { "epoch": 4.703408728894552, "grad_norm": 0.17477092614350143, "learning_rate": 2.0719444546714928e-07, "loss": 0.0086, "step": 3691 }, { "epoch": 4.704683020070086, "grad_norm": 0.17352784096546434, "learning_rate": 2.0539504469333283e-07, "loss": 0.008, "step": 3692 }, { "epoch": 4.705957311245619, "grad_norm": 0.16456671052895977, "learning_rate": 2.0360341049358135e-07, "loss": 0.0085, "step": 3693 }, { "epoch": 4.707231602421153, "grad_norm": 0.20177898698300617, "learning_rate": 2.0181954428855731e-07, "loss": 0.0118, "step": 3694 }, { "epoch": 4.708505893596687, "grad_norm": 0.16692688390699986, "learning_rate": 2.0004344749277038e-07, "loss": 0.008, "step": 3695 }, { "epoch": 4.709780184772221, "grad_norm": 0.18046062861057005, "learning_rate": 1.9827512151456175e-07, "loss": 0.0104, "step": 3696 }, { "epoch": 4.711054475947754, "grad_norm": 0.20917894542713625, "learning_rate": 1.9651456775611865e-07, "loss": 0.0131, "step": 3697 }, { "epoch": 4.712328767123288, "grad_norm": 0.17964921224016048, "learning_rate": 1.9476178761346e-07, "loss": 0.009, "step": 3698 }, { "epoch": 4.713603058298821, "grad_norm": 0.20387424068882373, "learning_rate": 1.9301678247644395e-07, "loss": 0.0083, "step": 3699 }, { "epoch": 4.714877349474355, "grad_norm": 0.20043940910020883, "learning_rate": 1.9127955372876038e-07, "loss": 0.0107, "step": 3700 }, { "epoch": 4.716151640649889, "grad_norm": 0.1885546102067649, "learning_rate": 1.8955010274793517e-07, "loss": 0.0096, "step": 3701 }, { "epoch": 4.717425931825423, "grad_norm": 0.2049972051791591, "learning_rate": 1.8782843090532687e-07, "loss": 0.0114, "step": 3702 }, { "epoch": 4.718700223000956, "grad_norm": 0.18981002017994825, "learning_rate": 1.8611453956612346e-07, "loss": 0.0134, "step": 3703 }, { "epoch": 4.7199745141764895, "grad_norm": 0.1771445016407032, "learning_rate": 1.844084300893456e-07, "loss": 0.0084, "step": 3704 }, { "epoch": 4.7212488053520225, "grad_norm": 0.18790296463410588, "learning_rate": 1.827101038278456e-07, "loss": 0.0095, "step": 3705 }, { "epoch": 4.722523096527556, "grad_norm": 0.176418908609575, "learning_rate": 1.8101956212829507e-07, "loss": 0.0089, "step": 3706 }, { "epoch": 4.72379738770309, "grad_norm": 0.20104389436661388, "learning_rate": 1.793368063312051e-07, "loss": 0.0106, "step": 3707 }, { "epoch": 4.725071678878624, "grad_norm": 0.18591671235362967, "learning_rate": 1.7766183777090272e-07, "loss": 0.0094, "step": 3708 }, { "epoch": 4.726345970054157, "grad_norm": 0.18607295580533484, "learning_rate": 1.7599465777554668e-07, "loss": 0.0092, "step": 3709 }, { "epoch": 4.727620261229691, "grad_norm": 0.16899871689748772, "learning_rate": 1.7433526766711727e-07, "loss": 0.0088, "step": 3710 }, { "epoch": 4.728894552405224, "grad_norm": 0.19676232388355971, "learning_rate": 1.726836687614175e-07, "loss": 0.0086, "step": 3711 }, { "epoch": 4.730168843580758, "grad_norm": 0.18961407079857004, "learning_rate": 1.7103986236807312e-07, "loss": 0.0097, "step": 3712 }, { "epoch": 4.731443134756292, "grad_norm": 0.17396220899366732, "learning_rate": 1.6940384979053038e-07, "loss": 0.0084, "step": 3713 }, { "epoch": 4.732717425931826, "grad_norm": 0.1849019199251691, "learning_rate": 1.677756323260571e-07, "loss": 0.0117, "step": 3714 }, { "epoch": 4.733991717107359, "grad_norm": 0.19869683252677064, "learning_rate": 1.661552112657361e-07, "loss": 0.0115, "step": 3715 }, { "epoch": 4.735266008282893, "grad_norm": 0.1909835338124714, "learning_rate": 1.6454258789447286e-07, "loss": 0.011, "step": 3716 }, { "epoch": 4.736540299458426, "grad_norm": 0.19958296733439926, "learning_rate": 1.629377634909868e-07, "loss": 0.0129, "step": 3717 }, { "epoch": 4.73781459063396, "grad_norm": 0.1781267073214237, "learning_rate": 1.6134073932781435e-07, "loss": 0.0119, "step": 3718 }, { "epoch": 4.739088881809494, "grad_norm": 0.1771550320938683, "learning_rate": 1.5975151667130596e-07, "loss": 0.0074, "step": 3719 }, { "epoch": 4.7403631729850275, "grad_norm": 0.17716358321995285, "learning_rate": 1.5817009678162686e-07, "loss": 0.0088, "step": 3720 }, { "epoch": 4.7416374641605605, "grad_norm": 0.19136526937893114, "learning_rate": 1.5659648091275402e-07, "loss": 0.0084, "step": 3721 }, { "epoch": 4.742911755336094, "grad_norm": 0.18981251029278262, "learning_rate": 1.55030670312476e-07, "loss": 0.0101, "step": 3722 }, { "epoch": 4.7441860465116275, "grad_norm": 0.18606514429520402, "learning_rate": 1.5347266622239397e-07, "loss": 0.0121, "step": 3723 }, { "epoch": 4.745460337687161, "grad_norm": 0.20422461426891722, "learning_rate": 1.519224698779198e-07, "loss": 0.0113, "step": 3724 }, { "epoch": 4.746734628862695, "grad_norm": 0.19323565145931443, "learning_rate": 1.503800825082691e-07, "loss": 0.0095, "step": 3725 }, { "epoch": 4.748008920038229, "grad_norm": 0.18541220995472069, "learning_rate": 1.4884550533647236e-07, "loss": 0.0088, "step": 3726 }, { "epoch": 4.749283211213762, "grad_norm": 0.17095614866045972, "learning_rate": 1.4731873957936072e-07, "loss": 0.0093, "step": 3727 }, { "epoch": 4.750557502389296, "grad_norm": 0.17648383265313694, "learning_rate": 1.4579978644757463e-07, "loss": 0.0095, "step": 3728 }, { "epoch": 4.751831793564829, "grad_norm": 0.19269102258090862, "learning_rate": 1.4428864714555956e-07, "loss": 0.0128, "step": 3729 }, { "epoch": 4.753106084740363, "grad_norm": 0.1891759010035877, "learning_rate": 1.4278532287156476e-07, "loss": 0.011, "step": 3730 }, { "epoch": 4.754380375915897, "grad_norm": 0.18155310318844445, "learning_rate": 1.4128981481764115e-07, "loss": 0.0103, "step": 3731 }, { "epoch": 4.755654667091431, "grad_norm": 0.19350185132356737, "learning_rate": 1.3980212416964455e-07, "loss": 0.0089, "step": 3732 }, { "epoch": 4.756928958266964, "grad_norm": 0.17833396817722877, "learning_rate": 1.3832225210722916e-07, "loss": 0.0127, "step": 3733 }, { "epoch": 4.758203249442498, "grad_norm": 0.2000234477114519, "learning_rate": 1.3685019980385072e-07, "loss": 0.0099, "step": 3734 }, { "epoch": 4.759477540618031, "grad_norm": 0.2034528881444478, "learning_rate": 1.3538596842676554e-07, "loss": 0.0103, "step": 3735 }, { "epoch": 4.760751831793565, "grad_norm": 0.1924195377199194, "learning_rate": 1.339295591370271e-07, "loss": 0.0116, "step": 3736 }, { "epoch": 4.7620261229690986, "grad_norm": 0.1973385800108825, "learning_rate": 1.3248097308948494e-07, "loss": 0.0088, "step": 3737 }, { "epoch": 4.7633004141446325, "grad_norm": 0.18396470310876675, "learning_rate": 1.3104021143278911e-07, "loss": 0.0095, "step": 3738 }, { "epoch": 4.7645747053201655, "grad_norm": 0.22194177893993658, "learning_rate": 1.296072753093802e-07, "loss": 0.0122, "step": 3739 }, { "epoch": 4.765848996495699, "grad_norm": 0.18876007709763315, "learning_rate": 1.2818216585549824e-07, "loss": 0.0101, "step": 3740 }, { "epoch": 4.767123287671232, "grad_norm": 0.1875760212421269, "learning_rate": 1.2676488420117595e-07, "loss": 0.0104, "step": 3741 }, { "epoch": 4.768397578846766, "grad_norm": 0.20181005791455503, "learning_rate": 1.2535543147023765e-07, "loss": 0.0098, "step": 3742 }, { "epoch": 4.7696718700223, "grad_norm": 0.1802147806413008, "learning_rate": 1.239538087803005e-07, "loss": 0.0104, "step": 3743 }, { "epoch": 4.770946161197834, "grad_norm": 0.18121473159439117, "learning_rate": 1.225600172427732e-07, "loss": 0.0085, "step": 3744 }, { "epoch": 4.772220452373367, "grad_norm": 0.18882977940129886, "learning_rate": 1.2117405796285286e-07, "loss": 0.0084, "step": 3745 }, { "epoch": 4.773494743548901, "grad_norm": 0.1921617682291612, "learning_rate": 1.1979593203953033e-07, "loss": 0.011, "step": 3746 }, { "epoch": 4.774769034724434, "grad_norm": 0.20533493140102677, "learning_rate": 1.1842564056558148e-07, "loss": 0.01, "step": 3747 }, { "epoch": 4.776043325899968, "grad_norm": 0.17145662015296181, "learning_rate": 1.1706318462756938e-07, "loss": 0.0096, "step": 3748 }, { "epoch": 4.777317617075502, "grad_norm": 0.18823682025234356, "learning_rate": 1.1570856530584761e-07, "loss": 0.0107, "step": 3749 }, { "epoch": 4.778591908251036, "grad_norm": 0.1937428436506992, "learning_rate": 1.1436178367455143e-07, "loss": 0.0131, "step": 3750 }, { "epoch": 4.779866199426569, "grad_norm": 0.21629163020712752, "learning_rate": 1.1302284080160541e-07, "loss": 0.0128, "step": 3751 }, { "epoch": 4.781140490602103, "grad_norm": 0.16883078281563207, "learning_rate": 1.1169173774871478e-07, "loss": 0.0094, "step": 3752 }, { "epoch": 4.782414781777637, "grad_norm": 0.20831648384772936, "learning_rate": 1.1036847557137076e-07, "loss": 0.0128, "step": 3753 }, { "epoch": 4.78368907295317, "grad_norm": 0.17218815088729983, "learning_rate": 1.0905305531884514e-07, "loss": 0.0091, "step": 3754 }, { "epoch": 4.7849633641287035, "grad_norm": 0.19821145112567565, "learning_rate": 1.0774547803419466e-07, "loss": 0.0155, "step": 3755 }, { "epoch": 4.786237655304237, "grad_norm": 0.18839081070768535, "learning_rate": 1.0644574475425328e-07, "loss": 0.0085, "step": 3756 }, { "epoch": 4.78751194647977, "grad_norm": 0.18777184445535006, "learning_rate": 1.0515385650963772e-07, "loss": 0.0117, "step": 3757 }, { "epoch": 4.788786237655304, "grad_norm": 0.17929769743542123, "learning_rate": 1.0386981432474075e-07, "loss": 0.0073, "step": 3758 }, { "epoch": 4.790060528830838, "grad_norm": 0.18611096670791477, "learning_rate": 1.0259361921774014e-07, "loss": 0.0111, "step": 3759 }, { "epoch": 4.791334820006371, "grad_norm": 0.2069952368631301, "learning_rate": 1.0132527220058419e-07, "loss": 0.0104, "step": 3760 }, { "epoch": 4.792609111181905, "grad_norm": 0.21464873942144044, "learning_rate": 1.0006477427900285e-07, "loss": 0.013, "step": 3761 }, { "epoch": 4.793883402357439, "grad_norm": 0.17478749220399814, "learning_rate": 9.881212645249882e-08, "loss": 0.0107, "step": 3762 }, { "epoch": 4.795157693532972, "grad_norm": 0.1719680152065389, "learning_rate": 9.756732971435312e-08, "loss": 0.0083, "step": 3763 }, { "epoch": 4.796431984708506, "grad_norm": 0.1954869440497987, "learning_rate": 9.633038505161951e-08, "loss": 0.0102, "step": 3764 }, { "epoch": 4.79770627588404, "grad_norm": 0.18772942652382363, "learning_rate": 9.510129344512564e-08, "loss": 0.0104, "step": 3765 }, { "epoch": 4.798980567059573, "grad_norm": 0.18034940801454305, "learning_rate": 9.388005586947191e-08, "loss": 0.0118, "step": 3766 }, { "epoch": 4.800254858235107, "grad_norm": 0.18693807889634467, "learning_rate": 9.26666732930348e-08, "loss": 0.0106, "step": 3767 }, { "epoch": 4.801529149410641, "grad_norm": 0.19068263599090848, "learning_rate": 9.146114667795358e-08, "loss": 0.0103, "step": 3768 }, { "epoch": 4.802803440586174, "grad_norm": 0.22071577067921222, "learning_rate": 9.026347698014804e-08, "loss": 0.0134, "step": 3769 }, { "epoch": 4.804077731761708, "grad_norm": 0.18886765645724243, "learning_rate": 8.907366514930071e-08, "loss": 0.0104, "step": 3770 }, { "epoch": 4.8053520229372415, "grad_norm": 0.1635873177760676, "learning_rate": 8.789171212886582e-08, "loss": 0.0076, "step": 3771 }, { "epoch": 4.8066263141127745, "grad_norm": 0.19507445312297933, "learning_rate": 8.67176188560681e-08, "loss": 0.0103, "step": 3772 }, { "epoch": 4.807900605288308, "grad_norm": 0.19081746191004625, "learning_rate": 8.555138626189619e-08, "loss": 0.0114, "step": 3773 }, { "epoch": 4.809174896463842, "grad_norm": 0.19779060363017723, "learning_rate": 8.4393015271107e-08, "loss": 0.0105, "step": 3774 }, { "epoch": 4.810449187639375, "grad_norm": 0.1852197546816506, "learning_rate": 8.324250680222467e-08, "loss": 0.0095, "step": 3775 }, { "epoch": 4.811723478814909, "grad_norm": 0.21190999858949364, "learning_rate": 8.209986176753947e-08, "loss": 0.0118, "step": 3776 }, { "epoch": 4.812997769990443, "grad_norm": 0.18859628414131807, "learning_rate": 8.096508107310219e-08, "loss": 0.0128, "step": 3777 }, { "epoch": 4.814272061165976, "grad_norm": 0.1971780277539641, "learning_rate": 7.983816561873192e-08, "loss": 0.0108, "step": 3778 }, { "epoch": 4.81554635234151, "grad_norm": 0.1926168820019115, "learning_rate": 7.871911629801055e-08, "loss": 0.0115, "step": 3779 }, { "epoch": 4.816820643517044, "grad_norm": 0.19919264793937821, "learning_rate": 7.760793399827937e-08, "loss": 0.0134, "step": 3780 }, { "epoch": 4.818094934692577, "grad_norm": 0.1838648453428462, "learning_rate": 7.650461960064581e-08, "loss": 0.0105, "step": 3781 }, { "epoch": 4.819369225868111, "grad_norm": 0.1900130098340755, "learning_rate": 7.54091739799756e-08, "loss": 0.0097, "step": 3782 }, { "epoch": 4.820643517043645, "grad_norm": 0.1789858083289931, "learning_rate": 7.432159800489613e-08, "loss": 0.009, "step": 3783 }, { "epoch": 4.821917808219178, "grad_norm": 0.20050161987700046, "learning_rate": 7.324189253779312e-08, "loss": 0.0114, "step": 3784 }, { "epoch": 4.823192099394712, "grad_norm": 0.19213265279906094, "learning_rate": 7.217005843481506e-08, "loss": 0.0128, "step": 3785 }, { "epoch": 4.824466390570246, "grad_norm": 0.19652481875456146, "learning_rate": 7.110609654586431e-08, "loss": 0.0126, "step": 3786 }, { "epoch": 4.825740681745779, "grad_norm": 0.18546325937453942, "learning_rate": 7.00500077146038e-08, "loss": 0.0094, "step": 3787 }, { "epoch": 4.8270149729213125, "grad_norm": 0.19272255522460852, "learning_rate": 6.900179277845476e-08, "loss": 0.0083, "step": 3788 }, { "epoch": 4.8282892640968464, "grad_norm": 0.20823733655960205, "learning_rate": 6.79614525685901e-08, "loss": 0.0123, "step": 3789 }, { "epoch": 4.8295635552723795, "grad_norm": 0.21141937184307896, "learning_rate": 6.692898790994217e-08, "loss": 0.0126, "step": 3790 }, { "epoch": 4.830837846447913, "grad_norm": 0.1646418215175428, "learning_rate": 6.590439962119943e-08, "loss": 0.0085, "step": 3791 }, { "epoch": 4.832112137623447, "grad_norm": 0.18400615987203506, "learning_rate": 6.488768851480087e-08, "loss": 0.0087, "step": 3792 }, { "epoch": 4.83338642879898, "grad_norm": 0.18433232067691205, "learning_rate": 6.387885539694384e-08, "loss": 0.0099, "step": 3793 }, { "epoch": 4.834660719974514, "grad_norm": 0.19762812317264483, "learning_rate": 6.287790106757396e-08, "loss": 0.0122, "step": 3794 }, { "epoch": 4.835935011150048, "grad_norm": 0.19021968253946578, "learning_rate": 6.188482632039416e-08, "loss": 0.0114, "step": 3795 }, { "epoch": 4.837209302325581, "grad_norm": 0.16121694385496912, "learning_rate": 6.089963194285675e-08, "loss": 0.0093, "step": 3796 }, { "epoch": 4.838483593501115, "grad_norm": 0.19910761963449253, "learning_rate": 5.992231871616461e-08, "loss": 0.0116, "step": 3797 }, { "epoch": 4.839757884676649, "grad_norm": 0.1986890659651501, "learning_rate": 5.89528874152745e-08, "loss": 0.0113, "step": 3798 }, { "epoch": 4.841032175852182, "grad_norm": 0.1789161261291322, "learning_rate": 5.7991338808889295e-08, "loss": 0.0093, "step": 3799 }, { "epoch": 4.842306467027716, "grad_norm": 0.20126447716420787, "learning_rate": 5.7037673659464664e-08, "loss": 0.0141, "step": 3800 }, { "epoch": 4.84358075820325, "grad_norm": 0.19195090241398743, "learning_rate": 5.609189272320237e-08, "loss": 0.009, "step": 3801 }, { "epoch": 4.844855049378783, "grad_norm": 0.20238390451265106, "learning_rate": 5.5153996750054726e-08, "loss": 0.011, "step": 3802 }, { "epoch": 4.846129340554317, "grad_norm": 0.22552970700914102, "learning_rate": 5.422398648372129e-08, "loss": 0.0127, "step": 3803 }, { "epoch": 4.847403631729851, "grad_norm": 0.18229795404940044, "learning_rate": 5.330186266164661e-08, "loss": 0.0083, "step": 3804 }, { "epoch": 4.848677922905384, "grad_norm": 0.17129174375608436, "learning_rate": 5.238762601502578e-08, "loss": 0.0071, "step": 3805 }, { "epoch": 4.8499522140809175, "grad_norm": 0.21182157346877228, "learning_rate": 5.1481277268794486e-08, "loss": 0.0107, "step": 3806 }, { "epoch": 4.851226505256451, "grad_norm": 0.2141433393893251, "learning_rate": 5.058281714163893e-08, "loss": 0.0093, "step": 3807 }, { "epoch": 4.852500796431984, "grad_norm": 0.19496353906424682, "learning_rate": 4.9692246345985905e-08, "loss": 0.0128, "step": 3808 }, { "epoch": 4.853775087607518, "grad_norm": 0.18467226425842878, "learning_rate": 4.880956558800942e-08, "loss": 0.0137, "step": 3809 }, { "epoch": 4.855049378783052, "grad_norm": 0.16592674213465086, "learning_rate": 4.793477556762627e-08, "loss": 0.0092, "step": 3810 }, { "epoch": 4.856323669958585, "grad_norm": 0.19254722247632142, "learning_rate": 4.706787697849491e-08, "loss": 0.0091, "step": 3811 }, { "epoch": 4.857597961134119, "grad_norm": 0.1911129858565266, "learning_rate": 4.6208870508017703e-08, "loss": 0.0109, "step": 3812 }, { "epoch": 4.858872252309653, "grad_norm": 0.18943500283415585, "learning_rate": 4.5357756837339783e-08, "loss": 0.0092, "step": 3813 }, { "epoch": 4.860146543485186, "grad_norm": 0.18622064193535864, "learning_rate": 4.451453664134575e-08, "loss": 0.0109, "step": 3814 }, { "epoch": 4.86142083466072, "grad_norm": 0.18702172151323654, "learning_rate": 4.367921058866187e-08, "loss": 0.0091, "step": 3815 }, { "epoch": 4.862695125836254, "grad_norm": 0.2078537022455947, "learning_rate": 4.2851779341654966e-08, "loss": 0.0127, "step": 3816 }, { "epoch": 4.863969417011787, "grad_norm": 0.18852132143515915, "learning_rate": 4.2032243556433536e-08, "loss": 0.009, "step": 3817 }, { "epoch": 4.865243708187321, "grad_norm": 0.19941731020678735, "learning_rate": 4.1220603882841104e-08, "loss": 0.0091, "step": 3818 }, { "epoch": 4.866517999362855, "grad_norm": 0.22282324838696096, "learning_rate": 4.0416860964465065e-08, "loss": 0.0154, "step": 3819 }, { "epoch": 4.867792290538388, "grad_norm": 0.2203302385318901, "learning_rate": 3.962101543862562e-08, "loss": 0.0139, "step": 3820 }, { "epoch": 4.869066581713922, "grad_norm": 0.1855344787999168, "learning_rate": 3.8833067936386856e-08, "loss": 0.0096, "step": 3821 }, { "epoch": 4.8703408728894555, "grad_norm": 0.1624932149228731, "learning_rate": 3.805301908254455e-08, "loss": 0.008, "step": 3822 }, { "epoch": 4.8716151640649885, "grad_norm": 0.21972126091973287, "learning_rate": 3.728086949563503e-08, "loss": 0.0149, "step": 3823 }, { "epoch": 4.872889455240522, "grad_norm": 0.19757713273558103, "learning_rate": 3.651661978793075e-08, "loss": 0.014, "step": 3824 }, { "epoch": 4.874163746416056, "grad_norm": 0.19739358616438843, "learning_rate": 3.576027056543696e-08, "loss": 0.01, "step": 3825 }, { "epoch": 4.875438037591589, "grad_norm": 0.17060419939199636, "learning_rate": 3.501182242789725e-08, "loss": 0.0098, "step": 3826 }, { "epoch": 4.876712328767123, "grad_norm": 0.18405834534313437, "learning_rate": 3.42712759687891e-08, "loss": 0.01, "step": 3827 }, { "epoch": 4.877986619942657, "grad_norm": 0.1708665374550421, "learning_rate": 3.3538631775325016e-08, "loss": 0.0076, "step": 3828 }, { "epoch": 4.87926091111819, "grad_norm": 0.20203428057601053, "learning_rate": 3.281389042844918e-08, "loss": 0.0109, "step": 3829 }, { "epoch": 4.880535202293724, "grad_norm": 0.17752909708609385, "learning_rate": 3.2097052502843005e-08, "loss": 0.0083, "step": 3830 }, { "epoch": 4.881809493469258, "grad_norm": 0.19985017025656782, "learning_rate": 3.138811856691848e-08, "loss": 0.0098, "step": 3831 }, { "epoch": 4.883083784644791, "grad_norm": 0.18486118395449352, "learning_rate": 3.0687089182819264e-08, "loss": 0.0096, "step": 3832 }, { "epoch": 4.884358075820325, "grad_norm": 0.22604262508336617, "learning_rate": 2.9993964906422926e-08, "loss": 0.0139, "step": 3833 }, { "epoch": 4.885632366995859, "grad_norm": 0.1643209620875799, "learning_rate": 2.9308746287339817e-08, "loss": 0.0081, "step": 3834 }, { "epoch": 4.886906658171392, "grad_norm": 0.18553470741640551, "learning_rate": 2.8631433868907542e-08, "loss": 0.009, "step": 3835 }, { "epoch": 4.888180949346926, "grad_norm": 0.18466208895949382, "learning_rate": 2.796202818819871e-08, "loss": 0.0088, "step": 3836 }, { "epoch": 4.88945524052246, "grad_norm": 0.18886319827635026, "learning_rate": 2.7300529776014273e-08, "loss": 0.0108, "step": 3837 }, { "epoch": 4.890729531697993, "grad_norm": 0.19730479202440088, "learning_rate": 2.6646939156884654e-08, "loss": 0.0131, "step": 3838 }, { "epoch": 4.8920038228735265, "grad_norm": 0.19439994256480203, "learning_rate": 2.6001256849071955e-08, "loss": 0.0102, "step": 3839 }, { "epoch": 4.89327811404906, "grad_norm": 0.1657422226168508, "learning_rate": 2.536348336456551e-08, "loss": 0.0086, "step": 3840 }, { "epoch": 4.8945524052245934, "grad_norm": 0.20999988816927112, "learning_rate": 2.4733619209084127e-08, "loss": 0.0133, "step": 3841 }, { "epoch": 4.895826696400127, "grad_norm": 0.19639344946743906, "learning_rate": 2.4111664882073837e-08, "loss": 0.0132, "step": 3842 }, { "epoch": 4.897100987575661, "grad_norm": 0.18364828429575794, "learning_rate": 2.349762087671126e-08, "loss": 0.0093, "step": 3843 }, { "epoch": 4.898375278751194, "grad_norm": 0.17417566635384196, "learning_rate": 2.2891487679898017e-08, "loss": 0.0079, "step": 3844 }, { "epoch": 4.899649569926728, "grad_norm": 0.17544188302231167, "learning_rate": 2.2293265772265205e-08, "loss": 0.0084, "step": 3845 }, { "epoch": 4.900923861102262, "grad_norm": 0.18548409659916906, "learning_rate": 2.1702955628166712e-08, "loss": 0.0106, "step": 3846 }, { "epoch": 4.902198152277795, "grad_norm": 0.20658942409297415, "learning_rate": 2.1120557715688106e-08, "loss": 0.0126, "step": 3847 }, { "epoch": 4.903472443453329, "grad_norm": 0.17775249000565507, "learning_rate": 2.054607249663665e-08, "loss": 0.0089, "step": 3848 }, { "epoch": 4.904746734628863, "grad_norm": 0.19866953573097368, "learning_rate": 1.997950042654795e-08, "loss": 0.0114, "step": 3849 }, { "epoch": 4.906021025804396, "grad_norm": 0.1830625115854694, "learning_rate": 1.9420841954681525e-08, "loss": 0.0096, "step": 3850 }, { "epoch": 4.90729531697993, "grad_norm": 0.18142755740178906, "learning_rate": 1.8870097524021913e-08, "loss": 0.0114, "step": 3851 }, { "epoch": 4.908569608155464, "grad_norm": 0.19231175972084338, "learning_rate": 1.8327267571279785e-08, "loss": 0.0101, "step": 3852 }, { "epoch": 4.909843899330998, "grad_norm": 0.1932945917011852, "learning_rate": 1.77923525268886e-08, "loss": 0.012, "step": 3853 }, { "epoch": 4.911118190506531, "grad_norm": 0.18080701746281982, "learning_rate": 1.7265352815004632e-08, "loss": 0.0101, "step": 3854 }, { "epoch": 4.9123924816820645, "grad_norm": 0.18557254756654684, "learning_rate": 1.6746268853511382e-08, "loss": 0.0101, "step": 3855 }, { "epoch": 4.913666772857598, "grad_norm": 0.16659208593550032, "learning_rate": 1.6235101054011825e-08, "loss": 0.0081, "step": 3856 }, { "epoch": 4.9149410640331315, "grad_norm": 0.19852734113570863, "learning_rate": 1.5731849821833955e-08, "loss": 0.0136, "step": 3857 }, { "epoch": 4.916215355208665, "grad_norm": 0.20949424378594358, "learning_rate": 1.5236515556028565e-08, "loss": 0.0105, "step": 3858 }, { "epoch": 4.917489646384199, "grad_norm": 0.16842980007665498, "learning_rate": 1.4749098649367023e-08, "loss": 0.0085, "step": 3859 }, { "epoch": 4.918763937559732, "grad_norm": 0.1898072923173807, "learning_rate": 1.42695994883435e-08, "loss": 0.0102, "step": 3860 }, { "epoch": 4.920038228735266, "grad_norm": 0.19662727381840472, "learning_rate": 1.3798018453176076e-08, "loss": 0.0131, "step": 3861 }, { "epoch": 4.921312519910799, "grad_norm": 0.1859137156206403, "learning_rate": 1.333435591779897e-08, "loss": 0.009, "step": 3862 }, { "epoch": 4.922586811086333, "grad_norm": 0.1963990992075691, "learning_rate": 1.2878612249872524e-08, "loss": 0.0087, "step": 3863 }, { "epoch": 4.923861102261867, "grad_norm": 0.1860706932290528, "learning_rate": 1.2430787810776556e-08, "loss": 0.0122, "step": 3864 }, { "epoch": 4.925135393437401, "grad_norm": 0.19216528315224035, "learning_rate": 1.199088295560924e-08, "loss": 0.0121, "step": 3865 }, { "epoch": 4.926409684612934, "grad_norm": 0.20869690009319733, "learning_rate": 1.1558898033191545e-08, "loss": 0.0127, "step": 3866 }, { "epoch": 4.927683975788468, "grad_norm": 0.1839644432647148, "learning_rate": 1.1134833386062804e-08, "loss": 0.0102, "step": 3867 }, { "epoch": 4.928958266964001, "grad_norm": 0.22125582866723506, "learning_rate": 1.0718689350484036e-08, "loss": 0.0128, "step": 3868 }, { "epoch": 4.930232558139535, "grad_norm": 0.20115277567536063, "learning_rate": 1.0310466256432395e-08, "loss": 0.0125, "step": 3869 }, { "epoch": 4.931506849315069, "grad_norm": 0.19167344244622098, "learning_rate": 9.910164427605618e-09, "loss": 0.0127, "step": 3870 }, { "epoch": 4.932781140490603, "grad_norm": 0.19223932796762533, "learning_rate": 9.517784181422018e-09, "loss": 0.0103, "step": 3871 }, { "epoch": 4.934055431666136, "grad_norm": 0.19850225683421965, "learning_rate": 9.13332582901716e-09, "loss": 0.0117, "step": 3872 }, { "epoch": 4.9353297228416695, "grad_norm": 0.19437053546442448, "learning_rate": 8.756789675244959e-09, "loss": 0.012, "step": 3873 }, { "epoch": 4.9366040140172025, "grad_norm": 0.17034097871128112, "learning_rate": 8.388176018677695e-09, "loss": 0.0087, "step": 3874 }, { "epoch": 4.937878305192736, "grad_norm": 0.17219985748673686, "learning_rate": 8.027485151603787e-09, "loss": 0.0078, "step": 3875 }, { "epoch": 4.93915259636827, "grad_norm": 0.17180102408711717, "learning_rate": 7.674717360032224e-09, "loss": 0.0093, "step": 3876 }, { "epoch": 4.940426887543804, "grad_norm": 0.1888662523743926, "learning_rate": 7.329872923689252e-09, "loss": 0.0113, "step": 3877 }, { "epoch": 4.941701178719337, "grad_norm": 0.1620844782608641, "learning_rate": 6.992952116013918e-09, "loss": 0.009, "step": 3878 }, { "epoch": 4.942975469894871, "grad_norm": 0.18502534320487865, "learning_rate": 6.663955204169181e-09, "loss": 0.0085, "step": 3879 }, { "epoch": 4.944249761070404, "grad_norm": 0.20141777496365854, "learning_rate": 6.342882449029697e-09, "loss": 0.0121, "step": 3880 }, { "epoch": 4.945524052245938, "grad_norm": 0.19250127061950384, "learning_rate": 6.029734105187368e-09, "loss": 0.0107, "step": 3881 }, { "epoch": 4.946798343421472, "grad_norm": 0.18030579953562298, "learning_rate": 5.724510420952456e-09, "loss": 0.0083, "step": 3882 }, { "epoch": 4.948072634597006, "grad_norm": 0.18182133285492322, "learning_rate": 5.42721163835025e-09, "loss": 0.0125, "step": 3883 }, { "epoch": 4.949346925772539, "grad_norm": 0.20506965106441147, "learning_rate": 5.137837993121064e-09, "loss": 0.0089, "step": 3884 }, { "epoch": 4.950621216948073, "grad_norm": 0.1875109513186134, "learning_rate": 4.856389714723575e-09, "loss": 0.0099, "step": 3885 }, { "epoch": 4.951895508123606, "grad_norm": 0.20117835164213288, "learning_rate": 4.582867026329263e-09, "loss": 0.0115, "step": 3886 }, { "epoch": 4.95316979929914, "grad_norm": 0.1973379001998969, "learning_rate": 4.317270144826857e-09, "loss": 0.0082, "step": 3887 }, { "epoch": 4.954444090474674, "grad_norm": 0.20424194328523923, "learning_rate": 4.059599280819004e-09, "loss": 0.0126, "step": 3888 }, { "epoch": 4.9557183816502075, "grad_norm": 0.17584388723348163, "learning_rate": 3.809854638625599e-09, "loss": 0.0075, "step": 3889 }, { "epoch": 4.9569926728257405, "grad_norm": 0.19068827562409504, "learning_rate": 3.5680364162793414e-09, "loss": 0.0115, "step": 3890 }, { "epoch": 4.958266964001274, "grad_norm": 0.18413075273772372, "learning_rate": 3.3341448055290714e-09, "loss": 0.0089, "step": 3891 }, { "epoch": 4.959541255176807, "grad_norm": 0.21766337821080078, "learning_rate": 3.1081799918375454e-09, "loss": 0.0114, "step": 3892 }, { "epoch": 4.960815546352341, "grad_norm": 0.18766034659323425, "learning_rate": 2.8901421543814367e-09, "loss": 0.0085, "step": 3893 }, { "epoch": 4.962089837527875, "grad_norm": 0.20124214027600182, "learning_rate": 2.6800314660535565e-09, "loss": 0.0096, "step": 3894 }, { "epoch": 4.963364128703409, "grad_norm": 0.17907112660016508, "learning_rate": 2.477848093460633e-09, "loss": 0.0081, "step": 3895 }, { "epoch": 4.964638419878942, "grad_norm": 0.17898037934865804, "learning_rate": 2.2835921969210917e-09, "loss": 0.0088, "step": 3896 }, { "epoch": 4.965912711054476, "grad_norm": 0.17738754960096167, "learning_rate": 2.097263930469495e-09, "loss": 0.0089, "step": 3897 }, { "epoch": 4.967187002230009, "grad_norm": 0.19802717500570088, "learning_rate": 1.9188634418532135e-09, "loss": 0.012, "step": 3898 }, { "epoch": 4.968461293405543, "grad_norm": 0.18359495900488898, "learning_rate": 1.7483908725357546e-09, "loss": 0.0094, "step": 3899 }, { "epoch": 4.969735584581077, "grad_norm": 0.1998870290954808, "learning_rate": 1.585846357691212e-09, "loss": 0.0128, "step": 3900 }, { "epoch": 4.971009875756611, "grad_norm": 0.20104853412595217, "learning_rate": 1.4312300262075973e-09, "loss": 0.0122, "step": 3901 }, { "epoch": 4.972284166932144, "grad_norm": 0.18175890487058763, "learning_rate": 1.2845420006879494e-09, "loss": 0.0085, "step": 3902 }, { "epoch": 4.973558458107678, "grad_norm": 0.20517126441523653, "learning_rate": 1.145782397447004e-09, "loss": 0.0121, "step": 3903 }, { "epoch": 4.974832749283211, "grad_norm": 0.18882090756546308, "learning_rate": 1.014951326514524e-09, "loss": 0.0098, "step": 3904 }, { "epoch": 4.976107040458745, "grad_norm": 0.19028623227817368, "learning_rate": 8.920488916308589e-10, "loss": 0.0086, "step": 3905 }, { "epoch": 4.9773813316342785, "grad_norm": 0.18254115634845028, "learning_rate": 7.770751902513862e-10, "loss": 0.0102, "step": 3906 }, { "epoch": 4.978655622809812, "grad_norm": 0.19359396578154914, "learning_rate": 6.700303135442898e-10, "loss": 0.0115, "step": 3907 }, { "epoch": 4.9799299139853455, "grad_norm": 0.19239596233190312, "learning_rate": 5.709143463894506e-10, "loss": 0.0111, "step": 3908 }, { "epoch": 4.981204205160879, "grad_norm": 0.20618458242148183, "learning_rate": 4.797273673806669e-10, "loss": 0.0151, "step": 3909 }, { "epoch": 4.982478496336412, "grad_norm": 0.21411522111336087, "learning_rate": 3.964694488234333e-10, "loss": 0.0156, "step": 3910 }, { "epoch": 4.983752787511946, "grad_norm": 0.19027295846781983, "learning_rate": 3.2114065673827245e-10, "loss": 0.0091, "step": 3911 }, { "epoch": 4.98502707868748, "grad_norm": 0.18147080259036358, "learning_rate": 2.5374105085518297e-10, "loss": 0.0087, "step": 3912 }, { "epoch": 4.986301369863014, "grad_norm": 0.16161284059987058, "learning_rate": 1.9427068461808086e-10, "loss": 0.0074, "step": 3913 }, { "epoch": 4.987575661038547, "grad_norm": 0.19658502905398526, "learning_rate": 1.427296051847993e-10, "loss": 0.0122, "step": 3914 }, { "epoch": 4.988849952214081, "grad_norm": 0.1776925274819164, "learning_rate": 9.911785342375802e-11, "loss": 0.0085, "step": 3915 }, { "epoch": 4.990124243389615, "grad_norm": 0.18876960757943956, "learning_rate": 6.343546391618383e-11, "loss": 0.0111, "step": 3916 }, { "epoch": 4.991398534565148, "grad_norm": 0.22297819097048308, "learning_rate": 3.568246495833094e-11, "loss": 0.0175, "step": 3917 }, { "epoch": 4.992672825740682, "grad_norm": 0.2270070421808709, "learning_rate": 1.585887855481971e-11, "loss": 0.0176, "step": 3918 }, { "epoch": 4.993947116916216, "grad_norm": 0.19225748199279552, "learning_rate": 3.964720424187718e-12, "loss": 0.0096, "step": 3919 }, { "epoch": 4.995221408091749, "grad_norm": 0.18398968710510352, "learning_rate": 0.0, "loss": 0.0088, "step": 3920 }, { "epoch": 4.995221408091749, "step": 3920, "total_flos": 1.6189087775068488e+18, "train_loss": 0.1054512033990955, "train_runtime": 29662.4831, "train_samples_per_second": 16.929, "train_steps_per_second": 0.132 } ], "logging_steps": 1.0, "max_steps": 3920, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6189087775068488e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }