{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.18814675446848542, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006271558482282847, "grad_norm": 1.1241575479507446, "learning_rate": 0.0001, "loss": 2.371, "step": 1 }, { "epoch": 0.0012543116964565694, "grad_norm": 1.0141234397888184, "learning_rate": 9.966555183946489e-05, "loss": 2.1534, "step": 2 }, { "epoch": 0.0018814675446848542, "grad_norm": 0.9427400231361389, "learning_rate": 9.933110367892977e-05, "loss": 2.3618, "step": 3 }, { "epoch": 0.002508623392913139, "grad_norm": 0.9919387698173523, "learning_rate": 9.899665551839465e-05, "loss": 2.1973, "step": 4 }, { "epoch": 0.0031357792411414237, "grad_norm": 1.042110562324524, "learning_rate": 9.866220735785953e-05, "loss": 2.0364, "step": 5 }, { "epoch": 0.0037629350893697085, "grad_norm": 1.1066715717315674, "learning_rate": 9.832775919732441e-05, "loss": 1.7375, "step": 6 }, { "epoch": 0.004390090937597993, "grad_norm": 1.1894862651824951, "learning_rate": 9.799331103678931e-05, "loss": 1.6476, "step": 7 }, { "epoch": 0.005017246785826278, "grad_norm": 0.9607447385787964, "learning_rate": 9.765886287625419e-05, "loss": 1.4399, "step": 8 }, { "epoch": 0.005644402634054563, "grad_norm": 0.8756276369094849, "learning_rate": 9.732441471571907e-05, "loss": 1.4069, "step": 9 }, { "epoch": 0.006271558482282847, "grad_norm": 0.8891203999519348, "learning_rate": 9.698996655518396e-05, "loss": 1.3944, "step": 10 }, { "epoch": 0.006898714330511132, "grad_norm": 0.9099848866462708, "learning_rate": 9.665551839464884e-05, "loss": 1.2704, "step": 11 }, { "epoch": 0.007525870178739417, "grad_norm": 0.6798804998397827, "learning_rate": 9.632107023411372e-05, "loss": 1.2606, "step": 12 }, { "epoch": 0.008153026026967701, "grad_norm": 0.4697795808315277, "learning_rate": 9.59866220735786e-05, "loss": 1.2039, "step": 13 }, { "epoch": 0.008780181875195987, "grad_norm": 0.49036362767219543, "learning_rate": 9.565217391304348e-05, "loss": 1.2249, "step": 14 }, { "epoch": 0.00940733772342427, "grad_norm": 0.4770255982875824, "learning_rate": 9.531772575250837e-05, "loss": 1.1787, "step": 15 }, { "epoch": 0.010034493571652555, "grad_norm": 0.4494694769382477, "learning_rate": 9.498327759197325e-05, "loss": 1.2239, "step": 16 }, { "epoch": 0.01066164941988084, "grad_norm": 0.4780626595020294, "learning_rate": 9.464882943143813e-05, "loss": 1.1149, "step": 17 }, { "epoch": 0.011288805268109126, "grad_norm": 0.4681190550327301, "learning_rate": 9.431438127090302e-05, "loss": 1.1354, "step": 18 }, { "epoch": 0.01191596111633741, "grad_norm": 0.49637556076049805, "learning_rate": 9.39799331103679e-05, "loss": 1.1951, "step": 19 }, { "epoch": 0.012543116964565695, "grad_norm": 0.5190874338150024, "learning_rate": 9.364548494983279e-05, "loss": 1.1399, "step": 20 }, { "epoch": 0.01317027281279398, "grad_norm": 0.5862295627593994, "learning_rate": 9.331103678929767e-05, "loss": 1.1189, "step": 21 }, { "epoch": 0.013797428661022263, "grad_norm": 0.5344688892364502, "learning_rate": 9.297658862876255e-05, "loss": 1.1553, "step": 22 }, { "epoch": 0.014424584509250549, "grad_norm": 0.5440303683280945, "learning_rate": 9.264214046822743e-05, "loss": 1.2029, "step": 23 }, { "epoch": 0.015051740357478834, "grad_norm": 0.48431307077407837, "learning_rate": 9.230769230769232e-05, "loss": 1.1198, "step": 24 }, { "epoch": 0.01567889620570712, "grad_norm": 0.5911933779716492, "learning_rate": 9.19732441471572e-05, "loss": 1.1882, "step": 25 }, { "epoch": 0.016306052053935403, "grad_norm": 0.6006789803504944, "learning_rate": 9.163879598662207e-05, "loss": 1.0898, "step": 26 }, { "epoch": 0.016933207902163686, "grad_norm": 0.6420937180519104, "learning_rate": 9.130434782608696e-05, "loss": 1.1844, "step": 27 }, { "epoch": 0.017560363750391973, "grad_norm": 0.7237846851348877, "learning_rate": 9.096989966555184e-05, "loss": 1.1115, "step": 28 }, { "epoch": 0.018187519598620257, "grad_norm": 0.7362813353538513, "learning_rate": 9.063545150501673e-05, "loss": 1.1492, "step": 29 }, { "epoch": 0.01881467544684854, "grad_norm": 0.7279103398323059, "learning_rate": 9.030100334448161e-05, "loss": 1.0738, "step": 30 }, { "epoch": 0.019441831295076827, "grad_norm": 0.7517884969711304, "learning_rate": 8.996655518394649e-05, "loss": 1.0702, "step": 31 }, { "epoch": 0.02006898714330511, "grad_norm": 0.803814172744751, "learning_rate": 8.963210702341137e-05, "loss": 0.9605, "step": 32 }, { "epoch": 0.020696142991533398, "grad_norm": 0.7744300961494446, "learning_rate": 8.929765886287625e-05, "loss": 1.0486, "step": 33 }, { "epoch": 0.02132329883976168, "grad_norm": 0.8029720783233643, "learning_rate": 8.896321070234114e-05, "loss": 1.0673, "step": 34 }, { "epoch": 0.021950454687989965, "grad_norm": 0.7909408211708069, "learning_rate": 8.862876254180602e-05, "loss": 1.1001, "step": 35 }, { "epoch": 0.022577610536218252, "grad_norm": 0.6341575980186462, "learning_rate": 8.82943143812709e-05, "loss": 1.1613, "step": 36 }, { "epoch": 0.023204766384446535, "grad_norm": 0.4735874831676483, "learning_rate": 8.795986622073578e-05, "loss": 0.9744, "step": 37 }, { "epoch": 0.02383192223267482, "grad_norm": 0.36456331610679626, "learning_rate": 8.762541806020068e-05, "loss": 1.0535, "step": 38 }, { "epoch": 0.024459078080903106, "grad_norm": 0.43026307225227356, "learning_rate": 8.729096989966556e-05, "loss": 1.0974, "step": 39 }, { "epoch": 0.02508623392913139, "grad_norm": 0.4285936653614044, "learning_rate": 8.695652173913044e-05, "loss": 1.2176, "step": 40 }, { "epoch": 0.025713389777359673, "grad_norm": 0.33779293298721313, "learning_rate": 8.662207357859532e-05, "loss": 1.0243, "step": 41 }, { "epoch": 0.02634054562558796, "grad_norm": 0.3825758099555969, "learning_rate": 8.62876254180602e-05, "loss": 1.1127, "step": 42 }, { "epoch": 0.026967701473816243, "grad_norm": 0.38178229331970215, "learning_rate": 8.595317725752509e-05, "loss": 1.058, "step": 43 }, { "epoch": 0.027594857322044527, "grad_norm": 0.3562932312488556, "learning_rate": 8.561872909698997e-05, "loss": 1.0776, "step": 44 }, { "epoch": 0.028222013170272814, "grad_norm": 0.41771742701530457, "learning_rate": 8.528428093645485e-05, "loss": 1.1033, "step": 45 }, { "epoch": 0.028849169018501097, "grad_norm": 0.40144434571266174, "learning_rate": 8.494983277591973e-05, "loss": 1.0842, "step": 46 }, { "epoch": 0.02947632486672938, "grad_norm": 0.4194628894329071, "learning_rate": 8.461538461538461e-05, "loss": 1.0986, "step": 47 }, { "epoch": 0.030103480714957668, "grad_norm": 0.3551357388496399, "learning_rate": 8.42809364548495e-05, "loss": 0.9806, "step": 48 }, { "epoch": 0.03073063656318595, "grad_norm": 0.4674742519855499, "learning_rate": 8.394648829431439e-05, "loss": 1.1232, "step": 49 }, { "epoch": 0.03135779241141424, "grad_norm": 0.3798663020133972, "learning_rate": 8.361204013377927e-05, "loss": 1.0847, "step": 50 }, { "epoch": 0.03198494825964252, "grad_norm": 0.37322738766670227, "learning_rate": 8.327759197324416e-05, "loss": 1.0732, "step": 51 }, { "epoch": 0.032612104107870805, "grad_norm": 0.39893415570259094, "learning_rate": 8.294314381270904e-05, "loss": 1.0823, "step": 52 }, { "epoch": 0.03323925995609909, "grad_norm": 0.4456530809402466, "learning_rate": 8.260869565217392e-05, "loss": 1.0817, "step": 53 }, { "epoch": 0.03386641580432737, "grad_norm": 0.37753430008888245, "learning_rate": 8.22742474916388e-05, "loss": 1.0824, "step": 54 }, { "epoch": 0.03449357165255566, "grad_norm": 0.3684673607349396, "learning_rate": 8.193979933110368e-05, "loss": 1.0696, "step": 55 }, { "epoch": 0.035120727500783946, "grad_norm": 0.3490143120288849, "learning_rate": 8.160535117056857e-05, "loss": 1.0141, "step": 56 }, { "epoch": 0.03574788334901223, "grad_norm": 0.3897489309310913, "learning_rate": 8.127090301003345e-05, "loss": 1.0179, "step": 57 }, { "epoch": 0.036375039197240513, "grad_norm": 0.38840609788894653, "learning_rate": 8.093645484949833e-05, "loss": 1.0654, "step": 58 }, { "epoch": 0.0370021950454688, "grad_norm": 0.4170911908149719, "learning_rate": 8.060200668896321e-05, "loss": 1.1377, "step": 59 }, { "epoch": 0.03762935089369708, "grad_norm": 0.45000165700912476, "learning_rate": 8.026755852842809e-05, "loss": 1.091, "step": 60 }, { "epoch": 0.03825650674192537, "grad_norm": 0.36387529969215393, "learning_rate": 7.993311036789299e-05, "loss": 1.0408, "step": 61 }, { "epoch": 0.038883662590153655, "grad_norm": 0.40481963753700256, "learning_rate": 7.959866220735787e-05, "loss": 1.0383, "step": 62 }, { "epoch": 0.03951081843838194, "grad_norm": 0.4196191728115082, "learning_rate": 7.926421404682275e-05, "loss": 1.0607, "step": 63 }, { "epoch": 0.04013797428661022, "grad_norm": 0.36332547664642334, "learning_rate": 7.892976588628763e-05, "loss": 1.0907, "step": 64 }, { "epoch": 0.040765130134838505, "grad_norm": 0.3924827575683594, "learning_rate": 7.859531772575252e-05, "loss": 1.1283, "step": 65 }, { "epoch": 0.041392285983066796, "grad_norm": 0.42891165614128113, "learning_rate": 7.82608695652174e-05, "loss": 1.0302, "step": 66 }, { "epoch": 0.04201944183129508, "grad_norm": 0.3694935441017151, "learning_rate": 7.792642140468228e-05, "loss": 1.0455, "step": 67 }, { "epoch": 0.04264659767952336, "grad_norm": 0.36729368567466736, "learning_rate": 7.759197324414716e-05, "loss": 0.9672, "step": 68 }, { "epoch": 0.043273753527751646, "grad_norm": 0.4106704890727997, "learning_rate": 7.725752508361204e-05, "loss": 1.0755, "step": 69 }, { "epoch": 0.04390090937597993, "grad_norm": 0.3694314956665039, "learning_rate": 7.692307692307693e-05, "loss": 1.0127, "step": 70 }, { "epoch": 0.04452806522420821, "grad_norm": 0.40102723240852356, "learning_rate": 7.658862876254181e-05, "loss": 1.0281, "step": 71 }, { "epoch": 0.045155221072436504, "grad_norm": 0.481668621301651, "learning_rate": 7.62541806020067e-05, "loss": 1.0825, "step": 72 }, { "epoch": 0.04578237692066479, "grad_norm": 0.4356762170791626, "learning_rate": 7.591973244147159e-05, "loss": 1.0739, "step": 73 }, { "epoch": 0.04640953276889307, "grad_norm": 0.3791458010673523, "learning_rate": 7.558528428093647e-05, "loss": 1.1232, "step": 74 }, { "epoch": 0.047036688617121354, "grad_norm": 0.4018148183822632, "learning_rate": 7.525083612040135e-05, "loss": 1.1061, "step": 75 }, { "epoch": 0.04766384446534964, "grad_norm": 0.35003310441970825, "learning_rate": 7.491638795986622e-05, "loss": 1.0553, "step": 76 }, { "epoch": 0.04829100031357792, "grad_norm": 0.3296242952346802, "learning_rate": 7.45819397993311e-05, "loss": 0.9636, "step": 77 }, { "epoch": 0.04891815616180621, "grad_norm": 0.3522872030735016, "learning_rate": 7.424749163879598e-05, "loss": 1.029, "step": 78 }, { "epoch": 0.049545312010034495, "grad_norm": 0.3733522295951843, "learning_rate": 7.391304347826086e-05, "loss": 1.0913, "step": 79 }, { "epoch": 0.05017246785826278, "grad_norm": 0.38543638586997986, "learning_rate": 7.357859531772575e-05, "loss": 1.0734, "step": 80 }, { "epoch": 0.05079962370649106, "grad_norm": 0.36167678236961365, "learning_rate": 7.324414715719064e-05, "loss": 1.07, "step": 81 }, { "epoch": 0.051426779554719346, "grad_norm": 0.4997354745864868, "learning_rate": 7.290969899665552e-05, "loss": 1.1243, "step": 82 }, { "epoch": 0.05205393540294763, "grad_norm": 0.39237678050994873, "learning_rate": 7.25752508361204e-05, "loss": 1.0661, "step": 83 }, { "epoch": 0.05268109125117592, "grad_norm": 0.47913259267807007, "learning_rate": 7.224080267558529e-05, "loss": 1.029, "step": 84 }, { "epoch": 0.0533082470994042, "grad_norm": 0.4255577325820923, "learning_rate": 7.190635451505017e-05, "loss": 1.134, "step": 85 }, { "epoch": 0.05393540294763249, "grad_norm": 0.398740291595459, "learning_rate": 7.157190635451505e-05, "loss": 1.0946, "step": 86 }, { "epoch": 0.05456255879586077, "grad_norm": 0.44168323278427124, "learning_rate": 7.123745819397993e-05, "loss": 1.1035, "step": 87 }, { "epoch": 0.055189714644089054, "grad_norm": 0.3680216372013092, "learning_rate": 7.090301003344481e-05, "loss": 1.0278, "step": 88 }, { "epoch": 0.055816870492317344, "grad_norm": 0.37274274230003357, "learning_rate": 7.05685618729097e-05, "loss": 1.0752, "step": 89 }, { "epoch": 0.05644402634054563, "grad_norm": 0.3773200213909149, "learning_rate": 7.023411371237458e-05, "loss": 1.0778, "step": 90 }, { "epoch": 0.05707118218877391, "grad_norm": 0.3545572757720947, "learning_rate": 6.989966555183946e-05, "loss": 1.0186, "step": 91 }, { "epoch": 0.057698338037002195, "grad_norm": 0.39364176988601685, "learning_rate": 6.956521739130436e-05, "loss": 1.0964, "step": 92 }, { "epoch": 0.05832549388523048, "grad_norm": 0.36303600668907166, "learning_rate": 6.923076923076924e-05, "loss": 1.0115, "step": 93 }, { "epoch": 0.05895264973345876, "grad_norm": 0.4130772352218628, "learning_rate": 6.889632107023412e-05, "loss": 1.0067, "step": 94 }, { "epoch": 0.05957980558168705, "grad_norm": 0.32856303453445435, "learning_rate": 6.8561872909699e-05, "loss": 0.9777, "step": 95 }, { "epoch": 0.060206961429915336, "grad_norm": 0.3589972257614136, "learning_rate": 6.822742474916388e-05, "loss": 0.9646, "step": 96 }, { "epoch": 0.06083411727814362, "grad_norm": 0.36986011266708374, "learning_rate": 6.789297658862876e-05, "loss": 1.0971, "step": 97 }, { "epoch": 0.0614612731263719, "grad_norm": 0.38670483231544495, "learning_rate": 6.755852842809365e-05, "loss": 1.0684, "step": 98 }, { "epoch": 0.062088428974600186, "grad_norm": 0.37243106961250305, "learning_rate": 6.722408026755853e-05, "loss": 1.027, "step": 99 }, { "epoch": 0.06271558482282848, "grad_norm": 0.3547367751598358, "learning_rate": 6.688963210702341e-05, "loss": 1.0162, "step": 100 }, { "epoch": 0.06334274067105676, "grad_norm": 0.33387291431427, "learning_rate": 6.655518394648829e-05, "loss": 1.0316, "step": 101 }, { "epoch": 0.06396989651928504, "grad_norm": 0.36430153250694275, "learning_rate": 6.622073578595317e-05, "loss": 1.0509, "step": 102 }, { "epoch": 0.06459705236751333, "grad_norm": 0.3841400146484375, "learning_rate": 6.588628762541807e-05, "loss": 0.9907, "step": 103 }, { "epoch": 0.06522420821574161, "grad_norm": 0.3880312740802765, "learning_rate": 6.555183946488295e-05, "loss": 1.1199, "step": 104 }, { "epoch": 0.0658513640639699, "grad_norm": 0.40127745270729065, "learning_rate": 6.521739130434783e-05, "loss": 1.0608, "step": 105 }, { "epoch": 0.06647851991219818, "grad_norm": 0.36271992325782776, "learning_rate": 6.488294314381272e-05, "loss": 1.0196, "step": 106 }, { "epoch": 0.06710567576042646, "grad_norm": 0.491242378950119, "learning_rate": 6.45484949832776e-05, "loss": 1.0333, "step": 107 }, { "epoch": 0.06773283160865474, "grad_norm": 0.43671101331710815, "learning_rate": 6.421404682274248e-05, "loss": 0.9627, "step": 108 }, { "epoch": 0.06835998745688304, "grad_norm": 0.3669928312301636, "learning_rate": 6.387959866220736e-05, "loss": 1.0114, "step": 109 }, { "epoch": 0.06898714330511133, "grad_norm": 0.36973488330841064, "learning_rate": 6.354515050167224e-05, "loss": 1.018, "step": 110 }, { "epoch": 0.06961429915333961, "grad_norm": 0.4004829525947571, "learning_rate": 6.321070234113713e-05, "loss": 1.1436, "step": 111 }, { "epoch": 0.07024145500156789, "grad_norm": 0.37323564291000366, "learning_rate": 6.287625418060201e-05, "loss": 1.0113, "step": 112 }, { "epoch": 0.07086861084979618, "grad_norm": 0.3787195086479187, "learning_rate": 6.254180602006689e-05, "loss": 1.0352, "step": 113 }, { "epoch": 0.07149576669802446, "grad_norm": 0.3870258331298828, "learning_rate": 6.220735785953178e-05, "loss": 1.0475, "step": 114 }, { "epoch": 0.07212292254625274, "grad_norm": 0.4817638099193573, "learning_rate": 6.187290969899667e-05, "loss": 1.0326, "step": 115 }, { "epoch": 0.07275007839448103, "grad_norm": 0.39872509241104126, "learning_rate": 6.153846153846155e-05, "loss": 1.024, "step": 116 }, { "epoch": 0.07337723424270931, "grad_norm": 0.45137402415275574, "learning_rate": 6.120401337792643e-05, "loss": 1.1143, "step": 117 }, { "epoch": 0.0740043900909376, "grad_norm": 0.39231353998184204, "learning_rate": 6.086956521739131e-05, "loss": 1.0349, "step": 118 }, { "epoch": 0.07463154593916588, "grad_norm": 0.35880520939826965, "learning_rate": 6.0535117056856194e-05, "loss": 0.9345, "step": 119 }, { "epoch": 0.07525870178739416, "grad_norm": 0.3513333797454834, "learning_rate": 6.0200668896321076e-05, "loss": 0.9967, "step": 120 }, { "epoch": 0.07588585763562246, "grad_norm": 0.35939666628837585, "learning_rate": 5.986622073578596e-05, "loss": 1.0429, "step": 121 }, { "epoch": 0.07651301348385074, "grad_norm": 0.4144527018070221, "learning_rate": 5.953177257525085e-05, "loss": 1.0793, "step": 122 }, { "epoch": 0.07714016933207903, "grad_norm": 0.37052595615386963, "learning_rate": 5.919732441471573e-05, "loss": 1.0402, "step": 123 }, { "epoch": 0.07776732518030731, "grad_norm": 0.3883453607559204, "learning_rate": 5.886287625418061e-05, "loss": 1.0859, "step": 124 }, { "epoch": 0.07839448102853559, "grad_norm": 0.3768259286880493, "learning_rate": 5.852842809364549e-05, "loss": 1.0231, "step": 125 }, { "epoch": 0.07902163687676388, "grad_norm": 0.3901348412036896, "learning_rate": 5.819397993311037e-05, "loss": 1.0889, "step": 126 }, { "epoch": 0.07964879272499216, "grad_norm": 0.404969185590744, "learning_rate": 5.785953177257525e-05, "loss": 0.9751, "step": 127 }, { "epoch": 0.08027594857322044, "grad_norm": 0.4729614555835724, "learning_rate": 5.752508361204013e-05, "loss": 1.1595, "step": 128 }, { "epoch": 0.08090310442144873, "grad_norm": 0.37773218750953674, "learning_rate": 5.7190635451505014e-05, "loss": 1.0176, "step": 129 }, { "epoch": 0.08153026026967701, "grad_norm": 0.3693353831768036, "learning_rate": 5.6856187290969896e-05, "loss": 1.0628, "step": 130 }, { "epoch": 0.0821574161179053, "grad_norm": 0.3893721103668213, "learning_rate": 5.652173913043478e-05, "loss": 1.083, "step": 131 }, { "epoch": 0.08278457196613359, "grad_norm": 0.3790439963340759, "learning_rate": 5.6187290969899666e-05, "loss": 1.0221, "step": 132 }, { "epoch": 0.08341172781436187, "grad_norm": 0.37359848618507385, "learning_rate": 5.585284280936455e-05, "loss": 0.9618, "step": 133 }, { "epoch": 0.08403888366259016, "grad_norm": 0.3717849552631378, "learning_rate": 5.551839464882943e-05, "loss": 1.0401, "step": 134 }, { "epoch": 0.08466603951081844, "grad_norm": 0.3849802613258362, "learning_rate": 5.518394648829431e-05, "loss": 1.0028, "step": 135 }, { "epoch": 0.08529319535904673, "grad_norm": 0.3668459355831146, "learning_rate": 5.4849498327759194e-05, "loss": 1.0282, "step": 136 }, { "epoch": 0.08592035120727501, "grad_norm": 0.3450651466846466, "learning_rate": 5.451505016722408e-05, "loss": 0.953, "step": 137 }, { "epoch": 0.08654750705550329, "grad_norm": 0.4239393472671509, "learning_rate": 5.4180602006688965e-05, "loss": 1.0164, "step": 138 }, { "epoch": 0.08717466290373158, "grad_norm": 0.3869022727012634, "learning_rate": 5.384615384615385e-05, "loss": 1.0557, "step": 139 }, { "epoch": 0.08780181875195986, "grad_norm": 0.3623475432395935, "learning_rate": 5.351170568561873e-05, "loss": 1.0586, "step": 140 }, { "epoch": 0.08842897460018814, "grad_norm": 0.39329102635383606, "learning_rate": 5.317725752508361e-05, "loss": 1.0265, "step": 141 }, { "epoch": 0.08905613044841643, "grad_norm": 0.4004840552806854, "learning_rate": 5.284280936454849e-05, "loss": 1.0591, "step": 142 }, { "epoch": 0.08968328629664471, "grad_norm": 0.4108268618583679, "learning_rate": 5.250836120401338e-05, "loss": 1.0628, "step": 143 }, { "epoch": 0.09031044214487301, "grad_norm": 0.4385989308357239, "learning_rate": 5.217391304347826e-05, "loss": 1.0755, "step": 144 }, { "epoch": 0.09093759799310129, "grad_norm": 0.43369996547698975, "learning_rate": 5.1839464882943145e-05, "loss": 1.0681, "step": 145 }, { "epoch": 0.09156475384132957, "grad_norm": 0.3623196482658386, "learning_rate": 5.150501672240803e-05, "loss": 1.0098, "step": 146 }, { "epoch": 0.09219190968955786, "grad_norm": 0.37398290634155273, "learning_rate": 5.117056856187291e-05, "loss": 0.9872, "step": 147 }, { "epoch": 0.09281906553778614, "grad_norm": 0.3824230134487152, "learning_rate": 5.08361204013378e-05, "loss": 1.0274, "step": 148 }, { "epoch": 0.09344622138601442, "grad_norm": 0.40129098296165466, "learning_rate": 5.050167224080268e-05, "loss": 1.057, "step": 149 }, { "epoch": 0.09407337723424271, "grad_norm": 0.3967929780483246, "learning_rate": 5.016722408026756e-05, "loss": 1.0452, "step": 150 }, { "epoch": 0.09470053308247099, "grad_norm": 0.4444187581539154, "learning_rate": 4.983277591973244e-05, "loss": 1.0567, "step": 151 }, { "epoch": 0.09532768893069928, "grad_norm": 0.44575589895248413, "learning_rate": 4.9498327759197325e-05, "loss": 0.9791, "step": 152 }, { "epoch": 0.09595484477892756, "grad_norm": 0.37481924891471863, "learning_rate": 4.916387959866221e-05, "loss": 1.0127, "step": 153 }, { "epoch": 0.09658200062715584, "grad_norm": 0.38331514596939087, "learning_rate": 4.8829431438127096e-05, "loss": 1.0009, "step": 154 }, { "epoch": 0.09720915647538414, "grad_norm": 0.41739198565483093, "learning_rate": 4.849498327759198e-05, "loss": 1.0721, "step": 155 }, { "epoch": 0.09783631232361242, "grad_norm": 0.401023805141449, "learning_rate": 4.816053511705686e-05, "loss": 1.0375, "step": 156 }, { "epoch": 0.0984634681718407, "grad_norm": 0.38500455021858215, "learning_rate": 4.782608695652174e-05, "loss": 1.045, "step": 157 }, { "epoch": 0.09909062402006899, "grad_norm": 0.4293578863143921, "learning_rate": 4.7491638795986624e-05, "loss": 1.056, "step": 158 }, { "epoch": 0.09971777986829727, "grad_norm": 0.4320215880870819, "learning_rate": 4.715719063545151e-05, "loss": 0.9005, "step": 159 }, { "epoch": 0.10034493571652556, "grad_norm": 0.4018799960613251, "learning_rate": 4.6822742474916394e-05, "loss": 1.0486, "step": 160 }, { "epoch": 0.10097209156475384, "grad_norm": 0.41116300225257874, "learning_rate": 4.6488294314381276e-05, "loss": 1.0391, "step": 161 }, { "epoch": 0.10159924741298212, "grad_norm": 0.40958237648010254, "learning_rate": 4.615384615384616e-05, "loss": 1.1217, "step": 162 }, { "epoch": 0.10222640326121041, "grad_norm": 0.41457492113113403, "learning_rate": 4.581939799331103e-05, "loss": 1.031, "step": 163 }, { "epoch": 0.10285355910943869, "grad_norm": 0.41084182262420654, "learning_rate": 4.548494983277592e-05, "loss": 1.0815, "step": 164 }, { "epoch": 0.10348071495766697, "grad_norm": 0.39257150888442993, "learning_rate": 4.5150501672240804e-05, "loss": 1.0174, "step": 165 }, { "epoch": 0.10410787080589526, "grad_norm": 0.38952505588531494, "learning_rate": 4.4816053511705686e-05, "loss": 1.0691, "step": 166 }, { "epoch": 0.10473502665412356, "grad_norm": 0.44120845198631287, "learning_rate": 4.448160535117057e-05, "loss": 1.0508, "step": 167 }, { "epoch": 0.10536218250235184, "grad_norm": 0.4178633391857147, "learning_rate": 4.414715719063545e-05, "loss": 1.0608, "step": 168 }, { "epoch": 0.10598933835058012, "grad_norm": 0.3926730453968048, "learning_rate": 4.381270903010034e-05, "loss": 1.0186, "step": 169 }, { "epoch": 0.1066164941988084, "grad_norm": 0.4078935980796814, "learning_rate": 4.347826086956522e-05, "loss": 1.0849, "step": 170 }, { "epoch": 0.10724365004703669, "grad_norm": 0.38935065269470215, "learning_rate": 4.31438127090301e-05, "loss": 1.0418, "step": 171 }, { "epoch": 0.10787080589526497, "grad_norm": 0.4126635193824768, "learning_rate": 4.2809364548494984e-05, "loss": 1.0002, "step": 172 }, { "epoch": 0.10849796174349326, "grad_norm": 0.413591593503952, "learning_rate": 4.2474916387959866e-05, "loss": 1.0497, "step": 173 }, { "epoch": 0.10912511759172154, "grad_norm": 0.39876362681388855, "learning_rate": 4.214046822742475e-05, "loss": 1.036, "step": 174 }, { "epoch": 0.10975227343994982, "grad_norm": 0.3766686022281647, "learning_rate": 4.180602006688964e-05, "loss": 1.0355, "step": 175 }, { "epoch": 0.11037942928817811, "grad_norm": 0.3662855625152588, "learning_rate": 4.147157190635452e-05, "loss": 0.9986, "step": 176 }, { "epoch": 0.11100658513640639, "grad_norm": 0.3837905526161194, "learning_rate": 4.11371237458194e-05, "loss": 1.0735, "step": 177 }, { "epoch": 0.11163374098463469, "grad_norm": 0.5081769824028015, "learning_rate": 4.080267558528428e-05, "loss": 1.0549, "step": 178 }, { "epoch": 0.11226089683286297, "grad_norm": 0.4082931578159332, "learning_rate": 4.0468227424749165e-05, "loss": 1.0462, "step": 179 }, { "epoch": 0.11288805268109126, "grad_norm": 0.4553743600845337, "learning_rate": 4.0133779264214046e-05, "loss": 1.0242, "step": 180 }, { "epoch": 0.11351520852931954, "grad_norm": 0.3822484314441681, "learning_rate": 3.9799331103678935e-05, "loss": 1.0099, "step": 181 }, { "epoch": 0.11414236437754782, "grad_norm": 0.3885160982608795, "learning_rate": 3.946488294314382e-05, "loss": 0.9953, "step": 182 }, { "epoch": 0.1147695202257761, "grad_norm": 0.38939550518989563, "learning_rate": 3.91304347826087e-05, "loss": 1.033, "step": 183 }, { "epoch": 0.11539667607400439, "grad_norm": 0.4274287819862366, "learning_rate": 3.879598662207358e-05, "loss": 1.0495, "step": 184 }, { "epoch": 0.11602383192223267, "grad_norm": 0.4428479075431824, "learning_rate": 3.846153846153846e-05, "loss": 0.9925, "step": 185 }, { "epoch": 0.11665098777046096, "grad_norm": 0.4075606167316437, "learning_rate": 3.812709030100335e-05, "loss": 1.0926, "step": 186 }, { "epoch": 0.11727814361868924, "grad_norm": 0.3637610375881195, "learning_rate": 3.7792642140468233e-05, "loss": 1.033, "step": 187 }, { "epoch": 0.11790529946691752, "grad_norm": 0.38497859239578247, "learning_rate": 3.745819397993311e-05, "loss": 0.9976, "step": 188 }, { "epoch": 0.11853245531514581, "grad_norm": 0.4265378415584564, "learning_rate": 3.712374581939799e-05, "loss": 1.125, "step": 189 }, { "epoch": 0.1191596111633741, "grad_norm": 0.42353934049606323, "learning_rate": 3.678929765886287e-05, "loss": 1.0472, "step": 190 }, { "epoch": 0.11978676701160239, "grad_norm": 0.42161011695861816, "learning_rate": 3.645484949832776e-05, "loss": 1.0717, "step": 191 }, { "epoch": 0.12041392285983067, "grad_norm": 0.414531409740448, "learning_rate": 3.612040133779264e-05, "loss": 1.0454, "step": 192 }, { "epoch": 0.12104107870805896, "grad_norm": 0.41244766116142273, "learning_rate": 3.5785953177257525e-05, "loss": 1.0387, "step": 193 }, { "epoch": 0.12166823455628724, "grad_norm": 0.39831194281578064, "learning_rate": 3.545150501672241e-05, "loss": 0.9996, "step": 194 }, { "epoch": 0.12229539040451552, "grad_norm": 0.397178053855896, "learning_rate": 3.511705685618729e-05, "loss": 1.02, "step": 195 }, { "epoch": 0.1229225462527438, "grad_norm": 0.3719254732131958, "learning_rate": 3.478260869565218e-05, "loss": 1.0445, "step": 196 }, { "epoch": 0.12354970210097209, "grad_norm": 0.39827004075050354, "learning_rate": 3.444816053511706e-05, "loss": 1.0226, "step": 197 }, { "epoch": 0.12417685794920037, "grad_norm": 0.4362657070159912, "learning_rate": 3.411371237458194e-05, "loss": 1.0073, "step": 198 }, { "epoch": 0.12480401379742866, "grad_norm": 0.41483965516090393, "learning_rate": 3.3779264214046823e-05, "loss": 1.043, "step": 199 }, { "epoch": 0.12543116964565695, "grad_norm": 0.40933161973953247, "learning_rate": 3.3444816053511705e-05, "loss": 1.0299, "step": 200 }, { "epoch": 0.12605832549388524, "grad_norm": 0.3616185784339905, "learning_rate": 3.311036789297659e-05, "loss": 0.9555, "step": 201 }, { "epoch": 0.12668548134211352, "grad_norm": 0.37591472268104553, "learning_rate": 3.2775919732441476e-05, "loss": 1.076, "step": 202 }, { "epoch": 0.1273126371903418, "grad_norm": 0.3866356611251831, "learning_rate": 3.244147157190636e-05, "loss": 0.9543, "step": 203 }, { "epoch": 0.1279397930385701, "grad_norm": 0.4237740933895111, "learning_rate": 3.210702341137124e-05, "loss": 1.0667, "step": 204 }, { "epoch": 0.12856694888679837, "grad_norm": 0.4242074489593506, "learning_rate": 3.177257525083612e-05, "loss": 0.9898, "step": 205 }, { "epoch": 0.12919410473502665, "grad_norm": 0.39934250712394714, "learning_rate": 3.1438127090301004e-05, "loss": 1.0489, "step": 206 }, { "epoch": 0.12982126058325494, "grad_norm": 0.42593199014663696, "learning_rate": 3.110367892976589e-05, "loss": 1.0614, "step": 207 }, { "epoch": 0.13044841643148322, "grad_norm": 0.4550575315952301, "learning_rate": 3.0769230769230774e-05, "loss": 0.9835, "step": 208 }, { "epoch": 0.1310755722797115, "grad_norm": 0.4025239944458008, "learning_rate": 3.0434782608695656e-05, "loss": 0.9985, "step": 209 }, { "epoch": 0.1317027281279398, "grad_norm": 0.41113126277923584, "learning_rate": 3.0100334448160538e-05, "loss": 1.04, "step": 210 }, { "epoch": 0.13232988397616807, "grad_norm": 0.39338940382003784, "learning_rate": 2.9765886287625424e-05, "loss": 1.0446, "step": 211 }, { "epoch": 0.13295703982439636, "grad_norm": 0.4386296272277832, "learning_rate": 2.9431438127090305e-05, "loss": 1.0588, "step": 212 }, { "epoch": 0.13358419567262464, "grad_norm": 0.454953134059906, "learning_rate": 2.9096989966555184e-05, "loss": 1.0658, "step": 213 }, { "epoch": 0.13421135152085292, "grad_norm": 0.549017071723938, "learning_rate": 2.8762541806020066e-05, "loss": 1.0808, "step": 214 }, { "epoch": 0.1348385073690812, "grad_norm": 0.4158805012702942, "learning_rate": 2.8428093645484948e-05, "loss": 1.0682, "step": 215 }, { "epoch": 0.1354656632173095, "grad_norm": 0.37862929701805115, "learning_rate": 2.8093645484949833e-05, "loss": 1.057, "step": 216 }, { "epoch": 0.13609281906553777, "grad_norm": 0.4133341610431671, "learning_rate": 2.7759197324414715e-05, "loss": 1.0314, "step": 217 }, { "epoch": 0.13671997491376608, "grad_norm": 0.43372365832328796, "learning_rate": 2.7424749163879597e-05, "loss": 1.0514, "step": 218 }, { "epoch": 0.13734713076199437, "grad_norm": 0.37303996086120605, "learning_rate": 2.7090301003344482e-05, "loss": 0.9123, "step": 219 }, { "epoch": 0.13797428661022265, "grad_norm": 0.3717896342277527, "learning_rate": 2.6755852842809364e-05, "loss": 0.9934, "step": 220 }, { "epoch": 0.13860144245845094, "grad_norm": 0.3860597610473633, "learning_rate": 2.6421404682274246e-05, "loss": 1.0206, "step": 221 }, { "epoch": 0.13922859830667922, "grad_norm": 0.5044668912887573, "learning_rate": 2.608695652173913e-05, "loss": 1.0132, "step": 222 }, { "epoch": 0.1398557541549075, "grad_norm": 0.4312911331653595, "learning_rate": 2.5752508361204013e-05, "loss": 1.0147, "step": 223 }, { "epoch": 0.14048291000313579, "grad_norm": 0.4386849105358124, "learning_rate": 2.54180602006689e-05, "loss": 1.0569, "step": 224 }, { "epoch": 0.14111006585136407, "grad_norm": 0.42580652236938477, "learning_rate": 2.508361204013378e-05, "loss": 1.0415, "step": 225 }, { "epoch": 0.14173722169959235, "grad_norm": 0.4034588634967804, "learning_rate": 2.4749163879598663e-05, "loss": 1.0761, "step": 226 }, { "epoch": 0.14236437754782064, "grad_norm": 0.42452365159988403, "learning_rate": 2.4414715719063548e-05, "loss": 1.015, "step": 227 }, { "epoch": 0.14299153339604892, "grad_norm": 0.44657889008522034, "learning_rate": 2.408026755852843e-05, "loss": 1.0705, "step": 228 }, { "epoch": 0.1436186892442772, "grad_norm": 0.4117855131626129, "learning_rate": 2.3745819397993312e-05, "loss": 1.0086, "step": 229 }, { "epoch": 0.1442458450925055, "grad_norm": 0.44331252574920654, "learning_rate": 2.3411371237458197e-05, "loss": 1.0749, "step": 230 }, { "epoch": 0.14487300094073377, "grad_norm": 0.43930211663246155, "learning_rate": 2.307692307692308e-05, "loss": 1.0463, "step": 231 }, { "epoch": 0.14550015678896205, "grad_norm": 0.40405145287513733, "learning_rate": 2.274247491638796e-05, "loss": 1.0351, "step": 232 }, { "epoch": 0.14612731263719034, "grad_norm": 0.42328453063964844, "learning_rate": 2.2408026755852843e-05, "loss": 0.9787, "step": 233 }, { "epoch": 0.14675446848541862, "grad_norm": 0.40019333362579346, "learning_rate": 2.2073578595317725e-05, "loss": 0.9817, "step": 234 }, { "epoch": 0.1473816243336469, "grad_norm": 0.39503028988838196, "learning_rate": 2.173913043478261e-05, "loss": 0.9632, "step": 235 }, { "epoch": 0.1480087801818752, "grad_norm": 0.36922141909599304, "learning_rate": 2.1404682274247492e-05, "loss": 0.9626, "step": 236 }, { "epoch": 0.14863593603010347, "grad_norm": 0.40530282258987427, "learning_rate": 2.1070234113712374e-05, "loss": 1.0501, "step": 237 }, { "epoch": 0.14926309187833176, "grad_norm": 0.4161907136440277, "learning_rate": 2.073578595317726e-05, "loss": 1.0479, "step": 238 }, { "epoch": 0.14989024772656004, "grad_norm": 0.4405962824821472, "learning_rate": 2.040133779264214e-05, "loss": 1.0068, "step": 239 }, { "epoch": 0.15051740357478832, "grad_norm": 0.4103190004825592, "learning_rate": 2.0066889632107023e-05, "loss": 0.9647, "step": 240 }, { "epoch": 0.15114455942301663, "grad_norm": 0.3906611502170563, "learning_rate": 1.973244147157191e-05, "loss": 0.9434, "step": 241 }, { "epoch": 0.15177171527124492, "grad_norm": 0.4198826849460602, "learning_rate": 1.939799331103679e-05, "loss": 1.0264, "step": 242 }, { "epoch": 0.1523988711194732, "grad_norm": 0.43589988350868225, "learning_rate": 1.9063545150501676e-05, "loss": 1.0957, "step": 243 }, { "epoch": 0.15302602696770148, "grad_norm": 0.38247135281562805, "learning_rate": 1.8729096989966554e-05, "loss": 0.9668, "step": 244 }, { "epoch": 0.15365318281592977, "grad_norm": 0.3897751271724701, "learning_rate": 1.8394648829431436e-05, "loss": 0.9465, "step": 245 }, { "epoch": 0.15428033866415805, "grad_norm": 0.39580726623535156, "learning_rate": 1.806020066889632e-05, "loss": 0.9801, "step": 246 }, { "epoch": 0.15490749451238633, "grad_norm": 0.4514491856098175, "learning_rate": 1.7725752508361204e-05, "loss": 1.1401, "step": 247 }, { "epoch": 0.15553465036061462, "grad_norm": 0.40299785137176514, "learning_rate": 1.739130434782609e-05, "loss": 1.063, "step": 248 }, { "epoch": 0.1561618062088429, "grad_norm": 0.41476714611053467, "learning_rate": 1.705685618729097e-05, "loss": 1.0065, "step": 249 }, { "epoch": 0.15678896205707119, "grad_norm": 0.4274609088897705, "learning_rate": 1.6722408026755853e-05, "loss": 1.0558, "step": 250 }, { "epoch": 0.15741611790529947, "grad_norm": 0.4935191869735718, "learning_rate": 1.6387959866220738e-05, "loss": 1.0056, "step": 251 }, { "epoch": 0.15804327375352775, "grad_norm": 0.38647282123565674, "learning_rate": 1.605351170568562e-05, "loss": 0.9921, "step": 252 }, { "epoch": 0.15867042960175604, "grad_norm": 0.41485288739204407, "learning_rate": 1.5719063545150502e-05, "loss": 0.9997, "step": 253 }, { "epoch": 0.15929758544998432, "grad_norm": 0.41094207763671875, "learning_rate": 1.5384615384615387e-05, "loss": 1.026, "step": 254 }, { "epoch": 0.1599247412982126, "grad_norm": 0.5010769367218018, "learning_rate": 1.5050167224080269e-05, "loss": 1.0301, "step": 255 }, { "epoch": 0.1605518971464409, "grad_norm": 0.40588176250457764, "learning_rate": 1.4715719063545153e-05, "loss": 1.0085, "step": 256 }, { "epoch": 0.16117905299466917, "grad_norm": 0.4186544120311737, "learning_rate": 1.4381270903010033e-05, "loss": 0.9676, "step": 257 }, { "epoch": 0.16180620884289745, "grad_norm": 0.38908740878105164, "learning_rate": 1.4046822742474917e-05, "loss": 1.0011, "step": 258 }, { "epoch": 0.16243336469112574, "grad_norm": 0.4013504385948181, "learning_rate": 1.3712374581939799e-05, "loss": 0.9683, "step": 259 }, { "epoch": 0.16306052053935402, "grad_norm": 0.3967900276184082, "learning_rate": 1.3377926421404682e-05, "loss": 1.0273, "step": 260 }, { "epoch": 0.1636876763875823, "grad_norm": 0.4046870172023773, "learning_rate": 1.3043478260869566e-05, "loss": 1.0192, "step": 261 }, { "epoch": 0.1643148322358106, "grad_norm": 0.4919883608818054, "learning_rate": 1.270903010033445e-05, "loss": 1.0134, "step": 262 }, { "epoch": 0.16494198808403887, "grad_norm": 0.39341261982917786, "learning_rate": 1.2374581939799331e-05, "loss": 0.9644, "step": 263 }, { "epoch": 0.16556914393226718, "grad_norm": 0.44106414914131165, "learning_rate": 1.2040133779264215e-05, "loss": 1.0905, "step": 264 }, { "epoch": 0.16619629978049547, "grad_norm": 0.4262993037700653, "learning_rate": 1.1705685618729099e-05, "loss": 0.9678, "step": 265 }, { "epoch": 0.16682345562872375, "grad_norm": 0.43883177638053894, "learning_rate": 1.137123745819398e-05, "loss": 1.0759, "step": 266 }, { "epoch": 0.16745061147695203, "grad_norm": 0.4418894052505493, "learning_rate": 1.1036789297658862e-05, "loss": 1.0093, "step": 267 }, { "epoch": 0.16807776732518032, "grad_norm": 0.40942269563674927, "learning_rate": 1.0702341137123746e-05, "loss": 1.0525, "step": 268 }, { "epoch": 0.1687049231734086, "grad_norm": 0.3933064639568329, "learning_rate": 1.036789297658863e-05, "loss": 0.9859, "step": 269 }, { "epoch": 0.16933207902163688, "grad_norm": 0.43355491757392883, "learning_rate": 1.0033444816053512e-05, "loss": 1.0071, "step": 270 }, { "epoch": 0.16995923486986517, "grad_norm": 0.4011549949645996, "learning_rate": 9.698996655518395e-06, "loss": 0.9301, "step": 271 }, { "epoch": 0.17058639071809345, "grad_norm": 0.41236254572868347, "learning_rate": 9.364548494983277e-06, "loss": 0.9722, "step": 272 }, { "epoch": 0.17121354656632173, "grad_norm": 0.4208712577819824, "learning_rate": 9.03010033444816e-06, "loss": 1.016, "step": 273 }, { "epoch": 0.17184070241455002, "grad_norm": 0.43124738335609436, "learning_rate": 8.695652173913044e-06, "loss": 1.042, "step": 274 }, { "epoch": 0.1724678582627783, "grad_norm": 0.43700850009918213, "learning_rate": 8.361204013377926e-06, "loss": 1.096, "step": 275 }, { "epoch": 0.17309501411100658, "grad_norm": 0.41486409306526184, "learning_rate": 8.02675585284281e-06, "loss": 1.0385, "step": 276 }, { "epoch": 0.17372216995923487, "grad_norm": 0.41307199001312256, "learning_rate": 7.692307692307694e-06, "loss": 1.0535, "step": 277 }, { "epoch": 0.17434932580746315, "grad_norm": 0.4321426749229431, "learning_rate": 7.357859531772576e-06, "loss": 0.9939, "step": 278 }, { "epoch": 0.17497648165569143, "grad_norm": 0.43575429916381836, "learning_rate": 7.023411371237458e-06, "loss": 1.0395, "step": 279 }, { "epoch": 0.17560363750391972, "grad_norm": 0.40828558802604675, "learning_rate": 6.688963210702341e-06, "loss": 1.0325, "step": 280 }, { "epoch": 0.176230793352148, "grad_norm": 0.39350196719169617, "learning_rate": 6.354515050167225e-06, "loss": 0.9613, "step": 281 }, { "epoch": 0.17685794920037629, "grad_norm": 0.46433594822883606, "learning_rate": 6.0200668896321075e-06, "loss": 0.9917, "step": 282 }, { "epoch": 0.17748510504860457, "grad_norm": 0.451623797416687, "learning_rate": 5.68561872909699e-06, "loss": 0.995, "step": 283 }, { "epoch": 0.17811226089683285, "grad_norm": 0.4262632727622986, "learning_rate": 5.351170568561873e-06, "loss": 1.0278, "step": 284 }, { "epoch": 0.17873941674506114, "grad_norm": 0.4028262197971344, "learning_rate": 5.016722408026756e-06, "loss": 1.0407, "step": 285 }, { "epoch": 0.17936657259328942, "grad_norm": 0.4666818082332611, "learning_rate": 4.682274247491639e-06, "loss": 1.0857, "step": 286 }, { "epoch": 0.17999372844151773, "grad_norm": 0.4177907705307007, "learning_rate": 4.347826086956522e-06, "loss": 1.1049, "step": 287 }, { "epoch": 0.18062088428974601, "grad_norm": 0.4109366238117218, "learning_rate": 4.013377926421405e-06, "loss": 0.9684, "step": 288 }, { "epoch": 0.1812480401379743, "grad_norm": 0.3886430263519287, "learning_rate": 3.678929765886288e-06, "loss": 0.9639, "step": 289 }, { "epoch": 0.18187519598620258, "grad_norm": 0.3857450783252716, "learning_rate": 3.3444816053511705e-06, "loss": 1.0563, "step": 290 }, { "epoch": 0.18250235183443086, "grad_norm": 0.4161767363548279, "learning_rate": 3.0100334448160537e-06, "loss": 0.9995, "step": 291 }, { "epoch": 0.18312950768265915, "grad_norm": 0.4179271459579468, "learning_rate": 2.6755852842809365e-06, "loss": 1.0718, "step": 292 }, { "epoch": 0.18375666353088743, "grad_norm": 0.36055099964141846, "learning_rate": 2.3411371237458193e-06, "loss": 0.9857, "step": 293 }, { "epoch": 0.18438381937911572, "grad_norm": 0.40733909606933594, "learning_rate": 2.0066889632107025e-06, "loss": 0.9945, "step": 294 }, { "epoch": 0.185010975227344, "grad_norm": 0.3714075982570648, "learning_rate": 1.6722408026755853e-06, "loss": 1.0026, "step": 295 }, { "epoch": 0.18563813107557228, "grad_norm": 0.4596184492111206, "learning_rate": 1.3377926421404683e-06, "loss": 0.9711, "step": 296 }, { "epoch": 0.18626528692380057, "grad_norm": 0.38908660411834717, "learning_rate": 1.0033444816053512e-06, "loss": 1.0009, "step": 297 }, { "epoch": 0.18689244277202885, "grad_norm": 0.43956777453422546, "learning_rate": 6.688963210702341e-07, "loss": 1.0317, "step": 298 }, { "epoch": 0.18751959862025713, "grad_norm": 0.4340553879737854, "learning_rate": 3.3444816053511706e-07, "loss": 1.0152, "step": 299 }, { "epoch": 0.18814675446848542, "grad_norm": 0.4342001676559448, "learning_rate": 0.0, "loss": 1.0882, "step": 300 } ], "logging_steps": 1, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.314602411592909e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }