{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.18814675446848542, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006271558482282847, "grad_norm": 1.1711084842681885, "learning_rate": 0.0001, "loss": 2.4046, "step": 1 }, { "epoch": 0.0012543116964565694, "grad_norm": 1.008390188217163, "learning_rate": 9.966555183946489e-05, "loss": 2.2223, "step": 2 }, { "epoch": 0.0018814675446848542, "grad_norm": 0.9596850872039795, "learning_rate": 9.933110367892977e-05, "loss": 2.4222, "step": 3 }, { "epoch": 0.002508623392913139, "grad_norm": 0.9983300566673279, "learning_rate": 9.899665551839465e-05, "loss": 2.2476, "step": 4 }, { "epoch": 0.0031357792411414237, "grad_norm": 1.08811354637146, "learning_rate": 9.866220735785953e-05, "loss": 2.1187, "step": 5 }, { "epoch": 0.0037629350893697085, "grad_norm": 1.1631696224212646, "learning_rate": 9.832775919732441e-05, "loss": 1.768, "step": 6 }, { "epoch": 0.004390090937597993, "grad_norm": 1.5843448638916016, "learning_rate": 9.799331103678931e-05, "loss": 1.6732, "step": 7 }, { "epoch": 0.005017246785826278, "grad_norm": 0.9309654235839844, "learning_rate": 9.765886287625419e-05, "loss": 1.4838, "step": 8 }, { "epoch": 0.005644402634054563, "grad_norm": 0.809363603591919, "learning_rate": 9.732441471571907e-05, "loss": 1.4857, "step": 9 }, { "epoch": 0.006271558482282847, "grad_norm": 0.8475091457366943, "learning_rate": 9.698996655518396e-05, "loss": 1.4753, "step": 10 }, { "epoch": 0.006898714330511132, "grad_norm": 0.8010616898536682, "learning_rate": 9.665551839464884e-05, "loss": 1.2778, "step": 11 }, { "epoch": 0.007525870178739417, "grad_norm": 0.6085690259933472, "learning_rate": 9.632107023411372e-05, "loss": 1.3194, "step": 12 }, { "epoch": 0.008153026026967701, "grad_norm": 0.4717109799385071, "learning_rate": 9.59866220735786e-05, "loss": 1.275, "step": 13 }, { "epoch": 0.008780181875195987, "grad_norm": 0.4746397137641907, "learning_rate": 9.565217391304348e-05, "loss": 1.2539, "step": 14 }, { "epoch": 0.00940733772342427, "grad_norm": 0.48207026720046997, "learning_rate": 9.531772575250837e-05, "loss": 1.2033, "step": 15 }, { "epoch": 0.010034493571652555, "grad_norm": 0.4692087769508362, "learning_rate": 9.498327759197325e-05, "loss": 1.2711, "step": 16 }, { "epoch": 0.01066164941988084, "grad_norm": 0.4813322126865387, "learning_rate": 9.464882943143813e-05, "loss": 1.1544, "step": 17 }, { "epoch": 0.011288805268109126, "grad_norm": 0.4796231985092163, "learning_rate": 9.431438127090302e-05, "loss": 1.1802, "step": 18 }, { "epoch": 0.01191596111633741, "grad_norm": 0.5106775760650635, "learning_rate": 9.39799331103679e-05, "loss": 1.2377, "step": 19 }, { "epoch": 0.012543116964565695, "grad_norm": 0.5248191356658936, "learning_rate": 9.364548494983279e-05, "loss": 1.1643, "step": 20 }, { "epoch": 0.01317027281279398, "grad_norm": 0.5244407653808594, "learning_rate": 9.331103678929767e-05, "loss": 1.1315, "step": 21 }, { "epoch": 0.013797428661022263, "grad_norm": 0.5456350445747375, "learning_rate": 9.297658862876255e-05, "loss": 1.1715, "step": 22 }, { "epoch": 0.014424584509250549, "grad_norm": 0.5740377902984619, "learning_rate": 9.264214046822743e-05, "loss": 1.2334, "step": 23 }, { "epoch": 0.015051740357478834, "grad_norm": 0.5430876016616821, "learning_rate": 9.230769230769232e-05, "loss": 1.1988, "step": 24 }, { "epoch": 0.01567889620570712, "grad_norm": 0.6482923626899719, "learning_rate": 9.19732441471572e-05, "loss": 1.2454, "step": 25 }, { "epoch": 0.016306052053935403, "grad_norm": 0.6234032511711121, "learning_rate": 9.163879598662207e-05, "loss": 1.1488, "step": 26 }, { "epoch": 0.016933207902163686, "grad_norm": 0.6938403248786926, "learning_rate": 9.130434782608696e-05, "loss": 1.228, "step": 27 }, { "epoch": 0.017560363750391973, "grad_norm": 0.7055364847183228, "learning_rate": 9.096989966555184e-05, "loss": 1.1577, "step": 28 }, { "epoch": 0.018187519598620257, "grad_norm": 0.7398049235343933, "learning_rate": 9.063545150501673e-05, "loss": 1.1863, "step": 29 }, { "epoch": 0.01881467544684854, "grad_norm": 0.7258826494216919, "learning_rate": 9.030100334448161e-05, "loss": 1.124, "step": 30 }, { "epoch": 0.019441831295076827, "grad_norm": 0.7851470708847046, "learning_rate": 8.996655518394649e-05, "loss": 1.1256, "step": 31 }, { "epoch": 0.02006898714330511, "grad_norm": 0.8108616471290588, "learning_rate": 8.963210702341137e-05, "loss": 1.0102, "step": 32 }, { "epoch": 0.020696142991533398, "grad_norm": 0.7878923416137695, "learning_rate": 8.929765886287625e-05, "loss": 1.1043, "step": 33 }, { "epoch": 0.02132329883976168, "grad_norm": 0.8210941553115845, "learning_rate": 8.896321070234114e-05, "loss": 1.0992, "step": 34 }, { "epoch": 0.021950454687989965, "grad_norm": 0.8405901193618774, "learning_rate": 8.862876254180602e-05, "loss": 1.1528, "step": 35 }, { "epoch": 0.022577610536218252, "grad_norm": 0.6356008052825928, "learning_rate": 8.82943143812709e-05, "loss": 1.1983, "step": 36 }, { "epoch": 0.023204766384446535, "grad_norm": 0.4946132302284241, "learning_rate": 8.795986622073578e-05, "loss": 1.0066, "step": 37 }, { "epoch": 0.02383192223267482, "grad_norm": 0.3963969051837921, "learning_rate": 8.762541806020068e-05, "loss": 1.1063, "step": 38 }, { "epoch": 0.024459078080903106, "grad_norm": 0.42465490102767944, "learning_rate": 8.729096989966556e-05, "loss": 1.1582, "step": 39 }, { "epoch": 0.02508623392913139, "grad_norm": 0.43699175119400024, "learning_rate": 8.695652173913044e-05, "loss": 1.2481, "step": 40 }, { "epoch": 0.025713389777359673, "grad_norm": 0.35003581643104553, "learning_rate": 8.662207357859532e-05, "loss": 1.0657, "step": 41 }, { "epoch": 0.02634054562558796, "grad_norm": 0.3598668873310089, "learning_rate": 8.62876254180602e-05, "loss": 1.1521, "step": 42 }, { "epoch": 0.026967701473816243, "grad_norm": 0.3630351126194, "learning_rate": 8.595317725752509e-05, "loss": 1.0817, "step": 43 }, { "epoch": 0.027594857322044527, "grad_norm": 0.40215277671813965, "learning_rate": 8.561872909698997e-05, "loss": 1.129, "step": 44 }, { "epoch": 0.028222013170272814, "grad_norm": 0.4377795457839966, "learning_rate": 8.528428093645485e-05, "loss": 1.1326, "step": 45 }, { "epoch": 0.028849169018501097, "grad_norm": 0.48775389790534973, "learning_rate": 8.494983277591973e-05, "loss": 1.1442, "step": 46 }, { "epoch": 0.02947632486672938, "grad_norm": 0.4295575022697449, "learning_rate": 8.461538461538461e-05, "loss": 1.1551, "step": 47 }, { "epoch": 0.030103480714957668, "grad_norm": 0.3610740900039673, "learning_rate": 8.42809364548495e-05, "loss": 1.019, "step": 48 }, { "epoch": 0.03073063656318595, "grad_norm": 0.43356069922447205, "learning_rate": 8.394648829431439e-05, "loss": 1.1686, "step": 49 }, { "epoch": 0.03135779241141424, "grad_norm": 0.4673689305782318, "learning_rate": 8.361204013377927e-05, "loss": 1.1223, "step": 50 }, { "epoch": 0.03198494825964252, "grad_norm": 0.3878093659877777, "learning_rate": 8.327759197324416e-05, "loss": 1.1221, "step": 51 }, { "epoch": 0.032612104107870805, "grad_norm": 0.40353527665138245, "learning_rate": 8.294314381270904e-05, "loss": 1.1203, "step": 52 }, { "epoch": 0.03323925995609909, "grad_norm": 0.4405611753463745, "learning_rate": 8.260869565217392e-05, "loss": 1.121, "step": 53 }, { "epoch": 0.03386641580432737, "grad_norm": 0.43877699971199036, "learning_rate": 8.22742474916388e-05, "loss": 1.1487, "step": 54 }, { "epoch": 0.03449357165255566, "grad_norm": 0.38161155581474304, "learning_rate": 8.193979933110368e-05, "loss": 1.0979, "step": 55 }, { "epoch": 0.035120727500783946, "grad_norm": 0.3485104441642761, "learning_rate": 8.160535117056857e-05, "loss": 1.0555, "step": 56 }, { "epoch": 0.03574788334901223, "grad_norm": 0.4038199186325073, "learning_rate": 8.127090301003345e-05, "loss": 1.108, "step": 57 }, { "epoch": 0.036375039197240513, "grad_norm": 0.4036957621574402, "learning_rate": 8.093645484949833e-05, "loss": 1.0979, "step": 58 }, { "epoch": 0.0370021950454688, "grad_norm": 0.433468222618103, "learning_rate": 8.060200668896321e-05, "loss": 1.1714, "step": 59 }, { "epoch": 0.03762935089369708, "grad_norm": 0.42450040578842163, "learning_rate": 8.026755852842809e-05, "loss": 1.0953, "step": 60 }, { "epoch": 0.03825650674192537, "grad_norm": 0.43583229184150696, "learning_rate": 7.993311036789299e-05, "loss": 1.0818, "step": 61 }, { "epoch": 0.038883662590153655, "grad_norm": 0.3825129270553589, "learning_rate": 7.959866220735787e-05, "loss": 1.0982, "step": 62 }, { "epoch": 0.03951081843838194, "grad_norm": 0.4264048933982849, "learning_rate": 7.926421404682275e-05, "loss": 1.0857, "step": 63 }, { "epoch": 0.04013797428661022, "grad_norm": 0.4036998152732849, "learning_rate": 7.892976588628763e-05, "loss": 1.1427, "step": 64 }, { "epoch": 0.040765130134838505, "grad_norm": 0.40438225865364075, "learning_rate": 7.859531772575252e-05, "loss": 1.1452, "step": 65 }, { "epoch": 0.041392285983066796, "grad_norm": 0.42490822076797485, "learning_rate": 7.82608695652174e-05, "loss": 1.0982, "step": 66 }, { "epoch": 0.04201944183129508, "grad_norm": 0.395109087228775, "learning_rate": 7.792642140468228e-05, "loss": 1.0857, "step": 67 }, { "epoch": 0.04264659767952336, "grad_norm": 0.36866042017936707, "learning_rate": 7.759197324414716e-05, "loss": 1.0181, "step": 68 }, { "epoch": 0.043273753527751646, "grad_norm": 0.4392179846763611, "learning_rate": 7.725752508361204e-05, "loss": 1.1206, "step": 69 }, { "epoch": 0.04390090937597993, "grad_norm": 0.44415852427482605, "learning_rate": 7.692307692307693e-05, "loss": 1.0366, "step": 70 }, { "epoch": 0.04452806522420821, "grad_norm": 0.40733855962753296, "learning_rate": 7.658862876254181e-05, "loss": 1.0553, "step": 71 }, { "epoch": 0.045155221072436504, "grad_norm": 0.44080695509910583, "learning_rate": 7.62541806020067e-05, "loss": 1.105, "step": 72 }, { "epoch": 0.04578237692066479, "grad_norm": 0.4520654082298279, "learning_rate": 7.591973244147159e-05, "loss": 1.1086, "step": 73 }, { "epoch": 0.04640953276889307, "grad_norm": 0.39047616720199585, "learning_rate": 7.558528428093647e-05, "loss": 1.17, "step": 74 }, { "epoch": 0.047036688617121354, "grad_norm": 0.3795330822467804, "learning_rate": 7.525083612040135e-05, "loss": 1.1285, "step": 75 }, { "epoch": 0.04766384446534964, "grad_norm": 0.3575787842273712, "learning_rate": 7.491638795986622e-05, "loss": 1.0705, "step": 76 }, { "epoch": 0.04829100031357792, "grad_norm": 0.3704710304737091, "learning_rate": 7.45819397993311e-05, "loss": 1.0276, "step": 77 }, { "epoch": 0.04891815616180621, "grad_norm": 0.3675391674041748, "learning_rate": 7.424749163879598e-05, "loss": 1.0486, "step": 78 }, { "epoch": 0.049545312010034495, "grad_norm": 0.39707186818122864, "learning_rate": 7.391304347826086e-05, "loss": 1.1237, "step": 79 }, { "epoch": 0.05017246785826278, "grad_norm": 0.4096493124961853, "learning_rate": 7.357859531772575e-05, "loss": 1.1009, "step": 80 }, { "epoch": 0.05079962370649106, "grad_norm": 0.3946245014667511, "learning_rate": 7.324414715719064e-05, "loss": 1.1206, "step": 81 }, { "epoch": 0.051426779554719346, "grad_norm": 0.4242815375328064, "learning_rate": 7.290969899665552e-05, "loss": 1.1361, "step": 82 }, { "epoch": 0.05205393540294763, "grad_norm": 0.41571012139320374, "learning_rate": 7.25752508361204e-05, "loss": 1.0891, "step": 83 }, { "epoch": 0.05268109125117592, "grad_norm": 0.4722791314125061, "learning_rate": 7.224080267558529e-05, "loss": 1.0776, "step": 84 }, { "epoch": 0.0533082470994042, "grad_norm": 0.40548330545425415, "learning_rate": 7.190635451505017e-05, "loss": 1.1499, "step": 85 }, { "epoch": 0.05393540294763249, "grad_norm": 0.4097810983657837, "learning_rate": 7.157190635451505e-05, "loss": 1.1157, "step": 86 }, { "epoch": 0.05456255879586077, "grad_norm": 0.4443519711494446, "learning_rate": 7.123745819397993e-05, "loss": 1.1399, "step": 87 }, { "epoch": 0.055189714644089054, "grad_norm": 0.3862561285495758, "learning_rate": 7.090301003344481e-05, "loss": 1.0257, "step": 88 }, { "epoch": 0.055816870492317344, "grad_norm": 0.4226873219013214, "learning_rate": 7.05685618729097e-05, "loss": 1.157, "step": 89 }, { "epoch": 0.05644402634054563, "grad_norm": 0.39252427220344543, "learning_rate": 7.023411371237458e-05, "loss": 1.0942, "step": 90 }, { "epoch": 0.05707118218877391, "grad_norm": 0.4015486240386963, "learning_rate": 6.989966555183946e-05, "loss": 1.0752, "step": 91 }, { "epoch": 0.057698338037002195, "grad_norm": 0.5048426985740662, "learning_rate": 6.956521739130436e-05, "loss": 1.1434, "step": 92 }, { "epoch": 0.05832549388523048, "grad_norm": 0.4149401783943176, "learning_rate": 6.923076923076924e-05, "loss": 1.0733, "step": 93 }, { "epoch": 0.05895264973345876, "grad_norm": 0.38355541229248047, "learning_rate": 6.889632107023412e-05, "loss": 1.021, "step": 94 }, { "epoch": 0.05957980558168705, "grad_norm": 0.3577008545398712, "learning_rate": 6.8561872909699e-05, "loss": 1.0236, "step": 95 }, { "epoch": 0.060206961429915336, "grad_norm": 0.3427559435367584, "learning_rate": 6.822742474916388e-05, "loss": 0.9913, "step": 96 }, { "epoch": 0.06083411727814362, "grad_norm": 0.3767299950122833, "learning_rate": 6.789297658862876e-05, "loss": 1.1362, "step": 97 }, { "epoch": 0.0614612731263719, "grad_norm": 0.39204639196395874, "learning_rate": 6.755852842809365e-05, "loss": 1.12, "step": 98 }, { "epoch": 0.062088428974600186, "grad_norm": 0.39349111914634705, "learning_rate": 6.722408026755853e-05, "loss": 1.0915, "step": 99 }, { "epoch": 0.06271558482282848, "grad_norm": 0.35664206743240356, "learning_rate": 6.688963210702341e-05, "loss": 1.0661, "step": 100 }, { "epoch": 0.06334274067105676, "grad_norm": 0.4107705056667328, "learning_rate": 6.655518394648829e-05, "loss": 1.0927, "step": 101 }, { "epoch": 0.06396989651928504, "grad_norm": 0.3697938621044159, "learning_rate": 6.622073578595317e-05, "loss": 1.0881, "step": 102 }, { "epoch": 0.06459705236751333, "grad_norm": 0.39495396614074707, "learning_rate": 6.588628762541807e-05, "loss": 1.0044, "step": 103 }, { "epoch": 0.06522420821574161, "grad_norm": 0.39368346333503723, "learning_rate": 6.555183946488295e-05, "loss": 1.1562, "step": 104 }, { "epoch": 0.0658513640639699, "grad_norm": 0.42191728949546814, "learning_rate": 6.521739130434783e-05, "loss": 1.0978, "step": 105 }, { "epoch": 0.06647851991219818, "grad_norm": 0.3669389486312866, "learning_rate": 6.488294314381272e-05, "loss": 1.0559, "step": 106 }, { "epoch": 0.06710567576042646, "grad_norm": 0.4310162663459778, "learning_rate": 6.45484949832776e-05, "loss": 1.0318, "step": 107 }, { "epoch": 0.06773283160865474, "grad_norm": 0.38448938727378845, "learning_rate": 6.421404682274248e-05, "loss": 0.9811, "step": 108 }, { "epoch": 0.06835998745688304, "grad_norm": 0.4004499912261963, "learning_rate": 6.387959866220736e-05, "loss": 1.0724, "step": 109 }, { "epoch": 0.06898714330511133, "grad_norm": 0.3796185851097107, "learning_rate": 6.354515050167224e-05, "loss": 1.0357, "step": 110 }, { "epoch": 0.06961429915333961, "grad_norm": 0.40571820735931396, "learning_rate": 6.321070234113713e-05, "loss": 1.2145, "step": 111 }, { "epoch": 0.07024145500156789, "grad_norm": 0.391155868768692, "learning_rate": 6.287625418060201e-05, "loss": 1.0646, "step": 112 }, { "epoch": 0.07086861084979618, "grad_norm": 0.40619781613349915, "learning_rate": 6.254180602006689e-05, "loss": 1.0744, "step": 113 }, { "epoch": 0.07149576669802446, "grad_norm": 0.41316911578178406, "learning_rate": 6.220735785953178e-05, "loss": 1.068, "step": 114 }, { "epoch": 0.07212292254625274, "grad_norm": 0.4110977053642273, "learning_rate": 6.187290969899667e-05, "loss": 1.0571, "step": 115 }, { "epoch": 0.07275007839448103, "grad_norm": 0.44076740741729736, "learning_rate": 6.153846153846155e-05, "loss": 1.0955, "step": 116 }, { "epoch": 0.07337723424270931, "grad_norm": 0.4696763753890991, "learning_rate": 6.120401337792643e-05, "loss": 1.1225, "step": 117 }, { "epoch": 0.0740043900909376, "grad_norm": 0.39804211258888245, "learning_rate": 6.086956521739131e-05, "loss": 1.058, "step": 118 }, { "epoch": 0.07463154593916588, "grad_norm": 0.373542845249176, "learning_rate": 6.0535117056856194e-05, "loss": 0.9764, "step": 119 }, { "epoch": 0.07525870178739416, "grad_norm": 0.39055049419403076, "learning_rate": 6.0200668896321076e-05, "loss": 1.0179, "step": 120 }, { "epoch": 0.07588585763562246, "grad_norm": 0.38459017872810364, "learning_rate": 5.986622073578596e-05, "loss": 1.0748, "step": 121 }, { "epoch": 0.07651301348385074, "grad_norm": 0.44538697600364685, "learning_rate": 5.953177257525085e-05, "loss": 1.1458, "step": 122 }, { "epoch": 0.07714016933207903, "grad_norm": 0.39173659682273865, "learning_rate": 5.919732441471573e-05, "loss": 1.0784, "step": 123 }, { "epoch": 0.07776732518030731, "grad_norm": 0.42208802700042725, "learning_rate": 5.886287625418061e-05, "loss": 1.1294, "step": 124 }, { "epoch": 0.07839448102853559, "grad_norm": 0.40436652302742004, "learning_rate": 5.852842809364549e-05, "loss": 1.053, "step": 125 }, { "epoch": 0.07902163687676388, "grad_norm": 0.41719841957092285, "learning_rate": 5.819397993311037e-05, "loss": 1.1382, "step": 126 }, { "epoch": 0.07964879272499216, "grad_norm": 0.4149632751941681, "learning_rate": 5.785953177257525e-05, "loss": 1.0128, "step": 127 }, { "epoch": 0.08027594857322044, "grad_norm": 0.504405677318573, "learning_rate": 5.752508361204013e-05, "loss": 1.1933, "step": 128 }, { "epoch": 0.08090310442144873, "grad_norm": 0.4094352126121521, "learning_rate": 5.7190635451505014e-05, "loss": 1.0802, "step": 129 }, { "epoch": 0.08153026026967701, "grad_norm": 0.37789756059646606, "learning_rate": 5.6856187290969896e-05, "loss": 1.082, "step": 130 }, { "epoch": 0.0821574161179053, "grad_norm": 0.4045063257217407, "learning_rate": 5.652173913043478e-05, "loss": 1.1129, "step": 131 }, { "epoch": 0.08278457196613359, "grad_norm": 0.40179678797721863, "learning_rate": 5.6187290969899666e-05, "loss": 1.0833, "step": 132 }, { "epoch": 0.08341172781436187, "grad_norm": 0.403834730386734, "learning_rate": 5.585284280936455e-05, "loss": 0.9957, "step": 133 }, { "epoch": 0.08403888366259016, "grad_norm": 0.38336220383644104, "learning_rate": 5.551839464882943e-05, "loss": 1.079, "step": 134 }, { "epoch": 0.08466603951081844, "grad_norm": 0.44965264201164246, "learning_rate": 5.518394648829431e-05, "loss": 1.0677, "step": 135 }, { "epoch": 0.08529319535904673, "grad_norm": 0.4083324074745178, "learning_rate": 5.4849498327759194e-05, "loss": 1.0375, "step": 136 }, { "epoch": 0.08592035120727501, "grad_norm": 0.37549543380737305, "learning_rate": 5.451505016722408e-05, "loss": 0.9811, "step": 137 }, { "epoch": 0.08654750705550329, "grad_norm": 0.41478872299194336, "learning_rate": 5.4180602006688965e-05, "loss": 1.0698, "step": 138 }, { "epoch": 0.08717466290373158, "grad_norm": 0.4115673005580902, "learning_rate": 5.384615384615385e-05, "loss": 1.1094, "step": 139 }, { "epoch": 0.08780181875195986, "grad_norm": 0.3878139555454254, "learning_rate": 5.351170568561873e-05, "loss": 1.0792, "step": 140 }, { "epoch": 0.08842897460018814, "grad_norm": 0.43197542428970337, "learning_rate": 5.317725752508361e-05, "loss": 1.0418, "step": 141 }, { "epoch": 0.08905613044841643, "grad_norm": 0.4401797652244568, "learning_rate": 5.284280936454849e-05, "loss": 1.09, "step": 142 }, { "epoch": 0.08968328629664471, "grad_norm": 0.3953765630722046, "learning_rate": 5.250836120401338e-05, "loss": 1.0994, "step": 143 }, { "epoch": 0.09031044214487301, "grad_norm": 0.5845592617988586, "learning_rate": 5.217391304347826e-05, "loss": 1.0931, "step": 144 }, { "epoch": 0.09093759799310129, "grad_norm": 0.4614081084728241, "learning_rate": 5.1839464882943145e-05, "loss": 1.1122, "step": 145 }, { "epoch": 0.09156475384132957, "grad_norm": 0.3829100430011749, "learning_rate": 5.150501672240803e-05, "loss": 1.0379, "step": 146 }, { "epoch": 0.09219190968955786, "grad_norm": 0.3920990824699402, "learning_rate": 5.117056856187291e-05, "loss": 1.0251, "step": 147 }, { "epoch": 0.09281906553778614, "grad_norm": 0.40057647228240967, "learning_rate": 5.08361204013378e-05, "loss": 1.075, "step": 148 }, { "epoch": 0.09344622138601442, "grad_norm": 0.4224538505077362, "learning_rate": 5.050167224080268e-05, "loss": 1.0747, "step": 149 }, { "epoch": 0.09407337723424271, "grad_norm": 0.495451956987381, "learning_rate": 5.016722408026756e-05, "loss": 1.113, "step": 150 }, { "epoch": 0.09470053308247099, "grad_norm": 0.4393994212150574, "learning_rate": 4.983277591973244e-05, "loss": 1.0819, "step": 151 }, { "epoch": 0.09532768893069928, "grad_norm": 0.41883373260498047, "learning_rate": 4.9498327759197325e-05, "loss": 1.0081, "step": 152 }, { "epoch": 0.09595484477892756, "grad_norm": 0.4273068904876709, "learning_rate": 4.916387959866221e-05, "loss": 1.0865, "step": 153 }, { "epoch": 0.09658200062715584, "grad_norm": 0.42094531655311584, "learning_rate": 4.8829431438127096e-05, "loss": 1.0419, "step": 154 }, { "epoch": 0.09720915647538414, "grad_norm": 0.42659589648246765, "learning_rate": 4.849498327759198e-05, "loss": 1.1113, "step": 155 }, { "epoch": 0.09783631232361242, "grad_norm": 0.38099798560142517, "learning_rate": 4.816053511705686e-05, "loss": 1.0508, "step": 156 }, { "epoch": 0.0984634681718407, "grad_norm": 0.46048882603645325, "learning_rate": 4.782608695652174e-05, "loss": 1.0956, "step": 157 }, { "epoch": 0.09909062402006899, "grad_norm": 0.4822726845741272, "learning_rate": 4.7491638795986624e-05, "loss": 1.0993, "step": 158 }, { "epoch": 0.09971777986829727, "grad_norm": 0.42813563346862793, "learning_rate": 4.715719063545151e-05, "loss": 0.953, "step": 159 }, { "epoch": 0.10034493571652556, "grad_norm": 0.4069565534591675, "learning_rate": 4.6822742474916394e-05, "loss": 1.0803, "step": 160 }, { "epoch": 0.10097209156475384, "grad_norm": 0.4259192645549774, "learning_rate": 4.6488294314381276e-05, "loss": 1.082, "step": 161 }, { "epoch": 0.10159924741298212, "grad_norm": 0.4321853220462799, "learning_rate": 4.615384615384616e-05, "loss": 1.1825, "step": 162 }, { "epoch": 0.10222640326121041, "grad_norm": 0.42676714062690735, "learning_rate": 4.581939799331103e-05, "loss": 1.0423, "step": 163 }, { "epoch": 0.10285355910943869, "grad_norm": 0.3924862742424011, "learning_rate": 4.548494983277592e-05, "loss": 1.1112, "step": 164 }, { "epoch": 0.10348071495766697, "grad_norm": 0.38594508171081543, "learning_rate": 4.5150501672240804e-05, "loss": 1.0284, "step": 165 }, { "epoch": 0.10410787080589526, "grad_norm": 0.4287392497062683, "learning_rate": 4.4816053511705686e-05, "loss": 1.11, "step": 166 }, { "epoch": 0.10473502665412356, "grad_norm": 0.43313270807266235, "learning_rate": 4.448160535117057e-05, "loss": 1.0954, "step": 167 }, { "epoch": 0.10536218250235184, "grad_norm": 0.44197893142700195, "learning_rate": 4.414715719063545e-05, "loss": 1.111, "step": 168 }, { "epoch": 0.10598933835058012, "grad_norm": 0.3909394443035126, "learning_rate": 4.381270903010034e-05, "loss": 1.0555, "step": 169 }, { "epoch": 0.1066164941988084, "grad_norm": 0.4243182837963104, "learning_rate": 4.347826086956522e-05, "loss": 1.1004, "step": 170 }, { "epoch": 0.10724365004703669, "grad_norm": 0.4609302878379822, "learning_rate": 4.31438127090301e-05, "loss": 1.0873, "step": 171 }, { "epoch": 0.10787080589526497, "grad_norm": 0.4273108243942261, "learning_rate": 4.2809364548494984e-05, "loss": 1.0637, "step": 172 }, { "epoch": 0.10849796174349326, "grad_norm": 0.43937408924102783, "learning_rate": 4.2474916387959866e-05, "loss": 1.0548, "step": 173 }, { "epoch": 0.10912511759172154, "grad_norm": 0.42941388487815857, "learning_rate": 4.214046822742475e-05, "loss": 1.0855, "step": 174 }, { "epoch": 0.10975227343994982, "grad_norm": 0.4183155298233032, "learning_rate": 4.180602006688964e-05, "loss": 1.0658, "step": 175 }, { "epoch": 0.11037942928817811, "grad_norm": 0.3873193860054016, "learning_rate": 4.147157190635452e-05, "loss": 1.0439, "step": 176 }, { "epoch": 0.11100658513640639, "grad_norm": 0.4220661520957947, "learning_rate": 4.11371237458194e-05, "loss": 1.0994, "step": 177 }, { "epoch": 0.11163374098463469, "grad_norm": 0.5796094536781311, "learning_rate": 4.080267558528428e-05, "loss": 1.1111, "step": 178 }, { "epoch": 0.11226089683286297, "grad_norm": 0.4496459662914276, "learning_rate": 4.0468227424749165e-05, "loss": 1.1081, "step": 179 }, { "epoch": 0.11288805268109126, "grad_norm": 0.4354843199253082, "learning_rate": 4.0133779264214046e-05, "loss": 1.0617, "step": 180 }, { "epoch": 0.11351520852931954, "grad_norm": 0.4018702805042267, "learning_rate": 3.9799331103678935e-05, "loss": 1.041, "step": 181 }, { "epoch": 0.11414236437754782, "grad_norm": 0.4189532697200775, "learning_rate": 3.946488294314382e-05, "loss": 1.0214, "step": 182 }, { "epoch": 0.1147695202257761, "grad_norm": 0.43697431683540344, "learning_rate": 3.91304347826087e-05, "loss": 1.0886, "step": 183 }, { "epoch": 0.11539667607400439, "grad_norm": 0.45214566588401794, "learning_rate": 3.879598662207358e-05, "loss": 1.0817, "step": 184 }, { "epoch": 0.11602383192223267, "grad_norm": 0.5171282887458801, "learning_rate": 3.846153846153846e-05, "loss": 1.07, "step": 185 }, { "epoch": 0.11665098777046096, "grad_norm": 0.47615286707878113, "learning_rate": 3.812709030100335e-05, "loss": 1.139, "step": 186 }, { "epoch": 0.11727814361868924, "grad_norm": 0.41021615266799927, "learning_rate": 3.7792642140468233e-05, "loss": 1.0551, "step": 187 }, { "epoch": 0.11790529946691752, "grad_norm": 0.4250172972679138, "learning_rate": 3.745819397993311e-05, "loss": 1.0366, "step": 188 }, { "epoch": 0.11853245531514581, "grad_norm": 0.4063580334186554, "learning_rate": 3.712374581939799e-05, "loss": 1.1209, "step": 189 }, { "epoch": 0.1191596111633741, "grad_norm": 0.4396308362483978, "learning_rate": 3.678929765886287e-05, "loss": 1.1082, "step": 190 }, { "epoch": 0.11978676701160239, "grad_norm": 0.4504964351654053, "learning_rate": 3.645484949832776e-05, "loss": 1.1178, "step": 191 }, { "epoch": 0.12041392285983067, "grad_norm": 0.4479026794433594, "learning_rate": 3.612040133779264e-05, "loss": 1.0619, "step": 192 }, { "epoch": 0.12104107870805896, "grad_norm": 0.4287208020687103, "learning_rate": 3.5785953177257525e-05, "loss": 1.0619, "step": 193 }, { "epoch": 0.12166823455628724, "grad_norm": 0.40154144167900085, "learning_rate": 3.545150501672241e-05, "loss": 1.0142, "step": 194 }, { "epoch": 0.12229539040451552, "grad_norm": 0.4535212814807892, "learning_rate": 3.511705685618729e-05, "loss": 1.0802, "step": 195 }, { "epoch": 0.1229225462527438, "grad_norm": 0.44110241532325745, "learning_rate": 3.478260869565218e-05, "loss": 1.0918, "step": 196 }, { "epoch": 0.12354970210097209, "grad_norm": 0.42719611525535583, "learning_rate": 3.444816053511706e-05, "loss": 1.0392, "step": 197 }, { "epoch": 0.12417685794920037, "grad_norm": 0.47314372658729553, "learning_rate": 3.411371237458194e-05, "loss": 1.0826, "step": 198 }, { "epoch": 0.12480401379742866, "grad_norm": 0.4626355767250061, "learning_rate": 3.3779264214046823e-05, "loss": 1.0953, "step": 199 }, { "epoch": 0.12543116964565695, "grad_norm": 0.4239532947540283, "learning_rate": 3.3444816053511705e-05, "loss": 1.0525, "step": 200 }, { "epoch": 0.12605832549388524, "grad_norm": 0.4138485789299011, "learning_rate": 3.311036789297659e-05, "loss": 1.0174, "step": 201 }, { "epoch": 0.12668548134211352, "grad_norm": 0.42386671900749207, "learning_rate": 3.2775919732441476e-05, "loss": 1.1395, "step": 202 }, { "epoch": 0.1273126371903418, "grad_norm": 0.43196991086006165, "learning_rate": 3.244147157190636e-05, "loss": 0.9895, "step": 203 }, { "epoch": 0.1279397930385701, "grad_norm": 0.433040052652359, "learning_rate": 3.210702341137124e-05, "loss": 1.1044, "step": 204 }, { "epoch": 0.12856694888679837, "grad_norm": 0.45214182138442993, "learning_rate": 3.177257525083612e-05, "loss": 1.0114, "step": 205 }, { "epoch": 0.12919410473502665, "grad_norm": 0.42340749502182007, "learning_rate": 3.1438127090301004e-05, "loss": 1.0759, "step": 206 }, { "epoch": 0.12982126058325494, "grad_norm": 0.4078756868839264, "learning_rate": 3.110367892976589e-05, "loss": 1.0794, "step": 207 }, { "epoch": 0.13044841643148322, "grad_norm": 0.4599223732948303, "learning_rate": 3.0769230769230774e-05, "loss": 1.0031, "step": 208 }, { "epoch": 0.1310755722797115, "grad_norm": 0.4522516429424286, "learning_rate": 3.0434782608695656e-05, "loss": 1.0573, "step": 209 }, { "epoch": 0.1317027281279398, "grad_norm": 0.4725417494773865, "learning_rate": 3.0100334448160538e-05, "loss": 1.0747, "step": 210 }, { "epoch": 0.13232988397616807, "grad_norm": 0.3871685862541199, "learning_rate": 2.9765886287625424e-05, "loss": 1.0604, "step": 211 }, { "epoch": 0.13295703982439636, "grad_norm": 0.4436711370944977, "learning_rate": 2.9431438127090305e-05, "loss": 1.0968, "step": 212 }, { "epoch": 0.13358419567262464, "grad_norm": 0.469163179397583, "learning_rate": 2.9096989966555184e-05, "loss": 1.0837, "step": 213 }, { "epoch": 0.13421135152085292, "grad_norm": 0.5687686800956726, "learning_rate": 2.8762541806020066e-05, "loss": 1.1365, "step": 214 }, { "epoch": 0.1348385073690812, "grad_norm": 0.4138805866241455, "learning_rate": 2.8428093645484948e-05, "loss": 1.1286, "step": 215 }, { "epoch": 0.1354656632173095, "grad_norm": 0.41999486088752747, "learning_rate": 2.8093645484949833e-05, "loss": 1.0997, "step": 216 }, { "epoch": 0.13609281906553777, "grad_norm": 0.477196604013443, "learning_rate": 2.7759197324414715e-05, "loss": 1.0981, "step": 217 }, { "epoch": 0.13671997491376608, "grad_norm": 0.42687904834747314, "learning_rate": 2.7424749163879597e-05, "loss": 1.0995, "step": 218 }, { "epoch": 0.13734713076199437, "grad_norm": 0.3768168091773987, "learning_rate": 2.7090301003344482e-05, "loss": 0.9547, "step": 219 }, { "epoch": 0.13797428661022265, "grad_norm": 0.44866129755973816, "learning_rate": 2.6755852842809364e-05, "loss": 1.0523, "step": 220 }, { "epoch": 0.13860144245845094, "grad_norm": 0.40763071179389954, "learning_rate": 2.6421404682274246e-05, "loss": 1.0735, "step": 221 }, { "epoch": 0.13922859830667922, "grad_norm": 0.46890097856521606, "learning_rate": 2.608695652173913e-05, "loss": 1.0587, "step": 222 }, { "epoch": 0.1398557541549075, "grad_norm": 0.415772408246994, "learning_rate": 2.5752508361204013e-05, "loss": 1.0704, "step": 223 }, { "epoch": 0.14048291000313579, "grad_norm": 0.4222339987754822, "learning_rate": 2.54180602006689e-05, "loss": 1.0769, "step": 224 }, { "epoch": 0.14111006585136407, "grad_norm": 0.4234933853149414, "learning_rate": 2.508361204013378e-05, "loss": 1.0896, "step": 225 }, { "epoch": 0.14173722169959235, "grad_norm": 0.41926753520965576, "learning_rate": 2.4749163879598663e-05, "loss": 1.1036, "step": 226 }, { "epoch": 0.14236437754782064, "grad_norm": 0.42077013850212097, "learning_rate": 2.4414715719063548e-05, "loss": 1.0231, "step": 227 }, { "epoch": 0.14299153339604892, "grad_norm": 0.43312421441078186, "learning_rate": 2.408026755852843e-05, "loss": 1.0829, "step": 228 }, { "epoch": 0.1436186892442772, "grad_norm": 0.42710229754447937, "learning_rate": 2.3745819397993312e-05, "loss": 1.0389, "step": 229 }, { "epoch": 0.1442458450925055, "grad_norm": 0.4713851511478424, "learning_rate": 2.3411371237458197e-05, "loss": 1.1172, "step": 230 }, { "epoch": 0.14487300094073377, "grad_norm": 0.44223445653915405, "learning_rate": 2.307692307692308e-05, "loss": 1.0541, "step": 231 }, { "epoch": 0.14550015678896205, "grad_norm": 0.41953906416893005, "learning_rate": 2.274247491638796e-05, "loss": 1.0799, "step": 232 }, { "epoch": 0.14612731263719034, "grad_norm": 0.4237317144870758, "learning_rate": 2.2408026755852843e-05, "loss": 1.0088, "step": 233 }, { "epoch": 0.14675446848541862, "grad_norm": 0.46339884400367737, "learning_rate": 2.2073578595317725e-05, "loss": 1.0181, "step": 234 }, { "epoch": 0.1473816243336469, "grad_norm": 0.41633766889572144, "learning_rate": 2.173913043478261e-05, "loss": 1.0265, "step": 235 }, { "epoch": 0.1480087801818752, "grad_norm": 0.407678484916687, "learning_rate": 2.1404682274247492e-05, "loss": 0.9992, "step": 236 }, { "epoch": 0.14863593603010347, "grad_norm": 0.44348976016044617, "learning_rate": 2.1070234113712374e-05, "loss": 1.0769, "step": 237 }, { "epoch": 0.14926309187833176, "grad_norm": 0.42968854308128357, "learning_rate": 2.073578595317726e-05, "loss": 1.0618, "step": 238 }, { "epoch": 0.14989024772656004, "grad_norm": 0.43641427159309387, "learning_rate": 2.040133779264214e-05, "loss": 1.0511, "step": 239 }, { "epoch": 0.15051740357478832, "grad_norm": 0.4331563413143158, "learning_rate": 2.0066889632107023e-05, "loss": 0.9729, "step": 240 }, { "epoch": 0.15114455942301663, "grad_norm": 0.3954283893108368, "learning_rate": 1.973244147157191e-05, "loss": 0.9756, "step": 241 }, { "epoch": 0.15177171527124492, "grad_norm": 0.44188499450683594, "learning_rate": 1.939799331103679e-05, "loss": 1.0855, "step": 242 }, { "epoch": 0.1523988711194732, "grad_norm": 0.46987831592559814, "learning_rate": 1.9063545150501676e-05, "loss": 1.1184, "step": 243 }, { "epoch": 0.15302602696770148, "grad_norm": 0.4148559868335724, "learning_rate": 1.8729096989966554e-05, "loss": 0.9811, "step": 244 }, { "epoch": 0.15365318281592977, "grad_norm": 0.439791738986969, "learning_rate": 1.8394648829431436e-05, "loss": 0.9971, "step": 245 }, { "epoch": 0.15428033866415805, "grad_norm": 0.4216610789299011, "learning_rate": 1.806020066889632e-05, "loss": 1.0084, "step": 246 }, { "epoch": 0.15490749451238633, "grad_norm": 0.5165313482284546, "learning_rate": 1.7725752508361204e-05, "loss": 1.224, "step": 247 }, { "epoch": 0.15553465036061462, "grad_norm": 0.4595968425273895, "learning_rate": 1.739130434782609e-05, "loss": 1.1169, "step": 248 }, { "epoch": 0.1561618062088429, "grad_norm": 0.40049493312835693, "learning_rate": 1.705685618729097e-05, "loss": 1.0533, "step": 249 }, { "epoch": 0.15678896205707119, "grad_norm": 0.4440176486968994, "learning_rate": 1.6722408026755853e-05, "loss": 1.1044, "step": 250 }, { "epoch": 0.15741611790529947, "grad_norm": 0.42966777086257935, "learning_rate": 1.6387959866220738e-05, "loss": 0.9887, "step": 251 }, { "epoch": 0.15804327375352775, "grad_norm": 0.3809144198894501, "learning_rate": 1.605351170568562e-05, "loss": 1.0007, "step": 252 }, { "epoch": 0.15867042960175604, "grad_norm": 0.42232662439346313, "learning_rate": 1.5719063545150502e-05, "loss": 1.0061, "step": 253 }, { "epoch": 0.15929758544998432, "grad_norm": 0.43439656496047974, "learning_rate": 1.5384615384615387e-05, "loss": 1.0307, "step": 254 }, { "epoch": 0.1599247412982126, "grad_norm": 0.5382441282272339, "learning_rate": 1.5050167224080269e-05, "loss": 1.0649, "step": 255 }, { "epoch": 0.1605518971464409, "grad_norm": 0.44557082653045654, "learning_rate": 1.4715719063545153e-05, "loss": 1.0415, "step": 256 }, { "epoch": 0.16117905299466917, "grad_norm": 0.41304898262023926, "learning_rate": 1.4381270903010033e-05, "loss": 1.0008, "step": 257 }, { "epoch": 0.16180620884289745, "grad_norm": 0.3917330503463745, "learning_rate": 1.4046822742474917e-05, "loss": 1.0225, "step": 258 }, { "epoch": 0.16243336469112574, "grad_norm": 0.43536174297332764, "learning_rate": 1.3712374581939799e-05, "loss": 1.0143, "step": 259 }, { "epoch": 0.16306052053935402, "grad_norm": 0.4426629841327667, "learning_rate": 1.3377926421404682e-05, "loss": 1.0585, "step": 260 }, { "epoch": 0.1636876763875823, "grad_norm": 0.4304388463497162, "learning_rate": 1.3043478260869566e-05, "loss": 1.0495, "step": 261 }, { "epoch": 0.1643148322358106, "grad_norm": 0.5129019021987915, "learning_rate": 1.270903010033445e-05, "loss": 1.0578, "step": 262 }, { "epoch": 0.16494198808403887, "grad_norm": 0.44399893283843994, "learning_rate": 1.2374581939799331e-05, "loss": 0.9971, "step": 263 }, { "epoch": 0.16556914393226718, "grad_norm": 0.4635840356349945, "learning_rate": 1.2040133779264215e-05, "loss": 1.1255, "step": 264 }, { "epoch": 0.16619629978049547, "grad_norm": 0.4606925845146179, "learning_rate": 1.1705685618729099e-05, "loss": 1.0031, "step": 265 }, { "epoch": 0.16682345562872375, "grad_norm": 0.4620150923728943, "learning_rate": 1.137123745819398e-05, "loss": 1.0969, "step": 266 }, { "epoch": 0.16745061147695203, "grad_norm": 0.47829005122184753, "learning_rate": 1.1036789297658862e-05, "loss": 1.0382, "step": 267 }, { "epoch": 0.16807776732518032, "grad_norm": 0.41557204723358154, "learning_rate": 1.0702341137123746e-05, "loss": 1.0763, "step": 268 }, { "epoch": 0.1687049231734086, "grad_norm": 0.4283234775066376, "learning_rate": 1.036789297658863e-05, "loss": 1.0327, "step": 269 }, { "epoch": 0.16933207902163688, "grad_norm": 0.4353739619255066, "learning_rate": 1.0033444816053512e-05, "loss": 1.0306, "step": 270 }, { "epoch": 0.16995923486986517, "grad_norm": 0.4430319666862488, "learning_rate": 9.698996655518395e-06, "loss": 0.9921, "step": 271 }, { "epoch": 0.17058639071809345, "grad_norm": 0.40864184498786926, "learning_rate": 9.364548494983277e-06, "loss": 0.9795, "step": 272 }, { "epoch": 0.17121354656632173, "grad_norm": 0.5019369721412659, "learning_rate": 9.03010033444816e-06, "loss": 1.039, "step": 273 }, { "epoch": 0.17184070241455002, "grad_norm": 0.47258105874061584, "learning_rate": 8.695652173913044e-06, "loss": 1.0749, "step": 274 }, { "epoch": 0.1724678582627783, "grad_norm": 0.4339354634284973, "learning_rate": 8.361204013377926e-06, "loss": 1.1275, "step": 275 }, { "epoch": 0.17309501411100658, "grad_norm": 0.4404523968696594, "learning_rate": 8.02675585284281e-06, "loss": 1.073, "step": 276 }, { "epoch": 0.17372216995923487, "grad_norm": 0.4684053659439087, "learning_rate": 7.692307692307694e-06, "loss": 1.1251, "step": 277 }, { "epoch": 0.17434932580746315, "grad_norm": 0.47471368312835693, "learning_rate": 7.357859531772576e-06, "loss": 1.0472, "step": 278 }, { "epoch": 0.17497648165569143, "grad_norm": 0.4690254330635071, "learning_rate": 7.023411371237458e-06, "loss": 1.0877, "step": 279 }, { "epoch": 0.17560363750391972, "grad_norm": 0.4029237627983093, "learning_rate": 6.688963210702341e-06, "loss": 1.0563, "step": 280 }, { "epoch": 0.176230793352148, "grad_norm": 0.39394280314445496, "learning_rate": 6.354515050167225e-06, "loss": 0.9945, "step": 281 }, { "epoch": 0.17685794920037629, "grad_norm": 0.45974263548851013, "learning_rate": 6.0200668896321075e-06, "loss": 1.014, "step": 282 }, { "epoch": 0.17748510504860457, "grad_norm": 0.46561309695243835, "learning_rate": 5.68561872909699e-06, "loss": 1.0119, "step": 283 }, { "epoch": 0.17811226089683285, "grad_norm": 0.4529838562011719, "learning_rate": 5.351170568561873e-06, "loss": 1.0669, "step": 284 }, { "epoch": 0.17873941674506114, "grad_norm": 0.4240736961364746, "learning_rate": 5.016722408026756e-06, "loss": 1.066, "step": 285 }, { "epoch": 0.17936657259328942, "grad_norm": 0.43232661485671997, "learning_rate": 4.682274247491639e-06, "loss": 1.1118, "step": 286 }, { "epoch": 0.17999372844151773, "grad_norm": 0.42727506160736084, "learning_rate": 4.347826086956522e-06, "loss": 1.1266, "step": 287 }, { "epoch": 0.18062088428974601, "grad_norm": 0.44371864199638367, "learning_rate": 4.013377926421405e-06, "loss": 1.0074, "step": 288 }, { "epoch": 0.1812480401379743, "grad_norm": 0.4051826596260071, "learning_rate": 3.678929765886288e-06, "loss": 0.9974, "step": 289 }, { "epoch": 0.18187519598620258, "grad_norm": 0.4324273467063904, "learning_rate": 3.3444816053511705e-06, "loss": 1.0931, "step": 290 }, { "epoch": 0.18250235183443086, "grad_norm": 0.45636269450187683, "learning_rate": 3.0100334448160537e-06, "loss": 1.0332, "step": 291 }, { "epoch": 0.18312950768265915, "grad_norm": 0.43998774886131287, "learning_rate": 2.6755852842809365e-06, "loss": 1.0825, "step": 292 }, { "epoch": 0.18375666353088743, "grad_norm": 0.38203734159469604, "learning_rate": 2.3411371237458193e-06, "loss": 1.02, "step": 293 }, { "epoch": 0.18438381937911572, "grad_norm": 0.44293034076690674, "learning_rate": 2.0066889632107025e-06, "loss": 1.0437, "step": 294 }, { "epoch": 0.185010975227344, "grad_norm": 0.45386576652526855, "learning_rate": 1.6722408026755853e-06, "loss": 1.0495, "step": 295 }, { "epoch": 0.18563813107557228, "grad_norm": 0.43592047691345215, "learning_rate": 1.3377926421404683e-06, "loss": 1.0225, "step": 296 }, { "epoch": 0.18626528692380057, "grad_norm": 0.4214257299900055, "learning_rate": 1.0033444816053512e-06, "loss": 1.0383, "step": 297 }, { "epoch": 0.18689244277202885, "grad_norm": 0.4367026388645172, "learning_rate": 6.688963210702341e-07, "loss": 1.0466, "step": 298 }, { "epoch": 0.18751959862025713, "grad_norm": 0.4570874273777008, "learning_rate": 3.3444816053511706e-07, "loss": 1.0191, "step": 299 }, { "epoch": 0.18814675446848542, "grad_norm": 0.43601682782173157, "learning_rate": 0.0, "loss": 1.1164, "step": 300 } ], "logging_steps": 1, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.280261144756224e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }