{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0009226401703335, "eval_steps": 221, "global_step": 881, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00113555713271824, "grad_norm": 0.6934691071510315, "learning_rate": 2e-05, "loss": 2.1719, "step": 1 }, { "epoch": 0.00227111426543648, "grad_norm": 0.7992697954177856, "learning_rate": 4e-05, "loss": 2.2889, "step": 2 }, { "epoch": 0.0034066713981547197, "grad_norm": 0.8131292462348938, "learning_rate": 6e-05, "loss": 2.1793, "step": 3 }, { "epoch": 0.00454222853087296, "grad_norm": 0.8916270732879639, "learning_rate": 8e-05, "loss": 2.2099, "step": 4 }, { "epoch": 0.0056777856635912, "grad_norm": 0.8980678915977478, "learning_rate": 0.0001, "loss": 2.1946, "step": 5 }, { "epoch": 0.006813342796309439, "grad_norm": 0.8374800086021423, "learning_rate": 0.00012, "loss": 2.131, "step": 6 }, { "epoch": 0.00794889992902768, "grad_norm": 0.7637040019035339, "learning_rate": 0.00014, "loss": 2.1456, "step": 7 }, { "epoch": 0.00908445706174592, "grad_norm": 0.733128011226654, "learning_rate": 0.00016, "loss": 2.0553, "step": 8 }, { "epoch": 0.01022001419446416, "grad_norm": 0.9169027209281921, "learning_rate": 0.00018, "loss": 2.0195, "step": 9 }, { "epoch": 0.0113555713271824, "grad_norm": 0.9487164616584778, "learning_rate": 0.0002, "loss": 1.9663, "step": 10 }, { "epoch": 0.012491128459900639, "grad_norm": 1.0456604957580566, "learning_rate": 0.0001999993495214666, "loss": 2.0543, "step": 11 }, { "epoch": 0.013626685592618879, "grad_norm": 1.0381840467453003, "learning_rate": 0.00019999739809432887, "loss": 2.0079, "step": 12 }, { "epoch": 0.014762242725337119, "grad_norm": 0.9310197830200195, "learning_rate": 0.00019999414574397396, "loss": 1.7789, "step": 13 }, { "epoch": 0.01589779985805536, "grad_norm": 0.8029727339744568, "learning_rate": 0.00019998959251271367, "loss": 1.8232, "step": 14 }, { "epoch": 0.017033356990773598, "grad_norm": 0.8495714664459229, "learning_rate": 0.0001999837384597835, "loss": 1.9415, "step": 15 }, { "epoch": 0.01816891412349184, "grad_norm": 0.8354431390762329, "learning_rate": 0.00019997658366134217, "loss": 1.8782, "step": 16 }, { "epoch": 0.019304471256210078, "grad_norm": 0.8955284953117371, "learning_rate": 0.00019996812821047055, "loss": 1.9713, "step": 17 }, { "epoch": 0.02044002838892832, "grad_norm": 0.8927428126335144, "learning_rate": 0.00019995837221717044, "loss": 1.9062, "step": 18 }, { "epoch": 0.021575585521646557, "grad_norm": 0.817054271697998, "learning_rate": 0.00019994731580836312, "loss": 1.9231, "step": 19 }, { "epoch": 0.0227111426543648, "grad_norm": 0.9227212071418762, "learning_rate": 0.0001999349591278877, "loss": 1.797, "step": 20 }, { "epoch": 0.023846699787083037, "grad_norm": 0.8252530694007874, "learning_rate": 0.00019992130233649933, "loss": 1.7833, "step": 21 }, { "epoch": 0.024982256919801278, "grad_norm": 0.828565239906311, "learning_rate": 0.00019990634561186695, "loss": 1.8674, "step": 22 }, { "epoch": 0.026117814052519516, "grad_norm": 0.8002648949623108, "learning_rate": 0.00019989008914857116, "loss": 1.8512, "step": 23 }, { "epoch": 0.027253371185237758, "grad_norm": 0.8129715323448181, "learning_rate": 0.00019987253315810154, "loss": 1.7364, "step": 24 }, { "epoch": 0.028388928317955996, "grad_norm": 0.9048624634742737, "learning_rate": 0.00019985367786885404, "loss": 1.8607, "step": 25 }, { "epoch": 0.029524485450674237, "grad_norm": 0.8801059722900391, "learning_rate": 0.0001998335235261278, "loss": 1.8842, "step": 26 }, { "epoch": 0.03066004258339248, "grad_norm": 0.9270484447479248, "learning_rate": 0.00019981207039212226, "loss": 1.7924, "step": 27 }, { "epoch": 0.03179559971611072, "grad_norm": 0.9225452542304993, "learning_rate": 0.00019978931874593342, "loss": 1.7976, "step": 28 }, { "epoch": 0.032931156848828955, "grad_norm": 0.9705971479415894, "learning_rate": 0.00019976526888355043, "loss": 1.8407, "step": 29 }, { "epoch": 0.034066713981547196, "grad_norm": 0.9143342971801758, "learning_rate": 0.00019973992111785172, "loss": 1.7529, "step": 30 }, { "epoch": 0.03520227111426544, "grad_norm": 1.1970179080963135, "learning_rate": 0.00019971327577860077, "loss": 1.7446, "step": 31 }, { "epoch": 0.03633782824698368, "grad_norm": 1.0857728719711304, "learning_rate": 0.00019968533321244208, "loss": 1.8829, "step": 32 }, { "epoch": 0.037473385379701914, "grad_norm": 1.01325261592865, "learning_rate": 0.00019965609378289637, "loss": 1.6979, "step": 33 }, { "epoch": 0.038608942512420155, "grad_norm": 1.0160812139511108, "learning_rate": 0.00019962555787035607, "loss": 1.5962, "step": 34 }, { "epoch": 0.0397444996451384, "grad_norm": 1.1106584072113037, "learning_rate": 0.00019959372587208035, "loss": 1.5505, "step": 35 }, { "epoch": 0.04088005677785664, "grad_norm": 1.2298392057418823, "learning_rate": 0.00019956059820218982, "loss": 1.5995, "step": 36 }, { "epoch": 0.04201561391057487, "grad_norm": 1.2002296447753906, "learning_rate": 0.0001995261752916612, "loss": 1.6512, "step": 37 }, { "epoch": 0.043151171043293114, "grad_norm": 1.3855711221694946, "learning_rate": 0.00019949045758832186, "loss": 1.6443, "step": 38 }, { "epoch": 0.044286728176011356, "grad_norm": 1.389603853225708, "learning_rate": 0.00019945344555684366, "loss": 1.6702, "step": 39 }, { "epoch": 0.0454222853087296, "grad_norm": 1.4852460622787476, "learning_rate": 0.00019941513967873737, "loss": 1.5897, "step": 40 }, { "epoch": 0.04655784244144784, "grad_norm": 1.7502464056015015, "learning_rate": 0.00019937554045234594, "loss": 1.7195, "step": 41 }, { "epoch": 0.04769339957416607, "grad_norm": 1.3839149475097656, "learning_rate": 0.00019933464839283832, "loss": 1.4941, "step": 42 }, { "epoch": 0.048828956706884315, "grad_norm": 1.4444694519042969, "learning_rate": 0.00019929246403220267, "loss": 1.7032, "step": 43 }, { "epoch": 0.049964513839602556, "grad_norm": 1.4892754554748535, "learning_rate": 0.00019924898791923935, "loss": 1.5908, "step": 44 }, { "epoch": 0.0511000709723208, "grad_norm": 1.4705095291137695, "learning_rate": 0.00019920422061955404, "loss": 1.3723, "step": 45 }, { "epoch": 0.05223562810503903, "grad_norm": 1.6887420415878296, "learning_rate": 0.00019915816271554997, "loss": 1.5091, "step": 46 }, { "epoch": 0.053371185237757274, "grad_norm": 1.6843112707138062, "learning_rate": 0.00019911081480642072, "loss": 1.6354, "step": 47 }, { "epoch": 0.054506742370475515, "grad_norm": 2.1971182823181152, "learning_rate": 0.00019906217750814234, "loss": 1.3966, "step": 48 }, { "epoch": 0.05564229950319376, "grad_norm": 2.1502106189727783, "learning_rate": 0.0001990122514534651, "loss": 1.5903, "step": 49 }, { "epoch": 0.05677785663591199, "grad_norm": 2.8416748046875, "learning_rate": 0.00019896103729190562, "loss": 1.3317, "step": 50 }, { "epoch": 0.05791341376863023, "grad_norm": 1.5481539964675903, "learning_rate": 0.00019890853568973807, "loss": 2.1215, "step": 51 }, { "epoch": 0.059048970901348474, "grad_norm": 1.8689957857131958, "learning_rate": 0.0001988547473299858, "loss": 2.1832, "step": 52 }, { "epoch": 0.060184528034066716, "grad_norm": 1.515938639640808, "learning_rate": 0.00019879967291241228, "loss": 2.1301, "step": 53 }, { "epoch": 0.06132008516678496, "grad_norm": 1.161054015159607, "learning_rate": 0.00019874331315351205, "loss": 2.001, "step": 54 }, { "epoch": 0.06245564229950319, "grad_norm": 0.8447322249412537, "learning_rate": 0.00019868566878650134, "loss": 1.8943, "step": 55 }, { "epoch": 0.06359119943222144, "grad_norm": 0.7083495855331421, "learning_rate": 0.00019862674056130865, "loss": 1.9224, "step": 56 }, { "epoch": 0.06472675656493967, "grad_norm": 0.7493062615394592, "learning_rate": 0.00019856652924456488, "loss": 1.8822, "step": 57 }, { "epoch": 0.06586231369765791, "grad_norm": 0.918002724647522, "learning_rate": 0.00019850503561959335, "loss": 1.9339, "step": 58 }, { "epoch": 0.06699787083037616, "grad_norm": 0.9835973978042603, "learning_rate": 0.00019844226048639982, "loss": 1.8674, "step": 59 }, { "epoch": 0.06813342796309439, "grad_norm": 0.8450660705566406, "learning_rate": 0.00019837820466166175, "loss": 1.9399, "step": 60 }, { "epoch": 0.06926898509581263, "grad_norm": 0.7557615041732788, "learning_rate": 0.00019831286897871797, "loss": 1.8543, "step": 61 }, { "epoch": 0.07040454222853088, "grad_norm": 0.9068436622619629, "learning_rate": 0.0001982462542875576, "loss": 1.7912, "step": 62 }, { "epoch": 0.07154009936124911, "grad_norm": 0.951479971408844, "learning_rate": 0.00019817836145480925, "loss": 1.8305, "step": 63 }, { "epoch": 0.07267565649396736, "grad_norm": 0.7732802629470825, "learning_rate": 0.0001981091913637295, "loss": 1.7819, "step": 64 }, { "epoch": 0.07381121362668559, "grad_norm": 0.8870177865028381, "learning_rate": 0.00019803874491419146, "loss": 1.7616, "step": 65 }, { "epoch": 0.07494677075940383, "grad_norm": 0.8918144702911377, "learning_rate": 0.0001979670230226733, "loss": 1.8437, "step": 66 }, { "epoch": 0.07608232789212208, "grad_norm": 0.8801097273826599, "learning_rate": 0.000197894026622246, "loss": 1.6089, "step": 67 }, { "epoch": 0.07721788502484031, "grad_norm": 0.9002765417098999, "learning_rate": 0.00019781975666256137, "loss": 1.7748, "step": 68 }, { "epoch": 0.07835344215755856, "grad_norm": 0.8477333784103394, "learning_rate": 0.0001977442141098397, "loss": 1.7125, "step": 69 }, { "epoch": 0.0794889992902768, "grad_norm": 0.9296358227729797, "learning_rate": 0.00019766739994685718, "loss": 1.6961, "step": 70 }, { "epoch": 0.08062455642299503, "grad_norm": 0.9006703495979309, "learning_rate": 0.0001975893151729331, "loss": 1.7744, "step": 71 }, { "epoch": 0.08176011355571328, "grad_norm": 0.9306021332740784, "learning_rate": 0.00019750996080391685, "loss": 1.6718, "step": 72 }, { "epoch": 0.08289567068843151, "grad_norm": 1.052278995513916, "learning_rate": 0.00019742933787217468, "loss": 1.7074, "step": 73 }, { "epoch": 0.08403122782114975, "grad_norm": 0.8849816918373108, "learning_rate": 0.00019734744742657632, "loss": 1.5902, "step": 74 }, { "epoch": 0.085166784953868, "grad_norm": 0.9274908304214478, "learning_rate": 0.0001972642905324813, "loss": 1.8853, "step": 75 }, { "epoch": 0.08630234208658623, "grad_norm": 0.9930580854415894, "learning_rate": 0.00019717986827172515, "loss": 1.7069, "step": 76 }, { "epoch": 0.08743789921930448, "grad_norm": 0.876822829246521, "learning_rate": 0.0001970941817426052, "loss": 1.6564, "step": 77 }, { "epoch": 0.08857345635202271, "grad_norm": 0.9288412928581238, "learning_rate": 0.00019700723205986643, "loss": 1.7421, "step": 78 }, { "epoch": 0.08970901348474095, "grad_norm": 0.9969463348388672, "learning_rate": 0.00019691902035468687, "loss": 1.6516, "step": 79 }, { "epoch": 0.0908445706174592, "grad_norm": 0.9952249526977539, "learning_rate": 0.00019682954777466293, "loss": 1.6836, "step": 80 }, { "epoch": 0.09198012775017743, "grad_norm": 1.017162799835205, "learning_rate": 0.00019673881548379444, "loss": 1.7225, "step": 81 }, { "epoch": 0.09311568488289568, "grad_norm": 0.9949252605438232, "learning_rate": 0.00019664682466246963, "loss": 1.6151, "step": 82 }, { "epoch": 0.09425124201561391, "grad_norm": 1.0370473861694336, "learning_rate": 0.0001965535765074495, "loss": 1.6248, "step": 83 }, { "epoch": 0.09538679914833215, "grad_norm": 1.032970905303955, "learning_rate": 0.00019645907223185254, "loss": 1.6093, "step": 84 }, { "epoch": 0.0965223562810504, "grad_norm": 1.0290247201919556, "learning_rate": 0.00019636331306513883, "loss": 1.6035, "step": 85 }, { "epoch": 0.09765791341376863, "grad_norm": 1.108210802078247, "learning_rate": 0.000196266300253094, "loss": 1.6887, "step": 86 }, { "epoch": 0.09879347054648686, "grad_norm": 1.158912181854248, "learning_rate": 0.00019616803505781305, "loss": 1.697, "step": 87 }, { "epoch": 0.09992902767920511, "grad_norm": 1.1105399131774902, "learning_rate": 0.000196068518757684, "loss": 1.6404, "step": 88 }, { "epoch": 0.10106458481192335, "grad_norm": 1.1792031526565552, "learning_rate": 0.00019596775264737126, "loss": 1.492, "step": 89 }, { "epoch": 0.1022001419446416, "grad_norm": 1.4286049604415894, "learning_rate": 0.00019586573803779858, "loss": 1.5958, "step": 90 }, { "epoch": 0.10333569907735983, "grad_norm": 1.2293449640274048, "learning_rate": 0.00019576247625613226, "loss": 1.3676, "step": 91 }, { "epoch": 0.10447125621007806, "grad_norm": 1.3195327520370483, "learning_rate": 0.00019565796864576377, "loss": 1.6307, "step": 92 }, { "epoch": 0.10560681334279631, "grad_norm": 1.3426988124847412, "learning_rate": 0.00019555221656629218, "loss": 1.6288, "step": 93 }, { "epoch": 0.10674237047551455, "grad_norm": 1.4404617547988892, "learning_rate": 0.0001954452213935067, "loss": 1.5158, "step": 94 }, { "epoch": 0.1078779276082328, "grad_norm": 1.4946198463439941, "learning_rate": 0.00019533698451936856, "loss": 1.519, "step": 95 }, { "epoch": 0.10901348474095103, "grad_norm": 1.5227530002593994, "learning_rate": 0.0001952275073519931, "loss": 1.6856, "step": 96 }, { "epoch": 0.11014904187366927, "grad_norm": 1.6402555704116821, "learning_rate": 0.00019511679131563114, "loss": 1.5146, "step": 97 }, { "epoch": 0.11128459900638751, "grad_norm": 1.951488733291626, "learning_rate": 0.0001950048378506509, "loss": 1.7854, "step": 98 }, { "epoch": 0.11242015613910575, "grad_norm": 2.751948118209839, "learning_rate": 0.0001948916484135188, "loss": 1.2353, "step": 99 }, { "epoch": 0.11355571327182398, "grad_norm": 3.041262149810791, "learning_rate": 0.0001947772244767809, "loss": 1.3385, "step": 100 }, { "epoch": 0.11469127040454223, "grad_norm": 2.5620269775390625, "learning_rate": 0.00019466156752904343, "loss": 2.2988, "step": 101 }, { "epoch": 0.11582682753726047, "grad_norm": 2.771423816680908, "learning_rate": 0.00019454467907495363, "loss": 2.2533, "step": 102 }, { "epoch": 0.11696238466997871, "grad_norm": 2.5785555839538574, "learning_rate": 0.00019442656063518014, "loss": 2.2171, "step": 103 }, { "epoch": 0.11809794180269695, "grad_norm": 2.232318878173828, "learning_rate": 0.00019430721374639312, "loss": 2.0285, "step": 104 }, { "epoch": 0.11923349893541518, "grad_norm": 1.5291625261306763, "learning_rate": 0.00019418663996124443, "loss": 1.8496, "step": 105 }, { "epoch": 0.12036905606813343, "grad_norm": 0.9760420918464661, "learning_rate": 0.00019406484084834712, "loss": 1.8083, "step": 106 }, { "epoch": 0.12150461320085167, "grad_norm": 0.8692153692245483, "learning_rate": 0.0001939418179922554, "loss": 1.9626, "step": 107 }, { "epoch": 0.12264017033356991, "grad_norm": 0.8772637248039246, "learning_rate": 0.00019381757299344386, "loss": 1.8553, "step": 108 }, { "epoch": 0.12377572746628815, "grad_norm": 0.8720763325691223, "learning_rate": 0.00019369210746828658, "loss": 1.8905, "step": 109 }, { "epoch": 0.12491128459900638, "grad_norm": 0.8130568861961365, "learning_rate": 0.00019356542304903613, "loss": 1.9592, "step": 110 }, { "epoch": 0.12604684173172462, "grad_norm": 0.7974451780319214, "learning_rate": 0.00019343752138380245, "loss": 1.811, "step": 111 }, { "epoch": 0.12718239886444288, "grad_norm": 0.7639180421829224, "learning_rate": 0.00019330840413653131, "loss": 1.8301, "step": 112 }, { "epoch": 0.12831795599716112, "grad_norm": 0.7468190789222717, "learning_rate": 0.00019317807298698257, "loss": 1.7492, "step": 113 }, { "epoch": 0.12945351312987935, "grad_norm": 0.823840320110321, "learning_rate": 0.0001930465296307087, "loss": 1.8019, "step": 114 }, { "epoch": 0.13058907026259758, "grad_norm": 0.7746431827545166, "learning_rate": 0.00019291377577903212, "loss": 1.7231, "step": 115 }, { "epoch": 0.13172462739531582, "grad_norm": 0.8240138292312622, "learning_rate": 0.00019277981315902348, "loss": 1.8743, "step": 116 }, { "epoch": 0.13286018452803405, "grad_norm": 0.8646796941757202, "learning_rate": 0.00019264464351347903, "loss": 1.7905, "step": 117 }, { "epoch": 0.13399574166075232, "grad_norm": 0.8185977935791016, "learning_rate": 0.0001925082686008978, "loss": 1.7196, "step": 118 }, { "epoch": 0.13513129879347055, "grad_norm": 0.8447887301445007, "learning_rate": 0.0001923706901954588, "loss": 1.7066, "step": 119 }, { "epoch": 0.13626685592618878, "grad_norm": 0.8474506139755249, "learning_rate": 0.00019223191008699807, "loss": 1.7164, "step": 120 }, { "epoch": 0.13740241305890702, "grad_norm": 0.8193461298942566, "learning_rate": 0.0001920919300809852, "loss": 1.7494, "step": 121 }, { "epoch": 0.13853797019162525, "grad_norm": 0.8521745800971985, "learning_rate": 0.0001919507519985, "loss": 1.696, "step": 122 }, { "epoch": 0.13967352732434352, "grad_norm": 0.8591006994247437, "learning_rate": 0.00019180837767620868, "loss": 1.8376, "step": 123 }, { "epoch": 0.14080908445706175, "grad_norm": 0.9227604866027832, "learning_rate": 0.00019166480896634012, "loss": 1.776, "step": 124 }, { "epoch": 0.14194464158977999, "grad_norm": 0.8740448951721191, "learning_rate": 0.0001915200477366615, "loss": 1.6703, "step": 125 }, { "epoch": 0.14308019872249822, "grad_norm": 0.9638112187385559, "learning_rate": 0.00019137409587045433, "loss": 1.7562, "step": 126 }, { "epoch": 0.14421575585521645, "grad_norm": 0.9088642597198486, "learning_rate": 0.00019122695526648968, "loss": 1.6485, "step": 127 }, { "epoch": 0.14535131298793472, "grad_norm": 0.9763453006744385, "learning_rate": 0.00019107862783900368, "loss": 1.829, "step": 128 }, { "epoch": 0.14648687012065295, "grad_norm": 0.9610134959220886, "learning_rate": 0.00019092911551767247, "loss": 1.6198, "step": 129 }, { "epoch": 0.14762242725337119, "grad_norm": 0.945428729057312, "learning_rate": 0.00019077842024758717, "loss": 1.6419, "step": 130 }, { "epoch": 0.14875798438608942, "grad_norm": 1.0024683475494385, "learning_rate": 0.00019062654398922853, "loss": 1.7152, "step": 131 }, { "epoch": 0.14989354151880765, "grad_norm": 0.9661603569984436, "learning_rate": 0.00019047348871844145, "loss": 1.4878, "step": 132 }, { "epoch": 0.15102909865152592, "grad_norm": 1.0560873746871948, "learning_rate": 0.00019031925642640926, "loss": 1.7405, "step": 133 }, { "epoch": 0.15216465578424415, "grad_norm": 1.171895146369934, "learning_rate": 0.000190163849119628, "loss": 1.473, "step": 134 }, { "epoch": 0.1533002129169624, "grad_norm": 1.0595684051513672, "learning_rate": 0.00019000726881987986, "loss": 1.5166, "step": 135 }, { "epoch": 0.15443577004968062, "grad_norm": 1.1354924440383911, "learning_rate": 0.00018984951756420738, "loss": 1.7034, "step": 136 }, { "epoch": 0.15557132718239886, "grad_norm": 1.1120511293411255, "learning_rate": 0.0001896905974048867, "loss": 1.4843, "step": 137 }, { "epoch": 0.15670688431511712, "grad_norm": 1.2007062435150146, "learning_rate": 0.00018953051040940076, "loss": 1.6443, "step": 138 }, { "epoch": 0.15784244144783535, "grad_norm": 1.2116113901138306, "learning_rate": 0.00018936925866041277, "loss": 1.5864, "step": 139 }, { "epoch": 0.1589779985805536, "grad_norm": 1.1508610248565674, "learning_rate": 0.00018920684425573865, "loss": 1.4276, "step": 140 }, { "epoch": 0.16011355571327182, "grad_norm": 1.384462594985962, "learning_rate": 0.00018904326930832015, "loss": 1.6125, "step": 141 }, { "epoch": 0.16124911284599006, "grad_norm": 1.2576823234558105, "learning_rate": 0.00018887853594619705, "loss": 1.4617, "step": 142 }, { "epoch": 0.1623846699787083, "grad_norm": 1.3541723489761353, "learning_rate": 0.00018871264631247972, "loss": 1.568, "step": 143 }, { "epoch": 0.16352022711142655, "grad_norm": 1.3448771238327026, "learning_rate": 0.000188545602565321, "loss": 1.4548, "step": 144 }, { "epoch": 0.1646557842441448, "grad_norm": 1.5346524715423584, "learning_rate": 0.00018837740687788839, "loss": 1.6223, "step": 145 }, { "epoch": 0.16579134137686302, "grad_norm": 1.6131750345230103, "learning_rate": 0.00018820806143833555, "loss": 1.2846, "step": 146 }, { "epoch": 0.16692689850958126, "grad_norm": 1.8512752056121826, "learning_rate": 0.00018803756844977394, "loss": 1.368, "step": 147 }, { "epoch": 0.1680624556422995, "grad_norm": 1.8721482753753662, "learning_rate": 0.00018786593013024412, "loss": 1.2694, "step": 148 }, { "epoch": 0.16919801277501775, "grad_norm": 2.085376024246216, "learning_rate": 0.000187693148712687, "loss": 1.5063, "step": 149 }, { "epoch": 0.170333569907736, "grad_norm": 3.708118200302124, "learning_rate": 0.00018751922644491456, "loss": 1.5481, "step": 150 }, { "epoch": 0.17146912704045422, "grad_norm": 2.91347074508667, "learning_rate": 0.0001873441655895809, "loss": 2.1324, "step": 151 }, { "epoch": 0.17260468417317246, "grad_norm": 2.712827682495117, "learning_rate": 0.00018716796842415258, "loss": 2.1752, "step": 152 }, { "epoch": 0.1737402413058907, "grad_norm": 2.6362178325653076, "learning_rate": 0.00018699063724087904, "loss": 2.0361, "step": 153 }, { "epoch": 0.17487579843860895, "grad_norm": 2.171754837036133, "learning_rate": 0.00018681217434676287, "loss": 2.1279, "step": 154 }, { "epoch": 0.1760113555713272, "grad_norm": 1.6765910387039185, "learning_rate": 0.0001866325820635297, "loss": 1.9891, "step": 155 }, { "epoch": 0.17714691270404542, "grad_norm": 1.2349283695220947, "learning_rate": 0.00018645186272759801, "loss": 1.9325, "step": 156 }, { "epoch": 0.17828246983676366, "grad_norm": 0.797235906124115, "learning_rate": 0.0001862700186900488, "loss": 1.8639, "step": 157 }, { "epoch": 0.1794180269694819, "grad_norm": 0.7924672961235046, "learning_rate": 0.00018608705231659486, "loss": 1.8004, "step": 158 }, { "epoch": 0.18055358410220015, "grad_norm": 0.8479894995689392, "learning_rate": 0.00018590296598755028, "loss": 1.8386, "step": 159 }, { "epoch": 0.1816891412349184, "grad_norm": 1.099936604499817, "learning_rate": 0.00018571776209779905, "loss": 1.809, "step": 160 }, { "epoch": 0.18282469836763662, "grad_norm": 0.8509452939033508, "learning_rate": 0.00018553144305676433, "loss": 1.7385, "step": 161 }, { "epoch": 0.18396025550035486, "grad_norm": 0.8845868706703186, "learning_rate": 0.00018534401128837683, "loss": 1.7694, "step": 162 }, { "epoch": 0.1850958126330731, "grad_norm": 0.8222683072090149, "learning_rate": 0.00018515546923104337, "loss": 1.7522, "step": 163 }, { "epoch": 0.18623136976579135, "grad_norm": 0.829135000705719, "learning_rate": 0.00018496581933761516, "loss": 1.7144, "step": 164 }, { "epoch": 0.1873669268985096, "grad_norm": 0.7826790809631348, "learning_rate": 0.00018477506407535595, "loss": 1.757, "step": 165 }, { "epoch": 0.18850248403122782, "grad_norm": 0.9551543593406677, "learning_rate": 0.00018458320592590975, "loss": 1.7341, "step": 166 }, { "epoch": 0.18963804116394606, "grad_norm": 0.8297865390777588, "learning_rate": 0.00018439024738526873, "loss": 1.7778, "step": 167 }, { "epoch": 0.1907735982966643, "grad_norm": 0.8169491291046143, "learning_rate": 0.00018419619096374067, "loss": 1.6124, "step": 168 }, { "epoch": 0.19190915542938253, "grad_norm": 0.8767200112342834, "learning_rate": 0.0001840010391859163, "loss": 1.8143, "step": 169 }, { "epoch": 0.1930447125621008, "grad_norm": 1.0078586339950562, "learning_rate": 0.00018380479459063643, "loss": 1.8453, "step": 170 }, { "epoch": 0.19418026969481902, "grad_norm": 0.8688703775405884, "learning_rate": 0.00018360745973095902, "loss": 1.7546, "step": 171 }, { "epoch": 0.19531582682753726, "grad_norm": 0.9157019257545471, "learning_rate": 0.00018340903717412588, "loss": 1.737, "step": 172 }, { "epoch": 0.1964513839602555, "grad_norm": 0.8588647842407227, "learning_rate": 0.00018320952950152928, "loss": 1.6619, "step": 173 }, { "epoch": 0.19758694109297373, "grad_norm": 0.8594261407852173, "learning_rate": 0.00018300893930867838, "loss": 1.6129, "step": 174 }, { "epoch": 0.198722498225692, "grad_norm": 0.9212916493415833, "learning_rate": 0.00018280726920516546, "loss": 1.7386, "step": 175 }, { "epoch": 0.19985805535841022, "grad_norm": 0.840570867061615, "learning_rate": 0.00018260452181463198, "loss": 1.6946, "step": 176 }, { "epoch": 0.20099361249112846, "grad_norm": 0.8730524182319641, "learning_rate": 0.00018240069977473446, "loss": 1.6175, "step": 177 }, { "epoch": 0.2021291696238467, "grad_norm": 0.9078536629676819, "learning_rate": 0.00018219580573711016, "loss": 1.606, "step": 178 }, { "epoch": 0.20326472675656493, "grad_norm": 0.8870209455490112, "learning_rate": 0.00018198984236734246, "loss": 1.6247, "step": 179 }, { "epoch": 0.2044002838892832, "grad_norm": 0.9717417359352112, "learning_rate": 0.00018178281234492648, "loss": 1.6589, "step": 180 }, { "epoch": 0.20553584102200143, "grad_norm": 1.006575584411621, "learning_rate": 0.00018157471836323383, "loss": 1.5534, "step": 181 }, { "epoch": 0.20667139815471966, "grad_norm": 1.0138202905654907, "learning_rate": 0.0001813655631294779, "loss": 1.6029, "step": 182 }, { "epoch": 0.2078069552874379, "grad_norm": 1.012413501739502, "learning_rate": 0.00018115534936467853, "loss": 1.5323, "step": 183 }, { "epoch": 0.20894251242015613, "grad_norm": 1.1269084215164185, "learning_rate": 0.0001809440798036265, "loss": 1.5624, "step": 184 }, { "epoch": 0.2100780695528744, "grad_norm": 1.2197819948196411, "learning_rate": 0.0001807317571948481, "loss": 1.5275, "step": 185 }, { "epoch": 0.21121362668559263, "grad_norm": 1.1670926809310913, "learning_rate": 0.00018051838430056935, "loss": 1.7604, "step": 186 }, { "epoch": 0.21234918381831086, "grad_norm": 1.0778840780258179, "learning_rate": 0.00018030396389667996, "loss": 1.5289, "step": 187 }, { "epoch": 0.2134847409510291, "grad_norm": 1.1497727632522583, "learning_rate": 0.0001800884987726973, "loss": 1.5267, "step": 188 }, { "epoch": 0.21462029808374733, "grad_norm": 1.1532498598098755, "learning_rate": 0.00017987199173173023, "loss": 1.5173, "step": 189 }, { "epoch": 0.2157558552164656, "grad_norm": 1.1306942701339722, "learning_rate": 0.00017965444559044227, "loss": 1.4376, "step": 190 }, { "epoch": 0.21689141234918383, "grad_norm": 1.2913910150527954, "learning_rate": 0.0001794358631790154, "loss": 1.5409, "step": 191 }, { "epoch": 0.21802696948190206, "grad_norm": 1.2509875297546387, "learning_rate": 0.00017921624734111292, "loss": 1.4417, "step": 192 }, { "epoch": 0.2191625266146203, "grad_norm": 1.3401572704315186, "learning_rate": 0.00017899560093384263, "loss": 1.6188, "step": 193 }, { "epoch": 0.22029808374733853, "grad_norm": 1.4731441736221313, "learning_rate": 0.00017877392682771954, "loss": 1.4995, "step": 194 }, { "epoch": 0.22143364088005676, "grad_norm": 1.5188558101654053, "learning_rate": 0.00017855122790662854, "loss": 1.3288, "step": 195 }, { "epoch": 0.22256919801277503, "grad_norm": 1.6928372383117676, "learning_rate": 0.00017832750706778708, "loss": 1.5821, "step": 196 }, { "epoch": 0.22370475514549326, "grad_norm": 1.6448569297790527, "learning_rate": 0.00017810276722170714, "loss": 1.6366, "step": 197 }, { "epoch": 0.2248403122782115, "grad_norm": 1.9883676767349243, "learning_rate": 0.00017787701129215767, "loss": 1.2394, "step": 198 }, { "epoch": 0.22597586941092973, "grad_norm": 1.9611278772354126, "learning_rate": 0.0001776502422161264, "loss": 1.3085, "step": 199 }, { "epoch": 0.22711142654364797, "grad_norm": 3.206226110458374, "learning_rate": 0.00017742246294378159, "loss": 1.4067, "step": 200 }, { "epoch": 0.22824698367636623, "grad_norm": 2.30232572555542, "learning_rate": 0.00017719367643843388, "loss": 2.1487, "step": 201 }, { "epoch": 0.22938254080908446, "grad_norm": 2.645108699798584, "learning_rate": 0.00017696388567649737, "loss": 2.1252, "step": 202 }, { "epoch": 0.2305180979418027, "grad_norm": 2.317121744155884, "learning_rate": 0.0001767330936474513, "loss": 2.1035, "step": 203 }, { "epoch": 0.23165365507452093, "grad_norm": 2.165541172027588, "learning_rate": 0.00017650130335380084, "loss": 2.0607, "step": 204 }, { "epoch": 0.23278921220723917, "grad_norm": 1.4709444046020508, "learning_rate": 0.0001762685178110382, "loss": 1.9345, "step": 205 }, { "epoch": 0.23392476933995743, "grad_norm": 1.1143975257873535, "learning_rate": 0.00017603474004760337, "loss": 1.879, "step": 206 }, { "epoch": 0.23506032647267566, "grad_norm": 0.7976076602935791, "learning_rate": 0.00017579997310484467, "loss": 1.8644, "step": 207 }, { "epoch": 0.2361958836053939, "grad_norm": 0.913081705570221, "learning_rate": 0.00017556422003697919, "loss": 1.8568, "step": 208 }, { "epoch": 0.23733144073811213, "grad_norm": 1.060461163520813, "learning_rate": 0.0001753274839110532, "loss": 1.8508, "step": 209 }, { "epoch": 0.23846699787083037, "grad_norm": 0.9146474003791809, "learning_rate": 0.00017508976780690203, "loss": 1.6968, "step": 210 }, { "epoch": 0.23960255500354863, "grad_norm": 0.9160935282707214, "learning_rate": 0.00017485107481711012, "loss": 1.8966, "step": 211 }, { "epoch": 0.24073811213626686, "grad_norm": 0.8769086003303528, "learning_rate": 0.0001746114080469708, "loss": 1.7037, "step": 212 }, { "epoch": 0.2418736692689851, "grad_norm": 0.8443089127540588, "learning_rate": 0.00017437077061444587, "loss": 1.8236, "step": 213 }, { "epoch": 0.24300922640170333, "grad_norm": 0.8325538039207458, "learning_rate": 0.000174129165650125, "loss": 1.7885, "step": 214 }, { "epoch": 0.24414478353442157, "grad_norm": 0.9702067375183105, "learning_rate": 0.000173886596297185, "loss": 1.7064, "step": 215 }, { "epoch": 0.24528034066713983, "grad_norm": 0.8921521902084351, "learning_rate": 0.00017364306571134909, "loss": 1.7587, "step": 216 }, { "epoch": 0.24641589779985806, "grad_norm": 0.8152370452880859, "learning_rate": 0.0001733985770608456, "loss": 1.7147, "step": 217 }, { "epoch": 0.2475514549325763, "grad_norm": 0.8478974103927612, "learning_rate": 0.0001731531335263669, "loss": 1.6202, "step": 218 }, { "epoch": 0.24868701206529453, "grad_norm": 0.9875072240829468, "learning_rate": 0.00017290673830102802, "loss": 1.6259, "step": 219 }, { "epoch": 0.24982256919801277, "grad_norm": 0.9687901735305786, "learning_rate": 0.00017265939459032504, "loss": 1.6182, "step": 220 }, { "epoch": 0.250958126330731, "grad_norm": 0.9654157757759094, "learning_rate": 0.00017241110561209342, "loss": 1.7102, "step": 221 }, { "epoch": 0.250958126330731, "eval_loss": 1.6393167972564697, "eval_runtime": 96.2386, "eval_samples_per_second": 15.42, "eval_steps_per_second": 7.71, "step": 221 }, { "epoch": 0.25209368346344924, "grad_norm": 0.8487362861633301, "learning_rate": 0.0001721618745964662, "loss": 1.6403, "step": 222 }, { "epoch": 0.25322924059616747, "grad_norm": 0.8898485898971558, "learning_rate": 0.00017191170478583187, "loss": 1.7195, "step": 223 }, { "epoch": 0.25436479772888576, "grad_norm": 0.8826645016670227, "learning_rate": 0.0001716605994347923, "loss": 1.7085, "step": 224 }, { "epoch": 0.255500354861604, "grad_norm": 0.853583812713623, "learning_rate": 0.00017140856181012027, "loss": 1.6543, "step": 225 }, { "epoch": 0.25663591199432223, "grad_norm": 0.9461856484413147, "learning_rate": 0.00017115559519071704, "loss": 1.6308, "step": 226 }, { "epoch": 0.25777146912704046, "grad_norm": 1.0077241659164429, "learning_rate": 0.00017090170286756978, "loss": 1.7629, "step": 227 }, { "epoch": 0.2589070262597587, "grad_norm": 1.057852864265442, "learning_rate": 0.00017064688814370858, "loss": 1.5952, "step": 228 }, { "epoch": 0.26004258339247693, "grad_norm": 0.9642679691314697, "learning_rate": 0.0001703911543341636, "loss": 1.5616, "step": 229 }, { "epoch": 0.26117814052519517, "grad_norm": 1.08012855052948, "learning_rate": 0.00017013450476592189, "loss": 1.4565, "step": 230 }, { "epoch": 0.2623136976579134, "grad_norm": 1.0963120460510254, "learning_rate": 0.00016987694277788417, "loss": 1.5922, "step": 231 }, { "epoch": 0.26344925479063164, "grad_norm": 1.0885945558547974, "learning_rate": 0.00016961847172082135, "loss": 1.5705, "step": 232 }, { "epoch": 0.26458481192334987, "grad_norm": 1.0257972478866577, "learning_rate": 0.00016935909495733082, "loss": 1.5959, "step": 233 }, { "epoch": 0.2657203690560681, "grad_norm": 1.203218698501587, "learning_rate": 0.00016909881586179304, "loss": 1.7249, "step": 234 }, { "epoch": 0.2668559261887864, "grad_norm": 1.0138198137283325, "learning_rate": 0.00016883763782032718, "loss": 1.6038, "step": 235 }, { "epoch": 0.26799148332150463, "grad_norm": 1.0444432497024536, "learning_rate": 0.00016857556423074748, "loss": 1.5545, "step": 236 }, { "epoch": 0.26912704045422287, "grad_norm": 1.138569712638855, "learning_rate": 0.00016831259850251886, "loss": 1.5977, "step": 237 }, { "epoch": 0.2702625975869411, "grad_norm": 1.132225751876831, "learning_rate": 0.0001680487440567125, "loss": 1.4896, "step": 238 }, { "epoch": 0.27139815471965933, "grad_norm": 1.187804937362671, "learning_rate": 0.00016778400432596144, "loss": 1.5197, "step": 239 }, { "epoch": 0.27253371185237757, "grad_norm": 1.340867519378662, "learning_rate": 0.00016751838275441596, "loss": 1.6065, "step": 240 }, { "epoch": 0.2736692689850958, "grad_norm": 1.3593790531158447, "learning_rate": 0.00016725188279769865, "loss": 1.7144, "step": 241 }, { "epoch": 0.27480482611781404, "grad_norm": 1.3996741771697998, "learning_rate": 0.00016698450792285952, "loss": 1.3954, "step": 242 }, { "epoch": 0.2759403832505323, "grad_norm": 1.3668125867843628, "learning_rate": 0.00016671626160833089, "loss": 1.4498, "step": 243 }, { "epoch": 0.2770759403832505, "grad_norm": 1.3429515361785889, "learning_rate": 0.00016644714734388217, "loss": 1.3722, "step": 244 }, { "epoch": 0.2782114975159688, "grad_norm": 1.4415324926376343, "learning_rate": 0.00016617716863057442, "loss": 1.3191, "step": 245 }, { "epoch": 0.27934705464868703, "grad_norm": 1.7592180967330933, "learning_rate": 0.00016590632898071475, "loss": 1.5684, "step": 246 }, { "epoch": 0.28048261178140527, "grad_norm": 1.8982957601547241, "learning_rate": 0.00016563463191781073, "loss": 1.3426, "step": 247 }, { "epoch": 0.2816181689141235, "grad_norm": 2.085282325744629, "learning_rate": 0.00016536208097652448, "loss": 1.4058, "step": 248 }, { "epoch": 0.28275372604684174, "grad_norm": 2.003387451171875, "learning_rate": 0.0001650886797026268, "loss": 1.0225, "step": 249 }, { "epoch": 0.28388928317955997, "grad_norm": 3.3409759998321533, "learning_rate": 0.00016481443165295085, "loss": 1.5822, "step": 250 }, { "epoch": 0.2850248403122782, "grad_norm": 2.1829633712768555, "learning_rate": 0.00016453934039534598, "loss": 2.0991, "step": 251 }, { "epoch": 0.28616039744499644, "grad_norm": 2.5729780197143555, "learning_rate": 0.00016426340950863137, "loss": 2.0998, "step": 252 }, { "epoch": 0.2872959545777147, "grad_norm": 2.771898031234741, "learning_rate": 0.0001639866425825494, "loss": 2.0906, "step": 253 }, { "epoch": 0.2884315117104329, "grad_norm": 2.163909673690796, "learning_rate": 0.00016370904321771893, "loss": 2.0904, "step": 254 }, { "epoch": 0.2895670688431512, "grad_norm": 2.14174747467041, "learning_rate": 0.00016343061502558856, "loss": 1.9492, "step": 255 }, { "epoch": 0.29070262597586943, "grad_norm": 1.6324646472930908, "learning_rate": 0.00016315136162838948, "loss": 1.8837, "step": 256 }, { "epoch": 0.29183818310858767, "grad_norm": 1.149991512298584, "learning_rate": 0.0001628712866590885, "loss": 1.9216, "step": 257 }, { "epoch": 0.2929737402413059, "grad_norm": 0.8889797925949097, "learning_rate": 0.00016259039376134075, "loss": 1.7783, "step": 258 }, { "epoch": 0.29410929737402414, "grad_norm": 0.787470817565918, "learning_rate": 0.00016230868658944223, "loss": 1.8317, "step": 259 }, { "epoch": 0.29524485450674237, "grad_norm": 0.9624422192573547, "learning_rate": 0.0001620261688082823, "loss": 1.8802, "step": 260 }, { "epoch": 0.2963804116394606, "grad_norm": 0.9467479586601257, "learning_rate": 0.00016174284409329598, "loss": 1.8947, "step": 261 }, { "epoch": 0.29751596877217884, "grad_norm": 0.9351758360862732, "learning_rate": 0.00016145871613041618, "loss": 1.7864, "step": 262 }, { "epoch": 0.2986515259048971, "grad_norm": 0.9442856907844543, "learning_rate": 0.00016117378861602575, "loss": 1.7433, "step": 263 }, { "epoch": 0.2997870830376153, "grad_norm": 1.0957460403442383, "learning_rate": 0.00016088806525690926, "loss": 1.82, "step": 264 }, { "epoch": 0.30092264017033354, "grad_norm": 0.8206662535667419, "learning_rate": 0.00016060154977020502, "loss": 1.6584, "step": 265 }, { "epoch": 0.30205819730305183, "grad_norm": 0.8952071070671082, "learning_rate": 0.0001603142458833564, "loss": 1.8192, "step": 266 }, { "epoch": 0.30319375443577007, "grad_norm": 0.8559849858283997, "learning_rate": 0.0001600261573340637, "loss": 1.7204, "step": 267 }, { "epoch": 0.3043293115684883, "grad_norm": 0.9078370332717896, "learning_rate": 0.0001597372878702352, "loss": 1.7797, "step": 268 }, { "epoch": 0.30546486870120654, "grad_norm": 0.9059626460075378, "learning_rate": 0.00015944764124993868, "loss": 1.6208, "step": 269 }, { "epoch": 0.3066004258339248, "grad_norm": 0.8782923221588135, "learning_rate": 0.00015915722124135227, "loss": 1.7273, "step": 270 }, { "epoch": 0.307735982966643, "grad_norm": 0.8429134488105774, "learning_rate": 0.00015886603162271556, "loss": 1.6711, "step": 271 }, { "epoch": 0.30887154009936124, "grad_norm": 0.9312360882759094, "learning_rate": 0.00015857407618228054, "loss": 1.5218, "step": 272 }, { "epoch": 0.3100070972320795, "grad_norm": 0.9803429841995239, "learning_rate": 0.0001582813587182621, "loss": 1.8327, "step": 273 }, { "epoch": 0.3111426543647977, "grad_norm": 0.8917859196662903, "learning_rate": 0.00015798788303878882, "loss": 1.6414, "step": 274 }, { "epoch": 0.31227821149751595, "grad_norm": 0.9032409191131592, "learning_rate": 0.00015769365296185324, "loss": 1.6687, "step": 275 }, { "epoch": 0.31341376863023424, "grad_norm": 0.9209126234054565, "learning_rate": 0.00015739867231526233, "loss": 1.6228, "step": 276 }, { "epoch": 0.31454932576295247, "grad_norm": 0.9243400692939758, "learning_rate": 0.00015710294493658772, "loss": 1.571, "step": 277 }, { "epoch": 0.3156848828956707, "grad_norm": 0.9028908014297485, "learning_rate": 0.00015680647467311557, "loss": 1.6215, "step": 278 }, { "epoch": 0.31682044002838894, "grad_norm": 0.9458784461021423, "learning_rate": 0.00015650926538179682, "loss": 1.6727, "step": 279 }, { "epoch": 0.3179559971611072, "grad_norm": 0.919954240322113, "learning_rate": 0.00015621132092919667, "loss": 1.5476, "step": 280 }, { "epoch": 0.3190915542938254, "grad_norm": 0.9464284777641296, "learning_rate": 0.0001559126451914445, "loss": 1.5635, "step": 281 }, { "epoch": 0.32022711142654364, "grad_norm": 1.0951217412948608, "learning_rate": 0.00015561324205418353, "loss": 1.5593, "step": 282 }, { "epoch": 0.3213626685592619, "grad_norm": 1.0827003717422485, "learning_rate": 0.00015531311541251995, "loss": 1.6299, "step": 283 }, { "epoch": 0.3224982256919801, "grad_norm": 1.2430078983306885, "learning_rate": 0.00015501226917097257, "loss": 1.7239, "step": 284 }, { "epoch": 0.32363378282469835, "grad_norm": 1.058668851852417, "learning_rate": 0.00015471070724342177, "loss": 1.5565, "step": 285 }, { "epoch": 0.3247693399574166, "grad_norm": 1.1004656553268433, "learning_rate": 0.0001544084335530588, "loss": 1.5175, "step": 286 }, { "epoch": 0.32590489709013487, "grad_norm": 1.0904589891433716, "learning_rate": 0.0001541054520323346, "loss": 1.5758, "step": 287 }, { "epoch": 0.3270404542228531, "grad_norm": 1.2309776544570923, "learning_rate": 0.00015380176662290867, "loss": 1.6688, "step": 288 }, { "epoch": 0.32817601135557134, "grad_norm": 1.291077733039856, "learning_rate": 0.00015349738127559782, "loss": 1.7157, "step": 289 }, { "epoch": 0.3293115684882896, "grad_norm": 1.2712247371673584, "learning_rate": 0.00015319229995032464, "loss": 1.5695, "step": 290 }, { "epoch": 0.3304471256210078, "grad_norm": 1.2047388553619385, "learning_rate": 0.00015288652661606632, "loss": 1.4479, "step": 291 }, { "epoch": 0.33158268275372604, "grad_norm": 1.3235585689544678, "learning_rate": 0.00015258006525080257, "loss": 1.4158, "step": 292 }, { "epoch": 0.3327182398864443, "grad_norm": 1.615641474723816, "learning_rate": 0.0001522729198414642, "loss": 1.4148, "step": 293 }, { "epoch": 0.3338537970191625, "grad_norm": 1.3897641897201538, "learning_rate": 0.0001519650943838812, "loss": 1.4888, "step": 294 }, { "epoch": 0.33498935415188075, "grad_norm": 1.5793813467025757, "learning_rate": 0.0001516565928827305, "loss": 1.5037, "step": 295 }, { "epoch": 0.336124911284599, "grad_norm": 1.6975367069244385, "learning_rate": 0.0001513474193514842, "loss": 1.3918, "step": 296 }, { "epoch": 0.33726046841731727, "grad_norm": 1.8271501064300537, "learning_rate": 0.00015103757781235726, "loss": 1.1916, "step": 297 }, { "epoch": 0.3383960255500355, "grad_norm": 2.1957499980926514, "learning_rate": 0.00015072707229625505, "loss": 1.4045, "step": 298 }, { "epoch": 0.33953158268275374, "grad_norm": 1.904187798500061, "learning_rate": 0.000150415906842721, "loss": 1.1695, "step": 299 }, { "epoch": 0.340667139815472, "grad_norm": 3.4624063968658447, "learning_rate": 0.00015010408549988408, "loss": 1.635, "step": 300 }, { "epoch": 0.3418026969481902, "grad_norm": 2.2678442001342773, "learning_rate": 0.00014979161232440613, "loss": 2.1402, "step": 301 }, { "epoch": 0.34293825408090844, "grad_norm": 2.74515438079834, "learning_rate": 0.00014947849138142894, "loss": 2.0712, "step": 302 }, { "epoch": 0.3440738112136267, "grad_norm": 2.777723789215088, "learning_rate": 0.0001491647267445216, "loss": 2.0276, "step": 303 }, { "epoch": 0.3452093683463449, "grad_norm": 2.4363863468170166, "learning_rate": 0.0001488503224956273, "loss": 2.0666, "step": 304 }, { "epoch": 0.34634492547906315, "grad_norm": 2.151244878768921, "learning_rate": 0.00014853528272501033, "loss": 1.9815, "step": 305 }, { "epoch": 0.3474804826117814, "grad_norm": 1.8853939771652222, "learning_rate": 0.00014821961153120287, "loss": 1.8531, "step": 306 }, { "epoch": 0.3486160397444997, "grad_norm": 1.4859002828598022, "learning_rate": 0.0001479033130209516, "loss": 1.8803, "step": 307 }, { "epoch": 0.3497515968772179, "grad_norm": 0.96648108959198, "learning_rate": 0.00014758639130916436, "loss": 1.7317, "step": 308 }, { "epoch": 0.35088715400993614, "grad_norm": 0.7754946351051331, "learning_rate": 0.00014726885051885653, "loss": 1.7622, "step": 309 }, { "epoch": 0.3520227111426544, "grad_norm": 0.7142930626869202, "learning_rate": 0.0001469506947810975, "loss": 1.8139, "step": 310 }, { "epoch": 0.3531582682753726, "grad_norm": 0.7822295427322388, "learning_rate": 0.0001466319282349568, "loss": 1.7408, "step": 311 }, { "epoch": 0.35429382540809085, "grad_norm": 0.8773436546325684, "learning_rate": 0.00014631255502745036, "loss": 1.7626, "step": 312 }, { "epoch": 0.3554293825408091, "grad_norm": 0.784964382648468, "learning_rate": 0.00014599257931348643, "loss": 1.7404, "step": 313 }, { "epoch": 0.3565649396735273, "grad_norm": 0.7716342210769653, "learning_rate": 0.00014567200525581175, "loss": 1.6769, "step": 314 }, { "epoch": 0.35770049680624555, "grad_norm": 0.8515545725822449, "learning_rate": 0.00014535083702495714, "loss": 1.784, "step": 315 }, { "epoch": 0.3588360539389638, "grad_norm": 0.8977735042572021, "learning_rate": 0.0001450290787991834, "loss": 1.787, "step": 316 }, { "epoch": 0.359971611071682, "grad_norm": 0.7918087840080261, "learning_rate": 0.0001447067347644269, "loss": 1.6984, "step": 317 }, { "epoch": 0.3611071682044003, "grad_norm": 0.809508740901947, "learning_rate": 0.00014438380911424516, "loss": 1.6884, "step": 318 }, { "epoch": 0.36224272533711854, "grad_norm": 0.9517730474472046, "learning_rate": 0.00014406030604976222, "loss": 1.7269, "step": 319 }, { "epoch": 0.3633782824698368, "grad_norm": 0.8419122099876404, "learning_rate": 0.00014373622977961409, "loss": 1.72, "step": 320 }, { "epoch": 0.364513839602555, "grad_norm": 0.9169619679450989, "learning_rate": 0.00014341158451989386, "loss": 1.6697, "step": 321 }, { "epoch": 0.36564939673527325, "grad_norm": 0.9704701900482178, "learning_rate": 0.00014308637449409706, "loss": 1.7641, "step": 322 }, { "epoch": 0.3667849538679915, "grad_norm": 0.9810813069343567, "learning_rate": 0.0001427606039330664, "loss": 1.7248, "step": 323 }, { "epoch": 0.3679205110007097, "grad_norm": 0.883621096611023, "learning_rate": 0.0001424342770749371, "loss": 1.5315, "step": 324 }, { "epoch": 0.36905606813342795, "grad_norm": 0.8742486834526062, "learning_rate": 0.00014210739816508143, "loss": 1.4986, "step": 325 }, { "epoch": 0.3701916252661462, "grad_norm": 0.8883923888206482, "learning_rate": 0.00014177997145605367, "loss": 1.6456, "step": 326 }, { "epoch": 0.3713271823988644, "grad_norm": 1.0700079202651978, "learning_rate": 0.00014145200120753478, "loss": 1.7295, "step": 327 }, { "epoch": 0.3724627395315827, "grad_norm": 0.9269877672195435, "learning_rate": 0.00014112349168627684, "loss": 1.5967, "step": 328 }, { "epoch": 0.37359829666430094, "grad_norm": 0.9069693684577942, "learning_rate": 0.0001407944471660477, "loss": 1.6169, "step": 329 }, { "epoch": 0.3747338537970192, "grad_norm": 0.9755806922912598, "learning_rate": 0.00014046487192757528, "loss": 1.7285, "step": 330 }, { "epoch": 0.3758694109297374, "grad_norm": 0.9685303568840027, "learning_rate": 0.00014013477025849195, "loss": 1.5697, "step": 331 }, { "epoch": 0.37700496806245565, "grad_norm": 1.1164592504501343, "learning_rate": 0.0001398041464532787, "loss": 1.6492, "step": 332 }, { "epoch": 0.3781405251951739, "grad_norm": 1.0505226850509644, "learning_rate": 0.00013947300481320925, "loss": 1.5639, "step": 333 }, { "epoch": 0.3792760823278921, "grad_norm": 1.0028319358825684, "learning_rate": 0.00013914134964629427, "loss": 1.5184, "step": 334 }, { "epoch": 0.38041163946061035, "grad_norm": 1.0429682731628418, "learning_rate": 0.00013880918526722497, "loss": 1.5205, "step": 335 }, { "epoch": 0.3815471965933286, "grad_norm": 1.0813076496124268, "learning_rate": 0.0001384765159973174, "loss": 1.5584, "step": 336 }, { "epoch": 0.3826827537260468, "grad_norm": 1.0658332109451294, "learning_rate": 0.0001381433461644559, "loss": 1.5486, "step": 337 }, { "epoch": 0.38381831085876505, "grad_norm": 1.0721393823623657, "learning_rate": 0.00013780968010303695, "loss": 1.5291, "step": 338 }, { "epoch": 0.38495386799148335, "grad_norm": 1.1089237928390503, "learning_rate": 0.0001374755221539128, "loss": 1.3919, "step": 339 }, { "epoch": 0.3860894251242016, "grad_norm": 1.2363439798355103, "learning_rate": 0.00013714087666433483, "loss": 1.4679, "step": 340 }, { "epoch": 0.3872249822569198, "grad_norm": 1.308176875114441, "learning_rate": 0.00013680574798789722, "loss": 1.4796, "step": 341 }, { "epoch": 0.38836053938963805, "grad_norm": 1.2448899745941162, "learning_rate": 0.0001364701404844802, "loss": 1.4368, "step": 342 }, { "epoch": 0.3894960965223563, "grad_norm": 1.3303086757659912, "learning_rate": 0.00013613405852019324, "loss": 1.5321, "step": 343 }, { "epoch": 0.3906316536550745, "grad_norm": 1.4923968315124512, "learning_rate": 0.00013579750646731847, "loss": 1.4892, "step": 344 }, { "epoch": 0.39176721078779275, "grad_norm": 1.658272624015808, "learning_rate": 0.00013546048870425356, "loss": 1.2655, "step": 345 }, { "epoch": 0.392902767920511, "grad_norm": 1.8732876777648926, "learning_rate": 0.00013512300961545496, "loss": 1.5204, "step": 346 }, { "epoch": 0.3940383250532292, "grad_norm": 1.644767165184021, "learning_rate": 0.00013478507359138066, "loss": 1.3117, "step": 347 }, { "epoch": 0.39517388218594746, "grad_norm": 1.6184022426605225, "learning_rate": 0.0001344466850284333, "loss": 1.1859, "step": 348 }, { "epoch": 0.39630943931866575, "grad_norm": 1.8763984441757202, "learning_rate": 0.00013410784832890277, "loss": 1.1496, "step": 349 }, { "epoch": 0.397444996451384, "grad_norm": 2.7677438259124756, "learning_rate": 0.0001337685679009091, "loss": 1.3846, "step": 350 }, { "epoch": 0.3985805535841022, "grad_norm": 2.461939811706543, "learning_rate": 0.00013342884815834493, "loss": 2.054, "step": 351 }, { "epoch": 0.39971611071682045, "grad_norm": 2.7815966606140137, "learning_rate": 0.00013308869352081834, "loss": 2.0869, "step": 352 }, { "epoch": 0.4008516678495387, "grad_norm": 2.448955535888672, "learning_rate": 0.00013274810841359502, "loss": 2.0141, "step": 353 }, { "epoch": 0.4019872249822569, "grad_norm": 2.450200319290161, "learning_rate": 0.0001324070972675411, "loss": 2.04, "step": 354 }, { "epoch": 0.40312278211497515, "grad_norm": 2.0058579444885254, "learning_rate": 0.00013206566451906508, "loss": 1.913, "step": 355 }, { "epoch": 0.4042583392476934, "grad_norm": 1.8514522314071655, "learning_rate": 0.00013172381461006057, "loss": 1.8977, "step": 356 }, { "epoch": 0.4053938963804116, "grad_norm": 1.5928471088409424, "learning_rate": 0.00013138155198784795, "loss": 1.8516, "step": 357 }, { "epoch": 0.40652945351312986, "grad_norm": 1.1709288358688354, "learning_rate": 0.0001310388811051171, "loss": 1.8025, "step": 358 }, { "epoch": 0.4076650106458481, "grad_norm": 0.8530488610267639, "learning_rate": 0.00013069580641986907, "loss": 1.8334, "step": 359 }, { "epoch": 0.4088005677785664, "grad_norm": 0.7583498358726501, "learning_rate": 0.00013035233239535817, "loss": 1.7438, "step": 360 }, { "epoch": 0.4099361249112846, "grad_norm": 0.7676871418952942, "learning_rate": 0.0001300084635000341, "loss": 1.7262, "step": 361 }, { "epoch": 0.41107168204400285, "grad_norm": 0.8364686965942383, "learning_rate": 0.00012966420420748345, "loss": 1.7077, "step": 362 }, { "epoch": 0.4122072391767211, "grad_norm": 0.7960500717163086, "learning_rate": 0.00012931955899637181, "loss": 1.6733, "step": 363 }, { "epoch": 0.4133427963094393, "grad_norm": 0.8668186068534851, "learning_rate": 0.00012897453235038552, "loss": 1.6998, "step": 364 }, { "epoch": 0.41447835344215755, "grad_norm": 0.8693703413009644, "learning_rate": 0.00012862912875817305, "loss": 1.8032, "step": 365 }, { "epoch": 0.4156139105748758, "grad_norm": 0.8475162386894226, "learning_rate": 0.00012828335271328678, "loss": 1.7583, "step": 366 }, { "epoch": 0.416749467707594, "grad_norm": 0.7888119220733643, "learning_rate": 0.00012793720871412468, "loss": 1.7107, "step": 367 }, { "epoch": 0.41788502484031226, "grad_norm": 0.8132604360580444, "learning_rate": 0.00012759070126387156, "loss": 1.6368, "step": 368 }, { "epoch": 0.4190205819730305, "grad_norm": 0.8452677130699158, "learning_rate": 0.00012724383487044055, "loss": 1.6928, "step": 369 }, { "epoch": 0.4201561391057488, "grad_norm": 0.8761222958564758, "learning_rate": 0.00012689661404641456, "loss": 1.6092, "step": 370 }, { "epoch": 0.421291696238467, "grad_norm": 0.942638635635376, "learning_rate": 0.00012654904330898742, "loss": 1.7605, "step": 371 }, { "epoch": 0.42242725337118525, "grad_norm": 0.8831734657287598, "learning_rate": 0.0001262011271799051, "loss": 1.5777, "step": 372 }, { "epoch": 0.4235628105039035, "grad_norm": 0.8579721450805664, "learning_rate": 0.0001258528701854072, "loss": 1.7232, "step": 373 }, { "epoch": 0.4246983676366217, "grad_norm": 0.8862204551696777, "learning_rate": 0.00012550427685616765, "loss": 1.6367, "step": 374 }, { "epoch": 0.42583392476933996, "grad_norm": 0.8844640254974365, "learning_rate": 0.000125155351727236, "loss": 1.6864, "step": 375 }, { "epoch": 0.4269694819020582, "grad_norm": 0.9562939405441284, "learning_rate": 0.00012480609933797837, "loss": 1.6092, "step": 376 }, { "epoch": 0.4281050390347764, "grad_norm": 0.845948338508606, "learning_rate": 0.00012445652423201845, "loss": 1.5562, "step": 377 }, { "epoch": 0.42924059616749466, "grad_norm": 0.9931953549385071, "learning_rate": 0.00012410663095717818, "loss": 1.6564, "step": 378 }, { "epoch": 0.4303761533002129, "grad_norm": 0.9111021757125854, "learning_rate": 0.00012375642406541894, "loss": 1.6171, "step": 379 }, { "epoch": 0.4315117104329312, "grad_norm": 0.881890594959259, "learning_rate": 0.00012340590811278198, "loss": 1.5152, "step": 380 }, { "epoch": 0.4326472675656494, "grad_norm": 0.9561471939086914, "learning_rate": 0.0001230550876593294, "loss": 1.4315, "step": 381 }, { "epoch": 0.43378282469836765, "grad_norm": 0.9323940873146057, "learning_rate": 0.00012270396726908467, "loss": 1.5231, "step": 382 }, { "epoch": 0.4349183818310859, "grad_norm": 1.0821974277496338, "learning_rate": 0.00012235255150997327, "loss": 1.4884, "step": 383 }, { "epoch": 0.4360539389638041, "grad_norm": 1.0289589166641235, "learning_rate": 0.00012200084495376341, "loss": 1.5337, "step": 384 }, { "epoch": 0.43718949609652236, "grad_norm": 1.0949689149856567, "learning_rate": 0.00012164885217600637, "loss": 1.5994, "step": 385 }, { "epoch": 0.4383250532292406, "grad_norm": 1.0089561939239502, "learning_rate": 0.00012129657775597705, "loss": 1.405, "step": 386 }, { "epoch": 0.4394606103619588, "grad_norm": 1.1390259265899658, "learning_rate": 0.00012094402627661447, "loss": 1.4774, "step": 387 }, { "epoch": 0.44059616749467706, "grad_norm": 1.15105140209198, "learning_rate": 0.00012059120232446191, "loss": 1.5611, "step": 388 }, { "epoch": 0.4417317246273953, "grad_norm": 1.27581787109375, "learning_rate": 0.00012023811048960763, "loss": 1.4808, "step": 389 }, { "epoch": 0.44286728176011353, "grad_norm": 1.2632039785385132, "learning_rate": 0.00011988475536562471, "loss": 1.4691, "step": 390 }, { "epoch": 0.4440028388928318, "grad_norm": 1.3682019710540771, "learning_rate": 0.00011953114154951166, "loss": 1.3677, "step": 391 }, { "epoch": 0.44513839602555005, "grad_norm": 1.4879424571990967, "learning_rate": 0.00011917727364163238, "loss": 1.3736, "step": 392 }, { "epoch": 0.4462739531582683, "grad_norm": 1.3507964611053467, "learning_rate": 0.00011882315624565645, "loss": 1.2677, "step": 393 }, { "epoch": 0.4474095102909865, "grad_norm": 1.3754775524139404, "learning_rate": 0.00011846879396849917, "loss": 1.417, "step": 394 }, { "epoch": 0.44854506742370476, "grad_norm": 1.5406546592712402, "learning_rate": 0.00011811419142026156, "loss": 1.4834, "step": 395 }, { "epoch": 0.449680624556423, "grad_norm": 1.5301264524459839, "learning_rate": 0.00011775935321417062, "loss": 1.3427, "step": 396 }, { "epoch": 0.4508161816891412, "grad_norm": 2.052644729614258, "learning_rate": 0.00011740428396651896, "loss": 1.3465, "step": 397 }, { "epoch": 0.45195173882185946, "grad_norm": 1.7317488193511963, "learning_rate": 0.00011704898829660518, "loss": 1.2183, "step": 398 }, { "epoch": 0.4530872959545777, "grad_norm": 1.990835189819336, "learning_rate": 0.00011669347082667332, "loss": 1.2874, "step": 399 }, { "epoch": 0.45422285308729593, "grad_norm": 2.816270112991333, "learning_rate": 0.00011633773618185302, "loss": 1.2727, "step": 400 }, { "epoch": 0.4553584102200142, "grad_norm": 1.8334050178527832, "learning_rate": 0.00011598178899009933, "loss": 2.0153, "step": 401 }, { "epoch": 0.45649396735273245, "grad_norm": 1.991929292678833, "learning_rate": 0.00011562563388213236, "loss": 2.0246, "step": 402 }, { "epoch": 0.4576295244854507, "grad_norm": 2.2617461681365967, "learning_rate": 0.00011526927549137716, "loss": 2.0317, "step": 403 }, { "epoch": 0.4587650816181689, "grad_norm": 2.2942545413970947, "learning_rate": 0.00011491271845390345, "loss": 2.0002, "step": 404 }, { "epoch": 0.45990063875088716, "grad_norm": 1.8838305473327637, "learning_rate": 0.00011455596740836512, "loss": 1.9227, "step": 405 }, { "epoch": 0.4610361958836054, "grad_norm": 1.6770646572113037, "learning_rate": 0.00011419902699594016, "loss": 1.836, "step": 406 }, { "epoch": 0.4621717530163236, "grad_norm": 1.3939802646636963, "learning_rate": 0.00011384190186027007, "loss": 1.8571, "step": 407 }, { "epoch": 0.46330731014904186, "grad_norm": 1.1426496505737305, "learning_rate": 0.00011348459664739956, "loss": 1.8387, "step": 408 }, { "epoch": 0.4644428672817601, "grad_norm": 0.977384090423584, "learning_rate": 0.00011312711600571604, "loss": 1.726, "step": 409 }, { "epoch": 0.46557842441447833, "grad_norm": 0.8394194841384888, "learning_rate": 0.00011276946458588917, "loss": 1.8086, "step": 410 }, { "epoch": 0.46671398154719657, "grad_norm": 0.71717369556427, "learning_rate": 0.00011241164704081038, "loss": 1.7832, "step": 411 }, { "epoch": 0.46784953867991486, "grad_norm": 0.7380895614624023, "learning_rate": 0.0001120536680255323, "loss": 1.7722, "step": 412 }, { "epoch": 0.4689850958126331, "grad_norm": 0.7180382609367371, "learning_rate": 0.00011169553219720828, "loss": 1.6595, "step": 413 }, { "epoch": 0.4701206529453513, "grad_norm": 0.8007554411888123, "learning_rate": 0.00011133724421503157, "loss": 1.7529, "step": 414 }, { "epoch": 0.47125621007806956, "grad_norm": 0.782702624797821, "learning_rate": 0.0001109788087401751, "loss": 1.6894, "step": 415 }, { "epoch": 0.4723917672107878, "grad_norm": 0.7977834939956665, "learning_rate": 0.00011062023043573047, "loss": 1.7462, "step": 416 }, { "epoch": 0.47352732434350603, "grad_norm": 0.796654999256134, "learning_rate": 0.00011026151396664747, "loss": 1.7016, "step": 417 }, { "epoch": 0.47466288147622426, "grad_norm": 0.8725181818008423, "learning_rate": 0.00010990266399967337, "loss": 1.8023, "step": 418 }, { "epoch": 0.4757984386089425, "grad_norm": 0.7822924852371216, "learning_rate": 0.00010954368520329217, "loss": 1.6328, "step": 419 }, { "epoch": 0.47693399574166073, "grad_norm": 0.8786959052085876, "learning_rate": 0.00010918458224766386, "loss": 1.7135, "step": 420 }, { "epoch": 0.47806955287437897, "grad_norm": 0.8763858675956726, "learning_rate": 0.00010882535980456376, "loss": 1.6146, "step": 421 }, { "epoch": 0.47920511000709726, "grad_norm": 0.8849068880081177, "learning_rate": 0.00010846602254732158, "loss": 1.6452, "step": 422 }, { "epoch": 0.4803406671398155, "grad_norm": 0.9125162959098816, "learning_rate": 0.00010810657515076086, "loss": 1.8305, "step": 423 }, { "epoch": 0.4814762242725337, "grad_norm": 0.8595629930496216, "learning_rate": 0.0001077470222911378, "loss": 1.5491, "step": 424 }, { "epoch": 0.48261178140525196, "grad_norm": 0.8861264586448669, "learning_rate": 0.00010738736864608079, "loss": 1.6585, "step": 425 }, { "epoch": 0.4837473385379702, "grad_norm": 0.874168872833252, "learning_rate": 0.0001070276188945293, "loss": 1.5823, "step": 426 }, { "epoch": 0.48488289567068843, "grad_norm": 0.9319614768028259, "learning_rate": 0.00010666777771667321, "loss": 1.6219, "step": 427 }, { "epoch": 0.48601845280340666, "grad_norm": 0.9134637713432312, "learning_rate": 0.00010630784979389168, "loss": 1.5249, "step": 428 }, { "epoch": 0.4871540099361249, "grad_norm": 0.9535539746284485, "learning_rate": 0.00010594783980869254, "loss": 1.6339, "step": 429 }, { "epoch": 0.48828956706884313, "grad_norm": 0.9368936419487, "learning_rate": 0.00010558775244465102, "loss": 1.6538, "step": 430 }, { "epoch": 0.48942512420156137, "grad_norm": 1.0619072914123535, "learning_rate": 0.00010522759238634927, "loss": 1.5829, "step": 431 }, { "epoch": 0.49056068133427966, "grad_norm": 0.9958395957946777, "learning_rate": 0.0001048673643193149, "loss": 1.5641, "step": 432 }, { "epoch": 0.4916962384669979, "grad_norm": 0.9518537521362305, "learning_rate": 0.00010450707292996047, "loss": 1.5755, "step": 433 }, { "epoch": 0.4928317955997161, "grad_norm": 0.9991930723190308, "learning_rate": 0.00010414672290552223, "loss": 1.4884, "step": 434 }, { "epoch": 0.49396735273243436, "grad_norm": 1.0080232620239258, "learning_rate": 0.00010378631893399933, "loss": 1.4681, "step": 435 }, { "epoch": 0.4951029098651526, "grad_norm": 0.9743078947067261, "learning_rate": 0.00010342586570409267, "loss": 1.303, "step": 436 }, { "epoch": 0.49623846699787083, "grad_norm": 1.055387258529663, "learning_rate": 0.00010306536790514406, "loss": 1.5096, "step": 437 }, { "epoch": 0.49737402413058907, "grad_norm": 1.0754774808883667, "learning_rate": 0.00010270483022707506, "loss": 1.3681, "step": 438 }, { "epoch": 0.4985095812633073, "grad_norm": 1.2458550930023193, "learning_rate": 0.00010234425736032607, "loss": 1.5154, "step": 439 }, { "epoch": 0.49964513839602553, "grad_norm": 1.2487785816192627, "learning_rate": 0.00010198365399579528, "loss": 1.5169, "step": 440 }, { "epoch": 0.5007806955287438, "grad_norm": 1.2705410718917847, "learning_rate": 0.00010162302482477764, "loss": 1.58, "step": 441 }, { "epoch": 0.501916252661462, "grad_norm": 1.4562819004058838, "learning_rate": 0.00010126237453890386, "loss": 1.283, "step": 442 }, { "epoch": 0.501916252661462, "eval_loss": 1.5736442804336548, "eval_runtime": 96.5053, "eval_samples_per_second": 15.377, "eval_steps_per_second": 7.689, "step": 442 }, { "epoch": 0.5030518097941803, "grad_norm": 1.3757225275039673, "learning_rate": 0.0001009017078300793, "loss": 1.2419, "step": 443 }, { "epoch": 0.5041873669268985, "grad_norm": 1.326486587524414, "learning_rate": 0.00010054102939042302, "loss": 1.1369, "step": 444 }, { "epoch": 0.5053229240596168, "grad_norm": 1.4598129987716675, "learning_rate": 0.00010018034391220663, "loss": 1.329, "step": 445 }, { "epoch": 0.5064584811923349, "grad_norm": 1.5953763723373413, "learning_rate": 9.981965608779337e-05, "loss": 1.2733, "step": 446 }, { "epoch": 0.5075940383250532, "grad_norm": 1.8005733489990234, "learning_rate": 9.9458970609577e-05, "loss": 1.281, "step": 447 }, { "epoch": 0.5087295954577715, "grad_norm": 2.0217745304107666, "learning_rate": 9.909829216992071e-05, "loss": 1.2111, "step": 448 }, { "epoch": 0.5098651525904897, "grad_norm": 2.5032589435577393, "learning_rate": 9.873762546109616e-05, "loss": 1.1343, "step": 449 }, { "epoch": 0.511000709723208, "grad_norm": 2.9878461360931396, "learning_rate": 9.83769751752224e-05, "loss": 1.0221, "step": 450 }, { "epoch": 0.5121362668559262, "grad_norm": 1.831922173500061, "learning_rate": 9.801634600420476e-05, "loss": 1.9931, "step": 451 }, { "epoch": 0.5132718239886445, "grad_norm": 2.110313892364502, "learning_rate": 9.765574263967396e-05, "loss": 2.0033, "step": 452 }, { "epoch": 0.5144073811213626, "grad_norm": 2.393563747406006, "learning_rate": 9.729516977292496e-05, "loss": 2.0316, "step": 453 }, { "epoch": 0.5155429382540809, "grad_norm": 1.8739205598831177, "learning_rate": 9.693463209485597e-05, "loss": 1.9417, "step": 454 }, { "epoch": 0.5166784953867991, "grad_norm": 1.9411948919296265, "learning_rate": 9.657413429590735e-05, "loss": 1.9489, "step": 455 }, { "epoch": 0.5178140525195174, "grad_norm": 1.8975965976715088, "learning_rate": 9.621368106600067e-05, "loss": 1.9216, "step": 456 }, { "epoch": 0.5189496096522356, "grad_norm": 1.3992000818252563, "learning_rate": 9.58532770944778e-05, "loss": 1.794, "step": 457 }, { "epoch": 0.5200851667849539, "grad_norm": 1.4171100854873657, "learning_rate": 9.549292707003956e-05, "loss": 1.8496, "step": 458 }, { "epoch": 0.5212207239176722, "grad_norm": 1.1530625820159912, "learning_rate": 9.513263568068512e-05, "loss": 1.7799, "step": 459 }, { "epoch": 0.5223562810503903, "grad_norm": 0.9610021114349365, "learning_rate": 9.477240761365078e-05, "loss": 1.7475, "step": 460 }, { "epoch": 0.5234918381831086, "grad_norm": 0.8117537498474121, "learning_rate": 9.441224755534896e-05, "loss": 1.7753, "step": 461 }, { "epoch": 0.5246273953158268, "grad_norm": 0.8100078105926514, "learning_rate": 9.40521601913075e-05, "loss": 1.7304, "step": 462 }, { "epoch": 0.5257629524485451, "grad_norm": 0.8257502317428589, "learning_rate": 9.369215020610834e-05, "loss": 1.6131, "step": 463 }, { "epoch": 0.5268985095812633, "grad_norm": 0.7481709122657776, "learning_rate": 9.333222228332683e-05, "loss": 1.7016, "step": 464 }, { "epoch": 0.5280340667139816, "grad_norm": 0.8736494183540344, "learning_rate": 9.297238110547074e-05, "loss": 1.6181, "step": 465 }, { "epoch": 0.5291696238466997, "grad_norm": 0.8339768648147583, "learning_rate": 9.261263135391922e-05, "loss": 1.7262, "step": 466 }, { "epoch": 0.530305180979418, "grad_norm": 0.8191514015197754, "learning_rate": 9.225297770886222e-05, "loss": 1.6448, "step": 467 }, { "epoch": 0.5314407381121362, "grad_norm": 0.9273790717124939, "learning_rate": 9.189342484923916e-05, "loss": 1.6485, "step": 468 }, { "epoch": 0.5325762952448545, "grad_norm": 0.9956740736961365, "learning_rate": 9.153397745267843e-05, "loss": 1.7134, "step": 469 }, { "epoch": 0.5337118523775728, "grad_norm": 0.8902850151062012, "learning_rate": 9.117464019543627e-05, "loss": 1.6908, "step": 470 }, { "epoch": 0.534847409510291, "grad_norm": 0.8927252888679504, "learning_rate": 9.081541775233615e-05, "loss": 1.606, "step": 471 }, { "epoch": 0.5359829666430093, "grad_norm": 0.8390812873840332, "learning_rate": 9.045631479670784e-05, "loss": 1.5736, "step": 472 }, { "epoch": 0.5371185237757274, "grad_norm": 0.9048410654067993, "learning_rate": 9.009733600032666e-05, "loss": 1.5682, "step": 473 }, { "epoch": 0.5382540809084457, "grad_norm": 0.9552938342094421, "learning_rate": 8.973848603335255e-05, "loss": 1.576, "step": 474 }, { "epoch": 0.5393896380411639, "grad_norm": 0.9013083577156067, "learning_rate": 8.937976956426958e-05, "loss": 1.4805, "step": 475 }, { "epoch": 0.5405251951738822, "grad_norm": 1.0171785354614258, "learning_rate": 8.902119125982493e-05, "loss": 1.5938, "step": 476 }, { "epoch": 0.5416607523066004, "grad_norm": 0.905933678150177, "learning_rate": 8.866275578496845e-05, "loss": 1.5432, "step": 477 }, { "epoch": 0.5427963094393187, "grad_norm": 0.9796912670135498, "learning_rate": 8.830446780279176e-05, "loss": 1.5388, "step": 478 }, { "epoch": 0.543931866572037, "grad_norm": 0.9591326117515564, "learning_rate": 8.79463319744677e-05, "loss": 1.5276, "step": 479 }, { "epoch": 0.5450674237047551, "grad_norm": 0.9813888669013977, "learning_rate": 8.758835295918963e-05, "loss": 1.623, "step": 480 }, { "epoch": 0.5462029808374734, "grad_norm": 1.0359368324279785, "learning_rate": 8.723053541411082e-05, "loss": 1.6362, "step": 481 }, { "epoch": 0.5473385379701916, "grad_norm": 0.9819924235343933, "learning_rate": 8.687288399428397e-05, "loss": 1.4353, "step": 482 }, { "epoch": 0.5484740951029099, "grad_norm": 1.1317660808563232, "learning_rate": 8.651540335260045e-05, "loss": 1.5687, "step": 483 }, { "epoch": 0.5496096522356281, "grad_norm": 1.1081722974777222, "learning_rate": 8.615809813972996e-05, "loss": 1.535, "step": 484 }, { "epoch": 0.5507452093683464, "grad_norm": 1.2360421419143677, "learning_rate": 8.580097300405988e-05, "loss": 1.5313, "step": 485 }, { "epoch": 0.5518807665010645, "grad_norm": 1.216064214706421, "learning_rate": 8.54440325916349e-05, "loss": 1.619, "step": 486 }, { "epoch": 0.5530163236337828, "grad_norm": 1.1995718479156494, "learning_rate": 8.508728154609657e-05, "loss": 1.4961, "step": 487 }, { "epoch": 0.554151880766501, "grad_norm": 1.11781907081604, "learning_rate": 8.473072450862285e-05, "loss": 1.2622, "step": 488 }, { "epoch": 0.5552874378992193, "grad_norm": 1.2923529148101807, "learning_rate": 8.437436611786766e-05, "loss": 1.4596, "step": 489 }, { "epoch": 0.5564229950319376, "grad_norm": 1.4287463426589966, "learning_rate": 8.401821100990072e-05, "loss": 1.5107, "step": 490 }, { "epoch": 0.5575585521646558, "grad_norm": 1.4087309837341309, "learning_rate": 8.366226381814697e-05, "loss": 1.4153, "step": 491 }, { "epoch": 0.5586941092973741, "grad_norm": 1.2707946300506592, "learning_rate": 8.33065291733267e-05, "loss": 1.2804, "step": 492 }, { "epoch": 0.5598296664300922, "grad_norm": 1.3588768243789673, "learning_rate": 8.295101170339483e-05, "loss": 1.2626, "step": 493 }, { "epoch": 0.5609652235628105, "grad_norm": 1.6474740505218506, "learning_rate": 8.259571603348105e-05, "loss": 1.3552, "step": 494 }, { "epoch": 0.5621007806955287, "grad_norm": 1.5614418983459473, "learning_rate": 8.224064678582943e-05, "loss": 1.2707, "step": 495 }, { "epoch": 0.563236337828247, "grad_norm": 1.6826685667037964, "learning_rate": 8.188580857973844e-05, "loss": 1.1855, "step": 496 }, { "epoch": 0.5643718949609652, "grad_norm": 1.6482815742492676, "learning_rate": 8.153120603150084e-05, "loss": 1.3258, "step": 497 }, { "epoch": 0.5655074520936835, "grad_norm": 1.920937180519104, "learning_rate": 8.117684375434357e-05, "loss": 1.1549, "step": 498 }, { "epoch": 0.5666430092264017, "grad_norm": 2.2403769493103027, "learning_rate": 8.082272635836762e-05, "loss": 1.1151, "step": 499 }, { "epoch": 0.5677785663591199, "grad_norm": 2.9545199871063232, "learning_rate": 8.046885845048838e-05, "loss": 1.3199, "step": 500 }, { "epoch": 0.5689141234918382, "grad_norm": 1.4136751890182495, "learning_rate": 8.011524463437531e-05, "loss": 1.9373, "step": 501 }, { "epoch": 0.5700496806245564, "grad_norm": 1.4899134635925293, "learning_rate": 7.97618895103924e-05, "loss": 1.9605, "step": 502 }, { "epoch": 0.5711852377572747, "grad_norm": 1.5981686115264893, "learning_rate": 7.94087976755381e-05, "loss": 1.9186, "step": 503 }, { "epoch": 0.5723207948899929, "grad_norm": 1.6354345083236694, "learning_rate": 7.905597372338558e-05, "loss": 2.0033, "step": 504 }, { "epoch": 0.5734563520227112, "grad_norm": 1.3642030954360962, "learning_rate": 7.870342224402296e-05, "loss": 1.9011, "step": 505 }, { "epoch": 0.5745919091554293, "grad_norm": 1.3249659538269043, "learning_rate": 7.835114782399364e-05, "loss": 1.8952, "step": 506 }, { "epoch": 0.5757274662881476, "grad_norm": 1.1663950681686401, "learning_rate": 7.799915504623662e-05, "loss": 1.8814, "step": 507 }, { "epoch": 0.5768630234208658, "grad_norm": 1.052673101425171, "learning_rate": 7.764744849002676e-05, "loss": 1.7164, "step": 508 }, { "epoch": 0.5779985805535841, "grad_norm": 0.8932778239250183, "learning_rate": 7.729603273091539e-05, "loss": 1.7725, "step": 509 }, { "epoch": 0.5791341376863024, "grad_norm": 0.8836004137992859, "learning_rate": 7.694491234067064e-05, "loss": 1.7482, "step": 510 }, { "epoch": 0.5802696948190206, "grad_norm": 0.8482527732849121, "learning_rate": 7.659409188721803e-05, "loss": 1.7439, "step": 511 }, { "epoch": 0.5814052519517389, "grad_norm": 0.7700875401496887, "learning_rate": 7.624357593458107e-05, "loss": 1.7351, "step": 512 }, { "epoch": 0.582540809084457, "grad_norm": 0.7748705744743347, "learning_rate": 7.589336904282184e-05, "loss": 1.7902, "step": 513 }, { "epoch": 0.5836763662171753, "grad_norm": 0.7450481653213501, "learning_rate": 7.55434757679816e-05, "loss": 1.6265, "step": 514 }, { "epoch": 0.5848119233498935, "grad_norm": 0.7539263963699341, "learning_rate": 7.519390066202166e-05, "loss": 1.621, "step": 515 }, { "epoch": 0.5859474804826118, "grad_norm": 0.779279351234436, "learning_rate": 7.484464827276399e-05, "loss": 1.7378, "step": 516 }, { "epoch": 0.58708303761533, "grad_norm": 0.7889571785926819, "learning_rate": 7.449572314383237e-05, "loss": 1.7031, "step": 517 }, { "epoch": 0.5882185947480483, "grad_norm": 0.8830326199531555, "learning_rate": 7.414712981459283e-05, "loss": 1.7598, "step": 518 }, { "epoch": 0.5893541518807665, "grad_norm": 0.8402740359306335, "learning_rate": 7.379887282009493e-05, "loss": 1.7154, "step": 519 }, { "epoch": 0.5904897090134847, "grad_norm": 0.7766242027282715, "learning_rate": 7.345095669101265e-05, "loss": 1.6047, "step": 520 }, { "epoch": 0.591625266146203, "grad_norm": 0.8842155337333679, "learning_rate": 7.310338595358545e-05, "loss": 1.6337, "step": 521 }, { "epoch": 0.5927608232789212, "grad_norm": 0.85732501745224, "learning_rate": 7.275616512955945e-05, "loss": 1.5705, "step": 522 }, { "epoch": 0.5938963804116395, "grad_norm": 0.8720919489860535, "learning_rate": 7.240929873612846e-05, "loss": 1.5906, "step": 523 }, { "epoch": 0.5950319375443577, "grad_norm": 0.9106302261352539, "learning_rate": 7.206279128587533e-05, "loss": 1.6032, "step": 524 }, { "epoch": 0.596167494677076, "grad_norm": 0.8752391934394836, "learning_rate": 7.171664728671326e-05, "loss": 1.5726, "step": 525 }, { "epoch": 0.5973030518097941, "grad_norm": 0.9204638600349426, "learning_rate": 7.1370871241827e-05, "loss": 1.6504, "step": 526 }, { "epoch": 0.5984386089425124, "grad_norm": 0.9984481930732727, "learning_rate": 7.102546764961449e-05, "loss": 1.6671, "step": 527 }, { "epoch": 0.5995741660752306, "grad_norm": 0.9043843150138855, "learning_rate": 7.06804410036282e-05, "loss": 1.4822, "step": 528 }, { "epoch": 0.6007097232079489, "grad_norm": 0.9745160341262817, "learning_rate": 7.033579579251658e-05, "loss": 1.596, "step": 529 }, { "epoch": 0.6018452803406671, "grad_norm": 0.9366100430488586, "learning_rate": 6.999153649996595e-05, "loss": 1.4066, "step": 530 }, { "epoch": 0.6029808374733854, "grad_norm": 0.9908466935157776, "learning_rate": 6.964766760464181e-05, "loss": 1.4882, "step": 531 }, { "epoch": 0.6041163946061037, "grad_norm": 0.9972131848335266, "learning_rate": 6.930419358013096e-05, "loss": 1.6304, "step": 532 }, { "epoch": 0.6052519517388218, "grad_norm": 1.079521656036377, "learning_rate": 6.896111889488293e-05, "loss": 1.4995, "step": 533 }, { "epoch": 0.6063875088715401, "grad_norm": 1.0491868257522583, "learning_rate": 6.861844801215209e-05, "loss": 1.5608, "step": 534 }, { "epoch": 0.6075230660042583, "grad_norm": 1.098819375038147, "learning_rate": 6.827618538993949e-05, "loss": 1.4619, "step": 535 }, { "epoch": 0.6086586231369766, "grad_norm": 1.061082124710083, "learning_rate": 6.79343354809349e-05, "loss": 1.5109, "step": 536 }, { "epoch": 0.6097941802696948, "grad_norm": 1.1403700113296509, "learning_rate": 6.759290273245892e-05, "loss": 1.4963, "step": 537 }, { "epoch": 0.6109297374024131, "grad_norm": 1.2848145961761475, "learning_rate": 6.7251891586405e-05, "loss": 1.5661, "step": 538 }, { "epoch": 0.6120652945351313, "grad_norm": 1.3563086986541748, "learning_rate": 6.691130647918171e-05, "loss": 1.6244, "step": 539 }, { "epoch": 0.6132008516678495, "grad_norm": 1.208108901977539, "learning_rate": 6.65711518416551e-05, "loss": 1.3565, "step": 540 }, { "epoch": 0.6143364088005677, "grad_norm": 1.2401301860809326, "learning_rate": 6.623143209909093e-05, "loss": 1.3824, "step": 541 }, { "epoch": 0.615471965933286, "grad_norm": 1.4032788276672363, "learning_rate": 6.589215167109723e-05, "loss": 1.4654, "step": 542 }, { "epoch": 0.6166075230660043, "grad_norm": 1.5298595428466797, "learning_rate": 6.555331497156672e-05, "loss": 1.5538, "step": 543 }, { "epoch": 0.6177430801987225, "grad_norm": 1.3163362741470337, "learning_rate": 6.521492640861938e-05, "loss": 1.2895, "step": 544 }, { "epoch": 0.6188786373314408, "grad_norm": 1.4514544010162354, "learning_rate": 6.487699038454508e-05, "loss": 1.2433, "step": 545 }, { "epoch": 0.620014194464159, "grad_norm": 1.58772611618042, "learning_rate": 6.453951129574644e-05, "loss": 1.2367, "step": 546 }, { "epoch": 0.6211497515968772, "grad_norm": 1.6421825885772705, "learning_rate": 6.420249353268155e-05, "loss": 1.0906, "step": 547 }, { "epoch": 0.6222853087295954, "grad_norm": 2.017019033432007, "learning_rate": 6.386594147980678e-05, "loss": 1.2488, "step": 548 }, { "epoch": 0.6234208658623137, "grad_norm": 2.2878637313842773, "learning_rate": 6.352985951551983e-05, "loss": 1.1657, "step": 549 }, { "epoch": 0.6245564229950319, "grad_norm": 3.262157917022705, "learning_rate": 6.319425201210281e-05, "loss": 1.0475, "step": 550 }, { "epoch": 0.6256919801277502, "grad_norm": 0.9350942969322205, "learning_rate": 6.28591233356652e-05, "loss": 1.935, "step": 551 }, { "epoch": 0.6268275372604685, "grad_norm": 1.1437113285064697, "learning_rate": 6.252447784608724e-05, "loss": 1.9821, "step": 552 }, { "epoch": 0.6279630943931866, "grad_norm": 1.300336480140686, "learning_rate": 6.219031989696307e-05, "loss": 1.8768, "step": 553 }, { "epoch": 0.6290986515259049, "grad_norm": 1.3001312017440796, "learning_rate": 6.185665383554414e-05, "loss": 1.8795, "step": 554 }, { "epoch": 0.6302342086586231, "grad_norm": 1.242996096611023, "learning_rate": 6.152348400268259e-05, "loss": 1.9121, "step": 555 }, { "epoch": 0.6313697657913414, "grad_norm": 1.0114587545394897, "learning_rate": 6.119081473277501e-05, "loss": 1.7948, "step": 556 }, { "epoch": 0.6325053229240596, "grad_norm": 1.0586174726486206, "learning_rate": 6.085865035370577e-05, "loss": 1.8737, "step": 557 }, { "epoch": 0.6336408800567779, "grad_norm": 0.9715588092803955, "learning_rate": 6.0526995186790746e-05, "loss": 1.8728, "step": 558 }, { "epoch": 0.6347764371894961, "grad_norm": 0.8724594712257385, "learning_rate": 6.019585354672135e-05, "loss": 1.7771, "step": 559 }, { "epoch": 0.6359119943222143, "grad_norm": 0.8585055470466614, "learning_rate": 5.9865229741508075e-05, "loss": 1.6996, "step": 560 }, { "epoch": 0.6370475514549325, "grad_norm": 0.8612693548202515, "learning_rate": 5.953512807242474e-05, "loss": 1.7471, "step": 561 }, { "epoch": 0.6381831085876508, "grad_norm": 0.7751456499099731, "learning_rate": 5.9205552833952316e-05, "loss": 1.6653, "step": 562 }, { "epoch": 0.6393186657203691, "grad_norm": 0.7923934459686279, "learning_rate": 5.8876508313723175e-05, "loss": 1.7556, "step": 563 }, { "epoch": 0.6404542228530873, "grad_norm": 0.7949087619781494, "learning_rate": 5.854799879246524e-05, "loss": 1.5966, "step": 564 }, { "epoch": 0.6415897799858056, "grad_norm": 0.7540833950042725, "learning_rate": 5.822002854394633e-05, "loss": 1.6536, "step": 565 }, { "epoch": 0.6427253371185238, "grad_norm": 0.7790471315383911, "learning_rate": 5.789260183491857e-05, "loss": 1.6033, "step": 566 }, { "epoch": 0.643860894251242, "grad_norm": 0.8414646983146667, "learning_rate": 5.756572292506293e-05, "loss": 1.7645, "step": 567 }, { "epoch": 0.6449964513839602, "grad_norm": 0.7919362783432007, "learning_rate": 5.723939606693362e-05, "loss": 1.5453, "step": 568 }, { "epoch": 0.6461320085166785, "grad_norm": 0.8462831377983093, "learning_rate": 5.691362550590297e-05, "loss": 1.6551, "step": 569 }, { "epoch": 0.6472675656493967, "grad_norm": 0.9093044400215149, "learning_rate": 5.658841548010612e-05, "loss": 1.6398, "step": 570 }, { "epoch": 0.648403122782115, "grad_norm": 0.83465576171875, "learning_rate": 5.626377022038592e-05, "loss": 1.5589, "step": 571 }, { "epoch": 0.6495386799148332, "grad_norm": 0.8335089087486267, "learning_rate": 5.593969395023779e-05, "loss": 1.664, "step": 572 }, { "epoch": 0.6506742370475515, "grad_norm": 0.8103551268577576, "learning_rate": 5.561619088575488e-05, "loss": 1.4491, "step": 573 }, { "epoch": 0.6518097941802697, "grad_norm": 0.8770089745521545, "learning_rate": 5.529326523557312e-05, "loss": 1.6066, "step": 574 }, { "epoch": 0.6529453513129879, "grad_norm": 0.8948405981063843, "learning_rate": 5.4970921200816594e-05, "loss": 1.5297, "step": 575 }, { "epoch": 0.6540809084457062, "grad_norm": 0.9856278896331787, "learning_rate": 5.4649162975042876e-05, "loss": 1.7404, "step": 576 }, { "epoch": 0.6552164655784244, "grad_norm": 0.9676799774169922, "learning_rate": 5.432799474418828e-05, "loss": 1.5605, "step": 577 }, { "epoch": 0.6563520227111427, "grad_norm": 0.8992465734481812, "learning_rate": 5.400742068651358e-05, "loss": 1.5372, "step": 578 }, { "epoch": 0.6574875798438609, "grad_norm": 0.9776160717010498, "learning_rate": 5.3687444972549695e-05, "loss": 1.5489, "step": 579 }, { "epoch": 0.6586231369765791, "grad_norm": 0.9442588686943054, "learning_rate": 5.3368071765043216e-05, "loss": 1.5458, "step": 580 }, { "epoch": 0.6597586941092973, "grad_norm": 0.9355054497718811, "learning_rate": 5.304930521890252e-05, "loss": 1.498, "step": 581 }, { "epoch": 0.6608942512420156, "grad_norm": 0.9499759674072266, "learning_rate": 5.273114948114346e-05, "loss": 1.5533, "step": 582 }, { "epoch": 0.6620298083747339, "grad_norm": 1.1099355220794678, "learning_rate": 5.241360869083567e-05, "loss": 1.6077, "step": 583 }, { "epoch": 0.6631653655074521, "grad_norm": 0.9693394899368286, "learning_rate": 5.209668697904844e-05, "loss": 1.4109, "step": 584 }, { "epoch": 0.6643009226401704, "grad_norm": 1.1466600894927979, "learning_rate": 5.178038846879716e-05, "loss": 1.5112, "step": 585 }, { "epoch": 0.6654364797728886, "grad_norm": 1.0376183986663818, "learning_rate": 5.14647172749897e-05, "loss": 1.4274, "step": 586 }, { "epoch": 0.6665720369056068, "grad_norm": 1.163313865661621, "learning_rate": 5.114967750437272e-05, "loss": 1.4807, "step": 587 }, { "epoch": 0.667707594038325, "grad_norm": 1.2299911975860596, "learning_rate": 5.0835273255478436e-05, "loss": 1.5357, "step": 588 }, { "epoch": 0.6688431511710433, "grad_norm": 1.1933594942092896, "learning_rate": 5.05215086185711e-05, "loss": 1.4173, "step": 589 }, { "epoch": 0.6699787083037615, "grad_norm": 1.2763584852218628, "learning_rate": 5.02083876755939e-05, "loss": 1.5017, "step": 590 }, { "epoch": 0.6711142654364798, "grad_norm": 1.204347014427185, "learning_rate": 4.98959145001159e-05, "loss": 1.3746, "step": 591 }, { "epoch": 0.672249822569198, "grad_norm": 1.4958367347717285, "learning_rate": 4.958409315727902e-05, "loss": 1.338, "step": 592 }, { "epoch": 0.6733853797019163, "grad_norm": 1.3173085451126099, "learning_rate": 4.9272927703745e-05, "loss": 1.3477, "step": 593 }, { "epoch": 0.6745209368346345, "grad_norm": 1.326626181602478, "learning_rate": 4.896242218764275e-05, "loss": 1.4096, "step": 594 }, { "epoch": 0.6756564939673527, "grad_norm": 1.4975693225860596, "learning_rate": 4.865258064851579e-05, "loss": 1.4497, "step": 595 }, { "epoch": 0.676792051100071, "grad_norm": 1.6770268678665161, "learning_rate": 4.8343407117269524e-05, "loss": 1.3418, "step": 596 }, { "epoch": 0.6779276082327892, "grad_norm": 1.5939122438430786, "learning_rate": 4.803490561611884e-05, "loss": 1.1633, "step": 597 }, { "epoch": 0.6790631653655075, "grad_norm": 1.6800607442855835, "learning_rate": 4.772708015853581e-05, "loss": 1.1496, "step": 598 }, { "epoch": 0.6801987224982257, "grad_norm": 1.9421743154525757, "learning_rate": 4.7419934749197446e-05, "loss": 1.0623, "step": 599 }, { "epoch": 0.681334279630944, "grad_norm": 2.70224666595459, "learning_rate": 4.711347338393369e-05, "loss": 1.1772, "step": 600 }, { "epoch": 0.6824698367636621, "grad_norm": 0.6946836709976196, "learning_rate": 4.680770004967536e-05, "loss": 1.9147, "step": 601 }, { "epoch": 0.6836053938963804, "grad_norm": 0.7127718925476074, "learning_rate": 4.6502618724402235e-05, "loss": 1.8554, "step": 602 }, { "epoch": 0.6847409510290986, "grad_norm": 0.7869585752487183, "learning_rate": 4.619823337709134e-05, "loss": 1.8342, "step": 603 }, { "epoch": 0.6858765081618169, "grad_norm": 0.7868178486824036, "learning_rate": 4.5894547967665416e-05, "loss": 1.8743, "step": 604 }, { "epoch": 0.6870120652945352, "grad_norm": 0.8273993730545044, "learning_rate": 4.559156644694118e-05, "loss": 1.8336, "step": 605 }, { "epoch": 0.6881476224272534, "grad_norm": 0.8090512156486511, "learning_rate": 4.5289292756578236e-05, "loss": 1.7838, "step": 606 }, { "epoch": 0.6892831795599716, "grad_norm": 0.8638222217559814, "learning_rate": 4.4987730829027444e-05, "loss": 1.7716, "step": 607 }, { "epoch": 0.6904187366926898, "grad_norm": 0.7992023229598999, "learning_rate": 4.468688458748006e-05, "loss": 1.7275, "step": 608 }, { "epoch": 0.6915542938254081, "grad_norm": 0.8375924825668335, "learning_rate": 4.4386757945816514e-05, "loss": 1.78, "step": 609 }, { "epoch": 0.6926898509581263, "grad_norm": 0.8237891793251038, "learning_rate": 4.40873548085555e-05, "loss": 1.7003, "step": 610 }, { "epoch": 0.6938254080908446, "grad_norm": 0.78203946352005, "learning_rate": 4.378867907080338e-05, "loss": 1.7201, "step": 611 }, { "epoch": 0.6949609652235628, "grad_norm": 0.8093476891517639, "learning_rate": 4.34907346182032e-05, "loss": 1.7552, "step": 612 }, { "epoch": 0.6960965223562811, "grad_norm": 0.8110452890396118, "learning_rate": 4.3193525326884435e-05, "loss": 1.6928, "step": 613 }, { "epoch": 0.6972320794889993, "grad_norm": 0.8030693531036377, "learning_rate": 4.2897055063412325e-05, "loss": 1.6625, "step": 614 }, { "epoch": 0.6983676366217175, "grad_norm": 0.827095091342926, "learning_rate": 4.260132768473769e-05, "loss": 1.704, "step": 615 }, { "epoch": 0.6995031937544358, "grad_norm": 0.8335441946983337, "learning_rate": 4.230634703814678e-05, "loss": 1.6508, "step": 616 }, { "epoch": 0.700638750887154, "grad_norm": 0.8258208632469177, "learning_rate": 4.2012116961211214e-05, "loss": 1.6027, "step": 617 }, { "epoch": 0.7017743080198723, "grad_norm": 0.8009803295135498, "learning_rate": 4.1718641281737927e-05, "loss": 1.5949, "step": 618 }, { "epoch": 0.7029098651525905, "grad_norm": 0.8161913752555847, "learning_rate": 4.142592381771947e-05, "loss": 1.6369, "step": 619 }, { "epoch": 0.7040454222853088, "grad_norm": 0.8396199941635132, "learning_rate": 4.113396837728443e-05, "loss": 1.6269, "step": 620 }, { "epoch": 0.7051809794180269, "grad_norm": 0.8665387630462646, "learning_rate": 4.084277875864776e-05, "loss": 1.5541, "step": 621 }, { "epoch": 0.7063165365507452, "grad_norm": 0.8753857016563416, "learning_rate": 4.055235875006135e-05, "loss": 1.5905, "step": 622 }, { "epoch": 0.7074520936834634, "grad_norm": 0.886701226234436, "learning_rate": 4.0262712129764834e-05, "loss": 1.6855, "step": 623 }, { "epoch": 0.7085876508161817, "grad_norm": 0.8672073483467102, "learning_rate": 3.9973842665936336e-05, "loss": 1.6192, "step": 624 }, { "epoch": 0.7097232079489, "grad_norm": 0.9048891067504883, "learning_rate": 3.9685754116643606e-05, "loss": 1.691, "step": 625 }, { "epoch": 0.7108587650816182, "grad_norm": 0.8669731020927429, "learning_rate": 3.9398450229795026e-05, "loss": 1.6129, "step": 626 }, { "epoch": 0.7119943222143365, "grad_norm": 0.9303737282752991, "learning_rate": 3.911193474309076e-05, "loss": 1.6385, "step": 627 }, { "epoch": 0.7131298793470546, "grad_norm": 0.9133759140968323, "learning_rate": 3.8826211383974266e-05, "loss": 1.5454, "step": 628 }, { "epoch": 0.7142654364797729, "grad_norm": 0.9417001605033875, "learning_rate": 3.854128386958382e-05, "loss": 1.5134, "step": 629 }, { "epoch": 0.7154009936124911, "grad_norm": 0.8978011608123779, "learning_rate": 3.825715590670402e-05, "loss": 1.4379, "step": 630 }, { "epoch": 0.7165365507452094, "grad_norm": 0.9301682114601135, "learning_rate": 3.7973831191717726e-05, "loss": 1.3754, "step": 631 }, { "epoch": 0.7176721078779276, "grad_norm": 1.0142457485198975, "learning_rate": 3.769131341055777e-05, "loss": 1.5099, "step": 632 }, { "epoch": 0.7188076650106459, "grad_norm": 1.147111177444458, "learning_rate": 3.740960623865927e-05, "loss": 1.5946, "step": 633 }, { "epoch": 0.719943222143364, "grad_norm": 1.0799906253814697, "learning_rate": 3.7128713340911535e-05, "loss": 1.5381, "step": 634 }, { "epoch": 0.7210787792760823, "grad_norm": 1.0786596536636353, "learning_rate": 3.6848638371610554e-05, "loss": 1.5095, "step": 635 }, { "epoch": 0.7222143364088006, "grad_norm": 1.1043813228607178, "learning_rate": 3.656938497441148e-05, "loss": 1.5359, "step": 636 }, { "epoch": 0.7233498935415188, "grad_norm": 1.2422959804534912, "learning_rate": 3.6290956782281075e-05, "loss": 1.4586, "step": 637 }, { "epoch": 0.7244854506742371, "grad_norm": 1.10708749294281, "learning_rate": 3.601335741745063e-05, "loss": 1.4411, "step": 638 }, { "epoch": 0.7256210078069553, "grad_norm": 1.3422400951385498, "learning_rate": 3.573659049136867e-05, "loss": 1.4354, "step": 639 }, { "epoch": 0.7267565649396736, "grad_norm": 1.1698023080825806, "learning_rate": 3.546065960465405e-05, "loss": 1.4472, "step": 640 }, { "epoch": 0.7278921220723917, "grad_norm": 1.2639155387878418, "learning_rate": 3.518556834704917e-05, "loss": 1.5032, "step": 641 }, { "epoch": 0.72902767920511, "grad_norm": 1.4656773805618286, "learning_rate": 3.4911320297373204e-05, "loss": 1.4674, "step": 642 }, { "epoch": 0.7301632363378282, "grad_norm": 1.303754448890686, "learning_rate": 3.4637919023475526e-05, "loss": 1.3883, "step": 643 }, { "epoch": 0.7312987934705465, "grad_norm": 1.4667454957962036, "learning_rate": 3.436536808218932e-05, "loss": 1.4055, "step": 644 }, { "epoch": 0.7324343506032647, "grad_norm": 1.4695309400558472, "learning_rate": 3.4093671019285256e-05, "loss": 1.2336, "step": 645 }, { "epoch": 0.733569907735983, "grad_norm": 1.7231589555740356, "learning_rate": 3.382283136942559e-05, "loss": 1.4652, "step": 646 }, { "epoch": 0.7347054648687013, "grad_norm": 1.6907893419265747, "learning_rate": 3.355285265611784e-05, "loss": 1.2647, "step": 647 }, { "epoch": 0.7358410220014194, "grad_norm": 1.8390649557113647, "learning_rate": 3.328373839166914e-05, "loss": 1.1934, "step": 648 }, { "epoch": 0.7369765791341377, "grad_norm": 2.277698040008545, "learning_rate": 3.301549207714051e-05, "loss": 1.1359, "step": 649 }, { "epoch": 0.7381121362668559, "grad_norm": 3.420145273208618, "learning_rate": 3.2748117202301364e-05, "loss": 1.4675, "step": 650 }, { "epoch": 0.7392476933995742, "grad_norm": 0.5647117495536804, "learning_rate": 3.248161724558406e-05, "loss": 1.8096, "step": 651 }, { "epoch": 0.7403832505322924, "grad_norm": 0.6874871850013733, "learning_rate": 3.221599567403859e-05, "loss": 1.8367, "step": 652 }, { "epoch": 0.7415188076650107, "grad_norm": 0.6934542059898376, "learning_rate": 3.1951255943287525e-05, "loss": 1.7968, "step": 653 }, { "epoch": 0.7426543647977288, "grad_norm": 0.7449638247489929, "learning_rate": 3.168740149748116e-05, "loss": 1.7888, "step": 654 }, { "epoch": 0.7437899219304471, "grad_norm": 0.7421103715896606, "learning_rate": 3.14244357692525e-05, "loss": 1.7982, "step": 655 }, { "epoch": 0.7449254790631654, "grad_norm": 0.7333208322525024, "learning_rate": 3.116236217967285e-05, "loss": 1.7333, "step": 656 }, { "epoch": 0.7460610361958836, "grad_norm": 0.7759894132614136, "learning_rate": 3.090118413820698e-05, "loss": 1.802, "step": 657 }, { "epoch": 0.7471965933286019, "grad_norm": 0.7925217747688293, "learning_rate": 3.064090504266919e-05, "loss": 1.8207, "step": 658 }, { "epoch": 0.7483321504613201, "grad_norm": 0.7934524416923523, "learning_rate": 3.0381528279178706e-05, "loss": 1.8089, "step": 659 }, { "epoch": 0.7494677075940384, "grad_norm": 0.7867562770843506, "learning_rate": 3.0123057222115836e-05, "loss": 1.7985, "step": 660 }, { "epoch": 0.7506032647267565, "grad_norm": 0.7472755312919617, "learning_rate": 2.9865495234078135e-05, "loss": 1.6816, "step": 661 }, { "epoch": 0.7517388218594748, "grad_norm": 0.7101432085037231, "learning_rate": 2.9608845665836428e-05, "loss": 1.5506, "step": 662 }, { "epoch": 0.752874378992193, "grad_norm": 0.7864011526107788, "learning_rate": 2.935311185629146e-05, "loss": 1.5984, "step": 663 }, { "epoch": 0.752874378992193, "eval_loss": 1.5109450817108154, "eval_runtime": 96.2692, "eval_samples_per_second": 15.415, "eval_steps_per_second": 7.708, "step": 663 }, { "epoch": 0.7540099361249113, "grad_norm": 0.7694438099861145, "learning_rate": 2.9098297132430265e-05, "loss": 1.6372, "step": 664 }, { "epoch": 0.7551454932576295, "grad_norm": 0.7872363924980164, "learning_rate": 2.8844404809282978e-05, "loss": 1.6629, "step": 665 }, { "epoch": 0.7562810503903478, "grad_norm": 0.8400985598564148, "learning_rate": 2.8591438189879748e-05, "loss": 1.6449, "step": 666 }, { "epoch": 0.757416607523066, "grad_norm": 0.8109758496284485, "learning_rate": 2.833940056520772e-05, "loss": 1.6523, "step": 667 }, { "epoch": 0.7585521646557842, "grad_norm": 0.8173964619636536, "learning_rate": 2.8088295214168147e-05, "loss": 1.6574, "step": 668 }, { "epoch": 0.7596877217885025, "grad_norm": 0.8443711996078491, "learning_rate": 2.7838125403533854e-05, "loss": 1.5301, "step": 669 }, { "epoch": 0.7608232789212207, "grad_norm": 0.8328931927680969, "learning_rate": 2.7588894387906585e-05, "loss": 1.5988, "step": 670 }, { "epoch": 0.761958836053939, "grad_norm": 0.850821852684021, "learning_rate": 2.734060540967499e-05, "loss": 1.6433, "step": 671 }, { "epoch": 0.7630943931866572, "grad_norm": 0.8715527653694153, "learning_rate": 2.7093261698972005e-05, "loss": 1.5215, "step": 672 }, { "epoch": 0.7642299503193755, "grad_norm": 0.8679649233818054, "learning_rate": 2.6846866473633125e-05, "loss": 1.5598, "step": 673 }, { "epoch": 0.7653655074520936, "grad_norm": 0.9053911566734314, "learning_rate": 2.6601422939154407e-05, "loss": 1.6417, "step": 674 }, { "epoch": 0.7665010645848119, "grad_norm": 0.8917622566223145, "learning_rate": 2.6356934288650903e-05, "loss": 1.6558, "step": 675 }, { "epoch": 0.7676366217175301, "grad_norm": 0.9091761708259583, "learning_rate": 2.6113403702814998e-05, "loss": 1.527, "step": 676 }, { "epoch": 0.7687721788502484, "grad_norm": 0.8782323002815247, "learning_rate": 2.587083434987505e-05, "loss": 1.4536, "step": 677 }, { "epoch": 0.7699077359829667, "grad_norm": 0.9613420963287354, "learning_rate": 2.5629229385554142e-05, "loss": 1.5238, "step": 678 }, { "epoch": 0.7710432931156849, "grad_norm": 0.9632066488265991, "learning_rate": 2.538859195302922e-05, "loss": 1.4742, "step": 679 }, { "epoch": 0.7721788502484032, "grad_norm": 1.0621824264526367, "learning_rate": 2.514892518288988e-05, "loss": 1.5476, "step": 680 }, { "epoch": 0.7733144073811213, "grad_norm": 0.9680823683738708, "learning_rate": 2.4910232193097994e-05, "loss": 1.4848, "step": 681 }, { "epoch": 0.7744499645138396, "grad_norm": 0.9530523419380188, "learning_rate": 2.467251608894683e-05, "loss": 1.349, "step": 682 }, { "epoch": 0.7755855216465578, "grad_norm": 1.0773934125900269, "learning_rate": 2.443577996302081e-05, "loss": 1.5064, "step": 683 }, { "epoch": 0.7767210787792761, "grad_norm": 1.0130152702331543, "learning_rate": 2.420002689515537e-05, "loss": 1.3504, "step": 684 }, { "epoch": 0.7778566359119943, "grad_norm": 1.1102440357208252, "learning_rate": 2.3965259952396646e-05, "loss": 1.5364, "step": 685 }, { "epoch": 0.7789921930447126, "grad_norm": 1.0786466598510742, "learning_rate": 2.3731482188961818e-05, "loss": 1.4165, "step": 686 }, { "epoch": 0.7801277501774309, "grad_norm": 1.1379436254501343, "learning_rate": 2.349869664619917e-05, "loss": 1.4224, "step": 687 }, { "epoch": 0.781263307310149, "grad_norm": 1.144761562347412, "learning_rate": 2.326690635254872e-05, "loss": 1.4376, "step": 688 }, { "epoch": 0.7823988644428673, "grad_norm": 1.2116212844848633, "learning_rate": 2.3036114323502655e-05, "loss": 1.3999, "step": 689 }, { "epoch": 0.7835344215755855, "grad_norm": 1.2774122953414917, "learning_rate": 2.2806323561566146e-05, "loss": 1.4136, "step": 690 }, { "epoch": 0.7846699787083038, "grad_norm": 1.319525122642517, "learning_rate": 2.257753705621839e-05, "loss": 1.3639, "step": 691 }, { "epoch": 0.785805535841022, "grad_norm": 1.3245418071746826, "learning_rate": 2.2349757783873627e-05, "loss": 1.2687, "step": 692 }, { "epoch": 0.7869410929737403, "grad_norm": 1.3538442850112915, "learning_rate": 2.2122988707842353e-05, "loss": 1.4123, "step": 693 }, { "epoch": 0.7880766501064584, "grad_norm": 1.4383583068847656, "learning_rate": 2.18972327782929e-05, "loss": 1.3268, "step": 694 }, { "epoch": 0.7892122072391767, "grad_norm": 1.4845287799835205, "learning_rate": 2.167249293221293e-05, "loss": 1.237, "step": 695 }, { "epoch": 0.7903477643718949, "grad_norm": 1.6581833362579346, "learning_rate": 2.144877209337145e-05, "loss": 1.3412, "step": 696 }, { "epoch": 0.7914833215046132, "grad_norm": 1.7108768224716187, "learning_rate": 2.122607317228049e-05, "loss": 1.1361, "step": 697 }, { "epoch": 0.7926188786373315, "grad_norm": 1.8874878883361816, "learning_rate": 2.100439906615739e-05, "loss": 1.187, "step": 698 }, { "epoch": 0.7937544357700497, "grad_norm": 2.4068119525909424, "learning_rate": 2.0783752658887066e-05, "loss": 1.0358, "step": 699 }, { "epoch": 0.794889992902768, "grad_norm": 3.270036458969116, "learning_rate": 2.056413682098459e-05, "loss": 1.1526, "step": 700 }, { "epoch": 0.7960255500354861, "grad_norm": 0.6681864857673645, "learning_rate": 2.034555440955773e-05, "loss": 1.9026, "step": 701 }, { "epoch": 0.7971611071682044, "grad_norm": 0.6761972904205322, "learning_rate": 2.0128008268269815e-05, "loss": 1.8308, "step": 702 }, { "epoch": 0.7982966643009226, "grad_norm": 0.6852626204490662, "learning_rate": 1.9911501227302687e-05, "loss": 1.7616, "step": 703 }, { "epoch": 0.7994322214336409, "grad_norm": 0.7402864694595337, "learning_rate": 1.969603610332007e-05, "loss": 1.7934, "step": 704 }, { "epoch": 0.8005677785663591, "grad_norm": 0.7134304642677307, "learning_rate": 1.9481615699430654e-05, "loss": 1.6278, "step": 705 }, { "epoch": 0.8017033356990774, "grad_norm": 0.7959495186805725, "learning_rate": 1.9268242805151902e-05, "loss": 1.8446, "step": 706 }, { "epoch": 0.8028388928317955, "grad_norm": 0.786116898059845, "learning_rate": 1.9055920196373523e-05, "loss": 1.7897, "step": 707 }, { "epoch": 0.8039744499645138, "grad_norm": 0.7417249083518982, "learning_rate": 1.8844650635321483e-05, "loss": 1.68, "step": 708 }, { "epoch": 0.8051100070972321, "grad_norm": 0.7883526086807251, "learning_rate": 1.863443687052211e-05, "loss": 1.7149, "step": 709 }, { "epoch": 0.8062455642299503, "grad_norm": 0.7621736526489258, "learning_rate": 1.842528163676619e-05, "loss": 1.6997, "step": 710 }, { "epoch": 0.8073811213626686, "grad_norm": 0.8280425667762756, "learning_rate": 1.8217187655073564e-05, "loss": 1.7657, "step": 711 }, { "epoch": 0.8085166784953868, "grad_norm": 0.8050588369369507, "learning_rate": 1.8010157632657543e-05, "loss": 1.6497, "step": 712 }, { "epoch": 0.8096522356281051, "grad_norm": 0.8473997116088867, "learning_rate": 1.7804194262889874e-05, "loss": 1.6952, "step": 713 }, { "epoch": 0.8107877927608232, "grad_norm": 0.7914564609527588, "learning_rate": 1.759930022526556e-05, "loss": 1.6326, "step": 714 }, { "epoch": 0.8119233498935415, "grad_norm": 0.8434754610061646, "learning_rate": 1.739547818536804e-05, "loss": 1.6652, "step": 715 }, { "epoch": 0.8130589070262597, "grad_norm": 0.8190686702728271, "learning_rate": 1.7192730794834556e-05, "loss": 1.6303, "step": 716 }, { "epoch": 0.814194464158978, "grad_norm": 0.8698839545249939, "learning_rate": 1.699106069132165e-05, "loss": 1.6713, "step": 717 }, { "epoch": 0.8153300212916962, "grad_norm": 0.8213229775428772, "learning_rate": 1.6790470498470744e-05, "loss": 1.5605, "step": 718 }, { "epoch": 0.8164655784244145, "grad_norm": 0.8308836817741394, "learning_rate": 1.6590962825874146e-05, "loss": 1.6145, "step": 719 }, { "epoch": 0.8176011355571328, "grad_norm": 0.8767895102500916, "learning_rate": 1.639254026904099e-05, "loss": 1.5442, "step": 720 }, { "epoch": 0.8187366926898509, "grad_norm": 0.8982859253883362, "learning_rate": 1.6195205409363577e-05, "loss": 1.5176, "step": 721 }, { "epoch": 0.8198722498225692, "grad_norm": 0.8737882375717163, "learning_rate": 1.599896081408373e-05, "loss": 1.6849, "step": 722 }, { "epoch": 0.8210078069552874, "grad_norm": 0.9010720252990723, "learning_rate": 1.5803809036259364e-05, "loss": 1.5601, "step": 723 }, { "epoch": 0.8221433640880057, "grad_norm": 0.9659538865089417, "learning_rate": 1.5609752614731288e-05, "loss": 1.6631, "step": 724 }, { "epoch": 0.8232789212207239, "grad_norm": 0.9552429914474487, "learning_rate": 1.5416794074090258e-05, "loss": 1.5541, "step": 725 }, { "epoch": 0.8244144783534422, "grad_norm": 0.9897984266281128, "learning_rate": 1.5224935924644069e-05, "loss": 1.552, "step": 726 }, { "epoch": 0.8255500354861603, "grad_norm": 0.9992690682411194, "learning_rate": 1.5034180662384857e-05, "loss": 1.5913, "step": 727 }, { "epoch": 0.8266855926188786, "grad_norm": 0.9938931465148926, "learning_rate": 1.4844530768956656e-05, "loss": 1.491, "step": 728 }, { "epoch": 0.8278211497515969, "grad_norm": 0.9427549242973328, "learning_rate": 1.4655988711623203e-05, "loss": 1.5556, "step": 729 }, { "epoch": 0.8289567068843151, "grad_norm": 1.0238285064697266, "learning_rate": 1.4468556943235678e-05, "loss": 1.5669, "step": 730 }, { "epoch": 0.8300922640170334, "grad_norm": 1.0253803730010986, "learning_rate": 1.4282237902200957e-05, "loss": 1.4737, "step": 731 }, { "epoch": 0.8312278211497516, "grad_norm": 1.0242679119110107, "learning_rate": 1.409703401244975e-05, "loss": 1.538, "step": 732 }, { "epoch": 0.8323633782824699, "grad_norm": 1.1211565732955933, "learning_rate": 1.391294768340513e-05, "loss": 1.6431, "step": 733 }, { "epoch": 0.833498935415188, "grad_norm": 1.166258692741394, "learning_rate": 1.3729981309951245e-05, "loss": 1.5305, "step": 734 }, { "epoch": 0.8346344925479063, "grad_norm": 1.1053909063339233, "learning_rate": 1.3548137272402006e-05, "loss": 1.4315, "step": 735 }, { "epoch": 0.8357700496806245, "grad_norm": 1.086614966392517, "learning_rate": 1.3367417936470328e-05, "loss": 1.3344, "step": 736 }, { "epoch": 0.8369056068133428, "grad_norm": 1.0932161808013916, "learning_rate": 1.318782565323714e-05, "loss": 1.291, "step": 737 }, { "epoch": 0.838041163946061, "grad_norm": 1.0981745719909668, "learning_rate": 1.300936275912098e-05, "loss": 1.2024, "step": 738 }, { "epoch": 0.8391767210787793, "grad_norm": 1.1574921607971191, "learning_rate": 1.2832031575847448e-05, "loss": 1.3241, "step": 739 }, { "epoch": 0.8403122782114976, "grad_norm": 1.1923638582229614, "learning_rate": 1.265583441041911e-05, "loss": 1.3338, "step": 740 }, { "epoch": 0.8414478353442157, "grad_norm": 1.1882578134536743, "learning_rate": 1.2480773555085434e-05, "loss": 1.2166, "step": 741 }, { "epoch": 0.842583392476934, "grad_norm": 1.3554632663726807, "learning_rate": 1.2306851287313025e-05, "loss": 1.4013, "step": 742 }, { "epoch": 0.8437189496096522, "grad_norm": 1.366865634918213, "learning_rate": 1.2134069869755893e-05, "loss": 1.3325, "step": 743 }, { "epoch": 0.8448545067423705, "grad_norm": 1.4422940015792847, "learning_rate": 1.1962431550226105e-05, "loss": 1.314, "step": 744 }, { "epoch": 0.8459900638750887, "grad_norm": 1.42853581905365, "learning_rate": 1.1791938561664485e-05, "loss": 1.1524, "step": 745 }, { "epoch": 0.847125621007807, "grad_norm": 1.6174098253250122, "learning_rate": 1.1622593122111624e-05, "loss": 1.3666, "step": 746 }, { "epoch": 0.8482611781405252, "grad_norm": 1.6065244674682617, "learning_rate": 1.1454397434679021e-05, "loss": 1.2227, "step": 747 }, { "epoch": 0.8493967352732434, "grad_norm": 1.913891077041626, "learning_rate": 1.128735368752033e-05, "loss": 1.0861, "step": 748 }, { "epoch": 0.8505322924059616, "grad_norm": 2.0596818923950195, "learning_rate": 1.1121464053802965e-05, "loss": 1.0318, "step": 749 }, { "epoch": 0.8516678495386799, "grad_norm": 3.119539976119995, "learning_rate": 1.0956730691679861e-05, "loss": 1.0738, "step": 750 }, { "epoch": 0.8528034066713982, "grad_norm": 0.5730923414230347, "learning_rate": 1.0793155744261351e-05, "loss": 1.8209, "step": 751 }, { "epoch": 0.8539389638041164, "grad_norm": 0.6221286058425903, "learning_rate": 1.0630741339587257e-05, "loss": 1.8422, "step": 752 }, { "epoch": 0.8550745209368347, "grad_norm": 0.6800789833068848, "learning_rate": 1.0469489590599257e-05, "loss": 1.7683, "step": 753 }, { "epoch": 0.8562100780695528, "grad_norm": 0.7071248292922974, "learning_rate": 1.0309402595113338e-05, "loss": 1.7827, "step": 754 }, { "epoch": 0.8573456352022711, "grad_norm": 0.6935234069824219, "learning_rate": 1.0150482435792618e-05, "loss": 1.7682, "step": 755 }, { "epoch": 0.8584811923349893, "grad_norm": 0.7290916442871094, "learning_rate": 9.992731180120164e-06, "loss": 1.7344, "step": 756 }, { "epoch": 0.8596167494677076, "grad_norm": 0.7707837224006653, "learning_rate": 9.836150880372041e-06, "loss": 1.7938, "step": 757 }, { "epoch": 0.8607523066004258, "grad_norm": 0.764090359210968, "learning_rate": 9.680743573590733e-06, "loss": 1.7013, "step": 758 }, { "epoch": 0.8618878637331441, "grad_norm": 0.7422741651535034, "learning_rate": 9.526511281558593e-06, "loss": 1.6385, "step": 759 }, { "epoch": 0.8630234208658624, "grad_norm": 0.7692238688468933, "learning_rate": 9.373456010771509e-06, "loss": 1.6769, "step": 760 }, { "epoch": 0.8641589779985805, "grad_norm": 0.7718127965927124, "learning_rate": 9.221579752412856e-06, "loss": 1.6048, "step": 761 }, { "epoch": 0.8652945351312988, "grad_norm": 0.7635827660560608, "learning_rate": 9.070884482327524e-06, "loss": 1.6526, "step": 762 }, { "epoch": 0.866430092264017, "grad_norm": 0.7959072589874268, "learning_rate": 8.921372160996322e-06, "loss": 1.6997, "step": 763 }, { "epoch": 0.8675656493967353, "grad_norm": 0.8083943128585815, "learning_rate": 8.773044733510338e-06, "loss": 1.6124, "step": 764 }, { "epoch": 0.8687012065294535, "grad_norm": 0.8135414719581604, "learning_rate": 8.625904129545692e-06, "loss": 1.6341, "step": 765 }, { "epoch": 0.8698367636621718, "grad_norm": 0.8324394226074219, "learning_rate": 8.479952263338509e-06, "loss": 1.6337, "step": 766 }, { "epoch": 0.87097232079489, "grad_norm": 0.887231707572937, "learning_rate": 8.335191033659907e-06, "loss": 1.6091, "step": 767 }, { "epoch": 0.8721078779276082, "grad_norm": 0.8264531493186951, "learning_rate": 8.191622323791315e-06, "loss": 1.6652, "step": 768 }, { "epoch": 0.8732434350603264, "grad_norm": 0.8386013507843018, "learning_rate": 8.049248001500021e-06, "loss": 1.619, "step": 769 }, { "epoch": 0.8743789921930447, "grad_norm": 0.8475117087364197, "learning_rate": 7.908069919014815e-06, "loss": 1.5792, "step": 770 }, { "epoch": 0.875514549325763, "grad_norm": 0.9289790391921997, "learning_rate": 7.768089913001941e-06, "loss": 1.5944, "step": 771 }, { "epoch": 0.8766501064584812, "grad_norm": 0.911464512348175, "learning_rate": 7.629309804541207e-06, "loss": 1.5613, "step": 772 }, { "epoch": 0.8777856635911995, "grad_norm": 0.8851836919784546, "learning_rate": 7.491731399102231e-06, "loss": 1.5347, "step": 773 }, { "epoch": 0.8789212207239177, "grad_norm": 0.9375848770141602, "learning_rate": 7.355356486520959e-06, "loss": 1.5301, "step": 774 }, { "epoch": 0.8800567778566359, "grad_norm": 0.9790908694267273, "learning_rate": 7.220186840976495e-06, "loss": 1.5412, "step": 775 }, { "epoch": 0.8811923349893541, "grad_norm": 0.9363968968391418, "learning_rate": 7.086224220967907e-06, "loss": 1.5467, "step": 776 }, { "epoch": 0.8823278921220724, "grad_norm": 0.9428462982177734, "learning_rate": 6.953470369291348e-06, "loss": 1.6125, "step": 777 }, { "epoch": 0.8834634492547906, "grad_norm": 0.9502483606338501, "learning_rate": 6.821927013017426e-06, "loss": 1.4819, "step": 778 }, { "epoch": 0.8845990063875089, "grad_norm": 1.0164519548416138, "learning_rate": 6.691595863468703e-06, "loss": 1.6474, "step": 779 }, { "epoch": 0.8857345635202271, "grad_norm": 0.9861384630203247, "learning_rate": 6.562478616197554e-06, "loss": 1.417, "step": 780 }, { "epoch": 0.8868701206529453, "grad_norm": 1.0205590724945068, "learning_rate": 6.4345769509638776e-06, "loss": 1.5545, "step": 781 }, { "epoch": 0.8880056777856636, "grad_norm": 0.9810398817062378, "learning_rate": 6.307892531713444e-06, "loss": 1.431, "step": 782 }, { "epoch": 0.8891412349183818, "grad_norm": 1.041710615158081, "learning_rate": 6.182427006556135e-06, "loss": 1.4386, "step": 783 }, { "epoch": 0.8902767920511001, "grad_norm": 1.0606962442398071, "learning_rate": 6.058182007744584e-06, "loss": 1.3838, "step": 784 }, { "epoch": 0.8914123491838183, "grad_norm": 1.1651570796966553, "learning_rate": 5.935159151652902e-06, "loss": 1.4835, "step": 785 }, { "epoch": 0.8925479063165366, "grad_norm": 1.0349689722061157, "learning_rate": 5.813360038755611e-06, "loss": 1.2883, "step": 786 }, { "epoch": 0.8936834634492548, "grad_norm": 1.1531161069869995, "learning_rate": 5.6927862536068635e-06, "loss": 1.522, "step": 787 }, { "epoch": 0.894819020581973, "grad_norm": 1.2297810316085815, "learning_rate": 5.573439364819855e-06, "loss": 1.48, "step": 788 }, { "epoch": 0.8959545777146912, "grad_norm": 1.2327901124954224, "learning_rate": 5.455320925046359e-06, "loss": 1.4489, "step": 789 }, { "epoch": 0.8970901348474095, "grad_norm": 1.1867223978042603, "learning_rate": 5.338432470956589e-06, "loss": 1.3596, "step": 790 }, { "epoch": 0.8982256919801278, "grad_norm": 1.2836939096450806, "learning_rate": 5.222775523219125e-06, "loss": 1.2678, "step": 791 }, { "epoch": 0.899361249112846, "grad_norm": 1.3315327167510986, "learning_rate": 5.108351586481197e-06, "loss": 1.4213, "step": 792 }, { "epoch": 0.9004968062455643, "grad_norm": 1.4310411214828491, "learning_rate": 4.99516214934912e-06, "loss": 1.3085, "step": 793 }, { "epoch": 0.9016323633782825, "grad_norm": 1.4322105646133423, "learning_rate": 4.8832086843688564e-06, "loss": 1.2955, "step": 794 }, { "epoch": 0.9027679205110007, "grad_norm": 1.5964086055755615, "learning_rate": 4.772492648006932e-06, "loss": 1.3004, "step": 795 }, { "epoch": 0.9039034776437189, "grad_norm": 1.6921981573104858, "learning_rate": 4.663015480631428e-06, "loss": 1.3035, "step": 796 }, { "epoch": 0.9050390347764372, "grad_norm": 1.6932827234268188, "learning_rate": 4.554778606493315e-06, "loss": 1.1871, "step": 797 }, { "epoch": 0.9061745919091554, "grad_norm": 1.794168472290039, "learning_rate": 4.447783433707842e-06, "loss": 0.9291, "step": 798 }, { "epoch": 0.9073101490418737, "grad_norm": 2.2904467582702637, "learning_rate": 4.342031354236265e-06, "loss": 1.1717, "step": 799 }, { "epoch": 0.9084457061745919, "grad_norm": 3.239985466003418, "learning_rate": 4.237523743867744e-06, "loss": 1.2094, "step": 800 }, { "epoch": 0.9095812633073102, "grad_norm": 0.593803882598877, "learning_rate": 4.134261962201425e-06, "loss": 1.8452, "step": 801 }, { "epoch": 0.9107168204400284, "grad_norm": 0.6365258693695068, "learning_rate": 4.032247352628748e-06, "loss": 1.8229, "step": 802 }, { "epoch": 0.9118523775727466, "grad_norm": 0.6704772710800171, "learning_rate": 3.931481242315993e-06, "loss": 1.817, "step": 803 }, { "epoch": 0.9129879347054649, "grad_norm": 0.7142685055732727, "learning_rate": 3.8319649421869495e-06, "loss": 1.864, "step": 804 }, { "epoch": 0.9141234918381831, "grad_norm": 0.7026089429855347, "learning_rate": 3.7336997469060276e-06, "loss": 1.6577, "step": 805 }, { "epoch": 0.9152590489709014, "grad_norm": 0.7174745798110962, "learning_rate": 3.6366869348611887e-06, "loss": 1.7716, "step": 806 }, { "epoch": 0.9163946061036196, "grad_norm": 0.7687094807624817, "learning_rate": 3.540927768147484e-06, "loss": 1.8029, "step": 807 }, { "epoch": 0.9175301632363378, "grad_norm": 0.783196210861206, "learning_rate": 3.4464234925505213e-06, "loss": 1.6762, "step": 808 }, { "epoch": 0.918665720369056, "grad_norm": 0.7645063996315002, "learning_rate": 3.3531753375303897e-06, "loss": 1.6432, "step": 809 }, { "epoch": 0.9198012775017743, "grad_norm": 0.7609047293663025, "learning_rate": 3.261184516205551e-06, "loss": 1.6945, "step": 810 }, { "epoch": 0.9209368346344925, "grad_norm": 0.8239038586616516, "learning_rate": 3.1704522253370947e-06, "loss": 1.6944, "step": 811 }, { "epoch": 0.9220723917672108, "grad_norm": 0.7996591329574585, "learning_rate": 3.080979645313142e-06, "loss": 1.7244, "step": 812 }, { "epoch": 0.9232079488999291, "grad_norm": 0.7702991962432861, "learning_rate": 2.9927679401335785e-06, "loss": 1.6388, "step": 813 }, { "epoch": 0.9243435060326473, "grad_norm": 0.8182036280632019, "learning_rate": 2.905818257394799e-06, "loss": 1.6887, "step": 814 }, { "epoch": 0.9254790631653655, "grad_norm": 0.8088632822036743, "learning_rate": 2.8201317282748552e-06, "loss": 1.5761, "step": 815 }, { "epoch": 0.9266146202980837, "grad_norm": 0.8327025175094604, "learning_rate": 2.735709467518699e-06, "loss": 1.6101, "step": 816 }, { "epoch": 0.927750177430802, "grad_norm": 0.8162053823471069, "learning_rate": 2.6525525734236944e-06, "loss": 1.5813, "step": 817 }, { "epoch": 0.9288857345635202, "grad_norm": 0.8418952226638794, "learning_rate": 2.5706621278253406e-06, "loss": 1.51, "step": 818 }, { "epoch": 0.9300212916962385, "grad_norm": 0.8314594626426697, "learning_rate": 2.49003919608316e-06, "loss": 1.6118, "step": 819 }, { "epoch": 0.9311568488289567, "grad_norm": 0.9286348819732666, "learning_rate": 2.4106848270669e-06, "loss": 1.5989, "step": 820 }, { "epoch": 0.932292405961675, "grad_norm": 0.9162536263465881, "learning_rate": 2.3326000531428195e-06, "loss": 1.6276, "step": 821 }, { "epoch": 0.9334279630943931, "grad_norm": 0.8784014582633972, "learning_rate": 2.255785890160311e-06, "loss": 1.5715, "step": 822 }, { "epoch": 0.9345635202271114, "grad_norm": 0.8958483934402466, "learning_rate": 2.1802433374386588e-06, "loss": 1.5561, "step": 823 }, { "epoch": 0.9356990773598297, "grad_norm": 0.9601033926010132, "learning_rate": 2.1059733777540225e-06, "loss": 1.5864, "step": 824 }, { "epoch": 0.9368346344925479, "grad_norm": 1.0430384874343872, "learning_rate": 2.032976977326706e-06, "loss": 1.6622, "step": 825 }, { "epoch": 0.9379701916252662, "grad_norm": 0.9470797181129456, "learning_rate": 1.9612550858085334e-06, "loss": 1.5659, "step": 826 }, { "epoch": 0.9391057487579844, "grad_norm": 1.006312608718872, "learning_rate": 1.8908086362705357e-06, "loss": 1.6137, "step": 827 }, { "epoch": 0.9402413058907026, "grad_norm": 0.9445658922195435, "learning_rate": 1.8216385451907624e-06, "loss": 1.4391, "step": 828 }, { "epoch": 0.9413768630234208, "grad_norm": 1.0231133699417114, "learning_rate": 1.7537457124423895e-06, "loss": 1.4005, "step": 829 }, { "epoch": 0.9425124201561391, "grad_norm": 0.9325294494628906, "learning_rate": 1.68713102128204e-06, "loss": 1.4043, "step": 830 }, { "epoch": 0.9436479772888573, "grad_norm": 1.0003581047058105, "learning_rate": 1.62179533833825e-06, "loss": 1.42, "step": 831 }, { "epoch": 0.9447835344215756, "grad_norm": 1.1234935522079468, "learning_rate": 1.5577395136001982e-06, "loss": 1.5874, "step": 832 }, { "epoch": 0.9459190915542939, "grad_norm": 1.097490668296814, "learning_rate": 1.4949643804066493e-06, "loss": 1.4423, "step": 833 }, { "epoch": 0.9470546486870121, "grad_norm": 1.0460751056671143, "learning_rate": 1.4334707554351511e-06, "loss": 1.4117, "step": 834 }, { "epoch": 0.9481902058197303, "grad_norm": 1.058078646659851, "learning_rate": 1.3732594386913655e-06, "loss": 1.441, "step": 835 }, { "epoch": 0.9493257629524485, "grad_norm": 1.1476143598556519, "learning_rate": 1.3143312134986651e-06, "loss": 1.5404, "step": 836 }, { "epoch": 0.9504613200851668, "grad_norm": 1.1370004415512085, "learning_rate": 1.2566868464879533e-06, "loss": 1.3928, "step": 837 }, { "epoch": 0.951596877217885, "grad_norm": 1.1355334520339966, "learning_rate": 1.200327087587716e-06, "loss": 1.3061, "step": 838 }, { "epoch": 0.9527324343506033, "grad_norm": 1.1487090587615967, "learning_rate": 1.1452526700141964e-06, "loss": 1.2514, "step": 839 }, { "epoch": 0.9538679914833215, "grad_norm": 1.2434322834014893, "learning_rate": 1.091464310261947e-06, "loss": 1.3108, "step": 840 }, { "epoch": 0.9550035486160398, "grad_norm": 1.2955700159072876, "learning_rate": 1.0389627080944153e-06, "loss": 1.3194, "step": 841 }, { "epoch": 0.9561391057487579, "grad_norm": 1.421000599861145, "learning_rate": 9.877485465349058e-07, "loss": 1.4175, "step": 842 }, { "epoch": 0.9572746628814762, "grad_norm": 1.5876970291137695, "learning_rate": 9.378224918576872e-07, "loss": 1.4931, "step": 843 }, { "epoch": 0.9584102200141945, "grad_norm": 1.577010154724121, "learning_rate": 8.891851935792673e-07, "loss": 1.2947, "step": 844 }, { "epoch": 0.9595457771469127, "grad_norm": 1.5548601150512695, "learning_rate": 8.418372844500532e-07, "loss": 1.2899, "step": 845 }, { "epoch": 0.960681334279631, "grad_norm": 1.8083857297897339, "learning_rate": 7.957793804459824e-07, "loss": 1.2714, "step": 846 }, { "epoch": 0.9618168914123492, "grad_norm": 2.0985913276672363, "learning_rate": 7.51012080760638e-07, "loss": 1.4446, "step": 847 }, { "epoch": 0.9629524485450675, "grad_norm": 2.022230863571167, "learning_rate": 7.075359677973569e-07, "loss": 1.1485, "step": 848 }, { "epoch": 0.9640880056777856, "grad_norm": 2.8282248973846436, "learning_rate": 6.653516071616906e-07, "loss": 1.1949, "step": 849 }, { "epoch": 0.9652235628105039, "grad_norm": 3.7681384086608887, "learning_rate": 6.24459547654066e-07, "loss": 1.4056, "step": 850 }, { "epoch": 0.9663591199432221, "grad_norm": 0.616252064704895, "learning_rate": 5.84860321262648e-07, "loss": 1.8399, "step": 851 }, { "epoch": 0.9674946770759404, "grad_norm": 0.7014140486717224, "learning_rate": 5.46554443156333e-07, "loss": 1.8137, "step": 852 }, { "epoch": 0.9686302342086586, "grad_norm": 0.7312139868736267, "learning_rate": 5.095424116781767e-07, "loss": 1.6737, "step": 853 }, { "epoch": 0.9697657913413769, "grad_norm": 0.7224058508872986, "learning_rate": 4.738247083387992e-07, "loss": 1.7196, "step": 854 }, { "epoch": 0.9709013484740951, "grad_norm": 0.74540776014328, "learning_rate": 4.3940179781019055e-07, "loss": 1.7325, "step": 855 }, { "epoch": 0.9720369056068133, "grad_norm": 0.7887935638427734, "learning_rate": 4.06274127919648e-07, "loss": 1.7157, "step": 856 }, { "epoch": 0.9731724627395316, "grad_norm": 0.7673887014389038, "learning_rate": 3.74442129643926e-07, "loss": 1.6395, "step": 857 }, { "epoch": 0.9743080198722498, "grad_norm": 0.7877606749534607, "learning_rate": 3.439062171036511e-07, "loss": 1.5846, "step": 858 }, { "epoch": 0.9754435770049681, "grad_norm": 0.8037866950035095, "learning_rate": 3.14666787557949e-07, "loss": 1.5255, "step": 859 }, { "epoch": 0.9765791341376863, "grad_norm": 0.8087367415428162, "learning_rate": 2.8672422139923715e-07, "loss": 1.5759, "step": 860 }, { "epoch": 0.9777146912704046, "grad_norm": 0.8214208483695984, "learning_rate": 2.600788821483069e-07, "loss": 1.6499, "step": 861 }, { "epoch": 0.9788502484031227, "grad_norm": 0.8392576575279236, "learning_rate": 2.3473111644957135e-07, "loss": 1.5851, "step": 862 }, { "epoch": 0.979985805535841, "grad_norm": 0.8735281825065613, "learning_rate": 2.1068125406659145e-07, "loss": 1.6212, "step": 863 }, { "epoch": 0.9811213626685593, "grad_norm": 0.9109861254692078, "learning_rate": 1.8792960787774593e-07, "loss": 1.6209, "step": 864 }, { "epoch": 0.9822569198012775, "grad_norm": 0.9178940653800964, "learning_rate": 1.6647647387219023e-07, "loss": 1.5702, "step": 865 }, { "epoch": 0.9833924769339958, "grad_norm": 0.9878839254379272, "learning_rate": 1.463221311459817e-07, "loss": 1.5239, "step": 866 }, { "epoch": 0.984528034066714, "grad_norm": 1.097783088684082, "learning_rate": 1.2746684189846036e-07, "loss": 1.6281, "step": 867 }, { "epoch": 0.9856635911994323, "grad_norm": 1.1349104642868042, "learning_rate": 1.0991085142886271e-07, "loss": 1.6978, "step": 868 }, { "epoch": 0.9867991483321504, "grad_norm": 1.0851463079452515, "learning_rate": 9.365438813306871e-08, "loss": 1.5284, "step": 869 }, { "epoch": 0.9879347054648687, "grad_norm": 1.0201115608215332, "learning_rate": 7.869766350069308e-08, "loss": 1.4334, "step": 870 }, { "epoch": 0.9890702625975869, "grad_norm": 1.0987871885299683, "learning_rate": 6.504087211229859e-08, "loss": 1.5111, "step": 871 }, { "epoch": 0.9902058197303052, "grad_norm": 1.102272391319275, "learning_rate": 5.268419163688698e-08, "loss": 1.4635, "step": 872 }, { "epoch": 0.9913413768630234, "grad_norm": 1.1540968418121338, "learning_rate": 4.1627782829567476e-08, "loss": 1.2795, "step": 873 }, { "epoch": 0.9924769339957417, "grad_norm": 1.1692501306533813, "learning_rate": 3.187178952945846e-08, "loss": 1.3569, "step": 874 }, { "epoch": 0.99361249112846, "grad_norm": 1.2711395025253296, "learning_rate": 2.341633865784454e-08, "loss": 1.3481, "step": 875 }, { "epoch": 0.9947480482611781, "grad_norm": 1.6011962890625, "learning_rate": 1.6261540216522264e-08, "loss": 1.4707, "step": 876 }, { "epoch": 0.9958836053938964, "grad_norm": 1.4436500072479248, "learning_rate": 1.0407487286345774e-08, "loss": 1.2525, "step": 877 }, { "epoch": 0.9970191625266146, "grad_norm": 1.5714635848999023, "learning_rate": 5.854256026027738e-09, "loss": 1.1789, "step": 878 }, { "epoch": 0.9981547196593329, "grad_norm": 2.1908271312713623, "learning_rate": 2.6019056711512614e-09, "loss": 1.2239, "step": 879 }, { "epoch": 0.9992902767920511, "grad_norm": 2.2909140586853027, "learning_rate": 6.50478533403831e-10, "loss": 0.9166, "step": 880 }, { "epoch": 1.0009226401703335, "grad_norm": 6.250060558319092, "learning_rate": 0.0, "loss": 2.6104, "step": 881 } ], "logging_steps": 1, "max_steps": 881, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 221, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3469372301862502e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }