{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989401076146326, "eval_steps": 500, "global_step": 590000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001693118826465479, "grad_norm": 3.73583722114563, "learning_rate": 1.693078694297711e-06, "loss": 10.2763, "step": 1000 }, { "epoch": 0.003386237652930958, "grad_norm": 2.8400094509124756, "learning_rate": 3.386157388595422e-06, "loss": 9.8408, "step": 2000 }, { "epoch": 0.005079356479396437, "grad_norm": 2.7282426357269287, "learning_rate": 5.079236082893133e-06, "loss": 9.4292, "step": 3000 }, { "epoch": 0.006772475305861916, "grad_norm": 2.4836771488189697, "learning_rate": 6.772314777190844e-06, "loss": 9.0685, "step": 4000 }, { "epoch": 0.008465594132327395, "grad_norm": 2.675732135772705, "learning_rate": 8.465393471488555e-06, "loss": 8.7983, "step": 5000 }, { "epoch": 0.010158712958792874, "grad_norm": 2.2680325508117676, "learning_rate": 1.0158472165786267e-05, "loss": 8.5898, "step": 6000 }, { "epoch": 0.011851831785258353, "grad_norm": 3.363144636154175, "learning_rate": 1.184985778138968e-05, "loss": 8.4407, "step": 7000 }, { "epoch": 0.013544950611723831, "grad_norm": 2.426469087600708, "learning_rate": 1.354293647568739e-05, "loss": 8.3586, "step": 8000 }, { "epoch": 0.015238069438189311, "grad_norm": 3.012533187866211, "learning_rate": 1.5234322091290803e-05, "loss": 8.2921, "step": 9000 }, { "epoch": 0.01693118826465479, "grad_norm": 3.929027557373047, "learning_rate": 1.6927400785588516e-05, "loss": 8.2401, "step": 10000 }, { "epoch": 0.01862430709112027, "grad_norm": 3.7916059494018555, "learning_rate": 1.8618786401191928e-05, "loss": 8.1987, "step": 11000 }, { "epoch": 0.020317425917585747, "grad_norm": 4.498620986938477, "learning_rate": 2.031186509548964e-05, "loss": 8.1483, "step": 12000 }, { "epoch": 0.022010544744051227, "grad_norm": 3.9931066036224365, "learning_rate": 2.2001557632398754e-05, "loss": 8.107, "step": 13000 }, { "epoch": 0.023703663570516707, "grad_norm": 3.7879867553710938, "learning_rate": 2.3694636326696464e-05, "loss": 8.0786, "step": 14000 }, { "epoch": 0.025396782396982186, "grad_norm": 4.030205726623535, "learning_rate": 2.538771502099418e-05, "loss": 8.0161, "step": 15000 }, { "epoch": 0.027089901223447663, "grad_norm": 4.0418901443481445, "learning_rate": 2.7079100636597593e-05, "loss": 7.9855, "step": 16000 }, { "epoch": 0.028783020049913143, "grad_norm": 3.499830484390259, "learning_rate": 2.87721793308953e-05, "loss": 7.9227, "step": 17000 }, { "epoch": 0.030476138876378622, "grad_norm": 4.831931114196777, "learning_rate": 3.0463564946498714e-05, "loss": 7.864, "step": 18000 }, { "epoch": 0.0321692577028441, "grad_norm": 3.7220613956451416, "learning_rate": 3.2156643640796424e-05, "loss": 7.8451, "step": 19000 }, { "epoch": 0.03386237652930958, "grad_norm": 3.399836540222168, "learning_rate": 3.384972233509414e-05, "loss": 7.7888, "step": 20000 }, { "epoch": 0.03555549535577506, "grad_norm": 4.145583152770996, "learning_rate": 3.554110795069755e-05, "loss": 7.7317, "step": 21000 }, { "epoch": 0.03724861418224054, "grad_norm": 4.18398380279541, "learning_rate": 3.723418664499526e-05, "loss": 7.6753, "step": 22000 }, { "epoch": 0.038941733008706014, "grad_norm": 4.6186203956604, "learning_rate": 3.8925572260598674e-05, "loss": 7.636, "step": 23000 }, { "epoch": 0.040634851835171494, "grad_norm": 4.769713401794434, "learning_rate": 4.061865095489639e-05, "loss": 7.584, "step": 24000 }, { "epoch": 0.042327970661636974, "grad_norm": 5.2244181632995605, "learning_rate": 4.23100365704998e-05, "loss": 7.525, "step": 25000 }, { "epoch": 0.044021089488102454, "grad_norm": 7.676865100860596, "learning_rate": 4.400311526479751e-05, "loss": 7.4735, "step": 26000 }, { "epoch": 0.04571420831456793, "grad_norm": 6.164961338043213, "learning_rate": 4.5694500880400924e-05, "loss": 7.4157, "step": 27000 }, { "epoch": 0.04740732714103341, "grad_norm": 5.987270355224609, "learning_rate": 4.738757957469863e-05, "loss": 7.3942, "step": 28000 }, { "epoch": 0.04910044596749889, "grad_norm": 5.469903945922852, "learning_rate": 4.9078965190302045e-05, "loss": 7.3438, "step": 29000 }, { "epoch": 0.05079356479396437, "grad_norm": 7.124863624572754, "learning_rate": 4.99593650974703e-05, "loss": 7.2601, "step": 30000 }, { "epoch": 0.052486683620429846, "grad_norm": 7.371854782104492, "learning_rate": 4.987034258074405e-05, "loss": 7.1986, "step": 31000 }, { "epoch": 0.054179802446895325, "grad_norm": 6.575459957122803, "learning_rate": 4.978123095238944e-05, "loss": 7.1281, "step": 32000 }, { "epoch": 0.055872921273360805, "grad_norm": 6.822597980499268, "learning_rate": 4.969220843566319e-05, "loss": 7.0584, "step": 33000 }, { "epoch": 0.057566040099826285, "grad_norm": 6.403082370758057, "learning_rate": 4.960309680730858e-05, "loss": 6.9934, "step": 34000 }, { "epoch": 0.059259158926291765, "grad_norm": 6.652848243713379, "learning_rate": 4.9514074290582326e-05, "loss": 6.8917, "step": 35000 }, { "epoch": 0.060952277752757245, "grad_norm": 7.329777717590332, "learning_rate": 4.942496266222772e-05, "loss": 6.8566, "step": 36000 }, { "epoch": 0.06264539657922272, "grad_norm": 7.5590596199035645, "learning_rate": 4.9335851033873114e-05, "loss": 6.7654, "step": 37000 }, { "epoch": 0.0643385154056882, "grad_norm": 6.460357189178467, "learning_rate": 4.9246739405518505e-05, "loss": 6.6928, "step": 38000 }, { "epoch": 0.06603163423215368, "grad_norm": 7.331843852996826, "learning_rate": 4.915771688879226e-05, "loss": 6.6585, "step": 39000 }, { "epoch": 0.06772475305861916, "grad_norm": 5.962008476257324, "learning_rate": 4.906860526043765e-05, "loss": 6.609, "step": 40000 }, { "epoch": 0.06941787188508464, "grad_norm": 9.334479331970215, "learning_rate": 4.897958274371139e-05, "loss": 6.5652, "step": 41000 }, { "epoch": 0.07111099071155012, "grad_norm": 6.856483459472656, "learning_rate": 4.889047111535679e-05, "loss": 6.5372, "step": 42000 }, { "epoch": 0.0728041095380156, "grad_norm": 9.067439079284668, "learning_rate": 4.8801448598630535e-05, "loss": 6.4876, "step": 43000 }, { "epoch": 0.07449722836448108, "grad_norm": 6.587254524230957, "learning_rate": 4.871233697027593e-05, "loss": 6.4332, "step": 44000 }, { "epoch": 0.07619034719094656, "grad_norm": 8.171180725097656, "learning_rate": 4.8623225341921316e-05, "loss": 6.3993, "step": 45000 }, { "epoch": 0.07788346601741203, "grad_norm": 7.133697986602783, "learning_rate": 4.8534113713566713e-05, "loss": 6.3634, "step": 46000 }, { "epoch": 0.07957658484387752, "grad_norm": 7.917272567749023, "learning_rate": 4.844509119684046e-05, "loss": 6.3226, "step": 47000 }, { "epoch": 0.08126970367034299, "grad_norm": 7.027698993682861, "learning_rate": 4.8355979568485856e-05, "loss": 6.2964, "step": 48000 }, { "epoch": 0.08296282249680847, "grad_norm": 8.726886749267578, "learning_rate": 4.8266867940131247e-05, "loss": 6.2492, "step": 49000 }, { "epoch": 0.08465594132327395, "grad_norm": 7.787764072418213, "learning_rate": 4.817784542340499e-05, "loss": 6.2116, "step": 50000 }, { "epoch": 0.08634906014973943, "grad_norm": 8.023435592651367, "learning_rate": 4.808873379505038e-05, "loss": 6.1969, "step": 51000 }, { "epoch": 0.08804217897620491, "grad_norm": 8.086366653442383, "learning_rate": 4.7999711278324134e-05, "loss": 6.1439, "step": 52000 }, { "epoch": 0.08973529780267038, "grad_norm": 7.9902777671813965, "learning_rate": 4.791059964996953e-05, "loss": 6.1105, "step": 53000 }, { "epoch": 0.09142841662913587, "grad_norm": 7.8094072341918945, "learning_rate": 4.782157713324327e-05, "loss": 6.0733, "step": 54000 }, { "epoch": 0.09312153545560134, "grad_norm": 8.035541534423828, "learning_rate": 4.773246550488867e-05, "loss": 6.0399, "step": 55000 }, { "epoch": 0.09481465428206683, "grad_norm": 7.343162536621094, "learning_rate": 4.764344298816241e-05, "loss": 6.0228, "step": 56000 }, { "epoch": 0.0965077731085323, "grad_norm": 7.9388604164123535, "learning_rate": 4.755433135980781e-05, "loss": 5.9893, "step": 57000 }, { "epoch": 0.09820089193499779, "grad_norm": 8.534358978271484, "learning_rate": 4.7465308843081555e-05, "loss": 5.9828, "step": 58000 }, { "epoch": 0.09989401076146326, "grad_norm": 7.910970687866211, "learning_rate": 4.7376197214726946e-05, "loss": 5.9192, "step": 59000 }, { "epoch": 0.10158712958792875, "grad_norm": 9.482747077941895, "learning_rate": 4.728717469800069e-05, "loss": 5.9024, "step": 60000 }, { "epoch": 0.10328024841439422, "grad_norm": 7.840960502624512, "learning_rate": 4.719806306964609e-05, "loss": 5.8572, "step": 61000 }, { "epoch": 0.10497336724085969, "grad_norm": 6.83944845199585, "learning_rate": 4.710904055291983e-05, "loss": 5.8549, "step": 62000 }, { "epoch": 0.10666648606732518, "grad_norm": 8.34449291229248, "learning_rate": 4.7019928924565224e-05, "loss": 5.8366, "step": 63000 }, { "epoch": 0.10835960489379065, "grad_norm": 7.845147609710693, "learning_rate": 4.6930906407838976e-05, "loss": 5.811, "step": 64000 }, { "epoch": 0.11005272372025614, "grad_norm": 7.984669208526611, "learning_rate": 4.6841794779484366e-05, "loss": 5.792, "step": 65000 }, { "epoch": 0.11174584254672161, "grad_norm": 10.630233764648438, "learning_rate": 4.6752683151129764e-05, "loss": 5.7488, "step": 66000 }, { "epoch": 0.1134389613731871, "grad_norm": 8.382254600524902, "learning_rate": 4.666374974603186e-05, "loss": 5.7276, "step": 67000 }, { "epoch": 0.11513208019965257, "grad_norm": 7.95189094543457, "learning_rate": 4.6574638117677254e-05, "loss": 5.7133, "step": 68000 }, { "epoch": 0.11682519902611806, "grad_norm": 9.147665023803711, "learning_rate": 4.6485526489322645e-05, "loss": 5.6896, "step": 69000 }, { "epoch": 0.11851831785258353, "grad_norm": 7.932515621185303, "learning_rate": 4.639641486096804e-05, "loss": 5.6838, "step": 70000 }, { "epoch": 0.120211436679049, "grad_norm": 10.082840919494629, "learning_rate": 4.630730323261343e-05, "loss": 5.6659, "step": 71000 }, { "epoch": 0.12190455550551449, "grad_norm": 9.887310028076172, "learning_rate": 4.621828071588718e-05, "loss": 5.6522, "step": 72000 }, { "epoch": 0.12359767433197996, "grad_norm": 8.447847366333008, "learning_rate": 4.612925819916093e-05, "loss": 5.6197, "step": 73000 }, { "epoch": 0.12529079315844543, "grad_norm": 8.822169303894043, "learning_rate": 4.604014657080632e-05, "loss": 5.6033, "step": 74000 }, { "epoch": 0.12698391198491094, "grad_norm": 7.646286964416504, "learning_rate": 4.595103494245172e-05, "loss": 5.5813, "step": 75000 }, { "epoch": 0.1286770308113764, "grad_norm": 8.93606948852539, "learning_rate": 4.586192331409711e-05, "loss": 5.5568, "step": 76000 }, { "epoch": 0.13037014963784188, "grad_norm": 9.26745891571045, "learning_rate": 4.577298990899921e-05, "loss": 5.5502, "step": 77000 }, { "epoch": 0.13206326846430735, "grad_norm": 8.381054878234863, "learning_rate": 4.56838782806446e-05, "loss": 5.53, "step": 78000 }, { "epoch": 0.13375638729077285, "grad_norm": 6.793117046356201, "learning_rate": 4.5594766652289996e-05, "loss": 5.5209, "step": 79000 }, { "epoch": 0.13544950611723833, "grad_norm": 7.553661823272705, "learning_rate": 4.550574413556374e-05, "loss": 5.4821, "step": 80000 }, { "epoch": 0.1371426249437038, "grad_norm": 8.121792793273926, "learning_rate": 4.541663250720913e-05, "loss": 5.4981, "step": 81000 }, { "epoch": 0.13883574377016927, "grad_norm": 8.875972747802734, "learning_rate": 4.532760999048288e-05, "loss": 5.4655, "step": 82000 }, { "epoch": 0.14052886259663475, "grad_norm": 7.790102958679199, "learning_rate": 4.5238498362128274e-05, "loss": 5.4544, "step": 83000 }, { "epoch": 0.14222198142310025, "grad_norm": 8.944923400878906, "learning_rate": 4.5149386733773665e-05, "loss": 5.4332, "step": 84000 }, { "epoch": 0.14391510024956572, "grad_norm": 8.458672523498535, "learning_rate": 4.5060275105419055e-05, "loss": 5.4368, "step": 85000 }, { "epoch": 0.1456082190760312, "grad_norm": 8.117480278015137, "learning_rate": 4.4971163477064446e-05, "loss": 5.4112, "step": 86000 }, { "epoch": 0.14730133790249667, "grad_norm": 7.95702600479126, "learning_rate": 4.488205184870984e-05, "loss": 5.3981, "step": 87000 }, { "epoch": 0.14899445672896217, "grad_norm": 8.198105812072754, "learning_rate": 4.4793029331983595e-05, "loss": 5.4053, "step": 88000 }, { "epoch": 0.15068757555542764, "grad_norm": 8.185157775878906, "learning_rate": 4.4703917703628986e-05, "loss": 5.3793, "step": 89000 }, { "epoch": 0.1523806943818931, "grad_norm": 7.615440368652344, "learning_rate": 4.461489518690273e-05, "loss": 5.3605, "step": 90000 }, { "epoch": 0.15407381320835858, "grad_norm": 9.573814392089844, "learning_rate": 4.452578355854812e-05, "loss": 5.3415, "step": 91000 }, { "epoch": 0.15576693203482406, "grad_norm": 8.813225746154785, "learning_rate": 4.4436761041821873e-05, "loss": 5.3376, "step": 92000 }, { "epoch": 0.15746005086128956, "grad_norm": 7.906375885009766, "learning_rate": 4.4347649413467264e-05, "loss": 5.321, "step": 93000 }, { "epoch": 0.15915316968775503, "grad_norm": 8.077609062194824, "learning_rate": 4.425862689674101e-05, "loss": 5.305, "step": 94000 }, { "epoch": 0.1608462885142205, "grad_norm": 8.710689544677734, "learning_rate": 4.41695152683864e-05, "loss": 5.3024, "step": 95000 }, { "epoch": 0.16253940734068598, "grad_norm": 9.949153900146484, "learning_rate": 4.408049275166015e-05, "loss": 5.2722, "step": 96000 }, { "epoch": 0.16423252616715148, "grad_norm": 9.537801742553711, "learning_rate": 4.399138112330554e-05, "loss": 5.2724, "step": 97000 }, { "epoch": 0.16592564499361695, "grad_norm": 9.934480667114258, "learning_rate": 4.3902358606579294e-05, "loss": 5.2578, "step": 98000 }, { "epoch": 0.16761876382008242, "grad_norm": 7.971692085266113, "learning_rate": 4.381333608985304e-05, "loss": 5.2569, "step": 99000 }, { "epoch": 0.1693118826465479, "grad_norm": 8.0557222366333, "learning_rate": 4.372422446149843e-05, "loss": 5.2586, "step": 100000 }, { "epoch": 0.17100500147301337, "grad_norm": 8.172697067260742, "learning_rate": 4.363511283314383e-05, "loss": 5.227, "step": 101000 }, { "epoch": 0.17269812029947887, "grad_norm": 8.191225051879883, "learning_rate": 4.354600120478922e-05, "loss": 5.2291, "step": 102000 }, { "epoch": 0.17439123912594434, "grad_norm": 8.448338508605957, "learning_rate": 4.345697868806296e-05, "loss": 5.2161, "step": 103000 }, { "epoch": 0.17608435795240981, "grad_norm": 9.046798706054688, "learning_rate": 4.3367867059708354e-05, "loss": 5.2055, "step": 104000 }, { "epoch": 0.1777774767788753, "grad_norm": 8.553638458251953, "learning_rate": 4.327875543135375e-05, "loss": 5.1952, "step": 105000 }, { "epoch": 0.17947059560534076, "grad_norm": 7.156827449798584, "learning_rate": 4.3189732914627496e-05, "loss": 5.1824, "step": 106000 }, { "epoch": 0.18116371443180626, "grad_norm": 8.334628105163574, "learning_rate": 4.3100621286272894e-05, "loss": 5.1633, "step": 107000 }, { "epoch": 0.18285683325827173, "grad_norm": 9.823309898376465, "learning_rate": 4.3011509657918284e-05, "loss": 5.1601, "step": 108000 }, { "epoch": 0.1845499520847372, "grad_norm": 9.154176712036133, "learning_rate": 4.2922398029563675e-05, "loss": 5.1498, "step": 109000 }, { "epoch": 0.18624307091120268, "grad_norm": 9.362985610961914, "learning_rate": 4.2833375512837427e-05, "loss": 5.1451, "step": 110000 }, { "epoch": 0.18793618973766818, "grad_norm": 9.546059608459473, "learning_rate": 4.274435299611117e-05, "loss": 5.1565, "step": 111000 }, { "epoch": 0.18962930856413365, "grad_norm": 9.150406837463379, "learning_rate": 4.265524136775656e-05, "loss": 5.109, "step": 112000 }, { "epoch": 0.19132242739059913, "grad_norm": 9.397496223449707, "learning_rate": 4.256612973940195e-05, "loss": 5.102, "step": 113000 }, { "epoch": 0.1930155462170646, "grad_norm": 9.598480224609375, "learning_rate": 4.247701811104735e-05, "loss": 5.0926, "step": 114000 }, { "epoch": 0.19470866504353007, "grad_norm": 8.954512596130371, "learning_rate": 4.2387995594321095e-05, "loss": 5.0797, "step": 115000 }, { "epoch": 0.19640178386999557, "grad_norm": 8.339042663574219, "learning_rate": 4.229888396596649e-05, "loss": 5.0886, "step": 116000 }, { "epoch": 0.19809490269646104, "grad_norm": 8.384330749511719, "learning_rate": 4.220986144924023e-05, "loss": 5.0707, "step": 117000 }, { "epoch": 0.19978802152292652, "grad_norm": 8.838963508605957, "learning_rate": 4.212074982088563e-05, "loss": 5.0758, "step": 118000 }, { "epoch": 0.201481140349392, "grad_norm": 10.049576759338379, "learning_rate": 4.2031727304159374e-05, "loss": 5.0579, "step": 119000 }, { "epoch": 0.2031742591758575, "grad_norm": 10.135797500610352, "learning_rate": 4.194261567580477e-05, "loss": 5.0431, "step": 120000 }, { "epoch": 0.20486737800232296, "grad_norm": 9.766536712646484, "learning_rate": 4.1853593159078516e-05, "loss": 5.0196, "step": 121000 }, { "epoch": 0.20656049682878844, "grad_norm": 9.694602012634277, "learning_rate": 4.176448153072391e-05, "loss": 5.0233, "step": 122000 }, { "epoch": 0.2082536156552539, "grad_norm": 10.742589950561523, "learning_rate": 4.1675369902369304e-05, "loss": 4.9993, "step": 123000 }, { "epoch": 0.20994673448171938, "grad_norm": 8.967914581298828, "learning_rate": 4.1586258274014695e-05, "loss": 5.0079, "step": 124000 }, { "epoch": 0.21163985330818488, "grad_norm": 8.141196250915527, "learning_rate": 4.14973248689168e-05, "loss": 5.0225, "step": 125000 }, { "epoch": 0.21333297213465036, "grad_norm": 9.808989524841309, "learning_rate": 4.140830235219054e-05, "loss": 4.9948, "step": 126000 }, { "epoch": 0.21502609096111583, "grad_norm": 10.123141288757324, "learning_rate": 4.131919072383594e-05, "loss": 4.9652, "step": 127000 }, { "epoch": 0.2167192097875813, "grad_norm": 8.869937896728516, "learning_rate": 4.123007909548133e-05, "loss": 4.9583, "step": 128000 }, { "epoch": 0.2184123286140468, "grad_norm": 9.920904159545898, "learning_rate": 4.1140967467126725e-05, "loss": 4.9562, "step": 129000 }, { "epoch": 0.22010544744051228, "grad_norm": 10.12941837310791, "learning_rate": 4.1051855838772115e-05, "loss": 4.9674, "step": 130000 }, { "epoch": 0.22179856626697775, "grad_norm": 9.176301002502441, "learning_rate": 4.0962744210417506e-05, "loss": 4.9354, "step": 131000 }, { "epoch": 0.22349168509344322, "grad_norm": 8.277559280395508, "learning_rate": 4.087372169369125e-05, "loss": 4.9361, "step": 132000 }, { "epoch": 0.2251848039199087, "grad_norm": 10.194904327392578, "learning_rate": 4.078461006533665e-05, "loss": 4.9215, "step": 133000 }, { "epoch": 0.2268779227463742, "grad_norm": 10.98525619506836, "learning_rate": 4.06955875486104e-05, "loss": 4.934, "step": 134000 }, { "epoch": 0.22857104157283967, "grad_norm": 8.614424705505371, "learning_rate": 4.0606475920255784e-05, "loss": 4.9443, "step": 135000 }, { "epoch": 0.23026416039930514, "grad_norm": 9.043747901916504, "learning_rate": 4.051736429190118e-05, "loss": 4.8811, "step": 136000 }, { "epoch": 0.2319572792257706, "grad_norm": 8.954269409179688, "learning_rate": 4.042834177517493e-05, "loss": 4.8821, "step": 137000 }, { "epoch": 0.2336503980522361, "grad_norm": 9.635639190673828, "learning_rate": 4.0339230146820324e-05, "loss": 4.9014, "step": 138000 }, { "epoch": 0.2353435168787016, "grad_norm": 9.975570678710938, "learning_rate": 4.0250118518465715e-05, "loss": 4.8677, "step": 139000 }, { "epoch": 0.23703663570516706, "grad_norm": 9.619186401367188, "learning_rate": 4.016109600173946e-05, "loss": 4.8723, "step": 140000 }, { "epoch": 0.23872975453163253, "grad_norm": 10.370034217834473, "learning_rate": 4.007198437338485e-05, "loss": 4.8782, "step": 141000 }, { "epoch": 0.240422873358098, "grad_norm": 8.479218482971191, "learning_rate": 3.99829618566586e-05, "loss": 4.8737, "step": 142000 }, { "epoch": 0.2421159921845635, "grad_norm": 8.814338684082031, "learning_rate": 3.989385022830399e-05, "loss": 4.8611, "step": 143000 }, { "epoch": 0.24380911101102898, "grad_norm": 9.030917167663574, "learning_rate": 3.980473859994939e-05, "loss": 4.8265, "step": 144000 }, { "epoch": 0.24550222983749445, "grad_norm": 9.546899795532227, "learning_rate": 3.9715626971594774e-05, "loss": 4.8734, "step": 145000 }, { "epoch": 0.24719534866395992, "grad_norm": 9.005885124206543, "learning_rate": 3.9626604454868526e-05, "loss": 4.8405, "step": 146000 }, { "epoch": 0.24888846749042542, "grad_norm": 9.44839096069336, "learning_rate": 3.9537492826513917e-05, "loss": 4.8229, "step": 147000 }, { "epoch": 0.25058158631689087, "grad_norm": 10.183741569519043, "learning_rate": 3.9448381198159314e-05, "loss": 4.8358, "step": 148000 }, { "epoch": 0.25227470514335637, "grad_norm": 8.858991622924805, "learning_rate": 3.9359269569804704e-05, "loss": 4.8262, "step": 149000 }, { "epoch": 0.25396782396982187, "grad_norm": 8.919591903686523, "learning_rate": 3.927024705307845e-05, "loss": 4.82, "step": 150000 }, { "epoch": 0.2556609427962873, "grad_norm": 9.322907447814941, "learning_rate": 3.918113542472385e-05, "loss": 4.7756, "step": 151000 }, { "epoch": 0.2573540616227528, "grad_norm": 9.72739028930664, "learning_rate": 3.909211290799759e-05, "loss": 4.804, "step": 152000 }, { "epoch": 0.25904718044921826, "grad_norm": 8.676836967468262, "learning_rate": 3.900309039127134e-05, "loss": 4.8092, "step": 153000 }, { "epoch": 0.26074029927568376, "grad_norm": 11.023446083068848, "learning_rate": 3.891397876291673e-05, "loss": 4.777, "step": 154000 }, { "epoch": 0.26243341810214926, "grad_norm": 9.72591495513916, "learning_rate": 3.8824867134562125e-05, "loss": 4.7923, "step": 155000 }, { "epoch": 0.2641265369286147, "grad_norm": 10.463099479675293, "learning_rate": 3.8735755506207516e-05, "loss": 4.7508, "step": 156000 }, { "epoch": 0.2658196557550802, "grad_norm": 9.738799095153809, "learning_rate": 3.864673298948127e-05, "loss": 4.7721, "step": 157000 }, { "epoch": 0.2675127745815457, "grad_norm": 9.083069801330566, "learning_rate": 3.855762136112666e-05, "loss": 4.7609, "step": 158000 }, { "epoch": 0.26920589340801115, "grad_norm": 7.990391254425049, "learning_rate": 3.8468598844400404e-05, "loss": 4.73, "step": 159000 }, { "epoch": 0.27089901223447665, "grad_norm": 10.174395561218262, "learning_rate": 3.83794872160458e-05, "loss": 4.7581, "step": 160000 }, { "epoch": 0.2725921310609421, "grad_norm": 9.703267097473145, "learning_rate": 3.829037558769119e-05, "loss": 4.7478, "step": 161000 }, { "epoch": 0.2742852498874076, "grad_norm": 8.36415958404541, "learning_rate": 3.820126395933659e-05, "loss": 4.7328, "step": 162000 }, { "epoch": 0.2759783687138731, "grad_norm": 9.818341255187988, "learning_rate": 3.811224144261033e-05, "loss": 4.7431, "step": 163000 }, { "epoch": 0.27767148754033855, "grad_norm": 8.965034484863281, "learning_rate": 3.8023129814255724e-05, "loss": 4.7341, "step": 164000 }, { "epoch": 0.27936460636680405, "grad_norm": 8.58529281616211, "learning_rate": 3.7934018185901115e-05, "loss": 4.7321, "step": 165000 }, { "epoch": 0.2810577251932695, "grad_norm": 8.484809875488281, "learning_rate": 3.784490655754651e-05, "loss": 4.6962, "step": 166000 }, { "epoch": 0.282750844019735, "grad_norm": 9.081258773803711, "learning_rate": 3.775588404082026e-05, "loss": 4.6834, "step": 167000 }, { "epoch": 0.2844439628462005, "grad_norm": 10.650101661682129, "learning_rate": 3.766677241246565e-05, "loss": 4.7154, "step": 168000 }, { "epoch": 0.28613708167266594, "grad_norm": 10.91901969909668, "learning_rate": 3.757774989573939e-05, "loss": 4.6855, "step": 169000 }, { "epoch": 0.28783020049913144, "grad_norm": 9.72966194152832, "learning_rate": 3.748863826738479e-05, "loss": 4.697, "step": 170000 }, { "epoch": 0.2895233193255969, "grad_norm": 9.929098129272461, "learning_rate": 3.73997048622869e-05, "loss": 4.6887, "step": 171000 }, { "epoch": 0.2912164381520624, "grad_norm": 10.107782363891602, "learning_rate": 3.731059323393228e-05, "loss": 4.6571, "step": 172000 }, { "epoch": 0.2929095569785279, "grad_norm": 10.362955093383789, "learning_rate": 3.722148160557768e-05, "loss": 4.665, "step": 173000 }, { "epoch": 0.29460267580499333, "grad_norm": 11.539287567138672, "learning_rate": 3.713236997722307e-05, "loss": 4.674, "step": 174000 }, { "epoch": 0.29629579463145883, "grad_norm": 10.165176391601562, "learning_rate": 3.704334746049682e-05, "loss": 4.6532, "step": 175000 }, { "epoch": 0.29798891345792433, "grad_norm": 8.686051368713379, "learning_rate": 3.695423583214221e-05, "loss": 4.6451, "step": 176000 }, { "epoch": 0.2996820322843898, "grad_norm": 9.45073413848877, "learning_rate": 3.686521331541596e-05, "loss": 4.6564, "step": 177000 }, { "epoch": 0.3013751511108553, "grad_norm": 10.110017776489258, "learning_rate": 3.67761907986897e-05, "loss": 4.6755, "step": 178000 }, { "epoch": 0.3030682699373207, "grad_norm": 9.56343936920166, "learning_rate": 3.66870791703351e-05, "loss": 4.6385, "step": 179000 }, { "epoch": 0.3047613887637862, "grad_norm": 8.596985816955566, "learning_rate": 3.659796754198049e-05, "loss": 4.6322, "step": 180000 }, { "epoch": 0.3064545075902517, "grad_norm": 9.146219253540039, "learning_rate": 3.650885591362588e-05, "loss": 4.6202, "step": 181000 }, { "epoch": 0.30814762641671717, "grad_norm": 8.99553108215332, "learning_rate": 3.6419833396899626e-05, "loss": 4.6507, "step": 182000 }, { "epoch": 0.30984074524318267, "grad_norm": 11.528129577636719, "learning_rate": 3.633081088017338e-05, "loss": 4.6303, "step": 183000 }, { "epoch": 0.3115338640696481, "grad_norm": 8.334531784057617, "learning_rate": 3.6241699251818775e-05, "loss": 4.6302, "step": 184000 }, { "epoch": 0.3132269828961136, "grad_norm": 9.167800903320312, "learning_rate": 3.6152587623464165e-05, "loss": 4.6158, "step": 185000 }, { "epoch": 0.3149201017225791, "grad_norm": 10.405464172363281, "learning_rate": 3.6063475995109556e-05, "loss": 4.6053, "step": 186000 }, { "epoch": 0.31661322054904456, "grad_norm": 11.72203540802002, "learning_rate": 3.59744534783833e-05, "loss": 4.6202, "step": 187000 }, { "epoch": 0.31830633937551006, "grad_norm": 8.738761901855469, "learning_rate": 3.58853418500287e-05, "loss": 4.5918, "step": 188000 }, { "epoch": 0.3199994582019755, "grad_norm": 8.735617637634277, "learning_rate": 3.5796319333302444e-05, "loss": 4.5667, "step": 189000 }, { "epoch": 0.321692577028441, "grad_norm": 10.545204162597656, "learning_rate": 3.5707207704947834e-05, "loss": 4.6087, "step": 190000 }, { "epoch": 0.3233856958549065, "grad_norm": 9.377281188964844, "learning_rate": 3.561818518822158e-05, "loss": 4.5919, "step": 191000 }, { "epoch": 0.32507881468137195, "grad_norm": 10.919636726379395, "learning_rate": 3.552907355986698e-05, "loss": 4.5964, "step": 192000 }, { "epoch": 0.32677193350783745, "grad_norm": 12.049422264099121, "learning_rate": 3.544005104314072e-05, "loss": 4.5809, "step": 193000 }, { "epoch": 0.32846505233430295, "grad_norm": 9.932397842407227, "learning_rate": 3.535102852641447e-05, "loss": 4.6008, "step": 194000 }, { "epoch": 0.3301581711607684, "grad_norm": 8.802002906799316, "learning_rate": 3.5261916898059864e-05, "loss": 4.5761, "step": 195000 }, { "epoch": 0.3318512899872339, "grad_norm": 9.653409957885742, "learning_rate": 3.517289438133361e-05, "loss": 4.5689, "step": 196000 }, { "epoch": 0.33354440881369934, "grad_norm": 9.621604919433594, "learning_rate": 3.508378275297901e-05, "loss": 4.5779, "step": 197000 }, { "epoch": 0.33523752764016485, "grad_norm": 9.920105934143066, "learning_rate": 3.49946711246244e-05, "loss": 4.5474, "step": 198000 }, { "epoch": 0.33693064646663035, "grad_norm": 10.116362571716309, "learning_rate": 3.490555949626979e-05, "loss": 4.5571, "step": 199000 }, { "epoch": 0.3386237652930958, "grad_norm": 9.373880386352539, "learning_rate": 3.481653697954353e-05, "loss": 4.5521, "step": 200000 }, { "epoch": 0.3403168841195613, "grad_norm": 10.794736862182617, "learning_rate": 3.472742535118893e-05, "loss": 4.5489, "step": 201000 }, { "epoch": 0.34201000294602674, "grad_norm": 8.700074195861816, "learning_rate": 3.4638402834462676e-05, "loss": 4.5312, "step": 202000 }, { "epoch": 0.34370312177249224, "grad_norm": 11.18816089630127, "learning_rate": 3.454938031773642e-05, "loss": 4.542, "step": 203000 }, { "epoch": 0.34539624059895774, "grad_norm": 8.754980087280273, "learning_rate": 3.446026868938181e-05, "loss": 4.5152, "step": 204000 }, { "epoch": 0.3470893594254232, "grad_norm": 10.25131893157959, "learning_rate": 3.437115706102721e-05, "loss": 4.5353, "step": 205000 }, { "epoch": 0.3487824782518887, "grad_norm": 10.850441932678223, "learning_rate": 3.42820454326726e-05, "loss": 4.5378, "step": 206000 }, { "epoch": 0.35047559707835413, "grad_norm": 11.55842113494873, "learning_rate": 3.419302291594635e-05, "loss": 4.5246, "step": 207000 }, { "epoch": 0.35216871590481963, "grad_norm": 12.282233238220215, "learning_rate": 3.410391128759174e-05, "loss": 4.5172, "step": 208000 }, { "epoch": 0.35386183473128513, "grad_norm": 10.086039543151855, "learning_rate": 3.401488877086549e-05, "loss": 4.5125, "step": 209000 }, { "epoch": 0.3555549535577506, "grad_norm": 11.509523391723633, "learning_rate": 3.392586625413924e-05, "loss": 4.5214, "step": 210000 }, { "epoch": 0.3572480723842161, "grad_norm": 11.7902193069458, "learning_rate": 3.383675462578463e-05, "loss": 4.511, "step": 211000 }, { "epoch": 0.3589411912106815, "grad_norm": 10.49217700958252, "learning_rate": 3.374764299743002e-05, "loss": 4.5221, "step": 212000 }, { "epoch": 0.360634310037147, "grad_norm": 11.799333572387695, "learning_rate": 3.365853136907541e-05, "loss": 4.4908, "step": 213000 }, { "epoch": 0.3623274288636125, "grad_norm": 11.408556938171387, "learning_rate": 3.356950885234916e-05, "loss": 4.4901, "step": 214000 }, { "epoch": 0.36402054769007797, "grad_norm": 9.75319766998291, "learning_rate": 3.348039722399455e-05, "loss": 4.4959, "step": 215000 }, { "epoch": 0.36571366651654347, "grad_norm": 10.989619255065918, "learning_rate": 3.339128559563995e-05, "loss": 4.5005, "step": 216000 }, { "epoch": 0.36740678534300897, "grad_norm": 9.672959327697754, "learning_rate": 3.330217396728534e-05, "loss": 4.4938, "step": 217000 }, { "epoch": 0.3690999041694744, "grad_norm": 11.274438858032227, "learning_rate": 3.3213151450559086e-05, "loss": 4.4772, "step": 218000 }, { "epoch": 0.3707930229959399, "grad_norm": 13.037381172180176, "learning_rate": 3.3124039822204484e-05, "loss": 4.4911, "step": 219000 }, { "epoch": 0.37248614182240536, "grad_norm": 9.693336486816406, "learning_rate": 3.303501730547823e-05, "loss": 4.4854, "step": 220000 }, { "epoch": 0.37417926064887086, "grad_norm": 9.47592830657959, "learning_rate": 3.2945905677123626e-05, "loss": 4.4832, "step": 221000 }, { "epoch": 0.37587237947533636, "grad_norm": 11.473603248596191, "learning_rate": 3.2856883160397365e-05, "loss": 4.4753, "step": 222000 }, { "epoch": 0.3775654983018018, "grad_norm": 12.08969783782959, "learning_rate": 3.276786064367112e-05, "loss": 4.4563, "step": 223000 }, { "epoch": 0.3792586171282673, "grad_norm": 11.439253807067871, "learning_rate": 3.267874901531651e-05, "loss": 4.4652, "step": 224000 }, { "epoch": 0.38095173595473275, "grad_norm": 10.943549156188965, "learning_rate": 3.258972649859026e-05, "loss": 4.4568, "step": 225000 }, { "epoch": 0.38264485478119825, "grad_norm": 10.543309211730957, "learning_rate": 3.250061487023564e-05, "loss": 4.4772, "step": 226000 }, { "epoch": 0.38433797360766375, "grad_norm": 10.24008560180664, "learning_rate": 3.241150324188104e-05, "loss": 4.462, "step": 227000 }, { "epoch": 0.3860310924341292, "grad_norm": 11.649805068969727, "learning_rate": 3.232239161352643e-05, "loss": 4.4553, "step": 228000 }, { "epoch": 0.3877242112605947, "grad_norm": 10.206079483032227, "learning_rate": 3.223336909680018e-05, "loss": 4.4633, "step": 229000 }, { "epoch": 0.38941733008706014, "grad_norm": 12.397170066833496, "learning_rate": 3.214425746844557e-05, "loss": 4.4437, "step": 230000 }, { "epoch": 0.39111044891352564, "grad_norm": 10.84349250793457, "learning_rate": 3.205523495171932e-05, "loss": 4.4263, "step": 231000 }, { "epoch": 0.39280356773999114, "grad_norm": 11.762824058532715, "learning_rate": 3.1966123323364716e-05, "loss": 4.4405, "step": 232000 }, { "epoch": 0.3944966865664566, "grad_norm": 10.64721393585205, "learning_rate": 3.187710080663846e-05, "loss": 4.4537, "step": 233000 }, { "epoch": 0.3961898053929221, "grad_norm": 11.396660804748535, "learning_rate": 3.178798917828386e-05, "loss": 4.4456, "step": 234000 }, { "epoch": 0.3978829242193876, "grad_norm": 11.598654747009277, "learning_rate": 3.16989666615576e-05, "loss": 4.4272, "step": 235000 }, { "epoch": 0.39957604304585304, "grad_norm": 10.858880996704102, "learning_rate": 3.1609855033202994e-05, "loss": 4.4312, "step": 236000 }, { "epoch": 0.40126916187231854, "grad_norm": 10.361440658569336, "learning_rate": 3.152083251647674e-05, "loss": 4.4424, "step": 237000 }, { "epoch": 0.402962280698784, "grad_norm": 10.528017044067383, "learning_rate": 3.143180999975049e-05, "loss": 4.4239, "step": 238000 }, { "epoch": 0.4046553995252495, "grad_norm": 10.247213363647461, "learning_rate": 3.134269837139588e-05, "loss": 4.4317, "step": 239000 }, { "epoch": 0.406348518351715, "grad_norm": 12.395379066467285, "learning_rate": 3.125358674304127e-05, "loss": 4.4198, "step": 240000 }, { "epoch": 0.4080416371781804, "grad_norm": 10.620038986206055, "learning_rate": 3.1164564226315024e-05, "loss": 4.4229, "step": 241000 }, { "epoch": 0.40973475600464593, "grad_norm": 11.194082260131836, "learning_rate": 3.1075452597960415e-05, "loss": 4.4133, "step": 242000 }, { "epoch": 0.4114278748311114, "grad_norm": 10.855474472045898, "learning_rate": 3.098634096960581e-05, "loss": 4.4221, "step": 243000 }, { "epoch": 0.4131209936575769, "grad_norm": 11.166451454162598, "learning_rate": 3.089731845287955e-05, "loss": 4.3956, "step": 244000 }, { "epoch": 0.4148141124840424, "grad_norm": 11.371562957763672, "learning_rate": 3.080820682452495e-05, "loss": 4.4019, "step": 245000 }, { "epoch": 0.4165072313105078, "grad_norm": 10.536613464355469, "learning_rate": 3.071909519617034e-05, "loss": 4.4077, "step": 246000 }, { "epoch": 0.4182003501369733, "grad_norm": 10.86573314666748, "learning_rate": 3.0629983567815736e-05, "loss": 4.3735, "step": 247000 }, { "epoch": 0.41989346896343877, "grad_norm": 10.411300659179688, "learning_rate": 3.054096105108948e-05, "loss": 4.3922, "step": 248000 }, { "epoch": 0.42158658778990427, "grad_norm": 10.708267211914062, "learning_rate": 3.0451849422734875e-05, "loss": 4.4014, "step": 249000 }, { "epoch": 0.42327970661636977, "grad_norm": 11.355742454528809, "learning_rate": 3.0362826906008617e-05, "loss": 4.3933, "step": 250000 }, { "epoch": 0.4249728254428352, "grad_norm": 10.961949348449707, "learning_rate": 3.027380438928237e-05, "loss": 4.3814, "step": 251000 }, { "epoch": 0.4266659442693007, "grad_norm": 11.06654167175293, "learning_rate": 3.0184692760927763e-05, "loss": 4.3668, "step": 252000 }, { "epoch": 0.4283590630957662, "grad_norm": 12.380488395690918, "learning_rate": 3.0095670244201508e-05, "loss": 4.3838, "step": 253000 }, { "epoch": 0.43005218192223166, "grad_norm": 11.07506275177002, "learning_rate": 3.0006558615846902e-05, "loss": 4.3916, "step": 254000 }, { "epoch": 0.43174530074869716, "grad_norm": 9.904339790344238, "learning_rate": 2.9917446987492293e-05, "loss": 4.3939, "step": 255000 }, { "epoch": 0.4334384195751626, "grad_norm": 9.281264305114746, "learning_rate": 2.9828335359137686e-05, "loss": 4.3824, "step": 256000 }, { "epoch": 0.4351315384016281, "grad_norm": 9.245565414428711, "learning_rate": 2.973931284241143e-05, "loss": 4.3637, "step": 257000 }, { "epoch": 0.4368246572280936, "grad_norm": 10.845768928527832, "learning_rate": 2.965020121405683e-05, "loss": 4.3703, "step": 258000 }, { "epoch": 0.43851777605455905, "grad_norm": 11.630556106567383, "learning_rate": 2.9561089585702216e-05, "loss": 4.3614, "step": 259000 }, { "epoch": 0.44021089488102455, "grad_norm": 12.486907958984375, "learning_rate": 2.9472067068975968e-05, "loss": 4.3775, "step": 260000 }, { "epoch": 0.44190401370749, "grad_norm": 11.074153900146484, "learning_rate": 2.9382955440621355e-05, "loss": 4.3638, "step": 261000 }, { "epoch": 0.4435971325339555, "grad_norm": 11.477371215820312, "learning_rate": 2.9293843812266753e-05, "loss": 4.3567, "step": 262000 }, { "epoch": 0.445290251360421, "grad_norm": 12.486824989318848, "learning_rate": 2.9204821295540498e-05, "loss": 4.3488, "step": 263000 }, { "epoch": 0.44698337018688644, "grad_norm": 10.96546745300293, "learning_rate": 2.9115709667185892e-05, "loss": 4.3583, "step": 264000 }, { "epoch": 0.44867648901335194, "grad_norm": 12.240910530090332, "learning_rate": 2.9026598038831282e-05, "loss": 4.367, "step": 265000 }, { "epoch": 0.4503696078398174, "grad_norm": 10.22899055480957, "learning_rate": 2.893757552210503e-05, "loss": 4.3695, "step": 266000 }, { "epoch": 0.4520627266662829, "grad_norm": 10.991477012634277, "learning_rate": 2.8848463893750428e-05, "loss": 4.359, "step": 267000 }, { "epoch": 0.4537558454927484, "grad_norm": 10.273150444030762, "learning_rate": 2.8759352265395815e-05, "loss": 4.3377, "step": 268000 }, { "epoch": 0.45544896431921383, "grad_norm": 11.715832710266113, "learning_rate": 2.8670329748669567e-05, "loss": 4.3392, "step": 269000 }, { "epoch": 0.45714208314567933, "grad_norm": 8.992547035217285, "learning_rate": 2.8581218120314958e-05, "loss": 4.3382, "step": 270000 }, { "epoch": 0.45883520197214483, "grad_norm": 10.941043853759766, "learning_rate": 2.8492106491960352e-05, "loss": 4.3314, "step": 271000 }, { "epoch": 0.4605283207986103, "grad_norm": 9.537842750549316, "learning_rate": 2.8402994863605742e-05, "loss": 4.3379, "step": 272000 }, { "epoch": 0.4622214396250758, "grad_norm": 10.705263137817383, "learning_rate": 2.831397234687949e-05, "loss": 4.3386, "step": 273000 }, { "epoch": 0.4639145584515412, "grad_norm": 10.991068840026855, "learning_rate": 2.822486071852488e-05, "loss": 4.3086, "step": 274000 }, { "epoch": 0.4656076772780067, "grad_norm": 12.084393501281738, "learning_rate": 2.813583820179863e-05, "loss": 4.3363, "step": 275000 }, { "epoch": 0.4673007961044722, "grad_norm": 10.328225135803223, "learning_rate": 2.8046815685072375e-05, "loss": 4.3297, "step": 276000 }, { "epoch": 0.46899391493093767, "grad_norm": 9.782522201538086, "learning_rate": 2.7957793168346124e-05, "loss": 4.32, "step": 277000 }, { "epoch": 0.4706870337574032, "grad_norm": 10.543360710144043, "learning_rate": 2.786868153999152e-05, "loss": 4.3126, "step": 278000 }, { "epoch": 0.4723801525838686, "grad_norm": 11.055947303771973, "learning_rate": 2.7779569911636912e-05, "loss": 4.299, "step": 279000 }, { "epoch": 0.4740732714103341, "grad_norm": 11.253028869628906, "learning_rate": 2.7690458283282306e-05, "loss": 4.3199, "step": 280000 }, { "epoch": 0.4757663902367996, "grad_norm": 13.4835786819458, "learning_rate": 2.7601346654927696e-05, "loss": 4.2942, "step": 281000 }, { "epoch": 0.47745950906326506, "grad_norm": 11.899601936340332, "learning_rate": 2.751223502657309e-05, "loss": 4.3024, "step": 282000 }, { "epoch": 0.47915262788973056, "grad_norm": 10.91974925994873, "learning_rate": 2.7423212509846835e-05, "loss": 4.3092, "step": 283000 }, { "epoch": 0.480845746716196, "grad_norm": 11.593118667602539, "learning_rate": 2.7334189993120584e-05, "loss": 4.3236, "step": 284000 }, { "epoch": 0.4825388655426615, "grad_norm": 11.768682479858398, "learning_rate": 2.7245078364765975e-05, "loss": 4.3097, "step": 285000 }, { "epoch": 0.484231984369127, "grad_norm": 10.96634578704834, "learning_rate": 2.715596673641137e-05, "loss": 4.3143, "step": 286000 }, { "epoch": 0.48592510319559246, "grad_norm": 11.164544105529785, "learning_rate": 2.706685510805676e-05, "loss": 4.2968, "step": 287000 }, { "epoch": 0.48761822202205796, "grad_norm": 10.797432899475098, "learning_rate": 2.697783259133051e-05, "loss": 4.284, "step": 288000 }, { "epoch": 0.48931134084852346, "grad_norm": 11.396538734436035, "learning_rate": 2.6888720962975898e-05, "loss": 4.294, "step": 289000 }, { "epoch": 0.4910044596749889, "grad_norm": 11.784427642822266, "learning_rate": 2.6799609334621295e-05, "loss": 4.2767, "step": 290000 }, { "epoch": 0.4926975785014544, "grad_norm": 11.122530937194824, "learning_rate": 2.6710586817895044e-05, "loss": 4.2904, "step": 291000 }, { "epoch": 0.49439069732791985, "grad_norm": 9.371871948242188, "learning_rate": 2.6621475189540435e-05, "loss": 4.278, "step": 292000 }, { "epoch": 0.49608381615438535, "grad_norm": 10.864598274230957, "learning_rate": 2.6532452672814183e-05, "loss": 4.2813, "step": 293000 }, { "epoch": 0.49777693498085085, "grad_norm": 12.515512466430664, "learning_rate": 2.6443341044459574e-05, "loss": 4.2642, "step": 294000 }, { "epoch": 0.4994700538073163, "grad_norm": 13.125802040100098, "learning_rate": 2.635422941610497e-05, "loss": 4.2572, "step": 295000 }, { "epoch": 0.5011631726337817, "grad_norm": 11.11710262298584, "learning_rate": 2.6265206899378713e-05, "loss": 4.2877, "step": 296000 }, { "epoch": 0.5028562914602472, "grad_norm": 11.456696510314941, "learning_rate": 2.617609527102411e-05, "loss": 4.2837, "step": 297000 }, { "epoch": 0.5045494102867127, "grad_norm": 12.255184173583984, "learning_rate": 2.60869836426695e-05, "loss": 4.2851, "step": 298000 }, { "epoch": 0.5062425291131782, "grad_norm": 10.082769393920898, "learning_rate": 2.599796112594325e-05, "loss": 4.2711, "step": 299000 }, { "epoch": 0.5079356479396437, "grad_norm": 13.198633193969727, "learning_rate": 2.590884949758864e-05, "loss": 4.2591, "step": 300000 }, { "epoch": 0.5096287667661091, "grad_norm": 12.595605850219727, "learning_rate": 2.5819737869234034e-05, "loss": 4.2852, "step": 301000 }, { "epoch": 0.5113218855925746, "grad_norm": 10.539556503295898, "learning_rate": 2.573071535250778e-05, "loss": 4.2609, "step": 302000 }, { "epoch": 0.5130150044190401, "grad_norm": 10.459716796875, "learning_rate": 2.5641603724153173e-05, "loss": 4.2765, "step": 303000 }, { "epoch": 0.5147081232455056, "grad_norm": 10.351983070373535, "learning_rate": 2.5552492095798564e-05, "loss": 4.2923, "step": 304000 }, { "epoch": 0.5164012420719711, "grad_norm": 11.496223449707031, "learning_rate": 2.5463380467443957e-05, "loss": 4.2773, "step": 305000 }, { "epoch": 0.5180943608984365, "grad_norm": 10.303821563720703, "learning_rate": 2.537435795071771e-05, "loss": 4.2478, "step": 306000 }, { "epoch": 0.519787479724902, "grad_norm": 10.091544151306152, "learning_rate": 2.52852463223631e-05, "loss": 4.2524, "step": 307000 }, { "epoch": 0.5214805985513675, "grad_norm": 11.309797286987305, "learning_rate": 2.519622380563685e-05, "loss": 4.254, "step": 308000 }, { "epoch": 0.523173717377833, "grad_norm": 13.143366813659668, "learning_rate": 2.5107201288910594e-05, "loss": 4.2523, "step": 309000 }, { "epoch": 0.5248668362042985, "grad_norm": 11.699246406555176, "learning_rate": 2.5018089660555988e-05, "loss": 4.2237, "step": 310000 }, { "epoch": 0.5265599550307639, "grad_norm": 9.942435264587402, "learning_rate": 2.492897803220138e-05, "loss": 4.2646, "step": 311000 }, { "epoch": 0.5282530738572294, "grad_norm": 12.417354583740234, "learning_rate": 2.4839866403846772e-05, "loss": 4.2381, "step": 312000 }, { "epoch": 0.5299461926836949, "grad_norm": 12.108587265014648, "learning_rate": 2.475084388712052e-05, "loss": 4.2422, "step": 313000 }, { "epoch": 0.5316393115101604, "grad_norm": 10.721087455749512, "learning_rate": 2.4661821370394266e-05, "loss": 4.2609, "step": 314000 }, { "epoch": 0.5333324303366259, "grad_norm": 13.240788459777832, "learning_rate": 2.457270974203966e-05, "loss": 4.2359, "step": 315000 }, { "epoch": 0.5350255491630914, "grad_norm": 11.577778816223145, "learning_rate": 2.4483598113685054e-05, "loss": 4.2542, "step": 316000 }, { "epoch": 0.5367186679895568, "grad_norm": 9.913931846618652, "learning_rate": 2.4394486485330444e-05, "loss": 4.2516, "step": 317000 }, { "epoch": 0.5384117868160223, "grad_norm": 11.693208694458008, "learning_rate": 2.4305463968604193e-05, "loss": 4.2435, "step": 318000 }, { "epoch": 0.5401049056424878, "grad_norm": 11.548184394836426, "learning_rate": 2.4216352340249584e-05, "loss": 4.2026, "step": 319000 }, { "epoch": 0.5417980244689533, "grad_norm": 11.510560989379883, "learning_rate": 2.4127329823523332e-05, "loss": 4.2504, "step": 320000 }, { "epoch": 0.5434911432954188, "grad_norm": 9.461125373840332, "learning_rate": 2.403830730679708e-05, "loss": 4.2408, "step": 321000 }, { "epoch": 0.5451842621218842, "grad_norm": 10.670475006103516, "learning_rate": 2.3949195678442475e-05, "loss": 4.2281, "step": 322000 }, { "epoch": 0.5468773809483497, "grad_norm": 9.988083839416504, "learning_rate": 2.3860084050087865e-05, "loss": 4.2355, "step": 323000 }, { "epoch": 0.5485704997748152, "grad_norm": 12.351629257202148, "learning_rate": 2.377097242173326e-05, "loss": 4.2195, "step": 324000 }, { "epoch": 0.5502636186012807, "grad_norm": 11.589841842651367, "learning_rate": 2.3681949905007004e-05, "loss": 4.2458, "step": 325000 }, { "epoch": 0.5519567374277462, "grad_norm": 11.782110214233398, "learning_rate": 2.3592927388280753e-05, "loss": 4.2491, "step": 326000 }, { "epoch": 0.5536498562542116, "grad_norm": 13.562594413757324, "learning_rate": 2.3503815759926147e-05, "loss": 4.2327, "step": 327000 }, { "epoch": 0.5553429750806771, "grad_norm": 13.059347152709961, "learning_rate": 2.3414704131571537e-05, "loss": 4.2369, "step": 328000 }, { "epoch": 0.5570360939071426, "grad_norm": 10.70898151397705, "learning_rate": 2.332559250321693e-05, "loss": 4.2061, "step": 329000 }, { "epoch": 0.5587292127336081, "grad_norm": 12.004964828491211, "learning_rate": 2.3236569986490677e-05, "loss": 4.2369, "step": 330000 }, { "epoch": 0.5604223315600736, "grad_norm": 11.758599281311035, "learning_rate": 2.314745835813607e-05, "loss": 4.2229, "step": 331000 }, { "epoch": 0.562115450386539, "grad_norm": 11.828540802001953, "learning_rate": 2.3058524953038174e-05, "loss": 4.2313, "step": 332000 }, { "epoch": 0.5638085692130045, "grad_norm": 11.63815689086914, "learning_rate": 2.2969413324683568e-05, "loss": 4.2225, "step": 333000 }, { "epoch": 0.56550168803947, "grad_norm": 9.943192481994629, "learning_rate": 2.2880301696328958e-05, "loss": 4.218, "step": 334000 }, { "epoch": 0.5671948068659355, "grad_norm": 11.783443450927734, "learning_rate": 2.2791190067974352e-05, "loss": 4.2229, "step": 335000 }, { "epoch": 0.568887925692401, "grad_norm": 12.504878997802734, "learning_rate": 2.2702256662876452e-05, "loss": 4.2179, "step": 336000 }, { "epoch": 0.5705810445188664, "grad_norm": 10.476326942443848, "learning_rate": 2.2613145034521846e-05, "loss": 4.2123, "step": 337000 }, { "epoch": 0.5722741633453319, "grad_norm": 10.85536003112793, "learning_rate": 2.252403340616724e-05, "loss": 4.1974, "step": 338000 }, { "epoch": 0.5739672821717974, "grad_norm": 11.811236381530762, "learning_rate": 2.2435010889440985e-05, "loss": 4.2133, "step": 339000 }, { "epoch": 0.5756604009982629, "grad_norm": 12.03921890258789, "learning_rate": 2.234589926108638e-05, "loss": 4.2252, "step": 340000 }, { "epoch": 0.5773535198247284, "grad_norm": 11.069652557373047, "learning_rate": 2.225678763273177e-05, "loss": 4.221, "step": 341000 }, { "epoch": 0.5790466386511938, "grad_norm": 12.02214527130127, "learning_rate": 2.2167676004377164e-05, "loss": 4.2036, "step": 342000 }, { "epoch": 0.5807397574776593, "grad_norm": 12.83344554901123, "learning_rate": 2.2078742599279267e-05, "loss": 4.1897, "step": 343000 }, { "epoch": 0.5824328763041248, "grad_norm": 10.495333671569824, "learning_rate": 2.1989720082553015e-05, "loss": 4.216, "step": 344000 }, { "epoch": 0.5841259951305903, "grad_norm": 10.508852005004883, "learning_rate": 2.1900608454198406e-05, "loss": 4.1973, "step": 345000 }, { "epoch": 0.5858191139570558, "grad_norm": 12.272819519042969, "learning_rate": 2.18114968258438e-05, "loss": 4.2004, "step": 346000 }, { "epoch": 0.5875122327835212, "grad_norm": 11.147738456726074, "learning_rate": 2.1722385197489194e-05, "loss": 4.1905, "step": 347000 }, { "epoch": 0.5892053516099867, "grad_norm": 10.011626243591309, "learning_rate": 2.1633273569134584e-05, "loss": 4.1874, "step": 348000 }, { "epoch": 0.5908984704364522, "grad_norm": 10.60913372039795, "learning_rate": 2.154416194077998e-05, "loss": 4.2079, "step": 349000 }, { "epoch": 0.5925915892629177, "grad_norm": 13.647647857666016, "learning_rate": 2.1455139424053724e-05, "loss": 4.1992, "step": 350000 }, { "epoch": 0.5942847080893832, "grad_norm": 11.93554401397705, "learning_rate": 2.1366027795699117e-05, "loss": 4.2155, "step": 351000 }, { "epoch": 0.5959778269158487, "grad_norm": 12.616731643676758, "learning_rate": 2.1277005278972863e-05, "loss": 4.169, "step": 352000 }, { "epoch": 0.597670945742314, "grad_norm": 9.891210556030273, "learning_rate": 2.118798276224661e-05, "loss": 4.1927, "step": 353000 }, { "epoch": 0.5993640645687796, "grad_norm": 13.135881423950195, "learning_rate": 2.1098871133892005e-05, "loss": 4.1885, "step": 354000 }, { "epoch": 0.601057183395245, "grad_norm": 11.595895767211914, "learning_rate": 2.1009759505537396e-05, "loss": 4.1848, "step": 355000 }, { "epoch": 0.6027503022217106, "grad_norm": 12.601067543029785, "learning_rate": 2.092064787718279e-05, "loss": 4.1798, "step": 356000 }, { "epoch": 0.604443421048176, "grad_norm": 10.613443374633789, "learning_rate": 2.0831714472084893e-05, "loss": 4.1857, "step": 357000 }, { "epoch": 0.6061365398746414, "grad_norm": 10.182647705078125, "learning_rate": 2.0742602843730287e-05, "loss": 4.1873, "step": 358000 }, { "epoch": 0.607829658701107, "grad_norm": 12.377314567565918, "learning_rate": 2.0653491215375677e-05, "loss": 4.1668, "step": 359000 }, { "epoch": 0.6095227775275724, "grad_norm": 11.106602668762207, "learning_rate": 2.0564468698649426e-05, "loss": 4.1565, "step": 360000 }, { "epoch": 0.611215896354038, "grad_norm": 12.362112045288086, "learning_rate": 2.0475357070294817e-05, "loss": 4.1579, "step": 361000 }, { "epoch": 0.6129090151805034, "grad_norm": 11.545656204223633, "learning_rate": 2.038624544194021e-05, "loss": 4.154, "step": 362000 }, { "epoch": 0.6146021340069688, "grad_norm": 10.845215797424316, "learning_rate": 2.0297133813585604e-05, "loss": 4.1577, "step": 363000 }, { "epoch": 0.6162952528334343, "grad_norm": 13.672916412353516, "learning_rate": 2.020811129685935e-05, "loss": 4.1682, "step": 364000 }, { "epoch": 0.6179883716598998, "grad_norm": 10.274605751037598, "learning_rate": 2.0118999668504744e-05, "loss": 4.1803, "step": 365000 }, { "epoch": 0.6196814904863653, "grad_norm": 9.770240783691406, "learning_rate": 2.0029888040150134e-05, "loss": 4.1574, "step": 366000 }, { "epoch": 0.6213746093128308, "grad_norm": 10.931896209716797, "learning_rate": 1.9940865523423883e-05, "loss": 4.1755, "step": 367000 }, { "epoch": 0.6230677281392962, "grad_norm": 11.029019355773926, "learning_rate": 1.9851753895069273e-05, "loss": 4.1708, "step": 368000 }, { "epoch": 0.6247608469657617, "grad_norm": 13.196535110473633, "learning_rate": 1.976264226671467e-05, "loss": 4.1821, "step": 369000 }, { "epoch": 0.6264539657922272, "grad_norm": 14.512234687805176, "learning_rate": 1.9673619749988416e-05, "loss": 4.159, "step": 370000 }, { "epoch": 0.6281470846186927, "grad_norm": 12.166828155517578, "learning_rate": 1.958450812163381e-05, "loss": 4.1548, "step": 371000 }, { "epoch": 0.6298402034451582, "grad_norm": 11.77064323425293, "learning_rate": 1.949548560490756e-05, "loss": 4.1575, "step": 372000 }, { "epoch": 0.6315333222716236, "grad_norm": 14.441299438476562, "learning_rate": 1.940637397655295e-05, "loss": 4.1776, "step": 373000 }, { "epoch": 0.6332264410980891, "grad_norm": 10.1741943359375, "learning_rate": 1.9317262348198343e-05, "loss": 4.179, "step": 374000 }, { "epoch": 0.6349195599245546, "grad_norm": 12.787932395935059, "learning_rate": 1.9228150719843733e-05, "loss": 4.1798, "step": 375000 }, { "epoch": 0.6366126787510201, "grad_norm": 12.913057327270508, "learning_rate": 1.9139128203117482e-05, "loss": 4.1526, "step": 376000 }, { "epoch": 0.6383057975774856, "grad_norm": 13.602836608886719, "learning_rate": 1.9050016574762876e-05, "loss": 4.1624, "step": 377000 }, { "epoch": 0.639998916403951, "grad_norm": 10.920446395874023, "learning_rate": 1.896099405803662e-05, "loss": 4.1483, "step": 378000 }, { "epoch": 0.6416920352304165, "grad_norm": 14.04163932800293, "learning_rate": 1.8871882429682015e-05, "loss": 4.1704, "step": 379000 }, { "epoch": 0.643385154056882, "grad_norm": 14.041404724121094, "learning_rate": 1.8782859912955764e-05, "loss": 4.1614, "step": 380000 }, { "epoch": 0.6450782728833475, "grad_norm": 9.533023834228516, "learning_rate": 1.8693748284601158e-05, "loss": 4.167, "step": 381000 }, { "epoch": 0.646771391709813, "grad_norm": 10.013155937194824, "learning_rate": 1.8604636656246548e-05, "loss": 4.1332, "step": 382000 }, { "epoch": 0.6484645105362784, "grad_norm": 12.149203300476074, "learning_rate": 1.8515525027891942e-05, "loss": 4.1555, "step": 383000 }, { "epoch": 0.6501576293627439, "grad_norm": 12.2727689743042, "learning_rate": 1.8426502511165687e-05, "loss": 4.1612, "step": 384000 }, { "epoch": 0.6518507481892094, "grad_norm": 11.223430633544922, "learning_rate": 1.833739088281108e-05, "loss": 4.1498, "step": 385000 }, { "epoch": 0.6535438670156749, "grad_norm": 12.815463066101074, "learning_rate": 1.824845747771318e-05, "loss": 4.1428, "step": 386000 }, { "epoch": 0.6552369858421404, "grad_norm": 10.58801555633545, "learning_rate": 1.8159345849358575e-05, "loss": 4.1308, "step": 387000 }, { "epoch": 0.6569301046686059, "grad_norm": 10.253556251525879, "learning_rate": 1.807023422100397e-05, "loss": 4.1383, "step": 388000 }, { "epoch": 0.6586232234950713, "grad_norm": 10.813176155090332, "learning_rate": 1.798112259264936e-05, "loss": 4.136, "step": 389000 }, { "epoch": 0.6603163423215368, "grad_norm": 11.76603889465332, "learning_rate": 1.7892100075923108e-05, "loss": 4.1407, "step": 390000 }, { "epoch": 0.6620094611480023, "grad_norm": 10.446664810180664, "learning_rate": 1.78029884475685e-05, "loss": 4.1415, "step": 391000 }, { "epoch": 0.6637025799744678, "grad_norm": 13.31120777130127, "learning_rate": 1.7713876819213893e-05, "loss": 4.1397, "step": 392000 }, { "epoch": 0.6653956988009333, "grad_norm": 15.533784866333008, "learning_rate": 1.7624765190859286e-05, "loss": 4.1222, "step": 393000 }, { "epoch": 0.6670888176273987, "grad_norm": 13.553240776062012, "learning_rate": 1.7535742674133035e-05, "loss": 4.1252, "step": 394000 }, { "epoch": 0.6687819364538642, "grad_norm": 13.170623779296875, "learning_rate": 1.744663104577843e-05, "loss": 4.1214, "step": 395000 }, { "epoch": 0.6704750552803297, "grad_norm": 11.393040657043457, "learning_rate": 1.735769764068053e-05, "loss": 4.1285, "step": 396000 }, { "epoch": 0.6721681741067952, "grad_norm": 13.054417610168457, "learning_rate": 1.7268586012325923e-05, "loss": 4.1056, "step": 397000 }, { "epoch": 0.6738612929332607, "grad_norm": 11.749643325805664, "learning_rate": 1.7179474383971313e-05, "loss": 4.1255, "step": 398000 }, { "epoch": 0.6755544117597261, "grad_norm": 11.650969505310059, "learning_rate": 1.7090362755616707e-05, "loss": 4.1176, "step": 399000 }, { "epoch": 0.6772475305861916, "grad_norm": 14.929516792297363, "learning_rate": 1.7001340238890453e-05, "loss": 4.1658, "step": 400000 }, { "epoch": 0.6789406494126571, "grad_norm": 12.77851676940918, "learning_rate": 1.69123177221642e-05, "loss": 4.1349, "step": 401000 }, { "epoch": 0.6806337682391226, "grad_norm": 13.427529335021973, "learning_rate": 1.682320609380959e-05, "loss": 4.1122, "step": 402000 }, { "epoch": 0.6823268870655881, "grad_norm": 11.719392776489258, "learning_rate": 1.6734183577083344e-05, "loss": 4.1095, "step": 403000 }, { "epoch": 0.6840200058920535, "grad_norm": 11.567068099975586, "learning_rate": 1.6645071948728734e-05, "loss": 4.114, "step": 404000 }, { "epoch": 0.685713124718519, "grad_norm": 10.441802978515625, "learning_rate": 1.6555960320374128e-05, "loss": 4.1251, "step": 405000 }, { "epoch": 0.6874062435449845, "grad_norm": 12.035784721374512, "learning_rate": 1.6466937803647873e-05, "loss": 4.1224, "step": 406000 }, { "epoch": 0.68909936237145, "grad_norm": 12.039976119995117, "learning_rate": 1.6377826175293267e-05, "loss": 4.1421, "step": 407000 }, { "epoch": 0.6907924811979155, "grad_norm": 13.000835418701172, "learning_rate": 1.6288803658567016e-05, "loss": 4.1397, "step": 408000 }, { "epoch": 0.6924856000243809, "grad_norm": 12.320772171020508, "learning_rate": 1.6199692030212406e-05, "loss": 4.1127, "step": 409000 }, { "epoch": 0.6941787188508464, "grad_norm": 13.898431777954102, "learning_rate": 1.61105804018578e-05, "loss": 4.1244, "step": 410000 }, { "epoch": 0.6958718376773119, "grad_norm": 14.389370918273926, "learning_rate": 1.6021468773503194e-05, "loss": 4.1049, "step": 411000 }, { "epoch": 0.6975649565037774, "grad_norm": 10.797822952270508, "learning_rate": 1.593244625677694e-05, "loss": 4.1041, "step": 412000 }, { "epoch": 0.6992580753302429, "grad_norm": 13.629505157470703, "learning_rate": 1.5843334628422333e-05, "loss": 4.1088, "step": 413000 }, { "epoch": 0.7009511941567083, "grad_norm": 11.743549346923828, "learning_rate": 1.5754223000067724e-05, "loss": 4.1008, "step": 414000 }, { "epoch": 0.7026443129831738, "grad_norm": 12.309115409851074, "learning_rate": 1.5665200483341473e-05, "loss": 4.0961, "step": 415000 }, { "epoch": 0.7043374318096393, "grad_norm": 14.773079872131348, "learning_rate": 1.5576088854986863e-05, "loss": 4.0963, "step": 416000 }, { "epoch": 0.7060305506361048, "grad_norm": 11.214386940002441, "learning_rate": 1.5486977226632257e-05, "loss": 4.0939, "step": 417000 }, { "epoch": 0.7077236694625703, "grad_norm": 12.776312828063965, "learning_rate": 1.5397954709906006e-05, "loss": 4.093, "step": 418000 }, { "epoch": 0.7094167882890356, "grad_norm": 11.701675415039062, "learning_rate": 1.53088430815514e-05, "loss": 4.0999, "step": 419000 }, { "epoch": 0.7111099071155011, "grad_norm": 12.045074462890625, "learning_rate": 1.5219909676453501e-05, "loss": 4.1207, "step": 420000 }, { "epoch": 0.7128030259419667, "grad_norm": 12.85261344909668, "learning_rate": 1.5130798048098893e-05, "loss": 4.1473, "step": 421000 }, { "epoch": 0.7144961447684322, "grad_norm": 12.787480354309082, "learning_rate": 1.5041686419744286e-05, "loss": 4.0806, "step": 422000 }, { "epoch": 0.7161892635948977, "grad_norm": 10.421377182006836, "learning_rate": 1.4952574791389678e-05, "loss": 4.0918, "step": 423000 }, { "epoch": 0.717882382421363, "grad_norm": 11.66174030303955, "learning_rate": 1.4863463163035072e-05, "loss": 4.103, "step": 424000 }, { "epoch": 0.7195755012478285, "grad_norm": 11.840550422668457, "learning_rate": 1.4774351534680464e-05, "loss": 4.099, "step": 425000 }, { "epoch": 0.721268620074294, "grad_norm": 11.197615623474121, "learning_rate": 1.4685329017954211e-05, "loss": 4.0854, "step": 426000 }, { "epoch": 0.7229617389007595, "grad_norm": 11.457440376281738, "learning_rate": 1.4596306501227958e-05, "loss": 4.0968, "step": 427000 }, { "epoch": 0.724654857727225, "grad_norm": 10.805425643920898, "learning_rate": 1.450719487287335e-05, "loss": 4.0868, "step": 428000 }, { "epoch": 0.7263479765536905, "grad_norm": 11.462552070617676, "learning_rate": 1.44181723561471e-05, "loss": 4.1, "step": 429000 }, { "epoch": 0.7280410953801559, "grad_norm": 12.227815628051758, "learning_rate": 1.4329060727792493e-05, "loss": 4.1138, "step": 430000 }, { "epoch": 0.7297342142066214, "grad_norm": 14.584619522094727, "learning_rate": 1.424003821106624e-05, "loss": 4.0996, "step": 431000 }, { "epoch": 0.7314273330330869, "grad_norm": 12.448081016540527, "learning_rate": 1.4150926582711632e-05, "loss": 4.0947, "step": 432000 }, { "epoch": 0.7331204518595524, "grad_norm": 13.15820598602295, "learning_rate": 1.4061904065985379e-05, "loss": 4.0902, "step": 433000 }, { "epoch": 0.7348135706860179, "grad_norm": 12.201361656188965, "learning_rate": 1.3972792437630771e-05, "loss": 4.0912, "step": 434000 }, { "epoch": 0.7365066895124833, "grad_norm": 11.291053771972656, "learning_rate": 1.3883680809276165e-05, "loss": 4.0518, "step": 435000 }, { "epoch": 0.7381998083389488, "grad_norm": 11.034360885620117, "learning_rate": 1.3794569180921557e-05, "loss": 4.0815, "step": 436000 }, { "epoch": 0.7398929271654143, "grad_norm": 14.517773628234863, "learning_rate": 1.3705546664195304e-05, "loss": 4.0943, "step": 437000 }, { "epoch": 0.7415860459918798, "grad_norm": 11.992380142211914, "learning_rate": 1.3616435035840696e-05, "loss": 4.0759, "step": 438000 }, { "epoch": 0.7432791648183453, "grad_norm": 12.607243537902832, "learning_rate": 1.352732340748609e-05, "loss": 4.0776, "step": 439000 }, { "epoch": 0.7449722836448107, "grad_norm": 12.92636775970459, "learning_rate": 1.3438211779131482e-05, "loss": 4.0563, "step": 440000 }, { "epoch": 0.7466654024712762, "grad_norm": 12.24993896484375, "learning_rate": 1.3349189262405231e-05, "loss": 4.0987, "step": 441000 }, { "epoch": 0.7483585212977417, "grad_norm": 12.759634017944336, "learning_rate": 1.3260077634050625e-05, "loss": 4.0642, "step": 442000 }, { "epoch": 0.7500516401242072, "grad_norm": 12.089138984680176, "learning_rate": 1.3171055117324372e-05, "loss": 4.0994, "step": 443000 }, { "epoch": 0.7517447589506727, "grad_norm": 15.71636962890625, "learning_rate": 1.3081943488969764e-05, "loss": 4.0915, "step": 444000 }, { "epoch": 0.7534378777771381, "grad_norm": 11.728761672973633, "learning_rate": 1.2992920972243511e-05, "loss": 4.0772, "step": 445000 }, { "epoch": 0.7551309966036036, "grad_norm": 12.53087329864502, "learning_rate": 1.2903809343888903e-05, "loss": 4.0878, "step": 446000 }, { "epoch": 0.7568241154300691, "grad_norm": 13.897101402282715, "learning_rate": 1.2814697715534295e-05, "loss": 4.0729, "step": 447000 }, { "epoch": 0.7585172342565346, "grad_norm": 12.830127716064453, "learning_rate": 1.2725675198808042e-05, "loss": 4.0979, "step": 448000 }, { "epoch": 0.7602103530830001, "grad_norm": 12.351402282714844, "learning_rate": 1.2636563570453436e-05, "loss": 4.0912, "step": 449000 }, { "epoch": 0.7619034719094655, "grad_norm": 12.197295188903809, "learning_rate": 1.2547541053727183e-05, "loss": 4.0617, "step": 450000 }, { "epoch": 0.763596590735931, "grad_norm": 12.29218864440918, "learning_rate": 1.2458429425372577e-05, "loss": 4.082, "step": 451000 }, { "epoch": 0.7652897095623965, "grad_norm": 11.651473045349121, "learning_rate": 1.2369406908646324e-05, "loss": 4.0782, "step": 452000 }, { "epoch": 0.766982828388862, "grad_norm": 13.120807647705078, "learning_rate": 1.2280295280291716e-05, "loss": 4.0796, "step": 453000 }, { "epoch": 0.7686759472153275, "grad_norm": 11.740452766418457, "learning_rate": 1.2191183651937108e-05, "loss": 4.0621, "step": 454000 }, { "epoch": 0.7703690660417929, "grad_norm": 11.943509101867676, "learning_rate": 1.2102072023582502e-05, "loss": 4.0776, "step": 455000 }, { "epoch": 0.7720621848682584, "grad_norm": 10.697245597839355, "learning_rate": 1.201304950685625e-05, "loss": 4.0527, "step": 456000 }, { "epoch": 0.7737553036947239, "grad_norm": 12.905070304870605, "learning_rate": 1.1924026990129996e-05, "loss": 4.0605, "step": 457000 }, { "epoch": 0.7754484225211894, "grad_norm": 11.206037521362305, "learning_rate": 1.183491536177539e-05, "loss": 4.0889, "step": 458000 }, { "epoch": 0.7771415413476549, "grad_norm": 11.833847999572754, "learning_rate": 1.1745803733420782e-05, "loss": 4.0663, "step": 459000 }, { "epoch": 0.7788346601741203, "grad_norm": 12.658822059631348, "learning_rate": 1.1656692105066175e-05, "loss": 4.0612, "step": 460000 }, { "epoch": 0.7805277790005858, "grad_norm": 13.751755714416504, "learning_rate": 1.1567758699968278e-05, "loss": 4.0756, "step": 461000 }, { "epoch": 0.7822208978270513, "grad_norm": 11.52566146850586, "learning_rate": 1.147864707161367e-05, "loss": 4.0716, "step": 462000 }, { "epoch": 0.7839140166535168, "grad_norm": 13.863004684448242, "learning_rate": 1.1389535443259062e-05, "loss": 4.0598, "step": 463000 }, { "epoch": 0.7856071354799823, "grad_norm": 13.530219078063965, "learning_rate": 1.130051292653281e-05, "loss": 4.0925, "step": 464000 }, { "epoch": 0.7873002543064478, "grad_norm": 12.280829429626465, "learning_rate": 1.1211401298178202e-05, "loss": 4.0854, "step": 465000 }, { "epoch": 0.7889933731329132, "grad_norm": 13.754987716674805, "learning_rate": 1.1122289669823595e-05, "loss": 4.0709, "step": 466000 }, { "epoch": 0.7906864919593787, "grad_norm": 12.556035041809082, "learning_rate": 1.1033267153097342e-05, "loss": 4.0491, "step": 467000 }, { "epoch": 0.7923796107858442, "grad_norm": 13.475341796875, "learning_rate": 1.0944155524742736e-05, "loss": 4.0659, "step": 468000 }, { "epoch": 0.7940727296123097, "grad_norm": 11.218876838684082, "learning_rate": 1.0855043896388129e-05, "loss": 4.0621, "step": 469000 }, { "epoch": 0.7957658484387752, "grad_norm": 12.517721176147461, "learning_rate": 1.076593226803352e-05, "loss": 4.0948, "step": 470000 }, { "epoch": 0.7974589672652406, "grad_norm": 13.748697280883789, "learning_rate": 1.0676909751307268e-05, "loss": 4.0575, "step": 471000 }, { "epoch": 0.7991520860917061, "grad_norm": 12.449298858642578, "learning_rate": 1.058779812295266e-05, "loss": 4.0555, "step": 472000 }, { "epoch": 0.8008452049181716, "grad_norm": 13.441361427307129, "learning_rate": 1.0498775606226407e-05, "loss": 4.0768, "step": 473000 }, { "epoch": 0.8025383237446371, "grad_norm": 12.1597900390625, "learning_rate": 1.04096639778718e-05, "loss": 4.046, "step": 474000 }, { "epoch": 0.8042314425711026, "grad_norm": 12.69150447845459, "learning_rate": 1.0320641461145548e-05, "loss": 4.0557, "step": 475000 }, { "epoch": 0.805924561397568, "grad_norm": 16.03168487548828, "learning_rate": 1.0231529832790942e-05, "loss": 4.0392, "step": 476000 }, { "epoch": 0.8076176802240335, "grad_norm": 11.598551750183105, "learning_rate": 1.0142418204436334e-05, "loss": 4.0562, "step": 477000 }, { "epoch": 0.809310799050499, "grad_norm": 12.090052604675293, "learning_rate": 1.0053306576081726e-05, "loss": 4.0469, "step": 478000 }, { "epoch": 0.8110039178769645, "grad_norm": 13.248003959655762, "learning_rate": 9.964284059355473e-06, "loss": 4.0415, "step": 479000 }, { "epoch": 0.81269703670343, "grad_norm": 10.98791790008545, "learning_rate": 9.875172431000867e-06, "loss": 4.0289, "step": 480000 }, { "epoch": 0.8143901555298954, "grad_norm": 12.991023063659668, "learning_rate": 9.786149914274614e-06, "loss": 4.0568, "step": 481000 }, { "epoch": 0.8160832743563609, "grad_norm": 14.675666809082031, "learning_rate": 9.697038285920008e-06, "loss": 4.0407, "step": 482000 }, { "epoch": 0.8177763931828264, "grad_norm": 12.550374984741211, "learning_rate": 9.6079266575654e-06, "loss": 4.057, "step": 483000 }, { "epoch": 0.8194695120092919, "grad_norm": 12.481460571289062, "learning_rate": 9.518815029210792e-06, "loss": 4.0457, "step": 484000 }, { "epoch": 0.8211626308357574, "grad_norm": 12.406510353088379, "learning_rate": 9.429792512484539e-06, "loss": 4.0527, "step": 485000 }, { "epoch": 0.8228557496622227, "grad_norm": 12.856534957885742, "learning_rate": 9.340769995758288e-06, "loss": 4.0167, "step": 486000 }, { "epoch": 0.8245488684886882, "grad_norm": 13.791609764099121, "learning_rate": 9.25165836740368e-06, "loss": 4.0497, "step": 487000 }, { "epoch": 0.8262419873151537, "grad_norm": 11.750575065612793, "learning_rate": 9.162546739049072e-06, "loss": 4.0651, "step": 488000 }, { "epoch": 0.8279351061416192, "grad_norm": 13.518226623535156, "learning_rate": 9.073524222322819e-06, "loss": 4.0535, "step": 489000 }, { "epoch": 0.8296282249680847, "grad_norm": 11.101834297180176, "learning_rate": 8.984412593968213e-06, "loss": 4.0641, "step": 490000 }, { "epoch": 0.8313213437945501, "grad_norm": 13.778100967407227, "learning_rate": 8.89539007724196e-06, "loss": 4.0573, "step": 491000 }, { "epoch": 0.8330144626210156, "grad_norm": 12.994368553161621, "learning_rate": 8.806278448887352e-06, "loss": 4.0359, "step": 492000 }, { "epoch": 0.8347075814474811, "grad_norm": 10.990795135498047, "learning_rate": 8.7172559321611e-06, "loss": 4.0377, "step": 493000 }, { "epoch": 0.8364007002739466, "grad_norm": 14.174135208129883, "learning_rate": 8.628144303806493e-06, "loss": 4.035, "step": 494000 }, { "epoch": 0.8380938191004121, "grad_norm": 13.931561470031738, "learning_rate": 8.539032675451885e-06, "loss": 4.0397, "step": 495000 }, { "epoch": 0.8397869379268775, "grad_norm": 11.548705101013184, "learning_rate": 8.450010158725632e-06, "loss": 4.0392, "step": 496000 }, { "epoch": 0.841480056753343, "grad_norm": 13.571452140808105, "learning_rate": 8.360898530371024e-06, "loss": 4.0314, "step": 497000 }, { "epoch": 0.8431731755798085, "grad_norm": 12.852025985717773, "learning_rate": 8.271786902016418e-06, "loss": 4.0301, "step": 498000 }, { "epoch": 0.844866294406274, "grad_norm": 12.826899528503418, "learning_rate": 8.182764385290167e-06, "loss": 4.0262, "step": 499000 }, { "epoch": 0.8465594132327395, "grad_norm": 13.846549987792969, "learning_rate": 8.09365275693556e-06, "loss": 4.0406, "step": 500000 }, { "epoch": 0.848252532059205, "grad_norm": 12.71264362335205, "learning_rate": 8.004630240209306e-06, "loss": 4.0274, "step": 501000 }, { "epoch": 0.8499456508856704, "grad_norm": 10.856557846069336, "learning_rate": 7.915518611854698e-06, "loss": 4.0745, "step": 502000 }, { "epoch": 0.8516387697121359, "grad_norm": 12.938129425048828, "learning_rate": 7.826496095128445e-06, "loss": 4.0179, "step": 503000 }, { "epoch": 0.8533318885386014, "grad_norm": 12.214771270751953, "learning_rate": 7.737384466773837e-06, "loss": 4.033, "step": 504000 }, { "epoch": 0.8550250073650669, "grad_norm": 13.677706718444824, "learning_rate": 7.648272838419231e-06, "loss": 4.0305, "step": 505000 }, { "epoch": 0.8567181261915324, "grad_norm": 11.229254722595215, "learning_rate": 7.559250321692979e-06, "loss": 4.0473, "step": 506000 }, { "epoch": 0.8584112450179978, "grad_norm": 12.129646301269531, "learning_rate": 7.470138693338371e-06, "loss": 4.0532, "step": 507000 }, { "epoch": 0.8601043638444633, "grad_norm": 13.197163581848145, "learning_rate": 7.3810270649837645e-06, "loss": 4.027, "step": 508000 }, { "epoch": 0.8617974826709288, "grad_norm": 12.61552906036377, "learning_rate": 7.292004548257511e-06, "loss": 4.0317, "step": 509000 }, { "epoch": 0.8634906014973943, "grad_norm": 11.456807136535645, "learning_rate": 7.202892919902904e-06, "loss": 4.0218, "step": 510000 }, { "epoch": 0.8651837203238598, "grad_norm": 13.11767864227295, "learning_rate": 7.113781291548297e-06, "loss": 4.0224, "step": 511000 }, { "epoch": 0.8668768391503252, "grad_norm": 13.136454582214355, "learning_rate": 7.024669663193689e-06, "loss": 4.0286, "step": 512000 }, { "epoch": 0.8685699579767907, "grad_norm": 13.759037017822266, "learning_rate": 6.9356471464674375e-06, "loss": 4.0142, "step": 513000 }, { "epoch": 0.8702630768032562, "grad_norm": 10.915913581848145, "learning_rate": 6.84653551811283e-06, "loss": 4.0334, "step": 514000 }, { "epoch": 0.8719561956297217, "grad_norm": 11.403800010681152, "learning_rate": 6.757513001386577e-06, "loss": 4.025, "step": 515000 }, { "epoch": 0.8736493144561872, "grad_norm": 13.475736618041992, "learning_rate": 6.668490484660325e-06, "loss": 4.0051, "step": 516000 }, { "epoch": 0.8753424332826526, "grad_norm": 11.821650505065918, "learning_rate": 6.579378856305718e-06, "loss": 4.0331, "step": 517000 }, { "epoch": 0.8770355521091181, "grad_norm": 13.158166885375977, "learning_rate": 6.490267227951111e-06, "loss": 4.0202, "step": 518000 }, { "epoch": 0.8787286709355836, "grad_norm": 11.904138565063477, "learning_rate": 6.4012447112248575e-06, "loss": 4.0251, "step": 519000 }, { "epoch": 0.8804217897620491, "grad_norm": 12.96877670288086, "learning_rate": 6.3121330828702506e-06, "loss": 4.0195, "step": 520000 }, { "epoch": 0.8821149085885146, "grad_norm": 11.55160903930664, "learning_rate": 6.2231105661439975e-06, "loss": 4.0195, "step": 521000 }, { "epoch": 0.88380802741498, "grad_norm": 13.741573333740234, "learning_rate": 6.1339989377893906e-06, "loss": 4.0135, "step": 522000 }, { "epoch": 0.8855011462414455, "grad_norm": 12.45538330078125, "learning_rate": 6.044976421063138e-06, "loss": 4.0248, "step": 523000 }, { "epoch": 0.887194265067911, "grad_norm": 16.840944290161133, "learning_rate": 5.9558647927085306e-06, "loss": 4.0497, "step": 524000 }, { "epoch": 0.8888873838943765, "grad_norm": 12.211904525756836, "learning_rate": 5.8668422759822775e-06, "loss": 4.0403, "step": 525000 }, { "epoch": 0.890580502720842, "grad_norm": 12.109980583190918, "learning_rate": 5.7777306476276705e-06, "loss": 4.0233, "step": 526000 }, { "epoch": 0.8922736215473074, "grad_norm": 10.962858200073242, "learning_rate": 5.688619019273064e-06, "loss": 4.0431, "step": 527000 }, { "epoch": 0.8939667403737729, "grad_norm": 11.617267608642578, "learning_rate": 5.5995965025468105e-06, "loss": 4.0169, "step": 528000 }, { "epoch": 0.8956598592002384, "grad_norm": 11.964546203613281, "learning_rate": 5.510484874192203e-06, "loss": 4.0148, "step": 529000 }, { "epoch": 0.8973529780267039, "grad_norm": 13.515037536621094, "learning_rate": 5.4214623574659505e-06, "loss": 4.0288, "step": 530000 }, { "epoch": 0.8990460968531694, "grad_norm": 11.993343353271484, "learning_rate": 5.332350729111344e-06, "loss": 4.0395, "step": 531000 }, { "epoch": 0.9007392156796348, "grad_norm": 12.695779800415039, "learning_rate": 5.243239100756736e-06, "loss": 4.0289, "step": 532000 }, { "epoch": 0.9024323345061003, "grad_norm": 12.418722152709961, "learning_rate": 5.154216584030484e-06, "loss": 3.9995, "step": 533000 }, { "epoch": 0.9041254533325658, "grad_norm": 11.50819206237793, "learning_rate": 5.065104955675877e-06, "loss": 4.0275, "step": 534000 }, { "epoch": 0.9058185721590313, "grad_norm": 16.83757781982422, "learning_rate": 4.976082438949624e-06, "loss": 4.0234, "step": 535000 }, { "epoch": 0.9075116909854968, "grad_norm": 13.563246726989746, "learning_rate": 4.886970810595017e-06, "loss": 4.0297, "step": 536000 }, { "epoch": 0.9092048098119623, "grad_norm": 12.766653060913086, "learning_rate": 4.797859182240409e-06, "loss": 4.043, "step": 537000 }, { "epoch": 0.9108979286384277, "grad_norm": 12.738497734069824, "learning_rate": 4.708836665514157e-06, "loss": 4.0264, "step": 538000 }, { "epoch": 0.9125910474648932, "grad_norm": 18.209604263305664, "learning_rate": 4.61972503715955e-06, "loss": 4.0248, "step": 539000 }, { "epoch": 0.9142841662913587, "grad_norm": 16.658750534057617, "learning_rate": 4.530702520433297e-06, "loss": 4.0273, "step": 540000 }, { "epoch": 0.9159772851178242, "grad_norm": 12.773310661315918, "learning_rate": 4.441590892078689e-06, "loss": 4.0174, "step": 541000 }, { "epoch": 0.9176704039442897, "grad_norm": 13.522591590881348, "learning_rate": 4.352479263724083e-06, "loss": 4.0079, "step": 542000 }, { "epoch": 0.9193635227707551, "grad_norm": 12.449542045593262, "learning_rate": 4.263456746997829e-06, "loss": 4.0116, "step": 543000 }, { "epoch": 0.9210566415972206, "grad_norm": 10.421014785766602, "learning_rate": 4.174345118643222e-06, "loss": 4.0037, "step": 544000 }, { "epoch": 0.9227497604236861, "grad_norm": 10.978767395019531, "learning_rate": 4.085233490288615e-06, "loss": 4.0254, "step": 545000 }, { "epoch": 0.9244428792501516, "grad_norm": 12.136636734008789, "learning_rate": 3.996210973562362e-06, "loss": 4.0269, "step": 546000 }, { "epoch": 0.9261359980766171, "grad_norm": 12.746706008911133, "learning_rate": 3.907099345207755e-06, "loss": 4.0043, "step": 547000 }, { "epoch": 0.9278291169030825, "grad_norm": 12.414566993713379, "learning_rate": 3.818076828481503e-06, "loss": 4.0205, "step": 548000 }, { "epoch": 0.929522235729548, "grad_norm": 12.85700511932373, "learning_rate": 3.728965200126895e-06, "loss": 4.0133, "step": 549000 }, { "epoch": 0.9312153545560135, "grad_norm": 13.066812515258789, "learning_rate": 3.6398535717722877e-06, "loss": 4.0068, "step": 550000 }, { "epoch": 0.932908473382479, "grad_norm": 13.463617324829102, "learning_rate": 3.5508310550460354e-06, "loss": 4.0147, "step": 551000 }, { "epoch": 0.9346015922089445, "grad_norm": 12.243765830993652, "learning_rate": 3.461719426691428e-06, "loss": 3.9834, "step": 552000 }, { "epoch": 0.9362947110354098, "grad_norm": 13.196410179138184, "learning_rate": 3.3726969099651754e-06, "loss": 4.0, "step": 553000 }, { "epoch": 0.9379878298618753, "grad_norm": 11.779229164123535, "learning_rate": 3.2835852816105685e-06, "loss": 4.0172, "step": 554000 }, { "epoch": 0.9396809486883408, "grad_norm": 13.934765815734863, "learning_rate": 3.194473653255961e-06, "loss": 4.0078, "step": 555000 }, { "epoch": 0.9413740675148063, "grad_norm": 13.61536693572998, "learning_rate": 3.1053620249013534e-06, "loss": 4.0062, "step": 556000 }, { "epoch": 0.9430671863412718, "grad_norm": 11.887247085571289, "learning_rate": 3.0163395081751008e-06, "loss": 3.9833, "step": 557000 }, { "epoch": 0.9447603051677372, "grad_norm": 12.271764755249023, "learning_rate": 2.927227879820494e-06, "loss": 3.988, "step": 558000 }, { "epoch": 0.9464534239942027, "grad_norm": 13.865683555603027, "learning_rate": 2.8381162514658865e-06, "loss": 4.0005, "step": 559000 }, { "epoch": 0.9481465428206682, "grad_norm": 13.312651634216309, "learning_rate": 2.749004623111279e-06, "loss": 3.996, "step": 560000 }, { "epoch": 0.9498396616471337, "grad_norm": 13.861916542053223, "learning_rate": 2.6599821063850265e-06, "loss": 3.9868, "step": 561000 }, { "epoch": 0.9515327804735992, "grad_norm": 12.27828598022461, "learning_rate": 2.570870478030419e-06, "loss": 3.9917, "step": 562000 }, { "epoch": 0.9532258993000646, "grad_norm": 14.108807563781738, "learning_rate": 2.481758849675812e-06, "loss": 3.9875, "step": 563000 }, { "epoch": 0.9549190181265301, "grad_norm": 14.063074111938477, "learning_rate": 2.3927363329495596e-06, "loss": 4.0203, "step": 564000 }, { "epoch": 0.9566121369529956, "grad_norm": 12.242046356201172, "learning_rate": 2.3036247045949522e-06, "loss": 4.0049, "step": 565000 }, { "epoch": 0.9583052557794611, "grad_norm": 11.525742530822754, "learning_rate": 2.214513076240345e-06, "loss": 3.992, "step": 566000 }, { "epoch": 0.9599983746059266, "grad_norm": 13.982650756835938, "learning_rate": 2.1254014478857376e-06, "loss": 4.0085, "step": 567000 }, { "epoch": 0.961691493432392, "grad_norm": 10.93720817565918, "learning_rate": 2.036378931159485e-06, "loss": 3.9889, "step": 568000 }, { "epoch": 0.9633846122588575, "grad_norm": 10.936880111694336, "learning_rate": 1.9472673028048775e-06, "loss": 4.0045, "step": 569000 }, { "epoch": 0.965077731085323, "grad_norm": 12.408818244934082, "learning_rate": 1.8581556744502706e-06, "loss": 3.9975, "step": 570000 }, { "epoch": 0.9667708499117885, "grad_norm": 11.701683044433594, "learning_rate": 1.7691331577240177e-06, "loss": 4.0154, "step": 571000 }, { "epoch": 0.968463968738254, "grad_norm": 11.864809036254883, "learning_rate": 1.6800215293694104e-06, "loss": 4.0071, "step": 572000 }, { "epoch": 0.9701570875647195, "grad_norm": 11.751227378845215, "learning_rate": 1.5909099010148033e-06, "loss": 4.0177, "step": 573000 }, { "epoch": 0.9718502063911849, "grad_norm": 12.788235664367676, "learning_rate": 1.5018873842885508e-06, "loss": 3.9926, "step": 574000 }, { "epoch": 0.9735433252176504, "grad_norm": 14.190360069274902, "learning_rate": 1.4127757559339435e-06, "loss": 4.0041, "step": 575000 }, { "epoch": 0.9752364440441159, "grad_norm": 13.502917289733887, "learning_rate": 1.3237532392076908e-06, "loss": 4.0073, "step": 576000 }, { "epoch": 0.9769295628705814, "grad_norm": 12.922845840454102, "learning_rate": 1.2346416108530835e-06, "loss": 4.0105, "step": 577000 }, { "epoch": 0.9786226816970469, "grad_norm": 13.786908149719238, "learning_rate": 1.1455299824984761e-06, "loss": 3.9938, "step": 578000 }, { "epoch": 0.9803158005235123, "grad_norm": 12.947463035583496, "learning_rate": 1.0565074657722235e-06, "loss": 4.0031, "step": 579000 }, { "epoch": 0.9820089193499778, "grad_norm": 12.79428768157959, "learning_rate": 9.673958374176163e-07, "loss": 4.0095, "step": 580000 }, { "epoch": 0.9837020381764433, "grad_norm": 13.147842407226562, "learning_rate": 8.782842090630091e-07, "loss": 4.0123, "step": 581000 }, { "epoch": 0.9853951570029088, "grad_norm": 14.94148063659668, "learning_rate": 7.891725807084019e-07, "loss": 4.0165, "step": 582000 }, { "epoch": 0.9870882758293743, "grad_norm": 12.684762001037598, "learning_rate": 7.001500639821492e-07, "loss": 4.0215, "step": 583000 }, { "epoch": 0.9887813946558397, "grad_norm": 11.707281112670898, "learning_rate": 6.111275472558965e-07, "loss": 4.0116, "step": 584000 }, { "epoch": 0.9904745134823052, "grad_norm": 14.292046546936035, "learning_rate": 5.220159189012893e-07, "loss": 3.9982, "step": 585000 }, { "epoch": 0.9921676323087707, "grad_norm": 11.7388334274292, "learning_rate": 4.3290429054668207e-07, "loss": 3.9913, "step": 586000 }, { "epoch": 0.9938607511352362, "grad_norm": 12.288949966430664, "learning_rate": 3.438817738204294e-07, "loss": 3.9932, "step": 587000 }, { "epoch": 0.9955538699617017, "grad_norm": 12.897555351257324, "learning_rate": 2.547701454658221e-07, "loss": 4.0131, "step": 588000 }, { "epoch": 0.9972469887881671, "grad_norm": 11.195857048034668, "learning_rate": 1.657476287395695e-07, "loss": 3.9926, "step": 589000 }, { "epoch": 0.9989401076146326, "grad_norm": 12.261478424072266, "learning_rate": 7.663600038496223e-08, "loss": 4.0098, "step": 590000 } ], "logging_steps": 1000, "max_steps": 590626, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.84840572470825e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }