{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9896907216494846, "eval_steps": 500, "global_step": 435, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006872852233676976, "grad_norm": 0.9486322019687681, "learning_rate": 4.5454545454545455e-06, "loss": 1.3163, "step": 1 }, { "epoch": 0.013745704467353952, "grad_norm": 0.9581819297739079, "learning_rate": 9.090909090909091e-06, "loss": 1.3039, "step": 2 }, { "epoch": 0.020618556701030927, "grad_norm": 1.0054838528858523, "learning_rate": 1.3636363636363637e-05, "loss": 1.3912, "step": 3 }, { "epoch": 0.027491408934707903, "grad_norm": 0.973313664446148, "learning_rate": 1.8181818181818182e-05, "loss": 1.3176, "step": 4 }, { "epoch": 0.03436426116838488, "grad_norm": 0.9112881771285355, "learning_rate": 2.272727272727273e-05, "loss": 1.2813, "step": 5 }, { "epoch": 0.041237113402061855, "grad_norm": 0.868853801792406, "learning_rate": 2.7272727272727273e-05, "loss": 1.2778, "step": 6 }, { "epoch": 0.048109965635738834, "grad_norm": 0.7999023078891502, "learning_rate": 3.181818181818182e-05, "loss": 1.2184, "step": 7 }, { "epoch": 0.054982817869415807, "grad_norm": 0.6175826046713091, "learning_rate": 3.6363636363636364e-05, "loss": 1.1459, "step": 8 }, { "epoch": 0.061855670103092786, "grad_norm": 0.5099365471572147, "learning_rate": 4.0909090909090915e-05, "loss": 1.0493, "step": 9 }, { "epoch": 0.06872852233676977, "grad_norm": 0.5444792063302974, "learning_rate": 4.545454545454546e-05, "loss": 1.0206, "step": 10 }, { "epoch": 0.07560137457044673, "grad_norm": 0.612097075909489, "learning_rate": 5e-05, "loss": 0.9556, "step": 11 }, { "epoch": 0.08247422680412371, "grad_norm": 0.5965134047415941, "learning_rate": 5.4545454545454546e-05, "loss": 0.8832, "step": 12 }, { "epoch": 0.08934707903780069, "grad_norm": 0.6321048435692244, "learning_rate": 5.90909090909091e-05, "loss": 0.8113, "step": 13 }, { "epoch": 0.09621993127147767, "grad_norm": 0.5433713534011196, "learning_rate": 6.363636363636364e-05, "loss": 0.7699, "step": 14 }, { "epoch": 0.10309278350515463, "grad_norm": 0.5641051016325467, "learning_rate": 6.818181818181818e-05, "loss": 0.6987, "step": 15 }, { "epoch": 0.10996563573883161, "grad_norm": 0.4741800529459741, "learning_rate": 7.272727272727273e-05, "loss": 0.6418, "step": 16 }, { "epoch": 0.11683848797250859, "grad_norm": 0.31919675330271313, "learning_rate": 7.727272727272727e-05, "loss": 0.5883, "step": 17 }, { "epoch": 0.12371134020618557, "grad_norm": 0.2704850432116006, "learning_rate": 8.181818181818183e-05, "loss": 0.576, "step": 18 }, { "epoch": 0.13058419243986255, "grad_norm": 0.26062797028012374, "learning_rate": 8.636363636363637e-05, "loss": 0.5267, "step": 19 }, { "epoch": 0.13745704467353953, "grad_norm": 0.18740805347521763, "learning_rate": 9.090909090909092e-05, "loss": 0.5469, "step": 20 }, { "epoch": 0.14432989690721648, "grad_norm": 0.22017175307824505, "learning_rate": 9.545454545454546e-05, "loss": 0.5226, "step": 21 }, { "epoch": 0.15120274914089346, "grad_norm": 0.1887039063728806, "learning_rate": 0.0001, "loss": 0.4873, "step": 22 }, { "epoch": 0.15807560137457044, "grad_norm": 0.1793474824943474, "learning_rate": 0.00010454545454545455, "loss": 0.4978, "step": 23 }, { "epoch": 0.16494845360824742, "grad_norm": 0.23697858977104094, "learning_rate": 0.00010909090909090909, "loss": 0.4814, "step": 24 }, { "epoch": 0.1718213058419244, "grad_norm": 0.17636901545890651, "learning_rate": 0.00011363636363636365, "loss": 0.5103, "step": 25 }, { "epoch": 0.17869415807560138, "grad_norm": 0.16142348168311232, "learning_rate": 0.0001181818181818182, "loss": 0.4773, "step": 26 }, { "epoch": 0.18556701030927836, "grad_norm": 0.1425385061824693, "learning_rate": 0.00012272727272727272, "loss": 0.492, "step": 27 }, { "epoch": 0.19243986254295534, "grad_norm": 0.1265888154640839, "learning_rate": 0.00012727272727272728, "loss": 0.4854, "step": 28 }, { "epoch": 0.19931271477663232, "grad_norm": 0.11449596495934233, "learning_rate": 0.0001318181818181818, "loss": 0.4615, "step": 29 }, { "epoch": 0.20618556701030927, "grad_norm": 0.12461604075605497, "learning_rate": 0.00013636363636363637, "loss": 0.4703, "step": 30 }, { "epoch": 0.21305841924398625, "grad_norm": 0.12169420338653658, "learning_rate": 0.00014090909090909093, "loss": 0.4433, "step": 31 }, { "epoch": 0.21993127147766323, "grad_norm": 0.10581158171701577, "learning_rate": 0.00014545454545454546, "loss": 0.4397, "step": 32 }, { "epoch": 0.2268041237113402, "grad_norm": 0.11163612722377497, "learning_rate": 0.00015000000000000001, "loss": 0.4605, "step": 33 }, { "epoch": 0.23367697594501718, "grad_norm": 0.11309459248751377, "learning_rate": 0.00015454545454545454, "loss": 0.4495, "step": 34 }, { "epoch": 0.24054982817869416, "grad_norm": 0.11367215299047806, "learning_rate": 0.0001590909090909091, "loss": 0.4308, "step": 35 }, { "epoch": 0.24742268041237114, "grad_norm": 0.10809464118167192, "learning_rate": 0.00016363636363636366, "loss": 0.4368, "step": 36 }, { "epoch": 0.2542955326460481, "grad_norm": 0.10502574406388546, "learning_rate": 0.0001681818181818182, "loss": 0.4196, "step": 37 }, { "epoch": 0.2611683848797251, "grad_norm": 0.10320182075757336, "learning_rate": 0.00017272727272727275, "loss": 0.42, "step": 38 }, { "epoch": 0.26804123711340205, "grad_norm": 0.11138418873828733, "learning_rate": 0.00017727272727272728, "loss": 0.4514, "step": 39 }, { "epoch": 0.27491408934707906, "grad_norm": 0.11111081000023773, "learning_rate": 0.00018181818181818183, "loss": 0.4162, "step": 40 }, { "epoch": 0.281786941580756, "grad_norm": 0.10773385531295475, "learning_rate": 0.00018636363636363636, "loss": 0.4058, "step": 41 }, { "epoch": 0.28865979381443296, "grad_norm": 0.10631032605354059, "learning_rate": 0.00019090909090909092, "loss": 0.4029, "step": 42 }, { "epoch": 0.29553264604810997, "grad_norm": 0.10836378456827221, "learning_rate": 0.00019545454545454548, "loss": 0.4096, "step": 43 }, { "epoch": 0.3024054982817869, "grad_norm": 0.10880423378093391, "learning_rate": 0.0002, "loss": 0.4196, "step": 44 }, { "epoch": 0.30927835051546393, "grad_norm": 0.1066915228080529, "learning_rate": 0.00019999677214588312, "loss": 0.4049, "step": 45 }, { "epoch": 0.3161512027491409, "grad_norm": 0.1009132049783852, "learning_rate": 0.00019998708879191335, "loss": 0.4015, "step": 46 }, { "epoch": 0.3230240549828179, "grad_norm": 0.10423644166095698, "learning_rate": 0.00019997095056321971, "loss": 0.3989, "step": 47 }, { "epoch": 0.32989690721649484, "grad_norm": 0.10222126638135792, "learning_rate": 0.00019994835850163924, "loss": 0.3911, "step": 48 }, { "epoch": 0.33676975945017185, "grad_norm": 0.09539411299428839, "learning_rate": 0.00019991931406564944, "loss": 0.3861, "step": 49 }, { "epoch": 0.3436426116838488, "grad_norm": 0.10040146458411044, "learning_rate": 0.00019988381913027442, "loss": 0.3932, "step": 50 }, { "epoch": 0.35051546391752575, "grad_norm": 0.10573179360993629, "learning_rate": 0.00019984187598696363, "loss": 0.3936, "step": 51 }, { "epoch": 0.35738831615120276, "grad_norm": 0.10564577769908695, "learning_rate": 0.00019979348734344398, "loss": 0.4071, "step": 52 }, { "epoch": 0.3642611683848797, "grad_norm": 0.10695511977009023, "learning_rate": 0.00019973865632354516, "loss": 0.3882, "step": 53 }, { "epoch": 0.3711340206185567, "grad_norm": 0.10064589165597464, "learning_rate": 0.0001996773864669978, "loss": 0.3785, "step": 54 }, { "epoch": 0.37800687285223367, "grad_norm": 0.09791457914129995, "learning_rate": 0.00019960968172920516, "loss": 0.3811, "step": 55 }, { "epoch": 0.3848797250859107, "grad_norm": 0.1118566682283669, "learning_rate": 0.00019953554648098748, "loss": 0.3938, "step": 56 }, { "epoch": 0.3917525773195876, "grad_norm": 0.10214824698553887, "learning_rate": 0.0001994549855083001, "loss": 0.3925, "step": 57 }, { "epoch": 0.39862542955326463, "grad_norm": 0.1014103487175917, "learning_rate": 0.0001993680040119244, "loss": 0.3802, "step": 58 }, { "epoch": 0.4054982817869416, "grad_norm": 0.0993499098682824, "learning_rate": 0.00019927460760713197, "loss": 0.381, "step": 59 }, { "epoch": 0.41237113402061853, "grad_norm": 0.10206073076536111, "learning_rate": 0.00019917480232332224, "loss": 0.3657, "step": 60 }, { "epoch": 0.41924398625429554, "grad_norm": 0.09828556302343387, "learning_rate": 0.00019906859460363307, "loss": 0.3761, "step": 61 }, { "epoch": 0.4261168384879725, "grad_norm": 0.10139688359355604, "learning_rate": 0.00019895599130452505, "loss": 0.3749, "step": 62 }, { "epoch": 0.4329896907216495, "grad_norm": 0.10711125411772042, "learning_rate": 0.0001988369996953386, "loss": 0.3706, "step": 63 }, { "epoch": 0.43986254295532645, "grad_norm": 0.09724017597982272, "learning_rate": 0.00019871162745782478, "loss": 0.3705, "step": 64 }, { "epoch": 0.44673539518900346, "grad_norm": 0.1012279008931452, "learning_rate": 0.00019857988268564953, "loss": 0.3712, "step": 65 }, { "epoch": 0.4536082474226804, "grad_norm": 0.09806011793458508, "learning_rate": 0.0001984417738838709, "loss": 0.355, "step": 66 }, { "epoch": 0.46048109965635736, "grad_norm": 0.10463912089089605, "learning_rate": 0.0001982973099683902, "loss": 0.3812, "step": 67 }, { "epoch": 0.46735395189003437, "grad_norm": 0.10616651630870087, "learning_rate": 0.0001981465002653763, "loss": 0.3671, "step": 68 }, { "epoch": 0.4742268041237113, "grad_norm": 0.11658375430444518, "learning_rate": 0.00019798935451066361, "loss": 0.3822, "step": 69 }, { "epoch": 0.48109965635738833, "grad_norm": 0.10686011983880458, "learning_rate": 0.0001978258828491236, "loss": 0.3564, "step": 70 }, { "epoch": 0.4879725085910653, "grad_norm": 0.10160970818733071, "learning_rate": 0.00019765609583400977, "loss": 0.3656, "step": 71 }, { "epoch": 0.4948453608247423, "grad_norm": 0.10845572916815471, "learning_rate": 0.0001974800044262764, "loss": 0.3623, "step": 72 }, { "epoch": 0.5017182130584192, "grad_norm": 0.10898867605088586, "learning_rate": 0.00019729761999387103, "loss": 0.3689, "step": 73 }, { "epoch": 0.5085910652920962, "grad_norm": 0.10030807679917116, "learning_rate": 0.00019710895431100046, "loss": 0.3743, "step": 74 }, { "epoch": 0.5154639175257731, "grad_norm": 0.10640353541210322, "learning_rate": 0.00019691401955737072, "loss": 0.3664, "step": 75 }, { "epoch": 0.5223367697594502, "grad_norm": 0.10268077724904813, "learning_rate": 0.00019671282831740076, "loss": 0.3446, "step": 76 }, { "epoch": 0.5292096219931272, "grad_norm": 0.1026968446802754, "learning_rate": 0.00019650539357941003, "loss": 0.3524, "step": 77 }, { "epoch": 0.5360824742268041, "grad_norm": 0.10265145337121352, "learning_rate": 0.00019629172873477995, "loss": 0.3615, "step": 78 }, { "epoch": 0.5429553264604811, "grad_norm": 0.10234137978544197, "learning_rate": 0.00019607184757708951, "loss": 0.348, "step": 79 }, { "epoch": 0.5498281786941581, "grad_norm": 0.10414003831964998, "learning_rate": 0.00019584576430122473, "loss": 0.3455, "step": 80 }, { "epoch": 0.5567010309278351, "grad_norm": 0.10371223386491027, "learning_rate": 0.00019561349350246226, "loss": 0.3378, "step": 81 }, { "epoch": 0.563573883161512, "grad_norm": 0.10200261115662425, "learning_rate": 0.00019537505017552716, "loss": 0.3424, "step": 82 }, { "epoch": 0.570446735395189, "grad_norm": 0.10679690224389146, "learning_rate": 0.00019513044971362494, "loss": 0.3381, "step": 83 }, { "epoch": 0.5773195876288659, "grad_norm": 0.11747209193015294, "learning_rate": 0.00019487970790744774, "loss": 0.3608, "step": 84 }, { "epoch": 0.584192439862543, "grad_norm": 0.10455385108750616, "learning_rate": 0.000194622840944155, "loss": 0.3513, "step": 85 }, { "epoch": 0.5910652920962199, "grad_norm": 0.11103687968369377, "learning_rate": 0.00019435986540632843, "loss": 0.3322, "step": 86 }, { "epoch": 0.5979381443298969, "grad_norm": 0.1044628007637211, "learning_rate": 0.00019409079827090145, "loss": 0.3452, "step": 87 }, { "epoch": 0.6048109965635738, "grad_norm": 0.1062512518470538, "learning_rate": 0.00019381565690806328, "loss": 0.3295, "step": 88 }, { "epoch": 0.6116838487972509, "grad_norm": 0.10737195453497204, "learning_rate": 0.00019353445908013755, "loss": 0.3322, "step": 89 }, { "epoch": 0.6185567010309279, "grad_norm": 0.10702988631259788, "learning_rate": 0.00019324722294043558, "loss": 0.3436, "step": 90 }, { "epoch": 0.6254295532646048, "grad_norm": 0.10765269601948493, "learning_rate": 0.00019295396703208453, "loss": 0.3628, "step": 91 }, { "epoch": 0.6323024054982818, "grad_norm": 0.1002631772076248, "learning_rate": 0.00019265471028683014, "loss": 0.337, "step": 92 }, { "epoch": 0.6391752577319587, "grad_norm": 0.10607703170572558, "learning_rate": 0.00019234947202381486, "loss": 0.3312, "step": 93 }, { "epoch": 0.6460481099656358, "grad_norm": 0.10229636934864904, "learning_rate": 0.00019203827194833026, "loss": 0.3613, "step": 94 }, { "epoch": 0.6529209621993127, "grad_norm": 0.10916337452983911, "learning_rate": 0.00019172113015054532, "loss": 0.3402, "step": 95 }, { "epoch": 0.6597938144329897, "grad_norm": 0.10094159423939311, "learning_rate": 0.00019139806710420914, "loss": 0.3308, "step": 96 }, { "epoch": 0.6666666666666666, "grad_norm": 0.10420707581199606, "learning_rate": 0.00019106910366532942, "loss": 0.3297, "step": 97 }, { "epoch": 0.6735395189003437, "grad_norm": 0.11033214518450266, "learning_rate": 0.000190734261070826, "loss": 0.3368, "step": 98 }, { "epoch": 0.6804123711340206, "grad_norm": 0.10648307368306249, "learning_rate": 0.00019039356093715975, "loss": 0.3436, "step": 99 }, { "epoch": 0.6872852233676976, "grad_norm": 0.1023047486718038, "learning_rate": 0.00019004702525893732, "loss": 0.3389, "step": 100 }, { "epoch": 0.6941580756013745, "grad_norm": 0.10884134971992637, "learning_rate": 0.000189694676407491, "loss": 0.3343, "step": 101 }, { "epoch": 0.7010309278350515, "grad_norm": 0.10624545430750702, "learning_rate": 0.0001893365371294346, "loss": 0.334, "step": 102 }, { "epoch": 0.7079037800687286, "grad_norm": 0.10418100794965049, "learning_rate": 0.00018897263054519498, "loss": 0.3332, "step": 103 }, { "epoch": 0.7147766323024055, "grad_norm": 0.10442261089390925, "learning_rate": 0.00018860298014751944, "loss": 0.3231, "step": 104 }, { "epoch": 0.7216494845360825, "grad_norm": 0.11510206196708461, "learning_rate": 0.0001882276097999592, "loss": 0.3331, "step": 105 }, { "epoch": 0.7285223367697594, "grad_norm": 0.10447026389489032, "learning_rate": 0.00018784654373532866, "loss": 0.3454, "step": 106 }, { "epoch": 0.7353951890034365, "grad_norm": 0.10590015702307645, "learning_rate": 0.00018745980655414114, "loss": 0.3283, "step": 107 }, { "epoch": 0.7422680412371134, "grad_norm": 0.1019688644295514, "learning_rate": 0.00018706742322302064, "loss": 0.3368, "step": 108 }, { "epoch": 0.7491408934707904, "grad_norm": 0.11013306581811545, "learning_rate": 0.00018666941907309026, "loss": 0.3371, "step": 109 }, { "epoch": 0.7560137457044673, "grad_norm": 0.09950125567962896, "learning_rate": 0.0001862658197983366, "loss": 0.3206, "step": 110 }, { "epoch": 0.7628865979381443, "grad_norm": 0.09961853222606298, "learning_rate": 0.0001858566514539513, "loss": 0.3245, "step": 111 }, { "epoch": 0.7697594501718213, "grad_norm": 0.10822873754218582, "learning_rate": 0.00018544194045464886, "loss": 0.336, "step": 112 }, { "epoch": 0.7766323024054983, "grad_norm": 0.10256746162360054, "learning_rate": 0.00018502171357296144, "loss": 0.3279, "step": 113 }, { "epoch": 0.7835051546391752, "grad_norm": 0.09750927062180448, "learning_rate": 0.0001845959979375104, "loss": 0.3388, "step": 114 }, { "epoch": 0.7903780068728522, "grad_norm": 0.09793726368683904, "learning_rate": 0.00018416482103125506, "loss": 0.3303, "step": 115 }, { "epoch": 0.7972508591065293, "grad_norm": 0.10598098121402388, "learning_rate": 0.0001837282106897185, "loss": 0.3216, "step": 116 }, { "epoch": 0.8041237113402062, "grad_norm": 0.10184730892936761, "learning_rate": 0.00018328619509919044, "loss": 0.3341, "step": 117 }, { "epoch": 0.8109965635738832, "grad_norm": 0.10057188453324177, "learning_rate": 0.0001828388027949078, "loss": 0.3377, "step": 118 }, { "epoch": 0.8178694158075601, "grad_norm": 0.10702981460753135, "learning_rate": 0.00018238606265921238, "loss": 0.3408, "step": 119 }, { "epoch": 0.8247422680412371, "grad_norm": 0.10861206391221787, "learning_rate": 0.00018192800391968642, "loss": 0.3413, "step": 120 }, { "epoch": 0.8316151202749141, "grad_norm": 0.10665429392095056, "learning_rate": 0.00018146465614726567, "loss": 0.3234, "step": 121 }, { "epoch": 0.8384879725085911, "grad_norm": 0.10180559686021746, "learning_rate": 0.00018099604925433043, "loss": 0.3263, "step": 122 }, { "epoch": 0.845360824742268, "grad_norm": 0.11025637152512564, "learning_rate": 0.00018052221349277442, "loss": 0.3295, "step": 123 }, { "epoch": 0.852233676975945, "grad_norm": 0.10266709615387361, "learning_rate": 0.00018004317945205197, "loss": 0.3275, "step": 124 }, { "epoch": 0.8591065292096219, "grad_norm": 0.10117796525027944, "learning_rate": 0.0001795589780572031, "loss": 0.3541, "step": 125 }, { "epoch": 0.865979381443299, "grad_norm": 0.10526178050004703, "learning_rate": 0.00017906964056685706, "loss": 0.3312, "step": 126 }, { "epoch": 0.872852233676976, "grad_norm": 0.1063992770321544, "learning_rate": 0.00017857519857121458, "loss": 0.3312, "step": 127 }, { "epoch": 0.8797250859106529, "grad_norm": 0.10996856445056626, "learning_rate": 0.00017807568399000822, "loss": 0.3527, "step": 128 }, { "epoch": 0.8865979381443299, "grad_norm": 0.09753881684843836, "learning_rate": 0.000177571129070442, "loss": 0.3117, "step": 129 }, { "epoch": 0.8934707903780069, "grad_norm": 0.10008693889485847, "learning_rate": 0.0001770615663851093, "loss": 0.3188, "step": 130 }, { "epoch": 0.9003436426116839, "grad_norm": 0.10344071862676044, "learning_rate": 0.0001765470288298905, "loss": 0.3252, "step": 131 }, { "epoch": 0.9072164948453608, "grad_norm": 0.09310502474075653, "learning_rate": 0.0001760275496218288, "loss": 0.3058, "step": 132 }, { "epoch": 0.9140893470790378, "grad_norm": 0.10039282316295665, "learning_rate": 0.0001755031622969862, "loss": 0.3468, "step": 133 }, { "epoch": 0.9209621993127147, "grad_norm": 0.09871302822261965, "learning_rate": 0.00017497390070827848, "loss": 0.3241, "step": 134 }, { "epoch": 0.9278350515463918, "grad_norm": 0.10019109474446979, "learning_rate": 0.00017443979902328956, "loss": 0.3247, "step": 135 }, { "epoch": 0.9347079037800687, "grad_norm": 0.09742365798894928, "learning_rate": 0.00017390089172206592, "loss": 0.3268, "step": 136 }, { "epoch": 0.9415807560137457, "grad_norm": 0.09613939400204266, "learning_rate": 0.00017335721359489057, "loss": 0.3017, "step": 137 }, { "epoch": 0.9484536082474226, "grad_norm": 0.11015286314690642, "learning_rate": 0.00017280879974003707, "loss": 0.3369, "step": 138 }, { "epoch": 0.9553264604810997, "grad_norm": 0.09848269873542065, "learning_rate": 0.0001722556855615039, "loss": 0.3215, "step": 139 }, { "epoch": 0.9621993127147767, "grad_norm": 0.10356185439767568, "learning_rate": 0.00017169790676672858, "loss": 0.3209, "step": 140 }, { "epoch": 0.9690721649484536, "grad_norm": 0.10200587363931662, "learning_rate": 0.0001711354993642827, "loss": 0.3156, "step": 141 }, { "epoch": 0.9759450171821306, "grad_norm": 0.10631936681232042, "learning_rate": 0.0001705684996615472, "loss": 0.3139, "step": 142 }, { "epoch": 0.9828178694158075, "grad_norm": 0.09891246233686207, "learning_rate": 0.0001699969442623686, "loss": 0.3252, "step": 143 }, { "epoch": 0.9896907216494846, "grad_norm": 0.09631426655006751, "learning_rate": 0.00016942087006469592, "loss": 0.3092, "step": 144 }, { "epoch": 0.9965635738831615, "grad_norm": 0.1007256123012449, "learning_rate": 0.00016884031425819853, "loss": 0.3214, "step": 145 }, { "epoch": 0.9965635738831615, "eval_loss": 0.31896448135375977, "eval_runtime": 31.3575, "eval_samples_per_second": 31.157, "eval_steps_per_second": 0.989, "step": 145 }, { "epoch": 1.0034364261168385, "grad_norm": 0.09742513564214555, "learning_rate": 0.00016825531432186543, "loss": 0.3131, "step": 146 }, { "epoch": 1.0103092783505154, "grad_norm": 0.1000843163672815, "learning_rate": 0.00016766590802158566, "loss": 0.308, "step": 147 }, { "epoch": 1.0171821305841924, "grad_norm": 0.1018182338551042, "learning_rate": 0.0001670721334077103, "loss": 0.3077, "step": 148 }, { "epoch": 1.0240549828178693, "grad_norm": 0.10928234134401811, "learning_rate": 0.00016647402881259598, "loss": 0.3137, "step": 149 }, { "epoch": 1.0309278350515463, "grad_norm": 0.10482761572794695, "learning_rate": 0.00016587163284813032, "loss": 0.2969, "step": 150 }, { "epoch": 1.0378006872852235, "grad_norm": 0.10755010011949057, "learning_rate": 0.00016526498440323914, "loss": 0.3027, "step": 151 }, { "epoch": 1.0446735395189004, "grad_norm": 0.09961868507020075, "learning_rate": 0.0001646541226413761, "loss": 0.2918, "step": 152 }, { "epoch": 1.0515463917525774, "grad_norm": 0.10275553359018949, "learning_rate": 0.00016403908699799425, "loss": 0.2935, "step": 153 }, { "epoch": 1.0584192439862543, "grad_norm": 0.10314122954609171, "learning_rate": 0.00016341991717800023, "loss": 0.3107, "step": 154 }, { "epoch": 1.0652920962199313, "grad_norm": 0.10201304020136209, "learning_rate": 0.00016279665315319114, "loss": 0.2925, "step": 155 }, { "epoch": 1.0721649484536082, "grad_norm": 0.11295490175270977, "learning_rate": 0.0001621693351596739, "loss": 0.3165, "step": 156 }, { "epoch": 1.0790378006872852, "grad_norm": 0.11187236691946645, "learning_rate": 0.00016153800369526788, "loss": 0.3008, "step": 157 }, { "epoch": 1.0859106529209621, "grad_norm": 0.10649522892041895, "learning_rate": 0.0001609026995168904, "loss": 0.3072, "step": 158 }, { "epoch": 1.0927835051546393, "grad_norm": 0.10138792178292377, "learning_rate": 0.00016026346363792567, "loss": 0.2968, "step": 159 }, { "epoch": 1.0996563573883162, "grad_norm": 0.10490079005967201, "learning_rate": 0.00015962033732557686, "loss": 0.3074, "step": 160 }, { "epoch": 1.1065292096219932, "grad_norm": 0.10339230680715779, "learning_rate": 0.00015897336209820239, "loss": 0.2904, "step": 161 }, { "epoch": 1.1134020618556701, "grad_norm": 0.11480233498020057, "learning_rate": 0.00015832257972263523, "loss": 0.31, "step": 162 }, { "epoch": 1.120274914089347, "grad_norm": 0.10620125747115913, "learning_rate": 0.00015766803221148673, "loss": 0.3041, "step": 163 }, { "epoch": 1.127147766323024, "grad_norm": 0.10787426446881714, "learning_rate": 0.0001570097618204345, "loss": 0.3142, "step": 164 }, { "epoch": 1.134020618556701, "grad_norm": 0.10375711742292576, "learning_rate": 0.00015634781104549442, "loss": 0.2916, "step": 165 }, { "epoch": 1.140893470790378, "grad_norm": 0.1085251854270091, "learning_rate": 0.00015568222262027717, "loss": 0.2969, "step": 166 }, { "epoch": 1.147766323024055, "grad_norm": 0.10620158303898176, "learning_rate": 0.00015501303951322943, "loss": 0.3069, "step": 167 }, { "epoch": 1.1546391752577319, "grad_norm": 0.10221091343876472, "learning_rate": 0.00015434030492486023, "loss": 0.3015, "step": 168 }, { "epoch": 1.161512027491409, "grad_norm": 0.11025720771873336, "learning_rate": 0.00015366406228495172, "loss": 0.2923, "step": 169 }, { "epoch": 1.168384879725086, "grad_norm": 0.11019089511672239, "learning_rate": 0.00015298435524975572, "loss": 0.3088, "step": 170 }, { "epoch": 1.175257731958763, "grad_norm": 0.1066893326930603, "learning_rate": 0.00015230122769917527, "loss": 0.3006, "step": 171 }, { "epoch": 1.1821305841924399, "grad_norm": 0.11028825839961288, "learning_rate": 0.00015161472373393186, "loss": 0.3028, "step": 172 }, { "epoch": 1.1890034364261168, "grad_norm": 0.10136590795578818, "learning_rate": 0.00015092488767271857, "loss": 0.2789, "step": 173 }, { "epoch": 1.1958762886597938, "grad_norm": 0.1032678246834342, "learning_rate": 0.00015023176404933874, "loss": 0.2975, "step": 174 }, { "epoch": 1.2027491408934707, "grad_norm": 0.11056648590020815, "learning_rate": 0.00014953539760983122, "loss": 0.3013, "step": 175 }, { "epoch": 1.2096219931271477, "grad_norm": 0.09978579785299047, "learning_rate": 0.0001488358333095816, "loss": 0.2955, "step": 176 }, { "epoch": 1.2164948453608249, "grad_norm": 0.10699681452783022, "learning_rate": 0.00014813311631041995, "loss": 0.2993, "step": 177 }, { "epoch": 1.2233676975945018, "grad_norm": 0.10408363973129214, "learning_rate": 0.00014742729197770552, "loss": 0.2954, "step": 178 }, { "epoch": 1.2302405498281788, "grad_norm": 0.1016249945819849, "learning_rate": 0.00014671840587739783, "loss": 0.2904, "step": 179 }, { "epoch": 1.2371134020618557, "grad_norm": 0.10519072387735487, "learning_rate": 0.00014600650377311522, "loss": 0.2916, "step": 180 }, { "epoch": 1.2439862542955327, "grad_norm": 0.1058597383518849, "learning_rate": 0.0001452916316231805, "loss": 0.301, "step": 181 }, { "epoch": 1.2508591065292096, "grad_norm": 0.10402401073301179, "learning_rate": 0.00014457383557765386, "loss": 0.2875, "step": 182 }, { "epoch": 1.2577319587628866, "grad_norm": 0.11052274366476848, "learning_rate": 0.00014385316197535372, "loss": 0.3033, "step": 183 }, { "epoch": 1.2646048109965635, "grad_norm": 0.11220133909982184, "learning_rate": 0.00014312965734086518, "loss": 0.3048, "step": 184 }, { "epoch": 1.2714776632302405, "grad_norm": 0.10111031103821645, "learning_rate": 0.0001424033683815365, "loss": 0.2883, "step": 185 }, { "epoch": 1.2783505154639174, "grad_norm": 0.108844384730101, "learning_rate": 0.00014167434198446383, "loss": 0.3156, "step": 186 }, { "epoch": 1.2852233676975944, "grad_norm": 0.11772254803070373, "learning_rate": 0.00014094262521346427, "loss": 0.2894, "step": 187 }, { "epoch": 1.2920962199312716, "grad_norm": 0.10084875638276648, "learning_rate": 0.00014020826530603776, "loss": 0.2941, "step": 188 }, { "epoch": 1.2989690721649485, "grad_norm": 0.10560762521845199, "learning_rate": 0.00013947130967031717, "loss": 0.2974, "step": 189 }, { "epoch": 1.3058419243986255, "grad_norm": 0.1112230814008033, "learning_rate": 0.00013873180588200827, "loss": 0.3051, "step": 190 }, { "epoch": 1.3127147766323024, "grad_norm": 0.10899405854558633, "learning_rate": 0.00013798980168131794, "loss": 0.3167, "step": 191 }, { "epoch": 1.3195876288659794, "grad_norm": 0.10459030666604682, "learning_rate": 0.00013724534496987247, "loss": 0.2944, "step": 192 }, { "epoch": 1.3264604810996563, "grad_norm": 0.10751960916175171, "learning_rate": 0.00013649848380762513, "loss": 0.2749, "step": 193 }, { "epoch": 1.3333333333333333, "grad_norm": 0.10402996897559527, "learning_rate": 0.0001357492664097534, "loss": 0.2972, "step": 194 }, { "epoch": 1.3402061855670104, "grad_norm": 0.10280900537193306, "learning_rate": 0.00013499774114354655, "loss": 0.3093, "step": 195 }, { "epoch": 1.3470790378006874, "grad_norm": 0.10262166065058195, "learning_rate": 0.0001342439565252831, "loss": 0.3023, "step": 196 }, { "epoch": 1.3539518900343643, "grad_norm": 0.11035322484519891, "learning_rate": 0.00013348796121709862, "loss": 0.283, "step": 197 }, { "epoch": 1.3608247422680413, "grad_norm": 0.10183546777840774, "learning_rate": 0.0001327298040238446, "loss": 0.2895, "step": 198 }, { "epoch": 1.3676975945017182, "grad_norm": 0.09846452794645016, "learning_rate": 0.00013196953388993726, "loss": 0.2775, "step": 199 }, { "epoch": 1.3745704467353952, "grad_norm": 0.10082546961742879, "learning_rate": 0.00013120719989619833, "loss": 0.2927, "step": 200 }, { "epoch": 1.3814432989690721, "grad_norm": 0.10338270125162533, "learning_rate": 0.00013044285125668614, "loss": 0.3022, "step": 201 }, { "epoch": 1.388316151202749, "grad_norm": 0.10489686085507173, "learning_rate": 0.0001296765373155188, "loss": 0.2886, "step": 202 }, { "epoch": 1.395189003436426, "grad_norm": 0.10826965142841156, "learning_rate": 0.00012890830754368855, "loss": 0.2839, "step": 203 }, { "epoch": 1.402061855670103, "grad_norm": 0.1095657642687174, "learning_rate": 0.0001281382115358679, "loss": 0.2974, "step": 204 }, { "epoch": 1.40893470790378, "grad_norm": 0.10290443785837475, "learning_rate": 0.0001273662990072083, "loss": 0.2916, "step": 205 }, { "epoch": 1.4158075601374571, "grad_norm": 0.10488572942462458, "learning_rate": 0.00012659261979013043, "loss": 0.2896, "step": 206 }, { "epoch": 1.422680412371134, "grad_norm": 0.0991511111540952, "learning_rate": 0.00012581722383110718, "loss": 0.2815, "step": 207 }, { "epoch": 1.429553264604811, "grad_norm": 0.11084616014817153, "learning_rate": 0.00012504016118743935, "loss": 0.2857, "step": 208 }, { "epoch": 1.436426116838488, "grad_norm": 0.10755359121093663, "learning_rate": 0.00012426148202402404, "loss": 0.2784, "step": 209 }, { "epoch": 1.443298969072165, "grad_norm": 0.10341376660828379, "learning_rate": 0.00012348123661011601, "loss": 0.2947, "step": 210 }, { "epoch": 1.4501718213058419, "grad_norm": 0.10964970839444545, "learning_rate": 0.00012269947531608276, "loss": 0.3056, "step": 211 }, { "epoch": 1.4570446735395188, "grad_norm": 0.10241977399698814, "learning_rate": 0.00012191624861015254, "loss": 0.3102, "step": 212 }, { "epoch": 1.463917525773196, "grad_norm": 0.10337164620385511, "learning_rate": 0.00012113160705515625, "loss": 0.3122, "step": 213 }, { "epoch": 1.470790378006873, "grad_norm": 0.10002116602053045, "learning_rate": 0.0001203456013052634, "loss": 0.2905, "step": 214 }, { "epoch": 1.47766323024055, "grad_norm": 0.1005828345016487, "learning_rate": 0.00011955828210271187, "loss": 0.2897, "step": 215 }, { "epoch": 1.4845360824742269, "grad_norm": 0.10315190703705018, "learning_rate": 0.00011876970027453222, "loss": 0.2896, "step": 216 }, { "epoch": 1.4914089347079038, "grad_norm": 0.10209266642811851, "learning_rate": 0.00011797990672926652, "loss": 0.2862, "step": 217 }, { "epoch": 1.4982817869415808, "grad_norm": 0.10350382211989749, "learning_rate": 0.00011718895245368167, "loss": 0.2993, "step": 218 }, { "epoch": 1.5051546391752577, "grad_norm": 0.09959069681120952, "learning_rate": 0.00011639688850947799, "loss": 0.279, "step": 219 }, { "epoch": 1.5120274914089347, "grad_norm": 0.1017699606831777, "learning_rate": 0.00011560376602999272, "loss": 0.2924, "step": 220 }, { "epoch": 1.5189003436426116, "grad_norm": 0.10199010748060407, "learning_rate": 0.00011480963621689905, "loss": 0.3007, "step": 221 }, { "epoch": 1.5257731958762886, "grad_norm": 0.10438569087355797, "learning_rate": 0.00011401455033690076, "loss": 0.2828, "step": 222 }, { "epoch": 1.5326460481099655, "grad_norm": 0.106590652554821, "learning_rate": 0.00011321855971842243, "loss": 0.299, "step": 223 }, { "epoch": 1.5395189003436425, "grad_norm": 0.10067806688032917, "learning_rate": 0.00011242171574829599, "loss": 0.2897, "step": 224 }, { "epoch": 1.5463917525773194, "grad_norm": 0.11078489209589217, "learning_rate": 0.00011162406986844323, "loss": 0.2791, "step": 225 }, { "epoch": 1.5532646048109966, "grad_norm": 0.10748644329087202, "learning_rate": 0.00011082567357255484, "loss": 0.3004, "step": 226 }, { "epoch": 1.5601374570446735, "grad_norm": 0.09232201373607851, "learning_rate": 0.00011002657840276627, "loss": 0.2647, "step": 227 }, { "epoch": 1.5670103092783505, "grad_norm": 0.10419820524514652, "learning_rate": 0.00010922683594633021, "loss": 0.2897, "step": 228 }, { "epoch": 1.5738831615120275, "grad_norm": 0.11387366232726741, "learning_rate": 0.00010842649783228624, "loss": 0.3071, "step": 229 }, { "epoch": 1.5807560137457046, "grad_norm": 0.1112209148078402, "learning_rate": 0.00010762561572812788, "loss": 0.2791, "step": 230 }, { "epoch": 1.5876288659793816, "grad_norm": 0.1027329268180551, "learning_rate": 0.0001068242413364671, "loss": 0.2962, "step": 231 }, { "epoch": 1.5945017182130585, "grad_norm": 0.10377377307338428, "learning_rate": 0.00010602242639169648, "loss": 0.2905, "step": 232 }, { "epoch": 1.6013745704467355, "grad_norm": 0.10333591660725541, "learning_rate": 0.0001052202226566494, "loss": 0.294, "step": 233 }, { "epoch": 1.6082474226804124, "grad_norm": 0.10653830908113968, "learning_rate": 0.00010441768191925847, "loss": 0.3023, "step": 234 }, { "epoch": 1.6151202749140894, "grad_norm": 0.10203788732177198, "learning_rate": 0.00010361485598921212, "loss": 0.2961, "step": 235 }, { "epoch": 1.6219931271477663, "grad_norm": 0.105467450842672, "learning_rate": 0.00010281179669461005, "loss": 0.2992, "step": 236 }, { "epoch": 1.6288659793814433, "grad_norm": 0.10232600415297333, "learning_rate": 0.00010200855587861724, "loss": 0.2894, "step": 237 }, { "epoch": 1.6357388316151202, "grad_norm": 0.1000078228085237, "learning_rate": 0.0001012051853961172, "loss": 0.2924, "step": 238 }, { "epoch": 1.6426116838487972, "grad_norm": 0.10443130272760445, "learning_rate": 0.00010040173711036431, "loss": 0.2987, "step": 239 }, { "epoch": 1.6494845360824741, "grad_norm": 0.10490297286528327, "learning_rate": 9.959826288963571e-05, "loss": 0.2859, "step": 240 }, { "epoch": 1.656357388316151, "grad_norm": 0.10350833328466466, "learning_rate": 9.879481460388282e-05, "loss": 0.2931, "step": 241 }, { "epoch": 1.663230240549828, "grad_norm": 0.10225492666499335, "learning_rate": 9.799144412138275e-05, "loss": 0.2878, "step": 242 }, { "epoch": 1.670103092783505, "grad_norm": 0.10508397146202163, "learning_rate": 9.718820330538998e-05, "loss": 0.3004, "step": 243 }, { "epoch": 1.6769759450171822, "grad_norm": 0.10283137687490299, "learning_rate": 9.638514401078788e-05, "loss": 0.2714, "step": 244 }, { "epoch": 1.6838487972508591, "grad_norm": 0.10397776989480932, "learning_rate": 9.558231808074156e-05, "loss": 0.2771, "step": 245 }, { "epoch": 1.690721649484536, "grad_norm": 0.10092959577494873, "learning_rate": 9.477977734335061e-05, "loss": 0.2899, "step": 246 }, { "epoch": 1.697594501718213, "grad_norm": 0.09948236816567041, "learning_rate": 9.397757360830353e-05, "loss": 0.2805, "step": 247 }, { "epoch": 1.7044673539518902, "grad_norm": 0.10097655488635103, "learning_rate": 9.317575866353292e-05, "loss": 0.296, "step": 248 }, { "epoch": 1.7113402061855671, "grad_norm": 0.09936217630195925, "learning_rate": 9.23743842718721e-05, "loss": 0.2973, "step": 249 }, { "epoch": 1.718213058419244, "grad_norm": 0.09913343698008316, "learning_rate": 9.157350216771378e-05, "loss": 0.2899, "step": 250 }, { "epoch": 1.725085910652921, "grad_norm": 0.09873019613022356, "learning_rate": 9.077316405366981e-05, "loss": 0.2827, "step": 251 }, { "epoch": 1.731958762886598, "grad_norm": 0.09795703366595764, "learning_rate": 8.997342159723371e-05, "loss": 0.2833, "step": 252 }, { "epoch": 1.738831615120275, "grad_norm": 0.0971884033549596, "learning_rate": 8.917432642744518e-05, "loss": 0.2813, "step": 253 }, { "epoch": 1.745704467353952, "grad_norm": 0.10306201837901838, "learning_rate": 8.83759301315568e-05, "loss": 0.2924, "step": 254 }, { "epoch": 1.7525773195876289, "grad_norm": 0.10374979571562502, "learning_rate": 8.757828425170404e-05, "loss": 0.2838, "step": 255 }, { "epoch": 1.7594501718213058, "grad_norm": 0.1048687957670358, "learning_rate": 8.678144028157759e-05, "loss": 0.2886, "step": 256 }, { "epoch": 1.7663230240549828, "grad_norm": 0.10256152550776314, "learning_rate": 8.598544966309925e-05, "loss": 0.2879, "step": 257 }, { "epoch": 1.7731958762886597, "grad_norm": 0.10130883507858145, "learning_rate": 8.519036378310096e-05, "loss": 0.2909, "step": 258 }, { "epoch": 1.7800687285223367, "grad_norm": 0.10467386983378474, "learning_rate": 8.43962339700073e-05, "loss": 0.2828, "step": 259 }, { "epoch": 1.7869415807560136, "grad_norm": 0.09820302182298171, "learning_rate": 8.360311149052205e-05, "loss": 0.2844, "step": 260 }, { "epoch": 1.7938144329896906, "grad_norm": 0.10102934299035139, "learning_rate": 8.281104754631835e-05, "loss": 0.2898, "step": 261 }, { "epoch": 1.8006872852233677, "grad_norm": 0.0958354586875355, "learning_rate": 8.20200932707335e-05, "loss": 0.28, "step": 262 }, { "epoch": 1.8075601374570447, "grad_norm": 0.09957495231488761, "learning_rate": 8.123029972546781e-05, "loss": 0.284, "step": 263 }, { "epoch": 1.8144329896907216, "grad_norm": 0.10693182349883178, "learning_rate": 8.044171789728816e-05, "loss": 0.2892, "step": 264 }, { "epoch": 1.8213058419243986, "grad_norm": 0.10646064359160178, "learning_rate": 7.965439869473664e-05, "loss": 0.2958, "step": 265 }, { "epoch": 1.8281786941580758, "grad_norm": 0.10085406622127646, "learning_rate": 7.886839294484377e-05, "loss": 0.2818, "step": 266 }, { "epoch": 1.8350515463917527, "grad_norm": 0.10500151086048896, "learning_rate": 7.808375138984745e-05, "loss": 0.2733, "step": 267 }, { "epoch": 1.8419243986254297, "grad_norm": 0.1002635668475348, "learning_rate": 7.730052468391725e-05, "loss": 0.285, "step": 268 }, { "epoch": 1.8487972508591066, "grad_norm": 0.09642728830623505, "learning_rate": 7.6518763389884e-05, "loss": 0.2728, "step": 269 }, { "epoch": 1.8556701030927836, "grad_norm": 0.10268576835416232, "learning_rate": 7.573851797597602e-05, "loss": 0.2795, "step": 270 }, { "epoch": 1.8625429553264605, "grad_norm": 0.10795408269996427, "learning_rate": 7.495983881256067e-05, "loss": 0.295, "step": 271 }, { "epoch": 1.8694158075601375, "grad_norm": 0.10121463339523644, "learning_rate": 7.418277616889282e-05, "loss": 0.2787, "step": 272 }, { "epoch": 1.8762886597938144, "grad_norm": 0.0982041211411212, "learning_rate": 7.340738020986961e-05, "loss": 0.2913, "step": 273 }, { "epoch": 1.8831615120274914, "grad_norm": 0.09689663299295423, "learning_rate": 7.263370099279172e-05, "loss": 0.2678, "step": 274 }, { "epoch": 1.8900343642611683, "grad_norm": 0.09890680768599139, "learning_rate": 7.186178846413214e-05, "loss": 0.2622, "step": 275 }, { "epoch": 1.8969072164948453, "grad_norm": 0.10218531402065698, "learning_rate": 7.109169245631149e-05, "loss": 0.2948, "step": 276 }, { "epoch": 1.9037800687285222, "grad_norm": 0.09970764344361568, "learning_rate": 7.032346268448118e-05, "loss": 0.2936, "step": 277 }, { "epoch": 1.9106529209621992, "grad_norm": 0.10646182283668124, "learning_rate": 6.955714874331387e-05, "loss": 0.2841, "step": 278 }, { "epoch": 1.9175257731958761, "grad_norm": 0.1005308262146528, "learning_rate": 6.87928001038017e-05, "loss": 0.2741, "step": 279 }, { "epoch": 1.9243986254295533, "grad_norm": 0.09801297561347891, "learning_rate": 6.803046611006278e-05, "loss": 0.2812, "step": 280 }, { "epoch": 1.9312714776632303, "grad_norm": 0.10072370231430908, "learning_rate": 6.727019597615545e-05, "loss": 0.2861, "step": 281 }, { "epoch": 1.9381443298969072, "grad_norm": 0.10002866089004976, "learning_rate": 6.651203878290139e-05, "loss": 0.2747, "step": 282 }, { "epoch": 1.9450171821305842, "grad_norm": 0.09520289893088804, "learning_rate": 6.575604347471695e-05, "loss": 0.275, "step": 283 }, { "epoch": 1.9518900343642611, "grad_norm": 0.10328917600068767, "learning_rate": 6.500225885645346e-05, "loss": 0.2836, "step": 284 }, { "epoch": 1.9587628865979383, "grad_norm": 0.09800249344342209, "learning_rate": 6.425073359024663e-05, "loss": 0.2904, "step": 285 }, { "epoch": 1.9656357388316152, "grad_norm": 0.09952065394630734, "learning_rate": 6.350151619237488e-05, "loss": 0.2804, "step": 286 }, { "epoch": 1.9725085910652922, "grad_norm": 0.10493099048201263, "learning_rate": 6.275465503012751e-05, "loss": 0.2818, "step": 287 }, { "epoch": 1.9793814432989691, "grad_norm": 0.09722127539298636, "learning_rate": 6.201019831868208e-05, "loss": 0.2839, "step": 288 }, { "epoch": 1.986254295532646, "grad_norm": 0.09609058787958422, "learning_rate": 6.126819411799175e-05, "loss": 0.2725, "step": 289 }, { "epoch": 1.993127147766323, "grad_norm": 0.09821072360835045, "learning_rate": 6.052869032968285e-05, "loss": 0.2811, "step": 290 }, { "epoch": 2.0, "grad_norm": 0.10288780904807039, "learning_rate": 5.979173469396227e-05, "loss": 0.2761, "step": 291 }, { "epoch": 2.0, "eval_loss": 0.2949652373790741, "eval_runtime": 27.5209, "eval_samples_per_second": 35.5, "eval_steps_per_second": 1.126, "step": 291 }, { "epoch": 2.006872852233677, "grad_norm": 0.097538336458536, "learning_rate": 5.905737478653572e-05, "loss": 0.2679, "step": 292 }, { "epoch": 2.013745704467354, "grad_norm": 0.09988881705573123, "learning_rate": 5.83256580155362e-05, "loss": 0.2525, "step": 293 }, { "epoch": 2.020618556701031, "grad_norm": 0.0964482303899947, "learning_rate": 5.7596631618463514e-05, "loss": 0.2425, "step": 294 }, { "epoch": 2.027491408934708, "grad_norm": 0.10864263032849888, "learning_rate": 5.687034265913485e-05, "loss": 0.27, "step": 295 }, { "epoch": 2.0343642611683848, "grad_norm": 0.11734835455675438, "learning_rate": 5.614683802464631e-05, "loss": 0.2624, "step": 296 }, { "epoch": 2.0412371134020617, "grad_norm": 0.11289186335789302, "learning_rate": 5.542616442234618e-05, "loss": 0.259, "step": 297 }, { "epoch": 2.0481099656357387, "grad_norm": 0.11224803307996767, "learning_rate": 5.470836837681954e-05, "loss": 0.2607, "step": 298 }, { "epoch": 2.0549828178694156, "grad_norm": 0.10867092848715432, "learning_rate": 5.399349622688479e-05, "loss": 0.2703, "step": 299 }, { "epoch": 2.0618556701030926, "grad_norm": 0.10624834548230039, "learning_rate": 5.32815941226022e-05, "loss": 0.2654, "step": 300 }, { "epoch": 2.06872852233677, "grad_norm": 0.10758062626317398, "learning_rate": 5.2572708022294504e-05, "loss": 0.2698, "step": 301 }, { "epoch": 2.075601374570447, "grad_norm": 0.1071415029446744, "learning_rate": 5.1866883689580056e-05, "loss": 0.2632, "step": 302 }, { "epoch": 2.082474226804124, "grad_norm": 0.10125455614344606, "learning_rate": 5.116416669041843e-05, "loss": 0.2538, "step": 303 }, { "epoch": 2.089347079037801, "grad_norm": 0.10522800373863611, "learning_rate": 5.046460239016879e-05, "loss": 0.2693, "step": 304 }, { "epoch": 2.0962199312714778, "grad_norm": 0.10672396619233848, "learning_rate": 4.976823595066128e-05, "loss": 0.2575, "step": 305 }, { "epoch": 2.1030927835051547, "grad_norm": 0.10982835933289055, "learning_rate": 4.907511232728145e-05, "loss": 0.2684, "step": 306 }, { "epoch": 2.1099656357388317, "grad_norm": 0.10787287180280963, "learning_rate": 4.8385276266068146e-05, "loss": 0.258, "step": 307 }, { "epoch": 2.1168384879725086, "grad_norm": 0.11168112585189982, "learning_rate": 4.7698772300824756e-05, "loss": 0.2665, "step": 308 }, { "epoch": 2.1237113402061856, "grad_norm": 0.10887273811279173, "learning_rate": 4.7015644750244306e-05, "loss": 0.2609, "step": 309 }, { "epoch": 2.1305841924398625, "grad_norm": 0.10561230206084926, "learning_rate": 4.6335937715048306e-05, "loss": 0.2569, "step": 310 }, { "epoch": 2.1374570446735395, "grad_norm": 0.1085500386167761, "learning_rate": 4.565969507513981e-05, "loss": 0.2636, "step": 311 }, { "epoch": 2.1443298969072164, "grad_norm": 0.1012240249829896, "learning_rate": 4.498696048677059e-05, "loss": 0.2547, "step": 312 }, { "epoch": 2.1512027491408934, "grad_norm": 0.1030596147725605, "learning_rate": 4.4317777379722866e-05, "loss": 0.2531, "step": 313 }, { "epoch": 2.1580756013745703, "grad_norm": 0.10613673774187961, "learning_rate": 4.365218895450558e-05, "loss": 0.2608, "step": 314 }, { "epoch": 2.1649484536082473, "grad_norm": 0.10496672072120719, "learning_rate": 4.29902381795655e-05, "loss": 0.2711, "step": 315 }, { "epoch": 2.1718213058419242, "grad_norm": 0.1017243256571039, "learning_rate": 4.2331967788513295e-05, "loss": 0.2506, "step": 316 }, { "epoch": 2.178694158075601, "grad_norm": 0.1083774959415722, "learning_rate": 4.167742027736482e-05, "loss": 0.2706, "step": 317 }, { "epoch": 2.1855670103092786, "grad_norm": 0.10375985752479969, "learning_rate": 4.102663790179764e-05, "loss": 0.2446, "step": 318 }, { "epoch": 2.1924398625429555, "grad_norm": 0.10866264016891114, "learning_rate": 4.037966267442315e-05, "loss": 0.2782, "step": 319 }, { "epoch": 2.1993127147766325, "grad_norm": 0.10829606781249333, "learning_rate": 3.973653636207437e-05, "loss": 0.2628, "step": 320 }, { "epoch": 2.2061855670103094, "grad_norm": 0.11273324743023601, "learning_rate": 3.909730048310962e-05, "loss": 0.2661, "step": 321 }, { "epoch": 2.2130584192439864, "grad_norm": 0.10783971891161509, "learning_rate": 3.846199630473216e-05, "loss": 0.257, "step": 322 }, { "epoch": 2.2199312714776633, "grad_norm": 0.10744882559462787, "learning_rate": 3.7830664840326145e-05, "loss": 0.2431, "step": 323 }, { "epoch": 2.2268041237113403, "grad_norm": 0.10847966130290622, "learning_rate": 3.720334684680889e-05, "loss": 0.2645, "step": 324 }, { "epoch": 2.2336769759450172, "grad_norm": 0.10687099898167607, "learning_rate": 3.6580082821999786e-05, "loss": 0.2558, "step": 325 }, { "epoch": 2.240549828178694, "grad_norm": 0.10744277727601985, "learning_rate": 3.596091300200578e-05, "loss": 0.2648, "step": 326 }, { "epoch": 2.247422680412371, "grad_norm": 0.1075527766804997, "learning_rate": 3.534587735862391e-05, "loss": 0.2662, "step": 327 }, { "epoch": 2.254295532646048, "grad_norm": 0.10693314058206296, "learning_rate": 3.473501559676088e-05, "loss": 0.2691, "step": 328 }, { "epoch": 2.261168384879725, "grad_norm": 0.10977366989631175, "learning_rate": 3.4128367151869714e-05, "loss": 0.2721, "step": 329 }, { "epoch": 2.268041237113402, "grad_norm": 0.10791580217848652, "learning_rate": 3.352597118740404e-05, "loss": 0.2574, "step": 330 }, { "epoch": 2.274914089347079, "grad_norm": 0.1117741340523688, "learning_rate": 3.292786659228973e-05, "loss": 0.2701, "step": 331 }, { "epoch": 2.281786941580756, "grad_norm": 0.10832840646361684, "learning_rate": 3.233409197841437e-05, "loss": 0.2533, "step": 332 }, { "epoch": 2.288659793814433, "grad_norm": 0.11021992972158987, "learning_rate": 3.174468567813461e-05, "loss": 0.2506, "step": 333 }, { "epoch": 2.29553264604811, "grad_norm": 0.11244737426877117, "learning_rate": 3.115968574180149e-05, "loss": 0.2741, "step": 334 }, { "epoch": 2.3024054982817868, "grad_norm": 0.10439400399730109, "learning_rate": 3.0579129935304066e-05, "loss": 0.2423, "step": 335 }, { "epoch": 2.3092783505154637, "grad_norm": 0.11154784823612464, "learning_rate": 3.0003055737631403e-05, "loss": 0.2606, "step": 336 }, { "epoch": 2.3161512027491407, "grad_norm": 0.11618867786187727, "learning_rate": 2.9431500338452832e-05, "loss": 0.2705, "step": 337 }, { "epoch": 2.323024054982818, "grad_norm": 0.10953255344278685, "learning_rate": 2.886450063571735e-05, "loss": 0.2597, "step": 338 }, { "epoch": 2.329896907216495, "grad_norm": 0.10670783300766909, "learning_rate": 2.8302093233271453e-05, "loss": 0.2451, "step": 339 }, { "epoch": 2.336769759450172, "grad_norm": 0.10795627839159605, "learning_rate": 2.7744314438496088e-05, "loss": 0.2571, "step": 340 }, { "epoch": 2.343642611683849, "grad_norm": 0.10876660642511277, "learning_rate": 2.7191200259962934e-05, "loss": 0.2646, "step": 341 }, { "epoch": 2.350515463917526, "grad_norm": 0.10862804831060525, "learning_rate": 2.6642786405109475e-05, "loss": 0.2679, "step": 342 }, { "epoch": 2.357388316151203, "grad_norm": 0.1068062333293375, "learning_rate": 2.6099108277934103e-05, "loss": 0.2676, "step": 343 }, { "epoch": 2.3642611683848798, "grad_norm": 0.10420269958200333, "learning_rate": 2.556020097671046e-05, "loss": 0.2669, "step": 344 }, { "epoch": 2.3711340206185567, "grad_norm": 0.10951228936587228, "learning_rate": 2.5026099291721516e-05, "loss": 0.2519, "step": 345 }, { "epoch": 2.3780068728522337, "grad_norm": 0.11129513461474132, "learning_rate": 2.449683770301382e-05, "loss": 0.2571, "step": 346 }, { "epoch": 2.3848797250859106, "grad_norm": 0.10763529170641503, "learning_rate": 2.397245037817125e-05, "loss": 0.2567, "step": 347 }, { "epoch": 2.3917525773195876, "grad_norm": 0.10496278836242602, "learning_rate": 2.345297117010954e-05, "loss": 0.2459, "step": 348 }, { "epoch": 2.3986254295532645, "grad_norm": 0.11015993294501161, "learning_rate": 2.2938433614890697e-05, "loss": 0.2702, "step": 349 }, { "epoch": 2.4054982817869415, "grad_norm": 0.10738471004067172, "learning_rate": 2.242887092955801e-05, "loss": 0.2585, "step": 350 }, { "epoch": 2.4123711340206184, "grad_norm": 0.11218254036958039, "learning_rate": 2.1924316009991787e-05, "loss": 0.2607, "step": 351 }, { "epoch": 2.4192439862542954, "grad_norm": 0.10844975577221563, "learning_rate": 2.1424801428785447e-05, "loss": 0.2561, "step": 352 }, { "epoch": 2.4261168384879723, "grad_norm": 0.10599271281183413, "learning_rate": 2.0930359433142932e-05, "loss": 0.2501, "step": 353 }, { "epoch": 2.4329896907216497, "grad_norm": 0.11017719350663994, "learning_rate": 2.0441021942796944e-05, "loss": 0.2618, "step": 354 }, { "epoch": 2.4398625429553267, "grad_norm": 0.10668690539385935, "learning_rate": 1.995682054794803e-05, "loss": 0.264, "step": 355 }, { "epoch": 2.4467353951890036, "grad_norm": 0.10450005811019492, "learning_rate": 1.9477786507225616e-05, "loss": 0.2553, "step": 356 }, { "epoch": 2.4536082474226806, "grad_norm": 0.11220780349903246, "learning_rate": 1.900395074566962e-05, "loss": 0.2654, "step": 357 }, { "epoch": 2.4604810996563575, "grad_norm": 0.108477409950911, "learning_rate": 1.8535343852734332e-05, "loss": 0.2569, "step": 358 }, { "epoch": 2.4673539518900345, "grad_norm": 0.10460682785615502, "learning_rate": 1.8071996080313602e-05, "loss": 0.2535, "step": 359 }, { "epoch": 2.4742268041237114, "grad_norm": 0.1076833252048245, "learning_rate": 1.76139373407876e-05, "loss": 0.2651, "step": 360 }, { "epoch": 2.4810996563573884, "grad_norm": 0.10826152402956611, "learning_rate": 1.7161197205092216e-05, "loss": 0.2676, "step": 361 }, { "epoch": 2.4879725085910653, "grad_norm": 0.10750093502253717, "learning_rate": 1.6713804900809582e-05, "loss": 0.2559, "step": 362 }, { "epoch": 2.4948453608247423, "grad_norm": 0.1038001972682153, "learning_rate": 1.6271789310281517e-05, "loss": 0.2423, "step": 363 }, { "epoch": 2.5017182130584192, "grad_norm": 0.10672842152395061, "learning_rate": 1.583517896874498e-05, "loss": 0.2593, "step": 364 }, { "epoch": 2.508591065292096, "grad_norm": 0.10761120219562789, "learning_rate": 1.540400206248963e-05, "loss": 0.2595, "step": 365 }, { "epoch": 2.515463917525773, "grad_norm": 0.10816060382518869, "learning_rate": 1.4978286427038601e-05, "loss": 0.2539, "step": 366 }, { "epoch": 2.52233676975945, "grad_norm": 0.10455085413886135, "learning_rate": 1.4558059545351143e-05, "loss": 0.2564, "step": 367 }, { "epoch": 2.529209621993127, "grad_norm": 0.10969390724809498, "learning_rate": 1.4143348546048707e-05, "loss": 0.2552, "step": 368 }, { "epoch": 2.536082474226804, "grad_norm": 0.11070585821467833, "learning_rate": 1.3734180201663439e-05, "loss": 0.2561, "step": 369 }, { "epoch": 2.542955326460481, "grad_norm": 0.11065650967955783, "learning_rate": 1.3330580926909763e-05, "loss": 0.2552, "step": 370 }, { "epoch": 2.549828178694158, "grad_norm": 0.10753458918108932, "learning_rate": 1.2932576776979377e-05, "loss": 0.2599, "step": 371 }, { "epoch": 2.556701030927835, "grad_norm": 0.11215812047440889, "learning_rate": 1.2540193445858883e-05, "loss": 0.26, "step": 372 }, { "epoch": 2.563573883161512, "grad_norm": 0.11029227457181179, "learning_rate": 1.2153456264671337e-05, "loss": 0.2529, "step": 373 }, { "epoch": 2.5704467353951888, "grad_norm": 0.1095036155268631, "learning_rate": 1.1772390200040817e-05, "loss": 0.2503, "step": 374 }, { "epoch": 2.5773195876288657, "grad_norm": 0.10487179029197675, "learning_rate": 1.139701985248055e-05, "loss": 0.2421, "step": 375 }, { "epoch": 2.584192439862543, "grad_norm": 0.10299175954283599, "learning_rate": 1.1027369454805058e-05, "loss": 0.2502, "step": 376 }, { "epoch": 2.59106529209622, "grad_norm": 0.10855354801856847, "learning_rate": 1.0663462870565411e-05, "loss": 0.2608, "step": 377 }, { "epoch": 2.597938144329897, "grad_norm": 0.10315061729781284, "learning_rate": 1.0305323592509009e-05, "loss": 0.2528, "step": 378 }, { "epoch": 2.604810996563574, "grad_norm": 0.10626039326093263, "learning_rate": 9.952974741062703e-06, "loss": 0.264, "step": 379 }, { "epoch": 2.611683848797251, "grad_norm": 0.10745271675212126, "learning_rate": 9.606439062840256e-06, "loss": 0.2594, "step": 380 }, { "epoch": 2.618556701030928, "grad_norm": 0.10974406713413064, "learning_rate": 9.265738929174051e-06, "loss": 0.2582, "step": 381 }, { "epoch": 2.625429553264605, "grad_norm": 0.10537263197622973, "learning_rate": 8.93089633467058e-06, "loss": 0.2513, "step": 382 }, { "epoch": 2.6323024054982818, "grad_norm": 0.10396472858049301, "learning_rate": 8.601932895790877e-06, "loss": 0.2583, "step": 383 }, { "epoch": 2.6391752577319587, "grad_norm": 0.10276278086397593, "learning_rate": 8.278869849454718e-06, "loss": 0.2529, "step": 384 }, { "epoch": 2.6460481099656357, "grad_norm": 0.10899885081403418, "learning_rate": 7.961728051669737e-06, "loss": 0.2637, "step": 385 }, { "epoch": 2.6529209621993126, "grad_norm": 0.10582666731745977, "learning_rate": 7.650527976185173e-06, "loss": 0.2535, "step": 386 }, { "epoch": 2.6597938144329896, "grad_norm": 0.10725397582281598, "learning_rate": 7.3452897131698564e-06, "loss": 0.2603, "step": 387 }, { "epoch": 2.6666666666666665, "grad_norm": 0.10749509280278831, "learning_rate": 7.046032967915483e-06, "loss": 0.2489, "step": 388 }, { "epoch": 2.673539518900344, "grad_norm": 0.1087201328812968, "learning_rate": 6.75277705956443e-06, "loss": 0.2604, "step": 389 }, { "epoch": 2.680412371134021, "grad_norm": 0.10783575017518213, "learning_rate": 6.465540919862456e-06, "loss": 0.2557, "step": 390 }, { "epoch": 2.687285223367698, "grad_norm": 0.10758942906584705, "learning_rate": 6.184343091936751e-06, "loss": 0.2526, "step": 391 }, { "epoch": 2.6941580756013748, "grad_norm": 0.10528296367950161, "learning_rate": 5.909201729098579e-06, "loss": 0.2487, "step": 392 }, { "epoch": 2.7010309278350517, "grad_norm": 0.11256201944946768, "learning_rate": 5.640134593671598e-06, "loss": 0.2714, "step": 393 }, { "epoch": 2.7079037800687287, "grad_norm": 0.10863241059875908, "learning_rate": 5.3771590558450265e-06, "loss": 0.2574, "step": 394 }, { "epoch": 2.7147766323024056, "grad_norm": 0.10549349184272533, "learning_rate": 5.12029209255227e-06, "loss": 0.2554, "step": 395 }, { "epoch": 2.7216494845360826, "grad_norm": 0.10847174602209737, "learning_rate": 4.869550286375091e-06, "loss": 0.268, "step": 396 }, { "epoch": 2.7285223367697595, "grad_norm": 0.10488356613015705, "learning_rate": 4.624949824472858e-06, "loss": 0.2671, "step": 397 }, { "epoch": 2.7353951890034365, "grad_norm": 0.10475860935600055, "learning_rate": 4.386506497537757e-06, "loss": 0.2514, "step": 398 }, { "epoch": 2.7422680412371134, "grad_norm": 0.10597713121898167, "learning_rate": 4.154235698775277e-06, "loss": 0.2597, "step": 399 }, { "epoch": 2.7491408934707904, "grad_norm": 0.10595800588015507, "learning_rate": 3.928152422910491e-06, "loss": 0.2602, "step": 400 }, { "epoch": 2.7560137457044673, "grad_norm": 0.11154644583398933, "learning_rate": 3.7082712652200867e-06, "loss": 0.264, "step": 401 }, { "epoch": 2.7628865979381443, "grad_norm": 0.10694150624625907, "learning_rate": 3.4946064205899965e-06, "loss": 0.25, "step": 402 }, { "epoch": 2.7697594501718212, "grad_norm": 0.10318772901644747, "learning_rate": 3.287171682599255e-06, "loss": 0.2493, "step": 403 }, { "epoch": 2.776632302405498, "grad_norm": 0.1061914310952031, "learning_rate": 3.085980442629288e-06, "loss": 0.2591, "step": 404 }, { "epoch": 2.783505154639175, "grad_norm": 0.10446042630271005, "learning_rate": 2.8910456889995498e-06, "loss": 0.254, "step": 405 }, { "epoch": 2.790378006872852, "grad_norm": 0.10518845803888338, "learning_rate": 2.7023800061289907e-06, "loss": 0.2533, "step": 406 }, { "epoch": 2.797250859106529, "grad_norm": 0.10867396165520546, "learning_rate": 2.5199955737236104e-06, "loss": 0.2724, "step": 407 }, { "epoch": 2.804123711340206, "grad_norm": 0.10694114528768675, "learning_rate": 2.3439041659902407e-06, "loss": 0.2564, "step": 408 }, { "epoch": 2.810996563573883, "grad_norm": 0.10518421192748285, "learning_rate": 2.174117150876398e-06, "loss": 0.2609, "step": 409 }, { "epoch": 2.81786941580756, "grad_norm": 0.10729647762632918, "learning_rate": 2.010645489336382e-06, "loss": 0.2587, "step": 410 }, { "epoch": 2.824742268041237, "grad_norm": 0.10754314349393188, "learning_rate": 1.8534997346237093e-06, "loss": 0.2632, "step": 411 }, { "epoch": 2.8316151202749142, "grad_norm": 0.10802289125290783, "learning_rate": 1.7026900316098215e-06, "loss": 0.2566, "step": 412 }, { "epoch": 2.838487972508591, "grad_norm": 0.10931075718242897, "learning_rate": 1.5582261161291245e-06, "loss": 0.2626, "step": 413 }, { "epoch": 2.845360824742268, "grad_norm": 0.10501355229769045, "learning_rate": 1.4201173143504888e-06, "loss": 0.2621, "step": 414 }, { "epoch": 2.852233676975945, "grad_norm": 0.10753436065183695, "learning_rate": 1.2883725421752201e-06, "loss": 0.2475, "step": 415 }, { "epoch": 2.859106529209622, "grad_norm": 0.10463768922989454, "learning_rate": 1.1630003046614323e-06, "loss": 0.2409, "step": 416 }, { "epoch": 2.865979381443299, "grad_norm": 0.10543898222201153, "learning_rate": 1.0440086954749517e-06, "loss": 0.2505, "step": 417 }, { "epoch": 2.872852233676976, "grad_norm": 0.10892369128650758, "learning_rate": 9.314053963669245e-07, "loss": 0.2456, "step": 418 }, { "epoch": 2.879725085910653, "grad_norm": 0.1062255349453702, "learning_rate": 8.251976766777913e-07, "loss": 0.2649, "step": 419 }, { "epoch": 2.88659793814433, "grad_norm": 0.10869941184854195, "learning_rate": 7.253923928680406e-07, "loss": 0.2559, "step": 420 }, { "epoch": 2.893470790378007, "grad_norm": 0.1109190039810706, "learning_rate": 6.319959880756177e-07, "loss": 0.2529, "step": 421 }, { "epoch": 2.9003436426116838, "grad_norm": 0.10656654333503314, "learning_rate": 5.450144916999134e-07, "loss": 0.2566, "step": 422 }, { "epoch": 2.9072164948453607, "grad_norm": 0.10681708087523223, "learning_rate": 4.644535190125421e-07, "loss": 0.2526, "step": 423 }, { "epoch": 2.9140893470790377, "grad_norm": 0.11101267026333346, "learning_rate": 3.903182707948649e-07, "loss": 0.2545, "step": 424 }, { "epoch": 2.9209621993127146, "grad_norm": 0.10449251320574354, "learning_rate": 3.2261353300219176e-07, "loss": 0.2397, "step": 425 }, { "epoch": 2.927835051546392, "grad_norm": 0.10614426653162999, "learning_rate": 2.613436764548505e-07, "loss": 0.2451, "step": 426 }, { "epoch": 2.934707903780069, "grad_norm": 0.10344408619271417, "learning_rate": 2.0651265655603492e-07, "loss": 0.231, "step": 427 }, { "epoch": 2.941580756013746, "grad_norm": 0.10751538816500814, "learning_rate": 1.5812401303639813e-07, "loss": 0.253, "step": 428 }, { "epoch": 2.948453608247423, "grad_norm": 0.10902510633946422, "learning_rate": 1.1618086972559062e-07, "loss": 0.2534, "step": 429 }, { "epoch": 2.9553264604811, "grad_norm": 0.10961740281781805, "learning_rate": 8.068593435055505e-08, "loss": 0.2611, "step": 430 }, { "epoch": 2.9621993127147768, "grad_norm": 0.10667412761426893, "learning_rate": 5.164149836077714e-08, "loss": 0.2566, "step": 431 }, { "epoch": 2.9690721649484537, "grad_norm": 0.10469145515742652, "learning_rate": 2.9049436780281825e-08, "loss": 0.245, "step": 432 }, { "epoch": 2.9759450171821307, "grad_norm": 0.10866002746514136, "learning_rate": 1.2911208086663351e-08, "loss": 0.258, "step": 433 }, { "epoch": 2.9828178694158076, "grad_norm": 0.10779386109678486, "learning_rate": 3.2278541168717646e-09, "loss": 0.2668, "step": 434 }, { "epoch": 2.9896907216494846, "grad_norm": 0.11009984251752782, "learning_rate": 0.0, "loss": 0.2592, "step": 435 }, { "epoch": 2.9896907216494846, "eval_loss": 0.29278308153152466, "eval_runtime": 27.5656, "eval_samples_per_second": 35.443, "eval_steps_per_second": 1.125, "step": 435 }, { "epoch": 2.9896907216494846, "step": 435, "total_flos": 1.330526714897367e+17, "train_loss": 0.3336494640714821, "train_runtime": 4477.4048, "train_samples_per_second": 12.436, "train_steps_per_second": 0.097 } ], "logging_steps": 1, "max_steps": 435, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.330526714897367e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }